# common-bytes.py

#!/usr/bin/python
# Given lines of fixed length, mark uncommon chars
# Copyright (C) 2013 Peter Wu <lekensteyn@gmail.com>
# License: GPLv3+

import sys
import fileinput

CLEAR_COLOR = "\033[m"

# Reads lines from files and/or stdin
def read_lines(files, line_width, lines_limit):
    """Read equally long lines from the given files.

    Trailing whitespace is stripped from every line.  As soon as a positive
    width is known -- either passed in as ``line_width`` or taken from the
    length of a line read while no width was set yet -- each following line
    is truncated to that width and must then be exactly that long.

    Args:
        files: File names handed to ``fileinput.input``.
        line_width: Expected line width; a value <= 0 means "take it from
            the input".
        lines_limit: Stop after this many lines; a value <= 0 means no limit.

    Returns:
        A ``(line_width, lines)`` tuple.  ``(None, None)`` is returned when a
        line does not have the expected width.  ``lines`` is an empty list
        when nothing was read.
    """
    lines = []
    # The context manager closes the input even when the loop is left early.
    # Without it the module-level fileinput state stays active and a later
    # fileinput.input() call fails with "input() already active".
    with fileinput.input(files=files) as stream:
        for lineno, line in enumerate(stream, start=1):
            line = line.rstrip()
            if line_width > 0:
                line = line[:line_width]
            line_len = len(line)
            if line_width > 0 and line_len != line_width:
                print("Line {} has a different length {} (expected {})"
                      .format(lineno, line_len, line_width),
                      file=sys.stderr)
                return None, None

            lines.append(line)

            # set expected line width if unavailable
            if not (line_width > 0):
                line_width = line_len

            if lines_limit > 0 and lineno >= lines_limit:
                print("Stopped reading after {} lines".format(lineno),
                      file=sys.stderr)
                break

    if not lines:
        print("Input is empty", file=sys.stderr)

    return line_width, lines

def count_chars(lines, line_width):
    """Find the most frequent character(s) in every column.

    Args:
        lines: Sequence of strings, each at least ``line_width`` long.
        line_width: Number of leading columns to examine.

    Returns:
        A list with one ``(top_count, n_top, top_chars)`` tuple per column:
        the highest occurrence count in that column, the number of distinct
        characters reaching that count, and those characters in order of
        first appearance.  Without any lines every column yields
        ``(-1, 0, [])``.
    """
    top_counts = []

    for col in range(line_width):
        # Map each character seen in this column to its occurrence count
        col_ctr = {}
        for line in lines:
            char = line[col]
            col_ctr[char] = col_ctr.get(char, 0) + 1

        # -1 is the "no data" count used when there are no lines at all
        top_count = max(col_ctr.values(), default=-1)
        top_chars = [char for char, count in col_ctr.items()
                     if count == top_count]
        top_counts.append((top_count, len(top_chars), top_chars))

    return top_counts

# Calculate which columns need to get colored.
# 0 = the whole column has the same single char (uncolored)
# 1 = this cell is the most common char (green)
# 2 = this cell is one of the most common chars (yellow)
# 3 = this cell is not most common (red)
CELL_ALLSAME = 0        # every line has the same char in this column
CELL_COMMON = 1         # cell holds the single most common char
CELL_COMMON_SHARED = 2  # cell holds one of several equally common chars
CELL_OUTLIER = 3        # cell holds a char that is not among the most common
def color_cells(lines, line_width, top_counts):
    """Classify every cell of every line as one of the CELL_* values.

    Args:
        lines: Sequence of strings, each at least ``line_width`` long.
        line_width: Number of columns to classify per line.
        top_counts: Per column a ``(max_count, common_count, common_chars)``
            tuple: the highest occurrence count, how many chars reach it and
            which chars those are.

    Returns:
        A list with one list per line, holding one CELL_* value per column.
    """
    total_lines = len(lines)

    def classify(char, col):
        max_count, common_count, common_chars = top_counts[col]
        if char not in common_chars:
            return CELL_OUTLIER
        if common_count != 1:
            # several chars tie for the highest count
            return CELL_COMMON_SHARED
        if max_count == total_lines:
            # the single top char fills the whole column
            return CELL_ALLSAME
        return CELL_COMMON

    return [[classify(row[col], col) for col in range(line_width)]
            for row in lines]

# Find columns that can be skipped
# common at begin: N..[other] (threshold: 5)
# common at end: [other]..N (threshold: 5)
# common in middle: [other]..N..[other] (threshold: 9)
def mark_skippable(colormap, line_width, context):
    """Build a per-column mask telling which columns may be hidden.

    Only the first row of ``colormap`` is inspected; a cell value of 0 there
    (CELL_ALLSAME in this file) makes the column a candidate for hiding.
    Columns are scanned from right to left.

    Args:
        colormap: List of per-line cell classifications.
        line_width: Number of columns.
        context: Number of candidate columns to keep visible next to a
            column that differs.

    Returns:
        A list of ``line_width`` ints.  A positive value N sits on the first
        column of a run of N hidden columns, -1 flags a candidate column that
        is not reserved as context, and 0 is used for everything else.  A run
        only gets its positive marker when its length, after subtracting the
        context columns, exceeds 5 at either end of the line or 9 in the
        middle.
    """
    mask = [0] * line_width
    run = 0                # candidate columns seen since the last difference
    min_run = 5            # a run must be longer than this to get a marker
    at_right_edge = True   # no differing column seen yet

    for col in range(line_width - 1, -1, -1):
        if colormap[0][col] == 0:
            run += 1
            # Left of a differing column the first `context` candidates stay
            # visible; at the right edge there is nothing to give context to.
            if at_right_edge or run > context:
                mask[col] = -1
            continue

        # A differing column ends the run to its right.  A run in the middle
        # loses context columns on both sides, a run at the right edge only
        # on its left side.
        run -= context if at_right_edge else 2 * context
        if run > min_run:
            # Show the context directly right of this column ...
            for shown in range(col + 1, col + context + 1):
                mask[shown] = 0
            # ... and put the skip count on the first hidden column
            mask[col + context + 1] = run

        # Any further run lies between two differing columns (or at the
        # left edge, handled below).
        at_right_edge = False
        min_run = 9
        run = 0

    # Whatever run is left touches the left edge.  It loses context columns
    # only if a differing column was found somewhere to its right.
    if not at_right_edge:
        run -= context
    if run > 5:
        mask[0] = run

    return mask

# Escape sequence written in front of a cell of the given class
colors = {
    CELL_ALLSAME: CLEAR_COLOR,          # reset, no coloring
    CELL_COMMON: "\033[0;32m",          # green
    CELL_COMMON_SHARED: "\033[0;33m",   # yellow
    CELL_OUTLIER: "\033[1;31m",         # bold red
}

# Display input, coloring changes
def display_changes(lines, line_width, colormap, skip_mask, enable_skip):
    """Print every line with its cells colored according to ``colormap``.

    Args:
        lines: The input lines, each at least ``line_width`` long.
        line_width: Number of columns to render per line.
        colormap: Per line a list with one key of the module-level ``colors``
            table per column.
        skip_mask: Per-column skip information.  A positive value N is
            rendered as a ``..N..`` marker and starts a hidden run; columns
            flagged -1 directly after it are left out.
        enable_skip: When false, ``skip_mask`` is ignored and every column
            is printed.
    """
    skip_color = "\033[1;30m"

    for lineno, line in enumerate(lines):
        # Collect the pieces and join them once instead of growing a string
        parts = []
        prev_color = CLEAR_COLOR
        line_color = colormap[lineno]
        skipping = False
        for col in range(line_width):
            # skip common cells?
            if enable_skip:
                mask = skip_mask[col]
                if skipping and mask == -1:
                    # still inside the hidden run
                    continue
                if mask > 0:
                    # start of a hidden run of `mask` columns
                    prev_color = skip_color
                    parts.append(prev_color + "..{}..".format(mask))
                    skipping = True
                    continue
                # end of skippable columns
                skipping = False

            # Emit an escape sequence only when the color actually changes
            new_color = colors[line_color[col]]
            if prev_color != new_color:
                parts.append(new_color)
                prev_color = new_color

            parts.append(line[col])

        # Any colors must be cleared at the end of the line
        if prev_color != CLEAR_COLOR:
            parts.append(CLEAR_COLOR)

        print("".join(parts))

def main(args):
    """Read the input, classify every cell and print the colored result.

    Args:
        args: Parsed command line options providing ``files``,
            ``line_width``, ``lines_limit``, ``context`` and ``enable_skip``.

    Returns:
        1 when no lines are available, otherwise 0.
    """
    # Lines come from the named files and/or stdin
    width, rows = read_lines(args.files, args.line_width, args.lines_limit)
    if not rows:
        return 1

    # Per-column character frequencies decide the class of every cell
    cell_classes = color_cells(rows, width, count_chars(rows, width))

    # Columns that are identical everywhere may be cut out of the display
    hidden = mark_skippable(cell_classes, width, args.context)

    display_changes(rows, width, cell_classes, hidden, args.enable_skip)
    return 0

# Script entry point: parse the command line, run main() and exit with its
# return value.
if __name__ == "__main__":
    import argparse
    import os
    parser = argparse.ArgumentParser(
        description="Assume common lines, mark tiny differences")
    parser.add_argument("-C", required=False, type=int, default=0,
                        dest="context", metavar="NUM",
                        help="Number of columns to keep as context")
    parser.add_argument("-c", required=False, type=int, default=0,
                        dest="line_width", metavar="LENGTH",
                        help="Truncate input lines to LENGTH (default 0, unlimited)")
    parser.add_argument("-l", required=False, type=int, default=0,
                        dest="lines_limit", metavar="COUNT",
                        help="Limit the number of lines to COUNT (default 0, unlimited)")
    parser.add_argument("-s", required=False, action="store_true",
                        dest="enable_skip",
                        help="Enable skipping common columns")
    parser.add_argument("files", nargs=argparse.REMAINDER)
    args = parser.parse_args()
    try:
        status = main(args)
        # Flush here so that a closed pipe is noticed inside this try block
        # and not during interpreter shutdown.
        sys.stdout.flush()
    except BrokenPipeError:
        # ignore EPIPE for pipes like: thisScript.py | head
        # Buffered output may still be pending; point stdout at devnull so
        # the flush at exit cannot hit the closed pipe again and print an
        # "Exception ignored" message.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        status = 0
    sys.exit(status)