summaryrefslogtreecommitdiff
path: root/digest.py
blob: 46714676940341b4ed1f73ae608a3d7fdf593903 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
# Calculate (multiple) digest(s) for file(s)
#
# Author: Peter Wu <peter@lekensteyn.nl>
# Licensed under the MIT license <http://opensource.org/licenses/MIT>

from __future__ import print_function
import hashlib
import sys
from threading import Thread
try:
    from queue import Queue
except:
    # Python 2 compatibility
    from Queue import Queue

def read_blocks(filename):
    '''Yield the contents of a file as binary blocks of at most 1 MiB.

    filename is either a path, which is opened in binary mode, or '-' to
    read from standard input. Because this is a generator, the file is only
    opened when the first block is requested, so errors raised by open()
    surface while iterating.

    A file opened here is closed when iteration ends or the generator is
    closed. Standard input is deliberately left open: it was not opened by
    this function, and closing it would make any later read from '-' fail
    with ValueError.
    '''
    megabyte = 2 ** 20
    if filename == '-':
        # Python 3 compat: read binary instead of unicode
        f = getattr(sys.stdin, 'buffer', sys.stdin)
        owned = False
    else:
        f = open(filename, 'rb')
        owned = True
    try:
        while True:
            data = f.read(megabyte)
            if not data:
                # An empty read means end of file
                break
            yield data
    finally:
        # Only release handles that were acquired by this function
        if owned:
            f.close()

class Hasher(object):
    '''Calculate multiple hash digests for a piece of data.

    algos is a sequence of hashlib constructor names (such as 'md5' or
    'sha256'). It is iterated more than once, so it must not be a one-shot
    iterator. One hash object is created per name and all of them are fed
    the same data through update().
    '''
    def __init__(self, algos):
        self.algos = algos
        # Maps algorithm name -> hash object
        self._hashes = {algo: getattr(hashlib, algo)() for algo in algos}

    def update(self, data):
        '''Feed a block of bytes to every hash object.'''
        # Iterate over the hash objects, not over the dict keys (names)
        for h in self._hashes.values():
            h.update(data)

    def hexdigests(self):
        '''Yields the algorithm and the calculated hex digest.

        Pairs are produced in the order of the algos given at construction.
        '''
        for algo in self.algos:
            digest = self._hashes[algo].hexdigest()
            yield algo, digest

class MtHasher(Hasher):
    '''Hasher variant that updates every algorithm in its own thread.

    update() only enqueues the data block on one bounded queue per
    algorithm; a worker thread per algorithm takes blocks off its queue and
    feeds them to the matching hash object. hexdigests() stops the workers
    and then reports the results.
    '''
    # Maximum number of blocks waiting in each queue. update() blocks once a
    # queue is full, which bounds how far the reader can run ahead.
    QUEUE_SIZE = 10

    def __init__(self, algos):
        super(MtHasher, self).__init__(algos)
        self._queues = {}
        self._threads = {}
        for algo in algos:
            # The queue must be registered before the worker is started,
            # since the worker looks it up by algorithm name.
            self._queues[algo] = Queue(MtHasher.QUEUE_SIZE)
            t = Thread(target=self._queue_updater, args=(algo,), name=algo)
            # A worker waits on its queue until it gets the terminator. As a
            # daemon it cannot keep the process alive when hexdigests() is
            # never called (for instance after a read error).
            t.daemon = True
            self._threads[algo] = t
            t.start()

    def _queue_updater(self, algo):
        '''Worker loop: hash queued blocks for algo until terminated.'''
        q = self._queues[algo]
        h = self._hashes[algo]
        while True:
            data = q.get()
            # Treat an empty value as terminator
            if not data:
                break
            h.update(data)

    def update(self, data):
        '''Queue a block of bytes for every worker.

        Empty data is ignored because an empty value is reserved as the
        terminator for the workers.
        '''
        if data:
            for q in self._queues.values():
                q.put(data)

    def hexdigests(self):
        '''Stop all workers, then return the (algorithm, hex digest) pairs.

        Every worker is sent the terminator and joined before any result is
        produced, so all queued data has been hashed. Workers that already
        stopped are skipped, which makes repeated calls safe.
        '''
        for algo in self.algos:
            q = self._queues[algo]
            t = self._threads[algo]
            if t.is_alive():
                q.put(b'')  # Terminate
                t.join()
            assert q.empty()
        return super(MtHasher, self).hexdigests()

# Algorithm names accepted on the command line. Each name is looked up as a
# hashlib constructor and its hexdigest() is called without arguments, so the
# variable-length SHAKE algorithms (whose hexdigest() needs a length) are
# excluded. The names are sorted to keep the usage text stable between runs.
# Older Pythons without hashlib.algorithms_guaranteed get a fixed list.
supported_algos = tuple(sorted(
    algo for algo in getattr(
        hashlib, 'algorithms_guaranteed',
        ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'))
    if not algo.startswith('shake_')))

def print_usage():
    '''Write a one-line usage summary to standard error.

    Each entry of supported_algos is shown as an optional -ALGO flag.
    '''
    flags = ['[-{0}]'.format(name) for name in supported_algos]
    usage = 'Usage: python digest.py {} [FILE]...'.format(' '.join(flags))
    print(usage, file=sys.stderr)

def main(*argv):
    '''Print the requested digests for every given file.

    argv holds the command-line arguments without the program name. An
    argument starting with '-' (other than a lone '-') selects a digest
    algorithm; everything else is a file name, where '-' means standard
    input. Standard input is used when no file is named.

    For each file one line 'DIGEST  FILENAME' is printed per selected
    algorithm, in the order the algorithms were first given.

    Returns the process exit status: 1 when help was requested, when no
    supported algorithm was selected or when a file could not be read, and
    0 otherwise.
    '''
    filenames = []
    algos = []

    if any(help_arg in argv for help_arg in ('-h', '--help')):
        print_usage()
        return 1

    for arg in argv:
        if arg.startswith('-') and arg != '-':
            algo = arg.lstrip('-')  # Strip leading '-'
            if algo in supported_algos:
                # Preserve ordering, ignore duplicates
                if algo not in algos:
                    algos.append(algo)
            else:
                print('Unsupported algo:', algo, file=sys.stderr)
        else:
            filenames.append(arg)

    if not algos:
        print('Missing digest!', file=sys.stderr)
        print_usage()
        return 1

    # Assume stdin if no file is given
    if not filenames:
        filenames.append('-')

    status = 0

    # Calculate digest(s) for each file
    for filename in filenames:
        hasher = MtHasher(algos)

        # Try to read the file and update the hash states
        try:
            for data in read_blocks(filename):
                hasher.update(data)
        except (IOError, OSError) as e:
            # IOError is listed for Python 2, where it is not an OSError.
            # Errors go to stderr so they do not mix with the digest output.
            print('digest: {0}: {1}'.format(filename, e.strerror),
                  file=sys.stderr)
            # Asking for the results stops the hasher's worker threads; the
            # partial digests themselves are discarded.
            hasher.hexdigests()
            status = 1
            continue

        for _algo, digest in hasher.hexdigests():
            print('{0}  {1}'.format(digest, filename))

    return status

# Script entry point: hand the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(*sys.argv[1:]))