digest.py: calculate multiple digests

Created to "Simultaneously calculate multiple digests (md5, sha256)", http://unix.stackexchange.com/q/163747/8250 This implementation has a simple single-threaded class (Hasher) and a multi-threaded one. Currently uses the multithreading.Queue interface.
author: Peter Wu <peter@lekensteyn.nl> 2014-10-24 11:30:09 +0200
committer: Peter Wu <peter@lekensteyn.nl> 2014-10-24 11:37:52 +0200
commit: 5ce76c6289b072254bec1d749a2983b62e835df7 (patch)
tree: afd06c48b2cb099baaf7069375fc32c5e5802661
parent: 0ba60fe6a4639fe17dee9f2eb73590d3410ce0e1 (diff)
download: scripts-5ce76c6289b072254bec1d749a2983b62e835df7.tar.gz
1 files changed, 144 insertions, 0 deletions
diff --git a/digest.py b/digest.py
new file mode 100755
index 0000000..19218a8
--- /dev/null
+++ b/digest.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+# Calculate (multiple) digest(s) for file(s)
+#
+# Author: Peter Wu <peter@lekensteyn.nl>
+# Licensed under the MIT license <http://opensource.org/licenses/MIT>
+
+from __future__ import print_function
+import hashlib
+import sys
+from multiprocessing import Process, Queue
+
+def read_blocks(filename):
+    if filename == '-':
+        f = sys.stdin
+        # Python 3 compat: read binary instead of unicode
+        if hasattr(f, 'buffer'):
+            f = f.buffer
+    else:
+        f = open(filename, 'rb')
+    try:
+        megabyte = 2 ** 20
+        while True:
+            data = f.read(megabyte)
+            if not data:
+                break
+            yield data
+    finally:
+        f.close()
+
+class Hasher(object):
+    '''Calculate multiple hash digests for a piece of data.'''
+    def __init__(self, algos):
+        self.algos = algos
+        self._hashes = {}
+        for algo in self.algos:
+            self._hashes[algo] = getattr(hashlib, algo)()
+
+    def update(self, data):
+        for h in self._hashes:
+            h.update(data)
+
+    def hexdigests(self):
+        '''Yields the algorithm and the calculated hex digest.'''
+        for algo in self.algos:
+            digest = self._hashes[algo].hexdigest()
+            yield algo, digest
+
+class MtHasher(Hasher):
+    # Queue size. Memory usage is this times block size (1M)
+    QUEUE_SIZE = 10
+    def __init__(self, algos):
+        super(MtHasher, self).__init__(algos)
+        self._queues = {}
+        self._threads = {}
+        for algo in algos:
+            t = Process(target=self._queue_updater, args=(algo,), name=algo)
+            self._queues[algo] = Queue(MtHasher.QUEUE_SIZE)
+            self._threads[algo] = t
+            t.start()
+
+    def _queue_updater(self, algo):
+        q = self._queues[algo]
+        h = self._hashes[algo]
+        while True:
+            data = q.get()
+            # Treat an empty value as terminator
+            if not data:
+                break
+            h.update(data)
+        # Runs on a different process, so need to signal the result
+        q.put(h.hexdigest())
+
+    def update(self, data):
+        if data:
+            for q in self._queues.values():
+                q.put(data)
+
+    def hexdigests(self):
+        # Wait until all calculations are done and yield the results in meantime
+        for algo in self.algos:
+            q = self._queues[algo]
+            q.put(b'') # Terminate
+            self._threads[algo].join()
+            digest = q.get_nowait()
+            assert q.empty()
+            q.close()
+            yield algo, digest
+
+try:
+    supported_algos = hashlib.algorithms_guaranteed
+except:
+    supported_algos = ('md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512')
+
+def print_usage():
+    dgst_opts = ' '.join('[-{0}]'.format(algo) for algo in supported_algos)
+    print('Usage: python digest.py {} [FILE]...'.format(dgst_opts),
+          file=sys.stderr)
+
+def main(*argv):
+    filenames = []
+    algos = []
+
+    if any(help_arg in argv for help_arg in ('-h', '--help')):
+        print_usage()
+        return 1
+
+    for arg in argv:
+        if arg.startswith('-') and arg != '-':
+            algo = arg.lstrip('-')  # Strip leading '-'
+            if algo in supported_algos:
+                # Preserve ordering, ignore duplicates
+                if not algo in algos:
+                    algos.append(algo)
+            else:
+                print('Unsupported algo:', algo, file=sys.stderr)
+        else:
+            filenames.append(arg)
+
+    if not algos:
+        print('Missing digest!', file=sys.stderr)
+        print_usage()
+        return 1
+
+    # Assume stdin if no file is given
+    if not filenames:
+        filenames.append('-')
+
+    # Calculate digest(s) for each file
+    for filename in filenames:
+        hasher = MtHasher(algos)
+
+        # Try to read the file and update the hash states
+        try:
+            for data in read_blocks(filename):
+                hasher.update(data)
+        except OSError as e:
+            print('digest: {0}: {1}'.format(filename, e.strerror))
+            continue
+
+        for algo, digest in hasher.hexdigests():
+            print('{0}  {1}'.format(digest, filename))
+
+if __name__ == '__main__':
+    sys.exit(main(*sys.argv[1:]))
author	Peter Wu <peter@lekensteyn.nl>	2014-10-24 11:30:09 +0200
committer	Peter Wu <peter@lekensteyn.nl>	2014-10-24 11:37:52 +0200
commit	5ce76c6289b072254bec1d749a2983b62e835df7 (patch)
tree	afd06c48b2cb099baaf7069375fc32c5e5802661
parent	0ba60fe6a4639fe17dee9f2eb73590d3410ce0e1 (diff)
download	scripts-5ce76c6289b072254bec1d749a2983b62e835df7.tar.gz