diff options
author | Peter Wu <peter@lekensteyn.nl> | 2014-10-24 11:33:01 +0200 |
---|---|---|
committer | Peter Wu <peter@lekensteyn.nl> | 2014-10-24 12:54:38 +0200 |
commit | 8b29dcf2a1d4e9cb34a1b7756e4e6b6f110c7b1f (patch) | |
tree | 8f54a18e91b7437ef1640f953461c53564524b84 | |
parent | 5ce76c6289b072254bec1d749a2983b62e835df7 (diff) | |
download | scripts-8b29dcf2a1d4e9cb34a1b7756e4e6b6f110c7b1f.tar.gz |
digest.py: use threading instead
At first I used multiprocessing because threading would still run with
one CPU core due to the GIL. That probably happened because I accessed
_hashes[algo] in the loop of _queue_updater. Now that this is no longer
done, and only the hash update function is called (which releases
the GIL for data larger than 2 KiB), multiple cores are actually used.
For comparison, for a file of 2.3 GiB (min/avg/max/sd secs for n=10):
- pee sha256sum md5sum < file: 16.5/16.9/17.4/.305
- python3 digest.py -sha256 -md5 < file: 13.7/15.0/18.7/1.77
- python2 digest.py -sha256 -md5 < file: 13.7/15.9/18.7/1.64
- jacksum -a sha256+md5 -F '#CHECKSUM{i} #FILENAME': 32.7/37.1/50/6.91
The file is actually 2367029248 bytes, resident in the disk cache.
Environment:
- CPU: Intel i5-460M
- Arch Linux x86_64
- Linux 3.17-rc4
- coreutils 8.23
- moreutils 0.51
- jacksum 1.7.0 on OpenJDK 1.7.0_71
- Python 3.4.2, Python 2.7.8
-rwxr-xr-x | digest.py | 15 |
1 files changed, 8 insertions, 7 deletions
@@ -7,7 +7,12 @@ from __future__ import print_function import hashlib import sys -from multiprocessing import Process, Queue +from threading import Thread +try: + from queue import Queue +except: + # Python 2 compatibility + from Queue import Queue def read_blocks(filename): if filename == '-': @@ -53,7 +58,7 @@ class MtHasher(Hasher): self._queues = {} self._threads = {} for algo in algos: - t = Process(target=self._queue_updater, args=(algo,), name=algo) + t = Thread(target=self._queue_updater, args=(algo,), name=algo) self._queues[algo] = Queue(MtHasher.QUEUE_SIZE) self._threads[algo] = t t.start() @@ -67,8 +72,6 @@ class MtHasher(Hasher): if not data: break h.update(data) - # Runs on a different process, so need to signal the result - q.put(h.hexdigest()) def update(self, data): if data: @@ -81,10 +84,8 @@ class MtHasher(Hasher): q = self._queues[algo] q.put(b'') # Terminate self._threads[algo].join() - digest = q.get_nowait() assert q.empty() - q.close() - yield algo, digest + return super(MtHasher, self).hexdigests() try: supported_algos = hashlib.algorithms_guaranteed |