summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@mbnet.fi>2012-11-23 19:22:35 +0200
committerWerner Koch <wk@gnupg.org>2012-11-26 09:28:07 +0100
commitfaec12e23f03c7cd1614594bfdd51f1302cadb42 (patch)
tree99c77f53a97f10bc37fb4b9873e1d163ee63fe22 /src
parentfc37e805c6394c2e635d1a033670be961f36a6d2 (diff)
downloadlibgcrypt-faec12e23f03c7cd1614594bfdd51f1302cadb42.tar.gz
Optimize wipememory2 for i386 and x86-64
* src/g10lib.h (wipememory2): Add call to fast_wipememory2. (fast_wipememory2): New macros for i386 and x86-64 architectures. Empty macro provided for other architectures. -- Optimizing wipememory2 give broad range of speed improvements, as seen below. Cipher speed ratios, old-vs-new (AMD Phenom II, x86-64): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- IDEA 1.32x 1.35x 1.29x 1.25x 1.30x 1.33x 1.33x 1.33x 1.22x 1.22x 3DES 1.13x 1.10x 1.11x 1.12x 1.13x 1.16x 1.13x 1.13x 1.10x 1.12x CAST5 1.57x 1.51x 1.56x 1.43x 1.48x 1.50x 1.49x 1.51x 1.28x 1.27x BLOWFISH 1.53x 1.52x 1.56x 1.42x 1.50x 1.51x 1.49x 1.52x 1.27x 1.28x AES 1.33x 1.33x 1.00x 1.02x 1.04x 1.02x 1.26x 1.26x 1.00x 0.98x AES192 1.33x 1.36x 1.05x 1.00x 1.04x 1.00x 1.28x 1.24x 1.02x 1.00x AES256 1.22x 1.33x 0.98x 1.00x 1.03x 1.02x 1.28x 1.25x 1.00x 1.00x TWOFISH 1.34x 1.34x 1.44x 1.25x 1.35x 1.28x 1.37x 1.37x 1.14x 1.16x ARCFOUR 1.00x 1.00x DES 1.31x 1.30x 1.34x 1.25x 1.28x 1.28x 1.34x 1.26x 1.22x 1.24x TWOFISH128 1.41x 1.45x 1.46x 1.28x 1.32x 1.37x 1.34x 1.28x 1.16x 1.16x SERPENT128 1.16x 1.20x 1.22x 1.16x 1.16x 1.16x 1.18x 1.18x 1.14x 1.11x SERPENT192 1.16x 1.20x 1.23x 1.16x 1.19x 1.18x 1.16x 1.16x 1.10x 1.10x SERPENT256 1.18x 1.23x 1.23x 1.13x 1.18x 1.16x 1.18x 1.16x 1.11x 1.11x RFC2268_40 1.00x 1.00x 1.03x 0.96x 0.98x 1.00x 0.99x 1.00x 0.99x 0.98x SEED 1.20x 1.24x 1.25x 1.18x 1.19x 1.18x 1.21x 1.22x 1.14x 1.12x CAMELLIA128 1.60x 1.69x 1.56x 1.50x 1.60x 1.53x 1.64x 1.63x 1.29x 1.32x CAMELLIA192 1.55x 1.46x 1.44x 1.34x 1.42x 1.50x 1.46x 1.51x 1.26x 1.28x CAMELLIA256 1.52x 1.50x 1.47x 1.40x 1.51x 1.44x 1.41x 1.50x 1.28x 1.28x Cipher speed ratios, old-vs-new (AMD Phenom II, i386): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- IDEA 1.15x 1.11x 1.10x 1.08x 1.09x 1.13x 1.16x 1.07x 1.10x 1.14x 3DES 1.08x 1.08x 1.08x 1.07x 1.06x 1.06x 1.06x 1.05x 1.05x 1.05x CAST5 1.23x 1.25x 1.18x 1.17x 1.25x 
1.21x 1.22x 1.17x 1.14x 1.12x BLOWFISH 1.25x 1.22x 1.21x 1.11x 1.23x 1.23x 1.24x 1.17x 1.14x 1.14x AES 1.13x 1.13x 1.02x 1.02x 0.98x 0.98x 1.16x 1.03x 1.02x 0.98x AES192 1.11x 1.12x 1.02x 0.99x 1.02x 0.95x 1.06x 1.00x 0.94x 0.91x AES256 1.05x 1.05x 0.97x 1.00x 1.00x 0.99x 1.11x 1.01x 0.99x 1.00x TWOFISH 1.11x 1.15x 1.16x 1.13x 1.12x 1.14x 1.13x 1.05x 1.07x 1.08x ARCFOUR 1.00x 0.97x DES 1.14x 1.14x 1.10x 1.07x 1.11x 1.12x 1.14x 1.08x 1.11x 1.17x TWOFISH128 1.16x 1.23x 1.18x 1.15x 1.14x 1.20x 1.15x 1.05x 1.08x 1.08x SERPENT128 1.08x 1.08x 1.08x 1.05x 1.06x 1.05x 1.09x 1.04x 1.05x 1.05x SERPENT192 1.07x 1.08x 1.08x 1.04x 1.04x 1.06x 1.08x 1.04x 1.01x 1.05x SERPENT256 1.06x 1.08x 1.05x 1.04x 1.05x 1.08x 1.07x 1.03x 1.06x 1.06x RFC2268_40 1.00x 0.99x 1.02x 1.01x 1.01x 1.00x 1.02x 0.99x 0.98x 0.99x SEED 1.12x 1.07x 1.12x 1.07x 1.09x 1.10x 1.10x 1.03x 1.07x 1.05x CAMELLIA128 1.24x 1.21x 1.16x 1.17x 1.16x 1.16x 1.21x 1.16x 1.13x 1.12x CAMELLIA192 1.19x 1.20x 1.14x 1.19x 1.20x 1.20x 1.18x 1.13x 1.13x 1.15x CAMELLIA256 1.21x 1.19x 1.14x 1.17x 1.17x 1.16x 1.17x 1.11x 1.12x 1.14x Hash speed ratios, old-vs-new (Intel Sandy-Bridge, x86-64): MD5 1.00x 1.47x 1.07x 1.00x 1.00x SHA1 1.06x 1.27x 1.06x 1.00x 1.00x RIPEMD160 1.04x 1.32x 1.11x 1.00x 1.00x TIGER192 1.05x 1.50x 1.15x 1.03x 1.05x SHA256 1.05x 1.38x 1.21x 1.04x 1.03x SHA384 1.15x 1.76x 1.25x 1.10x 1.04x SHA512 1.15x 1.76x 1.27x 1.08x 1.04x SHA224 1.05x 1.38x 1.21x 1.06x 1.00x MD4 1.17x 1.55x 1.06x 1.06x 1.00x CRC32 1.00x 1.00x 0.99x 1.04x 1.00x CRC32RFC1510 0.93x 1.00x 1.01x 1.00x 1.00x CRC24RFC2440 1.00x 1.00x 1.00x 0.99x 1.00x WHIRLPOOL 1.02x 1.00x 0.99x 1.00x 1.00x TIGER 1.05x 1.50x 1.15x 1.09x 1.05x TIGER2 1.05x 1.48x 1.16x 1.06x 0.95x Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Diffstat (limited to 'src')
-rw-r--r--src/g10lib.h43
1 files changed, 42 insertions, 1 deletions
diff --git a/src/g10lib.h b/src/g10lib.h
index c580c085..f1af399e 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -238,11 +238,52 @@ void _gcry_burn_stack (int bytes);
#define wipememory2(_ptr,_set,_len) do { \
volatile char *_vptr=(volatile char *)(_ptr); \
size_t _vlen=(_len); \
- while(_vlen) { *_vptr=(_set); _vptr++; _vlen--; } \
+ unsigned char _vset=(_set); \
+ fast_wipememory2(_vptr,_vset,_vlen); \
+ while(_vlen) { *_vptr=(_vset); _vptr++; _vlen--; } \
} while(0)
#define wipememory(_ptr,_len) wipememory2(_ptr,0,_len)
+/* Optimized fast_wipememory2 for i386 and x86-64 architectures. May leave
+ tail bytes unhandled, in which case tail bytes are handled by wipememory2.
+ */
+#if defined(__x86_64__) && __GNUC__ >= 4
+#define fast_wipememory2(_vptr,_vset,_vlen) do { \
+ unsigned long long int _vset8 = _vset; \
+ if (_vlen < 8) \
+ break; \
+ _vset8 *= 0x0101010101010101ULL; \
+ do { \
+ asm volatile("movq %[set], %[ptr]\n\t" \
+ : /**/ \
+ : [set] "Cr" (_vset8), \
+ [ptr] "m" (*_vptr) \
+ : "memory"); \
+ _vlen -= 8; \
+ _vptr += 8; \
+ } while (_vlen >= 8); \
+ } while (0)
+#elif defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4
+#define fast_wipememory2(_ptr,_set,_len) do { \
+ unsigned long _vset4 = _vset; \
+ if (_vlen < 4) \
+ break; \
+ _vset4 *= 0x01010101; \
+ do { \
+ asm volatile("movl %[set], %[ptr]\n\t" \
+ : /**/ \
+ : [set] "Cr" (_vset4), \
+ [ptr] "m" (*_vptr) \
+ : "memory"); \
+ _vlen -= 4; \
+ _vptr += 4; \
+ } while (_vlen >= 4); \
+ } while (0)
+#else
+#define fast_wipememory2(_ptr,_set,_len)
+#endif
+
/* Digit predicates. */