author     Jussi Kivilinna <jussi.kivilinna@iki.fi>   2013-12-17 15:35:38 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>   2013-12-18 17:00:04 +0200
commit     a5c2bbfe0db515d739ab683297903c77b1eec124 (patch)
tree       ef6d9ba8d35b6e621aee58e91431d0fd446e940e /cipher/sha256-ssse3-amd64.S
parent     e4e458465b124e25b6aec7a60174bf1ca32dc5fd (diff)
download   libgcrypt-a5c2bbfe0db515d739ab683297903c77b1eec124.tar.gz
Add AVX and AVX2/BMI2 implementations for SHA-256
* LICENSES: Add 'cipher/sha256-avx-amd64.S' and
  'cipher/sha256-avx2-bmi2-amd64.S'.
* cipher/Makefile.am: Add 'sha256-avx-amd64.S' and
  'sha256-avx2-bmi2-amd64.S'.
* cipher/sha256-avx-amd64.S: New.
* cipher/sha256-avx2-bmi2-amd64.S: New.
* cipher/sha256-ssse3-amd64.S: Use 'lea' instead of 'add' in a few places
  for a tiny speed improvement.
* cipher/sha256.c (USE_AVX, USE_AVX2): New.
  (SHA256_CONTEXT) [USE_AVX, USE_AVX2]: Add 'use_avx' and 'use_avx2'.
  (sha256_init, sha224_init) [USE_AVX, USE_AVX2]: Initialize above new
  context members.
  [USE_AVX] (_gcry_sha256_transform_amd64_avx): New.
  [USE_AVX2] (_gcry_sha256_transform_amd64_avx2): New.
  (transform) [USE_AVX2]: Use AVX2 assembly if enabled.
  (transform) [USE_AVX]: Use AVX assembly if enabled.
* configure.ac: Add 'sha256-avx-amd64.lo' and 'sha256-avx2-bmi2-amd64.lo'.
--

Patch adds fast AVX and AVX2/BMI2 implementations of SHA-256 by Intel
Corporation. The assembly source is licensed under the 3-clause BSD license
and is thus compatible with LGPL2.1+. The original source can be accessed at:
  http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs

The implementation is described in the white paper
  "Fast SHA-256 Implementations on Intel® Architecture Processors"
  http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/sha-256-implementations-paper.html

Note: The AVX implementation uses the SHLD instruction to emulate ROR, since
SHLD is faster on Intel Sandy Bridge. On non-Intel CPUs, however, SHLD is
much slower than ROR, so the AVX implementation is (for now) limited to
Intel CPUs.

Note: The AVX2 implementation also uses the BMI2 instruction 'rorx', hence
the additional HWF flag.

Benchmarks:

 cpu              C-lang     SSSE3      AVX/AVX2   C vs AVX/AVX2  SSSE3 vs AVX/AVX2
 Intel i5-4570    13.86 c/B  10.27 c/B   8.70 c/B  1.59x          1.18x
 Intel i5-2450M   17.25 c/B  12.36 c/B  10.31 c/B  1.67x          1.19x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
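The SHLD and 'rorx' notes above rest on a simple identity: rotating a 32-bit
word right by n gives the same result as rotating it left by 32-n, which is
what 'shld r, r, 32-n' computes when both operands name the same register,
and what 'rorx' produces directly without touching the flags. The small C
check below is added here purely as illustration; the ror32/rol32 helper
names are mine, and nothing in it comes from the patch:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Right-rotate of a 32-bit word, as used throughout SHA-256. */
  static uint32_t ror32 (uint32_t x, unsigned n)
  {
    return (x >> n) | (x << (32 - n));
  }

  /* Left-rotate; 'shld r, r, 32-n' with both operands equal computes this. */
  static uint32_t rol32 (uint32_t x, unsigned n)
  {
    return (x << n) | (x >> (32 - n));
  }

  int
  main (void)
  {
    uint32_t x = 0x6a09e667;  /* first SHA-256 IV word, used only as sample data */
    unsigned n;

    /* ror by n equals rol by 32-n for every rotate amount SHA-256 uses. */
    for (n = 1; n < 32; n++)
      assert (ror32 (x, n) == rol32 (x, 32 - n));

    printf ("ror(n) == rol(32-n) holds\n");
    return 0;
  }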
Diffstat (limited to 'cipher/sha256-ssse3-amd64.S')
-rw-r--r--  cipher/sha256-ssse3-amd64.S  11
1 file changed, 6 insertions, 5 deletions
diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S
index 9b27f8f7..80b1cec4 100644
--- a/cipher/sha256-ssse3-amd64.S
+++ b/cipher/sha256-ssse3-amd64.S
@@ -206,7 +206,7 @@ a = TMP_
add h, y1 /* h = h + S1 + CH + k + w + S0 */
por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */
or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
- add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
ROTATE_ARGS
movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */
@@ -247,7 +247,7 @@ ROTATE_ARGS
add h, y1 /* h = h + S1 + CH + k + w + S0 */
paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */
or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
- add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
ROTATE_ARGS
movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */
@@ -288,7 +288,7 @@ ROTATE_ARGS
/* compute high s1 */
pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
- add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
ROTATE_ARGS
movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */
@@ -327,7 +327,7 @@ ROTATE_ARGS
and y0, b /* y0 = (a|c)&b */
add h, y1 /* h = h + S1 + CH + k + w + S0 */
or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
- add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
ROTATE_ARGS
rotate_Xs
@@ -362,7 +362,7 @@ rotate_Xs
and y0, b /* y0 = (a|c)&b */
add h, y1 /* h = h + S1 + CH + k + w + S0 */
or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */
- add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */
+ lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
ROTATE_ARGS
.endm
@@ -505,6 +505,7 @@ _gcry_sha256_transform_amd64_ssse3:
pop rbx
mov eax, STACK_SIZE + 5*8
+
ret
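Every hunk above makes the same substitution: 'add h, y0' becomes
'lea h, [h + y0]'. Both write the same 32-bit sum into h, but lea, unlike
add, does not update the flags; beyond that, the commit message claims only
a tiny speed improvement, so any deeper micro-architectural explanation
would be speculation. The context lines also compute MAJ as ((a|c)&b)|(a&c)
rather than the textbook (a&b)^(a&c)^(b&c). The two forms are bit-for-bit
identical, as the small C check below confirms (added for illustration; the
maj_ref/maj_asm helpers are mine, not libgcrypt code):

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Textbook SHA-256 majority function. */
  static uint32_t maj_ref (uint32_t a, uint32_t b, uint32_t c)
  {
    return (a & b) ^ (a & c) ^ (b & c);
  }

  /* The form used in the assembly comments: ((a|c) & b) | (a & c). */
  static uint32_t maj_asm (uint32_t a, uint32_t b, uint32_t c)
  {
    return ((a | c) & b) | (a & c);
  }

  int
  main (void)
  {
    unsigned bits;

    /* MAJ works bitwise, so checking all eight 1-bit input combinations
       (expanded to full words) covers every case. */
    for (bits = 0; bits < 8; bits++)
      {
        uint32_t a = (bits & 1) ? ~0u : 0;
        uint32_t b = (bits & 2) ? ~0u : 0;
        uint32_t c = (bits & 4) ? ~0u : 0;
        assert (maj_ref (a, b, c) == maj_asm (a, b, c));
      }

    printf ("MAJ identity holds\n");
    return 0;
  }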