author     Jussi Kivilinna <jussi.kivilinna@mbnet.fi>    2012-11-29 17:31:03 +0200
committer  Werner Koch <wk@gnupg.org>                    2012-11-29 19:03:24 +0100
commit     9ee9e25f519696d509b1a5c1cc04ab0121e98a51 (patch)
tree       8f98f5846582efd09259c86df8eaabfa3c647cf4 /cipher
parent     6765e0a8618000d3dc7bda035163e0708c43791b (diff)
download   libgcrypt-9ee9e25f519696d509b1a5c1cc04ab0121e98a51.tar.gz
Optimize AES-NI CTR mode.
* cipher/rijndael.c [USE_AESNI] (do_aesni_ctr, do_aesni_ctr_4): Make
handling of 64-bit overflow and carry conditional. Avoid passing the
value '1' from a general-purpose register to a vector register;
generate and use '-1' instead.
--
We only need to handle 64-bit carry in a few special cases that happen
very rarely, so move carry handling to the slow path and only detect
the need for it on the fast path.

Also avoid moving '1' from a general-purpose register to a vector
register, as that can be slow on some CPUs. Instead generate '-1' with
SSE2 instructions and increment the IV by subtraction rather than
addition.

Overall this gives ~8% improvement in speed for AES CTR mode on Intel
Sandy Bridge.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
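To make the trick concrete, here is a minimal standalone sketch of the same counter increment written with SSE2/SSSE3 intrinsics instead of the inline assembly used in rijndael.c. It is illustrative, not the committed code: the function name increment_be_ctr and the scalar memcpy-based carry check are assumptions for this example.

/* Sketch (not libgcrypt code) of the optimized big-endian counter
 * increment.  Compile with -mssse3. */
#include <stdint.h>
#include <string.h>
#include <emmintrin.h>   /* SSE2:  pcmpeqd, psrldq/pslldq, psubq */
#include <tmmintrin.h>   /* SSSE3: pshufb */

/* Byte-reversal mask: turns the big-endian counter into a native
   little-endian 128-bit value and back. */
static const uint8_t be_mask[16] =
  { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };

static void
increment_be_ctr (uint8_t ctr[16])
{
  __m128i mask = _mm_loadu_si128 ((const __m128i *) be_mask);
  __m128i c = _mm_loadu_si128 ((const __m128i *) ctr);

  /* Build -1 entirely inside vector registers: pcmpeqd sets all bits,
     psrldq $8 clears the upper qword, leaving -1 in the lower 64-bit
     lane.  No GPR-to-XMM move of '1' is needed. */
  __m128i minus1 = _mm_cmpeq_epi32 (c, c);
  minus1 = _mm_srli_si128 (minus1, 8);

  c = _mm_shuffle_epi8 (c, mask);  /* counter to little endian */
  c = _mm_sub_epi64 (c, minus1);   /* x - (-1) == x + 1: fast path */

  /* Carry into the upper 64 bits is only possible when the low qword
     of the old counter was all-ones, i.e. both 32-bit words at byte
     offsets 8 and 12 read as 0xffffffff (the cmpl checks below). */
  uint32_t w8, w12;
  memcpy (&w8, ctr + 8, 4);
  memcpy (&w12, ctr + 12, 4);
  if (w8 == 0xffffffffu && w12 == 0xffffffffu)
    {
      /* Slow path: pslldq $8 moves -1 into the upper qword, so a
         second psubq adds the carry to the upper 64 bits. */
      minus1 = _mm_slli_si128 (minus1, 8);
      c = _mm_sub_epi64 (c, minus1);
    }

  c = _mm_shuffle_epi8 (c, mask);  /* back to big endian */
  _mm_storeu_si128 ((__m128i *) ctr, c);
}

The whole increment stays in xmm registers, and the carry branch is taken only once every 2^64 blocks, which is why moving it off the fast path costs essentially nothing.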
Diffstat (limited to 'cipher')
-rw-r--r--  cipher/rijndael.c | 90
1 file changed, 39 insertions, 51 deletions
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index cc7f8d61..6313ab2e 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1015,24 +1015,20 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
asm volatile ("movdqa (%[ctr]), %%xmm0\n\t" /* xmm0, xmm2 := CTR */
"movaps %%xmm0, %%xmm2\n\t"
- "mov $1, %%esi\n\t" /* xmm2++ (big-endian) */
- "movd %%esi, %%xmm1\n\t"
-
- "movl 12(%[ctr]), %%esi\n\t" /* load lower parts of CTR */
- "bswapl %%esi\n\t"
- "movl 8(%[ctr]), %%edi\n\t"
- "bswapl %%edi\n\t"
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
"pshufb %[mask], %%xmm2\n\t"
- "paddq %%xmm1, %%xmm2\n\t"
+ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ (big endian) */
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jnc .Lno_carry%=\n\t"
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "cmpl $0xffffffff, 12(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
- /* swap upper and lower halfs */
- "pshufd $0x4e, %%xmm1, %%xmm1\n\t"
- "paddq %%xmm1, %%xmm2\n\t" /* add carry to upper 64bits */
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "psubq %%xmm1, %%xmm2\n\t" /* add carry to upper 64bits */
".Lno_carry%=:\n\t"
@@ -1085,7 +1081,7 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
[key] "r" (ctx->keyschenc),
[rounds] "g" (ctx->rounds),
[mask] "m" (*be_mask)
- : "%esi", "%edi", "cc", "memory");
+ : "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenclast_xmm1_xmm0
}
@@ -1120,48 +1116,40 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
asm volatile ("movdqa (%[ctr]), %%xmm0\n\t" /* xmm0, xmm2 := CTR */
"movaps %%xmm0, %%xmm2\n\t"
- "mov $1, %%esi\n\t" /* xmm1 := 1 */
- "movd %%esi, %%xmm1\n\t"
-
- "movl 12(%[ctr]), %%esi\n\t" /* load lower parts of CTR */
- "bswapl %%esi\n\t"
- "movl 8(%[ctr]), %%edi\n\t"
- "bswapl %%edi\n\t"
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
"pshufb %[mask], %%xmm2\n\t" /* xmm2 := le(xmm2) */
- "paddq %%xmm1, %%xmm2\n\t" /* xmm2++ */
+ "psubq %%xmm1, %%xmm2\n\t" /* xmm2++ */
"movaps %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
- "paddq %%xmm1, %%xmm3\n\t" /* xmm3++ */
+ "psubq %%xmm1, %%xmm3\n\t" /* xmm3++ */
"movaps %%xmm3, %%xmm4\n\t" /* xmm4 := xmm3 */
- "paddq %%xmm1, %%xmm4\n\t" /* xmm4++ */
+ "psubq %%xmm1, %%xmm4\n\t" /* xmm4++ */
"movaps %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */
- "paddq %%xmm1, %%xmm5\n\t" /* xmm5++ */
-
- /* swap upper and lower halfs */
- "pshufd $0x4e, %%xmm1, %%xmm1\n\t"
-
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jc .Lcarry_xmm2%=\n\t"
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jc .Lcarry_xmm3%=\n\t"
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jc .Lcarry_xmm4%=\n\t"
- "addl $1, %%esi\n\t"
- "adcl $0, %%edi\n\t" /* detect 64bit overflow */
- "jc .Lcarry_xmm5%=\n\t"
- "jmp .Lno_carry%=\n\t"
-
- ".Lcarry_xmm2%=:\n\t"
- "paddq %%xmm1, %%xmm2\n\t"
+ "psubq %%xmm1, %%xmm5\n\t" /* xmm5++ */
+
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "movl 12(%[ctr]), %%esi\n\t"
+ "bswapl %%esi\n\t"
+ "cmpl $0xfffffffc, %%esi\n\t"
+ "jb .Lno_carry%=\n\t" /* no carry */
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "je .Lcarry_xmm5%=\n\t" /* esi == 0xfffffffc */
+ "cmpl $0xfffffffe, %%esi\n\t"
+ "jb .Lcarry_xmm4%=\n\t" /* esi == 0xfffffffd */
+ "je .Lcarry_xmm3%=\n\t" /* esi == 0xfffffffe */
+ /* esi == 0xffffffff */
+
+ "psubq %%xmm1, %%xmm2\n\t"
".Lcarry_xmm3%=:\n\t"
- "paddq %%xmm1, %%xmm3\n\t"
+ "psubq %%xmm1, %%xmm3\n\t"
".Lcarry_xmm4%=:\n\t"
- "paddq %%xmm1, %%xmm4\n\t"
+ "psubq %%xmm1, %%xmm4\n\t"
".Lcarry_xmm5%=:\n\t"
- "paddq %%xmm1, %%xmm5\n\t"
+ "psubq %%xmm1, %%xmm5\n\t"
".Lno_carry%=:\n\t"
"pshufb %[mask], %%xmm2\n\t" /* xmm2 := be(xmm2) */
@@ -1170,7 +1158,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
"pshufb %[mask], %%xmm5\n\t" /* xmm5 := be(xmm5) */
"movdqa %%xmm5, (%[ctr])\n" /* Update CTR. */
- "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
"pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
"pxor %%xmm1, %%xmm2\n\t" /* xmm2 ^= key[0] */
"pxor %%xmm1, %%xmm3\n\t" /* xmm3 ^= key[0] */
@@ -1275,7 +1263,7 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
[key] "r" (ctx->keyschenc),
[rounds] "g" (ctx->rounds),
[mask] "m" (*be_mask)
- : "%esi", "%edi", "cc", "memory");
+ : "%esi", "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenc_xmm1_xmm2
#undef aesenc_xmm1_xmm3
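In the four-block version, the je/jb chain above (.Lcarry_xmm2 through .Lcarry_xmm5) decides how many of the four new counters need the 64-bit carry applied. The following scalar model of that dispatch is illustrative only; the helper name first_carrying_block is an assumption, not libgcrypt code.

/* Scalar model of the carry dispatch in do_aesni_ctr_4: given the low
   64 bits of the big-endian counter before the four increments, return
   the 1-based index of the first new counter whose low qword wraps, or
   5 if none does. */
#include <stdint.h>

static int
first_carrying_block (uint64_t ctr_low64)
{
  /* Mirrors: cmpl $0xffffffff, 8(%[ctr]); jne .Lno_carry */
  if ((uint32_t) (ctr_low64 >> 32) != 0xffffffffu)
    return 5;

  /* Mirrors the bswapped load of 12(%[ctr]) into %esi. */
  uint32_t lo = (uint32_t) ctr_low64;

  /* Mirrors: cmpl $0xfffffffc, %esi; jb .Lno_carry */
  if (lo < 0xfffffffcu)
    return 5;

  /* lo is in {0xfffffffc..0xffffffff}: counters k..4 wrap, and the
     jump chain falls through starting at .Lcarry_xmm(1+k). */
  return (int) (0x100000000ull - lo);
}

For example, lo == 0xfffffffc yields 4 (only xmm5 carries, matching "je .Lcarry_xmm5"), while lo == 0xffffffff yields 1, so all four psubq carry fixups fall through in sequence.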