diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-08-10 22:09:56 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-08-10 22:09:56 +0300 |
commit | 49f52c67fb42c0656c8f9af655087f444562ca82 (patch) | |
tree | 2ef935a60649db8d61b3e1f36982788a15a10506 /cipher | |
parent | ce746936b6c210e602d106cfbf45cf60b408d871 (diff) | |
download | libgcrypt-49f52c67fb42c0656c8f9af655087f444562ca82.tar.gz |
Optimize OCB offset calculation
* cipher/cipher-internal.h (ocb_get_l): New.
* cipher/cipher-ocb.c (_gcry_cipher_ocb_authenticate)
(ocb_crypt): Use 'ocb_get_l' instead of '_gcry_cipher_ocb_get_l'.
* cipher/camellia-glue.c (get_l): Remove.
(_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): Precalculate
offset array when block count matches parallel operation size; Use
'ocb_get_l' instead of 'get_l'.
* cipher/rijndael-aesni.c (get_l): Add fast path for 75% most common
offsets.
(aesni_ocb_enc, aesni_ocb_dec, _gcry_aes_aesni_ocb_auth): Precalculate
offset array when block count matches parallel operation size.
* cipher/rijndael-ssse3-amd64.c (get_l): Add fast path for 75% most
common offsets.
* cipher/rijndael.c (_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth): Use
'ocb_get_l' instead of '_gcry_cipher_ocb_get_l'.
* cipher/serpent.c (get_l): Remove.
(_gcry_serpent_ocb_crypt, _gcry_serpent_ocb_auth): Precalculate
offset array when block count matches parallel operation size; Use
'ocb_get_l' instead of 'get_l'.
* cipher/twofish.c (get_l): Remove.
(_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): Use 'ocb_get_l'
instead of 'get_l'.
--
Patch optimizes OCB offset calculation for generic code and
assembly implementations with parallel block processing.
Benchmark of OCB AES-NI on Intel Haswell:
$ tests/bench-slope --cpu-mhz 3201 cipher aes
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
CTR enc | 0.274 ns/B 3483.9 MiB/s 0.876 c/B
CTR dec | 0.273 ns/B 3490.0 MiB/s 0.875 c/B
OCB enc | 0.289 ns/B 3296.1 MiB/s 0.926 c/B
OCB dec | 0.299 ns/B 3189.9 MiB/s 0.957 c/B
OCB auth | 0.260 ns/B 3670.0 MiB/s 0.832 c/B
After:
AES | nanosecs/byte mebibytes/sec cycles/byte
CTR enc | 0.273 ns/B 3489.4 MiB/s 0.875 c/B
CTR dec | 0.273 ns/B 3487.5 MiB/s 0.875 c/B
OCB enc | 0.248 ns/B 3852.8 MiB/s 0.792 c/B
OCB dec | 0.261 ns/B 3659.5 MiB/s 0.834 c/B
OCB auth | 0.227 ns/B 4205.5 MiB/s 0.726 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher')
-rw-r--r-- | cipher/camellia-glue.c | 161 | ||||
-rw-r--r-- | cipher/cipher-internal.h | 20 | ||||
-rw-r--r-- | cipher/cipher-ocb.c | 5 | ||||
-rw-r--r-- | cipher/rijndael-aesni.c | 498 | ||||
-rw-r--r-- | cipher/rijndael-ssse3-amd64.c | 6 | ||||
-rw-r--r-- | cipher/rijndael.c | 24 | ||||
-rw-r--r-- | cipher/serpent.c | 209 | ||||
-rw-r--r-- | cipher/twofish.c | 25 |
8 files changed, 597 insertions(+), 351 deletions(-)
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 99516fc6..2d5dd209 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -604,19 +604,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv, _gcry_burn_stack(burn_stack_depth); } -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) -static inline const unsigned char * -get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i) -{ - unsigned int ntz = _gcry_ctz64 (i); - - if (ntz < OCB_L_TABLE_SIZE) - return c->u_mode.ocb.L[ntz]; - else - return _gcry_cipher_ocb_get_l (c, l_tmp, i); -} -#endif - /* Bulk encryption/decryption of complete blocks in OCB mode. */ size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, @@ -646,17 +633,43 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *Ls[32]; int i; + if (blkn % 32 == 0) + { + for (i = 0; i < 32; i += 8) + { + Ls[i + 0] = c->u_mode.ocb.L[0]; + Ls[i + 1] = c->u_mode.ocb.L[1]; + Ls[i + 2] = c->u_mode.ocb.L[0]; + Ls[i + 3] = c->u_mode.ocb.L[2]; + Ls[i + 4] = c->u_mode.ocb.L[0]; + Ls[i + 5] = c->u_mode.ocb.L[1]; + Ls[i + 6] = c->u_mode.ocb.L[0]; + } + + Ls[7] = c->u_mode.ocb.L[3]; + Ls[15] = c->u_mode.ocb.L[4]; + Ls[23] = c->u_mode.ocb.L[3]; + } + /* Process data in 32 block chunks. */ while (nblocks >= 32) { /* l_tmp will be used only every 65536-th block. 
*/ - for (i = 0; i < 32; i += 4) + if (blkn % 32 == 0) + { + blkn += 32; + Ls[31] = ocb_get_l(c, l_tmp, blkn); + } + else { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + for (i = 0; i < 32; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } if (encrypt) @@ -692,17 +705,41 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *Ls[16]; int i; + if (blkn % 16 == 0) + { + for (i = 0; i < 16; i += 8) + { + Ls[i + 0] = c->u_mode.ocb.L[0]; + Ls[i + 1] = c->u_mode.ocb.L[1]; + Ls[i + 2] = c->u_mode.ocb.L[0]; + Ls[i + 3] = c->u_mode.ocb.L[2]; + Ls[i + 4] = c->u_mode.ocb.L[0]; + Ls[i + 5] = c->u_mode.ocb.L[1]; + Ls[i + 6] = c->u_mode.ocb.L[0]; + } + + Ls[7] = c->u_mode.ocb.L[3]; + } + /* Process data in 16 block chunks. */ while (nblocks >= 16) { /* l_tmp will be used only every 65536-th block. 
*/ - for (i = 0; i < 16; i += 4) + if (blkn % 16 == 0) { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + blkn += 16; + Ls[15] = ocb_get_l(c, l_tmp, blkn); + } + else + { + for (i = 0; i < 16; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } if (encrypt) @@ -768,17 +805,43 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, const void *Ls[32]; int i; + if (blkn % 32 == 0) + { + for (i = 0; i < 32; i += 8) + { + Ls[i + 0] = c->u_mode.ocb.L[0]; + Ls[i + 1] = c->u_mode.ocb.L[1]; + Ls[i + 2] = c->u_mode.ocb.L[0]; + Ls[i + 3] = c->u_mode.ocb.L[2]; + Ls[i + 4] = c->u_mode.ocb.L[0]; + Ls[i + 5] = c->u_mode.ocb.L[1]; + Ls[i + 6] = c->u_mode.ocb.L[0]; + } + + Ls[7] = c->u_mode.ocb.L[3]; + Ls[15] = c->u_mode.ocb.L[4]; + Ls[23] = c->u_mode.ocb.L[3]; + } + /* Process data in 32 block chunks. */ while (nblocks >= 32) { /* l_tmp will be used only every 65536-th block. 
*/ - for (i = 0; i < 32; i += 4) + if (blkn % 32 == 0) { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + blkn += 32; + Ls[31] = ocb_get_l(c, l_tmp, blkn); + } + else + { + for (i = 0; i < 32; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, @@ -809,17 +872,41 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, const void *Ls[16]; int i; + if (blkn % 16 == 0) + { + for (i = 0; i < 16; i += 8) + { + Ls[i + 0] = c->u_mode.ocb.L[0]; + Ls[i + 1] = c->u_mode.ocb.L[1]; + Ls[i + 2] = c->u_mode.ocb.L[0]; + Ls[i + 3] = c->u_mode.ocb.L[2]; + Ls[i + 4] = c->u_mode.ocb.L[0]; + Ls[i + 5] = c->u_mode.ocb.L[1]; + Ls[i + 6] = c->u_mode.ocb.L[0]; + } + + Ls[7] = c->u_mode.ocb.L[3]; + } + /* Process data in 16 block chunks. */ while (nblocks >= 16) { /* l_tmp will be used only every 65536-th block. 
*/ - for (i = 0; i < 16; i += 4) + if (blkn % 16 == 0) + { + blkn += 16; + Ls[15] = ocb_get_l(c, l_tmp, blkn); + } + else { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + for (i = 0; i < 16; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index bb86d376..29c6f338 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -448,4 +448,24 @@ const unsigned char *_gcry_cipher_ocb_get_l /* */ (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 n); +/* Inline version of _gcry_cipher_ocb_get_l, with hard-coded fast paths for + most common cases. */ +static inline const unsigned char * +ocb_get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 n) +{ + if (n & 1) + return c->u_mode.ocb.L[0]; + else if (n & 2) + return c->u_mode.ocb.L[1]; + else + { + unsigned int ntz = _gcry_ctz64 (n); + + if (ntz < OCB_L_TABLE_SIZE) + return c->u_mode.ocb.L[ntz]; + else + return _gcry_cipher_ocb_get_l (c, l_tmp, n); + } +} + #endif /*G10_CIPHER_INTERNAL_H*/ diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c index 096975a5..a3a2c9bb 100644 --- a/cipher/cipher-ocb.c +++ b/cipher/cipher-ocb.c @@ -280,7 +280,7 @@ _gcry_cipher_ocb_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf, /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ buf_xor_1 (c->u_mode.ocb.aad_offset, - _gcry_cipher_ocb_get_l (c, l_tmp, c->u_mode.ocb.aad_nblocks), + ocb_get_l (c, l_tmp, c->u_mode.ocb.aad_nblocks), OCB_BLOCK_LEN); /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, OCB_BLOCK_LEN); @@ -392,8 +392,7 @@ ocb_crypt 
(gcry_cipher_hd_t c, int encrypt, /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ buf_xor_1 (c->u_iv.iv, - _gcry_cipher_ocb_get_l (c, l_tmp, - c->u_mode.ocb.data_nblocks), + ocb_get_l (c, l_tmp, c->u_mode.ocb.data_nblocks), OCB_BLOCK_LEN); /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ buf_xor (outbuf, c->u_iv.iv, inbuf, OCB_BLOCK_LEN); diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 910bc681..882cc79a 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -1307,7 +1307,11 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv, const unsigned char *l; unsigned int ntz; - if (i & 0xffffffffU) + if (i & 1) + return c->u_mode.ocb.L[0]; + else if (i & 2) + return c->u_mode.ocb.L[1]; + else if (i & 0xffffffffU) { asm ("rep;bsf %k[low], %k[ntz]\n\t" : [ntz] "=r" (ntz) @@ -1372,6 +1376,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; + const unsigned char *l[4] = {}; aesni_prepare_2_6_variable; aesni_prepare (); @@ -1385,87 +1390,103 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, [ctr] "m" (*c->u_ctr.ctr) : "memory" ); - for ( ;nblocks > 3 ; nblocks -= 4 ) + if (nblocks > 3) { - const unsigned char *l[4]; - - /* l_tmp will be used only every 65536-th block. 
*/ - l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); - l[1] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); - l[2] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); - l[3] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); - - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ - /* Checksum_i = Checksum_{i-1} xor P_i */ - /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ - asm volatile ("movdqu %[l0], %%xmm0\n\t" - "movdqu %[inbuf0], %%xmm1\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm1, %%xmm6\n\t" - "pxor %%xmm5, %%xmm1\n\t" - "movdqu %%xmm5, %[outbuf0]\n\t" - : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0] "m" (*l[0]), - [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l1], %%xmm0\n\t" - "movdqu %[inbuf1], %%xmm2\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm2, %%xmm6\n\t" - "pxor %%xmm5, %%xmm2\n\t" - "movdqu %%xmm5, %[outbuf1]\n\t" - : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) - : [l1] "m" (*l[1]), - [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l2], %%xmm0\n\t" - "movdqu %[inbuf2], %%xmm3\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm3, %%xmm6\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "movdqu %%xmm5, %[outbuf2]\n\t" - : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) - : [l2] "m" (*l[2]), - [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l3], %%xmm0\n\t" - "movdqu %[inbuf3], %%xmm4\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm4, %%xmm6\n\t" - "pxor %%xmm5, %%xmm4\n\t" - : - : [l3] "m" (*l[3]), - [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) - : "memory" ); - - do_aesni_enc_vec4 (ctx); - - asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" - "pxor %%xmm0, %%xmm1\n\t" - "movdqu %%xmm1, %[outbuf0]\n\t" - "movdqu %[outbuf1],%%xmm0\n\t" - "pxor %%xmm0, %%xmm2\n\t" - "movdqu %%xmm2, %[outbuf1]\n\t" - "movdqu %[outbuf2],%%xmm0\n\t" - "pxor %%xmm0, %%xmm3\n\t" - "movdqu %%xmm3, %[outbuf2]\n\t" - "pxor %%xmm5, %%xmm4\n\t" - "movdqu %%xmm4, %[outbuf3]\n\t" - : 
[outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), - [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), - [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), - [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) - : - : "memory" ); - - outbuf += 4*BLOCKSIZE; - inbuf += 4*BLOCKSIZE; + if (n % 4 == 0) + { + l[0] = c->u_mode.ocb.L[0]; + l[1] = c->u_mode.ocb.L[1]; + l[2] = c->u_mode.ocb.L[0]; + } + + for ( ;nblocks > 3 ; nblocks -= 4 ) + { + /* l_tmp will be used only every 65536-th block. */ + if (n % 4 == 0) + { + n += 4; + l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr); + } + else + { + l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr); + l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr); + l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr); + l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr); + n += 4; + } + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + asm volatile ("movdqu %[l0], %%xmm0\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm1, %%xmm6\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm5, %[outbuf0]\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) + : [l0] "m" (*l[0]), + [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l1], %%xmm0\n\t" + "movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm2, %%xmm6\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqu %%xmm5, %[outbuf1]\n\t" + : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) + : [l1] "m" (*l[1]), + [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l2], %%xmm0\n\t" + "movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm5, %[outbuf2]\n\t" + : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) + : [l2] "m" (*l[2]), + [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l3], %%xmm0\n\t" + "movdqu 
%[inbuf3], %%xmm4\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm4, %%xmm6\n\t" + "pxor %%xmm5, %%xmm4\n\t" + : + : [l3] "m" (*l[3]), + [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) + : "memory" ); + + do_aesni_enc_vec4 (ctx); + + asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "movdqu %%xmm1, %[outbuf0]\n\t" + "movdqu %[outbuf1],%%xmm0\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "movdqu %%xmm2, %[outbuf1]\n\t" + "movdqu %[outbuf2],%%xmm0\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "movdqu %%xmm3, %[outbuf2]\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqu %%xmm4, %[outbuf3]\n\t" + : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), + [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), + [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) + : + : "memory" ); + + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; + } } + for ( ;nblocks; nblocks-- ) { - const unsigned char *l; - - l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); + l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Checksum_i = Checksum_{i-1} xor P_i */ @@ -1476,7 +1497,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg, "pxor %%xmm0, %%xmm6\n\t" "pxor %%xmm5, %%xmm0\n\t" : - : [l] "m" (*l), + : [l] "m" (*l[0]), [inbuf] "m" (*inbuf) : "memory" ); @@ -1516,6 +1537,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; u64 n = c->u_mode.ocb.data_nblocks; + const unsigned char *l[4] = {}; aesni_prepare_2_6_variable; aesni_prepare (); @@ -1529,87 +1551,103 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, [ctr] "m" (*c->u_ctr.ctr) : "memory" ); - for ( ;nblocks > 3 ; nblocks -= 4 ) + if (nblocks > 3) { - const unsigned char *l[4]; - - /* l_tmp will be used only every 65536-th block. 
*/ - l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); - l[1] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); - l[2] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); - l[3] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); - - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ - /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ - /* Checksum_i = Checksum_{i-1} xor P_i */ - asm volatile ("movdqu %[l0], %%xmm0\n\t" - "movdqu %[inbuf0], %%xmm1\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm1\n\t" - "movdqu %%xmm5, %[outbuf0]\n\t" - : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) - : [l0] "m" (*l[0]), - [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l1], %%xmm0\n\t" - "movdqu %[inbuf1], %%xmm2\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm2\n\t" - "movdqu %%xmm5, %[outbuf1]\n\t" - : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) - : [l1] "m" (*l[1]), - [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l2], %%xmm0\n\t" - "movdqu %[inbuf2], %%xmm3\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "movdqu %%xmm5, %[outbuf2]\n\t" - : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) - : [l2] "m" (*l[2]), - [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l3], %%xmm0\n\t" - "movdqu %[inbuf3], %%xmm4\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm4\n\t" - : - : [l3] "m" (*l[3]), - [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) - : "memory" ); - - do_aesni_dec_vec4 (ctx); - - asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" - "pxor %%xmm0, %%xmm1\n\t" - "movdqu %%xmm1, %[outbuf0]\n\t" - "movdqu %[outbuf1],%%xmm0\n\t" - "pxor %%xmm0, %%xmm2\n\t" - "movdqu %%xmm2, %[outbuf1]\n\t" - "movdqu %[outbuf2],%%xmm0\n\t" - "pxor %%xmm0, %%xmm3\n\t" - "movdqu %%xmm3, %[outbuf2]\n\t" - "pxor %%xmm5, %%xmm4\n\t" - "movdqu %%xmm4, %[outbuf3]\n\t" - "pxor %%xmm1, %%xmm6\n\t" - "pxor %%xmm2, %%xmm6\n\t" - "pxor %%xmm3, %%xmm6\n\t" - "pxor %%xmm4, %%xmm6\n\t" - : 
[outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), - [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), - [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), - [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) - : - : "memory" ); - - outbuf += 4*BLOCKSIZE; - inbuf += 4*BLOCKSIZE; + if (n % 4 == 0) + { + l[0] = c->u_mode.ocb.L[0]; + l[1] = c->u_mode.ocb.L[1]; + l[2] = c->u_mode.ocb.L[0]; + } + + for ( ;nblocks > 3 ; nblocks -= 4 ) + { + /* l_tmp will be used only every 65536-th block. */ + if (n % 4 == 0) + { + n += 4; + l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr); + } + else + { + l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr); + l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr); + l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr); + l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr); + n += 4; + } + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + asm volatile ("movdqu %[l0], %%xmm0\n\t" + "movdqu %[inbuf0], %%xmm1\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm5, %[outbuf0]\n\t" + : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)) + : [l0] "m" (*l[0]), + [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l1], %%xmm0\n\t" + "movdqu %[inbuf1], %%xmm2\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqu %%xmm5, %[outbuf1]\n\t" + : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE)) + : [l1] "m" (*l[1]), + [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l2], %%xmm0\n\t" + "movdqu %[inbuf2], %%xmm3\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm5, %[outbuf2]\n\t" + : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)) + : [l2] "m" (*l[2]), + [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l3], %%xmm0\n\t" + "movdqu %[inbuf3], %%xmm4\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm4\n\t" + : + : 
[l3] "m" (*l[3]), + [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)) + : "memory" ); + + do_aesni_dec_vec4 (ctx); + + asm volatile ("movdqu %[outbuf0],%%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "movdqu %%xmm1, %[outbuf0]\n\t" + "movdqu %[outbuf1],%%xmm0\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "movdqu %%xmm2, %[outbuf1]\n\t" + "movdqu %[outbuf2],%%xmm0\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "movdqu %%xmm3, %[outbuf2]\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqu %%xmm4, %[outbuf3]\n\t" + "pxor %%xmm1, %%xmm6\n\t" + "pxor %%xmm2, %%xmm6\n\t" + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm4, %%xmm6\n\t" + : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)), + [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)), + [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)), + [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE)) + : + : "memory" ); + + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; + } } + for ( ;nblocks; nblocks-- ) { - const unsigned char *l; - - l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); + l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ @@ -1619,7 +1657,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg, "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : - : [l] "m" (*l), + : [l] "m" (*l[0]), [inbuf] "m" (*inbuf) : "memory" ); @@ -1670,6 +1708,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, RIJNDAEL_context *ctx = (void *)&c->context.c; const unsigned char *abuf = abuf_arg; u64 n = c->u_mode.ocb.aad_nblocks; + const unsigned char *l[4] = {}; aesni_prepare_2_6_variable; aesni_prepare (); @@ -1683,73 +1722,90 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, [ctr] "m" (*c->u_mode.ocb.aad_sum) : "memory" ); - for ( ;nblocks > 3 ; nblocks -= 4 ) + if (nblocks > 3) { - const unsigned char *l[4]; - - /* l_tmp will be used only every 65536-th block. 
*/ - l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); - l[1] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); - l[2] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); - l[3] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); - - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ - /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ - asm volatile ("movdqu %[l0], %%xmm0\n\t" - "movdqu %[abuf0], %%xmm1\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm1\n\t" - : - : [l0] "m" (*l[0]), - [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l1], %%xmm0\n\t" - "movdqu %[abuf1], %%xmm2\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm2\n\t" - : - : [l1] "m" (*l[1]), - [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l2], %%xmm0\n\t" - "movdqu %[abuf2], %%xmm3\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - : - : [l2] "m" (*l[2]), - [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)) - : "memory" ); - asm volatile ("movdqu %[l3], %%xmm0\n\t" - "movdqu %[abuf3], %%xmm4\n\t" - "pxor %%xmm0, %%xmm5\n\t" - "pxor %%xmm5, %%xmm4\n\t" - : - : [l3] "m" (*l[3]), - [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) - : "memory" ); - - do_aesni_enc_vec4 (ctx); - - asm volatile ("pxor %%xmm1, %%xmm6\n\t" - "pxor %%xmm2, %%xmm6\n\t" - "pxor %%xmm3, %%xmm6\n\t" - "pxor %%xmm4, %%xmm6\n\t" - : - : - : "memory" ); - - abuf += 4*BLOCKSIZE; + if (n % 4 == 0) + { + l[0] = c->u_mode.ocb.L[0]; + l[1] = c->u_mode.ocb.L[1]; + l[2] = c->u_mode.ocb.L[0]; + } + + for ( ;nblocks > 3 ; nblocks -= 4 ) + { + /* l_tmp will be used only every 65536-th block. 
*/ + if (n % 4 == 0) + { + n += 4; + l[3] = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum); + } + else + { + l[0] = get_l(c, l_tmp.x1, n + 1, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum); + l[1] = get_l(c, l_tmp.x1, n + 2, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum); + l[2] = get_l(c, l_tmp.x1, n + 3, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum); + l[3] = get_l(c, l_tmp.x1, n + 4, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum); + n += 4; + } + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + asm volatile ("movdqu %[l0], %%xmm0\n\t" + "movdqu %[abuf0], %%xmm1\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm1\n\t" + : + : [l0] "m" (*l[0]), + [abuf0] "m" (*(abuf + 0 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l1], %%xmm0\n\t" + "movdqu %[abuf1], %%xmm2\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm2\n\t" + : + : [l1] "m" (*l[1]), + [abuf1] "m" (*(abuf + 1 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l2], %%xmm0\n\t" + "movdqu %[abuf2], %%xmm3\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + : + : [l2] "m" (*l[2]), + [abuf2] "m" (*(abuf + 2 * BLOCKSIZE)) + : "memory" ); + asm volatile ("movdqu %[l3], %%xmm0\n\t" + "movdqu %[abuf3], %%xmm4\n\t" + "pxor %%xmm0, %%xmm5\n\t" + "pxor %%xmm5, %%xmm4\n\t" + : + : [l3] "m" (*l[3]), + [abuf3] "m" (*(abuf + 3 * BLOCKSIZE)) + : "memory" ); + + do_aesni_enc_vec4 (ctx); + + asm volatile ("pxor %%xmm1, %%xmm6\n\t" + "pxor %%xmm2, %%xmm6\n\t" + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm4, %%xmm6\n\t" + : + : + : "memory" ); + + abuf += 4*BLOCKSIZE; + } } + for ( ;nblocks; nblocks-- ) { - const unsigned char *l; - - l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum); + l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ 
@@ -1758,7 +1814,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, "pxor %%xmm1, %%xmm5\n\t" "pxor %%xmm5, %%xmm0\n\t" : - : [l] "m" (*l), + : [l] "m" (*l[0]), [abuf] "m" (*abuf) : "memory" ); diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c index 0cdb532d..937d8682 100644 --- a/cipher/rijndael-ssse3-amd64.c +++ b/cipher/rijndael-ssse3-amd64.c @@ -535,7 +535,11 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv, const unsigned char *l; unsigned int ntz; - if (i & 0xffffffffU) + if (i & 1) + return c->u_mode.ocb.L[0]; + else if (i & 2) + return c->u_mode.ocb.L[1]; + else if (i & 0xffffffffU) { asm ("rep;bsf %k[low], %k[ntz]\n\t" : [ntz] "=r" (ntz) diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 4368c6da..eff59c26 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -1246,13 +1246,7 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, for ( ;nblocks; nblocks-- ) { u64 i = ++c->u_mode.ocb.data_nblocks; - unsigned int ntz = _gcry_ctz64 (i); - const unsigned char *l; - - if (ntz < OCB_L_TABLE_SIZE) - l = c->u_mode.ocb.L[ntz]; - else - l = _gcry_cipher_ocb_get_l (c, l_tmp.x1, i); + const unsigned char *l = ocb_get_l(c, l_tmp.x1, i); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE); @@ -1277,13 +1271,7 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, for ( ;nblocks; nblocks-- ) { u64 i = ++c->u_mode.ocb.data_nblocks; - unsigned int ntz = _gcry_ctz64 (i); - const unsigned char *l; - - if (ntz < OCB_L_TABLE_SIZE) - l = c->u_mode.ocb.L[ntz]; - else - l = _gcry_cipher_ocb_get_l (c, l_tmp.x1, i); + const unsigned char *l = ocb_get_l(c, l_tmp.x1, i); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ buf_xor_1 (c->u_iv.iv, l, BLOCKSIZE); @@ -1343,13 +1331,7 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) for ( ;nblocks; nblocks-- ) { u64 i = ++c->u_mode.ocb.aad_nblocks; - unsigned int ntz = _gcry_ctz64 (i); - 
const unsigned char *l; - - if (ntz < OCB_L_TABLE_SIZE) - l = c->u_mode.ocb.L[ntz]; - else - l = _gcry_cipher_ocb_get_l (c, l_tmp.x1, i); + const unsigned char *l = ocb_get_l(c, l_tmp.x1, i); /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ buf_xor_1 (c->u_mode.ocb.aad_offset, l, BLOCKSIZE); diff --git a/cipher/serpent.c b/cipher/serpent.c index 0a54a17e..a47a1b77 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -1226,19 +1226,6 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv, _gcry_burn_stack(burn_stack_depth); } -#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON) -static inline const unsigned char * -get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i) -{ - unsigned int ntz = _gcry_ctz64 (i); - - if (ntz < OCB_L_TABLE_SIZE) - return c->u_mode.ocb.L[ntz]; - else - return _gcry_cipher_ocb_get_l (c, l_tmp, i); -} -#endif - /* Bulk encryption/decryption of complete blocks in OCB mode. */ size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, @@ -1265,17 +1252,41 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *Ls[16]; int i; + if (blkn % 16 == 0) + { + for (i = 0; i < 16; i += 8) + { + Ls[i + 0] = c->u_mode.ocb.L[0]; + Ls[i + 1] = c->u_mode.ocb.L[1]; + Ls[i + 2] = c->u_mode.ocb.L[0]; + Ls[i + 3] = c->u_mode.ocb.L[2]; + Ls[i + 4] = c->u_mode.ocb.L[0]; + Ls[i + 5] = c->u_mode.ocb.L[1]; + Ls[i + 6] = c->u_mode.ocb.L[0]; + } + + Ls[7] = c->u_mode.ocb.L[3]; + } + /* Process data in 16 block chunks. */ while (nblocks >= 16) { /* l_tmp will be used only every 65536-th block. 
*/ - for (i = 0; i < 16; i += 4) + if (blkn % 16 == 0) { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + blkn += 16; + Ls[15] = ocb_get_l(c, l_tmp, blkn); + } + else + { + for (i = 0; i < 16; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } if (encrypt) @@ -1308,17 +1319,36 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *Ls[8]; int i; + if (blkn % 8 == 0) + { + Ls[0] = c->u_mode.ocb.L[0]; + Ls[1] = c->u_mode.ocb.L[1]; + Ls[2] = c->u_mode.ocb.L[0]; + Ls[3] = c->u_mode.ocb.L[2]; + Ls[4] = c->u_mode.ocb.L[0]; + Ls[5] = c->u_mode.ocb.L[1]; + Ls[6] = c->u_mode.ocb.L[0]; + } + /* Process data in 8 block chunks. */ while (nblocks >= 8) { /* l_tmp will be used only every 65536-th block. */ - for (i = 0; i < 8; i += 4) + if (blkn % 8 == 0) { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + blkn += 8; + Ls[7] = ocb_get_l(c, l_tmp, blkn); + } + else + { + for (i = 0; i < 8; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } if (encrypt) @@ -1352,17 +1382,36 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *Ls[8]; int i; + if (blkn % 8 == 0) + { + Ls[0] = c->u_mode.ocb.L[0]; + Ls[1] = c->u_mode.ocb.L[1]; + Ls[2] = c->u_mode.ocb.L[0]; + Ls[3] = c->u_mode.ocb.L[2]; + Ls[4] = c->u_mode.ocb.L[0]; + Ls[5] = c->u_mode.ocb.L[1]; + Ls[6] = c->u_mode.ocb.L[0]; + } + /* Process data in 8 block chunks. 
*/ while (nblocks >= 8) { /* l_tmp will be used only every 65536-th block. */ - for (i = 0; i < 8; i += 4) + if (blkn % 8 == 0) { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + blkn += 8; + Ls[7] = ocb_get_l(c, l_tmp, blkn); + } + else + { + for (i = 0; i < 8; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } if (encrypt) @@ -1424,17 +1473,41 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, const void *Ls[16]; int i; + if (blkn % 16 == 0) + { + for (i = 0; i < 16; i += 8) + { + Ls[i + 0] = c->u_mode.ocb.L[0]; + Ls[i + 1] = c->u_mode.ocb.L[1]; + Ls[i + 2] = c->u_mode.ocb.L[0]; + Ls[i + 3] = c->u_mode.ocb.L[2]; + Ls[i + 4] = c->u_mode.ocb.L[0]; + Ls[i + 5] = c->u_mode.ocb.L[1]; + Ls[i + 6] = c->u_mode.ocb.L[0]; + } + + Ls[7] = c->u_mode.ocb.L[3]; + } + /* Process data in 16 block chunks. */ while (nblocks >= 16) { /* l_tmp will be used only every 65536-th block. 
*/ - for (i = 0; i < 16; i += 4) + if (blkn % 16 == 0) + { + blkn += 16; + Ls[15] = ocb_get_l(c, l_tmp, blkn); + } + else { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + for (i = 0; i < 16; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, @@ -1462,17 +1535,36 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, const void *Ls[8]; int i; + if (blkn % 8 == 0) + { + Ls[0] = c->u_mode.ocb.L[0]; + Ls[1] = c->u_mode.ocb.L[1]; + Ls[2] = c->u_mode.ocb.L[0]; + Ls[3] = c->u_mode.ocb.L[2]; + Ls[4] = c->u_mode.ocb.L[0]; + Ls[5] = c->u_mode.ocb.L[1]; + Ls[6] = c->u_mode.ocb.L[0]; + } + /* Process data in 8 block chunks. */ while (nblocks >= 8) { /* l_tmp will be used only every 65536-th block. 
*/ - for (i = 0; i < 8; i += 4) + if (blkn % 8 == 0) + { + blkn += 8; + Ls[7] = ocb_get_l(c, l_tmp, blkn); + } + else { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + for (i = 0; i < 8; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, @@ -1501,17 +1593,36 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, const void *Ls[8]; int i; + if (blkn % 8 == 0) + { + Ls[0] = c->u_mode.ocb.L[0]; + Ls[1] = c->u_mode.ocb.L[1]; + Ls[2] = c->u_mode.ocb.L[0]; + Ls[3] = c->u_mode.ocb.L[2]; + Ls[4] = c->u_mode.ocb.L[0]; + Ls[5] = c->u_mode.ocb.L[1]; + Ls[6] = c->u_mode.ocb.L[0]; + } + /* Process data in 8 block chunks. */ while (nblocks >= 8) { /* l_tmp will be used only every 65536-th block. 
*/ - for (i = 0; i < 8; i += 4) + if (blkn % 8 == 0) + { + blkn += 8; + Ls[7] = ocb_get_l(c, l_tmp, blkn); + } + else { - Ls[i + 0] = get_l(c, l_tmp, blkn + 1); - Ls[i + 1] = get_l(c, l_tmp, blkn + 2); - Ls[i + 2] = get_l(c, l_tmp, blkn + 3); - Ls[i + 3] = get_l(c, l_tmp, blkn + 4); - blkn += 4; + for (i = 0; i < 8; i += 4) + { + Ls[i + 0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = ocb_get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = ocb_get_l(c, l_tmp, blkn + 4); + blkn += 4; + } } _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, diff --git a/cipher/twofish.c b/cipher/twofish.c index 11e60a74..7f361c99 100644 --- a/cipher/twofish.c +++ b/cipher/twofish.c @@ -1247,19 +1247,6 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, _gcry_burn_stack(burn_stack_depth); } -#ifdef USE_AMD64_ASM -static inline const unsigned char * -get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i) -{ - unsigned int ntz = _gcry_ctz64 (i); - - if (ntz < OCB_L_TABLE_SIZE) - return c->u_mode.ocb.L[ntz]; - else - return _gcry_cipher_ocb_get_l (c, l_tmp, i); -} -#endif - /* Bulk encryption/decryption of complete blocks in OCB mode. */ size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, @@ -1280,9 +1267,9 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, while (nblocks >= 3) { /* l_tmp will be used only every 65536-th block. */ - Ls[0] = get_l(c, l_tmp, blkn + 1); - Ls[1] = get_l(c, l_tmp, blkn + 2); - Ls[2] = get_l(c, l_tmp, blkn + 3); + Ls[0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[2] = ocb_get_l(c, l_tmp, blkn + 3); blkn += 3; if (encrypt) @@ -1339,9 +1326,9 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, while (nblocks >= 3) { /* l_tmp will be used only every 65536-th block. 
*/ - Ls[0] = get_l(c, l_tmp, blkn + 1); - Ls[1] = get_l(c, l_tmp, blkn + 2); - Ls[2] = get_l(c, l_tmp, blkn + 3); + Ls[0] = ocb_get_l(c, l_tmp, blkn + 1); + Ls[1] = ocb_get_l(c, l_tmp, blkn + 2); + Ls[2] = ocb_get_l(c, l_tmp, blkn + 3); blkn += 3; twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, |