author     Jussi Kivilinna <jussi.kivilinna@iki.fi>    2015-04-18 17:41:34 +0300
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>    2015-04-18 17:41:34 +0300
commit     305cc878d395475c46b4ef52f4764bd0c85bf8ac (patch)
tree       fd146e81575e8b0e68ddc51237c7acb00df79c0b /cipher/rijndael-aesni.c
parent     fe38d3815b4cd203cd529949e244aca80d32897f (diff)
download   libgcrypt-305cc878d395475c46b4ef52f4764bd0c85bf8ac.tar.gz
Add OCB bulk crypt/auth functions for AES/AES-NI
* cipher/cipher-internal.h (gcry_cipher_handle): Add bulk.ocb_crypt
and bulk.ocb_auth.
(_gcry_cipher_ocb_get_l): New prototype.
* cipher/cipher-ocb.c (get_l): Rename to ...
(_gcry_cipher_ocb_get_l): ... this.
(_gcry_cipher_ocb_authenticate, ocb_crypt): Use bulk function when
available.
* cipher/cipher.c (_gcry_cipher_open_internal): Setup OCB bulk
functions for AES.
* cipher/rijndael-aesni.c (get_l, aesni_ocb_enc, aesni_ocb_dec)
(_gcry_aes_aesni_ocb_crypt, _gcry_aes_aesni_ocb_auth): New.
* cipher/rijndael.c [USE_AESNI] (_gcry_aes_aesni_ocb_crypt)
(_gcry_aes_aesni_ocb_auth): New prototypes.
(_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth): New.
* src/cipher.h (_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth): New
prototypes.
* tests/basic.c (check_ocb_cipher_largebuf): New.
(check_ocb_cipher): Add large buffer encryption/decryption test.
--
Patch adds bulk encryption/decryption/authentication code for AES-NI
accelerated AES.
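For illustration only (not part of the patch), the bulk path is reached
through the normal public API. A minimal sketch, with made-up key/nonce/data
values and error checking omitted; with AES-NI available, the multi-block
authenticate/encrypt calls below end up in the new bulk.ocb_auth and
bulk.ocb_crypt functions:

  #include <gcrypt.h>

  static void
  ocb_sketch (void)   /* hypothetical example function */
  {
    gcry_cipher_hd_t hd;
    unsigned char key[16] = { 0 };    /* illustrative key */
    unsigned char nonce[12] = { 0 };  /* illustrative 96-bit nonce */
    unsigned char aad[64] = { 0 };    /* illustrative additional data */
    unsigned char buf[4096] = { 0 };  /* plaintext, encrypted in place */
    unsigned char tag[16];

    gcry_check_version (NULL);        /* initialize libgcrypt */

    gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_OCB, 0);
    gcry_cipher_setkey (hd, key, sizeof key);
    gcry_cipher_setiv (hd, nonce, sizeof nonce);

    /* Multi-block AAD and data use the OCB bulk functions when set.  */
    gcry_cipher_authenticate (hd, aad, sizeof aad);
    gcry_cipher_final (hd);           /* mark the last data chunk */
    gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
    gcry_cipher_gettag (hd, tag, sizeof tag);

    gcry_cipher_close (hd);
  }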
Benchmark on Intel i5-4570 (3200 MHz, turbo off):
Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        OCB enc |      2.12 ns/B     449.7 MiB/s      6.79 c/B
        OCB dec |      2.12 ns/B     449.6 MiB/s      6.79 c/B
       OCB auth |      2.07 ns/B     459.9 MiB/s      6.64 c/B

After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        OCB enc |     0.292 ns/B    3262.5 MiB/s     0.935 c/B
        OCB dec |     0.297 ns/B    3212.2 MiB/s     0.950 c/B
       OCB auth |     0.260 ns/B    3666.1 MiB/s     0.832 c/B
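(The cycles/byte column follows from nanosecs/byte at the fixed 3200 MHz
clock, e.g. 2.12 ns/B * 3.2 cycles/ns = 6.79 c/B; the bulk code is thus
roughly 7x faster for encryption/decryption and about 8x faster for
authentication.)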
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/rijndael-aesni.c')
-rw-r--r--  cipher/rijndael-aesni.c  483
1 file changed, 483 insertions, 0 deletions
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 3c367cec..9a816021 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -29,6 +29,7 @@
 #include "bufhelp.h"
 #include "cipher-selftest.h"
 #include "rijndael-internal.h"
+#include "./cipher-internal.h"
 
 
 #ifdef USE_AESNI
@@ -1251,4 +1252,486 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
   aesni_cleanup_2_6 ();
 }
 
+
+static inline const unsigned char *
+get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
+       unsigned char *ctr)
+{
+  const unsigned char *l;
+  unsigned int ntz;
+
+  if (i & 0xffffffffU)
+    {
+      asm ("rep;bsf %k[low], %k[ntz]\n\t"
+           : [ntz] "=r" (ntz)
+           : [low] "r" (i & 0xffffffffU)
+           : "cc");
+    }
+  else
+    {
+      if (OCB_L_TABLE_SIZE < 32)
+        {
+          ntz = 32;
+        }
+      else if (i)
+        {
+          asm ("rep;bsf %k[high], %k[ntz]\n\t"
+               : [ntz] "=r" (ntz)
+               : [high] "r" (i >> 32)
+               : "cc");
+          ntz += 32;
+        }
+      else
+        {
+          ntz = 64;
+        }
+    }
+
+  if (ntz < OCB_L_TABLE_SIZE)
+    {
+      l = c->u_mode.ocb.L[ntz];
+    }
+  else
+    {
+      /* Store Offset & Checksum before calling external function */
+      asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+                    "movdqu %%xmm6, %[ctr]\n\t"
+                    : [iv] "=m" (*iv),
+                      [ctr] "=m" (*ctr)
+                    :
+                    : "memory" );
+
+      l = _gcry_cipher_ocb_get_l (c, l_tmp, i);
+
+      /* Restore Offset & Checksum */
+      asm volatile ("movdqu %[iv], %%xmm5\n\t"
+                    "movdqu %[ctr], %%xmm6\n\t"
+                    : /* No output */
+                    : [iv] "m" (*iv),
+                      [ctr] "m" (*ctr)
+                    : "memory" );
+    }
+
+  return l;
+}
+
+
+static void
+aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
+               const void *inbuf_arg, size_t nblocks)
+{
+  union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  u64 n = c->u_mode.ocb.data_nblocks;
+
+  aesni_prepare ();
+
+  /* Preload Offset and Checksum */
+  asm volatile ("movdqu %[iv], %%xmm5\n\t"
+                "movdqu %[ctr], %%xmm6\n\t"
+                : /* No output */
+                : [iv] "m" (*c->u_iv.iv),
+                  [ctr] "m" (*c->u_ctr.ctr)
+                : "memory" );
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
+    {
+      const unsigned char *l[4];
+
+      /* l_tmp will be used only every 65536-th block. */
+      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l[1] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l[2] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l[3] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Checksum_i = Checksum_{i-1} xor P_i */
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+      asm volatile ("movdqu %[l0], %%xmm0\n\t"
+                    "movdqu %[inbuf0], %%xmm1\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm1, %%xmm6\n\t"
+                    "pxor %%xmm5, %%xmm1\n\t"
+                    "movdqu %%xmm5, %[outbuf0]\n\t"
+                    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+                    : [l0] "m" (*l[0]),
+                      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+                    : "memory" );
+      asm volatile ("movdqu %[l1], %%xmm0\n\t"
+                    "movdqu %[inbuf1], %%xmm2\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm2, %%xmm6\n\t"
+                    "pxor %%xmm5, %%xmm2\n\t"
+                    "movdqu %%xmm5, %[outbuf1]\n\t"
+                    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+                    : [l1] "m" (*l[1]),
+                      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+                    : "memory" );
+      asm volatile ("movdqu %[l2], %%xmm0\n\t"
+                    "movdqu %[inbuf2], %%xmm3\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm3, %%xmm6\n\t"
+                    "pxor %%xmm5, %%xmm3\n\t"
+                    "movdqu %%xmm5, %[outbuf2]\n\t"
+                    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+                    : [l2] "m" (*l[2]),
+                      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                    : "memory" );
+      asm volatile ("movdqu %[l3], %%xmm0\n\t"
+                    "movdqu %[inbuf3], %%xmm4\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm4, %%xmm6\n\t"
+                    "pxor %%xmm5, %%xmm4\n\t"
+                    :
+                    : [l3] "m" (*l[3]),
+                      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+                    : "memory" );
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+                    "pxor %%xmm0, %%xmm1\n\t"
+                    "movdqu %%xmm1, %[outbuf0]\n\t"
+                    "movdqu %[outbuf1],%%xmm0\n\t"
+                    "pxor %%xmm0, %%xmm2\n\t"
+                    "movdqu %%xmm2, %[outbuf1]\n\t"
+                    "movdqu %[outbuf2],%%xmm0\n\t"
+                    "pxor %%xmm0, %%xmm3\n\t"
+                    "movdqu %%xmm3, %[outbuf2]\n\t"
+                    "pxor %%xmm5, %%xmm4\n\t"
+                    "movdqu %%xmm4, %[outbuf3]\n\t"
+                    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+                      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+                      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+                      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+                    :
+                    : "memory" );
+
+      outbuf += 4*BLOCKSIZE;
+      inbuf += 4*BLOCKSIZE;
+    }
+  for ( ;nblocks; nblocks-- )
+    {
+      const unsigned char *l;
+
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Checksum_i = Checksum_{i-1} xor P_i */
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+      asm volatile ("movdqu %[l], %%xmm1\n\t"
+                    "movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor %%xmm1, %%xmm5\n\t"
+                    "pxor %%xmm0, %%xmm6\n\t"
+                    "pxor %%xmm5, %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [inbuf] "m" (*inbuf)
+                    : "memory" );
+
+      do_aesni_enc (ctx);
+
+      asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    :
+                    : "memory" );
+
+      inbuf += BLOCKSIZE;
+      outbuf += BLOCKSIZE;
+    }
+
+  c->u_mode.ocb.data_nblocks = n;
+  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+                "movdqu %%xmm6, %[ctr]\n\t"
+                : [iv] "=m" (*c->u_iv.iv),
+                  [ctr] "=m" (*c->u_ctr.ctr)
+                :
+                : "memory" );
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+}
+
+
+static void
+aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
+               const void *inbuf_arg, size_t nblocks)
+{
+  union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  u64 n = c->u_mode.ocb.data_nblocks;
+
+  aesni_prepare ();
+
+  /* Preload Offset and Checksum */
+  asm volatile ("movdqu %[iv], %%xmm5\n\t"
+                "movdqu %[ctr], %%xmm6\n\t"
+                : /* No output */
+                : [iv] "m" (*c->u_iv.iv),
+                  [ctr] "m" (*c->u_ctr.ctr)
+                : "memory" );
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
+    {
+      const unsigned char *l[4];
+
+      /* l_tmp will be used only every 65536-th block. */
+      l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l[1] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l[2] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+      l[3] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+      /* Checksum_i = Checksum_{i-1} xor P_i */
+      asm volatile ("movdqu %[l0], %%xmm0\n\t"
+                    "movdqu %[inbuf0], %%xmm1\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm1\n\t"
+                    "movdqu %%xmm5, %[outbuf0]\n\t"
+                    : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+                    : [l0] "m" (*l[0]),
+                      [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+                    : "memory" );
+      asm volatile ("movdqu %[l1], %%xmm0\n\t"
+                    "movdqu %[inbuf1], %%xmm2\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm2\n\t"
+                    "movdqu %%xmm5, %[outbuf1]\n\t"
+                    : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+                    : [l1] "m" (*l[1]),
+                      [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+                    : "memory" );
+      asm volatile ("movdqu %[l2], %%xmm0\n\t"
+                    "movdqu %[inbuf2], %%xmm3\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm3\n\t"
+                    "movdqu %%xmm5, %[outbuf2]\n\t"
+                    : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+                    : [l2] "m" (*l[2]),
+                      [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+                    : "memory" );
+      asm volatile ("movdqu %[l3], %%xmm0\n\t"
+                    "movdqu %[inbuf3], %%xmm4\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm4\n\t"
+                    :
+                    : [l3] "m" (*l[3]),
+                      [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+                    : "memory" );
+
+      do_aesni_dec_vec4 (ctx);
+
+      asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+                    "pxor %%xmm0, %%xmm1\n\t"
+                    "movdqu %%xmm1, %[outbuf0]\n\t"
+                    "movdqu %[outbuf1],%%xmm0\n\t"
+                    "pxor %%xmm0, %%xmm2\n\t"
+                    "movdqu %%xmm2, %[outbuf1]\n\t"
+                    "movdqu %[outbuf2],%%xmm0\n\t"
+                    "pxor %%xmm0, %%xmm3\n\t"
+                    "movdqu %%xmm3, %[outbuf2]\n\t"
+                    "pxor %%xmm5, %%xmm4\n\t"
+                    "movdqu %%xmm4, %[outbuf3]\n\t"
+                    "pxor %%xmm1, %%xmm6\n\t"
+                    "pxor %%xmm2, %%xmm6\n\t"
+                    "pxor %%xmm3, %%xmm6\n\t"
+                    "pxor %%xmm4, %%xmm6\n\t"
+                    : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+                      [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+                      [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+                      [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+                    :
+                    : "memory" );
+
+      outbuf += 4*BLOCKSIZE;
+      inbuf += 4*BLOCKSIZE;
+    }
+  for ( ;nblocks; nblocks-- )
+    {
+      const unsigned char *l;
+
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+      /* Checksum_i = Checksum_{i-1} xor P_i */
+      asm volatile ("movdqu %[l], %%xmm1\n\t"
+                    "movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor %%xmm1, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [inbuf] "m" (*inbuf)
+                    : "memory" );
+
+      do_aesni_dec (ctx);
+
+      asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+                    "pxor %%xmm0, %%xmm6\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    :
+                    : "memory" );
+
+      inbuf += BLOCKSIZE;
+      outbuf += BLOCKSIZE;
+    }
+
+  c->u_mode.ocb.data_nblocks = n;
+  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+                "movdqu %%xmm6, %[ctr]\n\t"
+                : [iv] "=m" (*c->u_iv.iv),
+                  [ctr] "=m" (*c->u_ctr.ctr)
+                :
+                : "memory" );
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+}
+
+
+void
+_gcry_aes_aesni_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
+                          const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  if (encrypt)
+    aesni_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
+  else
+    aesni_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
+}
+
+
+void
+_gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+                          size_t nblocks)
+{
+  union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  u64 n = c->u_mode.ocb.aad_nblocks;
+
+  aesni_prepare ();
+
+  /* Preload Offset and Sum */
+  asm volatile ("movdqu %[iv], %%xmm5\n\t"
+                "movdqu %[ctr], %%xmm6\n\t"
+                : /* No output */
+                : [iv] "m" (*c->u_mode.ocb.aad_offset),
+                  [ctr] "m" (*c->u_mode.ocb.aad_sum)
+                : "memory" );
+
+  for ( ;nblocks > 3 ; nblocks -= 4 )
+    {
+      const unsigned char *l[4];
+
+      /* l_tmp will be used only every 65536-th block. */
+      l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                   c->u_mode.ocb.aad_sum);
+      l[1] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                   c->u_mode.ocb.aad_sum);
+      l[2] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                   c->u_mode.ocb.aad_sum);
+      l[3] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                   c->u_mode.ocb.aad_sum);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+      asm volatile ("movdqu %[l0], %%xmm0\n\t"
+                    "movdqu %[abuf0], %%xmm1\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm1\n\t"
+                    :
+                    : [l0] "m" (*l[0]),
+                      [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+                    : "memory" );
+      asm volatile ("movdqu %[l1], %%xmm0\n\t"
+                    "movdqu %[abuf1], %%xmm2\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm2\n\t"
+                    :
+                    : [l1] "m" (*l[1]),
+                      [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+                    : "memory" );
+      asm volatile ("movdqu %[l2], %%xmm0\n\t"
+                    "movdqu %[abuf2], %%xmm3\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm3\n\t"
+                    :
+                    : [l2] "m" (*l[2]),
+                      [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+                    : "memory" );
+      asm volatile ("movdqu %[l3], %%xmm0\n\t"
+                    "movdqu %[abuf3], %%xmm4\n\t"
+                    "pxor %%xmm0, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm4\n\t"
+                    :
+                    : [l3] "m" (*l[3]),
+                      [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+                    : "memory" );
+
+      do_aesni_enc_vec4 (ctx);
+
+      asm volatile ("pxor %%xmm1, %%xmm6\n\t"
+                    "pxor %%xmm2, %%xmm6\n\t"
+                    "pxor %%xmm3, %%xmm6\n\t"
+                    "pxor %%xmm4, %%xmm6\n\t"
+                    :
+                    :
+                    : "memory" );
+
+      abuf += 4*BLOCKSIZE;
+    }
+  for ( ;nblocks; nblocks-- )
+    {
+      const unsigned char *l;
+
+      l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                c->u_mode.ocb.aad_sum);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+      asm volatile ("movdqu %[l], %%xmm1\n\t"
+                    "movdqu %[abuf], %%xmm0\n\t"
+                    "pxor %%xmm1, %%xmm5\n\t"
+                    "pxor %%xmm5, %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [abuf] "m" (*abuf)
+                    : "memory" );
+
+      do_aesni_enc (ctx);
+
+      asm volatile ("pxor %%xmm0, %%xmm6\n\t"
+                    :
+                    :
+                    : "memory" );
+
+      abuf += BLOCKSIZE;
+    }
+
+  c->u_mode.ocb.aad_nblocks = n;
+  asm volatile ("movdqu %%xmm5, %[iv]\n\t"
+                "movdqu %%xmm6, %[ctr]\n\t"
+                : [iv] "=m" (*c->u_mode.ocb.aad_offset),
+                  [ctr] "=m" (*c->u_mode.ocb.aad_sum)
+                :
+                : "memory" );
+
+  aesni_cleanup ();
+  aesni_cleanup_2_6 ();
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+}
+
+
 #endif /* USE_AESNI */