author    Jussi Kivilinna <jussi.kivilinna@iki.fi>    2015-08-11 07:22:16 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>    2015-08-12 17:41:42 +0300
commit    24ebf53f1e8a8afa27dcd768339bda70a740bb03 (patch)
tree      2086fe6cd7e7d5c0cb24181fdaf332946aa3f69c /cipher/rijndael-aesni.c
parent    e11895da1f4af9782d89e92ba2e6b1a63235b54b (diff)
download  libgcrypt-24ebf53f1e8a8afa27dcd768339bda70a740bb03.tar.gz
Simplify OCB offset calculation for parallel implementations
* cipher/camellia-glue.c (_gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth): Precalculate Ls array always, instead of
just when 'blkn % <parallel blocks> == 0'.
* cipher/serpent.c (_gcry_serpent_ocb_crypt)
(_gcry_serpent_ocb_auth): Ditto.
* cipher/rijndael-aesni.c (get_l): Remove low-bit checks.
(aesni_ocb_enc, aesni_ocb_dec, _gcry_aes_aesni_ocb_auth): Handle
leading blocks until the block counter is a multiple of 4, so that the
parallel block processing loop can use the 'c->u_mode.ocb.L' array
directly.
* tests/basic.c (check_ocb_cipher_largebuf): Rename to...
(check_ocb_cipher_largebuf_split): ...this and add an option to
process the large buffer as two split buffers.
(check_ocb_cipher_largebuf): New.
--
Patch simplifies the source and reduces object size.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
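The change rests on a small number-theoretic fact: once the block
counter n is a multiple of 4, the next three block indices have fixed
trailing-zero counts, ntz(n+1) = 0, ntz(n+2) = 1 and ntz(n+3) = 0, so
the 4-block loop can read L[0], L[1] and L[0] straight from
'c->u_mode.ocb.L' and only needs the generic get_l() lookup for the
fourth block. A standalone sketch of this invariant (ntz64() is an
illustrative helper, not code from the patch):

  #include <assert.h>
  #include <stdint.h>

  /* Count trailing zero bits of a non-zero value. */
  static unsigned int
  ntz64 (uint64_t i)
  {
    unsigned int n = 0;
    while ((i & 1) == 0)
      {
        i >>= 1;
        n++;
      }
    return n;
  }

  int
  main (void)
  {
    uint64_t n;

    for (n = 4; n < 1024; n += 4)
      {
        assert (ntz64 (n + 1) == 0);  /* offset update uses L[0]  */
        assert (ntz64 (n + 2) == 1);  /* offset update uses L[1]  */
        assert (ntz64 (n + 3) == 0);  /* offset update uses L[0]  */
        assert (ntz64 (n + 4) >= 2);  /* generic get_l() lookup   */
      }
    return 0;
  }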
Diffstat (limited to 'cipher/rijndael-aesni.c')
-rw-r--r--    cipher/rijndael-aesni.c    562
1 file changed, 288 insertions, 274 deletions
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 66787858..5c859031 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -1338,11 +1338,7 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
const unsigned char *l;
unsigned int ntz;
- if (i & 1)
- return c->u_mode.ocb.L[0];
- else if (i & 2)
- return c->u_mode.ocb.L[1];
- else if (i & 0xffffffffU)
+ if (i & 0xffffffffU)
{
asm ("rep;bsf %k[low], %k[ntz]\n\t"
: [ntz] "=r" (ntz)
@@ -1407,7 +1403,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
- const unsigned char *l[4] = {};
+ const unsigned char *l;
aesni_prepare_2_6_variable;
aesni_prepare ();
@@ -1421,103 +1417,112 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
[ctr] "m" (*c->u_ctr.ctr)
: "memory" );
- if (nblocks > 3)
+
+ for ( ;nblocks && n % 4; nblocks-- )
+ {
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
{
- if (n % 4 == 0)
- {
- l[0] = c->u_mode.ocb.L[0];
- l[1] = c->u_mode.ocb.L[1];
- l[2] = c->u_mode.ocb.L[0];
- }
-
- for ( ;nblocks > 3 ; nblocks -= 4 )
- {
- /* l_tmp will be used only every 65536-th block. */
- if (n % 4 == 0)
- {
- n += 4;
- l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
- }
- else
- {
- l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
- l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
- l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
- l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
- n += 4;
- }
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Checksum_i = Checksum_{i-1} xor P_i */
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
- "movdqu %[inbuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm1\n\t"
- "movdqu %%xmm5, %[outbuf0]\n\t"
- : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
- : [l0] "m" (*l[0]),
- [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l1], %%xmm0\n\t"
- "movdqu %[inbuf1], %%xmm2\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- "movdqu %%xmm5, %[outbuf1]\n\t"
- : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
- : [l1] "m" (*l[1]),
- [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[inbuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- "movdqu %%xmm5, %[outbuf2]\n\t"
- : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
- : [l2] "m" (*l[2]),
- [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
- "movdqu %[inbuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm4, %%xmm6\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- :
- : [l3] "m" (*l[3]),
- [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
- : "memory" );
-
- do_aesni_enc_vec4 (ctx);
-
- asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm1\n\t"
- "movdqu %%xmm1, %[outbuf0]\n\t"
- "movdqu %[outbuf1],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm2\n\t"
- "movdqu %%xmm2, %[outbuf1]\n\t"
- "movdqu %[outbuf2],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm3\n\t"
- "movdqu %%xmm3, %[outbuf2]\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- "movdqu %%xmm4, %[outbuf3]\n\t"
- : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
- [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
- [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
- [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
- :
- : "memory" );
-
- outbuf += 4*BLOCKSIZE;
- inbuf += 4*BLOCKSIZE;
- }
+ /* l_tmp will be used only every 65536-th block. */
+ n += 4;
+ l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm0\n\t"
+ "movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l2], %%xmm0\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+ : [l2] "m" (*c->u_mode.ocb.L[0]),
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l),
+ [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf1],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %[outbuf2],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+ [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
}
for ( ;nblocks; nblocks-- )
{
- l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Checksum_i = Checksum_{i-1} xor P_i */
@@ -1528,7 +1533,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
"pxor %%xmm0, %%xmm6\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
- : [l] "m" (*l[0]),
+ : [l] "m" (*l),
[inbuf] "m" (*inbuf)
: "memory" );
@@ -1568,7 +1573,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
- const unsigned char *l[4] = {};
+ const unsigned char *l;
aesni_prepare_2_6_variable;
aesni_prepare ();
@@ -1582,103 +1587,111 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
[ctr] "m" (*c->u_ctr.ctr)
: "memory" );
- if (nblocks > 3)
+ for ( ;nblocks && n % 4; nblocks-- )
+ {
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_aesni_dec (ctx);
+
+ asm volatile ("pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
{
- if (n % 4 == 0)
- {
- l[0] = c->u_mode.ocb.L[0];
- l[1] = c->u_mode.ocb.L[1];
- l[2] = c->u_mode.ocb.L[0];
- }
-
- for ( ;nblocks > 3 ; nblocks -= 4 )
- {
- /* l_tmp will be used only every 65536-th block. */
- if (n % 4 == 0)
- {
- n += 4;
- l[3] = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
- }
- else
- {
- l[0] = get_l(c, l_tmp.x1, n + 1, c->u_iv.iv, c->u_ctr.ctr);
- l[1] = get_l(c, l_tmp.x1, n + 2, c->u_iv.iv, c->u_ctr.ctr);
- l[2] = get_l(c, l_tmp.x1, n + 3, c->u_iv.iv, c->u_ctr.ctr);
- l[3] = get_l(c, l_tmp.x1, n + 4, c->u_iv.iv, c->u_ctr.ctr);
- n += 4;
- }
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
- /* Checksum_i = Checksum_{i-1} xor P_i */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
- "movdqu %[inbuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm1\n\t"
- "movdqu %%xmm5, %[outbuf0]\n\t"
- : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
- : [l0] "m" (*l[0]),
- [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l1], %%xmm0\n\t"
- "movdqu %[inbuf1], %%xmm2\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- "movdqu %%xmm5, %[outbuf1]\n\t"
- : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
- : [l1] "m" (*l[1]),
- [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[inbuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- "movdqu %%xmm5, %[outbuf2]\n\t"
- : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
- : [l2] "m" (*l[2]),
- [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
- "movdqu %[inbuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- :
- : [l3] "m" (*l[3]),
- [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
- : "memory" );
-
- do_aesni_dec_vec4 (ctx);
-
- asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm1\n\t"
- "movdqu %%xmm1, %[outbuf0]\n\t"
- "movdqu %[outbuf1],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm2\n\t"
- "movdqu %%xmm2, %[outbuf1]\n\t"
- "movdqu %[outbuf2],%%xmm0\n\t"
- "pxor %%xmm0, %%xmm3\n\t"
- "movdqu %%xmm3, %[outbuf2]\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- "movdqu %%xmm4, %[outbuf3]\n\t"
- "pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "pxor %%xmm4, %%xmm6\n\t"
- : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
- [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
- [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
- [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
- :
- : "memory" );
-
- outbuf += 4*BLOCKSIZE;
- inbuf += 4*BLOCKSIZE;
- }
+ /* l_tmp will be used only every 65536-th block. */
+ n += 4;
+ l = get_l(c, l_tmp.x1, n, c->u_iv.iv, c->u_ctr.ctr);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm5, %[outbuf0]\n\t"
+ : [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE))
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [inbuf0] "m" (*(inbuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm0\n\t"
+ "movdqu %[inbuf1], %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm5, %[outbuf1]\n\t"
+ : [outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l2], %%xmm0\n\t"
+ "movdqu %[inbuf2], %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm5, %[outbuf2]\n\t"
+ : [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE))
+ : [l2] "m" (*c->u_mode.ocb.L[0]),
+ [inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l),
+ [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile ("movdqu %[outbuf0],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf0]\n\t"
+ "movdqu %[outbuf1],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "movdqu %%xmm2, %[outbuf1]\n\t"
+ "movdqu %[outbuf2],%%xmm0\n\t"
+ "pxor %%xmm0, %%xmm3\n\t"
+ "movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, %[outbuf3]\n\t"
+ "pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ : [outbuf0] "+m" (*(outbuf + 0 * BLOCKSIZE)),
+ [outbuf1] "+m" (*(outbuf + 1 * BLOCKSIZE)),
+ [outbuf2] "+m" (*(outbuf + 2 * BLOCKSIZE)),
+ [outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
+ :
+ : "memory" );
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
}
for ( ;nblocks; nblocks-- )
{
- l[0] = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
+ l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
@@ -1688,7 +1701,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
"pxor %%xmm1, %%xmm5\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
- : [l] "m" (*l[0]),
+ : [l] "m" (*l),
[inbuf] "m" (*inbuf)
: "memory" );
@@ -1739,7 +1752,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
RIJNDAEL_context *ctx = (void *)&c->context.c;
const unsigned char *abuf = abuf_arg;
u64 n = c->u_mode.ocb.aad_nblocks;
- const unsigned char *l[4] = {};
+ const unsigned char *l;
aesni_prepare_2_6_variable;
aesni_prepare ();
@@ -1753,90 +1766,91 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
[ctr] "m" (*c->u_mode.ocb.aad_sum)
: "memory" );
- if (nblocks > 3)
+ for ( ;nblocks && n % 4; nblocks-- )
+ {
+ l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l], %%xmm1\n\t"
+ "movdqu %[abuf], %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ :
+ : [l] "m" (*l),
+ [abuf] "m" (*abuf)
+ : "memory" );
+
+ do_aesni_enc (ctx);
+
+ asm volatile ("pxor %%xmm0, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += BLOCKSIZE;
+ }
+
+ for ( ;nblocks > 3 ; nblocks -= 4 )
{
- if (n % 4 == 0)
- {
- l[0] = c->u_mode.ocb.L[0];
- l[1] = c->u_mode.ocb.L[1];
- l[2] = c->u_mode.ocb.L[0];
- }
-
- for ( ;nblocks > 3 ; nblocks -= 4 )
- {
- /* l_tmp will be used only every 65536-th block. */
- if (n % 4 == 0)
- {
- n += 4;
- l[3] = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- }
- else
- {
- l[0] = get_l(c, l_tmp.x1, n + 1, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- l[1] = get_l(c, l_tmp.x1, n + 2, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- l[2] = get_l(c, l_tmp.x1, n + 3, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- l[3] = get_l(c, l_tmp.x1, n + 4, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
- n += 4;
- }
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
- asm volatile ("movdqu %[l0], %%xmm0\n\t"
- "movdqu %[abuf0], %%xmm1\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm1\n\t"
- :
- : [l0] "m" (*l[0]),
- [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l1], %%xmm0\n\t"
- "movdqu %[abuf1], %%xmm2\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- :
- : [l1] "m" (*l[1]),
- [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l2], %%xmm0\n\t"
- "movdqu %[abuf2], %%xmm3\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- :
- : [l2] "m" (*l[2]),
- [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
- : "memory" );
- asm volatile ("movdqu %[l3], %%xmm0\n\t"
- "movdqu %[abuf3], %%xmm4\n\t"
- "pxor %%xmm0, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm4\n\t"
- :
- : [l3] "m" (*l[3]),
- [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
- : "memory" );
-
- do_aesni_enc_vec4 (ctx);
-
- asm volatile ("pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "pxor %%xmm4, %%xmm6\n\t"
- :
- :
- : "memory" );
-
- abuf += 4*BLOCKSIZE;
- }
+ /* l_tmp will be used only every 65536-th block. */
+ n += 4;
+ l = get_l(c, l_tmp.x1, n, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ asm volatile ("movdqu %[l0], %%xmm0\n\t"
+ "movdqu %[abuf0], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ :
+ : [l0] "m" (*c->u_mode.ocb.L[0]),
+ [abuf0] "m" (*(abuf + 0 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l1], %%xmm0\n\t"
+ "movdqu %[abuf1], %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ :
+ : [l1] "m" (*c->u_mode.ocb.L[1]),
+ [abuf1] "m" (*(abuf + 1 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l2], %%xmm0\n\t"
+ "movdqu %[abuf2], %%xmm3\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ :
+ : [l2] "m" (*c->u_mode.ocb.L[0]),
+ [abuf2] "m" (*(abuf + 2 * BLOCKSIZE))
+ : "memory" );
+ asm volatile ("movdqu %[l3], %%xmm0\n\t"
+ "movdqu %[abuf3], %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ :
+ : [l3] "m" (*l),
+ [abuf3] "m" (*(abuf + 3 * BLOCKSIZE))
+ : "memory" );
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile ("pxor %%xmm1, %%xmm6\n\t"
+ "pxor %%xmm2, %%xmm6\n\t"
+ "pxor %%xmm3, %%xmm6\n\t"
+ "pxor %%xmm4, %%xmm6\n\t"
+ :
+ :
+ : "memory" );
+
+ abuf += 4*BLOCKSIZE;
}
for ( ;nblocks; nblocks-- )
{
- l[0] = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum);
+ l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum);
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
@@ -1845,7 +1859,7 @@ _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
"pxor %%xmm1, %%xmm5\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
- : [l] "m" (*l[0]),
+ : [l] "m" (*l),
[abuf] "m" (*abuf)
: "memory" );