author     Jussi Kivilinna <jussi.kivilinna@iki.fi>    2014-12-23 12:35:28 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>    2014-12-23 12:37:35 +0200
commit     2374753938df64f6fd8015b44613806a326eff1a (patch)
tree       3e8f7c245ca838681415c13f48a9de5d98931559 /cipher/rijndael.c
parent     ad50e360ef4851e66e51a03fc420175636336b58 (diff)
rijndael: use more compact look-up tables and add table prefetching
* cipher/rijndael-internal.h (rijndael_prefetchfn_t): New.
(RIJNDAEL_context): Add 'prefetch_enc_fn' and 'prefetch_dec_fn'.
* cipher/rijndael-tables.h (S, T1, T2, T3, T4, T5, T6, T7, T8, S5, U1)
(U2, U3, U4): Remove.
(encT, dec_tables, decT, inv_sbox): Add.
* cipher/rijndael.c (_gcry_aes_amd64_encrypt_block)
(_gcry_aes_amd64_decrypt_block, _gcry_aes_arm_encrypt_block)
(_gcry_aes_arm_decrypt_block): Add parameter for passing table pointer
to assembly implementation.
(prefetch_table, prefetch_enc, prefetch_dec): New.
(do_setkey): Set up context prefetch functions depending on selected
rijndael implementation; use new tables for key setup.
(prepare_decryption): Use new tables for decryption key setup.
(do_encrypt_aligned): Rename to...
(do_encrypt_fn): ... this; change to use new compact tables, make it
handle unaligned input and unroll the rounds loop by two.
(do_encrypt): Remove handling of unaligned input/output; pass table
pointer to assembly implementations.
(rijndael_encrypt, _gcry_aes_cfb_enc, _gcry_aes_cbc_enc)
(_gcry_aes_ctr_enc, _gcry_aes_cfb_dec): Prefetch encryption tables
before encryption.
(do_decrypt_aligned): Rename to...
(do_decrypt_fn): ... this; change to use new compact tables, make it
handle unaligned input and unroll the rounds loop by two.
(do_decrypt): Remove handling of unaligned input/output; pass table
pointer to assembly implementations.
(rijndael_decrypt, _gcry_aes_cbc_dec): Prefetch decryption tables
before decryption.
* cipher/rijndael-amd64.S: Use 1+1.25 KiB tables for
encryption+decryption; remove tables from assembly file.
* cipher/rijndael-arm.S: Ditto.
--

This patch replaces the 4+4.25 KiB look-up tables of the generic
implementation, the 8+8 KiB look-up tables of the AMD64 implementation
and the 2+2 KiB look-up tables of the ARM implementation with 1+1.25 KiB
look-up tables shared by all implementations, and adds prefetching of
the look-up tables. The AMD64 assembly is slower than before because of
the additional rotation instructions, while the generic C implementation
is now better optimized and actually faster than before.
Benchmark results on Intel i5-4570 (turbo off) (64-bit, AMD64 assembly):

tests/bench-slope --disable-hwf intel-aesni --cpu-mhz 3200 cipher aes

Old:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      3.10 ns/B     307.5 MiB/s      9.92 c/B
        ECB dec |      3.15 ns/B     302.5 MiB/s     10.09 c/B
        CBC enc |      3.46 ns/B     275.5 MiB/s     11.08 c/B
        CBC dec |      3.19 ns/B     299.2 MiB/s     10.20 c/B
        CFB enc |      3.48 ns/B     274.4 MiB/s     11.12 c/B
        CFB dec |      3.23 ns/B     294.8 MiB/s     10.35 c/B
        OFB enc |      3.29 ns/B     290.2 MiB/s     10.52 c/B
        OFB dec |      3.31 ns/B     288.3 MiB/s     10.58 c/B
        CTR enc |      3.64 ns/B     261.7 MiB/s     11.66 c/B
        CTR dec |      3.65 ns/B     261.6 MiB/s     11.67 c/B
New:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      4.21 ns/B     226.7 MiB/s     13.46 c/B
        ECB dec |      4.27 ns/B     223.2 MiB/s     13.67 c/B
        CBC enc |      4.15 ns/B     229.8 MiB/s     13.28 c/B
        CBC dec |      3.85 ns/B     247.8 MiB/s     12.31 c/B
        CFB enc |      4.16 ns/B     229.1 MiB/s     13.32 c/B
        CFB dec |      3.88 ns/B     245.9 MiB/s     12.41 c/B
        OFB enc |      4.38 ns/B     217.8 MiB/s     14.01 c/B
        OFB dec |      4.36 ns/B     218.6 MiB/s     13.96 c/B
        CTR enc |      4.30 ns/B     221.6 MiB/s     13.77 c/B
        CTR dec |      4.30 ns/B     221.7 MiB/s     13.76 c/B

Benchmark on Intel i5-4570 (turbo off) (32-bit mingw, generic C):

tests/bench-slope.exe --disable-hwf intel-aesni --cpu-mhz 3200 cipher aes

Old:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      6.03 ns/B     158.2 MiB/s     19.29 c/B
        ECB dec |      5.81 ns/B     164.1 MiB/s     18.60 c/B
        CBC enc |      6.22 ns/B     153.4 MiB/s     19.90 c/B
        CBC dec |      5.91 ns/B     161.3 MiB/s     18.92 c/B
        CFB enc |      6.25 ns/B     152.7 MiB/s     19.99 c/B
        CFB dec |      6.24 ns/B     152.8 MiB/s     19.97 c/B
        OFB enc |      6.33 ns/B     150.6 MiB/s     20.27 c/B
        OFB dec |      6.33 ns/B     150.7 MiB/s     20.25 c/B
        CTR enc |      6.28 ns/B     152.0 MiB/s     20.08 c/B
        CTR dec |      6.28 ns/B     151.7 MiB/s     20.11 c/B
New:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      5.02 ns/B     190.0 MiB/s     16.06 c/B
        ECB dec |      5.33 ns/B     178.8 MiB/s     17.07 c/B
        CBC enc |      4.64 ns/B     205.4 MiB/s     14.86 c/B
        CBC dec |      4.95 ns/B     192.7 MiB/s     15.84 c/B
        CFB enc |      4.75 ns/B     200.7 MiB/s     15.20 c/B
        CFB dec |      4.74 ns/B     201.1 MiB/s     15.18 c/B
        OFB enc |      5.29 ns/B     180.3 MiB/s     16.93 c/B
        OFB dec |      5.29 ns/B     180.3 MiB/s     16.93 c/B
        CTR enc |      4.77 ns/B     200.0 MiB/s     15.26 c/B
        CTR dec |      4.77 ns/B     199.8 MiB/s     15.27 c/B

Benchmark on Cortex-A8 (ARM assembly):

tests/bench-slope --cpu-mhz 1008 cipher aes

Old:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     21.84 ns/B     43.66 MiB/s     22.02 c/B
        ECB dec |     22.35 ns/B     42.67 MiB/s     22.53 c/B
        CBC enc |     22.97 ns/B     41.53 MiB/s     23.15 c/B
        CBC dec |     23.48 ns/B     40.61 MiB/s     23.67 c/B
        CFB enc |     22.72 ns/B     41.97 MiB/s     22.90 c/B
        CFB dec |     23.41 ns/B     40.74 MiB/s     23.59 c/B
        OFB enc |     23.65 ns/B     40.32 MiB/s     23.84 c/B
        OFB dec |     23.67 ns/B     40.29 MiB/s     23.86 c/B
        CTR enc |     23.24 ns/B     41.03 MiB/s     23.43 c/B
        CTR dec |     23.23 ns/B     41.05 MiB/s     23.42 c/B
New:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     26.03 ns/B     36.64 MiB/s     26.24 c/B
        ECB dec |     26.97 ns/B     35.36 MiB/s     27.18 c/B
        CBC enc |     23.21 ns/B     41.09 MiB/s     23.39 c/B
        CBC dec |     23.36 ns/B     40.83 MiB/s     23.54 c/B
        CFB enc |     23.02 ns/B     41.42 MiB/s     23.21 c/B
        CFB dec |     23.67 ns/B     40.28 MiB/s     23.86 c/B
        OFB enc |     27.86 ns/B     34.24 MiB/s     28.08 c/B
        OFB dec |     27.87 ns/B     34.21 MiB/s     28.10 c/B
        CTR enc |     23.47 ns/B     40.63 MiB/s     23.66 c/B
        CTR dec |     23.49 ns/B     40.61 MiB/s     23.67 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
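Editor's note on the C hunks below: the size reduction comes from keeping a single 256-entry 32-bit encryption table (encT, defined in rijndael-tables.h, which is not part of this diff) and deriving the other three classic T-table lookups from it with 8/16/24-bit rotations; the plain S-box needed for key setup and the last round is read out of the same table (sbox = ((const byte *)encT) + 1, indexed with a stride of 4). The sketch below only illustrates the idea; rol32, T0 and mix_column are hypothetical names, not identifiers from the patch:

  #include <stdint.h>

  /* Illustrative rotate-left; the patch uses its own rol() helper. */
  static inline uint32_t rol32 (uint32_t x, unsigned int n)
  {
    return n ? (x << n) | (x >> (32 - n)) : x;
  }

  /* With four 1 KiB tables, one output column is T0[b0]^T1[b1]^T2[b2]^T3[b3],
     where T1..T3 are byte-wise rotations of T0 (direction depending on the
     chosen byte order).  Keeping only T0 (encT in the patch) and rotating at
     run time produces the same column from a quarter of the table data: */
  static uint32_t
  mix_column (const uint32_t *T0,
              uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3)
  {
    return          T0[b0]
           ^ rol32 (T0[b1],  8)
           ^ rol32 (T0[b2], 16)
           ^ rol32 (T0[b3], 24);
  }

Decryption presumably accounts for the extra 0.25 KiB: a 1 KiB decT for the inner rounds plus a 256-byte inv_sbox for the final round, matching the 1+1.25 KiB figure in the description.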
Diffstat (limited to 'cipher/rijndael.c')
-rw-r--r--   cipher/rijndael.c   645
1 file changed, 385 insertions(+), 260 deletions(-)
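The prefetching half of the change appears near the top of the diff (prefetch_table, prefetch_enc, prefetch_dec): every cache line of the look-up tables is touched before any key- or data-dependent indexing starts, presumably both to warm the cache and to reduce key-dependent cache-timing variation (the commit message does not spell out the motivation). A stripped-down sketch of the same loop; the real one unrolls eight reads per iteration and likewise assumes 32-byte lines:

  #include <stddef.h>

  typedef unsigned char byte;

  /* Touch one byte per assumed 32-byte cache line; 'volatile' keeps the
     compiler from discarding the otherwise-unused reads. */
  static void
  prefetch_table_sketch (const volatile byte *tab, size_t len)
  {
    size_t i;

    for (i = 0; i < len; i += 32)
      (void)tab[i];

    (void)tab[len - 1];  /* cover the tail of the table as well */
  }

In the patch this is wired up through the new prefetch_enc_fn/prefetch_dec_fn context members and invoked from rijndael_encrypt, rijndael_decrypt and the bulk CFB/CBC/CTR helpers before processing starts; the AES-NI and Padlock paths set the pointers to NULL since they do not use the tables.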
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index aa1681db..5b0fe1c8 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -63,12 +63,14 @@ typedef u32 u32_a_t;
extern unsigned int _gcry_aes_amd64_encrypt_block(const void *keysched_enc,
unsigned char *out,
const unsigned char *in,
- int rounds);
+ int rounds,
+ const void *encT);
extern unsigned int _gcry_aes_amd64_decrypt_block(const void *keysched_dec,
unsigned char *out,
const unsigned char *in,
- int rounds);
+ int rounds,
+ const void *decT);
#endif /*USE_AMD64_ASM*/
#ifdef USE_AESNI
@@ -119,12 +121,14 @@ extern unsigned int _gcry_aes_padlock_decrypt (const RIJNDAEL_context *ctx,
extern unsigned int _gcry_aes_arm_encrypt_block(const void *keysched_enc,
unsigned char *out,
const unsigned char *in,
- int rounds);
+ int rounds,
+ const void *encT);
extern unsigned int _gcry_aes_arm_decrypt_block(const void *keysched_dec,
unsigned char *out,
const unsigned char *in,
- int rounds);
+ int rounds,
+ const void *decT);
#endif /*USE_ARM_ASM*/
static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
@@ -145,6 +149,38 @@ static const char *selftest(void);
+/* Prefetching for encryption/decryption tables. */
+static void prefetch_table(const volatile byte *tab, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i += 8 * 32)
+ {
+ (void)tab[i + 0 * 32];
+ (void)tab[i + 1 * 32];
+ (void)tab[i + 2 * 32];
+ (void)tab[i + 3 * 32];
+ (void)tab[i + 4 * 32];
+ (void)tab[i + 5 * 32];
+ (void)tab[i + 6 * 32];
+ (void)tab[i + 7 * 32];
+ }
+
+ (void)tab[len - 1];
+}
+
+static void prefetch_enc(void)
+{
+ prefetch_table((const void *)encT, sizeof(encT));
+}
+
+static void prefetch_dec(void)
+{
+ prefetch_table((const void *)&dec_tables, sizeof(dec_tables));
+}
+
+
+
/* Perform the key setup. */
static gcry_err_code_t
do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
@@ -216,6 +252,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
{
ctx->encrypt_fn = _gcry_aes_aesni_encrypt;
ctx->decrypt_fn = _gcry_aes_aesni_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
ctx->use_aesni = 1;
}
#endif
@@ -224,6 +262,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
{
ctx->encrypt_fn = _gcry_aes_padlock_encrypt;
ctx->decrypt_fn = _gcry_aes_padlock_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
ctx->use_padlock = 1;
memcpy (ctx->padlockkey, key, keylen);
}
@@ -232,6 +272,8 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
{
ctx->encrypt_fn = do_encrypt;
ctx->decrypt_fn = do_decrypt;
+ ctx->prefetch_enc_fn = prefetch_enc;
+ ctx->prefetch_dec_fn = prefetch_dec;
}
/* NB: We don't yet support Padlock hardware key generation. */
@@ -246,14 +288,18 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
#endif
else
{
+ const byte *sbox = ((const byte *)encT) + 1;
union
{
PROPERLY_ALIGNED_TYPE dummy;
byte data[MAXKC][4];
- } k, tk;
-#define k k.data
-#define tk tk.data
+ } tkk[2];
+#define k tkk[0].data
+#define tk tkk[1].data
#define W (ctx->keyschenc)
+
+ prefetch_enc();
+
for (i = 0; i < keylen; i++)
{
k[i >> 2][i & 3] = key[i];
@@ -270,7 +316,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
{
for (; (j < KC) && (t < 4); j++, t++)
{
- *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]);
+ *((u32_a_t*)W[r][t]) = le_bswap32(*((u32_a_t*)tk[j]));
}
if (t == 4)
{
@@ -283,10 +329,10 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
{
/* While not enough round key material calculated calculate
new values. */
- tk[0][0] ^= S[tk[KC-1][1]];
- tk[0][1] ^= S[tk[KC-1][2]];
- tk[0][2] ^= S[tk[KC-1][3]];
- tk[0][3] ^= S[tk[KC-1][0]];
+ tk[0][0] ^= sbox[tk[KC-1][1] * 4];
+ tk[0][1] ^= sbox[tk[KC-1][2] * 4];
+ tk[0][2] ^= sbox[tk[KC-1][3] * 4];
+ tk[0][3] ^= sbox[tk[KC-1][0] * 4];
tk[0][0] ^= rcon[rconpointer++];
if (KC != 8)
@@ -302,10 +348,10 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
{
*((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
}
- tk[KC/2][0] ^= S[tk[KC/2 - 1][0]];
- tk[KC/2][1] ^= S[tk[KC/2 - 1][1]];
- tk[KC/2][2] ^= S[tk[KC/2 - 1][2]];
- tk[KC/2][3] ^= S[tk[KC/2 - 1][3]];
+ tk[KC/2][0] ^= sbox[tk[KC/2 - 1][0] * 4];
+ tk[KC/2][1] ^= sbox[tk[KC/2 - 1][1] * 4];
+ tk[KC/2][2] ^= sbox[tk[KC/2 - 1][2] * 4];
+ tk[KC/2][3] ^= sbox[tk[KC/2 - 1][3] * 4];
for (j = KC/2 + 1; j < KC; j++)
{
*((u32_a_t*)tk[j]) ^= *((u32_a_t*)tk[j-1]);
@@ -317,7 +363,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
{
for (; (j < KC) && (t < 4); j++, t++)
{
- *((u32_a_t*)W[r][t]) = *((u32_a_t*)tk[j]);
+ *((u32_a_t*)W[r][t]) = le_bswap32(*((u32_a_t*)tk[j]));
}
if (t == 4)
{
@@ -329,8 +375,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
#undef W
#undef tk
#undef k
- wipememory(&tk, sizeof(tk));
- wipememory(&t, sizeof(t));
+ wipememory(&tkk, sizeof(tkk));
}
return 0;
@@ -367,136 +412,190 @@ prepare_decryption( RIJNDAEL_context *ctx )
#endif /*USE_PADLOCK*/
else
{
- union
- {
- PROPERLY_ALIGNED_TYPE dummy;
- byte *w;
- } w;
-#define w w.w
+ const byte *sbox = ((const byte *)encT) + 1;
- for (r=0; r < MAXROUNDS+1; r++ )
- {
- *((u32_a_t*)ctx->keyschdec[r][0]) = *((u32_a_t*)ctx->keyschenc[r][0]);
- *((u32_a_t*)ctx->keyschdec[r][1]) = *((u32_a_t*)ctx->keyschenc[r][1]);
- *((u32_a_t*)ctx->keyschdec[r][2]) = *((u32_a_t*)ctx->keyschenc[r][2]);
- *((u32_a_t*)ctx->keyschdec[r][3]) = *((u32_a_t*)ctx->keyschenc[r][3]);
- }
-#define W (ctx->keyschdec)
- for (r = 1; r < ctx->rounds; r++)
- {
- w = W[r][0];
- *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
- ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+ prefetch_enc();
+ prefetch_dec();
- w = W[r][1];
- *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
- ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+ *((u32_a_t*)ctx->keyschdec[0][0]) = *((u32_a_t*)ctx->keyschenc[0][0]);
+ *((u32_a_t*)ctx->keyschdec[0][1]) = *((u32_a_t*)ctx->keyschenc[0][1]);
+ *((u32_a_t*)ctx->keyschdec[0][2]) = *((u32_a_t*)ctx->keyschenc[0][2]);
+ *((u32_a_t*)ctx->keyschdec[0][3]) = *((u32_a_t*)ctx->keyschenc[0][3]);
- w = W[r][2];
- *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
- ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
-
- w = W[r][3];
- *((u32_a_t*)w) = *((u32_a_t*)U1[w[0]]) ^ *((u32_a_t*)U2[w[1]])
- ^ *((u32_a_t*)U3[w[2]]) ^ *((u32_a_t*)U4[w[3]]);
+ for (r = 1; r < ctx->rounds; r++)
+ {
+ u32_a_t *wi = (u32_a_t*)((ctx->keyschenc)[r]);
+ u32_a_t *wo = (u32_a_t*)((ctx->keyschdec)[r]);
+ u32 wt;
+
+ wt = wi[0];
+ wo[0] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+ ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+ ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+ ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+
+ wt = wi[1];
+ wo[1] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+ ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+ ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+ ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+
+ wt = wi[2];
+ wo[2] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+ ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+ ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+ ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
+
+ wt = wi[3];
+ wo[3] = rol(decT[sbox[(byte)(wt >> 0) * 4]], 8 * 0)
+ ^ rol(decT[sbox[(byte)(wt >> 8) * 4]], 8 * 1)
+ ^ rol(decT[sbox[(byte)(wt >> 16) * 4]], 8 * 2)
+ ^ rol(decT[sbox[(byte)(wt >> 24) * 4]], 8 * 3);
}
-#undef W
-#undef w
- wipememory(&w, sizeof(w));
+
+ *((u32_a_t*)ctx->keyschdec[r][0]) = *((u32_a_t*)ctx->keyschenc[r][0]);
+ *((u32_a_t*)ctx->keyschdec[r][1]) = *((u32_a_t*)ctx->keyschenc[r][1]);
+ *((u32_a_t*)ctx->keyschdec[r][2]) = *((u32_a_t*)ctx->keyschenc[r][2]);
+ *((u32_a_t*)ctx->keyschdec[r][3]) = *((u32_a_t*)ctx->keyschenc[r][3]);
}
}
-#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
-/* Encrypt one block. A and B need to be aligned on a 4 byte
- boundary. A and B may be the same. */
-static void
-do_encrypt_aligned (const RIJNDAEL_context *ctx,
- unsigned char *b, const unsigned char *a)
+#if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM)
+/* Encrypt one block. A and B may be the same. */
+static unsigned int
+do_encrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b,
+ const unsigned char *a)
{
#define rk (ctx->keyschenc)
+ const byte *sbox = ((const byte *)encT) + 1;
int rounds = ctx->rounds;
int r;
- union
- {
- u32 tempu32[4]; /* Force correct alignment. */
- byte temp[4][4];
- } u;
-
- *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[0][0]);
- *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[0][1]);
- *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[0][2]);
- *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[0][3]);
- *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]])
- ^ *((u32_a_t*)T2[u.temp[1][1]])
- ^ *((u32_a_t*)T3[u.temp[2][2]])
- ^ *((u32_a_t*)T4[u.temp[3][3]]));
- *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]])
- ^ *((u32_a_t*)T2[u.temp[2][1]])
- ^ *((u32_a_t*)T3[u.temp[3][2]])
- ^ *((u32_a_t*)T4[u.temp[0][3]]));
- *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]])
- ^ *((u32_a_t*)T2[u.temp[3][1]])
- ^ *((u32_a_t*)T3[u.temp[0][2]])
- ^ *((u32_a_t*)T4[u.temp[1][3]]));
- *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]])
- ^ *((u32_a_t*)T2[u.temp[0][1]])
- ^ *((u32_a_t*)T3[u.temp[1][2]])
- ^ *((u32_a_t*)T4[u.temp[2][3]]));
-
- for (r = 1; r < rounds-1; r++)
+ u32 sa[4];
+ u32 sb[4];
+
+ sb[0] = buf_get_le32(a + 0);
+ sb[1] = buf_get_le32(a + 4);
+ sb[2] = buf_get_le32(a + 8);
+ sb[3] = buf_get_le32(a + 12);
+
+ sa[0] = sb[0] ^ *((u32_a_t*)rk[0][0]);
+ sa[1] = sb[1] ^ *((u32_a_t*)rk[0][1]);
+ sa[2] = sb[2] ^ *((u32_a_t*)rk[0][2]);
+ sa[3] = sb[3] ^ *((u32_a_t*)rk[0][3]);
+
+ sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = *((u32_a_t*)rk[1][0]) ^ sb[0];
+
+ sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = *((u32_a_t*)rk[1][1]) ^ sb[1];
+
+ sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = *((u32_a_t*)rk[1][2]) ^ sb[2];
+
+ sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = *((u32_a_t*)rk[1][3]) ^ sb[3];
+
+ for (r = 2; r < rounds; r++)
{
- *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]);
- *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]);
- *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]);
- *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]);
-
- *((u32_a_t*)(b )) = (*((u32_a_t*)T1[u.temp[0][0]])
- ^ *((u32_a_t*)T2[u.temp[1][1]])
- ^ *((u32_a_t*)T3[u.temp[2][2]])
- ^ *((u32_a_t*)T4[u.temp[3][3]]));
- *((u32_a_t*)(b + 4)) = (*((u32_a_t*)T1[u.temp[1][0]])
- ^ *((u32_a_t*)T2[u.temp[2][1]])
- ^ *((u32_a_t*)T3[u.temp[3][2]])
- ^ *((u32_a_t*)T4[u.temp[0][3]]));
- *((u32_a_t*)(b + 8)) = (*((u32_a_t*)T1[u.temp[2][0]])
- ^ *((u32_a_t*)T2[u.temp[3][1]])
- ^ *((u32_a_t*)T3[u.temp[0][2]])
- ^ *((u32_a_t*)T4[u.temp[1][3]]));
- *((u32_a_t*)(b +12)) = (*((u32_a_t*)T1[u.temp[3][0]])
- ^ *((u32_a_t*)T2[u.temp[0][1]])
- ^ *((u32_a_t*)T3[u.temp[1][2]])
- ^ *((u32_a_t*)T4[u.temp[2][3]]));
+ sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+ sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+ sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+ sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
+
+ r++;
+
+ sb[0] = rol(encT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[3] = rol(encT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(encT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[1] = rol(encT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+ sb[1] ^= rol(encT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sb[2] ^= rol(encT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+ sb[2] ^= rol(encT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sb[3] ^= rol(encT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+ sb[3] ^= rol(encT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[2] ^= rol(encT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(encT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(encT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
}
/* Last round is special. */
- *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[rounds-1][0]);
- *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[rounds-1][1]);
- *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[rounds-1][2]);
- *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[rounds-1][3]);
- b[ 0] = T1[u.temp[0][0]][1];
- b[ 1] = T1[u.temp[1][1]][1];
- b[ 2] = T1[u.temp[2][2]][1];
- b[ 3] = T1[u.temp[3][3]][1];
- b[ 4] = T1[u.temp[1][0]][1];
- b[ 5] = T1[u.temp[2][1]][1];
- b[ 6] = T1[u.temp[3][2]][1];
- b[ 7] = T1[u.temp[0][3]][1];
- b[ 8] = T1[u.temp[2][0]][1];
- b[ 9] = T1[u.temp[3][1]][1];
- b[10] = T1[u.temp[0][2]][1];
- b[11] = T1[u.temp[1][3]][1];
- b[12] = T1[u.temp[3][0]][1];
- b[13] = T1[u.temp[0][1]][1];
- b[14] = T1[u.temp[1][2]][1];
- b[15] = T1[u.temp[2][3]][1];
- *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[rounds][0]);
- *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[rounds][1]);
- *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[rounds][2]);
- *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[rounds][3]);
+
+ sb[0] = (sbox[(byte)(sa[0] >> (0 * 8)) * 4]) << (0 * 8);
+ sb[3] = (sbox[(byte)(sa[0] >> (1 * 8)) * 4]) << (1 * 8);
+ sb[2] = (sbox[(byte)(sa[0] >> (2 * 8)) * 4]) << (2 * 8);
+ sb[1] = (sbox[(byte)(sa[0] >> (3 * 8)) * 4]) << (3 * 8);
+ sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+ sb[1] ^= (sbox[(byte)(sa[1] >> (0 * 8)) * 4]) << (0 * 8);
+ sa[0] ^= (sbox[(byte)(sa[1] >> (1 * 8)) * 4]) << (1 * 8);
+ sb[3] ^= (sbox[(byte)(sa[1] >> (2 * 8)) * 4]) << (2 * 8);
+ sb[2] ^= (sbox[(byte)(sa[1] >> (3 * 8)) * 4]) << (3 * 8);
+ sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+ sb[2] ^= (sbox[(byte)(sa[2] >> (0 * 8)) * 4]) << (0 * 8);
+ sa[1] ^= (sbox[(byte)(sa[2] >> (1 * 8)) * 4]) << (1 * 8);
+ sa[0] ^= (sbox[(byte)(sa[2] >> (2 * 8)) * 4]) << (2 * 8);
+ sb[3] ^= (sbox[(byte)(sa[2] >> (3 * 8)) * 4]) << (3 * 8);
+ sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+ sb[3] ^= (sbox[(byte)(sa[3] >> (0 * 8)) * 4]) << (0 * 8);
+ sa[2] ^= (sbox[(byte)(sa[3] >> (1 * 8)) * 4]) << (1 * 8);
+ sa[1] ^= (sbox[(byte)(sa[3] >> (2 * 8)) * 4]) << (2 * 8);
+ sa[0] ^= (sbox[(byte)(sa[3] >> (3 * 8)) * 4]) << (3 * 8);
+ sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
+
+ buf_put_le32(b + 0, sa[0]);
+ buf_put_le32(b + 4, sa[1]);
+ buf_put_le32(b + 8, sa[2]);
+ buf_put_le32(b + 12, sa[3]);
#undef rk
+
+ return (56 + 2*sizeof(int));
}
-#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+#endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/
static unsigned int
@@ -504,31 +603,13 @@ do_encrypt (const RIJNDAEL_context *ctx,
unsigned char *bx, const unsigned char *ax)
{
#ifdef USE_AMD64_ASM
- return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds);
+ return _gcry_aes_amd64_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds,
+ encT);
#elif defined(USE_ARM_ASM)
- return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds);
+ return _gcry_aes_arm_encrypt_block(ctx->keyschenc, bx, ax, ctx->rounds, encT);
#else
- /* BX and AX are not necessary correctly aligned. Thus we might
- need to copy them here. We try to align to a 16 bytes. */
- if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f))
- {
- union
- {
- u32 dummy[4];
- byte a[16] ATTR_ALIGNED_16;
- } a;
-
- buf_cpy (a.a, ax, 16);
- do_encrypt_aligned (ctx, a.a, a.a);
- buf_cpy (bx, a.a, 16);
- }
- else
- {
- do_encrypt_aligned (ctx, bx, ax);
- }
-
- return (56 + 2*sizeof(int));
-#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+ return do_encrypt_fn (ctx, bx, ax);
+#endif /* !USE_ARM_ASM && !USE_AMD64_ASM*/
}
@@ -537,6 +618,9 @@ rijndael_encrypt (void *context, byte *b, const byte *a)
{
RIJNDAEL_context *ctx = context;
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
return ctx->encrypt_fn (ctx, b, a);
}
@@ -555,6 +639,9 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
unsigned int burn_depth = 0;
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
if (0)
;
#ifdef USE_AESNI
@@ -599,6 +686,9 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
unsigned char *last_iv;
unsigned int burn_depth = 0;
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
if (0)
;
#ifdef USE_AESNI
@@ -651,6 +741,9 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
unsigned int burn_depth = 0;
int i;
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
if (0)
;
#ifdef USE_AESNI
@@ -691,98 +784,139 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
-#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
-/* Decrypt one block. A and B need to be aligned on a 4 byte boundary
- and the decryption must have been prepared. A and B may be the
- same. */
-static void
-do_decrypt_aligned (const RIJNDAEL_context *ctx,
- unsigned char *b, const unsigned char *a)
+#if !defined(USE_ARM_ASM) && !defined(USE_AMD64_ASM)
+/* Decrypt one block. A and B may be the same. */
+static unsigned int
+do_decrypt_fn (const RIJNDAEL_context *ctx, unsigned char *b,
+ const unsigned char *a)
{
#define rk (ctx->keyschdec)
int rounds = ctx->rounds;
int r;
- union
- {
- u32 tempu32[4]; /* Force correct alignment. */
- byte temp[4][4];
- } u;
-
-
- *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(a )) ^ *((u32_a_t*)rk[rounds][0]);
- *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(a+ 4)) ^ *((u32_a_t*)rk[rounds][1]);
- *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(a+ 8)) ^ *((u32_a_t*)rk[rounds][2]);
- *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(a+12)) ^ *((u32_a_t*)rk[rounds][3]);
-
- *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]])
- ^ *((u32_a_t*)T6[u.temp[3][1]])
- ^ *((u32_a_t*)T7[u.temp[2][2]])
- ^ *((u32_a_t*)T8[u.temp[1][3]]));
- *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]])
- ^ *((u32_a_t*)T6[u.temp[0][1]])
- ^ *((u32_a_t*)T7[u.temp[3][2]])
- ^ *((u32_a_t*)T8[u.temp[2][3]]));
- *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]])
- ^ *((u32_a_t*)T6[u.temp[1][1]])
- ^ *((u32_a_t*)T7[u.temp[0][2]])
- ^ *((u32_a_t*)T8[u.temp[3][3]]));
- *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]])
- ^ *((u32_a_t*)T6[u.temp[2][1]])
- ^ *((u32_a_t*)T7[u.temp[1][2]])
- ^ *((u32_a_t*)T8[u.temp[0][3]]));
-
- for (r = rounds-1; r > 1; r--)
+ u32 sa[4];
+ u32 sb[4];
+
+ sb[0] = buf_get_le32(a + 0);
+ sb[1] = buf_get_le32(a + 4);
+ sb[2] = buf_get_le32(a + 8);
+ sb[3] = buf_get_le32(a + 12);
+
+ sa[0] = sb[0] ^ *((u32_a_t*)rk[rounds][0]);
+ sa[1] = sb[1] ^ *((u32_a_t*)rk[rounds][1]);
+ sa[2] = sb[2] ^ *((u32_a_t*)rk[rounds][2]);
+ sa[3] = sb[3] ^ *((u32_a_t*)rk[rounds][3]);
+
+ for (r = rounds - 1; r > 1; r--)
{
- *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[r][0]);
- *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[r][1]);
- *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[r][2]);
- *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[r][3]);
- *((u32_a_t*)(b )) = (*((u32_a_t*)T5[u.temp[0][0]])
- ^ *((u32_a_t*)T6[u.temp[3][1]])
- ^ *((u32_a_t*)T7[u.temp[2][2]])
- ^ *((u32_a_t*)T8[u.temp[1][3]]));
- *((u32_a_t*)(b+ 4)) = (*((u32_a_t*)T5[u.temp[1][0]])
- ^ *((u32_a_t*)T6[u.temp[0][1]])
- ^ *((u32_a_t*)T7[u.temp[3][2]])
- ^ *((u32_a_t*)T8[u.temp[2][3]]));
- *((u32_a_t*)(b+ 8)) = (*((u32_a_t*)T5[u.temp[2][0]])
- ^ *((u32_a_t*)T6[u.temp[1][1]])
- ^ *((u32_a_t*)T7[u.temp[0][2]])
- ^ *((u32_a_t*)T8[u.temp[3][3]]));
- *((u32_a_t*)(b+12)) = (*((u32_a_t*)T5[u.temp[3][0]])
- ^ *((u32_a_t*)T6[u.temp[2][1]])
- ^ *((u32_a_t*)T7[u.temp[1][2]])
- ^ *((u32_a_t*)T8[u.temp[0][3]]));
+ sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+ sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+ sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+ sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
+
+ r--;
+
+ sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = *((u32_a_t*)rk[r][0]) ^ sb[0];
+
+ sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = *((u32_a_t*)rk[r][1]) ^ sb[1];
+
+ sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = *((u32_a_t*)rk[r][2]) ^ sb[2];
+
+ sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = *((u32_a_t*)rk[r][3]) ^ sb[3];
}
+ sb[0] = rol(decT[(byte)(sa[0] >> (0 * 8))], (0 * 8));
+ sb[1] = rol(decT[(byte)(sa[0] >> (1 * 8))], (1 * 8));
+ sb[2] = rol(decT[(byte)(sa[0] >> (2 * 8))], (2 * 8));
+ sb[3] = rol(decT[(byte)(sa[0] >> (3 * 8))], (3 * 8));
+ sa[0] = *((u32_a_t*)rk[1][0]) ^ sb[0];
+
+ sb[1] ^= rol(decT[(byte)(sa[1] >> (0 * 8))], (0 * 8));
+ sb[2] ^= rol(decT[(byte)(sa[1] >> (1 * 8))], (1 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[1] >> (2 * 8))], (2 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[1] >> (3 * 8))], (3 * 8));
+ sa[1] = *((u32_a_t*)rk[1][1]) ^ sb[1];
+
+ sb[2] ^= rol(decT[(byte)(sa[2] >> (0 * 8))], (0 * 8));
+ sb[3] ^= rol(decT[(byte)(sa[2] >> (1 * 8))], (1 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[2] >> (2 * 8))], (2 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[2] >> (3 * 8))], (3 * 8));
+ sa[2] = *((u32_a_t*)rk[1][2]) ^ sb[2];
+
+ sb[3] ^= rol(decT[(byte)(sa[3] >> (0 * 8))], (0 * 8));
+ sa[0] ^= rol(decT[(byte)(sa[3] >> (1 * 8))], (1 * 8));
+ sa[1] ^= rol(decT[(byte)(sa[3] >> (2 * 8))], (2 * 8));
+ sa[2] ^= rol(decT[(byte)(sa[3] >> (3 * 8))], (3 * 8));
+ sa[3] = *((u32_a_t*)rk[1][3]) ^ sb[3];
+
/* Last round is special. */
- *((u32_a_t*)u.temp[0]) = *((u32_a_t*)(b )) ^ *((u32_a_t*)rk[1][0]);
- *((u32_a_t*)u.temp[1]) = *((u32_a_t*)(b+ 4)) ^ *((u32_a_t*)rk[1][1]);
- *((u32_a_t*)u.temp[2]) = *((u32_a_t*)(b+ 8)) ^ *((u32_a_t*)rk[1][2]);
- *((u32_a_t*)u.temp[3]) = *((u32_a_t*)(b+12)) ^ *((u32_a_t*)rk[1][3]);
- b[ 0] = S5[u.temp[0][0]];
- b[ 1] = S5[u.temp[3][1]];
- b[ 2] = S5[u.temp[2][2]];
- b[ 3] = S5[u.temp[1][3]];
- b[ 4] = S5[u.temp[1][0]];
- b[ 5] = S5[u.temp[0][1]];
- b[ 6] = S5[u.temp[3][2]];
- b[ 7] = S5[u.temp[2][3]];
- b[ 8] = S5[u.temp[2][0]];
- b[ 9] = S5[u.temp[1][1]];
- b[10] = S5[u.temp[0][2]];
- b[11] = S5[u.temp[3][3]];
- b[12] = S5[u.temp[3][0]];
- b[13] = S5[u.temp[2][1]];
- b[14] = S5[u.temp[1][2]];
- b[15] = S5[u.temp[0][3]];
- *((u32_a_t*)(b )) ^= *((u32_a_t*)rk[0][0]);
- *((u32_a_t*)(b+ 4)) ^= *((u32_a_t*)rk[0][1]);
- *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[0][2]);
- *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[0][3]);
+ sb[0] = inv_sbox[(byte)(sa[0] >> (0 * 8))] << (0 * 8);
+ sb[1] = inv_sbox[(byte)(sa[0] >> (1 * 8))] << (1 * 8);
+ sb[2] = inv_sbox[(byte)(sa[0] >> (2 * 8))] << (2 * 8);
+ sb[3] = inv_sbox[(byte)(sa[0] >> (3 * 8))] << (3 * 8);
+ sa[0] = sb[0] ^ *((u32_a_t*)rk[0][0]);
+
+ sb[1] ^= inv_sbox[(byte)(sa[1] >> (0 * 8))] << (0 * 8);
+ sb[2] ^= inv_sbox[(byte)(sa[1] >> (1 * 8))] << (1 * 8);
+ sb[3] ^= inv_sbox[(byte)(sa[1] >> (2 * 8))] << (2 * 8);
+ sa[0] ^= inv_sbox[(byte)(sa[1] >> (3 * 8))] << (3 * 8);
+ sa[1] = sb[1] ^ *((u32_a_t*)rk[0][1]);
+
+ sb[2] ^= inv_sbox[(byte)(sa[2] >> (0 * 8))] << (0 * 8);
+ sb[3] ^= inv_sbox[(byte)(sa[2] >> (1 * 8))] << (1 * 8);
+ sa[0] ^= inv_sbox[(byte)(sa[2] >> (2 * 8))] << (2 * 8);
+ sa[1] ^= inv_sbox[(byte)(sa[2] >> (3 * 8))] << (3 * 8);
+ sa[2] = sb[2] ^ *((u32_a_t*)rk[0][2]);
+
+ sb[3] ^= inv_sbox[(byte)(sa[3] >> (0 * 8))] << (0 * 8);
+ sa[0] ^= inv_sbox[(byte)(sa[3] >> (1 * 8))] << (1 * 8);
+ sa[1] ^= inv_sbox[(byte)(sa[3] >> (2 * 8))] << (2 * 8);
+ sa[2] ^= inv_sbox[(byte)(sa[3] >> (3 * 8))] << (3 * 8);
+ sa[3] = sb[3] ^ *((u32_a_t*)rk[0][3]);
+
+ buf_put_le32(b + 0, sa[0]);
+ buf_put_le32(b + 4, sa[1]);
+ buf_put_le32(b + 8, sa[2]);
+ buf_put_le32(b + 12, sa[3]);
#undef rk
+
+ return (56+2*sizeof(int));
}
-#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+#endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/
/* Decrypt one block. AX and BX may be the same. */
@@ -791,31 +925,14 @@ do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
const unsigned char *ax)
{
#ifdef USE_AMD64_ASM
- return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds);
+ return _gcry_aes_amd64_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds,
+ &dec_tables);
#elif defined(USE_ARM_ASM)
- return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds);
+ return _gcry_aes_arm_decrypt_block(ctx->keyschdec, bx, ax, ctx->rounds,
+ &dec_tables);
#else
- /* BX and AX are not necessary correctly aligned. Thus we might
- need to copy them here. We try to align to a 16 bytes. */
- if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f))
- {
- union
- {
- u32 dummy[4];
- byte a[16] ATTR_ALIGNED_16;
- } a;
-
- buf_cpy (a.a, ax, 16);
- do_decrypt_aligned (ctx, a.a, a.a);
- buf_cpy (bx, a.a, 16);
- }
- else
- {
- do_decrypt_aligned (ctx, bx, ax);
- }
-
- return (56+2*sizeof(int));
-#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
+ return do_decrypt_fn (ctx, bx, ax);
+#endif /*!USE_ARM_ASM && !USE_AMD64_ASM*/
}
@@ -837,6 +954,9 @@ rijndael_decrypt (void *context, byte *b, const byte *a)
check_decryption_preparation (ctx);
+ if (ctx->prefetch_dec_fn)
+ ctx->prefetch_dec_fn();
+
return ctx->decrypt_fn (ctx, b, a);
}
@@ -855,6 +975,9 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
unsigned int burn_depth = 0;
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
if (0)
;
#ifdef USE_AESNI
@@ -898,6 +1021,9 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
check_decryption_preparation (ctx);
+ if (ctx->prefetch_dec_fn)
+ ctx->prefetch_dec_fn();
+
if (0)
;
#ifdef USE_AESNI
@@ -932,7 +1058,6 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
}
-
/* Run the self-tests for AES 128. Returns NULL on success. */
static const char*