diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-23 22:30:48 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-28 20:08:56 +0200 |
commit | 74184c28fbe7ff58cf57f0094ef957d94045da7d (patch) | |
tree | 3d1c33a07e749439aa010abbdfbdd6fff0364356 | |
parent | 909644ef5883927262366c356eed530e55aba478 (diff) | |
download | libgcrypt-74184c28fbe7ff58cf57f0094ef957d94045da7d.tar.gz |
keccak: rewrite for improved performance
* cipher/Makefile.am: Add 'keccak_permute_32.h' and
'keccak_permute_64.h'.
* cipher/hash-common.h [USE_SHA3] (MD_BLOCK_MAX_BLOCKSIZE): Remove.
* cipher/keccak.c (USE_64BIT, USE_32BIT, USE_64BIT_BMI2)
(USE_64BIT_SHLD, USE_32BIT_BMI2, NEED_COMMON64, NEED_COMMON32BI)
(keccak_ops_t): New.
(KECCAK_STATE): Add 'state64' and 'state32bi' members.
(KECCAK_CONTEXT): Remove 'bctx'; add 'blocksize', 'count' and 'ops'.
(rol64, keccak_f1600_state_permute): Remove.
[NEED_COMMON64] (round_consts_64bit, keccak_extract_inplace64): New.
[NEED_COMMON32BI] (round_consts_32bit, keccak_extract_inplace32bi)
(keccak_absorb_lane32bi): New.
[USE_64BIT] (ANDN64, ROL64, keccak_f1600_state_permute64)
(keccak_absorb_lanes64, keccak_generic64_ops): New.
[USE_64BIT_SHLD] (ANDN64, ROL64, keccak_f1600_state_permute64_shld)
(keccak_absorb_lanes64_shld, keccak_shld_64_ops): New.
[USE_64BIT_BMI2] (ANDN64, ROL64, keccak_f1600_state_permute64_bmi2)
(keccak_absorb_lanes64_bmi2, keccak_bmi2_64_ops): New.
[USE_32BIT] (ANDN64, ROL64, keccak_f1600_state_permute32bi)
(keccak_absorb_lanes32bi, keccak_generic32bi_ops): New.
[USE_32BIT_BMI2] (ANDN64, ROL64, keccak_f1600_state_permute32bi_bmi2)
(pext, pdep, keccak_absorb_lane32bi_bmi2, keccak_absorb_lanes32bi_bmi2)
(keccak_extract_inplace32bi_bmi2, keccak_bmi2_32bi_ops): New.
(keccak_write): New.
(keccak_init): Adjust to KECCAK_CONTEXT changes; add implementation
selection based on HWF features.
(keccak_final): Adjust to KECCAK_CONTEXT changes; use selected 'ops'
for state manipulation.
(keccak_read): Adjust to KECCAK_CONTEXT changes.
(_gcry_digest_spec_sha3_224, _gcry_digest_spec_sha3_256)
(_gcry_digest_spec_sha3_348, _gcry_digest_spec_sha3_512): Use
'keccak_write' instead of '_gcry_md_block_write'.
* cipher/keccak_permute_32.h: New.
* cipher/keccak_permute_64.h: New.
--
Patch adds new generic 64-bit and 32-bit implementations and
optimized implementations for SHA3:
- Generic 64-bit implementation based on 'simple' implementation
from SUPERCOP package.
- Generic 32-bit bit-inteleaved implementataion based on
'simple32bi' implementation from SUPERCOP package.
- Intel BMI2 optimized variants of 64-bit and 32-bit BI
implementations.
- Intel SHLD optimized variant of 64-bit implementation.
Patch also makes proper use of sponge construction to avoid
use of addition input buffer.
Below are bench-slope benchmarks for new 64-bit implementations
made on Intel Core i5-4570 (no turbo, 3.2 Ghz, gcc-4.9.2).
Before (amd64):
SHA3-224 | 3.92 ns/B 243.2 MiB/s 12.55 c/B
SHA3-256 | 4.15 ns/B 230.0 MiB/s 13.27 c/B
SHA3-384 | 5.40 ns/B 176.6 MiB/s 17.29 c/B
SHA3-512 | 7.77 ns/B 122.7 MiB/s 24.87 c/B
After (generic 64-bit, amd64), 1.10x faster):
SHA3-224 | 3.57 ns/B 267.4 MiB/s 11.42 c/B
SHA3-256 | 3.77 ns/B 252.8 MiB/s 12.07 c/B
SHA3-384 | 4.91 ns/B 194.1 MiB/s 15.72 c/B
SHA3-512 | 7.06 ns/B 135.0 MiB/s 22.61 c/B
After (Intel SHLD 64-bit, amd64, 1.13x faster):
SHA3-224 | 3.48 ns/B 273.7 MiB/s 11.15 c/B
SHA3-256 | 3.68 ns/B 258.9 MiB/s 11.79 c/B
SHA3-384 | 4.80 ns/B 198.7 MiB/s 15.36 c/B
SHA3-512 | 6.89 ns/B 138.4 MiB/s 22.05 c/B
After (Intel BMI2 64-bit, amd64, 1.45x faster):
SHA3-224 | 2.71 ns/B 352.1 MiB/s 8.67 c/B
SHA3-256 | 2.86 ns/B 333.2 MiB/s 9.16 c/B
SHA3-384 | 3.72 ns/B 256.2 MiB/s 11.91 c/B
SHA3-512 | 5.34 ns/B 178.5 MiB/s 17.10 c/B
Benchmarks of new 32-bit implementations on Intel Core i5-4570
(no turbo, 3.2 Ghz, gcc-4.9.2):
Before (win32):
SHA3-224 | 12.05 ns/B 79.16 MiB/s 38.56 c/B
SHA3-256 | 12.75 ns/B 74.78 MiB/s 40.82 c/B
SHA3-384 | 16.63 ns/B 57.36 MiB/s 53.22 c/B
SHA3-512 | 23.97 ns/B 39.79 MiB/s 76.72 c/B
After (generic 32-bit BI, win32, 1.23x to 1.29x faster):
SHA3-224 | 9.76 ns/B 97.69 MiB/s 31.25 c/B
SHA3-256 | 10.27 ns/B 92.82 MiB/s 32.89 c/B
SHA3-384 | 13.22 ns/B 72.16 MiB/s 42.31 c/B
SHA3-512 | 18.65 ns/B 51.13 MiB/s 59.70 c/B
After (Intel BMI2 32-bit BI, win32, 1.66x to 1.70x faster):
SHA3-224 | 7.26 ns/B 131.4 MiB/s 23.23 c/B
SHA3-256 | 7.65 ns/B 124.7 MiB/s 24.47 c/B
SHA3-384 | 9.87 ns/B 96.67 MiB/s 31.58 c/B
SHA3-512 | 14.05 ns/B 67.85 MiB/s 44.99 c/B
Benchmarks of new 32-bit implementation on ARM Cortex-A8
(1008 Mhz, gcc-4.9.1):
Before:
SHA3-224 | 148.6 ns/B 6.42 MiB/s 149.8 c/B
SHA3-256 | 157.2 ns/B 6.07 MiB/s 158.4 c/B
SHA3-384 | 205.3 ns/B 4.65 MiB/s 206.9 c/B
SHA3-512 | 296.3 ns/B 3.22 MiB/s 298.6 c/B
After (1.56x faster):
SHA3-224 | 96.12 ns/B 9.92 MiB/s 96.89 c/B
SHA3-256 | 101.5 ns/B 9.40 MiB/s 102.3 c/B
SHA3-384 | 131.4 ns/B 7.26 MiB/s 132.5 c/B
SHA3-512 | 188.2 ns/B 5.07 MiB/s 189.7 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/Makefile.am | 2 | ||||
-rw-r--r-- | cipher/hash-common.h | 12 | ||||
-rw-r--r-- | cipher/keccak.c | 808 | ||||
-rw-r--r-- | cipher/keccak_permute_32.h | 535 | ||||
-rw-r--r-- | cipher/keccak_permute_64.h | 290 |
5 files changed, 1404 insertions, 243 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index b08c9a97..be03d065 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -90,7 +90,7 @@ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \ sha512-armv7-neon.S \ -keccak.c \ +keccak.c keccak_permute_32.h keccak_permute_64.h \ stribog.c \ tiger.c \ whirlpool.c whirlpool-sse2-amd64.S \ diff --git a/cipher/hash-common.h b/cipher/hash-common.h index e1ae5a24..27d670d8 100644 --- a/cipher/hash-common.h +++ b/cipher/hash-common.h @@ -33,15 +33,9 @@ typedef unsigned int (*_gcry_md_block_write_t) (void *c, const unsigned char *blks, size_t nblks); -#if defined(HAVE_U64_TYPEDEF) && (defined(USE_SHA512) || defined(USE_SHA3) || \ - defined(USE_WHIRLPOOL)) -/* SHA-512, SHA-3 and Whirlpool needs u64. SHA-512 and SHA3 need larger - * buffer. */ -# ifdef USE_SHA3 -# define MD_BLOCK_MAX_BLOCKSIZE (1152 / 8) -# else -# define MD_BLOCK_MAX_BLOCKSIZE 128 -# endif +#if defined(HAVE_U64_TYPEDEF) && (defined(USE_SHA512) || defined(USE_WHIRLPOOL)) +/* SHA-512 and Whirlpool needs u64. SHA-512 needs larger buffer. */ +# define MD_BLOCK_MAX_BLOCKSIZE 128 # define MD_NBLOCKS_TYPE u64 #else # define MD_BLOCK_MAX_BLOCKSIZE 64 diff --git a/cipher/keccak.c b/cipher/keccak.c index 4a9c1f27..3a72294a 100644 --- a/cipher/keccak.c +++ b/cipher/keccak.c @@ -27,11 +27,45 @@ #include "hash-common.h" -/* The code is based on public-domain/CC0 "Keccak-readable-and-compact.c" - * implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, - * Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer. From: - * https://github.com/gvanas/KeccakCodePackage - */ + +/* USE_64BIT indicates whether to use 64-bit generic implementation. + * USE_32BIT indicates whether to use 32-bit generic implementation. */ +#undef USE_64BIT +#if defined(__x86_64__) || SIZEOF_UNSIGNED_LONG == 8 +# define USE_64BIT 1 +#else +# define USE_32BIT 1 +#endif + + +/* USE_64BIT_BMI2 indicates whether to compile with 64-bit Intel BMI2 code. */ +#undef USE_64BIT_BMI2 +#if defined(USE_64BIT) && defined(HAVE_GCC_INLINE_ASM_BMI2) +# define USE_64BIT_BMI2 1 +#endif + + +/* USE_64BIT_SHLD indicates whether to compile with 64-bit Intel SHLD code. */ +#undef USE_64BIT_SHLD +#if defined(USE_64BIT) && defined (__GNUC__) && defined(__x86_64__) +# define USE_64BIT_SHLD 1 +#endif + + +/* USE_32BIT_BMI2 indicates whether to compile with 32-bit Intel BMI2 code. */ +#undef USE_32BIT_BMI2 +#if defined(USE_32BIT) && defined(HAVE_GCC_INLINE_ASM_BMI2) +# define USE_32BIT_BMI2 1 +#endif + + +#ifdef USE_64BIT +# define NEED_COMMON64 1 +#endif + +#ifdef USE_32BIT +# define NEED_COMMON32BI 1 +#endif #define SHA3_DELIMITED_SUFFIX 0x06 @@ -40,220 +74,528 @@ typedef struct { - u64 state[5][5]; + union { +#ifdef NEED_COMMON64 + u64 state64[25]; +#endif +#ifdef NEED_COMMON32BI + u32 state32bi[50]; +#endif + } u; } KECCAK_STATE; typedef struct { - gcry_md_block_ctx_t bctx; + unsigned int (*permute)(KECCAK_STATE *hd); + unsigned int (*absorb)(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes); + unsigned int (*extract_inplace) (KECCAK_STATE *hd, unsigned int outlen); +} keccak_ops_t; + + +typedef struct KECCAK_CONTEXT_S +{ KECCAK_STATE state; unsigned int outlen; + unsigned int blocksize; + unsigned int count; + const keccak_ops_t *ops; } KECCAK_CONTEXT; -static inline u64 -rol64 (u64 x, unsigned int n) + +#ifdef NEED_COMMON64 + +static const u64 round_consts_64bit[24] = { - return ((x << n) | (x >> (64 - n))); -} + U64_C(0x0000000000000001), U64_C(0x0000000000008082), + U64_C(0x800000000000808A), U64_C(0x8000000080008000), + U64_C(0x000000000000808B), U64_C(0x0000000080000001), + U64_C(0x8000000080008081), U64_C(0x8000000000008009), + U64_C(0x000000000000008A), U64_C(0x0000000000000088), + U64_C(0x0000000080008009), U64_C(0x000000008000000A), + U64_C(0x000000008000808B), U64_C(0x800000000000008B), + U64_C(0x8000000000008089), U64_C(0x8000000000008003), + U64_C(0x8000000000008002), U64_C(0x8000000000000080), + U64_C(0x000000000000800A), U64_C(0x800000008000000A), + U64_C(0x8000000080008081), U64_C(0x8000000000008080), + U64_C(0x0000000080000001), U64_C(0x8000000080008008) +}; -/* Function that computes the Keccak-f[1600] permutation on the given state. */ -static unsigned int keccak_f1600_state_permute(KECCAK_STATE *hd) +static unsigned int +keccak_extract_inplace64(KECCAK_STATE *hd, unsigned int outlen) { - static const u64 round_consts[24] = - { - U64_C(0x0000000000000001), U64_C(0x0000000000008082), - U64_C(0x800000000000808A), U64_C(0x8000000080008000), - U64_C(0x000000000000808B), U64_C(0x0000000080000001), - U64_C(0x8000000080008081), U64_C(0x8000000000008009), - U64_C(0x000000000000008A), U64_C(0x0000000000000088), - U64_C(0x0000000080008009), U64_C(0x000000008000000A), - U64_C(0x000000008000808B), U64_C(0x800000000000008B), - U64_C(0x8000000000008089), U64_C(0x8000000000008003), - U64_C(0x8000000000008002), U64_C(0x8000000000000080), - U64_C(0x000000000000800A), U64_C(0x800000008000000A), - U64_C(0x8000000080008081), U64_C(0x8000000000008080), - U64_C(0x0000000080000001), U64_C(0x8000000080008008) - }; - unsigned int round; + unsigned int i; - for (round = 0; round < 24; round++) + for (i = 0; i < outlen / 8 + !!(outlen % 8); i++) { - { - /* θ step (see [Keccak Reference, Section 2.3.2]) === */ - u64 C[5], D[5]; - - /* Compute the parity of the columns */ - C[0] = hd->state[0][0] ^ hd->state[1][0] ^ hd->state[2][0] - ^ hd->state[3][0] ^ hd->state[4][0]; - C[1] = hd->state[0][1] ^ hd->state[1][1] ^ hd->state[2][1] - ^ hd->state[3][1] ^ hd->state[4][1]; - C[2] = hd->state[0][2] ^ hd->state[1][2] ^ hd->state[2][2] - ^ hd->state[3][2] ^ hd->state[4][2]; - C[3] = hd->state[0][3] ^ hd->state[1][3] ^ hd->state[2][3] - ^ hd->state[3][3] ^ hd->state[4][3]; - C[4] = hd->state[0][4] ^ hd->state[1][4] ^ hd->state[2][4] - ^ hd->state[3][4] ^ hd->state[4][4]; - - /* Compute the θ effect for a given column */ - D[0] = C[4] ^ rol64(C[1], 1); - D[1] = C[0] ^ rol64(C[2], 1); - D[2] = C[1] ^ rol64(C[3], 1); - D[3] = C[2] ^ rol64(C[4], 1); - D[4] = C[3] ^ rol64(C[0], 1); - - /* Add the θ effect to the whole column */ - hd->state[0][0] ^= D[0]; - hd->state[1][0] ^= D[0]; - hd->state[2][0] ^= D[0]; - hd->state[3][0] ^= D[0]; - hd->state[4][0] ^= D[0]; - - /* Add the θ effect to the whole column */ - hd->state[0][1] ^= D[1]; - hd->state[1][1] ^= D[1]; - hd->state[2][1] ^= D[1]; - hd->state[3][1] ^= D[1]; - hd->state[4][1] ^= D[1]; - - /* Add the θ effect to the whole column */ - hd->state[0][2] ^= D[2]; - hd->state[1][2] ^= D[2]; - hd->state[2][2] ^= D[2]; - hd->state[3][2] ^= D[2]; - hd->state[4][2] ^= D[2]; - - /* Add the θ effect to the whole column */ - hd->state[0][3] ^= D[3]; - hd->state[1][3] ^= D[3]; - hd->state[2][3] ^= D[3]; - hd->state[3][3] ^= D[3]; - hd->state[4][3] ^= D[3]; - - /* Add the θ effect to the whole column */ - hd->state[0][4] ^= D[4]; - hd->state[1][4] ^= D[4]; - hd->state[2][4] ^= D[4]; - hd->state[3][4] ^= D[4]; - hd->state[4][4] ^= D[4]; - } - - { - /* ρ and π steps (see [Keccak Reference, Sections 2.3.3 and 2.3.4]) */ - u64 current, temp; - -#define do_swap_n_rol(x, y, r) \ - temp = hd->state[y][x]; \ - hd->state[y][x] = rol64(current, r); \ - current = temp; - - /* Start at coordinates (1 0) */ - current = hd->state[0][1]; - - /* Iterate over ((0 1)(2 3))^t * (1 0) for 0 ≤ t ≤ 23 */ - do_swap_n_rol(0, 2, 1); - do_swap_n_rol(2, 1, 3); - do_swap_n_rol(1, 2, 6); - do_swap_n_rol(2, 3, 10); - do_swap_n_rol(3, 3, 15); - do_swap_n_rol(3, 0, 21); - do_swap_n_rol(0, 1, 28); - do_swap_n_rol(1, 3, 36); - do_swap_n_rol(3, 1, 45); - do_swap_n_rol(1, 4, 55); - do_swap_n_rol(4, 4, 2); - do_swap_n_rol(4, 0, 14); - do_swap_n_rol(0, 3, 27); - do_swap_n_rol(3, 4, 41); - do_swap_n_rol(4, 3, 56); - do_swap_n_rol(3, 2, 8); - do_swap_n_rol(2, 2, 25); - do_swap_n_rol(2, 0, 43); - do_swap_n_rol(0, 4, 62); - do_swap_n_rol(4, 2, 18); - do_swap_n_rol(2, 4, 39); - do_swap_n_rol(4, 1, 61); - do_swap_n_rol(1, 1, 20); - do_swap_n_rol(1, 0, 44); - -#undef do_swap_n_rol - } - - { - /* χ step (see [Keccak Reference, Section 2.3.1]) */ - u64 temp[5]; - -#define do_x_step_for_plane(y) \ - /* Take a copy of the plane */ \ - temp[0] = hd->state[y][0]; \ - temp[1] = hd->state[y][1]; \ - temp[2] = hd->state[y][2]; \ - temp[3] = hd->state[y][3]; \ - temp[4] = hd->state[y][4]; \ - \ - /* Compute χ on the plane */ \ - hd->state[y][0] = temp[0] ^ ((~temp[1]) & temp[2]); \ - hd->state[y][1] = temp[1] ^ ((~temp[2]) & temp[3]); \ - hd->state[y][2] = temp[2] ^ ((~temp[3]) & temp[4]); \ - hd->state[y][3] = temp[3] ^ ((~temp[4]) & temp[0]); \ - hd->state[y][4] = temp[4] ^ ((~temp[0]) & temp[1]); - - do_x_step_for_plane(0); - do_x_step_for_plane(1); - do_x_step_for_plane(2); - do_x_step_for_plane(3); - do_x_step_for_plane(4); - -#undef do_x_step_for_plane - } - - { - /* ι step (see [Keccak Reference, Section 2.3.5]) */ - - hd->state[0][0] ^= round_consts[round]; - } + hd->u.state64[i] = le_bswap64(hd->u.state64[i]); } - return sizeof(void *) * 4 + sizeof(u64) * 10; + return 0; } +#endif /* NEED_COMMON64 */ + + +#ifdef NEED_COMMON32BI + +static const u32 round_consts_32bit[2 * 24] = +{ + 0x00000001UL, 0x00000000UL, 0x00000000UL, 0x00000089UL, + 0x00000000UL, 0x8000008bUL, 0x00000000UL, 0x80008080UL, + 0x00000001UL, 0x0000008bUL, 0x00000001UL, 0x00008000UL, + 0x00000001UL, 0x80008088UL, 0x00000001UL, 0x80000082UL, + 0x00000000UL, 0x0000000bUL, 0x00000000UL, 0x0000000aUL, + 0x00000001UL, 0x00008082UL, 0x00000000UL, 0x00008003UL, + 0x00000001UL, 0x0000808bUL, 0x00000001UL, 0x8000000bUL, + 0x00000001UL, 0x8000008aUL, 0x00000001UL, 0x80000081UL, + 0x00000000UL, 0x80000081UL, 0x00000000UL, 0x80000008UL, + 0x00000000UL, 0x00000083UL, 0x00000000UL, 0x80008003UL, + 0x00000001UL, 0x80008088UL, 0x00000000UL, 0x80000088UL, + 0x00000001UL, 0x00008000UL, 0x00000000UL, 0x80008082UL +}; static unsigned int -transform_blk (void *context, const unsigned char *data) +keccak_extract_inplace32bi(KECCAK_STATE *hd, unsigned int outlen) { - KECCAK_CONTEXT *ctx = context; - KECCAK_STATE *hd = &ctx->state; - u64 *state = (u64 *)hd->state; - const size_t bsize = ctx->bctx.blocksize; unsigned int i; + u32 x0; + u32 x1; + u32 t; + + for (i = 0; i < outlen / 8 + !!(outlen % 8); i++) + { + x0 = hd->u.state32bi[i * 2 + 0]; + x1 = hd->u.state32bi[i * 2 + 1]; + + t = (x0 & 0x0000FFFFUL) + (x1 << 16); + x1 = (x0 >> 16) + (x1 & 0xFFFF0000UL); + x0 = t; + t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL; x0 = x0 ^ t ^ (t << 8); + t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL; x0 = x0 ^ t ^ (t << 4); + t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL; x0 = x0 ^ t ^ (t << 2); + t = (x0 ^ (x0 >> 1)) & 0x22222222UL; x0 = x0 ^ t ^ (t << 1); + t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL; x1 = x1 ^ t ^ (t << 8); + t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL; x1 = x1 ^ t ^ (t << 4); + t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2); + t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1); + + hd->u.state32bi[i * 2 + 0] = le_bswap32(x0); + hd->u.state32bi[i * 2 + 1] = le_bswap32(x1); + } - /* Absorb input block. */ - for (i = 0; i < bsize / 8; i++) - state[i] ^= buf_get_le64(data + i * 8); + return 0; +} - return keccak_f1600_state_permute(hd) + 4 * sizeof(void *); +static inline void +keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1) +{ + u32 t; + + t = (x0 ^ (x0 >> 1)) & 0x22222222UL; x0 = x0 ^ t ^ (t << 1); + t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL; x0 = x0 ^ t ^ (t << 2); + t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL; x0 = x0 ^ t ^ (t << 4); + t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL; x0 = x0 ^ t ^ (t << 8); + t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1); + t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2); + t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL; x1 = x1 ^ t ^ (t << 4); + t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL; x1 = x1 ^ t ^ (t << 8); + lane[0] ^= (x0 & 0x0000FFFFUL) + (x1 << 16); + lane[1] ^= (x0 >> 16) + (x1 & 0xFFFF0000UL); } +#endif /* NEED_COMMON32BI */ + + +/* Construct generic 64-bit implementation. */ +#ifdef USE_64BIT + +# define ANDN64(x, y) (~(x) & (y)) +# define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \ + ((x) >> ((64 - (unsigned int)(n)) & 63))) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64 +# include "keccak_permute_64.h" + +# undef ANDN64 +# undef ROL64 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME static unsigned int -transform (void *context, const unsigned char *data, size_t nblks) +keccak_absorb_lanes64(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) { - KECCAK_CONTEXT *ctx = context; - const size_t bsize = ctx->bctx.blocksize; - unsigned int burn; + unsigned int burn = 0; + + while (nlanes) + { + hd->u.state64[pos] ^= buf_get_le64(lanes); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute64(hd); + pos = 0; + } + } + + return burn; +} + +static const keccak_ops_t keccak_generic64_ops = +{ + .permute = keccak_f1600_state_permute64, + .absorb = keccak_absorb_lanes64, + .extract_inplace = keccak_extract_inplace64, +}; + +#endif /* USE_64BIT */ + + +/* Construct 64-bit Intel SHLD implementation. */ +#ifdef USE_64BIT_SHLD + +# define ANDN64(x, y) (~(x) & (y)) +# define ROL64(x, n) ({ \ + u64 tmp = (x); \ + asm ("shldq %1, %0, %0" \ + : "+r" (tmp) \ + : "J" ((n) & 63) \ + : "cc"); \ + tmp; }) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld +# include "keccak_permute_64.h" + +# undef ANDN64 +# undef ROL64 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME + +static unsigned int +keccak_absorb_lanes64_shld(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) +{ + unsigned int burn = 0; + + while (nlanes) + { + hd->u.state64[pos] ^= buf_get_le64(lanes); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute64_shld(hd); + pos = 0; + } + } + + return burn; +} + +static const keccak_ops_t keccak_shld_64_ops = +{ + .permute = keccak_f1600_state_permute64_shld, + .absorb = keccak_absorb_lanes64_shld, + .extract_inplace = keccak_extract_inplace64, +}; + +#endif /* USE_64BIT_SHLD */ + + +/* Construct 64-bit Intel BMI2 implementation. */ +#ifdef USE_64BIT_BMI2 + +# define ANDN64(x, y) ({ \ + u64 tmp; \ + asm ("andnq %2, %1, %0" \ + : "=r" (tmp) \ + : "r0" (x), "rm" (y)); \ + tmp; }) + +# define ROL64(x, n) ({ \ + u64 tmp; \ + asm ("rorxq %2, %1, %0" \ + : "=r" (tmp) \ + : "rm0" (x), "J" (64 - ((n) & 63))); \ + tmp; }) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2 +# include "keccak_permute_64.h" + +# undef ANDN64 +# undef ROL64 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME + +static unsigned int +keccak_absorb_lanes64_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) +{ + unsigned int burn = 0; + + while (nlanes) + { + hd->u.state64[pos] ^= buf_get_le64(lanes); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute64_bmi2(hd); + pos = 0; + } + } + + return burn; +} + +static const keccak_ops_t keccak_bmi2_64_ops = +{ + .permute = keccak_f1600_state_permute64_bmi2, + .absorb = keccak_absorb_lanes64_bmi2, + .extract_inplace = keccak_extract_inplace64, +}; + +#endif /* USE_64BIT_BMI2 */ + + +/* Construct generic 32-bit implementation. */ +#ifdef USE_32BIT + +# define ANDN32(x, y) (~(x) & (y)) +# define ROL32(x, n) (((x) << ((unsigned int)n & 31)) | \ + ((x) >> ((32 - (unsigned int)(n)) & 31))) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute32bi +# include "keccak_permute_32.h" + +# undef ANDN32 +# undef ROL32 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME + +static unsigned int +keccak_absorb_lanes32bi(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) +{ + unsigned int burn = 0; - /* Absorb full blocks. */ - do + while (nlanes) { - burn = transform_blk (context, data); - data += bsize; + keccak_absorb_lane32bi(&hd->u.state32bi[pos * 2], + buf_get_le32(lanes + 0), + buf_get_le32(lanes + 4)); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute32bi(hd); + pos = 0; + } } - while (--nblks); return burn; } +static const keccak_ops_t keccak_generic32bi_ops = +{ + .permute = keccak_f1600_state_permute32bi, + .absorb = keccak_absorb_lanes32bi, + .extract_inplace = keccak_extract_inplace32bi, +}; + +#endif /* USE_32BIT */ + + +/* Construct 32-bit Intel BMI2 implementation. */ +#ifdef USE_32BIT_BMI2 + +# define ANDN32(x, y) ({ \ + u32 tmp; \ + asm ("andnl %2, %1, %0" \ + : "=r" (tmp) \ + : "r0" (x), "rm" (y)); \ + tmp; }) + +# define ROL32(x, n) ({ \ + u32 tmp; \ + asm ("rorxl %2, %1, %0" \ + : "=r" (tmp) \ + : "rm0" (x), "J" (32 - ((n) & 31))); \ + tmp; }) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute32bi_bmi2 +# include "keccak_permute_32.h" + +# undef ANDN32 +# undef ROL32 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME + +static inline u32 pext(u32 x, u32 mask) +{ + u32 tmp; + asm ("pextl %2, %1, %0" : "=r" (tmp) : "r0" (x), "rm" (mask)); + return tmp; +} + +static inline u32 pdep(u32 x, u32 mask) +{ + u32 tmp; + asm ("pdepl %2, %1, %0" : "=r" (tmp) : "r0" (x), "rm" (mask)); + return tmp; +} + +static inline void +keccak_absorb_lane32bi_bmi2(u32 *lane, u32 x0, u32 x1) +{ + x0 = pdep(pext(x0, 0x55555555), 0x0000ffff) | (pext(x0, 0xaaaaaaaa) << 16); + x1 = pdep(pext(x1, 0x55555555), 0x0000ffff) | (pext(x1, 0xaaaaaaaa) << 16); + + lane[0] ^= (x0 & 0x0000FFFFUL) + (x1 << 16); + lane[1] ^= (x0 >> 16) + (x1 & 0xFFFF0000UL); +} + +static unsigned int +keccak_absorb_lanes32bi_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) +{ + unsigned int burn = 0; + + while (nlanes) + { + keccak_absorb_lane32bi_bmi2(&hd->u.state32bi[pos * 2], + buf_get_le32(lanes + 0), + buf_get_le32(lanes + 4)); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute32bi_bmi2(hd); + pos = 0; + } + } + + return burn; +} + +static unsigned int +keccak_extract_inplace32bi_bmi2(KECCAK_STATE *hd, unsigned int outlen) +{ + unsigned int i; + u32 x0; + u32 x1; + u32 t; + + for (i = 0; i < outlen / 8 + !!(outlen % 8); i++) + { + x0 = hd->u.state32bi[i * 2 + 0]; + x1 = hd->u.state32bi[i * 2 + 1]; + + t = (x0 & 0x0000FFFFUL) + (x1 << 16); + x1 = (x0 >> 16) + (x1 & 0xFFFF0000UL); + x0 = t; + + x0 = pdep(pext(x0, 0xffff0001), 0xaaaaaaab) | pdep(x0 >> 1, 0x55555554); + x1 = pdep(pext(x1, 0xffff0001), 0xaaaaaaab) | pdep(x1 >> 1, 0x55555554); + + hd->u.state32bi[i * 2 + 0] = le_bswap32(x0); + hd->u.state32bi[i * 2 + 1] = le_bswap32(x1); + } + + return 0; +} + +static const keccak_ops_t keccak_bmi2_32bi_ops = +{ + .permute = keccak_f1600_state_permute32bi_bmi2, + .absorb = keccak_absorb_lanes32bi_bmi2, + .extract_inplace = keccak_extract_inplace32bi_bmi2, +}; + +#endif /* USE_32BIT */ + + +static void +keccak_write (void *context, const void *inbuf_arg, size_t inlen) +{ + KECCAK_CONTEXT *ctx = context; + const size_t bsize = ctx->blocksize; + const size_t blocklanes = bsize / 8; + const byte *inbuf = inbuf_arg; + unsigned int nburn, burn = 0; + unsigned int count, i; + unsigned int pos, nlanes; + + count = ctx->count; + + if (inlen && (count % 8)) + { + byte lane[8] = { 0, }; + + /* Complete absorbing partial input lane. */ + + pos = count / 8; + + for (i = count % 8; inlen && i < 8; i++) + { + lane[i] = *inbuf++; + inlen--; + count++; + } + + if (count == bsize) + count = 0; + + nburn = ctx->ops->absorb(&ctx->state, pos, lane, 1, + (count % 8) ? -1 : blocklanes); + burn = nburn > burn ? nburn : burn; + } + + /* Absorb full input lanes. */ + + pos = count / 8; + nlanes = inlen / 8; + if (nlanes > 0) + { + nburn = ctx->ops->absorb(&ctx->state, pos, inbuf, nlanes, blocklanes); + burn = nburn > burn ? nburn : burn; + inlen -= nlanes * 8; + inbuf += nlanes * 8; + count += nlanes * 8; + count = count % bsize; + } + + if (inlen) + { + byte lane[8] = { 0, }; + + /* Absorb remaining partial input lane. */ + + pos = count / 8; + + for (i = count % 8; inlen && i < 8; i++) + { + lane[i] = *inbuf++; + inlen--; + count++; + } + + nburn = ctx->ops->absorb(&ctx->state, pos, lane, 1, -1); + burn = nburn > burn ? nburn : burn; + + gcry_assert(count < bsize); + } + + ctx->count = count; + + if (burn) + _gcry_burn_stack (burn); +} + static void keccak_init (int algo, void *context, unsigned int flags) @@ -267,29 +609,48 @@ keccak_init (int algo, void *context, unsigned int flags) memset (hd, 0, sizeof *hd); - ctx->bctx.nblocks = 0; - ctx->bctx.nblocks_high = 0; - ctx->bctx.count = 0; - ctx->bctx.bwrite = transform; + ctx->count = 0; + + /* Select generic implementation. */ +#ifdef USE_64BIT + ctx->ops = &keccak_generic64_ops; +#elif defined USE_32BIT + ctx->ops = &keccak_generic32bi_ops; +#endif + + /* Select optimized implementation based in hw features. */ + if (0) {} +#ifdef USE_64BIT_BMI2 + else if (features & HWF_INTEL_BMI2) + ctx->ops = &keccak_bmi2_64_ops; +#endif +#ifdef USE_32BIT_BMI2 + else if (features & HWF_INTEL_BMI2) + ctx->ops = &keccak_bmi2_32bi_ops; +#endif +#ifdef USE_64BIT_SHLD + else if (features & HWF_INTEL_FAST_SHLD) + ctx->ops = &keccak_shld_64_ops; +#endif /* Set input block size, in Keccak terms this is called 'rate'. */ switch (algo) { case GCRY_MD_SHA3_224: - ctx->bctx.blocksize = 1152 / 8; + ctx->blocksize = 1152 / 8; ctx->outlen = 224 / 8; break; case GCRY_MD_SHA3_256: - ctx->bctx.blocksize = 1088 / 8; + ctx->blocksize = 1088 / 8; ctx->outlen = 256 / 8; break; case GCRY_MD_SHA3_384: - ctx->bctx.blocksize = 832 / 8; + ctx->blocksize = 832 / 8; ctx->outlen = 384 / 8; break; case GCRY_MD_SHA3_512: - ctx->bctx.blocksize = 576 / 8; + ctx->blocksize = 576 / 8; ctx->outlen = 512 / 8; break; default: @@ -334,59 +695,37 @@ keccak_final (void *context) { KECCAK_CONTEXT *ctx = context; KECCAK_STATE *hd = &ctx->state; - const size_t bsize = ctx->bctx.blocksize; + const size_t bsize = ctx->blocksize; const byte suffix = SHA3_DELIMITED_SUFFIX; - u64 *state = (u64 *)hd->state; - unsigned int stack_burn_depth; + unsigned int nburn, burn = 0; unsigned int lastbytes; - unsigned int i; - byte *buf; + byte lane[8]; - _gcry_md_block_write (context, NULL, 0); /* flush */ - - buf = ctx->bctx.buf; - lastbytes = ctx->bctx.count; - - /* Absorb remaining bytes. */ - for (i = 0; i < lastbytes / 8; i++) - { - state[i] ^= buf_get_le64(buf); - buf += 8; - } - - for (i = 0; i < lastbytes % 8; i++) - { - state[lastbytes / 8] ^= (u64)*buf << (i * 8); - buf++; - } + lastbytes = ctx->count; /* Do the padding and switch to the squeezing phase */ /* Absorb the last few bits and add the first bit of padding (which coincides with the delimiter in delimited suffix) */ - state[lastbytes / 8] ^= (u64)suffix << ((lastbytes % 8) * 8); + buf_put_le64(lane, (u64)suffix << ((lastbytes % 8) * 8)); + nburn = ctx->ops->absorb(&ctx->state, lastbytes / 8, lane, 1, -1); + burn = nburn > burn ? nburn : burn; /* Add the second bit of padding. */ - state[(bsize - 1) / 8] ^= (u64)0x80 << (((bsize - 1) % 8) * 8); + buf_put_le64(lane, (u64)0x80 << (((bsize - 1) % 8) * 8)); + nburn = ctx->ops->absorb(&ctx->state, (bsize - 1) / 8, lane, 1, -1); + burn = nburn > burn ? nburn : burn; /* Switch to the squeezing phase. */ - stack_burn_depth = keccak_f1600_state_permute(hd); + nburn = ctx->ops->permute(hd); + burn = nburn > burn ? nburn : burn; /* Squeeze out all the output blocks */ if (ctx->outlen < bsize) { /* Output SHA3 digest. */ - buf = ctx->bctx.buf; - for (i = 0; i < ctx->outlen / 8; i++) - { - buf_put_le64(buf, state[i]); - buf += 8; - } - for (i = 0; i < ctx->outlen % 8; i++) - { - *buf = state[ctx->outlen / 8] >> (i * 8); - buf++; - } + nburn = ctx->ops->extract_inplace(hd, ctx->outlen); + burn = nburn > burn ? nburn : burn; } else { @@ -394,15 +733,18 @@ keccak_final (void *context) BUG(); } - _gcry_burn_stack (stack_burn_depth); + wipememory(lane, sizeof(lane)); + if (burn) + _gcry_burn_stack (burn); } static byte * keccak_read (void *context) { - KECCAK_CONTEXT *hd = (KECCAK_CONTEXT *) context; - return hd->bctx.buf; + KECCAK_CONTEXT *ctx = (KECCAK_CONTEXT *) context; + KECCAK_STATE *hd = &ctx->state; + return (byte *)&hd->u; } @@ -585,7 +927,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_224 = { GCRY_MD_SHA3_224, {0, 1}, "SHA3-224", sha3_224_asn, DIM (sha3_224_asn), oid_spec_sha3_224, 28, - sha3_224_init, _gcry_md_block_write, keccak_final, keccak_read, + sha3_224_init, keccak_write, keccak_final, keccak_read, sizeof (KECCAK_CONTEXT), run_selftests }; @@ -593,7 +935,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_256 = { GCRY_MD_SHA3_256, {0, 1}, "SHA3-256", sha3_256_asn, DIM (sha3_256_asn), oid_spec_sha3_256, 32, - sha3_256_init, _gcry_md_block_write, keccak_final, keccak_read, + sha3_256_init, keccak_write, keccak_final, keccak_read, sizeof (KECCAK_CONTEXT), run_selftests }; @@ -601,7 +943,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_384 = { GCRY_MD_SHA3_384, {0, 1}, "SHA3-384", sha3_384_asn, DIM (sha3_384_asn), oid_spec_sha3_384, 48, - sha3_384_init, _gcry_md_block_write, keccak_final, keccak_read, + sha3_384_init, keccak_write, keccak_final, keccak_read, sizeof (KECCAK_CONTEXT), run_selftests }; @@ -609,7 +951,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_512 = { GCRY_MD_SHA3_512, {0, 1}, "SHA3-512", sha3_512_asn, DIM (sha3_512_asn), oid_spec_sha3_512, 64, - sha3_512_init, _gcry_md_block_write, keccak_final, keccak_read, + sha3_512_init, keccak_write, keccak_final, keccak_read, sizeof (KECCAK_CONTEXT), run_selftests }; diff --git a/cipher/keccak_permute_32.h b/cipher/keccak_permute_32.h new file mode 100644 index 00000000..fed93831 --- /dev/null +++ b/cipher/keccak_permute_32.h @@ -0,0 +1,535 @@ +/* keccak_permute_32.h - Keccak permute function (simple 32bit bit-interleaved) + * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* The code is based on public-domain/CC0 "keccakc1024/simple32bi/ + * Keccak-simple32BI.c" implementation by Ronny Van Keer from SUPERCOP toolkit + * package. + */ + +/* Function that computes the Keccak-f[1600] permutation on the given state. */ +static unsigned int +KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) +{ + const u32 *round_consts = round_consts_32bit; + u32 Aba0, Abe0, Abi0, Abo0, Abu0; + u32 Aba1, Abe1, Abi1, Abo1, Abu1; + u32 Aga0, Age0, Agi0, Ago0, Agu0; + u32 Aga1, Age1, Agi1, Ago1, Agu1; + u32 Aka0, Ake0, Aki0, Ako0, Aku0; + u32 Aka1, Ake1, Aki1, Ako1, Aku1; + u32 Ama0, Ame0, Ami0, Amo0, Amu0; + u32 Ama1, Ame1, Ami1, Amo1, Amu1; + u32 Asa0, Ase0, Asi0, Aso0, Asu0; + u32 Asa1, Ase1, Asi1, Aso1, Asu1; + u32 BCa0, BCe0, BCi0, BCo0, BCu0; + u32 BCa1, BCe1, BCi1, BCo1, BCu1; + u32 Da0, De0, Di0, Do0, Du0; + u32 Da1, De1, Di1, Do1, Du1; + u32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0; + u32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1; + u32 Ega0, Ege0, Egi0, Ego0, Egu0; + u32 Ega1, Ege1, Egi1, Ego1, Egu1; + u32 Eka0, Eke0, Eki0, Eko0, Eku0; + u32 Eka1, Eke1, Eki1, Eko1, Eku1; + u32 Ema0, Eme0, Emi0, Emo0, Emu0; + u32 Ema1, Eme1, Emi1, Emo1, Emu1; + u32 Esa0, Ese0, Esi0, Eso0, Esu0; + u32 Esa1, Ese1, Esi1, Eso1, Esu1; + u32 *state = hd->u.state32bi; + unsigned int round; + + Aba0 = state[0]; + Aba1 = state[1]; + Abe0 = state[2]; + Abe1 = state[3]; + Abi0 = state[4]; + Abi1 = state[5]; + Abo0 = state[6]; + Abo1 = state[7]; + Abu0 = state[8]; + Abu1 = state[9]; + Aga0 = state[10]; + Aga1 = state[11]; + Age0 = state[12]; + Age1 = state[13]; + Agi0 = state[14]; + Agi1 = state[15]; + Ago0 = state[16]; + Ago1 = state[17]; + Agu0 = state[18]; + Agu1 = state[19]; + Aka0 = state[20]; + Aka1 = state[21]; + Ake0 = state[22]; + Ake1 = state[23]; + Aki0 = state[24]; + Aki1 = state[25]; + Ako0 = state[26]; + Ako1 = state[27]; + Aku0 = state[28]; + Aku1 = state[29]; + Ama0 = state[30]; + Ama1 = state[31]; + Ame0 = state[32]; + Ame1 = state[33]; + Ami0 = state[34]; + Ami1 = state[35]; + Amo0 = state[36]; + Amo1 = state[37]; + Amu0 = state[38]; + Amu1 = state[39]; + Asa0 = state[40]; + Asa1 = state[41]; + Ase0 = state[42]; + Ase1 = state[43]; + Asi0 = state[44]; + Asi1 = state[45]; + Aso0 = state[46]; + Aso1 = state[47]; + Asu0 = state[48]; + Asu1 = state[49]; + + for (round = 0; round < 24; round += 2) + { + /* prepareTheta */ + BCa0 = Aba0 ^ Aga0 ^ Aka0 ^ Ama0 ^ Asa0; + BCa1 = Aba1 ^ Aga1 ^ Aka1 ^ Ama1 ^ Asa1; + BCe0 = Abe0 ^ Age0 ^ Ake0 ^ Ame0 ^ Ase0; + BCe1 = Abe1 ^ Age1 ^ Ake1 ^ Ame1 ^ Ase1; + BCi0 = Abi0 ^ Agi0 ^ Aki0 ^ Ami0 ^ Asi0; + BCi1 = Abi1 ^ Agi1 ^ Aki1 ^ Ami1 ^ Asi1; + BCo0 = Abo0 ^ Ago0 ^ Ako0 ^ Amo0 ^ Aso0; + BCo1 = Abo1 ^ Ago1 ^ Ako1 ^ Amo1 ^ Aso1; + BCu0 = Abu0 ^ Agu0 ^ Aku0 ^ Amu0 ^ Asu0; + BCu1 = Abu1 ^ Agu1 ^ Aku1 ^ Amu1 ^ Asu1; + + /* thetaRhoPiChiIota(round , A, E) */ + Da0 = BCu0 ^ ROL32(BCe1, 1); + Da1 = BCu1 ^ BCe0; + De0 = BCa0 ^ ROL32(BCi1, 1); + De1 = BCa1 ^ BCi0; + Di0 = BCe0 ^ ROL32(BCo1, 1); + Di1 = BCe1 ^ BCo0; + Do0 = BCi0 ^ ROL32(BCu1, 1); + Do1 = BCi1 ^ BCu0; + Du0 = BCo0 ^ ROL32(BCa1, 1); + Du1 = BCo1 ^ BCa0; + + Aba0 ^= Da0; + BCa0 = Aba0; + Age0 ^= De0; + BCe0 = ROL32(Age0, 22); + Aki1 ^= Di1; + BCi0 = ROL32(Aki1, 22); + Amo1 ^= Do1; + BCo0 = ROL32(Amo1, 11); + Asu0 ^= Du0; + BCu0 = ROL32(Asu0, 7); + Eba0 = BCa0 ^ ANDN32(BCe0, BCi0); + Eba0 ^= round_consts[round * 2 + 0]; + Ebe0 = BCe0 ^ ANDN32(BCi0, BCo0); + Ebi0 = BCi0 ^ ANDN32(BCo0, BCu0); + Ebo0 = BCo0 ^ ANDN32(BCu0, BCa0); + Ebu0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Aba1 ^= Da1; + BCa1 = Aba1; + Age1 ^= De1; + BCe1 = ROL32(Age1, 22); + Aki0 ^= Di0; + BCi1 = ROL32(Aki0, 21); + Amo0 ^= Do0; + BCo1 = ROL32(Amo0, 10); + Asu1 ^= Du1; + BCu1 = ROL32(Asu1, 7); + Eba1 = BCa1 ^ ANDN32(BCe1, BCi1); + Eba1 ^= round_consts[round * 2 + 1]; + Ebe1 = BCe1 ^ ANDN32(BCi1, BCo1); + Ebi1 = BCi1 ^ ANDN32(BCo1, BCu1); + Ebo1 = BCo1 ^ ANDN32(BCu1, BCa1); + Ebu1 = BCu1 ^ ANDN32(BCa1, BCe1); + + Abo0 ^= Do0; + BCa0 = ROL32(Abo0, 14); + Agu0 ^= Du0; + BCe0 = ROL32(Agu0, 10); + Aka1 ^= Da1; + BCi0 = ROL32(Aka1, 2); + Ame1 ^= De1; + BCo0 = ROL32(Ame1, 23); + Asi1 ^= Di1; + BCu0 = ROL32(Asi1, 31); + Ega0 = BCa0 ^ ANDN32(BCe0, BCi0); + Ege0 = BCe0 ^ ANDN32(BCi0, BCo0); + Egi0 = BCi0 ^ ANDN32(BCo0, BCu0); + Ego0 = BCo0 ^ ANDN32(BCu0, BCa0); + Egu0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Abo1 ^= Do1; + BCa1 = ROL32(Abo1, 14); + Agu1 ^= Du1; + BCe1 = ROL32(Agu1, 10); + Aka0 ^= Da0; + BCi1 = ROL32(Aka0, 1); + Ame0 ^= De0; + BCo1 = ROL32(Ame0, 22); + Asi0 ^= Di0; + BCu1 = ROL32(Asi0, 30); + Ega1 = BCa1 ^ ANDN32(BCe1, BCi1); + Ege1 = BCe1 ^ ANDN32(BCi1, BCo1); + Egi1 = BCi1 ^ ANDN32(BCo1, BCu1); + Ego1 = BCo1 ^ ANDN32(BCu1, BCa1); + Egu1 = BCu1 ^ ANDN32(BCa1, BCe1); + + Abe1 ^= De1; + BCa0 = ROL32(Abe1, 1); + Agi0 ^= Di0; + BCe0 = ROL32(Agi0, 3); + Ako1 ^= Do1; + BCi0 = ROL32(Ako1, 13); + Amu0 ^= Du0; + BCo0 = ROL32(Amu0, 4); + Asa0 ^= Da0; + BCu0 = ROL32(Asa0, 9); + Eka0 = BCa0 ^ ANDN32(BCe0, BCi0); + Eke0 = BCe0 ^ ANDN32(BCi0, BCo0); + Eki0 = BCi0 ^ ANDN32(BCo0, BCu0); + Eko0 = BCo0 ^ ANDN32(BCu0, BCa0); + Eku0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Abe0 ^= De0; + BCa1 = Abe0; + Agi1 ^= Di1; + BCe1 = ROL32(Agi1, 3); + Ako0 ^= Do0; + BCi1 = ROL32(Ako0, 12); + Amu1 ^= Du1; + BCo1 = ROL32(Amu1, 4); + Asa1 ^= Da1; + BCu1 = ROL32(Asa1, 9); + Eka1 = BCa1 ^ ANDN32(BCe1, BCi1); + Eke1 = BCe1 ^ ANDN32(BCi1, BCo1); + Eki1 = BCi1 ^ ANDN32(BCo1, BCu1); + Eko1 = BCo1 ^ ANDN32(BCu1, BCa1); + Eku1 = BCu1 ^ ANDN32(BCa1, BCe1); + + Abu1 ^= Du1; + BCa0 = ROL32(Abu1, 14); + Aga0 ^= Da0; + BCe0 = ROL32(Aga0, 18); + Ake0 ^= De0; + BCi0 = ROL32(Ake0, 5); + Ami1 ^= Di1; + BCo0 = ROL32(Ami1, 8); + Aso0 ^= Do0; + BCu0 = ROL32(Aso0, 28); + Ema0 = BCa0 ^ ANDN32(BCe0, BCi0); + Eme0 = BCe0 ^ ANDN32(BCi0, BCo0); + Emi0 = BCi0 ^ ANDN32(BCo0, BCu0); + Emo0 = BCo0 ^ ANDN32(BCu0, BCa0); + Emu0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Abu0 ^= Du0; + BCa1 = ROL32(Abu0, 13); + Aga1 ^= Da1; + BCe1 = ROL32(Aga1, 18); + Ake1 ^= De1; + BCi1 = ROL32(Ake1, 5); + Ami0 ^= Di0; + BCo1 = ROL32(Ami0, 7); + Aso1 ^= Do1; + BCu1 = ROL32(Aso1, 28); + Ema1 = BCa1 ^ ANDN32(BCe1, BCi1); + Eme1 = BCe1 ^ ANDN32(BCi1, BCo1); + Emi1 = BCi1 ^ ANDN32(BCo1, BCu1); + Emo1 = BCo1 ^ ANDN32(BCu1, BCa1); + Emu1 = BCu1 ^ ANDN32(BCa1, BCe1); + + Abi0 ^= Di0; + BCa0 = ROL32(Abi0, 31); + Ago1 ^= Do1; + BCe0 = ROL32(Ago1, 28); + Aku1 ^= Du1; + BCi0 = ROL32(Aku1, 20); + Ama1 ^= Da1; + BCo0 = ROL32(Ama1, 21); + Ase0 ^= De0; + BCu0 = ROL32(Ase0, 1); + Esa0 = BCa0 ^ ANDN32(BCe0, BCi0); + Ese0 = BCe0 ^ ANDN32(BCi0, BCo0); + Esi0 = BCi0 ^ ANDN32(BCo0, BCu0); + Eso0 = BCo0 ^ ANDN32(BCu0, BCa0); + Esu0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Abi1 ^= Di1; + BCa1 = ROL32(Abi1, 31); + Ago0 ^= Do0; + BCe1 = ROL32(Ago0, 27); + Aku0 ^= Du0; + BCi1 = ROL32(Aku0, 19); + Ama0 ^= Da0; + BCo1 = ROL32(Ama0, 20); + Ase1 ^= De1; + BCu1 = ROL32(Ase1, 1); + Esa1 = BCa1 ^ ANDN32(BCe1, BCi1); + Ese1 = BCe1 ^ ANDN32(BCi1, BCo1); + Esi1 = BCi1 ^ ANDN32(BCo1, BCu1); + Eso1 = BCo1 ^ ANDN32(BCu1, BCa1); + Esu1 = BCu1 ^ ANDN32(BCa1, BCe1); + + /* prepareTheta */ + BCa0 = Eba0 ^ Ega0 ^ Eka0 ^ Ema0 ^ Esa0; + BCa1 = Eba1 ^ Ega1 ^ Eka1 ^ Ema1 ^ Esa1; + BCe0 = Ebe0 ^ Ege0 ^ Eke0 ^ Eme0 ^ Ese0; + BCe1 = Ebe1 ^ Ege1 ^ Eke1 ^ Eme1 ^ Ese1; + BCi0 = Ebi0 ^ Egi0 ^ Eki0 ^ Emi0 ^ Esi0; + BCi1 = Ebi1 ^ Egi1 ^ Eki1 ^ Emi1 ^ Esi1; + BCo0 = Ebo0 ^ Ego0 ^ Eko0 ^ Emo0 ^ Eso0; + BCo1 = Ebo1 ^ Ego1 ^ Eko1 ^ Emo1 ^ Eso1; + BCu0 = Ebu0 ^ Egu0 ^ Eku0 ^ Emu0 ^ Esu0; + BCu1 = Ebu1 ^ Egu1 ^ Eku1 ^ Emu1 ^ Esu1; + + /* thetaRhoPiChiIota(round+1, E, A) */ + Da0 = BCu0 ^ ROL32(BCe1, 1); + Da1 = BCu1 ^ BCe0; + De0 = BCa0 ^ ROL32(BCi1, 1); + De1 = BCa1 ^ BCi0; + Di0 = BCe0 ^ ROL32(BCo1, 1); + Di1 = BCe1 ^ BCo0; + Do0 = BCi0 ^ ROL32(BCu1, 1); + Do1 = BCi1 ^ BCu0; + Du0 = BCo0 ^ ROL32(BCa1, 1); + Du1 = BCo1 ^ BCa0; + + Eba0 ^= Da0; + BCa0 = Eba0; + Ege0 ^= De0; + BCe0 = ROL32(Ege0, 22); + Eki1 ^= Di1; + BCi0 = ROL32(Eki1, 22); + Emo1 ^= Do1; + BCo0 = ROL32(Emo1, 11); + Esu0 ^= Du0; + BCu0 = ROL32(Esu0, 7); + Aba0 = BCa0 ^ ANDN32(BCe0, BCi0); + Aba0 ^= round_consts[round * 2 + 2]; + Abe0 = BCe0 ^ ANDN32(BCi0, BCo0); + Abi0 = BCi0 ^ ANDN32(BCo0, BCu0); + Abo0 = BCo0 ^ ANDN32(BCu0, BCa0); + Abu0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Eba1 ^= Da1; + BCa1 = Eba1; + Ege1 ^= De1; + BCe1 = ROL32(Ege1, 22); + Eki0 ^= Di0; + BCi1 = ROL32(Eki0, 21); + Emo0 ^= Do0; + BCo1 = ROL32(Emo0, 10); + Esu1 ^= Du1; + BCu1 = ROL32(Esu1, 7); + Aba1 = BCa1 ^ ANDN32(BCe1, BCi1); + Aba1 ^= round_consts[round * 2 + 3]; + Abe1 = BCe1 ^ ANDN32(BCi1, BCo1); + Abi1 = BCi1 ^ ANDN32(BCo1, BCu1); + Abo1 = BCo1 ^ ANDN32(BCu1, BCa1); + Abu1 = BCu1 ^ ANDN32(BCa1, BCe1); + + Ebo0 ^= Do0; + BCa0 = ROL32(Ebo0, 14); + Egu0 ^= Du0; + BCe0 = ROL32(Egu0, 10); + Eka1 ^= Da1; + BCi0 = ROL32(Eka1, 2); + Eme1 ^= De1; + BCo0 = ROL32(Eme1, 23); + Esi1 ^= Di1; + BCu0 = ROL32(Esi1, 31); + Aga0 = BCa0 ^ ANDN32(BCe0, BCi0); + Age0 = BCe0 ^ ANDN32(BCi0, BCo0); + Agi0 = BCi0 ^ ANDN32(BCo0, BCu0); + Ago0 = BCo0 ^ ANDN32(BCu0, BCa0); + Agu0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Ebo1 ^= Do1; + BCa1 = ROL32(Ebo1, 14); + Egu1 ^= Du1; + BCe1 = ROL32(Egu1, 10); + Eka0 ^= Da0; + BCi1 = ROL32(Eka0, 1); + Eme0 ^= De0; + BCo1 = ROL32(Eme0, 22); + Esi0 ^= Di0; + BCu1 = ROL32(Esi0, 30); + Aga1 = BCa1 ^ ANDN32(BCe1, BCi1); + Age1 = BCe1 ^ ANDN32(BCi1, BCo1); + Agi1 = BCi1 ^ ANDN32(BCo1, BCu1); + Ago1 = BCo1 ^ ANDN32(BCu1, BCa1); + Agu1 = BCu1 ^ ANDN32(BCa1, BCe1); + + Ebe1 ^= De1; + BCa0 = ROL32(Ebe1, 1); + Egi0 ^= Di0; + BCe0 = ROL32(Egi0, 3); + Eko1 ^= Do1; + BCi0 = ROL32(Eko1, 13); + Emu0 ^= Du0; + BCo0 = ROL32(Emu0, 4); + Esa0 ^= Da0; + BCu0 = ROL32(Esa0, 9); + Aka0 = BCa0 ^ ANDN32(BCe0, BCi0); + Ake0 = BCe0 ^ ANDN32(BCi0, BCo0); + Aki0 = BCi0 ^ ANDN32(BCo0, BCu0); + Ako0 = BCo0 ^ ANDN32(BCu0, BCa0); + Aku0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Ebe0 ^= De0; + BCa1 = Ebe0; + Egi1 ^= Di1; + BCe1 = ROL32(Egi1, 3); + Eko0 ^= Do0; + BCi1 = ROL32(Eko0, 12); + Emu1 ^= Du1; + BCo1 = ROL32(Emu1, 4); + Esa1 ^= Da1; + BCu1 = ROL32(Esa1, 9); + Aka1 = BCa1 ^ ANDN32(BCe1, BCi1); + Ake1 = BCe1 ^ ANDN32(BCi1, BCo1); + Aki1 = BCi1 ^ ANDN32(BCo1, BCu1); + Ako1 = BCo1 ^ ANDN32(BCu1, BCa1); + Aku1 = BCu1 ^ ANDN32(BCa1, BCe1); + + Ebu1 ^= Du1; + BCa0 = ROL32(Ebu1, 14); + Ega0 ^= Da0; + BCe0 = ROL32(Ega0, 18); + Eke0 ^= De0; + BCi0 = ROL32(Eke0, 5); + Emi1 ^= Di1; + BCo0 = ROL32(Emi1, 8); + Eso0 ^= Do0; + BCu0 = ROL32(Eso0, 28); + Ama0 = BCa0 ^ ANDN32(BCe0, BCi0); + Ame0 = BCe0 ^ ANDN32(BCi0, BCo0); + Ami0 = BCi0 ^ ANDN32(BCo0, BCu0); + Amo0 = BCo0 ^ ANDN32(BCu0, BCa0); + Amu0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Ebu0 ^= Du0; + BCa1 = ROL32(Ebu0, 13); + Ega1 ^= Da1; + BCe1 = ROL32(Ega1, 18); + Eke1 ^= De1; + BCi1 = ROL32(Eke1, 5); + Emi0 ^= Di0; + BCo1 = ROL32(Emi0, 7); + Eso1 ^= Do1; + BCu1 = ROL32(Eso1, 28); + Ama1 = BCa1 ^ ANDN32(BCe1, BCi1); + Ame1 = BCe1 ^ ANDN32(BCi1, BCo1); + Ami1 = BCi1 ^ ANDN32(BCo1, BCu1); + Amo1 = BCo1 ^ ANDN32(BCu1, BCa1); + Amu1 = BCu1 ^ ANDN32(BCa1, BCe1); + + Ebi0 ^= Di0; + BCa0 = ROL32(Ebi0, 31); + Ego1 ^= Do1; + BCe0 = ROL32(Ego1, 28); + Eku1 ^= Du1; + BCi0 = ROL32(Eku1, 20); + Ema1 ^= Da1; + BCo0 = ROL32(Ema1, 21); + Ese0 ^= De0; + BCu0 = ROL32(Ese0, 1); + Asa0 = BCa0 ^ ANDN32(BCe0, BCi0); + Ase0 = BCe0 ^ ANDN32(BCi0, BCo0); + Asi0 = BCi0 ^ ANDN32(BCo0, BCu0); + Aso0 = BCo0 ^ ANDN32(BCu0, BCa0); + Asu0 = BCu0 ^ ANDN32(BCa0, BCe0); + + Ebi1 ^= Di1; + BCa1 = ROL32(Ebi1, 31); + Ego0 ^= Do0; + BCe1 = ROL32(Ego0, 27); + Eku0 ^= Du0; + BCi1 = ROL32(Eku0, 19); + Ema0 ^= Da0; + BCo1 = ROL32(Ema0, 20); + Ese1 ^= De1; + BCu1 = ROL32(Ese1, 1); + Asa1 = BCa1 ^ ANDN32(BCe1, BCi1); + Ase1 = BCe1 ^ ANDN32(BCi1, BCo1); + Asi1 = BCi1 ^ ANDN32(BCo1, BCu1); + Aso1 = BCo1 ^ ANDN32(BCu1, BCa1); + Asu1 = BCu1 ^ ANDN32(BCa1, BCe1); + } + + state[0] = Aba0; + state[1] = Aba1; + state[2] = Abe0; + state[3] = Abe1; + state[4] = Abi0; + state[5] = Abi1; + state[6] = Abo0; + state[7] = Abo1; + state[8] = Abu0; + state[9] = Abu1; + state[10] = Aga0; + state[11] = Aga1; + state[12] = Age0; + state[13] = Age1; + state[14] = Agi0; + state[15] = Agi1; + state[16] = Ago0; + state[17] = Ago1; + state[18] = Agu0; + state[19] = Agu1; + state[20] = Aka0; + state[21] = Aka1; + state[22] = Ake0; + state[23] = Ake1; + state[24] = Aki0; + state[25] = Aki1; + state[26] = Ako0; + state[27] = Ako1; + state[28] = Aku0; + state[29] = Aku1; + state[30] = Ama0; + state[31] = Ama1; + state[32] = Ame0; + state[33] = Ame1; + state[34] = Ami0; + state[35] = Ami1; + state[36] = Amo0; + state[37] = Amo1; + state[38] = Amu0; + state[39] = Amu1; + state[40] = Asa0; + state[41] = Asa1; + state[42] = Ase0; + state[43] = Ase1; + state[44] = Asi0; + state[45] = Asi1; + state[46] = Aso0; + state[47] = Aso1; + state[48] = Asu0; + state[49] = Asu1; + + return sizeof(void *) * 4 + sizeof(u32) * 12 * 5 * 2; +} diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h new file mode 100644 index 00000000..1264f195 --- /dev/null +++ b/cipher/keccak_permute_64.h @@ -0,0 +1,290 @@ +/* keccak_permute_64.h - Keccak permute function (simple 64bit) + * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* The code is based on public-domain/CC0 "keccakc1024/simple/Keccak-simple.c" + * implementation by Ronny Van Keer from SUPERCOP toolkit package. + */ + +/* Function that computes the Keccak-f[1600] permutation on the given state. */ +static unsigned int +KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) +{ + const u64 *round_consts = round_consts_64bit; + u64 Aba, Abe, Abi, Abo, Abu; + u64 Aga, Age, Agi, Ago, Agu; + u64 Aka, Ake, Aki, Ako, Aku; + u64 Ama, Ame, Ami, Amo, Amu; + u64 Asa, Ase, Asi, Aso, Asu; + u64 BCa, BCe, BCi, BCo, BCu; + u64 Da, De, Di, Do, Du; + u64 Eba, Ebe, Ebi, Ebo, Ebu; + u64 Ega, Ege, Egi, Ego, Egu; + u64 Eka, Eke, Eki, Eko, Eku; + u64 Ema, Eme, Emi, Emo, Emu; + u64 Esa, Ese, Esi, Eso, Esu; + u64 *state = hd->u.state64; + unsigned int round; + + Aba = state[0]; + Abe = state[1]; + Abi = state[2]; + Abo = state[3]; + Abu = state[4]; + Aga = state[5]; + Age = state[6]; + Agi = state[7]; + Ago = state[8]; + Agu = state[9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for (round = 0; round < 24; round += 2) + { + /* prepareTheta */ + BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa; + BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase; + BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi; + BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso; + BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; + + /* thetaRhoPiChiIotaPrepareTheta(round , A, E) */ + Da = BCu ^ ROL64(BCe, 1); + De = BCa ^ ROL64(BCi, 1); + Di = BCe ^ ROL64(BCo, 1); + Do = BCi ^ ROL64(BCu, 1); + Du = BCo ^ ROL64(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL64(Age, 44); + Aki ^= Di; + BCi = ROL64(Aki, 43); + Amo ^= Do; + BCo = ROL64(Amo, 21); + Asu ^= Du; + BCu = ROL64(Asu, 14); + Eba = BCa ^ ANDN64(BCe, BCi); + Eba ^= (u64)round_consts[round]; + Ebe = BCe ^ ANDN64(BCi, BCo); + Ebi = BCi ^ ANDN64(BCo, BCu); + Ebo = BCo ^ ANDN64(BCu, BCa); + Ebu = BCu ^ ANDN64(BCa, BCe); + + Abo ^= Do; + BCa = ROL64(Abo, 28); + Agu ^= Du; + BCe = ROL64(Agu, 20); + Aka ^= Da; + BCi = ROL64(Aka, 3); + Ame ^= De; + BCo = ROL64(Ame, 45); + Asi ^= Di; + BCu = ROL64(Asi, 61); + Ega = BCa ^ ANDN64(BCe, BCi); + Ege = BCe ^ ANDN64(BCi, BCo); + Egi = BCi ^ ANDN64(BCo, BCu); + Ego = BCo ^ ANDN64(BCu, BCa); + Egu = BCu ^ ANDN64(BCa, BCe); + + Abe ^= De; + BCa = ROL64(Abe, 1); + Agi ^= Di; + BCe = ROL64(Agi, 6); + Ako ^= Do; + BCi = ROL64(Ako, 25); + Amu ^= Du; + BCo = ROL64(Amu, 8); + Asa ^= Da; + BCu = ROL64(Asa, 18); + Eka = BCa ^ ANDN64(BCe, BCi); + Eke = BCe ^ ANDN64(BCi, BCo); + Eki = BCi ^ ANDN64(BCo, BCu); + Eko = BCo ^ ANDN64(BCu, BCa); + Eku = BCu ^ ANDN64(BCa, BCe); + + Abu ^= Du; + BCa = ROL64(Abu, 27); + Aga ^= Da; + BCe = ROL64(Aga, 36); + Ake ^= De; + BCi = ROL64(Ake, 10); + Ami ^= Di; + BCo = ROL64(Ami, 15); + Aso ^= Do; + BCu = ROL64(Aso, 56); + Ema = BCa ^ ANDN64(BCe, BCi); + Eme = BCe ^ ANDN64(BCi, BCo); + Emi = BCi ^ ANDN64(BCo, BCu); + Emo = BCo ^ ANDN64(BCu, BCa); + Emu = BCu ^ ANDN64(BCa, BCe); + + Abi ^= Di; + BCa = ROL64(Abi, 62); + Ago ^= Do; + BCe = ROL64(Ago, 55); + Aku ^= Du; + BCi = ROL64(Aku, 39); + Ama ^= Da; + BCo = ROL64(Ama, 41); + Ase ^= De; + BCu = ROL64(Ase, 2); + Esa = BCa ^ ANDN64(BCe, BCi); + Ese = BCe ^ ANDN64(BCi, BCo); + Esi = BCi ^ ANDN64(BCo, BCu); + Eso = BCo ^ ANDN64(BCu, BCa); + Esu = BCu ^ ANDN64(BCa, BCe); + + /* prepareTheta */ + BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa; + BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; + BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; + BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; + BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; + + /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */ + Da = BCu ^ ROL64(BCe, 1); + De = BCa ^ ROL64(BCi, 1); + Di = BCe ^ ROL64(BCo, 1); + Do = BCi ^ ROL64(BCu, 1); + Du = BCo ^ ROL64(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL64(Ege, 44); + Eki ^= Di; + BCi = ROL64(Eki, 43); + Emo ^= Do; + BCo = ROL64(Emo, 21); + Esu ^= Du; + BCu = ROL64(Esu, 14); + Aba = BCa ^ ANDN64(BCe, BCi); + Aba ^= (u64)round_consts[round + 1]; + Abe = BCe ^ ANDN64(BCi, BCo); + Abi = BCi ^ ANDN64(BCo, BCu); + Abo = BCo ^ ANDN64(BCu, BCa); + Abu = BCu ^ ANDN64(BCa, BCe); + + Ebo ^= Do; + BCa = ROL64(Ebo, 28); + Egu ^= Du; + BCe = ROL64(Egu, 20); + Eka ^= Da; + BCi = ROL64(Eka, 3); + Eme ^= De; + BCo = ROL64(Eme, 45); + Esi ^= Di; + BCu = ROL64(Esi, 61); + Aga = BCa ^ ANDN64(BCe, BCi); + Age = BCe ^ ANDN64(BCi, BCo); + Agi = BCi ^ ANDN64(BCo, BCu); + Ago = BCo ^ ANDN64(BCu, BCa); + Agu = BCu ^ ANDN64(BCa, BCe); + + Ebe ^= De; + BCa = ROL64(Ebe, 1); + Egi ^= Di; + BCe = ROL64(Egi, 6); + Eko ^= Do; + BCi = ROL64(Eko, 25); + Emu ^= Du; + BCo = ROL64(Emu, 8); + Esa ^= Da; + BCu = ROL64(Esa, 18); + Aka = BCa ^ ANDN64(BCe, BCi); + Ake = BCe ^ ANDN64(BCi, BCo); + Aki = BCi ^ ANDN64(BCo, BCu); + Ako = BCo ^ ANDN64(BCu, BCa); + Aku = BCu ^ ANDN64(BCa, BCe); + + Ebu ^= Du; + BCa = ROL64(Ebu, 27); + Ega ^= Da; + BCe = ROL64(Ega, 36); + Eke ^= De; + BCi = ROL64(Eke, 10); + Emi ^= Di; + BCo = ROL64(Emi, 15); + Eso ^= Do; + BCu = ROL64(Eso, 56); + Ama = BCa ^ ANDN64(BCe, BCi); + Ame = BCe ^ ANDN64(BCi, BCo); + Ami = BCi ^ ANDN64(BCo, BCu); + Amo = BCo ^ ANDN64(BCu, BCa); + Amu = BCu ^ ANDN64(BCa, BCe); + + Ebi ^= Di; + BCa = ROL64(Ebi, 62); + Ego ^= Do; + BCe = ROL64(Ego, 55); + Eku ^= Du; + BCi = ROL64(Eku, 39); + Ema ^= Da; + BCo = ROL64(Ema, 41); + Ese ^= De; + BCu = ROL64(Ese, 2); + Asa = BCa ^ ANDN64(BCe, BCi); + Ase = BCe ^ ANDN64(BCi, BCo); + Asi = BCi ^ ANDN64(BCo, BCu); + Aso = BCo ^ ANDN64(BCu, BCa); + Asu = BCu ^ ANDN64(BCa, BCe); + } + + state[0] = Aba; + state[1] = Abe; + state[2] = Abi; + state[3] = Abo; + state[4] = Abu; + state[5] = Aga; + state[6] = Age; + state[7] = Agi; + state[8] = Ago; + state[9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; + + return sizeof(void *) * 4 + sizeof(u64) * 12 * 5; +} |