diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-23 22:30:48 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-28 20:08:56 +0200 |
commit | 74184c28fbe7ff58cf57f0094ef957d94045da7d (patch) | |
tree | 3d1c33a07e749439aa010abbdfbdd6fff0364356 /cipher/keccak.c | |
parent | 909644ef5883927262366c356eed530e55aba478 (diff) | |
download | libgcrypt-74184c28fbe7ff58cf57f0094ef957d94045da7d.tar.gz |
keccak: rewrite for improved performance
* cipher/Makefile.am: Add 'keccak_permute_32.h' and
'keccak_permute_64.h'.
* cipher/hash-common.h [USE_SHA3] (MD_BLOCK_MAX_BLOCKSIZE): Remove.
* cipher/keccak.c (USE_64BIT, USE_32BIT, USE_64BIT_BMI2)
(USE_64BIT_SHLD, USE_32BIT_BMI2, NEED_COMMON64, NEED_COMMON32BI)
(keccak_ops_t): New.
(KECCAK_STATE): Add 'state64' and 'state32bi' members.
(KECCAK_CONTEXT): Remove 'bctx'; add 'blocksize', 'count' and 'ops'.
(rol64, keccak_f1600_state_permute): Remove.
[NEED_COMMON64] (round_consts_64bit, keccak_extract_inplace64): New.
[NEED_COMMON32BI] (round_consts_32bit, keccak_extract_inplace32bi)
(keccak_absorb_lane32bi): New.
[USE_64BIT] (ANDN64, ROL64, keccak_f1600_state_permute64)
(keccak_absorb_lanes64, keccak_generic64_ops): New.
[USE_64BIT_SHLD] (ANDN64, ROL64, keccak_f1600_state_permute64_shld)
(keccak_absorb_lanes64_shld, keccak_shld_64_ops): New.
[USE_64BIT_BMI2] (ANDN64, ROL64, keccak_f1600_state_permute64_bmi2)
(keccak_absorb_lanes64_bmi2, keccak_bmi2_64_ops): New.
[USE_32BIT] (ANDN64, ROL64, keccak_f1600_state_permute32bi)
(keccak_absorb_lanes32bi, keccak_generic32bi_ops): New.
[USE_32BIT_BMI2] (ANDN64, ROL64, keccak_f1600_state_permute32bi_bmi2)
(pext, pdep, keccak_absorb_lane32bi_bmi2, keccak_absorb_lanes32bi_bmi2)
(keccak_extract_inplace32bi_bmi2, keccak_bmi2_32bi_ops): New.
(keccak_write): New.
(keccak_init): Adjust to KECCAK_CONTEXT changes; add implementation
selection based on HWF features.
(keccak_final): Adjust to KECCAK_CONTEXT changes; use selected 'ops'
for state manipulation.
(keccak_read): Adjust to KECCAK_CONTEXT changes.
(_gcry_digest_spec_sha3_224, _gcry_digest_spec_sha3_256)
(_gcry_digest_spec_sha3_384, _gcry_digest_spec_sha3_512): Use
'keccak_write' instead of '_gcry_md_block_write'.
* cipher/keccak_permute_32.h: New.
* cipher/keccak_permute_64.h: New.
--
Patch adds new generic 64-bit and 32-bit implementations and
optimized implementations for SHA3:
- Generic 64-bit implementation based on 'simple' implementation
from SUPERCOP package.
- Generic 32-bit bit-interleaved implementation based on
'simple32bi' implementation from SUPERCOP package.
- Intel BMI2 optimized variants of 64-bit and 32-bit BI
implementations.
- Intel SHLD optimized variant of 64-bit implementation.
Patch also makes proper use of sponge construction to avoid
use of an additional input buffer.
Below are bench-slope benchmarks for new 64-bit implementations
made on Intel Core i5-4570 (no turbo, 3.2 GHz, gcc-4.9.2).
Before (amd64):
SHA3-224 | 3.92 ns/B 243.2 MiB/s 12.55 c/B
SHA3-256 | 4.15 ns/B 230.0 MiB/s 13.27 c/B
SHA3-384 | 5.40 ns/B 176.6 MiB/s 17.29 c/B
SHA3-512 | 7.77 ns/B 122.7 MiB/s 24.87 c/B
After (generic 64-bit, amd64, 1.10x faster):
SHA3-224 | 3.57 ns/B 267.4 MiB/s 11.42 c/B
SHA3-256 | 3.77 ns/B 252.8 MiB/s 12.07 c/B
SHA3-384 | 4.91 ns/B 194.1 MiB/s 15.72 c/B
SHA3-512 | 7.06 ns/B 135.0 MiB/s 22.61 c/B
After (Intel SHLD 64-bit, amd64, 1.13x faster):
SHA3-224 | 3.48 ns/B 273.7 MiB/s 11.15 c/B
SHA3-256 | 3.68 ns/B 258.9 MiB/s 11.79 c/B
SHA3-384 | 4.80 ns/B 198.7 MiB/s 15.36 c/B
SHA3-512 | 6.89 ns/B 138.4 MiB/s 22.05 c/B
After (Intel BMI2 64-bit, amd64, 1.45x faster):
SHA3-224 | 2.71 ns/B 352.1 MiB/s 8.67 c/B
SHA3-256 | 2.86 ns/B 333.2 MiB/s 9.16 c/B
SHA3-384 | 3.72 ns/B 256.2 MiB/s 11.91 c/B
SHA3-512 | 5.34 ns/B 178.5 MiB/s 17.10 c/B
Benchmarks of new 32-bit implementations on Intel Core i5-4570
(no turbo, 3.2 GHz, gcc-4.9.2):
Before (win32):
SHA3-224 | 12.05 ns/B 79.16 MiB/s 38.56 c/B
SHA3-256 | 12.75 ns/B 74.78 MiB/s 40.82 c/B
SHA3-384 | 16.63 ns/B 57.36 MiB/s 53.22 c/B
SHA3-512 | 23.97 ns/B 39.79 MiB/s 76.72 c/B
After (generic 32-bit BI, win32, 1.23x to 1.29x faster):
SHA3-224 | 9.76 ns/B 97.69 MiB/s 31.25 c/B
SHA3-256 | 10.27 ns/B 92.82 MiB/s 32.89 c/B
SHA3-384 | 13.22 ns/B 72.16 MiB/s 42.31 c/B
SHA3-512 | 18.65 ns/B 51.13 MiB/s 59.70 c/B
After (Intel BMI2 32-bit BI, win32, 1.66x to 1.70x faster):
SHA3-224 | 7.26 ns/B 131.4 MiB/s 23.23 c/B
SHA3-256 | 7.65 ns/B 124.7 MiB/s 24.47 c/B
SHA3-384 | 9.87 ns/B 96.67 MiB/s 31.58 c/B
SHA3-512 | 14.05 ns/B 67.85 MiB/s 44.99 c/B
Benchmarks of new 32-bit implementation on ARM Cortex-A8
(1008 MHz, gcc-4.9.1):
Before:
SHA3-224 | 148.6 ns/B 6.42 MiB/s 149.8 c/B
SHA3-256 | 157.2 ns/B 6.07 MiB/s 158.4 c/B
SHA3-384 | 205.3 ns/B 4.65 MiB/s 206.9 c/B
SHA3-512 | 296.3 ns/B 3.22 MiB/s 298.6 c/B
After (1.56x faster):
SHA3-224 | 96.12 ns/B 9.92 MiB/s 96.89 c/B
SHA3-256 | 101.5 ns/B 9.40 MiB/s 102.3 c/B
SHA3-384 | 131.4 ns/B 7.26 MiB/s 132.5 c/B
SHA3-512 | 188.2 ns/B 5.07 MiB/s 189.7 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/keccak.c')
-rw-r--r-- | cipher/keccak.c | 808 |
1 files changed, 575 insertions, 233 deletions
diff --git a/cipher/keccak.c b/cipher/keccak.c index 4a9c1f27..3a72294a 100644 --- a/cipher/keccak.c +++ b/cipher/keccak.c @@ -27,11 +27,45 @@ #include "hash-common.h" -/* The code is based on public-domain/CC0 "Keccak-readable-and-compact.c" - * implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, - * Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer. From: - * https://github.com/gvanas/KeccakCodePackage - */ + +/* USE_64BIT indicates whether to use 64-bit generic implementation. + * USE_32BIT indicates whether to use 32-bit generic implementation. */ +#undef USE_64BIT +#if defined(__x86_64__) || SIZEOF_UNSIGNED_LONG == 8 +# define USE_64BIT 1 +#else +# define USE_32BIT 1 +#endif + + +/* USE_64BIT_BMI2 indicates whether to compile with 64-bit Intel BMI2 code. */ +#undef USE_64BIT_BMI2 +#if defined(USE_64BIT) && defined(HAVE_GCC_INLINE_ASM_BMI2) +# define USE_64BIT_BMI2 1 +#endif + + +/* USE_64BIT_SHLD indicates whether to compile with 64-bit Intel SHLD code. */ +#undef USE_64BIT_SHLD +#if defined(USE_64BIT) && defined (__GNUC__) && defined(__x86_64__) +# define USE_64BIT_SHLD 1 +#endif + + +/* USE_32BIT_BMI2 indicates whether to compile with 32-bit Intel BMI2 code. 
*/ +#undef USE_32BIT_BMI2 +#if defined(USE_32BIT) && defined(HAVE_GCC_INLINE_ASM_BMI2) +# define USE_32BIT_BMI2 1 +#endif + + +#ifdef USE_64BIT +# define NEED_COMMON64 1 +#endif + +#ifdef USE_32BIT +# define NEED_COMMON32BI 1 +#endif #define SHA3_DELIMITED_SUFFIX 0x06 @@ -40,220 +74,528 @@ typedef struct { - u64 state[5][5]; + union { +#ifdef NEED_COMMON64 + u64 state64[25]; +#endif +#ifdef NEED_COMMON32BI + u32 state32bi[50]; +#endif + } u; } KECCAK_STATE; typedef struct { - gcry_md_block_ctx_t bctx; + unsigned int (*permute)(KECCAK_STATE *hd); + unsigned int (*absorb)(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes); + unsigned int (*extract_inplace) (KECCAK_STATE *hd, unsigned int outlen); +} keccak_ops_t; + + +typedef struct KECCAK_CONTEXT_S +{ KECCAK_STATE state; unsigned int outlen; + unsigned int blocksize; + unsigned int count; + const keccak_ops_t *ops; } KECCAK_CONTEXT; -static inline u64 -rol64 (u64 x, unsigned int n) + +#ifdef NEED_COMMON64 + +static const u64 round_consts_64bit[24] = { - return ((x << n) | (x >> (64 - n))); -} + U64_C(0x0000000000000001), U64_C(0x0000000000008082), + U64_C(0x800000000000808A), U64_C(0x8000000080008000), + U64_C(0x000000000000808B), U64_C(0x0000000080000001), + U64_C(0x8000000080008081), U64_C(0x8000000000008009), + U64_C(0x000000000000008A), U64_C(0x0000000000000088), + U64_C(0x0000000080008009), U64_C(0x000000008000000A), + U64_C(0x000000008000808B), U64_C(0x800000000000008B), + U64_C(0x8000000000008089), U64_C(0x8000000000008003), + U64_C(0x8000000000008002), U64_C(0x8000000000000080), + U64_C(0x000000000000800A), U64_C(0x800000008000000A), + U64_C(0x8000000080008081), U64_C(0x8000000000008080), + U64_C(0x0000000080000001), U64_C(0x8000000080008008) +}; -/* Function that computes the Keccak-f[1600] permutation on the given state. 
*/ -static unsigned int keccak_f1600_state_permute(KECCAK_STATE *hd) +static unsigned int +keccak_extract_inplace64(KECCAK_STATE *hd, unsigned int outlen) { - static const u64 round_consts[24] = - { - U64_C(0x0000000000000001), U64_C(0x0000000000008082), - U64_C(0x800000000000808A), U64_C(0x8000000080008000), - U64_C(0x000000000000808B), U64_C(0x0000000080000001), - U64_C(0x8000000080008081), U64_C(0x8000000000008009), - U64_C(0x000000000000008A), U64_C(0x0000000000000088), - U64_C(0x0000000080008009), U64_C(0x000000008000000A), - U64_C(0x000000008000808B), U64_C(0x800000000000008B), - U64_C(0x8000000000008089), U64_C(0x8000000000008003), - U64_C(0x8000000000008002), U64_C(0x8000000000000080), - U64_C(0x000000000000800A), U64_C(0x800000008000000A), - U64_C(0x8000000080008081), U64_C(0x8000000000008080), - U64_C(0x0000000080000001), U64_C(0x8000000080008008) - }; - unsigned int round; + unsigned int i; - for (round = 0; round < 24; round++) + for (i = 0; i < outlen / 8 + !!(outlen % 8); i++) { - { - /* θ step (see [Keccak Reference, Section 2.3.2]) === */ - u64 C[5], D[5]; - - /* Compute the parity of the columns */ - C[0] = hd->state[0][0] ^ hd->state[1][0] ^ hd->state[2][0] - ^ hd->state[3][0] ^ hd->state[4][0]; - C[1] = hd->state[0][1] ^ hd->state[1][1] ^ hd->state[2][1] - ^ hd->state[3][1] ^ hd->state[4][1]; - C[2] = hd->state[0][2] ^ hd->state[1][2] ^ hd->state[2][2] - ^ hd->state[3][2] ^ hd->state[4][2]; - C[3] = hd->state[0][3] ^ hd->state[1][3] ^ hd->state[2][3] - ^ hd->state[3][3] ^ hd->state[4][3]; - C[4] = hd->state[0][4] ^ hd->state[1][4] ^ hd->state[2][4] - ^ hd->state[3][4] ^ hd->state[4][4]; - - /* Compute the θ effect for a given column */ - D[0] = C[4] ^ rol64(C[1], 1); - D[1] = C[0] ^ rol64(C[2], 1); - D[2] = C[1] ^ rol64(C[3], 1); - D[3] = C[2] ^ rol64(C[4], 1); - D[4] = C[3] ^ rol64(C[0], 1); - - /* Add the θ effect to the whole column */ - hd->state[0][0] ^= D[0]; - hd->state[1][0] ^= D[0]; - hd->state[2][0] ^= D[0]; - hd->state[3][0] ^= D[0]; - 
hd->state[4][0] ^= D[0]; - - /* Add the θ effect to the whole column */ - hd->state[0][1] ^= D[1]; - hd->state[1][1] ^= D[1]; - hd->state[2][1] ^= D[1]; - hd->state[3][1] ^= D[1]; - hd->state[4][1] ^= D[1]; - - /* Add the θ effect to the whole column */ - hd->state[0][2] ^= D[2]; - hd->state[1][2] ^= D[2]; - hd->state[2][2] ^= D[2]; - hd->state[3][2] ^= D[2]; - hd->state[4][2] ^= D[2]; - - /* Add the θ effect to the whole column */ - hd->state[0][3] ^= D[3]; - hd->state[1][3] ^= D[3]; - hd->state[2][3] ^= D[3]; - hd->state[3][3] ^= D[3]; - hd->state[4][3] ^= D[3]; - - /* Add the θ effect to the whole column */ - hd->state[0][4] ^= D[4]; - hd->state[1][4] ^= D[4]; - hd->state[2][4] ^= D[4]; - hd->state[3][4] ^= D[4]; - hd->state[4][4] ^= D[4]; - } - - { - /* ρ and π steps (see [Keccak Reference, Sections 2.3.3 and 2.3.4]) */ - u64 current, temp; - -#define do_swap_n_rol(x, y, r) \ - temp = hd->state[y][x]; \ - hd->state[y][x] = rol64(current, r); \ - current = temp; - - /* Start at coordinates (1 0) */ - current = hd->state[0][1]; - - /* Iterate over ((0 1)(2 3))^t * (1 0) for 0 ≤ t ≤ 23 */ - do_swap_n_rol(0, 2, 1); - do_swap_n_rol(2, 1, 3); - do_swap_n_rol(1, 2, 6); - do_swap_n_rol(2, 3, 10); - do_swap_n_rol(3, 3, 15); - do_swap_n_rol(3, 0, 21); - do_swap_n_rol(0, 1, 28); - do_swap_n_rol(1, 3, 36); - do_swap_n_rol(3, 1, 45); - do_swap_n_rol(1, 4, 55); - do_swap_n_rol(4, 4, 2); - do_swap_n_rol(4, 0, 14); - do_swap_n_rol(0, 3, 27); - do_swap_n_rol(3, 4, 41); - do_swap_n_rol(4, 3, 56); - do_swap_n_rol(3, 2, 8); - do_swap_n_rol(2, 2, 25); - do_swap_n_rol(2, 0, 43); - do_swap_n_rol(0, 4, 62); - do_swap_n_rol(4, 2, 18); - do_swap_n_rol(2, 4, 39); - do_swap_n_rol(4, 1, 61); - do_swap_n_rol(1, 1, 20); - do_swap_n_rol(1, 0, 44); - -#undef do_swap_n_rol - } - - { - /* χ step (see [Keccak Reference, Section 2.3.1]) */ - u64 temp[5]; - -#define do_x_step_for_plane(y) \ - /* Take a copy of the plane */ \ - temp[0] = hd->state[y][0]; \ - temp[1] = hd->state[y][1]; \ - temp[2] = 
hd->state[y][2]; \ - temp[3] = hd->state[y][3]; \ - temp[4] = hd->state[y][4]; \ - \ - /* Compute χ on the plane */ \ - hd->state[y][0] = temp[0] ^ ((~temp[1]) & temp[2]); \ - hd->state[y][1] = temp[1] ^ ((~temp[2]) & temp[3]); \ - hd->state[y][2] = temp[2] ^ ((~temp[3]) & temp[4]); \ - hd->state[y][3] = temp[3] ^ ((~temp[4]) & temp[0]); \ - hd->state[y][4] = temp[4] ^ ((~temp[0]) & temp[1]); - - do_x_step_for_plane(0); - do_x_step_for_plane(1); - do_x_step_for_plane(2); - do_x_step_for_plane(3); - do_x_step_for_plane(4); - -#undef do_x_step_for_plane - } - - { - /* ι step (see [Keccak Reference, Section 2.3.5]) */ - - hd->state[0][0] ^= round_consts[round]; - } + hd->u.state64[i] = le_bswap64(hd->u.state64[i]); } - return sizeof(void *) * 4 + sizeof(u64) * 10; + return 0; } +#endif /* NEED_COMMON64 */ + + +#ifdef NEED_COMMON32BI + +static const u32 round_consts_32bit[2 * 24] = +{ + 0x00000001UL, 0x00000000UL, 0x00000000UL, 0x00000089UL, + 0x00000000UL, 0x8000008bUL, 0x00000000UL, 0x80008080UL, + 0x00000001UL, 0x0000008bUL, 0x00000001UL, 0x00008000UL, + 0x00000001UL, 0x80008088UL, 0x00000001UL, 0x80000082UL, + 0x00000000UL, 0x0000000bUL, 0x00000000UL, 0x0000000aUL, + 0x00000001UL, 0x00008082UL, 0x00000000UL, 0x00008003UL, + 0x00000001UL, 0x0000808bUL, 0x00000001UL, 0x8000000bUL, + 0x00000001UL, 0x8000008aUL, 0x00000001UL, 0x80000081UL, + 0x00000000UL, 0x80000081UL, 0x00000000UL, 0x80000008UL, + 0x00000000UL, 0x00000083UL, 0x00000000UL, 0x80008003UL, + 0x00000001UL, 0x80008088UL, 0x00000000UL, 0x80000088UL, + 0x00000001UL, 0x00008000UL, 0x00000000UL, 0x80008082UL +}; static unsigned int -transform_blk (void *context, const unsigned char *data) +keccak_extract_inplace32bi(KECCAK_STATE *hd, unsigned int outlen) { - KECCAK_CONTEXT *ctx = context; - KECCAK_STATE *hd = &ctx->state; - u64 *state = (u64 *)hd->state; - const size_t bsize = ctx->bctx.blocksize; unsigned int i; + u32 x0; + u32 x1; + u32 t; + + for (i = 0; i < outlen / 8 + !!(outlen % 8); i++) + { + x0 = 
hd->u.state32bi[i * 2 + 0]; + x1 = hd->u.state32bi[i * 2 + 1]; + + t = (x0 & 0x0000FFFFUL) + (x1 << 16); + x1 = (x0 >> 16) + (x1 & 0xFFFF0000UL); + x0 = t; + t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL; x0 = x0 ^ t ^ (t << 8); + t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL; x0 = x0 ^ t ^ (t << 4); + t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL; x0 = x0 ^ t ^ (t << 2); + t = (x0 ^ (x0 >> 1)) & 0x22222222UL; x0 = x0 ^ t ^ (t << 1); + t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL; x1 = x1 ^ t ^ (t << 8); + t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL; x1 = x1 ^ t ^ (t << 4); + t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2); + t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1); + + hd->u.state32bi[i * 2 + 0] = le_bswap32(x0); + hd->u.state32bi[i * 2 + 1] = le_bswap32(x1); + } - /* Absorb input block. */ - for (i = 0; i < bsize / 8; i++) - state[i] ^= buf_get_le64(data + i * 8); + return 0; +} - return keccak_f1600_state_permute(hd) + 4 * sizeof(void *); +static inline void +keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1) +{ + u32 t; + + t = (x0 ^ (x0 >> 1)) & 0x22222222UL; x0 = x0 ^ t ^ (t << 1); + t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0CUL; x0 = x0 ^ t ^ (t << 2); + t = (x0 ^ (x0 >> 4)) & 0x00F000F0UL; x0 = x0 ^ t ^ (t << 4); + t = (x0 ^ (x0 >> 8)) & 0x0000FF00UL; x0 = x0 ^ t ^ (t << 8); + t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1); + t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2); + t = (x1 ^ (x1 >> 4)) & 0x00F000F0UL; x1 = x1 ^ t ^ (t << 4); + t = (x1 ^ (x1 >> 8)) & 0x0000FF00UL; x1 = x1 ^ t ^ (t << 8); + lane[0] ^= (x0 & 0x0000FFFFUL) + (x1 << 16); + lane[1] ^= (x0 >> 16) + (x1 & 0xFFFF0000UL); } +#endif /* NEED_COMMON32BI */ + + +/* Construct generic 64-bit implementation. 
*/ +#ifdef USE_64BIT + +# define ANDN64(x, y) (~(x) & (y)) +# define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \ + ((x) >> ((64 - (unsigned int)(n)) & 63))) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64 +# include "keccak_permute_64.h" + +# undef ANDN64 +# undef ROL64 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME static unsigned int -transform (void *context, const unsigned char *data, size_t nblks) +keccak_absorb_lanes64(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) { - KECCAK_CONTEXT *ctx = context; - const size_t bsize = ctx->bctx.blocksize; - unsigned int burn; + unsigned int burn = 0; + + while (nlanes) + { + hd->u.state64[pos] ^= buf_get_le64(lanes); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute64(hd); + pos = 0; + } + } + + return burn; +} + +static const keccak_ops_t keccak_generic64_ops = +{ + .permute = keccak_f1600_state_permute64, + .absorb = keccak_absorb_lanes64, + .extract_inplace = keccak_extract_inplace64, +}; + +#endif /* USE_64BIT */ + + +/* Construct 64-bit Intel SHLD implementation. 
*/ +#ifdef USE_64BIT_SHLD + +# define ANDN64(x, y) (~(x) & (y)) +# define ROL64(x, n) ({ \ + u64 tmp = (x); \ + asm ("shldq %1, %0, %0" \ + : "+r" (tmp) \ + : "J" ((n) & 63) \ + : "cc"); \ + tmp; }) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld +# include "keccak_permute_64.h" + +# undef ANDN64 +# undef ROL64 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME + +static unsigned int +keccak_absorb_lanes64_shld(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) +{ + unsigned int burn = 0; + + while (nlanes) + { + hd->u.state64[pos] ^= buf_get_le64(lanes); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute64_shld(hd); + pos = 0; + } + } + + return burn; +} + +static const keccak_ops_t keccak_shld_64_ops = +{ + .permute = keccak_f1600_state_permute64_shld, + .absorb = keccak_absorb_lanes64_shld, + .extract_inplace = keccak_extract_inplace64, +}; + +#endif /* USE_64BIT_SHLD */ + + +/* Construct 64-bit Intel BMI2 implementation. 
*/ +#ifdef USE_64BIT_BMI2 + +# define ANDN64(x, y) ({ \ + u64 tmp; \ + asm ("andnq %2, %1, %0" \ + : "=r" (tmp) \ + : "r0" (x), "rm" (y)); \ + tmp; }) + +# define ROL64(x, n) ({ \ + u64 tmp; \ + asm ("rorxq %2, %1, %0" \ + : "=r" (tmp) \ + : "rm0" (x), "J" (64 - ((n) & 63))); \ + tmp; }) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2 +# include "keccak_permute_64.h" + +# undef ANDN64 +# undef ROL64 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME + +static unsigned int +keccak_absorb_lanes64_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) +{ + unsigned int burn = 0; + + while (nlanes) + { + hd->u.state64[pos] ^= buf_get_le64(lanes); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute64_bmi2(hd); + pos = 0; + } + } + + return burn; +} + +static const keccak_ops_t keccak_bmi2_64_ops = +{ + .permute = keccak_f1600_state_permute64_bmi2, + .absorb = keccak_absorb_lanes64_bmi2, + .extract_inplace = keccak_extract_inplace64, +}; + +#endif /* USE_64BIT_BMI2 */ + + +/* Construct generic 32-bit implementation. */ +#ifdef USE_32BIT + +# define ANDN32(x, y) (~(x) & (y)) +# define ROL32(x, n) (((x) << ((unsigned int)n & 31)) | \ + ((x) >> ((32 - (unsigned int)(n)) & 31))) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute32bi +# include "keccak_permute_32.h" + +# undef ANDN32 +# undef ROL32 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME + +static unsigned int +keccak_absorb_lanes32bi(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) +{ + unsigned int burn = 0; - /* Absorb full blocks. 
*/ - do + while (nlanes) { - burn = transform_blk (context, data); - data += bsize; + keccak_absorb_lane32bi(&hd->u.state32bi[pos * 2], + buf_get_le32(lanes + 0), + buf_get_le32(lanes + 4)); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute32bi(hd); + pos = 0; + } } - while (--nblks); return burn; } +static const keccak_ops_t keccak_generic32bi_ops = +{ + .permute = keccak_f1600_state_permute32bi, + .absorb = keccak_absorb_lanes32bi, + .extract_inplace = keccak_extract_inplace32bi, +}; + +#endif /* USE_32BIT */ + + +/* Construct 32-bit Intel BMI2 implementation. */ +#ifdef USE_32BIT_BMI2 + +# define ANDN32(x, y) ({ \ + u32 tmp; \ + asm ("andnl %2, %1, %0" \ + : "=r" (tmp) \ + : "r0" (x), "rm" (y)); \ + tmp; }) + +# define ROL32(x, n) ({ \ + u32 tmp; \ + asm ("rorxl %2, %1, %0" \ + : "=r" (tmp) \ + : "rm0" (x), "J" (32 - ((n) & 31))); \ + tmp; }) + +# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute32bi_bmi2 +# include "keccak_permute_32.h" + +# undef ANDN32 +# undef ROL32 +# undef KECCAK_F1600_PERMUTE_FUNC_NAME + +static inline u32 pext(u32 x, u32 mask) +{ + u32 tmp; + asm ("pextl %2, %1, %0" : "=r" (tmp) : "r0" (x), "rm" (mask)); + return tmp; +} + +static inline u32 pdep(u32 x, u32 mask) +{ + u32 tmp; + asm ("pdepl %2, %1, %0" : "=r" (tmp) : "r0" (x), "rm" (mask)); + return tmp; +} + +static inline void +keccak_absorb_lane32bi_bmi2(u32 *lane, u32 x0, u32 x1) +{ + x0 = pdep(pext(x0, 0x55555555), 0x0000ffff) | (pext(x0, 0xaaaaaaaa) << 16); + x1 = pdep(pext(x1, 0x55555555), 0x0000ffff) | (pext(x1, 0xaaaaaaaa) << 16); + + lane[0] ^= (x0 & 0x0000FFFFUL) + (x1 << 16); + lane[1] ^= (x0 >> 16) + (x1 & 0xFFFF0000UL); +} + +static unsigned int +keccak_absorb_lanes32bi_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes, + unsigned int nlanes, int blocklanes) +{ + unsigned int burn = 0; + + while (nlanes) + { + keccak_absorb_lane32bi_bmi2(&hd->u.state32bi[pos * 2], + buf_get_le32(lanes + 0), + buf_get_le32(lanes + 
4)); + lanes += 8; + nlanes--; + + if (++pos == blocklanes) + { + burn = keccak_f1600_state_permute32bi_bmi2(hd); + pos = 0; + } + } + + return burn; +} + +static unsigned int +keccak_extract_inplace32bi_bmi2(KECCAK_STATE *hd, unsigned int outlen) +{ + unsigned int i; + u32 x0; + u32 x1; + u32 t; + + for (i = 0; i < outlen / 8 + !!(outlen % 8); i++) + { + x0 = hd->u.state32bi[i * 2 + 0]; + x1 = hd->u.state32bi[i * 2 + 1]; + + t = (x0 & 0x0000FFFFUL) + (x1 << 16); + x1 = (x0 >> 16) + (x1 & 0xFFFF0000UL); + x0 = t; + + x0 = pdep(pext(x0, 0xffff0001), 0xaaaaaaab) | pdep(x0 >> 1, 0x55555554); + x1 = pdep(pext(x1, 0xffff0001), 0xaaaaaaab) | pdep(x1 >> 1, 0x55555554); + + hd->u.state32bi[i * 2 + 0] = le_bswap32(x0); + hd->u.state32bi[i * 2 + 1] = le_bswap32(x1); + } + + return 0; +} + +static const keccak_ops_t keccak_bmi2_32bi_ops = +{ + .permute = keccak_f1600_state_permute32bi_bmi2, + .absorb = keccak_absorb_lanes32bi_bmi2, + .extract_inplace = keccak_extract_inplace32bi_bmi2, +}; + +#endif /* USE_32BIT */ + + +static void +keccak_write (void *context, const void *inbuf_arg, size_t inlen) +{ + KECCAK_CONTEXT *ctx = context; + const size_t bsize = ctx->blocksize; + const size_t blocklanes = bsize / 8; + const byte *inbuf = inbuf_arg; + unsigned int nburn, burn = 0; + unsigned int count, i; + unsigned int pos, nlanes; + + count = ctx->count; + + if (inlen && (count % 8)) + { + byte lane[8] = { 0, }; + + /* Complete absorbing partial input lane. */ + + pos = count / 8; + + for (i = count % 8; inlen && i < 8; i++) + { + lane[i] = *inbuf++; + inlen--; + count++; + } + + if (count == bsize) + count = 0; + + nburn = ctx->ops->absorb(&ctx->state, pos, lane, 1, + (count % 8) ? -1 : blocklanes); + burn = nburn > burn ? nburn : burn; + } + + /* Absorb full input lanes. */ + + pos = count / 8; + nlanes = inlen / 8; + if (nlanes > 0) + { + nburn = ctx->ops->absorb(&ctx->state, pos, inbuf, nlanes, blocklanes); + burn = nburn > burn ? 
nburn : burn; + inlen -= nlanes * 8; + inbuf += nlanes * 8; + count += nlanes * 8; + count = count % bsize; + } + + if (inlen) + { + byte lane[8] = { 0, }; + + /* Absorb remaining partial input lane. */ + + pos = count / 8; + + for (i = count % 8; inlen && i < 8; i++) + { + lane[i] = *inbuf++; + inlen--; + count++; + } + + nburn = ctx->ops->absorb(&ctx->state, pos, lane, 1, -1); + burn = nburn > burn ? nburn : burn; + + gcry_assert(count < bsize); + } + + ctx->count = count; + + if (burn) + _gcry_burn_stack (burn); +} + static void keccak_init (int algo, void *context, unsigned int flags) @@ -267,29 +609,48 @@ keccak_init (int algo, void *context, unsigned int flags) memset (hd, 0, sizeof *hd); - ctx->bctx.nblocks = 0; - ctx->bctx.nblocks_high = 0; - ctx->bctx.count = 0; - ctx->bctx.bwrite = transform; + ctx->count = 0; + + /* Select generic implementation. */ +#ifdef USE_64BIT + ctx->ops = &keccak_generic64_ops; +#elif defined USE_32BIT + ctx->ops = &keccak_generic32bi_ops; +#endif + + /* Select optimized implementation based in hw features. */ + if (0) {} +#ifdef USE_64BIT_BMI2 + else if (features & HWF_INTEL_BMI2) + ctx->ops = &keccak_bmi2_64_ops; +#endif +#ifdef USE_32BIT_BMI2 + else if (features & HWF_INTEL_BMI2) + ctx->ops = &keccak_bmi2_32bi_ops; +#endif +#ifdef USE_64BIT_SHLD + else if (features & HWF_INTEL_FAST_SHLD) + ctx->ops = &keccak_shld_64_ops; +#endif /* Set input block size, in Keccak terms this is called 'rate'. 
*/ switch (algo) { case GCRY_MD_SHA3_224: - ctx->bctx.blocksize = 1152 / 8; + ctx->blocksize = 1152 / 8; ctx->outlen = 224 / 8; break; case GCRY_MD_SHA3_256: - ctx->bctx.blocksize = 1088 / 8; + ctx->blocksize = 1088 / 8; ctx->outlen = 256 / 8; break; case GCRY_MD_SHA3_384: - ctx->bctx.blocksize = 832 / 8; + ctx->blocksize = 832 / 8; ctx->outlen = 384 / 8; break; case GCRY_MD_SHA3_512: - ctx->bctx.blocksize = 576 / 8; + ctx->blocksize = 576 / 8; ctx->outlen = 512 / 8; break; default: @@ -334,59 +695,37 @@ keccak_final (void *context) { KECCAK_CONTEXT *ctx = context; KECCAK_STATE *hd = &ctx->state; - const size_t bsize = ctx->bctx.blocksize; + const size_t bsize = ctx->blocksize; const byte suffix = SHA3_DELIMITED_SUFFIX; - u64 *state = (u64 *)hd->state; - unsigned int stack_burn_depth; + unsigned int nburn, burn = 0; unsigned int lastbytes; - unsigned int i; - byte *buf; + byte lane[8]; - _gcry_md_block_write (context, NULL, 0); /* flush */ - - buf = ctx->bctx.buf; - lastbytes = ctx->bctx.count; - - /* Absorb remaining bytes. */ - for (i = 0; i < lastbytes / 8; i++) - { - state[i] ^= buf_get_le64(buf); - buf += 8; - } - - for (i = 0; i < lastbytes % 8; i++) - { - state[lastbytes / 8] ^= (u64)*buf << (i * 8); - buf++; - } + lastbytes = ctx->count; /* Do the padding and switch to the squeezing phase */ /* Absorb the last few bits and add the first bit of padding (which coincides with the delimiter in delimited suffix) */ - state[lastbytes / 8] ^= (u64)suffix << ((lastbytes % 8) * 8); + buf_put_le64(lane, (u64)suffix << ((lastbytes % 8) * 8)); + nburn = ctx->ops->absorb(&ctx->state, lastbytes / 8, lane, 1, -1); + burn = nburn > burn ? nburn : burn; /* Add the second bit of padding. */ - state[(bsize - 1) / 8] ^= (u64)0x80 << (((bsize - 1) % 8) * 8); + buf_put_le64(lane, (u64)0x80 << (((bsize - 1) % 8) * 8)); + nburn = ctx->ops->absorb(&ctx->state, (bsize - 1) / 8, lane, 1, -1); + burn = nburn > burn ? nburn : burn; /* Switch to the squeezing phase. 
*/ - stack_burn_depth = keccak_f1600_state_permute(hd); + nburn = ctx->ops->permute(hd); + burn = nburn > burn ? nburn : burn; /* Squeeze out all the output blocks */ if (ctx->outlen < bsize) { /* Output SHA3 digest. */ - buf = ctx->bctx.buf; - for (i = 0; i < ctx->outlen / 8; i++) - { - buf_put_le64(buf, state[i]); - buf += 8; - } - for (i = 0; i < ctx->outlen % 8; i++) - { - *buf = state[ctx->outlen / 8] >> (i * 8); - buf++; - } + nburn = ctx->ops->extract_inplace(hd, ctx->outlen); + burn = nburn > burn ? nburn : burn; } else { @@ -394,15 +733,18 @@ keccak_final (void *context) BUG(); } - _gcry_burn_stack (stack_burn_depth); + wipememory(lane, sizeof(lane)); + if (burn) + _gcry_burn_stack (burn); } static byte * keccak_read (void *context) { - KECCAK_CONTEXT *hd = (KECCAK_CONTEXT *) context; - return hd->bctx.buf; + KECCAK_CONTEXT *ctx = (KECCAK_CONTEXT *) context; + KECCAK_STATE *hd = &ctx->state; + return (byte *)&hd->u; } @@ -585,7 +927,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_224 = { GCRY_MD_SHA3_224, {0, 1}, "SHA3-224", sha3_224_asn, DIM (sha3_224_asn), oid_spec_sha3_224, 28, - sha3_224_init, _gcry_md_block_write, keccak_final, keccak_read, + sha3_224_init, keccak_write, keccak_final, keccak_read, sizeof (KECCAK_CONTEXT), run_selftests }; @@ -593,7 +935,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_256 = { GCRY_MD_SHA3_256, {0, 1}, "SHA3-256", sha3_256_asn, DIM (sha3_256_asn), oid_spec_sha3_256, 32, - sha3_256_init, _gcry_md_block_write, keccak_final, keccak_read, + sha3_256_init, keccak_write, keccak_final, keccak_read, sizeof (KECCAK_CONTEXT), run_selftests }; @@ -601,7 +943,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_384 = { GCRY_MD_SHA3_384, {0, 1}, "SHA3-384", sha3_384_asn, DIM (sha3_384_asn), oid_spec_sha3_384, 48, - sha3_384_init, _gcry_md_block_write, keccak_final, keccak_read, + sha3_384_init, keccak_write, keccak_final, keccak_read, sizeof (KECCAK_CONTEXT), run_selftests }; @@ -609,7 +951,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_512 = { 
GCRY_MD_SHA3_512, {0, 1}, "SHA3-512", sha3_512_asn, DIM (sha3_512_asn), oid_spec_sha3_512, 64, - sha3_512_init, _gcry_md_block_write, keccak_final, keccak_read, + sha3_512_init, keccak_write, keccak_final, keccak_read, sizeof (KECCAK_CONTEXT), run_selftests }; |