diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-31 21:29:56 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-11-01 21:47:06 +0200 |
commit | 2857cb89c6dc1c02266600bc1fd2967a3cd5cf88 (patch) | |
tree | 1d5ca6d7135264461757cfdd90b051fc98f5f0ed /cipher/keccak_permute_64.h | |
parent | 07e4839e75a7bca3a6c0a94aecfe75efe61d7ff2 (diff) | |
download | libgcrypt-2857cb89c6dc1c02266600bc1fd2967a3cd5cf88.tar.gz |
Optimize Keccak 64-bit absorb functions
* cipher/keccak.c [USE_64BIT] [__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
* cipher/keccak.c [USE_64BIT] [!__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
[USE_64BIT] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT] (keccak_absorb_lanes64): Remove.
[USE_64BIT_SHLD] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_SHLD] (keccak_absorb_lanes64_shld): Remove.
[USE_64BIT_BMI2] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_BMI2] (keccak_absorb_lanes64_bmi2): Remove.
* cipher/keccak_permute_64.h (KECCAK_F1600_ABSORB_FUNC_NAME): New.
--
Optimize the 64-bit absorb functions for a small speed-up. After this
change, the 64-bit BMI2 implementation matches the speed of the fastest
results from SUPERCOP for Intel Haswell CPUs (long messages).
Benchmark on Intel Haswell @ 3.2 GHz:
Before:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.32 ns/B 411.7 MiB/s 7.41 c/B
SHAKE256 | 2.84 ns/B 336.2 MiB/s 9.08 c/B
SHA3-224 | 2.69 ns/B 354.9 MiB/s 8.60 c/B
SHA3-256 | 2.84 ns/B 336.0 MiB/s 9.08 c/B
SHA3-384 | 3.69 ns/B 258.4 MiB/s 11.81 c/B
SHA3-512 | 5.30 ns/B 179.9 MiB/s 16.97 c/B
After:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.27 ns/B 420.6 MiB/s 7.26 c/B
SHAKE256 | 2.79 ns/B 341.4 MiB/s 8.94 c/B
SHA3-224 | 2.64 ns/B 361.7 MiB/s 8.44 c/B
SHA3-256 | 2.79 ns/B 341.5 MiB/s 8.94 c/B
SHA3-384 | 3.65 ns/B 261.4 MiB/s 11.68 c/B
SHA3-512 | 5.27 ns/B 181.0 MiB/s 16.87 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/keccak_permute_64.h')
-rw-r--r-- | cipher/keccak_permute_64.h | 99 |
1 files changed, 99 insertions, 0 deletions
```diff
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 1264f195..6f24217d 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -288,3 +288,102 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 
   return sizeof(void *) * 4 + sizeof(u64) * 12 * 5;
 }
+
+static unsigned int
+KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
+                              unsigned int nlanes, int blocklanes)
+{
+  unsigned int burn = 0;
+
+  while (nlanes)
+    {
+      switch (blocklanes)
+        {
+        case 21:
+          /* SHAKE128 */
+          while (pos == 0 && nlanes >= 21)
+            {
+              absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+              absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+              absorb_lanes64_8(&hd->u.state64[12], lanes + 8 * 12);
+              absorb_lanes64_1(&hd->u.state64[20], lanes + 8 * 20);
+              lanes += 8 * 21;
+              nlanes -= 21;
+
+              burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+            }
+          break;
+
+        case 18:
+          /* SHA3-224 */
+          while (pos == 0 && nlanes >= 18)
+            {
+              absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+              absorb_lanes64_2(&hd->u.state64[8], lanes + 8 * 8);
+              absorb_lanes64_8(&hd->u.state64[10], lanes + 8 * 10);
+              lanes += 8 * 18;
+              nlanes -= 18;
+
+              burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+            }
+          break;
+
+        case 17:
+          /* SHA3-256 & SHAKE256 */
+          while (pos == 0 && nlanes >= 17)
+            {
+              absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+              absorb_lanes64_8(&hd->u.state64[8], lanes + 8 * 8);
+              absorb_lanes64_1(&hd->u.state64[16], lanes + 8 * 16);
+              lanes += 8 * 17;
+              nlanes -= 17;
+
+              burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+            }
+          break;
+
+        case 13:
+          /* SHA3-384 */
+          while (pos == 0 && nlanes >= 13)
+            {
+              absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+              absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+              absorb_lanes64_1(&hd->u.state64[12], lanes + 8 * 12);
+              lanes += 8 * 13;
+              nlanes -= 13;
+
+              burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+            }
+          break;
+
+        case 9:
+          /* SHA3-512 */
+          while (pos == 0 && nlanes >= 9)
+            {
+              absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+              absorb_lanes64_1(&hd->u.state64[8], lanes + 8 * 8);
+              lanes += 8 * 9;
+              nlanes -= 9;
+
+              burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+            }
+          break;
+        }
+
+      while (nlanes)
+        {
+          hd->u.state64[pos] ^= buf_get_le64(lanes);
+          lanes += 8;
+          nlanes--;
+
+          if (++pos == blocklanes)
+            {
+              burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+              pos = 0;
+              break;
+            }
+        }
+    }
+
+  return burn;
+}
```