diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-11-18 09:44:18 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-11-18 09:44:18 +0200 |
commit | 6571a64331839d7d952292163afbf34c8bef62e0 (patch) | |
tree | dd3931f6151152724461e571493fced8cc06cd1c /cipher | |
parent | 15ea0acf8bb0aa307eccc23024a0bd7878fb8080 (diff) | |
download | libgcrypt-6571a64331839d7d952292163afbf34c8bef62e0.tar.gz |
Tweak Keccak for small speed-up
* cipher/keccak_permute_32.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Track
rounds with round constant pointer instead of separate round counter.
* cipher/keccak_permute_64.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Ditto.
(KECCAK_F1600_ABSORB_FUNC_NAME): Tweak lanes pointer increment for bulk
absorb loops.
--
This patch makes small tweaks to improve performance.
Benchmark on Intel Haswell @ 3.2 GHz:
Before:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.27 ns/B 420.5 MiB/s 7.26 c/B
SHAKE256 | 2.79 ns/B 341.4 MiB/s 8.94 c/B
SHA3-224 | 2.64 ns/B 361.7 MiB/s 8.44 c/B
SHA3-256 | 2.79 ns/B 341.4 MiB/s 8.94 c/B
SHA3-384 | 3.65 ns/B 261.3 MiB/s 11.68 c/B
SHA3-512 | 5.27 ns/B 181.0 MiB/s 16.86 c/B
After:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.25 ns/B 423.5 MiB/s 7.21 c/B
SHAKE256 | 2.77 ns/B 343.9 MiB/s 8.88 c/B
SHA3-224 | 2.62 ns/B 364.1 MiB/s 8.38 c/B
SHA3-256 | 2.77 ns/B 343.8 MiB/s 8.88 c/B
SHA3-384 | 3.63 ns/B 262.6 MiB/s 11.63 c/B
SHA3-512 | 5.23 ns/B 182.3 MiB/s 16.75 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher')
-rw-r--r-- | cipher/keccak_permute_32.h | 13 | ||||
-rw-r--r-- | cipher/keccak_permute_64.h | 44 |
2 files changed, 27 insertions, 30 deletions
diff --git a/cipher/keccak_permute_32.h b/cipher/keccak_permute_32.h index fed93831..1ce42a42 100644 --- a/cipher/keccak_permute_32.h +++ b/cipher/keccak_permute_32.h @@ -27,6 +27,7 @@ static unsigned int KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) { const u32 *round_consts = round_consts_32bit; + const u32 *round_consts_end = round_consts_32bit + 2 * 24; u32 Aba0, Abe0, Abi0, Abo0, Abu0; u32 Aba1, Abe1, Abi1, Abo1, Abu1; u32 Aga0, Age0, Agi0, Ago0, Agu0; @@ -52,7 +53,6 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) u32 Esa0, Ese0, Esi0, Eso0, Esu0; u32 Esa1, Ese1, Esi1, Eso1, Esu1; u32 *state = hd->u.state32bi; - unsigned int round; Aba0 = state[0]; Aba1 = state[1]; @@ -105,7 +105,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Asu0 = state[48]; Asu1 = state[49]; - for (round = 0; round < 24; round += 2) + do { /* prepareTheta */ BCa0 = Aba0 ^ Aga0 ^ Aka0 ^ Ama0 ^ Asa0; @@ -142,7 +142,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Asu0 ^= Du0; BCu0 = ROL32(Asu0, 7); Eba0 = BCa0 ^ ANDN32(BCe0, BCi0); - Eba0 ^= round_consts[round * 2 + 0]; + Eba0 ^= *(round_consts++); Ebe0 = BCe0 ^ ANDN32(BCi0, BCo0); Ebi0 = BCi0 ^ ANDN32(BCo0, BCu0); Ebo0 = BCo0 ^ ANDN32(BCu0, BCa0); @@ -159,7 +159,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Asu1 ^= Du1; BCu1 = ROL32(Asu1, 7); Eba1 = BCa1 ^ ANDN32(BCe1, BCi1); - Eba1 ^= round_consts[round * 2 + 1]; + Eba1 ^= *(round_consts++); Ebe1 = BCe1 ^ ANDN32(BCi1, BCo1); Ebi1 = BCi1 ^ ANDN32(BCo1, BCu1); Ebo1 = BCo1 ^ ANDN32(BCu1, BCa1); @@ -328,7 +328,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Esu0 ^= Du0; BCu0 = ROL32(Esu0, 7); Aba0 = BCa0 ^ ANDN32(BCe0, BCi0); - Aba0 ^= round_consts[round * 2 + 2]; + Aba0 ^= *(round_consts++); Abe0 = BCe0 ^ ANDN32(BCi0, BCo0); Abi0 = BCi0 ^ ANDN32(BCo0, BCu0); Abo0 = BCo0 ^ ANDN32(BCu0, BCa0); @@ -345,7 +345,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Esu1 ^= Du1; BCu1 = ROL32(Esu1, 7); Aba1 = BCa1 ^ ANDN32(BCe1, BCi1); - Aba1 ^= round_consts[round * 2 
+ 3]; + Aba1 ^= *(round_consts++); Abe1 = BCe1 ^ ANDN32(BCi1, BCo1); Abi1 = BCi1 ^ ANDN32(BCo1, BCu1); Abo1 = BCo1 ^ ANDN32(BCu1, BCa1); @@ -479,6 +479,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Aso1 = BCo1 ^ ANDN32(BCu1, BCa1); Asu1 = BCu1 ^ ANDN32(BCa1, BCe1); } + while (round_consts < round_consts_end); state[0] = Aba0; state[1] = Aba1; diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h index 1a80192c..b28c871e 100644 --- a/cipher/keccak_permute_64.h +++ b/cipher/keccak_permute_64.h @@ -26,6 +26,7 @@ static unsigned int KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) { const u64 *round_consts = _gcry_keccak_round_consts_64bit; + const u64 *round_consts_end = _gcry_keccak_round_consts_64bit + 24; u64 Aba, Abe, Abi, Abo, Abu; u64 Aga, Age, Agi, Ago, Agu; u64 Aka, Ake, Aki, Ako, Aku; @@ -39,7 +40,6 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) u64 Ema, Eme, Emi, Emo, Emu; u64 Esa, Ese, Esi, Eso, Esu; u64 *state = hd->u.state64; - unsigned int round; Aba = state[0]; Abe = state[1]; @@ -67,7 +67,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Aso = state[23]; Asu = state[24]; - for (round = 0; round < 24; round += 2) + do { /* prepareTheta */ BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa; @@ -94,7 +94,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Asu ^= Du; BCu = ROL64(Asu, 14); Eba = BCa ^ ANDN64(BCe, BCi); - Eba ^= (u64)round_consts[round]; + Eba ^= *(round_consts++); Ebe = BCe ^ ANDN64(BCi, BCo); Ebi = BCi ^ ANDN64(BCo, BCu); Ebo = BCo ^ ANDN64(BCu, BCa); @@ -189,7 +189,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Esu ^= Du; BCu = ROL64(Esu, 14); Aba = BCa ^ ANDN64(BCe, BCi); - Aba ^= (u64)round_consts[round + 1]; + Aba ^= *(round_consts++); Abe = BCe ^ ANDN64(BCi, BCo); Abi = BCi ^ ANDN64(BCo, BCu); Abo = BCo ^ ANDN64(BCu, BCa); @@ -259,6 +259,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd) Aso = BCo ^ ANDN64(BCu, BCa); Asu = BCu ^ ANDN64(BCa, BCe); } + while (round_consts < round_consts_end); state[0] = Aba; 
state[1] = Abe; @@ -303,12 +304,11 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes, /* SHAKE128 */ while (pos == 0 && nlanes >= 21) { - absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0); - absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8); - absorb_lanes64_8(&hd->u.state64[12], lanes + 8 * 12); - absorb_lanes64_1(&hd->u.state64[20], lanes + 8 * 20); - lanes += 8 * 21; nlanes -= 21; + absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; + absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8; + absorb_lanes64_4(&hd->u.state64[16], lanes); lanes += 8 * 4; + absorb_lanes64_1(&hd->u.state64[20], lanes); lanes += 8 * 1; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } @@ -318,11 +318,10 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes, /* SHA3-224 */ while (pos == 0 && nlanes >= 18) { - absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0); - absorb_lanes64_2(&hd->u.state64[8], lanes + 8 * 8); - absorb_lanes64_8(&hd->u.state64[10], lanes + 8 * 10); - lanes += 8 * 18; nlanes -= 18; + absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; + absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8; + absorb_lanes64_2(&hd->u.state64[16], lanes); lanes += 8 * 2; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } @@ -332,11 +331,10 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes, /* SHA3-256 & SHAKE256 */ while (pos == 0 && nlanes >= 17) { - absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0); - absorb_lanes64_8(&hd->u.state64[8], lanes + 8 * 8); - absorb_lanes64_1(&hd->u.state64[16], lanes + 8 * 16); - lanes += 8 * 17; nlanes -= 17; + absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; + absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8; + absorb_lanes64_1(&hd->u.state64[16], lanes); lanes += 8 * 1; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } @@ -346,11 +344,10 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes, /* SHA3-384 */ while 
(pos == 0 && nlanes >= 13) { - absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0); - absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8); - absorb_lanes64_1(&hd->u.state64[12], lanes + 8 * 12); - lanes += 8 * 13; nlanes -= 13; + absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; + absorb_lanes64_4(&hd->u.state64[8], lanes); lanes += 8 * 4; + absorb_lanes64_1(&hd->u.state64[12], lanes); lanes += 8 * 1; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } @@ -360,10 +357,9 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes, /* SHA3-512 */ while (pos == 0 && nlanes >= 9) { - absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0); - absorb_lanes64_1(&hd->u.state64[8], lanes + 8 * 8); - lanes += 8 * 9; nlanes -= 9; + absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8; + absorb_lanes64_1(&hd->u.state64[8], lanes); lanes += 8 * 1; burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd); } |