summaryrefslogtreecommitdiff
path: root/cipher/keccak_permute_32.h
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2015-10-23 22:30:48 +0300
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2015-10-28 20:08:56 +0200
commit74184c28fbe7ff58cf57f0094ef957d94045da7d (patch)
tree3d1c33a07e749439aa010abbdfbdd6fff0364356 /cipher/keccak_permute_32.h
parent909644ef5883927262366c356eed530e55aba478 (diff)
downloadlibgcrypt-74184c28fbe7ff58cf57f0094ef957d94045da7d.tar.gz
keccak: rewrite for improved performance
* cipher/Makefile.am: Add 'keccak_permute_32.h' and 'keccak_permute_64.h'. * cipher/hash-common.h [USE_SHA3] (MD_BLOCK_MAX_BLOCKSIZE): Remove. * cipher/keccak.c (USE_64BIT, USE_32BIT, USE_64BIT_BMI2) (USE_64BIT_SHLD, USE_32BIT_BMI2, NEED_COMMON64, NEED_COMMON32BI) (keccak_ops_t): New. (KECCAK_STATE): Add 'state64' and 'state32bi' members. (KECCAK_CONTEXT): Remove 'bctx'; add 'blocksize', 'count' and 'ops'. (rol64, keccak_f1600_state_permute): Remove. [NEED_COMMON64] (round_consts_64bit, keccak_extract_inplace64): New. [NEED_COMMON32BI] (round_consts_32bit, keccak_extract_inplace32bi) (keccak_absorb_lane32bi): New. [USE_64BIT] (ANDN64, ROL64, keccak_f1600_state_permute64) (keccak_absorb_lanes64, keccak_generic64_ops): New. [USE_64BIT_SHLD] (ANDN64, ROL64, keccak_f1600_state_permute64_shld) (keccak_absorb_lanes64_shld, keccak_shld_64_ops): New. [USE_64BIT_BMI2] (ANDN64, ROL64, keccak_f1600_state_permute64_bmi2) (keccak_absorb_lanes64_bmi2, keccak_bmi2_64_ops): New. [USE_32BIT] (ANDN64, ROL64, keccak_f1600_state_permute32bi) (keccak_absorb_lanes32bi, keccak_generic32bi_ops): New. [USE_32BIT_BMI2] (ANDN64, ROL64, keccak_f1600_state_permute32bi_bmi2) (pext, pdep, keccak_absorb_lane32bi_bmi2, keccak_absorb_lanes32bi_bmi2) (keccak_extract_inplace32bi_bmi2, keccak_bmi2_32bi_ops): New. (keccak_write): New. (keccak_init): Adjust to KECCAK_CONTEXT changes; add implementation selection based on HWF features. (keccak_final): Adjust to KECCAK_CONTEXT changes; use selected 'ops' for state manipulation. (keccak_read): Adjust to KECCAK_CONTEXT changes. (_gcry_digest_spec_sha3_224, _gcry_digest_spec_sha3_256) (_gcry_digest_spec_sha3_348, _gcry_digest_spec_sha3_512): Use 'keccak_write' instead of '_gcry_md_block_write'. * cipher/keccak_permute_32.h: New. * cipher/keccak_permute_64.h: New. -- Patch adds new generic 64-bit and 32-bit implementations and optimized implementations for SHA3: - Generic 64-bit implementation based on 'simple' implementation from SUPERCOP package. - Generic 32-bit bit-inteleaved implementataion based on 'simple32bi' implementation from SUPERCOP package. - Intel BMI2 optimized variants of 64-bit and 32-bit BI implementations. - Intel SHLD optimized variant of 64-bit implementation. Patch also makes proper use of sponge construction to avoid use of addition input buffer. Below are bench-slope benchmarks for new 64-bit implementations made on Intel Core i5-4570 (no turbo, 3.2 Ghz, gcc-4.9.2). Before (amd64): SHA3-224 | 3.92 ns/B 243.2 MiB/s 12.55 c/B SHA3-256 | 4.15 ns/B 230.0 MiB/s 13.27 c/B SHA3-384 | 5.40 ns/B 176.6 MiB/s 17.29 c/B SHA3-512 | 7.77 ns/B 122.7 MiB/s 24.87 c/B After (generic 64-bit, amd64), 1.10x faster): SHA3-224 | 3.57 ns/B 267.4 MiB/s 11.42 c/B SHA3-256 | 3.77 ns/B 252.8 MiB/s 12.07 c/B SHA3-384 | 4.91 ns/B 194.1 MiB/s 15.72 c/B SHA3-512 | 7.06 ns/B 135.0 MiB/s 22.61 c/B After (Intel SHLD 64-bit, amd64, 1.13x faster): SHA3-224 | 3.48 ns/B 273.7 MiB/s 11.15 c/B SHA3-256 | 3.68 ns/B 258.9 MiB/s 11.79 c/B SHA3-384 | 4.80 ns/B 198.7 MiB/s 15.36 c/B SHA3-512 | 6.89 ns/B 138.4 MiB/s 22.05 c/B After (Intel BMI2 64-bit, amd64, 1.45x faster): SHA3-224 | 2.71 ns/B 352.1 MiB/s 8.67 c/B SHA3-256 | 2.86 ns/B 333.2 MiB/s 9.16 c/B SHA3-384 | 3.72 ns/B 256.2 MiB/s 11.91 c/B SHA3-512 | 5.34 ns/B 178.5 MiB/s 17.10 c/B Benchmarks of new 32-bit implementations on Intel Core i5-4570 (no turbo, 3.2 Ghz, gcc-4.9.2): Before (win32): SHA3-224 | 12.05 ns/B 79.16 MiB/s 38.56 c/B SHA3-256 | 12.75 ns/B 74.78 MiB/s 40.82 c/B SHA3-384 | 16.63 ns/B 57.36 MiB/s 53.22 c/B SHA3-512 | 23.97 ns/B 39.79 MiB/s 76.72 c/B After (generic 32-bit BI, win32, 1.23x to 1.29x faster): SHA3-224 | 9.76 ns/B 97.69 MiB/s 31.25 c/B SHA3-256 | 10.27 ns/B 92.82 MiB/s 32.89 c/B SHA3-384 | 13.22 ns/B 72.16 MiB/s 42.31 c/B SHA3-512 | 18.65 ns/B 51.13 MiB/s 59.70 c/B After (Intel BMI2 32-bit BI, win32, 1.66x to 1.70x faster): SHA3-224 | 7.26 ns/B 131.4 MiB/s 23.23 c/B SHA3-256 | 7.65 ns/B 124.7 MiB/s 24.47 c/B SHA3-384 | 9.87 ns/B 96.67 MiB/s 31.58 c/B SHA3-512 | 14.05 ns/B 67.85 MiB/s 44.99 c/B Benchmarks of new 32-bit implementation on ARM Cortex-A8 (1008 Mhz, gcc-4.9.1): Before: SHA3-224 | 148.6 ns/B 6.42 MiB/s 149.8 c/B SHA3-256 | 157.2 ns/B 6.07 MiB/s 158.4 c/B SHA3-384 | 205.3 ns/B 4.65 MiB/s 206.9 c/B SHA3-512 | 296.3 ns/B 3.22 MiB/s 298.6 c/B After (1.56x faster): SHA3-224 | 96.12 ns/B 9.92 MiB/s 96.89 c/B SHA3-256 | 101.5 ns/B 9.40 MiB/s 102.3 c/B SHA3-384 | 131.4 ns/B 7.26 MiB/s 132.5 c/B SHA3-512 | 188.2 ns/B 5.07 MiB/s 189.7 c/B Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/keccak_permute_32.h')
-rw-r--r--cipher/keccak_permute_32.h535
1 files changed, 535 insertions, 0 deletions
diff --git a/cipher/keccak_permute_32.h b/cipher/keccak_permute_32.h
new file mode 100644
index 00000000..fed93831
--- /dev/null
+++ b/cipher/keccak_permute_32.h
@@ -0,0 +1,535 @@
+/* keccak_permute_32.h - Keccak permute function (simple 32bit bit-interleaved)
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser general Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 "keccakc1024/simple32bi/
+ * Keccak-simple32BI.c" implementation by Ronny Van Keer from SUPERCOP toolkit
+ * package.
+ */
+
+/* Function that computes the Keccak-f[1600] permutation on the given state. */
+static unsigned int
+KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
+{
+ const u32 *round_consts = round_consts_32bit;
+ u32 Aba0, Abe0, Abi0, Abo0, Abu0;
+ u32 Aba1, Abe1, Abi1, Abo1, Abu1;
+ u32 Aga0, Age0, Agi0, Ago0, Agu0;
+ u32 Aga1, Age1, Agi1, Ago1, Agu1;
+ u32 Aka0, Ake0, Aki0, Ako0, Aku0;
+ u32 Aka1, Ake1, Aki1, Ako1, Aku1;
+ u32 Ama0, Ame0, Ami0, Amo0, Amu0;
+ u32 Ama1, Ame1, Ami1, Amo1, Amu1;
+ u32 Asa0, Ase0, Asi0, Aso0, Asu0;
+ u32 Asa1, Ase1, Asi1, Aso1, Asu1;
+ u32 BCa0, BCe0, BCi0, BCo0, BCu0;
+ u32 BCa1, BCe1, BCi1, BCo1, BCu1;
+ u32 Da0, De0, Di0, Do0, Du0;
+ u32 Da1, De1, Di1, Do1, Du1;
+ u32 Eba0, Ebe0, Ebi0, Ebo0, Ebu0;
+ u32 Eba1, Ebe1, Ebi1, Ebo1, Ebu1;
+ u32 Ega0, Ege0, Egi0, Ego0, Egu0;
+ u32 Ega1, Ege1, Egi1, Ego1, Egu1;
+ u32 Eka0, Eke0, Eki0, Eko0, Eku0;
+ u32 Eka1, Eke1, Eki1, Eko1, Eku1;
+ u32 Ema0, Eme0, Emi0, Emo0, Emu0;
+ u32 Ema1, Eme1, Emi1, Emo1, Emu1;
+ u32 Esa0, Ese0, Esi0, Eso0, Esu0;
+ u32 Esa1, Ese1, Esi1, Eso1, Esu1;
+ u32 *state = hd->u.state32bi;
+ unsigned int round;
+
+ Aba0 = state[0];
+ Aba1 = state[1];
+ Abe0 = state[2];
+ Abe1 = state[3];
+ Abi0 = state[4];
+ Abi1 = state[5];
+ Abo0 = state[6];
+ Abo1 = state[7];
+ Abu0 = state[8];
+ Abu1 = state[9];
+ Aga0 = state[10];
+ Aga1 = state[11];
+ Age0 = state[12];
+ Age1 = state[13];
+ Agi0 = state[14];
+ Agi1 = state[15];
+ Ago0 = state[16];
+ Ago1 = state[17];
+ Agu0 = state[18];
+ Agu1 = state[19];
+ Aka0 = state[20];
+ Aka1 = state[21];
+ Ake0 = state[22];
+ Ake1 = state[23];
+ Aki0 = state[24];
+ Aki1 = state[25];
+ Ako0 = state[26];
+ Ako1 = state[27];
+ Aku0 = state[28];
+ Aku1 = state[29];
+ Ama0 = state[30];
+ Ama1 = state[31];
+ Ame0 = state[32];
+ Ame1 = state[33];
+ Ami0 = state[34];
+ Ami1 = state[35];
+ Amo0 = state[36];
+ Amo1 = state[37];
+ Amu0 = state[38];
+ Amu1 = state[39];
+ Asa0 = state[40];
+ Asa1 = state[41];
+ Ase0 = state[42];
+ Ase1 = state[43];
+ Asi0 = state[44];
+ Asi1 = state[45];
+ Aso0 = state[46];
+ Aso1 = state[47];
+ Asu0 = state[48];
+ Asu1 = state[49];
+
+ for (round = 0; round < 24; round += 2)
+ {
+ /* prepareTheta */
+ BCa0 = Aba0 ^ Aga0 ^ Aka0 ^ Ama0 ^ Asa0;
+ BCa1 = Aba1 ^ Aga1 ^ Aka1 ^ Ama1 ^ Asa1;
+ BCe0 = Abe0 ^ Age0 ^ Ake0 ^ Ame0 ^ Ase0;
+ BCe1 = Abe1 ^ Age1 ^ Ake1 ^ Ame1 ^ Ase1;
+ BCi0 = Abi0 ^ Agi0 ^ Aki0 ^ Ami0 ^ Asi0;
+ BCi1 = Abi1 ^ Agi1 ^ Aki1 ^ Ami1 ^ Asi1;
+ BCo0 = Abo0 ^ Ago0 ^ Ako0 ^ Amo0 ^ Aso0;
+ BCo1 = Abo1 ^ Ago1 ^ Ako1 ^ Amo1 ^ Aso1;
+ BCu0 = Abu0 ^ Agu0 ^ Aku0 ^ Amu0 ^ Asu0;
+ BCu1 = Abu1 ^ Agu1 ^ Aku1 ^ Amu1 ^ Asu1;
+
+ /* thetaRhoPiChiIota(round , A, E) */
+ Da0 = BCu0 ^ ROL32(BCe1, 1);
+ Da1 = BCu1 ^ BCe0;
+ De0 = BCa0 ^ ROL32(BCi1, 1);
+ De1 = BCa1 ^ BCi0;
+ Di0 = BCe0 ^ ROL32(BCo1, 1);
+ Di1 = BCe1 ^ BCo0;
+ Do0 = BCi0 ^ ROL32(BCu1, 1);
+ Do1 = BCi1 ^ BCu0;
+ Du0 = BCo0 ^ ROL32(BCa1, 1);
+ Du1 = BCo1 ^ BCa0;
+
+ Aba0 ^= Da0;
+ BCa0 = Aba0;
+ Age0 ^= De0;
+ BCe0 = ROL32(Age0, 22);
+ Aki1 ^= Di1;
+ BCi0 = ROL32(Aki1, 22);
+ Amo1 ^= Do1;
+ BCo0 = ROL32(Amo1, 11);
+ Asu0 ^= Du0;
+ BCu0 = ROL32(Asu0, 7);
+ Eba0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Eba0 ^= round_consts[round * 2 + 0];
+ Ebe0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Ebi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Ebo0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Ebu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Aba1 ^= Da1;
+ BCa1 = Aba1;
+ Age1 ^= De1;
+ BCe1 = ROL32(Age1, 22);
+ Aki0 ^= Di0;
+ BCi1 = ROL32(Aki0, 21);
+ Amo0 ^= Do0;
+ BCo1 = ROL32(Amo0, 10);
+ Asu1 ^= Du1;
+ BCu1 = ROL32(Asu1, 7);
+ Eba1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Eba1 ^= round_consts[round * 2 + 1];
+ Ebe1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Ebi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Ebo1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Ebu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Abo0 ^= Do0;
+ BCa0 = ROL32(Abo0, 14);
+ Agu0 ^= Du0;
+ BCe0 = ROL32(Agu0, 10);
+ Aka1 ^= Da1;
+ BCi0 = ROL32(Aka1, 2);
+ Ame1 ^= De1;
+ BCo0 = ROL32(Ame1, 23);
+ Asi1 ^= Di1;
+ BCu0 = ROL32(Asi1, 31);
+ Ega0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ege0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Egi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Ego0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Egu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Abo1 ^= Do1;
+ BCa1 = ROL32(Abo1, 14);
+ Agu1 ^= Du1;
+ BCe1 = ROL32(Agu1, 10);
+ Aka0 ^= Da0;
+ BCi1 = ROL32(Aka0, 1);
+ Ame0 ^= De0;
+ BCo1 = ROL32(Ame0, 22);
+ Asi0 ^= Di0;
+ BCu1 = ROL32(Asi0, 30);
+ Ega1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ege1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Egi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Ego1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Egu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Abe1 ^= De1;
+ BCa0 = ROL32(Abe1, 1);
+ Agi0 ^= Di0;
+ BCe0 = ROL32(Agi0, 3);
+ Ako1 ^= Do1;
+ BCi0 = ROL32(Ako1, 13);
+ Amu0 ^= Du0;
+ BCo0 = ROL32(Amu0, 4);
+ Asa0 ^= Da0;
+ BCu0 = ROL32(Asa0, 9);
+ Eka0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Eke0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Eki0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Eko0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Eku0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Abe0 ^= De0;
+ BCa1 = Abe0;
+ Agi1 ^= Di1;
+ BCe1 = ROL32(Agi1, 3);
+ Ako0 ^= Do0;
+ BCi1 = ROL32(Ako0, 12);
+ Amu1 ^= Du1;
+ BCo1 = ROL32(Amu1, 4);
+ Asa1 ^= Da1;
+ BCu1 = ROL32(Asa1, 9);
+ Eka1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Eke1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Eki1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Eko1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Eku1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Abu1 ^= Du1;
+ BCa0 = ROL32(Abu1, 14);
+ Aga0 ^= Da0;
+ BCe0 = ROL32(Aga0, 18);
+ Ake0 ^= De0;
+ BCi0 = ROL32(Ake0, 5);
+ Ami1 ^= Di1;
+ BCo0 = ROL32(Ami1, 8);
+ Aso0 ^= Do0;
+ BCu0 = ROL32(Aso0, 28);
+ Ema0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Eme0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Emi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Emo0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Emu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Abu0 ^= Du0;
+ BCa1 = ROL32(Abu0, 13);
+ Aga1 ^= Da1;
+ BCe1 = ROL32(Aga1, 18);
+ Ake1 ^= De1;
+ BCi1 = ROL32(Ake1, 5);
+ Ami0 ^= Di0;
+ BCo1 = ROL32(Ami0, 7);
+ Aso1 ^= Do1;
+ BCu1 = ROL32(Aso1, 28);
+ Ema1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Eme1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Emi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Emo1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Emu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Abi0 ^= Di0;
+ BCa0 = ROL32(Abi0, 31);
+ Ago1 ^= Do1;
+ BCe0 = ROL32(Ago1, 28);
+ Aku1 ^= Du1;
+ BCi0 = ROL32(Aku1, 20);
+ Ama1 ^= Da1;
+ BCo0 = ROL32(Ama1, 21);
+ Ase0 ^= De0;
+ BCu0 = ROL32(Ase0, 1);
+ Esa0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ese0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Esi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Eso0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Esu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Abi1 ^= Di1;
+ BCa1 = ROL32(Abi1, 31);
+ Ago0 ^= Do0;
+ BCe1 = ROL32(Ago0, 27);
+ Aku0 ^= Du0;
+ BCi1 = ROL32(Aku0, 19);
+ Ama0 ^= Da0;
+ BCo1 = ROL32(Ama0, 20);
+ Ase1 ^= De1;
+ BCu1 = ROL32(Ase1, 1);
+ Esa1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ese1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Esi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Eso1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Esu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ /* prepareTheta */
+ BCa0 = Eba0 ^ Ega0 ^ Eka0 ^ Ema0 ^ Esa0;
+ BCa1 = Eba1 ^ Ega1 ^ Eka1 ^ Ema1 ^ Esa1;
+ BCe0 = Ebe0 ^ Ege0 ^ Eke0 ^ Eme0 ^ Ese0;
+ BCe1 = Ebe1 ^ Ege1 ^ Eke1 ^ Eme1 ^ Ese1;
+ BCi0 = Ebi0 ^ Egi0 ^ Eki0 ^ Emi0 ^ Esi0;
+ BCi1 = Ebi1 ^ Egi1 ^ Eki1 ^ Emi1 ^ Esi1;
+ BCo0 = Ebo0 ^ Ego0 ^ Eko0 ^ Emo0 ^ Eso0;
+ BCo1 = Ebo1 ^ Ego1 ^ Eko1 ^ Emo1 ^ Eso1;
+ BCu0 = Ebu0 ^ Egu0 ^ Eku0 ^ Emu0 ^ Esu0;
+ BCu1 = Ebu1 ^ Egu1 ^ Eku1 ^ Emu1 ^ Esu1;
+
+ /* thetaRhoPiChiIota(round+1, E, A) */
+ Da0 = BCu0 ^ ROL32(BCe1, 1);
+ Da1 = BCu1 ^ BCe0;
+ De0 = BCa0 ^ ROL32(BCi1, 1);
+ De1 = BCa1 ^ BCi0;
+ Di0 = BCe0 ^ ROL32(BCo1, 1);
+ Di1 = BCe1 ^ BCo0;
+ Do0 = BCi0 ^ ROL32(BCu1, 1);
+ Do1 = BCi1 ^ BCu0;
+ Du0 = BCo0 ^ ROL32(BCa1, 1);
+ Du1 = BCo1 ^ BCa0;
+
+ Eba0 ^= Da0;
+ BCa0 = Eba0;
+ Ege0 ^= De0;
+ BCe0 = ROL32(Ege0, 22);
+ Eki1 ^= Di1;
+ BCi0 = ROL32(Eki1, 22);
+ Emo1 ^= Do1;
+ BCo0 = ROL32(Emo1, 11);
+ Esu0 ^= Du0;
+ BCu0 = ROL32(Esu0, 7);
+ Aba0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Aba0 ^= round_consts[round * 2 + 2];
+ Abe0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Abi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Abo0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Abu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Eba1 ^= Da1;
+ BCa1 = Eba1;
+ Ege1 ^= De1;
+ BCe1 = ROL32(Ege1, 22);
+ Eki0 ^= Di0;
+ BCi1 = ROL32(Eki0, 21);
+ Emo0 ^= Do0;
+ BCo1 = ROL32(Emo0, 10);
+ Esu1 ^= Du1;
+ BCu1 = ROL32(Esu1, 7);
+ Aba1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Aba1 ^= round_consts[round * 2 + 3];
+ Abe1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Abi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Abo1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Abu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Ebo0 ^= Do0;
+ BCa0 = ROL32(Ebo0, 14);
+ Egu0 ^= Du0;
+ BCe0 = ROL32(Egu0, 10);
+ Eka1 ^= Da1;
+ BCi0 = ROL32(Eka1, 2);
+ Eme1 ^= De1;
+ BCo0 = ROL32(Eme1, 23);
+ Esi1 ^= Di1;
+ BCu0 = ROL32(Esi1, 31);
+ Aga0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Age0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Agi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Ago0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Agu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Ebo1 ^= Do1;
+ BCa1 = ROL32(Ebo1, 14);
+ Egu1 ^= Du1;
+ BCe1 = ROL32(Egu1, 10);
+ Eka0 ^= Da0;
+ BCi1 = ROL32(Eka0, 1);
+ Eme0 ^= De0;
+ BCo1 = ROL32(Eme0, 22);
+ Esi0 ^= Di0;
+ BCu1 = ROL32(Esi0, 30);
+ Aga1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Age1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Agi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Ago1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Agu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Ebe1 ^= De1;
+ BCa0 = ROL32(Ebe1, 1);
+ Egi0 ^= Di0;
+ BCe0 = ROL32(Egi0, 3);
+ Eko1 ^= Do1;
+ BCi0 = ROL32(Eko1, 13);
+ Emu0 ^= Du0;
+ BCo0 = ROL32(Emu0, 4);
+ Esa0 ^= Da0;
+ BCu0 = ROL32(Esa0, 9);
+ Aka0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ake0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Aki0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Ako0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Aku0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Ebe0 ^= De0;
+ BCa1 = Ebe0;
+ Egi1 ^= Di1;
+ BCe1 = ROL32(Egi1, 3);
+ Eko0 ^= Do0;
+ BCi1 = ROL32(Eko0, 12);
+ Emu1 ^= Du1;
+ BCo1 = ROL32(Emu1, 4);
+ Esa1 ^= Da1;
+ BCu1 = ROL32(Esa1, 9);
+ Aka1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ake1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Aki1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Ako1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Aku1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Ebu1 ^= Du1;
+ BCa0 = ROL32(Ebu1, 14);
+ Ega0 ^= Da0;
+ BCe0 = ROL32(Ega0, 18);
+ Eke0 ^= De0;
+ BCi0 = ROL32(Eke0, 5);
+ Emi1 ^= Di1;
+ BCo0 = ROL32(Emi1, 8);
+ Eso0 ^= Do0;
+ BCu0 = ROL32(Eso0, 28);
+ Ama0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ame0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Ami0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Amo0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Amu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Ebu0 ^= Du0;
+ BCa1 = ROL32(Ebu0, 13);
+ Ega1 ^= Da1;
+ BCe1 = ROL32(Ega1, 18);
+ Eke1 ^= De1;
+ BCi1 = ROL32(Eke1, 5);
+ Emi0 ^= Di0;
+ BCo1 = ROL32(Emi0, 7);
+ Eso1 ^= Do1;
+ BCu1 = ROL32(Eso1, 28);
+ Ama1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ame1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Ami1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Amo1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Amu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+
+ Ebi0 ^= Di0;
+ BCa0 = ROL32(Ebi0, 31);
+ Ego1 ^= Do1;
+ BCe0 = ROL32(Ego1, 28);
+ Eku1 ^= Du1;
+ BCi0 = ROL32(Eku1, 20);
+ Ema1 ^= Da1;
+ BCo0 = ROL32(Ema1, 21);
+ Ese0 ^= De0;
+ BCu0 = ROL32(Ese0, 1);
+ Asa0 = BCa0 ^ ANDN32(BCe0, BCi0);
+ Ase0 = BCe0 ^ ANDN32(BCi0, BCo0);
+ Asi0 = BCi0 ^ ANDN32(BCo0, BCu0);
+ Aso0 = BCo0 ^ ANDN32(BCu0, BCa0);
+ Asu0 = BCu0 ^ ANDN32(BCa0, BCe0);
+
+ Ebi1 ^= Di1;
+ BCa1 = ROL32(Ebi1, 31);
+ Ego0 ^= Do0;
+ BCe1 = ROL32(Ego0, 27);
+ Eku0 ^= Du0;
+ BCi1 = ROL32(Eku0, 19);
+ Ema0 ^= Da0;
+ BCo1 = ROL32(Ema0, 20);
+ Ese1 ^= De1;
+ BCu1 = ROL32(Ese1, 1);
+ Asa1 = BCa1 ^ ANDN32(BCe1, BCi1);
+ Ase1 = BCe1 ^ ANDN32(BCi1, BCo1);
+ Asi1 = BCi1 ^ ANDN32(BCo1, BCu1);
+ Aso1 = BCo1 ^ ANDN32(BCu1, BCa1);
+ Asu1 = BCu1 ^ ANDN32(BCa1, BCe1);
+ }
+
+ state[0] = Aba0;
+ state[1] = Aba1;
+ state[2] = Abe0;
+ state[3] = Abe1;
+ state[4] = Abi0;
+ state[5] = Abi1;
+ state[6] = Abo0;
+ state[7] = Abo1;
+ state[8] = Abu0;
+ state[9] = Abu1;
+ state[10] = Aga0;
+ state[11] = Aga1;
+ state[12] = Age0;
+ state[13] = Age1;
+ state[14] = Agi0;
+ state[15] = Agi1;
+ state[16] = Ago0;
+ state[17] = Ago1;
+ state[18] = Agu0;
+ state[19] = Agu1;
+ state[20] = Aka0;
+ state[21] = Aka1;
+ state[22] = Ake0;
+ state[23] = Ake1;
+ state[24] = Aki0;
+ state[25] = Aki1;
+ state[26] = Ako0;
+ state[27] = Ako1;
+ state[28] = Aku0;
+ state[29] = Aku1;
+ state[30] = Ama0;
+ state[31] = Ama1;
+ state[32] = Ame0;
+ state[33] = Ame1;
+ state[34] = Ami0;
+ state[35] = Ami1;
+ state[36] = Amo0;
+ state[37] = Amo1;
+ state[38] = Amu0;
+ state[39] = Amu1;
+ state[40] = Asa0;
+ state[41] = Asa1;
+ state[42] = Ase0;
+ state[43] = Ase1;
+ state[44] = Asi0;
+ state[45] = Asi1;
+ state[46] = Aso0;
+ state[47] = Aso1;
+ state[48] = Asu0;
+ state[49] = Asu1;
+
+ return sizeof(void *) * 4 + sizeof(u32) * 12 * 5 * 2;
+}