From a1cc7bb15473a2419b24ecac765ae0ce5989a13b Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Sun, 1 Nov 2015 16:06:26 +0200
Subject: Add ARMv7/NEON implementation of Keccak

* cipher/Makefile.am: Add 'keccak-armv7-neon.S'.
* cipher/keccak-armv7-neon.S: New.
* cipher/keccak.c (USE_64BIT_ARM_NEON): New.
(NEED_COMMON64): Select if USE_64BIT_ARM_NEON.
[NEED_COMMON64] (round_consts_64bit): Rename to...
[NEED_COMMON64] (_gcry_keccak_round_consts_64bit): ...this; Add
terminator at end.
[USE_64BIT_ARM_NEON] (_gcry_keccak_permute_armv7_neon)
(_gcry_keccak_absorb_lanes64_armv7_neon, keccak_permute64_armv7_neon)
(keccak_absorb_lanes64_armv7_neon, keccak_armv7_neon_64_ops): New.
(keccak_init) [USE_64BIT_ARM_NEON]: Select ARM/NEON implementation if
supported by HW.
* cipher/keccak_permute_64.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Update
to use new round constant table.
* configure.ac: Add 'keccak-armv7-neon.lo'.
--

Patch adds ARMv7/NEON implementation of Keccak (SHAKE/SHA3).

Patch is based on public-domain implementation by Ronny Van Keer from
SUPERCOP package:
  https://github.com/floodyberry/supercop/blob/master/crypto_hash/\
keccakc1024/inplace-armv7a-neon/keccak2.s

Benchmark results on Cortex-A8 @ 1008 MHz:

Before (generic 32-bit bit-interleaved impl.):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128       |     83.00 ns/B     11.49 MiB/s     83.67 c/B
 SHAKE256       |     101.7 ns/B      9.38 MiB/s     102.5 c/B
 SHA3-224       |     96.13 ns/B      9.92 MiB/s     96.90 c/B
 SHA3-256       |     101.5 ns/B      9.40 MiB/s     102.3 c/B
 SHA3-384       |     131.4 ns/B      7.26 MiB/s     132.5 c/B
 SHA3-512       |     189.1 ns/B      5.04 MiB/s     190.6 c/B

After (ARM/NEON, ~3.2x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128       |     25.09 ns/B     38.01 MiB/s     25.29 c/B
 SHAKE256       |     30.95 ns/B     30.82 MiB/s     31.19 c/B
 SHA3-224       |     29.24 ns/B     32.61 MiB/s     29.48 c/B
 SHA3-256       |     30.95 ns/B     30.82 MiB/s     31.19 c/B
 SHA3-384       |     40.42 ns/B     23.59 MiB/s     40.74 c/B
 SHA3-512       |     58.37 ns/B     16.34 MiB/s     58.84 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am         |   2 +-
 cipher/keccak-armv7-neon.S | 945 +++++++++++++++++++++++++++++++++++++++++++++
 cipher/keccak.c            |  71 +++-
 cipher/keccak_permute_64.h |   2 +-
 4 files changed, 1015 insertions(+), 5 deletions(-)
 create mode 100644 cipher/keccak-armv7-neon.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index be03d065..88c8fbf5 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -90,7 +90,7 @@ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
 sha512-armv7-neon.S \
-keccak.c keccak_permute_32.h keccak_permute_64.h \
+keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c whirlpool-sse2-amd64.S \
diff --git a/cipher/keccak-armv7-neon.S b/cipher/keccak-armv7-neon.S
new file mode 100644
index 00000000..0bec8d50
--- /dev/null
+++ b/cipher/keccak-armv7-neon.S
@@ -0,0 +1,945 @@
+/* keccak-armv7-neon.S - ARMv7/NEON implementation of Keccak
+ *
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_NEON)
+
+/* Based on public-domain/CC0 implementation from SUPERCOP package
+ * (keccakc1024/inplace-armv7a-neon/keccak2.s)
+ *
+ * Original copyright header follows:
+ */
+
+@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+@ Michaël Peeters and Gilles Van Assche. For more information, feedback or
+@ questions, please refer to our website: http://keccak.noekeon.org/
+@
+@ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+@
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+
+.extern _gcry_keccak_round_consts_64bit;
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+    ldr reg, 1f; \
+    ldr rtmp, 2f; \
+    b 3f; \
+  1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+  2: .word name(GOT); \
+  3: add reg, pc, reg; \
+    ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+@// --- offsets in state
+.equ Aba, 0*8
+.equ Aga, 1*8
+.equ Aka, 2*8
+.equ Ama, 3*8
+.equ Asa, 4*8
+
+@// --- macros
+
+.macro KeccakThetaRhoPiChiIota argA1, argA2, argA3, argA4, argA5
+
+    @Prepare Theta
+    @Ca = Aba^Aga^Aka^Ama^Asa@
+    @Ce = Abe^Age^Ake^Ame^Ase@
+    @Ci = Abi^Agi^Aki^Ami^Asi@
+    @Co = Abo^Ago^Ako^Amo^Aso@
+    @Cu = Abu^Agu^Aku^Amu^Asu@
+    @De = Ca^ROL64(Ci, 1)@
+    @Di = Ce^ROL64(Co, 1)@
+    @Do = Ci^ROL64(Cu, 1)@
+    @Du = Co^ROL64(Ca, 1)@
+    @Da = Cu^ROL64(Ce, 1)@
+
+    veor.64 q4, q6, q7
+    veor.64 q5, q9, q10
+    veor.64 d8, d8, d9
+    veor.64 d10, d10, d11
+    veor.64 d1, d8, d16
+    veor.64 d2, d10, d17
+
+    veor.64 q4, q11, q12
+    veor.64 q5, q14, q15
+    veor.64 d8, d8, d9
+    veor.64 d10, d10, d11
+    veor.64 d3, d8, d26
+
+    vadd.u64 q4, q1, q1
+    veor.64 d4, d10, d27
+    vmov.64 d0, d5
+    vsri.64 q4, q1, #63
+
+    vadd.u64 q5, q2, q2
+    veor.64 q4, q4, q0
+    vsri.64 q5, q2, #63
+    vadd.u64 d7, d1, d1
+    veor.64 \argA2, \argA2, d8
+    veor.64 q5, q5, q1
+
+    vsri.64 d7, d1, #63
+    vshl.u64 d1, \argA2, #44
+    veor.64 \argA3, \argA3, d9
+    veor.64 d7, d7, d4
+
+    @Ba = argA1^Da@
+    @Be = ROL64((argA2^De), 44)@
+    @Bi = ROL64((argA3^Di), 43)@
+    @Bo = ROL64((argA4^Do), 21)@
+    @Bu = ROL64((argA5^Du), 14)@
+    @argA2 = Be ^((~Bi)& Bo )@
+    @argA3 = Bi ^((~Bo)& Bu )@
+    @argA4 = Bo ^((~Bu)& Ba )@
+    @argA5 = Bu ^((~Ba)& Be )@
+    @argA1 = Ba ^((~Be)& Bi )@ argA1 ^= KeccakF1600RoundConstants[i+round]@
+
+    vsri.64 d1, \argA2, #64-44
+    vshl.u64 d2, \argA3, #43
+    vldr.64 d0, [sp, #\argA1]
+    veor.64 \argA4, \argA4, d10
+    vsri.64 d2, \argA3, #64-43
+    vshl.u64 d3, \argA4, #21
+    veor.64 \argA5, \argA5, d11
+    veor.64 d0, d0, d7
+    vsri.64 d3, \argA4, #64-21
+    vbic.64 d5, d2, d1
+    vshl.u64 d4, \argA5, #14
+    vbic.64 \argA2, d3, d2
+    vld1.64 d6, [ip]!
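+    @ d6 now holds the next 64-bit round constant; ip walks the
+    @ _gcry_keccak_round_consts_64bit table and is post-incremented
+    @ once per round.  The vbic/veor pairs around this load implement
+    @ Chi (out = B ^ ((~B') & B'')) and Iota (lane 0 ^= constant).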
+ veor.64 d5, d0 + vsri.64 d4, \argA5, #64-14 + veor.64 d5, d6 + vbic.64 \argA5, d1, d0 + vbic.64 \argA3, d4, d3 + vbic.64 \argA4, d0, d4 + veor.64 \argA2, d1 + vstr.64 d5, [sp, #\argA1] + veor.64 \argA3, d2 + veor.64 \argA4, d3 + veor.64 \argA5, d4 + + .endm + +.macro KeccakThetaRhoPiChi1 argA1, argA2, argA3, argA4, argA5 + + @d2 = ROL64((argA1^Da), 3)@ + @d3 = ROL64((argA2^De), 45)@ + @d4 = ROL64((argA3^Di), 61)@ + @d0 = ROL64((argA4^Do), 28)@ + @d1 = ROL64((argA5^Du), 20)@ + @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ + @argA2 = Be ^((~Bi)& Bo )@ + @argA3 = Bi ^((~Bo)& Bu )@ + @argA4 = Bo ^((~Bu)& Ba )@ + @argA5 = Bu ^((~Ba)& Be )@ + + veor.64 \argA2, \argA2, d8 + veor.64 \argA3, \argA3, d9 + vshl.u64 d3, \argA2, #45 + vldr.64 d6, [sp, #\argA1] + vshl.u64 d4, \argA3, #61 + veor.64 \argA4, \argA4, d10 + vsri.64 d3, \argA2, #64-45 + veor.64 \argA5, \argA5, d11 + vsri.64 d4, \argA3, #64-61 + vshl.u64 d0, \argA4, #28 + veor.64 d6, d6, d7 + vshl.u64 d1, \argA5, #20 + vbic.64 \argA3, d4, d3 + vsri.64 d0, \argA4, #64-28 + vbic.64 \argA4, d0, d4 + vshl.u64 d2, d6, #3 + vsri.64 d1, \argA5, #64-20 + veor.64 \argA4, d3 + vsri.64 d2, d6, #64-3 + vbic.64 \argA5, d1, d0 + vbic.64 d6, d2, d1 + vbic.64 \argA2, d3, d2 + veor.64 d6, d0 + veor.64 \argA2, d1 + vstr.64 d6, [sp, #\argA1] + veor.64 \argA3, d2 + veor.64 d5, d6 + veor.64 \argA5, d4 + + .endm + +.macro KeccakThetaRhoPiChi2 argA1, argA2, argA3, argA4, argA5 + + @d4 = ROL64((argA1^Da), 18)@ + @d0 = ROL64((argA2^De), 1)@ + @d1 = ROL64((argA3^Di), 6)@ + @d2 = ROL64((argA4^Do), 25)@ + @d3 = ROL64((argA5^Du), 8)@ + @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ + @argA2 = Be ^((~Bi)& Bo )@ + @argA3 = Bi ^((~Bo)& Bu )@ + @argA4 = Bo ^((~Bu)& Ba )@ + @argA5 = Bu ^((~Ba)& Be )@ + + veor.64 \argA3, \argA3, d9 + veor.64 \argA4, \argA4, d10 + vshl.u64 d1, \argA3, #6 + vldr.64 d6, [sp, #\argA1] + vshl.u64 d2, \argA4, #25 + veor.64 \argA5, \argA5, d11 + vsri.64 d1, \argA3, #64-6 + veor.64 \argA2, \argA2, d8 + vsri.64 d2, \argA4, #64-25 + vext.8 d3, \argA5, \argA5, #7 + veor.64 d6, d6, d7 + vbic.64 \argA3, d2, d1 + vadd.u64 d0, \argA2, \argA2 + vbic.64 \argA4, d3, d2 + vsri.64 d0, \argA2, #64-1 + vshl.u64 d4, d6, #18 + veor.64 \argA2, d1, \argA4 + veor.64 \argA3, d0 + vsri.64 d4, d6, #64-18 + vstr.64 \argA3, [sp, #\argA1] + veor.64 d5, \argA3 + vbic.64 \argA5, d1, d0 + vbic.64 \argA3, d4, d3 + vbic.64 \argA4, d0, d4 + veor.64 \argA3, d2 + veor.64 \argA4, d3 + veor.64 \argA5, d4 + + .endm + +.macro KeccakThetaRhoPiChi3 argA1, argA2, argA3, argA4, argA5 + + @d1 = ROL64((argA1^Da), 36)@ + @d2 = ROL64((argA2^De), 10)@ + @d3 = ROL64((argA3^Di), 15)@ + @d4 = ROL64((argA4^Do), 56)@ + @d0 = ROL64((argA5^Du), 27)@ + @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ + @argA2 = Be ^((~Bi)& Bo )@ + @argA3 = Bi ^((~Bo)& Bu )@ + @argA4 = Bo ^((~Bu)& Ba )@ + @argA5 = Bu ^((~Ba)& Be )@ + + veor.64 \argA2, \argA2, d8 + veor.64 \argA3, \argA3, d9 + vshl.u64 d2, \argA2, #10 + vldr.64 d6, [sp, #\argA1] + vshl.u64 d3, \argA3, #15 + veor.64 \argA4, \argA4, d10 + vsri.64 d2, \argA2, #64-10 + vsri.64 d3, \argA3, #64-15 + veor.64 \argA5, \argA5, d11 + vext.8 d4, \argA4, \argA4, #1 + vbic.64 \argA2, d3, d2 + vshl.u64 d0, \argA5, #27 + veor.64 d6, d6, d7 + vbic.64 \argA3, d4, d3 + vsri.64 d0, \argA5, #64-27 + vshl.u64 d1, d6, #36 + veor.64 \argA3, d2 + vbic.64 \argA4, d0, d4 + vsri.64 d1, d6, #64-36 + + veor.64 \argA4, d3 + vbic.64 d6, d2, d1 + vbic.64 \argA5, d1, d0 + veor.64 d6, d0 + veor.64 \argA2, d1 + vstr.64 d6, [sp, #\argA1] + veor.64 d5, d6 + veor.64 \argA5, d4 + + .endm + +.macro 
KeccakThetaRhoPiChi4 argA1, argA2, argA3, argA4, argA5 + + @d3 = ROL64((argA1^Da), 41)@ + @d4 = ROL64((argA2^De), 2)@ + @d0 = ROL64((argA3^Di), 62)@ + @d1 = ROL64((argA4^Do), 55)@ + @d2 = ROL64((argA5^Du), 39)@ + @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@ + @argA2 = Be ^((~Bi)& Bo )@ + @argA3 = Bi ^((~Bo)& Bu )@ + @argA4 = Bo ^((~Bu)& Ba )@ + @argA5 = Bu ^((~Ba)& Be )@ + + veor.64 \argA2, \argA2, d8 + veor.64 \argA3, \argA3, d9 + vshl.u64 d4, \argA2, #2 + veor.64 \argA5, \argA5, d11 + vshl.u64 d0, \argA3, #62 + vldr.64 d6, [sp, #\argA1] + vsri.64 d4, \argA2, #64-2 + veor.64 \argA4, \argA4, d10 + vsri.64 d0, \argA3, #64-62 + + vshl.u64 d1, \argA4, #55 + veor.64 d6, d6, d7 + vshl.u64 d2, \argA5, #39 + vsri.64 d1, \argA4, #64-55 + vbic.64 \argA4, d0, d4 + vsri.64 d2, \argA5, #64-39 + vbic.64 \argA2, d1, d0 + vshl.u64 d3, d6, #41 + veor.64 \argA5, d4, \argA2 + vbic.64 \argA2, d2, d1 + vsri.64 d3, d6, #64-41 + veor.64 d6, d0, \argA2 + + vbic.64 \argA2, d3, d2 + vbic.64 \argA3, d4, d3 + veor.64 \argA2, d1 + vstr.64 d6, [sp, #\argA1] + veor.64 d5, d6 + veor.64 \argA3, d2 + veor.64 \argA4, d3 + + .endm + + +@// --- code + +@not callable from C! +.p2align 3 +.type KeccakF_armv7a_neon_asm,%function; +KeccakF_armv7a_neon_asm: @ + +.LroundLoop: + + KeccakThetaRhoPiChiIota Aba, d13, d19, d25, d31 + KeccakThetaRhoPiChi1 Aka, d15, d21, d22, d28 + KeccakThetaRhoPiChi2 Asa, d12, d18, d24, d30 + KeccakThetaRhoPiChi3 Aga, d14, d20, d26, d27 + KeccakThetaRhoPiChi4 Ama, d16, d17, d23, d29 + + KeccakThetaRhoPiChiIota Aba, d15, d18, d26, d29 + KeccakThetaRhoPiChi1 Asa, d14, d17, d25, d28 + KeccakThetaRhoPiChi2 Ama, d13, d21, d24, d27 + KeccakThetaRhoPiChi3 Aka, d12, d20, d23, d31 + KeccakThetaRhoPiChi4 Aga, d16, d19, d22, d30 + + KeccakThetaRhoPiChiIota Aba, d14, d21, d23, d30 + KeccakThetaRhoPiChi1 Ama, d12, d19, d26, d28 + KeccakThetaRhoPiChi2 Aga, d15, d17, d24, d31 + KeccakThetaRhoPiChi3 Asa, d13, d20, d22, d29 + KeccakThetaRhoPiChi4 Aka, d16, d18, d25, d27 + + KeccakThetaRhoPiChiIota Aba, d12, d17, d22, d27 + KeccakThetaRhoPiChi1 Aga, d13, d18, d23, d28 + KeccakThetaRhoPiChi2 Aka, d14, d19, d24, d29 + ldr r0, [ip] + KeccakThetaRhoPiChi3 Ama, d15, d20, d25, d30 + cmp r0, #0xFFFFFFFF + KeccakThetaRhoPiChi4 Asa, d16, d21, d26, d31 + + bne .LroundLoop + sub ip, #(8*24) + bx lr +.p2align 2 +.ltorg +.size KeccakF_armv7a_neon_asm,.-KeccakF_armv7a_neon_asm; + + +@//unsigned _gcry_keccak_permute_armv7_neon(u64 *state) callable from C +.p2align 3 +.global _gcry_keccak_permute_armv7_neon +.type _gcry_keccak_permute_armv7_neon,%function; +_gcry_keccak_permute_armv7_neon: + + push {ip, lr} + vpush {q4-q7} + sub sp,sp, #5*8 + + vldr.64 d0, [r0, #0*8] + vldr.64 d12, [r0, #1*8] + vldr.64 d17, [r0, #2*8] + vldr.64 d22, [r0, #3*8] + vldr.64 d27, [r0, #4*8] + + GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr); + + vldr.64 d1, [r0, #5*8] + vldr.64 d13, [r0, #6*8] + vldr.64 d18, [r0, #7*8] + vldr.64 d23, [r0, #8*8] + vldr.64 d28, [r0, #9*8] + + vldr.64 d2, [r0, #10*8] + vldr.64 d14, [r0, #11*8] + vldr.64 d19, [r0, #12*8] + vldr.64 d24, [r0, #13*8] + vldr.64 d29, [r0, #14*8] + + vldr.64 d3, [r0, #15*8] + vldr.64 d15, [r0, #16*8] + vldr.64 d20, [r0, #17*8] + vldr.64 d25, [r0, #18*8] + vldr.64 d30, [r0, #19*8] + + vldr.64 d4, [r0, #20*8] + vldr.64 d16, [r0, #21*8] + vldr.64 d21, [r0, #22*8] + vldr.64 d26, [r0, #23*8] + vldr.64 d31, [r0, #24*8] + + vstr.64 d0, [sp, #Aba] + vstr.64 d1, [sp, #Aga] + veor.64 q0, q0, q1 + vstr.64 d2, [sp, #Aka] + veor.64 d5, d0, d1 + vstr.64 d3, [sp, #Ama] + mov r1, r0 + vstr.64 d4, [sp, #Asa] + 
veor.64 d5, d5, d4

+    bl KeccakF_armv7a_neon_asm
+
+    vpop.64 { d0- d4 }
+
+    vstr.64 d0, [r1, #0*8]
+    vstr.64 d12, [r1, #1*8]
+    vstr.64 d17, [r1, #2*8]
+    vstr.64 d22, [r1, #3*8]
+    vstr.64 d27, [r1, #4*8]
+
+    vstr.64 d1, [r1, #5*8]
+    vstr.64 d13, [r1, #6*8]
+    vstr.64 d18, [r1, #7*8]
+    vstr.64 d23, [r1, #8*8]
+    vstr.64 d28, [r1, #9*8]
+
+    vstr.64 d2, [r1, #10*8]
+    vstr.64 d14, [r1, #11*8]
+    vstr.64 d19, [r1, #12*8]
+    vstr.64 d24, [r1, #13*8]
+    vstr.64 d29, [r1, #14*8]
+
+    vstr.64 d3, [r1, #15*8]
+    vstr.64 d15, [r1, #16*8]
+    vstr.64 d20, [r1, #17*8]
+    vstr.64 d25, [r1, #18*8]
+    vstr.64 d30, [r1, #19*8]
+
+    vstr.64 d4, [r1, #20*8]
+    vstr.64 d16, [r1, #21*8]
+    vstr.64 d21, [r1, #22*8]
+    vstr.64 d26, [r1, #23*8]
+    vstr.64 d31, [r1, #24*8]
+
+    mov r0, #112
+    vpop {q4-q7}
+    pop {ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_permute_armv7_neon,.-_gcry_keccak_permute_armv7_neon;
+
+@//unsigned _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, @r4
+@                                                  int pos, @r1
+@                                                  const byte *lanes, @r2
+@                                                  unsigned int nlanes, @r3
+@                                                  int blocklanes) @ r5 callable from C
+.p2align 3
+.global _gcry_keccak_absorb_lanes64_armv7_neon
+.type _gcry_keccak_absorb_lanes64_armv7_neon,%function;
+_gcry_keccak_absorb_lanes64_armv7_neon:
+
+    cmp r3, #0 @ nlanes == 0
+    itt eq
+    moveq r0, #0
+    bxeq lr
+
+    push {r4-r5, ip, lr}
+    beq .Lout
+    mov r4, r0
+    ldr r5, [sp, #(4*4)]
+    vpush {q4-q7}
+
+    @ load state
+    vldr.64 d0, [r4, #0*8]
+    vldr.64 d12, [r4, #1*8]
+    vldr.64 d17, [r4, #2*8]
+    vldr.64 d22, [r4, #3*8]
+    vldr.64 d27, [r4, #4*8]
+
+    GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+    vldr.64 d1, [r4, #5*8]
+    vldr.64 d13, [r4, #6*8]
+    vldr.64 d18, [r4, #7*8]
+    vldr.64 d23, [r4, #8*8]
+    vldr.64 d28, [r4, #9*8]
+
+    vldr.64 d2, [r4, #10*8]
+    vldr.64 d14, [r4, #11*8]
+    vldr.64 d19, [r4, #12*8]
+    vldr.64 d24, [r4, #13*8]
+    vldr.64 d29, [r4, #14*8]
+
+    vldr.64 d3, [r4, #15*8]
+    vldr.64 d15, [r4, #16*8]
+    vldr.64 d20, [r4, #17*8]
+    vldr.64 d25, [r4, #18*8]
+    vldr.64 d30, [r4, #19*8]
+
+    vldr.64 d4, [r4, #20*8]
+    vldr.64 d16, [r4, #21*8]
+    vldr.64 d21, [r4, #22*8]
+    vldr.64 d26, [r4, #23*8]
+    vldr.64 d31, [r4, #24*8]
+
+.Lmain_loop:
+
+    @ detect absorb mode (full blocks vs lanes)
+
+    cmp r1, #0 @ pos != 0
+    bne .Llanes_loop
+
+.Lmain_loop_pos0:
+
+    @ full blocks mode
+
+    @ switch (blocksize)
+    cmp r5, #21
+    beq .Lfull_block_21
+    cmp r5, #18
+    beq .Lfull_block_18
+    cmp r5, #17
+    beq .Lfull_block_17
+    cmp r5, #13
+    beq .Lfull_block_13
+    cmp r5, #9
+    beq .Lfull_block_9
+
+    @ unknown blocksize
+    b .Llanes_loop
+
+.Lfull_block_21:
+
+    @ SHAKE128
+
+    cmp r3, #21 @ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0, d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1, d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d29, d5
+
+    veor d3, d6
+    veor d15, d7
+    veor d20, d8
+    veor d25, d9
+    veor d30, d10
+
+    veor d4, d11
+
+    vstr.64 d0, [sp, #Aba]
+    vstr.64 d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64 d2, [sp, #Aka]
+    veor.64 d5, d0, d1
+    vstr.64 d3, [sp, #Ama]
+    vstr.64 d4, [sp, #Asa]
+    veor.64 d5, d5, d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #21 @ nlanes -= 21
+    vpop.64 { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_21
+
+.Lfull_block_18:
+
+    @ SHA3-224
+
+    cmp r3, #18 @ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
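+    @ As in .Lfull_block_21 above, lanes are streamed from the message
+    @ pointer in r2 and XORed into the state registers in groups; the
+    @ 18-lane block matches the SHA3-224 rate of 1152 bits (144 bytes).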
+ veor d0, d5 + vld1.64 {d9-d11}, [r2]! + veor d12, d6 + veor d17, d7 + veor d22, d8 + vld1.64 {d5-d8}, [r2]! + veor d27, d9 + + veor d1, d10 + veor d13, d11 + vld1.64 {d9-d11}, [r2]! + veor d18, d5 + veor d23, d6 + veor d28, d7 + + veor d2, d8 + vld1.64 {d5-d8}, [r2]! + veor d14, d9 + veor d19, d10 + veor d24, d11 + veor d29, d5 + + veor d3, d6 + veor d15, d7 + veor d20, d8 + + vstr.64 d0, [sp, #Aba] + vstr.64 d1, [sp, #Aga] + veor.64 q0, q0, q1 + vstr.64 d2, [sp, #Aka] + veor.64 d5, d0, d1 + vstr.64 d3, [sp, #Ama] + vstr.64 d4, [sp, #Asa] + veor.64 d5, d5, d4 + + bl KeccakF_armv7a_neon_asm + + subs r3, #18 @ nlanes -= 18 + vpop.64 { d0-d4 } + + beq .Ldone + + b .Lfull_block_18 + +.Lfull_block_17: + + @ SHA3-256 & SHAKE256 + + cmp r3, #17 @ nlanes < blocklanes + blo .Llanes_loop + + sub sp,sp, #5*8 + + vld1.64 {d5-d8}, [r2]! + veor d0, d5 + vld1.64 {d9-d11}, [r2]! + veor d12, d6 + veor d17, d7 + veor d22, d8 + vld1.64 {d5-d8}, [r2]! + veor d27, d9 + + veor d1, d10 + veor d13, d11 + vld1.64 {d9-d11}, [r2]! + veor d18, d5 + veor d23, d6 + veor d28, d7 + + veor d2, d8 + vld1.64 {d5-d7}, [r2]! + veor d14, d9 + veor d19, d10 + veor d24, d11 + veor d29, d5 + + veor d3, d6 + veor d15, d7 + + vstr.64 d0, [sp, #Aba] + vstr.64 d1, [sp, #Aga] + veor.64 q0, q0, q1 + vstr.64 d2, [sp, #Aka] + veor.64 d5, d0, d1 + vstr.64 d3, [sp, #Ama] + vstr.64 d4, [sp, #Asa] + veor.64 d5, d5, d4 + + bl KeccakF_armv7a_neon_asm + + subs r3, #17 @ nlanes -= 17 + vpop.64 { d0-d4 } + + beq .Ldone + + b .Lfull_block_17 + +.Lfull_block_13: + + @ SHA3-384 + + cmp r3, #13 @ nlanes < blocklanes + blo .Llanes_loop + + sub sp,sp, #5*8 + + vld1.64 {d5-d8}, [r2]! + veor d0, d5 + vld1.64 {d9-d11}, [r2]! + veor d12, d6 + veor d17, d7 + veor d22, d8 + vld1.64 {d5-d8}, [r2]! + veor d27, d9 + + veor d1, d10 + veor d13, d11 + vld1.64 {d9-d10}, [r2]! + veor d18, d5 + veor d23, d6 + veor d28, d7 + + veor d2, d8 + veor d14, d9 + veor d19, d10 + + vstr.64 d0, [sp, #Aba] + vstr.64 d1, [sp, #Aga] + veor.64 q0, q0, q1 + vstr.64 d2, [sp, #Aka] + veor.64 d5, d0, d1 + vstr.64 d3, [sp, #Ama] + vstr.64 d4, [sp, #Asa] + veor.64 d5, d5, d4 + + bl KeccakF_armv7a_neon_asm + + subs r3, #13 @ nlanes -= 13 + vpop.64 { d0-d4 } + + beq .Ldone + + b .Lfull_block_13 + +.Lfull_block_9: + + @ SHA3-512 + + cmp r3, #9 @ nlanes < blocklanes + blo .Llanes_loop + + sub sp,sp, #5*8 + + vld1.64 {d5-d8}, [r2]! + veor d0, d5 + vld1.64 {d9-d11}, [r2]! + veor d12, d6 + veor d17, d7 + veor d22, d8 + vld1.64 {d5-d6}, [r2]! 
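+    @ Nine lanes per block is the SHA3-512 rate: 1600 - 2*512 = 576
+    @ bits (72 bytes), so only state lanes 0..8 absorb message data
+    @ before each call to the permutation.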
+ veor d27, d9 + + veor d1, d10 + veor d13, d11 + veor d18, d5 + veor d23, d6 + + vstr.64 d0, [sp, #Aba] + vstr.64 d1, [sp, #Aga] + veor.64 q0, q0, q1 + vstr.64 d2, [sp, #Aka] + veor.64 d5, d0, d1 + vstr.64 d3, [sp, #Ama] + vstr.64 d4, [sp, #Asa] + veor.64 d5, d5, d4 + + bl KeccakF_armv7a_neon_asm + + subs r3, #9 @ nlanes -= 9 + vpop.64 { d0-d4 } + + beq .Ldone + + b .Lfull_block_9 + +.Llanes_loop: + + @ per-lane mode + + @ switch (pos) + ldrb r0, [pc, r1] + add pc, pc, r0, lsl #2 +.Lswitch_table: + .byte (.Llane0-.Lswitch_table-4)/4 + .byte (.Llane1-.Lswitch_table-4)/4 + .byte (.Llane2-.Lswitch_table-4)/4 + .byte (.Llane3-.Lswitch_table-4)/4 + .byte (.Llane4-.Lswitch_table-4)/4 + .byte (.Llane5-.Lswitch_table-4)/4 + .byte (.Llane6-.Lswitch_table-4)/4 + .byte (.Llane7-.Lswitch_table-4)/4 + .byte (.Llane8-.Lswitch_table-4)/4 + .byte (.Llane9-.Lswitch_table-4)/4 + .byte (.Llane10-.Lswitch_table-4)/4 + .byte (.Llane11-.Lswitch_table-4)/4 + .byte (.Llane12-.Lswitch_table-4)/4 + .byte (.Llane13-.Lswitch_table-4)/4 + .byte (.Llane14-.Lswitch_table-4)/4 + .byte (.Llane15-.Lswitch_table-4)/4 + .byte (.Llane16-.Lswitch_table-4)/4 + .byte (.Llane17-.Lswitch_table-4)/4 + .byte (.Llane18-.Lswitch_table-4)/4 + .byte (.Llane19-.Lswitch_table-4)/4 + .byte (.Llane20-.Lswitch_table-4)/4 + .byte (.Llane21-.Lswitch_table-4)/4 + .byte (.Llane22-.Lswitch_table-4)/4 + .byte (.Llane23-.Lswitch_table-4)/4 + .byte (.Llane24-.Lswitch_table-4)/4 +.p2align 2 + +#define ABSORB_LANE(label, vreg) \ + label: \ + add r1, #1; \ + vld1.64 d5, [r2]!; \ + cmp r1, r5; /* pos == blocklanes */ \ + veor vreg, vreg, d5; \ + beq .Llanes_permute; \ + subs r3, #1; \ + beq .Ldone; + + ABSORB_LANE(.Llane0, d0) + ABSORB_LANE(.Llane1, d12) + ABSORB_LANE(.Llane2, d17) + ABSORB_LANE(.Llane3, d22) + ABSORB_LANE(.Llane4, d27) + + ABSORB_LANE(.Llane5, d1) + ABSORB_LANE(.Llane6, d13) + ABSORB_LANE(.Llane7, d18) + ABSORB_LANE(.Llane8, d23) + ABSORB_LANE(.Llane9, d28) + + ABSORB_LANE(.Llane10, d2) + ABSORB_LANE(.Llane11, d14) + ABSORB_LANE(.Llane12, d19) + ABSORB_LANE(.Llane13, d24) + ABSORB_LANE(.Llane14, d29) + + ABSORB_LANE(.Llane15, d3) + ABSORB_LANE(.Llane16, d15) + ABSORB_LANE(.Llane17, d20) + ABSORB_LANE(.Llane18, d25) + ABSORB_LANE(.Llane19, d30) + + ABSORB_LANE(.Llane20, d4) + ABSORB_LANE(.Llane21, d16) + ABSORB_LANE(.Llane22, d21) + ABSORB_LANE(.Llane23, d26) + ABSORB_LANE(.Llane24, d31) + + b .Llanes_loop + +.Llanes_permute: + + sub sp,sp, #5*8 + vstr.64 d0, [sp, #Aba] + vstr.64 d1, [sp, #Aga] + veor.64 q0, q0, q1 + vstr.64 d2, [sp, #Aka] + veor.64 d5, d0, d1 + vstr.64 d3, [sp, #Ama] + vstr.64 d4, [sp, #Asa] + veor.64 d5, d5, d4 + + bl KeccakF_armv7a_neon_asm + + mov r1, #0 @ pos <= 0 + subs r3, #1 + + vpop.64 { d0-d4 } + + beq .Ldone + + b .Lmain_loop_pos0 + +.Ldone: + + @ save state + vstr.64 d0, [r4, #0*8] + vstr.64 d12, [r4, #1*8] + vstr.64 d17, [r4, #2*8] + vstr.64 d22, [r4, #3*8] + vstr.64 d27, [r4, #4*8] + + vstr.64 d1, [r4, #5*8] + vstr.64 d13, [r4, #6*8] + vstr.64 d18, [r4, #7*8] + vstr.64 d23, [r4, #8*8] + vstr.64 d28, [r4, #9*8] + + vstr.64 d2, [r4, #10*8] + vstr.64 d14, [r4, #11*8] + vstr.64 d19, [r4, #12*8] + vstr.64 d24, [r4, #13*8] + vstr.64 d29, [r4, #14*8] + + vstr.64 d3, [r4, #15*8] + vstr.64 d15, [r4, #16*8] + vstr.64 d20, [r4, #17*8] + vstr.64 d25, [r4, #18*8] + vstr.64 d30, [r4, #19*8] + + vstr.64 d4, [r4, #20*8] + vstr.64 d16, [r4, #21*8] + vstr.64 d21, [r4, #22*8] + vstr.64 d26, [r4, #23*8] + vstr.64 d31, [r4, #24*8] + + mov r0, #120 + vpop {q4-q7} +.Lout: + pop {r4-r5, ip, pc} +.p2align 2 +.ltorg +.size 
_gcry_keccak_absorb_lanes64_armv7_neon,.-_gcry_keccak_absorb_lanes64_armv7_neon;
+
+#endif
diff --git a/cipher/keccak.c b/cipher/keccak.c
index ce578607..0bb31552 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -59,7 +59,19 @@
 #endif
 
 
-#ifdef USE_64BIT
+/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
+ * code. */
+#undef USE_64BIT_ARM_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+     && defined(HAVE_GCC_INLINE_ASM_NEON)
+#  define USE_64BIT_ARM_NEON 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+
+#if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
 # define NEED_COMMON64 1
 #endif
 
@@ -109,7 +121,7 @@ typedef struct KECCAK_CONTEXT_S
 
 #ifdef NEED_COMMON64
 
-static const u64 round_consts_64bit[24] =
+const u64 _gcry_keccak_round_consts_64bit[24 + 1] =
 {
   U64_C(0x0000000000000001), U64_C(0x0000000000008082),
   U64_C(0x800000000000808A), U64_C(0x8000000080008000),
@@ -122,7 +134,8 @@ static const u64 round_consts_64bit[24] =
   U64_C(0x8000000000008002), U64_C(0x8000000000000080),
   U64_C(0x000000000000800A), U64_C(0x800000008000000A),
   U64_C(0x8000000080008081), U64_C(0x8000000000008080),
-  U64_C(0x0000000080000001), U64_C(0x8000000080008008)
+  U64_C(0x0000000080000001), U64_C(0x8000000080008008),
+  U64_C(0xFFFFFFFFFFFFFFFF)
 };
 
 static unsigned int
@@ -400,6 +413,54 @@ static const keccak_ops_t keccak_bmi2_64_ops =
 #endif /* USE_64BIT_BMI2 */
 
 
+/* 64-bit ARMv7/NEON implementation. */
+#ifdef USE_64BIT_ARM_NEON
+
+unsigned int _gcry_keccak_permute_armv7_neon(u64 *state);
+unsigned int _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, int pos,
+                                                    const byte *lanes,
+                                                    unsigned int nlanes,
+                                                    int blocklanes);
+
+static unsigned int keccak_permute64_armv7_neon(KECCAK_STATE *hd)
+{
+  return _gcry_keccak_permute_armv7_neon(hd->u.state64);
+}
+
+static unsigned int
+keccak_absorb_lanes64_armv7_neon(KECCAK_STATE *hd, int pos, const byte *lanes,
+                                 unsigned int nlanes, int blocklanes)
+{
+  if (blocklanes < 0)
+    {
+      /* blocklanes == -1, permutationless absorb from keccak_final. */
+
+      while (nlanes)
+        {
+          hd->u.state64[pos] ^= buf_get_le64(lanes);
+          lanes += 8;
+          nlanes--;
+        }
+
+      return 0;
+    }
+  else
+    {
+      return _gcry_keccak_absorb_lanes64_armv7_neon(hd->u.state64, pos, lanes,
+                                                    nlanes, blocklanes);
+    }
+}
+
+static const keccak_ops_t keccak_armv7_neon_64_ops =
+{
+  .permute = keccak_permute64_armv7_neon,
+  .absorb = keccak_absorb_lanes64_armv7_neon,
+  .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_ARM_NEON */
+
+
 /* Construct generic 32-bit implementation. */
 #ifdef USE_32BIT
 
@@ -662,6 +723,10 @@ keccak_init (int algo, void *context, unsigned int flags)
   /* Select optimized implementation based in hw features. */
   if (0) {}
+#ifdef USE_64BIT_ARM_NEON
+  else if (features & HWF_ARM_NEON)
+    ctx->ops = &keccak_armv7_neon_64_ops;
+#endif
 #ifdef USE_64BIT_BMI2
   else if (features & HWF_INTEL_BMI2)
     ctx->ops = &keccak_bmi2_64_ops;
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 6f24217d..1a80192c 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -25,7 +25,7 @@
 static unsigned int
 KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 {
-  const u64 *round_consts = round_consts_64bit;
+  const u64 *round_consts = _gcry_keccak_round_consts_64bit;
   u64 Aba, Abe, Abi, Abo, Abu;
   u64 Aga, Age, Agi, Ago, Agu;
   u64 Aka, Ake, Aki, Ako, Aku;
--
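Note (not part of the patch): the extra all-ones entry appended to
_gcry_keccak_round_consts_64bit lets the assembly run the permutation
without a separate round counter; it simply consumes constants until it
meets the terminator.  Below is a minimal C sketch of the same pattern.
The names keccak_round and permute_until_terminator are hypothetical
stand-ins for illustration, not libgcrypt functions; the table is a
copy of the one from keccak.c (the middle entries are the standard
Keccak-f[1600] round constants that the diff context elides).

#include <stdint.h>

/* Round constant table with the new all-ones terminator appended. */
static const uint64_t round_consts[24 + 1] =
  {
    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
    0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
    0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
    0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
    0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
    0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
    0xFFFFFFFFFFFFFFFFULL /* terminator */
  };

/* Hypothetical stand-in for the theta/rho/pi/chi steps; a real
 * implementation mixes all 25 lanes of the state here. */
static void
keccak_round (uint64_t state[25])
{
  (void)state;
}

/* Run rounds until the terminator is reached instead of counting to 24. */
static void
permute_until_terminator (uint64_t state[25])
{
  const uint64_t *rc = round_consts;

  do
    {
      keccak_round (state);
      state[0] ^= *rc++;   /* iota: fold the round constant into lane 0 */
    }
  while (*rc != ~(uint64_t)0);
}

The assembly unrolls four rounds per pass through .LroundLoop and tests
only the low 32 bits of the next table entry (ldr r0, [ip] followed by
cmp r0, #0xFFFFFFFF); that shortcut is safe because no genuine Keccak
round constant has all-ones in its low word.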