author     Jussi Kivilinna <jussi.kivilinna@iki.fi>    2013-08-31 12:48:31 +0300
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>    2013-08-31 13:31:34 +0300
commit     99d15543b8d94a8f1ef66c6ccb862b0ce82c514d (patch)
tree       1aa148d6c41647926f23607d7851a8d1e3f33aff /cipher/sha512-armv7-neon.S
parent     03da7f8ba3ec24d4639a2bcebbc0d9d831734c08 (diff)
download   libgcrypt-99d15543b8d94a8f1ef66c6ccb862b0ce82c514d.tar.gz
sha512: add ARM/NEON assembly version of transform function
* cipher/Makefile.am: Add 'sha512-armv7-neon.S'.
* cipher/sha512-armv7-neon.S: New file.
* cipher/sha512.c (USE_ARM_NEON_ASM): New macro.
(SHA512_CONTEXT) [USE_ARM_NEON_ASM]: Add 'use_neon'.
(sha512_init, sha384_init) [USE_ARM_NEON_ASM]: Enable 'use_neon' if the
CPU supports NEON instructions.
(k): Round constant array moved outside of 'transform' function.
(__transform): Renamed from 'transform' function.
[USE_ARM_NEON_ASM] (_gcry_sha512_transform_armv7_neon): New prototype.
(transform): New wrapper function for different transform versions (see the
sketch after this change list).
(sha512_write, sha512_final): Burn stack by the amount returned by the
transform function.
* configure.ac (sha512) [neonsupport]: Add 'sha512-armv7-neon.lo'.
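As a rough illustration of the sha512.c changes listed above, here is a minimal
C sketch of the 'use_neon' flag and the 'transform' wrapper. Only the names
'use_neon', 'transform', '__transform', 'k' and
'_gcry_sha512_transform_armv7_neon' come from this change; the context layout,
the placement of the CPU-feature check and the stack-burn amounts are
illustrative assumptions, not the literal patch code.

    /* Sketch only: simplified types, assumed burn depths. */
    typedef unsigned long long u64;

    typedef struct
    {
      u64 h0, h1, h2, h3, h4, h5, h6, h7;  /* hash state */
      unsigned char buf[128];              /* one 1024-bit input block */
      /* ... byte counters elided ... */
    #ifdef USE_ARM_NEON_ASM
      unsigned int use_neon;  /* set by sha512_init/sha384_init when the
                                 CPU supports NEON instructions */
    #endif
    } SHA512_CONTEXT;

    /* Round constants; moved to file scope so the assembly can take them
       as an argument (declared extern here only for the sketch). */
    extern const u64 k[80];

    #ifdef USE_ARM_NEON_ASM
    /* Assembly routine added by this patch; processes one 128-byte block. */
    void _gcry_sha512_transform_armv7_neon (SHA512_CONTEXT *hd,
                                            const unsigned char *data,
                                            const u64 k[]);
    #endif

    /* Generic C transform, renamed from 'transform'; defined elsewhere
       in sha512.c. */
    static unsigned int __transform (SHA512_CONTEXT *hd,
                                     const unsigned char *data);

    /* Wrapper selecting the fastest available transform; returns how much
       stack the caller should burn afterwards. */
    static unsigned int
    transform (SHA512_CONTEXT *hd, const unsigned char *data)
    {
    #ifdef USE_ARM_NEON_ASM
      if (hd->use_neon)
        {
          _gcry_sha512_transform_armv7_neon (hd, data, k);
          return 128;                 /* assumed burn depth, NEON path */
        }
    #endif
      return __transform (hd, data);  /* generic path reports its own depth */
    }

sha512_write and sha512_final then pass the value returned by transform() to
_gcry_burn_stack().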
--
Add a NEON assembly implementation of the transform function for faster SHA-512
on ARM. The major speed-up comes from NEON's 64-bit registers and its large
register file, which can hold the full 16-word input block.
Benchmark results on Cortex-A8, 1 GHz:
Old:
$ tests/benchmark --hash-repetitions 100 md sha512 sha384
SHA512 17050ms 18780ms 29120ms 18040ms 17190ms
SHA384 17130ms 18720ms 29160ms 18090ms 17280ms
New:
$ tests/benchmark --hash-repetitions 100 md sha512 sha384
SHA512 3600ms 5070ms 15330ms 4510ms 3480ms
SHA384 3590ms 5060ms 15350ms 4510ms 3520ms
New vs old:
SHA512 4.74x 3.70x 1.90x 4.00x 4.94x
SHA384 4.77x 3.70x 1.90x 4.01x 4.91x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/sha512-armv7-neon.S')
-rw-r--r--   cipher/sha512-armv7-neon.S   316
1 file changed, 316 insertions, 0 deletions
diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S
new file mode 100644
index 00000000..042b15a6
--- /dev/null
+++ b/cipher/sha512-armv7-neon.S
@@ -0,0 +1,316 @@
+/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+
+/* register macros */
+#define RK %r2
+
+#define RA d0
+#define RB d1
+#define RC d2
+#define RD d3
+#define RE d4
+#define RF d5
+#define RG d6
+#define RH d7
+
+#define RT0 d8
+#define RT1 d9
+#define RT2 d10
+#define RT3 d11
+#define RT4 d12
+#define RT5 d13
+#define RT6 d14
+#define RT7 d15
+
+#define RW0 d16
+#define RW1 d17
+#define RW2 d18
+#define RW3 d19
+#define RW4 d20
+#define RW5 d21
+#define RW6 d22
+#define RW7 d23
+#define RW8 d24
+#define RW9 d25
+#define RW10 d26
+#define RW11 d27
+#define RW12 d28
+#define RW13 d29
+#define RW14 d30
+#define RW15 d31
+
+#define RW01q q8
+#define RW23q q9
+#define RW45q q10
+#define RW67q q11
+#define RW89q q12
+#define RW1011q q13
+#define RW1213q q14
+#define RW1415q q15
+
+/***********************************************************************
+ * ARM assembly implementation of sha512 transform
+ ***********************************************************************/
+#define round_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw14, rw9, rw1) \
+        /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+        vshr.u64 RT1, re, #14; \
+        vshl.u64 RT3, re, #64 - 14; \
+        vshr.u64 RT4, re, #18; \
+        vshl.u64 RT5, re, #64 - 18; \
+        veor.64 RT1, RT1, RT3; \
+        vld1.64 {RT0}, [RK]!; \
+        veor.64 RT1, RT1, RT4; \
+        vshr.u64 RT3, re, #41; \
+        vshl.u64 RT4, re, #64 - 41; \
+        veor.64 RT1, RT1, RT5; \
+        vadd.u64 RT0, RT0, rw0; \
+        veor.64 RT1, RT1, RT3; \
+        vand.64 RT2, re, rf; \
+        veor.64 RT1, RT1, RT4; \
+        vbic.64 RT6, rg, re; \
+        \
+        vadd.u64 RT1, RT1, rh; \
+        veor.64 RT2, RT2, RT6; \
+        vshr.u64 rh, ra, #28; \
+        vshl.u64 RT3, ra, #64 - 28; \
+        vadd.u64 RT1, RT1, RT0; \
+        vshr.u64 RT4, ra, #34; \
+        veor.64 rh, rh, RT3; \
+        vshl.u64 RT5, ra, #64 - 34; \
+        vadd.u64 RT1, RT1, RT2; \
+        \
+        /* h = Sum0 (a) + Maj (a, b, c); */ \
+        veor.64 rh, rh, RT4; \
+        vshr.u64 RT3, ra, #39; \
+        vshl.u64 RT4, ra, #64 - 39; \
+        vorr.64 RT6, ra, rb; \
+        vand.64 RT0, ra, rb; \
+        veor.64 rh, rh, RT5; \
+        vand.64 RT6, RT6, rc; \
+        veor.64 rh, rh, RT3; \
+        vorr.64 RT0, RT0, RT6; \
+        veor.64 rh, rh, RT4; \
+        vshr.u64 RT4, rw14, #19; \
+        vadd.u64 rh, rh, RT0; \
+        vshl.u64 RT2, rw14, #64 - 19; \
+        \
+        /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
+        vshr.u64 RT3, rw14, #61; \
+        vshl.u64 RT6, rw14, #64 - 61; \
+        veor.64 RT0, RT4, RT2; \
+        vshr.u64 RT2, rw14, #6; \
+        veor.64 RT0, RT0, RT3; \
+        vshr.u64 RT7, rw1, #1; \
+        veor.64 RT0, RT0, RT6; \
+        vshl.u64 RT4, rw1, #64 - 1; \
+        veor.64 RT0, RT0, RT2; \
+        vshr.u64 RT5, rw1, #8; \
+        vadd.u64 rw0, rw0, RT0; \
+        vshl.u64 RT6, rw1, #64 - 8; \
+        veor.64 RT7, RT7, RT4; \
+        vshr.u64 RT4, rw1, #7; \
+        veor.64 RT7, RT7, RT5; \
+        vadd.u64 rw0, rw0, rw9; /* w[0]+=w[9]; */ \
+        veor.64 RT7, RT7, RT6; \
+        vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+        veor.64 RT7, RT7, RT4; \
+        vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+        vadd.u64 rw0, rw0, RT7; \
+
+#define round_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0) \
+        /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+        vld1.64 {RT0}, [RK]!; \
+        vshr.u64 RT1, re, #14; \
+        vshl.u64 RT3, re, #64 - 14; \
+        vshr.u64 RT4, re, #18; \
+        vshl.u64 RT5, re, #64 - 18; \
+        veor.64 RT1, RT1, RT3; \
+        vshr.u64 RT7, ra, #28; \
+        veor.64 RT1, RT1, RT4; \
+        vshr.u64 RT3, re, #41; \
+        vshl.u64 RT4, re, #64 - 41; \
+        veor.64 RT1, RT1, RT5; \
+        vadd.u64 RT0, RT0, rw0; \
+        veor.64 RT1, RT1, RT3; \
+        vand.64 RT2, re, rf; \
+        veor.64 RT1, RT1, RT4; \
+        vbic.64 RT6, rg, re; \
+        \
+        vadd.u64 RT1, RT1, rh; \
+        veor.64 RT2, RT2, RT6; \
+        vadd.u64 RT1, RT1, RT0; \
+        vshr.u64 RT4, ra, #34; \
+        vshl.u64 RT5, ra, #64 - 34; \
+        \
+        /* t7 = Sum0 (a) + Maj (a, b, c); */ \
+        vshl.u64 RT6, ra, #64 - 28; \
+        veor.64 RT7, RT7, RT4; \
+        vshr.u64 RT3, ra, #39; \
+        veor.64 RT7, RT7, RT6; \
+        vshl.u64 RT4, ra, #64 - 39; \
+        vorr.64 RT6, ra, rb; \
+        vand.64 RT0, ra, rb; \
+        veor.64 RT7, RT7, RT5; \
+        vand.64 RT6, RT6, rc; \
+        veor.64 RT7, RT7, RT3; \
+        vorr.64 RT0, RT0, RT6; \
+        veor.64 RT7, RT7, RT4; \
+        vadd.u64 RT1, RT1, RT2; \
+        vadd.u64 RT7, RT7, RT0; \
+        vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+        vadd.u64 rh, RT7, RT1; /* h=t7+t1; */
+
+.align 3
+.globl _gcry_sha512_transform_armv7_neon
+.type _gcry_sha512_transform_armv7_neon,%function;
+
+_gcry_sha512_transform_armv7_neon:
+        /* Input:
+         *      %r0: SHA512_CONTEXT
+         *      %r1: data
+         *      %r2: u64 k[] constants
+         */
+        mov %r3, #0;
+
+        /* Load context to d0-d7 */
+        vld1.64 {RA-RD}, [%r0]!;
+        vld1.64 {RE-RH}, [%r0];
+        sub %r0, #(4*8);
+
+        /* Load input to w[16], d16-d31 */
+        /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
+        vld1.64 {RW0-RW3}, [%r1]!;
+        vld1.64 {RW4-RW7}, [%r1]!;
+        vld1.64 {RW8-RW11}, [%r1]!;
+        vld1.64 {RW12-RW15}, [%r1];
+#ifdef __ARMEL__
+        /* byteswap */
+        vrev64.8 RW01q, RW01q;
+        vrev64.8 RW23q, RW23q;
+        vrev64.8 RW45q, RW45q;
+        vrev64.8 RW67q, RW67q;
+        vrev64.8 RW89q, RW89q;
+        vrev64.8 RW1011q, RW1011q;
+        vrev64.8 RW1213q, RW1213q;
+        vrev64.8 RW1415q, RW1415q;
+#endif
+
+        /* EABI says that d8-d15 must be preserved by callee. */
+        vpush {RT0-RT7};
+
+.Loop:
+        add %r3, #16;
+        round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW14, RW9, RW1);
+        cmp %r3, #64;
+        round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW1, RW15, RW10, RW2);
+        round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW0, RW11, RW3);
+        round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW3, RW1, RW12, RW4);
+        round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW2, RW13, RW5);
+        round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW5, RW3, RW14, RW6);
+        round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW4, RW15, RW7);
+        round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW7, RW5, RW0, RW8);
+        round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW6, RW1, RW9);
+        round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW9, RW7, RW2, RW10);
+        round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW8, RW3, RW11);
+        round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW11, RW9, RW4, RW12);
+        round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW10, RW5, RW13);
+        round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW13, RW11, RW6, RW14);
+        round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW12, RW7, RW15);
+        round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW15, RW13, RW8, RW0);
+        bne .Loop;
+
+        round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0);
+        round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW1);
+        round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2);
+        round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW3);
+        round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4);
+        round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW5);
+        round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6);
+        round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW7);
+        round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8);
+        round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW9);
+        round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10);
+        round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW11);
+        round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12);
+        round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW13);
+        round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14);
+        round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW15);
+
+        /* Load context to d16-d23 */
+        vld1.64 {RW0-RW3}, [%r0]!;
+        vld1.64 {RW4-RW7}, [%r0];
+        sub %r0, #(4*8);
+
+        vadd.u64 RA, RW0;
+        vadd.u64 RB, RW1;
+        vadd.u64 RC, RW2;
+        vadd.u64 RD, RW3;
+        vadd.u64 RE, RW4;
+        vadd.u64 RF, RW5;
+        vadd.u64 RG, RW6;
+        vadd.u64 RH, RW7;
+
+        /* Store the first half of context */
+        vst1.64 {RA-RD}, [%r0]!;
+
+        /* Clear used registers */
+        /* d16-d31 */
+        veor.u64 RW01q, RW01q;
+        veor.u64 RW23q, RW23q;
+        veor.u64 RW45q, RW45q;
+        veor.u64 RW67q, RW67q;
+        vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+        veor.u64 RW89q, RW89q;
+        veor.u64 RW1011q, RW1011q;
+        veor.u64 RW1213q, RW1213q;
+        veor.u64 RW1415q, RW1415q;
+        /* d8-d15 */
+        vpop {RT0-RT7};
+        /* d0-d7 (q0-q3) */
+        veor.u64 %q0, %q0;
+        veor.u64 %q1, %q1;
+        veor.u64 %q2, %q2;
+        veor.u64 %q3, %q3;
+
+        bx %lr;
+.size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon;
+
+#endif
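For readers cross-checking the NEON macros, the reference sketch below (not part
of the patch) shows in plain C what one invocation of round_0_63 computes: a
single SHA-512 round plus the in-place update of one message-schedule word, as
spelled out in the macro's own comments. ARMv7 NEON has no 64-bit rotate
instruction, which is why each rotate above is built from a
vshr.u64/vshl.u64/veor.64 triple; the helper name 'ror64' and the function name
'round_0_63_c' are invented for this sketch.

    #include <stdint.h>

    /* Rotate right by n bits; this is what each vshr.u64 (#n),
       vshl.u64 (#64 - n), veor.64 triple in the assembly computes. */
    static inline uint64_t
    ror64 (uint64_t x, unsigned int n)
    {
      return (x >> n) | (x << (64 - n));
    }

    /* C equivalent of one round_0_63 expansion.  The argument order mirrors
       the macro; the round constant, loaded via "vld1.64 {RT0}, [RK]!" in
       the assembly, is passed explicitly here. */
    static void
    round_0_63_c (uint64_t ra, uint64_t rb, uint64_t rc, uint64_t *rd,
                  uint64_t re, uint64_t rf, uint64_t rg, uint64_t *rh,
                  uint64_t *rw0, uint64_t rw14, uint64_t rw9, uint64_t rw1,
                  uint64_t k)
    {
      /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */
      uint64_t sum1 = ror64 (re, 14) ^ ror64 (re, 18) ^ ror64 (re, 41);
      uint64_t ch   = (re & rf) ^ (~re & rg);        /* vand.64 + vbic.64 */
      uint64_t t1   = *rh + sum1 + ch + k + *rw0;

      /* Sum0 (a) and Maj (a, b, c), in the vand/vorr form used above. */
      uint64_t sum0 = ror64 (ra, 28) ^ ror64 (ra, 34) ^ ror64 (ra, 39);
      uint64_t maj  = (ra & rb) | ((ra | rb) & rc);

      /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */
      uint64_t s1 = ror64 (rw14, 19) ^ ror64 (rw14, 61) ^ (rw14 >> 6);
      uint64_t s0 = ror64 (rw1, 1) ^ ror64 (rw1, 8) ^ (rw1 >> 7);
      *rw0 += s1 + rw9 + s0;

      *rd += t1;               /* d += t1 */
      *rh  = sum0 + maj + t1;  /* h  = Sum0 (a) + Maj (a, b, c) + t1 */
    }

round_64_79 performs the same round without the schedule update, since the last
16 rounds only consume w[]; keeping the whole 16-entry schedule in d16-d31 is
what the commit message means by the register file holding the full input
buffer.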