diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-10-27 14:07:59 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-10-28 16:19:09 +0200 |
commit | 2cb6e1f323d24359b1c5b113be5c2f79a2a4cded (patch) | |
tree | 344c148104db2f032e5f227615b1ff6b39f910c4 /cipher/serpent.c | |
parent | 3ff9d2571c18cd7a34359f9c60a10d3b0f932b23 (diff) | |
download | libgcrypt-2cb6e1f323d24359b1c5b113be5c2f79a2a4cded.tar.gz |
Add ARM NEON assembly implementation of Serpent
* cipher/Makefile.am: Add 'serpent-armv7-neon.S'.
* cipher/serpent-armv7-neon.S: New.
* cipher/serpent.c (USE_NEON): New macro.
(serpent_context_t) [USE_NEON]: Add 'use_neon'.
[USE_NEON] (_gcry_serpent_neon_ctr_enc, _gcry_serpent_neon_cfb_dec)
(_gcry_serpent_neon_cbc_dec): New prototypes.
(serpent_setkey_internal) [USE_NEON]: Detect NEON support.
(_gcry_serpent_ctr_enc, _gcry_serpent_cfb_dec)
(_gcry_serpent_cbc_dec) [USE_NEON]: Use NEON implementations
to process eight blocks in parallel.
* configure.ac [neonsupport]: Add 'serpent-armv7-neon.lo'.
--
This patch adds an ARM NEON optimized implementation of the Serpent cipher
to speed up parallelizable bulk operations.
Benchmarks on ARM Cortex-A8 (armhf, 1008 MHz):
Old:
SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte
CBC dec | 43.53 ns/B 21.91 MiB/s 43.88 c/B
CFB dec | 44.77 ns/B 21.30 MiB/s 45.13 c/B
CTR enc | 45.21 ns/B 21.10 MiB/s 45.57 c/B
CTR dec | 45.21 ns/B 21.09 MiB/s 45.57 c/B
New:
SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte
CBC dec | 26.26 ns/B 36.32 MiB/s 26.47 c/B
CFB dec | 26.21 ns/B 36.38 MiB/s 26.42 c/B
CTR enc | 26.20 ns/B 36.40 MiB/s 26.41 c/B
CTR dec | 26.20 ns/B 36.40 MiB/s 26.41 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/serpent.c')
-rw-r--r-- | cipher/serpent.c | 122 |
1 files changed, 122 insertions, 0 deletions
diff --git a/cipher/serpent.c b/cipher/serpent.c index a8ee15f3..cfda742e 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -46,6 +46,15 @@ # endif #endif +/* USE_NEON indicates whether to enable ARM NEON assembly code. */ +#undef USE_NEON +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) +# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) +# define USE_NEON 1 +# endif +#endif + /* Number of rounds per Serpent encrypt/decrypt operation. */ #define ROUNDS 32 @@ -71,6 +80,9 @@ typedef struct serpent_context #ifdef USE_AVX2 int use_avx2; #endif +#ifdef USE_NEON + int use_neon; +#endif } serpent_context_t; @@ -114,6 +126,26 @@ extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx, unsigned char *iv); #endif +#ifdef USE_NEON +/* Assembler implementations of Serpent using ARM NEON. Process 8 block in + parallel. + */ +extern void _gcry_serpent_neon_ctr_enc(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr); + +extern void _gcry_serpent_neon_cbc_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); + +extern void _gcry_serpent_neon_cfb_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); +#endif + /* A prototype. */ static const char *serpent_test (void); @@ -634,6 +666,14 @@ serpent_setkey_internal (serpent_context_t *context, } #endif +#ifdef USE_NEON + context->use_neon = 0; + if ((_gcry_get_hw_features () & HWF_ARM_NEON)) + { + context->use_neon = 1; + } +#endif + _gcry_burn_stack (272 * sizeof (u32)); } @@ -861,6 +901,34 @@ _gcry_serpent_ctr_enc(void *context, unsigned char *ctr, } #endif +#ifdef USE_NEON + if (ctx->use_neon) + { + int did_use_neon = 0; + + /* Process data in 8 block chunks. 
*/ + while (nblocks >= 8) + { + _gcry_serpent_neon_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_neon = 1; + } + + if (did_use_neon) + { + /* serpent-neon assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + for ( ;nblocks; nblocks-- ) { /* Encrypt the counter. */ @@ -948,6 +1016,33 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv, } #endif +#ifdef USE_NEON + if (ctx->use_neon) + { + int did_use_neon = 0; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) + { + _gcry_serpent_neon_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_neon = 1; + } + + if (did_use_neon) + { + /* serpent-neon assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + for ( ;nblocks; nblocks-- ) { /* INBUF is needed later and it may be identical to OUTBUF, so store @@ -1028,6 +1123,33 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv, } #endif +#ifdef USE_NEON + if (ctx->use_neon) + { + int did_use_neon = 0; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) + { + _gcry_serpent_neon_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_neon = 1; + } + + if (did_use_neon) + { + /* serpent-neon assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + for ( ;nblocks; nblocks-- ) { serpent_encrypt_internal(ctx, iv, iv); |