summaryrefslogtreecommitdiff
path: root/cipher/serpent.c
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2013-10-27 14:07:59 +0200
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2013-10-28 16:19:09 +0200
commit2cb6e1f323d24359b1c5b113be5c2f79a2a4cded (patch)
tree344c148104db2f032e5f227615b1ff6b39f910c4 /cipher/serpent.c
parent3ff9d2571c18cd7a34359f9c60a10d3b0f932b23 (diff)
downloadlibgcrypt-2cb6e1f323d24359b1c5b113be5c2f79a2a4cded.tar.gz
Add ARM NEON assembly implementation of Serpent
* cipher/Makefile.am: Add 'serpent-armv7-neon.S'. * cipher/serpent-armv7-neon.S: New. * cipher/serpent.c (USE_NEON): New macro. (serpent_context_t) [USE_NEON]: Add 'use_neon'. [USE_NEON] (_gcry_serpent_neon_ctr_enc, _gcry_serpent_neon_cfb_dec) (_gcry_serpent_neon_cbc_dec): New prototypes. (serpent_setkey_internal) [USE_NEON]: Detect NEON support. (_gcry_serpent_neon_ctr_enc, _gcry_serpent_neon_cfb_dec) (_gcry_serpent_neon_cbc_dec) [USE_NEON]: Use NEON implementations to process eight blocks in parallel. * configure.ac [neonsupport]: Add 'serpent-armv7-neon.lo'. -- Patch adds ARM NEON optimized implementation of Serpent cipher to speed up parallelizable bulk operations. Benchmarks on ARM Cortex-A8 (armhf, 1008 Mhz): Old: SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte CBC dec | 43.53 ns/B 21.91 MiB/s 43.88 c/B CFB dec | 44.77 ns/B 21.30 MiB/s 45.13 c/B CTR enc | 45.21 ns/B 21.10 MiB/s 45.57 c/B CTR dec | 45.21 ns/B 21.09 MiB/s 45.57 c/B New: SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte CBC dec | 26.26 ns/B 36.32 MiB/s 26.47 c/B CFB dec | 26.21 ns/B 36.38 MiB/s 26.42 c/B CTR enc | 26.20 ns/B 36.40 MiB/s 26.41 c/B CTR dec | 26.20 ns/B 36.40 MiB/s 26.41 c/B Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/serpent.c')
-rw-r--r--cipher/serpent.c122
1 files changed, 122 insertions, 0 deletions
diff --git a/cipher/serpent.c b/cipher/serpent.c
index a8ee15f3..cfda742e 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -46,6 +46,15 @@
# endif
#endif
+/* USE_NEON indicates whether to enable ARM NEON assembly code. */
+#undef USE_NEON
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_NEON 1
+# endif
+#endif
+
/* Number of rounds per Serpent encrypt/decrypt operation. */
#define ROUNDS 32
@@ -71,6 +80,9 @@ typedef struct serpent_context
#ifdef USE_AVX2
int use_avx2;
#endif
+#ifdef USE_NEON
+ int use_neon;
+#endif
} serpent_context_t;
@@ -114,6 +126,26 @@ extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx,
unsigned char *iv);
#endif
+#ifdef USE_NEON
+/* Assembler implementations of Serpent using ARM NEON. Process 8 block in
+ parallel.
+ */
+extern void _gcry_serpent_neon_ctr_enc(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr);
+
+extern void _gcry_serpent_neon_cbc_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
+
+extern void _gcry_serpent_neon_cfb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
+#endif
+
/* A prototype. */
static const char *serpent_test (void);
@@ -634,6 +666,14 @@ serpent_setkey_internal (serpent_context_t *context,
}
#endif
+#ifdef USE_NEON
+ context->use_neon = 0;
+ if ((_gcry_get_hw_features () & HWF_ARM_NEON))
+ {
+ context->use_neon = 1;
+ }
+#endif
+
_gcry_burn_stack (272 * sizeof (u32));
}
@@ -861,6 +901,34 @@ _gcry_serpent_ctr_enc(void *context, unsigned char *ctr,
}
#endif
+#ifdef USE_NEON
+ if (ctx->use_neon)
+ {
+ int did_use_neon = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_neon_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
+ }
+
+ if (did_use_neon)
+ {
+ /* serpent-neon assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ /* TODO: use caching instead? */
+ }
+#endif
+
for ( ;nblocks; nblocks-- )
{
/* Encrypt the counter. */
@@ -948,6 +1016,33 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv,
}
#endif
+#ifdef USE_NEON
+ if (ctx->use_neon)
+ {
+ int did_use_neon = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_neon_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
+ }
+
+ if (did_use_neon)
+ {
+ /* serpent-neon assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
for ( ;nblocks; nblocks-- )
{
/* INBUF is needed later and it may be identical to OUTBUF, so store
@@ -1028,6 +1123,33 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv,
}
#endif
+#ifdef USE_NEON
+ if (ctx->use_neon)
+ {
+ int did_use_neon = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_neon_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_neon = 1;
+ }
+
+ if (did_use_neon)
+ {
+ /* serpent-neon assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
for ( ;nblocks; nblocks-- )
{
serpent_encrypt_internal(ctx, iv, iv);