diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-10-26 15:00:48 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-10-28 16:12:20 +0200 |
commit | 3ff9d2571c18cd7a34359f9c60a10d3b0f932b23 (patch) | |
tree | 9b33f85e2b1c56e2f7704ebf7e60dddb8bfea69b /cipher/salsa20.c | |
parent | 5a3d43485efdc09912be0967ee0a3ce345b3b15a (diff) | |
download | libgcrypt-3ff9d2571c18cd7a34359f9c60a10d3b0f932b23.tar.gz |
Add ARM NEON assembly implementation of Salsa20
* cipher/Makefile.am: Add 'salsa20-armv7-neon.S'.
* cipher/salsa20-armv7-neon.S: New.
* cipher/salsa20.c [USE_ARM_NEON_ASM]: New macro.
(struct SALSA20_context_s, salsa20_core_t, salsa20_keysetup_t)
(salsa20_ivsetup_t): New.
(SALSA20_context_t) [USE_ARM_NEON_ASM]: Add 'use_neon'.
(SALSA20_context_t): Add 'keysetup', 'ivsetup' and 'core'.
(salsa20_core): Change 'src' argument to 'ctx'.
[USE_ARM_NEON_ASM] (_gcry_arm_neon_salsa20_encrypt): New prototype.
[USE_ARM_NEON_ASM] (salsa20_core_neon, salsa20_keysetup_neon)
(salsa20_ivsetup_neon): New.
(salsa20_do_setkey): Setup keysetup, ivsetup and core with default
functions.
(salsa20_do_setkey) [USE_ARM_NEON_ASM]: When NEON support is detected,
set keysetup, ivsetup and core to the ARM NEON functions.
(salsa20_do_setkey): Call 'ctx->keysetup'.
(salsa20_setiv): Call 'ctx->ivsetup'.
(salsa20_do_encrypt_stream) [USE_ARM_NEON_ASM]: Process large buffers
in ARM NEON implementation.
(salsa20_do_encrypt_stream): Call 'ctx->core' instead of directly
calling 'salsa20_core'.
(selftest): Add test to check large buffer processing and block counter
updating.
* configure.ac [neonsupport]: Add 'salsa20-armv7-neon.lo'.
--
This patch adds a fast ARM NEON assembly implementation of Salsa20. The
implementation gains extra speed by processing three blocks in parallel with
the help of the ARM NEON vector processing unit.
This implementation is based on public domain code by Peter Schwabe and D. J.
Bernstein, which is available in the SUPERCOP benchmarking framework. For more
details on this work, see the paper "NEON crypto" by Daniel J. Bernstein and
Peter Schwabe:
http://cryptojedi.org/papers/#neoncrypto
Benchmark results on Cortex-A8 (1008 MHz):
Before:
SALSA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 18.88 ns/B 50.51 MiB/s 19.03 c/B
STREAM dec | 18.89 ns/B 50.49 MiB/s 19.04 c/B
=
SALSA20R12 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 13.60 ns/B 70.14 MiB/s 13.71 c/B
STREAM dec | 13.60 ns/B 70.13 MiB/s 13.71 c/B
After:
SALSA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 5.48 ns/B 174.1 MiB/s 5.52 c/B
STREAM dec | 5.47 ns/B 174.2 MiB/s 5.52 c/B
=
SALSA20R12 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 3.65 ns/B 260.9 MiB/s 3.68 c/B
STREAM dec | 3.65 ns/B 261.6 MiB/s 3.67 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/salsa20.c')
-rw-r--r-- | cipher/salsa20.c | 131 |
1 files changed, 122 insertions, 9 deletions
diff --git a/cipher/salsa20.c b/cipher/salsa20.c index 892b9fc2..f708b184 100644 --- a/cipher/salsa20.c +++ b/cipher/salsa20.c @@ -47,6 +47,15 @@ # define USE_AMD64 1 #endif +/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */ +#undef USE_ARM_NEON_ASM +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) +# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) +# define USE_ARM_NEON_ASM 1 +# endif +#endif + #define SALSA20_MIN_KEY_SIZE 16 /* Bytes. */ #define SALSA20_MAX_KEY_SIZE 32 /* Bytes. */ @@ -60,7 +69,16 @@ #define SALSA20R12_ROUNDS 12 -typedef struct +struct SALSA20_context_s; + +typedef unsigned int (*salsa20_core_t) (u32 *dst, struct SALSA20_context_s *ctx, + unsigned int rounds); +typedef void (* salsa20_keysetup_t)(struct SALSA20_context_s *ctx, + const byte *key, int keylen); +typedef void (* salsa20_ivsetup_t)(struct SALSA20_context_s *ctx, + const byte *iv); + +typedef struct SALSA20_context_s { /* Indices 1-4 and 11-14 holds the key (two identical copies for the shorter key size), indices 0, 5, 10, 15 are constant, indices 6, 7 @@ -74,6 +92,12 @@ typedef struct u32 input[SALSA20_INPUT_LENGTH]; u32 pad[SALSA20_INPUT_LENGTH]; unsigned int unused; /* bytes in the pad. 
*/ +#ifdef USE_ARM_NEON_ASM + int use_neon; +#endif + salsa20_keysetup_t keysetup; + salsa20_ivsetup_t ivsetup; + salsa20_core_t core; } SALSA20_context_t; @@ -113,10 +137,10 @@ salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv) } static unsigned int -salsa20_core (u32 *dst, u32 *src, unsigned int rounds) +salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds) { memset(dst, 0, SALSA20_BLOCK_SIZE); - return _gcry_salsa20_amd64_encrypt_blocks(src, dst, dst, 1, rounds); + return _gcry_salsa20_amd64_encrypt_blocks(ctx->input, dst, dst, 1, rounds); } #else /* USE_AMD64 */ @@ -149,9 +173,9 @@ salsa20_core (u32 *dst, u32 *src, unsigned int rounds) } while(0) static unsigned int -salsa20_core (u32 *dst, u32 *src, unsigned int rounds) +salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned rounds) { - u32 pad[SALSA20_INPUT_LENGTH]; + u32 pad[SALSA20_INPUT_LENGTH], *src = ctx->input; unsigned int i; memcpy (pad, src, sizeof(pad)); @@ -236,6 +260,49 @@ static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv) #endif /*!USE_AMD64*/ +#ifdef USE_ARM_NEON_ASM + +/* ARM NEON implementation of Salsa20. */ +unsigned int +_gcry_arm_neon_salsa20_encrypt(void *c, const void *m, unsigned int nblks, + void *k, unsigned int rounds); + +static unsigned int +salsa20_core_neon (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds) +{ + return _gcry_arm_neon_salsa20_encrypt(dst, NULL, 1, ctx->input, rounds); +} + +static void salsa20_ivsetup_neon(SALSA20_context_t *ctx, const byte *iv) +{ + memcpy(ctx->input + 8, iv, 8); + /* Reset the block counter. */ + memset(ctx->input + 10, 0, 8); +} + +static void +salsa20_keysetup_neon(SALSA20_context_t *ctx, const byte *key, int klen) +{ + static const unsigned char sigma32[16] = "expand 32-byte k"; + static const unsigned char sigma16[16] = "expand 16-byte k"; + + if (klen == 16) + { + memcpy (ctx->input, key, 16); + memcpy (ctx->input + 4, key, 16); /* Duplicate 128-bit key. 
*/ + memcpy (ctx->input + 12, sigma16, 16); + } + else + { + /* 32-byte key */ + memcpy (ctx->input, key, 32); + memcpy (ctx->input + 12, sigma32, 16); + } +} + +#endif /*USE_ARM_NEON_ASM*/ + + static gcry_err_code_t salsa20_do_setkey (SALSA20_context_t *ctx, const byte *key, unsigned int keylen) @@ -257,7 +324,23 @@ salsa20_do_setkey (SALSA20_context_t *ctx, && keylen != SALSA20_MAX_KEY_SIZE) return GPG_ERR_INV_KEYLEN; - salsa20_keysetup (ctx, key, keylen); + /* Default ops. */ + ctx->keysetup = salsa20_keysetup; + ctx->ivsetup = salsa20_ivsetup; + ctx->core = salsa20_core; + +#ifdef USE_ARM_NEON_ASM + ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; + if (ctx->use_neon) + { + /* Use ARM NEON ops instead. */ + ctx->keysetup = salsa20_keysetup_neon; + ctx->ivsetup = salsa20_ivsetup_neon; + ctx->core = salsa20_core_neon; + } +#endif + + ctx->keysetup (ctx, key, keylen); /* We default to a zero nonce. */ salsa20_setiv (ctx, NULL, 0); @@ -290,7 +373,7 @@ salsa20_setiv (void *context, const byte *iv, unsigned int ivlen) else memcpy (tmp, iv, SALSA20_IV_SIZE); - salsa20_ivsetup (ctx, tmp); + ctx->ivsetup (ctx, tmp); /* Reset the unused pad bytes counter. */ ctx->unused = 0; @@ -340,12 +423,24 @@ salsa20_do_encrypt_stream (SALSA20_context_t *ctx, } #endif +#ifdef USE_ARM_NEON_ASM + if (ctx->use_neon && length >= SALSA20_BLOCK_SIZE) + { + unsigned int nblocks = length / SALSA20_BLOCK_SIZE; + _gcry_arm_neon_salsa20_encrypt (outbuf, inbuf, nblocks, ctx->input, + rounds); + length -= SALSA20_BLOCK_SIZE * nblocks; + outbuf += SALSA20_BLOCK_SIZE * nblocks; + inbuf += SALSA20_BLOCK_SIZE * nblocks; + } +#endif + while (length > 0) { /* Create the next pad and bump the block counter. Note that it is the user's duty to change to another nonce not later than after 2^70 processed bytes. */ - nburn = salsa20_core (ctx->pad, ctx->input, rounds); + nburn = ctx->core (ctx->pad, ctx, rounds); burn = nburn > burn ? 
nburn : burn; if (length <= SALSA20_BLOCK_SIZE) @@ -386,12 +481,13 @@ salsa20r12_encrypt_stream (void *context, } - static const char* selftest (void) { SALSA20_context_t ctx; byte scratch[8+1]; + byte buf[256+64+4]; + int i; static byte key_1[] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -418,6 +514,23 @@ selftest (void) salsa20_encrypt_stream (&ctx, scratch, scratch, sizeof plaintext_1); if (memcmp (scratch, plaintext_1, sizeof plaintext_1)) return "Salsa20 decryption test 1 failed."; + + for (i = 0; i < sizeof buf; i++) + buf[i] = i; + salsa20_setkey (&ctx, key_1, sizeof key_1); + salsa20_setiv (&ctx, nonce_1, sizeof nonce_1); + /*encrypt*/ + salsa20_encrypt_stream (&ctx, buf, buf, sizeof buf); + /*decrypt*/ + salsa20_setkey (&ctx, key_1, sizeof key_1); + salsa20_setiv (&ctx, nonce_1, sizeof nonce_1); + salsa20_encrypt_stream (&ctx, buf, buf, 1); + salsa20_encrypt_stream (&ctx, buf+1, buf+1, (sizeof buf)-1-1); + salsa20_encrypt_stream (&ctx, buf+(sizeof buf)-1, buf+(sizeof buf)-1, 1); + for (i = 0; i < sizeof buf; i++) + if (buf[i] != (byte)i) + return "Salsa20 encryption test 2 failed."; + return NULL; } |