summaryrefslogtreecommitdiff
path: root/cipher/salsa20.c
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2013-10-26 15:00:48 +0300
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2013-10-28 16:12:20 +0200
commit3ff9d2571c18cd7a34359f9c60a10d3b0f932b23 (patch)
tree9b33f85e2b1c56e2f7704ebf7e60dddb8bfea69b /cipher/salsa20.c
parent5a3d43485efdc09912be0967ee0a3ce345b3b15a (diff)
downloadlibgcrypt-3ff9d2571c18cd7a34359f9c60a10d3b0f932b23.tar.gz
Add ARM NEON assembly implementation of Salsa20
* cipher/Makefile.am: Add 'salsa20-armv7-neon.S'. * cipher/salsa20-armv7-neon.S: New. * cipher/salsa20.c [USE_ARM_NEON_ASM]: New macro. (struct SALSA20_context_s, salsa20_core_t, salsa20_keysetup_t) (salsa20_ivsetup_t): New. (SALSA20_context_t) [USE_ARM_NEON_ASM]: Add 'use_neon'. (SALSA20_context_t): Add 'keysetup', 'ivsetup' and 'core'. (salsa20_core): Change 'src' argument to 'ctx'. [USE_ARM_NEON_ASM] (_gcry_arm_neon_salsa20_encrypt): New prototype. [USE_ARM_NEON_ASM] (salsa20_core_neon, salsa20_keysetup_neon) (salsa20_ivsetup_neon): New. (salsa20_do_setkey): Setup keysetup, ivsetup and core with default functions. (salsa20_do_setkey) [USE_ARM_NEON_ASM]: When NEON support detect, set keysetup, ivsetup and core with ARM NEON functions. (salsa20_do_setkey): Call 'ctx->keysetup'. (salsa20_setiv): Call 'ctx->ivsetup'. (salsa20_do_encrypt_stream) [USE_ARM_NEON_ASM]: Process large buffers in ARM NEON implementation. (salsa20_do_encrypt_stream): Call 'ctx->core' instead of directly calling 'salsa20_core'. (selftest): Add test to check large buffer processing and block counter updating. * configure.ac [neonsupport]: 'Add salsa20-armv7-neon.lo'. -- Patch adds fast ARM NEON assembly implementation for Salsa20. Implementation gains extra speed by processing three blocks in parallel with help of ARM NEON vector processing unit. This implementation is based on public domain code by Peter Schwabe and D. J. Bernstein and it is available in SUPERCOP benchmarking framework. For more details on this work, check paper "NEON crypto" by Daniel J. Bernstein and Peter Schwabe: http://cryptojedi.org/papers/#neoncrypto Benchmark results on Cortex-A8 (1008 Mhz): Before: SALSA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 18.88 ns/B 50.51 MiB/s 19.03 c/B STREAM dec | 18.89 ns/B 50.49 MiB/s 19.04 c/B = SALSA20R12 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 13.60 ns/B 70.14 MiB/s 13.71 c/B STREAM dec | 13.60 ns/B 70.13 MiB/s 13.71 c/B After: SALSA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 5.48 ns/B 174.1 MiB/s 5.52 c/B STREAM dec | 5.47 ns/B 174.2 MiB/s 5.52 c/B = SALSA20R12 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 3.65 ns/B 260.9 MiB/s 3.68 c/B STREAM dec | 3.65 ns/B 261.6 MiB/s 3.67 c/B Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/salsa20.c')
-rw-r--r--cipher/salsa20.c131
1 files changed, 122 insertions, 9 deletions
diff --git a/cipher/salsa20.c b/cipher/salsa20.c
index 892b9fc2..f708b184 100644
--- a/cipher/salsa20.c
+++ b/cipher/salsa20.c
@@ -47,6 +47,15 @@
# define USE_AMD64 1
#endif
+/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
+#undef USE_ARM_NEON_ASM
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_ARM_NEON_ASM 1
+# endif
+#endif
+
#define SALSA20_MIN_KEY_SIZE 16 /* Bytes. */
#define SALSA20_MAX_KEY_SIZE 32 /* Bytes. */
@@ -60,7 +69,16 @@
#define SALSA20R12_ROUNDS 12
-typedef struct
+struct SALSA20_context_s;
+
+typedef unsigned int (*salsa20_core_t) (u32 *dst, struct SALSA20_context_s *ctx,
+ unsigned int rounds);
+typedef void (* salsa20_keysetup_t)(struct SALSA20_context_s *ctx,
+ const byte *key, int keylen);
+typedef void (* salsa20_ivsetup_t)(struct SALSA20_context_s *ctx,
+ const byte *iv);
+
+typedef struct SALSA20_context_s
{
/* Indices 1-4 and 11-14 holds the key (two identical copies for the
shorter key size), indices 0, 5, 10, 15 are constant, indices 6, 7
@@ -74,6 +92,12 @@ typedef struct
u32 input[SALSA20_INPUT_LENGTH];
u32 pad[SALSA20_INPUT_LENGTH];
unsigned int unused; /* bytes in the pad. */
+#ifdef USE_ARM_NEON_ASM
+ int use_neon;
+#endif
+ salsa20_keysetup_t keysetup;
+ salsa20_ivsetup_t ivsetup;
+ salsa20_core_t core;
} SALSA20_context_t;
@@ -113,10 +137,10 @@ salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
}
static unsigned int
-salsa20_core (u32 *dst, u32 *src, unsigned int rounds)
+salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds)
{
memset(dst, 0, SALSA20_BLOCK_SIZE);
- return _gcry_salsa20_amd64_encrypt_blocks(src, dst, dst, 1, rounds);
+ return _gcry_salsa20_amd64_encrypt_blocks(ctx->input, dst, dst, 1, rounds);
}
#else /* USE_AMD64 */
@@ -149,9 +173,9 @@ salsa20_core (u32 *dst, u32 *src, unsigned int rounds)
} while(0)
static unsigned int
-salsa20_core (u32 *dst, u32 *src, unsigned int rounds)
+salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned rounds)
{
- u32 pad[SALSA20_INPUT_LENGTH];
+ u32 pad[SALSA20_INPUT_LENGTH], *src = ctx->input;
unsigned int i;
memcpy (pad, src, sizeof(pad));
@@ -236,6 +260,49 @@ static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
#endif /*!USE_AMD64*/
+#ifdef USE_ARM_NEON_ASM
+
+/* ARM NEON implementation of Salsa20. */
+unsigned int
+_gcry_arm_neon_salsa20_encrypt(void *c, const void *m, unsigned int nblks,
+ void *k, unsigned int rounds);
+
+static unsigned int
+salsa20_core_neon (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds)
+{
+ return _gcry_arm_neon_salsa20_encrypt(dst, NULL, 1, ctx->input, rounds);
+}
+
+static void salsa20_ivsetup_neon(SALSA20_context_t *ctx, const byte *iv)
+{
+ memcpy(ctx->input + 8, iv, 8);
+ /* Reset the block counter. */
+ memset(ctx->input + 10, 0, 8);
+}
+
+static void
+salsa20_keysetup_neon(SALSA20_context_t *ctx, const byte *key, int klen)
+{
+ static const unsigned char sigma32[16] = "expand 32-byte k";
+ static const unsigned char sigma16[16] = "expand 16-byte k";
+
+ if (klen == 16)
+ {
+ memcpy (ctx->input, key, 16);
+ memcpy (ctx->input + 4, key, 16); /* Duplicate 128-bit key. */
+ memcpy (ctx->input + 12, sigma16, 16);
+ }
+ else
+ {
+ /* 32-byte key */
+ memcpy (ctx->input, key, 32);
+ memcpy (ctx->input + 12, sigma32, 16);
+ }
+}
+
+#endif /*USE_ARM_NEON_ASM*/
+
+
static gcry_err_code_t
salsa20_do_setkey (SALSA20_context_t *ctx,
const byte *key, unsigned int keylen)
@@ -257,7 +324,23 @@ salsa20_do_setkey (SALSA20_context_t *ctx,
&& keylen != SALSA20_MAX_KEY_SIZE)
return GPG_ERR_INV_KEYLEN;
- salsa20_keysetup (ctx, key, keylen);
+ /* Default ops. */
+ ctx->keysetup = salsa20_keysetup;
+ ctx->ivsetup = salsa20_ivsetup;
+ ctx->core = salsa20_core;
+
+#ifdef USE_ARM_NEON_ASM
+ ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0;
+ if (ctx->use_neon)
+ {
+ /* Use ARM NEON ops instead. */
+ ctx->keysetup = salsa20_keysetup_neon;
+ ctx->ivsetup = salsa20_ivsetup_neon;
+ ctx->core = salsa20_core_neon;
+ }
+#endif
+
+ ctx->keysetup (ctx, key, keylen);
/* We default to a zero nonce. */
salsa20_setiv (ctx, NULL, 0);
@@ -290,7 +373,7 @@ salsa20_setiv (void *context, const byte *iv, unsigned int ivlen)
else
memcpy (tmp, iv, SALSA20_IV_SIZE);
- salsa20_ivsetup (ctx, tmp);
+ ctx->ivsetup (ctx, tmp);
/* Reset the unused pad bytes counter. */
ctx->unused = 0;
@@ -340,12 +423,24 @@ salsa20_do_encrypt_stream (SALSA20_context_t *ctx,
}
#endif
+#ifdef USE_ARM_NEON_ASM
+ if (ctx->use_neon && length >= SALSA20_BLOCK_SIZE)
+ {
+ unsigned int nblocks = length / SALSA20_BLOCK_SIZE;
+ _gcry_arm_neon_salsa20_encrypt (outbuf, inbuf, nblocks, ctx->input,
+ rounds);
+ length -= SALSA20_BLOCK_SIZE * nblocks;
+ outbuf += SALSA20_BLOCK_SIZE * nblocks;
+ inbuf += SALSA20_BLOCK_SIZE * nblocks;
+ }
+#endif
+
while (length > 0)
{
/* Create the next pad and bump the block counter. Note that it
is the user's duty to change to another nonce not later than
after 2^70 processed bytes. */
- nburn = salsa20_core (ctx->pad, ctx->input, rounds);
+ nburn = ctx->core (ctx->pad, ctx, rounds);
burn = nburn > burn ? nburn : burn;
if (length <= SALSA20_BLOCK_SIZE)
@@ -386,12 +481,13 @@ salsa20r12_encrypt_stream (void *context,
}
-
static const char*
selftest (void)
{
SALSA20_context_t ctx;
byte scratch[8+1];
+ byte buf[256+64+4];
+ int i;
static byte key_1[] =
{ 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -418,6 +514,23 @@ selftest (void)
salsa20_encrypt_stream (&ctx, scratch, scratch, sizeof plaintext_1);
if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
return "Salsa20 decryption test 1 failed.";
+
+ for (i = 0; i < sizeof buf; i++)
+ buf[i] = i;
+ salsa20_setkey (&ctx, key_1, sizeof key_1);
+ salsa20_setiv (&ctx, nonce_1, sizeof nonce_1);
+ /*encrypt*/
+ salsa20_encrypt_stream (&ctx, buf, buf, sizeof buf);
+ /*decrypt*/
+ salsa20_setkey (&ctx, key_1, sizeof key_1);
+ salsa20_setiv (&ctx, nonce_1, sizeof nonce_1);
+ salsa20_encrypt_stream (&ctx, buf, buf, 1);
+ salsa20_encrypt_stream (&ctx, buf+1, buf+1, (sizeof buf)-1-1);
+ salsa20_encrypt_stream (&ctx, buf+(sizeof buf)-1, buf+(sizeof buf)-1, 1);
+ for (i = 0; i < sizeof buf; i++)
+ if (buf[i] != (byte)i)
+ return "Salsa20 encryption test 2 failed.";
+
return NULL;
}