summary | refs | log | tree | commit | diff
path: root/cipher/salsa20.c
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2013-10-26 15:00:48 +0300
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2013-10-28 16:12:19 +0200
commit: 5a3d43485efdc09912be0967ee0a3ce345b3b15a (patch)
tree: ff8e937e2d010ae8e015707f5665915dabe1e915 /cipher/salsa20.c
parent: e214e8392671dd30e9c33260717b5e756debf3bf (diff)
download: libgcrypt-5a3d43485efdc09912be0967ee0a3ce345b3b15a.tar.gz
Add AMD64 assembly implementation of Salsa20
* cipher/Makefile.am: Add 'salsa20-amd64.S'.
* cipher/salsa20-amd64.S: New.
* cipher/salsa20.c (USE_AMD64): New macro.
[USE_AMD64] (_gcry_salsa20_amd64_keysetup, _gcry_salsa20_amd64_ivsetup)
(_gcry_salsa20_amd64_encrypt_blocks): New prototypes.
[USE_AMD64] (salsa20_keysetup, salsa20_ivsetup, salsa20_core): New.
[!USE_AMD64] (salsa20_core): Change 'src' to non-constant, update block
counter in 'salsa20_core' and return burn stack depth.
[!USE_AMD64] (salsa20_keysetup, salsa20_ivsetup): New.
(salsa20_do_setkey): Move generic key setup to 'salsa20_keysetup'.
(salsa20_setkey): Fix burn stack depth.
(salsa20_setiv): Move generic IV setup to 'salsa20_ivsetup'.
(salsa20_do_encrypt_stream) [USE_AMD64]: Process large buffers in AMD64
implementation.
(salsa20_do_encrypt_stream): Move stack burning to this function...
(salsa20_encrypt_stream, salsa20r12_encrypt_stream): ...from these
functions.
* configure.ac [x86-64]: Add 'salsa20-amd64.lo'.
--

Patch adds fast AMD64 assembly implementation for Salsa20. This
implementation is based on public domain code by D. J. Bernstein and it is
available at http://cr.yp.to/snuffle.html (amd64-xmm6). Implementation gains
extra speed by processing four blocks in parallel with the help of SSE2
instructions.

Benchmark results on Intel Core i5-4570 (3.2 GHz):

Before:
 SALSA20        |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      3.88 ns/B     246.0 MiB/s     12.41 c/B
     STREAM dec |      3.88 ns/B     246.0 MiB/s     12.41 c/B
                =
 SALSA20R12     |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.46 ns/B     387.9 MiB/s      7.87 c/B
     STREAM dec |      2.46 ns/B     387.7 MiB/s      7.87 c/B

After:
 SALSA20        |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.985 ns/B     967.8 MiB/s      3.15 c/B
     STREAM dec |     0.987 ns/B     966.5 MiB/s      3.16 c/B
                =
 SALSA20R12     |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.636 ns/B    1500.5 MiB/s      2.03 c/B
     STREAM dec |     0.636 ns/B    1499.2 MiB/s      2.04 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/salsa20.c')
-rw-r--r-- cipher/salsa20.c 197
1 files changed, 124 insertions, 73 deletions
diff --git a/cipher/salsa20.c b/cipher/salsa20.c
index 6189bca9..892b9fc2 100644
--- a/cipher/salsa20.c
+++ b/cipher/salsa20.c
@@ -40,6 +40,14 @@
#include "cipher.h"
#include "bufhelp.h"
+
+/* USE_AMD64 indicates whether to compile with AMD64 code. */
+#undef USE_AMD64
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# define USE_AMD64 1
+#endif
+
+
#define SALSA20_MIN_KEY_SIZE 16 /* Bytes. */
#define SALSA20_MAX_KEY_SIZE 32 /* Bytes. */
#define SALSA20_BLOCK_SIZE 64 /* Bytes. */
@@ -83,6 +91,36 @@ typedef struct
static void salsa20_setiv (void *context, const byte *iv, unsigned int ivlen);
static const char *selftest (void);
+
+#ifdef USE_AMD64
+/* AMD64 assembly implementations of Salsa20. */
+void _gcry_salsa20_amd64_keysetup(u32 *ctxinput, const void *key, int keybits);
+void _gcry_salsa20_amd64_ivsetup(u32 *ctxinput, const void *iv);
+unsigned int
+_gcry_salsa20_amd64_encrypt_blocks(u32 *ctxinput, const void *src, void *dst,
+ size_t len, int rounds);
+
+static void
+salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen)
+{
+ _gcry_salsa20_amd64_keysetup(ctx->input, key, keylen * 8);
+}
+
+static void
+salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
+{
+ _gcry_salsa20_amd64_ivsetup(ctx->input, iv);
+}
+
+static unsigned int
+salsa20_core (u32 *dst, u32 *src, unsigned int rounds)
+{
+ memset(dst, 0, SALSA20_BLOCK_SIZE);
+ return _gcry_salsa20_amd64_encrypt_blocks(src, dst, dst, 1, rounds);
+}
+
+#else /* USE_AMD64 */
+
#if 0
@@ -110,8 +148,8 @@ static const char *selftest (void);
x0 ^= ROTL32 (18, x3 + x2); \
} while(0)
-static void
-salsa20_core (u32 *dst, const u32 *src, unsigned rounds)
+static unsigned int
+salsa20_core (u32 *dst, u32 *src, unsigned int rounds)
{
u32 pad[SALSA20_INPUT_LENGTH];
unsigned int i;
@@ -138,31 +176,24 @@ salsa20_core (u32 *dst, const u32 *src, unsigned rounds)
u32 t = pad[i] + src[i];
dst[i] = LE_SWAP32 (t);
}
+
+ /* Update counter. */
+ if (!++src[8])
+ src[9]++;
+
+ /* burn_stack */
+ return ( 3*sizeof (void*) \
+ + 2*sizeof (void*) \
+ + 64 \
+ + sizeof (unsigned int) \
+ + sizeof (u32) );
}
#undef QROUND
#undef SALSA20_CORE_DEBUG
-static gcry_err_code_t
-salsa20_do_setkey (SALSA20_context_t *ctx,
- const byte *key, unsigned int keylen)
+static void
+salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen)
{
- static int initialized;
- static const char *selftest_failed;
-
- if (!initialized )
- {
- initialized = 1;
- selftest_failed = selftest ();
- if (selftest_failed)
- log_error ("SALSA20 selftest failed (%s)\n", selftest_failed );
- }
- if (selftest_failed)
- return GPG_ERR_SELFTEST_FAILED;
-
- if (keylen != SALSA20_MIN_KEY_SIZE
- && keylen != SALSA20_MAX_KEY_SIZE)
- return GPG_ERR_INV_KEYLEN;
-
/* These constants are the little endian encoding of the string
"expand 32-byte k". For the 128 bit variant, the "32" in that
string will be fixed up to "16". */
@@ -192,6 +223,41 @@ salsa20_do_setkey (SALSA20_context_t *ctx,
ctx->input[5] -= 0x02000000; /* Change to "1 dn". */
ctx->input[10] += 0x00000004; /* Change to "yb-6". */
}
+}
+
+static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
+{
+ ctx->input[6] = LE_READ_UINT32(iv + 0);
+ ctx->input[7] = LE_READ_UINT32(iv + 4);
+ /* Reset the block counter. */
+ ctx->input[8] = 0;
+ ctx->input[9] = 0;
+}
+
+#endif /*!USE_AMD64*/
+
+static gcry_err_code_t
+salsa20_do_setkey (SALSA20_context_t *ctx,
+ const byte *key, unsigned int keylen)
+{
+ static int initialized;
+ static const char *selftest_failed;
+
+ if (!initialized )
+ {
+ initialized = 1;
+ selftest_failed = selftest ();
+ if (selftest_failed)
+ log_error ("SALSA20 selftest failed (%s)\n", selftest_failed );
+ }
+ if (selftest_failed)
+ return GPG_ERR_SELFTEST_FAILED;
+
+ if (keylen != SALSA20_MIN_KEY_SIZE
+ && keylen != SALSA20_MAX_KEY_SIZE)
+ return GPG_ERR_INV_KEYLEN;
+
+ salsa20_keysetup (ctx, key, keylen);
/* We default to a zero nonce. */
salsa20_setiv (ctx, NULL, 0);
@@ -205,7 +271,7 @@ salsa20_setkey (void *context, const byte *key, unsigned int keylen)
{
SALSA20_context_t *ctx = (SALSA20_context_t *)context;
gcry_err_code_t rc = salsa20_do_setkey (ctx, key, keylen);
- _gcry_burn_stack (300/* FIXME*/);
+ _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
return rc;
}
@@ -214,28 +280,22 @@ static void
salsa20_setiv (void *context, const byte *iv, unsigned int ivlen)
{
SALSA20_context_t *ctx = (SALSA20_context_t *)context;
+ byte tmp[SALSA20_IV_SIZE];
- if (!iv)
- {
- ctx->input[6] = 0;
- ctx->input[7] = 0;
- }
- else if (ivlen == SALSA20_IV_SIZE)
- {
- ctx->input[6] = LE_READ_UINT32(iv + 0);
- ctx->input[7] = LE_READ_UINT32(iv + 4);
- }
+ if (iv && ivlen != SALSA20_IV_SIZE)
+ log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", ivlen);
+
+ if (!iv || ivlen != SALSA20_IV_SIZE)
+ memset (tmp, 0, sizeof(tmp));
else
- {
- log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", ivlen);
- ctx->input[6] = 0;
- ctx->input[7] = 0;
- }
- /* Reset the block counter. */
- ctx->input[8] = 0;
- ctx->input[9] = 0;
+ memcpy (tmp, iv, SALSA20_IV_SIZE);
+
+ salsa20_ivsetup (ctx, tmp);
+
/* Reset the unused pad bytes counter. */
ctx->unused = 0;
+
+ wipememory (tmp, sizeof(tmp));
}
@@ -246,6 +306,8 @@ salsa20_do_encrypt_stream (SALSA20_context_t *ctx,
byte *outbuf, const byte *inbuf,
unsigned int length, unsigned rounds)
{
+ unsigned int nburn, burn = 0;
+
if (ctx->unused)
{
unsigned char *p = (void*)ctx->pad;
@@ -266,26 +328,39 @@ salsa20_do_encrypt_stream (SALSA20_context_t *ctx,
gcry_assert (!ctx->unused);
}
- for (;;)
+#ifdef USE_AMD64
+ if (length >= SALSA20_BLOCK_SIZE)
+ {
+ unsigned int nblocks = length / SALSA20_BLOCK_SIZE;
+ burn = _gcry_salsa20_amd64_encrypt_blocks(ctx->input, inbuf, outbuf,
+ nblocks, rounds);
+ length -= SALSA20_BLOCK_SIZE * nblocks;
+ outbuf += SALSA20_BLOCK_SIZE * nblocks;
+ inbuf += SALSA20_BLOCK_SIZE * nblocks;
+ }
+#endif
+
+ while (length > 0)
{
/* Create the next pad and bump the block counter. Note that it
is the user's duty to change to another nonce not later than
after 2^70 processed bytes. */
- salsa20_core (ctx->pad, ctx->input, rounds);
- if (!++ctx->input[8])
- ctx->input[9]++;
+ nburn = salsa20_core (ctx->pad, ctx->input, rounds);
+ burn = nburn > burn ? nburn : burn;
if (length <= SALSA20_BLOCK_SIZE)
{
buf_xor (outbuf, inbuf, ctx->pad, length);
ctx->unused = SALSA20_BLOCK_SIZE - length;
- return;
+ break;
}
buf_xor (outbuf, inbuf, ctx->pad, SALSA20_BLOCK_SIZE);
length -= SALSA20_BLOCK_SIZE;
outbuf += SALSA20_BLOCK_SIZE;
inbuf += SALSA20_BLOCK_SIZE;
- }
+ }
+
+ _gcry_burn_stack (burn);
}
@@ -296,19 +371,7 @@ salsa20_encrypt_stream (void *context,
SALSA20_context_t *ctx = (SALSA20_context_t *)context;
if (length)
- {
- salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS);
- _gcry_burn_stack (/* salsa20_do_encrypt_stream: */
- 2*sizeof (void*)
- + 3*sizeof (void*) + sizeof (unsigned int)
- /* salsa20_core: */
- + 2*sizeof (void*)
- + 2*sizeof (void*)
- + 64
- + sizeof (unsigned int)
- + sizeof (u32)
- );
- }
+ salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS);
}
@@ -319,19 +382,7 @@ salsa20r12_encrypt_stream (void *context,
SALSA20_context_t *ctx = (SALSA20_context_t *)context;
if (length)
- {
- salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS);
- _gcry_burn_stack (/* salsa20_do_encrypt_stream: */
- 2*sizeof (void*)
- + 3*sizeof (void*) + sizeof (unsigned int)
- /* salsa20_core: */
- + 2*sizeof (void*)
- + 2*sizeof (void*)
- + 64
- + sizeof (unsigned int)
- + sizeof (u32)
- );
- }
+ salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS);
}