summaryrefslogtreecommitdiff
path: root/cipher/serpent.c
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2013-06-09 16:37:38 +0300
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2013-06-09 16:37:42 +0300
commite7ab4e1a7396f4609b9033207015b239ab4a5140 (patch)
tree97572bf1acf49e030d0ad1c361cb7c7415ebc4a5 /cipher/serpent.c
parent3289bca708bdd02c69a331095ac6ca9a1efd74cc (diff)
downloadlibgcrypt-e7ab4e1a7396f4609b9033207015b239ab4a5140.tar.gz
Add Serpent AVX2 implementation
* cipher/Makefile.am: Add 'serpent-avx2-amd64.S'.
* cipher/serpent-avx2-amd64.S: New file.
* cipher/serpent.c (USE_AVX2): New macro.
(serpent_context_t) [USE_AVX2]: Add 'use_avx2'.
[USE_AVX2] (_gcry_serpent_avx2_ctr_enc, _gcry_serpent_avx2_cbc_dec)
(_gcry_serpent_avx2_cfb_dec): New prototypes.
(serpent_setkey_internal) [USE_AVX2]: Check for AVX2 capable hardware and set 'use_avx2'.
(_gcry_serpent_ctr_enc) [USE_AVX2]: Use AVX2 accelerated functions.
(_gcry_serpent_cbc_dec) [USE_AVX2]: Use AVX2 accelerated functions.
(_gcry_serpent_cfb_dec) [USE_AVX2]: Use AVX2 accelerated functions.
(selftest_ctr_128, selftest_cbc_128, selftest_cfb_128): Grow 'nblocks' so that AVX2 codepaths are tested.
* configure.ac (serpent) [avx2support]: Add 'serpent-avx2-amd64.lo'.
--
Add new AVX2 implementation of Serpent that processes 16 blocks in parallel.

Speed old (SSE2) vs. new (AVX2) on Intel Core i5-4570:
               ECB/Stream         CBC             CFB             OFB             CTR
            --------------- --------------- --------------- --------------- ---------------
SERPENT128   1.00x   1.00x   1.00x   2.10x   1.00x   2.16x   1.01x   1.00x   2.16x   2.18x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/serpent.c')
-rw-r--r--cipher/serpent.c140
1 files changed, 137 insertions, 3 deletions
diff --git a/cipher/serpent.c b/cipher/serpent.c
index c72951e3..89e14aef 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -38,6 +38,15 @@
# define USE_SSE2 1
#endif
+/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__)
+# if defined(ENABLE_AVX2_SUPPORT)
+# define USE_AVX2 1
+# endif
+#endif
+
+
/* Number of rounds per Serpent encrypt/decrypt operation. */
#define ROUNDS 32
@@ -58,6 +67,10 @@ typedef u32 serpent_subkeys_t[ROUNDS + 1][4];
typedef struct serpent_context
{
serpent_subkeys_t keys; /* Generated subkeys. */
+
+#ifdef USE_AVX2
+ int use_avx2; /* Set to 1 by serpent_setkey_internal when HWF_INTEL_AVX2 is reported, else 0. */
+#endif
} serpent_context_t;
@@ -81,6 +94,27 @@ extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
unsigned char *iv);
#endif
+#ifdef USE_AVX2
+/* Assembler implementations of Serpent using AVX2. Process 16 blocks in
+ parallel.
+ */
+extern void _gcry_serpent_avx2_ctr_enc(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr);
+
+extern void _gcry_serpent_avx2_cbc_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
+
+extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
+#endif
+
+
/* A prototype. */
static const char *serpent_test (void);
@@ -600,6 +634,15 @@ serpent_setkey_internal (serpent_context_t *context,
serpent_key_prepare (key, key_length, key_prepared);
serpent_subkeys_generate (key_prepared, context->keys);
+
+#ifdef USE_AVX2
+ context->use_avx2 = 0;
+ if ((_gcry_get_hw_features () & HWF_INTEL_AVX2))
+ {
+ context->use_avx2 = 1;
+ }
+#endif
+
_gcry_burn_stack (272 * sizeof (u32));
}
@@ -784,6 +827,37 @@ _gcry_serpent_ctr_enc(void *context, unsigned char *ctr,
int burn_stack_depth = 2 * sizeof (serpent_block_t);
int i;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_serpent_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * sizeof(serpent_block_t);
+ inbuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* clear avx2 registers used by serpent-avx2 */
+ asm volatile ("vzeroall;\n":::);
+
+ /* serpent-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic/sse2 code to handle smaller chunks... */
+ /* TODO: use caching instead? */
+ }
+#endif
+
#ifdef USE_SSE2
{
int did_use_sse2 = 0;
@@ -861,6 +935,36 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv,
unsigned char savebuf[sizeof(serpent_block_t)];
int burn_stack_depth = 2 * sizeof (serpent_block_t);
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_serpent_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * sizeof(serpent_block_t);
+ inbuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* clear avx2 registers used by serpent-avx2 */
+ asm volatile ("vzeroall;\n":::);
+
+ /* serpent-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic/sse2 code to handle smaller chunks... */
+ }
+#endif
+
#ifdef USE_SSE2
{
int did_use_sse2 = 0;
@@ -933,6 +1037,36 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 2 * sizeof (serpent_block_t);
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_serpent_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * sizeof(serpent_block_t);
+ inbuf += 16 * sizeof(serpent_block_t);
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* clear avx2 registers used by serpent-avx2 */
+ asm volatile ("vzeroall;\n":::);
+
+ /* serpent-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic/sse2 code to handle smaller chunks... */
+ }
+#endif
+
#ifdef USE_SSE2
{
int did_use_sse2 = 0;
@@ -993,7 +1127,7 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv,
static const char*
selftest_ctr_128 (void)
{
- const int nblocks = 8+1;
+ const int nblocks = 16+1;
const int blocksize = sizeof(serpent_block_t);
const int context_size = sizeof(serpent_context_t);
@@ -1008,7 +1142,7 @@ selftest_ctr_128 (void)
static const char*
selftest_cbc_128 (void)
{
- const int nblocks = 8+2;
+ const int nblocks = 16+2;
const int blocksize = sizeof(serpent_block_t);
const int context_size = sizeof(serpent_context_t);
@@ -1023,7 +1157,7 @@ selftest_cbc_128 (void)
static const char*
selftest_cfb_128 (void)
{
- const int nblocks = 8+2;
+ const int nblocks = 16+2;
const int blocksize = sizeof(serpent_block_t);
const int context_size = sizeof(serpent_context_t);