diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-06-09 16:37:38 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-06-09 16:37:42 +0300 |
commit | e7ab4e1a7396f4609b9033207015b239ab4a5140 (patch) | |
tree | 97572bf1acf49e030d0ad1c361cb7c7415ebc4a5 /cipher/serpent.c | |
parent | 3289bca708bdd02c69a331095ac6ca9a1efd74cc (diff) | |
download | libgcrypt-e7ab4e1a7396f4609b9033207015b239ab4a5140.tar.gz |
Add Serpent AVX2 implementation
* cipher/Makefile.am: Add 'serpent-avx2-amd64.S'.
* cipher/serpent-avx2-amd64.S: New file.
* cipher/serpent.c (USE_AVX2): New macro.
(serpent_context_t) [USE_AVX2]: Add 'use_avx2'.
[USE_AVX2] (_gcry_serpent_avx2_ctr_enc, _gcry_serpent_avx2_cbc_dec)
(_gcry_serpent_avx2_cfb_dec): New prototypes.
(serpent_setkey_internal) [USE_AVX2]: Check for AVX2 capable hardware
and set 'use_avx2'.
(_gcry_serpent_ctr_enc) [USE_AVX2]: Use AVX2 accelerated functions.
(_gcry_serpent_cbc_dec) [USE_AVX2]: Use AVX2 accelerated functions.
(_gcry_serpent_cfb_dec) [USE_AVX2]: Use AVX2 accelerated functions.
(selftest_ctr_128, selftest_cbc_128, selftest_cfb_128): Grow 'nblocks'
so that AVX2 codepaths are tested.
* configure.ac (serpent) [avx2support]: Add 'serpent-avx2-amd64.lo'.
--
Add new AVX2 implementation of Serpent that processes 16 blocks in parallel.
Speed-up of the new (AVX2) implementation relative to the old (SSE2) one on
Intel Core i5-4570 (two columns per mode: encryption, decryption):
              ECB/Stream         CBC             CFB             OFB             CTR
            --------------- --------------- --------------- --------------- ---------------
SERPENT128   1.00x   1.00x   1.00x   2.10x   1.00x   2.16x   1.01x   1.00x   2.16x   2.18x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/serpent.c')
-rw-r--r-- | cipher/serpent.c | 140 |
1 files changed, 137 insertions, 3 deletions
diff --git a/cipher/serpent.c b/cipher/serpent.c index c72951e3..89e14aef 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -38,6 +38,15 @@ # define USE_SSE2 1 #endif +/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */ +#undef USE_AVX2 +#if defined(__x86_64__) +# if defined(ENABLE_AVX2_SUPPORT) +# define USE_AVX2 1 +# endif +#endif + + /* Number of rounds per Serpent encrypt/decrypt operation. */ #define ROUNDS 32 @@ -58,6 +67,10 @@ typedef u32 serpent_subkeys_t[ROUNDS + 1][4]; typedef struct serpent_context { serpent_subkeys_t keys; /* Generated subkeys. */ + +#ifdef USE_AVX2 + int use_avx2; +#endif } serpent_context_t; @@ -81,6 +94,27 @@ extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx, unsigned char *iv); #endif +#ifdef USE_AVX2 +/* Assembler implementations of Serpent using AVX2. Process 16 blocks in + parallel. + */ +extern void _gcry_serpent_avx2_ctr_enc(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr); + +extern void _gcry_serpent_avx2_cbc_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); + +extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); +#endif + + /* A prototype. */ static const char *serpent_test (void); @@ -600,6 +634,15 @@ serpent_setkey_internal (serpent_context_t *context, serpent_key_prepare (key, key_length, key_prepared); serpent_subkeys_generate (key_prepared, context->keys); + +#ifdef USE_AVX2 + context->use_avx2 = 0; + if ((_gcry_get_hw_features () & HWF_INTEL_AVX2)) + { + context->use_avx2 = 1; + } +#endif + _gcry_burn_stack (272 * sizeof (u32)); } @@ -784,6 +827,37 @@ _gcry_serpent_ctr_enc(void *context, unsigned char *ctr, int burn_stack_depth = 2 * sizeof (serpent_block_t); int i; +#ifdef USE_AVX2 + if (ctx->use_avx2) + { + int did_use_avx2 = 0; + + /* Process data in 16 block chunks. 
*/ + while (nblocks >= 16) + { + _gcry_serpent_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 16; + outbuf += 16 * sizeof(serpent_block_t); + inbuf += 16 * sizeof(serpent_block_t); + did_use_avx2 = 1; + } + + if (did_use_avx2) + { + /* clear avx2 registers used by serpent-avx2 */ + asm volatile ("vzeroall;\n":::); + + /* serpent-avx2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic/sse2 code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + #ifdef USE_SSE2 { int did_use_sse2 = 0; @@ -861,6 +935,36 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv, unsigned char savebuf[sizeof(serpent_block_t)]; int burn_stack_depth = 2 * sizeof (serpent_block_t); +#ifdef USE_AVX2 + if (ctx->use_avx2) + { + int did_use_avx2 = 0; + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + _gcry_serpent_avx2_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 16; + outbuf += 16 * sizeof(serpent_block_t); + inbuf += 16 * sizeof(serpent_block_t); + did_use_avx2 = 1; + } + + if (did_use_avx2) + { + /* clear avx2 registers used by serpent-avx2 */ + asm volatile ("vzeroall;\n":::); + + /* serpent-avx2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic/sse2 code to handle smaller chunks... */ + } +#endif + #ifdef USE_SSE2 { int did_use_sse2 = 0; @@ -933,6 +1037,36 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv, const unsigned char *inbuf = inbuf_arg; int burn_stack_depth = 2 * sizeof (serpent_block_t); +#ifdef USE_AVX2 + if (ctx->use_avx2) + { + int did_use_avx2 = 0; + + /* Process data in 16 block chunks. 
*/ + while (nblocks >= 16) + { + _gcry_serpent_avx2_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 16; + outbuf += 16 * sizeof(serpent_block_t); + inbuf += 16 * sizeof(serpent_block_t); + did_use_avx2 = 1; + } + + if (did_use_avx2) + { + /* clear avx2 registers used by serpent-avx2 */ + asm volatile ("vzeroall;\n":::); + + /* serpent-avx2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic/sse2 code to handle smaller chunks... */ + } +#endif + #ifdef USE_SSE2 { int did_use_sse2 = 0; @@ -993,7 +1127,7 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv, static const char* selftest_ctr_128 (void) { - const int nblocks = 8+1; + const int nblocks = 16+1; const int blocksize = sizeof(serpent_block_t); const int context_size = sizeof(serpent_context_t); @@ -1008,7 +1142,7 @@ selftest_ctr_128 (void) static const char* selftest_cbc_128 (void) { - const int nblocks = 8+2; + const int nblocks = 16+2; const int blocksize = sizeof(serpent_block_t); const int context_size = sizeof(serpent_context_t); @@ -1023,7 +1157,7 @@ selftest_cbc_128 (void) static const char* selftest_cfb_128 (void) { - const int nblocks = 8+2; + const int nblocks = 16+2; const int blocksize = sizeof(serpent_block_t); const int context_size = sizeof(serpent_context_t); |