From 2fd06e207dcea1d8a7f0e7e92f3359615a99421b Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 23 May 2013 11:04:18 +0300 Subject: serpent: add SSE2 accelerated amd64 implementation * configure.ac (serpent): Add 'serpent-sse2-amd64.lo'. * cipher/Makefile.am (EXTRA_libcipher_la_SOURCES): Add 'serpent-sse2-amd64.S'. * cipher/cipher.c (gcry_cipher_open) [USE_SERPENT]: Register bulk functions for CBC-decryption and CTR-mode. * cipher/serpent.c (USE_SSE2): New macro. [USE_SSE2] (_gcry_serpent_sse2_ctr_enc, _gcry_serpent_sse2_cbc_dec): New prototypes to assembler functions. (serpent_setkey): Set 'serpent_init_done' before calling serpent_test. (_gcry_serpent_ctr_enc): New function. (_gcry_serpent_cbc_dec): New function. (selftest_ctr_128): New function. (selftest_cbc_128): New function. (selftest): Call selftest_ctr_128 and selftest_cbc_128. * cipher/serpent-sse2-amd64.S: New file. * src/cipher.h (_gcry_serpent_ctr_enc): New prototype. (_gcry_serpent_cbc_dec): New prototype. -- [v2]: Converted to SSE2, to support all amd64 processors (SSE2 is required feature by AMD64 SysV ABI). Patch adds word-sliced SSE2 implementation of Serpent for amd64 for speeding up parallelizable workloads (CTR mode, CBC mode decryption). Implementation processes eight blocks in parallel, with two four-block sets interleaved for out-of-order scheduling. Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.00x 0.99x 1.00x 3.98x 1.00x 1.01x 1.00x 1.01x 4.04x 4.04x Speed old vs. new on AMD Phenom II X6 1055T: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.02x 1.01x 1.00x 2.83x 1.00x 1.00x 1.00x 1.00x 2.72x 2.72x Speed old vs. new on Intel Core2 Duo T8100: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.00x 1.02x 0.97x 4.02x 0.98x 1.01x 0.98x 1.00x 3.82x 3.91x Signed-off-by: Jussi Kivilinna --- cipher/serpent.c | 219 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 217 insertions(+), 2 deletions(-) (limited to 'cipher/serpent.c') diff --git a/cipher/serpent.c b/cipher/serpent.c index 72840cf5..7b82b48c 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -28,6 +28,15 @@ #include "g10lib.h" #include "cipher.h" #include "bithelp.h" +#include "bufhelp.h" +#include "cipher-selftest.h" + + +/* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */ +#undef USE_SSE2 +#if defined(__x86_64__) +# define USE_SSE2 1 +#endif /* Number of rounds per Serpent encrypt/decrypt operation. */ #define ROUNDS 32 @@ -52,6 +61,21 @@ typedef struct serpent_context } serpent_context_t; +#ifdef USE_SSE2 +/* Assembler implementations of Serpent using SSE2. Process 8 block in + parallel. + */ +extern void _gcry_serpent_sse2_ctr_enc(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr); + +extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); +#endif + /* A prototype. */ static const char *serpent_test (void); @@ -191,7 +215,7 @@ static const char *serpent_test (void); r4 &= r0; r1 ^= r3; \ r4 ^= r2; r1 |= r0; \ r1 ^= r2; r0 ^= r3; \ - r2 = r1; r1 |= r3; \ + r2 = r1; r1 |= r3; \ r1 ^= r0; \ \ w = r1; x = r2; y = r3; z = r4; \ @@ -587,10 +611,10 @@ serpent_setkey (void *ctx, if (! serpent_init_done) { /* Execute a self-test the first time, Serpent is used. */ + serpent_init_done = 1; serpent_test_ret = serpent_test (); if (serpent_test_ret) log_error ("Serpent test failure: %s\n", serpent_test_ret); - serpent_init_done = 1; } if (serpent_test_ret) @@ -740,6 +764,190 @@ serpent_decrypt (void *ctx, byte *buffer_out, const byte *buffer_in) +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size sizeof(serpent_block_t). */ +void +_gcry_serpent_ctr_enc(void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char tmpbuf[sizeof(serpent_block_t)]; + int burn_stack_depth = 2 * sizeof (serpent_block_t); + int i; + +#ifdef USE_SSE2 + { + int did_use_sse2 = 0; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) + { + _gcry_serpent_sse2_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_sse2 = 1; + } + + if (did_use_sse2) + { + /* clear SSE2 registers used by serpent-sse2 */ + asm volatile ( + "pxor %%xmm0, %%xmm0;\n" + "pxor %%xmm1, %%xmm1;\n" + "pxor %%xmm2, %%xmm2;\n" + "pxor %%xmm3, %%xmm3;\n" + "pxor %%xmm4, %%xmm4;\n" + "pxor %%xmm5, %%xmm5;\n" + "pxor %%xmm6, %%xmm6;\n" + "pxor %%xmm7, %%xmm7;\n" + "pxor %%xmm10, %%xmm10;\n" + "pxor %%xmm11, %%xmm11;\n" + "pxor %%xmm12, %%xmm12;\n" + "pxor %%xmm13, %%xmm13;\n" + :::); + + /* serpent-sse2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + serpent_encrypt_internal(ctx, ctr, tmpbuf); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmpbuf, inbuf, sizeof(serpent_block_t)); + outbuf += sizeof(serpent_block_t); + inbuf += sizeof(serpent_block_t); + /* Increment the counter. */ + for (i = sizeof(serpent_block_t); i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + + wipememory(tmpbuf, sizeof(tmpbuf)); + _gcry_burn_stack(burn_stack_depth); +} + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_serpent_cbc_dec(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char savebuf[sizeof(serpent_block_t)]; + int burn_stack_depth = 2 * sizeof (serpent_block_t); + +#ifdef USE_SSE2 + { + int did_use_sse2 = 0; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) + { + _gcry_serpent_sse2_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_sse2 = 1; + } + + if (did_use_sse2) + { + /* clear SSE2 registers used by serpent-sse2 */ + asm volatile ( + "pxor %%xmm0, %%xmm0;\n" + "pxor %%xmm1, %%xmm1;\n" + "pxor %%xmm2, %%xmm2;\n" + "pxor %%xmm3, %%xmm3;\n" + "pxor %%xmm4, %%xmm4;\n" + "pxor %%xmm5, %%xmm5;\n" + "pxor %%xmm6, %%xmm6;\n" + "pxor %%xmm7, %%xmm7;\n" + "pxor %%xmm10, %%xmm10;\n" + "pxor %%xmm11, %%xmm11;\n" + "pxor %%xmm12, %%xmm12;\n" + "pxor %%xmm13, %%xmm13;\n" + :::); + + /* serpent-sse2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* We need to save INBUF away because it may be identical to + OUTBUF. */ + memcpy(savebuf, inbuf, sizeof(serpent_block_t)); + + serpent_decrypt_internal (ctx, inbuf, outbuf); + + buf_xor(outbuf, outbuf, iv, sizeof(serpent_block_t)); + memcpy(iv, savebuf, sizeof(serpent_block_t)); + inbuf += sizeof(serpent_block_t); + outbuf += sizeof(serpent_block_t); + } + + wipememory(savebuf, sizeof(savebuf)); + _gcry_burn_stack(burn_stack_depth); +} + + + +/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +static const char* +selftest_ctr_128 (void) +{ + const int nblocks = 8+1; + const int blocksize = sizeof(serpent_block_t); + const int context_size = sizeof(serpent_context_t); + + return _gcry_selftest_helper_ctr_128("SERPENT", &serpent_setkey, + &serpent_encrypt, &_gcry_serpent_ctr_enc, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cbc_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = sizeof(serpent_block_t); + const int context_size = sizeof(serpent_context_t); + + return _gcry_selftest_helper_cbc_128("SERPENT", &serpent_setkey, + &serpent_encrypt, &_gcry_serpent_cbc_dec, nblocks, blocksize, + context_size); +} + + /* Serpent test. */ static const char * @@ -748,6 +956,7 @@ serpent_test (void) serpent_context_t context; unsigned char scratch[16]; unsigned int i; + const char *r; static struct test { @@ -819,6 +1028,12 @@ serpent_test (void) } } + if ( (r = selftest_ctr_128 ()) ) + return r; + + if ( (r = selftest_cbc_128 ()) ) + return r; + return NULL; } -- cgit v1.2.1