From d325ab5d86e6107a46007a4d0131122bbd719f8c Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sun, 9 Jun 2013 16:37:38 +0300 Subject: twofish: add amd64 assembly implementation * cipher/Makefile.am: Add 'twofish-amd64.S'. * cipher/twofish-amd64.S: New file. * cipher/twofish.c (USE_AMD64_ASM): New macro. [USE_AMD64_ASM] (_gcry_twofish_amd64_encrypt_block) (_gcry_twofish_amd64_decrypt_block, _gcry_twofish_amd64_ctr_enc) (_gcry_twofish_amd64_cbc_dec, _gcry_twofish_amd64_cfb_dec): New prototypes. [USE_AMD64_ASM] (do_twofish_encrypt, do_twofish_decrypt) (twofish_encrypt, twofish_decrypt): New functions. (_gcry_twofish_ctr_enc, _gcry_twofish_cbc_dec, _gcry_twofish_cfb_dec) (selftest_ctr, selftest_cbc, selftest_cfb): New functions. (selftest): Call new bulk selftests. * cipher/cipher.c (gcry_cipher_open) [USE_TWOFISH]: Register Twofish bulk functions for ctr-enc, cbc-dec and cfb-dec. * configure.ac (twofish) [x86_64]: Add 'twofish-amd64.lo'. * src/cipher.h (_gcry_twofish_ctr_enc, _gcry_twofish_cbc_dec) (gcry_twofish_cfb_dec): New prototypes. -- Provides non-parallel implementations for small speed-up and 3-way parallel implementations that gets accelerated on `out-of-order' CPUs. Speed old vs. new on Intel Core i5-4570: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- TWOFISH128 1.08x 1.07x 1.10x 1.80x 1.09x 1.70x 1.08x 1.08x 1.70x 1.69x Speed old vs. new on Intel Core2 T8100: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- TWOFISH128 1.11x 1.10x 1.13x 1.65x 1.13x 1.62x 1.12x 1.11x 1.63x 1.59x Signed-off-by: Jussi Kivilinna --- cipher/twofish.c | 278 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) (limited to 'cipher/twofish.c') diff --git a/cipher/twofish.c b/cipher/twofish.c index f1a93ca8..ee721c6b 100644 --- a/cipher/twofish.c +++ b/cipher/twofish.c @@ -44,6 +44,19 @@ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" + + +#define TWOFISH_BLOCKSIZE 16 + + +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) +# define USE_AMD64_ASM 1 +#endif + /* Prototype for the self-test function. */ static const char *selftest(void); @@ -714,6 +727,27 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen) +#ifdef USE_AMD64_ASM + +/* Assembly implementations of Twofish. */ +extern void _gcry_twofish_amd64_encrypt_block(const TWOFISH_context *c, + byte *out, const byte *in); + +extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c, + byte *out, const byte *in); + +/* These assembly implementations process three blocks in parallel. */ +extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, + const byte *in, byte *ctr); + +extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out, + const byte *in, byte *iv); + +extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out, + const byte *in, byte *iv); + +#else /*!USE_AMD64_ASM*/ + /* Macros to compute the g() function in the encryption and decryption * rounds. G1 is the straight g() function; G2 includes the 8-bit * rotation for the high 32-bit word. */ @@ -771,9 +805,30 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen) x ^= ctx->w[m]; \ out[4 * (n)] = x; out[4 * (n) + 1] = x >> 8; \ out[4 * (n) + 2] = x >> 16; out[4 * (n) + 3] = x >> 24 + +#endif /*!USE_AMD64_ASM*/ + /* Encrypt one block. in and out may be the same. */ +#ifdef USE_AMD64_ASM + +static void +do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in) +{ + _gcry_twofish_amd64_encrypt_block(ctx, out, in); +} + +static void +twofish_encrypt (void *context, byte *out, const byte *in) +{ + TWOFISH_context *ctx = context; + _gcry_twofish_amd64_encrypt_block(ctx, out, in); + _gcry_burn_stack (4*sizeof (void*)); +} + +#else /*!USE_AMD64_ASM*/ + static void do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in) { @@ -814,9 +869,29 @@ twofish_encrypt (void *context, byte *out, const byte *in) _gcry_burn_stack (24+3*sizeof (void*)); } +#endif /*!USE_AMD64_ASM*/ + /* Decrypt one block. in and out may be the same. */ +#ifdef USE_AMD64_ASM + +static void +do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in) +{ + _gcry_twofish_amd64_decrypt_block(ctx, out, in); +} + +static void +twofish_decrypt (void *context, byte *out, const byte *in) +{ + TWOFISH_context *ctx = context; + _gcry_twofish_amd64_decrypt_block(ctx, out, in); + _gcry_burn_stack (4*sizeof (void*)); +} + +#else /*!USE_AMD64_ASM*/ + static void do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in) { @@ -858,6 +933,201 @@ twofish_decrypt (void *context, byte *out, const byte *in) _gcry_burn_stack (24+3*sizeof (void*)); } +#endif /*!USE_AMD64_ASM*/ + + + +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size TWOFISH_BLOCKSIZE. */ +void +_gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char tmpbuf[TWOFISH_BLOCKSIZE]; + int burn_stack_depth = 24 + 3 * sizeof (void*); + int i; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*)) + burn_stack_depth = 8 * sizeof(void*); + + /* Process data in 3 block chunks. */ + while (nblocks >= 3) + { + _gcry_twofish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 3; + outbuf += 3 * TWOFISH_BLOCKSIZE; + inbuf += 3 * TWOFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + do_twofish_encrypt(ctx, tmpbuf, ctr); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmpbuf, inbuf, TWOFISH_BLOCKSIZE); + outbuf += TWOFISH_BLOCKSIZE; + inbuf += TWOFISH_BLOCKSIZE; + /* Increment the counter. */ + for (i = TWOFISH_BLOCKSIZE; i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + + wipememory(tmpbuf, sizeof(tmpbuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char savebuf[TWOFISH_BLOCKSIZE]; + int burn_stack_depth = 24 + 3 * sizeof (void*); + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 3 && burn_stack_depth < 9 * sizeof(void*)) + burn_stack_depth = 9 * sizeof(void*); + + /* Process data in 3 block chunks. */ + while (nblocks >= 3) + { + _gcry_twofish_amd64_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 3; + outbuf += 3 * TWOFISH_BLOCKSIZE; + inbuf += 3 * TWOFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* We need to save INBUF away because it may be identical to + OUTBUF. */ + memcpy(savebuf, inbuf, TWOFISH_BLOCKSIZE); + + do_twofish_decrypt (ctx, outbuf, inbuf); + + buf_xor(outbuf, outbuf, iv, TWOFISH_BLOCKSIZE); + memcpy(iv, savebuf, TWOFISH_BLOCKSIZE); + inbuf += TWOFISH_BLOCKSIZE; + outbuf += TWOFISH_BLOCKSIZE; + } + + wipememory(savebuf, sizeof(savebuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CFB mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 24 + 3 * sizeof (void*); + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*)) + burn_stack_depth = 8 * sizeof(void*); + + /* Process data in 3 block chunks. */ + while (nblocks >= 3) + { + _gcry_twofish_amd64_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 3; + outbuf += 3 * TWOFISH_BLOCKSIZE; + inbuf += 3 * TWOFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + do_twofish_encrypt(ctx, iv, iv); + buf_xor_n_copy(outbuf, iv, inbuf, TWOFISH_BLOCKSIZE); + outbuf += TWOFISH_BLOCKSIZE; + inbuf += TWOFISH_BLOCKSIZE; + } + + _gcry_burn_stack(burn_stack_depth); +} + + + +/* Run the self-tests for TWOFISH-CTR, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +static const char * +selftest_ctr (void) +{ + const int nblocks = 3+1; + const int blocksize = TWOFISH_BLOCKSIZE; + const int context_size = sizeof(TWOFISH_context); + + return _gcry_selftest_helper_ctr("TWOFISH", &twofish_setkey, + &twofish_encrypt, &_gcry_twofish_ctr_enc, nblocks, blocksize, + context_size); +} + +/* Run the self-tests for TWOFISH-CBC, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cbc (void) +{ + const int nblocks = 3+2; + const int blocksize = TWOFISH_BLOCKSIZE; + const int context_size = sizeof(TWOFISH_context); + + return _gcry_selftest_helper_cbc("TWOFISH", &twofish_setkey, + &twofish_encrypt, &_gcry_twofish_cbc_dec, nblocks, blocksize, + context_size); +} + +/* Run the self-tests for TWOFISH-CFB, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cfb (void) +{ + const int nblocks = 3+2; + const int blocksize = TWOFISH_BLOCKSIZE; + const int context_size = sizeof(TWOFISH_context); + + return _gcry_selftest_helper_cfb("TWOFISH", &twofish_setkey, + &twofish_encrypt, &_gcry_twofish_cfb_dec, nblocks, blocksize, + context_size); +} + /* Test a single encryption and decryption with each key size. */ @@ -866,6 +1136,7 @@ selftest (void) { TWOFISH_context ctx; /* Expanded key. */ byte scratch[16]; /* Encryption/decryption result buffer. */ + const char *r; /* Test vectors for single encryption/decryption. Note that I am using * the vectors from the Twofish paper's "known answer test", I=3 for @@ -915,6 +1186,13 @@ selftest (void) if (memcmp (scratch, plaintext_256, sizeof (plaintext_256))) return "Twofish-256 test decryption failed."; + if ((r = selftest_ctr()) != NULL) + return r; + if ((r = selftest_cbc()) != NULL) + return r; + if ((r = selftest_cfb()) != NULL) + return r; + return NULL; } -- cgit v1.2.1