summaryrefslogtreecommitdiff
path: root/cipher/camellia-glue.c
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@mbnet.fi>2013-01-23 11:55:13 +0200
committerWerner Koch <wk@gnupg.org>2013-02-19 11:21:48 +0100
commit63ac3ba07dba82fde040d31b90b4eff627bd92b9 (patch)
treec103c60a747faff8ebb8e1f7b72a9faa68ed089d /cipher/camellia-glue.c
parent4de62d80644228fc5db2a9f9c94a7eb633d8de2e (diff)
downloadlibgcrypt-63ac3ba07dba82fde040d31b90b4eff627bd92b9.tar.gz
Add AES-NI/AVX accelerated Camellia implementation
* configure.ac: Add option --disable-avx-support. (HAVE_GCC_INLINE_ASM_AVX): New. (ENABLE_AVX_SUPPORT): New. (camellia) [ENABLE_AVX_SUPPORT, ENABLE_AESNI_SUPPORT]: Add camellia_aesni_avx_x86-64.lo. * cipher/Makefile.am (AM_CCASFLAGS): Add. (EXTRA_libcipher_la_SOURCES): Add camellia_aesni_avx_x86-64.S * cipher/camellia-glue.c [ENABLE_AESNI_SUPPORT, ENABLE_AVX_SUPPORT] [__x86_64__] (USE_AESNI_AVX): Add macro. (struct Camellia_context) [USE_AESNI_AVX]: Add use_aesni_avx. [USE_AESNI_AVX] (_gcry_camellia_aesni_avx_ctr_enc) (_gcry_camellia_aesni_avx_cbc_dec): New prototypes to assembly functions. (camellia_setkey) [USE_AESNI_AVX]: Enable AES-NI/AVX if hardware support both. (_gcry_camellia_ctr_enc) [USE_AESNI_AVX]: Add AES-NI/AVX code. (_gcry_camellia_cbc_dec) [USE_AESNI_AVX]: Add AES-NI/AVX code. * cipher/camellia_aesni_avx_x86-64.S: New. * src/g10lib.h (HWF_INTEL_AVX): New. * src/global.c (hwflist): Add HWF_INTEL_AVX. * src/hwf-x86.c (detect_x86_gnuc) [ENABLE_AVX_SUPPORT]: Add detection for AVX. -- Before: Running each test 250 times. ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- CAMELLIA128 2210ms 2200ms 2300ms 2050ms 2240ms 2250ms 2290ms 2270ms 2070ms 2070ms CAMELLIA256 2810ms 2800ms 2920ms 2670ms 2840ms 2850ms 2910ms 2890ms 2660ms 2640ms After: Running each test 250 times. ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- CAMELLIA128 2200ms 2220ms 2290ms 470ms 2240ms 2270ms 2270ms 2290ms 480ms 480ms CAMELLIA256 2820ms 2820ms 2900ms 600ms 2860ms 2860ms 2900ms 2920ms 620ms 620ms AES-NI/AVX implementation works by processing 16 parallel blocks (256 bytes). It's bytesliced implementation that uses AES-NI (Subbyte) for Camellia sboxes, with help of prefiltering/postfiltering. For smaller data sets generic C implementation is used. 
Speed-up for CBC-decryption and CTR-mode (large data): 4.3x Tests were run on: Intel Core i5-2450M Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> (license boiler plate update by wk)
Diffstat (limited to 'cipher/camellia-glue.c')
-rw-r--r--cipher/camellia-glue.c100
1 files changed, 98 insertions, 2 deletions
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index ba8aa281..dd9206f1 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -71,12 +71,38 @@
# define ATTR_ALIGNED_16
#endif
+/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code. */
+#undef USE_AESNI_AVX
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+# if defined(__x86_64__)
+# define USE_AESNI_AVX 1
+# endif
+#endif
+
typedef struct
{
int keybitlength;
KEY_TABLE_TYPE keytable;
+#ifdef USE_AESNI_AVX
+ int use_aesni_avx; /* AES-NI/AVX implementation shall be used. */
+#endif /*USE_AESNI_AVX*/
} CAMELLIA_context;
+#ifdef USE_AESNI_AVX
+/* Assembler implementations of Camellia using AES-NI and AVX. Each call
+ processes 16 blocks in parallel.
+ */
+extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr);
+
+extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
+#endif
+
static const char *selftest(void);
static gcry_err_code_t
@@ -109,6 +135,15 @@ camellia_setkey(void *c, const byte *key, unsigned keylen)
+3*2*sizeof(void*) /* Function calls. */
);
+#ifdef USE_AESNI_AVX
+ ctx->use_aesni_avx = 0;
+ if ((_gcry_get_hw_features () & HWF_INTEL_AESNI) &&
+ (_gcry_get_hw_features () & HWF_INTEL_AVX))
+ {
+ ctx->use_aesni_avx = 1;
+ }
+#endif
+
return 0;
}
@@ -158,8 +193,39 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE];
+ int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
int i;
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_camellia_aesni_avx_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+
+ if (did_use_aesni_avx)
+ {
+ /* clear AVX registers */
+ asm volatile ("vzeroall;\n":::);
+
+ if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
+ burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ /* TODO: use caching instead? */
+ }
+#endif
+
for ( ;nblocks; nblocks-- )
{
/* Encrypt the counter. */
@@ -178,7 +244,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
}
wipememory(tmpbuf, sizeof(tmpbuf));
- _gcry_burn_stack(CAMELLIA_encrypt_stack_burn_size);
+ _gcry_burn_stack(burn_stack_depth);
}
/* Bulk decryption of complete blocks in CBC mode. This function is only
@@ -192,6 +258,36 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
+ int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_camellia_aesni_avx_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+
+ if (did_use_aesni_avx)
+ {
+ /* clear AVX registers */
+ asm volatile ("vzeroall;\n":::);
+
+ if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
+ burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
for ( ;nblocks; nblocks-- )
{
@@ -208,7 +304,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
}
wipememory(savebuf, sizeof(savebuf));
- _gcry_burn_stack(CAMELLIA_decrypt_stack_burn_size);
+ _gcry_burn_stack(burn_stack_depth);
}
/* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR