author    Jussi Kivilinna <jussi.kivilinna@iki.fi>    2013-05-23 14:15:46 +0300
committer Werner Koch <wk@gnupg.org>                  2013-05-23 17:34:23 +0200
commit    b60f06f70227c1e69e1010da8b47ea51ade48145 (patch)
tree      c59a9a6977c260a658bd9eb53a512eb07084027a /cipher/camellia-aesni-avx-amd64.S
parent    319ee14f2aab8db56a830fd7ac8926f91b4f738a (diff)
camellia: add parallel processing for CFB decryption
* cipher/camellia-aesni-avx-amd64.S (_gcry_camellia_aesni_avx_cfb_dec): New
function.
* cipher/camellia-glue.c (_gcry_camellia_aesni_avx_cfb_dec): New prototype.
(_gcry_camellia_cfb_dec): New function.
(selftest_cfb_128): New function.
(selftest): Call selftest_cfb_128.
* cipher/cipher.c (gcry_cipher_open): Add bulk CFB decryption function for
Camellia.
* src/cipher.h (_gcry_camellia_cfb_dec): New prototype.
--

Patch makes Camellia-CFB decryption 4.7 times faster on Intel Sandy Bridge.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
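Why this parallelizes: in CFB decryption, plaintext block P[i] = C[i] XOR
E_K(C[i-1]) (with C[0] = IV), so every block-cipher input is a ciphertext
block that is already in hand before any output is needed. The 16 Camellia
encryptions are therefore independent, which is what lets the AVX routine in
the diff below interleave them. The following is a minimal serial C sketch of
the same 16-block transform, not libgcrypt code; camellia_encrypt_block() and
CAMELLIA_context are hypothetical stand-ins for internal primitives:

/* Serial sketch of 16-way CFB decryption, mirroring the data flow of
 * the assembly below.  */
#include <string.h>

#define BLOCKSIZE 16

typedef struct CAMELLIA_context CAMELLIA_context;

/* Hypothetical one-block Camellia encryption primitive.  */
extern void camellia_encrypt_block (const CAMELLIA_context *ctx,
                                    unsigned char *out,
                                    const unsigned char *in);

static void
cfb_dec_16blks (const CAMELLIA_context *ctx, unsigned char *dst,
                const unsigned char *src, unsigned char *iv)
{
  unsigned char ks[16][BLOCKSIZE];
  int i, j;

  /* All 16 block-cipher inputs (iv, src[0..14]) are known up front, so
   * these encryptions are independent; the AVX code computes them in
   * parallel instead of in this serial loop.  */
  camellia_encrypt_block (ctx, ks[0], iv);
  for (i = 1; i < 16; i++)
    camellia_encrypt_block (ctx, ks[i], src + (i - 1) * BLOCKSIZE);

  /* New IV is the last ciphertext block (saved before dst is written,
   * so dst == src still works).  */
  memcpy (iv, src + 15 * BLOCKSIZE, BLOCKSIZE);

  /* P[i] = C[i] XOR E_K(C[i-1]).  */
  for (i = 0; i < 16; i++)
    for (j = 0; j < BLOCKSIZE; j++)
      dst[i * BLOCKSIZE + j] = src[i * BLOCKSIZE + j] ^ ks[i][j];
}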
Diffstat (limited to 'cipher/camellia-aesni-avx-amd64.S')

 -rw-r--r--  cipher/camellia-aesni-avx-amd64.S  |  65 ++++++++++++++++++++++++
 1 file changed, 65 insertions(+), 0 deletions(-)
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 2b1df17a..95c96b8b 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1116,5 +1116,70 @@ _gcry_camellia_aesni_avx_cbc_dec:
ret;
.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
+.align 8
+.global _gcry_camellia_aesni_avx_cfb_dec
+.type _gcry_camellia_aesni_avx_cfb_dec,@function;
+
+_gcry_camellia_aesni_avx_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm0;
+ vpshufb .Lpack_bswap RIP, %xmm0, %xmm0;
+ vpxor (%rcx), %xmm0, %xmm15;
+ vmovdqu 15 * 16(%rdx), %xmm1;
+ vmovdqu %xmm1, (%rcx); /* store new IV */
+ vpxor 0 * 16(%rdx), %xmm0, %xmm14;
+ vpxor 1 * 16(%rdx), %xmm0, %xmm13;
+ vpxor 2 * 16(%rdx), %xmm0, %xmm12;
+ vpxor 3 * 16(%rdx), %xmm0, %xmm11;
+ vpxor 4 * 16(%rdx), %xmm0, %xmm10;
+ vpxor 5 * 16(%rdx), %xmm0, %xmm9;
+ vpxor 6 * 16(%rdx), %xmm0, %xmm8;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm7;
+ vpxor 8 * 16(%rdx), %xmm0, %xmm6;
+ vpxor 9 * 16(%rdx), %xmm0, %xmm5;
+ vpxor 10 * 16(%rdx), %xmm0, %xmm4;
+ vpxor 11 * 16(%rdx), %xmm0, %xmm3;
+ vpxor 12 * 16(%rdx), %xmm0, %xmm2;
+ vpxor 13 * 16(%rdx), %xmm0, %xmm1;
+ vpxor 14 * 16(%rdx), %xmm0, %xmm0;
+
+ call __camellia_enc_blk16;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+ vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+ vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+ vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+ vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+ vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+ vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+ vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+ vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+ vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+ vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+ vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+ vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+ vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+ vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ ret;
+.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;
+
#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
#endif /*__x86_64*/
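
For context, since this diff is limited to the assembly file: the commit
message says the new glue function _gcry_camellia_cfb_dec in
camellia-glue.c dispatches bulk work to the assembly routine. A plausible
sketch of that dispatch follows; the feature flag, block-size constant, and
one-block fallback camellia_cfb_dec_1() are named here as assumptions, not
the actual glue code:

#include <stddef.h>

#define CAMELLIA_BLOCK_SIZE 16

typedef struct
{
  int use_aesni_avx;    /* assumed CPU-feature flag */
  /* ... key material ... */
} CAMELLIA_context;

/* The 16-block routine added by this patch.  */
extern void _gcry_camellia_aesni_avx_cfb_dec (CAMELLIA_context *ctx,
                                              unsigned char *out,
                                              const unsigned char *in,
                                              unsigned char *iv);

/* Hypothetical single-block CFB-decryption fallback.  */
extern void camellia_cfb_dec_1 (CAMELLIA_context *ctx, unsigned char *out,
                                const unsigned char *in, unsigned char *iv);

void
_gcry_camellia_cfb_dec (CAMELLIA_context *ctx, unsigned char *iv,
                        unsigned char *outbuf, const unsigned char *inbuf,
                        size_t nblocks)
{
  if (ctx->use_aesni_avx)
    while (nblocks >= 16)           /* bulk path: 16 blocks per call */
      {
        _gcry_camellia_aesni_avx_cfb_dec (ctx, outbuf, inbuf, iv);
        outbuf  += 16 * CAMELLIA_BLOCK_SIZE;
        inbuf   += 16 * CAMELLIA_BLOCK_SIZE;
        nblocks -= 16;
      }

  for (; nblocks; nblocks--)        /* serial tail, one block at a time */
    {
      camellia_cfb_dec_1 (ctx, outbuf, inbuf, iv);
      outbuf += CAMELLIA_BLOCK_SIZE;
      inbuf  += CAMELLIA_BLOCK_SIZE;
    }
}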