diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-05-23 14:15:51 +0300 |
---|---|---|
committer | Werner Koch <wk@gnupg.org> | 2013-05-23 17:35:05 +0200 |
commit | 6deb0ccdf718a0670f80e6762a3842caf76437d6 (patch) | |
tree | 5bf7f4bb2de2049de8eb0a9d6ce6d2ce43f93a40 /cipher/serpent-sse2-amd64.S | |
parent | b60f06f70227c1e69e1010da8b47ea51ade48145 (diff) | |
download | libgcrypt-6deb0ccdf718a0670f80e6762a3842caf76437d6.tar.gz |
serpent: add parallel processing for CFB decryption
* cipher/cipher.c (gcry_cipher_open): Add bulk CFB decryption function
for Serpent.
* cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_cfb_dec): New
function.
* cipher/serpent.c (_gcry_serpent_sse2_cfb_dec): New prototype.
(_gcry_serpent_cfb_dec): New function.
(selftest_cfb_128): New function.
(selftest): Call selftest_cfb_128.
* src/cipher.h (_gcry_serpent_cfb_dec): New prototype.
--
Patch makes Serpent-CFB decryption 4.0 times faster on Intel Sandy-Bridge and
2.7 times faster on AMD K10.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/serpent-sse2-amd64.S')
-rw-r--r-- | cipher/serpent-sse2-amd64.S | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index 8d8c8dda..5f9e9d22 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -822,5 +822,71 @@ _gcry_serpent_sse2_cbc_dec:
 	ret
 .size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;

+.align 8
+.global _gcry_serpent_sse2_cfb_dec
+.type _gcry_serpent_sse2_cfb_dec,@function;
+_gcry_serpent_sse2_cfb_dec:
+	/* CFB-mode decryption of 8 consecutive 16-byte blocks in one call.
+	 *
+	 * The IV and source blocks 0..6 are loaded into eight vector
+	 * registers and passed through __serpent_enc_blk8.  Each of the
+	 * eight results is then combined with source blocks 0..7 via
+	 * pxor_u and stored to dst.  Source block 7 is written back to
+	 * the IV buffer so that the next call continues the chain.
+	 *
+	 * RA0..RB3 are assembler symbols rebound with .set: to the
+	 * enc_in_* names before the call and to the enc_out_* names after
+	 * it.  Those names, pxor_u, RNOT, RTMP0 and __serpent_enc_blk8 are
+	 * all defined outside this hunk.
+	 * NOTE(review): presumably enc_in_* / enc_out_* are the input and
+	 * output register assignments of __serpent_enc_blk8, and pxor_u
+	 * does an unaligned load of its first operand into the scratch
+	 * register (third operand) and XORs it into the second - confirm
+	 * against the definitions earlier in the file.
+	 *
+	 * input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%rcx: iv
+	 */
+
+	.set RA0, enc_in_a0
+	.set RA1, enc_in_a1
+	.set RA2, enc_in_a2
+	.set RA3, enc_in_a3
+	.set RB0, enc_in_b0
+	.set RB1, enc_in_b1
+	.set RB2, enc_in_b2
+	.set RB3, enc_in_b3
+
+	/* Load input: RA0 = IV, RA1..RB3 = source blocks 0..6.
+	 * movdqu is used throughout, so no 16-byte alignment of iv or
+	 * src is required by these loads.
+	 */
+	movdqu (%rcx), RA0;
+	movdqu 0 * 16(%rdx), RA1;
+	movdqu 1 * 16(%rdx), RA2;
+	movdqu 2 * 16(%rdx), RA3;
+	movdqu 3 * 16(%rdx), RB0;
+	movdqu 4 * 16(%rdx), RB1;
+	movdqu 5 * 16(%rdx), RB2;
+	movdqu 6 * 16(%rdx), RB3;
+
+	/* Update IV: copy source block 7 into the IV buffer now, using
+	 * RNOT as a scratch register.
+	 * NOTE(review): this assumes __serpent_enc_blk8 does not depend on
+	 * the value RNOT holds on entry - confirm against its definition.
+	 */
+	movdqu 7 * 16(%rdx), RNOT;
+	movdqu RNOT, (%rcx);
+
+	call __serpent_enc_blk8;	/* operates on the eight registers loaded above */
+
+	.set RA0, enc_out_a0	/* from here on RA0..RB3 name the enc_out_* registers */
+	.set RA1, enc_out_a1
+	.set RA2, enc_out_a2
+	.set RA3, enc_out_a3
+	.set RB0, enc_out_b0
+	.set RB1, enc_out_b1
+	.set RB2, enc_out_b2
+	.set RB3, enc_out_b3
+
+	pxor_u((0 * 16)(%rdx), RA0, RTMP0);	/* combine result k with source block k, k = 0..7 */
+	pxor_u((1 * 16)(%rdx), RA1, RTMP0);
+	pxor_u((2 * 16)(%rdx), RA2, RTMP0);
+	pxor_u((3 * 16)(%rdx), RA3, RTMP0);
+	pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+	pxor_u((5 * 16)(%rdx), RB1, RTMP0);
+	pxor_u((6 * 16)(%rdx), RB2, RTMP0);
+	pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+
+	movdqu RA0, (0 * 16)(%rsi);	/* unaligned stores of the eight result blocks to dst */
+	movdqu RA1, (1 * 16)(%rsi);
+	movdqu RA2, (2 * 16)(%rsi);
+	movdqu RA3, (3 * 16)(%rsi);
+	movdqu RB0, (4 * 16)(%rsi);
+	movdqu RB1, (5 * 16)(%rsi);
+	movdqu RB2, (6 * 16)(%rsi);
+	movdqu RB3, (7 * 16)(%rsi);
+
+	ret
+.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;
+
 #endif /*defined(USE_SERPENT)*/
 #endif /*__x86_64*/ |