summaryrefslogtreecommitdiff
path: root/cipher/serpent-sse2-amd64.S
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2013-05-23 14:15:51 +0300
committerWerner Koch <wk@gnupg.org>2013-05-23 17:35:05 +0200
commit6deb0ccdf718a0670f80e6762a3842caf76437d6 (patch)
tree5bf7f4bb2de2049de8eb0a9d6ce6d2ce43f93a40 /cipher/serpent-sse2-amd64.S
parentb60f06f70227c1e69e1010da8b47ea51ade48145 (diff)
downloadlibgcrypt-6deb0ccdf718a0670f80e6762a3842caf76437d6.tar.gz
serpent: add parallel processing for CFB decryption
* cipher/cipher.c (gcry_cipher_open): Add bulk CFB decryption function for Serpent. * cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_cfb_dec): New function. * cipher/serpent.c (_gcry_serpent_sse2_cfb_dec): New prototype. (_gcry_serpent_cfb_dec): New function. (selftest_cfb_128): New function. (selftest): Call selftest_cfb_128. * src/cipher.h (_gcry_serpent_cfb_dec): New prototype. -- Patch makes Serpent-CFB decryption 4.0 times faster on Intel Sandy-Bridge and 2.7 times faster on AMD K10. Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/serpent-sse2-amd64.S')
-rw-r--r--cipher/serpent-sse2-amd64.S66
1 files changed, 66 insertions, 0 deletions
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index 8d8c8dda..5f9e9d22 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -822,5 +822,71 @@ _gcry_serpent_sse2_cbc_dec:
ret
.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;
+.align 8
+.global _gcry_serpent_sse2_cfb_dec
+.type _gcry_serpent_sse2_cfb_dec,@function;
+_gcry_serpent_sse2_cfb_dec:
+ /* Handles 8 blocks of 16 bytes per call. Intended data flow, assuming __serpent_enc_blk8 (not visible here) encrypts 8 blocks and pxor_u XORs memory into a register: dst[i] = E(i ? src[i-1] : iv) ^ src[i]. input:
+ * %rdi: ctx, CTX (not referenced directly in this body; presumably consumed by __serpent_enc_blk8 -- confirm)
+ * %rsi: dst (8 blocks); stored to only after the last reference to src
+ * %rdx: src (8 blocks)
+ * %rcx: iv; loaded as the first input block, then overwritten with src block 7 before the call
+ */
+ /* Name the working registers RA0..RB3 after the enc_in_* symbols for the loads that follow. */
+ .set RA0, enc_in_a0
+ .set RA1, enc_in_a1
+ .set RA2, enc_in_a2
+ .set RA3, enc_in_a3
+ .set RB0, enc_in_b0
+ .set RB1, enc_in_b1
+ .set RB2, enc_in_b2
+ .set RB3, enc_in_b3
+
+ /* Load input: iv goes in first, followed by src blocks 0..6 (src block 7 is not an input to the call) */
+ movdqu (%rcx), RA0;
+ movdqu 0 * 16(%rdx), RA1;
+ movdqu 1 * 16(%rdx), RA2;
+ movdqu 2 * 16(%rdx), RA3;
+ movdqu 3 * 16(%rdx), RB0;
+ movdqu 4 * 16(%rdx), RB1;
+ movdqu 5 * 16(%rdx), RB2;
+ movdqu 6 * 16(%rdx), RB3;
+
+ /* Update IV: copy src block 7 into iv, with RNOT as scratch. NOTE(review): assumes __serpent_enc_blk8 does not depend on RNOT's prior value -- confirm. */
+ movdqu 7 * 16(%rdx), RNOT;
+ movdqu RNOT, (%rcx);
+
+ call __serpent_enc_blk8;
+ /* From here on the results are addressed through the enc_out_* symbols, so RA0..RB3 are rebound. */
+ .set RA0, enc_out_a0
+ .set RA1, enc_out_a1
+ .set RA2, enc_out_a2
+ .set RA3, enc_out_a3
+ .set RB0, enc_out_b0
+ .set RB1, enc_out_b1
+ .set RB2, enc_out_b2
+ .set RB3, enc_out_b3
+ /* Combine each result with src blocks 0..7 via pxor_u; RTMP0 is the macro's third argument (presumably a temporary for the unaligned load -- macro not visible here). */
+ pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((1 * 16)(%rdx), RA1, RTMP0);
+ pxor_u((2 * 16)(%rdx), RA2, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA3, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((5 * 16)(%rdx), RB1, RTMP0);
+ pxor_u((6 * 16)(%rdx), RB2, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+ /* Write the 8 result blocks to dst with unaligned stores. */
+ movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA1, (1 * 16)(%rsi);
+ movdqu RA2, (2 * 16)(%rsi);
+ movdqu RA3, (3 * 16)(%rsi);
+ movdqu RB0, (4 * 16)(%rsi);
+ movdqu RB1, (5 * 16)(%rsi);
+ movdqu RB2, (6 * 16)(%rsi);
+ movdqu RB3, (7 * 16)(%rsi);
+
+ ret
+.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;
+
#endif /*defined(USE_SERPENT)*/
#endif /*__x86_64*/