summaryrefslogtreecommitdiff
path: root/cipher/blowfish-amd64.S
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2013-05-29 16:40:27 +0300
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2013-05-29 16:40:27 +0300
commit9a61edd1f00cefe8ffa3ad54a53eed163883053c (patch)
tree4a594985a5d7cb57f4fb392113487f78aab022d0 /cipher/blowfish-amd64.S
parent99b18aa536703ef90c9a1f5c8f40bc68b2064593 (diff)
downloadlibgcrypt-9a61edd1f00cefe8ffa3ad54a53eed163883053c.tar.gz
blowfish: add amd64 assembly implementation
* cipher/Makefile.am: Add 'blowfish-amd64.S'.
* cipher/blowfish-amd64.S: New file.
* cipher/blowfish.c (USE_AMD64_ASM): New macro.
[USE_AMD64_ASM] (_gcry_blowfish_amd64_do_encrypt)
(_gcry_blowfish_amd64_encrypt_block)
(_gcry_blowfish_amd64_decrypt_block, _gcry_blowfish_amd64_ctr_enc)
(_gcry_blowfish_amd64_cbc_dec, _gcry_blowfish_amd64_cfb_dec): New prototypes.
[USE_AMD64_ASM] (do_encrypt, do_encrypt_block, do_decrypt_block)
(encrypt_block, decrypt_block): New functions.
(_gcry_blowfish_ctr_enc, _gcry_blowfish_cbc_dec)
(_gcry_blowfish_cfb_dec, selftest_ctr, selftest_cbc, selftest_cfb): New functions.
(selftest): Call new bulk selftests.
* cipher/cipher.c (gcry_cipher_open) [USE_BLOWFISH]: Register Blowfish bulk functions for ctr-enc, cbc-dec and cfb-dec.
* configure.ac (blowfish) [x86_64]: Add 'blowfish-amd64.lo'.
* src/cipher.h (_gcry_blowfish_ctr_enc, _gcry_blowfish_cbc_dec)
(_gcry_blowfish_cfb_dec): New prototypes.
--
Add non-parallel functions for small speed-up and 4-way parallel functions for modes of operation that support parallel processing.

Speed old vs. new on AMD Phenom II X6 1055T:
            ECB/Stream         CBC             CFB             OFB             CTR
          --------------- --------------- --------------- --------------- ---------------
BLOWFISH   1.21x   1.12x   1.17x   3.52x   1.18x   3.34x   1.16x   1.15x   3.38x   3.47x

Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge):
            ECB/Stream         CBC             CFB             OFB             CTR
          --------------- --------------- --------------- --------------- ---------------
BLOWFISH   1.16x   1.10x   1.17x   2.98x   1.18x   2.88x   1.16x   1.15x   3.00x   3.02x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/blowfish-amd64.S')
-rw-r--r--cipher/blowfish-amd64.S533
1 files changed, 533 insertions, 0 deletions
diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S
new file mode 100644
index 00000000..1008387f
--- /dev/null
+++ b/cipher/blowfish-amd64.S
@@ -0,0 +1,533 @@
+/* blowfish-amd64.S - AMD64 assembly implementation of Blowfish cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(USE_BLOWFISH)
+
+.text
+
+/* structure of BLOWFISH_context: */
+/* Byte offsets from CTX.  s0..s3 are four consecutive tables, each
+ * 256 * 4 bytes long (indexed below with a byte value scaled by 4); p
+ * starts directly after s3 and is read below as 32-bit entries
+ * p[0]..p[17].
+ * NOTE(review): these offsets have to agree with the C definition of
+ * BLOWFISH_context, which is not visible in this file - confirm when
+ * changing either side. */
+#define s0 0
+#define s1 ((s0) + 256 * 4)
+#define s2 ((s1) + 256 * 4)
+#define s3 ((s2) + 256 * 4)
+#define p ((s3) + 256 * 4)
+
+/* register macros */
+/* CTX is the context pointer (first argument register).  RIO is the
+ * pointer used by read_block()/write_block().  Note that RIO and RT1
+ * are both %rsi: every F()/F4() overwrites RIO, so it has to be
+ * reloaded before write_block(). */
+#define CTX %rdi
+#define RIO %rsi
+
+/* Block registers.  The 1-way code keeps its block in RX0 and uses RX3
+ * for the round key; the 4-way code keeps one block in each of
+ * RX0..RX3.  RX2/RX3 are %rcx/%rdx, i.e. incoming argument registers
+ * are overwritten once blocks or keys are loaded.  RX1 (%rbx) is
+ * callee-saved in the System V AMD64 ABI. */
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rcx
+#define RX3 %rdx
+
+/* 32-bit views of the block registers */
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %ecx
+#define RX3d %edx
+
+/* bits 0..7 of the block registers */
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %cl
+#define RX3bl %dl
+
+/* bits 8..15 of the block registers */
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %ch
+#define RX3bh %dh
+
+/* Scratch registers used by F()/F4() for the S-box indices and the
+ * running result.  RT0 (%rbp) is callee-saved in the System V AMD64
+ * ABI, so every path that expands F()/F4() must preserve it. */
+#define RT0 %rbp
+#define RT1 %rsi
+#define RT2 %r8
+#define RT3 %r9
+
+#define RT0d %ebp
+#define RT1d %esi
+#define RT2d %r8d
+#define RT3d %r9d
+
+/* Round-key register of the 4-way code (the 1-way code uses RX3). */
+#define RKEY %r10
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+/* F(): one Feistel round on the block in RX0.
+ *
+ * The four bytes of the low 32 bits of RX0 select one entry from each
+ * table: bits 24..31 -> s0, bits 16..23 -> s1, bits 8..15 -> s2,
+ * bits 0..7 -> s3.  The entries are combined with 32-bit arithmetic as
+ * ((s0 + s1) ^ s2) + s3.
+ *
+ * The two 'rorq $16' give access to the upper two index bytes through
+ * the bh/bl sub-registers and together rotate RX0 by 32 bits, i.e. they
+ * swap its halves.  The final xorq therefore mixes the result (zero
+ * extended by the 32-bit operations) into what was the high half, now
+ * sitting in the low half.
+ *
+ * Clobbers RT0..RT3 (RT1 is the same register as RIO) and the flags. */
+#define F() \
+ movzbl RX0bh, RT1d; \
+ movzbl RX0bl, RT3d; \
+ rorq $16, RX0; \
+ movzbl RX0bh, RT0d; \
+ movzbl RX0bl, RT2d; \
+ rorq $16, RX0; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT2,4), RT0d; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT3,4), RT0d; \
+ xorq RT0, RX0;
+
+/* Round keys are handled two at a time in RX3: one 64-bit XOR applies
+ * one 32-bit key to each half of RX0. */
+
+/* RX3 = p[n] (low half) | p[n+1] (high half) */
+#define load_roundkey_enc(n) \
+ movq p+4*(n)(CTX), RX3;
+
+/* Apply the key pair currently held in RX3. */
+#define add_roundkey_enc() \
+ xorq RX3, RX0;
+
+/* Two encryption rounds.  Applies the key pair loaded earlier, then
+ * loads the pair starting at index n for the NEXT invocation: the
+ * argument names the pair being fetched, not the pair being applied. */
+#define round_enc(n) \
+ add_roundkey_enc(); \
+ load_roundkey_enc(n); \
+ \
+ F(); \
+ F();
+
+/* RX3 = p[n] (low half) | p[n-1] (high half): the same 8-byte load as
+ * for encryption, with the halves exchanged by the rotate. */
+#define load_roundkey_dec(n) \
+ movq p+4*(n-1)(CTX), RX3; \
+ rorq $32, RX3;
+
+/* Apply the key pair currently held in RX3. */
+#define add_roundkey_dec() \
+ xorq RX3, RX0;
+
+/* Two decryption rounds; same structure as round_enc() with the key
+ * pairs fetched in descending order. */
+#define round_dec(n) \
+ add_roundkey_dec(); \
+ load_roundkey_dec(n); \
+ \
+ F(); \
+ F();
+
+/* Load the 8-byte block at (RIO) into RX0.  After the rotate and the
+ * byte swap, the low half of RX0 holds bytes 0..3 of the block read as
+ * a big-endian 32-bit word and the high half holds bytes 4..7 read the
+ * same way. */
+#define read_block() \
+ movq (RIO), RX0; \
+ rorq $32, RX0; \
+ bswapq RX0;
+
+/* Store RX0 to (RIO).  Only a byte swap is done here (no rotate), so
+ * the HIGH half of RX0 ends up in bytes 0..3 of the output and the low
+ * half in bytes 4..7, each as a big-endian word.  RIO must have been
+ * reloaded first, because F() overwrites it (RIO == RT1). */
+#define write_block() \
+ bswapq RX0; \
+ movq RX0, (RIO);
+
+.align 8
+.type __blowfish_enc_blk1,@function;
+
+__blowfish_enc_blk1:
+ /* Encrypt the single block held in RX0 (file-local helper, reached
+  * with 'call' from the exported 1-way entry points).
+  *
+  * input:
+  * %rdi: ctx, CTX
+  * RX0: input plaintext block, halves laid out as by read_block()
+  * output:
+  * RX0: output ciphertext block, not yet byte-swapped
+  *      (write_block() does the final bswapq)
+  * clobbers:
+  * RX3, RT1 (= RIO, %rsi), RT2, RT3, %r11, flags
+  */
+ /* F() uses RT0 (%rbp) as scratch; keep the caller's value in %r11. */
+ movq %rbp, %r11;
+
+ /* Fetch p[0],p[1]; each round_enc(n) applies the pending pair and
+  * fetches p[n],p[n+1]; the trailing add applies p[16],p[17]. */
+ load_roundkey_enc(0);
+ round_enc(2);
+ round_enc(4);
+ round_enc(6);
+ round_enc(8);
+ round_enc(10);
+ round_enc(12);
+ round_enc(14);
+ round_enc(16);
+ add_roundkey_enc();
+
+ movq %r11, %rbp;
+
+ ret;
+.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
+
+.align 8
+.globl _gcry_blowfish_amd64_do_encrypt
+.type _gcry_blowfish_amd64_do_encrypt,@function;
+
+_gcry_blowfish_amd64_do_encrypt:
+ /* Encrypt the pair of 32-bit words *ret_xl, *ret_xr and write the
+  * result back through the same two pointers.
+  *
+  * input:
+  * %rdi: ctx, CTX
+  * %rsi: u32 *ret_xl
+  * %rdx: u32 *ret_xr
+  */
+ /* Build RX0 = (*ret_xr << 32) | *ret_xl.  Both pointers are copied
+  * away (to %r10 and RX2) because __blowfish_enc_blk1 overwrites
+  * %rdx (RX3, round key) and %rsi (RT1, scratch). */
+ movl (%rdx), RX0d;
+ shlq $32, RX0;
+ movl (%rsi), RT3d;
+ movq %rdx, %r10;
+ orq RT3, RX0;
+ movq %rsi, RX2;
+
+ call __blowfish_enc_blk1;
+
+ /* low half of the result -> *ret_xr, high half -> *ret_xl */
+ movl RX0d, (%r10);
+ shrq $32, RX0;
+ movl RX0d, (RX2);
+
+ ret;
+.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;
+
+.align 8
+.globl _gcry_blowfish_amd64_encrypt_block
+.type _gcry_blowfish_amd64_encrypt_block,@function;
+
+_gcry_blowfish_amd64_encrypt_block:
+ /* Encrypt one 8-byte block from src and store the result at dst.
+  *
+  * input:
+  * %rdi: ctx, CTX
+  * %rsi: dst
+  * %rdx: src
+  */
+
+ /* dst arrives in %rsi, which is also RIO/RT1; keep it in %r10
+  * across the encryption. */
+ movq %rsi, %r10;
+
+ movq %rdx, RIO;
+ read_block();
+
+ call __blowfish_enc_blk1;
+
+ /* RIO was overwritten by F(); point it at dst for the store. */
+ movq %r10, RIO;
+ write_block();
+
+ ret;
+.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;
+
+.align 8
+.globl _gcry_blowfish_amd64_decrypt_block
+.type _gcry_blowfish_amd64_decrypt_block,@function;
+
+_gcry_blowfish_amd64_decrypt_block:
+ /* Decrypt one 8-byte block from src and store the result at dst.
+  * Unlike the encrypt path, the rounds are expanded inline here
+  * instead of calling a shared helper.
+  *
+  * input:
+  * %rdi: ctx, CTX
+  * %rsi: dst
+  * %rdx: src
+  */
+ /* F() uses RT0 (%rbp) as scratch; keep the caller's value in %r11. */
+ movq %rbp, %r11;
+
+ /* Save dst: %rsi doubles as RIO/RT1 and is overwritten below. */
+ movq %rsi, %r10;
+ movq %rdx, RIO;
+
+ read_block();
+
+ /* Fetch p[17],p[16]; each round_dec(n) applies the pending pair and
+  * fetches p[n],p[n-1]; the trailing add applies p[1],p[0]. */
+ load_roundkey_dec(17);
+ round_dec(15);
+ round_dec(13);
+ round_dec(11);
+ round_dec(9);
+ round_dec(7);
+ round_dec(5);
+ round_dec(3);
+ round_dec(1);
+ add_roundkey_dec();
+
+ movq %r10, RIO;
+ write_block();
+
+ movq %r11, %rbp;
+
+ ret;
+.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;
+
+/**********************************************************************
+ 4-way blowfish, four blocks parallel
+ **********************************************************************/
+/* F4(x): the same round function as F(), applied to block register x
+ * (one of RX0..RX3; the 'x ## bh' / 'x ## bl' pastes select the
+ * matching byte sub-registers).  The low 32 bits of x index s0..s3, the
+ * two 16-bit rotates swap the halves of x, and the 32-bit result
+ * ((s0 + s1) ^ s2) + s3 is XORed into the new low half.
+ * Clobbers RT0..RT3 and the flags. */
+#define F4(x) \
+ movzbl x ## bh, RT1d; \
+ movzbl x ## bl, RT3d; \
+ rorq $16, x; \
+ movzbl x ## bh, RT0d; \
+ movzbl x ## bl, RT2d; \
+ rorq $16, x; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT2,4), RT0d; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT3,4), RT0d; \
+ xorq RT0, x;
+
+/* 4-way round-key handling.  RKEY always holds the pending key pair,
+ * i.e. the pair that the next add_preloaded_roundkey4() applies to all
+ * four blocks with one 64-bit XOR each. */
+#define add_preloaded_roundkey4() \
+ xorq RKEY, RX0; xorq RKEY, RX1; \
+ xorq RKEY, RX2; xorq RKEY, RX3;
+
+/* RKEY = p[n] (low half) | p[n+1] (high half) */
+#define preload_roundkey_enc(n) \
+ movq p+4*(n)(CTX), RKEY;
+
+/* RKEY = p[n] (low half) | p[n-1] (high half) */
+#define preload_roundkey_dec(n) \
+ movq p+4*((n)-1)(CTX), RKEY; \
+ rorq $32, RKEY;
+
+/* One F4() round on each of the four blocks, in register order. */
+#define F4_all_blocks() \
+ F4(RX0); F4(RX1); F4(RX2); F4(RX3);
+
+/* Apply the pending pair, then fetch the pair for the following call. */
+#define add_roundkey_enc4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_enc(n + 2);
+
+#define add_roundkey_dec4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_dec(n - 2);
+
+/* Two encryption rounds on four blocks; n is the index of the key pair
+ * being applied (the pair at n + 2 is fetched for the next call). */
+#define round_enc4(n) \
+ add_roundkey_enc4(n); \
+ F4_all_blocks(); \
+ F4_all_blocks();
+
+/* Two decryption rounds on four blocks; n is the index of the key pair
+ * being applied (the pair at n - 2 is fetched for the next call). */
+#define round_dec4(n) \
+ add_roundkey_dec4(n); \
+ F4_all_blocks(); \
+ F4_all_blocks();
+
+/* Rotate each of the four block registers by 32 bits (exchange the
+ * halves).  Used on its own for values that are already in host byte
+ * order, such as the counters built by the CTR code. */
+#define inctrswap_block4() \
+ rorq $32, RX0; rorq $32, RX1; \
+ rorq $32, RX2; rorq $32, RX3;
+
+/* Reverse the byte order of each of the four block registers. */
+#define outbswap_block4() \
+ bswapq RX0; bswapq RX1; \
+ bswapq RX2; bswapq RX3;
+
+/* Input conversion for blocks loaded straight from memory: per
+ * register a 32-bit rotate followed by a full byte swap (the same
+ * transformation read_block() applies to RX0). */
+#define inbswap_block4() \
+ inctrswap_block4(); \
+ outbswap_block4();
+
+.align 8
+.type __blowfish_enc_blk4,@function;
+
+__blowfish_enc_blk4:
+ /* Encrypt four independent blocks (file-local helper).
+  *
+  * input:
+  * %rdi: ctx, CTX
+  * RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks
+  * output:
+  * RX0,RX1,RX2,RX3: four output ciphertext blocks, already
+  *                  byte-swapped and ready to be stored
+  * clobbers:
+  * RKEY (%r10), RT0..RT3 (%rbp, %rsi, %r8, %r9), flags.
+  * %rbp (RT0) and %rbx (RX1) are NOT preserved here; the callers
+  * in this file push and pop them around the call.
+  */
+ preload_roundkey_enc(0);
+
+ /* round_enc4(n) applies p[n],p[n+1] and fetches p[n+2],p[n+3]. */
+ round_enc4(0);
+ round_enc4(2);
+ round_enc4(4);
+ round_enc4(6);
+ round_enc4(8);
+ round_enc4(10);
+ round_enc4(12);
+ round_enc4(14);
+ /* final key pair p[16],p[17] */
+ add_preloaded_roundkey4();
+
+ outbswap_block4();
+
+ ret;
+.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;
+
+.align 8
+.type __blowfish_dec_blk4,@function;
+
+__blowfish_dec_blk4:
+ /* Decrypt four independent blocks (file-local helper).  Unlike
+  * __blowfish_enc_blk4, the input conversion (inbswap_block4) is
+  * done here, so callers pass the blocks exactly as loaded from
+  * memory.
+  *
+  * input:
+  * %rdi: ctx, CTX
+  * RX0,RX1,RX2,RX3: four input ciphertext blocks
+  * output:
+  * RX0,RX1,RX2,RX3: four output plaintext blocks, already
+  *                  byte-swapped and ready to be stored
+  * clobbers:
+  * RKEY (%r10), RT0..RT3 (%rbp, %rsi, %r8, %r9), flags.
+  * %rbp (RT0) and %rbx (RX1) are NOT preserved here; the caller
+  * in this file pushes and pops them around the call.
+  */
+ preload_roundkey_dec(17);
+
+ inbswap_block4();
+
+ /* round_dec4(n) applies p[n],p[n-1] and fetches p[n-2],p[n-3]. */
+ round_dec4(17);
+ round_dec4(15);
+ round_dec4(13);
+ round_dec4(11);
+ round_dec4(9);
+ round_dec4(7);
+ round_dec4(5);
+ round_dec4(3);
+ /* final key pair p[1],p[0] */
+ add_preloaded_roundkey4();
+
+ outbswap_block4();
+
+ ret;
+.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;
+
+.align 8
+.globl _gcry_blowfish_amd64_ctr_enc
+.type _gcry_blowfish_amd64_ctr_enc,@function;
+_gcry_blowfish_amd64_ctr_enc:
+ /* CTR mode, exactly four blocks per call: encrypts the counter
+  * values iv, iv+1, iv+2, iv+3, XORs the results with src into dst
+  * and writes iv+4 back through the iv pointer.
+  *
+  * input:
+  * %rdi: ctx, CTX
+  * %rsi: dst (4 blocks)
+  * %rdx: src (4 blocks)
+  * %rcx: iv (big endian, 64bit)
+  */
+ /* %rbp (RT0) and %rbx (RX1) are used by the 4-way code; %r12 and
+  * %r13 hold src and iv across the call. */
+ pushq %rbp;
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+
+ /* %r11-%r13 are not used by __blowfish_enc_blk4 */
+ movq %rcx, %r13; /*iv*/
+ movq %rdx, %r12; /*src*/
+ movq %rsi, %r11; /*dst*/
+
+ /* load IV and byteswap */
+ movq (%r13), RT0;
+ bswapq RT0;
+ movq RT0, RX0;
+
+ /* construct IVs */
+ /* The counter is incremented as one 64-bit integer (the leaq
+  * additions wrap modulo 2^64). */
+ leaq 1(RT0), RX1;
+ leaq 2(RT0), RX2;
+ leaq 3(RT0), RX3;
+ leaq 4(RT0), RT0;
+ bswapq RT0;
+
+ /* The counters are already in host byte order, so only the
+  * half-swap part of the input conversion is needed. */
+ inctrswap_block4();
+
+ /* store new IV */
+ movq RT0, (%r13);
+
+ call __blowfish_enc_blk4;
+
+ /* XOR key-stream with plaintext */
+ xorq 0 * 8(%r12), RX0;
+ xorq 1 * 8(%r12), RX1;
+ xorq 2 * 8(%r12), RX2;
+ xorq 3 * 8(%r12), RX3;
+ movq RX0, 0 * 8(%r11);
+ movq RX1, 1 * 8(%r11);
+ movq RX2, 2 * 8(%r11);
+ movq RX3, 3 * 8(%r11);
+
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;
+
+.align 8
+.globl _gcry_blowfish_amd64_cbc_dec
+.type _gcry_blowfish_amd64_cbc_dec,@function;
+_gcry_blowfish_amd64_cbc_dec:
+ /* CBC decryption, exactly four blocks per call:
+  *   dst[0] = D(src[0]) ^ iv,  dst[i] = D(src[i]) ^ src[i-1]
+  * and src[3] is written back through the iv pointer.
+  *
+  * input:
+  * %rdi: ctx, CTX
+  * %rsi: dst (4 blocks)
+  * %rdx: src (4 blocks)
+  * %rcx: iv (64bit)
+  */
+ /* %rbp (RT0) and %rbx (RX1) are used by the 4-way code; %r12 and
+  * %r13 hold src and iv across the call. */
+ pushq %rbp;
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+
+ /* %r11-%r13 are not used by __blowfish_dec_blk4 */
+ movq %rsi, %r11; /*dst*/
+ movq %rdx, %r12; /*src*/
+ movq %rcx, %r13; /*iv*/
+
+ /* load input */
+ movq 0 * 8(%r12), RX0;
+ movq 1 * 8(%r12), RX1;
+ movq 2 * 8(%r12), RX2;
+ movq 3 * 8(%r12), RX3;
+
+ call __blowfish_dec_blk4;
+
+ /* Chain with the previous ciphertext blocks.  The last ciphertext
+  * block is fetched into RT0 and all reads of src and iv are done
+  * before the iv and dst stores below. */
+ movq 3 * 8(%r12), RT0;
+ xorq (%r13), RX0;
+ xorq 0 * 8(%r12), RX1;
+ xorq 1 * 8(%r12), RX2;
+ xorq 2 * 8(%r12), RX3;
+ movq RT0, (%r13); /* store new IV */
+
+ movq RX0, 0 * 8(%r11);
+ movq RX1, 1 * 8(%r11);
+ movq RX2, 2 * 8(%r11);
+ movq RX3, 3 * 8(%r11);
+
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;
+
+.align 8
+.globl _gcry_blowfish_amd64_cfb_dec
+.type _gcry_blowfish_amd64_cfb_dec,@function;
+_gcry_blowfish_amd64_cfb_dec:
+ /* CFB decryption, exactly four blocks per call:
+  *   dst[0] = E(iv) ^ src[0],  dst[i] = E(src[i-1]) ^ src[i]
+  * and src[3] is written back through the iv pointer.  Only the
+  * encryption direction of the cipher is used.
+  *
+  * input:
+  * %rdi: ctx, CTX
+  * %rsi: dst (4 blocks)
+  * %rdx: src (4 blocks)
+  * %rcx: iv (64bit)
+  */
+ /* %rbp (RT0) and %rbx (RX1) are used by the 4-way code; %r12 and
+  * %r13 hold src and iv across the call. */
+ pushq %rbp;
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+
+ /* %r11-%r13 are not used by __blowfish_enc_blk4 */
+ movq %rcx, %r13; /*iv*/
+ movq %rdx, %r12; /*src*/
+ movq %rsi, %r11; /*dst*/
+
+ /* Load input */
+ /* The cipher inputs are the IV followed by the first three
+  * ciphertext blocks. */
+ movq (%r13), RX0;
+ movq 0 * 8(%r12), RX1;
+ movq 1 * 8(%r12), RX2;
+ movq 2 * 8(%r12), RX3;
+
+ inbswap_block4();
+
+ /* Update IV */
+ /* The old IV is already in RX0, so the last ciphertext block can
+  * be stored as the new IV before the encryption runs. */
+ movq 3 * 8(%r12), RT0;
+ movq RT0, (%r13);
+
+ call __blowfish_enc_blk4;
+
+ /* XOR the encrypted feedback with the ciphertext; all four reads
+  * of src are done before the stores to dst. */
+ xorq 0 * 8(%r12), RX0;
+ xorq 1 * 8(%r12), RX1;
+ xorq 2 * 8(%r12), RX2;
+ xorq 3 * 8(%r12), RX3;
+ movq RX0, 0 * 8(%r11);
+ movq RX1, 1 * 8(%r11);
+ movq RX2, 2 * 8(%r11);
+ movq RX3, 3 * 8(%r11);
+
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+ popq %rbp;
+ ret;
+.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;
+
+#endif /*defined(USE_BLOWFISH)*/
+#endif /*__x86_64*/