diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-05-29 16:40:27 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-05-29 16:40:27 +0300 |
commit | 9a61edd1f00cefe8ffa3ad54a53eed163883053c (patch) | |
tree | 4a594985a5d7cb57f4fb392113487f78aab022d0 /cipher/blowfish-amd64.S | |
parent | 99b18aa536703ef90c9a1f5c8f40bc68b2064593 (diff) | |
download | libgcrypt-9a61edd1f00cefe8ffa3ad54a53eed163883053c.tar.gz |
blowfish: add amd64 assembly implementation
* cipher/Makefile.am: Add 'blowfish-amd64.S'.
* cipher/blowfish-amd64.S: New file.
* cipher/blowfish.c (USE_AMD64_ASM): New macro.
[USE_AMD64_ASM] (_gcry_blowfish_amd64_do_encrypt)
(_gcry_blowfish_amd64_encrypt_block)
(_gcry_blowfish_amd64_decrypt_block, _gcry_blowfish_amd64_ctr_enc)
(_gcry_blowfish_amd64_cbc_dec, _gcry_blowfish_amd64_cfb_dec): New
prototypes.
[USE_AMD64_ASM] (do_encrypt, do_encrypt_block, do_decrypt_block)
(encrypt_block, decrypt_block): New functions.
(_gcry_blowfish_ctr_enc, _gcry_blowfish_cbc_dec)
(_gcry_blowfish_cfb_dec, selftest_ctr, selftest_cbc, selftest_cfb): New
functions.
(selftest): Call new bulk selftests.
* cipher/cipher.c (gcry_cipher_open) [USE_BLOWFISH]: Register Blowfish
bulk functions for ctr-enc, cbc-dec and cfb-dec.
* configure.ac (blowfish) [x86_64]: Add 'blowfish-amd64.lo'.
* src/cipher.h (_gcry_blowfish_ctr_enc, _gcry_blowfish_cbc_dec)
(gcry_blowfish_cfb_dec): New prototypes.
--
Add non-parallel functions for small speed-up and 4-way parallel functions for
modes of operation that support parallel processing.
Speed old vs. new on AMD Phenom II X6 1055T:
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
BLOWFISH 1.21x 1.12x 1.17x 3.52x 1.18x 3.34x 1.16x 1.15x 3.38x 3.47x
Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge):
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
BLOWFISH 1.16x 1.10x 1.17x 2.98x 1.18x 2.88x 1.16x 1.15x 3.00x 3.02x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/blowfish-amd64.S')
-rw-r--r-- | cipher/blowfish-amd64.S | 533 |
1 files changed, 533 insertions, 0 deletions
diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S new file mode 100644 index 00000000..1008387f --- /dev/null +++ b/cipher/blowfish-amd64.S @@ -0,0 +1,533 @@ +/* blowfish-amd64.S - AMD64 assembly implementation of Blowfish cipher + * + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifdef __x86_64 +#include <config.h> +#if defined(USE_BLOWFISH) + +.text + +/* structure of BLOWFISH_context: */ +#define s0 0 +#define s1 ((s0) + 256 * 4) +#define s2 ((s1) + 256 * 4) +#define s3 ((s2) + 256 * 4) +#define p ((s3) + 256 * 4) + +/* register macros */ +#define CTX %rdi +#define RIO %rsi + +#define RX0 %rax +#define RX1 %rbx +#define RX2 %rcx +#define RX3 %rdx + +#define RX0d %eax +#define RX1d %ebx +#define RX2d %ecx +#define RX3d %edx + +#define RX0bl %al +#define RX1bl %bl +#define RX2bl %cl +#define RX3bl %dl + +#define RX0bh %ah +#define RX1bh %bh +#define RX2bh %ch +#define RX3bh %dh + +#define RT0 %rbp +#define RT1 %rsi +#define RT2 %r8 +#define RT3 %r9 + +#define RT0d %ebp +#define RT1d %esi +#define RT2d %r8d +#define RT3d %r9d + +#define RKEY %r10 + +/*********************************************************************** + * 1-way blowfish + ***********************************************************************/ +#define F() \ + movzbl RX0bh, RT1d; \ + movzbl RX0bl, RT3d; \ + rorq $16, RX0; \ + movzbl RX0bh, RT0d; \ + movzbl RX0bl, RT2d; \ + rorq $16, RX0; \ + movl s0(CTX,RT0,4), RT0d; \ + addl s1(CTX,RT2,4), RT0d; \ + xorl s2(CTX,RT1,4), RT0d; \ + addl s3(CTX,RT3,4), RT0d; \ + xorq RT0, RX0; + +#define load_roundkey_enc(n) \ + movq p+4*(n)(CTX), RX3; + +#define add_roundkey_enc() \ + xorq RX3, RX0; + +#define round_enc(n) \ + add_roundkey_enc(); \ + load_roundkey_enc(n); \ + \ + F(); \ + F(); + +#define load_roundkey_dec(n) \ + movq p+4*(n-1)(CTX), RX3; \ + rorq $32, RX3; + +#define add_roundkey_dec() \ + xorq RX3, RX0; + +#define round_dec(n) \ + add_roundkey_dec(); \ + load_roundkey_dec(n); \ + \ + F(); \ + F(); + +#define read_block() \ + movq (RIO), RX0; \ + rorq $32, RX0; \ + bswapq RX0; + +#define write_block() \ + bswapq RX0; \ + movq RX0, (RIO); + +.align 8 +.type __blowfish_enc_blk1,@function; + +__blowfish_enc_blk1: + /* input: + * %rdi: ctx, CTX + * RX0: input plaintext block + * output: + * RX0: output plaintext block + */ + movq %rbp, %r11; + + load_roundkey_enc(0); + round_enc(2); + round_enc(4); + round_enc(6); + round_enc(8); + round_enc(10); + round_enc(12); + round_enc(14); + round_enc(16); + add_roundkey_enc(); + + movq %r11, %rbp; + + ret; +.size __blowfish_enc_blk1,.-__blowfish_enc_blk1; + +.align 8 +.globl _gcry_blowfish_amd64_do_encrypt +.type _gcry_blowfish_amd64_do_encrypt,@function; + +_gcry_blowfish_amd64_do_encrypt: + /* input: + * %rdi: ctx, CTX + * %rsi: u32 *ret_xl + * %rdx: u32 *ret_xr + */ + movl (%rdx), RX0d; + shlq $32, RX0; + movl (%rsi), RT3d; + movq %rdx, %r10; + orq RT3, RX0; + movq %rsi, RX2; + + call __blowfish_enc_blk1; + + movl RX0d, (%r10); + shrq $32, RX0; + movl RX0d, (RX2); + + ret; +.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt; + +.align 8 +.globl _gcry_blowfish_amd64_encrypt_block +.type _gcry_blowfish_amd64_encrypt_block,@function; + +_gcry_blowfish_amd64_encrypt_block: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r10; + + movq %rdx, RIO; + read_block(); + + call __blowfish_enc_blk1; + + movq %r10, RIO; + write_block(); + + ret; +.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block; + +.align 8 +.globl _gcry_blowfish_amd64_decrypt_block +.type _gcry_blowfish_amd64_decrypt_block,@function; + +_gcry_blowfish_amd64_decrypt_block: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + movq %rbp, %r11; + + movq %rsi, %r10; + movq %rdx, RIO; + + read_block(); + + load_roundkey_dec(17); + round_dec(15); + round_dec(13); + round_dec(11); + round_dec(9); + round_dec(7); + round_dec(5); + round_dec(3); + round_dec(1); + add_roundkey_dec(); + + movq %r10, RIO; + write_block(); + + movq %r11, %rbp; + + ret; +.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block; + +/********************************************************************** + 4-way blowfish, four blocks parallel + **********************************************************************/ +#define F4(x) \ + movzbl x ## bh, RT1d; \ + movzbl x ## bl, RT3d; \ + rorq $16, x; \ + movzbl x ## bh, RT0d; \ + movzbl x ## bl, RT2d; \ + rorq $16, x; \ + movl s0(CTX,RT0,4), RT0d; \ + addl s1(CTX,RT2,4), RT0d; \ + xorl s2(CTX,RT1,4), RT0d; \ + addl s3(CTX,RT3,4), RT0d; \ + xorq RT0, x; + +#define add_preloaded_roundkey4() \ + xorq RKEY, RX0; \ + xorq RKEY, RX1; \ + xorq RKEY, RX2; \ + xorq RKEY, RX3; + +#define preload_roundkey_enc(n) \ + movq p+4*(n)(CTX), RKEY; + +#define add_roundkey_enc4(n) \ + add_preloaded_roundkey4(); \ + preload_roundkey_enc(n + 2); + +#define round_enc4(n) \ + add_roundkey_enc4(n); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); + +#define preload_roundkey_dec(n) \ + movq p+4*((n)-1)(CTX), RKEY; \ + rorq $32, RKEY; + +#define add_roundkey_dec4(n) \ + add_preloaded_roundkey4(); \ + preload_roundkey_dec(n - 2); + +#define round_dec4(n) \ + add_roundkey_dec4(n); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); + +#define inbswap_block4() \ + rorq $32, RX0; \ + bswapq RX0; \ + rorq $32, RX1; \ + bswapq RX1; \ + rorq $32, RX2; \ + bswapq RX2; \ + rorq $32, RX3; \ + bswapq RX3; + +#define inctrswap_block4() \ + rorq $32, RX0; \ + rorq $32, RX1; \ + rorq $32, RX2; \ + rorq $32, RX3; + +#define outbswap_block4() \ + bswapq RX0; \ + bswapq RX1; \ + bswapq RX2; \ + bswapq RX3; + +.align 8 +.type __blowfish_enc_blk4,@function; + +__blowfish_enc_blk4: + /* input: + * %rdi: ctx, CTX + * RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks + * output: + * RX0,RX1,RX2,RX3: four output ciphertext blocks + */ + preload_roundkey_enc(0); + + round_enc4(0); + round_enc4(2); + round_enc4(4); + round_enc4(6); + round_enc4(8); + round_enc4(10); + round_enc4(12); + round_enc4(14); + add_preloaded_roundkey4(); + + outbswap_block4(); + + ret; +.size __blowfish_enc_blk4,.-__blowfish_enc_blk4; + +.align 8 +.type __blowfish_dec_blk4,@function; + +__blowfish_dec_blk4: + /* input: + * %rdi: ctx, CTX + * RX0,RX1,RX2,RX3: four input ciphertext blocks + * output: + * RX0,RX1,RX2,RX3: four output plaintext blocks + */ + preload_roundkey_dec(17); + + inbswap_block4(); + + round_dec4(17); + round_dec4(15); + round_dec4(13); + round_dec4(11); + round_dec4(9); + round_dec4(7); + round_dec4(5); + round_dec4(3); + add_preloaded_roundkey4(); + + outbswap_block4(); + + ret; +.size __blowfish_dec_blk4,.-__blowfish_dec_blk4; + +.align 8 +.globl _gcry_blowfish_amd64_ctr_enc +.type _gcry_blowfish_amd64_ctr_enc,@function; +_gcry_blowfish_amd64_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) + * %rcx: iv (big endian, 64bit) + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + + /* %r11-%r13 are not used by __blowfish_enc_blk4 */ + movq %rcx, %r13; /*iv*/ + movq %rdx, %r12; /*src*/ + movq %rsi, %r11; /*dst*/ + + /* load IV and byteswap */ + movq (%r13), RT0; + bswapq RT0; + movq RT0, RX0; + + /* construct IVs */ + leaq 1(RT0), RX1; + leaq 2(RT0), RX2; + leaq 3(RT0), RX3; + leaq 4(RT0), RT0; + bswapq RT0; + + inctrswap_block4(); + + /* store new IV */ + movq RT0, (%r13); + + call __blowfish_enc_blk4; + + /* XOR key-stream with plaintext */ + xorq 0 * 8(%r12), RX0; + xorq 1 * 8(%r12), RX1; + xorq 2 * 8(%r12), RX2; + xorq 3 * 8(%r12), RX3; + movq RX0, 0 * 8(%r11); + movq RX1, 1 * 8(%r11); + movq RX2, 2 * 8(%r11); + movq RX3, 3 * 8(%r11); + + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc; + +.align 8 +.globl _gcry_blowfish_amd64_cbc_dec +.type _gcry_blowfish_amd64_cbc_dec,@function; +_gcry_blowfish_amd64_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) + * %rcx: iv (64bit) + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + + /* %r11-%r13 are not used by __blowfish_dec_blk4 */ + movq %rsi, %r11; /*dst*/ + movq %rdx, %r12; /*src*/ + movq %rcx, %r13; /*iv*/ + + /* load input */ + movq 0 * 8(%r12), RX0; + movq 1 * 8(%r12), RX1; + movq 2 * 8(%r12), RX2; + movq 3 * 8(%r12), RX3; + + call __blowfish_dec_blk4; + + movq 3 * 8(%r12), RT0; + xorq (%r13), RX0; + xorq 0 * 8(%r12), RX1; + xorq 1 * 8(%r12), RX2; + xorq 2 * 8(%r12), RX3; + movq RT0, (%r13); /* store new IV */ + + movq RX0, 0 * 8(%r11); + movq RX1, 1 * 8(%r11); + movq RX2, 2 * 8(%r11); + movq RX3, 3 * 8(%r11); + + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec; + +.align 8 +.globl _gcry_blowfish_amd64_cfb_dec +.type _gcry_blowfish_amd64_cfb_dec,@function; +_gcry_blowfish_amd64_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) + * %rcx: iv (64bit) + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + + /* %r11-%r13 are not used by __blowfish_enc_blk4 */ + movq %rcx, %r13; /*iv*/ + movq %rdx, %r12; /*src*/ + movq %rsi, %r11; /*dst*/ + + /* Load input */ + movq (%r13), RX0; + movq 0 * 8(%r12), RX1; + movq 1 * 8(%r12), RX2; + movq 2 * 8(%r12), RX3; + + inbswap_block4(); + + /* Update IV */ + movq 3 * 8(%r12), RT0; + movq RT0, (%r13); + + call __blowfish_enc_blk4; + + xorq 0 * 8(%r12), RX0; + xorq 1 * 8(%r12), RX1; + xorq 2 * 8(%r12), RX2; + xorq 3 * 8(%r12), RX3; + movq RX0, 0 * 8(%r11); + movq RX1, 1 * 8(%r11); + movq RX2, 2 * 8(%r11); + movq RX3, 3 * 8(%r11); + + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + ret; +.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec; + +#endif /*defined(USE_BLOWFISH)*/ +#endif /*__x86_64*/ |