/* blowfish-amd64.S  -  AMD64 assembly implementation of Blowfish cipher
 *
 * Copyright (C) 2013 Jussi Kivilinna
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), preprocessed
 * with the C preprocessor.
 *
 * Calling convention: every exported entry point below takes its
 * arguments in %rdi, %rsi, %rdx (and %rcx), as listed in each function
 * header.
 * NOTE(review): the same register assignment is used when built for
 * HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS; presumably the C prototypes force
 * the SysV calling convention for these symbols - confirm against caller.
 */

#ifdef __x86_64
#include <config.h>
#if defined(USE_BLOWFISH) && \
    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

/* ELF(...) emits its argument only for the ELF-style assembler; it is used
 * for the .type/.size symbol annotations. */
#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
# define ELF(...) __VA_ARGS__
#else
# define ELF(...) /*_*/
#endif

.text

/* structure of BLOWFISH_context:
 *  four S-boxes of 256 32-bit entries each, followed by the P-array.
 *  The code below reads P-array words 0..17.
 */
#define s0	0
#define s1	((s0) + 256 * 4)
#define s2	((s1) + 256 * 4)
#define s3	((s2) + 256 * 4)
#define p	((s3) + 256 * 4)

/* register macros
 *
 * RX0..RX3 are deliberately %rax/%rbx/%rcx/%rdx: F() and F4() need the
 * legacy high-byte registers (%ah/%bh/%ch/%dh) to extract bits 8..15.
 *
 * RIO and RT1 are both %rsi, so RIO does not survive F()/F4().
 */
#define CTX	%rdi
#define RIO	%rsi

#define RX0	%rax
#define RX1	%rbx
#define RX2	%rcx
#define RX3	%rdx

#define RX0d	%eax
#define RX1d	%ebx
#define RX2d	%ecx
#define RX3d	%edx

#define RX0bl	%al
#define RX1bl	%bl
#define RX2bl	%cl
#define RX3bl	%dl

#define RX0bh	%ah
#define RX1bh	%bh
#define RX2bh	%ch
#define RX3bh	%dh

#define RT0	%rbp
#define RT1	%rsi
#define RT2	%r8
#define RT3	%r9

#define RT0d	%ebp
#define RT1d	%esi
#define RT2d	%r8d
#define RT3d	%r9d

#define RKEY	%r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/* F(): one Feistel half-round on RX0.
 *
 * With b3..b0 the bytes (most to least significant) of the low 32 bits
 * of RX0, computes in 32-bit arithmetic
 *	((S0[b3] + S1[b2]) ^ S2[b1]) + S3[b0]
 * The two 'rorq $16' rotate RX0 by 32 bits in total, so the halves trade
 * places; the result is then XORed into the new low half (RT0 was written
 * through RT0d, so its upper 32 bits are zero).
 * Clobbers: RT0, RT1, RT2, RT3, flags.
 */
#define F() \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT3d; \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT2d; \
	rorq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		RX0;

/* Load P[n] (low half) and P[n+1] (high half) into RX3. */
#define load_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RX3;

/* XOR the previously loaded pair of round keys into both block halves. */
#define add_roundkey_enc() \
	xorq RX3,		RX0;

/* Two encryption rounds: apply the pending key pair, fetch the pair
 * starting at P[n] for the next step, then run two half-rounds. */
#define round_enc(n) \
	add_roundkey_enc(); \
	load_roundkey_enc(n); \
	\
	F(); \
	F();

/* Load P[n] (low half) and P[n-1] (high half) into RX3: the 64-bit load
 * fetches P[n-1],P[n] and the rotate swaps them for decryption order. */
#define load_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RX3; \
	rorq $32,		RX3;

#define add_roundkey_dec() \
	xorq RX3,		RX0;

/* Two decryption rounds; same shape as round_enc with the key pair taken
 * in descending order. */
#define round_dec(n) \
	add_roundkey_dec(); \
	load_roundkey_dec(n); \
	\
	F(); \
	F();

/* Load the 8-byte block at (RIO) into RX0 in the layout used by the round
 * macros (32-bit halves swapped, then all eight bytes reversed). */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/* Byte-reverse RX0 and store it as the 8-byte block at (RIO). */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

.align 8
ELF(.type   __blowfish_enc_blk1,@function;)

__blowfish_enc_blk1:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RX0: input plaintext block
	 * output:
	 *	RX0: output ciphertext block
	 * clobbers:
	 *	RX3 (%rdx), RT1 (%rsi), RT2 (%r8), RT3 (%r9), %r11, flags
	 *	RT0 (%rbp) is used as scratch but preserved via %r11.
	 */
	movq %rbp, %r11;

	/* Keys are consumed as pairs: P[0..1] first, P[16..17] last. */
	load_roundkey_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	round_enc(16);
	add_roundkey_enc();

	movq %r11, %rbp;

	ret;
ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;)

.align 8
.globl  _gcry_blowfish_amd64_do_encrypt
ELF(.type   _gcry_blowfish_amd64_do_encrypt,@function;)

_gcry_blowfish_amd64_do_encrypt:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: u32 *ret_xl
	 *	%rdx: u32 *ret_xr
	 *
	 * Encrypts one block held as two 32-bit words and writes the result
	 * back through the same two pointers.
	 */

	/* RX0 = (*ret_xr << 32) | *ret_xl */
	movl (%rdx), RX0d;
	shlq $32, RX0;
	movl (%rsi), RT3d;
	/* __blowfish_enc_blk1 clobbers %rsi and %rdx; keep the pointers in
	 * %r10 and RX2 (%rcx), which it leaves untouched. */
	movq %rdx, %r10;
	orq RT3, RX0;
	movq %rsi, RX2;

	call __blowfish_enc_blk1;

	/* Low half of the result goes to *ret_xr, high half to *ret_xl. */
	movl RX0d, (%r10);
	shrq $32, RX0;
	movl RX0d, (RX2);

	ret;
ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;)

.align 8
.globl  _gcry_blowfish_amd64_encrypt_block
ELF(.type   _gcry_blowfish_amd64_encrypt_block,@function;)

_gcry_blowfish_amd64_encrypt_block:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Encrypts the 8-byte block at src and stores the result at dst.
	 */

	/* RIO is %rsi; park dst in %r10 while RIO points at src and while
	 * __blowfish_enc_blk1 uses %rsi as scratch. */
	movq %rsi, %r10;

	movq %rdx, RIO;

	read_block();

	call __blowfish_enc_blk1;

	movq %r10, RIO;
	write_block();

	ret;
ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;)

.align 8
.globl  _gcry_blowfish_amd64_decrypt_block
ELF(.type   _gcry_blowfish_amd64_decrypt_block,@function;)

_gcry_blowfish_amd64_decrypt_block:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Decrypts the 8-byte block at src and stores the result at dst.
	 */

	/* RT0 is %rbp; preserve it in %r11 across the rounds. */
	movq %rbp, %r11;

	/* RIO/RT1 share %rsi, so dst is kept in %r10 until the final store. */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	/* Keys are consumed as pairs in descending order: P[17..16] first,
	 * P[1..0] last. */
	load_roundkey_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	round_dec(1);
	add_roundkey_dec();

	movq %r10, RIO;
	write_block();

	movq %r11, %rbp;

	ret;
ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F4(x): same half-round as F(), applied to block register x, which must
 * be one of RX0..RX3 so that 'x ## bh' / 'x ## bl' name its byte
 * sub-registers.
 * Clobbers: RT0, RT1, RT2, RT3, flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* XOR the key pair held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = P[n] (low half), P[n+1] (high half). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* Apply the key pair for step n and prefetch the pair for step n + 2. */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/* Two encryption rounds on each of the four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = P[n] (low half), P[n-1] (high half). */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* Apply the key pair for step n and prefetch the pair for step n - 2. */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* Two decryption rounds on each of the four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* Convert four blocks as loaded from memory into the round layout
 * (same transform as read_block(): swap halves, then reverse bytes). */
#define inbswap_block4() \
	rorq $32,		RX0; \
	bswapq			RX0; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* Input transform for counter values that have already been byte-swapped
 * by the caller: only the half swap is applied. */
#define inctrswap_block4() \
	rorq $32,		RX0; \
	rorq $32,		RX1; \
	rorq $32,		RX2; \
	rorq $32,		RX3;

/* Byte-reverse the four result blocks before they are used with memory. */
#define outbswap_block4() \
	bswapq			RX0; \
	bswapq			RX1; \
	bswapq			RX2; \
	bswapq			RX3;

.align 8
ELF(.type   __blowfish_enc_blk4,@function;)

__blowfish_enc_blk4:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks
	 * output:
	 *	RX0,RX1,RX2,RX3: four output ciphertext blocks
	 * clobbers:
	 *	RT0 (%rbp), RT1 (%rsi), RT2 (%r8), RT3 (%r9), RKEY (%r10),
	 *	flags.  %rbp is NOT preserved here; callers save it.
	 */
	preload_roundkey_enc(0);

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	outbswap_block4();

	ret;
ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;)

.align 8
ELF(.type   __blowfish_dec_blk4,@function;)

__blowfish_dec_blk4:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RX0,RX1,RX2,RX3: four input ciphertext blocks
	 * output:
	 *	RX0,RX1,RX2,RX3: four output plaintext blocks
	 * clobbers:
	 *	RT0 (%rbp), RT1 (%rsi), RT2 (%r8), RT3 (%r9), RKEY (%r10),
	 *	flags.  %rbp is NOT preserved here; callers save it.
	 *
	 * Unlike __blowfish_enc_blk4, the input transform (inbswap_block4)
	 * is performed here rather than by the caller.
	 */
	preload_roundkey_dec(17);

	inbswap_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	outbswap_block4();

	ret;
ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;)

.align 8
.globl  _gcry_blowfish_amd64_ctr_enc
ELF(.type   _gcry_blowfish_amd64_ctr_enc,@function;)

_gcry_blowfish_amd64_ctr_enc:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (4 blocks)
	 *	%rdx: src (4 blocks)
	 *	%rcx: iv (big endian, 64bit)
	 *
	 * Encrypts counter values iv, iv+1, iv+2, iv+3, XORs the results
	 * with the four blocks at src, stores them at dst, and writes
	 * iv+4 back to *iv.  The additions are plain 64-bit adds, so the
	 * counter wraps modulo 2^64.
	 */

	/* RT0 is %rbp, RX1 is %rbx; %r12/%r13 hold src/iv across the call. */
	pushq %rbp;
	pushq %rbx;
	pushq %r12;
	pushq %r13;

	/* %r11-%r13 are not used by __blowfish_enc_blk4 */
	movq %rcx, %r13; /*iv*/
	movq %rdx, %r12; /*src*/
	movq %rsi, %r11; /*dst*/

	/* load IV and byteswap */
	movq (%r13), RT0;
	bswapq RT0;
	movq RT0, RX0;

	/* construct IVs */
	leaq 1(RT0), RX1;
	leaq 2(RT0), RX2;
	leaq 3(RT0), RX3;
	leaq 4(RT0), RT0;
	bswapq RT0;

	inctrswap_block4();

	/* store new IV */
	movq RT0, (%r13);

	call __blowfish_enc_blk4;

	/* XOR key-stream with plaintext */
	xorq 0 * 8(%r12), RX0;
	xorq 1 * 8(%r12), RX1;
	xorq 2 * 8(%r12), RX2;
	xorq 3 * 8(%r12), RX3;
	movq RX0, 0 * 8(%r11);
	movq RX1, 1 * 8(%r11);
	movq RX2, 2 * 8(%r11);
	movq RX3, 3 * 8(%r11);

	popq %r13;
	popq %r12;
	popq %rbx;
	popq %rbp;

	ret;
ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;)

.align 8
.globl  _gcry_blowfish_amd64_cbc_dec
ELF(.type   _gcry_blowfish_amd64_cbc_dec,@function;)

_gcry_blowfish_amd64_cbc_dec:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (4 blocks)
	 *	%rdx: src (4 blocks)
	 *	%rcx: iv (64bit)
	 *
	 * Decrypts the four blocks at src, XORs block 0 with *iv and block
	 * i (i = 1..3) with src block i-1, stores the results at dst, and
	 * sets *iv to src block 3.
	 */

	/* RT0 is %rbp, RX1 is %rbx; %r12/%r13 hold src/iv across the call. */
	pushq %rbp;
	pushq %rbx;
	pushq %r12;
	pushq %r13;

	/* %r11-%r13 are not used by __blowfish_dec_blk4 */
	movq %rsi, %r11; /*dst*/
	movq %rdx, %r12; /*src*/
	movq %rcx, %r13; /*iv*/

	/* load input */
	movq 0 * 8(%r12), RX0;
	movq 1 * 8(%r12), RX1;
	movq 2 * 8(%r12), RX2;
	movq 3 * 8(%r12), RX3;

	call __blowfish_dec_blk4;

	/* All reads of src and *iv happen before the first store to dst
	 * or *iv. */
	movq 3 * 8(%r12), RT0;
	xorq      (%r13), RX0;
	xorq 0 * 8(%r12), RX1;
	xorq 1 * 8(%r12), RX2;
	xorq 2 * 8(%r12), RX3;
	movq RT0, (%r13); /* store new IV */

	movq RX0, 0 * 8(%r11);
	movq RX1, 1 * 8(%r11);
	movq RX2, 2 * 8(%r11);
	movq RX3, 3 * 8(%r11);

	popq %r13;
	popq %r12;
	popq %rbx;
	popq %rbp;

	ret;
ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;)

.align 8
.globl  _gcry_blowfish_amd64_cfb_dec
ELF(.type   _gcry_blowfish_amd64_cfb_dec,@function;)

_gcry_blowfish_amd64_cfb_dec:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (4 blocks)
	 *	%rdx: src (4 blocks)
	 *	%rcx: iv (64bit)
	 *
	 * Encrypts *iv and src blocks 0..2, XORs the four results with src
	 * blocks 0..3, stores them at dst, and sets *iv to src block 3.
	 */

	/* RT0 is %rbp, RX1 is %rbx; %r12/%r13 hold src/iv across the call. */
	pushq %rbp;
	pushq %rbx;
	pushq %r12;
	pushq %r13;

	/* %r11-%r13 are not used by __blowfish_enc_blk4 */
	movq %rcx, %r13; /*iv*/
	movq %rdx, %r12; /*src*/
	movq %rsi, %r11; /*dst*/

	/* Load input */
	movq (%r13), RX0;
	movq 0 * 8(%r12), RX1;
	movq 1 * 8(%r12), RX2;
	movq 2 * 8(%r12), RX3;

	inbswap_block4();

	/* Update IV */
	movq 3 * 8(%r12), RT0;
	movq RT0, (%r13);

	call __blowfish_enc_blk4;

	xorq 0 * 8(%r12), RX0;
	xorq 1 * 8(%r12), RX1;
	xorq 2 * 8(%r12), RX2;
	xorq 3 * 8(%r12), RX3;
	movq RX0, 0 * 8(%r11);
	movq RX1, 1 * 8(%r11);
	movq RX2, 2 * 8(%r11);
	movq RX3, 3 * 8(%r11);

	popq %r13;
	popq %r12;
	popq %rbx;
	popq %rbp;

	ret;
ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;)

#endif /*defined(USE_BLOWFISH)*/
#endif /*__x86_64*/