diff options
-rw-r--r-- | cipher/Makefile.am | 2 | ||||
-rw-r--r-- | cipher/cipher.c | 8 | ||||
-rw-r--r-- | cipher/twofish-amd64.S | 731 | ||||
-rw-r--r-- | cipher/twofish.c | 278 | ||||
-rw-r--r-- | configure.ac | 7 | ||||
-rw-r--r-- | src/cipher.h | 11 |
6 files changed, 1036 insertions, 1 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 7439cc97..cf9fc336 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -75,7 +75,7 @@ sha256.c \ sha512.c \ tiger.c \ whirlpool.c \ -twofish.c \ +twofish.c twofish-amd64.S \ rfc2268.c \ camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S diff --git a/cipher/cipher.c b/cipher/cipher.c index 508f26fc..d7ebea84 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -750,6 +750,14 @@ gcry_cipher_open (gcry_cipher_hd_t *handle, h->bulk.ctr_enc = _gcry_serpent_ctr_enc; break; #endif /*USE_SERPENT*/ +#ifdef USE_TWOFISH + case GCRY_CIPHER_TWOFISH: + case GCRY_CIPHER_TWOFISH128: + h->bulk.cbc_dec = _gcry_twofish_cbc_dec; + h->bulk.cfb_dec = _gcry_twofish_cfb_dec; + h->bulk.ctr_enc = _gcry_twofish_ctr_enc; + break; +#endif /*USE_TWOFISH*/ default: break; diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S new file mode 100644 index 00000000..a2a878ae --- /dev/null +++ b/cipher/twofish-amd64.S @@ -0,0 +1,731 @@ +/* twofish-amd64.S - AMD64 assembly implementation of Twofish cipher + * + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifdef __x86_64 +#include <config.h> +#if defined(USE_TWOFISH) + +#ifdef __PIC__ +# define RIP %rip +#else +# define RIP +#endif + +.text + +/* structure of TWOFISH_context: */ +#define s0 0 +#define s1 ((s0) + 4 * 256) +#define s2 ((s1) + 4 * 256) +#define s3 ((s2) + 4 * 256) +#define w ((s3) + 4 * 256) +#define k ((w) + 4 * 8) + +/* register macros */ +#define CTX %rdi + +#define RA %rax +#define RB %rbx +#define RC %rcx +#define RD %rdx + +#define RAd %eax +#define RBd %ebx +#define RCd %ecx +#define RDd %edx + +#define RAbl %al +#define RBbl %bl +#define RCbl %cl +#define RDbl %dl + +#define RAbh %ah +#define RBbh %bh +#define RCbh %ch +#define RDbh %dh + +#define RX %r8 +#define RY %r9 + +#define RXd %r8d +#define RYd %r9d + +#define RT0 %rsi +#define RT1 %rbp +#define RT2 %r10 +#define RT3 %r11 + +#define RT0d %esi +#define RT1d %ebp +#define RT2d %r10d +#define RT3d %r11d + +/*********************************************************************** + * AMD64 assembly implementation of the Twofish cipher + ***********************************************************************/ +#define enc_g1_2(a, b, x, y) \ + movzbl b ## bl, RT3d; \ + movzbl b ## bh, RT1d; \ + movzbl a ## bl, RT2d; \ + movzbl a ## bh, RT0d; \ + rorl $16, b ## d; \ + rorl $16, a ## d; \ + movl s1(CTX, RT3, 4), RYd; \ + movzbl b ## bl, RT3d; \ + movl s0(CTX, RT2, 4), RXd; \ + movzbl a ## bl, RT2d; \ + xorl s2(CTX, RT1, 4), RYd; \ + movzbl b ## bh, RT1d; \ + xorl s1(CTX, RT0, 4), RXd; \ + movzbl a ## bh, RT0d; \ + rorl $16, b ## d; \ + rorl $16, a ## d; \ + xorl s3(CTX, RT3, 4), RYd; \ + xorl s2(CTX, RT2, 4), RXd; \ + xorl s0(CTX, RT1, 4), RYd; \ + xorl s3(CTX, RT0, 4), RXd; + +#define dec_g1_2(a, b, x, y) \ + movzbl a ## bl, RT2d; \ + movzbl a ## bh, RT0d; \ + movzbl b ## bl, RT3d; \ + movzbl b ## bh, RT1d; \ + rorl $16, a ## d; \ + rorl $16, b ## d; \ + movl s0(CTX, RT2, 4), RXd; \ + movzbl a ## bl, RT2d; \ + movl s1(CTX, RT3, 4), RYd; \ + movzbl b ## bl, RT3d; \ + xorl s1(CTX, RT0, 
4), RXd; \ + movzbl a ## bh, RT0d; \ + xorl s2(CTX, RT1, 4), RYd; \ + movzbl b ## bh, RT1d; \ + rorl $16, a ## d; \ + rorl $16, b ## d; \ + xorl s2(CTX, RT2, 4), RXd; \ + xorl s3(CTX, RT3, 4), RYd; \ + xorl s3(CTX, RT0, 4), RXd; \ + xorl s0(CTX, RT1, 4), RYd; + +#define encrypt_round(ra, rb, rc, rd, n) \ + enc_g1_2(##ra, ##rb, RX, RY); \ + \ + leal (RXd, RYd, 2), RT0d; \ + addl RYd, RXd; \ + addl (k + 8 * (n) + 4)(CTX), RT0d; \ + roll $1, rd ## d; \ + addl (k + 8 * (n))(CTX), RXd; \ + xorl RT0d, rd ## d; \ + xorl RXd, rc ## d; \ + rorl $1, rc ## d; + +#define decrypt_round(ra, rb, rc, rd, n) \ + dec_g1_2(##ra, ##rb, RX, RY); \ + \ + leal (RXd, RYd, 2), RT0d; \ + addl RYd, RXd; \ + addl (k + 8 * (n) + 4)(CTX), RT0d; \ + roll $1, rc ## d; \ + addl (k + 8 * (n))(CTX), RXd; \ + xorl RXd, rc ## d; \ + xorl RT0d, rd ## d; \ + rorl $1, rd ## d; + +#define encrypt_cycle(a, b, c, d, nc) \ + encrypt_round(##a, ##b, ##c, ##d, (nc) * 2); \ + encrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1); + +#define decrypt_cycle(a, b, c, d, nc) \ + decrypt_round(##c, ##d, ##a, ##b, (nc) * 2 + 1); \ + decrypt_round(##a, ##b, ##c, ##d, (nc) * 2); + +#define inpack(in, n, x, m) \ + movl (4 * (n))(in), x; \ + xorl (w + 4 * (m))(CTX), x; + +#define outunpack(out, n, x, m) \ + xorl (w + 4 * (m))(CTX), x; \ + movl x, (4 * (n))(out); + +.align 8 +.global _gcry_twofish_amd64_encrypt_block +.type _gcry_twofish_amd64_encrypt_block,@function; + +_gcry_twofish_amd64_encrypt_block: + /* input: + * %rdi: context, CTX + * %rsi: dst + * %rdx: src + */ + subq $(3 * 8), %rsp; + movq %rsi, (0 * 8)(%rsp); + movq %rbp, (1 * 8)(%rsp); + movq %rbx, (2 * 8)(%rsp); + + movq %rdx, RX; + inpack(RX, 0, RAd, 0); + inpack(RX, 1, RBd, 1); + inpack(RX, 2, RCd, 2); + inpack(RX, 3, RDd, 3); + + encrypt_cycle(RA, RB, RC, RD, 0); + encrypt_cycle(RA, RB, RC, RD, 1); + encrypt_cycle(RA, RB, RC, RD, 2); + encrypt_cycle(RA, RB, RC, RD, 3); + encrypt_cycle(RA, RB, RC, RD, 4); + encrypt_cycle(RA, RB, RC, RD, 5); + encrypt_cycle(RA, 
RB, RC, RD, 6); + encrypt_cycle(RA, RB, RC, RD, 7); + + movq (0 * 8)(%rsp), RX; /*dst*/ + outunpack(RX, 0, RCd, 4); + outunpack(RX, 1, RDd, 5); + outunpack(RX, 2, RAd, 6); + outunpack(RX, 3, RBd, 7); + + movq (2 * 8)(%rsp), %rbx; + movq (1 * 8)(%rsp), %rbp; + addq $(3 * 8), %rsp; + + ret; +.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block; + +.align 8 +.global _gcry_twofish_amd64_decrypt_block +.type _gcry_twofish_amd64_decrypt_block,@function; + +_gcry_twofish_amd64_decrypt_block: + /* input: + * %rdi: context, CTX + * %rsi: dst + * %rdx: src + */ + subq $(3 * 8), %rsp; + movq %rsi, (0 * 8)(%rsp); + movq %rbp, (1 * 8)(%rsp); + movq %rbx, (2 * 8)(%rsp); + + movq %rdx, RX; + inpack(RX, 0, RCd, 4); + inpack(RX, 1, RDd, 5); + inpack(RX, 2, RAd, 6); + inpack(RX, 3, RBd, 7); + + decrypt_cycle(RA, RB, RC, RD, 7); + decrypt_cycle(RA, RB, RC, RD, 6); + decrypt_cycle(RA, RB, RC, RD, 5); + decrypt_cycle(RA, RB, RC, RD, 4); + decrypt_cycle(RA, RB, RC, RD, 3); + decrypt_cycle(RA, RB, RC, RD, 2); + decrypt_cycle(RA, RB, RC, RD, 1); + decrypt_cycle(RA, RB, RC, RD, 0); + + movq (0 * 8)(%rsp), RX; /*dst*/ + outunpack(RX, 0, RAd, 0); + outunpack(RX, 1, RBd, 1); + outunpack(RX, 2, RCd, 2); + outunpack(RX, 3, RDd, 3); + + movq (2 * 8)(%rsp), %rbx; + movq (1 * 8)(%rsp), %rbp; + addq $(3 * 8), %rsp; + + ret; +.size _gcry_twofish_amd64_decrypt_block,.-_gcry_twofish_amd64_decrypt_block; + +#undef CTX + +#undef RA +#undef RB +#undef RC +#undef RD + +#undef RAd +#undef RBd +#undef RCd +#undef RDd + +#undef RAbl +#undef RBbl +#undef RCbl +#undef RDbl + +#undef RAbh +#undef RBbh +#undef RCbh +#undef RDbh + +#undef RX +#undef RY + +#undef RXd +#undef RYd + +#undef RT0 +#undef RT1 +#undef RT2 +#undef RT3 + +#undef RT0d +#undef RT1d +#undef RT2d +#undef RT3d + +/*********************************************************************** + * AMD64 assembly implementation of the Twofish cipher, 3-way parallel + 
***********************************************************************/ +#define CTX %rdi +#define RIO %rdx + +#define RAB0 %rax +#define RAB1 %rbx +#define RAB2 %rcx + +#define RAB0d %eax +#define RAB1d %ebx +#define RAB2d %ecx + +#define RAB0bh %ah +#define RAB1bh %bh +#define RAB2bh %ch + +#define RAB0bl %al +#define RAB1bl %bl +#define RAB2bl %cl + +#define RCD0 %r8 +#define RCD1 %r9 +#define RCD2 %r10 + +#define RCD0d %r8d +#define RCD1d %r9d +#define RCD2d %r10d + +#define RX0 %rbp +#define RX1 %r11 +#define RX2 %r12 + +#define RX0d %ebp +#define RX1d %r11d +#define RX2d %r12d + +#define RY0 %r13 +#define RY1 %r14 +#define RY2 %r15 + +#define RY0d %r13d +#define RY1d %r14d +#define RY2d %r15d + +#define RT0 %rdx +#define RT1 %rsi + +#define RT0d %edx +#define RT1d %esi + +#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ + movzbl ab ## bl, tmp2 ## d; \ + movzbl ab ## bh, tmp1 ## d; \ + rorq $(rot), ab; \ + op1##l T0(CTX, tmp2, 4), dst ## d; \ + op2##l T1(CTX, tmp1, 4), dst ## d; + +/* + * Combined G1 & G2 function. Reordered with help of rotates to have moves + * at beginning. 
+ */ +#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ + /* G1,1 && G2,1 */ \ + do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \ + do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \ + \ + do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \ + do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \ + \ + do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \ + do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \ + \ + /* G1,2 && G2,2 */ \ + do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ + do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ + xchgq cd ## 0, ab ## 0; \ + \ + do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ + do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ + xchgq cd ## 1, ab ## 1; \ + \ + do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ + do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ + xchgq cd ## 2, ab ## 2; + +#define enc_round_end(ab, x, y, n) \ + addl y ## d, x ## d; \ + addl x ## d, y ## d; \ + addl k+4*(2*(n))(CTX), x ## d; \ + xorl ab ## d, x ## d; \ + addl k+4*(2*(n)+1)(CTX), y ## d; \ + shrq $32, ab; \ + roll $1, ab ## d; \ + xorl y ## d, ab ## d; \ + shlq $32, ab; \ + rorl $1, x ## d; \ + orq x, ab; + +#define dec_round_end(ba, x, y, n) \ + addl y ## d, x ## d; \ + addl x ## d, y ## d; \ + addl k+4*(2*(n))(CTX), x ## d; \ + addl k+4*(2*(n)+1)(CTX), y ## d; \ + xorl ba ## d, y ## d; \ + shrq $32, ba; \ + roll $1, ba ## d; \ + xorl x ## d, ba ## d; \ + shlq $32, ba; \ + rorl $1, y ## d; \ + orq y, ba; + +#define encrypt_round3(ab, cd, n) \ + g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \ + \ + enc_round_end(ab ## 0, RX0, RY0, n); \ + enc_round_end(ab ## 1, RX1, RY1, n); \ + enc_round_end(ab ## 2, RX2, RY2, n); + +#define decrypt_round3(ba, dc, n) \ + g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \ + \ + 
dec_round_end(ba ## 0, RX0, RY0, n); \ + dec_round_end(ba ## 1, RX1, RY1, n); \ + dec_round_end(ba ## 2, RX2, RY2, n); + +#define encrypt_cycle3(ab, cd, n) \ + encrypt_round3(ab, cd, n*2); \ + encrypt_round3(ab, cd, (n*2)+1); + +#define decrypt_cycle3(ba, dc, n) \ + decrypt_round3(ba, dc, (n*2)+1); \ + decrypt_round3(ba, dc, (n*2)); + +#define inpack3(xy, m) \ + xorq w+4*m(CTX), xy ## 0; \ + xorq w+4*m(CTX), xy ## 1; \ + xorq w+4*m(CTX), xy ## 2; + +#define outunpack3(xy, m) \ + xorq w+4*m(CTX), xy ## 0; \ + xorq w+4*m(CTX), xy ## 1; \ + xorq w+4*m(CTX), xy ## 2; + +#define inpack_enc3() \ + inpack3(RAB, 0); \ + inpack3(RCD, 2); + +#define outunpack_enc3() \ + outunpack3(RAB, 6); \ + outunpack3(RCD, 4); + +#define inpack_dec3() \ + inpack3(RAB, 4); \ + rorq $32, RAB0; \ + rorq $32, RAB1; \ + rorq $32, RAB2; \ + inpack3(RCD, 6); \ + rorq $32, RCD0; \ + rorq $32, RCD1; \ + rorq $32, RCD2; + +#define outunpack_dec3() \ + rorq $32, RCD0; \ + rorq $32, RCD1; \ + rorq $32, RCD2; \ + outunpack3(RCD, 0); \ + rorq $32, RAB0; \ + rorq $32, RAB1; \ + rorq $32, RAB2; \ + outunpack3(RAB, 2); + +.align 8 +.type __twofish_enc_blk3,@function; + +__twofish_enc_blk3: + /* input: + * %rdi: ctx, CTX + * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three plaintext blocks + * output: + * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three ciphertext blocks + */ + inpack_enc3(); + + encrypt_cycle3(RAB, RCD, 0); + encrypt_cycle3(RAB, RCD, 1); + encrypt_cycle3(RAB, RCD, 2); + encrypt_cycle3(RAB, RCD, 3); + encrypt_cycle3(RAB, RCD, 4); + encrypt_cycle3(RAB, RCD, 5); + encrypt_cycle3(RAB, RCD, 6); + encrypt_cycle3(RAB, RCD, 7); + + outunpack_enc3(); + + ret; +.size __twofish_enc_blk3,.-__twofish_enc_blk3; + +.align 8 +.type __twofish_dec_blk3,@function; + +__twofish_dec_blk3: + /* input: + * %rdi: ctx, CTX + * RAB0,RCD0,RAB1,RCD1,RAB2,RCD2: three ciphertext blocks + * output: + * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three plaintext blocks + */ + inpack_dec3(); + + decrypt_cycle3(RAB, RCD, 7); + decrypt_cycle3(RAB, RCD, 6); + 
decrypt_cycle3(RAB, RCD, 5); + decrypt_cycle3(RAB, RCD, 4); + decrypt_cycle3(RAB, RCD, 3); + decrypt_cycle3(RAB, RCD, 2); + decrypt_cycle3(RAB, RCD, 1); + decrypt_cycle3(RAB, RCD, 0); + + outunpack_dec3(); + + ret; +.size __twofish_dec_blk3,.-__twofish_dec_blk3; + +.align 8 +.global _gcry_twofish_amd64_ctr_enc +.type _gcry_twofish_amd64_ctr_enc,@function; +_gcry_twofish_amd64_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + * %rcx: iv (big endian, 128bit) + */ + subq $(8 * 8), %rsp; + movq %rbp, (0 * 8)(%rsp); + movq %rbx, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + movq %r14, (4 * 8)(%rsp); + movq %r15, (5 * 8)(%rsp); + + movq %rsi, (6 * 8)(%rsp); + movq %rdx, (7 * 8)(%rsp); + movq %rcx, RX0; + + /* load IV and byteswap */ + movq 8(RX0), RT0; + movq 0(RX0), RT1; + movq RT0, RCD0; + movq RT1, RAB0; + bswapq RT0; + bswapq RT1; + + /* construct IVs */ + movq RT0, RCD1; + movq RT1, RAB1; + movq RT0, RCD2; + movq RT1, RAB2; + addq $1, RCD1; + adcq $0, RAB1; + bswapq RCD1; + bswapq RAB1; + addq $2, RCD2; + adcq $0, RAB2; + bswapq RCD2; + bswapq RAB2; + addq $3, RT0; + adcq $0, RT1; + bswapq RT0; + bswapq RT1; + + /* store new IV */ + movq RT0, 8(RX0); + movq RT1, 0(RX0); + + call __twofish_enc_blk3; + + movq (7 * 8)(%rsp), RX0; /*src*/ + movq (6 * 8)(%rsp), RX1; /*dst*/ + + /* XOR key-stream with plaintext */ + xorq (0 * 8)(RX0), RCD0; + xorq (1 * 8)(RX0), RAB0; + xorq (2 * 8)(RX0), RCD1; + xorq (3 * 8)(RX0), RAB1; + xorq (4 * 8)(RX0), RCD2; + xorq (5 * 8)(RX0), RAB2; + movq RCD0, (0 * 8)(RX1); + movq RAB0, (1 * 8)(RX1); + movq RCD1, (2 * 8)(RX1); + movq RAB1, (3 * 8)(RX1); + movq RCD2, (4 * 8)(RX1); + movq RAB2, (5 * 8)(RX1); + + movq (0 * 8)(%rsp), %rbp; + movq (1 * 8)(%rsp), %rbx; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + movq (4 * 8)(%rsp), %r14; + movq (5 * 8)(%rsp), %r15; + addq $(8 * 8), %rsp; + + ret; +.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc; + +.align 
8 +.global _gcry_twofish_amd64_cbc_dec +.type _gcry_twofish_amd64_cbc_dec,@function; +_gcry_twofish_amd64_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + * %rcx: iv (128bit) + */ + subq $(9 * 8), %rsp; + movq %rbp, (0 * 8)(%rsp); + movq %rbx, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + movq %r14, (4 * 8)(%rsp); + movq %r15, (5 * 8)(%rsp); + + movq %rsi, (6 * 8)(%rsp); + movq %rdx, (7 * 8)(%rsp); + movq %rcx, (8 * 8)(%rsp); + movq %rdx, RX0; + + /* load input */ + movq (0 * 8)(RX0), RAB0; + movq (1 * 8)(RX0), RCD0; + movq (2 * 8)(RX0), RAB1; + movq (3 * 8)(RX0), RCD1; + movq (4 * 8)(RX0), RAB2; + movq (5 * 8)(RX0), RCD2; + + call __twofish_dec_blk3; + + movq (8 * 8)(%rsp), RT0; /*iv*/ + movq (7 * 8)(%rsp), RX0; /*src*/ + movq (6 * 8)(%rsp), RX1; /*dst*/ + + movq (4 * 8)(RX0), RY0; + movq (5 * 8)(RX0), RY1; + xorq (0 * 8)(RT0), RCD0; + xorq (1 * 8)(RT0), RAB0; + xorq (0 * 8)(RX0), RCD1; + xorq (1 * 8)(RX0), RAB1; + xorq (2 * 8)(RX0), RCD2; + xorq (3 * 8)(RX0), RAB2; + movq RY0, (0 * 8)(RT0); + movq RY1, (1 * 8)(RT0); + + movq RCD0, (0 * 8)(RX1); + movq RAB0, (1 * 8)(RX1); + movq RCD1, (2 * 8)(RX1); + movq RAB1, (3 * 8)(RX1); + movq RCD2, (4 * 8)(RX1); + movq RAB2, (5 * 8)(RX1); + + movq (0 * 8)(%rsp), %rbp; + movq (1 * 8)(%rsp), %rbx; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + movq (4 * 8)(%rsp), %r14; + movq (5 * 8)(%rsp), %r15; + addq $(9 * 8), %rsp; + + ret; +.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec; + +.align 8 +.global _gcry_twofish_amd64_cfb_dec +.type _gcry_twofish_amd64_cfb_dec,@function; +_gcry_twofish_amd64_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + * %rcx: iv (128bit) + */ + subq $(8 * 8), %rsp; + movq %rbp, (0 * 8)(%rsp); + movq %rbx, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + movq %r14, (4 * 8)(%rsp); + movq %r15, (5 * 8)(%rsp); + + movq %rsi, (6 * 8)(%rsp); + movq 
%rdx, (7 * 8)(%rsp); + movq %rdx, RX0; + movq %rcx, RX1; + + /* load input */ + movq (0 * 8)(RX1), RAB0; + movq (1 * 8)(RX1), RCD0; + movq (0 * 8)(RX0), RAB1; + movq (1 * 8)(RX0), RCD1; + movq (2 * 8)(RX0), RAB2; + movq (3 * 8)(RX0), RCD2; + + /* Update IV */ + movq (4 * 8)(RX0), RY0; + movq (5 * 8)(RX0), RY1; + movq RY0, (0 * 8)(RX1); + movq RY1, (1 * 8)(RX1); + + call __twofish_enc_blk3; + + movq (7 * 8)(%rsp), RX0; /*src*/ + movq (6 * 8)(%rsp), RX1; /*dst*/ + + xorq (0 * 8)(RX0), RCD0; + xorq (1 * 8)(RX0), RAB0; + xorq (2 * 8)(RX0), RCD1; + xorq (3 * 8)(RX0), RAB1; + xorq (4 * 8)(RX0), RCD2; + xorq (5 * 8)(RX0), RAB2; + movq RCD0, (0 * 8)(RX1); + movq RAB0, (1 * 8)(RX1); + movq RCD1, (2 * 8)(RX1); + movq RAB1, (3 * 8)(RX1); + movq RCD2, (4 * 8)(RX1); + movq RAB2, (5 * 8)(RX1); + + movq (0 * 8)(%rsp), %rbp; + movq (1 * 8)(%rsp), %rbx; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + movq (4 * 8)(%rsp), %r14; + movq (5 * 8)(%rsp), %r15; + addq $(8 * 8), %rsp; + + ret; +.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec; + +#endif /*USE_TWOFISH*/ +#endif /*__x86_64*/ diff --git a/cipher/twofish.c b/cipher/twofish.c index f1a93ca8..ee721c6b 100644 --- a/cipher/twofish.c +++ b/cipher/twofish.c @@ -44,6 +44,19 @@ #include "types.h" /* for byte and u32 typedefs */ #include "g10lib.h" #include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" + + +#define TWOFISH_BLOCKSIZE 16 + + +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) +# define USE_AMD64_ASM 1 +#endif + /* Prototype for the self-test function. */ static const char *selftest(void); @@ -714,6 +727,27 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen) +#ifdef USE_AMD64_ASM + +/* Assembly implementations of Twofish. 
*/ +extern void _gcry_twofish_amd64_encrypt_block(const TWOFISH_context *c, + byte *out, const byte *in); + +extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c, + byte *out, const byte *in); + +/* These assembly implementations process three blocks in parallel. */ +extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, + const byte *in, byte *ctr); + +extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out, + const byte *in, byte *iv); + +extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out, + const byte *in, byte *iv); + +#else /*!USE_AMD64_ASM*/ + /* Macros to compute the g() function in the encryption and decryption * rounds. G1 is the straight g() function; G2 includes the 8-bit * rotation for the high 32-bit word. */ @@ -771,9 +805,30 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen) x ^= ctx->w[m]; \ out[4 * (n)] = x; out[4 * (n) + 1] = x >> 8; \ out[4 * (n) + 2] = x >> 16; out[4 * (n) + 3] = x >> 24 + +#endif /*!USE_AMD64_ASM*/ + /* Encrypt one block. in and out may be the same. */ +#ifdef USE_AMD64_ASM + +static void +do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in) +{ + _gcry_twofish_amd64_encrypt_block(ctx, out, in); +} + +static void +twofish_encrypt (void *context, byte *out, const byte *in) +{ + TWOFISH_context *ctx = context; + _gcry_twofish_amd64_encrypt_block(ctx, out, in); + _gcry_burn_stack (4*sizeof (void*)); +} + +#else /*!USE_AMD64_ASM*/ + static void do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in) { @@ -814,9 +869,29 @@ twofish_encrypt (void *context, byte *out, const byte *in) _gcry_burn_stack (24+3*sizeof (void*)); } +#endif /*!USE_AMD64_ASM*/ + /* Decrypt one block. in and out may be the same. 
*/ +#ifdef USE_AMD64_ASM + +static void +do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in) +{ + _gcry_twofish_amd64_decrypt_block(ctx, out, in); +} + +static void +twofish_decrypt (void *context, byte *out, const byte *in) +{ + TWOFISH_context *ctx = context; + _gcry_twofish_amd64_decrypt_block(ctx, out, in); + _gcry_burn_stack (4*sizeof (void*)); +} + +#else /*!USE_AMD64_ASM*/ + static void do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in) { @@ -858,6 +933,201 @@ twofish_decrypt (void *context, byte *out, const byte *in) _gcry_burn_stack (24+3*sizeof (void*)); } +#endif /*!USE_AMD64_ASM*/ + + + +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size TWOFISH_BLOCKSIZE. */ +void +_gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char tmpbuf[TWOFISH_BLOCKSIZE]; + int burn_stack_depth = 24 + 3 * sizeof (void*); + int i; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*)) + burn_stack_depth = 8 * sizeof(void*); + + /* Process data in 3 block chunks. */ + while (nblocks >= 3) + { + _gcry_twofish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 3; + outbuf += 3 * TWOFISH_BLOCKSIZE; + inbuf += 3 * TWOFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + do_twofish_encrypt(ctx, tmpbuf, ctr); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmpbuf, inbuf, TWOFISH_BLOCKSIZE); + outbuf += TWOFISH_BLOCKSIZE; + inbuf += TWOFISH_BLOCKSIZE; + /* Increment the counter. 
*/ + for (i = TWOFISH_BLOCKSIZE; i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + + wipememory(tmpbuf, sizeof(tmpbuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char savebuf[TWOFISH_BLOCKSIZE]; + int burn_stack_depth = 24 + 3 * sizeof (void*); + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 3 && burn_stack_depth < 9 * sizeof(void*)) + burn_stack_depth = 9 * sizeof(void*); + + /* Process data in 3 block chunks. */ + while (nblocks >= 3) + { + _gcry_twofish_amd64_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 3; + outbuf += 3 * TWOFISH_BLOCKSIZE; + inbuf += 3 * TWOFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* We need to save INBUF away because it may be identical to + OUTBUF. */ + memcpy(savebuf, inbuf, TWOFISH_BLOCKSIZE); + + do_twofish_decrypt (ctx, outbuf, inbuf); + + buf_xor(outbuf, outbuf, iv, TWOFISH_BLOCKSIZE); + memcpy(iv, savebuf, TWOFISH_BLOCKSIZE); + inbuf += TWOFISH_BLOCKSIZE; + outbuf += TWOFISH_BLOCKSIZE; + } + + wipememory(savebuf, sizeof(savebuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CFB mode. This function is only + intended for the bulk encryption feature of cipher.c. 
*/ +void +_gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 24 + 3 * sizeof (void*); + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*)) + burn_stack_depth = 8 * sizeof(void*); + + /* Process data in 3 block chunks. */ + while (nblocks >= 3) + { + _gcry_twofish_amd64_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 3; + outbuf += 3 * TWOFISH_BLOCKSIZE; + inbuf += 3 * TWOFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + do_twofish_encrypt(ctx, iv, iv); + buf_xor_n_copy(outbuf, iv, inbuf, TWOFISH_BLOCKSIZE); + outbuf += TWOFISH_BLOCKSIZE; + inbuf += TWOFISH_BLOCKSIZE; + } + + _gcry_burn_stack(burn_stack_depth); +} + + + +/* Run the self-tests for TWOFISH-CTR, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +static const char * +selftest_ctr (void) +{ + const int nblocks = 3+1; + const int blocksize = TWOFISH_BLOCKSIZE; + const int context_size = sizeof(TWOFISH_context); + + return _gcry_selftest_helper_ctr("TWOFISH", &twofish_setkey, + &twofish_encrypt, &_gcry_twofish_ctr_enc, nblocks, blocksize, + context_size); +} + +/* Run the self-tests for TWOFISH-CBC, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cbc (void) +{ + const int nblocks = 3+2; + const int blocksize = TWOFISH_BLOCKSIZE; + const int context_size = sizeof(TWOFISH_context); + + return _gcry_selftest_helper_cbc("TWOFISH", &twofish_setkey, + &twofish_encrypt, &_gcry_twofish_cbc_dec, nblocks, blocksize, + context_size); +} + +/* Run the self-tests for TWOFISH-CFB, tests bulk CFB decryption. + Returns NULL on success. 
*/ +static const char * +selftest_cfb (void) +{ + const int nblocks = 3+2; + const int blocksize = TWOFISH_BLOCKSIZE; + const int context_size = sizeof(TWOFISH_context); + + return _gcry_selftest_helper_cfb("TWOFISH", &twofish_setkey, + &twofish_encrypt, &_gcry_twofish_cfb_dec, nblocks, blocksize, + context_size); +} + /* Test a single encryption and decryption with each key size. */ @@ -866,6 +1136,7 @@ selftest (void) { TWOFISH_context ctx; /* Expanded key. */ byte scratch[16]; /* Encryption/decryption result buffer. */ + const char *r; /* Test vectors for single encryption/decryption. Note that I am using * the vectors from the Twofish paper's "known answer test", I=3 for @@ -915,6 +1186,13 @@ selftest (void) if (memcmp (scratch, plaintext_256, sizeof (plaintext_256))) return "Twofish-256 test decryption failed."; + if ((r = selftest_ctr()) != NULL) + return r; + if ((r = selftest_cbc()) != NULL) + return r; + if ((r = selftest_cfb()) != NULL) + return r; + return NULL; } diff --git a/configure.ac b/configure.ac index c33c950f..8cb980c4 100644 --- a/configure.ac +++ b/configure.ac @@ -1234,6 +1234,13 @@ LIST_MEMBER(twofish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish.lo" AC_DEFINE(USE_TWOFISH, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo" + ;; + esac fi LIST_MEMBER(serpent, $enabled_ciphers) diff --git a/src/cipher.h b/src/cipher.h index ca595b00..80c88392 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -143,6 +143,17 @@ void _gcry_serpent_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, unsigned int nblocks); +/*-- twofish.c --*/ +void _gcry_twofish_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); +void _gcry_twofish_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void 
*inbuf_arg, + unsigned int nblocks); +void _gcry_twofish_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + /*-- dsa.c --*/ void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data); |