diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2014-03-30 18:11:09 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2014-03-30 18:11:09 +0300 |
commit | b76b632a453b8d100d024e2439b4358454dc286e (patch) | |
tree | 5413cffa2b109eda5eaeacbdbe63d5986bba12b2 | |
parent | 50aeee51a0b1a09dd9fff2bb71749a816fe7a791 (diff) | |
download | libgcrypt-b76b632a453b8d100d024e2439b4358454dc286e.tar.gz |
3des: add amd64 assembly implementation for 3DES
* cipher/Makefile.am: Add 'des-amd64.S'.
* cipher/cipher-selftests.c (_gcry_selftest_helper_cbc)
(_gcry_selftest_helper_cfb, _gcry_selftest_helper_ctr): Handle failures
from 'setkey' function.
* cipher/cipher.c (_gcry_cipher_open_internal) [USE_DES]: Setup bulk
functions for 3DES.
* cipher/des-amd64.S: New file.
* cipher/des.c (USE_AMD64_ASM, ATTR_ALIGNED_16): New macros.
[USE_AMD64_ASM] (_gcry_3des_amd64_crypt_block)
(_gcry_3des_amd64_ctr_enc), _gcry_3des_amd64_cbc_dec)
(_gcry_3des_amd64_cfb_dec): New prototypes.
[USE_AMD64_ASM] (tripledes_ecb_crypt): New function.
(TRIPLEDES_ECB_BURN_STACK): New macro.
(_gcry_3des_ctr_enc, _gcry_3des_cbc_dec, _gcry_3des_cfb_dec)
(bulk_selftest_setkey, selftest_ctr, selftest_cbc, selftest_cfb): New
functions.
(selftest): Add call to CTR, CBC and CFB selftest functions.
(do_tripledes_encrypt, do_tripledes_decrypt): Use
TRIPLEDES_ECB_BURN_STACK.
* configure.ac [host=x86-64]: Add 'des-amd64.lo'.
* src/cipher.h (_gcry_3des_ctr_enc, _gcry_3des_cbc_dec)
(_gcry_3des_cfb_dec): New prototypes.
--
Add non-parallel functions for small speed-up and 3-way parallel functions for
modes of operation that support parallel processing.
Old vs new (Intel Core i5-4570):
================================
enc dec
ECB 1.17x 1.17x
CBC 1.17x 2.51x
CFB 1.16x 2.49x
OFB 1.17x 1.17x
CTR 2.56x 2.56x
Old vs new (Intel Core i5-2450M):
=================================
enc dec
ECB 1.28x 1.28x
CBC 1.27x 2.33x
CFB 1.27x 2.34x
OFB 1.27x 1.27x
CTR 2.36x 2.35x
New (Intel Core i5-4570):
=========================
3DES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 28.39 ns/B 33.60 MiB/s 90.84 c/B
ECB dec | 28.27 ns/B 33.74 MiB/s 90.45 c/B
CBC enc | 29.50 ns/B 32.33 MiB/s 94.40 c/B
CBC dec | 13.35 ns/B 71.45 MiB/s 42.71 c/B
CFB enc | 29.59 ns/B 32.23 MiB/s 94.68 c/B
CFB dec | 13.41 ns/B 71.12 MiB/s 42.91 c/B
OFB enc | 28.90 ns/B 33.00 MiB/s 92.47 c/B
OFB dec | 28.90 ns/B 33.00 MiB/s 92.48 c/B
CTR enc | 13.39 ns/B 71.20 MiB/s 42.86 c/B
CTR dec | 13.39 ns/B 71.21 MiB/s 42.86 c/B
Old (Intel Core i5-4570):
=========================
3DES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 33.24 ns/B 28.69 MiB/s 106.4 c/B
ECB dec | 33.26 ns/B 28.67 MiB/s 106.4 c/B
CBC enc | 34.45 ns/B 27.69 MiB/s 110.2 c/B
CBC dec | 33.45 ns/B 28.51 MiB/s 107.1 c/B
CFB enc | 34.43 ns/B 27.70 MiB/s 110.2 c/B
CFB dec | 33.41 ns/B 28.55 MiB/s 106.9 c/B
OFB enc | 33.79 ns/B 28.22 MiB/s 108.1 c/B
OFB dec | 33.79 ns/B 28.22 MiB/s 108.1 c/B
CTR enc | 34.27 ns/B 27.83 MiB/s 109.7 c/B
CTR dec | 34.27 ns/B 27.83 MiB/s 109.7 c/B
New (Intel Core i5-2450M):
==========================
3DES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 42.21 ns/B 22.59 MiB/s 105.5 c/B
ECB dec | 42.23 ns/B 22.58 MiB/s 105.6 c/B
CBC enc | 43.70 ns/B 21.82 MiB/s 109.2 c/B
CBC dec | 23.25 ns/B 41.02 MiB/s 58.12 c/B
CFB enc | 43.71 ns/B 21.82 MiB/s 109.3 c/B
CFB dec | 23.23 ns/B 41.05 MiB/s 58.08 c/B
OFB enc | 42.73 ns/B 22.32 MiB/s 106.8 c/B
OFB dec | 42.73 ns/B 22.32 MiB/s 106.8 c/B
CTR enc | 23.31 ns/B 40.92 MiB/s 58.27 c/B
CTR dec | 23.35 ns/B 40.84 MiB/s 58.38 c/B
Old (Intel Core i5-2450M):
==========================
3DES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 53.98 ns/B 17.67 MiB/s 134.9 c/B
ECB dec | 54.00 ns/B 17.66 MiB/s 135.0 c/B
CBC enc | 55.43 ns/B 17.20 MiB/s 138.6 c/B
CBC dec | 54.27 ns/B 17.57 MiB/s 135.7 c/B
CFB enc | 55.42 ns/B 17.21 MiB/s 138.6 c/B
CFB dec | 54.35 ns/B 17.55 MiB/s 135.9 c/B
OFB enc | 54.49 ns/B 17.50 MiB/s 136.2 c/B
OFB dec | 54.49 ns/B 17.50 MiB/s 136.2 c/B
CTR enc | 55.02 ns/B 17.33 MiB/s 137.5 c/B
CTR dec | 55.01 ns/B 17.34 MiB/s 137.5 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/Makefile.am | 2 | ||||
-rw-r--r-- | cipher/cipher-selftest.c | 18 | ||||
-rw-r--r-- | cipher/cipher.c | 7 | ||||
-rw-r--r-- | cipher/des-amd64.S | 1030 | ||||
-rw-r--r-- | cipher/des.c | 292 | ||||
-rw-r--r-- | configure.ac | 7 | ||||
-rw-r--r-- | src/cipher.h | 13 |
7 files changed, 1362 insertions, 7 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 462e6db5..3c20d3c3 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -60,7 +60,7 @@ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ crc.c \ -des.c \ +des.c des-amd64.S \ dsa.c \ elgamal.c \ ecc.c ecc-curves.c ecc-misc.c ecc-common.h \ diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c index 5e958146..852368a0 100644 --- a/cipher/cipher-selftest.c +++ b/cipher/cipher-selftest.c @@ -82,7 +82,11 @@ _gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey_func, ciphertext = plaintext2 + nblocks * blocksize; /* Initialize ctx */ - setkey_func (ctx, key, sizeof(key)); + if (setkey_func (ctx, key, sizeof(key)) != GPG_ERR_NO_ERROR) + { + xfree(mem); + return "setkey failed"; + } /* Test single block code path */ memset (iv, 0x4e, blocksize); @@ -199,7 +203,11 @@ _gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey_func, ciphertext = plaintext2 + nblocks * blocksize; /* Initialize ctx */ - setkey_func (ctx, key, sizeof(key)); + if (setkey_func (ctx, key, sizeof(key)) != GPG_ERR_NO_ERROR) + { + xfree(mem); + return "setkey failed"; + } /* Test single block code path */ memset(iv, 0xd3, blocksize); @@ -316,7 +324,11 @@ _gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey_func, ciphertext2 = ciphertext + nblocks * blocksize; /* Initialize ctx */ - setkey_func (ctx, key, sizeof(key)); + if (setkey_func (ctx, key, sizeof(key)) != GPG_ERR_NO_ERROR) + { + xfree(mem); + return "setkey failed"; + } /* Test single block code path */ memset (iv, 0xff, blocksize); diff --git a/cipher/cipher.c b/cipher/cipher.c index baa4720a..6552ed30 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -513,6 +513,13 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle, h->bulk.ctr_enc = _gcry_camellia_ctr_enc; break; #endif /*USE_CAMELLIA*/ +#ifdef USE_DES + case GCRY_CIPHER_3DES: + h->bulk.cbc_dec = _gcry_3des_cbc_dec; + h->bulk.cfb_dec = _gcry_3des_cfb_dec; + h->bulk.ctr_enc = _gcry_3des_ctr_enc; + break; +#endif /*USE_DES*/ #ifdef USE_SERPENT case GCRY_CIPHER_SERPENT128: case GCRY_CIPHER_SERPENT192: diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S new file mode 100644 index 00000000..e8b2c568 --- /dev/null +++ b/cipher/des-amd64.S @@ -0,0 +1,1030 @@ +/* des-amd64.S - AMD64 assembly implementation of 3DES cipher + * + * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifdef __x86_64 +#include <config.h> +#if defined(USE_DES) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +.text + +#define s1 0 +#define s2 ((s1) + (64*8)) +#define s3 ((s2) + (64*8)) +#define s4 ((s3) + (64*8)) +#define s5 ((s4) + (64*8)) +#define s6 ((s5) + (64*8)) +#define s7 ((s6) + (64*8)) +#define s8 ((s7) + (64*8)) + +/* register macros */ +#define CTX %rdi +#define SBOXES %rbp + +#define RL0 %r8 +#define RL1 %r9 +#define RL2 %r10 + +#define RL0d %r8d +#define RL1d %r9d +#define RL2d %r10d + +#define RR0 %r11 +#define RR1 %r12 +#define RR2 %r13 + +#define RR0d %r11d +#define RR1d %r12d +#define RR2d %r13d + +#define RW0 %rax +#define RW1 %rbx +#define RW2 %rcx + +#define RW0d %eax +#define RW1d %ebx +#define RW2d %ecx + +#define RW0bl %al +#define RW1bl %bl +#define RW2bl %cl + +#define RW0bh %ah +#define RW1bh %bh +#define RW2bh %ch + +#define RT0 %r15 +#define RT1 %rsi +#define RT2 %r14 +#define RT3 %rdx + +#define RT0d %r15d +#define RT1d %esi +#define RT2d %r14d +#define RT3d %edx + +/*********************************************************************** + * 1-way 3DES + ***********************************************************************/ +#define do_permutation(a, b, offset, mask) \ + movl a, RT0d; \ + shrl $(offset), RT0d; \ + xorl b, RT0d; \ + andl $(mask), RT0d; \ + xorl RT0d, b; \ + shll $(offset), RT0d; \ + xorl RT0d, a; + +#define expand_to_64bits(val, mask) \ + movl val##d, RT0d; \ + rorl $4, RT0d; \ + shlq $32, RT0; \ + orq RT0, val; \ + andq mask, val; + +#define compress_to_64bits(val) \ + movq val, RT0; \ + shrq $32, RT0; \ + roll $4, RT0d; \ + orl RT0d, val##d; + +#define initial_permutation(left, right) \ + do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \ + do_permutation(left##d, right##d, 16, 0x0000ffff); \ + do_permutation(right##d, left##d, 2, 0x33333333); \ + do_permutation(right##d, left##d, 8, 0x00ff00ff); \ + movabs $0x3f3f3f3f3f3f3f3f, RT3; \ + movl left##d, RW0d; \ + roll $1, right##d; \ + xorl right##d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, left##d; \ + xorl RW0d, right##d; \ + roll $1, left##d; \ + expand_to_64bits(right, RT3); \ + expand_to_64bits(left, RT3); + +#define final_permutation(left, right) \ + compress_to_64bits(right); \ + compress_to_64bits(left); \ + movl right##d, RW0d; \ + rorl $1, left##d; \ + xorl left##d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, right##d; \ + xorl RW0d, left##d; \ + rorl $1, right##d; \ + do_permutation(right##d, left##d, 8, 0x00ff00ff); \ + do_permutation(right##d, left##d, 2, 0x33333333); \ + do_permutation(left##d, right##d, 16, 0x0000ffff); \ + do_permutation(left##d, right##d, 4, 0x0f0f0f0f); + +#define round1(n, from, to, load_next_key) \ + xorq from, RW0; \ + \ + movzbl RW0bl, RT0d; \ + movzbl RW0bh, RT1d; \ + shrq $16, RW0; \ + movzbl RW0bl, RT2d; \ + movzbl RW0bh, RT3d; \ + shrq $16, RW0; \ + movq s8(SBOXES, RT0, 8), RT0; \ + xorq s6(SBOXES, RT1, 8), to; \ + movzbl RW0bl, RL1d; \ + movzbl RW0bh, RT1d; \ + shrl $16, RW0d; \ + xorq s4(SBOXES, RT2, 8), RT0; \ + xorq s2(SBOXES, RT3, 8), to; \ + movzbl RW0bl, RT2d; \ + movzbl RW0bh, RT3d; \ + xorq s7(SBOXES, RL1, 8), RT0; \ + xorq s5(SBOXES, RT1, 8), to; \ + xorq s3(SBOXES, RT2, 8), RT0; \ + load_next_key(n, RW0); \ + xorq RT0, to; \ + xorq s1(SBOXES, RT3, 8), to; \ + +#define load_next_key(n, RWx) \ + movq (((n) + 1) * 8)(CTX), RWx; + +#define dummy2(a, b) /*_*/ + +#define read_block(io, left, right) \ + movl (io), left##d; \ + movl 4(io), right##d; \ + bswapl left##d; \ + bswapl right##d; + +#define write_block(io, left, right) \ + bswapl left##d; \ + bswapl right##d; \ + movl left##d, (io); \ + movl right##d, 4(io); + +.align 8 +.globl _gcry_3des_amd64_crypt_block +.type _gcry_3des_amd64_crypt_block,@function; + +_gcry_3des_amd64_crypt_block: + /* input: + * %rdi: round keys, CTX + * %rsi: dst + * %rdx: src + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + pushq %r15; + pushq %rsi; /*dst*/ + + leaq .L_s1 RIP, SBOXES; + + read_block(%rdx, RL0, RR0); + initial_permutation(RL0, RR0); + + movq (CTX), RW0; + + round1(0, RR0, RL0, load_next_key); + round1(1, RL0, RR0, load_next_key); + round1(2, RR0, RL0, load_next_key); + round1(3, RL0, RR0, load_next_key); + round1(4, RR0, RL0, load_next_key); + round1(5, RL0, RR0, load_next_key); + round1(6, RR0, RL0, load_next_key); + round1(7, RL0, RR0, load_next_key); + round1(8, RR0, RL0, load_next_key); + round1(9, RL0, RR0, load_next_key); + round1(10, RR0, RL0, load_next_key); + round1(11, RL0, RR0, load_next_key); + round1(12, RR0, RL0, load_next_key); + round1(13, RL0, RR0, load_next_key); + round1(14, RR0, RL0, load_next_key); + round1(15, RL0, RR0, load_next_key); + + round1(16+0, RL0, RR0, load_next_key); + round1(16+1, RR0, RL0, load_next_key); + round1(16+2, RL0, RR0, load_next_key); + round1(16+3, RR0, RL0, load_next_key); + round1(16+4, RL0, RR0, load_next_key); + round1(16+5, RR0, RL0, load_next_key); + round1(16+6, RL0, RR0, load_next_key); + round1(16+7, RR0, RL0, load_next_key); + round1(16+8, RL0, RR0, load_next_key); + round1(16+9, RR0, RL0, load_next_key); + round1(16+10, RL0, RR0, load_next_key); + round1(16+11, RR0, RL0, load_next_key); + round1(16+12, RL0, RR0, load_next_key); + round1(16+13, RR0, RL0, load_next_key); + round1(16+14, RL0, RR0, load_next_key); + round1(16+15, RR0, RL0, load_next_key); + + round1(32+0, RR0, RL0, load_next_key); + round1(32+1, RL0, RR0, load_next_key); + round1(32+2, RR0, RL0, load_next_key); + round1(32+3, RL0, RR0, load_next_key); + round1(32+4, RR0, RL0, load_next_key); + round1(32+5, RL0, RR0, load_next_key); + round1(32+6, RR0, RL0, load_next_key); + round1(32+7, RL0, RR0, load_next_key); + round1(32+8, RR0, RL0, load_next_key); + round1(32+9, RL0, RR0, load_next_key); + round1(32+10, RR0, RL0, load_next_key); + round1(32+11, RL0, RR0, load_next_key); + round1(32+12, RR0, RL0, load_next_key); + round1(32+13, RL0, RR0, load_next_key); + round1(32+14, RR0, RL0, load_next_key); + round1(32+15, RL0, RR0, dummy2); + + popq RW2; /*dst*/ + final_permutation(RR0, RL0); + write_block(RW2, RR0, RL0); + + popq %r15; + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block; + +/*********************************************************************** + * 3-way 3DES + ***********************************************************************/ +#define expand_to_64bits(val, mask) \ + movl val##d, RT0d; \ + rorl $4, RT0d; \ + shlq $32, RT0; \ + orq RT0, val; \ + andq mask, val; + +#define compress_to_64bits(val) \ + movq val, RT0; \ + shrq $32, RT0; \ + roll $4, RT0d; \ + orl RT0d, val##d; + +#define initial_permutation3(left, right) \ + do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ + do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ + do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ + do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ + do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \ + do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ + \ + do_permutation(right##0d, left##0d, 2, 0x33333333); \ + do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \ + do_permutation(right##1d, left##1d, 2, 0x33333333); \ + do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ + do_permutation(right##2d, left##2d, 2, 0x33333333); \ + do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ + \ + movabs $0x3f3f3f3f3f3f3f3f, RT3; \ + \ + movl left##0d, RW0d; \ + roll $1, right##0d; \ + xorl right##0d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, left##0d; \ + xorl RW0d, right##0d; \ + roll $1, left##0d; \ + expand_to_64bits(right##0, RT3); \ + expand_to_64bits(left##0, RT3); \ + movl left##1d, RW1d; \ + roll $1, right##1d; \ + xorl right##1d, RW1d; \ + andl $0xaaaaaaaa, RW1d; \ + xorl RW1d, left##1d; \ + xorl RW1d, right##1d; \ + roll $1, left##1d; \ + expand_to_64bits(right##1, RT3); \ + expand_to_64bits(left##1, RT3); \ + movl left##2d, RW2d; \ + roll $1, right##2d; \ + xorl right##2d, RW2d; \ + andl $0xaaaaaaaa, RW2d; \ + xorl RW2d, left##2d; \ + xorl RW2d, right##2d; \ + roll $1, left##2d; \ + expand_to_64bits(right##2, RT3); \ + expand_to_64bits(left##2, RT3); + +#define final_permutation3(left, right) \ + compress_to_64bits(right##0); \ + compress_to_64bits(left##0); \ + movl right##0d, RW0d; \ + rorl $1, left##0d; \ + xorl left##0d, RW0d; \ + andl $0xaaaaaaaa, RW0d; \ + xorl RW0d, right##0d; \ + xorl RW0d, left##0d; \ + rorl $1, right##0d; \ + compress_to_64bits(right##1); \ + compress_to_64bits(left##1); \ + movl right##1d, RW1d; \ + rorl $1, left##1d; \ + xorl left##1d, RW1d; \ + andl $0xaaaaaaaa, RW1d; \ + xorl RW1d, right##1d; \ + xorl RW1d, left##1d; \ + rorl $1, right##1d; \ + compress_to_64bits(right##2); \ + compress_to_64bits(left##2); \ + movl right##2d, RW2d; \ + rorl $1, left##2d; \ + xorl left##2d, RW2d; \ + andl $0xaaaaaaaa, RW2d; \ + xorl RW2d, right##2d; \ + xorl RW2d, left##2d; \ + rorl $1, right##2d; \ + \ + do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \ + do_permutation(right##0d, left##0d, 2, 0x33333333); \ + do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \ + do_permutation(right##1d, left##1d, 2, 0x33333333); \ + do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \ + do_permutation(right##2d, left##2d, 2, 0x33333333); \ + \ + do_permutation(left##0d, right##0d, 16, 0x0000ffff); \ + do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \ + do_permutation(left##1d, right##1d, 16, 0x0000ffff); \ + do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \ + do_permutation(left##2d, right##2d, 16, 0x0000ffff); \ + do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); + +#define round3(n, from, to, load_next_key, do_movq) \ + xorq from##0, RW0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + shrq $16, RW0; \ + xorq s8(SBOXES, RT3, 8), to##0; \ + xorq s6(SBOXES, RT1, 8), to##0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + shrq $16, RW0; \ + xorq s4(SBOXES, RT3, 8), to##0; \ + xorq s2(SBOXES, RT1, 8), to##0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + shrl $16, RW0d; \ + xorq s7(SBOXES, RT3, 8), to##0; \ + xorq s5(SBOXES, RT1, 8), to##0; \ + movzbl RW0bl, RT3d; \ + movzbl RW0bh, RT1d; \ + load_next_key(n, RW0); \ + xorq s3(SBOXES, RT3, 8), to##0; \ + xorq s1(SBOXES, RT1, 8), to##0; \ + xorq from##1, RW1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + shrq $16, RW1; \ + xorq s8(SBOXES, RT3, 8), to##1; \ + xorq s6(SBOXES, RT1, 8), to##1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + shrq $16, RW1; \ + xorq s4(SBOXES, RT3, 8), to##1; \ + xorq s2(SBOXES, RT1, 8), to##1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + shrl $16, RW1d; \ + xorq s7(SBOXES, RT3, 8), to##1; \ + xorq s5(SBOXES, RT1, 8), to##1; \ + movzbl RW1bl, RT3d; \ + movzbl RW1bh, RT1d; \ + do_movq(RW0, RW1); \ + xorq s3(SBOXES, RT3, 8), to##1; \ + xorq s1(SBOXES, RT1, 8), to##1; \ + xorq from##2, RW2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + shrq $16, RW2; \ + xorq s8(SBOXES, RT3, 8), to##2; \ + xorq s6(SBOXES, RT1, 8), to##2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + shrq $16, RW2; \ + xorq s4(SBOXES, RT3, 8), to##2; \ + xorq s2(SBOXES, RT1, 8), to##2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + shrl $16, RW2d; \ + xorq s7(SBOXES, RT3, 8), to##2; \ + xorq s5(SBOXES, RT1, 8), to##2; \ + movzbl RW2bl, RT3d; \ + movzbl RW2bh, RT1d; \ + do_movq(RW0, RW2); \ + xorq s3(SBOXES, RT3, 8), to##2; \ + xorq s1(SBOXES, RT1, 8), to##2; + +#define __movq(src, dst) \ + movq src, dst; + +#define read_block(io, left, right) \ + movl (io), left##d; \ + movl 4(io), right##d; \ + bswapl left##d; \ + bswapl right##d; + +#define write_block(io, left, right) \ + bswapl left##d; \ + bswapl right##d; \ + movl left##d, (io); \ + movl right##d, 4(io); + +.align 8 +.type _gcry_3des_amd64_crypt_blk3,@function; +_gcry_3des_amd64_crypt_blk3: + /* input: + * %rdi: round keys, CTX + * RL0d, RR0d, RL1d, RR1d, RL2d, RR2d: 3 input blocks + * RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks + */ + + leaq .L_s1 RIP, SBOXES; + + initial_permutation3(RL, RR); + + movq 0(CTX), RW0; + movq RW0, RW1; + movq RW0, RW2; + + round3(0, RR, RL, load_next_key, __movq); + round3(1, RL, RR, load_next_key, __movq); + round3(2, RR, RL, load_next_key, __movq); + round3(3, RL, RR, load_next_key, __movq); + round3(4, RR, RL, load_next_key, __movq); + round3(5, RL, RR, load_next_key, __movq); + round3(6, RR, RL, load_next_key, __movq); + round3(7, RL, RR, load_next_key, __movq); + round3(8, RR, RL, load_next_key, __movq); + round3(9, RL, RR, load_next_key, __movq); + round3(10, RR, RL, load_next_key, __movq); + round3(11, RL, RR, load_next_key, __movq); + round3(12, RR, RL, load_next_key, __movq); + round3(13, RL, RR, load_next_key, __movq); + round3(14, RR, RL, load_next_key, __movq); + round3(15, RL, RR, load_next_key, __movq); + + round3(16+0, RL, RR, load_next_key, __movq); + round3(16+1, RR, RL, load_next_key, __movq); + round3(16+2, RL, RR, load_next_key, __movq); + round3(16+3, RR, RL, load_next_key, __movq); + round3(16+4, RL, RR, load_next_key, __movq); + round3(16+5, RR, RL, load_next_key, __movq); + round3(16+6, RL, RR, load_next_key, __movq); + round3(16+7, RR, RL, load_next_key, __movq); + round3(16+8, RL, RR, load_next_key, __movq); + round3(16+9, RR, RL, load_next_key, __movq); + round3(16+10, RL, RR, load_next_key, __movq); + round3(16+11, RR, RL, load_next_key, __movq); + round3(16+12, RL, RR, load_next_key, __movq); + round3(16+13, RR, RL, load_next_key, __movq); + round3(16+14, RL, RR, load_next_key, __movq); + round3(16+15, RR, RL, load_next_key, __movq); + + round3(32+0, RR, RL, load_next_key, __movq); + round3(32+1, RL, RR, load_next_key, __movq); + round3(32+2, RR, RL, load_next_key, __movq); + round3(32+3, RL, RR, load_next_key, __movq); + round3(32+4, RR, RL, load_next_key, __movq); + round3(32+5, RL, RR, load_next_key, __movq); + round3(32+6, RR, RL, load_next_key, __movq); + round3(32+7, RL, RR, load_next_key, __movq); + round3(32+8, RR, RL, load_next_key, __movq); + round3(32+9, RL, RR, load_next_key, __movq); + round3(32+10, RR, RL, load_next_key, __movq); + round3(32+11, RL, RR, load_next_key, __movq); + round3(32+12, RR, RL, load_next_key, __movq); + round3(32+13, RL, RR, load_next_key, __movq); + round3(32+14, RR, RL, load_next_key, __movq); + round3(32+15, RL, RR, dummy2, dummy2); + + final_permutation3(RR, RL); + + ret; +.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3; + +.align 8 +.globl _gcry_3des_amd64_cbc_dec +.type _gcry_3des_amd64_cbc_dec,@function; +_gcry_3des_amd64_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + * %rcx: iv (64bit) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + pushq %r15; + + pushq %rsi; /*dst*/ + pushq %rdx; /*src*/ + pushq %rcx; /*iv*/ + + /* load input */ + movl 0 * 4(%rdx), RL0d; + movl 1 * 4(%rdx), RR0d; + movl 2 * 4(%rdx), RL1d; + movl 3 * 4(%rdx), RR1d; + movl 4 * 4(%rdx), RL2d; + movl 5 * 4(%rdx), RR2d; + + bswapl RL0d; + bswapl RR0d; + bswapl RL1d; + bswapl RR1d; + bswapl RL2d; + bswapl RR2d; + + call _gcry_3des_amd64_crypt_blk3; + + popq %rcx; /*iv*/ + popq %rdx; /*src*/ + popq %rsi; /*dst*/ + + bswapl RR0d; + bswapl RL0d; + bswapl RR1d; + bswapl RL1d; + bswapl RR2d; + bswapl RL2d; + + movq 2 * 8(%rdx), RT0; + xorl 0 * 4(%rcx), RR0d; + xorl 1 * 4(%rcx), RL0d; + xorl 0 * 4(%rdx), RR1d; + xorl 1 * 4(%rdx), RL1d; + xorl 2 * 4(%rdx), RR2d; + xorl 3 * 4(%rdx), RL2d; + movq RT0, (%rcx); /* store new IV */ + + movl RR0d, 0 * 4(%rsi); + movl RL0d, 1 * 4(%rsi); + movl RR1d, 2 * 4(%rsi); + movl RL1d, 3 * 4(%rsi); + movl RR2d, 4 * 4(%rsi); + movl RL2d, 5 * 4(%rsi); + + popq %r15; + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec; + +.align 8 +.globl _gcry_3des_amd64_ctr_enc +.type _gcry_3des_amd64_ctr_enc,@function; +_gcry_3des_amd64_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + * %rcx: iv (64bit) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + pushq %r15; + + pushq %rsi; /*dst*/ + pushq %rdx; /*src*/ + movq %rcx, RW2; + + /* load IV and byteswap */ + movq (RW2), RT0; + bswapq RT0; + movq RT0, RR0; + + /* construct IVs */ + leaq 1(RT0), RR1; + leaq 2(RT0), RR2; + leaq 3(RT0), RT0; + movq RR0, RL0; + movq RR1, RL1; + movq RR2, RL2; + bswapq RT0; + shrq $32, RL0; + shrq $32, RL1; + shrq $32, RL2; + + /* store new IV */ + movq RT0, (RW2); + + call _gcry_3des_amd64_crypt_blk3; + + popq %rdx; /*src*/ + popq %rsi; /*dst*/ + + bswapl RR0d; + bswapl RL0d; + bswapl RR1d; + bswapl RL1d; + bswapl RR2d; + bswapl RL2d; + + xorl 0 * 4(%rdx), RR0d; + xorl 1 * 4(%rdx), RL0d; + xorl 2 * 4(%rdx), RR1d; + xorl 3 * 4(%rdx), RL1d; + xorl 4 * 4(%rdx), RR2d; + xorl 5 * 4(%rdx), RL2d; + + movl RR0d, 0 * 4(%rsi); + movl RL0d, 1 * 4(%rsi); + movl RR1d, 2 * 4(%rsi); + movl RL1d, 3 * 4(%rsi); + movl RR2d, 4 * 4(%rsi); + movl RL2d, 5 * 4(%rsi); + + popq %r15; + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec; + +.align 8 +.globl _gcry_3des_amd64_cfb_dec +.type _gcry_3des_amd64_cfb_dec,@function; +_gcry_3des_amd64_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + * %rcx: iv (64bit) + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + pushq %r15; + + pushq %rsi; /*dst*/ + pushq %rdx; /*src*/ + movq %rcx, RW2; + + /* Load input */ + movl 0 * 4(RW2), RL0d; + movl 1 * 4(RW2), RR0d; + movl 0 * 4(%rdx), RL1d; + movl 1 * 4(%rdx), RR1d; + movl 2 * 4(%rdx), RL2d; + movl 3 * 4(%rdx), RR2d; + + bswapl RL0d; + bswapl RR0d; + bswapl RL1d; + bswapl RR1d; + bswapl RL2d; + bswapl RR2d; + + /* Update IV */ + movq 4 * 4(%rdx), RW0; + movq RW0, (RW2); + + call _gcry_3des_amd64_crypt_blk3; + + popq %rdx; /*src*/ + popq %rsi; /*dst*/ + + bswapl RR0d; + bswapl RL0d; + bswapl RR1d; + bswapl RL1d; + bswapl RR2d; + bswapl RL2d; + + xorl 0 * 4(%rdx), RR0d; + xorl 1 * 4(%rdx), RL0d; + xorl 2 * 4(%rdx), RR1d; + xorl 3 * 4(%rdx), RL1d; + xorl 4 * 4(%rdx), RR2d; + xorl 5 * 4(%rdx), RL2d; + + movl RR0d, 0 * 4(%rsi); + movl RL0d, 1 * 4(%rsi); + movl RR1d, 2 * 4(%rsi); + movl RL1d, 3 * 4(%rsi); + movl RR2d, 4 * 4(%rsi); + movl RL2d, 5 * 4(%rsi); + + popq %r15; + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + ret; +.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec; + +.data +.align 16 +.L_s1: + .quad 0x0010100001010400, 0x0000000000000000 + .quad 0x0000100000010000, 0x0010100001010404 + .quad 0x0010100001010004, 0x0000100000010404 + .quad 0x0000000000000004, 0x0000100000010000 + .quad 0x0000000000000400, 0x0010100001010400 + .quad 0x0010100001010404, 0x0000000000000400 + .quad 0x0010000001000404, 0x0010100001010004 + .quad 0x0010000001000000, 0x0000000000000004 + .quad 0x0000000000000404, 0x0010000001000400 + .quad 0x0010000001000400, 0x0000100000010400 + .quad 0x0000100000010400, 0x0010100001010000 + .quad 0x0010100001010000, 0x0010000001000404 + .quad 0x0000100000010004, 0x0010000001000004 + .quad 0x0010000001000004, 0x0000100000010004 + .quad 0x0000000000000000, 0x0000000000000404 + .quad 0x0000100000010404, 0x0010000001000000 + .quad 0x0000100000010000, 0x0010100001010404 + .quad 0x0000000000000004, 0x0010100001010000 + .quad 0x0010100001010400, 0x0010000001000000 + .quad 0x0010000001000000, 0x0000000000000400 + .quad 0x0010100001010004, 0x0000100000010000 + .quad 0x0000100000010400, 0x0010000001000004 + .quad 0x0000000000000400, 0x0000000000000004 + .quad 0x0010000001000404, 0x0000100000010404 + .quad 0x0010100001010404, 0x0000100000010004 + .quad 0x0010100001010000, 0x0010000001000404 + .quad 0x0010000001000004, 0x0000000000000404 + .quad 0x0000100000010404, 0x0010100001010400 + .quad 0x0000000000000404, 0x0010000001000400 + .quad 0x0010000001000400, 0x0000000000000000 + .quad 0x0000100000010004, 0x0000100000010400 + .quad 0x0000000000000000, 0x0010100001010004 +.L_s2: + .quad 0x0801080200100020, 0x0800080000000000 + .quad 0x0000080000000000, 0x0001080200100020 + .quad 0x0001000000100000, 0x0000000200000020 + .quad 0x0801000200100020, 0x0800080200000020 + .quad 0x0800000200000020, 0x0801080200100020 + .quad 0x0801080000100000, 0x0800000000000000 + .quad 0x0800080000000000, 0x0001000000100000 + .quad 0x0000000200000020, 0x0801000200100020 + .quad 0x0001080000100000, 0x0001000200100020 + .quad 0x0800080200000020, 0x0000000000000000 + .quad 0x0800000000000000, 0x0000080000000000 + .quad 0x0001080200100020, 0x0801000000100000 + .quad 0x0001000200100020, 0x0800000200000020 + .quad 0x0000000000000000, 0x0001080000100000 + .quad 0x0000080200000020, 0x0801080000100000 + .quad 0x0801000000100000, 0x0000080200000020 + .quad 0x0000000000000000, 0x0001080200100020 + .quad 0x0801000200100020, 0x0001000000100000 + .quad 0x0800080200000020, 0x0801000000100000 + .quad 0x0801080000100000, 0x0000080000000000 + .quad 0x0801000000100000, 0x0800080000000000 + .quad 0x0000000200000020, 0x0801080200100020 + .quad 0x0001080200100020, 0x0000000200000020 + .quad 0x0000080000000000, 0x0800000000000000 + .quad 0x0000080200000020, 0x0801080000100000 + .quad 0x0001000000100000, 0x0800000200000020 + .quad 0x0001000200100020, 0x0800080200000020 + .quad 0x0800000200000020, 0x0001000200100020 + .quad 0x0001080000100000, 0x0000000000000000 + .quad 0x0800080000000000, 0x0000080200000020 + .quad 0x0800000000000000, 0x0801000200100020 + .quad 0x0801080200100020, 0x0001080000100000 +.L_s3: + .quad 0x0000002000000208, 0x0000202008020200 + .quad 0x0000000000000000, 0x0000200008020008 + .quad 0x0000002008000200, 0x0000000000000000 + .quad 0x0000202000020208, 0x0000002008000200 + .quad 0x0000200000020008, 0x0000000008000008 + .quad 0x0000000008000008, 0x0000200000020000 + .quad 0x0000202008020208, 0x0000200000020008 + .quad 0x0000200008020000, 0x0000002000000208 + .quad 0x0000000008000000, 0x0000000000000008 + .quad 0x0000202008020200, 0x0000002000000200 + .quad 0x0000202000020200, 0x0000200008020000 + .quad 0x0000200008020008, 0x0000202000020208 + .quad 0x0000002008000208, 0x0000202000020200 + .quad 0x0000200000020000, 0x0000002008000208 + .quad 0x0000000000000008, 0x0000202008020208 + .quad 0x0000002000000200, 0x0000000008000000 + .quad 0x0000202008020200, 0x0000000008000000 + .quad 0x0000200000020008, 0x0000002000000208 + .quad 0x0000200000020000, 0x0000202008020200 + .quad 0x0000002008000200, 0x0000000000000000 + .quad 0x0000002000000200, 0x0000200000020008 + .quad 0x0000202008020208, 0x0000002008000200 + .quad 0x0000000008000008, 0x0000002000000200 + .quad 0x0000000000000000, 0x0000200008020008 + .quad 0x0000002008000208, 0x0000200000020000 + .quad 0x0000000008000000, 0x0000202008020208 + .quad 0x0000000000000008, 0x0000202000020208 + .quad 0x0000202000020200, 0x0000000008000008 + .quad 0x0000200008020000, 0x0000002008000208 + .quad 0x0000002000000208, 0x0000200008020000 + .quad 0x0000202000020208, 0x0000000000000008 + .quad 0x0000200008020008, 0x0000202000020200 +.L_s4: + .quad 0x1008020000002001, 0x1000020800002001 + .quad 0x1000020800002001, 0x0000000800000000 + .quad 0x0008020800002000, 0x1008000800000001 + .quad 0x1008000000000001, 0x1000020000002001 + .quad 0x0000000000000000, 0x0008020000002000 + .quad 0x0008020000002000, 0x1008020800002001 + .quad 0x1000000800000001, 0x0000000000000000 + .quad 0x0008000800000000, 0x1008000000000001 + .quad 0x1000000000000001, 0x0000020000002000 + .quad 0x0008000000000000, 0x1008020000002001 + .quad 0x0000000800000000, 0x0008000000000000 + .quad 0x1000020000002001, 0x0000020800002000 + .quad 0x1008000800000001, 0x1000000000000001 + .quad 0x0000020800002000, 0x0008000800000000 + .quad 0x0000020000002000, 0x0008020800002000 + .quad 0x1008020800002001, 0x1000000800000001 + .quad 0x0008000800000000, 0x1008000000000001 + .quad 0x0008020000002000, 0x1008020800002001 + .quad 0x1000000800000001, 0x0000000000000000 + .quad 0x0000000000000000, 0x0008020000002000 + .quad 0x0000020800002000, 0x0008000800000000 + .quad 0x1008000800000001, 0x1000000000000001 + .quad 0x1008020000002001, 0x1000020800002001 + .quad 0x1000020800002001, 0x0000000800000000 + .quad 0x1008020800002001, 0x1000000800000001 + .quad 0x1000000000000001, 0x0000020000002000 + .quad 0x1008000000000001, 0x1000020000002001 + .quad 0x0008020800002000, 0x1008000800000001 + .quad 0x1000020000002001, 0x0000020800002000 + .quad 0x0008000000000000, 0x1008020000002001 + .quad 0x0000000800000000, 0x0008000000000000 + .quad 0x0000020000002000, 0x0008020800002000 +.L_s5: + .quad 0x0000001000000100, 0x0020001002080100 + .quad 0x0020000002080000, 0x0420001002000100 + .quad 0x0000000000080000, 0x0000001000000100 + .quad 0x0400000000000000, 0x0020000002080000 + .quad 0x0400001000080100, 0x0000000000080000 + .quad 0x0020001002000100, 0x0400001000080100 + .quad 0x0420001002000100, 0x0420000002080000 + .quad 0x0000001000080100, 0x0400000000000000 + .quad 0x0020000002000000, 0x0400000000080000 + .quad 0x0400000000080000, 0x0000000000000000 + .quad 0x0400001000000100, 0x0420001002080100 + .quad 0x0420001002080100, 0x0020001002000100 + .quad 0x0420000002080000, 0x0400001000000100 + .quad 0x0000000000000000, 0x0420000002000000 + .quad 0x0020001002080100, 0x0020000002000000 + .quad 0x0420000002000000, 0x0000001000080100 + .quad 0x0000000000080000, 0x0420001002000100 + .quad 0x0000001000000100, 0x0020000002000000 + .quad 0x0400000000000000, 0x0020000002080000 + .quad 0x0420001002000100, 0x0400001000080100 + .quad 0x0020001002000100, 0x0400000000000000 + .quad 0x0420000002080000, 0x0020001002080100 + .quad 0x0400001000080100, 0x0000001000000100 + .quad 0x0020000002000000, 0x0420000002080000 + .quad 0x0420001002080100, 0x0000001000080100 + .quad 0x0420000002000000, 0x0420001002080100 + .quad 0x0020000002080000, 0x0000000000000000 + .quad 0x0400000000080000, 0x0420000002000000 + .quad 0x0000001000080100, 0x0020001002000100 + .quad 0x0400001000000100, 0x0000000000080000 + .quad 0x0000000000000000, 0x0400000000080000 + .quad 0x0020001002080100, 0x0400001000000100 +.L_s6: + .quad 0x0200000120000010, 0x0204000020000000 + .quad 0x0000040000000000, 0x0204040120000010 + .quad 0x0204000020000000, 0x0000000100000010 + .quad 0x0204040120000010, 0x0004000000000000 + .quad 0x0200040020000000, 0x0004040100000010 + .quad 0x0004000000000000, 0x0200000120000010 + .quad 0x0004000100000010, 0x0200040020000000 + .quad 0x0200000020000000, 0x0000040100000010 + .quad 0x0000000000000000, 0x0004000100000010 + .quad 0x0200040120000010, 0x0000040000000000 + .quad 0x0004040000000000, 0x0200040120000010 + .quad 0x0000000100000010, 0x0204000120000010 + .quad 0x0204000120000010, 0x0000000000000000 + .quad 0x0004040100000010, 0x0204040020000000 + .quad 0x0000040100000010, 0x0004040000000000 + .quad 0x0204040020000000, 0x0200000020000000 + .quad 0x0200040020000000, 0x0000000100000010 + .quad 0x0204000120000010, 0x0004040000000000 + .quad 0x0204040120000010, 0x0004000000000000 + .quad 0x0000040100000010, 0x0200000120000010 + .quad 0x0004000000000000, 0x0200040020000000 + .quad 0x0200000020000000, 0x0000040100000010 + .quad 0x0200000120000010, 0x0204040120000010 + .quad 0x0004040000000000, 0x0204000020000000 + .quad 0x0004040100000010, 0x0204040020000000 + .quad 0x0000000000000000, 0x0204000120000010 + .quad 0x0000000100000010, 0x0000040000000000 + .quad 0x0204000020000000, 0x0004040100000010 + .quad 0x0000040000000000, 0x0004000100000010 + .quad 0x0200040120000010, 0x0000000000000000 + .quad 0x0204040020000000, 0x0200000020000000 + .quad 0x0004000100000010, 0x0200040120000010 +.L_s7: + .quad 0x0002000000200000, 0x2002000004200002 + .quad 0x2000000004000802, 0x0000000000000000 + .quad 0x0000000000000800, 0x2000000004000802 + .quad 0x2002000000200802, 0x0002000004200800 + .quad 0x2002000004200802, 0x0002000000200000 + .quad 0x0000000000000000, 0x2000000004000002 + .quad 0x2000000000000002, 0x0000000004000000 + .quad 0x2002000004200002, 0x2000000000000802 + .quad 0x0000000004000800, 0x2002000000200802 + .quad 0x2002000000200002, 0x0000000004000800 + .quad 0x2000000004000002, 0x0002000004200000 + .quad 0x0002000004200800, 0x2002000000200002 + .quad 0x0002000004200000, 0x0000000000000800 + .quad 0x2000000000000802, 0x2002000004200802 + .quad 0x0002000000200800, 0x2000000000000002 + .quad 0x0000000004000000, 0x0002000000200800 + .quad 0x0000000004000000, 0x0002000000200800 + .quad 0x0002000000200000, 0x2000000004000802 + .quad 0x2000000004000802, 0x2002000004200002 + .quad 0x2002000004200002, 0x2000000000000002 + .quad 0x2002000000200002, 0x0000000004000000 + .quad 0x0000000004000800, 0x0002000000200000 + .quad 0x0002000004200800, 0x2000000000000802 + .quad 0x2002000000200802, 0x0002000004200800 + .quad 0x2000000000000802, 0x2000000004000002 + .quad 0x2002000004200802, 0x0002000004200000 + .quad 0x0002000000200800, 0x0000000000000000 + .quad 0x2000000000000002, 0x2002000004200802 + .quad 0x0000000000000000, 0x2002000000200802 + .quad 0x0002000004200000, 0x0000000000000800 + .quad 0x2000000004000002, 0x0000000004000800 + .quad 0x0000000000000800, 0x2002000000200002 +.L_s8: + .quad 0x0100010410001000, 0x0000010000001000 + .quad 0x0000000000040000, 0x0100010410041000 + .quad 0x0100000010000000, 0x0100010410001000 + .quad 0x0000000400000000, 0x0100000010000000 + .quad 0x0000000400040000, 0x0100000010040000 + .quad 0x0100010410041000, 0x0000010000041000 + .quad 0x0100010010041000, 0x0000010400041000 + .quad 0x0000010000001000, 0x0000000400000000 + .quad 0x0100000010040000, 0x0100000410000000 + .quad 0x0100010010001000, 0x0000010400001000 + .quad 0x0000010000041000, 0x0000000400040000 + .quad 0x0100000410040000, 0x0100010010041000 + .quad 0x0000010400001000, 0x0000000000000000 + .quad 0x0000000000000000, 0x0100000410040000 + .quad 0x0100000410000000, 0x0100010010001000 + .quad 0x0000010400041000, 0x0000000000040000 + .quad 0x0000010400041000, 0x0000000000040000 + .quad 0x0100010010041000, 0x0000010000001000 + .quad 0x0000000400000000, 0x0100000410040000 + .quad 0x0000010000001000, 0x0000010400041000 + .quad 0x0100010010001000, 0x0000000400000000 + .quad 0x0100000410000000, 0x0100000010040000 + .quad 0x0100000410040000, 0x0100000010000000 + .quad 0x0000000000040000, 0x0100010410001000 + .quad 0x0000000000000000, 0x0100010410041000 + .quad 0x0000000400040000, 0x0100000410000000 + .quad 0x0100000010040000, 0x0100010010001000 + .quad 0x0100010410001000, 0x0000000000000000 + .quad 0x0100010410041000, 0x0000010000041000 + .quad 0x0000010000041000, 0x0000010400001000 + .quad 0x0000010400001000, 0x0000000400040000 + .quad 0x0100000010000000, 0x0100010010041000 + +#endif +#endif diff --git a/cipher/des.c b/cipher/des.c index 6611fd3b..bc2a474d 100644 --- a/cipher/des.c +++ b/cipher/des.c @@ -119,9 +119,27 @@ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" +#include "cipher-selftest.h" + + +#define DES_BLOCKSIZE 8 + + +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define USE_AMD64_ASM 1 +#endif + +/* Helper macro to force alignment to 16 bytes. */ +#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) +#else +# define ATTR_ALIGNED_16 +#endif #if defined(__GNUC__) && defined(__GNU_LIBRARY__) -#define working_memcmp memcmp +# define working_memcmp memcmp #else /* * According to the SunOS man page, memcmp returns indeterminate sign @@ -171,6 +189,12 @@ static int tripledes_ecb_crypt (struct _tripledes_ctx *, const byte *, byte *, int); static int is_weak_key ( const byte *key ); static const char *selftest (void); +static unsigned int do_tripledes_encrypt(void *context, byte *outbuf, + const byte *inbuf ); +static unsigned int do_tripledes_decrypt(void *context, byte *outbuf, + const byte *inbuf ); +static gcry_err_code_t do_tripledes_setkey(void *context, const byte *key, + unsigned keylen); static int initialized; @@ -727,6 +751,46 @@ tripledes_set3keys (struct _tripledes_ctx *ctx, +#ifdef USE_AMD64_ASM + +/* Assembly implementation of triple-DES. */ +extern void _gcry_3des_amd64_crypt_block(const void *keys, byte *out, + const byte *in); + +/* These assembly implementations process three blocks in parallel. */ +extern void _gcry_3des_amd64_ctr_enc(const void *keys, byte *out, + const byte *in, byte *ctr); + +extern void _gcry_3des_amd64_cbc_dec(const void *keys, byte *out, + const byte *in, byte *iv); + +extern void _gcry_3des_amd64_cfb_dec(const void *keys, byte *out, + const byte *in, byte *iv); + +#define TRIPLEDES_ECB_BURN_STACK (8 * sizeof(void *)) + +/* + * Electronic Codebook Mode Triple-DES encryption/decryption of data + * according to 'mode'. Sometimes this mode is named 'EDE' mode + * (Encryption-Decryption-Encryption). + */ +static inline int +tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from, + byte * to, int mode) +{ + u32 *keys; + + keys = mode ? ctx->decrypt_subkeys : ctx->encrypt_subkeys; + + _gcry_3des_amd64_crypt_block(keys, to, from); + + return 0; +} + +#else /*USE_AMD64_ASM*/ + +#define TRIPLEDES_ECB_BURN_STACK 32 + /* * Electronic Codebook Mode Triple-DES encryption/decryption of data * according to 'mode'. Sometimes this mode is named 'EDE' mode @@ -777,8 +841,158 @@ tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from, return 0; } +#endif /*!USE_AMD64_ASM*/ + + + +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size DES_BLOCKSIZE. */ +void +_gcry_3des_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + struct _tripledes_ctx *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char tmpbuf[DES_BLOCKSIZE]; + int burn_stack_depth = TRIPLEDES_ECB_BURN_STACK; + int i; + +#ifdef USE_AMD64_ASM + { + int asm_burn_depth = 9 * sizeof(void *); + + if (nblocks >= 3 && burn_stack_depth < asm_burn_depth) + burn_stack_depth = asm_burn_depth; + + /* Process data in 3 block chunks. */ + while (nblocks >= 3) + { + _gcry_3des_amd64_ctr_enc(ctx->encrypt_subkeys, outbuf, inbuf, ctr); + + nblocks -= 3; + outbuf += 3 * DES_BLOCKSIZE; + inbuf += 3 * DES_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + tripledes_ecb_encrypt (ctx, ctr, tmpbuf); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmpbuf, inbuf, DES_BLOCKSIZE); + outbuf += DES_BLOCKSIZE; + inbuf += DES_BLOCKSIZE; + /* Increment the counter. */ + for (i = DES_BLOCKSIZE; i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + + wipememory(tmpbuf, sizeof(tmpbuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_3des_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + struct _tripledes_ctx *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char savebuf[DES_BLOCKSIZE]; + int burn_stack_depth = TRIPLEDES_ECB_BURN_STACK; + +#ifdef USE_AMD64_ASM + { + int asm_burn_depth = 10 * sizeof(void *); + + if (nblocks >= 3 && burn_stack_depth < asm_burn_depth) + burn_stack_depth = asm_burn_depth; + + /* Process data in 3 block chunks. */ + while (nblocks >= 3) + { + _gcry_3des_amd64_cbc_dec(ctx->decrypt_subkeys, outbuf, inbuf, iv); + + nblocks -= 3; + outbuf += 3 * DES_BLOCKSIZE; + inbuf += 3 * DES_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* INBUF is needed later and it may be identical to OUTBUF, so store + the intermediate result to SAVEBUF. */ + tripledes_ecb_decrypt (ctx, inbuf, savebuf); + + buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, DES_BLOCKSIZE); + inbuf += DES_BLOCKSIZE; + outbuf += DES_BLOCKSIZE; + } + + wipememory(savebuf, sizeof(savebuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CFB mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_3des_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks) +{ + struct _tripledes_ctx *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = TRIPLEDES_ECB_BURN_STACK; + +#ifdef USE_AMD64_ASM + { + int asm_burn_depth = 9 * sizeof(void *); + + if (nblocks >= 3 && burn_stack_depth < asm_burn_depth) + burn_stack_depth = asm_burn_depth; + /* Process data in 3 block chunks. */ + while (nblocks >= 3) + { + _gcry_3des_amd64_cfb_dec(ctx->encrypt_subkeys, outbuf, inbuf, iv); + nblocks -= 3; + outbuf += 3 * DES_BLOCKSIZE; + inbuf += 3 * DES_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + tripledes_ecb_encrypt (ctx, iv, iv); + buf_xor_n_copy(outbuf, iv, inbuf, DES_BLOCKSIZE); + outbuf += DES_BLOCKSIZE; + inbuf += DES_BLOCKSIZE; + } + + _gcry_burn_stack(burn_stack_depth); +} /* @@ -815,6 +1029,67 @@ is_weak_key ( const byte *key ) } +/* Alternative setkey for selftests; need larger key than default. */ +static gcry_err_code_t +bulk_selftest_setkey (void *context, const byte *__key, unsigned __keylen) +{ + static const unsigned char key[24] ATTR_ALIGNED_16 = { + 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, + 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22, + 0x18,0x2A,0x39,0x47,0x5E,0x6F,0x75,0x82 + }; + + (void)__key; + (void)__keylen; + + return do_tripledes_setkey(context, key, sizeof(key)); +} + + +/* Run the self-tests for DES-CTR, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +static const char * +selftest_ctr (void) +{ + const int nblocks = 3+1; + const int blocksize = DES_BLOCKSIZE; + const int context_size = sizeof(struct _tripledes_ctx); + + return _gcry_selftest_helper_ctr("3DES", &bulk_selftest_setkey, + &do_tripledes_encrypt, &_gcry_3des_ctr_enc, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for DES-CBC, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cbc (void) +{ + const int nblocks = 3+2; + const int blocksize = DES_BLOCKSIZE; + const int context_size = sizeof(struct _tripledes_ctx); + + return _gcry_selftest_helper_cbc("3DES", &bulk_selftest_setkey, + &do_tripledes_encrypt, &_gcry_3des_cbc_dec, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for DES-CFB, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cfb (void) +{ + const int nblocks = 3+2; + const int blocksize = DES_BLOCKSIZE; + const int context_size = sizeof(struct _tripledes_ctx); + + return _gcry_selftest_helper_cfb("3DES", &bulk_selftest_setkey, + &do_tripledes_encrypt, &_gcry_3des_cfb_dec, nblocks, blocksize, + context_size); +} + /* * Performs a selftest of this DES/Triple-DES implementation. @@ -824,6 +1099,8 @@ is_weak_key ( const byte *key ) static const char * selftest (void) { + const char *r; + /* * Check if 'u32' is really 32 bits wide. This DES / 3DES implementation * need this. @@ -1003,6 +1280,15 @@ selftest (void) return "DES weak key detection failed"; } + if ( (r = selftest_cbc ()) ) + return r; + + if ( (r = selftest_cfb ()) ) + return r; + + if ( (r = selftest_ctr ()) ) + return r; + return 0; } @@ -1060,7 +1346,7 @@ do_tripledes_encrypt( void *context, byte *outbuf, const byte *inbuf ) struct _tripledes_ctx *ctx = (struct _tripledes_ctx *) context; tripledes_ecb_encrypt ( ctx, inbuf, outbuf ); - return /*burn_stack*/ (32); + return /*burn_stack*/ TRIPLEDES_ECB_BURN_STACK; } static unsigned int @@ -1068,7 +1354,7 @@ do_tripledes_decrypt( void *context, byte *outbuf, const byte *inbuf ) { struct _tripledes_ctx *ctx = (struct _tripledes_ctx *) context; tripledes_ecb_decrypt ( ctx, inbuf, outbuf ); - return /*burn_stack*/ (32); + return /*burn_stack*/ TRIPLEDES_ECB_BURN_STACK; } static gcry_err_code_t diff --git a/configure.ac b/configure.ac index a0f75a5d..79f79ef3 100644 --- a/configure.ac +++ b/configure.ac @@ -1671,6 +1671,13 @@ LIST_MEMBER(des, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo" AC_DEFINE(USE_DES, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS des-amd64.lo" + ;; + esac fi LIST_MEMBER(aes, $enabled_ciphers) diff --git a/src/cipher.h b/src/cipher.h index cd981b3d..5d1b5f6f 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -173,6 +173,19 @@ void _gcry_camellia_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); +/*-- des.c --*/ +void _gcry_3des_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); + +void _gcry_3des_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); + +void _gcry_3des_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); + /*-- serpent.c --*/ void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, |