author    | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-10-26 15:00:48 +0300
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-10-28 16:12:19 +0200
commit    | 5a3d43485efdc09912be0967ee0a3ce345b3b15a (patch)
tree      | ff8e937e2d010ae8e015707f5665915dabe1e915
parent    | e214e8392671dd30e9c33260717b5e756debf3bf (diff)
download  | libgcrypt-5a3d43485efdc09912be0967ee0a3ce345b3b15a.tar.gz
Add AMD64 assembly implementation of Salsa20
* cipher/Makefile.am: Add 'salsa20-amd64.S'.
* cipher/salsa20-amd64.S: New.
* cipher/salsa20.c (USE_AMD64): New macro.
[USE_AMD64] (_gcry_salsa20_amd64_keysetup, _gcry_salsa20_amd64_ivsetup)
(_gcry_salsa20_amd64_encrypt_blocks): New prototypes.
[USE_AMD64] (salsa20_keysetup, salsa20_ivsetup, salsa20_core): New.
[!USE_AMD64] (salsa20_core): Change 'src' to non-constant, update block
counter in 'salsa20_core' and return burn stack depth.
[!USE_AMD64] (salsa20_keysetup, salsa20_ivsetup): New.
(salsa20_do_setkey): Move generic key setup to 'salsa20_keysetup'.
(salsa20_setkey): Fix burn stack depth.
(salsa20_setiv): Move generic IV setup to 'salsa20_ivsetup'.
(salsa20_do_encrypt_stream) [USE_AMD64]: Process large buffers in AMD64
implementation.
(salsa20_do_encrypt_stream): Move stack burning to this function...
(salsa20_encrypt_stream, salsa20r12_encrypt_stream): ...from these
functions.
* configure.ac [x86-64]: Add 'salsa20-amd64.lo'.
--
This patch adds a fast AMD64 assembly implementation of Salsa20. It is based
on the public domain amd64-xmm6 implementation by D. J. Bernstein, available
at http://cr.yp.to/snuffle.html. The implementation gains extra speed by
processing four blocks in parallel with the help of SSE2 instructions.
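The four-way parallelism keeps the same state word of four independent blocks
in the four 32-bit lanes of one XMM register, so each quarter-round operation
advances four keystream blocks at once. Below is a minimal C sketch of one
such vectorized quarter-round using SSE2 intrinsics; the actual patch does
this in hand-written assembly, and the helper names here are illustrative
only:

```c
#include <emmintrin.h>          /* SSE2 intrinsics */

/* Rotate every 32-bit lane left by a constant number of bits.  */
#define VROTL32(v, c) \
  _mm_or_si128 (_mm_slli_epi32 ((v), (c)), _mm_srli_epi32 ((v), 32 - (c)))

/* One Salsa20 quarter-round applied to four blocks at once: each __m128i
   lane holds the same state word taken from a different block.  */
static inline void
salsa20_qround_x4 (__m128i *y0, __m128i *y1, __m128i *y2, __m128i *y3)
{
  *y1 = _mm_xor_si128 (*y1, VROTL32 (_mm_add_epi32 (*y0, *y3),  7));
  *y2 = _mm_xor_si128 (*y2, VROTL32 (_mm_add_epi32 (*y1, *y0),  9));
  *y3 = _mm_xor_si128 (*y3, VROTL32 (_mm_add_epi32 (*y2, *y1), 13));
  *y0 = _mm_xor_si128 (*y0, VROTL32 (_mm_add_epi32 (*y3, *y2), 18));
}
```

The assembly below expresses the same rotate differently: it shifts the sum
left into the target, shifts a scratch copy right by the complementary
amount, and XORs both in, which is what the pslld/psrld/pxor triplets in
salsa20-amd64.S correspond to.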
Benchmark results on Intel Core i5-4570 (3.2 GHz):
Before:
SALSA20    | nanosecs/byte | mebibytes/sec | cycles/byte
STREAM enc |    3.88 ns/B  |  246.0 MiB/s  |  12.41 c/B
STREAM dec |    3.88 ns/B  |  246.0 MiB/s  |  12.41 c/B

SALSA20R12 | nanosecs/byte | mebibytes/sec | cycles/byte
STREAM enc |    2.46 ns/B  |  387.9 MiB/s  |   7.87 c/B
STREAM dec |    2.46 ns/B  |  387.7 MiB/s  |   7.87 c/B
After:
SALSA20    | nanosecs/byte | mebibytes/sec | cycles/byte
STREAM enc |   0.985 ns/B  |  967.8 MiB/s  |   3.15 c/B
STREAM dec |   0.987 ns/B  |  966.5 MiB/s  |   3.16 c/B

SALSA20R12 | nanosecs/byte | mebibytes/sec | cycles/byte
STREAM enc |   0.636 ns/B  | 1500.5 MiB/s  |   2.03 c/B
STREAM dec |   0.636 ns/B  | 1499.2 MiB/s  |   2.04 c/B
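As a sanity check on the table, cycles/byte is just nanoseconds/byte scaled
by the 3.2 GHz clock: for the new SALSA20 stream path, 0.985 ns/B x 3.2
cycles/ns ≈ 3.15 c/B, i.e. roughly a 4x speed-up over the previous 12.41 c/B
of the generic C code.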
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/Makefile.am     |   2
-rw-r--r-- | cipher/salsa20-amd64.S | 924
-rw-r--r-- | cipher/salsa20.c       | 197
-rw-r--r-- | configure.ac           |   7
4 files changed, 1056 insertions, 74 deletions
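The heart of the C-side change is how salsa20_do_encrypt_stream() now splits
the work: all whole 64-byte blocks of a request are handed to the assembly
routine in a single call (which also advances the block counter), and only a
sub-block tail goes through the per-block salsa20_core() path. Condensed from
the salsa20.c hunk below (the ctx->pad/ctx->unused bookkeeping and the final
_gcry_burn_stack call are omitted for brevity):

```c
#ifdef USE_AMD64
  if (length >= SALSA20_BLOCK_SIZE)
    {
      unsigned int nblocks = length / SALSA20_BLOCK_SIZE;

      /* The assembly routine encrypts nblocks whole blocks and returns the
         stack depth that has to be burned afterwards.  */
      burn = _gcry_salsa20_amd64_encrypt_blocks (ctx->input, inbuf, outbuf,
                                                 nblocks, rounds);
      length -= SALSA20_BLOCK_SIZE * nblocks;
      outbuf += SALSA20_BLOCK_SIZE * nblocks;
      inbuf  += SALSA20_BLOCK_SIZE * nblocks;
    }
#endif
  /* Anything shorter than one block falls through to the generic
     salsa20_core() loop.  */
```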
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index d7db9337..e786713e 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -71,7 +71,7 @@ md5.c \ rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \ rmd160.c \ rsa.c \ -salsa20.c \ +salsa20.c salsa20-amd64.S \ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S new file mode 100644 index 00000000..691df588 --- /dev/null +++ b/cipher/salsa20-amd64.S @@ -0,0 +1,924 @@ +/* salsa20-amd64.S - AMD64 implementation of Salsa20 + * + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Based on public domain implementation by D. J. Bernstein at + * http://cr.yp.to/snuffle.html + */ + +#ifdef __x86_64 +#include <config.h> +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_SALSA20) + +.text + +.align 8 +.globl _gcry_salsa20_amd64_keysetup +.type _gcry_salsa20_amd64_keysetup,@function; +_gcry_salsa20_amd64_keysetup: + movl 0(%rsi),%r8d + movl 4(%rsi),%r9d + movl 8(%rsi),%eax + movl 12(%rsi),%r10d + movl %r8d,20(%rdi) + movl %r9d,40(%rdi) + movl %eax,60(%rdi) + movl %r10d,48(%rdi) + cmp $256,%rdx + jb ._kbits128 +._kbits256: + movl 16(%rsi),%edx + movl 20(%rsi),%ecx + movl 24(%rsi),%r8d + movl 28(%rsi),%esi + movl %edx,28(%rdi) + movl %ecx,16(%rdi) + movl %r8d,36(%rdi) + movl %esi,56(%rdi) + mov $1634760805,%rsi + mov $857760878,%rdx + mov $2036477234,%rcx + mov $1797285236,%r8 + movl %esi,0(%rdi) + movl %edx,4(%rdi) + movl %ecx,8(%rdi) + movl %r8d,12(%rdi) + jmp ._keysetupdone +._kbits128: + movl 0(%rsi),%edx + movl 4(%rsi),%ecx + movl 8(%rsi),%r8d + movl 12(%rsi),%esi + movl %edx,28(%rdi) + movl %ecx,16(%rdi) + movl %r8d,36(%rdi) + movl %esi,56(%rdi) + mov $1634760805,%rsi + mov $824206446,%rdx + mov $2036477238,%rcx + mov $1797285236,%r8 + movl %esi,0(%rdi) + movl %edx,4(%rdi) + movl %ecx,8(%rdi) + movl %r8d,12(%rdi) +._keysetupdone: + ret + +.align 8 +.globl _gcry_salsa20_amd64_ivsetup +.type _gcry_salsa20_amd64_ivsetup,@function; +_gcry_salsa20_amd64_ivsetup: + movl 0(%rsi),%r8d + movl 4(%rsi),%esi + mov $0,%r9 + mov $0,%rax + movl %r8d,24(%rdi) + movl %esi,44(%rdi) + movl %r9d,32(%rdi) + movl %eax,52(%rdi) + ret + +.align 8 +.globl _gcry_salsa20_amd64_encrypt_blocks +.type _gcry_salsa20_amd64_encrypt_blocks,@function; +_gcry_salsa20_amd64_encrypt_blocks: + /* + * Modifications to original implementation: + * - Number of rounds passing in register %r8 (for Salsa20/12). + * - Length is input as number of blocks, so don't handle tail bytes + * (this is done in salsa20.c). 
+ */ + push %rbx + shlq $6, %rcx /* blocks to bytes */ + mov %r8, %rbx + mov %rsp,%r11 + and $31,%r11 + add $384,%r11 + sub %r11,%rsp + mov %rdi,%r8 + mov %rsi,%rsi + mov %rdx,%rdi + mov %rcx,%rdx + cmp $0,%rdx + jbe ._done +._start: + cmp $256,%rdx + jb ._bytes_are_64_128_or_192 + movdqa 0(%r8),%xmm0 + pshufd $0x55,%xmm0,%xmm1 + pshufd $0xaa,%xmm0,%xmm2 + pshufd $0xff,%xmm0,%xmm3 + pshufd $0x00,%xmm0,%xmm0 + movdqa %xmm1,0(%rsp) + movdqa %xmm2,16(%rsp) + movdqa %xmm3,32(%rsp) + movdqa %xmm0,48(%rsp) + movdqa 16(%r8),%xmm0 + pshufd $0xaa,%xmm0,%xmm1 + pshufd $0xff,%xmm0,%xmm2 + pshufd $0x00,%xmm0,%xmm3 + pshufd $0x55,%xmm0,%xmm0 + movdqa %xmm1,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm3,96(%rsp) + movdqa %xmm0,112(%rsp) + movdqa 32(%r8),%xmm0 + pshufd $0xff,%xmm0,%xmm1 + pshufd $0x55,%xmm0,%xmm2 + pshufd $0xaa,%xmm0,%xmm0 + movdqa %xmm1,128(%rsp) + movdqa %xmm2,144(%rsp) + movdqa %xmm0,160(%rsp) + movdqa 48(%r8),%xmm0 + pshufd $0x00,%xmm0,%xmm1 + pshufd $0xaa,%xmm0,%xmm2 + pshufd $0xff,%xmm0,%xmm0 + movdqa %xmm1,176(%rsp) + movdqa %xmm2,192(%rsp) + movdqa %xmm0,208(%rsp) +._bytesatleast256: + movl 32(%r8),%ecx + movl 52(%r8),%r9d + movl %ecx,224(%rsp) + movl %r9d,240(%rsp) + add $1,%ecx + adc $0,%r9d + movl %ecx,4+224(%rsp) + movl %r9d,4+240(%rsp) + add $1,%ecx + adc $0,%r9d + movl %ecx,8+224(%rsp) + movl %r9d,8+240(%rsp) + add $1,%ecx + adc $0,%r9d + movl %ecx,12+224(%rsp) + movl %r9d,12+240(%rsp) + add $1,%ecx + adc $0,%r9d + movl %ecx,32(%r8) + movl %r9d,52(%r8) + movq %rdx,288(%rsp) + mov %rbx,%rdx + movdqa 0(%rsp),%xmm0 + movdqa 16(%rsp),%xmm1 + movdqa 32(%rsp),%xmm2 + movdqa 192(%rsp),%xmm3 + movdqa 208(%rsp),%xmm4 + movdqa 64(%rsp),%xmm5 + movdqa 80(%rsp),%xmm6 + movdqa 112(%rsp),%xmm7 + movdqa 128(%rsp),%xmm8 + movdqa 144(%rsp),%xmm9 + movdqa 160(%rsp),%xmm10 + movdqa 240(%rsp),%xmm11 + movdqa 48(%rsp),%xmm12 + movdqa 96(%rsp),%xmm13 + movdqa 176(%rsp),%xmm14 + movdqa 224(%rsp),%xmm15 +._mainloop1: + movdqa %xmm1,256(%rsp) + movdqa %xmm2,272(%rsp) + movdqa %xmm13,%xmm1 + paddd %xmm12,%xmm1 + movdqa %xmm1,%xmm2 + pslld $7,%xmm1 + pxor %xmm1,%xmm14 + psrld $25,%xmm2 + pxor %xmm2,%xmm14 + movdqa %xmm7,%xmm1 + paddd %xmm0,%xmm1 + movdqa %xmm1,%xmm2 + pslld $7,%xmm1 + pxor %xmm1,%xmm11 + psrld $25,%xmm2 + pxor %xmm2,%xmm11 + movdqa %xmm12,%xmm1 + paddd %xmm14,%xmm1 + movdqa %xmm1,%xmm2 + pslld $9,%xmm1 + pxor %xmm1,%xmm15 + psrld $23,%xmm2 + pxor %xmm2,%xmm15 + movdqa %xmm0,%xmm1 + paddd %xmm11,%xmm1 + movdqa %xmm1,%xmm2 + pslld $9,%xmm1 + pxor %xmm1,%xmm9 + psrld $23,%xmm2 + pxor %xmm2,%xmm9 + movdqa %xmm14,%xmm1 + paddd %xmm15,%xmm1 + movdqa %xmm1,%xmm2 + pslld $13,%xmm1 + pxor %xmm1,%xmm13 + psrld $19,%xmm2 + pxor %xmm2,%xmm13 + movdqa %xmm11,%xmm1 + paddd %xmm9,%xmm1 + movdqa %xmm1,%xmm2 + pslld $13,%xmm1 + pxor %xmm1,%xmm7 + psrld $19,%xmm2 + pxor %xmm2,%xmm7 + movdqa %xmm15,%xmm1 + paddd %xmm13,%xmm1 + movdqa %xmm1,%xmm2 + pslld $18,%xmm1 + pxor %xmm1,%xmm12 + psrld $14,%xmm2 + pxor %xmm2,%xmm12 + movdqa 256(%rsp),%xmm1 + movdqa %xmm12,256(%rsp) + movdqa %xmm9,%xmm2 + paddd %xmm7,%xmm2 + movdqa %xmm2,%xmm12 + pslld $18,%xmm2 + pxor %xmm2,%xmm0 + psrld $14,%xmm12 + pxor %xmm12,%xmm0 + movdqa %xmm5,%xmm2 + paddd %xmm1,%xmm2 + movdqa %xmm2,%xmm12 + pslld $7,%xmm2 + pxor %xmm2,%xmm3 + psrld $25,%xmm12 + pxor %xmm12,%xmm3 + movdqa 272(%rsp),%xmm2 + movdqa %xmm0,272(%rsp) + movdqa %xmm6,%xmm0 + paddd %xmm2,%xmm0 + movdqa %xmm0,%xmm12 + pslld $7,%xmm0 + pxor %xmm0,%xmm4 + psrld $25,%xmm12 + pxor %xmm12,%xmm4 + movdqa %xmm1,%xmm0 + paddd %xmm3,%xmm0 + movdqa %xmm0,%xmm12 + pslld 
$9,%xmm0 + pxor %xmm0,%xmm10 + psrld $23,%xmm12 + pxor %xmm12,%xmm10 + movdqa %xmm2,%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,%xmm12 + pslld $9,%xmm0 + pxor %xmm0,%xmm8 + psrld $23,%xmm12 + pxor %xmm12,%xmm8 + movdqa %xmm3,%xmm0 + paddd %xmm10,%xmm0 + movdqa %xmm0,%xmm12 + pslld $13,%xmm0 + pxor %xmm0,%xmm5 + psrld $19,%xmm12 + pxor %xmm12,%xmm5 + movdqa %xmm4,%xmm0 + paddd %xmm8,%xmm0 + movdqa %xmm0,%xmm12 + pslld $13,%xmm0 + pxor %xmm0,%xmm6 + psrld $19,%xmm12 + pxor %xmm12,%xmm6 + movdqa %xmm10,%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,%xmm12 + pslld $18,%xmm0 + pxor %xmm0,%xmm1 + psrld $14,%xmm12 + pxor %xmm12,%xmm1 + movdqa 256(%rsp),%xmm0 + movdqa %xmm1,256(%rsp) + movdqa %xmm4,%xmm1 + paddd %xmm0,%xmm1 + movdqa %xmm1,%xmm12 + pslld $7,%xmm1 + pxor %xmm1,%xmm7 + psrld $25,%xmm12 + pxor %xmm12,%xmm7 + movdqa %xmm8,%xmm1 + paddd %xmm6,%xmm1 + movdqa %xmm1,%xmm12 + pslld $18,%xmm1 + pxor %xmm1,%xmm2 + psrld $14,%xmm12 + pxor %xmm12,%xmm2 + movdqa 272(%rsp),%xmm12 + movdqa %xmm2,272(%rsp) + movdqa %xmm14,%xmm1 + paddd %xmm12,%xmm1 + movdqa %xmm1,%xmm2 + pslld $7,%xmm1 + pxor %xmm1,%xmm5 + psrld $25,%xmm2 + pxor %xmm2,%xmm5 + movdqa %xmm0,%xmm1 + paddd %xmm7,%xmm1 + movdqa %xmm1,%xmm2 + pslld $9,%xmm1 + pxor %xmm1,%xmm10 + psrld $23,%xmm2 + pxor %xmm2,%xmm10 + movdqa %xmm12,%xmm1 + paddd %xmm5,%xmm1 + movdqa %xmm1,%xmm2 + pslld $9,%xmm1 + pxor %xmm1,%xmm8 + psrld $23,%xmm2 + pxor %xmm2,%xmm8 + movdqa %xmm7,%xmm1 + paddd %xmm10,%xmm1 + movdqa %xmm1,%xmm2 + pslld $13,%xmm1 + pxor %xmm1,%xmm4 + psrld $19,%xmm2 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm1 + paddd %xmm8,%xmm1 + movdqa %xmm1,%xmm2 + pslld $13,%xmm1 + pxor %xmm1,%xmm14 + psrld $19,%xmm2 + pxor %xmm2,%xmm14 + movdqa %xmm10,%xmm1 + paddd %xmm4,%xmm1 + movdqa %xmm1,%xmm2 + pslld $18,%xmm1 + pxor %xmm1,%xmm0 + psrld $14,%xmm2 + pxor %xmm2,%xmm0 + movdqa 256(%rsp),%xmm1 + movdqa %xmm0,256(%rsp) + movdqa %xmm8,%xmm0 + paddd %xmm14,%xmm0 + movdqa %xmm0,%xmm2 + pslld $18,%xmm0 + pxor %xmm0,%xmm12 + psrld $14,%xmm2 + pxor %xmm2,%xmm12 + movdqa %xmm11,%xmm0 + paddd %xmm1,%xmm0 + movdqa %xmm0,%xmm2 + pslld $7,%xmm0 + pxor %xmm0,%xmm6 + psrld $25,%xmm2 + pxor %xmm2,%xmm6 + movdqa 272(%rsp),%xmm2 + movdqa %xmm12,272(%rsp) + movdqa %xmm3,%xmm0 + paddd %xmm2,%xmm0 + movdqa %xmm0,%xmm12 + pslld $7,%xmm0 + pxor %xmm0,%xmm13 + psrld $25,%xmm12 + pxor %xmm12,%xmm13 + movdqa %xmm1,%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,%xmm12 + pslld $9,%xmm0 + pxor %xmm0,%xmm15 + psrld $23,%xmm12 + pxor %xmm12,%xmm15 + movdqa %xmm2,%xmm0 + paddd %xmm13,%xmm0 + movdqa %xmm0,%xmm12 + pslld $9,%xmm0 + pxor %xmm0,%xmm9 + psrld $23,%xmm12 + pxor %xmm12,%xmm9 + movdqa %xmm6,%xmm0 + paddd %xmm15,%xmm0 + movdqa %xmm0,%xmm12 + pslld $13,%xmm0 + pxor %xmm0,%xmm11 + psrld $19,%xmm12 + pxor %xmm12,%xmm11 + movdqa %xmm13,%xmm0 + paddd %xmm9,%xmm0 + movdqa %xmm0,%xmm12 + pslld $13,%xmm0 + pxor %xmm0,%xmm3 + psrld $19,%xmm12 + pxor %xmm12,%xmm3 + movdqa %xmm15,%xmm0 + paddd %xmm11,%xmm0 + movdqa %xmm0,%xmm12 + pslld $18,%xmm0 + pxor %xmm0,%xmm1 + psrld $14,%xmm12 + pxor %xmm12,%xmm1 + movdqa %xmm9,%xmm0 + paddd %xmm3,%xmm0 + movdqa %xmm0,%xmm12 + pslld $18,%xmm0 + pxor %xmm0,%xmm2 + psrld $14,%xmm12 + pxor %xmm12,%xmm2 + movdqa 256(%rsp),%xmm12 + movdqa 272(%rsp),%xmm0 + sub $2,%rdx + ja ._mainloop1 + paddd 48(%rsp),%xmm12 + paddd 112(%rsp),%xmm7 + paddd 160(%rsp),%xmm10 + paddd 208(%rsp),%xmm4 + movd %xmm12,%rdx + movd %xmm7,%rcx + movd %xmm10,%r9 + movd %xmm4,%rax + pshufd $0x39,%xmm12,%xmm12 + pshufd $0x39,%xmm7,%xmm7 + pshufd $0x39,%xmm10,%xmm10 + pshufd $0x39,%xmm4,%xmm4 + xorl 
0(%rsi),%edx + xorl 4(%rsi),%ecx + xorl 8(%rsi),%r9d + xorl 12(%rsi),%eax + movl %edx,0(%rdi) + movl %ecx,4(%rdi) + movl %r9d,8(%rdi) + movl %eax,12(%rdi) + movd %xmm12,%rdx + movd %xmm7,%rcx + movd %xmm10,%r9 + movd %xmm4,%rax + pshufd $0x39,%xmm12,%xmm12 + pshufd $0x39,%xmm7,%xmm7 + pshufd $0x39,%xmm10,%xmm10 + pshufd $0x39,%xmm4,%xmm4 + xorl 64(%rsi),%edx + xorl 68(%rsi),%ecx + xorl 72(%rsi),%r9d + xorl 76(%rsi),%eax + movl %edx,64(%rdi) + movl %ecx,68(%rdi) + movl %r9d,72(%rdi) + movl %eax,76(%rdi) + movd %xmm12,%rdx + movd %xmm7,%rcx + movd %xmm10,%r9 + movd %xmm4,%rax + pshufd $0x39,%xmm12,%xmm12 + pshufd $0x39,%xmm7,%xmm7 + pshufd $0x39,%xmm10,%xmm10 + pshufd $0x39,%xmm4,%xmm4 + xorl 128(%rsi),%edx + xorl 132(%rsi),%ecx + xorl 136(%rsi),%r9d + xorl 140(%rsi),%eax + movl %edx,128(%rdi) + movl %ecx,132(%rdi) + movl %r9d,136(%rdi) + movl %eax,140(%rdi) + movd %xmm12,%rdx + movd %xmm7,%rcx + movd %xmm10,%r9 + movd %xmm4,%rax + xorl 192(%rsi),%edx + xorl 196(%rsi),%ecx + xorl 200(%rsi),%r9d + xorl 204(%rsi),%eax + movl %edx,192(%rdi) + movl %ecx,196(%rdi) + movl %r9d,200(%rdi) + movl %eax,204(%rdi) + paddd 176(%rsp),%xmm14 + paddd 0(%rsp),%xmm0 + paddd 64(%rsp),%xmm5 + paddd 128(%rsp),%xmm8 + movd %xmm14,%rdx + movd %xmm0,%rcx + movd %xmm5,%r9 + movd %xmm8,%rax + pshufd $0x39,%xmm14,%xmm14 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x39,%xmm5,%xmm5 + pshufd $0x39,%xmm8,%xmm8 + xorl 16(%rsi),%edx + xorl 20(%rsi),%ecx + xorl 24(%rsi),%r9d + xorl 28(%rsi),%eax + movl %edx,16(%rdi) + movl %ecx,20(%rdi) + movl %r9d,24(%rdi) + movl %eax,28(%rdi) + movd %xmm14,%rdx + movd %xmm0,%rcx + movd %xmm5,%r9 + movd %xmm8,%rax + pshufd $0x39,%xmm14,%xmm14 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x39,%xmm5,%xmm5 + pshufd $0x39,%xmm8,%xmm8 + xorl 80(%rsi),%edx + xorl 84(%rsi),%ecx + xorl 88(%rsi),%r9d + xorl 92(%rsi),%eax + movl %edx,80(%rdi) + movl %ecx,84(%rdi) + movl %r9d,88(%rdi) + movl %eax,92(%rdi) + movd %xmm14,%rdx + movd %xmm0,%rcx + movd %xmm5,%r9 + movd %xmm8,%rax + pshufd $0x39,%xmm14,%xmm14 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x39,%xmm5,%xmm5 + pshufd $0x39,%xmm8,%xmm8 + xorl 144(%rsi),%edx + xorl 148(%rsi),%ecx + xorl 152(%rsi),%r9d + xorl 156(%rsi),%eax + movl %edx,144(%rdi) + movl %ecx,148(%rdi) + movl %r9d,152(%rdi) + movl %eax,156(%rdi) + movd %xmm14,%rdx + movd %xmm0,%rcx + movd %xmm5,%r9 + movd %xmm8,%rax + xorl 208(%rsi),%edx + xorl 212(%rsi),%ecx + xorl 216(%rsi),%r9d + xorl 220(%rsi),%eax + movl %edx,208(%rdi) + movl %ecx,212(%rdi) + movl %r9d,216(%rdi) + movl %eax,220(%rdi) + paddd 224(%rsp),%xmm15 + paddd 240(%rsp),%xmm11 + paddd 16(%rsp),%xmm1 + paddd 80(%rsp),%xmm6 + movd %xmm15,%rdx + movd %xmm11,%rcx + movd %xmm1,%r9 + movd %xmm6,%rax + pshufd $0x39,%xmm15,%xmm15 + pshufd $0x39,%xmm11,%xmm11 + pshufd $0x39,%xmm1,%xmm1 + pshufd $0x39,%xmm6,%xmm6 + xorl 32(%rsi),%edx + xorl 36(%rsi),%ecx + xorl 40(%rsi),%r9d + xorl 44(%rsi),%eax + movl %edx,32(%rdi) + movl %ecx,36(%rdi) + movl %r9d,40(%rdi) + movl %eax,44(%rdi) + movd %xmm15,%rdx + movd %xmm11,%rcx + movd %xmm1,%r9 + movd %xmm6,%rax + pshufd $0x39,%xmm15,%xmm15 + pshufd $0x39,%xmm11,%xmm11 + pshufd $0x39,%xmm1,%xmm1 + pshufd $0x39,%xmm6,%xmm6 + xorl 96(%rsi),%edx + xorl 100(%rsi),%ecx + xorl 104(%rsi),%r9d + xorl 108(%rsi),%eax + movl %edx,96(%rdi) + movl %ecx,100(%rdi) + movl %r9d,104(%rdi) + movl %eax,108(%rdi) + movd %xmm15,%rdx + movd %xmm11,%rcx + movd %xmm1,%r9 + movd %xmm6,%rax + pshufd $0x39,%xmm15,%xmm15 + pshufd $0x39,%xmm11,%xmm11 + pshufd $0x39,%xmm1,%xmm1 + pshufd $0x39,%xmm6,%xmm6 + xorl 160(%rsi),%edx + xorl 164(%rsi),%ecx 
+ xorl 168(%rsi),%r9d + xorl 172(%rsi),%eax + movl %edx,160(%rdi) + movl %ecx,164(%rdi) + movl %r9d,168(%rdi) + movl %eax,172(%rdi) + movd %xmm15,%rdx + movd %xmm11,%rcx + movd %xmm1,%r9 + movd %xmm6,%rax + xorl 224(%rsi),%edx + xorl 228(%rsi),%ecx + xorl 232(%rsi),%r9d + xorl 236(%rsi),%eax + movl %edx,224(%rdi) + movl %ecx,228(%rdi) + movl %r9d,232(%rdi) + movl %eax,236(%rdi) + paddd 96(%rsp),%xmm13 + paddd 144(%rsp),%xmm9 + paddd 192(%rsp),%xmm3 + paddd 32(%rsp),%xmm2 + movd %xmm13,%rdx + movd %xmm9,%rcx + movd %xmm3,%r9 + movd %xmm2,%rax + pshufd $0x39,%xmm13,%xmm13 + pshufd $0x39,%xmm9,%xmm9 + pshufd $0x39,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + xorl 48(%rsi),%edx + xorl 52(%rsi),%ecx + xorl 56(%rsi),%r9d + xorl 60(%rsi),%eax + movl %edx,48(%rdi) + movl %ecx,52(%rdi) + movl %r9d,56(%rdi) + movl %eax,60(%rdi) + movd %xmm13,%rdx + movd %xmm9,%rcx + movd %xmm3,%r9 + movd %xmm2,%rax + pshufd $0x39,%xmm13,%xmm13 + pshufd $0x39,%xmm9,%xmm9 + pshufd $0x39,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + xorl 112(%rsi),%edx + xorl 116(%rsi),%ecx + xorl 120(%rsi),%r9d + xorl 124(%rsi),%eax + movl %edx,112(%rdi) + movl %ecx,116(%rdi) + movl %r9d,120(%rdi) + movl %eax,124(%rdi) + movd %xmm13,%rdx + movd %xmm9,%rcx + movd %xmm3,%r9 + movd %xmm2,%rax + pshufd $0x39,%xmm13,%xmm13 + pshufd $0x39,%xmm9,%xmm9 + pshufd $0x39,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + xorl 176(%rsi),%edx + xorl 180(%rsi),%ecx + xorl 184(%rsi),%r9d + xorl 188(%rsi),%eax + movl %edx,176(%rdi) + movl %ecx,180(%rdi) + movl %r9d,184(%rdi) + movl %eax,188(%rdi) + movd %xmm13,%rdx + movd %xmm9,%rcx + movd %xmm3,%r9 + movd %xmm2,%rax + xorl 240(%rsi),%edx + xorl 244(%rsi),%ecx + xorl 248(%rsi),%r9d + xorl 252(%rsi),%eax + movl %edx,240(%rdi) + movl %ecx,244(%rdi) + movl %r9d,248(%rdi) + movl %eax,252(%rdi) + movq 288(%rsp),%rdx + sub $256,%rdx + add $256,%rsi + add $256,%rdi + cmp $256,%rdx + jae ._bytesatleast256 + cmp $0,%rdx + jbe ._done +._bytes_are_64_128_or_192: + movq %rdx,288(%rsp) + movdqa 0(%r8),%xmm0 + movdqa 16(%r8),%xmm1 + movdqa 32(%r8),%xmm2 + movdqa 48(%r8),%xmm3 + movdqa %xmm1,%xmm4 + mov %rbx,%rdx +._mainloop2: + paddd %xmm0,%xmm4 + movdqa %xmm0,%xmm5 + movdqa %xmm4,%xmm6 + pslld $7,%xmm4 + psrld $25,%xmm6 + pxor %xmm4,%xmm3 + pxor %xmm6,%xmm3 + paddd %xmm3,%xmm5 + movdqa %xmm3,%xmm4 + movdqa %xmm5,%xmm6 + pslld $9,%xmm5 + psrld $23,%xmm6 + pxor %xmm5,%xmm2 + pshufd $0x93,%xmm3,%xmm3 + pxor %xmm6,%xmm2 + paddd %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm4,%xmm6 + pslld $13,%xmm4 + psrld $19,%xmm6 + pxor %xmm4,%xmm1 + pshufd $0x4e,%xmm2,%xmm2 + pxor %xmm6,%xmm1 + paddd %xmm1,%xmm5 + movdqa %xmm3,%xmm4 + movdqa %xmm5,%xmm6 + pslld $18,%xmm5 + psrld $14,%xmm6 + pxor %xmm5,%xmm0 + pshufd $0x39,%xmm1,%xmm1 + pxor %xmm6,%xmm0 + paddd %xmm0,%xmm4 + movdqa %xmm0,%xmm5 + movdqa %xmm4,%xmm6 + pslld $7,%xmm4 + psrld $25,%xmm6 + pxor %xmm4,%xmm1 + pxor %xmm6,%xmm1 + paddd %xmm1,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm5,%xmm6 + pslld $9,%xmm5 + psrld $23,%xmm6 + pxor %xmm5,%xmm2 + pshufd $0x93,%xmm1,%xmm1 + pxor %xmm6,%xmm2 + paddd %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm4,%xmm6 + pslld $13,%xmm4 + psrld $19,%xmm6 + pxor %xmm4,%xmm3 + pshufd $0x4e,%xmm2,%xmm2 + pxor %xmm6,%xmm3 + paddd %xmm3,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm5,%xmm6 + pslld $18,%xmm5 + psrld $14,%xmm6 + pxor %xmm5,%xmm0 + pshufd $0x39,%xmm3,%xmm3 + pxor %xmm6,%xmm0 + paddd %xmm0,%xmm4 + movdqa %xmm0,%xmm5 + movdqa %xmm4,%xmm6 + pslld $7,%xmm4 + psrld $25,%xmm6 + pxor %xmm4,%xmm3 + pxor %xmm6,%xmm3 + paddd %xmm3,%xmm5 + movdqa %xmm3,%xmm4 + movdqa 
%xmm5,%xmm6 + pslld $9,%xmm5 + psrld $23,%xmm6 + pxor %xmm5,%xmm2 + pshufd $0x93,%xmm3,%xmm3 + pxor %xmm6,%xmm2 + paddd %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm4,%xmm6 + pslld $13,%xmm4 + psrld $19,%xmm6 + pxor %xmm4,%xmm1 + pshufd $0x4e,%xmm2,%xmm2 + pxor %xmm6,%xmm1 + paddd %xmm1,%xmm5 + movdqa %xmm3,%xmm4 + movdqa %xmm5,%xmm6 + pslld $18,%xmm5 + psrld $14,%xmm6 + pxor %xmm5,%xmm0 + pshufd $0x39,%xmm1,%xmm1 + pxor %xmm6,%xmm0 + paddd %xmm0,%xmm4 + movdqa %xmm0,%xmm5 + movdqa %xmm4,%xmm6 + pslld $7,%xmm4 + psrld $25,%xmm6 + pxor %xmm4,%xmm1 + pxor %xmm6,%xmm1 + paddd %xmm1,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm5,%xmm6 + pslld $9,%xmm5 + psrld $23,%xmm6 + pxor %xmm5,%xmm2 + pshufd $0x93,%xmm1,%xmm1 + pxor %xmm6,%xmm2 + paddd %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm4,%xmm6 + pslld $13,%xmm4 + psrld $19,%xmm6 + pxor %xmm4,%xmm3 + pshufd $0x4e,%xmm2,%xmm2 + pxor %xmm6,%xmm3 + sub $4,%rdx + paddd %xmm3,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm5,%xmm6 + pslld $18,%xmm5 + pxor %xmm7,%xmm7 + psrld $14,%xmm6 + pxor %xmm5,%xmm0 + pshufd $0x39,%xmm3,%xmm3 + pxor %xmm6,%xmm0 + ja ._mainloop2 + paddd 0(%r8),%xmm0 + paddd 16(%r8),%xmm1 + paddd 32(%r8),%xmm2 + paddd 48(%r8),%xmm3 + movd %xmm0,%rdx + movd %xmm1,%rcx + movd %xmm2,%rax + movd %xmm3,%r10 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x39,%xmm1,%xmm1 + pshufd $0x39,%xmm2,%xmm2 + pshufd $0x39,%xmm3,%xmm3 + xorl 0(%rsi),%edx + xorl 48(%rsi),%ecx + xorl 32(%rsi),%eax + xorl 16(%rsi),%r10d + movl %edx,0(%rdi) + movl %ecx,48(%rdi) + movl %eax,32(%rdi) + movl %r10d,16(%rdi) + movd %xmm0,%rdx + movd %xmm1,%rcx + movd %xmm2,%rax + movd %xmm3,%r10 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x39,%xmm1,%xmm1 + pshufd $0x39,%xmm2,%xmm2 + pshufd $0x39,%xmm3,%xmm3 + xorl 20(%rsi),%edx + xorl 4(%rsi),%ecx + xorl 52(%rsi),%eax + xorl 36(%rsi),%r10d + movl %edx,20(%rdi) + movl %ecx,4(%rdi) + movl %eax,52(%rdi) + movl %r10d,36(%rdi) + movd %xmm0,%rdx + movd %xmm1,%rcx + movd %xmm2,%rax + movd %xmm3,%r10 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x39,%xmm1,%xmm1 + pshufd $0x39,%xmm2,%xmm2 + pshufd $0x39,%xmm3,%xmm3 + xorl 40(%rsi),%edx + xorl 24(%rsi),%ecx + xorl 8(%rsi),%eax + xorl 56(%rsi),%r10d + movl %edx,40(%rdi) + movl %ecx,24(%rdi) + movl %eax,8(%rdi) + movl %r10d,56(%rdi) + movd %xmm0,%rdx + movd %xmm1,%rcx + movd %xmm2,%rax + movd %xmm3,%r10 + xorl 60(%rsi),%edx + xorl 44(%rsi),%ecx + xorl 28(%rsi),%eax + xorl 12(%rsi),%r10d + movl %edx,60(%rdi) + movl %ecx,44(%rdi) + movl %eax,28(%rdi) + movl %r10d,12(%rdi) + movq 288(%rsp),%rdx + movl 32(%r8),%ecx + movl 52(%r8),%eax + add $1,%ecx + adc $0,%eax + movl %ecx,32(%r8) + movl %eax,52(%r8) + cmp $64,%rdx + ja ._bytes_are_128_or_192 +._done: + add %r11,%rsp + mov %r11,%rax + pop %rbx + ret +._bytes_are_128_or_192: + sub $64,%rdx + add $64,%rdi + add $64,%rsi + jmp ._bytes_are_64_128_or_192 +.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks; + +#endif /*defined(USE_SALSA20)*/ +#endif /*__x86_64*/ diff --git a/cipher/salsa20.c b/cipher/salsa20.c index 6189bca9..892b9fc2 100644 --- a/cipher/salsa20.c +++ b/cipher/salsa20.c @@ -40,6 +40,14 @@ #include "cipher.h" #include "bufhelp.h" + +/* USE_AMD64 indicates whether to compile with AMD64 code. */ +#undef USE_AMD64 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define USE_AMD64 1 +#endif + + #define SALSA20_MIN_KEY_SIZE 16 /* Bytes. */ #define SALSA20_MAX_KEY_SIZE 32 /* Bytes. */ #define SALSA20_BLOCK_SIZE 64 /* Bytes. 
*/ @@ -83,6 +91,36 @@ typedef struct static void salsa20_setiv (void *context, const byte *iv, unsigned int ivlen); static const char *selftest (void); + +#ifdef USE_AMD64 +/* AMD64 assembly implementations of Salsa20. */ +void _gcry_salsa20_amd64_keysetup(u32 *ctxinput, const void *key, int keybits); +void _gcry_salsa20_amd64_ivsetup(u32 *ctxinput, const void *iv); +unsigned int +_gcry_salsa20_amd64_encrypt_blocks(u32 *ctxinput, const void *src, void *dst, + size_t len, int rounds); + +static void +salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen) +{ + _gcry_salsa20_amd64_keysetup(ctx->input, key, keylen * 8); +} + +static void +salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv) +{ + _gcry_salsa20_amd64_ivsetup(ctx->input, iv); +} + +static unsigned int +salsa20_core (u32 *dst, u32 *src, unsigned int rounds) +{ + memset(dst, 0, SALSA20_BLOCK_SIZE); + return _gcry_salsa20_amd64_encrypt_blocks(src, dst, dst, 1, rounds); +} + +#else /* USE_AMD64 */ + #if 0 @@ -110,8 +148,8 @@ static const char *selftest (void); x0 ^= ROTL32 (18, x3 + x2); \ } while(0) -static void -salsa20_core (u32 *dst, const u32 *src, unsigned rounds) +static unsigned int +salsa20_core (u32 *dst, u32 *src, unsigned int rounds) { u32 pad[SALSA20_INPUT_LENGTH]; unsigned int i; @@ -138,31 +176,24 @@ salsa20_core (u32 *dst, const u32 *src, unsigned rounds) u32 t = pad[i] + src[i]; dst[i] = LE_SWAP32 (t); } + + /* Update counter. */ + if (!++src[8]) + src[9]++; + + /* burn_stack */ + return ( 3*sizeof (void*) \ + + 2*sizeof (void*) \ + + 64 \ + + sizeof (unsigned int) \ + + sizeof (u32) ); } #undef QROUND #undef SALSA20_CORE_DEBUG -static gcry_err_code_t -salsa20_do_setkey (SALSA20_context_t *ctx, - const byte *key, unsigned int keylen) +static void +salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen) { - static int initialized; - static const char *selftest_failed; - - if (!initialized ) - { - initialized = 1; - selftest_failed = selftest (); - if (selftest_failed) - log_error ("SALSA20 selftest failed (%s)\n", selftest_failed ); - } - if (selftest_failed) - return GPG_ERR_SELFTEST_FAILED; - - if (keylen != SALSA20_MIN_KEY_SIZE - && keylen != SALSA20_MAX_KEY_SIZE) - return GPG_ERR_INV_KEYLEN; - /* These constants are the little endian encoding of the string "expand 32-byte k". For the 128 bit variant, the "32" in that string will be fixed up to "16". */ @@ -192,6 +223,41 @@ salsa20_do_setkey (SALSA20_context_t *ctx, ctx->input[5] -= 0x02000000; /* Change to "1 dn". */ ctx->input[10] += 0x00000004; /* Change to "yb-6". */ } +} + +static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv) +{ + ctx->input[6] = LE_READ_UINT32(iv + 0); + ctx->input[7] = LE_READ_UINT32(iv + 4); + /* Reset the block counter. */ + ctx->input[8] = 0; + ctx->input[9] = 0; +} + +#endif /*!USE_AMD64*/ + +static gcry_err_code_t +salsa20_do_setkey (SALSA20_context_t *ctx, + const byte *key, unsigned int keylen) +{ + static int initialized; + static const char *selftest_failed; + + if (!initialized ) + { + initialized = 1; + selftest_failed = selftest (); + if (selftest_failed) + log_error ("SALSA20 selftest failed (%s)\n", selftest_failed ); + } + if (selftest_failed) + return GPG_ERR_SELFTEST_FAILED; + + if (keylen != SALSA20_MIN_KEY_SIZE + && keylen != SALSA20_MAX_KEY_SIZE) + return GPG_ERR_INV_KEYLEN; + + salsa20_keysetup (ctx, key, keylen); /* We default to a zero nonce. 
*/ salsa20_setiv (ctx, NULL, 0); @@ -205,7 +271,7 @@ salsa20_setkey (void *context, const byte *key, unsigned int keylen) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; gcry_err_code_t rc = salsa20_do_setkey (ctx, key, keylen); - _gcry_burn_stack (300/* FIXME*/); + _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *)); return rc; } @@ -214,28 +280,22 @@ static void salsa20_setiv (void *context, const byte *iv, unsigned int ivlen) { SALSA20_context_t *ctx = (SALSA20_context_t *)context; + byte tmp[SALSA20_IV_SIZE]; - if (!iv) - { - ctx->input[6] = 0; - ctx->input[7] = 0; - } - else if (ivlen == SALSA20_IV_SIZE) - { - ctx->input[6] = LE_READ_UINT32(iv + 0); - ctx->input[7] = LE_READ_UINT32(iv + 4); - } + if (iv && ivlen != SALSA20_IV_SIZE) + log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", ivlen); + + if (!iv || ivlen != SALSA20_IV_SIZE) + memset (tmp, 0, sizeof(tmp)); else - { - log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", ivlen); - ctx->input[6] = 0; - ctx->input[7] = 0; - } - /* Reset the block counter. */ - ctx->input[8] = 0; - ctx->input[9] = 0; + memcpy (tmp, iv, SALSA20_IV_SIZE); + + salsa20_ivsetup (ctx, tmp); + /* Reset the unused pad bytes counter. */ ctx->unused = 0; + + wipememory (tmp, sizeof(tmp)); } @@ -246,6 +306,8 @@ salsa20_do_encrypt_stream (SALSA20_context_t *ctx, byte *outbuf, const byte *inbuf, unsigned int length, unsigned rounds) { + unsigned int nburn, burn = 0; + if (ctx->unused) { unsigned char *p = (void*)ctx->pad; @@ -266,26 +328,39 @@ salsa20_do_encrypt_stream (SALSA20_context_t *ctx, gcry_assert (!ctx->unused); } - for (;;) +#ifdef USE_AMD64 + if (length >= SALSA20_BLOCK_SIZE) + { + unsigned int nblocks = length / SALSA20_BLOCK_SIZE; + burn = _gcry_salsa20_amd64_encrypt_blocks(ctx->input, inbuf, outbuf, + nblocks, rounds); + length -= SALSA20_BLOCK_SIZE * nblocks; + outbuf += SALSA20_BLOCK_SIZE * nblocks; + inbuf += SALSA20_BLOCK_SIZE * nblocks; + } +#endif + + while (length > 0) { /* Create the next pad and bump the block counter. Note that it is the user's duty to change to another nonce not later than after 2^70 processed bytes. */ - salsa20_core (ctx->pad, ctx->input, rounds); - if (!++ctx->input[8]) - ctx->input[9]++; + nburn = salsa20_core (ctx->pad, ctx->input, rounds); + burn = nburn > burn ? 
nburn : burn; if (length <= SALSA20_BLOCK_SIZE) { buf_xor (outbuf, inbuf, ctx->pad, length); ctx->unused = SALSA20_BLOCK_SIZE - length; - return; + break; } buf_xor (outbuf, inbuf, ctx->pad, SALSA20_BLOCK_SIZE); length -= SALSA20_BLOCK_SIZE; outbuf += SALSA20_BLOCK_SIZE; inbuf += SALSA20_BLOCK_SIZE; - } + } + + _gcry_burn_stack (burn); } @@ -296,19 +371,7 @@ salsa20_encrypt_stream (void *context, SALSA20_context_t *ctx = (SALSA20_context_t *)context; if (length) - { - salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS); - _gcry_burn_stack (/* salsa20_do_encrypt_stream: */ - 2*sizeof (void*) - + 3*sizeof (void*) + sizeof (unsigned int) - /* salsa20_core: */ - + 2*sizeof (void*) - + 2*sizeof (void*) - + 64 - + sizeof (unsigned int) - + sizeof (u32) - ); - } + salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS); } @@ -319,19 +382,7 @@ salsa20r12_encrypt_stream (void *context, SALSA20_context_t *ctx = (SALSA20_context_t *)context; if (length) - { - salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS); - _gcry_burn_stack (/* salsa20_do_encrypt_stream: */ - 2*sizeof (void*) - + 3*sizeof (void*) + sizeof (unsigned int) - /* salsa20_core: */ - + 2*sizeof (void*) - + 2*sizeof (void*) - + 64 - + sizeof (unsigned int) - + sizeof (u32) - ); - } + salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS); } diff --git a/configure.ac b/configure.ac index 5b7ba0d8..114460c2 100644 --- a/configure.ac +++ b/configure.ac @@ -1553,6 +1553,13 @@ LIST_MEMBER(salsa20, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo" AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-amd64.lo" + ;; + esac fi LIST_MEMBER(gost28147, $enabled_ciphers) |
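Not part of the patch, but a quick way to exercise the new code path from the
public libgcrypt API (assumes a libgcrypt >= 1.6 build with the Salsa20 module
enabled on x86-64; error handling shortened):

```c
#include <stdio.h>
#include <gcrypt.h>

int
main (void)
{
  gcry_cipher_hd_t hd;
  unsigned char key[32] = { 0 };    /* 256-bit key (128-bit also allowed)   */
  unsigned char nonce[8] = { 0 };   /* 64-bit Salsa20 nonce                 */
  unsigned char buf[256] = { 0 };   /* large enough to hit the 4-block path */

  if (!gcry_check_version (GCRYPT_VERSION))
    return 1;
  gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

  gcry_cipher_open (&hd, GCRY_CIPHER_SALSA20, GCRY_CIPHER_MODE_STREAM, 0);
  gcry_cipher_setkey (hd, key, sizeof key);
  gcry_cipher_setiv (hd, nonce, sizeof nonce);

  /* In-place encryption: the keystream is XORed into BUF; requests of
     64 bytes or more are routed to the AMD64 block routine.  */
  gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);

  printf ("first keystream byte: %02x\n", buf[0]);
  gcry_cipher_close (hd);
  return 0;
}
```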