author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2013-10-26 15:00:48 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2013-10-28 16:12:19 +0200
commit    5a3d43485efdc09912be0967ee0a3ce345b3b15a (patch)
tree      ff8e937e2d010ae8e015707f5665915dabe1e915 /cipher/salsa20-amd64.S
parent    e214e8392671dd30e9c33260717b5e756debf3bf (diff)
download  libgcrypt-5a3d43485efdc09912be0967ee0a3ce345b3b15a.tar.gz
Add AMD64 assembly implementation of Salsa20
* cipher/Makefile.am: Add 'salsa20-amd64.S'.
* cipher/salsa20-amd64.S: New.
* cipher/salsa20.c (USE_AMD64): New macro.
[USE_AMD64] (_gcry_salsa20_amd64_keysetup, _gcry_salsa20_amd64_ivsetup)
(_gcry_salsa20_amd64_encrypt_blocks): New prototypes.
[USE_AMD64] (salsa20_keysetup, salsa20_ivsetup, salsa20_core): New.
[!USE_AMD64] (salsa20_core): Change 'src' to non-constant, update block
counter in 'salsa20_core' and return burn stack depth.
[!USE_AMD64] (salsa20_keysetup, salsa20_ivsetup): New.
(salsa20_do_setkey): Move generic key setup to 'salsa20_keysetup'.
(salsa20_setkey): Fix burn stack depth.
(salsa20_setiv): Move generic IV setup to 'salsa20_ivsetup'.
(salsa20_do_encrypt_stream) [USE_AMD64]: Process large buffers in AMD64
implementation.
(salsa20_do_encrypt_stream): Move stack burning to this function...
(salsa20_encrypt_stream, salsa20r12_encrypt_stream): ...from these
functions.
* configure.ac [x86-64]: Add 'salsa20-amd64.lo'.
--

This patch adds a fast AMD64 assembly implementation of Salsa20. The
implementation is based on public domain code by D. J. Bernstein,
available at http://cr.yp.to/snuffle.html (amd64-xmm6). It gains extra
speed by processing four blocks in parallel with the help of SSE2
instructions.

Benchmark results on Intel Core i5-4570 (3.2 GHz):

Before:
 SALSA20        |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      3.88 ns/B     246.0 MiB/s     12.41 c/B
     STREAM dec |      3.88 ns/B     246.0 MiB/s     12.41 c/B
                =
 SALSA20R12     |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.46 ns/B     387.9 MiB/s      7.87 c/B
     STREAM dec |      2.46 ns/B     387.7 MiB/s      7.87 c/B

After:
 SALSA20        |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.985 ns/B     967.8 MiB/s      3.15 c/B
     STREAM dec |     0.987 ns/B     966.5 MiB/s      3.16 c/B
                =
 SALSA20R12     |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.636 ns/B    1500.5 MiB/s      2.03 c/B
     STREAM dec |     0.636 ns/B    1499.2 MiB/s      2.04 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
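
Note on the C-side hookup (not part of this diff, which only shows the new
.S file): salsa20.c calls the three entry points named in the ChangeLog
above. Their exact prototypes are not shown here; the sketch below is a
plausible shape inferred from the register usage in the assembly (SysV
AMD64 ABI) and libgcrypt naming habits, so the parameter types are
assumptions rather than copies of the real declarations.

    #include <stddef.h>                 /* size_t */

    typedef unsigned int  u32;          /* assumption: 32-bit state words */
    typedef unsigned char byte;         /* assumption: raw key/IV bytes   */

    /* Fill the 64-byte state from the key and the "expand ..-byte k"
     * constants; keybits is 128 or 256.                                  */
    void _gcry_salsa20_amd64_keysetup(u32 *state, const byte *key,
                                      unsigned int keybits);

    /* Store the 8-byte IV and zero the 64-bit block counter.             */
    void _gcry_salsa20_amd64_ivsetup(u32 *state, const byte *iv);

    /* Generate keystream for 'nblocks' full 64-byte blocks, XOR it with
     * 'src' into 'dst' and advance the block counter; 'rounds' is 20,
     * or 12 for Salsa20/12.                                              */
    void _gcry_salsa20_amd64_encrypt_blocks(u32 *state, const byte *src,
                                            byte *dst, size_t nblocks,
                                            unsigned int rounds);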
Diffstat (limited to 'cipher/salsa20-amd64.S')
-rw-r--r--   cipher/salsa20-amd64.S   924
1 file changed, 924 insertions(+), 0 deletions(-)
diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S
new file mode 100644
index 00000000..691df588
--- /dev/null
+++ b/cipher/salsa20-amd64.S
@@ -0,0 +1,924 @@
+/* salsa20-amd64.S - AMD64 implementation of Salsa20
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by D. J. Bernstein at
+ * http://cr.yp.to/snuffle.html
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_SALSA20)
+
+.text
+
+.align 8
+.globl _gcry_salsa20_amd64_keysetup
+.type _gcry_salsa20_amd64_keysetup,@function;
+_gcry_salsa20_amd64_keysetup:
+ movl 0(%rsi),%r8d
+ movl 4(%rsi),%r9d
+ movl 8(%rsi),%eax
+ movl 12(%rsi),%r10d
+ movl %r8d,20(%rdi)
+ movl %r9d,40(%rdi)
+ movl %eax,60(%rdi)
+ movl %r10d,48(%rdi)
+ cmp $256,%rdx
+ jb ._kbits128
+._kbits256:
+ movl 16(%rsi),%edx
+ movl 20(%rsi),%ecx
+ movl 24(%rsi),%r8d
+ movl 28(%rsi),%esi
+ movl %edx,28(%rdi)
+ movl %ecx,16(%rdi)
+ movl %r8d,36(%rdi)
+ movl %esi,56(%rdi)
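+	/* Constants sigma = "expa" "nd 3" "2-by" "te k" (256-bit key). */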
+ mov $1634760805,%rsi
+ mov $857760878,%rdx
+ mov $2036477234,%rcx
+ mov $1797285236,%r8
+ movl %esi,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %r8d,12(%rdi)
+ jmp ._keysetupdone
+._kbits128:
+ movl 0(%rsi),%edx
+ movl 4(%rsi),%ecx
+ movl 8(%rsi),%r8d
+ movl 12(%rsi),%esi
+ movl %edx,28(%rdi)
+ movl %ecx,16(%rdi)
+ movl %r8d,36(%rdi)
+ movl %esi,56(%rdi)
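+	/* Constants tau = "expa" "nd 1" "6-by" "te k" (128-bit key). */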
+ mov $1634760805,%rsi
+ mov $824206446,%rdx
+ mov $2036477238,%rcx
+ mov $1797285236,%r8
+ movl %esi,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %r8d,12(%rdi)
+._keysetupdone:
+ ret
+
+.align 8
+.globl _gcry_salsa20_amd64_ivsetup
+.type _gcry_salsa20_amd64_ivsetup,@function;
+_gcry_salsa20_amd64_ivsetup:
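+	/* %rdi = state, %rsi = iv: store the 8-byte IV and reset the
+	 * 64-bit block counter (state offsets 32 and 52) to zero. */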
+ movl 0(%rsi),%r8d
+ movl 4(%rsi),%esi
+ mov $0,%r9
+ mov $0,%rax
+ movl %r8d,24(%rdi)
+ movl %esi,44(%rdi)
+ movl %r9d,32(%rdi)
+ movl %eax,52(%rdi)
+ ret
+
+.align 8
+.globl _gcry_salsa20_amd64_encrypt_blocks
+.type _gcry_salsa20_amd64_encrypt_blocks,@function;
+_gcry_salsa20_amd64_encrypt_blocks:
+ /*
+ * Modifications to original implementation:
+	 * - Number of rounds is passed in register %r8 (for Salsa20/12).
+	 * - Length is passed in as a number of blocks, so tail bytes are not
+	 *   handled here (that is done in salsa20.c).
+ */
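+	/*
+	 * Arguments (SysV AMD64 ABI): %rdi = state, %rsi = src, %rdx = dst,
+	 * %rcx = number of 64-byte blocks, %r8 = number of rounds; they are
+	 * remapped below so that %r8 = state, %rdi = dst, %rdx = byte count
+	 * and %rbx = rounds.
+	 */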
+ push %rbx
+ shlq $6, %rcx /* blocks to bytes */
+ mov %r8, %rbx
+ mov %rsp,%r11
+ and $31,%r11
+ add $384,%r11
+ sub %r11,%rsp
+ mov %rdi,%r8
+ mov %rsi,%rsi
+ mov %rdx,%rdi
+ mov %rcx,%rdx
+ cmp $0,%rdx
+ jbe ._done
+._start:
+ cmp $256,%rdx
+ jb ._bytes_are_64_128_or_192
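+	/* Four-block path: broadcast each 32-bit word of the input state
+	 * across an xmm register and spill to 0..208(%rsp). */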
+ movdqa 0(%r8),%xmm0
+ pshufd $0x55,%xmm0,%xmm1
+ pshufd $0xaa,%xmm0,%xmm2
+ pshufd $0xff,%xmm0,%xmm3
+ pshufd $0x00,%xmm0,%xmm0
+ movdqa %xmm1,0(%rsp)
+ movdqa %xmm2,16(%rsp)
+ movdqa %xmm3,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ movdqa 16(%r8),%xmm0
+ pshufd $0xaa,%xmm0,%xmm1
+ pshufd $0xff,%xmm0,%xmm2
+ pshufd $0x00,%xmm0,%xmm3
+ pshufd $0x55,%xmm0,%xmm0
+ movdqa %xmm1,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm3,96(%rsp)
+ movdqa %xmm0,112(%rsp)
+ movdqa 32(%r8),%xmm0
+ pshufd $0xff,%xmm0,%xmm1
+ pshufd $0x55,%xmm0,%xmm2
+ pshufd $0xaa,%xmm0,%xmm0
+ movdqa %xmm1,128(%rsp)
+ movdqa %xmm2,144(%rsp)
+ movdqa %xmm0,160(%rsp)
+ movdqa 48(%r8),%xmm0
+ pshufd $0x00,%xmm0,%xmm1
+ pshufd $0xaa,%xmm0,%xmm2
+ pshufd $0xff,%xmm0,%xmm0
+ movdqa %xmm1,176(%rsp)
+ movdqa %xmm2,192(%rsp)
+ movdqa %xmm0,208(%rsp)
+._bytesatleast256:
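+	/* Prepare four consecutive block counters (low words at 224(%rsp),
+	 * high words at 240(%rsp)) for the four parallel blocks, then
+	 * advance the counter in the state by 4. */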
+ movl 32(%r8),%ecx
+ movl 52(%r8),%r9d
+ movl %ecx,224(%rsp)
+ movl %r9d,240(%rsp)
+ add $1,%ecx
+ adc $0,%r9d
+ movl %ecx,4+224(%rsp)
+ movl %r9d,4+240(%rsp)
+ add $1,%ecx
+ adc $0,%r9d
+ movl %ecx,8+224(%rsp)
+ movl %r9d,8+240(%rsp)
+ add $1,%ecx
+ adc $0,%r9d
+ movl %ecx,12+224(%rsp)
+ movl %r9d,12+240(%rsp)
+ add $1,%ecx
+ adc $0,%r9d
+ movl %ecx,32(%r8)
+ movl %r9d,52(%r8)
+ movq %rdx,288(%rsp)
+ mov %rbx,%rdx
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ movdqa 192(%rsp),%xmm3
+ movdqa 208(%rsp),%xmm4
+ movdqa 64(%rsp),%xmm5
+ movdqa 80(%rsp),%xmm6
+ movdqa 112(%rsp),%xmm7
+ movdqa 128(%rsp),%xmm8
+ movdqa 144(%rsp),%xmm9
+ movdqa 160(%rsp),%xmm10
+ movdqa 240(%rsp),%xmm11
+ movdqa 48(%rsp),%xmm12
+ movdqa 96(%rsp),%xmm13
+ movdqa 176(%rsp),%xmm14
+ movdqa 224(%rsp),%xmm15
+._mainloop1:
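+	/* Four blocks in parallel: each xmm register holds one state word
+	 * from all four blocks; one pass is a double round (the rounds
+	 * counter in %rdx is decremented by 2 at the bottom). */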
+ movdqa %xmm1,256(%rsp)
+ movdqa %xmm2,272(%rsp)
+ movdqa %xmm13,%xmm1
+ paddd %xmm12,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $7,%xmm1
+ pxor %xmm1,%xmm14
+ psrld $25,%xmm2
+ pxor %xmm2,%xmm14
+ movdqa %xmm7,%xmm1
+ paddd %xmm0,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $7,%xmm1
+ pxor %xmm1,%xmm11
+ psrld $25,%xmm2
+ pxor %xmm2,%xmm11
+ movdqa %xmm12,%xmm1
+ paddd %xmm14,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $9,%xmm1
+ pxor %xmm1,%xmm15
+ psrld $23,%xmm2
+ pxor %xmm2,%xmm15
+ movdqa %xmm0,%xmm1
+ paddd %xmm11,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $9,%xmm1
+ pxor %xmm1,%xmm9
+ psrld $23,%xmm2
+ pxor %xmm2,%xmm9
+ movdqa %xmm14,%xmm1
+ paddd %xmm15,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $13,%xmm1
+ pxor %xmm1,%xmm13
+ psrld $19,%xmm2
+ pxor %xmm2,%xmm13
+ movdqa %xmm11,%xmm1
+ paddd %xmm9,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $13,%xmm1
+ pxor %xmm1,%xmm7
+ psrld $19,%xmm2
+ pxor %xmm2,%xmm7
+ movdqa %xmm15,%xmm1
+ paddd %xmm13,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $18,%xmm1
+ pxor %xmm1,%xmm12
+ psrld $14,%xmm2
+ pxor %xmm2,%xmm12
+ movdqa 256(%rsp),%xmm1
+ movdqa %xmm12,256(%rsp)
+ movdqa %xmm9,%xmm2
+ paddd %xmm7,%xmm2
+ movdqa %xmm2,%xmm12
+ pslld $18,%xmm2
+ pxor %xmm2,%xmm0
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm0
+ movdqa %xmm5,%xmm2
+ paddd %xmm1,%xmm2
+ movdqa %xmm2,%xmm12
+ pslld $7,%xmm2
+ pxor %xmm2,%xmm3
+ psrld $25,%xmm12
+ pxor %xmm12,%xmm3
+ movdqa 272(%rsp),%xmm2
+ movdqa %xmm0,272(%rsp)
+ movdqa %xmm6,%xmm0
+ paddd %xmm2,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $7,%xmm0
+ pxor %xmm0,%xmm4
+ psrld $25,%xmm12
+ pxor %xmm12,%xmm4
+ movdqa %xmm1,%xmm0
+ paddd %xmm3,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $9,%xmm0
+ pxor %xmm0,%xmm10
+ psrld $23,%xmm12
+ pxor %xmm12,%xmm10
+ movdqa %xmm2,%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $9,%xmm0
+ pxor %xmm0,%xmm8
+ psrld $23,%xmm12
+ pxor %xmm12,%xmm8
+ movdqa %xmm3,%xmm0
+ paddd %xmm10,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $13,%xmm0
+ pxor %xmm0,%xmm5
+ psrld $19,%xmm12
+ pxor %xmm12,%xmm5
+ movdqa %xmm4,%xmm0
+ paddd %xmm8,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $13,%xmm0
+ pxor %xmm0,%xmm6
+ psrld $19,%xmm12
+ pxor %xmm12,%xmm6
+ movdqa %xmm10,%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $18,%xmm0
+ pxor %xmm0,%xmm1
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm1
+ movdqa 256(%rsp),%xmm0
+ movdqa %xmm1,256(%rsp)
+ movdqa %xmm4,%xmm1
+ paddd %xmm0,%xmm1
+ movdqa %xmm1,%xmm12
+ pslld $7,%xmm1
+ pxor %xmm1,%xmm7
+ psrld $25,%xmm12
+ pxor %xmm12,%xmm7
+ movdqa %xmm8,%xmm1
+ paddd %xmm6,%xmm1
+ movdqa %xmm1,%xmm12
+ pslld $18,%xmm1
+ pxor %xmm1,%xmm2
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm2
+ movdqa 272(%rsp),%xmm12
+ movdqa %xmm2,272(%rsp)
+ movdqa %xmm14,%xmm1
+ paddd %xmm12,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $7,%xmm1
+ pxor %xmm1,%xmm5
+ psrld $25,%xmm2
+ pxor %xmm2,%xmm5
+ movdqa %xmm0,%xmm1
+ paddd %xmm7,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $9,%xmm1
+ pxor %xmm1,%xmm10
+ psrld $23,%xmm2
+ pxor %xmm2,%xmm10
+ movdqa %xmm12,%xmm1
+ paddd %xmm5,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $9,%xmm1
+ pxor %xmm1,%xmm8
+ psrld $23,%xmm2
+ pxor %xmm2,%xmm8
+ movdqa %xmm7,%xmm1
+ paddd %xmm10,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $13,%xmm1
+ pxor %xmm1,%xmm4
+ psrld $19,%xmm2
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm1
+ paddd %xmm8,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $13,%xmm1
+ pxor %xmm1,%xmm14
+ psrld $19,%xmm2
+ pxor %xmm2,%xmm14
+ movdqa %xmm10,%xmm1
+ paddd %xmm4,%xmm1
+ movdqa %xmm1,%xmm2
+ pslld $18,%xmm1
+ pxor %xmm1,%xmm0
+ psrld $14,%xmm2
+ pxor %xmm2,%xmm0
+ movdqa 256(%rsp),%xmm1
+ movdqa %xmm0,256(%rsp)
+ movdqa %xmm8,%xmm0
+ paddd %xmm14,%xmm0
+ movdqa %xmm0,%xmm2
+ pslld $18,%xmm0
+ pxor %xmm0,%xmm12
+ psrld $14,%xmm2
+ pxor %xmm2,%xmm12
+ movdqa %xmm11,%xmm0
+ paddd %xmm1,%xmm0
+ movdqa %xmm0,%xmm2
+ pslld $7,%xmm0
+ pxor %xmm0,%xmm6
+ psrld $25,%xmm2
+ pxor %xmm2,%xmm6
+ movdqa 272(%rsp),%xmm2
+ movdqa %xmm12,272(%rsp)
+ movdqa %xmm3,%xmm0
+ paddd %xmm2,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $7,%xmm0
+ pxor %xmm0,%xmm13
+ psrld $25,%xmm12
+ pxor %xmm12,%xmm13
+ movdqa %xmm1,%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $9,%xmm0
+ pxor %xmm0,%xmm15
+ psrld $23,%xmm12
+ pxor %xmm12,%xmm15
+ movdqa %xmm2,%xmm0
+ paddd %xmm13,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $9,%xmm0
+ pxor %xmm0,%xmm9
+ psrld $23,%xmm12
+ pxor %xmm12,%xmm9
+ movdqa %xmm6,%xmm0
+ paddd %xmm15,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $13,%xmm0
+ pxor %xmm0,%xmm11
+ psrld $19,%xmm12
+ pxor %xmm12,%xmm11
+ movdqa %xmm13,%xmm0
+ paddd %xmm9,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $13,%xmm0
+ pxor %xmm0,%xmm3
+ psrld $19,%xmm12
+ pxor %xmm12,%xmm3
+ movdqa %xmm15,%xmm0
+ paddd %xmm11,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $18,%xmm0
+ pxor %xmm0,%xmm1
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm1
+ movdqa %xmm9,%xmm0
+ paddd %xmm3,%xmm0
+ movdqa %xmm0,%xmm12
+ pslld $18,%xmm0
+ pxor %xmm0,%xmm2
+ psrld $14,%xmm12
+ pxor %xmm12,%xmm2
+ movdqa 256(%rsp),%xmm12
+ movdqa 272(%rsp),%xmm0
+ sub $2,%rdx
+ ja ._mainloop1
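+	/* Rounds done: add the saved input state back in and XOR the four
+	 * 64-byte keystream blocks with the source into the destination. */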
+ paddd 48(%rsp),%xmm12
+ paddd 112(%rsp),%xmm7
+ paddd 160(%rsp),%xmm10
+ paddd 208(%rsp),%xmm4
+ movd %xmm12,%rdx
+ movd %xmm7,%rcx
+ movd %xmm10,%r9
+ movd %xmm4,%rax
+ pshufd $0x39,%xmm12,%xmm12
+ pshufd $0x39,%xmm7,%xmm7
+ pshufd $0x39,%xmm10,%xmm10
+ pshufd $0x39,%xmm4,%xmm4
+ xorl 0(%rsi),%edx
+ xorl 4(%rsi),%ecx
+ xorl 8(%rsi),%r9d
+ xorl 12(%rsi),%eax
+ movl %edx,0(%rdi)
+ movl %ecx,4(%rdi)
+ movl %r9d,8(%rdi)
+ movl %eax,12(%rdi)
+ movd %xmm12,%rdx
+ movd %xmm7,%rcx
+ movd %xmm10,%r9
+ movd %xmm4,%rax
+ pshufd $0x39,%xmm12,%xmm12
+ pshufd $0x39,%xmm7,%xmm7
+ pshufd $0x39,%xmm10,%xmm10
+ pshufd $0x39,%xmm4,%xmm4
+ xorl 64(%rsi),%edx
+ xorl 68(%rsi),%ecx
+ xorl 72(%rsi),%r9d
+ xorl 76(%rsi),%eax
+ movl %edx,64(%rdi)
+ movl %ecx,68(%rdi)
+ movl %r9d,72(%rdi)
+ movl %eax,76(%rdi)
+ movd %xmm12,%rdx
+ movd %xmm7,%rcx
+ movd %xmm10,%r9
+ movd %xmm4,%rax
+ pshufd $0x39,%xmm12,%xmm12
+ pshufd $0x39,%xmm7,%xmm7
+ pshufd $0x39,%xmm10,%xmm10
+ pshufd $0x39,%xmm4,%xmm4
+ xorl 128(%rsi),%edx
+ xorl 132(%rsi),%ecx
+ xorl 136(%rsi),%r9d
+ xorl 140(%rsi),%eax
+ movl %edx,128(%rdi)
+ movl %ecx,132(%rdi)
+ movl %r9d,136(%rdi)
+ movl %eax,140(%rdi)
+ movd %xmm12,%rdx
+ movd %xmm7,%rcx
+ movd %xmm10,%r9
+ movd %xmm4,%rax
+ xorl 192(%rsi),%edx
+ xorl 196(%rsi),%ecx
+ xorl 200(%rsi),%r9d
+ xorl 204(%rsi),%eax
+ movl %edx,192(%rdi)
+ movl %ecx,196(%rdi)
+ movl %r9d,200(%rdi)
+ movl %eax,204(%rdi)
+ paddd 176(%rsp),%xmm14
+ paddd 0(%rsp),%xmm0
+ paddd 64(%rsp),%xmm5
+ paddd 128(%rsp),%xmm8
+ movd %xmm14,%rdx
+ movd %xmm0,%rcx
+ movd %xmm5,%r9
+ movd %xmm8,%rax
+ pshufd $0x39,%xmm14,%xmm14
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm5,%xmm5
+ pshufd $0x39,%xmm8,%xmm8
+ xorl 16(%rsi),%edx
+ xorl 20(%rsi),%ecx
+ xorl 24(%rsi),%r9d
+ xorl 28(%rsi),%eax
+ movl %edx,16(%rdi)
+ movl %ecx,20(%rdi)
+ movl %r9d,24(%rdi)
+ movl %eax,28(%rdi)
+ movd %xmm14,%rdx
+ movd %xmm0,%rcx
+ movd %xmm5,%r9
+ movd %xmm8,%rax
+ pshufd $0x39,%xmm14,%xmm14
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm5,%xmm5
+ pshufd $0x39,%xmm8,%xmm8
+ xorl 80(%rsi),%edx
+ xorl 84(%rsi),%ecx
+ xorl 88(%rsi),%r9d
+ xorl 92(%rsi),%eax
+ movl %edx,80(%rdi)
+ movl %ecx,84(%rdi)
+ movl %r9d,88(%rdi)
+ movl %eax,92(%rdi)
+ movd %xmm14,%rdx
+ movd %xmm0,%rcx
+ movd %xmm5,%r9
+ movd %xmm8,%rax
+ pshufd $0x39,%xmm14,%xmm14
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm5,%xmm5
+ pshufd $0x39,%xmm8,%xmm8
+ xorl 144(%rsi),%edx
+ xorl 148(%rsi),%ecx
+ xorl 152(%rsi),%r9d
+ xorl 156(%rsi),%eax
+ movl %edx,144(%rdi)
+ movl %ecx,148(%rdi)
+ movl %r9d,152(%rdi)
+ movl %eax,156(%rdi)
+ movd %xmm14,%rdx
+ movd %xmm0,%rcx
+ movd %xmm5,%r9
+ movd %xmm8,%rax
+ xorl 208(%rsi),%edx
+ xorl 212(%rsi),%ecx
+ xorl 216(%rsi),%r9d
+ xorl 220(%rsi),%eax
+ movl %edx,208(%rdi)
+ movl %ecx,212(%rdi)
+ movl %r9d,216(%rdi)
+ movl %eax,220(%rdi)
+ paddd 224(%rsp),%xmm15
+ paddd 240(%rsp),%xmm11
+ paddd 16(%rsp),%xmm1
+ paddd 80(%rsp),%xmm6
+ movd %xmm15,%rdx
+ movd %xmm11,%rcx
+ movd %xmm1,%r9
+ movd %xmm6,%rax
+ pshufd $0x39,%xmm15,%xmm15
+ pshufd $0x39,%xmm11,%xmm11
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm6,%xmm6
+ xorl 32(%rsi),%edx
+ xorl 36(%rsi),%ecx
+ xorl 40(%rsi),%r9d
+ xorl 44(%rsi),%eax
+ movl %edx,32(%rdi)
+ movl %ecx,36(%rdi)
+ movl %r9d,40(%rdi)
+ movl %eax,44(%rdi)
+ movd %xmm15,%rdx
+ movd %xmm11,%rcx
+ movd %xmm1,%r9
+ movd %xmm6,%rax
+ pshufd $0x39,%xmm15,%xmm15
+ pshufd $0x39,%xmm11,%xmm11
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm6,%xmm6
+ xorl 96(%rsi),%edx
+ xorl 100(%rsi),%ecx
+ xorl 104(%rsi),%r9d
+ xorl 108(%rsi),%eax
+ movl %edx,96(%rdi)
+ movl %ecx,100(%rdi)
+ movl %r9d,104(%rdi)
+ movl %eax,108(%rdi)
+ movd %xmm15,%rdx
+ movd %xmm11,%rcx
+ movd %xmm1,%r9
+ movd %xmm6,%rax
+ pshufd $0x39,%xmm15,%xmm15
+ pshufd $0x39,%xmm11,%xmm11
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm6,%xmm6
+ xorl 160(%rsi),%edx
+ xorl 164(%rsi),%ecx
+ xorl 168(%rsi),%r9d
+ xorl 172(%rsi),%eax
+ movl %edx,160(%rdi)
+ movl %ecx,164(%rdi)
+ movl %r9d,168(%rdi)
+ movl %eax,172(%rdi)
+ movd %xmm15,%rdx
+ movd %xmm11,%rcx
+ movd %xmm1,%r9
+ movd %xmm6,%rax
+ xorl 224(%rsi),%edx
+ xorl 228(%rsi),%ecx
+ xorl 232(%rsi),%r9d
+ xorl 236(%rsi),%eax
+ movl %edx,224(%rdi)
+ movl %ecx,228(%rdi)
+ movl %r9d,232(%rdi)
+ movl %eax,236(%rdi)
+ paddd 96(%rsp),%xmm13
+ paddd 144(%rsp),%xmm9
+ paddd 192(%rsp),%xmm3
+ paddd 32(%rsp),%xmm2
+ movd %xmm13,%rdx
+ movd %xmm9,%rcx
+ movd %xmm3,%r9
+ movd %xmm2,%rax
+ pshufd $0x39,%xmm13,%xmm13
+ pshufd $0x39,%xmm9,%xmm9
+ pshufd $0x39,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ xorl 48(%rsi),%edx
+ xorl 52(%rsi),%ecx
+ xorl 56(%rsi),%r9d
+ xorl 60(%rsi),%eax
+ movl %edx,48(%rdi)
+ movl %ecx,52(%rdi)
+ movl %r9d,56(%rdi)
+ movl %eax,60(%rdi)
+ movd %xmm13,%rdx
+ movd %xmm9,%rcx
+ movd %xmm3,%r9
+ movd %xmm2,%rax
+ pshufd $0x39,%xmm13,%xmm13
+ pshufd $0x39,%xmm9,%xmm9
+ pshufd $0x39,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ xorl 112(%rsi),%edx
+ xorl 116(%rsi),%ecx
+ xorl 120(%rsi),%r9d
+ xorl 124(%rsi),%eax
+ movl %edx,112(%rdi)
+ movl %ecx,116(%rdi)
+ movl %r9d,120(%rdi)
+ movl %eax,124(%rdi)
+ movd %xmm13,%rdx
+ movd %xmm9,%rcx
+ movd %xmm3,%r9
+ movd %xmm2,%rax
+ pshufd $0x39,%xmm13,%xmm13
+ pshufd $0x39,%xmm9,%xmm9
+ pshufd $0x39,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ xorl 176(%rsi),%edx
+ xorl 180(%rsi),%ecx
+ xorl 184(%rsi),%r9d
+ xorl 188(%rsi),%eax
+ movl %edx,176(%rdi)
+ movl %ecx,180(%rdi)
+ movl %r9d,184(%rdi)
+ movl %eax,188(%rdi)
+ movd %xmm13,%rdx
+ movd %xmm9,%rcx
+ movd %xmm3,%r9
+ movd %xmm2,%rax
+ xorl 240(%rsi),%edx
+ xorl 244(%rsi),%ecx
+ xorl 248(%rsi),%r9d
+ xorl 252(%rsi),%eax
+ movl %edx,240(%rdi)
+ movl %ecx,244(%rdi)
+ movl %r9d,248(%rdi)
+ movl %eax,252(%rdi)
+ movq 288(%rsp),%rdx
+ sub $256,%rdx
+ add $256,%rsi
+ add $256,%rdi
+ cmp $256,%rdx
+ jae ._bytesatleast256
+ cmp $0,%rdx
+ jbe ._done
+._bytes_are_64_128_or_192:
+ movq %rdx,288(%rsp)
+ movdqa 0(%r8),%xmm0
+ movdqa 16(%r8),%xmm1
+ movdqa 32(%r8),%xmm2
+ movdqa 48(%r8),%xmm3
+ movdqa %xmm1,%xmm4
+ mov %rbx,%rdx
+._mainloop2:
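+	/* Single-block path: the whole state lives in %xmm0-%xmm3 and each
+	 * iteration performs four rounds (rounds counter in %rdx is
+	 * decremented by 4). */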
+ paddd %xmm0,%xmm4
+ movdqa %xmm0,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $7,%xmm4
+ psrld $25,%xmm6
+ pxor %xmm4,%xmm3
+ pxor %xmm6,%xmm3
+ paddd %xmm3,%xmm5
+ movdqa %xmm3,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $9,%xmm5
+ psrld $23,%xmm6
+ pxor %xmm5,%xmm2
+ pshufd $0x93,%xmm3,%xmm3
+ pxor %xmm6,%xmm2
+ paddd %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $13,%xmm4
+ psrld $19,%xmm6
+ pxor %xmm4,%xmm1
+ pshufd $0x4e,%xmm2,%xmm2
+ pxor %xmm6,%xmm1
+ paddd %xmm1,%xmm5
+ movdqa %xmm3,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $18,%xmm5
+ psrld $14,%xmm6
+ pxor %xmm5,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pxor %xmm6,%xmm0
+ paddd %xmm0,%xmm4
+ movdqa %xmm0,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $7,%xmm4
+ psrld $25,%xmm6
+ pxor %xmm4,%xmm1
+ pxor %xmm6,%xmm1
+ paddd %xmm1,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $9,%xmm5
+ psrld $23,%xmm6
+ pxor %xmm5,%xmm2
+ pshufd $0x93,%xmm1,%xmm1
+ pxor %xmm6,%xmm2
+ paddd %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $13,%xmm4
+ psrld $19,%xmm6
+ pxor %xmm4,%xmm3
+ pshufd $0x4e,%xmm2,%xmm2
+ pxor %xmm6,%xmm3
+ paddd %xmm3,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $18,%xmm5
+ psrld $14,%xmm6
+ pxor %xmm5,%xmm0
+ pshufd $0x39,%xmm3,%xmm3
+ pxor %xmm6,%xmm0
+ paddd %xmm0,%xmm4
+ movdqa %xmm0,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $7,%xmm4
+ psrld $25,%xmm6
+ pxor %xmm4,%xmm3
+ pxor %xmm6,%xmm3
+ paddd %xmm3,%xmm5
+ movdqa %xmm3,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $9,%xmm5
+ psrld $23,%xmm6
+ pxor %xmm5,%xmm2
+ pshufd $0x93,%xmm3,%xmm3
+ pxor %xmm6,%xmm2
+ paddd %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $13,%xmm4
+ psrld $19,%xmm6
+ pxor %xmm4,%xmm1
+ pshufd $0x4e,%xmm2,%xmm2
+ pxor %xmm6,%xmm1
+ paddd %xmm1,%xmm5
+ movdqa %xmm3,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $18,%xmm5
+ psrld $14,%xmm6
+ pxor %xmm5,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pxor %xmm6,%xmm0
+ paddd %xmm0,%xmm4
+ movdqa %xmm0,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $7,%xmm4
+ psrld $25,%xmm6
+ pxor %xmm4,%xmm1
+ pxor %xmm6,%xmm1
+ paddd %xmm1,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $9,%xmm5
+ psrld $23,%xmm6
+ pxor %xmm5,%xmm2
+ pshufd $0x93,%xmm1,%xmm1
+ pxor %xmm6,%xmm2
+ paddd %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm4,%xmm6
+ pslld $13,%xmm4
+ psrld $19,%xmm6
+ pxor %xmm4,%xmm3
+ pshufd $0x4e,%xmm2,%xmm2
+ pxor %xmm6,%xmm3
+ sub $4,%rdx
+ paddd %xmm3,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm5,%xmm6
+ pslld $18,%xmm5
+ pxor %xmm7,%xmm7
+ psrld $14,%xmm6
+ pxor %xmm5,%xmm0
+ pshufd $0x39,%xmm3,%xmm3
+ pxor %xmm6,%xmm0
+ ja ._mainloop2
+ paddd 0(%r8),%xmm0
+ paddd 16(%r8),%xmm1
+ paddd 32(%r8),%xmm2
+ paddd 48(%r8),%xmm3
+ movd %xmm0,%rdx
+ movd %xmm1,%rcx
+ movd %xmm2,%rax
+ movd %xmm3,%r10
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm2,%xmm2
+ pshufd $0x39,%xmm3,%xmm3
+ xorl 0(%rsi),%edx
+ xorl 48(%rsi),%ecx
+ xorl 32(%rsi),%eax
+ xorl 16(%rsi),%r10d
+ movl %edx,0(%rdi)
+ movl %ecx,48(%rdi)
+ movl %eax,32(%rdi)
+ movl %r10d,16(%rdi)
+ movd %xmm0,%rdx
+ movd %xmm1,%rcx
+ movd %xmm2,%rax
+ movd %xmm3,%r10
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm2,%xmm2
+ pshufd $0x39,%xmm3,%xmm3
+ xorl 20(%rsi),%edx
+ xorl 4(%rsi),%ecx
+ xorl 52(%rsi),%eax
+ xorl 36(%rsi),%r10d
+ movl %edx,20(%rdi)
+ movl %ecx,4(%rdi)
+ movl %eax,52(%rdi)
+ movl %r10d,36(%rdi)
+ movd %xmm0,%rdx
+ movd %xmm1,%rcx
+ movd %xmm2,%rax
+ movd %xmm3,%r10
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x39,%xmm1,%xmm1
+ pshufd $0x39,%xmm2,%xmm2
+ pshufd $0x39,%xmm3,%xmm3
+ xorl 40(%rsi),%edx
+ xorl 24(%rsi),%ecx
+ xorl 8(%rsi),%eax
+ xorl 56(%rsi),%r10d
+ movl %edx,40(%rdi)
+ movl %ecx,24(%rdi)
+ movl %eax,8(%rdi)
+ movl %r10d,56(%rdi)
+ movd %xmm0,%rdx
+ movd %xmm1,%rcx
+ movd %xmm2,%rax
+ movd %xmm3,%r10
+ xorl 60(%rsi),%edx
+ xorl 44(%rsi),%ecx
+ xorl 28(%rsi),%eax
+ xorl 12(%rsi),%r10d
+ movl %edx,60(%rdi)
+ movl %ecx,44(%rdi)
+ movl %eax,28(%rdi)
+ movl %r10d,12(%rdi)
+ movq 288(%rsp),%rdx
+ movl 32(%r8),%ecx
+ movl 52(%r8),%eax
+ add $1,%ecx
+ adc $0,%eax
+ movl %ecx,32(%r8)
+ movl %eax,52(%r8)
+ cmp $64,%rdx
+ ja ._bytes_are_128_or_192
+._done:
+ add %r11,%rsp
+ mov %r11,%rax
+ pop %rbx
+ ret
+._bytes_are_128_or_192:
+ sub $64,%rdx
+ add $64,%rdi
+ add $64,%rsi
+ jmp ._bytes_are_64_128_or_192
+.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;
+
+#endif /*defined(USE_SALSA20)*/
+#endif /*__x86_64*/