 cipher/Makefile.am           |   2
 cipher/poly1305-avx2-amd64.S | 954
 cipher/poly1305-internal.h   |  23
 cipher/poly1305.c            |  26
 configure.ac                 |   1
 5 files changed, 1002 insertions, 4 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index a32ae892..19b00979 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -72,7 +72,7 @@ gost28147.c gost.h \
 gostr3411-94.c \
 md4.c \
 md5.c \
-poly1305-sse2-amd64.S \
+poly1305-sse2-amd64.S poly1305-avx2-amd64.S \
 rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \
 rmd160.c \
 rsa.c \
diff --git a/cipher/poly1305-avx2-amd64.S b/cipher/poly1305-avx2-amd64.S
new file mode 100644
index 00000000..0ba7e761
--- /dev/null
+++ b/cipher/poly1305-avx2-amd64.S
@@ -0,0 +1,954 @@
+/* poly1305-avx2-amd64.S - AMD64/AVX2 implementation of Poly1305
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by Andrew Moon at
+ *  https://github.com/floodyberry/poly1305-opt
+ */
+
+#include <config.h>
+
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(ENABLE_AVX2_SUPPORT)
+
+.text
+
+
+.align 8
+.globl _gcry_poly1305_amd64_avx2_init_ext
+.type _gcry_poly1305_amd64_avx2_init_ext,@function;
+_gcry_poly1305_amd64_avx2_init_ext:
+.Lpoly1305_init_ext_avx2_local:
+	xor %edx, %edx
+	vzeroupper
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	pushq %r15
+	pushq %rbx
+	movq %rdx, %rcx
+	vpxor %ymm0, %ymm0, %ymm0
+	movq $-1, %r8
+	testq %rcx, %rcx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm0, 32(%rdi)
+	vmovdqu %ymm0, 64(%rdi)
+	vmovdqu %ymm0, 96(%rdi)
+	vmovdqu %ymm0, 128(%rdi)
+	movq 8(%rsi), %r9
+	cmove %r8, %rcx
+	movq $0xffc0fffffff, %r8
+	movq %r9, %r13
+	movq (%rsi), %r10
+	andq %r10, %r8
+	shrq $44, %r10
+	movq %r8, %r14
+	shlq $20, %r13
+	orq %r13, %r10
+	movq $0xfffffc0ffff, %r13
+	shrq $24, %r9
+	andq %r13, %r10
+	movq $0xffffffc0f, %r13
+	andq %r13, %r9
+	movl %r8d, %r13d
+	andl $67108863, %r13d
+	movl %r13d, 164(%rdi)
+	movq %r10, %r13
+	shrq $26, %r14
+	shlq $18, %r13
+	orq %r13, %r14
+	movq %r10, %r13
+	shrq $8, %r13
+	andl $67108863, %r14d
+	andl $67108863, %r13d
+	movl %r14d, 172(%rdi)
+	movq %r10, %r14
+	movl %r13d, 180(%rdi)
+	movq %r9, %r13
+	shrq $34, %r14
+	shlq $10, %r13
+	orq %r13, %r14
+	movq %r9, %r13
+	shrq $16, %r13
+	andl $67108863, %r14d
+	movl %r14d, 188(%rdi)
+	movl %r13d, 196(%rdi)
+	cmpq $16, %rcx
+	jbe .Lpoly1305_init_ext_avx2_continue
+	lea (%r9,%r9,4), %r11
+	shlq $2, %r11
+	lea (%r10,%r10), %rax
+	mulq %r11
+	movq %rax, %r13
+	movq %r8, %rax
+	movq %rdx, %r14
+	mulq %r8
+	addq %rax, %r13
+	lea (%r8,%r8), %rax
+	movq %r13, %r12
+	adcq %rdx, %r14
+	mulq %r10
+	shlq $20, %r14
+	movq %rax, %r15
+	shrq $44, %r12
+	movq %r11, %rax
+	orq %r12, %r14
+	movq %rdx, %r12
+	mulq %r9
+	addq %rax, %r15
+	movq %r8, %rax
+	adcq %rdx, %r12
+	addq %r15, %r14
+	lea (%r9,%r9), %r15
+	movq %r14, %rbx
+	adcq $0, %r12
+	mulq %r15
+	shlq $20, %r12
+	movq %rdx, %r11
+	shrq $44, %rbx
+	orq %rbx, %r12
+	movq %rax, %rbx
+	movq %r10, %rax
+	mulq
%r10 + addq %rax, %rbx + adcq %rdx, %r11 + addq %rbx, %r12 + movq $0xfffffffffff, %rbx + movq %r12, %r15 + adcq $0, %r11 + andq %rbx, %r13 + shlq $22, %r11 + andq %rbx, %r14 + shrq $42, %r15 + orq %r15, %r11 + lea (%r11,%r11,4), %r11 + addq %r11, %r13 + movq %rbx, %r11 + andq %r13, %r11 + shrq $44, %r13 + movq %r11, %r15 + addq %r13, %r14 + movq $0x3ffffffffff, %r13 + andq %r14, %rbx + andq %r13, %r12 + movq %rbx, %r13 + shrq $26, %r15 + shlq $18, %r13 + orq %r13, %r15 + movq %rbx, %r13 + shrq $44, %r14 + shrq $8, %r13 + addq %r14, %r12 + movl %r11d, %r14d + andl $67108863, %r15d + andl $67108863, %r14d + andl $67108863, %r13d + movl %r14d, 204(%rdi) + movq %rbx, %r14 + movl %r13d, 220(%rdi) + movq %r12, %r13 + shrq $34, %r14 + shlq $10, %r13 + orq %r13, %r14 + movq %r12, %r13 + shrq $16, %r13 + andl $67108863, %r14d + movl %r15d, 212(%rdi) + movl %r14d, 228(%rdi) + movl %r13d, 236(%rdi) + cmpq $32, %rcx + jbe .Lpoly1305_init_ext_avx2_continue + movq %r9, %rax + lea (%rbx,%rbx,4), %r14 + shlq $2, %r14 + mulq %r14 + movq %rdi, -32(%rsp) + lea (%r12,%r12,4), %rdi + shlq $2, %rdi + movq %rax, %r14 + movq %r10, %rax + movq %rdx, %r15 + mulq %rdi + movq %rax, %r13 + movq %r11, %rax + movq %rcx, -16(%rsp) + movq %rdx, %rcx + mulq %r8 + addq %rax, %r13 + movq %rdi, %rax + movq %rsi, -24(%rsp) + adcq %rdx, %rcx + addq %r13, %r14 + adcq %rcx, %r15 + movq %r14, %rcx + mulq %r9 + shlq $20, %r15 + movq %rax, %r13 + shrq $44, %rcx + movq %r11, %rax + orq %rcx, %r15 + movq %rdx, %rcx + mulq %r10 + movq %rax, %rsi + movq %rbx, %rax + movq %rdx, %rdi + mulq %r8 + addq %rax, %rsi + movq %r11, %rax + adcq %rdx, %rdi + addq %rsi, %r13 + adcq %rdi, %rcx + addq %r13, %r15 + movq %r15, %rdi + adcq $0, %rcx + mulq %r9 + shlq $20, %rcx + movq %rdx, %rsi + shrq $44, %rdi + orq %rdi, %rcx + movq %rax, %rdi + movq %rbx, %rax + mulq %r10 + movq %rax, %r9 + movq %r8, %rax + movq %rdx, %r10 + movq $0xfffffffffff, %r8 + mulq %r12 + addq %rax, %r9 + adcq %rdx, %r10 + andq %r8, %r14 + addq %r9, %rdi + adcq %r10, %rsi + andq %r8, %r15 + addq %rdi, %rcx + movq $0x3ffffffffff, %rdi + movq %rcx, %r10 + adcq $0, %rsi + andq %rdi, %rcx + shlq $22, %rsi + shrq $42, %r10 + orq %r10, %rsi + movq -32(%rsp), %rdi + lea (%rsi,%rsi,4), %r9 + movq %r8, %rsi + addq %r9, %r14 + andq %r14, %rsi + shrq $44, %r14 + addq %r14, %r15 + andq %r15, %r8 + shrq $44, %r15 + movq %r8, %r14 + addq %r15, %rcx + movl %esi, %r15d + movq %rcx, %r10 + movq %r8, %r9 + shrq $26, %rsi + andl $67108863, %r15d + shlq $18, %r14 + shrq $34, %r8 + orq %r14, %rsi + shlq $10, %r10 + shrq $8, %r9 + orq %r10, %r8 + shrq $16, %rcx + andl $67108863, %esi + movl %esi, 252(%rdi) + andl $67108863, %r9d + movl %ecx, 276(%rdi) + andl $67108863, %r8d + movl %r15d, 244(%rdi) + movl %r9d, 260(%rdi) + movl %r8d, 268(%rdi) + movq -16(%rsp), %rcx + movq -24(%rsp), %rsi +.Lpoly1305_init_ext_avx2_continue: + movl 16(%rsi), %r8d + movl %r8d, 284(%rdi) + movl 20(%rsi), %r9d + movl %r9d, 292(%rdi) + movl 24(%rsi), %r10d + movl %r10d, 300(%rdi) + movl 28(%rsi), %esi + movl %esi, 308(%rdi) + cmpq $48, %rcx + jbe .Lpoly1305_init_ext_avx2_done + lea (%r12,%r12,4), %r9 + shlq $2, %r9 + lea (%rbx,%rbx), %rax + mulq %r9 + movq %rax, %rsi + movq %r11, %rax + movq %rdx, %r8 + mulq %r11 + addq %rax, %rsi + lea (%r11,%r11), %rax + movq %rsi, %r10 + adcq %rdx, %r8 + mulq %rbx + movq %rax, %r13 + movq %r12, %rax + movq %rdx, %rcx + addq %r12, %r12 + mulq %r9 + addq %rax, %r13 + movq %r11, %rax + movq $0xfffffffffff, %r9 + adcq %rdx, %rcx + andq %r9, %rsi + mulq %r12 + shlq $20, %r8 + movq %rax, 
%r11 + shrq $44, %r10 + movq %rbx, %rax + orq %r10, %r8 + movq %rdx, %r12 + mulq %rbx + addq %r13, %r8 + movq %r8, %r14 + adcq $0, %rcx + andq %r9, %r8 + addq %rax, %r11 + adcq %rdx, %r12 + shlq $20, %rcx + shrq $44, %r14 + orq %r14, %rcx + addq %r11, %rcx + movq %rcx, %rbx + adcq $0, %r12 + shlq $22, %r12 + shrq $42, %rbx + orq %rbx, %r12 + movq %r9, %rbx + lea (%r12,%r12,4), %r15 + addq %r15, %rsi + andq %rsi, %rbx + shrq $44, %rsi + movl %ebx, %r11d + addq %rsi, %r8 + movq $0x3ffffffffff, %rsi + andq %r8, %r9 + andq %rsi, %rcx + shrq $44, %r8 + movq %r9, %rax + addq %r8, %rcx + movq %r9, %r8 + movq %rcx, %r10 + andl $67108863, %r11d + shrq $26, %rbx + shlq $18, %r8 + shrq $34, %r9 + orq %r8, %rbx + shlq $10, %r10 + shrq $8, %rax + orq %r10, %r9 + shrq $16, %rcx + andl $67108863, %ebx + andl $67108863, %eax + andl $67108863, %r9d + movl %r11d, 184(%rdi) + movl %r11d, 176(%rdi) + movl %r11d, 168(%rdi) + movl %r11d, 160(%rdi) + movl %ebx, 216(%rdi) + movl %ebx, 208(%rdi) + movl %ebx, 200(%rdi) + movl %ebx, 192(%rdi) + movl %eax, 248(%rdi) + movl %eax, 240(%rdi) + movl %eax, 232(%rdi) + movl %eax, 224(%rdi) + movl %r9d, 280(%rdi) + movl %r9d, 272(%rdi) + movl %r9d, 264(%rdi) + movl %r9d, 256(%rdi) + movl %ecx, 312(%rdi) + movl %ecx, 304(%rdi) + movl %ecx, 296(%rdi) + movl %ecx, 288(%rdi) +.Lpoly1305_init_ext_avx2_done: + movq $0, 320(%rdi) + vzeroall + popq %rbx + popq %r15 + popq %r14 + popq %r13 + popq %r12 + ret +.size _gcry_poly1305_amd64_avx2_init_ext,.-_gcry_poly1305_amd64_avx2_init_ext; + + +.align 8 +.globl _gcry_poly1305_amd64_avx2_blocks +.type _gcry_poly1305_amd64_avx2_blocks,@function; +_gcry_poly1305_amd64_avx2_blocks: +.Lpoly1305_blocks_avx2_local: + vzeroupper + pushq %rbp + movq %rsp, %rbp + pushq %rbx + andq $-64, %rsp + subq $200, %rsp + movl $((1<<26)-1), %r8d + movl $(5), %r9d + movl $((1<<24)), %r10d + vmovd %r8d, %xmm0 + vmovd %r9d, %xmm8 + vmovd %r10d, %xmm7 + vpbroadcastq %xmm0, %ymm0 + vpbroadcastq %xmm8, %ymm8 + vpbroadcastq %xmm7, %ymm7 + vmovdqa %ymm7, 168(%rsp) + movq 320(%rdi), %rax + testb $60, %al + je .Lpoly1305_blocks_avx2_9 + vmovdqa 168(%rsp), %ymm7 + vpsrldq $8, %ymm7, %ymm1 + vmovdqa %ymm1, 168(%rsp) + testb $4, %al + je .Lpoly1305_blocks_avx2_10 + vpermq $192, %ymm1, %ymm7 + vmovdqa %ymm7, 168(%rsp) +.Lpoly1305_blocks_avx2_10: + testb $8, %al + je .Lpoly1305_blocks_avx2_11 + vpermq $240, 168(%rsp), %ymm7 + vmovdqa %ymm7, 168(%rsp) +.Lpoly1305_blocks_avx2_11: + testb $16, %al + je .Lpoly1305_blocks_avx2_12 + vpermq $252, 168(%rsp), %ymm6 + vmovdqa %ymm6, 168(%rsp) +.Lpoly1305_blocks_avx2_12: + testb $32, %al + je .Lpoly1305_blocks_avx2_9 + vpxor %xmm6, %xmm6, %xmm6 + vmovdqa %ymm6, 168(%rsp) +.Lpoly1305_blocks_avx2_9: + testb $1, %al + jne .Lpoly1305_blocks_avx2_13 + vmovdqu (%rsi), %ymm3 + vmovdqu 32(%rsi), %ymm1 + vpunpcklqdq %ymm1, %ymm3, %ymm2 + vpunpckhqdq %ymm1, %ymm3, %ymm1 + vpermq $216, %ymm2, %ymm2 + vpermq $216, %ymm1, %ymm1 + vpand %ymm2, %ymm0, %ymm5 + vpsrlq $26, %ymm2, %ymm4 + vpand %ymm4, %ymm0, %ymm4 + vpsllq $12, %ymm1, %ymm3 + vpsrlq $52, %ymm2, %ymm2 + vpor %ymm3, %ymm2, %ymm2 + vpand %ymm2, %ymm0, %ymm3 + vpsrlq $26, %ymm2, %ymm2 + vpand %ymm2, %ymm0, %ymm2 + vpsrlq $40, %ymm1, %ymm1 + vpor 168(%rsp), %ymm1, %ymm1 + addq $64, %rsi + subq $64, %rdx + orq $1, 320(%rdi) + jmp .Lpoly1305_blocks_avx2_14 +.Lpoly1305_blocks_avx2_13: + vmovdqa (%rdi), %ymm5 + vmovdqa 32(%rdi), %ymm4 + vmovdqa 64(%rdi), %ymm3 + vmovdqa 96(%rdi), %ymm2 + vmovdqa 128(%rdi), %ymm1 +.Lpoly1305_blocks_avx2_14: + cmpq $63, %rdx + jbe .Lpoly1305_blocks_avx2_15 + 
vmovdqa 160(%rdi), %ymm6 + vmovdqa %ymm8, 136(%rsp) + vmovdqa 192(%rdi), %ymm7 + vpmuludq %ymm8, %ymm7, %ymm11 + vmovdqa %ymm11, 104(%rsp) + vmovdqa 224(%rdi), %ymm11 + vmovdqa %ymm11, 72(%rsp) + vpmuludq %ymm11, %ymm8, %ymm11 + vmovdqa %ymm11, 40(%rsp) + vmovdqa 256(%rdi), %ymm11 + vmovdqa %ymm11, 8(%rsp) + vpmuludq %ymm11, %ymm8, %ymm11 + vmovdqa %ymm11, -24(%rsp) + vmovdqa 288(%rdi), %ymm13 + vmovdqa %ymm13, -56(%rsp) + vpmuludq %ymm13, %ymm8, %ymm13 + vmovdqa %ymm13, -88(%rsp) +.Lpoly1305_blocks_avx2_16: + vpmuludq 104(%rsp), %ymm1, %ymm14 + vmovdqa 40(%rsp), %ymm13 + vpmuludq %ymm13, %ymm2, %ymm8 + vpmuludq %ymm13, %ymm1, %ymm13 + vmovdqa -24(%rsp), %ymm9 + vpmuludq %ymm9, %ymm2, %ymm10 + vpmuludq %ymm9, %ymm1, %ymm11 + vpaddq %ymm8, %ymm14, %ymm14 + vpmuludq %ymm9, %ymm3, %ymm8 + vmovdqa -88(%rsp), %ymm12 + vpmuludq %ymm12, %ymm1, %ymm9 + vpaddq %ymm10, %ymm13, %ymm13 + vpmuludq %ymm12, %ymm4, %ymm15 + vmovdqa %ymm12, %ymm10 + vpmuludq %ymm12, %ymm3, %ymm12 + vpaddq %ymm8, %ymm14, %ymm14 + vpmuludq %ymm10, %ymm2, %ymm10 + vpmuludq %ymm6, %ymm2, %ymm8 + vpaddq %ymm15, %ymm14, %ymm14 + vpmuludq %ymm6, %ymm1, %ymm1 + vpaddq %ymm12, %ymm13, %ymm13 + vpmuludq %ymm6, %ymm5, %ymm15 + vpaddq %ymm10, %ymm11, %ymm11 + vpmuludq %ymm6, %ymm4, %ymm12 + vpaddq %ymm8, %ymm9, %ymm9 + vpmuludq %ymm6, %ymm3, %ymm10 + vpmuludq %ymm7, %ymm3, %ymm8 + vpaddq %ymm15, %ymm14, %ymm14 + vpmuludq %ymm7, %ymm2, %ymm2 + vpaddq %ymm12, %ymm13, %ymm12 + vpmuludq %ymm7, %ymm5, %ymm15 + vpaddq %ymm10, %ymm11, %ymm10 + vpmuludq %ymm7, %ymm4, %ymm13 + vpaddq %ymm8, %ymm9, %ymm8 + vmovdqa 72(%rsp), %ymm9 + vpmuludq %ymm9, %ymm4, %ymm11 + vpaddq %ymm2, %ymm1, %ymm1 + vpmuludq %ymm9, %ymm3, %ymm3 + vpaddq %ymm15, %ymm12, %ymm12 + vpmuludq %ymm9, %ymm5, %ymm15 + vpaddq %ymm13, %ymm10, %ymm10 + vmovdqa 8(%rsp), %ymm2 + vpmuludq %ymm2, %ymm5, %ymm9 + vpaddq %ymm11, %ymm8, %ymm8 + vpmuludq %ymm2, %ymm4, %ymm4 + vpaddq %ymm3, %ymm1, %ymm1 + vpmuludq -56(%rsp), %ymm5, %ymm5 + vpaddq %ymm15, %ymm10, %ymm10 + vpaddq %ymm9, %ymm8, %ymm8 + vpaddq %ymm4, %ymm1, %ymm1 + vpaddq %ymm5, %ymm1, %ymm5 + vmovdqu (%rsi), %ymm3 + vmovdqu 32(%rsi), %ymm2 + vperm2i128 $32, %ymm2, %ymm3, %ymm1 + vperm2i128 $49, %ymm2, %ymm3, %ymm2 + vpunpckldq %ymm2, %ymm1, %ymm15 + vpunpckhdq %ymm2, %ymm1, %ymm2 + vpxor %xmm4, %xmm4, %xmm4 + vpunpckldq %ymm4, %ymm15, %ymm1 + vpunpckhdq %ymm4, %ymm15, %ymm15 + vpunpckldq %ymm4, %ymm2, %ymm3 + vpunpckhdq %ymm4, %ymm2, %ymm2 + vpsllq $6, %ymm15, %ymm15 + vpsllq $12, %ymm3, %ymm3 + vpsllq $18, %ymm2, %ymm2 + vpaddq %ymm1, %ymm14, %ymm14 + vpaddq %ymm15, %ymm12, %ymm12 + vpaddq %ymm3, %ymm10, %ymm10 + vpaddq %ymm2, %ymm8, %ymm8 + vpaddq 168(%rsp), %ymm5, %ymm5 + addq $64, %rsi + vpsrlq $26, %ymm14, %ymm4 + vpsrlq $26, %ymm8, %ymm2 + vpand %ymm0, %ymm14, %ymm14 + vpand %ymm0, %ymm8, %ymm8 + vpaddq %ymm4, %ymm12, %ymm12 + vpaddq %ymm2, %ymm5, %ymm5 + vpsrlq $26, %ymm12, %ymm3 + vpsrlq $26, %ymm5, %ymm9 + vpand %ymm0, %ymm12, %ymm12 + vpand %ymm0, %ymm5, %ymm11 + vpaddq %ymm3, %ymm10, %ymm3 + vpmuludq 136(%rsp), %ymm9, %ymm9 + vpaddq %ymm9, %ymm14, %ymm14 + vpsrlq $26, %ymm3, %ymm2 + vpsrlq $26, %ymm14, %ymm4 + vpand %ymm0, %ymm3, %ymm3 + vpand %ymm0, %ymm14, %ymm5 + vpaddq %ymm2, %ymm8, %ymm2 + vpaddq %ymm4, %ymm12, %ymm4 + vpsrlq $26, %ymm2, %ymm1 + vpand %ymm0, %ymm2, %ymm2 + vpaddq %ymm1, %ymm11, %ymm1 + subq $64, %rdx + cmpq $63, %rdx + ja .Lpoly1305_blocks_avx2_16 +.Lpoly1305_blocks_avx2_15: + testb $64, 320(%rdi) + jne .Lpoly1305_blocks_avx2_17 + vmovdqa %ymm5, (%rdi) + vmovdqa %ymm4, 32(%rdi) + vmovdqa %ymm3, 
64(%rdi) + vmovdqa %ymm2, 96(%rdi) + vmovdqa %ymm1, 128(%rdi) + jmp .Lpoly1305_blocks_avx2_8 +.Lpoly1305_blocks_avx2_17: + vpermq $245, %ymm5, %ymm0 + vpaddq %ymm0, %ymm5, %ymm5 + vpermq $245, %ymm4, %ymm0 + vpaddq %ymm0, %ymm4, %ymm4 + vpermq $245, %ymm3, %ymm0 + vpaddq %ymm0, %ymm3, %ymm3 + vpermq $245, %ymm2, %ymm0 + vpaddq %ymm0, %ymm2, %ymm2 + vpermq $245, %ymm1, %ymm0 + vpaddq %ymm0, %ymm1, %ymm1 + vpermq $170, %ymm5, %ymm0 + vpaddq %ymm0, %ymm5, %ymm5 + vpermq $170, %ymm4, %ymm0 + vpaddq %ymm0, %ymm4, %ymm4 + vpermq $170, %ymm3, %ymm0 + vpaddq %ymm0, %ymm3, %ymm3 + vpermq $170, %ymm2, %ymm0 + vpaddq %ymm0, %ymm2, %ymm2 + vpermq $170, %ymm1, %ymm0 + vpaddq %ymm0, %ymm1, %ymm1 + vmovd %xmm5, %eax + vmovd %xmm4, %edx + movl %eax, %ecx + shrl $26, %ecx + addl %edx, %ecx + movl %ecx, %edx + andl $67108863, %edx + vmovd %xmm3, %esi + shrl $26, %ecx + movl %ecx, %r11d + addl %esi, %r11d + vmovd %xmm2, %ecx + movl %r11d, %r10d + shrl $26, %r10d + addl %ecx, %r10d + movl %r10d, %r9d + andl $67108863, %r9d + vmovd %xmm1, %r8d + movl %edx, %esi + salq $26, %rsi + andl $67108863, %eax + orq %rax, %rsi + movabsq $17592186044415, %rax + andq %rax, %rsi + andl $67108863, %r11d + salq $8, %r11 + shrl $18, %edx + movl %edx, %edx + orq %r11, %rdx + movq %r9, %rcx + salq $34, %rcx + orq %rcx, %rdx + andq %rax, %rdx + shrl $26, %r10d + addl %r10d, %r8d + salq $16, %r8 + shrl $10, %r9d + movl %r9d, %r9d + orq %r9, %r8 + movabsq $4398046511103, %r10 + movq %r8, %r9 + andq %r10, %r9 + shrq $42, %r8 + leaq (%r8,%r8,4), %rcx + addq %rcx, %rsi + movq %rsi, %r8 + andq %rax, %r8 + movq %rsi, %rcx + shrq $44, %rcx + addq %rdx, %rcx + movq %rcx, %rsi + andq %rax, %rsi + shrq $44, %rcx + movq %rcx, %rdx + addq %r9, %rdx + andq %rdx, %r10 + shrq $42, %rdx + leaq (%r8,%rdx,4), %rcx + leaq (%rcx,%rdx), %rdx + movq %rdx, %rbx + andq %rax, %rbx + shrq $44, %rdx + movq %rdx, %r11 + addq %rsi, %r11 + leaq 5(%rbx), %r9 + movq %r9, %r8 + shrq $44, %r8 + addq %r11, %r8 + movabsq $-4398046511104, %rsi + addq %r10, %rsi + movq %r8, %rdx + shrq $44, %rdx + addq %rdx, %rsi + movq %rsi, %rdx + shrq $63, %rdx + subq $1, %rdx + movq %rdx, %rcx + notq %rcx + andq %rcx, %rbx + andq %rcx, %r11 + andq %r10, %rcx + andq %rax, %r9 + andq %rdx, %r9 + orq %r9, %rbx + movq %rbx, (%rdi) + andq %r8, %rax + andq %rdx, %rax + orq %rax, %r11 + movq %r11, 8(%rdi) + andq %rsi, %rdx + orq %rcx, %rdx + movq %rdx, 16(%rdi) +.Lpoly1305_blocks_avx2_8: + movq -8(%rbp), %rbx + vzeroall + movq %rbp, %rax + subq %rsp, %rax + leave + addq $8, %rax + ret +.size _gcry_poly1305_amd64_avx2_blocks,.-_gcry_poly1305_amd64_avx2_blocks; + + +.align 8 +.globl _gcry_poly1305_amd64_avx2_finish_ext +.type _gcry_poly1305_amd64_avx2_finish_ext,@function; +_gcry_poly1305_amd64_avx2_finish_ext: +.Lpoly1305_finish_ext_avx2_local: + vzeroupper + pushq %rbp + movq %rsp, %rbp + pushq %r13 + pushq %r12 + pushq %rbx + andq $-64, %rsp + subq $64, %rsp + movq %rdi, %rbx + movq %rdx, %r13 + movq %rcx, %r12 + testq %rdx, %rdx + je .Lpoly1305_finish_ext_avx2_22 + vpxor %xmm0, %xmm0, %xmm0 + vmovdqa %ymm0, (%rsp) + vmovdqa %ymm0, 32(%rsp) + movq %rsp, %rax + subq %rsp, %rsi + testb $32, %dl + je .Lpoly1305_finish_ext_avx2_23 + vmovdqu (%rsp,%rsi), %ymm0 + vmovdqa %ymm0, (%rsp) + leaq 32(%rsp), %rax +.Lpoly1305_finish_ext_avx2_23: + testb $16, %r13b + je .Lpoly1305_finish_ext_avx2_24 + vmovdqu (%rax,%rsi), %xmm0 + vmovdqa %xmm0, (%rax) + addq $16, %rax +.Lpoly1305_finish_ext_avx2_24: + testb $8, %r13b + je .Lpoly1305_finish_ext_avx2_25 + movq (%rax,%rsi), %rdx + movq %rdx, (%rax) + 
addq $8, %rax +.Lpoly1305_finish_ext_avx2_25: + testb $4, %r13b + je .Lpoly1305_finish_ext_avx2_26 + movl (%rax,%rsi), %edx + movl %edx, (%rax) + addq $4, %rax +.Lpoly1305_finish_ext_avx2_26: + testb $2, %r13b + je .Lpoly1305_finish_ext_avx2_27 + movzwl (%rax,%rsi), %edx + movw %dx, (%rax) + addq $2, %rax +.Lpoly1305_finish_ext_avx2_27: + testb $1, %r13b + je .Lpoly1305_finish_ext_avx2_28 + movzbl (%rax,%rsi), %edx + movb %dl, (%rax) +.Lpoly1305_finish_ext_avx2_28: + testb $15, %r13b + je .Lpoly1305_finish_ext_avx2_29 + movb $1, (%rsp,%r13) +.Lpoly1305_finish_ext_avx2_29: + cmpq $47, %r13 + jbe .Lpoly1305_finish_ext_avx2_30 + orq $4, 320(%rbx) + jmp .Lpoly1305_finish_ext_avx2_31 +.Lpoly1305_finish_ext_avx2_30: + cmpq $31, %r13 + jbe .Lpoly1305_finish_ext_avx2_32 + orq $8, 320(%rbx) + jmp .Lpoly1305_finish_ext_avx2_31 +.Lpoly1305_finish_ext_avx2_32: + cmpq $15, %r13 + jbe .Lpoly1305_finish_ext_avx2_33 + orq $16, 320(%rbx) + jmp .Lpoly1305_finish_ext_avx2_31 +.Lpoly1305_finish_ext_avx2_33: + orq $32, 320(%rbx) +.Lpoly1305_finish_ext_avx2_31: + testb $1, 320(%rbx) + je .Lpoly1305_finish_ext_avx2_34 + cmpq $32, %r13 + ja .Lpoly1305_finish_ext_avx2_34 + cmpq $17, %r13 + sbbq %rsi, %rsi + notq %rsi + addq $2, %rsi + cmpq $17, %r13 + sbbq %rax, %rax + movq %rbx, %rdx + addq $23, %rax + leaq (%rbx,%rax,8), %rax + movl $0, %ecx +.Lpoly1305_finish_ext_avx2_37: + movl 244(%rdx), %edi + movl %edi, (%rax) + movl 252(%rdx), %edi + movl %edi, 32(%rax) + movl 260(%rdx), %edi + movl %edi, 64(%rax) + movl 268(%rdx), %edi + movl %edi, 96(%rax) + movl 276(%rdx), %edi + movl %edi, 128(%rax) + addq $1, %rcx + subq $40, %rdx + addq $8, %rax + cmpq %rcx, %rsi + ja .Lpoly1305_finish_ext_avx2_37 +.Lpoly1305_finish_ext_avx2_34: + movl $64, %edx + movq %rsp, %rsi + movq %rbx, %rdi + call .Lpoly1305_blocks_avx2_local +.Lpoly1305_finish_ext_avx2_22: + movq 320(%rbx), %r8 + testb $1, %r8b + je .Lpoly1305_finish_ext_avx2_38 + leaq -1(%r13), %rax + cmpq $47, %rax + ja .Lpoly1305_finish_ext_avx2_46 + cmpq $32, %r13 + ja .Lpoly1305_finish_ext_avx2_47 + cmpq $17, %r13 + sbbq %r9, %r9 + addq $2, %r9 + movl $0, %edi + cmpq $17, %r13 + sbbq %rax, %rax + notq %rax + andl $5, %eax + jmp .Lpoly1305_finish_ext_avx2_39 +.Lpoly1305_finish_ext_avx2_41: + movl (%rdx), %esi + movl %esi, (%rax) + movl 8(%rdx), %esi + movl %esi, 32(%rax) + movl 16(%rdx), %esi + movl %esi, 64(%rax) + movl 24(%rdx), %esi + movl %esi, 96(%rax) + movl 32(%rdx), %esi + movl %esi, 128(%rax) + addq $1, %rcx + subq $40, %rdx + addq $8, %rax + movq %rcx, %rsi + subq %rdi, %rsi + cmpq %rsi, %r9 + ja .Lpoly1305_finish_ext_avx2_41 + cmpq $3, %rcx + ja .Lpoly1305_finish_ext_avx2_42 + leaq 160(%rbx,%rcx,8), %rax +.Lpoly1305_finish_ext_avx2_43: + movl $1, (%rax) + movl $0, 32(%rax) + movl $0, 64(%rax) + movl $0, 96(%rax) + movl $0, 128(%rax) + addq $1, %rcx + addq $8, %rax + cmpq $4, %rcx + jne .Lpoly1305_finish_ext_avx2_43 +.Lpoly1305_finish_ext_avx2_42: + orq $96, %r8 + movq %r8, 320(%rbx) + vpxor %ymm0, %ymm0, %ymm0 + vmovdqa %ymm0, (%rsp) + vmovdqa %ymm0, 32(%rsp) + movl $64, %edx + movq %rsp, %rsi + movq %rbx, %rdi + call .Lpoly1305_blocks_avx2_local +.Lpoly1305_finish_ext_avx2_38: + movq 8(%rbx), %rax + movq %rax, %rdx + salq $44, %rdx + orq (%rbx), %rdx + shrq $20, %rax + movl $24, %edi + shlx %rdi, 16(%rbx), %rcx + orq %rcx, %rax + movl 292(%rbx), %ecx + salq $32, %rcx + movl 284(%rbx), %esi + orq %rsi, %rcx + movl 308(%rbx), %esi + salq $32, %rsi + movl 300(%rbx), %edi + orq %rdi, %rsi + addq %rcx, %rdx + adcq %rsi, %rax + movq %rdx, (%r12) + movq %rax, 8(%r12) 
+	vpxor %xmm0, %xmm0, %xmm0
+	vmovdqu %ymm0, (%rbx)
+	vmovdqu %ymm0, 32(%rbx)
+	vmovdqu %ymm0, 64(%rbx)
+	vmovdqu %ymm0, 96(%rbx)
+	vmovdqu %ymm0, 128(%rbx)
+	vmovdqu %ymm0, 160(%rbx)
+	vmovdqu %ymm0, 192(%rbx)
+	vmovdqu %ymm0, 224(%rbx)
+	jmp .Lpoly1305_finish_ext_avx2_49
+.Lpoly1305_finish_ext_avx2_46:
+	movl $3, %r9d
+	movl $1, %edi
+	movl $10, %eax
+	jmp .Lpoly1305_finish_ext_avx2_39
+.Lpoly1305_finish_ext_avx2_47:
+	movl $3, %r9d
+	movl $0, %edi
+	movl $10, %eax
+.Lpoly1305_finish_ext_avx2_39:
+	leaq 164(%rbx,%rax,8), %rdx
+	leaq 160(%rbx,%rdi,8), %rax
+	movq %rdi, %rcx
+	jmp .Lpoly1305_finish_ext_avx2_41
+.Lpoly1305_finish_ext_avx2_49:
+	movq %rbp, %rax
+	subq %rsp, %rax
+	leaq -24(%rbp), %rsp
+	vzeroall
+	popq %rbx
+	popq %r12
+	popq %r13
+	popq %rbp
+	addq $(8*5), %rax
+ret
+.size _gcry_poly1305_amd64_avx2_finish_ext,.-_gcry_poly1305_amd64_avx2_finish_ext;
+
+#endif
diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h
index fa3fe75e..0299c430 100644
--- a/cipher/poly1305-internal.h
+++ b/cipher/poly1305-internal.h
@@ -54,23 +54,40 @@
 #endif
 
 
+/* POLY1305_USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
+#undef POLY1305_USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(ENABLE_AVX2_SUPPORT)
+# define POLY1305_USE_AVX2 1
+# define POLY1305_AVX2_BLOCKSIZE 64
+# define POLY1305_AVX2_STATESIZE 328
+# define POLY1305_AVX2_ALIGNMENT 32
+#endif
+
+
 /* Largest block-size used in any implementation (optimized implementations
  * might use block-size multiple of 16). */
-#ifdef POLY1305_USE_SSE2
+#ifdef POLY1305_USE_AVX2
+# define POLY1305_LARGEST_BLOCKSIZE POLY1305_AVX2_BLOCKSIZE
+#elif defined(POLY1305_USE_SSE2)
 # define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE
 #else
 # define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE
 #endif
 
 /* Largest state-size used in any implementation. */
-#ifdef POLY1305_USE_SSE2
+#ifdef POLY1305_USE_AVX2
+# define POLY1305_LARGEST_STATESIZE POLY1305_AVX2_STATESIZE
+#elif defined(POLY1305_USE_SSE2)
 # define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE
 #else
 # define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE
 #endif
 
 /* Minimum alignment for state pointer passed to implementations. */
-#ifdef POLY1305_USE_SSE2
+#ifdef POLY1305_USE_AVX2
+# define POLY1305_STATE_ALIGNMENT POLY1305_AVX2_ALIGNMENT
+#elif defined(POLY1305_USE_SSE2)
 # define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT
 #else
 # define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT
diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index cd1902aa..fe241c1f 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -57,6 +57,25 @@ static const poly1305_ops_t poly1305_amd64_sse2_ops = {
 #endif
 
 
+#ifdef POLY1305_USE_AVX2
+
+void _gcry_poly1305_amd64_avx2_init_ext(void *state, const poly1305_key_t *key);
+unsigned int _gcry_poly1305_amd64_avx2_finish_ext(void *state, const byte *m,
+                                                  size_t remaining,
+                                                  byte mac[16]);
+unsigned int _gcry_poly1305_amd64_avx2_blocks(void *ctx, const byte *m,
+                                              size_t bytes);
+
+static const poly1305_ops_t poly1305_amd64_avx2_ops = {
+  POLY1305_AVX2_BLOCKSIZE,
+  _gcry_poly1305_amd64_avx2_init_ext,
+  _gcry_poly1305_amd64_avx2_blocks,
+  _gcry_poly1305_amd64_avx2_finish_ext
+};
+
+#endif
+
+
 #ifdef HAVE_U64_TYPEDEF
 
 /* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit
@@ -616,6 +635,7 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
   static int initialized;
   static const char *selftest_failed;
   poly1305_key_t keytmp;
+  unsigned int features = _gcry_get_hw_features ();
 
   if (!initialized)
     {
@@ -637,6 +657,12 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
   ctx->ops = &poly1305_default_ops;
 #endif
 
+#ifdef POLY1305_USE_AVX2
+  if (features & HWF_INTEL_AVX2)
+    ctx->ops = &poly1305_amd64_avx2_ops;
+#endif
+  (void)features;
+
   buf_cpy (keytmp.b, key, POLY1305_KEYLEN);
   poly1305_init (ctx, &keytmp);
 
diff --git a/configure.ac b/configure.ac
index 4dc36d5f..47a322be 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1825,6 +1825,7 @@
 case "${host}" in
   x86_64-*-*)
      # Build with the assembly implementation
      GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-sse2-amd64.lo"
+     GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-avx2-amd64.lo"
   ;;
 esac
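The recurring constants in the new assembly are easier to follow with the scalar math in front of you: 67108863 is 0x3ffffff (a 26-bit mask), the lea (%reg,%reg,4) / shlq $2 sequences multiply by 5, and the 1<<24 loaded in the blocks routine is the 2^128 message padding bit as seen from the top 26-bit limb. They all come from evaluating Poly1305 in radix 2^26, five limbs per 130-bit value, so that limb products fit in 64 bits and anything overflowing 2^130 is folded back in by multiplying by 5, because 2^130 is congruent to 5 modulo 2^130 - 5. The C sketch below is not part of the patch; it is a plain scalar version of the per-block update, in the style of the public-domain poly1305-donna reference code, which the AVX2 routine applies to four blocks at once using precomputed powers of r. It assumes a little-endian host, and the function name and layout are illustrative.

#include <stdint.h>
#include <string.h>

/* Scalar sketch of one 16-byte Poly1305 block in radix-2^26 form.
 * h and r are five 26-bit limbs; pass hibit = 1 << 24 for full blocks. */
static void
poly1305_block_26 (uint32_t h[5], const uint32_t r[5],
                   const uint8_t m[16], uint32_t hibit)
{
  uint64_t d0, d1, d2, d3, d4, c;
  uint32_t t0, t1, t2, t3;
  /* 5*r limbs absorb the wrap-around from 2^130 (the lea reg,reg,4 trick). */
  uint32_t s1 = r[1] * 5, s2 = r[2] * 5, s3 = r[3] * 5, s4 = r[4] * 5;

  memcpy (&t0, m + 0, 4); memcpy (&t1, m + 4, 4);
  memcpy (&t2, m + 8, 4); memcpy (&t3, m + 12, 4);

  /* h += m, splitting the 128-bit block into 26-bit limbs plus the pad bit. */
  h[0] += t0 & 0x3ffffff;
  h[1] += (uint32_t)(((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff);
  h[2] += (uint32_t)(((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff);
  h[3] += (uint32_t)(((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff);
  h[4] += (t3 >> 8) | hibit;

  /* h *= r modulo 2^130 - 5; every product fits in 64 bits. */
  d0 = (uint64_t)h[0]*r[0] + (uint64_t)h[1]*s4 + (uint64_t)h[2]*s3 + (uint64_t)h[3]*s2 + (uint64_t)h[4]*s1;
  d1 = (uint64_t)h[0]*r[1] + (uint64_t)h[1]*r[0] + (uint64_t)h[2]*s4 + (uint64_t)h[3]*s3 + (uint64_t)h[4]*s2;
  d2 = (uint64_t)h[0]*r[2] + (uint64_t)h[1]*r[1] + (uint64_t)h[2]*r[0] + (uint64_t)h[3]*s4 + (uint64_t)h[4]*s3;
  d3 = (uint64_t)h[0]*r[3] + (uint64_t)h[1]*r[2] + (uint64_t)h[2]*r[1] + (uint64_t)h[3]*r[0] + (uint64_t)h[4]*s4;
  d4 = (uint64_t)h[0]*r[4] + (uint64_t)h[1]*r[3] + (uint64_t)h[2]*r[2] + (uint64_t)h[3]*r[1] + (uint64_t)h[4]*r[0];

  /* Partial carry propagation keeps every limb close to 26 bits. */
  c = d0 >> 26; h[0] = (uint32_t)(d0 & 0x3ffffff); d1 += c;
  c = d1 >> 26; h[1] = (uint32_t)(d1 & 0x3ffffff); d2 += c;
  c = d2 >> 26; h[2] = (uint32_t)(d2 & 0x3ffffff); d3 += c;
  c = d3 >> 26; h[3] = (uint32_t)(d3 & 0x3ffffff); d4 += c;
  c = d4 >> 26; h[4] = (uint32_t)(d4 & 0x3ffffff);
  h[0] += (uint32_t)c * 5;
  h[1] += h[0] >> 26; h[0] &= 0x3ffffff;
}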
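The C-side hunks stay small because poly1305.c already dispatches through a poly1305_ops_t table of function pointers; the patch just registers one more backend and prefers it when the CPU reports AVX2 support. The sketch below is a self-contained illustration of that pattern and of why poly1305-internal.h raises the largest state size to 328 bytes and the state alignment to 32 bytes: the context has to reserve room and alignment for whichever backend is picked at run time, and 32-byte alignment allows aligned %ymm accesses to the state. All names here (poly_ops_t, get_hw_features, HWF_AVX2_BIT, the ops tables) are placeholders for illustration, not libgcrypt's internal API.

#include <stddef.h>
#include <stdint.h>

/* Illustrative ops table; field order mirrors the poly1305_ops_t
 * initializer in the hunk above. */
typedef struct
{
  size_t block_size;                          /* 64 for the AVX2 backend */
  void (*init_ext) (void *state, const void *key);
  unsigned int (*blocks) (void *state, const uint8_t *m, size_t bytes);
  unsigned int (*finish_ext) (void *state, const uint8_t *m,
                              size_t remaining, uint8_t mac[16]);
} poly_ops_t;

/* Assumed to be provided elsewhere, one table per implementation. */
extern const poly_ops_t ref_ops, sse2_ops, avx2_ops;
extern unsigned int get_hw_features (void);   /* placeholder HW detection */
#define HWF_AVX2_BIT (1u << 9)                /* placeholder feature bit  */

/* Worst-case state size and alignment across all compiled-in backends. */
#define LARGEST_STATESIZE 328
#define STATE_ALIGNMENT   32

typedef struct
{
  const poly_ops_t *ops;
  unsigned char state[LARGEST_STATESIZE + STATE_ALIGNMENT];
} poly_context_t;

static void
poly_context_setup (poly_context_t *ctx, const void *key)
{
  /* Align the state buffer up to the strictest backend requirement. */
  uintptr_t addr = (uintptr_t) ctx->state;
  void *state = (void *) (addr + (-addr & (STATE_ALIGNMENT - 1)));

  ctx->ops = &ref_ops;                  /* portable fallback */
#ifdef __x86_64__
  ctx->ops = &sse2_ops;                 /* amd64 baseline */
  if (get_hw_features () & HWF_AVX2_BIT)
    ctx->ops = &avx2_ops;               /* prefer AVX2 when available */
#endif
  ctx->ops->init_ext (state, key);
}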