/* poly1305-avx2-amd64.S - AMD64/AVX2 implementation of Poly1305
 *
 * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <https://www.gnu.org/licenses/>.
 */

/*
 * Based on public domain implementation by Andrew Moon at
 *  https://github.com/floodyberry/poly1305-opt
 */

#include <config.h>

#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
    defined(ENABLE_AVX2_SUPPORT)

#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
# define ELF(...) __VA_ARGS__
#else
# define ELF(...) /*_*/
#endif

.text

.align 8
.globl _gcry_poly1305_amd64_avx2_init_ext
ELF(.type _gcry_poly1305_amd64_avx2_init_ext,@function;)
_gcry_poly1305_amd64_avx2_init_ext:
.Lpoly1305_init_ext_avx2_local:
        xor %edx, %edx
        vzeroupper
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        movq %rdx, %rcx
        vpxor %ymm0, %ymm0, %ymm0
        movq $-1, %r8
        testq %rcx, %rcx
        vmovdqu %ymm0, (%rdi)
        vmovdqu %ymm0, 32(%rdi)
        vmovdqu %ymm0, 64(%rdi)
        vmovdqu %ymm0, 96(%rdi)
        vmovdqu %ymm0, 128(%rdi)
        movq 8(%rsi), %r9
        cmove %r8, %rcx
        movq $0xffc0fffffff, %r8
        movq %r9, %r13
        movq (%rsi), %r10
        andq %r10, %r8
        shrq $44, %r10
        movq %r8, %r14
        shlq $20, %r13
        orq %r13, %r10
        movq $0xfffffc0ffff, %r13
        shrq $24, %r9
        andq %r13, %r10
        movq $0xffffffc0f, %r13
        andq %r13, %r9
        movl %r8d, %r13d
        andl $67108863, %r13d
        movl %r13d, 164(%rdi)
        movq %r10, %r13
        shrq $26, %r14
        shlq $18, %r13
        orq %r13, %r14
        movq %r10, %r13
        shrq $8, %r13
        andl $67108863, %r14d
        andl $67108863, %r13d
        movl %r14d, 172(%rdi)
        movq %r10, %r14
        movl %r13d, 180(%rdi)
        movq %r9, %r13
        shrq $34, %r14
        shlq $10, %r13
        orq %r13, %r14
        movq %r9, %r13
        shrq $16, %r13
        andl $67108863, %r14d
        movl %r14d, 188(%rdi)
        movl %r13d, 196(%rdi)
        cmpq $16, %rcx
        jbe .Lpoly1305_init_ext_avx2_continue
        lea (%r9,%r9,4), %r11
        shlq $2, %r11
        lea (%r10,%r10), %rax
        mulq %r11
        movq %rax, %r13
        movq %r8, %rax
        movq %rdx, %r14
        mulq %r8
        addq %rax, %r13
        lea (%r8,%r8), %rax
        movq %r13, %r12
        adcq %rdx, %r14
        mulq %r10
        shlq $20, %r14
        movq %rax, %r15
        shrq $44, %r12
        movq %r11, %rax
        orq %r12, %r14
        movq %rdx, %r12
        mulq %r9
        addq %rax, %r15
        movq %r8, %rax
        adcq %rdx, %r12
        addq %r15, %r14
        lea (%r9,%r9), %r15
        movq %r14, %rbx
        adcq $0, %r12
        mulq %r15
        shlq $20, %r12
        movq %rdx, %r11
        shrq $44, %rbx
        orq %rbx, %r12
        movq %rax, %rbx
        movq %r10, %rax
        mulq %r10
        addq %rax, %rbx
        adcq %rdx, %r11
        addq %rbx, %r12
        movq $0xfffffffffff, %rbx
        movq %r12, %r15
        adcq $0, %r11
        andq %rbx, %r13
        shlq $22, %r11
        andq %rbx, %r14
        shrq $42, %r15
        orq %r15, %r11
        lea (%r11,%r11,4), %r11
        addq %r11, %r13
        movq %rbx, %r11
        andq %r13, %r11
        shrq $44, %r13
        movq %r11, %r15
        addq %r13, %r14
        movq $0x3ffffffffff, %r13
        andq %r14, %rbx
        andq %r13, %r12
        movq %rbx, %r13
        shrq $26, %r15
        shlq $18, %r13
        orq %r13, %r15
        movq %rbx, %r13
        shrq $44, %r14
        shrq $8, %r13
        addq %r14, %r12
        movl %r11d, %r14d
        andl $67108863, %r15d
        andl $67108863, %r14d
        andl $67108863, %r13d
        movl %r14d, 204(%rdi)
        movq %rbx, %r14
        movl %r13d, 220(%rdi)
        movq %r12, %r13
        shrq $34, %r14
        shlq $10, %r13
        orq %r13, %r14
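/* At this point r^2 = r*r mod 2^130-5 has been computed in radix-2^44
 * (two 44-bit limbs plus a 42-bit top limb) and is being split into the
 * five 26-bit limbs the AVX2 loop consumes.  A minimal C sketch of this
 * limb-size conversion, with hypothetical names t[] and l[] (not from
 * this file):
 *
 *   #include <stdint.h>
 *
 *   // t[0..2]: value in radix-2^44 (t[2] holds the top 42 bits);
 *   // l[0..4]: the same value re-expressed in radix-2^26.
 *   static void radix44_to_radix26(const uint64_t t[3], uint32_t l[5])
 *   {
 *     l[0] = (uint32_t)(t[0] & 0x3ffffff);
 *     l[1] = (uint32_t)(((t[0] >> 26) | (t[1] << 18)) & 0x3ffffff);
 *     l[2] = (uint32_t)((t[1] >> 8) & 0x3ffffff);
 *     l[3] = (uint32_t)(((t[1] >> 34) | (t[2] << 10)) & 0x3ffffff);
 *     l[4] = (uint32_t)(t[2] >> 16);
 *   }
 */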
        movq %r12, %r13
        shrq $16, %r13
        andl $67108863, %r14d
        movl %r15d, 212(%rdi)
        movl %r14d, 228(%rdi)
        movl %r13d, 236(%rdi)
        cmpq $32, %rcx
        jbe .Lpoly1305_init_ext_avx2_continue
        movq %r9, %rax
        lea (%rbx,%rbx,4), %r14
        shlq $2, %r14
        mulq %r14
        movq %rdi, -32(%rsp)
        lea (%r12,%r12,4), %rdi
        shlq $2, %rdi
        movq %rax, %r14
        movq %r10, %rax
        movq %rdx, %r15
        mulq %rdi
        movq %rax, %r13
        movq %r11, %rax
        movq %rcx, -16(%rsp)
        movq %rdx, %rcx
        mulq %r8
        addq %rax, %r13
        movq %rdi, %rax
        movq %rsi, -24(%rsp)
        adcq %rdx, %rcx
        addq %r13, %r14
        adcq %rcx, %r15
        movq %r14, %rcx
        mulq %r9
        shlq $20, %r15
        movq %rax, %r13
        shrq $44, %rcx
        movq %r11, %rax
        orq %rcx, %r15
        movq %rdx, %rcx
        mulq %r10
        movq %rax, %rsi
        movq %rbx, %rax
        movq %rdx, %rdi
        mulq %r8
        addq %rax, %rsi
        movq %r11, %rax
        adcq %rdx, %rdi
        addq %rsi, %r13
        adcq %rdi, %rcx
        addq %r13, %r15
        movq %r15, %rdi
        adcq $0, %rcx
        mulq %r9
        shlq $20, %rcx
        movq %rdx, %rsi
        shrq $44, %rdi
        orq %rdi, %rcx
        movq %rax, %rdi
        movq %rbx, %rax
        mulq %r10
        movq %rax, %r9
        movq %r8, %rax
        movq %rdx, %r10
        movq $0xfffffffffff, %r8
        mulq %r12
        addq %rax, %r9
        adcq %rdx, %r10
        andq %r8, %r14
        addq %r9, %rdi
        adcq %r10, %rsi
        andq %r8, %r15
        addq %rdi, %rcx
        movq $0x3ffffffffff, %rdi
        movq %rcx, %r10
        adcq $0, %rsi
        andq %rdi, %rcx
        shlq $22, %rsi
        shrq $42, %r10
        orq %r10, %rsi
        movq -32(%rsp), %rdi
        lea (%rsi,%rsi,4), %r9
        movq %r8, %rsi
        addq %r9, %r14
        andq %r14, %rsi
        shrq $44, %r14
        addq %r14, %r15
        andq %r15, %r8
        shrq $44, %r15
        movq %r8, %r14
        addq %r15, %rcx
        movl %esi, %r15d
        movq %rcx, %r10
        movq %r8, %r9
        shrq $26, %rsi
        andl $67108863, %r15d
        shlq $18, %r14
        shrq $34, %r8
        orq %r14, %rsi
        shlq $10, %r10
        shrq $8, %r9
        orq %r10, %r8
        shrq $16, %rcx
        andl $67108863, %esi
        movl %esi, 252(%rdi)
        andl $67108863, %r9d
        movl %ecx, 276(%rdi)
        andl $67108863, %r8d
        movl %r15d, 244(%rdi)
        movl %r9d, 260(%rdi)
        movl %r8d, 268(%rdi)
        movq -16(%rsp), %rcx
        movq -24(%rsp), %rsi
.Lpoly1305_init_ext_avx2_continue:
        movl 16(%rsi), %r8d
        movl %r8d, 284(%rdi)
        movl 20(%rsi), %r9d
        movl %r9d, 292(%rdi)
        movl 24(%rsi), %r10d
        movl %r10d, 300(%rdi)
        movl 28(%rsi), %esi
        movl %esi, 308(%rdi)
        cmpq $48, %rcx
        jbe .Lpoly1305_init_ext_avx2_done
        lea (%r12,%r12,4), %r9
        shlq $2, %r9
        lea (%rbx,%rbx), %rax
        mulq %r9
        movq %rax, %rsi
        movq %r11, %rax
        movq %rdx, %r8
        mulq %r11
        addq %rax, %rsi
        lea (%r11,%r11), %rax
        movq %rsi, %r10
        adcq %rdx, %r8
        mulq %rbx
        movq %rax, %r13
        movq %r12, %rax
        movq %rdx, %rcx
        addq %r12, %r12
        mulq %r9
        addq %rax, %r13
        movq %r11, %rax
        movq $0xfffffffffff, %r9
        adcq %rdx, %rcx
        andq %r9, %rsi
        mulq %r12
        shlq $20, %r8
        movq %rax, %r11
        shrq $44, %r10
        movq %rbx, %rax
        orq %r10, %r8
        movq %rdx, %r12
        mulq %rbx
        addq %r13, %r8
        movq %r8, %r14
        adcq $0, %rcx
        andq %r9, %r8
        addq %rax, %r11
        adcq %rdx, %r12
        shlq $20, %rcx
        shrq $44, %r14
        orq %r14, %rcx
        addq %r11, %rcx
        movq %rcx, %rbx
        adcq $0, %r12
        shlq $22, %r12
        shrq $42, %rbx
        orq %rbx, %r12
        movq %r9, %rbx
        lea (%r12,%r12,4), %r15
        addq %r15, %rsi
        andq %rsi, %rbx
        shrq $44, %rsi
        movl %ebx, %r11d
        addq %rsi, %r8
        movq $0x3ffffffffff, %rsi
        andq %r8, %r9
        andq %rsi, %rcx
        shrq $44, %r8
        movq %r9, %rax
        addq %r8, %rcx
        movq %r9, %r8
        movq %rcx, %r10
        andl $67108863, %r11d
        shrq $26, %rbx
        shlq $18, %r8
        shrq $34, %r9
        orq %r8, %rbx
        shlq $10, %r10
        shrq $8, %rax
        orq %r10, %r9
        shrq $16, %rcx
        andl $67108863, %ebx
        andl $67108863, %eax
        andl $67108863, %r9d
        movl %r11d, 184(%rdi)
        movl %r11d, 176(%rdi)
        movl %r11d, 168(%rdi)
        movl %r11d, 160(%rdi)
        movl %ebx, 216(%rdi)
        movl %ebx, 208(%rdi)
        movl %ebx, 200(%rdi)
        movl %ebx, 192(%rdi)
        movl %eax, 248(%rdi)
        movl %eax, 240(%rdi)
        movl %eax, 232(%rdi)
        movl %eax, 224(%rdi)
        movl %r9d, 280(%rdi)
        movl %r9d, 272(%rdi)
        movl %r9d, 264(%rdi)
        movl %r9d, 256(%rdi)
        movl %ecx, 312(%rdi)
        movl %ecx, 304(%rdi)
        movl %ecx, 296(%rdi)
        movl %ecx, 288(%rdi)
.Lpoly1305_init_ext_avx2_done:
        movq $0, 320(%rdi)
        vzeroall
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        ret
ELF(.size _gcry_poly1305_amd64_avx2_init_ext,.-_gcry_poly1305_amd64_avx2_init_ext;)

.align 8
.globl _gcry_poly1305_amd64_avx2_blocks
ELF(.type _gcry_poly1305_amd64_avx2_blocks,@function;)
_gcry_poly1305_amd64_avx2_blocks:
.Lpoly1305_blocks_avx2_local:
        vzeroupper
        pushq %rbp
        movq %rsp, %rbp
        pushq %rbx
        andq $-64, %rsp
        subq $200, %rsp
        movl $((1<<26)-1), %r8d
        movl $(5), %r9d
        movl $((1<<24)), %r10d
        vmovd %r8d, %xmm0
        vmovd %r9d, %xmm8
        vmovd %r10d, %xmm7
        vpbroadcastq %xmm0, %ymm0
        vpbroadcastq %xmm8, %ymm8
        vpbroadcastq %xmm7, %ymm7
        vmovdqa %ymm7, 168(%rsp)
        movq 320(%rdi), %rax
        testb $60, %al
        je .Lpoly1305_blocks_avx2_9
        vmovdqa 168(%rsp), %ymm7
        vpsrldq $8, %ymm7, %ymm1
        vmovdqa %ymm1, 168(%rsp)
        testb $4, %al
        je .Lpoly1305_blocks_avx2_10
        vpermq $192, %ymm1, %ymm7
        vmovdqa %ymm7, 168(%rsp)
.Lpoly1305_blocks_avx2_10:
        testb $8, %al
        je .Lpoly1305_blocks_avx2_11
        vpermq $240, 168(%rsp), %ymm7
        vmovdqa %ymm7, 168(%rsp)
.Lpoly1305_blocks_avx2_11:
        testb $16, %al
        je .Lpoly1305_blocks_avx2_12
        vpermq $252, 168(%rsp), %ymm6
        vmovdqa %ymm6, 168(%rsp)
.Lpoly1305_blocks_avx2_12:
        testb $32, %al
        je .Lpoly1305_blocks_avx2_9
        vpxor %xmm6, %xmm6, %xmm6
        vmovdqa %ymm6, 168(%rsp)
.Lpoly1305_blocks_avx2_9:
        testb $1, %al
        jne .Lpoly1305_blocks_avx2_13
        vmovdqu (%rsi), %ymm3
        vmovdqu 32(%rsi), %ymm1
        vpunpcklqdq %ymm1, %ymm3, %ymm2
        vpunpckhqdq %ymm1, %ymm3, %ymm1
        vpermq $216, %ymm2, %ymm2
        vpermq $216, %ymm1, %ymm1
        vpand %ymm2, %ymm0, %ymm5
        vpsrlq $26, %ymm2, %ymm4
        vpand %ymm4, %ymm0, %ymm4
        vpsllq $12, %ymm1, %ymm3
        vpsrlq $52, %ymm2, %ymm2
        vpor %ymm3, %ymm2, %ymm2
        vpand %ymm2, %ymm0, %ymm3
        vpsrlq $26, %ymm2, %ymm2
        vpand %ymm2, %ymm0, %ymm2
        vpsrlq $40, %ymm1, %ymm1
        vpor 168(%rsp), %ymm1, %ymm1
        addq $64, %rsi
        subq $64, %rdx
        orq $1, 320(%rdi)
        jmp .Lpoly1305_blocks_avx2_14
.Lpoly1305_blocks_avx2_13:
        vmovdqa (%rdi), %ymm5
        vmovdqa 32(%rdi), %ymm4
        vmovdqa 64(%rdi), %ymm3
        vmovdqa 96(%rdi), %ymm2
        vmovdqa 128(%rdi), %ymm1
.Lpoly1305_blocks_avx2_14:
        cmpq $63, %rdx
        jbe .Lpoly1305_blocks_avx2_15
        vmovdqa 160(%rdi), %ymm6
        vmovdqa %ymm8, 136(%rsp)
        vmovdqa 192(%rdi), %ymm7
        vpmuludq %ymm8, %ymm7, %ymm11
        vmovdqa %ymm11, 104(%rsp)
        vmovdqa 224(%rdi), %ymm11
        vmovdqa %ymm11, 72(%rsp)
        vpmuludq %ymm11, %ymm8, %ymm11
        vmovdqa %ymm11, 40(%rsp)
        vmovdqa 256(%rdi), %ymm11
        vmovdqa %ymm11, 8(%rsp)
        vpmuludq %ymm11, %ymm8, %ymm11
        vmovdqa %ymm11, -24(%rsp)
        vmovdqa 288(%rdi), %ymm13
        vmovdqa %ymm13, -56(%rsp)
        vpmuludq %ymm13, %ymm8, %ymm13
        vmovdqa %ymm13, -88(%rsp)
.Lpoly1305_blocks_avx2_16:
        vpmuludq 104(%rsp), %ymm1, %ymm14
        vmovdqa 40(%rsp), %ymm13
        vpmuludq %ymm13, %ymm2, %ymm8
        vpmuludq %ymm13, %ymm1, %ymm13
        vmovdqa -24(%rsp), %ymm9
        vpmuludq %ymm9, %ymm2, %ymm10
        vpmuludq %ymm9, %ymm1, %ymm11
        vpaddq %ymm8, %ymm14, %ymm14
        vpmuludq %ymm9, %ymm3, %ymm8
        vmovdqa -88(%rsp), %ymm12
        vpmuludq %ymm12, %ymm1, %ymm9
        vpaddq %ymm10, %ymm13, %ymm13
        vpmuludq %ymm12, %ymm4, %ymm15
        vmovdqa %ymm12, %ymm10
        vpmuludq %ymm12, %ymm3, %ymm12
        vpaddq %ymm8, %ymm14, %ymm14
        vpmuludq %ymm10, %ymm2, %ymm10
        vpmuludq %ymm6, %ymm2, %ymm8
        vpaddq %ymm15, %ymm14, %ymm14
        vpmuludq %ymm6, %ymm1, %ymm1
        vpaddq %ymm12, %ymm13, %ymm13
        vpmuludq %ymm6, %ymm5, %ymm15
        vpaddq %ymm10, %ymm11, %ymm11
        vpmuludq %ymm6, %ymm4, %ymm12
        vpaddq %ymm8, %ymm9, %ymm9
        vpmuludq %ymm6, %ymm3, %ymm10
        vpmuludq %ymm7, %ymm3, %ymm8
        vpaddq %ymm15, %ymm14, %ymm14
        vpmuludq %ymm7, %ymm2, %ymm2
        vpaddq %ymm12, %ymm13, %ymm12
        vpmuludq %ymm7, %ymm5, %ymm15
        vpaddq %ymm10, %ymm11, %ymm10
        vpmuludq %ymm7, %ymm4, %ymm13
        vpaddq %ymm8, %ymm9, %ymm8
        vmovdqa 72(%rsp), %ymm9
        vpmuludq %ymm9, %ymm4, %ymm11
        vpaddq %ymm2, %ymm1, %ymm1
        vpmuludq %ymm9, %ymm3, %ymm3
        vpaddq %ymm15, %ymm12, %ymm12
        vpmuludq %ymm9, %ymm5, %ymm15
        vpaddq %ymm13, %ymm10, %ymm10
        vmovdqa 8(%rsp), %ymm2
        vpmuludq %ymm2, %ymm5, %ymm9
        vpaddq %ymm11, %ymm8, %ymm8
        vpmuludq %ymm2, %ymm4, %ymm4
        vpaddq %ymm3, %ymm1, %ymm1
        vpmuludq -56(%rsp), %ymm5, %ymm5
        vpaddq %ymm15, %ymm10, %ymm10
        vpaddq %ymm9, %ymm8, %ymm8
        vpaddq %ymm4, %ymm1, %ymm1
        vpaddq %ymm5, %ymm1, %ymm5
        vmovdqu (%rsi), %ymm3
        vmovdqu 32(%rsi), %ymm2
        vperm2i128 $32, %ymm2, %ymm3, %ymm1
        vperm2i128 $49, %ymm2, %ymm3, %ymm2
        vpunpckldq %ymm2, %ymm1, %ymm15
        vpunpckhdq %ymm2, %ymm1, %ymm2
        vpxor %xmm4, %xmm4, %xmm4
        vpunpckldq %ymm4, %ymm15, %ymm1
        vpunpckhdq %ymm4, %ymm15, %ymm15
        vpunpckldq %ymm4, %ymm2, %ymm3
        vpunpckhdq %ymm4, %ymm2, %ymm2
        vpsllq $6, %ymm15, %ymm15
        vpsllq $12, %ymm3, %ymm3
        vpsllq $18, %ymm2, %ymm2
        vpaddq %ymm1, %ymm14, %ymm14
        vpaddq %ymm15, %ymm12, %ymm12
        vpaddq %ymm3, %ymm10, %ymm10
        vpaddq %ymm2, %ymm8, %ymm8
        vpaddq 168(%rsp), %ymm5, %ymm5
        addq $64, %rsi
        vpsrlq $26, %ymm14, %ymm4
        vpsrlq $26, %ymm8, %ymm2
        vpand %ymm0, %ymm14, %ymm14
        vpand %ymm0, %ymm8, %ymm8
        vpaddq %ymm4, %ymm12, %ymm12
        vpaddq %ymm2, %ymm5, %ymm5
        vpsrlq $26, %ymm12, %ymm3
        vpsrlq $26, %ymm5, %ymm9
        vpand %ymm0, %ymm12, %ymm12
        vpand %ymm0, %ymm5, %ymm11
        vpaddq %ymm3, %ymm10, %ymm3
        vpmuludq 136(%rsp), %ymm9, %ymm9
        vpaddq %ymm9, %ymm14, %ymm14
        vpsrlq $26, %ymm3, %ymm2
        vpsrlq $26, %ymm14, %ymm4
        vpand %ymm0, %ymm3, %ymm3
        vpand %ymm0, %ymm14, %ymm5
        vpaddq %ymm2, %ymm8, %ymm2
        vpaddq %ymm4, %ymm12, %ymm4
        vpsrlq $26, %ymm2, %ymm1
        vpand %ymm0, %ymm2, %ymm2
        vpaddq %ymm1, %ymm11, %ymm1
        subq $64, %rdx
        cmpq $63, %rdx
        ja .Lpoly1305_blocks_avx2_16
.Lpoly1305_blocks_avx2_15:
        testb $64, 320(%rdi)
        jne .Lpoly1305_blocks_avx2_17
        vmovdqa %ymm5, (%rdi)
        vmovdqa %ymm4, 32(%rdi)
        vmovdqa %ymm3, 64(%rdi)
        vmovdqa %ymm2, 96(%rdi)
        vmovdqa %ymm1, 128(%rdi)
        jmp .Lpoly1305_blocks_avx2_8
.Lpoly1305_blocks_avx2_17:
        vpermq $245, %ymm5, %ymm0
        vpaddq %ymm0, %ymm5, %ymm5
        vpermq $245, %ymm4, %ymm0
        vpaddq %ymm0, %ymm4, %ymm4
        vpermq $245, %ymm3, %ymm0
        vpaddq %ymm0, %ymm3, %ymm3
        vpermq $245, %ymm2, %ymm0
        vpaddq %ymm0, %ymm2, %ymm2
        vpermq $245, %ymm1, %ymm0
        vpaddq %ymm0, %ymm1, %ymm1
        vpermq $170, %ymm5, %ymm0
        vpaddq %ymm0, %ymm5, %ymm5
        vpermq $170, %ymm4, %ymm0
        vpaddq %ymm0, %ymm4, %ymm4
        vpermq $170, %ymm3, %ymm0
        vpaddq %ymm0, %ymm3, %ymm3
        vpermq $170, %ymm2, %ymm0
        vpaddq %ymm0, %ymm2, %ymm2
        vpermq $170, %ymm1, %ymm0
        vpaddq %ymm0, %ymm1, %ymm1
        vmovd %xmm5, %eax
        vmovd %xmm4, %edx
        movl %eax, %ecx
        shrl $26, %ecx
        addl %edx, %ecx
        movl %ecx, %edx
        andl $67108863, %edx
        vmovd %xmm3, %esi
        shrl $26, %ecx
        movl %ecx, %r11d
        addl %esi, %r11d
        vmovd %xmm2, %ecx
        movl %r11d, %r10d
        shrl $26, %r10d
        addl %ecx, %r10d
        movl %r10d, %r9d
        andl $67108863, %r9d
        vmovd %xmm1, %r8d
        movl %edx, %esi
        salq $26, %rsi
        andl $67108863, %eax
        orq %rax, %rsi
        movabsq $17592186044415, %rax
        andq %rax, %rsi
        andl $67108863, %r11d
        salq $8, %r11
        shrl $18, %edx
        movl %edx, %edx
        orq %r11, %rdx
        movq %r9, %rcx
        salq $34, %rcx
        orq %rcx, %rdx
        andq %rax, %rdx
        shrl $26, %r10d
        addl %r10d, %r8d
        salq $16, %r8
        shrl $10, %r9d
        movl %r9d, %r9d
        orq %r9, %r8
        movabsq $4398046511103, %r10
        movq %r8, %r9
        andq %r10, %r9
        shrq $42, %r8
        leaq (%r8,%r8,4), %rcx
        addq %rcx, %rsi
        movq %rsi, %r8
        andq %rax, %r8
        movq %rsi, %rcx
        shrq $44, %rcx
        addq %rdx, %rcx
        movq %rcx, %rsi
        andq %rax, %rsi
        shrq $44, %rcx
        movq %rcx, %rdx
        addq %r9, %rdx
        andq %rdx, %r10
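/* The accumulator has been folded across the four lanes and converted
 * back to radix-2^44; the code below folds the top carry (2^130 = 5
 * mod p) and then conditionally subtracts p = 2^130-5 in constant
 * time.  A hedged C equivalent of that final freeze (illustrative
 * only; the assembly realizes the select with a sign-bit mask):
 *
 *   #include <stdint.h>
 *
 *   // h[0..2]: partially reduced value in radix-2^44 (h[2] < ~2^42).
 *   static void poly1305_freeze(uint64_t h[3])
 *   {
 *     // g = h + 5; if g >= 2^130, the result is g - 2^130 = h - p.
 *     uint64_t g0 = h[0] + 5;
 *     uint64_t g1 = h[1] + (g0 >> 44);  g0 &= 0xfffffffffff;
 *     uint64_t g2 = h[2] + (g1 >> 44);  g1 &= 0xfffffffffff;
 *     uint64_t mask = 0 - (g2 >> 42);   // all-ones iff h >= p
 *     g2 &= 0x3ffffffffff;
 *     h[0] = (h[0] & ~mask) | (g0 & mask);
 *     h[1] = (h[1] & ~mask) | (g1 & mask);
 *     h[2] = (h[2] & ~mask) | (g2 & mask);
 *   }
 */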
        shrq $42, %rdx
        leaq (%r8,%rdx,4), %rcx
        leaq (%rcx,%rdx), %rdx
        movq %rdx, %rbx
        andq %rax, %rbx
        shrq $44, %rdx
        movq %rdx, %r11
        addq %rsi, %r11
        leaq 5(%rbx), %r9
        movq %r9, %r8
        shrq $44, %r8
        addq %r11, %r8
        movabsq $-4398046511104, %rsi
        addq %r10, %rsi
        movq %r8, %rdx
        shrq $44, %rdx
        addq %rdx, %rsi
        movq %rsi, %rdx
        shrq $63, %rdx
        subq $1, %rdx
        movq %rdx, %rcx
        notq %rcx
        andq %rcx, %rbx
        andq %rcx, %r11
        andq %r10, %rcx
        andq %rax, %r9
        andq %rdx, %r9
        orq %r9, %rbx
        movq %rbx, (%rdi)
        andq %r8, %rax
        andq %rdx, %rax
        orq %rax, %r11
        movq %r11, 8(%rdi)
        andq %rsi, %rdx
        orq %rcx, %rdx
        movq %rdx, 16(%rdi)
.Lpoly1305_blocks_avx2_8:
        movq -8(%rbp), %rbx
        vzeroall
        movq %rbp, %rax
        subq %rsp, %rax
        leave
        addq $8, %rax
        ret
ELF(.size _gcry_poly1305_amd64_avx2_blocks,.-_gcry_poly1305_amd64_avx2_blocks;)

.align 8
.globl _gcry_poly1305_amd64_avx2_finish_ext
ELF(.type _gcry_poly1305_amd64_avx2_finish_ext,@function;)
_gcry_poly1305_amd64_avx2_finish_ext:
.Lpoly1305_finish_ext_avx2_local:
        vzeroupper
        pushq %rbp
        movq %rsp, %rbp
        pushq %r13
        pushq %r12
        pushq %rbx
        andq $-64, %rsp
        subq $64, %rsp
        movq %rdi, %rbx
        movq %rdx, %r13
        movq %rcx, %r12
        testq %rdx, %rdx
        je .Lpoly1305_finish_ext_avx2_22
        vpxor %xmm0, %xmm0, %xmm0
        vmovdqa %ymm0, (%rsp)
        vmovdqa %ymm0, 32(%rsp)
        movq %rsp, %rax
        subq %rsp, %rsi
        testb $32, %dl
        je .Lpoly1305_finish_ext_avx2_23
        vmovdqu (%rsp,%rsi), %ymm0
        vmovdqa %ymm0, (%rsp)
        leaq 32(%rsp), %rax
.Lpoly1305_finish_ext_avx2_23:
        testb $16, %r13b
        je .Lpoly1305_finish_ext_avx2_24
        vmovdqu (%rax,%rsi), %xmm0
        vmovdqa %xmm0, (%rax)
        addq $16, %rax
.Lpoly1305_finish_ext_avx2_24:
        testb $8, %r13b
        je .Lpoly1305_finish_ext_avx2_25
        movq (%rax,%rsi), %rdx
        movq %rdx, (%rax)
        addq $8, %rax
.Lpoly1305_finish_ext_avx2_25:
        testb $4, %r13b
        je .Lpoly1305_finish_ext_avx2_26
        movl (%rax,%rsi), %edx
        movl %edx, (%rax)
        addq $4, %rax
.Lpoly1305_finish_ext_avx2_26:
        testb $2, %r13b
        je .Lpoly1305_finish_ext_avx2_27
        movzwl (%rax,%rsi), %edx
        movw %dx, (%rax)
        addq $2, %rax
.Lpoly1305_finish_ext_avx2_27:
        testb $1, %r13b
        je .Lpoly1305_finish_ext_avx2_28
        movzbl (%rax,%rsi), %edx
        movb %dl, (%rax)
.Lpoly1305_finish_ext_avx2_28:
        testb $15, %r13b
        je .Lpoly1305_finish_ext_avx2_29
        movb $1, (%rsp,%r13)
.Lpoly1305_finish_ext_avx2_29:
        cmpq $47, %r13
        jbe .Lpoly1305_finish_ext_avx2_30
        orq $4, 320(%rbx)
        jmp .Lpoly1305_finish_ext_avx2_31
.Lpoly1305_finish_ext_avx2_30:
        cmpq $31, %r13
        jbe .Lpoly1305_finish_ext_avx2_32
        orq $8, 320(%rbx)
        jmp .Lpoly1305_finish_ext_avx2_31
.Lpoly1305_finish_ext_avx2_32:
        cmpq $15, %r13
        jbe .Lpoly1305_finish_ext_avx2_33
        orq $16, 320(%rbx)
        jmp .Lpoly1305_finish_ext_avx2_31
.Lpoly1305_finish_ext_avx2_33:
        orq $32, 320(%rbx)
.Lpoly1305_finish_ext_avx2_31:
        testb $1, 320(%rbx)
        je .Lpoly1305_finish_ext_avx2_34
        cmpq $32, %r13
        ja .Lpoly1305_finish_ext_avx2_34
        cmpq $17, %r13
        sbbq %rsi, %rsi
        notq %rsi
        addq $2, %rsi
        cmpq $17, %r13
        sbbq %rax, %rax
        movq %rbx, %rdx
        addq $23, %rax
        leaq (%rbx,%rax,8), %rax
        movl $0, %ecx
.Lpoly1305_finish_ext_avx2_37:
        movl 244(%rdx), %edi
        movl %edi, (%rax)
        movl 252(%rdx), %edi
        movl %edi, 32(%rax)
        movl 260(%rdx), %edi
        movl %edi, 64(%rax)
        movl 268(%rdx), %edi
        movl %edi, 96(%rax)
        movl 276(%rdx), %edi
        movl %edi, 128(%rax)
        addq $1, %rcx
        subq $40, %rdx
        addq $8, %rax
        cmpq %rcx, %rsi
        ja .Lpoly1305_finish_ext_avx2_37
.Lpoly1305_finish_ext_avx2_34:
        movl $64, %edx
        movq %rsp, %rsi
        movq %rbx, %rdi
        call .Lpoly1305_blocks_avx2_local
.Lpoly1305_finish_ext_avx2_22:
        movq 320(%rbx), %r8
        testb $1, %r8b
        je .Lpoly1305_finish_ext_avx2_38
        leaq -1(%r13), %rax
        cmpq $47, %rax
        ja .Lpoly1305_finish_ext_avx2_46
        cmpq $32, %r13
        ja .Lpoly1305_finish_ext_avx2_47
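/* Tail multiplier selection: the cmpq/sbbq pairs below are branchless
 * selects.  After "cmpq $17, %r13; sbbq %r9, %r9; addq $2, %r9" we get
 * %r9 = (len < 17) ? 1 : 2, the number of power-of-r vectors the tail
 * loop must rewrite; the second pair yields 0 or 5, which
 * .Lpoly1305_finish_ext_avx2_39 scales by 8 to pick the source limbs
 * (r^1 at offset 164, or r^2 at 204).  Unused lanes are padded with
 * the multiplicative identity 1 at .Lpoly1305_finish_ext_avx2_43.
 */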
        cmpq $17, %r13
        sbbq %r9, %r9
        addq $2, %r9
        movl $0, %edi
        cmpq $17, %r13
        sbbq %rax, %rax
        notq %rax
        andl $5, %eax
        jmp .Lpoly1305_finish_ext_avx2_39
.Lpoly1305_finish_ext_avx2_41:
        movl (%rdx), %esi
        movl %esi, (%rax)
        movl 8(%rdx), %esi
        movl %esi, 32(%rax)
        movl 16(%rdx), %esi
        movl %esi, 64(%rax)
        movl 24(%rdx), %esi
        movl %esi, 96(%rax)
        movl 32(%rdx), %esi
        movl %esi, 128(%rax)
        addq $1, %rcx
        subq $40, %rdx
        addq $8, %rax
        movq %rcx, %rsi
        subq %rdi, %rsi
        cmpq %rsi, %r9
        ja .Lpoly1305_finish_ext_avx2_41
        cmpq $3, %rcx
        ja .Lpoly1305_finish_ext_avx2_42
        leaq 160(%rbx,%rcx,8), %rax
.Lpoly1305_finish_ext_avx2_43:
        movl $1, (%rax)
        movl $0, 32(%rax)
        movl $0, 64(%rax)
        movl $0, 96(%rax)
        movl $0, 128(%rax)
        addq $1, %rcx
        addq $8, %rax
        cmpq $4, %rcx
        jne .Lpoly1305_finish_ext_avx2_43
.Lpoly1305_finish_ext_avx2_42:
        orq $96, %r8
        movq %r8, 320(%rbx)
        vpxor %ymm0, %ymm0, %ymm0
        vmovdqa %ymm0, (%rsp)
        vmovdqa %ymm0, 32(%rsp)
        movl $64, %edx
        movq %rsp, %rsi
        movq %rbx, %rdi
        call .Lpoly1305_blocks_avx2_local
.Lpoly1305_finish_ext_avx2_38:
        movq 8(%rbx), %rax
        movq %rax, %rdx
        salq $44, %rdx
        orq (%rbx), %rdx
        shrq $20, %rax
        movl $24, %edi
        shlx %rdi, 16(%rbx), %rcx
        orq %rcx, %rax
        movl 292(%rbx), %ecx
        salq $32, %rcx
        movl 284(%rbx), %esi
        orq %rsi, %rcx
        movl 308(%rbx), %esi
        salq $32, %rsi
        movl 300(%rbx), %edi
        orq %rdi, %rsi
        addq %rcx, %rdx
        adcq %rsi, %rax
        movq %rdx, (%r12)
        movq %rax, 8(%r12)
        vpxor %xmm0, %xmm0, %xmm0
        vmovdqu %ymm0, (%rbx)
        vmovdqu %ymm0, 32(%rbx)
        vmovdqu %ymm0, 64(%rbx)
        vmovdqu %ymm0, 96(%rbx)
        vmovdqu %ymm0, 128(%rbx)
        vmovdqu %ymm0, 160(%rbx)
        vmovdqu %ymm0, 192(%rbx)
        vmovdqu %ymm0, 224(%rbx)
        jmp .Lpoly1305_finish_ext_avx2_49
.Lpoly1305_finish_ext_avx2_46:
        movl $3, %r9d
        movl $1, %edi
        movl $10, %eax
        jmp .Lpoly1305_finish_ext_avx2_39
.Lpoly1305_finish_ext_avx2_47:
        movl $3, %r9d
        movl $0, %edi
        movl $10, %eax
.Lpoly1305_finish_ext_avx2_39:
        leaq 164(%rbx,%rax,8), %rdx
        leaq 160(%rbx,%rdi,8), %rax
        movq %rdi, %rcx
        jmp .Lpoly1305_finish_ext_avx2_41
.Lpoly1305_finish_ext_avx2_49:
        movq %rbp, %rax
        subq %rsp, %rax
        leaq -24(%rbp), %rsp
        vzeroall
        popq %rbx
        popq %r12
        popq %r13
        popq %rbp
        addq $(8*5), %rax
        ret
ELF(.size _gcry_poly1305_amd64_avx2_finish_ext,.-_gcry_poly1305_amd64_avx2_finish_ext;)

#endif
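/*
 * Reference note (not part of the assembly): the three entry points
 * follow the poly1305-opt "ext" contract -- init_ext precomputes the
 * powers of r selected by the length hint, blocks consumes 64-byte
 * chunks (four Poly1305 blocks per iteration against r^4..r^1), and
 * finish_ext pads the trailing data, runs it through the block
 * function, and emits the 16-byte tag.  As a self-contained, hedged
 * cross-check of the arithmetic, here is an illustrative one-shot
 * Poly1305 in the same radix-2^44 the scalar paths use; it is NOT the
 * libgcrypt API and assumes a little-endian host:
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static void poly1305_ref(uint8_t mac[16], const uint8_t *m,
 *                            size_t len, const uint8_t key[32])
 *   {
 *     uint64_t r0, r1, r2, h0 = 0, h1 = 0, h2 = 0, s1, s2, t0, t1, c;
 *     unsigned __int128 d0, d1, d2, acc;
 *
 *     memcpy(&t0, key + 0, 8);
 *     memcpy(&t1, key + 8, 8);
 *     r0 = t0 & 0xffc0fffffff;                        // clamp r (same
 *     r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; //  masks as the
 *     r2 = (t1 >> 24) & 0xffffffc0f;                  //  init code)
 *     s1 = r1 * 20;                                   // 2^132 = 20 (mod p)
 *     s2 = r2 * 20;
 *
 *     while (len > 0) {
 *       uint8_t blk[16] = { 0 };
 *       size_t n = len < 16 ? len : 16;
 *       uint64_t m0, m1, hibit = (n == 16) ? (uint64_t)1 << 40 : 0;
 *       memcpy(blk, m, n);
 *       if (n < 16)
 *         blk[n] = 1;                                 // pad partial block
 *       memcpy(&m0, blk + 0, 8);
 *       memcpy(&m1, blk + 8, 8);
 *       h0 += m0 & 0xfffffffffff;
 *       h1 += ((m0 >> 44) | (m1 << 20)) & 0xfffffffffff;
 *       h2 += ((m1 >> 24) & 0x3ffffffffff) | hibit;   // 2^128 pad bit
 *
 *       d0 = (unsigned __int128)h0 * r0 + (unsigned __int128)h1 * s2
 *            + (unsigned __int128)h2 * s1;
 *       d1 = (unsigned __int128)h0 * r1 + (unsigned __int128)h1 * r0
 *            + (unsigned __int128)h2 * s2;
 *       d2 = (unsigned __int128)h0 * r2 + (unsigned __int128)h1 * r1
 *            + (unsigned __int128)h2 * r0;
 *       c = (uint64_t)(d0 >> 44); h0 = (uint64_t)d0 & 0xfffffffffff;
 *       d1 += c; c = (uint64_t)(d1 >> 44); h1 = (uint64_t)d1 & 0xfffffffffff;
 *       d2 += c; c = (uint64_t)(d2 >> 42); h2 = (uint64_t)d2 & 0x3ffffffffff;
 *       h0 += c * 5; h1 += h0 >> 44; h0 &= 0xfffffffffff;
 *       m += n; len -= n;
 *     }
 *
 *     {  // freeze: conditionally subtract p = 2^130 - 5
 *       uint64_t g0 = h0 + 5;
 *       uint64_t g1 = h1 + (g0 >> 44);  g0 &= 0xfffffffffff;
 *       uint64_t g2 = h2 + (g1 >> 44);  g1 &= 0xfffffffffff;
 *       uint64_t mask = 0 - (g2 >> 42);
 *       g2 &= 0x3ffffffffff;
 *       h0 = (h0 & ~mask) | (g0 & mask);
 *       h1 = (h1 & ~mask) | (g1 & mask);
 *       h2 = (h2 & ~mask) | (g2 & mask);
 *     }
 *
 *     memcpy(&t0, key + 16, 8);                       // s
 *     memcpy(&t1, key + 24, 8);
 *     acc = ((unsigned __int128)h2 << 88) | ((unsigned __int128)h1 << 44) | h0;
 *     acc += ((unsigned __int128)t1 << 64) | t0;      // tag = (h + s) mod 2^128
 *     t0 = (uint64_t)acc;
 *     t1 = (uint64_t)(acc >> 64);
 *     memcpy(mac + 0, &t0, 8);
 *     memcpy(mac + 8, &t1, 8);
 *   }
 */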