summaryrefslogtreecommitdiff
path: root/cipher/whirlpool-sse2-amd64.S
diff options
context:
space:
mode:
Diffstat (limited to 'cipher/whirlpool-sse2-amd64.S')
-rw-r--r--cipher/whirlpool-sse2-amd64.S335
1 files changed, 335 insertions, 0 deletions
diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S
new file mode 100644
index 00000000..d0bcf2d9
--- /dev/null
+++ b/cipher/whirlpool-sse2-amd64.S
@@ -0,0 +1,335 @@
+/* whirlpool-sse2-amd64.S - AMD64 assembly implementation of Whirlpool
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_WHIRLPOOL)
+
+#ifdef __PIC__ /* RIP expands to %rip in PIC builds and to nothing otherwise */
+# define RIP %rip
+#else
+# define RIP /* NOTE(review): no use of RIP is visible in this file */
+#endif
+
+.text
+
+/* look-up table offsets on RTAB (byte offsets; entries are 8 bytes and are addressed as OFFSET(RTAB,index,8)) */
+#define RC (0) /* 10 entries, indexed by the round counter */
+#define C0 (RC + (8 * 10)) /* C0..C7: eight 256-entry tables laid out back to back after RC */
+#define C1 (C0 + (8 * 256))
+#define C2 (C1 + (8 * 256))
+#define C3 (C2 + (8 * 256))
+#define C4 (C3 + (8 * 256))
+#define C5 (C4 + (8 * 256))
+#define C6 (C5 + (8 * 256))
+#define C7 (C6 + (8 * 256)) /* indexed by the lowest byte of the input word in __do_whirl */
+
+/* stack variables (byte offsets from %rsp once the function has subtracted STACK_MAX) */
+#define STACK_DATAP (0) /* input pointer; advanced by 64 for each block */
+#define STACK_STATEP (STACK_DATAP + 8) /* copy of the state pointer argument */
+#define STACK_ROUNDS (STACK_STATEP + 8) /* round counter; only accessed as a 32-bit value */
+#define STACK_NBLKS (STACK_ROUNDS + 8) /* number of blocks still to process */
+#define STACK_RBP (STACK_NBLKS + 8) /* STACK_RBP..STACK_R15: save slots for the registers restored on exit */
+#define STACK_RBX (STACK_RBP + 8)
+#define STACK_R12 (STACK_RBX + 8)
+#define STACK_R13 (STACK_R12 + 8)
+#define STACK_R14 (STACK_R13 + 8)
+#define STACK_R15 (STACK_R14 + 8)
+#define STACK_MAX (STACK_R15 + 8) /* total frame size: ten 8-byte slots */
+
+/* register macros: symbolic names for every register used by the transform */
+#define RTAB %rbp /* base address of the look-up tables */
+
+#define RI1 %rax /* RI1-RI4: input words whose bytes index the tables; __do_whirl shifts them away */
+#define RI2 %rbx
+#define RI3 %rcx
+#define RI4 %rdx
+
+#define RI1d %eax /* 32-bit views, formed by pasting "d" onto the RIn name */
+#define RI2d %ebx
+#define RI3d %ecx
+#define RI4d %edx
+
+#define RI1bl %al /* low-byte views, formed by pasting "bl" onto the RIn name */
+#define RI2bl %bl
+#define RI3bl %cl
+#define RI4bl %dl
+
+#define RI1bh %ah /* second-byte views ("bh"); these exist only for the a/b/c/d registers */
+#define RI2bh %bh
+#define RI3bh %ch
+#define RI4bh %dh
+
+#define RB0 %r8 /* RB0-RB7: 64-bit words that the table entries are moved or xor-ed into */
+#define RB1 %r9
+#define RB2 %r10
+#define RB3 %r11
+#define RB4 %r12
+#define RB5 %r13
+#define RB6 %r14
+#define RB7 %r15
+
+#define RT0 %rsi /* RT0/RT1: scratch table indices; they overwrite the inblk/state argument registers */
+#define RT1 %rdi
+
+#define RT0d %esi /* 32-bit views used as movzbl/movl destinations */
+#define RT1d %edi
+
+#define XKEY0 %xmm0 /* XKEY0-XKEY7: the eight key words */
+#define XKEY1 %xmm1
+#define XKEY2 %xmm2
+#define XKEY3 %xmm3
+#define XKEY4 %xmm4
+#define XKEY5 %xmm5
+#define XKEY6 %xmm6
+#define XKEY7 %xmm7
+
+#define XSTATE0 %xmm8 /* XSTATE0-XSTATE7: the eight state words */
+#define XSTATE1 %xmm9
+#define XSTATE2 %xmm10
+#define XSTATE3 %xmm11
+#define XSTATE4 %xmm12
+#define XSTATE5 %xmm13
+#define XSTATE6 %xmm14
+#define XSTATE7 %xmm15
+
+/***********************************************************************
+ * AMD64 assembly implementation of Whirlpool. do_whirl(op, ri, rb_add, load_ri, load_arg) forwards to __do_whirl with rb_add expanded into the eight b0..b7 arguments; the "##" in front of ri appears intended to pass the macro name (e.g. RI1) through unexpanded so that __do_whirl can paste bl/bh/d onto it -- confirm against the preprocessor documentation before touching it.
+ * - Using table-lookups: __do_whirl takes the bytes of ri from the lowest to the highest and applies "op"q with the C7, C6, ..., C0 entry selected by each successive byte to b7, b6, ..., b0. ri is shifted away in the process and is refilled by load_ri(load_arg, ri) once its last two bytes have been copied to RT0/RT1, which are clobbered as scratch.
+ * - Store state in XMM registers: key and state words wait in XKEY0-7 / XSTATE0-7 and are moved into RI1-RI4 by do_movq just before they are needed.
+ ***********************************************************************/
+#define __do_whirl(op, ri, \
+ b0, b1, b2, b3, b4, b5, b6, b7, \
+ load_ri, load_arg) \
+ movzbl ri ## bl, RT0d; \
+ movzbl ri ## bh, RT1d; \
+ shrq $16, ri; \
+ op ## q C7(RTAB,RT0,8), b7; \
+ op ## q C6(RTAB,RT1,8), b6; \
+ movzbl ri ## bl, RT0d; \
+ movzbl ri ## bh, RT1d; \
+ shrq $16, ri; \
+ op ## q C5(RTAB,RT0,8), b5; \
+ op ## q C4(RTAB,RT1,8), b4; \
+ movzbl ri ## bl, RT0d; \
+ movzbl ri ## bh, RT1d; \
+ shrl $16, ri ## d; \
+ op ## q C3(RTAB,RT0,8), b3; \
+ op ## q C2(RTAB,RT1,8), b2; \
+ movzbl ri ## bl, RT0d; \
+ movzbl ri ## bh, RT1d; \
+ load_ri( load_arg, ri); \
+ op ## q C1(RTAB,RT0,8), b1; \
+ op ## q C0(RTAB,RT1,8), b0;
+
+#define do_whirl(op, ri, rb_add, load_ri, load_arg) \
+ __do_whirl(op, ##ri, rb_add, load_ri, load_arg)
+
+#define dummy(...) /* load_ri stand-in that expands to nothing; used when ri needs no refill */
+
+#define do_movq(src, dst) movq src, dst; /* load_ri helper: refills ri (dst) from load_arg (src) */
+
+#define RB_ADD0 RB0, RB1, RB2, RB3, RB4, RB5, RB6, RB7 /* RB_ADDn: the RB list rotated left by n, so b0 of __do_whirl is RBn */
+#define RB_ADD1 RB1, RB2, RB3, RB4, RB5, RB6, RB7, RB0
+#define RB_ADD2 RB2, RB3, RB4, RB5, RB6, RB7, RB0, RB1
+#define RB_ADD3 RB3, RB4, RB5, RB6, RB7, RB0, RB1, RB2
+#define RB_ADD4 RB4, RB5, RB6, RB7, RB0, RB1, RB2, RB3
+#define RB_ADD5 RB5, RB6, RB7, RB0, RB1, RB2, RB3, RB4
+#define RB_ADD6 RB6, RB7, RB0, RB1, RB2, RB3, RB4, RB5
+#define RB_ADD7 RB7, RB0, RB1, RB2, RB3, RB4, RB5, RB6
+
+.align 8
+.globl _gcry_whirlpool_transform_amd64
+.type _gcry_whirlpool_transform_amd64,@function;
+
+_gcry_whirlpool_transform_amd64:
+ /* Runs the Whirlpool block transform over nblks 64-byte blocks, updating state in place. input:
+ * %rdi: state (eight 64-bit words; read as the initial key and rewritten once per block)
+ * %rsi: inblk (nblks * 64 bytes; every 64-bit word is byte-swapped as it is loaded)
+ * %rdx: nblks (zero jumps straight to .Lskip without touching the stack)
+ * %rcx: look-up tables (base of the RC / C0..C7 layout; copied into RTAB)
+ * returns %eax = STACK_MAX + 8 on every path -- presumably the stack depth the caller should wipe; confirm against the C caller */
+ testq %rdx, %rdx; /* nblks == 0?  ZF is set exactly as "cmp $0, %rdx" would set it */
+ je .Lskip;
+
+ subq $STACK_MAX, %rsp; /* frame = 4 local slots + 6 register save slots */
+ movq %rbp, STACK_RBP(%rsp); /* saved because %rbp is reused as RTAB */
+ movq %rbx, STACK_RBX(%rsp); /* saved because %rbx is reused as RI2 */
+ movq %r12, STACK_R12(%rsp); /* saved because %r12-%r15 are reused as RB4-RB7 */
+ movq %r13, STACK_R13(%rsp);
+ movq %r14, STACK_R14(%rsp);
+ movq %r15, STACK_R15(%rsp);
+
+ movq %rdx, STACK_NBLKS(%rsp); /* %rdx is about to become RI4 */
+ movq %rdi, STACK_STATEP(%rsp); /* %rdi is about to become scratch RT1 */
+ movq %rsi, STACK_DATAP(%rsp); /* %rsi is about to become scratch RT0 */
+
+ movq %rcx, RTAB; /* %rcx is about to become RI3, so the table base moves to %rbp */
+
+ jmp .Lfirst_block; /* %rdi/%rsi still hold state/inblk for the first block */
+
+.align 8
+.Lblock_loop:
+ movq STACK_DATAP(%rsp), %rsi; /* input pointer, already advanced past the previous block */
+ movq RI1, %rdi; /* RI1 holds the state pointer that was reloaded in .Lis_last_round */
+
+.Lfirst_block:
+ /* load data_block: the byte-swapped words go to RB0-RB7 and are copied into XSTATE0-7 */
+ movq 0*8(%rsi), RB0;
+ movq 1*8(%rsi), RB1;
+ bswapq RB0;
+ movq 2*8(%rsi), RB2;
+ bswapq RB1;
+ movq 3*8(%rsi), RB3;
+ bswapq RB2;
+ movq 4*8(%rsi), RB4;
+ bswapq RB3;
+ movq 5*8(%rsi), RB5;
+ bswapq RB4;
+ movq RB0, XSTATE0;
+ movq 6*8(%rsi), RB6;
+ bswapq RB5;
+ movq RB1, XSTATE1;
+ movq 7*8(%rsi), RB7;
+ bswapq RB6;
+ movq RB2, XSTATE2;
+ bswapq RB7;
+ movq RB3, XSTATE3;
+ movq RB4, XSTATE4;
+ movq RB5, XSTATE5;
+ movq RB6, XSTATE6;
+ movq RB7, XSTATE7;
+
+ /* load key: the current state words are used as the initial key */
+ movq 0*8(%rdi), XKEY0;
+ movq 1*8(%rdi), XKEY1;
+ movq 2*8(%rdi), XKEY2;
+ movq 3*8(%rdi), XKEY3;
+ movq 4*8(%rdi), XKEY4;
+ movq 5*8(%rdi), XKEY5;
+ movq 6*8(%rdi), XKEY6;
+ movq 7*8(%rdi), XKEY7;
+
+ movq XKEY0, RI1; /* RI1-RI4 start with key words 0-3 for the first do_whirl group */
+ movq XKEY1, RI2;
+ movq XKEY2, RI3;
+ movq XKEY3, RI4;
+
+ /* prepare and store state: XSTATE = block ^ key; the same values overwrite the state buffer, so the final xorq-to-memory leaves (last round output ^ block ^ old state) */
+ pxor XKEY0, XSTATE0;
+ pxor XKEY1, XSTATE1;
+ pxor XKEY2, XSTATE2;
+ pxor XKEY3, XSTATE3;
+ pxor XKEY4, XSTATE4;
+ pxor XKEY5, XSTATE5;
+ pxor XKEY6, XSTATE6;
+ pxor XKEY7, XSTATE7;
+
+ movq XSTATE0, 0*8(%rdi);
+ movq XSTATE1, 1*8(%rdi);
+ movq XSTATE2, 2*8(%rdi);
+ movq XSTATE3, 3*8(%rdi);
+ movq XSTATE4, 4*8(%rdi);
+ movq XSTATE5, 5*8(%rdi);
+ movq XSTATE6, 6*8(%rdi);
+ movq XSTATE7, 7*8(%rdi);
+
+ addq $64, STACK_DATAP(%rsp); /* address of the following block, picked up by .Lblock_loop */
+ movl $(0), STACK_ROUNDS(%rsp); /* round counter = 0 */
+.align 8
+.Lround_loop:
+ do_whirl(mov, RI1 /*XKEY0*/, RB_ADD0, do_movq, XKEY4); /* key schedule; "mov" writes all of RB0-RB7, so they need no clearing */
+ do_whirl(xor, RI2 /*XKEY1*/, RB_ADD1, do_movq, XKEY5); /* the last argument is the word loaded into this RI register for its next use */
+ do_whirl(xor, RI3 /*XKEY2*/, RB_ADD2, do_movq, XKEY6);
+ do_whirl(xor, RI4 /*XKEY3*/, RB_ADD3, do_movq, XKEY7);
+ do_whirl(xor, RI1 /*XKEY4*/, RB_ADD4, do_movq, XSTATE0); /* RI1-RI4 were refilled with key words 4-7 above */
+ do_whirl(xor, RI2 /*XKEY5*/, RB_ADD5, do_movq, XSTATE1);
+ do_whirl(xor, RI3 /*XKEY6*/, RB_ADD6, do_movq, XSTATE2);
+ do_whirl(xor, RI4 /*XKEY7*/, RB_ADD7, do_movq, XSTATE3);
+
+ movl STACK_ROUNDS(%rsp), RT0d; /* round index before the increment; the 32-bit load zero-extends RT0 */
+ movq RB1, XKEY1; /* RB0-RB7 = new round key: saved to XKEY0-7 and left in RB as the value the state lookups xor into */
+ addl $1, STACK_ROUNDS(%rsp);
+ movq RB2, XKEY2;
+ movq RB3, XKEY3;
+ xorq RC(RTAB,RT0,8), RB0; /* Add round constant */
+ movq RB4, XKEY4;
+ movq RB5, XKEY5;
+ movq RB0, XKEY0;
+ movq RB6, XKEY6;
+ movq RB7, XKEY7;
+
+ do_whirl(xor, RI1 /*XSTATE0*/, RB_ADD0, do_movq, XSTATE4); /* state round: table entries are xor-ed onto the round key in RB0-RB7 */
+ do_whirl(xor, RI2 /*XSTATE1*/, RB_ADD1, do_movq, XSTATE5);
+ do_whirl(xor, RI3 /*XSTATE2*/, RB_ADD2, do_movq, XSTATE6);
+ do_whirl(xor, RI4 /*XSTATE3*/, RB_ADD3, do_movq, XSTATE7);
+
+ cmpl $10, STACK_ROUNDS(%rsp); /* the counter is already incremented: 10 marks the final round (RC has 10 entries) */
+ je .Lis_last_round;
+
+ do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, do_movq, XKEY0); /* refill RI1-RI4 with new key words 0-3 for the next key schedule */
+ do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, do_movq, XKEY1);
+ do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, do_movq, XKEY2);
+ do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, do_movq, XKEY3);
+ movq RB0, XSTATE0; /* keep the new state words for the state lookups of the next round */
+ movq RB1, XSTATE1;
+ movq RB2, XSTATE2;
+ movq RB3, XSTATE3;
+ movq RB4, XSTATE4;
+ movq RB5, XSTATE5;
+ movq RB6, XSTATE6;
+ movq RB7, XSTATE7;
+
+ jmp .Lround_loop;
+.align 8
+.Lis_last_round:
+ do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, dummy, _); /* dummy: no further word has to be preloaded */
+ movq STACK_STATEP(%rsp), RI1; /* RI1 is free now; it carries the state pointer into .Lblock_loop */
+ do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, dummy, _);
+ do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, dummy, _);
+ do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, dummy, _);
+
+ /* store state: memory already holds block ^ old state, so xor-ing RB0-RB7 in place completes the update */
+ xorq RB0, 0*8(RI1);
+ xorq RB1, 1*8(RI1);
+ xorq RB2, 2*8(RI1);
+ xorq RB3, 3*8(RI1);
+ xorq RB4, 4*8(RI1);
+ xorq RB5, 5*8(RI1);
+ xorq RB6, 6*8(RI1);
+ xorq RB7, 7*8(RI1);
+
+ subq $1, STACK_NBLKS(%rsp); /* sets ZF once no blocks remain */
+ jnz .Lblock_loop;
+
+ movq STACK_RBP(%rsp), %rbp; /* restore the registers saved on entry */
+ movq STACK_RBX(%rsp), %rbx;
+ movq STACK_R12(%rsp), %r12;
+ movq STACK_R13(%rsp), %r13;
+ movq STACK_R14(%rsp), %r14;
+ movq STACK_R15(%rsp), %r15;
+ addq $STACK_MAX, %rsp; /* NOTE(review): %xmm0-%xmm15 and the stack slots are not cleared before returning */
+.Lskip:
+ movl $(STACK_MAX + 8), %eax; /* same constant whether or not any block was processed */
+ ret;
+.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64;
+
+#endif
+#endif