From 8eabecc883332156adffc1df42d27f614c157e06 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Sat, 27 Dec 2014 12:37:16 +0200
Subject: Add Intel SSSE3 based vector permutation AES implementation

* cipher/Makefile.am: Add 'rijndael-ssse3-amd64.c'.
* cipher/rijndael-internal.h (USE_SSSE3): New.
(RIJNDAEL_context_s) [USE_SSSE3]: Add 'use_ssse3'.
* cipher/rijndael-ssse3-amd64.c: New.
* cipher/rijndael.c [USE_SSSE3] (_gcry_aes_ssse3_do_setkey)
(_gcry_aes_ssse3_prepare_decryption, _gcry_aes_ssse3_encrypt)
(_gcry_aes_ssse3_decrypt, _gcry_aes_ssse3_cfb_enc)
(_gcry_aes_ssse3_cbc_enc, _gcry_aes_ssse3_ctr_enc)
(_gcry_aes_ssse3_cfb_dec, _gcry_aes_ssse3_cbc_dec): New.
(do_setkey): Add HWF check for SSSE3 and setup for SSSE3 implementation.
(prepare_decryption, _gcry_aes_cfb_enc, _gcry_aes_cbc_enc)
(_gcry_aes_ctr_enc, _gcry_aes_cfb_dec, _gcry_aes_cbc_dec): Add selection
for SSSE3 implementation.
* configure.ac [host=x86_64]: Add 'rijndael-ssse3-amd64.lo'.
--

This patch adds the "AES with vector permutations" implementation by
Mike Hamburg.  Public-domain source code is available at:

  http://crypto.stanford.edu/vpaes/

Benchmark on Intel Core2 T8100 (2.1 GHz, no turbo):

Old (AMD64 asm):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      8.79 ns/B     108.5 MiB/s     18.46 c/B
        ECB dec |      9.07 ns/B     105.1 MiB/s     19.05 c/B
        CBC enc |      7.77 ns/B     122.7 MiB/s     16.33 c/B
        CBC dec |      7.74 ns/B     123.2 MiB/s     16.26 c/B
        CFB enc |      7.88 ns/B     121.0 MiB/s     16.54 c/B
        CFB dec |      7.56 ns/B     126.1 MiB/s     15.88 c/B
        OFB enc |      9.02 ns/B     105.8 MiB/s     18.94 c/B
        OFB dec |      9.07 ns/B     105.1 MiB/s     19.05 c/B
        CTR enc |      7.80 ns/B     122.2 MiB/s     16.38 c/B
        CTR dec |      7.81 ns/B     122.2 MiB/s     16.39 c/B

New (ssse3):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      5.77 ns/B     165.2 MiB/s     12.13 c/B
        ECB dec |      7.13 ns/B     133.7 MiB/s     14.98 c/B
        CBC enc |      5.27 ns/B     181.0 MiB/s     11.06 c/B
        CBC dec |      6.39 ns/B     149.3 MiB/s     13.42 c/B
        CFB enc |      5.27 ns/B     180.9 MiB/s     11.07 c/B
        CFB dec |      5.28 ns/B     180.7 MiB/s     11.08 c/B
        OFB enc |      6.11 ns/B     156.1 MiB/s     12.83 c/B
        OFB dec |      6.13 ns/B     155.5 MiB/s     12.88 c/B
        CTR enc |      5.26 ns/B     181.5 MiB/s     11.04 c/B
        CTR dec |      5.24 ns/B     182.0 MiB/s     11.00 c/B

Benchmark on Intel i5-2450M (2.5 GHz, no turbo, AES-NI disabled):

Old (AMD64 asm):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      8.06 ns/B     118.3 MiB/s     20.15 c/B
        ECB dec |      8.21 ns/B     116.1 MiB/s     20.53 c/B
        CBC enc |      7.88 ns/B     121.1 MiB/s     19.69 c/B
        CBC dec |      7.57 ns/B     126.0 MiB/s     18.92 c/B
        CFB enc |      7.87 ns/B     121.2 MiB/s     19.67 c/B
        CFB dec |      7.56 ns/B     126.2 MiB/s     18.89 c/B
        OFB enc |      8.27 ns/B     115.3 MiB/s     20.67 c/B
        OFB dec |      8.28 ns/B     115.1 MiB/s     20.71 c/B
        CTR enc |      8.02 ns/B     119.0 MiB/s     20.04 c/B
        CTR dec |      8.02 ns/B     118.9 MiB/s     20.05 c/B

New (ssse3):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      4.03 ns/B     236.6 MiB/s     10.07 c/B
        ECB dec |      5.28 ns/B     180.8 MiB/s     13.19 c/B
        CBC enc |      3.77 ns/B     252.7 MiB/s      9.43 c/B
        CBC dec |      4.69 ns/B     203.3 MiB/s     11.73 c/B
        CFB enc |      3.75 ns/B     254.3 MiB/s      9.37 c/B
        CFB dec |      3.69 ns/B     258.6 MiB/s      9.22 c/B
        OFB enc |      4.17 ns/B     228.7 MiB/s     10.43 c/B
        OFB dec |      4.17 ns/B     228.7 MiB/s     10.42 c/B
        CTR enc |      3.72 ns/B     256.5 MiB/s      9.30 c/B
        CTR dec |      3.72 ns/B     256.1 MiB/s      9.31 c/B

Signed-off-by: Jussi Kivilinna
---
 cipher/Makefile.am            |    2 +-
 cipher/rijndael-internal.h    |    9 +
 cipher/rijndael-ssse3-amd64.c | 1209 ++++++++++++++++++++++++++++++++++++++++
 cipher/rijndael.c             |   96 +++-
 configure.ac                  |    3 +
 5 files changed, 1316 insertions(+), 3 deletions(-)
 create mode 100644 cipher/rijndael-ssse3-amd64.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 98142ed8..7dd626cb
100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -75,7 +75,7 @@ md4.c \ md5.c \ poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \ rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \ - rijndael-padlock.c rijndael-amd64.S rijndael-arm.S \ + rijndael-padlock.c rijndael-amd64.S rijndael-arm.S rijndael-ssse3-amd64.c \ rmd160.c \ rsa.c \ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \ diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h index 7ff86609..854980bd 100644 --- a/cipher/rijndael-internal.h +++ b/cipher/rijndael-internal.h @@ -43,6 +43,12 @@ # define USE_AMD64_ASM 1 #endif +/* USE_SSSE3 indicates whether to use SSSE3 code. */ +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) +# define USE_SSSE3 1 +#endif + /* USE_ARM_ASM indicates whether to use ARM assembly code. */ #undef USE_ARM_ASM #if defined(__ARMEL__) @@ -116,6 +122,9 @@ typedef struct RIJNDAEL_context_s #ifdef USE_AESNI unsigned int use_aesni:1; /* AES-NI shall be used. */ #endif /*USE_AESNI*/ +#ifdef USE_SSSE3 + unsigned int use_ssse3:1; /* SSSE3 shall be used. */ +#endif /*USE_SSSE3*/ rijndael_cryptfn_t encrypt_fn; rijndael_cryptfn_t decrypt_fn; rijndael_prefetchfn_t prefetch_enc_fn; diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c new file mode 100644 index 00000000..112ab22d --- /dev/null +++ b/cipher/rijndael-ssse3-amd64.c @@ -0,0 +1,1209 @@ +/* SSSE3 vector permutation AES for Libgcrypt + * Copyright (C) 2014-2015 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include +#include +#include +#include /* for memcmp() */ + +#include "types.h" /* for byte and u32 typedefs */ +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" +#include "rijndael-internal.h" + + +#ifdef USE_SSSE3 + + +/* Two macros to be called prior and after the use of SSSE3 + instructions. There should be no external function calls between + the use of these macros. There purpose is to make sure that the + SSE regsiters are cleared and won't reveal any information about + the key or the data. 
*/ +#define vpaes_ssse3_prepare_enc(const_ptr) \ + asm volatile ("lea .Laes_consts(%%rip), %q0 \n\t" \ + "movdqa (%q0), %%xmm9 # 0F \n\t" \ + "movdqa .Lk_inv (%q0), %%xmm10 # inv \n\t" \ + "movdqa .Lk_inv+16(%q0), %%xmm11 # inva \n\t" \ + "movdqa .Lk_sb1 (%q0), %%xmm13 # sb1u \n\t" \ + "movdqa .Lk_sb1+16(%q0), %%xmm12 # sb1t \n\t" \ + "movdqa .Lk_sb2 (%q0), %%xmm15 # sb2u \n\t" \ + "movdqa .Lk_sb2+16(%q0), %%xmm14 # sb2t \n\t" \ + : "=c" (const_ptr) \ + : \ + : "memory" ) + +#define vpaes_ssse3_prepare_dec(const_ptr) \ + asm volatile ("lea .Laes_consts(%%rip), %q0 \n\t" \ + "movdqa (%q0), %%xmm9 # 0F \n\t" \ + "movdqa .Lk_inv (%q0), %%xmm10 # inv \n\t" \ + "movdqa .Lk_inv+16(%q0), %%xmm11 # inva \n\t" \ + "movdqa .Lk_dsb9 (%q0), %%xmm13 # sb9u \n\t" \ + "movdqa .Lk_dsb9+16(%q0), %%xmm12 # sb9t \n\t" \ + "movdqa .Lk_dsbd (%q0), %%xmm15 # sbdu \n\t" \ + "movdqa .Lk_dsbb (%q0), %%xmm14 # sbbu \n\t" \ + "movdqa .Lk_dsbe (%q0), %%xmm8 # sbeu \n\t" \ + : "=c" (const_ptr) \ + : \ + : "memory" ) + +#define vpaes_ssse3_cleanup() \ + asm volatile ("pxor %%xmm0, %%xmm0 \n\t" \ + "pxor %%xmm1, %%xmm1 \n\t" \ + "pxor %%xmm2, %%xmm2 \n\t" \ + "pxor %%xmm3, %%xmm3 \n\t" \ + "pxor %%xmm4, %%xmm4 \n\t" \ + "pxor %%xmm5, %%xmm5 \n\t" \ + "pxor %%xmm6, %%xmm6 \n\t" \ + "pxor %%xmm7, %%xmm7 \n\t" \ + "pxor %%xmm8, %%xmm8 \n\t" \ + ::: "memory" ) + + +void +_gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key) +{ + unsigned int keybits = (ctx->rounds - 10) * 32 + 128; + + asm volatile ("leaq %q[key], %%rdi" "\n\t" + "movl %[bits], %%esi" "\n\t" + "leaq %[buf], %%rdx" "\n\t" + "movl %[dir], %%ecx" "\n\t" + "movl %[rotoffs], %%r8d" "\n\t" + "call _aes_schedule_core" "\n\t" + : + : [key] "m" (*key), + [bits] "g" (keybits), + [buf] "m" (ctx->keyschenc32[0][0]), + [dir] "g" (0), + [rotoffs] "g" (48) + : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi", + "cc", "memory"); + + /* Save key for setting up decryption. */ + memcpy(&ctx->keyschdec32[0][0], key, keybits / 8); +} + + +/* Make a decryption key from an encryption key. */ +void +_gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx) +{ + unsigned int keybits = (ctx->rounds - 10) * 32 + 128; + + asm volatile ("leaq %q[key], %%rdi" "\n\t" + "movl %[bits], %%esi" "\n\t" + "leaq %[buf], %%rdx" "\n\t" + "movl %[dir], %%ecx" "\n\t" + "movl %[rotoffs], %%r8d" "\n\t" + "call _aes_schedule_core" "\n\t" + : + : [key] "m" (ctx->keyschdec32[0][0]), + [bits] "g" (keybits), + [buf] "m" (ctx->keyschdec32[ctx->rounds][0]), + [dir] "g" (1), + [rotoffs] "g" ((keybits == 192) ? 0 : 32) + : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi", + "cc", "memory"); +} + + +/* Encrypt one block using the Intel SSSE3 instructions. Block is input +* and output through SSE register xmm0. */ +static inline void +do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds, + const void *aes_const_ptr) +{ + unsigned int middle_rounds = nrounds - 1; + const void *keysched = ctx->keyschenc32; + + asm volatile ("call _aes_encrypt_core" "\n\t" + : "+a" (middle_rounds), "+d" (keysched) + : "c" (aes_const_ptr) + : "rdi", "rsi", "cc", "memory"); +} + + +/* Decrypt one block using the Intel SSSE3 instructions. Block is input +* and output through SSE register xmm0. 
*/ +static inline void +do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds, + const void *aes_const_ptr) +{ + unsigned int middle_rounds = nrounds - 1; + const void *keysched = ctx->keyschdec32; + + asm volatile ("call _aes_decrypt_core" "\n\t" + : "+a" (middle_rounds), "+d" (keysched) + : "c" (aes_const_ptr) + : "rsi", "cc", "memory"); +} + + +unsigned int +_gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src) +{ + unsigned int nrounds = ctx->rounds; + const void *aes_const_ptr; + + vpaes_ssse3_prepare_enc (aes_const_ptr); + asm volatile ("movdqu %[src], %%xmm0\n\t" + : + : [src] "m" (*src) + : "memory" ); + do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr); + asm volatile ("movdqu %%xmm0, %[dst]\n\t" + : [dst] "=m" (*dst) + : + : "memory" ); + vpaes_ssse3_cleanup (); + return 0; +} + + +void +_gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, + const unsigned char *inbuf, unsigned char *iv, + size_t nblocks) +{ + unsigned int nrounds = ctx->rounds; + const void *aes_const_ptr; + + vpaes_ssse3_prepare_enc (aes_const_ptr); + + asm volatile ("movdqu %[iv], %%xmm0\n\t" + : /* No output */ + : [iv] "m" (*iv) + : "memory" ); + + for ( ;nblocks; nblocks-- ) + { + do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr); + + asm volatile ("movdqu %[inbuf], %%xmm1\n\t" + "pxor %%xmm1, %%xmm0\n\t" + "movdqu %%xmm0, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : [inbuf] "m" (*inbuf) + : "memory" ); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + asm volatile ("movdqu %%xmm0, %[iv]\n\t" + : [iv] "=m" (*iv) + : + : "memory" ); + + vpaes_ssse3_cleanup (); +} + + +void +_gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, + const unsigned char *inbuf, unsigned char *iv, + size_t nblocks, int cbc_mac) +{ + unsigned int nrounds = ctx->rounds; + const void *aes_const_ptr; + + vpaes_ssse3_prepare_enc (aes_const_ptr); + + asm volatile ("movdqu %[iv], %%xmm7\n\t" + : /* No output */ + : [iv] "m" (*iv) + : "memory" ); + + for ( ;nblocks; nblocks-- ) + { + asm volatile ("movdqu %[inbuf], %%xmm0\n\t" + "pxor %%xmm7, %%xmm0\n\t" + : /* No output */ + : [inbuf] "m" (*inbuf) + : "memory" ); + + do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr); + + asm volatile ("movdqa %%xmm0, %%xmm7\n\t" + "movdqu %%xmm0, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : + : "memory" ); + + inbuf += BLOCKSIZE; + if (!cbc_mac) + outbuf += BLOCKSIZE; + } + + asm volatile ("movdqu %%xmm7, %[iv]\n\t" + : [iv] "=m" (*iv) + : + : "memory" ); + + vpaes_ssse3_cleanup (); +} + + +void +_gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf, + const unsigned char *inbuf, unsigned char *ctr, + size_t nblocks) +{ + static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + unsigned int nrounds = ctx->rounds; + const void *aes_const_ptr; + u64 ctrlow; + + vpaes_ssse3_prepare_enc (aes_const_ptr); + + asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */ + "movdqa (%[ctr]), %%xmm7\n\t" /* Preload CTR */ + "movq 8(%[ctr]), %q[ctrlow]\n\t" + "bswapq %q[ctrlow]\n\t" + : [ctrlow] "=r" (ctrlow) + : [mask] "m" (*be_mask), + [ctr] "r" (ctr) + : "memory", "cc"); + + for ( ;nblocks; nblocks-- ) + { + asm volatile ("movdqa %%xmm7, %%xmm0\n\t" /* xmm0 := CTR (xmm7) */ + "pcmpeqd %%xmm1, %%xmm1\n\t" + "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */ + + "pshufb %%xmm6, %%xmm7\n\t" + "psubq %%xmm1, %%xmm7\n\t" /* xmm7++ (big endian) */ + + /* detect if 64-bit carry 
handling is needed */ + "incq %q[ctrlow]\n\t" + "jnz .Lno_carry%=\n\t" + + "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */ + "psubq %%xmm1, %%xmm7\n\t" /* add carry to upper 64bits */ + + ".Lno_carry%=:\n\t" + + "pshufb %%xmm6, %%xmm7\n\t" + : + : [ctr] "r" (ctr), [ctrlow] "r" (ctrlow) + : "cc", "memory"); + + do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr); + + asm volatile ("movdqu %[src], %%xmm1\n\t" /* xmm1 := input */ + "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */ + "movdqu %%xmm0, %[dst]" /* Store EncCTR. */ + : [dst] "=m" (*outbuf) + : [src] "m" (*inbuf) + : "memory"); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + asm volatile ("movdqu %%xmm7, %[ctr]\n\t" /* Update CTR (mem). */ + : [ctr] "=m" (*ctr) + : + : "memory" ); + + vpaes_ssse3_cleanup (); +} + + +unsigned int +_gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src) +{ + unsigned int nrounds = ctx->rounds; + const void *aes_const_ptr; + + vpaes_ssse3_prepare_dec (aes_const_ptr); + asm volatile ("movdqu %[src], %%xmm0\n\t" + : + : [src] "m" (*src) + : "memory" ); + do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr); + asm volatile ("movdqu %%xmm0, %[dst]\n\t" + : [dst] "=m" (*dst) + : + : "memory" ); + vpaes_ssse3_cleanup (); + return 0; +} + + +void +_gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, + const unsigned char *inbuf, unsigned char *iv, + size_t nblocks) +{ + unsigned int nrounds = ctx->rounds; + const void *aes_const_ptr; + + vpaes_ssse3_prepare_enc (aes_const_ptr); + + asm volatile ("movdqu %[iv], %%xmm0\n\t" + : /* No output */ + : [iv] "m" (*iv) + : "memory" ); + + for ( ;nblocks; nblocks-- ) + { + do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr); + + asm volatile ("movdqa %%xmm0, %%xmm6\n\t" + "movdqu %[inbuf], %%xmm0\n\t" + "pxor %%xmm0, %%xmm6\n\t" + "movdqu %%xmm6, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : [inbuf] "m" (*inbuf) + : "memory" ); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + asm volatile ("movdqu %%xmm0, %[iv]\n\t" + : [iv] "=m" (*iv) + : + : "memory" ); + + vpaes_ssse3_cleanup (); +} + + +void +_gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf, + const unsigned char *inbuf, unsigned char *iv, + size_t nblocks) +{ + unsigned int nrounds = ctx->rounds; + const void *aes_const_ptr; + + vpaes_ssse3_prepare_dec (aes_const_ptr); + + asm volatile + ("movdqu %[iv], %%xmm7\n\t" /* use xmm7 as fast IV storage */ + : /* No output */ + : [iv] "m" (*iv) + : "memory"); + + for ( ;nblocks; nblocks-- ) + { + asm volatile + ("movdqu %[inbuf], %%xmm0\n\t" + "movdqa %%xmm0, %%xmm6\n\t" /* use xmm6 as savebuf */ + : /* No output */ + : [inbuf] "m" (*inbuf) + : "memory"); + + do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr); + + asm volatile + ("pxor %%xmm7, %%xmm0\n\t" /* xor IV with output */ + "movdqu %%xmm0, %[outbuf]\n\t" + "movdqu %%xmm6, %%xmm7\n\t" /* store savebuf as new IV */ + : [outbuf] "=m" (*outbuf) + : + : "memory"); + + outbuf += BLOCKSIZE; + inbuf += BLOCKSIZE; + } + + asm volatile + ("movdqu %%xmm7, %[iv]\n\t" /* store IV */ + : /* No output */ + : [iv] "m" (*iv) + : "memory"); + + vpaes_ssse3_cleanup (); +} + + + +asm ( + "\n\t" "##" + "\n\t" "## Constant-time SSSE3 AES core implementation." + "\n\t" "##" + "\n\t" "## By Mike Hamburg (Stanford University), 2009" + "\n\t" "## Public domain." + "\n\t" "##" + + "\n\t" ".text" + + "\n\t" "##" + "\n\t" "## _aes_encrypt_core" + "\n\t" "##" + "\n\t" "## AES-encrypt %xmm0." 
+ "\n\t" "##" + "\n\t" "## Inputs:" + "\n\t" "## %xmm0 = input" + "\n\t" "## %xmm9-%xmm15 as in .Laes_preheat" + "\n\t" "## %rcx = .Laes_consts" + "\n\t" "## (%rdx) = scheduled keys" + "\n\t" "## %rax = nrounds - 1" + "\n\t" "##" + "\n\t" "## Output in %xmm0" + "\n\t" "## Clobbers %xmm1-%xmm4, %r9, %r11, %rax" + "\n\t" "## Preserves %xmm6 - %xmm7 so you get some local vectors" + "\n\t" "##" + "\n\t" "##" + "\n\t" ".align 16" + "\n\t" ".type _aes_encrypt_core,@function" + "\n\t" "_aes_encrypt_core:" + "\n\t" " leaq .Lk_mc_backward(%rcx), %rdi" + "\n\t" " mov $16, %rsi" + "\n\t" " movdqa .Lk_ipt (%rcx), %xmm2 # iptlo" + "\n\t" " movdqa %xmm9, %xmm1" + "\n\t" " pandn %xmm0, %xmm1" + "\n\t" " psrld $4, %xmm1" + "\n\t" " pand %xmm9, %xmm0" + "\n\t" " pshufb %xmm0, %xmm2" + "\n\t" " movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi" + "\n\t" " pshufb %xmm1, %xmm0" + "\n\t" " pxor (%rdx),%xmm2" + "\n\t" " pxor %xmm2, %xmm0" + "\n\t" " add $16, %rdx" + "\n\t" " jmp .Laes_entry" + + "\n\t" ".align 8" + "\n\t" ".Laes_loop:" + "\n\t" " # middle of middle round" + "\n\t" " movdqa %xmm13, %xmm4 # 4 : sb1u" + "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb1u" + "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k" + "\n\t" " movdqa %xmm12, %xmm0 # 0 : sb1t" + "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t" + "\n\t" " pxor %xmm4, %xmm0 # 0 = A" + "\n\t" " movdqa %xmm15, %xmm4 # 4 : sb2u" + "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb2u" + "\n\t" " movdqa .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1" + "\n\t" " movdqa %xmm14, %xmm2 # 2 : sb2t" + "\n\t" " pshufb %xmm3, %xmm2 # 2 = sb2t" + "\n\t" " pxor %xmm4, %xmm2 # 2 = 2A" + "\n\t" " movdqa %xmm0, %xmm3 # 3 = A" + "\n\t" " pshufb %xmm1, %xmm0 # 0 = B" + "\n\t" " pxor %xmm2, %xmm0 # 0 = 2A+B" + "\n\t" " pshufb (%rsi,%rdi), %xmm3 # 3 = D" + "\n\t" " lea 16(%esi),%esi # next mc" + "\n\t" " pxor %xmm0, %xmm3 # 3 = 2A+B+D" + "\n\t" " lea 16(%rdx),%rdx # next key" + "\n\t" " pshufb %xmm1, %xmm0 # 0 = 2B+C" + "\n\t" " pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D" + "\n\t" " and $48, %rsi # ... mod 4" + "\n\t" " dec %rax # nr--" + + "\n\t" ".Laes_entry:" + "\n\t" " # top of round" + "\n\t" " movdqa %xmm9, %xmm1 # 1 : i" + "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4" + "\n\t" " psrld $4, %xmm1 # 1 = i" + "\n\t" " pand %xmm9, %xmm0 # 0 = k" + "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k" + "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k" + "\n\t" " pxor %xmm1, %xmm0 # 0 = j" + "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i" + "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i" + "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k" + "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j" + "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j" + "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k" + "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak" + "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak" + "\n\t" " pxor %xmm0, %xmm2 # 2 = io" + "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak" + "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak" + "\n\t" " pxor %xmm1, %xmm3 # 3 = jo" + "\n\t" " jnz .Laes_loop" + + "\n\t" " # middle of last round" + "\n\t" " movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou" + "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou" + "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k" + "\n\t" " movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot" + "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t" + "\n\t" " pxor %xmm4, %xmm0 # 0 = A" + "\n\t" " pshufb .Lk_sr(%rsi,%rcx), %xmm0" + "\n\t" " ret" + "\n\t" ".size _aes_encrypt_core,.-_aes_encrypt_core" + + "\n\t" "##" + "\n\t" "## Decryption core" + "\n\t" "##" + "\n\t" "## Same API as encryption core." 
+ "\n\t" "##" + "\n\t" ".align 16" + "\n\t" ".type _aes_decrypt_core,@function" + "\n\t" "_aes_decrypt_core:" + "\n\t" " movl %eax, %esi" + "\n\t" " shll $4, %esi" + "\n\t" " xorl $48, %esi" + "\n\t" " andl $48, %esi" + "\n\t" " movdqa .Lk_dipt (%rcx), %xmm2 # iptlo" + "\n\t" " movdqa %xmm9, %xmm1" + "\n\t" " pandn %xmm0, %xmm1" + "\n\t" " psrld $4, %xmm1" + "\n\t" " pand %xmm9, %xmm0" + "\n\t" " pshufb %xmm0, %xmm2" + "\n\t" " movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi" + "\n\t" " pshufb %xmm1, %xmm0" + "\n\t" " pxor (%rdx), %xmm2" + "\n\t" " pxor %xmm2, %xmm0" + "\n\t" " movdqa .Lk_mc_forward+48(%rcx), %xmm5" + "\n\t" " lea 16(%rdx), %rdx" + "\n\t" " neg %rax" + "\n\t" " jmp .Laes_dec_entry" + + "\n\t" ".align 16" + "\n\t" ".Laes_dec_loop:" + "\n\t" "##" + "\n\t" "## Inverse mix columns" + "\n\t" "##" + "\n\t" " movdqa %xmm13, %xmm4 # 4 : sb9u" + "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb9u" + "\n\t" " pxor (%rdx), %xmm4" + "\n\t" " movdqa %xmm12, %xmm0 # 0 : sb9t" + "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb9t" + "\n\t" " movdqa .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt" + "\n\t" " pxor %xmm4, %xmm0 # 0 = ch" + "\n\t" " lea 16(%rdx), %rdx # next round key" + + "\n\t" " pshufb %xmm5, %xmm0 # MC ch" + "\n\t" " movdqa %xmm15, %xmm4 # 4 : sbdu" + "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbdu" + "\n\t" " pxor %xmm0, %xmm4 # 4 = ch" + "\n\t" " pshufb %xmm3, %xmm1 # 1 = sbdt" + "\n\t" " pxor %xmm4, %xmm1 # 1 = ch" + + "\n\t" " pshufb %xmm5, %xmm1 # MC ch" + "\n\t" " movdqa %xmm14, %xmm4 # 4 : sbbu" + "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbbu" + "\n\t" " inc %rax # nr--" + "\n\t" " pxor %xmm1, %xmm4 # 4 = ch" + "\n\t" " movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt" + "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbbt" + "\n\t" " pxor %xmm4, %xmm0 # 0 = ch" + + "\n\t" " pshufb %xmm5, %xmm0 # MC ch" + "\n\t" " movdqa %xmm8, %xmm4 # 4 : sbeu" + "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbeu" + "\n\t" " pshufd $0x93, %xmm5, %xmm5" + "\n\t" " pxor %xmm0, %xmm4 # 4 = ch" + "\n\t" " movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet" + "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbet" + "\n\t" " pxor %xmm4, %xmm0 # 0 = ch" + + "\n\t" ".Laes_dec_entry:" + "\n\t" " # top of round" + "\n\t" " movdqa %xmm9, %xmm1 # 1 : i" + "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4" + "\n\t" " psrld $4, %xmm1 # 1 = i" + "\n\t" " pand %xmm9, %xmm0 # 0 = k" + "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k" + "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k" + "\n\t" " pxor %xmm1, %xmm0 # 0 = j" + "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i" + "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i" + "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k" + "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j" + "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j" + "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k" + "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak" + "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak" + "\n\t" " pxor %xmm0, %xmm2 # 2 = io" + "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak" + "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak" + "\n\t" " pxor %xmm1, %xmm3 # 3 = jo" + "\n\t" " jnz .Laes_dec_loop" + + "\n\t" " # middle of last round" + "\n\t" " movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou" + "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou" + "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k" + "\n\t" " movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot" + "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t" + "\n\t" " pxor %xmm4, %xmm0 # 0 = A" + "\n\t" " pshufb .Lk_sr(%rsi,%rcx), %xmm0" + "\n\t" " ret" + "\n\t" ".size _aes_decrypt_core,.-_aes_decrypt_core" + + "\n\t" "########################################################" + "\n\t" "## ##" + "\n\t" "## AES key schedule ##" + "\n\t" "## ##" + "\n\t" 
"########################################################" + + "\n\t" ".align 16" + "\n\t" ".type _aes_schedule_core,@function" + "\n\t" "_aes_schedule_core:" + "\n\t" " # rdi = key" + "\n\t" " # rsi = size in bits" + "\n\t" " # rdx = buffer" + "\n\t" " # rcx = direction. 0=encrypt, 1=decrypt" + + "\n\t" " # load the tables" + "\n\t" " lea .Laes_consts(%rip), %r10" + "\n\t" " movdqa (%r10), %xmm9 # 0F" + "\n\t" " movdqa .Lk_inv (%r10), %xmm10 # inv" + "\n\t" " movdqa .Lk_inv+16(%r10), %xmm11 # inva" + "\n\t" " movdqa .Lk_sb1 (%r10), %xmm13 # sb1u" + "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t" + "\n\t" " movdqa .Lk_sb2 (%r10), %xmm15 # sb2u" + "\n\t" " movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t" + + "\n\t" " movdqa .Lk_rcon(%r10), %xmm8 # load rcon" + "\n\t" " movdqu (%rdi), %xmm0 # load key (unaligned)" + + "\n\t" " # input transform" + "\n\t" " movdqu %xmm0, %xmm3" + "\n\t" " lea .Lk_ipt(%r10), %r11" + "\n\t" " call .Laes_schedule_transform" + "\n\t" " movdqu %xmm0, %xmm7" + + "\n\t" " test %rcx, %rcx" + "\n\t" " jnz .Laes_schedule_am_decrypting" + + "\n\t" " # encrypting, output zeroth round key after transform" + "\n\t" " movdqa %xmm0, (%rdx)" + "\n\t" " jmp .Laes_schedule_go" + + "\n\t" ".Laes_schedule_am_decrypting:" + "\n\t" " # decrypting, output zeroth round key after shiftrows" + "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3" + "\n\t" " movdqa %xmm3, (%rdx)" + "\n\t" " xor $48, %r8" + + "\n\t" ".Laes_schedule_go:" + "\n\t" " cmp $192, %rsi" + "\n\t" " je .Laes_schedule_192" + "\n\t" " cmp $256, %rsi" + "\n\t" " je .Laes_schedule_256" + "\n\t" " # 128: fall though" + + "\n\t" "##" + "\n\t" "## .Laes_schedule_128" + "\n\t" "##" + "\n\t" "## 128-bit specific part of key schedule." + "\n\t" "##" + "\n\t" "## This schedule is really simple, because all its parts" + "\n\t" "## are accomplished by the subroutines." + "\n\t" "##" + "\n\t" ".Laes_schedule_128:" + "\n\t" " mov $10, %rsi" + + "\n\t" ".Laes_schedule_128_L:" + "\n\t" " call .Laes_schedule_round" + "\n\t" " dec %rsi" + "\n\t" " jz .Laes_schedule_mangle_last" + "\n\t" " call .Laes_schedule_mangle # write output" + "\n\t" " jmp .Laes_schedule_128_L" + + "\n\t" "##" + "\n\t" "## .Laes_schedule_192" + "\n\t" "##" + "\n\t" "## 192-bit specific part of key schedule." + "\n\t" "##" + "\n\t" "## The main body of this schedule is the same as the 128-bit" + "\n\t" "## schedule, but with more smearing. The long, high side is" + "\n\t" "## stored in %xmm7 as before, and the short, low side is in" + "\n\t" "## the high bits of %xmm6." + "\n\t" "##" + "\n\t" "## This schedule is somewhat nastier, however, because each" + "\n\t" "## round produces 192 bits of key material, or 1.5 round keys." + "\n\t" "## Therefore, on each cycle we do 2 rounds and produce 3 round" + "\n\t" "## keys." 
+ "\n\t" "##" + "\n\t" ".Laes_schedule_192:" + "\n\t" " movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)" + "\n\t" " call .Laes_schedule_transform # input transform" + "\n\t" " pshufd $0x0E, %xmm0, %xmm6" + "\n\t" " pslldq $8, %xmm6 # clobber low side with zeros" + "\n\t" " mov $4, %rsi" + + "\n\t" ".Laes_schedule_192_L:" + "\n\t" " call .Laes_schedule_round" + "\n\t" " palignr $8,%xmm6,%xmm0 " + "\n\t" " call .Laes_schedule_mangle # save key n" + "\n\t" " call .Laes_schedule_192_smear" + "\n\t" " call .Laes_schedule_mangle # save key n+1" + "\n\t" " call .Laes_schedule_round" + "\n\t" " dec %rsi" + "\n\t" " jz .Laes_schedule_mangle_last" + "\n\t" " call .Laes_schedule_mangle # save key n+2" + "\n\t" " call .Laes_schedule_192_smear" + "\n\t" " jmp .Laes_schedule_192_L" + + "\n\t" "##" + "\n\t" "## .Laes_schedule_192_smear" + "\n\t" "##" + "\n\t" "## Smear the short, low side in the 192-bit key schedule." + "\n\t" "##" + "\n\t" "## Inputs:" + "\n\t" "## %xmm7: high side, b a x y" + "\n\t" "## %xmm6: low side, d c 0 0" + "\n\t" "## %xmm13: 0" + "\n\t" "##" + "\n\t" "## Outputs:" + "\n\t" "## %xmm6: b+c+d b+c 0 0" + "\n\t" "## %xmm0: b+c+d b+c b a" + "\n\t" "##" + "\n\t" ".Laes_schedule_192_smear:" + "\n\t" " pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0" + "\n\t" " pxor %xmm0, %xmm6 # -> c+d c 0 0" + "\n\t" " pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a" + "\n\t" " pxor %xmm6, %xmm0 # -> b+c+d b+c b a" + "\n\t" " pshufd $0x0E, %xmm0, %xmm6" + "\n\t" " pslldq $8, %xmm6 # clobber low side with zeros" + "\n\t" " ret" + + "\n\t" "##" + "\n\t" "## .Laes_schedule_256" + "\n\t" "##" + "\n\t" "## 256-bit specific part of key schedule." + "\n\t" "##" + "\n\t" "## The structure here is very similar to the 128-bit" + "\n\t" "## schedule, but with an additional 'low side' in" + "\n\t" "## %xmm6. The low side's rounds are the same as the" + "\n\t" "## high side's, except no rcon and no rotation." + "\n\t" "##" + "\n\t" ".Laes_schedule_256:" + "\n\t" " movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)" + "\n\t" " call .Laes_schedule_transform # input transform" + "\n\t" " mov $7, %rsi" + + "\n\t" ".Laes_schedule_256_L:" + "\n\t" " call .Laes_schedule_mangle # output low result" + "\n\t" " movdqa %xmm0, %xmm6 # save cur_lo in xmm6" + + "\n\t" " # high round" + "\n\t" " call .Laes_schedule_round" + "\n\t" " dec %rsi" + "\n\t" " jz .Laes_schedule_mangle_last" + "\n\t" " call .Laes_schedule_mangle " + + "\n\t" " # low round. swap xmm7 and xmm6" + "\n\t" " pshufd $0xFF, %xmm0, %xmm0" + "\n\t" " movdqa %xmm7, %xmm5" + "\n\t" " movdqa %xmm6, %xmm7" + "\n\t" " call .Laes_schedule_low_round" + "\n\t" " movdqa %xmm5, %xmm7" + + "\n\t" " jmp .Laes_schedule_256_L" + + "\n\t" "##" + "\n\t" "## .Laes_schedule_round" + "\n\t" "##" + "\n\t" "## Runs one main round of the key schedule on %xmm0, %xmm7" + "\n\t" "##" + "\n\t" "## Specifically, runs subbytes on the high dword of %xmm0" + "\n\t" "## then rotates it by one byte and xors into the low dword of" + "\n\t" "## %xmm7." + "\n\t" "##" + "\n\t" "## Adds rcon from low byte of %xmm8, then rotates %xmm8 for" + "\n\t" "## next rcon." + "\n\t" "##" + "\n\t" "## Smears the dwords of %xmm7 by xoring the low into the" + "\n\t" "## second low, result into third, result into highest." + "\n\t" "##" + "\n\t" "## Returns results in %xmm7 = %xmm0." + "\n\t" "## Clobbers %xmm1-%xmm4, %r11." 
+ "\n\t" "##" + "\n\t" ".Laes_schedule_round:" + "\n\t" " # extract rcon from xmm8" + "\n\t" " pxor %xmm1, %xmm1" + "\n\t" " palignr $15, %xmm8, %xmm1" + "\n\t" " palignr $15, %xmm8, %xmm8" + "\n\t" " pxor %xmm1, %xmm7" + + "\n\t" " # rotate" + "\n\t" " pshufd $0xFF, %xmm0, %xmm0" + "\n\t" " palignr $1, %xmm0, %xmm0" + + "\n\t" " # fall through..." + + "\n\t" " # low round: same as high round, but no rotation and no rcon." + "\n\t" ".Laes_schedule_low_round:" + "\n\t" " # smear xmm7" + "\n\t" " movdqa %xmm7, %xmm1" + "\n\t" " pslldq $4, %xmm7" + "\n\t" " pxor %xmm1, %xmm7" + "\n\t" " movdqa %xmm7, %xmm1" + "\n\t" " pslldq $8, %xmm7" + "\n\t" " pxor %xmm1, %xmm7" + "\n\t" " pxor .Lk_s63(%r10), %xmm7" + + "\n\t" " # subbytes" + "\n\t" " movdqa %xmm9, %xmm1" + "\n\t" " pandn %xmm0, %xmm1" + "\n\t" " psrld $4, %xmm1 # 1 = i" + "\n\t" " pand %xmm9, %xmm0 # 0 = k" + "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k" + "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k" + "\n\t" " pxor %xmm1, %xmm0 # 0 = j" + "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i" + "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i" + "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k" + "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j" + "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j" + "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k" + "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak" + "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak" + "\n\t" " pxor %xmm0, %xmm2 # 2 = io" + "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak" + "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak" + "\n\t" " pxor %xmm1, %xmm3 # 3 = jo" + "\n\t" " movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou" + "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou" + "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot" + "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t" + "\n\t" " pxor %xmm4, %xmm0 # 0 = sbox output" + + "\n\t" " # add in smeared stuff" + "\n\t" " pxor %xmm7, %xmm0 " + "\n\t" " movdqa %xmm0, %xmm7" + "\n\t" " ret" + + "\n\t" "##" + "\n\t" "## .Laes_schedule_transform" + "\n\t" "##" + "\n\t" "## Linear-transform %xmm0 according to tables at (%r11)" + "\n\t" "##" + "\n\t" "## Requires that %xmm9 = 0x0F0F... as in preheat" + "\n\t" "## Output in %xmm0" + "\n\t" "## Clobbers %xmm1, %xmm2" + "\n\t" "##" + "\n\t" ".Laes_schedule_transform:" + "\n\t" " movdqa %xmm9, %xmm1" + "\n\t" " pandn %xmm0, %xmm1" + "\n\t" " psrld $4, %xmm1" + "\n\t" " pand %xmm9, %xmm0" + "\n\t" " movdqa (%r11), %xmm2 # lo" + "\n\t" " pshufb %xmm0, %xmm2" + "\n\t" " movdqa 16(%r11), %xmm0 # hi" + "\n\t" " pshufb %xmm1, %xmm0" + "\n\t" " pxor %xmm2, %xmm0" + "\n\t" " ret" + + "\n\t" "##" + "\n\t" "## .Laes_schedule_mangle" + "\n\t" "##" + "\n\t" "## Mangle xmm0 from (basis-transformed) standard version" + "\n\t" "## to our version." 
+ "\n\t" "##" + "\n\t" "## On encrypt," + "\n\t" "## xor with 0x63" + "\n\t" "## multiply by circulant 0,1,1,1" + "\n\t" "## apply shiftrows transform" + "\n\t" "##" + "\n\t" "## On decrypt," + "\n\t" "## xor with 0x63" + "\n\t" "## multiply by 'inverse mixcolumns' circulant E,B,D,9" + "\n\t" "## deskew" + "\n\t" "## apply shiftrows transform" + "\n\t" "##" + "\n\t" "##" + "\n\t" "## Writes out to (%rdx), and increments or decrements it" + "\n\t" "## Keeps track of round number mod 4 in %r8" + "\n\t" "## Preserves xmm0" + "\n\t" "## Clobbers xmm1-xmm5" + "\n\t" "##" + "\n\t" ".Laes_schedule_mangle:" + "\n\t" " movdqa %xmm0, %xmm4 # save xmm0 for later" + "\n\t" " movdqa .Lk_mc_forward(%r10),%xmm5" + "\n\t" " test %rcx, %rcx" + "\n\t" " jnz .Laes_schedule_mangle_dec" + + "\n\t" " # encrypting" + "\n\t" " add $16, %rdx" + "\n\t" " pxor .Lk_s63(%r10),%xmm4" + "\n\t" " pshufb %xmm5, %xmm4" + "\n\t" " movdqa %xmm4, %xmm3" + "\n\t" " pshufb %xmm5, %xmm4" + "\n\t" " pxor %xmm4, %xmm3" + "\n\t" " pshufb %xmm5, %xmm4" + "\n\t" " pxor %xmm4, %xmm3" + + "\n\t" " jmp .Laes_schedule_mangle_both" + + "\n\t" ".Laes_schedule_mangle_dec:" + "\n\t" " lea .Lk_dks_1(%r10), %r11 # first table: *9" + "\n\t" " call .Laes_schedule_transform" + "\n\t" " movdqa %xmm0, %xmm3" + "\n\t" " pshufb %xmm5, %xmm3" + + "\n\t" " add $32, %r11 # next table: *B" + "\n\t" " call .Laes_schedule_transform" + "\n\t" " pxor %xmm0, %xmm3" + "\n\t" " pshufb %xmm5, %xmm3" + + "\n\t" " add $32, %r11 # next table: *D" + "\n\t" " call .Laes_schedule_transform" + "\n\t" " pxor %xmm0, %xmm3" + "\n\t" " pshufb %xmm5, %xmm3" + + "\n\t" " add $32, %r11 # next table: *E" + "\n\t" " call .Laes_schedule_transform" + "\n\t" " pxor %xmm0, %xmm3" + "\n\t" " pshufb %xmm5, %xmm3" + + "\n\t" " movdqa %xmm4, %xmm0 # restore %xmm0" + "\n\t" " add $-16, %rdx" + + "\n\t" ".Laes_schedule_mangle_both:" + "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3" + "\n\t" " add $-16, %r8" + "\n\t" " and $48, %r8" + "\n\t" " movdqa %xmm3, (%rdx)" + "\n\t" " ret" + + "\n\t" "##" + "\n\t" "## .Laes_schedule_mangle_last" + "\n\t" "##" + "\n\t" "## Mangler for last round of key schedule" + "\n\t" "## Mangles %xmm0" + "\n\t" "## when encrypting, outputs out(%xmm0) ^ 63" + "\n\t" "## when decrypting, outputs unskew(%xmm0)" + "\n\t" "##" + "\n\t" "## Always called right before return... 
jumps to cleanup and exits" + "\n\t" "##" + "\n\t" ".Laes_schedule_mangle_last:" + "\n\t" " # schedule last round key from xmm0" + "\n\t" " lea .Lk_deskew(%r10),%r11 # prepare to deskew" + "\n\t" " test %rcx, %rcx" + "\n\t" " jnz .Laes_schedule_mangle_last_dec" + + "\n\t" " # encrypting" + "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute" + "\n\t" " lea .Lk_opt(%r10), %r11 # prepare to output transform" + "\n\t" " add $32, %rdx" + + "\n\t" ".Laes_schedule_mangle_last_dec:" + "\n\t" " add $-16, %rdx" + "\n\t" " pxor .Lk_s63(%r10), %xmm0" + "\n\t" " call .Laes_schedule_transform # output transform" + "\n\t" " movdqa %xmm0, (%rdx) # save last key" + + "\n\t" " #_aes_cleanup" + "\n\t" " pxor %xmm0, %xmm0" + "\n\t" " pxor %xmm1, %xmm1" + "\n\t" " pxor %xmm2, %xmm2" + "\n\t" " pxor %xmm3, %xmm3" + "\n\t" " pxor %xmm4, %xmm4" + "\n\t" " pxor %xmm5, %xmm5" + "\n\t" " pxor %xmm6, %xmm6" + "\n\t" " pxor %xmm7, %xmm7" + "\n\t" " pxor %xmm8, %xmm8" + "\n\t" " ret" + "\n\t" ".size _aes_schedule_core,.-_aes_schedule_core" + + "\n\t" "########################################################" + "\n\t" "## ##" + "\n\t" "## Constants ##" + "\n\t" "## ##" + "\n\t" "########################################################" + + "\n\t" ".align 16" + "\n\t" ".type _aes_consts,@object" + "\n\t" ".Laes_consts:" + "\n\t" "_aes_consts:" + "\n\t" " # s0F" + "\n\t" " .Lk_s0F = .-.Laes_consts" + "\n\t" " .quad 0x0F0F0F0F0F0F0F0F" + "\n\t" " .quad 0x0F0F0F0F0F0F0F0F" + + "\n\t" " # input transform (lo, hi)" + "\n\t" " .Lk_ipt = .-.Laes_consts" + "\n\t" " .quad 0xC2B2E8985A2A7000" + "\n\t" " .quad 0xCABAE09052227808" + "\n\t" " .quad 0x4C01307D317C4D00" + "\n\t" " .quad 0xCD80B1FCB0FDCC81" + + "\n\t" " # inv, inva" + "\n\t" " .Lk_inv = .-.Laes_consts" + "\n\t" " .quad 0x0E05060F0D080180" + "\n\t" " .quad 0x040703090A0B0C02" + "\n\t" " .quad 0x01040A060F0B0780" + "\n\t" " .quad 0x030D0E0C02050809" + + "\n\t" " # sb1u, sb1t" + "\n\t" " .Lk_sb1 = .-.Laes_consts" + "\n\t" " .quad 0xB19BE18FCB503E00" + "\n\t" " .quad 0xA5DF7A6E142AF544" + "\n\t" " .quad 0x3618D415FAE22300" + "\n\t" " .quad 0x3BF7CCC10D2ED9EF" + + + "\n\t" " # sb2u, sb2t" + "\n\t" " .Lk_sb2 = .-.Laes_consts" + "\n\t" " .quad 0xE27A93C60B712400" + "\n\t" " .quad 0x5EB7E955BC982FCD" + "\n\t" " .quad 0x69EB88400AE12900" + "\n\t" " .quad 0xC2A163C8AB82234A" + + "\n\t" " # sbou, sbot" + "\n\t" " .Lk_sbo = .-.Laes_consts" + "\n\t" " .quad 0xD0D26D176FBDC700" + "\n\t" " .quad 0x15AABF7AC502A878" + "\n\t" " .quad 0xCFE474A55FBB6A00" + "\n\t" " .quad 0x8E1E90D1412B35FA" + + "\n\t" " # mc_forward" + "\n\t" " .Lk_mc_forward = .-.Laes_consts" + "\n\t" " .quad 0x0407060500030201" + "\n\t" " .quad 0x0C0F0E0D080B0A09" + "\n\t" " .quad 0x080B0A0904070605" + "\n\t" " .quad 0x000302010C0F0E0D" + "\n\t" " .quad 0x0C0F0E0D080B0A09" + "\n\t" " .quad 0x0407060500030201" + "\n\t" " .quad 0x000302010C0F0E0D" + "\n\t" " .quad 0x080B0A0904070605" + + "\n\t" " # mc_backward" + "\n\t" " .Lk_mc_backward = .-.Laes_consts" + "\n\t" " .quad 0x0605040702010003" + "\n\t" " .quad 0x0E0D0C0F0A09080B" + "\n\t" " .quad 0x020100030E0D0C0F" + "\n\t" " .quad 0x0A09080B06050407" + "\n\t" " .quad 0x0E0D0C0F0A09080B" + "\n\t" " .quad 0x0605040702010003" + "\n\t" " .quad 0x0A09080B06050407" + "\n\t" " .quad 0x020100030E0D0C0F" + + "\n\t" " # sr" + "\n\t" " .Lk_sr = .-.Laes_consts" + "\n\t" " .quad 0x0706050403020100" + "\n\t" " .quad 0x0F0E0D0C0B0A0908" + "\n\t" " .quad 0x030E09040F0A0500" + "\n\t" " .quad 0x0B06010C07020D08" + "\n\t" " .quad 0x0F060D040B020900" + "\n\t" " .quad 
0x070E050C030A0108" + "\n\t" " .quad 0x0B0E0104070A0D00" + "\n\t" " .quad 0x0306090C0F020508" + + "\n\t" " # rcon" + "\n\t" " .Lk_rcon = .-.Laes_consts" + "\n\t" " .quad 0x1F8391B9AF9DEEB6" + "\n\t" " .quad 0x702A98084D7C7D81" + + "\n\t" " # s63: all equal to 0x63 transformed" + "\n\t" " .Lk_s63 = .-.Laes_consts" + "\n\t" " .quad 0x5B5B5B5B5B5B5B5B" + "\n\t" " .quad 0x5B5B5B5B5B5B5B5B" + + "\n\t" " # output transform" + "\n\t" " .Lk_opt = .-.Laes_consts" + "\n\t" " .quad 0xFF9F4929D6B66000" + "\n\t" " .quad 0xF7974121DEBE6808" + "\n\t" " .quad 0x01EDBD5150BCEC00" + "\n\t" " .quad 0xE10D5DB1B05C0CE0" + + "\n\t" " # deskew tables: inverts the sbox's 'skew'" + "\n\t" " .Lk_deskew = .-.Laes_consts" + "\n\t" " .quad 0x07E4A34047A4E300" + "\n\t" " .quad 0x1DFEB95A5DBEF91A" + "\n\t" " .quad 0x5F36B5DC83EA6900" + "\n\t" " .quad 0x2841C2ABF49D1E77" + + "\n\t" "##" + "\n\t" "## Decryption stuff" + "\n\t" "## Key schedule constants" + "\n\t" "##" + "\n\t" " # decryption key schedule: x -> invskew x*9" + "\n\t" " .Lk_dks_1 = .-.Laes_consts" + "\n\t" " .quad 0xB6116FC87ED9A700" + "\n\t" " .quad 0x4AED933482255BFC" + "\n\t" " .quad 0x4576516227143300" + "\n\t" " .quad 0x8BB89FACE9DAFDCE" + + "\n\t" " # decryption key schedule: invskew x*9 -> invskew x*D" + "\n\t" " .Lk_dks_2 = .-.Laes_consts" + "\n\t" " .quad 0x27438FEBCCA86400" + "\n\t" " .quad 0x4622EE8AADC90561" + "\n\t" " .quad 0x815C13CE4F92DD00" + "\n\t" " .quad 0x73AEE13CBD602FF2" + + "\n\t" " # decryption key schedule: invskew x*D -> invskew x*B" + "\n\t" " .Lk_dks_3 = .-.Laes_consts" + "\n\t" " .quad 0x03C4C50201C6C700" + "\n\t" " .quad 0xF83F3EF9FA3D3CFB" + "\n\t" " .quad 0xEE1921D638CFF700" + "\n\t" " .quad 0xA5526A9D7384BC4B" + + "\n\t" " # decryption key schedule: invskew x*B -> invskew x*E + 0x63" + "\n\t" " .Lk_dks_4 = .-.Laes_consts" + "\n\t" " .quad 0xE3C390B053732000" + "\n\t" " .quad 0xA080D3F310306343" + "\n\t" " .quad 0xA0CA214B036982E8" + "\n\t" " .quad 0x2F45AEC48CE60D67" + + "\n\t" "##" + "\n\t" "## Decryption stuff" + "\n\t" "## Round function constants" + "\n\t" "##" + "\n\t" " # decryption input transform" + "\n\t" " .Lk_dipt = .-.Laes_consts" + "\n\t" " .quad 0x0F505B040B545F00" + "\n\t" " .quad 0x154A411E114E451A" + "\n\t" " .quad 0x86E383E660056500" + "\n\t" " .quad 0x12771772F491F194" + + "\n\t" " # decryption sbox output *9*u, *9*t" + "\n\t" " .Lk_dsb9 = .-.Laes_consts" + "\n\t" " .quad 0x851C03539A86D600" + "\n\t" " .quad 0xCAD51F504F994CC9" + "\n\t" " .quad 0xC03B1789ECD74900" + "\n\t" " .quad 0x725E2C9EB2FBA565" + + "\n\t" " # decryption sbox output *D*u, *D*t" + "\n\t" " .Lk_dsbd = .-.Laes_consts" + "\n\t" " .quad 0x7D57CCDFE6B1A200" + "\n\t" " .quad 0xF56E9B13882A4439" + "\n\t" " .quad 0x3CE2FAF724C6CB00" + "\n\t" " .quad 0x2931180D15DEEFD3" + + "\n\t" " # decryption sbox output *B*u, *B*t" + "\n\t" " .Lk_dsbb = .-.Laes_consts" + "\n\t" " .quad 0xD022649296B44200" + "\n\t" " .quad 0x602646F6B0F2D404" + "\n\t" " .quad 0xC19498A6CD596700" + "\n\t" " .quad 0xF3FF0C3E3255AA6B" + + "\n\t" " # decryption sbox output *E*u, *E*t" + "\n\t" " .Lk_dsbe = .-.Laes_consts" + "\n\t" " .quad 0x46F2929626D4D000" + "\n\t" " .quad 0x2242600464B4F6B0" + "\n\t" " .quad 0x0C55A6CDFFAAC100" + "\n\t" " .quad 0x9467F36B98593E32" + + "\n\t" " # decryption sbox final output" + "\n\t" " .Lk_dsbo = .-.Laes_consts" + "\n\t" " .quad 0x1387EA537EF94000" + "\n\t" " .quad 0xC7AA6DB9D4943E2D" + "\n\t" " .quad 0x12D7560F93441D00" + "\n\t" " .quad 0xCA4B8159D8C58E9C" + "\n\t" ".size _aes_consts,.-_aes_consts" +); + +#endif /* USE_SSSE3 */ diff --git 
a/cipher/rijndael.c b/cipher/rijndael.c index 7a83718b..51c36c76 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -99,6 +99,40 @@ extern void _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv, size_t nblocks); #endif +#ifdef USE_SSSE3 +/* SSSE3 (AMD64) vector permutation implementation of AES */ +extern void _gcry_aes_ssse3_do_setkey(RIJNDAEL_context *ctx, const byte *key); +extern void _gcry_aes_ssse3_prepare_decryption(RIJNDAEL_context *ctx); + +extern unsigned int _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, + unsigned char *dst, + const unsigned char *src); +extern unsigned int _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, + unsigned char *dst, + const unsigned char *src); +extern void _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *iv, size_t nblocks); +extern void _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *iv, size_t nblocks, + int cbc_mac); +extern void _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *ctr, size_t nblocks); +extern void _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *iv, size_t nblocks); +extern void _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *iv, size_t nblocks); +#endif + #ifdef USE_PADLOCK extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, @@ -182,7 +216,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) int rounds; int i,j, r, t, rconpointer = 0; int KC; -#if defined(USE_AESNI) || defined(USE_PADLOCK) +#if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) unsigned int hwfeatures; #endif @@ -223,7 +257,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) ctx->rounds = rounds; -#if defined(USE_AESNI) || defined(USE_PADLOCK) +#if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) hwfeatures = _gcry_get_hw_features (); #endif @@ -234,6 +268,9 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) #ifdef USE_AESNI ctx->use_aesni = 0; #endif +#ifdef USE_SSSE3 + ctx->use_ssse3 = 0; +#endif if (0) { @@ -259,6 +296,16 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) ctx->use_padlock = 1; memcpy (ctx->padlockkey, key, keylen); } +#endif +#ifdef USE_SSSE3 + else if (hwfeatures & HWF_INTEL_SSSE3) + { + ctx->encrypt_fn = _gcry_aes_ssse3_encrypt; + ctx->decrypt_fn = _gcry_aes_ssse3_decrypt; + ctx->prefetch_enc_fn = NULL; + ctx->prefetch_dec_fn = NULL; + ctx->use_ssse3 = 1; + } #endif else { @@ -277,6 +324,10 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) #ifdef USE_AESNI else if (ctx->use_aesni) _gcry_aes_aesni_do_setkey (ctx, key); +#endif +#ifdef USE_AESNI + else if (ctx->use_ssse3) + _gcry_aes_ssse3_do_setkey (ctx, key); #endif else { @@ -403,6 +454,12 @@ prepare_decryption( RIJNDAEL_context *ctx ) _gcry_aes_aesni_prepare_decryption (ctx); } #endif /*USE_AESNI*/ +#ifdef USE_SSSE3 + else if (ctx->use_ssse3) + { + _gcry_aes_ssse3_prepare_decryption (ctx); + } +#endif /*USE_SSSE3*/ #ifdef USE_PADLOCK else if (ctx->use_padlock) { @@ -650,6 +707,13 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv, burn_depth = 0; } #endif /*USE_AESNI*/ +#ifdef USE_SSSE3 + else if 
(ctx->use_ssse3) + { + _gcry_aes_ssse3_cfb_enc (ctx, outbuf, inbuf, iv, nblocks); + burn_depth = 0; + } +#endif /*USE_SSSE3*/ else { rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; @@ -697,6 +761,13 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv, burn_depth = 0; } #endif /*USE_AESNI*/ +#ifdef USE_SSSE3 + else if (ctx->use_ssse3) + { + _gcry_aes_ssse3_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac); + burn_depth = 0; + } +#endif /*USE_SSSE3*/ else { rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; @@ -752,6 +823,13 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr, burn_depth = 0; } #endif /*USE_AESNI*/ +#ifdef USE_SSSE3 + else if (ctx->use_ssse3) + { + _gcry_aes_ssse3_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks); + burn_depth = 0; + } +#endif /*USE_SSSE3*/ else { union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } tmp; @@ -986,6 +1064,13 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv, burn_depth = 0; } #endif /*USE_AESNI*/ +#ifdef USE_SSSE3 + else if (ctx->use_ssse3) + { + _gcry_aes_ssse3_cfb_dec (ctx, outbuf, inbuf, iv, nblocks); + burn_depth = 0; + } +#endif /*USE_SSSE3*/ else { rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn; @@ -1032,6 +1117,13 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv, burn_depth = 0; } #endif /*USE_AESNI*/ +#ifdef USE_SSSE3 + else if (ctx->use_ssse3) + { + _gcry_aes_ssse3_cbc_dec (ctx, outbuf, inbuf, iv, nblocks); + burn_depth = 0; + } +#endif /*USE_SSSE3*/ else { unsigned char savebuf[BLOCKSIZE] ATTR_ALIGNED_16; diff --git a/configure.ac b/configure.ac index a4ea990d..71c50c06 100644 --- a/configure.ac +++ b/configure.ac @@ -1692,6 +1692,9 @@ if test "$found" = "1" ; then x86_64-*-*) # Build with the assembly implementation GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-amd64.lo" + + # Build with the SSSE3 implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation -- cgit v1.2.1
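
A few illustrative notes on the patch follow, as plain C sketches; none of
the code below is part of the patch itself.

The vector permutation ("vpaes") technique avoids data-dependent table
lookups by splitting every state byte into its low and high nibble
(pand / psrld $4 against the 0x0F mask kept in %xmm9) and running each
nibble through a 16-entry table with pshufb, then combining the partial
results with pxor.  A scalar illustration of that pattern, with
placeholder tables instead of the patch's .Lk_ipt/.Lk_sb1 constants:

#include <stdint.h>

/* Scalar illustration of the nibble-split lookup that _aes_encrypt_core
 * performs with pand/psrld/pshufb.  The two 16-entry tables here are
 * placeholders, not the real transform constants from .Laes_consts.  */
static uint8_t
nibble_lookup (uint8_t x, const uint8_t lo_tab[16], const uint8_t hi_tab[16])
{
  uint8_t lo = x & 0x0f;            /* pand  %xmm9, %xmm0            */
  uint8_t hi = (x >> 4) & 0x0f;     /* pandn %xmm0, %xmm1 ; psrld $4 */
  return lo_tab[lo] ^ hi_tab[hi];   /* two pshufb results, pxor      */
}

Because the same 16-byte tables are applied to all state bytes at once and
no lookup index ever becomes a memory address, the code runs in constant
time, which is what the "Constant-time SSSE3 AES core implementation"
comment in the asm refers to.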
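
_gcry_aes_ssse3_do_setkey and _gcry_aes_ssse3_prepare_decryption recover
the key length from the round count with "(ctx->rounds - 10) * 32 + 128".
Spelled out as a helper (the function name is hypothetical; the patch
computes this inline):

/* AES-128 uses 10 rounds, AES-192 uses 12 and AES-256 uses 14, so this
 * maps 10 -> 128, 12 -> 192 and 14 -> 256 key bits.  */
static unsigned int
rounds_to_keybits (unsigned int rounds)
{
  return (rounds - 10) * 32 + 128;
}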
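
_gcry_aes_ssse3_ctr_enc increments the counter inside %xmm7 but also keeps
a byte-swapped copy of its low 64 bits in a general register; the incq/jnz
pair skips the extra psubq unless the low half wraps, so the carry into
the high 64 bits is handled without leaving the registers.  The same
update in portable C, assuming a 16-byte big-endian counter block:

#include <stdint.h>

/* Illustrative sketch of the big-endian counter increment done in the
 * CTR loop, including the 64-bit carry case the asm handles with
 * "incq ... jnz .Lno_carry".  */
static void
ctr_increment_be64 (unsigned char ctr[16])
{
  uint64_t hi = 0, lo = 0;
  int i;

  for (i = 0; i < 8; i++)       /* load both halves as big-endian words */
    {
      hi = (hi << 8) | ctr[i];
      lo = (lo << 8) | ctr[i + 8];
    }
  lo++;                         /* "incq %q[ctrlow]"                    */
  if (lo == 0)                  /* low half wrapped: carry into high    */
    hi++;
  for (i = 7; i >= 0; i--)      /* store back big-endian                */
    {
      ctr[i] = hi & 0xff;       hi >>= 8;
      ctr[i + 8] = lo & 0xff;   lo >>= 8;
    }
}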
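
_gcry_aes_ssse3_cbc_enc keeps the running IV in %xmm7 and, when cbc_mac is
non-zero, simply does not advance outbuf, so successive ciphertext blocks
overwrite each other and only the final block (the CBC-MAC value) is left
in the output buffer.  The same control flow in portable C, with a
hypothetical block_encrypt callback standing in for do_vpaes_ssse3_enc:

#include <stddef.h>
#include <string.h>

#define BLOCKSIZE 16

/* Illustrative sketch of the CBC / CBC-MAC loop structure; not the
 * patch's register-based implementation.  */
static void
cbc_enc_sketch (void (*block_encrypt) (unsigned char block[BLOCKSIZE]),
                unsigned char *outbuf, const unsigned char *inbuf,
                unsigned char iv[BLOCKSIZE], size_t nblocks, int cbc_mac)
{
  unsigned char block[BLOCKSIZE];

  for (; nblocks; nblocks--)
    {
      size_t i;

      for (i = 0; i < BLOCKSIZE; i++)    /* pxor %%xmm7, %%xmm0          */
        block[i] = inbuf[i] ^ iv[i];
      block_encrypt (block);             /* do_vpaes_ssse3_enc           */
      memcpy (iv, block, BLOCKSIZE);     /* movdqa %%xmm0, %%xmm7        */
      memcpy (outbuf, block, BLOCKSIZE); /* movdqu %%xmm0, [outbuf]      */

      inbuf += BLOCKSIZE;
      if (!cbc_mac)                      /* CBC-MAC: keep overwriting    */
        outbuf += BLOCKSIZE;
    }
}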
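
Both _gcry_aes_ssse3_cfb_enc and _gcry_aes_ssse3_cfb_dec call the
encryption core (vpaes_ssse3_prepare_enc / do_vpaes_ssse3_enc): in CFB
mode the block cipher is only ever used to turn the previous ciphertext
block into a keystream block, so decryption never needs the inverse
cipher.  A sketch of both directions, reusing BLOCKSIZE and the callback
convention from the CBC sketch above:

#include <stddef.h>

/* Illustrative sketch of full-block CFB; the keystream is always
 * E(previous ciphertext), only the value fed back differs between
 * encryption and decryption.  */
static void
cfb_crypt_sketch (void (*block_encrypt) (unsigned char block[BLOCKSIZE]),
                  unsigned char *outbuf, const unsigned char *inbuf,
                  unsigned char iv[BLOCKSIZE], size_t nblocks, int decrypt)
{
  size_t i;

  for (; nblocks; nblocks--)
    {
      block_encrypt (iv);                /* keystream = E(iv), in place  */
      for (i = 0; i < BLOCKSIZE; i++)
        {
          unsigned char c = inbuf[i];
          outbuf[i] = iv[i] ^ c;         /* pxor with the data block     */
          iv[i] = decrypt ? c : outbuf[i]; /* feed back ciphertext block */
        }
      inbuf += BLOCKSIZE;
      outbuf += BLOCKSIZE;
    }
}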
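
The new code is selected inside do_setkey whenever _gcry_get_hw_features()
reports HWF_INTEL_SSSE3 and no AES-NI or Padlock path takes precedence, so
callers exercise it through the ordinary libgcrypt cipher API.  A minimal
caller, assuming a standard libgcrypt installation (key and counter values
here are arbitrary test data):

#include <stdio.h>
#include <gcrypt.h>

/* Drives AES-128-CTR through the public API; on an SSSE3-capable x86_64
 * build without AES-NI this ends up in the new _gcry_aes_ssse3_* code.  */
int
main (void)
{
  gcry_cipher_hd_t hd;
  unsigned char key[16] = { 0 };
  unsigned char ctr[16] = { 0 };
  unsigned char buf[64] = { 0 };

  if (!gcry_check_version (NULL))
    return 1;
  gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

  if (gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_CTR, 0)
      || gcry_cipher_setkey (hd, key, sizeof key)
      || gcry_cipher_setctr (hd, ctr, sizeof ctr)
      || gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0))  /* in place */
    {
      fprintf (stderr, "cipher operation failed\n");
      return 1;
    }

  gcry_cipher_close (hd);
  return 0;
}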