-rw-r--r--   cipher/Makefile.am                 |   4
-rw-r--r--   cipher/cipher-gcm-intel-pclmul.c   | 395
-rw-r--r--   cipher/cipher-gcm.c                | 395
-rw-r--r--   cipher/cipher-internal.h           |  13
4 files changed, 430 insertions(+), 377 deletions(-)
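In short, the patch moves the Intel PCLMUL accelerated GHASH code out of cipher/cipher-gcm.c into the new cipher/cipher-gcm-intel-pclmul.c (adding the file to cipher/Makefile.am), and it replaces the per-call use_intel_pclmul bit-field test with a ghash_fn_t function pointer that setupM() stores in the cipher handle. The stand-alone sketch below shows only that dispatch pattern; the struct, stub bodies and helper names are simplified stand-ins, with just the ghash_fn_t typedef mirroring the one added to cipher-internal.h.

    #include <stddef.h>

    typedef unsigned char byte;

    struct gcm_ctx;   /* simplified stand-in for gcry_cipher_hd_t */
    typedef unsigned int (*ghash_fn_t) (struct gcm_ctx *c, byte *result,
                                        const byte *buf, size_t nblocks);

    struct gcm_ctx
    {
      ghash_fn_t ghash_fn;   /* implementation selected once at key setup */
    };

    /* Stubs standing in for the table-driven and PCLMUL implementations. */
    static unsigned int
    ghash_internal (struct gcm_ctx *c, byte *result, const byte *buf,
                    size_t nblocks)
    {
      (void)c; (void)result; (void)buf; (void)nblocks;
      return 0;
    }

    static unsigned int
    ghash_pclmul (struct gcm_ctx *c, byte *result, const byte *buf,
                  size_t nblocks)
    {
      (void)c; (void)result; (void)buf; (void)nblocks;
      return 0;
    }

    static int
    have_pclmul (void)
    {
      /* stand-in for the _gcry_get_hw_features() & HWF_INTEL_PCLMUL test */
      return 0;
    }

    static void
    setup_ghash (struct gcm_ctx *c)
    {
      if (have_pclmul ())
        c->ghash_fn = ghash_pclmul;     /* _gcry_ghash_intel_pclmul in the patch */
      else
        c->ghash_fn = ghash_internal;   /* table-based fallback */
    }

    /* do_ghash_buf() then simply calls:
         burn = c->ghash_fn (c, hash, buf, nblocks);  */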
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index d7e77736..98142ed8 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -40,8 +40,8 @@ libcipher_la_LIBADD = $(GCRYPT_MODULES) libcipher_la_SOURCES = \ cipher.c cipher-internal.h \ cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \ -cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-poly1305.c \ -cipher-selftest.c cipher-selftest.h \ +cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \ +cipher-poly1305.c cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ mac.c mac-internal.h \ diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c new file mode 100644 index 00000000..02e77016 --- /dev/null +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -0,0 +1,395 @@ +/* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode + * implementation + * Copyright (C) 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <config.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "./cipher-internal.h" + + +#ifdef GCM_USE_INTEL_PCLMUL + +/* + Intel PCLMUL ghash based on white paper: + "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the + GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. + */ +static inline void gfmul_pclmul(void) +{ + /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. + Input must be converted to little-endian. + */ + asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. 
*/ + "pshufd $78, %%xmm0, %%xmm2\n\t" + "pshufd $78, %%xmm1, %%xmm4\n\t" + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */ + + "movdqa %%xmm0, %%xmm3\n\t" + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */ + "movdqa %%xmm0, %%xmm6\n\t" + "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */ + "movdqa %%xmm3, %%xmm5\n\t" + "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ + "movdqa %%xmm4, %%xmm5\n\t" + "psrldq $8, %%xmm4\n\t" + "pslldq $8, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the + carry-less multiplication of xmm0 + by xmm1 */ + + /* shift the result by one bit position to the left cope for + the fact that bits are reversed */ + "movdqa %%xmm3, %%xmm4\n\t" + "movdqa %%xmm6, %%xmm5\n\t" + "pslld $1, %%xmm3\n\t" + "pslld $1, %%xmm6\n\t" + "psrld $31, %%xmm4\n\t" + "psrld $31, %%xmm5\n\t" + "movdqa %%xmm4, %%xmm1\n\t" + "pslldq $4, %%xmm5\n\t" + "pslldq $4, %%xmm4\n\t" + "psrldq $12, %%xmm1\n\t" + "por %%xmm4, %%xmm3\n\t" + "por %%xmm5, %%xmm6\n\t" + "por %%xmm6, %%xmm1\n\t" + + /* first phase of the reduction */ + "movdqa %%xmm3, %%xmm6\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ + "movdqa %%xmm3, %%xmm5\n\t" + "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ + "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ + "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ + "pxor %%xmm5, %%xmm6\n\t" + "movdqa %%xmm6, %%xmm7\n\t" + "pslldq $12, %%xmm6\n\t" + "psrldq $4, %%xmm7\n\t" + "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction + complete */ + + /* second phase of the reduction */ + "movdqa %%xmm3, %%xmm2\n\t" + "movdqa %%xmm3, %%xmm4\n\t" + "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ + "movdqa %%xmm3, %%xmm5\n\t" + "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ + "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ + "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ + "pxor %%xmm5, %%xmm2\n\t" + "pxor %%xmm7, %%xmm2\n\t" + "pxor %%xmm2, %%xmm3\n\t" + "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ + ::: "cc" ); +} + + +#ifdef __x86_64__ +static inline void gfmul_pclmul_aggr4(void) +{ + /* Input: + H¹: XMM0 X_i : XMM6 + H²: XMM8 X_(i-1) : XMM3 + H³: XMM9 X_(i-2) : XMM2 + H⁴: XMM10 X_(i-3)⊕Y_(i-4): XMM1 + Output: + Y_i: XMM1 + Inputs XMM0 stays unmodified. + Input must be converted to little-endian. + */ + asm volatile (/* perform clmul and merge results... 
*/ + "pshufd $78, %%xmm10, %%xmm11\n\t" + "pshufd $78, %%xmm1, %%xmm12\n\t" + "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ + "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */ + + "pshufd $78, %%xmm9, %%xmm13\n\t" + "pshufd $78, %%xmm2, %%xmm14\n\t" + "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */ + + "pshufd $78, %%xmm8, %%xmm5\n\t" + "pshufd $78, %%xmm3, %%xmm15\n\t" + "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */ + "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */ + + "movdqa %%xmm10, %%xmm4\n\t" + "movdqa %%xmm9, %%xmm7\n\t" + "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ + "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */ + "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ + "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ + "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm0, %%xmm10\n\t" + "pshufd $78, %%xmm6, %%xmm11\n\t" + "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */ + "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */ + + "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ + "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ + + "movdqa %%xmm8, %%xmm13\n\t" + "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */ + "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */ + + "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ + "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ + "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */ + + "movdqa %%xmm0, %%xmm3\n\t" + "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */ + "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */ + "movdqa %%xmm11, %%xmm4\n\t" + "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ + + "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ + "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */ + "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ + + /* aggregated reduction... 
*/ + "movdqa %%xmm3, %%xmm5\n\t" + "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ + "movdqa %%xmm4, %%xmm5\n\t" + "psrldq $8, %%xmm4\n\t" + "pslldq $8, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the + carry-less multiplication of xmm0 + by xmm1 */ + + /* shift the result by one bit position to the left cope for + the fact that bits are reversed */ + "movdqa %%xmm3, %%xmm4\n\t" + "movdqa %%xmm6, %%xmm5\n\t" + "pslld $1, %%xmm3\n\t" + "pslld $1, %%xmm6\n\t" + "psrld $31, %%xmm4\n\t" + "psrld $31, %%xmm5\n\t" + "movdqa %%xmm4, %%xmm1\n\t" + "pslldq $4, %%xmm5\n\t" + "pslldq $4, %%xmm4\n\t" + "psrldq $12, %%xmm1\n\t" + "por %%xmm4, %%xmm3\n\t" + "por %%xmm5, %%xmm6\n\t" + "por %%xmm6, %%xmm1\n\t" + + /* first phase of the reduction */ + "movdqa %%xmm3, %%xmm6\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ + "movdqa %%xmm3, %%xmm5\n\t" + "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ + "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ + "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ + "pxor %%xmm5, %%xmm6\n\t" + "movdqa %%xmm6, %%xmm7\n\t" + "pslldq $12, %%xmm6\n\t" + "psrldq $4, %%xmm7\n\t" + "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction + complete */ + + /* second phase of the reduction */ + "movdqa %%xmm3, %%xmm2\n\t" + "movdqa %%xmm3, %%xmm4\n\t" + "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ + "movdqa %%xmm3, %%xmm5\n\t" + "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ + "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ + "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ + "pxor %%xmm5, %%xmm2\n\t" + "pxor %%xmm7, %%xmm2\n\t" + "pxor %%xmm2, %%xmm3\n\t" + "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ + :::"cc"); +} +#endif + + +void +_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, byte *h) +{ + u64 tmp[2]; + + /* Swap endianness of hsub. */ + tmp[0] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 8); + tmp[1] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 0); + buf_cpy (c->u_mode.gcm.u_ghash_key.key, tmp, GCRY_GCM_BLOCK_LEN); + +#ifdef __x86_64__ + asm volatile ("movdqu %[h_1], %%xmm0\n\t" + "movdqa %%xmm0, %%xmm1\n\t" + : + : [h_1] "m" (*tmp)); + + gfmul_pclmul (); /* H•H => H² */ + + asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t" + "movdqa %%xmm1, %%xmm8\n\t" + : + : [h_234] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H•H² => H³ */ + + asm volatile ("movdqa %%xmm8, %%xmm0\n\t" + "movdqu %%xmm1, 1*16(%[h_234])\n\t" + "movdqa %%xmm8, %%xmm1\n\t" + : + : [h_234] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H²•H² => H⁴ */ + + asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t" + : + : [h_234] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + /* Clear used registers. 
*/ + asm volatile( "pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + "pxor %%xmm8, %%xmm8\n\t" + ::: "cc" ); +#endif + + wipememory (tmp, sizeof(tmp)); +} + + +unsigned int +_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; + + if (nblocks == 0) + return 0; + + /* Preload hash and H1. */ + asm volatile ("movdqu %[hash], %%xmm1\n\t" + "movdqa %[hsub], %%xmm0\n\t" + "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + : + : [hash] "m" (*result), [be_mask] "m" (*be_mask), + [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key)); + +#ifdef __x86_64__ + if (nblocks >= 4) + { + do + { + asm volatile ("movdqa %[be_mask], %%xmm4\n\t" + "movdqu 0*16(%[buf]), %%xmm5\n\t" + "movdqu 1*16(%[buf]), %%xmm2\n\t" + "movdqu 2*16(%[buf]), %%xmm3\n\t" + "movdqu 3*16(%[buf]), %%xmm6\n\t" + "pshufb %%xmm4, %%xmm5\n\t" /* be => le */ + + /* Load H2, H3, H4. */ + "movdqu 2*16(%[h_234]), %%xmm10\n\t" + "movdqu 1*16(%[h_234]), %%xmm9\n\t" + "movdqu 0*16(%[h_234]), %%xmm8\n\t" + + "pxor %%xmm5, %%xmm1\n\t" + "pshufb %%xmm4, %%xmm2\n\t" /* be => le */ + "pshufb %%xmm4, %%xmm3\n\t" /* be => le */ + "pshufb %%xmm4, %%xmm6\n\t" /* be => le */ + : + : [buf] "r" (buf), [be_mask] "m" (*be_mask), + [h_234] "r" (c->u_mode.gcm.gcm_table)); + + gfmul_pclmul_aggr4 (); + + buf += 4 * blocksize; + nblocks -= 4; + } + while (nblocks >= 4); + + /* Clear used x86-64/XMM registers. */ + asm volatile( "pxor %%xmm8, %%xmm8\n\t" + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" + "pxor %%xmm15, %%xmm15\n\t" + ::: "cc" ); + } +#endif + + while (nblocks--) + { + asm volatile ("movdqu %[buf], %%xmm2\n\t" + "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + "pxor %%xmm2, %%xmm1\n\t" + : + : [buf] "m" (*buf), [be_mask] "m" (*be_mask)); + + gfmul_pclmul (); + + buf += blocksize; + } + + /* Store hash. */ + asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + "movdqu %%xmm1, %[hash]\n\t" + : [hash] "=m" (*result) + : [be_mask] "m" (*be_mask)); + + /* Clear used registers. 
*/ + asm volatile( "pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + ::: "cc" ); + + return 0; +} + +#endif /* GCM_USE_INTEL_PCLMUL */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 05347616..f89b81e3 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -29,6 +29,15 @@ #include "bufhelp.h" #include "./cipher-internal.h" + +#ifdef GCM_USE_INTEL_PCLMUL +extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, byte *h); + +extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, + const byte *buf, size_t nblocks); +#endif + + #ifdef GCM_USE_TABLES static const u16 gcmR[256] = { 0x0000, 0x01c2, 0x0384, 0x0246, 0x0708, 0x06ca, 0x048c, 0x054e, @@ -348,325 +357,18 @@ do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf) #endif /* !GCM_USE_TABLES */ -#ifdef GCM_USE_INTEL_PCLMUL -/* - Intel PCLMUL ghash based on white paper: - "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the - GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. - */ -static inline void gfmul_pclmul(void) -{ - /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. - Input must be converted to little-endian. - */ - asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. */ - "pshufd $78, %%xmm0, %%xmm2\n\t" - "pshufd $78, %%xmm1, %%xmm4\n\t" - "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */ - "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */ - - "movdqa %%xmm0, %%xmm3\n\t" - "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */ - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ - "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ - "movdqa %%xmm4, %%xmm5\n\t" - "psrldq $8, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the - carry-less multiplication of xmm0 - by xmm1 */ - - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, 
%%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - ::: "cc" ); -} - -#ifdef __x86_64__ -static inline void gfmul_pclmul_aggr4(void) -{ - /* Input: - H¹: XMM0 X_i : XMM6 - H²: XMM8 X_(i-1) : XMM3 - H³: XMM9 X_(i-2) : XMM2 - H⁴: XMM10 X_(i-3)⊕Y_(i-4): XMM1 - Output: - Y_i: XMM1 - Inputs XMM0 stays unmodified. - Input must be converted to little-endian. - */ - asm volatile (/* perform clmul and merge results... */ - "pshufd $78, %%xmm10, %%xmm11\n\t" - "pshufd $78, %%xmm1, %%xmm12\n\t" - "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ - "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */ - - "pshufd $78, %%xmm9, %%xmm13\n\t" - "pshufd $78, %%xmm2, %%xmm14\n\t" - "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */ - - "pshufd $78, %%xmm8, %%xmm5\n\t" - "pshufd $78, %%xmm3, %%xmm15\n\t" - "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */ - "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */ - - "movdqa %%xmm10, %%xmm4\n\t" - "movdqa %%xmm9, %%xmm7\n\t" - "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ - "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ - "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ - "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm0, %%xmm10\n\t" - "pshufd $78, %%xmm6, %%xmm11\n\t" - "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */ - "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */ - - "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ - "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ - - "movdqa %%xmm8, %%xmm13\n\t" - "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */ - - "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ - "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ - "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */ - - "movdqa %%xmm0, %%xmm3\n\t" - "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */ - "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */ - "movdqa %%xmm11, %%xmm4\n\t" - "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ - - "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ - "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */ - "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ - - /* aggregated reduction... 
*/ - "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ - "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ - "movdqa %%xmm4, %%xmm5\n\t" - "psrldq $8, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the - carry-less multiplication of xmm0 - by xmm1 */ - - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - :::"cc"); -} -#endif - -#endif /*GCM_USE_INTEL_PCLMUL*/ - - static unsigned int -ghash (gcry_cipher_hd_t c, byte *result, const byte *buf, - size_t nblocks) +ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) { const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; - unsigned int burn; - - if (nblocks == 0) - return 0; - - if (0) - ; -#ifdef GCM_USE_INTEL_PCLMUL - else if (c->u_mode.gcm.use_intel_pclmul) - { - static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = - { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; - - /* Preload hash and H1. */ - asm volatile ("movdqu %[hash], %%xmm1\n\t" - "movdqa %[hsub], %%xmm0\n\t" - "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ - : - : [hash] "m" (*result), [be_mask] "m" (*be_mask), - [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key)); - -#ifdef __x86_64__ - if (nblocks >= 4) - { - do - { - asm volatile ("movdqa %[be_mask], %%xmm4\n\t" - "movdqu 0*16(%[buf]), %%xmm5\n\t" - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "movdqu 2*16(%[buf]), %%xmm3\n\t" - "movdqu 3*16(%[buf]), %%xmm6\n\t" - "pshufb %%xmm4, %%xmm5\n\t" /* be => le */ - - /* Load H2, H3, H4. 
*/ - "movdqu 2*16(%[h_234]), %%xmm10\n\t" - "movdqu 1*16(%[h_234]), %%xmm9\n\t" - "movdqu 0*16(%[h_234]), %%xmm8\n\t" - - "pxor %%xmm5, %%xmm1\n\t" - "pshufb %%xmm4, %%xmm2\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm3\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm6\n\t" /* be => le */ - : - : [buf] "r" (buf), [be_mask] "m" (*be_mask), - [h_234] "r" (c->u_mode.gcm.gcm_table)); - - gfmul_pclmul_aggr4 (); - - buf += 4 * blocksize; - nblocks -= 4; - } - while (nblocks >= 4); - - /* Clear used x86-64/XMM registers. */ - asm volatile( "pxor %%xmm8, %%xmm8\n\t" - "pxor %%xmm9, %%xmm9\n\t" - "pxor %%xmm10, %%xmm10\n\t" - "pxor %%xmm11, %%xmm11\n\t" - "pxor %%xmm12, %%xmm12\n\t" - "pxor %%xmm13, %%xmm13\n\t" - "pxor %%xmm14, %%xmm14\n\t" - "pxor %%xmm15, %%xmm15\n\t" - ::: "cc" ); - } -#endif - - while (nblocks--) - { - asm volatile ("movdqu %[buf], %%xmm2\n\t" - "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ - "pxor %%xmm2, %%xmm1\n\t" - : - : [buf] "m" (*buf), [be_mask] "m" (*be_mask)); - - gfmul_pclmul (); - - buf += blocksize; - } + unsigned int burn = 0; - /* Store hash. */ - asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ - "movdqu %%xmm1, %[hash]\n\t" - : [hash] "=m" (*result) - : [be_mask] "m" (*be_mask)); - - /* Clear used registers. */ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "pxor %%xmm7, %%xmm7\n\t" - ::: "cc" ); - burn = 0; - } -#endif - else + while (nblocks) { - while (nblocks) - { - burn = GHASH (c, result, buf); - buf += blocksize; - nblocks--; - } + burn = GHASH (c, result, buf); + buf += blocksize; + nblocks--; } return burn + (burn ? 5*sizeof(void*) : 0); @@ -681,63 +383,15 @@ setupM (gcry_cipher_hd_t c, byte *h) #ifdef GCM_USE_INTEL_PCLMUL else if (_gcry_get_hw_features () & HWF_INTEL_PCLMUL) { - u64 tmp[2]; - - c->u_mode.gcm.use_intel_pclmul = 1; - - /* Swap endianness of hsub. */ - tmp[0] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 8); - tmp[1] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 0); - buf_cpy (c->u_mode.gcm.u_ghash_key.key, tmp, GCRY_GCM_BLOCK_LEN); - -#ifdef __x86_64__ - asm volatile ("movdqu %[h_1], %%xmm0\n\t" - "movdqa %%xmm0, %%xmm1\n\t" - : - : [h_1] "m" (*tmp)); - - gfmul_pclmul (); /* H•H => H² */ - - asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t" - "movdqa %%xmm1, %%xmm8\n\t" - : - : [h_234] "r" (c->u_mode.gcm.gcm_table) - : "memory"); - - gfmul_pclmul (); /* H•H² => H³ */ - - asm volatile ("movdqa %%xmm8, %%xmm0\n\t" - "movdqu %%xmm1, 1*16(%[h_234])\n\t" - "movdqa %%xmm8, %%xmm1\n\t" - : - : [h_234] "r" (c->u_mode.gcm.gcm_table) - : "memory"); - - gfmul_pclmul (); /* H²•H² => H⁴ */ - - asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t" - : - : [h_234] "r" (c->u_mode.gcm.gcm_table) - : "memory"); - - /* Clear used registers. 
*/ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "pxor %%xmm7, %%xmm7\n\t" - "pxor %%xmm8, %%xmm8\n\t" - ::: "cc" ); -#endif - - wipememory (tmp, sizeof(tmp)); + c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul; + _gcry_ghash_setup_intel_pclmul(c, h); } #endif else - fillM (c, h); + { + c->u_mode.gcm.ghash_fn = ghash_internal; + fillM (c, h); + } } @@ -810,6 +464,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, { unsigned int blocksize = GCRY_GCM_BLOCK_LEN; unsigned int unused = c->u_mode.gcm.mac_unused; + ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn; size_t nblocks, n; unsigned int burn = 0; @@ -843,7 +498,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, gcry_assert (unused == blocksize); /* Process one block from macbuf. */ - burn = ghash (c, hash, c->u_mode.gcm.macbuf, 1); + burn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1); unused = 0; } @@ -851,7 +506,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, if (nblocks) { - burn = ghash (c, hash, buf, nblocks); + burn = ghash_fn (c, hash, buf, nblocks); buf += blocksize * nblocks; buflen -= blocksize * nblocks; } diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index f6bda668..fef0ecba 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -42,7 +42,7 @@ #define GCM_USE_TABLES 1 -/* GCM_USE_INTEL_PCLMUL inidicates whether to compile GCM with Intel PCLMUL +/* GCM_USE_INTEL_PCLMUL indicates whether to compile GCM with Intel PCLMUL code. */ #undef GCM_USE_INTEL_PCLMUL #if defined(ENABLE_PCLMUL_SUPPORT) && defined(GCM_USE_TABLES) @@ -54,6 +54,10 @@ #endif /* GCM_USE_INTEL_PCLMUL */ +typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result, + const byte *buf, size_t nblocks); + + /* A VIA processor with the Padlock engine as well as the Intel AES_NI instructions require an alignment of most data on a 16 byte boundary. Because we trick out the compiler while allocating the @@ -188,6 +192,7 @@ struct gcry_cipher_handle unsigned char macbuf[GCRY_CCM_BLOCK_LEN]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ + /* byte counters for GCM */ u32 aadlen[2]; u32 datalen[2]; @@ -209,10 +214,8 @@ struct gcry_cipher_handle unsigned char key[MAX_BLOCKSIZE]; } u_ghash_key; -#ifdef GCM_USE_INTEL_PCLMUL - /* Use Intel PCLMUL instructions for accelerated GHASH. */ - unsigned int use_intel_pclmul:1; -#endif + /* GHASH implementation in use. */ + ghash_fn_t ghash_fn; /* Pre-calculated table for GCM. */ #ifdef GCM_USE_TABLES |
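For reference, gfmul_pclmul() computes the 128x128-bit carry-less product with three PCLMULQDQ operations rather than four, using the Karatsuba identity a1·b0 + a0·b1 = (a0+a1)·(b0+b1) + a0·b0 + a1·b1 (addition being XOR in GF(2)), exactly as the register comments (a0*b0, a1*b1, (a0+a1)*(b0+b1)) indicate. A portable sketch of that step, assuming nothing beyond standard C (function names and limb layout are illustrative):

    #include <stdint.h>

    /* Carry-less 64x64 -> 128-bit multiply (what one PCLMULQDQ computes). */
    static void
    clmul64 (uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
      uint64_t h = 0, l = 0;
      int i;

      for (i = 0; i < 64; i++)
        if ((b >> i) & 1)
          {
            l ^= a << i;
            h ^= i ? (a >> (64 - i)) : 0;
          }
      *hi = h;
      *lo = l;
    }

    /* 128x128 -> 256-bit carry-less multiply via Karatsuba; a, b and r are
       little-endian 64-bit limbs (r[0] least significant). */
    static void
    clmul128 (const uint64_t a[2], const uint64_t b[2], uint64_t r[4])
    {
      uint64_t lo_h, lo_l, hi_h, hi_l, mid_h, mid_l;

      clmul64 (a[0], b[0], &lo_h, &lo_l);                   /* a0*b0 */
      clmul64 (a[1], b[1], &hi_h, &hi_l);                   /* a1*b1 */
      clmul64 (a[0] ^ a[1], b[0] ^ b[1], &mid_h, &mid_l);   /* (a0+a1)*(b0+b1) */

      /* Middle term: (a0+a1)*(b0+b1) + a0*b0 + a1*b1 = a1*b0 + a0*b1. */
      mid_l ^= lo_l ^ hi_l;
      mid_h ^= lo_h ^ hi_h;

      r[0] = lo_l;
      r[1] = lo_h ^ mid_l;
      r[2] = hi_l ^ mid_h;
      r[3] = hi_h;
    }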
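After the carry-less product, the assembly shifts the 256-bit result left by one bit position (to cope with GCM's reversed bit order) and reduces it modulo x^128 + x^7 + x^2 + x + 1 in two phases. The net effect is the ordinary GHASH block multiplication from NIST SP 800-38D, which the following bit-at-a-time reference (illustrative only, not libgcrypt API) computes directly and which can be used to cross-check the accelerated path:

    #include <stdint.h>

    /* Reference GF(2^128) multiply as specified in NIST SP 800-38D:
       r = x * y with GCM's bit-reflected representation. */
    static void
    ghash_mul_ref (uint8_t r[16], const uint8_t x[16], const uint8_t y[16])
    {
      uint64_t zh = 0, zl = 0;
      uint64_t vh = 0, vl = 0;
      int i;

      /* Load Y as two big-endian 64-bit halves. */
      for (i = 0; i < 8; i++)
        {
          vh = (vh << 8) | y[i];
          vl = (vl << 8) | y[i + 8];
        }

      for (i = 0; i < 128; i++)
        {
          /* Bit i of X, most-significant bit of byte 0 first. */
          if ((x[i / 8] >> (7 - (i % 8))) & 1)
            {
              zh ^= vh;
              zl ^= vl;
            }

          /* V = V * x mod (x^128 + x^7 + x^2 + x + 1): shift right by one
             and conditionally XOR the reduction constant R = 0xE1 || 0^120. */
          if (vl & 1)
            {
              vl = (vl >> 1) | (vh << 63);
              vh = (vh >> 1) ^ 0xe100000000000000ULL;
            }
          else
            {
              vl = (vl >> 1) | (vh << 63);
              vh >>= 1;
            }
        }

      /* Store Z back in big-endian byte order. */
      for (i = 0; i < 8; i++)
        {
          r[i]     = (uint8_t)(zh >> (56 - 8 * i));
          r[i + 8] = (uint8_t)(zl >> (56 - 8 * i));
        }
    }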