path: root/cipher/cipher-gcm.c
author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2014-12-06 10:38:36 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2014-12-12 23:49:52 +0200
commit    4a0795af021305f9240f23626a3796157db46bd7 (patch)
tree      6444ff3279c9b014bc5b765c75878e4a6d29346f /cipher/cipher-gcm.c
parent    cbf4c8cb6bbda15eea61885279f2a6f1d4bcedfd (diff)
download  libgcrypt-4a0795af021305f9240f23626a3796157db46bd7.tar.gz
GCM: move Intel PCLMUL accelerated implementation to separate file
* cipher/Makefile.am: Add 'cipher-gcm-intel-pclmul.c'.
* cipher/cipher-gcm-intel-pclmul.c: New.
* cipher/cipher-gcm.c [GCM_USE_INTEL_PCLMUL]
(_gcry_ghash_setup_intel_pclmul, _gcry_ghash_intel_pclmul): New
prototypes.
[GCM_USE_INTEL_PCLMUL] (gfmul_pclmul, gfmul_pclmul_aggr4): Move to
'cipher-gcm-intel-pclmul.c'.
(ghash): Rename to...
(ghash_internal): ...this and move GCM_USE_INTEL_PCLMUL part to new
function in 'cipher-gcm-intel-pclmul.c'.
(setupM): Move GCM_USE_INTEL_PCLMUL part to new function in
'cipher-gcm-intel-pclmul.c'; Add selection of ghash function based on
available HW acceleration.
(do_ghash_buf): Change use of 'ghash' to 'c->u_mode.gcm.ghash_fn'.
* cipher/cipher-internal.h (ghash_fn_t): New.
(gcry_cipher_handle): Remove 'use_intel_pclmul'; Add 'ghash_fn'.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
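
The core of the change is the new ghash_fn_t function pointer: instead of checking a use_intel_pclmul flag on every call, setupM() now selects a GHASH implementation once and do_ghash_buf() simply calls through c->u_mode.gcm.ghash_fn. The following is a standalone sketch of that dispatch pattern; the typedef and field names come from this commit, but the context struct and both backends are simplified stand-ins, not the real libgcrypt code.

/* Sketch of the function-pointer dispatch introduced by this commit.
   ghash_fn_t and ghash_fn follow the commit message; everything else
   here is a simplified stand-in for illustration only. */
#include <stddef.h>
#include <stdio.h>

typedef unsigned char byte;
struct gcm_ctx;

/* One signature shared by all GHASH backends (as added to
   cipher-internal.h in this commit). */
typedef unsigned int (*ghash_fn_t) (struct gcm_ctx *c, byte *result,
                                    const byte *buf, size_t nblocks);

struct gcm_ctx
{
  ghash_fn_t ghash_fn;   /* replaces the old use_intel_pclmul flag */
};

static unsigned int
ghash_soft (struct gcm_ctx *c, byte *result, const byte *buf, size_t nblocks)
{
  (void)c; (void)result; (void)buf;
  printf ("software GHASH, %zu block(s)\n", nblocks);
  return 0;
}

static unsigned int
ghash_pclmul (struct gcm_ctx *c, byte *result, const byte *buf, size_t nblocks)
{
  (void)c; (void)result; (void)buf;
  printf ("PCLMUL GHASH, %zu block(s)\n", nblocks);
  return 0;
}

/* Setup-time selection, mirroring the new setupM(): pick the backend
   once; every later caller just uses c->ghash_fn. */
static void
setup_ghash (struct gcm_ctx *c, int have_pclmul)
{
  c->ghash_fn = have_pclmul ? ghash_pclmul : ghash_soft;
}

int
main (void)
{
  struct gcm_ctx c;
  byte hash[16] = { 0 }, buf[32] = { 0 };

  setup_ghash (&c, /* have_pclmul = */ 1);
  c.ghash_fn (&c, hash, buf, 2);   /* as do_ghash_buf() now does */
  return 0;
}
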
Diffstat (limited to 'cipher/cipher-gcm.c')
-rw-r--r--   cipher/cipher-gcm.c   395
1 file changed, 25 insertions, 370 deletions
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 05347616..f89b81e3 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -29,6 +29,15 @@
#include "bufhelp.h"
#include "./cipher-internal.h"
+
+#ifdef GCM_USE_INTEL_PCLMUL
+extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, byte *h);
+
+extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result,
+ const byte *buf, size_t nblocks);
+#endif
+
+
#ifdef GCM_USE_TABLES
static const u16 gcmR[256] = {
0x0000, 0x01c2, 0x0384, 0x0246, 0x0708, 0x06ca, 0x048c, 0x054e,
@@ -348,325 +357,18 @@ do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
#endif /* !GCM_USE_TABLES */
-#ifdef GCM_USE_INTEL_PCLMUL
-/*
- Intel PCLMUL ghash based on white paper:
- "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
- GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
- */
-static inline void gfmul_pclmul(void)
-{
- /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified.
- Input must be converted to little-endian.
- */
- asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. */
- "pshufd $78, %%xmm0, %%xmm2\n\t"
- "pshufd $78, %%xmm1, %%xmm4\n\t"
- "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */
- "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */
-
- "movdqa %%xmm0, %%xmm3\n\t"
- "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */
- "movdqa %%xmm0, %%xmm6\n\t"
- "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */
- "movdqa %%xmm3, %%xmm5\n\t"
- "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */
-
- "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
- "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
- "movdqa %%xmm4, %%xmm5\n\t"
- "psrldq $8, %%xmm4\n\t"
- "pslldq $8, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
- carry-less multiplication of xmm0
- by xmm1 */
-
- /* shift the result by one bit position to the left cope for
- the fact that bits are reversed */
- "movdqa %%xmm3, %%xmm4\n\t"
- "movdqa %%xmm6, %%xmm5\n\t"
- "pslld $1, %%xmm3\n\t"
- "pslld $1, %%xmm6\n\t"
- "psrld $31, %%xmm4\n\t"
- "psrld $31, %%xmm5\n\t"
- "movdqa %%xmm4, %%xmm1\n\t"
- "pslldq $4, %%xmm5\n\t"
- "pslldq $4, %%xmm4\n\t"
- "psrldq $12, %%xmm1\n\t"
- "por %%xmm4, %%xmm3\n\t"
- "por %%xmm5, %%xmm6\n\t"
- "por %%xmm6, %%xmm1\n\t"
-
- /* first phase of the reduction */
- "movdqa %%xmm3, %%xmm6\n\t"
- "movdqa %%xmm3, %%xmm7\n\t"
- "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */
- "movdqa %%xmm3, %%xmm5\n\t"
- "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */
- "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */
- "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
- "pxor %%xmm5, %%xmm6\n\t"
- "movdqa %%xmm6, %%xmm7\n\t"
- "pslldq $12, %%xmm6\n\t"
- "psrldq $4, %%xmm7\n\t"
- "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
- complete */
-
- /* second phase of the reduction */
- "movdqa %%xmm3, %%xmm2\n\t"
- "movdqa %%xmm3, %%xmm4\n\t"
- "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */
- "movdqa %%xmm3, %%xmm5\n\t"
- "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */
- "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */
- "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
- "pxor %%xmm5, %%xmm2\n\t"
- "pxor %%xmm7, %%xmm2\n\t"
- "pxor %%xmm2, %%xmm3\n\t"
- "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
- ::: "cc" );
-}
-
-#ifdef __x86_64__
-static inline void gfmul_pclmul_aggr4(void)
-{
- /* Input:
- H¹: XMM0 X_i : XMM6
- H²: XMM8 X_(i-1) : XMM3
- H³: XMM9 X_(i-2) : XMM2
- H⁴: XMM10 X_(i-3)⊕Y_(i-4): XMM1
- Output:
- Y_i: XMM1
- Inputs XMM0 stays unmodified.
- Input must be converted to little-endian.
- */
- asm volatile (/* perform clmul and merge results... */
- "pshufd $78, %%xmm10, %%xmm11\n\t"
- "pshufd $78, %%xmm1, %%xmm12\n\t"
- "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
- "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */
-
- "pshufd $78, %%xmm9, %%xmm13\n\t"
- "pshufd $78, %%xmm2, %%xmm14\n\t"
- "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */
- "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */
-
- "pshufd $78, %%xmm8, %%xmm5\n\t"
- "pshufd $78, %%xmm3, %%xmm15\n\t"
- "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */
- "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */
-
- "movdqa %%xmm10, %%xmm4\n\t"
- "movdqa %%xmm9, %%xmm7\n\t"
- "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */
- "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */
- "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
- "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */
- "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */
- "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */
-
- "pshufd $78, %%xmm0, %%xmm10\n\t"
- "pshufd $78, %%xmm6, %%xmm11\n\t"
- "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */
- "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */
-
- "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */
- "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */
- "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */
-
- "movdqa %%xmm8, %%xmm13\n\t"
- "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */
- "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */
- "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */
-
- "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */
- "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */
- "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */
-
- "movdqa %%xmm0, %%xmm3\n\t"
- "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */
- "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */
- "movdqa %%xmm11, %%xmm4\n\t"
- "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */
-
- "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */
- "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */
- "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
-
- /* aggregated reduction... */
- "movdqa %%xmm3, %%xmm5\n\t"
- "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
- "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
- "movdqa %%xmm4, %%xmm5\n\t"
- "psrldq $8, %%xmm4\n\t"
- "pslldq $8, %%xmm5\n\t"
- "pxor %%xmm5, %%xmm3\n\t"
- "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
- carry-less multiplication of xmm0
- by xmm1 */
-
- /* shift the result by one bit position to the left cope for
- the fact that bits are reversed */
- "movdqa %%xmm3, %%xmm4\n\t"
- "movdqa %%xmm6, %%xmm5\n\t"
- "pslld $1, %%xmm3\n\t"
- "pslld $1, %%xmm6\n\t"
- "psrld $31, %%xmm4\n\t"
- "psrld $31, %%xmm5\n\t"
- "movdqa %%xmm4, %%xmm1\n\t"
- "pslldq $4, %%xmm5\n\t"
- "pslldq $4, %%xmm4\n\t"
- "psrldq $12, %%xmm1\n\t"
- "por %%xmm4, %%xmm3\n\t"
- "por %%xmm5, %%xmm6\n\t"
- "por %%xmm6, %%xmm1\n\t"
-
- /* first phase of the reduction */
- "movdqa %%xmm3, %%xmm6\n\t"
- "movdqa %%xmm3, %%xmm7\n\t"
- "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */
- "movdqa %%xmm3, %%xmm5\n\t"
- "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */
- "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */
- "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
- "pxor %%xmm5, %%xmm6\n\t"
- "movdqa %%xmm6, %%xmm7\n\t"
- "pslldq $12, %%xmm6\n\t"
- "psrldq $4, %%xmm7\n\t"
- "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
- complete */
-
- /* second phase of the reduction */
- "movdqa %%xmm3, %%xmm2\n\t"
- "movdqa %%xmm3, %%xmm4\n\t"
- "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */
- "movdqa %%xmm3, %%xmm5\n\t"
- "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */
- "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */
- "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
- "pxor %%xmm5, %%xmm2\n\t"
- "pxor %%xmm7, %%xmm2\n\t"
- "pxor %%xmm2, %%xmm3\n\t"
- "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
- :::"cc");
-}
-#endif
-
-#endif /*GCM_USE_INTEL_PCLMUL*/
-
-
static unsigned int
-ghash (gcry_cipher_hd_t c, byte *result, const byte *buf,
- size_t nblocks)
+ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf,
+ size_t nblocks)
{
const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
- unsigned int burn;
-
- if (nblocks == 0)
- return 0;
-
- if (0)
- ;
-#ifdef GCM_USE_INTEL_PCLMUL
- else if (c->u_mode.gcm.use_intel_pclmul)
- {
- static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
- { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-
- /* Preload hash and H1. */
- asm volatile ("movdqu %[hash], %%xmm1\n\t"
- "movdqa %[hsub], %%xmm0\n\t"
- "pshufb %[be_mask], %%xmm1\n\t" /* be => le */
- :
- : [hash] "m" (*result), [be_mask] "m" (*be_mask),
- [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key));
-
-#ifdef __x86_64__
- if (nblocks >= 4)
- {
- do
- {
- asm volatile ("movdqa %[be_mask], %%xmm4\n\t"
- "movdqu 0*16(%[buf]), %%xmm5\n\t"
- "movdqu 1*16(%[buf]), %%xmm2\n\t"
- "movdqu 2*16(%[buf]), %%xmm3\n\t"
- "movdqu 3*16(%[buf]), %%xmm6\n\t"
- "pshufb %%xmm4, %%xmm5\n\t" /* be => le */
-
- /* Load H2, H3, H4. */
- "movdqu 2*16(%[h_234]), %%xmm10\n\t"
- "movdqu 1*16(%[h_234]), %%xmm9\n\t"
- "movdqu 0*16(%[h_234]), %%xmm8\n\t"
-
- "pxor %%xmm5, %%xmm1\n\t"
- "pshufb %%xmm4, %%xmm2\n\t" /* be => le */
- "pshufb %%xmm4, %%xmm3\n\t" /* be => le */
- "pshufb %%xmm4, %%xmm6\n\t" /* be => le */
- :
- : [buf] "r" (buf), [be_mask] "m" (*be_mask),
- [h_234] "r" (c->u_mode.gcm.gcm_table));
-
- gfmul_pclmul_aggr4 ();
-
- buf += 4 * blocksize;
- nblocks -= 4;
- }
- while (nblocks >= 4);
-
- /* Clear used x86-64/XMM registers. */
- asm volatile( "pxor %%xmm8, %%xmm8\n\t"
- "pxor %%xmm9, %%xmm9\n\t"
- "pxor %%xmm10, %%xmm10\n\t"
- "pxor %%xmm11, %%xmm11\n\t"
- "pxor %%xmm12, %%xmm12\n\t"
- "pxor %%xmm13, %%xmm13\n\t"
- "pxor %%xmm14, %%xmm14\n\t"
- "pxor %%xmm15, %%xmm15\n\t"
- ::: "cc" );
- }
-#endif
-
- while (nblocks--)
- {
- asm volatile ("movdqu %[buf], %%xmm2\n\t"
- "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
- "pxor %%xmm2, %%xmm1\n\t"
- :
- : [buf] "m" (*buf), [be_mask] "m" (*be_mask));
-
- gfmul_pclmul ();
-
- buf += blocksize;
- }
+ unsigned int burn = 0;
- /* Store hash. */
- asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */
- "movdqu %%xmm1, %[hash]\n\t"
- : [hash] "=m" (*result)
- : [be_mask] "m" (*be_mask));
-
- /* Clear used registers. */
- asm volatile( "pxor %%xmm0, %%xmm0\n\t"
- "pxor %%xmm1, %%xmm1\n\t"
- "pxor %%xmm2, %%xmm2\n\t"
- "pxor %%xmm3, %%xmm3\n\t"
- "pxor %%xmm4, %%xmm4\n\t"
- "pxor %%xmm5, %%xmm5\n\t"
- "pxor %%xmm6, %%xmm6\n\t"
- "pxor %%xmm7, %%xmm7\n\t"
- ::: "cc" );
- burn = 0;
- }
-#endif
- else
+ while (nblocks)
{
- while (nblocks)
- {
- burn = GHASH (c, result, buf);
- buf += blocksize;
- nblocks--;
- }
+ burn = GHASH (c, result, buf);
+ buf += blocksize;
+ nblocks--;
}
return burn + (burn ? 5*sizeof(void*) : 0);
@@ -681,63 +383,15 @@ setupM (gcry_cipher_hd_t c, byte *h)
#ifdef GCM_USE_INTEL_PCLMUL
else if (_gcry_get_hw_features () & HWF_INTEL_PCLMUL)
{
- u64 tmp[2];
-
- c->u_mode.gcm.use_intel_pclmul = 1;
-
- /* Swap endianness of hsub. */
- tmp[0] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 8);
- tmp[1] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 0);
- buf_cpy (c->u_mode.gcm.u_ghash_key.key, tmp, GCRY_GCM_BLOCK_LEN);
-
-#ifdef __x86_64__
- asm volatile ("movdqu %[h_1], %%xmm0\n\t"
- "movdqa %%xmm0, %%xmm1\n\t"
- :
- : [h_1] "m" (*tmp));
-
- gfmul_pclmul (); /* H•H => H² */
-
- asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t"
- "movdqa %%xmm1, %%xmm8\n\t"
- :
- : [h_234] "r" (c->u_mode.gcm.gcm_table)
- : "memory");
-
- gfmul_pclmul (); /* H•H² => H³ */
-
- asm volatile ("movdqa %%xmm8, %%xmm0\n\t"
- "movdqu %%xmm1, 1*16(%[h_234])\n\t"
- "movdqa %%xmm8, %%xmm1\n\t"
- :
- : [h_234] "r" (c->u_mode.gcm.gcm_table)
- : "memory");
-
- gfmul_pclmul (); /* H²•H² => H⁴ */
-
- asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t"
- :
- : [h_234] "r" (c->u_mode.gcm.gcm_table)
- : "memory");
-
- /* Clear used registers. */
- asm volatile( "pxor %%xmm0, %%xmm0\n\t"
- "pxor %%xmm1, %%xmm1\n\t"
- "pxor %%xmm2, %%xmm2\n\t"
- "pxor %%xmm3, %%xmm3\n\t"
- "pxor %%xmm4, %%xmm4\n\t"
- "pxor %%xmm5, %%xmm5\n\t"
- "pxor %%xmm6, %%xmm6\n\t"
- "pxor %%xmm7, %%xmm7\n\t"
- "pxor %%xmm8, %%xmm8\n\t"
- ::: "cc" );
-#endif
-
- wipememory (tmp, sizeof(tmp));
+ c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul;
+ _gcry_ghash_setup_intel_pclmul(c, h);
}
#endif
else
- fillM (c, h);
+ {
+ c->u_mode.gcm.ghash_fn = ghash_internal;
+ fillM (c, h);
+ }
}
@@ -810,6 +464,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
{
unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
unsigned int unused = c->u_mode.gcm.mac_unused;
+ ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn;
size_t nblocks, n;
unsigned int burn = 0;
@@ -843,7 +498,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
gcry_assert (unused == blocksize);
/* Process one block from macbuf. */
- burn = ghash (c, hash, c->u_mode.gcm.macbuf, 1);
+ burn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1);
unused = 0;
}
@@ -851,7 +506,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
if (nblocks)
{
- burn = ghash (c, hash, buf, nblocks);
+ burn = ghash_fn (c, hash, buf, nblocks);
buf += blocksize * nblocks;
buflen -= blocksize * nblocks;
}
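
For reference, the register comments of the removed gfmul_pclmul_aggr4() above (H⁴ paired with X_(i-3)⊕Y_(i-4), H³ with X_(i-2), H² with X_(i-1), H with X_i) correspond to unrolling the GHASH recurrence Y_i = (Y_(i-1) ⊕ X_i)·H four times, which is why the PCLMUL path of setupM() precomputes H², H³ and H⁴ into gcm_table:

  Y_i = (Y_(i-4) ⊕ X_(i-3))·H⁴ ⊕ X_(i-2)·H³ ⊕ X_(i-1)·H² ⊕ X_i·H

This lets the hot loop perform four carry-less multiplications and a single field reduction per iteration instead of one reduction per block.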