author     Jussi Kivilinna <jussi.kivilinna@iki.fi>   2013-11-19 23:26:26 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>   2013-11-20 18:26:58 +0200
commit     5a65ffabadd50f174ab7375faad7a726cce49e61 (patch)
tree       3ef317ba516e666d1c214fdf6defaa8b80f06dbf /cipher/cipher-gcm.c
parent     0e9e7d72f3c9eb7ac832746c3034855faaf8d02c (diff)
download   libgcrypt-5a65ffabadd50f174ab7375faad7a726cce49e61.tar.gz
Add Intel PCLMUL acceleration for GCM
* cipher/cipher-gcm.c (fillM): Rename...
(do_fillM): ...to this.
(ghash): Remove.
(fillM): New macro.
(GHASH): Use 'do_ghash' instead of 'ghash'.
[GCM_USE_INTEL_PCLMUL] (do_ghash_pclmul): New.
(ghash): New.
(setupM): New.
(_gcry_cipher_gcm_encrypt, _gcry_cipher_gcm_decrypt)
(_gcry_cipher_gcm_authenticate, _gcry_cipher_gcm_setiv)
(_gcry_cipher_gcm_tag): Use 'ghash' instead of 'GHASH' and
'c->u_mode.gcm.u_tag.tag' instead of 'c->u_tag.tag'.
* cipher/cipher-internal.h (GCM_USE_INTEL_PCLMUL): New.
(gcry_cipher_handle): Move 'u_tag' and 'gcm_table' under 'u_mode.gcm'.
* configure.ac (pclmulsupport, gcry_cv_gcc_inline_asm_pclmul): New.
* src/g10lib.h (HWF_INTEL_PCLMUL): New.
* src/global.c: Add "intel-pclmul".
* src/hwf-x86.c (detect_x86_gnuc): Add check for Intel PCLMUL.
--
Speed-up GCM for Intel CPUs.

Intel Haswell (x86-64):
Old:
 AES    GCM enc |  5.17 ns/B     184.4 MiB/s     16.55 c/B
        GCM dec |  4.38 ns/B     218.0 MiB/s     14.00 c/B
       GCM auth |  3.17 ns/B     300.4 MiB/s     10.16 c/B
New:
 AES    GCM enc |  3.01 ns/B     317.2 MiB/s      9.62 c/B
        GCM dec |  1.96 ns/B     486.9 MiB/s      6.27 c/B
       GCM auth | 0.848 ns/B    1124.8 MiB/s      2.71 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/cipher-gcm.c')
-rw-r--r--  cipher/cipher-gcm.c | 205
1 file changed, 176 insertions(+), 29 deletions(-)
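For context, the operation this patch accelerates is GHASH's multiplication in
GF(2^128). Below is a minimal, portable sketch of that multiply (NIST SP 800-38D,
Algorithm 1); the function and variable names are illustrative only and not part
of libgcrypt. GHASH chains it as Y_i = (Y_{i-1} ^ X_i) * H per 16-byte block,
which is what the table-driven do_ghash() variants and the new do_ghash_pclmul()
in the patch below compute much faster.

  #include <stdint.h>
  #include <string.h>

  /* Illustrative only: multiply 'x' by the hash subkey 'h' in GF(2^128)
     with the GCM polynomial x^128 + x^7 + x^2 + x + 1, using the
     bit-reflected convention of SP 800-38D, Algorithm 1.  */
  static void
  gcm_gf_mult (uint8_t r[16], const uint8_t x[16], const uint8_t h[16])
  {
    uint8_t z[16] = { 0 };
    uint8_t v[16];
    int i, j, k, lsb;

    memcpy (v, h, 16);
    for (i = 0; i < 16; i++)
      for (j = 7; j >= 0; j--)
        {
          if ((x[i] >> j) & 1)
            for (k = 0; k < 16; k++)
              z[k] ^= v[k];

          /* Multiply v by x: a right shift in GCM's reflected bit order,
             folding in R = 0xE1 when the low bit falls off the end.  */
          lsb = v[15] & 1;
          for (k = 15; k > 0; k--)
            v[k] = (v[k] >> 1) | (v[k - 1] << 7);
          v[0] >>= 1;
          if (lsb)
            v[0] ^= 0xE1;
        }
    memcpy (r, z, 16);
  }

The table-driven paths precompute multiples of H to avoid this bit-by-bit loop;
the PCLMUL path replaces it entirely with carry-less multiplication instructions.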
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index d7fc0d85..b93f0fa7 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -1,5 +1,6 @@
/* cipher-gcm.c - Generic Galois Counter Mode implementation
* Copyright (C) 2013 Dmitry Eremin-Solenikov
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -81,7 +82,7 @@ bshift (u64 * b0, u64 * b1)
}
static void
-fillM (unsigned char *h, u64 * M)
+do_fillM (unsigned char *h, u64 *M)
{
int i, j;
@@ -179,7 +180,7 @@ bshift (u32 * M, int i)
}
static void
-fillM (unsigned char *h, u32 * M)
+do_fillM (unsigned char *h, u32 *M)
{
int i, j;
@@ -269,15 +270,10 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u32 * gcmM)
buf_put_be32 (result + 8, tmp[2]);
buf_put_be32 (result + 12, tmp[3]);
}
-#endif
-
-static void
-ghash (unsigned char *result, const unsigned char *buf, const void *gcmM)
-{
- do_ghash (result, buf, gcmM);
-}
+#endif /* !HAVE_U64_TYPEDEF || SIZEOF_UNSIGNED_LONG != 8 */
-#define GHASH(c, result, buf) ghash (result, buf, c->gcm_table);
+#define fillM(c, h) do_fillM (h, c->u_mode.gcm.gcm_table)
+#define GHASH(c, result, buf) do_ghash (result, buf, c->u_mode.gcm.gcm_table)
#else
@@ -296,7 +292,7 @@ bshift (unsigned long *b)
}
static void
-ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
+do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
{
unsigned long V[4];
int i, j;
@@ -339,10 +335,161 @@ ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf)
#endif
}
-#define fillM(h, M) do { } while (0)
+#define fillM(c, h) do { } while (0)
+#define GHASH(c, result, buf) do_ghash (c->u_iv.iv, result, buf)
+
+#endif /* !GCM_USE_TABLES */
+
+
+#ifdef GCM_USE_INTEL_PCLMUL
+/*
+ Intel PCLMUL ghash based on white paper:
+ "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
+ GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
+ */
+static void
+do_ghash_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf)
+{
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+
+ asm volatile ("movdqu (%[result]), %%xmm1\n\t"
+ "movdqu %[buf], %%xmm2\n\t"
+ "movdqa %[hsub], %%xmm0\n\t"
+ "pxor %%xmm2, %%xmm1\n\t" /* big endian */
+
+ /* be => le */
+ "pshufb %[be_mask], %%xmm1\n\t"
+
+ /* gfmul, xmm0 has operator a and xmm1 has operator b. */
+ "pshufd $78, %%xmm0, %%xmm2\n\t"
+ "pshufd $78, %%xmm1, %%xmm4\n\t"
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */
+ "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */
+
+ "movdqa %%xmm0, %%xmm3\n\t"
+ "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */
+ "movdqa %%xmm0, %%xmm6\n\t"
+ "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */
+
+ "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
+ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
+ "movdqa %%xmm4, %%xmm5\n\t"
+ "psrldq $8, %%xmm4\n\t"
+ "pslldq $8, %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
+ carry-less multiplication of xmm0
+ by xmm1 */
+
+ /* shift the result by one bit position to the left to cope for
+ the fact that bits are reversed */
+ "movdqa %%xmm3, %%xmm7\n\t"
+ "movdqa %%xmm6, %%xmm0\n\t"
+ "pslld $1, %%xmm3\n\t"
+ "pslld $1, %%xmm6\n\t"
+ "psrld $31, %%xmm7\n\t"
+ "psrld $31, %%xmm0\n\t"
+ "movdqa %%xmm7, %%xmm1\n\t"
+ "pslldq $4, %%xmm0\n\t"
+ "pslldq $4, %%xmm7\n\t"
+ "psrldq $12, %%xmm1\n\t"
+ "por %%xmm7, %%xmm3\n\t"
+ "por %%xmm0, %%xmm6\n\t"
+ "por %%xmm1, %%xmm6\n\t"
+
+ /* first phase of the reduction */
+ "movdqa %%xmm3, %%xmm7\n\t"
+ "movdqa %%xmm3, %%xmm0\n\t"
+ "pslld $31, %%xmm7\n\t" /* packed right shifting << 31 */
+ "movdqa %%xmm3, %%xmm1\n\t"
+ "pslld $30, %%xmm0\n\t" /* packed right shifting shift << 30 */
+ "pslld $25, %%xmm1\n\t" /* packed right shifting shift << 25 */
+ "pxor %%xmm0, %%xmm7\n\t" /* xor the shifted versions */
+ "pxor %%xmm1, %%xmm7\n\t"
+ "movdqa %%xmm7, %%xmm0\n\t"
+ "pslldq $12, %%xmm7\n\t"
+ "psrldq $4, %%xmm0\n\t"
+ "pxor %%xmm7, %%xmm3\n\t" /* first phase of the reduction
+ complete */
+
+ /* second phase of the reduction */
+ "movdqa %%xmm3, %%xmm2\n\t"
+ "movdqa %%xmm3, %%xmm4\n\t"
+ "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */
+ "movdqa %%xmm3, %%xmm5\n\t"
+ "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */
+ "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */
+ "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
+ "pxor %%xmm5, %%xmm2\n\t"
+ "pxor %%xmm0, %%xmm2\n\t"
+ "pxor %%xmm2, %%xmm3\n\t"
+ "pxor %%xmm3, %%xmm6\n\t" /* the result is in xmm6 */
+
+ /* le => be */
+ "pshufb %[be_mask], %%xmm6\n\t"
+
+ "movdqu %%xmm6, (%[result])\n\t" /* store the result */
+ :
+ : [result] "r" (result), [buf] "m" (*buf),
+ [hsub] "m" (*c->u_iv.iv), [be_mask] "m" (*be_mask)
+ : "memory" );
+}
+
+#endif /*GCM_USE_INTEL_PCLMUL*/
+
+
+static void
+ghash (gcry_cipher_hd_t c, unsigned char *result, const unsigned char *buf)
+{
+ if (0)
+ ;
+#ifdef GCM_USE_INTEL_PCLMUL
+ else if (c->u_mode.gcm.use_intel_pclmul)
+ {
+ /* TODO: Loop structure, use bit-reflection and add faster bulk
+ processing (parallel four blocks). */
+ do_ghash_pclmul (c, result, buf);
+
+ /* Clear used registers. */
+ asm volatile( "pxor %%xmm0, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "pxor %%xmm7, %%xmm7\n\t"
+ ::: "cc" );
+ }
+#endif
+ else
+ GHASH (c, result, buf);
+}
+
+static void
+setupM (gcry_cipher_hd_t c, byte *h)
+{
+ if (0)
+ ;
+#ifdef GCM_USE_INTEL_PCLMUL
+ else if (_gcry_get_hw_features () & HWF_INTEL_PCLMUL)
+ {
+ u64 tmp[2];
+
+ c->u_mode.gcm.use_intel_pclmul = 1;
-#define GHASH(c, result, buf) ghash (c->u_iv.iv, result, buf);
+ /* Swap endianness of hsub. */
+ tmp[0] = buf_get_be64(c->u_iv.iv + 8);
+ tmp[1] = buf_get_be64(c->u_iv.iv + 0);
+ buf_cpy (c->u_iv.iv, tmp, 16);
+ }
#endif
+ else
+ fillM (c, h);
+}
gcry_err_code_t
@@ -389,12 +536,12 @@ _gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c,
{
buf_xor_2dst (outbuf, tmp, inbuf, n);
memset (tmp + n, 0, blocksize - n);
- GHASH (c, c->u_tag.tag, tmp);
+ ghash (c, c->u_mode.gcm.u_tag.tag, tmp);
}
else
{
buf_xor (outbuf, tmp, inbuf, n);
- GHASH (c, c->u_tag.tag, outbuf);
+ ghash (c, c->u_mode.gcm.u_tag.tag, outbuf);
}
inbuflen -= n;
@@ -442,11 +589,11 @@ _gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c,
{
memcpy (tmp, inbuf, n);
memset (tmp + n, 0, blocksize - n);
- GHASH (c, c->u_tag.tag, tmp);
+ ghash (c, c->u_mode.gcm.u_tag.tag, tmp);
}
else
{
- GHASH (c, c->u_tag.tag, inbuf);
+ ghash (c, c->u_mode.gcm.u_tag.tag, inbuf);
}
i = blocksize - 1;
@@ -490,7 +637,7 @@ _gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c,
while (aadbuflen >= blocksize)
{
- GHASH (c, c->u_tag.tag, aadbuf);
+ ghash (c, c->u_mode.gcm.u_tag.tag, aadbuf);
aadbuflen -= blocksize;
aadbuf += blocksize;
@@ -501,7 +648,7 @@ _gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c,
memcpy (tmp, aadbuf, aadbuflen);
memset (tmp + aadbuflen, 0, blocksize - aadbuflen);
- GHASH (c, c->u_tag.tag, tmp);
+ ghash (c, c->u_mode.gcm.u_tag.tag, tmp);
}
return 0;
@@ -512,10 +659,10 @@ _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c,
const byte * iv, unsigned int ivlen)
{
memset (c->length, 0, 16);
- memset (c->u_tag.tag, 0, 16);
- c->spec->encrypt (&c->context.c, c->u_iv.iv, c->u_tag.tag);
+ memset (c->u_mode.gcm.u_tag.tag, 0, 16);
+ c->spec->encrypt (&c->context.c, c->u_iv.iv, c->u_mode.gcm.u_tag.tag);
- fillM (c->u_iv.iv, c->gcm_table);
+ setupM (c, c->u_iv.iv);
if (ivlen != 16 - 4)
{
@@ -523,12 +670,12 @@ _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c,
unsigned n;
memset (c->u_ctr.ctr, 0, 16);
for (n = ivlen; n >= 16; n -= 16, iv += 16)
- GHASH (c, c->u_ctr.ctr, iv);
+ ghash (c, c->u_ctr.ctr, iv);
if (n != 0)
{
memcpy (tmp, iv, n);
memset (tmp + n, 0, 16 - n);
- GHASH (c, c->u_ctr.ctr, tmp);
+ ghash (c, c->u_ctr.ctr, tmp);
}
memset (tmp, 0, 16);
n = 16;
@@ -537,7 +684,7 @@ _gcry_cipher_gcm_setiv (gcry_cipher_hd_t c,
n--;
for (; n > 0; n--, ivlen >>= 8)
tmp[n - 1] = ivlen & 0xff;
- GHASH (c, c->u_ctr.ctr, tmp);
+ ghash (c, c->u_ctr.ctr, tmp);
}
else
{
@@ -560,19 +707,19 @@ _gcry_cipher_gcm_tag (gcry_cipher_hd_t c,
if (!c->marks.tag)
{
- GHASH (c, c->u_tag.tag, c->length);
- buf_xor (c->u_tag.tag, c->lastiv, c->u_tag.tag, 16);
+ ghash (c, c->u_mode.gcm.u_tag.tag, c->length);
+ buf_xor (c->u_mode.gcm.u_tag.tag, c->lastiv, c->u_mode.gcm.u_tag.tag, 16);
c->marks.tag = 1;
}
if (!check)
{
- memcpy (outbuf, c->u_tag.tag, outbuflen);
+ memcpy (outbuf, c->u_mode.gcm.u_tag.tag, outbuflen);
return GPG_ERR_NO_ERROR;
}
else
{
- return buf_eq_const(outbuf, c->u_tag.tag, outbuflen) ?
+ return buf_eq_const(outbuf, c->u_mode.gcm.u_tag.tag, outbuflen) ?
GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM;
}