-rw-r--r--   cipher/Makefile.am                 |   4
-rw-r--r--   cipher/cipher-gcm-intel-pclmul.c   | 395
-rw-r--r--   cipher/cipher-gcm.c                | 395
-rw-r--r--   cipher/cipher-internal.h           |  13
4 files changed, 430 insertions(+), 377 deletions(-)
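In short, the patch moves the Intel PCLMUL accelerated GHASH code out of cipher/cipher-gcm.c into the new cipher/cipher-gcm-intel-pclmul.c (adding the file to cipher/Makefile.am), and it replaces the per-call use_intel_pclmul bit-field test with a ghash_fn_t function pointer that setupM() stores in the cipher handle. The stand-alone sketch below shows only that dispatch pattern; the struct, stub bodies and helper names are simplified stand-ins, with just the ghash_fn_t typedef mirroring the one added to cipher-internal.h.

    #include <stddef.h>

    typedef unsigned char byte;

    struct gcm_ctx;   /* simplified stand-in for gcry_cipher_hd_t */
    typedef unsigned int (*ghash_fn_t) (struct gcm_ctx *c, byte *result,
                                        const byte *buf, size_t nblocks);

    struct gcm_ctx
    {
      ghash_fn_t ghash_fn;   /* implementation selected once at key setup */
    };

    /* Stubs standing in for the table-driven and PCLMUL implementations. */
    static unsigned int
    ghash_internal (struct gcm_ctx *c, byte *result, const byte *buf,
                    size_t nblocks)
    {
      (void)c; (void)result; (void)buf; (void)nblocks;
      return 0;
    }

    static unsigned int
    ghash_pclmul (struct gcm_ctx *c, byte *result, const byte *buf,
                  size_t nblocks)
    {
      (void)c; (void)result; (void)buf; (void)nblocks;
      return 0;
    }

    static int
    have_pclmul (void)
    {
      /* stand-in for the _gcry_get_hw_features() & HWF_INTEL_PCLMUL test */
      return 0;
    }

    static void
    setup_ghash (struct gcm_ctx *c)
    {
      if (have_pclmul ())
        c->ghash_fn = ghash_pclmul;     /* _gcry_ghash_intel_pclmul in the patch */
      else
        c->ghash_fn = ghash_internal;   /* table-based fallback */
    }

    /* do_ghash_buf() then simply calls:
         burn = c->ghash_fn (c, hash, buf, nblocks);  */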
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index d7e77736..98142ed8 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -40,8 +40,8 @@ libcipher_la_LIBADD = $(GCRYPT_MODULES) libcipher_la_SOURCES = \ cipher.c cipher-internal.h \ cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \ -cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-poly1305.c \ -cipher-selftest.c cipher-selftest.h \ +cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \ +cipher-poly1305.c cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ md.c \ mac.c mac-internal.h \ diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c new file mode 100644 index 00000000..02e77016 --- /dev/null +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -0,0 +1,395 @@ +/* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode + * implementation + * Copyright (C) 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <config.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "./cipher-internal.h" + + +#ifdef GCM_USE_INTEL_PCLMUL + +/* + Intel PCLMUL ghash based on white paper: + "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the + GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. + */ +static inline void gfmul_pclmul(void) +{ + /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. + Input must be converted to little-endian. + */ + asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. 
*/ + "pshufd $78, %%xmm0, %%xmm2\n\t" + "pshufd $78, %%xmm1, %%xmm4\n\t" + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */ + + "movdqa %%xmm0, %%xmm3\n\t" + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */ + "movdqa %%xmm0, %%xmm6\n\t" + "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */ + "movdqa %%xmm3, %%xmm5\n\t" + "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ + "movdqa %%xmm4, %%xmm5\n\t" + "psrldq $8, %%xmm4\n\t" + "pslldq $8, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the + carry-less multiplication of xmm0 + by xmm1 */ + + /* shift the result by one bit position to the left cope for + the fact that bits are reversed */ + "movdqa %%xmm3, %%xmm4\n\t" + "movdqa %%xmm6, %%xmm5\n\t" + "pslld $1, %%xmm3\n\t" + "pslld $1, %%xmm6\n\t" + "psrld $31, %%xmm4\n\t" + "psrld $31, %%xmm5\n\t" + "movdqa %%xmm4, %%xmm1\n\t" + "pslldq $4, %%xmm5\n\t" + "pslldq $4, %%xmm4\n\t" + "psrldq $12, %%xmm1\n\t" + "por %%xmm4, %%xmm3\n\t" + "por %%xmm5, %%xmm6\n\t" + "por %%xmm6, %%xmm1\n\t" + + /* first phase of the reduction */ + "movdqa %%xmm3, %%xmm6\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ + "movdqa %%xmm3, %%xmm5\n\t" + "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ + "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ + "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ + "pxor %%xmm5, %%xmm6\n\t" + "movdqa %%xmm6, %%xmm7\n\t" + "pslldq $12, %%xmm6\n\t" + "psrldq $4, %%xmm7\n\t" + "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction + complete */ + + /* second phase of the reduction */ + "movdqa %%xmm3, %%xmm2\n\t" + "movdqa %%xmm3, %%xmm4\n\t" + "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ + "movdqa %%xmm3, %%xmm5\n\t" + "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ + "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ + "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ + "pxor %%xmm5, %%xmm2\n\t" + "pxor %%xmm7, %%xmm2\n\t" + "pxor %%xmm2, %%xmm3\n\t" + "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ + ::: "cc" ); +} + + +#ifdef __x86_64__ +static inline void gfmul_pclmul_aggr4(void) +{ + /* Input: + H¹: XMM0 X_i : XMM6 + H²: XMM8 X_(i-1) : XMM3 + H³: XMM9 X_(i-2) : XMM2 + H⁴: XMM10 X_(i-3)⊕Y_(i-4): XMM1 + Output: + Y_i: XMM1 + Inputs XMM0 stays unmodified. + Input must be converted to little-endian. + */ + asm volatile (/* perform clmul and merge results... 
*/ + "pshufd $78, %%xmm10, %%xmm11\n\t" + "pshufd $78, %%xmm1, %%xmm12\n\t" + "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ + "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */ + + "pshufd $78, %%xmm9, %%xmm13\n\t" + "pshufd $78, %%xmm2, %%xmm14\n\t" + "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */ + + "pshufd $78, %%xmm8, %%xmm5\n\t" + "pshufd $78, %%xmm3, %%xmm15\n\t" + "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */ + "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */ + + "movdqa %%xmm10, %%xmm4\n\t" + "movdqa %%xmm9, %%xmm7\n\t" + "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ + "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */ + "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ + "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ + "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm0, %%xmm10\n\t" + "pshufd $78, %%xmm6, %%xmm11\n\t" + "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */ + "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */ + + "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ + "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ + + "movdqa %%xmm8, %%xmm13\n\t" + "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */ + "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */ + + "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ + "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ + "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */ + + "movdqa %%xmm0, %%xmm3\n\t" + "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */ + "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */ + "movdqa %%xmm11, %%xmm4\n\t" + "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ + + "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ + "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */ + "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ + + /* aggregated reduction... 
*/ + "movdqa %%xmm3, %%xmm5\n\t" + "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ + "movdqa %%xmm4, %%xmm5\n\t" + "psrldq $8, %%xmm4\n\t" + "pslldq $8, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the + carry-less multiplication of xmm0 + by xmm1 */ + + /* shift the result by one bit position to the left cope for + the fact that bits are reversed */ + "movdqa %%xmm3, %%xmm4\n\t" + "movdqa %%xmm6, %%xmm5\n\t" + "pslld $1, %%xmm3\n\t" + "pslld $1, %%xmm6\n\t" + "psrld $31, %%xmm4\n\t" + "psrld $31, %%xmm5\n\t" + "movdqa %%xmm4, %%xmm1\n\t" + "pslldq $4, %%xmm5\n\t" + "pslldq $4, %%xmm4\n\t" + "psrldq $12, %%xmm1\n\t" + "por %%xmm4, %%xmm3\n\t" + "por %%xmm5, %%xmm6\n\t" + "por %%xmm6, %%xmm1\n\t" + + /* first phase of the reduction */ + "movdqa %%xmm3, %%xmm6\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ + "movdqa %%xmm3, %%xmm5\n\t" + "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ + "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ + "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ + "pxor %%xmm5, %%xmm6\n\t" + "movdqa %%xmm6, %%xmm7\n\t" + "pslldq $12, %%xmm6\n\t" + "psrldq $4, %%xmm7\n\t" + "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction + complete */ + + /* second phase of the reduction */ + "movdqa %%xmm3, %%xmm2\n\t" + "movdqa %%xmm3, %%xmm4\n\t" + "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ + "movdqa %%xmm3, %%xmm5\n\t" + "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ + "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ + "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ + "pxor %%xmm5, %%xmm2\n\t" + "pxor %%xmm7, %%xmm2\n\t" + "pxor %%xmm2, %%xmm3\n\t" + "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ + :::"cc"); +} +#endif + + +void +_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, byte *h) +{ + u64 tmp[2]; + + /* Swap endianness of hsub. */ + tmp[0] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 8); + tmp[1] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 0); + buf_cpy (c->u_mode.gcm.u_ghash_key.key, tmp, GCRY_GCM_BLOCK_LEN); + +#ifdef __x86_64__ + asm volatile ("movdqu %[h_1], %%xmm0\n\t" + "movdqa %%xmm0, %%xmm1\n\t" + : + : [h_1] "m" (*tmp)); + + gfmul_pclmul (); /* H•H => H² */ + + asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t" + "movdqa %%xmm1, %%xmm8\n\t" + : + : [h_234] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H•H² => H³ */ + + asm volatile ("movdqa %%xmm8, %%xmm0\n\t" + "movdqu %%xmm1, 1*16(%[h_234])\n\t" + "movdqa %%xmm8, %%xmm1\n\t" + : + : [h_234] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H²•H² => H⁴ */ + + asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t" + : + : [h_234] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + /* Clear used registers. 
*/ + asm volatile( "pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + "pxor %%xmm8, %%xmm8\n\t" + ::: "cc" ); +#endif + + wipememory (tmp, sizeof(tmp)); +} + + +unsigned int +_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; + + if (nblocks == 0) + return 0; + + /* Preload hash and H1. */ + asm volatile ("movdqu %[hash], %%xmm1\n\t" + "movdqa %[hsub], %%xmm0\n\t" + "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + : + : [hash] "m" (*result), [be_mask] "m" (*be_mask), + [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key)); + +#ifdef __x86_64__ + if (nblocks >= 4) + { + do + { + asm volatile ("movdqa %[be_mask], %%xmm4\n\t" + "movdqu 0*16(%[buf]), %%xmm5\n\t" + "movdqu 1*16(%[buf]), %%xmm2\n\t" + "movdqu 2*16(%[buf]), %%xmm3\n\t" + "movdqu 3*16(%[buf]), %%xmm6\n\t" + "pshufb %%xmm4, %%xmm5\n\t" /* be => le */ + + /* Load H2, H3, H4. */ + "movdqu 2*16(%[h_234]), %%xmm10\n\t" + "movdqu 1*16(%[h_234]), %%xmm9\n\t" + "movdqu 0*16(%[h_234]), %%xmm8\n\t" + + "pxor %%xmm5, %%xmm1\n\t" + "pshufb %%xmm4, %%xmm2\n\t" /* be => le */ + "pshufb %%xmm4, %%xmm3\n\t" /* be => le */ + "pshufb %%xmm4, %%xmm6\n\t" /* be => le */ + : + : [buf] "r" (buf), [be_mask] "m" (*be_mask), + [h_234] "r" (c->u_mode.gcm.gcm_table)); + + gfmul_pclmul_aggr4 (); + + buf += 4 * blocksize; + nblocks -= 4; + } + while (nblocks >= 4); + + /* Clear used x86-64/XMM registers. */ + asm volatile( "pxor %%xmm8, %%xmm8\n\t" + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" + "pxor %%xmm15, %%xmm15\n\t" + ::: "cc" ); + } +#endif + + while (nblocks--) + { + asm volatile ("movdqu %[buf], %%xmm2\n\t" + "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + "pxor %%xmm2, %%xmm1\n\t" + : + : [buf] "m" (*buf), [be_mask] "m" (*be_mask)); + + gfmul_pclmul (); + + buf += blocksize; + } + + /* Store hash. */ + asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + "movdqu %%xmm1, %[hash]\n\t" + : [hash] "=m" (*result) + : [be_mask] "m" (*be_mask)); + + /* Clear used registers. 
*/ + asm volatile( "pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + ::: "cc" ); + + return 0; +} + +#endif /* GCM_USE_INTEL_PCLMUL */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 05347616..f89b81e3 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -29,6 +29,15 @@ #include "bufhelp.h" #include "./cipher-internal.h" + +#ifdef GCM_USE_INTEL_PCLMUL +extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c, byte *h); + +extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, + const byte *buf, size_t nblocks); +#endif + + #ifdef GCM_USE_TABLES static const u16 gcmR[256] = { 0x0000, 0x01c2, 0x0384, 0x0246, 0x0708, 0x06ca, 0x048c, 0x054e, @@ -348,325 +357,18 @@ do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf) #endif /* !GCM_USE_TABLES */ -#ifdef GCM_USE_INTEL_PCLMUL -/* - Intel PCLMUL ghash based on white paper: - "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the - GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. - */ -static inline void gfmul_pclmul(void) -{ - /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. - Input must be converted to little-endian. - */ - asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. */ - "pshufd $78, %%xmm0, %%xmm2\n\t" - "pshufd $78, %%xmm1, %%xmm4\n\t" - "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */ - "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */ - - "movdqa %%xmm0, %%xmm3\n\t" - "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */ - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ - "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ - "movdqa %%xmm4, %%xmm5\n\t" - "psrldq $8, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the - carry-less multiplication of xmm0 - by xmm1 */ - - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, 
%%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - ::: "cc" ); -} - -#ifdef __x86_64__ -static inline void gfmul_pclmul_aggr4(void) -{ - /* Input: - H¹: XMM0 X_i : XMM6 - H²: XMM8 X_(i-1) : XMM3 - H³: XMM9 X_(i-2) : XMM2 - H⁴: XMM10 X_(i-3)⊕Y_(i-4): XMM1 - Output: - Y_i: XMM1 - Inputs XMM0 stays unmodified. - Input must be converted to little-endian. - */ - asm volatile (/* perform clmul and merge results... */ - "pshufd $78, %%xmm10, %%xmm11\n\t" - "pshufd $78, %%xmm1, %%xmm12\n\t" - "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ - "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */ - - "pshufd $78, %%xmm9, %%xmm13\n\t" - "pshufd $78, %%xmm2, %%xmm14\n\t" - "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */ - - "pshufd $78, %%xmm8, %%xmm5\n\t" - "pshufd $78, %%xmm3, %%xmm15\n\t" - "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */ - "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */ - - "movdqa %%xmm10, %%xmm4\n\t" - "movdqa %%xmm9, %%xmm7\n\t" - "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ - "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ - "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ - "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm0, %%xmm10\n\t" - "pshufd $78, %%xmm6, %%xmm11\n\t" - "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */ - "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */ - - "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ - "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ - - "movdqa %%xmm8, %%xmm13\n\t" - "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */ - - "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ - "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ - "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */ - - "movdqa %%xmm0, %%xmm3\n\t" - "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */ - "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */ - "movdqa %%xmm11, %%xmm4\n\t" - "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ - - "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ - "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */ - "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ - - /* aggregated reduction... 
*/ - "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ - "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ - "movdqa %%xmm4, %%xmm5\n\t" - "psrldq $8, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the - carry-less multiplication of xmm0 - by xmm1 */ - - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - :::"cc"); -} -#endif - -#endif /*GCM_USE_INTEL_PCLMUL*/ - - static unsigned int -ghash (gcry_cipher_hd_t c, byte *result, const byte *buf, - size_t nblocks) +ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) { const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; - unsigned int burn; - - if (nblocks == 0) - return 0; - - if (0) - ; -#ifdef GCM_USE_INTEL_PCLMUL - else if (c->u_mode.gcm.use_intel_pclmul) - { - static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = - { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; - - /* Preload hash and H1. */ - asm volatile ("movdqu %[hash], %%xmm1\n\t" - "movdqa %[hsub], %%xmm0\n\t" - "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ - : - : [hash] "m" (*result), [be_mask] "m" (*be_mask), - [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key)); - -#ifdef __x86_64__ - if (nblocks >= 4) - { - do - { - asm volatile ("movdqa %[be_mask], %%xmm4\n\t" - "movdqu 0*16(%[buf]), %%xmm5\n\t" - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "movdqu 2*16(%[buf]), %%xmm3\n\t" - "movdqu 3*16(%[buf]), %%xmm6\n\t" - "pshufb %%xmm4, %%xmm5\n\t" /* be => le */ - - /* Load H2, H3, H4. 
*/ - "movdqu 2*16(%[h_234]), %%xmm10\n\t" - "movdqu 1*16(%[h_234]), %%xmm9\n\t" - "movdqu 0*16(%[h_234]), %%xmm8\n\t" - - "pxor %%xmm5, %%xmm1\n\t" - "pshufb %%xmm4, %%xmm2\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm3\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm6\n\t" /* be => le */ - : - : [buf] "r" (buf), [be_mask] "m" (*be_mask), - [h_234] "r" (c->u_mode.gcm.gcm_table)); - - gfmul_pclmul_aggr4 (); - - buf += 4 * blocksize; - nblocks -= 4; - } - while (nblocks >= 4); - - /* Clear used x86-64/XMM registers. */ - asm volatile( "pxor %%xmm8, %%xmm8\n\t" - "pxor %%xmm9, %%xmm9\n\t" - "pxor %%xmm10, %%xmm10\n\t" - "pxor %%xmm11, %%xmm11\n\t" - "pxor %%xmm12, %%xmm12\n\t" - "pxor %%xmm13, %%xmm13\n\t" - "pxor %%xmm14, %%xmm14\n\t" - "pxor %%xmm15, %%xmm15\n\t" - ::: "cc" ); - } -#endif - - while (nblocks--) - { - asm volatile ("movdqu %[buf], %%xmm2\n\t" - "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ - "pxor %%xmm2, %%xmm1\n\t" - : - : [buf] "m" (*buf), [be_mask] "m" (*be_mask)); - - gfmul_pclmul (); - - buf += blocksize; - } + unsigned int burn = 0; - /* Store hash. */ - asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ - "movdqu %%xmm1, %[hash]\n\t" - : [hash] "=m" (*result) - : [be_mask] "m" (*be_mask)); - - /* Clear used registers. */ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "pxor %%xmm7, %%xmm7\n\t" - ::: "cc" ); - burn = 0; - } -#endif - else + while (nblocks) { - while (nblocks) - { - burn = GHASH (c, result, buf); - buf += blocksize; - nblocks--; - } + burn = GHASH (c, result, buf); + buf += blocksize; + nblocks--; } return burn + (burn ? 5*sizeof(void*) : 0); @@ -681,63 +383,15 @@ setupM (gcry_cipher_hd_t c, byte *h) #ifdef GCM_USE_INTEL_PCLMUL else if (_gcry_get_hw_features () & HWF_INTEL_PCLMUL) { - u64 tmp[2]; - - c->u_mode.gcm.use_intel_pclmul = 1; - - /* Swap endianness of hsub. */ - tmp[0] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 8); - tmp[1] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 0); - buf_cpy (c->u_mode.gcm.u_ghash_key.key, tmp, GCRY_GCM_BLOCK_LEN); - -#ifdef __x86_64__ - asm volatile ("movdqu %[h_1], %%xmm0\n\t" - "movdqa %%xmm0, %%xmm1\n\t" - : - : [h_1] "m" (*tmp)); - - gfmul_pclmul (); /* H•H => H² */ - - asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t" - "movdqa %%xmm1, %%xmm8\n\t" - : - : [h_234] "r" (c->u_mode.gcm.gcm_table) - : "memory"); - - gfmul_pclmul (); /* H•H² => H³ */ - - asm volatile ("movdqa %%xmm8, %%xmm0\n\t" - "movdqu %%xmm1, 1*16(%[h_234])\n\t" - "movdqa %%xmm8, %%xmm1\n\t" - : - : [h_234] "r" (c->u_mode.gcm.gcm_table) - : "memory"); - - gfmul_pclmul (); /* H²•H² => H⁴ */ - - asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t" - : - : [h_234] "r" (c->u_mode.gcm.gcm_table) - : "memory"); - - /* Clear used registers. 
*/ - asm volatile( "pxor %%xmm0, %%xmm0\n\t" - "pxor %%xmm1, %%xmm1\n\t" - "pxor %%xmm2, %%xmm2\n\t" - "pxor %%xmm3, %%xmm3\n\t" - "pxor %%xmm4, %%xmm4\n\t" - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "pxor %%xmm7, %%xmm7\n\t" - "pxor %%xmm8, %%xmm8\n\t" - ::: "cc" ); -#endif - - wipememory (tmp, sizeof(tmp)); + c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul; + _gcry_ghash_setup_intel_pclmul(c, h); } #endif else - fillM (c, h); + { + c->u_mode.gcm.ghash_fn = ghash_internal; + fillM (c, h); + } } @@ -810,6 +464,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, { unsigned int blocksize = GCRY_GCM_BLOCK_LEN; unsigned int unused = c->u_mode.gcm.mac_unused; + ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn; size_t nblocks, n; unsigned int burn = 0; @@ -843,7 +498,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, gcry_assert (unused == blocksize); /* Process one block from macbuf. */ - burn = ghash (c, hash, c->u_mode.gcm.macbuf, 1); + burn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1); unused = 0; } @@ -851,7 +506,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, if (nblocks) { - burn = ghash (c, hash, buf, nblocks); + burn = ghash_fn (c, hash, buf, nblocks); buf += blocksize * nblocks; buflen -= blocksize * nblocks; } diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index f6bda668..fef0ecba 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -42,7 +42,7 @@ #define GCM_USE_TABLES 1 -/* GCM_USE_INTEL_PCLMUL inidicates whether to compile GCM with Intel PCLMUL +/* GCM_USE_INTEL_PCLMUL indicates whether to compile GCM with Intel PCLMUL code. */ #undef GCM_USE_INTEL_PCLMUL #if defined(ENABLE_PCLMUL_SUPPORT) && defined(GCM_USE_TABLES) @@ -54,6 +54,10 @@ #endif /* GCM_USE_INTEL_PCLMUL */ +typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result, + const byte *buf, size_t nblocks); + + /* A VIA processor with the Padlock engine as well as the Intel AES_NI instructions require an alignment of most data on a 16 byte boundary. Because we trick out the compiler while allocating the @@ -188,6 +192,7 @@ struct gcry_cipher_handle unsigned char macbuf[GCRY_CCM_BLOCK_LEN]; int mac_unused; /* Number of unprocessed bytes in MACBUF. */ + /* byte counters for GCM */ u32 aadlen[2]; u32 datalen[2]; @@ -209,10 +214,8 @@ struct gcry_cipher_handle unsigned char key[MAX_BLOCKSIZE]; } u_ghash_key; -#ifdef GCM_USE_INTEL_PCLMUL - /* Use Intel PCLMUL instructions for accelerated GHASH. */ - unsigned int use_intel_pclmul:1; -#endif + /* GHASH implementation in use. */ + ghash_fn_t ghash_fn; /* Pre-calculated table for GCM. */ #ifdef GCM_USE_TABLES |
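For reference, gfmul_pclmul() computes the 128x128-bit carry-less product with three PCLMULQDQ operations rather than four, using the Karatsuba identity a1·b0 + a0·b1 = (a0+a1)·(b0+b1) + a0·b0 + a1·b1 (addition being XOR in GF(2)), exactly as the register comments (a0*b0, a1*b1, (a0+a1)*(b0+b1)) indicate. A portable sketch of that step, assuming nothing beyond standard C (function names and limb layout are illustrative):

    #include <stdint.h>

    /* Carry-less 64x64 -> 128-bit multiply (what one PCLMULQDQ computes). */
    static void
    clmul64 (uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
      uint64_t h = 0, l = 0;
      int i;

      for (i = 0; i < 64; i++)
        if ((b >> i) & 1)
          {
            l ^= a << i;
            h ^= i ? (a >> (64 - i)) : 0;
          }
      *hi = h;
      *lo = l;
    }

    /* 128x128 -> 256-bit carry-less multiply via Karatsuba; a, b and r are
       little-endian 64-bit limbs (r[0] least significant). */
    static void
    clmul128 (const uint64_t a[2], const uint64_t b[2], uint64_t r[4])
    {
      uint64_t lo_h, lo_l, hi_h, hi_l, mid_h, mid_l;

      clmul64 (a[0], b[0], &lo_h, &lo_l);                   /* a0*b0 */
      clmul64 (a[1], b[1], &hi_h, &hi_l);                   /* a1*b1 */
      clmul64 (a[0] ^ a[1], b[0] ^ b[1], &mid_h, &mid_l);   /* (a0+a1)*(b0+b1) */

      /* Middle term: (a0+a1)*(b0+b1) + a0*b0 + a1*b1 = a1*b0 + a0*b1. */
      mid_l ^= lo_l ^ hi_l;
      mid_h ^= lo_h ^ hi_h;

      r[0] = lo_l;
      r[1] = lo_h ^ mid_l;
      r[2] = hi_l ^ mid_h;
      r[3] = hi_h;
    }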
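After the carry-less product, the assembly shifts the 256-bit result left by one bit position (to cope with GCM's reversed bit order) and reduces it modulo x^128 + x^7 + x^2 + x + 1 in two phases. The net effect is the ordinary GHASH block multiplication from NIST SP 800-38D, which the following bit-at-a-time reference (illustrative only, not libgcrypt API) computes directly and which can be used to cross-check the accelerated path:

    #include <stdint.h>

    /* Reference GF(2^128) multiply as specified in NIST SP 800-38D:
       r = x * y with GCM's bit-reflected representation. */
    static void
    ghash_mul_ref (uint8_t r[16], const uint8_t x[16], const uint8_t y[16])
    {
      uint64_t zh = 0, zl = 0;
      uint64_t vh = 0, vl = 0;
      int i;

      /* Load Y as two big-endian 64-bit halves. */
      for (i = 0; i < 8; i++)
        {
          vh = (vh << 8) | y[i];
          vl = (vl << 8) | y[i + 8];
        }

      for (i = 0; i < 128; i++)
        {
          /* Bit i of X, most-significant bit of byte 0 first. */
          if ((x[i / 8] >> (7 - (i % 8))) & 1)
            {
              zh ^= vh;
              zl ^= vl;
            }

          /* V = V * x mod (x^128 + x^7 + x^2 + x + 1): shift right by one
             and conditionally XOR the reduction constant R = 0xE1 || 0^120. */
          if (vl & 1)
            {
              vl = (vl >> 1) | (vh << 63);
              vh = (vh >> 1) ^ 0xe100000000000000ULL;
            }
          else
            {
              vl = (vl >> 1) | (vh << 63);
              vh >>= 1;
            }
        }

      /* Store Z back in big-endian byte order. */
      for (i = 0; i < 8; i++)
        {
          r[i]     = (uint8_t)(zh >> (56 - 8 * i));
          r[i + 8] = (uint8_t)(zl >> (56 - 8 * i));
        }
    }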