author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2016-03-12 17:07:21 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2016-03-12 17:16:41 +0200
commit    5d601dd57fcb41aa2015ab655fd6fc51537da667 (patch)
tree      f607471051b8cec46a8421e1d59a66c48aa72d4c
parent    a8b803d9e4bea2b779385ec9e9a579acc64431e9 (diff)
download  libgcrypt-5d601dd57fcb41aa2015ab655fd6fc51537da667.tar.gz
Add Intel PCLMUL implementations of CRC algorithms
* cipher/Makefile.am: Add 'crc-intel-pclmul.c'.
* cipher/crc-intel-pclmul.c: New.
* cipher/crc.c (USE_INTEL_PCLMUL): New macro.
(CRC_CONTEXT) [USE_INTEL_PCLMUL]: Add 'use_pclmul'.
[USE_INTEL_PCLMUL] (_gcry_crc32_intel_pclmul)
(_gcry_crc24rfc2440_intel_pclmul): New.
(crc32_init, crc32rfc1510_init, crc24rfc2440_init)
[USE_INTEL_PCLMUL]: Select PCLMUL implementation if SSE4.1 and PCLMUL
HW features detected.
(crc32_write, crc24rfc2440_write) [USE_INTEL_PCLMUL]: Use PCLMUL
implementation if enabled.
(crc24_init): Document storage format of 24-bit CRC.
(crc24_next4): Use only 'data' for last table look-up.
* configure.ac: Add 'crc-intel-pclmul.lo'.
* src/g10lib.h (HWF_*, HWF_INTEL_SSE4_1): Update HWF flags to include
Intel SSE4.1.
* src/hwf-x86.c (detect_x86_gnuc): Add SSE4.1 detection.
* src/hwfeatures.c (hwflist): Add 'intel-sse4.1'.
* tests/basic.c (fillbuf_count): New.
(check_one_md): Add "?" check (million byte data-set with byte
pattern 0x00,0x01,0x02,...); Test all buffer sizes 1 to 1000, for "!"
and "?" checks.
(check_one_md_multi): Skip "?".
(check_digests): Add "?" test-vectors for MD5, SHA1, SHA224, SHA256,
SHA384, SHA512, SHA3_224, SHA3_256, SHA3_384, SHA3_512, RIPEMD160,
CRC32, CRC32_RFC1510, CRC24_RFC2440, TIGER1 and WHIRLPOOL; Add "!"
test-vectors for CRC32_RFC1510 and CRC24_RFC2440.
--

Add Intel PCLMUL accelerated implementations of CRC algorithms. CRC
performance is improved ~11x on x86_64 and i386 on Intel Haswell, and
~2.7x on Intel Sandy Bridge.

Benchmark on Intel Core i5-4570 (x86_64, 3.2 GHz):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |     0.865 ns/B     1103.0 MiB/s      2.77 c/B
 CRC32RFC1510   |     0.865 ns/B     1102.7 MiB/s      2.77 c/B
 CRC24RFC2440   |     0.865 ns/B     1103.0 MiB/s      2.77 c/B

After:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |     0.079 ns/B    12051.7 MiB/s     0.253 c/B
 CRC32RFC1510   |     0.079 ns/B    12050.6 MiB/s     0.253 c/B
 CRC24RFC2440   |     0.079 ns/B    12100.0 MiB/s     0.252 c/B

Benchmark on Intel Core i5-4570 (i386, 3.2 GHz):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |     0.860 ns/B     1109.0 MiB/s      2.75 c/B
 CRC32RFC1510   |     0.861 ns/B     1108.3 MiB/s      2.75 c/B
 CRC24RFC2440   |     0.860 ns/B     1108.6 MiB/s      2.75 c/B

After:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |     0.078 ns/B    12207.0 MiB/s     0.250 c/B
 CRC32RFC1510   |     0.078 ns/B    12207.0 MiB/s     0.250 c/B
 CRC24RFC2440   |     0.080 ns/B    11925.6 MiB/s     0.256 c/B

Benchmark on Intel Core i5-2450M (x86_64, 2.5 GHz):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |      1.25 ns/B      762.3 MiB/s      3.13 c/B
 CRC32RFC1510   |      1.26 ns/B      759.1 MiB/s      3.14 c/B
 CRC24RFC2440   |      1.25 ns/B      764.9 MiB/s      3.12 c/B

After:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |     0.451 ns/B     2114.3 MiB/s      1.13 c/B
 CRC32RFC1510   |     0.451 ns/B     2114.6 MiB/s      1.13 c/B
 CRC24RFC2440   |     0.457 ns/B     2085.0 MiB/s      1.14 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
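The speed-up comes from carry-less multiplication (PCLMULQDQ): 128-bit
chunks of input are folded together by multiplying with precomputed
powers of x modulo the CRC polynomial, and a final Barrett reduction
maps the remaining 64 bits to the 32-bit CRC. A minimal fold-by-1
sketch using compiler intrinsics (the committed code uses inline
assembly instead; the constants are the reflected x^(32*5) and
x^(32*3) values from crc32_consts in the new file):

  #include <stdint.h>
  #include <stddef.h>
  #include <emmintrin.h>   /* SSE2 */
  #include <wmmintrin.h>   /* PCLMULQDQ */

  static __m128i
  crc32_fold_by_1 (__m128i x0, const uint8_t *buf, size_t nblocks)
  {
    /* low qword: x^(32*5) mod P(x), high qword: x^(32*3) mod P(x) */
    const __m128i k3k4 = _mm_set_epi64x (0x0ccaa009eLL, 0x1751997d0LL);

    while (nblocks--)
      {
        __m128i in = _mm_loadu_si128 ((const __m128i *) buf);
        __m128i lo = _mm_clmulepi64_si128 (x0, k3k4, 0x00);
        __m128i hi = _mm_clmulepi64_si128 (x0, k3k4, 0x11);

        x0 = _mm_xor_si128 (_mm_xor_si128 (lo, hi), in);
        buf += 16;
      }
    return x0; /* the 128->64-bit and Barrett steps still follow */
  }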
-rw-r--r--  cipher/Makefile.am          1
-rw-r--r--  cipher/crc-intel-pclmul.c   912
-rw-r--r--  cipher/crc.c                59
-rw-r--r--  configure.ac                7
-rw-r--r--  src/g10lib.h                34
-rw-r--r--  src/hwf-x86.c               3
-rw-r--r--  src/hwfeatures.c            1
-rw-r--r--  tests/basic.c               94
8 files changed, 1084 insertions, 27 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 65d7afb4..ab71fa79 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -66,6 +66,7 @@ cast5.c cast5-amd64.S cast5-arm.S \
chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
chacha20-armv7-neon.S \
crc.c \
+ crc-intel-pclmul.c \
des.c des-amd64.S \
dsa.c \
elgamal.c \
diff --git a/cipher/crc-intel-pclmul.c b/cipher/crc-intel-pclmul.c
new file mode 100644
index 00000000..5002f804
--- /dev/null
+++ b/cipher/crc-intel-pclmul.c
@@ -0,0 +1,912 @@
+/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+
+#include "bithelp.h"
+#include "bufhelp.h"
+
+
+#if defined(ENABLE_PCLMUL_SUPPORT) && __GNUC__ >= 4 && \
+ ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
+
+
+#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
+/* Prevent compiler from issuing SSE instructions between asm blocks. */
+# pragma GCC target("no-sse")
+#endif
+
+
+#define ALIGNED_16 __attribute__ ((aligned (16)))
+
+
+/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
+ * functions. */
+struct crc32_consts_s
+{
+ /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
+ u64 k[6];
+ /* my_p: { floor(x^64 / P(x)), P(x) } */
+ u64 my_p[2];
+};
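+/* Usage of the constants below: k[0],k[1] fold four 128-bit blocks at
+ * once (a 512-bit stride), k[2],k[3] fold a single 128-bit block, and
+ * k[4] reduces 96 bits to 64 bits; my_p holds the Barrett pair used
+ * for the final 64-bit to 32-bit reduction. */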
+
+
+/* CLMUL constants for CRC32 and CRC32RFC1510. */
+static const struct crc32_consts_s crc32_consts ALIGNED_16 =
+{
+ { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
+ U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
+ U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
+ U64_C(0x163cd6124), 0 /* y = 2 */
+ },
+ { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
+ U64_C(0x1f7011641), U64_C(0x1db710641)
+ }
+};
+
+/* CLMUL constants for CRC24RFC2440 (polynomial multiplied with x⁸). */
+static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 =
+{
+ { /* k[6] = x^(32*y) mod P(x) << 32 */
+ U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */
+ U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */
+ U64_C(0xd9fe8c00) << 32, 0 /* y = 2 */
+ },
+ { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
+ U64_C(0x1f845fe24), U64_C(0x1864cfb00)
+ }
+};
+
+/* Common constants for CRC32 algorithms. */
+static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 =
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+static const byte crc32_shuf_shift[3 * 16] ALIGNED_16 =
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
+ 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+static const byte *crc32_bswap_shuf = &crc32_shuf_shift[16];
+static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 =
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 =
+ {
+ { U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */
+ { U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) },
+ { U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) },
+ { U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) },
+ { U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) },
+ { U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) },
+ { U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */
+ };
+static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 =
+ {
+ { U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */
+ { U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) },
+ { U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */
+ };
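+/* The merge shuffles above combine two overlapping loads (first and
+ * last 8 bytes for 9..15 byte inputs, first and last 4 bytes for
+ * 5..7 byte inputs) into a single zero-padded 16-byte block. */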
+
+/* PCLMUL functions for reflected CRC32. */
+static inline void
+crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ if (inlen >= 8 * 16)
+ {
+ asm volatile ("movd %[crc], %%xmm4\n\t"
+ "movdqu %[inbuf_0], %%xmm0\n\t"
+ "movdqu %[inbuf_1], %%xmm1\n\t"
+ "movdqu %[inbuf_2], %%xmm2\n\t"
+ "movdqu %[inbuf_3], %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16]),
+ [crc] "m" (*pcrc)
+ : );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+
+ asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
+ :
+ : [k1k2] "m" (consts->k[1 - 1])
+ : );
+
+ /* Fold by 4. */
+ while (inlen >= 4 * 16)
+ {
+ asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
+ "movdqa %%xmm0, %%xmm6\n\t"
+ "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm6, %%xmm0\n\t"
+
+ "movdqu %[inbuf_1], %%xmm5\n\t"
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
+ "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "pxor %%xmm6, %%xmm1\n\t"
+
+ "movdqu %[inbuf_2], %%xmm5\n\t"
+ "movdqa %%xmm2, %%xmm6\n\t"
+ "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
+ "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "pxor %%xmm6, %%xmm2\n\t"
+
+ "movdqu %[inbuf_3], %%xmm5\n\t"
+ "movdqa %%xmm3, %%xmm6\n\t"
+ "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
+ "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm6, %%xmm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16])
+ : );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+ }
+
+ asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [k3k4] "m" (consts->k[3 - 1]),
+ [my_p] "m" (consts->my_p[0])
+ : );
+
+ /* Fold 4 to 1. */
+
+ asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+
+ "movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+
+ "movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm3, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ :
+ :
+ : );
+ }
+ else
+ {
+ asm volatile ("movd %[crc], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %[k3k4], %%xmm6\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [inbuf] "m" (*inbuf),
+ [crc] "m" (*pcrc),
+ [k3k4] "m" (consts->k[3 - 1]),
+ [my_p] "m" (consts->my_p[0])
+ : );
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+
+ /* Fold by 1. */
+ if (inlen >= 16)
+ {
+ while (inlen >= 16)
+ {
+ /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
+ asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : );
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+ }
+
+ /* Partial fold. */
+ if (inlen)
+ {
+ /* Load last input and add padding zeros. */
+ asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t"
+ "movdqu %[shl_shuf], %%xmm4\n\t"
+ "movdqu %[mask], %%xmm2\n\t"
+
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pshufb %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf], %%xmm4\n\t"
+ "pshufb %%xmm3, %%xmm1\n\t"
+ "pand %%xmm4, %%xmm2\n\t"
+ "por %%xmm1, %%xmm2\n\t"
+
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*(inbuf - 16 + inlen)),
+ [mask] "m" (crc32_partial_fold_input_mask[inlen]),
+ [shl_shuf] "m" (crc32_refl_shuf_shift[inlen]),
+ [shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16])
+ : );
+
+ inbuf += inlen;
+ inlen -= inlen;
+ }
+
+ /* Final fold. */
+ asm volatile (/* reduce 128-bits to 96-bits */
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
+ "psrldq $8, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* reduce 96-bits to 64-bits */
+ "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
+ "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
+ "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
+ "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */
+
+ /* barrett reduction */
+ "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
+ "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
+ "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC */
+ "pextrd $2, %%xmm0, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [k5] "m" (consts->k[5 - 1])
+ : );
+}
+
+static inline void
+crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ if (inlen < 4)
+ {
+ u32 crc = *pcrc;
+ u32 data;
+
+ asm volatile ("movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [my_p] "m" (consts->my_p[0])
+ : );
+
+ if (inlen == 1)
+ {
+ data = inbuf[0];
+ data ^= crc;
+ data <<= 24;
+ crc >>= 8;
+ }
+ else if (inlen == 2)
+ {
+ data = *((const u16 *)inbuf);
+ data ^= crc;
+ data <<= 16;
+ crc >>= 16;
+ }
+ else
+ {
+ data = *((const u16 *)inbuf);
+ data |= inbuf[2] << 16;
+ data ^= crc;
+ data <<= 8;
+ crc >>= 24;
+ }
+
+ /* Barrett reduction */
+ asm volatile ("movd %[in], %%xmm0\n\t"
+ "movd %[crc], %%xmm1\n\t"
+
+ "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "psllq $32, %%xmm1\n\t"
+ "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ "pextrd $1, %%xmm0, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [in] "rm" (data),
+ [crc] "rm" (crc)
+ : );
+ }
+ else if (inlen == 4)
+ {
+ /* Barrett reduction */
+ asm volatile ("movd %[crc], %%xmm1\n\t"
+ "movd %[in], %%xmm0\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pshufd $0xfc, %%xmm0, %%xmm0\n\t" /* [00][00][00][x] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+
+ "pextrd $1, %%xmm0, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [in] "m" (*inbuf),
+ [crc] "m" (*pcrc),
+ [my_p] "m" (consts->my_p[0])
+ : );
+ }
+ else
+ {
+ asm volatile ("movdqu %[shuf], %%xmm4\n\t"
+ "movd %[crc], %%xmm1\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ "movdqa %[k3k4], %%xmm6\n\t"
+ :
+ : [shuf] "m" (crc32_refl_shuf_shift[inlen]),
+ [crc] "m" (*pcrc),
+ [my_p] "m" (consts->my_p[0]),
+ [k3k4] "m" (consts->k[3 - 1])
+ : );
+
+ if (inlen >= 8)
+ {
+ asm volatile ("movq %[inbuf], %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : );
+ if (inlen > 8)
+ {
+ asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
+ "movq %[inbuf_tail], %%xmm2\n\t"
+ "punpcklqdq %%xmm2, %%xmm0\n\t"
+ "pshufb %[merge_shuf], %%xmm0\n\t"
+ :
+ : [inbuf_tail] "m" (inbuf[inlen - 8]),
+ [merge_shuf] "m"
+ (*crc32_merge9to15_shuf[inlen - 9])
+ : );
+ }
+ }
+ else
+ {
+ asm volatile ("movd %[inbuf], %%xmm0\n\t"
+ "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
+ "pshufb %[merge_shuf], %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf),
+ [inbuf_tail] "m" (inbuf[inlen - 4]),
+ [merge_shuf] "m"
+ (*crc32_merge5to7_shuf[inlen - 5])
+ : );
+ }
+
+ /* Final fold. */
+ asm volatile ("pxor %%xmm1, %%xmm0\n\t"
+ "pshufb %%xmm4, %%xmm0\n\t"
+
+ /* reduce 128-bits to 96-bits */
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
+ "psrldq $8, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */
+
+ /* reduce 96-bits to 64-bits */
+ "pshufd $0xfc, %%xmm0, %%xmm1\n\t" /* [00][00][00][x] */
+ "pshufd $0xf9, %%xmm0, %%xmm0\n\t" /* [00][00][x>>64][x>>32] */
+ "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
+ "pxor %%xmm1, %%xmm0\n\t" /* top 64-bit are zero */
+
+ /* barrett reduction */
+ "pshufd $0xf3, %%xmm0, %%xmm1\n\t" /* [00][00][x>>32][00] */
+ "pslldq $4, %%xmm0\n\t" /* [??][x>>32][??][??] */
+ "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC */
+ "pextrd $2, %%xmm0, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [k5] "m" (consts->k[5 - 1])
+ : );
+ }
+}
+
+/* PCLMUL functions for non-reflected CRC32. */
+static inline void
+crc32_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ asm volatile ("movdqa %[bswap], %%xmm7\n\t"
+ :
+ : [bswap] "m" (*crc32_bswap_shuf)
+ : );
+
+ if (inlen >= 8 * 16)
+ {
+ asm volatile ("movd %[crc], %%xmm4\n\t"
+ "movdqu %[inbuf_0], %%xmm0\n\t"
+ "movdqu %[inbuf_1], %%xmm1\n\t"
+ "movdqu %[inbuf_2], %%xmm2\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf_3], %%xmm3\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm1\n\t"
+ "pshufb %%xmm7, %%xmm2\n\t"
+ "pshufb %%xmm7, %%xmm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16]),
+ [crc] "m" (*pcrc)
+ : );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+
+ asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
+ :
+ : [k1k2] "m" (consts->k[1 - 1])
+ : );
+
+ /* Fold by 4. */
+ while (inlen >= 4 * 16)
+ {
+ asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
+ "movdqa %%xmm0, %%xmm6\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "pclmulqdq $0x01, %%xmm4, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm6, %%xmm0\n\t"
+
+ "movdqu %[inbuf_1], %%xmm5\n\t"
+ "movdqa %%xmm1, %%xmm6\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "pclmulqdq $0x01, %%xmm4, %%xmm1\n\t"
+ "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "pxor %%xmm6, %%xmm1\n\t"
+
+ "movdqu %[inbuf_2], %%xmm5\n\t"
+ "movdqa %%xmm2, %%xmm6\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "pclmulqdq $0x01, %%xmm4, %%xmm2\n\t"
+ "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "pxor %%xmm6, %%xmm2\n\t"
+
+ "movdqu %[inbuf_3], %%xmm5\n\t"
+ "movdqa %%xmm3, %%xmm6\n\t"
+ "pshufb %%xmm7, %%xmm5\n\t"
+ "pclmulqdq $0x01, %%xmm4, %%xmm3\n\t"
+ "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm6, %%xmm3\n\t"
+ :
+ : [inbuf_0] "m" (inbuf[0 * 16]),
+ [inbuf_1] "m" (inbuf[1 * 16]),
+ [inbuf_2] "m" (inbuf[2 * 16]),
+ [inbuf_3] "m" (inbuf[3 * 16])
+ : );
+
+ inbuf += 4 * 16;
+ inlen -= 4 * 16;
+ }
+
+ asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [k3k4] "m" (consts->k[3 - 1]),
+ [my_p] "m" (consts->my_p[0])
+ : );
+
+ /* Fold 4 to 1. */
+
+ asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+
+ "movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+
+ "movdqa %%xmm0, %%xmm4\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
+ "pxor %%xmm3, %%xmm0\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ :
+ :
+ : );
+ }
+ else
+ {
+ asm volatile ("movd %[crc], %%xmm1\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %[k3k4], %%xmm6\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf),
+ [crc] "m" (*pcrc),
+ [k3k4] "m" (consts->k[3 - 1]),
+ [my_p] "m" (consts->my_p[0])
+ : );
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+
+ /* Fold by 1. */
+ if (inlen >= 16)
+ {
+ while (inlen >= 16)
+ {
+ /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
+ asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm2\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : );
+
+ inbuf += 16;
+ inlen -= 16;
+ }
+ }
+
+ /* Partial fold. */
+ if (inlen)
+ {
+ /* Load last input and add padding zeros. */
+ asm volatile ("movdqu %[shl_shuf], %%xmm4\n\t"
+ "movdqu %[shr_shuf], %%xmm3\n\t"
+ "movdqu %[mask], %%xmm2\n\t"
+
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pshufb %%xmm4, %%xmm0\n\t"
+ "movdqu %[inbuf], %%xmm4\n\t"
+ "pshufb %%xmm3, %%xmm1\n\t"
+ "pand %%xmm4, %%xmm2\n\t"
+ "por %%xmm1, %%xmm2\n\t"
+
+ "pshufb %%xmm7, %%xmm2\n\t"
+
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
+ "pclmulqdq $0x10, %%xmm6, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*(inbuf - 16 + inlen)),
+ [mask] "m" (crc32_partial_fold_input_mask[inlen]),
+ [shl_shuf] "m" (crc32_refl_shuf_shift[32 - inlen]),
+ [shr_shuf] "m" (crc32_shuf_shift[inlen + 16])
+ : );
+
+ inbuf += inlen;
+ inlen -= inlen;
+ }
+
+ /* Final fold. */
+ asm volatile (/* reduce 128-bits to 96-bits */
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm0\n\t"
+ "pslldq $8, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t" /* bottom 32-bit are zero */
+
+ /* reduce 96-bits to 64-bits */
+ "pshufd $0x30, %%xmm0, %%xmm1\n\t" /* [00][x>>96][00][00] */
+ "pshufd $0x24, %%xmm0, %%xmm0\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x01, %[k5], %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pxor %%xmm1, %%xmm0\n\t" /* top and bottom 32-bit are zero */
+
+ /* barrett reduction */
+ "pshufd $0x01, %%xmm0, %%xmm1\n\t" /* [00][00][00][x>>32] */
+ "pclmulqdq $0x01, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][xx] */
+ "psrldq $4, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC in input endian */
+ "movd %%xmm0, %%eax\n\t"
+ "bswapl %%eax\n\t"
+ "movl %%eax, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [k5] "m" (consts->k[5 - 1])
+ : "eax" );
+}
+
+static inline void
+crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+ const struct crc32_consts_s *consts)
+{
+ if (inlen < 4)
+ {
+ u32 crc = *pcrc;
+ u32 data;
+
+ asm volatile ("movdqa %[my_p], %%xmm5\n\t"
+ :
+ : [my_p] "m" (consts->my_p[0])
+ : );
+
+ if (inlen == 1)
+ {
+ data = inbuf[0];
+ data ^= crc;
+ data = _gcry_bswap32(data << 24);
+ crc = _gcry_bswap32(crc >> 8);
+ }
+ else if (inlen == 2)
+ {
+ data = *((const u16 *)inbuf);
+ data ^= crc;
+ data = _gcry_bswap32(data << 16);
+ crc = _gcry_bswap32(crc >> 16);
+ }
+ else
+ {
+ data = *((const u16 *)inbuf);
+ data |= inbuf[2] << 16;
+ data ^= crc;
+ data = _gcry_bswap32(data << 8);
+ crc = _gcry_bswap32(crc >> 24);
+ }
+
+ /* Barrett reduction */
+ asm volatile ("movd %[in], %%xmm0\n\t"
+ "psllq $32, %%xmm0\n\t" /* [00][00][xx][00] */
+ "movd %[crc], %%xmm1\n\t"
+
+ "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x11, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC in input endian */
+ "movd %%xmm0, %%eax\n\t"
+ "bswapl %%eax\n\t"
+ "movl %%eax, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [in] "r" (data),
+ [crc] "r" (crc)
+ : "eax" );
+ }
+ else if (inlen == 4)
+ {
+ /* Barrett reduction */
+ asm volatile ("movd %[crc], %%xmm0\n\t"
+ "movd %[in], %%xmm1\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "pshufb %[bswap], %%xmm0\n\t" /* [xx][00][00][00] */
+
+ "pclmulqdq $0x01, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x11, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
+
+ /* store CRC in input endian */
+ "movd %%xmm0, %%eax\n\t"
+ "bswapl %%eax\n\t"
+ "movl %%eax, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [in] "m" (*inbuf),
+ [crc] "m" (*pcrc),
+ [my_p] "m" (consts->my_p[0]),
+ [bswap] "m" (*crc32_bswap_shuf)
+ : "eax" );
+ }
+ else
+ {
+ asm volatile ("movdqu %[shuf], %%xmm7\n\t"
+ "movd %[crc], %%xmm1\n\t"
+ "movdqa %[my_p], %%xmm5\n\t"
+ "movdqa %[k3k4], %%xmm6\n\t"
+ :
+ : [shuf] "m" (crc32_shuf_shift[32 - inlen]),
+ [crc] "m" (*pcrc),
+ [my_p] "m" (consts->my_p[0]),
+ [k3k4] "m" (consts->k[3 - 1])
+ : );
+
+ if (inlen >= 8)
+ {
+ asm volatile ("movq %[inbuf], %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf)
+ : );
+ if (inlen > 8)
+ {
+ asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
+ "movq %[inbuf_tail], %%xmm2\n\t"
+ "punpcklqdq %%xmm2, %%xmm0\n\t"
+ "pshufb %[merge_shuf], %%xmm0\n\t"
+ :
+ : [inbuf_tail] "m" (inbuf[inlen - 8]),
+ [merge_shuf] "m"
+ (*crc32_merge9to15_shuf[inlen - 9])
+ : );
+ }
+ }
+ else
+ {
+ asm volatile ("movd %[inbuf], %%xmm0\n\t"
+ "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
+ "pshufb %[merge_shuf], %%xmm0\n\t"
+ :
+ : [inbuf] "m" (*inbuf),
+ [inbuf_tail] "m" (inbuf[inlen - 4]),
+ [merge_shuf] "m"
+ (*crc32_merge5to7_shuf[inlen - 5])
+ : );
+ }
+
+ /* Final fold. */
+ asm volatile ("pxor %%xmm1, %%xmm0\n\t"
+ "pshufb %%xmm7, %%xmm0\n\t"
+
+ /* reduce 128-bits to 96-bits */
+ "movdqa %%xmm0, %%xmm1\n\t"
+ "pclmulqdq $0x11, %%xmm6, %%xmm0\n\t"
+ "pslldq $8, %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t" /* bottom 32-bit are zero */
+
+ /* reduce 96-bits to 64-bits */
+ "pshufd $0x30, %%xmm0, %%xmm1\n\t" /* [00][x>>96][00][00] */
+ "pshufd $0x24, %%xmm0, %%xmm0\n\t" /* [00][xx][xx][00] */
+ "pclmulqdq $0x01, %[k5], %%xmm1\n\t" /* [00][xx][xx][00] */
+ "pxor %%xmm1, %%xmm0\n\t" /* top and bottom 32-bit are zero */
+
+ /* barrett reduction */
+ "pshufd $0x01, %%xmm0, %%xmm1\n\t" /* [00][00][00][x>>32] */
+ "pclmulqdq $0x01, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][xx] */
+ "psrldq $4, %%xmm0\n\t" /* [00][00][xx][xx] */
+ "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+
+ /* store CRC in input endian */
+ "movd %%xmm0, %%eax\n\t"
+ "bswapl %%eax\n\t"
+ "movl %%eax, %[out]\n\t"
+ : [out] "=m" (*pcrc)
+ : [k5] "m" (consts->k[5 - 1])
+ : "eax" );
+ }
+}
+
+void
+_gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+ const struct crc32_consts_s *consts = &crc32_consts;
+#if defined(__x86_64__) && defined(__WIN64__)
+ char win64tmp[2 * 16];
+
+ /* XMM6-XMM7 need to be restored after use. */
+ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
+ "movdqu %%xmm7, 1*16(%0)\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory");
+#endif
+
+ if (!inlen)
+ return;
+
+ if (inlen >= 16)
+ crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
+ else
+ crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);
+
+#if defined(__x86_64__) && defined(__WIN64__)
+ /* Restore used registers. */
+ asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
+ "movdqu 1*16(%0), %%xmm7\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory");
+#endif
+}
+
+void
+_gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+ const struct crc32_consts_s *consts = &crc24rfc2440_consts;
+#if defined(__x86_64__) && defined(__WIN64__)
+ char win64tmp[2 * 16];
+
+ /* XMM6-XMM7 need to be restored after use. */
+ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
+ "movdqu %%xmm7, 1*16(%0)\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory");
+#endif
+
+ if (!inlen)
+ return;
+
+ /* Note: *pcrc in input endian. */
+
+ if (inlen >= 16)
+ crc32_bulk(pcrc, inbuf, inlen, consts);
+ else
+ crc32_less_than_16(pcrc, inbuf, inlen, consts);
+
+#if defined(__x86_64__) && defined(__WIN64__)
+ /* Restore used registers. */
+ asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
+ "movdqu 1*16(%0), %%xmm7\n\t"
+ :
+ : "r" (win64tmp)
+ : "memory");
+#endif
+}
+
+#endif /* USE_INTEL_PCLMUL */
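As a review aid, the reflected path can be cross-checked against a
plain bitwise CRC-32 over the same raw state. This reference helper is
hypothetical (not part of the patch) and uses the reflected polynomial
0xedb88320, which is what crc32_consts encodes:

  #include <stdint.h>
  #include <stddef.h>

  static uint32_t
  crc32_bitwise_ref (uint32_t crc, const uint8_t *buf, size_t len)
  {
    int i;

    while (len--)
      {
        crc ^= *buf++;
        for (i = 0; i < 8; i++)
          crc = (crc >> 1) ^ (0xedb88320u & -(crc & 1u));
      }
    return crc;
  }

Starting both from the same *pcrc value, this helper and
_gcry_crc32_intel_pclmul should agree for every input length, including
the 1..15 byte tails handled by crc32_reflected_less_than_16.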
diff --git a/cipher/crc.c b/cipher/crc.c
index 46a185a8..ee0e4e2e 100644
--- a/cipher/crc.c
+++ b/cipher/crc.c
@@ -31,14 +31,37 @@
#include "bufhelp.h"
+/* USE_INTEL_PCLMUL indicates whether to compile CRC with Intel PCLMUL
+ * code. */
+#undef USE_INTEL_PCLMUL
+#ifdef ENABLE_PCLMUL_SUPPORT
+# if ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))
+# if __GNUC__ >= 4
+# define USE_INTEL_PCLMUL 1
+# endif
+# endif
+#endif /* USE_INTEL_PCLMUL */
+
+
typedef struct
{
u32 CRC;
+#ifdef USE_INTEL_PCLMUL
+ unsigned int use_pclmul:1; /* Intel PCLMUL shall be used. */
+#endif
byte buf[4];
}
CRC_CONTEXT;
+#ifdef USE_INTEL_PCLMUL
+/*-- crc-intel-pclmul.c --*/
+void _gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen);
+void _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf,
+ size_t inlen);
+#endif
+
+
/*
* Code generated by universal_crc by Danjel McGougan
*
@@ -338,6 +361,11 @@ static void
crc32_init (void *context, unsigned int flags)
{
CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+#ifdef USE_INTEL_PCLMUL
+ u32 hwf = _gcry_get_hw_features ();
+
+ ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
+#endif
(void)flags;
@@ -351,6 +379,14 @@ crc32_write (void *context, const void *inbuf_arg, size_t inlen)
const byte *inbuf = inbuf_arg;
u32 crc;
+#ifdef USE_INTEL_PCLMUL
+ if (ctx->use_pclmul)
+ {
+ _gcry_crc32_intel_pclmul(&ctx->CRC, inbuf, inlen);
+ return;
+ }
+#endif
+
if (!inbuf || !inlen)
return;
@@ -403,6 +439,11 @@ static void
crc32rfc1510_init (void *context, unsigned int flags)
{
CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+#ifdef USE_INTEL_PCLMUL
+ u32 hwf = _gcry_get_hw_features ();
+
+ ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
+#endif
(void)flags;
@@ -694,7 +735,8 @@ static const u32 crc24_table[1024] =
static inline
u32 crc24_init (void)
{
- return 0xce04b7;
+ /* Transformed to a 32-bit CRC by multiplying by x⁸ and then byte swapping. */
+ return 0xce04b7; /* _gcry_bswap32(0xb704ce << 8) */
}
static inline
@@ -713,7 +755,7 @@ u32 crc24_next4 (u32 crc, u32 data)
crc = crc24_table[(crc & 0xff) + 0x300] ^
crc24_table[((crc >> 8) & 0xff) + 0x200] ^
crc24_table[((crc >> 16) & 0xff) + 0x100] ^
- crc24_table[(crc >> 24) & 0xff];
+ crc24_table[(data >> 24) & 0xff];
return crc;
}
@@ -727,6 +769,11 @@ static void
crc24rfc2440_init (void *context, unsigned int flags)
{
CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
+#ifdef USE_INTEL_PCLMUL
+ u32 hwf = _gcry_get_hw_features ();
+
+ ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
+#endif
(void)flags;
@@ -740,6 +787,14 @@ crc24rfc2440_write (void *context, const void *inbuf_arg, size_t inlen)
CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
u32 crc;
+#ifdef USE_INTEL_PCLMUL
+ if (ctx->use_pclmul)
+ {
+ _gcry_crc24rfc2440_intel_pclmul(&ctx->CRC, inbuf, inlen);
+ return;
+ }
+#endif
+
if (!inbuf || !inlen)
return;
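The 24-bit CRC storage format documented in crc24_init above can be
checked with a small stand-alone computation (illustrative only; this
bswap32 is a local helper, not a libgcrypt function):

  #include <stdint.h>
  #include <stdio.h>

  static uint32_t
  bswap32 (uint32_t x)
  {
    return (x << 24) | ((x & 0xff00u) << 8)
           | ((x >> 8) & 0xff00u) | (x >> 24);
  }

  int
  main (void)
  {
    uint32_t crc24  = 0xb704ce;              /* RFC 2440 init value   */
    uint32_t stored = bswap32 (crc24 << 8);  /* multiply by x^8, swap */
    printf ("%08x\n", stored);               /* prints 00ce04b7       */
    return 0;
  }

This representation also explains the crc24_next4 change: the top byte
of the stored 'crc' stays zero (xors of table entries in the same
format preserve it), so after 'crc ^= data' the top byte equals that of
'data'; indexing the last table with 'data' directly is therefore
equivalent and shortens the dependency chain.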
diff --git a/configure.ac b/configure.ac
index 8b503609..ff72e3fd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2023,6 +2023,13 @@ LIST_MEMBER(crc, $enabled_digests)
if test "$found" = "1" ; then
GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc.lo"
AC_DEFINE(USE_CRC, 1, [Defined if this module should be included])
+
+ case "${host}" in
+ i?86-*-* | x86_64-*-*)
+ # Build with the assembly implementation
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo"
+ ;;
+ esac
fi
LIST_MEMBER(gostr3411-94, $enabled_digests)
diff --git a/src/g10lib.h b/src/g10lib.h
index 9d2ece9c..7352556a 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -194,23 +194,23 @@ char **_gcry_strtokenize (const char *string, const char *delim);
/*-- src/hwfeatures.c --*/
-/* (Do not change these values unless synced with the asm code.) */
-#define HWF_PADLOCK_RNG 1
-#define HWF_PADLOCK_AES 2
-#define HWF_PADLOCK_SHA 4
-#define HWF_PADLOCK_MMUL 8
-
-#define HWF_INTEL_CPU 16
-#define HWF_INTEL_FAST_SHLD 32
-#define HWF_INTEL_BMI2 64
-#define HWF_INTEL_SSSE3 128
-#define HWF_INTEL_PCLMUL 256
-#define HWF_INTEL_AESNI 512
-#define HWF_INTEL_RDRAND 1024
-#define HWF_INTEL_AVX 2048
-#define HWF_INTEL_AVX2 4096
-
-#define HWF_ARM_NEON 8192
+#define HWF_PADLOCK_RNG (1 << 0)
+#define HWF_PADLOCK_AES (1 << 1)
+#define HWF_PADLOCK_SHA (1 << 2)
+#define HWF_PADLOCK_MMUL (1 << 3)
+
+#define HWF_INTEL_CPU (1 << 4)
+#define HWF_INTEL_FAST_SHLD (1 << 5)
+#define HWF_INTEL_BMI2 (1 << 6)
+#define HWF_INTEL_SSSE3 (1 << 7)
+#define HWF_INTEL_SSE4_1 (1 << 8)
+#define HWF_INTEL_PCLMUL (1 << 9)
+#define HWF_INTEL_AESNI (1 << 10)
+#define HWF_INTEL_RDRAND (1 << 11)
+#define HWF_INTEL_AVX (1 << 12)
+#define HWF_INTEL_AVX2 (1 << 13)
+
+#define HWF_ARM_NEON (1 << 14)
gpg_err_code_t _gcry_disable_hw_feature (const char *name);
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index fbd63315..eeacccb9 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -277,6 +277,9 @@ detect_x86_gnuc (void)
/* Test bit 9 for SSSE3. */
if (features & 0x00000200)
result |= HWF_INTEL_SSSE3;
+ /* Test bit 19 for SSE4.1. */
+ if (features & 0x00080000)
+ result |= HWF_INTEL_SSE4_1;
#ifdef ENABLE_AESNI_SUPPORT
/* Test bit 25 for AES-NI. */
if (features & 0x02000000)
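For reference, these checks read the architecturally defined CPUID
leaf 1 ECX feature bits: bit 1 is PCLMULQDQ, bit 9 is SSSE3 and bit 19
is SSE4.1, so the new test could equivalently be written as:

  /* Test bit 19 for SSE4.1. */
  if ((features >> 19) & 1)
    result |= HWF_INTEL_SSE4_1;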
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index e7c55cc3..4cafae1b 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -50,6 +50,7 @@ static struct
{ HWF_INTEL_FAST_SHLD, "intel-fast-shld" },
{ HWF_INTEL_BMI2, "intel-bmi2" },
{ HWF_INTEL_SSSE3, "intel-ssse3" },
+ { HWF_INTEL_SSE4_1, "intel-sse4.1" },
{ HWF_INTEL_PCLMUL, "intel-pclmul" },
{ HWF_INTEL_AESNI, "intel-aesni" },
{ HWF_INTEL_RDRAND, "intel-rdrand" },
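With the new hwflist entry, applications can force the table-based
fallback for testing. A minimal sketch (GCRYCTL_DISABLE_HWF must run
before initialization; masking either feature suffices, since the CRC
code requires both):

  #include <gcrypt.h>

  int
  main (void)
  {
    /* Drop SSE4.1 so ctx->use_pclmul stays zero. */
    gcry_control (GCRYCTL_DISABLE_HWF, "intel-sse4.1", NULL);
    gcry_check_version (NULL);
    gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);
    return 0;
  }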
diff --git a/tests/basic.c b/tests/basic.c
index 7d5de00f..5e7ee44b 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -5267,6 +5267,14 @@ check_cipher_modes(void)
static void
+fillbuf_count (char *buf, size_t buflen, unsigned char pos)
+{
+ while (buflen--)
+ *((unsigned char *)(buf++)) = pos++;
+}
+
+
+static void
check_one_md (int algo, const char *data, int len, const char *expect, int elen)
{
gcry_md_hd_t hd, hd2;
@@ -5297,14 +5305,33 @@ check_one_md (int algo, const char *data, int len, const char *expect, int elen)
}
}
- if (*data == '!' && !data[1])
- { /* hash one million times a "a" */
+ if ((*data == '!' && !data[1]) || /* hash one million times a "a" */
(*data == '?' && !data[1])) /* hash a million-byte data-set with byte pattern 0x00,0x01,0x02,... */
+ {
char aaa[1000];
size_t left = 1000 * 1000;
size_t startlen = 1;
size_t piecelen = startlen;
- memset (aaa, 'a', 1000);
+ if (*data == '!')
+ memset (aaa, 'a', 1000);
+
+ /* Write in chunks with all sizes 1 to 1000 (500500 bytes). */
+ for (i = 1; i <= 1000 && left > 0; i++)
+ {
+ piecelen = i;
+ if (piecelen > sizeof(aaa))
+ piecelen = sizeof(aaa);
+ if (piecelen > left)
+ piecelen = left;
+
+ if (*data == '?')
+ fillbuf_count(aaa, piecelen, 1000 * 1000 - left);
+
+ gcry_md_write (hd, aaa, piecelen);
+
+ left -= piecelen;
+ }
/* Write in odd size chunks so that we test the buffering. */
while (left > 0)
@@ -5314,6 +5341,9 @@ check_one_md (int algo, const char *data, int len, const char *expect, int elen)
if (piecelen > left)
piecelen = left;
+ if (*data == '?')
+ fillbuf_count(aaa, piecelen, 1000 * 1000 - left);
+
gcry_md_write (hd, aaa, piecelen);
left -= piecelen;
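The new "?" vectors can be reproduced with a one-shot call through the
public API; a sketch (buffer bytes are simply the low byte of the
offset, matching fillbuf_count):

  unsigned char *buf = gcry_xmalloc (1000 * 1000);
  unsigned char digest[4];
  size_t i;

  for (i = 0; i < 1000 * 1000; i++)
    buf[i] = i & 0xff;
  gcry_md_hash_buffer (GCRY_MD_CRC32, digest, buf, 1000 * 1000);
  /* digest should equal "\x61\x82\x29\x1B", the vector added below */
  gcry_free (buf);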
@@ -5526,6 +5556,8 @@ check_one_md_multi (int algo, const char *data, int len, const char *expect)
if (*data == '!' && !data[1])
return; /* We can't do that here. */
+ if (*data == '?' && !data[1])
+ return; /* We can't do that here. */
memset (iov, 0, sizeof iov);
@@ -5616,6 +5648,8 @@ check_digests (void)
"\xc4\x1a\x5c\x0b\x44\x5f\xba\x1a\xda\xbc\xc0\x38\x0e\x0c\x9e\x33" },
{ GCRY_MD_MD5, "!",
"\x77\x07\xd6\xae\x4e\x02\x7c\x70\xee\xa2\xa9\x35\xc2\x29\x6f\x21" },
+ { GCRY_MD_MD5, "?",
+ "\x5c\x72\x5c\xbc\x2d\xbb\xe1\x14\x81\x59\xe9\xd9\xcf\x90\x64\x8f" },
{ GCRY_MD_SHA1, "abc",
"\xA9\x99\x3E\x36\x47\x06\x81\x6A\xBA\x3E"
"\x25\x71\x78\x50\xC2\x6C\x9C\xD0\xD8\x9D" },
@@ -5626,6 +5660,9 @@ check_digests (void)
{ GCRY_MD_SHA1, "!" /* kludge for "a"*1000000 */ ,
"\x34\xAA\x97\x3C\xD4\xC4\xDA\xA4\xF6\x1E"
"\xEB\x2B\xDB\xAD\x27\x31\x65\x34\x01\x6F" },
+ { GCRY_MD_SHA1, "?" /* kludge for "\x00\x01\x02"..."\xfe\xff\x00\x01"... (length 1000000) */ ,
+ "\x5f\x8d\x3c\x4f\x12\xf0\x49\x9e\x28\x73"
+ "\x79\xec\x97\x3b\x98\x4c\x94\x75\xaa\x8f" },
{ GCRY_MD_SHA1,
"Libgcrypt is free software; you can redistribute it and/or modif"
"y it under the terms of the GNU Lesser general Public License as"
@@ -5648,6 +5685,9 @@ check_digests (void)
{ GCRY_MD_SHA224, "!",
"\x20\x79\x46\x55\x98\x0c\x91\xd8\xbb\xb4\xc1\xea\x97\x61\x8a\x4b"
"\xf0\x3f\x42\x58\x19\x48\xb2\xee\x4e\xe7\xad\x67" },
+ { GCRY_MD_SHA224, "?",
+ "\xfa\xb9\xf0\xdf\x12\xfe\xa1\x1a\x34\x78\x96\x31\xe6\x53\x48\xbf"
+ "\x3b\xca\x70\x78\xf2\x44\xdf\x62\xab\x27\xb8\xda" },
{ GCRY_MD_SHA224,
"Libgcrypt is free software; you can redistribute it and/or modif"
"y it under the terms of the GNU Lesser general Public License as"
@@ -5669,6 +5709,9 @@ check_digests (void)
{ GCRY_MD_SHA256, "!",
"\xcd\xc7\x6e\x5c\x99\x14\xfb\x92\x81\xa1\xc7\xe2\x84\xd7\x3e\x67"
"\xf1\x80\x9a\x48\xa4\x97\x20\x0e\x04\x6d\x39\xcc\xc7\x11\x2c\xd0" },
+ { GCRY_MD_SHA256, "?",
+ "\x67\x87\x0d\xfc\x9c\x64\xe7\xaa\x27\x0a\x3f\x7e\x80\x51\xae\x65"
+ "\xd2\x07\xf9\x3f\xc3\xdf\x04\xd7\x57\x2e\x63\x65\xaf\x69\xcd\x0d" },
{ GCRY_MD_SHA256,
"Libgcrypt is free software; you can redistribute it and/or modif"
"y it under the terms of the GNU Lesser general Public License as"
@@ -5700,6 +5743,10 @@ check_digests (void)
"\x9d\x0e\x18\x09\x71\x64\x74\xcb\x08\x6e\x83\x4e\x31\x0a\x4a\x1c"
"\xed\x14\x9e\x9c\x00\xf2\x48\x52\x79\x72\xce\xc5\x70\x4c\x2a\x5b"
"\x07\xb8\xb3\xdc\x38\xec\xc4\xeb\xae\x97\xdd\xd8\x7f\x3d\x89\x85" },
+ { GCRY_MD_SHA384, "?",
+ "\xfa\x77\xbb\x86\x3a\xd5\xae\x88\xa9\x9c\x5e\xda\xb5\xc7\xcb\x40"
+ "\xcd\xf4\x30\xef\xa8\x1b\x23\x7b\xa9\xde\xfd\x81\x12\xf6\x7e\xed"
+ "\xa7\xd2\x27\x91\xd1\xbc\x76\x44\x57\x59\x71\x11\xe6\x8a\x2c\xde" },
{ GCRY_MD_SHA512, "abc",
"\xDD\xAF\x35\xA1\x93\x61\x7A\xBA\xCC\x41\x73\x49\xAE\x20\x41\x31"
"\x12\xE6\xFA\x4E\x89\xA9\x7E\xA2\x0A\x9E\xEE\xE6\x4B\x55\xD3\x9A"
@@ -5723,6 +5770,11 @@ check_digests (void)
"\x8e\x1f\x98\xb1\x3b\x20\x44\x28\x56\x32\xa8\x03\xaf\xa9\x73\xeb"
"\xde\x0f\xf2\x44\x87\x7e\xa6\x0a\x4c\xb0\x43\x2c\xe5\x77\xc3\x1b"
"\xeb\x00\x9c\x5c\x2c\x49\xaa\x2e\x4e\xad\xb2\x17\xad\x8c\xc0\x9b" },
+ { GCRY_MD_SHA512, "?",
+ "\x91\xe9\x42\x4e\xa9\xdc\x44\x01\x40\x64\xa4\x5a\x69\xcc\xac\xa3"
+ "\x74\xee\x78\xeb\x79\x1f\x94\x38\x5b\x73\xef\xf8\xfd\x5d\x74\xd8"
+ "\x51\x36\xfe\x63\x52\xde\x07\x70\x95\xd6\x78\x2b\x7b\x46\x8a\x2c"
+ "\x30\x0f\x48\x0c\x74\x43\x06\xdb\xa3\x8d\x64\x3d\xe9\xa1\xa7\x72" },
{ GCRY_MD_SHA3_224, "abc",
"\xe6\x42\x82\x4c\x3f\x8c\xf2\x4a\xd0\x92\x34\xee\x7d\x3c\x76\x6f"
"\xc9\xa3\xa5\x16\x8d\x0c\x94\xad\x73\xb4\x6f\xdf" },
@@ -5806,6 +5858,21 @@ check_digests (void)
"\x12\x0a\x2a\x53\x70\x21\x2d\xff\xb3\x38\x5a\x18\xd4\xf3\x88\x59"
"\xed\x31\x1d\x0a\x9d\x51\x41\xce\x9c\xc5\xc6\x6e\xe6\x89\xb2\x66"
"\xa8\xaa\x18\xac\xe8\x28\x2a\x0e\x0d\xb5\x96\xc9\x0b\x0a\x7b\x87" },
+ { GCRY_MD_SHA3_224, "?",
+ "\x1b\xd1\xc6\x12\x02\x35\x52\x8b\x44\x7e\x16\x39\x20\x05\xec\x67"
+ "\x2d\x57\x20\xe0\x90\xc9\x78\x08\x86\x4f\x1b\xd0" },
+ { GCRY_MD_SHA3_256, "?",
+ "\xfe\xb7\xf4\x76\x78\x97\x48\x2f\xe2\x29\x1b\x66\x85\xc1\x7b\x45"
+ "\xc5\x08\xed\x82\x50\xcc\x5d\x99\x96\xd2\xc3\x82\x1a\xa8\xd4\xa7" },
+ { GCRY_MD_SHA3_384, "?",
+ "\x45\x1f\x0b\x93\x4b\xca\x3e\x65\x93\xd4\xaa\x8c\x18\xc1\x04\x84"
+ "\x12\xd5\x1e\x35\xe1\x05\xd9\x77\x3f\xc1\x08\x8b\x77\x36\xad\x4a"
+ "\x33\x70\xaf\x49\x8b\xea\x4c\x5c\x52\xe7\x5b\xed\x31\x74\x57\x12" },
+ { GCRY_MD_SHA3_512, "?",
+ "\xa2\xee\xb5\x6f\x2a\x87\xa5\xb3\x9b\xd9\x1c\xf0\xaa\xdf\xb1\xd5"
+ "\xad\x0a\x1a\xaa\xd3\x63\x81\xcf\xb8\x7c\x36\xa7\x80\x3b\x03\xd6"
+ "\x31\x5c\x5d\x33\x8e\x52\xb1\x42\x4d\x27\x1c\xa2\xa5\xf2\xc5\x97"
+ "\x10\x12\xe5\xee\x86\xa3\xcc\xaf\x91\x7a\x94\x28\x65\xea\x66\xe3" },
{ GCRY_MD_RMD160, "",
"\x9c\x11\x85\xa5\xc5\xe9\xfc\x54\x61\x28"
"\x08\x97\x7e\xe8\xf5\x48\xb2\x25\x8d\x31" },
@@ -5832,6 +5899,9 @@ check_digests (void)
{ GCRY_MD_RMD160, "!",
"\x52\x78\x32\x43\xc1\x69\x7b\xdb\xe1\x6d\x37\xf9\x7f\x68\xf0\x83"
"\x25\xdc\x15\x28" },
+ { GCRY_MD_RMD160, "?",
+ "\x68\x14\x86\x70\x3d\x51\x4e\x36\x68\x50\xf8\xb3\x00\x75\xda\x49"
+ "\x0a\xaa\x2c\xf6" },
{ GCRY_MD_CRC32, "", "\x00\x00\x00\x00" },
{ GCRY_MD_CRC32, "foo", "\x8c\x73\x65\x21" },
{ GCRY_MD_CRC32,
@@ -5846,6 +5916,7 @@ check_digests (void)
"\x4A\x53\x7D\x67" },
{ GCRY_MD_CRC32, "123456789", "\xcb\xf4\x39\x26" },
{ GCRY_MD_CRC32, "!", "\xdc\x25\xbf\xbc" },
+ { GCRY_MD_CRC32, "?", "\x61\x82\x29\x1B" },
{ GCRY_MD_CRC32_RFC1510, "", "\x00\x00\x00\x00" },
{ GCRY_MD_CRC32_RFC1510, "foo", "\x73\x32\xbc\x33" },
{ GCRY_MD_CRC32_RFC1510, "test0123456789", "\xb8\x3e\x88\xd6" },
@@ -5858,9 +5929,13 @@ check_digests (void)
{ GCRY_MD_CRC32_RFC1510, "\x80\x00\x00\x00", "\xed\x59\xb6\x3b", 4 },
{ GCRY_MD_CRC32_RFC1510, "\x00\x00\x00\x01", "\x77\x07\x30\x96", 4 },
{ GCRY_MD_CRC32_RFC1510, "123456789", "\x2d\xfd\x2d\x88" },
+ { GCRY_MD_CRC32_RFC1510, "!", "\xce\x5c\x74\x22" },
+ { GCRY_MD_CRC32_RFC1510, "?", "\x73\xfb\xe2\x85" },
{ GCRY_MD_CRC24_RFC2440, "", "\xb7\x04\xce" },
{ GCRY_MD_CRC24_RFC2440, "foo", "\x4f\xc2\x55" },
{ GCRY_MD_CRC24_RFC2440, "123456789", "\x21\xcf\x02" },
+ { GCRY_MD_CRC24_RFC2440, "!", "\xa5\xcb\x6b" },
+ { GCRY_MD_CRC24_RFC2440, "?", "\x7f\x67\x03" },
{ GCRY_MD_TIGER, "",
"\x24\xF0\x13\x0C\x63\xAC\x93\x32\x16\x16\x6E\x76"
@@ -5942,6 +6017,9 @@ check_digests (void)
"ral Public License for more details.",
"\x60\xee\xdf\x95\x39\xc8\x44\x94\x64\xdc\xdf\x3d\x2e\x1c\xe5\x79"
"\x6a\x95\xbd\x30\x68\x8c\x7e\xb8" },
+ { GCRY_MD_TIGER1, "?",
+ "\x4b\xe2\x3f\x23\xf5\x34\xbe\xbf\x97\x42\x95\x80"
+ "\x54\xe4\x6c\x12\x64\x85\x44\x0a\xa9\x49\x9b\x65" },
{ GCRY_MD_TIGER2, "",
"\x44\x41\xBE\x75\xF6\x01\x87\x73\xC2\x06\xC2\x27"
@@ -5986,11 +6064,11 @@ check_digests (void)
"\xF0\xDF\xF5\x94\x13\x14\x5E\x69\x73\xC4\x50\x01\xD0\x08\x7B\x42"
"\xD1\x1B\xC6\x45\x41\x3A\xEF\xF6\x3A\x42\x39\x1A\x39\x14\x5A\x59"
"\x1A\x92\x20\x0D\x56\x01\x95\xE5\x3B\x47\x85\x84\xFD\xAE\x23\x1A" },
- { GCRY_MD_WHIRLPOOL, "a",
- "\x8A\xCA\x26\x02\x79\x2A\xEC\x6F\x11\xA6\x72\x06\x53\x1F\xB7\xD7"
- "\xF0\xDF\xF5\x94\x13\x14\x5E\x69\x73\xC4\x50\x01\xD0\x08\x7B\x42"
- "\xD1\x1B\xC6\x45\x41\x3A\xEF\xF6\x3A\x42\x39\x1A\x39\x14\x5A\x59"
- "\x1A\x92\x20\x0D\x56\x01\x95\xE5\x3B\x47\x85\x84\xFD\xAE\x23\x1A" },
+ { GCRY_MD_WHIRLPOOL, "?",
+ "\x88\xf0\x78\x6d\x0d\x47\xe5\x32\x1f\x88\xb1\x48\x05\x53\x58\x7d"
+ "\x19\x4b\x32\x9b\xf1\xfb\x17\xc5\x98\x3a\x87\xa2\x48\x61\x3d\x2b"
+ "\xb2\xbc\x9f\x0d\xd2\x14\x37\x30\x55\x30\x91\xa7\xb8\x0c\x0f\x80"
+ "\x7c\x7b\x94\xf6\x55\xf6\x0b\x12\x85\x0c\x8e\x6d\x17\x5b\x1e\x71" },
{ GCRY_MD_WHIRLPOOL,
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
"\xDC\x37\xE0\x08\xCF\x9E\xE6\x9B\xF1\x1F\x00\xED\x9A\xBA\x26\x90"