diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-23 22:39:47 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-28 20:08:54 +0200 |
commit | 909644ef5883927262366c356eed530e55aba478 (patch) | |
tree | 71c03dfeae98a5a7ab1118663a877cd3941f1dba | |
parent | 16fd540f4d01eb6dc23d9509ae549353617c7a67 (diff) | |
download | libgcrypt-909644ef5883927262366c356eed530e55aba478.tar.gz |
hwf-x86: add detection for Intel CPUs with fast SHLD instruction
* cipher/sha1.c (sha1_init): Use HWF_INTEL_FAST_SHLD instead of
HWF_INTEL_CPU.
* cipher/sha256.c (sha256_init, sha224_init): Ditto.
* cipher/sha512.c (sha512_init, sha384_init): Ditto.
* src/g10lib.h (HWF_INTEL_FAST_SHLD): New.
(HWF_INTEL_BMI2, HWF_INTEL_SSSE3, HWF_INTEL_PCLMUL, HWF_INTEL_AESNI)
(HWF_INTEL_RDRAND, HWF_INTEL_AVX, HWF_INTEL_AVX2)
(HWF_ARM_NEON): Update.
* src/hwf-x86.c (detect_x86_gnuc): Add detection of Intel Core
CPUs with fast SHLD/SHRD instruction.
* src/hwfeatures.c (hwflist): Add "intel-fast-shld".
--
Intel Core CPUs since codename Sandy Bridge have been able to
execute SHLD/SHRD instructions faster than rotate instructions
ROL/ROR. Since SHLD/SHRD can be used to do rotation, some
optimized implementations (SHA1/SHA256/SHA512) use SHLD/SHRD
instructions in place of ROL/ROR.
This patch provides more accurate detection of CPUs with
fast SHLD implementation.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/sha1.c | 2 | ||||
-rw-r--r-- | cipher/sha256.c | 4 | ||||
-rw-r--r-- | cipher/sha512.c | 4 | ||||
-rw-r--r-- | src/g10lib.h | 21 | ||||
-rw-r--r-- | src/hwf-x86.c | 34 | ||||
-rw-r--r-- | src/hwfeatures.c | 27 |
6 files changed, 62 insertions, 30 deletions
diff --git a/cipher/sha1.c b/cipher/sha1.c index eb428835..554d55ce 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -136,7 +136,7 @@ sha1_init (void *context, unsigned int flags) #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. */ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_BMI2 hd->use_bmi2 = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2); diff --git a/cipher/sha256.c b/cipher/sha256.c index 59ffa434..63869d54 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -124,7 +124,7 @@ sha256_init (void *context, unsigned int flags) #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. */ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_AVX2 hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); @@ -162,7 +162,7 @@ sha224_init (void *context, unsigned int flags) #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. 
*/ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_AVX2 hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); diff --git a/cipher/sha512.c b/cipher/sha512.c index 029f8f02..4be1cab2 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -154,7 +154,7 @@ sha512_init (void *context, unsigned int flags) ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX - ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); @@ -194,7 +194,7 @@ sha384_init (void *context, unsigned int flags) ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX - ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); diff --git a/src/g10lib.h b/src/g10lib.h index d1f94268..a579e945 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -197,16 +197,17 @@ int _gcry_log_verbosity( int level ); #define HWF_PADLOCK_SHA 4 #define HWF_PADLOCK_MMUL 8 -#define HWF_INTEL_CPU 16 -#define HWF_INTEL_BMI2 32 -#define HWF_INTEL_SSSE3 64 -#define HWF_INTEL_PCLMUL 128 -#define HWF_INTEL_AESNI 256 -#define HWF_INTEL_RDRAND 512 -#define HWF_INTEL_AVX 1024 -#define HWF_INTEL_AVX2 2048 - -#define HWF_ARM_NEON 4096 +#define HWF_INTEL_CPU 16 +#define HWF_INTEL_FAST_SHLD 32 +#define HWF_INTEL_BMI2 64 +#define HWF_INTEL_SSSE3 128 +#define HWF_INTEL_PCLMUL 256 +#define HWF_INTEL_AESNI 512 +#define HWF_INTEL_RDRAND 1024 +#define HWF_INTEL_AVX 2048 +#define HWF_INTEL_AVX2 4096 + +#define HWF_ARM_NEON 8192 gpg_err_code_t _gcry_disable_hw_feature (const char 
*name); diff --git a/src/hwf-x86.c b/src/hwf-x86.c index 399952c4..fbd63315 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -174,6 +174,7 @@ detect_x86_gnuc (void) unsigned int features; unsigned int os_supports_avx_avx2_registers = 0; unsigned int max_cpuid_level; + unsigned int fms, family, model; unsigned int result = 0; (void)os_supports_avx_avx2_registers; @@ -236,8 +237,37 @@ detect_x86_gnuc (void) /* Detect Intel features, that might also be supported by other vendors. */ - /* Get CPU info and Intel feature flags (ECX). */ - get_cpuid(1, NULL, NULL, &features, NULL); + /* Get CPU family/model/stepping (EAX) and Intel feature flags (ECX). */ + get_cpuid(1, &fms, NULL, &features, NULL); + + family = ((fms & 0xf00) >> 8) + ((fms & 0xff00000) >> 20); + model = ((fms & 0xf0) >> 4) + ((fms & 0xf0000) >> 12); + + if ((result & HWF_INTEL_CPU) && family == 6) + { + /* These Intel Core processor models have SHLD/SHRD instruction that + * can do integer rotation faster than actual ROL/ROR instructions. */ + switch (model) + { + case 0x2A: + case 0x2D: + case 0x3A: + case 0x3C: + case 0x3F: + case 0x45: + case 0x46: + case 0x3D: + case 0x4F: + case 0x56: + case 0x47: + case 0x4E: + case 0x5E: + case 0x55: + case 0x66: + result |= HWF_INTEL_FAST_SHLD; + break; + } + } #ifdef ENABLE_PCLMUL_SUPPORT /* Test bit 1 for PCLMUL. 
*/ diff --git a/src/hwfeatures.c b/src/hwfeatures.c index 58099c49..e7c55cc3 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -42,19 +42,20 @@ static struct const char *desc; } hwflist[] = { - { HWF_PADLOCK_RNG, "padlock-rng" }, - { HWF_PADLOCK_AES, "padlock-aes" }, - { HWF_PADLOCK_SHA, "padlock-sha" }, - { HWF_PADLOCK_MMUL,"padlock-mmul"}, - { HWF_INTEL_CPU, "intel-cpu" }, - { HWF_INTEL_BMI2, "intel-bmi2" }, - { HWF_INTEL_SSSE3, "intel-ssse3" }, - { HWF_INTEL_PCLMUL,"intel-pclmul" }, - { HWF_INTEL_AESNI, "intel-aesni" }, - { HWF_INTEL_RDRAND,"intel-rdrand" }, - { HWF_INTEL_AVX, "intel-avx" }, - { HWF_INTEL_AVX2, "intel-avx2" }, - { HWF_ARM_NEON, "arm-neon" } + { HWF_PADLOCK_RNG, "padlock-rng" }, + { HWF_PADLOCK_AES, "padlock-aes" }, + { HWF_PADLOCK_SHA, "padlock-sha" }, + { HWF_PADLOCK_MMUL, "padlock-mmul"}, + { HWF_INTEL_CPU, "intel-cpu" }, + { HWF_INTEL_FAST_SHLD, "intel-fast-shld" }, + { HWF_INTEL_BMI2, "intel-bmi2" }, + { HWF_INTEL_SSSE3, "intel-ssse3" }, + { HWF_INTEL_PCLMUL, "intel-pclmul" }, + { HWF_INTEL_AESNI, "intel-aesni" }, + { HWF_INTEL_RDRAND, "intel-rdrand" }, + { HWF_INTEL_AVX, "intel-avx" }, + { HWF_INTEL_AVX2, "intel-avx2" }, + { HWF_ARM_NEON, "arm-neon" } }; /* A bit vector with the hardware features which shall not be used. |