diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-23 22:39:47 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-28 20:08:54 +0200 |
commit | 909644ef5883927262366c356eed530e55aba478 (patch) | |
tree | 71c03dfeae98a5a7ab1118663a877cd3941f1dba | |
parent | 16fd540f4d01eb6dc23d9509ae549353617c7a67 (diff) | |
download | libgcrypt-909644ef5883927262366c356eed530e55aba478.tar.gz |
hwf-x86: add detection for Intel CPUs with fast SHLD instruction
* cipher/sha1.c (sha1_init): Use HWF_INTEL_FAST_SHLD instead of
HWF_INTEL_CPU.
* cipher/sha256.c (sha256_init, sha224_init): Ditto.
* cipher/sha512.c (sha512_init, sha384_init): Ditto.
* src/g10lib.h (HWF_INTEL_FAST_SHLD): New.
(HWF_INTEL_BMI2, HWF_INTEL_SSSE3, HWF_INTEL_PCLMUL, HWF_INTEL_AESNI)
(HWF_INTEL_RDRAND, HWF_INTEL_AVX, HWF_INTEL_AVX2)
(HWF_ARM_NEON): Update.
* src/hwf-x86.c (detect_x86_gnuc): Add detection of Intel Core
CPUs with fast SHLD/SHRD instruction.
* src/hwfeatures.c (hwflist): Add "intel-fast-shld".
--
Intel Core CPUs since codename Sandy Bridge have been able to
execute SHLD/SHRD instructions faster than rotate instructions
ROL/ROR. Since SHLD/SHRD can be used to do rotation, some
optimized implementations (SHA1/SHA256/SHA512) use SHLD/SHRD
instructions in place of ROL/ROR.
This patch provides more accurate detection of CPUs with
fast SHLD implementation.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/sha1.c | 2 | ||||
-rw-r--r-- | cipher/sha256.c | 4 | ||||
-rw-r--r-- | cipher/sha512.c | 4 | ||||
-rw-r--r-- | src/g10lib.h | 21 | ||||
-rw-r--r-- | src/hwf-x86.c | 34 | ||||
-rw-r--r-- | src/hwfeatures.c | 27 |
6 files changed, 62 insertions, 30 deletions
diff --git a/cipher/sha1.c b/cipher/sha1.c index eb428835..554d55ce 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -136,7 +136,7 @@ sha1_init (void *context, unsigned int flags) #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. */ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_BMI2 hd->use_bmi2 = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2); diff --git a/cipher/sha256.c b/cipher/sha256.c index 59ffa434..63869d54 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -124,7 +124,7 @@ sha256_init (void *context, unsigned int flags) #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. */ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_AVX2 hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); @@ -162,7 +162,7 @@ sha224_init (void *context, unsigned int flags) #ifdef USE_AVX /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. * Therefore use this implementation on Intel CPUs only. 
*/ - hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_AVX2 hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); diff --git a/cipher/sha512.c b/cipher/sha512.c index 029f8f02..4be1cab2 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -154,7 +154,7 @@ sha512_init (void *context, unsigned int flags) ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX - ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); @@ -194,7 +194,7 @@ sha384_init (void *context, unsigned int flags) ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif #ifdef USE_AVX - ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); + ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD); #endif #ifdef USE_AVX2 ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); diff --git a/src/g10lib.h b/src/g10lib.h index d1f94268..a579e945 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -197,16 +197,17 @@ int _gcry_log_verbosity( int level ); #define HWF_PADLOCK_SHA 4 #define HWF_PADLOCK_MMUL 8 -#define HWF_INTEL_CPU 16 -#define HWF_INTEL_BMI2 32 -#define HWF_INTEL_SSSE3 64 -#define HWF_INTEL_PCLMUL 128 -#define HWF_INTEL_AESNI 256 -#define HWF_INTEL_RDRAND 512 -#define HWF_INTEL_AVX 1024 -#define HWF_INTEL_AVX2 2048 - -#define HWF_ARM_NEON 4096 +#define HWF_INTEL_CPU 16 +#define HWF_INTEL_FAST_SHLD 32 +#define HWF_INTEL_BMI2 64 +#define HWF_INTEL_SSSE3 128 +#define HWF_INTEL_PCLMUL 256 +#define HWF_INTEL_AESNI 512 +#define HWF_INTEL_RDRAND 1024 +#define HWF_INTEL_AVX 2048 +#define HWF_INTEL_AVX2 4096 + +#define HWF_ARM_NEON 8192 gpg_err_code_t _gcry_disable_hw_feature (const char 
*name); diff --git a/src/hwf-x86.c b/src/hwf-x86.c index 399952c4..fbd63315 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -174,6 +174,7 @@ detect_x86_gnuc (void) unsigned int features; unsigned int os_supports_avx_avx2_registers = 0; unsigned int max_cpuid_level; + unsigned int fms, family, model; unsigned int result = 0; (void)os_supports_avx_avx2_registers; @@ -236,8 +237,37 @@ detect_x86_gnuc (void) /* Detect Intel features, that might also be supported by other vendors. */ - /* Get CPU info and Intel feature flags (ECX). */ - get_cpuid(1, NULL, NULL, &features, NULL); + /* Get CPU family/model/stepping (EAX) and Intel feature flags (ECX). */ + get_cpuid(1, &fms, NULL, &features, NULL); + + family = ((fms & 0xf00) >> 8) + ((fms & 0xff00000) >> 20); + model = ((fms & 0xf0) >> 4) + ((fms & 0xf0000) >> 12); + + if ((result & HWF_INTEL_CPU) && family == 6) + { + /* These Intel Core processor models have SHLD/SHRD instruction that + * can do integer rotation faster than actual ROL/ROR instructions. */ + switch (model) + { + case 0x2A: + case 0x2D: + case 0x3A: + case 0x3C: + case 0x3F: + case 0x45: + case 0x46: + case 0x3D: + case 0x4F: + case 0x56: + case 0x47: + case 0x4E: + case 0x5E: + case 0x55: + case 0x66: + result |= HWF_INTEL_FAST_SHLD; + break; + } + } #ifdef ENABLE_PCLMUL_SUPPORT /* Test bit 1 for PCLMUL. 
*/ diff --git a/src/hwfeatures.c b/src/hwfeatures.c index 58099c49..e7c55cc3 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -42,19 +42,20 @@ static struct const char *desc; } hwflist[] = { - { HWF_PADLOCK_RNG, "padlock-rng" }, - { HWF_PADLOCK_AES, "padlock-aes" }, - { HWF_PADLOCK_SHA, "padlock-sha" }, - { HWF_PADLOCK_MMUL,"padlock-mmul"}, - { HWF_INTEL_CPU, "intel-cpu" }, - { HWF_INTEL_BMI2, "intel-bmi2" }, - { HWF_INTEL_SSSE3, "intel-ssse3" }, - { HWF_INTEL_PCLMUL,"intel-pclmul" }, - { HWF_INTEL_AESNI, "intel-aesni" }, - { HWF_INTEL_RDRAND,"intel-rdrand" }, - { HWF_INTEL_AVX, "intel-avx" }, - { HWF_INTEL_AVX2, "intel-avx2" }, - { HWF_ARM_NEON, "arm-neon" } + { HWF_PADLOCK_RNG, "padlock-rng" }, + { HWF_PADLOCK_AES, "padlock-aes" }, + { HWF_PADLOCK_SHA, "padlock-sha" }, + { HWF_PADLOCK_MMUL, "padlock-mmul"}, + { HWF_INTEL_CPU, "intel-cpu" }, + { HWF_INTEL_FAST_SHLD, "intel-fast-shld" }, + { HWF_INTEL_BMI2, "intel-bmi2" }, + { HWF_INTEL_SSSE3, "intel-ssse3" }, + { HWF_INTEL_PCLMUL, "intel-pclmul" }, + { HWF_INTEL_AESNI, "intel-aesni" }, + { HWF_INTEL_RDRAND, "intel-rdrand" }, + { HWF_INTEL_AVX, "intel-avx" }, + { HWF_INTEL_AVX2, "intel-avx2" }, + { HWF_ARM_NEON, "arm-neon" } }; /* A bit vector with the hardware features which shall not be used. |