author    Jussi Kivilinna <jussi.kivilinna@iki.fi>    2015-05-02 13:05:12 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>    2015-05-02 14:41:15 +0300
commit    1089a13073c26a9a456e43ec38d937e6ee7f4077 (patch)
tree      ed469e80d277395d78a04f37070ca83274e5ea9f
parent    022959099644f64df5f2a83ade21159864f64837 (diff)
download  libgcrypt-1089a13073c26a9a456e43ec38d937e6ee7f4077.tar.gz
Enable AMD64 SHA512 implementations for WIN64
* cipher/sha512-avx-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS is defined.
(ELF): New macro to mask lines with ELF-specific commands.
* cipher/sha512-avx2-bmi2-amd64.S: Ditto.
* cipher/sha512-ssse3-amd64.S: Ditto.
* cipher/sha512.c (USE_SSSE3, USE_AVX, USE_AVX2): Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS is defined.
[USE_SSSE3 || USE_AVX || USE_AVX2] (ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
(_gcry_sha512_transform_amd64_ssse3, _gcry_sha512_transform_amd64_avx)
(_gcry_sha512_transform_amd64_avx2): Add ASM_FUNC_ABI to prototypes.
(transform): Add ASM_EXTRA_STACK to stack burn value.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
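The key mechanism named in the message is ASM_FUNC_ABI. A minimal standalone sketch of the pattern, assuming GCC on x86-64 (the function name below is illustrative, not from the patch): GCC's sysv_abi attribute makes a Win64-built caller pass arguments the way the unmodified System V assembly expects.

/* Minimal sketch, not part of the patch: __attribute__((sysv_abi))
 * lets C code compiled for the Microsoft x64 ABI call assembly that
 * was written for the System V AMD64 calling convention. */
#include <stddef.h>

#if defined(_WIN64) && defined(__GNUC__)
# define ASM_FUNC_ABI  __attribute__ ((sysv_abi))  /* GCC emits the ABI shim */
#else
# define ASM_FUNC_ABI                              /* SysV is already the default */
#endif

/* With the attribute, arguments travel in rdi/rsi/rdx (SysV order)
 * rather than rcx/rdx/r8 (Microsoft order), so the assembly finds
 * them where it expects. */
extern unsigned int
sha512_transform_example (const void *data, void *state, size_t nblks)
     ASM_FUNC_ABI;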
-rw-r--r--  cipher/sha512-avx-amd64.S        11
-rw-r--r--  cipher/sha512-avx2-bmi2-amd64.S  11
-rw-r--r--  cipher/sha512-ssse3-amd64.S      11
-rw-r--r--  cipher/sha512.c                  60
4 files changed, 72 insertions(+), 21 deletions(-)
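The three .S hunks below all apply the same masking idiom: directives such as ".type sym,@function" are ELF-specific and are rejected by GNU as when targeting PE/COFF, so they are wrapped in a macro that expands to nothing on Win64. A restatement of the idiom with commentary (the .S files are run through the C preprocessor, so this is ordinary cpp):

/* Expands its arguments only when assembling for an ELF target; on a
 * Win64 (PE/COFF) build the whole line disappears. */
#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
# define ELF(...) __VA_ARGS__   /* ELF target: keep the directive */
#else
# define ELF(...) /*_*/         /* PE/COFF target: drop it */
#endif

/* Usage, as in the hunks below:
 *   ELF(.type _gcry_sha512_transform_amd64_avx,@function;)
 * On ELF this marks the symbol as a function for linker and debugger;
 * on Win64 the line assembles to nothing. */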
diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S
index 3449b877..699c271b 100644
--- a/cipher/sha512-avx-amd64.S
+++ b/cipher/sha512-avx-amd64.S
@@ -41,7 +41,8 @@
#ifdef __x86_64
#include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512)
@@ -51,6 +52,12 @@
# define ADD_RIP
#endif
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
.intel_syntax noprefix
.text
@@ -259,7 +266,7 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
; L is the message length in SHA512 blocks
*/
.globl _gcry_sha512_transform_amd64_avx
-.type _gcry_sha512_transform_amd64_avx,@function;
+ELF(.type _gcry_sha512_transform_amd64_avx,@function;)
.align 16
_gcry_sha512_transform_amd64_avx:
xor eax, eax
diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S
index d6301f36..02f95af6 100644
--- a/cipher/sha512-avx2-bmi2-amd64.S
+++ b/cipher/sha512-avx2-bmi2-amd64.S
@@ -43,7 +43,8 @@
#ifdef __x86_64
#include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
defined(USE_SHA512)
@@ -54,6 +55,12 @@
# define ADD_RIP
#endif
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
.intel_syntax noprefix
.text
@@ -596,7 +603,7 @@ rotate_Ys
; L is the message length in SHA512 blocks
*/
.globl _gcry_sha512_transform_amd64_avx2
-.type _gcry_sha512_transform_amd64_avx2,@function;
+ELF(.type _gcry_sha512_transform_amd64_avx2,@function;)
.align 16
_gcry_sha512_transform_amd64_avx2:
xor eax, eax
diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S
index 4c80baa3..c721bcf2 100644
--- a/cipher/sha512-ssse3-amd64.S
+++ b/cipher/sha512-ssse3-amd64.S
@@ -44,7 +44,8 @@
#ifdef __x86_64
#include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512)
@@ -54,6 +55,12 @@
# define ADD_RIP
#endif
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
.intel_syntax noprefix
.text
@@ -261,7 +268,7 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
; L is the message length in SHA512 blocks.
*/
.globl _gcry_sha512_transform_amd64_ssse3
-.type _gcry_sha512_transform_amd64_ssse3,@function;
+ELF(.type _gcry_sha512_transform_amd64_ssse3,@function;)
.align 16
_gcry_sha512_transform_amd64_ssse3:
xor eax, eax
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 5a6af808..029f8f02 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -68,27 +68,31 @@
/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_SSSE3 1
#endif
/* USE_AVX indicates whether to compile with Intel AVX code. */
#undef USE_AVX
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_AVX) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AVX 1
#endif
/* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. */
#undef USE_AVX2
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+ defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AVX2 1
#endif
@@ -543,6 +547,21 @@ transform_blk (SHA512_STATE *hd, const unsigned char *data)
}
+/* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+# else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+# endif
+#endif
+
+
#ifdef USE_ARM_NEON_ASM
void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
const unsigned char *data,
@@ -551,17 +570,20 @@ void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
#ifdef USE_SSSE3
unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data,
- void *state, size_t num_blks);
+ void *state,
+ size_t num_blks) ASM_FUNC_ABI;
#endif
#ifdef USE_AVX
unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data,
- void *state, size_t num_blks);
+ void *state,
+ size_t num_blks) ASM_FUNC_ABI;
#endif
#ifdef USE_AVX2
unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data,
- void *state, size_t num_blks);
+ void *state,
+ size_t num_blks) ASM_FUNC_ABI;
#endif
@@ -574,19 +596,19 @@ transform (void *context, const unsigned char *data, size_t nblks)
#ifdef USE_AVX2
if (ctx->use_avx2)
return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, nblks)
- + 4 * sizeof(void*);
+ + 4 * sizeof(void*) + ASM_EXTRA_STACK;
#endif
#ifdef USE_AVX
if (ctx->use_avx)
return _gcry_sha512_transform_amd64_avx (data, &ctx->state, nblks)
- + 4 * sizeof(void*);
+ + 4 * sizeof(void*) + ASM_EXTRA_STACK;
#endif
#ifdef USE_SSSE3
if (ctx->use_ssse3)
return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, nblks)
- + 4 * sizeof(void*);
+ + 4 * sizeof(void*) + ASM_EXTRA_STACK;
#endif
#ifdef USE_ARM_NEON_ASM
@@ -607,6 +629,14 @@ transform (void *context, const unsigned char *data, size_t nblks)
}
while (--nblks);
+#ifdef ASM_EXTRA_STACK
+ /* 'transform_blk' is typically inlined and XMM6-XMM15 are stored at
+ * the prologue of this function. Therefore need to add ASM_EXTRA_STACK to
+ * here too.
+ */
+ burn += ASM_EXTRA_STACK;
+#endif
+
return burn;
}
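The burn value returned by transform() tells the caller how many bytes of stack to wipe afterwards. A sketch of the widened arithmetic, assuming the Win64 assembly prologue spills the ten callee-saved registers XMM6..XMM15 at 16 bytes each (the helper name is hypothetical):

#include <stddef.h>

/* Ten 128-bit registers (XMM6..XMM15) are callee-saved on Win64 and
 * are spilled to the stack by the assembly prologue: 10 * 16 = 160
 * extra bytes that may hold key-dependent data. */
#define ASM_EXTRA_STACK (10 * 16)

/* Hypothetical helper: widen the stack-burn estimate reported by the
 * assembly so the later stack wipe also covers the XMM spill area
 * plus the small fixed frame (return address and saved GPRs). */
static unsigned int
widen_stack_burn (unsigned int burn_from_asm)
{
  return burn_from_asm + 4 * sizeof (void *) + ASM_EXTRA_STACK;
}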