From 9597cfddf03c467825da152be5ca0d12a8c30d88 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 5 May 2015 21:02:43 +0300 Subject: Enable AMD64 ChaCha20 implementations on WIN64 * cipher/chacha20-avx2-amd64.S: Enable when HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined. (ELF): New macro to mask lines with ELF specific commands. * cipher/chacha20-sse2-amd64.S: Enable when HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined. (ELF): New macro to mask lines with ELF specific commands. * cipher/chacha20-ssse3-amd64.S: Enable when HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined. (ELF): New macro to mask lines with ELF specific commands. * cipher/chacha20.c (USE_SSE2, USE_SSSE3, USE_AVX2): Enable when HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined. (ASM_FUNC_ABI, ASM_EXTRA_STACK): New. (chacha20_blocks_t, _gcry_chacha20_amd64_sse2_blocks) (_gcry_chacha20_amd64_ssse3_blocks, _gcry_chacha20_amd64_avx2_blocks) (_gcry_chacha20_armv7_neon_blocks, chacha20_blocks): Add ASM_FUNC_ABI. (chacha20_core): Add ASM_EXTRA_STACK. -- Signed-off-by: Jussi Kivilinna --- cipher/chacha20-avx2-amd64.S | 13 ++++++++++--- cipher/chacha20-sse2-amd64.S | 13 ++++++++++--- cipher/chacha20-ssse3-amd64.S | 13 ++++++++++--- cipher/chacha20.c | 43 +++++++++++++++++++++++++++++++++---------- 4 files changed, 63 insertions(+), 19 deletions(-) diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S index 1f33de8b..12bed35b 100644 --- a/cipher/chacha20-avx2-amd64.S +++ b/cipher/chacha20-avx2-amd64.S @@ -26,7 +26,8 @@ #ifdef __x86_64__ #include <config.h> -#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AVX2_SUPPORT) && USE_CHACHA20 #ifdef __PIC__ @@ -35,11 +36,17 @@ # define RIP #endif +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) 
/*_*/ +#endif + .text .align 8 .globl _gcry_chacha20_amd64_avx2_blocks -.type _gcry_chacha20_amd64_avx2_blocks,@function; +ELF(.type _gcry_chacha20_amd64_avx2_blocks,@function;) _gcry_chacha20_amd64_avx2_blocks: .Lchacha_blocks_avx2_local: vzeroupper @@ -938,7 +945,7 @@ _gcry_chacha20_amd64_avx2_blocks: vzeroall movl $(63 + 512), %eax ret -.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks; +ELF(.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks;) .data .align 16 diff --git a/cipher/chacha20-sse2-amd64.S b/cipher/chacha20-sse2-amd64.S index 4811f408..2b9842c1 100644 --- a/cipher/chacha20-sse2-amd64.S +++ b/cipher/chacha20-sse2-amd64.S @@ -26,13 +26,20 @@ #ifdef __x86_64__ #include <config.h> -#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && USE_CHACHA20 +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && USE_CHACHA20 + +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) 
/*_*/ +#endif .text .align 8 .globl _gcry_chacha20_amd64_sse2_blocks -.type _gcry_chacha20_amd64_sse2_blocks,@function; +ELF(.type _gcry_chacha20_amd64_sse2_blocks,@function;) _gcry_chacha20_amd64_sse2_blocks: .Lchacha_blocks_sse2_local: pushq %rbx @@ -646,7 +653,7 @@ _gcry_chacha20_amd64_sse2_blocks: pxor %xmm8, %xmm8 pxor %xmm0, %xmm0 ret -.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks; +ELF(.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks;) #endif /*defined(USE_CHACHA20)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S index 50c2ff86..a1a843fa 100644 --- a/cipher/chacha20-ssse3-amd64.S +++ b/cipher/chacha20-ssse3-amd64.S @@ -26,7 +26,8 @@ #ifdef __x86_64__ #include <config.h> -#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && USE_CHACHA20 #ifdef __PIC__ @@ -35,11 +36,17 @@ # define RIP #endif +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) /*_*/ +#endif + .text .align 8 .globl _gcry_chacha20_amd64_ssse3_blocks -.type _gcry_chacha20_amd64_ssse3_blocks,@function; +ELF(.type _gcry_chacha20_amd64_ssse3_blocks,@function;) _gcry_chacha20_amd64_ssse3_blocks: .Lchacha_blocks_ssse3_local: pushq %rbx @@ -614,7 +621,7 @@ _gcry_chacha20_amd64_ssse3_blocks: pxor %xmm8, %xmm8 pxor %xmm0, %xmm0 ret -.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks; +ELF(.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks;) .data .align 16; diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 2eaeffde..e25e2398 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -50,20 +50,23 @@ /* USE_SSE2 indicates whether to compile with Intel SSE2 code. 
*/ #undef USE_SSE2 -#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_SSE2 1 #endif /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ #undef USE_SSSE3 -#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ +#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) # define USE_SSSE3 1 #endif /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */ #undef USE_AVX2 -#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ +#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AVX2_SUPPORT) # define USE_AVX2 1 #endif @@ -82,8 +85,23 @@ struct CHACHA20_context_s; +/* Assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. 
*/ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if (defined(USE_SSE2) || defined(USE_SSSE3) || defined(USE_AVX2)) && \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16) +#else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +#endif + + typedef unsigned int (* chacha20_blocks_t)(u32 *state, const byte *src, - byte *dst, size_t bytes); + byte *dst, + size_t bytes) ASM_FUNC_ABI; typedef struct CHACHA20_context_s { @@ -97,28 +115,32 @@ typedef struct CHACHA20_context_s #ifdef USE_SSE2 unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in, - byte *out, size_t bytes); + byte *out, + size_t bytes) ASM_FUNC_ABI; #endif /* USE_SSE2 */ #ifdef USE_SSSE3 unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in, - byte *out, size_t bytes); + byte *out, + size_t bytes) ASM_FUNC_ABI; #endif /* USE_SSSE3 */ #ifdef USE_AVX2 unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in, - byte *out, size_t bytes); + byte *out, + size_t bytes) ASM_FUNC_ABI; #endif /* USE_AVX2 */ #ifdef USE_NEON unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in, - byte *out, size_t bytes); + byte *out, + size_t bytes) ASM_FUNC_ABI; #endif /* USE_NEON */ @@ -141,7 +163,7 @@ static const char *selftest (void); #ifndef USE_SSE2 -static unsigned int +ASM_FUNC_ABI static unsigned int chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes) { u32 pad[CHACHA20_INPUT_LENGTH]; @@ -269,7 +291,8 @@ chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes) static unsigned int chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx) { - return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE); + return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE) + + ASM_EXTRA_STACK; } -- cgit v1.2.1