From 9a4fb3709864bf3e3918800d44ff576590cd4e92 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 14 May 2015 13:33:07 +0300 Subject: Enable AMD64 Camellia implementations on WIN64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * cipher/camellia-aesni-avx-amd64.S: Enable when HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined. (ELF): New macro to mask lines with ELF specific commands. * cipher/camellia-aesni-avx2-amd64.S: Enable when HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined. (ELF): New macro to mask lines with ELF specific commands. * cipher/camellia-glue.c (USE_AESNI_AVX, USE_AESNI_AVX2): Enable when HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined. [USE_AESNI_AVX || USE_AESNI_AVX2] (ASM_FUNC_ABI, ASM_EXTRA_STACK): New. (_gcry_camellia_aesni_avx_ctr_enc, _gcry_camellia_aesni_avx_cbc_dec) (_gcry_camellia_aesni_avx_cfb_dec, _gcry_camellia_aesni_avx_keygen) (_gcry_camellia_aesni_avx2_ctr_enc, _gcry_camellia_aesni_avx2_cbc_dec) (_gcry_camellia_aesni_avx2_cfb_dec): Add ASM_FUNC_ABI. -- Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx-amd64.S | 41 ++++++++++++++----------- cipher/camellia-aesni-avx2-amd64.S | 29 +++++++++++------- cipher/camellia-glue.c | 61 +++++++++++++++++++++++++++----------- 3 files changed, 85 insertions(+), 46 deletions(-) diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 6d157a7f..c047a21e 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -20,7 +20,8 @@ #ifdef __x86_64 #include -#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) #ifdef __PIC__ @@ -29,6 +30,12 @@ # define RIP #endif +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) 
/*_*/ +#endif + #define CAMELLIA_TABLE_BYTE_LEN 272 /* struct CAMELLIA_context: */ @@ -769,7 +776,7 @@ .text .align 8 -.type __camellia_enc_blk16,@function; +ELF(.type __camellia_enc_blk16,@function;) __camellia_enc_blk16: /* input: @@ -853,10 +860,10 @@ __camellia_enc_blk16: %xmm15, %rax, %rcx, 24); jmp .Lenc_done; -.size __camellia_enc_blk16,.-__camellia_enc_blk16; +ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;) .align 8 -.type __camellia_dec_blk16,@function; +ELF(.type __camellia_dec_blk16,@function;) __camellia_dec_blk16: /* input: @@ -938,7 +945,7 @@ __camellia_dec_blk16: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; -.size __camellia_dec_blk16,.-__camellia_dec_blk16; +ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ @@ -948,7 +955,7 @@ __camellia_dec_blk16: .align 8 .globl _gcry_camellia_aesni_avx_ctr_enc -.type _gcry_camellia_aesni_avx_ctr_enc,@function; +ELF(.type _gcry_camellia_aesni_avx_ctr_enc,@function;) _gcry_camellia_aesni_avx_ctr_enc: /* input: @@ -1062,11 +1069,11 @@ _gcry_camellia_aesni_avx_ctr_enc: leave; ret; -.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc; +ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) .align 8 .globl _gcry_camellia_aesni_avx_cbc_dec -.type _gcry_camellia_aesni_avx_cbc_dec,@function; +ELF(.type _gcry_camellia_aesni_avx_cbc_dec,@function;) _gcry_camellia_aesni_avx_cbc_dec: /* input: @@ -1130,11 +1137,11 @@ _gcry_camellia_aesni_avx_cbc_dec: leave; ret; -.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec; +ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;) .align 8 .globl _gcry_camellia_aesni_avx_cfb_dec -.type _gcry_camellia_aesni_avx_cfb_dec,@function; +ELF(.type _gcry_camellia_aesni_avx_cfb_dec,@function;) _gcry_camellia_aesni_avx_cfb_dec: /* input: @@ -1202,7 +1209,7 @@ _gcry_camellia_aesni_avx_cfb_dec: leave; ret; -.size 
_gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec; +ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;) /* * IN: @@ -1309,7 +1316,7 @@ _gcry_camellia_aesni_avx_cfb_dec: .text .align 8 -.type __camellia_avx_setup128,@function; +ELF(.type __camellia_avx_setup128,@function;) __camellia_avx_setup128: /* input: * %rdi: ctx, CTX; subkey storage at key_table(CTX) @@ -1650,10 +1657,10 @@ __camellia_avx_setup128: vzeroall; ret; -.size __camellia_avx_setup128,.-__camellia_avx_setup128; +ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;) .align 8 -.type __camellia_avx_setup256,@function; +ELF(.type __camellia_avx_setup256,@function;) __camellia_avx_setup256: /* input: @@ -2127,11 +2134,11 @@ __camellia_avx_setup256: vzeroall; ret; -.size __camellia_avx_setup256,.-__camellia_avx_setup256; +ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;) .align 8 .globl _gcry_camellia_aesni_avx_keygen -.type _gcry_camellia_aesni_avx_keygen,@function; +ELF(.type _gcry_camellia_aesni_avx_keygen,@function;) _gcry_camellia_aesni_avx_keygen: /* input: @@ -2159,7 +2166,7 @@ _gcry_camellia_aesni_avx_keygen: vpor %xmm2, %xmm1, %xmm1; jmp __camellia_avx_setup256; -.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen; +ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S index 25f48bc7..a3fa229d 100644 --- a/cipher/camellia-aesni-avx2-amd64.S +++ b/cipher/camellia-aesni-avx2-amd64.S @@ -20,7 +20,8 @@ #ifdef __x86_64 #include -#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) #ifdef __PIC__ @@ -29,6 +30,12 @@ # define RIP #endif +#ifdef 
HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) /*_*/ +#endif + #define CAMELLIA_TABLE_BYTE_LEN 272 /* struct CAMELLIA_context: */ @@ -748,7 +755,7 @@ .text .align 8 -.type __camellia_enc_blk32,@function; +ELF(.type __camellia_enc_blk32,@function;) __camellia_enc_blk32: /* input: @@ -832,10 +839,10 @@ __camellia_enc_blk32: %ymm15, %rax, %rcx, 24); jmp .Lenc_done; -.size __camellia_enc_blk32,.-__camellia_enc_blk32; +ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) .align 8 -.type __camellia_dec_blk32,@function; +ELF(.type __camellia_dec_blk32,@function;) __camellia_dec_blk32: /* input: @@ -917,7 +924,7 @@ __camellia_dec_blk32: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; -.size __camellia_dec_blk32,.-__camellia_dec_blk32; +ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ @@ -927,7 +934,7 @@ __camellia_dec_blk32: .align 8 .globl _gcry_camellia_aesni_avx2_ctr_enc -.type _gcry_camellia_aesni_avx2_ctr_enc,@function; +ELF(.type _gcry_camellia_aesni_avx2_ctr_enc,@function;) _gcry_camellia_aesni_avx2_ctr_enc: /* input: @@ -1111,11 +1118,11 @@ _gcry_camellia_aesni_avx2_ctr_enc: leave; ret; -.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc; +ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;) .align 8 .globl _gcry_camellia_aesni_avx2_cbc_dec -.type _gcry_camellia_aesni_avx2_cbc_dec,@function; +ELF(.type _gcry_camellia_aesni_avx2_cbc_dec,@function;) _gcry_camellia_aesni_avx2_cbc_dec: /* input: @@ -1183,11 +1190,11 @@ _gcry_camellia_aesni_avx2_cbc_dec: leave; ret; -.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec; +ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;) .align 8 .globl _gcry_camellia_aesni_avx2_cfb_dec -.type _gcry_camellia_aesni_avx2_cfb_dec,@function; +ELF(.type _gcry_camellia_aesni_avx2_cfb_dec,@function;) 
_gcry_camellia_aesni_avx2_cfb_dec: /* input: @@ -1257,7 +1264,7 @@ _gcry_camellia_aesni_avx2_cfb_dec: leave; ret; -.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec; +ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index f18d1358..50323218 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -75,7 +75,8 @@ /* USE_AESNI inidicates whether to compile with Intel AES-NI/AVX code. */ #undef USE_AESNI_AVX #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) -# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AESNI_AVX 1 # endif #endif @@ -83,7 +84,8 @@ /* USE_AESNI_AVX2 inidicates whether to compile with Intel AES-NI/AVX2 code. */ #undef USE_AESNI_AVX2 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) -# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AESNI_AVX2 1 # endif #endif @@ -100,6 +102,20 @@ typedef struct #endif /*USE_AESNI_AVX2*/ } CAMELLIA_context; +/* Assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. */ +#undef ASM_FUNC_ABI +#undef ASM_EXTRA_STACK +#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# define ASM_EXTRA_STACK (10 * 16) +# else +# define ASM_FUNC_ABI +# define ASM_EXTRA_STACK 0 +# endif +#endif + #ifdef USE_AESNI_AVX /* Assembler implementations of Camellia using AES-NI and AVX. 
Process data in 16 block same time. @@ -107,21 +123,21 @@ typedef struct extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, - unsigned char *ctr); + unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, - unsigned char *iv); + unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, - unsigned char *iv); + unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx, const unsigned char *key, - unsigned int keylen); + unsigned int keylen) ASM_FUNC_ABI; #endif #ifdef USE_AESNI_AVX2 @@ -131,17 +147,17 @@ extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx, extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, - unsigned char *ctr); + unsigned char *ctr) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, - unsigned char *iv); + unsigned char *iv) ASM_FUNC_ABI; extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, - unsigned char *iv); + unsigned char *iv) ASM_FUNC_ABI; #endif static const char *selftest(void); @@ -318,7 +334,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, if (did_use_aesni_avx2) { int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 + - 2 * sizeof(void *); + 2 * sizeof(void *) + ASM_EXTRA_STACK; if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; @@ -347,8 +363,11 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, if (did_use_aesni_avx) { - if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *)) - burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *); + int avx_burn_stack_depth 
= 16 * CAMELLIA_BLOCK_SIZE + + 2 * sizeof(void *) + ASM_EXTRA_STACK; + + if (burn_stack_depth < avx_burn_stack_depth) + burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ @@ -409,7 +428,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv, if (did_use_aesni_avx2) { int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 + - 2 * sizeof(void *); + 2 * sizeof(void *) + ASM_EXTRA_STACK; if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; @@ -437,8 +456,11 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv, if (did_use_aesni_avx) { - if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *)) - burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *); + int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + + 2 * sizeof(void *) + ASM_EXTRA_STACK; + + if (burn_stack_depth < avx_burn_stack_depth) + burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ @@ -491,7 +513,7 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv, if (did_use_aesni_avx2) { int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 + - 2 * sizeof(void *); + 2 * sizeof(void *) + ASM_EXTRA_STACK; if (burn_stack_depth < avx2_burn_stack_depth) burn_stack_depth = avx2_burn_stack_depth; @@ -519,8 +541,11 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv, if (did_use_aesni_avx) { - if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *)) - burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *); + int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + + 2 * sizeof(void *) + ASM_EXTRA_STACK; + + if (burn_stack_depth < avx_burn_stack_depth) + burn_stack_depth = avx_burn_stack_depth; } /* Use generic code to handle smaller chunks... */ -- cgit v1.2.1