From 6a6646df80386204675d8b149ab60e74d7ca124c Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 5 May 2015 20:46:10 +0300 Subject: Enable AMD64 CAST5 implementation on WIN64 * cipher/cast5-amd64.S: Enable when HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined. (RIP): Remove. (GET_EXTERN_POINTER): Use 'leaq' version on WIN64. (ELF): New macro to mask lines with ELF specific commands. * cipher/cast5.c (USE_AMD64_ASM): Enable when HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined. [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): New. (do_encrypt_block, do_decrypt_block) [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]: Call assembly function through 'call_sysv_fn'. (cast5_amd64_ctr_enc, cast5_amd64_cbc_dec) (cast5_amd64_cfb_dec): New wrapper functions for bulk assembly functions. -- Signed-off-by: Jussi Kivilinna --- cipher/cast5-amd64.S | 43 ++++++++++++++++++--------------- cipher/cast5.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 86 insertions(+), 24 deletions(-) diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index 41fbb746..a5f078e3 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -20,14 +20,19 @@ #ifdef __x86_64 #include <config.h> -#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_CAST5) +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5) -#ifdef __PIC__ -# define RIP %rip +#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__) +# define GET_EXTERN_POINTER(name, reg) leaq name, reg +#else # define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg +#endif + +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ #else -# define RIP -# define GET_EXTERN_POINTER(name, reg) leaq name, reg +# define ELF(...)
/*_*/ #endif .text @@ -180,7 +185,7 @@ .align 8 .globl _gcry_cast5_amd64_encrypt_block -.type _gcry_cast5_amd64_encrypt_block,@function; +ELF(.type _gcry_cast5_amd64_encrypt_block,@function;) _gcry_cast5_amd64_encrypt_block: /* input: @@ -216,11 +221,11 @@ _gcry_cast5_amd64_encrypt_block: popq %rbx; popq %rbp; ret; -.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block; +ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) .align 8 .globl _gcry_cast5_amd64_decrypt_block -.type _gcry_cast5_amd64_decrypt_block,@function; +ELF(.type _gcry_cast5_amd64_decrypt_block,@function;) _gcry_cast5_amd64_decrypt_block: /* input: @@ -256,7 +261,7 @@ _gcry_cast5_amd64_decrypt_block: popq %rbx; popq %rbp; ret; -.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block; +ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) /********************************************************************** 4-way cast5, four blocks parallel @@ -359,7 +364,7 @@ _gcry_cast5_amd64_decrypt_block: rorq $32, d; .align 8 -.type __cast5_enc_blk4,@function; +ELF(.type __cast5_enc_blk4,@function;) __cast5_enc_blk4: /* input: @@ -384,10 +389,10 @@ __cast5_enc_blk4: outbswap_block4(RLR0, RLR1, RLR2, RLR3); ret; -.size __cast5_enc_blk4,.-__cast5_enc_blk4; +ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;) .align 8 -.type __cast5_dec_blk4,@function; +ELF(.type __cast5_dec_blk4,@function;) __cast5_dec_blk4: /* input: @@ -414,11 +419,11 @@ __cast5_dec_blk4: outbswap_block4(RLR0, RLR1, RLR2, RLR3); ret; -.size __cast5_dec_blk4,.-__cast5_dec_blk4; +ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;) .align 8 .globl _gcry_cast5_amd64_ctr_enc -.type _gcry_cast5_amd64_ctr_enc,@function; +ELF(.type _gcry_cast5_amd64_ctr_enc,@function;) _gcry_cast5_amd64_ctr_enc: /* input: * %rdi: ctx, CTX @@ -472,11 +477,11 @@ _gcry_cast5_amd64_ctr_enc: popq %rbx; popq %rbp; ret -.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc; +ELF(.size 
_gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) .align 8 .globl _gcry_cast5_amd64_cbc_dec -.type _gcry_cast5_amd64_cbc_dec,@function; +ELF(.type _gcry_cast5_amd64_cbc_dec,@function;) _gcry_cast5_amd64_cbc_dec: /* input: * %rdi: ctx, CTX @@ -526,11 +531,11 @@ _gcry_cast5_amd64_cbc_dec: popq %rbp; ret; -.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec; +ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) .align 8 .globl _gcry_cast5_amd64_cfb_dec -.type _gcry_cast5_amd64_cfb_dec,@function; +ELF(.type _gcry_cast5_amd64_cfb_dec,@function;) _gcry_cast5_amd64_cfb_dec: /* input: * %rdi: ctx, CTX @@ -581,7 +586,7 @@ _gcry_cast5_amd64_cfb_dec: popq %rbp; ret; -.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec; +ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;) #endif /*defined(USE_CAST5)*/ #endif /*__x86_64*/ diff --git a/cipher/cast5.c b/cipher/cast5.c index 115e1e62..94dcee76 100644 --- a/cipher/cast5.c +++ b/cipher/cast5.c @@ -48,7 +48,8 @@ /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ #undef USE_AMD64_ASM -#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) # define USE_AMD64_ASM 1 #endif @@ -372,16 +373,72 @@ extern void _gcry_cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, extern void _gcry_cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv); +#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +static inline void +call_sysv_fn (const void *fn, const void *arg1, const void *arg2, + const void *arg3, const void *arg4) +{ + /* Call SystemV ABI function without storing non-volatile XMM registers, + * as target function does not use vector instruction sets. 
*/ + asm volatile ("callq *%0\n\t" + : "+a" (fn), + "+D" (arg1), + "+S" (arg2), + "+d" (arg3), + "+c" (arg4) + : + : "cc", "memory", "r8", "r9", "r10", "r11"); +} +#endif + static void do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) { +#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS + call_sysv_fn (_gcry_cast5_amd64_encrypt_block, context, outbuf, inbuf, NULL); +#else _gcry_cast5_amd64_encrypt_block (context, outbuf, inbuf); +#endif } static void do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) { +#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS + call_sysv_fn (_gcry_cast5_amd64_decrypt_block, context, outbuf, inbuf, NULL); +#else _gcry_cast5_amd64_decrypt_block (context, outbuf, inbuf); +#endif +} + +static void +cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out, const byte *in, byte *ctr) +{ +#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS + call_sysv_fn (_gcry_cast5_amd64_ctr_enc, ctx, out, in, ctr); +#else + _gcry_cast5_amd64_ctr_enc (ctx, out, in, ctr); +#endif +} + +static void +cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv) +{ +#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS + call_sysv_fn (_gcry_cast5_amd64_cbc_dec, ctx, out, in, iv); +#else + _gcry_cast5_amd64_cbc_dec (ctx, out, in, iv); +#endif +} + +static void +cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv) +{ +#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS + call_sysv_fn (_gcry_cast5_amd64_cfb_dec, ctx, out, in, iv); +#else + _gcry_cast5_amd64_cfb_dec (ctx, out, in, iv); +#endif } static unsigned int @@ -396,7 +453,7 @@ static unsigned int decrypt_block (void *context, byte *outbuf, const byte *inbuf) { CAST5_context *c = (CAST5_context *) context; - _gcry_cast5_amd64_decrypt_block (c, outbuf, inbuf); + do_decrypt_block (c, outbuf, inbuf); return /*burn_stack*/ (2*8); } @@ -582,7 +639,7 @@ _gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, /* Process data in 4 block chunks. 
*/ while (nblocks >= 4) { - _gcry_cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr); + cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr); nblocks -= 4; outbuf += 4 * CAST5_BLOCKSIZE; @@ -651,7 +708,7 @@ _gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, /* Process data in 4 block chunks. */ while (nblocks >= 4) { - _gcry_cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv); + cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv); nblocks -= 4; outbuf += 4 * CAST5_BLOCKSIZE; @@ -710,7 +767,7 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, /* Process data in 4 block chunks. */ while (nblocks >= 4) { - _gcry_cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv); + cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv); nblocks -= 4; outbuf += 4 * CAST5_BLOCKSIZE; -- cgit v1.2.1