/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
**
** 2013/12/20 :
**  - Integrated to libgcrypt
**  - 4.18 cycles/byte on Intel i5-4570
*/

#ifdef __x86_64__
/* NOTE(review): the include operand was missing; <config.h> is presumed to be
 * the header that defines the feature macros tested below -- confirm. */
#include <config.h>
#if defined(USE_ARCFOUR) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

/* ELF-only directives (.type/.size) are dropped on non-ELF assemblers. */
#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
# define ELF(...) __VA_ARGS__
#else
# define ELF(...) /*_*/
#endif

/*
 * _gcry_arcfour_amd64 (key, len, in, out)
 *
 * Syntax: GNU as, AT&T operand order (source, destination).
 *
 * Arguments are taken from the System V AMD64 argument registers:
 *   %rdi = key   state block: 256 table entries of 32 bits each, followed by
 *                x at byte offset 4*256 and y at byte offset 4*256+4
 *   %rsi = len   number of bytes to process
 *   %rdx = in    input buffer
 *   %rcx = out   output buffer
 * NOTE(review): the WIN64 configuration also assembles this file; since the
 * code reads the System V registers, the C prototype presumably forces that
 * calling convention there -- confirm against the caller.
 *
 * Register roles inside the function:
 *   %rbp = key          %rsi = in (advancing)     %rdi = out (advancing)
 *   %rcx = x            %rdx = y                  %eax = tx = d[x]
 *   %ebx = ty / val     %r8  = keystream bytes    %r9  = end marker
 *   %r11 = per-block byte counter
 *
 * Saved/restored: %rbx, %rbp.
 * Clobbered:      %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11, flags.
 * Returns nothing; updates the table and the low byte of key->x and key->y.
 *
 * x is kept one step ahead of the byte being produced (d[x] for the next
 * round is loaded early), which is why x is decremented before being stored.
 */
.text
.align 16
.globl _gcry_arcfour_amd64
ELF(.type _gcry_arcfour_amd64,@function)

_gcry_arcfour_amd64:
        push    %rbp
        push    %rbx
        mov     %rdi, %rbp                  # key = ARG(key)
        mov     %rsi, %rbx                  # rbx = ARG(len)
        mov     %rdx, %rsi                  # in  = ARG(in)
        mov     %rcx, %rdi                  # out = ARG(out)
        mov     (4*256)(%rbp), %ecx         # x = key->x
        /* Only the low byte of y is ever updated or stored back, so only the
         * low byte is trusted here; this keeps the table index in 0..255. */
        movzbl  (4*256+4)(%rbp), %edx       # y = key->y & 0xff
        inc     %rcx                        # x++
        and     $255, %rcx                  # x &= 0xff
        lea     -8(%rbx,%rsi), %rbx         # rbx = in+len-8
        mov     %rbx, %r9                   # tmp = in+len-8
        mov     (%rbp,%rcx,4), %eax         # tx = d[x]
        cmp     %rsi, %rbx                  # cmp in with in+len-8
        jl      .Lend                       # fewer than 8 bytes: skip block loop

        /* Main loop: one 8-byte block per iteration. */
.Lstart:
        add     $8, %rsi                    # in  += 8
        add     $8, %rdi                    # out += 8

        /* generate the next 8 bytes of the rc4 stream into %r8 */
        mov     $8, %r11                    # byte counter
.Lgen_byte:
        add     %al, %dl                    # y += tx
        mov     (%rbp,%rdx,4), %ebx         # ty = d[y]
        mov     %ebx, (%rbp,%rcx,4)         # d[x] = ty
        add     %al, %bl                    # val = ty + tx
        mov     %eax, (%rbp,%rdx,4)         # d[y] = tx
        inc     %cl                         # x++        (NEXT ROUND)
        mov     (%rbp,%rcx,4), %eax         # tx = d[x]  (NEXT ROUND)
        shl     $8, %r8                     # make room for the new byte
        movb    (%rbp,%rbx,4), %r8b         # val = d[val]
        dec     %r11b
        jnz     .Lgen_byte

        /* xor 8 bytes; bswap puts the first generated byte at the lowest
         * address */
        bswap   %r8
        xor     -8(%rsi), %r8
        cmp     %r9, %rsi                   # cmp in+len-8 with in
        mov     %r8, -8(%rdi)               # mov leaves the flags untouched
        jle     .Lstart                     # jump if (in <= in+len-8)

.Lend:
        add     $8, %r9                     # tmp = in+len

        /* handle the last bytes, one by one */
.Ltail:
        cmp     %rsi, %r9                   # cmp in with in+len
        jle     .Lfinished                  # jump if (in+len <= in)
        add     %al, %dl                    # y += tx
        mov     (%rbp,%rdx,4), %ebx         # ty = d[y]
        mov     %ebx, (%rbp,%rcx,4)         # d[x] = ty
        add     %al, %bl                    # val = ty + tx
        mov     %eax, (%rbp,%rdx,4)         # d[y] = tx
        inc     %cl                         # x++        (NEXT ROUND)
        mov     (%rbp,%rcx,4), %eax         # tx = d[x]  (NEXT ROUND)
        movb    (%rbp,%rbx,4), %r8b         # val = d[val]
        xor     (%rsi), %r8b                # xor 1 byte
        movb    %r8b, (%rdi)
        inc     %rsi                        # in++
        inc     %rdi                        # out++
        jmp     .Ltail

.Lfinished:
        dec     %rcx                        # x-- (undo the look-ahead increment)
        movb    %cl, (4*256)(%rbp)          # key->x = x
        movb    %dl, (4*256+4)(%rbp)        # key->y = y
        pop     %rbx
        pop     %rbp
        ret
.L__gcry_arcfour_amd64_end:
ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64)

#endif
#endif