From 6fd0dd2a5f1362f91e2861cd9d300341a43842a5 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 17 Dec 2013 15:35:38 +0200 Subject: SHA-1/SSSE3: Improve performance on large buffers * cipher/sha1-ssse3-amd64.S (RNBLKS): New. (_gcry_sha1_transform_amd64_ssse3): Handle multiple input blocks, with software pipelining of next data block processing. * cipher/sha1.c [USE_SSSE3] (_gcry_sha1_transform_amd64_ssse3): Add 'nblks'. (transform) [USE_SSSE3]: Pass nblks to assembly function. -- Patch gives small improvement for large buffer processing, on Intel i5-4570 speed goes from 4.80 c/B to 4.61 c/B. Signed-off-by: Jussi Kivilinna --- cipher/sha1-ssse3-amd64.S | 70 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 10 deletions(-) (limited to 'cipher/sha1-ssse3-amd64.S') diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index 5e5716bd..d80631d3 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -71,6 +71,7 @@ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 +#define RNBLKS %r11 #define a %eax #define b %ebx @@ -211,10 +212,11 @@ /* - * Transform 64 bytes (16 32-bit words) at DATA. + * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. * * unsigned int - * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data) + * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data, + * size_t nblks) */ .text .globl _gcry_sha1_transform_amd64_ssse3 @@ -223,10 +225,15 @@ _gcry_sha1_transform_amd64_ssse3: /* input: * %rdi: ctx, CTX - * %rsi: data (64 bytes) - * %rdx: ... + * %rsi: data (64*nblks bytes) + * %rdx: nblks */ + xorl %eax, %eax; + cmpq $0, %rdx; + jz .Lret; + + movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; @@ -264,6 +271,10 @@ _gcry_sha1_transform_amd64_ssse3: W_PRECALC_00_15_2(14, W5, Wtmp0); W_PRECALC_00_15_3(15, W5, Wtmp0); +.align 8 +.Loop: + addq $64, RDATA; + /* Transform 0-15 + Precalc 16-31. */ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); @@ -332,6 +343,44 @@ _gcry_sha1_transform_amd64_ssse3: R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + decq RNBLKS; + jz .Lend; + + /* Transform 64-79 + Precalc 0-15 of next block. */ + R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); + R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); + R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); + R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); + R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); + R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); + R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); + R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); + R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); + R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); + R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); + R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); + R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); + R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); + R( c, d, e, a, b, F4, 78 ); + addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); + R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + jmp .Loop; + +.align 16 +.Lend: /* Transform 64-79 + Clear XMM registers. */ R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG); R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0); @@ -348,19 +397,19 @@ _gcry_sha1_transform_amd64_ssse3: R( e, a, b, c, d, F4, 76 ); R( d, e, a, b, c, F4, 77 ); R( c, d, e, a, b, F4, 78 ); + addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); /* Update the chaining variables. */ - addl state_h0(RSTATE), a; - addl state_h1(RSTATE), b; - addl state_h2(RSTATE), c; addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; - movl a, state_h0(RSTATE); - movl b, state_h1(RSTATE); - movl c, state_h2(RSTATE); movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; @@ -371,6 +420,7 @@ _gcry_sha1_transform_amd64_ssse3: /* burn_stack */ movl $(16*4 + 2*8 + 31), %eax; +.Lret: ret; #endif -- cgit v1.2.1