From adbdca0d58f9c06dc3850b95e3455e179c1e6960 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Sun, 26 Jul 2015 17:17:20 +0300 Subject: Add bulk OCB for Serpent SSE2, AVX2 and NEON implementations * cipher/cipher.c (_gcry_cipher_open_internal): Setup OCB bulk functions for Serpent. * cipher/serpent-armv7-neon.S: Add OCB assembly functions. * cipher/serpent-avx2-amd64.S: Add OCB assembly functions. * cipher/serpent-sse2-amd64.S: Add OCB assembly functions. * cipher/serpent.c (_gcry_serpent_sse2_ocb_enc) (_gcry_serpent_sse2_ocb_dec, _gcry_serpent_sse2_ocb_auth) (_gcry_serpent_neon_ocb_enc, _gcry_serpent_neon_ocb_dec) (_gcry_serpent_neon_ocb_auth, _gcry_serpent_avx2_ocb_enc) (_gcry_serpent_avx2_ocb_dec, _gcry_serpent_avx2_ocb_auth): New prototypes. (get_l, _gcry_serpent_ocb_crypt, _gcry_serpent_ocb_auth): New. * src/cipher.h (_gcry_serpent_ocb_crypt) (_gcry_serpent_ocb_auth): New. * tests/basic.c (check_ocb_cipher): Add test-vector for serpent. -- Signed-off-by: Jussi Kivilinna --- cipher/cipher.c | 2 + cipher/serpent-armv7-neon.S | 255 +++++++++++++++++++++++++++ cipher/serpent-avx2-amd64.S | 307 +++++++++++++++++++++++++++++++- cipher/serpent-sse2-amd64.S | 307 +++++++++++++++++++++++++++++++- cipher/serpent.c | 419 +++++++++++++++++++++++++++++++++++++++++++- src/cipher.h | 5 + tests/basic.c | 9 + 7 files changed, 1301 insertions(+), 3 deletions(-) diff --git a/cipher/cipher.c b/cipher/cipher.c index 8483c5fc..30c2f489 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -553,6 +553,8 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle, h->bulk.cbc_dec = _gcry_serpent_cbc_dec; h->bulk.cfb_dec = _gcry_serpent_cfb_dec; h->bulk.ctr_enc = _gcry_serpent_ctr_enc; + h->bulk.ocb_crypt = _gcry_serpent_ocb_crypt; + h->bulk.ocb_auth = _gcry_serpent_ocb_auth; break; #endif /*USE_SERPENT*/ #ifdef USE_TWOFISH diff --git a/cipher/serpent-armv7-neon.S b/cipher/serpent-armv7-neon.S index 35595583..adff6394 100644 --- a/cipher/serpent-armv7-neon.S +++ b/cipher/serpent-armv7-neon.S @@ -866,4 +866,259 @@ _gcry_serpent_neon_cbc_dec: pop {pc}; .size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec; +.align 3 +.globl _gcry_serpent_neon_ocb_enc +.type _gcry_serpent_neon_ocb_enc,%function; +_gcry_serpent_neon_ocb_enc: + /* input: + * r0 : ctx, CTX + * r1 : dst (8 blocks) + * r2 : src (8 blocks) + * r3 : offset + * sp+0: checksum + * sp+4: L pointers (void *L[8]) + */ + + push {r4-r11, ip, lr}; + add ip, sp, #(10*4); + + vpush {RA4-RB2}; + + ldm ip, {r4, lr}; + + vld1.8 {RT0}, [r3]; + vld1.8 {RT1}, [r4]; + + /* Load L pointers */ + ldm lr!, {r5, r6, r7, r8}; + ldm lr, {r9, r10, r11, ip}; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + + vld1.8 {RA0, RA1}, [r2]!; + vld1.8 {RA2, RA3}, [r2]!; + vld1.8 {RB0, RB1}, [r2]!; + vld1.8 {RB2, RB3}, [r2]; + +#define OCB_INPUT(lreg, vreg) \ + vld1.8 {RT3}, [lreg]; \ + veor RT0, RT3; \ + veor RT1, vreg; \ + veor vreg, RT0; \ + vst1.8 {RT0}, [r1]!; + + OCB_INPUT(r5, RA0); + OCB_INPUT(r6, RA1); + OCB_INPUT(r7, RA2); + OCB_INPUT(r8, RA3); + OCB_INPUT(r9, RB0); + OCB_INPUT(r10, RB1); + OCB_INPUT(r11, RB2); + OCB_INPUT(ip, RB3); +#undef OCB_INPUT + + sub r1, r1, #(8*16); + vst1.8 {RT0}, [r3]; + vst1.8 {RT1}, [r4]; + mov r2, r1; + + bl __serpent_enc_blk8; + + vld1.8 {RT0, RT1}, [r1]!; + veor RT0, RA4, RT0; + veor RT1, RA1, RT1; + vld1.8 {RT2, RT3}, [r1]!; + vst1.8 {RT0, RT1}, [r2]!; + veor RT2, RA2, RT2; + veor RT3, RA0, RT3; + vld1.8 {RT0, RT1}, [r1]!; + vst1.8 
{RT2, RT3}, [r2]!;
+	veor RT0, RB4, RT0;
+	veor RT1, RB1, RT1;
+	vld1.8 {RT2, RT3}, [r1]!;
+	vst1.8 {RT0, RT1}, [r2]!;
+	veor RT2, RB2, RT2;
+	veor RT3, RB0, RT3;
+	vst1.8 {RT2, RT3}, [r2]!;
+
+	vpop {RA4-RB2};
+
+	/* clear the used registers */
+	veor RA3, RA3;
+	veor RB3, RB3;
+
+	pop {r4-r11, ip, pc};
+.size _gcry_serpent_neon_ocb_enc,.-_gcry_serpent_neon_ocb_enc;
+
+.align 3
+.globl _gcry_serpent_neon_ocb_dec
+.type _gcry_serpent_neon_ocb_dec,%function;
+_gcry_serpent_neon_ocb_dec:
+	/* input:
+	 *	r0 : ctx, CTX
+	 *	r1 : dst (8 blocks)
+	 *	r2 : src (8 blocks)
+	 *	r3 : offset
+	 *	sp+0: checksum
+	 *	sp+4: L pointers (void *L[8])
+	 */
+
+	push {r4-r11, ip, lr};
+	add ip, sp, #(10*4);
+
+	vpush {RA4-RB2};
+
+	ldm ip, {r4, lr};
+
+	vld1.8 {RT0}, [r3];
+
+	/* Load L pointers */
+	ldm lr!, {r5, r6, r7, r8};
+	ldm lr, {r9, r10, r11, ip};
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+	vld1.8 {RA0, RA1}, [r2]!;
+	vld1.8 {RA2, RA3}, [r2]!;
+	vld1.8 {RB0, RB1}, [r2]!;
+	vld1.8 {RB2, RB3}, [r2];
+
+#define OCB_INPUT(lreg, vreg) \
+	vld1.8 {RT3}, [lreg]; \
+	veor RT0, RT3; \
+	veor vreg, RT0; \
+	vst1.8 {RT0}, [r1]!;
+
+	OCB_INPUT(r5, RA0);
+	OCB_INPUT(r6, RA1);
+	OCB_INPUT(r7, RA2);
+	OCB_INPUT(r8, RA3);
+	OCB_INPUT(r9, RB0);
+	OCB_INPUT(r10, RB1);
+	OCB_INPUT(r11, RB2);
+	OCB_INPUT(ip, RB3);
+#undef OCB_INPUT
+
+	sub r1, r1, #(8*16);
+	vst1.8 {RT0}, [r3];
+	mov r2, r1;
+
+	bl __serpent_dec_blk8;
+
+	/* Checksum_i = Checksum_{i-1} xor P_i */
+	vld1.8 {RA4}, [r4];
+
+	vld1.8 {RT0, RT1}, [r1]!;
+	veor RA0, RA0, RT0;
+	veor RA1, RA1, RT1;
+	vld1.8 {RT2, RT3}, [r1]!;
+	veor RA4, RA4, RA0;
+	vst1.8 {RA0, RA1}, [r2]!;
+	veor RA4, RA4, RA1;
+	veor RA2, RA2, RT2;
+	veor RA3, RA3, RT3;
+	vld1.8 {RT0, RT1}, [r1]!;
+	veor RA4, RA4, RA2;
+	vst1.8 {RA2, RA3}, [r2]!;
+	veor RA4, RA4, RA3;
+	veor RB0, RB0, RT0;
+	veor RB1, RB1, RT1;
+	vld1.8 {RT2, RT3}, [r1]!;
+	veor RA4, RA4, RB0;
+	vst1.8 {RB0, RB1}, [r2]!;
+	veor RA4, RA4, RB1;
+	veor RB2, RB2, RT2;
+	veor RB3, RB3, RT3;
+	veor RA4, RA4, RB2;
+	vst1.8 {RB2, RB3}, [r2]!;
+
+	veor RA4, RA4, RB3;
+	vst1.8 {RA4}, [r4];
+
+	vpop {RA4-RB2};
+
+	/* clear the used registers */
+	veor RB4, RB4;
+
+	pop {r4-r11, ip, pc};
+.size _gcry_serpent_neon_ocb_dec,.-_gcry_serpent_neon_ocb_dec;
+
+.align 3
+.globl _gcry_serpent_neon_ocb_auth
+.type _gcry_serpent_neon_ocb_auth,%function;
+_gcry_serpent_neon_ocb_auth:
+	/* input:
+	 *	r0 : ctx, CTX
+	 *	r1 : abuf (8 blocks)
+	 *	r2 : offset
+	 *	r3 : checksum
+	 *	sp+0: L pointers (void *L[8])
+	 */
+
+	push {r5-r11, ip, lr};
+	ldr lr, [sp, #(9*4)];
+
+	vpush {RA4-RB2};
+
+	vld1.8 {RT0}, [r2];
+
+	/* Load L pointers */
+	ldm lr!, {r5, r6, r7, r8};
+	ldm lr, {r9, r10, r11, ip};
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+	vld1.8 {RA0, RA1}, [r1]!;
+	vld1.8 {RA2, RA3}, [r1]!;
+	vld1.8 {RB0, RB1}, [r1]!;
+	vld1.8 {RB2, RB3}, [r1];
+
+#define OCB_INPUT(lreg, vreg) \
+	vld1.8 {RT3}, [lreg]; \
+	veor RT0, RT3; \
+	veor vreg, RT0;
+
+	OCB_INPUT(r5, RA0);
+	OCB_INPUT(r6, RA1);
+	OCB_INPUT(r7, RA2);
+	OCB_INPUT(r8, RA3);
+	OCB_INPUT(r9, RB0);
+	OCB_INPUT(r10, RB1);
+	OCB_INPUT(r11, RB2);
+	OCB_INPUT(ip, RB3);
+#undef OCB_INPUT
+
+	vst1.8 {RT0}, [r2];
+
+	bl __serpent_enc_blk8;
+
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+	vld1.8 {RT0}, [r3];
+
+	veor RA4, RB4;
+	veor RA1, RB1;
+	veor RA2, RB2;
+	veor RA0, RB0;
+
+	veor RA2, RT0;
+	veor RA1, RA4;
+	veor RA0, RA2;
+
+	veor RA0, RA1;
+
+	vst1.8 {RA0}, [r3];
+
+	vpop 
{RA4-RB2}; + + /* clear the used registers */ + veor RA3, RA3; + veor RB3, RB3; + + pop {r5-r11, ip, pc}; +.size _gcry_serpent_neon_ocb_auth,.-_gcry_serpent_neon_ocb_auth; + #endif diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index 3f59f060..2902dab5 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -1,6 +1,6 @@ /* serpent-avx2-amd64.S - AVX2 implementation of Serpent cipher * - * Copyright (C) 2013 Jussi Kivilinna + * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -808,6 +808,311 @@ _gcry_serpent_avx2_cfb_dec: ret ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;) +.align 8 +.globl _gcry_serpent_avx2_ocb_enc +ELF(.type _gcry_serpent_avx2_ocb_enc,@function;) + +_gcry_serpent_avx2_ocb_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[16]) + */ + + vzeroupper; + + subq $(4 * 8), %rsp; + + movq %r10, (0 * 8)(%rsp); + movq %r11, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + + vmovdqu (%rcx), RTMP0x; + vmovdqu (%r8), RTMP1x; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, l0reg, l1reg, yreg) \ + vmovdqu (n * 32)(%rdx), yreg; \ + vpxor (l0reg), RTMP0x, RNOTx; \ + vpxor (l1reg), RNOTx, RTMP0x; \ + vinserti128 $1, RTMP0x, RNOT, RNOT; \ + vpxor yreg, RTMP1, RTMP1; \ + vpxor yreg, RNOT, yreg; \ + vmovdqu RNOT, (n * 32)(%rsi); + + movq (0 * 8)(%r9), %r10; + movq (1 * 8)(%r9), %r11; + movq (2 * 8)(%r9), %r12; + movq (3 * 8)(%r9), %r13; + OCB_INPUT(0, %r10, %r11, RA0); + OCB_INPUT(1, %r12, %r13, RA1); + movq (4 * 8)(%r9), %r10; + movq (5 * 8)(%r9), %r11; + movq (6 * 8)(%r9), %r12; + movq (7 * 8)(%r9), %r13; + OCB_INPUT(2, %r10, %r11, RA2); + OCB_INPUT(3, %r12, %r13, RA3); + movq (8 * 8)(%r9), %r10; + movq (9 * 8)(%r9), %r11; + movq (10 * 8)(%r9), %r12; + movq (11 * 8)(%r9), %r13; + OCB_INPUT(4, %r10, %r11, RB0); + OCB_INPUT(5, %r12, %r13, RB1); + movq (12 * 8)(%r9), %r10; + movq (13 * 8)(%r9), %r11; + movq (14 * 8)(%r9), %r12; + movq (15 * 8)(%r9), %r13; + OCB_INPUT(6, %r10, %r11, RB2); + OCB_INPUT(7, %r12, %r13, RB3); +#undef OCB_INPUT + + vextracti128 $1, RTMP1, RNOTx; + vmovdqu RTMP0x, (%rcx); + vpxor RNOTx, RTMP1x, RTMP1x; + vmovdqu RTMP1x, (%r8); + + movq (0 * 8)(%rsp), %r10; + movq (1 * 8)(%rsp), %r11; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + + call __serpent_enc_blk16; + + addq $(4 * 8), %rsp; + + vpxor (0 * 32)(%rsi), RA4, RA4; + vpxor (1 * 32)(%rsi), RA1, RA1; + vpxor (2 * 32)(%rsi), RA2, RA2; + vpxor (3 * 32)(%rsi), RA0, RA0; + vpxor (4 * 32)(%rsi), RB4, RB4; + vpxor (5 * 32)(%rsi), RB1, RB1; + vpxor (6 * 32)(%rsi), RB2, RB2; + vpxor (7 * 32)(%rsi), RB0, RB0; + + vmovdqu RA4, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA0, (3 * 32)(%rsi); + vmovdqu RB4, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB0, (7 * 32)(%rsi); + + vzeroall; + + ret; +ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;) + +.align 8 +.globl _gcry_serpent_avx2_ocb_dec +ELF(.type _gcry_serpent_avx2_ocb_dec,@function;) + +_gcry_serpent_avx2_ocb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[16]) + */ + + vzeroupper; + + subq $(4 * 8), %rsp; + + 
movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	vmovdqu (%rcx), RTMP0x;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	vmovdqu (n * 32)(%rdx), yreg; \
+	vpxor (l0reg), RTMP0x, RNOTx; \
+	vpxor (l1reg), RNOTx, RTMP0x; \
+	vinserti128 $1, RTMP0x, RNOT, RNOT; \
+	vpxor yreg, RNOT, yreg; \
+	vmovdqu RNOT, (n * 32)(%rsi);
+
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %r11, RA0);
+	OCB_INPUT(1, %r12, %r13, RA1);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(2, %r10, %r11, RA2);
+	OCB_INPUT(3, %r12, %r13, RA3);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %r11, RB0);
+	OCB_INPUT(5, %r12, %r13, RB1);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(6, %r10, %r11, RB2);
+	OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+	vmovdqu RTMP0x, (%rcx);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __serpent_dec_blk16;
+
+	addq $(4 * 8), %rsp;
+
+	vmovdqu (%r8), RTMP1x;
+
+	vpxor (0 * 32)(%rsi), RA0, RA0;
+	vpxor (1 * 32)(%rsi), RA1, RA1;
+	vpxor (2 * 32)(%rsi), RA2, RA2;
+	vpxor (3 * 32)(%rsi), RA3, RA3;
+	vpxor (4 * 32)(%rsi), RB0, RB0;
+	vpxor (5 * 32)(%rsi), RB1, RB1;
+	vpxor (6 * 32)(%rsi), RB2, RB2;
+	vpxor (7 * 32)(%rsi), RB3, RB3;
+
+	/* Checksum_i = Checksum_{i-1} xor P_i */
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vpxor RA0, RTMP1, RTMP1;
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vpxor RA1, RTMP1, RTMP1;
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vpxor RA2, RTMP1, RTMP1;
+	vmovdqu RA3, (3 * 32)(%rsi);
+	vpxor RA3, RTMP1, RTMP1;
+	vmovdqu RB0, (4 * 32)(%rsi);
+	vpxor RB0, RTMP1, RTMP1;
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vpxor RB1, RTMP1, RTMP1;
+	vmovdqu RB2, (6 * 32)(%rsi);
+	vpxor RB2, RTMP1, RTMP1;
+	vmovdqu RB3, (7 * 32)(%rsi);
+	vpxor RB3, RTMP1, RTMP1;
+
+	vextracti128 $1, RTMP1, RNOTx;
+	vpxor RNOTx, RTMP1x, RTMP1x;
+	vmovdqu RTMP1x, (%r8);
+
+	vzeroall;
+
+	ret;
+ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_serpent_avx2_ocb_auth
+ELF(.type _gcry_serpent_avx2_ocb_auth,@function;)
+
+_gcry_serpent_avx2_ocb_auth:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: abuf (16 blocks)
+	 *	%rdx: offset
+	 *	%rcx: checksum
+	 *	%r8 : L pointers (void *L[16])
+	 */
+
+	vzeroupper;
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	vmovdqu (%rdx), RTMP0x;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	vmovdqu (n * 32)(%rsi), yreg; \
+	vpxor (l0reg), RTMP0x, RNOTx; \
+	vpxor (l1reg), RNOTx, RTMP0x; \
+	vinserti128 $1, RTMP0x, RNOT, RNOT; \
+	vpxor yreg, RNOT, yreg;
+
+	movq (0 * 8)(%r8), %r10;
+	movq (1 * 8)(%r8), %r11;
+	movq (2 * 8)(%r8), %r12;
+	movq (3 * 8)(%r8), %r13;
+	OCB_INPUT(0, %r10, %r11, RA0);
+	OCB_INPUT(1, %r12, %r13, RA1);
+	movq (4 * 8)(%r8), %r10;
+	movq (5 * 8)(%r8), %r11;
+	movq (6 * 8)(%r8), %r12;
+	movq (7 * 8)(%r8), %r13;
+	OCB_INPUT(2, %r10, %r11, RA2);
+	OCB_INPUT(3, %r12, %r13, RA3);
+	movq (8 * 8)(%r8), %r10; 
+ movq (9 * 8)(%r8), %r11; + movq (10 * 8)(%r8), %r12; + movq (11 * 8)(%r8), %r13; + OCB_INPUT(4, %r10, %r11, RB0); + OCB_INPUT(5, %r12, %r13, RB1); + movq (12 * 8)(%r8), %r10; + movq (13 * 8)(%r8), %r11; + movq (14 * 8)(%r8), %r12; + movq (15 * 8)(%r8), %r13; + OCB_INPUT(6, %r10, %r11, RB2); + OCB_INPUT(7, %r12, %r13, RB3); +#undef OCB_INPUT + + vmovdqu RTMP0x, (%rdx); + + movq (0 * 8)(%rsp), %r10; + movq (1 * 8)(%rsp), %r11; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + + call __serpent_enc_blk16; + + addq $(4 * 8), %rsp; + + vpxor RA4, RB4, RA4; + vpxor RA1, RB1, RA1; + vpxor RA2, RB2, RA2; + vpxor RA0, RB0, RA0; + + vpxor RA4, RA1, RA1; + vpxor RA2, RA0, RA0; + + vpxor RA1, RA0, RTMP1; + + vextracti128 $1, RTMP1, RNOTx; + vpxor (%rcx), RTMP1x, RTMP1x; + vpxor RNOTx, RTMP1x, RTMP1x; + vmovdqu RTMP1x, (%rcx); + + vzeroall; + + ret; +ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;) + .data .align 16 diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index adbf4e27..b149af24 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -1,6 +1,6 @@ /* serpent-sse2-amd64.S - SSE2 implementation of Serpent cipher * - * Copyright (C) 2013 Jussi Kivilinna + * Copyright (C) 2013-2015 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -866,5 +866,310 @@ _gcry_serpent_sse2_cfb_dec: ret ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;) +.align 8 +.globl _gcry_serpent_sse2_ocb_enc +ELF(.type _gcry_serpent_sse2_ocb_enc,@function;) + +_gcry_serpent_sse2_ocb_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[8]) + */ + + subq $(4 * 8), %rsp; + + movq %r10, (0 * 8)(%rsp); + movq %r11, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + + movdqu (%rcx), RTMP0; + movdqu (%r8), RTMP1; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Checksum_i = Checksum_{i-1} xor P_i */ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ + +#define OCB_INPUT(n, lreg, xreg) \ + movdqu (n * 16)(%rdx), xreg; \ + movdqu (lreg), RNOT; \ + pxor RNOT, RTMP0; \ + pxor xreg, RTMP1; \ + pxor RTMP0, xreg; \ + movdqu RTMP0, (n * 16)(%rsi); + movq (0 * 8)(%r9), %r10; + movq (1 * 8)(%r9), %r11; + movq (2 * 8)(%r9), %r12; + movq (3 * 8)(%r9), %r13; + OCB_INPUT(0, %r10, RA0); + OCB_INPUT(1, %r11, RA1); + OCB_INPUT(2, %r12, RA2); + OCB_INPUT(3, %r13, RA3); + movq (4 * 8)(%r9), %r10; + movq (5 * 8)(%r9), %r11; + movq (6 * 8)(%r9), %r12; + movq (7 * 8)(%r9), %r13; + OCB_INPUT(4, %r10, RB0); + OCB_INPUT(5, %r11, RB1); + OCB_INPUT(6, %r12, RB2); + OCB_INPUT(7, %r13, RB3); +#undef OCB_INPUT + + movdqu RTMP0, (%rcx); + movdqu RTMP1, (%r8); + + movq (0 * 8)(%rsp), %r10; + movq (1 * 8)(%rsp), %r11; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + + call __serpent_enc_blk8; + + addq $(4 * 8), %rsp; + + pxor_u((0 * 16)(%rsi), RA4, RTMP0); + pxor_u((1 * 16)(%rsi), RA1, RTMP0); + pxor_u((2 * 16)(%rsi), RA2, RTMP0); + pxor_u((3 * 16)(%rsi), RA0, RTMP0); + pxor_u((4 * 16)(%rsi), RB4, RTMP0); + pxor_u((5 * 16)(%rsi), RB1, RTMP0); + pxor_u((6 * 16)(%rsi), RB2, RTMP0); + pxor_u((7 * 16)(%rsi), RB0, RTMP0); + + movdqu RA4, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA0, (3 * 16)(%rsi); + movdqu RB4, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB0, (7 * 16)(%rsi); + + /* clear the used registers */ + pxor RA0, RA0; + pxor RA1, RA1; + pxor RA2, 
RA2; + pxor RA3, RA3; + pxor RA4, RA4; + pxor RB0, RB0; + pxor RB1, RB1; + pxor RB2, RB2; + pxor RB3, RB3; + pxor RB4, RB4; + pxor RTMP0, RTMP0; + pxor RTMP1, RTMP1; + pxor RTMP2, RTMP2; + pxor RNOT, RNOT; + + ret; +ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;) + +.align 8 +.globl _gcry_serpent_sse2_ocb_dec +ELF(.type _gcry_serpent_sse2_ocb_dec,@function;) + +_gcry_serpent_sse2_ocb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: offset + * %r8 : checksum + * %r9 : L pointers (void *L[8]) + */ + + subq $(4 * 8), %rsp; + + movq %r10, (0 * 8)(%rsp); + movq %r11, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + + movdqu (%rcx), RTMP0; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ + +#define OCB_INPUT(n, lreg, xreg) \ + movdqu (n * 16)(%rdx), xreg; \ + movdqu (lreg), RNOT; \ + pxor RNOT, RTMP0; \ + pxor RTMP0, xreg; \ + movdqu RTMP0, (n * 16)(%rsi); + movq (0 * 8)(%r9), %r10; + movq (1 * 8)(%r9), %r11; + movq (2 * 8)(%r9), %r12; + movq (3 * 8)(%r9), %r13; + OCB_INPUT(0, %r10, RA0); + OCB_INPUT(1, %r11, RA1); + OCB_INPUT(2, %r12, RA2); + OCB_INPUT(3, %r13, RA3); + movq (4 * 8)(%r9), %r10; + movq (5 * 8)(%r9), %r11; + movq (6 * 8)(%r9), %r12; + movq (7 * 8)(%r9), %r13; + OCB_INPUT(4, %r10, RB0); + OCB_INPUT(5, %r11, RB1); + OCB_INPUT(6, %r12, RB2); + OCB_INPUT(7, %r13, RB3); +#undef OCB_INPUT + + movdqu RTMP0, (%rcx); + + movq (0 * 8)(%rsp), %r10; + movq (1 * 8)(%rsp), %r11; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + + call __serpent_dec_blk8; + + addq $(4 * 8), %rsp; + + movdqu (%r8), RTMP0; + + pxor_u((0 * 16)(%rsi), RA0, RTMP1); + pxor_u((1 * 16)(%rsi), RA1, RTMP1); + pxor_u((2 * 16)(%rsi), RA2, RTMP1); + pxor_u((3 * 16)(%rsi), RA3, RTMP1); + pxor_u((4 * 16)(%rsi), RB0, RTMP1); + pxor_u((5 * 16)(%rsi), RB1, RTMP1); + pxor_u((6 * 16)(%rsi), RB2, RTMP1); + pxor_u((7 * 16)(%rsi), RB3, RTMP1); + + /* Checksum_i = Checksum_{i-1} xor P_i */ + + movdqu RA0, (0 * 16)(%rsi); + pxor RA0, RTMP0; + movdqu RA1, (1 * 16)(%rsi); + pxor RA1, RTMP0; + movdqu RA2, (2 * 16)(%rsi); + pxor RA2, RTMP0; + movdqu RA3, (3 * 16)(%rsi); + pxor RA3, RTMP0; + movdqu RB0, (4 * 16)(%rsi); + pxor RB0, RTMP0; + movdqu RB1, (5 * 16)(%rsi); + pxor RB1, RTMP0; + movdqu RB2, (6 * 16)(%rsi); + pxor RB2, RTMP0; + movdqu RB3, (7 * 16)(%rsi); + pxor RB3, RTMP0; + + movdqu RTMP0, (%r8); + + /* clear the used registers */ + pxor RA0, RA0; + pxor RA1, RA1; + pxor RA2, RA2; + pxor RA3, RA3; + pxor RA4, RA4; + pxor RB0, RB0; + pxor RB1, RB1; + pxor RB2, RB2; + pxor RB3, RB3; + pxor RB4, RB4; + pxor RTMP0, RTMP0; + pxor RTMP1, RTMP1; + pxor RTMP2, RTMP2; + pxor RNOT, RNOT; + + ret; +ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;) + +.align 8 +.globl _gcry_serpent_sse2_ocb_auth +ELF(.type _gcry_serpent_sse2_ocb_auth,@function;) + +_gcry_serpent_sse2_ocb_auth: + /* input: + * %rdi: ctx, CTX + * %rsi: abuf (8 blocks) + * %rdx: offset + * %rcx: checksum + * %r8 : L pointers (void *L[8]) + */ + + subq $(4 * 8), %rsp; + + movq %r10, (0 * 8)(%rsp); + movq %r11, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + + movdqu (%rdx), RTMP0; + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + +#define OCB_INPUT(n, lreg, xreg) \ + movdqu (n * 16)(%rsi), xreg; \ + movdqu (lreg), RNOT; \ + pxor RNOT, RTMP0; \ + pxor RTMP0, xreg; + movq (0 * 8)(%r8), %r10; + movq (1 * 8)(%r8), %r11; + movq (2 
* 8)(%r8), %r12;
+	movq (3 * 8)(%r8), %r13;
+	OCB_INPUT(0, %r10, RA0);
+	OCB_INPUT(1, %r11, RA1);
+	OCB_INPUT(2, %r12, RA2);
+	OCB_INPUT(3, %r13, RA3);
+	movq (4 * 8)(%r8), %r10;
+	movq (5 * 8)(%r8), %r11;
+	movq (6 * 8)(%r8), %r12;
+	movq (7 * 8)(%r8), %r13;
+	OCB_INPUT(4, %r10, RB0);
+	OCB_INPUT(5, %r11, RB1);
+	OCB_INPUT(6, %r12, RB2);
+	OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+	movdqu RTMP0, (%rdx);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __serpent_enc_blk8;
+
+	addq $(4 * 8), %rsp;
+
+	movdqu (%rcx), RTMP0;
+	pxor RB4, RA4;
+	pxor RB1, RA1;
+	pxor RB2, RA2;
+	pxor RB0, RA0;
+
+	pxor RTMP0, RA2;
+	pxor RA4, RA1;
+	pxor RA2, RA0;
+
+	pxor RA1, RA0;
+	movdqu RA0, (%rcx);
+
+	/* clear the used registers */
+	pxor RA0, RA0;
+	pxor RA1, RA1;
+	pxor RA2, RA2;
+	pxor RA3, RA3;
+	pxor RA4, RA4;
+	pxor RB0, RB0;
+	pxor RB1, RB1;
+	pxor RB2, RB2;
+	pxor RB3, RB3;
+	pxor RB4, RB4;
+	pxor RTMP0, RTMP0;
+	pxor RTMP1, RTMP1;
+	pxor RTMP2, RTMP2;
+	pxor RNOT, RNOT;
+
+	ret;
+ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;)
+
 #endif /*defined(USE_SERPENT)*/
 #endif /*__x86_64*/
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 7d0e1127..eb491aa0 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -29,6 +29,7 @@
 #include "cipher.h"
 #include "bithelp.h"
 #include "bufhelp.h"
+#include "cipher-internal.h"
 #include "cipher-selftest.h"
 
 
@@ -118,10 +119,30 @@ extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
 				       unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_ocb_enc(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const void *Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_ocb_dec(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const void *Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_ocb_auth(serpent_context_t *ctx,
+					const unsigned char *abuf,
+					unsigned char *offset,
+					unsigned char *checksum,
+					const void *Ls[8]) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_AVX2
-/* Assembler implementations of Serpent using SSE2. Process 16 block in
+/* Assembler implementations of Serpent using AVX2. Process 16 blocks in
    parallel. 
*/ extern void _gcry_serpent_avx2_ctr_enc(serpent_context_t *ctx, @@ -138,6 +159,26 @@ extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_serpent_avx2_ocb_enc(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const void *Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_serpent_avx2_ocb_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const void *Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_serpent_avx2_ocb_auth(serpent_context_t *ctx, + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + const void *Ls[16]) ASM_FUNC_ABI; #endif #ifdef USE_NEON @@ -158,6 +199,26 @@ extern void _gcry_serpent_neon_cfb_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv); + +extern void _gcry_serpent_neon_ocb_enc(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const void *Ls[8]); + +extern void _gcry_serpent_neon_ocb_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const void *Ls[8]); + +extern void _gcry_serpent_neon_ocb_auth(serpent_context_t *ctx, + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + const void *Ls[8]); #endif @@ -1165,6 +1226,362 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv, _gcry_burn_stack(burn_stack_depth); } +static inline const unsigned char * +get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i) +{ + unsigned int ntz = _gcry_ctz64 (i); + + if (ntz < OCB_L_TABLE_SIZE) + return c->u_mode.ocb.L[ntz]; + else + return _gcry_cipher_ocb_get_l (c, l_tmp, i); +} + +/* Bulk encryption/decryption of complete blocks in OCB mode. */ +void +_gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + serpent_context_t *ctx = (void *)&c->context.c; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char l_tmp[sizeof(serpent_block_t)]; + const unsigned char *l; + int burn_stack_depth = 2 * sizeof (serpent_block_t); + u64 blkn = c->u_mode.ocb.data_nblocks; + +#ifdef USE_AVX2 + if (ctx->use_avx2) + { + int did_use_avx2 = 0; + const void *Ls[16]; + int i; + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + /* l_tmp will be used only every 65536-th block. */ + for (i = 0; i < 16; i += 4) + { + Ls[i + 0] = get_l(c, l_tmp, blkn + 1); + Ls[i + 1] = get_l(c, l_tmp, blkn + 2); + Ls[i + 2] = get_l(c, l_tmp, blkn + 3); + Ls[i + 3] = get_l(c, l_tmp, blkn + 4); + blkn += 4; + } + + if (encrypt) + _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else + _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + + nblocks -= 16; + outbuf += 16 * sizeof(serpent_block_t); + inbuf += 16 * sizeof(serpent_block_t); + did_use_avx2 = 1; + } + + if (did_use_avx2) + { + /* serpent-avx2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + +#ifdef USE_SSE2 + { + int did_use_sse2 = 0; + const void *Ls[8]; + int i; + + /* Process data in 8 block chunks. 
*/
+    while (nblocks >= 8)
+      {
+        /* l_tmp will be used only every 65536-th block. */
+        for (i = 0; i < 8; i += 4)
+          {
+            Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+            Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+            Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+            Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+            blkn += 4;
+          }
+
+        if (encrypt)
+          _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+                                     c->u_ctr.ctr, Ls);
+        else
+          _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+                                     c->u_ctr.ctr, Ls);
+
+        nblocks -= 8;
+        outbuf += 8 * sizeof(serpent_block_t);
+        inbuf += 8 * sizeof(serpent_block_t);
+        did_use_sse2 = 1;
+      }
+
+    if (did_use_sse2)
+      {
+        /* serpent-sse2 assembly code does not use stack */
+        if (nblocks == 0)
+          burn_stack_depth = 0;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+#ifdef USE_NEON
+  if (ctx->use_neon)
+    {
+      int did_use_neon = 0;
+      const void *Ls[8];
+      int i;
+
+      /* Process data in 8 block chunks. */
+      while (nblocks >= 8)
+        {
+          /* l_tmp will be used only every 65536-th block. */
+          for (i = 0; i < 8; i += 4)
+            {
+              Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+              Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+              Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+              Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+              blkn += 4;
+            }
+
+          if (encrypt)
+            _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+                                       c->u_ctr.ctr, Ls);
+          else
+            _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+                                       c->u_ctr.ctr, Ls);
+
+          nblocks -= 8;
+          outbuf += 8 * sizeof(serpent_block_t);
+          inbuf += 8 * sizeof(serpent_block_t);
+          did_use_neon = 1;
+        }
+
+      if (did_use_neon)
+        {
+          /* serpent-neon assembly code does not use stack */
+          if (nblocks == 0)
+            burn_stack_depth = 0;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+  if (encrypt)
+    {
+      for (; nblocks; nblocks--)
+        {
+          l = get_l(c, l_tmp, ++blkn);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          buf_xor_1 (c->u_iv.iv, l, sizeof(serpent_block_t));
+          buf_cpy (l_tmp, inbuf, sizeof(serpent_block_t));
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          buf_xor_1 (c->u_ctr.ctr, l_tmp, sizeof(serpent_block_t));
+          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+          buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
+          serpent_encrypt_internal(ctx, l_tmp, l_tmp);
+          buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
+          buf_cpy (outbuf, l_tmp, sizeof(serpent_block_t));
+
+          inbuf += sizeof(serpent_block_t);
+          outbuf += sizeof(serpent_block_t);
+        }
+    }
+  else
+    {
+      for (; nblocks; nblocks--)
+        {
+          l = get_l(c, l_tmp, ++blkn);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          buf_xor_1 (c->u_iv.iv, l, sizeof(serpent_block_t));
+          buf_cpy (l_tmp, inbuf, sizeof(serpent_block_t));
+          /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+          buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
+          serpent_decrypt_internal(ctx, l_tmp, l_tmp);
+          buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          buf_xor_1 (c->u_ctr.ctr, l_tmp, sizeof(serpent_block_t));
+          buf_cpy (outbuf, l_tmp, sizeof(serpent_block_t));
+
+          inbuf += sizeof(serpent_block_t);
+          outbuf += sizeof(serpent_block_t);
+        }
+    }
+
+  c->u_mode.ocb.data_nblocks = blkn;
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
+/* Bulk authentication of complete blocks in OCB mode. 
*/
+void
+_gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+                        size_t nblocks)
+{
+  serpent_context_t *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  unsigned char l_tmp[sizeof(serpent_block_t)];
+  const unsigned char *l;
+  int burn_stack_depth = 2 * sizeof(serpent_block_t);
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+#ifdef USE_AVX2
+  if (ctx->use_avx2)
+    {
+      int did_use_avx2 = 0;
+      const void *Ls[16];
+      int i;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          /* l_tmp will be used only every 65536-th block. */
+          for (i = 0; i < 16; i += 4)
+            {
+              Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+              Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+              Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+              Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+              blkn += 4;
+            }
+
+          _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+                                      c->u_mode.ocb.aad_sum, Ls);
+
+          nblocks -= 16;
+          abuf += 16 * sizeof(serpent_block_t);
+          did_use_avx2 = 1;
+        }
+
+      if (did_use_avx2)
+        {
+          /* serpent-avx2 assembly code does not use stack */
+          if (nblocks == 0)
+            burn_stack_depth = 0;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+#ifdef USE_SSE2
+  {
+    int did_use_sse2 = 0;
+    const void *Ls[8];
+    int i;
+
+    /* Process data in 8 block chunks. */
+    while (nblocks >= 8)
+      {
+        /* l_tmp will be used only every 65536-th block. */
+        for (i = 0; i < 8; i += 4)
+          {
+            Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+            Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+            Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+            Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+            blkn += 4;
+          }
+
+        _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+                                    c->u_mode.ocb.aad_sum, Ls);
+
+        nblocks -= 8;
+        abuf += 8 * sizeof(serpent_block_t);
+        did_use_sse2 = 1;
+      }
+
+    if (did_use_sse2)
+      {
+        /* serpent-sse2 assembly code does not use stack */
+        if (nblocks == 0)
+          burn_stack_depth = 0;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+#ifdef USE_NEON
+  if (ctx->use_neon)
+    {
+      int did_use_neon = 0;
+      const void *Ls[8];
+      int i;
+
+      /* Process data in 8 block chunks. */
+      while (nblocks >= 8)
+        {
+          /* l_tmp will be used only every 65536-th block. */
+          for (i = 0; i < 8; i += 4)
+            {
+              Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+              Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+              Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+              Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+              blkn += 4;
+            }
+
+          _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+                                      c->u_mode.ocb.aad_sum, Ls);
+
+          nblocks -= 8;
+          abuf += 8 * sizeof(serpent_block_t);
+          did_use_neon = 1;
+        }
+
+      if (did_use_neon)
+        {
+          /* serpent-neon assembly code does not use stack */
+          if (nblocks == 0)
+            burn_stack_depth = 0;
+        }
+
+      /* Use generic code to handle smaller chunks... 
*/ + } +#endif + + for (; nblocks; nblocks--) + { + l = get_l(c, l_tmp, ++blkn); + + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ + buf_xor_1 (c->u_mode.ocb.aad_offset, l, sizeof(serpent_block_t)); + /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */ + buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, sizeof(serpent_block_t)); + serpent_encrypt_internal(ctx, l_tmp, l_tmp); + buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, sizeof(serpent_block_t)); + + abuf += sizeof(serpent_block_t); + } + + c->u_mode.ocb.aad_nblocks = blkn; + + wipememory(&l_tmp, sizeof(l_tmp)); + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *)); +} + /* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR diff --git a/src/cipher.h b/src/cipher.h index 1a66f6de..d16746a3 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -206,6 +206,11 @@ void _gcry_serpent_cbc_dec (void *context, unsigned char *iv, void _gcry_serpent_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); +void _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); +void _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, + size_t nblocks); /*-- twofish.c --*/ void _gcry_twofish_ctr_enc (void *context, unsigned char *ctr, diff --git a/tests/basic.c b/tests/basic.c index 124df55b..3ad05a45 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -3350,6 +3350,15 @@ check_ocb_cipher (void) check_ocb_cipher_largebuf(GCRY_CIPHER_TWOFISH, 32, "\xf6\xd4\xfe\x4e\x50\x85\x13\x59" "\x69\x0e\x4c\x67\x3e\xdd\x47\x90"); + check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT128, 16, + "\x3c\xfb\x66\x14\x3c\xc8\x6c\x67" + "\x26\xb8\x23\xeb\xaf\x43\x98\x69"); + check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT192, 24, + "\x5e\x62\x27\xc5\x32\xc3\x1d\xe6" + "\x2e\x65\xe7\xd6\xfb\x05\xd7\xb2"); + check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT256, 32, + "\xe7\x8b\xe6\xd4\x2f\x7a\x36\x4c" + "\xba\xee\x20\xe2\x68\xf4\xcb\xcc"); } -- cgit v1.2.1
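
Note on the L-pointer gathering: every bulk path above collects Ls[] pointers before calling into the assembly, because block number i needs L_{ntz(i)} and ntz(i) only reaches the precomputed table's limit at block 65536; the get_l() fallback through l_tmp is therefore almost never taken, and the hot loop reduces to table lookups. Below is a minimal stand-alone C sketch of that index logic; OCB_L_TABLE_SIZE, the L[] contents and ctz64() are simplified stand-ins for the libgcrypt internals (c->u_mode.ocb.L and _gcry_ctz64), not the real implementation.

#include <stdint.h>
#include <stdio.h>

#define OCB_L_TABLE_SIZE 16   /* assumed table size, as in the patch */
#define BLOCKSIZE 16          /* Serpent block size in bytes */

/* Count trailing zeros; callers never pass 0 since OCB block
   numbers start at 1. */
static unsigned int
ctz64 (uint64_t x)
{
  unsigned int n = 0;

  while (!(x & 1))
    {
      x >>= 1;
      n++;
    }
  return n;
}

int
main (void)
{
  /* Dummy table; in real OCB the entries are derived from
     ENCIPHER(K, zeros) by repeated doubling in GF(2^128). */
  static unsigned char L[OCB_L_TABLE_SIZE][BLOCKSIZE];
  const unsigned char *Ls[8];
  uint64_t blkn = 0;
  int i;

  /* Gather the L_{ntz(i)} pointers for the next 8 blocks, as the bulk
     paths do before each call into the 8-block assembly routines. */
  for (i = 0; i < 8; i++)
    {
      blkn++;
      Ls[i] = L[ctz64 (blkn)];
    }

  /* Show which table entry each block selected: L0 L1 L0 L2 L0 L1 L0 L3. */
  for (i = 0; i < 8; i++)
    printf ("block %d uses L%u\n", i + 1,
            (unsigned int) ((Ls[i] - L[0]) / BLOCKSIZE));

  return 0;
}

The same pattern explains the chunked loops in _gcry_serpent_ocb_crypt and _gcry_serpent_ocb_auth: eight (or sixteen) pointers are collected per iteration so the assembly can chain the Offset updates entirely in registers, and l_tmp is only written through when ntz(i) >= OCB_L_TABLE_SIZE.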