author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-07-07 21:49:57 +0300
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-07-27 11:47:16 +0300
commit | bb088c6b1620504fdc79e89af27c2bf3fb02b4b4 (patch)
tree | 96e9dca658f564df73fc095d16432875f8599faa /cipher
parent | 620e1e0300c79943a1846a49563b04386dc60546 (diff)
download | libgcrypt-bb088c6b1620504fdc79e89af27c2bf3fb02b4b4.tar.gz
Add bulk OCB for Camellia AES-NI/AVX and AES-NI/AVX2 implementations
* cipher/camellia-aesni-avx-amd64.S: Add OCB assembly functions.
* cipher/camellia-aesni-avx2-amd64.S: Add OCB assembly functions.
* cipher/camellia-glue.c (_gcry_camellia_aesni_avx_ocb_enc)
(_gcry_camellia_aesni_avx_ocb_dec, _gcry_camellia_aesni_avx_ocb_auth)
(_gcry_camellia_aesni_avx2_ocb_enc, _gcry_camellia_aesni_avx2_ocb_dec)
(_gcry_camellia_aesni_avx2_ocb_auth): New prototypes.
(get_l, _gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): New.
* cipher/cipher.c (_gcry_cipher_open_internal): Setup OCB bulk
functions for Camellia.
* src/cipher.h (_gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth): New.
* tests/basic.c (check_ocb_cipher): Add test-vector for Camellia.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
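For context: the per-block OCB computation that these bulk functions vectorize follows the recurrences quoted in the assembly comments below (RFC 7253 notation). A minimal C sketch of one encryption step, using hypothetical helpers camellia_enc() and xor_block() rather than real libgcrypt APIs:

```c
/* Illustrative sketch of one OCB encryption step; camellia_enc() and
 * xor_block() are hypothetical stand-ins, not libgcrypt functions. */
#include <stdint.h>
#include <string.h>

#define BLK 16  /* Camellia block size in bytes */

extern void camellia_enc (const void *key, uint8_t *dst, const uint8_t *src);

static void xor_block (uint8_t *dst, const uint8_t *src)
{
  for (int i = 0; i < BLK; i++)
    dst[i] ^= src[i];
}

/* Offset_i   = Offset_{i-1} xor L_{ntz(i)}
   Checksum_i = Checksum_{i-1} xor P_i
   C_i        = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
static void ocb_enc_step (const void *key, uint8_t offset[BLK],
                          uint8_t checksum[BLK], const uint8_t *l_ntz_i,
                          const uint8_t *p_i, uint8_t c_i[BLK])
{
  uint8_t tmp[BLK];

  xor_block (offset, l_ntz_i);      /* advance the offset */
  xor_block (checksum, p_i);        /* fold plaintext into the checksum */
  memcpy (tmp, p_i, BLK);
  xor_block (tmp, offset);          /* whiten the input */
  camellia_enc (key, tmp, tmp);     /* one block-cipher call */
  xor_block (tmp, offset);          /* whiten the output */
  memcpy (c_i, tmp, BLK);
}
```

The bulk assembly performs these same steps for 16 (AVX) or 32 (AVX2) blocks at a time, batching the block-cipher calls through __camellia_enc_blk16/__camellia_enc_blk32.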
Diffstat (limited to 'cipher')
-rw-r--r-- | cipher/camellia-aesni-avx-amd64.S | 424
-rw-r--r-- | cipher/camellia-aesni-avx2-amd64.S | 503
-rw-r--r-- | cipher/camellia-glue.c | 329
-rw-r--r-- | cipher/cipher.c | 2
4 files changed, 1252 insertions, 6 deletions
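A note on the AVX2 entry points before the diff: they handle 32 blocks per call but pack two 128-bit blocks into each ymm register, so their OCB_INPUT macro chains two L-table pointers per register and splices the two running offsets together with vinserti128. A rough intrinsics sketch of that offset pairing (illustrative only, not code from the patch):

```c
#include <immintrin.h>

/* Sketch of the AVX2 OCB_INPUT offset handling: chain two offsets through
 * the low 128-bit lane, then pair them into one 256-bit register. */
typedef struct { __m128i last; } ocb_off;

static __m256i offset_pair (ocb_off *st, const __m128i *l0, const __m128i *l1)
{
  __m128i off0 = _mm_xor_si128 (st->last, _mm_loadu_si128 (l0)); /* Offset_i */
  __m128i off1 = _mm_xor_si128 (off0, _mm_loadu_si128 (l1));     /* Offset_{i+1} */
  st->last = off1;                       /* carried into the next pair */
  return _mm256_inserti128_si256 (_mm256_castsi128_si256 (off0), off1, 1);
}
```

This keeps the inherently serial Offset_i recurrence in a single xmm register while still feeding full 256-bit registers to the 32-way Camellia core.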
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index c047a21e..5a3a3cbc 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1,6 +1,6 @@
 /* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher
  *
- * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -1211,6 +1211,428 @@ _gcry_camellia_aesni_avx_cfb_dec:
 	ret;
 ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
 
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_enc
+ELF(.type _gcry_camellia_aesni_avx_ocb_enc,@function;)
+
+_gcry_camellia_aesni_avx_ocb_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[16])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 16 + 4 * 8), %rsp;
+	andq $~31, %rsp;
+	movq %rsp, %rax;
+
+	movq %r10, (16 * 16 + 0 * 8)(%rax);
+	movq %r11, (16 * 16 + 1 * 8)(%rax);
+	movq %r12, (16 * 16 + 2 * 8)(%rax);
+	movq %r13, (16 * 16 + 3 * 8)(%rax);
+
+	vmovdqu (%rcx), %xmm14;
+	vmovdqu (%r8), %xmm15;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Checksum_i = Checksum_{i-1} xor P_i */
+	/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+	vmovdqu (n * 16)(%rdx), xreg; \
+	vpxor (lreg), %xmm14, %xmm14; \
+	vpxor xreg, %xmm15, %xmm15; \
+	vpxor xreg, %xmm14, xreg; \
+	vmovdqu %xmm14, (n * 16)(%rsi);
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %xmm0);
+	vmovdqu %xmm0, (15 * 16)(%rax);
+	OCB_INPUT(1, %r11, %xmm0);
+	vmovdqu %xmm0, (14 * 16)(%rax);
+	OCB_INPUT(2, %r12, %xmm13);
+	OCB_INPUT(3, %r13, %xmm12);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %xmm11);
+	OCB_INPUT(5, %r11, %xmm10);
+	OCB_INPUT(6, %r12, %xmm9);
+	OCB_INPUT(7, %r13, %xmm8);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(8, %r10, %xmm7);
+	OCB_INPUT(9, %r11, %xmm6);
+	OCB_INPUT(10, %r12, %xmm5);
+	OCB_INPUT(11, %r13, %xmm4);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(12, %r10, %xmm3);
+	OCB_INPUT(13, %r11, %xmm2);
+	OCB_INPUT(14, %r12, %xmm1);
+	OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm14, (%rcx);
+	vmovdqu %xmm15, (%r8);
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX), %xmm15;
+	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpxor %xmm0, %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor 14 * 16(%rax), %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call __camellia_enc_blk16;
+
+	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	vzeroall;
+
+	movq (16 * 16 + 0 * 8)(%rax), %r10;
+	movq (16 * 16 + 1 * 8)(%rax), %r11;
+	movq (16 * 16 + 2 * 8)(%rax), %r12;
+	movq (16 * 16 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_dec
+ELF(.type _gcry_camellia_aesni_avx_ocb_dec,@function;)
+
+_gcry_camellia_aesni_avx_ocb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[16])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 16 + 4 * 8), %rsp;
+	andq $~31, %rsp;
+	movq %rsp, %rax;
+
+	movq %r10, (16 * 16 + 0 * 8)(%rax);
+	movq %r11, (16 * 16 + 1 * 8)(%rax);
+	movq %r12, (16 * 16 + 2 * 8)(%rax);
+	movq %r13, (16 * 16 + 3 * 8)(%rax);
+
+	vmovdqu (%rcx), %xmm15;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+	vmovdqu (n * 16)(%rdx), xreg; \
+	vpxor (lreg), %xmm15, %xmm15; \
+	vpxor xreg, %xmm15, xreg; \
+	vmovdqu %xmm15, (n * 16)(%rsi);
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %xmm0);
+	vmovdqu %xmm0, (15 * 16)(%rax);
+	OCB_INPUT(1, %r11, %xmm14);
+	OCB_INPUT(2, %r12, %xmm13);
+	OCB_INPUT(3, %r13, %xmm12);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %xmm11);
+	OCB_INPUT(5, %r11, %xmm10);
+	OCB_INPUT(6, %r12, %xmm9);
+	OCB_INPUT(7, %r13, %xmm8);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(8, %r10, %xmm7);
+	OCB_INPUT(9, %r11, %xmm6);
+	OCB_INPUT(10, %r12, %xmm5);
+	OCB_INPUT(11, %r13, %xmm4);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(12, %r10, %xmm3);
+	OCB_INPUT(13, %r11, %xmm2);
+	OCB_INPUT(14, %r12, %xmm1);
+	OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm15, (%rcx);
+
+	movq %r8, %r10;
+
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %r9d;
+	cmovel %r9d, %r8d; /* max */
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX, %r8, 8), %xmm15;
+	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpxor %xmm0, %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor %xmm14, %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call __camellia_dec_blk16;
+
+	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+	vmovdqu %xmm7, (7 * 16)(%rax);
+	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+	/* Checksum_i = Checksum_{i-1} xor P_i */
+
+	vpxor (%r10), %xmm7, %xmm7;
+	vpxor %xmm6, %xmm7, %xmm7;
+	vpxor %xmm5, %xmm7, %xmm7;
+	vpxor %xmm4, %xmm7, %xmm7;
+	vpxor %xmm3, %xmm7, %xmm7;
+	vpxor %xmm2, %xmm7, %xmm7;
+	vpxor %xmm1, %xmm7, %xmm7;
+	vpxor %xmm0, %xmm7, %xmm7;
+	vpxor %xmm15, %xmm7, %xmm7;
+	vpxor %xmm14, %xmm7, %xmm7;
+	vpxor %xmm13, %xmm7, %xmm7;
+	vpxor %xmm12, %xmm7, %xmm7;
+	vpxor %xmm11, %xmm7, %xmm7;
+	vpxor %xmm10, %xmm7, %xmm7;
+	vpxor %xmm9, %xmm7, %xmm7;
+	vpxor %xmm8, %xmm7, %xmm7;
+	vmovdqu %xmm7, (%r10);
+	vmovdqu (7 * 16)(%rax), %xmm7;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	vzeroall;
+
+	movq (16 * 16 + 0 * 8)(%rax), %r10;
+	movq (16 * 16 + 1 * 8)(%rax), %r11;
+	movq (16 * 16 + 2 * 8)(%rax), %r12;
+	movq (16 * 16 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_auth
+ELF(.type _gcry_camellia_aesni_avx_ocb_auth,@function;)
+
+_gcry_camellia_aesni_avx_ocb_auth:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: abuf (16 blocks)
+	 *	%rdx: offset
+	 *	%rcx: checksum
+	 *	%r8 : L pointers (void *L[16])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 16 + 4 * 8), %rsp;
+	andq $~31, %rsp;
+	movq %rsp, %rax;
+
+	movq %r10, (16 * 16 + 0 * 8)(%rax);
+	movq %r11, (16 * 16 + 1 * 8)(%rax);
+	movq %r12, (16 * 16 + 2 * 8)(%rax);
+	movq %r13, (16 * 16 + 3 * 8)(%rax);
+
+	vmovdqu (%rdx), %xmm15;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+	vmovdqu (n * 16)(%rsi), xreg; \
+	vpxor (lreg), %xmm15, %xmm15; \
+	vpxor xreg, %xmm15, xreg;
+
+	movq (0 * 8)(%r8), %r10;
+	movq (1 * 8)(%r8), %r11;
+	movq (2 * 8)(%r8), %r12;
+	movq (3 * 8)(%r8), %r13;
+	OCB_INPUT(0, %r10, %xmm0);
+	vmovdqu %xmm0, (15 * 16)(%rax);
+	OCB_INPUT(1, %r11, %xmm14);
+	OCB_INPUT(2, %r12, %xmm13);
+	OCB_INPUT(3, %r13, %xmm12);
+	movq (4 * 8)(%r8), %r10;
+	movq (5 * 8)(%r8), %r11;
+	movq (6 * 8)(%r8), %r12;
+	movq (7 * 8)(%r8), %r13;
+	OCB_INPUT(4, %r10, %xmm11);
+	OCB_INPUT(5, %r11, %xmm10);
+	OCB_INPUT(6, %r12, %xmm9);
+	OCB_INPUT(7, %r13, %xmm8);
+	movq (8 * 8)(%r8), %r10;
+	movq (9 * 8)(%r8), %r11;
+	movq (10 * 8)(%r8), %r12;
+	movq (11 * 8)(%r8), %r13;
+	OCB_INPUT(8, %r10, %xmm7);
+	OCB_INPUT(9, %r11, %xmm6);
+	OCB_INPUT(10, %r12, %xmm5);
+	OCB_INPUT(11, %r13, %xmm4);
+	movq (12 * 8)(%r8), %r10;
+	movq (13 * 8)(%r8), %r11;
+	movq (14 * 8)(%r8), %r12;
+	movq (15 * 8)(%r8), %r13;
+	OCB_INPUT(12, %r10, %xmm3);
+	OCB_INPUT(13, %r11, %xmm2);
+	OCB_INPUT(14, %r12, %xmm1);
+	OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm15, (%rdx);
+
+	movq %rcx, %r10;
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX), %xmm15;
+	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpxor %xmm0, %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor %xmm14, %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call __camellia_enc_blk16;
+
+	vpxor %xmm7, %xmm6, %xmm6;
+	vpxor %xmm5, %xmm4, %xmm4;
+	vpxor %xmm3, %xmm2, %xmm2;
+	vpxor %xmm1, %xmm0, %xmm0;
+	vpxor %xmm15, %xmm14, %xmm14;
+	vpxor %xmm13, %xmm12, %xmm12;
+	vpxor %xmm11, %xmm10, %xmm10;
+	vpxor %xmm9, %xmm8, %xmm8;
+
+	vpxor %xmm6, %xmm4, %xmm4;
+	vpxor %xmm2, %xmm0, %xmm0;
+	vpxor %xmm14, %xmm12, %xmm12;
+	vpxor %xmm10, %xmm8, %xmm8;
+
+	vpxor %xmm4, %xmm0, %xmm0;
+	vpxor %xmm12, %xmm8, %xmm8;
+
+	vpxor %xmm0, %xmm8, %xmm0;
+	vpxor (%r10), %xmm0, %xmm0;
+	vmovdqu %xmm0, (%r10);
+
+	vzeroall;
+
+	movq (16 * 16 + 0 * 8)(%rax), %r10;
+	movq (16 * 16 + 1 * 8)(%rax), %r11;
+	movq (16 * 16 + 2 * 8)(%rax), %r12;
+	movq (16 * 16 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;)
+
 /*
  * IN:
  *  ab: 64-bit AB state
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S
index a3fa229d..26381df0 100644
--- a/cipher/camellia-aesni-avx2-amd64.S
+++ b/cipher/camellia-aesni-avx2-amd64.S
@@ -1,6 +1,6 @@
 /* camellia-avx2-aesni-amd64.S - AES-NI/AVX2 implementation of Camellia cipher
  *
- * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -1127,8 +1127,8 @@ ELF(.type _gcry_camellia_aesni_avx2_cbc_dec,@function;)
 _gcry_camellia_aesni_avx2_cbc_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
 	 *	%rcx: iv
 	 */
 
@@ -1199,8 +1199,8 @@ ELF(.type _gcry_camellia_aesni_avx2_cfb_dec,@function;)
 _gcry_camellia_aesni_avx2_cfb_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
 	 *	%rcx: iv
 	 */
 
@@ -1266,5 +1266,498 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 	ret;
 ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;)
 
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_enc
+ELF(.type _gcry_camellia_aesni_avx2_ocb_enc,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[32])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 32 + 4 * 8), %rsp;
+	andq $~63, %rsp;
+	movq %rsp, %rax;
+
+	movq %r10, (16 * 32 + 0 * 8)(%rax);
+	movq %r11, (16 * 32 + 1 * 8)(%rax);
+	movq %r12, (16 * 32 + 2 * 8)(%rax);
+	movq %r13, (16 * 32 + 3 * 8)(%rax);
+
+	vmovdqu (%rcx), %xmm14;
+	vmovdqu (%r8), %xmm13;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Checksum_i = Checksum_{i-1} xor P_i */
+	/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	vmovdqu (n * 32)(%rdx), yreg; \
+	vpxor (l0reg), %xmm14, %xmm15; \
+	vpxor (l1reg), %xmm15, %xmm14; \
+	vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+	vpxor yreg, %ymm13, %ymm13; \
+	vpxor yreg, %ymm15, yreg; \
+	vmovdqu %ymm15, (n * 32)(%rsi);
+
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %r11, %ymm0);
+	vmovdqu %ymm0, (15 * 32)(%rax);
+	OCB_INPUT(1, %r12, %r13, %ymm0);
+	vmovdqu %ymm0, (14 * 32)(%rax);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(2, %r10, %r11, %ymm0);
+	vmovdqu %ymm0, (13 * 32)(%rax);
+	OCB_INPUT(3, %r12, %r13, %ymm12);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %r11, %ymm11);
+	OCB_INPUT(5, %r12, %r13, %ymm10);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(6, %r10, %r11, %ymm9);
+	OCB_INPUT(7, %r12, %r13, %ymm8);
+	movq (16 * 8)(%r9), %r10;
+	movq (17 * 8)(%r9), %r11;
+	movq (18 * 8)(%r9), %r12;
+	movq (19 * 8)(%r9), %r13;
+	OCB_INPUT(8, %r10, %r11, %ymm7);
+	OCB_INPUT(9, %r12, %r13, %ymm6);
+	movq (20 * 8)(%r9), %r10;
+	movq (21 * 8)(%r9), %r11;
+	movq (22 * 8)(%r9), %r12;
+	movq (23 * 8)(%r9), %r13;
+	OCB_INPUT(10, %r10, %r11, %ymm5);
+	OCB_INPUT(11, %r12, %r13, %ymm4);
+	movq (24 * 8)(%r9), %r10;
+	movq (25 * 8)(%r9), %r11;
+	movq (26 * 8)(%r9), %r12;
+	movq (27 * 8)(%r9), %r13;
+	OCB_INPUT(12, %r10, %r11, %ymm3);
+	OCB_INPUT(13, %r12, %r13, %ymm2);
+	movq (28 * 8)(%r9), %r10;
+	movq (29 * 8)(%r9), %r11;
+	movq (30 * 8)(%r9), %r12;
+	movq (31 * 8)(%r9), %r13;
+	OCB_INPUT(14, %r10, %r11, %ymm1);
+	OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+	vextracti128 $1, %ymm13, %xmm15;
+	vmovdqu %xmm14, (%rcx);
+	vpxor %xmm13, %xmm15, %xmm15;
+	vmovdqu %xmm15, (%r8);
+
+	/* inpack16_pre: */
+	vpbroadcastq (key_table)(CTX), %ymm15;
+	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpxor %ymm0, %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor %ymm11, %ymm15, %ymm11;
+	vpxor %ymm12, %ymm15, %ymm12;
+	vpxor 13 * 32(%rax), %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call __camellia_enc_blk32;
+
+	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroall;
+
+	movq (16 * 32 + 0 * 8)(%rax), %r10;
+	movq (16 * 32 + 1 * 8)(%rax), %r11;
+	movq (16 * 32 + 2 * 8)(%rax), %r12;
+	movq (16 * 32 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx2_ocb_enc,.-_gcry_camellia_aesni_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_dec
+ELF(.type _gcry_camellia_aesni_avx2_ocb_dec,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[32])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 32 + 4 * 8), %rsp;
+	andq $~63, %rsp;
+	movq %rsp, %rax;
+
+	movq %r10, (16 * 32 + 0 * 8)(%rax);
+	movq %r11, (16 * 32 + 1 * 8)(%rax);
+	movq %r12, (16 * 32 + 2 * 8)(%rax);
+	movq %r13, (16 * 32 + 3 * 8)(%rax);
+
+	vmovdqu (%rcx), %xmm14;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	vmovdqu (n * 32)(%rdx), yreg; \
+	vpxor (l0reg), %xmm14, %xmm15; \
+	vpxor (l1reg), %xmm15, %xmm14; \
+	vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+	vpxor yreg, %ymm15, yreg; \
+	vmovdqu %ymm15, (n * 32)(%rsi);
+
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %r11, %ymm0);
+	vmovdqu %ymm0, (15 * 32)(%rax);
+	OCB_INPUT(1, %r12, %r13, %ymm0);
+	vmovdqu %ymm0, (14 * 32)(%rax);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(2, %r10, %r11, %ymm13);
+	OCB_INPUT(3, %r12, %r13, %ymm12);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %r11, %ymm11);
+	OCB_INPUT(5, %r12, %r13, %ymm10);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(6, %r10, %r11, %ymm9);
+	OCB_INPUT(7, %r12, %r13, %ymm8);
+	movq (16 * 8)(%r9), %r10;
+	movq (17 * 8)(%r9), %r11;
+	movq (18 * 8)(%r9), %r12;
+	movq (19 * 8)(%r9), %r13;
+	OCB_INPUT(8, %r10, %r11, %ymm7);
+	OCB_INPUT(9, %r12, %r13, %ymm6);
+	movq (20 * 8)(%r9), %r10;
+	movq (21 * 8)(%r9), %r11;
+	movq (22 * 8)(%r9), %r12;
+	movq (23 * 8)(%r9), %r13;
+	OCB_INPUT(10, %r10, %r11, %ymm5);
+	OCB_INPUT(11, %r12, %r13, %ymm4);
+	movq (24 * 8)(%r9), %r10;
+	movq (25 * 8)(%r9), %r11;
+	movq (26 * 8)(%r9), %r12;
+	movq (27 * 8)(%r9), %r13;
+	OCB_INPUT(12, %r10, %r11, %ymm3);
+	OCB_INPUT(13, %r12, %r13, %ymm2);
+	movq (28 * 8)(%r9), %r10;
+	movq (29 * 8)(%r9), %r11;
+	movq (30 * 8)(%r9), %r12;
+	movq (31 * 8)(%r9), %r13;
+	OCB_INPUT(14, %r10, %r11, %ymm1);
+	OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm14, (%rcx);
+
+	movq %r8, %r10;
+
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %r9d;
+	cmovel %r9d, %r8d; /* max */
+
+	/* inpack16_pre: */
+	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
+	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpxor %ymm0, %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor %ymm11, %ymm15, %ymm11;
+	vpxor %ymm12, %ymm15, %ymm12;
+	vpxor %ymm13, %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call __camellia_dec_blk32;
+
+	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+	vmovdqu %ymm7, (7 * 32)(%rax);
+	vmovdqu %ymm6, (6 * 32)(%rax);
+	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+
+	/* Checksum_i = Checksum_{i-1} xor P_i */
+
+	vpxor %ymm5, %ymm7, %ymm7;
+	vpxor %ymm4, %ymm6, %ymm6;
+	vpxor %ymm3, %ymm7, %ymm7;
+	vpxor %ymm2, %ymm6, %ymm6;
+	vpxor %ymm1, %ymm7, %ymm7;
+	vpxor %ymm0, %ymm6, %ymm6;
+	vpxor %ymm15, %ymm7, %ymm7;
+	vpxor %ymm14, %ymm6, %ymm6;
+	vpxor %ymm13, %ymm7, %ymm7;
+	vpxor %ymm12, %ymm6, %ymm6;
+	vpxor %ymm11, %ymm7, %ymm7;
+	vpxor %ymm10, %ymm6, %ymm6;
+	vpxor %ymm9, %ymm7, %ymm7;
+	vpxor %ymm8, %ymm6, %ymm6;
+	vpxor %ymm7, %ymm6, %ymm7;
+
+	vextracti128 $1, %ymm7, %xmm6;
+	vpxor %xmm6, %xmm7, %xmm7;
+	vpxor (%r10), %xmm7, %xmm7;
+	vmovdqu %xmm7, (%r10);
+
+	vmovdqu 7 * 32(%rax), %ymm7;
+	vmovdqu 6 * 32(%rax), %ymm6;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroall;
+
+	movq (16 * 32 + 0 * 8)(%rax), %r10;
+	movq (16 * 32 + 1 * 8)(%rax), %r11;
+	movq (16 * 32 + 2 * 8)(%rax), %r12;
+	movq (16 * 32 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx2_ocb_dec,.-_gcry_camellia_aesni_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_auth
+ELF(.type _gcry_camellia_aesni_avx2_ocb_auth,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_auth:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: abuf (32 blocks)
+	 *	%rdx: offset
+	 *	%rcx: checksum
+	 *	%r8 : L pointers (void *L[32])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 32 + 4 * 8), %rsp;
+	andq $~63, %rsp;
+	movq %rsp, %rax;
+
+	movq %r10, (16 * 32 + 0 * 8)(%rax);
+	movq %r11, (16 * 32 + 1 * 8)(%rax);
+	movq %r12, (16 * 32 + 2 * 8)(%rax);
+	movq %r13, (16 * 32 + 3 * 8)(%rax);
+
+	vmovdqu (%rdx), %xmm14;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	vmovdqu (n * 32)(%rsi), yreg; \
+	vpxor (l0reg), %xmm14, %xmm15; \
+	vpxor (l1reg), %xmm15, %xmm14; \
+	vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+	vpxor yreg, %ymm15, yreg;
+
+	movq (0 * 8)(%r8), %r10;
+	movq (1 * 8)(%r8), %r11;
+	movq (2 * 8)(%r8), %r12;
+	movq (3 * 8)(%r8), %r13;
+	OCB_INPUT(0, %r10, %r11, %ymm0);
+	vmovdqu %ymm0, (15 * 32)(%rax);
+	OCB_INPUT(1, %r12, %r13, %ymm0);
+	vmovdqu %ymm0, (14 * 32)(%rax);
+	movq (4 * 8)(%r8), %r10;
+	movq (5 * 8)(%r8), %r11;
+	movq (6 * 8)(%r8), %r12;
+	movq (7 * 8)(%r8), %r13;
+	OCB_INPUT(2, %r10, %r11, %ymm13);
+	OCB_INPUT(3, %r12, %r13, %ymm12);
+	movq (8 * 8)(%r8), %r10;
+	movq (9 * 8)(%r8), %r11;
+	movq (10 * 8)(%r8), %r12;
+	movq (11 * 8)(%r8), %r13;
+	OCB_INPUT(4, %r10, %r11, %ymm11);
+	OCB_INPUT(5, %r12, %r13, %ymm10);
+	movq (12 * 8)(%r8), %r10;
+	movq (13 * 8)(%r8), %r11;
+	movq (14 * 8)(%r8), %r12;
+	movq (15 * 8)(%r8), %r13;
+	OCB_INPUT(6, %r10, %r11, %ymm9);
+	OCB_INPUT(7, %r12, %r13, %ymm8);
+	movq (16 * 8)(%r8), %r10;
+	movq (17 * 8)(%r8), %r11;
+	movq (18 * 8)(%r8), %r12;
+	movq (19 * 8)(%r8), %r13;
+	OCB_INPUT(8, %r10, %r11, %ymm7);
+	OCB_INPUT(9, %r12, %r13, %ymm6);
+	movq (20 * 8)(%r8), %r10;
+	movq (21 * 8)(%r8), %r11;
+	movq (22 * 8)(%r8), %r12;
+	movq (23 * 8)(%r8), %r13;
+	OCB_INPUT(10, %r10, %r11, %ymm5);
+	OCB_INPUT(11, %r12, %r13, %ymm4);
+	movq (24 * 8)(%r8), %r10;
+	movq (25 * 8)(%r8), %r11;
+	movq (26 * 8)(%r8), %r12;
+	movq (27 * 8)(%r8), %r13;
+	OCB_INPUT(12, %r10, %r11, %ymm3);
+	OCB_INPUT(13, %r12, %r13, %ymm2);
+	movq (28 * 8)(%r8), %r10;
+	movq (29 * 8)(%r8), %r11;
+	movq (30 * 8)(%r8), %r12;
+	movq (31 * 8)(%r8), %r13;
+	OCB_INPUT(14, %r10, %r11, %ymm1);
+	OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm14, (%rdx);
+
+	movq %rcx, %r10;
+
+	/* inpack16_pre: */
+	vpbroadcastq (key_table)(CTX), %ymm15;
+	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpxor %ymm0, %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor %ymm11, %ymm15, %ymm11;
+	vpxor %ymm12, %ymm15, %ymm12;
+	vpxor %ymm13, %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call __camellia_enc_blk32;
+
+	vpxor %ymm7, %ymm6, %ymm6;
+	vpxor %ymm5, %ymm4, %ymm4;
+	vpxor %ymm3, %ymm2, %ymm2;
+	vpxor %ymm1, %ymm0, %ymm0;
+	vpxor %ymm15, %ymm14, %ymm14;
+	vpxor %ymm13, %ymm12, %ymm12;
+	vpxor %ymm11, %ymm10, %ymm10;
+	vpxor %ymm9, %ymm8, %ymm8;
+
+	vpxor %ymm6, %ymm4, %ymm4;
+	vpxor %ymm2, %ymm0, %ymm0;
+	vpxor %ymm14, %ymm12, %ymm12;
+	vpxor %ymm10, %ymm8, %ymm8;
+
+	vpxor %ymm4, %ymm0, %ymm0;
+	vpxor %ymm12, %ymm8, %ymm8;
+
+	vpxor %ymm0, %ymm8, %ymm0;
+
+	vextracti128 $1, %ymm0, %xmm1;
+	vpxor (%r10), %xmm0, %xmm0;
+	vpxor %xmm0, %xmm1, %xmm0;
+	vmovdqu %xmm0, (%r10);
+
+	vzeroall;
+
+	movq (16 * 32 + 0 * 8)(%rax), %r10;
+	movq (16 * 32 + 1 * 8)(%rax), %r11;
+	movq (16 * 32 + 2 * 8)(%rax), %r12;
+	movq (16 * 32 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx2_ocb_auth,.-_gcry_camellia_aesni_avx2_ocb_auth;)
+
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
 #endif /*__x86_64*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 50323218..197e1b39 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -63,6 +63,7 @@
 #include "cipher.h"
 #include "camellia.h"
 #include "bufhelp.h"
+#include "cipher-internal.h"
 #include "cipher-selftest.h"
 
 /* Helper macro to force alignment to 16 bytes.  */
@@ -135,6 +136,26 @@ extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
                                              const unsigned char *in,
                                              unsigned char *iv) ASM_FUNC_ABI;
 
+extern void _gcry_camellia_aesni_avx_ocb_enc(CAMELLIA_context *ctx,
+                                             unsigned char *out,
+                                             const unsigned char *in,
+                                             unsigned char *offset,
+                                             unsigned char *checksum,
+                                             const void *Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx,
+                                             unsigned char *out,
+                                             const unsigned char *in,
+                                             unsigned char *offset,
+                                             unsigned char *checksum,
+                                             const void *Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx,
+                                              const unsigned char *abuf,
+                                              unsigned char *offset,
+                                              unsigned char *checksum,
+                                              const void *Ls[16]) ASM_FUNC_ABI;
+
 extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
                                             const unsigned char *key,
                                             unsigned int keylen) ASM_FUNC_ABI;
@@ -158,6 +179,26 @@ extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
                                               unsigned char *out,
                                               const unsigned char *in,
                                               unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_enc(CAMELLIA_context *ctx,
+                                              unsigned char *out,
+                                              const unsigned char *in,
+                                              unsigned char *offset,
+                                              unsigned char *checksum,
+                                              const void *Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_dec(CAMELLIA_context *ctx,
+                                              unsigned char *out,
+                                              const unsigned char *in,
+                                              unsigned char *offset,
+                                              unsigned char *checksum,
+                                              const void *Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
+                                               const unsigned char *abuf,
+                                               unsigned char *offset,
+                                               unsigned char *checksum,
+                                               const void *Ls[32]) ASM_FUNC_ABI;
 #endif
 
 static const char *selftest(void);
@@ -563,6 +604,294 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
   _gcry_burn_stack(burn_stack_depth);
 }
 
+static inline const unsigned char *
+get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
+{
+  unsigned int ntz = _gcry_ctz64 (i);
+
+  if (ntz < OCB_L_TABLE_SIZE)
+    return c->u_mode.ocb.L[ntz];
+  else
+    return _gcry_cipher_ocb_get_l (c, l_tmp, i);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
+void
+_gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                          const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  CAMELLIA_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char l_tmp[CAMELLIA_BLOCK_SIZE];
+  const unsigned char *l;
+  int burn_stack_depth;
+  u64 blkn = c->u_mode.ocb.data_nblocks;
+
+  burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
+                               CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      int did_use_aesni_avx2 = 0;
+      const void *Ls[32];
+      int i;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          /* l_tmp will be used only every 65536-th block. */
+          for (i = 0; i < 32; i += 4)
+            {
+              Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+              Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+              Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+              Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+              blkn += 4;
+            }
+
+          if (encrypt)
+            _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+                                              c->u_ctr.ctr, Ls);
+          else
+            _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+                                              c->u_ctr.ctr, Ls);
+
+          nblocks -= 32;
+          outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx2 = 1;
+        }
+
+      if (did_use_aesni_avx2)
+        {
+          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
+                                      2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx2_burn_stack_depth)
+            burn_stack_depth = avx2_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      int did_use_aesni_avx = 0;
+      const void *Ls[16];
+      int i;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          /* l_tmp will be used only every 65536-th block. */
+          for (i = 0; i < 16; i += 4)
+            {
+              Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+              Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+              Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+              Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+              blkn += 4;
+            }
+
+          if (encrypt)
+            _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+                                             c->u_ctr.ctr, Ls);
+          else
+            _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+                                             c->u_ctr.ctr, Ls);
+
+          nblocks -= 16;
+          outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx = 1;
+        }
+
+      if (did_use_aesni_avx)
+        {
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                     2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+  if (encrypt)
+    {
+      for (; nblocks; nblocks--)
+        {
+          l = get_l(c, l_tmp, ++blkn);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          buf_xor_1 (c->u_iv.iv, l, CAMELLIA_BLOCK_SIZE);
+          buf_cpy (l_tmp, inbuf, CAMELLIA_BLOCK_SIZE);
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          buf_xor_1 (c->u_ctr.ctr, l_tmp, CAMELLIA_BLOCK_SIZE);
+          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+          buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+          Camellia_EncryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
+          buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+          buf_cpy (outbuf, l_tmp, CAMELLIA_BLOCK_SIZE);
+
+          inbuf += CAMELLIA_BLOCK_SIZE;
+          outbuf += CAMELLIA_BLOCK_SIZE;
+        }
+    }
+  else
+    {
+      for (; nblocks; nblocks--)
+        {
+          l = get_l(c, l_tmp, ++blkn);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          buf_xor_1 (c->u_iv.iv, l, CAMELLIA_BLOCK_SIZE);
+          buf_cpy (l_tmp, inbuf, CAMELLIA_BLOCK_SIZE);
+          /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+          buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+          Camellia_DecryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
+          buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          buf_xor_1 (c->u_ctr.ctr, l_tmp, CAMELLIA_BLOCK_SIZE);
+          buf_cpy (outbuf, l_tmp, CAMELLIA_BLOCK_SIZE);
+
+          inbuf += CAMELLIA_BLOCK_SIZE;
+          outbuf += CAMELLIA_BLOCK_SIZE;
+        }
+    }
+
+  c->u_mode.ocb.data_nblocks = blkn;
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+void
+_gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+                         size_t nblocks)
+{
+  CAMELLIA_context *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  unsigned char l_tmp[CAMELLIA_BLOCK_SIZE];
+  const unsigned char *l;
+  int burn_stack_depth;
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+  burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      int did_use_aesni_avx2 = 0;
+      const void *Ls[32];
+      int i;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+        {
+          /* l_tmp will be used only every 65536-th block. */
+          for (i = 0; i < 32; i += 4)
+            {
+              Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+              Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+              Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+              Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+              blkn += 4;
+            }
+
+          _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+                                             c->u_mode.ocb.aad_sum, Ls);
+
+          nblocks -= 32;
+          abuf += 32 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx2 = 1;
+        }
+
+      if (did_use_aesni_avx2)
+        {
+          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
+                                      2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx2_burn_stack_depth)
+            burn_stack_depth = avx2_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      int did_use_aesni_avx = 0;
+      const void *Ls[16];
+      int i;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          /* l_tmp will be used only every 65536-th block. */
+          for (i = 0; i < 16; i += 4)
+            {
+              Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+              Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+              Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+              Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+              blkn += 4;
+            }
+
+          _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+                                            c->u_mode.ocb.aad_sum, Ls);
+
+          nblocks -= 16;
+          abuf += 16 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx = 1;
+        }
+
+      if (did_use_aesni_avx)
+        {
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                     2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+  for (; nblocks; nblocks--)
+    {
+      l = get_l(c, l_tmp, ++blkn);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      buf_xor_1 (c->u_mode.ocb.aad_offset, l, CAMELLIA_BLOCK_SIZE);
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+      buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, CAMELLIA_BLOCK_SIZE);
+      Camellia_EncryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
+      buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, CAMELLIA_BLOCK_SIZE);
+
+      abuf += CAMELLIA_BLOCK_SIZE;
+    }
+
+  c->u_mode.ocb.aad_nblocks = blkn;
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
 /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
    encryption.  Returns NULL on success. */
 static const char*
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 7a29824c..2d2b0ade 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -535,6 +535,8 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
           h->bulk.cbc_dec = _gcry_camellia_cbc_dec;
           h->bulk.cfb_dec = _gcry_camellia_cfb_dec;
           h->bulk.ctr_enc = _gcry_camellia_ctr_enc;
+          h->bulk.ocb_crypt = _gcry_camellia_ocb_crypt;
+          h->bulk.ocb_auth  = _gcry_camellia_ocb_auth;
           break;
 #endif /*USE_CAMELLIA*/
 #ifdef USE_DES
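A closing note on a design point in the AVX2 decryption and authentication paths above: the OCB checksum is accumulated per 128-bit lane in wide ymm registers and only folded down to the single 128-bit checksum block at the end, using vextracti128 and vpxor. An intrinsics sketch of that final fold (illustrative only, not code from the patch):

```c
#include <immintrin.h>

/* Fold two 256-bit checksum accumulators (as in the ocb_dec path, which
 * accumulates into ymm6 and ymm7) into the running 128-bit OCB checksum:
 * merge the accumulators, then xor the high lane into the low lane. */
static __m128i fold_checksum (__m256i acc0, __m256i acc1, __m128i checksum)
{
  __m256i acc = _mm256_xor_si256 (acc0, acc1);       /* merge accumulators */
  __m128i lo  = _mm256_castsi256_si128 (acc);        /* low 128-bit lane */
  __m128i hi  = _mm256_extracti128_si256 (acc, 1);   /* high 128-bit lane */
  return _mm_xor_si128 (checksum, _mm_xor_si128 (lo, hi));
}
```

Deferring the fold keeps the hot loop free of cross-lane operations; only one vextracti128 is paid per 32-block call.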