diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-10-22 17:07:53 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2013-10-22 19:53:29 +0300 |
commit | c7efaa5fe0ee92e321a7b49d56752cc12eb75fe0 (patch) | |
tree | c090454520de01a26a0357dc257cede6639674c3 /cipher/serpent-sse2-amd64.S | |
parent | 335d9bf7b035815750b63a3a8334d6ce44dc4449 (diff) | |
download | libgcrypt-c7efaa5fe0ee92e321a7b49d56752cc12eb75fe0.tar.gz |
serpent-amd64: do not use GAS macros
* cipher/serpent-avx2-amd64.S: Remove use of GAS macros.
* cipher/serpent-sse2-amd64.S: Ditto.
* configure.ac [HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS]: Do not check
for GAS macros.
--
This way we have better portability; for example, when compiling with clang
on x86-64, the assembly implementations are now enabled and working.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/serpent-sse2-amd64.S')
-rw-r--r-- | cipher/serpent-sse2-amd64.S | 507 |
1 file changed, 218 insertions, 289 deletions
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index a5cf3539..516126b3 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -35,42 +35,27 @@ #define CTX %rdi /* vector registers */ -.set RA0, %xmm0 -.set RA1, %xmm1 -.set RA2, %xmm2 -.set RA3, %xmm3 -.set RA4, %xmm4 - -.set RB0, %xmm5 -.set RB1, %xmm6 -.set RB2, %xmm7 -.set RB3, %xmm8 -.set RB4, %xmm9 - -.set RNOT, %xmm10 -.set RTMP0, %xmm11 -.set RTMP1, %xmm12 -.set RTMP2, %xmm13 +#define RA0 %xmm0 +#define RA1 %xmm1 +#define RA2 %xmm2 +#define RA3 %xmm3 +#define RA4 %xmm4 + +#define RB0 %xmm5 +#define RB1 %xmm6 +#define RB2 %xmm7 +#define RB3 %xmm8 +#define RB4 %xmm9 + +#define RNOT %xmm10 +#define RTMP0 %xmm11 +#define RTMP1 %xmm12 +#define RTMP2 %xmm13 /********************************************************************** helper macros **********************************************************************/ -/* preprocessor macro for renaming vector registers using GAS macros */ -#define sbox_reg_rename(r0, r1, r2, r3, r4, \ - new_r0, new_r1, new_r2, new_r3, new_r4) \ - .set rename_reg0, new_r0; \ - .set rename_reg1, new_r1; \ - .set rename_reg2, new_r2; \ - .set rename_reg3, new_r3; \ - .set rename_reg4, new_r4; \ - \ - .set r0, rename_reg0; \ - .set r1, rename_reg1; \ - .set r2, rename_reg2; \ - .set r3, rename_reg3; \ - .set r4, rename_reg4; - /* vector 32-bit rotation to left */ #define vec_rol(reg, nleft, tmp) \ movdqa reg, tmp; \ @@ -147,9 +132,7 @@ pxor r4, r2; pxor RNOT, r4; \ por r1, r4; pxor r3, r1; \ pxor r4, r1; por r0, r3; \ - pxor r3, r1; pxor r3, r4; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3); + pxor r3, r1; pxor r3, r4; #define SBOX0_INVERSE(r0, r1, r2, r3, r4) \ pxor RNOT, r2; movdqa r1, r4; \ @@ -162,9 +145,7 @@ pxor r1, r2; pxor r0, r3; \ pxor r1, r3; \ pand r3, r2; \ - pxor r2, r4; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2); + pxor r2, r4; #define SBOX1(r0, r1, r2, r3, r4) \ pxor RNOT, r0; pxor RNOT, r2; \ @@ -176,9 
+157,7 @@ pand r4, r2; pxor r1, r0; \ pand r2, r1; \ pxor r0, r1; pand r2, r0; \ - pxor r4, r0; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4); + pxor r4, r0; #define SBOX1_INVERSE(r0, r1, r2, r3, r4) \ movdqa r1, r4; pxor r3, r1; \ @@ -191,9 +170,7 @@ pxor r1, r4; por r0, r1; \ pxor r0, r1; \ por r4, r1; \ - pxor r1, r3; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1); + pxor r1, r3; #define SBOX2(r0, r1, r2, r3, r4) \ movdqa r0, r4; pand r2, r0; \ @@ -203,9 +180,7 @@ movdqa r3, r1; por r4, r3; \ pxor r0, r3; pand r1, r0; \ pxor r0, r4; pxor r3, r1; \ - pxor r4, r1; pxor RNOT, r4; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0); + pxor r4, r1; pxor RNOT, r4; #define SBOX2_INVERSE(r0, r1, r2, r3, r4) \ pxor r3, r2; pxor r0, r3; \ @@ -217,9 +192,7 @@ por r0, r2; pxor RNOT, r3; \ pxor r3, r2; pxor r3, r0; \ pand r1, r0; pxor r4, r3; \ - pxor r0, r3; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0); + pxor r0, r3; #define SBOX3(r0, r1, r2, r3, r4) \ movdqa r0, r4; por r3, r0; \ @@ -231,9 +204,7 @@ pxor r2, r4; por r0, r1; \ pxor r2, r1; pxor r3, r0; \ movdqa r1, r2; por r3, r1; \ - pxor r0, r1; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0); + pxor r0, r1; #define SBOX3_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pxor r1, r2; \ @@ -245,9 +216,7 @@ pxor r1, r3; pxor r0, r1; \ por r2, r1; pxor r3, r0; \ pxor r4, r1; \ - pxor r1, r0; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4); + pxor r1, r0; #define SBOX4(r0, r1, r2, r3, r4) \ pxor r3, r1; pxor RNOT, r3; \ @@ -259,9 +228,7 @@ pxor r0, r3; por r1, r4; \ pxor r0, r4; por r3, r0; \ pxor r2, r0; pand r3, r2; \ - pxor RNOT, r0; pxor r2, r4; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2); + pxor RNOT, r0; pxor r2, r4; #define SBOX4_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pand r3, r2; \ @@ -274,9 +241,7 @@ pand r0, r2; pxor r0, r3; \ pxor r4, r2; \ por r3, r2; pxor r0, r3; \ - pxor r1, r2; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1); + pxor r1, r2; 
#define SBOX5(r0, r1, r2, r3, r4) \ pxor r1, r0; pxor r3, r1; \ @@ -288,9 +253,7 @@ pxor r2, r4; pxor r0, r2; \ pand r3, r0; pxor RNOT, r2; \ pxor r4, r0; por r3, r4; \ - pxor r4, r2; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4); + pxor r4, r2; #define SBOX5_INVERSE(r0, r1, r2, r3, r4) \ pxor RNOT, r1; movdqa r3, r4; \ @@ -302,9 +265,7 @@ pxor r3, r1; pxor r2, r4; \ pand r4, r3; pxor r1, r4; \ pxor r4, r3; pxor RNOT, r4; \ - pxor r0, r3; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0); + pxor r0, r3; #define SBOX6(r0, r1, r2, r3, r4) \ pxor RNOT, r2; movdqa r3, r4; \ @@ -316,9 +277,7 @@ pxor r2, r0; pxor r3, r4; \ pxor r0, r4; pxor RNOT, r3; \ pand r4, r2; \ - pxor r3, r2; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3); + pxor r3, r2; #define SBOX6_INVERSE(r0, r1, r2, r3, r4) \ pxor r2, r0; movdqa r2, r4; \ @@ -329,9 +288,7 @@ pxor r1, r4; pand r3, r1; \ pxor r0, r1; pxor r3, r0; \ por r2, r0; pxor r1, r3; \ - pxor r0, r4; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0); + pxor r0, r4; #define SBOX7(r0, r1, r2, r3, r4) \ movdqa r1, r4; por r2, r1; \ @@ -344,9 +301,7 @@ pxor r1, r2; pand r0, r1; \ pxor r4, r1; pxor RNOT, r2; \ por r0, r2; \ - pxor r2, r4; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2); + pxor r2, r4; #define SBOX7_INVERSE(r0, r1, r2, r3, r4) \ movdqa r2, r4; pxor r0, r2; \ @@ -358,9 +313,7 @@ por r2, r0; pxor r1, r4; \ pxor r3, r0; pxor r4, r3; \ por r0, r4; pxor r2, r3; \ - pxor r2, r4; \ - \ - sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2); + pxor r2, r4; /* Apply SBOX number WHICH to to the block. */ #define SBOX(which, r0, r1, r2, r3, r4) \ @@ -425,49 +378,51 @@ /* Apply a Serpent round to eight parallel blocks. This macro increments `round'. 
*/ -#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ - BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ - SBOX (which, a0, a1, a2, a3, a4); \ - BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ - SBOX (which, b0, b1, b2, b3, b4); \ - LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4); \ - LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4); \ - .set round, (round + 1); +#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ + b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + SBOX (which, b0, b1, b2, b3, b4); \ + LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \ + LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4); /* Apply the last Serpent round to eight parallel blocks. This macro increments `round'. */ -#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ - BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ - SBOX (which, a0, a1, a2, a3, a4); \ - BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ - SBOX (which, b0, b1, b2, b3, b4); \ - .set round, (round + 1); \ - BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ - BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ - .set round, (round + 1); +#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \ + b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + SBOX (which, b0, b1, b2, b3, b4); \ + BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \ + BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1)); /* Apply an inverse Serpent round to eight parallel blocks. This macro increments `round'. 
*/ -#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ +#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \ + na0, na1, na2, na3, na4, \ + b0, b1, b2, b3, b4, \ + nb0, nb1, nb2, nb3, nb4) \ LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \ LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ - BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ - BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ - .set round, (round - 1); + BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); /* Apply the first inverse Serpent round to eight parallel blocks. This macro increments `round'. */ -#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ - BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ - BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ - .set round, (round - 1); \ +#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \ + na0, na1, na2, na3, na4, \ + b0, b1, b2, b3, b4, \ + nb0, nb1, nb2, nb3, nb4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \ SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ - BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \ SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ - BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ - .set round, (round - 1); + BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); .text @@ -479,72 +434,82 @@ __serpent_enc_blk8: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks * output: - * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel * ciphertext blocks */ - /* record input vector names for __serpent_enc_blk8 */ - .set enc_in_a0, RA0 - .set enc_in_a1, RA1 - .set enc_in_a2, RA2 - .set enc_in_a3, RA3 - .set enc_in_b0, RB0 - .set enc_in_b1, RB1 - .set enc_in_b2, RB2 - .set 
enc_in_b3, RB3 - pcmpeqd RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - .set round, 0 - ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (5, 
RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - - ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - - transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); - transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - - /* record output vector names for __serpent_enc_blk8 */ - .set enc_out_a0, RA0 - .set enc_out_a1, RA1 - .set enc_out_a2, RA2 - .set enc_out_a3, RA3 - .set enc_out_b0, RB0 - .set enc_out_b1, RB1 - .set enc_out_b2, RB2 - .set enc_out_b3, RB3 + ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, + RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); + ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, + RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); + ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, + RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); + ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, + RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); + ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, + RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); + ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, + RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); + ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, + RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); + ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, + RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); + ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0, + RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0); + ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0, + RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0); + ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2, + RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2); + ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4, + RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4); + ROUND (12, 4, RA3, RA1, RA0, RA2, 
RA4, RA1, RA4, RA3, RA2, RA0, + RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0); + ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0, + RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0); + ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3, + RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3); + ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0, + RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0); + ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4, + RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4); + ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4, + RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4); + ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2, + RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2); + ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3, + RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3); + ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4, + RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4); + ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4, + RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4); + ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0, + RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0); + ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4, + RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4); + ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3, + RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3); + ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3, + RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3); + ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2, + RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2); + ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0, + RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0); + ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3, + RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3); + ROUND (29, 
5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3, + RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3); + ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4, + RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4); + ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3, + RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3); + + transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); + transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); ret; .size __serpent_enc_blk8,.-__serpent_enc_blk8; @@ -561,69 +526,81 @@ __serpent_dec_blk8: * blocks */ - /* record input vector names for __serpent_dec_blk8 */ - .set dec_in_a0, RA0 - .set dec_in_a1, RA1 - .set dec_in_a2, RA2 - .set dec_in_a3, RA3 - .set dec_in_b0, RB0 - .set dec_in_b1, RB1 - .set dec_in_b2, RB2 - .set dec_in_b3, RB3 - pcmpeqd RNOT, RNOT; transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - .set round, 32 - ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - - ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, 
RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); - ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4, + RA3, RA0, RA1, RA4, RA2, + RB0, RB1, RB2, RB3, RB4, + RB3, RB0, RB1, RB4, RB2); + ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3, + RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3); + ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0, + RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0); + ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3, + RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3); + ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3, + RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3); + ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4, + RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, 
RB4); + ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3, + RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3); + ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1, + RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1); + ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2, + RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2); + ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0, + RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0); + ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4, + RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4); + ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0, + RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0); + ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0, + RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0); + ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1, + RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1); + ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0, + RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0); + ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3, + RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3); + ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2, + RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2); + ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4, + RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4); + ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1, + RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1); + ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4, + RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4); + ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4, + RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4); + ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3, + RB3, RB2, 
RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3); + ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4, + RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4); + ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0, + RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0); + ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2, + RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2); + ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1, + RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1); + ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3, + RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3); + ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1, + RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1); + ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1, + RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1); + ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0, + RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0); + ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1, + RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1); + ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4, + RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4); transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - /* record output vector names for __serpent_dec_blk8 */ - .set dec_out_a0, RA0 - .set dec_out_a1, RA1 - .set dec_out_a2, RA2 - .set dec_out_a3, RA3 - .set dec_out_b0, RB0 - .set dec_out_b1, RB1 - .set dec_out_b2, RB2 - .set dec_out_b3, RB3 - ret; .size __serpent_dec_blk8,.-__serpent_dec_blk8; @@ -638,15 +615,6 @@ _gcry_serpent_sse2_ctr_enc: * %rcx: iv (big endian, 128bit) */ - .set RA0, enc_in_a0 - .set RA1, enc_in_a1 - .set RA2, enc_in_a2 - .set RA3, enc_in_a3 - .set RB0, enc_in_b0 - .set RB1, enc_in_b1 - .set RB2, enc_in_b2 - .set RB3, enc_in_b3 - /* load IV and byteswap */ 
movdqu (%rcx), RA0; movdqa RA0, RTMP0; @@ -729,42 +697,35 @@ _gcry_serpent_sse2_ctr_enc: call __serpent_enc_blk8; - .set RA0, enc_out_a0 - .set RA1, enc_out_a1 - .set RA2, enc_out_a2 - .set RA3, enc_out_a3 - .set RB0, enc_out_b0 - .set RB1, enc_out_b1 - .set RB2, enc_out_b2 - .set RB3, enc_out_b3 - - pxor_u((0 * 16)(%rdx), RA0, RTMP0); + pxor_u((0 * 16)(%rdx), RA4, RTMP0); pxor_u((1 * 16)(%rdx), RA1, RTMP0); pxor_u((2 * 16)(%rdx), RA2, RTMP0); - pxor_u((3 * 16)(%rdx), RA3, RTMP0); - pxor_u((4 * 16)(%rdx), RB0, RTMP0); + pxor_u((3 * 16)(%rdx), RA0, RTMP0); + pxor_u((4 * 16)(%rdx), RB4, RTMP0); pxor_u((5 * 16)(%rdx), RB1, RTMP0); pxor_u((6 * 16)(%rdx), RB2, RTMP0); - pxor_u((7 * 16)(%rdx), RB3, RTMP0); + pxor_u((7 * 16)(%rdx), RB0, RTMP0); - movdqu RA0, (0 * 16)(%rsi); + movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); - movdqu RA3, (3 * 16)(%rsi); - movdqu RB0, (4 * 16)(%rsi); + movdqu RA0, (3 * 16)(%rsi); + movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); - movdqu RB3, (7 * 16)(%rsi); + movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; + pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; + pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; @@ -784,15 +745,6 @@ _gcry_serpent_sse2_cbc_dec: * %rcx: iv */ - .set RA0, dec_in_a0 - .set RA1, dec_in_a1 - .set RA2, dec_in_a2 - .set RA3, dec_in_a3 - .set RB0, dec_in_b0 - .set RB1, dec_in_b1 - .set RB2, dec_in_b2 - .set RB3, dec_in_b3 - movdqu (0 * 16)(%rdx), RA0; movdqu (1 * 16)(%rdx), RA1; movdqu (2 * 16)(%rdx), RA2; @@ -804,15 +756,6 @@ _gcry_serpent_sse2_cbc_dec: call __serpent_dec_blk8; - .set RA0, dec_out_a0 - .set RA1, dec_out_a1 - .set RA2, dec_out_a2 - .set RA3, dec_out_a3 - .set RB0, dec_out_b0 - .set RB1, dec_out_b1 - .set RB2, dec_out_b2 - .set RB3, dec_out_b3 - movdqu (7 * 16)(%rdx), RNOT; pxor_u((%rcx), RA0, RTMP0); pxor_u((0 * 
16)(%rdx), RA1, RTMP0); @@ -838,10 +781,12 @@ _gcry_serpent_sse2_cbc_dec: pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; + pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; + pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; @@ -861,15 +806,6 @@ _gcry_serpent_sse2_cfb_dec: * %rcx: iv */ - .set RA0, enc_in_a0 - .set RA1, enc_in_a1 - .set RA2, enc_in_a2 - .set RA3, enc_in_a3 - .set RB0, enc_in_b0 - .set RB1, enc_in_b1 - .set RB2, enc_in_b2 - .set RB3, enc_in_b3 - /* Load input */ movdqu (%rcx), RA0; movdqu 0 * 16(%rdx), RA1; @@ -886,42 +822,35 @@ _gcry_serpent_sse2_cfb_dec: call __serpent_enc_blk8; - .set RA0, enc_out_a0 - .set RA1, enc_out_a1 - .set RA2, enc_out_a2 - .set RA3, enc_out_a3 - .set RB0, enc_out_b0 - .set RB1, enc_out_b1 - .set RB2, enc_out_b2 - .set RB3, enc_out_b3 - - pxor_u((0 * 16)(%rdx), RA0, RTMP0); + pxor_u((0 * 16)(%rdx), RA4, RTMP0); pxor_u((1 * 16)(%rdx), RA1, RTMP0); pxor_u((2 * 16)(%rdx), RA2, RTMP0); - pxor_u((3 * 16)(%rdx), RA3, RTMP0); - pxor_u((4 * 16)(%rdx), RB0, RTMP0); + pxor_u((3 * 16)(%rdx), RA0, RTMP0); + pxor_u((4 * 16)(%rdx), RB4, RTMP0); pxor_u((5 * 16)(%rdx), RB1, RTMP0); pxor_u((6 * 16)(%rdx), RB2, RTMP0); - pxor_u((7 * 16)(%rdx), RB3, RTMP0); + pxor_u((7 * 16)(%rdx), RB0, RTMP0); - movdqu RA0, (0 * 16)(%rsi); + movdqu RA4, (0 * 16)(%rsi); movdqu RA1, (1 * 16)(%rsi); movdqu RA2, (2 * 16)(%rsi); - movdqu RA3, (3 * 16)(%rsi); - movdqu RB0, (4 * 16)(%rsi); + movdqu RA0, (3 * 16)(%rsi); + movdqu RB4, (4 * 16)(%rsi); movdqu RB1, (5 * 16)(%rsi); movdqu RB2, (6 * 16)(%rsi); - movdqu RB3, (7 * 16)(%rsi); + movdqu RB0, (7 * 16)(%rsi); /* clear the used registers */ pxor RA0, RA0; pxor RA1, RA1; pxor RA2, RA2; pxor RA3, RA3; + pxor RA4, RA4; pxor RB0, RB0; pxor RB1, RB1; pxor RB2, RB2; pxor RB3, RB3; + pxor RB4, RB4; pxor RTMP0, RTMP0; pxor RTMP1, RTMP1; pxor RTMP2, RTMP2; |