path: root/cipher/serpent-sse2-amd64.S
author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2013-10-22 17:07:53 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2013-10-22 19:53:29 +0300
commit    c7efaa5fe0ee92e321a7b49d56752cc12eb75fe0 (patch)
tree      c090454520de01a26a0357dc257cede6639674c3 /cipher/serpent-sse2-amd64.S
parent    335d9bf7b035815750b63a3a8334d6ce44dc4449 (diff)
download  libgcrypt-c7efaa5fe0ee92e321a7b49d56752cc12eb75fe0.tar.gz
serpent-amd64: do not use GAS macros
* cipher/serpent-avx2-amd64.S: Remove use of GAS macros.
* cipher/serpent-sse2-amd64.S: Ditto.
* configure.ac [HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS]: Do not check for GAS macros.
--
This way we have better portability; for example, when compiling with clang on x86-64, the assembly implementations are now enabled and working.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
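The portability point in the message comes down to how the register aliases are defined: the old code relied on GAS `.set' symbol assignments (so configure.ac had to probe the assembler for GAS-macro support, which disqualified clang's integrated assembler), while the new code uses plain C preprocessor defines that are expanded before the assembler ever sees them. A minimal sketch of the two styles, with register names taken from this file -- an illustration only, not part of the patch:

	/* old: GAS-only symbol alias, required the GAS-macro configure check */
	.set RA0, %xmm0
	pxor RA0, RA0;

	/* new: cpp macro, expanded to %xmm0 by the preprocessor,
	 * so any cpp-driven assembler front end accepts it */
	#define RA0 %xmm0
	pxor RA0, RA0;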
Diffstat (limited to 'cipher/serpent-sse2-amd64.S')
-rw-r--r--  cipher/serpent-sse2-amd64.S  507
1 file changed, 218 insertions, 289 deletions
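Besides the alias change, the diff below drops the assembler-time `round' counter and the sbox_reg_rename helper: each ROUND-family macro now takes the round number and the post-S-box register ordering as explicit arguments, so the register rotation performed by the S-boxes is spelled out at every call site instead of being tracked through `.set' reassignment. A condensed sketch of the calling convention, taken from the hunks below:

	/* before: `round' and the register mapping are mutable GAS symbols,
	 * advanced as a side effect of each ROUND invocation */
	.set round, 0
	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);

	/* after: the round index and the renamed output registers are
	 * passed explicitly, so no assembler-time state is needed */
	ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
	       RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);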
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index a5cf3539..516126b3 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -35,42 +35,27 @@
#define CTX %rdi
/* vector registers */
-.set RA0, %xmm0
-.set RA1, %xmm1
-.set RA2, %xmm2
-.set RA3, %xmm3
-.set RA4, %xmm4
-
-.set RB0, %xmm5
-.set RB1, %xmm6
-.set RB2, %xmm7
-.set RB3, %xmm8
-.set RB4, %xmm9
-
-.set RNOT, %xmm10
-.set RTMP0, %xmm11
-.set RTMP1, %xmm12
-.set RTMP2, %xmm13
+#define RA0 %xmm0
+#define RA1 %xmm1
+#define RA2 %xmm2
+#define RA3 %xmm3
+#define RA4 %xmm4
+
+#define RB0 %xmm5
+#define RB1 %xmm6
+#define RB2 %xmm7
+#define RB3 %xmm8
+#define RB4 %xmm9
+
+#define RNOT %xmm10
+#define RTMP0 %xmm11
+#define RTMP1 %xmm12
+#define RTMP2 %xmm13
/**********************************************************************
helper macros
**********************************************************************/
-/* preprocessor macro for renaming vector registers using GAS macros */
-#define sbox_reg_rename(r0, r1, r2, r3, r4, \
- new_r0, new_r1, new_r2, new_r3, new_r4) \
- .set rename_reg0, new_r0; \
- .set rename_reg1, new_r1; \
- .set rename_reg2, new_r2; \
- .set rename_reg3, new_r3; \
- .set rename_reg4, new_r4; \
- \
- .set r0, rename_reg0; \
- .set r1, rename_reg1; \
- .set r2, rename_reg2; \
- .set r3, rename_reg3; \
- .set r4, rename_reg4;
-
/* vector 32-bit rotation to left */
#define vec_rol(reg, nleft, tmp) \
movdqa reg, tmp; \
@@ -147,9 +132,7 @@
pxor r4, r2; pxor RNOT, r4; \
por r1, r4; pxor r3, r1; \
pxor r4, r1; por r0, r3; \
- pxor r3, r1; pxor r3, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3);
+ pxor r3, r1; pxor r3, r4;
#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
pxor RNOT, r2; movdqa r1, r4; \
@@ -162,9 +145,7 @@
pxor r1, r2; pxor r0, r3; \
pxor r1, r3; \
pand r3, r2; \
- pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2);
+ pxor r2, r4;
#define SBOX1(r0, r1, r2, r3, r4) \
pxor RNOT, r0; pxor RNOT, r2; \
@@ -176,9 +157,7 @@
pand r4, r2; pxor r1, r0; \
pand r2, r1; \
pxor r0, r1; pand r2, r0; \
- pxor r4, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4);
+ pxor r4, r0;
#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
movdqa r1, r4; pxor r3, r1; \
@@ -191,9 +170,7 @@
pxor r1, r4; por r0, r1; \
pxor r0, r1; \
por r4, r1; \
- pxor r1, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1);
+ pxor r1, r3;
#define SBOX2(r0, r1, r2, r3, r4) \
movdqa r0, r4; pand r2, r0; \
@@ -203,9 +180,7 @@
movdqa r3, r1; por r4, r3; \
pxor r0, r3; pand r1, r0; \
pxor r0, r4; pxor r3, r1; \
- pxor r4, r1; pxor RNOT, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0);
+ pxor r4, r1; pxor RNOT, r4;
#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
pxor r3, r2; pxor r0, r3; \
@@ -217,9 +192,7 @@
por r0, r2; pxor RNOT, r3; \
pxor r3, r2; pxor r3, r0; \
pand r1, r0; pxor r4, r3; \
- pxor r0, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0);
+ pxor r0, r3;
#define SBOX3(r0, r1, r2, r3, r4) \
movdqa r0, r4; por r3, r0; \
@@ -231,9 +204,7 @@
pxor r2, r4; por r0, r1; \
pxor r2, r1; pxor r3, r0; \
movdqa r1, r2; por r3, r1; \
- pxor r0, r1; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0);
+ pxor r0, r1;
#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
movdqa r2, r4; pxor r1, r2; \
@@ -245,9 +216,7 @@
pxor r1, r3; pxor r0, r1; \
por r2, r1; pxor r3, r0; \
pxor r4, r1; \
- pxor r1, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4);
+ pxor r1, r0;
#define SBOX4(r0, r1, r2, r3, r4) \
pxor r3, r1; pxor RNOT, r3; \
@@ -259,9 +228,7 @@
pxor r0, r3; por r1, r4; \
pxor r0, r4; por r3, r0; \
pxor r2, r0; pand r3, r2; \
- pxor RNOT, r0; pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2);
+ pxor RNOT, r0; pxor r2, r4;
#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
movdqa r2, r4; pand r3, r2; \
@@ -274,9 +241,7 @@
pand r0, r2; pxor r0, r3; \
pxor r4, r2; \
por r3, r2; pxor r0, r3; \
- pxor r1, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1);
+ pxor r1, r2;
#define SBOX5(r0, r1, r2, r3, r4) \
pxor r1, r0; pxor r3, r1; \
@@ -288,9 +253,7 @@
pxor r2, r4; pxor r0, r2; \
pand r3, r0; pxor RNOT, r2; \
pxor r4, r0; por r3, r4; \
- pxor r4, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4);
+ pxor r4, r2;
#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
pxor RNOT, r1; movdqa r3, r4; \
@@ -302,9 +265,7 @@
pxor r3, r1; pxor r2, r4; \
pand r4, r3; pxor r1, r4; \
pxor r4, r3; pxor RNOT, r4; \
- pxor r0, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0);
+ pxor r0, r3;
#define SBOX6(r0, r1, r2, r3, r4) \
pxor RNOT, r2; movdqa r3, r4; \
@@ -316,9 +277,7 @@
pxor r2, r0; pxor r3, r4; \
pxor r0, r4; pxor RNOT, r3; \
pand r4, r2; \
- pxor r3, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3);
+ pxor r3, r2;
#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
pxor r2, r0; movdqa r2, r4; \
@@ -329,9 +288,7 @@
pxor r1, r4; pand r3, r1; \
pxor r0, r1; pxor r3, r0; \
por r2, r0; pxor r1, r3; \
- pxor r0, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0);
+ pxor r0, r4;
#define SBOX7(r0, r1, r2, r3, r4) \
movdqa r1, r4; por r2, r1; \
@@ -344,9 +301,7 @@
pxor r1, r2; pand r0, r1; \
pxor r4, r1; pxor RNOT, r2; \
por r0, r2; \
- pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2);
+ pxor r2, r4;
#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
movdqa r2, r4; pxor r0, r2; \
@@ -358,9 +313,7 @@
por r2, r0; pxor r1, r4; \
pxor r3, r0; pxor r4, r3; \
por r0, r4; pxor r2, r3; \
- pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2);
+ pxor r2, r4;
/* Apply SBOX number WHICH to the block. */
#define SBOX(which, r0, r1, r2, r3, r4) \
@@ -425,49 +378,51 @@
/* Apply a Serpent round to eight parallel blocks. This macro increments
`round'. */
-#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4); \
- LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4); \
- .set round, (round + 1);
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \
+ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);
/* Apply the last Serpent round to eight parallel blocks. This macro increments
`round'. */
-#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- .set round, (round + 1); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round + 1);
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));
/* Apply an inverse Serpent round to eight parallel blocks. This macro
increments `round'. */
-#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \
LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
/* Apply the first inverse Serpent round to eight parallel blocks. This macro
increments `round'. */
-#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1); \
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
.text
@@ -479,72 +434,82 @@ __serpent_enc_blk8:
* RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
* blocks
* output:
- * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel
* ciphertext blocks
*/
- /* record input vector names for __serpent_enc_blk8 */
- .set enc_in_a0, RA0
- .set enc_in_a1, RA1
- .set enc_in_a2, RA2
- .set enc_in_a3, RA3
- .set enc_in_b0, RB0
- .set enc_in_b1, RB1
- .set enc_in_b2, RB2
- .set enc_in_b3, RB3
-
pcmpeqd RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 0
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
- transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
-
- /* record output vector names for __serpent_enc_blk8 */
- .set enc_out_a0, RA0
- .set enc_out_a1, RA1
- .set enc_out_a2, RA2
- .set enc_out_a3, RA3
- .set enc_out_b0, RB0
- .set enc_out_b1, RB1
- .set enc_out_b2, RB2
- .set enc_out_b3, RB3
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+ transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
ret;
.size __serpent_enc_blk8,.-__serpent_enc_blk8;
@@ -561,69 +526,81 @@ __serpent_dec_blk8:
* blocks
*/
- /* record input vector names for __serpent_dec_blk8 */
- .set dec_in_a0, RA0
- .set dec_in_a1, RA1
- .set dec_in_a2, RA2
- .set dec_in_a3, RA3
- .set dec_in_b0, RB0
- .set dec_in_b1, RB1
- .set dec_in_b2, RB2
- .set dec_in_b3, RB3
-
pcmpeqd RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 32
- ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+ RA3, RA0, RA1, RA4, RA2,
+ RB0, RB1, RB2, RB3, RB4,
+ RB3, RB0, RB1, RB4, RB2);
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- /* record output vector names for __serpent_dec_blk8 */
- .set dec_out_a0, RA0
- .set dec_out_a1, RA1
- .set dec_out_a2, RA2
- .set dec_out_a3, RA3
- .set dec_out_b0, RB0
- .set dec_out_b1, RB1
- .set dec_out_b2, RB2
- .set dec_out_b3, RB3
-
ret;
.size __serpent_dec_blk8,.-__serpent_dec_blk8;
@@ -638,15 +615,6 @@ _gcry_serpent_sse2_ctr_enc:
* %rcx: iv (big endian, 128bit)
*/
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
/* load IV and byteswap */
movdqu (%rcx), RA0;
movdqa RA0, RTMP0;
@@ -729,42 +697,35 @@ _gcry_serpent_sse2_ctr_enc:
call __serpent_enc_blk8;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((0 * 16)(%rdx), RA4, RTMP0);
pxor_u((1 * 16)(%rdx), RA1, RTMP0);
pxor_u((2 * 16)(%rdx), RA2, RTMP0);
- pxor_u((3 * 16)(%rdx), RA3, RTMP0);
- pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB4, RTMP0);
pxor_u((5 * 16)(%rdx), RB1, RTMP0);
pxor_u((6 * 16)(%rdx), RB2, RTMP0);
- pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB0, RTMP0);
- movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA4, (0 * 16)(%rsi);
movdqu RA1, (1 * 16)(%rsi);
movdqu RA2, (2 * 16)(%rsi);
- movdqu RA3, (3 * 16)(%rsi);
- movdqu RB0, (4 * 16)(%rsi);
+ movdqu RA0, (3 * 16)(%rsi);
+ movdqu RB4, (4 * 16)(%rsi);
movdqu RB1, (5 * 16)(%rsi);
movdqu RB2, (6 * 16)(%rsi);
- movdqu RB3, (7 * 16)(%rsi);
+ movdqu RB0, (7 * 16)(%rsi);
/* clear the used registers */
pxor RA0, RA0;
pxor RA1, RA1;
pxor RA2, RA2;
pxor RA3, RA3;
+ pxor RA4, RA4;
pxor RB0, RB0;
pxor RB1, RB1;
pxor RB2, RB2;
pxor RB3, RB3;
+ pxor RB4, RB4;
pxor RTMP0, RTMP0;
pxor RTMP1, RTMP1;
pxor RTMP2, RTMP2;
@@ -784,15 +745,6 @@ _gcry_serpent_sse2_cbc_dec:
* %rcx: iv
*/
- .set RA0, dec_in_a0
- .set RA1, dec_in_a1
- .set RA2, dec_in_a2
- .set RA3, dec_in_a3
- .set RB0, dec_in_b0
- .set RB1, dec_in_b1
- .set RB2, dec_in_b2
- .set RB3, dec_in_b3
-
movdqu (0 * 16)(%rdx), RA0;
movdqu (1 * 16)(%rdx), RA1;
movdqu (2 * 16)(%rdx), RA2;
@@ -804,15 +756,6 @@ _gcry_serpent_sse2_cbc_dec:
call __serpent_dec_blk8;
- .set RA0, dec_out_a0
- .set RA1, dec_out_a1
- .set RA2, dec_out_a2
- .set RA3, dec_out_a3
- .set RB0, dec_out_b0
- .set RB1, dec_out_b1
- .set RB2, dec_out_b2
- .set RB3, dec_out_b3
-
movdqu (7 * 16)(%rdx), RNOT;
pxor_u((%rcx), RA0, RTMP0);
pxor_u((0 * 16)(%rdx), RA1, RTMP0);
@@ -838,10 +781,12 @@ _gcry_serpent_sse2_cbc_dec:
pxor RA1, RA1;
pxor RA2, RA2;
pxor RA3, RA3;
+ pxor RA4, RA4;
pxor RB0, RB0;
pxor RB1, RB1;
pxor RB2, RB2;
pxor RB3, RB3;
+ pxor RB4, RB4;
pxor RTMP0, RTMP0;
pxor RTMP1, RTMP1;
pxor RTMP2, RTMP2;
@@ -861,15 +806,6 @@ _gcry_serpent_sse2_cfb_dec:
* %rcx: iv
*/
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
/* Load input */
movdqu (%rcx), RA0;
movdqu 0 * 16(%rdx), RA1;
@@ -886,42 +822,35 @@ _gcry_serpent_sse2_cfb_dec:
call __serpent_enc_blk8;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((0 * 16)(%rdx), RA4, RTMP0);
pxor_u((1 * 16)(%rdx), RA1, RTMP0);
pxor_u((2 * 16)(%rdx), RA2, RTMP0);
- pxor_u((3 * 16)(%rdx), RA3, RTMP0);
- pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB4, RTMP0);
pxor_u((5 * 16)(%rdx), RB1, RTMP0);
pxor_u((6 * 16)(%rdx), RB2, RTMP0);
- pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB0, RTMP0);
- movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA4, (0 * 16)(%rsi);
movdqu RA1, (1 * 16)(%rsi);
movdqu RA2, (2 * 16)(%rsi);
- movdqu RA3, (3 * 16)(%rsi);
- movdqu RB0, (4 * 16)(%rsi);
+ movdqu RA0, (3 * 16)(%rsi);
+ movdqu RB4, (4 * 16)(%rsi);
movdqu RB1, (5 * 16)(%rsi);
movdqu RB2, (6 * 16)(%rsi);
- movdqu RB3, (7 * 16)(%rsi);
+ movdqu RB0, (7 * 16)(%rsi);
/* clear the used registers */
pxor RA0, RA0;
pxor RA1, RA1;
pxor RA2, RA2;
pxor RA3, RA3;
+ pxor RA4, RA4;
pxor RB0, RB0;
pxor RB1, RB1;
pxor RB2, RB2;
pxor RB3, RB3;
+ pxor RB4, RB4;
pxor RTMP0, RTMP0;
pxor RTMP1, RTMP1;
pxor RTMP2, RTMP2;