summaryrefslogtreecommitdiff
path: root/mpi/longlong.h
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2013-07-03 11:14:56 +0300
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2013-07-10 21:53:25 +0300
commit71dda4507053379433dc8b0fc6462c15de7299df (patch)
treee656842061f5e946778f54c20c40d7814960351d /mpi/longlong.h
parent6540b84a6e9113813e7e49e3ad2024d4a0073300 (diff)
downloadlibgcrypt-71dda4507053379433dc8b0fc6462c15de7299df.tar.gz
Tweak ARM inline assembly for mpi
mpi/longlong.h [__arm__]: Enable inline assembly if __thumb2__ is defined. [__arm__]: Use __ARM_ARCH when defined. [__arm__] [__ARM_ARCH >= 5] (count_leading_zeros): New. -- Current ARM Linux distributions use EABI that enables thumb2, and therefore inline assembly is disabled (because of the !defined(__thumb__) selector). However thumb2 allows the use of assembly instructions that longlong.h contains for ARM. So this patch enables inline assembly for ARM when __thumb2__ is defined in addition to __thumb__. Patch also adds optimization for count_leading_zeros() macro for ARM. Results on Cortex-A8, 1 GHz: === Before: Algorithm generate 100*sign 100*verify ------------------------------------------------ RSA 1024 bit 750ms 2780ms 110ms RSA 2048 bit 14280ms 17250ms 300ms RSA 3072 bit 38630ms 51300ms 650ms RSA 4096 bit 60940ms 111430ms 1000ms jussi@cubie:~/libgcrypt$ tests/benchmark dsa Algorithm generate 100*sign 100*verify ------------------------------------------------ DSA 1024/160 - 1410ms 1680ms DSA 2048/224 - 6100ms 7390ms DSA 3072/256 - 14350ms 17120ms jussi@cubie:~/libgcrypt$ tests/benchmark ecc Algorithm generate 100*sign 100*verify ------------------------------------------------ ECDSA 192 bit 90ms 2160ms 3940ms ECDSA 224 bit 110ms 2810ms 5400ms ECDSA 256 bit 150ms 3570ms 6970ms ECDSA 384 bit 340ms 8320ms 16420ms ECDSA 521 bit 850ms 19760ms 38480ms After: jussi@cubie:~/libgcrypt$ tests/benchmark rsa Algorithm generate 100*sign 100*verify ------------------------------------------------ RSA 1024 bit 590ms 2230ms 80ms RSA 2048 bit 2320ms 13090ms 240ms RSA 3072 bit 60580ms 38420ms 460ms RSA 4096 bit 115130ms 82250ms 750ms jussi@cubie:~/libgcrypt$ tests/benchmark dsa Algorithm generate 100*sign 100*verify ------------------------------------------------ DSA 1024/160 - 1070ms 1290ms DSA 2048/224 - 4500ms 5550ms DSA 3072/256 - 10280ms 12200ms jussi@cubie:~/libgcrypt$ tests/benchmark ecc Algorithm generate 100*sign 100*verify ------------------------------------------------ 
ECDSA 192 bit 70ms 1900ms 3560ms ECDSA 224 bit 100ms 2490ms 4750ms ECDSA 256 bit 120ms 3140ms 5920ms ECDSA 384 bit 270ms 6990ms 13790ms ECDSA 521 bit 680ms 17080ms 33490ms Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'mpi/longlong.h')
-rw-r--r--mpi/longlong.h20
1 files changed, 16 insertions, 4 deletions
diff --git a/mpi/longlong.h b/mpi/longlong.h
index 5dba7931..bb34fd7b 100644
--- a/mpi/longlong.h
+++ b/mpi/longlong.h
@@ -184,7 +184,8 @@ extern UDItype __udiv_qrnnd ();
/***************************************
************** ARM ******************
***************************************/
-#if defined (__arm__) && W_TYPE_SIZE == 32 && !defined (__thumb__)
+#if defined (__arm__) && W_TYPE_SIZE == 32 && \
+ (!defined (__thumb__) || defined (__thumb2__))
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
__asm__ ("adds %1, %4, %5\n" \
"adc %0, %2, %3" \
@@ -203,7 +204,9 @@ extern UDItype __udiv_qrnnd ();
"rI" ((USItype)(bh)), \
"r" ((USItype)(al)), \
"rI" ((USItype)(bl)))
-#if defined __ARM_ARCH_2__ || defined __ARM_ARCH_3__
+/* The __ARM_ARCH define is provided by gcc 4.8 */
+#if (defined __ARM_ARCH && __ARM_ARCH <= 3) || \
+ defined __ARM_ARCH_2__ || defined __ARM_ARCH_3__
#define umul_ppmm(xh, xl, a, b) \
__asm__ ("%@ Inlined umul_ppmm\n" \
"mov %|r0, %2, lsr #16 @ AAAA\n" \
@@ -223,7 +226,7 @@ extern UDItype __udiv_qrnnd ();
: "r" ((USItype)(a)), \
"r" ((USItype)(b)) \
: "r0", "r1", "r2")
-#else
+#else /* __ARM_ARCH >= 4 */
#define umul_ppmm(xh, xl, a, b) \
__asm__ ("%@ Inlined umul_ppmm\n" \
"umull %r1, %r0, %r2, %r3" \
@@ -232,9 +235,18 @@ extern UDItype __udiv_qrnnd ();
: "r" ((USItype)(a)), \
"r" ((USItype)(b)) \
: "r0", "r1")
-#endif
+#endif /* __ARM_ARCH >= 4 */
#define UMUL_TIME 20
#define UDIV_TIME 100
+/* count_leading_zeros (count, x): store in COUNT the result of the CLZ
+   instruction applied to X (converted to USItype).  Only defined when
+   the target is taken to be ARMv5 or newer: either __ARM_ARCH (provided
+   by gcc 4.8) says so, or none of the pre-v5 __ARM_ARCH_*__ macros
+   listed below is defined.  COUNT is used directly as the asm output
+   operand, so it must be an lvalue that fits a general register; a cast
+   there is not an lvalue and is rejected by some compilers.  */
+#if (defined __ARM_ARCH && __ARM_ARCH >= 5) || \
+ !(defined __ARM_ARCH_2__ || defined __ARM_ARCH_3__ || \
+ defined __ARM_ARCH_3M__ || defined __ARM_ARCH_4__ || \
+ defined __ARM_ARCH_4T__)
+#define count_leading_zeros(count, x) \
+ __asm__ ("clz %0, %1" \
+ : "=r" (count) \
+ : "r" ((USItype)(x)))
+#endif /* __ARM_ARCH >= 5 */
#endif /* __arm__ */
/***************************************