From 911344a99889319a7dba8a725a64dc324597f9eb Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 6 Oct 2021 17:17:34 +0200
Subject: [PATCH] tls: P256: x86-64 assembly

function                                             old     new   delta
sp_256_mont_mul_8                                    127     155     +28
sp_256_proj_point_dbl_8                              448     469     +21
sp_256_mont_sub_8                                     23      35     +12
sp_256_mont_dbl_8                                     26      38     +12
sp_256_sub_8                                          44      49      +5
sp_256_ecc_mulmod_8                                 1530    1535      +5
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 6/0 up/down: 83/0)               Total: 83 bytes

Signed-off-by: Denys Vlasenko
---
 networking/tls_sp_c32.c | 99 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 532047739..14a7c7066 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -189,6 +189,34 @@ static int sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 "\n		movl	%3, 7*4(%2)"
 "\n"
 "\n		sbbl	%3, %3"
+"\n"
+		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+		: "0" (a), "1" (b), "2" (r)
+		: "memory"
+	);
+	return reg;
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+	/* x86_64 has no alignment restrictions, and is little-endian,
+	 * so 64-bit and 32-bit representations are identical */
+	uint64_t reg;
+	asm volatile (
+"\n		movq	(%0), %3"
+"\n		addq	(%1), %3"
+"\n		movq	%3, (%2)"
+"\n"
+"\n		movq	1*8(%0), %3"
+"\n		adcq	1*8(%1), %3"
+"\n		movq	%3, 1*8(%2)"
+"\n"
+"\n		movq	2*8(%0), %3"
+"\n		adcq	2*8(%1), %3"
+"\n		movq	%3, 2*8(%2)"
+"\n"
+"\n		movq	3*8(%0), %3"
+"\n		adcq	3*8(%1), %3"
+"\n		movq	%3, 3*8(%2)"
+"\n"
+"\n		sbbq	%3, %3"
 "\n"
 		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
 		: "0" (a), "1" (b), "2" (r)
@@ -259,6 +287,34 @@ static int sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 "\n		movl	%3, 7*4(%2)"
 "\n"
 "\n		sbbl	%3, %3"
+"\n"
+		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+		: "0" (a), "1" (b), "2" (r)
+		: "memory"
+	);
+	return reg;
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+	/* x86_64 has no alignment restrictions, and is little-endian,
+	 * so 64-bit and 32-bit representations are identical */
+	uint64_t reg;
+	asm volatile (
+"\n		movq	(%0), %3"
+"\n		subq	(%1), %3"
+"\n		movq	%3, (%2)"
+"\n"
+"\n		movq	1*8(%0), %3"
+"\n		sbbq	1*8(%1), %3"
+"\n		movq	%3, 1*8(%2)"
+"\n"
+"\n		movq	2*8(%0), %3"
+"\n		sbbq	2*8(%1), %3"
+"\n		movq	%3, 2*8(%2)"
+"\n"
+"\n		movq	3*8(%0), %3"
+"\n		sbbq	3*8(%1), %3"
+"\n		movq	%3, 3*8(%2)"
+"\n"
+"\n		sbbq	%3, %3"
 "\n"
 		: "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
 		: "0" (a), "1" (b), "2" (r)
@@ -380,6 +436,49 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 	}
 	r[15] = accl;
 	memcpy(r, rr, sizeof(rr));
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+	/* x86_64 has no alignment restrictions, and is little-endian,
+	 * so 64-bit and 32-bit representations are identical */
+	const uint64_t* aa = (const void*)a;
+	const uint64_t* bb = (const void*)b;
+	uint64_t rr[8];
+	int k;
+	uint64_t accl;
+	uint64_t acch;
+
+	acch = accl = 0;
+	for (k = 0; k < 7; k++) {
+		int i, j;
+		uint64_t acc_hi;
+		i = k - 3;
+		if (i < 0)
+			i = 0;
+		j = k - i;
+		acc_hi = 0;
+		do {
+////////////////////////
+//			uint128_t m = ((uint128_t)a[i]) * b[j];
+//			acc_hi:acch:accl += m;
+			asm volatile (
+			// aa[i] is already loaded in %%rax
+"\n		mulq	%7"
+"\n		addq	%%rax, %0"
+"\n		adcq	%%rdx, %1"
+"\n		adcq	$0, %2"
+			: "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
+			: "0" (accl), "1" (acch), "2" (acc_hi), "a" (aa[i]), "m" (bb[j])
+			: "cc", "dx"
+			);
+////////////////////////
+			j--;
+			i++;
+		} while (i != 4 && i <= k);
+		rr[k] = accl;
+		accl = acch;
+		acch = acc_hi;
+	}
+	rr[7] = accl;
+	memcpy(r, rr, sizeof(rr));
 #elif 0 //TODO: arm assembly (untested)
 	sp_digit tmp[16];
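
The new sp_256_add_8 and sp_256_sub_8 paths above are plain four-limb carry chains: the first limb uses addq/subq, the remaining limbs use adcq/sbbq, and the final "sbbq %3, %3" turns the carry/borrow flag into a 0 or all-ones value. A portable C sketch of the add path, for reference only (the helper name and the unsigned __int128 usage are illustrative assumptions, not part of the patch):

#include <stdint.h>

/* r = a + b over four 64-bit limbs; returns 0 on no carry, ~0 on carry,
 * mirroring the asm's "sbbq %3, %3" idiom */
static uint64_t add_4limb(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
{
	uint64_t carry = 0;
	int i;
	for (i = 0; i < 4; i++) {
		unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
		r[i] = (uint64_t)t;
		carry = (uint64_t)(t >> 64); /* 0 or 1, like the CPU carry flag */
	}
	return 0 - carry;
}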
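
The new sp_256_mul_8 path is the column-wise (Comba-style) schoolbook multiply described by the patch's own comment ("uint128_t m = ((uint128_t)a[i]) * b[j]; acc_hi:acch:accl += m"): for each output limb k it sums the products a[i]*b[j] with i + j == k into a 192-bit accumulator, which the asm updates with mulq/addq/adcq/adcq $0. A portable C sketch under the same assumptions (four 64-bit input limbs, eight output limbs, GCC/clang unsigned __int128 available); mul_4limb is an illustrative name, not a function in the file:

#include <stdint.h>
#include <string.h>

static void mul_4limb(uint64_t r[8], const uint64_t a[4], const uint64_t b[4])
{
	uint64_t rr[8];
	uint64_t accl = 0, acch = 0;
	int k;

	for (k = 0; k < 7; k++) {
		uint64_t acc_hi = 0;
		int i = (k < 3) ? 0 : k - 3;
		int j = k - i;
		do {
			/* one "mulq %7; addq; adcq; adcq $0" step */
			unsigned __int128 m = (unsigned __int128)a[i] * b[j];
			unsigned __int128 t = (unsigned __int128)accl + (uint64_t)m;
			accl = (uint64_t)t;
			t = (unsigned __int128)acch + (uint64_t)(m >> 64) + (uint64_t)(t >> 64);
			acch = (uint64_t)t;
			acc_hi += (uint64_t)(t >> 64);
			j--;
			i++;
		} while (i != 4 && i <= k);
		rr[k] = accl;	/* column k is done */
		accl = acch;	/* shift the accumulator down one limb */
		acch = acc_hi;
	}
	rr[7] = accl;
	memcpy(r, rr, sizeof(rr));	/* write via rr[], as the patch does */
}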