From d728a30c211c2df6adccd64c6e2fc23387b341f2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 26 Apr 2021 23:07:32 +0200 Subject: [PATCH] tls: add a patch with optimization which _should_ give better code ...but does not. Signed-off-by: Denys Vlasenko --- networking/tls_sp_c32.patch | 142 ++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 networking/tls_sp_c32.patch diff --git a/networking/tls_sp_c32.patch b/networking/tls_sp_c32.patch new file mode 100644 index 000000000..7559586c9 --- /dev/null +++ b/networking/tls_sp_c32.patch @@ -0,0 +1,142 @@ +Somehow, gcc 6+ performs this optimization as well as or better than the +hand-written optimized code below (gcc seems to eliminate the a32[] array and uses 32-bit +registers/memory for the "lower halves" of a32[i] elements). + +But there may be architectures where gcc won't be this good? + +diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c +index 72a3be537..e8a011ad1 100644 +--- a/networking/tls_sp_c32.c ++++ b/networking/tls_sp_c32.c +@@ -228,51 +228,96 @@ static void sp_256_rshift1_10(sp_digit* r, sp_digit* a) + static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a) + { + int64_t t[8]; +- int64_t a32[8]; ++ uint32_t a32; + int64_t o; + +- a32[0] = a[0]; +- a32[0] |= a[1] << 26; +- a32[0] &= 0xffffffff; +- a32[1] = (sp_digit)(a[1] >> 6); +- a32[1] |= a[2] << 20; +- a32[1] &= 0xffffffff; +- a32[2] = (sp_digit)(a[2] >> 12); +- a32[2] |= a[3] << 14; +- a32[2] &= 0xffffffff; +- a32[3] = (sp_digit)(a[3] >> 18); +- a32[3] |= a[4] << 8; +- a32[3] &= 0xffffffff; +- a32[4] = (sp_digit)(a[4] >> 24); +- a32[4] |= a[5] << 2; +- a32[4] |= a[6] << 28; +- a32[4] &= 0xffffffff; +- a32[5] = (sp_digit)(a[6] >> 4); +- a32[5] |= a[7] << 22; +- a32[5] &= 0xffffffff; +- a32[6] = (sp_digit)(a[7] >> 10); +- a32[6] |= a[8] << 16; +- a32[6] &= 0xffffffff; +- a32[7] = (sp_digit)(a[8] >> 16); +- a32[7] |= a[9] << 10; +- a32[7] &= 0xffffffff; +- + /* 1 1 0 -1 -1 -1 -1 0 */ +- t[0] = 0 + a32[0] + a32[1] - 
a32[3] - a32[4] - a32[5] - a32[6]; + /* 0 1 1 0 -1 -1 -1 -1 */ +- t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7]; + /* 0 0 1 1 0 -1 -1 -1 */ +- t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7]; + /* -1 -1 0 2 2 1 0 -1 */ +- t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7]; + /* 0 -1 -1 0 2 2 1 0 */ +- t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6]; + /* 0 0 -1 -1 0 2 2 1 */ +- t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7]; + /* -1 -1 0 0 0 1 3 2 */ +- t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7]; + /* 1 0 -1 -1 -1 -1 0 3 */ +- t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7]; ++ //t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6] ; ++ //t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7] ; ++ //t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7] ; ++ //t[3] = 0 - a32[0] - a32[1] + 2*a32[3] + 2*a32[4] + a32[5] - a32[7] ; ++ //t[4] = 0 - a32[1] - a32[2] + 2*a32[4] + 2*a32[5] + a32[6] ; ++ //t[5] = 0 - a32[2] - a32[3] + 2*a32[5] + 2*a32[6] + a32[7] ; ++ //t[6] = 0 - a32[0] - a32[1] + a32[5] + 3*a32[6] + 2*a32[7]; ++ //t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3*a32[7]; ++ ++#define A32 (int64_t)a32 ++ a32 = a[0]; ++ a32 |= a[1] << 26; ++ t[0] = 0 + A32; ++ t[3] = 0 - A32; ++ t[6] = 0 - A32; ++ t[7] = 0 + A32; ++ ++ a32 = (sp_digit)(a[1] >> 6); ++ a32 |= a[2] << 20; ++ t[0] += A32 ; ++ t[1] = 0 + A32; ++ t[3] -= A32 ; ++ t[4] = 0 - A32; ++ t[6] -= A32 ; ++ ++ a32 = (sp_digit)(a[2] >> 12); ++ a32 |= a[3] << 14; ++ t[1] += A32 ; ++ t[2] = 0 + A32; ++ t[4] -= A32 ; ++ t[5] = 0 - A32; ++ t[7] -= A32 ; ++ ++ a32 = (sp_digit)(a[3] >> 18); ++ a32 |= a[4] << 8; ++ t[0] -= A32 ; ++ t[2] += A32 ; ++ t[3] += 2*A32; ++ t[5] -= A32 ; ++ t[7] -= A32 ; ++ ++ a32 = (sp_digit)(a[4] >> 24); ++ a32 |= a[5] << 2; ++ a32 |= a[6] << 28; ++ t[0] -= A32 ; ++ t[1] -= A32 ; ++ t[3] += 2*A32; ++ t[4] += 2*A32; ++ t[7] -= A32 ; ++ ++ a32 = 
(sp_digit)(a[6] >> 4); ++ a32 |= a[7] << 22; ++ t[0] -= A32 ; ++ t[1] -= A32 ; ++ t[2] -= A32 ; ++ t[3] += A32 ; ++ t[4] += 2*A32; ++ t[5] += 2*A32; ++ t[6] += A32 ; ++ t[7] -= A32 ; ++ ++ a32 = (sp_digit)(a[7] >> 10); ++ a32 |= a[8] << 16; ++ t[0] -= A32 ; ++ t[1] -= A32 ; ++ t[2] -= A32 ; ++ t[4] += A32 ; ++ t[5] += 2*A32; ++ t[6] += 3*A32; ++ ++ a32 = (sp_digit)(a[8] >> 16); ++ a32 |= a[9] << 10; ++ t[1] -= A32 ; ++ t[2] -= A32 ; ++ t[3] -= A32 ; ++ t[5] += A32 ; ++ t[6] += 2*A32; ++ t[7] += 3*A32; ++#undef A32 + + t[1] += t[0] >> 32; t[0] &= 0xffffffff; + t[2] += t[1] >> 32; t[1] &= 0xffffffff;