libbb/sha1: add config-selectable fully unrolled version, closes 14391

function                                             old     new   delta
sha1_process_block64                                 364    4167   +3803
static.rconsts                                        16       -     -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16)         Total: 3787 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2021-12-30 13:07:12 +01:00
parent 9173c9cce4
commit 25aadc893d
2 changed files with 95 additions and 14 deletions

View File

@ -42,21 +42,32 @@ config MD5_SMALL
default 1 # all "fast or small" options default to small default 1 # all "fast or small" options default to small
range 0 3 range 0 3
help help
Trade binary size versus speed for the md5sum algorithm. Trade binary size versus speed for the md5 algorithm.
Approximate values running uClibc and hashing Approximate values running uClibc and hashing
linux-2.4.4.tar.bz2 were: linux-2.4.4.tar.bz2 were:
value user times (sec) text size (386) value user times (sec) text size (386)
0 (fastest) 1.1 6144 0 (fastest) 1.1 6144
1 1.4 5392 1 1.4 5392
2 3.0 5088 2 3.0 5088
3 (smallest) 5.1 4912 3 (smallest) 5.1 4912
config SHA1_SMALL
int "SHA1: Trade bytes for speed (0:fast, 3:slow)"
default 3 # all "fast or small" options default to small
range 0 3
help
Trade binary size versus speed for the sha1 algorithm.
throughput MB/s size of sha1_process_block64
value 486 x86-64 486 x86-64
0 339 374 4149 4167
1,2,3 200 195 358 380
config SHA3_SMALL config SHA3_SMALL
int "SHA3: Trade bytes for speed (0:fast, 1:slow)" int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
default 1 # all "fast or small" options default to small default 1 # all "fast or small" options default to small
range 0 1 range 0 1
help help
Trade binary size versus speed for the sha3sum algorithm. Trade binary size versus speed for the sha3 algorithm.
SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate): SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
64-bit x86: +270 bytes of code, 45% faster 64-bit x86: +270 bytes of code, 45% faster
32-bit x86: +450 bytes of code, 75% faster 32-bit x86: +450 bytes of code, 75% faster

View File

@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
OP(FI, D, A, B, C, 11, 10, 0xbd3af235); OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb); OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
OP(FI, B, C, D, A, 9, 21, 0xeb86d391); OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
# undef OP
# endif # endif
/* Add checksum to the starting values */ /* Add checksum to the starting values */
ctx->hash[0] += A; ctx->hash[0] += A;
@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
ctx->hash[3] += D; ctx->hash[3] += D;
#endif #endif
} }
#undef OP
#undef FF #undef FF
#undef FG #undef FG
#undef FH #undef FH
@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
* then rebuild and compare "shaNNNsum bigfile" results. * then rebuild and compare "shaNNNsum bigfile" results.
*/ */
#if CONFIG_SHA1_SMALL == 0
/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
* It seems further speedup can be achieved by handling more than
* 64 bytes per one function call (coreutils does that).
*/
/* Process one 64-byte input block from ctx->wbuffer and fold it into
 * the running hash state ctx->hash[0..4] (fully unrolled variant).
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{
/* The four SHA1 round constants, one per 20-step round;
 * indexed below as rconsts[n / 20].
 */
static const uint32_t rconsts[] ALIGN4 = {
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};
/* 16-word circular message schedule: W[n & 0xf] holds the most
 * recent 16 expanded words instead of a full 80-word array.
 */
uint32_t W[16];
uint32_t a, b, c, d, e;
a = ctx->hash[0];
b = ctx->hash[1];
c = ctx->hash[2];
d = ctx->hash[3];
e = ctx->hash[4];
/* One SHA1 step. n is a compile-time constant at every expansion,
 * so the compiler keeps exactly one of the two "if" branches:
 * steps 0..15 load (and byte-swap) input words from wbuffer,
 * steps 16..79 compute the schedule word W[n] = rotl1(W[n-3] ^
 * W[n-8] ^ W[n-14] ^ W[n-16]), expressed here with +13/+8/+2/+0
 * offsets modulo 16 on the circular buffer.
 */
#undef OP
#define OP(A,B,C,D,E, n) \
do { \
uint32_t work = EXPR(B, C, D); \
if (n <= 15) \
work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
if (n >= 16) \
work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
E += work + rotl32(A, 5) + rconsts[n / 20]; \
B = rotl32(B, 30); \
} while (0)
/* 20 steps with the five state variables rotated by role on each
 * step (instead of shuffling values between registers).
 */
#define OP20(n) \
OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
/* 4 rounds of 20 operations each */
/* Round 1: "choose" function: d ^ (b & (c ^ d)) form */
#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
OP20(0);
#undef EXPR
/* Round 2: parity (xor of all three) */
#define EXPR(b,c,d) (c ^ d ^ b)
OP20(20);
#undef EXPR
/* Round 3: majority function */
#define EXPR(b,c,d) (((b | c) & d) | (b & c))
OP20(40);
#undef EXPR
/* Round 4: parity again */
#define EXPR(b,c,d) (c ^ d ^ b)
OP20(60);
#undef EXPR
#undef OP
#undef OP20
/* Add this block's result to the running hash values */
ctx->hash[0] += a;
ctx->hash[1] += b;
ctx->hash[2] += c;
ctx->hash[3] += d;
ctx->hash[4] += e;
}
#else
/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
/* Compact version, almost twice as slow as fully unrolled */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx) static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
{ {
static const uint32_t rconsts[] ALIGN4 = { static const uint32_t rconsts[] ALIGN4 = {
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
}; };
int i, j; int i, j;
int cnt; int n;
uint32_t W[16+16]; uint32_t W[16+16];
uint32_t a, b, c, d, e; uint32_t a, b, c, d, e;
/* On-stack work buffer frees up one register in the main loop /* On-stack work buffer frees up one register in the main loop
* which otherwise will be needed to hold ctx pointer */ * which otherwise will be needed to hold ctx pointer.
*
* The compiler is not smart enough to realize it, though. :(
* If __attribute__((optimize("2"))) is added to the function,
* only then gcc-9.3.1 spills "ctx" to stack and uses the freed
* register (making code 6 bytes smaller, not just faster).
*/
for (i = 0; i < 16; i++) for (i = 0; i < 16; i++)
W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]); W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
e = ctx->hash[4]; e = ctx->hash[4];
/* 4 rounds of 20 operations each */ /* 4 rounds of 20 operations each */
cnt = 0; n = 0;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
j = 19; j = 19;
do { do {
@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
else /* i = 1 or 3 */ else /* i = 1 or 3 */
work ^= b; work ^= b;
ge16: ge16:
W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1); W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
} }
work += W[cnt]; work += W[n];
work += e + rotl32(a, 5) + rconsts[i]; work += e + rotl32(a, 5) + rconsts[i];
/* Rotate by one for next time */ /* Rotate by one for next time */
@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
c = rotl32(b, 30); c = rotl32(b, 30);
b = a; b = a;
a = work; a = work;
cnt = (cnt + 1) & 15; n = (n + 1) & 15;
} while (--j >= 0); } while (--j >= 0);
} }
@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
ctx->hash[3] += d; ctx->hash[3] += d;
ctx->hash[4] += e; ctx->hash[4] += e;
} }
#endif
/* Constants for SHA512 from FIPS 180-2:4.2.3. /* Constants for SHA512 from FIPS 180-2:4.2.3.
* SHA256 constants from FIPS 180-2:4.2.2 * SHA256 constants from FIPS 180-2:4.2.2