libbb/sha1: add config-selectable fully unrolled version, closes 14391
function old new delta sha1_process_block64 364 4167 +3803 static.rconsts 16 - -16 ------------------------------------------------------------------------------ (add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16) Total: 3787 bytes Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
9173c9cce4
commit
25aadc893d
@ -42,21 +42,32 @@ config MD5_SMALL
|
|||||||
default 1 # all "fast or small" options default to small
|
default 1 # all "fast or small" options default to small
|
||||||
range 0 3
|
range 0 3
|
||||||
help
|
help
|
||||||
Trade binary size versus speed for the md5sum algorithm.
|
Trade binary size versus speed for the md5 algorithm.
|
||||||
Approximate values running uClibc and hashing
|
Approximate values running uClibc and hashing
|
||||||
linux-2.4.4.tar.bz2 were:
|
linux-2.4.4.tar.bz2 were:
|
||||||
value user times (sec) text size (386)
|
value user times (sec) text size (386)
|
||||||
0 (fastest) 1.1 6144
|
0 (fastest) 1.1 6144
|
||||||
1 1.4 5392
|
1 1.4 5392
|
||||||
2 3.0 5088
|
2 3.0 5088
|
||||||
3 (smallest) 5.1 4912
|
3 (smallest) 5.1 4912
|
||||||
|
|
||||||
|
config SHA1_SMALL
|
||||||
|
int "SHA1: Trade bytes for speed (0:fast, 3:slow)"
|
||||||
|
default 3 # all "fast or small" options default to small
|
||||||
|
range 0 3
|
||||||
|
help
|
||||||
|
Trade binary size versus speed for the sha1 algorithm.
|
||||||
|
throughput MB/s size of sha1_process_block64
|
||||||
|
value 486 x86-64 486 x86-64
|
||||||
|
0 339 374 4149 4167
|
||||||
|
1,2,3 200 195 358 380
|
||||||
|
|
||||||
config SHA3_SMALL
|
config SHA3_SMALL
|
||||||
int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
|
int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
|
||||||
default 1 # all "fast or small" options default to small
|
default 1 # all "fast or small" options default to small
|
||||||
range 0 1
|
range 0 1
|
||||||
help
|
help
|
||||||
Trade binary size versus speed for the sha3sum algorithm.
|
Trade binary size versus speed for the sha3 algorithm.
|
||||||
SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
|
SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
|
||||||
64-bit x86: +270 bytes of code, 45% faster
|
64-bit x86: +270 bytes of code, 45% faster
|
||||||
32-bit x86: +450 bytes of code, 75% faster
|
32-bit x86: +450 bytes of code, 75% faster
|
||||||
|
@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
|
|||||||
OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
|
OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
|
||||||
OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
|
OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
|
||||||
OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
|
OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
|
||||||
# undef OP
|
|
||||||
# endif
|
# endif
|
||||||
/* Add checksum to the starting values */
|
/* Add checksum to the starting values */
|
||||||
ctx->hash[0] += A;
|
ctx->hash[0] += A;
|
||||||
@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
|
|||||||
ctx->hash[3] += D;
|
ctx->hash[3] += D;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
#undef OP
|
||||||
#undef FF
|
#undef FF
|
||||||
#undef FG
|
#undef FG
|
||||||
#undef FH
|
#undef FH
|
||||||
@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
|
|||||||
* then rebuild and compare "shaNNNsum bigfile" results.
|
* then rebuild and compare "shaNNNsum bigfile" results.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#if CONFIG_SHA1_SMALL == 0
|
||||||
|
/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
|
||||||
|
* It seems further speedup can be achieved by handling more than
|
||||||
|
* 64 bytes per one function call (coreutils does that).
|
||||||
|
*/
|
||||||
|
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
||||||
|
{
|
||||||
|
static const uint32_t rconsts[] ALIGN4 = {
|
||||||
|
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
|
||||||
|
};
|
||||||
|
uint32_t W[16];
|
||||||
|
uint32_t a, b, c, d, e;
|
||||||
|
|
||||||
|
a = ctx->hash[0];
|
||||||
|
b = ctx->hash[1];
|
||||||
|
c = ctx->hash[2];
|
||||||
|
d = ctx->hash[3];
|
||||||
|
e = ctx->hash[4];
|
||||||
|
|
||||||
|
#undef OP
|
||||||
|
#define OP(A,B,C,D,E, n) \
|
||||||
|
do { \
|
||||||
|
uint32_t work = EXPR(B, C, D); \
|
||||||
|
if (n <= 15) \
|
||||||
|
work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
|
||||||
|
if (n >= 16) \
|
||||||
|
work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
|
||||||
|
E += work + rotl32(A, 5) + rconsts[n / 20]; \
|
||||||
|
B = rotl32(B, 30); \
|
||||||
|
} while (0)
|
||||||
|
#define OP20(n) \
|
||||||
|
OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
|
||||||
|
OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
|
||||||
|
OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
|
||||||
|
OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
|
||||||
|
|
||||||
|
/* 4 rounds of 20 operations each */
|
||||||
|
#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
|
||||||
|
OP20(0);
|
||||||
|
#undef EXPR
|
||||||
|
#define EXPR(b,c,d) (c ^ d ^ b)
|
||||||
|
OP20(20);
|
||||||
|
#undef EXPR
|
||||||
|
#define EXPR(b,c,d) (((b | c) & d) | (b & c))
|
||||||
|
OP20(40);
|
||||||
|
#undef EXPR
|
||||||
|
#define EXPR(b,c,d) (c ^ d ^ b)
|
||||||
|
OP20(60);
|
||||||
|
|
||||||
|
#undef EXPR
|
||||||
|
#undef OP
|
||||||
|
#undef OP20
|
||||||
|
|
||||||
|
ctx->hash[0] += a;
|
||||||
|
ctx->hash[1] += b;
|
||||||
|
ctx->hash[2] += c;
|
||||||
|
ctx->hash[3] += d;
|
||||||
|
ctx->hash[4] += e;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
|
||||||
|
|
||||||
|
/* Compact version, almost twice as slow as fully unrolled */
|
||||||
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
||||||
{
|
{
|
||||||
static const uint32_t rconsts[] ALIGN4 = {
|
static const uint32_t rconsts[] ALIGN4 = {
|
||||||
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
|
0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
|
||||||
};
|
};
|
||||||
int i, j;
|
int i, j;
|
||||||
int cnt;
|
int n;
|
||||||
uint32_t W[16+16];
|
uint32_t W[16+16];
|
||||||
uint32_t a, b, c, d, e;
|
uint32_t a, b, c, d, e;
|
||||||
|
|
||||||
/* On-stack work buffer frees up one register in the main loop
|
/* On-stack work buffer frees up one register in the main loop
|
||||||
* which otherwise will be needed to hold ctx pointer */
|
* which otherwise will be needed to hold ctx pointer.
|
||||||
|
*
|
||||||
|
* The compiler is not smart enough to realize it, though. :(
|
||||||
|
* If __attribute__((optimize("2"))) is added to the function,
|
||||||
|
* only then gcc-9.3.1 spills "ctx" to stack and uses the freed
|
||||||
|
* register (making code 6 bytes smaller, not just faster).
|
||||||
|
*/
|
||||||
for (i = 0; i < 16; i++)
|
for (i = 0; i < 16; i++)
|
||||||
W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
|
W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
|
||||||
|
|
||||||
@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
|||||||
e = ctx->hash[4];
|
e = ctx->hash[4];
|
||||||
|
|
||||||
/* 4 rounds of 20 operations each */
|
/* 4 rounds of 20 operations each */
|
||||||
cnt = 0;
|
n = 0;
|
||||||
for (i = 0; i < 4; i++) {
|
for (i = 0; i < 4; i++) {
|
||||||
j = 19;
|
j = 19;
|
||||||
do {
|
do {
|
||||||
@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
|||||||
else /* i = 1 or 3 */
|
else /* i = 1 or 3 */
|
||||||
work ^= b;
|
work ^= b;
|
||||||
ge16:
|
ge16:
|
||||||
W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
|
W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
|
||||||
}
|
}
|
||||||
work += W[cnt];
|
work += W[n];
|
||||||
work += e + rotl32(a, 5) + rconsts[i];
|
work += e + rotl32(a, 5) + rconsts[i];
|
||||||
|
|
||||||
/* Rotate by one for next time */
|
/* Rotate by one for next time */
|
||||||
@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
|||||||
c = rotl32(b, 30);
|
c = rotl32(b, 30);
|
||||||
b = a;
|
b = a;
|
||||||
a = work;
|
a = work;
|
||||||
cnt = (cnt + 1) & 15;
|
n = (n + 1) & 15;
|
||||||
} while (--j >= 0);
|
} while (--j >= 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
|||||||
ctx->hash[3] += d;
|
ctx->hash[3] += d;
|
||||||
ctx->hash[4] += e;
|
ctx->hash[4] += e;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Constants for SHA512 from FIPS 180-2:4.2.3.
|
/* Constants for SHA512 from FIPS 180-2:4.2.3.
|
||||||
* SHA256 constants from FIPS 180-2:4.2.2
|
* SHA256 constants from FIPS 180-2:4.2.2
|
||||||
|
Loading…
x
Reference in New Issue
Block a user