libbb/sha1: shrink and speed up fully unrolled version

function                                             old     new   delta
sha1_process_block64                                4149    3950    -199

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
commit f09d088fdf
parent 0b62a08777
Author: Denys Vlasenko <vda.linux@googlemail.com>
Date:   2021-12-31 17:06:00 +01:00

2 files changed, 23 insertions(+), 1 deletion(-)

libbb/Config.src

@@ -59,7 +59,7 @@ config SHA1_SMALL
 	Trade binary size versus speed for the sha1 algorithm.
 	throughput MB/s    size of sha1_process_block64
 	value 486  x86-64  486   x86-64
-	0     339  374     4149  4167
+	0     360  374     3950  4167
 	1     224  229      654   732
 	2,3   200  195      358   380

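For reference, SHA1_SMALL is a numeric Kconfig option chosen at configuration
time; an illustrative .config fragment (not part of this commit) selecting the
fully unrolled variant benchmarked above would be:

# pick row "0" of the table above: largest code, highest throughput
CONFIG_SHA1_SMALL=0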
libbb/hash_md5_sha.c

@@ -509,6 +509,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	d = ctx->hash[3];
 	e = ctx->hash[4];
+	/* From kernel source comments:
+	 * """
+	 * If you have 32 registers or more, the compiler can (and should)
+	 * try to change the array[] accesses into registers. However, on
+	 * machines with less than ~25 registers, that won't really work,
+	 * and at least gcc will make an unholy mess of it.
+	 *
+	 * So to avoid that mess which just slows things down, we force
+	 * the stores to memory to actually happen (we might be better off
+	 * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+	 * suggested by Artur Skawina - that will also make gcc unable to
+	 * try to do the silly "optimize away loads" part because it won't
+	 * see what the value will be).
+	 * """
+	 */
+#if defined(__i386__)
+# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
+#else
+# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
+#endif
 #undef OP
 #define OP(A,B,C,D,E, n) \
 	do { \
@@ -517,6 +538,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 			work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
 		if (n >= 16) \
 			work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
+		DO_NOT_TRY_PROPAGATING(W[n & 15]); \
 		E += work + rotl32(A, 5) + rconsts[n / 20]; \
 		B = rotl32(B, 30); \
 	} while (0)
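
To see the barrier in isolation: an empty asm with a "+m" operand forces the
store to memory to actually happen and makes the stored value opaque to the
optimizer, so gcc can neither cache W[] in registers nor constant-propagate a
known value into later rounds. A minimal standalone sketch (the macro body is
the one from the patch, but gated here on __GNUC__ so the demo builds with any
GNU-compatible compiler; the surrounding harness is illustrative, not BusyBox
code):

/* build: gcc -O2 demo.c && ./a.out */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#if defined(__GNUC__)
/* Empty asm with a "+m" operand: the store must really occur, and the
 * compiler must assume the value may have changed behind its back. */
# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
#else
# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
#endif

int main(void)
{
	uint32_t W[16] = { 0 };

	W[0] = 0x5a827999;            /* the compiler knows this constant... */
	DO_NOT_TRY_PROPAGATING(W[0]); /* ...but must now reload it from memory */
	printf("%08" PRIx32 "\n", W[0] + 1); /* cannot be folded at compile time */
	return 0;
}

The patch enables the barrier only on __i386__, where the register pressure
described in the kernel comment actually hurts; on x86-64 the macro is a
no-op, which matches the unchanged x86-64 numbers (4167 bytes) in the table
above.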