libbb/sha1: shrink and speed up fully unrolled version
function                                             old     new   delta
sha1_process_block64                                4149    3950    -199

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
0b62a08777
commit
f09d088fdf
@ -59,7 +59,7 @@ config SHA1_SMALL
|
|||||||
Trade binary size versus speed for the sha1 algorithm.
|
Trade binary size versus speed for the sha1 algorithm.
|
||||||
throughput MB/s size of sha1_process_block64
|
throughput MB/s size of sha1_process_block64
|
||||||
value 486 x86-64 486 x86-64
|
value 486 x86-64 486 x86-64
|
||||||
0 339 374 4149 4167
|
0 360 374 3950 4167
|
||||||
1 224 229 654 732
|
1 224 229 654 732
|
||||||
2,3 200 195 358 380
|
2,3 200 195 358 380
|
||||||
|
|
||||||
|
@ -509,6 +509,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
|||||||
d = ctx->hash[3];
|
d = ctx->hash[3];
|
||||||
e = ctx->hash[4];
|
e = ctx->hash[4];
|
||||||
|
|
||||||
|
/* From kernel source comments:
|
||||||
|
* """
|
||||||
|
* If you have 32 registers or more, the compiler can (and should)
|
||||||
|
* try to change the array[] accesses into registers. However, on
|
||||||
|
* machines with less than ~25 registers, that won't really work,
|
||||||
|
* and at least gcc will make an unholy mess of it.
|
||||||
|
*
|
||||||
|
* So to avoid that mess which just slows things down, we force
|
||||||
|
* the stores to memory to actually happen (we might be better off
|
||||||
|
* with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
|
||||||
|
* suggested by Artur Skawina - that will also make gcc unable to
|
||||||
|
* try to do the silly "optimize away loads" part because it won't
|
||||||
|
* see what the value will be).
|
||||||
|
* """
|
||||||
|
*/
|
||||||
|
#if defined(__i386__)
|
||||||
|
# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
|
||||||
|
#else
|
||||||
|
# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
|
||||||
|
#endif
|
||||||
|
|
||||||
#undef OP
|
#undef OP
|
||||||
#define OP(A,B,C,D,E, n) \
|
#define OP(A,B,C,D,E, n) \
|
||||||
do { \
|
do { \
|
||||||
@ -517,6 +538,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
|
|||||||
work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
|
work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
|
||||||
if (n >= 16) \
|
if (n >= 16) \
|
||||||
work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
|
work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
|
||||||
|
DO_NOT_TRY_PROPAGATING(W[n & 15]); \
|
||||||
E += work + rotl32(A, 5) + rconsts[n / 20]; \
|
E += work + rotl32(A, 5) + rconsts[n / 20]; \
|
||||||
B = rotl32(B, 30); \
|
B = rotl32(B, 30); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
Loading…
Reference in New Issue
Block a user