libbb/sha1: shrink and speed up fully unrolled version

function                                             old     new   delta
sha1_process_block64                                4149    3950    -199

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
2021-12-31 17:06:00 +01:00
parent 0b62a08777
commit f09d088fdf
2 changed files with 23 additions and 1 deletion
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
 	Trade binary size versus speed for the sha1 algorithm.
 	                throughput MB/s   size of sha1_process_block64
 	value           486  x86-64       486   x86-64
-	0               339  374          4149  4167
+	0               360  374          3950  4167
 	1               224  229           654   732
 	2,3             200  195           358   380

--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -509,6 +509,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	d = ctx->hash[3];
 	e = ctx->hash[4];

+/* From kernel source comments:
+ * """
+ * If you have 32 registers or more, the compiler can (and should)
+ * try to change the array[] accesses into registers. However, on
+ * machines with less than ~25 registers, that won't really work,
+ * and at least gcc will make an unholy mess of it.
+ *
+ * So to avoid that mess which just slows things down, we force
+ * the stores to memory to actually happen (we might be better off
+ * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+ * suggested by Artur Skawina - that will also make gcc unable to
+ * try to do the silly "optimize away loads" part because it won't
+ * see what the value will be).
+ * """
+ */
+#if defined(__i386__)
+# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
+#else
+# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
+#endif
+
 #undef OP
 #define OP(A,B,C,D,E, n) \
 	do { \
@@ -517,6 +538,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 			work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
 		if (n >= 16) \
 			work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
+		DO_NOT_TRY_PROPAGATING(W[n & 15]); \
 		E += work + rotl32(A, 5) + rconsts[n / 20]; \
 		B = rotl32(B, 30); \
 	} while (0)