bzip2: work around bad compiler optimization

gc-6.1.1 x86_64: function old new delta generateMTFValues 380 367 -13 gcc-4.3.1 386: function old new delta inner_loop - 41 +41 generateMTFValues 357 294 -63 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 41/-63) Total: -22 bytes gcc-6.3.0 386: function old new delta inner_loop - 36 +36 generateMTFValues 363 250 -113 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 36/-113) Total: -77 bytes The last case, gcc-6.3.0, runs almost 3 times faster after this change. Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
2018-02-05 00:34:08 +01:00
parent f75a7c0439
commit c2a51b0cf1
1 changed files with 43 additions and 32 deletions
--- a/archival/libarchive/bz/compress.c
+++ b/archival/libarchive/bz/compress.c
@ -158,6 +158,38 @@ void makeMaps_e(EState* s)


 /*---------------------------------------------------*/
+/*
+ * This bit of code is performance-critical.
+ * On 32bit x86, gcc-6.3.0 was observed to spill ryy_j to stack,
+ * resulting in abysmal performance (x3 slowdown).
+ * Forcing it into a separate function alleviates register pressure,
+ * and spillage no longer happens.
+ * Other versions of gcc do not exhibit this problem, but out-of-line code
+ * seems to be helping them too (code is both smaller and faster).
+ * Therefore NOINLINE is enabled for the entire 32bit x86 arch for now,
+ * without a check for gcc version.
+ */
+static
+#if defined __i386__
+NOINLINE
+#endif
+int inner_loop(uint8_t *yy, uint8_t ll_i)
+{
+	register uint8_t  rtmp;
+	register uint8_t* ryy_j;
+	rtmp  = yy[1];
+	yy[1] = yy[0];
+	ryy_j = &(yy[1]);
+	while (ll_i != rtmp) {
+		register uint8_t rtmp2;
+		ryy_j++;
+		rtmp2  = rtmp;
+		rtmp   = *ryy_j;
+		*ryy_j = rtmp2;
+	}
+	yy[0] = rtmp;
+	return ryy_j - &(yy[0]);
+}
 static NOINLINE
 void generateMTFValues(EState* s)
 {
@ -165,7 +197,6 @@ void generateMTFValues(EState* s)
 	int i;
 	int zPend;
 	int32_t wr;
-	int32_t EOB;

 	/*
 	 * After sorting (eg, here),
@ -189,15 +220,12 @@ void generateMTFValues(EState* s)
 	 * compressBlock().
 	 */
 	uint32_t* ptr   = s->ptr;
-	uint8_t*  block = s->block;
-	uint16_t* mtfv  = s->mtfv;

 	makeMaps_e(s);
-	EOB = s->nInUse+1;

 	wr = 0;
 	zPend = 0;
-	for (i = 0; i <= EOB; i++)
+	for (i = 0; i <= s->nInUse+1; i++)
 		s->mtfFreq[i] = 0;

 	for (i = 0; i < s->nInUse; i++)
@ -211,7 +239,7 @@ void generateMTFValues(EState* s)
 		j = ptr[i] - 1;
 		if (j < 0)
 			j += s->nblock;
-		ll_i = s->unseqToSeq[block[j]];
+		ll_i = s->unseqToSeq[s->block[j]];
 		AssertD(ll_i < s->nInUse, "generateMTFValues(2a)");

 		if (yy[0] == ll_i) {
@ -225,15 +253,15 @@ void generateMTFValues(EState* s)
 			while (1) {
 #if 0
 				if (zPend & 1) {
-					mtfv[wr] = BZ_RUNB; wr++;
+					s->mtfv[wr] = BZ_RUNB; wr++;
 					s->mtfFreq[BZ_RUNB]++;
 				} else {
-					mtfv[wr] = BZ_RUNA; wr++;
+					s->mtfv[wr] = BZ_RUNA; wr++;
 					s->mtfFreq[BZ_RUNA]++;
 				}
 #else /* same as above, since BZ_RUNA is 0 and BZ_RUNB is 1 */
 				unsigned run = zPend & 1;
-				mtfv[wr] = run;
+				s->mtfv[wr] = run;
 				wr++;
 				s->mtfFreq[run]++;
 #endif
@ -247,36 +275,19 @@ void generateMTFValues(EState* s)
 				goto end;
 			zPend = 0;
 		}
-		{
-			register uint8_t  rtmp;
-			register uint8_t* ryy_j;
-			register uint8_t  rll_i;
-			rtmp  = yy[1];
-			yy[1] = yy[0];
-			ryy_j = &(yy[1]);
-			rll_i = ll_i;
-			while (rll_i != rtmp) {
-				register uint8_t rtmp2;
-				ryy_j++;
-				rtmp2  = rtmp;
-				rtmp   = *ryy_j;
-				*ryy_j = rtmp2;
-			}
-			yy[0] = rtmp;
-			j = ryy_j - &(yy[0]);
-			mtfv[wr] = j+1;
-			wr++;
-			s->mtfFreq[j+1]++;
-		}
+		j = inner_loop(yy, ll_i);
+		s->mtfv[wr] = j+1;
+		wr++;
+		s->mtfFreq[j+1]++;
 	}

 	i = -1;
 	if (zPend > 0)
 		goto process_zPend; /* "process it and come back here" */
 end:
-	mtfv[wr] = EOB;
+	s->mtfv[wr] = s->nInUse+1;
 	wr++;
-	s->mtfFreq[EOB]++;
+	s->mtfFreq[s->nInUse+1]++;

 	s->nMTF = wr;
 }