bzip2: work around bad compiler optimization
gc-6.1.1 x86_64: function old new delta generateMTFValues 380 367 -13 gcc-4.3.1 386: function old new delta inner_loop - 41 +41 generateMTFValues 357 294 -63 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 41/-63) Total: -22 bytes gcc-6.3.0 386: function old new delta inner_loop - 36 +36 generateMTFValues 363 250 -113 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 36/-113) Total: -77 bytes The last case, gcc-6.3.0, runs almost 3 times faster after this change. Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
f75a7c0439
commit
c2a51b0cf1
@ -158,6 +158,38 @@ void makeMaps_e(EState* s)
|
||||
|
||||
|
||||
/*---------------------------------------------------*/
|
||||
/*
|
||||
* This bit of code is performance-critical.
|
||||
* On 32bit x86, gcc-6.3.0 was observed to spill ryy_j to stack,
|
||||
* resulting in abysmal performance (x3 slowdown).
|
||||
* Forcing it into a separate function alleviates register pressure,
|
||||
* and spillage no longer happens.
|
||||
* Other versions of gcc do not exhibit this problem, but out-of-line code
|
||||
* seems to be helping them too (code is both smaller and faster).
|
||||
* Therefore NOINLINE is enabled for the entire 32bit x86 arch for now,
|
||||
* without a check for gcc version.
|
||||
*/
|
||||
static
|
||||
#if defined __i386__
|
||||
NOINLINE
|
||||
#endif
|
||||
int inner_loop(uint8_t *yy, uint8_t ll_i)
|
||||
{
|
||||
register uint8_t rtmp;
|
||||
register uint8_t* ryy_j;
|
||||
rtmp = yy[1];
|
||||
yy[1] = yy[0];
|
||||
ryy_j = &(yy[1]);
|
||||
while (ll_i != rtmp) {
|
||||
register uint8_t rtmp2;
|
||||
ryy_j++;
|
||||
rtmp2 = rtmp;
|
||||
rtmp = *ryy_j;
|
||||
*ryy_j = rtmp2;
|
||||
}
|
||||
yy[0] = rtmp;
|
||||
return ryy_j - &(yy[0]);
|
||||
}
|
||||
static NOINLINE
|
||||
void generateMTFValues(EState* s)
|
||||
{
|
||||
@ -165,7 +197,6 @@ void generateMTFValues(EState* s)
|
||||
int i;
|
||||
int zPend;
|
||||
int32_t wr;
|
||||
int32_t EOB;
|
||||
|
||||
/*
|
||||
* After sorting (eg, here),
|
||||
@ -189,15 +220,12 @@ void generateMTFValues(EState* s)
|
||||
* compressBlock().
|
||||
*/
|
||||
uint32_t* ptr = s->ptr;
|
||||
uint8_t* block = s->block;
|
||||
uint16_t* mtfv = s->mtfv;
|
||||
|
||||
makeMaps_e(s);
|
||||
EOB = s->nInUse+1;
|
||||
|
||||
wr = 0;
|
||||
zPend = 0;
|
||||
for (i = 0; i <= EOB; i++)
|
||||
for (i = 0; i <= s->nInUse+1; i++)
|
||||
s->mtfFreq[i] = 0;
|
||||
|
||||
for (i = 0; i < s->nInUse; i++)
|
||||
@ -211,7 +239,7 @@ void generateMTFValues(EState* s)
|
||||
j = ptr[i] - 1;
|
||||
if (j < 0)
|
||||
j += s->nblock;
|
||||
ll_i = s->unseqToSeq[block[j]];
|
||||
ll_i = s->unseqToSeq[s->block[j]];
|
||||
AssertD(ll_i < s->nInUse, "generateMTFValues(2a)");
|
||||
|
||||
if (yy[0] == ll_i) {
|
||||
@ -225,15 +253,15 @@ void generateMTFValues(EState* s)
|
||||
while (1) {
|
||||
#if 0
|
||||
if (zPend & 1) {
|
||||
mtfv[wr] = BZ_RUNB; wr++;
|
||||
s->mtfv[wr] = BZ_RUNB; wr++;
|
||||
s->mtfFreq[BZ_RUNB]++;
|
||||
} else {
|
||||
mtfv[wr] = BZ_RUNA; wr++;
|
||||
s->mtfv[wr] = BZ_RUNA; wr++;
|
||||
s->mtfFreq[BZ_RUNA]++;
|
||||
}
|
||||
#else /* same as above, since BZ_RUNA is 0 and BZ_RUNB is 1 */
|
||||
unsigned run = zPend & 1;
|
||||
mtfv[wr] = run;
|
||||
s->mtfv[wr] = run;
|
||||
wr++;
|
||||
s->mtfFreq[run]++;
|
||||
#endif
|
||||
@ -247,36 +275,19 @@ void generateMTFValues(EState* s)
|
||||
goto end;
|
||||
zPend = 0;
|
||||
}
|
||||
{
|
||||
register uint8_t rtmp;
|
||||
register uint8_t* ryy_j;
|
||||
register uint8_t rll_i;
|
||||
rtmp = yy[1];
|
||||
yy[1] = yy[0];
|
||||
ryy_j = &(yy[1]);
|
||||
rll_i = ll_i;
|
||||
while (rll_i != rtmp) {
|
||||
register uint8_t rtmp2;
|
||||
ryy_j++;
|
||||
rtmp2 = rtmp;
|
||||
rtmp = *ryy_j;
|
||||
*ryy_j = rtmp2;
|
||||
}
|
||||
yy[0] = rtmp;
|
||||
j = ryy_j - &(yy[0]);
|
||||
mtfv[wr] = j+1;
|
||||
wr++;
|
||||
s->mtfFreq[j+1]++;
|
||||
}
|
||||
j = inner_loop(yy, ll_i);
|
||||
s->mtfv[wr] = j+1;
|
||||
wr++;
|
||||
s->mtfFreq[j+1]++;
|
||||
}
|
||||
|
||||
i = -1;
|
||||
if (zPend > 0)
|
||||
goto process_zPend; /* "process it and come back here" */
|
||||
end:
|
||||
mtfv[wr] = EOB;
|
||||
s->mtfv[wr] = s->nInUse+1;
|
||||
wr++;
|
||||
s->mtfFreq[EOB]++;
|
||||
s->mtfFreq[s->nInUse+1]++;
|
||||
|
||||
s->nMTF = wr;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user