bzip2: work around bad compiler optimization
gcc-6.1.1 x86_64: function old new delta generateMTFValues 380 367 -13 gcc-4.3.1 386: function old new delta inner_loop - 41 +41 generateMTFValues 357 294 -63 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 41/-63) Total: -22 bytes gcc-6.3.0 386: function old new delta inner_loop - 36 +36 generateMTFValues 363 250 -113 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 36/-113) Total: -77 bytes The last case, gcc-6.3.0, runs almost 3 times faster after this change. Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
f75a7c0439
commit
c2a51b0cf1
@ -158,6 +158,38 @@ void makeMaps_e(EState* s)
|
|||||||
|
|
||||||
|
|
||||||
/*---------------------------------------------------*/
|
/*---------------------------------------------------*/
|
||||||
|
/*
|
||||||
|
* This bit of code is performance-critical.
|
||||||
|
* On 32bit x86, gcc-6.3.0 was observed to spill ryy_j to stack,
|
||||||
|
* resulting in abysmal performance (x3 slowdown).
|
||||||
|
* Forcing it into a separate function alleviates register pressure,
|
||||||
|
* and spillage no longer happens.
|
||||||
|
* Other versions of gcc do not exhibit this problem, but out-of-line code
|
||||||
|
* seems to be helping them too (code is both smaller and faster).
|
||||||
|
* Therefore NOINLINE is enabled for the entire 32bit x86 arch for now,
|
||||||
|
* without a check for gcc version.
|
||||||
|
*/
|
||||||
|
/*
 * Move-to-front step: locate the first slot at index >= 1 whose value
 * equals ll_i, shift every element in front of it one position towards
 * the end, and store the found value in yy[0].
 *
 * Returns the index at which ll_i was found (always >= 1).
 *
 * The search has no bounds check: it keeps walking until a matching
 * byte is seen, so a match must exist at index >= 1.
 */
static
#if defined __i386__
NOINLINE
#endif
int inner_loop(uint8_t *yy, uint8_t ll_i)
{
	/* Walking pointer: the slot most recently overwritten */
	register uint8_t *slot = yy + 1;
	/* Value lifted out of *slot, still waiting to be written one slot further */
	register uint8_t carried = *slot;

	*slot = yy[0];
	for (;;) {
		register uint8_t displaced;

		if (carried == ll_i)
			break;
		/* Advance, then swap the carried value with the slot's content */
		slot++;
		displaced = carried;
		carried = *slot;
		*slot = displaced;
	}
	/* The matching value ends up at the front */
	yy[0] = carried;
	return slot - yy;
}
|
||||||
static NOINLINE
|
static NOINLINE
|
||||||
void generateMTFValues(EState* s)
|
void generateMTFValues(EState* s)
|
||||||
{
|
{
|
||||||
@ -165,7 +197,6 @@ void generateMTFValues(EState* s)
|
|||||||
int i;
|
int i;
|
||||||
int zPend;
|
int zPend;
|
||||||
int32_t wr;
|
int32_t wr;
|
||||||
int32_t EOB;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* After sorting (eg, here),
|
* After sorting (eg, here),
|
||||||
@ -189,15 +220,12 @@ void generateMTFValues(EState* s)
|
|||||||
* compressBlock().
|
* compressBlock().
|
||||||
*/
|
*/
|
||||||
uint32_t* ptr = s->ptr;
|
uint32_t* ptr = s->ptr;
|
||||||
uint8_t* block = s->block;
|
|
||||||
uint16_t* mtfv = s->mtfv;
|
|
||||||
|
|
||||||
makeMaps_e(s);
|
makeMaps_e(s);
|
||||||
EOB = s->nInUse+1;
|
|
||||||
|
|
||||||
wr = 0;
|
wr = 0;
|
||||||
zPend = 0;
|
zPend = 0;
|
||||||
for (i = 0; i <= EOB; i++)
|
for (i = 0; i <= s->nInUse+1; i++)
|
||||||
s->mtfFreq[i] = 0;
|
s->mtfFreq[i] = 0;
|
||||||
|
|
||||||
for (i = 0; i < s->nInUse; i++)
|
for (i = 0; i < s->nInUse; i++)
|
||||||
@ -211,7 +239,7 @@ void generateMTFValues(EState* s)
|
|||||||
j = ptr[i] - 1;
|
j = ptr[i] - 1;
|
||||||
if (j < 0)
|
if (j < 0)
|
||||||
j += s->nblock;
|
j += s->nblock;
|
||||||
ll_i = s->unseqToSeq[block[j]];
|
ll_i = s->unseqToSeq[s->block[j]];
|
||||||
AssertD(ll_i < s->nInUse, "generateMTFValues(2a)");
|
AssertD(ll_i < s->nInUse, "generateMTFValues(2a)");
|
||||||
|
|
||||||
if (yy[0] == ll_i) {
|
if (yy[0] == ll_i) {
|
||||||
@ -225,15 +253,15 @@ void generateMTFValues(EState* s)
|
|||||||
while (1) {
|
while (1) {
|
||||||
#if 0
|
#if 0
|
||||||
if (zPend & 1) {
|
if (zPend & 1) {
|
||||||
mtfv[wr] = BZ_RUNB; wr++;
|
s->mtfv[wr] = BZ_RUNB; wr++;
|
||||||
s->mtfFreq[BZ_RUNB]++;
|
s->mtfFreq[BZ_RUNB]++;
|
||||||
} else {
|
} else {
|
||||||
mtfv[wr] = BZ_RUNA; wr++;
|
s->mtfv[wr] = BZ_RUNA; wr++;
|
||||||
s->mtfFreq[BZ_RUNA]++;
|
s->mtfFreq[BZ_RUNA]++;
|
||||||
}
|
}
|
||||||
#else /* same as above, since BZ_RUNA is 0 and BZ_RUNB is 1 */
|
#else /* same as above, since BZ_RUNA is 0 and BZ_RUNB is 1 */
|
||||||
unsigned run = zPend & 1;
|
unsigned run = zPend & 1;
|
||||||
mtfv[wr] = run;
|
s->mtfv[wr] = run;
|
||||||
wr++;
|
wr++;
|
||||||
s->mtfFreq[run]++;
|
s->mtfFreq[run]++;
|
||||||
#endif
|
#endif
|
||||||
@ -247,36 +275,19 @@ void generateMTFValues(EState* s)
|
|||||||
goto end;
|
goto end;
|
||||||
zPend = 0;
|
zPend = 0;
|
||||||
}
|
}
|
||||||
{
|
j = inner_loop(yy, ll_i);
|
||||||
register uint8_t rtmp;
|
s->mtfv[wr] = j+1;
|
||||||
register uint8_t* ryy_j;
|
wr++;
|
||||||
register uint8_t rll_i;
|
s->mtfFreq[j+1]++;
|
||||||
rtmp = yy[1];
|
|
||||||
yy[1] = yy[0];
|
|
||||||
ryy_j = &(yy[1]);
|
|
||||||
rll_i = ll_i;
|
|
||||||
while (rll_i != rtmp) {
|
|
||||||
register uint8_t rtmp2;
|
|
||||||
ryy_j++;
|
|
||||||
rtmp2 = rtmp;
|
|
||||||
rtmp = *ryy_j;
|
|
||||||
*ryy_j = rtmp2;
|
|
||||||
}
|
|
||||||
yy[0] = rtmp;
|
|
||||||
j = ryy_j - &(yy[0]);
|
|
||||||
mtfv[wr] = j+1;
|
|
||||||
wr++;
|
|
||||||
s->mtfFreq[j+1]++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
i = -1;
|
i = -1;
|
||||||
if (zPend > 0)
|
if (zPend > 0)
|
||||||
goto process_zPend; /* "process it and come back here" */
|
goto process_zPend; /* "process it and come back here" */
|
||||||
end:
|
end:
|
||||||
mtfv[wr] = EOB;
|
s->mtfv[wr] = s->nInUse+1;
|
||||||
wr++;
|
wr++;
|
||||||
s->mtfFreq[EOB]++;
|
s->mtfFreq[s->nInUse+1]++;
|
||||||
|
|
||||||
s->nMTF = wr;
|
s->nMTF = wr;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user