ash: parser: Add syntax stack for recursive parsing

This closes 10821.

Upstream patch:

    From: Herbert Xu <herbert@gondor.apana.org.au>
    Date: Fri, 9 Mar 2018 00:14:02 +0800
    parser: Add syntax stack for recursive parsing

    Without a stack of syntaxes we cannot correctly these two cases
    together:

            "${a#'$$'}"
            "${a#"${b-'$$'}"}"

    A recursive parser also helps in some other corner cases such
    as nested arithmetic expansion with paratheses.

    This patch adds a syntax stack allocated from the stack using
    alloca.  As a side-effect this allows us to remove the naked
    backslashes for patterns within double-quotes, which means that
    EXP_QPAT also has to go.

    This patch also fixes removes any backslashes that precede right
    braces when they are present within a parameter expansion context,
    and backslashes that precede double quotes within inner double
    quotes inside a parameter expansion in a here-document context.

    The idea of a recursive parser is based on a patch by Harald van
    Dijk.

    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

var_bash3, var_bash4 and var_bash6 tests are updated
with the output given by bash-4.3.43

With this patch, the following tests now pass for ash:

    dollar_repl_slash_bash2.tests
    squote_in_varexp2.tests
    squote_in_varexp.tests
    var_bash4.tests

function                                             old     new   delta
readtoken1                                          2615    2874    +259
synstack_push                                          -      54     +54
evalvar                                              574     571      -3
rmescapes                                            330     310     -20
subevalvar                                          1279    1258     -21
argstr                                              1146    1107     -39
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 1/4 up/down: 313/-83)           Total: 230 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2018-04-02 12:35:04 +02:00
parent e84212f834
commit 216913c290
19 changed files with 168 additions and 121 deletions

View File

@ -5888,10 +5888,9 @@ static int substr_atoi(const char *s)
* performs globbing, and thus diverges from what we do).
*/
#define EXP_CASE 0x10 /* keeps quotes around for CASE pattern */
#define EXP_QPAT 0x20 /* pattern in quoted parameter expansion */
#define EXP_VARTILDE2 0x40 /* expand tildes after colons only */
#define EXP_WORD 0x80 /* expand word in parameter expansion */
#define EXP_QUOTED 0x100 /* expand word in double quotes */
#define EXP_VARTILDE2 0x20 /* expand tildes after colons only */
#define EXP_WORD 0x40 /* expand word in parameter expansion */
#define EXP_QUOTED 0x80 /* expand word in double quotes */
/*
* rmescape() flags
*/
@ -5901,7 +5900,7 @@ static int substr_atoi(const char *s)
#define RMESCAPE_HEAP 0x10 /* Malloc strings instead of stalloc */
/* Add CTLESC when necessary. */
#define QUOTES_ESC (EXP_FULL | EXP_CASE | EXP_QPAT)
#define QUOTES_ESC (EXP_FULL | EXP_CASE)
/* Do not skip NUL characters. */
#define QUOTES_KEEPNUL EXP_TILDE
@ -6090,7 +6089,6 @@ rmescapes(char *str, int flag, int *slash_position)
IF_BASH_PATTERN_SUBST('/',) CTLESC, CTLQUOTEMARK, '\0' };
char *p, *q, *r;
unsigned inquotes;
unsigned protect_against_glob;
unsigned globbing;
@ -6121,18 +6119,21 @@ rmescapes(char *str, int flag, int *slash_position)
}
}
inquotes = 0;
globbing = flag & RMESCAPE_GLOB;
protect_against_glob = globbing;
while (*p) {
if ((unsigned char)*p == CTLQUOTEMARK) {
// Note: both inquotes and protect_against_glob only affect whether
// Note: protect_against_glob only affect whether
// CTLESC,<ch> gets converted to <ch> or to \<ch>
inquotes = ~inquotes;
p++;
protect_against_glob = globbing;
continue;
}
if (*p == '\\') {
/* naked back slash */
protect_against_glob = 0;
goto copy;
}
if ((unsigned char)*p == CTLESC) {
p++;
#if DEBUG
@ -6168,10 +6169,6 @@ rmescapes(char *str, int flag, int *slash_position)
*q++ = '\\';
}
}
} else if (*p == '\\' && !inquotes) {
/* naked back slash */
protect_against_glob = 0;
goto copy;
}
#if BASH_PATTERN_SUBST
else if (slash_position && p == str + *slash_position) {
@ -6669,16 +6666,6 @@ argstr(char *p, int flags)
case CTLESC:
startloc++;
length++;
/*
* Quoted parameter expansion pattern: remove quote
* unless inside inner quotes or we have a literal
* backslash.
*/
if (((flags | inquotes) & (EXP_QPAT | EXP_QUOTED)) ==
EXP_QPAT && *p != '\\')
break;
goto addquote;
case CTLVAR:
TRACE(("argstr: evalvar('%s')\n", p));
@ -6869,15 +6856,24 @@ subevalvar(char *p, char *varname, int strloc, int subtype,
}
#endif
argstr_flags = EXP_TILDE;
if (subtype != VSASSIGN && subtype != VSQUESTION)
argstr_flags |= (flag & (EXP_QUOTED | EXP_QPAT) ? EXP_QPAT : EXP_CASE);
if (subtype != VSASSIGN
&& subtype != VSQUESTION
#if BASH_SUBSTR
&& subtype != VSSUBSTR
#endif
) {
/* EXP_CASE keeps CTLESC's */
argstr_flags = EXP_TILDE | EXP_CASE;
}
argstr(p, argstr_flags);
//bb_error_msg("str0:'%s'", (char *)stackblock() + strloc);
#if BASH_PATTERN_SUBST
slash_pos = -1;
if (repl) {
slash_pos = expdest - ((char *)stackblock() + strloc);
STPUTC('/', expdest);
argstr(repl + 1, argstr_flags);
//bb_error_msg("repl+1:'%s'", repl + 1);
argstr(repl + 1, EXP_TILDE); /* EXP_TILDE: echo "${v/x/~}" expands ~ ! */
*repl = '/';
}
#endif
@ -10669,6 +10665,34 @@ pgetc_eatbnl(void)
return c;
}
struct synstack {
smalluint syntax;
uint8_t innerdq :1;
uint8_t varpushed :1;
uint8_t dblquote :1;
int varnest; /* levels of variables expansion */
int dqvarnest; /* levels of variables expansion within double quotes */
int parenlevel; /* levels of parens in arithmetic */
struct synstack *prev;
struct synstack *next;
};
static void
synstack_push(struct synstack **stack, struct synstack *next, int syntax)
{
memset(next, 0, sizeof(*next));
next->syntax = syntax;
next->next = *stack;
(*stack)->prev = next;
*stack = next;
}
static ALWAYS_INLINE void
synstack_pop(struct synstack **stack)
{
*stack = (*stack)->next;
}
/*
* To handle the "." command, a stack of input files is used. Pushfile
* adds a new entry to the stack and popfile restores the previous level.
@ -11928,19 +11952,13 @@ readtoken1(int c, int syntax, char *eofmark, int striptabs)
size_t len;
struct nodelist *bqlist;
smallint quotef;
smallint dblquote;
smallint oldstyle;
IF_FEATURE_SH_MATH(smallint prevsyntax;) /* syntax before arithmetic */
smallint pssyntax; /* we are expanding a prompt string */
int varnest; /* levels of variables expansion */
IF_FEATURE_SH_MATH(int arinest;) /* levels of arithmetic expansion */
IF_FEATURE_SH_MATH(int parenlevel;) /* levels of parens in arithmetic */
int dqvarnest; /* levels of variables expansion within double quotes */
IF_BASH_DOLLAR_SQUOTE(smallint bash_dollar_squote = 0;)
/* syntax stack */
struct synstack synbase = { .syntax = syntax };
struct synstack *synstack = &synbase;
bqlist = NULL;
quotef = 0;
IF_FEATURE_SH_MATH(prevsyntax = 0;)
#if ENABLE_ASH_EXPAND_PRMT
pssyntax = (syntax == PSSYNTAX);
if (pssyntax)
@ -11948,11 +11966,10 @@ readtoken1(int c, int syntax, char *eofmark, int striptabs)
#else
pssyntax = 0; /* constant */
#endif
dblquote = (syntax == DQSYNTAX);
varnest = 0;
IF_FEATURE_SH_MATH(arinest = 0;)
IF_FEATURE_SH_MATH(parenlevel = 0;)
dqvarnest = 0;
if (syntax == DQSYNTAX)
synstack->dblquote = 1;
quotef = 0;
bqlist = NULL;
STARTSTACKSTR(out);
loop:
@ -11960,9 +11977,9 @@ readtoken1(int c, int syntax, char *eofmark, int striptabs)
CHECKEND(); /* set c to PEOF if at end of here document */
for (;;) { /* until end of line or end of word */
CHECKSTRSPACE(4, out); /* permit 4 calls to USTPUTC */
switch (SIT(c, syntax)) {
switch (SIT(c, synstack->syntax)) {
case CNL: /* '\n' */
if (syntax == BASESYNTAX)
if (synstack->syntax == BASESYNTAX)
goto endword; /* exit outer loop */
USTPUTC(c, out);
nlprompt();
@ -11982,13 +11999,13 @@ readtoken1(int c, int syntax, char *eofmark, int striptabs)
if (c & 0x100) {
/* Unknown escape. Encode as '\z' */
c = (unsigned char)c;
if (eofmark == NULL || dblquote)
if (eofmark == NULL || synstack->dblquote)
USTPUTC(CTLESC, out);
USTPUTC('\\', out);
}
}
#endif
if (eofmark == NULL || dblquote)
if (eofmark == NULL || synstack->dblquote)
USTPUTC(CTLESC, out);
USTPUTC(c, out);
break;
@ -12008,20 +12025,13 @@ readtoken1(int c, int syntax, char *eofmark, int striptabs)
/* Backslash is retained if we are in "str"
* and next char isn't dquote-special.
*/
if (dblquote
if (synstack->dblquote
&& c != '\\'
&& c != '`'
&& c != '$'
&& (c != '"' || eofmark != NULL)
&& (c != '"' || (eofmark != NULL && !synstack->varnest))
&& (c != '}' || !synstack->varnest)
) {
//dash survives not doing USTPUTC(CTLESC), but merely by chance:
//Example: "\z" gets encoded as "\<CTLESC>z".
//rmescapes() then emits "\", "\z", protecting z from globbing.
//But it's wrong, should protect _both_ from globbing:
//everything in double quotes is not globbed.
//Unlike dash, we have a fix in rmescapes() which emits bare "z"
//for "<CTLESC>z" since "z" is not glob-special (else unicode may break),
//and glob would see "\z" and eat "\". Thus:
USTPUTC(CTLESC, out); /* protect '\' from glob */
USTPUTC('\\', out);
}
@ -12031,56 +12041,62 @@ readtoken1(int c, int syntax, char *eofmark, int striptabs)
}
break;
case CSQUOTE:
syntax = SQSYNTAX;
synstack->syntax = SQSYNTAX;
quotemark:
if (eofmark == NULL) {
USTPUTC(CTLQUOTEMARK, out);
}
break;
case CDQUOTE:
syntax = DQSYNTAX;
dblquote = 1;
synstack->syntax = DQSYNTAX;
synstack->dblquote = 1;
toggledq:
if (synstack->varnest)
synstack->innerdq ^= 1;
goto quotemark;
case CENDQUOTE:
IF_BASH_DOLLAR_SQUOTE(bash_dollar_squote = 0;)
if (eofmark != NULL && varnest == 0) {
if (eofmark != NULL && synstack->varnest == 0) {
USTPUTC(c, out);
} else {
if (dqvarnest == 0) {
syntax = BASESYNTAX;
dblquote = 0;
}
quotef = 1;
goto quotemark;
}
break;
}
if (synstack->dqvarnest == 0) {
synstack->syntax = BASESYNTAX;
synstack->dblquote = 0;
}
quotef = 1;
if (c == '"')
goto toggledq;
goto quotemark;
case CVAR: /* '$' */
PARSESUB(); /* parse substitution */
break;
case CENDVAR: /* '}' */
if (varnest > 0) {
varnest--;
if (dqvarnest > 0) {
dqvarnest--;
}
if (!synstack->innerdq && synstack->varnest > 0) {
if (!--synstack->varnest && synstack->varpushed)
synstack_pop(&synstack);
else if (synstack->dqvarnest > 0)
synstack->dqvarnest--;
c = CTLENDVAR;
}
USTPUTC(c, out);
break;
#if ENABLE_FEATURE_SH_MATH
case CLP: /* '(' in arithmetic */
parenlevel++;
synstack->parenlevel++;
USTPUTC(c, out);
break;
case CRP: /* ')' in arithmetic */
if (parenlevel > 0) {
parenlevel--;
if (synstack->parenlevel > 0) {
synstack->parenlevel--;
} else {
if (pgetc_eatbnl() == ')') {
c = CTLENDARI;
if (--arinest == 0) {
syntax = prevsyntax;
}
synstack_pop(&synstack);
} else {
/*
* unbalanced parens
@ -12106,7 +12122,7 @@ readtoken1(int c, int syntax, char *eofmark, int striptabs)
case CIGN:
break;
default:
if (varnest == 0) {
if (synstack->varnest == 0) {
#if BASH_REDIR_OUTPUT
if (c == '&') {
//Can't call pgetc_eatbnl() here, this requires three-deep pungetc()
@ -12125,12 +12141,12 @@ readtoken1(int c, int syntax, char *eofmark, int striptabs)
endword:
#if ENABLE_FEATURE_SH_MATH
if (syntax == ARISYNTAX)
if (synstack->syntax == ARISYNTAX)
raise_error_syntax("missing '))'");
#endif
if (syntax != BASESYNTAX && eofmark == NULL)
if (synstack->syntax != BASESYNTAX && eofmark == NULL)
raise_error_syntax("unterminated quoted string");
if (varnest != 0) {
if (synstack->varnest != 0) {
/* { */
raise_error_syntax("missing '}'");
}
@ -12312,7 +12328,7 @@ parsesub: {
|| (c != '(' && c != '{' && !is_name(c) && !is_special(c))
) {
#if BASH_DOLLAR_SQUOTE
if (syntax != DQSYNTAX && c == '\'')
if (synstack->syntax != DQSYNTAX && c == '\'')
bash_dollar_squote = 1;
else
#endif
@ -12332,6 +12348,8 @@ parsesub: {
}
} else {
/* $VAR, $<specialchar>, ${...}, or PEOA/PEOF */
smalluint newsyn = synstack->syntax;
USTPUTC(CTLVAR, out);
typeloc = out - (char *)stackblock();
STADJUST(1, out);
@ -12390,6 +12408,8 @@ parsesub: {
static const char types[] ALIGN1 = "}-+?=";
/* ${VAR...} but not $VAR or ${#VAR} */
/* c == first char after VAR */
int cc = c;
switch (c) {
case ':':
c = pgetc_eatbnl();
@ -12414,21 +12434,24 @@ parsesub: {
break;
}
case '%':
case '#': {
int cc = c;
case '#':
subtype = (c == '#' ? VSTRIMLEFT : VSTRIMRIGHT);
c = pgetc_eatbnl();
if (c != cc)
goto badsub;
if (c == cc)
subtype++;
else
pungetc();
newsyn = BASESYNTAX;
break;
}
#if BASH_PATTERN_SUBST
case '/':
/* ${v/[/]pattern/repl} */
//TODO: encode pattern and repl separately.
// Currently ${v/$var_with_slash/repl} is horribly broken
// Currently cases like: v=1;echo ${v/$((1/1))/ONE}
// are broken (should print "ONE")
subtype = VSREPLACE;
newsyn = BASESYNTAX;
c = pgetc_eatbnl();
if (c != '/')
goto badsub;
@ -12440,11 +12463,24 @@ parsesub: {
badsub:
pungetc();
}
if (newsyn == ARISYNTAX && subtype > VSNORMAL)
newsyn = DQSYNTAX;
if (newsyn != synstack->syntax) {
synstack_push(&synstack,
synstack->prev ?: alloca(sizeof(*synstack)),
newsyn);
synstack->varpushed = 1;
synstack->dblquote = newsyn != BASESYNTAX;
}
((unsigned char *)stackblock())[typeloc] = subtype;
if (subtype != VSNORMAL) {
varnest++;
if (dblquote)
dqvarnest++;
synstack->varnest++;
if (synstack->dblquote)
synstack->dqvarnest++;
}
STPUTC('=', out);
}
@ -12501,7 +12537,7 @@ parsebackq: {
case '\\':
pc = pgetc(); /* or pgetc_eatbnl()? why (example)? */
if (pc != '\\' && pc != '`' && pc != '$'
&& (!dblquote || pc != '"')
&& (!synstack->dblquote || pc != '"')
) {
STPUTC('\\', pout);
}
@ -12576,10 +12612,11 @@ parsebackq: {
* Parse an arithmetic expansion (indicate start of one and set state)
*/
parsearith: {
if (++arinest == 1) {
prevsyntax = syntax;
syntax = ARISYNTAX;
}
synstack_push(&synstack,
synstack->prev ?: alloca(sizeof(*synstack)),
ARISYNTAX);
synstack->dblquote = 1;
USTPUTC(CTLARI, out);
goto parsearith_return;
}

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
echo $(( ( $((1)) ) ))

View File

@ -0,0 +1 @@
B

View File

@ -0,0 +1 @@
x=\'B; echo "${x#\'}"

View File

@ -1,6 +1,6 @@
1 a041#c
2 a041#c
3 a\041#c
3 a041#c
4 a\041#c
5 a\041#c
6 a\041#c
@ -17,4 +17,4 @@
17 a\tc
18 a\tc
19 atc
20 a\tc
20 atc

View File

@ -3,26 +3,26 @@ Replace str: _\\_\z_
Pattern: single backslash and star: "replace literal star"
Unquoted: a_\_z_b\*c
Unquoted =: a_\_z_b\*c
Quoted: a_\_\z_b\*c
Quoted =: a_\_\z_b\*c
Quoted: a_\_z_b\*c
Quoted =: a_\_z_b\*c
Pattern: double backslash and star: "replace backslash and everything after it"
Unquoted: a*b_\_z_
Unquoted =: a*b_\_z_
Quoted: a*b_\_\z_
Quoted =: a*b_\_\z_
Quoted: a*b_\_z_
Quoted =: a*b_\_z_
Source: a\bc
Replace str: _\\_\z_
Pattern: single backslash and b: "replace b"
Unquoted: a\_\_z_c
Unquoted =: a\_\_z_c
Quoted: a\_\_\z_c
Quoted =: a\_\_\z_c
Quoted: a\_\_z_c
Quoted =: a\_\_z_c
Pattern: double backslash and b: "replace backslash and b"
Unquoted: a_\_z_c
Unquoted =: a_\_z_c
Quoted: a_\_\z_c
Quoted =: a_\_\z_c
Quoted: a_\_z_c
Quoted =: a_\_z_c
Source: a\bc
Replace str: _\\_\z_ (as variable $s)

View File

@ -1,5 +1,5 @@
Expected Actual
a*z : a*z
\z : \z
z : z
a1z a2z: a1z a2z
z : z

View File

@ -3,7 +3,7 @@
>a1z; >a2z;
echo 'Expected' 'Actual'
v='a bz'; echo 'a*z :' "${v/a*z/a*z}"
v='a bz'; echo '\z :' "${v/a*z/\z}"
v='a bz'; echo 'z :' "${v/a*z/\z}"
v='a bz'; echo 'a1z a2z:' ${v/a*z/a*z}
v='a bz'; echo 'z :' ${v/a*z/\z}
rm a1z a2z

View File

@ -0,0 +1 @@
B

View File

@ -0,0 +1 @@
x=AB; echo "${x#$'\x41'}"

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
echo $(( ( $((1)) ) ))

View File

@ -0,0 +1 @@
B

View File

@ -0,0 +1 @@
x=\'B; echo "${x#\'}"

View File

@ -1,6 +1,6 @@
1 a041#c
2 a041#c
3 a\041#c
3 a041#c
4 a\041#c
5 a\041#c
6 a\041#c
@ -17,4 +17,4 @@
17 a\tc
18 a\tc
19 atc
20 a\tc
20 atc

View File

@ -3,26 +3,26 @@ Replace str: _\\_\z_
Pattern: single backslash and star: "replace literal star"
Unquoted: a_\_z_b\*c
Unquoted =: a_\_z_b\*c
Quoted: a_\_\z_b\*c
Quoted =: a_\_\z_b\*c
Quoted: a_\_z_b\*c
Quoted =: a_\_z_b\*c
Pattern: double backslash and star: "replace backslash and everything after it"
Unquoted: a*b_\_z_
Unquoted =: a*b_\_z_
Quoted: a*b_\_\z_
Quoted =: a*b_\_\z_
Quoted: a*b_\_z_
Quoted =: a*b_\_z_
Source: a\bc
Replace str: _\\_\z_
Pattern: single backslash and b: "replace b"
Unquoted: a\_\_z_c
Unquoted =: a\_\_z_c
Quoted: a\_\_\z_c
Quoted =: a\_\_\z_c
Quoted: a\_\_z_c
Quoted =: a\_\_z_c
Pattern: double backslash and b: "replace backslash and b"
Unquoted: a_\_z_c
Unquoted =: a_\_z_c
Quoted: a_\_\z_c
Quoted =: a_\_\z_c
Quoted: a_\_z_c
Quoted =: a_\_z_c
Source: a\bc
Replace str: _\\_\z_ (as variable $s)

View File

@ -1,5 +1,5 @@
Expected Actual
a*z : a*z
\z : \z
z : z
a1z a2z: a1z a2z
z : z

View File

@ -3,7 +3,7 @@
>a1z; >a2z;
echo 'Expected' 'Actual'
v='a bz'; echo 'a*z :' "${v/a*z/a*z}"
v='a bz'; echo '\z :' "${v/a*z/\z}"
v='a bz'; echo 'z :' "${v/a*z/\z}"
v='a bz'; echo 'a1z a2z:' ${v/a*z/a*z}
v='a bz'; echo 'z :' ${v/a*z/\z}
rm a1z a2z