grep: add an option to use GNU regex matching instead of the POSIX one.

This fixes problems with NUL bytes in the files being scanned, but
costs about 800 bytes of code size. The same can be done for sed (TODO).
This commit is contained in:
Denis Vlasenko 2008-08-09 16:15:14 +00:00
parent fb5902ca5c
commit 3fd15e197e
5 changed files with 155 additions and 36 deletions

View File

@ -21,6 +21,15 @@ config DESKTOP
Select this only if you plan to use busybox on full-blown Select this only if you plan to use busybox on full-blown
desktop machine with common Linux distro, not on an embedded box. desktop machine with common Linux distro, not on an embedded box.
config EXTRA_COMPAT
bool "Provide compatible behavior for rare corner cases (bigger code)"
default n
help
This option makes grep, sed, etc. handle rare corner cases
(such as embedded NUL bytes). This makes the code bigger and uses
some GNU extensions in libc. You probably only need this option
if you plan to run busybox on a desktop system.
config FEATURE_ASSUME_UNICODE config FEATURE_ASSUME_UNICODE
bool "Assume that 1:1 char/glyph correspondence is not true" bool "Assume that 1:1 char/glyph correspondence is not true"
default n default n

View File

@ -96,6 +96,7 @@ struct globals {
int lines_before; int lines_before;
int lines_after; int lines_after;
char **before_buf; char **before_buf;
USE_EXTRA_COMPAT(size_t *before_buf_size;)
int last_line_printed; int last_line_printed;
#endif #endif
/* globals used internally */ /* globals used internally */
@ -117,6 +118,7 @@ struct globals {
#define lines_before (G.lines_before ) #define lines_before (G.lines_before )
#define lines_after (G.lines_after ) #define lines_after (G.lines_after )
#define before_buf (G.before_buf ) #define before_buf (G.before_buf )
#define before_buf_size (G.before_buf_size )
#define last_line_printed (G.last_line_printed ) #define last_line_printed (G.last_line_printed )
#define pattern_head (G.pattern_head ) #define pattern_head (G.pattern_head )
#define cur_file (G.cur_file ) #define cur_file (G.cur_file )
@ -124,14 +126,24 @@ struct globals {
typedef struct grep_list_data_t { typedef struct grep_list_data_t {
char *pattern; char *pattern;
regex_t preg; /* for GNU regex, matched_range must be persistent across grep_file() calls */
#if !ENABLE_EXTRA_COMPAT
regex_t compiled_regex;
regmatch_t matched_range;
#else
struct re_pattern_buffer compiled_regex;
struct re_registers matched_range;
#endif
#define ALLOCATED 1 #define ALLOCATED 1
#define COMPILED 2 #define COMPILED 2
int flg_mem_alocated_compiled; int flg_mem_alocated_compiled;
} grep_list_data_t; } grep_list_data_t;
#if !ENABLE_EXTRA_COMPAT
static void print_line(const char *line, int linenum, char decoration) #define print_line(line, line_len, linenum, decoration) \
print_line(line, linenum, decoration)
#endif
static void print_line(const char *line, size_t line_len, int linenum, char decoration)
{ {
#if ENABLE_FEATURE_GREP_CONTEXT #if ENABLE_FEATURE_GREP_CONTEXT
/* Happens when we go to next file, immediately hit match /* Happens when we go to next file, immediately hit match
@ -139,8 +151,9 @@ static void print_line(const char *line, int linenum, char decoration)
if (linenum < 1) if (linenum < 1)
return; return;
/* possibly print the little '--' separator */ /* possibly print the little '--' separator */
if ((lines_before || lines_after) && did_print_line && if ((lines_before || lines_after) && did_print_line
last_line_printed != linenum - 1) { && last_line_printed != linenum - 1
) {
puts("--"); puts("--");
} }
/* guard against printing "--" before first line of first file */ /* guard against printing "--" before first line of first file */
@ -152,17 +165,50 @@ static void print_line(const char *line, int linenum, char decoration)
if (PRINT_LINE_NUM) if (PRINT_LINE_NUM)
printf("%i%c", linenum, decoration); printf("%i%c", linenum, decoration);
/* Emulate weird GNU grep behavior with -ov */ /* Emulate weird GNU grep behavior with -ov */
if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) {
#if !ENABLE_EXTRA_COMPAT
puts(line); puts(line);
#else
fwrite(line, 1, line_len, stdout);
putchar('\n');
#endif
}
} }
#if ENABLE_EXTRA_COMPAT
/*
 * Read one line from 'file' with getline() into the caller-owned buffer
 * described by *line_ptr / *line_alloc_len (getline() may reallocate it).
 * Unlike getline, this one removes trailing '\n'.
 *
 * Returns the number of bytes in the line, not counting the stripped '\n'.
 * The length is returned explicitly so that lines containing embedded NUL
 * bytes can be handled by the caller.
 * On EOF or error the result is <= 0 (getline() itself returns -1); in that
 * case the buffer has been freed and *line_ptr is reset to NULL, so the
 * caller must not free or reuse the old pointer.
 */
static ssize_t FAST_FUNC bb_getline(char **line_ptr, size_t *line_alloc_len, FILE *file)
{
	ssize_t res_sz;
	char *line;

	res_sz = getline(line_ptr, line_alloc_len, file);
	line = *line_ptr;
	if (res_sz > 0) {
		/* Strip the line terminator, if the last line had one */
		if (line[res_sz - 1] == '\n')
			line[--res_sz] = '\0';
	} else {
		/* getline() may allocate a buffer even on EOF (uclibc does) */
		free(line);
		/* Do not leave a dangling pointer in the caller's variables */
		*line_ptr = NULL;
		*line_alloc_len = 0;
	}
	return res_sz;
}
#endif
static int grep_file(FILE *file) static int grep_file(FILE *file)
{ {
char *line;
smalluint found; smalluint found;
int linenum = 0; int linenum = 0;
int nmatches = 0; int nmatches = 0;
regmatch_t regmatch; #if !ENABLE_EXTRA_COMPAT
char *line;
#else
char *line = NULL;
ssize_t line_len;
size_t line_alloc_len;
#define rm_so start[0]
#define rm_eo end[0]
#endif
#if ENABLE_FEATURE_GREP_CONTEXT #if ENABLE_FEATURE_GREP_CONTEXT
int print_n_lines_after = 0; int print_n_lines_after = 0;
int curpos = 0; /* track where we are in the circular 'before' buffer */ int curpos = 0; /* track where we are in the circular 'before' buffer */
@ -171,7 +217,13 @@ static int grep_file(FILE *file)
enum { print_n_lines_after = 0 }; enum { print_n_lines_after = 0 };
#endif /* ENABLE_FEATURE_GREP_CONTEXT */ #endif /* ENABLE_FEATURE_GREP_CONTEXT */
while ((line = xmalloc_fgetline(file)) != NULL) { while (
#if !ENABLE_EXTRA_COMPAT
(line = xmalloc_fgetline(file)) != NULL
#else
(line_len = bb_getline(&line, &line_alloc_len, file)) >= 0
#endif
) {
llist_t *pattern_ptr = pattern_head; llist_t *pattern_ptr = pattern_head;
grep_list_data_t *gl = gl; /* for gcc */ grep_list_data_t *gl = gl; /* for gcc */
@ -184,19 +236,35 @@ static int grep_file(FILE *file)
} else { } else {
if (!(gl->flg_mem_alocated_compiled & COMPILED)) { if (!(gl->flg_mem_alocated_compiled & COMPILED)) {
gl->flg_mem_alocated_compiled |= COMPILED; gl->flg_mem_alocated_compiled |= COMPILED;
xregcomp(&(gl->preg), gl->pattern, reflags); #if !ENABLE_EXTRA_COMPAT
xregcomp(&gl->compiled_regex, gl->pattern, reflags);
#else
memset(&gl->compiled_regex, 0, sizeof(gl->compiled_regex));
if (re_compile_pattern(gl->pattern, strlen(gl->pattern), &gl->compiled_regex))
bb_error_msg_and_die("bad regex '%s'", gl->pattern);
#endif
} }
regmatch.rm_so = 0; #if !ENABLE_EXTRA_COMPAT
regmatch.rm_eo = 0; gl->matched_range.rm_so = 0;
if (regexec(&(gl->preg), line, 1, &regmatch, 0) == 0) { gl->matched_range.rm_eo = 0;
#endif
if (
#if !ENABLE_EXTRA_COMPAT
regexec(&gl->compiled_regex, line, 1, &gl->matched_range, 0) == 0
#else
re_search(&gl->compiled_regex, line, line_len,
/*start:*/ 0, /*range:*/ line_len,
&gl->matched_range) >= 0
#endif
) {
if (!(option_mask32 & OPT_w)) if (!(option_mask32 & OPT_w))
found = 1; found = 1;
else { else {
char c = ' '; char c = ' ';
if (regmatch.rm_so) if (gl->matched_range.rm_so)
c = line[regmatch.rm_so - 1]; c = line[gl->matched_range.rm_so - 1];
if (!isalnum(c) && c != '_') { if (!isalnum(c) && c != '_') {
c = line[regmatch.rm_eo]; c = line[gl->matched_range.rm_eo];
if (!c || (!isalnum(c) && c != '_')) if (!c || (!isalnum(c) && c != '_'))
found = 1; found = 1;
} }
@ -261,7 +329,7 @@ static int grep_file(FILE *file)
/* now print each line in the buffer, clearing them as we go */ /* now print each line in the buffer, clearing them as we go */
while (before_buf[idx] != NULL) { while (before_buf[idx] != NULL) {
print_line(before_buf[idx], first_buf_entry_line_num, '-'); print_line(before_buf[idx], before_buf_size[idx], first_buf_entry_line_num, '-');
free(before_buf[idx]); free(before_buf[idx]);
before_buf[idx] = NULL; before_buf[idx] = NULL;
idx = (idx + 1) % lines_before; idx = (idx + 1) % lines_before;
@ -277,13 +345,15 @@ static int grep_file(FILE *file)
/* -Fo just prints the pattern /* -Fo just prints the pattern
* (unless -v: -Fov doesnt print anything at all) */ * (unless -v: -Fov doesnt print anything at all) */
if (found) if (found)
print_line(gl->pattern, linenum, ':'); print_line(gl->pattern, strlen(gl->pattern), linenum, ':');
} else { } else {
line[regmatch.rm_eo] = '\0'; line[gl->matched_range.rm_eo] = '\0';
print_line(line + regmatch.rm_so, linenum, ':'); print_line(line + gl->matched_range.rm_so,
gl->matched_range.rm_eo - gl->matched_range.rm_so,
linenum, ':');
} }
} else { } else {
print_line(line, linenum, ':'); print_line(line, line_len, linenum, ':');
} }
} }
} }
@ -291,12 +361,13 @@ static int grep_file(FILE *file)
else { /* no match */ else { /* no match */
/* if we need to print some context lines after the last match, do so */ /* if we need to print some context lines after the last match, do so */
if (print_n_lines_after) { if (print_n_lines_after) {
print_line(line, linenum, '-'); print_line(line, strlen(line), linenum, '-');
print_n_lines_after--; print_n_lines_after--;
} else if (lines_before) { } else if (lines_before) {
/* Add the line to the circular 'before' buffer */ /* Add the line to the circular 'before' buffer */
free(before_buf[curpos]); free(before_buf[curpos]);
before_buf[curpos] = line; before_buf[curpos] = line;
USE_EXTRA_COMPAT(before_buf_size[curpos] = line_len;)
curpos = (curpos + 1) % lines_before; curpos = (curpos + 1) % lines_before;
/* avoid free(line) - we took the line */ /* avoid free(line) - we took the line */
line = NULL; line = NULL;
@ -304,13 +375,14 @@ static int grep_file(FILE *file)
} }
#endif /* ENABLE_FEATURE_GREP_CONTEXT */ #endif /* ENABLE_FEATURE_GREP_CONTEXT */
#if !ENABLE_EXTRA_COMPAT
free(line); free(line);
#endif
/* Did we print all context after last requested match? */ /* Did we print all context after last requested match? */
if ((option_mask32 & OPT_m) if ((option_mask32 & OPT_m)
&& !print_n_lines_after && nmatches == max_matches) && !print_n_lines_after && nmatches == max_matches)
break; break;
} } /* while (read line) */
/* special-case file post-processing for options where we don't print line /* special-case file post-processing for options where we don't print line
* matches, just filenames and possibly match counts */ * matches, just filenames and possibly match counts */
@ -428,15 +500,16 @@ int grep_main(int argc, char **argv)
lines_after = Copt; lines_after = Copt;
if (!(option_mask32 & OPT_B)) /* not overridden */ if (!(option_mask32 & OPT_B)) /* not overridden */
lines_before = Copt; lines_before = Copt;
//option_mask32 |= OPT_A|OPT_B; /* for parser */
} }
/* sanity checks */ /* sanity checks */
if (option_mask32 & (OPT_c|OPT_q|OPT_l|OPT_L)) { if (option_mask32 & (OPT_c|OPT_q|OPT_l|OPT_L)) {
option_mask32 &= ~OPT_n; option_mask32 &= ~OPT_n;
lines_before = 0; lines_before = 0;
lines_after = 0; lines_after = 0;
} else if (lines_before > 0) } else if (lines_before > 0) {
before_buf = xzalloc(lines_before * sizeof(char *)); before_buf = xzalloc(lines_before * sizeof(before_buf[0]));
USE_EXTRA_COMPAT(before_buf_size = xzalloc(lines_before * sizeof(before_buf_size[0]));)
}
#else #else
/* with auto sanity checks */ /* with auto sanity checks */
/* -H unsets -h; -c,-q or -l unset -n; -e,-f are lists; -m N */ /* -H unsets -h; -c,-q or -l unset -n; -e,-f are lists; -m N */
@ -537,7 +610,7 @@ int grep_main(int argc, char **argv)
if (gl->flg_mem_alocated_compiled & ALLOCATED) if (gl->flg_mem_alocated_compiled & ALLOCATED)
free(gl->pattern); free(gl->pattern);
if (gl->flg_mem_alocated_compiled & COMPILED) if (gl->flg_mem_alocated_compiled & COMPILED)
regfree(&(gl->preg)); regfree(&gl->compiled_regex);
free(gl); free(gl);
free(pattern_head_ptr); free(pattern_head_ptr);
} }

View File

@ -9,6 +9,10 @@
* Licensed under GPLv2 or later, see file LICENSE in this tarball for details. * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
*/ */
/* for getline() [GNUism] */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include "libbb.h" #include "libbb.h"
/* This function reads an entire line from a text file, up to a newline /* This function reads an entire line from a text file, up to a newline
@ -55,7 +59,6 @@ char* FAST_FUNC xmalloc_fgets(FILE *file)
return bb_get_chunk_from_file(file, &i); return bb_get_chunk_from_file(file, &i);
} }
/* Get line. Remove trailing \n */ /* Get line. Remove trailing \n */
char* FAST_FUNC xmalloc_fgetline(FILE *file) char* FAST_FUNC xmalloc_fgetline(FILE *file)
{ {
@ -68,6 +71,44 @@ char* FAST_FUNC xmalloc_fgetline(FILE *file)
return c; return c;
} }
#if 0
/* GNUism getline() should be faster (not tested) than a loop with fgetc */
/*
 * Get line, including trailing \n if any.
 * Returns a malloc'ed buffer which the caller must free,
 * or NULL if getline() reported EOF or an error.
 */
char* FAST_FUNC xmalloc_fgets(FILE *file)
{
	char *res_buf = NULL;
	size_t res_sz = 0; /* NULL buffer + zero size: getline() allocates */

	if (getline(&res_buf, &res_sz, file) == -1) {
		/* getline() may allocate a buffer even on EOF (uclibc does) */
		free(res_buf);
		res_buf = NULL;
	}
//TODO: trimming to res_sz?
	return res_buf;
}
/*
 * Get line. Remove trailing \n.
 * Returns a malloc'ed buffer which the caller must free,
 * or NULL if getline() reported EOF or an error.
 */
char* FAST_FUNC xmalloc_fgetline(FILE *file)
{
	char *res_buf = NULL;
	size_t alloc_sz = 0; /* NULL buffer + zero size: getline() allocates */
	ssize_t res_sz;

	/* Keep the allocated capacity and the signed line length separate */
	res_sz = getline(&res_buf, &alloc_sz, file);
	if (res_sz != -1) {
		if (res_sz > 0 && res_buf[res_sz - 1] == '\n')
			res_buf[--res_sz] = '\0';
//TODO: trimming to res_sz?
	} else {
		/* getline() may allocate a buffer even on EOF (uclibc does) */
		free(res_buf);
		res_buf = NULL;
	}
	return res_buf;
}
#endif
#if 0 #if 0
/* Faster routines (~twice as fast). +170 bytes. Unused as of 2008-07. /* Faster routines (~twice as fast). +170 bytes. Unused as of 2008-07.
* *

View File

@ -27,6 +27,6 @@ void FAST_FUNC xregcomp(regex_t *preg, const char *regex, int cflags)
{ {
char *errmsg = regcomp_or_errmsg(preg, regex, cflags); char *errmsg = regcomp_or_errmsg(preg, regex, cflags);
if (errmsg) { if (errmsg) {
bb_error_msg_and_die("xregcomp: %s", errmsg); bb_error_msg_and_die("bad regex '%s': %s", regex, errmsg);
} }
} }

View File

@ -62,12 +62,8 @@ testing "grep -s nofile - (stdin and nonexisting file, match)" \
"grep -s domatch nonexistent - ; echo \$?" \ "grep -s domatch nonexistent - ; echo \$?" \
"(standard input):domatch\n2\n" "" "nomatch\ndomatch\nend\n" "(standard input):domatch\n2\n" "" "nomatch\ndomatch\nend\n"
# This doesn't match GNU behaviour (Binary file input matches) testing "grep handles NUL in files" "grep -a foo input" "\0foo\n" "\0foo\n\n" ""
# acts like GNU grep -a testing "grep handles NUL on stdin" "grep -a foo" "\0foo\n" "" "\0foo\n\n"
testing "grep handles binary files" "grep foo input" "foo\n" "\0foo\n\n" ""
# This doesn't match GNU behaviour (Binary file (standard input) matches)
# acts like GNU grep -a
testing "grep handles binary stdin" "grep foo" "foo\n" "" "\0foo\n\n"
testing "grep matches NUL" "grep . input > /dev/null 2>&1 ; echo \$?" \ testing "grep matches NUL" "grep . input > /dev/null 2>&1 ; echo \$?" \
"0\n" "\0\n" "" "0\n" "\0\n" ""