grep: option to use GNU regex matching instead of POSIX one.
This fixes problems with NULs in files being scanned, but costs +800 bytes. The same can be done to sed (TODO).
This commit is contained in:
		@@ -21,6 +21,15 @@ config DESKTOP
 | 
			
		||||
	  Select this only if you plan to use busybox on full-blown
 | 
			
		||||
	  desktop machine with common Linux distro, not on an embedded box.
 | 
			
		||||
 | 
			
		||||
config EXTRA_COMPAT
 | 
			
		||||
	bool "Provide compatible behavior for rare corner cases (bigger code)"
 | 
			
		||||
	default n
 | 
			
		||||
	help
 | 
			
		||||
	  This option makes grep, sed etc handle rare corner cases
 | 
			
		||||
	  (embedded NUL bytes and such). This makes code bigger and uses
 | 
			
		||||
	  some GNU extensions in libc. You probably only need this option
 | 
			
		||||
	  if you plan to run busybox on desktop.
 | 
			
		||||
 | 
			
		||||
config FEATURE_ASSUME_UNICODE
 | 
			
		||||
	bool "Assume that 1:1 char/glyph correspondence is not true"
 | 
			
		||||
	default n
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										129
									
								
								findutils/grep.c
									
									
									
									
									
								
							
							
						
						
									
										129
									
								
								findutils/grep.c
									
									
									
									
									
								
							@@ -96,6 +96,7 @@ struct globals {
 | 
			
		||||
	int lines_before;
 | 
			
		||||
	int lines_after;
 | 
			
		||||
	char **before_buf;
 | 
			
		||||
	USE_EXTRA_COMPAT(size_t *before_buf_size;)
 | 
			
		||||
	int last_line_printed;
 | 
			
		||||
#endif
 | 
			
		||||
	/* globals used internally */
 | 
			
		||||
@@ -117,6 +118,7 @@ struct globals {
 | 
			
		||||
#define lines_before      (G.lines_before        )
 | 
			
		||||
#define lines_after       (G.lines_after         )
 | 
			
		||||
#define before_buf        (G.before_buf          )
 | 
			
		||||
#define before_buf_size   (G.before_buf_size     )
 | 
			
		||||
#define last_line_printed (G.last_line_printed   )
 | 
			
		||||
#define pattern_head      (G.pattern_head        )
 | 
			
		||||
#define cur_file          (G.cur_file            )
 | 
			
		||||
@@ -124,14 +126,24 @@ struct globals {
 | 
			
		||||
 | 
			
		||||
typedef struct grep_list_data_t {
 | 
			
		||||
	char *pattern;
 | 
			
		||||
	regex_t preg;
 | 
			
		||||
/* for GNU regex, matched_range must be persistent across grep_file() calls */
 | 
			
		||||
#if !ENABLE_EXTRA_COMPAT
 | 
			
		||||
	regex_t compiled_regex;
 | 
			
		||||
	regmatch_t matched_range;
 | 
			
		||||
#else
 | 
			
		||||
	struct re_pattern_buffer compiled_regex;
 | 
			
		||||
	struct re_registers matched_range;
 | 
			
		||||
#endif
 | 
			
		||||
#define ALLOCATED 1
 | 
			
		||||
#define COMPILED 2
 | 
			
		||||
	int flg_mem_alocated_compiled;
 | 
			
		||||
} grep_list_data_t;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
static void print_line(const char *line, int linenum, char decoration)
 | 
			
		||||
#if !ENABLE_EXTRA_COMPAT
 | 
			
		||||
#define print_line(line, line_len, linenum, decoration) \
 | 
			
		||||
	print_line(line, linenum, decoration)
 | 
			
		||||
#endif
 | 
			
		||||
static void print_line(const char *line, size_t line_len, int linenum, char decoration)
 | 
			
		||||
{
 | 
			
		||||
#if ENABLE_FEATURE_GREP_CONTEXT
 | 
			
		||||
	/* Happens when we go to next file, immediately hit match
 | 
			
		||||
@@ -139,8 +151,9 @@ static void print_line(const char *line, int linenum, char decoration)
 | 
			
		||||
	if (linenum < 1)
 | 
			
		||||
		return;
 | 
			
		||||
	/* possibly print the little '--' separator */
 | 
			
		||||
	if ((lines_before || lines_after) && did_print_line &&
 | 
			
		||||
			last_line_printed != linenum - 1) {
 | 
			
		||||
	if ((lines_before || lines_after) && did_print_line
 | 
			
		||||
	 && last_line_printed != linenum - 1
 | 
			
		||||
	) {
 | 
			
		||||
		puts("--");
 | 
			
		||||
	}
 | 
			
		||||
	/* guard against printing "--" before first line of first file */
 | 
			
		||||
@@ -152,17 +165,50 @@ static void print_line(const char *line, int linenum, char decoration)
 | 
			
		||||
	if (PRINT_LINE_NUM)
 | 
			
		||||
		printf("%i%c", linenum, decoration);
 | 
			
		||||
	/* Emulate weird GNU grep behavior with -ov */
 | 
			
		||||
	if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o))
 | 
			
		||||
	if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) {
 | 
			
		||||
#if !ENABLE_EXTRA_COMPAT
 | 
			
		||||
		puts(line);
 | 
			
		||||
#else
 | 
			
		||||
		fwrite(line, 1, line_len, stdout);
 | 
			
		||||
		putchar('\n');
 | 
			
		||||
#endif
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#if ENABLE_EXTRA_COMPAT
 | 
			
		||||
/* Unlike getline, this one removes trailing '\n' */
 | 
			
		||||
static ssize_t FAST_FUNC bb_getline(char **line_ptr, size_t *line_alloc_len, FILE *file)
 | 
			
		||||
{
 | 
			
		||||
	ssize_t res_sz;
 | 
			
		||||
	char *line;
 | 
			
		||||
 | 
			
		||||
	res_sz = getline(line_ptr, line_alloc_len, file);
 | 
			
		||||
	line = *line_ptr;
 | 
			
		||||
 | 
			
		||||
	if (res_sz > 0) {
 | 
			
		||||
		if (line[res_sz - 1] == '\n')
 | 
			
		||||
			line[--res_sz] = '\0';
 | 
			
		||||
	} else {
 | 
			
		||||
		free(line); /* uclibc allocates a buffer even on EOF. WTF? */
 | 
			
		||||
	}
 | 
			
		||||
	return res_sz;
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
static int grep_file(FILE *file)
 | 
			
		||||
{
 | 
			
		||||
	char *line;
 | 
			
		||||
	smalluint found;
 | 
			
		||||
	int linenum = 0;
 | 
			
		||||
	int nmatches = 0;
 | 
			
		||||
	regmatch_t regmatch;
 | 
			
		||||
#if !ENABLE_EXTRA_COMPAT
 | 
			
		||||
	char *line;
 | 
			
		||||
#else
 | 
			
		||||
	char *line = NULL;
 | 
			
		||||
	ssize_t line_len;
 | 
			
		||||
	size_t line_alloc_len;
 | 
			
		||||
#define rm_so start[0]
 | 
			
		||||
#define rm_eo end[0]
 | 
			
		||||
#endif
 | 
			
		||||
#if ENABLE_FEATURE_GREP_CONTEXT
 | 
			
		||||
	int print_n_lines_after = 0;
 | 
			
		||||
	int curpos = 0; /* track where we are in the circular 'before' buffer */
 | 
			
		||||
@@ -171,7 +217,13 @@ static int grep_file(FILE *file)
 | 
			
		||||
	enum { print_n_lines_after = 0 };
 | 
			
		||||
#endif /* ENABLE_FEATURE_GREP_CONTEXT */
 | 
			
		||||
 | 
			
		||||
	while ((line = xmalloc_fgetline(file)) != NULL) {
 | 
			
		||||
	while (
 | 
			
		||||
#if !ENABLE_EXTRA_COMPAT
 | 
			
		||||
		(line = xmalloc_fgetline(file)) != NULL
 | 
			
		||||
#else
 | 
			
		||||
		(line_len = bb_getline(&line, &line_alloc_len, file)) >= 0
 | 
			
		||||
#endif
 | 
			
		||||
	) {
 | 
			
		||||
		llist_t *pattern_ptr = pattern_head;
 | 
			
		||||
		grep_list_data_t *gl = gl; /* for gcc */
 | 
			
		||||
 | 
			
		||||
@@ -184,19 +236,35 @@ static int grep_file(FILE *file)
 | 
			
		||||
			} else {
 | 
			
		||||
				if (!(gl->flg_mem_alocated_compiled & COMPILED)) {
 | 
			
		||||
					gl->flg_mem_alocated_compiled |= COMPILED;
 | 
			
		||||
					xregcomp(&(gl->preg), gl->pattern, reflags);
 | 
			
		||||
#if !ENABLE_EXTRA_COMPAT
 | 
			
		||||
					xregcomp(&gl->compiled_regex, gl->pattern, reflags);
 | 
			
		||||
#else
 | 
			
		||||
					memset(&gl->compiled_regex, 0, sizeof(gl->compiled_regex));
 | 
			
		||||
					if (re_compile_pattern(gl->pattern, strlen(gl->pattern), &gl->compiled_regex))
 | 
			
		||||
						bb_error_msg_and_die("bad regex '%s'", gl->pattern);
 | 
			
		||||
#endif
 | 
			
		||||
				}
 | 
			
		||||
				regmatch.rm_so = 0;
 | 
			
		||||
				regmatch.rm_eo = 0;
 | 
			
		||||
				if (regexec(&(gl->preg), line, 1, ®match, 0) == 0) {
 | 
			
		||||
#if !ENABLE_EXTRA_COMPAT
 | 
			
		||||
				gl->matched_range.rm_so = 0;
 | 
			
		||||
				gl->matched_range.rm_eo = 0;
 | 
			
		||||
#endif
 | 
			
		||||
				if (
 | 
			
		||||
#if !ENABLE_EXTRA_COMPAT
 | 
			
		||||
					regexec(&gl->compiled_regex, line, 1, &gl->matched_range, 0) == 0
 | 
			
		||||
#else
 | 
			
		||||
					re_search(&gl->compiled_regex, line, line_len,
 | 
			
		||||
							/*start:*/ 0, /*range:*/ line_len,
 | 
			
		||||
							&gl->matched_range) >= 0
 | 
			
		||||
#endif
 | 
			
		||||
				) {
 | 
			
		||||
					if (!(option_mask32 & OPT_w))
 | 
			
		||||
						found = 1;
 | 
			
		||||
					else {
 | 
			
		||||
						char c = ' ';
 | 
			
		||||
						if (regmatch.rm_so)
 | 
			
		||||
							c = line[regmatch.rm_so - 1];
 | 
			
		||||
						if (gl->matched_range.rm_so)
 | 
			
		||||
							c = line[gl->matched_range.rm_so - 1];
 | 
			
		||||
						if (!isalnum(c) && c != '_') {
 | 
			
		||||
							c = line[regmatch.rm_eo];
 | 
			
		||||
							c = line[gl->matched_range.rm_eo];
 | 
			
		||||
							if (!c || (!isalnum(c) && c != '_'))
 | 
			
		||||
								found = 1;
 | 
			
		||||
						}
 | 
			
		||||
@@ -261,7 +329,7 @@ static int grep_file(FILE *file)
 | 
			
		||||
 | 
			
		||||
					/* now print each line in the buffer, clearing them as we go */
 | 
			
		||||
					while (before_buf[idx] != NULL) {
 | 
			
		||||
						print_line(before_buf[idx], first_buf_entry_line_num, '-');
 | 
			
		||||
						print_line(before_buf[idx], before_buf_size[idx], first_buf_entry_line_num, '-');
 | 
			
		||||
						free(before_buf[idx]);
 | 
			
		||||
						before_buf[idx] = NULL;
 | 
			
		||||
						idx = (idx + 1) % lines_before;
 | 
			
		||||
@@ -277,13 +345,15 @@ static int grep_file(FILE *file)
 | 
			
		||||
						/* -Fo just prints the pattern
 | 
			
		||||
						 * (unless -v: -Fov doesnt print anything at all) */
 | 
			
		||||
						if (found)
 | 
			
		||||
							print_line(gl->pattern, linenum, ':');
 | 
			
		||||
							print_line(gl->pattern, strlen(gl->pattern), linenum, ':');
 | 
			
		||||
					} else {
 | 
			
		||||
						line[regmatch.rm_eo] = '\0';
 | 
			
		||||
						print_line(line + regmatch.rm_so, linenum, ':');
 | 
			
		||||
						line[gl->matched_range.rm_eo] = '\0';
 | 
			
		||||
						print_line(line + gl->matched_range.rm_so,
 | 
			
		||||
								gl->matched_range.rm_eo - gl->matched_range.rm_so,
 | 
			
		||||
								linenum, ':');
 | 
			
		||||
					}
 | 
			
		||||
				} else {
 | 
			
		||||
					print_line(line, linenum, ':');
 | 
			
		||||
					print_line(line, line_len, linenum, ':');
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
@@ -291,12 +361,13 @@ static int grep_file(FILE *file)
 | 
			
		||||
		else { /* no match */
 | 
			
		||||
			/* if we need to print some context lines after the last match, do so */
 | 
			
		||||
			if (print_n_lines_after) {
 | 
			
		||||
				print_line(line, linenum, '-');
 | 
			
		||||
				print_line(line, strlen(line), linenum, '-');
 | 
			
		||||
				print_n_lines_after--;
 | 
			
		||||
			} else if (lines_before) {
 | 
			
		||||
				/* Add the line to the circular 'before' buffer */
 | 
			
		||||
				free(before_buf[curpos]);
 | 
			
		||||
				before_buf[curpos] = line;
 | 
			
		||||
				USE_EXTRA_COMPAT(before_buf_size[curpos] = line_len;)
 | 
			
		||||
				curpos = (curpos + 1) % lines_before;
 | 
			
		||||
				/* avoid free(line) - we took the line */
 | 
			
		||||
				line = NULL;
 | 
			
		||||
@@ -304,13 +375,14 @@ static int grep_file(FILE *file)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
#endif /* ENABLE_FEATURE_GREP_CONTEXT */
 | 
			
		||||
#if !ENABLE_EXTRA_COMPAT
 | 
			
		||||
		free(line);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
		/* Did we print all context after last requested match? */
 | 
			
		||||
		if ((option_mask32 & OPT_m)
 | 
			
		||||
		 && !print_n_lines_after && nmatches == max_matches)
 | 
			
		||||
			break;
 | 
			
		||||
	}
 | 
			
		||||
	} /* while (read line) */
 | 
			
		||||
 | 
			
		||||
	/* special-case file post-processing for options where we don't print line
 | 
			
		||||
	 * matches, just filenames and possibly match counts */
 | 
			
		||||
@@ -428,15 +500,16 @@ int grep_main(int argc, char **argv)
 | 
			
		||||
			lines_after = Copt;
 | 
			
		||||
		if (!(option_mask32 & OPT_B)) /* not overridden */
 | 
			
		||||
			lines_before = Copt;
 | 
			
		||||
		//option_mask32 |= OPT_A|OPT_B; /* for parser */
 | 
			
		||||
	}
 | 
			
		||||
	/* sanity checks */
 | 
			
		||||
	if (option_mask32 & (OPT_c|OPT_q|OPT_l|OPT_L)) {
 | 
			
		||||
		option_mask32 &= ~OPT_n;
 | 
			
		||||
		lines_before = 0;
 | 
			
		||||
		lines_after = 0;
 | 
			
		||||
	} else if (lines_before > 0)
 | 
			
		||||
		before_buf = xzalloc(lines_before * sizeof(char *));
 | 
			
		||||
	} else if (lines_before > 0) {
 | 
			
		||||
		before_buf = xzalloc(lines_before * sizeof(before_buf[0]));
 | 
			
		||||
		USE_EXTRA_COMPAT(before_buf_size = xzalloc(lines_before * sizeof(before_buf_size[0]));)
 | 
			
		||||
	}
 | 
			
		||||
#else
 | 
			
		||||
	/* with auto sanity checks */
 | 
			
		||||
	/* -H unsets -h; -c,-q or -l unset -n; -e,-f are lists; -m N */
 | 
			
		||||
@@ -537,7 +610,7 @@ int grep_main(int argc, char **argv)
 | 
			
		||||
			if (gl->flg_mem_alocated_compiled & ALLOCATED)
 | 
			
		||||
				free(gl->pattern);
 | 
			
		||||
			if (gl->flg_mem_alocated_compiled & COMPILED)
 | 
			
		||||
				regfree(&(gl->preg));
 | 
			
		||||
				regfree(&gl->compiled_regex);
 | 
			
		||||
			free(gl);
 | 
			
		||||
			free(pattern_head_ptr);
 | 
			
		||||
		}
 | 
			
		||||
 
 | 
			
		||||
@@ -9,6 +9,10 @@
 | 
			
		||||
 * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
/* for getline() [GNUism] */
 | 
			
		||||
#ifndef _GNU_SOURCE
 | 
			
		||||
#define _GNU_SOURCE 1
 | 
			
		||||
#endif
 | 
			
		||||
#include "libbb.h"
 | 
			
		||||
 | 
			
		||||
/* This function reads an entire line from a text file, up to a newline
 | 
			
		||||
@@ -55,7 +59,6 @@ char* FAST_FUNC xmalloc_fgets(FILE *file)
 | 
			
		||||
 | 
			
		||||
	return bb_get_chunk_from_file(file, &i);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Get line.  Remove trailing \n */
 | 
			
		||||
char* FAST_FUNC xmalloc_fgetline(FILE *file)
 | 
			
		||||
{
 | 
			
		||||
@@ -68,6 +71,44 @@ char* FAST_FUNC xmalloc_fgetline(FILE *file)
 | 
			
		||||
	return c;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
 | 
			
		||||
/* GNUism getline() should be faster (not tested) than a loop with fgetc */
 | 
			
		||||
 | 
			
		||||
/* Get line, including trailing \n if any */
 | 
			
		||||
char* FAST_FUNC xmalloc_fgets(FILE *file)
 | 
			
		||||
{
 | 
			
		||||
	char *res_buf = NULL;
 | 
			
		||||
	size_t res_sz;
 | 
			
		||||
 | 
			
		||||
	if (getline(&res_buf, &res_sz, file) == -1) {
 | 
			
		||||
		free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */
 | 
			
		||||
		res_buf = NULL;
 | 
			
		||||
	}
 | 
			
		||||
//TODO: trimming to res_sz?
 | 
			
		||||
	return res_buf;
 | 
			
		||||
}
 | 
			
		||||
/* Get line.  Remove trailing \n */
 | 
			
		||||
char* FAST_FUNC xmalloc_fgetline(FILE *file)
 | 
			
		||||
{
 | 
			
		||||
	char *res_buf = NULL;
 | 
			
		||||
	size_t res_sz;
 | 
			
		||||
 | 
			
		||||
	res_sz = getline(&res_buf, &res_sz, file);
 | 
			
		||||
 | 
			
		||||
	if ((ssize_t)res_sz != -1) {
 | 
			
		||||
		if (res_buf[res_sz - 1] == '\n')
 | 
			
		||||
			res_buf[--res_sz] = '\0';
 | 
			
		||||
//TODO: trimming to res_sz?
 | 
			
		||||
	} else {
 | 
			
		||||
		free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */
 | 
			
		||||
		res_buf = NULL;
 | 
			
		||||
	}
 | 
			
		||||
	return res_buf;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
/* Faster routines (~twice as fast). +170 bytes. Unused as of 2008-07.
 | 
			
		||||
 *
 | 
			
		||||
 
 | 
			
		||||
@@ -27,6 +27,6 @@ void FAST_FUNC xregcomp(regex_t *preg, const char *regex, int cflags)
 | 
			
		||||
{
 | 
			
		||||
	char *errmsg = regcomp_or_errmsg(preg, regex, cflags);
 | 
			
		||||
	if (errmsg) {
 | 
			
		||||
		bb_error_msg_and_die("xregcomp: %s", errmsg);
 | 
			
		||||
		bb_error_msg_and_die("bad regex '%s': %s", regex, errmsg);
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -62,12 +62,8 @@ testing "grep -s nofile - (stdin and nonexisting file, match)" \
 | 
			
		||||
	"grep -s domatch nonexistent - ; echo \$?" \
 | 
			
		||||
	"(standard input):domatch\n2\n" "" "nomatch\ndomatch\nend\n"
 | 
			
		||||
 | 
			
		||||
# This doesn't match GNU behaviour (Binary file input matches)
 | 
			
		||||
# acts like GNU grep -a
 | 
			
		||||
testing "grep handles binary files" "grep foo input" "foo\n" "\0foo\n\n" ""
 | 
			
		||||
# This doesn't match GNU behaviour (Binary file (standard input) matches)
 | 
			
		||||
# acts like GNU grep -a
 | 
			
		||||
testing "grep handles binary stdin" "grep foo" "foo\n" "" "\0foo\n\n"
 | 
			
		||||
testing "grep handles NUL in files" "grep -a foo input" "\0foo\n" "\0foo\n\n" ""
 | 
			
		||||
testing "grep handles NUL on stdin" "grep -a foo" "\0foo\n" "" "\0foo\n\n"
 | 
			
		||||
 | 
			
		||||
testing "grep matches NUL" "grep . input > /dev/null 2>&1 ; echo \$?" \
 | 
			
		||||
	"0\n" "\0\n" ""
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user