cut: add toybox-compatible options -O OUTSEP, -D, -F LIST

function                                             old     new   delta
cut_main                                             884    1201    +317
packed_usage                                       33823   33885     +62
.rodata                                           104186  104179      -7
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 2/1 up/down: 379/-7)            Total: 372 bytes

Signed-off-by: Rob Landley <rob@landley.net>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Rob Landley 2021-07-20 16:02:31 +02:00 committed by Denys Vlasenko
parent dabbeeb793
commit 0068ce2fa0
2 changed files with 171 additions and 94 deletions

View File

@ -14,6 +14,13 @@
//config: help //config: help
//config: cut is used to print selected parts of lines from //config: cut is used to print selected parts of lines from
//config: each file to stdout. //config: each file to stdout.
//config:
//config:config FEATURE_CUT_REGEX
//config: bool "cut -F"
//config: default y
//config: depends on CUT
//config: help
//config: Allow regex based delimiters.
//applet:IF_CUT(APPLET_NOEXEC(cut, cut, BB_DIR_USR_BIN, BB_SUID_DROP, cut)) //applet:IF_CUT(APPLET_NOEXEC(cut, cut, BB_DIR_USR_BIN, BB_SUID_DROP, cut))
@ -25,9 +32,14 @@
//usage: "Print selected fields from FILEs to stdout\n" //usage: "Print selected fields from FILEs to stdout\n"
//usage: "\n -b LIST Output only bytes from LIST" //usage: "\n -b LIST Output only bytes from LIST"
//usage: "\n -c LIST Output only characters from LIST" //usage: "\n -c LIST Output only characters from LIST"
//usage: "\n -d CHAR Use CHAR instead of tab as field delimiter" //usage: "\n -d SEP Field delimiter for input (default -f TAB, -F run of whitespace)"
//usage: "\n -O SEP Field delimiter for output (default = -d for -f, one space for -F)"
//usage: "\n -D Don't sort/collate sections or match -fF lines without delimiter"
//usage: "\n -f LIST Print only these fields (-d is single char)"
//usage: IF_FEATURE_CUT_REGEX(
//usage: "\n -F LIST Print only these fields (-d is regex)"
//usage: )
//usage: "\n -s Output only lines containing delimiter" //usage: "\n -s Output only lines containing delimiter"
//usage: "\n -f LIST Print only these fields"
//usage: "\n -n Ignored" //usage: "\n -n Ignored"
//(manpage:-n with -b: don't split multibyte characters) //(manpage:-n with -b: don't split multibyte characters)
//usage: //usage:
@ -39,38 +51,49 @@
#include "libbb.h" #include "libbb.h"
#if ENABLE_FEATURE_CUT_REGEX
#include "xregex.h"
#else
#define regex_t int
typedef struct { int rm_eo, rm_so; } regmatch_t;
#define xregcomp(x, ...) *(x) = 0
#define regexec(...) 0
#endif
/* This is a NOEXEC applet. Be very careful! */ /* This is a NOEXEC applet. Be very careful! */
/* option vars */ /* option vars */
#define OPT_STR "b:c:f:d:sn" #define OPT_STR "b:c:f:d:O:sD"IF_FEATURE_CUT_REGEX("F:")"n"
#define CUT_OPT_BYTE_FLGS (1 << 0) #define CUT_OPT_BYTE_FLGS (1 << 0)
#define CUT_OPT_CHAR_FLGS (1 << 1) #define CUT_OPT_CHAR_FLGS (1 << 1)
#define CUT_OPT_FIELDS_FLGS (1 << 2) #define CUT_OPT_FIELDS_FLGS (1 << 2)
#define CUT_OPT_DELIM_FLGS (1 << 3) #define CUT_OPT_DELIM_FLGS (1 << 3)
#define CUT_OPT_SUPPRESS_FLGS (1 << 4) #define CUT_OPT_ODELIM_FLGS (1 << 4)
#define CUT_OPT_SUPPRESS_FLGS (1 << 5)
#define CUT_OPT_NOSORT_FLGS (1 << 6)
#define CUT_OPT_REGEX_FLGS ((1 << 7) * ENABLE_FEATURE_CUT_REGEX)
struct cut_list { struct cut_list {
int startpos; int startpos;
int endpos; int endpos;
}; };
enum {
BOL = 0,
EOL = INT_MAX,
NON_RANGE = -1
};
static int cmpfunc(const void *a, const void *b) static int cmpfunc(const void *a, const void *b)
{ {
return (((struct cut_list *) a)->startpos - return (((struct cut_list *) a)->startpos -
((struct cut_list *) b)->startpos); ((struct cut_list *) b)->startpos);
} }
static void cut_file(FILE *file, char delim, const struct cut_list *cut_lists, unsigned nlists) static void cut_file(FILE *file, const char *delim, const char *odelim,
const struct cut_list *cut_lists, unsigned nlists)
{ {
char *line; char *line;
unsigned linenum = 0; /* keep these zero-based to be consistent */ unsigned linenum = 0; /* keep these zero-based to be consistent */
regex_t reg;
int spos, shoe = option_mask32 & CUT_OPT_REGEX_FLGS;
if (shoe) xregcomp(&reg, delim, REG_EXTENDED);
/* go through every line in the file */ /* go through every line in the file */
while ((line = xmalloc_fgetline(file)) != NULL) { while ((line = xmalloc_fgetline(file)) != NULL) {
@ -80,29 +103,22 @@ static void cut_file(FILE *file, char delim, const struct cut_list *cut_lists, u
char *printed = xzalloc(linelen + 1); char *printed = xzalloc(linelen + 1);
char *orig_line = line; char *orig_line = line;
unsigned cl_pos = 0; unsigned cl_pos = 0;
int spos;
/* cut based on chars/bytes XXX: only works when sizeof(char) == byte */ /* cut based on chars/bytes XXX: only works when sizeof(char) == byte */
if (option_mask32 & (CUT_OPT_CHAR_FLGS | CUT_OPT_BYTE_FLGS)) { if (option_mask32 & (CUT_OPT_CHAR_FLGS | CUT_OPT_BYTE_FLGS)) {
/* print the chars specified in each cut list */ /* print the chars specified in each cut list */
for (; cl_pos < nlists; cl_pos++) { for (; cl_pos < nlists; cl_pos++) {
spos = cut_lists[cl_pos].startpos; for (spos = cut_lists[cl_pos].startpos; spos < linelen;) {
while (spos < linelen) {
if (!printed[spos]) { if (!printed[spos]) {
printed[spos] = 'X'; printed[spos] = 'X';
putchar(line[spos]); putchar(line[spos]);
} }
spos++; if (++spos > cut_lists[cl_pos].endpos) {
if (spos > cut_lists[cl_pos].endpos
/* NON_RANGE is -1, so if below is true,
* the above was true too (spos is >= 0) */
/* || cut_lists[cl_pos].endpos == NON_RANGE */
) {
break; break;
} }
} }
} }
} else if (delim == '\n') { /* cut by lines */ } else if (*delim == '\n') { /* cut by lines */
spos = cut_lists[cl_pos].startpos; spos = cut_lists[cl_pos].startpos;
/* get out if we have no more lists to process or if the lines /* get out if we have no more lists to process or if the lines
@ -115,9 +131,7 @@ static void cut_file(FILE *file, char delim, const struct cut_list *cut_lists, u
while (spos < (int)linenum) { while (spos < (int)linenum) {
spos++; spos++;
/* go to the next list if we're at the end of this one */ /* go to the next list if we're at the end of this one */
if (spos > cut_lists[cl_pos].endpos if (spos > cut_lists[cl_pos].endpos) {
|| cut_lists[cl_pos].endpos == NON_RANGE
) {
cl_pos++; cl_pos++;
/* get out if there's no more lists to process */ /* get out if there's no more lists to process */
if (cl_pos >= nlists) if (cl_pos >= nlists)
@ -135,55 +149,56 @@ static void cut_file(FILE *file, char delim, const struct cut_list *cut_lists, u
puts(line); puts(line);
goto next_line; goto next_line;
} else { /* cut by fields */ } else { /* cut by fields */
int ndelim = -1; /* zero-based / one-based problem */ unsigned uu = 0, start = 0, end = 0, out = 0;
int nfields_printed = 0; int dcount = 0;
char *field = NULL;
char delimiter[2];
delimiter[0] = delim; /* Loop through bytes, finding next delimiter */
delimiter[1] = 0; for (;;) {
/* End of current range? */
if (end == linelen || dcount > cut_lists[cl_pos].endpos) {
if (++cl_pos >= nlists) break;
if (option_mask32 & CUT_OPT_NOSORT_FLGS)
start = dcount = uu = 0;
end = 0;
}
/* End of current line? */
if (uu == linelen) {
/* If we've seen no delimiters, check -s */
if (!cl_pos && !dcount && !shoe) {
if (option_mask32 & CUT_OPT_SUPPRESS_FLGS)
goto next_line;
} else if (dcount<cut_lists[cl_pos].startpos)
start = linelen;
end = linelen;
} else {
/* Find next delimiter */
if (shoe) {
regmatch_t rr = {-1, -1};
/* does this line contain any delimiters? */ if (!regexec(&reg, line+uu, 1, &rr, REG_NOTBOL|REG_NOTEOL)) {
if (strchr(line, delim) == NULL) { end = uu + rr.rm_so;
if (!(option_mask32 & CUT_OPT_SUPPRESS_FLGS)) uu += rr.rm_eo;
puts(line); } else {
goto next_line; uu = linelen;
} continue;
}
} else if (line[end = uu++] != *delim)
continue;
/* process each list on this line, for as long as we've got /* Got delimiter. Loop if not yet within range. */
* a line to process */ if (dcount++ < cut_lists[cl_pos].startpos) {
for (; cl_pos < nlists && line; cl_pos++) { start = uu;
spos = cut_lists[cl_pos].startpos; continue;
do {
/* find the field we're looking for */
while (line && ndelim < spos) {
field = strsep(&line, delimiter);
ndelim++;
} }
}
/* we found it, and it hasn't been printed yet */ if (end != start || !shoe)
if (field && ndelim == spos && !printed[ndelim]) { printf("%s%.*s", out++ ? odelim : "", end-start, line + start);
/* if this isn't our first time through, we need to start = uu;
* print the delimiter after the last field that was if (!dcount)
* printed */ break;
if (nfields_printed > 0)
putchar(delim);
fputs_stdout(field);
printed[ndelim] = 'X';
nfields_printed++; /* shouldn't overflow.. */
}
spos++;
/* keep going as long as we have a line to work with,
* this is a list, and we're not at the end of that
* list */
} while (spos <= cut_lists[cl_pos].endpos && line
&& cut_lists[cl_pos].endpos != NON_RANGE);
} }
} }
/* if we printed anything at all, we need to finish it with a /* if we printed anything, finish with newline */
* newline cuz we were handed a chomped line */
putchar('\n'); putchar('\n');
next_line: next_line:
linenum++; linenum++;
@ -198,37 +213,35 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
/* growable array holding a series of lists */ /* growable array holding a series of lists */
struct cut_list *cut_lists = NULL; struct cut_list *cut_lists = NULL;
unsigned nlists = 0; /* number of elements in above list */ unsigned nlists = 0; /* number of elements in above list */
char delim = '\t'; /* delimiter, default is tab */
char *sopt, *ltok; char *sopt, *ltok;
const char *delim = NULL;
const char *odelim = NULL;
unsigned opt; unsigned opt;
#define ARG "bcf"IF_FEATURE_CUT_REGEX("F")
opt = getopt32(argv, "^" opt = getopt32(argv, "^"
OPT_STR OPT_STR // = "b:c:f:d:O:sD"IF_FEATURE_CUT_REGEX("F:")"n"
"\0" "b--bcf:c--bcf:f--bcf", "\0" "b--"ARG":c--"ARG":f--"ARG IF_FEATURE_CUT_REGEX("F--"ARG),
&sopt, &sopt, &sopt, &ltok &sopt, &sopt, &sopt, &delim, &odelim IF_FEATURE_CUT_REGEX(, &sopt)
); );
if (!delim || !*delim)
delim = (opt & CUT_OPT_REGEX_FLGS) ? "[[:space:]]+" : "\t";
if (!odelim) odelim = (opt & CUT_OPT_REGEX_FLGS) ? " " : delim;
// argc -= optind; // argc -= optind;
argv += optind; argv += optind;
if (!(opt & (CUT_OPT_BYTE_FLGS | CUT_OPT_CHAR_FLGS | CUT_OPT_FIELDS_FLGS))) if (!(opt & (CUT_OPT_BYTE_FLGS | CUT_OPT_CHAR_FLGS | CUT_OPT_FIELDS_FLGS | CUT_OPT_REGEX_FLGS)))
bb_simple_error_msg_and_die("expected a list of bytes, characters, or fields"); bb_simple_error_msg_and_die("expected a list of bytes, characters, or fields");
if (opt & CUT_OPT_DELIM_FLGS) {
if (ltok[0] && ltok[1]) { /* more than 1 char? */
bb_simple_error_msg_and_die("the delimiter must be a single character");
}
delim = ltok[0];
}
/* non-field (char or byte) cutting has some special handling */ /* non-field (char or byte) cutting has some special handling */
if (!(opt & CUT_OPT_FIELDS_FLGS)) { if (!(opt & (CUT_OPT_FIELDS_FLGS|CUT_OPT_REGEX_FLGS))) {
static const char _op_on_field[] ALIGN1 = " only when operating on fields"; static const char _op_on_field[] ALIGN1 = " only when operating on fields";
if (opt & CUT_OPT_SUPPRESS_FLGS) { if (opt & CUT_OPT_SUPPRESS_FLGS) {
bb_error_msg_and_die bb_error_msg_and_die
("suppressing non-delimited lines makes sense%s", ("suppressing non-delimited lines makes sense%s", _op_on_field);
_op_on_field);
} }
if (delim != '\t') { if (opt & CUT_OPT_DELIM_FLGS) {
bb_error_msg_and_die bb_error_msg_and_die
("a delimiter may be specified%s", _op_on_field); ("a delimiter may be specified%s", _op_on_field);
} }
@ -253,7 +266,7 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
/* get the start pos */ /* get the start pos */
ntok = strsep(&ltok, "-"); ntok = strsep(&ltok, "-");
if (!ntok[0]) { if (!ntok[0]) {
s = BOL; s = 0;
} else { } else {
s = xatoi_positive(ntok); s = xatoi_positive(ntok);
/* account for the fact that arrays are zero based, while /* account for the fact that arrays are zero based, while
@ -264,24 +277,23 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
/* get the end pos */ /* get the end pos */
if (ltok == NULL) { if (ltok == NULL) {
e = NON_RANGE; e = s;
} else if (!ltok[0]) { } else if (!ltok[0]) {
e = EOL; e = INT_MAX;
} else { } else {
e = xatoi_positive(ltok); e = xatoi_positive(ltok);
/* if the user specified an end position of 0, /* if the user specified an end position of 0,
* that means "til the end of the line" */ * that means "til the end of the line" */
if (e == 0) if (!*ltok)
e = EOL; e = INT_MAX;
else if (e < s)
bb_error_msg_and_die("%d<%d", e, s);
e--; /* again, arrays are zero based, lines are 1 based */ e--; /* again, arrays are zero based, lines are 1 based */
if (e == s)
e = NON_RANGE;
} }
/* add the new list */ /* add the new list */
cut_lists = xrealloc_vector(cut_lists, 4, nlists); cut_lists = xrealloc_vector(cut_lists, 4, nlists);
/* NB: startpos is always >= 0, /* NB: startpos is always >= 0 */
* while endpos may be = NON_RANGE (-1) */
cut_lists[nlists].startpos = s; cut_lists[nlists].startpos = s;
cut_lists[nlists].endpos = e; cut_lists[nlists].endpos = e;
nlists++; nlists++;
@ -294,7 +306,8 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
/* now that the lists are parsed, we need to sort them to make life /* now that the lists are parsed, we need to sort them to make life
* easier on us when it comes time to print the chars / fields / lines * easier on us when it comes time to print the chars / fields / lines
*/ */
qsort(cut_lists, nlists, sizeof(cut_lists[0]), cmpfunc); if (!(opt & CUT_OPT_NOSORT_FLGS))
qsort(cut_lists, nlists, sizeof(cut_lists[0]), cmpfunc);
} }
{ {
@ -309,7 +322,7 @@ int cut_main(int argc UNUSED_PARAM, char **argv)
retval = EXIT_FAILURE; retval = EXIT_FAILURE;
continue; continue;
} }
cut_file(file, delim, cut_lists, nlists); cut_file(file, delim, odelim, cut_lists, nlists);
fclose_if_not_stdin(file); fclose_if_not_stdin(file);
} while (*++argv); } while (*++argv);

View File

@ -15,4 +15,68 @@ testing "cut '-' (stdin) and multi file handling" \
"the quick brown fox\n" \ "the quick brown fox\n" \
"jumps over the lazy dog\n" \ "jumps over the lazy dog\n" \
abc="\
one:two:three:four:five:six:seven
alpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu
the quick brown fox jumps over the lazy dog
"
testing "cut -b a,a,a" "cut -b 3,3,3 input" "e\np\ne\n" "$abc" ""
testing "cut -b overlaps" "cut -b 1-3,2-5,7-9,9-10 input" \
"one:to:th\nalphabeta\nthe qick \n" "$abc" ""
testing "-b encapsulated" "cut -b 3-8,4-6 input" "e:two:\npha:be\ne quic\n" \
"$abc" ""
# --output-delimiter not implemented (yet?)
#testing "cut -bO overlaps" \
# "cut --output-delimiter ' ' -b 1-3,2-5,7-9,9-10 input" \
# "one:t o:th\nalpha beta\nthe q ick \n" "$abc" ""
testing "cut high-low error" "cut -b 8-3 abc.txt 2>/dev/null || echo err" "err\n" \
"$abc" ""
testing "cut -c a-b" "cut -c 4-10 input" ":two:th\nha:beta\n quick \n" "$abc" ""
testing "cut -c a-" "cut -c 41- input" "\ntheta:iota:kappa:lambda:mu\ndog\n" "$abc" ""
testing "cut -c -b" "cut -c -39 input" \
"one:two:three:four:five:six:seven\nalpha:beta:gamma:delta:epsilon:zeta:eta\nthe quick brown fox jumps over the lazy\n" \
"$abc" ""
testing "cut -c a" "cut -c 40 input" "\n:\n \n" "$abc" ""
testing "cut -c a,b-c,d" "cut -c 3,5-7,10 input" "etwoh\npa:ba\nequi \n" "$abc" ""
testing "cut -f a-" "cut -d ':' -f 5- input" "five:six:seven\nepsilon:zeta:eta:theta:iota:kappa:lambda:mu\nthe quick brown fox jumps over the lazy dog\n" "$abc" ""
testing "cut show whole line with no delim" "cut -d ' ' -f 3 input" \
"one:two:three:four:five:six:seven\nalpha:beta:gamma:delta:epsilon:zeta:eta:theta:iota:kappa:lambda:mu\nbrown\n" "$abc" ""
testing "cut with echo, -c (a-b)" "echo 'ref_categorie=test' | cut -c 1-15 " "ref_categorie=t\n" "" ""
testing "cut with echo, -c (a)" "echo 'ref_categorie=test' | cut -c 14" "=\n" "" ""
testing "cut with -c (a,b,c)" "cut -c 4,5,20 input" "det\n" "abcdefghijklmnopqrstuvwxyz" ""
testing "cut with -b (a,b,c)" "cut -b 4,5,20 input" "det\n" "abcdefghijklmnopqrstuvwxyz" ""
input="\
406378:Sales:Itorre:Jan
031762:Marketing:Nasium:Jim
636496:Research:Ancholie:Mel
396082:Sales:Jucacion:Ed
"
testing "cut with -d -f(:) -s" "cut -d: -f3 -s input" "Itorre\nNasium\nAncholie\nJucacion\n" "$input" ""
testing "cut with -d -f( ) -s" "cut -d' ' -f3 -s input && echo yes" "yes\n" "$input" ""
testing "cut with -d -f(a) -s" "cut -da -f3 -s input" "n\nsium:Jim\n\ncion:Ed\n" "$input" ""
testing "cut with -d -f(a) -s -n" "cut -da -f3 -s -n input" "n\nsium:Jim\n\ncion:Ed\n" "$input" ""
# substitute for awk
testing "cut -DF" "cut -DF 2,7,5" \
"said and your\nare\nis demand. supply\nforecast :\nyou you better,\n\nEm: Took hate\n" "" \
"Bother, said Pooh. It's your husband, and he has a gun.
Cheerios are donut seeds.
Talk is cheap because supply exceeds demand.
Weather forecast for tonight : dark.
Apple: you can buy better, but you can't pay more.
Subcalifragilisticexpialidocious.
Auntie Em: Hate you, hate Kansas. Took the dog. Dorothy."
testing "cut empty field" "cut -d ':' -f 1-3" "a::b\n" "" "a::b\n"
testing "cut empty field 2" "cut -d ':' -f 3-5" "b::c\n" "" "a::b::c:d\n"
exit $FAILCOUNT exit $FAILCOUNT