From b2eadd109c3a508011705761dfe24a35180d925d Mon Sep 17 00:00:00 2001 From: Pádraig Brady Date: Tue, 12 Jan 2016 16:29:32 +0000 Subject: join,sort,uniq: with -z, treat '\n' as a field separator * NEWS: Mention the change in behavior. * doc/coreutils.texi (newlineFieldSeparator): A new description, referenced from ({join,sort,uniq} invocation). * src/system.h (field_sep): A new inline function to determine if a character is a field separator. * src/join.c (usage): s/whitespace/blank/ to be more accurate wrt which characters are field separators. (xfields): s/isblank/field_sep/. * src/sort.c (inittables): Likewise. * src/uniq.c (find_field): Likewise. * tests/misc/join.pl: Adjust -z test, and add a test/example for processing the whole record with field processing. * tests/misc/sort.pl: Add -z test cases, including case with '\n'. * tests/misc/uniq.pl: Add -z -f test case with \n. --- NEWS | 2 ++ doc/coreutils.texi | 5 +++++ src/join.c | 8 ++++---- src/sort.c | 4 ++-- src/system.h | 7 +++++++ src/uniq.c | 4 ++-- tests/misc/join.pl | 9 ++++++--- tests/misc/sort.pl | 5 +++++ tests/misc/uniq.pl | 1 + 9 files changed, 34 insertions(+), 11 deletions(-) diff --git a/NEWS b/NEWS index 6e48a5365..192d8fa9e 100644 --- a/NEWS +++ b/NEWS @@ -64,6 +64,8 @@ GNU coreutils NEWS -*- outline -*- ls now quotes file names unambiguously and appropriate for use in a shell, when outputting to a terminal. + join, sort, uniq with --zero-terminated, now treat '\n' as a field delimiter. + ** Improvements All utilities now quote user supplied arguments in error strings, diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 25380628d..80e9a032d 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -4493,6 +4493,9 @@ numeric string when checking for uniqueness, whereas @code{sort -n | uniq} inspects the entire line. @xref{uniq invocation}. @optZeroTerminated +@macro newlineFieldSeparator +Note with @option{-z} the newline character is treated as a field separator. 
+@end macro @end table @@ -5034,6 +5037,7 @@ fields and characters). By default the entire rest of the lines are compared. @optZeroTerminated +@newlineFieldSeparator @end table @@ -6157,6 +6161,7 @@ Print a line for each unpairable line in file @var{file-number} (either @samp{1} or @samp{2}), instead of the normal output. @optZeroTerminated +@newlineFieldSeparator @end table diff --git a/src/join.c b/src/join.c index 8686428fb..9b25da667 100644 --- a/src/join.c +++ b/src/join.c @@ -194,7 +194,7 @@ Usage: %s [OPTION]... FILE1 FILE2\n\ program_name); fputs (_("\ For each pair of input lines with identical join fields, write a line to\n\ -standard output. The default join field is the first, delimited by whitespace.\ +standard output. The default join field is the first, delimited by blanks.\ \n\ "), stdout); fputs (_("\ @@ -284,19 +284,19 @@ xfields (struct line *line) else if (tab < 0) { /* Skip leading blanks before the first field. */ - while (isblank (to_uchar (*ptr))) + while (field_sep (*ptr)) if (++ptr == lim) return; do { char *sep; - for (sep = ptr + 1; sep != lim && ! isblank (to_uchar (*sep)); sep++) + for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++) continue; extract_field (line, ptr, sep - ptr); if (sep == lim) return; - for (ptr = sep + 1; ptr != lim && isblank (to_uchar (*ptr)); ptr++) + for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++) continue; } while (ptr != lim); diff --git a/src/sort.c b/src/sort.c index aca3b4231..575877d22 100644 --- a/src/sort.c +++ b/src/sort.c @@ -1275,9 +1275,9 @@ inittables (void) for (i = 0; i < UCHAR_LIM; ++i) { - blanks[i] = !! isblank (i); + blanks[i] = field_sep (i); nonprinting[i] = ! isprint (i); - nondictionary[i] = ! isalnum (i) && ! isblank (i); + nondictionary[i] = ! isalnum (i) && ! 
field_sep (i); fold_toupper[i] = toupper (i); } diff --git a/src/system.h b/src/system.h index c1c4a18a3..9898bc79c 100644 --- a/src/system.h +++ b/src/system.h @@ -155,6 +155,13 @@ enum errors that the cast doesn't. */ static inline unsigned char to_uchar (char ch) { return ch; } +/* '\n' is considered a field separator with --zero-terminated. */ +static inline bool +field_sep (unsigned char ch) +{ + return isblank (ch) || ch == '\n'; +} + #include <locale.h> /* Take care of NLS matters. */ diff --git a/src/uniq.c b/src/uniq.c index 6f8cd4a70..0e118da9d 100644 --- a/src/uniq.c +++ b/src/uniq.c @@ -261,9 +261,9 @@ find_field (struct linebuffer const *line) for (count = 0; count < skip_fields && i < size; count++) { - while (i < size && isblank (to_uchar (lp[i]))) + while (i < size && field_sep (lp[i])) i++; - while (i < size && !isblank (to_uchar (lp[i]))) + while (i < size && !field_sep (lp[i])) i++; } diff --git a/tests/misc/join.pl b/tests/misc/join.pl index 2a40f0095..4d399d8ae 100755 --- a/tests/misc/join.pl +++ b/tests/misc/join.pl @@ -290,10 +290,13 @@ my @tv = ( # missing last NUL at the end of the last line (=end of file) ['z4', '-z', ["a\0c\0e", "a\0b\0c"], "a\0c\0", 0], -# edge-case: the embedded newlines should treated as -# part of the nul-terminated line +# With -z, embedded newlines are treated as field separators. +# Note '\n' are converted to ' ' in this case. 
['z5', '-z -a1 -a2', - ["a\n1\0c 3\0","b\n8\0c 9\0"], "a\n1\0b\n8\0c 3 9\0"], + ["a\n\n1\0c 3\0", "a 2\0b\n8\0c 9\0"], "a 1 2\0b 8\0c 3 9\0"], +# One can avoid field processing like: +['z6', '-z -t ""', + ["a\n1\n\0", "a\n1\n\0"], "a\n1\n\0"], ); diff --git a/tests/misc/sort.pl b/tests/misc/sort.pl index c9bcce194..c3e7f8e48 100755 --- a/tests/misc/sort.pl +++ b/tests/misc/sort.pl @@ -406,6 +406,11 @@ my @Tests = ["output-is-input-3", '-m -o f', {OUT=>''}, {IN=> {g=> "a\n"}}, {IN=> {h=> "b\n"}}, {IN=> {f=> "c\n"}}, {CMP=> ["a\nb\nc\n", {'f'=> undef}]} ], + +# --zero-terminated +['zero-1', '-z', {IN=>"2\0001\000"}, {OUT=>"1\0002\000"}], +['zero-2', '-z -k2,2', {IN=>"1\n2\0002\n1\000"}, {OUT=>"2\n1\0001\n2\000"}], +['zero-3', '-zb -k2,2', {IN=>"1\n\n2\0002\n1\0"}, {OUT=>"2\n1\0001\n\n2\0"}], ); # Add _POSIX2_VERSION=199209 to the environment of each test diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl index 2bc06b9d6..f028036be 100755 --- a/tests/misc/uniq.pl +++ b/tests/misc/uniq.pl @@ -95,6 +95,7 @@ my @Tests = ['3z', '-z', {IN=>"a\na"}, {OUT=>"a\na\0"}], ['4z', '-z', {IN=>"a\nb"}, {OUT=>"a\nb\0"}], ['5z', '-z', {IN=>"a\na\nb"}, {OUT=>"a\na\nb\0"}], + ['10z', '-z -f1', {IN=>"a\nb\n\0c\nb\n\0"}, {OUT=>"a\nb\n\0"}], ['20z', '-dz', {IN=>"a\na\n"}, {OUT=>""}], # Make sure that eight bit characters work -- cgit v1.2.3-54-g00ecf