diff options
author | Assaf Gordon <assafgordon@gmail.com> | 2013-03-06 15:53:16 -0500 |
---|---|---|
committer | Pádraig Brady <P@draigBrady.com> | 2013-04-10 14:34:52 +0100 |
commit | 4114c93af398d7aecb5eb253f90d9b4cc0785643 (patch) | |
tree | e12c863d7a3ac08a1abec7c8fc8a4be6991fde9c | |
parent | ec02161aefab06bec919d10396900ce6fe87390d (diff) | |
download | coreutils-4114c93af398d7aecb5eb253f90d9b4cc0785643.tar.xz |
csplit: add the --suppress-matched option
With --suppress-matched, the lines that match the pattern will not be
printed in the output files. I.e., the first line of the second and
each subsequent split will be suppressed.
* src/csplit.c: process_regexp(),process_line_count(): Don't output the
matched lines. Since csplit includes "up to but not including" matched
lines in each split, the first line (in the next group) is the matched
line - so just skip it.
main(): Handle new option.
usage(): Mention new option.
* doc/coreutils.texi (csplit invocation): Mention new option, examples.
* tests/misc/csplit-suppress-matched.pl: New test script.
* tests/local.mk: Reference the new test.
* NEWS: Mention new feature.
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | doc/coreutils.texi | 25 | ||||
-rw-r--r-- | src/csplit.c | 34 | ||||
-rw-r--r-- | tests/local.mk | 1 | ||||
-rw-r--r-- | tests/misc/csplit-suppress-matched.pl | 213 |
5 files changed, 274 insertions, 2 deletions
@@ -22,6 +22,9 @@ GNU coreutils NEWS -*- outline -*- uniq accepts a new option: --group to print all items, while separating unique groups with empty lines. + csplit accepts a new option: --suppress-matched, to elide the lines + used to identify the split points. + ** Improvements stat and tail work better with EFIVARFS, EXOFS, F2FS and UBIFS. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 4cfe4c50c..8e4bacfb7 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3607,6 +3607,12 @@ long instead of the default 2. @opindex --keep-files Do not remove output files when errors are encountered. +@item --suppress-matched +@opindex --suppress-matched +Do not output lines matching the specified @var{pattern}. +I.E. suppress the boundary line from the start of the second +and subsequent splits. + @item -z @itemx --elide-empty-files @opindex -z @@ -3683,6 +3689,25 @@ $ head xx* 14 @end example +Example of splitting input by empty lines: + +@example +$ csplit --suppress-matched @var{input.txt} '/^$/' '@{*@}' +@end example + +@c +@c TODO: "uniq" already supports "--group". +@c when it gets the "--key" option, uncomment this example. +@c +@c Example of splitting input file, based on the value of column 2: +@c +@c @example +@c $ cat @var{input.txt} | +@c sort -k2,2 | +@c uniq --group -k2,2 | +@c csplit --suppress-matched '/^$/' '@{*@}' +@c @end example + @node Summarizing files @chapter Summarizing files diff --git a/src/csplit.c b/src/csplit.c index 22f3ad4b1..7a36e6736 100644 --- a/src/csplit.c +++ b/src/csplit.c @@ -166,6 +166,9 @@ static bool volatile remove_files; /* If true, remove all output files which have a zero length. */ static bool elide_empty_files; +/* If true, suppress the lines that match the PATTERN */ +static bool suppress_matched; + /* The compiled pattern arguments, which determine how to split the input file. */ static struct control *controls; @@ -176,6 +179,13 @@ static size_t control_used; /* The set of signals that are caught. 
*/ static sigset_t caught_signals; +/* For long options that have no equivalent short option, use a + non-character as a pseudo short option, starting with CHAR_MAX + 1. */ +enum +{ + SUPPRESS_MATCHED_OPTION = CHAR_MAX + 1 +}; + static struct option const longopts[] = { {"digits", required_argument, NULL, 'n'}, @@ -185,6 +195,7 @@ static struct option const longopts[] = {"elide-empty-files", no_argument, NULL, 'z'}, {"prefix", required_argument, NULL, 'f'}, {"suffix-format", required_argument, NULL, 'b'}, + {"suppress-matched", no_argument, NULL, SUPPRESS_MATCHED_OPTION}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -721,8 +732,13 @@ process_line_count (const struct control *p, uintmax_t repetition) create_output_file (); - linenum = get_first_line_in_buffer (); + /* Ensure that the line number specified is not 1 greater than + the number of lines in the file. + When suppressing matched lines, check before the loop. */ + if (no_more_lines () && suppress_matched) + handle_line_error (p, repetition); + linenum = get_first_line_in_buffer (); while (linenum++ < last_line_to_save) { line = remove_line (); @@ -733,9 +749,12 @@ process_line_count (const struct control *p, uintmax_t repetition) close_output_file (); + if (suppress_matched) + line = remove_line (); + /* Ensure that the line number specified is not 1 greater than the number of lines in the file. */ - if (no_more_lines ()) + if (no_more_lines () && !suppress_matched) handle_line_error (p, repetition); } @@ -778,6 +797,9 @@ process_regexp (struct control *p, uintmax_t repetition) if (!ignore) create_output_file (); + if (suppress_matched && current_line > 0) + line = remove_line (); + /* If there is no offset for the regular expression, or it is positive, then it is not necessary to buffer the lines. 
*/ @@ -1324,6 +1346,7 @@ main (int argc, char **argv) control_used = 0; suppress_count = false; remove_files = true; + suppress_matched = false; prefix = DEFAULT_PREFIX; while ((optc = getopt_long (argc, argv, "f:b:kn:sqz", longopts, NULL)) != -1) @@ -1357,6 +1380,10 @@ main (int argc, char **argv) elide_empty_files = true; break; + case SUPPRESS_MATCHED_OPTION: + suppress_matched = true; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); @@ -1465,6 +1492,9 @@ and output byte counts of each piece to standard output.\n\ -k, --keep-files do not remove output files on errors\n\ "), stdout); fputs (_("\ + -m, --suppress-matched suppress the lines matching PATTERN\n\ +"), stdout); + fputs (_("\ -n, --digits=DIGITS use specified number of digits instead of 2\n\ -s, --quiet, --silent do not print counts of output file sizes\n\ -z, --elide-empty-files remove empty output files\n\ diff --git a/tests/local.mk b/tests/local.mk index dc87ef491..e3a72ab3a 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -260,6 +260,7 @@ all_tests = \ tests/misc/csplit.sh \ tests/misc/csplit-1000.sh \ tests/misc/csplit-heap.sh \ + tests/misc/csplit-suppress-matched.pl \ tests/misc/date-sec.sh \ tests/misc/dircolors.pl \ tests/misc/dirname.pl \ diff --git a/tests/misc/csplit-suppress-matched.pl b/tests/misc/csplit-suppress-matched.pl new file mode 100644 index 000000000..bfced423b --- /dev/null +++ b/tests/misc/csplit-suppress-matched.pl @@ -0,0 +1,213 @@ +#!/usr/bin/perl + +# Copyright (C) 2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +use strict; +use Data::Dumper; + +my $limits = getlimits (); + +my $prog = 'csplit'; + +# Turn off localization of executable's output. +@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; + +# Input from 'seq 6' +my $IN_SEQ_6 =<<EOF; +1 +2 +3 +4 +5 +6 +EOF + +# Input from a possible run of 'uniq --group' +# (groups separated by empty lines) +my $IN_UNIQ =<<EOF; +a +a +YY + +XX +b +b +YY + +XX +c +YY + +XX +d +d +d +EOF + +# Standard Coreutils::run_tests() structure, except the addition of +# "OUTPUTS" array, containing the expected content of the output files. +# See code below for conversion into PRE/CMP/POST checks. +my @csplit_tests = +( + # without --suppress-matched, + # the newline (matched line) appears in the output files + ["re-base", "-q - '/^\$/' '{*}'", {IN_PIPE => $IN_UNIQ}, + {OUTPUTS => [ "a\na\nYY\n", "\nXX\nb\nb\nYY\n","\nXX\nc\nYY\n", + "\nXX\nd\nd\nd\n" ] }], + + # the newline (matched line) does not appear in the output files + ["re-1", " --suppress-matched -q - '/^\$/' '{*}'", {IN_PIPE => $IN_UNIQ}, + {OUTPUTS => ["a\na\nYY\n", "XX\nb\nb\nYY\n", "XX\nc\nYY\n", + "XX\nd\nd\nd\n"]}], + + # the 'XX' (matched line + offset 1) does not appear in the output files. 
+ # the newline appears in the files (before each split, at the end of the file) + ["re-2", "--suppress-matched -q - '/^\$/1' '{*}'", {IN_PIPE => $IN_UNIQ}, + {OUTPUTS => ["a\na\nYY\n\n","b\nb\nYY\n\n","c\nYY\n\n","d\nd\nd\n"]}], + + # the 'YY' (matched line + offset of -1) does not appear in the output files + # the newline appears in the files (as the first line of the new split) + ["re-3", " --suppress-matched -q - '/^\$/-1' '{*}'", {IN_PIPE => $IN_UNIQ}, + {OUTPUTS => ["a\na\n", "\nXX\nb\nb\n", "\nXX\nc\n", "\nXX\nd\nd\nd\n"]}], + + # Test two consecutive matched lines + # without suppress-matched, the second file should contain a single newline. + ["re-4.1", "-q - '/^\$/' '{*}'", {IN_PIPE => "a\n\n\nb\n"}, + {OUTPUTS => [ "a\n", "\n", "\nb\n" ]}], + # suppress-matched will cause the second file to be empty. + ["re-4.2", "--suppress-match -q - '/^\$/' '{*}'", {IN_PIPE => "a\n\n\nb\n"}, + {OUTPUTS => [ "a\n", "", "b\n" ]}], + # suppress-matched + elide-empty should output just two files. + ["re-4.3", "--suppress-match -zq - '/^\$/' '{*}'", {IN_PIPE => "a\n\n\nb\n"}, + {OUTPUTS => [ "a\n", "b\n" ]}], + + + # Test a matched-line as the last line + # default: last file with newline should be created. + ["re-5.1", "-q - '/^\$/' '{*}'", {IN_PIPE => "a\n\nb\n\n"}, + {OUTPUTS => [ "a\n", "\nb\n", "\n" ]}], + # suppress-matched - a last, empty file should be created. + ["re-5.2", "--suppress-match -q - '/^\$/' '{*}'", {IN_PIPE => "a\n\nb\n\n"}, + {OUTPUTS => [ "a\n", "b\n", "" ]}], + # suppress-matched + elide-empty: just two files should be created. + ["re-5.3", "--suppress-match -zq - '/^\$/' '{*}'", {IN_PIPE => "a\n\nb\n\n"}, + {OUTPUTS => [ "a\n", "b\n" ]}], + + # without suppress-matched, + # the matched lines (2/4/6) appear in the output files + ["int-base", '-q - 2 4 6', {IN_PIPE => $IN_SEQ_6}, + {OUTPUTS => [ "1\n", "2\n3\n", "4\n5\n", "6\n" ]}], + # suppress matched - the matching lines (2/4/6) should not appear. 
+ ["int-1", '--suppress-matched -q - 2 4 6', {IN_PIPE => $IN_SEQ_6}, + {OUTPUTS => [ "1\n", "3\n", "5\n", "" ]}], + # suppress matched + elide-empty + ["int-2", '--suppress-matched -zq - 2 4 6', {IN_PIPE => $IN_SEQ_6}, + {OUTPUTS => [ "1\n", "3\n", "5\n" ]}], +); + + + +=pod +The following loop translates the above @csplit_tests to a Coreutils::run_tests() +compatible structure. It converts "OUTPUTS" key into "CMP" + "POST" keys: +1. Each element in the OUTPUTS key is expected to be an output file + from csplit (named xx00, xx01, xx02...) + create a "CMP" key for each one, with the output and the filename. +2. Add a "POST" key, ensuring no extra files have been created. + (e.g. if there are 4 expected outputs, xx00 to xx03, + ensure xx04 doesn't exist). +3. Add a "PRE" key, deleting all existing 'xx*' files. + +Example: + +Before conversion: + my @csplit_tests = + ( + ["1", '-z -q - 2 4 6', + {IN_PIPE => "1\n2\n3\n4\n5\n6\n"}, + {OUTPUTS => [ "1\n", "2\n3\n", "4\n5\n", "6\n" ]}, + ] + ) + +After conversion: + + my @Tests = + ( + ["1", '-z -q - 2 4 6', + {IN_PIPE => "1\n2\n3\n4\n5\n6\n"}, + {PRE => sub { unlink glob './xx??' ; }}, + {CMP => ["1\n", {'xx00'=> undef}]}, + {CMP => ["2\n3\n", {'xx01'=> undef}]}, + {CMP => ["4\n5\n", {'xx02'=> undef}]}, + {CMP => ["6\n", {'xx03'=> undef}]}, + {POST => sub { die "extra file" if -e 'xx04'}}, + ], + ); +=cut +my @Tests; +foreach my $t (@csplit_tests) + { + my ($test_name, $cmdline, @others) = @$t; + my $new_ent = [$test_name, $cmdline]; + + my $out_file_num = 0 ; + + foreach my $e (@others) + { + die "Internal error: expecting a hash (e.g. IN_PIPE/OUTPUTS/ERR)" . + "in test '$test_name', got $e" + unless ref $e && (ref $e eq 'HASH'); + + my ($key, $value) = each %$e; + if ($key eq 'OUTPUTS') + { + # Convert each expected OUTPUT to a 'CMP' key. 
+ foreach my $output (@$value) + { + my $filename = sprintf("xx%02d",$out_file_num++); + my $cmp = {CMP => [ $output, { $filename => undef}]}; + push @$new_ent, $cmp; + } + + # Add a 'POST' check + # Ensure no extra files have been created. + my $filename = sprintf("xx%02d",$out_file_num++); + my $post = { POST => sub { die "Test failed: an extraneous file " . + "'$filename' has been created\n" + if -e $filename; } } ; + push @$new_ent, $post; + + # before running each test, cleanup the 'xx00' files + # from previous runs. + my $pre = { PRE => sub { unlink glob "./xx??"; } }; + push @$new_ent, $pre; + } + else + { + # pass other entities as-is (e.g. OUT, ERR, OUT_SUBST, EXIT) + # run_tests() will know how to handle them. + push @$new_ent, $e; + } + } + + push @Tests, $new_ent; + } + +my $save_temps = $ENV{DEBUG}; +my $verbose = $ENV{VERBOSE}; + +my $fail = run_tests ($prog, $prog, \@Tests, $save_temps, $verbose); +exit $fail; |