diff options
author | Assaf Gordon <assafgordon@gmail.com> | 2015-01-07 18:30:28 -0500 |
---|---|---|
committer | Pádraig Brady <P@draigBrady.com> | 2015-01-19 23:22:37 +0000 |
commit | 4c795d543908ea4715b3e0bd6c6cf908315936d8 (patch) | |
tree | 74e9d10d130ce903bf9053508a42f9cb3f48858a | |
parent | c4c2a09cc804afb338efa5ccedffa269888c4685 (diff) | |
download | coreutils-4c795d543908ea4715b3e0bd6c6cf908315936d8.tar.xz |
split: new -t option to select record separator
* src/split.c (eolchar): A new variable to hold
the separator character (unibyte for now).
This is referenced throughout rather than hardcoding '\n'.
(usage): Describe the new --separator option, and
mention records along with lines so there is no ambiguity
that all options treat lines and records equivalently.
(main): Have -t update eolchar, or default to '\n'.
* tests/split/record-sep.sh: New test case.
* tests/local.mk: Reference the new test.
* doc/coreutils.texi (split invocation): Document the new option.
Adjust --lines, --line-bytes, --number=[lr]/... to mention
they pertain to records if --separator is specified.
* NEWS: Mention the new feature.
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | doc/coreutils.texi | 25 | ||||
-rw-r--r-- | src/split.c | 68 | ||||
-rw-r--r-- | tests/local.mk | 5 | ||||
-rwxr-xr-x | tests/split/record-sep.sh | 78 |
5 files changed, 157 insertions, 22 deletions
@@ -45,6 +45,9 @@ GNU coreutils NEWS -*- outline -*- dd accepts a new status=progress level to print data transfer statistics on stderr approximately every second. + split accepts a new --separator option to select a record separator character + other than the default newline character. + ** Changes in behavior df no longer suppresses separate exports of the same remote device, as diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 1cc65329c..5a3c31a15 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3395,6 +3395,8 @@ The program accepts the following options. Also see @ref{Common options}. @opindex -l @opindex --lines Put @var{lines} lines of @var{input} into each output file. +If @option{--separator} is specified, then @var{lines} determines +the number of records. For compatibility @command{split} also supports an obsolete option syntax @option{-@var{lines}}. New scripts should use @@ -3412,9 +3414,11 @@ Put @var{size} bytes of @var{input} into each output file. @opindex -C @opindex --line-bytes Put into each output file as many complete lines of @var{input} as -possible without exceeding @var{size} bytes. Individual lines longer than -@var{size} bytes are broken into multiple files. +possible without exceeding @var{size} bytes. Individual lines or records +longer than @var{size} bytes are broken into multiple files. @var{size} has the same format as for the @option{--bytes} option. +If @option{--separator} is specified, then @var{lines} determines +the number of records. 
@item --filter=@var{command} @opindex --filter @@ -3445,7 +3449,7 @@ Split @var{input} to @var{chunks} output files where @var{chunks} may be: @example @var{n} generate @var{n} files based on current size of @var{input} @var{k}/@var{n} only output @var{k}th of @var{n} to stdout -l/@var{n} generate @var{n} files without splitting lines +l/@var{n} generate @var{n} files without splitting lines or records l/@var{k}/@var{n} likewise but only output @var{k}th of @var{n} to stdout r/@var{n} like @samp{l} but use round robin distribution r/@var{k}/@var{n} likewise but only output @var{k}th of @var{n} to stdout @@ -3462,10 +3466,10 @@ or the @var{input} is truncated. For @samp{l} mode, chunks are approximately @var{input} size / @var{n}. The @var{input} is partitioned into @var{n} equal sized portions, with the last assigned any excess. If a line @emph{starts} within a partition -it is written completely to the corresponding file. Since lines +it is written completely to the corresponding file. Since lines or records are not split even if they overlap a partition, the files written can be larger or smaller than the partition size, and even empty -if a line is so long as to completely overlap the partition. +if a line/record is so long as to completely overlap the partition. For @samp{r} mode, the size of @var{input} is irrelevant, and so can be a pipe for example. @@ -3505,6 +3509,17 @@ than the number requested, or if a line is so long as to completely span a chunk. The output file sequence numbers, always run consecutively even when this option is specified. +@item -t @var{separator} +@itemx --separator=@var{separator} +@opindex -t +@opindex --separator +@cindex line separator character +@cindex record separator character +Use character @var{separator} as the record separator instead of the default +newline character (ASCII LF). +To specify ASCII NUL as the separator, use the two-character string @samp{\0}, +e.g., @samp{split -t '\0'}. 
+ @item -u @itemx --unbuffered @opindex -u diff --git a/src/split.c b/src/split.c index ef672f4fd..d17616c17 100644 --- a/src/split.c +++ b/src/split.c @@ -16,10 +16,9 @@ /* By tege@sics.se, with rms. - To do: - * Implement -t CHAR or -t REGEX to specify break characters other - than newline. */ - + TODO: + * support -p REGEX as in BSD's split. + * support --suppress-matched as in csplit. */ #include <config.h> #include <assert.h> @@ -108,6 +107,9 @@ static bool elide_empty_files; input to output, which is much slower, so disabled by default. */ static bool unbuffered; +/* The character marking end of line. Defaults to \n below. */ +static int eolchar = -1; + /* The split mode to use. */ enum Split_type { @@ -139,6 +141,7 @@ static struct option const longopts[] = {"numeric-suffixes", optional_argument, NULL, 'd'}, {"filter", required_argument, NULL, FILTER_OPTION}, {"verbose", no_argument, NULL, VERBOSE_OPTION}, + {"separator", required_argument, NULL, 't'}, {"-io-blksize", required_argument, NULL, IO_BLKSIZE_OPTION}, /* do not document */ {GETOPT_HELP_OPTION_DECL}, @@ -216,13 +219,15 @@ is -, read standard input.\n\ -a, --suffix-length=N generate suffixes of length N (default %d)\n\ --additional-suffix=SUFFIX append an additional SUFFIX to file names\n\ -b, --bytes=SIZE put SIZE bytes per output file\n\ - -C, --line-bytes=SIZE put at most SIZE bytes of lines per output file\n\ + -C, --line-bytes=SIZE put at most SIZE bytes of records per output file\n\ -d, --numeric-suffixes[=FROM] use numeric suffixes instead of alphabetic;\n\ FROM changes the start value (default 0)\n\ -e, --elide-empty-files do not generate empty output files with '-n'\n\ --filter=COMMAND write to shell COMMAND; file name is $FILE\n\ - -l, --lines=NUMBER put NUMBER lines per output file\n\ + -l, --lines=NUMBER put NUMBER lines/records per output file\n\ -n, --number=CHUNKS generate CHUNKS output files; see explanation below\n\ + -t, --separator=SEP use SEP instead of newline as the record 
separator;\n\ + '\\0' (zero) specifies the NUL character\n\ -u, --unbuffered immediately copy input to output with '-n r/...'\n\ "), DEFAULT_SUFFIX_LENGTH); fputs (_("\ @@ -236,8 +241,8 @@ is -, read standard input.\n\ CHUNKS may be:\n\ N split into N files based on size of input\n\ K/N output Kth of N to stdout\n\ - l/N split into N files without splitting lines\n\ - l/K/N output Kth of N to stdout without splitting lines\n\ + l/N split into N files without splitting lines/records\n\ + l/K/N output Kth of N to stdout without splitting lines/records\n\ r/N like 'l' but use round robin distribution\n\ r/K/N likewise but only output Kth of N to stdout\n\ "), stdout); @@ -630,10 +635,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize) error (EXIT_FAILURE, errno, "%s", infile); bp = bp_out = buf; eob = bp + n_read; - *eob = '\n'; + *eob = eolchar; while (true) { - bp = memchr (bp, '\n', eob - bp + 1); + bp = memchr (bp, eolchar, eob - bp + 1); if (bp == eob) { if (eob != bp_out) /* do not write 0 bytes! */ @@ -692,10 +697,10 @@ line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) /* Have enough for split. */ split_rest = n_bytes - n_out - n_hold; eoc = sob + split_rest - 1; - eol = memrchr (sob, '\n', split_rest); + eol = memrchr (sob, eolchar, split_rest); } else - eol = memrchr (sob, '\n', n_left); + eol = memrchr (sob, eolchar, n_left); /* Output hold space if possible. */ if (n_hold && !(!eol && n_out)) @@ -833,7 +838,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize, /* Begin looking for '\n' at last byte of chunk. */ off_t skip = MIN (n_read, MAX (0, chunk_end - n_written)); - char *bp_out = memchr (bp + skip, '\n', n_read - skip); + char *bp_out = memchr (bp + skip, eolchar, n_read - skip); if (bp_out++) next = true; else @@ -1080,7 +1085,7 @@ lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize) bool next = false; /* Find end of line. 
*/ - char *bp_out = memchr (bp, '\n', eob - bp); + char *bp_out = memchr (bp, eolchar, eob - bp); if (bp_out) { bp_out++; @@ -1224,7 +1229,7 @@ main (int argc, char **argv) int this_optind = optind ? optind : 1; char *slash; - c = getopt_long (argc, argv, "0123456789C:a:b:del:n:u", + c = getopt_long (argc, argv, "0123456789C:a:b:del:n:t:u", longopts, NULL); if (c == -1) break; @@ -1303,6 +1308,36 @@ main (int argc, char **argv) unbuffered = true; break; + case 't': + { + char neweol = optarg[0]; + if (! neweol) + error (EXIT_FAILURE, 0, _("empty record separator")); + if (optarg[1]) + { + if (STREQ (optarg, "\\0")) + neweol = '\0'; + else + { + /* Provoke with 'split -txx'. Complain about + "multi-character tab" instead of "multibyte tab", so + that the diagnostic's wording does not need to be + changed once multibyte characters are supported. */ + error (EXIT_FAILURE, 0, _("multi-character separator %s"), + quote (optarg)); + } + } + /* Make it explicit we don't support multiple separators. */ + if (0 <= eolchar && neweol != eolchar) + { + error (EXIT_FAILURE, 0, + _("multiple separator characters specified")); + } + + eolchar = neweol; + } + break; + case '0': case '1': case '2': @@ -1398,6 +1433,9 @@ main (int argc, char **argv) usage (EXIT_FAILURE); } + if (eolchar < 0) + eolchar = '\n'; + set_suffix_length (n_units, split_type); /* Get out the filename arguments. 
*/ diff --git a/tests/local.mk b/tests/local.mk index 6fc859961..5dcbd5595 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -358,6 +358,7 @@ all_tests = \ tests/split/line-bytes.sh \ tests/split/l-chunk.sh \ tests/split/r-chunk.sh \ + tests/split/record-sep.sh \ tests/split/numeric.sh \ tests/split/guard-input.sh \ tests/misc/stat-birthtime.sh \ @@ -402,7 +403,7 @@ all_tests = \ tests/misc/xattr.sh \ tests/tail-2/wait.sh \ tests/tail-2/retry.sh \ - tests/tail-2/symlink.sh \ + tests/tail-2/symlink.sh \ tests/tail-2/tail-c.sh \ tests/chmod/c-option.sh \ tests/chmod/equal-x.sh \ @@ -483,7 +484,7 @@ all_tests = \ tests/dd/ascii.sh \ tests/dd/direct.sh \ tests/dd/misc.sh \ - tests/dd/no-allocate.sh \ + tests/dd/no-allocate.sh \ tests/dd/nocache.sh \ tests/dd/not-rewound.sh \ tests/dd/reblock.sh \ diff --git a/tests/split/record-sep.sh b/tests/split/record-sep.sh new file mode 100755 index 000000000..f41215a45 --- /dev/null +++ b/tests/split/record-sep.sh @@ -0,0 +1,78 @@ +#!/bin/sh +# test split with custom record separators + +# Copyright (C) 2015 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +. 
"${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ split + +NL=' +' + +for sep in "$NL" '\0' ':'; do + + test "$sep" = "$NL" && tr='\n' || tr="$sep" + + for mode in '--lines=2' '--line-bytes=4' '--number=l/3' '--number=r/3'; do + + # Generate in default mode for comparison + printf '1\n2\n3\n4\n5\n' > in || framework_failure_ + split $mode in || fail=1 + tr '\n' "$tr" < xaa > exp1 + tr '\n' "$tr" < xab > exp2 + tr '\n' "$tr" < xac > exp3 + + rm -f x?? + + # Generate output with specified --separator + printf '1\n2\n3\n4\n5\n' | tr '\n' "$tr" > in || framework_failure_ + split $mode -t "$sep" in || fail=1 + + compare exp1 xaa || fail=1 + compare exp2 xab || fail=1 + compare exp3 xac || fail=1 + test -f xad && fail=1 + done + +done + + +# +# Test usage edge cases +# + +# Should fail: '-t' requires an argument +{ split -t </dev/null >/dev/null 2>/dev/null || test $? -ne 1; } && + { warn_ "-t without argument did not trigger an error" ; fail=1 ; } + +# should fail: multi-character separator +{ split -txx </dev/null >/dev/null 2>&1 || test $? -ne 1; } && + { warn_ "-txx did not trigger an error" ; fail=1 ; } + +# should fail: different separators used +{ split -ta -tb </dev/null >/dev/null 2>&1 || test $? -ne 1; } && + { warn_ "-ta -tb did not trigger an error" ; fail=1 ; } + +# should fail: different separators used, including default +{ split -t"$NL" -tb </dev/null >/dev/null 2>&1 || test $? -ne 1; } && + { warn_ "-t\$NL -tb did not trigger an error" ; fail=1 ; } + +# should not fail: same separator used multiple times +split -t: -t: </dev/null >/dev/null 2>&1 || + { warn_ "-t: -t: triggered an error" ; fail=1 ; } + + +Exit $fail |