diff options
author | Assaf Gordon <assafgordon@gmail.com> | 2015-01-07 18:30:28 -0500 |
---|---|---|
committer | Pádraig Brady <P@draigBrady.com> | 2015-01-19 23:22:37 +0000 |
commit | 4c795d543908ea4715b3e0bd6c6cf908315936d8 (patch) | |
tree | 74e9d10d130ce903bf9053508a42f9cb3f48858a /src/split.c | |
parent | c4c2a09cc804afb338efa5ccedffa269888c4685 (diff) | |
download | coreutils-4c795d543908ea4715b3e0bd6c6cf908315936d8.tar.xz |
split: new -t option to select record separator
* src/split.c (eolchar): A new variable to hold
the separator character (unibyte for now).
This is referenced throughout rather than hardcoding '\n'.
(usage): Describe the new --separator option, and
mention records along with lines so there is no ambiguity
that all options treat lines and records equivalently.
(main): Have -t update eolchar, or default to '\n'.
* tests/split/record-sep.sh: New test case.
* tests/local.mk: Reference the new test.
* doc/coreutils.texi (split invocation): Document the new option.
Adjust --lines, --line-bytes, --number=[lr]/... to mention
they pertain to records if --separator is specified.
* NEWS: Mention the new feature.
Diffstat (limited to 'src/split.c')
-rw-r--r-- | src/split.c | 68 |
1 files changed, 53 insertions, 15 deletions
diff --git a/src/split.c b/src/split.c index ef672f4fd..d17616c17 100644 --- a/src/split.c +++ b/src/split.c @@ -16,10 +16,9 @@ /* By tege@sics.se, with rms. - To do: - * Implement -t CHAR or -t REGEX to specify break characters other - than newline. */ - + TODO: + * support -p REGEX as in BSD's split. + * support --suppress-matched as in csplit. */ #include <config.h> #include <assert.h> @@ -108,6 +107,9 @@ static bool elide_empty_files; input to output, which is much slower, so disabled by default. */ static bool unbuffered; +/* The character marking end of line. Defaults to \n below. */ +static int eolchar = -1; + /* The split mode to use. */ enum Split_type { @@ -139,6 +141,7 @@ static struct option const longopts[] = {"numeric-suffixes", optional_argument, NULL, 'd'}, {"filter", required_argument, NULL, FILTER_OPTION}, {"verbose", no_argument, NULL, VERBOSE_OPTION}, + {"separator", required_argument, NULL, 't'}, {"-io-blksize", required_argument, NULL, IO_BLKSIZE_OPTION}, /* do not document */ {GETOPT_HELP_OPTION_DECL}, @@ -216,13 +219,15 @@ is -, read standard input.\n\ -a, --suffix-length=N generate suffixes of length N (default %d)\n\ --additional-suffix=SUFFIX append an additional SUFFIX to file names\n\ -b, --bytes=SIZE put SIZE bytes per output file\n\ - -C, --line-bytes=SIZE put at most SIZE bytes of lines per output file\n\ + -C, --line-bytes=SIZE put at most SIZE bytes of records per output file\n\ -d, --numeric-suffixes[=FROM] use numeric suffixes instead of alphabetic;\n\ FROM changes the start value (default 0)\n\ -e, --elide-empty-files do not generate empty output files with '-n'\n\ --filter=COMMAND write to shell COMMAND; file name is $FILE\n\ - -l, --lines=NUMBER put NUMBER lines per output file\n\ + -l, --lines=NUMBER put NUMBER lines/records per output file\n\ -n, --number=CHUNKS generate CHUNKS output files; see explanation below\n\ + -t, --separator=SEP use SEP instead of newline as the record separator;\n\ + '\\0' (zero) specifies the NUL 
character\n\ -u, --unbuffered immediately copy input to output with '-n r/...'\n\ "), DEFAULT_SUFFIX_LENGTH); fputs (_("\ @@ -236,8 +241,8 @@ is -, read standard input.\n\ CHUNKS may be:\n\ N split into N files based on size of input\n\ K/N output Kth of N to stdout\n\ - l/N split into N files without splitting lines\n\ - l/K/N output Kth of N to stdout without splitting lines\n\ + l/N split into N files without splitting lines/records\n\ + l/K/N output Kth of N to stdout without splitting lines/records\n\ r/N like 'l' but use round robin distribution\n\ r/K/N likewise but only output Kth of N to stdout\n\ "), stdout); @@ -630,10 +635,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize) error (EXIT_FAILURE, errno, "%s", infile); bp = bp_out = buf; eob = bp + n_read; - *eob = '\n'; + *eob = eolchar; while (true) { - bp = memchr (bp, '\n', eob - bp + 1); + bp = memchr (bp, eolchar, eob - bp + 1); if (bp == eob) { if (eob != bp_out) /* do not write 0 bytes! */ @@ -692,10 +697,10 @@ line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) /* Have enough for split. */ split_rest = n_bytes - n_out - n_hold; eoc = sob + split_rest - 1; - eol = memrchr (sob, '\n', split_rest); + eol = memrchr (sob, eolchar, split_rest); } else - eol = memrchr (sob, '\n', n_left); + eol = memrchr (sob, eolchar, n_left); /* Output hold space if possible. */ if (n_hold && !(!eol && n_out)) @@ -833,7 +838,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize, /* Begin looking for '\n' at last byte of chunk. */ off_t skip = MIN (n_read, MAX (0, chunk_end - n_written)); - char *bp_out = memchr (bp + skip, '\n', n_read - skip); + char *bp_out = memchr (bp + skip, eolchar, n_read - skip); if (bp_out++) next = true; else @@ -1080,7 +1085,7 @@ lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize) bool next = false; /* Find end of line. 
*/ - char *bp_out = memchr (bp, '\n', eob - bp); + char *bp_out = memchr (bp, eolchar, eob - bp); if (bp_out) { bp_out++; @@ -1224,7 +1229,7 @@ main (int argc, char **argv) int this_optind = optind ? optind : 1; char *slash; - c = getopt_long (argc, argv, "0123456789C:a:b:del:n:u", + c = getopt_long (argc, argv, "0123456789C:a:b:del:n:t:u", longopts, NULL); if (c == -1) break; @@ -1303,6 +1308,36 @@ main (int argc, char **argv) unbuffered = true; break; + case 't': + { + char neweol = optarg[0]; + if (! neweol) + error (EXIT_FAILURE, 0, _("empty record separator")); + if (optarg[1]) + { + if (STREQ (optarg, "\\0")) + neweol = '\0'; + else + { + /* Provoke with 'split -txx'. Complain about + "multi-character tab" instead of "multibyte tab", so + that the diagnostic's wording does not need to be + changed once multibyte characters are supported. */ + error (EXIT_FAILURE, 0, _("multi-character separator %s"), + quote (optarg)); + } + } + /* Make it explicit we don't support multiple separators. */ + if (0 <= eolchar && neweol != eolchar) + { + error (EXIT_FAILURE, 0, + _("multiple separator characters specified")); + } + + eolchar = neweol; + } + break; + case '0': case '1': case '2': @@ -1398,6 +1433,9 @@ main (int argc, char **argv) usage (EXIT_FAILURE); } + if (eolchar < 0) + eolchar = '\n'; + set_suffix_length (n_units, split_type); /* Get out the filename arguments. */ |