diff options
author | Pádraig Brady <P@draigBrady.com> | 2013-04-20 08:46:43 +0100 |
---|---|---|
committer | Pádraig Brady <P@draigBrady.com> | 2013-05-23 11:13:24 +0100 |
commit | fec363cebf581ef27f6d01686dec5a0499aa6818 (patch) | |
tree | 59d03cfaae216cc0dc4f9ec221f375fe7b605b14 | |
parent | 478dade09a4288f73e963b7f185ef9f73b681b42 (diff) | |
download | coreutils-fec363cebf581ef27f6d01686dec5a0499aa6818.tar.xz |
split: with --line-bytes only allocate memory as needed
* src/split.c (line_bytes_split): Rewrite to only buffer
when necessary. I.E. only increase the buffer when we've
already lines output in a split and we encounter a line
larger than the input buffer size, in which case a hold
buffer will be increased in increments of the input buffer size.
(lines_rr): Use the more abstract xalloc_die() just like
we did in line_bytes_split(), rather than explicitly
printing the "memory exhausted" message and exiting.
* tests/split/line-bytes.sh: Add a new test for this
function which previously had no test coverage.
* tests/local.mk: Reference the new test.
* NEWS: Mention the improvement.
Fixes http://bugs.gnu.org/13537
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | src/split.c | 139 | ||||
-rw-r--r-- | tests/local.mk | 1 | ||||
-rwxr-xr-x | tests/split/line-bytes.sh | 86 |
4 files changed, 186 insertions, 43 deletions
@@ -48,6 +48,9 @@ GNU coreutils NEWS -*- outline -*- Reservoir sampling is used to limit memory usage based on the number of outputs, rather than the number of inputs. + split --line-bytes=SIZE, now only allocates memory as needed rather + than allocating SIZE bytes at program start. + ** Build-related factor now builds on aarch64 based systems [bug introduced in coreutils-8.20] diff --git a/src/split.c b/src/split.c index e5e75f0f5..feabe39b1 100644 --- a/src/split.c +++ b/src/split.c @@ -616,62 +616,115 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize) } while (n_read == bufsize); } - + /* Split into pieces that are as large as possible while still not more than N_BYTES bytes, and are split on line boundaries except - where lines longer than N_BYTES bytes occur. - FIXME: Allow N_BYTES to be any uintmax_t value, and don't require a - buffer of size N_BYTES, in case N_BYTES is very large. */ + where lines longer than N_BYTES bytes occur. */ static void -line_bytes_split (size_t n_bytes) +line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize) { - char *bp; - bool eof = false; - size_t n_buffered = 0; - char *buf = xmalloc (n_bytes); + size_t n_read; + uintmax_t n_out = 0; /* for each split. */ + size_t n_hold = 0; + char *hold = NULL; /* for lines > bufsize. */ + size_t hold_size = 0; + bool split_line = false; /* Whether a \n was output in a split. */ do { - /* Fill up the full buffer size from the input file. */ - - size_t to_read = n_bytes - n_buffered; - size_t n_read = full_read (STDIN_FILENO, buf + n_buffered, to_read); - if (n_read < to_read && errno) + n_read = full_read (STDIN_FILENO, buf, bufsize); + if (n_read < bufsize && errno) error (EXIT_FAILURE, errno, "%s", infile); - - n_buffered += n_read; - if (n_buffered != n_bytes) + size_t n_left = n_read; + char *sob = buf; + while (n_left) { - if (n_buffered == 0) - break; - eof = true; - } + size_t split_rest = 0; + char *eoc = NULL; + char *eol; - /* Find where to end this chunk. */ - bp = buf + n_buffered; - if (n_buffered == n_bytes) - { - while (bp > buf && bp[-1] != '\n') - bp--; - } + /* Determine End Of Chunk and/or End of Line, + which are used below to select what to write or buffer. */ + if (n_bytes - n_out - n_hold <= n_left) + { + /* Have enough for split. */ + split_rest = n_bytes - n_out - n_hold; + eoc = sob + split_rest - 1; + eol = memrchr (sob, '\n', split_rest); + } + else + eol = memrchr (sob, '\n', n_left); - /* If chunk has no newlines, use all the chunk. */ - if (bp == buf) - bp = buf + n_buffered; + /* Output hold space if possible. */ + if (n_hold && !(!eol && n_out)) + { + cwrite (n_out == 0, hold, n_hold); + n_out += n_hold; + if (n_hold > bufsize) + hold = xrealloc (hold, bufsize); + n_hold = 0; + hold_size = bufsize; + } - /* Output the chars as one output file. */ - cwrite (true, buf, bp - buf); + /* Output to eol if present. */ + if (eol) + { + split_line = true; + size_t n_write = eol - sob + 1; + cwrite (n_out == 0, sob, n_write); + n_out += n_write; + n_left -= n_write; + sob += n_write; + if (eoc) + split_rest -= n_write; + } + + /* Output to eoc or eob if possible. */ + if (n_left && !split_line) + { + size_t n_write = eoc ? split_rest : n_left; + cwrite (n_out == 0, sob, n_write); + n_out += n_write; + n_left -= n_write; + sob += n_write; + if (eoc) + split_rest -= n_write; + } + + /* Update hold if needed. */ + if ((eoc && split_rest) || (!eoc && n_left)) + { + size_t n_buf = eoc ? split_rest : n_left; + if (hold_size - n_hold < n_buf) + { + if (hold_size <= SIZE_MAX - bufsize) + hold_size += bufsize; + else + xalloc_die (); + hold = xrealloc (hold, hold_size); + } + memcpy (hold + n_hold, sob, n_buf); + n_hold += n_buf; + n_left -= n_buf; + sob += n_buf; + } - /* Discard the chars we just output; move rest of chunk - down to be the start of the next chunk. Source and - destination probably overlap. */ - n_buffered -= bp - buf; - if (n_buffered > 0) - memmove (buf, bp, n_buffered); + /* Reset for new split. */ + if (eoc) + { + n_out = 0; + split_line = false; + } + } } - while (!eof); - free (buf); + while (n_read == bufsize); + + /* Handle no eol at end of file. */ + if (n_hold) + cwrite (n_out == 0, hold, n_hold); + + free (hold); } /* -n l/[K/]N: Write lines to files of approximately file size / N. @@ -926,7 +979,7 @@ lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize) else { if (SIZE_MAX < n) - error (exit_failure, 0, "%s", _("memory exhausted")); + xalloc_die (); files = xnmalloc (n, sizeof *files); /* Generate output file names. */ @@ -1408,7 +1461,7 @@ main (int argc, char **argv) break; case type_byteslines: - line_bytes_split (n_units); + line_bytes_split (n_units, buf, in_blk_size); break; case type_chunk_bytes: diff --git a/tests/local.mk b/tests/local.mk index 5ec7d9859..58b7958c2 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -345,6 +345,7 @@ all_tests = \ tests/split/b-chunk.sh \ tests/split/fail.sh \ tests/split/lines.sh \ + tests/split/line-bytes.sh \ tests/split/l-chunk.sh \ tests/split/r-chunk.sh \ tests/split/numeric.sh \ diff --git a/tests/split/line-bytes.sh b/tests/split/line-bytes.sh new file mode 100755 index 000000000..c58f12b21 --- /dev/null +++ b/tests/split/line-bytes.sh @@ -0,0 +1,86 @@ +#!/bin/sh +# test -C, --lines-bytes + +# Copyright (C) 2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ split +require_ulimit_v_ + + +# Ensure memory is not allocated up front +(ulimit -v 20000; split -C 'E' /dev/null) || fail=1 + + +# Ensure correct operation with various split and buffer size combinations + +lines=\ +1~2222~3~4 + +printf '%s' "$lines" | tr '~' '\n' > in || framework_failure_ + +cat <<\EOF > splits_exp +1 1 1 1 1 1 1 1 1 1 +2 2 2 1 2 1 +2 3 2 2 1 +2 4 3 1 +2 5 3 +2 5 3 +7 3 +7 3 +9 1 +9 1 +10 +EOF + +seq 0 9 | tr -d '\n' > no_eol_in + +cat <<\EOF > no_eol_splits_exp +1 1 1 1 1 1 1 1 1 1 +2 2 2 2 2 +3 3 3 1 +4 4 2 +5 5 +6 4 +7 3 +8 2 +9 1 +10 +10 +EOF + +for b in $(seq 10); do + : > splits + : > no_eol_splits + for s in $(seq 11); do + rm x?? + split ---io=$b -C$s in || fail=1 + cat x* > out || framework_failure_ + compare in out || fail=1 + stat -c %s x* | paste -s -d ' ' >> splits + + rm x?? + split ---io=$b -C$s no_eol_in || fail=1 + cat x* > out || framework_failure_ + cat xaa + compare no_eol_in out || fail=1 + stat -c %s x* | paste -s -d ' ' >> no_eol_splits + done + compare splits_exp splits || fail=1 + compare no_eol_splits_exp no_eol_splits || fail=1 +done + +Exit $fail |