summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Pádraig Brady <P@draigBrady.com> 2013-04-20 08:46:43 +0100
committer: Pádraig Brady <P@draigBrady.com> 2013-05-23 11:13:24 +0100
commit: fec363cebf581ef27f6d01686dec5a0499aa6818 (patch)
tree: 59d03cfaae216cc0dc4f9ec221f375fe7b605b14
parent: 478dade09a4288f73e963b7f185ef9f73b681b42 (diff)
download: coreutils-fec363cebf581ef27f6d01686dec5a0499aa6818.tar.xz
split: with --line-bytes only allocate memory as needed
* src/split.c (line_bytes_split): Rewrite to only buffer when necessary. I.e., only increase the buffer when we've already output lines in a split and we encounter a line larger than the input buffer size, in which case a hold buffer will be increased in increments of the input buffer size.
(lines_rr): Use the more abstract xalloc_die() just like we did in line_bytes_split(), rather than explicitly printing the "memory exhausted" message and exiting.
* tests/split/line-bytes.sh: Add a new test for this function which previously had no test coverage.
* tests/local.mk: Reference the new test.
* NEWS: Mention the improvement.
Fixes http://bugs.gnu.org/13537
-rw-r--r-- NEWS 3
-rw-r--r-- src/split.c 139
-rw-r--r-- tests/local.mk 1
-rwxr-xr-x tests/split/line-bytes.sh 86
4 files changed, 186 insertions, 43 deletions
diff --git a/NEWS b/NEWS
index eec93dfd0..721e05bf5 100644
--- a/NEWS
+++ b/NEWS
@@ -48,6 +48,9 @@ GNU coreutils NEWS -*- outline -*-
Reservoir sampling is used to limit memory usage based on the number of
outputs, rather than the number of inputs.
+ split --line-bytes=SIZE now only allocates memory as needed rather
+ than allocating SIZE bytes at program start.
+
** Build-related
factor now builds on aarch64 based systems [bug introduced in coreutils-8.20]
diff --git a/src/split.c b/src/split.c
index e5e75f0f5..feabe39b1 100644
--- a/src/split.c
+++ b/src/split.c
@@ -616,62 +616,115 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
}
while (n_read == bufsize);
}
-
+
/* Split into pieces that are as large as possible while still not more
than N_BYTES bytes, and are split on line boundaries except
- where lines longer than N_BYTES bytes occur.
- FIXME: Allow N_BYTES to be any uintmax_t value, and don't require a
- buffer of size N_BYTES, in case N_BYTES is very large. */
+ where lines longer than N_BYTES bytes occur. */
static void
-line_bytes_split (size_t n_bytes)
+line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
{
- char *bp;
- bool eof = false;
- size_t n_buffered = 0;
- char *buf = xmalloc (n_bytes);
+ size_t n_read;
+ uintmax_t n_out = 0; /* for each split. */
+ size_t n_hold = 0;
+ char *hold = NULL; /* for lines > bufsize. */
+ size_t hold_size = 0;
+ bool split_line = false; /* Whether a \n was output in a split. */
do
{
- /* Fill up the full buffer size from the input file. */
-
- size_t to_read = n_bytes - n_buffered;
- size_t n_read = full_read (STDIN_FILENO, buf + n_buffered, to_read);
- if (n_read < to_read && errno)
+ n_read = full_read (STDIN_FILENO, buf, bufsize);
+ if (n_read < bufsize && errno)
error (EXIT_FAILURE, errno, "%s", infile);
-
- n_buffered += n_read;
- if (n_buffered != n_bytes)
+ size_t n_left = n_read;
+ char *sob = buf;
+ while (n_left)
{
- if (n_buffered == 0)
- break;
- eof = true;
- }
+ size_t split_rest = 0;
+ char *eoc = NULL;
+ char *eol;
- /* Find where to end this chunk. */
- bp = buf + n_buffered;
- if (n_buffered == n_bytes)
- {
- while (bp > buf && bp[-1] != '\n')
- bp--;
- }
+ /* Determine End Of Chunk and/or End of Line,
+ which are used below to select what to write or buffer. */
+ if (n_bytes - n_out - n_hold <= n_left)
+ {
+ /* Have enough for split. */
+ split_rest = n_bytes - n_out - n_hold;
+ eoc = sob + split_rest - 1;
+ eol = memrchr (sob, '\n', split_rest);
+ }
+ else
+ eol = memrchr (sob, '\n', n_left);
- /* If chunk has no newlines, use all the chunk. */
- if (bp == buf)
- bp = buf + n_buffered;
+ /* Output hold space if possible. */
+ if (n_hold && !(!eol && n_out))
+ {
+ cwrite (n_out == 0, hold, n_hold);
+ n_out += n_hold;
+ if (n_hold > bufsize)
+ hold = xrealloc (hold, bufsize);
+ n_hold = 0;
+ hold_size = bufsize;
+ }
- /* Output the chars as one output file. */
- cwrite (true, buf, bp - buf);
+ /* Output to eol if present. */
+ if (eol)
+ {
+ split_line = true;
+ size_t n_write = eol - sob + 1;
+ cwrite (n_out == 0, sob, n_write);
+ n_out += n_write;
+ n_left -= n_write;
+ sob += n_write;
+ if (eoc)
+ split_rest -= n_write;
+ }
+
+ /* Output to eoc or eob if possible. */
+ if (n_left && !split_line)
+ {
+ size_t n_write = eoc ? split_rest : n_left;
+ cwrite (n_out == 0, sob, n_write);
+ n_out += n_write;
+ n_left -= n_write;
+ sob += n_write;
+ if (eoc)
+ split_rest -= n_write;
+ }
+
+ /* Update hold if needed. */
+ if ((eoc && split_rest) || (!eoc && n_left))
+ {
+ size_t n_buf = eoc ? split_rest : n_left;
+ if (hold_size - n_hold < n_buf)
+ {
+ if (hold_size <= SIZE_MAX - bufsize)
+ hold_size += bufsize;
+ else
+ xalloc_die ();
+ hold = xrealloc (hold, hold_size);
+ }
+ memcpy (hold + n_hold, sob, n_buf);
+ n_hold += n_buf;
+ n_left -= n_buf;
+ sob += n_buf;
+ }
- /* Discard the chars we just output; move rest of chunk
- down to be the start of the next chunk. Source and
- destination probably overlap. */
- n_buffered -= bp - buf;
- if (n_buffered > 0)
- memmove (buf, bp, n_buffered);
+ /* Reset for new split. */
+ if (eoc)
+ {
+ n_out = 0;
+ split_line = false;
+ }
+ }
}
- while (!eof);
- free (buf);
+ while (n_read == bufsize);
+
+ /* Handle no eol at end of file. */
+ if (n_hold)
+ cwrite (n_out == 0, hold, n_hold);
+
+ free (hold);
}
/* -n l/[K/]N: Write lines to files of approximately file size / N.
@@ -926,7 +979,7 @@ lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize)
else
{
if (SIZE_MAX < n)
- error (exit_failure, 0, "%s", _("memory exhausted"));
+ xalloc_die ();
files = xnmalloc (n, sizeof *files);
/* Generate output file names. */
@@ -1408,7 +1461,7 @@ main (int argc, char **argv)
break;
case type_byteslines:
- line_bytes_split (n_units);
+ line_bytes_split (n_units, buf, in_blk_size);
break;
case type_chunk_bytes:
diff --git a/tests/local.mk b/tests/local.mk
index 5ec7d9859..58b7958c2 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -345,6 +345,7 @@ all_tests = \
tests/split/b-chunk.sh \
tests/split/fail.sh \
tests/split/lines.sh \
+ tests/split/line-bytes.sh \
tests/split/l-chunk.sh \
tests/split/r-chunk.sh \
tests/split/numeric.sh \
diff --git a/tests/split/line-bytes.sh b/tests/split/line-bytes.sh
new file mode 100755
index 000000000..c58f12b21
--- /dev/null
+++ b/tests/split/line-bytes.sh
@@ -0,0 +1,86 @@
+#!/bin/sh
+# test -C, --line-bytes
+
+# Copyright (C) 2013 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ split
+require_ulimit_v_
+
+
+# Ensure memory is not allocated up front
+(ulimit -v 20000; split -C 'E' /dev/null) || fail=1
+
+
+# Ensure correct operation with various split and buffer size combinations
+
+lines=\
+1~2222~3~4
+
+printf '%s' "$lines" | tr '~' '\n' > in || framework_failure_
+
+cat <<\EOF > splits_exp
+1 1 1 1 1 1 1 1 1 1
+2 2 2 1 2 1
+2 3 2 2 1
+2 4 3 1
+2 5 3
+2 5 3
+7 3
+7 3
+9 1
+9 1
+10
+EOF
+
+seq 0 9 | tr -d '\n' > no_eol_in
+
+cat <<\EOF > no_eol_splits_exp
+1 1 1 1 1 1 1 1 1 1
+2 2 2 2 2
+3 3 3 1
+4 4 2
+5 5
+6 4
+7 3
+8 2
+9 1
+10
+10
+EOF
+
+for b in $(seq 10); do
+ : > splits
+ : > no_eol_splits
+ for s in $(seq 11); do
+ rm x??
+ split ---io=$b -C$s in || fail=1
+ cat x* > out || framework_failure_
+ compare in out || fail=1
+ stat -c %s x* | paste -s -d ' ' >> splits
+
+ rm x??
+ split ---io=$b -C$s no_eol_in || fail=1
+ cat x* > out || framework_failure_
+ cat xaa
+ compare no_eol_in out || fail=1
+ stat -c %s x* | paste -s -d ' ' >> no_eol_splits
+ done
+ compare splits_exp splits || fail=1
+ compare no_eol_splits_exp no_eol_splits || fail=1
+done
+
+Exit $fail