diff options
author | Assaf Gordon <assafgordon@gmail.com> | 2013-03-06 18:25:49 -0500 |
---|---|---|
committer | Pádraig Brady <P@draigBrady.com> | 2013-03-25 20:07:14 +0000 |
commit | 20d7bce0f7e57d9a98f0ee811e31c757e9fedfff (patch) | |
tree | 0c3a560e7bbd16762807e0b2fc28d32670549eff /tests | |
parent | 4c49dc823ff7da589ae58d8d8313d38a75fc8f64 (diff) | |
download | coreutils-20d7bce0f7e57d9a98f0ee811e31c757e9fedfff.tar.xz |
shuf: use reservoir-sampling for large or unknown sized inputs
Reservoir sampling optimizes selecting K random lines from large or
unknown-sized input: http://en.wikipedia.org/wiki/Reservoir_sampling
Note this also avoids reading any input when -n0 is specified.
* src/shuf.c (main): Use reservoir-sampling when the number of output
lines is known, and the input size is large or unknown.
(input_size): A new function to get the input size for regular files.
(read_input_reservoir_sampling): New function to read lines from input,
keeping only K lines in memory, replacing lines with decreasing prob.
(write_permuted_output_reservoir): New function to output reservoir.
* tests/misc/shuf-reservoir.sh: An expensive_ test using valgrind to
exercise the reservoir-sampling code.
* tests/local.mk: Reference new test.
* NEWS: Mention the improvement.
Diffstat (limited to 'tests')
-rw-r--r-- | tests/local.mk | 1 | ||||
-rwxr-xr-x | tests/misc/shuf-reservoir.sh | 69 |
2 files changed, 70 insertions, 0 deletions
diff --git a/tests/local.mk b/tests/local.mk index 607ddc4d9..dc87ef491 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -313,6 +313,7 @@ all_tests = \ tests/misc/shred-passes.sh \ tests/misc/shred-remove.sh \ tests/misc/shuf.sh \ + tests/misc/shuf-reservoir.sh \ tests/misc/sort.pl \ tests/misc/sort-benchmark-random.sh \ tests/misc/sort-compress.sh \ diff --git a/tests/misc/shuf-reservoir.sh b/tests/misc/shuf-reservoir.sh new file mode 100755 index 000000000..e971c594e --- /dev/null +++ b/tests/misc/shuf-reservoir.sh @@ -0,0 +1,69 @@ +#!/bin/sh +# Exercise shuf's reservoir-sampling code +# NOTE: +# These tests do not check valid randomness, +# they just check memory allocation related code. + +# Copyright (C) 2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ shuf +expensive_ +require_valgrind_ + +# Run "shuf" with specific number of input lines and output lines +# Check the output for expected number of lines. +run_shuf_n() +{ + INPUT_LINES="$1" + OUTPUT_LINES="$2" + + # Critical memory-related bugs will cause a segfault here + # (with varying numbres of input/output lines) + seq "$INPUT_LINES" | valgrind --leak-check=full --error-exitcode=1 \ + shuf -n "$OUTPUT_LINES" -o "out_${INPUT_LINES}_${OUTPUT_LINES}" || return 1 + + EXPECTED_LINES="$OUTPUT_LINES" + test "$INPUT_LINES" -lt "$OUTPUT_LINES" && EXPECTED_LINES="$INPUT_LINES" + + # There is no sure way to verify shuffled output (as it is random). + # Ensure we have the correct number of all numeric lines non duplicated lines. + GOOD_LINES=$(grep '^[0-9][0-9]*$' "out_${INPUT_LINES}_${OUTPUT_LINES}" | + sort -un | wc -l) || framework_failure_ + LINES=$(wc -l < "out_${INPUT_LINES}_${OUTPUT_LINES}") || framework_failure_ + + test "$EXPECTED_LINES" -eq "$GOOD_LINES" || return 1 + test "$EXPECTED_LINES" -eq "$LINES" || return 1 + + return 0 +} + +# Test multiple combinations of input lines and output lines. +# (e.g. small number of input lines and large number of output lines, +# and vice-versa. Also, each reservoir allocation uses a 1024-lines batch, +# so test 1023/1024/1025 and related values). +TEST_LINES="0 1 5 1023 1024 1025 3071 3072 3073" + +for IN_N in $TEST_LINES; do + for OUT_N in $TEST_LINES; do + run_shuf_n "$IN_N" "$OUT_N" || { + fail=1 + echo "shuf-reservoir-sampling failed with IN_N=$IN_N OUT_N=$OUT_N" >&2; + } + done +done + +Exit $fail |