summaryrefslogtreecommitdiff
path: root/tests
diff options
context:
space:
mode:
authorAssaf Gordon <assafgordon@gmail.com>2013-03-06 18:25:49 -0500
committerPádraig Brady <P@draigBrady.com>2013-03-25 20:07:14 +0000
commit20d7bce0f7e57d9a98f0ee811e31c757e9fedfff (patch)
tree0c3a560e7bbd16762807e0b2fc28d32670549eff /tests
parent4c49dc823ff7da589ae58d8d8313d38a75fc8f64 (diff)
downloadcoreutils-20d7bce0f7e57d9a98f0ee811e31c757e9fedfff.tar.xz
shuf: use reservoir-sampling for large or unknown sized inputs
Reservoir sampling optimizes selecting K random lines from large or unknown-sized input: http://en.wikipedia.org/wiki/Reservoir_sampling Note this also avoids reading any input when -n0 is specified. * src/shuf.c (main): Use reservoir-sampling when the number of output lines is known, and the input size is large or unknown. (input_size): A new function to get the input size for regular files. (read_input_reservoir_sampling): New function to read lines from input, keeping only K lines in memory, replacing lines with decreasing prob. (write_permuted_output_reservoir): New function to output reservoir. * tests/misc/shuf-reservoir.sh: An expensive_ test using valgrind to exercise the reservoir-sampling code. * tests/local.mk: Reference new test. * NEWS: Mention the improvement.
Diffstat (limited to 'tests')
-rw-r--r--tests/local.mk1
-rwxr-xr-xtests/misc/shuf-reservoir.sh69
2 files changed, 70 insertions, 0 deletions
diff --git a/tests/local.mk b/tests/local.mk
index 607ddc4d9..dc87ef491 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -313,6 +313,7 @@ all_tests = \
tests/misc/shred-passes.sh \
tests/misc/shred-remove.sh \
tests/misc/shuf.sh \
+ tests/misc/shuf-reservoir.sh \
tests/misc/sort.pl \
tests/misc/sort-benchmark-random.sh \
tests/misc/sort-compress.sh \
diff --git a/tests/misc/shuf-reservoir.sh b/tests/misc/shuf-reservoir.sh
new file mode 100755
index 000000000..e971c594e
--- /dev/null
+++ b/tests/misc/shuf-reservoir.sh
@@ -0,0 +1,69 @@
+#!/bin/sh
+# Exercise shuf's reservoir-sampling code
+# NOTE:
+# These tests do not check valid randomness,
+# they just check memory allocation related code.
+
+# Copyright (C) 2013 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ shuf
+expensive_
+require_valgrind_
+
+# Run "shuf" with specific number of input lines and output lines
+# Check the output for expected number of lines.
+run_shuf_n()
+{
+ INPUT_LINES="$1"
+ OUTPUT_LINES="$2"
+
+ # Critical memory-related bugs will cause a segfault here
+ # (with varying numbres of input/output lines)
+ seq "$INPUT_LINES" | valgrind --leak-check=full --error-exitcode=1 \
+ shuf -n "$OUTPUT_LINES" -o "out_${INPUT_LINES}_${OUTPUT_LINES}" || return 1
+
+ EXPECTED_LINES="$OUTPUT_LINES"
+ test "$INPUT_LINES" -lt "$OUTPUT_LINES" && EXPECTED_LINES="$INPUT_LINES"
+
+ # There is no sure way to verify shuffled output (as it is random).
+ # Ensure we have the correct number of all numeric lines non duplicated lines.
+ GOOD_LINES=$(grep '^[0-9][0-9]*$' "out_${INPUT_LINES}_${OUTPUT_LINES}" |
+ sort -un | wc -l) || framework_failure_
+ LINES=$(wc -l < "out_${INPUT_LINES}_${OUTPUT_LINES}") || framework_failure_
+
+ test "$EXPECTED_LINES" -eq "$GOOD_LINES" || return 1
+ test "$EXPECTED_LINES" -eq "$LINES" || return 1
+
+ return 0
+}
+
+# Test multiple combinations of input lines and output lines.
+# (e.g. small number of input lines and large number of output lines,
+# and vice-versa. Also, each reservoir allocation uses a 1024-lines batch,
+# so test 1023/1024/1025 and related values).
+TEST_LINES="0 1 5 1023 1024 1025 3071 3072 3073"
+
+for IN_N in $TEST_LINES; do
+ for OUT_N in $TEST_LINES; do
+ run_shuf_n "$IN_N" "$OUT_N" || {
+ fail=1
+ echo "shuf-reservoir-sampling failed with IN_N=$IN_N OUT_N=$OUT_N" >&2;
+ }
+ done
+done
+
+Exit $fail