From 20d7bce0f7e57d9a98f0ee811e31c757e9fedfff Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Wed, 6 Mar 2013 18:25:49 -0500 Subject: shuf: use reservoir-sampling for large or unknown sized inputs Reservoir sampling optimizes selecting K random lines from large or unknown-sized input: http://en.wikipedia.org/wiki/Reservoir_sampling Note this also avoids reading any input when -n0 is specified. * src/shuf.c (main): Use reservoir-sampling when the number of output lines is known, and the input size is large or unknown. (input_size): A new function to get the input size for regular files. (read_input_reservoir_sampling): New function to read lines from input, keeping only K lines in memory, replacing lines with decreasing prob. (write_permuted_output_reservoir): New function to output reservoir. * tests/misc/shuf-reservoir.sh: An expensive_ test using valgrind to exercise the reservoir-sampling code. * tests/local.mk: Reference new test. * NEWS: Mention the improvement. --- tests/local.mk | 1 + tests/misc/shuf-reservoir.sh | 69 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100755 tests/misc/shuf-reservoir.sh (limited to 'tests') diff --git a/tests/local.mk b/tests/local.mk index 607ddc4d9..dc87ef491 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -313,6 +313,7 @@ all_tests = \ tests/misc/shred-passes.sh \ tests/misc/shred-remove.sh \ tests/misc/shuf.sh \ + tests/misc/shuf-reservoir.sh \ tests/misc/sort.pl \ tests/misc/sort-benchmark-random.sh \ tests/misc/sort-compress.sh \ diff --git a/tests/misc/shuf-reservoir.sh b/tests/misc/shuf-reservoir.sh new file mode 100755 index 000000000..e971c594e --- /dev/null +++ b/tests/misc/shuf-reservoir.sh @@ -0,0 +1,69 @@ +#!/bin/sh +# Exercise shuf's reservoir-sampling code +# NOTE: +# These tests do not check valid randomness, +# they just check memory allocation related code. + +# Copyright (C) 2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ shuf +expensive_ +require_valgrind_ + +# Run "shuf" with specific number of input lines and output lines +# Check the output for expected number of lines. +run_shuf_n() +{ + INPUT_LINES="$1" + OUTPUT_LINES="$2" + + # Critical memory-related bugs will cause a segfault here + # (with varying numbres of input/output lines) + seq "$INPUT_LINES" | valgrind --leak-check=full --error-exitcode=1 \ + shuf -n "$OUTPUT_LINES" -o "out_${INPUT_LINES}_${OUTPUT_LINES}" || return 1 + + EXPECTED_LINES="$OUTPUT_LINES" + test "$INPUT_LINES" -lt "$OUTPUT_LINES" && EXPECTED_LINES="$INPUT_LINES" + + # There is no sure way to verify shuffled output (as it is random). + # Ensure we have the correct number of all numeric lines non duplicated lines. + GOOD_LINES=$(grep '^[0-9][0-9]*$' "out_${INPUT_LINES}_${OUTPUT_LINES}" | + sort -un | wc -l) || framework_failure_ + LINES=$(wc -l < "out_${INPUT_LINES}_${OUTPUT_LINES}") || framework_failure_ + + test "$EXPECTED_LINES" -eq "$GOOD_LINES" || return 1 + test "$EXPECTED_LINES" -eq "$LINES" || return 1 + + return 0 +} + +# Test multiple combinations of input lines and output lines. +# (e.g. small number of input lines and large number of output lines, +# and vice-versa. Also, each reservoir allocation uses a 1024-lines batch, +# so test 1023/1024/1025 and related values). +TEST_LINES="0 1 5 1023 1024 1025 3071 3072 3073" + +for IN_N in $TEST_LINES; do + for OUT_N in $TEST_LINES; do + run_shuf_n "$IN_N" "$OUT_N" || { + fail=1 + echo "shuf-reservoir-sampling failed with IN_N=$IN_N OUT_N=$OUT_N" >&2; + } + done +done + +Exit $fail -- cgit v1.2.3-70-g09d2