From 27873f1deb69745c79d403bbb8e1145bc18f55b8 Mon Sep 17 00:00:00 2001
From: Pádraig Brady
Date: Fri, 13 May 2011 18:41:42 +0100
Subject: shuf: use memory more efficiently when returning a subset

* gl/lib/randperm.c (randperm_new): When the number of items to
return H, is much smaller than the total number of items N,
use a hash to represent the sparse permutations of the set N.
This is currently enabled for N > 128K and N/H > 32.
* tests/misc/shuf: Ensure shuf can quickly return 2 numbers
from a large range.
* gl/modules/randperm: Depend on hash.
* NEWS: Mention the change.
---
 NEWS                |   5 ++
 gl/lib/randperm.c   | 149 +++++++++++++++++++++++++++++++++++++++++++++++++---
 gl/modules/randperm |   1 +
 tests/misc/shuf     |   6 +++
 4 files changed, 154 insertions(+), 7 deletions(-)

diff --git a/NEWS b/NEWS
index 82ce53c42..7a7f7612b 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,11 @@ GNU coreutils NEWS -*- outline -*-
   Note the use of single quotes, not double quotes.
   That creates files named xaa.xz, xab.xz and xac.xz.
 
+** Improvements
+
+  shuf outputs small subsets of large permutations much more efficiently.
+  For example `shuf -i1-$((2**32-1)) -n2` no longer exhausts memory.
+
 
 * Noteworthy changes in release 8.12 (2011-04-26) [stable]
 
diff --git a/gl/lib/randperm.c b/gl/lib/randperm.c
index 97c8d9a8b..26181b8a6 100644
--- a/gl/lib/randperm.c
+++ b/gl/lib/randperm.c
@@ -19,9 +19,11 @@
 
 #include <config.h>
 
+#include "hash.h"
 #include "randperm.h"
 
 #include <limits.h>
+#include <stdlib.h>
 
 #include "xalloc.h"
 
@@ -57,6 +59,94 @@ randperm_bound (size_t h, size_t n)
   return bound;
 }
 
+/* Swap elements I and J in array V. */
+
+static void
+swap (size_t *v, size_t i, size_t j)
+{
+  size_t t = v[i];
+  v[i] = v[j];
+  v[j] = t;
+}
+
+/* Structures and functions for a sparse_map abstract data type that's
+   used to effectively swap elements I and J in array V like swap(),
+   but in a more memory efficient manner (when the number of permutations
+   performed is significantly less than the size of the input). */
+
+struct sparse_ent_
+{
+   size_t index;
+   size_t val;
+};
+
+static size_t
+sparse_hash_ (void const *x, size_t table_size)
+{
+  struct sparse_ent_ const *ent = x;
+  return ent->index % table_size;
+}
+
+static bool
+sparse_cmp_ (void const *x, void const *y)
+{
+  struct sparse_ent_ const *ent1 = x;
+  struct sparse_ent_ const *ent2 = y;
+  return ent1->index == ent2->index;
+}
+
+typedef Hash_table sparse_map;
+
+/* Initialize the structure for the sparse map,
+   when a best guess as to the number of entries
+   specified with SIZE_HINT. */
+
+static sparse_map *
+sparse_new (size_t size_hint)
+{
+  return hash_initialize (size_hint, NULL, sparse_hash_, sparse_cmp_, free);
+}
+
+/* Swap the values for I and J. If a value is not already present
+   then assume it's equal to the index. Update the value for
+   index I in array V. */
+
+static void
+sparse_swap (sparse_map *sv, size_t* v, size_t i, size_t j)
+{
+  struct sparse_ent_ *v1 = hash_delete (sv, &(struct sparse_ent_) {i,0});
+  struct sparse_ent_ *v2 = hash_delete (sv, &(struct sparse_ent_) {j,0});
+
+  /* FIXME: reduce the frequency of these mallocs. */
+  if (!v1)
+    {
+      v1 = xmalloc (sizeof *v1);
+      v1->index = v1->val = i;
+    }
+  if (!v2)
+    {
+      v2 = xmalloc (sizeof *v2);
+      v2->index = v2->val = j;
+    }
+
+  size_t t = v1->val;
+  v1->val = v2->val;
+  v2->val = t;
+  if (!hash_insert (sv, v1))
+    xalloc_die ();
+  if (!hash_insert (sv, v2))
+    xalloc_die ();
+
+  v[i] = v1->val;
+}
+
+static void
+sparse_free (sparse_map *sv)
+{
+  hash_free (sv);
+}
+
+
 /* From R, allocate and return a malloc'd array of the first H elements
    of a random permutation of N elements. H must not exceed N.
    Return NULL if H is zero. */
@@ -79,21 +169,66 @@ randperm_new (struct randint_source *r, size_t h, size_t n)
 
     default:
       {
+        /* The algorithm is essentially the same in both
+           the sparse and non sparse case. In the sparse case we use
+           a hash to implement sparse storage for the set of n numbers
+           we're shuffling. When to use the sparse method was
+           determined with the help of this script:
+
+           #!/bin/sh
+           for n in $(seq 2 32); do
+             for h in $(seq 2 32); do
+               test $h -gt $n && continue
+               for s in o n; do
+                 test $s = o && shuf=shuf || shuf=./shuf
+                 num=$(env time -f "$s:${h},${n} = %e,%M" \
+                       $shuf -i0-$((2**$n-2)) -n$((2**$h-2)) | wc -l)
+                 test $num = $((2**$h-2)) || echo "$s:${h},${n} = failed" >&2
+               done
+             done
+           done
+
+           This showed that if sparseness = n/h, then:
+
+           sparseness = 128 => .125 mem used, and about same speed
+           sparseness = 64 => .25 mem used, but 1.5 times slower
+           sparseness = 32 => .5 mem used, but 2 times slower
+
+           Also the memory usage was only significant when n > 128Ki
+        */
+        bool sparse = (n >= (128 * 1024)) && (n / h >= 32);
+
         size_t i;
+        sparse_map *sv;
 
-        v = xnmalloc (n, sizeof *v);
-        for (i = 0; i < n; i++)
-          v[i] = i;
+        if (sparse)
+          {
+            sv = sparse_new (h * 2);
+            if (sv == NULL)
+              xalloc_die ();
+            v = xnmalloc (h, sizeof *v);
+          }
+        else
+          {
+            sv = NULL; /* To placate GCC's -Wuninitialized. */
+            v = xnmalloc (n, sizeof *v);
+            for (i = 0; i < n; i++)
+              v[i] = i;
+          }
 
         for (i = 0; i < h; i++)
           {
             size_t j = i + randint_choose (r, n - i);
-            size_t t = v[i];
-            v[i] = v[j];
-            v[j] = t;
+            if (sparse)
+              sparse_swap (sv, v, i, j);
+            else
+              swap (v, i, j);
           }
 
-        v = xnrealloc (v, h, sizeof *v);
+        if (sparse)
+          sparse_free (sv);
+        else
+          v = xnrealloc (v, h, sizeof *v);
       }
       break;
     }
diff --git a/gl/modules/randperm b/gl/modules/randperm
index 9cef78271..daf9e3215 100644
--- a/gl/modules/randperm
+++ b/gl/modules/randperm
@@ -8,6 +8,7 @@ lib/randperm.h
 Depends-on:
 randint
 xalloc
+hash
 
 configure.ac:
 
diff --git a/tests/misc/shuf b/tests/misc/shuf
index cdfe7052b..10d285846 100755
--- a/tests/misc/shuf
+++ b/tests/misc/shuf
@@ -18,6 +18,7 @@
 
 . "${srcdir=.}/init.sh"; path_prepend_ ../src
 print_ver_ shuf
+getlimits_
 
 seq 100 > in || framework_failure
 
@@ -51,4 +52,9 @@ shuf --zero-terminated -i 1-1 > out || fail=1
 printf '1\0' > exp || framework_failure
 cmp out exp || { fail=1; echo "missing NUL terminator?" 1>&2; }
 
+# Ensure shuf -n operates efficiently for small n. Before coreutils-8.13
+# this would try to allocate $SIZE_MAX * sizeof(size_t)
+timeout 10 shuf -i1-$SIZE_MAX -n2 >/dev/null ||
+  { fail=1; echo "couldn't get a small subset" >&2; }
+
 Exit $fail
-- 
cgit v1.2.3-70-g09d2