diff options
author | Paul Eggert <eggert@cs.ucla.edu> | 2006-08-08 22:20:12 +0000 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2006-08-08 22:20:12 +0000 |
commit | f8abf03c3fb0f8abf3d41893922b59bb06daac2e (patch) | |
tree | 92c9cd2d6408aba656d6d9393cedc57ed084f6b1 /src | |
parent | 65533e1b0939bba6a1a222ebc82189f448640ea9 (diff) | |
download | coreutils-f8abf03c3fb0f8abf3d41893922b59bb06daac2e.tar.xz |
Use new random-number interface rather than rand-isaac.c.
Don't include rand-isaac.c; include randint.h and randread.h instead.
(RANDOM_SOURCE_OPTION): New enum.
(long_opts, usage, main): New option --random-source.
Include md5.h, randread.h, xmemxfrm.h.
(longopts, usage, main): Remove undocumented --seed option;
it's now replaced by --random-source.
(rand_state, get_hash): Remove.
(randread_source): New static var.
(random_state, cmp_hashes, compare_random): New functions; they guarantee
no collisions in the random hash function.
(keycompare): Use compare_random for -R; don't fall back on comparing
via memcoll, since compare_random does the right thing.
Diffstat (limited to 'src')
-rw-r--r-- | src/sort.c | 177 |
1 files changed, 127 insertions, 50 deletions
diff --git a/src/sort.c b/src/sort.c index 4bb1149ef..7f9566d57 100644 --- a/src/sort.c +++ b/src/sort.c @@ -30,13 +30,16 @@ #include "error.h" #include "hard-locale.h" #include "inttostr.h" +#include "md5.h" #include "physmem.h" #include "posixver.h" #include "quote.h" -#include "stdlib--.h" +#include "randread.h" #include "stdio--.h" +#include "stdlib--.h" #include "strnumcmp.h" #include "xmemcoll.h" +#include "xmemxfrm.h" #include "xstrtol.h" #if HAVE_SYS_RESOURCE_H @@ -77,8 +80,6 @@ double strtod (); # define DEFAULT_TMPDIR "/tmp" #endif -#include "rand-isaac.c" - /* Exit statuses. */ enum { @@ -307,6 +308,7 @@ Ordering options:\n\ -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ + --random-source=FILE get random bytes from FILE (default /dev/urandom)\n\ -r, --reverse reverse the result of comparisons\n\ \n\ "), stdout); @@ -362,7 +364,7 @@ native byte values.\n\ non-character as a pseudo short option, starting with CHAR_MAX + 1. */ enum { - SEED_OPTION = CHAR_MAX + 1 + RANDOM_SOURCE_OPTION = CHAR_MAX + 1 }; static char const short_options[] = "-bcdfgik:mMno:rRsS:t:T:uy:z"; @@ -380,6 +382,7 @@ static struct option const long_options[] = {"month-sort", no_argument, NULL, 'M'}, {"numeric-sort", no_argument, NULL, 'n'}, {"random-sort", no_argument, NULL, 'R'}, + {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, {"output", required_argument, NULL, 'o'}, {"reverse", no_argument, NULL, 'r'}, {"stable", no_argument, NULL, 's'}, @@ -388,7 +391,6 @@ static struct option const long_options[] = {"temporary-directory", required_argument, NULL, 'T'}, {"unique", no_argument, NULL, 'u'}, {"zero-terminated", no_argument, NULL, 'z'}, - {"seed", required_argument, NULL, SEED_OPTION}, /* This will go away soon. */ {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0}, @@ -1166,19 +1168,117 @@ getmonth (char const *month, size_t len) return 0; } -/* The ISAAC state resulting from the user-specified seed, or from - random data taken from the environment. */ -static struct isaac_state rand_state; +/* A source of random data. */ +static struct randread_source *randread_source; -/* Set *RESULT to the ISAAC state resulting from applying TEXT (with - length LEN) to rand_state. */ -static void -get_hash (char const *text, size_t len, uint32_t resbuf[ISAAC_WORDS]) +/* Return the Ith randomly-generated state. The caller must invoke + random_state (H) for all H less than I before invoking random_state + (I). */ + +static struct md5_ctx +random_state (size_t i) +{ + /* An array of states resulting from the random data, and counts of + its used and allocated members. */ + static struct md5_ctx *state; + static size_t used; + static size_t allocated; + + struct md5_ctx *s = &state[i]; + + if (used <= i) + { + unsigned char buf[MD5_DIGEST_SIZE]; + + used++; + + if (allocated <= i) + { + state = X2NREALLOC (state, &allocated); + s = &state[i]; + } + + randread (randread_source, buf, sizeof buf); + md5_init_ctx (s); + md5_process_bytes (buf, sizeof buf, s); + } + + return *s; +} + +/* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB + with length LENGTHB. Return negative if less, zero if equal, + positive if greater. */ + +static int +cmp_hashes (char const *texta, size_t lena, + char const *textb, size_t lenb) +{ + /* Try random hashes until a pair of hashes disagree. But if the + first pair of random hashes agree, check whether the keys are + identical and if so report no difference. */ + int diff; + size_t i; + for (i = 0; ; i++) + { + uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)]; + struct md5_ctx s[2]; + s[0] = s[1] = random_state (i); + md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]); + md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]); + diff = memcmp (dig[0], dig[1], sizeof dig[0]); + if (diff != 0) + break; + if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0) + break; + } + + return diff; +} + +/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB) + using one or more random hash functions. */ + +static int +compare_random (char *restrict texta, size_t lena, + char *restrict textb, size_t lenb) { - struct isaac_state s = rand_state; - isaac_seed_data (&s, text, len); - isaac_seed_finish (&s); - isaac_refill (&s, resbuf); + int diff; + + if (! hard_LC_COLLATE) + diff = cmp_hashes (texta, lena, textb, lenb); + else + { + /* Transform the text into the basis of comparison, so that byte + strings that would otherwise considered to be equal are + considered equal here even if their bytes differ. */ + + char *buf = NULL; + char stackbuf[4000]; + size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena); + bool a_fits = tlena <= sizeof stackbuf; + size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL), + (a_fits ? sizeof stackbuf - tlena : 0), + textb, lenb); + + if (a_fits && tlena + tlenb <= sizeof stackbuf) + buf = stackbuf; + else + { + /* Adding 1 to the buffer size lets xmemxfrm run a bit + faster by avoiding the need for an extra buffer copy. */ + buf = xmalloc (tlena + tlenb + 1); + xmemxfrm (buf, tlena + 1, texta, lena); + xmemxfrm (buf + tlena, tlenb + 1, textb, lenb); + } + + diff = cmp_hashes (buf, tlena, buf + tlena, tlenb); + + if (buf != stackbuf) + free (buf); + } + + return diff; } /* Compare two lines A and B trying every key in sequence until there @@ -1210,29 +1310,8 @@ keycompare (const struct line *a, const struct line *b) /* Actually compare the fields. */ if (key->random) - { - int i; - uint32_t diga[ISAAC_WORDS]; - uint32_t digb[ISAAC_WORDS]; - get_hash (texta, lena, diga); - get_hash (textb, lenb, digb); - - /* Go backwards, since the last words are marginally better - mixed. */ - for (i = ISAAC_WORDS - 1; 0 <= i; i--) - if (diga[i] != digb[i]) - { - diff = (diga[i] < digb[i] ? -1 : 1); - goto not_equal; - } - - /* FIXME: Should break ties by rehashing with a different - random hashing function (and repeat until the tie is - broken) unless --seed was specified. The probability of - this being needed should be infinitesimal. */ - } - - if (key->numeric | key->general_numeric) + diff = compare_random (texta, lena, textb, lenb); + else if (key->numeric | key->general_numeric) { char savea = *lima, saveb = *limb; @@ -2208,7 +2287,7 @@ main (int argc, char **argv) int c = 0; bool checkonly = false; bool mergeonly = false; - char const *seed = NULL; + char *random_source = NULL; bool need_random = false; size_t nfiles = 0; bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL); @@ -2447,9 +2526,11 @@ main (int argc, char **argv) outfile = optarg; break; - case SEED_OPTION: - seed = optarg; - break; + case RANDOM_SOURCE_OPTION: + if (random_source && !STREQ (random_source, optarg)) + error (SORT_FAILURE, 0, _("multiple random sources specified")); + random_source = optarg; + break; case 's': stable = true; @@ -2563,13 +2644,9 @@ main (int argc, char **argv) if (need_random) { - if (seed) - { - isaac_seed_start (&rand_state); - isaac_seed_data (&rand_state, seed, strlen (seed)); - } - else - isaac_seed (&rand_state); + randread_source = randread_new (random_source, MD5_DIGEST_SIZE); + if (! randread_source) + die (_("open failed"), random_source); } if (temp_dir_count == 0) |