diff options
author | Bo Borgerson <gigabo@gmail.com> | 2008-04-05 13:33:51 -0400 |
---|---|---|
committer | Jim Meyering <meyering@redhat.com> | 2008-06-17 22:36:51 +0200 |
commit | 20905b0cdcb960c9949763c606f15e7e09d02e00 (patch) | |
tree | 4857729dfcb9dfa816027a106567c166e8260271 | |
parent | 322c6f2e5cd3d09ed31d9d8dea2310d70f47842a (diff) | |
download | coreutils-20905b0cdcb960c9949763c606f15e7e09d02e00.tar.xz |
sort: accept new option --batch-size=NMERGE
* src/sort.c: (static unsigned int nmerge) Replace constant NMERGE.
(specify_nmerge) Validate and apply new option.
(mergefps) Replace some arrays with pointers to xnmalloc'd storage.
* tests/misc/sort-merge: Test new option.
* doc/coreutils.texi: Describe new option.
* NEWS: Advertise new option.
-rw-r--r-- | NEWS | 4 | ||||
-rw-r--r-- | doc/coreutils.texi | 23 | ||||
-rw-r--r-- | src/sort.c | 115 | ||||
-rwxr-xr-x | tests/misc/sort-merge | 43 |
4 files changed, 171 insertions, 14 deletions
@@ -23,6 +23,10 @@ GNU coreutils NEWS -*- outline -*- instead of filenames passed on the command-line to avoid problems with maximum command-line (argv) length. + sort accepts a new option --batch-size=NMERGE, where NMERGE + represents the maximum number of inputs that will be merged at once. + When processing more than NMERGE inputs, sort uses temporary files. + ** Bug fixes chcon --verbose now prints a newline after each message diff --git a/doc/coreutils.texi b/doc/coreutils.texi index e4a979e61..f7b9e7844 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3804,6 +3804,29 @@ multiple fields. Example: To sort on the second field, use @option{--key=2,2} (@option{-k 2,2}). See below for more examples. +@item --batch-size=@var{nmerge} +@opindex --batch-size +@cindex number of inputs to merge, nmerge +Merge at most @var{nmerge} inputs at once. + +When @command{sort} has to merge more than @var{nmerge} inputs, +it merges them in groups of @var{nmerge}, saving the result in +a temporary file, which is then used as an input in a subsequent merge. + +A large value of @var{nmerge} may improve merge performance and decrease +temporary storage utilization at the expense of increased memory usage +and I/0. Conversely a small value of @var{nmerge} may reduce memory +requirements and I/0 at the expense of temporary storage consumption and +merge performance. + +The value of @var{nmerge} must be at least 2. + +The value of @var{nmerge} may be bounded by a resource limit for open +file descriptors. Try @samp{ulimit -n} or @samp{getconf OPEN_MAX} to +to display the limit for a particular system. +If the value of @var{nmerge} exceeds this limit, then @command{sort} will +issue a warning to standard error and exit with a nonzero status. + @item -o @var{output-file} @itemx --output=@var{output-file} @opindex -o diff --git a/src/sort.c b/src/sort.c index 8fb943af9..13935218a 100644 --- a/src/sort.c +++ b/src/sort.c @@ -224,13 +224,16 @@ static struct month monthtab[] = }; /* During the merge phase, the number of files to merge at once. */ -#define NMERGE 16 +#define NMERGE_DEFAULT 16 /* Minimum size for a merge or check buffer. */ #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line)) /* Minimum sort size; the code might not work with smaller sizes. */ -#define MIN_SORT_SIZE (NMERGE * MIN_MERGE_BUFFER_SIZE) +#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE) + +/* Maximum merge buffers we can theoretically support */ +#define MAX_NMERGE (SIZE_MAX / MIN_MERGE_BUFFER_SIZE) /* The number of bytes needed for a merge or check buffer, which can function relatively efficiently even if it holds only one line. If @@ -282,6 +285,10 @@ static struct keyfield *keylist; /* Program used to (de)compress temp files. Must accept -d. */ static char const *compress_program; +/* Maximum number of files to merge in one go. If more than this + number are present, temp files will be used. */ +static unsigned int nmerge = NMERGE_DEFAULT; + static void sortlines_temp (struct line *, size_t, struct line *); /* Report MESSAGE for FILE, then clean up and exit. @@ -340,6 +347,12 @@ Ordering options:\n\ fputs (_("\ Other options:\n\ \n\ +"), stdout); + fputs (_("\ + --batch-size=NMERGE merge at most NMERGE inputs at once;\n\ + for more use temp files\n\ +"), stdout); + fputs (_("\ -c, --check, --check=diagnose-first check for sorted input; do not sort\n\ -C, --check=quiet, --check=silent like -c, but do not report first bad line\n\ --compress-program=PROG compress temporaries with PROG;\n\ @@ -401,6 +414,7 @@ enum CHECK_OPTION = CHAR_MAX + 1, COMPRESS_PROGRAM_OPTION, FILES0_FROM_OPTION, + NMERGE_OPTION, RANDOM_SOURCE_OPTION, SORT_OPTION }; @@ -427,6 +441,7 @@ static struct option const long_options[] = {"output", required_argument, NULL, 'o'}, {"reverse", no_argument, NULL, 'r'}, {"stable", no_argument, NULL, 's'}, + {"batch-size", required_argument, NULL, NMERGE_OPTION}, {"buffer-size", required_argument, NULL, 'S'}, {"field-separator", required_argument, NULL, 't'}, {"temporary-directory", required_argument, NULL, 'T'}, @@ -1053,6 +1068,68 @@ inittables (void) #endif } +/* Specify how many inputs may be merged at once. + This may be set on the command-line with the + --batch-size option. */ +static void +specify_nmerge (int oi, char c, char const *s) +{ + uintmax_t n; + struct rlimit rlimit; + unsigned int max_nmerge = (unsigned int) MAX_NMERGE; + enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL); + + /* Try to find out how many file descriptors we'll be able + to open. We need at least nmerge + 3 (STDIN_FILENO, + STDOUT_FILENO and STDERR_FILENO). */ + if (getrlimit (RLIMIT_NOFILE, &rlimit) == 0) + max_nmerge = MIN (max_nmerge, rlimit.rlim_cur - 3); + + if (e == LONGINT_OK) + { + nmerge = n; + if (nmerge != n) + e = LONGINT_OVERFLOW; + else + { + if (nmerge < 2) + { + error (0, 0, _("invalid --%s argument %s"), + long_options[oi].name, quote(s)); + error (SORT_FAILURE, 0, + _("minimum --%s argument is %s"), + long_options[oi].name, quote("2")); + } + else if (max_nmerge < nmerge) + { + e = LONGINT_OVERFLOW; + } + else + { + /* Need to re-check that we meet the minimum + requirement for memory usage with the new, + potentially larger, nmerge. */ + sort_size = MAX (sort_size, MIN_SORT_SIZE); + + return; + } + } + } + + if (e == LONGINT_OVERFLOW) + { + char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)]; + uinttostr (max_nmerge, max_nmerge_buf); + error (0, 0, _("--%s argument %s too large"), + long_options[oi].name, quote(s)); + error (SORT_FAILURE, 0, + _("maximum --%s argument with current rlimit is %s"), + long_options[oi].name, quote (max_nmerge_buf)); + } + else + xstrtol_fatal (e, oi, c, long_options, s); +} + /* Specify the amount of main memory to use when sorting. */ static void specify_sort_size (int oi, char c, char const *s) @@ -2037,15 +2114,20 @@ static void mergefps (struct sortfile *files, size_t ntemps, size_t nfiles, FILE *ofp, char const *output_file) { - FILE *fps[NMERGE]; /* Input streams for each file. */ - struct buffer buffer[NMERGE]; /* Input buffers for each file. */ + FILE **fps = xnmalloc (nmerge, sizeof *fps); + /* Input streams for each file. */ + struct buffer *buffer = xnmalloc (nmerge, sizeof *buffer); + /* Input buffers for each file. */ struct line saved; /* Saved line storage for unique check. */ struct line const *savedline = NULL; /* &saved if there is a saved line. */ size_t savealloc = 0; /* Size allocated for the saved line. */ - struct line const *cur[NMERGE]; /* Current line in each line table. */ - struct line const *base[NMERGE]; /* Base of each line table. */ - size_t ord[NMERGE]; /* Table representing a permutation of fps, + struct line const **cur = xnmalloc (nmerge, sizeof *cur); + /* Current line in each line table. */ + struct line const **base = xnmalloc (nmerge, sizeof *base); + /* Base of each line table. */ + size_t *ord = xnmalloc (nmerge, sizeof *ord); + /* Table representing a permutation of fps, such that cur[ord[0]] is the smallest line and will be next output. */ size_t i; @@ -2218,6 +2300,11 @@ mergefps (struct sortfile *files, size_t ntemps, size_t nfiles, } xfclose (ofp, output_file); + free(fps); + free(buffer); + free(ord); + free(base); + free(cur); } /* Merge into T the two sorted arrays of lines LO (with NLO members) @@ -2405,7 +2492,7 @@ static void merge (struct sortfile *files, size_t ntemps, size_t nfiles, char const *output_file) { - while (NMERGE < nfiles) + while (nmerge < nfiles) { /* Number of input files processed so far. */ size_t in; @@ -2421,20 +2508,20 @@ merge (struct sortfile *files, size_t ntemps, size_t nfiles, size_t cheap_slots; /* Do as many NMERGE-size merges as possible. */ - for (out = in = 0; out < nfiles / NMERGE; out++, in += NMERGE) + for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge) { FILE *tfp; pid_t pid; char *temp = create_temp (&tfp, &pid); - size_t nt = MIN (ntemps, NMERGE); + size_t nt = MIN (ntemps, nmerge); ntemps -= nt; - mergefps (&files[in], nt, NMERGE, tfp, temp); + mergefps (&files[in], nt, nmerge, tfp, temp); files[out].name = temp; files[out].pid = pid; } remainder = nfiles - in; - cheap_slots = NMERGE - out % NMERGE; + cheap_slots = nmerge - out % nmerge; if (cheap_slots < remainder) { @@ -3027,6 +3114,10 @@ main (int argc, char **argv) mergeonly = true; break; + case NMERGE_OPTION: + specify_nmerge (oi, c, optarg); + break; + case 'o': if (outfile && !STREQ (outfile, optarg)) error (SORT_FAILURE, 0, _("multiple output files specified")); diff --git a/tests/misc/sort-merge b/tests/misc/sort-merge index f90ba9bce..a2524c40c 100755 --- a/tests/misc/sort-merge +++ b/tests/misc/sort-merge @@ -19,18 +19,57 @@ use strict; (my $program_name = $0) =~ s|.*/||; +my $prog = 'sort'; # Turn off localization of executable's output. @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; +# three empty files and one that says 'foo' +my @inputs = (+(map{{IN=> {"empty$_"=> ''}}}1..3), {IN=> {foo=> "foo\n"}}); + +# don't need to check for existence, since we're running in a temp dir +my $badtmp = 'does/not/exist'; + +# 2^64+1 +my $bigint = "18446744073709551617"; + my @Tests = ( - ['m1', '-m', {IN=> {empty=> ''}}, {IN=> {f=> "foo\n"}}, {OUT=>"foo\n"}], + ['m1', '-m', @inputs, {OUT=>"foo\n"}], + + # check validation of --batch-size option + ['nmerge-0', "-m --batch-size=0", @inputs, + {ERR=>"$prog: invalid --batch-size argument `0'\n". + "$prog: minimum --batch-size argument is `2'\n"}, {EXIT=>2}], + + ['nmerge-1', "-m --batch-size=1", @inputs, + {ERR=>"$prog: invalid --batch-size argument `1'\n". + "$prog: minimum --batch-size argument is `2'\n"}, {EXIT=>2}], + + ['nmerge-neg', "-m --batch-size=-1", @inputs, + {ERR=>"$prog: invalid --batch-size argument `-1'\n"}, {EXIT=>2}], + + ['nmerge-nan', "-m --batch-size=a", @inputs, + {ERR=>"$prog: invalid --batch-size argument `a'\n"}, {EXIT=>2}], + + ['nmerge-big', "-m --batch-size=$bigint", @inputs, + {ERR_SUBST=>'s/current rlimit is .+\n/current rlimit is/'}, + {ERR=>"$prog: --batch-size argument `$bigint' too large\n". + "$prog: maximum --batch-size argument with current rlimit is"}, + {EXIT=>2}], + + # This should work since nmerge >= the number of input files + ['nmerge-yes', "-m --batch-size=4 -T$badtmp", @inputs, {OUT=>"foo\n"}], + + # this should fail since nmerge < # of input files, so + # temp files are needed + ['nmerge-no', "-m --batch-size=2 -T$badtmp", @inputs, + {ERR_SUBST=>"s|: $badtmp/sort.+||"}, + {ERR=>"$prog: cannot create temporary file\n"}, {EXIT=>2}], ); my $save_temps = $ENV{DEBUG}; my $verbose = $ENV{VERBOSE}; -my $prog = 'sort'; my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); exit $fail; |