From 55d155dbbe63fe368a94d6d497f21d949e3797c3 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Mon, 19 Feb 2001 08:50:12 +0000 Subject: Check for input size, and do not overallocate memory. Also check for memory quotas. Revamp storage management so that line tables and character data are taken from the same buffer. Line tables are now in reverse order, since they grow down while the character data grow up. (): Include if HAVE_SYS_RESOURCE_H. (struct rlimit, getrlimit): Define a replacement if RLIMIT_DATA is not defined. (RLIMIT_AS): Define to RLIMIT_DATA if not defined. (struct lines): Remove. (struct buffer): New members nlines, line_bytes, eof. Remove member newline_free; no longer needed, since the code no longer runs out of line table space. (SORTALLOC_MIN, SORTALLOC_DEFAULT_MIN): Remove. (sort_size): Renamed from sortalloc; now applies to the sum of the character data and the line table, not just the character data. (MIN_SORT_SIZE, INPUT_FILE_SIZE_GUESS): New macros. (linelength): remove. (specify_sort_size): Don't worry about the distinction between the character data and the line table; that is now the caller's responsibility. (default_sort_size): Return the value, instead of being executed for side effect. Return half of available memory, or 1/16 of total memory, whichever is greater; except do not exceed 1/2 of quota. (sort_buffer_size): New function. (initbuf): New arg LINE_BYTES. Ensure that the line array is properly aligned. Initialize the new set of struct buffer members. (buffer_linelim): New function. (fillbuf): Return int, not size_t, since the callers merely care whether the result is nonzero. New arg FILE so that error messages can report the file name. Keep track of eof. Initialize the line table too, taking its memory from the input buffer's memory; this subsumes the old findlines function and removes the need for worrying about running out of line table entries. (checkfp, mergefps, sortlines, merge, sort): Adjust to the new storage management regime, in particular the fact that line tables are now filled in by fillbuf and are in reverse order. (checkfp): Now takes char *, not const char *, since subroutines require that now. Rewrite to avoid lint and duplicate code. If line length alloc calculation overflows, simply allocate enough memory to hold the line. (mergefps): New arg FILES, used for buffer size calculation and error messages. Rewrite to avoid lint. Do not loop if savealloc*2 overflows. (mergefps, merge): Zap temporary files eagerly rather than lazily; this is needed because we now pass FILES to mergefps. (sortlines): Args now point at end of arrays, not at beginnings. (sort): Do not allocate temporary line array for sortlines; instead, take the space from the same buffer. (main): Adjust to sort_size and default_sort_size changes. --- src/sort.c | 666 +++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 358 insertions(+), 308 deletions(-) (limited to 'src') diff --git a/src/sort.c b/src/sort.c index 0c548655c..affde7b21 100644 --- a/src/sort.c +++ b/src/sort.c @@ -38,6 +38,17 @@ #include "xalloc.h" #include "xstrtol.h" +#if HAVE_SYS_RESOURCE_H +# include +#endif +#ifndef RLIMIT_DATA +struct rlimit { size_t rlim_cur; }; +# define getrlimit(Resource, Rlp) (-1) +#endif +#ifndef RLIMIT_AS +# define RLIMIT_AS RLIMIT_DATA +#endif + /* The official name of this program (e.g., no `g' prefix). */ #define PROGRAM_NAME "sort" @@ -120,24 +131,20 @@ struct line char *keylim; /* Limit of first key. */ }; -/* Arrays of lines. */ -struct lines -{ - struct line *lines; /* Dynamically allocated array of lines. */ - size_t used; /* Number of slots used. */ - size_t alloc; /* Number of slots allocated. */ - size_t limit; /* Max number of slots to allocate. */ -}; - /* Input buffers. */ struct buffer { - char *buf; /* Dynamically allocated buffer. */ - size_t used; /* Number of bytes used. */ + char *buf; /* Dynamically allocated buffer, + partitioned into 3 regions: + - input data; + - unused area; + - an array of lines, in reverse order. */ + size_t used; /* Number of bytes used for input data. */ + size_t nlines; /* Number of lines in the line array. */ size_t alloc; /* Number of bytes allocated. */ size_t left; /* Number of bytes left from previous reads. */ - size_t newline_free; /* Number of left bytes that are known - to be newline-free. */ + size_t line_bytes; /* Number of bytes to reserve for each line. */ + int eof; /* An EOF has been read. */ }; struct keyfield @@ -217,18 +224,14 @@ static MONTHTAB_CONST struct month monthtab[] = /* During the merge phase, the number of files to merge at once. */ #define NMERGE 16 -/* Minimum text buffer size. */ -#define SORTALLOC_MIN (4 * NMERGE * sizeof (struct line)) - -/* Minimum text buffer size if the user does not specify a size. */ -#define SORTALLOC_DEFAULT_MIN max (SORTALLOC_MIN, 1024 * 1024) +/* The approximate maximum number of bytes of main memory to use. */ +static size_t sort_size; -/* Initial text buffer size for main-memory sorting. The buffer will - grow only if a line longer than this is seen. */ -static size_t sortalloc; +/* Minimum sort size; the code might not work with smaller sizes. */ +#define MIN_SORT_SIZE (NMERGE * (2 + sizeof (struct line))) -/* Guess of average line length. */ -static size_t const linelength = 30; +/* The guessed size for non-regular files. */ +#define INPUT_FILE_SIZE_GUESS (1024 * 1024) /* Array of directory names in which any temporary files are to be created. */ static char const **temp_dirs; @@ -607,14 +610,10 @@ specify_sort_size (char const *s) if (e == LONGINT_OK) { - /* Normally a buffer is allocated with sortalloc bytes, and a - line table with at most sortalloc / 2 bytes. So adjust the - size so that size * 1.5 is about n. */ - sortalloc = (n / 3) * 2; - if (sortalloc == (n / 3) * 2) + sort_size = n; + if (sort_size == n) { - if (sortalloc < SORTALLOC_MIN) - sortalloc = SORTALLOC_MIN; + sort_size = MAX (sort_size, MIN_SORT_SIZE); return; } @@ -624,100 +623,108 @@ specify_sort_size (char const *s) STRTOL_FATAL_ERROR (s, _("sort size"), e); } -/* Set the sort size to the default. */ -static void +/* Return the default sort size. */ +static size_t default_sort_size (void) { - /* Set sortalloc to 50% of available memory, unless it overflows. */ - double mem = physmem_available (); - sortalloc = min (mem, SIZE_MAX); - sortalloc >>= 1; - - if (sortalloc < SORTALLOC_DEFAULT_MIN) - sortalloc = SORTALLOC_DEFAULT_MIN; + /* Let MEM be available memory or 1/8 of total memory, whichever + is greater. */ + double avail = physmem_available (); + double total = physmem_total (); + double mem = MAX (avail, total / 8); + struct rlimit rlimit; + + /* Let SIZE be MEM, but no more than the maximum object size or + system resource limits. Avoid the MIN macro here, as it is not + quite right when only one argument is floating point. Don't + bother to check for values like RLIM_INFINITY since in practice + they are not much less than SIZE_MAX. */ + size_t size = SIZE_MAX; + if (mem < size) + size = mem; + if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size) + size = rlimit.rlim_cur; + if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size) + size = rlimit.rlim_cur; + + /* Use half of SIZE, but no less than the minimum. */ + return MAX (size / 2, MIN_SORT_SIZE); } -/* Initialize BUF, allocating ALLOC bytes initially. */ +/* Return the sort buffer size to use with the input files identified + by FPS and FILES, which are alternate paths to the same files. + NFILES gives the number of input files; NFPS may be less. Assume + that each input line requires LINE_BYTES extra bytes' worth of line + information. Return at most SIZE_BOUND. */ -static void -initbuf (struct buffer *buf, size_t alloc) +static size_t +sort_buffer_size (FILE *const *fps, int nfps, + char *const *files, int nfiles, + size_t line_bytes, size_t size_bound) { - buf->alloc = alloc; - buf->buf = xmalloc (buf->alloc); - buf->used = buf->left = buf->newline_free = 0; -} + /* In the worst case, each input byte is a newline. */ + size_t worst_case_per_input_byte = line_bytes + 1; -/* Fill BUF reading from FP, moving buf->left bytes from the end - of buf->buf to the beginning first. If EOF is reached and the - file wasn't terminated by a newline, supply one. Return a count - of bytes buffered. */ + /* Keep enough room for one extra input line and an extra byte. + This extra room might be needed when preparing to read EOF. */ + size_t size = worst_case_per_input_byte + 1; -static size_t -fillbuf (struct buffer *buf, FILE *fp) -{ - if (buf->used != buf->left) - { - memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left); - buf->used = buf->left; - } + int i; - while (!feof (fp)) + for (i = 0; i < nfiles; i++) { - size_t cc; - if (buf->used == buf->alloc) - { - buf->alloc *= 2; - if (buf->alloc < buf->used) - xalloc_die (); - buf->buf = xrealloc (buf->buf, buf->alloc); - } - cc = fread (buf->buf + buf->used, 1, buf->alloc - buf->used, fp); - if (ferror (fp)) + struct stat st; + off_t file_size; + size_t worst_case; + + if ((i < nfps ? fstat (fileno (fps[i]), &st) + : strcmp (files[i], "-") == 0 ? fstat (STDIN_FILENO, &st) + : stat (files[i], &st)) + != 0) { - error (0, errno, _("read error")); + error (0, errno, "%s", files[i]); cleanup (); exit (SORT_FAILURE); } - buf->used += cc; - if (memchr (buf->buf + buf->used - cc, eolchar, cc)) - break; - } - if (feof (fp) && buf->used && buf->buf[buf->used - 1] != eolchar) - { - if (buf->used == buf->alloc) - { - /* If the buffer isn't parsed yet, don't increase the buffer - size to append a newline, as this won't be needed unless - the buffer turns out to be newline-free. */ - if (buf->newline_free != buf->used) - return buf->used; - buf->alloc *= 2; - if (buf->alloc < buf->used) - xalloc_die (); - buf->buf = xrealloc (buf->buf, buf->alloc); - } - buf->buf[buf->used++] = eolchar; + file_size = S_ISREG (st.st_mode) ? st.st_size : INPUT_FILE_SIZE_GUESS; + + /* Add the amount of memory needed to represent the worst case + where the input consists entirely of newlines followed by a + single non-newline. Check for overflow. */ + worst_case = file_size * worst_case_per_input_byte + 1; + if (file_size != worst_case / worst_case_per_input_byte + || size_bound - size <= worst_case) + return size_bound; + size += worst_case; } - return buf->used; + return size; } -/* Initialize LINES, allocating space for ALLOC lines initially. - LIMIT is the maximum possible number of lines to allocate space - for, ever. */ +/* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES + must be at least sizeof (struct line). Allocate ALLOC bytes + initially. */ static void -initlines (struct lines *lines, size_t alloc, size_t limit) +initbuf (struct buffer *buf, size_t line_bytes, size_t alloc) { - if (limit < alloc) - alloc = limit; - lines->alloc = alloc; - if (SIZE_MAX / sizeof (struct line) < alloc) - xalloc_die (); - lines->lines = (struct line *) xmalloc (lines->alloc * sizeof (struct line)); - lines->used = 0; - lines->limit = limit; + /* Ensure that the line array is properly aligned. */ + alloc += sizeof (struct line) - alloc % sizeof (struct line); + + buf->line_bytes = line_bytes; + buf->alloc = alloc; + buf->buf = xmalloc (buf->alloc); + buf->used = buf->left = buf->nlines = 0; + buf->eof = 0; +} + +/* Return one past the limit of the line array. */ + +static inline struct line * +buffer_linelim (struct buffer const *buf) +{ + return (struct line *) (buf->buf + buf->alloc); } /* Return a pointer to the first character of the field specified @@ -870,58 +877,119 @@ trim_trailing_blanks (const char *a_start, char **a_end) --(*a_end); } -/* Find the lines in BUF, storing pointers and lengths in LINES. */ +/* Fill BUF reading from FP, moving buf->left bytes from the end + of buf->buf to the beginning first. If EOF is reached and the + file wasn't terminated by a newline, supply one. Set up BUF's line + table too. FILE is the name of the file corresponding to FP. + Return nonzero if some input was read. */ -static void -findlines (struct buffer *buf, struct lines *lines) +static int +fillbuf (struct buffer *buf, register FILE *fp, char const *file) { - register char *beg = buf->buf, *lim = buf->buf + buf->used, *ptr; - struct keyfield *key = keylist; + struct keyfield const *key = keylist; + char eol = eolchar; + size_t line_bytes = buf->line_bytes; - lines->used = 0; + if (buf->eof) + return 0; - while (beg < lim && (ptr = memchr (beg, eolchar, lim - beg)) - && lines->used < lines->limit) + if (buf->used != buf->left) { - struct line *line; - - if (lines->used == lines->alloc) - { - lines->alloc = min (2 * lines->alloc, lines->limit); - if (SIZE_MAX / sizeof (struct line) < lines->alloc) - xalloc_die (); - lines->lines = (struct line *) - xrealloc ((char *) lines->lines, - lines->alloc * sizeof (struct line)); - } + memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left); + buf->used = buf->left; + buf->nlines = 0; + } - line = &lines->lines[lines->used]; - line->text = beg; - line->length = ptr + 1 - beg; + for (;;) + { + char *ptr = buf->buf + buf->used; + struct line *linelim = buffer_linelim (buf); + struct line *line = linelim - buf->nlines; + size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr; + char *line_start = buf->nlines ? line->text + line->length : buf->buf; - if (key) + while (line_bytes + 1 < avail) { - /* Precompute the position of the first key for efficiency. */ - line->keylim = (key->eword == -1 ? ptr : limfield (line, key)); + /* Read as many bytes as possible, but do not read so many + bytes that there might not be enough room for the + corresponding line array. The worst case is when the + rest of the input file consists entirely of newlines, + except that the last byte is not a newline. */ + size_t readsize = (avail - 1) / (line_bytes + 1); + size_t bytes_read = fread (ptr, 1, readsize, fp); + char *ptrlim = ptr + bytes_read; + char *p; + avail -= bytes_read; + + if (bytes_read != readsize) + { + if (ferror (fp)) + { + error (0, errno, "%s", file); + cleanup (); + exit (SORT_FAILURE); + } + if (feof (fp)) + { + buf->eof = 1; + if (buf->buf == ptrlim) + return 0; + if (ptrlim[-1] != eol) + *ptrlim++ = eol; + } + } - if (key->sword != -1) - line->keybeg = begfield (line, key); - else + /* Find and record each line in the just-read input. */ + while ((p = memchr (ptr, eol, ptrlim - ptr))) { - if (key->skipsblanks) - while (blanks[UCHAR (*beg)]) - ++beg; - line->keybeg = beg; + ptr = p + 1; + line--; + line->text = line_start; + line->length = ptr - line_start; + avail -= line_bytes; + + if (key) + { + /* Precompute the position of the first key for + efficiency. */ + line->keylim = key->eword == -1 ? p : limfield (line, key); + + if (key->sword != -1) + line->keybeg = begfield (line, key); + else + { + if (key->skipsblanks) + while (blanks[UCHAR (*line_start)]) + line_start++; + line->keybeg = line_start; + } + if (key->skipeblanks) + trim_trailing_blanks (line->keybeg, &line->keylim); + } + + line_start = ptr; } - if (key->skipeblanks) - trim_trailing_blanks (line->keybeg, &line->keylim); + + ptr = ptrlim; + if (buf->eof) + break; } - ++lines->used; - beg = ptr + 1; - } + buf->used = ptr - buf->buf; + buf->nlines = buffer_linelim (buf) - line; + if (buf->nlines != 0) + { + buf->left = ptr - line_start; + return 1; + } - buf->newline_free = buf->left = lim - beg; + /* The current input line is too long to fit in the buffer. + Double the buffer size and try again. */ + if (2 * buf->alloc < buf->alloc) + xalloc_die (); + buf->alloc *= 2; + buf->buf = xrealloc (buf->buf, buf->alloc); + } } /* Compare strings A and B containing decimal fractions < 1. Each string @@ -1430,156 +1498,133 @@ compare (register const struct line *a, register const struct line *b) and return zero. */ static int -checkfp (FILE *fp, const char *file_name) +checkfp (FILE *fp, char *file_name) { struct buffer buf; /* Input buffer. */ - struct lines lines; /* Lines scanned from the buffer. */ struct line temp; /* Copy of previous line. */ - size_t cc; /* Character count. */ - size_t alloc; - uintmax_t line_number = 1; - struct line *disorder_line IF_LINT (= NULL); - uintmax_t disorder_line_number = 0; + size_t alloc = 0; + uintmax_t line_number = 0; struct keyfield *key = keylist; - size_t mergealloc = sortalloc / (2 * NMERGE); + int nonunique = 1 - unique; + int disordered = 0; - initbuf (&buf, mergealloc); - initlines (&lines, mergealloc / linelength + 1, - sortalloc / (4 * NMERGE * sizeof (struct line))); - alloc = linelength; - temp.text = xmalloc (alloc); + initbuf (&buf, sizeof (struct line), + sort_buffer_size (&fp, 1, &file_name, 1, + sizeof (struct line), sort_size)); + temp.text = NULL; - cc = fillbuf (&buf, fp); - if (cc == 0) - goto finish; - - findlines (&buf, &lines); - - while (1) + while (fillbuf (&buf, fp, file_name)) { - struct line *prev_line; /* Pointer to previous line. */ - int cmp; /* Result of calling compare. */ - size_t i; + struct line const *line = buffer_linelim (&buf); + struct line const *linebase = line - buf.nlines; - /* Compare each line in the buffer with its successor. */ - for (i = 1; i < lines.used; ++i) + /* Make sure the line saved from the old buffer contents is + less than or equal to the first line of the new buffer. */ + if (alloc && nonunique <= compare (&temp, line - 1)) { - cmp = compare (&lines.lines[i - 1], &lines.lines[i]); - if ((unique && cmp >= 0) || (cmp > 0)) - { - disorder_line = &lines.lines[i]; - disorder_line_number = line_number + i; - goto finish; - } + found_disorder: + { + char hr_buf[LONGEST_HUMAN_READABLE + 1]; + struct line const *disorder_line = line - 1; + uintmax_t disorder_line_number = + buffer_linelim (&buf) - disorder_line + line_number; + fprintf (stderr, _("%s: %s:%s: disorder: "), + program_name, file_name, + human_readable (disorder_line_number, hr_buf, 1, 1)); + write_bytes (disorder_line->text, disorder_line->length, stderr, + _("standard error")); + disordered = 1; + break; + } } - line_number += lines.used; + /* Compare each line in the buffer with its successor. */ + while (linebase < --line) + if (nonunique <= compare (line, line - 1)) + goto found_disorder; + + line_number += buf.nlines; - /* Save the last line of the buffer and refill the buffer. */ - prev_line = lines.lines + (lines.used - 1); - if (alloc < prev_line->length) + /* Save the last line of the buffer. */ + if (alloc < line->length) { do { - if (alloc * 2 < alloc) - xalloc_die (); alloc *= 2; + if (! alloc) + { + alloc = line->length; + break; + } } - while (alloc < prev_line->length); + while (alloc < line->length); temp.text = xrealloc (temp.text, alloc); } - memcpy (temp.text, prev_line->text, prev_line->length); - temp.length = prev_line->length; + memcpy (temp.text, line->text, line->length); + temp.length = line->length; if (key) { - temp.keybeg = temp.text + (prev_line->keybeg - prev_line->text); - temp.keylim = temp.text + (prev_line->keylim - prev_line->text); - } - - cc = fillbuf (&buf, fp); - if (cc == 0) - break; - - findlines (&buf, &lines); - /* Make sure the line saved from the old buffer contents is - less than or equal to the first line of the new buffer. */ - cmp = compare (&temp, &lines.lines[0]); - if ((unique && cmp >= 0) || (cmp > 0)) - { - disorder_line = &lines.lines[0]; - disorder_line_number = line_number; - break; + temp.keybeg = temp.text + (line->keybeg - line->text); + temp.keylim = temp.text + (line->keylim - line->text); } } -finish: xfclose (fp); - - if (disorder_line_number) - { - char hr_buf[LONGEST_HUMAN_READABLE + 1]; - fprintf (stderr, _("%s: %s:%s: disorder: "), program_name, file_name, - human_readable (disorder_line_number, hr_buf, 1, 1)); - write_bytes (disorder_line->text, disorder_line->length, stderr, - _("standard error")); - } - free (buf.buf); - free ((char *) lines.lines); - free (temp.text); - return NONZERO (disorder_line_number); + if (temp.text) + free (temp.text); + return disordered; } /* Merge lines from FPS onto OFP. NFPS cannot be greater than NMERGE. - Close FPS before returning. */ + Close FPS before returning. NAMES and OUTPUT_NAME give the names + of the files. */ static void -mergefps (FILE **fps, register int nfps, FILE *ofp, const char *output_file) +mergefps (FILE **fps, char **files, register int nfps, + FILE *ofp, const char *output_file) { struct buffer buffer[NMERGE]; /* Input buffers for each file. */ - struct lines lines[NMERGE]; /* Line tables for each buffer. */ struct line saved; /* Saved line storage for unique check. */ - struct line const *savedline IF_LINT (= NULL); + struct line const *savedline = NULL; /* &saved if there is a saved line. */ - size_t mergealloc = sortalloc / (2 * NMERGE); - size_t savealloc IF_LINT (= 0); /* Size allocated for the saved line. */ - size_t cur[NMERGE]; /* Current line in each line table. */ + size_t savealloc = 0; /* Size allocated for the saved line. */ + struct line const *cur[NMERGE]; /* Current line in each line table. */ + struct line const *base[NMERGE]; /* Base of each line table. */ int ord[NMERGE]; /* Table representing a permutation of fps, - such that lines[ord[0]].lines[cur[ord[0]]] - is the smallest line and will be next - output. */ + such that cur[ord[0]] is the smallest line + and will be next output. */ register int i, j, t; struct keyfield *key = keylist; - - /* Allocate space for a saved line if necessary. */ - if (unique) - { - savedline = NULL; - savealloc = linelength; - saved.text = xmalloc (savealloc); - } + saved.text = NULL; /* Read initial lines from each input file. */ for (i = 0; i < nfps; ++i) { - initbuf (&buffer[i], mergealloc); + size_t mergealloc = sort_buffer_size (&fps[i], 1, &files[i], 1, + sizeof (struct line), + sort_size / nfps); + initbuf (&buffer[i], sizeof (struct line), mergealloc); /* If a file is empty, eliminate it from future consideration. */ - while (i < nfps && !fillbuf (&buffer[i], fps[i])) + while (i < nfps && !fillbuf (&buffer[i], fps[i], files[i])) { xfclose (fps[i]); + zaptemp (files[i]); --nfps; for (j = i; j < nfps; ++j) - fps[j] = fps[j + 1]; + { + fps[j] = fps[j + 1]; + files[j] = files[j + 1]; + } } if (i == nfps) free (buffer[i].buf); else { - initlines (&lines[i], mergealloc / linelength + 1, - sortalloc / (4 * NMERGE * sizeof (struct line))); - findlines (&buffer[i], &lines[i]); - cur[i] = 0; + struct line const *linelim = buffer_linelim (&buffer[i]); + cur[i] = linelim - 1; + base[i] = linelim - buffer[i].nlines; } } @@ -1589,14 +1634,13 @@ mergefps (FILE **fps, register int nfps, FILE *ofp, const char *output_file) for (i = 0; i < nfps; ++i) ord[i] = i; for (i = 1; i < nfps; ++i) - if (compare (&lines[ord[i - 1]].lines[cur[ord[i - 1]]], - &lines[ord[i]].lines[cur[ord[i]]]) > 0) + if (0 < compare (cur[ord[i - 1]], cur[ord[i]])) t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0; /* Repeatedly output the smallest line until no input remains. */ while (nfps) { - struct line const *smallest = &lines[ord[0]].lines[cur[ord[0]]]; + struct line const *smallest = cur[ord[0]]; /* If uniquified output is turned on, output only the first of an identical series of lines. */ @@ -1612,8 +1656,14 @@ mergefps (FILE **fps, register int nfps, FILE *ofp, const char *output_file) savedline = &saved; if (savealloc < smallest->length) { - while ((savealloc *= 2) < smallest->length) - continue; + do + if (! savealloc) + { + savealloc = smallest->length; + break; + } + while ((savealloc *= 2) < smallest->length); + saved.text = xrealloc (saved.text, savealloc); } saved.length = smallest->length; @@ -1631,12 +1681,15 @@ mergefps (FILE **fps, register int nfps, FILE *ofp, const char *output_file) write_bytes (smallest->text, smallest->length, ofp, output_file); /* Check if we need to read more lines into core. */ - if (++cur[ord[0]] == lines[ord[0]].used) + if (base[ord[0]] < smallest) + cur[ord[0]] = smallest - 1; + else { - if (fillbuf (&buffer[ord[0]], fps[ord[0]])) + if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]])) { - findlines (&buffer[ord[0]], &lines[ord[0]]); - cur[ord[0]] = 0; + struct line const *linelim = buffer_linelim (&buffer[ord[0]]); + cur[ord[0]] = linelim - 1; + base[ord[0]] = linelim - buffer[ord[0]].nlines; } else { @@ -1646,14 +1699,15 @@ mergefps (FILE **fps, register int nfps, FILE *ofp, const char *output_file) --ord[i]; --nfps; xfclose (fps[ord[0]]); + zaptemp (files[ord[0]]); free (buffer[ord[0]].buf); - free ((char *) lines[ord[0]].lines); for (i = ord[0]; i < nfps; ++i) { fps[i] = fps[i + 1]; + files[i] = files[i + 1]; buffer[i] = buffer[i + 1]; - lines[i] = lines[i + 1]; cur[i] = cur[i + 1]; + base[i] = base[i + 1]; } for (i = 0; i < nfps; ++i) ord[i] = ord[i + 1]; @@ -1666,8 +1720,7 @@ mergefps (FILE **fps, register int nfps, FILE *ofp, const char *output_file) a line larger than it. */ for (i = 1; i < nfps; ++i) { - t = compare (&lines[ord[0]].lines[cur[ord[0]]], - &lines[ord[i]].lines[cur[ord[i]]]); + t = compare (cur[ord[0]], cur[ord[i]]); if (!t) t = ord[0] - ord[i]; if (t < 0) @@ -1686,7 +1739,9 @@ mergefps (FILE **fps, register int nfps, FILE *ofp, const char *output_file) } } -/* Sort the array LINES with NLINES members, using TEMP for temporary space. */ +/* Sort the array LINES with NLINES members, using TEMP for temporary space. + The input and output arrays are in reverse order, and LINES and + TEMP point just past the end of their respective arrays. */ static void sortlines (struct line *lines, size_t nlines, struct line *temp) @@ -1696,11 +1751,11 @@ sortlines (struct line *lines, size_t nlines, struct line *temp) if (nlines == 2) { - if (compare (&lines[0], &lines[1]) > 0) + if (0 < compare (&lines[-1], &lines[-2])) { - *temp = lines[0]; - lines[0] = lines[1]; - lines[1] = *temp; + struct line tmp = lines[-1]; + lines[-1] = lines[-2]; + lines[-2] = tmp; } return; } @@ -1708,7 +1763,7 @@ sortlines (struct line *lines, size_t nlines, struct line *temp) nlo = nlines / 2; lo = lines; nhi = nlines - nlo; - hi = lines + nlo; + hi = lines - nlo; if (nlo > 1) sortlines (lo, nlo, temp); @@ -1719,15 +1774,15 @@ sortlines (struct line *lines, size_t nlines, struct line *temp) t = temp; while (nlo && nhi) - if (compare (lo, hi) <= 0) - *t++ = *lo++, --nlo; + if (compare (lo - 1, hi - 1) <= 0) + *--t = *--lo, --nlo; else - *t++ = *hi++, --nhi; + *--t = *--hi, --nhi; while (nlo--) - *t++ = *lo++; + *--t = *--lo; for (lo = lines, nlo = nlines - nhi, t = temp; nlo; --nlo) - *lo++ = *t++; + *--lo = *--t; } /* Check that each of the NFILES FILES is ordered. @@ -1764,28 +1819,22 @@ merge (char **files, int nfiles, FILE *ofp, const char *output_file) for (j = 0; j < NMERGE; ++j) fps[j] = xfopen (files[i * NMERGE + j], "r"); tfp = xtmpfopen (temp = tempname ()); - mergefps (fps, NMERGE, tfp, temp); + mergefps (fps, &files[i * NMERGE], NMERGE, tfp, temp); xfclose (tfp); - for (j = 0; j < NMERGE; ++j) - zaptemp (files[i * NMERGE + j]); files[t++] = temp; } for (j = 0; j < nfiles % NMERGE; ++j) fps[j] = xfopen (files[i * NMERGE + j], "r"); tfp = xtmpfopen (temp = tempname ()); - mergefps (fps, nfiles % NMERGE, tfp, temp); + mergefps (fps, &files[i * NMERGE], nfiles % NMERGE, tfp, temp); xfclose (tfp); - for (j = 0; j < nfiles % NMERGE; ++j) - zaptemp (files[i * NMERGE + j]); files[t++] = temp; nfiles = t; } for (i = 0; i < nfiles; ++i) fps[i] = xfopen (files[i], "r"); - mergefps (fps, i, ofp, output_file); - for (i = 0; i < nfiles; ++i) - zaptemp (files[i]); + mergefps (fps, files, nfiles, ofp, output_file); } /* Sort NFILES FILES onto OFP. */ @@ -1794,28 +1843,36 @@ static void sort (char **files, int nfiles, FILE *ofp, const char *output_file) { struct buffer buf; - struct lines lines; - struct line *tmp = NULL; - size_t ntmp = 0; - FILE *fp, *tfp; + FILE *tfp; struct tempnode *node; int n_temp_files = 0; char **tempfiles; - initbuf (&buf, sortalloc); - initlines (&lines, sortalloc / linelength + 1, - sortalloc / (2 * sizeof (struct line))); + buf.alloc = 0; - while (nfiles--) + while (nfiles) { - const char *temp_output; - - fp = xfopen (*files++, "r"); - while (fillbuf (&buf, fp)) + char const *temp_output; + char const *file = *files; + FILE *fp = xfopen (file, "r"); + + if (! buf.alloc) + initbuf (&buf, 2 * sizeof (struct line), + sort_buffer_size (&fp, 1, files, nfiles, + 2 * sizeof (struct line), sort_size)); + buf.eof = 0; + files++; + nfiles--; + + while (fillbuf (&buf, fp, file)) { - size_t i; + struct line *line; + struct line *linebase; - if (nfiles && buf.used != buf.alloc && feof (fp)) + if (buf.eof && nfiles + && (2 * sizeof (struct line) + 1 + < (buf.alloc - buf.used + - 2 * sizeof (struct line) * buf.nlines))) { /* End of file, but there is more input and buffer room. Concatenate the next input file; this is faster in @@ -1824,21 +1881,10 @@ sort (char **files, int nfiles, FILE *ofp, const char *output_file) break; } - findlines (&buf, &lines); - if (ntmp < lines.used) - { - if (ntmp == 0) - ntmp = lines.used; - else - while ((ntmp *= 2) < lines.used) - continue; - if (SIZE_MAX / sizeof (struct line) < ntmp) - xalloc_die (); - tmp = (struct line *) - xrealloc ((char *) tmp, ntmp * sizeof (struct line)); - } - sortlines (lines.lines, lines.used, tmp); - if (feof (fp) && !nfiles && !n_temp_files && !buf.left) + line = buffer_linelim (&buf); + linebase = line - buf.nlines; + sortlines (line, buf.nlines, linebase); + if (buf.eof && !nfiles && !n_temp_files && !buf.left) { tfp = ofp; temp_output = output_file; @@ -1848,11 +1894,17 @@ sort (char **files, int nfiles, FILE *ofp, const char *output_file) ++n_temp_files; tfp = xtmpfopen (temp_output = tempname ()); } - for (i = 0; i < lines.used; ++i) - if (!unique || i == 0 - || compare (&lines.lines[i], &lines.lines[i - 1])) - write_bytes (lines.lines[i].text, lines.lines[i].length, tfp, - temp_output); + + do + { + line--; + write_bytes (line->text, line->length, tfp, temp_output); + if (unique) + while (linebase < line && compare (line, line - 1) == 0) + line--; + } + while (linebase < line); + if (tfp != ofp) xfclose (tfp); } @@ -1860,8 +1912,6 @@ sort (char **files, int nfiles, FILE *ofp, const char *output_file) } free (buf.buf); - free ((char *) lines.lines); - free ((char *) tmp); if (n_temp_files) { @@ -2361,8 +2411,8 @@ but lacks following character offset")); files = − } - if (sortalloc == 0) - default_sort_size (); + if (sort_size == 0) + sort_size = default_sort_size (); if (checkonly) { -- cgit v1.2.3-70-g09d2