diff options
author | Erich Eckner <git@eckner.net> | 2016-11-29 10:08:31 +0100 |
---|---|---|
committer | Erich Eckner <git@eckner.net> | 2017-02-16 13:23:49 +0100 |
commit | 698617e7eb710a3da51ff4edaf44274109544e4a (patch) | |
tree | 315d63c949846233f6e91d70e4728cefba185c8b /src/sort-uniq-keyfuncs.c | |
parent | e74f109de104c1966d2d4863aedfb25eefe0358f (diff) | |
download | coreutils-698617e7eb710a3da51ff4edaf44274109544e4a.tar.xz |
sort: split off even more source lines useful for uniq
Diffstat (limited to 'src/sort-uniq-keyfuncs.c')
-rw-r--r-- | src/sort-uniq-keyfuncs.c | 164 |
1 files changed, 164 insertions, 0 deletions
diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c index 196045bc0..d661aefe3 100644 --- a/src/sort-uniq-keyfuncs.c +++ b/src/sort-uniq-keyfuncs.c @@ -37,6 +37,9 @@ static bool hard_LC_COLLATE; /* The kind of blanks for '-b' to skip in various options. */ enum blanktype { bl_start, bl_end, bl_both }; +/* The character marking end of line. Default to \n. */ +static char eolchar = '\n'; + /* Lines are held in core as counted strings. */ struct line { @@ -46,6 +49,22 @@ struct line char *keylim; /* Limit of first key. */ }; +/* Input buffers. */ +struct buffer +{ + char *buf; /* Dynamically allocated buffer, + partitioned into 3 regions: + - input data; + - unused area; + - an array of lines, in reverse order. */ + size_t used; /* Number of bytes used for input data. */ + size_t nlines; /* Number of lines in the line array. */ + size_t alloc; /* Number of bytes allocated. */ + size_t left; /* Number of bytes left from previous reads. */ + size_t line_bytes; /* Number of bytes to reserve for each line. */ + bool eof; /* An EOF has been read. */ +}; + /* Sort key. */ struct keyfield { @@ -116,6 +135,14 @@ static struct month monthtab[] = {"SEP", 9} }; +/* Minimum size for a merge or check buffer. */ +#define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line)) + +/* The number of bytes needed for a merge or check buffer, which can + function relatively efficiently even if it holds only one line. If + a longer line is seen, this value is increased. */ +static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024); + /* Flag to reverse the order of all comparisons. */ static bool reverse; @@ -139,6 +166,23 @@ static bool unique; /* List of key field comparisons to be tried. */ static struct keyfield *keylist; +static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN; +static void +sort_die (char const *message, char const *file) +{ + die (SORT_FAILURE, errno, "%s: %s", message, + quotef (file ? file : _("standard output"))); +} + +/* Return one past the limit of the line array. */ + +static inline struct line * +buffer_linelim (struct buffer const *buf) +{ + void *linelim = buf->buf + buf->alloc; + return linelim; +} + /* Return a pointer to the first character of the field specified by KEY in LINE. */ @@ -283,6 +327,126 @@ limfield (struct line const *line, struct keyfield const *key) return ptr; } +/* Fill BUF reading from FP, moving buf->left bytes from the end + of buf->buf to the beginning first. If EOF is reached and the + file wasn't terminated by a newline, supply one. Set up BUF's line + table too. FILE is the name of the file corresponding to FP. + Return true if some input was read. */ + +static bool +fillbuf (struct buffer *buf, FILE *fp, char const *file) +{ + struct keyfield const *key = keylist; + char eol = eolchar; + size_t line_bytes = buf->line_bytes; + size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE; + + if (buf->eof) + return false; + + if (buf->used != buf->left) + { + memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left); + buf->used = buf->left; + buf->nlines = 0; + } + + while (true) + { + char *ptr = buf->buf + buf->used; + struct line *linelim = buffer_linelim (buf); + struct line *line = linelim - buf->nlines; + size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr; + char *line_start = buf->nlines ? line->text + line->length : buf->buf; + + while (line_bytes + 1 < avail) + { + /* Read as many bytes as possible, but do not read so many + bytes that there might not be enough room for the + corresponding line array. The worst case is when the + rest of the input file consists entirely of newlines, + except that the last byte is not a newline. */ + size_t readsize = (avail - 1) / (line_bytes + 1); + size_t bytes_read = fread (ptr, 1, readsize, fp); + char *ptrlim = ptr + bytes_read; + char *p; + avail -= bytes_read; + + if (bytes_read != readsize) + { + if (ferror (fp)) + sort_die (_("read failed"), file); + if (feof (fp)) + { + buf->eof = true; + if (buf->buf == ptrlim) + return false; + if (line_start != ptrlim && ptrlim[-1] != eol) + *ptrlim++ = eol; + } + } + + /* Find and record each line in the just-read input. */ + while ((p = memchr (ptr, eol, ptrlim - ptr))) + { + /* Delimit the line with NUL. This eliminates the need to + temporarily replace the last byte with NUL when calling + xmemcoll(), which increases performance. */ + *p = '\0'; + ptr = p + 1; + line--; + line->text = line_start; + line->length = ptr - line_start; + mergesize = MAX (mergesize, line->length); + avail -= line_bytes; + + if (key) + { + /* Precompute the position of the first key for + efficiency. */ + line->keylim = (key->eword == SIZE_MAX + ? p + : limfield (line, key)); + + if (key->sword != SIZE_MAX) + line->keybeg = begfield (line, key); + else + { + if (key->skipsblanks) + while (blanks[to_uchar (*line_start)]) + line_start++; + line->keybeg = line_start; + } + } + + line_start = ptr; + } + + ptr = ptrlim; + if (buf->eof) + break; + } + + buf->used = ptr - buf->buf; + buf->nlines = buffer_linelim (buf) - line; + if (buf->nlines != 0) + { + buf->left = ptr - line_start; + merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE; + return true; + } + + { + /* The current input line is too long to fit in the buffer. + Increase the buffer size and try again, keeping it properly + aligned. */ + size_t line_alloc = buf->alloc / sizeof (struct line); + buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line)); + buf->alloc = line_alloc * sizeof (struct line); + } + } +} + /* Table that maps characters to order-of-magnitude values. */ static char const unit_order[UCHAR_LIM] = { |