From 698617e7eb710a3da51ff4edaf44274109544e4a Mon Sep 17 00:00:00 2001 From: Erich Eckner Date: Tue, 29 Nov 2016 10:08:31 +0100 Subject: sort: split off even more source lines useful for uniq --- src/sort-uniq-keyfuncs.c | 164 +++++++++++++++++++++++++++++++++++++++++++++++ src/sort.c | 164 ----------------------------------------------- 2 files changed, 164 insertions(+), 164 deletions(-) diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c index 196045bc0..d661aefe3 100644 --- a/src/sort-uniq-keyfuncs.c +++ b/src/sort-uniq-keyfuncs.c @@ -37,6 +37,9 @@ static bool hard_LC_COLLATE; /* The kind of blanks for '-b' to skip in various options. */ enum blanktype { bl_start, bl_end, bl_both }; +/* The character marking end of line. Default to \n. */ +static char eolchar = '\n'; + /* Lines are held in core as counted strings. */ struct line { @@ -46,6 +49,22 @@ struct line char *keylim; /* Limit of first key. */ }; +/* Input buffers. */ +struct buffer +{ + char *buf; /* Dynamically allocated buffer, + partitioned into 3 regions: + - input data; + - unused area; + - an array of lines, in reverse order. */ + size_t used; /* Number of bytes used for input data. */ + size_t nlines; /* Number of lines in the line array. */ + size_t alloc; /* Number of bytes allocated. */ + size_t left; /* Number of bytes left from previous reads. */ + size_t line_bytes; /* Number of bytes to reserve for each line. */ + bool eof; /* An EOF has been read. */ +}; + /* Sort key. */ struct keyfield { @@ -116,6 +135,14 @@ static struct month monthtab[] = {"SEP", 9} }; +/* Minimum size for a merge or check buffer. */ +#define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line)) + +/* The number of bytes needed for a merge or check buffer, which can + function relatively efficiently even if it holds only one line. If + a longer line is seen, this value is increased. */ +static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024); + /* Flag to reverse the order of all comparisons. */ static bool reverse; @@ -139,6 +166,23 @@ static bool unique; /* List of key field comparisons to be tried. */ static struct keyfield *keylist; +static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN; +static void +sort_die (char const *message, char const *file) +{ + die (SORT_FAILURE, errno, "%s: %s", message, + quotef (file ? file : _("standard output"))); +} + +/* Return one past the limit of the line array. */ + +static inline struct line * +buffer_linelim (struct buffer const *buf) +{ + void *linelim = buf->buf + buf->alloc; + return linelim; +} + /* Return a pointer to the first character of the field specified by KEY in LINE. */ @@ -283,6 +327,126 @@ limfield (struct line const *line, struct keyfield const *key) return ptr; } +/* Fill BUF reading from FP, moving buf->left bytes from the end + of buf->buf to the beginning first. If EOF is reached and the + file wasn't terminated by a newline, supply one. Set up BUF's line + table too. FILE is the name of the file corresponding to FP. + Return true if some input was read. */ + +static bool +fillbuf (struct buffer *buf, FILE *fp, char const *file) +{ + struct keyfield const *key = keylist; + char eol = eolchar; + size_t line_bytes = buf->line_bytes; + size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE; + + if (buf->eof) + return false; + + if (buf->used != buf->left) + { + memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left); + buf->used = buf->left; + buf->nlines = 0; + } + + while (true) + { + char *ptr = buf->buf + buf->used; + struct line *linelim = buffer_linelim (buf); + struct line *line = linelim - buf->nlines; + size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr; + char *line_start = buf->nlines ? line->text + line->length : buf->buf; + + while (line_bytes + 1 < avail) + { + /* Read as many bytes as possible, but do not read so many + bytes that there might not be enough room for the + corresponding line array. The worst case is when the + rest of the input file consists entirely of newlines, + except that the last byte is not a newline. */ + size_t readsize = (avail - 1) / (line_bytes + 1); + size_t bytes_read = fread (ptr, 1, readsize, fp); + char *ptrlim = ptr + bytes_read; + char *p; + avail -= bytes_read; + + if (bytes_read != readsize) + { + if (ferror (fp)) + sort_die (_("read failed"), file); + if (feof (fp)) + { + buf->eof = true; + if (buf->buf == ptrlim) + return false; + if (line_start != ptrlim && ptrlim[-1] != eol) + *ptrlim++ = eol; + } + } + + /* Find and record each line in the just-read input. */ + while ((p = memchr (ptr, eol, ptrlim - ptr))) + { + /* Delimit the line with NUL. This eliminates the need to + temporarily replace the last byte with NUL when calling + xmemcoll(), which increases performance. */ + *p = '\0'; + ptr = p + 1; + line--; + line->text = line_start; + line->length = ptr - line_start; + mergesize = MAX (mergesize, line->length); + avail -= line_bytes; + + if (key) + { + /* Precompute the position of the first key for + efficiency. */ + line->keylim = (key->eword == SIZE_MAX + ? p + : limfield (line, key)); + + if (key->sword != SIZE_MAX) + line->keybeg = begfield (line, key); + else + { + if (key->skipsblanks) + while (blanks[to_uchar (*line_start)]) + line_start++; + line->keybeg = line_start; + } + } + + line_start = ptr; + } + + ptr = ptrlim; + if (buf->eof) + break; + } + + buf->used = ptr - buf->buf; + buf->nlines = buffer_linelim (buf) - line; + if (buf->nlines != 0) + { + buf->left = ptr - line_start; + merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE; + return true; + } + + { + /* The current input line is too long to fit in the buffer. + Increase the buffer size and try again, keeping it properly + aligned. */ + size_t line_alloc = buf->alloc / sizeof (struct line); + buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line)); + buf->alloc = line_alloc * sizeof (struct line); + } + } +} + /* Table that maps characters to order-of-magnitude values. */ static char const unit_order[UCHAR_LIM] = { diff --git a/src/sort.c b/src/sort.c index 6f28b63ae..e2a3c7e3e 100644 --- a/src/sort.c +++ b/src/sort.c @@ -139,25 +139,6 @@ enum #if HAVE_NL_LANGINFO static bool hard_LC_TIME; #endif -/* The character marking end of line. Default to \n. */ -static char eolchar = '\n'; - -/* Input buffers. */ -struct buffer -{ - char *buf; /* Dynamically allocated buffer, - partitioned into 3 regions: - - input data; - - unused area; - - an array of lines, in reverse order. */ - size_t used; /* Number of bytes used for input data. */ - size_t nlines; /* Number of lines in the line array. */ - size_t alloc; /* Number of bytes allocated. */ - size_t left; /* Number of bytes left from previous reads. */ - size_t line_bytes; /* Number of bytes to reserve for each line. */ - bool eof; /* An EOF has been read. */ -}; - /* Binary merge tree node. */ struct merge_node { @@ -191,17 +172,9 @@ static struct line saved_line; /* During the merge phase, the number of files to merge at once. */ #define NMERGE_DEFAULT 16 -/* Minimum size for a merge or check buffer. */ -#define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line)) - /* Minimum sort size; the code might not work with smaller sizes. */ #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE) -/* The number of bytes needed for a merge or check buffer, which can - function relatively efficiently even if it holds only one line. If - a longer line is seen, this value is increased. */ -static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024); - /* The approximate maximum number of bytes of main memory to use, as specified by the user. Zero if the user has not specified a size. */ static size_t sort_size; @@ -266,14 +239,6 @@ async_safe_die (int errnum, const char *errstr) /* Report MESSAGE for FILE, then clean up and exit. If FILE is null, it represents standard output. */ -static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN; -static void -sort_die (char const *message, char const *file) -{ - die (SORT_FAILURE, errno, "%s: %s", message, - quotef (file ? file : _("standard output"))); -} - void usage (int status) { @@ -1445,135 +1410,6 @@ initbuf (struct buffer *buf, size_t line_bytes, size_t alloc) buf->eof = false; } -/* Return one past the limit of the line array. */ - -static inline struct line * -buffer_linelim (struct buffer const *buf) -{ - void *linelim = buf->buf + buf->alloc; - return linelim; -} - -/* Fill BUF reading from FP, moving buf->left bytes from the end - of buf->buf to the beginning first. If EOF is reached and the - file wasn't terminated by a newline, supply one. Set up BUF's line - table too. FILE is the name of the file corresponding to FP. - Return true if some input was read. */ - -static bool -fillbuf (struct buffer *buf, FILE *fp, char const *file) -{ - struct keyfield const *key = keylist; - char eol = eolchar; - size_t line_bytes = buf->line_bytes; - size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE; - - if (buf->eof) - return false; - - if (buf->used != buf->left) - { - memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left); - buf->used = buf->left; - buf->nlines = 0; - } - - while (true) - { - char *ptr = buf->buf + buf->used; - struct line *linelim = buffer_linelim (buf); - struct line *line = linelim - buf->nlines; - size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr; - char *line_start = buf->nlines ? line->text + line->length : buf->buf; - - while (line_bytes + 1 < avail) - { - /* Read as many bytes as possible, but do not read so many - bytes that there might not be enough room for the - corresponding line array. The worst case is when the - rest of the input file consists entirely of newlines, - except that the last byte is not a newline. */ - size_t readsize = (avail - 1) / (line_bytes + 1); - size_t bytes_read = fread (ptr, 1, readsize, fp); - char *ptrlim = ptr + bytes_read; - char *p; - avail -= bytes_read; - - if (bytes_read != readsize) - { - if (ferror (fp)) - sort_die (_("read failed"), file); - if (feof (fp)) - { - buf->eof = true; - if (buf->buf == ptrlim) - return false; - if (line_start != ptrlim && ptrlim[-1] != eol) - *ptrlim++ = eol; - } - } - - /* Find and record each line in the just-read input. */ - while ((p = memchr (ptr, eol, ptrlim - ptr))) - { - /* Delimit the line with NUL. This eliminates the need to - temporarily replace the last byte with NUL when calling - xmemcoll(), which increases performance. */ - *p = '\0'; - ptr = p + 1; - line--; - line->text = line_start; - line->length = ptr - line_start; - mergesize = MAX (mergesize, line->length); - avail -= line_bytes; - - if (key) - { - /* Precompute the position of the first key for - efficiency. */ - line->keylim = (key->eword == SIZE_MAX - ? p - : limfield (line, key)); - - if (key->sword != SIZE_MAX) - line->keybeg = begfield (line, key); - else - { - if (key->skipsblanks) - while (blanks[to_uchar (*line_start)]) - line_start++; - line->keybeg = line_start; - } - } - - line_start = ptr; - } - - ptr = ptrlim; - if (buf->eof) - break; - } - - buf->used = ptr - buf->buf; - buf->nlines = buffer_linelim (buf) - line; - if (buf->nlines != 0) - { - buf->left = ptr - line_start; - merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE; - return true; - } - - { - /* The current input line is too long to fit in the buffer. - Increase the buffer size and try again, keeping it properly - aligned. */ - size_t line_alloc = buf->alloc / sizeof (struct line); - buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line)); - buf->alloc = line_alloc * sizeof (struct line); - } - } -} - /* Initialize the randomly chosen MD5 state. */ static void -- cgit v1.2.3-54-g00ecf