summaryrefslogtreecommitdiff
path: root/src/sort-uniq-keyfuncs.c
diff options
context:
space:
mode:
authorErich Eckner <git@eckner.net>2016-11-29 10:08:31 +0100
committerErich Eckner <git@eckner.net>2017-02-16 13:23:49 +0100
commit698617e7eb710a3da51ff4edaf44274109544e4a (patch)
tree315d63c949846233f6e91d70e4728cefba185c8b /src/sort-uniq-keyfuncs.c
parente74f109de104c1966d2d4863aedfb25eefe0358f (diff)
downloadcoreutils-698617e7eb710a3da51ff4edaf44274109544e4a.tar.xz
sort: split off even more source lines useful for uniq
Diffstat (limited to 'src/sort-uniq-keyfuncs.c')
-rw-r--r--src/sort-uniq-keyfuncs.c164
1 files changed, 164 insertions, 0 deletions
diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c
index 196045bc0..d661aefe3 100644
--- a/src/sort-uniq-keyfuncs.c
+++ b/src/sort-uniq-keyfuncs.c
@@ -37,6 +37,9 @@ static bool hard_LC_COLLATE;
/* The kind of blanks for '-b' to skip in various options. */
enum blanktype { bl_start, bl_end, bl_both };
+/* The character marking end of line. Default to \n. */
+static char eolchar = '\n';
+
/* Lines are held in core as counted strings. */
struct line
{
@@ -46,6 +49,22 @@ struct line
char *keylim; /* Limit of first key. */
};
+/* Input buffers. */
+struct buffer
+{
+ char *buf; /* Dynamically allocated buffer,
+ partitioned into 3 regions:
+ - input data;
+ - unused area;
+ - an array of lines, in reverse order. */
+ size_t used; /* Number of bytes used for input data. */
+ size_t nlines; /* Number of lines in the line array. */
+ size_t alloc; /* Number of bytes allocated. */
+ size_t left; /* Number of bytes left from previous reads. */
+ size_t line_bytes; /* Number of bytes to reserve for each line. */
+ bool eof; /* An EOF has been read. */
+};
+
/* Sort key. */
struct keyfield
{
@@ -116,6 +135,14 @@ static struct month monthtab[] =
{"SEP", 9}
};
+/* Minimum size for a merge or check buffer. */
+#define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))
+
+/* The number of bytes needed for a merge or check buffer, which can
+ function relatively efficiently even if it holds only one line. If
+ a longer line is seen, this value is increased. */
+static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
+
/* Flag to reverse the order of all comparisons. */
static bool reverse;
@@ -139,6 +166,23 @@ static bool unique;
/* List of key field comparisons to be tried. */
static struct keyfield *keylist;
+static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN;
+static void
+sort_die (char const *message, char const *file)
+{
+ die (SORT_FAILURE, errno, "%s: %s", message,
+ quotef (file ? file : _("standard output")));
+}
+
+/* Return one past the limit of the line array. */
+
+static inline struct line *
+buffer_linelim (struct buffer const *buf)
+{
+ void *linelim = buf->buf + buf->alloc;
+ return linelim;
+}
+
/* Return a pointer to the first character of the field specified
by KEY in LINE. */
@@ -283,6 +327,126 @@ limfield (struct line const *line, struct keyfield const *key)
return ptr;
}
+/* Fill BUF reading from FP, moving buf->left bytes from the end
+ of buf->buf to the beginning first. If EOF is reached and the
+ file wasn't terminated by a newline, supply one. Set up BUF's line
+ table too. FILE is the name of the file corresponding to FP.
+ Return true if some input was read. */
+
+static bool
+fillbuf (struct buffer *buf, FILE *fp, char const *file)
+{
+ struct keyfield const *key = keylist;
+ char eol = eolchar;
+ size_t line_bytes = buf->line_bytes;
+ size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
+
+ if (buf->eof)
+ return false;
+
+ if (buf->used != buf->left)
+ {
+ memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
+ buf->used = buf->left;
+ buf->nlines = 0;
+ }
+
+ while (true)
+ {
+ char *ptr = buf->buf + buf->used;
+ struct line *linelim = buffer_linelim (buf);
+ struct line *line = linelim - buf->nlines;
+ size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
+ char *line_start = buf->nlines ? line->text + line->length : buf->buf;
+
+ while (line_bytes + 1 < avail)
+ {
+ /* Read as many bytes as possible, but do not read so many
+ bytes that there might not be enough room for the
+ corresponding line array. The worst case is when the
+ rest of the input file consists entirely of newlines,
+ except that the last byte is not a newline. */
+ size_t readsize = (avail - 1) / (line_bytes + 1);
+ size_t bytes_read = fread (ptr, 1, readsize, fp);
+ char *ptrlim = ptr + bytes_read;
+ char *p;
+ avail -= bytes_read;
+
+ if (bytes_read != readsize)
+ {
+ if (ferror (fp))
+ sort_die (_("read failed"), file);
+ if (feof (fp))
+ {
+ buf->eof = true;
+ if (buf->buf == ptrlim)
+ return false;
+ if (line_start != ptrlim && ptrlim[-1] != eol)
+ *ptrlim++ = eol;
+ }
+ }
+
+ /* Find and record each line in the just-read input. */
+ while ((p = memchr (ptr, eol, ptrlim - ptr)))
+ {
+ /* Delimit the line with NUL. This eliminates the need to
+ temporarily replace the last byte with NUL when calling
+ xmemcoll(), which increases performance. */
+ *p = '\0';
+ ptr = p + 1;
+ line--;
+ line->text = line_start;
+ line->length = ptr - line_start;
+ mergesize = MAX (mergesize, line->length);
+ avail -= line_bytes;
+
+ if (key)
+ {
+ /* Precompute the position of the first key for
+ efficiency. */
+ line->keylim = (key->eword == SIZE_MAX
+ ? p
+ : limfield (line, key));
+
+ if (key->sword != SIZE_MAX)
+ line->keybeg = begfield (line, key);
+ else
+ {
+ if (key->skipsblanks)
+ while (blanks[to_uchar (*line_start)])
+ line_start++;
+ line->keybeg = line_start;
+ }
+ }
+
+ line_start = ptr;
+ }
+
+ ptr = ptrlim;
+ if (buf->eof)
+ break;
+ }
+
+ buf->used = ptr - buf->buf;
+ buf->nlines = buffer_linelim (buf) - line;
+ if (buf->nlines != 0)
+ {
+ buf->left = ptr - line_start;
+ merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
+ return true;
+ }
+
+ {
+ /* The current input line is too long to fit in the buffer.
+ Increase the buffer size and try again, keeping it properly
+ aligned. */
+ size_t line_alloc = buf->alloc / sizeof (struct line);
+ buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));
+ buf->alloc = line_alloc * sizeof (struct line);
+ }
+ }
+}
+
/* Table that maps characters to order-of-magnitude values. */
static char const unit_order[UCHAR_LIM] =
{