summaryrefslogtreecommitdiff
path: root/src/sort-uniq-keyfuncs.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/sort-uniq-keyfuncs.c')
-rw-r--r--src/sort-uniq-keyfuncs.c275
1 files changed, 274 insertions, 1 deletions
diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c
index 31bf03e19..5b27f7cce 100644
--- a/src/sort-uniq-keyfuncs.c
+++ b/src/sort-uniq-keyfuncs.c
@@ -1,5 +1,7 @@
#include "filevercmp.h"
+#include "ignore-value.h"
#include "md5.h"
+#include "mbswidth.h"
#include "strnumcmp.h"
#define UCHAR_LIM (UCHAR_MAX + 1)
@@ -31,7 +33,6 @@ static int thousands_sep;
/* Nonzero if the corresponding locales are hard. */
static bool hard_LC_COLLATE;
-
#define NONZERO(x) ((x) != 0)
/* The kind of blanks for '-b' to skip in various options. */
@@ -143,6 +144,10 @@ static struct month monthtab[] =
a longer line is seen, this value is increased. */
static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
+/* The approximate maximum number of bytes of main memory to use, as
+ specified by the user. Zero if the user has not specified a size. */
+static size_t sort_size;
+
/* Flag to reverse the order of all comparisons. */
static bool reverse;
@@ -163,9 +168,18 @@ static int tab = TAB_DEFAULT;
Only the last of a sequence of equal lines will be output. */
static bool unique;
+/* Nonzero if any of the input files are the standard input. */
+static bool have_read_stdin;
+
/* List of key field comparisons to be tried. */
static struct keyfield *keylist;
+/* Annotate the output with extra info to aid the user. */
+static bool debug;
+
+/* Report MESSAGE for FILE, then clean up and exit.
+ If FILE is null, it represents standard output. */
+
static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN;
static void
sort_die (char const *message, char const *file)
@@ -174,6 +188,120 @@ sort_die (char const *message, char const *file)
quotef (file ? file : _("standard output")));
}
+/* Return a stream for FILE, opened with mode HOW. A null FILE means
+ standard output; HOW should be "w". When opening for input, "-"
+ means standard input. To avoid confusion, do not return file
+ descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
+ opening an ordinary FILE. Return NULL if unsuccessful.
+
+ fadvise() is used to specify an access pattern for input files.
+ There are a few hints we could possibly provide,
+ and after careful testing it was decided that
+ specifying POSIX_FADV_SEQUENTIAL was not detrimental
+ to any cases. On Linux 2.6.31, this option doubles
+ the size of read ahead performed and thus was seen to
+ benefit these cases:
+ Merging
+ Sorting with a smaller internal buffer
+ Reading from faster flash devices
+
+ In _addition_ one could also specify other hints...
+
+ POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
+ at least uses that to _synchronously_ prepopulate the cache
+ with the specified range. While sort does need to
+ read all of its input before outputting, a synchronous
+ read of the whole file up front precludes any processing
+ that sort could do in parallel with the system doing
+ read ahead of the data. This was seen to have negative effects
+ in a couple of cases:
+ Merging
+ Sorting with a smaller internal buffer
+ Note this option was seen to shorten the runtime for sort
+ on a multicore system with lots of RAM and other processes
+ competing for CPU. It could be argued that more explicit
+ scheduling hints with 'nice' et. al. are more appropriate
+ for this situation.
+
+ POSIX_FADV_NOREUSE is a possibility as it could lower
+ the priority of input data in the cache as sort will
+ only need to process it once. However its functionality
+ has changed over Linux kernel versions and as of 2.6.31
+ it does nothing and thus we can't depend on what it might
+ do in future.
+
+ POSIX_FADV_DONTNEED is not appropriate for user specified
+ input files, but for temp files we do want to drop the
+ cache immediately after processing. This is done implicitly
+ however when the files are unlinked. */
+
+static FILE *
+stream_open (char const *file, char const *how)
+{
+ FILE *fp;
+
+ if (*how == 'r')
+ {
+ if (STREQ (file, "-"))
+ {
+ have_read_stdin = true;
+ fp = stdin;
+ }
+ else
+ fp = fopen (file, how);
+ fadvise (fp, FADVISE_SEQUENTIAL);
+ }
+ else if (*how == 'w')
+ {
+ if (file && ftruncate (STDOUT_FILENO, 0) != 0)
+ die (SORT_FAILURE, errno, _("%s: error truncating"),
+ quotef (file));
+ fp = stdout;
+ }
+ else
+ sort_die (_("unexpected mode passed to stream_open"), file);
+
+ return fp;
+}
+
+/* Same as stream_open, except always return a non-null value; die on
+ failure. */
+
+static FILE *
+xfopen (char const *file, char const *how)
+{
+ FILE *fp = stream_open (file, how);
+ if (!fp)
+ sort_die (_("open failed"), file);
+ return fp;
+}
+
+/* Close FP, whose name is FILE, and report any errors. */
+
+static void
+xfclose (FILE *fp, char const *file)
+{
+ switch (fileno (fp))
+ {
+ case STDIN_FILENO:
+ /* Allow reading stdin from tty more than once. */
+ if (feof (fp))
+ clearerr (fp);
+ break;
+
+ case STDOUT_FILENO:
+ /* Don't close stdout just yet. close_stdout does that. */
+ if (fflush (fp) != 0)
+ sort_die (_("fflush failed"), file);
+ break;
+
+ default:
+ if (fclose (fp) != 0)
+ sort_die (_("close failed"), file);
+ break;
+ }
+}
+
/* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
must be at least sizeof (struct line). Allocate ALLOC bytes
initially. */
@@ -847,6 +975,40 @@ compare_random (char *restrict texta, size_t lena,
return diff;
}
+/* Return the printable width of the block of memory starting at
+ TEXT and ending just before LIM, counting each tab as one byte.
+ FIXME: Should we generally be counting non printable chars? */
+
+static size_t
+debug_width (char const *text, char const *lim)
+{
+ size_t width = mbsnwidth (text, lim - text, 0);
+ while (text < lim)
+ width += (*text++ == '\t');
+ return width;
+}
+
+/* For debug mode, "underline" a key at the
+ specified offset and screen width. */
+
+static void
+mark_key (size_t offset, size_t width)
+{
+ while (offset--)
+ putchar (' ');
+
+ if (!width)
+ printf (_("^ no match for key\n"));
+ else
+ {
+ do
+ putchar ('_');
+ while (--width);
+
+ putchar ('\n');
+ }
+}
+
/* Return true if KEY is a numeric key. */
static inline bool
@@ -855,6 +1017,76 @@ key_numeric (struct keyfield const *key)
return key->numeric || key->general_numeric || key->human_numeric;
}
+/* For LINE, output a debugging line that underlines KEY in LINE.
+ If KEY is null, underline the whole line. */
+
+static void
+debug_key (struct line const *line, struct keyfield const *key)
+{
+ char *text = line->text;
+ char *beg = text;
+ char *lim = text + line->length - 1;
+
+ if (key)
+ {
+ if (key->sword != SIZE_MAX)
+ beg = begfield (line, key);
+ if (key->eword != SIZE_MAX)
+ lim = limfield (line, key);
+
+ if ((key->skipsblanks && key->sword == SIZE_MAX)
+ || key->month || key_numeric (key))
+ {
+ char saved = *lim;
+ *lim = '\0';
+
+ while (blanks[to_uchar (*beg)])
+ beg++;
+
+ char *tighter_lim = beg;
+
+ if (lim < beg)
+ tighter_lim = lim;
+ else if (key->month)
+ getmonth (beg, &tighter_lim);
+ else if (key->general_numeric)
+ ignore_value (strtold (beg, &tighter_lim));
+ else if (key->numeric || key->human_numeric)
+ {
+ char const *p = beg + (beg < lim && *beg == '-');
+ unsigned char max_digit = traverse_raw_number (&p);
+ if ('0' <= max_digit)
+ {
+ unsigned char ch = *p;
+ tighter_lim = (char *) p
+ + (key->human_numeric && unit_order[ch]);
+ }
+ }
+ else
+ tighter_lim = lim;
+
+ *lim = saved;
+ lim = tighter_lim;
+ }
+ }
+
+ size_t offset = debug_width (text, beg);
+ size_t width = debug_width (beg, lim);
+ mark_key (offset, width);
+}
+
+/* Debug LINE by underlining its keys. */
+
+static void
+debug_line (struct line const *line)
+{
+ struct keyfield const *key = keylist;
+
+ do
+ debug_key (line, key);
+ while (key && ((key = key->next) || ! (unique || stable)));
+}
+
/* Compare two lines A and B trying every key in sequence until there
are no more keys or a difference is found. */
@@ -1101,6 +1333,47 @@ compare (struct line const *a, struct line const *b)
return reverse ? -diff : diff;
}
+/* Write LINE to output stream FP; the output file's name is
+ OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
+ otherwise. If debugging is enabled and FP is standard output,
+ append some debugging information. */
+
+static void
+write_line (struct line const *line, FILE *fp, char const *output_file)
+{
+ char *buf = line->text;
+ size_t n_bytes = line->length;
+ char *ebuf = buf + n_bytes;
+
+ if (!output_file && debug)
+ {
+ /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
+ char const *c = buf;
+
+ while (c < ebuf)
+ {
+ char wc = *c++;
+ if (wc == '\t')
+ wc = '>';
+ else if (c == ebuf)
+ wc = '\n';
+ if (fputc (wc, fp) == EOF)
+ sort_die (_("write failed"), output_file);
+ }
+
+ debug_line (line);
+ }
+ else
+ {
+ ebuf[-1] = eolchar;
+ if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
+ sort_die (_("write failed"), output_file);
+ ebuf[-1] = '\0';
+ }
+}
+
+/* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
+
static void
insertkey (struct keyfield *key_arg)
{