diff options
Diffstat (limited to 'src/sort-uniq-keyfuncs.c')
-rw-r--r-- | src/sort-uniq-keyfuncs.c | 275 |
1 files changed, 274 insertions, 1 deletions
diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c index 31bf03e19..5b27f7cce 100644 --- a/src/sort-uniq-keyfuncs.c +++ b/src/sort-uniq-keyfuncs.c @@ -1,5 +1,7 @@ #include "filevercmp.h" +#include "ignore-value.h" #include "md5.h" +#include "mbswidth.h" #include "strnumcmp.h" #define UCHAR_LIM (UCHAR_MAX + 1) @@ -31,7 +33,6 @@ static int thousands_sep; /* Nonzero if the corresponding locales are hard. */ static bool hard_LC_COLLATE; - #define NONZERO(x) ((x) != 0) /* The kind of blanks for '-b' to skip in various options. */ @@ -143,6 +144,10 @@ static struct month monthtab[] = a longer line is seen, this value is increased. */ static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024); +/* The approximate maximum number of bytes of main memory to use, as + specified by the user. Zero if the user has not specified a size. */ +static size_t sort_size; + /* Flag to reverse the order of all comparisons. */ static bool reverse; @@ -163,9 +168,18 @@ static int tab = TAB_DEFAULT; Only the last of a sequence of equal lines will be output. */ static bool unique; +/* Nonzero if any of the input files are the standard input. */ +static bool have_read_stdin; + /* List of key field comparisons to be tried. */ static struct keyfield *keylist; +/* Annotate the output with extra info to aid the user. */ +static bool debug; + +/* Report MESSAGE for FILE, then clean up and exit. + If FILE is null, it represents standard output. */ + static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN; static void sort_die (char const *message, char const *file) @@ -174,6 +188,120 @@ sort_die (char const *message, char const *file) quotef (file ? file : _("standard output"))); } +/* Return a stream for FILE, opened with mode HOW. A null FILE means + standard output; HOW should be "w". When opening for input, "-" + means standard input. To avoid confusion, do not return file + descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when + opening an ordinary FILE. Return NULL if unsuccessful. + + fadvise() is used to specify an access pattern for input files. + There are a few hints we could possibly provide, + and after careful testing it was decided that + specifying POSIX_FADV_SEQUENTIAL was not detrimental + to any cases. On Linux 2.6.31, this option doubles + the size of read ahead performed and thus was seen to + benefit these cases: + Merging + Sorting with a smaller internal buffer + Reading from faster flash devices + + In _addition_ one could also specify other hints... + + POSIX_FADV_WILLNEED was tested, but Linux 2.6.31 + at least uses that to _synchronously_ prepopulate the cache + with the specified range. While sort does need to + read all of its input before outputting, a synchronous + read of the whole file up front precludes any processing + that sort could do in parallel with the system doing + read ahead of the data. This was seen to have negative effects + in a couple of cases: + Merging + Sorting with a smaller internal buffer + Note this option was seen to shorten the runtime for sort + on a multicore system with lots of RAM and other processes + competing for CPU. It could be argued that more explicit + scheduling hints with 'nice' et. al. are more appropriate + for this situation. + + POSIX_FADV_NOREUSE is a possibility as it could lower + the priority of input data in the cache as sort will + only need to process it once. However its functionality + has changed over Linux kernel versions and as of 2.6.31 + it does nothing and thus we can't depend on what it might + do in future. + + POSIX_FADV_DONTNEED is not appropriate for user specified + input files, but for temp files we do want to drop the + cache immediately after processing. This is done implicitly + however when the files are unlinked. */ + +static FILE * +stream_open (char const *file, char const *how) +{ + FILE *fp; + + if (*how == 'r') + { + if (STREQ (file, "-")) + { + have_read_stdin = true; + fp = stdin; + } + else + fp = fopen (file, how); + fadvise (fp, FADVISE_SEQUENTIAL); + } + else if (*how == 'w') + { + if (file && ftruncate (STDOUT_FILENO, 0) != 0) + die (SORT_FAILURE, errno, _("%s: error truncating"), + quotef (file)); + fp = stdout; + } + else + sort_die (_("unexpected mode passed to stream_open"), file); + + return fp; +} + +/* Same as stream_open, except always return a non-null value; die on + failure. */ + +static FILE * +xfopen (char const *file, char const *how) +{ + FILE *fp = stream_open (file, how); + if (!fp) + sort_die (_("open failed"), file); + return fp; +} + +/* Close FP, whose name is FILE, and report any errors. */ + +static void +xfclose (FILE *fp, char const *file) +{ + switch (fileno (fp)) + { + case STDIN_FILENO: + /* Allow reading stdin from tty more than once. */ + if (feof (fp)) + clearerr (fp); + break; + + case STDOUT_FILENO: + /* Don't close stdout just yet. close_stdout does that. */ + if (fflush (fp) != 0) + sort_die (_("fflush failed"), file); + break; + + default: + if (fclose (fp) != 0) + sort_die (_("close failed"), file); + break; + } +} + /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES must be at least sizeof (struct line). Allocate ALLOC bytes initially. */ @@ -847,6 +975,40 @@ compare_random (char *restrict texta, size_t lena, return diff; } +/* Return the printable width of the block of memory starting at + TEXT and ending just before LIM, counting each tab as one byte. + FIXME: Should we generally be counting non printable chars? */ + +static size_t +debug_width (char const *text, char const *lim) +{ + size_t width = mbsnwidth (text, lim - text, 0); + while (text < lim) + width += (*text++ == '\t'); + return width; +} + +/* For debug mode, "underline" a key at the + specified offset and screen width. */ + +static void +mark_key (size_t offset, size_t width) +{ + while (offset--) + putchar (' '); + + if (!width) + printf (_("^ no match for key\n")); + else + { + do + putchar ('_'); + while (--width); + + putchar ('\n'); + } +} + /* Return true if KEY is a numeric key. */ static inline bool @@ -855,6 +1017,76 @@ key_numeric (struct keyfield const *key) return key->numeric || key->general_numeric || key->human_numeric; } +/* For LINE, output a debugging line that underlines KEY in LINE. + If KEY is null, underline the whole line. */ + +static void +debug_key (struct line const *line, struct keyfield const *key) +{ + char *text = line->text; + char *beg = text; + char *lim = text + line->length - 1; + + if (key) + { + if (key->sword != SIZE_MAX) + beg = begfield (line, key); + if (key->eword != SIZE_MAX) + lim = limfield (line, key); + + if ((key->skipsblanks && key->sword == SIZE_MAX) + || key->month || key_numeric (key)) + { + char saved = *lim; + *lim = '\0'; + + while (blanks[to_uchar (*beg)]) + beg++; + + char *tighter_lim = beg; + + if (lim < beg) + tighter_lim = lim; + else if (key->month) + getmonth (beg, &tighter_lim); + else if (key->general_numeric) + ignore_value (strtold (beg, &tighter_lim)); + else if (key->numeric || key->human_numeric) + { + char const *p = beg + (beg < lim && *beg == '-'); + unsigned char max_digit = traverse_raw_number (&p); + if ('0' <= max_digit) + { + unsigned char ch = *p; + tighter_lim = (char *) p + + (key->human_numeric && unit_order[ch]); + } + } + else + tighter_lim = lim; + + *lim = saved; + lim = tighter_lim; + } + } + + size_t offset = debug_width (text, beg); + size_t width = debug_width (beg, lim); + mark_key (offset, width); +} + +/* Debug LINE by underlining its keys. */ + +static void +debug_line (struct line const *line) +{ + struct keyfield const *key = keylist; + + do + debug_key (line, key); + while (key && ((key = key->next) || ! (unique || stable))); +} + /* Compare two lines A and B trying every key in sequence until there are no more keys or a difference is found. */ @@ -1101,6 +1333,47 @@ compare (struct line const *a, struct line const *b) return reverse ? -diff : diff; } +/* Write LINE to output stream FP; the output file's name is + OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output + otherwise. If debugging is enabled and FP is standard output, + append some debugging information. */ + +static void +write_line (struct line const *line, FILE *fp, char const *output_file) +{ + char *buf = line->text; + size_t n_bytes = line->length; + char *ebuf = buf + n_bytes; + + if (!output_file && debug) + { + /* Convert TAB to '>' and EOL to \n, and then output debugging info. */ + char const *c = buf; + + while (c < ebuf) + { + char wc = *c++; + if (wc == '\t') + wc = '>'; + else if (c == ebuf) + wc = '\n'; + if (fputc (wc, fp) == EOF) + sort_die (_("write failed"), output_file); + } + + debug_line (line); + } + else + { + ebuf[-1] = eolchar; + if (fwrite (buf, 1, n_bytes, fp) != n_bytes) + sort_die (_("write failed"), output_file); + ebuf[-1] = '\0'; + } +} + +/* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */ + static void insertkey (struct keyfield *key_arg) { |