From 34e84e57aa987b7c39ba8f285c8ce602c6cf63ee Mon Sep 17 00:00:00 2001 From: Erich Eckner Date: Wed, 30 Nov 2016 11:17:36 +0100 Subject: sort: split off even more lines --- src/sort-uniq-keyfuncs.c | 275 ++++++++++++++++++++++++++++++++++++++++++++++- src/sort.c | 275 +---------------------------------------------- 2 files changed, 275 insertions(+), 275 deletions(-) diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c index 31bf03e19..5b27f7cce 100644 --- a/src/sort-uniq-keyfuncs.c +++ b/src/sort-uniq-keyfuncs.c @@ -1,5 +1,7 @@ #include "filevercmp.h" +#include "ignore-value.h" #include "md5.h" +#include "mbswidth.h" #include "strnumcmp.h" #define UCHAR_LIM (UCHAR_MAX + 1) @@ -31,7 +33,6 @@ static int thousands_sep; /* Nonzero if the corresponding locales are hard. */ static bool hard_LC_COLLATE; - #define NONZERO(x) ((x) != 0) /* The kind of blanks for '-b' to skip in various options. */ @@ -143,6 +144,10 @@ static struct month monthtab[] = a longer line is seen, this value is increased. */ static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024); +/* The approximate maximum number of bytes of main memory to use, as + specified by the user. Zero if the user has not specified a size. */ +static size_t sort_size; + /* Flag to reverse the order of all comparisons. */ static bool reverse; @@ -163,9 +168,18 @@ static int tab = TAB_DEFAULT; Only the last of a sequence of equal lines will be output. */ static bool unique; +/* Nonzero if any of the input files are the standard input. */ +static bool have_read_stdin; + /* List of key field comparisons to be tried. */ static struct keyfield *keylist; +/* Annotate the output with extra info to aid the user. */ +static bool debug; + +/* Report MESSAGE for FILE, then clean up and exit. + If FILE is null, it represents standard output. */ + static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN; static void sort_die (char const *message, char const *file) @@ -174,6 +188,120 @@ sort_die (char const *message, char const *file) quotef (file ? file : _("standard output"))); } +/* Return a stream for FILE, opened with mode HOW. A null FILE means + standard output; HOW should be "w". When opening for input, "-" + means standard input. To avoid confusion, do not return file + descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when + opening an ordinary FILE. Return NULL if unsuccessful. + + fadvise() is used to specify an access pattern for input files. + There are a few hints we could possibly provide, + and after careful testing it was decided that + specifying POSIX_FADV_SEQUENTIAL was not detrimental + to any cases. On Linux 2.6.31, this option doubles + the size of read ahead performed and thus was seen to + benefit these cases: + Merging + Sorting with a smaller internal buffer + Reading from faster flash devices + + In _addition_ one could also specify other hints... + + POSIX_FADV_WILLNEED was tested, but Linux 2.6.31 + at least uses that to _synchronously_ prepopulate the cache + with the specified range. While sort does need to + read all of its input before outputting, a synchronous + read of the whole file up front precludes any processing + that sort could do in parallel with the system doing + read ahead of the data. This was seen to have negative effects + in a couple of cases: + Merging + Sorting with a smaller internal buffer + Note this option was seen to shorten the runtime for sort + on a multicore system with lots of RAM and other processes + competing for CPU. It could be argued that more explicit + scheduling hints with 'nice' et. al. are more appropriate + for this situation. + + POSIX_FADV_NOREUSE is a possibility as it could lower + the priority of input data in the cache as sort will + only need to process it once. However its functionality + has changed over Linux kernel versions and as of 2.6.31 + it does nothing and thus we can't depend on what it might + do in future. + + POSIX_FADV_DONTNEED is not appropriate for user specified + input files, but for temp files we do want to drop the + cache immediately after processing. This is done implicitly + however when the files are unlinked. */ + +static FILE * +stream_open (char const *file, char const *how) +{ + FILE *fp; + + if (*how == 'r') + { + if (STREQ (file, "-")) + { + have_read_stdin = true; + fp = stdin; + } + else + fp = fopen (file, how); + fadvise (fp, FADVISE_SEQUENTIAL); + } + else if (*how == 'w') + { + if (file && ftruncate (STDOUT_FILENO, 0) != 0) + die (SORT_FAILURE, errno, _("%s: error truncating"), + quotef (file)); + fp = stdout; + } + else + sort_die (_("unexpected mode passed to stream_open"), file); + + return fp; +} + +/* Same as stream_open, except always return a non-null value; die on + failure. */ + +static FILE * +xfopen (char const *file, char const *how) +{ + FILE *fp = stream_open (file, how); + if (!fp) + sort_die (_("open failed"), file); + return fp; +} + +/* Close FP, whose name is FILE, and report any errors. */ + +static void +xfclose (FILE *fp, char const *file) +{ + switch (fileno (fp)) + { + case STDIN_FILENO: + /* Allow reading stdin from tty more than once. */ + if (feof (fp)) + clearerr (fp); + break; + + case STDOUT_FILENO: + /* Don't close stdout just yet. close_stdout does that. */ + if (fflush (fp) != 0) + sort_die (_("fflush failed"), file); + break; + + default: + if (fclose (fp) != 0) + sort_die (_("close failed"), file); + break; + } +} + /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES must be at least sizeof (struct line). Allocate ALLOC bytes initially. */ @@ -847,6 +975,40 @@ compare_random (char *restrict texta, size_t lena, return diff; } +/* Return the printable width of the block of memory starting at + TEXT and ending just before LIM, counting each tab as one byte. + FIXME: Should we generally be counting non printable chars? */ + +static size_t +debug_width (char const *text, char const *lim) +{ + size_t width = mbsnwidth (text, lim - text, 0); + while (text < lim) + width += (*text++ == '\t'); + return width; +} + +/* For debug mode, "underline" a key at the + specified offset and screen width. */ + +static void +mark_key (size_t offset, size_t width) +{ + while (offset--) + putchar (' '); + + if (!width) + printf (_("^ no match for key\n")); + else + { + do + putchar ('_'); + while (--width); + + putchar ('\n'); + } +} + /* Return true if KEY is a numeric key. */ static inline bool @@ -855,6 +1017,76 @@ key_numeric (struct keyfield const *key) return key->numeric || key->general_numeric || key->human_numeric; } +/* For LINE, output a debugging line that underlines KEY in LINE. + If KEY is null, underline the whole line. */ + +static void +debug_key (struct line const *line, struct keyfield const *key) +{ + char *text = line->text; + char *beg = text; + char *lim = text + line->length - 1; + + if (key) + { + if (key->sword != SIZE_MAX) + beg = begfield (line, key); + if (key->eword != SIZE_MAX) + lim = limfield (line, key); + + if ((key->skipsblanks && key->sword == SIZE_MAX) + || key->month || key_numeric (key)) + { + char saved = *lim; + *lim = '\0'; + + while (blanks[to_uchar (*beg)]) + beg++; + + char *tighter_lim = beg; + + if (lim < beg) + tighter_lim = lim; + else if (key->month) + getmonth (beg, &tighter_lim); + else if (key->general_numeric) + ignore_value (strtold (beg, &tighter_lim)); + else if (key->numeric || key->human_numeric) + { + char const *p = beg + (beg < lim && *beg == '-'); + unsigned char max_digit = traverse_raw_number (&p); + if ('0' <= max_digit) + { + unsigned char ch = *p; + tighter_lim = (char *) p + + (key->human_numeric && unit_order[ch]); + } + } + else + tighter_lim = lim; + + *lim = saved; + lim = tighter_lim; + } + } + + size_t offset = debug_width (text, beg); + size_t width = debug_width (beg, lim); + mark_key (offset, width); +} + +/* Debug LINE by underlining its keys. */ + +static void +debug_line (struct line const *line) +{ + struct keyfield const *key = keylist; + + do + debug_key (line, key); + while (key && ((key = key->next) || ! (unique || stable))); +} + /* Compare two lines A and B trying every key in sequence until there are no more keys or a difference is found. */ @@ -1101,6 +1333,47 @@ compare (struct line const *a, struct line const *b) return reverse ? -diff : diff; } +/* Write LINE to output stream FP; the output file's name is + OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output + otherwise. If debugging is enabled and FP is standard output, + append some debugging information. */ + +static void +write_line (struct line const *line, FILE *fp, char const *output_file) +{ + char *buf = line->text; + size_t n_bytes = line->length; + char *ebuf = buf + n_bytes; + + if (!output_file && debug) + { + /* Convert TAB to '>' and EOL to \n, and then output debugging info. */ + char const *c = buf; + + while (c < ebuf) + { + char wc = *c++; + if (wc == '\t') + wc = '>'; + else if (c == ebuf) + wc = '\n'; + if (fputc (wc, fp) == EOF) + sort_die (_("write failed"), output_file); + } + + debug_line (line); + } + else + { + ebuf[-1] = eolchar; + if (fwrite (buf, 1, n_bytes, fp) != n_bytes) + sort_die (_("write failed"), output_file); + ebuf[-1] = '\0'; + } +} + +/* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */ + static void insertkey (struct keyfield *key_arg) { diff --git a/src/sort.c b/src/sort.c index b7a6f7aec..7610393a0 100644 --- a/src/sort.c +++ b/src/sort.c @@ -38,8 +38,6 @@ #include "hard-locale.h" #include "hash.h" #include "heap.h" -#include "ignore-value.h" -#include "mbswidth.h" #include "nproc.h" #include "physmem.h" #include "posixver.h" @@ -139,6 +137,7 @@ enum #if HAVE_NL_LANGINFO static bool hard_LC_TIME; #endif + /* Binary merge tree node. */ struct merge_node { @@ -175,10 +174,6 @@ static struct line saved_line; /* Minimum sort size; the code might not work with smaller sizes. */ #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE) -/* The approximate maximum number of bytes of main memory to use, as - specified by the user. Zero if the user has not specified a size. */ -static size_t sort_size; - /* The initial allocation factor for non-regular files. This is used, e.g., when reading from a pipe. Don't make it too big, since it is multiplied by ~130 to @@ -195,15 +190,9 @@ static size_t temp_dir_count; /* Number of allocated slots in temp_dirs. */ static size_t temp_dir_alloc; -/* Nonzero if any of the input files are the standard input. */ -static bool have_read_stdin; - /* Program used to (de)compress temp files. Must accept -d. */ static char const *compress_program; -/* Annotate the output with extra info to aid the user. */ -static bool debug; - /* Maximum number of files to merge in one go. If more than this number are present, temp files will be used. */ static unsigned int nmerge = NMERGE_DEFAULT; @@ -236,9 +225,6 @@ async_safe_die (int errnum, const char *errstr) _exit (SORT_FAILURE); } -/* Report MESSAGE for FILE, then clean up and exit. - If FILE is null, it represents standard output. */ - void usage (int status) { @@ -717,120 +703,6 @@ create_temp_file (int *pfd, bool survive_fd_exhaustion) return node; } -/* Return a stream for FILE, opened with mode HOW. A null FILE means - standard output; HOW should be "w". When opening for input, "-" - means standard input. To avoid confusion, do not return file - descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when - opening an ordinary FILE. Return NULL if unsuccessful. - - fadvise() is used to specify an access pattern for input files. - There are a few hints we could possibly provide, - and after careful testing it was decided that - specifying POSIX_FADV_SEQUENTIAL was not detrimental - to any cases. On Linux 2.6.31, this option doubles - the size of read ahead performed and thus was seen to - benefit these cases: - Merging - Sorting with a smaller internal buffer - Reading from faster flash devices - - In _addition_ one could also specify other hints... - - POSIX_FADV_WILLNEED was tested, but Linux 2.6.31 - at least uses that to _synchronously_ prepopulate the cache - with the specified range. While sort does need to - read all of its input before outputting, a synchronous - read of the whole file up front precludes any processing - that sort could do in parallel with the system doing - read ahead of the data. This was seen to have negative effects - in a couple of cases: - Merging - Sorting with a smaller internal buffer - Note this option was seen to shorten the runtime for sort - on a multicore system with lots of RAM and other processes - competing for CPU. It could be argued that more explicit - scheduling hints with 'nice' et. al. are more appropriate - for this situation. - - POSIX_FADV_NOREUSE is a possibility as it could lower - the priority of input data in the cache as sort will - only need to process it once. However its functionality - has changed over Linux kernel versions and as of 2.6.31 - it does nothing and thus we can't depend on what it might - do in future. - - POSIX_FADV_DONTNEED is not appropriate for user specified - input files, but for temp files we do want to drop the - cache immediately after processing. This is done implicitly - however when the files are unlinked. */ - -static FILE * -stream_open (char const *file, char const *how) -{ - FILE *fp; - - if (*how == 'r') - { - if (STREQ (file, "-")) - { - have_read_stdin = true; - fp = stdin; - } - else - fp = fopen (file, how); - fadvise (fp, FADVISE_SEQUENTIAL); - } - else if (*how == 'w') - { - if (file && ftruncate (STDOUT_FILENO, 0) != 0) - die (SORT_FAILURE, errno, _("%s: error truncating"), - quotef (file)); - fp = stdout; - } - else - assert (!"unexpected mode passed to stream_open"); - - return fp; -} - -/* Same as stream_open, except always return a non-null value; die on - failure. */ - -static FILE * -xfopen (char const *file, char const *how) -{ - FILE *fp = stream_open (file, how); - if (!fp) - sort_die (_("open failed"), file); - return fp; -} - -/* Close FP, whose name is FILE, and report any errors. */ - -static void -xfclose (FILE *fp, char const *file) -{ - switch (fileno (fp)) - { - case STDIN_FILENO: - /* Allow reading stdin from tty more than once. */ - if (feof (fp)) - clearerr (fp); - break; - - case STDOUT_FILENO: - /* Don't close stdout just yet. close_stdout does that. */ - if (fflush (fp) != 0) - sort_die (_("fflush failed"), file); - break; - - default: - if (fclose (fp) != 0) - sort_die (_("close failed"), file); - break; - } -} - static void move_fd_or_die (int oldfd, int newfd) { @@ -1398,110 +1270,6 @@ random_md5_state_init (char const *random_source) md5_process_bytes (buf, sizeof buf, &random_md5_state); } -/* Return the printable width of the block of memory starting at - TEXT and ending just before LIM, counting each tab as one byte. - FIXME: Should we generally be counting non printable chars? */ - -static size_t -debug_width (char const *text, char const *lim) -{ - size_t width = mbsnwidth (text, lim - text, 0); - while (text < lim) - width += (*text++ == '\t'); - return width; -} - -/* For debug mode, "underline" a key at the - specified offset and screen width. */ - -static void -mark_key (size_t offset, size_t width) -{ - while (offset--) - putchar (' '); - - if (!width) - printf (_("^ no match for key\n")); - else - { - do - putchar ('_'); - while (--width); - - putchar ('\n'); - } -} - -/* For LINE, output a debugging line that underlines KEY in LINE. - If KEY is null, underline the whole line. */ - -static void -debug_key (struct line const *line, struct keyfield const *key) -{ - char *text = line->text; - char *beg = text; - char *lim = text + line->length - 1; - - if (key) - { - if (key->sword != SIZE_MAX) - beg = begfield (line, key); - if (key->eword != SIZE_MAX) - lim = limfield (line, key); - - if ((key->skipsblanks && key->sword == SIZE_MAX) - || key->month || key_numeric (key)) - { - char saved = *lim; - *lim = '\0'; - - while (blanks[to_uchar (*beg)]) - beg++; - - char *tighter_lim = beg; - - if (lim < beg) - tighter_lim = lim; - else if (key->month) - getmonth (beg, &tighter_lim); - else if (key->general_numeric) - ignore_value (strtold (beg, &tighter_lim)); - else if (key->numeric || key->human_numeric) - { - char const *p = beg + (beg < lim && *beg == '-'); - unsigned char max_digit = traverse_raw_number (&p); - if ('0' <= max_digit) - { - unsigned char ch = *p; - tighter_lim = (char *) p - + (key->human_numeric && unit_order[ch]); - } - } - else - tighter_lim = lim; - - *lim = saved; - lim = tighter_lim; - } - } - - size_t offset = debug_width (text, beg); - size_t width = debug_width (beg, lim); - mark_key (offset, width); -} - -/* Debug LINE by underlining its keys. */ - -static void -debug_line (struct line const *line) -{ - struct keyfield const *key = keylist; - - do - debug_key (line, key); - while (key && ((key = key->next) || ! (unique || stable))); -} - /* Return whether sorting options specified for key. */ static bool @@ -1655,45 +1423,6 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) error (0, 0, _("option '-r' only applies to last-resort comparison")); } -/* Write LINE to output stream FP; the output file's name is - OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output - otherwise. If debugging is enabled and FP is standard output, - append some debugging information. */ - -static void -write_line (struct line const *line, FILE *fp, char const *output_file) -{ - char *buf = line->text; - size_t n_bytes = line->length; - char *ebuf = buf + n_bytes; - - if (!output_file && debug) - { - /* Convert TAB to '>' and EOL to \n, and then output debugging info. */ - char const *c = buf; - - while (c < ebuf) - { - char wc = *c++; - if (wc == '\t') - wc = '>'; - else if (c == ebuf) - wc = '\n'; - if (fputc (wc, fp) == EOF) - sort_die (_("write failed"), output_file); - } - - debug_line (line); - } - else - { - ebuf[-1] = eolchar; - if (fwrite (buf, 1, n_bytes, fp) != n_bytes) - sort_die (_("write failed"), output_file); - ebuf[-1] = '\0'; - } -} - /* Check that the lines read from FILE_NAME come in order. Return true if they are in order. If CHECKONLY == 'c', also print a diagnostic (FILE_NAME, line number, contents of line) to stderr if @@ -2912,8 +2641,6 @@ sort (char *const *files, size_t nfiles, char const *output_file, reap_all (); } -/* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */ - /* Report incompatible options. */ static void incompatible_options (char const *) ATTRIBUTE_NORETURN; -- cgit v1.2.3-54-g00ecf