summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/sort-uniq-keyfuncs.c275
-rw-r--r--src/sort.c275
2 files changed, 275 insertions, 275 deletions
diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c
index 31bf03e19..5b27f7cce 100644
--- a/src/sort-uniq-keyfuncs.c
+++ b/src/sort-uniq-keyfuncs.c
@@ -1,5 +1,7 @@
#include "filevercmp.h"
+#include "ignore-value.h"
#include "md5.h"
+#include "mbswidth.h"
#include "strnumcmp.h"
#define UCHAR_LIM (UCHAR_MAX + 1)
@@ -31,7 +33,6 @@ static int thousands_sep;
/* Nonzero if the corresponding locales are hard. */
static bool hard_LC_COLLATE;
-
#define NONZERO(x) ((x) != 0)
/* The kind of blanks for '-b' to skip in various options. */
@@ -143,6 +144,10 @@ static struct month monthtab[] =
a longer line is seen, this value is increased. */
static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);
+/* The approximate maximum number of bytes of main memory to use, as
+ specified by the user. Zero if the user has not specified a size. */
+static size_t sort_size;
+
/* Flag to reverse the order of all comparisons. */
static bool reverse;
@@ -163,9 +168,18 @@ static int tab = TAB_DEFAULT;
Only the last of a sequence of equal lines will be output. */
static bool unique;
+/* Nonzero if any of the input files are the standard input. */
+static bool have_read_stdin;
+
/* List of key field comparisons to be tried. */
static struct keyfield *keylist;
+/* Annotate the output with extra info to aid the user. */
+static bool debug;
+
+/* Report MESSAGE for FILE, then clean up and exit.
+ If FILE is null, it represents standard output. */
+
static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN;
static void
sort_die (char const *message, char const *file)
@@ -174,6 +188,120 @@ sort_die (char const *message, char const *file)
quotef (file ? file : _("standard output")));
}
+/* Return a stream for FILE, opened with mode HOW. A null FILE means
+ standard output; HOW should be "w". When opening for input, "-"
+ means standard input. To avoid confusion, do not return file
+ descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
+ opening an ordinary FILE. Return NULL if unsuccessful.
+
+ fadvise() is used to specify an access pattern for input files.
+ There are a few hints we could possibly provide,
+ and after careful testing it was decided that
+ specifying POSIX_FADV_SEQUENTIAL was not detrimental
+ to any cases. On Linux 2.6.31, this option doubles
+ the size of read ahead performed and thus was seen to
+ benefit these cases:
+ Merging
+ Sorting with a smaller internal buffer
+ Reading from faster flash devices
+
+ In _addition_ one could also specify other hints...
+
+ POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
+ at least uses that to _synchronously_ prepopulate the cache
+ with the specified range. While sort does need to
+ read all of its input before outputting, a synchronous
+ read of the whole file up front precludes any processing
+ that sort could do in parallel with the system doing
+ read ahead of the data. This was seen to have negative effects
+ in a couple of cases:
+ Merging
+ Sorting with a smaller internal buffer
+ Note this option was seen to shorten the runtime for sort
+ on a multicore system with lots of RAM and other processes
+ competing for CPU. It could be argued that more explicit
+ scheduling hints with 'nice' et. al. are more appropriate
+ for this situation.
+
+ POSIX_FADV_NOREUSE is a possibility as it could lower
+ the priority of input data in the cache as sort will
+ only need to process it once. However its functionality
+ has changed over Linux kernel versions and as of 2.6.31
+ it does nothing and thus we can't depend on what it might
+ do in future.
+
+ POSIX_FADV_DONTNEED is not appropriate for user specified
+ input files, but for temp files we do want to drop the
+ cache immediately after processing. This is done implicitly
+ however when the files are unlinked. */
+
+static FILE *
+stream_open (char const *file, char const *how)
+{
+ FILE *fp;
+
+ if (*how == 'r')
+ {
+ if (STREQ (file, "-"))
+ {
+ have_read_stdin = true;
+ fp = stdin;
+ }
+ else
+ fp = fopen (file, how);
+ fadvise (fp, FADVISE_SEQUENTIAL);
+ }
+ else if (*how == 'w')
+ {
+ if (file && ftruncate (STDOUT_FILENO, 0) != 0)
+ die (SORT_FAILURE, errno, _("%s: error truncating"),
+ quotef (file));
+ fp = stdout;
+ }
+ else
+ sort_die (_("unexpected mode passed to stream_open"), file);
+
+ return fp;
+}
+
+/* Same as stream_open, except always return a non-null value; die on
+ failure. */
+
+static FILE *
+xfopen (char const *file, char const *how)
+{
+ FILE *fp = stream_open (file, how);
+ if (!fp)
+ sort_die (_("open failed"), file);
+ return fp;
+}
+
+/* Close FP, whose name is FILE, and report any errors. */
+
+static void
+xfclose (FILE *fp, char const *file)
+{
+ switch (fileno (fp))
+ {
+ case STDIN_FILENO:
+ /* Allow reading stdin from tty more than once. */
+ if (feof (fp))
+ clearerr (fp);
+ break;
+
+ case STDOUT_FILENO:
+ /* Don't close stdout just yet. close_stdout does that. */
+ if (fflush (fp) != 0)
+ sort_die (_("fflush failed"), file);
+ break;
+
+ default:
+ if (fclose (fp) != 0)
+ sort_die (_("close failed"), file);
+ break;
+ }
+}
+
/* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES
must be at least sizeof (struct line). Allocate ALLOC bytes
initially. */
@@ -847,6 +975,40 @@ compare_random (char *restrict texta, size_t lena,
return diff;
}
+/* Return the printable width of the block of memory starting at
+ TEXT and ending just before LIM, counting each tab as one byte.
+ FIXME: Should we generally be counting non printable chars? */
+
+static size_t
+debug_width (char const *text, char const *lim)
+{
+ size_t width = mbsnwidth (text, lim - text, 0);
+ while (text < lim)
+ width += (*text++ == '\t');
+ return width;
+}
+
+/* For debug mode, "underline" a key at the
+ specified offset and screen width. */
+
+static void
+mark_key (size_t offset, size_t width)
+{
+ while (offset--)
+ putchar (' ');
+
+ if (!width)
+ printf (_("^ no match for key\n"));
+ else
+ {
+ do
+ putchar ('_');
+ while (--width);
+
+ putchar ('\n');
+ }
+}
+
/* Return true if KEY is a numeric key. */
static inline bool
@@ -855,6 +1017,76 @@ key_numeric (struct keyfield const *key)
return key->numeric || key->general_numeric || key->human_numeric;
}
+/* For LINE, output a debugging line that underlines KEY in LINE.
+ If KEY is null, underline the whole line. */
+
+static void
+debug_key (struct line const *line, struct keyfield const *key)
+{
+ char *text = line->text;
+ char *beg = text;
+ char *lim = text + line->length - 1;
+
+ if (key)
+ {
+ if (key->sword != SIZE_MAX)
+ beg = begfield (line, key);
+ if (key->eword != SIZE_MAX)
+ lim = limfield (line, key);
+
+ if ((key->skipsblanks && key->sword == SIZE_MAX)
+ || key->month || key_numeric (key))
+ {
+ char saved = *lim;
+ *lim = '\0';
+
+ while (blanks[to_uchar (*beg)])
+ beg++;
+
+ char *tighter_lim = beg;
+
+ if (lim < beg)
+ tighter_lim = lim;
+ else if (key->month)
+ getmonth (beg, &tighter_lim);
+ else if (key->general_numeric)
+ ignore_value (strtold (beg, &tighter_lim));
+ else if (key->numeric || key->human_numeric)
+ {
+ char const *p = beg + (beg < lim && *beg == '-');
+ unsigned char max_digit = traverse_raw_number (&p);
+ if ('0' <= max_digit)
+ {
+ unsigned char ch = *p;
+ tighter_lim = (char *) p
+ + (key->human_numeric && unit_order[ch]);
+ }
+ }
+ else
+ tighter_lim = lim;
+
+ *lim = saved;
+ lim = tighter_lim;
+ }
+ }
+
+ size_t offset = debug_width (text, beg);
+ size_t width = debug_width (beg, lim);
+ mark_key (offset, width);
+}
+
+/* Debug LINE by underlining its keys. */
+
+static void
+debug_line (struct line const *line)
+{
+ struct keyfield const *key = keylist;
+
+ do
+ debug_key (line, key);
+ while (key && ((key = key->next) || ! (unique || stable)));
+}
+
/* Compare two lines A and B trying every key in sequence until there
are no more keys or a difference is found. */
@@ -1101,6 +1333,47 @@ compare (struct line const *a, struct line const *b)
return reverse ? -diff : diff;
}
+/* Write LINE to output stream FP; the output file's name is
+ OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
+ otherwise. If debugging is enabled and FP is standard output,
+ append some debugging information. */
+
+static void
+write_line (struct line const *line, FILE *fp, char const *output_file)
+{
+ char *buf = line->text;
+ size_t n_bytes = line->length;
+ char *ebuf = buf + n_bytes;
+
+ if (!output_file && debug)
+ {
+ /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
+ char const *c = buf;
+
+ while (c < ebuf)
+ {
+ char wc = *c++;
+ if (wc == '\t')
+ wc = '>';
+ else if (c == ebuf)
+ wc = '\n';
+ if (fputc (wc, fp) == EOF)
+ sort_die (_("write failed"), output_file);
+ }
+
+ debug_line (line);
+ }
+ else
+ {
+ ebuf[-1] = eolchar;
+ if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
+ sort_die (_("write failed"), output_file);
+ ebuf[-1] = '\0';
+ }
+}
+
+/* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
+
static void
insertkey (struct keyfield *key_arg)
{
diff --git a/src/sort.c b/src/sort.c
index b7a6f7aec..7610393a0 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -38,8 +38,6 @@
#include "hard-locale.h"
#include "hash.h"
#include "heap.h"
-#include "ignore-value.h"
-#include "mbswidth.h"
#include "nproc.h"
#include "physmem.h"
#include "posixver.h"
@@ -139,6 +137,7 @@ enum
#if HAVE_NL_LANGINFO
static bool hard_LC_TIME;
#endif
+
/* Binary merge tree node. */
struct merge_node
{
@@ -175,10 +174,6 @@ static struct line saved_line;
/* Minimum sort size; the code might not work with smaller sizes. */
#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)
-/* The approximate maximum number of bytes of main memory to use, as
- specified by the user. Zero if the user has not specified a size. */
-static size_t sort_size;
-
/* The initial allocation factor for non-regular files.
This is used, e.g., when reading from a pipe.
Don't make it too big, since it is multiplied by ~130 to
@@ -195,15 +190,9 @@ static size_t temp_dir_count;
/* Number of allocated slots in temp_dirs. */
static size_t temp_dir_alloc;
-/* Nonzero if any of the input files are the standard input. */
-static bool have_read_stdin;
-
/* Program used to (de)compress temp files. Must accept -d. */
static char const *compress_program;
-/* Annotate the output with extra info to aid the user. */
-static bool debug;
-
/* Maximum number of files to merge in one go. If more than this
number are present, temp files will be used. */
static unsigned int nmerge = NMERGE_DEFAULT;
@@ -236,9 +225,6 @@ async_safe_die (int errnum, const char *errstr)
_exit (SORT_FAILURE);
}
-/* Report MESSAGE for FILE, then clean up and exit.
- If FILE is null, it represents standard output. */
-
void
usage (int status)
{
@@ -717,120 +703,6 @@ create_temp_file (int *pfd, bool survive_fd_exhaustion)
return node;
}
-/* Return a stream for FILE, opened with mode HOW. A null FILE means
- standard output; HOW should be "w". When opening for input, "-"
- means standard input. To avoid confusion, do not return file
- descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
- opening an ordinary FILE. Return NULL if unsuccessful.
-
- fadvise() is used to specify an access pattern for input files.
- There are a few hints we could possibly provide,
- and after careful testing it was decided that
- specifying POSIX_FADV_SEQUENTIAL was not detrimental
- to any cases. On Linux 2.6.31, this option doubles
- the size of read ahead performed and thus was seen to
- benefit these cases:
- Merging
- Sorting with a smaller internal buffer
- Reading from faster flash devices
-
- In _addition_ one could also specify other hints...
-
- POSIX_FADV_WILLNEED was tested, but Linux 2.6.31
- at least uses that to _synchronously_ prepopulate the cache
- with the specified range. While sort does need to
- read all of its input before outputting, a synchronous
- read of the whole file up front precludes any processing
- that sort could do in parallel with the system doing
- read ahead of the data. This was seen to have negative effects
- in a couple of cases:
- Merging
- Sorting with a smaller internal buffer
- Note this option was seen to shorten the runtime for sort
- on a multicore system with lots of RAM and other processes
- competing for CPU. It could be argued that more explicit
- scheduling hints with 'nice' et. al. are more appropriate
- for this situation.
-
- POSIX_FADV_NOREUSE is a possibility as it could lower
- the priority of input data in the cache as sort will
- only need to process it once. However its functionality
- has changed over Linux kernel versions and as of 2.6.31
- it does nothing and thus we can't depend on what it might
- do in future.
-
- POSIX_FADV_DONTNEED is not appropriate for user specified
- input files, but for temp files we do want to drop the
- cache immediately after processing. This is done implicitly
- however when the files are unlinked. */
-
-static FILE *
-stream_open (char const *file, char const *how)
-{
- FILE *fp;
-
- if (*how == 'r')
- {
- if (STREQ (file, "-"))
- {
- have_read_stdin = true;
- fp = stdin;
- }
- else
- fp = fopen (file, how);
- fadvise (fp, FADVISE_SEQUENTIAL);
- }
- else if (*how == 'w')
- {
- if (file && ftruncate (STDOUT_FILENO, 0) != 0)
- die (SORT_FAILURE, errno, _("%s: error truncating"),
- quotef (file));
- fp = stdout;
- }
- else
- assert (!"unexpected mode passed to stream_open");
-
- return fp;
-}
-
-/* Same as stream_open, except always return a non-null value; die on
- failure. */
-
-static FILE *
-xfopen (char const *file, char const *how)
-{
- FILE *fp = stream_open (file, how);
- if (!fp)
- sort_die (_("open failed"), file);
- return fp;
-}
-
-/* Close FP, whose name is FILE, and report any errors. */
-
-static void
-xfclose (FILE *fp, char const *file)
-{
- switch (fileno (fp))
- {
- case STDIN_FILENO:
- /* Allow reading stdin from tty more than once. */
- if (feof (fp))
- clearerr (fp);
- break;
-
- case STDOUT_FILENO:
- /* Don't close stdout just yet. close_stdout does that. */
- if (fflush (fp) != 0)
- sort_die (_("fflush failed"), file);
- break;
-
- default:
- if (fclose (fp) != 0)
- sort_die (_("close failed"), file);
- break;
- }
-}
-
static void
move_fd_or_die (int oldfd, int newfd)
{
@@ -1398,110 +1270,6 @@ random_md5_state_init (char const *random_source)
md5_process_bytes (buf, sizeof buf, &random_md5_state);
}
-/* Return the printable width of the block of memory starting at
- TEXT and ending just before LIM, counting each tab as one byte.
- FIXME: Should we generally be counting non printable chars? */
-
-static size_t
-debug_width (char const *text, char const *lim)
-{
- size_t width = mbsnwidth (text, lim - text, 0);
- while (text < lim)
- width += (*text++ == '\t');
- return width;
-}
-
-/* For debug mode, "underline" a key at the
- specified offset and screen width. */
-
-static void
-mark_key (size_t offset, size_t width)
-{
- while (offset--)
- putchar (' ');
-
- if (!width)
- printf (_("^ no match for key\n"));
- else
- {
- do
- putchar ('_');
- while (--width);
-
- putchar ('\n');
- }
-}
-
-/* For LINE, output a debugging line that underlines KEY in LINE.
- If KEY is null, underline the whole line. */
-
-static void
-debug_key (struct line const *line, struct keyfield const *key)
-{
- char *text = line->text;
- char *beg = text;
- char *lim = text + line->length - 1;
-
- if (key)
- {
- if (key->sword != SIZE_MAX)
- beg = begfield (line, key);
- if (key->eword != SIZE_MAX)
- lim = limfield (line, key);
-
- if ((key->skipsblanks && key->sword == SIZE_MAX)
- || key->month || key_numeric (key))
- {
- char saved = *lim;
- *lim = '\0';
-
- while (blanks[to_uchar (*beg)])
- beg++;
-
- char *tighter_lim = beg;
-
- if (lim < beg)
- tighter_lim = lim;
- else if (key->month)
- getmonth (beg, &tighter_lim);
- else if (key->general_numeric)
- ignore_value (strtold (beg, &tighter_lim));
- else if (key->numeric || key->human_numeric)
- {
- char const *p = beg + (beg < lim && *beg == '-');
- unsigned char max_digit = traverse_raw_number (&p);
- if ('0' <= max_digit)
- {
- unsigned char ch = *p;
- tighter_lim = (char *) p
- + (key->human_numeric && unit_order[ch]);
- }
- }
- else
- tighter_lim = lim;
-
- *lim = saved;
- lim = tighter_lim;
- }
- }
-
- size_t offset = debug_width (text, beg);
- size_t width = debug_width (beg, lim);
- mark_key (offset, width);
-}
-
-/* Debug LINE by underlining its keys. */
-
-static void
-debug_line (struct line const *line)
-{
- struct keyfield const *key = keylist;
-
- do
- debug_key (line, key);
- while (key && ((key = key->next) || ! (unique || stable)));
-}
-
/* Return whether sorting options specified for key. */
static bool
@@ -1655,45 +1423,6 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
error (0, 0, _("option '-r' only applies to last-resort comparison"));
}
-/* Write LINE to output stream FP; the output file's name is
- OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
- otherwise. If debugging is enabled and FP is standard output,
- append some debugging information. */
-
-static void
-write_line (struct line const *line, FILE *fp, char const *output_file)
-{
- char *buf = line->text;
- size_t n_bytes = line->length;
- char *ebuf = buf + n_bytes;
-
- if (!output_file && debug)
- {
- /* Convert TAB to '>' and EOL to \n, and then output debugging info. */
- char const *c = buf;
-
- while (c < ebuf)
- {
- char wc = *c++;
- if (wc == '\t')
- wc = '>';
- else if (c == ebuf)
- wc = '\n';
- if (fputc (wc, fp) == EOF)
- sort_die (_("write failed"), output_file);
- }
-
- debug_line (line);
- }
- else
- {
- ebuf[-1] = eolchar;
- if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
- sort_die (_("write failed"), output_file);
- ebuf[-1] = '\0';
- }
-}
-
/* Check that the lines read from FILE_NAME come in order. Return
true if they are in order. If CHECKONLY == 'c', also print a
diagnostic (FILE_NAME, line number, contents of line) to stderr if
@@ -2912,8 +2641,6 @@ sort (char *const *files, size_t nfiles, char const *output_file,
reap_all ();
}
-/* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
-
/* Report incompatible options. */
static void incompatible_options (char const *) ATTRIBUTE_NORETURN;