summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/sort-uniq-keyfuncs.c1091
-rw-r--r--src/sort.c1089
2 files changed, 1094 insertions, 1086 deletions
diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c
new file mode 100644
index 000000000..196045bc0
--- /dev/null
+++ b/src/sort-uniq-keyfuncs.c
@@ -0,0 +1,1091 @@
+#include "filevercmp.h"
+#include "md5.h"
+#include "strnumcmp.h"
+#define UCHAR_LIM (UCHAR_MAX + 1)
+
+#if HAVE_C99_STRTOLD
+# define long_double long double
+#else
+# define long_double double
+# undef strtold
+# define strtold strtod
+#endif
+
+/* Exit statuses. */
+enum
+ {
+ /* POSIX says to exit with status 1 if invoked with -c and the
+ input is not properly sorted. */
+ SORT_OUT_OF_ORDER = 1,
+
+ /* POSIX says any other irregular exit must exit with a status
+ code greater than 1. */
+ SORT_FAILURE = 2
+ };
+
+/* The representation of the decimal point in the current locale. */
+static int decimal_point;
+
+/* Thousands separator; if -1, then there isn't one. */
+static int thousands_sep;
+
+/* Nonzero if the corresponding locales are hard. */
+static bool hard_LC_COLLATE;
+
+#define NONZERO(x) ((x) != 0)
+
+/* The kind of blanks for '-b' to skip in various options. */
+enum blanktype { bl_start, bl_end, bl_both };
+
+/* Lines are held in core as counted strings. */
+struct line
+{
+ char *text; /* Text of the line. */
+ size_t length; /* Length including final newline. */
+ char *keybeg; /* Start of first key. */
+ char *keylim; /* Limit of first key. */
+};
+
+/* Sort key. */
+struct keyfield
+{
+ size_t sword; /* Zero-origin 'word' to start at. */
+ size_t schar; /* Additional characters to skip. */
+ size_t eword; /* Zero-origin last 'word' of key. */
+ size_t echar; /* Additional characters in field. */
+ bool const *ignore; /* Boolean array of characters to ignore. */
+ char const *translate; /* Translation applied to characters. */
+ bool skipsblanks; /* Skip leading blanks when finding start. */
+ bool skipeblanks; /* Skip leading blanks when finding end. */
+ bool numeric; /* Flag for numeric comparison. Handle
+ strings of digits with optional decimal
+ point, but no exponential notation. */
+ bool random; /* Sort by random hash of key. */
+ bool general_numeric; /* Flag for general, numeric comparison.
+ Handle numbers in exponential notation. */
+ bool human_numeric; /* Flag for sorting by human readable
+ units with either SI or IEC prefixes. */
+ bool month; /* Flag for comparison by month name. */
+ bool reverse; /* Reverse the sense of comparison. */
+ bool version; /* sort by version number */
+ bool traditional_used; /* Traditional key option format is used. */
+ struct keyfield *next; /* Next keyfield to try. */
+};
+
+struct month
+{
+ char const *name;
+ int val;
+};
+
+/* FIXME: None of these tables work with multibyte character sets.
+ Also, there are many other bugs when handling multibyte characters.
+ One way to fix this is to rewrite 'sort' to use wide characters
+ internally, but doing this with good performance is a bit
+ tricky. */
+
+/* Table of blanks. */
+static bool blanks[UCHAR_LIM];
+
+/* Table of non-printing characters. */
+static bool nonprinting[UCHAR_LIM];
+
+/* Table of non-dictionary characters (not letters, digits, or blanks). */
+static bool nondictionary[UCHAR_LIM];
+
+/* Translation table folding lower case to upper. */
+static char fold_toupper[UCHAR_LIM];
+
+#define MONTHS_PER_YEAR 12
+
+/* Table mapping month names to integers.
+ Alphabetic order allows binary search. */
+static struct month monthtab[] =
+{
+ {"APR", 4},
+ {"AUG", 8},
+ {"DEC", 12},
+ {"FEB", 2},
+ {"JAN", 1},
+ {"JUL", 7},
+ {"JUN", 6},
+ {"MAR", 3},
+ {"MAY", 5},
+ {"NOV", 11},
+ {"OCT", 10},
+ {"SEP", 9}
+};
+
+/* Flag to reverse the order of all comparisons. */
+static bool reverse;
+
+/* Flag for stable sort. This turns off the last ditch bytewise
+ comparison of lines, and instead leaves lines in the same order
+ they were read if all keys compare equal. */
+static bool stable;
+
+/* If TAB has this value, blanks separate fields. */
+enum { TAB_DEFAULT = CHAR_MAX + 1 };
+
+/* Tab character separating fields. If TAB_DEFAULT, then fields are
+ separated by the empty string between a non-blank character and a blank
+ character. */
+static int tab = TAB_DEFAULT;
+
+/* Flag to remove consecutive duplicate lines from the output.
+ Only the last of a sequence of equal lines will be output. */
+static bool unique;
+
+/* List of key field comparisons to be tried. */
+static struct keyfield *keylist;
+
+/* Return a pointer to the first character of the field specified
+ by KEY in LINE. */
+
+static char *
+begfield (struct line const *line, struct keyfield const *key)
+{
+ char *ptr = line->text, *lim = ptr + line->length - 1;
+ size_t sword = key->sword;
+ size_t schar = key->schar;
+
+ /* The leading field separator itself is included in a field when -t
+ is absent. */
+
+ if (tab != TAB_DEFAULT)
+ while (ptr < lim && sword--)
+ {
+ while (ptr < lim && *ptr != tab)
+ ++ptr;
+ if (ptr < lim)
+ ++ptr;
+ }
+ else
+ while (ptr < lim && sword--)
+ {
+ while (ptr < lim && blanks[to_uchar (*ptr)])
+ ++ptr;
+ while (ptr < lim && !blanks[to_uchar (*ptr)])
+ ++ptr;
+ }
+
+ /* If we're ignoring leading blanks when computing the Start
+ of the field, skip past them here. */
+ if (key->skipsblanks)
+ while (ptr < lim && blanks[to_uchar (*ptr)])
+ ++ptr;
+
+ /* Advance PTR by SCHAR (if possible), but no further than LIM. */
+ ptr = MIN (lim, ptr + schar);
+
+ return ptr;
+}
+
+/* Return the limit of (a pointer to the first character after) the field
+ in LINE specified by KEY. */
+
+static char *
+limfield (struct line const *line, struct keyfield const *key)
+{
+ char *ptr = line->text, *lim = ptr + line->length - 1;
+ size_t eword = key->eword, echar = key->echar;
+
+ if (echar == 0)
+ eword++; /* Skip all of end field. */
+
+ /* Move PTR past EWORD fields or to one past the last byte on LINE,
+ whichever comes first. If there are more than EWORD fields, leave
+ PTR pointing at the beginning of the field having zero-based index,
+ EWORD. If a delimiter character was specified (via -t), then that
+ 'beginning' is the first character following the delimiting TAB.
+ Otherwise, leave PTR pointing at the first 'blank' character after
+ the preceding field. */
+ if (tab != TAB_DEFAULT)
+ while (ptr < lim && eword--)
+ {
+ while (ptr < lim && *ptr != tab)
+ ++ptr;
+ if (ptr < lim && (eword || echar))
+ ++ptr;
+ }
+ else
+ while (ptr < lim && eword--)
+ {
+ while (ptr < lim && blanks[to_uchar (*ptr)])
+ ++ptr;
+ while (ptr < lim && !blanks[to_uchar (*ptr)])
+ ++ptr;
+ }
+
+#ifdef POSIX_UNSPECIFIED
+ /* The following block of code makes GNU sort incompatible with
+ standard Unix sort, so it's ifdef'd out for now.
+ The POSIX spec isn't clear on how to interpret this.
+ FIXME: request clarification.
+
+ From: kwzh@gnu.ai.mit.edu (Karl Heuer)
+ Date: Thu, 30 May 96 12:20:41 -0400
+ [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
+
+ [...]I believe I've found another bug in 'sort'.
+
+ $ cat /tmp/sort.in
+ a b c 2 d
+ pq rs 1 t
+ $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
+ a b c 2 d
+ pq rs 1 t
+ $ /bin/sort -k1.7,1.7 </tmp/sort.in
+ pq rs 1 t
+ a b c 2 d
+
+ Unix sort produced the answer I expected: sort on the single character
+ in column 7. GNU sort produced different results, because it disagrees
+ on the interpretation of the key-end spec "M.N". Unix sort reads this
+ as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
+ "skip M-1 fields, then either N-1 characters or the rest of the current
+ field, whichever comes first". This extra clause applies only to
+ key-ends, not key-starts.
+ */
+
+ /* Make LIM point to the end of (one byte past) the current field. */
+ if (tab != TAB_DEFAULT)
+ {
+ char *newlim;
+ newlim = memchr (ptr, tab, lim - ptr);
+ if (newlim)
+ lim = newlim;
+ }
+ else
+ {
+ char *newlim;
+ newlim = ptr;
+ while (newlim < lim && blanks[to_uchar (*newlim)])
+ ++newlim;
+ while (newlim < lim && !blanks[to_uchar (*newlim)])
+ ++newlim;
+ lim = newlim;
+ }
+#endif
+
+ if (echar != 0) /* We need to skip over a portion of the end field. */
+ {
+ /* If we're ignoring leading blanks when computing the End
+ of the field, skip past them here. */
+ if (key->skipeblanks)
+ while (ptr < lim && blanks[to_uchar (*ptr)])
+ ++ptr;
+
+ /* Advance PTR by ECHAR (if possible), but no further than LIM. */
+ ptr = MIN (lim, ptr + echar);
+ }
+
+ return ptr;
+}
+
+/* Table that maps characters to order-of-magnitude values. */
+static char const unit_order[UCHAR_LIM] =
+ {
+#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
+ && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
+ /* This initializer syntax works on all C99 hosts. For now, use
+ it only on non-ASCII hosts, to ease the pain of porting to
+ pre-C99 ASCII hosts. */
+ ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+ ['k']=1,
+#else
+ /* Generate the following table with this command:
+ perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
+ foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
+ |fmt */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
+ 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+#endif
+ };
+
+/* Traverse number given as *number consisting of digits, thousands_sep, and
+ decimal_point chars only. Returns the highest digit found in the number,
+ or '\0' if no digit has been found. Upon return *number points at the
+ character that immediately follows after the given number. */
+static unsigned char
+traverse_raw_number (char const **number)
+{
+ char const *p = *number;
+ unsigned char ch;
+ unsigned char max_digit = '\0';
+ bool ends_with_thousands_sep = false;
+
+ /* Scan to end of number.
+ Decimals or separators not followed by digits stop the scan.
+ Numbers ending in decimals or separators are thus considered
+ to be lacking in units.
+ FIXME: add support for multibyte thousands_sep and decimal_point. */
+
+ while (ISDIGIT (ch = *p++))
+ {
+ if (max_digit < ch)
+ max_digit = ch;
+
+ /* Allow to skip only one occurrence of thousands_sep to avoid finding
+ the unit in the next column in case thousands_sep matches as blank
+ and is used as column delimiter. */
+ ends_with_thousands_sep = (*p == thousands_sep);
+ if (ends_with_thousands_sep)
+ ++p;
+ }
+
+ if (ends_with_thousands_sep)
+ {
+ /* thousands_sep not followed by digit is not allowed. */
+ *number = p - 2;
+ return max_digit;
+ }
+
+ if (ch == decimal_point)
+ while (ISDIGIT (ch = *p++))
+ if (max_digit < ch)
+ max_digit = ch;
+
+ *number = p - 1;
+ return max_digit;
+}
+
+/* Return an integer that represents the order of magnitude of the
+ unit following the number. The number may contain thousands
+ separators and a decimal point, but it may not contain leading blanks.
+ Negative numbers get negative orders; zero numbers have a zero order. */
+
+static int _GL_ATTRIBUTE_PURE
+find_unit_order (char const *number)
+{
+ bool minus_sign = (*number == '-');
+ char const *p = number + minus_sign;
+ unsigned char max_digit = traverse_raw_number (&p);
+ if ('0' < max_digit)
+ {
+ unsigned char ch = *p;
+ int order = unit_order[ch];
+ return (minus_sign ? -order : order);
+ }
+ else
+ return 0;
+}
+
+/* Compare numbers A and B ending in units with SI or IEC prefixes
+ <none/unknown> < K/k < M < G < T < P < E < Z < Y */
+
+static int
+human_numcompare (char const *a, char const *b)
+{
+ while (blanks[to_uchar (*a)])
+ a++;
+ while (blanks[to_uchar (*b)])
+ b++;
+
+ int diff = find_unit_order (a) - find_unit_order (b);
+ return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
+}
+
+/* Compare strings A and B as numbers without explicitly converting them to
+ machine numbers. Comparatively slow for short strings, but asymptotically
+ hideously fast. */
+
+static int
+numcompare (char const *a, char const *b)
+{
+ while (blanks[to_uchar (*a)])
+ a++;
+ while (blanks[to_uchar (*b)])
+ b++;
+
+ return strnumcmp (a, b, decimal_point, thousands_sep);
+}
+
+/* Work around a problem whereby the long double value returned by glibc's
+ strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
+ A and B before calling strtold. FIXME: remove this function once
+ gnulib guarantees that strtold's result is always well defined. */
+static int
+nan_compare (char const *sa, char const *sb)
+{
+ long_double a;
+ memset (&a, 0, sizeof a);
+ a = strtold (sa, NULL);
+
+ long_double b;
+ memset (&b, 0, sizeof b);
+ b = strtold (sb, NULL);
+
+ return memcmp (&a, &b, sizeof a);
+}
+
+static int
+general_numcompare (char const *sa, char const *sb)
+{
+ /* FIXME: maybe add option to try expensive FP conversion
+ only if A and B can't be compared more cheaply/accurately. */
+
+ char *ea;
+ char *eb;
+ long_double a = strtold (sa, &ea);
+ long_double b = strtold (sb, &eb);
+
+ /* Put conversion errors at the start of the collating sequence. */
+ if (sa == ea)
+ return sb == eb ? 0 : -1;
+ if (sb == eb)
+ return 1;
+
+ /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
+ conversion errors but before numbers; sort them by internal
+ bit-pattern, for lack of a more portable alternative. */
+ return (a < b ? -1
+ : a > b ? 1
+ : a == b ? 0
+ : b == b ? -1
+ : a == a ? 1
+ : nan_compare (sa, sb));
+}
+
+/* Return an integer in 1..12 of the month name MONTH.
+ Return 0 if the name in S is not recognized. */
+
+static int
+getmonth (char const *month, char **ea)
+{
+ size_t lo = 0;
+ size_t hi = MONTHS_PER_YEAR;
+
+ while (blanks[to_uchar (*month)])
+ month++;
+
+ do
+ {
+ size_t ix = (lo + hi) / 2;
+ char const *m = month;
+ char const *n = monthtab[ix].name;
+
+ for (;; m++, n++)
+ {
+ if (!*n)
+ {
+ if (ea)
+ *ea = (char *) m;
+ return monthtab[ix].val;
+ }
+ if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
+ {
+ hi = ix;
+ break;
+ }
+ else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
+ {
+ lo = ix + 1;
+ break;
+ }
+ }
+ }
+ while (lo < hi);
+
+ return 0;
+}
+
+/* A randomly chosen MD5 state, used for random comparison. */
+static struct md5_ctx random_md5_state;
+
+/* This is like strxfrm, except it reports any error and exits. */
+
+static size_t
+xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
+{
+ errno = 0;
+ size_t translated_size = strxfrm (dest, src, destsize);
+
+ if (errno)
+ {
+ error (0, errno, _("string transformation failed"));
+ error (0, 0, _("set LC_ALL='C' to work around the problem"));
+ die (SORT_FAILURE, 0,
+ _("the untransformed string was %s"),
+ quotearg_n_style (0, locale_quoting_style, src));
+ }
+
+ return translated_size;
+}
+
+/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
+ using one or more random hash functions. TEXTA[LENA] and
+ TEXTB[LENB] must be zero. */
+
+static int
+compare_random (char *restrict texta, size_t lena,
+ char *restrict textb, size_t lenb)
+{
+ /* XFRM_DIFF records the equivalent of memcmp on the transformed
+ data. This is used to break ties if there is a checksum
+ collision, and this is good enough given the astronomically low
+ probability of a collision. */
+ int xfrm_diff = 0;
+
+ char stackbuf[4000];
+ char *buf = stackbuf;
+ size_t bufsize = sizeof stackbuf;
+ void *allocated = NULL;
+ uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
+ struct md5_ctx s[2];
+ s[0] = s[1] = random_md5_state;
+
+ if (hard_LC_COLLATE)
+ {
+ char const *lima = texta + lena;
+ char const *limb = textb + lenb;
+
+ while (true)
+ {
+ /* Transform the text into the basis of comparison, so that byte
+ strings that would otherwise considered to be equal are
+ considered equal here even if their bytes differ.
+
+ Each time through this loop, transform one
+ null-terminated string's worth from TEXTA or from TEXTB
+ or both. That way, there's no need to store the
+ transformation of the whole line, if it contains many
+ null-terminated strings. */
+
+ /* Store the transformed data into a big-enough buffer. */
+
+ /* A 3X size guess avoids the overhead of calling strxfrm
+ twice on typical implementations. Don't worry about
+ size_t overflow, as the guess need not be correct. */
+ size_t guess_bufsize = 3 * (lena + lenb) + 2;
+ if (bufsize < guess_bufsize)
+ {
+ bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
+ free (allocated);
+ buf = allocated = malloc (bufsize);
+ if (! buf)
+ {
+ buf = stackbuf;
+ bufsize = sizeof stackbuf;
+ }
+ }
+
+ size_t sizea =
+ (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
+ bool a_fits = sizea <= bufsize;
+ size_t sizeb =
+ (textb < limb
+ ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
+ (a_fits ? bufsize - sizea : 0))
+ + 1)
+ : 0);
+
+ if (! (a_fits && sizea + sizeb <= bufsize))
+ {
+ bufsize = sizea + sizeb;
+ if (bufsize < SIZE_MAX / 3)
+ bufsize = bufsize * 3 / 2;
+ free (allocated);
+ buf = allocated = xmalloc (bufsize);
+ if (texta < lima)
+ strxfrm (buf, texta, sizea);
+ if (textb < limb)
+ strxfrm (buf + sizea, textb, sizeb);
+ }
+
+ /* Advance past NULs to the next part of each input string,
+ exiting the loop if both strings are exhausted. When
+ exiting the loop, prepare to finish off the tiebreaker
+ comparison properly. */
+ if (texta < lima)
+ texta += strlen (texta) + 1;
+ if (textb < limb)
+ textb += strlen (textb) + 1;
+ if (! (texta < lima || textb < limb))
+ {
+ lena = sizea; texta = buf;
+ lenb = sizeb; textb = buf + sizea;
+ break;
+ }
+
+ /* Accumulate the transformed data in the corresponding
+ checksums. */
+ md5_process_bytes (buf, sizea, &s[0]);
+ md5_process_bytes (buf + sizea, sizeb, &s[1]);
+
+ /* Update the tiebreaker comparison of the transformed data. */
+ if (! xfrm_diff)
+ {
+ xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
+ if (! xfrm_diff)
+ xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
+ }
+ }
+ }
+
+ /* Compute and compare the checksums. */
+ md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
+ md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
+ int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
+
+ /* Fall back on the tiebreaker if the checksums collide. */
+ if (! diff)
+ {
+ if (! xfrm_diff)
+ {
+ xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
+ if (! xfrm_diff)
+ xfrm_diff = (lena > lenb) - (lena < lenb);
+ }
+
+ diff = xfrm_diff;
+ }
+
+ free (allocated);
+
+ return diff;
+}
+
+/* Return true if KEY is a numeric key. */
+
+static inline bool
+key_numeric (struct keyfield const *key)
+{
+ return key->numeric || key->general_numeric || key->human_numeric;
+}
+
+/* Compare two lines A and B trying every key in sequence until there
+ are no more keys or a difference is found. */
+
+static int
+keycompare (struct line const *a, struct line const *b)
+{
+ struct keyfield *key = keylist;
+
+ /* For the first iteration only, the key positions have been
+ precomputed for us. */
+ char *texta = a->keybeg;
+ char *textb = b->keybeg;
+ char *lima = a->keylim;
+ char *limb = b->keylim;
+
+ int diff;
+
+ while (true)
+ {
+ char const *translate = key->translate;
+ bool const *ignore = key->ignore;
+
+ /* Treat field ends before field starts as empty fields. */
+ lima = MAX (texta, lima);
+ limb = MAX (textb, limb);
+
+ /* Find the lengths. */
+ size_t lena = lima - texta;
+ size_t lenb = limb - textb;
+
+ if (hard_LC_COLLATE || key_numeric (key)
+ || key->month || key->random || key->version)
+ {
+ char *ta;
+ char *tb;
+ size_t tlena;
+ size_t tlenb;
+
+ char enda IF_LINT (= 0);
+ char endb IF_LINT (= 0);
+ void *allocated IF_LINT (= NULL);
+ char stackbuf[4000];
+
+ if (ignore || translate)
+ {
+ /* Compute with copies of the keys, which are the result of
+ translating or ignoring characters, and which need their
+ own storage. */
+
+ size_t i;
+
+ /* Allocate space for copies. */
+ size_t size = lena + 1 + lenb + 1;
+ if (size <= sizeof stackbuf)
+ ta = stackbuf, allocated = NULL;
+ else
+ ta = allocated = xmalloc (size);
+ tb = ta + lena + 1;
+
+ /* Put into each copy a version of the key in which the
+ requested characters are ignored or translated. */
+ for (tlena = i = 0; i < lena; i++)
+ if (! (ignore && ignore[to_uchar (texta[i])]))
+ ta[tlena++] = (translate
+ ? translate[to_uchar (texta[i])]
+ : texta[i]);
+ ta[tlena] = '\0';
+
+ for (tlenb = i = 0; i < lenb; i++)
+ if (! (ignore && ignore[to_uchar (textb[i])]))
+ tb[tlenb++] = (translate
+ ? translate[to_uchar (textb[i])]
+ : textb[i]);
+ tb[tlenb] = '\0';
+ }
+ else
+ {
+ /* Use the keys in-place, temporarily null-terminated. */
+ ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
+ tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
+ }
+
+ if (key->numeric)
+ diff = numcompare (ta, tb);
+ else if (key->general_numeric)
+ diff = general_numcompare (ta, tb);
+ else if (key->human_numeric)
+ diff = human_numcompare (ta, tb);
+ else if (key->month)
+ diff = getmonth (ta, NULL) - getmonth (tb, NULL);
+ else if (key->random)
+ diff = compare_random (ta, tlena, tb, tlenb);
+ else if (key->version)
+ diff = filevercmp (ta, tb);
+ else
+ {
+ /* Locale-dependent string sorting. This is slower than
+ C-locale sorting, which is implemented below. */
+ if (tlena == 0)
+ diff = - NONZERO (tlenb);
+ else if (tlenb == 0)
+ diff = 1;
+ else
+ diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
+ }
+
+ if (ignore || translate)
+ free (allocated);
+ else
+ {
+ ta[tlena] = enda;
+ tb[tlenb] = endb;
+ }
+ }
+ else if (ignore)
+ {
+#define CMP_WITH_IGNORE(A, B) \
+ do \
+ { \
+ while (true) \
+ { \
+ while (texta < lima && ignore[to_uchar (*texta)]) \
+ ++texta; \
+ while (textb < limb && ignore[to_uchar (*textb)]) \
+ ++textb; \
+ if (! (texta < lima && textb < limb)) \
+ break; \
+ diff = to_uchar (A) - to_uchar (B); \
+ if (diff) \
+ goto not_equal; \
+ ++texta; \
+ ++textb; \
+ } \
+ \
+ diff = (texta < lima) - (textb < limb); \
+ } \
+ while (0)
+
+ if (translate)
+ CMP_WITH_IGNORE (translate[to_uchar (*texta)],
+ translate[to_uchar (*textb)]);
+ else
+ CMP_WITH_IGNORE (*texta, *textb);
+ }
+ else if (lena == 0)
+ diff = - NONZERO (lenb);
+ else if (lenb == 0)
+ goto greater;
+ else
+ {
+ if (translate)
+ {
+ while (texta < lima && textb < limb)
+ {
+ diff = (to_uchar (translate[to_uchar (*texta++)])
+ - to_uchar (translate[to_uchar (*textb++)]));
+ if (diff)
+ goto not_equal;
+ }
+ }
+ else
+ {
+ diff = memcmp (texta, textb, MIN (lena, lenb));
+ if (diff)
+ goto not_equal;
+ }
+ diff = lena < lenb ? -1 : lena != lenb;
+ }
+
+ if (diff)
+ goto not_equal;
+
+ key = key->next;
+ if (! key)
+ break;
+
+ /* Find the beginning and limit of the next field. */
+ if (key->eword != SIZE_MAX)
+ lima = limfield (a, key), limb = limfield (b, key);
+ else
+ lima = a->text + a->length - 1, limb = b->text + b->length - 1;
+
+ if (key->sword != SIZE_MAX)
+ texta = begfield (a, key), textb = begfield (b, key);
+ else
+ {
+ texta = a->text, textb = b->text;
+ if (key->skipsblanks)
+ {
+ while (texta < lima && blanks[to_uchar (*texta)])
+ ++texta;
+ while (textb < limb && blanks[to_uchar (*textb)])
+ ++textb;
+ }
+ }
+ }
+
+ return 0;
+
+ greater:
+ diff = 1;
+ not_equal:
+ return key->reverse ? -diff : diff;
+}
+
+/* Compare two lines A and B, returning negative, zero, or positive
+ depending on whether A compares less than, equal to, or greater than B. */
+
+static int
+compare (struct line const *a, struct line const *b)
+{
+ int diff;
+ size_t alen, blen;
+
+ /* First try to compare on the specified keys (if any).
+ The only two cases with no key at all are unadorned sort,
+ and unadorned sort -r. */
+ if (keylist)
+ {
+ diff = keycompare (a, b);
+ if (diff || unique || stable)
+ return diff;
+ }
+
+ /* If the keys all compare equal (or no keys were specified)
+ fall through to the default comparison. */
+ alen = a->length - 1, blen = b->length - 1;
+
+ if (alen == 0)
+ diff = - NONZERO (blen);
+ else if (blen == 0)
+ diff = 1;
+ else if (hard_LC_COLLATE)
+ {
+ /* Note xmemcoll0 is a performance enhancement as
+ it will not unconditionally write '\0' after the
+ passed in buffers, which was seen to give around
+ a 3% increase in performance for short lines. */
+ diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
+ }
+ else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
+ diff = alen < blen ? -1 : alen != blen;
+
+ return reverse ? -diff : diff;
+}
+
+static void
+insertkey (struct keyfield *key_arg)
+{
+ struct keyfield **p;
+ struct keyfield *key = xmemdup (key_arg, sizeof *key);
+
+ for (p = &keylist; *p; p = &(*p)->next)
+ continue;
+ *p = key;
+ key->next = NULL;
+}
+
+/* Report a bad field specification SPEC, with extra info MSGID. */
+
+static void badfieldspec (char const *, char const *)
+ ATTRIBUTE_NORETURN;
+static void
+badfieldspec (char const *spec, char const *msgid)
+{
+ die (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
+ _(msgid), quote (spec));
+}
+
+/* Parse the leading integer in STRING and store the resulting value
+ (which must fit into size_t) into *VAL. Return the address of the
+ suffix after the integer. If the value is too large, silently
+ substitute SIZE_MAX. If MSGID is NULL, return NULL after
+ failure; otherwise, report MSGID and exit on failure. */
+
+static char const *
+parse_field_count (char const *string, size_t *val, char const *msgid)
+{
+ char *suffix;
+ uintmax_t n;
+
+ switch (xstrtoumax (string, &suffix, 10, &n, ""))
+ {
+ case LONGINT_OK:
+ case LONGINT_INVALID_SUFFIX_CHAR:
+ *val = n;
+ if (*val == n)
+ break;
+ /* Fall through. */
+ case LONGINT_OVERFLOW:
+ case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
+ *val = SIZE_MAX;
+ break;
+
+ case LONGINT_INVALID:
+ if (msgid)
+ die (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
+ _(msgid), quote (string));
+ return NULL;
+ }
+
+ return suffix;
+}
+
+/* Set the ordering options for KEY specified in S.
+ Return the address of the first character in S that
+ is not a valid ordering option.
+ BLANKTYPE is the kind of blanks that 'b' should skip. */
+
+static char *
+set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
+{
+ while (*s)
+ {
+ switch (*s)
+ {
+ case 'b':
+ if (blanktype == bl_start || blanktype == bl_both)
+ key->skipsblanks = true;
+ if (blanktype == bl_end || blanktype == bl_both)
+ key->skipeblanks = true;
+ break;
+ case 'd':
+ key->ignore = nondictionary;
+ break;
+ case 'f':
+ key->translate = fold_toupper;
+ break;
+ case 'g':
+ key->general_numeric = true;
+ break;
+ case 'h':
+ key->human_numeric = true;
+ break;
+ case 'i':
+ /* Option order should not matter, so don't let -i override
+ -d. -d implies -i, but -i does not imply -d. */
+ if (! key->ignore)
+ key->ignore = nonprinting;
+ break;
+ case 'M':
+ key->month = true;
+ break;
+ case 'n':
+ key->numeric = true;
+ break;
+ case 'R':
+ key->random = true;
+ break;
+ case 'r':
+ key->reverse = true;
+ break;
+ case 'V':
+ key->version = true;
+ break;
+ default:
+ return (char *) s;
+ }
+ ++s;
+ }
+ return (char *) s;
+}
+
+/* Initialize KEY. */
+
+static struct keyfield *
+key_init (struct keyfield *key)
+{
+ memset (key, 0, sizeof *key);
+ key->eword = SIZE_MAX;
+ return key;
+}
+
+static void add_key(void)
+{
+ struct keyfield *key;
+ struct keyfield key_buf; char const *s;
+ key = key_init (&key_buf);
+
+ /* Get POS1. */
+ s = parse_field_count (optarg, &key->sword,
+ N_("invalid number at field start"));
+ if (! key->sword--)
+ {
+ /* Provoke with 'sort -k0' */
+ badfieldspec (optarg, N_("field number is zero"));
+ }
+ if (*s == '.')
+ {
+ s = parse_field_count (s + 1, &key->schar,
+ N_("invalid number after '.'"));
+ if (! key->schar--)
+ {
+ /* Provoke with 'sort -k1.0' */
+ badfieldspec (optarg, N_("character offset is zero"));
+ }
+ }
+ if (! (key->sword || key->schar))
+ key->sword = SIZE_MAX;
+ s = set_ordering (s, key, bl_start);
+ if (*s != ',')
+ {
+ key->eword = SIZE_MAX;
+ key->echar = 0;
+ }
+ else
+ {
+ /* Get POS2. */
+ s = parse_field_count (s + 1, &key->eword,
+ N_("invalid number after ','"));
+ if (! key->eword--)
+ {
+ /* Provoke with 'sort -k1,0' */
+ badfieldspec (optarg, N_("field number is zero"));
+ }
+ if (*s == '.')
+ {
+ s = parse_field_count (s + 1, &key->echar,
+ N_("invalid number after '.'"));
+ }
+ s = set_ordering (s, key, bl_end);
+ }
+ if (*s)
+ badfieldspec (optarg, N_("stray character in field spec"));
+ insertkey (key);
+}
diff --git a/src/sort.c b/src/sort.c
index 94315d2a1..6f28b63ae 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -34,13 +34,11 @@
#include "die.h"
#include "error.h"
#include "fadvise.h"
-#include "filevercmp.h"
#include "flexmember.h"
#include "hard-locale.h"
#include "hash.h"
#include "heap.h"
#include "ignore-value.h"
-#include "md5.h"
#include "mbswidth.h"
#include "nproc.h"
#include "physmem.h"
@@ -50,11 +48,12 @@
#include "readtokens0.h"
#include "stdio--.h"
#include "stdlib--.h"
-#include "strnumcmp.h"
#include "xmemcoll.h"
#include "xnanosleep.h"
#include "xstrtol.h"
+#include "sort-uniq-keyfuncs.c"
+
#ifndef RLIMIT_DATA
struct rlimit { size_t rlim_cur; };
# define getrlimit(Resource, Rlp) (-1)
@@ -90,16 +89,6 @@ struct rlimit { size_t rlim_cur; };
# define OPEN_MAX 20
#endif
-#define UCHAR_LIM (UCHAR_MAX + 1)
-
-#if HAVE_C99_STRTOLD
-# define long_double long double
-#else
-# define long_double double
-# undef strtold
-# define strtold strtod
-#endif
-
#ifndef DEFAULT_TMPDIR
# define DEFAULT_TMPDIR "/tmp"
#endif
@@ -124,18 +113,6 @@ verify (4 <= SUBTHREAD_LINES_HEURISTIC);
diminishing performance gains. */
enum { DEFAULT_MAX_THREADS = 8 };
-/* Exit statuses. */
-enum
- {
- /* POSIX says to exit with status 1 if invoked with -c and the
- input is not properly sorted. */
- SORT_OUT_OF_ORDER = 1,
-
- /* POSIX says any other irregular exit must exit with a status
- code greater than 1. */
- SORT_FAILURE = 2
- };
-
enum
{
/* The number of times we should try to fork a compression process
@@ -159,35 +136,12 @@ enum
MERGE_ROOT = 1
};
-/* The representation of the decimal point in the current locale. */
-static int decimal_point;
-
-/* Thousands separator; if -1, then there isn't one. */
-static int thousands_sep;
-
-/* Nonzero if the corresponding locales are hard. */
-static bool hard_LC_COLLATE;
#if HAVE_NL_LANGINFO
static bool hard_LC_TIME;
#endif
-
-#define NONZERO(x) ((x) != 0)
-
-/* The kind of blanks for '-b' to skip in various options. */
-enum blanktype { bl_start, bl_end, bl_both };
-
/* The character marking end of line. Default to \n. */
static char eolchar = '\n';
-/* Lines are held in core as counted strings. */
-struct line
-{
- char *text; /* Text of the line. */
- size_t length; /* Length including final newline. */
- char *keybeg; /* Start of first key. */
- char *keylim; /* Limit of first key. */
-};
-
/* Input buffers. */
struct buffer
{
@@ -204,38 +158,6 @@ struct buffer
bool eof; /* An EOF has been read. */
};
-/* Sort key. */
-struct keyfield
-{
- size_t sword; /* Zero-origin 'word' to start at. */
- size_t schar; /* Additional characters to skip. */
- size_t eword; /* Zero-origin last 'word' of key. */
- size_t echar; /* Additional characters in field. */
- bool const *ignore; /* Boolean array of characters to ignore. */
- char const *translate; /* Translation applied to characters. */
- bool skipsblanks; /* Skip leading blanks when finding start. */
- bool skipeblanks; /* Skip leading blanks when finding end. */
- bool numeric; /* Flag for numeric comparison. Handle
- strings of digits with optional decimal
- point, but no exponential notation. */
- bool random; /* Sort by random hash of key. */
- bool general_numeric; /* Flag for general, numeric comparison.
- Handle numbers in exponential notation. */
- bool human_numeric; /* Flag for sorting by human readable
- units with either SI or IEC prefixes. */
- bool month; /* Flag for comparison by month name. */
- bool reverse; /* Reverse the sense of comparison. */
- bool version; /* sort by version number */
- bool traditional_used; /* Traditional key option format is used. */
- struct keyfield *next; /* Next keyfield to try. */
-};
-
-struct month
-{
- char const *name;
- int val;
-};
-
/* Binary merge tree node. */
struct merge_node
{
@@ -266,44 +188,6 @@ struct merge_node_queue
/* Used to implement --unique (-u). */
static struct line saved_line;
-/* FIXME: None of these tables work with multibyte character sets.
- Also, there are many other bugs when handling multibyte characters.
- One way to fix this is to rewrite 'sort' to use wide characters
- internally, but doing this with good performance is a bit
- tricky. */
-
-/* Table of blanks. */
-static bool blanks[UCHAR_LIM];
-
-/* Table of non-printing characters. */
-static bool nonprinting[UCHAR_LIM];
-
-/* Table of non-dictionary characters (not letters, digits, or blanks). */
-static bool nondictionary[UCHAR_LIM];
-
-/* Translation table folding lower case to upper. */
-static char fold_toupper[UCHAR_LIM];
-
-#define MONTHS_PER_YEAR 12
-
-/* Table mapping month names to integers.
- Alphabetic order allows binary search. */
-static struct month monthtab[] =
-{
- {"APR", 4},
- {"AUG", 8},
- {"DEC", 12},
- {"FEB", 2},
- {"JAN", 1},
- {"JUL", 7},
- {"JUN", 6},
- {"MAR", 3},
- {"MAY", 5},
- {"NOV", 11},
- {"OCT", 10},
- {"SEP", 9}
-};
-
/* During the merge phase, the number of files to merge at once. */
#define NMERGE_DEFAULT 16
@@ -338,32 +222,9 @@ static size_t temp_dir_count;
/* Number of allocated slots in temp_dirs. */
static size_t temp_dir_alloc;
-/* Flag to reverse the order of all comparisons. */
-static bool reverse;
-
-/* Flag for stable sort. This turns off the last ditch bytewise
- comparison of lines, and instead leaves lines in the same order
- they were read if all keys compare equal. */
-static bool stable;
-
-/* If TAB has this value, blanks separate fields. */
-enum { TAB_DEFAULT = CHAR_MAX + 1 };
-
-/* Tab character separating fields. If TAB_DEFAULT, then fields are
- separated by the empty string between a non-blank character and a blank
- character. */
-static int tab = TAB_DEFAULT;
-
-/* Flag to remove consecutive duplicate lines from the output.
- Only the last of a sequence of equal lines will be output. */
-static bool unique;
-
/* Nonzero if any of the input files are the standard input. */
static bool have_read_stdin;
-/* List of key field comparisons to be tried. */
-static struct keyfield *keylist;
-
/* Program used to (de)compress temp files. Must accept -d. */
static char const *compress_program;
@@ -1593,150 +1454,6 @@ buffer_linelim (struct buffer const *buf)
return linelim;
}
-/* Return a pointer to the first character of the field specified
- by KEY in LINE. */
-
-static char *
-begfield (struct line const *line, struct keyfield const *key)
-{
- char *ptr = line->text, *lim = ptr + line->length - 1;
- size_t sword = key->sword;
- size_t schar = key->schar;
-
- /* The leading field separator itself is included in a field when -t
- is absent. */
-
- if (tab != TAB_DEFAULT)
- while (ptr < lim && sword--)
- {
- while (ptr < lim && *ptr != tab)
- ++ptr;
- if (ptr < lim)
- ++ptr;
- }
- else
- while (ptr < lim && sword--)
- {
- while (ptr < lim && blanks[to_uchar (*ptr)])
- ++ptr;
- while (ptr < lim && !blanks[to_uchar (*ptr)])
- ++ptr;
- }
-
- /* If we're ignoring leading blanks when computing the Start
- of the field, skip past them here. */
- if (key->skipsblanks)
- while (ptr < lim && blanks[to_uchar (*ptr)])
- ++ptr;
-
- /* Advance PTR by SCHAR (if possible), but no further than LIM. */
- ptr = MIN (lim, ptr + schar);
-
- return ptr;
-}
-
-/* Return the limit of (a pointer to the first character after) the field
- in LINE specified by KEY. */
-
-static char *
-limfield (struct line const *line, struct keyfield const *key)
-{
- char *ptr = line->text, *lim = ptr + line->length - 1;
- size_t eword = key->eword, echar = key->echar;
-
- if (echar == 0)
- eword++; /* Skip all of end field. */
-
- /* Move PTR past EWORD fields or to one past the last byte on LINE,
- whichever comes first. If there are more than EWORD fields, leave
- PTR pointing at the beginning of the field having zero-based index,
- EWORD. If a delimiter character was specified (via -t), then that
- 'beginning' is the first character following the delimiting TAB.
- Otherwise, leave PTR pointing at the first 'blank' character after
- the preceding field. */
- if (tab != TAB_DEFAULT)
- while (ptr < lim && eword--)
- {
- while (ptr < lim && *ptr != tab)
- ++ptr;
- if (ptr < lim && (eword || echar))
- ++ptr;
- }
- else
- while (ptr < lim && eword--)
- {
- while (ptr < lim && blanks[to_uchar (*ptr)])
- ++ptr;
- while (ptr < lim && !blanks[to_uchar (*ptr)])
- ++ptr;
- }
-
-#ifdef POSIX_UNSPECIFIED
- /* The following block of code makes GNU sort incompatible with
- standard Unix sort, so it's ifdef'd out for now.
- The POSIX spec isn't clear on how to interpret this.
- FIXME: request clarification.
-
- From: kwzh@gnu.ai.mit.edu (Karl Heuer)
- Date: Thu, 30 May 96 12:20:41 -0400
- [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]
-
- [...]I believe I've found another bug in 'sort'.
-
- $ cat /tmp/sort.in
- a b c 2 d
- pq rs 1 t
- $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
- a b c 2 d
- pq rs 1 t
- $ /bin/sort -k1.7,1.7 </tmp/sort.in
- pq rs 1 t
- a b c 2 d
-
- Unix sort produced the answer I expected: sort on the single character
- in column 7. GNU sort produced different results, because it disagrees
- on the interpretation of the key-end spec "M.N". Unix sort reads this
- as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
- "skip M-1 fields, then either N-1 characters or the rest of the current
- field, whichever comes first". This extra clause applies only to
- key-ends, not key-starts.
- */
-
- /* Make LIM point to the end of (one byte past) the current field. */
- if (tab != TAB_DEFAULT)
- {
- char *newlim;
- newlim = memchr (ptr, tab, lim - ptr);
- if (newlim)
- lim = newlim;
- }
- else
- {
- char *newlim;
- newlim = ptr;
- while (newlim < lim && blanks[to_uchar (*newlim)])
- ++newlim;
- while (newlim < lim && !blanks[to_uchar (*newlim)])
- ++newlim;
- lim = newlim;
- }
-#endif
-
- if (echar != 0) /* We need to skip over a portion of the end field. */
- {
- /* If we're ignoring leading blanks when computing the End
- of the field, skip past them here. */
- if (key->skipeblanks)
- while (ptr < lim && blanks[to_uchar (*ptr)])
- ++ptr;
-
- /* Advance PTR by ECHAR (if possible), but no further than LIM. */
- ptr = MIN (lim, ptr + echar);
- }
-
- return ptr;
-}
-
/* Fill BUF reading from FP, moving buf->left bytes from the end
of buf->buf to the beginning first. If EOF is reached and the
file wasn't terminated by a newline, supply one. Set up BUF's line
@@ -1857,225 +1574,6 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file)
}
}
-/* Table that maps characters to order-of-magnitude values. */
-static char const unit_order[UCHAR_LIM] =
- {
-#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
- && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107)
- /* This initializer syntax works on all C99 hosts. For now, use
- it only on non-ASCII hosts, to ease the pain of porting to
- pre-C99 ASCII hosts. */
- ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
- ['k']=1,
-#else
- /* Generate the following table with this command:
- perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8);
- foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
- |fmt */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
- 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-#endif
- };
-
-/* Traverse number given as *number consisting of digits, thousands_sep, and
- decimal_point chars only. Returns the highest digit found in the number,
- or '\0' if no digit has been found. Upon return *number points at the
- character that immediately follows after the given number. */
-static unsigned char
-traverse_raw_number (char const **number)
-{
- char const *p = *number;
- unsigned char ch;
- unsigned char max_digit = '\0';
- bool ends_with_thousands_sep = false;
-
- /* Scan to end of number.
- Decimals or separators not followed by digits stop the scan.
- Numbers ending in decimals or separators are thus considered
- to be lacking in units.
- FIXME: add support for multibyte thousands_sep and decimal_point. */
-
- while (ISDIGIT (ch = *p++))
- {
- if (max_digit < ch)
- max_digit = ch;
-
- /* Allow to skip only one occurrence of thousands_sep to avoid finding
- the unit in the next column in case thousands_sep matches as blank
- and is used as column delimiter. */
- ends_with_thousands_sep = (*p == thousands_sep);
- if (ends_with_thousands_sep)
- ++p;
- }
-
- if (ends_with_thousands_sep)
- {
- /* thousands_sep not followed by digit is not allowed. */
- *number = p - 2;
- return max_digit;
- }
-
- if (ch == decimal_point)
- while (ISDIGIT (ch = *p++))
- if (max_digit < ch)
- max_digit = ch;
-
- *number = p - 1;
- return max_digit;
-}
-
-/* Return an integer that represents the order of magnitude of the
- unit following the number. The number may contain thousands
- separators and a decimal point, but it may not contain leading blanks.
- Negative numbers get negative orders; zero numbers have a zero order. */
-
-static int _GL_ATTRIBUTE_PURE
-find_unit_order (char const *number)
-{
- bool minus_sign = (*number == '-');
- char const *p = number + minus_sign;
- unsigned char max_digit = traverse_raw_number (&p);
- if ('0' < max_digit)
- {
- unsigned char ch = *p;
- int order = unit_order[ch];
- return (minus_sign ? -order : order);
- }
- else
- return 0;
-}
-
-/* Compare numbers A and B ending in units with SI or IEC prefixes
- <none/unknown> < K/k < M < G < T < P < E < Z < Y */
-
-static int
-human_numcompare (char const *a, char const *b)
-{
- while (blanks[to_uchar (*a)])
- a++;
- while (blanks[to_uchar (*b)])
- b++;
-
- int diff = find_unit_order (a) - find_unit_order (b);
- return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
-}
-
-/* Compare strings A and B as numbers without explicitly converting them to
- machine numbers. Comparatively slow for short strings, but asymptotically
- hideously fast. */
-
-static int
-numcompare (char const *a, char const *b)
-{
- while (blanks[to_uchar (*a)])
- a++;
- while (blanks[to_uchar (*b)])
- b++;
-
- return strnumcmp (a, b, decimal_point, thousands_sep);
-}
-
-/* Work around a problem whereby the long double value returned by glibc's
- strtold ("NaN", ...) contains uninitialized bits: clear all bytes of
- A and B before calling strtold. FIXME: remove this function once
- gnulib guarantees that strtold's result is always well defined. */
-static int
-nan_compare (char const *sa, char const *sb)
-{
- long_double a;
- memset (&a, 0, sizeof a);
- a = strtold (sa, NULL);
-
- long_double b;
- memset (&b, 0, sizeof b);
- b = strtold (sb, NULL);
-
- return memcmp (&a, &b, sizeof a);
-}
-
-static int
-general_numcompare (char const *sa, char const *sb)
-{
- /* FIXME: maybe add option to try expensive FP conversion
- only if A and B can't be compared more cheaply/accurately. */
-
- char *ea;
- char *eb;
- long_double a = strtold (sa, &ea);
- long_double b = strtold (sb, &eb);
-
- /* Put conversion errors at the start of the collating sequence. */
- if (sa == ea)
- return sb == eb ? 0 : -1;
- if (sb == eb)
- return 1;
-
- /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
- conversion errors but before numbers; sort them by internal
- bit-pattern, for lack of a more portable alternative. */
- return (a < b ? -1
- : a > b ? 1
- : a == b ? 0
- : b == b ? -1
- : a == a ? 1
- : nan_compare (sa, sb));
-}
-
-/* Return an integer in 1..12 of the month name MONTH.
- Return 0 if the name in S is not recognized. */
-
-static int
-getmonth (char const *month, char **ea)
-{
- size_t lo = 0;
- size_t hi = MONTHS_PER_YEAR;
-
- while (blanks[to_uchar (*month)])
- month++;
-
- do
- {
- size_t ix = (lo + hi) / 2;
- char const *m = month;
- char const *n = monthtab[ix].name;
-
- for (;; m++, n++)
- {
- if (!*n)
- {
- if (ea)
- *ea = (char *) m;
- return monthtab[ix].val;
- }
- if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
- {
- hi = ix;
- break;
- }
- else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
- {
- lo = ix + 1;
- break;
- }
- }
- }
- while (lo < hi);
-
- return 0;
-}
-
-/* A randomly chosen MD5 state, used for random comparison. */
-static struct md5_ctx random_md5_state;
-
/* Initialize the randomly chosen MD5 state. */
static void
@@ -2092,159 +1590,6 @@ random_md5_state_init (char const *random_source)
md5_process_bytes (buf, sizeof buf, &random_md5_state);
}
-/* This is like strxfrm, except it reports any error and exits. */
-
-static size_t
-xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
-{
- errno = 0;
- size_t translated_size = strxfrm (dest, src, destsize);
-
- if (errno)
- {
- error (0, errno, _("string transformation failed"));
- error (0, 0, _("set LC_ALL='C' to work around the problem"));
- die (SORT_FAILURE, 0,
- _("the untransformed string was %s"),
- quotearg_n_style (0, locale_quoting_style, src));
- }
-
- return translated_size;
-}
-
-/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
- using one or more random hash functions. TEXTA[LENA] and
- TEXTB[LENB] must be zero. */
-
-static int
-compare_random (char *restrict texta, size_t lena,
- char *restrict textb, size_t lenb)
-{
- /* XFRM_DIFF records the equivalent of memcmp on the transformed
- data. This is used to break ties if there is a checksum
- collision, and this is good enough given the astronomically low
- probability of a collision. */
- int xfrm_diff = 0;
-
- char stackbuf[4000];
- char *buf = stackbuf;
- size_t bufsize = sizeof stackbuf;
- void *allocated = NULL;
- uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
- struct md5_ctx s[2];
- s[0] = s[1] = random_md5_state;
-
- if (hard_LC_COLLATE)
- {
- char const *lima = texta + lena;
- char const *limb = textb + lenb;
-
- while (true)
- {
- /* Transform the text into the basis of comparison, so that byte
- strings that would otherwise considered to be equal are
- considered equal here even if their bytes differ.
-
- Each time through this loop, transform one
- null-terminated string's worth from TEXTA or from TEXTB
- or both. That way, there's no need to store the
- transformation of the whole line, if it contains many
- null-terminated strings. */
-
- /* Store the transformed data into a big-enough buffer. */
-
- /* A 3X size guess avoids the overhead of calling strxfrm
- twice on typical implementations. Don't worry about
- size_t overflow, as the guess need not be correct. */
- size_t guess_bufsize = 3 * (lena + lenb) + 2;
- if (bufsize < guess_bufsize)
- {
- bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
- free (allocated);
- buf = allocated = malloc (bufsize);
- if (! buf)
- {
- buf = stackbuf;
- bufsize = sizeof stackbuf;
- }
- }
-
- size_t sizea =
- (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
- bool a_fits = sizea <= bufsize;
- size_t sizeb =
- (textb < limb
- ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
- (a_fits ? bufsize - sizea : 0))
- + 1)
- : 0);
-
- if (! (a_fits && sizea + sizeb <= bufsize))
- {
- bufsize = sizea + sizeb;
- if (bufsize < SIZE_MAX / 3)
- bufsize = bufsize * 3 / 2;
- free (allocated);
- buf = allocated = xmalloc (bufsize);
- if (texta < lima)
- strxfrm (buf, texta, sizea);
- if (textb < limb)
- strxfrm (buf + sizea, textb, sizeb);
- }
-
- /* Advance past NULs to the next part of each input string,
- exiting the loop if both strings are exhausted. When
- exiting the loop, prepare to finish off the tiebreaker
- comparison properly. */
- if (texta < lima)
- texta += strlen (texta) + 1;
- if (textb < limb)
- textb += strlen (textb) + 1;
- if (! (texta < lima || textb < limb))
- {
- lena = sizea; texta = buf;
- lenb = sizeb; textb = buf + sizea;
- break;
- }
-
- /* Accumulate the transformed data in the corresponding
- checksums. */
- md5_process_bytes (buf, sizea, &s[0]);
- md5_process_bytes (buf + sizea, sizeb, &s[1]);
-
- /* Update the tiebreaker comparison of the transformed data. */
- if (! xfrm_diff)
- {
- xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
- if (! xfrm_diff)
- xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
- }
- }
- }
-
- /* Compute and compare the checksums. */
- md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
- md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
- int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
-
- /* Fall back on the tiebreaker if the checksums collide. */
- if (! diff)
- {
- if (! xfrm_diff)
- {
- xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
- if (! xfrm_diff)
- xfrm_diff = (lena > lenb) - (lena < lenb);
- }
-
- diff = xfrm_diff;
- }
-
- free (allocated);
-
- return diff;
-}
-
/* Return the printable width of the block of memory starting at
TEXT and ending just before LIM, counting each tab as one byte.
FIXME: Should we generally be counting non printable chars? */
@@ -2279,14 +1624,6 @@ mark_key (size_t offset, size_t width)
}
}
-/* Return true if KEY is a numeric key. */
-
-static inline bool
-key_numeric (struct keyfield const *key)
-{
- return key->numeric || key->general_numeric || key->human_numeric;
-}
-
/* For LINE, output a debugging line that underlines KEY in LINE.
If KEY is null, underline the whole line. */
@@ -2510,252 +1847,6 @@ key_warnings (struct keyfield const *gkey, bool gkey_only)
error (0, 0, _("option '-r' only applies to last-resort comparison"));
}
-/* Compare two lines A and B trying every key in sequence until there
- are no more keys or a difference is found. */
-
-static int
-keycompare (struct line const *a, struct line const *b)
-{
- struct keyfield *key = keylist;
-
- /* For the first iteration only, the key positions have been
- precomputed for us. */
- char *texta = a->keybeg;
- char *textb = b->keybeg;
- char *lima = a->keylim;
- char *limb = b->keylim;
-
- int diff;
-
- while (true)
- {
- char const *translate = key->translate;
- bool const *ignore = key->ignore;
-
- /* Treat field ends before field starts as empty fields. */
- lima = MAX (texta, lima);
- limb = MAX (textb, limb);
-
- /* Find the lengths. */
- size_t lena = lima - texta;
- size_t lenb = limb - textb;
-
- if (hard_LC_COLLATE || key_numeric (key)
- || key->month || key->random || key->version)
- {
- char *ta;
- char *tb;
- size_t tlena;
- size_t tlenb;
-
- char enda IF_LINT (= 0);
- char endb IF_LINT (= 0);
- void *allocated IF_LINT (= NULL);
- char stackbuf[4000];
-
- if (ignore || translate)
- {
- /* Compute with copies of the keys, which are the result of
- translating or ignoring characters, and which need their
- own storage. */
-
- size_t i;
-
- /* Allocate space for copies. */
- size_t size = lena + 1 + lenb + 1;
- if (size <= sizeof stackbuf)
- ta = stackbuf, allocated = NULL;
- else
- ta = allocated = xmalloc (size);
- tb = ta + lena + 1;
-
- /* Put into each copy a version of the key in which the
- requested characters are ignored or translated. */
- for (tlena = i = 0; i < lena; i++)
- if (! (ignore && ignore[to_uchar (texta[i])]))
- ta[tlena++] = (translate
- ? translate[to_uchar (texta[i])]
- : texta[i]);
- ta[tlena] = '\0';
-
- for (tlenb = i = 0; i < lenb; i++)
- if (! (ignore && ignore[to_uchar (textb[i])]))
- tb[tlenb++] = (translate
- ? translate[to_uchar (textb[i])]
- : textb[i]);
- tb[tlenb] = '\0';
- }
- else
- {
- /* Use the keys in-place, temporarily null-terminated. */
- ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0';
- tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0';
- }
-
- if (key->numeric)
- diff = numcompare (ta, tb);
- else if (key->general_numeric)
- diff = general_numcompare (ta, tb);
- else if (key->human_numeric)
- diff = human_numcompare (ta, tb);
- else if (key->month)
- diff = getmonth (ta, NULL) - getmonth (tb, NULL);
- else if (key->random)
- diff = compare_random (ta, tlena, tb, tlenb);
- else if (key->version)
- diff = filevercmp (ta, tb);
- else
- {
- /* Locale-dependent string sorting. This is slower than
- C-locale sorting, which is implemented below. */
- if (tlena == 0)
- diff = - NONZERO (tlenb);
- else if (tlenb == 0)
- diff = 1;
- else
- diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
- }
-
- if (ignore || translate)
- free (allocated);
- else
- {
- ta[tlena] = enda;
- tb[tlenb] = endb;
- }
- }
- else if (ignore)
- {
-#define CMP_WITH_IGNORE(A, B) \
- do \
- { \
- while (true) \
- { \
- while (texta < lima && ignore[to_uchar (*texta)]) \
- ++texta; \
- while (textb < limb && ignore[to_uchar (*textb)]) \
- ++textb; \
- if (! (texta < lima && textb < limb)) \
- break; \
- diff = to_uchar (A) - to_uchar (B); \
- if (diff) \
- goto not_equal; \
- ++texta; \
- ++textb; \
- } \
- \
- diff = (texta < lima) - (textb < limb); \
- } \
- while (0)
-
- if (translate)
- CMP_WITH_IGNORE (translate[to_uchar (*texta)],
- translate[to_uchar (*textb)]);
- else
- CMP_WITH_IGNORE (*texta, *textb);
- }
- else if (lena == 0)
- diff = - NONZERO (lenb);
- else if (lenb == 0)
- goto greater;
- else
- {
- if (translate)
- {
- while (texta < lima && textb < limb)
- {
- diff = (to_uchar (translate[to_uchar (*texta++)])
- - to_uchar (translate[to_uchar (*textb++)]));
- if (diff)
- goto not_equal;
- }
- }
- else
- {
- diff = memcmp (texta, textb, MIN (lena, lenb));
- if (diff)
- goto not_equal;
- }
- diff = lena < lenb ? -1 : lena != lenb;
- }
-
- if (diff)
- goto not_equal;
-
- key = key->next;
- if (! key)
- break;
-
- /* Find the beginning and limit of the next field. */
- if (key->eword != SIZE_MAX)
- lima = limfield (a, key), limb = limfield (b, key);
- else
- lima = a->text + a->length - 1, limb = b->text + b->length - 1;
-
- if (key->sword != SIZE_MAX)
- texta = begfield (a, key), textb = begfield (b, key);
- else
- {
- texta = a->text, textb = b->text;
- if (key->skipsblanks)
- {
- while (texta < lima && blanks[to_uchar (*texta)])
- ++texta;
- while (textb < limb && blanks[to_uchar (*textb)])
- ++textb;
- }
- }
- }
-
- return 0;
-
- greater:
- diff = 1;
- not_equal:
- return key->reverse ? -diff : diff;
-}
-
-/* Compare two lines A and B, returning negative, zero, or positive
- depending on whether A compares less than, equal to, or greater than B. */
-
-static int
-compare (struct line const *a, struct line const *b)
-{
- int diff;
- size_t alen, blen;
-
- /* First try to compare on the specified keys (if any).
- The only two cases with no key at all are unadorned sort,
- and unadorned sort -r. */
- if (keylist)
- {
- diff = keycompare (a, b);
- if (diff || unique || stable)
- return diff;
- }
-
- /* If the keys all compare equal (or no keys were specified)
- fall through to the default comparison. */
- alen = a->length - 1, blen = b->length - 1;
-
- if (alen == 0)
- diff = - NONZERO (blen);
- else if (blen == 0)
- diff = 1;
- else if (hard_LC_COLLATE)
- {
- /* Note xmemcoll0 is a performance enhancement as
- it will not unconditionally write '\0' after the
- passed in buffers, which was seen to give around
- a 3% increase in performance for short lines. */
- diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
- }
- else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))
- diff = alen < blen ? -1 : alen != blen;
-
- return reverse ? -diff : diff;
-}
-
/* Write LINE to output stream FP; the output file's name is
OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
otherwise. If debugging is enabled and FP is standard output,
@@ -4015,29 +3106,6 @@ sort (char *const *files, size_t nfiles, char const *output_file,
/* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */
-static void
-insertkey (struct keyfield *key_arg)
-{
- struct keyfield **p;
- struct keyfield *key = xmemdup (key_arg, sizeof *key);
-
- for (p = &keylist; *p; p = &(*p)->next)
- continue;
- *p = key;
- key->next = NULL;
-}
-
-/* Report a bad field specification SPEC, with extra info MSGID. */
-
-static void badfieldspec (char const *, char const *)
- ATTRIBUTE_NORETURN;
-static void
-badfieldspec (char const *spec, char const *msgid)
-{
- die (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
- _(msgid), quote (spec));
-}
-
/* Report incompatible options. */
static void incompatible_options (char const *) ATTRIBUTE_NORETURN;
@@ -4067,41 +3135,6 @@ check_ordering_compatibility (void)
}
}
-/* Parse the leading integer in STRING and store the resulting value
- (which must fit into size_t) into *VAL. Return the address of the
- suffix after the integer. If the value is too large, silently
- substitute SIZE_MAX. If MSGID is NULL, return NULL after
- failure; otherwise, report MSGID and exit on failure. */
-
-static char const *
-parse_field_count (char const *string, size_t *val, char const *msgid)
-{
- char *suffix;
- uintmax_t n;
-
- switch (xstrtoumax (string, &suffix, 10, &n, ""))
- {
- case LONGINT_OK:
- case LONGINT_INVALID_SUFFIX_CHAR:
- *val = n;
- if (*val == n)
- break;
- /* Fall through. */
- case LONGINT_OVERFLOW:
- case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
- *val = SIZE_MAX;
- break;
-
- case LONGINT_INVALID:
- if (msgid)
- die (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
- _(msgid), quote (string));
- return NULL;
- }
-
- return suffix;
-}
-
/* Handle interrupts and hangups. */
static void
@@ -4116,75 +3149,6 @@ sighandler (int sig)
raise (sig);
}
-/* Set the ordering options for KEY specified in S.
- Return the address of the first character in S that
- is not a valid ordering option.
- BLANKTYPE is the kind of blanks that 'b' should skip. */
-
-static char *
-set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
-{
- while (*s)
- {
- switch (*s)
- {
- case 'b':
- if (blanktype == bl_start || blanktype == bl_both)
- key->skipsblanks = true;
- if (blanktype == bl_end || blanktype == bl_both)
- key->skipeblanks = true;
- break;
- case 'd':
- key->ignore = nondictionary;
- break;
- case 'f':
- key->translate = fold_toupper;
- break;
- case 'g':
- key->general_numeric = true;
- break;
- case 'h':
- key->human_numeric = true;
- break;
- case 'i':
- /* Option order should not matter, so don't let -i override
- -d. -d implies -i, but -i does not imply -d. */
- if (! key->ignore)
- key->ignore = nonprinting;
- break;
- case 'M':
- key->month = true;
- break;
- case 'n':
- key->numeric = true;
- break;
- case 'R':
- key->random = true;
- break;
- case 'r':
- key->reverse = true;
- break;
- case 'V':
- key->version = true;
- break;
- default:
- return (char *) s;
- }
- ++s;
- }
- return (char *) s;
-}
-
-/* Initialize KEY. */
-
-static struct keyfield *
-key_init (struct keyfield *key)
-{
- memset (key, 0, sizeof *key);
- key->eword = SIZE_MAX;
- return key;
-}
-
int
main (int argc, char **argv)
{
@@ -4433,54 +3397,7 @@ main (int argc, char **argv)
break;
case 'k':
- key = key_init (&key_buf);
-
- /* Get POS1. */
- s = parse_field_count (optarg, &key->sword,
- N_("invalid number at field start"));
- if (! key->sword--)
- {
- /* Provoke with 'sort -k0' */
- badfieldspec (optarg, N_("field number is zero"));
- }
- if (*s == '.')
- {
- s = parse_field_count (s + 1, &key->schar,
- N_("invalid number after '.'"));
- if (! key->schar--)
- {
- /* Provoke with 'sort -k1.0' */
- badfieldspec (optarg, N_("character offset is zero"));
- }
- }
- if (! (key->sword || key->schar))
- key->sword = SIZE_MAX;
- s = set_ordering (s, key, bl_start);
- if (*s != ',')
- {
- key->eword = SIZE_MAX;
- key->echar = 0;
- }
- else
- {
- /* Get POS2. */
- s = parse_field_count (s + 1, &key->eword,
- N_("invalid number after ','"));
- if (! key->eword--)
- {
- /* Provoke with 'sort -k1,0' */
- badfieldspec (optarg, N_("field number is zero"));
- }
- if (*s == '.')
- {
- s = parse_field_count (s + 1, &key->echar,
- N_("invalid number after '.'"));
- }
- s = set_ordering (s, key, bl_end);
- }
- if (*s)
- badfieldspec (optarg, N_("stray character in field spec"));
- insertkey (key);
+ add_key();
break;
case 'm':