diff options
author | Erich Eckner <git@eckner.net> | 2016-11-28 14:03:15 +0100 |
---|---|---|
committer | Erich Eckner <git@eckner.net> | 2017-02-16 13:23:48 +0100 |
commit | e74f109de104c1966d2d4863aedfb25eefe0358f (patch) | |
tree | 738ec2f7ae295019d346175a86ea85f7e13782e2 /src | |
parent | 1c27b56095b4a82be7d072baabc09262cb4227e5 (diff) | |
download | coreutils-e74f109de104c1966d2d4863aedfb25eefe0358f.tar.xz |
sort: split off source lines useful for uniq, too
Diffstat (limited to 'src')
-rw-r--r-- | src/sort-uniq-keyfuncs.c | 1091 | ||||
-rw-r--r-- | src/sort.c | 1089 |
2 files changed, 1094 insertions, 1086 deletions
diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c new file mode 100644 index 000000000..196045bc0 --- /dev/null +++ b/src/sort-uniq-keyfuncs.c @@ -0,0 +1,1091 @@ +#include "filevercmp.h" +#include "md5.h" +#include "strnumcmp.h" +#define UCHAR_LIM (UCHAR_MAX + 1) + +#if HAVE_C99_STRTOLD +# define long_double long double +#else +# define long_double double +# undef strtold +# define strtold strtod +#endif + +/* Exit statuses. */ +enum + { + /* POSIX says to exit with status 1 if invoked with -c and the + input is not properly sorted. */ + SORT_OUT_OF_ORDER = 1, + + /* POSIX says any other irregular exit must exit with a status + code greater than 1. */ + SORT_FAILURE = 2 + }; + +/* The representation of the decimal point in the current locale. */ +static int decimal_point; + +/* Thousands separator; if -1, then there isn't one. */ +static int thousands_sep; + +/* Nonzero if the corresponding locales are hard. */ +static bool hard_LC_COLLATE; + +#define NONZERO(x) ((x) != 0) + +/* The kind of blanks for '-b' to skip in various options. */ +enum blanktype { bl_start, bl_end, bl_both }; + +/* Lines are held in core as counted strings. */ +struct line +{ + char *text; /* Text of the line. */ + size_t length; /* Length including final newline. */ + char *keybeg; /* Start of first key. */ + char *keylim; /* Limit of first key. */ +}; + +/* Sort key. */ +struct keyfield +{ + size_t sword; /* Zero-origin 'word' to start at. */ + size_t schar; /* Additional characters to skip. */ + size_t eword; /* Zero-origin last 'word' of key. */ + size_t echar; /* Additional characters in field. */ + bool const *ignore; /* Boolean array of characters to ignore. */ + char const *translate; /* Translation applied to characters. */ + bool skipsblanks; /* Skip leading blanks when finding start. */ + bool skipeblanks; /* Skip leading blanks when finding end. */ + bool numeric; /* Flag for numeric comparison. Handle + strings of digits with optional decimal + point, but no exponential notation. */ + bool random; /* Sort by random hash of key. */ + bool general_numeric; /* Flag for general, numeric comparison. + Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI or IEC prefixes. */ + bool month; /* Flag for comparison by month name. */ + bool reverse; /* Reverse the sense of comparison. */ + bool version; /* sort by version number */ + bool traditional_used; /* Traditional key option format is used. */ + struct keyfield *next; /* Next keyfield to try. */ +}; + +struct month +{ + char const *name; + int val; +}; + +/* FIXME: None of these tables work with multibyte character sets. + Also, there are many other bugs when handling multibyte characters. + One way to fix this is to rewrite 'sort' to use wide characters + internally, but doing this with good performance is a bit + tricky. */ + +/* Table of blanks. */ +static bool blanks[UCHAR_LIM]; + +/* Table of non-printing characters. */ +static bool nonprinting[UCHAR_LIM]; + +/* Table of non-dictionary characters (not letters, digits, or blanks). */ +static bool nondictionary[UCHAR_LIM]; + +/* Translation table folding lower case to upper. */ +static char fold_toupper[UCHAR_LIM]; + +#define MONTHS_PER_YEAR 12 + +/* Table mapping month names to integers. + Alphabetic order allows binary search. */ +static struct month monthtab[] = +{ + {"APR", 4}, + {"AUG", 8}, + {"DEC", 12}, + {"FEB", 2}, + {"JAN", 1}, + {"JUL", 7}, + {"JUN", 6}, + {"MAR", 3}, + {"MAY", 5}, + {"NOV", 11}, + {"OCT", 10}, + {"SEP", 9} +}; + +/* Flag to reverse the order of all comparisons. */ +static bool reverse; + +/* Flag for stable sort. This turns off the last ditch bytewise + comparison of lines, and instead leaves lines in the same order + they were read if all keys compare equal. */ +static bool stable; + +/* If TAB has this value, blanks separate fields. */ +enum { TAB_DEFAULT = CHAR_MAX + 1 }; + +/* Tab character separating fields. If TAB_DEFAULT, then fields are + separated by the empty string between a non-blank character and a blank + character. */ +static int tab = TAB_DEFAULT; + +/* Flag to remove consecutive duplicate lines from the output. + Only the last of a sequence of equal lines will be output. */ +static bool unique; + +/* List of key field comparisons to be tried. */ +static struct keyfield *keylist; + +/* Return a pointer to the first character of the field specified + by KEY in LINE. */ + +static char * +begfield (struct line const *line, struct keyfield const *key) +{ + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t sword = key->sword; + size_t schar = key->schar; + + /* The leading field separator itself is included in a field when -t + is absent. */ + + if (tab != TAB_DEFAULT) + while (ptr < lim && sword--) + { + while (ptr < lim && *ptr != tab) + ++ptr; + if (ptr < lim) + ++ptr; + } + else + while (ptr < lim && sword--) + { + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + while (ptr < lim && !blanks[to_uchar (*ptr)]) + ++ptr; + } + + /* If we're ignoring leading blanks when computing the Start + of the field, skip past them here. */ + if (key->skipsblanks) + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + + /* Advance PTR by SCHAR (if possible), but no further than LIM. */ + ptr = MIN (lim, ptr + schar); + + return ptr; +} + +/* Return the limit of (a pointer to the first character after) the field + in LINE specified by KEY. */ + +static char * +limfield (struct line const *line, struct keyfield const *key) +{ + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t eword = key->eword, echar = key->echar; + + if (echar == 0) + eword++; /* Skip all of end field. */ + + /* Move PTR past EWORD fields or to one past the last byte on LINE, + whichever comes first. If there are more than EWORD fields, leave + PTR pointing at the beginning of the field having zero-based index, + EWORD. If a delimiter character was specified (via -t), then that + 'beginning' is the first character following the delimiting TAB. + Otherwise, leave PTR pointing at the first 'blank' character after + the preceding field. */ + if (tab != TAB_DEFAULT) + while (ptr < lim && eword--) + { + while (ptr < lim && *ptr != tab) + ++ptr; + if (ptr < lim && (eword || echar)) + ++ptr; + } + else + while (ptr < lim && eword--) + { + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + while (ptr < lim && !blanks[to_uchar (*ptr)]) + ++ptr; + } + +#ifdef POSIX_UNSPECIFIED + /* The following block of code makes GNU sort incompatible with + standard Unix sort, so it's ifdef'd out for now. + The POSIX spec isn't clear on how to interpret this. + FIXME: request clarification. + + From: kwzh@gnu.ai.mit.edu (Karl Heuer) + Date: Thu, 30 May 96 12:20:41 -0400 + [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.] + + [...]I believe I've found another bug in 'sort'. + + $ cat /tmp/sort.in + a b c 2 d + pq rs 1 t + $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in + a b c 2 d + pq rs 1 t + $ /bin/sort -k1.7,1.7 </tmp/sort.in + pq rs 1 t + a b c 2 d + + Unix sort produced the answer I expected: sort on the single character + in column 7. GNU sort produced different results, because it disagrees + on the interpretation of the key-end spec "M.N". Unix sort reads this + as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean + "skip M-1 fields, then either N-1 characters or the rest of the current + field, whichever comes first". This extra clause applies only to + key-ends, not key-starts. + */ + + /* Make LIM point to the end of (one byte past) the current field. */ + if (tab != TAB_DEFAULT) + { + char *newlim; + newlim = memchr (ptr, tab, lim - ptr); + if (newlim) + lim = newlim; + } + else + { + char *newlim; + newlim = ptr; + while (newlim < lim && blanks[to_uchar (*newlim)]) + ++newlim; + while (newlim < lim && !blanks[to_uchar (*newlim)]) + ++newlim; + lim = newlim; + } +#endif + + if (echar != 0) /* We need to skip over a portion of the end field. */ + { + /* If we're ignoring leading blanks when computing the End + of the field, skip past them here. */ + if (key->skipeblanks) + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + + /* Advance PTR by ECHAR (if possible), but no further than LIM. */ + ptr = MIN (lim, ptr + echar); + } + + return ptr; +} + +/* Table that maps characters to order-of-magnitude values. */ +static char const unit_order[UCHAR_LIM] = + { +#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \ + && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107) + /* This initializer syntax works on all C99 hosts. For now, use + it only on non-ASCII hosts, to ease the pain of porting to + pre-C99 ASCII hosts. */ + ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, + ['k']=1, +#else + /* Generate the following table with this command: + perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8); + foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\ + |fmt */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, + 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +#endif + }; + +/* Traverse number given as *number consisting of digits, thousands_sep, and + decimal_point chars only. Returns the highest digit found in the number, + or '\0' if no digit has been found. Upon return *number points at the + character that immediately follows after the given number. */ +static unsigned char +traverse_raw_number (char const **number) +{ + char const *p = *number; + unsigned char ch; + unsigned char max_digit = '\0'; + bool ends_with_thousands_sep = false; + + /* Scan to end of number. + Decimals or separators not followed by digits stop the scan. + Numbers ending in decimals or separators are thus considered + to be lacking in units. + FIXME: add support for multibyte thousands_sep and decimal_point. */ + + while (ISDIGIT (ch = *p++)) + { + if (max_digit < ch) + max_digit = ch; + + /* Allow to skip only one occurrence of thousands_sep to avoid finding + the unit in the next column in case thousands_sep matches as blank + and is used as column delimiter. */ + ends_with_thousands_sep = (*p == thousands_sep); + if (ends_with_thousands_sep) + ++p; + } + + if (ends_with_thousands_sep) + { + /* thousands_sep not followed by digit is not allowed. */ + *number = p - 2; + return max_digit; + } + + if (ch == decimal_point) + while (ISDIGIT (ch = *p++)) + if (max_digit < ch) + max_digit = ch; + + *number = p - 1; + return max_digit; +} + +/* Return an integer that represents the order of magnitude of the + unit following the number. The number may contain thousands + separators and a decimal point, but it may not contain leading blanks. + Negative numbers get negative orders; zero numbers have a zero order. */ + +static int _GL_ATTRIBUTE_PURE +find_unit_order (char const *number) +{ + bool minus_sign = (*number == '-'); + char const *p = number + minus_sign; + unsigned char max_digit = traverse_raw_number (&p); + if ('0' < max_digit) + { + unsigned char ch = *p; + int order = unit_order[ch]; + return (minus_sign ? -order : order); + } + else + return 0; +} + +/* Compare numbers A and B ending in units with SI or IEC prefixes + <none/unknown> < K/k < M < G < T < P < E < Z < Y */ + +static int +human_numcompare (char const *a, char const *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + int diff = find_unit_order (a) - find_unit_order (b); + return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep)); +} + +/* Compare strings A and B as numbers without explicitly converting them to + machine numbers. Comparatively slow for short strings, but asymptotically + hideously fast. */ + +static int +numcompare (char const *a, char const *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + return strnumcmp (a, b, decimal_point, thousands_sep); +} + +/* Work around a problem whereby the long double value returned by glibc's + strtold ("NaN", ...) contains uninitialized bits: clear all bytes of + A and B before calling strtold. FIXME: remove this function once + gnulib guarantees that strtold's result is always well defined. */ +static int +nan_compare (char const *sa, char const *sb) +{ + long_double a; + memset (&a, 0, sizeof a); + a = strtold (sa, NULL); + + long_double b; + memset (&b, 0, sizeof b); + b = strtold (sb, NULL); + + return memcmp (&a, &b, sizeof a); +} + +static int +general_numcompare (char const *sa, char const *sb) +{ + /* FIXME: maybe add option to try expensive FP conversion + only if A and B can't be compared more cheaply/accurately. */ + + char *ea; + char *eb; + long_double a = strtold (sa, &ea); + long_double b = strtold (sb, &eb); + + /* Put conversion errors at the start of the collating sequence. */ + if (sa == ea) + return sb == eb ? 0 : -1; + if (sb == eb) + return 1; + + /* Sort numbers in the usual way, where -0 == +0. Put NaNs after + conversion errors but before numbers; sort them by internal + bit-pattern, for lack of a more portable alternative. */ + return (a < b ? -1 + : a > b ? 1 + : a == b ? 0 + : b == b ? -1 + : a == a ? 1 + : nan_compare (sa, sb)); +} + +/* Return an integer in 1..12 of the month name MONTH. + Return 0 if the name in S is not recognized. */ + +static int +getmonth (char const *month, char **ea) +{ + size_t lo = 0; + size_t hi = MONTHS_PER_YEAR; + + while (blanks[to_uchar (*month)]) + month++; + + do + { + size_t ix = (lo + hi) / 2; + char const *m = month; + char const *n = monthtab[ix].name; + + for (;; m++, n++) + { + if (!*n) + { + if (ea) + *ea = (char *) m; + return monthtab[ix].val; + } + if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n)) + { + hi = ix; + break; + } + else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n)) + { + lo = ix + 1; + break; + } + } + } + while (lo < hi); + + return 0; +} + +/* A randomly chosen MD5 state, used for random comparison. */ +static struct md5_ctx random_md5_state; + +/* This is like strxfrm, except it reports any error and exits. */ + +static size_t +xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize) +{ + errno = 0; + size_t translated_size = strxfrm (dest, src, destsize); + + if (errno) + { + error (0, errno, _("string transformation failed")); + error (0, 0, _("set LC_ALL='C' to work around the problem")); + die (SORT_FAILURE, 0, + _("the untransformed string was %s"), + quotearg_n_style (0, locale_quoting_style, src)); + } + + return translated_size; +} + +/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB) + using one or more random hash functions. TEXTA[LENA] and + TEXTB[LENB] must be zero. */ + +static int +compare_random (char *restrict texta, size_t lena, + char *restrict textb, size_t lenb) +{ + /* XFRM_DIFF records the equivalent of memcmp on the transformed + data. This is used to break ties if there is a checksum + collision, and this is good enough given the astronomically low + probability of a collision. */ + int xfrm_diff = 0; + + char stackbuf[4000]; + char *buf = stackbuf; + size_t bufsize = sizeof stackbuf; + void *allocated = NULL; + uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)]; + struct md5_ctx s[2]; + s[0] = s[1] = random_md5_state; + + if (hard_LC_COLLATE) + { + char const *lima = texta + lena; + char const *limb = textb + lenb; + + while (true) + { + /* Transform the text into the basis of comparison, so that byte + strings that would otherwise considered to be equal are + considered equal here even if their bytes differ. + + Each time through this loop, transform one + null-terminated string's worth from TEXTA or from TEXTB + or both. That way, there's no need to store the + transformation of the whole line, if it contains many + null-terminated strings. */ + + /* Store the transformed data into a big-enough buffer. */ + + /* A 3X size guess avoids the overhead of calling strxfrm + twice on typical implementations. Don't worry about + size_t overflow, as the guess need not be correct. */ + size_t guess_bufsize = 3 * (lena + lenb) + 2; + if (bufsize < guess_bufsize) + { + bufsize = MAX (guess_bufsize, bufsize * 3 / 2); + free (allocated); + buf = allocated = malloc (bufsize); + if (! buf) + { + buf = stackbuf; + bufsize = sizeof stackbuf; + } + } + + size_t sizea = + (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0); + bool a_fits = sizea <= bufsize; + size_t sizeb = + (textb < limb + ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb, + (a_fits ? bufsize - sizea : 0)) + + 1) + : 0); + + if (! (a_fits && sizea + sizeb <= bufsize)) + { + bufsize = sizea + sizeb; + if (bufsize < SIZE_MAX / 3) + bufsize = bufsize * 3 / 2; + free (allocated); + buf = allocated = xmalloc (bufsize); + if (texta < lima) + strxfrm (buf, texta, sizea); + if (textb < limb) + strxfrm (buf + sizea, textb, sizeb); + } + + /* Advance past NULs to the next part of each input string, + exiting the loop if both strings are exhausted. When + exiting the loop, prepare to finish off the tiebreaker + comparison properly. */ + if (texta < lima) + texta += strlen (texta) + 1; + if (textb < limb) + textb += strlen (textb) + 1; + if (! (texta < lima || textb < limb)) + { + lena = sizea; texta = buf; + lenb = sizeb; textb = buf + sizea; + break; + } + + /* Accumulate the transformed data in the corresponding + checksums. */ + md5_process_bytes (buf, sizea, &s[0]); + md5_process_bytes (buf + sizea, sizeb, &s[1]); + + /* Update the tiebreaker comparison of the transformed data. */ + if (! xfrm_diff) + { + xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb)); + if (! xfrm_diff) + xfrm_diff = (sizea > sizeb) - (sizea < sizeb); + } + } + } + + /* Compute and compare the checksums. */ + md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]); + md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]); + int diff = memcmp (dig[0], dig[1], sizeof dig[0]); + + /* Fall back on the tiebreaker if the checksums collide. */ + if (! diff) + { + if (! xfrm_diff) + { + xfrm_diff = memcmp (texta, textb, MIN (lena, lenb)); + if (! xfrm_diff) + xfrm_diff = (lena > lenb) - (lena < lenb); + } + + diff = xfrm_diff; + } + + free (allocated); + + return diff; +} + +/* Return true if KEY is a numeric key. */ + +static inline bool +key_numeric (struct keyfield const *key) +{ + return key->numeric || key->general_numeric || key->human_numeric; +} + +/* Compare two lines A and B trying every key in sequence until there + are no more keys or a difference is found. */ + +static int +keycompare (struct line const *a, struct line const *b) +{ + struct keyfield *key = keylist; + + /* For the first iteration only, the key positions have been + precomputed for us. */ + char *texta = a->keybeg; + char *textb = b->keybeg; + char *lima = a->keylim; + char *limb = b->keylim; + + int diff; + + while (true) + { + char const *translate = key->translate; + bool const *ignore = key->ignore; + + /* Treat field ends before field starts as empty fields. */ + lima = MAX (texta, lima); + limb = MAX (textb, limb); + + /* Find the lengths. */ + size_t lena = lima - texta; + size_t lenb = limb - textb; + + if (hard_LC_COLLATE || key_numeric (key) + || key->month || key->random || key->version) + { + char *ta; + char *tb; + size_t tlena; + size_t tlenb; + + char enda IF_LINT (= 0); + char endb IF_LINT (= 0); + void *allocated IF_LINT (= NULL); + char stackbuf[4000]; + + if (ignore || translate) + { + /* Compute with copies of the keys, which are the result of + translating or ignoring characters, and which need their + own storage. */ + + size_t i; + + /* Allocate space for copies. */ + size_t size = lena + 1 + lenb + 1; + if (size <= sizeof stackbuf) + ta = stackbuf, allocated = NULL; + else + ta = allocated = xmalloc (size); + tb = ta + lena + 1; + + /* Put into each copy a version of the key in which the + requested characters are ignored or translated. */ + for (tlena = i = 0; i < lena; i++) + if (! (ignore && ignore[to_uchar (texta[i])])) + ta[tlena++] = (translate + ? translate[to_uchar (texta[i])] + : texta[i]); + ta[tlena] = '\0'; + + for (tlenb = i = 0; i < lenb; i++) + if (! (ignore && ignore[to_uchar (textb[i])])) + tb[tlenb++] = (translate + ? translate[to_uchar (textb[i])] + : textb[i]); + tb[tlenb] = '\0'; + } + else + { + /* Use the keys in-place, temporarily null-terminated. */ + ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0'; + tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0'; + } + + if (key->numeric) + diff = numcompare (ta, tb); + else if (key->general_numeric) + diff = general_numcompare (ta, tb); + else if (key->human_numeric) + diff = human_numcompare (ta, tb); + else if (key->month) + diff = getmonth (ta, NULL) - getmonth (tb, NULL); + else if (key->random) + diff = compare_random (ta, tlena, tb, tlenb); + else if (key->version) + diff = filevercmp (ta, tb); + else + { + /* Locale-dependent string sorting. This is slower than + C-locale sorting, which is implemented below. */ + if (tlena == 0) + diff = - NONZERO (tlenb); + else if (tlenb == 0) + diff = 1; + else + diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1); + } + + if (ignore || translate) + free (allocated); + else + { + ta[tlena] = enda; + tb[tlenb] = endb; + } + } + else if (ignore) + { +#define CMP_WITH_IGNORE(A, B) \ + do \ + { \ + while (true) \ + { \ + while (texta < lima && ignore[to_uchar (*texta)]) \ + ++texta; \ + while (textb < limb && ignore[to_uchar (*textb)]) \ + ++textb; \ + if (! (texta < lima && textb < limb)) \ + break; \ + diff = to_uchar (A) - to_uchar (B); \ + if (diff) \ + goto not_equal; \ + ++texta; \ + ++textb; \ + } \ + \ + diff = (texta < lima) - (textb < limb); \ + } \ + while (0) + + if (translate) + CMP_WITH_IGNORE (translate[to_uchar (*texta)], + translate[to_uchar (*textb)]); + else + CMP_WITH_IGNORE (*texta, *textb); + } + else if (lena == 0) + diff = - NONZERO (lenb); + else if (lenb == 0) + goto greater; + else + { + if (translate) + { + while (texta < lima && textb < limb) + { + diff = (to_uchar (translate[to_uchar (*texta++)]) + - to_uchar (translate[to_uchar (*textb++)])); + if (diff) + goto not_equal; + } + } + else + { + diff = memcmp (texta, textb, MIN (lena, lenb)); + if (diff) + goto not_equal; + } + diff = lena < lenb ? -1 : lena != lenb; + } + + if (diff) + goto not_equal; + + key = key->next; + if (! key) + break; + + /* Find the beginning and limit of the next field. */ + if (key->eword != SIZE_MAX) + lima = limfield (a, key), limb = limfield (b, key); + else + lima = a->text + a->length - 1, limb = b->text + b->length - 1; + + if (key->sword != SIZE_MAX) + texta = begfield (a, key), textb = begfield (b, key); + else + { + texta = a->text, textb = b->text; + if (key->skipsblanks) + { + while (texta < lima && blanks[to_uchar (*texta)]) + ++texta; + while (textb < limb && blanks[to_uchar (*textb)]) + ++textb; + } + } + } + + return 0; + + greater: + diff = 1; + not_equal: + return key->reverse ? -diff : diff; +} + +/* Compare two lines A and B, returning negative, zero, or positive + depending on whether A compares less than, equal to, or greater than B. */ + +static int +compare (struct line const *a, struct line const *b) +{ + int diff; + size_t alen, blen; + + /* First try to compare on the specified keys (if any). + The only two cases with no key at all are unadorned sort, + and unadorned sort -r. */ + if (keylist) + { + diff = keycompare (a, b); + if (diff || unique || stable) + return diff; + } + + /* If the keys all compare equal (or no keys were specified) + fall through to the default comparison. */ + alen = a->length - 1, blen = b->length - 1; + + if (alen == 0) + diff = - NONZERO (blen); + else if (blen == 0) + diff = 1; + else if (hard_LC_COLLATE) + { + /* Note xmemcoll0 is a performance enhancement as + it will not unconditionally write '\0' after the + passed in buffers, which was seen to give around + a 3% increase in performance for short lines. */ + diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1); + } + else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen)))) + diff = alen < blen ? -1 : alen != blen; + + return reverse ? -diff : diff; +} + +static void +insertkey (struct keyfield *key_arg) +{ + struct keyfield **p; + struct keyfield *key = xmemdup (key_arg, sizeof *key); + + for (p = &keylist; *p; p = &(*p)->next) + continue; + *p = key; + key->next = NULL; +} + +/* Report a bad field specification SPEC, with extra info MSGID. */ + +static void badfieldspec (char const *, char const *) + ATTRIBUTE_NORETURN; +static void +badfieldspec (char const *spec, char const *msgid) +{ + die (SORT_FAILURE, 0, _("%s: invalid field specification %s"), + _(msgid), quote (spec)); +} + +/* Parse the leading integer in STRING and store the resulting value + (which must fit into size_t) into *VAL. Return the address of the + suffix after the integer. If the value is too large, silently + substitute SIZE_MAX. If MSGID is NULL, return NULL after + failure; otherwise, report MSGID and exit on failure. */ + +static char const * +parse_field_count (char const *string, size_t *val, char const *msgid) +{ + char *suffix; + uintmax_t n; + + switch (xstrtoumax (string, &suffix, 10, &n, "")) + { + case LONGINT_OK: + case LONGINT_INVALID_SUFFIX_CHAR: + *val = n; + if (*val == n) + break; + /* Fall through. */ + case LONGINT_OVERFLOW: + case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR: + *val = SIZE_MAX; + break; + + case LONGINT_INVALID: + if (msgid) + die (SORT_FAILURE, 0, _("%s: invalid count at start of %s"), + _(msgid), quote (string)); + return NULL; + } + + return suffix; +} + +/* Set the ordering options for KEY specified in S. + Return the address of the first character in S that + is not a valid ordering option. + BLANKTYPE is the kind of blanks that 'b' should skip. */ + +static char * +set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype) +{ + while (*s) + { + switch (*s) + { + case 'b': + if (blanktype == bl_start || blanktype == bl_both) + key->skipsblanks = true; + if (blanktype == bl_end || blanktype == bl_both) + key->skipeblanks = true; + break; + case 'd': + key->ignore = nondictionary; + break; + case 'f': + key->translate = fold_toupper; + break; + case 'g': + key->general_numeric = true; + break; + case 'h': + key->human_numeric = true; + break; + case 'i': + /* Option order should not matter, so don't let -i override + -d. -d implies -i, but -i does not imply -d. */ + if (! key->ignore) + key->ignore = nonprinting; + break; + case 'M': + key->month = true; + break; + case 'n': + key->numeric = true; + break; + case 'R': + key->random = true; + break; + case 'r': + key->reverse = true; + break; + case 'V': + key->version = true; + break; + default: + return (char *) s; + } + ++s; + } + return (char *) s; +} + +/* Initialize KEY. */ + +static struct keyfield * +key_init (struct keyfield *key) +{ + memset (key, 0, sizeof *key); + key->eword = SIZE_MAX; + return key; +} + +static void add_key(void) +{ + struct keyfield *key; + struct keyfield key_buf; char const *s; + key = key_init (&key_buf); + + /* Get POS1. */ + s = parse_field_count (optarg, &key->sword, + N_("invalid number at field start")); + if (! key->sword--) + { + /* Provoke with 'sort -k0' */ + badfieldspec (optarg, N_("field number is zero")); + } + if (*s == '.') + { + s = parse_field_count (s + 1, &key->schar, + N_("invalid number after '.'")); + if (! key->schar--) + { + /* Provoke with 'sort -k1.0' */ + badfieldspec (optarg, N_("character offset is zero")); + } + } + if (! (key->sword || key->schar)) + key->sword = SIZE_MAX; + s = set_ordering (s, key, bl_start); + if (*s != ',') + { + key->eword = SIZE_MAX; + key->echar = 0; + } + else + { + /* Get POS2. */ + s = parse_field_count (s + 1, &key->eword, + N_("invalid number after ','")); + if (! key->eword--) + { + /* Provoke with 'sort -k1,0' */ + badfieldspec (optarg, N_("field number is zero")); + } + if (*s == '.') + { + s = parse_field_count (s + 1, &key->echar, + N_("invalid number after '.'")); + } + s = set_ordering (s, key, bl_end); + } + if (*s) + badfieldspec (optarg, N_("stray character in field spec")); + insertkey (key); +} diff --git a/src/sort.c b/src/sort.c index 94315d2a1..6f28b63ae 100644 --- a/src/sort.c +++ b/src/sort.c @@ -34,13 +34,11 @@ #include "die.h" #include "error.h" #include "fadvise.h" -#include "filevercmp.h" #include "flexmember.h" #include "hard-locale.h" #include "hash.h" #include "heap.h" #include "ignore-value.h" -#include "md5.h" #include "mbswidth.h" #include "nproc.h" #include "physmem.h" @@ -50,11 +48,12 @@ #include "readtokens0.h" #include "stdio--.h" #include "stdlib--.h" -#include "strnumcmp.h" #include "xmemcoll.h" #include "xnanosleep.h" #include "xstrtol.h" +#include "sort-uniq-keyfuncs.c" + #ifndef RLIMIT_DATA struct rlimit { size_t rlim_cur; }; # define getrlimit(Resource, Rlp) (-1) @@ -90,16 +89,6 @@ struct rlimit { size_t rlim_cur; }; # define OPEN_MAX 20 #endif -#define UCHAR_LIM (UCHAR_MAX + 1) - -#if HAVE_C99_STRTOLD -# define long_double long double -#else -# define long_double double -# undef strtold -# define strtold strtod -#endif - #ifndef DEFAULT_TMPDIR # define DEFAULT_TMPDIR "/tmp" #endif @@ -124,18 +113,6 @@ verify (4 <= SUBTHREAD_LINES_HEURISTIC); diminishing performance gains. */ enum { DEFAULT_MAX_THREADS = 8 }; -/* Exit statuses. */ -enum - { - /* POSIX says to exit with status 1 if invoked with -c and the - input is not properly sorted. */ - SORT_OUT_OF_ORDER = 1, - - /* POSIX says any other irregular exit must exit with a status - code greater than 1. */ - SORT_FAILURE = 2 - }; - enum { /* The number of times we should try to fork a compression process @@ -159,35 +136,12 @@ enum MERGE_ROOT = 1 }; -/* The representation of the decimal point in the current locale. */ -static int decimal_point; - -/* Thousands separator; if -1, then there isn't one. */ -static int thousands_sep; - -/* Nonzero if the corresponding locales are hard. */ -static bool hard_LC_COLLATE; #if HAVE_NL_LANGINFO static bool hard_LC_TIME; #endif - -#define NONZERO(x) ((x) != 0) - -/* The kind of blanks for '-b' to skip in various options. */ -enum blanktype { bl_start, bl_end, bl_both }; - /* The character marking end of line. Default to \n. */ static char eolchar = '\n'; -/* Lines are held in core as counted strings. */ -struct line -{ - char *text; /* Text of the line. */ - size_t length; /* Length including final newline. */ - char *keybeg; /* Start of first key. */ - char *keylim; /* Limit of first key. */ -}; - /* Input buffers. */ struct buffer { @@ -204,38 +158,6 @@ struct buffer bool eof; /* An EOF has been read. */ }; -/* Sort key. */ -struct keyfield -{ - size_t sword; /* Zero-origin 'word' to start at. */ - size_t schar; /* Additional characters to skip. */ - size_t eword; /* Zero-origin last 'word' of key. */ - size_t echar; /* Additional characters in field. */ - bool const *ignore; /* Boolean array of characters to ignore. */ - char const *translate; /* Translation applied to characters. */ - bool skipsblanks; /* Skip leading blanks when finding start. */ - bool skipeblanks; /* Skip leading blanks when finding end. */ - bool numeric; /* Flag for numeric comparison. Handle - strings of digits with optional decimal - point, but no exponential notation. */ - bool random; /* Sort by random hash of key. */ - bool general_numeric; /* Flag for general, numeric comparison. - Handle numbers in exponential notation. */ - bool human_numeric; /* Flag for sorting by human readable - units with either SI or IEC prefixes. */ - bool month; /* Flag for comparison by month name. */ - bool reverse; /* Reverse the sense of comparison. */ - bool version; /* sort by version number */ - bool traditional_used; /* Traditional key option format is used. */ - struct keyfield *next; /* Next keyfield to try. */ -}; - -struct month -{ - char const *name; - int val; -}; - /* Binary merge tree node. */ struct merge_node { @@ -266,44 +188,6 @@ struct merge_node_queue /* Used to implement --unique (-u). */ static struct line saved_line; -/* FIXME: None of these tables work with multibyte character sets. - Also, there are many other bugs when handling multibyte characters. - One way to fix this is to rewrite 'sort' to use wide characters - internally, but doing this with good performance is a bit - tricky. */ - -/* Table of blanks. */ -static bool blanks[UCHAR_LIM]; - -/* Table of non-printing characters. */ -static bool nonprinting[UCHAR_LIM]; - -/* Table of non-dictionary characters (not letters, digits, or blanks). */ -static bool nondictionary[UCHAR_LIM]; - -/* Translation table folding lower case to upper. */ -static char fold_toupper[UCHAR_LIM]; - -#define MONTHS_PER_YEAR 12 - -/* Table mapping month names to integers. - Alphabetic order allows binary search. */ -static struct month monthtab[] = -{ - {"APR", 4}, - {"AUG", 8}, - {"DEC", 12}, - {"FEB", 2}, - {"JAN", 1}, - {"JUL", 7}, - {"JUN", 6}, - {"MAR", 3}, - {"MAY", 5}, - {"NOV", 11}, - {"OCT", 10}, - {"SEP", 9} -}; - /* During the merge phase, the number of files to merge at once. */ #define NMERGE_DEFAULT 16 @@ -338,32 +222,9 @@ static size_t temp_dir_count; /* Number of allocated slots in temp_dirs. */ static size_t temp_dir_alloc; -/* Flag to reverse the order of all comparisons. */ -static bool reverse; - -/* Flag for stable sort. This turns off the last ditch bytewise - comparison of lines, and instead leaves lines in the same order - they were read if all keys compare equal. */ -static bool stable; - -/* If TAB has this value, blanks separate fields. */ -enum { TAB_DEFAULT = CHAR_MAX + 1 }; - -/* Tab character separating fields. If TAB_DEFAULT, then fields are - separated by the empty string between a non-blank character and a blank - character. */ -static int tab = TAB_DEFAULT; - -/* Flag to remove consecutive duplicate lines from the output. - Only the last of a sequence of equal lines will be output. */ -static bool unique; - /* Nonzero if any of the input files are the standard input. */ static bool have_read_stdin; -/* List of key field comparisons to be tried. */ -static struct keyfield *keylist; - /* Program used to (de)compress temp files. Must accept -d. */ static char const *compress_program; @@ -1593,150 +1454,6 @@ buffer_linelim (struct buffer const *buf) return linelim; } -/* Return a pointer to the first character of the field specified - by KEY in LINE. */ - -static char * -begfield (struct line const *line, struct keyfield const *key) -{ - char *ptr = line->text, *lim = ptr + line->length - 1; - size_t sword = key->sword; - size_t schar = key->schar; - - /* The leading field separator itself is included in a field when -t - is absent. */ - - if (tab != TAB_DEFAULT) - while (ptr < lim && sword--) - { - while (ptr < lim && *ptr != tab) - ++ptr; - if (ptr < lim) - ++ptr; - } - else - while (ptr < lim && sword--) - { - while (ptr < lim && blanks[to_uchar (*ptr)]) - ++ptr; - while (ptr < lim && !blanks[to_uchar (*ptr)]) - ++ptr; - } - - /* If we're ignoring leading blanks when computing the Start - of the field, skip past them here. */ - if (key->skipsblanks) - while (ptr < lim && blanks[to_uchar (*ptr)]) - ++ptr; - - /* Advance PTR by SCHAR (if possible), but no further than LIM. */ - ptr = MIN (lim, ptr + schar); - - return ptr; -} - -/* Return the limit of (a pointer to the first character after) the field - in LINE specified by KEY. */ - -static char * -limfield (struct line const *line, struct keyfield const *key) -{ - char *ptr = line->text, *lim = ptr + line->length - 1; - size_t eword = key->eword, echar = key->echar; - - if (echar == 0) - eword++; /* Skip all of end field. */ - - /* Move PTR past EWORD fields or to one past the last byte on LINE, - whichever comes first. If there are more than EWORD fields, leave - PTR pointing at the beginning of the field having zero-based index, - EWORD. If a delimiter character was specified (via -t), then that - 'beginning' is the first character following the delimiting TAB. - Otherwise, leave PTR pointing at the first 'blank' character after - the preceding field. */ - if (tab != TAB_DEFAULT) - while (ptr < lim && eword--) - { - while (ptr < lim && *ptr != tab) - ++ptr; - if (ptr < lim && (eword || echar)) - ++ptr; - } - else - while (ptr < lim && eword--) - { - while (ptr < lim && blanks[to_uchar (*ptr)]) - ++ptr; - while (ptr < lim && !blanks[to_uchar (*ptr)]) - ++ptr; - } - -#ifdef POSIX_UNSPECIFIED - /* The following block of code makes GNU sort incompatible with - standard Unix sort, so it's ifdef'd out for now. - The POSIX spec isn't clear on how to interpret this. - FIXME: request clarification. - - From: kwzh@gnu.ai.mit.edu (Karl Heuer) - Date: Thu, 30 May 96 12:20:41 -0400 - [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.] - - [...]I believe I've found another bug in 'sort'. - - $ cat /tmp/sort.in - a b c 2 d - pq rs 1 t - $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in - a b c 2 d - pq rs 1 t - $ /bin/sort -k1.7,1.7 </tmp/sort.in - pq rs 1 t - a b c 2 d - - Unix sort produced the answer I expected: sort on the single character - in column 7. GNU sort produced different results, because it disagrees - on the interpretation of the key-end spec "M.N". Unix sort reads this - as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean - "skip M-1 fields, then either N-1 characters or the rest of the current - field, whichever comes first". This extra clause applies only to - key-ends, not key-starts. - */ - - /* Make LIM point to the end of (one byte past) the current field. */ - if (tab != TAB_DEFAULT) - { - char *newlim; - newlim = memchr (ptr, tab, lim - ptr); - if (newlim) - lim = newlim; - } - else - { - char *newlim; - newlim = ptr; - while (newlim < lim && blanks[to_uchar (*newlim)]) - ++newlim; - while (newlim < lim && !blanks[to_uchar (*newlim)]) - ++newlim; - lim = newlim; - } -#endif - - if (echar != 0) /* We need to skip over a portion of the end field. */ - { - /* If we're ignoring leading blanks when computing the End - of the field, skip past them here. */ - if (key->skipeblanks) - while (ptr < lim && blanks[to_uchar (*ptr)]) - ++ptr; - - /* Advance PTR by ECHAR (if possible), but no further than LIM. */ - ptr = MIN (lim, ptr + echar); - } - - return ptr; -} - /* Fill BUF reading from FP, moving buf->left bytes from the end of buf->buf to the beginning first. If EOF is reached and the file wasn't terminated by a newline, supply one. Set up BUF's line @@ -1857,225 +1574,6 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file) } } -/* Table that maps characters to order-of-magnitude values. */ -static char const unit_order[UCHAR_LIM] = - { -#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \ - && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107) - /* This initializer syntax works on all C99 hosts. For now, use - it only on non-ASCII hosts, to ease the pain of porting to - pre-C99 ASCII hosts. */ - ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, - ['k']=1, -#else - /* Generate the following table with this command: - perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8); - foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\ - |fmt */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, - 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -#endif - }; - -/* Traverse number given as *number consisting of digits, thousands_sep, and - decimal_point chars only. Returns the highest digit found in the number, - or '\0' if no digit has been found. Upon return *number points at the - character that immediately follows after the given number. */ -static unsigned char -traverse_raw_number (char const **number) -{ - char const *p = *number; - unsigned char ch; - unsigned char max_digit = '\0'; - bool ends_with_thousands_sep = false; - - /* Scan to end of number. - Decimals or separators not followed by digits stop the scan. - Numbers ending in decimals or separators are thus considered - to be lacking in units. - FIXME: add support for multibyte thousands_sep and decimal_point. */ - - while (ISDIGIT (ch = *p++)) - { - if (max_digit < ch) - max_digit = ch; - - /* Allow to skip only one occurrence of thousands_sep to avoid finding - the unit in the next column in case thousands_sep matches as blank - and is used as column delimiter. */ - ends_with_thousands_sep = (*p == thousands_sep); - if (ends_with_thousands_sep) - ++p; - } - - if (ends_with_thousands_sep) - { - /* thousands_sep not followed by digit is not allowed. */ - *number = p - 2; - return max_digit; - } - - if (ch == decimal_point) - while (ISDIGIT (ch = *p++)) - if (max_digit < ch) - max_digit = ch; - - *number = p - 1; - return max_digit; -} - -/* Return an integer that represents the order of magnitude of the - unit following the number. The number may contain thousands - separators and a decimal point, but it may not contain leading blanks. - Negative numbers get negative orders; zero numbers have a zero order. */ - -static int _GL_ATTRIBUTE_PURE -find_unit_order (char const *number) -{ - bool minus_sign = (*number == '-'); - char const *p = number + minus_sign; - unsigned char max_digit = traverse_raw_number (&p); - if ('0' < max_digit) - { - unsigned char ch = *p; - int order = unit_order[ch]; - return (minus_sign ? -order : order); - } - else - return 0; -} - -/* Compare numbers A and B ending in units with SI or IEC prefixes - <none/unknown> < K/k < M < G < T < P < E < Z < Y */ - -static int -human_numcompare (char const *a, char const *b) -{ - while (blanks[to_uchar (*a)]) - a++; - while (blanks[to_uchar (*b)]) - b++; - - int diff = find_unit_order (a) - find_unit_order (b); - return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep)); -} - -/* Compare strings A and B as numbers without explicitly converting them to - machine numbers. Comparatively slow for short strings, but asymptotically - hideously fast. */ - -static int -numcompare (char const *a, char const *b) -{ - while (blanks[to_uchar (*a)]) - a++; - while (blanks[to_uchar (*b)]) - b++; - - return strnumcmp (a, b, decimal_point, thousands_sep); -} - -/* Work around a problem whereby the long double value returned by glibc's - strtold ("NaN", ...) contains uninitialized bits: clear all bytes of - A and B before calling strtold. FIXME: remove this function once - gnulib guarantees that strtold's result is always well defined. */ -static int -nan_compare (char const *sa, char const *sb) -{ - long_double a; - memset (&a, 0, sizeof a); - a = strtold (sa, NULL); - - long_double b; - memset (&b, 0, sizeof b); - b = strtold (sb, NULL); - - return memcmp (&a, &b, sizeof a); -} - -static int -general_numcompare (char const *sa, char const *sb) -{ - /* FIXME: maybe add option to try expensive FP conversion - only if A and B can't be compared more cheaply/accurately. */ - - char *ea; - char *eb; - long_double a = strtold (sa, &ea); - long_double b = strtold (sb, &eb); - - /* Put conversion errors at the start of the collating sequence. */ - if (sa == ea) - return sb == eb ? 0 : -1; - if (sb == eb) - return 1; - - /* Sort numbers in the usual way, where -0 == +0. Put NaNs after - conversion errors but before numbers; sort them by internal - bit-pattern, for lack of a more portable alternative. */ - return (a < b ? -1 - : a > b ? 1 - : a == b ? 0 - : b == b ? -1 - : a == a ? 1 - : nan_compare (sa, sb)); -} - -/* Return an integer in 1..12 of the month name MONTH. - Return 0 if the name in S is not recognized. */ - -static int -getmonth (char const *month, char **ea) -{ - size_t lo = 0; - size_t hi = MONTHS_PER_YEAR; - - while (blanks[to_uchar (*month)]) - month++; - - do - { - size_t ix = (lo + hi) / 2; - char const *m = month; - char const *n = monthtab[ix].name; - - for (;; m++, n++) - { - if (!*n) - { - if (ea) - *ea = (char *) m; - return monthtab[ix].val; - } - if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n)) - { - hi = ix; - break; - } - else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n)) - { - lo = ix + 1; - break; - } - } - } - while (lo < hi); - - return 0; -} - -/* A randomly chosen MD5 state, used for random comparison. */ -static struct md5_ctx random_md5_state; - /* Initialize the randomly chosen MD5 state. */ static void @@ -2092,159 +1590,6 @@ random_md5_state_init (char const *random_source) md5_process_bytes (buf, sizeof buf, &random_md5_state); } -/* This is like strxfrm, except it reports any error and exits. */ - -static size_t -xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize) -{ - errno = 0; - size_t translated_size = strxfrm (dest, src, destsize); - - if (errno) - { - error (0, errno, _("string transformation failed")); - error (0, 0, _("set LC_ALL='C' to work around the problem")); - die (SORT_FAILURE, 0, - _("the untransformed string was %s"), - quotearg_n_style (0, locale_quoting_style, src)); - } - - return translated_size; -} - -/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB) - using one or more random hash functions. TEXTA[LENA] and - TEXTB[LENB] must be zero. */ - -static int -compare_random (char *restrict texta, size_t lena, - char *restrict textb, size_t lenb) -{ - /* XFRM_DIFF records the equivalent of memcmp on the transformed - data. This is used to break ties if there is a checksum - collision, and this is good enough given the astronomically low - probability of a collision. */ - int xfrm_diff = 0; - - char stackbuf[4000]; - char *buf = stackbuf; - size_t bufsize = sizeof stackbuf; - void *allocated = NULL; - uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)]; - struct md5_ctx s[2]; - s[0] = s[1] = random_md5_state; - - if (hard_LC_COLLATE) - { - char const *lima = texta + lena; - char const *limb = textb + lenb; - - while (true) - { - /* Transform the text into the basis of comparison, so that byte - strings that would otherwise considered to be equal are - considered equal here even if their bytes differ. - - Each time through this loop, transform one - null-terminated string's worth from TEXTA or from TEXTB - or both. That way, there's no need to store the - transformation of the whole line, if it contains many - null-terminated strings. */ - - /* Store the transformed data into a big-enough buffer. */ - - /* A 3X size guess avoids the overhead of calling strxfrm - twice on typical implementations. Don't worry about - size_t overflow, as the guess need not be correct. */ - size_t guess_bufsize = 3 * (lena + lenb) + 2; - if (bufsize < guess_bufsize) - { - bufsize = MAX (guess_bufsize, bufsize * 3 / 2); - free (allocated); - buf = allocated = malloc (bufsize); - if (! buf) - { - buf = stackbuf; - bufsize = sizeof stackbuf; - } - } - - size_t sizea = - (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0); - bool a_fits = sizea <= bufsize; - size_t sizeb = - (textb < limb - ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb, - (a_fits ? bufsize - sizea : 0)) - + 1) - : 0); - - if (! (a_fits && sizea + sizeb <= bufsize)) - { - bufsize = sizea + sizeb; - if (bufsize < SIZE_MAX / 3) - bufsize = bufsize * 3 / 2; - free (allocated); - buf = allocated = xmalloc (bufsize); - if (texta < lima) - strxfrm (buf, texta, sizea); - if (textb < limb) - strxfrm (buf + sizea, textb, sizeb); - } - - /* Advance past NULs to the next part of each input string, - exiting the loop if both strings are exhausted. When - exiting the loop, prepare to finish off the tiebreaker - comparison properly. */ - if (texta < lima) - texta += strlen (texta) + 1; - if (textb < limb) - textb += strlen (textb) + 1; - if (! (texta < lima || textb < limb)) - { - lena = sizea; texta = buf; - lenb = sizeb; textb = buf + sizea; - break; - } - - /* Accumulate the transformed data in the corresponding - checksums. */ - md5_process_bytes (buf, sizea, &s[0]); - md5_process_bytes (buf + sizea, sizeb, &s[1]); - - /* Update the tiebreaker comparison of the transformed data. */ - if (! xfrm_diff) - { - xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb)); - if (! xfrm_diff) - xfrm_diff = (sizea > sizeb) - (sizea < sizeb); - } - } - } - - /* Compute and compare the checksums. */ - md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]); - md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]); - int diff = memcmp (dig[0], dig[1], sizeof dig[0]); - - /* Fall back on the tiebreaker if the checksums collide. */ - if (! diff) - { - if (! xfrm_diff) - { - xfrm_diff = memcmp (texta, textb, MIN (lena, lenb)); - if (! xfrm_diff) - xfrm_diff = (lena > lenb) - (lena < lenb); - } - - diff = xfrm_diff; - } - - free (allocated); - - return diff; -} - /* Return the printable width of the block of memory starting at TEXT and ending just before LIM, counting each tab as one byte. FIXME: Should we generally be counting non printable chars? */ @@ -2279,14 +1624,6 @@ mark_key (size_t offset, size_t width) } } -/* Return true if KEY is a numeric key. */ - -static inline bool -key_numeric (struct keyfield const *key) -{ - return key->numeric || key->general_numeric || key->human_numeric; -} - /* For LINE, output a debugging line that underlines KEY in LINE. If KEY is null, underline the whole line. */ @@ -2510,252 +1847,6 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) error (0, 0, _("option '-r' only applies to last-resort comparison")); } -/* Compare two lines A and B trying every key in sequence until there - are no more keys or a difference is found. */ - -static int -keycompare (struct line const *a, struct line const *b) -{ - struct keyfield *key = keylist; - - /* For the first iteration only, the key positions have been - precomputed for us. */ - char *texta = a->keybeg; - char *textb = b->keybeg; - char *lima = a->keylim; - char *limb = b->keylim; - - int diff; - - while (true) - { - char const *translate = key->translate; - bool const *ignore = key->ignore; - - /* Treat field ends before field starts as empty fields. */ - lima = MAX (texta, lima); - limb = MAX (textb, limb); - - /* Find the lengths. */ - size_t lena = lima - texta; - size_t lenb = limb - textb; - - if (hard_LC_COLLATE || key_numeric (key) - || key->month || key->random || key->version) - { - char *ta; - char *tb; - size_t tlena; - size_t tlenb; - - char enda IF_LINT (= 0); - char endb IF_LINT (= 0); - void *allocated IF_LINT (= NULL); - char stackbuf[4000]; - - if (ignore || translate) - { - /* Compute with copies of the keys, which are the result of - translating or ignoring characters, and which need their - own storage. */ - - size_t i; - - /* Allocate space for copies. */ - size_t size = lena + 1 + lenb + 1; - if (size <= sizeof stackbuf) - ta = stackbuf, allocated = NULL; - else - ta = allocated = xmalloc (size); - tb = ta + lena + 1; - - /* Put into each copy a version of the key in which the - requested characters are ignored or translated. */ - for (tlena = i = 0; i < lena; i++) - if (! (ignore && ignore[to_uchar (texta[i])])) - ta[tlena++] = (translate - ? translate[to_uchar (texta[i])] - : texta[i]); - ta[tlena] = '\0'; - - for (tlenb = i = 0; i < lenb; i++) - if (! (ignore && ignore[to_uchar (textb[i])])) - tb[tlenb++] = (translate - ? translate[to_uchar (textb[i])] - : textb[i]); - tb[tlenb] = '\0'; - } - else - { - /* Use the keys in-place, temporarily null-terminated. */ - ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0'; - tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0'; - } - - if (key->numeric) - diff = numcompare (ta, tb); - else if (key->general_numeric) - diff = general_numcompare (ta, tb); - else if (key->human_numeric) - diff = human_numcompare (ta, tb); - else if (key->month) - diff = getmonth (ta, NULL) - getmonth (tb, NULL); - else if (key->random) - diff = compare_random (ta, tlena, tb, tlenb); - else if (key->version) - diff = filevercmp (ta, tb); - else - { - /* Locale-dependent string sorting. This is slower than - C-locale sorting, which is implemented below. */ - if (tlena == 0) - diff = - NONZERO (tlenb); - else if (tlenb == 0) - diff = 1; - else - diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1); - } - - if (ignore || translate) - free (allocated); - else - { - ta[tlena] = enda; - tb[tlenb] = endb; - } - } - else if (ignore) - { -#define CMP_WITH_IGNORE(A, B) \ - do \ - { \ - while (true) \ - { \ - while (texta < lima && ignore[to_uchar (*texta)]) \ - ++texta; \ - while (textb < limb && ignore[to_uchar (*textb)]) \ - ++textb; \ - if (! (texta < lima && textb < limb)) \ - break; \ - diff = to_uchar (A) - to_uchar (B); \ - if (diff) \ - goto not_equal; \ - ++texta; \ - ++textb; \ - } \ - \ - diff = (texta < lima) - (textb < limb); \ - } \ - while (0) - - if (translate) - CMP_WITH_IGNORE (translate[to_uchar (*texta)], - translate[to_uchar (*textb)]); - else - CMP_WITH_IGNORE (*texta, *textb); - } - else if (lena == 0) - diff = - NONZERO (lenb); - else if (lenb == 0) - goto greater; - else - { - if (translate) - { - while (texta < lima && textb < limb) - { - diff = (to_uchar (translate[to_uchar (*texta++)]) - - to_uchar (translate[to_uchar (*textb++)])); - if (diff) - goto not_equal; - } - } - else - { - diff = memcmp (texta, textb, MIN (lena, lenb)); - if (diff) - goto not_equal; - } - diff = lena < lenb ? -1 : lena != lenb; - } - - if (diff) - goto not_equal; - - key = key->next; - if (! key) - break; - - /* Find the beginning and limit of the next field. */ - if (key->eword != SIZE_MAX) - lima = limfield (a, key), limb = limfield (b, key); - else - lima = a->text + a->length - 1, limb = b->text + b->length - 1; - - if (key->sword != SIZE_MAX) - texta = begfield (a, key), textb = begfield (b, key); - else - { - texta = a->text, textb = b->text; - if (key->skipsblanks) - { - while (texta < lima && blanks[to_uchar (*texta)]) - ++texta; - while (textb < limb && blanks[to_uchar (*textb)]) - ++textb; - } - } - } - - return 0; - - greater: - diff = 1; - not_equal: - return key->reverse ? -diff : diff; -} - -/* Compare two lines A and B, returning negative, zero, or positive - depending on whether A compares less than, equal to, or greater than B. */ - -static int -compare (struct line const *a, struct line const *b) -{ - int diff; - size_t alen, blen; - - /* First try to compare on the specified keys (if any). - The only two cases with no key at all are unadorned sort, - and unadorned sort -r. */ - if (keylist) - { - diff = keycompare (a, b); - if (diff || unique || stable) - return diff; - } - - /* If the keys all compare equal (or no keys were specified) - fall through to the default comparison. */ - alen = a->length - 1, blen = b->length - 1; - - if (alen == 0) - diff = - NONZERO (blen); - else if (blen == 0) - diff = 1; - else if (hard_LC_COLLATE) - { - /* Note xmemcoll0 is a performance enhancement as - it will not unconditionally write '\0' after the - passed in buffers, which was seen to give around - a 3% increase in performance for short lines. */ - diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1); - } - else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen)))) - diff = alen < blen ? -1 : alen != blen; - - return reverse ? -diff : diff; -} - /* Write LINE to output stream FP; the output file's name is OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output otherwise. If debugging is enabled and FP is standard output, @@ -4015,29 +3106,6 @@ sort (char *const *files, size_t nfiles, char const *output_file, /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */ -static void -insertkey (struct keyfield *key_arg) -{ - struct keyfield **p; - struct keyfield *key = xmemdup (key_arg, sizeof *key); - - for (p = &keylist; *p; p = &(*p)->next) - continue; - *p = key; - key->next = NULL; -} - -/* Report a bad field specification SPEC, with extra info MSGID. */ - -static void badfieldspec (char const *, char const *) - ATTRIBUTE_NORETURN; -static void -badfieldspec (char const *spec, char const *msgid) -{ - die (SORT_FAILURE, 0, _("%s: invalid field specification %s"), - _(msgid), quote (spec)); -} - /* Report incompatible options. */ static void incompatible_options (char const *) ATTRIBUTE_NORETURN; @@ -4067,41 +3135,6 @@ check_ordering_compatibility (void) } } -/* Parse the leading integer in STRING and store the resulting value - (which must fit into size_t) into *VAL. Return the address of the - suffix after the integer. If the value is too large, silently - substitute SIZE_MAX. If MSGID is NULL, return NULL after - failure; otherwise, report MSGID and exit on failure. */ - -static char const * -parse_field_count (char const *string, size_t *val, char const *msgid) -{ - char *suffix; - uintmax_t n; - - switch (xstrtoumax (string, &suffix, 10, &n, "")) - { - case LONGINT_OK: - case LONGINT_INVALID_SUFFIX_CHAR: - *val = n; - if (*val == n) - break; - /* Fall through. */ - case LONGINT_OVERFLOW: - case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR: - *val = SIZE_MAX; - break; - - case LONGINT_INVALID: - if (msgid) - die (SORT_FAILURE, 0, _("%s: invalid count at start of %s"), - _(msgid), quote (string)); - return NULL; - } - - return suffix; -} - /* Handle interrupts and hangups. */ static void @@ -4116,75 +3149,6 @@ sighandler (int sig) raise (sig); } -/* Set the ordering options for KEY specified in S. - Return the address of the first character in S that - is not a valid ordering option. - BLANKTYPE is the kind of blanks that 'b' should skip. */ - -static char * -set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype) -{ - while (*s) - { - switch (*s) - { - case 'b': - if (blanktype == bl_start || blanktype == bl_both) - key->skipsblanks = true; - if (blanktype == bl_end || blanktype == bl_both) - key->skipeblanks = true; - break; - case 'd': - key->ignore = nondictionary; - break; - case 'f': - key->translate = fold_toupper; - break; - case 'g': - key->general_numeric = true; - break; - case 'h': - key->human_numeric = true; - break; - case 'i': - /* Option order should not matter, so don't let -i override - -d. -d implies -i, but -i does not imply -d. */ - if (! key->ignore) - key->ignore = nonprinting; - break; - case 'M': - key->month = true; - break; - case 'n': - key->numeric = true; - break; - case 'R': - key->random = true; - break; - case 'r': - key->reverse = true; - break; - case 'V': - key->version = true; - break; - default: - return (char *) s; - } - ++s; - } - return (char *) s; -} - -/* Initialize KEY. */ - -static struct keyfield * -key_init (struct keyfield *key) -{ - memset (key, 0, sizeof *key); - key->eword = SIZE_MAX; - return key; -} - int main (int argc, char **argv) { @@ -4433,54 +3397,7 @@ main (int argc, char **argv) break; case 'k': - key = key_init (&key_buf); - - /* Get POS1. */ - s = parse_field_count (optarg, &key->sword, - N_("invalid number at field start")); - if (! key->sword--) - { - /* Provoke with 'sort -k0' */ - badfieldspec (optarg, N_("field number is zero")); - } - if (*s == '.') - { - s = parse_field_count (s + 1, &key->schar, - N_("invalid number after '.'")); - if (! key->schar--) - { - /* Provoke with 'sort -k1.0' */ - badfieldspec (optarg, N_("character offset is zero")); - } - } - if (! (key->sword || key->schar)) - key->sword = SIZE_MAX; - s = set_ordering (s, key, bl_start); - if (*s != ',') - { - key->eword = SIZE_MAX; - key->echar = 0; - } - else - { - /* Get POS2. */ - s = parse_field_count (s + 1, &key->eword, - N_("invalid number after ','")); - if (! key->eword--) - { - /* Provoke with 'sort -k1,0' */ - badfieldspec (optarg, N_("field number is zero")); - } - if (*s == '.') - { - s = parse_field_count (s + 1, &key->echar, - N_("invalid number after '.'")); - } - s = set_ordering (s, key, bl_end); - } - if (*s) - badfieldspec (optarg, N_("stray character in field spec")); - insertkey (key); + add_key(); break; case 'm': |