diff options
Diffstat (limited to 'src/sort-uniq-keyfuncs.c')
-rw-r--r-- | src/sort-uniq-keyfuncs.c | 1091 |
1 files changed, 1091 insertions, 0 deletions
diff --git a/src/sort-uniq-keyfuncs.c b/src/sort-uniq-keyfuncs.c new file mode 100644 index 000000000..196045bc0 --- /dev/null +++ b/src/sort-uniq-keyfuncs.c @@ -0,0 +1,1091 @@ +#include "filevercmp.h" +#include "md5.h" +#include "strnumcmp.h" +#define UCHAR_LIM (UCHAR_MAX + 1) + +#if HAVE_C99_STRTOLD +# define long_double long double +#else +# define long_double double +# undef strtold +# define strtold strtod +#endif + +/* Exit statuses. */ +enum + { + /* POSIX says to exit with status 1 if invoked with -c and the + input is not properly sorted. */ + SORT_OUT_OF_ORDER = 1, + + /* POSIX says any other irregular exit must exit with a status + code greater than 1. */ + SORT_FAILURE = 2 + }; + +/* The representation of the decimal point in the current locale. */ +static int decimal_point; + +/* Thousands separator; if -1, then there isn't one. */ +static int thousands_sep; + +/* Nonzero if the corresponding locales are hard. */ +static bool hard_LC_COLLATE; + +#define NONZERO(x) ((x) != 0) + +/* The kind of blanks for '-b' to skip in various options. */ +enum blanktype { bl_start, bl_end, bl_both }; + +/* Lines are held in core as counted strings. */ +struct line +{ + char *text; /* Text of the line. */ + size_t length; /* Length including final newline. */ + char *keybeg; /* Start of first key. */ + char *keylim; /* Limit of first key. */ +}; + +/* Sort key. */ +struct keyfield +{ + size_t sword; /* Zero-origin 'word' to start at. */ + size_t schar; /* Additional characters to skip. */ + size_t eword; /* Zero-origin last 'word' of key. */ + size_t echar; /* Additional characters in field. */ + bool const *ignore; /* Boolean array of characters to ignore. */ + char const *translate; /* Translation applied to characters. */ + bool skipsblanks; /* Skip leading blanks when finding start. */ + bool skipeblanks; /* Skip leading blanks when finding end. */ + bool numeric; /* Flag for numeric comparison. Handle + strings of digits with optional decimal + point, but no exponential notation. */ + bool random; /* Sort by random hash of key. */ + bool general_numeric; /* Flag for general, numeric comparison. + Handle numbers in exponential notation. */ + bool human_numeric; /* Flag for sorting by human readable + units with either SI or IEC prefixes. */ + bool month; /* Flag for comparison by month name. */ + bool reverse; /* Reverse the sense of comparison. */ + bool version; /* sort by version number */ + bool traditional_used; /* Traditional key option format is used. */ + struct keyfield *next; /* Next keyfield to try. */ +}; + +struct month +{ + char const *name; + int val; +}; + +/* FIXME: None of these tables work with multibyte character sets. + Also, there are many other bugs when handling multibyte characters. + One way to fix this is to rewrite 'sort' to use wide characters + internally, but doing this with good performance is a bit + tricky. */ + +/* Table of blanks. */ +static bool blanks[UCHAR_LIM]; + +/* Table of non-printing characters. */ +static bool nonprinting[UCHAR_LIM]; + +/* Table of non-dictionary characters (not letters, digits, or blanks). */ +static bool nondictionary[UCHAR_LIM]; + +/* Translation table folding lower case to upper. */ +static char fold_toupper[UCHAR_LIM]; + +#define MONTHS_PER_YEAR 12 + +/* Table mapping month names to integers. + Alphabetic order allows binary search. */ +static struct month monthtab[] = +{ + {"APR", 4}, + {"AUG", 8}, + {"DEC", 12}, + {"FEB", 2}, + {"JAN", 1}, + {"JUL", 7}, + {"JUN", 6}, + {"MAR", 3}, + {"MAY", 5}, + {"NOV", 11}, + {"OCT", 10}, + {"SEP", 9} +}; + +/* Flag to reverse the order of all comparisons. */ +static bool reverse; + +/* Flag for stable sort. This turns off the last ditch bytewise + comparison of lines, and instead leaves lines in the same order + they were read if all keys compare equal. */ +static bool stable; + +/* If TAB has this value, blanks separate fields. */ +enum { TAB_DEFAULT = CHAR_MAX + 1 }; + +/* Tab character separating fields. If TAB_DEFAULT, then fields are + separated by the empty string between a non-blank character and a blank + character. */ +static int tab = TAB_DEFAULT; + +/* Flag to remove consecutive duplicate lines from the output. + Only the last of a sequence of equal lines will be output. */ +static bool unique; + +/* List of key field comparisons to be tried. */ +static struct keyfield *keylist; + +/* Return a pointer to the first character of the field specified + by KEY in LINE. */ + +static char * +begfield (struct line const *line, struct keyfield const *key) +{ + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t sword = key->sword; + size_t schar = key->schar; + + /* The leading field separator itself is included in a field when -t + is absent. */ + + if (tab != TAB_DEFAULT) + while (ptr < lim && sword--) + { + while (ptr < lim && *ptr != tab) + ++ptr; + if (ptr < lim) + ++ptr; + } + else + while (ptr < lim && sword--) + { + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + while (ptr < lim && !blanks[to_uchar (*ptr)]) + ++ptr; + } + + /* If we're ignoring leading blanks when computing the Start + of the field, skip past them here. */ + if (key->skipsblanks) + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + + /* Advance PTR by SCHAR (if possible), but no further than LIM. */ + ptr = MIN (lim, ptr + schar); + + return ptr; +} + +/* Return the limit of (a pointer to the first character after) the field + in LINE specified by KEY. */ + +static char * +limfield (struct line const *line, struct keyfield const *key) +{ + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t eword = key->eword, echar = key->echar; + + if (echar == 0) + eword++; /* Skip all of end field. */ + + /* Move PTR past EWORD fields or to one past the last byte on LINE, + whichever comes first. If there are more than EWORD fields, leave + PTR pointing at the beginning of the field having zero-based index, + EWORD. If a delimiter character was specified (via -t), then that + 'beginning' is the first character following the delimiting TAB. + Otherwise, leave PTR pointing at the first 'blank' character after + the preceding field. */ + if (tab != TAB_DEFAULT) + while (ptr < lim && eword--) + { + while (ptr < lim && *ptr != tab) + ++ptr; + if (ptr < lim && (eword || echar)) + ++ptr; + } + else + while (ptr < lim && eword--) + { + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + while (ptr < lim && !blanks[to_uchar (*ptr)]) + ++ptr; + } + +#ifdef POSIX_UNSPECIFIED + /* The following block of code makes GNU sort incompatible with + standard Unix sort, so it's ifdef'd out for now. + The POSIX spec isn't clear on how to interpret this. + FIXME: request clarification. + + From: kwzh@gnu.ai.mit.edu (Karl Heuer) + Date: Thu, 30 May 96 12:20:41 -0400 + [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.] + + [...]I believe I've found another bug in 'sort'. + + $ cat /tmp/sort.in + a b c 2 d + pq rs 1 t + $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in + a b c 2 d + pq rs 1 t + $ /bin/sort -k1.7,1.7 </tmp/sort.in + pq rs 1 t + a b c 2 d + + Unix sort produced the answer I expected: sort on the single character + in column 7. GNU sort produced different results, because it disagrees + on the interpretation of the key-end spec "M.N". Unix sort reads this + as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean + "skip M-1 fields, then either N-1 characters or the rest of the current + field, whichever comes first". This extra clause applies only to + key-ends, not key-starts. + */ + + /* Make LIM point to the end of (one byte past) the current field. */ + if (tab != TAB_DEFAULT) + { + char *newlim; + newlim = memchr (ptr, tab, lim - ptr); + if (newlim) + lim = newlim; + } + else + { + char *newlim; + newlim = ptr; + while (newlim < lim && blanks[to_uchar (*newlim)]) + ++newlim; + while (newlim < lim && !blanks[to_uchar (*newlim)]) + ++newlim; + lim = newlim; + } +#endif + + if (echar != 0) /* We need to skip over a portion of the end field. */ + { + /* If we're ignoring leading blanks when computing the End + of the field, skip past them here. */ + if (key->skipeblanks) + while (ptr < lim && blanks[to_uchar (*ptr)]) + ++ptr; + + /* Advance PTR by ECHAR (if possible), but no further than LIM. */ + ptr = MIN (lim, ptr + echar); + } + + return ptr; +} + +/* Table that maps characters to order-of-magnitude values. */ +static char const unit_order[UCHAR_LIM] = + { +#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \ + && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107) + /* This initializer syntax works on all C99 hosts. For now, use + it only on non-ASCII hosts, to ease the pain of porting to + pre-C99 ASCII hosts. */ + ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, + ['k']=1, +#else + /* Generate the following table with this command: + perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8); + foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\ + |fmt */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, + 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +#endif + }; + +/* Traverse number given as *number consisting of digits, thousands_sep, and + decimal_point chars only. Returns the highest digit found in the number, + or '\0' if no digit has been found. Upon return *number points at the + character that immediately follows after the given number. */ +static unsigned char +traverse_raw_number (char const **number) +{ + char const *p = *number; + unsigned char ch; + unsigned char max_digit = '\0'; + bool ends_with_thousands_sep = false; + + /* Scan to end of number. + Decimals or separators not followed by digits stop the scan. + Numbers ending in decimals or separators are thus considered + to be lacking in units. + FIXME: add support for multibyte thousands_sep and decimal_point. */ + + while (ISDIGIT (ch = *p++)) + { + if (max_digit < ch) + max_digit = ch; + + /* Allow to skip only one occurrence of thousands_sep to avoid finding + the unit in the next column in case thousands_sep matches as blank + and is used as column delimiter. */ + ends_with_thousands_sep = (*p == thousands_sep); + if (ends_with_thousands_sep) + ++p; + } + + if (ends_with_thousands_sep) + { + /* thousands_sep not followed by digit is not allowed. */ + *number = p - 2; + return max_digit; + } + + if (ch == decimal_point) + while (ISDIGIT (ch = *p++)) + if (max_digit < ch) + max_digit = ch; + + *number = p - 1; + return max_digit; +} + +/* Return an integer that represents the order of magnitude of the + unit following the number. The number may contain thousands + separators and a decimal point, but it may not contain leading blanks. + Negative numbers get negative orders; zero numbers have a zero order. */ + +static int _GL_ATTRIBUTE_PURE +find_unit_order (char const *number) +{ + bool minus_sign = (*number == '-'); + char const *p = number + minus_sign; + unsigned char max_digit = traverse_raw_number (&p); + if ('0' < max_digit) + { + unsigned char ch = *p; + int order = unit_order[ch]; + return (minus_sign ? -order : order); + } + else + return 0; +} + +/* Compare numbers A and B ending in units with SI or IEC prefixes + <none/unknown> < K/k < M < G < T < P < E < Z < Y */ + +static int +human_numcompare (char const *a, char const *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + int diff = find_unit_order (a) - find_unit_order (b); + return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep)); +} + +/* Compare strings A and B as numbers without explicitly converting them to + machine numbers. Comparatively slow for short strings, but asymptotically + hideously fast. */ + +static int +numcompare (char const *a, char const *b) +{ + while (blanks[to_uchar (*a)]) + a++; + while (blanks[to_uchar (*b)]) + b++; + + return strnumcmp (a, b, decimal_point, thousands_sep); +} + +/* Work around a problem whereby the long double value returned by glibc's + strtold ("NaN", ...) contains uninitialized bits: clear all bytes of + A and B before calling strtold. FIXME: remove this function once + gnulib guarantees that strtold's result is always well defined. */ +static int +nan_compare (char const *sa, char const *sb) +{ + long_double a; + memset (&a, 0, sizeof a); + a = strtold (sa, NULL); + + long_double b; + memset (&b, 0, sizeof b); + b = strtold (sb, NULL); + + return memcmp (&a, &b, sizeof a); +} + +static int +general_numcompare (char const *sa, char const *sb) +{ + /* FIXME: maybe add option to try expensive FP conversion + only if A and B can't be compared more cheaply/accurately. */ + + char *ea; + char *eb; + long_double a = strtold (sa, &ea); + long_double b = strtold (sb, &eb); + + /* Put conversion errors at the start of the collating sequence. */ + if (sa == ea) + return sb == eb ? 0 : -1; + if (sb == eb) + return 1; + + /* Sort numbers in the usual way, where -0 == +0. Put NaNs after + conversion errors but before numbers; sort them by internal + bit-pattern, for lack of a more portable alternative. */ + return (a < b ? -1 + : a > b ? 1 + : a == b ? 0 + : b == b ? -1 + : a == a ? 1 + : nan_compare (sa, sb)); +} + +/* Return an integer in 1..12 of the month name MONTH. + Return 0 if the name in S is not recognized. */ + +static int +getmonth (char const *month, char **ea) +{ + size_t lo = 0; + size_t hi = MONTHS_PER_YEAR; + + while (blanks[to_uchar (*month)]) + month++; + + do + { + size_t ix = (lo + hi) / 2; + char const *m = month; + char const *n = monthtab[ix].name; + + for (;; m++, n++) + { + if (!*n) + { + if (ea) + *ea = (char *) m; + return monthtab[ix].val; + } + if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n)) + { + hi = ix; + break; + } + else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n)) + { + lo = ix + 1; + break; + } + } + } + while (lo < hi); + + return 0; +} + +/* A randomly chosen MD5 state, used for random comparison. */ +static struct md5_ctx random_md5_state; + +/* This is like strxfrm, except it reports any error and exits. */ + +static size_t +xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize) +{ + errno = 0; + size_t translated_size = strxfrm (dest, src, destsize); + + if (errno) + { + error (0, errno, _("string transformation failed")); + error (0, 0, _("set LC_ALL='C' to work around the problem")); + die (SORT_FAILURE, 0, + _("the untransformed string was %s"), + quotearg_n_style (0, locale_quoting_style, src)); + } + + return translated_size; +} + +/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB) + using one or more random hash functions. TEXTA[LENA] and + TEXTB[LENB] must be zero. */ + +static int +compare_random (char *restrict texta, size_t lena, + char *restrict textb, size_t lenb) +{ + /* XFRM_DIFF records the equivalent of memcmp on the transformed + data. This is used to break ties if there is a checksum + collision, and this is good enough given the astronomically low + probability of a collision. */ + int xfrm_diff = 0; + + char stackbuf[4000]; + char *buf = stackbuf; + size_t bufsize = sizeof stackbuf; + void *allocated = NULL; + uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)]; + struct md5_ctx s[2]; + s[0] = s[1] = random_md5_state; + + if (hard_LC_COLLATE) + { + char const *lima = texta + lena; + char const *limb = textb + lenb; + + while (true) + { + /* Transform the text into the basis of comparison, so that byte + strings that would otherwise considered to be equal are + considered equal here even if their bytes differ. + + Each time through this loop, transform one + null-terminated string's worth from TEXTA or from TEXTB + or both. That way, there's no need to store the + transformation of the whole line, if it contains many + null-terminated strings. */ + + /* Store the transformed data into a big-enough buffer. */ + + /* A 3X size guess avoids the overhead of calling strxfrm + twice on typical implementations. Don't worry about + size_t overflow, as the guess need not be correct. */ + size_t guess_bufsize = 3 * (lena + lenb) + 2; + if (bufsize < guess_bufsize) + { + bufsize = MAX (guess_bufsize, bufsize * 3 / 2); + free (allocated); + buf = allocated = malloc (bufsize); + if (! buf) + { + buf = stackbuf; + bufsize = sizeof stackbuf; + } + } + + size_t sizea = + (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0); + bool a_fits = sizea <= bufsize; + size_t sizeb = + (textb < limb + ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb, + (a_fits ? bufsize - sizea : 0)) + + 1) + : 0); + + if (! (a_fits && sizea + sizeb <= bufsize)) + { + bufsize = sizea + sizeb; + if (bufsize < SIZE_MAX / 3) + bufsize = bufsize * 3 / 2; + free (allocated); + buf = allocated = xmalloc (bufsize); + if (texta < lima) + strxfrm (buf, texta, sizea); + if (textb < limb) + strxfrm (buf + sizea, textb, sizeb); + } + + /* Advance past NULs to the next part of each input string, + exiting the loop if both strings are exhausted. When + exiting the loop, prepare to finish off the tiebreaker + comparison properly. */ + if (texta < lima) + texta += strlen (texta) + 1; + if (textb < limb) + textb += strlen (textb) + 1; + if (! (texta < lima || textb < limb)) + { + lena = sizea; texta = buf; + lenb = sizeb; textb = buf + sizea; + break; + } + + /* Accumulate the transformed data in the corresponding + checksums. */ + md5_process_bytes (buf, sizea, &s[0]); + md5_process_bytes (buf + sizea, sizeb, &s[1]); + + /* Update the tiebreaker comparison of the transformed data. */ + if (! xfrm_diff) + { + xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb)); + if (! xfrm_diff) + xfrm_diff = (sizea > sizeb) - (sizea < sizeb); + } + } + } + + /* Compute and compare the checksums. */ + md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]); + md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]); + int diff = memcmp (dig[0], dig[1], sizeof dig[0]); + + /* Fall back on the tiebreaker if the checksums collide. */ + if (! diff) + { + if (! xfrm_diff) + { + xfrm_diff = memcmp (texta, textb, MIN (lena, lenb)); + if (! xfrm_diff) + xfrm_diff = (lena > lenb) - (lena < lenb); + } + + diff = xfrm_diff; + } + + free (allocated); + + return diff; +} + +/* Return true if KEY is a numeric key. */ + +static inline bool +key_numeric (struct keyfield const *key) +{ + return key->numeric || key->general_numeric || key->human_numeric; +} + +/* Compare two lines A and B trying every key in sequence until there + are no more keys or a difference is found. */ + +static int +keycompare (struct line const *a, struct line const *b) +{ + struct keyfield *key = keylist; + + /* For the first iteration only, the key positions have been + precomputed for us. */ + char *texta = a->keybeg; + char *textb = b->keybeg; + char *lima = a->keylim; + char *limb = b->keylim; + + int diff; + + while (true) + { + char const *translate = key->translate; + bool const *ignore = key->ignore; + + /* Treat field ends before field starts as empty fields. */ + lima = MAX (texta, lima); + limb = MAX (textb, limb); + + /* Find the lengths. */ + size_t lena = lima - texta; + size_t lenb = limb - textb; + + if (hard_LC_COLLATE || key_numeric (key) + || key->month || key->random || key->version) + { + char *ta; + char *tb; + size_t tlena; + size_t tlenb; + + char enda IF_LINT (= 0); + char endb IF_LINT (= 0); + void *allocated IF_LINT (= NULL); + char stackbuf[4000]; + + if (ignore || translate) + { + /* Compute with copies of the keys, which are the result of + translating or ignoring characters, and which need their + own storage. */ + + size_t i; + + /* Allocate space for copies. */ + size_t size = lena + 1 + lenb + 1; + if (size <= sizeof stackbuf) + ta = stackbuf, allocated = NULL; + else + ta = allocated = xmalloc (size); + tb = ta + lena + 1; + + /* Put into each copy a version of the key in which the + requested characters are ignored or translated. */ + for (tlena = i = 0; i < lena; i++) + if (! (ignore && ignore[to_uchar (texta[i])])) + ta[tlena++] = (translate + ? translate[to_uchar (texta[i])] + : texta[i]); + ta[tlena] = '\0'; + + for (tlenb = i = 0; i < lenb; i++) + if (! (ignore && ignore[to_uchar (textb[i])])) + tb[tlenb++] = (translate + ? translate[to_uchar (textb[i])] + : textb[i]); + tb[tlenb] = '\0'; + } + else + { + /* Use the keys in-place, temporarily null-terminated. */ + ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0'; + tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0'; + } + + if (key->numeric) + diff = numcompare (ta, tb); + else if (key->general_numeric) + diff = general_numcompare (ta, tb); + else if (key->human_numeric) + diff = human_numcompare (ta, tb); + else if (key->month) + diff = getmonth (ta, NULL) - getmonth (tb, NULL); + else if (key->random) + diff = compare_random (ta, tlena, tb, tlenb); + else if (key->version) + diff = filevercmp (ta, tb); + else + { + /* Locale-dependent string sorting. This is slower than + C-locale sorting, which is implemented below. */ + if (tlena == 0) + diff = - NONZERO (tlenb); + else if (tlenb == 0) + diff = 1; + else + diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1); + } + + if (ignore || translate) + free (allocated); + else + { + ta[tlena] = enda; + tb[tlenb] = endb; + } + } + else if (ignore) + { +#define CMP_WITH_IGNORE(A, B) \ + do \ + { \ + while (true) \ + { \ + while (texta < lima && ignore[to_uchar (*texta)]) \ + ++texta; \ + while (textb < limb && ignore[to_uchar (*textb)]) \ + ++textb; \ + if (! (texta < lima && textb < limb)) \ + break; \ + diff = to_uchar (A) - to_uchar (B); \ + if (diff) \ + goto not_equal; \ + ++texta; \ + ++textb; \ + } \ + \ + diff = (texta < lima) - (textb < limb); \ + } \ + while (0) + + if (translate) + CMP_WITH_IGNORE (translate[to_uchar (*texta)], + translate[to_uchar (*textb)]); + else + CMP_WITH_IGNORE (*texta, *textb); + } + else if (lena == 0) + diff = - NONZERO (lenb); + else if (lenb == 0) + goto greater; + else + { + if (translate) + { + while (texta < lima && textb < limb) + { + diff = (to_uchar (translate[to_uchar (*texta++)]) + - to_uchar (translate[to_uchar (*textb++)])); + if (diff) + goto not_equal; + } + } + else + { + diff = memcmp (texta, textb, MIN (lena, lenb)); + if (diff) + goto not_equal; + } + diff = lena < lenb ? -1 : lena != lenb; + } + + if (diff) + goto not_equal; + + key = key->next; + if (! key) + break; + + /* Find the beginning and limit of the next field. */ + if (key->eword != SIZE_MAX) + lima = limfield (a, key), limb = limfield (b, key); + else + lima = a->text + a->length - 1, limb = b->text + b->length - 1; + + if (key->sword != SIZE_MAX) + texta = begfield (a, key), textb = begfield (b, key); + else + { + texta = a->text, textb = b->text; + if (key->skipsblanks) + { + while (texta < lima && blanks[to_uchar (*texta)]) + ++texta; + while (textb < limb && blanks[to_uchar (*textb)]) + ++textb; + } + } + } + + return 0; + + greater: + diff = 1; + not_equal: + return key->reverse ? -diff : diff; +} + +/* Compare two lines A and B, returning negative, zero, or positive + depending on whether A compares less than, equal to, or greater than B. */ + +static int +compare (struct line const *a, struct line const *b) +{ + int diff; + size_t alen, blen; + + /* First try to compare on the specified keys (if any). + The only two cases with no key at all are unadorned sort, + and unadorned sort -r. */ + if (keylist) + { + diff = keycompare (a, b); + if (diff || unique || stable) + return diff; + } + + /* If the keys all compare equal (or no keys were specified) + fall through to the default comparison. */ + alen = a->length - 1, blen = b->length - 1; + + if (alen == 0) + diff = - NONZERO (blen); + else if (blen == 0) + diff = 1; + else if (hard_LC_COLLATE) + { + /* Note xmemcoll0 is a performance enhancement as + it will not unconditionally write '\0' after the + passed in buffers, which was seen to give around + a 3% increase in performance for short lines. */ + diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1); + } + else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen)))) + diff = alen < blen ? -1 : alen != blen; + + return reverse ? -diff : diff; +} + +static void +insertkey (struct keyfield *key_arg) +{ + struct keyfield **p; + struct keyfield *key = xmemdup (key_arg, sizeof *key); + + for (p = &keylist; *p; p = &(*p)->next) + continue; + *p = key; + key->next = NULL; +} + +/* Report a bad field specification SPEC, with extra info MSGID. */ + +static void badfieldspec (char const *, char const *) + ATTRIBUTE_NORETURN; +static void +badfieldspec (char const *spec, char const *msgid) +{ + die (SORT_FAILURE, 0, _("%s: invalid field specification %s"), + _(msgid), quote (spec)); +} + +/* Parse the leading integer in STRING and store the resulting value + (which must fit into size_t) into *VAL. Return the address of the + suffix after the integer. If the value is too large, silently + substitute SIZE_MAX. If MSGID is NULL, return NULL after + failure; otherwise, report MSGID and exit on failure. */ + +static char const * +parse_field_count (char const *string, size_t *val, char const *msgid) +{ + char *suffix; + uintmax_t n; + + switch (xstrtoumax (string, &suffix, 10, &n, "")) + { + case LONGINT_OK: + case LONGINT_INVALID_SUFFIX_CHAR: + *val = n; + if (*val == n) + break; + /* Fall through. */ + case LONGINT_OVERFLOW: + case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR: + *val = SIZE_MAX; + break; + + case LONGINT_INVALID: + if (msgid) + die (SORT_FAILURE, 0, _("%s: invalid count at start of %s"), + _(msgid), quote (string)); + return NULL; + } + + return suffix; +} + +/* Set the ordering options for KEY specified in S. + Return the address of the first character in S that + is not a valid ordering option. + BLANKTYPE is the kind of blanks that 'b' should skip. */ + +static char * +set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype) +{ + while (*s) + { + switch (*s) + { + case 'b': + if (blanktype == bl_start || blanktype == bl_both) + key->skipsblanks = true; + if (blanktype == bl_end || blanktype == bl_both) + key->skipeblanks = true; + break; + case 'd': + key->ignore = nondictionary; + break; + case 'f': + key->translate = fold_toupper; + break; + case 'g': + key->general_numeric = true; + break; + case 'h': + key->human_numeric = true; + break; + case 'i': + /* Option order should not matter, so don't let -i override + -d. -d implies -i, but -i does not imply -d. */ + if (! key->ignore) + key->ignore = nonprinting; + break; + case 'M': + key->month = true; + break; + case 'n': + key->numeric = true; + break; + case 'R': + key->random = true; + break; + case 'r': + key->reverse = true; + break; + case 'V': + key->version = true; + break; + default: + return (char *) s; + } + ++s; + } + return (char *) s; +} + +/* Initialize KEY. */ + +static struct keyfield * +key_init (struct keyfield *key) +{ + memset (key, 0, sizeof *key); + key->eword = SIZE_MAX; + return key; +} + +static void add_key(void) +{ + struct keyfield *key; + struct keyfield key_buf; char const *s; + key = key_init (&key_buf); + + /* Get POS1. */ + s = parse_field_count (optarg, &key->sword, + N_("invalid number at field start")); + if (! key->sword--) + { + /* Provoke with 'sort -k0' */ + badfieldspec (optarg, N_("field number is zero")); + } + if (*s == '.') + { + s = parse_field_count (s + 1, &key->schar, + N_("invalid number after '.'")); + if (! key->schar--) + { + /* Provoke with 'sort -k1.0' */ + badfieldspec (optarg, N_("character offset is zero")); + } + } + if (! (key->sword || key->schar)) + key->sword = SIZE_MAX; + s = set_ordering (s, key, bl_start); + if (*s != ',') + { + key->eword = SIZE_MAX; + key->echar = 0; + } + else + { + /* Get POS2. */ + s = parse_field_count (s + 1, &key->eword, + N_("invalid number after ','")); + if (! key->eword--) + { + /* Provoke with 'sort -k1,0' */ + badfieldspec (optarg, N_("field number is zero")); + } + if (*s == '.') + { + s = parse_field_count (s + 1, &key->echar, + N_("invalid number after '.'")); + } + s = set_ordering (s, key, bl_end); + } + if (*s) + badfieldspec (optarg, N_("stray character in field spec")); + insertkey (key); +} |