#include "filevercmp.h" #include "md5.h" #include "strnumcmp.h" #define UCHAR_LIM (UCHAR_MAX + 1) #if HAVE_C99_STRTOLD # define long_double long double #else # define long_double double # undef strtold # define strtold strtod #endif /* Exit statuses. */ enum { /* POSIX says to exit with status 1 if invoked with -c and the input is not properly sorted. */ SORT_OUT_OF_ORDER = 1, /* POSIX says any other irregular exit must exit with a status code greater than 1. */ SORT_FAILURE = 2 }; /* The representation of the decimal point in the current locale. */ static int decimal_point; /* Thousands separator; if -1, then there isn't one. */ static int thousands_sep; /* Nonzero if the corresponding locales are hard. */ static bool hard_LC_COLLATE; #define NONZERO(x) ((x) != 0) /* The kind of blanks for '-b' to skip in various options. */ enum blanktype { bl_start, bl_end, bl_both }; /* The character marking end of line. Default to \n. */ static char eolchar = '\n'; /* Lines are held in core as counted strings. */ struct line { char *text; /* Text of the line. */ size_t length; /* Length including final newline. */ char *keybeg; /* Start of first key. */ char *keylim; /* Limit of first key. */ }; /* Input buffers. */ struct buffer { char *buf; /* Dynamically allocated buffer, partitioned into 3 regions: - input data; - unused area; - an array of lines, in reverse order. */ size_t used; /* Number of bytes used for input data. */ size_t nlines; /* Number of lines in the line array. */ size_t alloc; /* Number of bytes allocated. */ size_t left; /* Number of bytes left from previous reads. */ size_t line_bytes; /* Number of bytes to reserve for each line. */ bool eof; /* An EOF has been read. */ }; /* Sort key. */ struct keyfield { size_t sword; /* Zero-origin 'word' to start at. */ size_t schar; /* Additional characters to skip. */ size_t eword; /* Zero-origin last 'word' of key. */ size_t echar; /* Additional characters in field. */ bool const *ignore; /* Boolean array of characters to ignore. */ char const *translate; /* Translation applied to characters. */ bool skipsblanks; /* Skip leading blanks when finding start. */ bool skipeblanks; /* Skip leading blanks when finding end. */ bool numeric; /* Flag for numeric comparison. Handle strings of digits with optional decimal point, but no exponential notation. */ bool random; /* Sort by random hash of key. */ bool general_numeric; /* Flag for general, numeric comparison. Handle numbers in exponential notation. */ bool human_numeric; /* Flag for sorting by human readable units with either SI or IEC prefixes. */ bool month; /* Flag for comparison by month name. */ bool reverse; /* Reverse the sense of comparison. */ bool version; /* sort by version number */ bool traditional_used; /* Traditional key option format is used. */ struct keyfield *next; /* Next keyfield to try. */ }; struct month { char const *name; int val; }; /* FIXME: None of these tables work with multibyte character sets. Also, there are many other bugs when handling multibyte characters. One way to fix this is to rewrite 'sort' to use wide characters internally, but doing this with good performance is a bit tricky. */ /* Table of blanks. */ static bool blanks[UCHAR_LIM]; /* Table of non-printing characters. */ static bool nonprinting[UCHAR_LIM]; /* Table of non-dictionary characters (not letters, digits, or blanks). */ static bool nondictionary[UCHAR_LIM]; /* Translation table folding lower case to upper. */ static char fold_toupper[UCHAR_LIM]; #define MONTHS_PER_YEAR 12 /* Table mapping month names to integers. Alphabetic order allows binary search. */ static struct month monthtab[] = { {"APR", 4}, {"AUG", 8}, {"DEC", 12}, {"FEB", 2}, {"JAN", 1}, {"JUL", 7}, {"JUN", 6}, {"MAR", 3}, {"MAY", 5}, {"NOV", 11}, {"OCT", 10}, {"SEP", 9} }; /* Minimum size for a merge or check buffer. */ #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line)) /* The number of bytes needed for a merge or check buffer, which can function relatively efficiently even if it holds only one line. If a longer line is seen, this value is increased. */ static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024); /* Flag to reverse the order of all comparisons. */ static bool reverse; /* Flag for stable sort. This turns off the last ditch bytewise comparison of lines, and instead leaves lines in the same order they were read if all keys compare equal. */ static bool stable; /* If TAB has this value, blanks separate fields. */ enum { TAB_DEFAULT = CHAR_MAX + 1 }; /* Tab character separating fields. If TAB_DEFAULT, then fields are separated by the empty string between a non-blank character and a blank character. */ static int tab = TAB_DEFAULT; /* Flag to remove consecutive duplicate lines from the output. Only the last of a sequence of equal lines will be output. */ static bool unique; /* List of key field comparisons to be tried. */ static struct keyfield *keylist; static void sort_die (char const *, char const *) ATTRIBUTE_NORETURN; static void sort_die (char const *message, char const *file) { die (SORT_FAILURE, errno, "%s: %s", message, quotef (file ? file : _("standard output"))); } /* Return one past the limit of the line array. */ static inline struct line * buffer_linelim (struct buffer const *buf) { void *linelim = buf->buf + buf->alloc; return linelim; } /* Return a pointer to the first character of the field specified by KEY in LINE. */ static char * begfield (struct line const *line, struct keyfield const *key) { char *ptr = line->text, *lim = ptr + line->length - 1; size_t sword = key->sword; size_t schar = key->schar; /* The leading field separator itself is included in a field when -t is absent. */ if (tab != TAB_DEFAULT) while (ptr < lim && sword--) { while (ptr < lim && *ptr != tab) ++ptr; if (ptr < lim) ++ptr; } else while (ptr < lim && sword--) { while (ptr < lim && blanks[to_uchar (*ptr)]) ++ptr; while (ptr < lim && !blanks[to_uchar (*ptr)]) ++ptr; } /* If we're ignoring leading blanks when computing the Start of the field, skip past them here. */ if (key->skipsblanks) while (ptr < lim && blanks[to_uchar (*ptr)]) ++ptr; /* Advance PTR by SCHAR (if possible), but no further than LIM. */ ptr = MIN (lim, ptr + schar); return ptr; } /* Return the limit of (a pointer to the first character after) the field in LINE specified by KEY. */ static char * limfield (struct line const *line, struct keyfield const *key) { char *ptr = line->text, *lim = ptr + line->length - 1; size_t eword = key->eword, echar = key->echar; if (echar == 0) eword++; /* Skip all of end field. */ /* Move PTR past EWORD fields or to one past the last byte on LINE, whichever comes first. If there are more than EWORD fields, leave PTR pointing at the beginning of the field having zero-based index, EWORD. If a delimiter character was specified (via -t), then that 'beginning' is the first character following the delimiting TAB. Otherwise, leave PTR pointing at the first 'blank' character after the preceding field. */ if (tab != TAB_DEFAULT) while (ptr < lim && eword--) { while (ptr < lim && *ptr != tab) ++ptr; if (ptr < lim && (eword || echar)) ++ptr; } else while (ptr < lim && eword--) { while (ptr < lim && blanks[to_uchar (*ptr)]) ++ptr; while (ptr < lim && !blanks[to_uchar (*ptr)]) ++ptr; } #ifdef POSIX_UNSPECIFIED /* The following block of code makes GNU sort incompatible with standard Unix sort, so it's ifdef'd out for now. The POSIX spec isn't clear on how to interpret this. FIXME: request clarification. From: kwzh@gnu.ai.mit.edu (Karl Heuer) Date: Thu, 30 May 96 12:20:41 -0400 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.] [...]I believe I've found another bug in 'sort'. $ cat /tmp/sort.in a b c 2 d pq rs 1 t $ textutils-1.15/src/sort -k1.7,1.7 skipeblanks) while (ptr < lim && blanks[to_uchar (*ptr)]) ++ptr; /* Advance PTR by ECHAR (if possible), but no further than LIM. */ ptr = MIN (lim, ptr + echar); } return ptr; } /* Fill BUF reading from FP, moving buf->left bytes from the end of buf->buf to the beginning first. If EOF is reached and the file wasn't terminated by a newline, supply one. Set up BUF's line table too. FILE is the name of the file corresponding to FP. Return true if some input was read. */ static bool fillbuf (struct buffer *buf, FILE *fp, char const *file) { struct keyfield const *key = keylist; char eol = eolchar; size_t line_bytes = buf->line_bytes; size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE; if (buf->eof) return false; if (buf->used != buf->left) { memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left); buf->used = buf->left; buf->nlines = 0; } while (true) { char *ptr = buf->buf + buf->used; struct line *linelim = buffer_linelim (buf); struct line *line = linelim - buf->nlines; size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr; char *line_start = buf->nlines ? line->text + line->length : buf->buf; while (line_bytes + 1 < avail) { /* Read as many bytes as possible, but do not read so many bytes that there might not be enough room for the corresponding line array. The worst case is when the rest of the input file consists entirely of newlines, except that the last byte is not a newline. */ size_t readsize = (avail - 1) / (line_bytes + 1); size_t bytes_read = fread (ptr, 1, readsize, fp); char *ptrlim = ptr + bytes_read; char *p; avail -= bytes_read; if (bytes_read != readsize) { if (ferror (fp)) sort_die (_("read failed"), file); if (feof (fp)) { buf->eof = true; if (buf->buf == ptrlim) return false; if (line_start != ptrlim && ptrlim[-1] != eol) *ptrlim++ = eol; } } /* Find and record each line in the just-read input. */ while ((p = memchr (ptr, eol, ptrlim - ptr))) { /* Delimit the line with NUL. This eliminates the need to temporarily replace the last byte with NUL when calling xmemcoll(), which increases performance. */ *p = '\0'; ptr = p + 1; line--; line->text = line_start; line->length = ptr - line_start; mergesize = MAX (mergesize, line->length); avail -= line_bytes; if (key) { /* Precompute the position of the first key for efficiency. */ line->keylim = (key->eword == SIZE_MAX ? p : limfield (line, key)); if (key->sword != SIZE_MAX) line->keybeg = begfield (line, key); else { if (key->skipsblanks) while (blanks[to_uchar (*line_start)]) line_start++; line->keybeg = line_start; } } line_start = ptr; } ptr = ptrlim; if (buf->eof) break; } buf->used = ptr - buf->buf; buf->nlines = buffer_linelim (buf) - line; if (buf->nlines != 0) { buf->left = ptr - line_start; merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE; return true; } { /* The current input line is too long to fit in the buffer. Increase the buffer size and try again, keeping it properly aligned. */ size_t line_alloc = buf->alloc / sizeof (struct line); buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line)); buf->alloc = line_alloc * sizeof (struct line); } } } /* Table that maps characters to order-of-magnitude values. */ static char const unit_order[UCHAR_LIM] = { #if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \ && 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'k' == 107) /* This initializer syntax works on all C99 hosts. For now, use it only on non-ASCII hosts, to ease the pain of porting to pre-C99 ASCII hosts. */ ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8, ['k']=1, #else /* Generate the following table with this command: perl -e 'my %a=(k=>1, K=>1, M=>2, G=>3, T=>4, P=>5, E=>6, Z=>7, Y=>8); foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\ |fmt */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0, 0, 0, 1, 0, 2, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 #endif }; /* Traverse number given as *number consisting of digits, thousands_sep, and decimal_point chars only. Returns the highest digit found in the number, or '\0' if no digit has been found. Upon return *number points at the character that immediately follows after the given number. */ static unsigned char traverse_raw_number (char const **number) { char const *p = *number; unsigned char ch; unsigned char max_digit = '\0'; bool ends_with_thousands_sep = false; /* Scan to end of number. Decimals or separators not followed by digits stop the scan. Numbers ending in decimals or separators are thus considered to be lacking in units. FIXME: add support for multibyte thousands_sep and decimal_point. */ while (ISDIGIT (ch = *p++)) { if (max_digit < ch) max_digit = ch; /* Allow to skip only one occurrence of thousands_sep to avoid finding the unit in the next column in case thousands_sep matches as blank and is used as column delimiter. */ ends_with_thousands_sep = (*p == thousands_sep); if (ends_with_thousands_sep) ++p; } if (ends_with_thousands_sep) { /* thousands_sep not followed by digit is not allowed. */ *number = p - 2; return max_digit; } if (ch == decimal_point) while (ISDIGIT (ch = *p++)) if (max_digit < ch) max_digit = ch; *number = p - 1; return max_digit; } /* Return an integer that represents the order of magnitude of the unit following the number. The number may contain thousands separators and a decimal point, but it may not contain leading blanks. Negative numbers get negative orders; zero numbers have a zero order. */ static int _GL_ATTRIBUTE_PURE find_unit_order (char const *number) { bool minus_sign = (*number == '-'); char const *p = number + minus_sign; unsigned char max_digit = traverse_raw_number (&p); if ('0' < max_digit) { unsigned char ch = *p; int order = unit_order[ch]; return (minus_sign ? -order : order); } else return 0; } /* Compare numbers A and B ending in units with SI or IEC prefixes < K/k < M < G < T < P < E < Z < Y */ static int human_numcompare (char const *a, char const *b) { while (blanks[to_uchar (*a)]) a++; while (blanks[to_uchar (*b)]) b++; int diff = find_unit_order (a) - find_unit_order (b); return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep)); } /* Compare strings A and B as numbers without explicitly converting them to machine numbers. Comparatively slow for short strings, but asymptotically hideously fast. */ static int numcompare (char const *a, char const *b) { while (blanks[to_uchar (*a)]) a++; while (blanks[to_uchar (*b)]) b++; return strnumcmp (a, b, decimal_point, thousands_sep); } /* Work around a problem whereby the long double value returned by glibc's strtold ("NaN", ...) contains uninitialized bits: clear all bytes of A and B before calling strtold. FIXME: remove this function once gnulib guarantees that strtold's result is always well defined. */ static int nan_compare (char const *sa, char const *sb) { long_double a; memset (&a, 0, sizeof a); a = strtold (sa, NULL); long_double b; memset (&b, 0, sizeof b); b = strtold (sb, NULL); return memcmp (&a, &b, sizeof a); } static int general_numcompare (char const *sa, char const *sb) { /* FIXME: maybe add option to try expensive FP conversion only if A and B can't be compared more cheaply/accurately. */ char *ea; char *eb; long_double a = strtold (sa, &ea); long_double b = strtold (sb, &eb); /* Put conversion errors at the start of the collating sequence. */ if (sa == ea) return sb == eb ? 0 : -1; if (sb == eb) return 1; /* Sort numbers in the usual way, where -0 == +0. Put NaNs after conversion errors but before numbers; sort them by internal bit-pattern, for lack of a more portable alternative. */ return (a < b ? -1 : a > b ? 1 : a == b ? 0 : b == b ? -1 : a == a ? 1 : nan_compare (sa, sb)); } /* Return an integer in 1..12 of the month name MONTH. Return 0 if the name in S is not recognized. */ static int getmonth (char const *month, char **ea) { size_t lo = 0; size_t hi = MONTHS_PER_YEAR; while (blanks[to_uchar (*month)]) month++; do { size_t ix = (lo + hi) / 2; char const *m = month; char const *n = monthtab[ix].name; for (;; m++, n++) { if (!*n) { if (ea) *ea = (char *) m; return monthtab[ix].val; } if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n)) { hi = ix; break; } else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n)) { lo = ix + 1; break; } } } while (lo < hi); return 0; } /* A randomly chosen MD5 state, used for random comparison. */ static struct md5_ctx random_md5_state; /* This is like strxfrm, except it reports any error and exits. */ static size_t xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize) { errno = 0; size_t translated_size = strxfrm (dest, src, destsize); if (errno) { error (0, errno, _("string transformation failed")); error (0, 0, _("set LC_ALL='C' to work around the problem")); die (SORT_FAILURE, 0, _("the untransformed string was %s"), quotearg_n_style (0, locale_quoting_style, src)); } return translated_size; } /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB) using one or more random hash functions. TEXTA[LENA] and TEXTB[LENB] must be zero. */ static int compare_random (char *restrict texta, size_t lena, char *restrict textb, size_t lenb) { /* XFRM_DIFF records the equivalent of memcmp on the transformed data. This is used to break ties if there is a checksum collision, and this is good enough given the astronomically low probability of a collision. */ int xfrm_diff = 0; char stackbuf[4000]; char *buf = stackbuf; size_t bufsize = sizeof stackbuf; void *allocated = NULL; uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)]; struct md5_ctx s[2]; s[0] = s[1] = random_md5_state; if (hard_LC_COLLATE) { char const *lima = texta + lena; char const *limb = textb + lenb; while (true) { /* Transform the text into the basis of comparison, so that byte strings that would otherwise considered to be equal are considered equal here even if their bytes differ. Each time through this loop, transform one null-terminated string's worth from TEXTA or from TEXTB or both. That way, there's no need to store the transformation of the whole line, if it contains many null-terminated strings. */ /* Store the transformed data into a big-enough buffer. */ /* A 3X size guess avoids the overhead of calling strxfrm twice on typical implementations. Don't worry about size_t overflow, as the guess need not be correct. */ size_t guess_bufsize = 3 * (lena + lenb) + 2; if (bufsize < guess_bufsize) { bufsize = MAX (guess_bufsize, bufsize * 3 / 2); free (allocated); buf = allocated = malloc (bufsize); if (! buf) { buf = stackbuf; bufsize = sizeof stackbuf; } } size_t sizea = (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0); bool a_fits = sizea <= bufsize; size_t sizeb = (textb < limb ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb, (a_fits ? bufsize - sizea : 0)) + 1) : 0); if (! (a_fits && sizea + sizeb <= bufsize)) { bufsize = sizea + sizeb; if (bufsize < SIZE_MAX / 3) bufsize = bufsize * 3 / 2; free (allocated); buf = allocated = xmalloc (bufsize); if (texta < lima) strxfrm (buf, texta, sizea); if (textb < limb) strxfrm (buf + sizea, textb, sizeb); } /* Advance past NULs to the next part of each input string, exiting the loop if both strings are exhausted. When exiting the loop, prepare to finish off the tiebreaker comparison properly. */ if (texta < lima) texta += strlen (texta) + 1; if (textb < limb) textb += strlen (textb) + 1; if (! (texta < lima || textb < limb)) { lena = sizea; texta = buf; lenb = sizeb; textb = buf + sizea; break; } /* Accumulate the transformed data in the corresponding checksums. */ md5_process_bytes (buf, sizea, &s[0]); md5_process_bytes (buf + sizea, sizeb, &s[1]); /* Update the tiebreaker comparison of the transformed data. */ if (! xfrm_diff) { xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb)); if (! xfrm_diff) xfrm_diff = (sizea > sizeb) - (sizea < sizeb); } } } /* Compute and compare the checksums. */ md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]); md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]); int diff = memcmp (dig[0], dig[1], sizeof dig[0]); /* Fall back on the tiebreaker if the checksums collide. */ if (! diff) { if (! xfrm_diff) { xfrm_diff = memcmp (texta, textb, MIN (lena, lenb)); if (! xfrm_diff) xfrm_diff = (lena > lenb) - (lena < lenb); } diff = xfrm_diff; } free (allocated); return diff; } /* Return true if KEY is a numeric key. */ static inline bool key_numeric (struct keyfield const *key) { return key->numeric || key->general_numeric || key->human_numeric; } /* Compare two lines A and B trying every key in sequence until there are no more keys or a difference is found. */ static int keycompare (struct line const *a, struct line const *b) { struct keyfield *key = keylist; /* For the first iteration only, the key positions have been precomputed for us. */ char *texta = a->keybeg; char *textb = b->keybeg; char *lima = a->keylim; char *limb = b->keylim; int diff; while (true) { char const *translate = key->translate; bool const *ignore = key->ignore; /* Treat field ends before field starts as empty fields. */ lima = MAX (texta, lima); limb = MAX (textb, limb); /* Find the lengths. */ size_t lena = lima - texta; size_t lenb = limb - textb; if (hard_LC_COLLATE || key_numeric (key) || key->month || key->random || key->version) { char *ta; char *tb; size_t tlena; size_t tlenb; char enda IF_LINT (= 0); char endb IF_LINT (= 0); void *allocated IF_LINT (= NULL); char stackbuf[4000]; if (ignore || translate) { /* Compute with copies of the keys, which are the result of translating or ignoring characters, and which need their own storage. */ size_t i; /* Allocate space for copies. */ size_t size = lena + 1 + lenb + 1; if (size <= sizeof stackbuf) ta = stackbuf, allocated = NULL; else ta = allocated = xmalloc (size); tb = ta + lena + 1; /* Put into each copy a version of the key in which the requested characters are ignored or translated. */ for (tlena = i = 0; i < lena; i++) if (! (ignore && ignore[to_uchar (texta[i])])) ta[tlena++] = (translate ? translate[to_uchar (texta[i])] : texta[i]); ta[tlena] = '\0'; for (tlenb = i = 0; i < lenb; i++) if (! (ignore && ignore[to_uchar (textb[i])])) tb[tlenb++] = (translate ? translate[to_uchar (textb[i])] : textb[i]); tb[tlenb] = '\0'; } else { /* Use the keys in-place, temporarily null-terminated. */ ta = texta; tlena = lena; enda = ta[tlena]; ta[tlena] = '\0'; tb = textb; tlenb = lenb; endb = tb[tlenb]; tb[tlenb] = '\0'; } if (key->numeric) diff = numcompare (ta, tb); else if (key->general_numeric) diff = general_numcompare (ta, tb); else if (key->human_numeric) diff = human_numcompare (ta, tb); else if (key->month) diff = getmonth (ta, NULL) - getmonth (tb, NULL); else if (key->random) diff = compare_random (ta, tlena, tb, tlenb); else if (key->version) diff = filevercmp (ta, tb); else { /* Locale-dependent string sorting. This is slower than C-locale sorting, which is implemented below. */ if (tlena == 0) diff = - NONZERO (tlenb); else if (tlenb == 0) diff = 1; else diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1); } if (ignore || translate) free (allocated); else { ta[tlena] = enda; tb[tlenb] = endb; } } else if (ignore) { #define CMP_WITH_IGNORE(A, B) \ do \ { \ while (true) \ { \ while (texta < lima && ignore[to_uchar (*texta)]) \ ++texta; \ while (textb < limb && ignore[to_uchar (*textb)]) \ ++textb; \ if (! (texta < lima && textb < limb)) \ break; \ diff = to_uchar (A) - to_uchar (B); \ if (diff) \ goto not_equal; \ ++texta; \ ++textb; \ } \ \ diff = (texta < lima) - (textb < limb); \ } \ while (0) if (translate) CMP_WITH_IGNORE (translate[to_uchar (*texta)], translate[to_uchar (*textb)]); else CMP_WITH_IGNORE (*texta, *textb); } else if (lena == 0) diff = - NONZERO (lenb); else if (lenb == 0) goto greater; else { if (translate) { while (texta < lima && textb < limb) { diff = (to_uchar (translate[to_uchar (*texta++)]) - to_uchar (translate[to_uchar (*textb++)])); if (diff) goto not_equal; } } else { diff = memcmp (texta, textb, MIN (lena, lenb)); if (diff) goto not_equal; } diff = lena < lenb ? -1 : lena != lenb; } if (diff) goto not_equal; key = key->next; if (! key) break; /* Find the beginning and limit of the next field. */ if (key->eword != SIZE_MAX) lima = limfield (a, key), limb = limfield (b, key); else lima = a->text + a->length - 1, limb = b->text + b->length - 1; if (key->sword != SIZE_MAX) texta = begfield (a, key), textb = begfield (b, key); else { texta = a->text, textb = b->text; if (key->skipsblanks) { while (texta < lima && blanks[to_uchar (*texta)]) ++texta; while (textb < limb && blanks[to_uchar (*textb)]) ++textb; } } } return 0; greater: diff = 1; not_equal: return key->reverse ? -diff : diff; } /* Compare two lines A and B, returning negative, zero, or positive depending on whether A compares less than, equal to, or greater than B. */ static int compare (struct line const *a, struct line const *b) { int diff; size_t alen, blen; /* First try to compare on the specified keys (if any). The only two cases with no key at all are unadorned sort, and unadorned sort -r. */ if (keylist) { diff = keycompare (a, b); if (diff || unique || stable) return diff; } /* If the keys all compare equal (or no keys were specified) fall through to the default comparison. */ alen = a->length - 1, blen = b->length - 1; if (alen == 0) diff = - NONZERO (blen); else if (blen == 0) diff = 1; else if (hard_LC_COLLATE) { /* Note xmemcoll0 is a performance enhancement as it will not unconditionally write '\0' after the passed in buffers, which was seen to give around a 3% increase in performance for short lines. */ diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1); } else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen)))) diff = alen < blen ? -1 : alen != blen; return reverse ? -diff : diff; } static void insertkey (struct keyfield *key_arg) { struct keyfield **p; struct keyfield *key = xmemdup (key_arg, sizeof *key); for (p = &keylist; *p; p = &(*p)->next) continue; *p = key; key->next = NULL; } /* Report a bad field specification SPEC, with extra info MSGID. */ static void badfieldspec (char const *, char const *) ATTRIBUTE_NORETURN; static void badfieldspec (char const *spec, char const *msgid) { die (SORT_FAILURE, 0, _("%s: invalid field specification %s"), _(msgid), quote (spec)); } /* Parse the leading integer in STRING and store the resulting value (which must fit into size_t) into *VAL. Return the address of the suffix after the integer. If the value is too large, silently substitute SIZE_MAX. If MSGID is NULL, return NULL after failure; otherwise, report MSGID and exit on failure. */ static char const * parse_field_count (char const *string, size_t *val, char const *msgid) { char *suffix; uintmax_t n; switch (xstrtoumax (string, &suffix, 10, &n, "")) { case LONGINT_OK: case LONGINT_INVALID_SUFFIX_CHAR: *val = n; if (*val == n) break; /* Fall through. */ case LONGINT_OVERFLOW: case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR: *val = SIZE_MAX; break; case LONGINT_INVALID: if (msgid) die (SORT_FAILURE, 0, _("%s: invalid count at start of %s"), _(msgid), quote (string)); return NULL; } return suffix; } /* Set the ordering options for KEY specified in S. Return the address of the first character in S that is not a valid ordering option. BLANKTYPE is the kind of blanks that 'b' should skip. */ static char * set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype) { while (*s) { switch (*s) { case 'b': if (blanktype == bl_start || blanktype == bl_both) key->skipsblanks = true; if (blanktype == bl_end || blanktype == bl_both) key->skipeblanks = true; break; case 'd': key->ignore = nondictionary; break; case 'f': key->translate = fold_toupper; break; case 'g': key->general_numeric = true; break; case 'h': key->human_numeric = true; break; case 'i': /* Option order should not matter, so don't let -i override -d. -d implies -i, but -i does not imply -d. */ if (! key->ignore) key->ignore = nonprinting; break; case 'M': key->month = true; break; case 'n': key->numeric = true; break; case 'R': key->random = true; break; case 'r': key->reverse = true; break; case 'V': key->version = true; break; default: return (char *) s; } ++s; } return (char *) s; } /* Initialize KEY. */ static struct keyfield * key_init (struct keyfield *key) { memset (key, 0, sizeof *key); key->eword = SIZE_MAX; return key; } static void add_key(void) { struct keyfield *key; struct keyfield key_buf; char const *s; key = key_init (&key_buf); /* Get POS1. */ s = parse_field_count (optarg, &key->sword, N_("invalid number at field start")); if (! key->sword--) { /* Provoke with 'sort -k0' */ badfieldspec (optarg, N_("field number is zero")); } if (*s == '.') { s = parse_field_count (s + 1, &key->schar, N_("invalid number after '.'")); if (! key->schar--) { /* Provoke with 'sort -k1.0' */ badfieldspec (optarg, N_("character offset is zero")); } } if (! (key->sword || key->schar)) key->sword = SIZE_MAX; s = set_ordering (s, key, bl_start); if (*s != ',') { key->eword = SIZE_MAX; key->echar = 0; } else { /* Get POS2. */ s = parse_field_count (s + 1, &key->eword, N_("invalid number after ','")); if (! key->eword--) { /* Provoke with 'sort -k1,0' */ badfieldspec (optarg, N_("field number is zero")); } if (*s == '.') { s = parse_field_count (s + 1, &key->echar, N_("invalid number after '.'")); } s = set_ordering (s, key, bl_end); } if (*s) badfieldspec (optarg, N_("stray character in field spec")); insertkey (key); }