diff options
Diffstat (limited to 'src/sort.c')
-rw-r--r-- | src/sort.c | 167 |
1 files changed, 116 insertions, 51 deletions
diff --git a/src/sort.c b/src/sort.c index 1df711da7..54382631e 100644 --- a/src/sort.c +++ b/src/sort.c @@ -48,7 +48,6 @@ #include "stdlib--.h" #include "strnumcmp.h" #include "xmemcoll.h" -#include "xmemxfrm.h" #include "xnanosleep.h" #include "xstrtol.h" @@ -2001,34 +2000,24 @@ random_md5_state_init (char const *random_source) md5_process_bytes (buf, sizeof buf, &random_md5_state); } -/* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB - with length LENGTHB. Return negative if less, zero if equal, - positive if greater. */ +/* This is like strxfrm, except it reports any error and exits. */ -static int -cmp_hashes (char const *texta, size_t lena, - char const *textb, size_t lenb) +static size_t +xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize) { - /* Try random hashes until a pair of hashes disagree. But if the - first pair of random hashes agree, check whether the keys are - identical and if so report no difference. */ - int diff; - uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)]; - struct md5_ctx s[2]; - s[0] = s[1] = random_md5_state; - md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]); - md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]); - diff = memcmp (dig[0], dig[1], sizeof dig[0]); - if (! diff) + errno = 0; + size_t translated_size = strxfrm (dest, src, destsize); + + if (errno) { - /* Break ties with memcmp. This is good enough given the - astronomically low probability of an MD5 collision. */ - diff = memcmp (texta, textb, MIN (lena, lenb)); - if (! diff) - diff = lena < lenb ? -1 : lena != lenb; + error (0, errno, _("string transformation failed")); + error (0, 0, _("set LC_ALL='C' to work around the problem")); + error (SORT_FAILURE, 0, + _("the untransformed string was %s"), + quotearg_n_style (0, locale_quoting_style, src)); } - return diff; + return translated_size; } /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB) @@ -2038,41 +2027,117 @@ static int compare_random (char *restrict texta, size_t lena, char *restrict textb, size_t lenb) { - int diff; + /* XFRM_DIFF records the equivalent of memcmp on the transformed + data. This is used to break ties if there is an checksum + collision, and this is good enough given the astronomically low + probability of a collision. */ + int xfrm_diff = 0; + + char stackbuf[4000]; + char *buf = stackbuf; + size_t bufsize = sizeof stackbuf; + uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)]; + struct md5_ctx s[2]; + s[0] = s[1] = random_md5_state; - if (! hard_LC_COLLATE) - diff = cmp_hashes (texta, lena, textb, lenb); - else + if (hard_LC_COLLATE) { - /* Transform the text into the basis of comparison, so that byte - strings that would otherwise considered to be equal are - considered equal here even if their bytes differ. */ - - char *buf = NULL; - char stackbuf[4000]; - size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena); - bool a_fits = tlena <= sizeof stackbuf; - size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL), - (a_fits ? sizeof stackbuf - tlena : 0), - textb, lenb); - - if (a_fits && tlena + tlenb <= sizeof stackbuf) - buf = stackbuf; - else + /* Null-terminate the keys so that strxfrm knows where to stop. */ + char *lima = texta + lena; char enda = *lima; *lima = '\0'; + char *limb = textb + lenb; char endb = *limb; *limb = '\0'; + + while (true) { - /* Adding 1 to the buffer size lets xmemxfrm run a bit - faster by avoiding the need for an extra buffer copy. */ - buf = xmalloc (tlena + tlenb + 1); - xmemxfrm (buf, tlena + 1, texta, lena); - xmemxfrm (buf + tlena, tlenb + 1, textb, lenb); + /* Transform the text into the basis of comparison, so that byte + strings that would otherwise considered to be equal are + considered equal here even if their bytes differ. + + Each time through this loop, transform one + null-terminated string's worth from TEXTA or from TEXTB + or both. That way, there's no need to store the + transformation of the whole line, if it contains many + null-terminated strings. */ + + /* Store the transformed data into a big-enough buffer. */ + + size_t sizea = + (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0); + bool a_fits = sizea <= bufsize; + size_t sizeb = + (textb < limb + ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb, + (a_fits ? bufsize - sizea : 0)) + + 1) + : 0); + + if (! (a_fits && sizea + sizeb <= bufsize)) + { + bufsize = sizea + sizeb; + if (bufsize < SIZE_MAX / 3) + bufsize = bufsize * 3 / 2; + buf = (buf == stackbuf + ? xmalloc (bufsize) + : xrealloc (buf, bufsize)); + if (texta < lima) + strxfrm (buf, texta, sizea); + if (textb < limb) + strxfrm (buf + sizea, textb, sizeb); + } + + /* Advance past NULs to the next part of each input string, + exiting the loop if both strings are exhausted. When + exiting the loop, prepare to finish off the tiebreaker + comparison properly. */ + if (texta < lima) + texta += strlen (texta) + 1; + if (textb < limb) + textb += strlen (textb) + 1; + if (! (texta < lima || textb < limb)) + { + lena = sizea; texta = buf; + lenb = sizeb; textb = buf + sizea; + break; + } + + /* Accumulate the transformed data in the corresponding + checksums. */ + md5_process_bytes (buf, sizea, &s[0]); + md5_process_bytes (buf + sizea, sizeb, &s[1]); + + /* Update the tiebreaker comparison of the transformed data. */ + if (! xfrm_diff) + { + xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb)); + if (! xfrm_diff) + xfrm_diff = (sizea > sizeb) - (sizea < sizeb); + } } - diff = cmp_hashes (buf, tlena, buf + tlena, tlenb); + *lima = enda; + *limb = endb; + } + + /* Compute and compare the checksums. */ + md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]); + md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]); + int diff = memcmp (dig[0], dig[1], sizeof dig[0]); + + /* Fall back on the tiebreaker if the checksums collide. */ + if (! diff) + { + if (! xfrm_diff) + { + xfrm_diff = memcmp (texta, textb, MIN (lena, lenb)); + if (! xfrm_diff) + xfrm_diff = (lena > lenb) - (lena < lenb); + } - if (buf != stackbuf) - free (buf); + diff = xfrm_diff; } + if (buf != stackbuf) + free (buf); + return diff; } |