sort: -R now uses less memory on long lines with internal NULs

* lib/Makefile.am (libcoreutils_a_SOURCES): Remove xmemxfrm.c, xmemxfrm.h. * lib/memxfrm.c, lib/memxfrm.h, lib/xmemxfrm.c, lib/xmemxfrm.h: Remove. * m4/memxfrm.m4: Likewise. * m4/prereq.m4 (gl_PREREQ): Remove gl_MEMXFRM. * po/POTFILES.in: Remove lib/xmemxfrm.c. * src/sort.c: Don't include xmemxfrm.h. (cmp_hashes): Remove. (xstrxfrm): New function. (compare_random): If a line contains NULs, don't create a big buffer that contains the strxfrm output of each string in the line. Instead, accumulate checksums and differences as we go, so that at any one time we have to store at most the output of a single strxfrm call when processing the line. This removes the need for an memxfrm function.
author: Paul Eggert <eggert@cs.ucla.edu> 2010-08-04 16:10:10 -0700
committer: Paul Eggert <eggert@cs.ucla.edu> 2010-08-04 16:10:59 -0700
commit: 2b49b140cc13cf36ec5ee5acaca5ac7bfeed6366 (patch)
tree: f527d9cdeae11387f80ee5ea81b3bfe869c55e8d /src
parent: db48b362417e3f86671c040610ddfa27596bd258 (diff)
download: coreutils-2b49b140cc13cf36ec5ee5acaca5ac7bfeed6366.tar.xz
1 files changed, 116 insertions, 51 deletions
diff --git a/src/sort.c b/src/sort.c
index 1df711da7..54382631e 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -48,7 +48,6 @@
 #include "stdlib--.h"
 #include "strnumcmp.h"
 #include "xmemcoll.h"
-#include "xmemxfrm.h"
 #include "xnanosleep.h"
 #include "xstrtol.h"
 
@@ -2001,34 +2000,24 @@ random_md5_state_init (char const *random_source)
   md5_process_bytes (buf, sizeof buf, &random_md5_state);
 }
 
-/* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB
-   with length LENGTHB.  Return negative if less, zero if equal,
-   positive if greater.  */
+/* This is like strxfrm, except it reports any error and exits.  */
 
-static int
-cmp_hashes (char const *texta, size_t lena,
-            char const *textb, size_t lenb)
+static size_t
+xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
 {
-  /* Try random hashes until a pair of hashes disagree.  But if the
-     first pair of random hashes agree, check whether the keys are
-     identical and if so report no difference.  */
-  int diff;
-  uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
-  struct md5_ctx s[2];
-  s[0] = s[1] = random_md5_state;
-  md5_process_bytes (texta, lena, &s[0]);  md5_finish_ctx (&s[0], dig[0]);
-  md5_process_bytes (textb, lenb, &s[1]);  md5_finish_ctx (&s[1], dig[1]);
-  diff = memcmp (dig[0], dig[1], sizeof dig[0]);
-  if (! diff)
+  errno = 0;
+  size_t translated_size = strxfrm (dest, src, destsize);
+
+  if (errno)
     {
-      /* Break ties with memcmp.  This is good enough given the
-         astronomically low probability of an MD5 collision.  */
-      diff = memcmp (texta, textb, MIN (lena, lenb));
-      if (! diff)
-        diff = lena < lenb ? -1 : lena != lenb;
+      error (0, errno, _("string transformation failed"));
+      error (0, 0, _("set LC_ALL='C' to work around the problem"));
+      error (SORT_FAILURE, 0,
+             _("the untransformed string was %s"),
+             quotearg_n_style (0, locale_quoting_style, src));
     }
 
-  return diff;
+  return translated_size;
 }
 
 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
@@ -2038,41 +2027,117 @@ static int
 compare_random (char *restrict texta, size_t lena,
                 char *restrict textb, size_t lenb)
 {
-  int diff;
+  /* XFRM_DIFF records the equivalent of memcmp on the transformed
+     data.  This is used to break ties if there is an checksum
+     collision, and this is good enough given the astronomically low
+     probability of a collision.  */
+  int xfrm_diff = 0;
+
+  char stackbuf[4000];
+  char *buf = stackbuf;
+  size_t bufsize = sizeof stackbuf;
+  uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
+  struct md5_ctx s[2];
+  s[0] = s[1] = random_md5_state;
 
-  if (! hard_LC_COLLATE)
-    diff = cmp_hashes (texta, lena, textb, lenb);
-  else
+  if (hard_LC_COLLATE)
     {
-      /* Transform the text into the basis of comparison, so that byte
-         strings that would otherwise considered to be equal are
-         considered equal here even if their bytes differ.  */
-
-      char *buf = NULL;
-      char stackbuf[4000];
-      size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);
-      bool a_fits = tlena <= sizeof stackbuf;
-      size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),
-                               (a_fits ? sizeof stackbuf - tlena : 0),
-                               textb, lenb);
-
-      if (a_fits && tlena + tlenb <= sizeof stackbuf)
-        buf = stackbuf;
-      else
+      /* Null-terminate the keys so that strxfrm knows where to stop.  */
+      char *lima = texta + lena; char enda = *lima; *lima = '\0';
+      char *limb = textb + lenb; char endb = *limb; *limb = '\0';
+
+      while (true)
         {
-          /* Adding 1 to the buffer size lets xmemxfrm run a bit
-             faster by avoiding the need for an extra buffer copy.  */
-          buf = xmalloc (tlena + tlenb + 1);
-          xmemxfrm (buf, tlena + 1, texta, lena);
-          xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);
+          /* Transform the text into the basis of comparison, so that byte
+             strings that would otherwise considered to be equal are
+             considered equal here even if their bytes differ.
+
+             Each time through this loop, transform one
+             null-terminated string's worth from TEXTA or from TEXTB
+             or both.  That way, there's no need to store the
+             transformation of the whole line, if it contains many
+             null-terminated strings.  */
+
+          /* Store the transformed data into a big-enough buffer.  */
+
+          size_t sizea =
+            (texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
+          bool a_fits = sizea <= bufsize;
+          size_t sizeb =
+            (textb < limb
+             ? (xstrxfrm ((a_fits ? buf + sizea : NULL), textb,
+                          (a_fits ? bufsize - sizea : 0))
+                + 1)
+             : 0);
+
+          if (! (a_fits && sizea + sizeb <= bufsize))
+            {
+              bufsize = sizea + sizeb;
+              if (bufsize < SIZE_MAX / 3)
+                bufsize = bufsize * 3 / 2;
+              buf = (buf == stackbuf
+                     ? xmalloc (bufsize)
+                     : xrealloc (buf, bufsize));
+              if (texta < lima)
+                strxfrm (buf, texta, sizea);
+              if (textb < limb)
+                strxfrm (buf + sizea, textb, sizeb);
+            }
+
+          /* Advance past NULs to the next part of each input string,
+             exiting the loop if both strings are exhausted.  When
+             exiting the loop, prepare to finish off the tiebreaker
+             comparison properly.  */
+          if (texta < lima)
+            texta += strlen (texta) + 1;
+          if (textb < limb)
+            textb += strlen (textb) + 1;
+          if (! (texta < lima || textb < limb))
+            {
+              lena = sizea; texta = buf;
+              lenb = sizeb; textb = buf + sizea;
+              break;
+            }
+
+          /* Accumulate the transformed data in the corresponding
+             checksums.  */
+          md5_process_bytes (buf, sizea, &s[0]);
+          md5_process_bytes (buf + sizea, sizeb, &s[1]);
+
+          /* Update the tiebreaker comparison of the transformed data.  */
+          if (! xfrm_diff)
+            {
+              xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
+              if (! xfrm_diff)
+                xfrm_diff = (sizea > sizeb) - (sizea < sizeb);
+            }
         }
 
-      diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);
+      *lima = enda;
+      *limb = endb;
+    }
+
+  /* Compute and compare the checksums.  */
+  md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
+  md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
+  int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
+
+  /* Fall back on the tiebreaker if the checksums collide.  */
+  if (! diff)
+    {
+      if (! xfrm_diff)
+        {
+          xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
+          if (! xfrm_diff)
+            xfrm_diff = (lena > lenb) - (lena < lenb);
+        }
 
-      if (buf != stackbuf)
-        free (buf);
+      diff = xfrm_diff;
     }
 
+  if (buf != stackbuf)
+    free (buf);
+
   return diff;
 }
author	Paul Eggert <eggert@cs.ucla.edu>	2010-08-04 16:10:10 -0700
committer	Paul Eggert <eggert@cs.ucla.edu>	2010-08-04 16:10:59 -0700
commit	2b49b140cc13cf36ec5ee5acaca5ac7bfeed6366 (patch)
tree	f527d9cdeae11387f80ee5ea81b3bfe869c55e8d /src
parent	db48b362417e3f86671c040610ddfa27596bd258 (diff)
download	coreutils-2b49b140cc13cf36ec5ee5acaca5ac7bfeed6366.tar.xz