wc: read and process --files0-from= input a name at a time,

when the file name list is too large or its size cannot be determined. Before, wc would always read the entire file name list into memory and *then* process each file name. wc still reads the list into memory when the list is known not to be too large; this is done in order to be able to align the output numbers, as it does with arguments specified on the command line. * src/wc.c: Include "argv-iter.h". (main): Rewrite to use argv-iter when the input file name list is known to be too large. * NEWS (Bug fixes): Mention it.
author: Jim Meyering <meyering@redhat.com> 2008-11-25 18:38:26 +0100
committer: Jim Meyering <meyering@redhat.com> 2008-12-02 13:12:22 +0100
commit: c2e56e0de7d86bdc0f824d758a7efde4d5d7b235 (patch)
tree: acd3e00bd616d75dccf110ef810a9e74a1c787d3 /src/wc.c
parent: 031e2fb5e9501fb9cda4d739a92abb02e2b05a52 (diff)
download: coreutils-c2e56e0de7d86bdc0f824d758a7efde4d5d7b235.tar.xz
1 files changed, 95 insertions, 44 deletions
diff --git a/src/wc.c b/src/wc.c
index ad25ed8d0..65368f994 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -20,14 +20,17 @@
 #include <config.h>
 
 #include <stdio.h>
+#include <assert.h>
 #include <getopt.h>
 #include <sys/types.h>
 #include <wchar.h>
 #include <wctype.h>
 
 #include "system.h"
+#include "argv-iter.h"
 #include "error.h"
 #include "mbchar.h"
+#include "physmem.h"
 #include "quote.h"
 #include "quotearg.h"
 #include "readtokens0.h"
@@ -515,17 +518,19 @@ wc_file (char const *file, struct fstatus *fstatus)
 /* Return the file status for the NFILES files addressed by FILE.
    Optimize the case where only one number is printed, for just one
    file; in that case we can use a print width of 1, so we don't need
-   to stat the file.  */
+   to stat the file.  Handle the case of (nfiles == 0) in the same way;
+   that happens when we don't know how long the list of file names will be.  */
 
 static struct fstatus *
-get_input_fstatus (int nfiles, char * const *file)
+get_input_fstatus (int nfiles, char *const *file)
 {
-  struct fstatus *fstatus = xnmalloc (nfiles, sizeof *fstatus);
+  struct fstatus *fstatus = xnmalloc (nfiles ? nfiles : 1, sizeof *fstatus);
 
-  if (nfiles == 1
-      && ((print_lines + print_words + print_chars
-	   + print_bytes + print_linelength)
-	  == 1))
+  if (nfiles == 0
+      || (nfiles == 1
+	  && ((print_lines + print_words + print_chars
+	       + print_bytes + print_linelength)
+	      == 1)))
     fstatus[0].failed = 1;
   else
     {
@@ -577,7 +582,6 @@ compute_number_width (int nfiles, struct fstatus const *fstatus)
 int
 main (int argc, char **argv)
 {
-  int i;
   bool ok;
   int optc;
   int nfiles;
@@ -637,6 +641,8 @@ main (int argc, char **argv)
 	 | print_linelength))
     print_lines = print_words = print_bytes = true;
 
+  bool read_tokens = false;
+  struct argv_iterator *ai;
   if (files_from)
     {
       FILE *stream;
@@ -661,69 +667,114 @@ main (int argc, char **argv)
 		   quote (files_from));
 	}
 
-      readtokens0_init (&tok);
-
-      if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
-	error (EXIT_FAILURE, 0, _("cannot read file names from %s"),
-	       quote (files_from));
-
-      files = tok.tok;
-      nfiles = tok.n_tok;
+      /* Read the file list into RAM if we can detect its size and that
+	 size is reasonable.  Otherwise, we'll read a name at a time.  */
+      struct stat st;
+      if (fstat (fileno (stream), &st) == 0
+	  && S_ISREG (st.st_mode)
+	  && st.st_size <= MIN (10 * 1024 * 1024, physmem_available () / 2))
+	{
+	  read_tokens = true;
+	  readtokens0_init (&tok);
+	  if (! readtokens0 (stream, &tok) || fclose (stream) != 0)
+	    error (EXIT_FAILURE, 0, _("cannot read file names from %s"),
+		   quote (files_from));
+	  files = tok.tok;
+	  nfiles = tok.n_tok;
+	  ai = argv_iter_init_argv (files);
+	}
+      else
+	{
+	  files = NULL;
+	  nfiles = 0;
+	  ai = argv_iter_init_stream (stream);
+	}
     }
   else
     {
-      static char *stdin_only[2];
+      static char *stdin_only[] = { NULL };
       files = (optind < argc ? argv + optind : stdin_only);
       nfiles = (optind < argc ? argc - optind : 1);
-      stdin_only[0] = NULL;
+      ai = argv_iter_init_argv (files);
     }
 
   fstatus = get_input_fstatus (nfiles, files);
   number_width = compute_number_width (nfiles, fstatus);
 
+  int i;
   ok = true;
-  for (i = 0; i < nfiles; i++)
+  for (i = 0; /* */; i++)
     {
-      if (files[i])
+      bool skip_file = false;
+      enum argv_iter_err ai_err;
+      char *file_name = argv_iter (ai, &ai_err);
+      if (ai_err == AI_ERR_EOF)
+	break;
+      if (!file_name)
 	{
-	  if (files_from && STREQ (files_from, "-") && STREQ (files[i], "-"))
+	  switch (ai_err)
 	    {
-	      ok = false;
-	      /* Give a better diagnostic in an unusual case:
-		 printf - | wc --files0-from=- */
-	      error (0, 0, _("when reading file names from stdin, "
-			     "no file name of %s allowed"),
-		     quote ("-"));
+	    case AI_ERR_READ:
+	      error (0, errno, _("%s: read error"), quote (files_from));
+	      skip_file = true;
 	      continue;
+	    case AI_ERR_MEM:
+	      xalloc_die ();
+	    default:
+	      assert (!"unexpected error code from argv_iter");
 	    }
+	}
+      if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
+	{
+	  /* Give a better diagnostic in an unusual case:
+	     printf - | wc --files0-from=- */
+	  error (0, 0, _("when reading file names from stdin, "
+			 "no file name of %s allowed"),
+		 quote (file_name));
+	  skip_file = true;
+	}
 
+      if (!file_name[0])
+	{
 	  /* Diagnose a zero-length file name.  When it's one
-	     among many, knowing the record number may help.  */
-	  if (files[i][0] == '\0')
+	     among many, knowing the record number may help.
+	     FIXME: currently print the record number only with
+	     --files0-from=FILE.  Maybe do it for argv, too?  */
+	  if (files_from == NULL)
+	    error (0, 0, "%s", _("invalid zero-length file name"));
+	  else
 	    {
-	      ok = false;
-	      if (files_from)
-		{
-		  /* Using the standard `filename:line-number:' prefix here is
-		     not totally appropriate, since NUL is the separator, not NL,
-		     but it might be better than nothing.  */
-		  unsigned long int file_number = i + 1;
-		  error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
-			 file_number, _("invalid zero-length file name"));
-		}
-	      else
-		error (0, 0, "%s", _("invalid zero-length file name"));
-	      continue;
+	      /* Using the standard `filename:line-number:' prefix here is
+		 not totally appropriate, since NUL is the separator, not NL,
+		 but it might be better than nothing.  */
+	      unsigned long int file_number = argv_iter_n_args (ai);
+	      error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
+		     file_number, _("invalid zero-length file name"));
 	    }
+	  skip_file = true;
 	}
 
-      ok &= wc_file (files[i], &fstatus[i]);
+      if (skip_file)
+	ok = false;
+      else
+	ok &= wc_file (file_name, &fstatus[nfiles ? i : 0]);
     }
 
-  if (1 < nfiles)
+  /* No arguments on the command line is fine.  That means read from stdin.
+     However, no arguments on the --files0-from input stream means
+     don't read anything.  */
+  if (ok && !files_from && argv_iter_n_args (ai) == 0)
+    ok &= wc_file (NULL, &fstatus[0]);
+
+  if (read_tokens)
+    readtokens0_free (&tok);
+
+  if (1 < argv_iter_n_args (ai))
     write_counts (total_lines, total_words, total_chars, total_bytes,
 		  max_line_length, _("total"));
 
+  argv_iter_free (ai);
+
   free (fstatus);
 
   if (have_read_stdin && close (STDIN_FILENO) != 0)
author	Jim Meyering <meyering@redhat.com>	2008-11-25 18:38:26 +0100
committer	Jim Meyering <meyering@redhat.com>	2008-12-02 13:12:22 +0100
commit	c2e56e0de7d86bdc0f824d758a7efde4d5d7b235 (patch)
tree	acd3e00bd616d75dccf110ef810a9e74a1c787d3 /src/wc.c
parent	031e2fb5e9501fb9cda4d739a92abb02e2b05a52 (diff)
download	coreutils-c2e56e0de7d86bdc0f824d758a7efde4d5d7b235.tar.xz