uniq: add the --group option

* src/uniq.c (usage): Summarize the new option, and adjust the --all-repeated option to be more consistent. (check_file): Merge the --group functionality into the core loop for the default uniq operation since it's very similar and can output lines immediately upon reading. (main): Handle the new --group option and make it mutually exclusive with other selection options. * tests/misc/uniq.pl: Add tests. * NEWS: Mention the new feature. * doc/coreutils.texi (uniq invocation): Describe --group.
author: Assaf Gordon <assafgordon@gmail.com> 2013-02-20 13:31:22 -0500
committer: Pádraig Brady <P@draigBrady.com> 2013-02-28 18:20:30 +0000
commit: 374f569579fe4e319d592f4d77ae1ede5566eed6 (patch)
tree: b7493c64cf19988dc84aaf1899b1e9c3718896bc
parent: 8b6d3c5700526f962b12cd5901b55961c5e18186 (diff)
download: coreutils-374f569579fe4e319d592f4d77ae1ede5566eed6.tar.xz
4 files changed, 189 insertions, 14 deletions
diff --git a/NEWS b/NEWS
index 8785bb333..5b28c921f 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   option of the same name, this makes join consume and produce NUL-terminated
   lines rather than newline-terminated lines.
 
+  uniq accepts a new option: --group to print all items, while separating
+  unique groups with empty lines.
+
 
 * Noteworthy changes in release 8.21 (2013-02-14) [stable]
 
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 19ef4651c..fe4c3ad36 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5067,7 +5067,7 @@ Do not discard the second and subsequent repeated input lines,
 but discard lines that are not repeated.
 This option is useful mainly in conjunction with other options e.g.,
 to ignore case or to compare only selected fields.
-The optional @var{delimit-method} tells how to delimit
+The optional @var{delimit-method} specifies how to delimit
 groups of repeated lines, and must be one of the following:
 
 @table @samp
@@ -5078,26 +5078,60 @@ This is equivalent to @option{--all-repeated} (@option{-D}).
 
 @item prepend
 Output a newline before each group of repeated lines.
+@macro nulOutputNote
 With @option{--zero-terminated} (@option{-z}), use a zero
-byte (ASCII NUL) instead of a newline.
+byte (ASCII NUL) instead of a newline as the delimiter.
+@end macro
+@nulOutputNote
 
 @item separate
 Separate groups of repeated lines with a single newline.
-With @option{--zero-terminated} (@option{-z}), use a zero
-byte (ASCII NUL) instead of a newline.
 This is the same as using @samp{prepend}, except that
 no delimiter is inserted before the first group, and hence
 may be better suited for output direct to users.
+@nulOutputNote
 @end table
 
+@macro ambiguousGroupNote
 Note that when groups are delimited and the input stream contains
 two or more consecutive blank lines, then the output is ambiguous.
 To avoid that, filter the input through @samp{tr -s '\n'} to replace
 each sequence of consecutive newlines with a single newline.
+@end macro
+@ambiguousGroupNote
 
 This is a GNU extension.
 @c FIXME: give an example showing *how* it's useful
 
+@item --group[=@var{delimit-method}]
+@opindex --group
+@cindex all lines, grouping
+Output all lines, and delimit each unique group.
+@nulOutputNote
+The optional @var{delimit-method} specifies how to delimit
+groups, and must be one of the following:
+
+@table @samp
+
+@item separate
+Separate unique groups with a single delimiter.
+This is the default delimiting method if none is specified,
+and better suited for output direct to users.
+
+@item prepend
+Output a delimiter before each group of unique items.
+
+@item append
+Output a delimiter after each group of unique items.
+
+@item both
+Output a delimiter before and after each group of unique items.
+@end table
+
+@ambiguousGroupNote
+
+This is a GNU extension.
+
 @item -u
 @itemx --unique
 @opindex -u
diff --git a/src/uniq.c b/src/uniq.c
index 5efdad7d4..835b5b194 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -108,11 +108,47 @@ static enum delimit_method const delimit_method_map[] =
 /* Select whether/how to delimit groups of duplicate lines.  */
 static enum delimit_method delimit_groups;
 
+enum grouping_method
+{
+  /* No grouping, when "--group" isn't used */
+  GM_NONE,
+
+  /* Delimiter precedes all groups.  --group=prepend */
+  GM_PREPEND,
+
+  /* Delimiter follows all groups.   --group=append */
+  GM_APPEND,
+
+  /* Delimiter between groups.    --group[=separate] */
+  GM_SEPARATE,
+
+  /* Delimiter before and after each group. --group=both */
+  GM_BOTH
+};
+
+static char const *const grouping_method_string[] =
+{
+  "prepend", "append", "separate", "both", NULL
+};
+
+static enum grouping_method const grouping_method_map[] =
+{
+  GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
+};
+
+static enum grouping_method grouping = GM_NONE;
+
+enum
+{
+  GROUP_OPTION = CHAR_MAX + 1
+};
+
 static struct option const longopts[] =
 {
   {"count", no_argument, NULL, 'c'},
   {"repeated", no_argument, NULL, 'd'},
   {"all-repeated", optional_argument, NULL, 'D'},
+  {"group", optional_argument, NULL, GROUP_OPTION},
   {"ignore-case", no_argument, NULL, 'i'},
   {"unique", no_argument, NULL, 'u'},
   {"skip-fields", required_argument, NULL, 'f'},
@@ -149,10 +185,18 @@ With no options, matching lines are merged to the first occurrence.\n\
   -d, --repeated        only print duplicate lines\n\
 "), stdout);
      fputs (_("\
-  -D, --all-repeated[=delimit-method]  print all duplicate lines\n\
-                        delimit-method={none(default),prepend,separate}\n\
-                        Delimiting is done with blank lines\n\
+  -D, --all-repeated[=METHOD]  print all duplicate lines\n\
+                          groups can be delimited with an empty line\n\
+                          METHOD={none(default),prepend,separate}\n\
+"), stdout);
+     fputs (_("\
   -f, --skip-fields=N   avoid comparing the first N fields\n\
+"), stdout);
+     fputs (_("\
+      --group[=METHOD]  show all items, separating groups with an empty line\n\
+                          METHOD={separate(default),prepend,append,both}\n\
+"), stdout);
+     fputs (_("\
   -i, --ignore-case     ignore differences in case when comparing\n\
   -s, --skip-chars=N    avoid comparing the first N characters\n\
   -u, --unique          only print unique lines\n\
@@ -293,27 +337,48 @@ check_file (const char *infile, const char *outfile, char delimiter)
   initbuffer (prevline);
 
   /* The duplication in the following 'if' and 'else' blocks is an
-     optimization to distinguish the common case (in which none of
-     the following options has been specified: --count, -repeated,
-     --all-repeated, --unique) from the others.  In the common case,
-     this optimization lets uniq output each different line right away,
-     without waiting to see if the next one is different.  */
+     optimization to distinguish the cases where input lines can be
+     printed immediately (1. & 2.) from the case where they cannot (3.).
+
+     1. --group => all input lines are printed.
+        checking for unique/duplicated lines is used only for printing
+        group separators.
+
+     2. The default case in which none of these options has been specified:
+          --count, --repeated, --all-repeated, --unique
+        In the default case, this optimization lets uniq output each different
+        line right away, without waiting to see if the next one is different.
 
+     3. All other cases.
+  */
   if (output_unique && output_first_repeated && countmode == count_none)
     {
       char *prevfield IF_LINT ( = NULL);
       size_t prevlen IF_LINT ( = 0);
+      bool first_group_printed = false;
 
       while (!feof (stdin))
         {
           char *thisfield;
           size_t thislen;
+          bool new_group;
+
           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
             break;
+
           thisfield = find_field (thisline);
           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
-          if (prevline->length == 0
-              || different (thisfield, prevfield, thislen, prevlen))
+
+          new_group = (prevline->length == 0
+                       || different (thisfield, prevfield, thislen, prevlen));
+
+          if (new_group && grouping != GM_NONE
+              && (grouping == GM_PREPEND || grouping == GM_BOTH
+                  || (first_group_printed && (grouping == GM_APPEND
+                                              || grouping == GM_SEPARATE))))
+            putchar (delimiter);
+
+          if (new_group || grouping != GM_NONE)
             {
               fwrite (thisline->buffer, sizeof (char),
                       thisline->length, stdout);
@@ -321,8 +386,11 @@ check_file (const char *infile, const char *outfile, char delimiter)
               SWAP_LINES (prevline, thisline);
               prevfield = thisfield;
               prevlen = thislen;
+              first_group_printed = true;
             }
         }
+      if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
+        putchar (delimiter);
     }
   else
     {
@@ -415,6 +483,7 @@ main (int argc, char **argv)
   int nfiles = 0;
   char const *file[2];
   char delimiter = '\n';	/* change with --zero-terminated, -z */
+  bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */
 
   file[0] = file[1] = "-";
   initialize_main (&argc, &argv);
@@ -498,10 +567,12 @@ main (int argc, char **argv)
 
         case 'c':
           countmode = count_occurrences;
+          output_option_used = true;
           break;
 
         case 'd':
           output_unique = false;
+          output_option_used = true;
           break;
 
         case 'D':
@@ -513,6 +584,16 @@ main (int argc, char **argv)
             delimit_groups = XARGMATCH ("--all-repeated", optarg,
                                         delimit_method_string,
                                         delimit_method_map);
+          output_option_used = true;
+          break;
+
+        case GROUP_OPTION:
+          if (optarg == NULL)
+            grouping = GM_SEPARATE;
+          else
+            grouping = XARGMATCH ("--group", optarg,
+                                  grouping_method_string,
+                                  grouping_method_map);
           break;
 
         case 'f':
@@ -532,6 +613,7 @@ main (int argc, char **argv)
 
         case 'u':
           output_first_repeated = false;
+          output_option_used = true;
           break;
 
         case 'w':
@@ -552,6 +634,23 @@ main (int argc, char **argv)
         }
     }
 
+  /* Note we could allow --group with -D at least, and that would
+     avoid the need to specify a grouping method to --all-repeated.
+     It was thought best to avoid deprecating those parameters though
+     and keep --group separate from other options.  */
+  if (grouping != GM_NONE && output_option_used)
+    {
+      error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
+      usage (EXIT_FAILURE);
+    }
+
+  if (grouping != GM_NONE && countmode != count_none)
+    {
+      error (0, 0,
+           _("grouping and printing repeat counts is meaningless"));
+      usage (EXIT_FAILURE);
+    }
+
   if (countmode == count_occurrences && output_later_repeated)
     {
       error (0, 0,
diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl
index e3873b577..4640b1488 100755
--- a/tests/misc/uniq.pl
+++ b/tests/misc/uniq.pl
@@ -203,6 +203,45 @@ my @Tests =
  ['125', '',              {IN=>"A\na\n"}, {OUT=>"A\na\n"}],
  ['126', '-i',            {IN=>"A\na\n"}, {OUT=>"A\n"}],
  ['127', '--ignore-case', {IN=>"A\na\n"}, {OUT=>"A\n"}],
+ # Check grouping
+ ['128', '--group=prepend', {IN=>"a\na\nb\n"}, {OUT=>"\na\na\n\nb\n"}],
+ ['129', '--group=append',  {IN=>"a\na\nb\n"}, {OUT=>"a\na\n\nb\n\n"}],
+ ['130', '--group=separate',{IN=>"a\na\nb\n"}, {OUT=>"a\na\n\nb\n"}],
+ # no explicit grouping = separate
+ ['131', '--group',         {IN=>"a\na\nb\n"}, {OUT=>"a\na\n\nb\n"}],
+ ['132', '--group=both',    {IN=>"a\na\nb\n"}, {OUT=>"\na\na\n\nb\n\n"}],
+ # Grouping in the special case of a single group
+ ['133', '--group=prepend', {IN=>"a\na\n"}, {OUT=>"\na\na\n"}],
+ ['134', '--group=append',  {IN=>"a\na\n"}, {OUT=>"a\na\n\n"}],
+ ['135', '--group=separate',{IN=>"a\na\n"}, {OUT=>"a\na\n"}],
+ ['136', '--group',         {IN=>"a\na\n"}, {OUT=>"a\na\n"}],
+ # Grouping with empty input - should never print anything
+ ['137', '--group=prepend',  {IN=>""}, {OUT=>""}],
+ ['138', '--group=append',   {IN=>""}, {OUT=>""}],
+ ['139', '--group=separate', {IN=>""}, {OUT=>""}],
+ ['140', '--group=both',     {IN=>""}, {OUT=>""}],
+ # Grouping with other options - must fail
+ ['141', '--group -c',       {IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: --group is mutually exclusive with -c/-d/-D/-u\n" .
+        "Try 'uniq --help' for more information.\n"}],
+ ['142', '--group -d',       {IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: --group is mutually exclusive with -c/-d/-D/-u\n" .
+        "Try 'uniq --help' for more information.\n"}],
+ ['143', '--group -u',       {IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: --group is mutually exclusive with -c/-d/-D/-u\n" .
+        "Try 'uniq --help' for more information.\n"}],
+ ['144', '--group -D',       {IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: --group is mutually exclusive with -c/-d/-D/-u\n" .
+        "Try 'uniq --help' for more information.\n"}],
+ # Grouping with badoption
+ ['145', '--group=badoption',{IN=>""}, {OUT=>""}, {EXIT=>1},
+  {ERR=>"$prog: invalid argument 'badoption' for '--group'\n" .
+        "Valid arguments are:\n" .
+        "  - 'prepend'\n" .
+        "  - 'append'\n" .
+        "  - 'separate'\n" .
+        "  - 'both'\n" .
+        "Try '$prog --help' for more information.\n"}],
 );
 
 # Set _POSIX2_VERSION=199209 in the environment of each obs-plus* test.
author	Assaf Gordon <assafgordon@gmail.com>	2013-02-20 13:31:22 -0500
committer	Pádraig Brady <P@draigBrady.com>	2013-02-28 18:20:30 +0000
commit	374f569579fe4e319d592f4d77ae1ede5566eed6 (patch)
tree	b7493c64cf19988dc84aaf1899b1e9c3718896bc
parent	8b6d3c5700526f962b12cd5901b55961c5e18186 (diff)
download	coreutils-374f569579fe4e319d592f4d77ae1ede5566eed6.tar.xz