From 24aff479581963a3dbd87c86d27c4e2aaac27507 Mon Sep 17 00:00:00 2001 From: Erich Eckner Date: Wed, 30 Nov 2016 11:18:30 +0100 Subject: uniq: compiles, but does the wrong thing - for now --- src/uniq.c | 291 ++++++++++++++++++++++++++++--------------------------------- 1 file changed, 134 insertions(+), 157 deletions(-) diff --git a/src/uniq.c b/src/uniq.c index d76fa7ed2..ecf05825e 100644 --- a/src/uniq.c +++ b/src/uniq.c @@ -43,16 +43,6 @@ proper_name ("Richard M. Stallman"), \ proper_name ("David MacKenzie") -#define SWAP_LINES(A, B) \ - do \ - { \ - struct line *_tmp; \ - _tmp = (A); \ - (A) = (B); \ - (B) = _tmp; \ - } \ - while (0) - /* True if the LC_COLLATE locale is hard. */ static bool hard_LC_COLLATE; @@ -152,6 +142,7 @@ static struct option const longopts[] = {"all-repeated", optional_argument, NULL, 'D'}, {"group", optional_argument, NULL, GROUP_OPTION}, {"ignore-case", no_argument, NULL, 'i'}, + {"key", required_argument, NULL, 'k'}, {"unique", no_argument, NULL, 'u'}, {"skip-fields", required_argument, NULL, 'f'}, {"skip-chars", required_argument, NULL, 's'}, @@ -201,6 +192,7 @@ With no options, matching lines are merged to the first occurrence.\n\ "), stdout); fputs (_("\ -i, --ignore-case ignore differences in case when comparing\n\ + -k, --key=... see 'sort' for details\n\ -s, --skip-chars=N avoid comparing the first N characters\n\ -u, --unique only print unique lines\n\ "), stdout); @@ -257,30 +249,6 @@ size_opt (char const *opt, char const *msgid) return MIN (size, SIZE_MAX); } -/* Given a line LINE, - return a pointer to the beginning of the line's field to be compared. */ - -static char * _GL_ATTRIBUTE_PURE -find_field (struct line const *line) -{ - size_t count; - char const *lp = line->text; - size_t size = line->length - 1; - size_t i = 0; - - for (count = 0; count < skip_fields && i < size; count++) - { - while (i < size && field_sep (lp[i])) - i++; - while (i < size && !field_sep (lp[i])) - i++; - } - - i += MIN (skip_chars, size - i); - - return line->text + i; -} - /* Output the line in line LINE to standard output provided that the switches say it should be output. MATCH is true if the line matches the previous line. @@ -289,7 +257,7 @@ find_field (struct line const *line) static void writeline (struct line const *line, - bool match, uintmax_t linecount) + bool match, uintmax_t linecount, FILE *outfp, const char *outfile) { if (! (linecount == 0 ? output_unique : !match ? output_first_repeated @@ -297,9 +265,10 @@ writeline (struct line const *line, return; if (countmode == count_occurrences) - printf ("%7" PRIuMAX " ", linecount + 1); + fprintf (outfp, "%7" PRIuMAX " ", linecount + 1); - fwrite (line->text, sizeof (char), line->length, stdout); + fwrite (line->text, sizeof (char), line->length, outfp); + fputc (eolchar, outfp); } /* Process input file INFILE with output to OUTFILE. @@ -308,154 +277,158 @@ writeline (struct line const *line, static void check_file (const char *infile, const char *outfile, char delimiter) { - struct buffer buf; - struct line *thisline, *prevline; - - if (! (STREQ (infile, "-") || freopen (infile, "r", stdin))) - die (EXIT_FAILURE, errno, "%s", quotef (infile)); - if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout))) - die (EXIT_FAILURE, errno, "%s", quotef (outfile)); - - fadvise (stdin, FADVISE_SEQUENTIAL); - + FILE *infp = xfopen (infile, "r"); + FILE *outfp = xfopen (STREQ(outfile,"-") ? NULL : outfile, "w"); + struct buffer buf; /* Input buffer. */ + struct line temp; /* Copy of previous line. */ + size_t alloc = 0; + struct keyfield const *key = keylist; initbuf (&buf, sizeof (struct line), MAX (merge_buffer_size, sort_size)); - thisline = lb1.buf; - prevline = lb2.buf; + struct line const *line = buffer_linelim (&buf); + struct line const *linebase = line - buf.nlines; - initbuffer (thisline); - initbuffer (prevline); + bool the_same; /* Is the line equal to the last one? */ + bool first_group_printed = false; + uintmax_t match_count = 0; + bool first_delimiter = true; - /* The duplication in the following 'if' and 'else' blocks is an - optimization to distinguish between when we can print input - lines immediately (1. & 2.) or not. + temp.text = NULL; - 1. --group => all input lines are printed. - checking for unique/duplicated lines is used only for printing - group separators. + while (fillbuf (&buf, infp, infile)) + { + line = buffer_linelim (&buf); + linebase = line - buf.nlines; - 2. The default case in which none of these options has been specified: - --count, --repeated, --all-repeated, --unique - In the default case, this optimization lets uniq output each different - line right away, without waiting to see if the next one is different. + /* Check if the line saved from the old buffer contents is + equal to the first line of the new buffer. */ + the_same = alloc && ! compare (&temp, line - 1); - 3. All other cases. - */ - if (output_unique && output_first_repeated && countmode == count_none) - { - char *prevfield IF_LINT ( = NULL); - size_t prevlen IF_LINT ( = 0); - bool first_group_printed = false; + if (linebase < line) + goto print_something_if_appropriate; - while (!feof (stdin)) +/* { - char *thisfield; - size_t thislen; - bool new_group; + found_disorder: + { + if (checkonly == 'c') + { + struct line const *disorder_line = line - 1; + uintmax_t disorder_line_number = + buffer_linelim (&buf) - disorder_line + line_number; + char hr_buf[INT_BUFSIZE_BOUND (disorder_line_number)]; + fprintf (stderr, _("%s: %s:%s: disorder: "), + program_name, file_name, + umaxtostr (disorder_line_number, hr_buf)); + write_line (disorder_line, stderr, _("standard error")); + } - if (readlinebuffer_delim (thisline, stdin, delimiter) == 0) + ordered = false; break; + } + } */ - thisfield = find_field (thisline); - thislen = thisline->length - 1 - (thisfield - thisline->text); - - new_group = (prevline->length == 0 - || different (thisfield, prevfield, thislen, prevlen)); - - if (new_group && grouping != GM_NONE - && (grouping == GM_PREPEND || grouping == GM_BOTH - || (first_group_printed && (grouping == GM_APPEND - || grouping == GM_SEPARATE)))) - putchar (delimiter); - - if (new_group || grouping != GM_NONE) - { - fwrite (thisline->text, sizeof (char), - thisline->length, stdout); - - SWAP_LINES (prevline, thisline); - prevfield = thisfield; - prevlen = thislen; - first_group_printed = true; - } - } - if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed) - putchar (delimiter); - } - else - { - char *prevfield; - size_t prevlen; - uintmax_t match_count = 0; - bool first_delimiter = true; - - if (readlinebuffer_delim (prevline, stdin, delimiter) == 0) - goto closefiles; - prevfield = find_field (prevline); - prevlen = prevline->length - 1 - (prevfield - prevline->text); - - while (!feof (stdin)) + /* Compare each line in the buffer with its successor. */ + while (linebase < --line) { - bool match; - char *thisfield; - size_t thislen; - if (readlinebuffer_delim (thisline, stdin, delimiter) == 0) - { - if (ferror (stdin)) - goto closefiles; - break; - } - thisfield = find_field (thisline); - thislen = thisline->length - 1 - (thisfield - thisline->text); - match = !different (thisfield, prevfield, thislen, prevlen); - match_count += match; + the_same = ! compare (line, line - 1); - if (match_count == UINTMAX_MAX) + print_something_if_appropriate: { - if (count_occurrences) - die (EXIT_FAILURE, 0, _("too many repeated lines")); - match_count--; - } - if (delimit_groups != DM_NONE) - { - if (!match) + if (output_unique && output_first_repeated && countmode == count_none) { - if (match_count) /* a previous match */ - first_delimiter = false; /* Only used when DM_SEPARATE */ + if (! the_same && grouping != GM_NONE + && (grouping == GM_PREPEND || grouping == GM_BOTH + || (first_group_printed && (grouping == GM_APPEND + || grouping == GM_SEPARATE)))) + fputc (delimiter, outfp); + + if (! the_same || grouping != GM_NONE) + { + write_line (line-1, outfp, outfile); + + first_group_printed = true; + } } - else if (match_count == 1) + else { - if ((delimit_groups == DM_PREPEND) - || (delimit_groups == DM_SEPARATE - && !first_delimiter)) - putchar (delimiter); + match_count += the_same; + + if (match_count == UINTMAX_MAX) + { + if (count_occurrences) + die (EXIT_FAILURE, 0, _("too many repeated lines")); + match_count--; + } + + if (delimit_groups != DM_NONE) + { + if (! the_same) + { + if (match_count) /* a previous match */ + first_delimiter = false; /* Only used when DM_SEPARATE */ + } + else if (match_count == 1) + { + if ((delimit_groups == DM_PREPEND) + || (delimit_groups == DM_SEPARATE + && !first_delimiter)) + fputc (delimiter, outfp); + } + } + + if (! the_same || output_later_repeated) + { + writeline (line-1, the_same, match_count, outfp, outfile); + if (! the_same) + match_count = 0; + } } } + } - if (!match || output_later_repeated) + /* Save the last line of the buffer. */ + if (alloc < line->length) + { + do { - writeline (prevline, match, match_count); - SWAP_LINES (prevline, thisline); - prevfield = thisfield; - prevlen = thislen; - if (!match) - match_count = 0; + alloc *= 2; + if (! alloc) + { + alloc = line->length; + break; + } } - } + while (alloc < line->length); - writeline (prevline, false, match_count); + free (temp.text); + temp.text = xmalloc (alloc); + } + memcpy (temp.text, line->text, line->length); + temp.length = line->length; + if (key) + { + temp.keybeg = temp.text + (line->keybeg - line->text); + temp.keylim = temp.text + (line->keylim - line->text); + } } - closefiles: - if (ferror (stdin) || fclose (stdin) != 0) - die (EXIT_FAILURE, 0, _("error reading %s"), quoteaf (infile)); - - /* stdout is handled via the atexit-invoked close_stdout function. */ + if (output_unique && output_first_repeated && countmode == count_none) + { + if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed) + fputc (delimiter, outfp); + } + else + { + writeline (line-1, false, match_count, outfp, outfile); + } - free (lb1.text); - free (lb2.text); + xfclose (infp, infile); + xfclose (outfp, outfile); + free (buf.buf); + free (temp.text); } enum Skip_field_option_type @@ -503,7 +476,7 @@ main (int argc, char **argv) if (optc == -1 || (posixly_correct && nfiles != 0) || ((optc = getopt_long (argc, argv, - "-0123456789Dcdf:is:uw:z", longopts, NULL)) + "-0123456789Dcdf:iks:uw:z", longopts, NULL)) == -1)) { if (argc <= optind) @@ -597,6 +570,10 @@ main (int argc, char **argv) ignore_case = true; break; + case 'k': + add_key(); + break; + case 's': skip_chars = size_opt (optarg, N_("invalid number of bytes to skip")); -- cgit v1.2.3-54-g00ecf