From a1e715698a038af7ff341011a2aeecf6729c8de9 Mon Sep 17 00:00:00 2001 From: James Youngman Date: Tue, 19 Feb 2008 14:13:00 +0100 Subject: join: new options: --check-order and --nocheck-order. * src/join.c: Support --check-order and --nocheck-order. New variables check_input_order, seen_unpairable and issued_disorder_warning[]. For --check-order, verify that the input files are in sorted order. For the default case, check the order only if there are unpairable lines. (join): Perform ordering checks after reaching EOF on either input. (usage): Mention --check-order and --nocheck-order. (dup_line): Save a copy of the previously-read input line so that we can detect disorder on the input. (get_line): Temporarily save a copy of the previous line (by calling dup_line) and check relative ordering (by calling checkorder) before returning the newly-read line. (getseq, join): Tell get_line which file we are reading from. (advance_seq): New function, factoring out some of the code commonly surrounding calls to getseq. (checkorder): New function. Verifies that a pair of consecutive input lines are in sorted order. * doc/coreutils.texi (join invocation): Document the new options --check-order and --nocheck-order. * tests/join/Test.pm (tv): Added tests for --check-order and --nocheck-order. * NEWS: Mention this new feature. --- src/join.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 184 insertions(+), 32 deletions(-) (limited to 'src') diff --git a/src/join.c b/src/join.c index a6ca7e4e1..71b11da06 100644 --- a/src/join.c +++ b/src/join.c @@ -1,5 +1,5 @@ /* join - join lines of two files on a common field - Copyright (C) 91, 1995-2006 Free Software Foundation, Inc. + Copyright (C) 91, 1995-2006, 2008 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -90,6 +90,12 @@ static bool print_unpairables_1, print_unpairables_2; /* If nonzero, print pairable lines. */ static bool print_pairables; +/* If nonzero, we have seen at least one unpairable line. */ +static bool seen_unpairable; + +/* If nonzero, we have warned about disorder in that file. */ +static bool issued_disorder_warning[2]; + /* Empty output field filler. */ static char const *empty_filler; @@ -108,9 +114,26 @@ static struct outlist *outlist_end = &outlist_head; tab character whose value (when cast to unsigned char) equals TAB. */ static int tab = -1; +/* Whether, and how strictly, to check that the input is ordered. */ +static enum + { + CHECK_ORDER_DEFAULT, + CHECK_ORDER_ENABLED, + CHECK_ORDER_DISABLED + } check_input_order; + +enum +{ + CHECK_ORDER_OPTION = CHAR_MAX + 1, + NOCHECK_ORDER_OPTION +}; + + static struct option const longopts[] = { {"ignore-case", no_argument, NULL, 'i'}, + {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, + {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -122,6 +145,9 @@ static struct line uni_blank; /* If nonzero, ignore case when comparing join fields. */ static bool ignore_case; + +static void checkorder (const struct line *, const struct line *, int); + void usage (int status) { @@ -153,6 +179,9 @@ by whitespace. 
When FILE1 or FILE2 (not both) is -, read standard input.\n\ -v FILENUM like -a FILENUM, but suppress joined output lines\n\ -1 FIELD join on this FIELD of file 1\n\ -2 FIELD join on this FIELD of file 2\n\ + --check-order check that the input is correctly sorted, even\n\ + if all input lines are pairable\n\ + --nocheck-order do not check that the input is correctly sorted\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); @@ -167,6 +196,8 @@ separated by CHAR.\n\ \n\ Important: FILE1 and FILE2 must be sorted on the join fields.\n\ E.g., use `sort -k 1b,1' if `join' has no options.\n\ +If the input is not sorted and some lines cannot be joined, a\n\ +warning message will be given.\n\ "), stdout); emit_bug_reporting_address (); } @@ -228,12 +259,49 @@ xfields (struct line *line) extract_field (line, ptr, lim - ptr); } +static struct line * +dup_line (const struct line *old) +{ + struct line *newline = xmalloc (sizeof *newline); + size_t i; + + /* Duplicate the buffer. */ + initbuffer (&newline->buf); + newline->buf.buffer = xmalloc (old->buf.size); + newline->buf.size = old->buf.size; + memcpy (newline->buf.buffer, old->buf.buffer, old->buf.length); + newline->buf.length = old->buf.length; + + /* Duplicate the field positions. */ + newline->fields = xnmalloc (old->nfields_allocated, sizeof *newline->fields); + newline->nfields = old->nfields; + newline->nfields_allocated = old->nfields_allocated; + + for (i = 0; i < old->nfields; i++) + { + newline->fields[i].len = old->fields[i].len; + newline->fields[i].beg = newline->buf.buffer + (old->fields[i].beg + - old->buf.buffer); + } + return newline; +} + +static void +freeline (struct line *line) +{ + free (line->fields); + free (line->buf.buffer); + line->buf.buffer = NULL; +} + /* Read a line from FP into LINE and split it into fields. Return true if successful. 
*/ static bool -get_line (FILE *fp, struct line *line) +get_line (FILE *fp, struct line *line, int which) { + static struct line *prevline[2]; + initbuffer (&line->buf); if (! readlinebuffer (&line->buf, fp)) @@ -249,15 +317,14 @@ get_line (FILE *fp, struct line *line) line->nfields = 0; line->fields = NULL; xfields (line); - return true; -} -static void -freeline (struct line *line) -{ - free (line->fields); - free (line->buf.buffer); - line->buf.buffer = NULL; + if (prevline[which - 1]) + { + checkorder (prevline[which - 1], line, which); + freeline (prevline[which - 1]); + } + prevline[which - 1] = dup_line (line); + return true; } static void @@ -271,12 +338,12 @@ initseq (struct seq *seq) /* Read a line from FP and add it to SEQ. Return true if successful. */ static bool -getseq (FILE *fp, struct seq *seq) +getseq (FILE *fp, struct seq *seq, int whichfile) { if (seq->count == seq->alloc) seq->lines = X2NREALLOC (seq->lines, &seq->alloc); - if (get_line (fp, &seq->lines[seq->count])) + if (get_line (fp, &seq->lines[seq->count], whichfile)) { ++seq->count; return true; @@ -284,6 +351,20 @@ getseq (FILE *fp, struct seq *seq) return false; } +/* Read a line from FP and add it to SEQ, as the first item if FIRST is + * true, else as the next. + */ +static bool +advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile) +{ + if (first) + { + freeline (&seq->lines[0]); + seq->count = 0; + } + return getseq (fp, seq, whichfile); +} + static void delseq (struct seq *seq) { @@ -354,6 +435,44 @@ keycmp (struct line const *line1, struct line const *line2) return len1 < len2 ? -1 : len1 != len2; } + + +/* Check that successive input lines PREV and CURRENT from input file + * WHATFILE are presented in order, unless the user may be relying on + * the GNU extension that input lines may be out of order if no input + * lines are unpairable. + * + * If the user specified --nocheck-order, the check is not made. + * If the user specified --check-order, the problem is fatal. 
+ * Otherwise (the default), the message is simply a warning. + * + * A message is printed at most once per input file. + */ +static void +checkorder (const struct line *prev, + const struct line *current, + int whatfile) +{ + if (check_input_order != CHECK_ORDER_DISABLED + && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable)) + { + if (!issued_disorder_warning[whatfile-1]) + { + if (keycmp (prev, current) > 0) + { + error ((check_input_order == CHECK_ORDER_ENABLED ? 1 : 0), + 0, _("File %d is not in sorted order"), whatfile); + + /* If we get to here, the message was just a warning, but we + want only to issue it once. */ + issued_disorder_warning[whatfile-1] = true; + } + } + } +} + + + /* Print field N of LINE if it exists and is nonempty, otherwise `empty_filler' if it is nonempty. */ @@ -464,13 +583,13 @@ join (FILE *fp1, FILE *fp2) struct seq seq1, seq2; struct line line; int diff; - bool eof1, eof2; + bool eof1, eof2, checktail; /* Read the first line of each file. */ initseq (&seq1); - getseq (fp1, &seq1); + getseq (fp1, &seq1, 1); initseq (&seq2); - getseq (fp2, &seq2); + getseq (fp2, &seq2, 2); while (seq1.count && seq2.count) { @@ -480,18 +599,16 @@ join (FILE *fp1, FILE *fp2) { if (print_unpairables_1) prjoin (&seq1.lines[0], &uni_blank); - freeline (&seq1.lines[0]); - seq1.count = 0; - getseq (fp1, &seq1); + advance_seq (fp1, &seq1, true, 1); + seen_unpairable = true; continue; } if (diff > 0) { if (print_unpairables_2) prjoin (&uni_blank, &seq2.lines[0]); - freeline (&seq2.lines[0]); - seq2.count = 0; - getseq (fp2, &seq2); + advance_seq (fp2, &seq2, true, 2); + seen_unpairable = true; continue; } @@ -499,7 +616,7 @@ join (FILE *fp1, FILE *fp2) match the current line from file2. */ eof1 = false; do - if (!getseq (fp1, &seq1)) + if (!advance_seq (fp1, &seq1, false, 1)) { eof1 = true; ++seq1.count; @@ -511,7 +628,7 @@ join (FILE *fp1, FILE *fp2) match the current line from file1. 
*/ eof2 = false; do - if (!getseq (fp2, &seq2)) + if (!advance_seq (fp2, &seq2, false, 2)) { eof2 = true; ++seq2.count; @@ -550,25 +667,46 @@ join (FILE *fp1, FILE *fp2) seq2.count = 0; } - if (print_unpairables_1 && seq1.count) + /* If the user did not specify --nocheck-order, then we read the + * tail ends of both inputs to verify that they are in order. We + * skip the rest of the tail once we have issued a warning for that + * file, unless we actually need to print the unpairable lines. + */ + if (check_input_order != CHECK_ORDER_DISABLED + && !(issued_disorder_warning[0] && issued_disorder_warning[1])) + checktail = true; + else + checktail = false; + + if ((print_unpairables_1 || checktail) && seq1.count) { - prjoin (&seq1.lines[0], &uni_blank); + if (print_unpairables_1) + prjoin (&seq1.lines[0], &uni_blank); freeline (&seq1.lines[0]); - while (get_line (fp1, &line)) + seen_unpairable = true; + while (get_line (fp1, &line, 1)) { - prjoin (&line, &uni_blank); + if (print_unpairables_1) + prjoin (&line, &uni_blank); freeline (&line); + if (issued_disorder_warning[0] && !print_unpairables_1) + break; } } - if (print_unpairables_2 && seq2.count) + if ((print_unpairables_2 || checktail) && seq2.count) { - prjoin (&uni_blank, &seq2.lines[0]); + if (print_unpairables_2) + prjoin (&uni_blank, &seq2.lines[0]); freeline (&seq2.lines[0]); - while (get_line (fp2, &line)) + seen_unpairable = true; + while (get_line (fp2, &line, 2)) { - prjoin (&uni_blank, &line); + if (print_unpairables_2) + prjoin (&uni_blank, &line); freeline (&line); + if (issued_disorder_warning[1] && !print_unpairables_2) + break; } } @@ -789,6 +927,9 @@ main (int argc, char **argv) atexit (close_stdout); print_pairables = true; + seen_unpairable = false; + issued_disorder_warning[0] = issued_disorder_warning[1] = false; + check_input_order = CHECK_ORDER_DEFAULT; while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:", longopts, NULL)) @@ -875,6 +1016,14 @@ main (int argc, char **argv) } 
break; + case NOCHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_DISABLED; + break; + + case CHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_ENABLED; + break; + case 1: /* Non-option argument. */ add_file_name (optarg, names, operand_status, joption_count, &nfiles, &prev_optc_status, &optc_status); @@ -935,5 +1084,8 @@ main (int argc, char **argv) if (fclose (fp2) != 0) error (EXIT_FAILURE, errno, "%s", names[1]); - exit (EXIT_SUCCESS); + if (issued_disorder_warning[0] || issued_disorder_warning[1]) + exit (EXIT_FAILURE); + else + exit (EXIT_SUCCESS); } -- cgit v1.2.3-54-g00ecf