summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJames Youngman <jay@gnu.org>2008-02-19 14:13:00 +0100
committerJim Meyering <meyering@redhat.com>2008-02-19 15:17:39 +0100
commita1e715698a038af7ff341011a2aeecf6729c8de9 (patch)
tree7786e67b64636ee6cca2a2ca720dcc9f1ef14fbf /src
parent4242d4f5c4f32374b684882a74e1b773ad01b1d6 (diff)
downloadcoreutils-a1e715698a038af7ff341011a2aeecf6729c8de9.tar.xz
join: new options: --check-order and --nocheck-order.
* src/join.c: Support --check-order and --nocheck-order. New variables check_input_order, seen_unpairable and issued_disorder_warning[]. For --check-order, verify that the input files are in sorted order. For the default case, check the order only if there are unpairable lines. (join): Perform ordering checks after reaching EOF on either input. (usage): Mention --check-order and --nocheck-order. (dup_line): Save a copy of the previously-read input line so that we can detect disorder on the input. (get_line): Temporarily save a copy of the previous line (by calling dup_line) and check relative ordering (by calling checkorder) before returning the newly-read line. (getseq, join): Tell get_line which file we are reading from. (advance_seq): New function, factoring out some of the code commonly surrounding calls to getseq. (checkorder): New function. Verifies that a pair of consecutive input lines are in sorted order. * doc/coreutils.texi (join invocation): Document the new options --check-order and --nocheck-order. * tests/join/Test.pm (tv): Added tests for --check-order and --nocheck-order. * NEWS: Mention this new feature.
Diffstat (limited to 'src')
-rw-r--r--src/join.c216
1 files changed, 184 insertions, 32 deletions
diff --git a/src/join.c b/src/join.c
index a6ca7e4e1..71b11da06 100644
--- a/src/join.c
+++ b/src/join.c
@@ -1,5 +1,5 @@
/* join - join lines of two files on a common field
- Copyright (C) 91, 1995-2006 Free Software Foundation, Inc.
+ Copyright (C) 91, 1995-2006, 2008 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -90,6 +90,12 @@ static bool print_unpairables_1, print_unpairables_2;
/* If nonzero, print pairable lines. */
static bool print_pairables;
+/* If nonzero, we have seen at least one unpairable line. */
+static bool seen_unpairable;
+
+/* Element N-1 is true once we have warned about disorder in input file N. */
+static bool issued_disorder_warning[2];
+
/* Empty output field filler. */
static char const *empty_filler;
@@ -108,9 +114,26 @@ static struct outlist *outlist_end = &outlist_head;
tab character whose value (when cast to unsigned char) equals TAB. */
static int tab = -1;
+/* How strictly to verify that each input is sorted on the join field.
+   This is a three-way setting rather than a boolean; see checkorder. */
+static enum
+ {
+ CHECK_ORDER_DEFAULT, /* check only once an unpairable line has been seen; disorder is reported as a warning */
+ CHECK_ORDER_ENABLED, /* --check-order: always check; disorder is a fatal error */
+ CHECK_ORDER_DISABLED /* --nocheck-order: never check */
+ } check_input_order;
+
+enum
+{
+ CHECK_ORDER_OPTION = CHAR_MAX + 1, /* codes returned by getopt_long for the long-only options; numbered from CHAR_MAX + 1 so they lie above any char value */
+ NOCHECK_ORDER_OPTION
+};
+
+
static struct option const longopts[] =
{
{"ignore-case", no_argument, NULL, 'i'},
+ {"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
+ {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
@@ -122,6 +145,9 @@ static struct line uni_blank;
/* If nonzero, ignore case when comparing join fields. */
static bool ignore_case;
+
+static void checkorder (const struct line *, const struct line *, int);
+
void
usage (int status)
{
@@ -153,6 +179,9 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
-v FILENUM like -a FILENUM, but suppress joined output lines\n\
-1 FIELD join on this FIELD of file 1\n\
-2 FIELD join on this FIELD of file 2\n\
+ --check-order check that the input is correctly sorted, even\n\
+ if all input lines are pairable\n\
+ --nocheck-order do not check that the input is correctly sorted\n\
"), stdout);
fputs (HELP_OPTION_DESCRIPTION, stdout);
fputs (VERSION_OPTION_DESCRIPTION, stdout);
@@ -167,6 +196,8 @@ separated by CHAR.\n\
\n\
Important: FILE1 and FILE2 must be sorted on the join fields.\n\
E.g., use `sort -k 1b,1' if `join' has no options.\n\
+If the input is not sorted and some lines cannot be joined, a\n\
+warning message will be given.\n\
"), stdout);
emit_bug_reporting_address ();
}
@@ -228,12 +259,49 @@ xfields (struct line *line)
extract_field (line, ptr, lim - ptr);
}
+/* Return a newly allocated deep copy of OLD.  Both the text buffer and
+   the field table are duplicated, and every field's start pointer is
+   rebased so that it points into the copy's buffer rather than into
+   OLD's.  The struct itself comes from xmalloc; freeline releases only
+   the buffer and the field table, so the owner must also free the
+   returned pointer when it is done with the copy.  */
+static struct line *
+dup_line (const struct line *old)
+{
+  struct line *copy = xmalloc (sizeof *copy);
+  size_t k;
+
+  /* Clone the raw text: same capacity, same number of bytes in use.  */
+  initbuffer (&copy->buf);
+  copy->buf.buffer = xmalloc (old->buf.size);
+  copy->buf.size = old->buf.size;
+  copy->buf.length = old->buf.length;
+  memcpy (copy->buf.buffer, old->buf.buffer, copy->buf.length);
+
+  /* Clone the field table with the same capacity as the original.  */
+  copy->fields = xnmalloc (old->nfields_allocated, sizeof *copy->fields);
+  copy->nfields_allocated = old->nfields_allocated;
+  copy->nfields = old->nfields;
+
+  /* Each field keeps its length and its offset from the buffer start.  */
+  for (k = 0; k < copy->nfields; k++)
+    {
+      copy->fields[k].beg = (copy->buf.buffer
+                             + (old->fields[k].beg - old->buf.buffer));
+      copy->fields[k].len = old->fields[k].len;
+    }
+
+  return copy;
+}
+
+/* Release the storage owned by LINE: its text buffer and its field
+   table.  The buffer pointer is reset to NULL afterwards; LINE itself
+   is not freed.  */
+static void
+freeline (struct line *line)
+{
+  free (line->buf.buffer);
+  line->buf.buffer = NULL;
+  free (line->fields);
+}
+
/* Read a line from FP into LINE and split it into fields.
Return true if successful. */
static bool
-get_line (FILE *fp, struct line *line)
+get_line (FILE *fp, struct line *line, int which)
{
+ static struct line *prevline[2];
+
initbuffer (&line->buf);
if (! readlinebuffer (&line->buf, fp))
@@ -249,15 +317,14 @@ get_line (FILE *fp, struct line *line)
line->nfields = 0;
line->fields = NULL;
xfields (line);
- return true;
-}
-static void
-freeline (struct line *line)
-{
- free (line->fields);
- free (line->buf.buffer);
- line->buf.buffer = NULL;
+ if (prevline[which - 1])
+ {
+ checkorder (prevline[which - 1], line, which);
+ freeline (prevline[which - 1]);
+ }
+ prevline[which - 1] = dup_line (line);
+ return true;
}
static void
@@ -271,12 +338,12 @@ initseq (struct seq *seq)
/* Read a line from FP and add it to SEQ. Return true if successful. */
static bool
-getseq (FILE *fp, struct seq *seq)
+getseq (FILE *fp, struct seq *seq, int whichfile)
{
if (seq->count == seq->alloc)
seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
- if (get_line (fp, &seq->lines[seq->count]))
+ if (get_line (fp, &seq->lines[seq->count], whichfile))
{
++seq->count;
return true;
@@ -284,6 +351,20 @@ getseq (FILE *fp, struct seq *seq)
return false;
}
+/* Read the next line of input file WHICHFILE from FP into SEQ.
+   When FIRST is true, the line held in slot 0 is released and SEQ is
+   emptied beforehand, so the new line becomes the first item;
+   otherwise the new line is appended after the existing ones.
+   Return whatever getseq returns.  */
+static bool
+advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
+{
+  if (first)
+    {
+      /* Start the sequence afresh: drop its current head.  */
+      seq->count = 0;
+      freeline (&seq->lines[0]);
+    }
+
+  return getseq (fp, seq, whichfile);
+}
+
static void
delseq (struct seq *seq)
{
@@ -354,6 +435,44 @@ keycmp (struct line const *line1, struct line const *line2)
return len1 < len2 ? -1 : len1 != len2;
}
+
+
+/* Check that successive input lines PREV and CURRENT from input file
+   WHATFILE are presented in order, unless the user may be relying on
+   the GNU extension that input lines may be out of order if no input
+   lines are unpairable.  WHATFILE is the file number; WHATFILE - 1
+   indexes issued_disorder_warning.
+
+   If the user specified --nocheck-order, the check is not made.
+   If the user specified --check-order, the problem is fatal.
+   Otherwise (the default), the check is made only once an unpairable
+   line has been seen, and the message is simply a warning.
+
+   A message is printed at most once per input file.  */
+static void
+checkorder (const struct line *prev,
+            const struct line *current,
+            int whatfile)
+{
+  /* --nocheck-order: never examine the ordering.  */
+  if (check_input_order == CHECK_ORDER_DISABLED)
+    return;
+
+  /* Default mode: disorder matters only after a line failed to pair.  */
+  if (check_input_order != CHECK_ORDER_ENABLED && !seen_unpairable)
+    return;
+
+  /* This file has already been reported; skip the comparison.  */
+  if (issued_disorder_warning[whatfile - 1])
+    return;
+
+  if (keycmp (prev, current) > 0)
+    {
+      /* A nonzero status is passed only for --check-order, making the
+         problem fatal; use EXIT_FAILURE like the rest of the file
+         rather than a bare 1.  */
+      error ((check_input_order == CHECK_ORDER_ENABLED ? EXIT_FAILURE : 0),
+             0, _("File %d is not in sorted order"), whatfile);
+
+      /* If we get to here, the message was just a warning, but we
+         want only to issue it once.  */
+      issued_disorder_warning[whatfile - 1] = true;
+    }
+}
+
+
+
/* Print field N of LINE if it exists and is nonempty, otherwise
`empty_filler' if it is nonempty. */
@@ -464,13 +583,13 @@ join (FILE *fp1, FILE *fp2)
struct seq seq1, seq2;
struct line line;
int diff;
- bool eof1, eof2;
+ bool eof1, eof2, checktail;
/* Read the first line of each file. */
initseq (&seq1);
- getseq (fp1, &seq1);
+ getseq (fp1, &seq1, 1);
initseq (&seq2);
- getseq (fp2, &seq2);
+ getseq (fp2, &seq2, 2);
while (seq1.count && seq2.count)
{
@@ -480,18 +599,16 @@ join (FILE *fp1, FILE *fp2)
{
if (print_unpairables_1)
prjoin (&seq1.lines[0], &uni_blank);
- freeline (&seq1.lines[0]);
- seq1.count = 0;
- getseq (fp1, &seq1);
+ advance_seq (fp1, &seq1, true, 1);
+ seen_unpairable = true;
continue;
}
if (diff > 0)
{
if (print_unpairables_2)
prjoin (&uni_blank, &seq2.lines[0]);
- freeline (&seq2.lines[0]);
- seq2.count = 0;
- getseq (fp2, &seq2);
+ advance_seq (fp2, &seq2, true, 2);
+ seen_unpairable = true;
continue;
}
@@ -499,7 +616,7 @@ join (FILE *fp1, FILE *fp2)
match the current line from file2. */
eof1 = false;
do
- if (!getseq (fp1, &seq1))
+ if (!advance_seq (fp1, &seq1, false, 1))
{
eof1 = true;
++seq1.count;
@@ -511,7 +628,7 @@ join (FILE *fp1, FILE *fp2)
match the current line from file1. */
eof2 = false;
do
- if (!getseq (fp2, &seq2))
+ if (!advance_seq (fp2, &seq2, false, 2))
{
eof2 = true;
++seq2.count;
@@ -550,25 +667,46 @@ join (FILE *fp1, FILE *fp2)
seq2.count = 0;
}
- if (print_unpairables_1 && seq1.count)
+ /* If the user did not specify --nocheck-order, then we read the
+ * tail ends of both inputs to verify that they are in order. We
+ * skip the rest of the tail once we have issued a warning for that
+ * file, unless we actually need to print the unpairable lines.
+ */
+ if (check_input_order != CHECK_ORDER_DISABLED
+ && !(issued_disorder_warning[0] && issued_disorder_warning[1]))
+ checktail = true;
+ else
+ checktail = false;
+
+ if ((print_unpairables_1 || checktail) && seq1.count)
{
- prjoin (&seq1.lines[0], &uni_blank);
+ if (print_unpairables_1)
+ prjoin (&seq1.lines[0], &uni_blank);
freeline (&seq1.lines[0]);
- while (get_line (fp1, &line))
+ seen_unpairable = true;
+ while (get_line (fp1, &line, 1))
{
- prjoin (&line, &uni_blank);
+ if (print_unpairables_1)
+ prjoin (&line, &uni_blank);
freeline (&line);
+ if (issued_disorder_warning[0] && !print_unpairables_1)
+ break;
}
}
- if (print_unpairables_2 && seq2.count)
+ if ((print_unpairables_2 || checktail) && seq2.count)
{
- prjoin (&uni_blank, &seq2.lines[0]);
+ if (print_unpairables_2)
+ prjoin (&uni_blank, &seq2.lines[0]);
freeline (&seq2.lines[0]);
- while (get_line (fp2, &line))
+ seen_unpairable = true;
+ while (get_line (fp2, &line, 2))
{
- prjoin (&uni_blank, &line);
+ if (print_unpairables_2)
+ prjoin (&uni_blank, &line);
freeline (&line);
+ if (issued_disorder_warning[1] && !print_unpairables_2)
+ break;
}
}
@@ -789,6 +927,9 @@ main (int argc, char **argv)
atexit (close_stdout);
print_pairables = true;
+ seen_unpairable = false;
+ issued_disorder_warning[0] = issued_disorder_warning[1] = false;
+ check_input_order = CHECK_ORDER_DEFAULT;
while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
longopts, NULL))
@@ -875,6 +1016,14 @@ main (int argc, char **argv)
}
break;
+ case NOCHECK_ORDER_OPTION:
+ check_input_order = CHECK_ORDER_DISABLED;
+ break;
+
+ case CHECK_ORDER_OPTION:
+ check_input_order = CHECK_ORDER_ENABLED;
+ break;
+
case 1: /* Non-option argument. */
add_file_name (optarg, names, operand_status, joption_count,
&nfiles, &prev_optc_status, &optc_status);
@@ -935,5 +1084,8 @@ main (int argc, char **argv)
if (fclose (fp2) != 0)
error (EXIT_FAILURE, errno, "%s", names[1]);
- exit (EXIT_SUCCESS);
+ if (issued_disorder_warning[0] || issued_disorder_warning[1])
+ exit (EXIT_FAILURE);
+ else
+ exit (EXIT_SUCCESS);
}