/* join - join lines of two files on a common field Copyright (C) 1991, 1995 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. Written by Mike Haertel, mike@gnu.ai.mit.edu. */ #include #ifdef __GNUC__ #define alloca __builtin_alloca #else /* not __GNUC__ */ #if HAVE_ALLOCA_H #include #else /* not HAVE_ALLOCA_H */ #ifdef _AIX #pragma alloca #else /* not _AIX */ char *alloca (); #endif /* not _AIX */ #endif /* not HAVE_ALLOCA_H */ #endif /* not __GNUC__ */ /* Get isblank from GNU libc. */ #define _GNU_SOURCE #include #define NDEBUG #include #include #include #if HAVE_LIMITS_H # include #endif #ifndef UINT_MAX # define UINT_MAX ((unsigned int) ~(unsigned int) 0) #endif #ifndef INT_MAX # define INT_MAX ((int) (UINT_MAX >> 1)) #endif #if _LIBC || STDC_HEADERS # define TOLOWER(c) tolower (c) #else # define TOLOWER(c) (ISUPPER (c) ? tolower (c) : (c)) #endif #include "system.h" #include "version.h" #include "long-options.h" #include "xstrtol.h" #include "error.h" #define join system_join char *xmalloc (); char *xrealloc (); /* Undefine, to avoid warning about redefinition on some systems. */ #undef min #undef max #define min(A, B) ((A) < (B) ? (A) : (B)) #define max(A, B) ((A) > (B) ? (A) : (B)) /* An element of the list identifying which fields to print for each output line. */ struct outlist { /* File number: 0, 1, or 2. 0 means use the join field. 1 means use the first file argument, 2 the second. */ int file; /* Field index (zero-based), specified only when FILE is 1 or 2. */ int field; struct outlist *next; }; /* A field of a line. */ struct field { const char *beg; /* First character in field. */ size_t len; /* The length of the field. */ }; /* A line read from an input file. Newlines are not stored. */ struct line { char *beg; /* First character in line. */ char *lim; /* Character after last character in line. */ int nfields; /* Number of elements in `fields'. */ int nfields_allocated; /* Number of elements in `fields'. */ struct field *fields; }; /* One or more consecutive lines read from a file that all have the same join field value. */ struct seq { int count; /* Elements used in `lines'. */ int alloc; /* Elements allocated in `lines'. */ struct line *lines; }; /* The name this program was run with. */ char *program_name; /* If nonzero, print unpairable lines in file 1 or 2. */ static int print_unpairables_1, print_unpairables_2; /* If nonzero, print pairable lines. */ static int print_pairables; /* Empty output field filler. */ static char *empty_filler; /* Field to join on. */ static int join_field_1, join_field_2; /* List of fields to print. */ static struct outlist outlist_head; /* Last element in `outlist', where a new element can be added. */ static struct outlist *outlist_end = &outlist_head; /* Tab character separating fields; if this is NUL fields are separated by any nonempty string of white space, otherwise by exactly one tab character. */ static char tab; /* When using getopt_long_only, no long option can start with a character that is a short option. */ static struct option const longopts[] = { {"ignore-case", no_argument, NULL, 'i'}, {"j", required_argument, NULL, 'j'}, {"j1", required_argument, NULL, '1'}, {"j2", required_argument, NULL, '2'}, {NULL, 0, NULL, 0} }; /* Used to print non-joining lines */ static struct line uni_blank; /* If nonzero, ignore case when comparing join fields. */ static int ignore_case; static void usage (int status) { if (status != 0) fprintf (stderr, _("Try `%s --help' for more information.\n"), program_name); else { printf (_("\ Usage: %s [OPTION]... FILE1 FILE2\n\ "), program_name); printf (_("\ For each pair of input lines with identical join fields, write a line to\n\ standard output. The default join field is the first, delimited\n\ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ \n\ -a SIDE print unpairable lines coming from file SIDE\n\ -e EMPTY replace missing input fields with EMPTY\n\ -i, --ignore-case ignore differences in case when comparing fields\n\ -j FIELD (Obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\ -j1 FIELD (Obsolescent) equivalent to `-1 FIELD'\n\ -j2 FIELD (Obsolescent) equivalent to `-2 FIELD'\n\ -1 FIELD join on this FIELD of file 1\n\ -2 FIELD join on this FIELD of file 2\n\ -o FORMAT obey FORMAT while constructing output line\n\ -t CHAR use CHAR as input and output field separator\n\ -v SIDE like -a SIDE, but suppress joined output lines\n\ --help display this help and exit\n\ --version output version information and exit\n\ \n\ Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\ else fields are separated by CHAR. Any FIELD is a field number counted\n\ from 1. FORMAT is one or more comma or blank separated specifications,\n\ each being `SIDE.FIELD' or `0'. Default FORMAT outputs the join field,\n\ the remaining fields from FILE1, the remaining fields from FILE2, all\n\ separated by CHAR.\n\ ")); } exit (status); } /* Like memcmp, but ignore differences in case. */ static int memcasecmp (const void *vs1, const void *vs2, size_t n) { int i; unsigned char *s1 = (unsigned char *) vs1; unsigned char *s2 = (unsigned char *) vs2; for (i = 0; i < n; i++) { unsigned char u1 = *s1++; unsigned char u2 = *s2++; if (TOLOWER (u1) != TOLOWER (u2)) return TOLOWER (u1) - TOLOWER (u2); } return 0; } static void ADD_FIELD (struct line *line, const char *field, size_t len) { if (line->nfields >= line->nfields_allocated) { line->nfields_allocated = (3 * line->nfields_allocated) / 2 + 1; line->fields = (struct field *) xrealloc ((char *) line->fields, (line->nfields_allocated * sizeof (struct field))); } line->fields[line->nfields].beg = field; line->fields[line->nfields].len = len; ++(line->nfields); } /* Fill in the `fields' structure in LINE. */ static void xfields (struct line *line) { int i; register char *ptr, *lim; ptr = line->beg; lim = line->lim; for (i = 0; ptr < lim; ++i) { if (tab) { char *beg; beg = ptr; while (ptr < lim && *ptr != tab) ++ptr; ADD_FIELD (line, beg, ptr - beg); if (ptr < lim) ++ptr; } else { char *beg; beg = ptr; while (ptr < lim && !ISSPACE (*ptr)) ++ptr; ADD_FIELD (line, beg, ptr - beg); while (ptr < lim && ISSPACE (*ptr)) ++ptr; } } if (ptr > line->beg && ((tab && ISSPACE (ptr[-1])) || ptr[-1] == tab)) { /* Add one more (empty) field because the last character of the line was a delimiter. */ ADD_FIELD (line, NULL, 0); } } /* Read a line from FP into LINE and split it into fields. Return 0 if EOF, 1 otherwise. */ static int get_line (FILE *fp, struct line *line) { static int linesize = 80; int c, i; char *ptr; if (feof (fp)) return 0; ptr = xmalloc (linesize); for (i = 0; (c = getc (fp)) != EOF && c != '\n'; ++i) { if (i == linesize) { linesize *= 2; ptr = xrealloc (ptr, linesize); } ptr[i] = c; } if (c == EOF && i == 0) { free (ptr); return 0; } line->beg = ptr; line->lim = line->beg + i; line->nfields_allocated = 0; line->nfields = 0; line->fields = NULL; xfields (line); return 1; } static void freeline (struct line *line) { free ((char *) line->fields); free (line->beg); line->beg = NULL; } static void initseq (struct seq *seq) { seq->count = 0; seq->alloc = 1; seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line)); } /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */ static int getseq (FILE *fp, struct seq *seq) { if (seq->count == seq->alloc) { seq->alloc *= 2; seq->lines = (struct line *) xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line)); } if (get_line (fp, &seq->lines[seq->count])) { ++seq->count; return 1; } return 0; } static void delseq (struct seq *seq) { int i; for (i = 0; i < seq->count; i++) if (seq->lines[i].beg) freeline (&seq->lines[i]); free ((char *) seq->lines); } /* Return <0 if the join field in LINE1 compares less than the one in LINE2; >0 if it compares greater; 0 if it compares equal. */ static int keycmp (struct line *line1, struct line *line2) { const char *beg1, *beg2; /* Start of field to compare in each file. */ int len1, len2; /* Length of fields to compare. */ int diff; if (join_field_1 < line1->nfields) { beg1 = line1->fields[join_field_1].beg; len1 = line1->fields[join_field_1].len; } else { beg1 = NULL; len1 = 0; } if (join_field_2 < line2->nfields) { beg2 = line2->fields[join_field_2].beg; len2 = line2->fields[join_field_2].len; } else { beg2 = NULL; len2 = 0; } if (len1 == 0) return len2 == 0 ? 0 : -1; if (len2 == 0) return 1; /* Use an if-statement here rather than a function variable to avoid portability hassles of getting a non-conflicting declaration of memcmp. */ if (ignore_case) diff = memcasecmp (beg1, beg2, min (len1, len2)); else diff = memcmp (beg1, beg2, min (len1, len2)); if (diff) return diff; return len1 - len2; } /* Print field N of LINE if it exists and is nonempty, otherwise `empty_filler' if it is nonempty. */ static void prfield (int n, struct line *line) { int len; if (n < line->nfields) { len = line->fields[n].len; if (len) fwrite (line->fields[n].beg, 1, len, stdout); else if (empty_filler) fputs (empty_filler, stdout); } else if (empty_filler) fputs (empty_filler, stdout); } /* Print the join of LINE1 and LINE2. */ static void prjoin (struct line *line1, struct line *line2) { const struct outlist *outlist; outlist = outlist_head.next; if (outlist) { const struct outlist *o; o = outlist; while (1) { int field; struct line *line; if (o->file == 0) { if (line1 == &uni_blank) { line = line2; field = join_field_2; } else { line = line1; field = join_field_1; } } else { line = (o->file == 1 ? line1 : line2); field = o->field; } prfield (field, line); o = o->next; if (o == NULL) break; putchar (tab ? tab : ' '); } putchar ('\n'); } else { int i; if (line1 == &uni_blank) { struct line *t; t = line1; line1 = line2; line2 = t; } prfield (join_field_1, line1); for (i = 0; i < join_field_1 && i < line1->nfields; ++i) { putchar (tab ? tab : ' '); prfield (i, line1); } for (i = join_field_1 + 1; i < line1->nfields; ++i) { putchar (tab ? tab : ' '); prfield (i, line1); } for (i = 0; i < join_field_2 && i < line2->nfields; ++i) { putchar (tab ? tab : ' '); prfield (i, line2); } for (i = join_field_2 + 1; i < line2->nfields; ++i) { putchar (tab ? tab : ' '); prfield (i, line2); } putchar ('\n'); } } /* Print the join of the files in FP1 and FP2. */ static void join (FILE *fp1, FILE *fp2) { struct seq seq1, seq2; struct line line; int diff, i, j, eof1, eof2; /* Read the first line of each file. */ initseq (&seq1); getseq (fp1, &seq1); initseq (&seq2); getseq (fp2, &seq2); while (seq1.count && seq2.count) { diff = keycmp (&seq1.lines[0], &seq2.lines[0]); if (diff < 0) { if (print_unpairables_1) prjoin (&seq1.lines[0], &uni_blank); freeline (&seq1.lines[0]); seq1.count = 0; getseq (fp1, &seq1); continue; } if (diff > 0) { if (print_unpairables_2) prjoin (&uni_blank, &seq2.lines[0]); freeline (&seq2.lines[0]); seq2.count = 0; getseq (fp2, &seq2); continue; } /* Keep reading lines from file1 as long as they continue to match the current line from file2. */ eof1 = 0; do if (!getseq (fp1, &seq1)) { eof1 = 1; ++seq1.count; break; } while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0])); /* Keep reading lines from file2 as long as they continue to match the current line from file1. */ eof2 = 0; do if (!getseq (fp2, &seq2)) { eof2 = 1; ++seq2.count; break; } while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1])); if (print_pairables) { for (i = 0; i < seq1.count - 1; ++i) for (j = 0; j < seq2.count - 1; ++j) prjoin (&seq1.lines[i], &seq2.lines[j]); } for (i = 0; i < seq1.count - 1; ++i) freeline (&seq1.lines[i]); if (!eof1) { seq1.lines[0] = seq1.lines[seq1.count - 1]; seq1.count = 1; } else seq1.count = 0; for (i = 0; i < seq2.count - 1; ++i) freeline (&seq2.lines[i]); if (!eof2) { seq2.lines[0] = seq2.lines[seq2.count - 1]; seq2.count = 1; } else seq2.count = 0; } if (print_unpairables_1 && seq1.count) { prjoin (&seq1.lines[0], &uni_blank); freeline (&seq1.lines[0]); while (get_line (fp1, &line)) { prjoin (&line, &uni_blank); freeline (&line); } } if (print_unpairables_2 && seq2.count) { prjoin (&uni_blank, &seq2.lines[0]); freeline (&seq2.lines[0]); while (get_line (fp2, &line)) { prjoin (&uni_blank, &line); freeline (&line); } } delseq (&seq1); delseq (&seq2); } /* Add a field spec for field FIELD of file FILE to `outlist'. */ static void add_field (int file, int field) { struct outlist *o; assert (file == 0 || file == 1 || file == 2); assert (field >= 0); o = (struct outlist *) xmalloc (sizeof (struct outlist)); o->file = file; o->field = field; o->next = NULL; /* Add to the end of the list so the fields are in the right order. */ outlist_end->next = o; outlist_end = o; } /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based. If S is valid, return zero. Otherwise, give a diagnostic, don't update *FILE_INDEX or *FIELD_INDEX, and return nonzero. */ static int decode_field_spec (const char *s, int *file_index, int *field_index) { int valid = 0; /* The first character must be 0, 1, or 2. */ switch (s[0]) { case '0': if (s[1] == '\0') { *file_index = 0; /* Leave *field_index undefined. */ valid = 1; } else { /* `0' must be all alone -- no `.FIELD'. */ error (0, 0, _("invalid field specifier: `%s'"), s); } break; case '1': case '2': if (s[1] == '.' && s[2] != '\0') { strtol_error s_err; long int tmp_long; s_err = xstrtol (s + 2, NULL, 10, &tmp_long, NULL); if (s_err != LONGINT_OK || tmp_long <= 0 || tmp_long > INT_MAX) { error (0, 0, _("invalid field number: `%s'"), s + 2); } else { *file_index = s[0] - '0'; /* Convert to a zero-based index. */ *field_index = (int) tmp_long - 1; valid = 1; } } break; default: error (0, 0, _("invalid file number in field spec: `%s'"), s); break; } return !valid; } /* Add the comma or blank separated field spec(s) in STR to `outlist'. Return nonzero to indicate failure. */ static int add_field_list (const char *c_str) { char *p, *str; /* Make a writable copy of c_str. */ str = (char *) alloca (strlen (c_str) + 1); strcpy (str, c_str); p = str; do { int invalid; int file_index, field_index; char *spec_item = p; p = strpbrk (p, ", \t"); if (p) *p++ = 0; invalid = decode_field_spec (spec_item, &file_index, &field_index); if (invalid) return 1; add_field (file_index, field_index); uni_blank.nfields = max (uni_blank.nfields, field_index); } while (p); return 0; } /* Create a blank line with COUNT fields separated by tabs. */ void make_blank (struct line *blank, int count) { int i; blank->nfields = count; blank->beg = xmalloc (blank->nfields + 1); blank->fields = (struct field *) xmalloc (sizeof (struct field) * count); for (i = 0; i < blank->nfields; i++) { blank->beg[i] = '\t'; blank->fields[i].beg = &blank->beg[i]; blank->fields[i].len = 0; } blank->beg[i] = '\0'; blank->lim = &blank->beg[i]; } void main (int argc, char **argv) { char *names[2]; FILE *fp1, *fp2; int optc, prev_optc = 0, nfiles; program_name = argv[0]; /* Initialize this before parsing options. In parsing options, it may be increased. */ uni_blank.nfields = 1; parse_long_options (argc, argv, "join", version_string, usage); nfiles = 0; print_pairables = 1; while ((optc = getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts, (int *) 0)) != EOF) { long int val; switch (optc) { case 0: break; case 'v': print_pairables = 0; /* Fall through. */ case 'a': if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK || (val != 1 && val != 2)) error (2, 0, _("invalid field number: `%s'"), optarg); if (val == 1) print_unpairables_1 = 1; else print_unpairables_2 = 1; break; case 'e': empty_filler = optarg; break; case 'i': ignore_case = 1; break; case '1': if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK || val <= 0 || val > INT_MAX) { error (2, 0, _("invalid field number for file 1: `%s'"), optarg); } join_field_1 = (int) val - 1; break; case '2': if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK || val <= 0 || val > INT_MAX) error (2, 0, _("invalid field number for file 2: `%s'"), optarg); join_field_2 = (int) val - 1; break; case 'j': if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK || val <= 0 || val > INT_MAX) error (2, 0, _("invalid field number: `%s'"), optarg); join_field_1 = join_field_2 = (int) val - 1; break; case 'o': if (add_field_list (optarg)) exit (1); break; case 't': tab = *optarg; break; case 1: /* Non-option argument. */ if (prev_optc == 'o' && optind <= argc - 2) { if (add_field_list (optarg)) exit (1); /* Might be continuation of args to -o. */ continue; /* Don't change `prev_optc'. */ } if (nfiles > 1) { error (0, 0, _("too many non-option arguments")); usage (1); } names[nfiles++] = optarg; break; case '?': usage (1); } prev_optc = optc; } /* Now that we've seen the options, we can construct the blank line structure. */ make_blank (&uni_blank, uni_blank.nfields); if (nfiles != 2) { error (0, 0, _("too few non-option arguments")); usage (1); } fp1 = strcmp (names[0], "-") ? fopen (names[0], "r") : stdin; if (!fp1) error (1, errno, "%s", names[0]); fp2 = strcmp (names[1], "-") ? fopen (names[1], "r") : stdin; if (!fp2) error (1, errno, "%s", names[1]); if (fp1 == fp2) error (1, errno, _("both files cannot be standard input")); join (fp1, fp2); if (fp1 != stdin && fclose (fp1) == EOF) error (1, errno, "%s", names[0]); if (fp2 != stdin && fclose (fp2) == EOF) error (1, errno, "%s", names[1]); if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF) error (1, errno, "-"); if (ferror (stdout) || fclose (stdout) == EOF) error (1, errno, _("write error")); exit (0); }