diff options
Diffstat (limited to 'src/join.c')
-rw-r--r-- | src/join.c | 690 |
1 files changed, 690 insertions, 0 deletions
diff --git a/src/join.c b/src/join.c new file mode 100644 index 000000000..9ac82e0fd --- /dev/null +++ b/src/join.c @@ -0,0 +1,690 @@ +/* join - join lines of two files on a common field + Copyright (C) 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Written by Mike Haertel, mike@gnu.ai.mit.edu. */ + +#define _GNU_SOURCE +#include <ctype.h> +#ifndef isblank +#define isblank(c) ((c) == ' ' || (c) == '\t') +#endif +#include <stdio.h> +#include <sys/types.h> +#include <getopt.h> +#include "system.h" + +#ifdef isascii +#define ISSPACE(c) (isascii(c) && isspace(c)) +#define ISDIGIT(c) (isascii(c) && isdigit(c)) +#else +#define ISSPACE(c) isspace(c) +#define ISDIGIT(c) isdigit(c) +#endif + +char *xmalloc (); +char *xrealloc (); +void error (); +static void usage (); + +#define min(A, B) ((A) < (B) ? (A) : (B)) + +/* An element of the list describing the format of each + output line. */ +struct outlist +{ + int file; /* File to take field from (1 or 2). */ + int field; /* Field number to print. */ + struct outlist *next; +}; + +/* A field of a line. */ +struct field +{ + char *beg; /* First character in field. */ + char *lim; /* Character after last character in field. */ +}; + +/* A line read from an input file. Newlines are not stored. */ +struct line +{ + char *beg; /* First character in line. */ + char *lim; /* Character after last character in line. */ + int nfields; /* Number of elements in `fields'. */ + struct field *fields; +}; + +/* One or more consecutive lines read from a file that all have the + same join field value. */ +struct seq +{ + int count; /* Elements used in `lines'. */ + int alloc; /* Elements allocated in `lines'. */ + struct line *lines; +}; + +/* If nonzero, print unpairable lines in file 1 or 2. */ +static int print_unpairables_1, print_unpairables_2; + +/* If nonzero, print pairable lines. */ +static int print_pairables; + +/* Empty output field filler. */ +static char *empty_filler; + +/* Field to join on. */ +static int join_field_1, join_field_2; + +/* List of fields to print. */ +struct outlist *outlist; + +/* Last element in `outlist', where a new element can be added. */ +struct outlist *outlist_end; + +/* Tab character separating fields; if this is NUL fields are separated + by any nonempty string of white space, otherwise by exactly one + tab character. */ +static char tab; + +/* The name this program was run with. */ +char *program_name; + +/* Fill in the `fields' structure in LINE. */ + +static void +xfields (line) + struct line *line; +{ + static int nfields = 2; + int i; + register char *ptr, *lim; + + line->fields = (struct field *) xmalloc (nfields * sizeof (struct field)); + + ptr = line->beg; + lim = line->lim; + + for (i = 0; ptr < lim; ++i) + { + if (i == nfields) + { + nfields *= 2; + line->fields = (struct field *) + xrealloc ((char *) line->fields, nfields * sizeof (struct field)); + } + if (tab) + { + line->fields[i].beg = ptr; + while (ptr < lim && *ptr != tab) + ++ptr; + line->fields[i].lim = ptr; + if (ptr < lim) + ++ptr; + } + else + { + line->fields[i].beg = ptr; + while (ptr < lim && !ISSPACE (*ptr)) + ++ptr; + line->fields[i].lim = ptr; + while (ptr < lim && ISSPACE (*ptr)) + ++ptr; + } + } + + line->nfields = i; +} + +/* Read a line from FP into LINE and split it into fields. + Return 0 if EOF, 1 otherwise. */ + +static int +get_line (fp, line) + FILE *fp; + struct line *line; +{ + static int linesize = 80; + int c, i; + char *ptr; + + if (feof (fp)) + return 0; + + ptr = xmalloc (linesize); + + for (i = 0; (c = getc (fp)) != EOF && c != '\n'; ++i) + { + if (i == linesize) + { + linesize *= 2; + ptr = xrealloc (ptr, linesize); + } + ptr[i] = c; + } + + if (c == EOF && i == 0) + { + free (ptr); + return 0; + } + + line->beg = ptr; + line->lim = line->beg + i; + xfields (line); + return 1; +} + +static void +freeline (line) + struct line *line; +{ + free ((char *) line->fields); + free (line->beg); +} + +static void +initseq (seq) + struct seq *seq; +{ + seq->count = 0; + seq->alloc = 1; + seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line)); +} + +/* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */ + +static int +getseq (fp, seq) + FILE *fp; + struct seq *seq; +{ + if (seq->count == seq->alloc) + { + seq->alloc *= 2; + seq->lines = (struct line *) + xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line)); + } + + if (get_line (fp, &seq->lines[seq->count])) + { + ++seq->count; + return 1; + } + return 0; +} + +static void +delseq (seq) + struct seq *seq; +{ + free ((char *) seq->lines); +} + +/* Return <0 if the join field in LINE1 compares less than the one in LINE2; + >0 if it compares greater; 0 if it compares equal. */ + +static int +keycmp (line1, line2) + struct line *line1; + struct line *line2; +{ + char *beg1, *beg2; /* Start of field to compare in each file. */ + int len1, len2; /* Length of fields to compare. */ + int diff; + + if (join_field_1 < line1->nfields) + { + beg1 = line1->fields[join_field_1].beg; + len1 = line1->fields[join_field_1].lim + - line1->fields[join_field_1].beg; + } + else + { + beg1 = NULL; + len1 = 0; + } + + if (join_field_2 < line2->nfields) + { + beg2 = line2->fields[join_field_2].beg; + len2 = line2->fields[join_field_2].lim + - line2->fields[join_field_2].beg; + } + else + { + beg2 = NULL; + len2 = 0; + } + + if (len1 == 0) + return len2 == 0 ? 0 : -1; + if (len2 == 0) + return 1; + diff = memcmp (beg1, beg2, min (len1, len2)); + if (diff) + return diff; + return len1 - len2; +} + +/* Print field N of LINE if it exists and is nonempty, otherwise + `empty_filler' if it is nonempty. */ + +static void +prfield (n, line) + int n; + struct line *line; +{ + int len; + + if (n < line->nfields) + { + len = line->fields[n].lim - line->fields[n].beg; + if (len) + fwrite (line->fields[n].beg, 1, len, stdout); + else if (empty_filler) + fputs (empty_filler, stdout); + } + else if (empty_filler) + fputs (empty_filler, stdout); +} + +/* Print LINE, with its fields separated by `tab'. */ + +static void +prline (line) + struct line *line; +{ + int i; + + for (i = 0; i < line->nfields; ++i) + { + prfield (i, line); + if (i == line->nfields - 1) + putchar ('\n'); + else + putchar (tab ? tab : ' '); + } +} + +/* Print the join of LINE1 and LINE2. */ + +static void +prjoin (line1, line2) + struct line *line1; + struct line *line2; +{ + if (outlist) + { + struct outlist *o; + + prfield (outlist->field - 1, outlist->file == 1 ? line1 : line2); + for (o = outlist->next; o; o = o->next) + { + putchar (tab ? tab : ' '); + prfield (o->field - 1, o->file == 1 ? line1 : line2); + } + putchar ('\n'); + } + else + { + int i; + + prfield (join_field_1, line1); + for (i = 0; i < join_field_1 && i < line1->nfields; ++i) + { + putchar (tab ? tab : ' '); + prfield (i, line1); + } + for (i = join_field_1 + 1; i < line1->nfields; ++i) + { + putchar (tab ? tab : ' '); + prfield (i, line1); + } + + for (i = 0; i < join_field_2 && i < line2->nfields; ++i) + { + putchar (tab ? tab : ' '); + prfield (i, line2); + } + for (i = join_field_2 + 1; i < line2->nfields; ++i) + { + putchar (tab ? tab : ' '); + prfield (i, line2); + } + putchar ('\n'); + } +} + +/* Print the join of the files in FP1 and FP2. */ + +static void +join (fp1, fp2) + FILE *fp1; + FILE *fp2; +{ + struct seq seq1, seq2; + struct line line; + int diff, i, j, eof1, eof2; + + /* Read the first line of each file. */ + initseq (&seq1); + getseq (fp1, &seq1); + initseq (&seq2); + getseq (fp2, &seq2); + + while (seq1.count && seq2.count) + { + diff = keycmp (&seq1.lines[0], &seq2.lines[0]); + if (diff < 0) + { + if (print_unpairables_1) + prline (&seq1.lines[0]); + freeline (&seq1.lines[0]); + seq1.count = 0; + getseq (fp1, &seq1); + continue; + } + if (diff > 0) + { + if (print_unpairables_2) + prline (&seq2.lines[0]); + freeline (&seq2.lines[0]); + seq2.count = 0; + getseq (fp2, &seq2); + continue; + } + + /* Keep reading lines from file1 as long as they continue to + match the current line from file2. */ + eof1 = 0; + do + if (!getseq (fp1, &seq1)) + { + eof1 = 1; + ++seq1.count; + break; + } + while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0])); + + /* Keep reading lines from file2 as long as they continue to + match the current line from file1. */ + eof2 = 0; + do + if (!getseq (fp2, &seq2)) + { + eof2 = 1; + ++seq2.count; + break; + } + while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1])); + + if (print_pairables) + { + for (i = 0; i < seq1.count - 1; ++i) + for (j = 0; j < seq2.count - 1; ++j) + prjoin (&seq1.lines[i], &seq2.lines[j]); + } + + for (i = 0; i < seq1.count - 1; ++i) + freeline (&seq1.lines[i]); + if (!eof1) + { + seq1.lines[0] = seq1.lines[seq1.count - 1]; + seq1.count = 1; + } + else + seq1.count = 0; + + for (i = 0; i < seq2.count - 1; ++i) + freeline (&seq2.lines[i]); + if (!eof2) + { + seq2.lines[0] = seq2.lines[seq2.count - 1]; + seq2.count = 1; + } + else + seq2.count = 0; + } + + if (print_unpairables_1 && seq1.count) + { + prline (&seq1.lines[0]); + freeline (&seq1.lines[0]); + while (get_line (fp1, &line)) + { + prline (&line); + freeline (&line); + } + } + + if (print_unpairables_2 && seq2.count) + { + prline (&seq2.lines[0]); + freeline (&seq2.lines[0]); + while (get_line (fp2, &line)) + { + prline (&line); + freeline (&line); + } + } + + delseq (&seq1); + delseq (&seq2); +} + +/* Add a field spec for field FIELD of file FILE to `outlist' and return 1, + unless either argument is invalid; then just return 0. */ + +static int +add_field (file, field) + int file; + int field; +{ + struct outlist *o; + + if (file < 1 || file > 2 || field < 1) + return 0; + o = (struct outlist *) xmalloc (sizeof (struct outlist)); + o->file = file; + o->field = field; + o->next = NULL; + + /* Add to the end of the list so the fields are in the right order. */ + if (outlist == NULL) + outlist = o; + else + outlist_end->next = o; + outlist_end = o; + + return 1; +} + +/* Add the comma or blank separated field spec(s) in STR to `outlist'. + Return the number of fields added. */ + +static int +add_field_list (str) + char *str; +{ + int added = 0; + int file = -1, field = -1; + int dot_found = 0; + + for (; *str; str++) + { + if (*str == ',' || isblank (*str)) + { + added += add_field (file, field); + file = field = -1; + dot_found = 0; + } + else if (*str == '.') + dot_found = 1; + else if (ISDIGIT (*str)) + { + if (!dot_found) + { + if (file == -1) + file = 0; + file = file * 10 + *str - '0'; + } + else + { + if (field == -1) + field = 0; + field = field * 10 + *str - '0'; + } + } + else + return 0; + } + + added += add_field (file, field); + return added; +} + +/* When using getopt_long_only, no long option can start with + a character that is a short option. */ +static struct option longopts[] = +{ + {"j", 1, NULL, 'j'}, + {"j1", 1, NULL, '1'}, + {"j2", 1, NULL, '2'}, + {NULL, 0, NULL, 0} +}; + +void +main (argc, argv) + int argc; + char *argv[]; +{ + char *names[2]; + FILE *fp1, *fp2; + int optc, prev_optc = 0, nfiles, val; + + program_name = argv[0]; + nfiles = 0; + print_pairables = 1; + + while ((optc = getopt_long_only (argc, argv, "-a:e:1:2:o:t:v:", longopts, + (int *) 0)) != EOF) + { + switch (optc) + { + case 'a': + val = atoi (optarg); + if (val == 1) + print_unpairables_1 = 1; + else if (val == 2) + print_unpairables_2 = 1; + else + error (2, 0, "invalid file number for `-a'"); + break; + + case 'e': + empty_filler = optarg; + break; + + case '1': + val = atoi (optarg); + if (val <= 0) + error (2, 0, "invalid field number for `-1'"); + join_field_1 = val - 1; + break; + + case '2': + val = atoi (optarg); + if (val <= 0) + error (2, 0, "invalid field number for `-2'"); + join_field_2 = val - 1; + break; + + case 'j': + val = atoi (optarg); + if (val <= 0) + error (2, 0, "invalid field number for `-j'"); + join_field_1 = join_field_2 = val - 1; + break; + + case 'o': + if (add_field_list (optarg) == 0) + error (2, 0, "invalid field list for `-o'"); + break; + + case 't': + tab = *optarg; + break; + + case 'v': + val = atoi (optarg); + if (val == 1) + print_unpairables_1 = 1; + else if (val == 2) + print_unpairables_2 = 1; + else + error (2, 0, "invalid file number for `-v'"); + print_pairables = 0; + break; + + case 1: /* Non-option argument. */ + if (prev_optc == 'o') + { + /* Might be continuation of args to -o. */ + if (add_field_list (optarg) > 0) + continue; /* Don't change `prev_optc'. */ + } + + if (nfiles > 1) + usage (); + names[nfiles++] = optarg; + break; + + case '?': + usage (); + } + prev_optc = optc; + } + + if (nfiles != 2) + usage (); + + fp1 = strcmp (names[0], "-") ? fopen (names[0], "r") : stdin; + if (!fp1) + error (1, errno, "%s", names[0]); + fp2 = strcmp (names[1], "-") ? fopen (names[1], "r") : stdin; + if (!fp2) + error (1, errno, "%s", names[1]); + if (fp1 == fp2) + error (1, errno, "both files cannot be standard input"); + join (fp1, fp2); + + if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF) + error (1, errno, "-"); + if (ferror (stdout) || fclose (stdout) == EOF) + error (1, 0, "write error"); + + exit (0); +} + +static void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-a 1|2] [-v 1|2] [-e empty-string] [-o field-list...] [-t char]\n\ + [-j[1|2] field] [-1 field] [-2 field] file1 file2\n", + program_name); + exit (1); +} |