diff options
-rw-r--r-- | lib/bumpalloc.h | 61 | ||||
-rw-r--r-- | lib/diacrit.c | 148 | ||||
-rw-r--r-- | lib/diacrit.h | 16 | ||||
-rw-r--r-- | src/ptx.c | 2219 |
4 files changed, 2444 insertions, 0 deletions
diff --git a/lib/bumpalloc.h b/lib/bumpalloc.h new file mode 100644 index 000000000..ce37d1c68 --- /dev/null +++ b/lib/bumpalloc.h @@ -0,0 +1,61 @@ +/* BUMP_ALLOC macro - increase table allocation by one element. + Copyright (C) 1990, 1991, 1993 Free Software Foundation, Inc. + François Pinard <pinard@iro.umontreal.ca>, 1990. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +/*-------------------------------------------------------------------------. +| Bump the allocation of the array pointed to by TABLE whenever required. | +| The table already has already COUNT elements in it, this macro ensure it | +| has enough space to accommodate at least one more element. Space is | +| allocated (2 ^ EXPONENT) elements at a time. Each element of the array | +| is of type TYPE. | +`-------------------------------------------------------------------------*/ + +/* Routines `xmalloc' and `xrealloc' are called to do the actual memory + management. This implies that the program will abort with an `Memory + exhausted!' error if any problem arise. + + To work correctly, at least EXPONENT and TYPE should always be the + same for all uses of this macro for any given TABLE. A secure way to + achieve this is to never use this macro directly, but use it to define + other macros, which would then be TABLE-specific. 
+ + The first time through, COUNT is usually zero. Note that COUNT is not + updated by this macro, but it should be update elsewhere, later. This + is convenient, because it allows TABLE[COUNT] to refer to the new + element at the end. Once its construction is completed, COUNT++ will + record it in the table. Calling this macro several times in a row + without updating COUNT is a bad thing to do. */ + +#define BUMP_ALLOC(Table, Count, Exponent, Type) \ + BUMP_ALLOC_WITH_SIZE ((Table), (Count), (Exponent), Type, sizeof (Type)) + +/* In cases `sizeof TYPE' would not always yield the correct value for + the size of each element entry, this macro accepts a supplementary + SIZE argument. The EXPONENT, TYPE and SIZE parameters should still + have the same value for all macro calls related to a specific TABLE. */ + +#define BUMP_ALLOC_WITH_SIZE(Table, Count, Exponent, Type, Size) \ + do \ + { \ + if (((Count) & (~(~0 << (Exponent)))) == 0) \ + if ((Count) == 0) \ + (Table) = (Type *) xmalloc ((1 << (Exponent)) * (Size)); \ + else \ + (Table) = (Type *) \ + xrealloc ((Table), ((Count) + (1 << (Exponent))) * (Size)); \ + } \ + while (0) diff --git a/lib/diacrit.c b/lib/diacrit.c new file mode 100644 index 000000000..598043acf --- /dev/null +++ b/lib/diacrit.c @@ -0,0 +1,148 @@ +/* Diacritics processing for a few character codes. + Copyright (C) 1990, 1991, 1992, 1993 Free Software Foundation, Inc. + François Pinard <pinard@iro.umontreal.ca>, 1988. + + All this file is a temporary hack, waiting for locales in GNU. +*/ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#include "diacrit.h" + +/* ISO 8859-1 Latin-1 code is used as the underlying character set. If + MSDOS is defined, IBM-PC's character set code is used instead. */ + +/*--------------------------------------------------------------------. +| For each alphabetic character, returns what it would be without its | +| possible diacritic symbol. 
| +`--------------------------------------------------------------------*/ + +const char diacrit_base[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', + 'X', 'Y', 'Z', 0, 0, 0, 0, 0, + 0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', + 'x', 'y', 'z', 0, 0, 0, 0, 0, + +#ifdef MSDOS + + 'C', 'u', 'e', 'a', 'a', 'a', 'a', 'c', + 'e', 'e', 'e', 'i', 'i', 'i', 'A', 'A', + 'E', 'e', 'E', 'o', 'o', 'o', 'u', 'u', + 'y', 'O', 'U', 0, 0, 0, 0, 0, + 'a', 'i', 'o', 'u', 'n', 'N', 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + +#else /* not MSDOS */ + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', + 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', + 0, 'N', 'O', 'O', 'O', 'O', 'O', 0, + 'O', 'U', 'U', 'U', 'U', 'Y', 0, 0, + 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', + 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', + 0, 'n', 'o', 'o', 'o', 'o', 'o', 0, + 'o', 'u', 'u', 'u', 'u', 'y', 0, 'y', + +#endif /* not MSDOS */ +}; + +/*------------------------------------------------------------------------. 
+| For each alphabetic character, returns a code of what its diacritic is, | +| according to the following codes: 1 (eE) over aA for latin diphtongs; 2 | +| (') acute accent; 3 (`) grave accent; 4 (^) circumflex accent; 5 (") | +| umlaut or diaraesis; 6 (~) tilda; 7 (,) cedilla; 8 (o) covering degree | +| symbol; 9 (|) slashed character. | +`------------------------------------------------------------------------*/ + +const char diacrit_diac[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 0, + 3, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 0, + +#ifdef MSDOS + + 7, 5, 2, 4, 5, 3, 8, 7, + 4, 5, 3, 5, 4, 3, 5, 8, + 2, 1, 1, 4, 5, 3, 4, 3, + 5, 5, 5, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 6, 6, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + +#else /* not MSDOS */ + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 3, 2, 4, 6, 5, 8, 1, 7, + 3, 2, 4, 5, 3, 2, 4, 5, + 0, 6, 3, 2, 4, 6, 5, 0, + 9, 3, 2, 4, 5, 2, 0, 0, + 3, 2, 4, 6, 5, 8, 1, 7, + 3, 2, 4, 5, 3, 2, 4, 5, + 0, 6, 3, 2, 4, 6, 5, 0, + 9, 3, 2, 4, 5, 2, 0, 0, + +#endif /* not MSDOS */ +}; diff --git a/lib/diacrit.h b/lib/diacrit.h new file mode 100644 index 000000000..20e710bd6 --- /dev/null +++ b/lib/diacrit.h @@ -0,0 +1,16 @@ +/* Diacritics processing for a few character codes. 
+ Copyright (C) 1990, 1991, 1992, 1993 Free Software Foundation, Inc. + François Pinard <pinard@iro.umontreal.ca>, 1988. + + All this file is a temporary hack, waiting for locales in GNU. +*/ + +extern const char diacrit_base[]; /* characters without diacritics */ +extern const char diacrit_diac[]; /* diacritic code for each character */ + +/* Returns CHAR without its diacritic. CHAR is known to be alphabetic. */ +#define tobase(Char) (diacrit_base[(unsigned char) (Char)]) + +/* Returns a diacritic code for CHAR. CHAR is known to be alphabetic. */ +#define todiac(Char) (diacrit_diac[(unsigned char) (Char)]) + diff --git a/src/ptx.c b/src/ptx.c new file mode 100644 index 000000000..7f5263aab --- /dev/null +++ b/src/ptx.c @@ -0,0 +1,2219 @@ +/* Permuted index for GNU, with keywords in their context. + Copyright © 1990, 1991, 1993, 1998 Free Software Foundation, Inc. + François Pinard <pinard@iro.umontreal.ca>, 1988. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + François Pinard <pinard@iro.umontreal.ca> */ + +#include <config.h> + +#include <stdio.h> +#include <getopt.h> +#include <sys/types.h> +#include "system.h" +#include "argmatch.h" +#include "bumpalloc.h" +#include "diacrit.h" +#include "error.h" +#include "regex.h" + +/* Number of possible characters in a byte. 
*/ +#define CHAR_SET_SIZE 256 + +/* The ctype definitions should work for all 256 characters. */ +#if STDC_HEADERS +# include <ctype.h> +#else +# define isspace(C) ((C) == ' ' || (C) == '\t' || (C) == '\n') +# define isxdigit(C) \ + (((unsigned char) (C) >= 'a' && (unsigned char) (C) <= 'f') \ + || ((unsigned char) (C) >= 'A' && (unsigned char) (C) <= 'F') \ + || ((unsigned char) (C) >= '0' && (unsigned char) (C) <= '9')) +# define islower(C) ((unsigned char) (C) >= 'a' && (unsigned char) (C) <= 'z') +# define isupper(C) ((unsigned char) (C) >= 'A' && (unsigned char) (C) <= 'Z') +# define isalpha(C) (islower (C) || isupper (C)) +# define toupper(C) (islower (C) ? (C) - 'a' + 'A' : (C)) +#endif + +#if !defined (isascii) || defined (STDC_HEADERS) +# undef isascii +# define isascii(C) 1 +#endif + +#ifndef ISXDIGIT +# define ISXDIGIT(C) (isascii (C) && isxdigit (C)) +#endif +#define ISODIGIT(C) ((C) >= '0' && (C) <= '7') +#define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \ + : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0') +#define OCTTOBIN(C) ((C) - '0') + +/* Debugging the memory allocator. */ + +#if WITH_DMALLOC +# define MALLOC_FUNC_CHECK 1 +# include <dmalloc.h> +#endif + +/* Global definitions. */ + +/* Reallocation step when swallowing non regular files. The value is not + the actual reallocation step, but its base two logarithm. */ +#define SWALLOW_REALLOC_LOG 12 + +/* Imported from "regex.c". */ +#define Sword 1 + +/* The name this program was run with. */ +const char *program_name; + +/* If nonzero, display usage information and exit. */ +static int show_help = 0; + +/* If nonzero, print the version on standard output and exit. */ +static int show_version = 0; + +/* Program options. 
*/ + +enum Format +{ + UNKNOWN_FORMAT, /* output format still unknown */ + DUMB_FORMAT, /* output for a dumb terminal */ + ROFF_FORMAT, /* output for `troff' or `nroff' */ + TEX_FORMAT /* output for `TeX' or `LaTeX' */ +}; + +int gnu_extensions = 1; /* trigger all GNU extensions */ +int auto_reference = 0; /* references are `file_name:line_number:' */ +int input_reference = 0; /* references at beginning of input lines */ +int right_reference = 0; /* output references after right context */ +int line_width = 72; /* output line width in characters */ +int gap_size = 3; /* number of spaces between output fields */ +const char *truncation_string = "/"; + /* string used to mark line truncations */ +const char *macro_name = "xx"; /* macro name for roff or TeX output */ +enum Format output_format = UNKNOWN_FORMAT; + /* output format */ + +int ignore_case = 0; /* fold lower to upper case for sorting */ +const char *context_regex_string = NULL; + /* raw regex for end of context */ +const char *word_regex_string = NULL; + /* raw regex for a keyword */ +const char *break_file = NULL; /* name of the `Break characters' file */ +const char *only_file = NULL; /* name of the `Only words' file */ +const char *ignore_file = NULL; /* name of the `Ignore words' file */ + +/* A BLOCK delimit a region in memory of arbitrary size, like the copy of a + whole file. A WORD is something smaller, its length should fit in a + short integer. A WORD_TABLE may contain several WORDs. */ + +typedef struct + { + char *start; /* pointer to beginning of region */ + char *end; /* pointer to end + 1 of region */ + } +BLOCK; + +typedef struct + { + char *start; /* pointer to beginning of region */ + short size; /* length of the region */ + } +WORD; + +typedef struct + { + WORD *start; /* array of WORDs */ + size_t length; /* number of entries */ + } +WORD_TABLE; + +/* Pattern description tables. */ + +/* For each character, provide its folded equivalent. 
*/ +unsigned char folded_chars[CHAR_SET_SIZE]; + +/* For each character, indicate if it is part of a word. */ +char syntax_table[CHAR_SET_SIZE]; +char *re_syntax_table = syntax_table; + +/* Compiled regex for end of context. */ +struct re_pattern_buffer *context_regex; + +/* End of context pattern register indices. */ +struct re_registers context_regs; + +/* Compiled regex for a keyword. */ +struct re_pattern_buffer *word_regex; + +/* Keyword pattern register indices. */ +struct re_registers word_regs; + +/* A word characters fastmap is used only when no word regexp has been + provided. A word is then made up of a sequence of one or more characters + allowed by the fastmap. Contains !0 if character allowed in word. Not + only this is faster in most cases, but it simplifies the implementation + of the Break files. */ +char word_fastmap[CHAR_SET_SIZE]; + +/* Maximum length of any word read. */ +int maximum_word_length; + +/* Maximum width of any reference used. */ +int reference_max_width; + + +/* Ignore and Only word tables. */ + +WORD_TABLE ignore_table; /* table of words to ignore */ +WORD_TABLE only_table; /* table of words to select */ + +#define ALLOC_NEW_WORD(table) \ + BUMP_ALLOC ((table)->start, (table)->length, 8, WORD) + +/* Source text table, and scanning macros. */ + +int number_input_files; /* number of text input files */ +int total_line_count; /* total number of lines seen so far */ +const char **input_file_name; /* array of text input file names */ +int *file_line_count; /* array of `total_line_count' values at end */ + +BLOCK text_buffer; /* file to study */ +char *text_buffer_maxend; /* allocated end of text_buffer */ + +/* SKIP_NON_WHITE used only for getting or skipping the reference. 
*/ + +#define SKIP_NON_WHITE(cursor, limit) \ + while (cursor < limit && !isspace(*cursor)) \ + cursor++ + +#define SKIP_WHITE(cursor, limit) \ + while (cursor < limit && isspace(*cursor)) \ + cursor++ + +#define SKIP_WHITE_BACKWARDS(cursor, start) \ + while (cursor > start && isspace(cursor[-1])) \ + cursor-- + +#define SKIP_SOMETHING(cursor, limit) \ + if (word_regex_string) \ + { \ + int count; \ + count = re_match (word_regex, cursor, limit - cursor, 0, NULL); \ + cursor += count <= 0 ? 1 : count; \ + } \ + else if (word_fastmap[(unsigned char) *cursor]) \ + while (cursor < limit && word_fastmap[(unsigned char) *cursor]) \ + cursor++; \ + else \ + cursor++ + +/* Occurrences table. + + The `keyword' pointer provides the central word, which is surrounded + by a left context and a right context. The `keyword' and `length' + field allow full 8-bit characters keys, even including NULs. At other + places in this program, the name `keyafter' refers to the keyword + followed by its right context. + + The left context does not extend, towards the beginning of the file, + further than a distance given by the `left' value. This value is + relative to the keyword beginning, it is usually negative. This + insures that, except for white space, we will never have to backward + scan the source text, when it is time to generate the final output + lines. + + The right context, indirectly attainable through the keyword end, does + not extend, towards the end of the file, further than a distance given + by the `right' value. This value is relative to the keyword + beginning, it is usually positive. + + When automatic references are used, the `reference' value is the + overall line number in all input files read so far, in this case, it + is of type (int). When input references are used, the `reference' + value indicates the distance between the keyword beginning and the + start of the reference field, it is of type (DELTA) and usually + negative. 
*/ + +typedef short DELTA; /* to hold displacement within one context */ + +typedef struct + { + WORD key; /* description of the keyword */ + DELTA left; /* distance to left context start */ + DELTA right; /* distance to right context end */ + int reference; /* reference descriptor */ + } +OCCURS; + +/* The various OCCURS tables are indexed by the language. But the time + being, there is no such multiple language support. */ + +OCCURS *occurs_table[1]; /* all words retained from the read text */ +size_t number_of_occurs[1]; /* number of used slots in occurs_table */ + +#define ALLOC_NEW_OCCURS(language) \ + BUMP_ALLOC (occurs_table[language], number_of_occurs[language], 9, OCCURS) + + +/* Communication among output routines. */ + +/* Indicate if special output processing is requested for each character. */ +char edited_flag[CHAR_SET_SIZE]; + +int half_line_width; /* half of line width, reference excluded */ +int before_max_width; /* maximum width of before field */ +int keyafter_max_width; /* maximum width of keyword-and-after field */ +int truncation_string_length; /* length of string used to flag truncation */ + +/* When context is limited by lines, wraparound may happen on final output: + the `head' pointer gives access to some supplementary left context which + will be seen at the end of the output line, the `tail' pointer gives + access to some supplementary right context which will be seen at the + beginning of the output line. */ + +BLOCK tail; /* tail field */ +int tail_truncation; /* flag truncation after the tail field */ + +BLOCK before; /* before field */ +int before_truncation; /* flag truncation before the before field */ + +BLOCK keyafter; /* keyword-and-after field */ +int keyafter_truncation; /* flag truncation after the keyafter field */ + +BLOCK head; /* head field */ +int head_truncation; /* flag truncation before the head field */ + +BLOCK reference; /* reference field for input reference mode */ + +/* Miscellaneous routines. 
*/

/*------------------------------------------------------.
| Duplicate string STRING, while evaluating \-escapes.  |
`------------------------------------------------------*/

/* Loosely adapted from GNU sh-utils printf.c code.  */

/* STRING is a NUL-terminated string.  The value returned is a freshly
   xmalloc'ed, NUL-terminated copy which the caller owns.  Recognized
   escapes are \a \b \f \n \r \t \v, \xHHH (up to three hexadecimal
   digits), \0OOO (up to three octal digits) and \c, which discards the
   rest of STRING.  Note that \x and \0 may store a NUL byte in the middle
   of the result.  Any other escape, `\x' followed by no hexadecimal digit,
   and a lone backslash ending STRING are all copied literally.  */

static char *
copy_unescaped_string (const char *string)
{
  char *result;                 /* allocated result */
  char *cursor;                 /* cursor in result */
  int value;                    /* value of \nnn escape */
  int length;                   /* length of \nnn escape */

  /* No escape produces more bytes than it occupies in STRING, so the
     result never needs more room than STRING itself.  */

  result = xmalloc (strlen (string) + 1);
  cursor = result;

  while (*string)
    if (*string == '\\')
      {
        string++;
        switch (*string)
          {
          case '\0':            /* lone backslash at end of string */

            /* Keep the backslash, and leave STRING on the terminator so
               that the enclosing loop stops.  Falling into the default
               case here would step beyond the end of STRING.  */

            *cursor++ = '\\';
            break;

          case 'x':             /* \xhhh escape, 3 chars maximum */
            value = 0;
            for (length = 0, string++;
                 length < 3 && ISXDIGIT ((unsigned char) *string);
                 length++, string++)
              value = value * 16 + HEXTOBIN (*string);
            if (length == 0)
              {
                *cursor++ = '\\';
                *cursor++ = 'x';
              }
            else
              *cursor++ = value;
            break;

          case '0':             /* \0ooo escape, 3 chars maximum */
            value = 0;
            for (length = 0, string++;
                 length < 3 && ISODIGIT (*string);
                 length++, string++)
              value = value * 8 + OCTTOBIN (*string);
            *cursor++ = value;
            break;

          case 'a':             /* alert */
#if __STDC__
            *cursor++ = '\a';
#else
            *cursor++ = 7;
#endif
            string++;
            break;

          case 'b':             /* backspace */
            *cursor++ = '\b';
            string++;
            break;

          case 'c':             /* cancel the rest of the output */
            while (*string)
              string++;
            break;

          case 'f':             /* form feed */
            *cursor++ = '\f';
            string++;
            break;

          case 'n':             /* new line */
            *cursor++ = '\n';
            string++;
            break;

          case 'r':             /* carriage return */
            *cursor++ = '\r';
            string++;
            break;

          case 't':             /* horizontal tab */
            *cursor++ = '\t';
            string++;
            break;

          case 'v':             /* vertical tab */
#if __STDC__
            *cursor++ = '\v';
#else
            *cursor++ = 11;
#endif
            string++;
            break;

          default:              /* unknown escape, copied as is */
            *cursor++ = '\\';
            *cursor++ = *string++;
            break;
          }
      }
    else
      *cursor++ = *string++;

  *cursor = '\0';
  return result;
}

+/*-------------------------------------------------------------------. +| Compile the regex represented by STRING, diagnose and abort if any | +| error. Returns the compiled regex structure. | +`-------------------------------------------------------------------*/ + +static struct re_pattern_buffer * +alloc_and_compile_regex (const char *string) +{ + struct re_pattern_buffer *pattern; /* newly allocated structure */ + const char *message; /* error message returned by regex.c */ + + pattern = (struct re_pattern_buffer *) + xmalloc (sizeof (struct re_pattern_buffer)); + memset (pattern, 0, sizeof (struct re_pattern_buffer)); + + pattern->buffer = NULL; + pattern->allocated = 0; + pattern->translate = ignore_case ? (char *) folded_chars : NULL; + pattern->fastmap = (char *) xmalloc ((size_t) CHAR_SET_SIZE); + + message = re_compile_pattern (string, (int) strlen (string), pattern); + if (message) + error (EXIT_FAILURE, 0, _("%s (for regexp `%s')"), message, string); + + /* The fastmap should be compiled before `re_match'. The following + call is not mandatory, because `re_search' is always called sooner, + and it compiles the fastmap if this has not been done yet. */ + + re_compile_fastmap (pattern); + + /* Do not waste extra allocated space. */ + + if (pattern->allocated > pattern->used) + { + pattern->buffer + = (unsigned char *) xrealloc (pattern->buffer, (size_t) pattern->used); + pattern->allocated = pattern->used; + } + + return pattern; +} + +/*------------------------------------------------------------------------. +| This will initialize various tables for pattern match and compiles some | +| regexps. | +`------------------------------------------------------------------------*/ + +static void +initialize_regex (void) +{ + int character; /* character value */ + + /* Initialize the regex syntax table. */ + + for (character = 0; character < CHAR_SET_SIZE; character++) + syntax_table[character] = isalpha (character) ? 
Sword : 0; + + /* Initialize the case folding table. */ + + if (ignore_case) + for (character = 0; character < CHAR_SET_SIZE; character++) + folded_chars[character] = toupper (character); + + /* Unless the user already provided a description of the end of line or + end of sentence sequence, select an end of line sequence to compile. + If the user provided an empty definition, thus disabling end of line + or sentence feature, make it NULL to speed up tests. If GNU + extensions are enabled, use end of sentence like in GNU emacs. If + disabled, use end of lines. */ + + if (context_regex_string) + { + if (!*context_regex_string) + context_regex_string = NULL; + } + else if (gnu_extensions && !input_reference) + context_regex_string = "[.?!][]\"')}]*\\($\\|\t\\| \\)[ \t\n]*"; + else + context_regex_string = "\n"; + + if (context_regex_string) + context_regex = alloc_and_compile_regex (context_regex_string); + + /* If the user has already provided a non-empty regexp to describe + words, compile it. Else, unless this has already been done through + a user provided Break character file, construct a fastmap of + characters that may appear in a word. If GNU extensions enabled, + include only letters of the underlying character set. If disabled, + include almost everything, even punctuations; stop only on white + space. */ + + if (word_regex_string && *word_regex_string) + word_regex = alloc_and_compile_regex (word_regex_string); + else if (!break_file) + if (gnu_extensions) + { + + /* Simulate \w+. */ + + for (character = 0; character < CHAR_SET_SIZE; character++) + word_fastmap[character] = isalpha (character) ? 1 : 0; + } + else + { + + /* Simulate [^ \t\n]+. */ + + memset (word_fastmap, 1, CHAR_SET_SIZE); + word_fastmap[' '] = 0; + word_fastmap['\t'] = 0; + word_fastmap['\n'] = 0; + } +} + +/*------------------------------------------------------------------------. 
+| This routine will attempt to swallow a whole file name FILE_NAME into a | +| contiguous region of memory and return a description of it into BLOCK. | +| Standard input is assumed whenever FILE_NAME is NULL, empty or "-". | +| | +| Previously, in some cases, white space compression was attempted while | +| inputting text. This was defeating some regexps like default end of | +| sentence, which checks for two consecutive spaces. If white space | +| compression is ever reinstated, it should be in output routines. | +`------------------------------------------------------------------------*/ + +static void +swallow_file_in_memory (const char *file_name, BLOCK *block) +{ + int file_handle; /* file descriptor number */ + struct stat stat_block; /* stat block for file */ + size_t allocated_length; /* allocated length of memory buffer */ + size_t used_length; /* used length in memory buffer */ + int read_length; /* number of character gotten on last read */ + + /* As special cases, a file name which is NULL or "-" indicates standard + input, which is already opened. In all other cases, open the file from + its name. */ + + if (!file_name || !*file_name || strcmp (file_name, "-") == 0) + file_handle = fileno (stdin); + else + if ((file_handle = open (file_name, O_RDONLY)) < 0) + error (EXIT_FAILURE, errno, file_name); + + /* If the file is a plain, regular file, allocate the memory buffer all at + once and swallow the file in one blow. In other cases, read the file + repeatedly in smaller chunks until we have it all, reallocating memory + once in a while, as we go. */ + + if (fstat (file_handle, &stat_block) < 0) + error (EXIT_FAILURE, errno, file_name); + +#if !MSDOS + + /* On MSDOS, we cannot predict in memory size from file size, because of + end of line conversions. 
*/ + + if (S_ISREG (stat_block.st_mode)) + { + block->start = (char *) xmalloc ((size_t) stat_block.st_size); + + if (read (file_handle, block->start, (size_t) stat_block.st_size) + != stat_block.st_size) + error (EXIT_FAILURE, errno, file_name); + + block->end = block->start + stat_block.st_size; + } + else + +#endif /* not MSDOS */ + + { + block->start = (char *) xmalloc ((size_t) 1 << SWALLOW_REALLOC_LOG); + used_length = 0; + allocated_length = (1 << SWALLOW_REALLOC_LOG); + + while (read_length = read (file_handle, + block->start + used_length, + allocated_length - used_length), + read_length > 0) + { + used_length += read_length; + if (used_length == allocated_length) + { + allocated_length += (1 << SWALLOW_REALLOC_LOG); + block->start + = (char *) xrealloc (block->start, allocated_length); + } + } + + if (read_length < 0) + error (EXIT_FAILURE, errno, file_name); + + block->end = block->start + used_length; + } + + /* Close the file, but only if it was not the standard input. */ + + if (file_handle != fileno (stdin)) + close (file_handle); +} + +/* Sort and search routines. */ + +/*--------------------------------------------------------------------------. +| Compare two words, FIRST and SECOND, and return 0 if they are identical. | +| Return less than 0 if the first word goes before the second; return | +| greater than 0 if the first word goes after the second. | +| | +| If a word is indeed a prefix of the other, the shorter should go first. | +`--------------------------------------------------------------------------*/ + +static int +compare_words (const void *void_first, const void *void_second) +{ +#define first ((const WORD *) void_first) +#define second ((const WORD *) void_second) + int length; /* minimum of two lengths */ + int counter; /* cursor in words */ + int value; /* value of comparison */ + + length = first->size < second->size ? 
first->size : second->size; + + if (ignore_case) + { + for (counter = 0; counter < length; counter++) + { + value = (folded_chars [(unsigned char) (first->start[counter])] + - folded_chars [(unsigned char) (second->start[counter])]); + if (value != 0) + return value; + } + } + else + { + for (counter = 0; counter < length; counter++) + { + value = ((unsigned char) first->start[counter] + - (unsigned char) second->start[counter]); + if (value != 0) + return value; + } + } + + return first->size - second->size; +#undef first +#undef second +} + +/*-----------------------------------------------------------------------. +| Decides which of two OCCURS, FIRST or SECOND, should lexicographically | +| go first. In case of a tie, preserve the original order through a | +| pointer comparison. | +`-----------------------------------------------------------------------*/ + +static int +compare_occurs (const void *void_first, const void *void_second) +{ +#define first ((const OCCURS *) void_first) +#define second ((const OCCURS *) void_second) + int value; + + value = compare_words (&first->key, &second->key); + return value == 0 ? first->key.start - second->key.start : value; +#undef first +#undef second +} + +/*------------------------------------------------------------. +| Return !0 if WORD appears in TABLE. Uses a binary search. 
| +`------------------------------------------------------------*/ + +static int +search_table (WORD *word, WORD_TABLE *table) +{ + int lowest; /* current lowest possible index */ + int highest; /* current highest possible index */ + int middle; /* current middle index */ + int value; /* value from last comparison */ + + lowest = 0; + highest = table->length - 1; + while (lowest <= highest) + { + middle = (lowest + highest) / 2; + value = compare_words (word, table->start + middle); + if (value < 0) + highest = middle - 1; + else if (value > 0) + lowest = middle + 1; + else + return 1; + } + return 0; +} + +/*---------------------------------------------------------------------. +| Sort the whole occurs table in memory. Presumably, `qsort' does not | +| take intermediate copies or table elements, so the sort will be | +| stabilized throughout the comparison routine. | +`---------------------------------------------------------------------*/ + +static void +sort_found_occurs (void) +{ + + /* Only one language for the time being. */ + + qsort (occurs_table[0], number_of_occurs[0], sizeof (OCCURS), + compare_occurs); +} + +/* Parameter files reading routines. */ + +/*----------------------------------------------------------------------. +| Read a file named FILE_NAME, containing a set of break characters. | +| Build a content to the array word_fastmap in which all characters are | +| allowed except those found in the file. Characters may be repeated. | +`----------------------------------------------------------------------*/ + +static void +digest_break_file (const char *file_name) +{ + BLOCK file_contents; /* to receive a copy of the file */ + char *cursor; /* cursor in file copy */ + + swallow_file_in_memory (file_name, &file_contents); + + /* Make the fastmap and record the file contents in it. 
*/ + + memset (word_fastmap, 1, CHAR_SET_SIZE); + for (cursor = file_contents.start; cursor < file_contents.end; cursor++) + word_fastmap[(unsigned char) *cursor] = 0; + + if (!gnu_extensions) + { + + /* If GNU extensions are enabled, the only way to avoid newline as + a break character is to write all the break characters in the + file with no newline at all, not even at the end of the file. + If disabled, spaces, tabs and newlines are always considered as + break characters even if not included in the break file. */ + + word_fastmap[' '] = 0; + word_fastmap['\t'] = 0; + word_fastmap['\n'] = 0; + } + + /* Return the space of the file, which is no more required. */ + + free (file_contents.start); +} + +/*-----------------------------------------------------------------------. +| Read a file named FILE_NAME, containing one word per line, then | +| construct in TABLE a table of WORD descriptors for them. The routine | +| swallows the whole file in memory; this is at the expense of space | +| needed for newlines, which are useless; however, the reading is fast. | +`-----------------------------------------------------------------------*/ + +static void +digest_word_file (const char *file_name, WORD_TABLE *table) +{ + BLOCK file_contents; /* to receive a copy of the file */ + char *cursor; /* cursor in file copy */ + char *word_start; /* start of the current word */ + + swallow_file_in_memory (file_name, &file_contents); + + table->start = NULL; + table->length = 0; + + /* Read the whole file. */ + + cursor = file_contents.start; + while (cursor < file_contents.end) + { + + /* Read one line, and save the word in contains. */ + + word_start = cursor; + while (cursor < file_contents.end && *cursor != '\n') + cursor++; + + /* Record the word in table if it is not empty. 
*/ + + if (cursor > word_start) + { + ALLOC_NEW_WORD (table); + table->start[table->length].start = word_start; + table->start[table->length].size = cursor - word_start; + table->length++; + } + + /* This test allows for an incomplete line at end of file. */ + + if (cursor < file_contents.end) + cursor++; + } + + /* Finally, sort all the words read. */ + + qsort (table->start, table->length, (size_t) sizeof (WORD), compare_words); +} + +/* Keyword recognition and selection. */ + +/*----------------------------------------------------------------------. +| For each keyword in the source text, constructs an OCCURS structure. | +`----------------------------------------------------------------------*/ + +static void +find_occurs_in_text (void) +{ + char *cursor; /* for scanning the source text */ + char *scan; /* for scanning the source text also */ + char *line_start; /* start of the current input line */ + char *line_scan; /* newlines scanned until this point */ + int reference_length; /* length of reference in input mode */ + WORD possible_key; /* possible key, to ease searches */ + OCCURS *occurs_cursor; /* current OCCURS under construction */ + + char *context_start; /* start of left context */ + char *context_end; /* end of right context */ + char *word_start; /* start of word */ + char *word_end; /* end of word */ + char *next_context_start; /* next start of left context */ + + /* reference_length is always used within `if (input_reference)'. + However, GNU C diagnoses that it may be used uninitialized. The + following assignment is merely to shut it up. */ + + reference_length = 0; + + /* Tracking where lines start is helpful for reference processing. In + auto reference mode, this allows counting lines. In input reference + mode, this permits finding the beginning of the references. + + The first line begins with the file, skip immediately this very first + reference in input reference mode, to help further rejection any word + found inside it. 
Also, unconditionally assigning these variable has + the happy effect of shutting up lint. */ + + line_start = text_buffer.start; + line_scan = line_start; + if (input_reference) + { + SKIP_NON_WHITE (line_scan, text_buffer.end); + reference_length = line_scan - line_start; + SKIP_WHITE (line_scan, text_buffer.end); + } + + /* Process the whole buffer, one line or one sentence at a time. */ + + for (cursor = text_buffer.start; + cursor < text_buffer.end; + cursor = next_context_start) + { + + /* `context_start' gets initialized before the processing of each + line, or once for the whole buffer if no end of line or sentence + sequence separator. */ + + context_start = cursor; + + /* If a end of line or end of sentence sequence is defined and + non-empty, `next_context_start' will be recomputed to be the end of + each line or sentence, before each one is processed. If no such + sequence, then `next_context_start' is set at the end of the whole + buffer, which is then considered to be a single line or sentence. + This test also accounts for the case of an incomplete line or + sentence at the end of the buffer. */ + + if (context_regex_string + && (re_search (context_regex, cursor, text_buffer.end - cursor, + 0, text_buffer.end - cursor, &context_regs) + >= 0)) + next_context_start = cursor + context_regs.end[0]; + + else + next_context_start = text_buffer.end; + + /* Include the separator into the right context, but not any suffix + white space in this separator; this insures it will be seen in + output and will not take more space than necessary. */ + + context_end = next_context_start; + SKIP_WHITE_BACKWARDS (context_end, context_start); + + /* Read and process a single input line or sentence, one word at a + time. */ + + while (1) + { + if (word_regex) + + /* If a word regexp has been compiled, use it to skip at the + beginning of the next word. If there is no such word, exit + the loop. 
*/ + + { + if (re_search (word_regex, cursor, context_end - cursor, + 0, context_end - cursor, &word_regs) + < 0) + break; + word_start = cursor + word_regs.start[0]; + word_end = cursor + word_regs.end[0]; + } + else + + /* Avoid re_search and use the fastmap to skip to the + beginning of the next word. If there is no more word in + the buffer, exit the loop. */ + + { + scan = cursor; + while (scan < context_end + && !word_fastmap[(unsigned char) *scan]) + scan++; + + if (scan == context_end) + break; + + word_start = scan; + + while (scan < context_end + && word_fastmap[(unsigned char) *scan]) + scan++; + + word_end = scan; + } + + /* Skip right to the beginning of the found word. */ + + cursor = word_start; + + /* Skip any zero length word. Just advance a single position, + then go fetch the next word. */ + + if (word_end == word_start) + { + cursor++; + continue; + } + + /* This is a genuine, non empty word, so save it as a possible + key. Then skip over it. Also, maintain the maximum length of + all words read so far. It is mandatory to take the maximum + length of all words in the file, without considering if they + are actually kept or rejected, because backward jumps at output + generation time may fall in *any* word. */ + + possible_key.start = cursor; + possible_key.size = word_end - word_start; + cursor += possible_key.size; + + if (possible_key.size > maximum_word_length) + maximum_word_length = possible_key.size; + + /* In input reference mode, update `line_start' from its previous + value. Count the lines just in case auto reference mode is + also selected. If it happens that the word just matched is + indeed part of a reference; just ignore it. 
*/ + + if (input_reference) + { + while (line_scan < possible_key.start) + if (*line_scan == '\n') + { + total_line_count++; + line_scan++; + line_start = line_scan; + SKIP_NON_WHITE (line_scan, text_buffer.end); + reference_length = line_scan - line_start; + } + else + line_scan++; + if (line_scan > possible_key.start) + continue; + } + + /* Ignore the word if an `Ignore words' table exists and if it is + part of it. Also ignore the word if an `Only words' table and + if it is *not* part of it. + + It is allowed that both tables be used at once, even if this + may look strange for now. Just ignore a word that would appear + in both. If regexps are eventually implemented for these + tables, the Ignore table could then reject words that would + have been previously accepted by the Only table. */ + + if (ignore_file && search_table (&possible_key, &ignore_table)) + continue; + if (only_file && !search_table (&possible_key, &only_table)) + continue; + + /* A non-empty word has been found. First of all, insure + proper allocation of the next OCCURS, and make a pointer to + where it will be constructed. */ + + ALLOC_NEW_OCCURS (0); + occurs_cursor = occurs_table[0] + number_of_occurs[0]; + + /* Define the refence field, if any. */ + + if (auto_reference) + { + + /* While auto referencing, update `line_start' from its + previous value, counting lines as we go. If input + referencing at the same time, `line_start' has been + advanced earlier, and the following loop is never really + executed. */ + + while (line_scan < possible_key.start) + if (*line_scan == '\n') + { + total_line_count++; + line_scan++; + line_start = line_scan; + SKIP_NON_WHITE (line_scan, text_buffer.end); + } + else + line_scan++; + + occurs_cursor->reference = total_line_count; + } + else if (input_reference) + { + + /* If only input referencing, `line_start' has been computed + earlier to detect the case the word matched would be part + of the reference. 
The reference position is simply the + value of `line_start'. */ + + occurs_cursor->reference + = (DELTA) (line_start - possible_key.start); + if (reference_length > reference_max_width) + reference_max_width = reference_length; + } + + /* Exclude the reference from the context in simple cases. */ + + if (input_reference && line_start == context_start) + { + SKIP_NON_WHITE (context_start, context_end); + SKIP_WHITE (context_start, context_end); + } + + /* Completes the OCCURS structure. */ + + occurs_cursor->key = possible_key; + occurs_cursor->left = context_start - possible_key.start; + occurs_cursor->right = context_end - possible_key.start; + + number_of_occurs[0]++; + } + } +} + +/* Formatting and actual output - service routines. */ + +/*-----------------------------------------. +| Prints some NUMBER of spaces on stdout. | +`-----------------------------------------*/ + +static void +print_spaces (int number) +{ + int counter; + + for (counter = number; counter > 0; counter--) + putchar (' '); +} + +/*-------------------------------------. +| Prints the field provided by FIELD. | +`-------------------------------------*/ + +static void +print_field (BLOCK field) +{ + char *cursor; /* Cursor in field to print */ + int character; /* Current character */ + int base; /* Base character, without diacritic */ + int diacritic; /* Diacritic code for the character */ + + /* Whitespace is not really compressed. Instead, each white space + character (tab, vt, ht etc.) is printed as one single space. */ + + for (cursor = field.start; cursor < field.end; cursor++) + { + character = (unsigned char) *cursor; + if (edited_flag[character]) + { + + /* First check if this is a diacriticized character. + + This works only for TeX. I do not know how diacriticized + letters work with `roff'. Please someone explain it to me! 
*/ + + diacritic = todiac (character); + if (diacritic != 0 && output_format == TEX_FORMAT) + { + base = tobase (character); + switch (diacritic) + { + + case 1: /* Latin diphthongs */ + switch (base) + { + case 'o': + fputs ("\\oe{}", stdout); + break; + + case 'O': + fputs ("\\OE{}", stdout); + break; + + case 'a': + fputs ("\\ae{}", stdout); + break; + + case 'A': + fputs ("\\AE{}", stdout); + break; + + default: + putchar (' '); + } + break; + + case 2: /* Acute accent */ + printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base); + break; + + case 3: /* Grave accent */ + printf ("\\`%s%c", (base == 'i' ? "\\" : ""), base); + break; + + case 4: /* Circumflex accent */ + printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base); + break; + + case 5: /* Diaeresis */ + printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base); + break; + + case 6: /* Tilde accent */ + printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base); + break; + + case 7: /* Cedilla */ + printf ("\\c{%c}", base); + break; + + case 8: /* Small circle beneath */ + switch (base) + { + case 'a': + fputs ("\\aa{}", stdout); + break; + + case 'A': + fputs ("\\AA{}", stdout); + break; + + default: + putchar (' '); + } + break; + + case 9: /* Strike through */ + switch (base) + { + case 'o': + fputs ("\\o{}", stdout); + break; + + case 'O': + fputs ("\\O{}", stdout); + break; + + default: + putchar (' '); + } + break; + } + } + else + + /* This is not a diacritic character, so handle cases which are + really specific to `roff' or TeX. All white space processing + is done as the default case of this switch. */ + + switch (character) + { + case '"': + /* In roff output format, double any quote. */ + putchar ('"'); + putchar ('"'); + break; + + case '$': + case '%': + case '&': + case '#': + case '_': + /* In TeX output format, precede these with a backslash. 
*/ + putchar ('\\'); + putchar (character); + break; + + case '{': + case '}': + /* In TeX output format, precede these with a backslash and + force mathematical mode. */ + printf ("$\\%c$", character); + break; + + case '\\': + /* In TeX output mode, request production of a backslash. */ + fputs ("\\backslash{}", stdout); + break; + + default: + /* Any other flagged character produces a single space. */ + putchar (' '); + } + } + else + putchar (*cursor); + } +} + +/* Formatting and actual output - planning routines. */ + +/*--------------------------------------------------------------------. +| From information collected from command line options and input file | +| readings, compute and fix some output parameter values. | +`--------------------------------------------------------------------*/ + +static void +fix_output_parameters (void) +{ + int file_index; /* index in text input file arrays */ + int line_ordinal; /* line ordinal value for reference */ + char ordinal_string[12]; /* edited line ordinal for reference */ + int reference_width; /* width for the whole reference */ + int character; /* character ordinal */ + const char *cursor; /* cursor in some constant strings */ + + /* In auto reference mode, the maximum width of this field is + precomputed and subtracted from the overall line width. Add one for + the column which separate the file name from the line number. 
*/ + + if (auto_reference) + { + reference_max_width = 0; + for (file_index = 0; file_index < number_input_files; file_index++) + { + line_ordinal = file_line_count[file_index] + 1; + if (file_index > 0) + line_ordinal -= file_line_count[file_index - 1]; + sprintf (ordinal_string, "%d", line_ordinal); + reference_width = strlen (ordinal_string); + if (input_file_name[file_index]) + reference_width += strlen (input_file_name[file_index]); + if (reference_width > reference_max_width) + reference_max_width = reference_width; + } + reference_max_width++; + reference.start = (char *) xmalloc ((size_t) reference_max_width + 1); + } + + /* If the reference appears to the left of the output line, reserve some + space for it right away, including one gap size. */ + + if ((auto_reference || input_reference) && !right_reference) + line_width -= reference_max_width + gap_size; + + /* The output lines, minimally, will contain from left to right a left + context, a gap, and a keyword followed by the right context with no + special intervening gap. Half of the line width is dedicated to the + left context and the gap, the other half is dedicated to the keyword + and the right context; these values are computed once and for all here. + There also are tail and head wrap around fields, used when the keyword + is near the beginning or the end of the line, or when some long word + cannot fit in, but leave place from wrapped around shorter words. The + maximum width of these fields are recomputed separately for each line, + on a case by case basis. It is worth noting that it cannot happen that + both the tail and head fields are used at once. */ + + half_line_width = line_width / 2; + before_max_width = half_line_width - gap_size; + keyafter_max_width = half_line_width; + + /* If truncation_string is the empty string, make it NULL to speed up + tests. In this case, truncation_string_length will never get used, so + there is no need to set it. 
*/ + + if (truncation_string && *truncation_string) + truncation_string_length = strlen (truncation_string); + else + truncation_string = NULL; + + if (gnu_extensions) + { + + /* When flagging truncation at the left of the keyword, the + truncation mark goes at the beginning of the before field, + unless there is a head field, in which case the mark goes at the + left of the head field. When flagging truncation at the right + of the keyword, the mark goes at the end of the keyafter field, + unless there is a tail field, in which case the mark goes at the + end of the tail field. Only eight combination cases could arise + for truncation marks: + + . None. + . One beginning the before field. + . One beginning the head field. + . One ending the keyafter field. + . One ending the tail field. + . One beginning the before field, another ending the keyafter field. + . One ending the tail field, another beginning the before field. + . One ending the keyafter field, another beginning the head field. + + So, there is at most two truncation marks, which could appear both + on the left side of the center of the output line, both on the + right side, or one on either side. */ + + before_max_width -= 2 * truncation_string_length; + keyafter_max_width -= 2 * truncation_string_length; + } + else + { + + /* I never figured out exactly how UNIX' ptx plans the output width + of its various fields. If GNU extensions are disabled, do not + try computing the field widths correctly; instead, use the + following formula, which does not completely imitate UNIX' ptx, + but almost. */ + + keyafter_max_width -= 2 * truncation_string_length + 1; + } + + /* Compute which characters need special output processing. Initialize + by flagging any white space character. Some systems do not consider + form feed as a space character, but we do. 
*/ + + for (character = 0; character < CHAR_SET_SIZE; character++) + edited_flag[character] = isspace (character) != 0; + edited_flag['\f'] = 1; + + /* Complete the special character flagging according to selected output + format. */ + + switch (output_format) + { + case UNKNOWN_FORMAT: + /* Should never happen. */ + + case DUMB_FORMAT: + break; + + case ROFF_FORMAT: + + /* `Quote' characters should be doubled. */ + + edited_flag['"'] = 1; + break; + + case TEX_FORMAT: + + /* Various characters need special processing. */ + + for (cursor = "$%&#_{}\\"; *cursor; cursor++) + edited_flag[(unsigned char) *cursor] = 1; + + /* Any character with 8th bit set will print to a single space, unless + it is diacriticized. */ + + for (character = 0200; character < CHAR_SET_SIZE; character++) + edited_flag[character] = todiac (character) != 0; + break; + } +} + +/*------------------------------------------------------------------. +| Compute the position and length of all the output fields, given a | +| pointer to some OCCURS. | +`------------------------------------------------------------------*/ + +static void +define_all_fields (OCCURS *occurs) +{ + int tail_max_width; /* allowable width of tail field */ + int head_max_width; /* allowable width of head field */ + char *cursor; /* running cursor in source text */ + char *left_context_start; /* start of left context */ + char *right_context_end; /* end of right context */ + char *left_field_start; /* conservative start for `head'/`before' */ + int file_index; /* index in text input file arrays */ + const char *file_name; /* file name for reference */ + int line_ordinal; /* line ordinal for reference */ + + /* Define `keyafter', start of left context and end of right context. + `keyafter' starts at the saved position for keyword and extend to the + right from the end of the keyword, eating separators or full words, but + not beyond maximum allowed width for `keyafter' field or limit for the + right context. 
Suffix spaces will be removed afterwards. */ + + keyafter.start = occurs->key.start; + keyafter.end = keyafter.start + occurs->key.size; + left_context_start = keyafter.start + occurs->left; + right_context_end = keyafter.start + occurs->right; + + cursor = keyafter.end; + while (cursor < right_context_end + && cursor <= keyafter.start + keyafter_max_width) + { + keyafter.end = cursor; + SKIP_SOMETHING (cursor, right_context_end); + } + if (cursor <= keyafter.start + keyafter_max_width) + keyafter.end = cursor; + + keyafter_truncation = truncation_string && keyafter.end < right_context_end; + + SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start); + + /* When the left context is wide, it might take some time to catch up from + the left context boundary to the beginning of the `head' or `before' + fields. So, in this case, to speed the catchup, we jump back from the + keyword, using some secure distance, possibly falling in the middle of + a word. A secure backward jump would be at least half the maximum + width of a line, plus the size of the longest word met in the whole + input. We conclude this backward jump by a skip forward of at least + one word. In this manner, we should not inadvertently accept only part + of a word. From the reached point, when it will be time to fix the + beginning of `head' or `before' fields, we will skip forward words or + delimiters until we get sufficiently near. */ + + if (-occurs->left > half_line_width + maximum_word_length) + { + left_field_start + = keyafter.start - (half_line_width + maximum_word_length); + SKIP_SOMETHING (left_field_start, keyafter.start); + } + else + left_field_start = keyafter.start + occurs->left; + + /* `before' certainly ends at the keyword, but not including separating + spaces. It starts after than the saved value for the left context, by + advancing it until it falls inside the maximum allowed width for the + before field. There will be no prefix spaces either. 
`before' only + advances by skipping single separators or whole words. */ + + before.start = left_field_start; + before.end = keyafter.start; + SKIP_WHITE_BACKWARDS (before.end, before.start); + + while (before.start + before_max_width < before.end) + SKIP_SOMETHING (before.start, before.end); + + if (truncation_string) + { + cursor = before.start; + SKIP_WHITE_BACKWARDS (cursor, text_buffer.start); + before_truncation = cursor > left_context_start; + } + else + before_truncation = 0; + + SKIP_WHITE (before.start, text_buffer.end); + + /* The tail could not take more columns than what has been left in the + left context field, and a gap is mandatory. It starts after the + right context, and does not contain prefixed spaces. It ends at + the end of line, the end of buffer or when the tail field is full, + whichever comes first. It cannot contain only part of a word, and + has no suffixed spaces. */ + + tail_max_width + = before_max_width - (before.end - before.start) - gap_size; + + if (tail_max_width > 0) + { + tail.start = keyafter.end; + SKIP_WHITE (tail.start, text_buffer.end); + + tail.end = tail.start; + cursor = tail.end; + while (cursor < right_context_end + && cursor < tail.start + tail_max_width) + { + tail.end = cursor; + SKIP_SOMETHING (cursor, right_context_end); + } + + if (cursor < tail.start + tail_max_width) + tail.end = cursor; + + if (tail.end > tail.start) + { + keyafter_truncation = 0; + tail_truncation = truncation_string && tail.end < right_context_end; + } + else + tail_truncation = 0; + + SKIP_WHITE_BACKWARDS (tail.end, tail.start); + } + else + { + + /* No place left for a tail field. */ + + tail.start = NULL; + tail.end = NULL; + tail_truncation = 0; + } + + /* `head' could not take more columns than what has been left in the right + context field, and a gap is mandatory. It ends before the left + context, and does not contain suffixed spaces. Its pointer is advanced + until the head field has shrunk to its allowed width. 
It cannot + contain only part of a word, and has no suffixed spaces. */ + + head_max_width + = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size; + + if (head_max_width > 0) + { + head.end = before.start; + SKIP_WHITE_BACKWARDS (head.end, text_buffer.start); + + head.start = left_field_start; + while (head.start + head_max_width < head.end) + SKIP_SOMETHING (head.start, head.end); + + if (head.end > head.start) + { + before_truncation = 0; + head_truncation = (truncation_string + && head.start > left_context_start); + } + else + head_truncation = 0; + + SKIP_WHITE (head.start, head.end); + } + else + { + + /* No place left for a head field. */ + + head.start = NULL; + head.end = NULL; + head_truncation = 0; + } + + if (auto_reference) + { + + /* Construct the reference text in preallocated space from the file + name and the line number. Find out in which file the reference + occurred. Standard input yields an empty file name. Insure line + numbers are one based, even if they are computed zero based. */ + + file_index = 0; + while (file_line_count[file_index] < occurs->reference) + file_index++; + + file_name = input_file_name[file_index]; + if (!file_name) + file_name = ""; + + line_ordinal = occurs->reference + 1; + if (file_index > 0) + line_ordinal -= file_line_count[file_index - 1]; + + sprintf (reference.start, "%s:%d", file_name, line_ordinal); + reference.end = reference.start + strlen (reference.start); + } + else if (input_reference) + { + + /* Reference starts at saved position for reference and extends right + until some white space is met. */ + + reference.start = keyafter.start + (DELTA) occurs->reference; + reference.end = reference.start; + SKIP_NON_WHITE (reference.end, right_context_end); + } +} + +/* Formatting and actual output - control routines. */ + +/*----------------------------------------------------------------------. +| Output the current output fields as one line for `troff' or `nroff'. 
|
`----------------------------------------------------------------------*/

static void
output_one_roff_line (void)
{
  /* The truncation mark is only ever written when one of the
     *_truncation flags is set, so reading the pointer here is harmless
     even when no mark is in use.  */
  const char *const mark = truncation_string;

  /* Nonzero when a fifth, `reference', argument has to be emitted.  */
  const int with_reference = auto_reference || input_reference;

  /* The line is a call to the macro, followed by one double-quoted
     argument per field, in the order tail, before, keyafter, head and,
     optionally, reference.  */

  /* `tail': text first, truncation mark after it.  */

  printf (".%s \"", macro_name);
  print_field (tail);
  if (tail_truncation)
    fputs (mark, stdout);
  putchar ('"');

  /* `before': truncation mark first, then text.  */

  fputs (" \"", stdout);
  if (before_truncation)
    fputs (mark, stdout);
  print_field (before);
  putchar ('"');

  /* `keyafter': text first, truncation mark after it.  */

  fputs (" \"", stdout);
  print_field (keyafter);
  if (keyafter_truncation)
    fputs (mark, stdout);
  putchar ('"');

  /* `head': truncation mark first, then text.  */

  fputs (" \"", stdout);
  if (head_truncation)
    fputs (mark, stdout);
  print_field (head);
  putchar ('"');

  /* `reference', only when some kind of referencing is selected.  */

  if (with_reference)
    {
      fputs (" \"", stdout);
      print_field (reference);
      putchar ('"');
    }

  putchar ('\n');
}

/*---------------------------------------------------------.
| Output the current output fields as one line for `TeX'.
| +`---------------------------------------------------------*/ + +static void +output_one_tex_line (void) +{ + BLOCK key; /* key field, isolated */ + BLOCK after; /* after field, isolated */ + char *cursor; /* running cursor in source text */ + + printf ("\\%s ", macro_name); + fputs ("{", stdout); + print_field (tail); + fputs ("}{", stdout); + print_field (before); + fputs ("}{", stdout); + key.start = keyafter.start; + after.end = keyafter.end; + cursor = keyafter.start; + SKIP_SOMETHING (cursor, keyafter.end); + key.end = cursor; + after.start = cursor; + print_field (key); + fputs ("}{", stdout); + print_field (after); + fputs ("}{", stdout); + print_field (head); + fputs ("}", stdout); + if (auto_reference || input_reference) + { + fputs ("{", stdout); + print_field (reference); + fputs ("}", stdout); + } + fputs ("\n", stdout); +} + +/*-------------------------------------------------------------------. +| Output the current output fields as one line for a dumb terminal. | +`-------------------------------------------------------------------*/ + +static void +output_one_dumb_line (void) +{ + if (!right_reference) + if (auto_reference) + { + + /* Output the `reference' field, in such a way that GNU emacs + next-error will handle it. The ending colon is taken from the + gap which follows. */ + + print_field (reference); + putchar (':'); + print_spaces (reference_max_width + + gap_size + - (reference.end - reference.start) + - 1); + } + else + { + + /* Output the `reference' field and its following gap. */ + + print_field (reference); + print_spaces (reference_max_width + + gap_size + - (reference.end - reference.start)); + } + + if (tail.start < tail.end) + { + /* Output the `tail' field. */ + + print_field (tail); + if (tail_truncation) + fputs (truncation_string, stdout); + + print_spaces (half_line_width - gap_size + - (before.end - before.start) + - (before_truncation ? truncation_string_length : 0) + - (tail.end - tail.start) + - (tail_truncation ? 
truncation_string_length : 0)); + } + else + print_spaces (half_line_width - gap_size + - (before.end - before.start) + - (before_truncation ? truncation_string_length : 0)); + + /* Output the `before' field. */ + + if (before_truncation) + fputs (truncation_string, stdout); + print_field (before); + + print_spaces (gap_size); + + /* Output the `keyafter' field. */ + + print_field (keyafter); + if (keyafter_truncation) + fputs (truncation_string, stdout); + + if (head.start < head.end) + { + /* Output the `head' field. */ + + print_spaces (half_line_width + - (keyafter.end - keyafter.start) + - (keyafter_truncation ? truncation_string_length : 0) + - (head.end - head.start) + - (head_truncation ? truncation_string_length : 0)); + if (head_truncation) + fputs (truncation_string, stdout); + print_field (head); + } + else + + if ((auto_reference || input_reference) && right_reference) + print_spaces (half_line_width + - (keyafter.end - keyafter.start) + - (keyafter_truncation ? truncation_string_length : 0)); + + if ((auto_reference || input_reference) && right_reference) + { + /* Output the `reference' field. */ + + print_spaces (gap_size); + print_field (reference); + } + + fputs ("\n", stdout); +} + +/*------------------------------------------------------------------------. +| Scan the whole occurs table and, for each entry, output one line in the | +| appropriate format. | +`------------------------------------------------------------------------*/ + +static void +generate_all_output (void) +{ + int occurs_index; /* index of keyword entry being processed */ + OCCURS *occurs_cursor; /* current keyword entry being processed */ + + + /* The following assignments are useful to provide default values in case + line contexts or references are not used, in which case these variables + would never be computed. 
*/ + + tail.start = NULL; + tail.end = NULL; + tail_truncation = 0; + + head.start = NULL; + head.end = NULL; + head_truncation = 0; + + + /* Loop over all keyword occurrences. */ + + occurs_cursor = occurs_table[0]; + + for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++) + { + /* Compute the exact size of every field and whenever truncation flags + are present or not. */ + + define_all_fields (occurs_cursor); + + /* Produce one output line according to selected format. */ + + switch (output_format) + { + case UNKNOWN_FORMAT: + /* Should never happen. */ + + case DUMB_FORMAT: + output_one_dumb_line (); + break; + + case ROFF_FORMAT: + output_one_roff_line (); + break; + + case TEX_FORMAT: + output_one_tex_line (); + break; + } + + /* Advance the cursor into the occurs table. */ + + occurs_cursor++; + } +} + +/* Option decoding and main program. */ + +/*------------------------------------------------------. +| Print program identification and options, then exit. | +`------------------------------------------------------*/ + +static void +usage (int status) +{ + if (status != EXIT_SUCCESS) + fprintf (stderr, _("Try `%s --help' for more information.\n"), + program_name); + else + { + printf (_("\ +Usage: %s [OPTION]... [INPUT]... (without -G)\n\ + or: %s -G [OPTION]... 
[INPUT [OUTPUT]]\n"), + program_name, program_name); + fputs (_("\ +Mandatory arguments to long options are mandatory for short options too.\n\ +\n\ + -A, --auto-reference output automatically generated references\n\ + -C, --copyright display Copyright and copying conditions\n\ + -G, --traditional behave more like System V `ptx'\n\ + -F, --flag-truncation=STRING use STRING for flagging line truncations\n\ + -M, --macro-name=STRING macro name to use instead of `xx'\n\ + -O, --format=roff generate output as roff directives\n\ + -R, --right-side-refs put references at right, not counted in -w\n\ + -S, --sentence-regexp=REGEXP for end of lines or end of sentences\n\ + -T, --format=tex generate output as TeX directives\n\ + -W, --word-regexp=REGEXP use REGEXP to match each keyword\n\ + -b, --break-file=FILE word break characters in this FILE\n\ + -f, --ignore-case fold lower case to upper case for sorting\n\ + -g, --gap-size=NUMBER gap size in columns between output fields\n\ + -i, --ignore-file=FILE read ignore word list from FILE\n\ + -o, --only-file=FILE read only word list from this FILE\n\ + -r, --references first field of each line is a reference\n\ + -t, --typeset-mode - not implemented -\n\ + -w, --width=NUMBER output width in columns, reference excluded\n\ + --help display this help and exit\n\ + --version output version information and exit\n\ +\n\ +With no FILE or if FILE is -, read Standard Input. `-F /' by default.\n"), + stdout); + } + exit (status); +} + +/*----------------------------------------------------------------------. +| Main program. Decode ARGC arguments passed through the ARGV array of | +| strings, then launch execution. | +`----------------------------------------------------------------------*/ + +/* Long options equivalences. 
*/ +static const struct option long_options[] = +{ + {"auto-reference", no_argument, NULL, 'A'}, + {"break-file", required_argument, NULL, 'b'}, + {"copyright", no_argument, NULL, 'C'}, + {"flag-truncation", required_argument, NULL, 'F'}, + {"ignore-case", no_argument, NULL, 'f'}, + {"gap-size", required_argument, NULL, 'g'}, + {"help", no_argument, &show_help, 1}, + {"ignore-file", required_argument, NULL, 'i'}, + {"macro-name", required_argument, NULL, 'M'}, + {"only-file", required_argument, NULL, 'o'}, + {"references", no_argument, NULL, 'r'}, + {"right-side-refs", no_argument, NULL, 'R'}, + {"format", required_argument, NULL, 10}, + {"sentence-regexp", required_argument, NULL, 'S'}, + {"traditional", no_argument, NULL, 'G'}, + {"typeset-mode", no_argument, NULL, 't'}, + {"version", no_argument, &show_version, 1}, + {"width", required_argument, NULL, 'w'}, + {"word-regexp", required_argument, NULL, 'W'}, + {0, 0, 0, 0}, +}; + +static char const* const format_args[] = +{ + "roff", "tex", 0 +}; + +int +main (int argc, char *const argv[]) +{ + int optchar; /* argument character */ + int file_index; /* index in text input file arrays */ + + /* Decode program options. */ + + program_name = argv[0]; + setlocale (LC_ALL, ""); + +#if HAVE_SETCHRCLASS + setchrclass (NULL); +#endif + + while (optchar = getopt_long (argc, argv, "ACF:GM:ORS:TW:b:i:fg:o:trw:", + long_options, NULL), + optchar != EOF) + { + switch (optchar) + { + default: + usage (EXIT_FAILURE); + + case 0: + break; + + case 'C': + fputs (_("\ +This program is free software; you can redistribute it and/or modify\n\ +it under the terms of the GNU General Public License as published by\n\ +the Free Software Foundation; either version 2, or (at your option)\n\ +any later version.\n\ +\n\ +This program is distributed in the hope that it will be useful,\n\ +but WITHOUT ANY WARRANTY; without even the implied warranty of\n\ +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the\n\ +GNU General Public License for more details.\n\ +\n\ +You should have received a copy of the GNU General Public License\n\ +along with this program; if not, write to the Free Software Foundation,\n\ +Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.\n"), + stdout); + + exit (EXIT_SUCCESS); + + case 'G': + gnu_extensions = 0; + break; + + case 'b': + break_file = optarg; + break; + + case 'f': + ignore_case = 1; + break; + + case 'g': + gap_size = atoi (optarg); + break; + + case 'i': + ignore_file = optarg; + break; + + case 'o': + only_file = optarg; + break; + + case 'r': + input_reference = 1; + break; + + case 't': + /* Yet to understand... */ + break; + + case 'w': + line_width = atoi (optarg); + break; + + case 'A': + auto_reference = 1; + break; + + case 'F': + truncation_string = copy_unescaped_string (optarg); + break; + + case 'M': + macro_name = optarg; + break; + + case 'O': + output_format = ROFF_FORMAT; + break; + + case 'R': + right_reference = 1; + break; + + case 'S': + context_regex_string = copy_unescaped_string (optarg); + break; + + case 'T': + output_format = TEX_FORMAT; + break; + + case 'W': + word_regex_string = copy_unescaped_string (optarg); + break; + + case 10: + switch (argmatch (optarg, format_args)) + { + default: + usage (EXIT_FAILURE); + + case 0: + output_format = ROFF_FORMAT; + break; + + case 1: + output_format = TEX_FORMAT; + break; + } + } + } + + /* Process trivial options. */ + + if (show_help) + usage (EXIT_SUCCESS); + + if (show_version) + { + printf ("ptx (%s) %s\n", GNU_PACKAGE, VERSION); + exit (EXIT_SUCCESS); + } + + /* Change the default Ignore file if one is defined. */ + +#ifdef DEFAULT_IGNORE_FILE + if (!ignore_file) + ignore_file = DEFAULT_IGNORE_FILE; +#endif + + /* Process remaining arguments. If GNU extensions are enabled, process + all arguments as input parameters. If disabled, accept at most two + arguments, the second of which is an output parameter. 
*/ + + if (optind == argc) + { + + /* No more argument simply means: read standard input. */ + + input_file_name = (const char **) xmalloc (sizeof (const char *)); + file_line_count = (int *) xmalloc (sizeof (int)); + number_input_files = 1; + input_file_name[0] = NULL; + } + else if (gnu_extensions) + { + number_input_files = argc - optind; + input_file_name + = (const char **) xmalloc (number_input_files * sizeof (const char *)); + file_line_count + = (int *) xmalloc (number_input_files * sizeof (int)); + + for (file_index = 0; file_index < number_input_files; file_index++) + { + input_file_name[file_index] = argv[optind]; + if (!*argv[optind] || strcmp (argv[optind], "-") == 0) + input_file_name[0] = NULL; + else + input_file_name[0] = argv[optind]; + optind++; + } + } + else + { + + /* There is one necessary input file. */ + + number_input_files = 1; + input_file_name = (const char **) xmalloc (sizeof (const char *)); + file_line_count = (int *) xmalloc (sizeof (int)); + if (!*argv[optind] || strcmp (argv[optind], "-") == 0) + input_file_name[0] = NULL; + else + input_file_name[0] = argv[optind]; + optind++; + + /* Redirect standard output, only if requested. */ + + if (optind < argc) + { + fclose (stdout); + if (fopen (argv[optind], "w") == NULL) + error (EXIT_FAILURE, errno, argv[optind]); + optind++; + } + + /* Diagnose any other argument as an error. */ + + if (optind < argc) + usage (EXIT_FAILURE); + } + + /* If the output format has not been explicitly selected, choose dumb + terminal format if GNU extensions are enabled, else `roff' format. */ + + if (output_format == UNKNOWN_FORMAT) + output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT; + + /* Initialize the main tables. */ + + initialize_regex (); + + /* Read `Break character' file, if any. */ + + if (break_file) + digest_break_file (break_file); + + /* Read `Ignore words' file and `Only words' files, if any. 
If any of + these files is empty, reset the name of the file to NULL, to avoid + unnecessary calls to search_table. */ + + if (ignore_file) + { + digest_word_file (ignore_file, &ignore_table); + if (ignore_table.length == 0) + ignore_file = NULL; + } + + if (only_file) + { + digest_word_file (only_file, &only_table); + if (only_table.length == 0) + only_file = NULL; + } + + /* Prepare to study all the input files. */ + + number_of_occurs[0] = 0; + total_line_count = 0; + maximum_word_length = 0; + reference_max_width = 0; + + for (file_index = 0; file_index < number_input_files; file_index++) + { + + /* Read the file in core, than study it. */ + + swallow_file_in_memory (input_file_name[file_index], &text_buffer); + find_occurs_in_text (); + + /* Maintain for each file how many lines has been read so far when its + end is reached. Incrementing the count first is a simple kludge to + handle a possible incomplete line at end of file. */ + + total_line_count++; + file_line_count[file_index] = total_line_count; + } + + /* Do the output process phase. */ + + sort_found_occurs (); + fix_output_parameters (); + generate_all_output (); + + /* All done. */ + + exit (EXIT_SUCCESS); +} |