From dccf44af32efb236da96b99031c571151041d724 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Thu, 29 Sep 1994 23:47:37 +0000 Subject: . --- src/fmt.c | 974 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 974 insertions(+) create mode 100644 src/fmt.c (limited to 'src') diff --git a/src/fmt.c b/src/fmt.c new file mode 100644 index 000000000..88f5739ec --- /dev/null +++ b/src/fmt.c @@ -0,0 +1,974 @@ +/* GNU fmt -- simple text formatter. + Copyright (C) 1994 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* Written by Ross Paterson . */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include + +#include +#include "system.h" + +#include "getopt.h" +#include "version.h" + +/* The following parameters represent the program's idea of what is + "best". Adjust to taste, subject to the caveats given. */ + +/* Default longest permitted line length (max_width). */ +#define WIDTH 75 + +/* Prefer lines to be LEEWAY % shorter than the maximum width, giving + room for optimization. */ +#define LEEWAY 7 + +/* The default secondary indent of tagged paragraph used for unindented + one-line paragraphs not preceded by any multi-line paragraphs. */ +#define DEF_INDENT 3 + +/* Costs and bonuses are expressed as the equivalent departure from the + optimal line length, multiplied by 10. e.g. assigning something a + cost of 50 means that it is as bad as a line 5 characters too short + or too long. The definition of SHORT_COST(n) should not be changed. + However, EQUIV(n) may need tuning. */ + +typedef long COST; + +#define MAXCOST (~(((COST) 1) << (8 * sizeof (COST) -1))) + +#define SQR(n) ((n) * (n)) +#define EQUIV(n) SQR ((COST) (n)) + +/* Cost of a filled line n chars longer or shorter than best_width. */ +#define SHORT_COST(n) EQUIV ((n) * 10) + +/* Cost of the difference between adjacent filled lines. */ +#define RAGGED_COST(n) (SHORT_COST (n) / 2) + +/* Basic cost per line. */ +#define LINE_COST EQUIV (70) + +/* Cost of breaking a line after the first word of a sentence, where + the length of the word is N. */ +#define WIDOW_COST(n) (EQUIV (200) / ((n) + 2)) + +/* Cost of breaking a line before the last word of a sentence, where + the length of the word is N. */ +#define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2)) + +/* Bonus for breaking a line at the end of a sentence. */ +#define SENTENCE_BONUS EQUIV (50) + +/* Cost of breaking a line after a period not marking end of a sentence. + With the definition of sentence we are using (borrowed from emacs, see + get_line()) such a break would then look like a sentence break. Hence + we assign a very high cost -- it should be avoided unless things are + really bad. */ +#define NOBREAK_COST EQUIV (600) + +/* Bonus for breaking a line before open parenthesis. */ +#define PAREN_BONUS EQUIV (40) + +/* Bonus for breaking a line after other punctuation. */ +#define PUNCT_BONUS EQUIV(40) + +/* Credit for breaking a long paragraph one line later. */ +#define LINE_CREDIT EQUIV(3) + +/* Size of paragraph buffer, in words and characters. Longer paragraphs + are handled neatly (cf. flush_paragraph()), so there's little to gain + by making these larger. */ +#define MAXWORDS 1000 +#define MAXCHARS 5000 + +/* Extra ctype(3)-style macros. */ + +#define isopen(c) (strchr ("([`'\"", c) != NULL) +#define isclose(c) (strchr (")]'\"", c) != NULL) +#define isperiod(c) (strchr (".?!", c) != NULL) + +/* Size of a tab stop, for expansion on input and re-introduction on + output. */ +#define TABWIDTH 8 + +/* Miscellaneous definitions. */ + +typedef unsigned int bool; +#define TRUE 1 +#define FALSE 0 + +/* Word descriptor structure. */ + +typedef struct Word WORD; + +struct Word + { + + /* Static attributes determined during input. */ + + const char *text; /* the text of the word */ + short length; /* length of this word */ + short space; /* the size of the following space */ + bool paren:1; /* starts with open paren */ + bool period:1; /* ends in [.?!])* */ + bool punct:1; /* ends in punctuation */ + bool final:1; /* end of sentence */ + + /* The remaining fields are computed during the optimization. */ + + short line_length; /* length of the best line starting here */ + COST best_cost; /* cost of best paragraph starting here */ + WORD *next_break; /* break which achieves best_cost */ + }; + +/* Forward declarations. */ + +/* My AC_PROTOTYPES would be better than __STDC__. FIXME :-). */ +#ifdef __STDC__ +#define _(x) x +#else +#define _(x) () +#endif + +#ifdef HAVE_VPRINTF +void error _ ((int, int, const char *,...)); +#else +void error (); +#endif + +static void set_prefix _ ((char *p)); +static void fmt _ ((FILE *f)); +static bool get_paragraph _ ((FILE *f)); +static int get_line _ ((FILE *f, int c)); +static int get_prefix _ ((FILE *f)); +static int get_space _ ((FILE *f, int c)); +static int copy_rest _ ((FILE *f, int c)); +static bool same_para _ ((int c)); +static void flush_paragraph _ ((void)); +static void fmt_paragraph _ ((void)); +static void check_punctuation _ ((WORD * w)); +static COST base_cost _ ((WORD * this)); +static COST line_cost _ ((WORD * next, int len)); +static void put_paragraph _ ((WORD * finish)); +static void put_line _ ((WORD * w, int indent)); +static void put_word _ ((WORD * w)); +static void put_space _ ((int space)); + +/* The name this program was run with. */ +const char *program_name; + +/* If non-zero, display usage information and exit. */ +static int show_help = 0; + +/* If non-zero, print the version on standard output and exit. */ +static int show_version = 0; + +/* Option values. */ + +/* If TRUE, first 2 lines may have different indent (default FALSE). */ +static bool crown; + +/* If TRUE, first 2 lines _must_ have different indent (default FALSE). */ +static bool tagged; + +/* If TRUE, each line is a paragraph on its own (default FALSE). */ +static bool split; + +/* If TRUE, don't preserve inter-word spacing (default FALSE). */ +static bool uniform; + +/* Prefix minus leading and trailing spaces (default ""). */ +static const char *prefix; + +/* User-supplied maximum line width (default WIDTH). The only output +lines + longer than this will each comprise a single word. */ +static int max_width; + +/* Values derived from the option values. */ + +/* The length of prefix minus leading space. */ +static int prefix_full_length; + +/* The length of the leading space trimmed from the prefix. */ +static int prefix_lead_space; + +/* The length of prefix minus leading and trailing space. */ +static int prefix_length; + +/* The preferred width of text lines, set to LEEWAY % less than max_width. */ +static int best_width; + +/* Dynamic variables. */ + +/* Start column of the character most recently read from the input file. */ +static int in_column; + +/* Start column of the next character to be written to stdout. */ +static int out_column; + +/* Space for the paragraph text -- longer paragraphs are handled neatly + (cf. flush_paragraph()). */ +static char parabuf[MAXCHARS]; + +/* A pointer into parabuf, indicating the first unused character position. */ +static char *wptr; + +/* The words of a paragraph -- longer paragraphs are handled neatly + (cf. flush_paragraph()). */ +static WORD word[MAXWORDS]; + +/* A pointer into the above word array, indicating the first position + after the last complete word. Sometimes it will point at an incomplete + word. */ +static WORD *word_limit; + +/* If TRUE, current input file contains tab characters, and so tabs can be + used for white space on output. */ +static bool tabs; + +/* Space before trimmed prefix on each line of the current paragraph. */ +static int prefix_indent; + +/* Indentation of the first line of the current paragraph. */ +static int first_indent; + +/* Indentation of other lines of the current paragraph */ +static int other_indent; + +/* To detect the end of a paragraph, we need to look ahead to the first + non-blank character after the prefix on the next line, or the first + character on the following line that failed to match the prefix. + We can reconstruct the lookahead from that character (next_char), its + position on the line (in_column) and the amount of space before the + prefix (next_prefix_indent). See get_paragraph() and copy_rest(). */ + +/* The last character read from the input file. */ +static int next_char; + +/* The space before the trimmed prefix (or part of it) on the next line + after the current paragraph. */ +static int next_prefix_indent; + +/* If non-zero, the length of the last line output in the current + paragraph, used to charge for raggedness at the split point for long + paragraphs chosen by fmt_paragraph(). */ +static int last_line_length; + +static void +usage (status) + int status; +{ + if (status != 0) + fprintf (stderr, "Try `%s --help' for more information.\n", program_name); + else + { + printf ("Usage: %s [OPTION]... [FILE]...\n", program_name); + fputs ("\ +Mandatory arguments to long options are mandatory for short options too.\n\ +\n\ + -c, --crown-margin preserve indentation of first two lines\n\ + -s, --split-only split long lines, but do not refill\n\ + -t, --tagged-paragraph indentation of first line different from second\n\ + -u, --uniform-spacing one space between words, two after sentences\n\ + -w, --width=NUMBER maximum line width (default of 75 columns)\n\ + -p, --prefix=STRING combine only lines having STRING as prefix\n\ + --help display this help and exit\n\ + --version output version information and exit\n\ +\n\ +In -wNUMBER, the letter `w' may be omitted. Input FILEs are concatenated.\n\ +If no FILE or if FILE is `-', standard input is read.\n", + stdout); + } + exit (status); +} + +/* Decode options and launch execution. */ + +static const struct option long_options[] = +{ + {"crown-margin", no_argument, NULL, 'c'}, + {"help", no_argument, &show_help, 1}, + {"prefix", required_argument, NULL, 'p'}, + {"split-only", no_argument, NULL, 's'}, + {"tagged-paragraph", no_argument, NULL, 't'}, + {"uniform-spacing", no_argument, NULL, 'u'}, + {"version", no_argument, &show_version, 1}, + {"width", required_argument, NULL, 'w'}, + {0, 0, 0, 0}, +}; + +int +main (argc, argv) + register int argc; + register char *argv[]; +{ + int optchar; + FILE *infile; + + program_name = argv[0]; + + crown = tagged = split = uniform = FALSE; + max_width = WIDTH; + prefix = ""; + prefix_length = prefix_lead_space = prefix_full_length = 0; + + if (argc > 1 && argv[1][0] == '-' && ISDIGIT (argv[1][1])) + { + max_width = 0; + /* Old option syntax; a dash followed by one or more digits. + Move past the number. */ + for (++argv[1]; ISDIGIT (*argv[1]); ++argv[1]) + { + /* FIXME: use strtol to detect overflow. */ + max_width = max_width * 10 + *argv[1] - '0'; + } + /* Make the options we just parsed invisible to getopt. */ + argv[1] = argv[0]; + argv++; + argc--; + } + + while ((optchar = getopt_long (argc, argv, "0123456789cstuw:p:", + long_options, NULL)) + != EOF) + switch (optchar) + { + default: + usage (1); + + case 0: + break; + + case 'c': + crown = TRUE; + break; + + case 's': + split = TRUE; + break; + + case 't': + tagged = TRUE; + break; + + case 'u': + uniform = TRUE; + break; + + case 'w': + /* FIXME: use strtol. */ + max_width = atoi (optarg); + break; + + case 'p': + set_prefix (optarg); + break; + + } + + if (show_version) + { + printf ("%s\n", version_string); + exit (0); + } + + if (show_help) + usage (0); + + best_width = max_width * (2 * (100 - LEEWAY) + 1) / 200; + + if (optind == argc) + fmt (stdin); + else + for (; optind < argc; optind++) + if (strcmp (argv[optind], "-") == 0) + fmt (stdin); + else + { + infile = fopen (argv[optind], "r"); + if (infile != NULL) + { + fmt (infile); + fclose (infile); + } + else + error (0, errno, argv[optind]); + } + + exit (0); +} + +/* Trim space from the front and back of the string P, yielding the prefix, + and record the lengths of the prefix and the space trimmed. */ + +static void +set_prefix (p) + register char *p; +{ + register char *s; + + prefix_lead_space = 0; + while (*p == ' ') + { + prefix_lead_space++; + p++; + } + prefix = p; + prefix_full_length = strlen (p); + s = p + prefix_full_length; + while (s > p && s[-1] == ' ') + s--; + *s = '\0'; + prefix_length = s - p; +} + +/* read file F and send formatted output to stdout. */ + +static void +fmt (f) + FILE *f; +{ + tabs = FALSE; + other_indent = 0; + next_char = get_prefix (f); + while (get_paragraph (f)) + { + fmt_paragraph (); + put_paragraph (word_limit); + } +} + +/* Read a paragraph from input file F. A paragraph consists of a + maximal number of non-blank (excluding any prefix) lines subject to: + * In split mode, a paragraph is a single non-blank line. + * In crown mode, the second and subsequent lines must have the + same indentation, but possibly different from the indent of the + first line. + * Tagged mode is similar, but the first and second lines must have + different indentations. + * Otherwise, all lines of a paragraph must have the same indent. + If a prefix is in effect, it must be present at the same indent for + each line in the paragraph. + + Return FALSE if end-of-file was encountered before the start of a + paragraph, else TRUE. */ + +static bool +get_paragraph (f) + FILE *f; +{ + register int c; + + last_line_length = 0; + c = next_char; + + /* Scan (and copy) blank lines, and lines not introduced by the prefix. */ + + while (c == '\n' || c == EOF + || next_prefix_indent < prefix_lead_space + || in_column < next_prefix_indent + prefix_full_length) + { + c = copy_rest (f, c); + if (c == EOF) + { + next_char = EOF; + return FALSE; + } + putchar ('\n'); + c = get_prefix (f); + } + + /* Got a suitable first line for a paragraph. */ + + prefix_indent = next_prefix_indent; + first_indent = in_column; + wptr = parabuf; + word_limit = word; + c = get_line (f, c); + + /* Read rest of paragraph (unless split is specified). */ + + if (split) + other_indent = first_indent; + else if (crown) + { + if (same_para (c)) + { + other_indent = in_column; + do + { /* for each line till the end of the para */ + c = get_line (f, c); + } + while (same_para (c) && in_column == other_indent); + } + else + other_indent = first_indent; + } + else if (tagged) + { + if (same_para (c) && in_column != first_indent) + { + other_indent = in_column; + do + { /* for each line till the end of the para */ + c = get_line (f, c); + } + while (same_para (c) && in_column == other_indent); + } + + /* Only one line: use the secondary indent from last time if it + splits, or 0 if there have been no multi-line paragraphs in the + input so far. But if these rules make the two indents the same, + pick a new secondary indent. */ + + else if (other_indent == first_indent) + other_indent = first_indent == 0 ? DEF_INDENT : 0; + } + else + { + other_indent = first_indent; + while (same_para (c) && in_column == other_indent) + c = get_line (f, c); + } + (word_limit - 1)->period = (word_limit - 1)->final = TRUE; + next_char = c; + return TRUE; +} + +/* Copy a line which failed to match the prefix to the output, or which + was blank after the prefix. In the former case, C is the character + that failed to match the prefix. In the latter, C is \n or EOF. + Return the character (\n or EOF) ending the line. */ + +static int +copy_rest (f, c) + FILE *f; + register int c; +{ + register const char *s; + + out_column = 0; + if (in_column > next_prefix_indent || (c != '\n' && c != EOF)) + { + put_space (next_prefix_indent); + for (s = prefix; out_column != in_column; out_column++) + putchar(*s++); + } + while (c != '\n' && c != EOF) + { + putchar (c); + c = getc (f); + } + return c; +} + +/* Return TRUE if a line whose first non-blank character after the + prefix (if any) is C could belong to the current paragraph, + otherwise FALSE. */ + +static bool +same_para (c) + register int c; +{ + return (next_prefix_indent == prefix_indent + && in_column >= next_prefix_indent + prefix_full_length + && c != '\n' && c != EOF); +} + +/* Read a line from input file F, given first non-blank character C + after the prefix, and the following indent, and break it into words. + A word is a maximal non-empty string of non-white characters. A word + ending in [.?!]["')\]]* and followed by end-of-line or at least two + spaces ends a sentence, as in emacs. + + Return the first non-blank character of the next line. */ + +static int +get_line (f, c) + FILE *f; + register int c; +{ + int start; + register char *end_of_parabuf; + register WORD *end_of_word; + + end_of_parabuf = ¶buf[MAXCHARS]; + end_of_word = &word[MAXWORDS - 2]; + + do + { /* for each word in a line */ + + /* Scan word. */ + + word_limit->text = wptr; + do + { + if (wptr == end_of_parabuf) + flush_paragraph (); + *wptr++ = c; + c = getc (f); + } + while (c != EOF && !isspace (c)); + in_column += word_limit->length = wptr - word_limit->text; + check_punctuation (word_limit); + + /* Scan inter-word space. */ + + start = in_column; + c = get_space (f, c); + word_limit->space = in_column - start; + word_limit->final = (c == EOF + || (word_limit->period + && (c == '\n' || word_limit->space > 1))); + if (c == '\n' || c == EOF || uniform) + word_limit->space = word_limit->final ? 2 : 1; + if (word_limit == end_of_word) + flush_paragraph(); + word_limit++; + if (c == EOF) + return EOF; + } + while (c != '\n'); + return get_prefix (f); +} + +/* Read a prefix from input file F. Return either first non-matching + character, or first non-blank character after the prefix. */ + +static int +get_prefix (f) + FILE *f; +{ + register int c; + register const char *p; + + in_column = 0; + c = get_space (f, getc (f)); + if (prefix_length == 0) + next_prefix_indent = prefix_lead_space < in_column ? + prefix_lead_space : in_column; + else + { + next_prefix_indent = in_column; + for (p = prefix; *p != '\0'; p++) + { + if (c != *p) + return c; + in_column++; + c = getc (f); + } + c = get_space (f, c); + } + return c; +} + +/* Read blank characters from input file F, starting with C, and keeping + in_column up-to-date. Return first non-blank character. */ + +static int +get_space (f, c) + FILE *f; + register int c; +{ + for (;;) + { + if (c == ' ') + in_column++; + else if (c == '\t') + { + tabs = TRUE; + in_column = (in_column / TABWIDTH + 1) * TABWIDTH; + } + else + return c; + c = getc (f); + } +} + +/* Set extra fields in word W describing any attached punctuation. */ + +static void +check_punctuation (w) + register WORD *w; +{ + register const char *start, *finish; + + start = w->text; + finish = start + (w->length - 1); + w->paren = isopen (*start); + w->punct = ispunct (*finish); + while (isclose (*finish) && finish > start) + finish--; + w->period = isperiod (*finish); +} + +/* Flush part of the paragraph to make room. This function is called on + hitting the limit on the number of words or characters. */ + +static void +flush_paragraph () +{ + WORD *split_point; + register WORD *w; + int shift; + COST best_break; + + /* In the special case where it's all one word, just flush it. */ + + if (word_limit == word) + { + printf ("%*s", wptr - parabuf, parabuf); + wptr = parabuf; + return; + } + + /* Otherwise: + - format what you have so far as a paragraph, + - find a low-cost line break near the end, + - output to there, + - make that the start of the paragraph. */ + + fmt_paragraph (); + + /* Choose a good split point. */ + + split_point = word_limit; + best_break = MAXCOST; + for (w = word->next_break; w != word_limit; w = w->next_break) + { + if (w->best_cost - w->next_break->best_cost < best_break) + { + split_point = w; + best_break = w->best_cost - w->next_break->best_cost; + } + if (best_break <= MAXCOST - LINE_CREDIT) + best_break += LINE_CREDIT; + } + put_paragraph (split_point); + + /* Copy text of words down to start of parabuf -- we use bcopy because + the source and target may overlap. */ + + bcopy (split_point->text, parabuf, (size_t) (wptr - split_point->text)); + shift = split_point->text - parabuf; + wptr -= shift; + + /* Adjust text pointers. */ + + for (w = split_point; w <= word_limit; w++) + w->text -= shift; + + /* Copy words from split_point down to word -- we use bcopy because + the source and target may overlap. */ + + bcopy ((char *) split_point, (char *) word, + (word_limit - split_point + 1) * sizeof (WORD)); + word_limit -= split_point - word; +} + +/* Compute the optimal formatting for the whole paragraph by computing + and remembering the optimal formatting for each suffix from the empty + one to the whole paragraph. */ + +static void +fmt_paragraph () +{ + register WORD *start, *w; + register int len; + register COST wcost, best; + int saved_length; + + word_limit->best_cost = 0; + saved_length = word_limit->length; + word_limit->length = max_width; /* sentinel */ + + for (start = word_limit - 1; start >= word; start--) + { + best = MAXCOST; + len = start == word ? first_indent : other_indent; + + /* At least one word, however long, in the line. */ + + w = start; + len += w->length; + do + { + w++; + + /* Consider breaking before w. */ + + wcost = line_cost (w, len) + w->best_cost; + if (start == word && last_line_length > 0) + wcost += RAGGED_COST (len - last_line_length); + if (wcost < best) + { + best = wcost; + start->next_break = w; + start->line_length = len; + } + len += (w - 1)->space + w->length; /* w > start >= word */ + } + while (len < max_width); + start->best_cost = best + base_cost (start); + } + + word_limit->length = saved_length; +} + +/* Return the constant component of the cost of breaking before the + word THIS. */ + +static COST +base_cost (this) + register WORD *this; +{ + register COST cost; + + cost = LINE_COST; + + if (this > word) + { + if ((this - 1)->period) + { + if ((this - 1)->final) + cost -= SENTENCE_BONUS; + else + cost += NOBREAK_COST; + } + else if ((this - 1)->punct) + cost -= PUNCT_BONUS; + else if (this > word + 1 && (this - 2)->final) + cost += WIDOW_COST ((this - 1)->length); + } + + if (this->paren) + cost -= PAREN_BONUS; + else if (this->final) + cost += ORPHAN_COST (this->length); + + return cost; +} + +/* Return the component of the cost of breaking before word NEXT that + depends on LEN, the length of the line beginning there. */ + +static COST +line_cost (next, len) + register WORD *next; + register int len; +{ + register int n; + register COST cost; + + if (next == word_limit) + return 0; + n = best_width - len; + cost = SHORT_COST (n); + if (next->next_break != word_limit) + { + n = len - next->line_length; + cost += RAGGED_COST (n); + } + return cost; +} + +/* Output to stdout a paragraph from word up to (but not including) + FINISH, which must be in the next_break chain from word. */ + +static void +put_paragraph (finish) + register WORD *finish; +{ + register WORD *w; + + put_line (word, first_indent); + for (w = word->next_break; w != finish; w = w->next_break) + put_line (w, other_indent); +} + +/* Output to stdout the line beginning with word W, beginning in column + INDENT, including the prefix (if any). */ + +static void +put_line (w, indent) + register WORD *w; + int indent; +{ + register WORD *endline; + + out_column = 0; + put_space (prefix_indent); + fputs (prefix, stdout); + out_column += prefix_length; + put_space (indent - out_column); + + endline = w->next_break - 1; + for ( ; w != endline; w++) + { + put_word (w); + put_space (w->space); + } + put_word (w); + last_line_length = out_column; + putchar ('\n'); +} + +/* Output to stdout the word W. */ + +static void +put_word (w) + register WORD *w; +{ + register const char *s; + register int n; + + s = w->text; + for (n = w->length; n != 0; n--) + putchar(*s++); + out_column += w->length; +} + +/* Output to stdout SPACE spaces, or equivalent tabs. */ + +static void +put_space (space) + int space; +{ + register int space_target, tab_target; + + space_target = out_column + space; + if (tabs) + { + tab_target = space_target / TABWIDTH * TABWIDTH; + if (out_column + 1 < tab_target) + while (out_column < tab_target) + { + putchar ('\t'); + out_column = (out_column / TABWIDTH + 1) * TABWIDTH; + } + } + while (out_column < space_target) + { + putchar (' '); + out_column++; + } +} -- cgit v1.2.3-70-g09d2