summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/regex.c230
1 files changed, 144 insertions, 86 deletions
diff --git a/lib/regex.c b/lib/regex.c
index 3129ed499..a0e4e8950 100644
--- a/lib/regex.c
+++ b/lib/regex.c
@@ -29,11 +29,14 @@
/* We need this for `regex.h', and perhaps for the Emacs include files. */
#include <sys/types.h>
+#if defined (HAVE_CONFIG_H) || defined (emacs)
+#include "config.h"
+#endif
+
/* The `emacs' switch turns on certain matching commands
that make sense only in Emacs. */
#ifdef emacs
-#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
@@ -45,11 +48,17 @@
/* We used to test for `BSTRING' here, but only GCC and Emacs define
`BSTRING', as far as I know, and neither of them use this code. */
-#if USG || STDC_HEADERS
+#if HAVE_STRING_H || STDC_HEADERS
#include <string.h>
+#ifndef bcmp
#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
+#endif
+#ifndef bcopy
#define bcopy(s, d, n) memcpy ((d), (s), (n))
+#endif
+#ifndef bzero
#define bzero(s, n) memset ((s), 0, (n))
+#endif
#else
#include <strings.h>
#endif
@@ -115,16 +124,35 @@ init_syntax_once ()
/* Get the interface, including the syntax bits. */
#include "regex.h"
-
/* isalpha etc. are used for the character classes. */
#include <ctype.h>
-#ifndef isgraph
-#define isgraph(c) (isprint (c) && !isspace (c))
+
+#ifndef isascii
+#define isascii(c) 1
+#endif
+
+#ifdef isblank
+#define ISBLANK(c) (isascii (c) && isblank (c))
+#else
+#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
#endif
-#ifndef isblank
-#define isblank(c) ((c) == ' ' || (c) == '\t')
+#ifdef isgraph
+#define ISGRAPH(c) (isascii (c) && isgraph (c))
+#else
+#define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
#endif
+#define ISPRINT(c) (isascii (c) && isprint (c))
+#define ISDIGIT(c) (isascii (c) && isdigit (c))
+#define ISALNUM(c) (isascii (c) && isalnum (c))
+#define ISALPHA(c) (isascii (c) && isalpha (c))
+#define ISCNTRL(c) (isascii (c) && iscntrl (c))
+#define ISLOWER(c) (isascii (c) && islower (c))
+#define ISPUNCT(c) (isascii (c) && ispunct (c))
+#define ISSPACE(c) (isascii (c) && isspace (c))
+#define ISUPPER(c) (isascii (c) && isupper (c))
+#define ISXDIGIT(c) (isascii (c) && isxdigit (c))
+
#ifndef NULL
#define NULL 0
#endif
@@ -136,7 +164,7 @@ init_syntax_once ()
#undef SIGN_EXTEND_CHAR
#if __STDC__
#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
-#else
+#else /* not __STDC__ */
/* As in Harbison and Steele. */
#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
#endif
@@ -443,6 +471,7 @@ static int debug = 0;
#define DEBUG_PRINT1(x) if (debug) printf (x)
#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
+#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
if (debug) print_partial_compiled_pattern (s, e)
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
@@ -756,6 +785,7 @@ print_double_string (where, string1, size1, string2, size2)
#define DEBUG_PRINT1(x)
#define DEBUG_PRINT2(x1, x2)
#define DEBUG_PRINT3(x1, x2, x3)
+#define DEBUG_PRINT4(x1, x2, x3, x4)
#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
@@ -988,7 +1018,7 @@ typedef struct
{ if (p != pend) \
{ \
PATFETCH (c); \
- while (isdigit (c)) \
+ while (ISDIGIT (c)) \
{ \
if (num < 0) \
num = 0; \
@@ -1021,9 +1051,9 @@ typedef struct
`buffer' is the compiled pattern;
`syntax' is set to SYNTAX;
`used' is set to the length of the compiled pattern;
- `fastmap_accurate' is set to zero;
- `re_nsub' is set to the number of groups in PATTERN;
- `not_bol' and `not_eol' are set to zero.
+ `fastmap_accurate' is zero;
+ `re_nsub' is the number of subexpressions in PATTERN;
+ `not_bol' and `not_eol' are zero;
The `fastmap' and `newline_anchor' fields are neither
examined nor set. */
@@ -1453,18 +1483,18 @@ regex_compile (pattern, size, syntax, bufp)
for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
{
- if ( (is_alnum && isalnum (ch))
- || (is_alpha && isalpha (ch))
- || (is_blank && isblank (ch))
- || (is_cntrl && iscntrl (ch))
- || (is_digit && isdigit (ch))
- || (is_graph && isgraph (ch))
- || (is_lower && islower (ch))
- || (is_print && isprint (ch))
- || (is_punct && ispunct (ch))
- || (is_space && isspace (ch))
- || (is_upper && isupper (ch))
- || (is_xdigit && isxdigit (ch)))
+ if ( (is_alnum && ISALNUM (ch))
+ || (is_alpha && ISALPHA (ch))
+ || (is_blank && ISBLANK (ch))
+ || (is_cntrl && ISCNTRL (ch))
+ || (is_digit && ISDIGIT (ch))
+ || (is_graph && ISGRAPH (ch))
+ || (is_lower && ISLOWER (ch))
+ || (is_print && ISPRINT (ch))
+ || (is_punct && ISPUNCT (ch))
+ || (is_space && ISSPACE (ch))
+ || (is_upper && ISUPPER (ch))
+ || (is_xdigit && ISXDIGIT (ch)))
SET_LIST_BIT (ch);
}
had_char_class = true;
@@ -1672,10 +1702,10 @@ regex_compile (pattern, size, syntax, bufp)
| v | v
a | b | c
- If we are at `b,' then fixup_alt_jump right now points to a
- three-byte space after `a.' We'll put in the jump, set
- fixup_alt_jump to right after `b,' and leave behind three
- bytes which we'll fill in when we get to after `c.' */
+ If we are at `b', then fixup_alt_jump right now points to a
+ three-byte space after `a'. We'll put in the jump, set
+ fixup_alt_jump to right after `b', and leave behind three
+ bytes which we'll fill in when we get to after `c'. */
if (fixup_alt_jump)
STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
@@ -2316,6 +2346,7 @@ typedef struct
int this_reg; \
\
DEBUG_STATEMENT (failure_id++); \
+ DEBUG_STATEMENT (nfailure_points_pushed++); \
DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
@@ -2469,6 +2500,8 @@ typedef struct
regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
} \
+ \
+ DEBUG_STATEMENT (nfailure_points_popped++); \
} /* POP_FAILURE_POINT */
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
@@ -2856,15 +2889,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
else if (endpos > total_size)
range = total_size - startpos;
- /* Update the fastmap now if not correct already. */
- if (fastmap && !bufp->fastmap_accurate)
- if (re_compile_fastmap (bufp) == -2)
- return -2;
-
/* If the search isn't to be a backwards one, don't waste time in a
- long search for a pattern that says it is anchored. */
- if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf
- && range > 0)
+ search for a pattern that must be anchored. */
+ if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
{
if (startpos > 0)
return -1;
@@ -2872,6 +2899,12 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
range = 1;
}
+ /* Update the fastmap now if not correct already. */
+ if (fastmap && !bufp->fastmap_accurate)
+ if (re_compile_fastmap (bufp) == -2)
+ return -2;
+
+ /* Loop through the string, looking for a place to start matching. */
for (;;)
{
/* If a fastmap is supplied, skip quickly over characters that
@@ -2909,7 +2942,7 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
? string2[startpos - size1]
: string1[startpos]);
- if (!fastmap[TRANSLATE (c)])
+ if (!fastmap[(unsigned char) TRANSLATE (c)])
goto advance;
}
}
@@ -2983,12 +3016,9 @@ typedef union
#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
-/* Call this when have matched something; it sets `matched' flags for the
- registers corresponding to the group of which we currently are inside.
- Also records whether this group ever matched something. We only care
- about this information at `stop_memory', and then only about the
- previous time through the loop (if the group is starred or whatever).
- So it is ok to clear all the nonactive registers here. */
+/* Call this when have matched a real character; it sets `matched' flags
+ for the subexpressions which we are currently inside. Also records
+ that those subexprs have matched. */
#define SET_REGS_MATCHED() \
do \
{ \
@@ -3033,24 +3063,24 @@ typedef union
/* Test if at very beginning or at very end of the virtual concatenation
of `string1' and `string2'. If only one string, it's `string2'. */
-#define AT_STRINGS_BEG() (d == (size1 ? string1 : string2) || !size2)
-#define AT_STRINGS_END() (d == end2)
+#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
+#define AT_STRINGS_END(d) ((d) == end2)
/* Test if D points to a character which is word-constituent. We have
two special cases to check for: if past the end of string1, look at
the first character in string2; and if before the beginning of
- string2, look at the last character in string1.
-
- Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG (). */
-#define LETTER_P(d) \
+ string2, look at the last character in string1. */
+#define WORDCHAR_P(d) \
(SYNTAX ((d) == end1 ? *string2 \
- : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == Sword)
+ : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
+ == Sword)
/* Test if the character before D and the one at D differ with respect
to being word-constituent. */
#define AT_WORD_BOUNDARY(d) \
- (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d - 1) != LETTER_P (d))
+ (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
+ || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
/* Free everything we malloc. */
@@ -3157,6 +3187,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
fail_stack_type fail_stack;
#ifdef DEBUG
static unsigned failure_id = 0;
+ unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
#endif
/* We fill all the registers internally, independent of what we
@@ -3250,8 +3281,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
else
{
/* We must initialize all our variables to NULL, so that
- `FREE_VARIABLES' doesn't try to free them. Too bad this isn't
- Lisp, so we could have a list of variables. As it is, */
+ `FREE_VARIABLES' doesn't try to free them. */
regstart = regend = old_regstart = old_regend = best_regstart
= best_regend = reg_dummy = NULL;
reg_info = reg_info_dummy = (register_info_type *) NULL;
@@ -3335,8 +3365,10 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
if (p == pend)
{ /* End of pattern means we might have succeeded. */
- DEBUG_PRINT1 ("End of pattern: ");
- /* If not end of string, try backtracking. Otherwise done. */
+ DEBUG_PRINT1 ("end of pattern ... ");
+
+ /* If we haven't matched the entire string, and we want the
+ longest match, try backtracking. */
if (d != end_match_2)
{
DEBUG_PRINT1 ("backtracking.\n");
@@ -3374,6 +3406,8 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
For example, the pattern `x.*y.*z' against the
strings `x-' and `y-z-', if the two strings are
not consecutive in memory. */
+ DEBUG_PRINT1 ("Restoring best registers.\n");
+
d = match_end;
dend = ((d >= string1 && d <= end1)
? end_match_1 : end_match_2);
@@ -3386,7 +3420,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
}
} /* d != end_match_2 */
- DEBUG_PRINT1 ("\nAccepting match.\n");
+ DEBUG_PRINT1 ("Accepting match.\n");
/* If caller wants register contents data back, do it. */
if (regs && !bufp->no_sub)
@@ -3452,7 +3486,10 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
} /* regs && !bufp->no_sub */
FREE_VARIABLES ();
- DEBUG_PRINT2 ("%d registers pushed.\n", num_regs_pushed);
+ DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
+ nfailure_points_pushed, nfailure_points_popped,
+ nfailure_points_pushed - nfailure_points_popped);
+ DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
mcnt = d - pos - (MATCHING_IN_FIRST_STRING
? string1
@@ -3654,7 +3691,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* If just failed to match something this time around with a
group that's operated on by a repetition operator, try to
- force exit from the ``loop,'' and restore the register
+ force exit from the ``loop'', and restore the register
information for this group that we had before trying this
last match. */
if ((!MATCHED_SOMETHING (reg_info[*p])
@@ -3798,7 +3835,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
case begline:
DEBUG_PRINT1 ("EXECUTING begline.\n");
- if (AT_STRINGS_BEG ())
+ if (AT_STRINGS_BEG (d))
{
if (!bufp->not_bol) break;
}
@@ -3814,7 +3851,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
case endline:
DEBUG_PRINT1 ("EXECUTING endline.\n");
- if (AT_STRINGS_END ())
+ if (AT_STRINGS_END (d))
{
if (!bufp->not_eol) break;
}
@@ -3831,7 +3868,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Match at the very beginning of the data. */
case begbuf:
DEBUG_PRINT1 ("EXECUTING begbuf.\n");
- if (AT_STRINGS_BEG ())
+ if (AT_STRINGS_BEG (d))
break;
goto fail;
@@ -3839,7 +3876,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* Match at the very end of the data. */
case endbuf:
DEBUG_PRINT1 ("EXECUTING endbuf.\n");
- if (AT_STRINGS_END ())
+ if (AT_STRINGS_END (d))
break;
goto fail;
@@ -3893,7 +3930,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
the original * applied to a group), save the information
for that group and all inner ones, so that if we fail back
to this point, the group's information will be correct.
- For example, in \(a*\)*\1, we only need the preceding group,
+ For example, in \(a*\)*\1, we need the preceding group,
and in \(\(a*\)b*\)\2, we need the inner group. */
/* We can't use `p' to check ahead because we push
@@ -3923,8 +3960,8 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
break;
- /* A smart repeat ends with a maybe_pop_jump.
- We change it either to a pop_failure_jump or a jump. */
+ /* A smart repeat ends with `maybe_pop_jump'.
+ We change it to either `pop_failure_jump' or `jump'. */
case maybe_pop_jump:
EXTRACT_NUMBER_AND_INCR (mcnt, p);
DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
@@ -3952,10 +3989,21 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* If we're at the end of the pattern, we can change. */
if (p2 == pend)
- {
+ { /* But if we're also at the end of the string, we might
+ as well skip changing anything. For example, in `a+'
+ against `a', we'll have already matched the `a', and
+ I don't see the the point of changing the opcode,
+ popping the failure point, finding out it fails, and
+ then going into our endgame. */
+ if (d == dend)
+ {
+ p = pend;
+ DEBUG_PRINT1 (" End of pattern & string => done.\n");
+ continue;
+ }
+
p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT1
- (" End of pattern: change to `pop_failure_jump'.\n");
+ DEBUG_PRINT1 (" End of pattern => pop_failure_jump.\n");
}
else if ((re_opcode_t) *p2 == exactn
@@ -3969,7 +4017,12 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
to the `maybe_finalize_jump' of this case. Examine what
follows. */
if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
- p[-3] = (unsigned char) pop_failure_jump;
+ {
+ p[-3] = (unsigned char) pop_failure_jump;
+ DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
+ c, p1[5]);
+ }
+
else if ((re_opcode_t) p1[3] == charset
|| (re_opcode_t) p1[3] == charset_not)
{
@@ -3984,9 +4037,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
if (!not)
{
p[-3] = (unsigned char) pop_failure_jump;
- DEBUG_PRINT1
- (" No match: change to `pop_failure_jump'.\n");
-
+ DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
}
}
}
@@ -3995,6 +4046,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
if ((re_opcode_t) p[-1] != pop_failure_jump)
{
p[-1] = (unsigned char) jump;
+ DEBUG_PRINT1 (" Match => jump.\n");
goto unconditional_jump;
}
/* Note fall through. */
@@ -4056,7 +4108,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* At the end of an alternative, we need to push a dummy failure
- point in case we are followed by a pop_failure_jump', because
+ point in case we are followed by a `pop_failure_jump', because
we don't want the failure point for the alternative to be
popped. For example, matching `(a|ab)*' against `aab'
requires that we match the `ab' alternative. */
@@ -4133,14 +4185,14 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
case wordbeg:
DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
- if (LETTER_P (d) && (AT_STRINGS_BEG () || !LETTER_P (d - 1)))
+ if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
break;
goto fail;
case wordend:
DEBUG_PRINT1 ("EXECUTING wordend.\n");
- if (!AT_STRINGS_BEG () && LETTER_P (d - 1)
- && (!LETTER_P (d) || AT_STRINGS_END ()))
+ if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
+ && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
break;
goto fail;
@@ -4177,11 +4229,12 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
goto matchsyntax;
case wordchar:
- DEBUG_PRINT1 ("EXECUTING wordchar.\n");
+ DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
mcnt = (int) Sword;
matchsyntax:
PREFETCH ();
- if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
+ if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
+ goto fail;
SET_REGS_MATCHED ();
break;
@@ -4191,11 +4244,12 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
goto matchnotsyntax;
case notwordchar:
- DEBUG_PRINT1 ("EXECUTING notwordchar.\n");
+ DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
mcnt = (int) Sword;
- matchnotsyntax: /* We goto here from notsyntaxspec. */
+ matchnotsyntax:
PREFETCH ();
- if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
+ if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
+ goto fail;
SET_REGS_MATCHED ();
break;
@@ -4203,17 +4257,19 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
case wordchar:
DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
PREFETCH ();
- if (!LETTER_P (d))
+ if (!WORDCHAR_P (d))
goto fail;
SET_REGS_MATCHED ();
+ d++;
break;
case notwordchar:
DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
PREFETCH ();
- if (LETTER_P (d))
+ if (WORDCHAR_P (d))
goto fail;
SET_REGS_MATCHED ();
+ d++;
break;
#endif /* not emacs */
@@ -4680,10 +4736,12 @@ regcomp (preg, pattern, cflags)
{
reg_errcode_t ret;
unsigned syntax
- = cflags & REG_EXTENDED ? RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
+ = (cflags & REG_EXTENDED) ?
+ RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
/* regex_compile will allocate the space for the compiled pattern. */
preg->buffer = 0;
+ preg->allocated = 0;
/* Don't bother to use a fastmap when searching. This simplifies the
REG_NEWLINE case: if we used a fastmap, we'd have to put all the
@@ -4701,7 +4759,7 @@ regcomp (preg, pattern, cflags)
/* Map uppercase characters to corresponding lowercase ones. */
for (i = 0; i < CHAR_SET_SIZE; i++)
- preg->translate[i] = isupper (i) ? tolower (i) : i;
+ preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
}
else
preg->translate = NULL;
@@ -4808,7 +4866,7 @@ regexec (preg, string, nmatch, pmatch, eflags)
/* Returns a message corresponding to an error code, ERRCODE, returned
- from either regcomp or regexec. */
+ from either regcomp or regexec. We don't use PREG here. */
size_t
regerror (errcode, preg, errbuf, errbuf_size)