|
Graphviz
2.29.20120524.0446
|
00001 /* #define DEBUG */ 00002 /* LGPLd GNU regex for Native WIN32 */ 00003 00004 /* Part of the ht://Dig package <http://www.htdig.org/> */ 00005 /* Copyright (c) 2003 The ht://Dig Group */ 00006 /* For copyright details, see the file COPYING in your distribution */ 00007 /* or the GNU Library General Public License (LGPL) version 2 or later or later */ 00008 /* <http://www.gnu.org/copyleft/lgpl.html> */ 00009 00010 /* Added June 2003 Neal Richter, RightNow Technologies */ 00011 00012 /* note that this version is significantly different from the original */ 00013 /* version 0.12 GNU source code. It compiles and works on Native WIN32. */ 00014 00015 /* Extended regular expression matching and search library, 00016 version 0.12. 00017 (Implements POSIX draft P1003.2/D11.2, except for some of the 00018 internationalization features.) 00019 00020 Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. 00021 00022 This file is part of the GNU C Library. Its master source is NOT part of 00023 the C library, however. The master source lives in /gd/gnu/lib. 00024 00025 The GNU C Library is free software; you can redistribute it and/or 00026 modify it under the terms of the GNU Library General Public License as 00027 published by the Free Software Foundation; either version 2 of the 00028 License, or (at your option) any later version. 00029 00030 The GNU C Library is distributed in the hope that it will be useful, 00031 but WITHOUT ANY WARRANTY; without even the implied warranty of 00032 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00033 Library General Public License for more details. 00034 00035 You should have received a copy of the GNU Library General Public 00036 License along with the GNU C Library; see the file COPYING.LIB. If not, 00037 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, 00038 Boston, MA 02111-1307, USA. */ 00039 00040 #if defined(_WIN32) 00041 #pragma warning(disable: 4018 4101) 00042 typedef unsigned char boolean; 00043 #endif 00044 00045 /* AIX requires this to be the first thing in the file. */ 00046 #if defined (_AIX) && !defined (REGEX_MALLOC) 00047 #pragma alloca 00048 #endif 00049 00050 #undef _GNU_SOURCE 00051 #define _GNU_SOURCE 00052 00053 #if defined(LINUX) 00054 #define STDC_HEADERS 00055 #endif 00056 00057 #if defined(STDC_HEADERS) && !defined(emacs) 00058 #include <stddef.h> 00059 #else 00060 /* We need this for `regex.h', and perhaps for the Emacs include files. */ 00061 #include <sys/types.h> 00062 #endif 00063 00064 /* For platform which support the ISO C amendement 1 functionality we 00065 support user defined character classes. */ 00066 #if defined _LIBC || (defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H) 00067 # include <wctype.h> 00068 # include <wchar.h> 00069 #endif 00070 00071 /* This is for other GNU distributions with internationalized messages. */ 00072 #undef HAVE_LIBINTL_H 00073 #if HAVE_LIBINTL_H || defined (_LIBC) 00074 # include <libintl.h> 00075 #else 00076 # define gettext(msgid) (msgid) 00077 #endif 00078 00079 #ifndef gettext_noop 00080 /* This define is so xgettext can find the internationalizable 00081 strings. */ 00082 #define gettext_noop(String) String 00083 #endif 00084 00085 /* The `emacs' switch turns on certain matching commands 00086 that make sense only in Emacs. */ 00087 #ifdef emacs 00088 00089 #include "lisp.h" 00090 #include "buffer.h" 00091 #include "syntax.h" 00092 00093 #else /* not emacs */ 00094 00095 /* If we are not linking with Emacs proper, 00096 we can't use the relocating allocator 00097 even if config.h says that we can. */ 00098 #undef REL_ALLOC 00099 00100 #if defined (STDC_HEADERS) || defined (_LIBC) || defined(_WIN32) 00101 #include <stdlib.h> 00102 #else 00103 char *malloc (); 00104 char *realloc (); 00105 void free(); 00106 #endif 00107 00108 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. 00109 If nothing else has been done, use the method below. */ 00110 #ifdef INHIBIT_STRING_HEADER 00111 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY)) 00112 #if !defined (bzero) && !defined (bcopy) 00113 #undef INHIBIT_STRING_HEADER 00114 #endif 00115 #endif 00116 #endif 00117 00118 #include <string.h> 00119 00120 /* This is the normal way of making sure we have a bcopy and a bzero. 00121 This is used in most programs--a few other programs avoid this 00122 by defining INHIBIT_STRING_HEADER. */ 00123 #ifndef INHIBIT_STRING_HEADER 00124 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC) || defined (_WIN32) 00125 #ifndef bcmp 00126 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) 00127 #endif 00128 #ifndef bcopy 00129 #define bcopy(s, d, n) memcpy ((d), (s), (n)) 00130 #endif 00131 #ifndef bzero 00132 #define bzero(s, n) memset ((s), 0, (n)) 00133 #endif 00134 #else 00135 #include <strings.h> 00136 #endif 00137 #endif 00138 00139 /* Define the syntax stuff for <, >, etc. */ 00140 00141 /* This must be nonzero for the wordchar and notwordchar pattern 00142 commands in re_match_2. */ 00143 #ifndef Sword 00144 #define Sword 1 00145 #endif 00146 00147 #ifdef SWITCH_ENUM_BUG 00148 #define SWITCH_ENUM_CAST(x) ((int)(x)) 00149 #else 00150 #define SWITCH_ENUM_CAST(x) (x) 00151 #endif 00152 00153 #ifdef SYNTAX_TABLE 00154 00155 extern char *re_syntax_table; 00156 00157 #else /* not SYNTAX_TABLE */ 00158 00159 /* How many characters in the character set. */ 00160 #define CHAR_SET_SIZE 256 00161 00162 static char re_syntax_table[CHAR_SET_SIZE]; 00163 00164 static void 00165 init_syntax_once () 00166 { 00167 register int c; 00168 static int done = 0; 00169 00170 if (done) 00171 return; 00172 00173 bzero (re_syntax_table, sizeof re_syntax_table); 00174 00175 for (c = 'a'; c <= 'z'; c++) 00176 re_syntax_table[c] = Sword; 00177 00178 for (c = 'A'; c <= 'Z'; c++) 00179 re_syntax_table[c] = Sword; 00180 00181 for (c = '0'; c <= '9'; c++) 00182 re_syntax_table[c] = Sword; 00183 00184 re_syntax_table['_'] = Sword; 00185 00186 done = 1; 00187 } 00188 00189 #endif /* not SYNTAX_TABLE */ 00190 00191 #define SYNTAX(c) re_syntax_table[c] 00192 00193 #endif /* not emacs */ 00194 00195 /* Get the interface, including the syntax bits. */ 00196 /* #include "regex.h" */ 00197 #include "regex_win32.h" 00198 00199 /* isalpha etc. are used for the character classes. */ 00200 #include <ctype.h> 00201 00202 /* Jim Meyering writes: 00203 00204 "... Some ctype macros are valid only for character codes that 00205 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when 00206 using /bin/cc or gcc but without giving an ansi option). So, all 00207 ctype uses should be through macros like ISPRINT... If 00208 STDC_HEADERS is defined, then autoconf has verified that the ctype 00209 macros don't need to be guarded with references to isascii. ... 00210 Defining isascii to 1 should let any compiler worth its salt 00211 eliminate the && through constant folding." */ 00212 00213 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 00214 #define ISASCII(c) 1 00215 #else 00216 #define ISASCII(c) isascii(c) 00217 #endif 00218 00219 #ifdef isblank 00220 #define ISBLANK(c) (ISASCII (c) && isblank (c)) 00221 #else 00222 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') 00223 #endif 00224 #ifdef isgraph 00225 #define ISGRAPH(c) (ISASCII (c) && isgraph (c)) 00226 #else 00227 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) 00228 #endif 00229 00230 #define ISPRINT(c) (ISASCII (c) && isprint (c)) 00231 #define ISDIGIT(c) (ISASCII (c) && isdigit (c)) 00232 #define ISALNUM(c) (ISASCII (c) && isalnum (c)) 00233 #define ISALPHA(c) (ISASCII (c) && isalpha (c)) 00234 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) 00235 #define ISLOWER(c) (ISASCII (c) && islower (c)) 00236 #define ISPUNCT(c) (ISASCII (c) && ispunct (c)) 00237 #define ISSPACE(c) (ISASCII (c) && isspace (c)) 00238 #define ISUPPER(c) (ISASCII (c) && isupper (c)) 00239 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) 00240 00241 #ifndef NULL 00242 #define NULL (void *)0 00243 #endif 00244 00245 /* We remove any previous definition of `SIGN_EXTEND_CHAR', 00246 since ours (we hope) works properly with all combinations of 00247 machines, compilers, `char' and `unsigned char' argument types. 00248 (Per Bothner suggested the basic approach.) */ 00249 #undef SIGN_EXTEND_CHAR 00250 #if __STDC__ 00251 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 00252 #else /* not __STDC__ */ 00253 /* As in Harbison and Steele. */ 00254 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 00255 #endif 00256 00257 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 00258 use `alloca' instead of `malloc'. This is because using malloc in 00259 re_search* or re_match* could cause memory leaks when C-g is used in 00260 Emacs; also, malloc is slower and causes storage fragmentation. On 00261 the other hand, malloc is more portable, and easier to debug. 00262 00263 Because we sometimes use alloca, some routines have to be macros, 00264 not functions -- `alloca'-allocated space disappears at the end of the 00265 function it is called in. */ 00266 00267 #if defined(REGEX_MALLOC) || defined(_WIN32) 00268 00269 #define REGEX_ALLOCATE malloc 00270 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 00271 #define REGEX_FREE free 00272 #define REGEX_MALLOC 00273 00274 #else /* not REGEX_MALLOC */ 00275 00276 /* Emacs already defines alloca, sometimes. */ 00277 #ifndef alloca 00278 00279 /* Make alloca work the best possible way. */ 00280 #ifdef __GNUC__ 00281 #define alloca __builtin_alloca 00282 #else /* not __GNUC__ */ 00283 #if HAVE_ALLOCA_H 00284 #include <alloca.h> 00285 #else /* not __GNUC__ or HAVE_ALLOCA_H */ 00286 #if 0 /* It is a bad idea to declare alloca. We always cast the result. */ 00287 #ifndef _AIX /* Already did AIX, up at the top. */ 00288 char *alloca (); 00289 #endif /* not _AIX */ 00290 #endif 00291 #endif /* not HAVE_ALLOCA_H */ 00292 #endif /* not __GNUC__ */ 00293 00294 #endif /* not alloca */ 00295 00296 #define REGEX_ALLOCATE alloca 00297 00298 /* Assumes a `char *destination' variable. */ 00299 #define REGEX_REALLOCATE(source, osize, nsize) \ 00300 (destination = (char *) alloca (nsize), \ 00301 bcopy (source, destination, osize), \ 00302 destination) 00303 00304 /* No need to do anything to free, after alloca. */ 00305 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ 00306 00307 #endif /* not REGEX_MALLOC */ 00308 00309 /* Define how to allocate the failure stack. */ 00310 00311 #if defined (REL_ALLOC) && defined (REGEX_MALLOC) 00312 00313 #define REGEX_ALLOCATE_STACK(size) \ 00314 r_alloc (&failure_stack_ptr, (size)) 00315 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 00316 r_re_alloc (&failure_stack_ptr, (nsize)) 00317 #define REGEX_FREE_STACK(ptr) \ 00318 r_alloc_free (&failure_stack_ptr) 00319 00320 #else /* not using relocating allocator */ 00321 00322 #ifdef REGEX_MALLOC 00323 00324 #define REGEX_ALLOCATE_STACK malloc 00325 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) 00326 #define REGEX_FREE_STACK free 00327 00328 #else /* not REGEX_MALLOC */ 00329 00330 #define REGEX_ALLOCATE_STACK alloca 00331 00332 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 00333 REGEX_REALLOCATE (source, osize, nsize) 00334 /* No need to explicitly free anything. */ 00335 #define REGEX_FREE_STACK(arg) 00336 00337 #endif /* not REGEX_MALLOC */ 00338 #endif /* not using relocating allocator */ 00339 00340 00341 /* True if `size1' is non-NULL and PTR is pointing anywhere inside 00342 `string1' or just past its end. This works if PTR is NULL, which is 00343 a good thing. */ 00344 #define FIRST_STRING_P(ptr) \ 00345 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 00346 00347 /* (Re)Allocate N items of type T using malloc, or fail. */ 00348 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 00349 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 00350 #define RETALLOC_IF(addr, n, t) \ 00351 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) 00352 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 00353 00354 #define BYTEWIDTH 8 /* In bits. */ 00355 00356 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 00357 00358 #undef MAX 00359 #undef MIN 00360 #define MAX(a, b) ((a) > (b) ? (a) : (b)) 00361 #define MIN(a, b) ((a) < (b) ? (a) : (b)) 00362 00363 /* typedef char boolean; */ 00364 #define false 0 00365 #define true 1 00366 00367 static int 00368 re_match_2_internal(struct re_pattern_buffer *bufp, 00369 const char *string1, 00370 int size1, 00371 const char *string2, 00372 int size2, 00373 int pos, 00374 struct re_registers *regs, 00375 int stop); 00376 00377 /* These are the command codes that appear in compiled regular 00378 expressions. Some opcodes are followed by argument bytes. A 00379 command code can specify any interpretation whatsoever for its 00380 arguments. Zero bytes may appear in the compiled regular expression. */ 00381 00382 typedef enum 00383 { 00384 no_op = 0, 00385 00386 /* Succeed right away--no more backtracking. */ 00387 succeed, 00388 00389 /* Followed by one byte giving n, then by n literal bytes. */ 00390 exactn, 00391 00392 /* Matches any (more or less) character. */ 00393 anychar, 00394 00395 /* Matches any one char belonging to specified set. First 00396 following byte is number of bitmap bytes. Then come bytes 00397 for a bitmap saying which chars are in. Bits in each byte 00398 are ordered low-bit-first. A character is in the set if its 00399 bit is 1. A character too large to have a bit in the map is 00400 automatically not in the set. */ 00401 charset, 00402 00403 /* Same parameters as charset, but match any character that is 00404 not one of those specified. */ 00405 charset_not, 00406 00407 /* Start remembering the text that is matched, for storing in a 00408 register. Followed by one byte with the register number, in 00409 the range 0 to one less than the pattern buffer's re_nsub 00410 field. Then followed by one byte with the number of groups 00411 inner to this one. (This last has to be part of the 00412 start_memory only because we need it in the on_failure_jump 00413 of re_match_2.) */ 00414 start_memory, 00415 00416 /* Stop remembering the text that is matched and store it in a 00417 memory register. Followed by one byte with the register 00418 number, in the range 0 to one less than `re_nsub' in the 00419 pattern buffer, and one byte with the number of inner groups, 00420 just like `start_memory'. (We need the number of inner 00421 groups here because we don't have any easy way of finding the 00422 corresponding start_memory when we're at a stop_memory.) */ 00423 stop_memory, 00424 00425 /* Match a duplicate of something remembered. Followed by one 00426 byte containing the register number. */ 00427 duplicate, 00428 00429 /* Fail unless at beginning of line. */ 00430 begline, 00431 00432 /* Fail unless at end of line. */ 00433 endline, 00434 00435 /* Succeeds if at beginning of buffer (if emacs) or at beginning 00436 of string to be matched (if not). */ 00437 begbuf, 00438 00439 /* Analogously, for end of buffer/string. */ 00440 endbuf, 00441 00442 /* Followed by two byte relative address to which to jump. */ 00443 jump, 00444 00445 /* Same as jump, but marks the end of an alternative. */ 00446 jump_past_alt, 00447 00448 /* Followed by two-byte relative address of place to resume at 00449 in case of failure. */ 00450 on_failure_jump, 00451 00452 /* Like on_failure_jump, but pushes a placeholder instead of the 00453 current string position when executed. */ 00454 on_failure_keep_string_jump, 00455 00456 /* Throw away latest failure point and then jump to following 00457 two-byte relative address. */ 00458 pop_failure_jump, 00459 00460 /* Change to pop_failure_jump if know won't have to backtrack to 00461 match; otherwise change to jump. This is used to jump 00462 back to the beginning of a repeat. If what follows this jump 00463 clearly won't match what the repeat does, such that we can be 00464 sure that there is no use backtracking out of repetitions 00465 already matched, then we change it to a pop_failure_jump. 00466 Followed by two-byte address. */ 00467 maybe_pop_jump, 00468 00469 /* Jump to following two-byte address, and push a dummy failure 00470 point. This failure point will be thrown away if an attempt 00471 is made to use it for a failure. A `+' construct makes this 00472 before the first repeat. Also used as an intermediary kind 00473 of jump when compiling an alternative. */ 00474 dummy_failure_jump, 00475 00476 /* Push a dummy failure point and continue. Used at the end of 00477 alternatives. */ 00478 push_dummy_failure, 00479 00480 /* Followed by two-byte relative address and two-byte number n. 00481 After matching N times, jump to the address upon failure. */ 00482 succeed_n, 00483 00484 /* Followed by two-byte relative address, and two-byte number n. 00485 Jump to the address N times, then fail. */ 00486 jump_n, 00487 00488 /* Set the following two-byte relative address to the 00489 subsequent two-byte number. The address *includes* the two 00490 bytes of number. */ 00491 set_number_at, 00492 00493 wordchar, /* Matches any word-constituent character. */ 00494 notwordchar, /* Matches any char that is not a word-constituent. */ 00495 00496 wordbeg, /* Succeeds if at word beginning. */ 00497 wordend, /* Succeeds if at word end. */ 00498 00499 wordbound, /* Succeeds if at a word boundary. */ 00500 notwordbound /* Succeeds if not at a word boundary. */ 00501 00502 #ifdef emacs 00503 ,before_dot, /* Succeeds if before point. */ 00504 at_dot, /* Succeeds if at point. */ 00505 after_dot, /* Succeeds if after point. */ 00506 00507 /* Matches any character whose syntax is specified. Followed by 00508 a byte which contains a syntax code, e.g., Sword. */ 00509 syntaxspec, 00510 00511 /* Matches any character whose syntax is not that specified. */ 00512 notsyntaxspec 00513 #endif /* emacs */ 00514 } re_opcode_t; 00515 00516 /* Common operations on the compiled pattern. */ 00517 00518 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 00519 00520 #define STORE_NUMBER(destination, number) \ 00521 do { \ 00522 (destination)[0] = (number) & 0377; \ 00523 (destination)[1] = (number) >> 8; \ 00524 } while (0) 00525 00526 /* Same as STORE_NUMBER, except increment DESTINATION to 00527 the byte after where the number is stored. Therefore, DESTINATION 00528 must be an lvalue. */ 00529 00530 #define STORE_NUMBER_AND_INCR(destination, number) \ 00531 do { \ 00532 STORE_NUMBER (destination, number); \ 00533 (destination) += 2; \ 00534 } while (0) 00535 00536 /* Put into DESTINATION a number stored in two contiguous bytes starting 00537 at SOURCE. */ 00538 00539 #define EXTRACT_NUMBER(destination, source) \ 00540 do { \ 00541 (destination) = *(source) & 0377; \ 00542 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 00543 } while (0) 00544 00545 #ifdef DEBUG 00546 static void extract_number _RE_ARGS ((int *dest, unsigned char *source)); 00547 static void 00548 extract_number (dest, source) 00549 int *dest; 00550 unsigned char *source; 00551 { 00552 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 00553 *dest = *source & 0377; 00554 *dest += temp << 8; 00555 } 00556 00557 #ifndef EXTRACT_MACROS /* To debug the macros. */ 00558 #undef EXTRACT_NUMBER 00559 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) 00560 #endif /* not EXTRACT_MACROS */ 00561 00562 #endif /* DEBUG */ 00563 00564 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 00565 SOURCE must be an lvalue. */ 00566 00567 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ 00568 do { \ 00569 EXTRACT_NUMBER (destination, source); \ 00570 (source) += 2; \ 00571 } while (0) 00572 00573 #ifdef DEBUG 00574 static void extract_number_and_incr _RE_ARGS ((int *destination, 00575 unsigned char **source)); 00576 static void 00577 extract_number_and_incr (destination, source) 00578 int *destination; 00579 unsigned char **source; 00580 { 00581 extract_number (destination, *source); 00582 *source += 2; 00583 } 00584 00585 #ifndef EXTRACT_MACROS 00586 #undef EXTRACT_NUMBER_AND_INCR 00587 #define EXTRACT_NUMBER_AND_INCR(dest, src) \ 00588 extract_number_and_incr (&dest, &src) 00589 #endif /* not EXTRACT_MACROS */ 00590 00591 #endif /* DEBUG */ 00592 00593 /* If DEBUG is defined, Regex prints many voluminous messages about what 00594 it is doing (if the variable `debug' is nonzero). If linked with the 00595 main program in `iregex.c', you can enter patterns and strings 00596 interactively. And if linked with the main program in `main.c' and 00597 the other test files, you can run the already-written tests. */ 00598 00599 #ifdef DEBUG 00600 00601 /* We use standard I/O for debugging. */ 00602 #include <stdio.h> 00603 00604 /* It is useful to test things that ``must'' be true when debugging. */ 00605 #include <assert.h> 00606 00607 static int debug = 0; 00608 00609 #define DEBUG_STATEMENT(e) e 00610 #define DEBUG_PRINT1(x) if (debug) printf (x) 00611 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 00612 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 00613 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 00614 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 00615 if (debug) print_partial_compiled_pattern (s, e) 00616 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 00617 if (debug) print_double_string (w, s1, sz1, s2, sz2) 00618 00619 00620 /* Print the fastmap in human-readable form. */ 00621 00622 void 00623 print_fastmap (fastmap) 00624 char *fastmap; 00625 { 00626 unsigned was_a_range = 0; 00627 unsigned i = 0; 00628 00629 while (i < (1 << BYTEWIDTH)) 00630 { 00631 if (fastmap[i++]) 00632 { 00633 was_a_range = 0; 00634 putchar (i - 1); 00635 while (i < (1 << BYTEWIDTH) && fastmap[i]) 00636 { 00637 was_a_range = 1; 00638 i++; 00639 } 00640 if (was_a_range) 00641 { 00642 printf ("-"); 00643 putchar (i - 1); 00644 } 00645 } 00646 } 00647 putchar ('\n'); 00648 } 00649 00650 00651 /* Print a compiled pattern string in human-readable form, starting at 00652 the START pointer into it and ending just before the pointer END. */ 00653 00654 void 00655 print_partial_compiled_pattern (start, end) 00656 unsigned char *start; 00657 unsigned char *end; 00658 { 00659 int mcnt, mcnt2; 00660 unsigned char *p1; 00661 unsigned char *p = start; 00662 unsigned char *pend = end; 00663 00664 if (start == NULL) 00665 { 00666 printf ("(null)\n"); 00667 return; 00668 } 00669 00670 /* Loop over pattern commands. */ 00671 while (p < pend) 00672 { 00673 printf ("%d:\t", p - start); 00674 00675 switch ((re_opcode_t) *p++) 00676 { 00677 case no_op: 00678 printf ("/no_op"); 00679 break; 00680 00681 case exactn: 00682 mcnt = *p++; 00683 printf ("/exactn/%d", mcnt); 00684 do 00685 { 00686 putchar ('/'); 00687 putchar (*p++); 00688 } 00689 while (--mcnt); 00690 break; 00691 00692 case start_memory: 00693 mcnt = *p++; 00694 printf ("/start_memory/%d/%d", mcnt, *p++); 00695 break; 00696 00697 case stop_memory: 00698 mcnt = *p++; 00699 printf ("/stop_memory/%d/%d", mcnt, *p++); 00700 break; 00701 00702 case duplicate: 00703 printf ("/duplicate/%d", *p++); 00704 break; 00705 00706 case anychar: 00707 printf ("/anychar"); 00708 break; 00709 00710 case charset: 00711 case charset_not: 00712 { 00713 register int c, last = -100; 00714 register int in_range = 0; 00715 00716 printf ("/charset [%s", 00717 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); 00718 00719 assert (p + *p < pend); 00720 00721 for (c = 0; c < 256; c++) 00722 if (c / 8 < *p 00723 && (p[1 + (c/8)] & (1 << (c % 8)))) 00724 { 00725 /* Are we starting a range? */ 00726 if (last + 1 == c && ! in_range) 00727 { 00728 putchar ('-'); 00729 in_range = 1; 00730 } 00731 /* Have we broken a range? */ 00732 else if (last + 1 != c && in_range) 00733 { 00734 putchar (last); 00735 in_range = 0; 00736 } 00737 00738 if (! in_range) 00739 putchar (c); 00740 00741 last = c; 00742 } 00743 00744 if (in_range) 00745 putchar (last); 00746 00747 putchar (']'); 00748 00749 p += 1 + *p; 00750 } 00751 break; 00752 00753 case begline: 00754 printf ("/begline"); 00755 break; 00756 00757 case endline: 00758 printf ("/endline"); 00759 break; 00760 00761 case on_failure_jump: 00762 extract_number_and_incr (&mcnt, &p); 00763 printf ("/on_failure_jump to %d", p + mcnt - start); 00764 break; 00765 00766 case on_failure_keep_string_jump: 00767 extract_number_and_incr (&mcnt, &p); 00768 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); 00769 break; 00770 00771 case dummy_failure_jump: 00772 extract_number_and_incr (&mcnt, &p); 00773 printf ("/dummy_failure_jump to %d", p + mcnt - start); 00774 break; 00775 00776 case push_dummy_failure: 00777 printf ("/push_dummy_failure"); 00778 break; 00779 00780 case maybe_pop_jump: 00781 extract_number_and_incr (&mcnt, &p); 00782 printf ("/maybe_pop_jump to %d", p + mcnt - start); 00783 break; 00784 00785 case pop_failure_jump: 00786 extract_number_and_incr (&mcnt, &p); 00787 printf ("/pop_failure_jump to %d", p + mcnt - start); 00788 break; 00789 00790 case jump_past_alt: 00791 extract_number_and_incr (&mcnt, &p); 00792 printf ("/jump_past_alt to %d", p + mcnt - start); 00793 break; 00794 00795 case jump: 00796 extract_number_and_incr (&mcnt, &p); 00797 printf ("/jump to %d", p + mcnt - start); 00798 break; 00799 00800 case succeed_n: 00801 extract_number_and_incr (&mcnt, &p); 00802 p1 = p + mcnt; 00803 extract_number_and_incr (&mcnt2, &p); 00804 printf ("/succeed_n to %d, %d times", p1 - start, mcnt2); 00805 break; 00806 00807 case jump_n: 00808 extract_number_and_incr (&mcnt, &p); 00809 p1 = p + mcnt; 00810 extract_number_and_incr (&mcnt2, &p); 00811 printf ("/jump_n to %d, %d times", p1 - start, mcnt2); 00812 break; 00813 00814 case set_number_at: 00815 extract_number_and_incr (&mcnt, &p); 00816 p1 = p + mcnt; 00817 extract_number_and_incr (&mcnt2, &p); 00818 printf ("/set_number_at location %d to %d", p1 - start, mcnt2); 00819 break; 00820 00821 case wordbound: 00822 printf ("/wordbound"); 00823 break; 00824 00825 case notwordbound: 00826 printf ("/notwordbound"); 00827 break; 00828 00829 case wordbeg: 00830 printf ("/wordbeg"); 00831 break; 00832 00833 case wordend: 00834 printf ("/wordend"); 00835 00836 #ifdef emacs 00837 case before_dot: 00838 printf ("/before_dot"); 00839 break; 00840 00841 case at_dot: 00842 printf ("/at_dot"); 00843 break; 00844 00845 case after_dot: 00846 printf ("/after_dot"); 00847 break; 00848 00849 case syntaxspec: 00850 printf ("/syntaxspec"); 00851 mcnt = *p++; 00852 printf ("/%d", mcnt); 00853 break; 00854 00855 case notsyntaxspec: 00856 printf ("/notsyntaxspec"); 00857 mcnt = *p++; 00858 printf ("/%d", mcnt); 00859 break; 00860 #endif /* emacs */ 00861 00862 case wordchar: 00863 printf ("/wordchar"); 00864 break; 00865 00866 case notwordchar: 00867 printf ("/notwordchar"); 00868 break; 00869 00870 case begbuf: 00871 printf ("/begbuf"); 00872 break; 00873 00874 case endbuf: 00875 printf ("/endbuf"); 00876 break; 00877 00878 default: 00879 printf ("?%d", *(p-1)); 00880 } 00881 00882 putchar ('\n'); 00883 } 00884 00885 printf ("%d:\tend of pattern.\n", p - start); 00886 } 00887 00888 00889 void 00890 print_compiled_pattern (bufp) 00891 struct re_pattern_buffer *bufp; 00892 { 00893 unsigned char *buffer = bufp->buffer; 00894 00895 print_partial_compiled_pattern (buffer, buffer + bufp->used); 00896 printf ("%ld bytes used/%ld bytes allocated.\n", 00897 bufp->used, bufp->allocated); 00898 00899 if (bufp->fastmap_accurate && bufp->fastmap) 00900 { 00901 printf ("fastmap: "); 00902 print_fastmap (bufp->fastmap); 00903 } 00904 00905 printf ("re_nsub: %d\t", bufp->re_nsub); 00906 printf ("regs_alloc: %d\t", bufp->regs_allocated); 00907 printf ("can_be_null: %d\t", bufp->can_be_null); 00908 printf ("newline_anchor: %d\n", bufp->newline_anchor); 00909 printf ("no_sub: %d\t", bufp->no_sub); 00910 printf ("not_bol: %d\t", bufp->not_bol); 00911 printf ("not_eol: %d\t", bufp->not_eol); 00912 printf ("syntax: %lx\n", bufp->syntax); 00913 /* Perhaps we should print the translate table? */ 00914 } 00915 00916 00917 void 00918 print_double_string (where, string1, size1, string2, size2) 00919 const char *where; 00920 const char *string1; 00921 const char *string2; 00922 int size1; 00923 int size2; 00924 { 00925 int this_char; 00926 00927 if (where == NULL) 00928 printf ("(null)"); 00929 else 00930 { 00931 if (FIRST_STRING_P (where)) 00932 { 00933 for (this_char = where - string1; this_char < size1; this_char++) 00934 putchar (string1[this_char]); 00935 00936 where = string2; 00937 } 00938 00939 for (this_char = where - string2; this_char < size2; this_char++) 00940 putchar (string2[this_char]); 00941 } 00942 } 00943 00944 void 00945 printchar (c) 00946 int c; 00947 { 00948 putc (c, stderr); 00949 } 00950 00951 #else /* not DEBUG */ 00952 00953 #undef assert 00954 #define assert(e) 00955 00956 #define DEBUG_STATEMENT(e) 00957 #define DEBUG_PRINT1(x) 00958 #define DEBUG_PRINT2(x1, x2) 00959 #define DEBUG_PRINT3(x1, x2, x3) 00960 #define DEBUG_PRINT4(x1, x2, x3, x4) 00961 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 00962 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 00963 00964 #endif /* not DEBUG */ 00965 00966 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 00967 also be assigned to arbitrarily: each pattern buffer stores its own 00968 syntax, so it can be changed between regex compilations. */ 00969 /* This has no initializer because initialized variables in Emacs 00970 become read-only after dumping. */ 00971 reg_syntax_t re_syntax_options; 00972 00973 00974 /* Specify the precise syntax of regexps for compilation. This provides 00975 for compatibility for various utilities which historically have 00976 different, incompatible syntaxes. 00977 00978 The argument SYNTAX is a bit mask comprised of the various bits 00979 defined in regex.h. We return the old syntax. */ 00980 00981 reg_syntax_t 00982 re_set_syntax(reg_syntax_t syntax) 00983 { 00984 reg_syntax_t ret = re_syntax_options; 00985 00986 re_syntax_options = syntax; 00987 #ifdef DEBUG 00988 if (syntax & RE_DEBUG) 00989 debug = 1; 00990 else if (debug) /* was on but now is not */ 00991 debug = 0; 00992 #endif /* DEBUG */ 00993 return ret; 00994 } 00995 00996 /* This table gives an error message for each of the error codes listed 00997 in regex.h. Obviously the order here has to be same as there. 00998 POSIX doesn't require that we do anything for REG_NOERROR, 00999 but why not be nice? */ 01000 01001 static const char *re_error_msgid[] = 01002 { 01003 gettext_noop ("Success"), /* REG_NOERROR */ 01004 gettext_noop ("No match"), /* REG_NOMATCH */ 01005 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */ 01006 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */ 01007 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */ 01008 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */ 01009 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */ 01010 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */ 01011 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */ 01012 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */ 01013 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */ 01014 gettext_noop ("Invalid range end"), /* REG_ERANGE */ 01015 gettext_noop ("Memory exhausted"), /* REG_ESPACE */ 01016 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */ 01017 gettext_noop ("Premature end of regular expression"), /* REG_EEND */ 01018 gettext_noop ("Regular expression too big"), /* REG_ESIZE */ 01019 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */ 01020 }; 01021 01022 /* Avoiding alloca during matching, to placate r_alloc. */ 01023 01024 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the 01025 searching and matching functions should not call alloca. On some 01026 systems, alloca is implemented in terms of malloc, and if we're 01027 using the relocating allocator routines, then malloc could cause a 01028 relocation, which might (if the strings being searched are in the 01029 ralloc heap) shift the data out from underneath the regexp 01030 routines. 01031 01032 Here's another reason to avoid allocation: Emacs 01033 processes input from X in a signal handler; processing X input may 01034 call malloc; if input arrives while a matching routine is calling 01035 malloc, then we're scrod. But Emacs can't just block input while 01036 calling matching routines; then we don't notice interrupts when 01037 they come in. So, Emacs blocks input around all regexp calls 01038 except the matching calls, which it leaves unprotected, in the 01039 faith that they will not malloc. */ 01040 01041 /* Normally, this is fine. */ 01042 #define MATCH_MAY_ALLOCATE 01043 01044 /* When using GNU C, we are not REALLY using the C alloca, no matter 01045 what config.h may say. So don't take precautions for it. */ 01046 #ifdef __GNUC__ 01047 #undef C_ALLOCA 01048 #endif 01049 01050 /* The match routines may not allocate if (1) they would do it with malloc 01051 and (2) it's not safe for them to use malloc. 01052 Note that if REL_ALLOC is defined, matching would not use malloc for the 01053 failure stack, but we would still use it for the register vectors; 01054 so REL_ALLOC should not affect this. */ 01055 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs) 01056 #undef MATCH_MAY_ALLOCATE 01057 #endif 01058 01059 01060 /* Failure stack declarations and macros; both re_compile_fastmap and 01061 re_match_2 use a failure stack. These have to be macros because of 01062 REGEX_ALLOCATE_STACK. */ 01063 01064 01065 /* Number of failure points for which to initially allocate space 01066 when matching. If this number is exceeded, we allocate more 01067 space, so it is not a hard limit. */ 01068 #ifndef INIT_FAILURE_ALLOC 01069 #define INIT_FAILURE_ALLOC 5 01070 #endif 01071 01072 /* Roughly the maximum number of failure points on the stack. Would be 01073 exactly that if always used MAX_FAILURE_ITEMS items each time we failed. 01074 This is a variable only so users of regex can assign to it; we never 01075 change it ourselves. */ 01076 01077 #ifdef INT_IS_16BIT 01078 01079 #if defined (MATCH_MAY_ALLOCATE) 01080 /* 4400 was enough to cause a crash on Alpha OSF/1, 01081 whose default stack limit is 2mb. */ 01082 static long int re_max_failures = 4000; 01083 #else 01084 static long int re_max_failures = 2000; 01085 #endif 01086 01087 union fail_stack_elt 01088 { 01089 unsigned char *pointer; 01090 long int integer; 01091 }; 01092 01093 typedef union fail_stack_elt fail_stack_elt_t; 01094 01095 typedef struct 01096 { 01097 fail_stack_elt_t *stack; 01098 unsigned long int size; 01099 unsigned long int avail; /* Offset of next open position. */ 01100 } fail_stack_type; 01101 01102 #else /* not INT_IS_16BIT */ 01103 01104 #if defined (MATCH_MAY_ALLOCATE) 01105 /* 4400 was enough to cause a crash on Alpha OSF/1, 01106 whose default stack limit is 2mb. */ 01107 static int re_max_failures = 20000; 01108 #else 01109 static int re_max_failures = 2000; 01110 #endif 01111 01112 union fail_stack_elt 01113 { 01114 unsigned char *pointer; 01115 int integer; 01116 }; 01117 01118 typedef union fail_stack_elt fail_stack_elt_t; 01119 01120 typedef struct 01121 { 01122 fail_stack_elt_t *stack; 01123 unsigned size; 01124 unsigned avail; /* Offset of next open position. */ 01125 } fail_stack_type; 01126 01127 #endif /* INT_IS_16BIT */ 01128 01129 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 01130 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 01131 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 01132 01133 01134 /* Define macros to initialize and free the failure stack. 01135 Do `return -2' if the alloc fails. */ 01136 01137 #ifdef MATCH_MAY_ALLOCATE 01138 #define INIT_FAIL_STACK() \ 01139 do { \ 01140 fail_stack.stack = (fail_stack_elt_t *) \ 01141 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ 01142 \ 01143 if (fail_stack.stack == NULL) \ 01144 return -2; \ 01145 \ 01146 fail_stack.size = INIT_FAILURE_ALLOC; \ 01147 fail_stack.avail = 0; \ 01148 } while (0) 01149 01150 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) 01151 #else 01152 #define INIT_FAIL_STACK() \ 01153 do { \ 01154 fail_stack.avail = 0; \ 01155 } while (0) 01156 01157 #define RESET_FAIL_STACK() 01158 #endif 01159 01160 01161 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 01162 01163 Return 1 if succeeds, and 0 if either ran out of memory 01164 allocating space for it or it was already too large. 01165 01166 REGEX_REALLOCATE_STACK requires `destination' be declared. */ 01167 01168 #define DOUBLE_FAIL_STACK(fail_stack) \ 01169 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \ 01170 ? 0 \ 01171 : ((fail_stack).stack = (fail_stack_elt_t *) \ 01172 REGEX_REALLOCATE_STACK ((fail_stack).stack, \ 01173 (fail_stack).size * sizeof (fail_stack_elt_t), \ 01174 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ 01175 \ 01176 (fail_stack).stack == NULL \ 01177 ? 0 \ 01178 : ((fail_stack).size <<= 1, \ 01179 1))) 01180 01181 01182 /* Push pointer POINTER on FAIL_STACK. 01183 Return 1 if was able to do so and 0 if ran out of memory allocating 01184 space to do so. */ 01185 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ 01186 ((FAIL_STACK_FULL () \ 01187 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ 01188 ? 0 \ 01189 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ 01190 1)) 01191 01192 /* Push a pointer value onto the failure stack. 01193 Assumes the variable `fail_stack'. Probably should only 01194 be called from within `PUSH_FAILURE_POINT'. */ 01195 #define PUSH_FAILURE_POINTER(item) \ 01196 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) 01197 01198 /* This pushes an integer-valued item onto the failure stack. 01199 Assumes the variable `fail_stack'. Probably should only 01200 be called from within `PUSH_FAILURE_POINT'. */ 01201 #define PUSH_FAILURE_INT(item) \ 01202 fail_stack.stack[fail_stack.avail++].integer = (item) 01203 01204 /* Push a fail_stack_elt_t value onto the failure stack. 01205 Assumes the variable `fail_stack'. Probably should only 01206 be called from within `PUSH_FAILURE_POINT'. */ 01207 #define PUSH_FAILURE_ELT(item) \ 01208 fail_stack.stack[fail_stack.avail++] = (item) 01209 01210 /* These three POP... operations complement the three PUSH... operations. 01211 All assume that `fail_stack' is nonempty. */ 01212 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer 01213 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer 01214 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] 01215 01216 /* Used to omit pushing failure point id's when we're not debugging. */ 01217 #ifdef DEBUG 01218 #define DEBUG_PUSH PUSH_FAILURE_INT 01219 #define DEBUG_POP(item_addr) (item_addr)->integer = POP_FAILURE_INT () 01220 #else 01221 #define DEBUG_PUSH(item) 01222 #define DEBUG_POP(item_addr) 01223 #endif 01224 01225 01226 /* Push the information about the state we will need 01227 if we ever fail back to it. 01228 01229 Requires variables fail_stack, regstart, regend, reg_info, and 01230 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be 01231 declared. 01232 01233 Does `return FAILURE_CODE' if runs out of memory. */ 01234 01235 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 01236 do { \ 01237 char *destination; \ 01238 /* Must be int, so when we don't save any registers, the arithmetic \ 01239 of 0 + -1 isn't done as unsigned. */ \ 01240 /* Can't be int, since there is not a shred of a guarantee that int \ 01241 is wide enough to hold a value of something to which pointer can \ 01242 be assigned */ \ 01243 s_reg_t this_reg; \ 01244 \ 01245 DEBUG_STATEMENT (failure_id++); \ 01246 DEBUG_STATEMENT (nfailure_points_pushed++); \ 01247 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 01248 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 01249 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 01250 \ 01251 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ 01252 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 01253 \ 01254 /* Ensure we have enough space allocated for what we will push. */ \ 01255 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 01256 { \ 01257 if (!DOUBLE_FAIL_STACK (fail_stack)) \ 01258 return failure_code; \ 01259 \ 01260 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 01261 (fail_stack).size); \ 01262 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 01263 } \ 01264 \ 01265 /* Push the info, starting with the registers. */ \ 01266 DEBUG_PRINT1 ("\n"); \ 01267 \ 01268 if (1) \ 01269 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 01270 this_reg++) \ 01271 { \ 01272 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ 01273 DEBUG_STATEMENT (num_regs_pushed++); \ 01274 \ 01275 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 01276 PUSH_FAILURE_POINTER (regstart[this_reg]); \ 01277 \ 01278 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 01279 PUSH_FAILURE_POINTER (regend[this_reg]); \ 01280 \ 01281 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ 01282 DEBUG_PRINT2 (" match_null=%d", \ 01283 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 01284 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 01285 DEBUG_PRINT2 (" matched_something=%d", \ 01286 MATCHED_SOMETHING (reg_info[this_reg])); \ 01287 DEBUG_PRINT2 (" ever_matched=%d", \ 01288 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 01289 DEBUG_PRINT1 ("\n"); \ 01290 PUSH_FAILURE_ELT (reg_info[this_reg].word); \ 01291 } \ 01292 \ 01293 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ 01294 PUSH_FAILURE_INT (lowest_active_reg); \ 01295 \ 01296 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ 01297 PUSH_FAILURE_INT (highest_active_reg); \ 01298 \ 01299 DEBUG_PRINT2 (" Pushing pattern 0x%x:\n", pattern_place); \ 01300 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 01301 PUSH_FAILURE_POINTER (pattern_place); \ 01302 \ 01303 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ 01304 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 01305 size2); \ 01306 DEBUG_PRINT1 ("'\n"); \ 01307 PUSH_FAILURE_POINTER (string_place); \ 01308 \ 01309 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 01310 DEBUG_PUSH (failure_id); \ 01311 } while (0) 01312 01313 /* This is the number of items that are pushed and popped on the stack 01314 for each register. */ 01315 #define NUM_REG_ITEMS 3 01316 01317 /* Individual items aside from the registers. */ 01318 #ifdef DEBUG 01319 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 01320 #else 01321 #define NUM_NONREG_ITEMS 4 01322 #endif 01323 01324 /* We push at most this many items on the stack. */ 01325 /* We used to use (num_regs - 1), which is the number of registers 01326 this regexp will save; but that was changed to 5 01327 to avoid stack overflow for a regexp with lots of parens. */ 01328 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) 01329 01330 /* We actually push this many items. */ 01331 #define NUM_FAILURE_ITEMS \ 01332 (((0 \ 01333 ? 0 : highest_active_reg - lowest_active_reg + 1) \ 01334 * NUM_REG_ITEMS) \ 01335 + NUM_NONREG_ITEMS) 01336 01337 /* How many items can still be added to the stack without overflowing it. */ 01338 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 01339 01340 01341 /* Pops what PUSH_FAIL_STACK pushes. 01342 01343 We restore into the parameters, all of which should be lvalues: 01344 STR -- the saved data position. 01345 PAT -- the saved pattern position. 01346 LOW_REG, HIGH_REG -- the highest and lowest active registers. 01347 REGSTART, REGEND -- arrays of string positions. 01348 REG_INFO -- array of information about each subexpression. 01349 01350 Also assumes the variables `fail_stack' and (if debugging), `bufp', 01351 `pend', `string1', `size1', `string2', and `size2'. */ 01352 01353 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 01354 { \ 01355 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ 01356 s_reg_t this_reg; \ 01357 const unsigned char *string_temp; \ 01358 \ 01359 assert (!FAIL_STACK_EMPTY ()); \ 01360 \ 01361 /* Remove failure points and point to how many regs pushed. */ \ 01362 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 01363 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 01364 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 01365 \ 01366 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 01367 \ 01368 DEBUG_POP (&failure_id); \ 01369 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 01370 \ 01371 /* If the saved string location is NULL, it came from an \ 01372 on_failure_keep_string_jump opcode, and we want to throw away the \ 01373 saved NULL, thus retaining our current position in the string. */ \ 01374 string_temp = POP_FAILURE_POINTER (); \ 01375 if (string_temp != NULL) \ 01376 str = (const char *) string_temp; \ 01377 \ 01378 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ 01379 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 01380 DEBUG_PRINT1 ("'\n"); \ 01381 \ 01382 pat = (unsigned char *) POP_FAILURE_POINTER (); \ 01383 DEBUG_PRINT2 (" Popping pattern 0x%x:\n", pat); \ 01384 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 01385 \ 01386 /* Restore register info. */ \ 01387 high_reg = (active_reg_t) POP_FAILURE_INT (); \ 01388 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ 01389 \ 01390 low_reg = (active_reg_t) POP_FAILURE_INT (); \ 01391 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ 01392 \ 01393 if (1) \ 01394 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 01395 { \ 01396 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ 01397 \ 01398 reg_info[this_reg].word = POP_FAILURE_ELT (); \ 01399 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ 01400 \ 01401 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 01402 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 01403 \ 01404 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 01405 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 01406 } \ 01407 else \ 01408 { \ 01409 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ 01410 { \ 01411 reg_info[this_reg].word.integer = 0; \ 01412 regend[this_reg] = 0; \ 01413 regstart[this_reg] = 0; \ 01414 } \ 01415 highest_active_reg = high_reg; \ 01416 } \ 01417 \ 01418 set_regs_matched_done = 0; \ 01419 DEBUG_STATEMENT (nfailure_points_popped++); \ 01420 } /* POP_FAILURE_POINT */ 01421 01422 01423 /* Structure for per-register (a.k.a. per-group) information. 01424 Other register information, such as the 01425 starting and ending positions (which are addresses), and the list of 01426 inner groups (which is a bits list) are maintained in separate 01427 variables. 01428 01429 We are making a (strictly speaking) nonportable assumption here: that 01430 the compiler will pack our bit fields into something that fits into 01431 the type of `word', i.e., is something that fits into one item on the 01432 failure stack. */ 01433 01434 01435 /* Declarations and macros for re_match_2. */ 01436 01437 typedef union 01438 { 01439 fail_stack_elt_t word; 01440 struct 01441 { 01442 /* This field is one if this group can match the empty string, 01443 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ 01444 #define MATCH_NULL_UNSET_VALUE 3 01445 unsigned match_null_string_p : 2; 01446 unsigned is_active : 1; 01447 unsigned matched_something : 1; 01448 unsigned ever_matched_something : 1; 01449 } bits; 01450 } register_info_type; 01451 01452 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 01453 #define IS_ACTIVE(R) ((R).bits.is_active) 01454 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) 01455 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 01456 01457 01458 /* Call this when have matched a real character; it sets `matched' flags 01459 for the subexpressions which we are currently inside. Also records 01460 that those subexprs have matched. */ 01461 #define SET_REGS_MATCHED() \ 01462 do \ 01463 { \ 01464 if (!set_regs_matched_done) \ 01465 { \ 01466 active_reg_t r; \ 01467 set_regs_matched_done = 1; \ 01468 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 01469 { \ 01470 MATCHED_SOMETHING (reg_info[r]) \ 01471 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 01472 = 1; \ 01473 } \ 01474 } \ 01475 } \ 01476 while (0) 01477 01478 /* Registers are set to a sentinel when they haven't yet matched. */ 01479 static char reg_unset_dummy; 01480 #define REG_UNSET_VALUE (®_unset_dummy) 01481 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 01482 01483 /* Subroutine declarations and macros for regex_compile. */ 01484 01485 static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size, 01486 reg_syntax_t syntax, 01487 struct re_pattern_buffer *bufp)); 01488 static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg)); 01489 static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, 01490 int arg1, int arg2)); 01491 static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, 01492 int arg, unsigned char *end)); 01493 static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, 01494 int arg1, int arg2, unsigned char *end)); 01495 static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p, 01496 reg_syntax_t syntax)); 01497 static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend, 01498 reg_syntax_t syntax)); 01499 static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, 01500 const char *pend, 01501 char *translate, 01502 reg_syntax_t syntax, 01503 unsigned char *b)); 01504 01505 /* Fetch the next character in the uncompiled pattern---translating it 01506 if necessary. Also cast from a signed character in the constant 01507 string passed to us by the user to an unsigned char that we can use 01508 as an array index (in, e.g., `translate'). */ 01509 #ifndef PATFETCH 01510 #define PATFETCH(c) \ 01511 do {if (p == pend) return REG_EEND; \ 01512 c = (unsigned char) *p++; \ 01513 if (translate) c = (unsigned char) translate[c]; \ 01514 } while (0) 01515 #endif 01516 01517 /* Fetch the next character in the uncompiled pattern, with no 01518 translation. */ 01519 #define PATFETCH_RAW(c) \ 01520 do {if (p == pend) return REG_EEND; \ 01521 c = (unsigned char) *p++; \ 01522 } while (0) 01523 01524 /* Go backwards one character in the pattern. */ 01525 #define PATUNFETCH p-- 01526 01527 01528 /* If `translate' is non-null, return translate[D], else just D. We 01529 cast the subscript to translate because some data is declared as 01530 `char *', to avoid warnings when a string constant is passed. But 01531 when we use a character as a subscript we must make it unsigned. */ 01532 #ifndef TRANSLATE 01533 #define TRANSLATE(d) \ 01534 (translate ? (char) translate[(unsigned char) (d)] : (d)) 01535 #endif 01536 01537 01538 /* Macros for outputting the compiled pattern into `buffer'. */ 01539 01540 /* If the buffer isn't allocated when it comes in, use this. */ 01541 #define INIT_BUF_SIZE 32 01542 01543 /* Make sure we have at least N more bytes of space in buffer. */ 01544 #define GET_BUFFER_SPACE(n) \ 01545 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ 01546 EXTEND_BUFFER () 01547 01548 /* Make sure we have one more byte of buffer space and then add C to it. */ 01549 #define BUF_PUSH(c) \ 01550 do { \ 01551 GET_BUFFER_SPACE (1); \ 01552 *b++ = (unsigned char) (c); \ 01553 } while (0) 01554 01555 01556 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 01557 #define BUF_PUSH_2(c1, c2) \ 01558 do { \ 01559 GET_BUFFER_SPACE (2); \ 01560 *b++ = (unsigned char) (c1); \ 01561 *b++ = (unsigned char) (c2); \ 01562 } while (0) 01563 01564 01565 /* As with BUF_PUSH_2, except for three bytes. */ 01566 #define BUF_PUSH_3(c1, c2, c3) \ 01567 do { \ 01568 GET_BUFFER_SPACE (3); \ 01569 *b++ = (unsigned char) (c1); \ 01570 *b++ = (unsigned char) (c2); \ 01571 *b++ = (unsigned char) (c3); \ 01572 } while (0) 01573 01574 01575 /* Store a jump with opcode OP at LOC to location TO. We store a 01576 relative address offset by the three bytes the jump itself occupies. */ 01577 #define STORE_JUMP(op, loc, to) \ 01578 store_op1 (op, loc, (int) ((to) - (loc) - 3)) 01579 01580 /* Likewise, for a two-argument jump. */ 01581 #define STORE_JUMP2(op, loc, to, arg) \ 01582 store_op2 (op, loc, (int) ((to) - (loc) - 3), arg) 01583 01584 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 01585 #define INSERT_JUMP(op, loc, to) \ 01586 insert_op1 (op, loc, (int) ((to) - (loc) - 3), b) 01587 01588 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ 01589 #define INSERT_JUMP2(op, loc, to, arg) \ 01590 insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b) 01591 01592 01593 /* This is not an arbitrary limit: the arguments which represent offsets 01594 into the pattern are two bytes long. So if 2^16 bytes turns out to 01595 be too small, many things would have to change. */ 01596 /* Any other compiler which, like MSC, has allocation limit below 2^16 01597 bytes will have to use approach similar to what was done below for 01598 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up 01599 reallocating to 0 bytes. Such thing is not going to work too well. 01600 You have been warned!! */ 01601 #if defined(_MSC_VER) && !defined(_WIN32) 01602 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. 01603 The REALLOC define eliminates a flurry of conversion warnings, 01604 but is not required. */ 01605 #define MAX_BUF_SIZE 65500L 01606 #define REALLOC(p,s) realloc ((p), (size_t) (s)) 01607 #else 01608 #define MAX_BUF_SIZE (1L << 16) 01609 #define REALLOC(p,s) realloc ((p), (s)) 01610 #endif 01611 01612 /* Extend the buffer by twice its current size via realloc and 01613 reset the pointers that pointed into the old block to point to the 01614 correct places in the new one. If extending the buffer results in it 01615 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ 01616 #define EXTEND_BUFFER() \ 01617 do { \ 01618 unsigned char *old_buffer = bufp->buffer; \ 01619 if (bufp->allocated == MAX_BUF_SIZE) \ 01620 return REG_ESIZE; \ 01621 bufp->allocated <<= 1; \ 01622 if (bufp->allocated > MAX_BUF_SIZE) \ 01623 bufp->allocated = MAX_BUF_SIZE; \ 01624 bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\ 01625 if (bufp->buffer == NULL) \ 01626 return REG_ESPACE; \ 01627 /* If the buffer moved, move all the pointers into it. */ \ 01628 if (old_buffer != bufp->buffer) \ 01629 { \ 01630 b = (b - old_buffer) + bufp->buffer; \ 01631 begalt = (begalt - old_buffer) + bufp->buffer; \ 01632 if (fixup_alt_jump) \ 01633 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ 01634 if (laststart) \ 01635 laststart = (laststart - old_buffer) + bufp->buffer; \ 01636 if (pending_exact) \ 01637 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ 01638 } \ 01639 } while (0) 01640 01641 01642 /* Since we have one byte reserved for the register number argument to 01643 {start,stop}_memory, the maximum number of groups we can report 01644 things about is what fits in that byte. */ 01645 #define MAX_REGNUM 255 01646 01647 /* But patterns can have more than `MAX_REGNUM' registers. We just 01648 ignore the excess. */ 01649 typedef unsigned regnum_t; 01650 01651 01652 /* Macros for the compile stack. */ 01653 01654 /* Since offsets can go either forwards or backwards, this type needs to 01655 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 01656 /* int may be not enough when sizeof(int) == 2. */ 01657 typedef long pattern_offset_t; 01658 01659 typedef struct 01660 { 01661 pattern_offset_t begalt_offset; 01662 pattern_offset_t fixup_alt_jump; 01663 pattern_offset_t inner_group_offset; 01664 pattern_offset_t laststart_offset; 01665 regnum_t regnum; 01666 } compile_stack_elt_t; 01667 01668 01669 typedef struct 01670 { 01671 compile_stack_elt_t *stack; 01672 unsigned size; 01673 unsigned avail; /* Offset of next open position. */ 01674 } compile_stack_type; 01675 01676 01677 #define INIT_COMPILE_STACK_SIZE 32 01678 01679 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 01680 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 01681 01682 /* The next available element. */ 01683 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 01684 01685 01686 /* Set the bit for character C in a list. */ 01687 #define SET_LIST_BIT(c) \ 01688 (b[((unsigned char) (c)) / BYTEWIDTH] \ 01689 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 01690 01691 01692 /* Get the next unsigned number in the uncompiled pattern. */ 01693 #define GET_UNSIGNED_NUMBER(num) \ 01694 { if (p != pend) \ 01695 { \ 01696 PATFETCH (c); \ 01697 while (ISDIGIT (c)) \ 01698 { \ 01699 if (num < 0) \ 01700 num = 0; \ 01701 num = num * 10 + c - '0'; \ 01702 if (p == pend) \ 01703 break; \ 01704 PATFETCH (c); \ 01705 } \ 01706 } \ 01707 } 01708 01709 #if defined _LIBC || (defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H) 01710 /* The GNU C library provides support for user-defined character classes 01711 and the functions from ISO C amendement 1. */ 01712 # ifdef CHARCLASS_NAME_MAX 01713 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX 01714 # else 01715 /* This shouldn't happen but some implementation might still have this 01716 problem. Use a reasonable default value. */ 01717 # define CHAR_CLASS_MAX_LENGTH 256 01718 # endif 01719 01720 # define IS_CHAR_CLASS(string) wctype (string) 01721 #else 01722 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 01723 01724 # define IS_CHAR_CLASS(string) \ 01725 (STREQ (string, "alpha") || STREQ (string, "upper") \ 01726 || STREQ (string, "lower") || STREQ (string, "digit") \ 01727 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 01728 || STREQ (string, "space") || STREQ (string, "print") \ 01729 || STREQ (string, "punct") || STREQ (string, "graph") \ 01730 || STREQ (string, "cntrl") || STREQ (string, "blank")) 01731 #endif 01732 01733 #ifndef MATCH_MAY_ALLOCATE 01734 01735 /* If we cannot allocate large objects within re_match_2_internal, 01736 we make the fail stack and register vectors global. 01737 The fail stack, we grow to the maximum size when a regexp 01738 is compiled. 01739 The register vectors, we adjust in size each time we 01740 compile a regexp, according to the number of registers it needs. */ 01741 01742 static fail_stack_type fail_stack; 01743 01744 /* Size with which the following vectors are currently allocated. 01745 That is so we can make them bigger as needed, 01746 but never make them smaller. */ 01747 static int regs_allocated_size; 01748 01749 static const char ** regstart, ** regend; 01750 static const char ** old_regstart, ** old_regend; 01751 static const char **best_regstart, **best_regend; 01752 static register_info_type *reg_info; 01753 static const char **reg_dummy; 01754 static register_info_type *reg_info_dummy; 01755 01756 /* Make the register vectors big enough for NUM_REGS registers, 01757 but don't make them smaller. */ 01758 01759 static 01760 regex_grow_registers (num_regs) 01761 int num_regs; 01762 { 01763 if (num_regs > regs_allocated_size) 01764 { 01765 RETALLOC_IF (regstart, num_regs, const char *); 01766 RETALLOC_IF (regend, num_regs, const char *); 01767 RETALLOC_IF (old_regstart, num_regs, const char *); 01768 RETALLOC_IF (old_regend, num_regs, const char *); 01769 RETALLOC_IF (best_regstart, num_regs, const char *); 01770 RETALLOC_IF (best_regend, num_regs, const char *); 01771 RETALLOC_IF (reg_info, num_regs, register_info_type); 01772 RETALLOC_IF (reg_dummy, num_regs, const char *); 01773 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type); 01774 01775 regs_allocated_size = num_regs; 01776 } 01777 } 01778 01779 #endif /* not MATCH_MAY_ALLOCATE */ 01780 01781 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type 01782 compile_stack, 01783 regnum_t regnum)); 01784 01785 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 01786 Returns one of error codes defined in `regex.h', or zero for success. 01787 01788 Assumes the `allocated' (and perhaps `buffer') and `translate' 01789 fields are set in BUFP on entry. 01790 01791 If it succeeds, results are put in BUFP (if it returns an error, the 01792 contents of BUFP are undefined): 01793 `buffer' is the compiled pattern; 01794 `syntax' is set to SYNTAX; 01795 `used' is set to the length of the compiled pattern; 01796 `fastmap_accurate' is zero; 01797 `re_nsub' is the number of subexpressions in PATTERN; 01798 `not_bol' and `not_eol' are zero; 01799 01800 The `fastmap' and `newline_anchor' fields are neither 01801 examined nor set. */ 01802 01803 /* Return, freeing storage we allocated. */ 01804 #define FREE_STACK_RETURN(value) \ 01805 return (free (compile_stack.stack), value) 01806 01807 static reg_errcode_t 01808 regex_compile (const char *pattern, 01809 size_t size, 01810 reg_syntax_t syntax, 01811 struct re_pattern_buffer *bufp) 01812 { 01813 /* We fetch characters from PATTERN here. Even though PATTERN is 01814 `char *' (i.e., signed), we declare these variables as unsigned, so 01815 they can be reliably used as array indices. */ 01816 register unsigned char c, c1; 01817 01818 /* A random temporary spot in PATTERN. */ 01819 const char *p1; 01820 01821 /* Points to the end of the buffer, where we should append. */ 01822 register unsigned char *b; 01823 01824 /* Keeps track of unclosed groups. */ 01825 compile_stack_type compile_stack; 01826 01827 /* Points to the current (ending) position in the pattern. */ 01828 const char *p = pattern; 01829 const char *pend = pattern + size; 01830 01831 /* How to translate the characters in the pattern. */ 01832 RE_TRANSLATE_TYPE translate = bufp->translate; 01833 01834 /* Address of the count-byte of the most recently inserted `exactn' 01835 command. This makes it possible to tell if a new exact-match 01836 character can be added to that command or if the character requires 01837 a new `exactn' command. */ 01838 unsigned char *pending_exact = 0; 01839 01840 /* Address of start of the most recently finished expression. 01841 This tells, e.g., postfix * where to find the start of its 01842 operand. Reset at the beginning of groups and alternatives. */ 01843 unsigned char *laststart = 0; 01844 01845 /* Address of beginning of regexp, or inside of last group. */ 01846 unsigned char *begalt; 01847 01848 /* Place in the uncompiled pattern (i.e., the {) to 01849 which to go back if the interval is invalid. */ 01850 const char *beg_interval; 01851 01852 /* Address of the place where a forward jump should go to the end of 01853 the containing expression. Each alternative of an `or' -- except the 01854 last -- ends with a forward jump of this sort. */ 01855 unsigned char *fixup_alt_jump = 0; 01856 01857 /* Counts open-groups as they are encountered. Remembered for the 01858 matching close-group on the compile stack, so the same register 01859 number is put in the stop_memory as the start_memory. */ 01860 regnum_t regnum = 0; 01861 01862 #ifdef DEBUG 01863 DEBUG_PRINT1 ("\nCompiling pattern: "); 01864 if (debug) 01865 { 01866 unsigned debug_count; 01867 01868 for (debug_count = 0; debug_count < size; debug_count++) 01869 putchar (pattern[debug_count]); 01870 putchar ('\n'); 01871 } 01872 #endif /* DEBUG */ 01873 01874 /* Initialize the compile stack. */ 01875 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 01876 if (compile_stack.stack == NULL) 01877 return REG_ESPACE; 01878 01879 compile_stack.size = INIT_COMPILE_STACK_SIZE; 01880 compile_stack.avail = 0; 01881 01882 /* Initialize the pattern buffer. */ 01883 bufp->syntax = syntax; 01884 bufp->fastmap_accurate = 0; 01885 bufp->not_bol = bufp->not_eol = 0; 01886 01887 /* Set `used' to zero, so that if we return an error, the pattern 01888 printer (for debugging) will think there's no pattern. We reset it 01889 at the end. */ 01890 bufp->used = 0; 01891 01892 /* Always count groups, whether or not bufp->no_sub is set. */ 01893 bufp->re_nsub = 0; 01894 01895 #if !defined (emacs) && !defined (SYNTAX_TABLE) 01896 /* Initialize the syntax table. */ 01897 init_syntax_once (); 01898 #endif 01899 01900 if (bufp->allocated == 0) 01901 { 01902 if (bufp->buffer) 01903 { /* If zero allocated, but buffer is non-null, try to realloc 01904 enough space. This loses if buffer's address is bogus, but 01905 that is the user's responsibility. */ 01906 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); 01907 } 01908 else 01909 { /* Caller did not allocate a buffer. Do it for them. */ 01910 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); 01911 } 01912 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); 01913 01914 bufp->allocated = INIT_BUF_SIZE; 01915 } 01916 01917 begalt = b = bufp->buffer; 01918 01919 /* Loop through the uncompiled pattern until we're at the end. */ 01920 while (p != pend) 01921 { 01922 PATFETCH (c); 01923 01924 switch (c) 01925 { 01926 case '^': 01927 { 01928 if ( /* If at start of pattern, it's an operator. */ 01929 p == pattern + 1 01930 /* If context independent, it's an operator. */ 01931 || syntax & RE_CONTEXT_INDEP_ANCHORS 01932 /* Otherwise, depends on what's come before. */ 01933 || at_begline_loc_p (pattern, p, syntax)) 01934 BUF_PUSH (begline); 01935 else 01936 goto normal_char; 01937 } 01938 break; 01939 01940 01941 case '$': 01942 { 01943 if ( /* If at end of pattern, it's an operator. */ 01944 p == pend 01945 /* If context independent, it's an operator. */ 01946 || syntax & RE_CONTEXT_INDEP_ANCHORS 01947 /* Otherwise, depends on what's next. */ 01948 || at_endline_loc_p (p, pend, syntax)) 01949 BUF_PUSH (endline); 01950 else 01951 goto normal_char; 01952 } 01953 break; 01954 01955 01956 case '+': 01957 case '?': 01958 if ((syntax & RE_BK_PLUS_QM) 01959 || (syntax & RE_LIMITED_OPS)) 01960 goto normal_char; 01961 handle_plus: 01962 case '*': 01963 /* If there is no previous pattern... */ 01964 if (!laststart) 01965 { 01966 if (syntax & RE_CONTEXT_INVALID_OPS) 01967 FREE_STACK_RETURN (REG_BADRPT); 01968 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 01969 goto normal_char; 01970 } 01971 01972 { 01973 /* Are we optimizing this jump? */ 01974 boolean keep_string_p = false; 01975 01976 /* 1 means zero (many) matches is allowed. */ 01977 char zero_times_ok = 0, many_times_ok = 0; 01978 01979 /* If there is a sequence of repetition chars, collapse it 01980 down to just one (the right one). We can't combine 01981 interval operators with these because of, e.g., `a{2}*', 01982 which should only match an even number of `a's. */ 01983 01984 for (;;) 01985 { 01986 zero_times_ok |= c != '+'; 01987 many_times_ok |= c != '?'; 01988 01989 if (p == pend) 01990 break; 01991 01992 PATFETCH (c); 01993 01994 if (c == '*' 01995 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 01996 ; 01997 01998 else if (syntax & RE_BK_PLUS_QM && c == '\\') 01999 { 02000 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 02001 02002 PATFETCH (c1); 02003 if (!(c1 == '+' || c1 == '?')) 02004 { 02005 PATUNFETCH; 02006 PATUNFETCH; 02007 break; 02008 } 02009 02010 c = c1; 02011 } 02012 else 02013 { 02014 PATUNFETCH; 02015 break; 02016 } 02017 02018 /* If we get here, we found another repeat character. */ 02019 } 02020 02021 /* Star, etc. applied to an empty pattern is equivalent 02022 to an empty pattern. */ 02023 if (!laststart) 02024 break; 02025 02026 /* Now we know whether or not zero matches is allowed 02027 and also whether or not two or more matches is allowed. */ 02028 if (many_times_ok) 02029 { /* More than one repetition is allowed, so put in at the 02030 end a backward relative jump from `b' to before the next 02031 jump we're going to put in below (which jumps from 02032 laststart to after this jump). 02033 02034 But if we are at the `*' in the exact sequence `.*\n', 02035 insert an unconditional jump backwards to the ., 02036 instead of the beginning of the loop. This way we only 02037 push a failure point once, instead of every time 02038 through the loop. */ 02039 assert (p - 1 > pattern); 02040 02041 /* Allocate the space for the jump. */ 02042 GET_BUFFER_SPACE (3); 02043 02044 /* We know we are not at the first character of the pattern, 02045 because laststart was nonzero. And we've already 02046 incremented `p', by the way, to be the character after 02047 the `*'. Do we have to do something analogous here 02048 for null bytes, because of RE_DOT_NOT_NULL? */ 02049 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 02050 && zero_times_ok 02051 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 02052 && !(syntax & RE_DOT_NEWLINE)) 02053 { /* We have .*\n. */ 02054 STORE_JUMP (jump, b, laststart); 02055 keep_string_p = true; 02056 } 02057 else 02058 /* Anything else. */ 02059 STORE_JUMP (maybe_pop_jump, b, laststart - 3); 02060 02061 /* We've added more stuff to the buffer. */ 02062 b += 3; 02063 } 02064 02065 /* On failure, jump from laststart to b + 3, which will be the 02066 end of the buffer after this jump is inserted. */ 02067 GET_BUFFER_SPACE (3); 02068 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump 02069 : on_failure_jump, 02070 laststart, b + 3); 02071 pending_exact = 0; 02072 b += 3; 02073 02074 if (!zero_times_ok) 02075 { 02076 /* At least one repetition is required, so insert a 02077 `dummy_failure_jump' before the initial 02078 `on_failure_jump' instruction of the loop. This 02079 effects a skip over that instruction the first time 02080 we hit that loop. */ 02081 GET_BUFFER_SPACE (3); 02082 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); 02083 b += 3; 02084 } 02085 } 02086 break; 02087 02088 02089 case '.': 02090 laststart = b; 02091 BUF_PUSH (anychar); 02092 break; 02093 02094 02095 case '[': 02096 { 02097 boolean had_char_class = false; 02098 02099 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 02100 02101 /* Ensure that we have enough space to push a charset: the 02102 opcode, the length count, and the bitset; 34 bytes in all. */ 02103 GET_BUFFER_SPACE (34); 02104 02105 laststart = b; 02106 02107 /* We test `*p == '^' twice, instead of using an if 02108 statement, so we only need one BUF_PUSH. */ 02109 BUF_PUSH (*p == '^' ? charset_not : charset); 02110 if (*p == '^') 02111 p++; 02112 02113 /* Remember the first position in the bracket expression. */ 02114 p1 = p; 02115 02116 /* Push the number of bytes in the bitmap. */ 02117 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 02118 02119 /* Clear the whole map. */ 02120 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 02121 02122 /* charset_not matches newline according to a syntax bit. */ 02123 if ((re_opcode_t) b[-2] == charset_not 02124 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 02125 SET_LIST_BIT ('\n'); 02126 02127 /* Read in characters and ranges, setting map bits. */ 02128 for (;;) 02129 { 02130 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 02131 02132 PATFETCH (c); 02133 02134 /* \ might escape characters inside [...] and [^...]. */ 02135 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 02136 { 02137 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 02138 02139 PATFETCH (c1); 02140 SET_LIST_BIT (c1); 02141 continue; 02142 } 02143 02144 /* Could be the end of the bracket expression. If it's 02145 not (i.e., when the bracket expression is `[]' so 02146 far), the ']' character bit gets set way below. */ 02147 if (c == ']' && p != p1 + 1) 02148 break; 02149 02150 /* Look ahead to see if it's a range when the last thing 02151 was a character class. */ 02152 if (had_char_class && c == '-' && *p != ']') 02153 FREE_STACK_RETURN (REG_ERANGE); 02154 02155 /* Look ahead to see if it's a range when the last thing 02156 was a character: if this is a hyphen not at the 02157 beginning or the end of a list, then it's the range 02158 operator. */ 02159 if (c == '-' 02160 && !(p - 2 >= pattern && p[-2] == '[') 02161 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 02162 && *p != ']') 02163 { 02164 reg_errcode_t ret 02165 = compile_range (&p, pend, translate, syntax, b); 02166 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 02167 } 02168 02169 else if (p[0] == '-' && p[1] != ']') 02170 { /* This handles ranges made up of characters only. */ 02171 reg_errcode_t ret; 02172 02173 /* Move past the `-'. */ 02174 PATFETCH (c1); 02175 02176 ret = compile_range (&p, pend, translate, syntax, b); 02177 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 02178 } 02179 02180 /* See if we're at the beginning of a possible character 02181 class. */ 02182 02183 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 02184 { /* Leave room for the null. */ 02185 char str[CHAR_CLASS_MAX_LENGTH + 1]; 02186 02187 PATFETCH (c); 02188 c1 = 0; 02189 02190 /* If pattern is `[[:'. */ 02191 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 02192 02193 for (;;) 02194 { 02195 PATFETCH (c); 02196 if (c == ':' || c == ']' || p == pend 02197 || c1 == CHAR_CLASS_MAX_LENGTH) 02198 break; 02199 str[c1++] = c; 02200 } 02201 str[c1] = '\0'; 02202 02203 /* If isn't a word bracketed by `[:' and:`]': 02204 undo the ending character, the letters, and leave 02205 the leading `:' and `[' (but set bits for them). */ 02206 if (c == ':' && *p == ']') 02207 { 02208 #if defined _LIBC || (defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H) 02209 boolean is_lower = STREQ (str, "lower"); 02210 boolean is_upper = STREQ (str, "upper"); 02211 wctype_t wt; 02212 int ch; 02213 02214 wt = wctype (str); 02215 if (wt == 0) 02216 FREE_STACK_RETURN (REG_ECTYPE); 02217 02218 /* Throw away the ] at the end of the character 02219 class. */ 02220 PATFETCH (c); 02221 02222 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 02223 02224 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) 02225 { 02226 if (iswctype (btowc (ch), wt)) 02227 SET_LIST_BIT (ch); 02228 02229 if (translate && (is_upper || is_lower) 02230 && (ISUPPER (ch) || ISLOWER (ch))) 02231 SET_LIST_BIT (ch); 02232 } 02233 02234 had_char_class = true; 02235 #else 02236 int ch; 02237 boolean is_alnum = STREQ (str, "alnum"); 02238 boolean is_alpha = STREQ (str, "alpha"); 02239 boolean is_blank = STREQ (str, "blank"); 02240 boolean is_cntrl = STREQ (str, "cntrl"); 02241 boolean is_digit = STREQ (str, "digit"); 02242 boolean is_graph = STREQ (str, "graph"); 02243 boolean is_lower = STREQ (str, "lower"); 02244 boolean is_print = STREQ (str, "print"); 02245 boolean is_punct = STREQ (str, "punct"); 02246 boolean is_space = STREQ (str, "space"); 02247 boolean is_upper = STREQ (str, "upper"); 02248 boolean is_xdigit = STREQ (str, "xdigit"); 02249 02250 if (!IS_CHAR_CLASS (str)) 02251 FREE_STACK_RETURN (REG_ECTYPE); 02252 02253 /* Throw away the ] at the end of the character 02254 class. */ 02255 PATFETCH (c); 02256 02257 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 02258 02259 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 02260 { 02261 /* This was split into 3 if's to 02262 avoid an arbitrary limit in some compiler. */ 02263 if ( (is_alnum && ISALNUM (ch)) 02264 || (is_alpha && ISALPHA (ch)) 02265 || (is_blank && ISBLANK (ch)) 02266 || (is_cntrl && ISCNTRL (ch))) 02267 SET_LIST_BIT (ch); 02268 if ( (is_digit && ISDIGIT (ch)) 02269 || (is_graph && ISGRAPH (ch)) 02270 || (is_lower && ISLOWER (ch)) 02271 || (is_print && ISPRINT (ch))) 02272 SET_LIST_BIT (ch); 02273 if ( (is_punct && ISPUNCT (ch)) 02274 || (is_space && ISSPACE (ch)) 02275 || (is_upper && ISUPPER (ch)) 02276 || (is_xdigit && ISXDIGIT (ch))) 02277 SET_LIST_BIT (ch); 02278 if ( translate && (is_upper || is_lower) 02279 && (ISUPPER (ch) || ISLOWER (ch))) 02280 SET_LIST_BIT (ch); 02281 } 02282 had_char_class = true; 02283 #endif /* libc || wctype.h */ 02284 } 02285 else 02286 { 02287 c1++; 02288 while (c1--) 02289 PATUNFETCH; 02290 SET_LIST_BIT ('['); 02291 SET_LIST_BIT (':'); 02292 had_char_class = false; 02293 } 02294 } 02295 else 02296 { 02297 had_char_class = false; 02298 SET_LIST_BIT (c); 02299 } 02300 } 02301 02302 /* Discard any (non)matching list bytes that are all 0 at the 02303 end of the map. Decrease the map-length byte too. */ 02304 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 02305 b[-1]--; 02306 b += b[-1]; 02307 } 02308 break; 02309 02310 02311 case '(': 02312 if (syntax & RE_NO_BK_PARENS) 02313 goto handle_open; 02314 else 02315 goto normal_char; 02316 02317 02318 case ')': 02319 if (syntax & RE_NO_BK_PARENS) 02320 goto handle_close; 02321 else 02322 goto normal_char; 02323 02324 02325 case '\n': 02326 if (syntax & RE_NEWLINE_ALT) 02327 goto handle_alt; 02328 else 02329 goto normal_char; 02330 02331 02332 case '|': 02333 if (syntax & RE_NO_BK_VBAR) 02334 goto handle_alt; 02335 else 02336 goto normal_char; 02337 02338 02339 case '{': 02340 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 02341 goto handle_interval; 02342 else 02343 goto normal_char; 02344 02345 02346 case '\\': 02347 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 02348 02349 /* Do not translate the character after the \, so that we can 02350 distinguish, e.g., \B from \b, even if we normally would 02351 translate, e.g., B to b. */ 02352 PATFETCH_RAW (c); 02353 02354 switch (c) 02355 { 02356 case '(': 02357 if (syntax & RE_NO_BK_PARENS) 02358 goto normal_backslash; 02359 02360 handle_open: 02361 bufp->re_nsub++; 02362 regnum++; 02363 02364 if (COMPILE_STACK_FULL) 02365 { 02366 RETALLOC (compile_stack.stack, compile_stack.size << 1, 02367 compile_stack_elt_t); 02368 if (compile_stack.stack == NULL) return REG_ESPACE; 02369 02370 compile_stack.size <<= 1; 02371 } 02372 02373 /* These are the values to restore when we hit end of this 02374 group. They are all relative offsets, so that if the 02375 whole pattern moves because of realloc, they will still 02376 be valid. */ 02377 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; 02378 COMPILE_STACK_TOP.fixup_alt_jump 02379 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; 02380 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; 02381 COMPILE_STACK_TOP.regnum = regnum; 02382 02383 /* We will eventually replace the 0 with the number of 02384 groups inner to this one. But do not push a 02385 start_memory for groups beyond the last one we can 02386 represent in the compiled pattern. */ 02387 if (regnum <= MAX_REGNUM) 02388 { 02389 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; 02390 BUF_PUSH_3 (start_memory, regnum, 0); 02391 } 02392 02393 compile_stack.avail++; 02394 02395 fixup_alt_jump = 0; 02396 laststart = 0; 02397 begalt = b; 02398 /* If we've reached MAX_REGNUM groups, then this open 02399 won't actually generate any code, so we'll have to 02400 clear pending_exact explicitly. */ 02401 pending_exact = 0; 02402 break; 02403 02404 02405 case ')': 02406 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 02407 02408 if (COMPILE_STACK_EMPTY) { 02409 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 02410 goto normal_backslash; 02411 else 02412 FREE_STACK_RETURN (REG_ERPAREN); 02413 } 02414 handle_close: 02415 if (fixup_alt_jump) 02416 { /* Push a dummy failure point at the end of the 02417 alternative for a possible future 02418 `pop_failure_jump' to pop. See comments at 02419 `push_dummy_failure' in `re_match_2'. */ 02420 BUF_PUSH (push_dummy_failure); 02421 02422 /* We allocated space for this jump when we assigned 02423 to `fixup_alt_jump', in the `handle_alt' case below. */ 02424 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 02425 } 02426 02427 /* See similar code for backslashed left paren above. */ 02428 if (COMPILE_STACK_EMPTY) { 02429 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 02430 goto normal_char; 02431 else 02432 FREE_STACK_RETURN (REG_ERPAREN); 02433 } 02434 /* Since we just checked for an empty stack above, this 02435 ``can't happen''. */ 02436 assert (compile_stack.avail != 0); 02437 { 02438 /* We don't just want to restore into `regnum', because 02439 later groups should continue to be numbered higher, 02440 as in `(ab)c(de)' -- the second group is #2. */ 02441 regnum_t this_group_regnum; 02442 02443 compile_stack.avail--; 02444 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; 02445 fixup_alt_jump 02446 = COMPILE_STACK_TOP.fixup_alt_jump 02447 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 02448 : 0; 02449 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; 02450 this_group_regnum = COMPILE_STACK_TOP.regnum; 02451 /* If we've reached MAX_REGNUM groups, then this open 02452 won't actually generate any code, so we'll have to 02453 clear pending_exact explicitly. */ 02454 pending_exact = 0; 02455 02456 /* We're at the end of the group, so now we know how many 02457 groups were inside this one. */ 02458 if (this_group_regnum <= MAX_REGNUM) 02459 { 02460 unsigned char *inner_group_loc 02461 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; 02462 02463 *inner_group_loc = regnum - this_group_regnum; 02464 BUF_PUSH_3 (stop_memory, this_group_regnum, 02465 regnum - this_group_regnum); 02466 } 02467 } 02468 break; 02469 02470 02471 case '|': /* `\|'. */ 02472 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 02473 goto normal_backslash; 02474 handle_alt: 02475 if (syntax & RE_LIMITED_OPS) 02476 goto normal_char; 02477 02478 /* Insert before the previous alternative a jump which 02479 jumps to this alternative if the former fails. */ 02480 GET_BUFFER_SPACE (3); 02481 INSERT_JUMP (on_failure_jump, begalt, b + 6); 02482 pending_exact = 0; 02483 b += 3; 02484 02485 /* The alternative before this one has a jump after it 02486 which gets executed if it gets matched. Adjust that 02487 jump so it will jump to this alternative's analogous 02488 jump (put in below, which in turn will jump to the next 02489 (if any) alternative's such jump, etc.). The last such 02490 jump jumps to the correct final destination. A picture: 02491 _____ _____ 02492 | | | | 02493 | v | v 02494 a | b | c 02495 02496 If we are at `b', then fixup_alt_jump right now points to a 02497 three-byte space after `a'. We'll put in the jump, set 02498 fixup_alt_jump to right after `b', and leave behind three 02499 bytes which we'll fill in when we get to after `c'. */ 02500 02501 if (fixup_alt_jump) 02502 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 02503 02504 /* Mark and leave space for a jump after this alternative, 02505 to be filled in later either by next alternative or 02506 when know we're at the end of a series of alternatives. */ 02507 fixup_alt_jump = b; 02508 GET_BUFFER_SPACE (3); 02509 b += 3; 02510 02511 laststart = 0; 02512 begalt = b; 02513 break; 02514 02515 02516 case '{': 02517 /* If \{ is a literal. */ 02518 if (!(syntax & RE_INTERVALS) 02519 /* If we're at `\{' and it's not the open-interval 02520 operator. */ 02521 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 02522 || (p - 2 == pattern && p == pend)) 02523 goto normal_backslash; 02524 02525 handle_interval: 02526 { 02527 /* If got here, then the syntax allows intervals. */ 02528 02529 /* At least (most) this many matches must be made. */ 02530 int lower_bound = -1, upper_bound = -1; 02531 02532 beg_interval = p - 1; 02533 02534 if (p == pend) 02535 { 02536 if (syntax & RE_NO_BK_BRACES) 02537 goto unfetch_interval; 02538 else 02539 FREE_STACK_RETURN (REG_EBRACE); 02540 } 02541 02542 GET_UNSIGNED_NUMBER (lower_bound); 02543 02544 if (c == ',') 02545 { 02546 GET_UNSIGNED_NUMBER (upper_bound); 02547 if (upper_bound < 0) upper_bound = RE_DUP_MAX; 02548 } 02549 else 02550 /* Interval such as `{1}' => match exactly once. */ 02551 upper_bound = lower_bound; 02552 02553 if (lower_bound < 0 || upper_bound > RE_DUP_MAX 02554 || lower_bound > upper_bound) 02555 { 02556 if (syntax & RE_NO_BK_BRACES) 02557 goto unfetch_interval; 02558 else 02559 FREE_STACK_RETURN (REG_BADBR); 02560 } 02561 02562 if (!(syntax & RE_NO_BK_BRACES)) 02563 { 02564 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); 02565 02566 PATFETCH (c); 02567 } 02568 02569 if (c != '}') 02570 { 02571 if (syntax & RE_NO_BK_BRACES) 02572 goto unfetch_interval; 02573 else 02574 FREE_STACK_RETURN (REG_BADBR); 02575 } 02576 02577 /* We just parsed a valid interval. */ 02578 02579 /* If it's invalid to have no preceding re. */ 02580 if (!laststart) 02581 { 02582 if (syntax & RE_CONTEXT_INVALID_OPS) 02583 FREE_STACK_RETURN (REG_BADRPT); 02584 else if (syntax & RE_CONTEXT_INDEP_OPS) 02585 laststart = b; 02586 else 02587 goto unfetch_interval; 02588 } 02589 02590 /* If the upper bound is zero, don't want to succeed at 02591 all; jump from `laststart' to `b + 3', which will be 02592 the end of the buffer after we insert the jump. */ 02593 if (upper_bound == 0) 02594 { 02595 GET_BUFFER_SPACE (3); 02596 INSERT_JUMP (jump, laststart, b + 3); 02597 b += 3; 02598 } 02599 02600 /* Otherwise, we have a nontrivial interval. When 02601 we're all done, the pattern will look like: 02602 set_number_at <jump count> <upper bound> 02603 set_number_at <succeed_n count> <lower bound> 02604 succeed_n <after jump addr> <succeed_n count> 02605 <body of loop> 02606 jump_n <succeed_n addr> <jump count> 02607 (The upper bound and `jump_n' are omitted if 02608 `upper_bound' is 1, though.) */ 02609 else 02610 { /* If the upper bound is > 1, we need to insert 02611 more at the end of the loop. */ 02612 unsigned nbytes = 10 + (upper_bound > 1) * 10; 02613 02614 GET_BUFFER_SPACE (nbytes); 02615 02616 /* Initialize lower bound of the `succeed_n', even 02617 though it will be set during matching by its 02618 attendant `set_number_at' (inserted next), 02619 because `re_compile_fastmap' needs to know. 02620 Jump to the `jump_n' we might insert below. */ 02621 INSERT_JUMP2 (succeed_n, laststart, 02622 b + 5 + (upper_bound > 1) * 5, 02623 lower_bound); 02624 b += 5; 02625 02626 /* Code to initialize the lower bound. Insert 02627 before the `succeed_n'. The `5' is the last two 02628 bytes of this `set_number_at', plus 3 bytes of 02629 the following `succeed_n'. */ 02630 insert_op2 (set_number_at, laststart, 5, lower_bound, b); 02631 b += 5; 02632 02633 if (upper_bound > 1) 02634 { /* More than one repetition is allowed, so 02635 append a backward jump to the `succeed_n' 02636 that starts this interval. 02637 02638 When we've reached this during matching, 02639 we'll have matched the interval once, so 02640 jump back only `upper_bound - 1' times. */ 02641 STORE_JUMP2 (jump_n, b, laststart + 5, 02642 upper_bound - 1); 02643 b += 5; 02644 02645 /* The location we want to set is the second 02646 parameter of the `jump_n'; that is `b-2' as 02647 an absolute address. `laststart' will be 02648 the `set_number_at' we're about to insert; 02649 `laststart+3' the number to set, the source 02650 for the relative address. But we are 02651 inserting into the middle of the pattern -- 02652 so everything is getting moved up by 5. 02653 Conclusion: (b - 2) - (laststart + 3) + 5, 02654 i.e., b - laststart. 02655 02656 We insert this at the beginning of the loop 02657 so that if we fail during matching, we'll 02658 reinitialize the bounds. */ 02659 insert_op2 (set_number_at, laststart, b - laststart, 02660 upper_bound - 1, b); 02661 b += 5; 02662 } 02663 } 02664 pending_exact = 0; 02665 beg_interval = NULL; 02666 } 02667 break; 02668 02669 unfetch_interval: 02670 /* If an invalid interval, match the characters as literals. */ 02671 assert (beg_interval); 02672 p = beg_interval; 02673 beg_interval = NULL; 02674 02675 /* normal_char and normal_backslash need `c'. */ 02676 PATFETCH (c); 02677 02678 if (!(syntax & RE_NO_BK_BRACES)) 02679 { 02680 if (p > pattern && p[-1] == '\\') 02681 goto normal_backslash; 02682 } 02683 goto normal_char; 02684 02685 #ifdef emacs 02686 /* There is no way to specify the before_dot and after_dot 02687 operators. rms says this is ok. --karl */ 02688 case '=': 02689 BUF_PUSH (at_dot); 02690 break; 02691 02692 case 's': 02693 laststart = b; 02694 PATFETCH (c); 02695 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 02696 break; 02697 02698 case 'S': 02699 laststart = b; 02700 PATFETCH (c); 02701 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 02702 break; 02703 #endif /* emacs */ 02704 02705 02706 case 'w': 02707 if (re_syntax_options & RE_NO_GNU_OPS) 02708 goto normal_char; 02709 laststart = b; 02710 BUF_PUSH (wordchar); 02711 break; 02712 02713 02714 case 'W': 02715 if (re_syntax_options & RE_NO_GNU_OPS) 02716 goto normal_char; 02717 laststart = b; 02718 BUF_PUSH (notwordchar); 02719 break; 02720 02721 02722 case '<': 02723 if (re_syntax_options & RE_NO_GNU_OPS) 02724 goto normal_char; 02725 BUF_PUSH (wordbeg); 02726 break; 02727 02728 case '>': 02729 if (re_syntax_options & RE_NO_GNU_OPS) 02730 goto normal_char; 02731 BUF_PUSH (wordend); 02732 break; 02733 02734 case 'b': 02735 if (re_syntax_options & RE_NO_GNU_OPS) 02736 goto normal_char; 02737 BUF_PUSH (wordbound); 02738 break; 02739 02740 case 'B': 02741 if (re_syntax_options & RE_NO_GNU_OPS) 02742 goto normal_char; 02743 BUF_PUSH (notwordbound); 02744 break; 02745 02746 case '`': 02747 if (re_syntax_options & RE_NO_GNU_OPS) 02748 goto normal_char; 02749 BUF_PUSH (begbuf); 02750 break; 02751 02752 case '\'': 02753 if (re_syntax_options & RE_NO_GNU_OPS) 02754 goto normal_char; 02755 BUF_PUSH (endbuf); 02756 break; 02757 02758 case '1': case '2': case '3': case '4': case '5': 02759 case '6': case '7': case '8': case '9': 02760 if (syntax & RE_NO_BK_REFS) 02761 goto normal_char; 02762 02763 c1 = c - '0'; 02764 02765 if (c1 > regnum) 02766 FREE_STACK_RETURN (REG_ESUBREG); 02767 02768 /* Can't back reference to a subexpression if inside of it. */ 02769 if (group_in_compile_stack (compile_stack, (regnum_t) c1)) 02770 goto normal_char; 02771 02772 laststart = b; 02773 BUF_PUSH_2 (duplicate, c1); 02774 break; 02775 02776 02777 case '+': 02778 case '?': 02779 if (syntax & RE_BK_PLUS_QM) 02780 goto handle_plus; 02781 else 02782 goto normal_backslash; 02783 02784 default: 02785 normal_backslash: 02786 /* You might think it would be useful for \ to mean 02787 not to translate; but if we don't translate it 02788 it will never match anything. */ 02789 c = TRANSLATE (c); 02790 goto normal_char; 02791 } 02792 break; 02793 02794 02795 default: 02796 /* Expects the character in `c'. */ 02797 normal_char: 02798 /* If no exactn currently being built. */ 02799 if (!pending_exact 02800 02801 /* If last exactn not at current position. */ 02802 || pending_exact + *pending_exact + 1 != b 02803 02804 /* We have only one byte following the exactn for the count. */ 02805 || *pending_exact == (1 << BYTEWIDTH) - 1 02806 02807 /* If followed by a repetition operator. */ 02808 || *p == '*' || *p == '^' 02809 || ((syntax & RE_BK_PLUS_QM) 02810 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 02811 : (*p == '+' || *p == '?')) 02812 || ((syntax & RE_INTERVALS) 02813 && ((syntax & RE_NO_BK_BRACES) 02814 ? *p == '{' 02815 : (p[0] == '\\' && p[1] == '{')))) 02816 { 02817 /* Start building a new exactn. */ 02818 02819 laststart = b; 02820 02821 BUF_PUSH_2 (exactn, 0); 02822 pending_exact = b - 1; 02823 } 02824 02825 BUF_PUSH (c); 02826 (*pending_exact)++; 02827 break; 02828 } /* switch (c) */ 02829 } /* while p != pend */ 02830 02831 02832 /* Through the pattern now. */ 02833 02834 if (fixup_alt_jump) 02835 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 02836 02837 if (!COMPILE_STACK_EMPTY) 02838 FREE_STACK_RETURN (REG_EPAREN); 02839 02840 /* If we don't want backtracking, force success 02841 the first time we reach the end of the compiled pattern. */ 02842 if (syntax & RE_NO_POSIX_BACKTRACKING) 02843 BUF_PUSH (succeed); 02844 02845 free (compile_stack.stack); 02846 02847 /* We have succeeded; set the length of the buffer. */ 02848 bufp->used = b - bufp->buffer; 02849 02850 #ifdef DEBUG 02851 if (debug) 02852 { 02853 DEBUG_PRINT1 ("\nCompiled pattern: \n"); 02854 print_compiled_pattern (bufp); 02855 } 02856 #endif /* DEBUG */ 02857 02858 #ifndef MATCH_MAY_ALLOCATE 02859 /* Initialize the failure stack to the largest possible stack. This 02860 isn't necessary unless we're trying to avoid calling alloca in 02861 the search and match routines. */ 02862 { 02863 int num_regs = bufp->re_nsub + 1; 02864 02865 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size 02866 is strictly greater than re_max_failures, the largest possible stack 02867 is 2 * re_max_failures failure points. */ 02868 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) 02869 { 02870 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); 02871 02872 #ifdef emacs 02873 if (! fail_stack.stack) 02874 fail_stack.stack 02875 = (fail_stack_elt_t *) xmalloc (fail_stack.size 02876 * sizeof (fail_stack_elt_t)); 02877 else 02878 fail_stack.stack 02879 = (fail_stack_elt_t *) xrealloc (fail_stack.stack, 02880 (fail_stack.size 02881 * sizeof (fail_stack_elt_t))); 02882 #else /* not emacs */ 02883 if (! fail_stack.stack) 02884 fail_stack.stack 02885 = (fail_stack_elt_t *) malloc (fail_stack.size 02886 * sizeof (fail_stack_elt_t)); 02887 else 02888 fail_stack.stack 02889 = (fail_stack_elt_t *) realloc (fail_stack.stack, 02890 (fail_stack.size 02891 * sizeof (fail_stack_elt_t))); 02892 #endif /* not emacs */ 02893 } 02894 02895 regex_grow_registers (num_regs); 02896 } 02897 #endif /* not MATCH_MAY_ALLOCATE */ 02898 02899 return REG_NOERROR; 02900 } /* regex_compile */ 02901 02902 /* Subroutines for `regex_compile'. */ 02903 02904 /* Store OP at LOC followed by two-byte integer parameter ARG. */ 02905 02906 static void 02907 store_op1 (re_opcode_t op, 02908 unsigned char *loc, 02909 int arg) 02910 { 02911 *loc = (unsigned char) op; 02912 STORE_NUMBER (loc + 1, arg); 02913 } 02914 02915 02916 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 02917 02918 static void 02919 store_op2(re_opcode_t op, 02920 unsigned char *loc, 02921 int arg1, 02922 int arg2) 02923 { 02924 *loc = (unsigned char) op; 02925 STORE_NUMBER (loc + 1, arg1); 02926 STORE_NUMBER (loc + 3, arg2); 02927 } 02928 02929 02930 /* Copy the bytes from LOC to END to open up three bytes of space at LOC 02931 for OP followed by two-byte integer parameter ARG. */ 02932 02933 static void 02934 insert_op1(re_opcode_t op, 02935 unsigned char *loc, 02936 int arg, 02937 unsigned char *end) 02938 { 02939 register unsigned char *pfrom = end; 02940 register unsigned char *pto = end + 3; 02941 02942 while (pfrom != loc) 02943 *--pto = *--pfrom; 02944 02945 store_op1 (op, loc, arg); 02946 } 02947 02948 02949 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 02950 02951 static void 02952 insert_op2(re_opcode_t op, 02953 unsigned char *loc, 02954 int arg1, 02955 int arg2, 02956 unsigned char *end) 02957 { 02958 register unsigned char *pfrom = end; 02959 register unsigned char *pto = end + 5; 02960 02961 while (pfrom != loc) 02962 *--pto = *--pfrom; 02963 02964 store_op2 (op, loc, arg1, arg2); 02965 } 02966 02967 02968 /* P points to just after a ^ in PATTERN. Return true if that ^ comes 02969 after an alternative or a begin-subexpression. We assume there is at 02970 least one character before the ^. */ 02971 02972 static boolean 02973 at_begline_loc_p(const char *pattern, 02974 const char *p, 02975 reg_syntax_t syntax) 02976 { 02977 const char *prev = p - 2; 02978 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 02979 02980 return 02981 /* After a subexpression? */ 02982 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 02983 /* After an alternative? */ 02984 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 02985 } 02986 02987 02988 /* The dual of at_begline_loc_p. This one is for $. We assume there is 02989 at least one character after the $, i.e., `P < PEND'. */ 02990 02991 static boolean 02992 at_endline_loc_p(const char *p, 02993 const char *pend, 02994 reg_syntax_t syntax) 02995 { 02996 const char *next = p; 02997 boolean next_backslash = *next == '\\'; 02998 const char *next_next = p + 1 < pend ? p + 1 : 0; 02999 03000 return 03001 /* Before a subexpression? */ 03002 (syntax & RE_NO_BK_PARENS ? *next == ')' 03003 : next_backslash && next_next && *next_next == ')') 03004 /* Before an alternative? */ 03005 || (syntax & RE_NO_BK_VBAR ? *next == '|' 03006 : next_backslash && next_next && *next_next == '|'); 03007 } 03008 03009 03010 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and 03011 false if it's not. */ 03012 03013 static boolean 03014 group_in_compile_stack(compile_stack_type compile_stack, 03015 regnum_t regnum) 03016 { 03017 int this_element; 03018 03019 for (this_element = compile_stack.avail - 1; 03020 this_element >= 0; 03021 this_element--) 03022 if (compile_stack.stack[this_element].regnum == regnum) 03023 return true; 03024 03025 return false; 03026 } 03027 03028 03029 /* Read the ending character of a range (in a bracket expression) from the 03030 uncompiled pattern *P_PTR (which ends at PEND). We assume the 03031 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 03032 Then we set the translation of all bits between the starting and 03033 ending characters (inclusive) in the compiled pattern B. 03034 03035 Return an error code. 03036 03037 We use these short variable names so we can use the same macros as 03038 `regex_compile' itself. */ 03039 03040 static reg_errcode_t 03041 compile_range(const char **p_ptr, 03042 const char *pend, 03043 RE_TRANSLATE_TYPE translate, 03044 reg_syntax_t syntax, 03045 unsigned char *b) 03046 { 03047 unsigned this_char; 03048 03049 const char *p = *p_ptr; 03050 unsigned int range_start, range_end; 03051 03052 if (p == pend) 03053 return REG_ERANGE; 03054 03055 /* Even though the pattern is a signed `char *', we need to fetch 03056 with unsigned char *'s; if the high bit of the pattern character 03057 is set, the range endpoints will be negative if we fetch using a 03058 signed char *. 03059 03060 We also want to fetch the endpoints without translating them; the 03061 appropriate translation is done in the bit-setting loop below. */ 03062 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ 03063 range_start = ((const unsigned char *) p)[-2]; 03064 range_end = ((const unsigned char *) p)[0]; 03065 03066 /* Have to increment the pointer into the pattern string, so the 03067 caller isn't still at the ending character. */ 03068 (*p_ptr)++; 03069 03070 /* If the start is after the end, the range is empty. */ 03071 if (range_start > range_end) 03072 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 03073 03074 /* Here we see why `this_char' has to be larger than an `unsigned 03075 char' -- the range is inclusive, so if `range_end' == 0xff 03076 (assuming 8-bit characters), we would otherwise go into an infinite 03077 loop, since all characters <= 0xff. */ 03078 for (this_char = range_start; this_char <= range_end; this_char++) 03079 { 03080 SET_LIST_BIT (TRANSLATE (this_char)); 03081 } 03082 03083 return REG_NOERROR; 03084 } 03085 03086 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 03087 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 03088 characters can start a string that matches the pattern. This fastmap 03089 is used by re_search to skip quickly over impossible starting points. 03090 03091 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 03092 area as BUFP->fastmap. 03093 03094 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 03095 the pattern buffer. 03096 03097 Returns 0 if we succeed, -2 if an internal error. */ 03098 03099 int 03100 re_compile_fastmap(struct re_pattern_buffer *bufp) 03101 { 03102 int j, k; 03103 #ifdef MATCH_MAY_ALLOCATE 03104 fail_stack_type fail_stack; 03105 #endif 03106 #ifndef REGEX_MALLOC 03107 char *destination; 03108 #endif 03109 03110 register char *fastmap = bufp->fastmap; 03111 unsigned char *pattern = bufp->buffer; 03112 unsigned char *p = pattern; 03113 register unsigned char *pend = pattern + bufp->used; 03114 03115 #ifdef REL_ALLOC 03116 /* This holds the pointer to the failure stack, when 03117 it is allocated relocatably. */ 03118 fail_stack_elt_t *failure_stack_ptr; 03119 #endif 03120 03121 /* Assume that each path through the pattern can be null until 03122 proven otherwise. We set this false at the bottom of switch 03123 statement, to which we get only if a particular path doesn't 03124 match the empty string. */ 03125 boolean path_can_be_null = true; 03126 03127 /* We aren't doing a `succeed_n' to begin with. */ 03128 boolean succeed_n_p = false; 03129 03130 assert (fastmap != NULL && p != NULL); 03131 03132 INIT_FAIL_STACK (); 03133 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 03134 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 03135 bufp->can_be_null = 0; 03136 03137 while (1) 03138 { 03139 if (p == pend || *p == succeed) 03140 { 03141 /* We have reached the (effective) end of pattern. */ 03142 if (!FAIL_STACK_EMPTY ()) 03143 { 03144 bufp->can_be_null |= path_can_be_null; 03145 03146 /* Reset for next path. */ 03147 path_can_be_null = true; 03148 03149 p = fail_stack.stack[--fail_stack.avail].pointer; 03150 03151 continue; 03152 } 03153 else 03154 break; 03155 } 03156 03157 /* We should never be about to go beyond the end of the pattern. */ 03158 assert (p < pend); 03159 03160 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 03161 { 03162 03163 /* I guess the idea here is to simply not bother with a fastmap 03164 if a backreference is used, since it's too hard to figure out 03165 the fastmap for the corresponding group. Setting 03166 `can_be_null' stops `re_search_2' from using the fastmap, so 03167 that is all we do. */ 03168 case duplicate: 03169 bufp->can_be_null = 1; 03170 goto done; 03171 03172 03173 /* Following are the cases which match a character. These end 03174 with `break'. */ 03175 03176 case exactn: 03177 fastmap[p[1]] = 1; 03178 break; 03179 03180 03181 case charset: 03182 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 03183 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 03184 fastmap[j] = 1; 03185 break; 03186 03187 03188 case charset_not: 03189 /* Chars beyond end of map must be allowed. */ 03190 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 03191 fastmap[j] = 1; 03192 03193 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 03194 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 03195 fastmap[j] = 1; 03196 break; 03197 03198 03199 case wordchar: 03200 for (j = 0; j < (1 << BYTEWIDTH); j++) 03201 if (SYNTAX (j) == Sword) 03202 fastmap[j] = 1; 03203 break; 03204 03205 03206 case notwordchar: 03207 for (j = 0; j < (1 << BYTEWIDTH); j++) 03208 if (SYNTAX (j) != Sword) 03209 fastmap[j] = 1; 03210 break; 03211 03212 03213 case anychar: 03214 { 03215 int fastmap_newline = fastmap['\n']; 03216 03217 /* `.' matches anything ... */ 03218 for (j = 0; j < (1 << BYTEWIDTH); j++) 03219 fastmap[j] = 1; 03220 03221 /* ... except perhaps newline. */ 03222 if (!(bufp->syntax & RE_DOT_NEWLINE)) 03223 fastmap['\n'] = fastmap_newline; 03224 03225 /* Return if we have already set `can_be_null'; if we have, 03226 then the fastmap is irrelevant. Something's wrong here. */ 03227 else if (bufp->can_be_null) 03228 goto done; 03229 03230 /* Otherwise, have to check alternative paths. */ 03231 break; 03232 } 03233 03234 #ifdef emacs 03235 case syntaxspec: 03236 k = *p++; 03237 for (j = 0; j < (1 << BYTEWIDTH); j++) 03238 if (SYNTAX (j) == (enum syntaxcode) k) 03239 fastmap[j] = 1; 03240 break; 03241 03242 03243 case notsyntaxspec: 03244 k = *p++; 03245 for (j = 0; j < (1 << BYTEWIDTH); j++) 03246 if (SYNTAX (j) != (enum syntaxcode) k) 03247 fastmap[j] = 1; 03248 break; 03249 03250 03251 /* All cases after this match the empty string. These end with 03252 `continue'. */ 03253 03254 03255 case before_dot: 03256 case at_dot: 03257 case after_dot: 03258 continue; 03259 #endif /* emacs */ 03260 03261 03262 case no_op: 03263 case begline: 03264 case endline: 03265 case begbuf: 03266 case endbuf: 03267 case wordbound: 03268 case notwordbound: 03269 case wordbeg: 03270 case wordend: 03271 case push_dummy_failure: 03272 continue; 03273 03274 03275 case jump_n: 03276 case pop_failure_jump: 03277 case maybe_pop_jump: 03278 case jump: 03279 case jump_past_alt: 03280 case dummy_failure_jump: 03281 EXTRACT_NUMBER_AND_INCR (j, p); 03282 p += j; 03283 if (j > 0) 03284 continue; 03285 03286 /* Jump backward implies we just went through the body of a 03287 loop and matched nothing. Opcode jumped to should be 03288 `on_failure_jump' or `succeed_n'. Just treat it like an 03289 ordinary jump. For a * loop, it has pushed its failure 03290 point already; if so, discard that as redundant. */ 03291 if ((re_opcode_t) *p != on_failure_jump 03292 && (re_opcode_t) *p != succeed_n) 03293 continue; 03294 03295 p++; 03296 EXTRACT_NUMBER_AND_INCR (j, p); 03297 p += j; 03298 03299 /* If what's on the stack is where we are now, pop it. */ 03300 if (!FAIL_STACK_EMPTY () 03301 && fail_stack.stack[fail_stack.avail - 1].pointer == p) 03302 fail_stack.avail--; 03303 03304 continue; 03305 03306 03307 case on_failure_jump: 03308 case on_failure_keep_string_jump: 03309 handle_on_failure_jump: 03310 EXTRACT_NUMBER_AND_INCR (j, p); 03311 03312 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 03313 end of the pattern. We don't want to push such a point, 03314 since when we restore it above, entering the switch will 03315 increment `p' past the end of the pattern. We don't need 03316 to push such a point since we obviously won't find any more 03317 fastmap entries beyond `pend'. Such a pattern can match 03318 the null string, though. */ 03319 if (p + j < pend) 03320 { 03321 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 03322 { 03323 RESET_FAIL_STACK (); 03324 return -2; 03325 } 03326 } 03327 else 03328 bufp->can_be_null = 1; 03329 03330 if (succeed_n_p) 03331 { 03332 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 03333 succeed_n_p = false; 03334 } 03335 03336 continue; 03337 03338 03339 case succeed_n: 03340 /* Get to the number of times to succeed. */ 03341 p += 2; 03342 03343 /* Increment p past the n for when k != 0. */ 03344 EXTRACT_NUMBER_AND_INCR (k, p); 03345 if (k == 0) 03346 { 03347 p -= 4; 03348 succeed_n_p = true; /* Spaghetti code alert. */ 03349 goto handle_on_failure_jump; 03350 } 03351 continue; 03352 03353 03354 case set_number_at: 03355 p += 4; 03356 continue; 03357 03358 03359 case start_memory: 03360 case stop_memory: 03361 p += 2; 03362 continue; 03363 03364 03365 default: 03366 abort (); /* We have listed all the cases. */ 03367 } /* switch *p++ */ 03368 03369 /* Getting here means we have found the possible starting 03370 characters for one path of the pattern -- and that the empty 03371 string does not match. We need not follow this path further. 03372 Instead, look at the next alternative (remembered on the 03373 stack), or quit if no more. The test at the top of the loop 03374 does these things. */ 03375 path_can_be_null = false; 03376 p = pend; 03377 } /* while p */ 03378 03379 /* Set `can_be_null' for the last path (also the first path, if the 03380 pattern is empty). */ 03381 bufp->can_be_null |= path_can_be_null; 03382 03383 done: 03384 RESET_FAIL_STACK (); 03385 return 0; 03386 } /* re_compile_fastmap */ 03387 03388 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 03389 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 03390 this memory for recording register information. STARTS and ENDS 03391 must be allocated using the malloc library routine, and must each 03392 be at least NUM_REGS * sizeof (regoff_t) bytes long. 03393 03394 If NUM_REGS == 0, then subsequent matches should allocate their own 03395 register data. 03396 03397 Unless this function is called, the first search or match using 03398 PATTERN_BUFFER will allocate its own register data, without 03399 freeing the old data. */ 03400 03401 void 03402 re_set_registers(struct re_pattern_buffer *bufp, 03403 struct re_registers *regs, 03404 unsigned num_regs, 03405 regoff_t *starts, 03406 regoff_t *ends) 03407 { 03408 if (num_regs) 03409 { 03410 bufp->regs_allocated = REGS_REALLOCATE; 03411 regs->num_regs = num_regs; 03412 regs->start = starts; 03413 regs->end = ends; 03414 } 03415 else 03416 { 03417 bufp->regs_allocated = REGS_UNALLOCATED; 03418 regs->num_regs = 0; 03419 regs->start = regs->end = (regoff_t *) 0; 03420 } 03421 } 03422 03423 /* Searching routines. */ 03424 03425 /* Like re_search_2, below, but only one string is specified, and 03426 doesn't let you say where to stop matching. */ 03427 03428 int 03429 re_search(struct re_pattern_buffer *bufp, 03430 const char *string, 03431 int size, 03432 int startpos, 03433 int range, 03434 struct re_registers *regs) 03435 { 03436 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, 03437 regs, size); 03438 } 03439 03440 03441 /* Using the compiled pattern in BUFP->buffer, first tries to match the 03442 virtual concatenation of STRING1 and STRING2, starting first at index 03443 STARTPOS, then at STARTPOS + 1, and so on. 03444 03445 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. 03446 03447 RANGE is how far to scan while trying to match. RANGE = 0 means try 03448 only at STARTPOS; in general, the last start tried is STARTPOS + 03449 RANGE. 03450 03451 In REGS, return the indices of the virtual concatenation of STRING1 03452 and STRING2 that matched the entire BUFP->buffer and its contained 03453 subexpressions. 03454 03455 Do not consider matching one past the index STOP in the virtual 03456 concatenation of STRING1 and STRING2. 03457 03458 We return either the position in the strings at which the match was 03459 found, -1 if no match, or -2 if error (such as failure 03460 stack overflow). */ 03461 03462 int 03463 re_search_2(struct re_pattern_buffer *bufp, 03464 const char *string1, 03465 int size1, 03466 const char *string2, 03467 int size2, 03468 int startpos, 03469 int range, 03470 struct re_registers *regs, 03471 int stop) 03472 { 03473 int val; 03474 register char *fastmap = bufp->fastmap; 03475 register RE_TRANSLATE_TYPE translate = bufp->translate; 03476 int total_size = size1 + size2; 03477 int endpos = startpos + range; 03478 03479 /* Check for out-of-range STARTPOS. */ 03480 if (startpos < 0 || startpos > total_size) 03481 return -1; 03482 03483 /* Fix up RANGE if it might eventually take us outside 03484 the virtual concatenation of STRING1 and STRING2. 03485 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */ 03486 if (endpos < 0) 03487 range = 0 - startpos; 03488 else if (endpos > total_size) 03489 range = total_size - startpos; 03490 03491 /* If the search isn't to be a backwards one, don't waste time in a 03492 search for a pattern that must be anchored. */ 03493 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) 03494 { 03495 if (startpos > 0) 03496 return -1; 03497 else 03498 range = 1; 03499 } 03500 03501 #ifdef emacs 03502 /* In a forward search for something that starts with \=. 03503 don't keep searching past point. */ 03504 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) 03505 { 03506 range = PT - startpos; 03507 if (range <= 0) 03508 return -1; 03509 } 03510 #endif /* emacs */ 03511 03512 /* Update the fastmap now if not correct already. */ 03513 if (fastmap && !bufp->fastmap_accurate) 03514 if (re_compile_fastmap (bufp) == -2) 03515 return -2; 03516 03517 /* Loop through the string, looking for a place to start matching. */ 03518 for (;;) 03519 { 03520 /* If a fastmap is supplied, skip quickly over characters that 03521 cannot be the start of a match. If the pattern can match the 03522 null string, however, we don't need to skip characters; we want 03523 the first null string. */ 03524 if (fastmap && startpos < total_size && !bufp->can_be_null) 03525 { 03526 if (range > 0) /* Searching forwards. */ 03527 { 03528 register const char *d; 03529 register int lim = 0; 03530 int irange = range; 03531 03532 if (startpos < size1 && startpos + range >= size1) 03533 lim = range - (size1 - startpos); 03534 03535 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 03536 03537 /* Written out as an if-else to avoid testing `translate' 03538 inside the loop. */ 03539 if (translate) 03540 while (range > lim 03541 && !fastmap[(unsigned char) 03542 translate[(unsigned char) *d++]]) 03543 range--; 03544 else 03545 while (range > lim && !fastmap[(unsigned char) *d++]) 03546 range--; 03547 03548 startpos += irange - range; 03549 } 03550 else /* Searching backwards. */ 03551 { 03552 register char c = (size1 == 0 || startpos >= size1 03553 ? string2[startpos - size1] 03554 : string1[startpos]); 03555 03556 if (!fastmap[(unsigned char) TRANSLATE (c)]) 03557 goto advance; 03558 } 03559 } 03560 03561 /* If can't match the null string, and that's all we have left, fail. */ 03562 if (range >= 0 && startpos == total_size && fastmap 03563 && !bufp->can_be_null) 03564 return -1; 03565 03566 val = re_match_2_internal (bufp, string1, size1, string2, size2, 03567 startpos, regs, stop); 03568 #ifndef REGEX_MALLOC 03569 #ifdef C_ALLOCA 03570 alloca (0); 03571 #endif 03572 #endif 03573 03574 if (val >= 0) 03575 return startpos; 03576 03577 if (val == -2) 03578 return -2; 03579 03580 advance: 03581 if (!range) 03582 break; 03583 else if (range > 0) 03584 { 03585 range--; 03586 startpos++; 03587 } 03588 else 03589 { 03590 range++; 03591 startpos--; 03592 } 03593 } 03594 return -1; 03595 } /* re_search_2 */ 03596 03597 /* This converts PTR, a pointer into one of the search strings `string1' 03598 and `string2' into an offset from the beginning of that string. */ 03599 #define POINTER_TO_OFFSET(ptr) \ 03600 (FIRST_STRING_P (ptr) \ 03601 ? ((regoff_t) ((ptr) - string1)) \ 03602 : ((regoff_t) ((ptr) - string2 + size1))) 03603 03604 /* Macros for dealing with the split strings in re_match_2. */ 03605 03606 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) 03607 03608 /* Call before fetching a character with *d. This switches over to 03609 string2 if necessary. */ 03610 #define PREFETCH() \ 03611 while (d == dend) \ 03612 { \ 03613 /* End of string2 => fail. */ \ 03614 if (dend == end_match_2) \ 03615 goto fail; \ 03616 /* End of string1 => advance to string2. */ \ 03617 d = string2; \ 03618 dend = end_match_2; \ 03619 } 03620 03621 03622 /* Test if at very beginning or at very end of the virtual concatenation 03623 of `string1' and `string2'. If only one string, it's `string2'. */ 03624 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 03625 #define AT_STRINGS_END(d) ((d) == end2) 03626 03627 03628 /* Test if D points to a character which is word-constituent. We have 03629 two special cases to check for: if past the end of string1, look at 03630 the first character in string2; and if before the beginning of 03631 string2, look at the last character in string1. */ 03632 #define WORDCHAR_P(d) \ 03633 (SYNTAX ((d) == end1 ? *string2 \ 03634 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 03635 == Sword) 03636 03637 /* Disabled due to a compiler bug -- see comment at case wordbound */ 03638 #if 0 03639 /* Test if the character before D and the one at D differ with respect 03640 to being word-constituent. */ 03641 #define AT_WORD_BOUNDARY(d) \ 03642 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 03643 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 03644 #endif 03645 03646 /* Free everything we malloc. */ 03647 #ifdef MATCH_MAY_ALLOCATE 03648 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL 03649 #define FREE_VARIABLES() \ 03650 do { \ 03651 REGEX_FREE_STACK (fail_stack.stack); \ 03652 FREE_VAR ((void*) regstart); \ 03653 FREE_VAR ((void*) regend); \ 03654 FREE_VAR ((void*) old_regstart); \ 03655 FREE_VAR ((void*) old_regend); \ 03656 FREE_VAR ((void*) best_regstart); \ 03657 FREE_VAR ((void*) best_regend); \ 03658 FREE_VAR ((void*) reg_info); \ 03659 FREE_VAR ((void*) reg_dummy); \ 03660 FREE_VAR ((void*) reg_info_dummy); \ 03661 } while (0) 03662 #else 03663 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ 03664 #endif /* not MATCH_MAY_ALLOCATE */ 03665 03666 /* These values must meet several constraints. They must not be valid 03667 register values; since we have a limit of 255 registers (because 03668 we use only one byte in the pattern for the register number), we can 03669 use numbers larger than 255. They must differ by 1, because of 03670 NUM_FAILURE_ITEMS above. And the value for the lowest register must 03671 be larger than the value for the highest register, so we do not try 03672 to actually save any registers when none are active. */ 03673 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 03674 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 03675 03676 /* Matching routines. */ 03677 03678 #ifndef emacs /* Emacs never uses this. */ 03679 /* re_match is like re_match_2 except it takes only a single string. */ 03680 03681 int 03682 re_match(struct re_pattern_buffer *bufp, 03683 const char *string, 03684 int size, 03685 int pos, 03686 struct re_registers *regs) 03687 { 03688 int result = re_match_2_internal (bufp, NULL, 0, string, size, 03689 pos, regs, size); 03690 #ifndef REGEX_MALLOC 03691 #ifdef C_ALLOCA 03692 alloca (0); 03693 #endif 03694 #endif 03695 return result; 03696 } 03697 #endif /* not emacs */ 03698 03699 static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p, 03700 unsigned char *end, 03701 register_info_type *reg_info)); 03702 static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p, 03703 unsigned char *end, 03704 register_info_type *reg_info)); 03705 static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p, 03706 unsigned char *end, 03707 register_info_type *reg_info)); 03708 static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2, 03709 int len, char *translate)); 03710 03711 /* re_match_2 matches the compiled pattern in BUFP against the 03712 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 03713 and SIZE2, respectively). We start matching at POS, and stop 03714 matching at STOP. 03715 03716 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we 03717 store offsets for the substring each group matched in REGS. See the 03718 documentation for exactly how many groups we fill. 03719 03720 We return -1 if no match, -2 if an internal error (such as the 03721 failure stack overflowing). Otherwise, we return the length of the 03722 matched substring. */ 03723 03724 int 03725 re_match_2(struct re_pattern_buffer *bufp, 03726 const char *string1, 03727 int size1, 03728 const char *string2, 03729 int size2, 03730 int pos, 03731 struct re_registers *regs, 03732 int stop) 03733 { 03734 int result = re_match_2_internal (bufp, string1, size1, string2, size2, 03735 pos, regs, stop); 03736 #ifndef REGEX_MALLOC 03737 #ifdef C_ALLOCA 03738 alloca (0); 03739 #endif 03740 #endif 03741 return result; 03742 } 03743 03744 /* This is a separate function so that we can force an alloca cleanup 03745 afterwards. */ 03746 static int 03747 re_match_2_internal(struct re_pattern_buffer *bufp, 03748 const char *string1, 03749 int size1, 03750 const char *string2, 03751 int size2, 03752 int pos, 03753 struct re_registers *regs, 03754 int stop) 03755 { 03756 /* General temporaries. */ 03757 int mcnt; 03758 unsigned char *p1; 03759 03760 /* Just past the end of the corresponding string. */ 03761 const char *end1, *end2; 03762 03763 /* Pointers into string1 and string2, just past the last characters in 03764 each to consider matching. */ 03765 const char *end_match_1, *end_match_2; 03766 03767 /* Where we are in the data, and the end of the current string. */ 03768 const char *d, *dend; 03769 03770 /* Where we are in the pattern, and the end of the pattern. */ 03771 unsigned char *p = bufp->buffer; 03772 register unsigned char *pend = p + bufp->used; 03773 03774 /* Mark the opcode just after a start_memory, so we can test for an 03775 empty subpattern when we get to the stop_memory. */ 03776 unsigned char *just_past_start_mem = 0; 03777 03778 /* We use this to map every character in the string. */ 03779 RE_TRANSLATE_TYPE translate = bufp->translate; 03780 03781 /* Failure point stack. Each place that can handle a failure further 03782 down the line pushes a failure point on this stack. It consists of 03783 restart, regend, and reg_info for all registers corresponding to 03784 the subexpressions we're currently inside, plus the number of such 03785 registers, and, finally, two char *'s. The first char * is where 03786 to resume scanning the pattern; the second one is where to resume 03787 scanning the strings. If the latter is zero, the failure point is 03788 a ``dummy''; if a failure happens and the failure point is a dummy, 03789 it gets discarded and the next next one is tried. */ 03790 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 03791 fail_stack_type fail_stack; 03792 #endif 03793 #ifdef DEBUG 03794 static unsigned failure_id = 0; 03795 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 03796 #endif 03797 03798 #ifdef REL_ALLOC 03799 /* This holds the pointer to the failure stack, when 03800 it is allocated relocatably. */ 03801 fail_stack_elt_t *failure_stack_ptr; 03802 #endif 03803 03804 /* We fill all the registers internally, independent of what we 03805 return, for use in backreferences. The number here includes 03806 an element for register zero. */ 03807 size_t num_regs = bufp->re_nsub + 1; 03808 03809 /* The currently active registers. */ 03810 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG; 03811 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG; 03812 03813 /* Information on the contents of registers. These are pointers into 03814 the input strings; they record just what was matched (on this 03815 attempt) by a subexpression part of the pattern, that is, the 03816 regnum-th regstart pointer points to where in the pattern we began 03817 matching and the regnum-th regend points to right after where we 03818 stopped matching the regnum-th subexpression. (The zeroth register 03819 keeps track of what the whole pattern matches.) */ 03820 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 03821 const char **regstart, **regend; 03822 #endif 03823 03824 /* If a group that's operated upon by a repetition operator fails to 03825 match anything, then the register for its start will need to be 03826 restored because it will have been set to wherever in the string we 03827 are when we last see its open-group operator. Similarly for a 03828 register's end. */ 03829 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 03830 const char **old_regstart, **old_regend; 03831 #endif 03832 03833 /* The is_active field of reg_info helps us keep track of which (possibly 03834 nested) subexpressions we are currently in. The matched_something 03835 field of reg_info[reg_num] helps us tell whether or not we have 03836 matched any of the pattern so far this time through the reg_num-th 03837 subexpression. These two fields get reset each time through any 03838 loop their register is in. */ 03839 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 03840 register_info_type *reg_info; 03841 #endif 03842 03843 /* The following record the register info as found in the above 03844 variables when we find a match better than any we've seen before. 03845 This happens as we backtrack through the failure points, which in 03846 turn happens only if we have not yet matched the entire string. */ 03847 unsigned best_regs_set = false; 03848 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 03849 const char **best_regstart, **best_regend; 03850 #endif 03851 03852 /* Logically, this is `best_regend[0]'. But we don't want to have to 03853 allocate space for that if we're not allocating space for anything 03854 else (see below). Also, we never need info about register 0 for 03855 any of the other register vectors, and it seems rather a kludge to 03856 treat `best_regend' differently than the rest. So we keep track of 03857 the end of the best match so far in a separate variable. We 03858 initialize this to NULL so that when we backtrack the first time 03859 and need to test it, it's not garbage. */ 03860 const char *match_end = NULL; 03861 03862 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ 03863 int set_regs_matched_done = 0; 03864 03865 /* Used when we pop values we don't care about. */ 03866 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 03867 const char **reg_dummy; 03868 register_info_type *reg_info_dummy; 03869 #endif 03870 03871 #ifdef DEBUG 03872 /* Counts the total number of registers pushed. */ 03873 unsigned num_regs_pushed = 0; 03874 #endif 03875 03876 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 03877 03878 INIT_FAIL_STACK (); 03879 03880 #ifdef MATCH_MAY_ALLOCATE 03881 /* Do not bother to initialize all the register variables if there are 03882 no groups in the pattern, as it takes a fair amount of time. If 03883 there are groups, we include space for register 0 (the whole 03884 pattern), even though we never use it, since it simplifies the 03885 array indexing. We should fix this. */ 03886 if (bufp->re_nsub) 03887 { 03888 regstart = REGEX_TALLOC (num_regs, const char *); 03889 regend = REGEX_TALLOC (num_regs, const char *); 03890 old_regstart = REGEX_TALLOC (num_regs, const char *); 03891 old_regend = REGEX_TALLOC (num_regs, const char *); 03892 best_regstart = REGEX_TALLOC (num_regs, const char *); 03893 best_regend = REGEX_TALLOC (num_regs, const char *); 03894 reg_info = REGEX_TALLOC (num_regs, register_info_type); 03895 reg_dummy = REGEX_TALLOC (num_regs, const char *); 03896 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); 03897 03898 if (!(regstart && regend && old_regstart && old_regend && reg_info 03899 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 03900 { 03901 FREE_VARIABLES (); 03902 return -2; 03903 } 03904 } 03905 else 03906 { 03907 /* We must initialize all our variables to NULL, so that 03908 `FREE_VARIABLES' doesn't try to free them. */ 03909 regstart = regend = old_regstart = old_regend = best_regstart 03910 = best_regend = reg_dummy = NULL; 03911 reg_info = reg_info_dummy = (register_info_type *) NULL; 03912 } 03913 #endif /* MATCH_MAY_ALLOCATE */ 03914 03915 /* The starting position is bogus. */ 03916 if (pos < 0 || pos > size1 + size2) 03917 { 03918 FREE_VARIABLES (); 03919 return -1; 03920 } 03921 03922 /* Initialize subexpression text positions to -1 to mark ones that no 03923 start_memory/stop_memory has been seen for. Also initialize the 03924 register information struct. */ 03925 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 03926 { 03927 regstart[mcnt] = regend[mcnt] 03928 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 03929 03930 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 03931 IS_ACTIVE (reg_info[mcnt]) = 0; 03932 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 03933 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 03934 } 03935 03936 /* We move `string1' into `string2' if the latter's empty -- but not if 03937 `string1' is null. */ 03938 if (size2 == 0 && string1 != NULL) 03939 { 03940 string2 = string1; 03941 size2 = size1; 03942 string1 = 0; 03943 size1 = 0; 03944 } 03945 end1 = string1 + size1; 03946 end2 = string2 + size2; 03947 03948 /* Compute where to stop matching, within the two strings. */ 03949 if (stop <= size1) 03950 { 03951 end_match_1 = string1 + stop; 03952 end_match_2 = string2; 03953 } 03954 else 03955 { 03956 end_match_1 = end1; 03957 end_match_2 = string2 + stop - size1; 03958 } 03959 03960 /* `p' scans through the pattern as `d' scans through the data. 03961 `dend' is the end of the input string that `d' points within. `d' 03962 is advanced into the following input string whenever necessary, but 03963 this happens before fetching; therefore, at the beginning of the 03964 loop, `d' can be pointing at the end of a string, but it cannot 03965 equal `string2'. */ 03966 if (size1 > 0 && pos <= size1) 03967 { 03968 d = string1 + pos; 03969 dend = end_match_1; 03970 } 03971 else 03972 { 03973 d = string2 + pos - size1; 03974 dend = end_match_2; 03975 } 03976 03977 DEBUG_PRINT1 ("The compiled pattern is:\n"); 03978 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 03979 DEBUG_PRINT1 ("The string to match is: `"); 03980 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 03981 DEBUG_PRINT1 ("'\n"); 03982 03983 /* This loops over pattern commands. It exits by returning from the 03984 function if the match is complete, or it drops through if the match 03985 fails at this starting point in the input data. */ 03986 for (;;) 03987 { 03988 #ifdef _LIBC 03989 DEBUG_PRINT2 ("\n%p: ", p); 03990 #else 03991 DEBUG_PRINT2 ("\n0x%x: ", p); 03992 #endif 03993 03994 if (p == pend) 03995 { /* End of pattern means we might have succeeded. */ 03996 DEBUG_PRINT1 ("end of pattern ... "); 03997 03998 /* If we haven't matched the entire string, and we want the 03999 longest match, try backtracking. */ 04000 if (d != end_match_2) 04001 { 04002 /* 1 if this match ends in the same string (string1 or string2) 04003 as the best previous match. */ 04004 boolean same_str_p = (FIRST_STRING_P (match_end) 04005 == MATCHING_IN_FIRST_STRING); 04006 /* 1 if this match is the best seen so far. */ 04007 boolean best_match_p; 04008 04009 /* AIX compiler got confused when this was combined 04010 with the previous declaration. */ 04011 if (same_str_p) 04012 best_match_p = d > match_end; 04013 else 04014 best_match_p = !MATCHING_IN_FIRST_STRING; 04015 04016 DEBUG_PRINT1 ("backtracking.\n"); 04017 04018 if (!FAIL_STACK_EMPTY ()) 04019 { /* More failure points to try. */ 04020 04021 /* If exceeds best match so far, save it. */ 04022 if (!best_regs_set || best_match_p) 04023 { 04024 best_regs_set = true; 04025 match_end = d; 04026 04027 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 04028 04029 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 04030 { 04031 best_regstart[mcnt] = regstart[mcnt]; 04032 best_regend[mcnt] = regend[mcnt]; 04033 } 04034 } 04035 goto fail; 04036 } 04037 04038 /* If no failure points, don't restore garbage. And if 04039 last match is real best match, don't restore second 04040 best one. */ 04041 else if (best_regs_set && !best_match_p) 04042 { 04043 restore_best_regs: 04044 /* Restore best match. It may happen that `dend == 04045 end_match_1' while the restored d is in string2. 04046 For example, the pattern `x.*y.*z' against the 04047 strings `x-' and `y-z-', if the two strings are 04048 not consecutive in memory. */ 04049 DEBUG_PRINT1 ("Restoring best registers.\n"); 04050 04051 d = match_end; 04052 dend = ((d >= string1 && d <= end1) 04053 ? end_match_1 : end_match_2); 04054 04055 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 04056 { 04057 regstart[mcnt] = best_regstart[mcnt]; 04058 regend[mcnt] = best_regend[mcnt]; 04059 } 04060 } 04061 } /* d != end_match_2 */ 04062 04063 succeed_label: 04064 DEBUG_PRINT1 ("Accepting match.\n"); 04065 04066 /* If caller wants register contents data back, do it. */ 04067 if (regs && !bufp->no_sub) 04068 { 04069 /* Have the register data arrays been allocated? */ 04070 if (bufp->regs_allocated == REGS_UNALLOCATED) 04071 { /* No. So allocate them with malloc. We need one 04072 extra element beyond `num_regs' for the `-1' marker 04073 GNU code uses. */ 04074 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 04075 regs->start = TALLOC (regs->num_regs, regoff_t); 04076 regs->end = TALLOC (regs->num_regs, regoff_t); 04077 if (regs->start == NULL || regs->end == NULL) 04078 { 04079 FREE_VARIABLES (); 04080 return -2; 04081 } 04082 bufp->regs_allocated = REGS_REALLOCATE; 04083 } 04084 else if (bufp->regs_allocated == REGS_REALLOCATE) 04085 { /* Yes. If we need more elements than were already 04086 allocated, reallocate them. If we need fewer, just 04087 leave it alone. */ 04088 if (regs->num_regs < num_regs + 1) 04089 { 04090 regs->num_regs = num_regs + 1; 04091 RETALLOC (regs->start, regs->num_regs, regoff_t); 04092 RETALLOC (regs->end, regs->num_regs, regoff_t); 04093 if (regs->start == NULL || regs->end == NULL) 04094 { 04095 FREE_VARIABLES (); 04096 return -2; 04097 } 04098 } 04099 } 04100 else 04101 { 04102 /* These braces fend off a "empty body in an else-statement" 04103 warning under GCC when assert expands to nothing. */ 04104 assert (bufp->regs_allocated == REGS_FIXED); 04105 } 04106 04107 /* Convert the pointer data in `regstart' and `regend' to 04108 indices. Register zero has to be set differently, 04109 since we haven't kept track of any info for it. */ 04110 if (regs->num_regs > 0) 04111 { 04112 regs->start[0] = pos; 04113 regs->end[0] = (MATCHING_IN_FIRST_STRING 04114 ? ((regoff_t) (d - string1)) 04115 : ((regoff_t) (d - string2 + size1))); 04116 } 04117 04118 /* Go through the first `min (num_regs, regs->num_regs)' 04119 registers, since that is all we initialized. */ 04120 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs); 04121 mcnt++) 04122 { 04123 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 04124 regs->start[mcnt] = regs->end[mcnt] = -1; 04125 else 04126 { 04127 regs->start[mcnt] 04128 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]); 04129 regs->end[mcnt] 04130 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]); 04131 } 04132 } 04133 04134 /* If the regs structure we return has more elements than 04135 were in the pattern, set the extra elements to -1. If 04136 we (re)allocated the registers, this is the case, 04137 because we always allocate enough to have at least one 04138 -1 at the end. */ 04139 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++) 04140 regs->start[mcnt] = regs->end[mcnt] = -1; 04141 } /* regs && !bufp->no_sub */ 04142 04143 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 04144 nfailure_points_pushed, nfailure_points_popped, 04145 nfailure_points_pushed - nfailure_points_popped); 04146 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 04147 04148 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 04149 ? string1 04150 : string2 - size1); 04151 04152 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 04153 04154 FREE_VARIABLES (); 04155 return mcnt; 04156 } 04157 04158 /* Otherwise match next pattern command. */ 04159 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 04160 { 04161 /* Ignore these. Used to ignore the n of succeed_n's which 04162 currently have n == 0. */ 04163 case no_op: 04164 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 04165 break; 04166 04167 case succeed: 04168 DEBUG_PRINT1 ("EXECUTING succeed.\n"); 04169 goto succeed_label; 04170 04171 /* Match the next n pattern characters exactly. The following 04172 byte in the pattern defines n, and the n bytes after that 04173 are the characters to match. */ 04174 case exactn: 04175 mcnt = *p++; 04176 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 04177 04178 /* This is written out as an if-else so we don't waste time 04179 testing `translate' inside the loop. */ 04180 if (translate) 04181 { 04182 do 04183 { 04184 PREFETCH (); 04185 if ((unsigned char) translate[(unsigned char) *d++] 04186 != (unsigned char) *p++) 04187 goto fail; 04188 } 04189 while (--mcnt); 04190 } 04191 else 04192 { 04193 do 04194 { 04195 PREFETCH (); 04196 if (*d++ != (char) *p++) goto fail; 04197 } 04198 while (--mcnt); 04199 } 04200 SET_REGS_MATCHED (); 04201 break; 04202 04203 04204 /* Match any character except possibly a newline or a null. */ 04205 case anychar: 04206 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 04207 04208 PREFETCH (); 04209 04210 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 04211 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 04212 goto fail; 04213 04214 SET_REGS_MATCHED (); 04215 DEBUG_PRINT2 (" Matched `%d'.\n", *d); 04216 d++; 04217 break; 04218 04219 04220 case charset: 04221 case charset_not: 04222 { 04223 register unsigned char c; 04224 boolean not = (re_opcode_t) *(p - 1) == charset_not; 04225 04226 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 04227 04228 PREFETCH (); 04229 c = TRANSLATE (*d); /* The character to match. */ 04230 04231 /* Cast to `unsigned' instead of `unsigned char' in case the 04232 bit list is a full 32 bytes long. */ 04233 if (c < (unsigned) (*p * BYTEWIDTH) 04234 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 04235 not = !not; 04236 04237 p += 1 + *p; 04238 04239 if (!not) goto fail; 04240 04241 SET_REGS_MATCHED (); 04242 d++; 04243 break; 04244 } 04245 04246 04247 /* The beginning of a group is represented by start_memory. 04248 The arguments are the register number in the next byte, and the 04249 number of groups inner to this one in the next. The text 04250 matched within the group is recorded (in the internal 04251 registers data structure) under the register number. */ 04252 case start_memory: 04253 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); 04254 04255 /* Find out if this group can match the empty string. */ 04256 p1 = p; /* To send to group_match_null_string_p. */ 04257 04258 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 04259 REG_MATCH_NULL_STRING_P (reg_info[*p]) 04260 = group_match_null_string_p (&p1, pend, reg_info); 04261 04262 /* Save the position in the string where we were the last time 04263 we were at this open-group operator in case the group is 04264 operated upon by a repetition operator, e.g., with `(a*)*b' 04265 against `ab'; then we want to ignore where we are now in 04266 the string in case this attempt to match fails. */ 04267 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 04268 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 04269 : regstart[*p]; 04270 DEBUG_PRINT2 (" old_regstart: %d\n", 04271 POINTER_TO_OFFSET (old_regstart[*p])); 04272 04273 regstart[*p] = d; 04274 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 04275 04276 IS_ACTIVE (reg_info[*p]) = 1; 04277 MATCHED_SOMETHING (reg_info[*p]) = 0; 04278 04279 /* Clear this whenever we change the register activity status. */ 04280 set_regs_matched_done = 0; 04281 04282 /* This is the new highest active register. */ 04283 highest_active_reg = *p; 04284 04285 /* If nothing was active before, this is the new lowest active 04286 register. */ 04287 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 04288 lowest_active_reg = *p; 04289 04290 /* Move past the register number and inner group count. */ 04291 p += 2; 04292 just_past_start_mem = p; 04293 04294 break; 04295 04296 04297 /* The stop_memory opcode represents the end of a group. Its 04298 arguments are the same as start_memory's: the register 04299 number, and the number of inner groups. */ 04300 case stop_memory: 04301 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); 04302 04303 /* We need to save the string position the last time we were at 04304 this close-group operator in case the group is operated 04305 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 04306 against `aba'; then we want to ignore where we are now in 04307 the string in case this attempt to match fails. */ 04308 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 04309 ? REG_UNSET (regend[*p]) ? d : regend[*p] 04310 : regend[*p]; 04311 DEBUG_PRINT2 (" old_regend: %d\n", 04312 POINTER_TO_OFFSET (old_regend[*p])); 04313 04314 regend[*p] = d; 04315 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 04316 04317 /* This register isn't active anymore. */ 04318 IS_ACTIVE (reg_info[*p]) = 0; 04319 04320 /* Clear this whenever we change the register activity status. */ 04321 set_regs_matched_done = 0; 04322 04323 /* If this was the only register active, nothing is active 04324 anymore. */ 04325 if (lowest_active_reg == highest_active_reg) 04326 { 04327 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 04328 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 04329 } 04330 else 04331 { /* We must scan for the new highest active register, since 04332 it isn't necessarily one less than now: consider 04333 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 04334 new highest active register is 1. */ 04335 unsigned char r = *p - 1; 04336 while (r > 0 && !IS_ACTIVE (reg_info[r])) 04337 r--; 04338 04339 /* If we end up at register zero, that means that we saved 04340 the registers as the result of an `on_failure_jump', not 04341 a `start_memory', and we jumped to past the innermost 04342 `stop_memory'. For example, in ((.)*) we save 04343 registers 1 and 2 as a result of the *, but when we pop 04344 back to the second ), we are at the stop_memory 1. 04345 Thus, nothing is active. */ 04346 if (r == 0) 04347 { 04348 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 04349 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 04350 } 04351 else 04352 highest_active_reg = r; 04353 } 04354 04355 /* If just failed to match something this time around with a 04356 group that's operated on by a repetition operator, try to 04357 force exit from the ``loop'', and restore the register 04358 information for this group that we had before trying this 04359 last match. */ 04360 if ((!MATCHED_SOMETHING (reg_info[*p]) 04361 || just_past_start_mem == p - 1) 04362 && (p + 2) < pend) 04363 { 04364 boolean is_a_jump_n = false; 04365 04366 p1 = p + 2; 04367 mcnt = 0; 04368 switch ((re_opcode_t) *p1++) 04369 { 04370 case jump_n: 04371 is_a_jump_n = true; 04372 case pop_failure_jump: 04373 case maybe_pop_jump: 04374 case jump: 04375 case dummy_failure_jump: 04376 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 04377 if (is_a_jump_n) 04378 p1 += 2; 04379 break; 04380 04381 default: 04382 /* do nothing */ ; 04383 } 04384 p1 += mcnt; 04385 04386 /* If the next operation is a jump backwards in the pattern 04387 to an on_failure_jump right before the start_memory 04388 corresponding to this stop_memory, exit from the loop 04389 by forcing a failure after pushing on the stack the 04390 on_failure_jump's jump in the pattern, and d. */ 04391 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 04392 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) 04393 { 04394 /* If this group ever matched anything, then restore 04395 what its registers were before trying this last 04396 failed match, e.g., with `(a*)*b' against `ab' for 04397 regstart[1], and, e.g., with `((a*)*(b*)*)*' 04398 against `aba' for regend[3]. 04399 04400 Also restore the registers for inner groups for, 04401 e.g., `((a*)(b*))*' against `aba' (register 3 would 04402 otherwise get trashed). */ 04403 04404 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 04405 { 04406 unsigned r; 04407 04408 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 04409 04410 /* Restore this and inner groups' (if any) registers. */ 04411 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1); 04412 r++) 04413 { 04414 regstart[r] = old_regstart[r]; 04415 04416 /* xx why this test? */ 04417 if (old_regend[r] >= regstart[r]) 04418 regend[r] = old_regend[r]; 04419 } 04420 } 04421 p1++; 04422 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 04423 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 04424 04425 goto fail; 04426 } 04427 } 04428 04429 /* Move past the register number and the inner group count. */ 04430 p += 2; 04431 break; 04432 04433 04434 /* <digit> has been turned into a `duplicate' command which is 04435 followed by the numeric value of <digit> as the register number. */ 04436 case duplicate: 04437 { 04438 register const char *d2, *dend2; 04439 int regno = *p++; /* Get which register to match against. */ 04440 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 04441 04442 /* Can't back reference a group which we've never matched. */ 04443 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 04444 goto fail; 04445 04446 /* Where in input to try to start matching. */ 04447 d2 = regstart[regno]; 04448 04449 /* Where to stop matching; if both the place to start and 04450 the place to stop matching are in the same string, then 04451 set to the place to stop, otherwise, for now have to use 04452 the end of the first string. */ 04453 04454 dend2 = ((FIRST_STRING_P (regstart[regno]) 04455 == FIRST_STRING_P (regend[regno])) 04456 ? regend[regno] : end_match_1); 04457 for (;;) 04458 { 04459 /* If necessary, advance to next segment in register 04460 contents. */ 04461 while (d2 == dend2) 04462 { 04463 if (dend2 == end_match_2) break; 04464 if (dend2 == regend[regno]) break; 04465 04466 /* End of string1 => advance to string2. */ 04467 d2 = string2; 04468 dend2 = regend[regno]; 04469 } 04470 /* At end of register contents => success */ 04471 if (d2 == dend2) break; 04472 04473 /* If necessary, advance to next segment in data. */ 04474 PREFETCH (); 04475 04476 /* How many characters left in this segment to match. */ 04477 mcnt = dend - d; 04478 04479 /* Want how many consecutive characters we can match in 04480 one shot, so, if necessary, adjust the count. */ 04481 if (mcnt > dend2 - d2) 04482 mcnt = dend2 - d2; 04483 04484 /* Compare that many; failure if mismatch, else move 04485 past them. */ 04486 if (translate 04487 ? bcmp_translate (d, d2, mcnt, translate) 04488 : bcmp (d, d2, mcnt)) 04489 goto fail; 04490 d += mcnt, d2 += mcnt; 04491 04492 /* Do this because we've match some characters. */ 04493 SET_REGS_MATCHED (); 04494 } 04495 } 04496 break; 04497 04498 04499 /* begline matches the empty string at the beginning of the string 04500 (unless `not_bol' is set in `bufp'), and, if 04501 `newline_anchor' is set, after newlines. */ 04502 case begline: 04503 DEBUG_PRINT1 ("EXECUTING begline.\n"); 04504 04505 if (AT_STRINGS_BEG (d)) 04506 { 04507 if (!bufp->not_bol) break; 04508 } 04509 else if (d[-1] == '\n' && bufp->newline_anchor) 04510 { 04511 break; 04512 } 04513 /* In all other cases, we fail. */ 04514 goto fail; 04515 04516 04517 /* endline is the dual of begline. */ 04518 case endline: 04519 DEBUG_PRINT1 ("EXECUTING endline.\n"); 04520 04521 if (AT_STRINGS_END (d)) 04522 { 04523 if (!bufp->not_eol) break; 04524 } 04525 04526 /* We have to ``prefetch'' the next character. */ 04527 else if ((d == end1 ? *string2 : *d) == '\n' 04528 && bufp->newline_anchor) 04529 { 04530 break; 04531 } 04532 goto fail; 04533 04534 04535 /* Match at the very beginning of the data. */ 04536 case begbuf: 04537 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 04538 if (AT_STRINGS_BEG (d)) 04539 break; 04540 goto fail; 04541 04542 04543 /* Match at the very end of the data. */ 04544 case endbuf: 04545 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 04546 if (AT_STRINGS_END (d)) 04547 break; 04548 goto fail; 04549 04550 04551 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 04552 pushes NULL as the value for the string on the stack. Then 04553 `pop_failure_point' will keep the current value for the 04554 string, instead of restoring it. To see why, consider 04555 matching `foo\nbar' against `.*\n'. The .* matches the foo; 04556 then the . fails against the \n. But the next thing we want 04557 to do is match the \n against the \n; if we restored the 04558 string value, we would be back at the foo. 04559 04560 Because this is used only in specific cases, we don't need to 04561 check all the things that `on_failure_jump' does, to make 04562 sure the right things get saved on the stack. Hence we don't 04563 share its code. The only reason to push anything on the 04564 stack at all is that otherwise we would have to change 04565 `anychar's code to do something besides goto fail in this 04566 case; that seems worse than this. */ 04567 case on_failure_keep_string_jump: 04568 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 04569 04570 EXTRACT_NUMBER_AND_INCR (mcnt, p); 04571 #ifdef _LIBC 04572 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt); 04573 #else 04574 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 04575 #endif 04576 04577 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 04578 break; 04579 04580 04581 /* Uses of on_failure_jump: 04582 04583 Each alternative starts with an on_failure_jump that points 04584 to the beginning of the next alternative. Each alternative 04585 except the last ends with a jump that in effect jumps past 04586 the rest of the alternatives. (They really jump to the 04587 ending jump of the following alternative, because tensioning 04588 these jumps is a hassle.) 04589 04590 Repeats start with an on_failure_jump that points past both 04591 the repetition text and either the following jump or 04592 pop_failure_jump back to this on_failure_jump. */ 04593 case on_failure_jump: 04594 on_failure: 04595 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 04596 04597 EXTRACT_NUMBER_AND_INCR (mcnt, p); 04598 #ifdef _LIBC 04599 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt); 04600 #else 04601 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 04602 #endif 04603 04604 /* If this on_failure_jump comes right before a group (i.e., 04605 the original * applied to a group), save the information 04606 for that group and all inner ones, so that if we fail back 04607 to this point, the group's information will be correct. 04608 For example, in \(a*\)*\1, we need the preceding group, 04609 and in \(zz\(a*\)b*\)\2, we need the inner group. */ 04610 04611 /* We can't use `p' to check ahead because we push 04612 a failure point to `p + mcnt' after we do this. */ 04613 p1 = p; 04614 04615 /* We need to skip no_op's before we look for the 04616 start_memory in case this on_failure_jump is happening as 04617 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 04618 against aba. */ 04619 while (p1 < pend && (re_opcode_t) *p1 == no_op) 04620 p1++; 04621 04622 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 04623 { 04624 /* We have a new highest active register now. This will 04625 get reset at the start_memory we are about to get to, 04626 but we will have saved all the registers relevant to 04627 this repetition op, as described above. */ 04628 highest_active_reg = *(p1 + 1) + *(p1 + 2); 04629 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 04630 lowest_active_reg = *(p1 + 1); 04631 } 04632 04633 DEBUG_PRINT1 (":\n"); 04634 PUSH_FAILURE_POINT (p + mcnt, d, -2); 04635 break; 04636 04637 04638 /* A smart repeat ends with `maybe_pop_jump'. 04639 We change it to either `pop_failure_jump' or `jump'. */ 04640 case maybe_pop_jump: 04641 EXTRACT_NUMBER_AND_INCR (mcnt, p); 04642 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 04643 { 04644 register unsigned char *p2 = p; 04645 04646 /* Compare the beginning of the repeat with what in the 04647 pattern follows its end. If we can establish that there 04648 is nothing that they would both match, i.e., that we 04649 would have to backtrack because of (as in, e.g., `a*a') 04650 then we can change to pop_failure_jump, because we'll 04651 never have to backtrack. 04652 04653 This is not true in the case of alternatives: in 04654 `(a|ab)*' we do need to backtrack to the `ab' alternative 04655 (e.g., if the string was `ab'). But instead of trying to 04656 detect that here, the alternative has put on a dummy 04657 failure point which is what we will end up popping. */ 04658 04659 /* Skip over open/close-group commands. 04660 If what follows this loop is a ...+ construct, 04661 look at what begins its body, since we will have to 04662 match at least one of that. */ 04663 while (1) 04664 { 04665 if (p2 + 2 < pend 04666 && ((re_opcode_t) *p2 == stop_memory 04667 || (re_opcode_t) *p2 == start_memory)) 04668 p2 += 3; 04669 else if (p2 + 6 < pend 04670 && (re_opcode_t) *p2 == dummy_failure_jump) 04671 p2 += 6; 04672 else 04673 break; 04674 } 04675 04676 p1 = p + mcnt; 04677 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 04678 to the `maybe_finalize_jump' of this case. Examine what 04679 follows. */ 04680 04681 /* If we're at the end of the pattern, we can change. */ 04682 if (p2 == pend) 04683 { 04684 /* Consider what happens when matching ":\(.*\)" 04685 against ":/". I don't really understand this code 04686 yet. */ 04687 p[-3] = (unsigned char) pop_failure_jump; 04688 DEBUG_PRINT1 04689 (" End of pattern: change to `pop_failure_jump'.\n"); 04690 } 04691 04692 else if ((re_opcode_t) *p2 == exactn 04693 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 04694 { 04695 register unsigned char c 04696 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 04697 04698 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) 04699 { 04700 p[-3] = (unsigned char) pop_failure_jump; 04701 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 04702 c, p1[5]); 04703 } 04704 04705 else if ((re_opcode_t) p1[3] == charset 04706 || (re_opcode_t) p1[3] == charset_not) 04707 { 04708 int not = (re_opcode_t) p1[3] == charset_not; 04709 04710 if (c < (unsigned char) (p1[4] * BYTEWIDTH) 04711 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 04712 not = !not; 04713 04714 /* `not' is equal to 1 if c would match, which means 04715 that we can't change to pop_failure_jump. */ 04716 if (!not) 04717 { 04718 p[-3] = (unsigned char) pop_failure_jump; 04719 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 04720 } 04721 } 04722 } 04723 else if ((re_opcode_t) *p2 == charset) 04724 { 04725 #ifdef DEBUG 04726 register unsigned char c 04727 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 04728 #endif 04729 04730 #if 0 04731 if ((re_opcode_t) p1[3] == exactn 04732 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] 04733 && (p2[2 + p1[5] / BYTEWIDTH] 04734 & (1 << (p1[5] % BYTEWIDTH))))) 04735 #else 04736 if ((re_opcode_t) p1[3] == exactn 04737 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4] 04738 && (p2[2 + p1[4] / BYTEWIDTH] 04739 & (1 << (p1[4] % BYTEWIDTH))))) 04740 #endif 04741 { 04742 p[-3] = (unsigned char) pop_failure_jump; 04743 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 04744 c, p1[5]); 04745 } 04746 04747 else if ((re_opcode_t) p1[3] == charset_not) 04748 { 04749 int idx; 04750 /* We win if the charset_not inside the loop 04751 lists every character listed in the charset after. */ 04752 for (idx = 0; idx < (int) p2[1]; idx++) 04753 if (! (p2[2 + idx] == 0 04754 || (idx < (int) p1[4] 04755 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) 04756 break; 04757 04758 if (idx == p2[1]) 04759 { 04760 p[-3] = (unsigned char) pop_failure_jump; 04761 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 04762 } 04763 } 04764 else if ((re_opcode_t) p1[3] == charset) 04765 { 04766 int idx; 04767 /* We win if the charset inside the loop 04768 has no overlap with the one after the loop. */ 04769 for (idx = 0; 04770 idx < (int) p2[1] && idx < (int) p1[4]; 04771 idx++) 04772 if ((p2[2 + idx] & p1[5 + idx]) != 0) 04773 break; 04774 04775 if (idx == p2[1] || idx == p1[4]) 04776 { 04777 p[-3] = (unsigned char) pop_failure_jump; 04778 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 04779 } 04780 } 04781 } 04782 } 04783 p -= 2; /* Point at relative address again. */ 04784 if ((re_opcode_t) p[-1] != pop_failure_jump) 04785 { 04786 p[-1] = (unsigned char) jump; 04787 DEBUG_PRINT1 (" Match => jump.\n"); 04788 goto unconditional_jump; 04789 } 04790 /* Note fall through. */ 04791 04792 04793 /* The end of a simple repeat has a pop_failure_jump back to 04794 its matching on_failure_jump, where the latter will push a 04795 failure point. The pop_failure_jump takes off failure 04796 points put on by this pop_failure_jump's matching 04797 on_failure_jump; we got through the pattern to here from the 04798 matching on_failure_jump, so didn't fail. */ 04799 case pop_failure_jump: 04800 { 04801 /* We need to pass separate storage for the lowest and 04802 highest registers, even though we don't care about the 04803 actual values. Otherwise, we will restore only one 04804 register from the stack, since lowest will == highest in 04805 `pop_failure_point'. */ 04806 active_reg_t dummy_low_reg, dummy_high_reg; 04807 unsigned char *pdummy; 04808 const char *sdummy; 04809 04810 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 04811 POP_FAILURE_POINT (sdummy, pdummy, 04812 dummy_low_reg, dummy_high_reg, 04813 reg_dummy, reg_dummy, reg_info_dummy); 04814 } 04815 /* Note fall through. */ 04816 04817 unconditional_jump: 04818 #ifdef _LIBC 04819 DEBUG_PRINT2 ("\n%p: ", p); 04820 #else 04821 DEBUG_PRINT2 ("\n0x%x: ", p); 04822 #endif 04823 /* Note fall through. */ 04824 04825 /* Unconditionally jump (without popping any failure points). */ 04826 case jump: 04827 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 04828 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 04829 p += mcnt; /* Do the jump. */ 04830 #ifdef _LIBC 04831 DEBUG_PRINT2 ("(to %p).\n", p); 04832 #else 04833 DEBUG_PRINT2 ("(to 0x%x).\n", p); 04834 #endif 04835 break; 04836 04837 04838 /* We need this opcode so we can detect where alternatives end 04839 in `group_match_null_string_p' et al. */ 04840 case jump_past_alt: 04841 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 04842 goto unconditional_jump; 04843 04844 04845 /* Normally, the on_failure_jump pushes a failure point, which 04846 then gets popped at pop_failure_jump. We will end up at 04847 pop_failure_jump, also, and with a pattern of, say, `a+', we 04848 are skipping over the on_failure_jump, so we have to push 04849 something meaningless for pop_failure_jump to pop. */ 04850 case dummy_failure_jump: 04851 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 04852 /* It doesn't matter what we push for the string here. What 04853 the code at `fail' tests is the value for the pattern. */ 04854 PUSH_FAILURE_POINT (0, 0, -2); 04855 goto unconditional_jump; 04856 04857 04858 /* At the end of an alternative, we need to push a dummy failure 04859 point in case we are followed by a `pop_failure_jump', because 04860 we don't want the failure point for the alternative to be 04861 popped. For example, matching `(a|ab)*' against `aab' 04862 requires that we match the `ab' alternative. */ 04863 case push_dummy_failure: 04864 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 04865 /* See comments just above at `dummy_failure_jump' about the 04866 two zeroes. */ 04867 PUSH_FAILURE_POINT (0, 0, -2); 04868 break; 04869 04870 /* Have to succeed matching what follows at least n times. 04871 After that, handle like `on_failure_jump'. */ 04872 case succeed_n: 04873 EXTRACT_NUMBER (mcnt, p + 2); 04874 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 04875 04876 assert (mcnt >= 0); 04877 /* Originally, this is how many times we HAVE to succeed. */ 04878 if (mcnt > 0) 04879 { 04880 mcnt--; 04881 p += 2; 04882 STORE_NUMBER_AND_INCR (p, mcnt); 04883 #ifdef _LIBC 04884 DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt); 04885 #else 04886 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt); 04887 #endif 04888 } 04889 else if (mcnt == 0) 04890 { 04891 #ifdef _LIBC 04892 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2); 04893 #else 04894 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); 04895 #endif 04896 p[2] = (unsigned char) no_op; 04897 p[3] = (unsigned char) no_op; 04898 goto on_failure; 04899 } 04900 break; 04901 04902 case jump_n: 04903 EXTRACT_NUMBER (mcnt, p + 2); 04904 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 04905 04906 /* Originally, this is how many times we CAN jump. */ 04907 if (mcnt) 04908 { 04909 mcnt--; 04910 STORE_NUMBER (p + 2, mcnt); 04911 #ifdef _LIBC 04912 DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt); 04913 #else 04914 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt); 04915 #endif 04916 goto unconditional_jump; 04917 } 04918 /* If don't have to jump any more, skip over the rest of command. */ 04919 else 04920 p += 4; 04921 break; 04922 04923 case set_number_at: 04924 { 04925 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 04926 04927 EXTRACT_NUMBER_AND_INCR (mcnt, p); 04928 p1 = p + mcnt; 04929 EXTRACT_NUMBER_AND_INCR (mcnt, p); 04930 #ifdef _LIBC 04931 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt); 04932 #else 04933 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 04934 #endif 04935 STORE_NUMBER (p1, mcnt); 04936 break; 04937 } 04938 04939 #if 0 04940 /* The DEC Alpha C compiler 3.x generates incorrect code for the 04941 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of 04942 AT_WORD_BOUNDARY, so this code is disabled. Expanding the 04943 macro and introducing temporary variables works around the bug. */ 04944 04945 case wordbound: 04946 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 04947 if (AT_WORD_BOUNDARY (d)) 04948 break; 04949 goto fail; 04950 04951 case notwordbound: 04952 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 04953 if (AT_WORD_BOUNDARY (d)) 04954 goto fail; 04955 break; 04956 #else 04957 case wordbound: 04958 { 04959 boolean prevchar, thischar; 04960 04961 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 04962 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 04963 break; 04964 04965 prevchar = WORDCHAR_P (d - 1); 04966 thischar = WORDCHAR_P (d); 04967 if (prevchar != thischar) 04968 break; 04969 goto fail; 04970 } 04971 04972 case notwordbound: 04973 { 04974 boolean prevchar, thischar; 04975 04976 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 04977 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 04978 goto fail; 04979 04980 prevchar = WORDCHAR_P (d - 1); 04981 thischar = WORDCHAR_P (d); 04982 if (prevchar != thischar) 04983 goto fail; 04984 break; 04985 } 04986 #endif 04987 04988 case wordbeg: 04989 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 04990 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 04991 break; 04992 goto fail; 04993 04994 case wordend: 04995 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 04996 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 04997 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) 04998 break; 04999 goto fail; 05000 05001 #ifdef emacs 05002 case before_dot: 05003 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 05004 if (PTR_CHAR_POS ((unsigned char *) d) >= point) 05005 goto fail; 05006 break; 05007 05008 case at_dot: 05009 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 05010 if (PTR_CHAR_POS ((unsigned char *) d) != point) 05011 goto fail; 05012 break; 05013 05014 case after_dot: 05015 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 05016 if (PTR_CHAR_POS ((unsigned char *) d) <= point) 05017 goto fail; 05018 break; 05019 05020 case syntaxspec: 05021 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 05022 mcnt = *p++; 05023 goto matchsyntax; 05024 05025 case wordchar: 05026 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); 05027 mcnt = (int) Sword; 05028 matchsyntax: 05029 PREFETCH (); 05030 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 05031 d++; 05032 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt) 05033 goto fail; 05034 SET_REGS_MATCHED (); 05035 break; 05036 05037 case notsyntaxspec: 05038 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 05039 mcnt = *p++; 05040 goto matchnotsyntax; 05041 05042 case notwordchar: 05043 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 05044 mcnt = (int) Sword; 05045 matchnotsyntax: 05046 PREFETCH (); 05047 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 05048 d++; 05049 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt) 05050 goto fail; 05051 SET_REGS_MATCHED (); 05052 break; 05053 05054 #else /* not emacs */ 05055 case wordchar: 05056 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 05057 PREFETCH (); 05058 if (!WORDCHAR_P (d)) 05059 goto fail; 05060 SET_REGS_MATCHED (); 05061 d++; 05062 break; 05063 05064 case notwordchar: 05065 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 05066 PREFETCH (); 05067 if (WORDCHAR_P (d)) 05068 goto fail; 05069 SET_REGS_MATCHED (); 05070 d++; 05071 break; 05072 #endif /* not emacs */ 05073 05074 default: 05075 abort (); 05076 } 05077 continue; /* Successfully executed one pattern command; keep going. */ 05078 05079 05080 /* We goto here if a matching operation fails. */ 05081 fail: 05082 if (!FAIL_STACK_EMPTY ()) 05083 { /* A restart point is known. Restore to that state. */ 05084 DEBUG_PRINT1 ("\nFAIL:\n"); 05085 POP_FAILURE_POINT (d, p, 05086 lowest_active_reg, highest_active_reg, 05087 regstart, regend, reg_info); 05088 05089 /* If this failure point is a dummy, try the next one. */ 05090 if (!p) 05091 goto fail; 05092 05093 /* If we failed to the end of the pattern, don't examine *p. */ 05094 assert (p <= pend); 05095 if (p < pend) 05096 { 05097 boolean is_a_jump_n = false; 05098 05099 /* If failed to a backwards jump that's part of a repetition 05100 loop, need to pop this failure point and use the next one. */ 05101 switch ((re_opcode_t) *p) 05102 { 05103 case jump_n: 05104 is_a_jump_n = true; 05105 case maybe_pop_jump: 05106 case pop_failure_jump: 05107 case jump: 05108 p1 = p + 1; 05109 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 05110 p1 += mcnt; 05111 05112 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) 05113 || (!is_a_jump_n 05114 && (re_opcode_t) *p1 == on_failure_jump)) 05115 goto fail; 05116 break; 05117 default: 05118 /* do nothing */ ; 05119 } 05120 } 05121 05122 if (d >= string1 && d <= end1) 05123 dend = end_match_1; 05124 } 05125 else 05126 break; /* Matching at this starting point really fails. */ 05127 } /* for (;;) */ 05128 05129 if (best_regs_set) 05130 goto restore_best_regs; 05131 05132 FREE_VARIABLES (); 05133 05134 return -1; /* Failure to match. */ 05135 } /* re_match_2 */ 05136 05137 /* Subroutine definitions for re_match_2. */ 05138 05139 05140 /* We are passed P pointing to a register number after a start_memory. 05141 05142 Return true if the pattern up to the corresponding stop_memory can 05143 match the empty string, and false otherwise. 05144 05145 If we find the matching stop_memory, sets P to point to one past its number. 05146 Otherwise, sets P to an undefined byte less than or equal to END. 05147 05148 We don't handle duplicates properly (yet). */ 05149 05150 static boolean 05151 group_match_null_string_p(unsigned char **p, 05152 unsigned char *end, 05153 register_info_type *reg_info) 05154 { 05155 int mcnt; 05156 /* Point to after the args to the start_memory. */ 05157 unsigned char *p1 = *p + 2; 05158 05159 while (p1 < end) 05160 { 05161 /* Skip over opcodes that can match nothing, and return true or 05162 false, as appropriate, when we get to one that can't, or to the 05163 matching stop_memory. */ 05164 05165 switch ((re_opcode_t) *p1) 05166 { 05167 /* Could be either a loop or a series of alternatives. */ 05168 case on_failure_jump: 05169 p1++; 05170 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 05171 05172 /* If the next operation is not a jump backwards in the 05173 pattern. */ 05174 05175 if (mcnt >= 0) 05176 { 05177 /* Go through the on_failure_jumps of the alternatives, 05178 seeing if any of the alternatives cannot match nothing. 05179 The last alternative starts with only a jump, 05180 whereas the rest start with on_failure_jump and end 05181 with a jump, e.g., here is the pattern for `a|b|c': 05182 05183 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 05184 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 05185 /exactn/1/c 05186 05187 So, we have to first go through the first (n-1) 05188 alternatives and then deal with the last one separately. */ 05189 05190 05191 /* Deal with the first (n-1) alternatives, which start 05192 with an on_failure_jump (see above) that jumps to right 05193 past a jump_past_alt. */ 05194 05195 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) 05196 { 05197 /* `mcnt' holds how many bytes long the alternative 05198 is, including the ending `jump_past_alt' and 05199 its number. */ 05200 05201 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, 05202 reg_info)) 05203 return false; 05204 05205 /* Move to right after this alternative, including the 05206 jump_past_alt. */ 05207 p1 += mcnt; 05208 05209 /* Break if it's the beginning of an n-th alternative 05210 that doesn't begin with an on_failure_jump. */ 05211 if ((re_opcode_t) *p1 != on_failure_jump) 05212 break; 05213 05214 /* Still have to check that it's not an n-th 05215 alternative that starts with an on_failure_jump. */ 05216 p1++; 05217 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 05218 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) 05219 { 05220 /* Get to the beginning of the n-th alternative. */ 05221 p1 -= 3; 05222 break; 05223 } 05224 } 05225 05226 /* Deal with the last alternative: go back and get number 05227 of the `jump_past_alt' just before it. `mcnt' contains 05228 the length of the alternative. */ 05229 EXTRACT_NUMBER (mcnt, p1 - 2); 05230 05231 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) 05232 return false; 05233 05234 p1 += mcnt; /* Get past the n-th alternative. */ 05235 } /* if mcnt > 0 */ 05236 break; 05237 05238 05239 case stop_memory: 05240 assert (p1[1] == **p); 05241 *p = p1 + 2; 05242 return true; 05243 05244 05245 default: 05246 if (!common_op_match_null_string_p (&p1, end, reg_info)) 05247 return false; 05248 } 05249 } /* while p1 < end */ 05250 05251 return false; 05252 } /* group_match_null_string_p */ 05253 05254 05255 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: 05256 It expects P to be the first byte of a single alternative and END one 05257 byte past the last. The alternative can contain groups. */ 05258 05259 static boolean 05260 alt_match_null_string_p(unsigned char *p, 05261 unsigned char *end, 05262 register_info_type *reg_info) 05263 { 05264 int mcnt; 05265 unsigned char *p1 = p; 05266 05267 while (p1 < end) 05268 { 05269 /* Skip over opcodes that can match nothing, and break when we get 05270 to one that can't. */ 05271 05272 switch ((re_opcode_t) *p1) 05273 { 05274 /* It's a loop. */ 05275 case on_failure_jump: 05276 p1++; 05277 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 05278 p1 += mcnt; 05279 break; 05280 05281 default: 05282 if (!common_op_match_null_string_p (&p1, end, reg_info)) 05283 return false; 05284 } 05285 } /* while p1 < end */ 05286 05287 return true; 05288 } /* alt_match_null_string_p */ 05289 05290 05291 /* Deals with the ops common to group_match_null_string_p and 05292 alt_match_null_string_p. 05293 05294 Sets P to one after the op and its arguments, if any. */ 05295 05296 static boolean 05297 common_op_match_null_string_p(unsigned char **p, 05298 unsigned char *end, 05299 register_info_type *reg_info) 05300 { 05301 int mcnt; 05302 boolean ret; 05303 int reg_no; 05304 unsigned char *p1 = *p; 05305 05306 switch ((re_opcode_t) *p1++) 05307 { 05308 case no_op: 05309 case begline: 05310 case endline: 05311 case begbuf: 05312 case endbuf: 05313 case wordbeg: 05314 case wordend: 05315 case wordbound: 05316 case notwordbound: 05317 #ifdef emacs 05318 case before_dot: 05319 case at_dot: 05320 case after_dot: 05321 #endif 05322 break; 05323 05324 case start_memory: 05325 reg_no = *p1; 05326 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 05327 ret = group_match_null_string_p (&p1, end, reg_info); 05328 05329 /* Have to set this here in case we're checking a group which 05330 contains a group and a back reference to it. */ 05331 05332 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 05333 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 05334 05335 if (!ret) 05336 return false; 05337 break; 05338 05339 /* If this is an optimized succeed_n for zero times, make the jump. */ 05340 case jump: 05341 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 05342 if (mcnt >= 0) 05343 p1 += mcnt; 05344 else 05345 return false; 05346 break; 05347 05348 case succeed_n: 05349 /* Get to the number of times to succeed. */ 05350 p1 += 2; 05351 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 05352 05353 if (mcnt == 0) 05354 { 05355 p1 -= 4; 05356 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 05357 p1 += mcnt; 05358 } 05359 else 05360 return false; 05361 break; 05362 05363 case duplicate: 05364 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 05365 return false; 05366 break; 05367 05368 case set_number_at: 05369 p1 += 4; 05370 05371 default: 05372 /* All other opcodes mean we cannot match the empty string. */ 05373 return false; 05374 } 05375 05376 *p = p1; 05377 return true; 05378 } /* common_op_match_null_string_p */ 05379 05380 05381 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 05382 bytes; nonzero otherwise. */ 05383 05384 static int 05385 bcmp_translate(const char *s1, 05386 const char *s2, 05387 register int len, 05388 RE_TRANSLATE_TYPE translate) 05389 { 05390 register const unsigned char *p1 = (const unsigned char *) s1; 05391 register const unsigned char *p2 = (const unsigned char *) s2; 05392 while (len) 05393 { 05394 if (translate[*p1++] != translate[*p2++]) return 1; 05395 len--; 05396 } 05397 return 0; 05398 } 05399 05400 /* Entry points for GNU code. */ 05401 05402 /* re_compile_pattern is the GNU regular expression compiler: it 05403 compiles PATTERN (of length SIZE) and puts the result in BUFP. 05404 Returns 0 if the pattern was valid, otherwise an error string. 05405 05406 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 05407 are set in BUFP on entry. 05408 05409 We call regex_compile to do the actual compilation. */ 05410 05411 const char * 05412 re_compile_pattern(const char *pattern, 05413 size_t length, 05414 struct re_pattern_buffer *bufp) 05415 { 05416 reg_errcode_t ret; 05417 05418 /* GNU code is written to assume at least RE_NREGS registers will be set 05419 (and at least one extra will be -1). */ 05420 bufp->regs_allocated = REGS_UNALLOCATED; 05421 05422 /* And GNU code determines whether or not to get register information 05423 by passing null for the REGS argument to re_match, etc., not by 05424 setting no_sub. */ 05425 bufp->no_sub = 0; 05426 05427 /* Match anchors at newline. */ 05428 bufp->newline_anchor = 1; 05429 05430 ret = regex_compile (pattern, length, re_syntax_options, bufp); 05431 05432 if (!ret) 05433 return NULL; 05434 return gettext (re_error_msgid[(int) ret]); 05435 } 05436 05437 /* Entry points compatible with 4.2 BSD regex library. We don't define 05438 them unless specifically requested. */ 05439 05440 #if defined (_REGEX_RE_COMP) || defined (_LIBC) 05441 05442 /* BSD has one and only one pattern buffer. */ 05443 static struct re_pattern_buffer re_comp_buf; 05444 05445 char * 05446 #ifdef _LIBC 05447 /* Make these definitions weak in libc, so POSIX programs can redefine 05448 these names if they don't use our functions, and still use 05449 regcomp/regexec below without link errors. */ 05450 weak_function 05451 #endif 05452 re_comp (s) 05453 const char *s; 05454 { 05455 reg_errcode_t ret; 05456 05457 if (!s) 05458 { 05459 if (!re_comp_buf.buffer) 05460 return gettext ("No previous regular expression"); 05461 return 0; 05462 } 05463 05464 if (!re_comp_buf.buffer) 05465 { 05466 re_comp_buf.buffer = (unsigned char *) malloc (200); 05467 if (re_comp_buf.buffer == NULL) 05468 return gettext (re_error_msgid[(int) REG_ESPACE]); 05469 re_comp_buf.allocated = 200; 05470 05471 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); 05472 if (re_comp_buf.fastmap == NULL) 05473 return gettext (re_error_msgid[(int) REG_ESPACE]); 05474 } 05475 05476 /* Since `re_exec' always passes NULL for the `regs' argument, we 05477 don't need to initialize the pattern buffer fields which affect it. */ 05478 05479 /* Match anchors at newlines. */ 05480 re_comp_buf.newline_anchor = 1; 05481 05482 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 05483 05484 if (!ret) 05485 return NULL; 05486 05487 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 05488 return (char *) gettext (re_error_msgid[(int) ret]); 05489 } 05490 05491 05492 int 05493 #ifdef _LIBC 05494 weak_function 05495 #endif 05496 re_exec (s) 05497 const char *s; 05498 { 05499 const int len = strlen (s); 05500 return 05501 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); 05502 } 05503 05504 #endif /* _REGEX_RE_COMP */ 05505 05506 /* POSIX.2 functions. Don't define these for Emacs. */ 05507 05508 #ifndef emacs 05509 05510 /* regcomp takes a regular expression as a string and compiles it. 05511 05512 PREG is a regex_t *. We do not expect any fields to be initialized, 05513 since POSIX says we shouldn't. Thus, we set 05514 05515 `buffer' to the compiled pattern; 05516 `used' to the length of the compiled pattern; 05517 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 05518 REG_EXTENDED bit in CFLAGS is set; otherwise, to 05519 RE_SYNTAX_POSIX_BASIC; 05520 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 05521 `fastmap' and `fastmap_accurate' to zero; 05522 `re_nsub' to the number of subexpressions in PATTERN. 05523 05524 PATTERN is the address of the pattern string. 05525 05526 CFLAGS is a series of bits which affect compilation. 05527 05528 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 05529 use POSIX basic syntax. 05530 05531 If REG_NEWLINE is set, then . and [^...] don't match newline. 05532 Also, regexec will try a match beginning after every newline. 05533 05534 If REG_ICASE is set, then we considers upper- and lowercase 05535 versions of letters to be equivalent when matching. 05536 05537 If REG_NOSUB is set, then when PREG is passed to regexec, that 05538 routine will report only success or failure, and nothing about the 05539 registers. 05540 05541 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 05542 the return codes and their meanings.) */ 05543 05544 int 05545 regcomp(regex_t *preg, 05546 const char *pattern, 05547 int cflags) 05548 { 05549 reg_errcode_t ret; 05550 reg_syntax_t syntax 05551 = (cflags & REG_EXTENDED) ? 05552 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 05553 05554 #ifdef DEBUG 05555 debug=0; 05556 DEBUG_PRINT1("EXECUTING regcomp"); 05557 debug=0; 05558 #endif 05559 /* regex_compile will allocate the space for the compiled pattern. */ 05560 preg->buffer = 0; 05561 preg->allocated = 0; 05562 preg->used = 0; 05563 05564 /* Don't bother to use a fastmap when searching. This simplifies the 05565 REG_NEWLINE case: if we used a fastmap, we'd have to put all the 05566 characters after newlines into the fastmap. This way, we just try 05567 every character. */ 05568 preg->fastmap = 0; 05569 05570 if (cflags & REG_ICASE) 05571 { 05572 unsigned i; 05573 05574 preg->translate 05575 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE 05576 * sizeof (*(RE_TRANSLATE_TYPE)0)); 05577 if (preg->translate == NULL) 05578 return (int) REG_ESPACE; 05579 05580 /* Map uppercase characters to corresponding lowercase ones. */ 05581 for (i = 0; i < CHAR_SET_SIZE; i++) 05582 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; 05583 } 05584 else 05585 preg->translate = NULL; 05586 05587 /* If REG_NEWLINE is set, newlines are treated differently. */ 05588 if (cflags & REG_NEWLINE) 05589 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 05590 syntax &= ~RE_DOT_NEWLINE; 05591 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 05592 /* It also changes the matching behavior. */ 05593 preg->newline_anchor = 1; 05594 } 05595 else 05596 preg->newline_anchor = 0; 05597 05598 preg->no_sub = !!(cflags & REG_NOSUB); 05599 05600 /* POSIX says a null character in the pattern terminates it, so we 05601 can use strlen here in compiling the pattern. */ 05602 ret = regex_compile (pattern, strlen (pattern), syntax, preg); 05603 05604 /* POSIX doesn't distinguish between an unmatched open-group and an 05605 unmatched close-group: both are REG_EPAREN. */ 05606 if (ret == REG_ERPAREN) ret = REG_EPAREN; 05607 05608 // printf("done with regcomp\n"); 05609 05610 return (int) ret; 05611 } 05612 05613 05614 /* regexec searches for a given pattern, specified by PREG, in the 05615 string STRING. 05616 05617 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 05618 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 05619 least NMATCH elements, and we set them to the offsets of the 05620 corresponding matched substrings. 05621 05622 EFLAGS specifies `execution flags' which affect matching: if 05623 REG_NOTBOL is set, then ^ does not match at the beginning of the 05624 string; if REG_NOTEOL is set, then $ does not match at the end. 05625 05626 We return 0 if we find a match and REG_NOMATCH if not. */ 05627 05628 int 05629 regexec(const regex_t *preg, 05630 const char *string, 05631 size_t nmatch, 05632 regmatch_t pmatch[], 05633 int eflags) 05634 { 05635 int ret; 05636 struct re_registers regs; 05637 regex_t private_preg; 05638 int len = strlen (string); 05639 boolean want_reg_info = !preg->no_sub && nmatch > 0; 05640 05641 private_preg = *preg; 05642 05643 private_preg.not_bol = !!(eflags & REG_NOTBOL); 05644 private_preg.not_eol = !!(eflags & REG_NOTEOL); 05645 05646 /* The user has told us exactly how many registers to return 05647 information about, via `nmatch'. We have to pass that on to the 05648 matching routines. */ 05649 private_preg.regs_allocated = REGS_FIXED; 05650 05651 if (want_reg_info) 05652 { 05653 regs.num_regs = nmatch; 05654 regs.start = TALLOC (nmatch, regoff_t); 05655 regs.end = TALLOC (nmatch, regoff_t); 05656 if (regs.start == NULL || regs.end == NULL) 05657 return (int) REG_NOMATCH; 05658 } 05659 05660 /* Perform the searching operation. */ 05661 ret = re_search (&private_preg, string, len, 05662 /* start: */ 0, /* range: */ len, 05663 want_reg_info ? ®s : (struct re_registers *) 0); 05664 05665 /* Copy the register information to the POSIX structure. */ 05666 if (want_reg_info) 05667 { 05668 if (ret >= 0) 05669 { 05670 unsigned r; 05671 05672 for (r = 0; r < nmatch; r++) 05673 { 05674 pmatch[r].rm_so = regs.start[r]; 05675 pmatch[r].rm_eo = regs.end[r]; 05676 } 05677 } 05678 05679 /* If we needed the temporary register info, free the space now. */ 05680 free (regs.start); 05681 free (regs.end); 05682 } 05683 05684 /* We want zero return to mean success, unlike `re_search'. */ 05685 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 05686 } 05687 05688 05689 /* Returns a message corresponding to an error code, ERRCODE, returned 05690 from either regcomp or regexec. We don't use PREG here. */ 05691 05692 size_t 05693 regerror(int errcode, 05694 const regex_t *preg, 05695 char *errbuf, 05696 size_t errbuf_size) 05697 { 05698 const char *msg; 05699 size_t msg_size; 05700 05701 if (errcode < 0 05702 || errcode >= (int) (sizeof (re_error_msgid) 05703 / sizeof (re_error_msgid[0]))) 05704 /* Only error codes returned by the rest of the code should be passed 05705 to this routine. If we are given anything else, or if other regex 05706 code generates an invalid error code, then the program has a bug. 05707 Dump core so we can fix it. */ 05708 abort (); 05709 05710 msg = gettext (re_error_msgid[errcode]); 05711 05712 msg_size = strlen (msg) + 1; /* Includes the null. */ 05713 05714 if (errbuf_size != 0) 05715 { 05716 if (msg_size > errbuf_size) 05717 { 05718 strncpy (errbuf, msg, errbuf_size - 1); 05719 errbuf[errbuf_size - 1] = 0; 05720 } 05721 else 05722 strcpy (errbuf, msg); 05723 } 05724 05725 return msg_size; 05726 } 05727 05728 05729 /* Free dynamically allocated space used by PREG. */ 05730 05731 void 05732 regfree(regex_t *preg) 05733 { 05734 if (preg->buffer != NULL) 05735 free (preg->buffer); 05736 preg->buffer = NULL; 05737 05738 preg->allocated = 0; 05739 preg->used = 0; 05740 05741 if (preg->fastmap != NULL) 05742 free (preg->fastmap); 05743 preg->fastmap = NULL; 05744 preg->fastmap_accurate = 0; 05745 05746 if (preg->translate != NULL) 05747 free (preg->translate); 05748 preg->translate = NULL; 05749 } 05750 05751 #endif /* not emacs */
1.7.5