1 /* Extended regular expression matching and search library,
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5 Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
6 This file is part of the GNU C Library.
8 The GNU C Library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 2.1 of the License, or (at your option) any later version.
13 The GNU C Library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with the GNU C Library; if not, see
20 <http://www.gnu.org/licenses/>. */
22 /* To exclude some unwanted junk.... */
25 /* unistd.h must be included with _LIBC defined: we need smallint */
30 # define _REGEX_RE_COMP
32 # define __RE_TRANSLATE_TYPE char *
33 # define RE_TRANSLATE_TYPE __RE_TRANSLATE_TYPE
39 /* AIX requires this to be the first thing in the file. */
40 #if defined _AIX && !defined REGEX_MALLOC
48 #ifndef INSIDE_RECURSION
50 # if defined STDC_HEADERS && !defined emacs
53 /* We need this for `regex.h', and perhaps for the Emacs include files. */
54 # include <sys/types.h>
58 /* For platforms which support the ISO C Amendment 1 functionality we
59 support user-defined character classes. */
60 # if defined __UCLIBC_HAS_WCHAR__
61 # define WIDE_CHAR_SUPPORT 1
62 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
68 /* We have to keep the namespace clean. */
70 # define btowc __btowc
72 /* We are also using some library internals. */
73 # include <locale/localeinfo.h>
74 # include <locale/elem-hash.h>
75 # include <langinfo.h>
76 # include <locale/coll-lookup.h>
79 /* This is for other GNU distributions with internationalized messages. */
80 # if defined HAVE_LIBINTL_H || defined _LIBC
84 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
87 # define gettext(msgid) (msgid)
91 /* This define is so xgettext can find the internationalizable
93 # define gettext_noop(String) String
96 /* The `emacs' switch turns on certain matching commands
97 that make sense only in Emacs. */
104 # else /* not emacs */
106 /* If we are not linking with Emacs proper,
107 we can't use the relocating allocator
108 even if config.h says that we can. */
111 # if defined STDC_HEADERS || defined _LIBC
118 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
119 If nothing else has been done, use the method below. */
120 # ifdef INHIBIT_STRING_HEADER
121 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
122 # if !defined bzero && !defined bcopy
123 # undef INHIBIT_STRING_HEADER
128 /* This is the normal way of making sure we have a bcopy and a bzero.
129 This is used in most programs--a few other programs avoid this
130 by defining INHIBIT_STRING_HEADER. */
131 # ifndef INHIBIT_STRING_HEADER
132 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
136 # define bzero(s, n) (memset (s, '\0', n), (s))
138 # define bzero(s, n) __bzero (s, n)
142 # include <strings.h>
144 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
147 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
152 /* Define the syntax stuff for \<, \>, etc. */
154 /* This must be nonzero for the wordchar and notwordchar pattern
155 commands in re_match_2. */
160 # ifdef SWITCH_ENUM_BUG
161 # define SWITCH_ENUM_CAST(x) ((int)(x))
163 # define SWITCH_ENUM_CAST(x) (x)
166 # endif /* not emacs */
168 # if defined _LIBC || defined HAVE_LIMITS_H
173 # define MB_LEN_MAX 1
176 /* Get the interface, including the syntax bits. */
178 # define translate __REPB_PREFIX(translate)
180 /* isalpha etc. are used for the character classes. */
183 /* Jim Meyering writes:
185 "... Some ctype macros are valid only for character codes that
186 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
187 using /bin/cc or gcc but without giving an ansi option). So, all
188 ctype uses should be through macros like ISPRINT... If
189 STDC_HEADERS is defined, then autoconf has verified that the ctype
190 macros don't need to be guarded with references to isascii. ...
191 Defining isascii to 1 should let any compiler worth its salt
192 eliminate the && through constant folding."
193 Solaris defines some of these symbols so we must undefine them first. */
196 # if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
197 # define ISASCII(c) 1
199 # define ISASCII(c) isascii(c)
203 # define ISBLANK(c) (ISASCII (c) && isblank (c))
205 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
208 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
210 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
214 # define ISPRINT(c) (ISASCII (c) && isprint (c))
215 # define ISDIGIT(c) (ISASCII (c) && isdigit (c))
216 # define ISALNUM(c) (ISASCII (c) && isalnum (c))
217 # define ISALPHA(c) (ISASCII (c) && isalpha (c))
218 # define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
219 # define ISLOWER(c) (ISASCII (c) && islower (c))
220 # define ISPUNCT(c) (ISASCII (c) && ispunct (c))
221 # define ISSPACE(c) (ISASCII (c) && isspace (c))
222 # define ISUPPER(c) (ISASCII (c) && isupper (c))
223 # define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
226 # define TOLOWER(c) _tolower(c)
228 # define TOLOWER(c) tolower(c)
232 # define NULL (void *)0
235 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
236 since ours (we hope) works properly with all combinations of
237 machines, compilers, `char' and `unsigned char' argument types.
238 (Per Bothner suggested the basic approach.) */
239 # undef SIGN_EXTEND_CHAR
241 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
242 # else /* not __STDC__ */
243 /* As in Harbison and Steele. */
244 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
248 /* How many characters in the character set. */
249 # define CHAR_SET_SIZE 256
253 extern char *re_syntax_table;
255 # else /* not SYNTAX_TABLE */
257 static char re_syntax_table[CHAR_SET_SIZE];
259 static void init_syntax_once (void);
262 init_syntax_once (void)
265 static smallint done = 0;
269 bzero (re_syntax_table, sizeof re_syntax_table);
271 for (c = 0; c < CHAR_SET_SIZE; ++c)
273 re_syntax_table[c] = Sword;
275 re_syntax_table['_'] = Sword;
280 # endif /* not SYNTAX_TABLE */
282 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
286 /* Integer type for pointers. */
287 # if !defined _LIBC && !defined __intptr_t_defined
288 typedef unsigned long int uintptr_t;
291 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
292 use `alloca' instead of `malloc'. This is because using malloc in
293 re_search* or re_match* could cause memory leaks when C-g is used in
294 Emacs; also, malloc is slower and causes storage fragmentation. On
295 the other hand, malloc is more portable, and easier to debug.
297 Because we sometimes use alloca, some routines have to be macros,
298 not functions -- `alloca'-allocated space disappears at the end of the
299 function it is called in. */
303 # define REGEX_ALLOCATE malloc
304 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
305 # define REGEX_FREE free
307 # else /* not REGEX_MALLOC */
309 /* Emacs already defines alloca, sometimes. */
312 /* Make alloca work the best possible way. */
314 # define alloca __builtin_alloca
315 # else /* not __GNUC__ */
318 # endif /* HAVE_ALLOCA_H */
319 # endif /* not __GNUC__ */
321 # endif /* not alloca */
323 # define REGEX_ALLOCATE alloca
325 /* Assumes a `char *destination' variable. */
326 # define REGEX_REALLOCATE(source, osize, nsize) \
327 (destination = (char *) alloca (nsize), \
328 memcpy (destination, source, osize))
330 /* No need to do anything to free, after alloca. */
331 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
333 # endif /* not REGEX_MALLOC */
335 /* Define how to allocate the failure stack. */
337 # if defined REL_ALLOC && defined REGEX_MALLOC
339 # define REGEX_ALLOCATE_STACK(size) \
340 r_alloc (&failure_stack_ptr, (size))
341 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
342 r_re_alloc (&failure_stack_ptr, (nsize))
343 # define REGEX_FREE_STACK(ptr) \
344 r_alloc_free (&failure_stack_ptr)
346 # else /* not using relocating allocator */
350 # define REGEX_ALLOCATE_STACK malloc
351 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
352 # define REGEX_FREE_STACK free
354 # else /* not REGEX_MALLOC */
356 # define REGEX_ALLOCATE_STACK alloca
358 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
359 REGEX_REALLOCATE (source, osize, nsize)
360 /* No need to explicitly free anything. */
361 # define REGEX_FREE_STACK(arg)
363 # endif /* not REGEX_MALLOC */
364 # endif /* not using relocating allocator */
367 /* True if `size1' is nonzero and PTR is pointing anywhere inside
368 `string1' or just past its end. This works if PTR is NULL, which is
370 # define FIRST_STRING_P(ptr) \
371 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
373 /* (Re)Allocate N items of type T using malloc, or fail. */
374 # define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
375 # define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
376 # define RETALLOC_IF(addr, n, t) \
377 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
378 # define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
380 # define BYTEWIDTH 8 /* In bits. */
382 # define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) /* Nonzero iff strcmp reports the two strings equal. */
386 # define MAX(a, b) ((a) > (b) ? (a) : (b)) /* NOTE: the selected argument is evaluated twice; do not pass expressions with side effects. */
387 # define MIN(a, b) ((a) < (b) ? (a) : (b)) /* NOTE: the selected argument is evaluated twice; do not pass expressions with side effects. */
389 typedef char boolean;
393 static reg_errcode_t byte_regex_compile (const char *pattern, size_t size,
395 struct re_pattern_buffer *bufp);
397 static int byte_re_match_2_internal (struct re_pattern_buffer *bufp,
398 const char *string1, int size1,
399 const char *string2, int size2,
401 struct re_registers *regs,
403 static int byte_re_search_2 (struct re_pattern_buffer *bufp,
404 const char *string1, int size1,
405 const char *string2, int size2,
406 int startpos, int range,
407 struct re_registers *regs, int stop);
408 static int byte_re_compile_fastmap (struct re_pattern_buffer *bufp);
411 static reg_errcode_t wcs_regex_compile (const char *pattern, size_t size,
413 struct re_pattern_buffer *bufp);
416 static int wcs_re_match_2_internal (struct re_pattern_buffer *bufp,
417 const char *cstring1, int csize1,
418 const char *cstring2, int csize2,
420 struct re_registers *regs,
422 wchar_t *string1, int size1,
423 wchar_t *string2, int size2,
424 int *mbs_offset1, int *mbs_offset2);
425 static int wcs_re_search_2 (struct re_pattern_buffer *bufp,
426 const char *string1, int size1,
427 const char *string2, int size2,
428 int startpos, int range,
429 struct re_registers *regs, int stop);
430 static int wcs_re_compile_fastmap (struct re_pattern_buffer *bufp);
433 /* These are the command codes that appear in compiled regular
434 expressions. Some opcodes are followed by argument bytes. A
435 command code can specify any interpretation whatsoever for its
436 arguments. Zero bytes may appear in the compiled regular expression. */
442 /* Succeed right away--no more backtracking. */
445 /* Followed by one byte giving n, then by n literal bytes. */
449 /* Same as exactn, but contains binary data. */
453 /* Matches any (more or less) character. */
456 /* Matches any one char belonging to specified set. First
457 following byte is number of bitmap bytes. Then come bytes
458 for a bitmap saying which chars are in. Bits in each byte
459 are ordered low-bit-first. A character is in the set if its
460 bit is 1. A character too large to have a bit in the map is
461 automatically not in the set. */
462 /* ifdef MBS_SUPPORT, following element is length of character
463 classes, length of collating symbols, length of equivalence
464 classes, length of character ranges, and length of characters.
465 Next, character class element, collating symbols elements,
466 equivalence class elements, range elements, and character
468 See regex_compile function. */
471 /* Same parameters as charset, but match any character that is
472 not one of those specified. */
475 /* Start remembering the text that is matched, for storing in a
476 register. Followed by one byte with the register number, in
477 the range 0 to one less than the pattern buffer's re_nsub
478 field. Then followed by one byte with the number of groups
479 inner to this one. (This last has to be part of the
480 start_memory only because we need it in the on_failure_jump
484 /* Stop remembering the text that is matched and store it in a
485 memory register. Followed by one byte with the register
486 number, in the range 0 to one less than `re_nsub' in the
487 pattern buffer, and one byte with the number of inner groups,
488 just like `start_memory'. (We need the number of inner
489 groups here because we don't have any easy way of finding the
490 corresponding start_memory when we're at a stop_memory.) */
493 /* Match a duplicate of something remembered. Followed by one
494 byte containing the register number. */
497 /* Fail unless at beginning of line. */
500 /* Fail unless at end of line. */
503 /* Succeeds if at beginning of buffer (if emacs) or at beginning
504 of string to be matched (if not). */
507 /* Analogously, for end of buffer/string. */
510 /* Followed by a two-byte relative address to which to jump. */
513 /* Same as jump, but marks the end of an alternative. */
516 /* Followed by two-byte relative address of place to resume at
517 in case of failure. */
518 /* ifdef MBS_SUPPORT, the size of address is 1. */
521 /* Like on_failure_jump, but pushes a placeholder instead of the
522 current string position when executed. */
523 on_failure_keep_string_jump,
525 /* Throw away latest failure point and then jump to following
526 two-byte relative address. */
527 /* ifdef MBS_SUPPORT, the size of address is 1. */
530 /* Change to pop_failure_jump if we know we won't have to backtrack to
531 match; otherwise change to jump. This is used to jump
532 back to the beginning of a repeat. If what follows this jump
533 clearly won't match what the repeat does, such that we can be
534 sure that there is no use backtracking out of repetitions
535 already matched, then we change it to a pop_failure_jump.
536 Followed by two-byte address. */
537 /* ifdef MBS_SUPPORT, the size of address is 1. */
540 /* Jump to following two-byte address, and push a dummy failure
541 point. This failure point will be thrown away if an attempt
542 is made to use it for a failure. A `+' construct makes this
543 before the first repeat. Also used as an intermediary kind
544 of jump when compiling an alternative. */
545 /* ifdef MBS_SUPPORT, the size of address is 1. */
548 /* Push a dummy failure point and continue. Used at the end of
552 /* Followed by two-byte relative address and two-byte number n.
553 After matching N times, jump to the address upon failure. */
554 /* ifdef MBS_SUPPORT, the size of address is 1. */
557 /* Followed by two-byte relative address, and two-byte number n.
558 Jump to the address N times, then fail. */
559 /* ifdef MBS_SUPPORT, the size of address is 1. */
562 /* Set the following two-byte relative address to the
563 subsequent two-byte number. The address *includes* the two
565 /* ifdef MBS_SUPPORT, the size of address is 1. */
568 wordchar, /* Matches any word-constituent character. */
569 notwordchar, /* Matches any char that is not a word-constituent. */
571 wordbeg, /* Succeeds if at word beginning. */
572 wordend, /* Succeeds if at word end. */
574 wordbound, /* Succeeds if at a word boundary. */
575 notwordbound /* Succeeds if not at a word boundary. */
578 ,before_dot, /* Succeeds if before point. */
579 at_dot, /* Succeeds if at point. */
580 after_dot, /* Succeeds if after point. */
582 /* Matches any character whose syntax is specified. Followed by
583 a byte which contains a syntax code, e.g., Sword. */
586 /* Matches any character whose syntax is not that specified. */
590 #endif /* not INSIDE_RECURSION */
595 # define UCHAR_T unsigned char
596 # define COMPILED_BUFFER_VAR bufp->buffer
597 # define OFFSET_ADDRESS_SIZE 2
598 # define PREFIX(name) byte_##name
599 # define ARG_PREFIX(name) name
600 # define PUT_CHAR(c) putchar (c)
603 # define CHAR_T wchar_t
604 # define UCHAR_T wchar_t
605 # define COMPILED_BUFFER_VAR wc_buffer
606 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
607 # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1)
608 # define PREFIX(name) wcs_##name
609 # define ARG_PREFIX(name) c##name
610 /* Should we use wide stream?? */
611 # define PUT_CHAR(c) printf ("%C", c) /* Print one wide character.  No trailing `;' here: callers write `PUT_CHAR (x);' themselves, exactly as with the putchar-based byte version. */
617 # define INSIDE_RECURSION
618 # include "regex_old.c"
619 # undef INSIDE_RECURSION
622 # define INSIDE_RECURSION
623 # include "regex_old.c"
624 # undef INSIDE_RECURSION
628 #ifdef INSIDE_RECURSION
629 /* Common operations on the compiled pattern. */
631 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
632 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
635 # define STORE_NUMBER(destination, number) \
637 *(destination) = (UCHAR_T)(number); \
640 # define STORE_NUMBER(destination, number) \
642 (destination)[0] = (number) & 0377; \
643 (destination)[1] = (number) >> 8; \
647 /* Same as STORE_NUMBER, except increment DESTINATION to
648 the byte after where the number is stored. Therefore, DESTINATION
649 must be an lvalue. */
650 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
652 # define STORE_NUMBER_AND_INCR(destination, number) \
654 STORE_NUMBER (destination, number); \
655 (destination) += OFFSET_ADDRESS_SIZE; \
658 /* Put into DESTINATION a number stored in two contiguous bytes starting
660 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
663 # define EXTRACT_NUMBER(destination, source) \
665 (destination) = *(source); \
668 # define EXTRACT_NUMBER(destination, source) \
670 (destination) = *(source) & 0377; \
671 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
676 static void PREFIX(extract_number) (int *dest, UCHAR_T *source)
681 int temp = SIGN_EXTEND_CHAR (*(source + 1));
682 *dest = *source & 0377;
687 # ifndef EXTRACT_MACROS /* To debug the macros. */
688 # undef EXTRACT_NUMBER
689 # define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src)
690 # endif /* not EXTRACT_MACROS */
694 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
695 SOURCE must be an lvalue. */
697 # define EXTRACT_NUMBER_AND_INCR(destination, source) \
699 EXTRACT_NUMBER (destination, source); \
700 (source) += OFFSET_ADDRESS_SIZE; \
704 static void PREFIX(extract_number_and_incr) (int *destination,
707 PREFIX(extract_number) (destination, *source);
708 *source += OFFSET_ADDRESS_SIZE;
711 # ifndef EXTRACT_MACROS
712 # undef EXTRACT_NUMBER_AND_INCR
713 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
714 PREFIX(extract_number_and_incr) (&dest, &src)
715 # endif /* not EXTRACT_MACROS */
721 /* If DEBUG is defined, Regex prints many voluminous messages about what
722 it is doing (if the variable `debug' is nonzero). If linked with the
723 main program in `iregex.c', you can enter patterns and strings
724 interactively. And if linked with the main program in `main.c' and
725 the other test files, you can run the already-written tests. */
729 # ifndef DEFINED_ONCE
731 /* We use standard I/O for debugging. */
734 /* It is useful to test things that ``must'' be true when debugging. */
737 static smallint debug;
739 # define DEBUG_STATEMENT(e) e
740 # define DEBUG_PRINT1(x) if (debug) printf (x)
741 # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
742 # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
743 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
744 # endif /* not DEFINED_ONCE */
746 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
747 if (debug) PREFIX(print_partial_compiled_pattern) (s, e)
748 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
749 if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2)
752 /* Print the fastmap in human-readable form. */
754 # ifndef DEFINED_ONCE
756 print_fastmap (char *fastmap)
758 unsigned was_a_range = 0;
761 while (i < (1 << BYTEWIDTH))
767 while (i < (1 << BYTEWIDTH) && fastmap[i])
781 # endif /* not DEFINED_ONCE */
784 /* Print a compiled pattern string in human-readable form, starting at
785 the START pointer into it and ending just before the pointer END. */
788 PREFIX(print_partial_compiled_pattern) (UCHAR_T *start, UCHAR_T *end)
801 /* Loop over pattern commands. */
805 printf ("%td:\t", p - start);
807 printf ("%ld:\t", (long int) (p - start));
810 switch ((re_opcode_t) *p++)
818 printf ("/exactn/%d", mcnt);
830 printf ("/exactn_bin/%d", mcnt);
833 printf("/%lx", (long int) *p++);
837 # endif /* MBS_SUPPORT */
841 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
846 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
850 printf ("/duplicate/%ld", (long int) *p++);
863 printf ("/charset [%s",
864 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
866 length = *workp++; /* the length of char_classes */
867 for (i=0 ; i<length ; i++)
868 printf("[:%lx:]", (long int) *p++);
869 length = *workp++; /* the length of collating_symbol */
870 for (i=0 ; i<length ;)
874 PUT_CHAR((i++,*p++));
878 length = *workp++; /* the length of equivalence_class */
879 for (i=0 ; i<length ;)
883 PUT_CHAR((i++,*p++));
887 length = *workp++; /* the length of char_range */
888 for (i=0 ; i<length ; i++)
890 wchar_t range_start = *p++;
891 wchar_t range_end = *p++;
892 printf("%C-%C", range_start, range_end);
894 length = *workp++; /* the length of char */
895 for (i=0 ; i<length ; i++)
899 register int c, last = -100;
900 register int in_range = 0;
902 printf ("/charset [%s",
903 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
905 assert (p + *p < pend);
907 for (c = 0; c < 256; c++)
909 && (p[1 + (c/8)] & (1 << (c % 8))))
911 /* Are we starting a range? */
912 if (last + 1 == c && ! in_range)
917 /* Have we broken a range? */
918 else if (last + 1 != c && in_range)
948 case on_failure_jump:
949 PREFIX(extract_number_and_incr) (&mcnt, &p);
951 printf ("/on_failure_jump to %td", p + mcnt - start);
953 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
957 case on_failure_keep_string_jump:
958 PREFIX(extract_number_and_incr) (&mcnt, &p);
960 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
962 printf ("/on_failure_keep_string_jump to %ld",
963 (long int) (p + mcnt - start));
967 case dummy_failure_jump:
968 PREFIX(extract_number_and_incr) (&mcnt, &p);
970 printf ("/dummy_failure_jump to %td", p + mcnt - start);
972 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
976 case push_dummy_failure:
977 printf ("/push_dummy_failure");
981 PREFIX(extract_number_and_incr) (&mcnt, &p);
983 printf ("/maybe_pop_jump to %td", p + mcnt - start);
985 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
989 case pop_failure_jump:
990 PREFIX(extract_number_and_incr) (&mcnt, &p);
992 printf ("/pop_failure_jump to %td", p + mcnt - start);
994 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
999 PREFIX(extract_number_and_incr) (&mcnt, &p);
1001 printf ("/jump_past_alt to %td", p + mcnt - start);
1003 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
1008 PREFIX(extract_number_and_incr) (&mcnt, &p);
1010 printf ("/jump to %td", p + mcnt - start);
1012 printf ("/jump to %ld", (long int) (p + mcnt - start));
1017 PREFIX(extract_number_and_incr) (&mcnt, &p);
1019 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1021 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
1023 printf ("/succeed_n to %ld, %d times",
1024 (long int) (p1 - start), mcnt2);
1029 PREFIX(extract_number_and_incr) (&mcnt, &p);
1031 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1032 printf ("/jump_n to %ld, %d times", (long int) (p1 - start), mcnt2); /* p1 - start is a pointer difference, not an int: cast it and print with %ld, as the sibling succeed_n/set_number_at cases do. */
1036 PREFIX(extract_number_and_incr) (&mcnt, &p);
1038 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1040 printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
1042 printf ("/set_number_at location %ld to %d",
1043 (long int) (p1 - start), mcnt2);
1048 printf ("/wordbound");
1052 printf ("/notwordbound");
1056 printf ("/wordbeg");
1060 printf ("/wordend");
1065 printf ("/before_dot");
1073 printf ("/after_dot");
1077 printf ("/syntaxspec");
1079 printf ("/%d", mcnt);
1083 printf ("/notsyntaxspec");
1085 printf ("/%d", mcnt);
1090 printf ("/wordchar");
1094 printf ("/notwordchar");
1106 printf ("?%ld", (long int) *(p-1));
1113 printf ("%td:\tend of pattern.\n", p - start);
1115 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1121 PREFIX(print_compiled_pattern) (struct re_pattern_buffer *bufp) /* Debug helper: dump the compiled pattern held in BUFP, then the buffer's bookkeeping fields, to stdout. */
1123 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer;
1125 PREFIX(print_partial_compiled_pattern) (buffer, buffer
1126 + bufp->used / sizeof(UCHAR_T)); /* `used' is reported below as a byte count, so divide to get the end position in UCHAR_T elements. */
1127 printf ("%ld bytes used/%ld bytes allocated.\n",
1128 bufp->used, bufp->allocated); /* NOTE(review): %ld assumes both fields are signed long; confirm against struct re_pattern_buffer. */
1130 if (bufp->fastmap_accurate && bufp->fastmap) /* Show the fastmap only when it is flagged accurate and actually present. */
1132 printf ("fastmap: ");
1133 print_fastmap (bufp->fastmap);
1137 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1139 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub); /* NOTE(review): same field as the %Zd line above, printed via a cast; the preprocessor lines choosing between the two are not visible here. */
1141 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1142 printf ("can_be_null: %d\t", bufp->can_be_null);
1143 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1144 printf ("no_sub: %d\t", bufp->no_sub);
1145 printf ("not_bol: %d\t", bufp->not_bol);
1146 printf ("not_eol: %d\t", bufp->not_eol);
1147 printf ("syntax: %lx\n", bufp->syntax);
1148 /* Perhaps we should print the translate table? */
1153 PREFIX(print_double_string) (
1154 const CHAR_T *where,
1155 const CHAR_T *string1,
1157 const CHAR_T *string2,
1168 if (FIRST_STRING_P (where))
1170 for (this_char = where - string1; this_char < size1; this_char++)
1171 PUT_CHAR (string1[this_char]);
1177 for (this_char = where - string2; this_char < size2; this_char++)
1179 PUT_CHAR (string2[this_char]);
1182 fputs ("...", stdout);
1189 # if 0 /* ndef DEFINED_ONCE */
1197 # else /* not DEBUG */
1199 # ifndef DEFINED_ONCE
1203 # define DEBUG_STATEMENT(e)
1204 # define DEBUG_PRINT1(x)
1205 # define DEBUG_PRINT2(x1, x2)
1206 # define DEBUG_PRINT3(x1, x2, x3)
1207 # define DEBUG_PRINT4(x1, x2, x3, x4)
1208 # endif /* not DEFINED_ONCE */
1209 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1210 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1212 # endif /* not DEBUG */
1217 /* This converts a multibyte string to a wide character string,
1218 writes their correspondences to offset_buffer (see below),
1219 and writes whether each wchar_t is binary data to is_binary.
1220 Invalid multibyte sequences are treated as binary data.
1221 We assume offset_buffer and is_binary are already allocated
1225 convert_mbs_to_wcs (
1227 const unsigned char* src,
1228 size_t len, /* the length of the multibyte string, in bytes. */
1230 /* It holds correspondences between src(char string) and
1231 dest(wchar_t string) for optimization.
1233 dest = {'X', 'Y', 'Z'}
1234 (each "xxx", "y" and "zz" represent one multibyte character
1235 corresponding to 'X', 'Y' and 'Z'.)
1236 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
1242 wchar_t *pdest = dest;
1243 const unsigned char *psrc = src;
1244 size_t wc_count = 0;
1248 size_t mb_remain = len;
1249 size_t mb_count = 0;
1251 /* Initialize the conversion state. */
1252 memset (&mbs, 0, sizeof (mbstate_t));
1254 offset_buffer[0] = 0;
1255 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
1259 consumed = __mbrtowc (pdest, psrc, mb_remain, &mbs);
1261 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs); /* NOTE(review): this and the __mbrtowc call above look like alternative spellings of the same step; the preprocessor lines selecting one are not visible here. */
1265 /* Failed to convert; maybe src contains binary data.
1266 So we consume 1 byte manually. */
1270 is_binary[wc_count] = TRUE; /* this element is raw, unconvertible data */
1273 is_binary[wc_count] = FALSE;
1274 /* In sjis encoding, we use yen sign as escape character in
1275 place of reverse solidus. So we convert 0x5c(yen sign in
1276 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
1277 solidus in UCS2). */
1278 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
1279 *pdest = (wchar_t) *psrc;
1281 offset_buffer[wc_count + 1] = mb_count += consumed; /* running byte offset just past the character stored at index wc_count */
1284 /* Fill the remainder of the buffer with a sentinel. */
1285 for (i = wc_count + 1 ; i <= len ; i++)
1286 offset_buffer[i] = mb_count + 1; /* one past the total bytes consumed, so it matches no real offset */
1293 #else /* not INSIDE_RECURSION */
1295 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1296 also be assigned to arbitrarily: each pattern buffer stores its own
1297 syntax, so it can be changed between regex compilations. */
1298 /* This has no initializer because initialized variables in Emacs
1299 become read-only after dumping. */
1300 reg_syntax_t re_syntax_options;
1303 /* Specify the precise syntax of regexps for compilation. This provides
1304 for compatibility for various utilities which historically have
1305 different, incompatible syntaxes.
1307 The argument SYNTAX is a bit mask comprised of the various bits
1308 defined in regex.h. We return the old syntax. */
1311 re_set_syntax (reg_syntax_t syntax)
1313 reg_syntax_t ret = re_syntax_options;
1315 re_syntax_options = syntax;
1317 if (syntax & RE_DEBUG)
1319 else if (debug) /* was on but now is not */
1325 /* This table gives an error message for each of the error codes listed
1326 in regex.h. Obviously the order here has to be same as there.
1327 POSIX doesn't require that we do anything for REG_NOERROR,
1328 but why not be nice? */
1330 static const char re_error_msgid[] =
1332 # define REG_NOERROR_IDX 0
1333 gettext_noop ("Success") /* REG_NOERROR */
1335 # define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
1336 gettext_noop ("No match") /* REG_NOMATCH */
1338 # define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
1339 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
1341 # define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
1342 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
1344 # define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
1345 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
1347 # define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
1348 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
1350 # define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
1351 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
1353 # define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
1354 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
1356 # define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
1357 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
1359 # define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
1360 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
1362 # define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
1363 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
1365 # define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
1366 gettext_noop ("Invalid range end") /* REG_ERANGE */
1368 # define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
1369 gettext_noop ("Memory exhausted") /* REG_ESPACE */
1371 # define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
1372 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
1374 # define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
1375 gettext_noop ("Premature end of regular expression") /* REG_EEND */
1377 # define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
1378 gettext_noop ("Regular expression too big") /* REG_ESIZE */
1380 # define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
1381 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1384 static const uint16_t re_error_msgid_idx[] =
1405 #endif /* INSIDE_RECURSION */
1407 #ifndef DEFINED_ONCE
1408 /* Avoiding alloca during matching, to placate r_alloc. */
1410 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1411 searching and matching functions should not call alloca. On some
1412 systems, alloca is implemented in terms of malloc, and if we're
1413 using the relocating allocator routines, then malloc could cause a
1414 relocation, which might (if the strings being searched are in the
1415 ralloc heap) shift the data out from underneath the regexp
1418 Here's another reason to avoid allocation: Emacs
1419 processes input from X in a signal handler; processing X input may
1420 call malloc; if input arrives while a matching routine is calling
1421 malloc, then we're scrod. But Emacs can't just block input while
1422 calling matching routines; then we don't notice interrupts when
1423 they come in. So, Emacs blocks input around all regexp calls
1424 except the matching calls, which it leaves unprotected, in the
1425 faith that they will not malloc. */
1427 /* Normally, this is fine. */
1428 # define MATCH_MAY_ALLOCATE
1430 /* When using GNU C, we are not REALLY using the C alloca, no matter
1431 what config.h may say. So don't take precautions for it. */
1436 /* The match routines may not allocate if (1) they would do it with malloc
1437 and (2) it's not safe for them to use malloc.
1438 Note that if REL_ALLOC is defined, matching would not use malloc for the
1439 failure stack, but we would still use it for the register vectors;
1440 so REL_ALLOC should not affect this. */
1441 # if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1442 # undef MATCH_MAY_ALLOCATE
1444 #endif /* not DEFINED_ONCE */
1446 #ifdef INSIDE_RECURSION
1447 /* Failure stack declarations and macros; both re_compile_fastmap and
1448 re_match_2 use a failure stack. These have to be macros because of
1449 REGEX_ALLOCATE_STACK. */
1452 /* Number of failure points for which to initially allocate space
1453 when matching. If this number is exceeded, we allocate more
1454 space, so it is not a hard limit. */
1455 # ifndef INIT_FAILURE_ALLOC
1456 # define INIT_FAILURE_ALLOC 5
1459 /* Roughly the maximum number of failure points on the stack. It would be
1460 exactly that if we always used MAX_FAILURE_ITEMS items each time we failed.
1461 This is a variable only so users of regex can assign to it; we never
1462 change it ourselves. */
1464 # ifdef INT_IS_16BIT
1466 # ifndef DEFINED_ONCE
1467 # if defined MATCH_MAY_ALLOCATE
1468 /* 4400 was enough to cause a crash on Alpha OSF/1,
1469 whose default stack limit is 2mb. */
1470 long int re_max_failures = 4000;
1472 long int re_max_failures = 2000;
1476 union PREFIX(fail_stack_elt)
1482 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1486 PREFIX(fail_stack_elt_t) *stack;
1487 unsigned long int size;
1488 unsigned long int avail; /* Offset of next open position. */
1489 } PREFIX(fail_stack_type);
1491 # else /* not INT_IS_16BIT */
1493 # ifndef DEFINED_ONCE
1494 # if defined MATCH_MAY_ALLOCATE
1495 /* 4400 was enough to cause a crash on Alpha OSF/1,
1496 whose default stack limit is 2mb. */
1497 int re_max_failures = 4000;
1499 int re_max_failures = 2000;
1503 union PREFIX(fail_stack_elt)
1509 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1513 PREFIX(fail_stack_elt_t) *stack;
1515 unsigned avail; /* Offset of next open position. */
1516 } PREFIX(fail_stack_type);
1518 # endif /* INT_IS_16BIT */
1520 # ifndef DEFINED_ONCE
/* Predicates on the failure stack. `avail' is the offset of the next open
   position, so 0 means nothing is pushed and `avail == size' means no slot
   is left. FAIL_STACK_EMPTY and FAIL_STACK_FULL read the in-scope variable
   `fail_stack'; FAIL_STACK_PTR_EMPTY reads the in-scope pointer
   `fail_stack_ptr'. */
1521 # define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1522 # define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1523 # define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1527 /* Define macros to initialize and free the failure stack.
1528 Do `return -2' if the alloc fails. */
1530 # ifdef MATCH_MAY_ALLOCATE
1531 # define INIT_FAIL_STACK() \
1533 fail_stack.stack = (PREFIX(fail_stack_elt_t) *) \
1534 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \
1536 if (fail_stack.stack == NULL) \
1539 fail_stack.size = INIT_FAILURE_ALLOC; \
1540 fail_stack.avail = 0; \
1543 # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1545 # define INIT_FAIL_STACK() \
1547 fail_stack.avail = 0; \
1550 # define RESET_FAIL_STACK()
1554 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1556 Return 1 if succeeds, and 0 if either ran out of memory
1557 allocating space for it or it was already too large.
1559 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1561 # define DOUBLE_FAIL_STACK(fail_stack) \
1562 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1564 : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *) \
1565 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1566 (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)), \
1567 ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t))),\
1569 (fail_stack).stack == NULL \
1571 : ((fail_stack).size <<= 1, \
1575 /* Push pointer POINTER on FAIL_STACK.
1576 Return 1 if was able to do so and 0 if ran out of memory allocating
1578 # define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1579 ((FAIL_STACK_FULL () \
1580 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1582 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1585 /* Push a pointer value onto the failure stack.
1586 Assumes the variable `fail_stack'. Probably should only
1587 be called from within `PUSH_FAILURE_POINT'.
   ITEM is cast to UCHAR_T * and stored in the `pointer' member of the
   slot at `avail', which is then incremented. No check for free space
   is made here. */
1588 # define PUSH_FAILURE_POINTER(item) \
1589 fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item)
1591 /* This pushes an integer-valued item onto the failure stack.
1592 Assumes the variable `fail_stack'. Probably should only
1593 be called from within `PUSH_FAILURE_POINT'.
   ITEM is stored in the `integer' member of the slot at `avail', which
   is then incremented. No check for free space is made here. */
1594 # define PUSH_FAILURE_INT(item) \
1595 fail_stack.stack[fail_stack.avail++].integer = (item)
1597 /* Push a fail_stack_elt_t value onto the failure stack.
1598 Assumes the variable `fail_stack'. Probably should only
1599 be called from within `PUSH_FAILURE_POINT'.
   ITEM is copied whole into the slot at `avail', which is then
   incremented. No check for free space is made here. */
1600 # define PUSH_FAILURE_ELT(item) \
1601 fail_stack.stack[fail_stack.avail++] = (item)
1603 /* These three POP... operations complement the three PUSH... operations.
1604 All assume that `fail_stack' is nonempty.
   Each one decrements `avail' first and then yields the slot now at
   `avail': its `pointer' member, its `integer' member, or the whole
   element. None of them tests for an empty stack. */
1605 # define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1606 # define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1607 # define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1609 /* Used to omit pushing failure point id's when we're not debugging. */
1611 # define DEBUG_PUSH PUSH_FAILURE_INT
1612 # define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1614 # define DEBUG_PUSH(item)
1615 # define DEBUG_POP(item_addr)
1619 /* Push the information about the state we will need
1620 if we ever fail back to it.
1622 Requires variables fail_stack, regstart, regend, reg_info, and
1623 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
1626 Does `return FAILURE_CODE' if runs out of memory. */
1628 # define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1630 char *destination; \
1631 /* Must be int, so when we don't save any registers, the arithmetic \
1632 of 0 + -1 isn't done as unsigned. */ \
1633 /* Can't be int, since there is not a shred of a guarantee that int \
1634 is wide enough to hold a value of something to which pointer can \
1636 active_reg_t this_reg; \
1638 DEBUG_STATEMENT (failure_id++); \
1639 DEBUG_STATEMENT (nfailure_points_pushed++); \
1640 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1641 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1642 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1644 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
1645 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1647 /* Ensure we have enough space allocated for what we will push. */ \
1648 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1650 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1651 return failure_code; \
1653 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1654 (fail_stack).size); \
1655 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1658 /* Push the info, starting with the registers. */ \
1659 DEBUG_PRINT1 ("\n"); \
1662 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1665 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
1666 DEBUG_STATEMENT (num_regs_pushed++); \
1668 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1669 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1671 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1672 PUSH_FAILURE_POINTER (regend[this_reg]); \
1674 DEBUG_PRINT2 (" info: %p\n ", \
1675 reg_info[this_reg].word.pointer); \
1676 DEBUG_PRINT2 (" match_null=%d", \
1677 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1678 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1679 DEBUG_PRINT2 (" matched_something=%d", \
1680 MATCHED_SOMETHING (reg_info[this_reg])); \
1681 DEBUG_PRINT2 (" ever_matched=%d", \
1682 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1683 DEBUG_PRINT1 ("\n"); \
1684 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1687 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
1688 PUSH_FAILURE_INT (lowest_active_reg); \
1690 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
1691 PUSH_FAILURE_INT (highest_active_reg); \
1693 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
1694 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1695 PUSH_FAILURE_POINTER (pattern_place); \
1697 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
1698 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1700 DEBUG_PRINT1 ("'\n"); \
1701 PUSH_FAILURE_POINTER (string_place); \
1703 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1704 DEBUG_PUSH (failure_id); \
1707 # ifndef DEFINED_ONCE
1708 /* This is the number of items that are pushed and popped on the stack
1709 for each register. */
1710 # define NUM_REG_ITEMS 3
1712 /* Individual items aside from the registers. */
1714 # define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1716 # define NUM_NONREG_ITEMS 4
1719 /* We push at most this many items on the stack. */
1720 /* We used to use (num_regs - 1), which is the number of registers
1721 this regexp will save; but that was changed to 5
1722 to avoid stack overflow for a regexp with lots of parens. */
1723 # define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1725 /* We actually push this many items. */
1726 # define NUM_FAILURE_ITEMS \
1728 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1732 /* How many items can still be added to the stack without overflowing it. */
1733 # define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1734 # endif /* not DEFINED_ONCE */
1737 /* Pops what PUSH_FAILURE_POINT pushes.
1739 We restore into the parameters, all of which should be lvalues:
1740 STR -- the saved data position.
1741 PAT -- the saved pattern position.
1742 LOW_REG, HIGH_REG -- the lowest and highest active registers.
1743 REGSTART, REGEND -- arrays of string positions.
1744 REG_INFO -- array of information about each subexpression.
1746 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1747 `pend', `string1', `size1', `string2', and `size2'. */
1748 # define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1750 DEBUG_STATEMENT (unsigned failure_id;) \
1751 active_reg_t this_reg; \
1752 const UCHAR_T *string_temp; \
1754 assert (!FAIL_STACK_EMPTY ()); \
1756 /* Remove failure points and point to how many regs pushed. */ \
1757 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1758 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1759 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1761 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1763 DEBUG_POP (&failure_id); \
1764 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1766 /* If the saved string location is NULL, it came from an \
1767 on_failure_keep_string_jump opcode, and we want to throw away the \
1768 saved NULL, thus retaining our current position in the string. */ \
1769 string_temp = POP_FAILURE_POINTER (); \
1770 if (string_temp != NULL) \
1771 str = (const CHAR_T *) string_temp; \
1773 DEBUG_PRINT2 (" Popping string %p: `", str); \
1774 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1775 DEBUG_PRINT1 ("'\n"); \
1777 pat = (UCHAR_T *) POP_FAILURE_POINTER (); \
1778 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
1779 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1781 /* Restore register info. */ \
1782 high_reg = (active_reg_t) POP_FAILURE_INT (); \
1783 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
1785 low_reg = (active_reg_t) POP_FAILURE_INT (); \
1786 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
1789 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1791 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
1793 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1794 DEBUG_PRINT2 (" info: %p\n", \
1795 reg_info[this_reg].word.pointer); \
1797 regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
1798 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1800 regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
1801 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1805 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1807 reg_info[this_reg].word.integer = 0; \
1808 regend[this_reg] = 0; \
1809 regstart[this_reg] = 0; \
1811 highest_active_reg = high_reg; \
1814 set_regs_matched_done = 0; \
1815 DEBUG_STATEMENT (nfailure_points_popped++); \
1816 } /* POP_FAILURE_POINT */
1818 /* Structure for per-register (a.k.a. per-group) information.
1819 Other register information, such as the
1820 starting and ending positions (which are addresses), and the list of
1821 inner groups (which is a bits list) are maintained in separate
1824 We are making a (strictly speaking) nonportable assumption here: that
1825 the compiler will pack our bit fields into something that fits into
1826 the type of `word', i.e., is something that fits into one item on the
1830 /* Declarations and macros for re_match_2. */
1834 PREFIX(fail_stack_elt_t) word;
1837 /* This field is one if this group can match the empty string,
1838 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1839 # define MATCH_NULL_UNSET_VALUE 3
1840 unsigned match_null_string_p : 2;
1841 unsigned is_active : 1;
1842 unsigned matched_something : 1;
1843 unsigned ever_matched_something : 1;
1845 } PREFIX(register_info_type);
1847 # ifndef DEFINED_ONCE
/* Accessors for the bit fields of a register info value R, reached through
   its `bits' member. Each expands to the field itself, so the result can
   be read or assigned. */
1848 # define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1849 # define IS_ACTIVE(R) ((R).bits.is_active)
1850 # define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1851 # define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1854 /* Call this when have matched a real character; it sets `matched' flags
1855 for the subexpressions which we are currently inside. Also records
1856 that those subexprs have matched. */
1857 # define SET_REGS_MATCHED() \
1860 if (!set_regs_matched_done) \
1863 set_regs_matched_done = 1; \
1864 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1866 MATCHED_SOMETHING (reg_info[r]) \
1867 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1873 # endif /* not DEFINED_ONCE */
1875 /* Registers are set to a sentinel when they haven't yet matched.
   The sentinel is the address of the static dummy object declared below;
   REG_UNSET (E) compares E against that address. */
1876 static CHAR_T PREFIX(reg_unset_dummy);
1877 # define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
1878 # define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1880 /* Subroutine declarations and macros for regex_compile. */
1881 static void PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg);
1882 static void PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc,
1883 int arg1, int arg2);
1884 static void PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc,
1885 int arg, UCHAR_T *end);
1886 static void PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc,
1887 int arg1, int arg2, UCHAR_T *end);
1888 static boolean PREFIX(at_begline_loc_p) (const CHAR_T *pattern,
1890 reg_syntax_t syntax);
1891 static boolean PREFIX(at_endline_loc_p) (const CHAR_T *p,
1893 reg_syntax_t syntax);
1895 static reg_errcode_t wcs_compile_range (CHAR_T range_start,
1896 const CHAR_T **p_ptr,
1898 __RE_TRANSLATE_TYPE translate,
1899 reg_syntax_t syntax,
1902 static void insert_space (int num, CHAR_T *loc, CHAR_T *end);
1904 static reg_errcode_t byte_compile_range (unsigned int range_start,
1907 __RE_TRANSLATE_TYPE translate,
1908 reg_syntax_t syntax,
1912 /* Fetch the next character in the uncompiled pattern---translating it
1913 if necessary. Also cast from a signed character in the constant
1914 string passed to us by the user to an unsigned char that we can use
1915 as an array index (in, e.g., `translate'). */
1916 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
1917 because it is impossible to allocate 4GB array for some encodings
1918 which have 4 byte character_set like UCS4. */
1921 # define PATFETCH(c) \
1922 do {if (p == pend) return REG_EEND; \
1923 c = (UCHAR_T) *p++; \
1924 if (translate && (c <= 0xff)) c = (UCHAR_T) translate[c]; \
1927 # define PATFETCH(c) \
1928 do {if (p == pend) return REG_EEND; \
1929 c = (unsigned char) *p++; \
1930 if (translate) c = (unsigned char) translate[c]; \
1935 /* Fetch the next character in the uncompiled pattern, with no
1937 # define PATFETCH_RAW(c) \
1938 do {if (p == pend) return REG_EEND; \
1939 c = (UCHAR_T) *p++; \
1942 /* Go backwards one character in the pattern. */
1943 # define PATUNFETCH p--
1946 /* If `translate' is non-null, return translate[D], else just D. We
1947 cast the subscript to translate because some data is declared as
1948 `char *', to avoid warnings when a string constant is passed. But
1949 when we use a character as a subscript we must make it unsigned. */
1950 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
1951 because it is impossible to allocate 4GB array for some encodings
1952 which have 4 byte character_set like UCS4. */
1956 # define TRANSLATE(d) \
1957 ((translate && ((UCHAR_T) (d)) <= 0xff) \
1958 ? (char) translate[(unsigned char) (d)] : (d))
1960 # define TRANSLATE(d) \
1961 (translate ? (char) translate[(unsigned char) (d)] : (d))
1966 /* Macros for outputting the compiled pattern into `buffer'. */
1968 /* If the buffer isn't allocated when it comes in, use this. */
1969 # define INIT_BUF_SIZE (32 * sizeof(UCHAR_T))
1971 /* Make sure we have at least N more bytes of space in buffer. */
1973 # define GET_BUFFER_SPACE(n) \
1974 while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
1975 + (n)*sizeof(CHAR_T)) > bufp->allocated) \
1978 # define GET_BUFFER_SPACE(n) \
1979 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
1983 /* Make sure we have one more byte of buffer space and then add C to it. */
1984 # define BUF_PUSH(c) \
1986 GET_BUFFER_SPACE (1); \
1987 *b++ = (UCHAR_T) (c); \
1991 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1992 # define BUF_PUSH_2(c1, c2) \
1994 GET_BUFFER_SPACE (2); \
1995 *b++ = (UCHAR_T) (c1); \
1996 *b++ = (UCHAR_T) (c2); \
2000 /* As with BUF_PUSH_2, except for three bytes. */
2001 # define BUF_PUSH_3(c1, c2, c3) \
2003 GET_BUFFER_SPACE (3); \
2004 *b++ = (UCHAR_T) (c1); \
2005 *b++ = (UCHAR_T) (c2); \
2006 *b++ = (UCHAR_T) (c3); \
2009 /* Store a jump with opcode OP at LOC to location TO. We store a
2010 relative address offset by the length of the jump itself, written
   below as 1 + OFFSET_ADDRESS_SIZE (that is three only when
   OFFSET_ADDRESS_SIZE is 2). The difference is cast to int before it
   is handed to the store routine. */
2011 # define STORE_JUMP(op, loc, to) \
2012 PREFIX(store_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))
2014 /* Likewise, for a two-argument jump; ARG is passed through unchanged
   as the second argument. */
2015 # define STORE_JUMP2(op, loc, to, arg) \
2016 PREFIX(store_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)
2018 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
2019 # define INSERT_JUMP(op, loc, to) \
2020 PREFIX(insert_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)
2022 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
2023 # define INSERT_JUMP2(op, loc, to, arg) \
2024 PREFIX(insert_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\
2027 /* This is not an arbitrary limit: the arguments which represent offsets
2028 into the pattern are two bytes long. So if 2^16 bytes turns out to
2029 be too small, many things would have to change. */
2030 /* Any other compiler which, like MSC, has allocation limit below 2^16
2031 bytes will have to use approach similar to what was done below for
2032 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
2033 reallocating to 0 bytes. Such thing is not going to work too well.
2034 You have been warned!! */
2035 # ifndef DEFINED_ONCE
2036 # if defined _MSC_VER && !defined WIN32
2037 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
2038 The REALLOC define eliminates a flurry of conversion warnings,
2039 but is not required. */
2040 # define MAX_BUF_SIZE 65500L
2041 # define REALLOC(p,s) realloc ((p), (size_t) (s))
2043 # define MAX_BUF_SIZE (1L << 16)
2044 # define REALLOC(p,s) realloc ((p), (s))
2046 # endif /* not DEFINED_ONCE */
2048 /* Extend the buffer to twice its current size via realloc and
2049 reset the pointers that pointed into the old block to point to the
2050 correct places in the new one. If extending the buffer results in it
2051 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
2053 # define EXTEND_BUFFER() \
2055 UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
2057 if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \
2059 bufp->allocated <<= 1; \
2060 if (bufp->allocated > MAX_BUF_SIZE) \
2061 bufp->allocated = MAX_BUF_SIZE; \
2062 /* How many characters the new buffer can have? */ \
2063 wchar_count = bufp->allocated / sizeof(UCHAR_T); \
2064 if (wchar_count == 0) wchar_count = 1; \
2065 /* Truncate the buffer to CHAR_T align. */ \
2066 bufp->allocated = wchar_count * sizeof(UCHAR_T); \
2067 RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \
2068 bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
2069 if (COMPILED_BUFFER_VAR == NULL) \
2070 return REG_ESPACE; \
2071 /* If the buffer moved, move all the pointers into it. */ \
2072 if (old_buffer != COMPILED_BUFFER_VAR) \
2074 int incr = COMPILED_BUFFER_VAR - old_buffer; \
2077 if (fixup_alt_jump) \
2078 fixup_alt_jump += incr; \
2080 laststart += incr; \
2081 if (pending_exact) \
2082 pending_exact += incr; \
2086 # define EXTEND_BUFFER() \
2088 UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
2089 if (bufp->allocated == MAX_BUF_SIZE) \
2091 bufp->allocated <<= 1; \
2092 if (bufp->allocated > MAX_BUF_SIZE) \
2093 bufp->allocated = MAX_BUF_SIZE; \
2094 bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, \
2096 if (COMPILED_BUFFER_VAR == NULL) \
2097 return REG_ESPACE; \
2098 /* If the buffer moved, move all the pointers into it. */ \
2099 if (old_buffer != COMPILED_BUFFER_VAR) \
2101 int incr = COMPILED_BUFFER_VAR - old_buffer; \
2104 if (fixup_alt_jump) \
2105 fixup_alt_jump += incr; \
2107 laststart += incr; \
2108 if (pending_exact) \
2109 pending_exact += incr; \
2114 # ifndef DEFINED_ONCE
2115 /* Since we have one byte reserved for the register number argument to
2116 {start,stop}_memory, the maximum number of groups we can report
2117 things about is what fits in that byte. */
2118 # define MAX_REGNUM 255
2120 /* But patterns can have more than `MAX_REGNUM' registers. We just
2121 ignore the excess. */
2122 typedef unsigned regnum_t;
2125 /* Macros for the compile stack. */
2127 /* Since offsets can go either forwards or backwards, this type needs to
2128 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
2129 /* int may be not enough when sizeof(int) == 2. */
2130 typedef long pattern_offset_t;
2134 pattern_offset_t begalt_offset;
2135 pattern_offset_t fixup_alt_jump;
2136 pattern_offset_t inner_group_offset;
2137 pattern_offset_t laststart_offset;
2139 } compile_stack_elt_t;
2144 compile_stack_elt_t *stack;
2146 unsigned avail; /* Offset of next open position. */
2147 } compile_stack_type;
/* Number of elements the compile stack is first allocated with. */
2150 # define INIT_COMPILE_STACK_SIZE 32
/* Tests on the in-scope variable `compile_stack': `avail' is the offset of
   the next open position, so 0 means empty and `avail == size' means full. */
2152 # define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
2153 # define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
2155 /* The next available element, i.e. the slot at index `avail'. The
   macro does not check that `avail' is below `size'. */
2156 # define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
2158 # endif /* not DEFINED_ONCE */
2160 /* Set the bit for character C in a list.
   The list is the bit array addressed by the in-scope pointer `b'. C is
   converted to unsigned char; the byte at index C / BYTEWIDTH then gets
   bit C % BYTEWIDTH set. C is parenthesized at both uses so that an
   expression argument is converted as a whole rather than having the
   cast bind to its first operand only. */
2161 # ifndef DEFINED_ONCE
2162 # define SET_LIST_BIT(c) \
2163 (b[((unsigned char) (c)) / BYTEWIDTH] \
2164 |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
2165 # endif /* not DEFINED_ONCE */
2167 /* Get the next unsigned number in the uncompiled pattern. */
2168 # define GET_UNSIGNED_NUMBER(num) \
2173 if (c < '0' || c > '9') \
2175 if (num <= RE_DUP_MAX) \
2179 num = num * 10 + c - '0'; \
2184 # ifndef DEFINED_ONCE
2185 # if defined _LIBC || defined WIDE_CHAR_SUPPORT
2186 /* The GNU C library provides support for user-defined character classes
2187 and the functions from ISO C amendment 1. */
2188 # ifdef CHARCLASS_NAME_MAX
2189 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
2191 /* This shouldn't happen but some implementation might still have this
2192 problem. Use a reasonable default value. */
2193 # define CHAR_CLASS_MAX_LENGTH 256
2197 # define IS_CHAR_CLASS(string) __wctype (string)
2199 # define IS_CHAR_CLASS(string) wctype (string)
2202 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
/* Fallback class-name test: nonzero when STREQ reports STRING equal to one
   of the twelve class names listed below. */
2204 # define IS_CHAR_CLASS(string) \
2205 (STREQ (string, "alpha") || STREQ (string, "upper") \
2206 || STREQ (string, "lower") || STREQ (string, "digit") \
2207 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
2208 || STREQ (string, "space") || STREQ (string, "print") \
2209 || STREQ (string, "punct") || STREQ (string, "graph") \
2210 || STREQ (string, "cntrl") || STREQ (string, "blank"))
2212 # endif /* DEFINED_ONCE */
2214 # ifndef MATCH_MAY_ALLOCATE
2216 /* If we cannot allocate large objects within re_match_2_internal,
2217 we make the fail stack and register vectors global.
2218 The fail stack, we grow to the maximum size when a regexp
2220 The register vectors, we adjust in size each time we
2221 compile a regexp, according to the number of registers it needs. */
2223 static PREFIX(fail_stack_type) fail_stack;
2225 /* Size with which the following vectors are currently allocated.
2226 That is so we can make them bigger as needed,
2227 but never make them smaller. */
2228 # ifdef DEFINED_ONCE
2229 static int regs_allocated_size;
2231 static const char ** regstart, ** regend;
2232 static const char ** old_regstart, ** old_regend;
2233 static const char **best_regstart, **best_regend;
2234 static const char **reg_dummy;
2235 # endif /* DEFINED_ONCE */
2237 static PREFIX(register_info_type) *PREFIX(reg_info);
2238 static PREFIX(register_info_type) *PREFIX(reg_info_dummy);
2240 /* Make the register vectors big enough for NUM_REGS registers,
2241 but don't make them smaller. */
2244 PREFIX(regex_grow_registers) (int num_regs)
2246 if (num_regs > regs_allocated_size)
2248 RETALLOC_IF (regstart, num_regs, const char *);
2249 RETALLOC_IF (regend, num_regs, const char *);
2250 RETALLOC_IF (old_regstart, num_regs, const char *);
2251 RETALLOC_IF (old_regend, num_regs, const char *);
2252 RETALLOC_IF (best_regstart, num_regs, const char *);
2253 RETALLOC_IF (best_regend, num_regs, const char *);
2254 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type));
2255 RETALLOC_IF (reg_dummy, num_regs, const char *);
2256 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type));
2258 regs_allocated_size = num_regs;
2262 # endif /* not MATCH_MAY_ALLOCATE */
2264 # ifndef DEFINED_ONCE
2265 static boolean group_in_compile_stack (compile_stack_type
2268 # endif /* not DEFINED_ONCE */
2270 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2271 Returns one of error codes defined in `regex.h', or zero for success.
2273 Assumes the `allocated' (and perhaps `buffer') and `translate'
2274 fields are set in BUFP on entry.
2276 If it succeeds, results are put in BUFP (if it returns an error, the
2277 contents of BUFP are undefined):
2278 `buffer' is the compiled pattern;
2279 `syntax' is set to SYNTAX;
2280 `used' is set to the length of the compiled pattern;
2281 `fastmap_accurate' is zero;
2282 `re_nsub' is the number of subexpressions in PATTERN;
2283 `not_bol' and `not_eol' are zero;
2285 The `fastmap' and `newline_anchor' fields are neither
2286 examined nor set. */
2288 /* Return, freeing storage we allocated. */
2290 # define FREE_STACK_RETURN(value) \
2291 return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value)
2293 # define FREE_STACK_RETURN(value) \
2294 return (free (compile_stack.stack), value)
2297 static reg_errcode_t
2298 PREFIX(regex_compile) (
2299 const char *ARG_PREFIX(pattern),
2300 size_t ARG_PREFIX(size),
2301 reg_syntax_t syntax,
2302 struct re_pattern_buffer *bufp)
2304 /* We fetch characters from PATTERN here. Even though PATTERN is
2305 `char *' (i.e., signed), we declare these variables as unsigned, so
2306 they can be reliably used as array indices. */
2307 register UCHAR_T c, c1;
2310 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2311 CHAR_T *pattern, *COMPILED_BUFFER_VAR;
2313 /* offset buffer for optimization. See convert_mbs_to_wc. */
2314 int *mbs_offset = NULL;
2315 /* It holds whether each wchar_t is binary data or not. */
2316 char *is_binary = NULL;
2317 /* A flag whether exactn is handling binary data or not. */
2318 char is_exactn_bin = FALSE;
2321 /* A random temporary spot in PATTERN. */
2324 /* Points to the end of the buffer, where we should append. */
2325 register UCHAR_T *b;
2327 /* Keeps track of unclosed groups. */
2328 compile_stack_type compile_stack;
2330 /* Points to the current (ending) position in the pattern. */
2335 const CHAR_T *p = pattern;
2336 const CHAR_T *pend = pattern + size;
2339 /* How to translate the characters in the pattern. */
2340 __RE_TRANSLATE_TYPE translate = bufp->translate;
2342 /* Address of the count-byte of the most recently inserted `exactn'
2343 command. This makes it possible to tell if a new exact-match
2344 character can be added to that command or if the character requires
2345 a new `exactn' command. */
2346 UCHAR_T *pending_exact = 0;
2348 /* Address of start of the most recently finished expression.
2349 This tells, e.g., postfix * where to find the start of its
2350 operand. Reset at the beginning of groups and alternatives. */
2351 UCHAR_T *laststart = 0;
2353 /* Address of beginning of regexp, or inside of last group. */
2356 /* Address of the place where a forward jump should go to the end of
2357 the containing expression. Each alternative of an `or' -- except the
2358 last -- ends with a forward jump of this sort. */
2359 UCHAR_T *fixup_alt_jump = 0;
2361 /* Counts open-groups as they are encountered. Remembered for the
2362 matching close-group on the compile stack, so the same register
2363 number is put in the stop_memory as the start_memory. */
2364 regnum_t regnum = 0;
2367 /* Initialize the wchar_t PATTERN and offset_buffer. */
2368 p = pend = pattern = TALLOC(csize + 1, CHAR_T);
2369 mbs_offset = TALLOC(csize + 1, int);
2370 is_binary = TALLOC(csize + 1, char);
2371 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2378 pattern[csize] = L'\0'; /* sentinel */
2379 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2391 DEBUG_PRINT1 ("\nCompiling pattern: ");
2394 unsigned debug_count;
2396 for (debug_count = 0; debug_count < size; debug_count++)
2397 PUT_CHAR (pattern[debug_count]);
2402 /* Initialize the compile stack. */
2403 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2404 if (compile_stack.stack == NULL)
2414 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2415 compile_stack.avail = 0;
2417 /* Initialize the pattern buffer. */
2418 bufp->syntax = syntax;
2419 bufp->fastmap_accurate = 0;
2420 bufp->not_bol = bufp->not_eol = 0;
2422 /* Set `used' to zero, so that if we return an error, the pattern
2423 printer (for debugging) will think there's no pattern. We reset it
2427 /* Always count groups, whether or not bufp->no_sub is set. */
2430 #if !defined emacs && !defined SYNTAX_TABLE
2431 /* Initialize the syntax table. */
2432 init_syntax_once ();
2435 if (bufp->allocated == 0)
2438 { /* If zero allocated, but buffer is non-null, try to realloc
2439 enough space. This loses if buffer's address is bogus, but
2440 that is the user's responsibility. */
2442 /* Free bufp->buffer and allocate an array for wchar_t pattern
2445 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T),
2448 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T);
2452 { /* Caller did not allocate a buffer. Do it for them. */
2453 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T),
2457 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2459 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2461 bufp->allocated = INIT_BUF_SIZE;
2465 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer;
2468 begalt = b = COMPILED_BUFFER_VAR;
2470 /* Loop through the uncompiled pattern until we're at the end. */
2479 if ( /* If at start of pattern, it's an operator. */
2481 /* If context independent, it's an operator. */
2482 || syntax & RE_CONTEXT_INDEP_ANCHORS
2483 /* Otherwise, depends on what's come before. */
2484 || PREFIX(at_begline_loc_p) (pattern, p, syntax))
2494 if ( /* If at end of pattern, it's an operator. */
2496 /* If context independent, it's an operator. */
2497 || syntax & RE_CONTEXT_INDEP_ANCHORS
2498 /* Otherwise, depends on what's next. */
2499 || PREFIX(at_endline_loc_p) (p, pend, syntax))
2509 if ((syntax & RE_BK_PLUS_QM)
2510 || (syntax & RE_LIMITED_OPS))
2514 /* If there is no previous pattern... */
2517 if (syntax & RE_CONTEXT_INVALID_OPS)
2518 FREE_STACK_RETURN (REG_BADRPT);
2519 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2524 /* Are we optimizing this jump? */
2525 boolean keep_string_p = false;
2527 /* 1 means zero (many) matches is allowed. */
2528 char zero_times_ok = 0, many_times_ok = 0;
2530 /* If there is a sequence of repetition chars, collapse it
2531 down to just one (the right one). We can't combine
2532 interval operators with these because of, e.g., `a{2}*',
2533 which should only match an even number of `a's. */
2537 zero_times_ok |= c != '+';
2538 many_times_ok |= c != '?';
2546 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2549 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2551 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2554 if (!(c1 == '+' || c1 == '?'))
2569 /* If we get here, we found another repeat character. */
2572 /* Star, etc. applied to an empty pattern is equivalent
2573 to an empty pattern. */
2577 /* Now we know whether or not zero matches is allowed
2578 and also whether or not two or more matches is allowed. */
2580 { /* More than one repetition is allowed, so put in at the
2581 end a backward relative jump from `b' to before the next
2582 jump we're going to put in below (which jumps from
2583 laststart to after this jump).
2585 But if we are at the `*' in the exact sequence `.*\n',
2586 insert an unconditional jump backwards to the .,
2587 instead of the beginning of the loop. This way we only
2588 push a failure point once, instead of every time
2589 through the loop. */
2590 assert (p - 1 > pattern);
2592 /* Allocate the space for the jump. */
2593 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2595 /* We know we are not at the first character of the pattern,
2596 because laststart was nonzero. And we've already
2597 incremented `p', by the way, to be the character after
2598 the `*'. Do we have to do something analogous here
2599 for null bytes, because of RE_DOT_NOT_NULL? */
2600 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2602 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2603 && !(syntax & RE_DOT_NEWLINE))
2604 { /* We have .*\n. */
2605 STORE_JUMP (jump, b, laststart);
2606 keep_string_p = true;
2609 /* Anything else. */
2610 STORE_JUMP (maybe_pop_jump, b, laststart -
2611 (1 + OFFSET_ADDRESS_SIZE));
2613 /* We've added more stuff to the buffer. */
2614 b += 1 + OFFSET_ADDRESS_SIZE;
2617 /* On failure, jump from laststart to b + 3, which will be the
2618 end of the buffer after this jump is inserted. */
2619 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2621 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2622 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2624 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2626 b += 1 + OFFSET_ADDRESS_SIZE;
2630 /* At least one repetition is required, so insert a
2631 `dummy_failure_jump' before the initial
2632 `on_failure_jump' instruction of the loop. This
2633 effects a skip over that instruction the first time
2634 we hit that loop. */
2635 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2636 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2637 2 + 2 * OFFSET_ADDRESS_SIZE);
2638 b += 1 + OFFSET_ADDRESS_SIZE;
2652 boolean had_char_class = false;
2654 CHAR_T range_start = 0xffffffff;
2656 unsigned int range_start = 0xffffffff;
2658 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2661 /* We assume a charset(_not) structure as a wchar_t array.
2662 charset[0] = (re_opcode_t) charset(_not)
2663 charset[1] = l (= length of char_classes)
2664 charset[2] = m (= length of collating_symbols)
2665 charset[3] = n (= length of equivalence_classes)
2666 charset[4] = o (= length of char_ranges)
2667 charset[5] = p (= length of chars)
2669 charset[6] = char_class (wctype_t)
2670 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
2672 charset[l+5] = char_class (wctype_t)
2674 charset[l+6] = collating_symbol (wchar_t)
2676 charset[l+m+5] = collating_symbol (wchar_t)
2677 ifdef _LIBC we use the index if
2678 _NL_COLLATE_SYMB_EXTRAMB instead of
2681 charset[l+m+6] = equivalence_classes (wchar_t)
2683 charset[l+m+n+5] = equivalence_classes (wchar_t)
2684 ifdef _LIBC we use the index in
2685 _NL_COLLATE_WEIGHT instead of
2688 charset[l+m+n+6] = range_start
2689 charset[l+m+n+7] = range_end
2691 charset[l+m+n+2o+4] = range_start
2692 charset[l+m+n+2o+5] = range_end
2693 ifdef _LIBC we use the value looked up
2694 in _NL_COLLATE_COLLSEQ instead of
2697 charset[l+m+n+2o+6] = char
2699 charset[l+m+n+2o+p+5] = char
2703 /* We need at least 6 spaces: the opcode, the length of
2704 char_classes, the length of collating_symbols, the length of
2705 equivalence_classes, the length of char_ranges, the length of
2707 GET_BUFFER_SPACE (6);
2709 /* Save b as laststart. We also use laststart as the pointer
2710 to the first element of the charset here.
2711 In other words, laststart[i] indicates charset[i]. */
2714 /* We test `*p == '^' twice, instead of using an if
2715 statement, so we only need one BUF_PUSH. */
2716 BUF_PUSH (*p == '^' ? charset_not : charset);
2720 /* Push the length of char_classes, the length of
2721 collating_symbols, the length of equivalence_classes, the
2722 length of char_ranges and the length of chars. */
2723 BUF_PUSH_3 (0, 0, 0);
2726 /* Remember the first position in the bracket expression. */
2729 /* charset_not matches newline according to a syntax bit. */
2730 if ((re_opcode_t) b[-6] == charset_not
2731 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2734 laststart[5]++; /* Update the length of characters */
2737 /* Read in characters and ranges, setting map bits. */
2740 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2744 /* \ might escape characters inside [...] and [^...]. */
2745 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2747 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2751 laststart[5]++; /* Update the length of chars */
2756 /* Could be the end of the bracket expression. If it's
2757 not (i.e., when the bracket expression is `[]' so
2758 far), the ']' character bit gets set way below. */
2759 if (c == ']' && p != p1 + 1)
2762 /* Look ahead to see if it's a range when the last thing
2763 was a character class. */
2764 if (had_char_class && c == '-' && *p != ']')
2765 FREE_STACK_RETURN (REG_ERANGE);
2767 /* Look ahead to see if it's a range when the last thing
2768 was a character: if this is a hyphen not at the
2769 beginning or the end of a list, then it's the range
2772 && !(p - 2 >= pattern && p[-2] == '[')
2773 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2777 /* Allocate the space for range_start and range_end. */
2778 GET_BUFFER_SPACE (2);
2779 /* Update the pointer to indicate end of buffer. */
2781 ret = wcs_compile_range (range_start, &p, pend, translate,
2782 syntax, b, laststart);
2783 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2784 range_start = 0xffffffff;
2786 else if (p[0] == '-' && p[1] != ']')
2787 { /* This handles ranges made up of characters only. */
2790 /* Move past the `-'. */
2792 /* Allocate the space for range_start and range_end. */
2793 GET_BUFFER_SPACE (2);
2794 /* Update the pointer to indicate end of buffer. */
2796 ret = wcs_compile_range (c, &p, pend, translate, syntax, b,
2798 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2799 range_start = 0xffffffff;
2802 /* See if we're at the beginning of a possible character
2804 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2805 { /* Leave room for the null. */
2806 char str[CHAR_CLASS_MAX_LENGTH + 1];
2811 /* If pattern is `[[:'. */
2812 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2817 if ((c == ':' && *p == ']') || p == pend)
2819 if (c1 < CHAR_CLASS_MAX_LENGTH)
2822 /* This is in any case an invalid class name. */
2827 /* If isn't a word bracketed by `[:' and `:]':
2828 undo the ending character, the letters, and leave
2829 the leading `:' and `[' (but store them as character). */
2830 if (c == ':' && *p == ']')
2835 /* Query the character class as wctype_t. */
2836 wt = IS_CHAR_CLASS (str);
2838 FREE_STACK_RETURN (REG_ECTYPE);
2840 /* Throw away the ] at the end of the character
2844 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2846 /* Allocate the space for character class. */
2847 GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
2848 /* Update the pointer to indicate end of buffer. */
2849 b += CHAR_CLASS_SIZE;
2850 /* Move data which follow character classes
2851 not to violate the data. */
2852 insert_space(CHAR_CLASS_SIZE,
2853 laststart + 6 + laststart[1],
2855 alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
2856 + __alignof__(wctype_t) - 1)
2857 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
2858 /* Store the character class. */
2859 *((wctype_t*)alignedp) = wt;
2860 /* Update length of char_classes */
2861 laststart[1] += CHAR_CLASS_SIZE;
2863 had_char_class = true;
2872 laststart[5] += 2; /* Update the length of characters */
2874 had_char_class = false;
2877 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2880 CHAR_T str[128]; /* Should be large enough. */
2881 CHAR_T delim = *p; /* '=' or '.' */
2884 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2889 /* If pattern is `[[=' or '[[.'. */
2890 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2895 if ((c == delim && *p == ']') || p == pend)
2897 if (c1 < sizeof (str) - 1)
2900 /* This is in any case an invalid class name. */
2905 if (c == delim && *p == ']' && str[0] != '\0')
2907 unsigned int i, offset;
2908 /* If we have no collation data we use the default
2909 collation in which each character is in a class
2910 by itself. It also means that ASCII is the
2911 character set and therefore we cannot have character
2912 with more than one byte in the multibyte
2915 /* If not defined _LIBC, we push the name and
2916 `\0' for the sake of matching performance. */
2917 int datasize = c1 + 1;
2925 FREE_STACK_RETURN (REG_ECOLLATE);
2930 const int32_t *table;
2931 const int32_t *weights;
2932 const int32_t *extra;
2933 const int32_t *indirect;
2936 /* This #include defines a local function! */
2937 # include <locale/weightwc.h>
2941 /* We push the index for equivalence class. */
2944 table = (const int32_t *)
2945 _NL_CURRENT (LC_COLLATE,
2946 _NL_COLLATE_TABLEWC);
2947 weights = (const int32_t *)
2948 _NL_CURRENT (LC_COLLATE,
2949 _NL_COLLATE_WEIGHTWC);
2950 extra = (const int32_t *)
2951 _NL_CURRENT (LC_COLLATE,
2952 _NL_COLLATE_EXTRAWC);
2953 indirect = (const int32_t *)
2954 _NL_CURRENT (LC_COLLATE,
2955 _NL_COLLATE_INDIRECTWC);
2957 idx = findidx ((const wint_t**)&cp);
2958 if (idx == 0 || cp < (wint_t*) str + c1)
2959 /* This is no valid character. */
2960 FREE_STACK_RETURN (REG_ECOLLATE);
2962 str[0] = (wchar_t)idx;
2964 else /* delim == '.' */
2966 /* We push collation sequence value
2967 for collating symbol. */
2969 const int32_t *symb_table;
2970 const unsigned char *extra;
2977 /* We have to convert the name to a single-byte
2978 string. This is possible since the names
2979 consist of ASCII characters and the internal
2980 representation is UCS4. */
2981 for (i = 0; i < c1; ++i)
2982 char_str[i] = str[i];
2985 _NL_CURRENT_WORD (LC_COLLATE,
2986 _NL_COLLATE_SYMB_HASH_SIZEMB);
2987 symb_table = (const int32_t *)
2988 _NL_CURRENT (LC_COLLATE,
2989 _NL_COLLATE_SYMB_TABLEMB);
2990 extra = (const unsigned char *)
2991 _NL_CURRENT (LC_COLLATE,
2992 _NL_COLLATE_SYMB_EXTRAMB);
2994 /* Locate the character in the hashing table. */
2995 hash = elem_hash (char_str, c1);
2998 elem = hash % table_size;
2999 second = hash % (table_size - 2);
3000 while (symb_table[2 * elem] != 0)
3002 /* First compare the hashing value. */
3003 if (symb_table[2 * elem] == hash
3004 && c1 == extra[symb_table[2 * elem + 1]]
3005 && memcmp (char_str,
3006 &extra[symb_table[2 * elem + 1]
3009 /* Yep, this is the entry. */
3010 idx = symb_table[2 * elem + 1];
3011 idx += 1 + extra[idx];
3019 if (symb_table[2 * elem] != 0)
3021 /* Compute the index of the byte sequence
3023 idx += 1 + extra[idx];
3024 /* Adjust for the alignment. */
3025 idx = (idx + 3) & ~3;
3027 str[0] = (wchar_t) idx + 4;
3029 else if (symb_table[2 * elem] == 0 && c1 == 1)
3031 /* No valid character. Match it as a
3032 single byte character. */
3033 had_char_class = false;
3035 /* Update the length of characters */
3037 range_start = str[0];
3039 /* Throw away the ] at the end of the
3040 collating symbol. */
3042 /* exit from the switch block. */
3046 FREE_STACK_RETURN (REG_ECOLLATE);
3051 /* Throw away the ] at the end of the equivalence
3052 class (or collating symbol). */
3055 /* Allocate the space for the equivalence class
3056 (or collating symbol) (and '\0' if needed). */
3057 GET_BUFFER_SPACE(datasize);
3058 /* Update the pointer to indicate end of buffer. */
3062 { /* equivalence class */
3063 /* Calculate the offset of char_ranges,
3064 which is next to equivalence_classes. */
3065 offset = laststart[1] + laststart[2]
3068 insert_space(datasize, laststart + offset, b - 1);
3070 /* Write the equivalence_class and \0. */
3071 for (i = 0 ; i < datasize ; i++)
3072 laststart[offset + i] = str[i];
3074 /* Update the length of equivalence_classes. */
3075 laststart[3] += datasize;
3076 had_char_class = true;
3078 else /* delim == '.' */
3079 { /* collating symbol */
3080 /* Calculate the offset of the equivalence_classes,
3081 which is next to collating_symbols. */
3082 offset = laststart[1] + laststart[2] + 6;
3083 /* Insert space and write the collating_symbol
3085 insert_space(datasize, laststart + offset, b-1);
3086 for (i = 0 ; i < datasize ; i++)
3087 laststart[offset + i] = str[i];
3089 /* In re_match_2_internal if range_start < -1, we
3090 assume -range_start is the offset of the
3091 collating symbol which is specified as
3092 the character of the range start. So we assign
3093 -(laststart[1] + laststart[2] + 6) to
3095 range_start = -(laststart[1] + laststart[2] + 6);
3096 /* Update the length of collating_symbol. */
3097 laststart[2] += datasize;
3098 had_char_class = false;
3108 laststart[5] += 2; /* Update the length of characters */
3109 range_start = delim;
3110 had_char_class = false;
3115 had_char_class = false;
3117 laststart[5]++; /* Update the length of characters */
3123 /* Ensure that we have enough space to push a charset: the
3124 opcode, the length count, and the bitset; 34 bytes in all. */
3125 GET_BUFFER_SPACE (34);
3129 /* We test `*p == '^' twice, instead of using an if
3130 statement, so we only need one BUF_PUSH. */
3131 BUF_PUSH (*p == '^' ? charset_not : charset);
3135 /* Remember the first position in the bracket expression. */
3138 /* Push the number of bytes in the bitmap. */
3139 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3141 /* Clear the whole map. */
3142 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3144 /* charset_not matches newline according to a syntax bit. */
3145 if ((re_opcode_t) b[-2] == charset_not
3146 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3147 SET_LIST_BIT ('\n');
3149 /* Read in characters and ranges, setting map bits. */
3152 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3156 /* \ might escape characters inside [...] and [^...]. */
3157 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3159 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3167 /* Could be the end of the bracket expression. If it's
3168 not (i.e., when the bracket expression is `[]' so
3169 far), the ']' character bit gets set way below. */
3170 if (c == ']' && p != p1 + 1)
3173 /* Look ahead to see if it's a range when the last thing
3174 was a character class. */
3175 if (had_char_class && c == '-' && *p != ']')
3176 FREE_STACK_RETURN (REG_ERANGE);
3178 /* Look ahead to see if it's a range when the last thing
3179 was a character: if this is a hyphen not at the
3180 beginning or the end of a list, then it's the range
3183 && !(p - 2 >= pattern && p[-2] == '[')
3184 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3188 = byte_compile_range (range_start, &p, pend, translate,
3190 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3191 range_start = 0xffffffff;
3194 else if (p[0] == '-' && p[1] != ']')
3195 { /* This handles ranges made up of characters only. */
3198 /* Move past the `-'. */
3201 ret = byte_compile_range (c, &p, pend, translate, syntax, b);
3202 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3203 range_start = 0xffffffff;
3206 /* See if we're at the beginning of a possible character
3209 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3210 { /* Leave room for the null. */
3211 char str[CHAR_CLASS_MAX_LENGTH + 1];
3216 /* If pattern is `[[:'. */
3217 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3222 if ((c == ':' && *p == ']') || p == pend)
3224 #if CHAR_CLASS_MAX_LENGTH != 256
3225 if (c1 < CHAR_CLASS_MAX_LENGTH)
3228 /* This is in any case an invalid class name. */
3236 /* If isn't a word bracketed by `[:' and `:]':
3237 undo the ending character, the letters, and leave
3238 the leading `:' and `[' (but set bits for them). */
3239 if (c == ':' && *p == ']')
3241 # if defined _LIBC || defined WIDE_CHAR_SUPPORT
3242 boolean is_lower = STREQ (str, "lower");
3243 boolean is_upper = STREQ (str, "upper");
3247 wt = IS_CHAR_CLASS (str);
3249 FREE_STACK_RETURN (REG_ECTYPE);
3251 /* Throw away the ] at the end of the character
3255 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3257 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3260 if (__iswctype (__btowc (ch), wt))
3263 if (iswctype (btowc (ch), wt))
3267 if (translate && (is_upper || is_lower)
3268 && (ISUPPER (ch) || ISLOWER (ch)))
3272 had_char_class = true;
3275 boolean is_alnum = STREQ (str, "alnum");
3276 boolean is_alpha = STREQ (str, "alpha");
3277 boolean is_blank = STREQ (str, "blank");
3278 boolean is_cntrl = STREQ (str, "cntrl");
3279 boolean is_digit = STREQ (str, "digit");
3280 boolean is_graph = STREQ (str, "graph");
3281 boolean is_lower = STREQ (str, "lower");
3282 boolean is_print = STREQ (str, "print");
3283 boolean is_punct = STREQ (str, "punct");
3284 boolean is_space = STREQ (str, "space");
3285 boolean is_upper = STREQ (str, "upper");
3286 boolean is_xdigit = STREQ (str, "xdigit");
3288 if (!IS_CHAR_CLASS (str))
3289 FREE_STACK_RETURN (REG_ECTYPE);
3291 /* Throw away the ] at the end of the character
3295 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3297 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3299 /* This was split into 3 if's to
3300 avoid an arbitrary limit in some compiler. */
3301 if ( (is_alnum && ISALNUM (ch))
3302 || (is_alpha && ISALPHA (ch))
3303 || (is_blank && ISBLANK (ch))
3304 || (is_cntrl && ISCNTRL (ch)))
3306 if ( (is_digit && ISDIGIT (ch))
3307 || (is_graph && ISGRAPH (ch))
3308 || (is_lower && ISLOWER (ch))
3309 || (is_print && ISPRINT (ch)))
3311 if ( (is_punct && ISPUNCT (ch))
3312 || (is_space && ISSPACE (ch))
3313 || (is_upper && ISUPPER (ch))
3314 || (is_xdigit && ISXDIGIT (ch)))
3316 if ( translate && (is_upper || is_lower)
3317 && (ISUPPER (ch) || ISLOWER (ch)))
3320 had_char_class = true;
3321 # endif /* libc || wctype.h */
3331 had_char_class = false;
3334 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3336 unsigned char str[MB_LEN_MAX + 1];
3339 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3345 /* If pattern is `[[='. */
3346 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3351 if ((c == '=' && *p == ']') || p == pend)
3353 if (c1 < MB_LEN_MAX)
3356 /* This is in any case an invalid class name. */
3361 if (c == '=' && *p == ']' && str[0] != '\0')
3363 /* If we have no collation data we use the default
3364 collation in which each character is in a class
3365 by itself. It also means that ASCII is the
3366 character set and therefore we cannot have character
3367 with more than one byte in the multibyte
3374 FREE_STACK_RETURN (REG_ECOLLATE);
3376 /* Throw away the ] at the end of the equivalence
3380 /* Set the bit for the character. */
3381 SET_LIST_BIT (str[0]);
3386 /* Try to match the byte sequence in `str' against
3387 those known to the collate implementation.
3388 First find out whether the bytes in `str' are
3389 actually from exactly one character. */
3390 const int32_t *table;
3391 const unsigned char *weights;
3392 const unsigned char *extra;
3393 const int32_t *indirect;
3395 const unsigned char *cp = str;
3398 /* This #include defines a local function! */
3399 # include <locale/weight.h>
3401 table = (const int32_t *)
3402 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3403 weights = (const unsigned char *)
3404 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3405 extra = (const unsigned char *)
3406 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3407 indirect = (const int32_t *)
3408 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3410 idx = findidx (&cp);
3411 if (idx == 0 || cp < str + c1)
3412 /* This is no valid character. */
3413 FREE_STACK_RETURN (REG_ECOLLATE);
3415 /* Throw away the ] at the end of the equivalence
3419 /* Now we have to go through the whole table
3420 and find all characters which have the same
3423 XXX Note that this is not entirely correct.
3424 we would have to match multibyte sequences
3425 but this is not possible with the current
3427 for (ch = 1; ch < 256; ++ch)
3428 /* XXX This test would have to be changed if we
3429 would allow matching multibyte sequences. */
3432 int32_t idx2 = table[ch];
3433 size_t len = weights[idx2];
3435 /* Test whether the lengths match. */
3436 if (weights[idx] == len)
3438 /* They do. Now compare the bytes of
3443 && (weights[idx + 1 + cnt]
3444 == weights[idx2 + 1 + cnt]))
3448 /* They match. Mark the character as
3455 had_char_class = true;
3465 had_char_class = false;
3468 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3470 unsigned char str[128]; /* Should be large enough. */
3473 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3479 /* If pattern is `[[.'. */
3480 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3485 if ((c == '.' && *p == ']') || p == pend)
3487 if (c1 < sizeof (str))
3490 /* This is in any case an invalid class name. */
3495 if (c == '.' && *p == ']' && str[0] != '\0')
3497 /* If we have no collation data we use the default
3498 collation in which each character is the name
3499 for its own class which contains only the one
3500 character. It also means that ASCII is the
3501 character set and therefore we cannot have character
3502 with more than one byte in the multibyte
3509 FREE_STACK_RETURN (REG_ECOLLATE);
3511 /* Throw away the ] at the end of the equivalence
3515 /* Set the bit for the character. */
3516 SET_LIST_BIT (str[0]);
3517 range_start = ((const unsigned char *) str)[0];
3522 /* Try to match the byte sequence in `str' against
3523 those known to the collate implementation.
3524 First find out whether the bytes in `str' are
3525 actually from exactly one character. */
3527 const int32_t *symb_table;
3528 const unsigned char *extra;
3535 _NL_CURRENT_WORD (LC_COLLATE,
3536 _NL_COLLATE_SYMB_HASH_SIZEMB);
3537 symb_table = (const int32_t *)
3538 _NL_CURRENT (LC_COLLATE,
3539 _NL_COLLATE_SYMB_TABLEMB);
3540 extra = (const unsigned char *)
3541 _NL_CURRENT (LC_COLLATE,
3542 _NL_COLLATE_SYMB_EXTRAMB);
3544 /* Locate the character in the hashing table. */
3545 hash = elem_hash (str, c1);
3548 elem = hash % table_size;
3549 second = hash % (table_size - 2);
3550 while (symb_table[2 * elem] != 0)
3552 /* First compare the hashing value. */
3553 if (symb_table[2 * elem] == hash
3554 && c1 == extra[symb_table[2 * elem + 1]]
3556 &extra[symb_table[2 * elem + 1]
3560 /* Yep, this is the entry. */
3561 idx = symb_table[2 * elem + 1];
3562 idx += 1 + extra[idx];
3570 if (symb_table[2 * elem] == 0)
3571 /* This is no valid character. */
3572 FREE_STACK_RETURN (REG_ECOLLATE);
3574 /* Throw away the ] at the end of the equivalence
3578 /* Now add the multibyte character(s) we found
3581 XXX Note that this is not entirely correct.
3582 we would have to match multibyte sequences
3583 but this is not possible with the current
3584 implementation. Also, we have to match
3585 collating symbols, which expand to more than
3586 one file, as a whole and not allow the
3587 individual bytes. */
3590 range_start = extra[idx];
3593 SET_LIST_BIT (extra[idx]);
3598 had_char_class = false;
3608 had_char_class = false;
3613 had_char_class = false;
3619 /* Discard any (non)matching list bytes that are all 0 at the
3620 end of the map. Decrease the map-length byte too. */
3621 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3630 if (syntax & RE_NO_BK_PARENS)
3637 if (syntax & RE_NO_BK_PARENS)
3644 if (syntax & RE_NEWLINE_ALT)
3651 if (syntax & RE_NO_BK_VBAR)
3658 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3659 goto handle_interval;
3665 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3667 /* Do not translate the character after the \, so that we can
3668 distinguish, e.g., \B from \b, even if we normally would
3669 translate, e.g., B to b. */
3675 if (syntax & RE_NO_BK_PARENS)
3676 goto normal_backslash;
3682 if (COMPILE_STACK_FULL)
3684 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3685 compile_stack_elt_t);
3686 if (compile_stack.stack == NULL) return REG_ESPACE;
3688 compile_stack.size <<= 1;
3691 /* These are the values to restore when we hit end of this
3692 group. They are all relative offsets, so that if the
3693 whole pattern moves because of realloc, they will still
3695 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3696 COMPILE_STACK_TOP.fixup_alt_jump
3697 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3698 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3699 COMPILE_STACK_TOP.regnum = regnum;
3701 /* We will eventually replace the 0 with the number of
3702 groups inner to this one. But do not push a
3703 start_memory for groups beyond the last one we can
3704 represent in the compiled pattern. */
3705 if (regnum <= MAX_REGNUM)
3707 COMPILE_STACK_TOP.inner_group_offset = b
3708 - COMPILED_BUFFER_VAR + 2;
3709 BUF_PUSH_3 (start_memory, regnum, 0);
3712 compile_stack.avail++;
3717 /* If we've reached MAX_REGNUM groups, then this open
3718 won't actually generate any code, so we'll have to
3719 clear pending_exact explicitly. */
3725 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3727 if (COMPILE_STACK_EMPTY)
3729 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3730 goto normal_backslash;
3732 FREE_STACK_RETURN (REG_ERPAREN);
3737 { /* Push a dummy failure point at the end of the
3738 alternative for a possible future
3739 `pop_failure_jump' to pop. See comments at
3740 `push_dummy_failure' in `re_match_2'. */
3741 BUF_PUSH (push_dummy_failure);
3743 /* We allocated space for this jump when we assigned
3744 to `fixup_alt_jump', in the `handle_alt' case below. */
3745 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3748 /* See similar code for backslashed left paren above. */
3749 if (COMPILE_STACK_EMPTY)
3751 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3754 FREE_STACK_RETURN (REG_ERPAREN);
3757 /* Since we just checked for an empty stack above, this
3758 ``can't happen''. */
3759 assert (compile_stack.avail != 0);
3761 /* We don't just want to restore into `regnum', because
3762 later groups should continue to be numbered higher,
3763 as in `(ab)c(de)' -- the second group is #2. */
3764 regnum_t this_group_regnum;
3766 compile_stack.avail--;
3767 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3769 = COMPILE_STACK_TOP.fixup_alt_jump
3770 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3772 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3773 this_group_regnum = COMPILE_STACK_TOP.regnum;
3774 /* If we've reached MAX_REGNUM groups, then this open
3775 won't actually generate any code, so we'll have to
3776 clear pending_exact explicitly. */
3779 /* We're at the end of the group, so now we know how many
3780 groups were inside this one. */
3781 if (this_group_regnum <= MAX_REGNUM)
3783 UCHAR_T *inner_group_loc
3784 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3786 *inner_group_loc = regnum - this_group_regnum;
3787 BUF_PUSH_3 (stop_memory, this_group_regnum,
3788 regnum - this_group_regnum);
3794 case '|': /* `\|'. */
3795 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3796 goto normal_backslash;
3798 if (syntax & RE_LIMITED_OPS)
3801 /* Insert before the previous alternative a jump which
3802 jumps to this alternative if the former fails. */
3803 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3804 INSERT_JUMP (on_failure_jump, begalt,
3805 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3807 b += 1 + OFFSET_ADDRESS_SIZE;
3809 /* The alternative before this one has a jump after it
3810 which gets executed if it gets matched. Adjust that
3811 jump so it will jump to this alternative's analogous
3812 jump (put in below, which in turn will jump to the next
3813 (if any) alternative's such jump, etc.). The last such
3814 jump jumps to the correct final destination. A picture:
3820 If we are at `b', then fixup_alt_jump right now points to a
3821 three-byte space after `a'. We'll put in the jump, set
3822 fixup_alt_jump to right after `b', and leave behind three
3823 bytes which we'll fill in when we get to after `c'. */
3826 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3828 /* Mark and leave space for a jump after this alternative,
3829 to be filled in later either by next alternative or
3830 when we know we're at the end of a series of alternatives. */
3832 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3833 b += 1 + OFFSET_ADDRESS_SIZE;
3841 /* If \{ is a literal. */
3842 if (!(syntax & RE_INTERVALS)
3843 /* If we're at `\{' and it's not the open-interval
3845 || (syntax & RE_NO_BK_BRACES))
3846 goto normal_backslash;
3850 /* If got here, then the syntax allows intervals. */
3852 /* At least (most) this many matches must be made. */
3853 int lower_bound = -1, upper_bound = -1;
3855 /* Place in the uncompiled pattern (i.e., just after
3856 the '{') to go back to if the interval is invalid. */
3857 const CHAR_T *beg_interval = p;
3860 goto invalid_interval;
3862 GET_UNSIGNED_NUMBER (lower_bound);
3866 GET_UNSIGNED_NUMBER (upper_bound);
3867 if (upper_bound < 0)
3868 upper_bound = RE_DUP_MAX;
3871 /* Interval such as `{1}' => match exactly once. */
3872 upper_bound = lower_bound;
3874 if (! (0 <= lower_bound && lower_bound <= upper_bound))
3875 goto invalid_interval;
3877 if (!(syntax & RE_NO_BK_BRACES))
3879 if (c != '\\' || p == pend)
3880 goto invalid_interval;
3885 goto invalid_interval;
3887 /* If it's invalid to have no preceding re. */
3890 if (syntax & RE_CONTEXT_INVALID_OPS
3891 && !(syntax & RE_INVALID_INTERVAL_ORD))
3892 FREE_STACK_RETURN (REG_BADRPT);
3893 else if (syntax & RE_CONTEXT_INDEP_OPS)
3896 goto unfetch_interval;
3899 /* We just parsed a valid interval. */
3901 if (RE_DUP_MAX < upper_bound)
3902 FREE_STACK_RETURN (REG_BADBR);
3904 /* If the upper bound is zero, don't want to succeed at
3905 all; jump from `laststart' to `b + 3', which will be
3906 the end of the buffer after we insert the jump. */
3907 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE'
3908 instead of 'b + 3'. */
3909 if (upper_bound == 0)
3911 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3912 INSERT_JUMP (jump, laststart, b + 1
3913 + OFFSET_ADDRESS_SIZE);
3914 b += 1 + OFFSET_ADDRESS_SIZE;
3917 /* Otherwise, we have a nontrivial interval. When
3918 we're all done, the pattern will look like:
3919 set_number_at <jump count> <upper bound>
3920 set_number_at <succeed_n count> <lower bound>
3921 succeed_n <after jump addr> <succeed_n count>
3923 jump_n <succeed_n addr> <jump count>
3924 (The upper bound and `jump_n' are omitted if
3925 `upper_bound' is 1, though.) */
3927 { /* If the upper bound is > 1, we need to insert
3928 more at the end of the loop. */
3929 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
3930 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
3932 GET_BUFFER_SPACE (nbytes);
3934 /* Initialize lower bound of the `succeed_n', even
3935 though it will be set during matching by its
3936 attendant `set_number_at' (inserted next),
3937 because `re_compile_fastmap' needs to know.
3938 Jump to the `jump_n' we might insert below. */
3939 INSERT_JUMP2 (succeed_n, laststart,
3940 b + 1 + 2 * OFFSET_ADDRESS_SIZE
3941 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
3943 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3945 /* Code to initialize the lower bound. Insert
3946 before the `succeed_n'. The `5' is the last two
3947 bytes of this `set_number_at', plus 3 bytes of
3948 the following `succeed_n'. */
3949 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE'
3950 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
3951 of the following `succeed_n'. */
3952 PREFIX(insert_op2) (set_number_at, laststart, 1
3953 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
3954 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3956 if (upper_bound > 1)
3957 { /* More than one repetition is allowed, so
3958 append a backward jump to the `succeed_n'
3959 that starts this interval.
3961 When we've reached this during matching,
3962 we'll have matched the interval once, so
3963 jump back only `upper_bound - 1' times. */
3964 STORE_JUMP2 (jump_n, b, laststart
3965 + 2 * OFFSET_ADDRESS_SIZE + 1,
3967 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3969 /* The location we want to set is the second
3970 parameter of the `jump_n'; that is `b-2' as
3971 an absolute address. `laststart' will be
3972 the `set_number_at' we're about to insert;
3973 `laststart+3' the number to set, the source
3974 for the relative address. But we are
3975 inserting into the middle of the pattern --
3976 so everything is getting moved up by 5.
3977 Conclusion: (b - 2) - (laststart + 3) + 5,
3978 i.e., b - laststart.
3980 We insert this at the beginning of the loop
3981 so that if we fail during matching, we'll
3982 reinitialize the bounds. */
3983 PREFIX(insert_op2) (set_number_at, laststart,
3985 upper_bound - 1, b);
3986 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3993 if (!(syntax & RE_INVALID_INTERVAL_ORD))
3994 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
3996 /* Match the characters as literals. */
3999 if (syntax & RE_NO_BK_BRACES)
4002 goto normal_backslash;
4006 /* There is no way to specify the before_dot and after_dot
4007 operators. rms says this is ok. --karl */
4015 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
4021 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
4027 if (syntax & RE_NO_GNU_OPS)
4030 BUF_PUSH (wordchar);
4035 if (syntax & RE_NO_GNU_OPS)
4038 BUF_PUSH (notwordchar);
4043 if (syntax & RE_NO_GNU_OPS)
4049 if (syntax & RE_NO_GNU_OPS)
4055 if (syntax & RE_NO_GNU_OPS)
4057 BUF_PUSH (wordbound);
4061 if (syntax & RE_NO_GNU_OPS)
4063 BUF_PUSH (notwordbound);
4067 if (syntax & RE_NO_GNU_OPS)
4073 if (syntax & RE_NO_GNU_OPS)
4078 case '1': case '2': case '3': case '4': case '5':
4079 case '6': case '7': case '8': case '9':
4080 if (syntax & RE_NO_BK_REFS)
4086 FREE_STACK_RETURN (REG_ESUBREG);
4088 /* Can't back reference to a subexpression if inside of it. */
4089 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4093 BUF_PUSH_2 (duplicate, c1);
4099 if (syntax & RE_BK_PLUS_QM)
4102 goto normal_backslash;
4106 /* You might think it would be useful for \ to mean
4107 not to translate; but if we don't translate it
4108 it will never match anything. */
4116 /* Expects the character in `c'. */
4118 /* If no exactn currently being built. */
4121 /* If the last exactn handles binary data (or characters) and
4122 the new exactn handles characters (or binary data). */
4123 || is_exactn_bin != is_binary[p - 1 - pattern]
4126 /* If last exactn not at current position. */
4127 || pending_exact + *pending_exact + 1 != b
4129 /* We have only one byte following the exactn for the count. */
4130 || *pending_exact == (1 << BYTEWIDTH) - 1
4132 /* If followed by a repetition operator. */
4133 || *p == '*' || *p == '^'
4134 || ((syntax & RE_BK_PLUS_QM)
4135 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4136 : (*p == '+' || *p == '?'))
4137 || ((syntax & RE_INTERVALS)
4138 && ((syntax & RE_NO_BK_BRACES)
4140 : (p[0] == '\\' && p[1] == '{'))))
4142 /* Start building a new exactn. */
4147 /* Is this exactn binary data or character? */
4148 is_exactn_bin = is_binary[p - 1 - pattern];
4150 BUF_PUSH_2 (exactn_bin, 0);
4152 BUF_PUSH_2 (exactn, 0);
4154 BUF_PUSH_2 (exactn, 0);
4156 pending_exact = b - 1;
4163 } /* while p != pend */
4166 /* Through the pattern now. */
4169 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4171 if (!COMPILE_STACK_EMPTY)
4172 FREE_STACK_RETURN (REG_EPAREN);
4174 /* If we don't want backtracking, force success
4175 the first time we reach the end of the compiled pattern. */
4176 if (syntax & RE_NO_POSIX_BACKTRACKING)
4184 free (compile_stack.stack);
4186 /* We have succeeded; set the length of the buffer. */
4188 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
4190 bufp->used = b - bufp->buffer;
4196 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4197 PREFIX(print_compiled_pattern) (bufp);
4201 #ifndef MATCH_MAY_ALLOCATE
4202 /* Initialize the failure stack to the largest possible stack. This
4203 isn't necessary unless we're trying to avoid calling alloca in
4204 the search and match routines. */
4206 int num_regs = bufp->re_nsub + 1;
4208 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4209 is strictly greater than re_max_failures, the largest possible stack
4210 is 2 * re_max_failures failure points. */
4211 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4213 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4216 if (! fail_stack.stack)
4218 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size
4219 * sizeof (PREFIX(fail_stack_elt_t)));
4222 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack,
4224 * sizeof (PREFIX(fail_stack_elt_t))));
4225 # else /* not emacs */
4226 if (! fail_stack.stack)
4228 = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size
4229 * sizeof (PREFIX(fail_stack_elt_t)));
4232 = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack,
4234 * sizeof (PREFIX(fail_stack_elt_t))));
4235 # endif /* not emacs */
4238 PREFIX(regex_grow_registers) (num_regs);
4240 #endif /* not MATCH_MAY_ALLOCATE */
4243 } /* regex_compile */
4245 /* Subroutines for `regex_compile'. */
4247 /* Store the opcode OP at LOC, followed by the two-byte integer parameter ARG. */
4248 /* If WCHAR is defined, the integer parameter is one wchar_t wide instead. */
4256 *loc = (UCHAR_T) op; /* The opcode occupies the first element at LOC. */
4257 STORE_NUMBER (loc + 1, arg); /* ARG is written directly after the opcode. */
4261 /* Like `store_op1', but stores two two-byte parameters, ARG1 and ARG2, after OP. */
4262 /* If WCHAR is defined, each integer parameter is one wchar_t wide instead. */
4270 *loc = (UCHAR_T) op; /* The opcode occupies the first element at LOC. */
4271 STORE_NUMBER (loc + 1, arg1); /* ARG1 directly follows the opcode. */
4272 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2); /* ARG2 is stored OFFSET_ADDRESS_SIZE elements after ARG1. */
4276 /* Copy the bytes from LOC to END to open up three bytes of space at LOC,
4277 then store OP followed by the two-byte integer parameter ARG in that space. */
4278 /* If WCHAR is defined, the integer parameter is one wchar_t wide instead. */
4281 PREFIX(insert_op1) (
4287 register UCHAR_T *pfrom = end; /* Copy source; starts at END. */
4288 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE; /* Copy destination: END shifted by one opcode plus one parameter. */
4290 while (pfrom != loc) /* Loop until the source pointer equals LOC. */
4293 PREFIX(store_op1) (op, loc, arg); /* Fill the space at LOC with OP and ARG. */
4297 /* Like `insert_op1', but opens space for, and stores, two two-byte parameters ARG1 and ARG2. */
4298 /* If WCHAR is defined, each integer parameter is one wchar_t wide instead. */
4301 PREFIX(insert_op2) (
4307 register UCHAR_T *pfrom = end; /* Copy source; starts at END. */
4308 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE; /* Copy destination: END shifted by one opcode plus two parameters. */
4310 while (pfrom != loc) /* Loop until the source pointer equals LOC. */
4313 PREFIX(store_op2) (op, loc, arg1, arg2); /* Fill the space at LOC with OP, ARG1 and ARG2. */
4317 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4318 after an alternative or a begin-subexpression. We assume there is at
4319 least one character before the ^. */
4322 PREFIX(at_begline_loc_p) (
4323 const CHAR_T *pattern, const CHAR_T *p,
4324 reg_syntax_t syntax)
4326 const CHAR_T *prev = p - 2; /* P[-1] is the ^ itself, so PREV is the character before it. */
4327 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; /* PREV > PATTERN guards the PREV[-1] read. */
4330 /* After the start of a subexpression?  A `(' counts when RE_NO_BK_PARENS is set or when it is preceded by a backslash. */
4331 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4332 /* After an alternative?  A `|' counts when RE_NO_BK_VBAR is set or when it is preceded by a backslash. */
4333 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4337 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4338 at least one character after the $, i.e., `P < PEND'. */
4341 PREFIX(at_endline_loc_p) (
4342 const CHAR_T *p, const CHAR_T *pend,
4343 reg_syntax_t syntax)
4345 const CHAR_T *next = p; /* The character right after the $. */
4346 boolean next_backslash = *next == '\\';
4347 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0; /* Null when there is no second character before PEND. */
4350 /* Before the close of a subexpression?  A bare `)' when RE_NO_BK_PARENS is set, otherwise a backslash followed by `)'. */
4351 (syntax & RE_NO_BK_PARENS ? *next == ')'
4352 : next_backslash && next_next && *next_next == ')')
4353 /* Before an alternative?  A bare `|' when RE_NO_BK_VBAR is set, otherwise a backslash followed by `|'. */
4354 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4355 : next_backslash && next_next && *next_next == '|');
4358 #else /* not INSIDE_RECURSION */
4360 /* Return true if REGNUM is the register number recorded in one of
4361 COMPILE_STACK's elements, and false if it is not. */
4364 group_in_compile_stack (
4365 compile_stack_type compile_stack,
4370 for (this_element = compile_stack.avail - 1; /* Start from the last used element (index avail - 1). */
4373 if (compile_stack.stack[this_element].regnum == regnum) /* This stack element belongs to group REGNUM. */
4378 #endif /* not INSIDE_RECURSION */
4380 #ifdef INSIDE_RECURSION
4383 /* Insert a gap of "num" elements into the pattern at "loc".
4384 "end" must point to the end of the allocated buffer. */
4391 register CHAR_T *pto = end; /* Copy destination; starts at END. */
4392 register CHAR_T *pfrom = end - num; /* Copy source; NUM elements before END. */
4394 while (pfrom >= loc) /* Loop while the source pointer has not dropped below LOC. */
4400 static reg_errcode_t /* Wide-character range compiler: stores the range from RANGE_START_CHAR to P[0] in the char_ranges area of CHAR_SET and computes REG_ERANGE into `ret' for an empty range when RE_NO_EMPTY_RANGES is set (REG_NOERROR otherwise). */
4402 CHAR_T range_start_char,
4403 const CHAR_T **p_ptr, const CHAR_T *pend,
4404 __RE_TRANSLATE_TYPE translate,
4405 reg_syntax_t syntax,
4406 CHAR_T *b, CHAR_T *char_set)
4408 const CHAR_T *p = *p_ptr;
4409 CHAR_T range_start, range_end;
4413 uint32_t start_val, end_val;
4419 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4422 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4423 _NL_COLLATE_COLLSEQWC);
4424 const unsigned char *extra = (const unsigned char *)
4425 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4427 if (range_start_char < -1)
4429 /* A RANGE_START_CHAR below -1 means the range start is a collating symbol. */
4431 /* Retrieve the index and get the collation sequence value. */
4432 wextra = (int32_t*)(extra + char_set[-range_start_char]);
4433 start_val = wextra[1 + *wextra];
4436 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4438 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4440 /* Report an error if the range is empty and the syntax prohibits
4442 ret = ((syntax & RE_NO_EMPTY_RANGES)
4443 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4445 /* Insert space at the end of the char_ranges. */
4446 insert_space(2, b - char_set[5] - 2, b - 1);
4447 *(b - char_set[5] - 2) = (wchar_t)start_val;
4448 *(b - char_set[5] - 1) = (wchar_t)end_val;
4449 char_set[4]++; /* ranges_index */
4454 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4456 range_end = TRANSLATE (p[0]);
4457 /* Report an error if the range is empty and the syntax prohibits
4459 ret = ((syntax & RE_NO_EMPTY_RANGES)
4460 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4462 /* Insert space at the end of the char_ranges. */
4463 insert_space(2, b - char_set[5] - 2, b - 1);
4464 *(b - char_set[5] - 2) = range_start;
4465 *(b - char_set[5] - 1) = range_end;
4466 char_set[4]++; /* ranges_index */
4468 /* Have to increment the pointer into the pattern string, so the
4469 caller isn't still at the ending character. */
4475 /* Read the ending character of a range (in a bracket expression) from the
4476 uncompiled pattern *P_PTR (which ends at PEND). The starting
4477 character is supplied as RANGE_START_CHAR. (`P[-1]' is the character `-'.)
4478 Then we set the translation of all bits between the starting and
4479 ending characters (inclusive) in the compiled pattern B.
4481 Return an error code.
4483 We use these short variable names so we can use the same macros as
4484 `regex_compile' itself. */
4486 static reg_errcode_t
4487 byte_compile_range (
4488 unsigned int range_start_char,
4489 const char **p_ptr, const char *pend,
4490 __RE_TRANSLATE_TYPE translate,
4491 reg_syntax_t syntax,
4495 const char *p = *p_ptr;
4498 const unsigned char *collseq;
4499 unsigned int start_colseq;
4500 unsigned int end_colseq;
4508 /* Have to increment the pointer into the pattern string, so the
4509 caller isn't still at the ending character. */
4512 /* Report an error if the range is empty and the syntax prohibits this. */
4513 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4516 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4517 _NL_COLLATE_COLLSEQMB);
4519 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4520 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
4521 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4523 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4525 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4527 SET_LIST_BIT (TRANSLATE (this_char));
4532 /* Here we see why `this_char' has to be larger than an `unsigned
4533 char' -- we would otherwise go into an infinite loop, since all
4534 characters <= 0xff. */
4535 range_start_char = TRANSLATE (range_start_char);
4536 /* TRANSLATE(p[0]) is cast to char (not unsigned char) in TRANSLATE,
4537 and some compilers convert it to int implicitly, so the following for loop
4538 may turn into an (almost) infinite loop.
4539 e.g. If translate[p[0]] = 0xff, end_char may equal 0xffffffff.
4540 To avoid this, we cast p[0] to unsigned int and truncate it. */
4541 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4543 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4545 SET_LIST_BIT (TRANSLATE (this_char));
4554 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4555 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4556 characters can start a string that matches the pattern. This fastmap
4557 is used by re_search to skip quickly over impossible starting points.
4559 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4560 area as BUFP->fastmap.
4562 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4565 Returns 0 if we succeed, -2 if an internal error. */
4568 /* Local helper for re_compile_fastmap.  Convert the wide character C to
4569 its multibyte form and return the first byte; if the conversion does not yield a positive length, return C truncated to unsigned char. */
4570 static unsigned char truncate_wchar (CHAR_T c)
4572 unsigned char buf[MB_CUR_MAX]; /* Receives the multibyte form of C. */
4575 memset (&state, '\0', sizeof (state)); /* Zero the conversion state before use. */
4577 retval = __wcrtomb (buf, c, &state); /* NOTE(review): this and the next line look like alternatives chosen by a preprocessor conditional not shown here. */
4579 retval = wcrtomb (buf, c, &state);
4581 return retval > 0 ? buf[0] : (unsigned char) c;
4586 PREFIX(re_compile_fastmap) (struct re_pattern_buffer *bufp) /* Walks the compiled pattern in BUFP, filling BUFP->fastmap and setting BUFP->fastmap_accurate and BUFP->can_be_null; alternative paths still to be examined are kept on a failure stack. */
4589 #ifdef MATCH_MAY_ALLOCATE
4590 PREFIX(fail_stack_type) fail_stack;
4592 #ifndef REGEX_MALLOC
4596 register char *fastmap = bufp->fastmap;
4599 /* We need to cast pattern to (wchar_t*), because we cast this compiled
4600 pattern to (char*) in regex_compile. */
4601 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer;
4602 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used);
4604 UCHAR_T *pattern = bufp->buffer;
4605 register UCHAR_T *pend = pattern + bufp->used;
4607 UCHAR_T *p = pattern;
4610 /* This holds the pointer to the failure stack, when
4611 it is allocated relocatably. */
4612 fail_stack_elt_t *failure_stack_ptr;
4615 /* Assume that each path through the pattern can be null until
4616 proven otherwise. We set this false at the bottom of switch
4617 statement, to which we get only if a particular path doesn't
4618 match the empty string. */
4619 boolean path_can_be_null = true;
4621 /* We aren't doing a `succeed_n' to begin with. */
4622 boolean succeed_n_p = false;
4624 assert (fastmap != NULL && p != NULL);
4627 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4628 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4629 bufp->can_be_null = 0;
4633 if (p == pend || *p == succeed)
4635 /* We have reached the (effective) end of pattern. */
4636 if (!FAIL_STACK_EMPTY ())
4638 bufp->can_be_null |= path_can_be_null;
4640 /* Reset for next path. */
4641 path_can_be_null = true;
4643 p = fail_stack.stack[--fail_stack.avail].pointer;
4651 /* We should never be about to go beyond the end of the pattern. */
4654 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4657 /* I guess the idea here is to simply not bother with a fastmap
4658 if a backreference is used, since it's too hard to figure out
4659 the fastmap for the corresponding group. Setting
4660 `can_be_null' stops `re_search_2' from using the fastmap, so
4661 that is all we do. */
4663 bufp->can_be_null = 1;
4667 /* Following are the cases which match a character. These end
4672 fastmap[truncate_wchar(p[1])] = 1;
4686 /* It is hard to determine the fastmap for (multibyte) characters,
4687 which depend on the current locale. */
4692 bufp->can_be_null = 1;
4696 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4697 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4703 /* Chars beyond end of map must be allowed. */
4704 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4707 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4708 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4714 for (j = 0; j < (1 << BYTEWIDTH); j++)
4715 if (SYNTAX (j) == Sword)
4721 for (j = 0; j < (1 << BYTEWIDTH); j++)
4722 if (SYNTAX (j) != Sword)
4729 int fastmap_newline = fastmap['\n'];
4731 /* `.' matches anything ... */
4732 for (j = 0; j < (1 << BYTEWIDTH); j++)
4735 /* ... except perhaps newline. */
4736 if (!(bufp->syntax & RE_DOT_NEWLINE))
4737 fastmap['\n'] = fastmap_newline;
4739 /* Return if we have already set `can_be_null'; if we have,
4740 then the fastmap is irrelevant. Something's wrong here. */
4741 else if (bufp->can_be_null)
4744 /* Otherwise, have to check alternative paths. */
4751 for (j = 0; j < (1 << BYTEWIDTH); j++)
4752 if (SYNTAX (j) == (enum syntaxcode) k)
4759 for (j = 0; j < (1 << BYTEWIDTH); j++)
4760 if (SYNTAX (j) != (enum syntaxcode) k)
4765 /* All cases after this match the empty string. These end with
4785 case push_dummy_failure:
4790 case pop_failure_jump:
4791 case maybe_pop_jump:
4794 case dummy_failure_jump:
4795 EXTRACT_NUMBER_AND_INCR (j, p);
4800 /* Jump backward implies we just went through the body of a
4801 loop and matched nothing. Opcode jumped to should be
4802 `on_failure_jump' or `succeed_n'. Just treat it like an
4803 ordinary jump. For a * loop, it has pushed its failure
4804 point already; if so, discard that as redundant. */
4805 if ((re_opcode_t) *p != on_failure_jump
4806 && (re_opcode_t) *p != succeed_n)
4810 EXTRACT_NUMBER_AND_INCR (j, p);
4813 /* If what's on the stack is where we are now, pop it. */
4814 if (!FAIL_STACK_EMPTY ()
4815 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4821 case on_failure_jump:
4822 case on_failure_keep_string_jump:
4823 handle_on_failure_jump:
4824 EXTRACT_NUMBER_AND_INCR (j, p);
4826 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4827 end of the pattern. We don't want to push such a point,
4828 since when we restore it above, entering the switch will
4829 increment `p' past the end of the pattern. We don't need
4830 to push such a point since we obviously won't find any more
4831 fastmap entries beyond `pend'. Such a pattern can match
4832 the null string, though. */
4835 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4837 RESET_FAIL_STACK ();
4842 bufp->can_be_null = 1;
4846 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4847 succeed_n_p = false;
4854 /* Get to the number of times to succeed. */
4855 p += OFFSET_ADDRESS_SIZE;
4857 /* Increment p past the n for when k != 0. */
4858 EXTRACT_NUMBER_AND_INCR (k, p);
4861 p -= 2 * OFFSET_ADDRESS_SIZE;
4862 succeed_n_p = true; /* Spaghetti code alert. */
4863 goto handle_on_failure_jump;
4869 p += 2 * OFFSET_ADDRESS_SIZE;
4880 abort (); /* We have listed all the cases. */
4883 /* Getting here means we have found the possible starting
4884 characters for one path of the pattern -- and that the empty
4885 string does not match. We need not follow this path further.
4886 Instead, look at the next alternative (remembered on the
4887 stack), or quit if no more. The test at the top of the loop
4888 does these things. */
4889 path_can_be_null = false;
4893 /* Set `can_be_null' for the last path (also the first path, if the
4894 pattern is empty). */
4895 bufp->can_be_null |= path_can_be_null;
4898 RESET_FAIL_STACK ();
4902 #else /* not INSIDE_RECURSION */
4905 re_compile_fastmap (struct re_pattern_buffer *bufp) /* Public entry point: forwards BUFP to the wide-character or the byte implementation. */
4908 if (MB_CUR_MAX != 1) /* MB_CUR_MAX is not 1: take the wcs_ variant. */
4909 return wcs_re_compile_fastmap(bufp);
4911 return byte_re_compile_fastmap(bufp); /* Otherwise take the byte_ variant. */
4913 libc_hidden_def(re_compile_fastmap)
4916 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4917 ENDS. Subsequent matches using BUFP (the pattern buffer) and REGS will use
4918 this memory for recording register information. STARTS and ENDS
4919 must be allocated using the malloc library routine, and must each
4920 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4922 If NUM_REGS == 0, then subsequent matches should allocate their own
4925 Unless this function is called, the first search or match using
4926 BUFP will allocate its own register data, without
4927 freeing the old data. */
4931 struct re_pattern_buffer *bufp,
4932 struct re_registers *regs,
4934 regoff_t *starts, regoff_t *ends)
4938 bufp->regs_allocated = REGS_REALLOCATE; /* Record the caller-supplied storage in REGS. */
4939 regs->num_regs = num_regs;
4940 regs->start = starts;
4945 bufp->regs_allocated = REGS_UNALLOCATED; /* No storage recorded: clear the register pointers. */
4947 regs->start = regs->end = (regoff_t *) 0;
4951 /* Searching routines. */
4953 /* Like re_search_2, below, but only one string is specified, and it
4954 doesn't let you say where to stop matching. */
4958 struct re_pattern_buffer *bufp,
4960 int size, int startpos, int range,
4961 struct re_registers *regs)
4963 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, /* The single string is passed as the second string; the first is (NULL, 0). */
4966 libc_hidden_def(re_search)
4969 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4970 virtual concatenation of STRING1 and STRING2, starting first at index
4971 STARTPOS, then at STARTPOS + 1, and so on.
4973 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. The work is done by wcs_re_search_2 when MB_CUR_MAX != 1 and by byte_re_search_2 otherwise.
4975 RANGE is how far to scan while trying to match. RANGE = 0 means try
4976 only at STARTPOS; in general, the last start tried is STARTPOS +
4979 In REGS, return the indices of the virtual concatenation of STRING1
4980 and STRING2 that matched the entire BUFP->buffer and its contained
4983 Do not consider matching one past the index STOP in the virtual
4984 concatenation of STRING1 and STRING2.
4986 We return either the position in the strings at which the match was
4987 found, -1 if no match, or -2 if error (such as failure
4992 struct re_pattern_buffer *bufp,
4993 const char *string1, int size1,
4994 const char *string2, int size2,
4997 struct re_registers *regs,
5001 if (MB_CUR_MAX != 1)
5002 return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos,
5005 return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos,
5008 libc_hidden_def(re_search_2)
5010 #endif /* not INSIDE_RECURSION */
5012 #ifdef INSIDE_RECURSION
5014 #ifdef MATCH_MAY_ALLOCATE
5015 # define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL /* NOTE(review): expands to two statements (the assignment is not governed by the `if'); take care when using it as the sole body of an if/else. */
5017 # define FREE_VAR(var) free (var); var = NULL /* NOTE(review): likewise two statements. */
5021 # define MAX_ALLOCA_SIZE 2000 /* In the uses below, sizes above this limit take the TALLOC / free path; others take REGEX_TALLOC / FREE_VAR. */
5023 # define FREE_WCS_BUFFERS() \
5025 if (size1 > MAX_ALLOCA_SIZE) \
5027 free (wcs_string1); \
5028 free (mbs_offset1); \
5032 FREE_VAR (wcs_string1); \
5033 FREE_VAR (mbs_offset1); \
5035 if (size2 > MAX_ALLOCA_SIZE) \
5037 free (wcs_string2); \
5038 free (mbs_offset2); \
5042 FREE_VAR (wcs_string2); \
5043 FREE_VAR (mbs_offset2); \
5051 PREFIX(re_search_2) ( /* Searches the virtual concatenation of STRING1 and STRING2 for a match of BUFP, starting at STARTPOS and covering RANGE positions; a supplied fastmap is used to skip start positions when the pattern cannot match the empty string. */
5052 struct re_pattern_buffer *bufp,
5053 const char *string1, int size1,
5054 const char *string2, int size2,
5057 struct re_registers *regs,
5061 register char *fastmap = bufp->fastmap;
5062 register __RE_TRANSLATE_TYPE translate = bufp->translate;
5063 int total_size = size1 + size2;
5064 int endpos = startpos + range;
5066 /* We need wchar_t* buffers corresponding to string1 and string2. */
5067 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL;
5068 /* We need the sizes of the wchar_t buffers corresponding to size1 and size2. */
5069 int wcs_size1 = 0, wcs_size2 = 0;
5070 /* offset buffer for optimization. See convert_mbs_to_wc. */
5071 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5072 /* Holds whether each wchar_t is binary data or not. */
5073 char *is_binary = NULL;
5076 /* Check for out-of-range STARTPOS. */
5077 if (startpos < 0 || startpos > total_size)
5080 /* Fix up RANGE if it might eventually take us outside
5081 the virtual concatenation of STRING1 and STRING2.
5082 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
5084 range = 0 - startpos;
5085 else if (endpos > total_size)
5086 range = total_size - startpos;
5088 /* If the search isn't to be a backwards one, don't waste time in a
5089 search for a pattern that must be anchored. */
5090 if (bufp->used > 0 && range > 0
5091 && ((re_opcode_t) bufp->buffer[0] == begbuf
5092 /* `begline' is like `begbuf' if it cannot match at newlines. */
5093 || ((re_opcode_t) bufp->buffer[0] == begline
5094 && !bufp->newline_anchor)))
5103 /* In a forward search for something that starts with \=.
5104 don't keep searching past point. */
5105 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
5107 range = PT - startpos;
5113 /* Update the fastmap now if not correct already. */
5114 if (fastmap && !bufp->fastmap_accurate)
5115 if (re_compile_fastmap (bufp) == -2)
5119 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and
5120 fill them with converted string. */
5123 if (size1 > MAX_ALLOCA_SIZE)
5125 wcs_string1 = TALLOC (size1 + 1, CHAR_T);
5126 mbs_offset1 = TALLOC (size1 + 1, int);
5127 is_binary = TALLOC (size1 + 1, char);
5131 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T);
5132 mbs_offset1 = REGEX_TALLOC (size1 + 1, int);
5133 is_binary = REGEX_TALLOC (size1 + 1, char);
5135 if (!wcs_string1 || !mbs_offset1 || !is_binary)
5137 if (size1 > MAX_ALLOCA_SIZE)
5145 FREE_VAR (wcs_string1);
5146 FREE_VAR (mbs_offset1);
5147 FREE_VAR (is_binary);
5151 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1,
5152 mbs_offset1, is_binary);
5153 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */
5154 if (size1 > MAX_ALLOCA_SIZE)
5157 FREE_VAR (is_binary);
5161 if (size2 > MAX_ALLOCA_SIZE)
5163 wcs_string2 = TALLOC (size2 + 1, CHAR_T);
5164 mbs_offset2 = TALLOC (size2 + 1, int);
5165 is_binary = TALLOC (size2 + 1, char);
5169 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T);
5170 mbs_offset2 = REGEX_TALLOC (size2 + 1, int);
5171 is_binary = REGEX_TALLOC (size2 + 1, char);
5173 if (!wcs_string2 || !mbs_offset2 || !is_binary)
5175 FREE_WCS_BUFFERS ();
5176 if (size2 > MAX_ALLOCA_SIZE)
5179 FREE_VAR (is_binary);
5182 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2,
5183 mbs_offset2, is_binary);
5184 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */
5185 if (size2 > MAX_ALLOCA_SIZE)
5188 FREE_VAR (is_binary);
5193 /* Loop through the string, looking for a place to start matching. */
5196 /* If a fastmap is supplied, skip quickly over characters that
5197 cannot be the start of a match. If the pattern can match the
5198 null string, however, we don't need to skip characters; we want
5199 the first null string. */
5200 if (fastmap && startpos < total_size && !bufp->can_be_null)
5202 if (range > 0) /* Searching forwards. */
5204 register const char *d;
5205 register int lim = 0;
5208 if (startpos < size1 && startpos + range >= size1)
5209 lim = range - (size1 - startpos);
5211 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5213 /* Written out as an if-else to avoid testing `translate'
5217 && !fastmap[(unsigned char)
5218 translate[(unsigned char) *d++]])
5221 while (range > lim && !fastmap[(unsigned char) *d++])
5224 startpos += irange - range;
5226 else /* Searching backwards. */
5228 register CHAR_T c = (size1 == 0 || startpos >= size1
5229 ? string2[startpos - size1]
5230 : string1[startpos]);
5232 if (!fastmap[(unsigned char) TRANSLATE (c)])
5237 /* If can't match the null string, and that's all we have left, fail. */
5238 if (range >= 0 && startpos == total_size && fastmap
5239 && !bufp->can_be_null)
5242 FREE_WCS_BUFFERS ();
5248 val = wcs_re_match_2_internal (bufp, string1, size1, string2,
5249 size2, startpos, regs, stop,
5250 wcs_string1, wcs_size1,
5251 wcs_string2, wcs_size2,
5252 mbs_offset1, mbs_offset2);
5254 val = byte_re_match_2_internal (bufp, string1, size1, string2,
5255 size2, startpos, regs, stop);
5258 #ifndef REGEX_MALLOC
5267 FREE_WCS_BUFFERS ();
5275 FREE_WCS_BUFFERS ();
5295 FREE_WCS_BUFFERS ();
5301 /* This converts PTR, a pointer into one of the search wchar_t strings
5302 `string1' and `string2', into a multibyte string offset from the
5303 beginning of that string. We use mbs_offset to optimize; a NULL
5304 offset table yields 0. See convert_mbs_to_wcs. */
5305 # define POINTER_TO_OFFSET(ptr) \
5306 (FIRST_STRING_P (ptr) \
5307 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
5308 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
5311 /* This converts PTR, a pointer into one of the search strings `string1'
5312 and `string2', into an offset from the beginning of that string; offsets into `string2' have `size1' added. */
5313 # define POINTER_TO_OFFSET(ptr) \
5314 (FIRST_STRING_P (ptr) \
5315 ? ((regoff_t) ((ptr) - string1)) \
5316 : ((regoff_t) ((ptr) - string2 + size1)))
5319 /* Macros for dealing with the split strings in re_match_2. */
5321 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) /* True while the current end pointer `dend' is `end_match_1'. */
5323 /* Call before fetching a character with *d. This switches over to
5324 string2 if necessary. */
5325 #define PREFETCH() \
5328 /* End of string2 => fail. */ \
5329 if (dend == end_match_2) \
5331 /* End of string1 => advance to string2. */ \
5333 dend = end_match_2; \
5336 /* Test if at very beginning or at very end of the virtual concatenation
5337 of `string1' and `string2'. If only one string, it's `string2'. */
5338 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) /* Note: also true whenever `size2' is zero. */
5339 #define AT_STRINGS_END(d) ((d) == end2)
5342 /* Test if D points to a character which is word-constituent. We have
5343 two special cases to check for: if past the end of string1, look at
5344 the first character in string2; and if before the beginning of
5345 string2, look at the last character in string1. */
5347 /* Use internationalized API instead of SYNTAX: the selected character
 is word-constituent when iswalnum returns nonzero for it or when it
 equals L'_'. D == end1 selects *string2, D == string2 - 1 selects
 *(end1 - 1), and any other D is dereferenced directly. */
5348 # define WORDCHAR_P(d) \
5349 (iswalnum ((wint_t)((d) == end1 ? *string2 \
5350 : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \
5351 || ((d) == end1 ? *string2 \
5352 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_')
5354 # define WORDCHAR_P(d) \
5355 (SYNTAX ((d) == end1 ? *string2 \
5356 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
5360 /* Disabled due to a compiler bug -- see comment at case wordbound */
5362 /* Test if the character before D and the one at D differ with respect
5363 to being word-constituent. Also true when D is at the very beginning
 or the very end of the strings (AT_STRINGS_BEG / AT_STRINGS_END).
 D is parenthesized before the `- 1' so that an expression argument
 keeps its own grouping. */
5364 #define AT_WORD_BOUNDARY(d) \
5365 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
5366 || WORDCHAR_P ((d) - 1) != WORDCHAR_P (d))
5369 /* Free everything we malloc. */
5370 #ifdef MATCH_MAY_ALLOCATE
5372 # define FREE_VARIABLES() \
5374 REGEX_FREE_STACK (fail_stack.stack); \
5375 FREE_VAR (regstart); \
5376 FREE_VAR (regend); \
5377 FREE_VAR (old_regstart); \
5378 FREE_VAR (old_regend); \
5379 FREE_VAR (best_regstart); \
5380 FREE_VAR (best_regend); \
5381 FREE_VAR (reg_info); \
5382 FREE_VAR (reg_dummy); \
5383 FREE_VAR (reg_info_dummy); \
5384 if (!cant_free_wcs_buf) \
5386 FREE_VAR (string1); \
5387 FREE_VAR (string2); \
5388 FREE_VAR (mbs_offset1); \
5389 FREE_VAR (mbs_offset2); \
5393 # define FREE_VARIABLES() \
5395 REGEX_FREE_STACK (fail_stack.stack); \
5396 FREE_VAR (regstart); \
5397 FREE_VAR (regend); \
5398 FREE_VAR (old_regstart); \
5399 FREE_VAR (old_regend); \
5400 FREE_VAR (best_regstart); \
5401 FREE_VAR (best_regend); \
5402 FREE_VAR (reg_info); \
5403 FREE_VAR (reg_dummy); \
5404 FREE_VAR (reg_info_dummy); \
5409 # define FREE_VARIABLES() \
5411 if (!cant_free_wcs_buf) \
5413 FREE_VAR (string1); \
5414 FREE_VAR (string2); \
5415 FREE_VAR (mbs_offset1); \
5416 FREE_VAR (mbs_offset2); \
5420 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
5422 #endif /* not MATCH_MAY_ALLOCATE */
5424 /* These values must meet several constraints. They must not be valid
5425 register values; since we have a limit of 255 registers (because
5426 we use only one byte in the pattern for the register number), we can
5427 use numbers larger than 255. They must differ by 1, because of
5428 NUM_FAILURE_ITEMS above. And the value for the lowest register must
5429 be larger than the value for the highest register, so we do not try
5430 to actually save any registers when none are active. */
5431 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) /* Sentinel for "no highest active register". */
5432 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) /* Sentinel for "no lowest active register"; exactly one more than the above. */
5434 #else /* not INSIDE_RECURSION */
5435 /* Matching routines. */
5437 #ifndef emacs /* Emacs never uses this. */
5438 /* re_match is like re_match_2 except it takes only a single string. */
5442 struct re_pattern_buffer *bufp,
5445 struct re_registers *regs)
5449 if (MB_CUR_MAX != 1)
5450 result = wcs_re_match_2_internal (bufp, NULL, 0, string, size,
5452 NULL, 0, NULL, 0, NULL, NULL);
5455 result = byte_re_match_2_internal (bufp, NULL, 0, string, size,
5457 # ifndef REGEX_MALLOC
5464 #endif /* not emacs */
5466 #endif /* not INSIDE_RECURSION */
5468 #ifdef INSIDE_RECURSION
5469 static boolean PREFIX(group_match_null_string_p) (UCHAR_T **p,
5471 PREFIX(register_info_type) *reg_info);
5472 static boolean PREFIX(alt_match_null_string_p) (UCHAR_T *p,
5474 PREFIX(register_info_type) *reg_info);
5475 static boolean PREFIX(common_op_match_null_string_p) (UCHAR_T **p,
5477 PREFIX(register_info_type) *reg_info);
5478 static int PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2,
5479 int len, __RE_TRANSLATE_TYPE translate);
5480 #else /* not INSIDE_RECURSION */
5482 /* re_match_2 matches the compiled pattern in BUFP against the
5483 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
5484 and SIZE2, respectively). We start matching at POS, and stop
5487 If REGS is non-null and the `no_sub' field of BUFP is zero, we
5488 store offsets for the substring each group matched in REGS. See the
5489 documentation for exactly how many groups we fill.
5491 We return -1 if no match, -2 if an internal error (such as the
5492 failure stack overflowing). Otherwise, we return the length of the
5493 matched substring. */
5497 struct re_pattern_buffer *bufp,
5498 const char *string1, int size1,
5499 const char *string2, int size2,
5501 struct re_registers *regs,
5506 if (MB_CUR_MAX != 1)
5507 result = wcs_re_match_2_internal (bufp, string1, size1, string2, size2,
5509 NULL, 0, NULL, 0, NULL, NULL);
5512 result = byte_re_match_2_internal (bufp, string1, size1, string2, size2,
5515 #ifndef REGEX_MALLOC
5523 #endif /* not INSIDE_RECURSION */
5525 #ifdef INSIDE_RECURSION
5528 static int count_mbs_length (int *, int);
5530 /* This checks the substring (from 0 to length) of the multibyte string
5531 to which offset_buffer corresponds, and counts how many wchar_t characters
5532 the substring occupies. We use offset_buffer for optimization.
5533 See convert_mbs_to_wcs. */
5542 /* Check whether the size is valid. */
5546 if (offset_buffer == NULL)
5549 /* If there are no multibyte characters, offset_buffer[i] == i.
5550 Optimize for this case. */
5551 if (offset_buffer[length] == length)
5554 /* Set up upper with length. (because for all i, offset_buffer[i] >= i) */
5560 int middle = (lower + upper) / 2;
5561 if (middle == lower || middle == upper)
5563 if (offset_buffer[middle] > length)
5565 else if (offset_buffer[middle] < length)
5575 /* This is a separate function so that we can force an alloca cleanup
5579 wcs_re_match_2_internal (
5580 struct re_pattern_buffer *bufp,
5581 const char *cstring1, int csize1,
5582 const char *cstring2, int csize2,
5584 struct re_registers *regs,
5586 /* string1 == string2 == NULL means string1/2, size1/2 and
5587 mbs_offset1/2 need setting up in this function. */
5588 /* We need wchar_t* buffers corresponding to cstring1, cstring2. */
5589 /* We need the sizes of the wchar_t buffers corresponding to csize1, csize2. */
5590 wchar_t *string1, int size1,
5591 wchar_t *string2, int size2,
5592 /* offset buffer for optimization. See convert_mbs_to_wcs. */
5593 int *mbs_offset1, int *mbs_offset2)
5596 byte_re_match_2_internal (
5597 struct re_pattern_buffer *bufp,
5598 const char *string1, int size1,
5599 const char *string2, int size2,
5601 struct re_registers *regs,
5605 /* General temporaries. */
5609 /* They hold whether each wchar_t is binary data or not. */
5610 char *is_binary = NULL;
5611 /* If true, we can't free string1/2, mbs_offset1/2. */
5612 int cant_free_wcs_buf = 1;
5615 /* Just past the end of the corresponding string. */
5616 const CHAR_T *end1, *end2;
5618 /* Pointers into string1 and string2, just past the last characters in
5619 each to consider matching. */
5620 const CHAR_T *end_match_1, *end_match_2;
5622 /* Where we are in the data, and the end of the current string. */
5623 const CHAR_T *d, *dend;
5625 /* Where we are in the pattern, and the end of the pattern. */
5627 UCHAR_T *pattern, *p;
5628 register UCHAR_T *pend;
5630 UCHAR_T *p = bufp->buffer;
5631 register UCHAR_T *pend = p + bufp->used;
5634 /* Mark the opcode just after a start_memory, so we can test for an
5635 empty subpattern when we get to the stop_memory. */
5636 UCHAR_T *just_past_start_mem = 0;
5638 /* We use this to map every character in the string. */
5639 __RE_TRANSLATE_TYPE translate = bufp->translate;
5641 /* Failure point stack. Each place that can handle a failure further
5642 down the line pushes a failure point on this stack. It consists of
5643 restart, regend, and reg_info for all registers corresponding to
5644 the subexpressions we're currently inside, plus the number of such
5645 registers, and, finally, two char *'s. The first char * is where
5646 to resume scanning the pattern; the second one is where to resume
5647 scanning the strings. If the latter is zero, the failure point is
5648 a ``dummy''; if a failure happens and the failure point is a dummy,
5649 it gets discarded and the next one is tried. */
5650 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5651 PREFIX(fail_stack_type) fail_stack;
5654 static unsigned failure_id;
5655 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5659 /* This holds the pointer to the failure stack, when
5660 it is allocated relocatably. */
5661 fail_stack_elt_t *failure_stack_ptr;
5664 /* We fill all the registers internally, independent of what we
5665 return, for use in backreferences. The number here includes
5666 an element for register zero. */
5667 size_t num_regs = bufp->re_nsub + 1;
5669 /* The currently active registers. */
5670 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5671 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5673 /* Information on the contents of registers. These are pointers into
5674 the input strings; they record just what was matched (on this
5675 attempt) by a subexpression part of the pattern, that is, the
5676 regnum-th regstart pointer points to where in the pattern we began
5677 matching and the regnum-th regend points to right after where we
5678 stopped matching the regnum-th subexpression. (The zeroth register
5679 keeps track of what the whole pattern matches.) */
5680 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5681 const CHAR_T **regstart, **regend;
5684 /* If a group that's operated upon by a repetition operator fails to
5685 match anything, then the register for its start will need to be
5686 restored because it will have been set to wherever in the string we
5687 are when we last see its open-group operator. Similarly for a
5689 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5690 const CHAR_T **old_regstart, **old_regend;
5693 /* The is_active field of reg_info helps us keep track of which (possibly
5694 nested) subexpressions we are currently in. The matched_something
5695 field of reg_info[reg_num] helps us tell whether or not we have
5696 matched any of the pattern so far this time through the reg_num-th
5697 subexpression. These two fields get reset each time through any
5698 loop their register is in. */
5699 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5700 PREFIX(register_info_type) *reg_info;
5703 /* The following record the register info as found in the above
5704 variables when we find a match better than any we've seen before.
5705 This happens as we backtrack through the failure points, which in
5706 turn happens only if we have not yet matched the entire string. */
5707 unsigned best_regs_set = false;
5708 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5709 const CHAR_T **best_regstart, **best_regend;
5712 /* Logically, this is `best_regend[0]'. But we don't want to have to
5713 allocate space for that if we're not allocating space for anything
5714 else (see below). Also, we never need info about register 0 for
5715 any of the other register vectors, and it seems rather a kludge to
5716 treat `best_regend' differently than the rest. So we keep track of
5717 the end of the best match so far in a separate variable. We
5718 initialize this to NULL so that when we backtrack the first time
5719 and need to test it, it's not garbage. */
5720 const CHAR_T *match_end = NULL;
5722 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5723 int set_regs_matched_done = 0;
5725 /* Used when we pop values we don't care about. */
5726 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5727 const CHAR_T **reg_dummy;
5728 PREFIX(register_info_type) *reg_info_dummy;
5732 /* Counts the total number of registers pushed. */
5733 unsigned num_regs_pushed = 0;
5736 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5740 #ifdef MATCH_MAY_ALLOCATE
5741 /* Do not bother to initialize all the register variables if there are
5742 no groups in the pattern, as it takes a fair amount of time. If
5743 there are groups, we include space for register 0 (the whole
5744 pattern), even though we never use it, since it simplifies the
5745 array indexing. We should fix this. */
5748 regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5749 regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5750 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5751 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5752 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5753 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5754 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5755 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *);
5756 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5758 if (!(regstart && regend && old_regstart && old_regend && reg_info
5759 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5767 /* We must initialize all our variables to NULL, so that
5768 `FREE_VARIABLES' doesn't try to free them. */
5769 regstart = regend = old_regstart = old_regend = best_regstart
5770 = best_regend = reg_dummy = NULL;
5771 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL;
5773 #endif /* MATCH_MAY_ALLOCATE */
5775 /* The starting position is bogus. */
5777 if (pos < 0 || pos > csize1 + csize2)
5779 if (pos < 0 || pos > size1 + size2)
5787 /* Allocate wchar_t array for string1 and string2 and
5788 fill them with converted string. */
5789 if (string1 == NULL && string2 == NULL)
5791 /* We need to set up the buffers here. */
5793 /* We must free wcs buffers in this function. */
5794 cant_free_wcs_buf = 0;
5798 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T);
5799 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
5800 is_binary = REGEX_TALLOC (csize1 + 1, char);
5801 if (!string1 || !mbs_offset1 || !is_binary)
5804 FREE_VAR (mbs_offset1);
5805 FREE_VAR (is_binary);
5811 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T);
5812 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5813 is_binary = REGEX_TALLOC (csize2 + 1, char);
5814 if (!string2 || !mbs_offset2 || !is_binary)
5817 FREE_VAR (mbs_offset1);
5819 FREE_VAR (mbs_offset2);
5820 FREE_VAR (is_binary);
5823 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5824 mbs_offset2, is_binary);
5825 string2[size2] = L'\0'; /* for a sentinel */
5826 FREE_VAR (is_binary);
5830 /* We need to cast pattern to (wchar_t*), because we casted this compiled
5831 pattern to (char*) in regex_compile. */
5832 p = pattern = (CHAR_T*)bufp->buffer;
5833 pend = (CHAR_T*)(bufp->buffer + bufp->used);
5837 /* Initialize subexpression text positions to -1 to mark ones that no
5838 start_memory/stop_memory has been seen for. Also initialize the
5839 register information struct. */
5840 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5842 regstart[mcnt] = regend[mcnt]
5843 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5845 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
5846 IS_ACTIVE (reg_info[mcnt]) = 0;
5847 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5848 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5851 /* We move `string1' into `string2' if the latter's empty -- but not if
5852 `string1' is null. */
5853 if (size2 == 0 && string1 != NULL)
5860 mbs_offset2 = mbs_offset1;
5866 end1 = string1 + size1;
5867 end2 = string2 + size2;
5869 /* Compute where to stop matching, within the two strings. */
5873 mcnt = count_mbs_length(mbs_offset1, stop);
5874 end_match_1 = string1 + mcnt;
5875 end_match_2 = string2;
5879 if (stop > csize1 + csize2)
5880 stop = csize1 + csize2;
5882 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
5883 end_match_2 = string2 + mcnt;
5886 { /* count_mbs_length return error. */
5893 end_match_1 = string1 + stop;
5894 end_match_2 = string2;
5899 end_match_2 = string2 + stop - size1;
5903 /* `p' scans through the pattern as `d' scans through the data.
5904 `dend' is the end of the input string that `d' points within. `d'
5905 is advanced into the following input string whenever necessary, but
5906 this happens before fetching; therefore, at the beginning of the
5907 loop, `d' can be pointing at the end of a string, but it cannot
5910 if (size1 > 0 && pos <= csize1)
5912 mcnt = count_mbs_length(mbs_offset1, pos);
5918 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
5924 { /* count_mbs_length return error. */
5929 if (size1 > 0 && pos <= size1)
5936 d = string2 + pos - size1;
5941 DEBUG_PRINT1 ("The compiled pattern is:\n");
5942 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5943 DEBUG_PRINT1 ("The string to match is: `");
5944 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5945 DEBUG_PRINT1 ("'\n");
5947 /* This loops over pattern commands. It exits by returning from the
5948 function if the match is complete, or it drops through if the match
5949 fails at this starting point in the input data. */
5953 DEBUG_PRINT2 ("\n%p: ", p);
5955 DEBUG_PRINT2 ("\n0x%x: ", p);
5959 { /* End of pattern means we might have succeeded. */
5960 DEBUG_PRINT1 ("end of pattern ... ");
5962 /* If we haven't matched the entire string, and we want the
5963 longest match, try backtracking. */
5964 if (d != end_match_2)
5966 /* 1 if this match ends in the same string (string1 or string2)
5967 as the best previous match. */
5968 boolean same_str_p = (FIRST_STRING_P (match_end)
5969 == MATCHING_IN_FIRST_STRING);
5970 /* 1 if this match is the best seen so far. */
5971 boolean best_match_p;
5973 /* AIX compiler got confused when this was combined
5974 with the previous declaration. */
5976 best_match_p = d > match_end;
5978 best_match_p = !MATCHING_IN_FIRST_STRING;
5980 DEBUG_PRINT1 ("backtracking.\n");
5982 if (!FAIL_STACK_EMPTY ())
5983 { /* More failure points to try. */
5985 /* If exceeds best match so far, save it. */
5986 if (!best_regs_set || best_match_p)
5988 best_regs_set = true;
5991 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5993 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5995 best_regstart[mcnt] = regstart[mcnt];
5996 best_regend[mcnt] = regend[mcnt];
6002 /* If no failure points, don't restore garbage. And if
6003 last match is real best match, don't restore second
6005 else if (best_regs_set && !best_match_p)
6008 /* Restore best match. It may happen that `dend ==
6009 end_match_1' while the restored d is in string2.
6010 For example, the pattern `x.*y.*z' against the
6011 strings `x-' and `y-z-', if the two strings are
6012 not consecutive in memory. */
6013 DEBUG_PRINT1 ("Restoring best registers.\n");
6016 dend = ((d >= string1 && d <= end1)
6017 ? end_match_1 : end_match_2);
6019 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
6021 regstart[mcnt] = best_regstart[mcnt];
6022 regend[mcnt] = best_regend[mcnt];
6025 } /* d != end_match_2 */
6028 DEBUG_PRINT1 ("Accepting match.\n");
6029 /* If caller wants register contents data back, do it. */
6030 if (regs && !bufp->no_sub)
6032 /* Have the register data arrays been allocated? */
6033 if (bufp->regs_allocated == REGS_UNALLOCATED)
6034 { /* No. So allocate them with malloc. We need one
6035 extra element beyond `num_regs' for the `-1' marker
6038 * "If REGS_UNALLOCATED, allocate space in the regs structure
6039 * for max(RE_NREGS, re_nsub + 1) groups"
6040 * but real-world testsuites fail with contrived examples
6041 * with lots of groups.
6042 * I don't see why we can't just allocate exact needed number.
6043 * Incidentally, it makes RE_NREGS unused.
6045 * regs->num_regs = MAX (RE_NREGS, num_regs + 1); - VERY WRONG
6046 * regs->num_regs = MIN (RE_NREGS, num_regs + 1); - slightly less wrong
6047 * good one which passes uclibc test/regex/tst-regex2.c:
6049 regs->num_regs = num_regs + 1;
6050 regs->start = TALLOC (regs->num_regs, regoff_t);
6051 regs->end = TALLOC (regs->num_regs, regoff_t);
6052 if (regs->start == NULL || regs->end == NULL)
6057 bufp->regs_allocated = REGS_REALLOCATE;
6059 else if (bufp->regs_allocated == REGS_REALLOCATE)
6060 { /* Yes. If we need more elements than were already
6061 allocated, reallocate them. If we need fewer, just
6063 if (regs->num_regs < num_regs + 1)
6065 regs->num_regs = num_regs + 1;
6066 RETALLOC (regs->start, regs->num_regs, regoff_t);
6067 RETALLOC (regs->end, regs->num_regs, regoff_t);
6068 if (regs->start == NULL || regs->end == NULL)
6077 /* These braces fend off an "empty body in an else-statement"
6078 warning under GCC when assert expands to nothing. */
6079 assert (bufp->regs_allocated == REGS_FIXED);
6082 /* Convert the pointer data in `regstart' and `regend' to
6083 indices. Register zero has to be set differently,
6084 since we haven't kept track of any info for it. */
6085 if (regs->num_regs > 0)
6087 regs->start[0] = pos;
6089 if (MATCHING_IN_FIRST_STRING)
6090 regs->end[0] = mbs_offset1 != NULL ?
6091 mbs_offset1[d-string1] : 0;
6093 regs->end[0] = csize1 + (mbs_offset2 != NULL ?
6094 mbs_offset2[d-string2] : 0);
6096 regs->end[0] = (MATCHING_IN_FIRST_STRING
6097 ? ((regoff_t) (d - string1))
6098 : ((regoff_t) (d - string2 + size1)));
6102 /* Go through the first `min (num_regs, regs->num_regs)'
6103 registers, since that is all we initialized. */
6104 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
6107 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
6108 regs->start[mcnt] = regs->end[mcnt] = -1;
6112 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
6114 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
6118 /* If the regs structure we return has more elements than
6119 were in the pattern, set the extra elements to -1. If
6120 we (re)allocated the registers, this is the case,
6121 because we always allocate enough to have at least one
6123 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
6124 regs->start[mcnt] = regs->end[mcnt] = -1;
6125 } /* regs && !bufp->no_sub */
6127 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
6128 nfailure_points_pushed, nfailure_points_popped,
6129 nfailure_points_pushed - nfailure_points_popped);
6130 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
6133 if (MATCHING_IN_FIRST_STRING)
6134 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
6136 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
6140 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
6145 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
6151 /* Otherwise match next pattern command. */
6152 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
6154 /* Ignore these. Used to ignore the n of succeed_n's which
6155 currently have n == 0. */
6157 DEBUG_PRINT1 ("EXECUTING no_op.\n");
6161 DEBUG_PRINT1 ("EXECUTING succeed.\n");
6164 /* Match the next n pattern characters exactly. The following
6165 byte in the pattern defines n, and the n bytes after that
6166 are the characters to match. */
6172 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
6174 /* This is written out as an if-else so we don't waste time
6175 testing `translate' inside the loop. */
6184 if ((UCHAR_T) translate[(unsigned char) *d++]
6190 if (*d++ != (CHAR_T) *p++)
6194 if ((UCHAR_T) translate[(unsigned char) *d++]
6206 if (*d++ != (CHAR_T) *p++) goto fail;
6210 SET_REGS_MATCHED ();
6214 /* Match any character except possibly a newline or a null. */
6216 DEBUG_PRINT1 ("EXECUTING anychar.\n");
6220 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
6221 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
6224 SET_REGS_MATCHED ();
6225 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
6235 unsigned int i, char_class_length, coll_symbol_length,
6236 equiv_class_length, ranges_length, chars_length, length;
6237 CHAR_T *workp, *workp2, *charset_top;
6238 #define WORK_BUFFER_SIZE 128
6239 CHAR_T str_buf[WORK_BUFFER_SIZE];
6244 boolean not = (re_opcode_t) *(p - 1) == charset_not;
6246 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
6248 c = TRANSLATE (*d); /* The character to match. */
6251 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
6253 charset_top = p - 1;
6254 char_class_length = *p++;
6255 coll_symbol_length = *p++;
6256 equiv_class_length = *p++;
6257 ranges_length = *p++;
6258 chars_length = *p++;
6259 /* p points charset[6], so the address of the next instruction
6260 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'],
6261 where l=length of char_classes, m=length of collating_symbol,
6262 n=equivalence_class, o=length of char_range,
6263 p'=length of character. */
6265 /* Update p to indicate the next instruction. */
6266 p += char_class_length + coll_symbol_length+ equiv_class_length +
6267 2*ranges_length + chars_length;
6269 /* match with char_class? */
6270 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
6273 uintptr_t alignedp = ((uintptr_t)workp
6274 + __alignof__(wctype_t) - 1)
6275 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
6276 wctype = *((wctype_t*)alignedp);
6277 workp += CHAR_CLASS_SIZE;
6279 if (__iswctype((wint_t)c, wctype))
6280 goto char_set_matched;
6282 if (iswctype((wint_t)c, wctype))
6283 goto char_set_matched;
6287 /* match with collating_symbol? */
6291 const unsigned char *extra = (const unsigned char *)
6292 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6294 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
6298 wextra = (int32_t*)(extra + *workp++);
6299 for (i = 0; i < *wextra; ++i)
6300 if (TRANSLATE(d[i]) != wextra[1 + i])
6305 /* Update d, however d will be incremented at
6306 char_set_matched:, we decrement d here. */
6308 goto char_set_matched;
6312 else /* (nrules == 0) */
6314 /* If we can't look up collation data, we use wcscoll
6317 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6319 const CHAR_T *backup_d = d, *backup_dend = dend;
6321 length = __wcslen (workp);
6323 length = wcslen (workp);
6326 /* If wcscoll(the collating symbol, whole string) > 0,
6327 any substring of the string never match with the
6328 collating symbol. */
6330 if (__wcscoll (workp, d) > 0)
6332 if (wcscoll (workp, d) > 0)
6335 workp += length + 1;
6339 /* First, we compare the collating symbol with
6340 the first character of the string.
6341 If it doesn't match, we add the next character to
6342 the compare buffer in turn. */
6343 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6348 if (dend == end_match_2)
6354 /* add next character to the compare buffer. */
6355 str_buf[i] = TRANSLATE(*d);
6356 str_buf[i+1] = '\0';
6359 match = __wcscoll (workp, str_buf);
6361 match = wcscoll (workp, str_buf);
6364 goto char_set_matched;
6367 /* (str_buf > workp) indicate (str_buf + X > workp),
6368 because for all X (str_buf + X > str_buf).
6369 So we don't need continue this loop. */
6372 /* Otherwise(str_buf < workp),
6373 (str_buf+next_character) may equals (workp).
6374 So we continue this loop. */
6379 workp += length + 1;
6382 /* match with equivalence_class? */
6386 const CHAR_T *backup_d = d, *backup_dend = dend;
6387 /* Try to match the equivalence class against
6388 those known to the collate implementation. */
6389 const int32_t *table;
6390 const int32_t *weights;
6391 const int32_t *extra;
6392 const int32_t *indirect;
6397 /* This #include defines a local function! */
6398 # include <locale/weightwc.h>
6400 table = (const int32_t *)
6401 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6402 weights = (const wint_t *)
6403 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6404 extra = (const wint_t *)
6405 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6406 indirect = (const int32_t *)
6407 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6409 /* Write 1 collating element to str_buf, and
6413 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6415 cp = (wint_t*)str_buf;
6418 if (dend == end_match_2)
6423 str_buf[i] = TRANSLATE(*(d+i));
6424 str_buf[i+1] = '\0'; /* sentinel */
6425 idx2 = findidx ((const wint_t**)&cp);
6428 /* Update d, however d will be incremented at
6429 char_set_matched:, we decrement d here. */
6430 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
6433 if (dend == end_match_2)
6442 len = weights[idx2];
6444 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6447 idx = (int32_t)*workp;
6448 /* We already checked idx != 0 in regex_compile. */
6450 if (idx2 != 0 && len == weights[idx])
6453 while (cnt < len && (weights[idx + 1 + cnt]
6454 == weights[idx2 + 1 + cnt]))
6458 goto char_set_matched;
6465 else /* (nrules == 0) */
6467 /* If we can't look up collation data, we use wcscoll
6470 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6472 const CHAR_T *backup_d = d, *backup_dend = dend;
6474 length = __wcslen (workp);
6476 length = wcslen (workp);
6479 /* If wcscoll(the collating symbol, whole string) > 0,
6480 any substring of the string never match with the
6481 collating symbol. */
6483 if (__wcscoll (workp, d) > 0)
6485 if (wcscoll (workp, d) > 0)
6488 workp += length + 1;
6492 /* First, we compare the equivalence class with
6493 the first character of the string.
6494 If it doesn't match, we add the next character to
6495 the compare buffer in turn. */
6496 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6501 if (dend == end_match_2)
6507 /* add next character to the compare buffer. */
6508 str_buf[i] = TRANSLATE(*d);
6509 str_buf[i+1] = '\0';
6512 match = __wcscoll (workp, str_buf);
6514 match = wcscoll (workp, str_buf);
6518 goto char_set_matched;
6521 /* (str_buf > workp) indicate (str_buf + X > workp),
6522 because for all X (str_buf + X > str_buf).
6523 So we don't need continue this loop. */
6526 /* Otherwise(str_buf < workp),
6527 (str_buf+next_character) may equals (workp).
6528 So we continue this loop. */
6533 workp += length + 1;
6537 /* match with char_range? */
6541 uint32_t collseqval;
6542 const char *collseq = (const char *)
6543 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6545 collseqval = collseq_table_lookup (collseq, c);
6547 for (; workp < p - chars_length ;)
6549 uint32_t start_val, end_val;
6551 /* We already compute the collation sequence value
6552 of the characters (or collating symbols). */
6553 start_val = (uint32_t) *workp++; /* range_start */
6554 end_val = (uint32_t) *workp++; /* range_end */
6556 if (start_val <= collseqval && collseqval <= end_val)
6557 goto char_set_matched;
6563 /* We set range_start_char at str_buf[0], range_end_char
6564 at str_buf[4], and compared char at str_buf[2]. */
6569 for (; workp < p - chars_length ;)
6571 wchar_t *range_start_char, *range_end_char;
6573 /* match if (range_start_char <= c <= range_end_char). */
6575 /* If range_start(or end) < 0, we assume -range_start(end)
6576 is the offset of the collating symbol which is specified
6577 as the character of the range start(end). */
6581 range_start_char = charset_top - (*workp++);
6584 str_buf[0] = *workp++;
6585 range_start_char = str_buf;
6590 range_end_char = charset_top - (*workp++);
6593 str_buf[4] = *workp++;
6594 range_end_char = str_buf + 4;
6598 if (__wcscoll (range_start_char, str_buf+2) <= 0
6599 && __wcscoll (str_buf+2, range_end_char) <= 0)
6601 if (wcscoll (range_start_char, str_buf+2) <= 0
6602 && wcscoll (str_buf+2, range_end_char) <= 0)
6604 goto char_set_matched;
6608 /* match with char? */
6609 for (; workp < p ; workp++)
6611 goto char_set_matched;
6618 /* Cast to `unsigned' instead of `unsigned char' in case the
6619 bit list is a full 32 bytes long. */
6620 if (c < (unsigned) (*p * BYTEWIDTH)
6621 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6626 if (!not) goto fail;
6627 #undef WORK_BUFFER_SIZE
6629 SET_REGS_MATCHED ();
6635 /* The beginning of a group is represented by start_memory.
6636 The arguments are the register number in the next byte, and the
6637 number of groups inner to this one in the next. The text
6638 matched within the group is recorded (in the internal
6639 registers data structure) under the register number. */
6641 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
6642 (long int) *p, (long int) p[1]);
6644 /* Find out if this group can match the empty string. */
6645 p1 = p; /* To send to group_match_null_string_p. */
6647 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6648 REG_MATCH_NULL_STRING_P (reg_info[*p])
6649 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info);
6651 /* Save the position in the string where we were the last time
6652 we were at this open-group operator in case the group is
6653 operated upon by a repetition operator, e.g., with `(a*)*b'
6654 against `ab'; then we want to ignore where we are now in
6655 the string in case this attempt to match fails. */
6656 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6657 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6659 DEBUG_PRINT2 (" old_regstart: %d\n",
6660 POINTER_TO_OFFSET (old_regstart[*p]));
6663 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6665 IS_ACTIVE (reg_info[*p]) = 1;
6666 MATCHED_SOMETHING (reg_info[*p]) = 0;
6668 /* Clear this whenever we change the register activity status. */
6669 set_regs_matched_done = 0;
6671 /* This is the new highest active register. */
6672 highest_active_reg = *p;
6674 /* If nothing was active before, this is the new lowest active
6676 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6677 lowest_active_reg = *p;
6679 /* Move past the register number and inner group count. */
6681 just_past_start_mem = p;
6686 /* The stop_memory opcode represents the end of a group. Its
6687 arguments are the same as start_memory's: the register
6688 number, and the number of inner groups. */
6690 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
6691 (long int) *p, (long int) p[1]);
6693 /* We need to save the string position the last time we were at
6694 this close-group operator in case the group is operated
6695 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6696 against `aba'; then we want to ignore where we are now in
6697 the string in case this attempt to match fails. */
6698 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6699 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6701 DEBUG_PRINT2 (" old_regend: %d\n",
6702 POINTER_TO_OFFSET (old_regend[*p]));
6705 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6707 /* This register isn't active anymore. */
6708 IS_ACTIVE (reg_info[*p]) = 0;
6710 /* Clear this whenever we change the register activity status. */
6711 set_regs_matched_done = 0;
6713 /* If this was the only register active, nothing is active
6715 if (lowest_active_reg == highest_active_reg)
6717 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6718 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6721 { /* We must scan for the new highest active register, since
6722 it isn't necessarily one less than now: consider
6723 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6724 new highest active register is 1. */
6726 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6729 /* If we end up at register zero, that means that we saved
6730 the registers as the result of an `on_failure_jump', not
6731 a `start_memory', and we jumped to past the innermost
6732 `stop_memory'. For example, in ((.)*) we save
6733 registers 1 and 2 as a result of the *, but when we pop
6734 back to the second ), we are at the stop_memory 1.
6735 Thus, nothing is active. */
6738 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6739 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6742 highest_active_reg = r;
6745 /* If just failed to match something this time around with a
6746 group that's operated on by a repetition operator, try to
6747 force exit from the ``loop'', and restore the register
6748 information for this group that we had before trying this
6750 if ((!MATCHED_SOMETHING (reg_info[*p])
6751 || just_past_start_mem == p - 1)
6754 boolean is_a_jump_n = false;
6758 switch ((re_opcode_t) *p1++)
6762 case pop_failure_jump:
6763 case maybe_pop_jump:
6765 case dummy_failure_jump:
6766 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6768 p1 += OFFSET_ADDRESS_SIZE;
6776 /* If the next operation is a jump backwards in the pattern
6777 to an on_failure_jump right before the start_memory
6778 corresponding to this stop_memory, exit from the loop
6779 by forcing a failure after pushing on the stack the
6780 on_failure_jump's jump in the pattern, and d. */
6781 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6782 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6783 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6785 /* If this group ever matched anything, then restore
6786 what its registers were before trying this last
6787 failed match, e.g., with `(a*)*b' against `ab' for
6788 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6789 against `aba' for regend[3].
6791 Also restore the registers for inner groups for,
6792 e.g., `((a*)(b*))*' against `aba' (register 3 would
6793 otherwise get trashed). */
6795 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6799 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6801 /* Restore this and inner groups' (if any) registers. */
6802 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6805 regstart[r] = old_regstart[r];
6807 /* xx why this test? */
6808 if (old_regend[r] >= regstart[r])
6809 regend[r] = old_regend[r];
6813 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6814 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6820 /* Move past the register number and the inner group count. */
6825 /* \<digit> has been turned into a `duplicate' command which is
6826 followed by the numeric value of <digit> as the register number. */
6829 register const CHAR_T *d2, *dend2;
6830 int regno = *p++; /* Get which register to match against. */
6831 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6833 /* Can't back reference a group which we've never matched. */
6834 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6837 /* Where in input to try to start matching. */
6838 d2 = regstart[regno];
6840 /* Where to stop matching; if both the place to start and
6841 the place to stop matching are in the same string, then
6842 set to the place to stop, otherwise, for now have to use
6843 the end of the first string. */
6845 dend2 = ((FIRST_STRING_P (regstart[regno])
6846 == FIRST_STRING_P (regend[regno]))
6847 ? regend[regno] : end_match_1);
6850 /* If necessary, advance to next segment in register
6854 if (dend2 == end_match_2) break;
6855 if (dend2 == regend[regno]) break;
6857 /* End of string1 => advance to string2. */
6859 dend2 = regend[regno];
6861 /* At end of register contents => success */
6862 if (d2 == dend2) break;
6864 /* If necessary, advance to next segment in data. */
6867 /* How many characters left in this segment to match. */
6870 /* Want how many consecutive characters we can match in
6871 one shot, so, if necessary, adjust the count. */
6872 if (mcnt > dend2 - d2)
6875 /* Compare that many; failure if mismatch, else move
6878 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate)
6879 : memcmp (d, d2, mcnt*sizeof(UCHAR_T)))
6881 d += mcnt, d2 += mcnt;
6883 /* Do this because we've matched some characters. */
6884 SET_REGS_MATCHED ();
6890 /* begline matches the empty string at the beginning of the string
6891 (unless `not_bol' is set in `bufp'), and, if
6892 `newline_anchor' is set, after newlines. */
6894 DEBUG_PRINT1 ("EXECUTING begline.\n");
6896 if (AT_STRINGS_BEG (d))
6898 if (!bufp->not_bol) break;
6900 else if (d[-1] == '\n' && bufp->newline_anchor)
6904 /* In all other cases, we fail. */
6908 /* endline is the dual of begline. */
6910 DEBUG_PRINT1 ("EXECUTING endline.\n");
6912 if (AT_STRINGS_END (d))
6914 if (!bufp->not_eol) break;
6917 /* We have to ``prefetch'' the next character. */
6918 else if ((d == end1 ? *string2 : *d) == '\n'
6919 && bufp->newline_anchor)
6926 /* Match at the very beginning of the data. */
6928 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
6929 if (AT_STRINGS_BEG (d))
6934 /* Match at the very end of the data. */
6936 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
6937 if (AT_STRINGS_END (d))
6942 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
6943 pushes NULL as the value for the string on the stack. Then
6944 `pop_failure_point' will keep the current value for the
6945 string, instead of restoring it. To see why, consider
6946 matching `foo\nbar' against `.*\n'. The .* matches the foo;
6947 then the . fails against the \n. But the next thing we want
6948 to do is match the \n against the \n; if we restored the
6949 string value, we would be back at the foo.
6951 Because this is used only in specific cases, we don't need to
6952 check all the things that `on_failure_jump' does, to make
6953 sure the right things get saved on the stack. Hence we don't
6954 share its code. The only reason to push anything on the
6955 stack at all is that otherwise we would have to change
6956 `anychar's code to do something besides goto fail in this
6957 case; that seems worse than this. */
6958 case on_failure_keep_string_jump:
6959 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
6961 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6963 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
6965 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
6968 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
6972 /* Uses of on_failure_jump:
6974 Each alternative starts with an on_failure_jump that points
6975 to the beginning of the next alternative. Each alternative
6976 except the last ends with a jump that in effect jumps past
6977 the rest of the alternatives. (They really jump to the
6978 ending jump of the following alternative, because tensioning
6979 these jumps is a hassle.)
6981 Repeats start with an on_failure_jump that points past both
6982 the repetition text and either the following jump or
6983 pop_failure_jump back to this on_failure_jump. */
6984 case on_failure_jump:
6986 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
6988 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6990 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
6992 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
6995 /* If this on_failure_jump comes right before a group (i.e.,
6996 the original * applied to a group), save the information
6997 for that group and all inner ones, so that if we fail back
6998 to this point, the group's information will be correct.
6999 For example, in \(a*\)*\1, we need the preceding group,
7000 and in \(zz\(a*\)b*\)\2, we need the inner group. */
7002 /* We can't use `p' to check ahead because we push
7003 a failure point to `p + mcnt' after we do this. */
7006 /* We need to skip no_op's before we look for the
7007 start_memory in case this on_failure_jump is happening as
7008 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
7010 while (p1 < pend && (re_opcode_t) *p1 == no_op)
7013 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
7015 /* We have a new highest active register now. This will
7016 get reset at the start_memory we are about to get to,
7017 but we will have saved all the registers relevant to
7018 this repetition op, as described above. */
7019 highest_active_reg = *(p1 + 1) + *(p1 + 2);
7020 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
7021 lowest_active_reg = *(p1 + 1);
7024 DEBUG_PRINT1 (":\n");
7025 PUSH_FAILURE_POINT (p + mcnt, d, -2);
7029 /* A smart repeat ends with `maybe_pop_jump'.
7030 We change it to either `pop_failure_jump' or `jump'. */
7031 case maybe_pop_jump:
7032 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7033 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
7035 register UCHAR_T *p2 = p;
7037 /* Compare the beginning of the repeat with what in the
7038 pattern follows its end. If we can establish that there
7039 is nothing that they would both match, i.e., nothing that
7040 could force us to backtrack (as there is in, e.g., `a*a'),
7041 then we can change to pop_failure_jump, because we'll
7042 never have to backtrack.
7044 This is not true in the case of alternatives: in
7045 `(a|ab)*' we do need to backtrack to the `ab' alternative
7046 (e.g., if the string was `ab'). But instead of trying to
7047 detect that here, the alternative has put on a dummy
7048 failure point which is what we will end up popping. */
7050 /* Skip over open/close-group commands.
7051 If what follows this loop is a ...+ construct,
7052 look at what begins its body, since we will have to
7053 match at least one of that. */
7057 && ((re_opcode_t) *p2 == stop_memory
7058 || (re_opcode_t) *p2 == start_memory))
7060 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
7061 && (re_opcode_t) *p2 == dummy_failure_jump)
7062 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
7068 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
7069 to the `maybe_pop_jump' of this case. Examine what
7072 /* If we're at the end of the pattern, we can change. */
7075 /* Consider what happens when matching ":\(.*\)"
7076 against ":/". I don't really understand this code
7078 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7081 (" End of pattern: change to `pop_failure_jump'.\n");
7084 else if ((re_opcode_t) *p2 == exactn
7086 || (re_opcode_t) *p2 == exactn_bin
7088 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
7091 = *p2 == (UCHAR_T) endline ? '\n' : p2[2];
7093 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
7095 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
7097 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
7099 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7102 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
7104 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
7106 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
7108 (char) p1[3+OFFSET_ADDRESS_SIZE]);
7113 else if ((re_opcode_t) p1[3] == charset
7114 || (re_opcode_t) p1[3] == charset_not)
7116 int not = (re_opcode_t) p1[3] == charset_not;
7118 if (c < (unsigned) (p1[4] * BYTEWIDTH)
7119 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
7122 /* `not' is equal to 1 if c would match, which means
7123 that we can't change to pop_failure_jump. */
7126 p[-3] = (unsigned char) pop_failure_jump;
7127 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7130 #endif /* not WCHAR */
7133 else if ((re_opcode_t) *p2 == charset)
7135 /* We win if the first character of the loop is not part
7137 if ((re_opcode_t) p1[3] == exactn
7138 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
7139 && (p2[2 + p1[5] / BYTEWIDTH]
7140 & (1 << (p1[5] % BYTEWIDTH)))))
7142 p[-3] = (unsigned char) pop_failure_jump;
7143 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7146 else if ((re_opcode_t) p1[3] == charset_not)
7149 /* We win if the charset_not inside the loop
7150 lists every character listed in the charset after. */
7151 for (idx = 0; idx < (int) p2[1]; idx++)
7152 if (! (p2[2 + idx] == 0
7153 || (idx < (int) p1[4]
7154 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
7159 p[-3] = (unsigned char) pop_failure_jump;
7160 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7163 else if ((re_opcode_t) p1[3] == charset)
7166 /* We win if the charset inside the loop
7167 has no overlap with the one after the loop. */
7169 idx < (int) p2[1] && idx < (int) p1[4];
7171 if ((p2[2 + idx] & p1[5 + idx]) != 0)
7174 if (idx == p2[1] || idx == p1[4])
7176 p[-3] = (unsigned char) pop_failure_jump;
7177 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7181 #endif /* not WCHAR */
7183 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
7184 if ((re_opcode_t) p[-1] != pop_failure_jump)
7186 p[-1] = (UCHAR_T) jump;
7187 DEBUG_PRINT1 (" Match => jump.\n");
7188 goto unconditional_jump;
7190 /* Note fall through. */
7193 /* The end of a simple repeat has a pop_failure_jump back to
7194 its matching on_failure_jump, where the latter will push a
7195 failure point. The pop_failure_jump takes off failure
7196 points put on by this pop_failure_jump's matching
7197 on_failure_jump; we got through the pattern to here from the
7198 matching on_failure_jump, so didn't fail. */
7199 case pop_failure_jump:
7201 /* We need to pass separate storage for the lowest and
7202 highest registers, even though we don't care about the
7203 actual values. Otherwise, we will restore only one
7204 register from the stack, since lowest will == highest in
7205 `pop_failure_point'. */
7206 active_reg_t dummy_low_reg, dummy_high_reg;
7207 UCHAR_T *pdummy = NULL;
7208 const CHAR_T *sdummy = NULL;
7210 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
7211 POP_FAILURE_POINT (sdummy, pdummy,
7212 dummy_low_reg, dummy_high_reg,
7213 reg_dummy, reg_dummy, reg_info_dummy);
7215 /* Silence 'set but not used' warnings. */
7219 /* Note fall through. */
7223 DEBUG_PRINT2 ("\n%p: ", p);
7225 DEBUG_PRINT2 ("\n0x%x: ", p);
7227 /* Note fall through. */
7229 /* Unconditionally jump (without popping any failure points). */
7231 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
7232 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7233 p += mcnt; /* Do the jump. */
7235 DEBUG_PRINT2 ("(to %p).\n", p);
7237 DEBUG_PRINT2 ("(to 0x%x).\n", p);
7242 /* We need this opcode so we can detect where alternatives end
7243 in `group_match_null_string_p' et al. */
7245 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
7246 goto unconditional_jump;
7249 /* Normally, the on_failure_jump pushes a failure point, which
7250 then gets popped at pop_failure_jump. We will end up at
7251 pop_failure_jump, also, and with a pattern of, say, `a+', we
7252 are skipping over the on_failure_jump, so we have to push
7253 something meaningless for pop_failure_jump to pop. */
7254 case dummy_failure_jump:
7255 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
7256 /* It doesn't matter what we push for the string here. What
7257 the code at `fail' tests is the value for the pattern. */
7258 PUSH_FAILURE_POINT (NULL, NULL, -2);
7259 goto unconditional_jump;
7262 /* At the end of an alternative, we need to push a dummy failure
7263 point in case we are followed by a `pop_failure_jump', because
7264 we don't want the failure point for the alternative to be
7265 popped. For example, matching `(a|ab)*' against `aab'
7266 requires that we match the `ab' alternative. */
7267 case push_dummy_failure:
7268 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
7269 /* See comments just above at `dummy_failure_jump' about the
7271 PUSH_FAILURE_POINT (NULL, NULL, -2);
7274 /* Have to succeed matching what follows at least n times.
7275 After that, handle like `on_failure_jump'. */
7277 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7278 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
7281 /* Originally, this is how many times we HAVE to succeed. */
7285 p += OFFSET_ADDRESS_SIZE;
7286 STORE_NUMBER_AND_INCR (p, mcnt);
7288 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
7291 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
7298 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
7299 p + OFFSET_ADDRESS_SIZE);
7301 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
7302 p + OFFSET_ADDRESS_SIZE);
7306 p[1] = (UCHAR_T) no_op;
7308 p[2] = (UCHAR_T) no_op;
7309 p[3] = (UCHAR_T) no_op;
7316 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7317 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
7319 /* Originally, this is how many times we CAN jump. */
7323 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
7326 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
7329 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
7332 goto unconditional_jump;
7334 /* If we don't have to jump any more, skip over the rest of the command. */
7336 p += 2 * OFFSET_ADDRESS_SIZE;
7341 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
7343 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7345 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7347 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7349 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7351 STORE_NUMBER (p1, mcnt);
7356 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7357 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7358 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7359 macro and introducing temporary variables works around the bug. */
7362 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7363 if (AT_WORD_BOUNDARY (d))
7368 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7369 if (AT_WORD_BOUNDARY (d))
7375 boolean prevchar, thischar;
7377 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7378 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7381 prevchar = WORDCHAR_P (d - 1);
7382 thischar = WORDCHAR_P (d);
7383 if (prevchar != thischar)
7390 boolean prevchar, thischar;
7392 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7393 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7396 prevchar = WORDCHAR_P (d - 1);
7397 thischar = WORDCHAR_P (d);
7398 if (prevchar != thischar)
7405 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7406 if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
7407 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7412 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7413 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7414 && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
7420 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7421 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7426 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7427 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7432 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7433 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7438 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7443 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7447 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7449 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7451 SET_REGS_MATCHED ();
7455 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7457 goto matchnotsyntax;
7460 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7464 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7466 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7468 SET_REGS_MATCHED ();
7471 #else /* not emacs */
7473 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7475 if (!WORDCHAR_P (d))
7477 SET_REGS_MATCHED ();
7482 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7486 SET_REGS_MATCHED ();
7489 #endif /* not emacs */
7494 continue; /* Successfully executed one pattern command; keep going. */
7497 /* We goto here if a matching operation fails. */
7499 if (!FAIL_STACK_EMPTY ())
7500 { /* A restart point is known. Restore to that state. */
7501 DEBUG_PRINT1 ("\nFAIL:\n");
7502 POP_FAILURE_POINT (d, p,
7503 lowest_active_reg, highest_active_reg,
7504 regstart, regend, reg_info);
7506 /* If this failure point is a dummy, try the next one. */
7510 /* If we failed to the end of the pattern, don't examine *p. */
7514 boolean is_a_jump_n = false;
7516 /* If failed to a backwards jump that's part of a repetition
7517 loop, need to pop this failure point and use the next one. */
7518 switch ((re_opcode_t) *p)
7522 case maybe_pop_jump:
7523 case pop_failure_jump:
7526 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7529 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7531 && (re_opcode_t) *p1 == on_failure_jump))
7539 if (d >= string1 && d <= end1)
7543 break; /* Matching at this starting point really fails. */
7547 goto restore_best_regs;
7551 return -1; /* Failure to match. */
7554 /* Subroutine definitions for re_match_2. */
7557 /* We are passed P pointing to a register number after a start_memory.
7559 Return true if the pattern up to the corresponding stop_memory can
7560 match the empty string, and false otherwise.
7562 If we find the matching stop_memory, sets P to point to one past its number.
7563 Otherwise, sets P to an undefined byte less than or equal to END.
7565 We don't handle duplicates properly (yet).
     P        in/out cursor into the compiled pattern; the scan itself
              starts at *P + 2, just past the two start_memory arguments.
     END      scan limit; the closing-brace comment near the bottom
              indicates the scan loops while p1 < end.
     REG_INFO forwarded unchanged to alt_match_null_string_p and to
              common_op_match_null_string_p.
     NOTE(review): this listing omits the return type, the braces, some
     case labels and the ends of two comments, so the exact control flow
     between the visible statements cannot be confirmed from here. */
7568 PREFIX(group_match_null_string_p) (
7569 UCHAR_T **p, UCHAR_T *end,
7570 PREFIX(register_info_type) *reg_info)
7573 /* Point to after the args to the start_memory. */
7574 UCHAR_T *p1 = *p + 2;
7578 /* Skip over opcodes that can match nothing, and return true or
7579 false, as appropriate, when we get to one that can't, or to the
7580 matching stop_memory. */
7582 switch ((re_opcode_t) *p1)
7584 /* Could be either a loop or a series of alternatives. */
7585 case on_failure_jump:
7587 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7589 /* If the next operation is not a jump backwards in the
7594 /* Go through the on_failure_jumps of the alternatives,
7595 seeing if any of the alternatives cannot match nothing.
7596 The last alternative starts with only a jump,
7597 whereas the rest start with on_failure_jump and end
7598 with a jump, e.g., here is the pattern for `a|b|c':
7600 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7601 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7604 So, we have to first go through the first (n-1)
7605 alternatives and then deal with the last one separately. */
7608 /* Deal with the first (n-1) alternatives, which start
7609 with an on_failure_jump (see above) that jumps to right
7610 past a jump_past_alt. */
7612 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7615 /* `mcnt' holds how many bytes long the alternative
7616 is, including the ending `jump_past_alt' and
7619 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
7620 (1 + OFFSET_ADDRESS_SIZE),
7624 /* Move to right after this alternative, including the
7628 /* Break if it's the beginning of an n-th alternative
7629 that doesn't begin with an on_failure_jump. */
7630 if ((re_opcode_t) *p1 != on_failure_jump)
7633 /* Still have to check that it's not an n-th
7634 alternative that starts with an on_failure_jump. */
7636 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7637 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7640 /* Get to the beginning of the n-th alternative. */
7641 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7646 /* Deal with the last alternative: go back and get number
7647 of the `jump_past_alt' just before it. `mcnt' contains
7648 the length of the alternative. */
7649 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7651 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
7654 p1 += mcnt; /* Get past the n-th alternative. */
/* Sanity check: the byte following the opcode at p1 must be the same
   register number that *P points at.  NOTE(review): presumably this sits
   under a stop_memory case whose label is not shown in this listing. */
7660 assert (p1[1] == **p);
/* Hand the opcode at p1 to the shared helper; it receives &p1, so it
   moves the cursor itself.  NOTE(review): presumably the default case;
   the label is not shown in this listing. */
7666 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7669 } /* while p1 < end */
7672 } /* group_match_null_string_p */
7675 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7676 It expects P to be the first byte of a single alternative and END one
7677 byte past the last. The alternative can contain groups.
     P is taken by value here (UCHAR_T *, not UCHAR_T **), so unlike the
     group variant no scan position is handed back to the caller.
     REG_INFO is forwarded to common_op_match_null_string_p.
     NOTE(review): the return type, the declaration of p1 and mcnt, the
     braces and the return statements are not shown in this listing. */
7680 PREFIX(alt_match_null_string_p) (
7681 UCHAR_T *p, UCHAR_T *end,
7682 PREFIX(register_info_type) *reg_info)
7689 /* Skip over opcodes that can match nothing, and break when we get
7690 to one that can't. */
7692 switch ((re_opcode_t) *p1)
/* on_failure_jump: read its jump offset into mcnt, moving p1 past the
   operand.  NOTE(review): what is then done with mcnt is on lines not
   shown in this listing. */
7695 case on_failure_jump:
7697 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
/* The shared helper judges the opcode at p1 and advances p1 (it is given
   &p1).  NOTE(review): presumably the default case; label not shown. */
7702 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7705 } /* while p1 < end */
7708 } /* alt_match_null_string_p */
7711 /* Deals with the ops common to group_match_null_string_p and
7712 alt_match_null_string_p.
7714 Sets P to one after the op and its arguments, if any.
     P        in/out cursor; the visible code works on a cursor named p1
              and passes &p1 on to group_match_null_string_p for nested
              groups.
     END      forwarded to group_match_null_string_p.
     REG_INFO per-register data; REG_MATCH_NULL_STRING_P of a register is
              both read (as a cached answer) and written (when it still
              holds MATCH_NULL_UNSET_VALUE) by the visible code.
     NOTE(review): almost all case labels, the declarations, the braces
     and the return statements are not shown in this listing, so the
     opcode each fragment below belongs to is an assumption. */
7717 PREFIX(common_op_match_null_string_p) (
7718 UCHAR_T **p, UCHAR_T *end,
7719 PREFIX(register_info_type) *reg_info)
/* Dispatch on the opcode at p1; the post-increment leaves p1 on the first
   operand byte. */
7726 switch ((re_opcode_t) *p1++)
/* A register number is expected to lie in 1..MAX_REGNUM.  The nested group
   is then evaluated recursively.  NOTE(review): presumably the start_memory
   case; where reg_no is read is not shown. */
7746 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7747 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info);
7749 /* Have to set this here in case we're checking a group which
7750 contains a group and a back reference to it. */
7752 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7753 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7759 /* If this is an optimized succeed_n for zero times, make the jump. */
7761 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7769 /* Get to the number of times to succeed. */
7770 p1 += OFFSET_ADDRESS_SIZE;
7771 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
/* Step back over both address-sized operands just consumed, so that the
   next extraction re-reads the first one (presumably the jump offset; the
   controlling condition is not shown in this listing). */
7775 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7776 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
/* Consult the cached null-match flag of the register whose number is at
   *p1.  NOTE(review): presumably the back-reference (duplicate) case; the
   label is not shown. */
7784 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
/* Skip two address-sized operands. */
7789 p1 += 2 * OFFSET_ADDRESS_SIZE;
7792 /* All other opcodes mean we cannot match the empty string. */
7798 } /* common_op_match_null_string_p */
7801 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7802 bytes; nonzero otherwise.
     S1 and S2 are walked through unsigned (UCHAR_T) cursors p1 and p2.
     Two comparison forms are visible below:
       - one indexes TRANSLATE only for element values <= 0xff and compares
         larger values untranslated;
       - one always indexes TRANSLATE and returns 1 on the first mismatch.
     NOTE(review): presumably a preprocessor conditional (wide-character
     versus byte build) selects one form, and in the first form LEN would
     count elements rather than bytes -- confirm.  The declaration of LEN,
     the loop, the final return and the preprocessor lines are not shown
     in this listing. */
7805 PREFIX(bcmp_translate) (
7806 const CHAR_T *s1, const CHAR_T *s2,
7808 __RE_TRANSLATE_TYPE translate)
7810 register const UCHAR_T *p1 = (const UCHAR_T *) s1;
7811 register const UCHAR_T *p2 = (const UCHAR_T *) s2;
/* Guarded form: values above 0xff bypass the translate table. */
7815 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7816 != ((*p2<=0xff)?translate[*p2++]:*p2++))
/* Unguarded form: every element is used directly as a table index. */
7819 if (translate[*p1++] != translate[*p2++]) return 1;
7827 #else /* not INSIDE_RECURSION */
7829 /* Entry points for GNU code. */
7831 /* re_compile_pattern is the GNU regular expression compiler: it
7832 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
7833 Returns 0 if the pattern was valid, otherwise an error string.
7835 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7836 are set in BUFP on entry.
7838 The actual compilation is done by wcs_regex_compile or
     byte_regex_compile, called with re_syntax_options as the syntax.
     The wcs_ call is guarded by MB_CUR_MAX != 1; presumably the byte_
     call is the fallback (the else line is not shown in this listing).
     Side effects visible here: BUFP->regs_allocated is reset to
     REGS_UNALLOCATED and BUFP->newline_anchor is set to 1.
     NOTE(review): the return type, the LENGTH parameter declaration, the
     success return and the preprocessor lines are not shown. */
7841 re_compile_pattern (const char *pattern,
7843 struct re_pattern_buffer *bufp)
7847 /* GNU code is written to assume at least RE_NREGS registers will be set
7848 (and at least one extra will be -1). */
7849 bufp->regs_allocated = REGS_UNALLOCATED;
7851 /* And GNU code determines whether or not to get register information
7852 by passing null for the REGS argument to re_match, etc., not by
7856 /* Match anchors at newline. */
7857 bufp->newline_anchor = 1;
7860 if (MB_CUR_MAX != 1)
7861 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp);
7864 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp);
/* Turn the error code into its message: re_error_msgid_idx[ret] is used as
   an offset into re_error_msgid and the result is passed through gettext. */
7868 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7871 /* Entry points compatible with 4.2 BSD regex library. We don't define
7872 them unless specifically requested. */
7874 #if defined _REGEX_RE_COMP || defined _LIBC
7876 /* BSD has one and only one pattern buffer.  re_comp compiles into it
     and re_exec searches with it. */
7877 static struct re_pattern_buffer re_comp_buf;
7881 /* Make these definitions weak in libc, so POSIX programs can redefine
7882 these names if they don't use our functions, and still use
7883 regcomp/regexec below without link errors. */
/* re_comp: compile S into the single static buffer re_comp_buf and return
   an error message string on failure.
   Visible behaviour:
     - returns the message No previous regular expression when
       re_comp_buf.buffer is null (the enclosing condition is not shown;
       presumably the case of an empty or null S -- confirm);
     - when the buffer is null, allocates a 200-byte pattern buffer and a
       (1 << BYTEWIDTH)-byte fastmap, returning the REG_ESPACE message if
       either malloc fails;
     - sets newline_anchor, then compiles strlen (S) bytes of S with
       re_syntax_options;
     - finally returns the message selected by the compile result.
   NOTE(review): if the fastmap allocation fails, buffer is already
   non-null, so it appears a later call would skip this initialisation and
   leave fastmap null -- confirm the search code tolerates that.
   NOTE(review): the return type, the declaration of ret, the braces, the
   success return and the preprocessor lines are not shown in this
   listing. */
7886 re_comp (const char *s)
7892 if (!re_comp_buf.buffer)
7893 return gettext ("No previous regular expression");
7897 if (!re_comp_buf.buffer)
7899 re_comp_buf.buffer = (unsigned char *) malloc (200);
7900 if (re_comp_buf.buffer == NULL)
7901 return (char *) gettext (re_error_msgid
7902 + re_error_msgid_idx[(int) REG_ESPACE]);
7903 re_comp_buf.allocated = 200;
7905 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
7906 if (re_comp_buf.fastmap == NULL)
7907 return (char *) gettext (re_error_msgid
7908 + re_error_msgid_idx[(int) REG_ESPACE]);
7911 /* Since `re_exec' always passes NULL for the `regs' argument, we
7912 don't need to initialize the pattern buffer fields which affect it. */
7914 /* Match anchors at newlines. */
7915 re_comp_buf.newline_anchor = 1;
7918 if (MB_CUR_MAX != 1)
7919 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7922 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7927 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
7928 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7933 #if defined _LIBC || defined __UCLIBC__
/* re_exec: search S with the pattern held in re_comp_buf.  The visible
   expression is true when re_search returns a value >= 0.  re_search is
   called with S, then len, 0, len, and a null struct re_registers pointer,
   so no register information is requested.
   NOTE(review): strlen returns size_t but the length is stored in an int;
   confirm that strings longer than INT_MAX cannot reach this function.
   NOTE(review): the return type, the braces and the return keyword are
   not shown in this listing. */
7936 re_exec (const char *s)
7938 const int len = strlen (s);
7940 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
7943 #endif /* _REGEX_RE_COMP */
7945 /* POSIX.2 functions. Don't define these for Emacs. */
7949 /* regcomp takes a regular expression as a string and compiles it.
7951 PREG is a regex_t *. We do not expect any fields to be initialized,
7952 since POSIX says we shouldn't. Thus, we set
7954 `buffer' to the compiled pattern;
7955 `used' to the length of the compiled pattern;
7956 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
7957 REG_EXTENDED bit in CFLAGS is set; otherwise, to
7958 RE_SYNTAX_POSIX_BASIC;
7959 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
7960 `fastmap' to an allocated space for the fastmap;
7961 `fastmap_accurate' to zero;
7962 `re_nsub' to the number of subexpressions in PATTERN.
7964 PATTERN is the address of the pattern string.
7966 CFLAGS is a series of bits which affect compilation.
7968 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
7969 use POSIX basic syntax.
7971 If REG_NEWLINE is set, then . and [^...] don't match newline.
7972 Also, regexec will try a match beginning after every newline.
7974 If REG_ICASE is set, then we consider upper- and lowercase
7975 versions of letters to be equivalent when matching.
7977 If REG_NOSUB is set, then when PREG is passed to regexec, that
7978 routine will report only success or failure, and nothing about the
7981 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
7982 the return codes and their meanings.) */
7987 const char *pattern,
7992 = (cflags & REG_EXTENDED) ?
7993 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
7995 /* regex_compile will allocate the space for the compiled pattern. */
7997 preg->allocated = 0;
8000 /* Try to allocate space for the fastmap. */
8001 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
8003 if (cflags & REG_ICASE)
8008 = (__RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
8009 * sizeof (*(__RE_TRANSLATE_TYPE)0));
8010 if (preg->translate == NULL)
8011 return (int) REG_ESPACE;
8013 /* Map uppercase characters to corresponding lowercase ones. */
8014 for (i = 0; i < CHAR_SET_SIZE; i++)
8015 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
8018 preg->translate = NULL;
8020 /* If REG_NEWLINE is set, newlines are treated differently. */
8021 if (cflags & REG_NEWLINE)
8022 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
8023 syntax &= ~RE_DOT_NEWLINE;
8024 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
8025 /* It also changes the matching behavior. */
8026 preg->newline_anchor = 1;
8029 preg->newline_anchor = 0;
8031 preg->no_sub = !!(cflags & REG_NOSUB);
8033 /* POSIX says a null character in the pattern terminates it, so we
8034 can use strlen here in compiling the pattern. */
8036 if (MB_CUR_MAX != 1)
8037 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg);
8040 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg);
8042 /* POSIX doesn't distinguish between an unmatched open-group and an
8043 unmatched close-group: both are REG_EPAREN. */
8044 if (ret == REG_ERPAREN) ret = REG_EPAREN;
8046 if (ret == REG_NOERROR && preg->fastmap)
8048 /* Compute the fastmap now, since regexec cannot modify the pattern
8050 if (re_compile_fastmap (preg) == -2)
8052 /* Some error occurred while computing the fastmap, just forget
8054 free (preg->fastmap);
8055 preg->fastmap = NULL;
8063 /* regexec searches for a given pattern, specified by PREG, in the
8066 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
8067 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
8068 least NMATCH elements, and we set them to the offsets of the
8069 corresponding matched substrings.
8071 EFLAGS specifies `execution flags' which affect matching: if
8072 REG_NOTBOL is set, then ^ does not match at the beginning of the
8073 string; if REG_NOTEOL is set, then $ does not match at the end.
8075 We return 0 if we find a match and REG_NOMATCH if not. */
8079 const regex_t *preg,
8082 regmatch_t pmatch[],
8086 struct re_registers regs;
8087 regex_t private_preg;
8088 int len = strlen (string);
8089 boolean want_reg_info = !preg->no_sub && nmatch > 0;
8091 /* use hidden memcpy() ourselves rather than gcc calling public memcpy() */
8092 memcpy(&private_preg, preg, sizeof(*preg));
8094 private_preg.not_bol = !!(eflags & REG_NOTBOL);
8095 private_preg.not_eol = !!(eflags & REG_NOTEOL);
8097 /* The user has told us exactly how many registers to return
8098 information about, via `nmatch'. We have to pass that on to the
8099 matching routines. */
8100 private_preg.regs_allocated = REGS_FIXED;
8104 regs.num_regs = nmatch;
8105 regs.start = TALLOC (nmatch * 2, regoff_t);
8106 if (regs.start == NULL)
8107 return (int) REG_NOMATCH;
8108 regs.end = regs.start + nmatch;
8111 /* Perform the searching operation. */
8112 ret = re_search (&private_preg, string, len,
8113 /* start: */ 0, /* range: */ len,
8114 want_reg_info ? ®s : (struct re_registers *) 0);
8116 /* Copy the register information to the POSIX structure. */
8123 for (r = 0; r < nmatch; r++)
8125 pmatch[r].rm_so = regs.start[r];
8126 pmatch[r].rm_eo = regs.end[r];
8130 /* If we needed the temporary register info, free the space now. */
8134 /* We want zero return to mean success, unlike `re_search'. */
8135 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
8137 libc_hidden_def(regexec)
8140 /* Returns a message corresponding to an error code, ERRCODE, returned
8141 from either regcomp or regexec. We don't use PREG here. */
8146 const regex_t * preg attribute_unused,
8154 || errcode >= (int) (sizeof (re_error_msgid_idx)
8155 / sizeof (re_error_msgid_idx[0])))
8156 /* Only error codes returned by the rest of the code should be passed
8157 to this routine. If we are given anything else, or if other regex
8158 code generates an invalid error code, then the program has a bug.
8159 Dump core so we can fix it. */
8162 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
8164 msg_size = strlen (msg) + 1; /* Includes the null. */
8166 if (errbuf_size != 0)
8168 if (msg_size > errbuf_size)
8170 memcpy (errbuf, msg, errbuf_size - 1);
8171 errbuf[errbuf_size - 1] = 0;
8174 memcpy (errbuf, msg, msg_size);
8181 /* Free dynamically allocated space used by PREG. */
8184 regfree (regex_t *preg)
8186 free (preg->buffer);
8187 preg->buffer = NULL;
8189 preg->allocated = 0;
8192 free (preg->fastmap);
8193 preg->fastmap = NULL;
8194 preg->fastmap_accurate = 0;
8196 free (preg->translate);
8197 preg->translate = NULL;
8199 libc_hidden_def(regfree)
8201 #endif /* not emacs */
8203 #endif /* not INSIDE_RECURSION */
8207 #undef STORE_NUMBER_AND_INCR
8208 #undef EXTRACT_NUMBER
8209 #undef EXTRACT_NUMBER_AND_INCR
8211 #undef DEBUG_PRINT_COMPILED_PATTERN
8212 #undef DEBUG_PRINT_DOUBLE_STRING
8214 #undef INIT_FAIL_STACK
8215 #undef RESET_FAIL_STACK
8216 #undef DOUBLE_FAIL_STACK
8217 #undef PUSH_PATTERN_OP
8218 #undef PUSH_FAILURE_POINTER
8219 #undef PUSH_FAILURE_INT
8220 #undef PUSH_FAILURE_ELT
8221 #undef POP_FAILURE_POINTER
8222 #undef POP_FAILURE_INT
8223 #undef POP_FAILURE_ELT
8226 #undef PUSH_FAILURE_POINT
8227 #undef POP_FAILURE_POINT
8229 #undef REG_UNSET_VALUE
8237 #undef INIT_BUF_SIZE
8238 #undef GET_BUFFER_SPACE
8246 #undef EXTEND_BUFFER
8247 #undef GET_UNSIGNED_NUMBER
8248 #undef FREE_STACK_RETURN
8250 # undef POINTER_TO_OFFSET
8251 # undef MATCHING_IN_FRST_STRING
8253 # undef AT_STRINGS_BEG
8254 # undef AT_STRINGS_END
8257 # undef FREE_VARIABLES
8258 # undef NO_HIGHEST_ACTIVE_REG
8259 # undef NO_LOWEST_ACTIVE_REG
8263 # undef COMPILED_BUFFER_VAR
8264 # undef OFFSET_ADDRESS_SIZE
8265 # undef CHAR_CLASS_SIZE
8272 # define DEFINED_ONCE