1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22 int n) internal_function;
23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
24 static void match_ctx_free (re_match_context_t *cache) internal_function;
25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
26 int str_idx, int from, int to)
28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31 int str_idx) internal_function;
32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33 int node, int str_idx)
35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
36 re_dfastate_t **limited_sts, int last_node,
39 static reg_errcode_t re_search_internal (const regex_t *preg,
40 const char *string, int length,
41 int start, int range, int stop,
42 size_t nmatch, regmatch_t pmatch[],
43 int eflags) internal_function;
44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
45 const char *string1, int length1,
46 const char *string2, int length2,
47 int start, int range, struct re_registers *regs,
48 int stop, int ret_len) internal_function;
49 static int re_search_stub (struct re_pattern_buffer *bufp,
50 const char *string, int length, int start,
51 int range, int stop, struct re_registers *regs,
52 int ret_len) internal_function;
53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
54 int nregs, int regs_allocated) internal_function;
55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
58 int *p_match_first) internal_function;
59 static int check_halt_state_context (const re_match_context_t *mctx,
60 const re_dfastate_t *state, int idx)
62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
63 regmatch_t *prev_idx_match, int cur_node,
64 int cur_idx, int nmatch) internal_function;
65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
66 int str_idx, int dest_node, int nregs,
68 re_node_set *eps_via_nodes)
70 static reg_errcode_t set_regs (const regex_t *preg,
71 const re_match_context_t *mctx,
72 size_t nmatch, regmatch_t *pmatch,
73 int fl_backtrack) internal_function;
74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
78 static int sift_states_iter_mb (const re_match_context_t *mctx,
79 re_sift_context_t *sctx,
80 int node_idx, int str_idx, int max_str_idx)
83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
84 re_sift_context_t *sctx)
86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
87 re_sift_context_t *sctx, int str_idx,
88 re_node_set *cur_dest)
90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
91 re_sift_context_t *sctx,
93 re_node_set *dest_nodes)
95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
96 re_node_set *dest_nodes,
97 const re_node_set *candidates)
99 static int check_dst_limits (const re_match_context_t *mctx,
101 int dst_node, int dst_idx, int src_node,
102 int src_idx) internal_function;
103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
104 int boundaries, int subexp_idx,
105 int from_node, int bkref_idx)
107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
108 int limit, int subexp_idx,
109 int node, int str_idx,
110 int bkref_idx) internal_function;
111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
112 re_node_set *dest_nodes,
113 const re_node_set *candidates,
115 struct re_backref_cache_entry *bkref_ents,
116 int str_idx) internal_function;
117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
118 re_sift_context_t *sctx,
119 int str_idx, const re_node_set *candidates)
121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
123 re_dfastate_t **src, int num)
125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
126 re_match_context_t *mctx) internal_function;
127 static re_dfastate_t *transit_state (reg_errcode_t *err,
128 re_match_context_t *mctx,
129 re_dfastate_t *state) internal_function;
130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
131 re_match_context_t *mctx,
132 re_dfastate_t *next_state)
134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
135 re_node_set *cur_nodes,
136 int str_idx) internal_function;
138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
139 re_match_context_t *mctx,
140 re_dfastate_t *pstate)
143 #ifdef RE_ENABLE_I18N
144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
145 re_dfastate_t *pstate)
148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
149 const re_node_set *nodes)
151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
152 int bkref_node, int bkref_str_idx)
154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
155 const re_sub_match_top_t *sub_top,
156 re_sub_match_last_t *sub_last,
157 int bkref_node, int bkref_str)
159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
160 int subexp_idx, int type) internal_function;
161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
162 state_array_t *path, int top_node,
163 int top_str, int last_node, int last_str,
164 int type) internal_function;
165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
167 re_node_set *cur_nodes,
168 re_node_set *next_nodes)
170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
171 re_node_set *cur_nodes,
172 int ex_subexp, int type)
174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
175 re_node_set *dst_nodes,
176 int target, int ex_subexp,
177 int type) internal_function;
178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
179 re_node_set *cur_nodes, int cur_str,
180 int subexp_num, int type)
182 static int build_trtable (const re_dfa_t *dfa,
183 re_dfastate_t *state) internal_function;
184 #ifdef RE_ENABLE_I18N
185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
186 const re_string_t *input, int idx)
189 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
190 const re_dfastate_t *state,
191 re_node_set *states_node,
192 bitset_t *states_ch) internal_function;
193 static int check_node_accept (const re_match_context_t *mctx,
194 const re_token_t *node, int idx)
196 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
199 /* Entry point for POSIX code. */
201 /* regexec searches for a given pattern, specified by PREG, in the
204 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
205 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
206 least NMATCH elements, and we set them to the offsets of the
207 corresponding matched substrings.
209 EFLAGS specifies `execution flags' which affect matching: if
210 REG_NOTBOL is set, then ^ does not match at the beginning of the
211 string; if REG_NOTEOL is set, then $ does not match at the end.
213 We return 0 if we find a match and REG_NOMATCH if not. */
216 regexec (preg, string, nmatch, pmatch, eflags)
217 const regex_t *__restrict preg;
218 const char *__restrict string;
225 #ifdef __UCLIBC_HAS_THREADS__
226 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
229 if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
232 if (eflags & REG_STARTEND)
234 start = pmatch[0].rm_so;
235 length = pmatch[0].rm_eo;
240 length = strlen (string);
243 __libc_lock_lock (dfa->lock);
245 err = re_search_internal (preg, string, length, start, length - start,
246 length, 0, NULL, eflags);
248 err = re_search_internal (preg, string, length, start, length - start,
249 length, nmatch, pmatch, eflags);
250 __libc_lock_unlock (dfa->lock);
251 return err != REG_NOERROR;
253 libc_hidden_def(regexec)
255 /* Entry points for GNU code. */
257 /* re_match, re_search, re_match_2, re_search_2
259 The former two functions operate on STRING with length LENGTH,
260 while the later two operate on concatenation of STRING1 and STRING2
261 with lengths LENGTH1 and LENGTH2, respectively.
263 re_match() matches the compiled pattern in BUFP against the string,
264 starting at index START.
266 re_search() first tries matching at index START, then it tries to match
267 starting from index START + 1, and so on. The last start position tried
268 is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
271 The parameter STOP of re_{match,search}_2 specifies that no match exceeding
272 the first STOP characters of the concatenation of the strings should be
275 If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
276 and all groups is stroed in REGS. (For the "_2" variants, the offsets are
277 computed relative to the concatenation, not relative to the individual
280 On success, re_match* functions return the length of the match, re_search*
281 return the position of the start of the match. Return value -1 means no
282 match was found and -2 indicates an internal error. */
285 re_match (bufp, string, length, start, regs)
286 struct re_pattern_buffer *bufp;
289 struct re_registers *regs;
291 return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
295 re_search (bufp, string, length, start, range, regs)
296 struct re_pattern_buffer *bufp;
298 int length, start, range;
299 struct re_registers *regs;
301 return re_search_stub (bufp, string, length, start, range, length, regs, 0);
303 libc_hidden_def(re_search)
306 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
307 struct re_pattern_buffer *bufp;
308 const char *string1, *string2;
309 int length1, length2, start, stop;
310 struct re_registers *regs;
312 return re_search_2_stub (bufp, string1, length1, string2, length2,
313 start, 0, regs, stop, 1);
317 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
318 struct re_pattern_buffer *bufp;
319 const char *string1, *string2;
320 int length1, length2, start, range, stop;
321 struct re_registers *regs;
323 return re_search_2_stub (bufp, string1, length1, string2, length2,
324 start, range, regs, stop, 0);
326 libc_hidden_def(re_search_2)
329 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
331 struct re_pattern_buffer *bufp;
332 const char *string1, *string2;
333 int length1, length2, start, range, stop, ret_len;
334 struct re_registers *regs;
338 int len = length1 + length2;
341 if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
344 /* Concatenate the strings. */
348 char *s = re_malloc (char, len);
350 if (BE (s == NULL, 0))
352 memcpy (s, string1, length1);
353 memcpy (s + length1, string2, length2);
362 rval = re_search_stub (bufp, str, len, start, range, stop, regs,
365 re_free ((char *) str);
369 /* The parameters have the same meaning as those of re_search.
370 Additional parameters:
371 If RET_LEN is nonzero the length of the match is returned (re_match style);
372 otherwise the position of the match is returned. */
375 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
376 struct re_pattern_buffer *bufp;
378 int length, start, range, stop, ret_len;
379 struct re_registers *regs;
381 reg_errcode_t result;
385 #ifdef __UCLIBC_HAS_THREADS__
386 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
388 /* Check for out-of-range. */
389 if (BE (start < 0 || start > length, 0))
391 if (BE (start + range > length, 0))
392 range = length - start;
393 else if (BE (start + range < 0, 0))
396 __libc_lock_lock (dfa->lock);
398 eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
399 eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
401 /* Compile fastmap if we haven't yet. */
402 if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
403 re_compile_fastmap (bufp);
405 if (BE (bufp->no_sub, 0))
408 /* We need at least 1 register. */
411 else if (BE (bufp->regs_allocated == REGS_FIXED &&
412 regs->num_regs < bufp->re_nsub + 1, 0))
414 nregs = regs->num_regs;
415 if (BE (nregs < 1, 0))
417 /* Nothing can be copied to regs. */
423 nregs = bufp->re_nsub + 1;
424 pmatch = re_malloc (regmatch_t, nregs);
425 if (BE (pmatch == NULL, 0))
431 result = re_search_internal (bufp, string, length, start, range, stop,
432 nregs, pmatch, eflags);
436 /* I hope we needn't fill ther regs with -1's when no match was found. */
437 if (result != REG_NOERROR)
439 else if (regs != NULL)
441 /* If caller wants register contents data back, copy them. */
442 bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
443 bufp->regs_allocated);
444 if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
448 if (BE (rval == 0, 1))
452 assert (pmatch[0].rm_so == start);
453 rval = pmatch[0].rm_eo - start;
456 rval = pmatch[0].rm_so;
460 __libc_lock_unlock (dfa->lock);
465 re_copy_regs (regs, pmatch, nregs, regs_allocated)
466 struct re_registers *regs;
468 int nregs, regs_allocated;
470 int rval = REGS_REALLOCATE;
472 int need_regs = nregs + 1;
473 /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
476 /* Have the register data arrays been allocated? */
477 if (regs_allocated == REGS_UNALLOCATED)
478 { /* No. So allocate them with malloc. */
479 regs->start = re_malloc (regoff_t, need_regs);
480 regs->end = re_malloc (regoff_t, need_regs);
481 if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
482 return REGS_UNALLOCATED;
483 regs->num_regs = need_regs;
485 else if (regs_allocated == REGS_REALLOCATE)
486 { /* Yes. If we need more elements than were already
487 allocated, reallocate them. If we need fewer, just
489 if (BE (need_regs > regs->num_regs, 0))
491 regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
492 regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
493 if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
494 return REGS_UNALLOCATED;
495 regs->start = new_start;
497 regs->num_regs = need_regs;
502 assert (regs_allocated == REGS_FIXED);
503 /* This function may not be called with REGS_FIXED and nregs too big. */
504 assert (regs->num_regs >= nregs);
509 for (i = 0; i < nregs; ++i)
511 regs->start[i] = pmatch[i].rm_so;
512 regs->end[i] = pmatch[i].rm_eo;
514 for ( ; i < regs->num_regs; ++i)
515 regs->start[i] = regs->end[i] = -1;
520 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
521 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
522 this memory for recording register information. STARTS and ENDS
523 must be allocated using the malloc library routine, and must each
524 be at least NUM_REGS * sizeof (regoff_t) bytes long.
526 If NUM_REGS == 0, then subsequent matches should allocate their own
529 Unless this function is called, the first search or match using
530 PATTERN_BUFFER will allocate its own register data, without
531 freeing the old data. */
534 re_set_registers (bufp, regs, num_regs, starts, ends)
535 struct re_pattern_buffer *bufp;
536 struct re_registers *regs;
538 regoff_t *starts, *ends;
542 bufp->regs_allocated = REGS_REALLOCATE;
543 regs->num_regs = num_regs;
544 regs->start = starts;
549 bufp->regs_allocated = REGS_UNALLOCATED;
551 regs->start = regs->end = (regoff_t *) 0;
555 /* Entry points compatible with 4.2 BSD regex library. We don't define
556 them unless specifically requested. */
558 #if defined _REGEX_RE_COMP || defined __UCLIBC__
561 re_exec (const char *s)
563 return 0 == regexec (re_comp_buf, s, 0, NULL, 0);
567 /* Internal entry point. */
569 /* Searches for a compiled pattern PREG in the string STRING, whose
570 length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
571 mingings with regexec. START, and RANGE have the same meanings
573 Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
574 otherwise return the error code.
575 Note: We assume front end functions already check ranges.
576 (START + RANGE >= 0 && START + RANGE <= LENGTH) */
579 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
583 int length, start, range, stop, eflags;
588 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
589 int left_lim, right_lim, incr;
590 int fl_longest_match, match_first, match_kind, match_last = -1;
593 re_match_context_t mctx;
594 char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
595 && range && !preg->can_be_null) ? preg->fastmap : NULL;
596 RE_TRANSLATE_TYPE t = preg->translate;
598 memset (&mctx, '\0', sizeof (re_match_context_t));
601 extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
602 nmatch -= extra_nmatch;
604 /* Check if the DFA haven't been compiled. */
605 if (BE (preg->used == 0 || dfa->init_state == NULL
606 || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
607 || dfa->init_state_begbuf == NULL, 0))
611 /* We assume front-end functions already check them. */
612 assert (start + range >= 0 && start + range <= length);
615 /* If initial states with non-begbuf contexts have no elements,
616 the regex must be anchored. If preg->newline_anchor is set,
617 we'll never use init_state_nl, so do not check it. */
618 if (dfa->init_state->nodes.nelem == 0
619 && dfa->init_state_word->nodes.nelem == 0
620 && (dfa->init_state_nl->nodes.nelem == 0
621 || !preg->newline_anchor))
623 if (start != 0 && start + range != 0)
628 /* We must check the longest matching, if nmatch > 0. */
629 fl_longest_match = (nmatch != 0 || dfa->nbackref);
631 err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
632 preg->translate, preg->syntax & RE_ICASE, dfa);
633 if (BE (err != REG_NOERROR, 0))
635 mctx.input.stop = stop;
636 mctx.input.raw_stop = stop;
637 mctx.input.newline_anchor = preg->newline_anchor;
639 err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
640 if (BE (err != REG_NOERROR, 0))
643 /* We will log all the DFA states through which the dfa pass,
644 if nmatch > 1, or this dfa has "multibyte node", which is a
645 back-reference or a node which can accept multibyte character or
646 multi character collating element. */
647 if (nmatch > 1 || dfa->has_mb_node)
649 mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
650 if (BE (mctx.state_log == NULL, 0))
657 mctx.state_log = NULL;
660 mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
661 : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
663 /* Check incrementally whether of not the input string match. */
664 incr = (range < 0) ? -1 : 1;
665 left_lim = (range < 0) ? start + range : start;
666 right_lim = (range < 0) ? start : start + range;
667 sb = dfa->mb_cur_max == 1;
670 ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
671 | (range >= 0 ? 2 : 0)
672 | (t != NULL ? 1 : 0))
675 for (;; match_first += incr)
678 if (match_first < left_lim || right_lim < match_first)
681 /* Advance as rapidly as possible through the string, until we
682 find a plausible place to start matching. This may be done
683 with varying efficiency, so there are various possibilities:
684 only the most common of them are specialized, in order to
685 save on code size. We use a switch statement for speed. */
693 /* Fastmap with single-byte translation, match forward. */
694 while (BE (match_first < right_lim, 1)
695 && !fastmap[t[(unsigned char) string[match_first]]])
697 goto forward_match_found_start_or_reached_end;
700 /* Fastmap without translation, match forward. */
701 while (BE (match_first < right_lim, 1)
702 && !fastmap[(unsigned char) string[match_first]])
705 forward_match_found_start_or_reached_end:
706 if (BE (match_first == right_lim, 0))
708 ch = match_first >= length
709 ? 0 : (unsigned char) string[match_first];
710 if (!fastmap[t ? t[ch] : ch])
717 /* Fastmap without multi-byte translation, match backwards. */
718 while (match_first >= left_lim)
720 ch = match_first >= length
721 ? 0 : (unsigned char) string[match_first];
722 if (fastmap[t ? t[ch] : ch])
726 if (match_first < left_lim)
731 /* In this case, we can't determine easily the current byte,
732 since it might be a component byte of a multibyte
733 character. Then we use the constructed buffer instead. */
736 /* If MATCH_FIRST is out of the valid range, reconstruct the
738 unsigned int offset = match_first - mctx.input.raw_mbs_idx;
739 if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
741 err = re_string_reconstruct (&mctx.input, match_first,
743 if (BE (err != REG_NOERROR, 0))
746 offset = match_first - mctx.input.raw_mbs_idx;
748 /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
749 Note that MATCH_FIRST must not be smaller than 0. */
750 ch = (match_first >= length
751 ? 0 : re_string_byte_at (&mctx.input, offset));
755 if (match_first < left_lim || match_first > right_lim)
764 /* Reconstruct the buffers so that the matcher can assume that
765 the matching starts from the beginning of the buffer. */
766 err = re_string_reconstruct (&mctx.input, match_first, eflags);
767 if (BE (err != REG_NOERROR, 0))
770 #ifdef RE_ENABLE_I18N
771 /* Don't consider this char as a possible match start if it part,
772 yet isn't the head, of a multibyte character. */
773 if (!sb && !re_string_first_byte (&mctx.input, 0))
777 /* It seems to be appropriate one, then use the matcher. */
778 /* We assume that the matching starts from 0. */
779 mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
780 match_last = check_matching (&mctx, fl_longest_match,
781 range >= 0 ? &match_first : NULL);
782 if (match_last != -1)
784 if (BE (match_last == -2, 0))
791 mctx.match_last = match_last;
792 if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
794 re_dfastate_t *pstate = mctx.state_log[match_last];
795 mctx.last_node = check_halt_state_context (&mctx, pstate,
798 if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
801 err = prune_impossible_nodes (&mctx);
802 if (err == REG_NOERROR)
804 if (BE (err != REG_NOMATCH, 0))
809 break; /* We found a match. */
813 match_ctx_clean (&mctx);
817 assert (match_last != -1);
818 assert (err == REG_NOERROR);
821 /* Set pmatch[] if we need. */
826 /* Initialize registers. */
827 for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
828 pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
830 /* Set the points where matching start/end. */
832 pmatch[0].rm_eo = mctx.match_last;
834 if (!preg->no_sub && nmatch > 1)
836 err = set_regs (preg, &mctx, nmatch, pmatch,
837 dfa->has_plural_match && dfa->nbackref > 0);
838 if (BE (err != REG_NOERROR, 0))
842 /* At last, add the offset to the each registers, since we slided
843 the buffers so that we could assume that the matching starts
845 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
846 if (pmatch[reg_idx].rm_so != -1)
848 #ifdef RE_ENABLE_I18N
849 if (BE (mctx.input.offsets_needed != 0, 0))
851 pmatch[reg_idx].rm_so =
852 (pmatch[reg_idx].rm_so == mctx.input.valid_len
853 ? mctx.input.valid_raw_len
854 : mctx.input.offsets[pmatch[reg_idx].rm_so]);
855 pmatch[reg_idx].rm_eo =
856 (pmatch[reg_idx].rm_eo == mctx.input.valid_len
857 ? mctx.input.valid_raw_len
858 : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
861 assert (mctx.input.offsets_needed == 0);
863 pmatch[reg_idx].rm_so += match_first;
864 pmatch[reg_idx].rm_eo += match_first;
866 for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
868 pmatch[nmatch + reg_idx].rm_so = -1;
869 pmatch[nmatch + reg_idx].rm_eo = -1;
873 for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
874 if (dfa->subexp_map[reg_idx] != reg_idx)
876 pmatch[reg_idx + 1].rm_so
877 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
878 pmatch[reg_idx + 1].rm_eo
879 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
884 re_free (mctx.state_log);
886 match_ctx_free (&mctx);
887 re_string_destruct (&mctx.input);
892 prune_impossible_nodes (mctx)
893 re_match_context_t *mctx;
895 const re_dfa_t *const dfa = mctx->dfa;
896 int halt_node, match_last;
898 re_dfastate_t **sifted_states;
899 re_dfastate_t **lim_states = NULL;
900 re_sift_context_t sctx;
902 assert (mctx->state_log != NULL);
904 match_last = mctx->match_last;
905 halt_node = mctx->last_node;
906 sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
907 if (BE (sifted_states == NULL, 0))
914 lim_states = re_malloc (re_dfastate_t *, match_last + 1);
915 if (BE (lim_states == NULL, 0))
922 memset (lim_states, '\0',
923 sizeof (re_dfastate_t *) * (match_last + 1));
924 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
926 ret = sift_states_backward (mctx, &sctx);
927 re_node_set_free (&sctx.limits);
928 if (BE (ret != REG_NOERROR, 0))
930 if (sifted_states[0] != NULL || lim_states[0] != NULL)
940 } while (mctx->state_log[match_last] == NULL
941 || !mctx->state_log[match_last]->halt);
942 halt_node = check_halt_state_context (mctx,
943 mctx->state_log[match_last],
946 ret = merge_state_array (dfa, sifted_states, lim_states,
948 re_free (lim_states);
950 if (BE (ret != REG_NOERROR, 0))
955 sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
956 ret = sift_states_backward (mctx, &sctx);
957 re_node_set_free (&sctx.limits);
958 if (BE (ret != REG_NOERROR, 0))
961 re_free (mctx->state_log);
962 mctx->state_log = sifted_states;
963 sifted_states = NULL;
964 mctx->last_node = halt_node;
965 mctx->match_last = match_last;
968 re_free (sifted_states);
969 re_free (lim_states);
973 /* Acquire an initial state and return it.
974 We must select appropriate initial state depending on the context,
975 since initial states may have constraints like "\<", "^", etc.. */
977 static __inline__ re_dfastate_t *
978 __attribute ((always_inline)) internal_function
979 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
982 const re_dfa_t *const dfa = mctx->dfa;
983 if (dfa->init_state->has_constraint)
985 unsigned int context;
986 context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
987 if (IS_WORD_CONTEXT (context))
988 return dfa->init_state_word;
989 else if (IS_ORDINARY_CONTEXT (context))
990 return dfa->init_state;
991 else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
992 return dfa->init_state_begbuf;
993 else if (IS_NEWLINE_CONTEXT (context))
994 return dfa->init_state_nl;
995 else if (IS_BEGBUF_CONTEXT (context))
997 /* It is relatively rare case, then calculate on demand. */
998 return re_acquire_state_context (err, dfa,
999 dfa->init_state->entrance_nodes,
1003 /* Must not happen? */
1004 return dfa->init_state;
1007 return dfa->init_state;
1010 /* Check whether the regular expression match input string INPUT or not,
1011 and return the index where the matching end, return -1 if not match,
1012 or return -2 in case of an error.
1013 FL_LONGEST_MATCH means we want the POSIX longest matching.
1014 If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1015 next place where we may want to try matching.
1016 Note that the matcher assume that the maching starts from the current
1017 index of the buffer. */
1021 check_matching (re_match_context_t *mctx, int fl_longest_match,
1024 const re_dfa_t *const dfa = mctx->dfa;
1027 int match_last = -1;
1028 int cur_str_idx = re_string_cur_idx (&mctx->input);
1029 re_dfastate_t *cur_state;
1030 int at_init_state = p_match_first != NULL;
1031 int next_start_idx = cur_str_idx;
1034 cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1035 /* An initial state must not be NULL (invalid). */
1036 if (BE (cur_state == NULL, 0))
1038 assert (err == REG_ESPACE);
1042 if (mctx->state_log != NULL)
1044 mctx->state_log[cur_str_idx] = cur_state;
1046 /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1047 later. E.g. Processing back references. */
1048 if (BE (dfa->nbackref, 0))
1051 err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1052 if (BE (err != REG_NOERROR, 0))
1055 if (cur_state->has_backref)
1057 err = transit_state_bkref (mctx, &cur_state->nodes);
1058 if (BE (err != REG_NOERROR, 0))
1064 /* If the RE accepts NULL string. */
1065 if (BE (cur_state->halt, 0))
1067 if (!cur_state->has_constraint
1068 || check_halt_state_context (mctx, cur_state, cur_str_idx))
1070 if (!fl_longest_match)
1074 match_last = cur_str_idx;
1080 while (!re_string_eoi (&mctx->input))
1082 re_dfastate_t *old_state = cur_state;
1083 int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1085 if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1086 || (BE (next_char_idx >= mctx->input.valid_len, 0)
1087 && mctx->input.valid_len < mctx->input.len))
1089 err = extend_buffers (mctx);
1090 if (BE (err != REG_NOERROR, 0))
1092 assert (err == REG_ESPACE);
1097 cur_state = transit_state (&err, mctx, cur_state);
1098 if (mctx->state_log != NULL)
1099 cur_state = merge_state_with_log (&err, mctx, cur_state);
1101 if (cur_state == NULL)
1103 /* Reached the invalid state or an error. Try to recover a valid
1104 state using the state log, if available and if we have not
1105 already found a valid (even if not the longest) match. */
1106 if (BE (err != REG_NOERROR, 0))
1109 if (mctx->state_log == NULL
1110 || (match && !fl_longest_match)
1111 || (cur_state = find_recover_state (&err, mctx)) == NULL)
1115 if (BE (at_init_state, 0))
1117 if (old_state == cur_state)
1118 next_start_idx = next_char_idx;
1123 if (cur_state->halt)
1125 /* Reached a halt state.
1126 Check the halt state can satisfy the current context. */
1127 if (!cur_state->has_constraint
1128 || check_halt_state_context (mctx, cur_state,
1129 re_string_cur_idx (&mctx->input)))
1131 /* We found an appropriate halt state. */
1132 match_last = re_string_cur_idx (&mctx->input);
1135 /* We found a match, do not modify match_first below. */
1136 p_match_first = NULL;
1137 if (!fl_longest_match)
1144 *p_match_first += next_start_idx;
1149 /* Check NODE match the current context. */
1153 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1155 re_token_type_t type = dfa->nodes[node].type;
1156 unsigned int constraint = dfa->nodes[node].constraint;
1157 if (type != END_OF_RE)
1161 if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1166 /* Check the halt state STATE match the current context.
1167 Return 0 if not match, if the node, STATE has, is a halt node and
1168 match the context, return the node. */
1172 check_halt_state_context (const re_match_context_t *mctx,
1173 const re_dfastate_t *state, int idx)
1176 unsigned int context;
1178 assert (state->halt);
1180 context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1181 for (i = 0; i < state->nodes.nelem; ++i)
1182 if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1183 return state->nodes.elems[i];
1187 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1188 corresponding to the DFA).
1189 Return the destination node, and update EPS_VIA_NODES, return -1 in case
1194 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1195 int *pidx, int node, re_node_set *eps_via_nodes,
1196 struct re_fail_stack_t *fs)
1198 const re_dfa_t *const dfa = mctx->dfa;
1200 if (IS_EPSILON_NODE (dfa->nodes[node].type))
1202 re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1203 re_node_set *edests = &dfa->edests[node];
1205 err = re_node_set_insert (eps_via_nodes, node);
1206 if (BE (err < 0, 0))
1208 /* Pick up a valid destination, or return -1 if none is found. */
1209 for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1211 int candidate = edests->elems[i];
1212 if (!re_node_set_contains (cur_nodes, candidate))
1214 if (dest_node == -1)
1215 dest_node = candidate;
1219 /* In order to avoid infinite loop like "(a*)*", return the second
1220 epsilon-transition if the first was already considered. */
1221 if (re_node_set_contains (eps_via_nodes, dest_node))
1224 /* Otherwise, push the second epsilon-transition on the fail stack. */
1226 && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1230 /* We know we are going to exit. */
1239 re_token_type_t type = dfa->nodes[node].type;
1241 #ifdef RE_ENABLE_I18N
1242 if (dfa->nodes[node].accept_mb)
1243 naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1245 #endif /* RE_ENABLE_I18N */
1246 if (type == OP_BACK_REF)
1248 int subexp_idx = dfa->nodes[node].opr.idx + 1;
1249 naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1252 if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1256 char *buf = (char *) re_string_get_buffer (&mctx->input);
1257 if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1266 err = re_node_set_insert (eps_via_nodes, node);
1267 if (BE (err < 0, 0))
1269 dest_node = dfa->edests[node].elems[0];
1270 if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1277 || check_node_accept (mctx, dfa->nodes + node, *pidx))
1279 int dest_node = dfa->nexts[node];
1280 *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1281 if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1282 || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1285 re_node_set_empty (eps_via_nodes);
1292 static reg_errcode_t
1294 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1295 int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1298 int num = fs->num++;
1299 if (fs->num == fs->alloc)
1301 struct re_fail_stack_ent_t *new_array;
1302 new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1304 if (new_array == NULL)
1307 fs->stack = new_array;
1309 fs->stack[num].idx = str_idx;
1310 fs->stack[num].node = dest_node;
1311 fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1312 if (fs->stack[num].regs == NULL)
1314 memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1315 err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1321 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1322 regmatch_t *regs, re_node_set *eps_via_nodes)
1324 int num = --fs->num;
1326 *pidx = fs->stack[num].idx;
1327 memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1328 re_node_set_free (eps_via_nodes);
1329 re_free (fs->stack[num].regs);
1330 *eps_via_nodes = fs->stack[num].eps_via_nodes;
1331 return fs->stack[num].node;
1334 /* Set the positions where the subexpressions are starts/ends to registers
1336 Note: We assume that pmatch[0] is already set, and
1337 pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch. */
1339 static reg_errcode_t
1341 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1342 regmatch_t *pmatch, int fl_backtrack)
1344 const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1346 re_node_set eps_via_nodes;
1347 struct re_fail_stack_t *fs;
1348 struct re_fail_stack_t fs_body = { 0, 2, NULL };
1349 regmatch_t *prev_idx_match;
1350 int prev_idx_match_malloced = 0;
1353 assert (nmatch > 1);
1354 assert (mctx->state_log != NULL);
1359 fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1360 if (fs->stack == NULL)
1366 cur_node = dfa->init_node;
1367 re_node_set_init_empty (&eps_via_nodes);
1369 if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1370 prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1373 prev_idx_match = re_malloc (regmatch_t, nmatch);
1374 if (prev_idx_match == NULL)
1376 free_fail_stack_return (fs);
1379 prev_idx_match_malloced = 1;
1381 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1383 for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1385 update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1387 if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1392 for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1393 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1395 if (reg_idx == nmatch)
1397 re_node_set_free (&eps_via_nodes);
1398 if (prev_idx_match_malloced)
1399 re_free (prev_idx_match);
1400 return free_fail_stack_return (fs);
1402 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1407 re_node_set_free (&eps_via_nodes);
1408 if (prev_idx_match_malloced)
1409 re_free (prev_idx_match);
1414 /* Proceed to next node. */
1415 cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1416 &eps_via_nodes, fs);
1418 if (BE (cur_node < 0, 0))
1420 if (BE (cur_node == -2, 0))
1422 re_node_set_free (&eps_via_nodes);
1423 if (prev_idx_match_malloced)
1424 re_free (prev_idx_match);
1425 free_fail_stack_return (fs);
1429 cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1433 re_node_set_free (&eps_via_nodes);
1434 if (prev_idx_match_malloced)
1435 re_free (prev_idx_match);
1440 re_node_set_free (&eps_via_nodes);
1441 if (prev_idx_match_malloced)
1442 re_free (prev_idx_match);
1443 return free_fail_stack_return (fs);
1446 static reg_errcode_t
1448 free_fail_stack_return (struct re_fail_stack_t *fs)
1453 for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1455 re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1456 re_free (fs->stack[fs_idx].regs);
1458 re_free (fs->stack);
1465 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1466 regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1468 int type = dfa->nodes[cur_node].type;
1469 if (type == OP_OPEN_SUBEXP)
1471 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1473 /* We are at the first node of this sub expression. */
1474 if (reg_num < nmatch)
1476 pmatch[reg_num].rm_so = cur_idx;
1477 pmatch[reg_num].rm_eo = -1;
1480 else if (type == OP_CLOSE_SUBEXP)
1482 int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1483 if (reg_num < nmatch)
1485 /* We are at the last node of this sub expression. */
1486 if (pmatch[reg_num].rm_so < cur_idx)
1488 pmatch[reg_num].rm_eo = cur_idx;
1489 /* This is a non-empty match or we are not inside an optional
1490 subexpression. Accept this right away. */
1491 memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1495 if (dfa->nodes[cur_node].opt_subexp
1496 && prev_idx_match[reg_num].rm_so != -1)
1497 /* We transited through an empty match for an optional
1498 subexpression, like (a?)*, and this is not the subexp's
1499 first match. Copy back the old content of the registers
1500 so that matches of an inner subexpression are undone as
1501 well, like in ((a?))*. */
1502 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1504 /* We completed a subexpression, but it may be part of
1505 an optional one, so do not update PREV_IDX_MATCH. */
1506 pmatch[reg_num].rm_eo = cur_idx;
1512 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1513 and sift the nodes in each states according to the following rules.
1514 Updated state_log will be wrote to STATE_LOG.
1516 Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1517 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1518 If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1519 the LAST_NODE, we throw away the node `a'.
1520 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1521 string `s' and transit to `b':
1522 i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1524 ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1525 thrown away, we throw away the node `a'.
1526 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1527 i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1529 ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1530 we throw away the node `a'. */
1532 #define STATE_NODE_CONTAINS(state,node) \
1533 ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1535 static reg_errcode_t
1537 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1541 int str_idx = sctx->last_str_idx;
1542 re_node_set cur_dest;
1545 assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1548 /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1549 transit to the last_node and the last_node itself. */
1550 err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1551 if (BE (err != REG_NOERROR, 0))
1553 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1554 if (BE (err != REG_NOERROR, 0))
1557 /* Then check each states in the state_log. */
1560 /* Update counters. */
1561 null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1562 if (null_cnt > mctx->max_mb_elem_len)
1564 memset (sctx->sifted_states, '\0',
1565 sizeof (re_dfastate_t *) * str_idx);
1566 re_node_set_free (&cur_dest);
1569 re_node_set_empty (&cur_dest);
1572 if (mctx->state_log[str_idx])
1574 err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1575 if (BE (err != REG_NOERROR, 0))
1579 /* Add all the nodes which satisfy the following conditions:
1580 - It can epsilon transit to a node in CUR_DEST.
1582 And update state_log. */
1583 err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1584 if (BE (err != REG_NOERROR, 0))
1589 re_node_set_free (&cur_dest);
1593 static reg_errcode_t
1595 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1596 int str_idx, re_node_set *cur_dest)
1598 const re_dfa_t *const dfa = mctx->dfa;
1599 const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1602 /* Then build the next sifted state.
1603 We build the next sifted state on `cur_dest', and update
1604 `sifted_states[str_idx]' with `cur_dest'.
1606 `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1607 `cur_src' points the node_set of the old `state_log[str_idx]'
1608 (with the epsilon nodes pre-filtered out). */
1609 for (i = 0; i < cur_src->nelem; i++)
1611 int prev_node = cur_src->elems[i];
1616 re_token_type_t type = dfa->nodes[prev_node].type;
1617 assert (!IS_EPSILON_NODE (type));
1619 #ifdef RE_ENABLE_I18N
1620 /* If the node may accept `multi byte'. */
1621 if (dfa->nodes[prev_node].accept_mb)
1622 naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1623 str_idx, sctx->last_str_idx);
1624 #endif /* RE_ENABLE_I18N */
1626 /* We don't check backreferences here.
1627 See update_cur_sifted_state(). */
1629 && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1630 && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1631 dfa->nexts[prev_node]))
1637 if (sctx->limits.nelem)
1639 int to_idx = str_idx + naccepted;
1640 if (check_dst_limits (mctx, &sctx->limits,
1641 dfa->nexts[prev_node], to_idx,
1642 prev_node, str_idx))
1645 ret = re_node_set_insert (cur_dest, prev_node);
1646 if (BE (ret == -1, 0))
1653 /* Helper functions. */
1655 static reg_errcode_t
1657 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1659 int top = mctx->state_log_top;
1661 if (next_state_log_idx >= mctx->input.bufs_len
1662 || (next_state_log_idx >= mctx->input.valid_len
1663 && mctx->input.valid_len < mctx->input.len))
1666 err = extend_buffers (mctx);
1667 if (BE (err != REG_NOERROR, 0))
1671 if (top < next_state_log_idx)
1673 memset (mctx->state_log + top + 1, '\0',
1674 sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1675 mctx->state_log_top = next_state_log_idx;
1680 static reg_errcode_t
1682 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1683 re_dfastate_t **src, int num)
1687 for (st_idx = 0; st_idx < num; ++st_idx)
1689 if (dst[st_idx] == NULL)
1690 dst[st_idx] = src[st_idx];
1691 else if (src[st_idx] != NULL)
1693 re_node_set merged_set;
1694 err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1695 &src[st_idx]->nodes);
1696 if (BE (err != REG_NOERROR, 0))
1698 dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1699 re_node_set_free (&merged_set);
1700 if (BE (err != REG_NOERROR, 0))
1707 static reg_errcode_t
1709 update_cur_sifted_state (const re_match_context_t *mctx,
1710 re_sift_context_t *sctx, int str_idx,
1711 re_node_set *dest_nodes)
1713 const re_dfa_t *const dfa = mctx->dfa;
1714 reg_errcode_t err = REG_NOERROR;
1715 const re_node_set *candidates;
1716 candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1717 : &mctx->state_log[str_idx]->nodes);
1719 if (dest_nodes->nelem == 0)
1720 sctx->sifted_states[str_idx] = NULL;
1725 /* At first, add the nodes which can epsilon transit to a node in
1727 err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1728 if (BE (err != REG_NOERROR, 0))
1731 /* Then, check the limitations in the current sift_context. */
1732 if (sctx->limits.nelem)
1734 err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1735 mctx->bkref_ents, str_idx);
1736 if (BE (err != REG_NOERROR, 0))
1741 sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1742 if (BE (err != REG_NOERROR, 0))
1746 if (candidates && mctx->state_log[str_idx]->has_backref)
1748 err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1749 if (BE (err != REG_NOERROR, 0))
1755 static reg_errcode_t
1757 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1758 const re_node_set *candidates)
1760 reg_errcode_t err = REG_NOERROR;
1763 re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1764 if (BE (err != REG_NOERROR, 0))
1767 if (!state->inveclosure.alloc)
1769 err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1770 if (BE (err != REG_NOERROR, 0))
1772 for (i = 0; i < dest_nodes->nelem; i++)
1773 re_node_set_merge (&state->inveclosure,
1774 dfa->inveclosures + dest_nodes->elems[i]);
1776 return re_node_set_add_intersect (dest_nodes, candidates,
1777 &state->inveclosure);
1780 static reg_errcode_t
1782 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1783 const re_node_set *candidates)
1787 re_node_set *inv_eclosure = dfa->inveclosures + node;
1788 re_node_set except_nodes;
1789 re_node_set_init_empty (&except_nodes);
1790 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1792 int cur_node = inv_eclosure->elems[ecl_idx];
1793 if (cur_node == node)
1795 if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1797 int edst1 = dfa->edests[cur_node].elems[0];
1798 int edst2 = ((dfa->edests[cur_node].nelem > 1)
1799 ? dfa->edests[cur_node].elems[1] : -1);
1800 if ((!re_node_set_contains (inv_eclosure, edst1)
1801 && re_node_set_contains (dest_nodes, edst1))
1803 && !re_node_set_contains (inv_eclosure, edst2)
1804 && re_node_set_contains (dest_nodes, edst2)))
1806 err = re_node_set_add_intersect (&except_nodes, candidates,
1807 dfa->inveclosures + cur_node);
1808 if (BE (err != REG_NOERROR, 0))
1810 re_node_set_free (&except_nodes);
1816 for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1818 int cur_node = inv_eclosure->elems[ecl_idx];
1819 if (!re_node_set_contains (&except_nodes, cur_node))
1821 int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1822 re_node_set_remove_at (dest_nodes, idx);
1825 re_node_set_free (&except_nodes);
1831 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1832 int dst_node, int dst_idx, int src_node, int src_idx)
1834 const re_dfa_t *const dfa = mctx->dfa;
1835 int lim_idx, src_pos, dst_pos;
1837 int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1838 int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1839 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1842 struct re_backref_cache_entry *ent;
1843 ent = mctx->bkref_ents + limits->elems[lim_idx];
1844 subexp_idx = dfa->nodes[ent->node].opr.idx;
1846 dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1847 subexp_idx, dst_node, dst_idx,
1849 src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1850 subexp_idx, src_node, src_idx,
1854 <src> <dst> ( <subexp> )
1855 ( <subexp> ) <src> <dst>
1856 ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1857 if (src_pos == dst_pos)
1858 continue; /* This is unrelated limitation. */
1867 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1868 int subexp_idx, int from_node, int bkref_idx)
1870 const re_dfa_t *const dfa = mctx->dfa;
1871 const re_node_set *eclosures = dfa->eclosures + from_node;
1874 /* Else, we are on the boundary: examine the nodes on the epsilon
1876 for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1878 int node = eclosures->elems[node_idx];
1879 switch (dfa->nodes[node].type)
1882 if (bkref_idx != -1)
1884 struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1889 if (ent->node != node)
1892 if (subexp_idx < BITSET_WORD_BITS
1893 && !(ent->eps_reachable_subexps_map
1894 & ((bitset_word_t) 1 << subexp_idx)))
1897 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1898 OP_CLOSE_SUBEXP cases below. But, if the
1899 destination node is the same node as the source
1900 node, don't recurse because it would cause an
1901 infinite loop: a regex that exhibits this behavior
1903 dst = dfa->edests[node].elems[0];
1904 if (dst == from_node)
1908 else /* if (boundaries & 2) */
1913 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1915 if (cpos == -1 /* && (boundaries & 1) */)
1917 if (cpos == 0 && (boundaries & 2))
1920 if (subexp_idx < BITSET_WORD_BITS)
1921 ent->eps_reachable_subexps_map
1922 &= ~((bitset_word_t) 1 << subexp_idx);
1924 while (ent++->more);
1928 case OP_OPEN_SUBEXP:
1929 if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1933 case OP_CLOSE_SUBEXP:
1934 if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1943 return (boundaries & 2) ? 1 : 0;
1948 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1949 int subexp_idx, int from_node, int str_idx,
1952 struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1955 /* If we are outside the range of the subexpression, return -1 or 1. */
1956 if (str_idx < lim->subexp_from)
1959 if (lim->subexp_to < str_idx)
1962 /* If we are within the subexpression, return 0. */
1963 boundaries = (str_idx == lim->subexp_from);
1964 boundaries |= (str_idx == lim->subexp_to) << 1;
1965 if (boundaries == 0)
1968 /* Else, examine epsilon closure. */
1969 return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1970 from_node, bkref_idx);
1973 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1974 which are against limitations from DEST_NODES. */
1976 static reg_errcode_t
1978 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1979 const re_node_set *candidates, re_node_set *limits,
1980 struct re_backref_cache_entry *bkref_ents, int str_idx)
1983 int node_idx, lim_idx;
1985 for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1988 struct re_backref_cache_entry *ent;
1989 ent = bkref_ents + limits->elems[lim_idx];
1991 if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1992 continue; /* This is unrelated limitation. */
1994 subexp_idx = dfa->nodes[ent->node].opr.idx;
1995 if (ent->subexp_to == str_idx)
1999 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2001 int node = dest_nodes->elems[node_idx];
2002 re_token_type_t type = dfa->nodes[node].type;
2003 if (type == OP_OPEN_SUBEXP
2004 && subexp_idx == dfa->nodes[node].opr.idx)
2006 else if (type == OP_CLOSE_SUBEXP
2007 && subexp_idx == dfa->nodes[node].opr.idx)
2011 /* Check the limitation of the open subexpression. */
2012 /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
2015 err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2017 if (BE (err != REG_NOERROR, 0))
2021 /* Check the limitation of the close subexpression. */
2023 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2025 int node = dest_nodes->elems[node_idx];
2026 if (!re_node_set_contains (dfa->inveclosures + node,
2028 && !re_node_set_contains (dfa->eclosures + node,
2031 /* It is against this limitation.
2032 Remove it form the current sifted state. */
2033 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2035 if (BE (err != REG_NOERROR, 0))
2041 else /* (ent->subexp_to != str_idx) */
2043 for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2045 int node = dest_nodes->elems[node_idx];
2046 re_token_type_t type = dfa->nodes[node].type;
2047 if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2049 if (subexp_idx != dfa->nodes[node].opr.idx)
2051 /* It is against this limitation.
2052 Remove it form the current sifted state. */
2053 err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2055 if (BE (err != REG_NOERROR, 0))
2064 static reg_errcode_t
2066 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2067 int str_idx, const re_node_set *candidates)
2069 const re_dfa_t *const dfa = mctx->dfa;
2072 re_sift_context_t local_sctx;
2073 int first_idx = search_cur_bkref_entry (mctx, str_idx);
2075 if (first_idx == -1)
2078 local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
2080 for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2083 re_token_type_t type;
2084 struct re_backref_cache_entry *entry;
2085 node = candidates->elems[node_idx];
2086 type = dfa->nodes[node].type;
2087 /* Avoid infinite loop for the REs like "()\1+". */
2088 if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2090 if (type != OP_BACK_REF)
2093 entry = mctx->bkref_ents + first_idx;
2094 enabled_idx = first_idx;
2101 re_dfastate_t *cur_state;
2103 if (entry->node != node)
2105 subexp_len = entry->subexp_to - entry->subexp_from;
2106 to_idx = str_idx + subexp_len;
2107 dst_node = (subexp_len ? dfa->nexts[node]
2108 : dfa->edests[node].elems[0]);
2110 if (to_idx > sctx->last_str_idx
2111 || sctx->sifted_states[to_idx] == NULL
2112 || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2113 || check_dst_limits (mctx, &sctx->limits, node,
2114 str_idx, dst_node, to_idx))
2117 if (local_sctx.sifted_states == NULL)
2120 err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2121 if (BE (err != REG_NOERROR, 0))
2124 local_sctx.last_node = node;
2125 local_sctx.last_str_idx = str_idx;
2126 ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2127 if (BE (ret < 0, 0))
2132 cur_state = local_sctx.sifted_states[str_idx];
2133 err = sift_states_backward (mctx, &local_sctx);
2134 if (BE (err != REG_NOERROR, 0))
2136 if (sctx->limited_states != NULL)
2138 err = merge_state_array (dfa, sctx->limited_states,
2139 local_sctx.sifted_states,
2141 if (BE (err != REG_NOERROR, 0))
2144 local_sctx.sifted_states[str_idx] = cur_state;
2145 re_node_set_remove (&local_sctx.limits, enabled_idx);
2147 /* mctx->bkref_ents may have changed, reload the pointer. */
2148 entry = mctx->bkref_ents + enabled_idx;
2150 while (enabled_idx++, entry++->more);
2154 if (local_sctx.sifted_states != NULL)
2156 re_node_set_free (&local_sctx.limits);
2163 #ifdef RE_ENABLE_I18N
2166 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2167 int node_idx, int str_idx, int max_str_idx)
2169 const re_dfa_t *const dfa = mctx->dfa;
2171 /* Check the node can accept `multi byte'. */
2172 naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2173 if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2174 !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2175 dfa->nexts[node_idx]))
2176 /* The node can't accept the `multi byte', or the
2177 destination was already thrown away, then the node
2178 could't accept the current input `multi byte'. */
2180 /* Otherwise, it is sure that the node could accept
2181 `naccepted' bytes input. */
2184 #endif /* RE_ENABLE_I18N */
2187 /* Functions for state transition. */
2189 /* Return the next state to which the current state STATE will transit by
2190 accepting the current input byte, and update STATE_LOG if necessary.
2191 If STATE can accept a multibyte char/collating element/back reference
2192 update the destination of STATE_LOG. */
2194 static re_dfastate_t *
2196 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2197 re_dfastate_t *state)
2199 re_dfastate_t **trtable;
2202 #ifdef RE_ENABLE_I18N
2203 /* If the current state can accept multibyte. */
2204 if (BE (state->accept_mb, 0))
2206 *err = transit_state_mb (mctx, state);
2207 if (BE (*err != REG_NOERROR, 0))
2210 #endif /* RE_ENABLE_I18N */
2212 /* Then decide the next state with the single byte. */
2215 /* don't use transition table */
2216 return transit_state_sb (err, mctx, state);
2219 /* Use transition table */
2220 ch = re_string_fetch_byte (&mctx->input);
2223 trtable = state->trtable;
2224 if (BE (trtable != NULL, 1))
2227 trtable = state->word_trtable;
2228 if (BE (trtable != NULL, 1))
2230 unsigned int context;
2232 = re_string_context_at (&mctx->input,
2233 re_string_cur_idx (&mctx->input) - 1,
2235 if (IS_WORD_CONTEXT (context))
2236 return trtable[ch + SBC_MAX];
2241 if (!build_trtable (mctx->dfa, state))
2247 /* Retry, we now have a transition table. */
2251 /* Update the state_log if we need */
2254 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2255 re_dfastate_t *next_state)
2257 const re_dfa_t *const dfa = mctx->dfa;
2258 int cur_idx = re_string_cur_idx (&mctx->input);
2260 if (cur_idx > mctx->state_log_top)
2262 mctx->state_log[cur_idx] = next_state;
2263 mctx->state_log_top = cur_idx;
2265 else if (mctx->state_log[cur_idx] == 0)
2267 mctx->state_log[cur_idx] = next_state;
2271 re_dfastate_t *pstate;
2272 unsigned int context;
2273 re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2274 /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2275 the destination of a multibyte char/collating element/
2276 back reference. Then the next state is the union set of
2277 these destinations and the results of the transition table. */
2278 pstate = mctx->state_log[cur_idx];
2279 log_nodes = pstate->entrance_nodes;
2280 if (next_state != NULL)
2282 table_nodes = next_state->entrance_nodes;
2283 *err = re_node_set_init_union (&next_nodes, table_nodes,
2285 if (BE (*err != REG_NOERROR, 0))
2289 next_nodes = *log_nodes;
2290 /* Note: We already add the nodes of the initial state,
2291 then we don't need to add them here. */
2293 context = re_string_context_at (&mctx->input,
2294 re_string_cur_idx (&mctx->input) - 1,
2296 next_state = mctx->state_log[cur_idx]
2297 = re_acquire_state_context (err, dfa, &next_nodes, context);
2298 /* We don't need to check errors here, since the return value of
2299 this function is next_state and ERR is already set. */
2301 if (table_nodes != NULL)
2302 re_node_set_free (&next_nodes);
2305 if (BE (dfa->nbackref, 0) && next_state != NULL)
2307 /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2308 later. We must check them here, since the back references in the
2309 next state might use them. */
2310 *err = check_subexp_matching_top (mctx, &next_state->nodes,
2312 if (BE (*err != REG_NOERROR, 0))
2315 /* If the next state has back references. */
2316 if (next_state->has_backref)
2318 *err = transit_state_bkref (mctx, &next_state->nodes);
2319 if (BE (*err != REG_NOERROR, 0))
2321 next_state = mctx->state_log[cur_idx];
2328 /* Skip bytes in the input that correspond to part of a
2329 multi-byte match, then look in the log for a state
2330 from which to restart matching. */
2333 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2335 re_dfastate_t *cur_state;
2338 int max = mctx->state_log_top;
2339 int cur_str_idx = re_string_cur_idx (&mctx->input);
2343 if (++cur_str_idx > max)
2345 re_string_skip_bytes (&mctx->input, 1);
2347 while (mctx->state_log[cur_str_idx] == NULL);
2349 cur_state = merge_state_with_log (err, mctx, NULL);
2351 while (*err == REG_NOERROR && cur_state == NULL);
2355 /* Helper functions for transit_state. */
2357 /* From the node set CUR_NODES, pick up the nodes whose types are
2358 OP_OPEN_SUBEXP and which have corresponding back references in the regular
2359 expression. And register them to use them later for evaluating the
2360 correspoding back references. */
2362 static reg_errcode_t
2364 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2367 const re_dfa_t *const dfa = mctx->dfa;
2371 /* TODO: This isn't efficient.
2372 Because there might be more than one nodes whose types are
2373 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2376 for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2378 int node = cur_nodes->elems[node_idx];
2379 if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2380 && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2381 && (dfa->used_bkref_map
2382 & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2384 err = match_ctx_add_subtop (mctx, node, str_idx);
2385 if (BE (err != REG_NOERROR, 0))
2393 /* Return the next state to which the current state STATE will transit by
2394 accepting the current input byte. */
2396 static re_dfastate_t *
2397 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2398 re_dfastate_t *state)
2400 const re_dfa_t *const dfa = mctx->dfa;
2401 re_node_set next_nodes;
2402 re_dfastate_t *next_state;
2403 int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2404 unsigned int context;
2406 *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2407 if (BE (*err != REG_NOERROR, 0))
2409 for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2411 int cur_node = state->nodes.elems[node_cnt];
2412 if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2414 *err = re_node_set_merge (&next_nodes,
2415 dfa->eclosures + dfa->nexts[cur_node]);
2416 if (BE (*err != REG_NOERROR, 0))
2418 re_node_set_free (&next_nodes);
2423 context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2424 next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2425 /* We don't need to check errors here, since the return value of
2426 this function is next_state and ERR is already set. */
2428 re_node_set_free (&next_nodes);
2429 re_string_skip_bytes (&mctx->input, 1);
2434 #ifdef RE_ENABLE_I18N
2435 static reg_errcode_t
2437 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2439 const re_dfa_t *const dfa = mctx->dfa;
2443 for (i = 0; i < pstate->nodes.nelem; ++i)
2445 re_node_set dest_nodes, *new_nodes;
2446 int cur_node_idx = pstate->nodes.elems[i];
2447 int naccepted, dest_idx;
2448 unsigned int context;
2449 re_dfastate_t *dest_state;
2451 if (!dfa->nodes[cur_node_idx].accept_mb)
2454 if (dfa->nodes[cur_node_idx].constraint)
2456 context = re_string_context_at (&mctx->input,
2457 re_string_cur_idx (&mctx->input),
2459 if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2464 /* How many bytes the node can accept? */
2465 naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2466 re_string_cur_idx (&mctx->input));
2470 /* The node can accepts `naccepted' bytes. */
2471 dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2472 mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2473 : mctx->max_mb_elem_len);
2474 err = clean_state_log_if_needed (mctx, dest_idx);
2475 if (BE (err != REG_NOERROR, 0))
2478 assert (dfa->nexts[cur_node_idx] != -1);
2480 new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2482 dest_state = mctx->state_log[dest_idx];
2483 if (dest_state == NULL)
2484 dest_nodes = *new_nodes;
2487 err = re_node_set_init_union (&dest_nodes,
2488 dest_state->entrance_nodes, new_nodes);
2489 if (BE (err != REG_NOERROR, 0))
2492 context = re_string_context_at (&mctx->input, dest_idx - 1,
2494 mctx->state_log[dest_idx]
2495 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2496 if (dest_state != NULL)
2497 re_node_set_free (&dest_nodes);
2498 if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2503 #endif /* RE_ENABLE_I18N */
2505 static reg_errcode_t
2507 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2509 const re_dfa_t *const dfa = mctx->dfa;
2512 int cur_str_idx = re_string_cur_idx (&mctx->input);
2514 for (i = 0; i < nodes->nelem; ++i)
2516 int dest_str_idx, prev_nelem, bkc_idx;
2517 int node_idx = nodes->elems[i];
2518 unsigned int context;
2519 const re_token_t *node = dfa->nodes + node_idx;
2520 re_node_set *new_dest_nodes;
2522 /* Check whether `node' is a backreference or not. */
2523 if (node->type != OP_BACK_REF)
2526 if (node->constraint)
2528 context = re_string_context_at (&mctx->input, cur_str_idx,
2530 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2534 /* `node' is a backreference.
2535 Check the substring which the substring matched. */
2536 bkc_idx = mctx->nbkref_ents;
2537 err = get_subexp (mctx, node_idx, cur_str_idx);
2538 if (BE (err != REG_NOERROR, 0))
2541 /* And add the epsilon closures (which is `new_dest_nodes') of
2542 the backreference to appropriate state_log. */
2544 assert (dfa->nexts[node_idx] != -1);
2546 for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2549 re_dfastate_t *dest_state;
2550 struct re_backref_cache_entry *bkref_ent;
2551 bkref_ent = mctx->bkref_ents + bkc_idx;
2552 if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2554 subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2555 new_dest_nodes = (subexp_len == 0
2556 ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2557 : dfa->eclosures + dfa->nexts[node_idx]);
2558 dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2559 - bkref_ent->subexp_from);
2560 context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2562 dest_state = mctx->state_log[dest_str_idx];
2563 prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2564 : mctx->state_log[cur_str_idx]->nodes.nelem);
2565 /* Add `new_dest_node' to state_log. */
2566 if (dest_state == NULL)
2568 mctx->state_log[dest_str_idx]
2569 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2571 if (BE (mctx->state_log[dest_str_idx] == NULL
2572 && err != REG_NOERROR, 0))
2577 re_node_set dest_nodes;
2578 err = re_node_set_init_union (&dest_nodes,
2579 dest_state->entrance_nodes,
2581 if (BE (err != REG_NOERROR, 0))
2583 re_node_set_free (&dest_nodes);
2586 mctx->state_log[dest_str_idx]
2587 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2588 re_node_set_free (&dest_nodes);
2589 if (BE (mctx->state_log[dest_str_idx] == NULL
2590 && err != REG_NOERROR, 0))
2593 /* We need to check recursively if the backreference can epsilon
2596 && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2598 err = check_subexp_matching_top (mctx, new_dest_nodes,
2600 if (BE (err != REG_NOERROR, 0))
2602 err = transit_state_bkref (mctx, new_dest_nodes);
2603 if (BE (err != REG_NOERROR, 0))
2613 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2614 at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2615 Note that we might collect inappropriate candidates here.
2616 However, the cost of checking them strictly here is too high, then we
2617 delay these checking for prune_impossible_nodes(). */
2619 static reg_errcode_t
2621 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2623 const re_dfa_t *const dfa = mctx->dfa;
2624 int subexp_num, sub_top_idx;
2625 const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2626 /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2627 int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2628 if (cache_idx != -1)
2630 const struct re_backref_cache_entry *entry
2631 = mctx->bkref_ents + cache_idx;
2633 if (entry->node == bkref_node)
2634 return REG_NOERROR; /* We already checked it. */
2635 while (entry++->more);
2638 subexp_num = dfa->nodes[bkref_node].opr.idx;
2640 /* For each sub expression */
2641 for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2644 re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2645 re_sub_match_last_t *sub_last;
2646 int sub_last_idx, sl_str, bkref_str_off;
2648 if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2649 continue; /* It isn't related. */
2651 sl_str = sub_top->str_idx;
2652 bkref_str_off = bkref_str_idx;
2653 /* At first, check the last node of sub expressions we already
2655 for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2658 sub_last = sub_top->lasts[sub_last_idx];
2659 sl_str_diff = sub_last->str_idx - sl_str;
2660 /* The matched string by the sub expression match with the substring
2661 at the back reference? */
2662 if (sl_str_diff > 0)
2664 if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2666 /* Not enough chars for a successful match. */
2667 if (bkref_str_off + sl_str_diff > mctx->input.len)
2670 err = clean_state_log_if_needed (mctx,
2673 if (BE (err != REG_NOERROR, 0))
2675 buf = (const char *) re_string_get_buffer (&mctx->input);
2677 if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2678 /* We don't need to search this sub expression any more. */
2681 bkref_str_off += sl_str_diff;
2682 sl_str += sl_str_diff;
2683 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2686 /* Reload buf, since the preceding call might have reallocated
2688 buf = (const char *) re_string_get_buffer (&mctx->input);
2690 if (err == REG_NOMATCH)
2692 if (BE (err != REG_NOERROR, 0))
2696 if (sub_last_idx < sub_top->nlasts)
2698 if (sub_last_idx > 0)
2700 /* Then, search for the other last nodes of the sub expression. */
2701 for (; sl_str <= bkref_str_idx; ++sl_str)
2703 int cls_node, sl_str_off;
2704 const re_node_set *nodes;
2705 sl_str_off = sl_str - sub_top->str_idx;
2706 /* The matched string by the sub expression match with the substring
2707 at the back reference? */
2710 if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2712 /* If we are at the end of the input, we cannot match. */
2713 if (bkref_str_off >= mctx->input.len)
2716 err = extend_buffers (mctx);
2717 if (BE (err != REG_NOERROR, 0))
2720 buf = (const char *) re_string_get_buffer (&mctx->input);
2722 if (buf [bkref_str_off++] != buf[sl_str - 1])
2723 break; /* We don't need to search this sub expression
2726 if (mctx->state_log[sl_str] == NULL)
2728 /* Does this state have a ')' of the sub expression? */
2729 nodes = &mctx->state_log[sl_str]->nodes;
2730 cls_node = find_subexp_node (dfa, nodes, subexp_num,
2734 if (sub_top->path == NULL)
2736 sub_top->path = calloc (sizeof (state_array_t),
2737 sl_str - sub_top->str_idx + 1);
2738 if (sub_top->path == NULL)
2741 /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2742 in the current context? */
2743 err = check_arrival (mctx, sub_top->path, sub_top->node,
2744 sub_top->str_idx, cls_node, sl_str,
2746 if (err == REG_NOMATCH)
2748 if (BE (err != REG_NOERROR, 0))
2750 sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2751 if (BE (sub_last == NULL, 0))
2753 err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2755 if (err == REG_NOMATCH)
2762 /* Helper functions for get_subexp(). */
2764 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2765 If it can arrive, register the sub expression expressed with SUB_TOP
2768 static reg_errcode_t
2770 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2771 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2775 /* Can the subexpression arrive the back reference? */
2776 err = check_arrival (mctx, &sub_last->path, sub_last->node,
2777 sub_last->str_idx, bkref_node, bkref_str,
2779 if (err != REG_NOERROR)
2781 err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2783 if (BE (err != REG_NOERROR, 0))
2785 to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2786 return clean_state_log_if_needed (mctx, to_idx);
2789 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2790 Search '(' if FL_OPEN, or search ')' otherwise.
2791 TODO: This function isn't efficient...
2792 Because there might be more than one nodes whose types are
2793 OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2799 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2800 int subexp_idx, int type)
2803 for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2805 int cls_node = nodes->elems[cls_idx];
2806 const re_token_t *node = dfa->nodes + cls_node;
2807 if (node->type == type
2808 && node->opr.idx == subexp_idx)
2814 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2815 LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2817 Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2819 static reg_errcode_t
2821 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2822 int top_str, int last_node, int last_str, int type)
2824 const re_dfa_t *const dfa = mctx->dfa;
2825 reg_errcode_t err = REG_NOERROR;
2826 int subexp_num, backup_cur_idx, str_idx, null_cnt;
2827 re_dfastate_t *cur_state = NULL;
2828 re_node_set *cur_nodes, next_nodes;
2829 re_dfastate_t **backup_state_log;
2830 unsigned int context;
2832 subexp_num = dfa->nodes[top_node].opr.idx;
2833 /* Extend the buffer if we need. */
2834 if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2836 re_dfastate_t **new_array;
2837 int old_alloc = path->alloc;
2838 path->alloc += last_str + mctx->max_mb_elem_len + 1;
2839 new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2840 if (BE (new_array == NULL, 0))
2842 path->alloc = old_alloc;
2845 path->array = new_array;
2846 memset (new_array + old_alloc, '\0',
2847 sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2850 str_idx = path->next_idx ?: top_str;
2852 /* Temporary modify MCTX. */
2853 backup_state_log = mctx->state_log;
2854 backup_cur_idx = mctx->input.cur_idx;
2855 mctx->state_log = path->array;
2856 mctx->input.cur_idx = str_idx;
2858 /* Setup initial node set. */
2859 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2860 if (str_idx == top_str)
2862 err = re_node_set_init_1 (&next_nodes, top_node);
2863 if (BE (err != REG_NOERROR, 0))
2865 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2866 if (BE (err != REG_NOERROR, 0))
2868 re_node_set_free (&next_nodes);
2874 cur_state = mctx->state_log[str_idx];
2875 if (cur_state && cur_state->has_backref)
2877 err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2878 if (BE (err != REG_NOERROR, 0))
2882 re_node_set_init_empty (&next_nodes);
2884 if (str_idx == top_str || (cur_state && cur_state->has_backref))
2886 if (next_nodes.nelem)
2888 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2890 if (BE (err != REG_NOERROR, 0))
2892 re_node_set_free (&next_nodes);
2896 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2897 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2899 re_node_set_free (&next_nodes);
2902 mctx->state_log[str_idx] = cur_state;
2905 for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2907 re_node_set_empty (&next_nodes);
2908 if (mctx->state_log[str_idx + 1])
2910 err = re_node_set_merge (&next_nodes,
2911 &mctx->state_log[str_idx + 1]->nodes);
2912 if (BE (err != REG_NOERROR, 0))
2914 re_node_set_free (&next_nodes);
2920 err = check_arrival_add_next_nodes (mctx, str_idx,
2921 &cur_state->non_eps_nodes,
2923 if (BE (err != REG_NOERROR, 0))
2925 re_node_set_free (&next_nodes);
2930 if (next_nodes.nelem)
2932 err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2933 if (BE (err != REG_NOERROR, 0))
2935 re_node_set_free (&next_nodes);
2938 err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2940 if (BE (err != REG_NOERROR, 0))
2942 re_node_set_free (&next_nodes);
2946 context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2947 cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2948 if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2950 re_node_set_free (&next_nodes);
2953 mctx->state_log[str_idx] = cur_state;
2954 null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2956 re_node_set_free (&next_nodes);
2957 cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2958 : &mctx->state_log[last_str]->nodes);
2959 path->next_idx = str_idx;
2962 mctx->state_log = backup_state_log;
2963 mctx->input.cur_idx = backup_cur_idx;
2965 /* Then check the current node set has the node LAST_NODE. */
2966 if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2972 /* Helper functions for check_arrival. */
2974 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2976 TODO: This function is similar to the functions transit_state*(),
2977 however this function has many additional works.
2978 Can't we unify them? */
2980 static reg_errcode_t
2982 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2983 re_node_set *cur_nodes, re_node_set *next_nodes)
2985 const re_dfa_t *const dfa = mctx->dfa;
2988 #ifdef RE_ENABLE_I18N
2989 reg_errcode_t err = REG_NOERROR;
2991 re_node_set union_set;
2992 re_node_set_init_empty (&union_set);
2993 for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2996 int cur_node = cur_nodes->elems[cur_idx];
2998 re_token_type_t type = dfa->nodes[cur_node].type;
2999 assert (!IS_EPSILON_NODE (type));
3001 #ifdef RE_ENABLE_I18N
3002 /* If the node may accept `multi byte'. */
3003 if (dfa->nodes[cur_node].accept_mb)
3005 naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3009 re_dfastate_t *dest_state;
3010 int next_node = dfa->nexts[cur_node];
3011 int next_idx = str_idx + naccepted;
3012 dest_state = mctx->state_log[next_idx];
3013 re_node_set_empty (&union_set);
3016 err = re_node_set_merge (&union_set, &dest_state->nodes);
3017 if (BE (err != REG_NOERROR, 0))
3019 re_node_set_free (&union_set);
3023 result = re_node_set_insert (&union_set, next_node);
3024 if (BE (result < 0, 0))
3026 re_node_set_free (&union_set);
3029 mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3031 if (BE (mctx->state_log[next_idx] == NULL
3032 && err != REG_NOERROR, 0))
3034 re_node_set_free (&union_set);
3039 #endif /* RE_ENABLE_I18N */
3041 || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3043 result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3044 if (BE (result < 0, 0))
3046 re_node_set_free (&union_set);
3051 re_node_set_free (&union_set);
3055 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3056 CUR_NODES, however exclude the nodes which are:
3057 - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3058 - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3061 static reg_errcode_t
3063 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3064 int ex_subexp, int type)
3067 int idx, outside_node;
3068 re_node_set new_nodes;
3070 assert (cur_nodes->nelem);
3072 err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3073 if (BE (err != REG_NOERROR, 0))
3075 /* Create a new node set NEW_NODES with the nodes which are epsilon
3076 closures of the node in CUR_NODES. */
3078 for (idx = 0; idx < cur_nodes->nelem; ++idx)
3080 int cur_node = cur_nodes->elems[idx];
3081 const re_node_set *eclosure = dfa->eclosures + cur_node;
3082 outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3083 if (outside_node == -1)
3085 /* There are no problematic nodes, just merge them. */
3086 err = re_node_set_merge (&new_nodes, eclosure);
3087 if (BE (err != REG_NOERROR, 0))
3089 re_node_set_free (&new_nodes);
3095 /* There are problematic nodes, re-calculate incrementally. */
3096 err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3098 if (BE (err != REG_NOERROR, 0))
3100 re_node_set_free (&new_nodes);
3105 re_node_set_free (cur_nodes);
3106 *cur_nodes = new_nodes;
3110 /* Helper function for check_arrival_expand_ecl.
3111 Check incrementally the epsilon closure of TARGET, and if it isn't
3112 problematic append it to DST_NODES. */
3114 static reg_errcode_t
3116 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3117 int target, int ex_subexp, int type)
3120 for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3124 if (dfa->nodes[cur_node].type == type
3125 && dfa->nodes[cur_node].opr.idx == ex_subexp)
3127 if (type == OP_CLOSE_SUBEXP)
3129 err = re_node_set_insert (dst_nodes, cur_node);
3130 if (BE (err == -1, 0))
3135 err = re_node_set_insert (dst_nodes, cur_node);
3136 if (BE (err == -1, 0))
3138 if (dfa->edests[cur_node].nelem == 0)
3140 if (dfa->edests[cur_node].nelem == 2)
3142 err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3143 dfa->edests[cur_node].elems[1],
3145 if (BE (err != REG_NOERROR, 0))
3148 cur_node = dfa->edests[cur_node].elems[0];
3154 /* For all the back references in the current state, calculate the
3155 destination of the back references by the appropriate entry
3156 in MCTX->BKREF_ENTS. */
3158 static reg_errcode_t
3160 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3161 int cur_str, int subexp_num, int type)
3163 const re_dfa_t *const dfa = mctx->dfa;
3165 int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3166 struct re_backref_cache_entry *ent;
3168 if (cache_idx_start == -1)
3172 ent = mctx->bkref_ents + cache_idx_start;
3175 int to_idx, next_node;
3177 /* Is this entry ENT is appropriate? */
3178 if (!re_node_set_contains (cur_nodes, ent->node))
3181 to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3182 /* Calculate the destination of the back reference, and append it
3183 to MCTX->STATE_LOG. */
3184 if (to_idx == cur_str)
3186 /* The backreference did epsilon transit, we must re-check all the
3187 node in the current state. */
3188 re_node_set new_dests;
3189 reg_errcode_t err2, err3;
3190 next_node = dfa->edests[ent->node].elems[0];
3191 if (re_node_set_contains (cur_nodes, next_node))
3193 err = re_node_set_init_1 (&new_dests, next_node);
3194 err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3195 err3 = re_node_set_merge (cur_nodes, &new_dests);
3196 re_node_set_free (&new_dests);
3197 if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3198 || err3 != REG_NOERROR, 0))
3200 err = (err != REG_NOERROR ? err
3201 : (err2 != REG_NOERROR ? err2 : err3));
3204 /* TODO: It is still inefficient... */
3209 re_node_set union_set;
3210 next_node = dfa->nexts[ent->node];
3211 if (mctx->state_log[to_idx])
3214 if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3217 err = re_node_set_init_copy (&union_set,
3218 &mctx->state_log[to_idx]->nodes);
3219 ret = re_node_set_insert (&union_set, next_node);
3220 if (BE (err != REG_NOERROR || ret < 0, 0))
3222 re_node_set_free (&union_set);
3223 err = err != REG_NOERROR ? err : REG_ESPACE;
3229 err = re_node_set_init_1 (&union_set, next_node);
3230 if (BE (err != REG_NOERROR, 0))
3233 mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3234 re_node_set_free (&union_set);
3235 if (BE (mctx->state_log[to_idx] == NULL
3236 && err != REG_NOERROR, 0))
3240 while (ent++->more);
3244 /* Build transition table for the state.
3245 Return 1 if succeeded, otherwise return NULL. */
3249 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3252 int i, j, ch, need_word_trtable = 0;
3253 bitset_word_t elem, mask;
3254 bool dests_node_malloced = false;
3255 bool dest_states_malloced = false;
3256 int ndests; /* Number of the destination states from `state'. */
3257 re_dfastate_t **trtable;
3258 re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3259 re_node_set follows, *dests_node;
3261 bitset_t acceptable;
3265 re_node_set dests_node[SBC_MAX];
3266 bitset_t dests_ch[SBC_MAX];
3269 /* We build DFA states which corresponds to the destination nodes
3270 from `state'. `dests_node[i]' represents the nodes which i-th
3271 destination state contains, and `dests_ch[i]' represents the
3272 characters which i-th destination state accepts. */
3273 if (__libc_use_alloca (sizeof (struct dests_alloc)))
3274 dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3277 dests_alloc = re_malloc (struct dests_alloc, 1);
3278 if (BE (dests_alloc == NULL, 0))
3280 dests_node_malloced = true;
3282 dests_node = dests_alloc->dests_node;
3283 dests_ch = dests_alloc->dests_ch;
3285 /* Initialize transiton table. */
3286 state->word_trtable = state->trtable = NULL;
3288 /* At first, group all nodes belonging to `state' into several
3290 ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3291 if (BE (ndests <= 0, 0))
3293 if (dests_node_malloced)
3295 /* Return 0 in case of an error, 1 otherwise. */
3298 state->trtable = (re_dfastate_t **)
3299 calloc (sizeof (re_dfastate_t *), SBC_MAX);
3305 err = re_node_set_alloc (&follows, ndests + 1);
3306 if (BE (err != REG_NOERROR, 0))
3309 if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3310 + ndests * 3 * sizeof (re_dfastate_t *)))
3311 dest_states = (re_dfastate_t **)
3312 alloca (ndests * 3 * sizeof (re_dfastate_t *));
3315 dest_states = (re_dfastate_t **)
3316 malloc (ndests * 3 * sizeof (re_dfastate_t *));
3317 if (BE (dest_states == NULL, 0))
3320 if (dest_states_malloced)
3322 re_node_set_free (&follows);
3323 for (i = 0; i < ndests; ++i)
3324 re_node_set_free (dests_node + i);
3325 if (dests_node_malloced)
3329 dest_states_malloced = true;
3331 dest_states_word = dest_states + ndests;
3332 dest_states_nl = dest_states_word + ndests;
3333 bitset_empty (acceptable);
3335 /* Then build the states for all destinations. */
3336 for (i = 0; i < ndests; ++i)
3339 re_node_set_empty (&follows);
3340 /* Merge the follows of this destination states. */
3341 for (j = 0; j < dests_node[i].nelem; ++j)
3343 next_node = dfa->nexts[dests_node[i].elems[j]];
3344 if (next_node != -1)
3346 err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3347 if (BE (err != REG_NOERROR, 0))
3351 dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3352 if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3354 /* If the new state has context constraint,
3355 build appropriate states for these contexts. */
3356 if (dest_states[i]->has_constraint)
3358 dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3360 if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3363 if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3364 need_word_trtable = 1;
3366 dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3368 if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3373 dest_states_word[i] = dest_states[i];
3374 dest_states_nl[i] = dest_states[i];
3376 bitset_merge (acceptable, dests_ch[i]);
3379 if (!BE (need_word_trtable, 0))
3381 /* We don't care about whether the following character is a word
3382 character, or we are in a single-byte character set so we can
3383 discern by looking at the character code: allocate a
3384 256-entry transition table. */
3385 trtable = state->trtable = calloc (sizeof (re_dfastate_t *), SBC_MAX);
3386 if (BE (trtable == NULL, 0))
3389 /* For all characters ch...: */
3390 for (i = 0; i < BITSET_WORDS; ++i)
3391 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3393 mask <<= 1, elem >>= 1, ++ch)
3394 if (BE (elem & 1, 0))
3396 /* There must be exactly one destination which accepts
3397 character ch. See group_nodes_into_DFAstates. */
3398 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3401 /* j-th destination accepts the word character ch. */
3402 if (dfa->word_char[i] & mask)
3403 trtable[ch] = dest_states_word[j];
3405 trtable[ch] = dest_states[j];
3410 /* We care about whether the following character is a word
3411 character, and we are in a multi-byte character set: discern
3412 by looking at the character code: build two 256-entry
3413 transition tables, one starting at trtable[0] and one
3414 starting at trtable[SBC_MAX]. */
3415 trtable = state->word_trtable = calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3416 if (BE (trtable == NULL, 0))
3419 /* For all characters ch...: */
3420 for (i = 0; i < BITSET_WORDS; ++i)
3421 for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3423 mask <<= 1, elem >>= 1, ++ch)
3424 if (BE (elem & 1, 0))
3426 /* There must be exactly one destination which accepts
3427 character ch. See group_nodes_into_DFAstates. */
3428 for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3431 /* j-th destination accepts the word character ch. */
3432 trtable[ch] = dest_states[j];
3433 trtable[ch + SBC_MAX] = dest_states_word[j];
3438 if (bitset_contain (acceptable, NEWLINE_CHAR))
3440 /* The current state accepts newline character. */
3441 for (j = 0; j < ndests; ++j)
3442 if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3444 /* k-th destination accepts newline character. */
3445 trtable[NEWLINE_CHAR] = dest_states_nl[j];
3446 if (need_word_trtable)
3447 trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3448 /* There must be only one destination which accepts
3449 newline. See group_nodes_into_DFAstates. */
3454 if (dest_states_malloced)
3457 re_node_set_free (&follows);
3458 for (i = 0; i < ndests; ++i)
3459 re_node_set_free (dests_node + i);
3461 if (dests_node_malloced)
3467 /* Group all nodes belonging to STATE into several destinations.
3468 Then for all destinations, set the nodes belonging to the destination
3469 to DESTS_NODE[i] and set the characters accepted by the destination
3470 to DEST_CH[i]. This function return the number of destinations. */
3474 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3475 re_node_set *dests_node, bitset_t *dests_ch)
3480 int ndests; /* Number of the destinations from `state'. */
3481 bitset_t accepts; /* Characters a node can accept. */
3482 const re_node_set *cur_nodes = &state->nodes;
3483 bitset_empty (accepts);
3486 /* For all the nodes belonging to `state', */
3487 for (i = 0; i < cur_nodes->nelem; ++i)
3489 re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3490 re_token_type_t type = node->type;
3491 unsigned int constraint = node->constraint;
3493 /* Enumerate all single byte character this node can accept. */
3494 if (type == CHARACTER)
3495 bitset_set (accepts, node->opr.c);
3496 else if (type == SIMPLE_BRACKET)
3498 bitset_merge (accepts, node->opr.sbcset);
3500 else if (type == OP_PERIOD)
3502 #ifdef RE_ENABLE_I18N
3503 if (dfa->mb_cur_max > 1)
3504 bitset_merge (accepts, dfa->sb_char);
3507 bitset_set_all (accepts);
3508 if (!(dfa->syntax & RE_DOT_NEWLINE))
3509 bitset_clear (accepts, '\n');
3510 if (dfa->syntax & RE_DOT_NOT_NULL)
3511 bitset_clear (accepts, '\0');
3513 #ifdef RE_ENABLE_I18N
3514 else if (type == OP_UTF8_PERIOD)
3516 memset (accepts, '\xff', sizeof (bitset_t) / 2);
3517 if (!(dfa->syntax & RE_DOT_NEWLINE))
3518 bitset_clear (accepts, '\n');
3519 if (dfa->syntax & RE_DOT_NOT_NULL)
3520 bitset_clear (accepts, '\0');
3526 /* Check the `accepts' and sift the characters which are not
3527 match it the context. */
3530 if (constraint & NEXT_NEWLINE_CONSTRAINT)
3532 bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3533 bitset_empty (accepts);
3534 if (accepts_newline)
3535 bitset_set (accepts, NEWLINE_CHAR);
3539 if (constraint & NEXT_ENDBUF_CONSTRAINT)
3541 bitset_empty (accepts);
3545 if (constraint & NEXT_WORD_CONSTRAINT)
3547 bitset_word_t any_set = 0;
3548 if (type == CHARACTER && !node->word_char)
3550 bitset_empty (accepts);
3553 #ifdef RE_ENABLE_I18N
3554 if (dfa->mb_cur_max > 1)
3555 for (j = 0; j < BITSET_WORDS; ++j)
3556 any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3559 for (j = 0; j < BITSET_WORDS; ++j)
3560 any_set |= (accepts[j] &= dfa->word_char[j]);
3564 if (constraint & NEXT_NOTWORD_CONSTRAINT)
3566 bitset_word_t any_set = 0;
3567 if (type == CHARACTER && node->word_char)
3569 bitset_empty (accepts);
3572 #ifdef RE_ENABLE_I18N
3573 if (dfa->mb_cur_max > 1)
3574 for (j = 0; j < BITSET_WORDS; ++j)
3575 any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3578 for (j = 0; j < BITSET_WORDS; ++j)
3579 any_set |= (accepts[j] &= ~dfa->word_char[j]);
3585 /* Then divide `accepts' into DFA states, or create a new
3586 state. Above, we make sure that accepts is not empty. */
3587 for (j = 0; j < ndests; ++j)
3589 bitset_t intersec; /* Intersection sets, see below. */
3591 /* Flags, see below. */
3592 bitset_word_t has_intersec, not_subset, not_consumed;
3594 /* Optimization, skip if this state doesn't accept the character. */
3595 if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3598 /* Enumerate the intersection set of this state and `accepts'. */
3600 for (k = 0; k < BITSET_WORDS; ++k)
3601 has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3602 /* And skip if the intersection set is empty. */
3606 /* Then check if this state is a subset of `accepts'. */
3607 not_subset = not_consumed = 0;
3608 for (k = 0; k < BITSET_WORDS; ++k)
3610 not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3611 not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3614 /* If this state isn't a subset of `accepts', create a
3615 new group state, which has the `remains'. */
3618 bitset_copy (dests_ch[ndests], remains);
3619 bitset_copy (dests_ch[j], intersec);
3620 err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3621 if (BE (err != REG_NOERROR, 0))
3626 /* Put the position in the current group. */
3627 result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3628 if (BE (result < 0, 0))
3631 /* If all characters are consumed, go to next node. */
3635 /* Some characters remain, create a new group. */
3638 bitset_copy (dests_ch[ndests], accepts);
3639 err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3640 if (BE (err != REG_NOERROR, 0))
3643 bitset_empty (accepts);
3648 for (j = 0; j < ndests; ++j)
3649 re_node_set_free (dests_node + j);
3653 #ifdef RE_ENABLE_I18N
3654 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3655 Return the number of the bytes the node accepts.
3656 STR_IDX is the current index of the input string.
3658 This function handles the nodes which can accept one character, or
3659 one collating element like '.', '[a-z]', opposite to the other nodes
3660 can only accept one byte. */
3664 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3665 const re_string_t *input, int str_idx)
3667 const re_token_t *node = dfa->nodes + node_idx;
3668 int char_len, elem_len;
3671 if (BE (node->type == OP_UTF8_PERIOD, 0))
3673 unsigned char c = re_string_byte_at (input, str_idx), d;
3674 if (BE (c < 0xc2, 1))
3677 if (str_idx + 2 > input->len)
3680 d = re_string_byte_at (input, str_idx + 1);
3682 return (d < 0x80 || d > 0xbf) ? 0 : 2;
3686 if (c == 0xe0 && d < 0xa0)
3692 if (c == 0xf0 && d < 0x90)
3698 if (c == 0xf8 && d < 0x88)
3704 if (c == 0xfc && d < 0x84)
3710 if (str_idx + char_len > input->len)
3713 for (i = 1; i < char_len; ++i)
3715 d = re_string_byte_at (input, str_idx + i);
3716 if (d < 0x80 || d > 0xbf)
3722 char_len = re_string_char_size_at (input, str_idx);
3723 if (node->type == OP_PERIOD)
3727 /* FIXME: I don't think this if is needed, as both '\n'
3728 and '\0' are char_len == 1. */
3729 /* '.' accepts any one character except the following two cases. */
3730 if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3731 re_string_byte_at (input, str_idx) == '\n') ||
3732 ((dfa->syntax & RE_DOT_NOT_NULL) &&
3733 re_string_byte_at (input, str_idx) == '\0'))
3738 elem_len = re_string_elem_size_at (input, str_idx);
3739 if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3742 if (node->type == COMPLEX_BRACKET)
3744 const re_charset_t *cset = node->opr.mbcset;
3746 const unsigned char *pin
3747 = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3752 wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3753 ? re_string_wchar_at (input, str_idx) : 0);
3755 /* match with multibyte character? */
3756 for (i = 0; i < cset->nmbchars; ++i)
3757 if (wc == cset->mbchars[i])
3759 match_len = char_len;
3760 goto check_node_accept_bytes_match;
3762 /* match with character_class? */
3763 for (i = 0; i < cset->nchar_classes; ++i)
3765 wctype_t wt = cset->char_classes[i];
3766 if (__iswctype (wc, wt))
3768 match_len = char_len;
3769 goto check_node_accept_bytes_match;
3774 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3777 unsigned int in_collseq = 0;
3778 const int32_t *table, *indirect;
3779 const unsigned char *weights, *extra;
3780 const char *collseqwc;
3782 /* This #include defines a local function! */
3783 # include <locale/weight.h>
3785 /* match with collating_symbol? */
3786 if (cset->ncoll_syms)
3787 extra = (const unsigned char *)
3788 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3789 for (i = 0; i < cset->ncoll_syms; ++i)
3791 const unsigned char *coll_sym = extra + cset->coll_syms[i];
3792 /* Compare the length of input collating element and
3793 the length of current collating element. */
3794 if (*coll_sym != elem_len)
3796 /* Compare each bytes. */
3797 for (j = 0; j < *coll_sym; j++)
3798 if (pin[j] != coll_sym[1 + j])
3802 /* Match if every bytes is equal. */
3804 goto check_node_accept_bytes_match;
3810 if (elem_len <= char_len)
3812 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3813 in_collseq = __collseq_table_lookup (collseqwc, wc);
3816 in_collseq = find_collation_sequence_value (pin, elem_len);
3818 /* match with range expression? */
3819 for (i = 0; i < cset->nranges; ++i)
3820 if (cset->range_starts[i] <= in_collseq
3821 && in_collseq <= cset->range_ends[i])
3823 match_len = elem_len;
3824 goto check_node_accept_bytes_match;
3827 /* match with equivalence_class? */
3828 if (cset->nequiv_classes)
3830 const unsigned char *cp = pin;
3831 table = (const int32_t *)
3832 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3833 weights = (const unsigned char *)
3834 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3835 extra = (const unsigned char *)
3836 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3837 indirect = (const int32_t *)
3838 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3839 idx = findidx (&cp);
3841 for (i = 0; i < cset->nequiv_classes; ++i)
3843 int32_t equiv_class_idx = cset->equiv_classes[i];
3844 size_t weight_len = weights[idx];
3845 if (weight_len == weights[equiv_class_idx])
3848 while (cnt <= weight_len
3849 && (weights[equiv_class_idx + 1 + cnt]
3850 == weights[idx + 1 + cnt]))
3852 if (cnt > weight_len)
3854 match_len = elem_len;
3855 goto check_node_accept_bytes_match;
3864 /* match with range expression? */
3867 memset (cmp_buf, 0, sizeof(cmp_buf));
3869 for (i = 0; i < cset->nranges; ++i)
3871 cmp_buf[0] = cset->range_starts[i];
3872 cmp_buf[4] = cset->range_ends[i];
3873 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3874 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3876 match_len = char_len;
3877 goto check_node_accept_bytes_match;
3882 check_node_accept_bytes_match:
3883 if (!cset->non_match)
3887 return (elem_len > char_len) ? elem_len : char_len;
3895 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3897 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3902 /* No valid character. Match it as a single byte character. */
3903 const unsigned char *collseq = (const unsigned char *)
3904 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3905 return collseq[mbs[0]];
3912 const unsigned char *extra = (const unsigned char *)
3913 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3914 int32_t extrasize = (const unsigned char *)
3915 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3917 for (idx = 0; idx < extrasize;)
3919 int mbs_cnt, found = 0;
3920 int32_t elem_mbs_len;
3921 /* Skip the name of collating element name. */
3922 idx = idx + extra[idx] + 1;
3923 elem_mbs_len = extra[idx++];
3924 if (mbs_len == elem_mbs_len)
3926 for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3927 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3929 if (mbs_cnt == elem_mbs_len)
3930 /* Found the entry. */
3933 /* Skip the byte sequence of the collating element. */
3934 idx += elem_mbs_len;
3935 /* Adjust for the alignment. */
3936 idx = (idx + 3) & ~3;
3937 /* Skip the collation sequence value. */
3938 idx += sizeof (uint32_t);
3939 /* Skip the wide char sequence of the collating element. */
3940 idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3941 /* If we found the entry, return the sequence value. */
3943 return *(uint32_t *) (extra + idx);
3944 /* Skip the collation sequence value. */
3945 idx += sizeof (uint32_t);
3951 #endif /* RE_ENABLE_I18N */
3953 /* Check whether the node accepts the byte which is IDX-th
3954 byte of the INPUT. */
3958 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3962 ch = re_string_byte_at (&mctx->input, idx);
3966 if (node->opr.c != ch)
3970 case SIMPLE_BRACKET:
3971 if (!bitset_contain (node->opr.sbcset, ch))
3975 #ifdef RE_ENABLE_I18N
3976 case OP_UTF8_PERIOD:
3982 if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3983 || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3991 if (node->constraint)
3993 /* The node has constraints. Check whether the current context
3994 satisfies the constraints. */
3995 unsigned int context = re_string_context_at (&mctx->input, idx,
3997 if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4004 /* Extend the buffers, if the buffers have run out. */
4006 static reg_errcode_t
4008 extend_buffers (re_match_context_t *mctx)
4011 re_string_t *pstr = &mctx->input;
4013 /* Double the lengthes of the buffers. */
4014 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4015 if (BE (ret != REG_NOERROR, 0))
4018 if (mctx->state_log != NULL)
4020 /* And double the length of state_log. */
4021 /* XXX We have no indication of the size of this buffer. If this
4022 allocation fail we have no indication that the state_log array
4023 does not have the right size. */
4024 re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4025 pstr->bufs_len + 1);
4026 if (BE (new_array == NULL, 0))
4028 mctx->state_log = new_array;
4031 /* Then reconstruct the buffers. */
4034 #ifdef RE_ENABLE_I18N
4035 if (pstr->mb_cur_max > 1)
4037 ret = build_wcs_upper_buffer (pstr);
4038 if (BE (ret != REG_NOERROR, 0))
4042 #endif /* RE_ENABLE_I18N */
4043 build_upper_buffer (pstr);
4047 #ifdef RE_ENABLE_I18N
4048 if (pstr->mb_cur_max > 1)
4049 build_wcs_buffer (pstr);
4051 #endif /* RE_ENABLE_I18N */
4053 if (pstr->trans != NULL)
4054 re_string_translate_buffer (pstr);
4061 /* Functions for matching context. */
4063 /* Initialize MCTX. */
4065 static reg_errcode_t
4067 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4069 mctx->eflags = eflags;
4070 mctx->match_last = -1;
4073 mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4074 mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4075 if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4078 /* Already zero-ed by the caller.
4080 mctx->bkref_ents = NULL;
4081 mctx->nbkref_ents = 0;
4082 mctx->nsub_tops = 0; */
4083 mctx->abkref_ents = n;
4084 mctx->max_mb_elem_len = 1;
4085 mctx->asub_tops = n;
4089 /* Clean the entries which depend on the current input in MCTX.
4090 This function must be invoked when the matcher changes the start index
4091 of the input, or changes the input string. */
4095 match_ctx_clean (re_match_context_t *mctx)
4098 for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4101 re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4102 for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4104 re_sub_match_last_t *last = top->lasts[sl_idx];
4105 re_free (last->path.array);
4108 re_free (top->lasts);
4111 re_free (top->path->array);
4112 re_free (top->path);
4117 mctx->nsub_tops = 0;
4118 mctx->nbkref_ents = 0;
4121 /* Free all the memory associated with MCTX. */
4125 match_ctx_free (re_match_context_t *mctx)
4127 /* First, free all the memory associated with MCTX->SUB_TOPS. */
4128 match_ctx_clean (mctx);
4129 re_free (mctx->sub_tops);
4130 re_free (mctx->bkref_ents);
4133 /* Add a new backreference entry to MCTX.
4134 Note that we assume that caller never call this function with duplicate
4135 entry, and call with STR_IDX which isn't smaller than any existing entry.
4138 static reg_errcode_t
4140 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4143 if (mctx->nbkref_ents >= mctx->abkref_ents)
4145 struct re_backref_cache_entry* new_entry;
4146 new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4147 mctx->abkref_ents * 2);
4148 if (BE (new_entry == NULL, 0))
4150 re_free (mctx->bkref_ents);
4153 mctx->bkref_ents = new_entry;
4154 memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4155 sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4156 mctx->abkref_ents *= 2;
4158 if (mctx->nbkref_ents > 0
4159 && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4160 mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4162 mctx->bkref_ents[mctx->nbkref_ents].node = node;
4163 mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4164 mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4165 mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4167 /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4168 If bit N is clear, means that this entry won't epsilon-transition to
4169 an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression. If
4170 it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4173 A backreference does not epsilon-transition unless it is empty, so set
4174 to all zeros if FROM != TO. */
4175 mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4176 = (from == to ? ~0 : 0);
4178 mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4179 if (mctx->max_mb_elem_len < to - from)
4180 mctx->max_mb_elem_len = to - from;
4184 /* Search for the first entry which has the same str_idx, or -1 if none is
4185 found. Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
4189 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4191 int left, right, mid, last;
4192 last = right = mctx->nbkref_ents;
4193 for (left = 0; left < right;)
4195 mid = (left + right) / 2;
4196 if (mctx->bkref_ents[mid].str_idx < str_idx)
4201 if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4207 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4210 static reg_errcode_t
4212 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4215 assert (mctx->sub_tops != NULL);
4216 assert (mctx->asub_tops > 0);
4218 if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4220 int new_asub_tops = mctx->asub_tops * 2;
4221 re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4222 re_sub_match_top_t *,
4224 if (BE (new_array == NULL, 0))
4226 mctx->sub_tops = new_array;
4227 mctx->asub_tops = new_asub_tops;
4229 mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4230 if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4232 mctx->sub_tops[mctx->nsub_tops]->node = node;
4233 mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4237 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4238 at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
4240 static re_sub_match_last_t *
4242 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4244 re_sub_match_last_t *new_entry;
4245 if (BE (subtop->nlasts == subtop->alasts, 0))
4247 int new_alasts = 2 * subtop->alasts + 1;
4248 re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4249 re_sub_match_last_t *,
4251 if (BE (new_array == NULL, 0))
4253 subtop->lasts = new_array;
4254 subtop->alasts = new_alasts;
4256 new_entry = calloc (1, sizeof (re_sub_match_last_t));
4257 if (BE (new_entry != NULL, 1))
4259 subtop->lasts[subtop->nlasts] = new_entry;
4260 new_entry->node = node;
4261 new_entry->str_idx = str_idx;
4269 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4270 re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4272 sctx->sifted_states = sifted_sts;
4273 sctx->limited_states = limited_sts;
4274 sctx->last_node = last_node;
4275 sctx->last_str_idx = last_str_idx;
4276 re_node_set_init_empty (&sctx->limits);