libc/misc/regex/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  55      internal_function;
  56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                            int *p_match_first) internal_function;
  58 static int check_halt_state_context (const re_match_context_t *mctx,
  59                                      const re_dfastate_t *state, int idx)
  60      internal_function;
  61 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  62                          regmatch_t *prev_idx_match, int cur_node,
  63                          int cur_idx, int nmatch) internal_function;
  64 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  65                                       int str_idx, int dest_node, int nregs,
  66                                       regmatch_t *regs,
  67                                       re_node_set *eps_via_nodes)
  68      internal_function;
  69 static reg_errcode_t set_regs (const regex_t *preg,
  70                                const re_match_context_t *mctx,
  71                                size_t nmatch, regmatch_t *pmatch,
  72                                int fl_backtrack) internal_function;
  73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  74      internal_function;
  75
  76 #ifdef RE_ENABLE_I18N
  77 static int sift_states_iter_mb (const re_match_context_t *mctx,
  78                                 re_sift_context_t *sctx,
  79                                 int node_idx, int str_idx, int max_str_idx)
  80      internal_function;
  81 #endif
  82 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  83                                            re_sift_context_t *sctx)
  84      internal_function;
  85 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  86                                           re_sift_context_t *sctx, int str_idx,
  87                                           re_node_set *cur_dest)
  88      internal_function;
  89 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  90                                               re_sift_context_t *sctx,
  91                                               int str_idx,
  92                                               re_node_set *dest_nodes)
  93      internal_function;
  94 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  95                                             re_node_set *dest_nodes,
  96                                             const re_node_set *candidates)
  97      internal_function;
  98 static int check_dst_limits (const re_match_context_t *mctx,
  99                              re_node_set *limits,
 100                              int dst_node, int dst_idx, int src_node,
 101                              int src_idx) internal_function;
 102 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 103                                         int boundaries, int subexp_idx,
 104                                         int from_node, int bkref_idx)
 105      internal_function;
 106 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 107                                       int limit, int subexp_idx,
 108                                       int node, int str_idx,
 109                                       int bkref_idx) internal_function;
 110 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 111                                           re_node_set *dest_nodes,
 112                                           const re_node_set *candidates,
 113                                           re_node_set *limits,
 114                                           struct re_backref_cache_entry *bkref_ents,
 115                                           int str_idx) internal_function;
 116 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 117                                         re_sift_context_t *sctx,
 118                                         int str_idx, const re_node_set *candidates)
 119      internal_function;
 120 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 121                                         re_dfastate_t **dst,
 122                                         re_dfastate_t **src, int num)
 123      internal_function;
 124 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 125                                          re_match_context_t *mctx) internal_function;
 126 static re_dfastate_t *transit_state (reg_errcode_t *err,
 127                                      re_match_context_t *mctx,
 128                                      re_dfastate_t *state) internal_function;
 129 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 130                                             re_match_context_t *mctx,
 131                                             re_dfastate_t *next_state)
 132      internal_function;
 133 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 134                                                 re_node_set *cur_nodes,
 135                                                 int str_idx) internal_function;
 136 #if 0
 137 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 138                                         re_match_context_t *mctx,
 139                                         re_dfastate_t *pstate)
 140      internal_function;
 141 #endif
 142 #ifdef RE_ENABLE_I18N
 143 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 144                                        re_dfastate_t *pstate)
 145      internal_function;
 146 #endif
 147 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 148                                           const re_node_set *nodes)
 149      internal_function;
 150 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 151                                  int bkref_node, int bkref_str_idx)
 152      internal_function;
 153 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 154                                      const re_sub_match_top_t *sub_top,
 155                                      re_sub_match_last_t *sub_last,
 156                                      int bkref_node, int bkref_str)
 157      internal_function;
 158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                              int subexp_idx, int type) internal_function;
 160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                     state_array_t *path, int top_node,
 162                                     int top_str, int last_node, int last_str,
 163                                     int type) internal_function;
 164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                    int str_idx,
 166                                                    re_node_set *cur_nodes,
 167                                                    re_node_set *next_nodes)
 168      internal_function;
 169 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 170                                                re_node_set *cur_nodes,
 171                                                int ex_subexp, int type)
 172      internal_function;
 173 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 174                                                    re_node_set *dst_nodes,
 175                                                    int target, int ex_subexp,
 176                                                    int type) internal_function;
 177 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 178                                          re_node_set *cur_nodes, int cur_str,
 179                                          int subexp_num, int type)
 180      internal_function;
 181 static int build_trtable (const re_dfa_t *dfa,
 182                           re_dfastate_t *state) internal_function;
 183 #ifdef RE_ENABLE_I18N
 184 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 185                                     const re_string_t *input, int idx)
 186      internal_function;
 187 #endif
 188 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 189                                        const re_dfastate_t *state,
 190                                        re_node_set *states_node,
 191                                        bitset_t *states_ch) internal_function;
 192 static int check_node_accept (const re_match_context_t *mctx,
 193                               const re_token_t *node, int idx)
 194      internal_function;
 195 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 196      internal_function;
 197
 198 /* Entry point for POSIX code.  */
 199
 200 /* regexec searches for a given pattern, specified by PREG, in the
 201    string STRING.
 202
 203    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 204    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 205    least NMATCH elements, and we set them to the offsets of the
 206    corresponding matched substrings.
 207
 208    EFLAGS specifies `execution flags' which affect matching: if
 209    REG_NOTBOL is set, then ^ does not match at the beginning of the
 210    string; if REG_NOTEOL is set, then $ does not match at the end.
 211
 212    We return 0 if we find a match and REG_NOMATCH if not.  */
 213
 214 int
 215 regexec (const regex_t *__restrict preg, const char *__restrict string,
 216          size_t nmatch, regmatch_t pmatch[], int eflags)
 217 {
 218   reg_errcode_t err;
 219   int start, length;
 220 #ifdef __UCLIBC_HAS_THREADS__
 221   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 222 #endif
 223
 224   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 225     return REG_BADPAT;
 226
 227   if (eflags & REG_STARTEND)
 228     {
 229       start = pmatch[0].rm_so;
 230       length = pmatch[0].rm_eo;
 231     }
 232   else
 233     {
 234       start = 0;
 235       length = strlen (string);
 236     }
 237
 238   __libc_lock_lock (dfa->lock);
 239   if (preg->no_sub)
 240     err = re_search_internal (preg, string, length, start, length - start,
 241                               length, 0, NULL, eflags);
 242   else
 243     err = re_search_internal (preg, string, length, start, length - start,
 244                               length, nmatch, pmatch, eflags);
 245   __libc_lock_unlock (dfa->lock);
 246   return err != REG_NOERROR;
 247 }
 248 libc_hidden_def(regexec)
 249
 250 /* Entry points for GNU code.  */
 251
 252 /* re_match, re_search, re_match_2, re_search_2
 253
 254    The former two functions operate on STRING with length LENGTH,
 255    while the later two operate on concatenation of STRING1 and STRING2
 256    with lengths LENGTH1 and LENGTH2, respectively.
 257
 258    re_match() matches the compiled pattern in BUFP against the string,
 259    starting at index START.
 260
 261    re_search() first tries matching at index START, then it tries to match
 262    starting from index START + 1, and so on.  The last start position tried
 263    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 264    way as re_match().)
 265
 266    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 267    the first STOP characters of the concatenation of the strings should be
 268    concerned.
 269
 270    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 271    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 272    computed relative to the concatenation, not relative to the individual
 273    strings.)
 274
 275    On success, re_match* functions return the length of the match, re_search*
 276    return the position of the start of the match.  Return value -1 means no
 277    match was found and -2 indicates an internal error.  */
 278
 279 int
 280 re_match (struct re_pattern_buffer *bufp, const char *string, int length,
 281                   int start, struct re_registers *regs)
 282 {
 283   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 284 }
 285
 286 int
 287 re_search (struct re_pattern_buffer *bufp, const char *string, int length,
 288                    int start, int range, struct re_registers *regs)
 289 {
 290   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 291 }
 292 libc_hidden_def(re_search)
 293
 294 int
 295 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
 296                    const char *string2, int length2, int start,
 297                    struct re_registers *regs, int stop)
 298 {
 299   return re_search_2_stub (bufp, string1, length1, string2, length2,
 300                            start, 0, regs, stop, 1);
 301 }
 302
 303 int
 304 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int lenght1,
 305                          const char *string2, int length2, int start, int range,
 306                          struct re_registers *regs,  int stop)
 307 {
 308   return re_search_2_stub (bufp, string1, lenght1, string2, length2,
 309                            start, range, regs, stop, 0);
 310 }
 311 libc_hidden_def(re_search_2)
 312
 313 static int internal_function
 314 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
 315                                   int length1, const char *string2, int length2, int start,
 316                                   int range, struct re_registers *regs, int stop, int ret_len)
 317 {
 318   const char *str;
 319   int rval;
 320   int len = length1 + length2;
 321   int free_str = 0;
 322
 323   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 324     return -2;
 325
 326   /* Concatenate the strings.  */
 327   if (length2 > 0)
 328     if (length1 > 0)
 329       {
 330         char *s = re_malloc (char, len);
 331
 332         if (BE (s == NULL, 0))
 333           return -2;
 334         memcpy (s, string1, length1);
 335         memcpy (s + length1, string2, length2);
 336         str = s;
 337         free_str = 1;
 338       }
 339     else
 340       str = string2;
 341   else
 342     str = string1;
 343
 344   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 345                          ret_len);
 346   if (free_str)
 347     re_free ((char *) str);
 348   return rval;
 349 }
 350
 351 /* The parameters have the same meaning as those of re_search.
 352    Additional parameters:
 353    If RET_LEN is nonzero the length of the match is returned (re_match style);
 354    otherwise the position of the match is returned.  */
 355
 356 static int internal_function
 357 re_search_stub (struct re_pattern_buffer *bufp, const char *string, int length,
 358                                 int start, int range, int stop, struct re_registers *regs,
 359                                 int ret_len)
 360 {
 361   reg_errcode_t result;
 362   regmatch_t *pmatch;
 363   int nregs, rval;
 364   int eflags = 0;
 365 #ifdef __UCLIBC_HAS_THREADS__
 366   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 367 #endif
 368   /* Check for out-of-range.  */
 369   if (BE (start < 0 || start > length, 0))
 370     return -1;
 371   if (BE (start + range > length, 0))
 372     range = length - start;
 373   else if (BE (start + range < 0, 0))
 374     range = -start;
 375
 376   __libc_lock_lock (dfa->lock);
 377
 378   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 379   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 380
 381   /* Compile fastmap if we haven't yet.  */
 382   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 383     re_compile_fastmap (bufp);
 384
 385   if (BE (bufp->no_sub, 0))
 386     regs = NULL;
 387
 388   /* We need at least 1 register.  */
 389   if (regs == NULL)
 390     nregs = 1;
 391   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 392                regs->num_regs < bufp->re_nsub + 1, 0))
 393     {
 394       nregs = regs->num_regs;
 395       if (BE (nregs < 1, 0))
 396         {
 397           /* Nothing can be copied to regs.  */
 398           regs = NULL;
 399           nregs = 1;
 400         }
 401     }
 402   else
 403     nregs = bufp->re_nsub + 1;
 404   pmatch = re_malloc (regmatch_t, nregs);
 405   if (BE (pmatch == NULL, 0))
 406     {
 407       rval = -2;
 408       goto out;
 409     }
 410
 411   result = re_search_internal (bufp, string, length, start, range, stop,
 412                                nregs, pmatch, eflags);
 413
 414   rval = 0;
 415
 416   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 417   if (result != REG_NOERROR)
 418     rval = -1;
 419   else if (regs != NULL)
 420     {
 421       /* If caller wants register contents data back, copy them.  */
 422       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 423                                            bufp->regs_allocated);
 424       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 425         rval = -2;
 426     }
 427
 428   if (BE (rval == 0, 1))
 429     {
 430       if (ret_len)
 431         {
 432           assert (pmatch[0].rm_so == start);
 433           rval = pmatch[0].rm_eo - start;
 434         }
 435       else
 436         rval = pmatch[0].rm_so;
 437     }
 438   re_free (pmatch);
 439  out:
 440   __libc_lock_unlock (dfa->lock);
 441   return rval;
 442 }
 443
 444 static unsigned internal_function
 445 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
 446                           int regs_allocated)
 447 {
 448   int rval = REGS_REALLOCATE;
 449   int i;
 450   int need_regs = nregs + 1;
 451   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 452      uses.  */
 453
 454   /* Have the register data arrays been allocated?  */
 455   if (regs_allocated == REGS_UNALLOCATED)
 456     { /* No.  So allocate them with malloc.  */
 457       regs->start = re_malloc (regoff_t, need_regs);
 458       regs->end = re_malloc (regoff_t, need_regs);
 459       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 460         return REGS_UNALLOCATED;
 461       regs->num_regs = need_regs;
 462     }
 463   else if (regs_allocated == REGS_REALLOCATE)
 464     { /* Yes.  If we need more elements than were already
 465          allocated, reallocate them.  If we need fewer, just
 466          leave it alone.  */
 467       if (BE (need_regs > regs->num_regs, 0))
 468         {
 469           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 470           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 471           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 472             return REGS_UNALLOCATED;
 473           regs->start = new_start;
 474           regs->end = new_end;
 475           regs->num_regs = need_regs;
 476         }
 477     }
 478   else
 479     {
 480       assert (regs_allocated == REGS_FIXED);
 481       /* This function may not be called with REGS_FIXED and nregs too big.  */
 482       assert (regs->num_regs >= nregs);
 483       rval = REGS_FIXED;
 484     }
 485
 486   /* Copy the regs.  */
 487   for (i = 0; i < nregs; ++i)
 488     {
 489       regs->start[i] = pmatch[i].rm_so;
 490       regs->end[i] = pmatch[i].rm_eo;
 491     }
 492   for ( ; i < regs->num_regs; ++i)
 493     regs->start[i] = regs->end[i] = -1;
 494
 495   return rval;
 496 }
 497
 498 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 499    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 500    this memory for recording register information.  STARTS and ENDS
 501    must be allocated using the malloc library routine, and must each
 502    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 503
 504    If NUM_REGS == 0, then subsequent matches should allocate their own
 505    register data.
 506
 507    Unless this function is called, the first search or match using
 508    PATTERN_BUFFER will allocate its own register data, without
 509    freeing the old data.  */
 510
 511 void
 512 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 513                                   unsigned num_regs, regoff_t *starts, regoff_t *ends)
 514 {
 515   if (num_regs)
 516     {
 517       bufp->regs_allocated = REGS_REALLOCATE;
 518       regs->num_regs = num_regs;
 519       regs->start = starts;
 520       regs->end = ends;
 521     }
 522   else
 523     {
 524       bufp->regs_allocated = REGS_UNALLOCATED;
 525       regs->num_regs = 0;
 526       regs->start = regs->end = (regoff_t *) 0;
 527     }
 528 }
 529
 530 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 531    them unless specifically requested.  */
 532
 533 #if defined _REGEX_RE_COMP || defined __UCLIBC__
 534 int
 535 weak_function
 536 re_exec (const char *s)
 537 {
 538   return 0 == regexec (re_comp_buf, s, 0, NULL, 0);
 539 }
 540 #endif
 541
 542 /* Internal entry point.  */
 543
 544 /* Searches for a compiled pattern PREG in the string STRING, whose
 545    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 546    mingings with regexec.  START, and RANGE have the same meanings
 547    with re_search.
 548    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 549    otherwise return the error code.
 550    Note: We assume front end functions already check ranges.
 551    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 552 static reg_errcode_t internal_function
 553 re_search_internal (const regex_t *preg, const char *string, int length,
 554                                         int start, int range, int stop, size_t nmatch,
 555                                         regmatch_t pmatch[], int eflags)
 556 {
 557   reg_errcode_t err;
 558   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 559   int left_lim, right_lim, incr;
 560   int fl_longest_match, match_first, match_kind, match_last = -1;
 561   int extra_nmatch;
 562   int sb, ch;
 563   re_match_context_t mctx;
 564   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 565                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 566   __RE_TRANSLATE_TYPE t = preg->translate;
 567
 568   memset (&mctx, '\0', sizeof (re_match_context_t));
 569   mctx.dfa = dfa;
 570
 571   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 572   nmatch -= extra_nmatch;
 573
 574   /* Check if the DFA haven't been compiled.  */
 575   if (BE (preg->used == 0 || dfa->init_state == NULL
 576           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 577           || dfa->init_state_begbuf == NULL, 0))
 578     return REG_NOMATCH;
 579
 580 #ifdef DEBUG
 581   /* We assume front-end functions already check them.  */
 582   assert (start + range >= 0 && start + range <= length);
 583 #endif
 584
 585   /* If initial states with non-begbuf contexts have no elements,
 586      the regex must be anchored.  If preg->newline_anchor is set,
 587      we'll never use init_state_nl, so do not check it.  */
 588   if (dfa->init_state->nodes.nelem == 0
 589       && dfa->init_state_word->nodes.nelem == 0
 590       && (dfa->init_state_nl->nodes.nelem == 0
 591           || !preg->newline_anchor))
 592     {
 593       if (start != 0 && start + range != 0)
 594         return REG_NOMATCH;
 595       start = range = 0;
 596     }
 597
 598   /* We must check the longest matching, if nmatch > 0.  */
 599   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 600
 601   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 602                             preg->translate, preg->syntax & RE_ICASE, dfa);
 603   if (BE (err != REG_NOERROR, 0))
 604     goto free_return;
 605   mctx.input.stop = stop;
 606   mctx.input.raw_stop = stop;
 607   mctx.input.newline_anchor = preg->newline_anchor;
 608
 609   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 610   if (BE (err != REG_NOERROR, 0))
 611     goto free_return;
 612
 613   /* We will log all the DFA states through which the dfa pass,
 614      if nmatch > 1, or this dfa has "multibyte node", which is a
 615      back-reference or a node which can accept multibyte character or
 616      multi character collating element.  */
 617   if (nmatch > 1 || dfa->has_mb_node)
 618     {
 619       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 620       if (BE (mctx.state_log == NULL, 0))
 621         {
 622           err = REG_ESPACE;
 623           goto free_return;
 624         }
 625     }
 626   else
 627     mctx.state_log = NULL;
 628
 629   match_first = start;
 630   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 631                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 632
 633   /* Check incrementally whether of not the input string match.  */
 634   incr = (range < 0) ? -1 : 1;
 635   left_lim = (range < 0) ? start + range : start;
 636   right_lim = (range < 0) ? start : start + range;
 637   sb = dfa->mb_cur_max == 1;
 638   match_kind =
 639     (fastmap
 640      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 641         | (range >= 0 ? 2 : 0)
 642         | (t != NULL ? 1 : 0))
 643      : 8);
 644
 645   for (;; match_first += incr)
 646     {
 647       err = REG_NOMATCH;
 648       if (match_first < left_lim || right_lim < match_first)
 649         goto free_return;
 650
 651       /* Advance as rapidly as possible through the string, until we
 652          find a plausible place to start matching.  This may be done
 653          with varying efficiency, so there are various possibilities:
 654          only the most common of them are specialized, in order to
 655          save on code size.  We use a switch statement for speed.  */
 656       switch (match_kind)
 657         {
 658         case 8:
 659           /* No fastmap.  */
 660           break;
 661
 662         case 7:
 663           /* Fastmap with single-byte translation, match forward.  */
 664           while (BE (match_first < right_lim, 1)
 665                  && !fastmap[t[(unsigned char) string[match_first]]])
 666             ++match_first;
 667           goto forward_match_found_start_or_reached_end;
 668
 669         case 6:
 670           /* Fastmap without translation, match forward.  */
 671           while (BE (match_first < right_lim, 1)
 672                  && !fastmap[(unsigned char) string[match_first]])
 673             ++match_first;
 674
 675         forward_match_found_start_or_reached_end:
 676           if (BE (match_first == right_lim, 0))
 677             {
 678               ch = match_first >= length
 679                        ? 0 : (unsigned char) string[match_first];
 680               if (!fastmap[t ? t[ch] : ch])
 681                 goto free_return;
 682             }
 683           break;
 684
 685         case 4:
 686         case 5:
 687           /* Fastmap without multi-byte translation, match backwards.  */
 688           while (match_first >= left_lim)
 689             {
 690               ch = match_first >= length
 691                        ? 0 : (unsigned char) string[match_first];
 692               if (fastmap[t ? t[ch] : ch])
 693                 break;
 694               --match_first;
 695             }
 696           if (match_first < left_lim)
 697             goto free_return;
 698           break;
 699
 700         default:
 701           /* In this case, we can't determine easily the current byte,
 702              since it might be a component byte of a multibyte
 703              character.  Then we use the constructed buffer instead.  */
 704           for (;;)
 705             {
 706               /* If MATCH_FIRST is out of the valid range, reconstruct the
 707                  buffers.  */
 708               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 709               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 710                 {
 711                   err = re_string_reconstruct (&mctx.input, match_first,
 712                                                eflags);
 713                   if (BE (err != REG_NOERROR, 0))
 714                     goto free_return;
 715
 716                   offset = match_first - mctx.input.raw_mbs_idx;
 717                 }
 718               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 719                  Note that MATCH_FIRST must not be smaller than 0.  */
 720               ch = (match_first >= length
 721                     ? 0 : re_string_byte_at (&mctx.input, offset));
 722               if (fastmap[ch])
 723                 break;
 724               match_first += incr;
 725               if (match_first < left_lim || match_first > right_lim)
 726                 {
 727                   err = REG_NOMATCH;
 728                   goto free_return;
 729                 }
 730             }
 731           break;
 732         }
 733
 734       /* Reconstruct the buffers so that the matcher can assume that
 735          the matching starts from the beginning of the buffer.  */
 736       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 737       if (BE (err != REG_NOERROR, 0))
 738         goto free_return;
 739
 740 #ifdef RE_ENABLE_I18N
 741      /* Don't consider this char as a possible match start if it part,
 742         yet isn't the head, of a multibyte character.  */
 743       if (!sb && !re_string_first_byte (&mctx.input, 0))
 744         continue;
 745 #endif
 746
 747       /* It seems to be appropriate one, then use the matcher.  */
 748       /* We assume that the matching starts from 0.  */
 749       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 750       match_last = check_matching (&mctx, fl_longest_match,
 751                                    range >= 0 ? &match_first : NULL);
 752       if (match_last != -1)
 753         {
 754           if (BE (match_last == -2, 0))
 755             {
 756               err = REG_ESPACE;
 757               goto free_return;
 758             }
 759           else
 760             {
 761               mctx.match_last = match_last;
 762               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 763                 {
 764                   re_dfastate_t *pstate = mctx.state_log[match_last];
 765                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 766                                                              match_last);
 767                 }
 768               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 769                   || dfa->nbackref)
 770                 {
 771                   err = prune_impossible_nodes (&mctx);
 772                   if (err == REG_NOERROR)
 773                     break;
 774                   if (BE (err != REG_NOMATCH, 0))
 775                     goto free_return;
 776                   match_last = -1;
 777                 }
 778               else
 779                 break; /* We found a match.  */
 780             }
 781         }
 782
 783       match_ctx_clean (&mctx);
 784     }
 785
 786 #ifdef DEBUG
 787   assert (match_last != -1);
 788   assert (err == REG_NOERROR);
 789 #endif
 790
 791   /* Set pmatch[] if we need.  */
 792   if (nmatch > 0)
 793     {
 794       int reg_idx;
 795
 796       /* Initialize registers.  */
 797       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 798         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 799
 800       /* Set the points where matching start/end.  */
 801       pmatch[0].rm_so = 0;
 802       pmatch[0].rm_eo = mctx.match_last;
 803
 804       if (!preg->no_sub && nmatch > 1)
 805         {
 806           err = set_regs (preg, &mctx, nmatch, pmatch,
 807                           dfa->has_plural_match && dfa->nbackref > 0);
 808           if (BE (err != REG_NOERROR, 0))
 809             goto free_return;
 810         }
 811
 812       /* At last, add the offset to the each registers, since we slided
 813          the buffers so that we could assume that the matching starts
 814          from 0.  */
 815       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 816         if (pmatch[reg_idx].rm_so != -1)
 817           {
 818 #ifdef RE_ENABLE_I18N
 819             if (BE (mctx.input.offsets_needed != 0, 0))
 820               {
 821                 pmatch[reg_idx].rm_so =
 822                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 823                    ? mctx.input.valid_raw_len
 824                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 825                 pmatch[reg_idx].rm_eo =
 826                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 827                    ? mctx.input.valid_raw_len
 828                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 829               }
 830 #else
 831             assert (mctx.input.offsets_needed == 0);
 832 #endif
 833             pmatch[reg_idx].rm_so += match_first;
 834             pmatch[reg_idx].rm_eo += match_first;
 835           }
 836       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 837         {
 838           pmatch[nmatch + reg_idx].rm_so = -1;
 839           pmatch[nmatch + reg_idx].rm_eo = -1;
 840         }
 841
 842       if (dfa->subexp_map)
 843         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 844           if (dfa->subexp_map[reg_idx] != reg_idx)
 845             {
 846               pmatch[reg_idx + 1].rm_so
 847                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 848               pmatch[reg_idx + 1].rm_eo
 849                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 850             }
 851     }
 852
 853  free_return:
 854   re_free (mctx.state_log);
 855   if (dfa->nbackref)
 856     match_ctx_free (&mctx);
 857   re_string_destruct (&mctx.input);
 858   return err;
 859 }
 860
 861 static reg_errcode_t internal_function
 862 prune_impossible_nodes (re_match_context_t *mctx)
 863 {
 864   const re_dfa_t *const dfa = mctx->dfa;
 865   int halt_node, match_last;
 866   reg_errcode_t ret;
 867   re_dfastate_t **sifted_states;
 868   re_dfastate_t **lim_states = NULL;
 869   re_sift_context_t sctx;
 870 #ifdef DEBUG
 871   assert (mctx->state_log != NULL);
 872 #endif
 873   match_last = mctx->match_last;
 874   halt_node = mctx->last_node;
 875   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 876   if (BE (sifted_states == NULL, 0))
 877     {
 878       ret = REG_ESPACE;
 879       goto free_return;
 880     }
 881   if (dfa->nbackref)
 882     {
 883       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 884       if (BE (lim_states == NULL, 0))
 885         {
 886           ret = REG_ESPACE;
 887           goto free_return;
 888         }
 889       while (1)
 890         {
 891           memset (lim_states, '\0',
 892                   sizeof (re_dfastate_t *) * (match_last + 1));
 893           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 894                          match_last);
 895           ret = sift_states_backward (mctx, &sctx);
 896           re_node_set_free (&sctx.limits);
 897           if (BE (ret != REG_NOERROR, 0))
 898               goto free_return;
 899           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 900             break;
 901           do
 902             {
 903               --match_last;
 904               if (match_last < 0)
 905                 {
 906                   ret = REG_NOMATCH;
 907                   goto free_return;
 908                 }
 909             } while (mctx->state_log[match_last] == NULL
 910                      || !mctx->state_log[match_last]->halt);
 911           halt_node = check_halt_state_context (mctx,
 912                                                 mctx->state_log[match_last],
 913                                                 match_last);
 914         }
 915       ret = merge_state_array (dfa, sifted_states, lim_states,
 916                                match_last + 1);
 917       re_free (lim_states);
 918       lim_states = NULL;
 919       if (BE (ret != REG_NOERROR, 0))
 920         goto free_return;
 921     }
 922   else
 923     {
 924       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 925       ret = sift_states_backward (mctx, &sctx);
 926       re_node_set_free (&sctx.limits);
 927       if (BE (ret != REG_NOERROR, 0))
 928         goto free_return;
 929     }
 930   re_free (mctx->state_log);
 931   mctx->state_log = sifted_states;
 932   sifted_states = NULL;
 933   mctx->last_node = halt_node;
 934   mctx->match_last = match_last;
 935   ret = REG_NOERROR;
 936  free_return:
 937   re_free (sifted_states);
 938   re_free (lim_states);
 939   return ret;
 940 }
 941
 942 /* Acquire an initial state and return it.
 943    We must select appropriate initial state depending on the context,
 944    since initial states may have constraints like "\<", "^", etc..  */
 945
 946 static __inline__ re_dfastate_t *
 947 __attribute ((always_inline)) internal_function
 948 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
 949                             int idx)
 950 {
 951   const re_dfa_t *const dfa = mctx->dfa;
 952   if (dfa->init_state->has_constraint)
 953     {
 954       unsigned int context;
 955       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 956       if (IS_WORD_CONTEXT (context))
 957         return dfa->init_state_word;
 958       else if (IS_ORDINARY_CONTEXT (context))
 959         return dfa->init_state;
 960       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
 961         return dfa->init_state_begbuf;
 962       else if (IS_NEWLINE_CONTEXT (context))
 963         return dfa->init_state_nl;
 964       else if (IS_BEGBUF_CONTEXT (context))
 965         {
 966           /* It is relatively rare case, then calculate on demand.  */
 967           return re_acquire_state_context (err, dfa,
 968                                            dfa->init_state->entrance_nodes,
 969                                            context);
 970         }
 971       else
 972         /* Must not happen?  */
 973         return dfa->init_state;
 974     }
 975   else
 976     return dfa->init_state;
 977 }
 978
 979 /* Check whether the regular expression match input string INPUT or not,
 980    and return the index where the matching end, return -1 if not match,
 981    or return -2 in case of an error.
 982    FL_LONGEST_MATCH means we want the POSIX longest matching.
 983    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
 984    next place where we may want to try matching.
 985    Note that the matcher assume that the maching starts from the current
 986    index of the buffer.  */
 987
 988 static int
 989 internal_function
 990 check_matching (re_match_context_t *mctx, int fl_longest_match,
 991                 int *p_match_first)
 992 {
 993   const re_dfa_t *const dfa = mctx->dfa;
 994   reg_errcode_t err;
 995   int match = 0;
 996   int match_last = -1;
 997   int cur_str_idx = re_string_cur_idx (&mctx->input);
 998   re_dfastate_t *cur_state;
 999   int at_init_state = p_match_first != NULL;
1000   int next_start_idx = cur_str_idx;
1001
1002   err = REG_NOERROR;
1003   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1004   /* An initial state must not be NULL (invalid).  */
1005   if (BE (cur_state == NULL, 0))
1006     {
1007       assert (err == REG_ESPACE);
1008       return -2;
1009     }
1010
1011   if (mctx->state_log != NULL)
1012     {
1013       mctx->state_log[cur_str_idx] = cur_state;
1014
1015       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1016          later.  E.g. Processing back references.  */
1017       if (BE (dfa->nbackref, 0))
1018         {
1019           at_init_state = 0;
1020           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1021           if (BE (err != REG_NOERROR, 0))
1022             return err;
1023
1024           if (cur_state->has_backref)
1025             {
1026               err = transit_state_bkref (mctx, &cur_state->nodes);
1027               if (BE (err != REG_NOERROR, 0))
1028                 return err;
1029             }
1030         }
1031     }
1032
1033   /* If the RE accepts NULL string.  */
1034   if (BE (cur_state->halt, 0))
1035     {
1036       if (!cur_state->has_constraint
1037           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1038         {
1039           if (!fl_longest_match)
1040             return cur_str_idx;
1041           else
1042             {
1043               match_last = cur_str_idx;
1044               match = 1;
1045             }
1046         }
1047     }
1048
1049   while (!re_string_eoi (&mctx->input))
1050     {
1051       re_dfastate_t *old_state = cur_state;
1052       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1053
1054       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1055           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1056               && mctx->input.valid_len < mctx->input.len))
1057         {
1058           err = extend_buffers (mctx);
1059           if (BE (err != REG_NOERROR, 0))
1060             {
1061               assert (err == REG_ESPACE);
1062               return -2;
1063             }
1064         }
1065
1066       cur_state = transit_state (&err, mctx, cur_state);
1067       if (mctx->state_log != NULL)
1068         cur_state = merge_state_with_log (&err, mctx, cur_state);
1069
1070       if (cur_state == NULL)
1071         {
1072           /* Reached the invalid state or an error.  Try to recover a valid
1073              state using the state log, if available and if we have not
1074              already found a valid (even if not the longest) match.  */
1075           if (BE (err != REG_NOERROR, 0))
1076             return -2;
1077
1078           if (mctx->state_log == NULL
1079               || (match && !fl_longest_match)
1080               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1081             break;
1082         }
1083
1084       if (BE (at_init_state, 0))
1085         {
1086           if (old_state == cur_state)
1087             next_start_idx = next_char_idx;
1088           else
1089             at_init_state = 0;
1090         }
1091
1092       if (cur_state->halt)
1093         {
1094           /* Reached a halt state.
1095              Check the halt state can satisfy the current context.  */
1096           if (!cur_state->has_constraint
1097               || check_halt_state_context (mctx, cur_state,
1098                                            re_string_cur_idx (&mctx->input)))
1099             {
1100               /* We found an appropriate halt state.  */
1101               match_last = re_string_cur_idx (&mctx->input);
1102               match = 1;
1103
1104               /* We found a match, do not modify match_first below.  */
1105               p_match_first = NULL;
1106               if (!fl_longest_match)
1107                 break;
1108             }
1109         }
1110     }
1111
1112   if (p_match_first)
1113     *p_match_first += next_start_idx;
1114
1115   return match_last;
1116 }
1117
1118 /* Check NODE match the current context.  */
1119
1120 static int
1121 internal_function
1122 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1123 {
1124   re_token_type_t type = dfa->nodes[node].type;
1125   unsigned int constraint = dfa->nodes[node].constraint;
1126   if (type != END_OF_RE)
1127     return 0;
1128   if (!constraint)
1129     return 1;
1130   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1131     return 0;
1132   return 1;
1133 }
1134
1135 /* Check the halt state STATE match the current context.
1136    Return 0 if not match, if the node, STATE has, is a halt node and
1137    match the context, return the node.  */
1138
1139 static int
1140 internal_function
1141 check_halt_state_context (const re_match_context_t *mctx,
1142                           const re_dfastate_t *state, int idx)
1143 {
1144   int i;
1145   unsigned int context;
1146 #ifdef DEBUG
1147   assert (state->halt);
1148 #endif
1149   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1150   for (i = 0; i < state->nodes.nelem; ++i)
1151     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1152       return state->nodes.elems[i];
1153   return 0;
1154 }
1155
1156 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1157    corresponding to the DFA).
1158    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1159    of errors.  */
1160
1161 static int
1162 internal_function
1163 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1164                    int *pidx, int node, re_node_set *eps_via_nodes,
1165                    struct re_fail_stack_t *fs)
1166 {
1167   const re_dfa_t *const dfa = mctx->dfa;
1168   int i, err;
1169   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1170     {
1171       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1172       re_node_set *edests = &dfa->edests[node];
1173       int dest_node;
1174       err = re_node_set_insert (eps_via_nodes, node);
1175       if (BE (err < 0, 0))
1176         return -2;
1177       /* Pick up a valid destination, or return -1 if none is found.  */
1178       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1179         {
1180           int candidate = edests->elems[i];
1181           if (!re_node_set_contains (cur_nodes, candidate))
1182             continue;
1183           if (dest_node == -1)
1184             dest_node = candidate;
1185
1186           else
1187             {
1188               /* In order to avoid infinite loop like "(a*)*", return the second
1189                  epsilon-transition if the first was already considered.  */
1190               if (re_node_set_contains (eps_via_nodes, dest_node))
1191                 return candidate;
1192
1193               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1194               else if (fs != NULL
1195                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1196                                            eps_via_nodes))
1197                 return -2;
1198
1199               /* We know we are going to exit.  */
1200               break;
1201             }
1202         }
1203       return dest_node;
1204     }
1205   else
1206     {
1207       int naccepted = 0;
1208       re_token_type_t type = dfa->nodes[node].type;
1209
1210 #ifdef RE_ENABLE_I18N
1211       if (dfa->nodes[node].accept_mb)
1212         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1213       else
1214 #endif /* RE_ENABLE_I18N */
1215       if (type == OP_BACK_REF)
1216         {
1217           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1218           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1219           if (fs != NULL)
1220             {
1221               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1222                 return -1;
1223               else if (naccepted)
1224                 {
1225                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1226                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1227                               naccepted) != 0)
1228                     return -1;
1229                 }
1230             }
1231
1232           if (naccepted == 0)
1233             {
1234               int dest_node;
1235               err = re_node_set_insert (eps_via_nodes, node);
1236               if (BE (err < 0, 0))
1237                 return -2;
1238               dest_node = dfa->edests[node].elems[0];
1239               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1240                                         dest_node))
1241                 return dest_node;
1242             }
1243         }
1244
1245       if (naccepted != 0
1246           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1247         {
1248           int dest_node = dfa->nexts[node];
1249           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1250           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1251                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1252                                                dest_node)))
1253             return -1;
1254           re_node_set_empty (eps_via_nodes);
1255           return dest_node;
1256         }
1257     }
1258   return -1;
1259 }
1260
1261 static reg_errcode_t
1262 internal_function
1263 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1264                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1265 {
1266   reg_errcode_t err;
1267   int num = fs->num++;
1268   if (fs->num == fs->alloc)
1269     {
1270       struct re_fail_stack_ent_t *new_array;
1271       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1272                                        * fs->alloc * 2));
1273       if (new_array == NULL)
1274         return REG_ESPACE;
1275       fs->alloc *= 2;
1276       fs->stack = new_array;
1277     }
1278   fs->stack[num].idx = str_idx;
1279   fs->stack[num].node = dest_node;
1280   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1281   if (fs->stack[num].regs == NULL)
1282     return REG_ESPACE;
1283   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1284   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1285   return err;
1286 }
1287
1288 static int
1289 internal_function
1290 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1291                 regmatch_t *regs, re_node_set *eps_via_nodes)
1292 {
1293   int num = --fs->num;
1294   assert (num >= 0);
1295   *pidx = fs->stack[num].idx;
1296   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1297   re_node_set_free (eps_via_nodes);
1298   re_free (fs->stack[num].regs);
1299   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1300   return fs->stack[num].node;
1301 }
1302
1303 /* Set the positions where the subexpressions are starts/ends to registers
1304    PMATCH.
1305    Note: We assume that pmatch[0] is already set, and
1306    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1307
1308 static reg_errcode_t
1309 internal_function
1310 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1311           regmatch_t *pmatch, int fl_backtrack)
1312 {
1313   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1314   int idx, cur_node;
1315   re_node_set eps_via_nodes;
1316   struct re_fail_stack_t *fs;
1317   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1318   regmatch_t *prev_idx_match;
1319   int prev_idx_match_malloced = 0;
1320
1321 #ifdef DEBUG
1322   assert (nmatch > 1);
1323   assert (mctx->state_log != NULL);
1324 #endif
1325   if (fl_backtrack)
1326     {
1327       fs = &fs_body;
1328       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1329       if (fs->stack == NULL)
1330         return REG_ESPACE;
1331     }
1332   else
1333     fs = NULL;
1334
1335   cur_node = dfa->init_node;
1336   re_node_set_init_empty (&eps_via_nodes);
1337
1338   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1339     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1340   else
1341     {
1342       prev_idx_match = re_malloc (regmatch_t, nmatch);
1343       if (prev_idx_match == NULL)
1344         {
1345           free_fail_stack_return (fs);
1346           return REG_ESPACE;
1347         }
1348       prev_idx_match_malloced = 1;
1349     }
1350   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1351
1352   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1353     {
1354       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1355
1356       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1357         {
1358           int reg_idx;
1359           if (fs)
1360             {
1361               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1362                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1363                   break;
1364               if (reg_idx == nmatch)
1365                 {
1366                   re_node_set_free (&eps_via_nodes);
1367                   if (prev_idx_match_malloced)
1368                     re_free (prev_idx_match);
1369                   return free_fail_stack_return (fs);
1370                 }
1371               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1372                                          &eps_via_nodes);
1373             }
1374           else
1375             {
1376               re_node_set_free (&eps_via_nodes);
1377               if (prev_idx_match_malloced)
1378                 re_free (prev_idx_match);
1379               return REG_NOERROR;
1380             }
1381         }
1382
1383       /* Proceed to next node.  */
1384       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1385                                     &eps_via_nodes, fs);
1386
1387       if (BE (cur_node < 0, 0))
1388         {
1389           if (BE (cur_node == -2, 0))
1390             {
1391               re_node_set_free (&eps_via_nodes);
1392               if (prev_idx_match_malloced)
1393                 re_free (prev_idx_match);
1394               free_fail_stack_return (fs);
1395               return REG_ESPACE;
1396             }
1397           if (fs)
1398             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1399                                        &eps_via_nodes);
1400           else
1401             {
1402               re_node_set_free (&eps_via_nodes);
1403               if (prev_idx_match_malloced)
1404                 re_free (prev_idx_match);
1405               return REG_NOMATCH;
1406             }
1407         }
1408     }
1409   re_node_set_free (&eps_via_nodes);
1410   if (prev_idx_match_malloced)
1411     re_free (prev_idx_match);
1412   return free_fail_stack_return (fs);
1413 }
1414
1415 static reg_errcode_t
1416 internal_function
1417 free_fail_stack_return (struct re_fail_stack_t *fs)
1418 {
1419   if (fs)
1420     {
1421       int fs_idx;
1422       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1423         {
1424           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1425           re_free (fs->stack[fs_idx].regs);
1426         }
1427       re_free (fs->stack);
1428     }
1429   return REG_NOERROR;
1430 }
1431
1432 static void
1433 internal_function
1434 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1435              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1436 {
1437   int type = dfa->nodes[cur_node].type;
1438   if (type == OP_OPEN_SUBEXP)
1439     {
1440       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1441
1442       /* We are at the first node of this sub expression.  */
1443       if (reg_num < nmatch)
1444         {
1445           pmatch[reg_num].rm_so = cur_idx;
1446           pmatch[reg_num].rm_eo = -1;
1447         }
1448     }
1449   else if (type == OP_CLOSE_SUBEXP)
1450     {
1451       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1452       if (reg_num < nmatch)
1453         {
1454           /* We are at the last node of this sub expression.  */
1455           if (pmatch[reg_num].rm_so < cur_idx)
1456             {
1457               pmatch[reg_num].rm_eo = cur_idx;
1458               /* This is a non-empty match or we are not inside an optional
1459                  subexpression.  Accept this right away.  */
1460               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1461             }
1462           else
1463             {
1464               if (dfa->nodes[cur_node].opt_subexp
1465                   && prev_idx_match[reg_num].rm_so != -1)
1466                 /* We transited through an empty match for an optional
1467                    subexpression, like (a?)*, and this is not the subexp's
1468                    first match.  Copy back the old content of the registers
1469                    so that matches of an inner subexpression are undone as
1470                    well, like in ((a?))*.  */
1471                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1472               else
1473                 /* We completed a subexpression, but it may be part of
1474                    an optional one, so do not update PREV_IDX_MATCH.  */
1475                 pmatch[reg_num].rm_eo = cur_idx;
1476             }
1477         }
1478     }
1479 }
1480
1481 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1482    and sift the nodes in each states according to the following rules.
1483    Updated state_log will be wrote to STATE_LOG.
1484
1485    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1486      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1487         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1488         the LAST_NODE, we throw away the node `a'.
1489      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1490         string `s' and transit to `b':
1491         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1492            away the node `a'.
1493         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1494             thrown away, we throw away the node `a'.
1495      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1496         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1497            node `a'.
1498         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1499             we throw away the node `a'.  */
1500
1501 #define STATE_NODE_CONTAINS(state,node) \
1502   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1503
1504 static reg_errcode_t
1505 internal_function
1506 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1507 {
1508   reg_errcode_t err;
1509   int null_cnt = 0;
1510   int str_idx = sctx->last_str_idx;
1511   re_node_set cur_dest;
1512
1513 #ifdef DEBUG
1514   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1515 #endif
1516
1517   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1518      transit to the last_node and the last_node itself.  */
1519   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1520   if (BE (err != REG_NOERROR, 0))
1521     return err;
1522   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1523   if (BE (err != REG_NOERROR, 0))
1524     goto free_return;
1525
1526   /* Then check each states in the state_log.  */
1527   while (str_idx > 0)
1528     {
1529       /* Update counters.  */
1530       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1531       if (null_cnt > mctx->max_mb_elem_len)
1532         {
1533           memset (sctx->sifted_states, '\0',
1534                   sizeof (re_dfastate_t *) * str_idx);
1535           re_node_set_free (&cur_dest);
1536           return REG_NOERROR;
1537         }
1538       re_node_set_empty (&cur_dest);
1539       --str_idx;
1540
1541       if (mctx->state_log[str_idx])
1542         {
1543           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1544           if (BE (err != REG_NOERROR, 0))
1545             goto free_return;
1546         }
1547
1548       /* Add all the nodes which satisfy the following conditions:
1549          - It can epsilon transit to a node in CUR_DEST.
1550          - It is in CUR_SRC.
1551          And update state_log.  */
1552       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1553       if (BE (err != REG_NOERROR, 0))
1554         goto free_return;
1555     }
1556   err = REG_NOERROR;
1557  free_return:
1558   re_node_set_free (&cur_dest);
1559   return err;
1560 }
1561
1562 static reg_errcode_t
1563 internal_function
1564 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1565                      int str_idx, re_node_set *cur_dest)
1566 {
1567   const re_dfa_t *const dfa = mctx->dfa;
1568   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1569   int i;
1570
1571   /* Then build the next sifted state.
1572      We build the next sifted state on `cur_dest', and update
1573      `sifted_states[str_idx]' with `cur_dest'.
1574      Note:
1575      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1576      `cur_src' points the node_set of the old `state_log[str_idx]'
1577      (with the epsilon nodes pre-filtered out).  */
1578   for (i = 0; i < cur_src->nelem; i++)
1579     {
1580       int prev_node = cur_src->elems[i];
1581       int naccepted = 0;
1582       int ret;
1583
1584 #ifdef DEBUG
1585       re_token_type_t type = dfa->nodes[prev_node].type;
1586       assert (!IS_EPSILON_NODE (type));
1587 #endif
1588 #ifdef RE_ENABLE_I18N
1589       /* If the node may accept `multi byte'.  */
1590       if (dfa->nodes[prev_node].accept_mb)
1591         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1592                                          str_idx, sctx->last_str_idx);
1593 #endif /* RE_ENABLE_I18N */
1594
1595       /* We don't check backreferences here.
1596          See update_cur_sifted_state().  */
1597       if (!naccepted
1598           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1599           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1600                                   dfa->nexts[prev_node]))
1601         naccepted = 1;
1602
1603       if (naccepted == 0)
1604         continue;
1605
1606       if (sctx->limits.nelem)
1607         {
1608           int to_idx = str_idx + naccepted;
1609           if (check_dst_limits (mctx, &sctx->limits,
1610                                 dfa->nexts[prev_node], to_idx,
1611                                 prev_node, str_idx))
1612             continue;
1613         }
1614       ret = re_node_set_insert (cur_dest, prev_node);
1615       if (BE (ret == -1, 0))
1616         return REG_ESPACE;
1617     }
1618
1619   return REG_NOERROR;
1620 }
1621
1622 /* Helper functions.  */
1623
1624 static reg_errcode_t
1625 internal_function
1626 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1627 {
1628   int top = mctx->state_log_top;
1629
1630   if (next_state_log_idx >= mctx->input.bufs_len
1631       || (next_state_log_idx >= mctx->input.valid_len
1632           && mctx->input.valid_len < mctx->input.len))
1633     {
1634       reg_errcode_t err;
1635       err = extend_buffers (mctx);
1636       if (BE (err != REG_NOERROR, 0))
1637         return err;
1638     }
1639
1640   if (top < next_state_log_idx)
1641     {
1642       memset (mctx->state_log + top + 1, '\0',
1643               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1644       mctx->state_log_top = next_state_log_idx;
1645     }
1646   return REG_NOERROR;
1647 }
1648
1649 static reg_errcode_t
1650 internal_function
1651 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1652                    re_dfastate_t **src, int num)
1653 {
1654   int st_idx;
1655   reg_errcode_t err;
1656   for (st_idx = 0; st_idx < num; ++st_idx)
1657     {
1658       if (dst[st_idx] == NULL)
1659         dst[st_idx] = src[st_idx];
1660       else if (src[st_idx] != NULL)
1661         {
1662           re_node_set merged_set;
1663           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1664                                         &src[st_idx]->nodes);
1665           if (BE (err != REG_NOERROR, 0))
1666             return err;
1667           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1668           re_node_set_free (&merged_set);
1669           if (BE (err != REG_NOERROR, 0))
1670             return err;
1671         }
1672     }
1673   return REG_NOERROR;
1674 }
1675
1676 static reg_errcode_t
1677 internal_function
1678 update_cur_sifted_state (const re_match_context_t *mctx,
1679                          re_sift_context_t *sctx, int str_idx,
1680                          re_node_set *dest_nodes)
1681 {
1682   const re_dfa_t *const dfa = mctx->dfa;
1683   reg_errcode_t err = REG_NOERROR;
1684   const re_node_set *candidates;
1685   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1686                 : &mctx->state_log[str_idx]->nodes);
1687
1688   if (dest_nodes->nelem == 0)
1689     sctx->sifted_states[str_idx] = NULL;
1690   else
1691     {
1692       if (candidates)
1693         {
1694           /* At first, add the nodes which can epsilon transit to a node in
1695              DEST_NODE.  */
1696           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1697           if (BE (err != REG_NOERROR, 0))
1698             return err;
1699
1700           /* Then, check the limitations in the current sift_context.  */
1701           if (sctx->limits.nelem)
1702             {
1703               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1704                                          mctx->bkref_ents, str_idx);
1705               if (BE (err != REG_NOERROR, 0))
1706                 return err;
1707             }
1708         }
1709
1710       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1711       if (BE (err != REG_NOERROR, 0))
1712         return err;
1713     }
1714
1715   if (candidates && mctx->state_log[str_idx]->has_backref)
1716     {
1717       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1718       if (BE (err != REG_NOERROR, 0))
1719         return err;
1720     }
1721   return REG_NOERROR;
1722 }
1723
1724 static reg_errcode_t
1725 internal_function
1726 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1727                        const re_node_set *candidates)
1728 {
1729   reg_errcode_t err = REG_NOERROR;
1730   int i;
1731
1732   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1733   if (BE (err != REG_NOERROR, 0))
1734     return err;
1735
1736   if (!state->inveclosure.alloc)
1737     {
1738       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1739       if (BE (err != REG_NOERROR, 0))
1740         return REG_ESPACE;
1741       for (i = 0; i < dest_nodes->nelem; i++)
1742         re_node_set_merge (&state->inveclosure,
1743                            dfa->inveclosures + dest_nodes->elems[i]);
1744     }
1745   return re_node_set_add_intersect (dest_nodes, candidates,
1746                                     &state->inveclosure);
1747 }
1748
1749 static reg_errcode_t
1750 internal_function
1751 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1752                        const re_node_set *candidates)
1753 {
1754     int ecl_idx;
1755     reg_errcode_t err;
1756     re_node_set *inv_eclosure = dfa->inveclosures + node;
1757     re_node_set except_nodes;
1758     re_node_set_init_empty (&except_nodes);
1759     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1760       {
1761         int cur_node = inv_eclosure->elems[ecl_idx];
1762         if (cur_node == node)
1763           continue;
1764         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1765           {
1766             int edst1 = dfa->edests[cur_node].elems[0];
1767             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1768                          ? dfa->edests[cur_node].elems[1] : -1);
1769             if ((!re_node_set_contains (inv_eclosure, edst1)
1770                  && re_node_set_contains (dest_nodes, edst1))
1771                 || (edst2 > 0
1772                     && !re_node_set_contains (inv_eclosure, edst2)
1773                     && re_node_set_contains (dest_nodes, edst2)))
1774               {
1775                 err = re_node_set_add_intersect (&except_nodes, candidates,
1776                                                  dfa->inveclosures + cur_node);
1777                 if (BE (err != REG_NOERROR, 0))
1778                   {
1779                     re_node_set_free (&except_nodes);
1780                     return err;
1781                   }
1782               }
1783           }
1784       }
1785     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1786       {
1787         int cur_node = inv_eclosure->elems[ecl_idx];
1788         if (!re_node_set_contains (&except_nodes, cur_node))
1789           {
1790             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1791             re_node_set_remove_at (dest_nodes, idx);
1792           }
1793       }
1794     re_node_set_free (&except_nodes);
1795     return REG_NOERROR;
1796 }
1797
1798 static int
1799 internal_function
1800 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1801                   int dst_node, int dst_idx, int src_node, int src_idx)
1802 {
1803   const re_dfa_t *const dfa = mctx->dfa;
1804   int lim_idx, src_pos, dst_pos;
1805
1806   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1807   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1808   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1809     {
1810       int subexp_idx;
1811       struct re_backref_cache_entry *ent;
1812       ent = mctx->bkref_ents + limits->elems[lim_idx];
1813       subexp_idx = dfa->nodes[ent->node].opr.idx;
1814
1815       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1816                                            subexp_idx, dst_node, dst_idx,
1817                                            dst_bkref_idx);
1818       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1819                                            subexp_idx, src_node, src_idx,
1820                                            src_bkref_idx);
1821
1822       /* In case of:
1823          <src> <dst> ( <subexp> )
1824          ( <subexp> ) <src> <dst>
1825          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1826       if (src_pos == dst_pos)
1827         continue; /* This is unrelated limitation.  */
1828       else
1829         return 1;
1830     }
1831   return 0;
1832 }
1833
1834 static int
1835 internal_function
1836 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1837                              int subexp_idx, int from_node, int bkref_idx)
1838 {
1839   const re_dfa_t *const dfa = mctx->dfa;
1840   const re_node_set *eclosures = dfa->eclosures + from_node;
1841   int node_idx;
1842
1843   /* Else, we are on the boundary: examine the nodes on the epsilon
1844      closure.  */
1845   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1846     {
1847       int node = eclosures->elems[node_idx];
1848       switch (dfa->nodes[node].type)
1849         {
1850         case OP_BACK_REF:
1851           if (bkref_idx != -1)
1852             {
1853               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1854               do
1855                 {
1856                   int dst, cpos;
1857
1858                   if (ent->node != node)
1859                     continue;
1860
1861                   if (subexp_idx < BITSET_WORD_BITS
1862                       && !(ent->eps_reachable_subexps_map
1863                            & ((bitset_word_t) 1 << subexp_idx)))
1864                     continue;
1865
1866                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1867                      OP_CLOSE_SUBEXP cases below.  But, if the
1868                      destination node is the same node as the source
1869                      node, don't recurse because it would cause an
1870                      infinite loop: a regex that exhibits this behavior
1871                      is ()\1*\1*  */
1872                   dst = dfa->edests[node].elems[0];
1873                   if (dst == from_node)
1874                     {
1875                       if (boundaries & 1)
1876                         return -1;
1877                       else /* if (boundaries & 2) */
1878                         return 0;
1879                     }
1880
1881                   cpos =
1882                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1883                                                  dst, bkref_idx);
1884                   if (cpos == -1 /* && (boundaries & 1) */)
1885                     return -1;
1886                   if (cpos == 0 && (boundaries & 2))
1887                     return 0;
1888
1889                   if (subexp_idx < BITSET_WORD_BITS)
1890                     ent->eps_reachable_subexps_map
1891                       &= ~((bitset_word_t) 1 << subexp_idx);
1892                 }
1893               while (ent++->more);
1894             }
1895           break;
1896
1897         case OP_OPEN_SUBEXP:
1898           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1899             return -1;
1900           break;
1901
1902         case OP_CLOSE_SUBEXP:
1903           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1904             return 0;
1905           break;
1906
1907         default:
1908             break;
1909         }
1910     }
1911
1912   return (boundaries & 2) ? 1 : 0;
1913 }
1914
1915 static int
1916 internal_function
1917 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1918                            int subexp_idx, int from_node, int str_idx,
1919                            int bkref_idx)
1920 {
1921   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1922   int boundaries;
1923
1924   /* If we are outside the range of the subexpression, return -1 or 1.  */
1925   if (str_idx < lim->subexp_from)
1926     return -1;
1927
1928   if (lim->subexp_to < str_idx)
1929     return 1;
1930
1931   /* If we are within the subexpression, return 0.  */
1932   boundaries = (str_idx == lim->subexp_from);
1933   boundaries |= (str_idx == lim->subexp_to) << 1;
1934   if (boundaries == 0)
1935     return 0;
1936
1937   /* Else, examine epsilon closure.  */
1938   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1939                                       from_node, bkref_idx);
1940 }
1941
1942 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1943    which are against limitations from DEST_NODES. */
1944
1945 static reg_errcode_t
1946 internal_function
1947 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1948                      const re_node_set *candidates, re_node_set *limits,
1949                      struct re_backref_cache_entry *bkref_ents, int str_idx)
1950 {
1951   reg_errcode_t err;
1952   int node_idx, lim_idx;
1953
1954   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1955     {
1956       int subexp_idx;
1957       struct re_backref_cache_entry *ent;
1958       ent = bkref_ents + limits->elems[lim_idx];
1959
1960       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1961         continue; /* This is unrelated limitation.  */
1962
1963       subexp_idx = dfa->nodes[ent->node].opr.idx;
1964       if (ent->subexp_to == str_idx)
1965         {
1966           int ops_node = -1;
1967           int cls_node = -1;
1968           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1969             {
1970               int node = dest_nodes->elems[node_idx];
1971               re_token_type_t type = dfa->nodes[node].type;
1972               if (type == OP_OPEN_SUBEXP
1973                   && subexp_idx == dfa->nodes[node].opr.idx)
1974                 ops_node = node;
1975               else if (type == OP_CLOSE_SUBEXP
1976                        && subexp_idx == dfa->nodes[node].opr.idx)
1977                 cls_node = node;
1978             }
1979
1980           /* Check the limitation of the open subexpression.  */
1981           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
1982           if (ops_node >= 0)
1983             {
1984               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1985                                            candidates);
1986               if (BE (err != REG_NOERROR, 0))
1987                 return err;
1988             }
1989
1990           /* Check the limitation of the close subexpression.  */
1991           if (cls_node >= 0)
1992             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1993               {
1994                 int node = dest_nodes->elems[node_idx];
1995                 if (!re_node_set_contains (dfa->inveclosures + node,
1996                                            cls_node)
1997                     && !re_node_set_contains (dfa->eclosures + node,
1998                                               cls_node))
1999                   {
2000                     /* It is against this limitation.
2001                        Remove it form the current sifted state.  */
2002                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2003                                                  candidates);
2004                     if (BE (err != REG_NOERROR, 0))
2005                       return err;
2006                     --node_idx;
2007                   }
2008               }
2009         }
2010       else /* (ent->subexp_to != str_idx)  */
2011         {
2012           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2013             {
2014               int node = dest_nodes->elems[node_idx];
2015               re_token_type_t type = dfa->nodes[node].type;
2016               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2017                 {
2018                   if (subexp_idx != dfa->nodes[node].opr.idx)
2019                     continue;
2020                   /* It is against this limitation.
2021                      Remove it form the current sifted state.  */
2022                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2023                                                candidates);
2024                   if (BE (err != REG_NOERROR, 0))
2025                     return err;
2026                 }
2027             }
2028         }
2029     }
2030   return REG_NOERROR;
2031 }
2032
2033 static reg_errcode_t
2034 internal_function
2035 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2036                    int str_idx, const re_node_set *candidates)
2037 {
2038   const re_dfa_t *const dfa = mctx->dfa;
2039   reg_errcode_t err;
2040   int node_idx, node;
2041   re_sift_context_t local_sctx;
2042   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2043
2044   if (first_idx == -1)
2045     return REG_NOERROR;
2046
2047   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2048
2049   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2050     {
2051       int enabled_idx;
2052       re_token_type_t type;
2053       struct re_backref_cache_entry *entry;
2054       node = candidates->elems[node_idx];
2055       type = dfa->nodes[node].type;
2056       /* Avoid infinite loop for the REs like "()\1+".  */
2057       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2058         continue;
2059       if (type != OP_BACK_REF)
2060         continue;
2061
2062       entry = mctx->bkref_ents + first_idx;
2063       enabled_idx = first_idx;
2064       do
2065         {
2066           int subexp_len;
2067           int to_idx;
2068           int dst_node;
2069           int ret;
2070           re_dfastate_t *cur_state;
2071
2072           if (entry->node != node)
2073             continue;
2074           subexp_len = entry->subexp_to - entry->subexp_from;
2075           to_idx = str_idx + subexp_len;
2076           dst_node = (subexp_len ? dfa->nexts[node]
2077                       : dfa->edests[node].elems[0]);
2078
2079           if (to_idx > sctx->last_str_idx
2080               || sctx->sifted_states[to_idx] == NULL
2081               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2082               || check_dst_limits (mctx, &sctx->limits, node,
2083                                    str_idx, dst_node, to_idx))
2084             continue;
2085
2086           if (local_sctx.sifted_states == NULL)
2087             {
2088               local_sctx = *sctx;
2089               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2090               if (BE (err != REG_NOERROR, 0))
2091                 goto free_return;
2092             }
2093           local_sctx.last_node = node;
2094           local_sctx.last_str_idx = str_idx;
2095           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2096           if (BE (ret < 0, 0))
2097             {
2098               err = REG_ESPACE;
2099               goto free_return;
2100             }
2101           cur_state = local_sctx.sifted_states[str_idx];
2102           err = sift_states_backward (mctx, &local_sctx);
2103           if (BE (err != REG_NOERROR, 0))
2104             goto free_return;
2105           if (sctx->limited_states != NULL)
2106             {
2107               err = merge_state_array (dfa, sctx->limited_states,
2108                                        local_sctx.sifted_states,
2109                                        str_idx + 1);
2110               if (BE (err != REG_NOERROR, 0))
2111                 goto free_return;
2112             }
2113           local_sctx.sifted_states[str_idx] = cur_state;
2114           re_node_set_remove (&local_sctx.limits, enabled_idx);
2115
2116           /* mctx->bkref_ents may have changed, reload the pointer.  */
2117           entry = mctx->bkref_ents + enabled_idx;
2118         }
2119       while (enabled_idx++, entry++->more);
2120     }
2121   err = REG_NOERROR;
2122  free_return:
2123   if (local_sctx.sifted_states != NULL)
2124     {
2125       re_node_set_free (&local_sctx.limits);
2126     }
2127
2128   return err;
2129 }
2130
2131
2132 #ifdef RE_ENABLE_I18N
2133 static int
2134 internal_function
2135 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2136                      int node_idx, int str_idx, int max_str_idx)
2137 {
2138   const re_dfa_t *const dfa = mctx->dfa;
2139   int naccepted;
2140   /* Check the node can accept `multi byte'.  */
2141   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2142   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2143       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2144                             dfa->nexts[node_idx]))
2145     /* The node can't accept the `multi byte', or the
2146        destination was already thrown away, then the node
2147        could't accept the current input `multi byte'.   */
2148     naccepted = 0;
2149   /* Otherwise, it is sure that the node could accept
2150      `naccepted' bytes input.  */
2151   return naccepted;
2152 }
2153 #endif /* RE_ENABLE_I18N */
2154
2155
2156 /* Functions for state transition.  */
2157
2158 /* Return the next state to which the current state STATE will transit by
2159    accepting the current input byte, and update STATE_LOG if necessary.
2160    If STATE can accept a multibyte char/collating element/back reference
2161    update the destination of STATE_LOG.  */
2162
2163 static re_dfastate_t *
2164 internal_function
2165 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2166                re_dfastate_t *state)
2167 {
2168   re_dfastate_t **trtable;
2169   unsigned char ch;
2170
2171 #ifdef RE_ENABLE_I18N
2172   /* If the current state can accept multibyte.  */
2173   if (BE (state->accept_mb, 0))
2174     {
2175       *err = transit_state_mb (mctx, state);
2176       if (BE (*err != REG_NOERROR, 0))
2177         return NULL;
2178     }
2179 #endif /* RE_ENABLE_I18N */
2180
2181   /* Then decide the next state with the single byte.  */
2182 #if 0
2183   if (0)
2184     /* don't use transition table  */
2185     return transit_state_sb (err, mctx, state);
2186 #endif
2187
2188   /* Use transition table  */
2189   ch = re_string_fetch_byte (&mctx->input);
2190   for (;;)
2191     {
2192       trtable = state->trtable;
2193       if (BE (trtable != NULL, 1))
2194         return trtable[ch];
2195
2196       trtable = state->word_trtable;
2197       if (BE (trtable != NULL, 1))
2198         {
2199           unsigned int context;
2200           context
2201             = re_string_context_at (&mctx->input,
2202                                     re_string_cur_idx (&mctx->input) - 1,
2203                                     mctx->eflags);
2204           if (IS_WORD_CONTEXT (context))
2205             return trtable[ch + SBC_MAX];
2206           else
2207             return trtable[ch];
2208         }
2209
2210       if (!build_trtable (mctx->dfa, state))
2211         {
2212           *err = REG_ESPACE;
2213           return NULL;
2214         }
2215
2216       /* Retry, we now have a transition table.  */
2217     }
2218 }
2219
2220 /* Update the state_log if we need */
2221 re_dfastate_t *
2222 internal_function
2223 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2224                       re_dfastate_t *next_state)
2225 {
2226   const re_dfa_t *const dfa = mctx->dfa;
2227   int cur_idx = re_string_cur_idx (&mctx->input);
2228
2229   if (cur_idx > mctx->state_log_top)
2230     {
2231       mctx->state_log[cur_idx] = next_state;
2232       mctx->state_log_top = cur_idx;
2233     }
2234   else if (mctx->state_log[cur_idx] == 0)
2235     {
2236       mctx->state_log[cur_idx] = next_state;
2237     }
2238   else
2239     {
2240       re_dfastate_t *pstate;
2241       unsigned int context;
2242       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2243       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2244          the destination of a multibyte char/collating element/
2245          back reference.  Then the next state is the union set of
2246          these destinations and the results of the transition table.  */
2247       pstate = mctx->state_log[cur_idx];
2248       log_nodes = pstate->entrance_nodes;
2249       if (next_state != NULL)
2250         {
2251           table_nodes = next_state->entrance_nodes;
2252           *err = re_node_set_init_union (&next_nodes, table_nodes,
2253                                              log_nodes);
2254           if (BE (*err != REG_NOERROR, 0))
2255             return NULL;
2256         }
2257       else
2258         next_nodes = *log_nodes;
2259       /* Note: We already add the nodes of the initial state,
2260          then we don't need to add them here.  */
2261
2262       context = re_string_context_at (&mctx->input,
2263                                       re_string_cur_idx (&mctx->input) - 1,
2264                                       mctx->eflags);
2265       next_state = mctx->state_log[cur_idx]
2266         = re_acquire_state_context (err, dfa, &next_nodes, context);
2267       /* We don't need to check errors here, since the return value of
2268          this function is next_state and ERR is already set.  */
2269
2270       if (table_nodes != NULL)
2271         re_node_set_free (&next_nodes);
2272     }
2273
2274   if (BE (dfa->nbackref, 0) && next_state != NULL)
2275     {
2276       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2277          later.  We must check them here, since the back references in the
2278          next state might use them.  */
2279       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2280                                         cur_idx);
2281       if (BE (*err != REG_NOERROR, 0))
2282         return NULL;
2283
2284       /* If the next state has back references.  */
2285       if (next_state->has_backref)
2286         {
2287           *err = transit_state_bkref (mctx, &next_state->nodes);
2288           if (BE (*err != REG_NOERROR, 0))
2289             return NULL;
2290           next_state = mctx->state_log[cur_idx];
2291         }
2292     }
2293
2294   return next_state;
2295 }
2296
2297 /* Skip bytes in the input that correspond to part of a
2298    multi-byte match, then look in the log for a state
2299    from which to restart matching.  */
2300 re_dfastate_t *
2301 internal_function
2302 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2303 {
2304   re_dfastate_t *cur_state;
2305   do
2306     {
2307       int max = mctx->state_log_top;
2308       int cur_str_idx = re_string_cur_idx (&mctx->input);
2309
2310       do
2311         {
2312           if (++cur_str_idx > max)
2313             return NULL;
2314           re_string_skip_bytes (&mctx->input, 1);
2315         }
2316       while (mctx->state_log[cur_str_idx] == NULL);
2317
2318       cur_state = merge_state_with_log (err, mctx, NULL);
2319     }
2320   while (*err == REG_NOERROR && cur_state == NULL);
2321   return cur_state;
2322 }
2323
2324 /* Helper functions for transit_state.  */
2325
2326 /* From the node set CUR_NODES, pick up the nodes whose types are
2327    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2328    expression. And register them to use them later for evaluating the
2329    correspoding back references.  */
2330
2331 static reg_errcode_t
2332 internal_function
2333 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2334                            int str_idx)
2335 {
2336   const re_dfa_t *const dfa = mctx->dfa;
2337   int node_idx;
2338   reg_errcode_t err;
2339
2340   /* TODO: This isn't efficient.
2341            Because there might be more than one nodes whose types are
2342            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2343            nodes.
2344            E.g. RE: (a){2}  */
2345   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2346     {
2347       int node = cur_nodes->elems[node_idx];
2348       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2349           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2350           && (dfa->used_bkref_map
2351               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2352         {
2353           err = match_ctx_add_subtop (mctx, node, str_idx);
2354           if (BE (err != REG_NOERROR, 0))
2355             return err;
2356         }
2357     }
2358   return REG_NOERROR;
2359 }
2360
2361 #if 0
2362 /* Return the next state to which the current state STATE will transit by
2363    accepting the current input byte.  */
2364
2365 static re_dfastate_t *
2366 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2367                   re_dfastate_t *state)
2368 {
2369   const re_dfa_t *const dfa = mctx->dfa;
2370   re_node_set next_nodes;
2371   re_dfastate_t *next_state;
2372   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2373   unsigned int context;
2374
2375   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2376   if (BE (*err != REG_NOERROR, 0))
2377     return NULL;
2378   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2379     {
2380       int cur_node = state->nodes.elems[node_cnt];
2381       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2382         {
2383           *err = re_node_set_merge (&next_nodes,
2384                                     dfa->eclosures + dfa->nexts[cur_node]);
2385           if (BE (*err != REG_NOERROR, 0))
2386             {
2387               re_node_set_free (&next_nodes);
2388               return NULL;
2389             }
2390         }
2391     }
2392   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2393   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2394   /* We don't need to check errors here, since the return value of
2395      this function is next_state and ERR is already set.  */
2396
2397   re_node_set_free (&next_nodes);
2398   re_string_skip_bytes (&mctx->input, 1);
2399   return next_state;
2400 }
2401 #endif
2402
2403 #ifdef RE_ENABLE_I18N
2404 static reg_errcode_t
2405 internal_function
2406 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2407 {
2408   const re_dfa_t *const dfa = mctx->dfa;
2409   reg_errcode_t err;
2410   int i;
2411
2412   for (i = 0; i < pstate->nodes.nelem; ++i)
2413     {
2414       re_node_set dest_nodes, *new_nodes;
2415       int cur_node_idx = pstate->nodes.elems[i];
2416       int naccepted, dest_idx;
2417       unsigned int context;
2418       re_dfastate_t *dest_state;
2419
2420       if (!dfa->nodes[cur_node_idx].accept_mb)
2421         continue;
2422
2423       if (dfa->nodes[cur_node_idx].constraint)
2424         {
2425           context = re_string_context_at (&mctx->input,
2426                                           re_string_cur_idx (&mctx->input),
2427                                           mctx->eflags);
2428           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2429                                            context))
2430             continue;
2431         }
2432
2433       /* How many bytes the node can accept?  */
2434       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2435                                            re_string_cur_idx (&mctx->input));
2436       if (naccepted == 0)
2437         continue;
2438
2439       /* The node can accepts `naccepted' bytes.  */
2440       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2441       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2442                                : mctx->max_mb_elem_len);
2443       err = clean_state_log_if_needed (mctx, dest_idx);
2444       if (BE (err != REG_NOERROR, 0))
2445         return err;
2446 #ifdef DEBUG
2447       assert (dfa->nexts[cur_node_idx] != -1);
2448 #endif
2449       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2450
2451       dest_state = mctx->state_log[dest_idx];
2452       if (dest_state == NULL)
2453         dest_nodes = *new_nodes;
2454       else
2455         {
2456           err = re_node_set_init_union (&dest_nodes,
2457                                         dest_state->entrance_nodes, new_nodes);
2458           if (BE (err != REG_NOERROR, 0))
2459             return err;
2460         }
2461       context = re_string_context_at (&mctx->input, dest_idx - 1,
2462                                       mctx->eflags);
2463       mctx->state_log[dest_idx]
2464         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2465       if (dest_state != NULL)
2466         re_node_set_free (&dest_nodes);
2467       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2468         return err;
2469     }
2470   return REG_NOERROR;
2471 }
2472 #endif /* RE_ENABLE_I18N */
2473
2474 static reg_errcode_t
2475 internal_function
2476 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2477 {
2478   const re_dfa_t *const dfa = mctx->dfa;
2479   reg_errcode_t err;
2480   int i;
2481   int cur_str_idx = re_string_cur_idx (&mctx->input);
2482
2483   for (i = 0; i < nodes->nelem; ++i)
2484     {
2485       int dest_str_idx, prev_nelem, bkc_idx;
2486       int node_idx = nodes->elems[i];
2487       unsigned int context;
2488       const re_token_t *node = dfa->nodes + node_idx;
2489       re_node_set *new_dest_nodes;
2490
2491       /* Check whether `node' is a backreference or not.  */
2492       if (node->type != OP_BACK_REF)
2493         continue;
2494
2495       if (node->constraint)
2496         {
2497           context = re_string_context_at (&mctx->input, cur_str_idx,
2498                                           mctx->eflags);
2499           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2500             continue;
2501         }
2502
2503       /* `node' is a backreference.
2504          Check the substring which the substring matched.  */
2505       bkc_idx = mctx->nbkref_ents;
2506       err = get_subexp (mctx, node_idx, cur_str_idx);
2507       if (BE (err != REG_NOERROR, 0))
2508         goto free_return;
2509
2510       /* And add the epsilon closures (which is `new_dest_nodes') of
2511          the backreference to appropriate state_log.  */
2512 #ifdef DEBUG
2513       assert (dfa->nexts[node_idx] != -1);
2514 #endif
2515       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2516         {
2517           int subexp_len;
2518           re_dfastate_t *dest_state;
2519           struct re_backref_cache_entry *bkref_ent;
2520           bkref_ent = mctx->bkref_ents + bkc_idx;
2521           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2522             continue;
2523           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2524           new_dest_nodes = (subexp_len == 0
2525                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2526                             : dfa->eclosures + dfa->nexts[node_idx]);
2527           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2528                           - bkref_ent->subexp_from);
2529           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2530                                           mctx->eflags);
2531           dest_state = mctx->state_log[dest_str_idx];
2532           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2533                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2534           /* Add `new_dest_node' to state_log.  */
2535           if (dest_state == NULL)
2536             {
2537               mctx->state_log[dest_str_idx]
2538                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2539                                             context);
2540               if (BE (mctx->state_log[dest_str_idx] == NULL
2541                       && err != REG_NOERROR, 0))
2542                 goto free_return;
2543             }
2544           else
2545             {
2546               re_node_set dest_nodes;
2547               err = re_node_set_init_union (&dest_nodes,
2548                                             dest_state->entrance_nodes,
2549                                             new_dest_nodes);
2550               if (BE (err != REG_NOERROR, 0))
2551                 {
2552                   re_node_set_free (&dest_nodes);
2553                   goto free_return;
2554                 }
2555               mctx->state_log[dest_str_idx]
2556                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2557               re_node_set_free (&dest_nodes);
2558               if (BE (mctx->state_log[dest_str_idx] == NULL
2559                       && err != REG_NOERROR, 0))
2560                 goto free_return;
2561             }
2562           /* We need to check recursively if the backreference can epsilon
2563              transit.  */
2564           if (subexp_len == 0
2565               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2566             {
2567               err = check_subexp_matching_top (mctx, new_dest_nodes,
2568                                                cur_str_idx);
2569               if (BE (err != REG_NOERROR, 0))
2570                 goto free_return;
2571               err = transit_state_bkref (mctx, new_dest_nodes);
2572               if (BE (err != REG_NOERROR, 0))
2573                 goto free_return;
2574             }
2575         }
2576     }
2577   err = REG_NOERROR;
2578  free_return:
2579   return err;
2580 }
2581
2582 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2583    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2584    Note that we might collect inappropriate candidates here.
2585    However, the cost of checking them strictly here is too high, then we
2586    delay these checking for prune_impossible_nodes().  */
2587
2588 static reg_errcode_t
2589 internal_function
2590 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2591 {
2592   const re_dfa_t *const dfa = mctx->dfa;
2593   int subexp_num, sub_top_idx;
2594   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2595   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2596   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2597   if (cache_idx != -1)
2598     {
2599       const struct re_backref_cache_entry *entry
2600         = mctx->bkref_ents + cache_idx;
2601       do
2602         if (entry->node == bkref_node)
2603           return REG_NOERROR; /* We already checked it.  */
2604       while (entry++->more);
2605     }
2606
2607   subexp_num = dfa->nodes[bkref_node].opr.idx;
2608
2609   /* For each sub expression  */
2610   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2611     {
2612       reg_errcode_t err;
2613       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2614       re_sub_match_last_t *sub_last;
2615       int sub_last_idx, sl_str, bkref_str_off;
2616
2617       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2618         continue; /* It isn't related.  */
2619
2620       sl_str = sub_top->str_idx;
2621       bkref_str_off = bkref_str_idx;
2622       /* At first, check the last node of sub expressions we already
2623          evaluated.  */
2624       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2625         {
2626           int sl_str_diff;
2627           sub_last = sub_top->lasts[sub_last_idx];
2628           sl_str_diff = sub_last->str_idx - sl_str;
2629           /* The matched string by the sub expression match with the substring
2630              at the back reference?  */
2631           if (sl_str_diff > 0)
2632             {
2633               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2634                 {
2635                   /* Not enough chars for a successful match.  */
2636                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2637                     break;
2638
2639                   err = clean_state_log_if_needed (mctx,
2640                                                    bkref_str_off
2641                                                    + sl_str_diff);
2642                   if (BE (err != REG_NOERROR, 0))
2643                     return err;
2644                   buf = (const char *) re_string_get_buffer (&mctx->input);
2645                 }
2646               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2647                 /* We don't need to search this sub expression any more.  */
2648                 break;
2649             }
2650           bkref_str_off += sl_str_diff;
2651           sl_str += sl_str_diff;
2652           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2653                                 bkref_str_idx);
2654
2655           /* Reload buf, since the preceding call might have reallocated
2656              the buffer.  */
2657           buf = (const char *) re_string_get_buffer (&mctx->input);
2658
2659           if (err == REG_NOMATCH)
2660             continue;
2661           if (BE (err != REG_NOERROR, 0))
2662             return err;
2663         }
2664
2665       if (sub_last_idx < sub_top->nlasts)
2666         continue;
2667       if (sub_last_idx > 0)
2668         ++sl_str;
2669       /* Then, search for the other last nodes of the sub expression.  */
2670       for (; sl_str <= bkref_str_idx; ++sl_str)
2671         {
2672           int cls_node, sl_str_off;
2673           const re_node_set *nodes;
2674           sl_str_off = sl_str - sub_top->str_idx;
2675           /* The matched string by the sub expression match with the substring
2676              at the back reference?  */
2677           if (sl_str_off > 0)
2678             {
2679               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2680                 {
2681                   /* If we are at the end of the input, we cannot match.  */
2682                   if (bkref_str_off >= mctx->input.len)
2683                     break;
2684
2685                   err = extend_buffers (mctx);
2686                   if (BE (err != REG_NOERROR, 0))
2687                     return err;
2688
2689                   buf = (const char *) re_string_get_buffer (&mctx->input);
2690                 }
2691               if (buf [bkref_str_off++] != buf[sl_str - 1])
2692                 break; /* We don't need to search this sub expression
2693                           any more.  */
2694             }
2695           if (mctx->state_log[sl_str] == NULL)
2696             continue;
2697           /* Does this state have a ')' of the sub expression?  */
2698           nodes = &mctx->state_log[sl_str]->nodes;
2699           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2700                                        OP_CLOSE_SUBEXP);
2701           if (cls_node == -1)
2702             continue; /* No.  */
2703           if (sub_top->path == NULL)
2704             {
2705               sub_top->path = calloc (sizeof (state_array_t),
2706                                       sl_str - sub_top->str_idx + 1);
2707               if (sub_top->path == NULL)
2708                 return REG_ESPACE;
2709             }
2710           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2711              in the current context?  */
2712           err = check_arrival (mctx, sub_top->path, sub_top->node,
2713                                sub_top->str_idx, cls_node, sl_str,
2714                                OP_CLOSE_SUBEXP);
2715           if (err == REG_NOMATCH)
2716               continue;
2717           if (BE (err != REG_NOERROR, 0))
2718               return err;
2719           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2720           if (BE (sub_last == NULL, 0))
2721             return REG_ESPACE;
2722           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2723                                 bkref_str_idx);
2724           if (err == REG_NOMATCH)
2725             continue;
2726         }
2727     }
2728   return REG_NOERROR;
2729 }
2730
2731 /* Helper functions for get_subexp().  */
2732
2733 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2734    If it can arrive, register the sub expression expressed with SUB_TOP
2735    and SUB_LAST.  */
2736
2737 static reg_errcode_t
2738 internal_function
2739 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2740                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2741 {
2742   reg_errcode_t err;
2743   int to_idx;
2744   /* Can the subexpression arrive the back reference?  */
2745   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2746                        sub_last->str_idx, bkref_node, bkref_str,
2747                        OP_OPEN_SUBEXP);
2748   if (err != REG_NOERROR)
2749     return err;
2750   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2751                              sub_last->str_idx);
2752   if (BE (err != REG_NOERROR, 0))
2753     return err;
2754   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2755   return clean_state_log_if_needed (mctx, to_idx);
2756 }
2757
2758 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2759    Search '(' if FL_OPEN, or search ')' otherwise.
2760    TODO: This function isn't efficient...
2761          Because there might be more than one nodes whose types are
2762          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2763          nodes.
2764          E.g. RE: (a){2}  */
2765
2766 static int
2767 internal_function
2768 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2769                   int subexp_idx, int type)
2770 {
2771   int cls_idx;
2772   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2773     {
2774       int cls_node = nodes->elems[cls_idx];
2775       const re_token_t *node = dfa->nodes + cls_node;
2776       if (node->type == type
2777           && node->opr.idx == subexp_idx)
2778         return cls_node;
2779     }
2780   return -1;
2781 }
2782
2783 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2784    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2785    heavily reused.
2786    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2787
2788 static reg_errcode_t
2789 internal_function
2790 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2791                int top_str, int last_node, int last_str, int type)
2792 {
2793   const re_dfa_t *const dfa = mctx->dfa;
2794   reg_errcode_t err = REG_NOERROR;
2795   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2796   re_dfastate_t *cur_state = NULL;
2797   re_node_set *cur_nodes, next_nodes;
2798   re_dfastate_t **backup_state_log;
2799   unsigned int context;
2800
2801   subexp_num = dfa->nodes[top_node].opr.idx;
2802   /* Extend the buffer if we need.  */
2803   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2804     {
2805       re_dfastate_t **new_array;
2806       int old_alloc = path->alloc;
2807       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2808       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2809       if (BE (new_array == NULL, 0))
2810         {
2811           path->alloc = old_alloc;
2812           return REG_ESPACE;
2813         }
2814       path->array = new_array;
2815       memset (new_array + old_alloc, '\0',
2816               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2817     }
2818
2819   str_idx = path->next_idx ?: top_str;
2820
2821   /* Temporary modify MCTX.  */
2822   backup_state_log = mctx->state_log;
2823   backup_cur_idx = mctx->input.cur_idx;
2824   mctx->state_log = path->array;
2825   mctx->input.cur_idx = str_idx;
2826
2827   /* Setup initial node set.  */
2828   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2829   if (str_idx == top_str)
2830     {
2831       err = re_node_set_init_1 (&next_nodes, top_node);
2832       if (BE (err != REG_NOERROR, 0))
2833         return err;
2834       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2835       if (BE (err != REG_NOERROR, 0))
2836         {
2837           re_node_set_free (&next_nodes);
2838           return err;
2839         }
2840     }
2841   else
2842     {
2843       cur_state = mctx->state_log[str_idx];
2844       if (cur_state && cur_state->has_backref)
2845         {
2846           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2847           if (BE (err != REG_NOERROR, 0))
2848             return err;
2849         }
2850       else
2851         re_node_set_init_empty (&next_nodes);
2852     }
2853   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2854     {
2855       if (next_nodes.nelem)
2856         {
2857           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2858                                     subexp_num, type);
2859           if (BE (err != REG_NOERROR, 0))
2860             {
2861               re_node_set_free (&next_nodes);
2862               return err;
2863             }
2864         }
2865       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2866       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2867         {
2868           re_node_set_free (&next_nodes);
2869           return err;
2870         }
2871       mctx->state_log[str_idx] = cur_state;
2872     }
2873
2874   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2875     {
2876       re_node_set_empty (&next_nodes);
2877       if (mctx->state_log[str_idx + 1])
2878         {
2879           err = re_node_set_merge (&next_nodes,
2880                                    &mctx->state_log[str_idx + 1]->nodes);
2881           if (BE (err != REG_NOERROR, 0))
2882             {
2883               re_node_set_free (&next_nodes);
2884               return err;
2885             }
2886         }
2887       if (cur_state)
2888         {
2889           err = check_arrival_add_next_nodes (mctx, str_idx,
2890                                               &cur_state->non_eps_nodes,
2891                                               &next_nodes);
2892           if (BE (err != REG_NOERROR, 0))
2893             {
2894               re_node_set_free (&next_nodes);
2895               return err;
2896             }
2897         }
2898       ++str_idx;
2899       if (next_nodes.nelem)
2900         {
2901           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2902           if (BE (err != REG_NOERROR, 0))
2903             {
2904               re_node_set_free (&next_nodes);
2905               return err;
2906             }
2907           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2908                                     subexp_num, type);
2909           if (BE (err != REG_NOERROR, 0))
2910             {
2911               re_node_set_free (&next_nodes);
2912               return err;
2913             }
2914         }
2915       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2916       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2917       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2918         {
2919           re_node_set_free (&next_nodes);
2920           return err;
2921         }
2922       mctx->state_log[str_idx] = cur_state;
2923       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2924     }
2925   re_node_set_free (&next_nodes);
2926   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2927                : &mctx->state_log[last_str]->nodes);
2928   path->next_idx = str_idx;
2929
2930   /* Fix MCTX.  */
2931   mctx->state_log = backup_state_log;
2932   mctx->input.cur_idx = backup_cur_idx;
2933
2934   /* Then check the current node set has the node LAST_NODE.  */
2935   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2936     return REG_NOERROR;
2937
2938   return REG_NOMATCH;
2939 }
2940
2941 /* Helper functions for check_arrival.  */
2942
2943 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2944    to NEXT_NODES.
2945    TODO: This function is similar to the functions transit_state*(),
2946          however this function has many additional works.
2947          Can't we unify them?  */
2948
2949 static reg_errcode_t
2950 internal_function
2951 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2952                               re_node_set *cur_nodes, re_node_set *next_nodes)
2953 {
2954   const re_dfa_t *const dfa = mctx->dfa;
2955   int result;
2956   int cur_idx;
2957 #ifdef RE_ENABLE_I18N
2958   reg_errcode_t err = REG_NOERROR;
2959 #endif
2960   re_node_set union_set;
2961   re_node_set_init_empty (&union_set);
2962   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2963     {
2964       int naccepted = 0;
2965       int cur_node = cur_nodes->elems[cur_idx];
2966 #ifdef DEBUG
2967       re_token_type_t type = dfa->nodes[cur_node].type;
2968       assert (!IS_EPSILON_NODE (type));
2969 #endif
2970 #ifdef RE_ENABLE_I18N
2971       /* If the node may accept `multi byte'.  */
2972       if (dfa->nodes[cur_node].accept_mb)
2973         {
2974           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2975                                                str_idx);
2976           if (naccepted > 1)
2977             {
2978               re_dfastate_t *dest_state;
2979               int next_node = dfa->nexts[cur_node];
2980               int next_idx = str_idx + naccepted;
2981               dest_state = mctx->state_log[next_idx];
2982               re_node_set_empty (&union_set);
2983               if (dest_state)
2984                 {
2985                   err = re_node_set_merge (&union_set, &dest_state->nodes);
2986                   if (BE (err != REG_NOERROR, 0))
2987                     {
2988                       re_node_set_free (&union_set);
2989                       return err;
2990                     }
2991                 }
2992               result = re_node_set_insert (&union_set, next_node);
2993               if (BE (result < 0, 0))
2994                 {
2995                   re_node_set_free (&union_set);
2996                   return REG_ESPACE;
2997                 }
2998               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2999                                                             &union_set);
3000               if (BE (mctx->state_log[next_idx] == NULL
3001                       && err != REG_NOERROR, 0))
3002                 {
3003                   re_node_set_free (&union_set);
3004                   return err;
3005                 }
3006             }
3007         }
3008 #endif /* RE_ENABLE_I18N */
3009       if (naccepted
3010           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3011         {
3012           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3013           if (BE (result < 0, 0))
3014             {
3015               re_node_set_free (&union_set);
3016               return REG_ESPACE;
3017             }
3018         }
3019     }
3020   re_node_set_free (&union_set);
3021   return REG_NOERROR;
3022 }
3023
3024 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3025    CUR_NODES, however exclude the nodes which are:
3026     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3027     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3028 */
3029
3030 static reg_errcode_t
3031 internal_function
3032 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3033                           int ex_subexp, int type)
3034 {
3035   reg_errcode_t err;
3036   int idx, outside_node;
3037   re_node_set new_nodes;
3038 #ifdef DEBUG
3039   assert (cur_nodes->nelem);
3040 #endif
3041   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3042   if (BE (err != REG_NOERROR, 0))
3043     return err;
3044   /* Create a new node set NEW_NODES with the nodes which are epsilon
3045      closures of the node in CUR_NODES.  */
3046
3047   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3048     {
3049       int cur_node = cur_nodes->elems[idx];
3050       const re_node_set *eclosure = dfa->eclosures + cur_node;
3051       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3052       if (outside_node == -1)
3053         {
3054           /* There are no problematic nodes, just merge them.  */
3055           err = re_node_set_merge (&new_nodes, eclosure);
3056           if (BE (err != REG_NOERROR, 0))
3057             {
3058               re_node_set_free (&new_nodes);
3059               return err;
3060             }
3061         }
3062       else
3063         {
3064           /* There are problematic nodes, re-calculate incrementally.  */
3065           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3066                                               ex_subexp, type);
3067           if (BE (err != REG_NOERROR, 0))
3068             {
3069               re_node_set_free (&new_nodes);
3070               return err;
3071             }
3072         }
3073     }
3074   re_node_set_free (cur_nodes);
3075   *cur_nodes = new_nodes;
3076   return REG_NOERROR;
3077 }
3078
3079 /* Helper function for check_arrival_expand_ecl.
3080    Check incrementally the epsilon closure of TARGET, and if it isn't
3081    problematic append it to DST_NODES.  */
3082
3083 static reg_errcode_t
3084 internal_function
3085 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3086                               int target, int ex_subexp, int type)
3087 {
3088   int cur_node;
3089   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3090     {
3091       int err;
3092
3093       if (dfa->nodes[cur_node].type == type
3094           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3095         {
3096           if (type == OP_CLOSE_SUBEXP)
3097             {
3098               err = re_node_set_insert (dst_nodes, cur_node);
3099               if (BE (err == -1, 0))
3100                 return REG_ESPACE;
3101             }
3102           break;
3103         }
3104       err = re_node_set_insert (dst_nodes, cur_node);
3105       if (BE (err == -1, 0))
3106         return REG_ESPACE;
3107       if (dfa->edests[cur_node].nelem == 0)
3108         break;
3109       if (dfa->edests[cur_node].nelem == 2)
3110         {
3111           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3112                                               dfa->edests[cur_node].elems[1],
3113                                               ex_subexp, type);
3114           if (BE (err != REG_NOERROR, 0))
3115             return err;
3116         }
3117       cur_node = dfa->edests[cur_node].elems[0];
3118     }
3119   return REG_NOERROR;
3120 }
3121
3122
3123 /* For all the back references in the current state, calculate the
3124    destination of the back references by the appropriate entry
3125    in MCTX->BKREF_ENTS.  */
3126
3127 static reg_errcode_t
3128 internal_function
3129 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3130                     int cur_str, int subexp_num, int type)
3131 {
3132   const re_dfa_t *const dfa = mctx->dfa;
3133   reg_errcode_t err;
3134   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3135   struct re_backref_cache_entry *ent;
3136
3137   if (cache_idx_start == -1)
3138     return REG_NOERROR;
3139
3140  restart:
3141   ent = mctx->bkref_ents + cache_idx_start;
3142   do
3143     {
3144       int to_idx, next_node;
3145
3146       /* Is this entry ENT is appropriate?  */
3147       if (!re_node_set_contains (cur_nodes, ent->node))
3148         continue; /* No.  */
3149
3150       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3151       /* Calculate the destination of the back reference, and append it
3152          to MCTX->STATE_LOG.  */
3153       if (to_idx == cur_str)
3154         {
3155           /* The backreference did epsilon transit, we must re-check all the
3156              node in the current state.  */
3157           re_node_set new_dests;
3158           reg_errcode_t err2, err3;
3159           next_node = dfa->edests[ent->node].elems[0];
3160           if (re_node_set_contains (cur_nodes, next_node))
3161             continue;
3162           err = re_node_set_init_1 (&new_dests, next_node);
3163           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3164           err3 = re_node_set_merge (cur_nodes, &new_dests);
3165           re_node_set_free (&new_dests);
3166           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3167                   || err3 != REG_NOERROR, 0))
3168             {
3169               err = (err != REG_NOERROR ? err
3170                      : (err2 != REG_NOERROR ? err2 : err3));
3171               return err;
3172             }
3173           /* TODO: It is still inefficient...  */
3174           goto restart;
3175         }
3176       else
3177         {
3178           re_node_set union_set;
3179           next_node = dfa->nexts[ent->node];
3180           if (mctx->state_log[to_idx])
3181             {
3182               int ret;
3183               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3184                                         next_node))
3185                 continue;
3186               err = re_node_set_init_copy (&union_set,
3187                                            &mctx->state_log[to_idx]->nodes);
3188               ret = re_node_set_insert (&union_set, next_node);
3189               if (BE (err != REG_NOERROR || ret < 0, 0))
3190                 {
3191                   re_node_set_free (&union_set);
3192                   err = err != REG_NOERROR ? err : REG_ESPACE;
3193                   return err;
3194                 }
3195             }
3196           else
3197             {
3198               err = re_node_set_init_1 (&union_set, next_node);
3199               if (BE (err != REG_NOERROR, 0))
3200                 return err;
3201             }
3202           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3203           re_node_set_free (&union_set);
3204           if (BE (mctx->state_log[to_idx] == NULL
3205                   && err != REG_NOERROR, 0))
3206             return err;
3207         }
3208     }
3209   while (ent++->more);
3210   return REG_NOERROR;
3211 }
3212
3213 /* Build transition table for the state.
3214    Return 1 if succeeded, otherwise return NULL.  */
3215
3216 static int
3217 internal_function
3218 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3219 {
3220   reg_errcode_t err;
3221   int i, j, ch, need_word_trtable = 0;
3222   bitset_word_t elem, mask;
3223   bool dests_node_malloced = false;
3224   bool dest_states_malloced = false;
3225   int ndests; /* Number of the destination states from `state'.  */
3226   re_dfastate_t **trtable;
3227   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3228   re_node_set follows, *dests_node;
3229   bitset_t *dests_ch;
3230   bitset_t acceptable;
3231
3232   struct dests_alloc
3233   {
3234     re_node_set dests_node[SBC_MAX];
3235     bitset_t dests_ch[SBC_MAX];
3236   } *dests_alloc;
3237
3238   /* We build DFA states which corresponds to the destination nodes
3239      from `state'.  `dests_node[i]' represents the nodes which i-th
3240      destination state contains, and `dests_ch[i]' represents the
3241      characters which i-th destination state accepts.  */
3242   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3243     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3244   else
3245     {
3246       dests_alloc = re_malloc (struct dests_alloc, 1);
3247       if (BE (dests_alloc == NULL, 0))
3248         return 0;
3249       dests_node_malloced = true;
3250     }
3251   dests_node = dests_alloc->dests_node;
3252   dests_ch = dests_alloc->dests_ch;
3253
3254   /* Initialize transiton table.  */
3255   state->word_trtable = state->trtable = NULL;
3256
3257   /* At first, group all nodes belonging to `state' into several
3258      destinations.  */
3259   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3260   if (BE (ndests <= 0, 0))
3261     {
3262       if (dests_node_malloced)
3263         free (dests_alloc);
3264       /* Return 0 in case of an error, 1 otherwise.  */
3265       if (ndests == 0)
3266         {
3267           state->trtable = (re_dfastate_t **)
3268             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3269           return 1;
3270         }
3271       return 0;
3272     }
3273
3274   err = re_node_set_alloc (&follows, ndests + 1);
3275   if (BE (err != REG_NOERROR, 0))
3276     goto out_free;
3277
3278   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3279                          + ndests * 3 * sizeof (re_dfastate_t *)))
3280     dest_states = (re_dfastate_t **)
3281       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3282   else
3283     {
3284       dest_states = (re_dfastate_t **)
3285         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3286       if (BE (dest_states == NULL, 0))
3287         {
3288 out_free:
3289           if (dest_states_malloced)
3290             free (dest_states);
3291           re_node_set_free (&follows);
3292           for (i = 0; i < ndests; ++i)
3293             re_node_set_free (dests_node + i);
3294           if (dests_node_malloced)
3295             free (dests_alloc);
3296           return 0;
3297         }
3298       dest_states_malloced = true;
3299     }
3300   dest_states_word = dest_states + ndests;
3301   dest_states_nl = dest_states_word + ndests;
3302   bitset_empty (acceptable);
3303
3304   /* Then build the states for all destinations.  */
3305   for (i = 0; i < ndests; ++i)
3306     {
3307       int next_node;
3308       re_node_set_empty (&follows);
3309       /* Merge the follows of this destination states.  */
3310       for (j = 0; j < dests_node[i].nelem; ++j)
3311         {
3312           next_node = dfa->nexts[dests_node[i].elems[j]];
3313           if (next_node != -1)
3314             {
3315               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3316               if (BE (err != REG_NOERROR, 0))
3317                 goto out_free;
3318             }
3319         }
3320       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3321       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3322         goto out_free;
3323       /* If the new state has context constraint,
3324          build appropriate states for these contexts.  */
3325       if (dest_states[i]->has_constraint)
3326         {
3327           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3328                                                           CONTEXT_WORD);
3329           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3330             goto out_free;
3331
3332           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3333             need_word_trtable = 1;
3334
3335           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3336                                                         CONTEXT_NEWLINE);
3337           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3338             goto out_free;
3339         }
3340       else
3341         {
3342           dest_states_word[i] = dest_states[i];
3343           dest_states_nl[i] = dest_states[i];
3344         }
3345       bitset_merge (acceptable, dests_ch[i]);
3346     }
3347
3348   if (!BE (need_word_trtable, 0))
3349     {
3350       /* We don't care about whether the following character is a word
3351          character, or we are in a single-byte character set so we can
3352          discern by looking at the character code: allocate a
3353          256-entry transition table.  */
3354       trtable = state->trtable = calloc (sizeof (re_dfastate_t *), SBC_MAX);
3355       if (BE (trtable == NULL, 0))
3356         goto out_free;
3357
3358       /* For all characters ch...:  */
3359       for (i = 0; i < BITSET_WORDS; ++i)
3360         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3361              elem;
3362              mask <<= 1, elem >>= 1, ++ch)
3363           if (BE (elem & 1, 0))
3364             {
3365               /* There must be exactly one destination which accepts
3366                  character ch.  See group_nodes_into_DFAstates.  */
3367               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3368                 ;
3369
3370               /* j-th destination accepts the word character ch.  */
3371               if (dfa->word_char[i] & mask)
3372                 trtable[ch] = dest_states_word[j];
3373               else
3374                 trtable[ch] = dest_states[j];
3375             }
3376     }
3377   else
3378     {
3379       /* We care about whether the following character is a word
3380          character, and we are in a multi-byte character set: discern
3381          by looking at the character code: build two 256-entry
3382          transition tables, one starting at trtable[0] and one
3383          starting at trtable[SBC_MAX].  */
3384       trtable = state->word_trtable = calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3385       if (BE (trtable == NULL, 0))
3386         goto out_free;
3387
3388       /* For all characters ch...:  */
3389       for (i = 0; i < BITSET_WORDS; ++i)
3390         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3391              elem;
3392              mask <<= 1, elem >>= 1, ++ch)
3393           if (BE (elem & 1, 0))
3394             {
3395               /* There must be exactly one destination which accepts
3396                  character ch.  See group_nodes_into_DFAstates.  */
3397               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3398                 ;
3399
3400               /* j-th destination accepts the word character ch.  */
3401               trtable[ch] = dest_states[j];
3402               trtable[ch + SBC_MAX] = dest_states_word[j];
3403             }
3404     }
3405
3406   /* new line */
3407   if (bitset_contain (acceptable, NEWLINE_CHAR))
3408     {
3409       /* The current state accepts newline character.  */
3410       for (j = 0; j < ndests; ++j)
3411         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3412           {
3413             /* k-th destination accepts newline character.  */
3414             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3415             if (need_word_trtable)
3416               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3417             /* There must be only one destination which accepts
3418                newline.  See group_nodes_into_DFAstates.  */
3419             break;
3420           }
3421     }
3422
3423   if (dest_states_malloced)
3424     free (dest_states);
3425
3426   re_node_set_free (&follows);
3427   for (i = 0; i < ndests; ++i)
3428     re_node_set_free (dests_node + i);
3429
3430   if (dests_node_malloced)
3431     free (dests_alloc);
3432
3433   return 1;
3434 }
3435
3436 /* Group all nodes belonging to STATE into several destinations.
3437    Then for all destinations, set the nodes belonging to the destination
3438    to DESTS_NODE[i] and set the characters accepted by the destination
3439    to DEST_CH[i].  This function return the number of destinations.  */
3440
3441 static int
3442 internal_function
3443 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3444                             re_node_set *dests_node, bitset_t *dests_ch)
3445 {
3446   reg_errcode_t err;
3447   int result;
3448   int i, j, k;
3449   int ndests; /* Number of the destinations from `state'.  */
3450   bitset_t accepts; /* Characters a node can accept.  */
3451   const re_node_set *cur_nodes = &state->nodes;
3452   bitset_empty (accepts);
3453   ndests = 0;
3454
3455   /* For all the nodes belonging to `state',  */
3456   for (i = 0; i < cur_nodes->nelem; ++i)
3457     {
3458       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3459       re_token_type_t type = node->type;
3460       unsigned int constraint = node->constraint;
3461
3462       /* Enumerate all single byte character this node can accept.  */
3463       if (type == CHARACTER)
3464         bitset_set (accepts, node->opr.c);
3465       else if (type == SIMPLE_BRACKET)
3466         {
3467           bitset_merge (accepts, node->opr.sbcset);
3468         }
3469       else if (type == OP_PERIOD)
3470         {
3471 #ifdef RE_ENABLE_I18N
3472           if (dfa->mb_cur_max > 1)
3473             bitset_merge (accepts, dfa->sb_char);
3474           else
3475 #endif
3476             bitset_set_all (accepts);
3477           if (!(dfa->syntax & RE_DOT_NEWLINE))
3478             bitset_clear (accepts, '\n');
3479           if (dfa->syntax & RE_DOT_NOT_NULL)
3480             bitset_clear (accepts, '\0');
3481         }
3482 #ifdef RE_ENABLE_I18N
3483       else if (type == OP_UTF8_PERIOD)
3484         {
3485           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3486           if (!(dfa->syntax & RE_DOT_NEWLINE))
3487             bitset_clear (accepts, '\n');
3488           if (dfa->syntax & RE_DOT_NOT_NULL)
3489             bitset_clear (accepts, '\0');
3490         }
3491 #endif
3492       else
3493         continue;
3494
3495       /* Check the `accepts' and sift the characters which are not
3496          match it the context.  */
3497       if (constraint)
3498         {
3499           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3500             {
3501               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3502               bitset_empty (accepts);
3503               if (accepts_newline)
3504                 bitset_set (accepts, NEWLINE_CHAR);
3505               else
3506                 continue;
3507             }
3508           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3509             {
3510               bitset_empty (accepts);
3511               continue;
3512             }
3513
3514           if (constraint & NEXT_WORD_CONSTRAINT)
3515             {
3516               bitset_word_t any_set = 0;
3517               if (type == CHARACTER && !node->word_char)
3518                 {
3519                   bitset_empty (accepts);
3520                   continue;
3521                 }
3522 #ifdef RE_ENABLE_I18N
3523               if (dfa->mb_cur_max > 1)
3524                 for (j = 0; j < BITSET_WORDS; ++j)
3525                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3526               else
3527 #endif
3528                 for (j = 0; j < BITSET_WORDS; ++j)
3529                   any_set |= (accepts[j] &= dfa->word_char[j]);
3530               if (!any_set)
3531                 continue;
3532             }
3533           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3534             {
3535               bitset_word_t any_set = 0;
3536               if (type == CHARACTER && node->word_char)
3537                 {
3538                   bitset_empty (accepts);
3539                   continue;
3540                 }
3541 #ifdef RE_ENABLE_I18N
3542               if (dfa->mb_cur_max > 1)
3543                 for (j = 0; j < BITSET_WORDS; ++j)
3544                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3545               else
3546 #endif
3547                 for (j = 0; j < BITSET_WORDS; ++j)
3548                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3549               if (!any_set)
3550                 continue;
3551             }
3552         }
3553
3554       /* Then divide `accepts' into DFA states, or create a new
3555          state.  Above, we make sure that accepts is not empty.  */
3556       for (j = 0; j < ndests; ++j)
3557         {
3558           bitset_t intersec; /* Intersection sets, see below.  */
3559           bitset_t remains;
3560           /* Flags, see below.  */
3561           bitset_word_t has_intersec, not_subset, not_consumed;
3562
3563           /* Optimization, skip if this state doesn't accept the character.  */
3564           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3565             continue;
3566
3567           /* Enumerate the intersection set of this state and `accepts'.  */
3568           has_intersec = 0;
3569           for (k = 0; k < BITSET_WORDS; ++k)
3570             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3571           /* And skip if the intersection set is empty.  */
3572           if (!has_intersec)
3573             continue;
3574
3575           /* Then check if this state is a subset of `accepts'.  */
3576           not_subset = not_consumed = 0;
3577           for (k = 0; k < BITSET_WORDS; ++k)
3578             {
3579               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3580               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3581             }
3582
3583           /* If this state isn't a subset of `accepts', create a
3584              new group state, which has the `remains'. */
3585           if (not_subset)
3586             {
3587               bitset_copy (dests_ch[ndests], remains);
3588               bitset_copy (dests_ch[j], intersec);
3589               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3590               if (BE (err != REG_NOERROR, 0))
3591                 goto error_return;
3592               ++ndests;
3593             }
3594
3595           /* Put the position in the current group. */
3596           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3597           if (BE (result < 0, 0))
3598             goto error_return;
3599
3600           /* If all characters are consumed, go to next node. */
3601           if (!not_consumed)
3602             break;
3603         }
3604       /* Some characters remain, create a new group. */
3605       if (j == ndests)
3606         {
3607           bitset_copy (dests_ch[ndests], accepts);
3608           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3609           if (BE (err != REG_NOERROR, 0))
3610             goto error_return;
3611           ++ndests;
3612           bitset_empty (accepts);
3613         }
3614     }
3615   return ndests;
3616  error_return:
3617   for (j = 0; j < ndests; ++j)
3618     re_node_set_free (dests_node + j);
3619   return -1;
3620 }
3621
3622 #ifdef RE_ENABLE_I18N
3623 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3624    Return the number of the bytes the node accepts.
3625    STR_IDX is the current index of the input string.
3626
3627    This function handles the nodes which can accept one character, or
3628    one collating element like '.', '[a-z]', opposite to the other nodes
3629    can only accept one byte.  */
3630
3631 static int
3632 internal_function
3633 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3634                          const re_string_t *input, int str_idx)
3635 {
3636   const re_token_t *node = dfa->nodes + node_idx;
3637   int char_len, elem_len;
3638   int i;
3639
3640   if (BE (node->type == OP_UTF8_PERIOD, 0))
3641     {
3642       unsigned char c = re_string_byte_at (input, str_idx), d;
3643       if (BE (c < 0xc2, 1))
3644         return 0;
3645
3646       if (str_idx + 2 > input->len)
3647         return 0;
3648
3649       d = re_string_byte_at (input, str_idx + 1);
3650       if (c < 0xe0)
3651         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3652       else if (c < 0xf0)
3653         {
3654           char_len = 3;
3655           if (c == 0xe0 && d < 0xa0)
3656             return 0;
3657         }
3658       else if (c < 0xf8)
3659         {
3660           char_len = 4;
3661           if (c == 0xf0 && d < 0x90)
3662             return 0;
3663         }
3664       else if (c < 0xfc)
3665         {
3666           char_len = 5;
3667           if (c == 0xf8 && d < 0x88)
3668             return 0;
3669         }
3670       else if (c < 0xfe)
3671         {
3672           char_len = 6;
3673           if (c == 0xfc && d < 0x84)
3674             return 0;
3675         }
3676       else
3677         return 0;
3678
3679       if (str_idx + char_len > input->len)
3680         return 0;
3681
3682       for (i = 1; i < char_len; ++i)
3683         {
3684           d = re_string_byte_at (input, str_idx + i);
3685           if (d < 0x80 || d > 0xbf)
3686             return 0;
3687         }
3688       return char_len;
3689     }
3690
3691   char_len = re_string_char_size_at (input, str_idx);
3692   if (node->type == OP_PERIOD)
3693     {
3694       if (char_len <= 1)
3695         return 0;
3696       /* FIXME: I don't think this if is needed, as both '\n'
3697          and '\0' are char_len == 1.  */
3698       /* '.' accepts any one character except the following two cases.  */
3699       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3700            re_string_byte_at (input, str_idx) == '\n') ||
3701           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3702            re_string_byte_at (input, str_idx) == '\0'))
3703         return 0;
3704       return char_len;
3705     }
3706
3707   elem_len = re_string_elem_size_at (input, str_idx);
3708   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3709     return 0;
3710
3711   if (node->type == COMPLEX_BRACKET)
3712     {
3713       const re_charset_t *cset = node->opr.mbcset;
3714 # if 0
3715       const unsigned char *pin
3716         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3717       int j;
3718       uint32_t nrules;
3719 # endif
3720       int match_len = 0;
3721       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3722                     ? re_string_wchar_at (input, str_idx) : 0);
3723
3724       /* match with multibyte character?  */
3725       for (i = 0; i < cset->nmbchars; ++i)
3726         if (wc == cset->mbchars[i])
3727           {
3728             match_len = char_len;
3729             goto check_node_accept_bytes_match;
3730           }
3731       /* match with character_class?  */
3732       for (i = 0; i < cset->nchar_classes; ++i)
3733         {
3734           wctype_t wt = cset->char_classes[i];
3735           if (__iswctype (wc, wt))
3736             {
3737               match_len = char_len;
3738               goto check_node_accept_bytes_match;
3739             }
3740         }
3741
3742 # if 0
3743       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3744       if (nrules != 0)
3745         {
3746           unsigned int in_collseq = 0;
3747           const int32_t *table, *indirect;
3748           const unsigned char *weights, *extra;
3749           const char *collseqwc;
3750           int32_t idx;
3751           /* This #include defines a local function!  */
3752 #  include <locale/weight.h>
3753
3754           /* match with collating_symbol?  */
3755           if (cset->ncoll_syms)
3756             extra = (const unsigned char *)
3757               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3758           for (i = 0; i < cset->ncoll_syms; ++i)
3759             {
3760               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3761               /* Compare the length of input collating element and
3762                  the length of current collating element.  */
3763               if (*coll_sym != elem_len)
3764                 continue;
3765               /* Compare each bytes.  */
3766               for (j = 0; j < *coll_sym; j++)
3767                 if (pin[j] != coll_sym[1 + j])
3768                   break;
3769               if (j == *coll_sym)
3770                 {
3771                   /* Match if every bytes is equal.  */
3772                   match_len = j;
3773                   goto check_node_accept_bytes_match;
3774                 }
3775             }
3776
3777           if (cset->nranges)
3778             {
3779               if (elem_len <= char_len)
3780                 {
3781                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3782                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3783                 }
3784               else
3785                 in_collseq = find_collation_sequence_value (pin, elem_len);
3786             }
3787           /* match with range expression?  */
3788           for (i = 0; i < cset->nranges; ++i)
3789             if (cset->range_starts[i] <= in_collseq
3790                 && in_collseq <= cset->range_ends[i])
3791               {
3792                 match_len = elem_len;
3793                 goto check_node_accept_bytes_match;
3794               }
3795
3796           /* match with equivalence_class?  */
3797           if (cset->nequiv_classes)
3798             {
3799               const unsigned char *cp = pin;
3800               table = (const int32_t *)
3801                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3802               weights = (const unsigned char *)
3803                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3804               extra = (const unsigned char *)
3805                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3806               indirect = (const int32_t *)
3807                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3808               idx = findidx (&cp);
3809               if (idx > 0)
3810                 for (i = 0; i < cset->nequiv_classes; ++i)
3811                   {
3812                     int32_t equiv_class_idx = cset->equiv_classes[i];
3813                     size_t weight_len = weights[idx];
3814                     if (weight_len == weights[equiv_class_idx])
3815                       {
3816                         int cnt = 0;
3817                         while (cnt <= weight_len
3818                                && (weights[equiv_class_idx + 1 + cnt]
3819                                    == weights[idx + 1 + cnt]))
3820                           ++cnt;
3821                         if (cnt > weight_len)
3822                           {
3823                             match_len = elem_len;
3824                             goto check_node_accept_bytes_match;
3825                           }
3826                       }
3827                   }
3828             }
3829         }
3830       else
3831 # endif
3832         {
3833           /* match with range expression?  */
3834           wchar_t cmp_buf[6];
3835
3836           memset (cmp_buf, 0, sizeof(cmp_buf));
3837           cmp_buf[2] = wc;
3838           for (i = 0; i < cset->nranges; ++i)
3839             {
3840               cmp_buf[0] = cset->range_starts[i];
3841               cmp_buf[4] = cset->range_ends[i];
3842               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3843                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3844                 {
3845                   match_len = char_len;
3846                   goto check_node_accept_bytes_match;
3847                 }
3848             }
3849         }
3850
3851  check_node_accept_bytes_match:
3852       if (!cset->non_match)
3853         return match_len;
3854       if (match_len > 0)
3855         return 0;
3856       return (elem_len > char_len) ? elem_len : char_len;
3857     }
3858   return 0;
3859 }
3860
3861 # if 0
3862 static unsigned int
3863 internal_function
3864 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3865 {
3866   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3867   if (nrules == 0)
3868     {
3869       if (mbs_len == 1)
3870         {
3871           /* No valid character.  Match it as a single byte character.  */
3872           const unsigned char *collseq = (const unsigned char *)
3873             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3874           return collseq[mbs[0]];
3875         }
3876       return UINT_MAX;
3877     }
3878   else
3879     {
3880       int32_t idx;
3881       const unsigned char *extra = (const unsigned char *)
3882         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3883       int32_t extrasize = (const unsigned char *)
3884         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3885
3886       for (idx = 0; idx < extrasize;)
3887         {
3888           int mbs_cnt, found = 0;
3889           int32_t elem_mbs_len;
3890           /* Skip the name of collating element name.  */
3891           idx = idx + extra[idx] + 1;
3892           elem_mbs_len = extra[idx++];
3893           if (mbs_len == elem_mbs_len)
3894             {
3895               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3896                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3897                   break;
3898               if (mbs_cnt == elem_mbs_len)
3899                 /* Found the entry.  */
3900                 found = 1;
3901             }
3902           /* Skip the byte sequence of the collating element.  */
3903           idx += elem_mbs_len;
3904           /* Adjust for the alignment.  */
3905           idx = (idx + 3) & ~3;
3906           /* Skip the collation sequence value.  */
3907           idx += sizeof (uint32_t);
3908           /* Skip the wide char sequence of the collating element.  */
3909           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3910           /* If we found the entry, return the sequence value.  */
3911           if (found)
3912             return *(uint32_t *) (extra + idx);
3913           /* Skip the collation sequence value.  */
3914           idx += sizeof (uint32_t);
3915         }
3916       return UINT_MAX;
3917     }
3918 }
3919 # endif
3920 #endif /* RE_ENABLE_I18N */
3921
3922 /* Check whether the node accepts the byte which is IDX-th
3923    byte of the INPUT.  */
3924
3925 static int
3926 internal_function
3927 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3928                    int idx)
3929 {
3930   unsigned char ch;
3931   ch = re_string_byte_at (&mctx->input, idx);
3932   switch (node->type)
3933     {
3934     case CHARACTER:
3935       if (node->opr.c != ch)
3936         return 0;
3937       break;
3938
3939     case SIMPLE_BRACKET:
3940       if (!bitset_contain (node->opr.sbcset, ch))
3941         return 0;
3942       break;
3943
3944 #ifdef RE_ENABLE_I18N
3945     case OP_UTF8_PERIOD:
3946       if (ch >= 0x80)
3947         return 0;
3948       /* FALLTHROUGH */
3949 #endif
3950     case OP_PERIOD:
3951       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
3952           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
3953         return 0;
3954       break;
3955
3956     default:
3957       return 0;
3958     }
3959
3960   if (node->constraint)
3961     {
3962       /* The node has constraints.  Check whether the current context
3963          satisfies the constraints.  */
3964       unsigned int context = re_string_context_at (&mctx->input, idx,
3965                                                    mctx->eflags);
3966       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3967         return 0;
3968     }
3969
3970   return 1;
3971 }
3972
3973 /* Extend the buffers, if the buffers have run out.  */
3974
3975 static reg_errcode_t
3976 internal_function
3977 extend_buffers (re_match_context_t *mctx)
3978 {
3979   reg_errcode_t ret;
3980   re_string_t *pstr = &mctx->input;
3981
3982   /* Double the lengthes of the buffers.  */
3983   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3984   if (BE (ret != REG_NOERROR, 0))
3985     return ret;
3986
3987   if (mctx->state_log != NULL)
3988     {
3989       /* And double the length of state_log.  */
3990       /* XXX We have no indication of the size of this buffer.  If this
3991          allocation fail we have no indication that the state_log array
3992          does not have the right size.  */
3993       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3994                                               pstr->bufs_len + 1);
3995       if (BE (new_array == NULL, 0))
3996         return REG_ESPACE;
3997       mctx->state_log = new_array;
3998     }
3999
4000   /* Then reconstruct the buffers.  */
4001   if (pstr->icase)
4002     {
4003 #ifdef RE_ENABLE_I18N
4004       if (pstr->mb_cur_max > 1)
4005         {
4006           ret = build_wcs_upper_buffer (pstr);
4007           if (BE (ret != REG_NOERROR, 0))
4008             return ret;
4009         }
4010       else
4011 #endif /* RE_ENABLE_I18N  */
4012         build_upper_buffer (pstr);
4013     }
4014   else
4015     {
4016 #ifdef RE_ENABLE_I18N
4017       if (pstr->mb_cur_max > 1)
4018         build_wcs_buffer (pstr);
4019       else
4020 #endif /* RE_ENABLE_I18N  */
4021         {
4022           if (pstr->trans != NULL)
4023             re_string_translate_buffer (pstr);
4024         }
4025     }
4026   return REG_NOERROR;
4027 }
4028
4029
4030 /* Functions for matching context.  */
4031
4032 /* Initialize MCTX.  */
4033
4034 static reg_errcode_t
4035 internal_function
4036 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4037 {
4038   mctx->eflags = eflags;
4039   mctx->match_last = -1;
4040   if (n > 0)
4041     {
4042       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4043       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4044       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4045         return REG_ESPACE;
4046     }
4047   /* Already zero-ed by the caller.
4048      else
4049        mctx->bkref_ents = NULL;
4050      mctx->nbkref_ents = 0;
4051      mctx->nsub_tops = 0;  */
4052   mctx->abkref_ents = n;
4053   mctx->max_mb_elem_len = 1;
4054   mctx->asub_tops = n;
4055   return REG_NOERROR;
4056 }
4057
4058 /* Clean the entries which depend on the current input in MCTX.
4059    This function must be invoked when the matcher changes the start index
4060    of the input, or changes the input string.  */
4061
4062 static void
4063 internal_function
4064 match_ctx_clean (re_match_context_t *mctx)
4065 {
4066   int st_idx;
4067   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4068     {
4069       int sl_idx;
4070       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4071       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4072         {
4073           re_sub_match_last_t *last = top->lasts[sl_idx];
4074           re_free (last->path.array);
4075           re_free (last);
4076         }
4077       re_free (top->lasts);
4078       if (top->path)
4079         {
4080           re_free (top->path->array);
4081           re_free (top->path);
4082         }
4083       free (top);
4084     }
4085
4086   mctx->nsub_tops = 0;
4087   mctx->nbkref_ents = 0;
4088 }
4089
4090 /* Free all the memory associated with MCTX.  */
4091
4092 static void
4093 internal_function
4094 match_ctx_free (re_match_context_t *mctx)
4095 {
4096   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4097   match_ctx_clean (mctx);
4098   re_free (mctx->sub_tops);
4099   re_free (mctx->bkref_ents);
4100 }
4101
4102 /* Add a new backreference entry to MCTX.
4103    Note that we assume that caller never call this function with duplicate
4104    entry, and call with STR_IDX which isn't smaller than any existing entry.
4105 */
4106
4107 static reg_errcode_t
4108 internal_function
4109 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4110                      int to)
4111 {
4112   if (mctx->nbkref_ents >= mctx->abkref_ents)
4113     {
4114       struct re_backref_cache_entry* new_entry;
4115       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4116                               mctx->abkref_ents * 2);
4117       if (BE (new_entry == NULL, 0))
4118         {
4119           re_free (mctx->bkref_ents);
4120           return REG_ESPACE;
4121         }
4122       mctx->bkref_ents = new_entry;
4123       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4124               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4125       mctx->abkref_ents *= 2;
4126     }
4127   if (mctx->nbkref_ents > 0
4128       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4129     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4130
4131   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4132   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4133   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4134   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4135
4136   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4137      If bit N is clear, means that this entry won't epsilon-transition to
4138      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4139      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4140      such node.
4141
4142      A backreference does not epsilon-transition unless it is empty, so set
4143      to all zeros if FROM != TO.  */
4144   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4145     = (from == to ? ~0 : 0);
4146
4147   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4148   if (mctx->max_mb_elem_len < to - from)
4149     mctx->max_mb_elem_len = to - from;
4150   return REG_NOERROR;
4151 }
4152
4153 /* Search for the first entry which has the same str_idx, or -1 if none is
4154    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4155
4156 static int
4157 internal_function
4158 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4159 {
4160   int left, right, mid, last;
4161   last = right = mctx->nbkref_ents;
4162   for (left = 0; left < right;)
4163     {
4164       mid = (left + right) / 2;
4165       if (mctx->bkref_ents[mid].str_idx < str_idx)
4166         left = mid + 1;
4167       else
4168         right = mid;
4169     }
4170   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4171     return left;
4172   else
4173     return -1;
4174 }
4175
4176 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4177    at STR_IDX.  */
4178
4179 static reg_errcode_t
4180 internal_function
4181 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4182 {
4183 #ifdef DEBUG
4184   assert (mctx->sub_tops != NULL);
4185   assert (mctx->asub_tops > 0);
4186 #endif
4187   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4188     {
4189       int new_asub_tops = mctx->asub_tops * 2;
4190       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4191                                                    re_sub_match_top_t *,
4192                                                    new_asub_tops);
4193       if (BE (new_array == NULL, 0))
4194         return REG_ESPACE;
4195       mctx->sub_tops = new_array;
4196       mctx->asub_tops = new_asub_tops;
4197     }
4198   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4199   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4200     return REG_ESPACE;
4201   mctx->sub_tops[mctx->nsub_tops]->node = node;
4202   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4203   return REG_NOERROR;
4204 }
4205
4206 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4207    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4208
4209 static re_sub_match_last_t *
4210 internal_function
4211 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4212 {
4213   re_sub_match_last_t *new_entry;
4214   if (BE (subtop->nlasts == subtop->alasts, 0))
4215     {
4216       int new_alasts = 2 * subtop->alasts + 1;
4217       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4218                                                     re_sub_match_last_t *,
4219                                                     new_alasts);
4220       if (BE (new_array == NULL, 0))
4221         return NULL;
4222       subtop->lasts = new_array;
4223       subtop->alasts = new_alasts;
4224     }
4225   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4226   if (BE (new_entry != NULL, 1))
4227     {
4228       subtop->lasts[subtop->nlasts] = new_entry;
4229       new_entry->node = node;
4230       new_entry->str_idx = str_idx;
4231       ++subtop->nlasts;
4232     }
4233   return new_entry;
4234 }
4235
4236 static void
4237 internal_function
4238 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4239                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4240 {
4241   sctx->sifted_states = sifted_sts;
4242   sctx->limited_states = limited_sts;
4243   sctx->last_node = last_node;
4244   sctx->last_str_idx = last_str_idx;
4245   re_node_set_init_empty (&sctx->limits);
4246 }