libc/misc/regex/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static void re_string_construct_common (const char *str, int len,
  21                                         re_string_t *pstr,
  22                                         __RE_TRANSLATE_TYPE trans, int icase,
  23                                         const re_dfa_t *dfa) internal_function;
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           unsigned int hash) internal_function;
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           unsigned int hash) internal_function;
  31
  32 /* Functions for string operation.  */
  33
  34 /* This function allocate the buffers.  It is necessary to call
  35    re_string_reconstruct before using the object.  */
  36
  37 static reg_errcode_t
  38 internal_function
  39 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  40                     __RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  41 {
  42   reg_errcode_t ret;
  43   int init_buf_len;
  44
  45   /* Ensure at least one character fits into the buffers.  */
  46   if (init_len < dfa->mb_cur_max)
  47     init_len = dfa->mb_cur_max;
  48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  50
  51   ret = re_string_realloc_buffers (pstr, init_buf_len);
  52   if (BE (ret != REG_NOERROR, 0))
  53     return ret;
  54
  55   pstr->word_char = dfa->word_char;
  56   pstr->word_ops_used = dfa->word_ops_used;
  57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  59   pstr->valid_raw_len = pstr->valid_len;
  60   return REG_NOERROR;
  61 }
  62
  63 /* This function allocate the buffers, and initialize them.  */
  64
  65 static reg_errcode_t
  66 internal_function
  67 re_string_construct (re_string_t *pstr, const char *str, int len,
  68                      __RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  69 {
  70   reg_errcode_t ret;
  71   memset (pstr, '\0', sizeof (re_string_t));
  72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  73
  74   if (len > 0)
  75     {
  76       ret = re_string_realloc_buffers (pstr, len + 1);
  77       if (BE (ret != REG_NOERROR, 0))
  78         return ret;
  79     }
  80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  81
  82   if (icase)
  83     {
  84 #ifdef RE_ENABLE_I18N
  85       if (dfa->mb_cur_max > 1)
  86         {
  87           while (1)
  88             {
  89               ret = build_wcs_upper_buffer (pstr);
  90               if (BE (ret != REG_NOERROR, 0))
  91                 return ret;
  92               if (pstr->valid_raw_len >= len)
  93                 break;
  94               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  95                 break;
  96               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  97               if (BE (ret != REG_NOERROR, 0))
  98                 return ret;
  99             }
 100         }
 101       else
 102 #endif /* RE_ENABLE_I18N  */
 103         build_upper_buffer (pstr);
 104     }
 105   else
 106     {
 107 #ifdef RE_ENABLE_I18N
 108       if (dfa->mb_cur_max > 1)
 109         build_wcs_buffer (pstr);
 110       else
 111 #endif
 112         {
 113           if (trans != NULL)
 114             re_string_translate_buffer (pstr);
 115           else
 116             {
 117               pstr->valid_len = pstr->bufs_len;
 118               pstr->valid_raw_len = pstr->bufs_len;
 119             }
 120         }
 121     }
 122
 123   return REG_NOERROR;
 124 }
 125
 126 /* Helper functions for re_string_allocate, and re_string_construct.  */
 127
 128 static reg_errcode_t
 129 internal_function
 130 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 136       if (BE (new_wcs == NULL, 0))
 137         return REG_ESPACE;
 138       pstr->wcs = new_wcs;
 139       if (pstr->offsets != NULL)
 140         {
 141           int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 142           if (BE (new_offsets == NULL, 0))
 143             return REG_ESPACE;
 144           pstr->offsets = new_offsets;
 145         }
 146     }
 147 #endif /* RE_ENABLE_I18N  */
 148   if (pstr->mbs_allocated)
 149     {
 150       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 151                                            new_buf_len);
 152       if (BE (new_mbs == NULL, 0))
 153         return REG_ESPACE;
 154       pstr->mbs = new_mbs;
 155     }
 156   pstr->bufs_len = new_buf_len;
 157   return REG_NOERROR;
 158 }
 159
 160
 161 static void
 162 internal_function
 163 re_string_construct_common (const char *str, int len, re_string_t *pstr,
 164                             __RE_TRANSLATE_TYPE trans, int icase,
 165                             const re_dfa_t *dfa)
 166 {
 167   pstr->raw_mbs = (const unsigned char *) str;
 168   pstr->len = len;
 169   pstr->raw_len = len;
 170   pstr->trans = trans;
 171   pstr->icase = icase ? 1 : 0;
 172   pstr->mbs_allocated = (trans != NULL || icase);
 173   pstr->mb_cur_max = dfa->mb_cur_max;
 174   pstr->is_utf8 = dfa->is_utf8;
 175   pstr->map_notascii = dfa->map_notascii;
 176   pstr->stop = pstr->len;
 177   pstr->raw_stop = pstr->stop;
 178 }
 179
 180 #ifdef RE_ENABLE_I18N
 181
 182 /* Build wide character buffer PSTR->WCS.
 183    If the byte sequence of the string are:
 184      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 185    Then wide character buffer will be:
 186      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 187    We use WEOF for padding, they indicate that the position isn't
 188    a first byte of a multibyte character.
 189
 190    Note that this function assumes PSTR->VALID_LEN elements are already
 191    built and starts from PSTR->VALID_LEN.  */
 192
 193 static void
 194 internal_function
 195 build_wcs_buffer (re_string_t *pstr)
 196 {
 197 #if defined __UCLIBC__
 198   unsigned char buf[MB_LEN_MAX];
 199   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 200 #else
 201   unsigned char buf[64];
 202 #endif
 203   mbstate_t prev_st;
 204   int byte_idx, end_idx, remain_len;
 205   size_t mbclen;
 206
 207   /* Build the buffers from pstr->valid_len to either pstr->len or
 208      pstr->bufs_len.  */
 209   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 210   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 211     {
 212       wchar_t wc;
 213       const char *p;
 214
 215       remain_len = end_idx - byte_idx;
 216       prev_st = pstr->cur_state;
 217       /* Apply the translation if we need.  */
 218       if (BE (pstr->trans != NULL, 0))
 219         {
 220           int i, ch;
 221
 222           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 223             {
 224               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 225               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 226             }
 227           p = (const char *) buf;
 228         }
 229       else
 230         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 231       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 232       if (BE (mbclen == (size_t) -2, 0))
 233         {
 234           /* The buffer doesn't have enough space, finish to build.  */
 235           pstr->cur_state = prev_st;
 236           break;
 237         }
 238       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 239         {
 240           /* We treat these cases as a singlebyte character.  */
 241           mbclen = 1;
 242           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 243           if (BE (pstr->trans != NULL, 0))
 244             wc = pstr->trans[wc];
 245           pstr->cur_state = prev_st;
 246         }
 247
 248       /* Write wide character and padding.  */
 249       pstr->wcs[byte_idx++] = wc;
 250       /* Write paddings.  */
 251       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 252         pstr->wcs[byte_idx++] = WEOF;
 253     }
 254   pstr->valid_len = byte_idx;
 255   pstr->valid_raw_len = byte_idx;
 256 }
 257
 258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 259    but for REG_ICASE.  */
 260
 261 static reg_errcode_t
 262 internal_function
 263 build_wcs_upper_buffer (re_string_t *pstr)
 264 {
 265   mbstate_t prev_st;
 266   int src_idx, byte_idx, end_idx, remain_len;
 267   size_t mbclen;
 268 #if defined __UCLIBC__
 269   char buf[MB_LEN_MAX];
 270   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 271 #else
 272   char buf[64];
 273 #endif
 274
 275   byte_idx = pstr->valid_len;
 276   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 277
 278   /* The following optimization assumes that ASCII characters can be
 279      mapped to wide characters with a simple cast.  */
 280   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 281     {
 282       while (byte_idx < end_idx)
 283         {
 284           wchar_t wc;
 285
 286           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 287               && mbsinit (&pstr->cur_state))
 288             {
 289               /* In case of a singlebyte character.  */
 290               pstr->mbs[byte_idx]
 291                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 292               /* The next step uses the assumption that wchar_t is encoded
 293                  ASCII-safe: all ASCII values can be converted like this.  */
 294               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 295               ++byte_idx;
 296               continue;
 297             }
 298
 299           remain_len = end_idx - byte_idx;
 300           prev_st = pstr->cur_state;
 301           mbclen = mbrtowc (&wc,
 302                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 303                              + byte_idx), remain_len, &pstr->cur_state);
 304           if (BE (mbclen + 2 > 2, 1))
 305             {
 306               wchar_t wcu = wc;
 307               if (iswlower (wc))
 308                 {
 309                   size_t mbcdlen;
 310
 311                   wcu = towupper (wc);
 312                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 313                   if (BE (mbclen == mbcdlen, 1))
 314                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 315                   else
 316                     {
 317                       src_idx = byte_idx;
 318                       goto offsets_needed;
 319                     }
 320                 }
 321               else
 322                 memcpy (pstr->mbs + byte_idx,
 323                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 324               pstr->wcs[byte_idx++] = wcu;
 325               /* Write paddings.  */
 326               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 327                 pstr->wcs[byte_idx++] = WEOF;
 328             }
 329           else if (mbclen == (size_t) -1 || mbclen == 0)
 330             {
 331               /* It is an invalid character or '\0'.  Just use the byte.  */
 332               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 333               pstr->mbs[byte_idx] = ch;
 334               /* And also cast it to wide char.  */
 335               pstr->wcs[byte_idx++] = (wchar_t) ch;
 336               if (BE (mbclen == (size_t) -1, 0))
 337                 pstr->cur_state = prev_st;
 338             }
 339           else
 340             {
 341               /* The buffer doesn't have enough space, finish to build.  */
 342               pstr->cur_state = prev_st;
 343               break;
 344             }
 345         }
 346       pstr->valid_len = byte_idx;
 347       pstr->valid_raw_len = byte_idx;
 348       return REG_NOERROR;
 349     }
 350   else
 351     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 352       {
 353         wchar_t wc;
 354         const char *p;
 355       offsets_needed:
 356         remain_len = end_idx - byte_idx;
 357         prev_st = pstr->cur_state;
 358         if (BE (pstr->trans != NULL, 0))
 359           {
 360             int i, ch;
 361
 362             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 363               {
 364                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 365                 buf[i] = pstr->trans[ch];
 366               }
 367             p = (const char *) buf;
 368           }
 369         else
 370           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 371         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 372         if (BE (mbclen + 2 > 2, 1))
 373           {
 374             wchar_t wcu = wc;
 375             if (iswlower (wc))
 376               {
 377                 size_t mbcdlen;
 378
 379                 wcu = towupper (wc);
 380                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 381                 if (BE (mbclen == mbcdlen, 1))
 382                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 383                 else if (mbcdlen != (size_t) -1)
 384                   {
 385                     size_t i;
 386
 387                     if (byte_idx + mbcdlen > pstr->bufs_len)
 388                       {
 389                         pstr->cur_state = prev_st;
 390                         break;
 391                       }
 392
 393                     if (pstr->offsets == NULL)
 394                       {
 395                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 396
 397                         if (pstr->offsets == NULL)
 398                           return REG_ESPACE;
 399                       }
 400                     if (!pstr->offsets_needed)
 401                       {
 402                         for (i = 0; i < (size_t) byte_idx; ++i)
 403                           pstr->offsets[i] = i;
 404                         pstr->offsets_needed = 1;
 405                       }
 406
 407                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 408                     pstr->wcs[byte_idx] = wcu;
 409                     pstr->offsets[byte_idx] = src_idx;
 410                     for (i = 1; i < mbcdlen; ++i)
 411                       {
 412                         pstr->offsets[byte_idx + i]
 413                           = src_idx + (i < mbclen ? i : mbclen - 1);
 414                         pstr->wcs[byte_idx + i] = WEOF;
 415                       }
 416                     pstr->len += mbcdlen - mbclen;
 417                     if (pstr->raw_stop > src_idx)
 418                       pstr->stop += mbcdlen - mbclen;
 419                     end_idx = (pstr->bufs_len > pstr->len)
 420                               ? pstr->len : pstr->bufs_len;
 421                     byte_idx += mbcdlen;
 422                     src_idx += mbclen;
 423                     continue;
 424                   }
 425                 else
 426                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 427               }
 428             else
 429               memcpy (pstr->mbs + byte_idx, p, mbclen);
 430
 431             if (BE (pstr->offsets_needed != 0, 0))
 432               {
 433                 size_t i;
 434                 for (i = 0; i < mbclen; ++i)
 435                   pstr->offsets[byte_idx + i] = src_idx + i;
 436               }
 437             src_idx += mbclen;
 438
 439             pstr->wcs[byte_idx++] = wcu;
 440             /* Write paddings.  */
 441             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 442               pstr->wcs[byte_idx++] = WEOF;
 443           }
 444         else if (mbclen == (size_t) -1 || mbclen == 0)
 445           {
 446             /* It is an invalid character or '\0'.  Just use the byte.  */
 447             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 448
 449             if (BE (pstr->trans != NULL, 0))
 450               ch = pstr->trans [ch];
 451             pstr->mbs[byte_idx] = ch;
 452
 453             if (BE (pstr->offsets_needed != 0, 0))
 454               pstr->offsets[byte_idx] = src_idx;
 455             ++src_idx;
 456
 457             /* And also cast it to wide char.  */
 458             pstr->wcs[byte_idx++] = (wchar_t) ch;
 459             if (BE (mbclen == (size_t) -1, 0))
 460               pstr->cur_state = prev_st;
 461           }
 462         else
 463           {
 464             /* The buffer doesn't have enough space, finish to build.  */
 465             pstr->cur_state = prev_st;
 466             break;
 467           }
 468       }
 469   pstr->valid_len = byte_idx;
 470   pstr->valid_raw_len = src_idx;
 471   return REG_NOERROR;
 472 }
 473
 474 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 475    Return the index.  */
 476
 477 static int
 478 internal_function
 479 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 480 {
 481   mbstate_t prev_st;
 482   int rawbuf_idx;
 483   size_t mbclen;
 484   wchar_t wc = 0;
 485
 486   /* Skip the characters which are not necessary to check.  */
 487   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 488        rawbuf_idx < new_raw_idx;)
 489     {
 490       int remain_len;
 491       remain_len = pstr->len - rawbuf_idx;
 492       prev_st = pstr->cur_state;
 493       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 494                         remain_len, &pstr->cur_state);
 495       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 496         {
 497           /* We treat these cases as a singlebyte character.  */
 498           mbclen = 1;
 499           pstr->cur_state = prev_st;
 500         }
 501       /* Then proceed the next character.  */
 502       rawbuf_idx += mbclen;
 503     }
 504   *last_wc = (wint_t) wc;
 505   return rawbuf_idx;
 506 }
 507 #endif /* RE_ENABLE_I18N  */
 508
 509 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 510    This function is used in case of REG_ICASE.  */
 511
 512 static void
 513 internal_function
 514 build_upper_buffer (re_string_t *pstr)
 515 {
 516   int char_idx, end_idx;
 517   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 518
 519   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 520     {
 521       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 522       if (BE (pstr->trans != NULL, 0))
 523         ch = pstr->trans[ch];
 524       if (islower (ch))
 525         pstr->mbs[char_idx] = toupper (ch);
 526       else
 527         pstr->mbs[char_idx] = ch;
 528     }
 529   pstr->valid_len = char_idx;
 530   pstr->valid_raw_len = char_idx;
 531 }
 532
 533 /* Apply TRANS to the buffer in PSTR.  */
 534
 535 static void
 536 internal_function
 537 re_string_translate_buffer (re_string_t *pstr)
 538 {
 539   int buf_idx, end_idx;
 540   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 541
 542   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 543     {
 544       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 545       pstr->mbs[buf_idx] = pstr->trans[ch];
 546     }
 547
 548   pstr->valid_len = buf_idx;
 549   pstr->valid_raw_len = buf_idx;
 550 }
 551
 552 /* This function re-construct the buffers.
 553    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 554    convert to upper case in case of REG_ICASE, apply translation.  */
 555
 556 static reg_errcode_t
 557 internal_function
 558 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 559 {
 560   int offset = idx - pstr->raw_mbs_idx;
 561   if (BE (offset < 0, 0))
 562     {
 563       /* Reset buffer.  */
 564 #ifdef RE_ENABLE_I18N
 565       if (pstr->mb_cur_max > 1)
 566         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 567 #endif
 568       pstr->len = pstr->raw_len;
 569       pstr->stop = pstr->raw_stop;
 570       pstr->valid_len = 0;
 571       pstr->raw_mbs_idx = 0;
 572       pstr->valid_raw_len = 0;
 573       pstr->offsets_needed = 0;
 574       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 575                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 576       if (!pstr->mbs_allocated)
 577         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 578       offset = idx;
 579     }
 580
 581   if (BE (offset != 0, 1))
 582     {
 583       /* Are the characters which are already checked remain?  */
 584       if (BE (offset < pstr->valid_raw_len, 1)
 585 #ifdef RE_ENABLE_I18N
 586           /* Handling this would enlarge the code too much.
 587              Accept a slowdown in that case.  */
 588           && pstr->offsets_needed == 0
 589 #endif
 590          )
 591         {
 592           /* Yes, move them to the front of the buffer.  */
 593           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 594 #ifdef RE_ENABLE_I18N
 595           if (pstr->mb_cur_max > 1)
 596             memmove (pstr->wcs, pstr->wcs + offset,
 597                      (pstr->valid_len - offset) * sizeof (wint_t));
 598 #endif
 599           if (BE (pstr->mbs_allocated, 0))
 600             memmove (pstr->mbs, pstr->mbs + offset,
 601                      pstr->valid_len - offset);
 602           pstr->valid_len -= offset;
 603           pstr->valid_raw_len -= offset;
 604 #ifdef DEBUG
 605           assert (pstr->valid_len > 0);
 606 #endif
 607         }
 608       else
 609         {
 610           /* No, skip all characters until IDX.  */
 611 #ifdef RE_ENABLE_I18N
 612           if (BE (pstr->offsets_needed, 0))
 613             {
 614               pstr->len = pstr->raw_len - idx + offset;
 615               pstr->stop = pstr->raw_stop - idx + offset;
 616               pstr->offsets_needed = 0;
 617             }
 618 #endif
 619           pstr->valid_len = 0;
 620           pstr->valid_raw_len = 0;
 621 #ifdef RE_ENABLE_I18N
 622           if (pstr->mb_cur_max > 1)
 623             {
 624               int wcs_idx;
 625               wint_t wc = WEOF;
 626
 627               if (pstr->is_utf8)
 628                 {
 629                   const unsigned char *raw, *p, *end;
 630
 631                   /* Special case UTF-8.  Multi-byte chars start with any
 632                      byte other than 0x80 - 0xbf.  */
 633                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 634                   end = raw + (offset - pstr->mb_cur_max);
 635                   p = raw + offset - 1;
 636 #if 0
 637                   /* We know the wchar_t encoding is UCS4, so for the simple
 638                      case, ASCII characters, skip the conversion step.  */
 639                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 640                     {
 641                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 642                       pstr->valid_len = 0;
 643                       wc = (wchar_t) *p;
 644                     }
 645                   else
 646 #endif
 647                     for (; p >= end; --p)
 648                       if ((*p & 0xc0) != 0x80)
 649                         {
 650                           mbstate_t cur_state;
 651                           wchar_t wc2;
 652                           int mlen = raw + pstr->len - p;
 653                           unsigned char buf[6];
 654                           size_t mbclen;
 655
 656                           if (BE (pstr->trans != NULL, 0))
 657                             {
 658                               int i = mlen < 6 ? mlen : 6;
 659                               while (--i >= 0)
 660                                 buf[i] = pstr->trans[p[i]];
 661                             }
 662                           /* XXX Don't use mbrtowc, we know which conversion
 663                              to use (UTF-8 -> UCS4).  */
 664                           memset (&cur_state, 0, sizeof (cur_state));
 665                           mbclen = mbrtowc (&wc2, (const char *) p, mlen,
 666                                             &cur_state);
 667                           if (raw + offset - p <= mbclen
 668                               && mbclen < (size_t) -2)
 669                             {
 670                               memset (&pstr->cur_state, '\0',
 671                                       sizeof (mbstate_t));
 672                               pstr->valid_len = mbclen - (raw + offset - p);
 673                               wc = wc2;
 674                             }
 675                           break;
 676                         }
 677                 }
 678
 679               if (wc == WEOF)
 680                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 681               if (BE (pstr->valid_len, 0))
 682                 {
 683                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 684                     pstr->wcs[wcs_idx] = WEOF;
 685                   if (pstr->mbs_allocated)
 686                     memset (pstr->mbs, 255, pstr->valid_len);
 687                 }
 688               pstr->valid_raw_len = pstr->valid_len;
 689               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 690                                     && IS_WIDE_WORD_CHAR (wc))
 691                                    ? CONTEXT_WORD
 692                                    : ((IS_WIDE_NEWLINE (wc)
 693                                        && pstr->newline_anchor)
 694                                       ? CONTEXT_NEWLINE : 0));
 695             }
 696           else
 697 #endif /* RE_ENABLE_I18N */
 698             {
 699               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 700               if (pstr->trans)
 701                 c = pstr->trans[c];
 702               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 703                                    ? CONTEXT_WORD
 704                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 705                                       ? CONTEXT_NEWLINE : 0));
 706             }
 707         }
 708       if (!BE (pstr->mbs_allocated, 0))
 709         pstr->mbs += offset;
 710     }
 711   pstr->raw_mbs_idx = idx;
 712   pstr->len -= offset;
 713   pstr->stop -= offset;
 714
 715   /* Then build the buffers.  */
 716 #ifdef RE_ENABLE_I18N
 717   if (pstr->mb_cur_max > 1)
 718     {
 719       if (pstr->icase)
 720         {
 721           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 722           if (BE (ret != REG_NOERROR, 0))
 723             return ret;
 724         }
 725       else
 726         build_wcs_buffer (pstr);
 727     }
 728   else
 729 #endif
 730     if (BE (pstr->mbs_allocated, 0))
 731       {
 732         if (pstr->icase)
 733           build_upper_buffer (pstr);
 734         else if (pstr->trans != NULL)
 735           re_string_translate_buffer (pstr);
 736       }
 737     else
 738       pstr->valid_len = pstr->len;
 739
 740   pstr->cur_idx = 0;
 741   return REG_NOERROR;
 742 }
 743
 744 static unsigned char
 745 internal_function __attribute ((pure))
 746 re_string_peek_byte_case (const re_string_t *pstr, int idx)
 747 {
 748   int ch, off;
 749
 750   /* Handle the common (easiest) cases first.  */
 751   if (BE (!pstr->mbs_allocated, 1))
 752     return re_string_peek_byte (pstr, idx);
 753
 754 #ifdef RE_ENABLE_I18N
 755   if (pstr->mb_cur_max > 1
 756       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 757     return re_string_peek_byte (pstr, idx);
 758 #endif
 759
 760   off = pstr->cur_idx + idx;
 761 #ifdef RE_ENABLE_I18N
 762   if (pstr->offsets_needed)
 763     off = pstr->offsets[off];
 764 #endif
 765
 766   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 767
 768 #ifdef RE_ENABLE_I18N
 769   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 770      this function returns CAPITAL LETTER I instead of first byte of
 771      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 772      since peek_byte_case doesn't advance cur_idx in any way.  */
 773   if (pstr->offsets_needed && !isascii (ch))
 774     return re_string_peek_byte (pstr, idx);
 775 #endif
 776
 777   return ch;
 778 }
 779
 780 static unsigned char
 781 internal_function __attribute ((pure))
 782 re_string_fetch_byte_case (re_string_t *pstr)
 783 {
 784   if (BE (!pstr->mbs_allocated, 1))
 785     return re_string_fetch_byte (pstr);
 786
 787 #ifdef RE_ENABLE_I18N
 788   if (pstr->offsets_needed)
 789     {
 790       int off, ch;
 791
 792       /* For tr_TR.UTF-8 [[:islower:]] there is
 793          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 794          in that case the whole multi-byte character and return
 795          the original letter.  On the other side, with
 796          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 797          anything else would complicate things too much.  */
 798
 799       if (!re_string_first_byte (pstr, pstr->cur_idx))
 800         return re_string_fetch_byte (pstr);
 801
 802       off = pstr->offsets[pstr->cur_idx];
 803       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 804
 805       if (! isascii (ch))
 806         return re_string_fetch_byte (pstr);
 807
 808       re_string_skip_bytes (pstr,
 809                             re_string_char_size_at (pstr, pstr->cur_idx));
 810       return ch;
 811     }
 812 #endif
 813
 814   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 815 }
 816
 817 static void
 818 internal_function
 819 re_string_destruct (re_string_t *pstr)
 820 {
 821 #ifdef RE_ENABLE_I18N
 822   re_free (pstr->wcs);
 823   re_free (pstr->offsets);
 824 #endif /* RE_ENABLE_I18N  */
 825   if (pstr->mbs_allocated)
 826     re_free (pstr->mbs);
 827 }
 828
 829 /* Return the context at IDX in INPUT.  */
 830
 831 static unsigned int
 832 internal_function
 833 re_string_context_at (const re_string_t *input, int idx, int eflags)
 834 {
 835   int c;
 836   if (BE (idx < 0, 0))
 837     /* In this case, we use the value stored in input->tip_context,
 838        since we can't know the character in input->mbs[-1] here.  */
 839     return input->tip_context;
 840   if (BE (idx == input->len, 0))
 841     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 842             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 843 #ifdef RE_ENABLE_I18N
 844   if (input->mb_cur_max > 1)
 845     {
 846       wint_t wc;
 847       int wc_idx = idx;
 848       while(input->wcs[wc_idx] == WEOF)
 849         {
 850 #ifdef DEBUG
 851           /* It must not happen.  */
 852           assert (wc_idx >= 0);
 853 #endif
 854           --wc_idx;
 855           if (wc_idx < 0)
 856             return input->tip_context;
 857         }
 858       wc = input->wcs[wc_idx];
 859       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 860         return CONTEXT_WORD;
 861       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 862               ? CONTEXT_NEWLINE : 0);
 863     }
 864 #endif
 865   c = re_string_byte_at (input, idx);
 866   if (bitset_contain (input->word_char, c))
 867     return CONTEXT_WORD;
 868   return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 869 }
 870
 871 /* Functions for set operation.  */
 872
 873 static reg_errcode_t
 874 internal_function
 875 re_node_set_alloc (re_node_set *set, int size)
 876 {
 877   set->alloc = size;
 878   set->nelem = 0;
 879   set->elems = re_malloc (int, size);   /* can be NULL if size == 0
 880                                            (see re_node_set_init_empty(set)) */
 881   if (BE (set->elems == NULL && size != 0, 0))
 882     return REG_ESPACE;
 883   return REG_NOERROR;
 884 }
 885
 886 static reg_errcode_t
 887 internal_function
 888 re_node_set_init_1 (re_node_set *set, int elem)
 889 {
 890   set->alloc = 1;
 891   set->nelem = 1;
 892   set->elems = re_malloc (int, 1);
 893   if (BE (set->elems == NULL, 0))
 894     {
 895       set->alloc = set->nelem = 0;
 896       return REG_ESPACE;
 897     }
 898   set->elems[0] = elem;
 899   return REG_NOERROR;
 900 }
 901
 902 static reg_errcode_t
 903 internal_function
 904 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
 905 {
 906   set->alloc = 2;
 907   set->elems = re_malloc (int, 2);
 908   if (BE (set->elems == NULL, 0))
 909     return REG_ESPACE;
 910   if (elem1 == elem2)
 911     {
 912       set->nelem = 1;
 913       set->elems[0] = elem1;
 914     }
 915   else
 916     {
 917       set->nelem = 2;
 918       if (elem1 < elem2)
 919         {
 920           set->elems[0] = elem1;
 921           set->elems[1] = elem2;
 922         }
 923       else
 924         {
 925           set->elems[0] = elem2;
 926           set->elems[1] = elem1;
 927         }
 928     }
 929   return REG_NOERROR;
 930 }
 931
 932 static reg_errcode_t
 933 internal_function
 934 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 935 {
 936   dest->nelem = src->nelem;
 937   if (src->nelem > 0)
 938     {
 939       dest->alloc = dest->nelem;
 940       dest->elems = re_malloc (int, dest->alloc);
 941       if (BE (dest->elems == NULL, 0))
 942         {
 943           dest->alloc = dest->nelem = 0;
 944           return REG_ESPACE;
 945         }
 946       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 947     }
 948   else
 949     re_node_set_init_empty (dest);
 950   return REG_NOERROR;
 951 }
 952
 953 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 954    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 955    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 956
 957 static reg_errcode_t
 958 internal_function
 959 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 960                            const re_node_set *src2)
 961 {
 962   int i1, i2, is, id, delta, sbase;
 963   if (src1->nelem == 0 || src2->nelem == 0)
 964     return REG_NOERROR;
 965
 966   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 967      conservative estimate.  */
 968   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 969     {
 970       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 971       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 972       if (BE (new_elems == NULL, 0))
 973         return REG_ESPACE;
 974       dest->elems = new_elems;
 975       dest->alloc = new_alloc;
 976     }
 977
 978   /* Find the items in the intersection of SRC1 and SRC2, and copy
 979      into the top of DEST those that are not already in DEST itself.  */
 980   sbase = dest->nelem + src1->nelem + src2->nelem;
 981   i1 = src1->nelem - 1;
 982   i2 = src2->nelem - 1;
 983   id = dest->nelem - 1;
 984   for (;;)
 985     {
 986       if (src1->elems[i1] == src2->elems[i2])
 987         {
 988           /* Try to find the item in DEST.  Maybe we could binary search?  */
 989           while (id >= 0 && dest->elems[id] > src1->elems[i1])
 990             --id;
 991
 992           if (id < 0 || dest->elems[id] != src1->elems[i1])
 993             dest->elems[--sbase] = src1->elems[i1];
 994
 995           if (--i1 < 0 || --i2 < 0)
 996             break;
 997         }
 998
 999       /* Lower the highest of the two items.  */
1000       else if (src1->elems[i1] < src2->elems[i2])
1001         {
1002           if (--i2 < 0)
1003             break;
1004         }
1005       else
1006         {
1007           if (--i1 < 0)
1008             break;
1009         }
1010     }
1011
1012   id = dest->nelem - 1;
1013   is = dest->nelem + src1->nelem + src2->nelem - 1;
1014   delta = is - sbase + 1;
1015
1016   /* Now copy.  When DELTA becomes zero, the remaining
1017      DEST elements are already in place; this is more or
1018      less the same loop that is in re_node_set_merge.  */
1019   dest->nelem += delta;
1020   if (delta > 0 && id >= 0)
1021     for (;;)
1022       {
1023         if (dest->elems[is] > dest->elems[id])
1024           {
1025             /* Copy from the top.  */
1026             dest->elems[id + delta--] = dest->elems[is--];
1027             if (delta == 0)
1028               break;
1029           }
1030         else
1031           {
1032             /* Slide from the bottom.  */
1033             dest->elems[id + delta] = dest->elems[id];
1034             if (--id < 0)
1035               break;
1036           }
1037       }
1038
1039   /* Copy remaining SRC elements.  */
1040   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1041
1042   return REG_NOERROR;
1043 }
1044
1045 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1046    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1047
1048 static reg_errcode_t
1049 internal_function
1050 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1051                         const re_node_set *src2)
1052 {
1053   int i1, i2, id;
1054   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1055     {
1056       dest->alloc = src1->nelem + src2->nelem;
1057       dest->elems = re_malloc (int, dest->alloc);
1058       if (BE (dest->elems == NULL, 0))
1059         return REG_ESPACE;
1060     }
1061   else
1062     {
1063       if (src1 != NULL && src1->nelem > 0)
1064         return re_node_set_init_copy (dest, src1);
1065       if (src2 != NULL && src2->nelem > 0)
1066         return re_node_set_init_copy (dest, src2);
1067       re_node_set_init_empty (dest);
1068       return REG_NOERROR;
1069     }
1070   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1071     {
1072       if (src1->elems[i1] > src2->elems[i2])
1073         {
1074           dest->elems[id++] = src2->elems[i2++];
1075           continue;
1076         }
1077       if (src1->elems[i1] == src2->elems[i2])
1078         ++i2;
1079       dest->elems[id++] = src1->elems[i1++];
1080     }
1081   if (i1 < src1->nelem)
1082     {
1083       memcpy (dest->elems + id, src1->elems + i1,
1084              (src1->nelem - i1) * sizeof (int));
1085       id += src1->nelem - i1;
1086     }
1087   else if (i2 < src2->nelem)
1088     {
1089       memcpy (dest->elems + id, src2->elems + i2,
1090              (src2->nelem - i2) * sizeof (int));
1091       id += src2->nelem - i2;
1092     }
1093   dest->nelem = id;
1094   return REG_NOERROR;
1095 }
1096
1097 /* Calculate the union set of the sets DEST and SRC. And store it to
1098    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1099
1100 static reg_errcode_t
1101 internal_function
1102 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1103 {
1104   int is, id, sbase, delta;
1105   if (src == NULL || src->nelem == 0)
1106     return REG_NOERROR;
1107   if (dest->alloc < 2 * src->nelem + dest->nelem)
1108     {
1109       int new_alloc = 2 * (src->nelem + dest->alloc);
1110       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1111       if (BE (new_buffer == NULL, 0))
1112         return REG_ESPACE;
1113       dest->elems = new_buffer;
1114       dest->alloc = new_alloc;
1115     }
1116
1117   if (BE (dest->nelem == 0, 0))
1118     {
1119       dest->nelem = src->nelem;
1120       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1121       return REG_NOERROR;
1122     }
1123
1124   /* Copy into the top of DEST the items of SRC that are not
1125      found in DEST.  Maybe we could binary search in DEST?  */
1126   for (sbase = dest->nelem + 2 * src->nelem,
1127        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1128     {
1129       if (dest->elems[id] == src->elems[is])
1130         is--, id--;
1131       else if (dest->elems[id] < src->elems[is])
1132         dest->elems[--sbase] = src->elems[is--];
1133       else /* if (dest->elems[id] > src->elems[is]) */
1134         --id;
1135     }
1136
1137   if (is >= 0)
1138     {
1139       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1140       sbase -= is + 1;
1141       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1142     }
1143
1144   id = dest->nelem - 1;
1145   is = dest->nelem + 2 * src->nelem - 1;
1146   delta = is - sbase + 1;
1147   if (delta == 0)
1148     return REG_NOERROR;
1149
1150   /* Now copy.  When DELTA becomes zero, the remaining
1151      DEST elements are already in place.  */
1152   dest->nelem += delta;
1153   for (;;)
1154     {
1155       if (dest->elems[is] > dest->elems[id])
1156         {
1157           /* Copy from the top.  */
1158           dest->elems[id + delta--] = dest->elems[is--];
1159           if (delta == 0)
1160             break;
1161         }
1162       else
1163         {
1164           /* Slide from the bottom.  */
1165           dest->elems[id + delta] = dest->elems[id];
1166           if (--id < 0)
1167             {
1168               /* Copy remaining SRC elements.  */
1169               memcpy (dest->elems, dest->elems + sbase,
1170                       delta * sizeof (int));
1171               break;
1172             }
1173         }
1174     }
1175
1176   return REG_NOERROR;
1177 }
1178
1179 /* Insert the new element ELEM to the re_node_set* SET.
1180    SET should not already have ELEM.
1181    return -1 if an error is occured, return 1 otherwise.  */
1182
1183 static int
1184 internal_function
1185 re_node_set_insert (re_node_set *set, int elem)
1186 {
1187   int idx;
1188   /* In case the set is empty.  */
1189   if (set->alloc == 0)
1190     {
1191       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1192         return 1;
1193       return -1;
1194     }
1195
1196   if (BE (set->nelem, 0) == 0)
1197     {
1198       /* We already guaranteed above that set->alloc != 0.  */
1199       set->elems[0] = elem;
1200       ++set->nelem;
1201       return 1;
1202     }
1203
1204   /* Realloc if we need.  */
1205   if (set->alloc == set->nelem)
1206     {
1207       int *new_elems;
1208       set->alloc = set->alloc * 2;
1209       new_elems = re_realloc (set->elems, int, set->alloc);
1210       if (BE (new_elems == NULL, 0))
1211         return -1;
1212       set->elems = new_elems;
1213     }
1214
1215   /* Move the elements which follows the new element.  Test the
1216      first element separately to skip a check in the inner loop.  */
1217   if (elem < set->elems[0])
1218     {
1219       idx = 0;
1220       for (idx = set->nelem; idx > 0; idx--)
1221         set->elems[idx] = set->elems[idx - 1];
1222     }
1223   else
1224     {
1225       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1226         set->elems[idx] = set->elems[idx - 1];
1227     }
1228
1229   /* Insert the new element.  */
1230   set->elems[idx] = elem;
1231   ++set->nelem;
1232   return 1;
1233 }
1234
1235 /* Insert the new element ELEM to the re_node_set* SET.
1236    SET should not already have any element greater than or equal to ELEM.
1237    Return -1 if an error is occured, return 1 otherwise.  */
1238
1239 static int
1240 internal_function
1241 re_node_set_insert_last (re_node_set *set, int elem)
1242 {
1243   /* Realloc if we need.  */
1244   if (set->alloc == set->nelem)
1245     {
1246       int *new_elems;
1247       set->alloc = (set->alloc + 1) * 2;
1248       new_elems = re_realloc (set->elems, int, set->alloc);
1249       if (BE (new_elems == NULL, 0))
1250         return -1;
1251       set->elems = new_elems;
1252     }
1253
1254   /* Insert the new element.  */
1255   set->elems[set->nelem++] = elem;
1256   return 1;
1257 }
1258
1259 /* Compare two node sets SET1 and SET2.
1260    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1261
1262 static int
1263 internal_function __attribute ((pure))
1264 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1265 {
1266   int i;
1267   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1268     return 0;
1269   for (i = set1->nelem ; --i >= 0 ; )
1270     if (set1->elems[i] != set2->elems[i])
1271       return 0;
1272   return 1;
1273 }
1274
1275 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1276
1277 static int
1278 internal_function __attribute ((pure))
1279 re_node_set_contains (const re_node_set *set, int elem)
1280 {
1281   unsigned int idx, right, mid;
1282   if (set->nelem <= 0)
1283     return 0;
1284
1285   /* Binary search the element.  */
1286   idx = 0;
1287   right = set->nelem - 1;
1288   while (idx < right)
1289     {
1290       mid = (idx + right) / 2;
1291       if (set->elems[mid] < elem)
1292         idx = mid + 1;
1293       else
1294         right = mid;
1295     }
1296   return set->elems[idx] == elem ? idx + 1 : 0;
1297 }
1298
1299 static void
1300 internal_function
1301 re_node_set_remove_at (re_node_set *set, int idx)
1302 {
1303   if (idx < 0 || idx >= set->nelem)
1304     return;
1305   --set->nelem;
1306   for (; idx < set->nelem; idx++)
1307     set->elems[idx] = set->elems[idx + 1];
1308 }
1309
1310
1311 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1312    Or return -1, if an error will be occured.  */
1313
1314 static int
1315 internal_function
1316 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1317 {
1318 #ifdef RE_ENABLE_I18N
1319   int type = token.type;
1320 #endif
1321   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1322     {
1323       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1324       int *new_nexts, *new_indices;
1325       re_node_set *new_edests, *new_eclosures;
1326       re_token_t *new_nodes;
1327
1328       /* Avoid overflows.  */
1329       if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1330         return -1;
1331
1332       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1333       if (BE (new_nodes == NULL, 0))
1334         return -1;
1335       dfa->nodes = new_nodes;
1336       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1337       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1338       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1339       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1340       if (BE (new_nexts == NULL || new_indices == NULL
1341               || new_edests == NULL || new_eclosures == NULL, 0))
1342         return -1;
1343       dfa->nexts = new_nexts;
1344       dfa->org_indices = new_indices;
1345       dfa->edests = new_edests;
1346       dfa->eclosures = new_eclosures;
1347       dfa->nodes_alloc = new_nodes_alloc;
1348     }
1349   dfa->nodes[dfa->nodes_len] = token;
1350   dfa->nodes[dfa->nodes_len].constraint = 0;
1351 #ifdef RE_ENABLE_I18N
1352   dfa->nodes[dfa->nodes_len].accept_mb =
1353     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1354 #endif
1355   dfa->nexts[dfa->nodes_len] = -1;
1356   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1357   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1358   return dfa->nodes_len++;
1359 }
1360
1361 static __inline__ unsigned int
1362 internal_function
1363 calc_state_hash (const re_node_set *nodes, unsigned int context)
1364 {
1365   unsigned int hash = nodes->nelem + context;
1366   int i;
1367   for (i = 0 ; i < nodes->nelem ; i++)
1368     hash += nodes->elems[i];
1369   return hash;
1370 }
1371
1372 /* Search for the state whose node_set is equivalent to NODES.
1373    Return the pointer to the state, if we found it in the DFA.
1374    Otherwise create the new one and return it.  In case of an error
1375    return NULL and set the error code in ERR.
1376    Note: - We assume NULL as the invalid state, then it is possible that
1377            return value is NULL and ERR is REG_NOERROR.
1378          - We never return non-NULL value in case of any errors, it is for
1379            optimization.  */
1380
1381 static re_dfastate_t *
1382 internal_function
1383 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1384                   const re_node_set *nodes)
1385 {
1386   unsigned int hash;
1387   re_dfastate_t *new_state;
1388   struct re_state_table_entry *spot;
1389   int i;
1390   if (BE (nodes->nelem == 0, 0))
1391     {
1392       *err = REG_NOERROR;
1393       return NULL;
1394     }
1395   hash = calc_state_hash (nodes, 0);
1396   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1397
1398   for (i = 0 ; i < spot->num ; i++)
1399     {
1400       re_dfastate_t *state = spot->array[i];
1401       if (hash != state->hash)
1402         continue;
1403       if (re_node_set_compare (&state->nodes, nodes))
1404         return state;
1405     }
1406
1407   /* There are no appropriate state in the dfa, create the new one.  */
1408   new_state = create_ci_newstate (dfa, nodes, hash);
1409   if (BE (new_state == NULL, 0))
1410     *err = REG_ESPACE;
1411
1412   return new_state;
1413 }
1414
1415 /* Search for the state whose node_set is equivalent to NODES and
1416    whose context is equivalent to CONTEXT.
1417    Return the pointer to the state, if we found it in the DFA.
1418    Otherwise create the new one and return it.  In case of an error
1419    return NULL and set the error code in ERR.
1420    Note: - We assume NULL as the invalid state, then it is possible that
1421            return value is NULL and ERR is REG_NOERROR.
1422          - We never return non-NULL value in case of any errors, it is for
1423            optimization.  */
1424
1425 static re_dfastate_t *
1426 internal_function
1427 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1428                           const re_node_set *nodes, unsigned int context)
1429 {
1430   unsigned int hash;
1431   re_dfastate_t *new_state;
1432   struct re_state_table_entry *spot;
1433   int i;
1434   if (nodes->nelem == 0)
1435     {
1436       *err = REG_NOERROR;
1437       return NULL;
1438     }
1439   hash = calc_state_hash (nodes, context);
1440   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1441
1442   for (i = 0 ; i < spot->num ; i++)
1443     {
1444       re_dfastate_t *state = spot->array[i];
1445       if (state->hash == hash
1446           && state->context == context
1447           && re_node_set_compare (state->entrance_nodes, nodes))
1448         return state;
1449     }
1450   /* There are no appropriate state in `dfa', create the new one.  */
1451   new_state = create_cd_newstate (dfa, nodes, context, hash);
1452   if (BE (new_state == NULL, 0))
1453     *err = REG_ESPACE;
1454
1455   return new_state;
1456 }
1457
1458 /* Finish initialization of the new state NEWSTATE, and using its hash value
1459    HASH put in the appropriate bucket of DFA's state table.  Return value
1460    indicates the error code if failed.  */
1461
1462 static reg_errcode_t
1463 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1464                 unsigned int hash)
1465 {
1466   struct re_state_table_entry *spot;
1467   reg_errcode_t err;
1468   int i;
1469
1470   newstate->hash = hash;
1471   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1472   if (BE (err != REG_NOERROR, 0))
1473     return REG_ESPACE;
1474   for (i = 0; i < newstate->nodes.nelem; i++)
1475     {
1476       int elem = newstate->nodes.elems[i];
1477       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1478         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1479     }
1480
1481   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1482   if (BE (spot->alloc <= spot->num, 0))
1483     {
1484       int new_alloc = 2 * spot->num + 2;
1485       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1486                                               new_alloc);
1487       if (BE (new_array == NULL, 0))
1488         return REG_ESPACE;
1489       spot->array = new_array;
1490       spot->alloc = new_alloc;
1491     }
1492   spot->array[spot->num++] = newstate;
1493   return REG_NOERROR;
1494 }
1495
1496 static void
1497 free_state (re_dfastate_t *state)
1498 {
1499   re_node_set_free (&state->non_eps_nodes);
1500   re_node_set_free (&state->inveclosure);
1501   if (state->entrance_nodes != &state->nodes)
1502     {
1503       re_node_set_free (state->entrance_nodes);
1504       re_free (state->entrance_nodes);
1505     }
1506   re_node_set_free (&state->nodes);
1507   re_free (state->word_trtable);
1508   re_free (state->trtable);
1509   re_free (state);
1510 }
1511
1512 /* Create the new state which is independ of contexts.
1513    Return the new state if succeeded, otherwise return NULL.  */
1514
1515 static re_dfastate_t *
1516 internal_function
1517 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1518                     unsigned int hash)
1519 {
1520   int i;
1521   reg_errcode_t err;
1522   re_dfastate_t *newstate;
1523
1524   newstate = calloc (sizeof (re_dfastate_t), 1);
1525   if (BE (newstate == NULL, 0))
1526     return NULL;
1527   err = re_node_set_init_copy (&newstate->nodes, nodes);
1528   if (BE (err != REG_NOERROR, 0))
1529     {
1530       re_free (newstate);
1531       return NULL;
1532     }
1533
1534   newstate->entrance_nodes = &newstate->nodes;
1535   for (i = 0 ; i < nodes->nelem ; i++)
1536     {
1537       re_token_t *node = dfa->nodes + nodes->elems[i];
1538       re_token_type_t type = node->type;
1539
1540       if (type == CHARACTER && !node->constraint)
1541         continue;
1542 #ifdef RE_ENABLE_I18N
1543       newstate->accept_mb |= node->accept_mb;
1544 #endif
1545
1546       /* If the state has the halt node, the state is a halt state.  */
1547       if (type == END_OF_RE)
1548         newstate->halt = 1;
1549       else if (type == OP_BACK_REF)
1550         newstate->has_backref = 1;
1551       else if (type == ANCHOR || node->constraint)
1552         newstate->has_constraint = 1;
1553     }
1554   err = register_state (dfa, newstate, hash);
1555   if (BE (err != REG_NOERROR, 0))
1556     {
1557       free_state (newstate);
1558       newstate = NULL;
1559     }
1560   return newstate;
1561 }
1562
1563 /* Create the new state which is depend on the context CONTEXT.
1564    Return the new state if succeeded, otherwise return NULL.  */
1565
1566 static re_dfastate_t *
1567 internal_function
1568 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1569                     unsigned int context, unsigned int hash)
1570 {
1571   int i, nctx_nodes = 0;
1572   reg_errcode_t err;
1573   re_dfastate_t *newstate;
1574
1575   newstate = calloc (sizeof (re_dfastate_t), 1);
1576   if (BE (newstate == NULL, 0))
1577     return NULL;
1578   err = re_node_set_init_copy (&newstate->nodes, nodes);
1579   if (BE (err != REG_NOERROR, 0))
1580     {
1581       re_free (newstate);
1582       return NULL;
1583     }
1584
1585   newstate->context = context;
1586   newstate->entrance_nodes = &newstate->nodes;
1587
1588   for (i = 0 ; i < nodes->nelem ; i++)
1589     {
1590       unsigned int constraint = 0;
1591       re_token_t *node = dfa->nodes + nodes->elems[i];
1592       re_token_type_t type = node->type;
1593       if (node->constraint)
1594         constraint = node->constraint;
1595
1596       if (type == CHARACTER && !constraint)
1597         continue;
1598 #ifdef RE_ENABLE_I18N
1599       newstate->accept_mb |= node->accept_mb;
1600 #endif /* RE_ENABLE_I18N */
1601
1602       /* If the state has the halt node, the state is a halt state.  */
1603       if (type == END_OF_RE)
1604         newstate->halt = 1;
1605       else if (type == OP_BACK_REF)
1606         newstate->has_backref = 1;
1607       else if (type == ANCHOR)
1608         constraint = node->opr.ctx_type;
1609
1610       if (constraint)
1611         {
1612           if (newstate->entrance_nodes == &newstate->nodes)
1613             {
1614               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1615               if (BE (newstate->entrance_nodes == NULL, 0))
1616                 {
1617                   free_state (newstate);
1618                   return NULL;
1619                 }
1620               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1621               nctx_nodes = 0;
1622               newstate->has_constraint = 1;
1623             }
1624
1625           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1626             {
1627               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1628               ++nctx_nodes;
1629             }
1630         }
1631     }
1632   err = register_state (dfa, newstate, hash);
1633   if (BE (err != REG_NOERROR, 0))
1634     {
1635       free_state (newstate);
1636       newstate = NULL;
1637     }
1638   return  newstate;
1639 }