1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static void re_string_construct_common (const char *str, int len,
22 __RE_TRANSLATE_TYPE trans, int icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 unsigned int hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
30 unsigned int hash) internal_function;
32 /* Functions for string operation. */
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
39 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
40 __RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
63 /* This function allocate the buffers, and initialize them. */
67 re_string_construct (re_string_t *pstr, const char *str, int len,
68 __RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
85 if (dfa->mb_cur_max > 1)
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
92 if (pstr->valid_raw_len >= len)
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
114 re_string_translate_buffer (pstr);
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
126 /* Helper functions for re_string_allocate, and re_string_construct. */
130 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
135 wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
136 if (BE (new_wcs == NULL, 0))
139 if (pstr->offsets != NULL)
141 int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
142 if (BE (new_offsets == NULL, 0))
144 pstr->offsets = new_offsets;
147 #endif /* RE_ENABLE_I18N */
148 if (pstr->mbs_allocated)
150 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
152 if (BE (new_mbs == NULL, 0))
156 pstr->bufs_len = new_buf_len;
163 re_string_construct_common (const char *str, int len, re_string_t *pstr,
164 __RE_TRANSLATE_TYPE trans, int icase,
167 pstr->raw_mbs = (const unsigned char *) str;
171 pstr->icase = icase ? 1 : 0;
172 pstr->mbs_allocated = (trans != NULL || icase);
173 pstr->mb_cur_max = dfa->mb_cur_max;
174 pstr->is_utf8 = dfa->is_utf8;
175 pstr->map_notascii = dfa->map_notascii;
176 pstr->stop = pstr->len;
177 pstr->raw_stop = pstr->stop;
180 #ifdef RE_ENABLE_I18N
182 /* Build wide character buffer PSTR->WCS.
183 If the byte sequence of the string are:
184 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185 Then wide character buffer will be:
186 <wc1> , WEOF , <wc2> , WEOF , <wc3>
187 We use WEOF for padding, they indicate that the position isn't
188 a first byte of a multibyte character.
190 Note that this function assumes PSTR->VALID_LEN elements are already
191 built and starts from PSTR->VALID_LEN. */
195 build_wcs_buffer (re_string_t *pstr)
197 #if defined __UCLIBC__
198 unsigned char buf[MB_LEN_MAX];
199 assert (MB_LEN_MAX >= pstr->mb_cur_max);
201 unsigned char buf[64];
204 int byte_idx, end_idx, remain_len;
207 /* Build the buffers from pstr->valid_len to either pstr->len or
209 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
210 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
215 remain_len = end_idx - byte_idx;
216 prev_st = pstr->cur_state;
217 /* Apply the translation if we need. */
218 if (BE (pstr->trans != NULL, 0))
222 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
224 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
225 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
227 p = (const char *) buf;
230 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
231 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
232 if (BE (mbclen == (size_t) -2, 0))
234 /* The buffer doesn't have enough space, finish to build. */
235 pstr->cur_state = prev_st;
238 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
240 /* We treat these cases as a singlebyte character. */
242 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243 if (BE (pstr->trans != NULL, 0))
244 wc = pstr->trans[wc];
245 pstr->cur_state = prev_st;
248 /* Write wide character and padding. */
249 pstr->wcs[byte_idx++] = wc;
250 /* Write paddings. */
251 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
252 pstr->wcs[byte_idx++] = WEOF;
254 pstr->valid_len = byte_idx;
255 pstr->valid_raw_len = byte_idx;
258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
259 but for REG_ICASE. */
263 build_wcs_upper_buffer (re_string_t *pstr)
266 int src_idx, byte_idx, end_idx, remain_len;
268 #if defined __UCLIBC__
269 char buf[MB_LEN_MAX];
270 assert (MB_LEN_MAX >= pstr->mb_cur_max);
275 byte_idx = pstr->valid_len;
276 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
278 /* The following optimization assumes that ASCII characters can be
279 mapped to wide characters with a simple cast. */
280 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
282 while (byte_idx < end_idx)
286 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
287 && mbsinit (&pstr->cur_state))
289 /* In case of a singlebyte character. */
291 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
292 /* The next step uses the assumption that wchar_t is encoded
293 ASCII-safe: all ASCII values can be converted like this. */
294 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
299 remain_len = end_idx - byte_idx;
300 prev_st = pstr->cur_state;
301 mbclen = mbrtowc (&wc,
302 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
303 + byte_idx), remain_len, &pstr->cur_state);
304 if (BE (mbclen + 2 > 2, 1))
312 mbcdlen = wcrtomb (buf, wcu, &prev_st);
313 if (BE (mbclen == mbcdlen, 1))
314 memcpy (pstr->mbs + byte_idx, buf, mbclen);
322 memcpy (pstr->mbs + byte_idx,
323 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
324 pstr->wcs[byte_idx++] = wcu;
325 /* Write paddings. */
326 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
327 pstr->wcs[byte_idx++] = WEOF;
329 else if (mbclen == (size_t) -1 || mbclen == 0)
331 /* It is an invalid character or '\0'. Just use the byte. */
332 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
333 pstr->mbs[byte_idx] = ch;
334 /* And also cast it to wide char. */
335 pstr->wcs[byte_idx++] = (wchar_t) ch;
336 if (BE (mbclen == (size_t) -1, 0))
337 pstr->cur_state = prev_st;
341 /* The buffer doesn't have enough space, finish to build. */
342 pstr->cur_state = prev_st;
346 pstr->valid_len = byte_idx;
347 pstr->valid_raw_len = byte_idx;
351 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
356 remain_len = end_idx - byte_idx;
357 prev_st = pstr->cur_state;
358 if (BE (pstr->trans != NULL, 0))
362 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
364 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
365 buf[i] = pstr->trans[ch];
367 p = (const char *) buf;
370 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
371 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
372 if (BE (mbclen + 2 > 2, 1))
380 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
381 if (BE (mbclen == mbcdlen, 1))
382 memcpy (pstr->mbs + byte_idx, buf, mbclen);
383 else if (mbcdlen != (size_t) -1)
387 if (byte_idx + mbcdlen > pstr->bufs_len)
389 pstr->cur_state = prev_st;
393 if (pstr->offsets == NULL)
395 pstr->offsets = re_malloc (int, pstr->bufs_len);
397 if (pstr->offsets == NULL)
400 if (!pstr->offsets_needed)
402 for (i = 0; i < (size_t) byte_idx; ++i)
403 pstr->offsets[i] = i;
404 pstr->offsets_needed = 1;
407 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
408 pstr->wcs[byte_idx] = wcu;
409 pstr->offsets[byte_idx] = src_idx;
410 for (i = 1; i < mbcdlen; ++i)
412 pstr->offsets[byte_idx + i]
413 = src_idx + (i < mbclen ? i : mbclen - 1);
414 pstr->wcs[byte_idx + i] = WEOF;
416 pstr->len += mbcdlen - mbclen;
417 if (pstr->raw_stop > src_idx)
418 pstr->stop += mbcdlen - mbclen;
419 end_idx = (pstr->bufs_len > pstr->len)
420 ? pstr->len : pstr->bufs_len;
426 memcpy (pstr->mbs + byte_idx, p, mbclen);
429 memcpy (pstr->mbs + byte_idx, p, mbclen);
431 if (BE (pstr->offsets_needed != 0, 0))
434 for (i = 0; i < mbclen; ++i)
435 pstr->offsets[byte_idx + i] = src_idx + i;
439 pstr->wcs[byte_idx++] = wcu;
440 /* Write paddings. */
441 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
442 pstr->wcs[byte_idx++] = WEOF;
444 else if (mbclen == (size_t) -1 || mbclen == 0)
446 /* It is an invalid character or '\0'. Just use the byte. */
447 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
449 if (BE (pstr->trans != NULL, 0))
450 ch = pstr->trans [ch];
451 pstr->mbs[byte_idx] = ch;
453 if (BE (pstr->offsets_needed != 0, 0))
454 pstr->offsets[byte_idx] = src_idx;
457 /* And also cast it to wide char. */
458 pstr->wcs[byte_idx++] = (wchar_t) ch;
459 if (BE (mbclen == (size_t) -1, 0))
460 pstr->cur_state = prev_st;
464 /* The buffer doesn't have enough space, finish to build. */
465 pstr->cur_state = prev_st;
469 pstr->valid_len = byte_idx;
470 pstr->valid_raw_len = src_idx;
474 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
479 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
486 /* Skip the characters which are not necessary to check. */
487 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
488 rawbuf_idx < new_raw_idx;)
491 remain_len = pstr->len - rawbuf_idx;
492 prev_st = pstr->cur_state;
493 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
494 remain_len, &pstr->cur_state);
495 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
497 /* We treat these cases as a singlebyte character. */
499 pstr->cur_state = prev_st;
501 /* Then proceed the next character. */
502 rawbuf_idx += mbclen;
504 *last_wc = (wint_t) wc;
507 #endif /* RE_ENABLE_I18N */
509 /* Build the buffer PSTR->MBS, and apply the translation if we need.
510 This function is used in case of REG_ICASE. */
514 build_upper_buffer (re_string_t *pstr)
516 int char_idx, end_idx;
517 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
519 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
521 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
522 if (BE (pstr->trans != NULL, 0))
523 ch = pstr->trans[ch];
525 pstr->mbs[char_idx] = toupper (ch);
527 pstr->mbs[char_idx] = ch;
529 pstr->valid_len = char_idx;
530 pstr->valid_raw_len = char_idx;
533 /* Apply TRANS to the buffer in PSTR. */
537 re_string_translate_buffer (re_string_t *pstr)
539 int buf_idx, end_idx;
540 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
542 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
544 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
545 pstr->mbs[buf_idx] = pstr->trans[ch];
548 pstr->valid_len = buf_idx;
549 pstr->valid_raw_len = buf_idx;
552 /* This function re-construct the buffers.
553 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
554 convert to upper case in case of REG_ICASE, apply translation. */
558 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
560 int offset = idx - pstr->raw_mbs_idx;
561 if (BE (offset < 0, 0))
564 #ifdef RE_ENABLE_I18N
565 if (pstr->mb_cur_max > 1)
566 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
568 pstr->len = pstr->raw_len;
569 pstr->stop = pstr->raw_stop;
571 pstr->raw_mbs_idx = 0;
572 pstr->valid_raw_len = 0;
573 pstr->offsets_needed = 0;
574 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
575 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
576 if (!pstr->mbs_allocated)
577 pstr->mbs = (unsigned char *) pstr->raw_mbs;
581 if (BE (offset != 0, 1))
583 /* Are the characters which are already checked remain? */
584 if (BE (offset < pstr->valid_raw_len, 1)
585 #ifdef RE_ENABLE_I18N
586 /* Handling this would enlarge the code too much.
587 Accept a slowdown in that case. */
588 && pstr->offsets_needed == 0
592 /* Yes, move them to the front of the buffer. */
593 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
594 #ifdef RE_ENABLE_I18N
595 if (pstr->mb_cur_max > 1)
596 memmove (pstr->wcs, pstr->wcs + offset,
597 (pstr->valid_len - offset) * sizeof (wint_t));
599 if (BE (pstr->mbs_allocated, 0))
600 memmove (pstr->mbs, pstr->mbs + offset,
601 pstr->valid_len - offset);
602 pstr->valid_len -= offset;
603 pstr->valid_raw_len -= offset;
605 assert (pstr->valid_len > 0);
610 /* No, skip all characters until IDX. */
611 #ifdef RE_ENABLE_I18N
612 if (BE (pstr->offsets_needed, 0))
614 pstr->len = pstr->raw_len - idx + offset;
615 pstr->stop = pstr->raw_stop - idx + offset;
616 pstr->offsets_needed = 0;
620 pstr->valid_raw_len = 0;
621 #ifdef RE_ENABLE_I18N
622 if (pstr->mb_cur_max > 1)
629 const unsigned char *raw, *p, *end;
631 /* Special case UTF-8. Multi-byte chars start with any
632 byte other than 0x80 - 0xbf. */
633 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
634 end = raw + (offset - pstr->mb_cur_max);
635 p = raw + offset - 1;
637 /* We know the wchar_t encoding is UCS4, so for the simple
638 case, ASCII characters, skip the conversion step. */
639 if (isascii (*p) && BE (pstr->trans == NULL, 1))
641 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
647 for (; p >= end; --p)
648 if ((*p & 0xc0) != 0x80)
652 int mlen = raw + pstr->len - p;
653 unsigned char buf[6];
656 if (BE (pstr->trans != NULL, 0))
658 int i = mlen < 6 ? mlen : 6;
660 buf[i] = pstr->trans[p[i]];
662 /* XXX Don't use mbrtowc, we know which conversion
663 to use (UTF-8 -> UCS4). */
664 memset (&cur_state, 0, sizeof (cur_state));
665 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
667 if (raw + offset - p <= mbclen
668 && mbclen < (size_t) -2)
670 memset (&pstr->cur_state, '\0',
672 pstr->valid_len = mbclen - (raw + offset - p);
680 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
681 if (BE (pstr->valid_len, 0))
683 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
684 pstr->wcs[wcs_idx] = WEOF;
685 if (pstr->mbs_allocated)
686 memset (pstr->mbs, 255, pstr->valid_len);
688 pstr->valid_raw_len = pstr->valid_len;
689 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
690 && IS_WIDE_WORD_CHAR (wc))
692 : ((IS_WIDE_NEWLINE (wc)
693 && pstr->newline_anchor)
694 ? CONTEXT_NEWLINE : 0));
697 #endif /* RE_ENABLE_I18N */
699 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
702 pstr->tip_context = (bitset_contain (pstr->word_char, c)
704 : ((IS_NEWLINE (c) && pstr->newline_anchor)
705 ? CONTEXT_NEWLINE : 0));
708 if (!BE (pstr->mbs_allocated, 0))
711 pstr->raw_mbs_idx = idx;
713 pstr->stop -= offset;
715 /* Then build the buffers. */
716 #ifdef RE_ENABLE_I18N
717 if (pstr->mb_cur_max > 1)
721 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
722 if (BE (ret != REG_NOERROR, 0))
726 build_wcs_buffer (pstr);
730 if (BE (pstr->mbs_allocated, 0))
733 build_upper_buffer (pstr);
734 else if (pstr->trans != NULL)
735 re_string_translate_buffer (pstr);
738 pstr->valid_len = pstr->len;
745 internal_function __attribute ((pure))
746 re_string_peek_byte_case (const re_string_t *pstr, int idx)
750 /* Handle the common (easiest) cases first. */
751 if (BE (!pstr->mbs_allocated, 1))
752 return re_string_peek_byte (pstr, idx);
754 #ifdef RE_ENABLE_I18N
755 if (pstr->mb_cur_max > 1
756 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
757 return re_string_peek_byte (pstr, idx);
760 off = pstr->cur_idx + idx;
761 #ifdef RE_ENABLE_I18N
762 if (pstr->offsets_needed)
763 off = pstr->offsets[off];
766 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
768 #ifdef RE_ENABLE_I18N
769 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
770 this function returns CAPITAL LETTER I instead of first byte of
771 DOTLESS SMALL LETTER I. The latter would confuse the parser,
772 since peek_byte_case doesn't advance cur_idx in any way. */
773 if (pstr->offsets_needed && !isascii (ch))
774 return re_string_peek_byte (pstr, idx);
781 internal_function __attribute ((pure))
782 re_string_fetch_byte_case (re_string_t *pstr)
784 if (BE (!pstr->mbs_allocated, 1))
785 return re_string_fetch_byte (pstr);
787 #ifdef RE_ENABLE_I18N
788 if (pstr->offsets_needed)
792 /* For tr_TR.UTF-8 [[:islower:]] there is
793 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
794 in that case the whole multi-byte character and return
795 the original letter. On the other side, with
796 [[: DOTLESS SMALL LETTER I return [[:I, as doing
797 anything else would complicate things too much. */
799 if (!re_string_first_byte (pstr, pstr->cur_idx))
800 return re_string_fetch_byte (pstr);
802 off = pstr->offsets[pstr->cur_idx];
803 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
806 return re_string_fetch_byte (pstr);
808 re_string_skip_bytes (pstr,
809 re_string_char_size_at (pstr, pstr->cur_idx));
814 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
819 re_string_destruct (re_string_t *pstr)
821 #ifdef RE_ENABLE_I18N
823 re_free (pstr->offsets);
824 #endif /* RE_ENABLE_I18N */
825 if (pstr->mbs_allocated)
829 /* Return the context at IDX in INPUT. */
833 re_string_context_at (const re_string_t *input, int idx, int eflags)
837 /* In this case, we use the value stored in input->tip_context,
838 since we can't know the character in input->mbs[-1] here. */
839 return input->tip_context;
840 if (BE (idx == input->len, 0))
841 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
842 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
843 #ifdef RE_ENABLE_I18N
844 if (input->mb_cur_max > 1)
848 while(input->wcs[wc_idx] == WEOF)
851 /* It must not happen. */
852 assert (wc_idx >= 0);
856 return input->tip_context;
858 wc = input->wcs[wc_idx];
859 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
861 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
862 ? CONTEXT_NEWLINE : 0);
865 c = re_string_byte_at (input, idx);
866 if (bitset_contain (input->word_char, c))
868 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
871 /* Functions for set operation. */
875 re_node_set_alloc (re_node_set *set, int size)
879 set->elems = re_malloc (int, size); /* can be NULL if size == 0
880 (see re_node_set_init_empty(set)) */
881 if (BE (set->elems == NULL && size != 0, 0))
888 re_node_set_init_1 (re_node_set *set, int elem)
892 set->elems = re_malloc (int, 1);
893 if (BE (set->elems == NULL, 0))
895 set->alloc = set->nelem = 0;
898 set->elems[0] = elem;
904 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
907 set->elems = re_malloc (int, 2);
908 if (BE (set->elems == NULL, 0))
913 set->elems[0] = elem1;
920 set->elems[0] = elem1;
921 set->elems[1] = elem2;
925 set->elems[0] = elem2;
926 set->elems[1] = elem1;
934 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
936 dest->nelem = src->nelem;
939 dest->alloc = dest->nelem;
940 dest->elems = re_malloc (int, dest->alloc);
941 if (BE (dest->elems == NULL, 0))
943 dest->alloc = dest->nelem = 0;
946 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
949 re_node_set_init_empty (dest);
953 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
954 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
955 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
959 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
960 const re_node_set *src2)
962 int i1, i2, is, id, delta, sbase;
963 if (src1->nelem == 0 || src2->nelem == 0)
966 /* We need dest->nelem + 2 * elems_in_intersection; this is a
967 conservative estimate. */
968 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
970 int new_alloc = src1->nelem + src2->nelem + dest->alloc;
971 int *new_elems = re_realloc (dest->elems, int, new_alloc);
972 if (BE (new_elems == NULL, 0))
974 dest->elems = new_elems;
975 dest->alloc = new_alloc;
978 /* Find the items in the intersection of SRC1 and SRC2, and copy
979 into the top of DEST those that are not already in DEST itself. */
980 sbase = dest->nelem + src1->nelem + src2->nelem;
981 i1 = src1->nelem - 1;
982 i2 = src2->nelem - 1;
983 id = dest->nelem - 1;
986 if (src1->elems[i1] == src2->elems[i2])
988 /* Try to find the item in DEST. Maybe we could binary search? */
989 while (id >= 0 && dest->elems[id] > src1->elems[i1])
992 if (id < 0 || dest->elems[id] != src1->elems[i1])
993 dest->elems[--sbase] = src1->elems[i1];
995 if (--i1 < 0 || --i2 < 0)
999 /* Lower the highest of the two items. */
1000 else if (src1->elems[i1] < src2->elems[i2])
1012 id = dest->nelem - 1;
1013 is = dest->nelem + src1->nelem + src2->nelem - 1;
1014 delta = is - sbase + 1;
1016 /* Now copy. When DELTA becomes zero, the remaining
1017 DEST elements are already in place; this is more or
1018 less the same loop that is in re_node_set_merge. */
1019 dest->nelem += delta;
1020 if (delta > 0 && id >= 0)
1023 if (dest->elems[is] > dest->elems[id])
1025 /* Copy from the top. */
1026 dest->elems[id + delta--] = dest->elems[is--];
1032 /* Slide from the bottom. */
1033 dest->elems[id + delta] = dest->elems[id];
1039 /* Copy remaining SRC elements. */
1040 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1045 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1046 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1048 static reg_errcode_t
1050 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1051 const re_node_set *src2)
1054 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1056 dest->alloc = src1->nelem + src2->nelem;
1057 dest->elems = re_malloc (int, dest->alloc);
1058 if (BE (dest->elems == NULL, 0))
1063 if (src1 != NULL && src1->nelem > 0)
1064 return re_node_set_init_copy (dest, src1);
1065 if (src2 != NULL && src2->nelem > 0)
1066 return re_node_set_init_copy (dest, src2);
1067 re_node_set_init_empty (dest);
1070 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1072 if (src1->elems[i1] > src2->elems[i2])
1074 dest->elems[id++] = src2->elems[i2++];
1077 if (src1->elems[i1] == src2->elems[i2])
1079 dest->elems[id++] = src1->elems[i1++];
1081 if (i1 < src1->nelem)
1083 memcpy (dest->elems + id, src1->elems + i1,
1084 (src1->nelem - i1) * sizeof (int));
1085 id += src1->nelem - i1;
1087 else if (i2 < src2->nelem)
1089 memcpy (dest->elems + id, src2->elems + i2,
1090 (src2->nelem - i2) * sizeof (int));
1091 id += src2->nelem - i2;
1097 /* Calculate the union set of the sets DEST and SRC. And store it to
1098 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1100 static reg_errcode_t
1102 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1104 int is, id, sbase, delta;
1105 if (src == NULL || src->nelem == 0)
1107 if (dest->alloc < 2 * src->nelem + dest->nelem)
1109 int new_alloc = 2 * (src->nelem + dest->alloc);
1110 int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1111 if (BE (new_buffer == NULL, 0))
1113 dest->elems = new_buffer;
1114 dest->alloc = new_alloc;
1117 if (BE (dest->nelem == 0, 0))
1119 dest->nelem = src->nelem;
1120 memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1124 /* Copy into the top of DEST the items of SRC that are not
1125 found in DEST. Maybe we could binary search in DEST? */
1126 for (sbase = dest->nelem + 2 * src->nelem,
1127 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1129 if (dest->elems[id] == src->elems[is])
1131 else if (dest->elems[id] < src->elems[is])
1132 dest->elems[--sbase] = src->elems[is--];
1133 else /* if (dest->elems[id] > src->elems[is]) */
1139 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1141 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1144 id = dest->nelem - 1;
1145 is = dest->nelem + 2 * src->nelem - 1;
1146 delta = is - sbase + 1;
1150 /* Now copy. When DELTA becomes zero, the remaining
1151 DEST elements are already in place. */
1152 dest->nelem += delta;
1155 if (dest->elems[is] > dest->elems[id])
1157 /* Copy from the top. */
1158 dest->elems[id + delta--] = dest->elems[is--];
1164 /* Slide from the bottom. */
1165 dest->elems[id + delta] = dest->elems[id];
1168 /* Copy remaining SRC elements. */
1169 memcpy (dest->elems, dest->elems + sbase,
1170 delta * sizeof (int));
1179 /* Insert the new element ELEM to the re_node_set* SET.
1180 SET should not already have ELEM.
1181 return -1 if an error is occured, return 1 otherwise. */
1185 re_node_set_insert (re_node_set *set, int elem)
1188 /* In case the set is empty. */
1189 if (set->alloc == 0)
1191 if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1196 if (BE (set->nelem, 0) == 0)
1198 /* We already guaranteed above that set->alloc != 0. */
1199 set->elems[0] = elem;
1204 /* Realloc if we need. */
1205 if (set->alloc == set->nelem)
1208 set->alloc = set->alloc * 2;
1209 new_elems = re_realloc (set->elems, int, set->alloc);
1210 if (BE (new_elems == NULL, 0))
1212 set->elems = new_elems;
1215 /* Move the elements which follows the new element. Test the
1216 first element separately to skip a check in the inner loop. */
1217 if (elem < set->elems[0])
1220 for (idx = set->nelem; idx > 0; idx--)
1221 set->elems[idx] = set->elems[idx - 1];
1225 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1226 set->elems[idx] = set->elems[idx - 1];
1229 /* Insert the new element. */
1230 set->elems[idx] = elem;
1235 /* Insert the new element ELEM to the re_node_set* SET.
1236 SET should not already have any element greater than or equal to ELEM.
1237 Return -1 if an error is occured, return 1 otherwise. */
1241 re_node_set_insert_last (re_node_set *set, int elem)
1243 /* Realloc if we need. */
1244 if (set->alloc == set->nelem)
1247 set->alloc = (set->alloc + 1) * 2;
1248 new_elems = re_realloc (set->elems, int, set->alloc);
1249 if (BE (new_elems == NULL, 0))
1251 set->elems = new_elems;
1254 /* Insert the new element. */
1255 set->elems[set->nelem++] = elem;
1259 /* Compare two node sets SET1 and SET2.
1260 return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1263 internal_function __attribute ((pure))
1264 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1267 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1269 for (i = set1->nelem ; --i >= 0 ; )
1270 if (set1->elems[i] != set2->elems[i])
1275 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1278 internal_function __attribute ((pure))
1279 re_node_set_contains (const re_node_set *set, int elem)
1281 unsigned int idx, right, mid;
1282 if (set->nelem <= 0)
1285 /* Binary search the element. */
1287 right = set->nelem - 1;
1290 mid = (idx + right) / 2;
1291 if (set->elems[mid] < elem)
1296 return set->elems[idx] == elem ? idx + 1 : 0;
1301 re_node_set_remove_at (re_node_set *set, int idx)
1303 if (idx < 0 || idx >= set->nelem)
1306 for (; idx < set->nelem; idx++)
1307 set->elems[idx] = set->elems[idx + 1];
1311 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1312 Or return -1, if an error will be occured. */
1316 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1318 #ifdef RE_ENABLE_I18N
1319 int type = token.type;
1321 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1323 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1324 int *new_nexts, *new_indices;
1325 re_node_set *new_edests, *new_eclosures;
1326 re_token_t *new_nodes;
1328 /* Avoid overflows. */
1329 if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1332 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1333 if (BE (new_nodes == NULL, 0))
1335 dfa->nodes = new_nodes;
1336 new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1337 new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1338 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1339 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1340 if (BE (new_nexts == NULL || new_indices == NULL
1341 || new_edests == NULL || new_eclosures == NULL, 0))
1343 dfa->nexts = new_nexts;
1344 dfa->org_indices = new_indices;
1345 dfa->edests = new_edests;
1346 dfa->eclosures = new_eclosures;
1347 dfa->nodes_alloc = new_nodes_alloc;
1349 dfa->nodes[dfa->nodes_len] = token;
1350 dfa->nodes[dfa->nodes_len].constraint = 0;
1351 #ifdef RE_ENABLE_I18N
1352 dfa->nodes[dfa->nodes_len].accept_mb =
1353 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1355 dfa->nexts[dfa->nodes_len] = -1;
1356 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1357 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1358 return dfa->nodes_len++;
1361 static __inline__ unsigned int
1363 calc_state_hash (const re_node_set *nodes, unsigned int context)
1365 unsigned int hash = nodes->nelem + context;
1367 for (i = 0 ; i < nodes->nelem ; i++)
1368 hash += nodes->elems[i];
1372 /* Search for the state whose node_set is equivalent to NODES.
1373 Return the pointer to the state, if we found it in the DFA.
1374 Otherwise create the new one and return it. In case of an error
1375 return NULL and set the error code in ERR.
1376 Note: - We assume NULL as the invalid state, then it is possible that
1377 return value is NULL and ERR is REG_NOERROR.
1378 - We never return non-NULL value in case of any errors, it is for
1381 static re_dfastate_t *
1383 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1384 const re_node_set *nodes)
1387 re_dfastate_t *new_state;
1388 struct re_state_table_entry *spot;
1390 if (BE (nodes->nelem == 0, 0))
1395 hash = calc_state_hash (nodes, 0);
1396 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1398 for (i = 0 ; i < spot->num ; i++)
1400 re_dfastate_t *state = spot->array[i];
1401 if (hash != state->hash)
1403 if (re_node_set_compare (&state->nodes, nodes))
1407 /* There are no appropriate state in the dfa, create the new one. */
1408 new_state = create_ci_newstate (dfa, nodes, hash);
1409 if (BE (new_state == NULL, 0))
1415 /* Search for the state whose node_set is equivalent to NODES and
1416 whose context is equivalent to CONTEXT.
1417 Return the pointer to the state, if we found it in the DFA.
1418 Otherwise create the new one and return it. In case of an error
1419 return NULL and set the error code in ERR.
1420 Note: - We assume NULL as the invalid state, then it is possible that
1421 return value is NULL and ERR is REG_NOERROR.
1422 - We never return non-NULL value in case of any errors, it is for
1425 static re_dfastate_t *
1427 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1428 const re_node_set *nodes, unsigned int context)
1431 re_dfastate_t *new_state;
1432 struct re_state_table_entry *spot;
1434 if (nodes->nelem == 0)
1439 hash = calc_state_hash (nodes, context);
1440 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1442 for (i = 0 ; i < spot->num ; i++)
1444 re_dfastate_t *state = spot->array[i];
1445 if (state->hash == hash
1446 && state->context == context
1447 && re_node_set_compare (state->entrance_nodes, nodes))
1450 /* There are no appropriate state in `dfa', create the new one. */
1451 new_state = create_cd_newstate (dfa, nodes, context, hash);
1452 if (BE (new_state == NULL, 0))
1458 /* Finish initialization of the new state NEWSTATE, and using its hash value
1459 HASH put in the appropriate bucket of DFA's state table. Return value
1460 indicates the error code if failed. */
1462 static reg_errcode_t
1463 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1466 struct re_state_table_entry *spot;
1470 newstate->hash = hash;
1471 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1472 if (BE (err != REG_NOERROR, 0))
1474 for (i = 0; i < newstate->nodes.nelem; i++)
1476 int elem = newstate->nodes.elems[i];
1477 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1478 re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1481 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1482 if (BE (spot->alloc <= spot->num, 0))
1484 int new_alloc = 2 * spot->num + 2;
1485 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1487 if (BE (new_array == NULL, 0))
1489 spot->array = new_array;
1490 spot->alloc = new_alloc;
1492 spot->array[spot->num++] = newstate;
1497 free_state (re_dfastate_t *state)
1499 re_node_set_free (&state->non_eps_nodes);
1500 re_node_set_free (&state->inveclosure);
1501 if (state->entrance_nodes != &state->nodes)
1503 re_node_set_free (state->entrance_nodes);
1504 re_free (state->entrance_nodes);
1506 re_node_set_free (&state->nodes);
1507 re_free (state->word_trtable);
1508 re_free (state->trtable);
1512 /* Create the new state which is independ of contexts.
1513 Return the new state if succeeded, otherwise return NULL. */
1515 static re_dfastate_t *
1517 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1522 re_dfastate_t *newstate;
1524 newstate = calloc (sizeof (re_dfastate_t), 1);
1525 if (BE (newstate == NULL, 0))
1527 err = re_node_set_init_copy (&newstate->nodes, nodes);
1528 if (BE (err != REG_NOERROR, 0))
1534 newstate->entrance_nodes = &newstate->nodes;
1535 for (i = 0 ; i < nodes->nelem ; i++)
1537 re_token_t *node = dfa->nodes + nodes->elems[i];
1538 re_token_type_t type = node->type;
1540 if (type == CHARACTER && !node->constraint)
1542 #ifdef RE_ENABLE_I18N
1543 newstate->accept_mb |= node->accept_mb;
1546 /* If the state has the halt node, the state is a halt state. */
1547 if (type == END_OF_RE)
1549 else if (type == OP_BACK_REF)
1550 newstate->has_backref = 1;
1551 else if (type == ANCHOR || node->constraint)
1552 newstate->has_constraint = 1;
1554 err = register_state (dfa, newstate, hash);
1555 if (BE (err != REG_NOERROR, 0))
1557 free_state (newstate);
1563 /* Create the new state which is depend on the context CONTEXT.
1564 Return the new state if succeeded, otherwise return NULL. */
1566 static re_dfastate_t *
1568 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1569 unsigned int context, unsigned int hash)
1571 int i, nctx_nodes = 0;
1573 re_dfastate_t *newstate;
1575 newstate = calloc (sizeof (re_dfastate_t), 1);
1576 if (BE (newstate == NULL, 0))
1578 err = re_node_set_init_copy (&newstate->nodes, nodes);
1579 if (BE (err != REG_NOERROR, 0))
1585 newstate->context = context;
1586 newstate->entrance_nodes = &newstate->nodes;
1588 for (i = 0 ; i < nodes->nelem ; i++)
1590 unsigned int constraint = 0;
1591 re_token_t *node = dfa->nodes + nodes->elems[i];
1592 re_token_type_t type = node->type;
1593 if (node->constraint)
1594 constraint = node->constraint;
1596 if (type == CHARACTER && !constraint)
1598 #ifdef RE_ENABLE_I18N
1599 newstate->accept_mb |= node->accept_mb;
1600 #endif /* RE_ENABLE_I18N */
1602 /* If the state has the halt node, the state is a halt state. */
1603 if (type == END_OF_RE)
1605 else if (type == OP_BACK_REF)
1606 newstate->has_backref = 1;
1607 else if (type == ANCHOR)
1608 constraint = node->opr.ctx_type;
1612 if (newstate->entrance_nodes == &newstate->nodes)
1614 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1615 if (BE (newstate->entrance_nodes == NULL, 0))
1617 free_state (newstate);
1620 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1622 newstate->has_constraint = 1;
1625 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1627 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1632 err = register_state (dfa, newstate, hash);
1633 if (BE (err != REG_NOERROR, 0))
1635 free_state (newstate);