2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #ifndef RegexPattern_h
28 #define RegexPattern_h
30 #include <wtf/Vector.h>
31 #include <wtf/unicode/Unicode.h>
33 namespace JSC { namespace Yarr {
35 #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
36 #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
37 #define RegexStackSpaceForBackTrackInfoBackReference 2
38 #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
39 #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1
40 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
41 #define RegexStackSpaceForBackTrackInfoParenthesesTerminal 1
42 #define RegexStackSpaceForBackTrackInfoParentheses 4
44 struct PatternDisjunction;
46 struct CharacterRange {
50 CharacterRange(UChar begin, UChar end)
57 struct CharacterClassTable : RefCounted<CharacterClassTable> {
60 static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
62 return adoptRef(new CharacterClassTable(table, inverted));
66 CharacterClassTable(const char* table, bool inverted)
68 , m_inverted(inverted)
73 struct CharacterClass : FastAllocBase {
74 // All CharacterClass instances have to have the full set of matches and ranges,
75 // they may have an optional table for faster lookups (which must match the
76 // specified matches and ranges)
77 CharacterClass(PassRefPtr<CharacterClassTable> table)
81 Vector<UChar> m_matches;
82 Vector<CharacterRange> m_ranges;
83 Vector<UChar> m_matchesUnicode;
84 Vector<CharacterRange> m_rangesUnicode;
85 RefPtr<CharacterClassTable> m_table;
98 TypeAssertionWordBoundary,
102 TypeForwardReference,
103 TypeParenthesesSubpattern,
104 TypeParentheticalAssertion,
106 bool invertOrCapture;
108 UChar patternCharacter;
109 CharacterClass* characterClass;
110 unsigned backReferenceSubpatternId;
112 PatternDisjunction* disjunction;
113 unsigned subpatternId;
114 unsigned lastSubpatternId;
119 QuantifierType quantityType;
120 unsigned quantityCount;
122 unsigned frameLocation;
124 PatternTerm(UChar ch)
125 : type(PatternTerm::TypePatternCharacter)
127 patternCharacter = ch;
128 quantityType = QuantifierFixedCount;
132 PatternTerm(CharacterClass* charClass, bool invert)
133 : type(PatternTerm::TypeCharacterClass)
134 , invertOrCapture(invert)
136 characterClass = charClass;
137 quantityType = QuantifierFixedCount;
141 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture)
143 , invertOrCapture(invertOrCapture)
145 parentheses.disjunction = disjunction;
146 parentheses.subpatternId = subpatternId;
147 parentheses.isCopy = false;
148 parentheses.isTerminal = false;
149 quantityType = QuantifierFixedCount;
153 PatternTerm(Type type, bool invert = false)
155 , invertOrCapture(invert)
157 quantityType = QuantifierFixedCount;
161 PatternTerm(unsigned spatternId)
162 : type(TypeBackReference)
163 , invertOrCapture(false)
165 backReferenceSubpatternId = spatternId;
166 quantityType = QuantifierFixedCount;
170 static PatternTerm ForwardReference()
172 return PatternTerm(TypeForwardReference);
175 static PatternTerm BOL()
177 return PatternTerm(TypeAssertionBOL);
180 static PatternTerm EOL()
182 return PatternTerm(TypeAssertionEOL);
185 static PatternTerm WordBoundary(bool invert)
187 return PatternTerm(TypeAssertionWordBoundary, invert);
192 return invertOrCapture;
197 return invertOrCapture;
200 void quantify(unsigned count, QuantifierType type)
202 quantityCount = count;
207 struct PatternAlternative : FastAllocBase {
208 PatternAlternative(PatternDisjunction* disjunction)
209 : m_parent(disjunction)
210 , m_onceThrough(false)
211 , m_hasFixedSize(false)
212 , m_startsWithBOL(false)
213 , m_containsBOL(false)
217 PatternTerm& lastTerm()
219 ASSERT(m_terms.size());
220 return m_terms[m_terms.size() - 1];
223 void removeLastTerm()
225 ASSERT(m_terms.size());
226 m_terms.shrink(m_terms.size() - 1);
229 void setOnceThrough()
231 m_onceThrough = true;
236 return m_onceThrough;
239 Vector<PatternTerm> m_terms;
240 PatternDisjunction* m_parent;
241 unsigned m_minimumSize;
242 bool m_onceThrough : 1;
243 bool m_hasFixedSize : 1;
244 bool m_startsWithBOL : 1;
245 bool m_containsBOL : 1;
248 struct PatternDisjunction : FastAllocBase {
249 PatternDisjunction(PatternAlternative* parent = 0)
251 , m_hasFixedSize(false)
255 ~PatternDisjunction()
257 deleteAllValues(m_alternatives);
260 PatternAlternative* addNewAlternative()
262 PatternAlternative* alternative = new PatternAlternative(this);
263 m_alternatives.append(alternative);
267 Vector<PatternAlternative*> m_alternatives;
268 PatternAlternative* m_parent;
269 unsigned m_minimumSize;
270 unsigned m_callFrameSize;
274 // You probably don't want to be calling these functions directly
275 // (please to be calling newlineCharacterClass() et al on your
276 // friendly neighborhood RegexPattern instance to get nicely
278 CharacterClass* newlineCreate();
279 CharacterClass* digitsCreate();
280 CharacterClass* spacesCreate();
281 CharacterClass* wordcharCreate();
282 CharacterClass* nondigitsCreate();
283 CharacterClass* nonspacesCreate();
284 CharacterClass* nonwordcharCreate();
287 TermChain(PatternTerm term)
292 Vector<TermChain> hotTerms;
301 BeginChar(unsigned value, unsigned mask)
310 struct RegexPattern {
311 RegexPattern(bool ignoreCase, bool multiline)
312 : m_ignoreCase(ignoreCase)
313 , m_multiline(multiline)
314 , m_containsBackreferences(false)
315 , m_containsBeginChars(false)
316 , m_containsBOL(false)
317 , m_numSubpatterns(0)
318 , m_maxBackReference(0)
325 , nonwordcharCached(0)
331 deleteAllValues(m_disjunctions);
332 deleteAllValues(m_userCharacterClasses);
337 m_numSubpatterns = 0;
338 m_maxBackReference = 0;
340 m_containsBackreferences = false;
341 m_containsBeginChars = false;
342 m_containsBOL = false;
350 nonwordcharCached = 0;
352 deleteAllValues(m_disjunctions);
353 m_disjunctions.clear();
354 deleteAllValues(m_userCharacterClasses);
355 m_userCharacterClasses.clear();
356 m_beginChars.clear();
359 bool containsIllegalBackReference()
361 return m_maxBackReference > m_numSubpatterns;
364 CharacterClass* newlineCharacterClass()
367 m_userCharacterClasses.append(newlineCached = newlineCreate());
368 return newlineCached;
370 CharacterClass* digitsCharacterClass()
373 m_userCharacterClasses.append(digitsCached = digitsCreate());
376 CharacterClass* spacesCharacterClass()
379 m_userCharacterClasses.append(spacesCached = spacesCreate());
382 CharacterClass* wordcharCharacterClass()
385 m_userCharacterClasses.append(wordcharCached = wordcharCreate());
386 return wordcharCached;
388 CharacterClass* nondigitsCharacterClass()
390 if (!nondigitsCached)
391 m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
392 return nondigitsCached;
394 CharacterClass* nonspacesCharacterClass()
396 if (!nonspacesCached)
397 m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
398 return nonspacesCached;
400 CharacterClass* nonwordcharCharacterClass()
402 if (!nonwordcharCached)
403 m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
404 return nonwordcharCached;
407 bool m_ignoreCase : 1;
408 bool m_multiline : 1;
409 bool m_containsBackreferences : 1;
410 bool m_containsBeginChars : 1;
411 bool m_containsBOL : 1;
412 unsigned m_numSubpatterns;
413 unsigned m_maxBackReference;
414 PatternDisjunction* m_body;
415 Vector<PatternDisjunction*, 4> m_disjunctions;
416 Vector<CharacterClass*> m_userCharacterClasses;
417 Vector<BeginChar> m_beginChars;
420 CharacterClass* newlineCached;
421 CharacterClass* digitsCached;
422 CharacterClass* spacesCached;
423 CharacterClass* wordcharCached;
424 CharacterClass* nondigitsCached;
425 CharacterClass* nonspacesCached;
426 CharacterClass* nonwordcharCached;
429 } } // namespace JSC::Yarr
431 #endif // RegexPattern_h