2 * Copyright (C) 2007 The Android Open Source Project
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 package java.util.regex;
19 import java.io.Serializable;
20 import java.util.ArrayList;
21 import com.ibm.icu4jni.regex.NativeRegEx;
24 * Represents a pattern used for matching, searching, or replacing strings.
25 * {@code Pattern}s are specified in terms of regular expressions and compiled
26 * using an instance of this class. They are then used in conjunction with a
27 * {@link Matcher} to perform the actual search.
29 * A typical use case looks like this:
32 * Pattern p = Pattern.compile("Hello, A[a-z]*!");
34 * Matcher m = p.matcher("Hello, Android!");
35 * boolean b1 = m.matches(); // true
37 * m.setInput("Hello, Robot!");
38 * boolean b2 = m.matches(); // false
41 * The above code could also be written in a more compact fashion, though this
42 * variant is less efficient, since {@code Pattern} and {@code Matcher} objects
43 * are created on the fly instead of being reused.
46 * boolean b1 = Pattern.matches("Hello, A[a-z]*!", "Hello, Android!"); // true
47 * boolean b2 = Pattern.matches("Hello, A[a-z]*!", "Hello, Robot!"); // false
50 * Please consult the <a href="package-summary.html">package documentation</a> for an
51 * overview of the regular expression syntax used in this class as well as
52 * Android-specific implementation details.
57 public final class Pattern implements Serializable {
59 private static final long serialVersionUID = 5073258162644648461L;
62 * This constant specifies that a pattern matches Unix line endings ('\n')
63 * only against the '.', '^', and '$' meta characters.
67 public static final int UNIX_LINES = 0x01;
70 * This constant specifies that a {@code Pattern} is matched
71 * case-insensitively. That is, the patterns "a+" and "A+" would both match
72 * the string "aAaAaA".
74 * Note: For Android, the {@code CASE_INSENSITIVE} constant
75 * (currently) always includes the meaning of the {@link #UNICODE_CASE}
76 * constant. So if case insensitivity is enabled, this automatically extends
77 * to all Unicode characters. The {@code UNICODE_CASE} constant itself has
78 * no special consequences.
82 public static final int CASE_INSENSITIVE = 0x02;
85 * This constant specifies that a {@code Pattern} may contain whitespace or
86 * comments. Otherwise comments and whitespace are taken as literal
91 public static final int COMMENTS = 0x04;
94 * This constant specifies that the meta characters '^' and '$' match only
95 * the beginning and end of an input line, respectively. Normally, they
96 * match the beginning and the end of the complete input.
100 public static final int MULTILINE = 0x08;
103 * This constant specifies that the whole {@code Pattern} is to be taken
104 * literally, that is, all meta characters lose their meanings.
108 public static final int LITERAL = 0x10;
111 * This constant specifies that the '.' meta character matches arbitrary
112 * characters, including line endings, which is normally not the case.
116 public static final int DOTALL = 0x20;
119 * This constant specifies that a {@code Pattern} is matched
120 * case-insensitively with regard to all Unicode characters. It is used in
121 * conjunction with the {@link #CASE_INSENSITIVE} constant to extend its
122 * meaning to all Unicode characters.
124 * Note: For Android, the {@code CASE_INSENSITIVE} constant
125 * (currently) always includes the meaning of the {@code UNICODE_CASE}
126 * constant. So if case insensitivity is enabled, this automatically extends
127 * to all Unicode characters. The {@code UNICODE_CASE} constant then has no
128 * special consequences.
132 public static final int UNICODE_CASE = 0x40;
135 * This constant specifies that a character in a {@code Pattern} and a
136 * character in the input string only match if they are canonically
137 * equivalent. It is (currently) not supported in Android.
141 public static final int CANON_EQ = 0x80;
144 * Holds the regular expression.
146 private String pattern;
149 * Holds the flags used when compiling this pattern.
154 * Holds a handle (a pointer, actually) for the native ICU pattern.
156 transient int mNativePattern;
159 * Holds the number of groups in the pattern.
161 transient int mGroupCount;
164 * Compiles a regular expression, creating a new Pattern instance in the
165 * process. This is actually a convenience method that calls {@link
166 * #compile(String, int)} with a {@code flags} value of zero.
169 * the regular expression.
171 * @return the new {@code Pattern} instance.
173 * @throws PatternSyntaxException
174 * if the regular expression is syntactically incorrect.
178 public static Pattern compile(String pattern) throws PatternSyntaxException {
179 return new Pattern(pattern, 0);
183 * Compiles a regular expression, creating a new {@code Pattern} instance in
184 * the process. Allows setting some flags that modify the behavior of the
188 * the regular expression.
190 * the flags to set. Basically, any combination of the constants
191 * defined in this class is valid.
193 * Note: Currently, the {@link #CASE_INSENSITIVE} and
194 * {@link #UNICODE_CASE} constants have slightly special behavior
195 * in Android, and the {@link #CANON_EQ} constant is not
198 * @return the new {@code Pattern} instance.
200 * @throws PatternSyntaxException
201 * if the regular expression is syntactically incorrect.
204 * @see #CASE_INSENSITIVE
214 public static Pattern compile(String pattern, int flags) throws PatternSyntaxException {
215 return new Pattern(pattern, flags);
219 * Creates a new {@code Pattern} instance from a given regular expression
223 * the regular expression.
225 * the flags to set. Any combination of the constants defined in
226 * this class is valid.
228 * @throws PatternSyntaxException
229 * if the regular expression is syntactically incorrect.
/**
 * Creates a new {@code Pattern} instance from a given regular expression
 * and flags.
 *
 * @param pattern
 *            the regular expression.
 * @param flags
 *            the flags to set; a combination of the constants defined in
 *            this class.
 *
 * @throws PatternSyntaxException
 *             if the regular expression is syntactically incorrect.
 * @throws UnsupportedOperationException
 *             if {@link #CANON_EQ} is among the flags.
 */
private Pattern(String pattern, int flags) throws PatternSyntaxException {
    // CANON_EQ is not supported by this implementation, so reject it
    // before anything is compiled.
    if ((flags & CANON_EQ) != 0) {
        throw new UnsupportedOperationException("CANON_EQ flag not supported");
    }

    this.pattern = pattern;
    // Remember the caller's flags: readObject() recompiles from this field.
    this.flags = flags;

    compileImpl(pattern, flags);
}
243 * Compiles the given regular expression using the given flags. Used
247 * the regular expression.
251 private void compileImpl(String pattern, int flags) throws PatternSyntaxException {
252 if (pattern == null) {
253 throw new NullPointerException();
256 if ((flags & LITERAL) != 0) {
257 pattern = quote(pattern);
260 // These are the flags natively supported by ICU.
261 // They even have the same value in native code.
262 flags = flags & (CASE_INSENSITIVE | COMMENTS | MULTILINE | DOTALL | UNIX_LINES);
264 mNativePattern = NativeRegEx.open(pattern, flags);
265 mGroupCount = NativeRegEx.groupCount(mNativePattern);
269 * Returns the regular expression that was compiled into this
272 * @return the regular expression.
276 public String pattern() {
281 * Returns the flags that have been set for this {@code Pattern}.
283 * @return the flags that have been set. A combination of the constants
284 * defined in this class.
287 * @see #CASE_INSENSITIVE
302 * Returns a {@link Matcher} for the {@code Pattern} and a given input. The
303 * {@code Matcher} can be used to match the {@code Pattern} against the
304 * whole input, find occurrences of the {@code Pattern} in the input, or
305 * replace parts of the input.
308 * the input to process.
310 * @return the resulting {@code Matcher}.
314 public Matcher matcher(CharSequence input) {
315 return new Matcher(this, input);
319 * Tries to match a given regular expression against a given input. This is
320 * actually nothing but a convenience method that compiles the regular
321 * expression into a {@code Pattern}, builds a {@link Matcher} for it, and
322 * then does the match. If the same regular expression is used for multiple
323 * operations, it is recommended to compile it into a {@code Pattern}
324 * explicitly and request a reusable {@code Matcher}.
327 * the regular expression.
329 * the input to process.
331 * @return true if and only if the {@code Pattern} matches the input.
333 * @see Pattern#compile(java.lang.String, int)
334 * @see Matcher#matches()
338 static public boolean matches(String regex, CharSequence input) {
339 return new Matcher(new Pattern(regex, 0), input).matches();
343 * Splits a given input around occurrences of a regular expression. This is
344 * a convenience method that is equivalent to calling the method
345 * {@link #split(java.lang.CharSequence, int)} with a limit of 0.
348 * the input sequence.
350 * @return the resulting array.
354 public String[] split(CharSequence input) {
355 return split(input, 0);
359 * Splits the given input sequence around occurrences of the {@code Pattern}.
360 * The function first determines all occurrences of the {@code Pattern}
361 * inside the input sequence. It then builds an array of the
362 * "remaining" strings before, in-between, and after these
363 * occurrences. An additional parameter determines the maximal number of
364 * entries in the resulting array and the handling of trailing empty
368 * the input sequence.
370 * Determines the maximal number of entries in the resulting
373 * <li>For n > 0, it is guaranteed that the resulting array
374 * contains at most n entries.
375 * <li>For n < 0, the length of the resulting array is
376 * exactly the number of occurrences of the {@code Pattern} +1.
377 * All entries are included.
378 * <li>For n == 0, the length of the resulting array is at most
379 * the number of occurrences of the {@code Pattern} +1. Empty
380 * strings at the end of the array are not included.
383 * @return the resulting array.
387 public String[] split(CharSequence inputSeq, int limit) {
388 int maxLength = limit <= 0 ? Integer.MAX_VALUE : limit;
390 String input = inputSeq.toString();
391 ArrayList<String> list = new ArrayList<String>();
393 Matcher matcher = new Matcher(this, inputSeq);
396 // Add text preceding each occurrence, if enough space. Only do this for
397 // non-empty input sequences, because otherwise we'd add the "trailing
398 // empty string" twice.
399 if (inputSeq.length() != 0) {
400 while(matcher.find() && list.size() + 1 < maxLength) {
401 list.add(input.substring(savedPos, matcher.start()));
402 savedPos = matcher.end();
406 // Add trailing text if enough space.
407 if (list.size() < maxLength) {
408 if (savedPos < input.length()) {
409 list.add(input.substring(savedPos));
415 // Remove trailing spaces, if limit == 0 is requested.
417 int i = list.size() - 1;
418 // Don't remove 1st element, since array must not be empty.
419 while(i > 0 && "".equals(list.get(i))) {
425 return list.toArray(new String[list.size()]);
429 * Quotes a given string using "\Q" and "\E", so that all other
430 * meta-characters lose their special meaning. If the string is used for a
431 * {@code Pattern} afterwards, it can only be matched literally.
434 * the string to quote.
436 * @return the quoted string.
440 public static String quote(String s) {
441 StringBuffer sb = new StringBuffer().append("\\Q");
444 while ((k = s.indexOf("\\E", apos)) >= 0) {
445 sb.append(s.substring(apos, k + 2)).append("\\\\E\\Q");
449 return sb.append(s.substring(apos)).append("\\E").toString();
453 public String toString() {
458 protected void finalize() throws Throwable {
460 if (mNativePattern != 0) {
461 NativeRegEx.close(mNativePattern);
470 * Provides serialization support
472 private void readObject(java.io.ObjectInputStream s)
473 throws java.io.IOException, ClassNotFoundException {
474 s.defaultReadObject();
476 compileImpl(pattern, flags);