OSDN Git Service

bionic: Teach fnmatch(3) to handle UTF-8 characters in patterns
[android-x86/bionic.git] / libc / upstream-openbsd / lib / libc / gen / fnmatch.c
index 0d0f18f..ac6280e 100644 (file)
 #include <string.h>
 #include <ctype.h>
 
+#include <stdint.h>
+
 #include "charclass.h"
 
 #define        RANGE_MATCH     1
 #define        RANGE_NOMATCH   0
 #define        RANGE_ERROR     (-1)
 
+static unsigned int
+utf8_len(const char *s)
+{
+    const unsigned char *b = (const unsigned char *)s;
+    unsigned int len = 1;
+    unsigned char c;
+
+    if ((b[0] & 0xc0) != 0xc0) {
+        return 1;
+    }
+    c = b[0] << 1;
+    while (len < 6 && (c & 0x80)) {
+        if ((b[len] & 0xc0) != 0x80) {
+            return 1;
+        }
+        c <<= 1;
+        ++len;
+    }
+
+    return len;
+}
+
+static void
+utf8_inc(const char **s)
+{
+    *s += utf8_len(*s);
+}
+
+static uint32_t
+utf8_get_inc(const char **s)
+{
+    unsigned int len = utf8_len(*s);
+    const unsigned char *b = (const unsigned char *)(*s);
+    unsigned int n;
+    uint32_t ch;
+
+    *s += len;
+
+    if (len == 1) {
+        return b[0];
+    }
+
+    ch = b[0] & (0xff >> len);
+    for (n = 1; n < len; ++n) {
+        ch <<= 6;
+        ch |= (b[n] & 0x3f);
+    }
+
+    return ch;
+}
+
+static uint32_t
+utf8_get(const char *s)
+{
+    const char *tmp = s;
+    return utf8_get_inc(&tmp);
+}
+
 static int
 classmatch(const char *pattern, char test, int foldcase, const char **ep)
 {
@@ -149,7 +209,7 @@ static int fnmatch_ch(const char **pattern, const char **string, int flags)
     const int escape = !(flags & FNM_NOESCAPE);
     const int slash = !!(flags & FNM_PATHNAME);
     int result = FNM_NOMATCH;
-    const char *startch;
+    uint32_t startch, endch, compch;
     int negate;
 
     if (**pattern == '[')
@@ -170,7 +230,7 @@ static int fnmatch_ch(const char **pattern, const char **string, int flags)
             if (**pattern == ']') {
                 ++*pattern;
                 /* XXX: Fix for MBCS character width */
-                ++*string;
+                utf8_inc(string);
                 return (result ^ negate);
             }
 
@@ -200,10 +260,13 @@ leadingclosebrace:
              * "x-]" is not allowed unless escaped ("x-\]")
              * XXX: Fix for locale/MBCS character width
              */
-            if (((*pattern)[1] == '-') && ((*pattern)[2] != ']'))
+            startch = utf8_get_inc(pattern);
+            compch = utf8_get(*string);
+            if (((*pattern)[0] == '-') && ((*pattern)[1] != ']'))
             {
-                startch = *pattern;
-                *pattern += (escape && ((*pattern)[2] == '\\')) ? 3 : 2;
+                *pattern += 1;
+                if (escape && **pattern == '\\')
+                    *pattern += 1;
 
                 /* NOT a properly balanced [expr] pattern, EOS terminated 
                  * or ranges containing a slash in FNM_PATHNAME mode pattern
@@ -212,32 +275,35 @@ leadingclosebrace:
                 if (!**pattern || (slash && (**pattern == '/')))
                     break;
 
+                endch = utf8_get_inc(pattern);
+
+                /* Refuse to attempt collation for non-ASCII chars */
+                if (startch >= 0x80 || endch >= 0x80)
+                    continue;
+
                 /* XXX: handle locale/MBCS comparison, advance by MBCS char width */
-                if ((**string >= *startch) && (**string <= **pattern))
+                if ((compch >= startch) && (compch <= endch))
                     result = 0;
-                else if (nocase && (isupper((unsigned char)**string) ||
-                           isupper((unsigned char)*startch) ||
-                            isupper((unsigned char)**pattern))
-                            && (tolower((unsigned char)**string) >=
-                               tolower((unsigned char)*startch)) 
-                            && (tolower((unsigned char)**string) <=
-                               tolower((unsigned char)**pattern)))
+                else if (nocase && (isupper(compch) ||
+                           isupper(startch) ||
+                            isupper(endch))
+                            && (tolower(compch) >=
+                               tolower(startch))
+                            && (tolower(compch) <=
+                               tolower(endch)))
                     result = 0;
 
-                ++*pattern;
                 continue;
             }
 
             /* XXX: handle locale/MBCS comparison, advance by MBCS char width */
-            if ((**string == **pattern))
+            if (compch == startch)
                 result = 0;
-            else if (nocase && (isupper((unsigned char)**string) ||
-                           isupper((unsigned char)**pattern))
-                            && (tolower((unsigned char)**string) ==
-                               tolower((unsigned char)**pattern)))
+            else if (nocase && (isupper(compch) ||
+                           isupper(startch))
+                            && (tolower(compch) ==
+                               tolower(startch)))
                 result = 0;
-
-            ++*pattern;
         }
 
         /* NOT a properly balanced [expr] pattern; Rewind
@@ -258,7 +324,7 @@ leadingclosebrace:
     }
 
     /* XXX: handle locale/MBCS comparison, advance by the MBCS char width */
-    if (**string == **pattern)
+    if (utf8_get(*string) == utf8_get(*pattern))
         result = 0;
     else if (nocase && (isupper((unsigned char)**string) ||
                    isupper((unsigned char)**pattern))
@@ -272,8 +338,8 @@ leadingclosebrace:
         return result;
 
 fnmatch_ch_success:
-    ++*pattern;
-    ++*string;
+    utf8_inc(pattern);
+    utf8_inc(string);
     return result;
 }