libdex/DexUtf.h

   1 /*
   2  * Copyright (C) 2011 The Android Open Source Project
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Validate and manipulate MUTF-8 (modified UTF-8) encoded string data.
  19  */
  20
  21 #ifndef _LIBDEX_DEXUTF
  22 #define _LIBDEX_DEXUTF
  23
  24 #include "DexFile.h"
  25
  26 #ifdef __cplusplus
  27 extern "C" {
  28 #endif
  29
  30 /*
  31  * Retrieve the next UTF-16 character from a UTF-8 string.
  32  *
  33  * Advances "*pUtf8Ptr" to the start of the next character.
  34  *
  35  * WARNING: If a string is corrupted by dropping a '\0' in the middle
  36  * of a 3-byte sequence, you can end up overrunning the buffer with
  37  * reads (and possibly with the writes if the length was computed and
  38  * cached before the damage). For performance reasons, this function
  39  * assumes that the string being parsed is known to be valid (e.g., by
  40  * already being verified). Most strings we process here are coming
  41  * out of dex files or other internal translations, so the only real
  42  * risk comes from the JNI NewStringUTF call.
  43  */
  44 DEX_INLINE u2 dexGetUtf16FromUtf8(const char** pUtf8Ptr)
  45 {
  46     unsigned int one, two, three;
  47
  48     one = *(*pUtf8Ptr)++;
  49     if ((one & 0x80) != 0) {
  50         /* two- or three-byte encoding */
  51         two = *(*pUtf8Ptr)++;
  52         if ((one & 0x20) != 0) {
  53             /* three-byte encoding */
  54             three = *(*pUtf8Ptr)++;
  55             return ((one & 0x0f) << 12) |
  56                    ((two & 0x3f) << 6) |
  57                    (three & 0x3f);
  58         } else {
  59             /* two-byte encoding */
  60             return ((one & 0x1f) << 6) |
  61                    (two & 0x3f);
  62         }
  63     } else {
  64         /* one-byte encoding */
  65         return one;
  66     }
  67 }
  68
  69 /* Compare two '\0'-terminated modified UTF-8 strings, using Unicode
  70  * code point values for comparison. This treats different encodings
  71  * for the same code point as equivalent, except that only a real '\0'
  72  * byte is considered the string terminator. The return value is as
  73  * for strcmp(). */
  74 int dexUtf8Cmp(const char* s1, const char* s2);
  75
  76 /* Bit vector, used by dexIsValidMemberNameUtf8(), of the valid low-ASCII characters. */
  77 extern u4 DEX_MEMBER_VALID_LOW_ASCII[4];
  78
  79 /* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
  80 bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr);
  81
  82 /* Return whether the pointed-at modified-UTF-8 encoded character is
  83  * valid as part of a member name, updating the pointer to point past
  84  * the consumed character. This will consume two encoded UTF-16 code
  85  * points if the character is encoded as a surrogate pair. Also, if
  86  * this function returns false, then the given pointer may only have
  87  * been partially advanced. */
  88 DEX_INLINE bool dexIsValidMemberNameUtf8(const char** pUtf8Ptr) {
  89     u1 c = (u1) **pUtf8Ptr;
  90     if (c <= 0x7f) {
  91         // It's low-ascii, so check the table.
  92         u4 wordIdx = c >> 5;
  93         u4 bitIdx = c & 0x1f;
  94         (*pUtf8Ptr)++;
  95         return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0;
  96     }
  97
  98     /*
  99      * It's a multibyte encoded character. Call a non-inline function
 100      * for the heavy lifting.
 101      */
 102     return dexIsValidMemberNameUtf8_0(pUtf8Ptr);
 103 }
 104
 105 /* Return whether the given string is a valid field or method name. */
 106 bool dexIsValidMemberName(const char* s);
 107
 108 /* Return whether the given string is a valid type descriptor. */
 109 bool dexIsValidTypeDescriptor(const char* s);
 110
 111 /* Return whether the given string is a valid internal-form class
 112  * name, with components separated either by dots or slashes as
 113  * specified. A class name is like a type descriptor, except that it
 114  * can't name a primitive type (including void). In terms of syntax,
 115  * the form is either (a) the name of the class without adornment
 116  * (that is, not bracketed by "L" and ";"); or (b) identical to the
 117  * type descriptor syntax for array types. */
 118 bool dexIsValidClassName(const char* s, bool dotSeparator);
 119
 120 /* Return whether the given string is a valid reference descriptor. This
 121  * is true if dexIsValidTypeDescriptor() returns true and the descriptor
 122  * is for a class or array and not a primitive type. */
 123 bool dexIsReferenceDescriptor(const char* s);
 124
 125 /* Return whether the given string is a valid class descriptor. This
 126  * is true if dexIsValidTypeDescriptor() returns true and the descriptor
 127  * is for a class and not an array or primitive type. */
 128 bool dexIsClassDescriptor(const char* s);
 129
 130 /* Return whether the given string is a valid field type descriptor. This
 131  * is true if dexIsValidTypeDescriptor() returns true and the descriptor
 132  * is for anything but "void". */
 133 bool dexIsFieldDescriptor(const char* s);
 134
 135 #ifdef __cplusplus
 136 }
 137 #endif
 138
 139 #endif /* def _LIBDEX_DEXUTF */