vm/UtfString.c

   1 /*
   2  * Copyright (C) 2008 The Android Open Source Project
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * UTF-8 and Unicode string manipulation, plus java/lang/String convenience
  19  * functions.
  20  *
  21  * In most cases we populate the fields in the String object directly,
  22  * rather than going through an instance field lookup.
  23  */
  24 #include "Dalvik.h"
  25 #include <stdlib.h>
  26
  27 /*
  28  * Initialize string globals.
  29  *
  30  * This isn't part of the VM init sequence because it's hard to get the
  31  * timing right -- we need it to happen after java/lang/String has been
  32  * loaded, but before anybody wants to use a string.  It's easiest to
  33  * just initialize it on first use.
  34  *
  35  * In some unusual circumstances (e.g. trying to throw an exception because
  36  * String implements java/lang/CharSequence, but CharSequence doesn't exist)
  37  * we can try to create an exception string internally before anything has
  38  * really tried to use String.  In that case we basically self-destruct.
  39  */
  40 static bool stringStartup()
  41 {
  42     if (gDvm.javaLangStringReady < 0) {
  43         LOGE("ERROR: reentrant string initialization\n");
  44         assert(false);
  45         return false;
  46     }
  47     assert(gDvm.javaLangStringReady == 0);
  48
  49     gDvm.javaLangStringReady = -1;
  50
  51     if (gDvm.classJavaLangString == NULL)
  52         gDvm.classJavaLangString =
  53             dvmFindSystemClassNoInit("Ljava/lang/String;");
  54
  55     gDvm.offJavaLangString_value =
  56         dvmFindFieldOffset(gDvm.classJavaLangString, "value", "[C");
  57     gDvm.offJavaLangString_count =
  58         dvmFindFieldOffset(gDvm.classJavaLangString, "count", "I");
  59     gDvm.offJavaLangString_offset =
  60         dvmFindFieldOffset(gDvm.classJavaLangString, "offset", "I");
  61     gDvm.offJavaLangString_hashCode =
  62         dvmFindFieldOffset(gDvm.classJavaLangString, "hashCode", "I");
  63
  64     if (gDvm.offJavaLangString_value < 0 ||
  65         gDvm.offJavaLangString_count < 0 ||
  66         gDvm.offJavaLangString_offset < 0 ||
  67         gDvm.offJavaLangString_hashCode < 0)
  68     {
  69         LOGE("VM-required field missing from java/lang/String\n");
  70         return false;
  71     }
  72
  73     bool badValue = false;
  74     if (gDvm.offJavaLangString_value != STRING_FIELDOFF_VALUE) {
  75         LOGE("InlineNative: String.value offset = %d, expected %d\n",
  76             gDvm.offJavaLangString_value, STRING_FIELDOFF_VALUE);
  77         badValue = true;
  78     }
  79     if (gDvm.offJavaLangString_count != STRING_FIELDOFF_COUNT) {
  80         LOGE("InlineNative: String.count offset = %d, expected %d\n",
  81             gDvm.offJavaLangString_count, STRING_FIELDOFF_COUNT);
  82         badValue = true;
  83     }
  84     if (gDvm.offJavaLangString_offset != STRING_FIELDOFF_OFFSET) {
  85         LOGE("InlineNative: String.offset offset = %d, expected %d\n",
  86             gDvm.offJavaLangString_offset, STRING_FIELDOFF_OFFSET);
  87         badValue = true;
  88     }
  89     if (gDvm.offJavaLangString_hashCode != STRING_FIELDOFF_HASHCODE) {
  90         LOGE("InlineNative: String.hashCode offset = %d, expected %d\n",
  91             gDvm.offJavaLangString_hashCode, STRING_FIELDOFF_HASHCODE);
  92         badValue = true;
  93     }
  94     if (badValue)
  95         return false;
  96
  97     gDvm.javaLangStringReady = 1;
  98
  99     return true;
 100 }
 101
 102 /*
 103  * Discard heap-allocated storage.
 104  */
 105 void dvmStringShutdown()
 106 {
 107     // currently unused
 108 }
 109
 110 /*
 111  * Compute a hash code on a UTF-8 string, for use with internal hash tables.
 112  *
 113  * This may or may not yield the same results as the java/lang/String
 114  * computeHashCode() function.  (To make sure this doesn't get abused,
 115  * I'm initializing the hash code to 1 so they *don't* match up.)
 116  *
 117  * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute
 118  * the hash with the result.  That way, if something encoded the same
 119  * character in two different ways, the hash value would be the same.  For
 120  * our purposes that isn't necessary.
 121  */
 122 u4 dvmComputeUtf8Hash(const char* utf8Str)
 123 {
 124     u4 hash = 1;
 125
 126     while (*utf8Str != '\0')
 127         hash = hash * 31 + *utf8Str++;
 128
 129     return hash;
 130 }
 131
 132 /*
 133  * Like "strlen", but for strings encoded with "modified" UTF-8.
 134  *
 135  * The value returned is the number of characters, which may or may not
 136  * be the same as the number of bytes.
 137  *
 138  * (If this needs optimizing, try: mask against 0xa0, shift right 5,
 139  * get increment {1-3} from table of 8 values.)
 140  */
 141 int dvmUtf8Len(const char* utf8Str)
 142 {
 143     int ic, len = 0;
 144
 145     while ((ic = *utf8Str++) != '\0') {
 146         len++;
 147         if ((ic & 0x80) != 0) {
 148             /* two- or three-byte encoding */
 149             utf8Str++;
 150             if ((ic & 0x20) != 0) {
 151                 /* three-byte encoding */
 152                 utf8Str++;
 153             }
 154         }
 155     }
 156
 157     return len;
 158 }
 159
 160 /*
 161  * Convert a "modified" UTF-8 string to UTF-16.
 162  */
 163 void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str)
 164 {
 165     while (*utf8Str != '\0')
 166         *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str);
 167 }
 168
 169 /*
 170  * Given a UTF-16 string, compute the length of the corresponding UTF-8
 171  * string in bytes.
 172  */
 173 static int utf16_utf8ByteLen(const u2* utf16Str, int len)
 174 {
 175     int utf8Len = 0;
 176
 177     while (len--) {
 178         unsigned int uic = *utf16Str++;
 179
 180         /*
 181          * The most common case is (uic > 0 && uic <= 0x7f).
 182          */
 183         if (uic == 0 || uic > 0x7f) {
 184             if (uic > 0x07ff)
 185                 utf8Len += 3;
 186             else /*(uic > 0x7f || uic == 0) */
 187                 utf8Len += 2;
 188         } else
 189             utf8Len++;
 190     }
 191     return utf8Len;
 192 }
 193
 194 /*
 195  * Convert a UTF-16 string to UTF-8.
 196  *
 197  * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(),
 198  * not just "len".
 199  */
 200 static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len)
 201 {
 202     assert(len >= 0);
 203
 204     while (len--) {
 205         unsigned int uic = *utf16Str++;
 206
 207         /*
 208          * The most common case is (uic > 0 && uic <= 0x7f).
 209          */
 210         if (uic == 0 || uic > 0x7f) {
 211             if (uic > 0x07ff) {
 212                 *utf8Str++ = (uic >> 12) | 0xe0;
 213                 *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80;
 214                 *utf8Str++ = (uic & 0x3f) | 0x80;
 215             } else /*(uic > 0x7f || uic == 0)*/ {
 216                 *utf8Str++ = (uic >> 6) | 0xc0;
 217                 *utf8Str++ = (uic & 0x3f) | 0x80;
 218             }
 219         } else {
 220             *utf8Str++ = uic;
 221         }
 222     }
 223
 224     *utf8Str = '\0';
 225 }
 226
 227 /*
 228  * Use the java/lang/String.computeHashCode() algorithm.
 229  */
 230 static inline u4 dvmComputeUtf16Hash(const u2* utf16Str, int len)
 231 {
 232     u4 hash = 0;
 233
 234     while (len--)
 235         hash = hash * 31 + *utf16Str++;
 236
 237     return hash;
 238 }
 239 u4 dvmComputeStringHash(StringObject* strObj) {
 240     ArrayObject* chars = (ArrayObject*) dvmGetFieldObject((Object*) strObj,
 241                                 STRING_FIELDOFF_VALUE);
 242     int offset, len;
 243
 244     len = dvmGetFieldInt((Object*) strObj, STRING_FIELDOFF_COUNT);
 245     offset = dvmGetFieldInt((Object*) strObj, STRING_FIELDOFF_OFFSET);
 246
 247     return dvmComputeUtf16Hash((u2*) chars->contents + offset, len);
 248 }
 249
 250 /*
 251  * Create a new java/lang/String object, using the string data in "utf8Str".
 252  *
 253  * Note that "allocFlags" affects both of the allocations here.  If you
 254  * use ALLOC_DONT_TRACK in a context where a GC could happen between the
 255  * two allocations, you could lose the array reference.
 256  *
 257  * Returns NULL and throws an exception on failure.
 258  */
 259 StringObject* dvmCreateStringFromCstr(const char* utf8Str, int allocFlags)
 260 {
 261     assert(utf8Str != NULL);
 262
 263     return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str),
 264             allocFlags);
 265 }
 266
 267 /*
 268  * Create a java/lang/String from a C string, given its UTF-16 length
 269  * (number of UTF-16 code points).
 270  *
 271  * The caller must call dvmReleaseTrackedAlloc() on the return value or
 272  * use a non-default value for "allocFlags".  It is never appropriate
 273  * to use ALLOC_DONT_TRACK with this function.
 274  *
 275  * Returns NULL and throws an exception on failure.
 276  */
 277 StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str,
 278     u4 utf16Length, int allocFlags)
 279 {
 280     StringObject* newObj;
 281     ArrayObject* chars;
 282     u4 hashCode = 0;
 283
 284     //LOGV("Creating String from '%s'\n", utf8Str);
 285     assert(allocFlags != ALLOC_DONT_TRACK);     /* don't currently need */
 286     assert(utf8Str != NULL);
 287
 288     if (gDvm.javaLangStringReady <= 0) {
 289         if (!stringStartup())
 290             return NULL;
 291     }
 292
 293     /* init before alloc */
 294     if (!dvmIsClassInitialized(gDvm.classJavaLangString) &&
 295         !dvmInitClass(gDvm.classJavaLangString))
 296     {
 297         return NULL;
 298     }
 299
 300     newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString,
 301                 allocFlags);
 302     if (newObj == NULL)
 303         return NULL;
 304
 305     chars = dvmAllocPrimitiveArray('C', utf16Length, allocFlags);
 306     if (chars == NULL) {
 307         dvmReleaseTrackedAllocIFN((Object*) newObj, NULL, allocFlags);
 308         return NULL;
 309     }
 310     dvmConvertUtf8ToUtf16((u2*)chars->contents, utf8Str);
 311     hashCode = dvmComputeUtf16Hash((u2*) chars->contents, utf16Length);
 312
 313     dvmSetFieldObject((Object*)newObj, STRING_FIELDOFF_VALUE,
 314         (Object*)chars);
 315     dvmReleaseTrackedAllocIFN((Object*) chars, NULL, allocFlags);
 316     dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_COUNT, utf16Length);
 317     dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
 318     /* leave offset set to zero */
 319
 320     /* debugging stuff */
 321     //dvmDumpObject((Object*)newObj);
 322     //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, utf16Length * 2,
 323     //    kHexDumpMem);
 324
 325     /* caller may need to dvmReleaseTrackedAlloc(newObj) */
 326     return newObj;
 327 }
 328
 329 /*
 330  * Create a new java/lang/String object, using the Unicode data.
 331  */
 332 StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len)
 333 {
 334     StringObject* newObj;
 335     ArrayObject* chars;
 336     u4 hashCode = 0;
 337
 338     /* we allow a null pointer if the length is zero */
 339     assert(len == 0 || unichars != NULL);
 340
 341     if (gDvm.javaLangStringReady <= 0) {
 342         if (!stringStartup())
 343             return NULL;
 344     }
 345
 346     /* init before alloc */
 347     if (!dvmIsClassInitialized(gDvm.classJavaLangString) &&
 348         !dvmInitClass(gDvm.classJavaLangString))
 349     {
 350         return NULL;
 351     }
 352
 353     newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString,
 354         ALLOC_DEFAULT);
 355     if (newObj == NULL)
 356         return NULL;
 357
 358     chars = dvmAllocPrimitiveArray('C', len, ALLOC_DEFAULT);
 359     if (chars == NULL) {
 360         dvmReleaseTrackedAlloc((Object*) newObj, NULL);
 361         return NULL;
 362     }
 363     if (len > 0)
 364         memcpy(chars->contents, unichars, len * sizeof(u2));
 365     hashCode = dvmComputeUtf16Hash((u2*) chars->contents, len);
 366
 367     dvmSetFieldObject((Object*)newObj, STRING_FIELDOFF_VALUE,
 368         (Object*)chars);
 369     dvmReleaseTrackedAlloc((Object*) chars, NULL);
 370     dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_COUNT, len);
 371     dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
 372     /* leave offset set to zero */
 373
 374     /* debugging stuff */
 375     //dvmDumpObject((Object*)newObj);
 376     //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, len*2, kHexDumpMem);
 377
 378     /* caller must dvmReleaseTrackedAlloc(newObj) */
 379     return newObj;
 380 }
 381
 382 /*
 383  * Create a new C string from a java/lang/String object.
 384  *
 385  * Returns NULL if the object is NULL.
 386  */
 387 char* dvmCreateCstrFromString(StringObject* jstr)
 388 {
 389     char* newStr;
 390     ArrayObject* chars;
 391     int len, byteLen, offset;
 392     const u2* data;
 393
 394     assert(gDvm.javaLangStringReady > 0);
 395
 396     if (jstr == NULL)
 397         return NULL;
 398
 399     len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
 400     offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
 401     chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
 402                                 STRING_FIELDOFF_VALUE);
 403     data = (const u2*) chars->contents + offset;
 404     assert(offset + len <= (int) chars->length);
 405
 406     byteLen = utf16_utf8ByteLen(data, len);
 407     newStr = (char*) malloc(byteLen+1);
 408     if (newStr == NULL)
 409         return NULL;
 410     convertUtf16ToUtf8(newStr, data, len);
 411
 412     return newStr;
 413 }
 414
 415 /*
 416  * Create a UTF-8 C string from a region of a java/lang/String.  (Used by
 417  * the JNI GetStringUTFRegion call.)
 418  */
 419 void dvmCreateCstrFromStringRegion(StringObject* jstr, int start, int len,
 420     char* buf)
 421 {
 422     const u2* data;
 423
 424     data = dvmStringChars(jstr) + start;
 425     convertUtf16ToUtf8(buf, data, len);
 426 }
 427
 428 /*
 429  * Compute the length, in modified UTF-8, of a java/lang/String object.
 430  *
 431  * Does not include the terminating null byte.
 432  */
 433 int dvmStringUtf8ByteLen(StringObject* jstr)
 434 {
 435     ArrayObject* chars;
 436     int len, offset;
 437     const u2* data;
 438
 439     assert(gDvm.javaLangStringReady > 0);
 440
 441     if (jstr == NULL)
 442         return 0;       // should we throw something?  assert?
 443
 444     len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
 445     offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
 446     chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
 447                                 STRING_FIELDOFF_VALUE);
 448     data = (const u2*) chars->contents + offset;
 449     assert(offset + len <= (int) chars->length);
 450
 451     return utf16_utf8ByteLen(data, len);
 452 }
 453
 454 /*
 455  * Get the string's length.
 456  */
 457 int dvmStringLen(StringObject* jstr)
 458 {
 459     return dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
 460 }
 461
 462 /*
 463  * Get the char[] object from the String.
 464  */
 465 ArrayObject* dvmStringCharArray(StringObject* jstr)
 466 {
 467     return (ArrayObject*) dvmGetFieldObject((Object*) jstr,
 468                                 STRING_FIELDOFF_VALUE);
 469 }
 470
 471 /*
 472  * Get the string's data.
 473  */
 474 const u2* dvmStringChars(StringObject* jstr)
 475 {
 476     ArrayObject* chars;
 477     int offset;
 478
 479     offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
 480     chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
 481                                 STRING_FIELDOFF_VALUE);
 482     return (const u2*) chars->contents + offset;
 483 }
 484
 485
 486 /*
 487  * Compare two String objects.
 488  *
 489  * This is a dvmHashTableLookup() callback.  The function has already
 490  * compared their hash values; we need to do a full compare to ensure
 491  * that the strings really match.
 492  */
 493 int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2)
 494 {
 495     const StringObject* strObj1 = (const StringObject*) vstrObj1;
 496     const StringObject* strObj2 = (const StringObject*) vstrObj2;
 497     ArrayObject* chars1;
 498     ArrayObject* chars2;
 499     int len1, len2, offset1, offset2;
 500
 501     assert(gDvm.javaLangStringReady > 0);
 502
 503     /* get offset and length into char array; all values are in 16-bit units */
 504     len1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_COUNT);
 505     offset1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_OFFSET);
 506     len2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_COUNT);
 507     offset2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_OFFSET);
 508     if (len1 != len2)
 509         return len1 - len2;
 510
 511     chars1 = (ArrayObject*) dvmGetFieldObject((Object*) strObj1,
 512                                 STRING_FIELDOFF_VALUE);
 513     chars2 = (ArrayObject*) dvmGetFieldObject((Object*) strObj2,
 514                                 STRING_FIELDOFF_VALUE);
 515
 516     /* damage here actually indicates a broken java/lang/String */
 517     assert(offset1 + len1 <= (int) chars1->length);
 518     assert(offset2 + len2 <= (int) chars2->length);
 519
 520     return memcmp((const u2*) chars1->contents + offset1,
 521                   (const u2*) chars2->contents + offset2,
 522                   len1 * sizeof(u2));
 523 }
 524