diff options
Diffstat (limited to 'vm/UtfString.cpp')
-rw-r--r-- | vm/UtfString.cpp | 412 |
1 files changed, 412 insertions, 0 deletions
diff --git a/vm/UtfString.cpp b/vm/UtfString.cpp new file mode 100644 index 0000000..63d116e --- /dev/null +++ b/vm/UtfString.cpp @@ -0,0 +1,412 @@ +/* + * Copyright (C) 2008 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * UTF-8 and Unicode string manipulation, plus java/lang/String convenience + * functions. + * + * In most cases we populate the fields in the String object directly, + * rather than going through an instance field lookup. + */ +#include "Dalvik.h" +#include <stdlib.h> + +/* + * Allocate a new instance of the class String, performing first-use + * initialization of the class if necessary. Upon success, the + * returned value will have all its fields except hashCode already + * filled in, including a reference to a newly-allocated char[] for + * the contents, sized as given. Additionally, a reference to the + * chars array is stored to the pChars pointer. Callers must + * subsequently call dvmReleaseTrackedAlloc() on the result pointer. + * This function returns NULL on failure. + */ +static StringObject* makeStringObject(u4 charsLength, ArrayObject** pChars) +{ + /* + * The String class should have already gotten found (but not + * necessarily initialized) before making it here. We assert it + * explicitly, since historically speaking, we have had bugs with + * regard to when the class String gets set up. The assert helps + * make any regressions easier to diagnose. + */ + assert(gDvm.classJavaLangString != NULL); + + if (!dvmIsClassInitialized(gDvm.classJavaLangString)) { + /* Perform first-time use initialization of the class. */ + if (!dvmInitClass(gDvm.classJavaLangString)) { + LOGE("FATAL: Could not initialize class String"); + dvmAbort(); + } + } + + Object* result = dvmAllocObject(gDvm.classJavaLangString, ALLOC_DEFAULT); + if (result == NULL) { + return NULL; + } + + ArrayObject* chars = dvmAllocPrimitiveArray('C', charsLength, ALLOC_DEFAULT); + if (chars == NULL) { + dvmReleaseTrackedAlloc(result, NULL); + return NULL; + } + + dvmSetFieldInt(result, STRING_FIELDOFF_COUNT, charsLength); + dvmSetFieldObject(result, STRING_FIELDOFF_VALUE, (Object*) chars); + dvmReleaseTrackedAlloc((Object*) chars, NULL); + /* Leave offset and hashCode set to zero. */ + + *pChars = chars; + return (StringObject*) result; +} + +/* + * Compute a hash code on a UTF-8 string, for use with internal hash tables. + * + * This may or may not yield the same results as the java/lang/String + * computeHashCode() function. (To make sure this doesn't get abused, + * I'm initializing the hash code to 1 so they *don't* match up.) + * + * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute + * the hash with the result. That way, if something encoded the same + * character in two different ways, the hash value would be the same. For + * our purposes that isn't necessary. + */ +u4 dvmComputeUtf8Hash(const char* utf8Str) +{ + u4 hash = 1; + + while (*utf8Str != '\0') + hash = hash * 31 + *utf8Str++; + + return hash; +} + +/* + * Like "strlen", but for strings encoded with "modified" UTF-8. + * + * The value returned is the number of characters, which may or may not + * be the same as the number of bytes. + * + * (If this needs optimizing, try: mask against 0xa0, shift right 5, + * get increment {1-3} from table of 8 values.) + */ +size_t dvmUtf8Len(const char* utf8Str) +{ + size_t len = 0; + int ic; + + while ((ic = *utf8Str++) != '\0') { + len++; + if ((ic & 0x80) != 0) { + /* two- or three-byte encoding */ + utf8Str++; + if ((ic & 0x20) != 0) { + /* three-byte encoding */ + utf8Str++; + } + } + } + + return len; +} + +/* + * Convert a "modified" UTF-8 string to UTF-16. + */ +void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str) +{ + while (*utf8Str != '\0') + *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str); +} + +/* + * Given a UTF-16 string, compute the length of the corresponding UTF-8 + * string in bytes. + */ +static int utf16_utf8ByteLen(const u2* utf16Str, int len) +{ + int utf8Len = 0; + + while (len--) { + unsigned int uic = *utf16Str++; + + /* + * The most common case is (uic > 0 && uic <= 0x7f). + */ + if (uic == 0 || uic > 0x7f) { + if (uic > 0x07ff) + utf8Len += 3; + else /*(uic > 0x7f || uic == 0) */ + utf8Len += 2; + } else + utf8Len++; + } + return utf8Len; +} + +/* + * Convert a UTF-16 string to UTF-8. + * + * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(), + * not just "len". + */ +static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len) +{ + assert(len >= 0); + + while (len--) { + unsigned int uic = *utf16Str++; + + /* + * The most common case is (uic > 0 && uic <= 0x7f). + */ + if (uic == 0 || uic > 0x7f) { + if (uic > 0x07ff) { + *utf8Str++ = (uic >> 12) | 0xe0; + *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80; + *utf8Str++ = (uic & 0x3f) | 0x80; + } else /*(uic > 0x7f || uic == 0)*/ { + *utf8Str++ = (uic >> 6) | 0xc0; + *utf8Str++ = (uic & 0x3f) | 0x80; + } + } else { + *utf8Str++ = uic; + } + } + + *utf8Str = '\0'; +} + +/* + * Use the java/lang/String.computeHashCode() algorithm. + */ +static inline u4 computeUtf16Hash(const u2* utf16Str, size_t len) +{ + u4 hash = 0; + + while (len--) + hash = hash * 31 + *utf16Str++; + + return hash; +} + +u4 dvmComputeStringHash(StringObject* strObj) { + int hashCode = dvmGetFieldInt(strObj, STRING_FIELDOFF_HASHCODE); + if (hashCode != 0) { + return hashCode; + } + int len = dvmGetFieldInt(strObj, STRING_FIELDOFF_COUNT); + int offset = dvmGetFieldInt(strObj, STRING_FIELDOFF_OFFSET); + ArrayObject* chars = + (ArrayObject*) dvmGetFieldObject(strObj, STRING_FIELDOFF_VALUE); + hashCode = computeUtf16Hash((u2*)(void*)chars->contents + offset, len); + dvmSetFieldInt(strObj, STRING_FIELDOFF_HASHCODE, hashCode); + return hashCode; +} + +StringObject* dvmCreateStringFromCstr(const char* utf8Str) { + assert(utf8Str != NULL); + return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str)); +} + +StringObject* dvmCreateStringFromCstr(const std::string& utf8Str) { + return dvmCreateStringFromCstr(utf8Str.c_str()); +} + +/* + * Create a java/lang/String from a C string, given its UTF-16 length + * (number of UTF-16 code points). + * + * The caller must call dvmReleaseTrackedAlloc() on the return value. + * + * Returns NULL and throws an exception on failure. + */ +StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str, + size_t utf16Length) +{ + assert(utf8Str != NULL); + + ArrayObject* chars; + StringObject* newObj = makeStringObject(utf16Length, &chars); + if (newObj == NULL) { + return NULL; + } + + dvmConvertUtf8ToUtf16((u2*)(void*)chars->contents, utf8Str); + + u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, utf16Length); + dvmSetFieldInt((Object*) newObj, STRING_FIELDOFF_HASHCODE, hashCode); + + return newObj; +} + +/* + * Create a new java/lang/String object, using the given Unicode data. + */ +StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len) +{ + /* We allow a NULL pointer if the length is zero. */ + assert(len == 0 || unichars != NULL); + + ArrayObject* chars; + StringObject* newObj = makeStringObject(len, &chars); + if (newObj == NULL) { + return NULL; + } + + if (len > 0) memcpy(chars->contents, unichars, len * sizeof(u2)); + + u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, len); + dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode); + + return newObj; +} + +/* + * Create a new C string from a java/lang/String object. + * + * Returns NULL if the object is NULL. + */ +char* dvmCreateCstrFromString(const StringObject* jstr) +{ + assert(gDvm.classJavaLangString != NULL); + if (jstr == NULL) { + return NULL; + } + + int len = dvmGetFieldInt(jstr, STRING_FIELDOFF_COUNT); + int offset = dvmGetFieldInt(jstr, STRING_FIELDOFF_OFFSET); + ArrayObject* chars = + (ArrayObject*) dvmGetFieldObject(jstr, STRING_FIELDOFF_VALUE); + const u2* data = (const u2*)(void*)chars->contents + offset; + assert(offset + len <= (int) chars->length); + + int byteLen = utf16_utf8ByteLen(data, len); + char* newStr = (char*) malloc(byteLen+1); + if (newStr == NULL) { + return NULL; + } + convertUtf16ToUtf8(newStr, data, len); + + return newStr; +} + +void dvmGetStringUtfRegion(const StringObject* jstr, + int start, int len, char* buf) +{ + const u2* data = jstr->chars() + start; + convertUtf16ToUtf8(buf, data, len); +} + +int StringObject::utfLength() const +{ + assert(gDvm.classJavaLangString != NULL); + + int len = dvmGetFieldInt(this, STRING_FIELDOFF_COUNT); + int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET); + ArrayObject* chars = + (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE); + const u2* data = (const u2*)(void*)chars->contents + offset; + assert(offset + len <= (int) chars->length); + + return utf16_utf8ByteLen(data, len); +} + +int StringObject::length() const +{ + return dvmGetFieldInt(this, STRING_FIELDOFF_COUNT); +} + +ArrayObject* StringObject::array() const +{ + return (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE); +} + +const u2* StringObject::chars() const +{ + int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET); + ArrayObject* chars = + (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE); + return (const u2*)(void*)chars->contents + offset; +} + + +/* + * Compare two String objects. + * + * This is a dvmHashTableLookup() callback. The function has already + * compared their hash values; we need to do a full compare to ensure + * that the strings really match. + */ +int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2) +{ + const StringObject* strObj1 = (const StringObject*) vstrObj1; + const StringObject* strObj2 = (const StringObject*) vstrObj2; + + assert(gDvm.classJavaLangString != NULL); + + /* get offset and length into char array; all values are in 16-bit units */ + int len1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_COUNT); + int offset1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_OFFSET); + int len2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_COUNT); + int offset2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_OFFSET); + if (len1 != len2) { + return len1 - len2; + } + + ArrayObject* chars1 = + (ArrayObject*) dvmGetFieldObject(strObj1, STRING_FIELDOFF_VALUE); + ArrayObject* chars2 = + (ArrayObject*) dvmGetFieldObject(strObj2, STRING_FIELDOFF_VALUE); + + /* damage here actually indicates a broken java/lang/String */ + assert(offset1 + len1 <= (int) chars1->length); + assert(offset2 + len2 <= (int) chars2->length); + + return memcmp((const u2*)(void*)chars1->contents + offset1, + (const u2*)(void*)chars2->contents + offset2, + len1 * sizeof(u2)); +} + +ArrayObject* dvmCreateStringArray(const std::vector<std::string>& strings) { + Thread* self = dvmThreadSelf(); + + // Allocate an array to hold the String objects. + ClassObject* elementClass = dvmFindArrayClassForElement(gDvm.classJavaLangString); + ArrayObject* stringArray = dvmAllocArrayByClass(elementClass, strings.size(), ALLOC_DEFAULT); + if (stringArray == NULL) { + // Probably OOM. + assert(dvmCheckException(self)); + return NULL; + } + + // Create the individual String objects and add them to the array. + for (size_t i = 0; i < strings.size(); i++) { + Object* str = (Object*) dvmCreateStringFromCstr(strings[i]); + if (str == NULL) { + // Probably OOM; drop out now. + assert(dvmCheckException(self)); + dvmReleaseTrackedAlloc((Object*) stringArray, self); + return NULL; + } + dvmSetObjectArrayElement(stringArray, i, str); + /* stored in tracked array, okay to release */ + dvmReleaseTrackedAlloc(str, self); + } + + return stringArray; +} |