summaryrefslogtreecommitdiff
path: root/vm/UtfString.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'vm/UtfString.cpp')
-rw-r--r--vm/UtfString.cpp412
1 files changed, 412 insertions, 0 deletions
diff --git a/vm/UtfString.cpp b/vm/UtfString.cpp
new file mode 100644
index 0000000..63d116e
--- /dev/null
+++ b/vm/UtfString.cpp
@@ -0,0 +1,412 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * UTF-8 and Unicode string manipulation, plus java/lang/String convenience
+ * functions.
+ *
+ * In most cases we populate the fields in the String object directly,
+ * rather than going through an instance field lookup.
+ */
+#include "Dalvik.h"
+#include <stdlib.h>
+
+/*
+ * Build a java/lang/String instance backed by a freshly allocated char[]
+ * holding "charsLength" elements. The String class is initialized here
+ * if that has not happened yet; failure to initialize it aborts the VM.
+ *
+ * On success the count and value fields of the new String are populated
+ * (offset and hashCode are not written by this function), the char[] is
+ * stored through "pChars", and the String is returned. The caller must
+ * later call dvmReleaseTrackedAlloc() on the returned object.
+ *
+ * Returns NULL if either allocation fails; "*pChars" is not written in
+ * that case.
+ */
+static StringObject* makeStringObject(u4 charsLength, ArrayObject** pChars)
+{
+    /*
+     * The String class must already have been found (though it need not
+     * be initialized). Asserting this makes class-setup ordering
+     * regressions easy to spot.
+     */
+    assert(gDvm.classJavaLangString != NULL);
+
+    /* First use: initialize the class, or give up entirely. */
+    if (!dvmIsClassInitialized(gDvm.classJavaLangString) &&
+        !dvmInitClass(gDvm.classJavaLangString))
+    {
+        LOGE("FATAL: Could not initialize class String");
+        dvmAbort();
+    }
+
+    Object* strObj = dvmAllocObject(gDvm.classJavaLangString, ALLOC_DEFAULT);
+    if (strObj == NULL) {
+        return NULL;
+    }
+
+    ArrayObject* charArray =
+        dvmAllocPrimitiveArray('C', charsLength, ALLOC_DEFAULT);
+    if (charArray != NULL) {
+        dvmSetFieldInt(strObj, STRING_FIELDOFF_COUNT, charsLength);
+        dvmSetFieldObject(strObj, STRING_FIELDOFF_VALUE, (Object*) charArray);
+        /* The String now refers to the array, so drop the array's own tracking. */
+        dvmReleaseTrackedAlloc((Object*) charArray, NULL);
+
+        *pChars = charArray;
+        return (StringObject*) strObj;
+    }
+
+    /* No backing array: undo the String allocation's tracking and fail. */
+    dvmReleaseTrackedAlloc(strObj, NULL);
+    return NULL;
+}
+
+/*
+ * Hash a NUL-terminated UTF-8 string for use with internal hash tables.
+ *
+ * The hash is computed over the raw bytes, starting from 1, so it is
+ * deliberately *not* interchangeable with the java/lang/String
+ * computeHashCode() result.
+ *
+ * Hashing the decoded UTF-16 values (via dexGetUtf16FromUtf8()) would be
+ * more correct, since two different encodings of the same character would
+ * then hash identically. That isn't needed for our purposes.
+ */
+u4 dvmComputeUtf8Hash(const char* utf8Str)
+{
+    u4 hash = 1;
+
+    for (const char* cursor = utf8Str; *cursor != '\0'; cursor++) {
+        hash = hash * 31 + *cursor;
+    }
+
+    return hash;
+}
+
+/*
+ * Like "strlen", but for strings encoded with "modified" UTF-8.
+ *
+ * The value returned is the number of characters (one per encoded
+ * sequence), which may or may not be the same as the number of bytes.
+ *
+ * A multi-byte sequence that is cut short by the terminating NUL is
+ * counted as one character and ends the scan, so the terminator is never
+ * stepped over.
+ *
+ * (If this needs optimizing, try: mask against 0xa0, shift right 5,
+ * get increment {1-3} from table of 8 values.)
+ */
+size_t dvmUtf8Len(const char* utf8Str)
+{
+    size_t len = 0;
+    int ic;
+
+    while ((ic = *utf8Str++) != '\0') {
+        len++;
+        if ((ic & 0x80) == 0) {
+            /* one-byte encoding */
+            continue;
+        }
+
+        /* two-byte encodings have one trailing byte, three-byte have two */
+        int trailing = ((ic & 0x20) != 0) ? 2 : 1;
+        while (trailing-- > 0) {
+            if (*utf8Str == '\0') {
+                /* truncated sequence: don't walk past the terminator */
+                return len;
+            }
+            utf8Str++;
+        }
+    }
+
+    return len;
+}
+
+/*
+ * Decode a NUL-terminated "modified" UTF-8 string into UTF-16.
+ *
+ * One u2 is stored per dexGetUtf16FromUtf8() call until the input's
+ * terminator is reached. No terminator is written to the output and no
+ * output bound is checked here, so "utf16Str" must already be big enough.
+ */
+void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str)
+{
+    u2* out = utf16Str;
+    const char* in = utf8Str;
+
+    while (*in != '\0') {
+        /* the decoder advances "in" past the sequence it consumed */
+        *out = dexGetUtf16FromUtf8(&in);
+        out++;
+    }
+}
+
+/*
+ * Given a UTF-16 string of "len" code units, compute the length in bytes
+ * of the corresponding "modified" UTF-8 string (terminator not included).
+ *
+ * Per code unit: 1 byte for 0x0001-0x007f, 2 bytes for 0x0000 and
+ * 0x0080-0x07ff, 3 bytes for everything above 0x07ff.
+ *
+ * "len" must not be negative. That is asserted, and the loop is bounded
+ * so a negative value yields 0 instead of a runaway read when asserts
+ * are compiled out.
+ */
+static int utf16_utf8ByteLen(const u2* utf16Str, int len)
+{
+    assert(len >= 0);
+
+    int utf8Len = 0;
+
+    for (int i = 0; i < len; i++) {
+        unsigned int uic = utf16Str[i];
+
+        if (uic != 0 && uic <= 0x7f) {
+            /* the most common case */
+            utf8Len += 1;
+        } else if (uic <= 0x07ff) {
+            /* includes uic == 0, which gets the two-byte form */
+            utf8Len += 2;
+        } else {
+            utf8Len += 3;
+        }
+    }
+    return utf8Len;
+}
+
+/*
+ * Convert "len" UTF-16 code units to "modified" UTF-8 and NUL-terminate
+ * the result.
+ *
+ * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen()
+ * plus one byte for the terminator, not just "len".
+ *
+ * "len" must not be negative. That is asserted, and the loop is bounded
+ * so a negative value writes only the terminator instead of running away
+ * when asserts are compiled out.
+ */
+static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len)
+{
+    assert(len >= 0);
+
+    for (int i = 0; i < len; i++) {
+        unsigned int uic = utf16Str[i];
+
+        if (uic != 0 && uic <= 0x7f) {
+            /* the most common case: a single byte */
+            *utf8Str++ = uic;
+        } else if (uic <= 0x07ff) {
+            /* two bytes; uic == 0 also lands here */
+            *utf8Str++ = (uic >> 6) | 0xc0;
+            *utf8Str++ = (uic & 0x3f) | 0x80;
+        } else {
+            /* three bytes */
+            *utf8Str++ = (uic >> 12) | 0xe0;
+            *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80;
+            *utf8Str++ = (uic & 0x3f) | 0x80;
+        }
+    }
+
+    *utf8Str = '\0';
+}
+
+/*
+ * Hash "len" UTF-16 code units with the java/lang/String.computeHashCode()
+ * algorithm: start at 0 and fold each unit in as hash * 31 + unit.
+ */
+static inline u4 computeUtf16Hash(const u2* utf16Str, size_t len)
+{
+    u4 hash = 0;
+
+    for (size_t i = 0; i < len; i++) {
+        hash = hash * 31 + utf16Str[i];
+    }
+
+    return hash;
+}
+
+/*
+ * Return the hash code of a String object.
+ *
+ * A non-zero value in the hashCode field is returned as-is. Otherwise the
+ * hash is computed over the string's characters, stored in the hashCode
+ * field, and returned.
+ */
+u4 dvmComputeStringHash(StringObject* strObj) {
+    int cached = dvmGetFieldInt(strObj, STRING_FIELDOFF_HASHCODE);
+    if (cached != 0) {
+        return cached;
+    }
+
+    /* Locate the character data: count units starting at offset. */
+    int count = dvmGetFieldInt(strObj, STRING_FIELDOFF_COUNT);
+    int start = dvmGetFieldInt(strObj, STRING_FIELDOFF_OFFSET);
+    ArrayObject* charArray =
+        (ArrayObject*) dvmGetFieldObject(strObj, STRING_FIELDOFF_VALUE);
+    u2* data = (u2*)(void*)charArray->contents + start;
+
+    int computed = computeUtf16Hash(data, count);
+    dvmSetFieldInt(strObj, STRING_FIELDOFF_HASHCODE, computed);
+    return computed;
+}
+
+/*
+ * Create a java/lang/String from a NUL-terminated "modified" UTF-8 string.
+ * The UTF-16 length is measured with dvmUtf8Len() and the work is handed
+ * to dvmCreateStringFromCstrAndLength().
+ */
+StringObject* dvmCreateStringFromCstr(const char* utf8Str) {
+    assert(utf8Str != NULL);
+
+    size_t utf16Length = dvmUtf8Len(utf8Str);
+    return dvmCreateStringFromCstrAndLength(utf8Str, utf16Length);
+}
+
+/*
+ * std::string convenience overload: forwards the string's c_str() to the
+ * const char* version.
+ */
+StringObject* dvmCreateStringFromCstr(const std::string& utf8Str) {
+    const char* cstr = utf8Str.c_str();
+    return dvmCreateStringFromCstr(cstr);
+}
+
+/*
+ * Create a java/lang/String from a C string, given its UTF-16 length
+ * (number of UTF-16 code units).
+ *
+ * "utf16Length" sizes the backing char[], but the conversion itself runs
+ * until the NUL terminator of "utf8Str", so the two must agree.
+ *
+ * The hash code is computed up front and stored in the new object.
+ *
+ * The caller must call dvmReleaseTrackedAlloc() on the return value.
+ *
+ * Returns NULL and throws an exception on failure.
+ */
+StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str,
+    size_t utf16Length)
+{
+    assert(utf8Str != NULL);
+
+    ArrayObject* charArray;
+    StringObject* str = makeStringObject(utf16Length, &charArray);
+
+    if (str != NULL) {
+        u2* dest = (u2*)(void*)charArray->contents;
+
+        dvmConvertUtf8ToUtf16(dest, utf8Str);
+
+        u4 hashCode = computeUtf16Hash(dest, utf16Length);
+        dvmSetFieldInt((Object*) str, STRING_FIELDOFF_HASHCODE, hashCode);
+    }
+
+    return str;
+}
+
+/*
+ * Create a new java/lang/String object holding a copy of "len" UTF-16
+ * code units from "unichars". The hash code is computed and stored in
+ * the new object.
+ *
+ * Returns NULL if the String could not be allocated.
+ */
+StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len)
+{
+    /* A NULL source is acceptable only for an empty string. */
+    assert(len == 0 || unichars != NULL);
+
+    ArrayObject* charArray;
+    StringObject* str = makeStringObject(len, &charArray);
+
+    if (str != NULL) {
+        if (len > 0) {
+            memcpy(charArray->contents, unichars, len * sizeof(u2));
+        }
+
+        u4 hashCode = computeUtf16Hash((u2*)(void*)charArray->contents, len);
+        dvmSetFieldInt((Object*)str, STRING_FIELDOFF_HASHCODE, hashCode);
+    }
+
+    return str;
+}
+
+/*
+ * Create a new NUL-terminated "modified" UTF-8 C string from a
+ * java/lang/String object.
+ *
+ * The buffer is obtained from malloc(), so it is presumably the caller's
+ * job to free() it.
+ *
+ * Returns NULL if the object is NULL or if the allocation fails.
+ */
+char* dvmCreateCstrFromString(const StringObject* jstr)
+{
+    assert(gDvm.classJavaLangString != NULL);
+    if (jstr == NULL) {
+        return NULL;
+    }
+
+    /* Find the string's slice of its backing char[]. */
+    int count = dvmGetFieldInt(jstr, STRING_FIELDOFF_COUNT);
+    int start = dvmGetFieldInt(jstr, STRING_FIELDOFF_OFFSET);
+    ArrayObject* charArray =
+        (ArrayObject*) dvmGetFieldObject(jstr, STRING_FIELDOFF_VALUE);
+    const u2* src = (const u2*)(void*)charArray->contents + start;
+    assert(start + count <= (int) charArray->length);
+
+    /* Size for the encoded bytes plus the terminator, then convert. */
+    int byteLen = utf16_utf8ByteLen(src, count);
+    char* cstr = (char*) malloc(byteLen+1);
+    if (cstr != NULL) {
+        convertUtf16ToUtf8(cstr, src, count);
+    }
+
+    return cstr;
+}
+
+/*
+ * Convert "len" UTF-16 code units of "jstr", beginning at index "start",
+ * to NUL-terminated "modified" UTF-8 in "buf".
+ *
+ * Neither the region nor the capacity of "buf" is checked here; both are
+ * up to the caller.
+ */
+void dvmGetStringUtfRegion(const StringObject* jstr,
+    int start, int len, char* buf)
+{
+    convertUtf16ToUtf8(buf, jstr->chars() + start, len);
+}
+
+/*
+ * Return the number of bytes this string occupies when encoded as
+ * "modified" UTF-8, not counting a terminator.
+ */
+int StringObject::utfLength() const
+{
+    assert(gDvm.classJavaLangString != NULL);
+
+    int count = dvmGetFieldInt(this, STRING_FIELDOFF_COUNT);
+    int start = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET);
+    ArrayObject* charArray =
+        (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
+    const u2* src = (const u2*)(void*)charArray->contents + start;
+
+    /* the string's slice must lie inside its backing array */
+    assert(start + count <= (int) charArray->length);
+
+    int byteLen = utf16_utf8ByteLen(src, count);
+    return byteLen;
+}
+
+/*
+ * Return the value of the String's count field.
+ */
+int StringObject::length() const
+{
+    int count = dvmGetFieldInt(this, STRING_FIELDOFF_COUNT);
+    return count;
+}
+
+/*
+ * Return the object stored in the String's value field, as an array.
+ */
+ArrayObject* StringObject::array() const
+{
+    Object* value = dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
+    return (ArrayObject*) value;
+}
+
+/*
+ * Return a pointer to the first UTF-16 code unit of this string, i.e. the
+ * backing array's contents advanced by the String's offset field.
+ */
+const u2* StringObject::chars() const
+{
+    int start = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET);
+    ArrayObject* charArray =
+        (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
+    const u2* base = (const u2*)(void*)charArray->contents;
+    return base + start;
+}
+
+
+/*
+ * Compare two String objects.
+ *
+ * This is a dvmHashTableLookup() callback. The hash values have already
+ * been compared by the caller; a full comparison is still needed to be
+ * sure the strings really match.
+ *
+ * Returns the difference in lengths if they differ, otherwise the
+ * memcmp() result over the character data (zero when equal).
+ */
+int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2)
+{
+    const StringObject* lhs = (const StringObject*) vstrObj1;
+    const StringObject* rhs = (const StringObject*) vstrObj2;
+
+    assert(gDvm.classJavaLangString != NULL);
+
+    /* offsets and lengths into the char arrays, all in 16-bit units */
+    int lhsLen = dvmGetFieldInt(lhs, STRING_FIELDOFF_COUNT);
+    int lhsOffset = dvmGetFieldInt(lhs, STRING_FIELDOFF_OFFSET);
+    int rhsLen = dvmGetFieldInt(rhs, STRING_FIELDOFF_COUNT);
+    int rhsOffset = dvmGetFieldInt(rhs, STRING_FIELDOFF_OFFSET);
+
+    /* strings of different length can't match; skip the data compare */
+    if (lhsLen != rhsLen) {
+        return lhsLen - rhsLen;
+    }
+
+    ArrayObject* lhsChars =
+        (ArrayObject*) dvmGetFieldObject(lhs, STRING_FIELDOFF_VALUE);
+    ArrayObject* rhsChars =
+        (ArrayObject*) dvmGetFieldObject(rhs, STRING_FIELDOFF_VALUE);
+
+    /* a failure here actually indicates a broken java/lang/String */
+    assert(lhsOffset + lhsLen <= (int) lhsChars->length);
+    assert(rhsOffset + rhsLen <= (int) rhsChars->length);
+
+    const u2* lhsData = (const u2*)(void*)lhsChars->contents + lhsOffset;
+    const u2* rhsData = (const u2*)(void*)rhsChars->contents + rhsOffset;
+
+    return memcmp(lhsData, rhsData, lhsLen * sizeof(u2));
+}
+
+/*
+ * Create an array of java/lang/String objects, one per entry of "strings",
+ * in the same order.
+ *
+ * Returns the new array, or NULL if the array or any of the strings could
+ * not be allocated. On the failure paths an exception is expected to be
+ * pending (asserted). The array is released with dvmReleaseTrackedAlloc()
+ * before returning NULL; the success path does not release it.
+ */
+ArrayObject* dvmCreateStringArray(const std::vector<std::string>& strings) {
+    Thread* self = dvmThreadSelf();
+    const size_t count = strings.size();
+
+    // Allocate an array to hold the String objects.
+    // NOTE(review): the class lookup result is passed on unchecked --
+    // confirm dvmAllocArrayByClass() tolerates a failed lookup.
+    ClassObject* arrayClass = dvmFindArrayClassForElement(gDvm.classJavaLangString);
+    ArrayObject* result = dvmAllocArrayByClass(arrayClass, count, ALLOC_DEFAULT);
+    if (result == NULL) {
+        // Probably OOM.
+        assert(dvmCheckException(self));
+        return NULL;
+    }
+
+    // Create the individual String objects and add them to the array.
+    for (size_t idx = 0; idx < count; idx++) {
+        Object* element = (Object*) dvmCreateStringFromCstr(strings[idx]);
+        if (element == NULL) {
+            // Probably OOM; drop out now.
+            assert(dvmCheckException(self));
+            dvmReleaseTrackedAlloc((Object*) result, self);
+            return NULL;
+        }
+
+        dvmSetObjectArrayElement(result, idx, element);
+        /* stored in tracked array, okay to release */
+        dvmReleaseTrackedAlloc(element, self);
+    }
+
+    return result;
+}