1 files changed, 412 insertions, 0 deletions
diff --git a/vm/UtfString.cpp b/vm/UtfString.cpp
new file mode 100644
index 0000000..63d116e
--- /dev/null
+++ b/vm/UtfString.cpp
@@ -0,0 +1,412 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * UTF-8 and Unicode string manipulation, plus java/lang/String convenience
+ * functions.
+ *
+ * In most cases we populate the fields in the String object directly,
+ * rather than going through an instance field lookup.
+ */
+#include "Dalvik.h"
+#include <stdlib.h>
+
+/*
+ * Allocate a new instance of the class String, performing first-use
+ * initialization of the class if necessary. Upon success, the
+ * returned value will have all its fields except hashCode already
+ * filled in, including a reference to a newly-allocated char[] for
+ * the contents, sized as given. Additionally, a reference to the
+ * chars array is stored to the pChars pointer. Callers must
+ * subsequently call dvmReleaseTrackedAlloc() on the result pointer.
+ * This function returns NULL on failure.
+ */
+static StringObject* makeStringObject(u4 charsLength, ArrayObject** pChars)
+{
+    /*
+     * The String class should have already gotten found (but not
+     * necessarily initialized) before making it here. We assert it
+     * explicitly, since historically speaking, we have had bugs with
+     * regard to when the class String gets set up. The assert helps
+     * make any regressions easier to diagnose.
+     */
+    assert(gDvm.classJavaLangString != NULL);
+
+    if (!dvmIsClassInitialized(gDvm.classJavaLangString)) {
+        /* Perform first-time use initialization of the class. */
+        if (!dvmInitClass(gDvm.classJavaLangString)) {
+            LOGE("FATAL: Could not initialize class String");
+            dvmAbort();
+        }
+    }
+
+    Object* result = dvmAllocObject(gDvm.classJavaLangString, ALLOC_DEFAULT);
+    if (result == NULL) {
+        return NULL;
+    }
+
+    ArrayObject* chars = dvmAllocPrimitiveArray('C', charsLength, ALLOC_DEFAULT);
+    if (chars == NULL) {
+        dvmReleaseTrackedAlloc(result, NULL);
+        return NULL;
+    }
+
+    dvmSetFieldInt(result, STRING_FIELDOFF_COUNT, charsLength);
+    dvmSetFieldObject(result, STRING_FIELDOFF_VALUE, (Object*) chars);
+    dvmReleaseTrackedAlloc((Object*) chars, NULL);
+    /* Leave offset and hashCode set to zero. */
+
+    *pChars = chars;
+    return (StringObject*) result;
+}
+
+/*
+ * Compute a hash code on a UTF-8 string, for use with internal hash tables.
+ *
+ * This may or may not yield the same results as the java/lang/String
+ * computeHashCode() function.  (To make sure this doesn't get abused,
+ * I'm initializing the hash code to 1 so they *don't* match up.)
+ *
+ * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute
+ * the hash with the result.  That way, if something encoded the same
+ * character in two different ways, the hash value would be the same.  For
+ * our purposes that isn't necessary.
+ */
+u4 dvmComputeUtf8Hash(const char* utf8Str)
+{
+    u4 hash = 1;
+
+    while (*utf8Str != '\0')
+        hash = hash * 31 + *utf8Str++;
+
+    return hash;
+}
+
+/*
+ * Like "strlen", but for strings encoded with "modified" UTF-8.
+ *
+ * The value returned is the number of characters, which may or may not
+ * be the same as the number of bytes.
+ *
+ * (If this needs optimizing, try: mask against 0xa0, shift right 5,
+ * get increment {1-3} from table of 8 values.)
+ */
+size_t dvmUtf8Len(const char* utf8Str)
+{
+    size_t len = 0;
+    int ic;
+
+    while ((ic = *utf8Str++) != '\0') {
+        len++;
+        if ((ic & 0x80) != 0) {
+            /* two- or three-byte encoding */
+            utf8Str++;
+            if ((ic & 0x20) != 0) {
+                /* three-byte encoding */
+                utf8Str++;
+            }
+        }
+    }
+
+    return len;
+}
+
+/*
+ * Convert a "modified" UTF-8 string to UTF-16.
+ */
+void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str)
+{
+    while (*utf8Str != '\0')
+        *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str);
+}
+
+/*
+ * Given a UTF-16 string, compute the length of the corresponding UTF-8
+ * string in bytes.
+ */
+static int utf16_utf8ByteLen(const u2* utf16Str, int len)
+{
+    int utf8Len = 0;
+
+    while (len--) {
+        unsigned int uic = *utf16Str++;
+
+        /*
+         * The most common case is (uic > 0 && uic <= 0x7f).
+         */
+        if (uic == 0 || uic > 0x7f) {
+            if (uic > 0x07ff)
+                utf8Len += 3;
+            else /*(uic > 0x7f || uic == 0) */
+                utf8Len += 2;
+        } else
+            utf8Len++;
+    }
+    return utf8Len;
+}
+
+/*
+ * Convert a UTF-16 string to UTF-8.
+ *
+ * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(),
+ * not just "len".
+ */
+static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len)
+{
+    assert(len >= 0);
+
+    while (len--) {
+        unsigned int uic = *utf16Str++;
+
+        /*
+         * The most common case is (uic > 0 && uic <= 0x7f).
+         */
+        if (uic == 0 || uic > 0x7f) {
+            if (uic > 0x07ff) {
+                *utf8Str++ = (uic >> 12) | 0xe0;
+                *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80;
+                *utf8Str++ = (uic & 0x3f) | 0x80;
+            } else /*(uic > 0x7f || uic == 0)*/ {
+                *utf8Str++ = (uic >> 6) | 0xc0;
+                *utf8Str++ = (uic & 0x3f) | 0x80;
+            }
+        } else {
+            *utf8Str++ = uic;
+        }
+    }
+
+    *utf8Str = '\0';
+}
+
+/*
+ * Use the java/lang/String.computeHashCode() algorithm.
+ */
+static inline u4 computeUtf16Hash(const u2* utf16Str, size_t len)
+{
+    u4 hash = 0;
+
+    while (len--)
+        hash = hash * 31 + *utf16Str++;
+
+    return hash;
+}
+
+u4 dvmComputeStringHash(StringObject* strObj) {
+    int hashCode = dvmGetFieldInt(strObj, STRING_FIELDOFF_HASHCODE);
+    if (hashCode != 0) {
+      return hashCode;
+    }
+    int len = dvmGetFieldInt(strObj, STRING_FIELDOFF_COUNT);
+    int offset = dvmGetFieldInt(strObj, STRING_FIELDOFF_OFFSET);
+    ArrayObject* chars =
+            (ArrayObject*) dvmGetFieldObject(strObj, STRING_FIELDOFF_VALUE);
+    hashCode = computeUtf16Hash((u2*)(void*)chars->contents + offset, len);
+    dvmSetFieldInt(strObj, STRING_FIELDOFF_HASHCODE, hashCode);
+    return hashCode;
+}
+
+StringObject* dvmCreateStringFromCstr(const char* utf8Str) {
+    assert(utf8Str != NULL);
+    return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str));
+}
+
+StringObject* dvmCreateStringFromCstr(const std::string& utf8Str) {
+    return dvmCreateStringFromCstr(utf8Str.c_str());
+}
+
+/*
+ * Create a java/lang/String from a C string, given its UTF-16 length
+ * (number of UTF-16 code points).
+ *
+ * The caller must call dvmReleaseTrackedAlloc() on the return value.
+ *
+ * Returns NULL and throws an exception on failure.
+ */
+StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str,
+    size_t utf16Length)
+{
+    assert(utf8Str != NULL);
+
+    ArrayObject* chars;
+    StringObject* newObj = makeStringObject(utf16Length, &chars);
+    if (newObj == NULL) {
+        return NULL;
+    }
+
+    dvmConvertUtf8ToUtf16((u2*)(void*)chars->contents, utf8Str);
+
+    u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, utf16Length);
+    dvmSetFieldInt((Object*) newObj, STRING_FIELDOFF_HASHCODE, hashCode);
+
+    return newObj;
+}
+
+/*
+ * Create a new java/lang/String object, using the given Unicode data.
+ */
+StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len)
+{
+    /* We allow a NULL pointer if the length is zero. */
+    assert(len == 0 || unichars != NULL);
+
+    ArrayObject* chars;
+    StringObject* newObj = makeStringObject(len, &chars);
+    if (newObj == NULL) {
+        return NULL;
+    }
+
+    if (len > 0) memcpy(chars->contents, unichars, len * sizeof(u2));
+
+    u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, len);
+    dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
+
+    return newObj;
+}
+
+/*
+ * Create a new C string from a java/lang/String object.
+ *
+ * Returns NULL if the object is NULL.
+ */
+char* dvmCreateCstrFromString(const StringObject* jstr)
+{
+    assert(gDvm.classJavaLangString != NULL);
+    if (jstr == NULL) {
+        return NULL;
+    }
+
+    int len = dvmGetFieldInt(jstr, STRING_FIELDOFF_COUNT);
+    int offset = dvmGetFieldInt(jstr, STRING_FIELDOFF_OFFSET);
+    ArrayObject* chars =
+            (ArrayObject*) dvmGetFieldObject(jstr, STRING_FIELDOFF_VALUE);
+    const u2* data = (const u2*)(void*)chars->contents + offset;
+    assert(offset + len <= (int) chars->length);
+
+    int byteLen = utf16_utf8ByteLen(data, len);
+    char* newStr = (char*) malloc(byteLen+1);
+    if (newStr == NULL) {
+        return NULL;
+    }
+    convertUtf16ToUtf8(newStr, data, len);
+
+    return newStr;
+}
+
+void dvmGetStringUtfRegion(const StringObject* jstr,
+        int start, int len, char* buf)
+{
+    const u2* data = jstr->chars() + start;
+    convertUtf16ToUtf8(buf, data, len);
+}
+
+int StringObject::utfLength() const
+{
+    assert(gDvm.classJavaLangString != NULL);
+
+    int len = dvmGetFieldInt(this, STRING_FIELDOFF_COUNT);
+    int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET);
+    ArrayObject* chars =
+            (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
+    const u2* data = (const u2*)(void*)chars->contents + offset;
+    assert(offset + len <= (int) chars->length);
+
+    return utf16_utf8ByteLen(data, len);
+}
+
+int StringObject::length() const
+{
+    return dvmGetFieldInt(this, STRING_FIELDOFF_COUNT);
+}
+
+ArrayObject* StringObject::array() const
+{
+    return (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
+}
+
+const u2* StringObject::chars() const
+{
+    int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET);
+    ArrayObject* chars =
+            (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
+    return (const u2*)(void*)chars->contents + offset;
+}
+
+
+/*
+ * Compare two String objects.
+ *
+ * This is a dvmHashTableLookup() callback.  The function has already
+ * compared their hash values; we need to do a full compare to ensure
+ * that the strings really match.
+ */
+int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2)
+{
+    const StringObject* strObj1 = (const StringObject*) vstrObj1;
+    const StringObject* strObj2 = (const StringObject*) vstrObj2;
+
+    assert(gDvm.classJavaLangString != NULL);
+
+    /* get offset and length into char array; all values are in 16-bit units */
+    int len1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_COUNT);
+    int offset1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_OFFSET);
+    int len2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_COUNT);
+    int offset2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_OFFSET);
+    if (len1 != len2) {
+        return len1 - len2;
+    }
+
+    ArrayObject* chars1 =
+            (ArrayObject*) dvmGetFieldObject(strObj1, STRING_FIELDOFF_VALUE);
+    ArrayObject* chars2 =
+            (ArrayObject*) dvmGetFieldObject(strObj2, STRING_FIELDOFF_VALUE);
+
+    /* damage here actually indicates a broken java/lang/String */
+    assert(offset1 + len1 <= (int) chars1->length);
+    assert(offset2 + len2 <= (int) chars2->length);
+
+    return memcmp((const u2*)(void*)chars1->contents + offset1,
+                  (const u2*)(void*)chars2->contents + offset2,
+                  len1 * sizeof(u2));
+}
+
+ArrayObject* dvmCreateStringArray(const std::vector<std::string>& strings) {
+    Thread* self = dvmThreadSelf();
+
+    // Allocate an array to hold the String objects.
+    ClassObject* elementClass = dvmFindArrayClassForElement(gDvm.classJavaLangString);
+    ArrayObject* stringArray = dvmAllocArrayByClass(elementClass, strings.size(), ALLOC_DEFAULT);
+    if (stringArray == NULL) {
+        // Probably OOM.
+        assert(dvmCheckException(self));
+        return NULL;
+    }
+
+    // Create the individual String objects and add them to the array.
+    for (size_t i = 0; i < strings.size(); i++) {
+        Object* str = (Object*) dvmCreateStringFromCstr(strings[i]);
+        if (str == NULL) {
+            // Probably OOM; drop out now.
+            assert(dvmCheckException(self));
+            dvmReleaseTrackedAlloc((Object*) stringArray, self);
+            return NULL;
+        }
+        dvmSetObjectArrayElement(stringArray, i, str);
+        /* stored in tracked array, okay to release */
+        dvmReleaseTrackedAlloc(str, self);
+    }
+
+    return stringArray;
+}