Merge "Update ICU with patch to fix Japanese alphabetic index."

author: Elliott Hughes <enh@google.com> 2013-10-11 21:28:06 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> 2013-10-11 21:28:07 +0000
commit: dad7c4850119ff4cc8cb3c4c96dca4b695730dd1 (patch)
tree: 51ccd941b72e028ae228285c649565938dd78d4c
parent: f835eb774086f1f182dab02140818d25c1f75a4a (diff)
parent: 260c3da8a8e46f15a7a433f0ad009bd805b804d4 (diff)
download: icu4c-dad7c4850119ff4cc8cb3c4c96dca4b695730dd1.tar.gz
4 files changed, 56 insertions, 23 deletions
diff --git a/i18n/alphaindex.cpp b/i18n/alphaindex.cpp
index 88dcaabe..e80efc7a 100644
--- a/i18n/alphaindex.cpp
+++ b/i18n/alphaindex.cpp
@@ -245,7 +245,7 @@ AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorC
 
 
 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
-    addIndexExemplars(locale, status);
+    addIndexExemplars(&locale, status);
     clearBuckets();
     return *this;
 }
@@ -709,12 +709,13 @@ void AlphabeticIndex::internalResetBucketIterator() {
 }
 
 
-void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
+void AlphabeticIndex::addIndexExemplars(const Locale *locale, UErrorCode &status) {
     if (U_FAILURE(status)) { return; }
     // Chinese index characters, which are specific to each of the several Chinese tailorings,
     // take precedence over the single locale data exemplar set per language.
-    const char *language = locale.getLanguage();
-    if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
+    const char *language = locale == NULL ? NULL : locale->getLanguage();
+    if (language == NULL ||
+            uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
             uprv_strcmp(language, "ko") == 0) {
         // TODO: This should be done regardless of the language, but it's expensive.
         // We should add a Collator function (can be @internal)
@@ -723,8 +724,9 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status
             return;
         }
     }
+    if (locale == NULL) { return; }
 
-    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
+    LocalULocaleDataPointer uld(ulocdata_open(locale->getName(), &status));
     if (U_FAILURE(status)) {
         return;
     }
@@ -777,7 +779,7 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status
     while (it.next()) {
         const UnicodeString &exemplarC = it.getString();
         upperC = exemplarC;
-        upperC.toUpper(locale);
+        upperC.toUpper(*locale);
         initialLabels_->add(upperC);
     }
 }
@@ -963,22 +965,38 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
     firstCharsInScripts_ = firstStringsInScript(status);
     if (U_FAILURE(status)) { return; }
     firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
+
+    // Add index exemplar characters before checking the script boundaries,
+    // since this might modify them.
+    addIndexExemplars(locale, status);
+
     UnicodeString _4E00((UChar)0x4E00);
-    UnicodeString _1100((UChar)0x1100);
-    UnicodeString _1112((UChar)0x1112);
-    if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
-            collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
-        // The standard Korean tailoring sorts Hanja (Han characters)
-        // as secondary differences from Hangul syllables.
-        // This makes U+4E00 not useful as a Han-script boundary.
+    int32_t hanIndex = binarySearch(
+            *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
+    if (hanIndex >= 0) {
+        // Adjust the Han script boundary if necessary.
         // TODO: This becomes obsolete when the root collator gets
         // reliable script-first-primary mappings.
-        int32_t hanIndex = binarySearch(
-                *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
-        if (hanIndex >= 0) {
+        UnicodeString _1100((UChar)0x1100);
+        UnicodeString _1112((UChar)0x1112);
+        UnicodeString _4E9C((UChar)0x4E9C);
+        if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
+                collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
+            // The standard Korean tailoring sorts Hanja (Han characters)
+            // as secondary differences from Hangul syllables.
+            // This makes U+4E00 not useful as a Han-script boundary.
             firstCharsInScripts_->removeElementAt(hanIndex);
+        } else if (collatorPrimaryOnly_->compare(_4E9C, _4E00, status) < 0) {
+            // The standard Japanese tailoring sorts U+4E9C first among Kanji.
+            UnicodeString *fh = new UnicodeString(_4E9C);
+            if (fh == NULL) {
+                status = U_MEMORY_ALLOCATION_ERROR;
+                return;
+            }
+            firstCharsInScripts_->setElementAt(fh, hanIndex);
         }
     }
+
     // Guard against a degenerate collator where
     // some script boundary strings are primary ignorable.
     for (;;) {
@@ -996,10 +1014,6 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
             break;
         }
     }
-
-    if (locale != NULL) {
-        addIndexExemplars(*locale, status);
-    }
 }
 
 
diff --git a/i18n/unicode/alphaindex.h b/i18n/unicode/alphaindex.h
index 64e2f543..6f47ea10 100644
--- a/i18n/unicode/alphaindex.h
+++ b/i18n/unicode/alphaindex.h
@@ -675,7 +675,7 @@ private:
      * This method is called to get the index exemplars. Normally these come from the locale directly,
      * but if they aren't available, we have to synthesize them.
      */
-    void addIndexExemplars(const Locale &locale, UErrorCode &status);
+    void addIndexExemplars(const Locale *locale, UErrorCode &status);
     /**
      * Add Chinese index characters from the tailoring.
      */
diff --git a/test/intltest/alphaindextst.cpp b/test/intltest/alphaindextst.cpp
index 5bef31b9..ea4eeaff 100644
--- a/test/intltest/alphaindextst.cpp
+++ b/test/intltest/alphaindextst.cpp
@@ -63,6 +63,7 @@ void AlphabeticIndexTest::runIndexedTest( int32_t index, UBool exec, const char*
     // BEGIN android-remove - test to be added in 51.1
     // TESTCASE_AUTO(TestChineseZhuyin);
     // END android-remove
+    TESTCASE_AUTO(TestJapaneseKanji);
     TESTCASE_AUTO_END;
 }
 
@@ -93,7 +94,8 @@ void AlphabeticIndexTest::APITest() {
     // Constructor from a Collator
     //
     status = U_ZERO_ERROR;
-    RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getChinese(), status));
+    RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>(
+        Collator::createInstance(Locale::getGerman(), status));
     TEST_CHECK_STATUS;
     TEST_ASSERT(coll != NULL);
     index = new AlphabeticIndex(coll, status);
@@ -586,7 +588,6 @@ void AlphabeticIndexTest::TestPinyinFirst() {
     TEST_CHECK_STATUS; 
     AlphabeticIndex index(coll.orphan(), status);
     TEST_CHECK_STATUS; 
-    assertEquals("getBucketCount()", 1, index.getBucketCount(status));   // ... (underflow only)
     index.addLabels(Locale::getChinese(), status);
     assertEquals("getBucketCount()", 28, index.getBucketCount(status));  // ... A-Z ...
     int bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x897f), status);
@@ -676,4 +677,21 @@ void AlphabeticIndexTest::TestChineseZhuyin() {
     assertEquals("label 5", UnicodeString((UChar)0x3109), immIndex->getBucket(5)->getLabel());
 }
 
+void AlphabeticIndexTest::TestJapaneseKanji() {  // Checks bucket placement of Kanji in a Japanese index.
+    UErrorCode status = U_ZERO_ERROR;
+    AlphabeticIndex index(Locale::getJapanese(), status);
+    LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
+    TEST_CHECK_STATUS;  // NOTE(review): presumably bails out on failure before immIndex is used - confirm in the macro.
+    // The Japanese standard collator has no index characters for Kanji, so every
+    // sample code point below is expected to land in the overflow bucket.
+    static const UChar32 kanji[] = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 };
+    int32_t overflowIndex = immIndex->getBucketCount() - 1;  // the overflow bucket is taken to be the last one
+    for(int32_t i = 0; i < LENGTHOF(kanji); ++i) {
+        char msg[40];  // longest message built below is 34 characters plus the terminating NUL
+        sprintf(msg, "kanji[%d]=U+%04lX in overflow bucket", (int)i, (long)kanji[i]);
+        assertEquals(msg, overflowIndex, immIndex->getBucketIndex(UnicodeString(kanji[i]), status));
+        TEST_CHECK_STATUS;  // re-check status after each getBucketIndex() lookup
+    }
+}
+
 #endif
diff --git a/test/intltest/alphaindextst.h b/test/intltest/alphaindextst.h
index 2f864712..1aa00755 100644
--- a/test/intltest/alphaindextst.h
+++ b/test/intltest/alphaindextst.h
@@ -45,6 +45,7 @@ public:
      * Test with the Bopomofo-phonetic tailoring.
      */
     void TestChineseZhuyin();
+    void TestJapaneseKanji();
 };
 
 #endif
author	Elliott Hughes <enh@google.com>	2013-10-11 21:28:06 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	2013-10-11 21:28:07 +0000
commit	dad7c4850119ff4cc8cb3c4c96dca4b695730dd1 (patch)
tree	51ccd941b72e028ae228285c649565938dd78d4c
parent	f835eb774086f1f182dab02140818d25c1f75a4a (diff)
parent	260c3da8a8e46f15a7a433f0ad009bd805b804d4 (diff)
download	icu4c-dad7c4850119ff4cc8cb3c4c96dca4b695730dd1.tar.gz