diff options
author | Craig Cornelius <ccornelius@google.com> | 2013-10-10 15:18:49 -0700 |
---|---|---|
committer | Elliott Hughes <enh@google.com> | 2013-10-11 14:34:32 -0700 |
commit | 0a61a367aa48577edf1c9ba57c501b8f5e7555d5 (patch) | |
tree | 711bf9961b01c8e853eb7ebe51822664a1a7a7ee | |
parent | 8f65a294aa83d78c7f04a69227ac61f070a381f4 (diff) | |
download | icu4c-0a61a367aa48577edf1c9ba57c501b8f5e7555d5.tar.gz |
Update ICU with patch to fix Japanese alphabetic index.
Refs: android-sdk-4.4.2_r1.0.1, android-sdk-4.4.2_r1, android-cts-4.4_r4, android-cts-4.4_r1, android-4.4_r1.2.0.1, android-4.4_r1.2, android-4.4_r1.1.0.1, android-4.4_r1.1, android-4.4_r1.0.1, android-4.4_r1, android-4.4_r0.9, android-4.4.2_r2.0.1, android-4.4.2_r2, android-4.4.2_r1.0.1, android-4.4.2_r1, android-4.4.1_r1.0.1, android-4.4.1_r1, kitkat-release, kitkat-mr1.1-release, kitkat-mr1-release, kitkat-cts-release, kitkat-cts-dev
Patch from http://bugs.icu-project.org/trac/ticket/10423.
Bug: 10809397
(cherry picked from commit 260c3da8a8e46f15a7a433f0ad009bd805b804d4)
Change-Id: If514618784c1528d1072d2e3f8792bf60d6283a8
-rw-r--r-- | i18n/alphaindex.cpp | 54 | ||||
-rw-r--r-- | i18n/unicode/alphaindex.h | 2 | ||||
-rw-r--r-- | test/intltest/alphaindextst.cpp | 22 | ||||
-rw-r--r-- | test/intltest/alphaindextst.h | 1 |
4 files changed, 56 insertions, 23 deletions
diff --git a/i18n/alphaindex.cpp b/i18n/alphaindex.cpp index 88dcaabe..e80efc7a 100644 --- a/i18n/alphaindex.cpp +++ b/i18n/alphaindex.cpp @@ -245,7 +245,7 @@ AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorC AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { - addIndexExemplars(locale, status); + addIndexExemplars(&locale, status); clearBuckets(); return *this; } @@ -709,12 +709,13 @@ void AlphabeticIndex::internalResetBucketIterator() { } -void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { +void AlphabeticIndex::addIndexExemplars(const Locale *locale, UErrorCode &status) { if (U_FAILURE(status)) { return; } // Chinese index characters, which are specific to each of the several Chinese tailorings, // take precedence over the single locale data exemplar set per language. - const char *language = locale.getLanguage(); - if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || + const char *language = locale == NULL ? NULL : locale->getLanguage(); + if (language == NULL || + uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || uprv_strcmp(language, "ko") == 0) { // TODO: This should be done regardless of the language, but it's expensive. 
// We should add a Collator function (can be @internal) @@ -723,8 +724,9 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status return; } } + if (locale == NULL) { return; } - LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); + LocalULocaleDataPointer uld(ulocdata_open(locale->getName(), &status)); if (U_FAILURE(status)) { return; } @@ -777,7 +779,7 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status while (it.next()) { const UnicodeString &exemplarC = it.getString(); upperC = exemplarC; - upperC.toUpper(locale); + upperC.toUpper(*locale); initialLabels_->add(upperC); } } @@ -963,22 +965,38 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { firstCharsInScripts_ = firstStringsInScript(status); if (U_FAILURE(status)) { return; } firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); + + // Add index exemplar characters before checking the script boundaries, + // since this might modify them. + addIndexExemplars(locale, status); + UnicodeString _4E00((UChar)0x4E00); - UnicodeString _1100((UChar)0x1100); - UnicodeString _1112((UChar)0x1112); - if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && - collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { - // The standard Korean tailoring sorts Hanja (Han characters) - // as secondary differences from Hangul syllables. - // This makes U+4E00 not useful as a Han-script boundary. + int32_t hanIndex = binarySearch( + *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); + if (hanIndex >= 0) { + // Adjust the Han script boundary if necessary. // TODO: This becomes obsolete when the root collator gets // reliable script-first-primary mappings. 
- int32_t hanIndex = binarySearch( - *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); - if (hanIndex >= 0) { + UnicodeString _1100((UChar)0x1100); + UnicodeString _1112((UChar)0x1112); + UnicodeString _4E9C((UChar)0x4E9C); + if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && + collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { + // The standard Korean tailoring sorts Hanja (Han characters) + // as secondary differences from Hangul syllables. + // This makes U+4E00 not useful as a Han-script boundary. firstCharsInScripts_->removeElementAt(hanIndex); + } else if (collatorPrimaryOnly_->compare(_4E9C, _4E00, status) < 0) { + // The standard Japanese tailoring sorts U+4E9C first among Kanji. + UnicodeString *fh = new UnicodeString(_4E9C); + if (fh == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + firstCharsInScripts_->setElementAt(fh, hanIndex); } } + // Guard against a degenerate collator where // some script boundary strings are primary ignorable. for (;;) { @@ -996,10 +1014,6 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { break; } } - - if (locale != NULL) { - addIndexExemplars(*locale, status); - } } diff --git a/i18n/unicode/alphaindex.h b/i18n/unicode/alphaindex.h index 64e2f543..6f47ea10 100644 --- a/i18n/unicode/alphaindex.h +++ b/i18n/unicode/alphaindex.h @@ -675,7 +675,7 @@ private: * This method is called to get the index exemplars. Normally these come from the locale directly, * but if they aren't available, we have to synthesize them. */ - void addIndexExemplars(const Locale &locale, UErrorCode &status); + void addIndexExemplars(const Locale *locale, UErrorCode &status); /** * Add Chinese index characters from the tailoring. 
*/ diff --git a/test/intltest/alphaindextst.cpp b/test/intltest/alphaindextst.cpp index 5bef31b9..ea4eeaff 100644 --- a/test/intltest/alphaindextst.cpp +++ b/test/intltest/alphaindextst.cpp @@ -63,6 +63,7 @@ void AlphabeticIndexTest::runIndexedTest( int32_t index, UBool exec, const char* // BEGIN android-remove - test to be added in 51.1 // TESTCASE_AUTO(TestChineseZhuyin); // END android-remove + TESTCASE_AUTO(TestJapaneseKanji); TESTCASE_AUTO_END; } @@ -93,7 +94,8 @@ void AlphabeticIndexTest::APITest() { // Constructor from a Collator // status = U_ZERO_ERROR; - RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getChinese(), status)); + RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>( + Collator::createInstance(Locale::getGerman(), status)); TEST_CHECK_STATUS; TEST_ASSERT(coll != NULL); index = new AlphabeticIndex(coll, status); @@ -586,7 +588,6 @@ void AlphabeticIndexTest::TestPinyinFirst() { TEST_CHECK_STATUS; AlphabeticIndex index(coll.orphan(), status); TEST_CHECK_STATUS; - assertEquals("getBucketCount()", 1, index.getBucketCount(status)); // ... (underflow only) index.addLabels(Locale::getChinese(), status); assertEquals("getBucketCount()", 28, index.getBucketCount(status)); // ... A-Z ... int bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x897f), status); @@ -676,4 +677,21 @@ void AlphabeticIndexTest::TestChineseZhuyin() { assertEquals("label 5", UnicodeString((UChar)0x3109), immIndex->getBucket(5)->getLabel()); } +void AlphabeticIndexTest::TestJapaneseKanji() { + UErrorCode status = U_ZERO_ERROR; + AlphabeticIndex index(Locale::getJapanese(), status); + LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status)); + TEST_CHECK_STATUS; + // There are no index characters for Kanji in the Japanese standard collator. + // They should all go into the overflow bucket. 
+ static const UChar32 kanji[] = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 }; + int32_t overflowIndex = immIndex->getBucketCount() - 1; + for(int32_t i = 0; i < LENGTHOF(kanji); ++i) { + char msg[40]; + sprintf(msg, "kanji[%d]=U+%04lX in overflow bucket", (int)i, (long)kanji[i]); + assertEquals(msg, overflowIndex, immIndex->getBucketIndex(UnicodeString(kanji[i]), status)); + TEST_CHECK_STATUS; + } +} + #endif diff --git a/test/intltest/alphaindextst.h b/test/intltest/alphaindextst.h index 2f864712..1aa00755 100644 --- a/test/intltest/alphaindextst.h +++ b/test/intltest/alphaindextst.h @@ -45,6 +45,7 @@ public: * Test with the Bopomofo-phonetic tailoring. */ void TestChineseZhuyin(); + void TestJapaneseKanji(); }; #endif |