aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Elliott Hughes <enh@google.com> 2013-10-11 21:28:06 +0000
committer Gerrit Code Review <noreply-gerritcodereview@google.com> 2013-10-11 21:28:07 +0000
commit dad7c4850119ff4cc8cb3c4c96dca4b695730dd1 (patch)
tree 51ccd941b72e028ae228285c649565938dd78d4c
parent f835eb774086f1f182dab02140818d25c1f75a4a (diff)
parent 260c3da8a8e46f15a7a433f0ad009bd805b804d4 (diff)
download icu4c-dad7c4850119ff4cc8cb3c4c96dca4b695730dd1.tar.gz
Merge "Update ICU with patch to fix Japanese alphabetic index."
-rw-r--r-- i18n/alphaindex.cpp 54
-rw-r--r-- i18n/unicode/alphaindex.h 2
-rw-r--r-- test/intltest/alphaindextst.cpp 22
-rw-r--r-- test/intltest/alphaindextst.h 1
4 files changed, 56 insertions, 23 deletions
diff --git a/i18n/alphaindex.cpp b/i18n/alphaindex.cpp
index 88dcaabe..e80efc7a 100644
--- a/i18n/alphaindex.cpp
+++ b/i18n/alphaindex.cpp
@@ -245,7 +245,7 @@ AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorC
AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
- addIndexExemplars(locale, status);
+ addIndexExemplars(&locale, status);
clearBuckets();
return *this;
}
@@ -709,12 +709,13 @@ void AlphabeticIndex::internalResetBucketIterator() {
}
-void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
+void AlphabeticIndex::addIndexExemplars(const Locale *locale, UErrorCode &status) {
if (U_FAILURE(status)) { return; }
// Chinese index characters, which are specific to each of the several Chinese tailorings,
// take precedence over the single locale data exemplar set per language.
- const char *language = locale.getLanguage();
- if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
+ const char *language = locale == NULL ? NULL : locale->getLanguage();
+ if (language == NULL ||
+ uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
uprv_strcmp(language, "ko") == 0) {
// TODO: This should be done regardless of the language, but it's expensive.
// We should add a Collator function (can be @internal)
@@ -723,8 +724,9 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status
return;
}
}
+ if (locale == NULL) { return; }
- LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
+ LocalULocaleDataPointer uld(ulocdata_open(locale->getName(), &status));
if (U_FAILURE(status)) {
return;
}
@@ -777,7 +779,7 @@ void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status
while (it.next()) {
const UnicodeString &exemplarC = it.getString();
upperC = exemplarC;
- upperC.toUpper(locale);
+ upperC.toUpper(*locale);
initialLabels_->add(upperC);
}
}
@@ -963,22 +965,38 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
firstCharsInScripts_ = firstStringsInScript(status);
if (U_FAILURE(status)) { return; }
firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
+
+ // Add index exemplar characters before checking the script boundaries,
+ // since this might modify them.
+ addIndexExemplars(locale, status);
+
UnicodeString _4E00((UChar)0x4E00);
- UnicodeString _1100((UChar)0x1100);
- UnicodeString _1112((UChar)0x1112);
- if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
- collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
- // The standard Korean tailoring sorts Hanja (Han characters)
- // as secondary differences from Hangul syllables.
- // This makes U+4E00 not useful as a Han-script boundary.
+ int32_t hanIndex = binarySearch(
+ *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
+ if (hanIndex >= 0) {
+ // Adjust the Han script boundary if necessary.
// TODO: This becomes obsolete when the root collator gets
// reliable script-first-primary mappings.
- int32_t hanIndex = binarySearch(
- *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
- if (hanIndex >= 0) {
+ UnicodeString _1100((UChar)0x1100);
+ UnicodeString _1112((UChar)0x1112);
+ UnicodeString _4E9C((UChar)0x4E9C);
+ if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
+ collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
+ // The standard Korean tailoring sorts Hanja (Han characters)
+ // as secondary differences from Hangul syllables.
+ // This makes U+4E00 not useful as a Han-script boundary.
firstCharsInScripts_->removeElementAt(hanIndex);
+ } else if (collatorPrimaryOnly_->compare(_4E9C, _4E00, status) < 0) {
+ // The standard Japanese tailoring sorts U+4E9C first among Kanji.
+ UnicodeString *fh = new UnicodeString(_4E9C);
+ if (fh == NULL) {
+ status = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ firstCharsInScripts_->setElementAt(fh, hanIndex);
}
}
+
// Guard against a degenerate collator where
// some script boundary strings are primary ignorable.
for (;;) {
@@ -996,10 +1014,6 @@ void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
break;
}
}
-
- if (locale != NULL) {
- addIndexExemplars(*locale, status);
- }
}
diff --git a/i18n/unicode/alphaindex.h b/i18n/unicode/alphaindex.h
index 64e2f543..6f47ea10 100644
--- a/i18n/unicode/alphaindex.h
+++ b/i18n/unicode/alphaindex.h
@@ -675,7 +675,7 @@ private:
* This method is called to get the index exemplars. Normally these come from the locale directly,
* but if they aren't available, we have to synthesize them.
*/
- void addIndexExemplars(const Locale &locale, UErrorCode &status);
+ void addIndexExemplars(const Locale *locale, UErrorCode &status);
/**
* Add Chinese index characters from the tailoring.
*/
diff --git a/test/intltest/alphaindextst.cpp b/test/intltest/alphaindextst.cpp
index 5bef31b9..ea4eeaff 100644
--- a/test/intltest/alphaindextst.cpp
+++ b/test/intltest/alphaindextst.cpp
@@ -63,6 +63,7 @@ void AlphabeticIndexTest::runIndexedTest( int32_t index, UBool exec, const char*
// BEGIN android-remove - test to be added in 51.1
// TESTCASE_AUTO(TestChineseZhuyin);
// END android-remove
+ TESTCASE_AUTO(TestJapaneseKanji);
TESTCASE_AUTO_END;
}
@@ -93,7 +94,8 @@ void AlphabeticIndexTest::APITest() {
// Constructor from a Collator
//
status = U_ZERO_ERROR;
- RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>(Collator::createInstance(Locale::getChinese(), status));
+ RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>(
+ Collator::createInstance(Locale::getGerman(), status));
TEST_CHECK_STATUS;
TEST_ASSERT(coll != NULL);
index = new AlphabeticIndex(coll, status);
@@ -586,7 +588,6 @@ void AlphabeticIndexTest::TestPinyinFirst() {
TEST_CHECK_STATUS;
AlphabeticIndex index(coll.orphan(), status);
TEST_CHECK_STATUS;
- assertEquals("getBucketCount()", 1, index.getBucketCount(status)); // ... (underflow only)
index.addLabels(Locale::getChinese(), status);
assertEquals("getBucketCount()", 28, index.getBucketCount(status)); // ... A-Z ...
int bucketIndex = index.getBucketIndex(UnicodeString((UChar)0x897f), status);
@@ -676,4 +677,21 @@ void AlphabeticIndexTest::TestChineseZhuyin() {
assertEquals("label 5", UnicodeString((UChar)0x3109), immIndex->getBucket(5)->getLabel());
}
+void AlphabeticIndexTest::TestJapaneseKanji() {  // Verifies that sample Kanji map to the last bucket of a Japanese-locale index.
+ UErrorCode status = U_ZERO_ERROR;
+ AlphabeticIndex index(Locale::getJapanese(), status);
+ LocalPointer<AlphabeticIndex::ImmutableIndex> immIndex(index.buildImmutableIndex(status));
+ TEST_CHECK_STATUS;
+ // The Japanese standard collator provides no index characters for Kanji,
+ // so every Kanji below is expected to land in the overflow bucket.
+ static const UChar32 kanji[] = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 };  // includes U+4E9C and U+4E00, the two Han boundary candidates
+ int32_t overflowIndex = immIndex->getBucketCount() - 1;  // index of the last bucket, used here as the overflow bucket
+ for(int32_t i = 0; i < LENGTHOF(kanji); ++i) {
+ char msg[40];  // the message below is 34 characters plus NUL for a one-digit index and four hex digits
+ sprintf(msg, "kanji[%d]=U+%04lX in overflow bucket", (int)i, (long)kanji[i]);
+ assertEquals(msg, overflowIndex, immIndex->getBucketIndex(UnicodeString(kanji[i]), status));
+ TEST_CHECK_STATUS;  // re-checks status after each lookup
+ }
+}
+
#endif
diff --git a/test/intltest/alphaindextst.h b/test/intltest/alphaindextst.h
index 2f864712..1aa00755 100644
--- a/test/intltest/alphaindextst.h
+++ b/test/intltest/alphaindextst.h
@@ -45,6 +45,7 @@ public:
* Test with the Bopomofo-phonetic tailoring.
*/
void TestChineseZhuyin();
+ void TestJapaneseKanji();
};
#endif