void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { if (U_FAILURE(status)) { return; } if (locale == NULL && collator_ == NULL) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } initialLabels_ = new UnicodeSet(); if (initialLabels_ == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } inflowLabel_.setTo((UChar)0x2026); // Ellipsis overflowLabel_ = inflowLabel_; underflowLabel_ = inflowLabel_; if (collator_ == NULL) { Collator *coll = Collator::createInstance(*locale, status); if (U_FAILURE(status)) { delete coll; return; } if (coll == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } collator_ = dynamic_cast<RuleBasedCollator *>(coll); if (collator_ == NULL) { delete coll; status = U_UNSUPPORTED_ERROR; return; } } collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); if (collatorPrimaryOnly_ == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return; } collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); firstCharsInScripts_ = firstStringsInScript(status); if (U_FAILURE(status)) { return; } firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); // Guard against a degenerate collator where // some script boundary strings are primary ignorable. for (;;) { if (U_FAILURE(status)) { return; } if (firstCharsInScripts_->isEmpty()) { // AlphabeticIndex requires some non-ignorable script boundary strings. status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (collatorPrimaryOnly_->compare( *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), emptyString_, status) == UCOL_EQUAL) { firstCharsInScripts_->removeElementAt(0); } else { break; } } // Chinese index characters, which are specific to each of the several Chinese tailorings, // take precedence over the single locale data exemplar set per language. if (!addChineseIndexCharacters(status) && locale != NULL) { addIndexExemplars(*locale, status); } }
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { if (U_FAILURE(status)) { return; } // Chinese index characters, which are specific to each of the several Chinese tailorings, // take precedence over the single locale data exemplar set per language. const char *language = locale.getLanguage(); if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || uprv_strcmp(language, "ko") == 0) { // TODO: This should be done regardless of the language, but it's expensive. // We should add a Collator function (can be @internal) // to enumerate just the contractions that start with a given code point or string. if (addChineseIndexCharacters(status) || U_FAILURE(status)) { return; } } LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); if (U_FAILURE(status)) { return; } UnicodeSet exemplars; ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); if (U_SUCCESS(status)) { initialLabels_->addAll(exemplars); return; } status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR // The locale data did not include explicit Index characters. // Synthesize a set of them from the locale's standard exemplar characters. ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); if (U_FAILURE(status)) { return; } // question: should we add auxiliary exemplars? if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { exemplars.add(0x61, 0x7A); } if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables // cut down to small list exemplars.remove(0xAC00, 0xD7A3). add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). add(0xD30C).add(0xD558); } if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block // cut down to small list // make use of the fact that Ethiopic is allocated in 8's, where // the base is 0 mod 8. UnicodeSet ethiopic( UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); UnicodeSetIterator it(ethiopic); while (it.next() && !it.isString()) { if ((it.getCodepoint() & 0x7) != 0) { exemplars.remove(it.getCodepoint()); } } } // Upper-case any that aren't already so. // (We only do this for synthesized index characters.) UnicodeSetIterator it(exemplars); UnicodeString upperC; while (it.next()) { const UnicodeString &exemplarC = it.getString(); upperC = exemplarC; upperC.toUpper(locale); initialLabels_->add(upperC); } }