Beispiel #1
0
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet exemplars;
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
    if (U_SUCCESS(status)) {
        initialLabels_->addAll(exemplars);
        return;
    }
    status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR

    // The locale data did not include explicit Index characters.
    // Synthesize a set of them from the locale's standard exemplar characters.
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
    if (U_FAILURE(status)) {
        return;
    }

    // question: should we add auxiliary exemplars?
    if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
        exemplars.add(0x61, 0x7A);
    }
    if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
        // cut down to small list
        exemplars.remove(0xAC00, 0xD7A3).
            add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
            add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
            add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
            add(0xD30C).add(0xD558);
    }
    if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
        // cut down to small list
        // make use of the fact that Ethiopic is allocated in 8's, where
        // the base is 0 mod 8.
        UnicodeSet ethiopic(
            UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
        UnicodeSetIterator it(ethiopic);
        while (it.next() && !it.isString()) {
            if ((it.getCodepoint() & 0x7) != 0) {
                exemplars.remove(it.getCodepoint());
            }
        }
    }

    // Upper-case any that aren't already so.
    //   (We only do this for synthesized index characters.)
    UnicodeSetIterator it(exemplars);
    UnicodeString upperC;
    while (it.next()) {
        const UnicodeString &exemplarC = it.getString();
        upperC = exemplarC;
        upperC.toUpper(locale);
        initialLabels_->add(upperC);
    }
}
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
    if (U_FAILURE(status)) { return; }
    // Chinese index characters, which are specific to each of the several Chinese tailorings,
    // take precedence over the single locale data exemplar set per language.
    const char *language = locale.getLanguage();
    if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
            uprv_strcmp(language, "ko") == 0) {
        // TODO: This should be done regardless of the language, but it's expensive.
        // We should add a Collator function (can be @internal)
        // to enumerate just the contractions that start with a given code point or string.
        if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
            return;
        }
    }

    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet exemplars;
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
    if (U_SUCCESS(status)) {
        initialLabels_->addAll(exemplars);
        return;
    }
    status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR

    // The locale data did not include explicit Index characters.
    // Synthesize a set of them from the locale's standard exemplar characters.
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
    if (U_FAILURE(status)) {
        return;
    }

    // question: should we add auxiliary exemplars?
    if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
        exemplars.add(0x61, 0x7A);
    }
    if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
        // cut down to small list
        exemplars.remove(0xAC00, 0xD7A3).
            add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
            add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
            add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
            add(0xD30C).add(0xD558);
    }
    if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
        // cut down to small list
        // make use of the fact that Ethiopic is allocated in 8's, where
        // the base is 0 mod 8.
        UnicodeSet ethiopic(
            UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
        UnicodeSetIterator it(ethiopic);
        while (it.next() && !it.isString()) {
            if ((it.getCodepoint() & 0x7) != 0) {
                exemplars.remove(it.getCodepoint());
            }
        }
    }

    // Upper-case any that aren't already so.
    //   (We only do this for synthesized index characters.)
    UnicodeSetIterator it(exemplars);
    UnicodeString upperC;
    while (it.next()) {
        const UnicodeString &exemplarC = it.getString();
        upperC = exemplarC;
        upperC.toUpper(locale);
        initialLabels_->add(upperC);
    }
}
void AlphabeticIndex::getIndexExemplars(UnicodeSet  &dest, const Locale &locale, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
    UnicodeSet exemplars;
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
    if (U_SUCCESS(status)) {
        dest.addAll(exemplars);
        return;
    }
    status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR

    // Locale data did not include explicit Index characters.
    // Synthesize a set of them from the locale's standard exemplar characters.

    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
    if (U_FAILURE(status)) {
        return;
    }

    // Upper-case any that aren't already so.
    //   (We only do this for synthesized index characters.)

    UnicodeSetIterator it(exemplars);
    UnicodeString upperC;
    UnicodeSet  lowersToRemove;
    UnicodeSet  uppersToAdd;
    while (it.next()) {
        const UnicodeString &exemplarC = it.getString();
        upperC = exemplarC;
        upperC.toUpper(locale);
        if (exemplarC != upperC) {
            lowersToRemove.add(exemplarC);
            uppersToAdd.add(upperC);
        }
    }
    exemplars.removeAll(lowersToRemove);
    exemplars.addAll(uppersToAdd);

    // get the exemplars, and handle special cases

    // question: should we add auxiliary exemplars?
    if (exemplars.containsSome(*CORE_LATIN)) {
        exemplars.addAll(*CORE_LATIN);
    }
    if (exemplars.containsSome(*HANGUL)) {
        // cut down to small list
        UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status);
        exemplars.removeAll(BLOCK_HANGUL_SYLLABLES);
        exemplars.addAll(*HANGUL);
    }
    if (exemplars.containsSome(*ETHIOPIC)) {
        // cut down to small list
        // make use of the fact that Ethiopic is allocated in 8's, where
        // the base is 0 mod 8.
        UnicodeSetIterator  it(*ETHIOPIC);
        while (it.next() && !it.isString()) {
            if ((it.getCodepoint() & 0x7) != 0) {
                exemplars.remove(it.getCodepoint());
            }
        }
    }
    dest.addAll(exemplars);
}