コード例 #1
0
ファイル: ucol_sit.cpp プロジェクト: cran/stringi
/**
 * Get a set containing the contractions defined by the collator. The set includes
 * both the UCA contractions and the contractions defined by the collator
 * @param coll collator
 * @param conts the set to hold the result
 * @param status to hold the error code
 * @return the size of the contraction set
 */
U_CAPI int32_t U_EXPORT2
ucol_getContractions( const UCollator *coll,
                  USet *contractions,
                  UErrorCode *status)
{
  ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
  return uset_getItemCount(contractions);
}
コード例 #2
0
UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
    UnicodeSet contractions;
    ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
                                      contractions.toUSet(), NULL, FALSE, &errorCode);
    if (U_FAILURE(errorCode)) { return FALSE; }
    UnicodeString firstHanBoundary;
    UBool hasPinyin = FALSE;
    UnicodeSetIterator iter(contractions);
    while (iter.next()) {
        const UnicodeString &s = iter.getString();
        if (s.startsWith(BASE, BASE_LENGTH)) {
            initialLabels_->add(s);
            if (firstHanBoundary.isEmpty() ||
                    collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
                firstHanBoundary = s;
            }
            UChar c = s.charAt(s.length() - 1);
            if (0x41 <= c && c <= 0x5A) {  // A-Z
                hasPinyin = TRUE;
            }
        }
    }
    if (hasPinyin) {
        initialLabels_->add(0x41, 0x5A);  // A-Z
    }
    if (!firstHanBoundary.isEmpty()) {
        // The hardcoded list of script boundaries includes U+4E00
        // which is tailored to not be the first primary
        // in all Chinese tailorings except "unihan".
        // Replace U+4E00 with the first boundary string from the tailoring.
        // TODO: This becomes obsolete when the root collator gets
        // reliable script-first-primary mappings.
        int32_t hanIndex = binarySearch(
                *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
        if (hanIndex >= 0) {
            UnicodeString *fh = new UnicodeString(firstHanBoundary);
            if (fh == NULL) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return FALSE;
            }
            firstCharsInScripts_->setElementAt(fh, hanIndex);
        }
        return TRUE;
    } else {
        return FALSE;
    }
}
コード例 #3
0
ファイル: ucol_sit.cpp プロジェクト: 00zhengfu00/third_party
U_CDECL_END



/**
 * Get a set containing the contractions defined by the collator. The set includes
 * both the UCA contractions and the contractions defined by the collator
 * @param coll collator
 * @param conts the set to hold the result
 * @param status to hold the error code
 * @return the size of the contraction set
 */
U_CAPI int32_t U_EXPORT2
ucol_getContractions( const UCollator *coll,
                  USet *contractions,
                  UErrorCode *status)
{
  ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
  return uset_getItemCount(contractions);
}
コード例 #4
0
ファイル: ssearch.cpp プロジェクト: Distrotech/icu
void SSearchTest::monkeyTest(char *params)
{
    // ook!
    UErrorCode status = U_ZERO_ERROR;
  //UCollator *coll = ucol_open(NULL, &status);
    UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);

    if (U_FAILURE(status)) {
        errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
        return;
    }

    CollData  *monkeyData = new CollData(coll, status);

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
    U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
    USet *letters = uset_openPattern(letter_pattern, 39, &status);
    SetMonkey letterMonkey(letters);
    StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
    StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
    UnicodeString testCase;
    UnicodeString alternate;
    UnicodeString pattern, altPattern;
    UnicodeString prefix, altPrefix;
    UnicodeString suffix, altSuffix;

    Monkey *monkeys[] = {
        &letterMonkey,
        &contractionMonkey,
        &expansionMonkey,
        &contractionMonkey,
        &expansionMonkey,
        &contractionMonkey,
        &expansionMonkey,
        &contractionMonkey,
        &expansionMonkey};
    int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
    // int32_t nonMatchCount = 0;

    UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
    const char *strengthNames[] = {"primary", "secondary", "tertiary"};
    int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
    int32_t loopCount = quick? 1000 : 10000;
    int32_t firstStrength = 0;
    int32_t lastStrength  = strengthCount - 1; //*/ 0;

    if (params != NULL) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
        UnicodeString p(params);

        loopCount = getIntParam("loop", p, loopCount);
        m_seed    = getIntParam("seed", p, m_seed);

        RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
        if (m.find()) {
            UnicodeString breakType = m.group(1, status);

            for (int32_t s = 0; s < strengthCount; s += 1) {
                if (breakType == strengthNames[s]) {
                    firstStrength = lastStrength = s;
                    break;
                }
            }

            m.reset();
            p = m.replaceFirst("", status);
        }

        if (RegexMatcher("\\S", p, 0, status).find()) {
            // Each option is stripped out of the option string as it is processed.
            // All options have been checked.  The option string should have been completely emptied..
            char buf[100];
            p.extract(buf, sizeof(buf), NULL, status);
            buf[sizeof(buf)-1] = 0;
            errln("Unrecognized or extra parameter:  %s\n", buf);
            return;
        }
#else
        infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
#endif
    }

    for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
        int32_t notFoundCount = 0;

        logln("Setting strength to %s.", strengthNames[s]);
        ucol_setStrength(coll, strengths[s]);

        // TODO: try alternate prefix and suffix too?
        // TODO: alterntaes are only equal at primary strength. Is this OK?
        for(int32_t t = 0; t < loopCount; t += 1) {
            uint32_t seed = m_seed;
            // int32_t  nmc = 0;

            generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
            generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
            generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);

            // pattern
            notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);

            testCase.remove();
            testCase.append(prefix);
            testCase.append(/*alt*/pattern);

            // prefix + pattern
            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);

            testCase.append(suffix);

            // prefix + pattern + suffix
            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);

            testCase.remove();
            testCase.append(pattern);
            testCase.append(suffix);

            // pattern + suffix
            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
        }

       logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
    }

    uset_close(contractions);
    uset_close(expansions);
    uset_close(letters);
    delete monkeyData;

    ucol_close(coll);
}
コード例 #5
0
ファイル: colldata.cpp プロジェクト: icu-project/icu4c
CollData::CollData(UCollator *collator, UErrorCode &status)
    : coll(NULL), ceToCharsStartingWith(NULL)
{
    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    // i.e. other, control, private use, format, surrogate
    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    USet *charsToTest = uset_openPattern(test_pattern, 20, &status);

    // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    // i.e. all the characers we handle implicitly
    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);

    if (U_FAILURE(status)) {
        return;
    }

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();
    int32_t itemCount;

    ceToCharsStartingWith = new CEToStringsMap(status);

    if (U_FAILURE(status)) {
        goto bail;
    }

#ifdef CLONE_COLLATOR
    coll = ucol_safeClone(collator, NULL, NULL, &status);

    if (U_FAILURE(status)) {
        goto bail;
    }
#else
    coll = collator;
#endif

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    uset_addAll(charsToTest, contractions);
    uset_addAll(charsToTest, expansions);
    uset_removeAll(charsToTest, charsToRemove);

    itemCount = uset_getItemCount(charsToTest);
    for(int32_t item = 0; item < itemCount; item += 1) {
        UChar32 start = 0, end = 0;
        UChar buffer[16];
        int32_t len = uset_getItem(charsToTest, item, &start, &end,
                                   buffer, 16, &status);

        if (len == 0) {
            for (UChar32 ch = start; ch <= end; ch += 1) {
                UnicodeString *st = new UnicodeString(ch);

                if (st == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }

                CEList *ceList = new CEList(coll, *st, status);

                ceToCharsStartingWith->put(ceList->get(0), st, status);

                delete ceList;
                delete st;
            }
        } else if (len > 0) {
            UnicodeString *st = new UnicodeString(buffer, len);

            if (st == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }

            CEList *ceList = new CEList(coll, *st, status);

            ceToCharsStartingWith->put(ceList->get(0), st, status);

            delete ceList;
            delete st;
        } else {
            // shouldn't happen...
        }

        if (U_FAILURE(status)) {
             break;
        }
    }

bail:
    uset_close(contractions);
    uset_close(expansions);
    uset_close(charsToRemove);
    uset_close(charsToTest);

    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
    if (U_FAILURE(status)) {
        return;
    }
    UnicodeSetIterator hanIter(hanRanges);
    UnicodeString hanString;
    while(hanIter.nextRange()) {
        hanString.append(hanIter.getCodepoint());
        hanString.append(hanIter.getCodepointEnd());
    }
    // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
    // but as of Unicode 6.3 the 11xx block is filled,
    // and there are also more Jamo T at U+D7CB..U+D7FB.
    // Maybe use [:HST=T:] and look for the end of the last range?
    // Maybe use script boundary mappings instead of this code??
    UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
     UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
     CEList hanList(coll, hanString, status);
     CEList jamoList(coll, jamoString, status);
     int32_t j = 0;

     if (U_FAILURE(status)) {
         return;
     }

     for (int32_t c = 0; c < jamoList.size(); c += 1) {
         uint32_t jce = jamoList[c];

         if (! isContinuation(jce)) {
             jamoLimits[j++] = jce;
         }
     }

     jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);

     minHan = 0xFFFFFFFF;
     maxHan = 0;

     for(int32_t h = 0; h < hanList.size(); h += 2) {
         uint32_t han = (uint32_t) hanList[h];

         if (han < minHan) {
             minHan = han;
         }

         if (han > maxHan) {
             maxHan = han;
         }
     }

     maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
}
コード例 #6
0
//
//  First characters in scripts.
//  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
//  The vector is sorted according to this index's collation.
//
//  This code is too slow to use, so for now hard code the data.
//    Hard coded implementation is follows.
//
UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {

    if (U_FAILURE(status)) {
        return NULL;
    }

    UnicodeString results[USCRIPT_CODE_LIMIT];
    UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");

    UnicodeSetIterator siter(*TO_TRY);
    while (siter.next()) {
        const UnicodeString &current = siter.getString();
        Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
        if (r < 0) {  // TODO fix; we only want "real" script characters, not
                      // symbols.
            continue;
        }

        int script = uscript_getScript(current.char32At(0), &status);
        if (results[script].length() == 0) {
            results[script] = current;
        }
        else if (ruleBasedCollator->compare(current, results[script]) < 0) {
            results[script] = current;
        }
    }

    UnicodeSet extras;
    UnicodeSet expansions;
    RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
    const UCollator *uRuleBasedCollator = rbc->getUCollator();
    ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
    extras.addAll(expansions).removeAll(*TO_TRY);
    if (extras.size() != 0) {
        const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status);
        UnicodeSetIterator extrasIter(extras);
        while (extrasIter.next()) {
            const UnicodeString &current = extrasIter.next();
            if (!TO_TRY->containsAll(current))
                continue;
            if (!normalizer->isNormalized(current, status) ||
                ruleBasedCollator->compare(current, LOWER_A) < 0) {
                continue;
            }
            int script = uscript_getScript(current.char32At(0), &status);
            if (results[script].length() == 0) {
                results[script] = current;
            } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
                results[script] = current;
            }
        }
    }

    UVector *dest = new UVector(status);
    dest->setDeleter(uprv_deleteUObject);
    for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
        if (results[i].length() > 0) {
            dest->addElement(results[i].clone(), status);
        }
    }
    dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
    return dest;
}