Ejemplo n.º 1
0
void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
    UScriptCode scripts[30];

    int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
    if (U_FAILURE(status)) {
        return;
    }
    if (status == U_USING_DEFAULT_WARNING) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    UnicodeSet tmpSet;
    int32_t    i;
    for (i=0; i<numScripts; i++) {
        tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
        allowedChars->addAll(tmpSet);
    }
}
Ejemplo n.º 2
0
void
NamesPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
                            UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(!newValues.contains(UCHAR_NAME) && !newValues.contains(PPUCD_NAME_ALIAS)) {
        return;
    }

    U_ASSERT(props.start==props.end);

    const char *names[4]={ NULL, NULL, NULL, NULL };
    int16_t lengths[4]={ 0, 0, 0, 0 };

    /* get the character name */
    if(props.name!=NULL) {
        names[0]=props.name;
        lengths[0]=(int16_t)uprv_strlen(props.name);
        parseName(names[0], lengths[0]);
    }

    CharString buffer;
    if(props.nameAlias!=NULL) {
        /*
         * Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line.
         * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character.
         */
        const char *corr=uprv_strstr(props.nameAlias, "correction=");
        if(corr!=NULL) {
            corr+=11;  // skip "correction="
            const char *limit=uprv_strchr(corr, ',');
            if(limit!=NULL) {
                buffer.append(corr, limit-corr, errorCode);
                names[3]=buffer.data();
                lengths[3]=(int16_t)(limit-corr);
            } else {
                names[3]=corr;
                lengths[3]=(int16_t)uprv_strlen(corr);
            }
            parseName(names[3], lengths[3]);
        }
    }

    addLine(props.start, names, lengths, LENGTHOF(names));
}
Ejemplo n.º 3
0
void TransliteratorErrorTest::TestUnicodeSetErrors() {
    UnicodeString badPattern="[[:L:]-[0x0300-0x0400]";
    UnicodeSet set;
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString result;

    if (!set.isEmpty()) {
        errln("FAIL: The default ctor of UnicodeSet created a non-empty object.");
    }
    set.applyPattern(badPattern, status);
    if (U_SUCCESS(status)) {
        errln("FAIL: Applied a bad pattern to the UnicodeSet object okay.");
    }
    status = U_ZERO_ERROR;
    UnicodeSet *set1 = new UnicodeSet(badPattern, status);
    if (U_SUCCESS(status)) {
        errln("FAIL: Created a UnicodeSet based on bad patterns.");
    }
    delete set1;
}
Ejemplo n.º 4
0
U_CAPI void U_EXPORT2
uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return;
    }
    if (chars->isBogus()) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
    if (clonedSet == NULL || clonedSet->isBogus()) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    clonedSet->freeze();
    delete This->fAllowedCharsSet;
    This->fAllowedCharsSet = clonedSet;
    This->fChecks |= USPOOF_CHAR_LIMIT;
}
Ejemplo n.º 5
0
 static int32_t span(const UnicodeSet &set, const UChar *s, int32_t length, UBool tf) {
     UChar32 c;
     int32_t start=0, prev;
     while((prev=start)<length) {
         U16_NEXT(s, start, length, c);
         if(tf!=set.contains(c)) {
             break;
         }
     }
     return prev;
 }
Ejemplo n.º 6
0
void
BiDiPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode) || newValues.containsNone(relevantProps)) { return; }

    UChar32 start=props.start;
    UChar32 end=props.end;

    // The runtime code relies on this invariant for returning both bmg and bpb
    // from the same data.
    int32_t bpt=props.getIntProp(UCHAR_BIDI_PAIRED_BRACKET_TYPE);
    if(!(bpt==0 ? props.bpb==U_SENTINEL : props.bpb==props.bmg)) {
        fprintf(stderr,
                "genprops error: invariant not true: "
                "if(bpt==None) then bpb=<none> else bpb=bmg\n");
        return;
    }
    int32_t delta=encodeBidiMirroringGlyph(start, end, props.bmg, errorCode);
    uint32_t value=(uint32_t)delta<<UBIDI_MIRROR_DELTA_SHIFT;
    if(props.binProps[UCHAR_BIDI_MIRRORED]) {
        value|=U_MASK(UBIDI_IS_MIRRORED_SHIFT);
    }
    if(props.binProps[UCHAR_BIDI_CONTROL]) {
        value|=U_MASK(UBIDI_BIDI_CONTROL_SHIFT);
    }
    if(props.binProps[UCHAR_JOIN_CONTROL]) {
        value|=U_MASK(UBIDI_JOIN_CONTROL_SHIFT);
    }
    value|=(uint32_t)bpt<<UBIDI_BPT_SHIFT;
    value|=(uint32_t)props.getIntProp(UCHAR_JOINING_TYPE)<<UBIDI_JT_SHIFT;
    value|=(uint32_t)props.getIntProp(UCHAR_BIDI_CLASS);
    utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: BiDiPropsBuilder utrie2_setRange32() failed - %s\n",
                u_errorName(errorCode));
        return;
    }

    // Store Joining_Group values from vector column 1 in simple byte arrays.
    int32_t jg=props.getIntProp(UCHAR_JOINING_GROUP);
    for(UChar32 c=start; c<=end; ++c) {
        int32_t jgStart;
        if(MIN_JG_START<=c && c<MAX_JG_LIMIT) {
            jgArray[c-MIN_JG_START]=(uint8_t)jg;
        } else if(MIN_JG_START2<=c && c<MAX_JG_LIMIT2) {
            jgArray2[c-MIN_JG_START2]=(uint8_t)jg;
        } else if(jg!=U_JG_NO_JOINING_GROUP) {
            fprintf(stderr, "genprops error: Joining_Group for out-of-range code points U+%04lx..U+%04lx\n",
                    (long)start, (long)end);
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
    }
}
Ejemplo n.º 7
0
void
PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    scx.clear();
    CharString scString;
    for(;;) {
        const char *scs;
        const char *scLimit=strchr(s, ' ');
        if(scLimit!=NULL) {
            scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
            if(U_FAILURE(errorCode)) { return; }
        } else {
            scs=s;
        }
        int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
        if(script==UCHAR_INVALID_CODE) {
            fprintf(stderr,
                    "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
                    scs, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return;
        } else if(scx.contains(script)) {
            fprintf(stderr,
                    "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
                    scs, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return;
        } else {
            scx.add(script);
        }
        if(scLimit!=NULL) {
            s=scLimit+1;
        } else {
            break;
        }
    }
    if(scx.isEmpty()) {
        fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
        errorCode=U_PARSE_ERROR;
    }
}
Ejemplo n.º 8
0
void AlphabeticIndex::buildBucketList(UErrorCode &status) {
    UnicodeString labelStr = getUnderflowLabel();
    Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status);
    bucketList_->addElement(b, status);

    // Build up the list, adding underflow, additions, overflow
    // insert infix labels as needed, using \uFFFF.
    const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0));
    b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status);
    bucketList_->addElement(b, status);

    UnicodeSet lastSet;
    UnicodeSet set;
    AlphabeticIndex::getScriptSet(lastSet, *last, status);
    lastSet.removeAll(*IGNORE_SCRIPTS);

    for (int i = 1; i < labels_->size(); ++i) {
        UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i));
        getScriptSet(set, *current, status);
        set.removeAll(*IGNORE_SCRIPTS);
        if (lastSet.containsNone(set)) {
            // check for adjacent
            const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
            if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
                labelStr = getInflowLabel();
                b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status);
                bucketList_->addElement(b, status);
                i++;
                lastSet = set;
            }
        }
        b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status);
        bucketList_->addElement(b, status);
        last = current;
        lastSet = set;
    }
    const UnicodeString &limitString = getOverflowComparisonString(*last, status);
    b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status);
    bucketList_->addElement(b, status);
    // final overflow bucket
}
Ejemplo n.º 9
0
    SpanBackUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
        // Verify that the frozen set is equal to the unfrozen one.
        UnicodeSet set;
        char utf8[4];
        UChar32 c;
        int32_t length;

        for(c=0; c<=0x10ffff; ++c) {
            if(c==0xd800) {
                c=0xe000;
            }
            length=0;
            U8_APPEND_UNSAFE(utf8, length, c);
            if(testcase.set.spanBackUTF8(utf8, length, USET_SPAN_CONTAINED)==0) {
                set.add(c);
            }
        }
        if(set!=testcase.set) {
            fprintf(stderr, "error: frozen set != original!\n");
        }
    }
Ejemplo n.º 10
0
void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    UChar32 ch;
    for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) {
    ch = output.char32At(i);
    UnicodeReplacer* r = data->lookupReplacer(ch);
    if (r == NULL) {
        toUnionTo.add(ch);
    } else {
        r->addReplacementSetTo(toUnionTo);
    }
    }
}
Ejemplo n.º 11
0
/**
 * Implement UnicodeMatcher
 */
void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    UChar32 ch;
    for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) {
        ch = pattern.char32At(i);
        const UnicodeMatcher* matcher = data->lookupMatcher(ch);
        if (matcher == NULL) {
            toUnionTo.add(ch);
        } else {
            matcher->addMatchSetTo(toUnionTo);
        }
    }
}
Ejemplo n.º 12
0
/**
 * Union the set of all characters that may be modified by this rule
 * into the given set.
 */
void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
    int32_t limit = anteContextLength + keyLength;
    for (int32_t i=anteContextLength; i<limit; ) {
        UChar32 ch = pattern.char32At(i);
        i += UTF_CHAR_LENGTH(ch);
        const UnicodeMatcher* matcher = data->lookupMatcher(ch);
        if (matcher == NULL) {
            toUnionTo.add(ch);
        } else {
            matcher->addMatchSetTo(toUnionTo);
        }
    }
}
Ejemplo n.º 13
0
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    LocalPointer<UVector> dest(new UVector(status), status);
    if (U_FAILURE(status)) {
        return NULL;
    }
    dest->setDeleter(uprv_deleteUObject);
    // Fetch the script-first-primary contractions which are defined in the root collator.
    // They all start with U+FDD1.
    UnicodeSet set;
    collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (set.isEmpty()) {
        status = U_UNSUPPORTED_ERROR;
        return NULL;
    }
    UnicodeSetIterator iter(set);
    while (iter.next()) {
        const UnicodeString &boundary = iter.getString();
        uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
        if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
            // Ignore boundaries for the special reordering groups.
            // Take only those for "real scripts" (where the sample character is a Letter,
            // and the one for unassigned implicit weights (Cn).
            continue;
        }
        UnicodeString *s = new UnicodeString(boundary);
        if (s == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        dest->addElement(s, status);
    }
    return dest.orphan();
}
Ejemplo n.º 14
0
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);

    if (U_SUCCESS(status)) {
        // handle Korean and Japanese/Chinese using different dictionaries
        if (type == kKorean) {
            setCharacters(fHangulWordSet);
        } else { //Chinese and Japanese
            UnicodeSet cjSet;
            cjSet.addAll(fHanWordSet);
            cjSet.addAll(fKatakanaWordSet);
            cjSet.addAll(fHiraganaWordSet);
            cjSet.add(0xFF70);
            cjSet.add(0x30FC);
            setCharacters(cjSet);
        }
    }
}
Ejemplo n.º 15
0
void StaticUnicodeSetsTest::testSetCoverage() {
    UErrorCode status = U_ZERO_ERROR;

    // Lenient comma/period should be supersets of strict comma/period;
    // it also makes the coverage logic cheaper.
    assertTrue(
            "COMMA should be superset of STRICT_COMMA",
            get(unisets::COMMA)->containsAll(*get(unisets::STRICT_COMMA)));
    assertTrue(
            "PERIOD should be superset of STRICT_PERIOD",
            get(unisets::PERIOD)->containsAll(*get(unisets::STRICT_PERIOD)));

    UnicodeSet decimals;
    decimals.addAll(*get(unisets::STRICT_COMMA));
    decimals.addAll(*get(unisets::STRICT_PERIOD));
    decimals.freeze();
    UnicodeSet grouping;
    grouping.addAll(decimals);
    grouping.addAll(*get(unisets::OTHER_GROUPING_SEPARATORS));
    decimals.freeze();

    const UnicodeSet &plusSign = *get(unisets::PLUS_SIGN);
    const UnicodeSet &minusSign = *get(unisets::MINUS_SIGN);
    const UnicodeSet &percent = *get(unisets::PERCENT_SIGN);
    const UnicodeSet &permille = *get(unisets::PERMILLE_SIGN);
    const UnicodeSet &infinity = *get(unisets::INFINITY_KEY);

    int32_t localeCount;
    const Locale* allAvailableLocales = Locale::getAvailableLocales(localeCount);
    for (int32_t i = 0; i < localeCount; i++) {
        Locale locale = allAvailableLocales[i];
        DecimalFormatSymbols dfs(locale, status);
        UnicodeString localeName;
        locale.getDisplayName(localeName);
        assertSuccess(UnicodeString("Making DFS for ") + localeName, status);

#define ASSERT_IN_SET(name, foo) assertInSet(localeName, UnicodeString("" #name ""), name, foo)
        ASSERT_IN_SET(decimals, dfs.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol));
        ASSERT_IN_SET(grouping, dfs.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol));
        ASSERT_IN_SET(plusSign, dfs.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol));
        ASSERT_IN_SET(minusSign, dfs.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol));
        ASSERT_IN_SET(percent, dfs.getConstSymbol(DecimalFormatSymbols::kPercentSymbol));
        ASSERT_IN_SET(permille, dfs.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol));
        ASSERT_IN_SET(infinity, dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol));
    }
}
Ejemplo n.º 16
0
//---------------------------------------------------------------------------------
//
//  scanSet    Construct a UnicodeSet from the text at the current scan
//             position.  Advance the scan position to the first character
//             after the set.
//
//             A new RBBI setref node referring to the set is pushed onto the node
//             stack.
//
//             The scan position is normally under the control of the state machine
//             that controls rule parsing.  UnicodeSets, however, are parsed by
//             the UnicodeSet constructor, not by the RBBI rule parser.
//
//---------------------------------------------------------------------------------
void RBBIRuleScanner::scanSet() {
    UnicodeSet    *uset;
    ParsePosition  pos;
    int            startPos;
    int            i;

    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }

    pos.setIndex(fScanIndex);
    startPos = fScanIndex;
    UErrorCode localStatus = U_ZERO_ERROR;
    uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE,
                          fSymbolTable,
                          localStatus);
    if (U_FAILURE(localStatus)) {
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
        //         UnicodeSet appears to not be reporting correctly at this time.
#ifdef RBBI_DEBUG
        RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
#endif
        error(localStatus);
        delete uset;
        return;
    }

    // Verify that the set contains at least one code point.
    //
    if (uset->isEmpty()) {
        // This set is empty.
        //  Make it an error, because it almost certainly is not what the user wanted.
        //  Also, avoids having to think about corner cases in the tree manipulation code
        //   that occurs later on.
        error(U_BRK_RULE_EMPTY_SET);
        delete uset;
        return;
    }


    // Advance the RBBI parse postion over the UnicodeSet pattern.
    //   Don't just set fScanIndex because the line/char positions maintained
    //   for error reporting would be thrown off.
    i = pos.getIndex();
    for (;;) {
        if (fNextIndex >= i) {
            break;
        }
        nextCharLL();
    }

    if (U_SUCCESS(*fRB->fStatus)) {
        RBBINode         *n;

        n = pushNewNode(RBBINode::setRef);
        n->fFirstPos = startPos;
        n->fLastPos  = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        //  findSetFor() serves several purposes here:
        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
        //     - Mantains collection of all sets in use, needed later for establishing
        //          character categories for run time engine.
        //     - Eliminates mulitiple instances of the same set.
        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
        findSetFor(n->fText, n, uset);
    }

}
Ejemplo n.º 17
0
/**
 * Parse the pattern from the given RuleCharacterIterator.  The
 * iterator is advanced over the parsed pattern.
 * @param chars iterator over the pattern characters.  Upon return
 * it will be advanced to the first character after the parsed
 * pattern, or the end of the iteration if all characters are
 * parsed.
 * @param symbols symbol table to use to parse and dereference
 * variables, or null if none.
 * @param rebuiltPat the pattern that was parsed, rebuilt or
 * copied from the input pattern, as appropriate.
 * @param options a bit mask of zero or more of the following:
 * IGNORE_SPACE, CASE.
 */
void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                              const SymbolTable* symbols,
                              UnicodeString& rebuiltPat,
                              uint32_t options,
                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
                              UErrorCode& ec) {
    if (U_FAILURE(ec)) return;

    // Syntax characters: [ ] ^ - & { }

    // Recognized special forms for chars, sets: c-c s-s s&s

    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
                   RuleCharacterIterator::PARSE_ESCAPES;
    if ((options & USET_IGNORE_SPACE) != 0) {
        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    }

    UnicodeString patLocal, buf;
    UBool usePat = FALSE;
    UnicodeSetPointer scratch;
    RuleCharacterIterator::Pos backup;

    // mode: 0=before [, 1=between [...], 2=after ]
    // lastItem: 0=none, 1=char, 2=set
    int8_t lastItem = 0, mode = 0;
    UChar32 lastChar = 0;
    UChar op = 0;

    UBool invert = FALSE;

    clear();

    while (mode != 2 && !chars.atEnd()) {
        U_ASSERT((lastItem == 0 && op == 0) ||
                 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
                 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
                                    op == INTERSECTION /*'&'*/)));

        UChar32 c = 0;
        UBool literal = FALSE;
        UnicodeSet* nested = 0; // alias - do not delete

        // -------- Check for property pattern

        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
        int8_t setMode = 0;
        if (resemblesPropertyPattern(chars, opts)) {
            setMode = 2;
        }

        // -------- Parse '[' of opening delimiter OR nested set.
        // If there is a nested set, use `setMode' to define how
        // the set should be parsed.  If the '[' is part of the
        // opening delimiter for this pattern, parse special
        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
        // characters representing a nested set in the symbol
        // table.

        else {
            // Prepare to backup if necessary
            chars.getPos(backup);
            c = chars.next(opts, literal, ec);
            if (U_FAILURE(ec)) return;

            if (c == 0x5B /*'['*/ && !literal) {
                if (mode == 1) {
                    chars.setPos(backup); // backup
                    setMode = 1;
                } else {
                    // Handle opening '[' delimiter
                    mode = 1;
                    patLocal.append((UChar) 0x5B /*'['*/);
                    chars.getPos(backup); // prepare to backup
                    c = chars.next(opts, literal, ec); 
                    if (U_FAILURE(ec)) return;
                    if (c == 0x5E /*'^'*/ && !literal) {
                        invert = TRUE;
                        patLocal.append((UChar) 0x5E /*'^'*/);
                        chars.getPos(backup); // prepare to backup
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                    }
                    // Fall through to handle special leading '-';
                    // otherwise restart loop for nested [], \p{}, etc.
                    if (c == HYPHEN /*'-'*/) {
                        literal = TRUE;
                        // Fall through to handle literal '-' below
                    } else {
                        chars.setPos(backup); // backup
                        continue;
                    }
                }
            } else if (symbols != 0) {
                const UnicodeFunctor *m = symbols->lookupMatcher(c);
                if (m != 0) {
                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
                    if (ms == NULL) {
                        ec = U_MALFORMED_SET;
                        return;
                    }
                    // casting away const, but `nested' won't be modified
                    // (important not to modify stored set)
                    nested = const_cast<UnicodeSet*>(ms);
                    setMode = 3;
                }
            }
        }

        // -------- Handle a nested set.  This either is inline in
        // the pattern or represented by a stand-in that has
        // previously been parsed and was looked up in the symbol
        // table.

        if (setMode != 0) {
            if (lastItem == 1) {
                if (op != 0) {
                    // syntaxError(chars, "Char expected after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastItem = 0;
                op = 0;
            }

            if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
                patLocal.append(op);
            }

            if (nested == 0) {
                // lazy allocation
                if (!scratch.allocate()) {
                    ec = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                nested = scratch.pointer();
            }
            switch (setMode) {
            case 1:
                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
                break;
            case 2:
                chars.skipIgnored(opts);
                nested->applyPropertyPattern(chars, patLocal, ec);
                if (U_FAILURE(ec)) return;
                break;
            case 3: // `nested' already parsed
                nested->_toPattern(patLocal, FALSE);
                break;
            }

            usePat = TRUE;

            if (mode == 0) {
                // Entire pattern is a category; leave parse loop
                *this = *nested;
                mode = 2;
                break;
            }

            switch (op) {
            case HYPHEN: /*'-'*/
                removeAll(*nested);
                break;
            case INTERSECTION: /*'&'*/
                retainAll(*nested);
                break;
            case 0:
                addAll(*nested);
                break;
            }

            op = 0;
            lastItem = 2;

            continue;
        }

        if (mode == 0) {
            // syntaxError(chars, "Missing '['");
            ec = U_MALFORMED_SET;
            return;
        }

        // -------- Parse special (syntax) characters.  If the
        // current character is not special, or if it is escaped,
        // then fall through and handle it below.

        if (!literal) {
            switch (c) {
            case 0x5D /*']'*/:
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                // Treat final trailing '-' as a literal
                if (op == HYPHEN /*'-'*/) {
                    add(op, op);
                    patLocal.append(op);
                } else if (op == INTERSECTION /*'&'*/) {
                    // syntaxError(chars, "Trailing '&'");
                    ec = U_MALFORMED_SET;
                    return;
                }
                patLocal.append((UChar) 0x5D /*']'*/);
                mode = 2;
                continue;
            case HYPHEN /*'-'*/:
                if (op == 0) {
                    if (lastItem != 0) {
                        op = (UChar) c;
                        continue;
                    } else {
                        // Treat final trailing '-' as a literal
                        add(c, c);
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x5D /*']'*/ && !literal) {
                            patLocal.append(HYPHEN_RIGHT_BRACE, 2);
                            mode = 2;
                            continue;
                        }
                    }
                }
                // syntaxError(chars, "'-' not after char or set");
                ec = U_MALFORMED_SET;
                return;
            case INTERSECTION /*'&'*/:
                if (lastItem == 2 && op == 0) {
                    op = (UChar) c;
                    continue;
                }
                // syntaxError(chars, "'&' not after set");
                ec = U_MALFORMED_SET;
                return;
            case 0x5E /*'^'*/:
                // syntaxError(chars, "'^' not after '['");
                ec = U_MALFORMED_SET;
                return;
            case 0x7B /*'{'*/:
                if (op != 0) {
                    // syntaxError(chars, "Missing operand after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                lastItem = 0;
                buf.truncate(0);
                {
                    UBool ok = FALSE;
                    while (!chars.atEnd()) {
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x7D /*'}'*/ && !literal) {
                            ok = TRUE;
                            break;
                        }
                        buf.append(c);
                    }
                    if (buf.length() < 1 || !ok) {
                        // syntaxError(chars, "Invalid multicharacter string");
                        ec = U_MALFORMED_SET;
                        return;
                    }
                }
                // We have new string. Add it to set and continue;
                // we don't need to drop through to the further
                // processing
                add(buf);
                patLocal.append((UChar) 0x7B /*'{'*/);
                _appendToPat(patLocal, buf, FALSE);
                patLocal.append((UChar) 0x7D /*'}'*/);
                continue;
            case SymbolTable::SYMBOL_REF:
                //         symbols  nosymbols
                // [a-$]   error    error (ambiguous)
                // [a$]    anchor   anchor
                // [a-$x]  var "x"* literal '$'
                // [a-$.]  error    literal '$'
                // *We won't get here in the case of var "x"
                {
                    chars.getPos(backup);
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    UBool anchor = (c == 0x5D /*']'*/ && !literal);
                    if (symbols == 0 && !anchor) {
                        c = SymbolTable::SYMBOL_REF;
                        chars.setPos(backup);
                        break; // literal '$'
                    }
                    if (anchor && op == 0) {
                        if (lastItem == 1) {
                            add(lastChar, lastChar);
                            _appendToPat(patLocal, lastChar, FALSE);
                        }
                        add(U_ETHER);
                        usePat = TRUE;
                        patLocal.append((UChar) SymbolTable::SYMBOL_REF);
                        patLocal.append((UChar) 0x5D /*']'*/);
                        mode = 2;
                        continue;
                    }
                    // syntaxError(chars, "Unquoted '$'");
                    ec = U_MALFORMED_SET;
                    return;
                }
            default:
                break;
            }
        }

        // -------- Parse literal characters.  This includes both
        // escaped chars ("\u4E01") and non-syntax characters
        // ("a").

        switch (lastItem) {
        case 0:
            lastItem = 1;
            lastChar = c;
            break;
        case 1:
            if (op == HYPHEN /*'-'*/) {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
                    // these are most likely typos.
                    // syntaxError(chars, "Invalid range");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, c);
                _appendToPat(patLocal, lastChar, FALSE);
                patLocal.append(op);
                _appendToPat(patLocal, c, FALSE);
                lastItem = 0;
                op = 0;
            } else {
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastChar = c;
            }
            break;
        case 2:
            if (op != 0) {
                // syntaxError(chars, "Set expected after operator");
                ec = U_MALFORMED_SET;
                return;
            }
            lastChar = c;
            lastItem = 1;
            break;
        }
    }

    if (mode != 2) {
        // syntaxError(chars, "Missing ']'");
        ec = U_MALFORMED_SET;
        return;
    }

    chars.skipIgnored(opts);

    /**
     * Handle global flags (invert, case insensitivity).  If this
     * pattern should be compiled case-insensitive, then we need
     * to close over case BEFORE COMPLEMENTING.  This makes
     * patterns like /[^abc]/i work.
     */
    if ((options & USET_CASE_INSENSITIVE) != 0) {
        (this->*caseClosure)(USET_CASE_INSENSITIVE);
    }
    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
        (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
    }
    if (invert) {
        complement();
    }

    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
    // generated pattern.
    if (usePat) {
        rebuiltPat.append(patLocal);
    } else {
        _generatePattern(rebuiltPat, FALSE);
    }
    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
}
Ejemplo n.º 18
0
//----------------------------------------------------------------------------
//
//  main      for genctd
//
//----------------------------------------------------------------------------
int  main(int argc, char **argv) {
    UErrorCode  status = U_ZERO_ERROR;
    const char *wordFileName;
    const char *outFileName;
    const char *outDir = NULL;
    const char *copyright = NULL;

    //
    // Pick up and check the command line arguments,
    //    using the standard ICU tool utils option handling.
    //
    U_MAIN_INIT_ARGS(argc, argv);
    progName = argv[0];
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    if(argc<0) {
        // Unrecognized option
        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if(options[0].doesOccur || options[1].doesOccur) {
        //  -? or -h for help.
        usageAndDie(0);
    }

    if (!options[3].doesOccur || argc < 2) {
        fprintf(stderr, "input and output file must both be specified.\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    outFileName  = options[3].value;
    wordFileName = argv[1];

    if (options[4].doesOccur) {
        u_setDataDirectory(options[4].value);
    }

    status = U_ZERO_ERROR;

    /* Combine the directory with the file name */
    if(options[5].doesOccur) {
        outDir = options[5].value;
    }
    if (options[6].doesOccur) {
        copyright = U_COPYRIGHT_STRING;
    }

#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO

    UNewDataMemory *pData;
    char msg[1024];

    /* write message with just the name */
    sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    fprintf(stderr, "%s\n", msg);

    /* write the dummy data file */
    pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    udata_writeBlock(pData, msg, strlen(msg));
    udata_finish(pData, &status);
    return (int)status;

#else
    /* Initialize ICU */
    u_init(&status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
            argv[0], u_errorName(status));
        exit(1);
    }
    status = U_ZERO_ERROR;

    //
    //  Read in the dictionary source file
    //
    long        result;
    long        wordFileSize;
    FILE        *file;
    char        *wordBufferC;
    MutableTrieDictionary *mtd = NULL;
    
    file = fopen(wordFileName, "rb");
    if( file == 0 ) { //cannot find file
        //create 1-line dummy file: ie 1 char, 1 value
        UNewDataMemory *pData;
        char msg[1024];

        /* write message with just the name */
        sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName);
        fprintf(stderr, "%s\n", msg);

        UChar c = 0x0020;
        mtd = new MutableTrieDictionary(c, status, TRUE);
        mtd->addWord(&c, 1, status, 1);

    } else { //read words in from input file
        fseek(file, 0, SEEK_END);
        wordFileSize = ftell(file);
        fseek(file, 0, SEEK_SET);
        wordBufferC = new char[wordFileSize+10];
    
        result = (long)fread(wordBufferC, 1, wordFileSize, file);
        if (result != wordFileSize)  {
            fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
            exit (-1);
        }
        wordBufferC[wordFileSize]=0;
        fclose(file);
    
        //
        // Look for a Unicode Signature (BOM) on the word file
        //
        int32_t        signatureLength;
        const char *   wordSourceC = wordBufferC;
        const char*    encoding = ucnv_detectUnicodeSignature(
                               wordSourceC, wordFileSize, &signatureLength, &status);
        if (U_FAILURE(status)) {
            exit(status);
        }
        if(encoding!=NULL ){
            wordSourceC  += signatureLength;
            wordFileSize -= signatureLength;
        }
    
        //
        // Open a converter to take the rule file to UTF-16
        //
        UConverter* conv;
        conv = ucnv_open(encoding, &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
            exit(status);
        }
    
        //
        // Convert the words to UChar.
        //  Preflight first to determine required buffer size.
        //
        uint32_t destCap = ucnv_toUChars(conv,
                           NULL,           //  dest,
                           0,              //  destCapacity,
                           wordSourceC,
                           wordFileSize,
                           &status);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
            exit(status);
        };
    
        status = U_ZERO_ERROR;
        UChar *wordSourceU = new UChar[destCap+1];
        ucnv_toUChars(conv,
                      wordSourceU,     //  dest,
                      destCap+1,
                      wordSourceC,
                      wordFileSize,
                      &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
            exit(status);
        };
        ucnv_close(conv);
    
        // Get rid of the original file buffer
        delete[] wordBufferC;
    
        // Create a MutableTrieDictionary, and loop through all the lines, inserting
        // words.
    
        // First, pick a median character.
        UChar *current = wordSourceU + (destCap/2);
        UChar uc = *current++;
        UnicodeSet breaks;
        breaks.add(0x000A);     // Line Feed
        breaks.add(0x000D);     // Carriage Return
        breaks.add(0x2028);     // Line Separator
        breaks.add(0x2029);     // Paragraph Separator
    
        do { 
            // Look for line break
            while (uc && !breaks.contains(uc)) {
                uc = *current++;
            }
            // Now skip to first non-line-break
            while (uc && breaks.contains(uc)) {
                uc = *current++;
            }
        }
        while (uc && (breaks.contains(uc) || u_isspace(uc)));
    
        mtd = new MutableTrieDictionary(uc, status);
        
        if (U_FAILURE(status)) {
            fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
            exit(status);
        }
        
        // Now add the words. Words are non-space characters at the beginning of
        // lines, and must be at least one UChar. If a word has an associated value,
        // the value should follow the word on the same line after a tab character.
        current = wordSourceU;
        UChar *candidate = current;
        uc = *current++;
        int32_t length = 0;
        int count = 0;
                
        while (uc) {
            while (uc && !u_isspace(uc)) {
                ++length;
                uc = *current++;
            }
            
            UnicodeString valueString;
            UChar candidateValue;
            if(uc == 0x0009){ //separator is a tab char, read in number after space
            	while (uc && u_isspace(uc)) {
            		uc = *current++;
            	}
                while (uc && !u_isspace(uc)) {
                    valueString.append(uc);
                    uc = *current++;
                }
            }
            
            if (length > 0) {
                count++;
                if(valueString.length() > 0){
                    mtd->setValued(TRUE);
    
                    uint32_t value = 0;
                    char* s = new char[valueString.length()];
                    valueString.extract(0,valueString.length(), s, valueString.length());
                    int n = sscanf(s, "%ud", &value);
                    U_ASSERT(n == 1);
                    U_ASSERT(value >= 0); 
                    mtd->addWord(candidate, length, status, (uint16_t)value);
                    delete[] s;
                } else {
                    mtd->addWord(candidate, length, status);
                }
    
                if (U_FAILURE(status)) {
                    fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",
                            u_errorName(status), count);
                    exit(status);
                }
            }
    
            // Find beginning of next line
            while (uc && !breaks.contains(uc)) {
                uc = *current++;
            }
            // Find next non-line-breaking character
            while (uc && breaks.contains(uc)) {
                uc = *current++;
            }
            candidate = current-1;
            length = 0;
        }
    
        // Get rid of the Unicode text buffer
        delete[] wordSourceU;
    }

    // Now, create a CompactTrieDictionary from the mutable dictionary
    CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }
    
    // Get rid of the MutableTrieDictionary
    delete mtd;

    //
    //  Get the binary data from the dictionary.
    //
    uint32_t        outDataSize = ctd->dataSize();
    const uint8_t  *outData = (const uint8_t *)ctd->data();

    //
    //  Create the output file
    //
    size_t bytesWritten;
    UNewDataMemory *pData;
    pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n", 
                         outFileName, u_errorName(status));
        exit(status);
    }


    //  Write the data itself.
    udata_writeBlock(pData, outData, outDataSize);
    // finish up 
    bytesWritten = udata_finish(pData, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
        exit(status);
    }
    
    if (bytesWritten != outDataSize) {
        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
        exit(-1);
    }
    
    // Get rid of the CompactTrieDictionary
    delete ctd;

    u_cleanup();

    printf("genctd: tool completed successfully.\n");
    return 0;

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
}
Ejemplo n.º 19
0
//---------------------------------------------------------------------
//
//   dump    Output the compiled form of the pattern.
//           Debugging function only.
//
//---------------------------------------------------------------------
void   RegexPattern::dumpOp(int32_t index) const {
    (void)index;  // Suppress warnings in non-debug build.
#if defined(REGEX_DEBUG)
    static const char * const opNames[] = {URX_OPCODE_NAMES};
    int32_t op          = fCompiledPat->elementAti(index);
    int32_t val         = URX_VAL(op);
    int32_t type        = URX_TYPE(op);
    int32_t pinnedType  = type;
    if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
        pinnedType = 0;
    }

    printf("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        // Types with no operand field of interest.
        break;

    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
    case URX_BACKSLASH_H:
    case URX_BACKSLASH_R:
    case URX_BACKSLASH_V:
        // types with an integer operand field.
        printf("%d", val);
        break;

    case URX_ONECHAR:
    case URX_ONECHAR_I:
        printf("%c", val<256?val:'?');
        break;

    case URX_STRING:
    case URX_STRING_I:
        {
            int32_t lengthOp       = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            int32_t i;
            for (i=val; i<val+length; i++) {
                UChar c = fLiteralText[i];
                if (c < 32 || c >= 256) {c = '.';}
                printf("%c", c);
            }
        }
        break;

    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            UnicodeString s;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(s, TRUE);
            for (int32_t i=0; i<s.length(); i++) {
                printf("%c", s.charAt(i));
            }
        }
        break;

    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            UnicodeString s;
            if (val & URX_NEG_SET) {
                printf("NOT ");
                val &= ~URX_NEG_SET;
            }
            UnicodeSet *set = fStaticSets[val];
            set->toPattern(s, TRUE);
            for (int32_t i=0; i<s.length(); i++) {
                printf("%c", s.charAt(i));
            }
        }
        break;


    default:
        printf("??????");
        break;
    }
    printf("\n");
#endif
}
//
//  APITest.   Invoke every function at least once, and check that it does something.
//             Does not attempt to check complete functionality.
//
void AlphabeticIndexTest::APITest() {
    //
    //  Simple constructor and destructor,  getBucketCount()
    //
    UErrorCode status = U_ZERO_ERROR;
    int32_t lc = 0;
    int32_t i  = 0;
    AlphabeticIndex *index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    lc = index->getBucketCount(status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(28 == lc);    // 26 letters plus two under/overflow labels.
    //printf("getBucketCount() == %d\n", lc);
    delete index;

    // Constructor from a Collator
    //
    status = U_ZERO_ERROR;
    RuleBasedCollator *coll = dynamic_cast<RuleBasedCollator *>(
        Collator::createInstance(Locale::getGerman(), status));
    TEST_CHECK_STATUS;
    TEST_ASSERT(coll != NULL);
    index = new AlphabeticIndex(coll, status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(coll == &index->getCollator());
    assertEquals("only the underflow label in an index built from a collator",
                 1, index->getBucketCount(status));
    TEST_CHECK_STATUS;
    delete index;
    

    // addLabels()

    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    UnicodeSet additions;
    additions.add((UChar32)0x410).add((UChar32)0x415);   // A couple of Cyrillic letters
    index->addLabels(additions, status);
    TEST_CHECK_STATUS;
    lc = index->getBucketCount(status);
    TEST_CHECK_STATUS;
    assertEquals("underflow, A-Z, inflow, 2 Cyrillic, overflow",
                 31, index->getBucketCount(status));
    // std::cout << lc << std::endl;
    delete index;


    // addLabels(Locale)

    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    AlphabeticIndex &aip = index->addLabels(Locale::getJapanese(), status);
    TEST_ASSERT(&aip == index);
    TEST_CHECK_STATUS;
    lc = index->getBucketCount(status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(35 < lc);  // Japanese should add a bunch.  Don't rely on the exact value.
    delete index;

    // GetCollator(),  Get under/in/over flow labels

    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getGerman(), status);
    TEST_CHECK_STATUS;
    Collator *germanCol = Collator::createInstance(Locale::getGerman(), status);
    TEST_CHECK_STATUS;
    const RuleBasedCollator &indexCol = index->getCollator();
    TEST_ASSERT(*germanCol == indexCol);
    delete germanCol;

    UnicodeString ELLIPSIS;  ELLIPSIS.append((UChar32)0x2026);
    UnicodeString s = index->getUnderflowLabel();
    TEST_ASSERT(ELLIPSIS == s);
    s = index->getOverflowLabel();
    TEST_ASSERT(ELLIPSIS == s);
    s = index->getInflowLabel();
    TEST_ASSERT(ELLIPSIS == s);
    index->setOverflowLabel(UNICODE_STRING_SIMPLE("O"), status);
    index->setUnderflowLabel(UNICODE_STRING_SIMPLE("U"), status).setInflowLabel(UNICODE_STRING_SIMPLE("I"), status);
    s = index->getUnderflowLabel();
    TEST_ASSERT(UNICODE_STRING_SIMPLE("U") == s);
    s = index->getOverflowLabel();
    TEST_ASSERT(UNICODE_STRING_SIMPLE("O") == s);
    s = index->getInflowLabel();
    TEST_ASSERT(UNICODE_STRING_SIMPLE("I") == s);




    delete index;



    const UnicodeString adam = UNICODE_STRING_SIMPLE("Adam");
    const UnicodeString baker = UNICODE_STRING_SIMPLE("Baker");
    const UnicodeString charlie = UNICODE_STRING_SIMPLE("Charlie");
    const UnicodeString chad = UNICODE_STRING_SIMPLE("Chad");
    const UnicodeString zed  = UNICODE_STRING_SIMPLE("Zed");
    const UnicodeString Cyrillic = UNICODE_STRING_SIMPLE("\\u0410\\u0443\\u0435").unescape();

    // addRecord(), verify that it comes back out.
    //
    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    index->addRecord(UnicodeString("Adam"), this, status);
    UBool   b;
    TEST_CHECK_STATUS;
    index->resetBucketIterator(status);
    TEST_CHECK_STATUS;
    index->nextBucket(status);  // Move to underflow label
    index->nextBucket(status);  // Move to "A"
    TEST_CHECK_STATUS;
    const UnicodeString &label2 = index->getBucketLabel();
    UnicodeString A_STR = UNICODE_STRING_SIMPLE("A");
    TEST_ASSERT(A_STR == label2);

    b = index->nextRecord(status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(b);
    const UnicodeString &itemName = index->getRecordName();
    TEST_ASSERT(adam == itemName);

    const void *itemContext = index->getRecordData();
    TEST_ASSERT(itemContext == this);

    delete index;

    // clearRecords, addRecord(), Iteration

    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    while (index->nextBucket(status)) {
        TEST_CHECK_STATUS;
        while (index->nextRecord(status)) {
            TEST_CHECK_STATUS;
            TEST_ASSERT(FALSE);   // No items have been added.
        }
        TEST_CHECK_STATUS;
    }

    index->addRecord(adam, NULL, status);
    index->addRecord(baker, NULL, status);
    index->addRecord(charlie, NULL, status);
    index->addRecord(chad, NULL, status);
    TEST_CHECK_STATUS;
    int itemCount = 0;
    index->resetBucketIterator(status);
    while (index->nextBucket(status)) {
        TEST_CHECK_STATUS;
        while (index->nextRecord(status)) {
            TEST_CHECK_STATUS;
            ++itemCount;
        }
    }
    TEST_CHECK_STATUS;
    TEST_ASSERT(itemCount == 4);

    TEST_ASSERT(index->nextBucket(status) == FALSE);
    index->resetBucketIterator(status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(index->nextBucket(status) == TRUE);

    index->clearRecords(status);
    TEST_CHECK_STATUS;
    index->resetBucketIterator(status);
    while (index->nextBucket(status)) {
        TEST_CHECK_STATUS;
        while (index->nextRecord(status)) {
            TEST_ASSERT(FALSE);   // No items have been added.
        }
    }
    TEST_CHECK_STATUS;
    delete index;

    // getBucketLabel(), getBucketType()

    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    index->setUnderflowLabel(adam, status).setOverflowLabel(charlie, status);
    TEST_CHECK_STATUS;
    for (i=0; index->nextBucket(status); i++) {
        TEST_CHECK_STATUS;
        UnicodeString label = index->getBucketLabel();
        UAlphabeticIndexLabelType type = index->getBucketLabelType();
        if (i == 0) {
            TEST_ASSERT(type == U_ALPHAINDEX_UNDERFLOW);
            TEST_ASSERT(label == adam);
        } else if (i <= 26) {
            // Labels A - Z for English locale
            TEST_ASSERT(type == U_ALPHAINDEX_NORMAL);
            UnicodeString expectedLabel((UChar)(0x40 + i));
            TEST_ASSERT(expectedLabel == label);
        } else if (i == 27) {
            TEST_ASSERT(type == U_ALPHAINDEX_OVERFLOW);
            TEST_ASSERT(label == charlie);
        } else {
            TEST_ASSERT(FALSE);
        }
    }
    TEST_ASSERT(i==28);
    delete index;

    // getBucketIndex()

    status = U_ZERO_ERROR;
    index = new AlphabeticIndex(Locale::getEnglish(), status);
    TEST_CHECK_STATUS;
    int32_t n = index->getBucketIndex(adam, status);
    TEST_CHECK_STATUS;
    TEST_ASSERT(n == 1);    /*  Label #0 is underflow, 1 is A, etc. */
    n = index->getBucketIndex(baker, status);
    TEST_ASSERT(n == 2);
    n = index->getBucketIndex(Cyrillic, status);
    TEST_ASSERT(n == 27);   // Overflow label
    n = index->getBucketIndex(zed, status);
    TEST_ASSERT(n == 26);

    for (i=0; index->nextBucket(status); i++) {
        n = index->getBucketIndex();
        TEST_ASSERT(n == i);
        UnicodeString label = index->getBucketLabel();
        TEST_ASSERT(n == i);
    }
    TEST_ASSERT(i == 28);

    delete index;
    index = new AlphabeticIndex(Locale::createFromName("ru"), status);
    TEST_CHECK_STATUS;
    assertEquals("Russian index.getBucketCount()", 32, index->getBucketCount(status));
    // Latin-script names should go into the underflow label (0)
    // if the Russian collation does not use script reordering,
    // but into the overflow label (getBucketCount()-1)
    // if Russian sorts Cyrillic first.
    int32_t reorderCodes[20];
    int32_t expectedLatinIndex = 0;
    if (index->getCollator().getReorderCodes(reorderCodes, LENGTHOF(reorderCodes), status) > 0) {
        expectedLatinIndex = index->getBucketCount(status) - 1;
    }
    n = index->getBucketIndex(adam, status);
    TEST_CHECK_STATUS;
    assertEquals("Russian index.getBucketIndex(adam)", expectedLatinIndex, n);
    n = index->getBucketIndex(baker, status);
    assertEquals("Russian index.getBucketIndex(baker)", expectedLatinIndex, n);
    n = index->getBucketIndex(Cyrillic, status);
    assertEquals("Russian index.getBucketIndex(Cyrillic)", 1, n);
    n = index->getBucketIndex(zed, status);
    assertEquals("Russian index.getBucketIndex(zed)", expectedLatinIndex, n);

    delete index;

}
Ejemplo n.º 21
0
void CanonicalIteratorTest::TestBasic() {

    UErrorCode status = U_ZERO_ERROR;

    static const char * const testArray[][2] = {
        {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
            "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
            "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
            "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
        {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
        {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
    };
    
#if 0
    // This is not interesting for C/C++ as the data is already built beforehand
    // check build
    UnicodeSet ss = CanonicalIterator.getSafeStart();
    logln("Safe Start: " + ss.toPattern(true));
    ss = CanonicalIterator.getStarts('a');
    expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
        new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
        + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
            );
#endif

    // check permute
    // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!

    Hashtable *permutations = new Hashtable(FALSE, status);
    permutations->setValueDeleter(uhash_deleteUnicodeString);
    UnicodeString toPermute("ABC");

    CanonicalIterator::permute(toPermute, FALSE, permutations, status);

    logln("testing permutation");
  
    expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");

    delete permutations;
    
    // try samples
    logln("testing samples");
    Hashtable *set = new Hashtable(FALSE, status);
    set->setValueDeleter(uhash_deleteUnicodeString);
    int32_t i = 0;
    CanonicalIterator it("", status);
    if(U_SUCCESS(status)) {
      for (i = 0; i < ARRAY_LENGTH(testArray); ++i) {
          //logln("Results for: " + name.transliterate(testArray[i]));
          UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
          it.setSource(testStr, status);
          set->removeAll();
          for (;;) {
              //UnicodeString *result = new UnicodeString(it.next());
              UnicodeString result(it.next());
              if (result.isBogus()) {
                  break;
              }
              set->put(result, new UnicodeString(result), status); // Add result to the table
              //logln(++counter + ": " + hex.transliterate(result));
              //logln(" = " + name.transliterate(result));
          }
          expectEqual(i + ": ", testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));

      }
    } else {
      errln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
    }
    delete set;
}
Ejemplo n.º 22
0
/*
 * Find missing case mapping relationships and add mappings for case closure.
 * This function starts from an "original" code point and recursively
 * finds its case mappings and the case mappings of where it maps to.
 *
 * The recursion depth is capped at 3 nested calls of this function.
 * In each call, the current code point is c, and the function enumerates
 * all of c's simple (single-code point) case mappings.
 * prev is the code point that case-mapped to c.
 * prev2 is the code point that case-mapped to prev.
 *
 * The initial function call has prev2<0, prev<0, and c==orig
 * (marking no code points).
 * It enumerates c's case mappings and recurses without further action.
 *
 * The second-level function call has prev2<0, prev==orig, and c is
 * the destination code point of one of prev's case mappings.
 * The function checks if any of c's case mappings go back to orig
 * and adds a closure mapping if not.
 * In other words, it turns a case mapping relationship of
 *   orig->c
 * into
 *   orig<->c
 *
 * The third-level function call has prev2==orig, prev>=0, and c is
 * the destination code point of one of prev's case mappings.
 * (And prev is the destination of one of prev2's case mappings.)
 * The function checks if any of c's case mappings go back to orig
 * and adds a closure mapping if not.
 * In other words, it turns case mapping relationships of
 *   orig->prev->c or orig->prev<->c
 * into
 *   orig->prev->c->orig or orig->prev<->c->orig
 * etc.
 * (Graphically, this closes a triangle.)
 *
 * With repeated application on all code points until no more closure mappings
 * are added, all case equivalence groups get complete mappings.
 * That is, in each group of code points with case relationships
 * each code point will in the end have some mapping to each other
 * code point in the group.
 *
 * @return TRUE if a closure mapping was added
 */
UBool
CasePropsBuilder::addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value,
                             UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }

    UChar32 next;
    UBool someMappingsAdded=FALSE;

    if(c!=orig) {
        /* get the properties for c */
        value=utrie2_get32(pTrie, c);
    }
    /* else if c==orig then c's value was passed in */

    if(value&UCASE_EXCEPTION) {
        UnicodeSet set;

        ExcProps &ep=*excProps[value>>UGENCASE_EXC_SHIFT];
        UniProps &p=ep.props;

        /*
         * marker for whether any of c's mappings goes to orig
         * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
         */
        UBool mapsToOrig=(UBool)(c==orig);

        /* collect c's case mapping destinations in set[] */
        if((next=p.suc)>=0 && next!=c) {
            set.add(next);
        }
        if((next=p.slc)>=0 && next!=c) {
            set.add(next);
        }
        if(p.suc!=(next=p.stc) && next!=c) {
            set.add(next);
        }
        if((next=p.scf)>=0 && next!=c) {
            set.add(next);
        }

        /* add c's current closure mappings to set */
        set.addAll(ep.closure);

        /* process all code points to which c case-maps */
        UnicodeSetIterator iter(set);
        while(iter.next()) {
            next=iter.getCodepoint(); /* next!=c */

            if(next==orig) {
                mapsToOrig=TRUE; /* remember that we map to orig */
            } else if(prev2<0 && next!=prev) {
                /*
                 * recurse unless
                 * we have reached maximum depth (prev2>=0) or
                 * this is a mapping to one of the previous code points (orig, prev, c)
                 */
                someMappingsAdded|=addClosure(orig, prev, c, next, 0, errorCode);
            }
        }

        if(!mapsToOrig) {
            addClosureMapping(c, orig, errorCode);
            return TRUE;
        }
    } else {
Ejemplo n.º 23
0
void
CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode) || newValues.containsNone(relevantProps)) { return; }

    UChar32 start=props.start;
    UChar32 end=props.end;

    /* default: map to self */
    int32_t delta=0;

    uint32_t type;
    if(props.binProps[UCHAR_LOWERCASE]) {
        type=UCASE_LOWER;
    } else if(props.binProps[UCHAR_UPPERCASE]) {
        type=UCASE_UPPER;
    } else if(props.getIntProp(UCHAR_GENERAL_CATEGORY)==U_TITLECASE_LETTER) {
        type=UCASE_TITLE;
    } else {
        type=UCASE_NONE;
    }
    uint32_t value=type;

    UBool hasMapping=FALSE;
    if(props.suc>=0) {
        /* uppercase mapping as delta if the character is lowercase */
        hasMapping=TRUE;
        if(type==UCASE_LOWER) {
            delta=props.suc-start;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(props.slc>=0) {
        /* lowercase mapping as delta if the character is uppercase or titlecase */
        hasMapping=TRUE;
        if(type>=UCASE_UPPER) {
            delta=props.slc-start;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(props.stc>=0) {
        hasMapping=TRUE;
    }
    if(props.suc!=props.stc) {
        value|=UCASE_EXCEPTION;
    }
    if(!props.lc.isEmpty() || !props.uc.isEmpty() || !props.tc.isEmpty() ||
        newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS)
    ) {
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }
    if( (props.scf>=0 && props.scf!=props.slc) ||
        (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) ||
        newValues.contains(PPUCD_TURKIC_CASE_FOLDING)
    ) {
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }

    // Simple case folding falls back to simple lowercasing.
    // If there is no case folding but there is a lowercase mapping,
    // then add a case folding mapping to the code point.
    // For example: Cherokee uppercase syllables since Unicode 8.
    // (Full case folding falls back to simple case folding,
    // not to full lowercasing, so we need not also handle it specially
    // for such cases.)
    UChar32 scf=props.scf;
    if(scf<0 && props.slc>=0) {
        scf=start;
        hasMapping=TRUE;
        value|=UCASE_EXCEPTION;
    }

    if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
        value|=UCASE_EXCEPTION;
    }

    if(props.binProps[UCHAR_SOFT_DOTTED]) {
        value|=UCASE_SOFT_DOTTED;
    }
    int32_t cc=props.getIntProp(UCHAR_CANONICAL_COMBINING_CLASS);
    if(cc!=0) {
        if(props.binProps[UCHAR_SOFT_DOTTED]) {
            fprintf(stderr, "genprops error: a soft-dotted character has ccc!=0\n");
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        if(cc==230) {
            value|=UCASE_ABOVE;
        } else {
            value|=UCASE_OTHER_ACCENT;
        }
    }

    if(props.binProps[UCHAR_CASE_IGNORABLE]) {
        value|=UCASE_IGNORABLE;
    }

    if((hasMapping || (value&UCASE_EXCEPTION)) && start!=end) {
        fprintf(stderr,
                "genprops error: range %04lX..%04lX has case mappings "
                "or reasons for data structure exceptions\n",
                (long)start, (long)end);
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* handle exceptions */
    if(value&UCASE_EXCEPTION) {
        /* simply store exceptions for later processing and encoding */
        if(excPropsCount==MAX_EXC_COUNT) {
            fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions\n");
            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return;
        }
        ExcProps *newExcProps=new ExcProps(props);
        if(newExcProps==NULL) {
            fprintf(stderr,
                    "genprops error: casepropsbuilder out of memory allocating "
                    "exceptions properties\n");
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        newExcProps->props.scf=scf;
        newExcProps->hasConditionalCaseMappings=newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS);
        newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING);
        value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
        excProps[excPropsCount++]=newExcProps;
    } else {
        /* store the simple case mapping delta */
        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
    }

    utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: unable to set case mapping values: %s\n",
                u_errorName(errorCode));
        return;
    }

    if(hasMapping) {
        /* update the case-sensitive set */
        caseSensitive.add(start);
        if(scf>=0) { caseSensitive.add(scf); }
        if(props.slc>=0) { caseSensitive.add(props.slc); }
        if(props.suc>=0) { caseSensitive.add(props.suc); }
        if(props.stc>=0) { caseSensitive.add(props.stc); }
        caseSensitive.addAll(props.cf);
        caseSensitive.addAll(props.lc);
        caseSensitive.addAll(props.uc);
        caseSensitive.addAll(props.tc);

        /* update maxFullLength */
        if(props.cf.length()>maxFullLength) { maxFullLength=props.cf.length(); }
        if(props.lc.length()>maxFullLength) { maxFullLength=props.lc.length(); }
        if(props.uc.length()>maxFullLength) { maxFullLength=props.uc.length(); }
        if(props.tc.length()>maxFullLength) { maxFullLength=props.tc.length(); }
    }

    /* add the multi-character case folding to the "unfold" data */
    if(props.cf.hasMoreChar32Than(0, 0x7fffffff, 1)) {
        addUnfolding(start, props.cf, errorCode);
    }
}
Ejemplo n.º 24
0
U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
             const UChar *text, int32_t length,
             int32_t *position,
             UErrorCode *status) {
             
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }
    if (length < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if (length == -1) {
        // It's not worth the bother to handle nul terminated strings everywhere.
        //   Just get the length and be done with it.
        length = u_strlen(text);
    }

    int32_t result = 0;
    int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?

    // A count of the number of non-Common or inherited scripts.
    // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
    // Share the computation when possible.  scriptCount == -1 means that we haven't
    // done it yet.
    int32_t scriptCount = -1;

    if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
        scriptCount = This->scriptScan(text, length, failPos, *status);
        // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
        if ( scriptCount >= 2) {
            // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
            result |= USPOOF_SINGLE_SCRIPT;
        }
    }

    if (This->fChecks & USPOOF_CHAR_LIMIT) {
        int32_t i;
        UChar32 c;
        for (i=0; i<length ;) {
            U16_NEXT(text, i, length, c);
            if (!This->fAllowedCharsSet->contains(c)) {
                result |= USPOOF_CHAR_LIMIT;
                if (i < failPos) {
                    failPos = i;
                }
                break;
            }
        }
    }

    if (This->fChecks & 
        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // These are the checks that need to be done on NFD input
        NFDBuffer   normalizedInput(text, length, *status);
        const UChar  *nfdText = normalizedInput.getBuffer();
        int32_t      nfdLength = normalizedInput.getLength();

        if (This->fChecks & USPOOF_INVISIBLE) {
           
            // scan for more than one occurence of the same non-spacing mark
            // in a sequence of non-spacing marks.
            int32_t     i;
            UChar32     c;
            UChar32     firstNonspacingMark = 0;
            UBool       haveMultipleMarks = FALSE;  
            UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
            
            for (i=0; i<nfdLength ;) {
                U16_NEXT(nfdText, i, nfdLength, c);
                if (u_charType(c) != U_NON_SPACING_MARK) {
                    firstNonspacingMark = 0;
                    if (haveMultipleMarks) {
                        marksSeenSoFar.clear();
                        haveMultipleMarks = FALSE;
                    }
                    continue;
                }
                if (firstNonspacingMark == 0) {
                    firstNonspacingMark = c;
                    continue;
                }
                if (!haveMultipleMarks) {
                    marksSeenSoFar.add(firstNonspacingMark);
                    haveMultipleMarks = TRUE;
                }
                if (marksSeenSoFar.contains(c)) {
                    // report the error, and stop scanning.
                    // No need to find more than the first failure.
                    result |= USPOOF_INVISIBLE;
                    failPos = i;
                    // TODO: Bug 8655: failPos is the position in the NFD buffer, but what we want
                    //       to give back to our caller is a position in the original input string.
                    if (failPos > length) {
                        failPos = length;
                    }
                    break;
                }
                marksSeenSoFar.add(c);
            }
        }
       
        
        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // The basic test is the same for both whole and mixed script confusables.
            // Compute the set of scripts that every input character has a confusable in.
            // For this computation an input character is always considered to be
            //    confusable with itself in its own script.
            // If the number of such scripts is two or more, and the input consisted of
            //   characters all from a single script, we have a whole script confusable.
            //   (The two scripts will be the original script and the one that is confusable)
            // If the number of such scripts >= one, and the original input contained characters from
            //   more than one script, we have a mixed script confusable.  (We can transform
            //   some of the characters, and end up with a visually similar string all in
            //   one script.)

            if (scriptCount == -1) {
                int32_t t;
                scriptCount = This->scriptScan(text, length, t, *status);
            }
            
            ScriptSet scripts;
            This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
            int32_t confusableScriptCount = scripts.countMembers();
            //printf("confusableScriptCount = %d\n", confusableScriptCount);
            
            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2 &&
                scriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }
        
            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1 &&
                scriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }
    if (position != NULL && failPos != 0x7fffffff) {
        *position = failPos;
    }
    return result;
}
Ejemplo n.º 25
0
 /* Try to have the compiler inline these*/
 inline static int32_t getStringCount(const UnicodeSet& set) {
     return set.getStringCount();
 }
Ejemplo n.º 26
0
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
    if (U_FAILURE(status)) { return; }
    // Chinese index characters, which are specific to each of the several Chinese tailorings,
    // take precedence over the single locale data exemplar set per language.
    const char *language = locale.getLanguage();
    if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
            uprv_strcmp(language, "ko") == 0) {
        // TODO: This should be done regardless of the language, but it's expensive.
        // We should add a Collator function (can be @internal)
        // to enumerate just the contractions that start with a given code point or string.
        if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
            return;
        }
    }

    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet exemplars;
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
    if (U_SUCCESS(status)) {
        initialLabels_->addAll(exemplars);
        return;
    }
    status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR

    // The locale data did not include explicit Index characters.
    // Synthesize a set of them from the locale's standard exemplar characters.
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
    if (U_FAILURE(status)) {
        return;
    }

    // question: should we add auxiliary exemplars?
    if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
        exemplars.add(0x61, 0x7A);
    }
    if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
        // cut down to small list
        exemplars.remove(0xAC00, 0xD7A3).
            add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
            add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
            add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
            add(0xD30C).add(0xD558);
    }
    if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
        // cut down to small list
        // make use of the fact that Ethiopic is allocated in 8's, where
        // the base is 0 mod 8.
        UnicodeSet ethiopic(
            UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
        UnicodeSetIterator it(ethiopic);
        while (it.next() && !it.isString()) {
            if ((it.getCodepoint() & 0x7) != 0) {
                exemplars.remove(it.getCodepoint());
            }
        }
    }

    // Upper-case any that aren't already so.
    //   (We only do this for synthesized index characters.)
    UnicodeSetIterator it(exemplars);
    UnicodeString upperC;
    while (it.next()) {
        const UnicodeString &exemplarC = it.getString();
        upperC = exemplarC;
        upperC.toUpper(locale);
        initialLabels_->add(upperC);
    }
}
Ejemplo n.º 27
0
//---------------------------------------------------------------------------------
//
//  Parse RBBI rules.   The state machine for rules parsing is here.
//                      The state tables are hand-written in the file rbbirpt.txt,
//                      and converted to the form used here by a perl
//                      script rbbicst.pl
//
//---------------------------------------------------------------------------------
void RBBIRuleScanner::parse() {
    uint16_t                state;
    const RBBIRuleTableEl  *tableEl;

    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }

    state = 1;
    nextChar(fC);
    //
    // Main loop for the rule parsing state machine.
    //   Runs once per state transition.
    //   Each time through optionally performs, depending on the state table,
    //      - an advance to the the next input char
    //      - an action to be performed.
    //      - pushing or popping a state to/from the local state return stack.
    //
    for (;;) {
        //  Bail out if anything has gone wrong.
        //  RBBI rule file parsing stops on the first error encountered.
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }

        // Quit if state == 0.  This is the normal way to exit the state machine.
        //
        if (state == 0) {
            break;
        }

        // Find the state table element that matches the input char from the rule, or the
        //    class of the input character.  Start with the first table row for this
        //    state, then linearly scan forward until we find a row that matches the
        //    character.  The last row for each state always matches all characters, so
        //    the search will stop there, if not before.
        //
        tableEl = &gRuleParseStateTable[state];
#ifdef RBBI_DEBUG
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
            RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",
                            fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
        }
#endif

        for (;;) {
#ifdef RBBI_DEBUG
            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
                RBBIDebugPrintf(".");
            }
#endif
            if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE &&   tableEl->fCharClass == fC.fChar) {
                // Table row specified an individual character, not a set, and
                //   the input character is not escaped, and
                //   the input character matched it.
                break;
            }
            if (tableEl->fCharClass == 255) {
                // Table row specified default, match anything character class.
                break;
            }
            if (tableEl->fCharClass == 254 && fC.fEscaped)  {
                // Table row specified "escaped" and the char was escaped.
                break;
            }
            if (tableEl->fCharClass == 253 && fC.fEscaped &&
                    (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
                // Table row specified "escaped P" and the char is either 'p' or 'P'.
                break;
            }
            if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  {
                // Table row specified eof and we hit eof on the input.
                break;
            }

            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
                    fC.fEscaped == FALSE &&                                      //   char is not escaped &&
                    fC.fChar != (UChar32)-1) {                                   //   char is not EOF
                UnicodeSet *uniset = fRuleSets[tableEl->fCharClass-128];
                if (uniset->contains(fC.fChar)) {
                    // Table row specified a character class, or set of characters,
                    //   and the current char matches it.
                    break;
                }
            }

            // No match on this row, advance to the next  row for this state,
            tableEl++;
        }
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
            RBBIDebugPuts("");
        }

        //
        // We've found the row of the state table that matches the current input
        //   character from the rules string.
        // Perform any action specified  by this row in the state table.
        if (doParseActions((EParseAction)tableEl->fAction) == FALSE) {
            // Break out of the state machine loop if the
            //   the action signalled some kind of error, or
            //   the action was to exit, occurs on normal end-of-rules-input.
            break;
        }

        if (tableEl->fPushState != 0) {
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                error(U_BRK_INTERNAL_ERROR);
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
                fStackPtr--;
            }
            fStack[fStackPtr] = tableEl->fPushState;
        }

        if (tableEl->fNextChar) {
            nextChar(fC);
        }

        // Get the next state from the table entry, or from the
        //   state stack if the next state was specified as "pop".
        if (tableEl->fNextState != 255) {
            state = tableEl->fNextState;
        } else {
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0) {
                error(U_BRK_INTERNAL_ERROR);
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
                fStackPtr++;
            }
        }

    }

    //
    // If there were NO user specified reverse rules, set up the equivalent of ".*;"
    //
    if (fRB->fReverseTree == NULL) {
        fRB->fReverseTree  = pushNewNode(RBBINode::opStar);
        RBBINode  *operand = pushNewNode(RBBINode::setRef);
        findSetFor(kAny, operand);
        fRB->fReverseTree->fLeftChild = operand;
        operand->fParent              = fRB->fReverseTree;
        fNodeStackPtr -= 2;
    }


    //
    // Parsing of the input RBBI rules is complete.
    // We now have a parse tree for the rule expressions
    // and a list of all UnicodeSets that are referenced.
    //
#ifdef RBBI_DEBUG
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {
        fSymbolTable->rbbiSymtablePrint();
    }
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree"))
    {
        RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
        fRB->fForwardTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
        fRB->fReverseTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
        fRB->fSafeFwdTree->printTree(TRUE);
        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
        fRB->fSafeRevTree->printTree(TRUE);
    }
#endif
}
//------------------------------------------------------------------------
//
//   build          Build the list of non-overlapping character ranges
//                  from the Unicode Sets.
//
//------------------------------------------------------------------------
void RBBISetBuilder::build() {
    RBBINode        *usetNode;
    RangeDescriptor *rlRange;

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}

    //
    //  Initialize the process by creating a single range encompassing all characters
    //  that is in no sets.
    //
    fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
    fRangeList->fStartChar    = 0;
    fRangeList->fEndChar      = 0x10ffff;

    if (U_FAILURE(*fStatus)) {
        return;
    }

    //
    //  Find the set of non-overlapping ranges of characters
    //
    int  ni;
    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
        if (usetNode==NULL) {
            break;
        }

        UnicodeSet      *inputSet             = usetNode->fInputSet;
        int32_t          inputSetRangeCount   = inputSet->getRangeCount();
        int              inputSetRangeIndex   = 0;
                         rlRange              = fRangeList;

        for (;;) {
            if (inputSetRangeIndex >= inputSetRangeCount) {
                break;
            }
            UChar32      inputSetRangeBegin  = inputSet->getRangeStart(inputSetRangeIndex);
            UChar32      inputSetRangeEnd    = inputSet->getRangeEnd(inputSetRangeIndex);

            // skip over ranges from the range list that are completely
            //   below the current range from the input unicode set.
            while (rlRange->fEndChar < inputSetRangeBegin) {
                rlRange = rlRange->fNext;
            }

            // If the start of the range from the range list is before with
            //   the start of the range from the unicode set, split the range list range
            //   in two, with one part being before (wholly outside of) the unicode set
            //   and the other containing the rest.
            //   Then continue the loop; the post-split current range will then be skipped
            //     over
            if (rlRange->fStartChar < inputSetRangeBegin) {
                rlRange->split(inputSetRangeBegin, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
                continue;
            }

            // Same thing at the end of the ranges...
            // If the end of the range from the range list doesn't coincide with
            //   the end of the range from the unicode set, split the range list
            //   range in two.  The first part of the split range will be
            //   wholly inside the Unicode set.
            if (rlRange->fEndChar > inputSetRangeEnd) {
                rlRange->split(inputSetRangeEnd+1, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // The current rlRange is now entirely within the UnicodeSet range.
            // Add this unicode set to the list of sets for this rlRange
            if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
                rlRange->fIncludesSets->addElement(usetNode, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // Advance over ranges that we are finished with.
            if (inputSetRangeEnd == rlRange->fEndChar) {
                inputSetRangeIndex++;
            }
            rlRange = rlRange->fNext;
        }
    }

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}

    //
    //  Group the above ranges, with each group consisting of one or more
    //    ranges that are in exactly the same set of original UnicodeSets.
    //    The groups are numbered, and these group numbers are the set of
    //    input symbols recognized by the run-time state machine.
    //
    //    Numbering: # 0  (state table column 0) is unused.
    //               # 1  is reserved - table column 1 is for end-of-input
    //               # 2  is reserved - table column 2 is for beginning-in-input
    //               # 3  is the first range list.
    //
    RangeDescriptor *rlSearchRange;
    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
            if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
                rlRange->fNum = rlSearchRange->fNum;
                break;
            }
        }
        if (rlRange->fNum == 0) {
            fGroupCount ++;
            rlRange->fNum = fGroupCount+2; 
            rlRange->setDictionaryFlag();
            addValToSets(rlRange->fIncludesSets, fGroupCount+2);
        }
    }

    // Handle input sets that contain the special string {eof}.
    //   Column 1 of the state table is reserved for EOF on input.
    //   Column 2 is reserved for before-the-start-input.
    //            (This column can be optimized away later if there are no rule
    //             references to {bof}.)
    //   Add this column value (1 or 2) to the equivalent expression
    //     subtree for each UnicodeSet that contains the string {eof}
    //   Because {bof} and {eof} are not a characters in the normal sense,
    //   they doesn't affect the computation of ranges or TRIE.
    static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
    static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};

    UnicodeString eofString(eofUString);
    UnicodeString bofString(bofUString);
    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
        if (usetNode==NULL) {
            break;
        }
        UnicodeSet      *inputSet = usetNode->fInputSet;
        if (inputSet->contains(eofString)) {
            addValToSet(usetNode, 1);
        }
        if (inputSet->contains(bofString)) {
            addValToSet(usetNode, 2);
            fSawBOF = TRUE;
        }
    }


    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}

    //
    // Build the Trie table for mapping UChar32 values to the corresponding
    //   range group number
    //
    fTrie = utrie_open(NULL,    //  Pre-existing trie to be filled in
                      NULL,    //  Data array  (utrie will allocate one)
                      100000,  //  Max Data Length
                      0,       //  Initial value for all code points
                      0,       //  Lead surrogate unit value
                      TRUE);   //  Keep Latin 1 in separately


    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
    }
}
Ejemplo n.º 29
0
// Use LocalXyzPointer types that are not covered elsewhere in the intltest suite.
void LocalPointerTest::TestLocalXyzPointer() {
    IcuTestErrorCode errorCode(*this, "TestLocalXyzPointer");

    static const char *const encoding="ISO-8859-1";
    LocalUConverterSelectorPointer sel(
        ucnvsel_open(&encoding, 1, NULL, UCNV_ROUNDTRIP_SET, errorCode));
    if(errorCode.logIfFailureAndReset("ucnvsel_open()")) {
        return;
    }
    if(sel.isNull()) {
        errln("LocalUConverterSelectorPointer failure");
        return;
    }

#if !UCONFIG_NO_FORMATTING
    LocalUCalendarPointer cal(ucal_open(NULL, 0, "root", UCAL_GREGORIAN, errorCode));
    if(errorCode.logDataIfFailureAndReset("ucal_open()")) {
        return;
    }
    if(cal.isNull()) {
        errln("LocalUCalendarPointer failure");
        return;
    }

    LocalUDateTimePatternGeneratorPointer patgen(udatpg_open("root", errorCode));
    if(errorCode.logDataIfFailureAndReset("udatpg_open()")) {
        return;
    }
    if(patgen.isNull()) {
        errln("LocalUDateTimePatternGeneratorPointer failure");
        return;
    }

    LocalULocaleDisplayNamesPointer ldn(uldn_open("de-CH", ULDN_STANDARD_NAMES, errorCode));
    if(errorCode.logIfFailureAndReset("uldn_open()")) {
        return;
    }
    if(ldn.isNull()) {
        errln("LocalULocaleDisplayNamesPointer failure");
        return;
    }

    UnicodeString hello=UNICODE_STRING_SIMPLE("Hello {0}!");
    LocalUMessageFormatPointer msg(
        umsg_open(hello.getBuffer(), hello.length(), "root", NULL, errorCode));
    if(errorCode.logIfFailureAndReset("umsg_open()")) {
        return;
    }
    if(msg.isNull()) {
        errln("LocalUMessageFormatPointer failure");
        return;
    }
#endif  /* UCONFIG_NO_FORMATTING  */

#if !UCONFIG_NO_NORMALIZATION
    const UNormalizer2 *nfc=unorm2_getNFCInstance(errorCode);
    UnicodeSet emptySet;
    LocalUNormalizer2Pointer fn2(unorm2_openFiltered(nfc, emptySet.toUSet(), errorCode));
    if(errorCode.logIfFailureAndReset("unorm2_openFiltered()")) {
        return;
    }
    if(fn2.isNull()) {
        errln("LocalUNormalizer2Pointer failure");
        return;
    }
#endif /* !UCONFIG_NO_NORMALIZATION */

#if !UCONFIG_NO_IDNA
    LocalUIDNAPointer idna(uidna_openUTS46(0, errorCode));
    if(errorCode.logIfFailureAndReset("uidna_openUTS46()")) {
        return;
    }
    if(idna.isNull()) {
        errln("LocalUIDNAPointer failure");
        return;
    }
#endif  /* !UCONFIG_NO_IDNA */

#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    UnicodeString pattern=UNICODE_STRING_SIMPLE("abc|xy+z");
    LocalURegularExpressionPointer regex(
        uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, errorCode));
    if(errorCode.logIfFailureAndReset("uregex_open()")) {
        return;
    }
    if(regex.isNull()) {
        errln("LocalURegularExpressionPointer failure");
        return;
    }
#endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */

#if !UCONFIG_NO_TRANSLITERATION
    UnicodeString id=UNICODE_STRING_SIMPLE("Grek-Latn");
    LocalUTransliteratorPointer trans(
        utrans_openU(id.getBuffer(), id.length(), UTRANS_FORWARD, NULL, 0, NULL, errorCode));
    if(errorCode.logIfFailureAndReset("utrans_open()")) {
        return;
    }
    if(trans.isNull()) {
        errln("LocalUTransliteratorPointer failure");
        return;
    }
#endif /* !UCONFIG_NO_TRANSLITERATION */

    // destructors
}
Ejemplo n.º 30
0
//------------------------------------------------------------------------
//
//   build          Build the list of non-overlapping character ranges
//                  from the Unicode Sets.
//
//------------------------------------------------------------------------
void RBBISetBuilder::build() {
    RBBINode        *usetNode;
    RangeDescriptor *rlRange;

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {
        printSets();
    }

    //
    //  Initialize the process by creating a single range encompassing all characters
    //  that is in no sets.
    //
    fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
    fRangeList->fStartChar    = 0;
    fRangeList->fEndChar      = 0x10ffff;

    if (U_FAILURE(*fStatus)) {
        return;
    }

    //
    //  Find the set of non-overlapping ranges of characters
    //
    int  ni;
    for (ni=0; ; ni++) {
        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
        if (usetNode==NULL) {
            break;
        }

        UnicodeSet      *inputSet             = usetNode->fInputSet;
        int32_t          inputSetRangeCount   = inputSet->getRangeCount();
        int              inputSetRangeIndex   = 0;
        rlRange              = fRangeList;

        for (;;) {
            if (inputSetRangeIndex >= inputSetRangeCount) {
                break;
            }
            UChar32      inputSetRangeBegin  = inputSet->getRangeStart(inputSetRangeIndex);
            UChar32      inputSetRangeEnd    = inputSet->getRangeEnd(inputSetRangeIndex);

            // skip over ranges from the range list that are completely
            //   below the current range from the input unicode set.
            while (rlRange->fEndChar < inputSetRangeBegin) {
                rlRange = rlRange->fNext;
            }

            // If the start of the range from the range list is before with
            //   the start of the range from the unicode set, split the range list range
            //   in two, with one part being before (wholly outside of) the unicode set
            //   and the other containing the rest.
            //   Then continue the loop; the post-split current range will then be skipped
            //     over
            if (rlRange->fStartChar < inputSetRangeBegin) {
                rlRange->split(inputSetRangeBegin, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
                continue;
            }

            // Same thing at the end of the ranges...
            // If the end of the range from the range list doesn't coincide with
            //   the end of the range from the unicode set, split the range list
            //   range in two.  The first part of the split range will be
            //   wholly inside the Unicode set.
            if (rlRange->fEndChar > inputSetRangeEnd) {
                rlRange->split(inputSetRangeEnd+1, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // The current rlRange is now entirely within the UnicodeSet range.
            // Add this unicode set to the list of sets for this rlRange
            if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
                rlRange->fIncludesSets->addElement(usetNode, *fStatus);
                if (U_FAILURE(*fStatus)) {
                    return;
                }
            }

            // Advance over ranges that we are finished with.
            if (inputSetRangeEnd == rlRange->fEndChar) {
                inputSetRangeIndex++;
            }
            rlRange = rlRange->fNext;
        }
    }

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) {
        printRanges();
    }

    //
    //  Group the above ranges, with each group consisting of one or more
    //    ranges that are in exactly the same set of original UnicodeSets.
    //    The groups are numbered, and these group numbers are the set of
    //    input symbols recognized by the run-time state machine.
    //
    RangeDescriptor *rlSearchRange;
    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
            if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
                rlRange->fNum = rlSearchRange->fNum;
                break;
            }
        }
        if (rlRange->fNum == 0) {
            fGroupCount ++;
            rlRange->fNum = fGroupCount;
            rlRange->setDictionaryFlag();
            addValToSets(rlRange->fIncludesSets, fGroupCount);
        }
    }

    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {
        printRangeGroups();
    }
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {
        printSets();
    }

    //
    // Build the Trie table for mapping UChar32 values to the corresponding
    //   range group number
    //
    fTrie = utrie_open(NULL,    //  Pre-existing trie to be filled in
                       NULL,    //  Data array  (utrie will allocate one)
                       100000,  //  Max Data Length
                       0,       //  Initial value for all code points
                       0,       //  Lead surrogate unit value
                       TRUE);   //  Keep Latin 1 in separately


    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
        utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
    }
}