Beispiel #1
0
// Walks `trie` along the code points read from `text`, mapping each one
// through thaiCharToByte() before handing it to the trie.
// Every time the trie reports a value, the number of code points consumed so
// far is stored in `lengths` (at most `limit` entries); `count` receives the
// number of entries stored.
// Reading stops on a trie mismatch, on a final value, when `textLimit` code
// points have been consumed (the first code point is always read), or when
// utext_next32() returns a negative value.
// Returns the number of code points consumed, or 0 (with `count` untouched)
// if the very first read already fails.
static int32_t
bytesTrieMatches(BytesTrie &trie,
                 UText *text, int32_t textLimit,
                 int32_t *lengths, int &count, int limit ) {
    UChar32 first=utext_next32(text);
    if(first<0) {
        return 0;
    }
    UStringTrieResult state=trie.first(thaiCharToByte(first));
    int32_t consumed=1;
    count=0;
    while(state!=USTRINGTRIE_NO_MATCH) {
        if(USTRINGTRIE_HAS_VALUE(state)) {
            if(count<limit) {
                // Like CompactTrieDictionary, record a character count
                // rather than the native index of the UText.
                lengths[count]=consumed;
                ++count;
            }
            if(state==USTRINGTRIE_FINAL_VALUE) {
                // No longer word can follow a final value.
                break;
            }
        }
        if(consumed>=textLimit) {
            break;
        }
        UChar32 following=utext_next32(text);
        if(following<0) {
            break;
        }
        ++consumed;
        state=trie.next(thaiCharToByte(following));
    }
    return consumed;
}
Beispiel #2
0
// Closely imitates CompactTrieDictionary::matches().
// Note: CompactTrieDictionary::matches() is part of its trie implementation,
// so while it loops over the text it knows the current state.
// This implementation, by contrast, goes through UCharsTrie API functions
// that have to check the trie state on every call and load/store that state
// in the object (whether it hasNext() and whether it is in the middle of a
// linear-match node).
//
// Every time the trie reports a value, the number of characters consumed so
// far is stored in `lengths` (at most `limit` entries); `count` receives the
// number of entries stored. Returns the number of characters consumed, or 0
// (with `count` untouched) if the very first read already fails.
static int32_t
ucharsTrieMatches(UCharsTrie &trie,
                  UText *text, int32_t textLimit,
                  int32_t *lengths, int &count, int limit ) {
    // Notes:
    // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
    // b) It also ignores non-BMP code points by casting to UChar!
    UChar32 first=utext_next32(text);
    if(first<0) {
        return 0;
    }
    // Should be firstForCodePoint() but CompactTrieDictionary
    // handles only code units.
    UStringTrieResult state=trie.first(first);
    int32_t consumed=1;
    count=0;
    while(state!=USTRINGTRIE_NO_MATCH) {
        if(USTRINGTRIE_HAS_VALUE(state)) {
            if(count<limit) {
                // CompactTrieDictionary just counts chars too, so the
                // native index of the UText is not used here.
                lengths[count]=consumed;
                ++count;
            }
            if(state==USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }
        if(consumed>=textLimit) {
            // Note: Why do we have both a text limit and a UText that knows its length?
            break;
        }
        // The same two notes as for the first read apply here.
        UChar32 following=utext_next32(text);
        if(following<0) {
            break;
        }
        ++consumed;
        // Should be nextForCodePoint() but CompactTrieDictionary
        // handles only code units.
        state=trie.next(following);
    }
    // Note: CompactTrieDictionary::matches() comments say that it leaves the
    // UText after the longest prefix match and returns the number of
    // characters that were matched. However, it does not do either of these,
    // so this function does not try to imitate it (or its docs) 100%.
    return consumed;
}
Beispiel #3
0
// Walks the dictionary trie along the code points of `text`, starting at the
// text's current position, and reports the dictionary entries that are
// prefixes of the text.
//
// text        Input; it is advanced by every code point that is read.
// maxLength   Scanning stops once the native-index distance from the start
//             reaches this value.
// limit       Maximum number of matches written to the output arrays.
// lengths     Optional (may be NULL): native-index length of each match.
// cpLengths   Optional (may be NULL): code point length of each match.
// values      Optional (may be NULL): trie value of each match.
// prefix      Optional (may be NULL): receives the number of code points
//             read from the text, including any that caused the scan to stop.
// ignoreSet   Optional (may be NULL): no match is recorded at a position
//             whose code point is contained in this set.
// minLength   Matches shorter than this many code points are not recorded.
//
// Returns the number of matches written to the output arrays.
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {

    UCharsTrie uct(characters);
    // utext_getNativeIndex() returns int64_t; narrow explicitly, the same way
    // BytesDictionaryMatcher::matches() does.
    int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
    int32_t wordCount = 0;
    int32_t codePointsMatched = 0;

    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
        UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
        int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
        codePointsMatched += 1;
        // NOTE(review): an ignored code point has already been fed to the trie
        // above, and `continue` also skips the NO_MATCH and maxLength checks
        // below. Confirm that this is the intended meaning of "ignore".
        if (ignoreSet != NULL && ignoreSet->contains(c)) {
            continue;
        }
        if (USTRINGTRIE_HAS_VALUE(result)) {
            // Too short to report; keep scanning for a longer entry.
            if (codePointsMatched < minLength) {
                continue;
            }
            if (wordCount < limit) {
                if (values != NULL) {
                    values[wordCount] = uct.getValue();
                }
                if (lengths != NULL) {
                    lengths[wordCount] = lengthMatched;
                }
                if (cpLengths != NULL) {
                    cpLengths[wordCount] = codePointsMatched;
                }
                ++wordCount;
            }
            // A final value means no longer entry continues from here.
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }
        if (lengthMatched >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = codePointsMatched;
    }
    return wordCount;
}
Beispiel #4
0
// Walks the byte trie along the code points of `text` (each mapped through
// transform()), starting at the text's current position, and reports the
// dictionary entries that are prefixes of the text.
//
// Up to `limit` matches are written to the optional (possibly NULL) output
// arrays: `lengths` receives native-index lengths, `cpLengths` code point
// lengths and `values` the trie values. Scanning stops at a mismatch, at a
// final value, at the end of the text, or once the native-index distance
// from the start reaches `maxLength`. If `prefix` is not NULL it receives the
// number of code points read. Returns the number of matches written.
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const {
    BytesTrie bt(characters);
    const int32_t origin = (int32_t)utext_getNativeIndex(text);
    int32_t found = 0;
    int32_t cpSeen = 0;

    UChar32 c;
    while ((c = utext_next32(text)) >= 0) {
        UStringTrieResult state;
        if (cpSeen == 0) {
            state = bt.first(transform(c));
        } else {
            state = bt.next(transform(c));
        }
        const int32_t span = (int32_t)utext_getNativeIndex(text) - origin;
        ++cpSeen;

        if (state == USTRINGTRIE_NO_MATCH) {
            break;
        }
        if (USTRINGTRIE_HAS_VALUE(state)) {
            if (found < limit) {
                if (values != NULL) {
                    values[found] = bt.getValue();
                }
                if (lengths != NULL) {
                    lengths[found] = span;
                }
                if (cpLengths != NULL) {
                    cpLengths[found] = cpSeen;
                }
                ++found;
            }
            // A final value means no longer entry continues from here.
            if (state == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }
        if (span >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = cpSeen;
    }
    return found;
}
// Walks the dictionary trie along the code points of `text`, starting at the
// text's current position.
// Every time the trie reports a value, the number of code points consumed so
// far is stored in `lengths` and, if `values` is not NULL, the trie value in
// `values` (at most `limit` entries); `count` receives the number of entries
// stored. Returns the number of code points consumed, or 0 (with `count`
// untouched) if the very first read already fails.
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
    UCharsTrie uct(characters);
    UChar32 c = utext_next32(text);
    if (c < 0) {
        return 0;
    }

    int32_t consumed = 1;
    count = 0;
    for (UStringTrieResult state = uct.first(c);
         state != USTRINGTRIE_NO_MATCH;
         state = uct.next(c)) {
        if (USTRINGTRIE_HAS_VALUE(state)) {
            if (count < limit) {
                if (values != NULL) {
                    values[count] = uct.getValue();
                }
                lengths[count] = consumed;
                ++count;
            }
            // A final value means no longer entry continues from here.
            if (state == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }

        // TODO: why do we have a text limit if the UText knows its length?
        if (consumed >= maxLength) {
            break;
        }

        c = utext_next32(text);
        if (c < 0) {
            break;
        }
        ++consumed;
    }
    return consumed;
}
Beispiel #6
0
// Debug aid: prints the pattern text, its match-start information, the named
// capture groups and every compiled operation to stdout.
// The body is only compiled when REGEX_DEBUG is defined; otherwise this
// function does nothing.
void RegexPattern::dumpPattern() const {
#if defined(REGEX_DEBUG)
    // Collect the pattern text code point by code point.
    UnicodeString source;
    UChar32 cp = utext_next32From(fPattern, 0);
    while (cp != U_SENTINEL) {
        source.append(cp);
        cp = utext_next32(fPattern);
    }
    printf("Original Pattern:  \"%s\"\n", CStr(source)());
    printf("   Min Match Length:  %d\n", fMinMatchLen);
    printf("   Match Start Type:  %s\n", START_OF_MATCH_STR(fStartType));

    // Extra detail for the start types that carry additional data.
    switch (fStartType) {
    case START_STRING: {
        UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen);
        printf("   Initial match string: \"%s\"\n", CStr(initialString)());
        break;
    }
    case START_SET: {
        UnicodeString setPattern;
        fInitialChars->toPattern(setPattern, TRUE);
        printf("    Match First Chars: %s\n", CStr(setPattern)());
        break;
    }
    case START_CHAR:
        printf("    First char of Match: ");
        // Values up to 0x20 are printed as a number instead of a character.
        if (fInitialChar > 0x20) {
            printf("'%s'\n", CStr(UnicodeString(fInitialChar))());
        } else {
            printf("%#x\n", fInitialChar);
        }
        break;
    default:
        break;
    }

    printf("Named Capture Groups:\n");
    if (uhash_count(fNamedCaptureMap) == 0) {
        printf("   None\n");
    } else {
        int32_t cursor = UHASH_FIRST;
        for (const UHashElement *entry = uhash_nextElement(fNamedCaptureMap, &cursor);
             entry != NULL;
             entry = uhash_nextElement(fNamedCaptureMap, &cursor)) {
            const UnicodeString *groupName = (const UnicodeString *)entry->key.pointer;
            int32_t groupNumber = entry->value.integer;
            printf("   %d\t%s\n", groupNumber, CStr(*groupName)());
        }
    }

    printf("\nIndex   Binary     Type             Operand\n" \
           "-------------------------------------------\n");
    for (int opIndex = 0; opIndex<fCompiledPat->size(); opIndex++) {
        dumpOp(opIndex);
    }
    printf("\n\n");
#endif
}
Beispiel #7
0
// Advances `text` over the run of characters contained in
// fHandled[breakType], stopping at `endPos`. Never reports any breaks:
// the return value is always 0 and the break vector is not touched.
// A `breakType` outside the bounds of fHandled leaves `text` untouched.
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t /* startPos */,
                             int32_t endPos,
                             int32_t breakType,
                             UVector32 &/*foundBreaks*/ ) const {
    if (breakType < 0 || breakType >= UPRV_LENGTHOF(fHandled)) {
        return 0;
    }
    // TODO:  recast loop to work with post-increment operations.
    for (UChar32 c = utext_current32(text);
         (int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c);
         c = utext_current32(text)) {
        utext_next32(text);
    }
    return 0;
}
// Determines the span of characters around the current text position that
// belong to fSet and, if this engine is registered for `breakType` (bit
// `breakType` set in fTypes), lets divideUpDictionaryRange() find the breaks
// inside that span.
//
// With `reverse` set the span is scanned backwards down to `startPos`,
// otherwise forwards up to `endPos`. Returns whatever
// divideUpDictionaryRange() returns, or 0 if the engine does not handle
// `breakType`.
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &foundBreaks ) const {
    // Find the span of characters included in the set.
    const int32_t origin = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);

    if (!reverse) {
        // TODO:  recast loop for postincrement
        for (;;) {
            current = (int32_t)utext_getNativeIndex(text);
            if (current >= endPos || !fSet.contains(c)) {
                break;
            }
            utext_next32(text);
            c = utext_current32(text);
        }
        rangeStart = origin;
        rangeEnd = current;
    }
    else {
        UBool inSet = fSet.contains(c);
        for (;;) {
            current = (int32_t)utext_getNativeIndex(text);
            if (current <= startPos || !inSet) {
                break;
            }
            c = utext_previous32(text);
            inSet = fSet.contains(c);
        }
        if (current < startPos) {
            rangeStart = startPos;
        } else if (inSet) {
            rangeStart = current;
        } else {
            rangeStart = current + 1;
        }
        rangeEnd = origin + 1;
    }

    // Only break types 0..31 can be represented in the fTypes bit mask.
    if (breakType < 0 || breakType >= 32) {
        return 0;
    }
    if ((((uint32_t)1 << breakType) & fTypes) == 0) {
        return 0;
    }

    int32_t result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
    utext_setNativeIndex(text, current);
    return result;
}
Beispiel #9
0
// Moves `text` across the run of characters contained in
// fHandled[breakType]: backwards down to `startPos` when `reverse` is set,
// otherwise forwards up to `endPos`. Never reports any breaks: the return
// value is always 0 and the break stack is not touched.
// A `breakType` outside the bounds of fHandled leaves `text` untouched.
int32_t
UnhandledEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &/*foundBreaks*/ ) const {
    if (breakType < 0 || breakType >= (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
        return 0;
    }

    UChar32 c = utext_current32(text);
    if (!reverse) {
        // TODO:  recast loop to work with post-increment operations.
        for (;;) {
            if ((int32_t)utext_getNativeIndex(text) >= endPos) {
                break;
            }
            if (!fHandled[breakType]->contains(c)) {
                break;
            }
            utext_next32(text);
            c = utext_current32(text);
        }
    }
    else {
        for (;;) {
            if ((int32_t)utext_getNativeIndex(text) <= startPos) {
                break;
            }
            if (!fHandled[breakType]->contains(c)) {
                break;
            }
            c = utext_previous32(text);
        }
    }
    return 0;
}
Beispiel #10
0
//-------------------------------------------------------------------------------
//
//  checkDictionary       This function handles all processing of characters in
//                        the "dictionary" set. It will determine the appropriate
//                        course of action, and possibly set up a cache in the
//                        process.
//
//                        NOTE: the dictionary processing is currently compiled
//                        out (see the "#if 1" below). As built, the function
//                        just returns startPos when reverse is set and endPos
//                        otherwise, without touching the text or any cache.
//
//                        startPos, endPos   The proposed break range.
//                        reverse            TRUE when iterating backwards.
//
//-------------------------------------------------------------------------------
int32_t BreakIterator::checkDictionary(int32_t startPos,
                            int32_t endPos,
                            UBool reverse) {
#if 1
	return reverse ? startPos : endPos;
#else
    // ---- Everything up to the matching #endif is disabled code. ----

    // Reset the old break cache first.
    uint32_t dictionaryCount = fDictionaryCharCount;
    reset();

    // Nothing to divide up with at most one dictionary character or a range
    // of at most one position.
    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
        return (reverse ? startPos : endPos);
    }
    
    // Starting from the starting point, scan towards the proposed result,
    // looking for the first dictionary character (which may be the one
    // we're on, if we're starting in the middle of a range).
    utext_setNativeIndex(fText, reverse ? endPos : startPos);
    if (reverse) {
        UTEXT_PREVIOUS32(fText);
    }
    
    int32_t rangeStart = startPos;
    int32_t rangeEnd = endPos;

    uint16_t    category;
    int32_t     current;
    UErrorCode  status = U_ZERO_ERROR;
    UStack      breaks(status);
    int32_t     foundBreakCount = 0;
    UChar32     c = utext_current32(fText);

    UTRIE_GET16(&fData->fTrie, c, category);
    
    // Is the character we're starting on a dictionary character? If so, we
    // need to back up to include the entire run; otherwise the results of
    // the break algorithm will differ depending on where we start. Since
    // the result is cached and there is typically a non-dictionary break
    // within a small number of words, there should be little performance impact.
    // (The 0x4000 bit of the category is what is tested for "dictionary".)
    if (category & 0x4000) {
        if (reverse) {
            // Scan forwards to the end of the dictionary run.
            do {
                utext_next32(fText);          // TODO:  recast to work directly with postincrement.
                c = utext_current32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            } while (c != U_SENTINEL && (category & 0x4000));
            // Back up to the last dictionary character
            rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
            // Both branches below perform the same step back.
            if (c == U_SENTINEL) {
                // c = fText->last32();
                //   TODO:  why was this if needed?
                c = UTEXT_PREVIOUS32(fText);
            }
            else {
                c = UTEXT_PREVIOUS32(fText);
            }
        }
        else {
            // Scan backwards to the start of the dictionary run.
            do {
                c = UTEXT_PREVIOUS32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            while (c != U_SENTINEL && (category & 0x4000));
            // Step forward again onto the first dictionary character, unless
            // the scan ran into U_SENTINEL.
            if (c == U_SENTINEL) {
                // c = fText->first32();
                c = utext_current32(fText);
            }
            else {
                utext_next32(fText);
                c = utext_current32(fText);
            }
            rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
        }
        UTRIE_GET16(&fData->fTrie, c, category);
    }
    
    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.
    // Note: we always do this in the forward direction, so that the break
    // cache is built in the right order.
    if (reverse) {
        utext_setNativeIndex(fText, rangeStart);
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }
    while(U_SUCCESS(status)) {
        // Skip forward over non-dictionary characters.
        while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
            utext_next32(fText);           // TODO:  tweak for post-increment operation
            c = utext_current32(fText);
            UTRIE_GET16(&fData->fTrie, c, category);
        }
        if (current >= rangeEnd) {
            break;
        }
        
        // We now have a dictionary character. Get the appropriate language object
        // to deal with it.
        const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
        
        // Ask the language object if there are any breaks. It will leave the text
        // pointer on the other side of its range, ready to search for the next one.
        if (lbe != NULL) {
            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
        }
        
        // Reload the loop variables for the next go-round
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }
    
    // If we found breaks, build a new break cache. The first and last entries must
    // be the original starting and ending position.
    if (foundBreakCount > 0) {
        // Reserve one extra slot each for startPos and endPos when they are
        // not already the first / last break found.
        int32_t totalBreaks = foundBreakCount;
        if (startPos < breaks.elementAti(0)) {
            totalBreaks += 1;
        }
        if (endPos > breaks.peeki()) {
            totalBreaks += 1;
        }
        fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
        if (fCachedBreakPositions != NULL) {
            int32_t out = 0;
            fNumCachedBreakPositions = totalBreaks;
            if (startPos < breaks.elementAti(0)) {
                fCachedBreakPositions[out++] = startPos;
            }
            for (int32_t i = 0; i < foundBreakCount; ++i) {
                fCachedBreakPositions[out++] = breaks.elementAti(i);
            }
            if (endPos > fCachedBreakPositions[out-1]) {
                fCachedBreakPositions[out] = endPos;
            }
            // If there are breaks, then by definition, we are replacing the original
            // proposed break by one of the breaks we found. Use following() and
            // preceding() to do the work. They should never recurse in this case.
            if (reverse) {
                return preceding(endPos - 1);
            }
            else {
                return following(startPos);
            }
        }
        // If the allocation failed, just fall through to the "no breaks found" case.
    }

    // If we get here, there were no language-based breaks. Set the text pointer
    // to the original proposed break.
    utext_setNativeIndex(fText, reverse ? startPos : endPos);
    return (reverse ? startPos : endPos);
#endif
}
/*
 * Divides a range of dictionary characters into words. The range is
 * normalized (NFKC), the cheapest segmentation is found by dynamic
 * programming over the dictionary matches, and the resulting boundaries are
 * mapped back to indices in the raw input.
 *
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Stack onto which the break positions (indices into
 *                    the raw text) are pushed, in ascending order
 * @return The number of break positions pushed onto foundBreaks; 0 if the
 *         range is empty or an ICU call failed
 */
int32_t 
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Copy the raw input range into charString. It is normalized further
    // below, and the map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    // Try the quick check first; run the full check only if it does not
    // answer UNORM_YES.
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // charPositions[k] is the index in the raw input (relative to rangeStart)
    // that corresponds to position k of the text that is segmented below.
    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Input is already normalized: segment it directly and record the
        // index after each code point.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        // Step a Normalizer over the raw input and record its index after
        // each step.
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.
    // NOTE(review): numChars counts code points, but positions below are
    // passed to utext_setNativeIndex(); presumably this relies on the text
    // having no supplementary characters - confirm.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    // kuint32max marks a position that has not been reached yet.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in 
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // Per-position scratch output of the dictionary lookup.
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Position i cannot be reached by any segmentation found so far.
        if (bestSnlp[i] == kuint32max)
            continue;

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary 
        // starting with this character, treat character as a 1-character word 
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relax: try to improve the cost of every position reachable from i
        // by one candidate word.
        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        // Only consider a run at its first character.
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            // Runs that reach kMaxKatakanaGroupLength are not used as a
            // candidate word.
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Follow the prev[] chain from the end back to position 0.
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_boundary[] (indices in 
    // the normalized input string) back to indices in the raw input string
    // while reversing t_boundary and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
// Divides the range [rangeStart, rangeEnd) of `text` into words using
// fDictionary, looking ahead up to three words to choose between competing
// candidates, and pushes the end position of each word onto `foundBreaks`.
// A break that coincides with rangeEnd is removed again before returning.
// Returns the number of words found (0 if the range is shorter than
// KHMER_MIN_WORD_SPAN).
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Lookahead slots; always indexed modulo KHMER_LOOKAHEAD, relative to
    // wordsFound.
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    // Each iteration handles one word starting at `current`.
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            // Outer loop: try each candidate for the first word in turn.
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            // Accept whichever candidate for the first word is marked.
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                // pc is the previous character, uc the one after it.
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        // Return to the position being examined before
                        // deciding whether to stop here.
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // (The suffix handling below is disabled; it is kept commented out.)
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }
    
    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
Beispiel #13
0
/*
 * TestAPI
 *    Smoke test of the plain-C UText API on small read-only strings.
 *    Each braced section below is independent: it opens its own UText,
 *    exercises one group of functions, and closes the UText again.
 *
 *    Results are checked with the TEST_SUCCESS / TEST_ASSERT macros, which are
 *    defined elsewhere in this file (presumably they report the failure and
 *    record it in gFailed -- confirm against the macro definitions).
 */
static void TestAPI(void) {
    UErrorCode      status = U_ZERO_ERROR;
    UBool           gFailed = FALSE;
    (void)gFailed;   /* Suppress set but not used warning. */

    /* Open    */
    {
        UText           utLoc = UTEXT_INITIALIZER;
        const char *    cString = "\x61\x62\x63\x64";
        UChar           uString[]  = {0x41, 0x42, 0x43, 0};
        UText          *uta;
        UText          *utb;
        /* utext_next32() returns a UChar32; a 16-bit UChar would truncate it. */
        UChar32         c;

        /* No caller-supplied UText: closing is expected to return NULL. */
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        c = utext_next32(uta);
        TEST_ASSERT(c == 0x41);
        utb = utext_close(uta);
        TEST_ASSERT(utb == NULL);

        /* Caller-supplied (stack) UText: open and close are both expected
         * to hand back the address of that same struct. */
        uta = utext_openUTF8(&utLoc, cString, -1, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(uta == &utLoc);

        uta = utext_close(&utLoc);
        TEST_ASSERT(uta == &utLoc);
    }

    /* utext_clone()  */
    {
        UChar   uString[]  = {0x41, 0x42, 0x43, 0};
        int64_t len;
        UText   *uta;
        UText   *utb;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        /* The clone must be a distinct, non-NULL UText. */
        utb = utext_clone(NULL, uta, FALSE, FALSE, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(utb != NULL);
        TEST_ASSERT(utb != uta);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        utext_close(uta);
        utext_close(utb);
    }

    /* basic access functions  */
    {
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UText     *uta;
        UChar32   c;
        int64_t   len;
        UBool     b;
        int64_t   i;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_ASSERT(uta!=NULL);
        TEST_SUCCESS(status);
        /* Opened with length -1: the length is expected to be reported as
         * expensive until utext_nativeLength() has been called once. */
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==TRUE);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==FALSE);

        c = utext_char32At(uta, 0);
        TEST_ASSERT(c==uString[0]);

        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        /* next32 returns the current char and advances past it. */
        c = utext_next32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        /* previous32 steps back and returns the char now at the index. */
        c = utext_previous32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        /* next32From at the end of the text yields U_SENTINEL. */
        c = utext_next32From(uta, 1);
        TEST_ASSERT(c==uString[1]);
        c = utext_next32From(uta, u_strlen(uString));
        TEST_ASSERT(c==U_SENTINEL);

        /* previous32From(2) returns the char at index 1 and leaves the index there. */
        c = utext_previous32From(uta, 2);
        TEST_ASSERT(c==uString[1]);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i == 1);

        utext_setNativeIndex(uta, 0);
        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==1);

        /* Move exactly to the end of the text: still reported as success. */
        b = utext_moveIndex32(uta, u_strlen(uString)-1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        /* Moving past the end fails and leaves the index at the end. */
        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==FALSE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        /* Same forward/backward checks through the macro forms. */
        utext_setNativeIndex(uta, 0);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==uString[0]);
        /* Already at the start: a further step back yields U_SENTINEL. */
        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==U_SENTINEL);


        utext_close(uta);
    }

    {
        /*
         * UText opened on a NULL string with zero length
         */
        UText    *uta;
        UChar32   c;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);

        uta = utext_openUTF8(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);
    }


    {
        /*
         * extract
         */
        UText     *uta;
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UChar     buf[100];
        int32_t   i;
        /* Test pinning of input bounds */
        UChar     uString2[]  = {0x41, 0x42, 0x43, 0x44, 0x45,
                                 0x46, 0x47, 0x48, 0x49, 0x4A, 0};
        /* Points into the middle of uString2, so there is valid text before
         * the start of the string the UText is opened on. */
        UChar *   uString2Ptr = uString2 + 5;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        /* Preflight with no destination buffer: expect an overflow error
         * together with the full required length. */
        status = U_ZERO_ERROR;
        i = utext_extract(uta, 0, 100, NULL, 0, &status);
        TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR);
        TEST_ASSERT(i == u_strlen(uString));

        /* Real extraction; the limit (100) exceeds the text length. */
        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, 0, 100, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString));
        i = u_strcmp(uString, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);

        /* Test pinning of input bounds */
        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString2Ptr, -1, &status);
        TEST_SUCCESS(status);

        /* A negative start and an over-long limit must be pinned to the
         * text: the result is exactly the string at uString2Ptr. */
        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, -3, 20, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString2Ptr));
        i = u_strcmp(uString2Ptr, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);
    }

    {
        /*
         *  Copy, Replace, isWritable
         *    Can't create an editable UText from plain C, so all we
         *    can easily do is check that errors returned.
         */
        UText     *uta;
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UBool     b;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        b = utext_isWritable(uta);
        TEST_ASSERT(b == FALSE);

        b = utext_hasMetaData(uta);
        TEST_ASSERT(b == FALSE);

        utext_replace(uta,
                      0, 1,     /* start, limit */
                      uString, -1,  /* replacement, replacement length */
                      &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);

        /* Clear the error left by utext_replace(). ICU functions return
         * immediately when handed a failure status, so without this reset
         * the utext_copy() check below would pass without testing anything. */
        status = U_ZERO_ERROR;
        utext_copy(uta,
                   0, 1,         /* start, limit      */
                   2,            /* destination index */
                   FALSE,        /* move flag         */
                   &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);

        utext_close(uta);
    }


}