C++ (Cpp) utext_getNativeIndex примеры использования

Пример #1

0

Показать файл

Файл: dictionarydata.cpp Проект: cambell-prince/icu

int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {

    UCharsTrie uct(characters);
    int32_t startingTextIndex = utext_getNativeIndex(text);
    int32_t wordCount = 0;
    int32_t codePointsMatched = 0;

    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
        UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
        int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
        codePointsMatched += 1;
        if (ignoreSet != NULL && ignoreSet->contains(c)) {
            continue;
        }
        if (USTRINGTRIE_HAS_VALUE(result)) {
            if (codePointsMatched < minLength) {
                continue;
            }
            if (wordCount < limit) {
                if (values != NULL) {
                    values[wordCount] = uct.getValue();
                }
                if (lengths != NULL) {
                    lengths[wordCount] = lengthMatched;
                }
                if (cpLengths != NULL) {
                    cpLengths[wordCount] = codePointsMatched;
                }
                ++wordCount;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }
        if (lengthMatched >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = codePointsMatched;
    }
    return wordCount;
}

Пример #2

0

Показать файл

Файл: dictionarydata.cpp Проект: 119120119/node

int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const {
    BytesTrie bt(characters);
    int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
    int32_t wordCount = 0;
    int32_t codePointsMatched = 0;

    for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
        UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
        int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
        codePointsMatched += 1;
        if (USTRINGTRIE_HAS_VALUE(result)) {
            if (wordCount < limit) {
                if (values != NULL) {
                    values[wordCount] = bt.getValue();
                }
                if (lengths != NULL) {
                    lengths[wordCount] = lengthMatched;
                }
                if (cpLengths != NULL) {
                    cpLengths[wordCount] = codePointsMatched;
                }
                ++wordCount;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }
        if (lengthMatched >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = codePointsMatched;
    }
    return wordCount;
}

Пример #3

0

Показать файл

Файл: brkeng.cpp Проект: eyoung-father/libicu_full

void
UnhandledEngine::findBreaks( UText *text,
                             int32_t endPos,
                             int32_t /* breakType */,
                             UVector32 &foundBreaks,
                             UErrorCode &status) const {
    int32_t startPos = utext_getNativeIndex(text);
    foundBreaks.addElement(startPos, status);
    foundBreaks.addElement(endPos, status);
    return;
}

Пример #4

0

Показать файл

Файл: dictbe.cpp Проект: alfintatorkace/osx-10.9-opensource

int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        UBool   isDict = fSet.contains(c);
        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);         // TODO:  recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        utext_setNativeIndex(text, current);
    }
    
    return result;
}

Пример #5

0

Показать файл

Файл: brkeng.cpp Проект: 00zhengfu00/third_party

int32_t
UnhandledEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &/*foundBreaks*/ ) const {
    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
        UChar32 c = utext_current32(text); 
        if (reverse) {
            while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
                c = utext_previous32(text);
            }
        }
        else {
            while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
                utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
                c = utext_current32(text);
            }
        }
    }
    return 0;
}

Пример #6

0

Показать файл

Файл: brkeng.cpp Проект: Mikhaska/node

int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t /* startPos */,
                             int32_t endPos,
                             int32_t breakType,
                             UVector32 &/*foundBreaks*/ ) const {
    if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
        UChar32 c = utext_current32(text);
        while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
            utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
            c = utext_current32(text);
        }
    }
    return 0;
}

Пример #7

0

Показать файл

Файл: CodePointBreakIterator.cpp Проект: 191919/hhvm

CodePointBreakIterator&
CodePointBreakIterator::refreshInputText(UText *input,
           UErrorCode &status) {
  if (U_FAILURE(status)) {
    return *this;
  }
  if (!input) {
    status = U_ILLEGAL_ARGUMENT_ERROR;
    return *this;
  }

  int64_t pos = utext_getNativeIndex(m_text);
  m_text = utext_clone(m_text, input, false, true, &status);
  if (U_FAILURE(status)) {
    return *this;
  }

  utext_setNativeIndex(m_text, pos);
  if (utext_getNativeIndex(m_text) != pos) {
    status = U_ILLEGAL_ARGUMENT_ERROR;
  }

  return *this;
}

Пример #8

0

Показать файл

Файл: dictbe.cpp Проект: alfintatorkace/osx-10.9-opensource

inline int
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        offset = start;
        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        utext_setNativeIndex(text, start+lengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}

Пример #9

0

Показать файл

Файл: rbbi.cpp Проект: Gin-Rye/duibrowser

//-----------------------------------------------------------------------------------
//
//  handlePrevious()
//
//      Iterate backwards, according to the logic of the reverse rules.
//      This version handles the exact style backwards rules.
//
//      The logic of this function is very similar to handleNext(), above.
//
//-----------------------------------------------------------------------------------
int32_t BreakIterator::handlePrevious(const RBBIStateTable *statetable) {
    int32_t             state;
    int16_t             category        = 0;
    RBBIRunMode         mode;
    RBBIStateTableRow  *row;
    UChar32             c;
    int32_t             lookaheadStatus = 0;
    int32_t             result          = 0;
    int32_t             initialPosition = 0;
    int32_t             lookaheadResult = 0;
    UBool               lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;

    #ifdef RBBI_DEBUG
        if (fTrace) {
            RBBIDebugPuts("Handle Previous   pos   char  state category");
        }
    #endif

    // handlePrevious() never gets the rule status.
    // Flag the status as invalid; if the user ever asks for status, we will need
    // to back up, then re-find the break position using handleNext(), which does
    // get the status value.
    fLastStatusIndexValid = FALSE;
    fLastRuleStatusIndex = 0;

    // if we're already at the start of the text, return DONE.
    if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
        return BreakIterator::DONE;
    }

    //  Set up the starting char.
    initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    result          = initialPosition;
    c               = UTEXT_PREVIOUS32(fText);

    //  Set the initial state for the state machine
    state = START_STATE;
    row = (RBBIStateTableRow *)
            (statetable->fTableData + (statetable->fRowLen * state));
    category = 3;
    mode     = RBBI_RUN;
    if (statetable->fFlags & RBBI_BOF_REQUIRED) {
        category = 2;
        mode     = RBBI_START;
    }


    // loop until we reach the start of the text or transition to state 0
    //
    for (;;) {
        if (c == U_SENTINEL) {
            // Reached end of input string.
            if (mode == RBBI_END || 
                *(int32_t *)fHeader->fFormatVersion == 1 ) {
                // We have already run the loop one last time with the 
                //   character set to the psueudo {eof} value.  Now it is time
                //   to unconditionally bail out.
                //  (Or we have an old format binary rule file that does not support {eof}.)
                if (lookaheadResult < result) {
                    // We ran off the end of the string with a pending look-ahead match.
                    // Treat this as if the look-ahead condition had been met, and return
                    //  the match at the / position from the look-ahead rule.
                    result               = lookaheadResult;
                    lookaheadStatus = 0;
                } else if (result == initialPosition) {
                    // Ran off start, no match found.
                    // move one index one (towards the start, since we are doing a previous())
                    UTEXT_SETNATIVEINDEX(fText, initialPosition);
                    UTEXT_PREVIOUS32(fText);   // TODO:  shouldn't be necessary.  We're already at beginning.  Check.
                }
                break;
            }
            // Run the loop one last time with the fake end-of-input character category.
            mode = RBBI_END;
            category = 1;
        }

        //
        // Get the char category.  An incoming category of 1 or 2 means that
        //      we are preset for doing the beginning or end of input, and
        //      that we shouldn't get a category from an actual text input character.
        //
        if (mode == RBBI_RUN) {
            // look up the current character's character category, which tells us
            // which column in the state table to look at.
            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
            //        not the size of the character going in, which is a UChar32.
            //
            UTRIE_GET16(&fTrie, c, category);

            // Check the dictionary bit in the character's category.
            //    Counter is only used by dictionary based iterators (subclasses).
            //    Chars that need to be handled by a dictionary have a flag bit set
            //    in their category values.
            //
            if ((category & 0x4000) != 0)  {
                fDictionaryCharCount++;
                //  And off the dictionary flag bit.
                category &= ~0x4000;
            }
        }

        #ifdef RBBI_DEBUG
            if (fTrace) {
                RBBIDebugPrintf("             %4d   ", (int32_t)utext_getNativeIndex(fText));
                if (0x20<=c && c<0x7f) {
                    RBBIDebugPrintf("\"%c\"  ", c);
                } else {
                    RBBIDebugPrintf("%5x  ", c);
                }
                RBBIDebugPrintf("%3d  %3d\n", state, category);
            }
        #endif

        // State Transition - move machine to its next state
        //
        state = row->fNextState[category];
        row = (RBBIStateTableRow *)
            (statetable->fTableData + (statetable->fRowLen * state));

        if (row->fAccepting == -1) {
            // Match found, common case.
            result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        }

        if (row->fLookAhead != 0) {
            if (lookaheadStatus != 0
                && row->fAccepting == lookaheadStatus) {
                // Lookahead match is completed.  
                result               = lookaheadResult;
                lookaheadStatus      = 0;
                // TODO:  make a standalone hard break in a rule work.
                if (lookAheadHardBreak) {
                    UTEXT_SETNATIVEINDEX(fText, result);
                    return result;
                }
                // Look-ahead completed, but other rules may match further.  Continue on
                //  TODO:  junk this feature?  I don't think it's used anywhwere.
                goto continueOn;
            }

            int32_t  r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
            lookaheadResult = r;
            lookaheadStatus = row->fLookAhead;
            goto continueOn;
        }


        if (row->fAccepting != 0) {
            // Because this is an accepting state, any in-progress look-ahead match
            //   is no longer relavant.  Clear out the pending lookahead status.
            lookaheadStatus = 0;    
        }

continueOn:
        if (state == STOP_STATE) {
            // This is the normal exit from the lookup state machine.
            // We have advanced through the string until it is certain that no
            //   longer match is possible, no matter what characters follow.
            break;
        }

        // Move (backwards) to the next character to process.  
        // If this is a beginning-of-input loop iteration, don't advance
        //    the input position.  The next iteration will be processing the
        //    first real input character.
        if (mode == RBBI_RUN) {
            c = UTEXT_PREVIOUS32(fText);
        } else {            
            if (mode == RBBI_START) {
                mode = RBBI_RUN;
            }
        }
    }

    // The state machine is done.  Check whether it found a match...

    // If the iterator failed to advance in the match engine, force it ahead by one.
    //   (This really indicates a defect in the break rules.  They should always match
    //    at least one character.)
    if (result == initialPosition) {
        UTEXT_SETNATIVEINDEX(fText, initialPosition);
        UTEXT_PREVIOUS32(fText);
        result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    }

    // Leave the iterator at our result position.
    UTEXT_SETNATIVEINDEX(fText, result);
    #ifdef RBBI_DEBUG
        if (fTrace) {
            RBBIDebugPrintf("result = %d\n\n", result);
        }
    #endif
    return result;
}

Пример #10

0

Показать файл

Файл: rbbi.cpp Проект: Gin-Rye/duibrowser

/**
 * Sets the iterator to refer to the first boundary position following
 * the specified position.
 * @offset The position from which to begin searching for a break position.
 * @return The position of the first break after the current position.
 */
int32_t BreakIterator::following(int32_t offset) {
    // if we have cached break positions and offset is in the range
    // covered by them, use them
    // TODO: could use binary search
    // TODO: what if offset is outside range, but break is not?
    if (fCachedBreakPositions != NULL) {
        if (offset >= fCachedBreakPositions[0]
                && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
            fPositionInCache = 0;
            // We are guaranteed not to leave the array due to range test above
            while (offset >= fCachedBreakPositions[fPositionInCache]) {
                ++fPositionInCache;
            }
            int32_t pos = fCachedBreakPositions[fPositionInCache];
            utext_setNativeIndex(fText, pos);
            return pos;
        }
        else {
            reset();
        }
    }

    // if the offset passed in is already past the end of the text,
    // just return DONE; if it's before the beginning, return the
    // text's starting offset
    fLastRuleStatusIndex  = 0;
    fLastStatusIndexValid = TRUE;
    if (fText == NULL || offset >= utext_nativeLength(fText)) {
        last();
        return next();
    }
    else if (offset < 0) {
        return first();
    }

    // otherwise, set our internal iteration position (temporarily)
    // to the position passed in.  If this is the _beginning_ position,
    // then we can just use next() to get our return value

    int32_t result = 0;

    if (fSafeRevTable != NULL) {
        // new rule syntax
        utext_setNativeIndex(fText, offset);
        // move forward one codepoint to prepare for moving back to a
        // safe point.
        // this handles offset being between a supplementary character
        UTEXT_NEXT32(fText);
        // handlePrevious will move most of the time to < 1 boundary away
        handlePrevious(fSafeRevTable);
        int32_t result = next();
        while (result <= offset) {
            result = next();
        }
        return result;
    }
    if (fSafeFwdTable != NULL) {
        // backup plan if forward safe table is not available
        utext_setNativeIndex(fText, offset);
        UTEXT_PREVIOUS32(fText);
        // handle next will give result >= offset
        handleNext(fSafeFwdTable);
        // previous will give result 0 or 1 boundary away from offset,
        // most of the time
        // we have to
        int32_t oldresult = previous();
        while (oldresult > offset) {
            int32_t result = previous();
            if (result <= offset) {
                return oldresult;
            }
            oldresult = result;
        }
        int32_t result = next();
        if (result <= offset) {
            return next();
        }
        return result;
    }
    // otherwise, we have to sync up first.  Use handlePrevious() to back
    // up to a known break position before the specified position (if
    // we can determine that the specified position is a break position,
    // we don't back up at all).  This may or may not be the last break
    // position at or before our starting position.  Advance forward
    // from here until we've passed the starting position.  The position
    // we stop on will be the first break position after the specified one.
    // old rule syntax

    utext_setNativeIndex(fText, offset);
    if (offset==0 || 
        offset==1  && utext_getNativeIndex(fText)==0) {
        return next();
    }
    result = previous();

    while (result != BreakIterator::DONE && result <= offset) {
        result = next();
    }

    return result;
}

Пример #11

0

Показать файл

Файл: dictbe.cpp Проект: alfintatorkace/osx-10.9-opensource

int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }
    
    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

Пример #12

0

Показать файл

Файл: utexttst.c Проект: icu-project/icu4c

static void TestAPI(void) {
    UErrorCode      status = U_ZERO_ERROR;
    UBool           gFailed = FALSE;
    (void)gFailed;   /* Suppress set but not used warning. */

    /* Open    */
    {
        UText           utLoc = UTEXT_INITIALIZER;
        const char *    cString = "\x61\x62\x63\x64";
        UChar           uString[]  = {0x41, 0x42, 0x43, 0};
        UText          *uta;
        UText          *utb;
        UChar           c;

        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        c = utext_next32(uta);
        TEST_ASSERT(c == 0x41);
        utb = utext_close(uta); 
        TEST_ASSERT(utb == NULL);

        uta = utext_openUTF8(&utLoc, cString, -1, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(uta == &utLoc);

        uta = utext_close(&utLoc);
        TEST_ASSERT(uta == &utLoc);
    }

    /* utext_clone()  */
    {
        UChar   uString[]  = {0x41, 0x42, 0x43, 0};
        int64_t len;
        UText   *uta;
        UText   *utb;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        utb = utext_clone(NULL, uta, FALSE, FALSE, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(utb != NULL);
        TEST_ASSERT(utb != uta);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        utext_close(uta);
        utext_close(utb);
    }

    /* basic access functions  */
    {
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UText     *uta;
        UChar32   c;
        int64_t   len;
        UBool     b;
        int64_t   i;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_ASSERT(uta!=NULL);
        TEST_SUCCESS(status);
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==TRUE);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==FALSE);

        c = utext_char32At(uta, 0);
        TEST_ASSERT(c==uString[0]);
        
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = utext_previous32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32From(uta, 1);
        TEST_ASSERT(c==uString[1]);
        c = utext_next32From(uta, u_strlen(uString));
        TEST_ASSERT(c==U_SENTINEL);

        c = utext_previous32From(uta, 2);
        TEST_ASSERT(c==uString[1]);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i == 1);

        utext_setNativeIndex(uta, 0);
        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==1);

        b = utext_moveIndex32(uta, u_strlen(uString)-1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==FALSE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        utext_setNativeIndex(uta, 0);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==uString[0]);
        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==U_SENTINEL);


        utext_close(uta);
    }

    {
        /*
         * UText opened on a NULL string with zero length
         */
        UText    *uta;
        UChar32   c;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);

        uta = utext_openUTF8(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);
    }


    {
        /*
         * extract
         */
        UText     *uta;
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UChar     buf[100];
        int32_t   i;
        /* Test pinning of input bounds */
        UChar     uString2[]  = {0x41, 0x42, 0x43, 0x44, 0x45,
                                 0x46, 0x47, 0x48, 0x49, 0x4A, 0};
        UChar *   uString2Ptr = uString2 + 5;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        status = U_ZERO_ERROR;
        i = utext_extract(uta, 0, 100, NULL, 0, &status);
        TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR);
        TEST_ASSERT(i == u_strlen(uString));

        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, 0, 100, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString));
        i = u_strcmp(uString, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);

        /* Test pinning of input bounds */
        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString2Ptr, -1, &status);
        TEST_SUCCESS(status);

        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, -3, 20, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString2Ptr));
        i = u_strcmp(uString2Ptr, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);
    }

    {
        /*
         *  Copy, Replace, isWritable
         *    Can't create an editable UText from plain C, so all we
         *    can easily do is check that errors returned.
         */
        UText     *uta;
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UBool     b;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        b = utext_isWritable(uta);
        TEST_ASSERT(b == FALSE);

        b = utext_hasMetaData(uta);
        TEST_ASSERT(b == FALSE);

        utext_replace(uta,
                      0, 1,     /* start, limit */
                      uString, -1,  /* replacement, replacement length */
                      &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);


        utext_copy(uta,
                   0, 1,         /* start, limit      */
                   2,            /* destination index */
                   FALSE,        /* move flag         */
                   &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);

        utext_close(uta);
    }


}