// Step back to the previous candidate length recorded in lengths[].
// On success the text is repositioned at offset + that length and TRUE is
// returned. When no earlier candidate remains (current is already at or
// below zero) nothing is touched and FALSE is returned.
inline UBool
PossibleWord::backUp( UText *text ) {
    if (current <= 0) {
        return FALSE;
    }
    current -= 1;
    utext_setNativeIndex(text, offset + lengths[current]);
    return TRUE;
}
// Closely imitate CompactTrieDictionary::matches(). // Note: CompactTrieDictionary::matches() is part of its trie implementation, // and while it loops over the text, it knows the current state. // By contrast, this implementation uses UCharsTrie API functions that have to // check the trie state each time and load/store state in the object. // (Whether it hasNext() and whether it is in the middle of a linear-match node.) static int32_t ucharsTrieMatches(UCharsTrie &trie, UText *text, int32_t textLimit, int32_t *lengths, int &count, int limit ) { UChar32 c=utext_next32(text); // Notes: // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. // b) It also ignores non-BMP code points by casting to UChar! if(c<0) { return 0; } // Should be firstForCodePoint() but CompactTrieDictionary // handles only code units. UStringTrieResult result=trie.first(c); int32_t numChars=1; count=0; for(;;) { if(USTRINGTRIE_HAS_VALUE(result)) { if(count<limit) { // lengths[count++]=(int32_t)utext_getNativeIndex(text); lengths[count++]=numChars; // CompactTrieDictionary just counts chars too. } if(result==USTRINGTRIE_FINAL_VALUE) { break; } } else if(result==USTRINGTRIE_NO_MATCH) { break; } if(numChars>=textLimit) { // Note: Why do we have both a text limit and a UText that knows its length? break; } UChar32 c=utext_next32(text); // Notes: // a) CompactTrieDictionary::matches() does not check for U_SENTINEL. // b) It also ignores non-BMP code points by casting to UChar! if(c<0) { break; } ++numChars; // Should be nextForCodePoint() but CompactTrieDictionary // handles only code units. result=trie.next(c); } #if 0 // Note: CompactTrieDictionary::matches() comments say that it leaves the UText // after the longest prefix match and returns the number of characters // that were matched. if(index!=lastMatch) { utext_setNativeIndex(text, lastMatch); } return lastMatch-start; // However, it does not do either of these, so I am not trying to // imitate it (or its docs) 100%. 
#endif return numChars; }
/**
 * Moves the iteration position to the start of the text and marks the
 * rule-status index as valid with a value of 0.
 * @return Always 0, the offset of the beginning of the text.
 */
int32_t BreakIterator::first(void) {
    const int32_t textStart = 0;

    reset();
    // No NULL check is made on fText here; the text is repositioned
    // unconditionally.
    fLastStatusIndexValid = TRUE;
    fLastRuleStatusIndex  = 0;
    utext_setNativeIndex(fText, textStart);
    return textStart;
}
// Collects the dictionary candidates that begin at the text's current
// position. The dictionary is only consulted when the position differs from
// the one remembered in `offset`; otherwise the previously stored results
// are reused. Afterwards the text is placed at the end of the last entry in
// lengths[] (or back at the starting position when a fresh lookup found
// nothing), and both `current` and `mark` select that last entry.
// Returns the number of candidates.
inline int
PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    const int32_t here = (int32_t)utext_getNativeIndex(text);
    const UBool lookedUp = (here != offset);

    if (lookedUp) {
        offset = here;
        prefix = dict->matches(text, rangeEnd - here, lengths, count,
                               sizeof(lengths)/sizeof(lengths[0]));
    }

    if (count > 0) {
        utext_setNativeIndex(text, here + lengths[count - 1]);
    }
    else if (lookedUp) {
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        utext_setNativeIndex(text, here);
    }

    mark = current = count - 1;
    return count;
}
/**
 * Moves the iteration position to the end of the text.
 * When there is no text, the rule-status index is reset to 0, marked valid,
 * and DONE is returned instead.
 * @return The text's past-the-end offset, or BreakIterator::DONE if fText is NULL.
 */
int32_t BreakIterator::last(void) {
    reset();

    if (fText != NULL) {
        // The status for the end position has not been computed yet.
        fLastStatusIndexValid = FALSE;
        const int32_t endOffset = (int32_t)utext_nativeLength(fText);
        utext_setNativeIndex(fText, endOffset);
        return endOffset;
    }

    fLastRuleStatusIndex  = 0;
    fLastStatusIndexValid = TRUE;
    return BreakIterator::DONE;
}
/**
 * Advances the iterator to the next boundary position.
 * @return The position of the first boundary after this one.
 */
int32_t BreakIterator::next(void) {
    // While a break cache exists and we are not yet on its final entry,
    // stepping forward is just a matter of moving one slot in the cache.
    if (fCachedBreakPositions != NULL) {
        if (fPositionInCache >= fNumCachedBreakPositions - 1) {
            // Walked off the end of the cached range: discard it.
            reset();
        }
        else {
            ++fPositionInCache;
            const int32_t cached = fCachedBreakPositions[fPositionInCache];
            utext_setNativeIndex(fText, cached);
            return cached;
        }
    }

    const int32_t from = current();
    int32_t boundary = handleNext(fForwardTable);
    if (fDictionaryCharCount > 0) {
        boundary = checkDictionary(from, boundary, FALSE);
    }
    return boundary;
}
// Determines the span of characters around the text's current position that
// belong to fSet and, if this engine handles `breakType`, hands that span to
// divideUpDictionaryRange().
//
// Forward scans run from the current position up to (not past) endPos;
// reverse scans run backwards down to startPos. When the break type is
// handled, the text is left at the position where the scan stopped and the
// value from divideUpDictionaryRange() is returned; otherwise 0 is returned.
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                   int32_t startPos,
                                   int32_t endPos,
                                   UBool reverse,
                                   int32_t breakType,
                                   UStack &foundBreaks ) const {
    // Find the span of characters included in the set.
    const int32_t origin = (int32_t)utext_getNativeIndex(text);
    int32_t pos;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 ch = utext_current32(text);

    if (!reverse) {
        for (;;) {
            pos = (int32_t)utext_getNativeIndex(text);
            if (pos >= endPos || !fSet.contains(ch)) {
                break;
            }
            utext_next32(text);         // TODO:  recast loop for postincrement
            ch = utext_current32(text);
        }
        rangeStart = origin;
        rangeEnd = pos;
    }
    else {
        UBool inSet = fSet.contains(ch);
        for (;;) {
            pos = (int32_t)utext_getNativeIndex(text);
            if (pos <= startPos || !inSet) {
                break;
            }
            ch = utext_previous32(text);
            inSet = fSet.contains(ch);
        }
        if (pos < startPos) {
            rangeStart = startPos;
        }
        else {
            // If the scan stopped on a character outside the set, the range
            // starts one position after it.
            rangeStart = inSet ? pos : pos + 1;
        }
        rangeEnd = origin + 1;
    }

    // Only break types whose bit is set in fTypes are handled.
    const UBool handled = breakType >= 0 && breakType < 32 &&
                          (((uint32_t)1 << breakType) & fTypes);
    if (!handled) {
        return 0;
    }

    const int32_t found = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
    utext_setNativeIndex(text, pos);
    return found;
}
// Re-points the iterator's UText at `input` while keeping the current
// native index.
//
// Does nothing if `status` already indicates failure. Sets
// U_ILLEGAL_ARGUMENT_ERROR when `input` is null, or when the saved index
// cannot be restored exactly on the new text. Always returns *this.
CodePointBreakIterator& CodePointBreakIterator::refreshInputText(UText *input, UErrorCode &status)
{
    if (U_SUCCESS(status)) {
        if (input == NULL) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        else {
            const int64_t savedIndex = utext_getNativeIndex(m_text);

            m_text = utext_clone(m_text, input, false, true, &status);
            if (U_SUCCESS(status)) {
                utext_setNativeIndex(m_text, savedIndex);
                // The new text must accept the old index unchanged.
                if (utext_getNativeIndex(m_text) != savedIndex) {
                    status = U_ILLEGAL_ARGUMENT_ERROR;
                }
            }
        }
    }
    return *this;
}
/**
 * Sets the iterator to refer to the last boundary position before the
 * specified position.
 *
 * Three strategies are tried in order: the cached break positions (when the
 * offset falls inside the cached range), the forward safe table, and the
 * reverse safe table. If neither safe table exists, the "old rule syntax"
 * path simply positions the text at the offset and calls previous().
 *
 * @param offset The position to begin searching for a break from.
 * @return The position of the last boundary before the starting position.
 */
int32_t BreakIterator::preceding(int32_t offset) {
    // if we have cached break positions and offset is in the range
    // covered by them, use them
    if (fCachedBreakPositions != NULL) {
        // TODO: binary search?
        // TODO: What if offset is outside range, but break is not?
        if (offset > fCachedBreakPositions[0] &&
                offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
            // Linear scan for the first cached position that is >= offset,
            // then step back one entry to get the one strictly before it.
            fPositionInCache = 0;
            while (fPositionInCache < fNumCachedBreakPositions
                   && offset > fCachedBreakPositions[fPositionInCache])
                ++fPositionInCache;
            --fPositionInCache;
            // If we're at the beginning of the cache, need to reevaluate the
            // rule status
            if (fPositionInCache <= 0) {
                fLastStatusIndexValid = FALSE;
            }
            utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
            return fCachedBreakPositions[fPositionInCache];
        }
        else {
            // Offset is outside the cached range: drop the cache.
            reset();
        }
    }

    // if there is no text, or the offset passed in is already past the end
    // of the text, delegate to last(); if it's before the beginning, return
    // the text's starting offset via first()
    if (fText == NULL || offset > utext_nativeLength(fText)) {
        // return BreakIterator::DONE;
        return last();
    }
    else if (offset < 0) {
        return first();
    }

    // if we start by updating the current iteration position to the
    // position specified by the caller, we can just use previous()
    // to carry out this operation

    if (fSafeFwdTable != NULL) {
        // new rule syntax
        utext_setNativeIndex(fText, offset);
        int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        if (newOffset != offset) {
            // Will come here if specified offset was not a code point boundary AND
            //   the underlying implementation is using UText, which snaps any non-code-point-boundary
            //   indices to the containing code point.
            // For BreakIterator::preceding only, these non-code-point indices need to be moved
            //   up to refer to the following codepoint.
            UTEXT_NEXT32(fText);
            offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        }

        // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
        //        rather than adjusting the position unconditionally?
        //        (Change would interact with safe rules.)
        // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
        //        affects only preceding(), seems cleaner, but is slightly different.
        UTEXT_PREVIOUS32(fText);
        handleNext(fSafeFwdTable);
        int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        // Walk backwards until we land strictly before the requested offset.
        while (result >= offset) {
            result = previous();
        }
        return result;
    }
    if (fSafeRevTable != NULL) {
        // backup plan if forward safe table is not available
        // TODO:  check whether this path can be discarded
        //        It's probably OK to say that rules must supply both safe tables
        //           if they use safe tables at all.  We have certainly never described
        //           to anyone how to work with just one safe table.
        utext_setNativeIndex(fText, offset);
        UTEXT_NEXT32(fText);

        // handle previous will give result <= offset
        handlePrevious(fSafeRevTable);

        // next will give result 0 or 1 boundary away from offset,
        // most of the time
        // we have to
        int32_t oldresult = next();
        // Advance boundary by boundary; the last one seen before reaching
        // the offset is the answer.
        while (oldresult < offset) {
            int32_t result = next();
            if (result >= offset) {
                return oldresult;
            }
            oldresult = result;
        }
        // The first next() already reached or passed the offset: back up
        // once, and once more if that is still not before the offset.
        int32_t result = previous();
        if (result >= offset) {
            return previous();
        }
        return result;
    }

    // old rule syntax
    utext_setNativeIndex(fText, offset);
    return previous();
}
/**
 * Sets the iterator to refer to the first boundary position following
 * the specified position.
 *
 * Three strategies are tried in order: the cached break positions (when the
 * offset falls inside the cached range), the reverse safe table, and the
 * forward safe table. If neither safe table exists, the "old rule syntax"
 * path backs up with previous() and then advances with next().
 *
 * @param offset The position from which to begin searching for a break position.
 * @return The position of the first break after the specified position.
 */
int32_t BreakIterator::following(int32_t offset) {
    // if we have cached break positions and offset is in the range
    // covered by them, use them
    // TODO: could use binary search
    // TODO: what if offset is outside range, but break is not?
    if (fCachedBreakPositions != NULL) {
        if (offset >= fCachedBreakPositions[0]
                && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
            fPositionInCache = 0;
            // We are guaranteed not to leave the array due to range test above
            while (offset >= fCachedBreakPositions[fPositionInCache]) {
                ++fPositionInCache;
            }
            int32_t pos = fCachedBreakPositions[fPositionInCache];
            utext_setNativeIndex(fText, pos);
            return pos;
        }
        else {
            // Offset is outside the cached range: drop the cache.
            reset();
        }
    }

    // if there is no text, or the offset passed in is at or past the end of
    // the text, move to the end with last() and return whatever next() gives
    // from there; if it's before the beginning, return the text's starting
    // offset via first()
    fLastRuleStatusIndex  = 0;
    fLastStatusIndexValid = TRUE;
    if (fText == NULL || offset >= utext_nativeLength(fText)) {
        last();
        return next();
    }
    else if (offset < 0) {
        return first();
    }

    // otherwise, set our internal iteration position (temporarily)
    // to the position passed in.  If this is the _beginning_ position,
    // then we can just use next() to get our return value

    // Only used by the old-rule-syntax path at the bottom; the safe-table
    // branches declare their own `result` that shadows this one.
    int32_t result = 0;

    if (fSafeRevTable != NULL) {
        // new rule syntax
        utext_setNativeIndex(fText, offset);
        // move forward one codepoint to prepare for moving back to a
        // safe point.
        // this handles offset being between a supplementary character
        UTEXT_NEXT32(fText);
        // handlePrevious will move most of the time to < 1 boundary away
        handlePrevious(fSafeRevTable);
        int32_t result = next();
        // Advance until we land strictly after the requested offset.
        while (result <= offset) {
            result = next();
        }
        return result;
    }
    if (fSafeFwdTable != NULL) {
        // backup plan if forward safe table is not available
        utext_setNativeIndex(fText, offset);
        UTEXT_PREVIOUS32(fText);
        // handle next will give result >= offset
        handleNext(fSafeFwdTable);
        // previous will give result 0 or 1 boundary away from offset,
        // most of the time
        // we have to
        int32_t oldresult = previous();
        // Step backwards boundary by boundary; the last one seen before
        // dropping to or below the offset is the answer.
        while (oldresult > offset) {
            int32_t result = previous();
            if (result <= offset) {
                return oldresult;
            }
            oldresult = result;
        }
        // The first previous() already reached or passed the offset: go
        // forward once, and once more if that is still not after the offset.
        int32_t result = next();
        if (result <= offset) {
            return next();
        }
        return result;
    }
    // otherwise, we have to sync up first.  Use handlePrevious() to back
    // up to a known break position before the specified position (if
    // we can determine that the specified position is a break position,
    // we don't back up at all).  This may or may not be the last break
    // position at or before our starting position.  Advance forward
    // from here until we've passed the starting position.  The position
    // we stop on will be the first break position after the specified one.
    // old rule syntax

    utext_setNativeIndex(fText, offset);
    // Note: && binds tighter than ||, so this reads as
    //   offset==0 || (offset==1 && the index snapped back to 0).
    if (offset==0 ||
        offset==1  && utext_getNativeIndex(fText)==0) {
        return next();
    }
    result = previous();

    while (result != BreakIterator::DONE && result <= offset) {
        result = next();
    }

    return result;
}
/**
 * Advances the iterator backwards, to the last boundary preceding this one.
 *
 * Uses the break cache when positioned inside it. Otherwise, when either
 * safe table is present, the boundary comes from handlePrevious() (and, if
 * dictionary characters were seen, checkDictionary()). Without safe tables
 * the "old rule syntax" path backs up to some earlier boundary and then
 * iterates forward with next() to find the last boundary before the start.
 *
 * @return The position of the last boundary position preceding this one, or
 *         BreakIterator::DONE if there is no text or the iterator is
 *         already at offset 0.
 */
int32_t BreakIterator::previous(void) {
    int32_t result;
    int32_t startPos;

    // if we have cached break positions and we're still in the range
    // covered by them, just move one step backward in the cache
    if (fCachedBreakPositions != NULL) {
        if (fPositionInCache > 0) {
            --fPositionInCache;
            // If we're at the beginning of the cache, need to reevaluate the
            // rule status
            if (fPositionInCache <= 0) {
                fLastStatusIndexValid = FALSE;
            }
            int32_t pos = fCachedBreakPositions[fPositionInCache];
            utext_setNativeIndex(fText, pos);
            return pos;
        }
        else {
            // Already on the first cached entry: drop the cache.
            reset();
        }
    }

    // if we're already sitting at the beginning of the text, return DONE
    // (startPos is only assigned when fText is non-NULL, which is the only
    // case in which execution continues past this check)
    if (fText == NULL || (startPos = current()) == 0) {
        fLastRuleStatusIndex  = 0;
        fLastStatusIndexValid = TRUE;
        return BreakIterator::DONE;
    }

    if (fSafeRevTable != NULL || fSafeFwdTable != NULL) {
        result = handlePrevious(fReverseTable);
        if (fDictionaryCharCount > 0) {
            result = checkDictionary(result, startPos, TRUE);
        }
        return result;
    }

    // old rule syntax
    // set things up.  handlePrevious() will back us up to some valid
    // break position before the current position (we back our internal
    // iterator up one step to prevent handlePrevious() from returning
    // the current position), but not necessarily the last one before
    // where we started

    int32_t start = current();

    UTEXT_PREVIOUS32(fText);
    int32_t lastResult = handlePrevious(fReverseTable);
    if (lastResult == UBRK_DONE) {
        // No earlier boundary was reported: restart from the beginning.
        lastResult = 0;
        utext_setNativeIndex(fText, 0);
    }
    result = lastResult;
    int32_t lastTag       = 0;
    UBool   breakTagValid = FALSE;

    // iterate forward from the known break position until we pass our
    // starting point.  The last break position before the starting
    // point is our return value

    for (;;) {
        result         = next();
        if (result == BreakIterator::DONE || result >= start) {
            break;
        }
        lastResult     = result;
        lastTag        = fLastRuleStatusIndex;
        breakTagValid  = TRUE;
    }

    // fLastBreakTag wants to have the value for section of text preceding
    // the result position that we are to return (in lastResult.)  If
    // the backwards rules overshot and the above loop had to do two or more
    // next()s to move up to the desired return position, we will have a valid
    // tag value. But, if handlePrevious() took us to exactly the correct result position,
    // we won't have a tag value for that position, which is only set by handleNext().

    // set the current iteration position to be the last break position
    // before where we started, and then return that value
    utext_setNativeIndex(fText, lastResult);
    fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    fLastStatusIndexValid = breakTagValid;

    // No need to check the dictionary; it will have been handled by
    // next()

    return lastResult;
}
//-------------------------------------------------------------------------------
//
//  checkDictionary       This function handles all processing of characters in
//                        the "dictionary" set. It will determine the appropriate
//                        course of action, and possibly set up a cache in the
//                        process.
//
//  NOTE: the dictionary processing is currently compiled out. With the
//  "#if 1" below, the function just returns the proposed boundary unchanged
//  (startPos when iterating in reverse, endPos otherwise). Everything in the
//  "#else" branch is disabled code kept for reference.
//
//  @param startPos  Start of the span between the previous and proposed boundary.
//  @param endPos    End of that span.
//  @param reverse   TRUE when called while iterating backwards.
//  @return          The boundary position to report to the caller.
//
//-------------------------------------------------------------------------------
int32_t BreakIterator::checkDictionary(int32_t startPos,
                            int32_t endPos,
                            UBool reverse) {
#if 1
    return reverse ? startPos : endPos;
#else
    // Reset the old break cache first.
    uint32_t dictionaryCount = fDictionaryCharCount;
    reset();

    // Nothing to do with at most one dictionary character or a span of at
    // most one position.
    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
        return (reverse ? startPos : endPos);
    }

    // Starting from the starting point, scan towards the proposed result,
    // looking for the first dictionary character (which may be the one
    // we're on, if we're starting in the middle of a range).
    utext_setNativeIndex(fText, reverse ? endPos : startPos);
    if (reverse) {
        UTEXT_PREVIOUS32(fText);
    }

    int32_t rangeStart = startPos;
    int32_t rangeEnd = endPos;

    uint16_t    category;
    int32_t     current;
    UErrorCode  status = U_ZERO_ERROR;
    UStack      breaks(status);
    int32_t     foundBreakCount = 0;
    UChar32     c = utext_current32(fText);

    UTRIE_GET16(&fData->fTrie, c, category);

    // Is the character we're starting on a dictionary character? If so, we
    // need to back up to include the entire run; otherwise the results of
    // the break algorithm will differ depending on where we start. Since
    // the result is cached and there is typically a non-dictionary break
    // within a small number of words, there should be little performance impact.
    // (The 0x4000 bit of the category is what this code tests to recognize a
    // dictionary character.)
    if (category & 0x4000) {
        if (reverse) {
            do {
                utext_next32(fText);          // TODO:  recast to work directly with postincrement.
                c = utext_current32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            } while (c != U_SENTINEL && (category & 0x4000));
            // Back up to the last dictionary character
            rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
            if (c == U_SENTINEL) {
                // c = fText->last32();
                //   TODO:  why was this if needed?
                c = UTEXT_PREVIOUS32(fText);
            }
            else {
                c = UTEXT_PREVIOUS32(fText);
            }
        }
        else {
            do {
                c = UTEXT_PREVIOUS32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            while (c != U_SENTINEL && (category & 0x4000));
            // Back up to the last dictionary character
            if (c == U_SENTINEL) {
                // c = fText->first32();
                c = utext_current32(fText);
            }
            else {
                utext_next32(fText);
                c = utext_current32(fText);
            }
            rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
        }
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.
    // Note: we always do this in the forward direction, so that the break
    // cache is built in the right order.
    if (reverse) {
        utext_setNativeIndex(fText, rangeStart);
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }
    while(U_SUCCESS(status)) {
        // Skip over characters that are not dictionary characters.
        while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
            utext_next32(fText);           // TODO:  tweak for post-increment operation
            c = utext_current32(fText);
            UTRIE_GET16(&fData->fTrie, c, category);
        }
        if (current >= rangeEnd) {
            break;
        }

        // We now have a dictionary character. Get the appropriate language object
        // to deal with it.
        const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);

        // Ask the language object if there are any breaks. It will leave the text
        // pointer on the other side of its range, ready to search for the next one.
        if (lbe != NULL) {
            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
        }

        // Reload the loop variables for the next go-round
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // If we found breaks, build a new break cache. The first and last entries must
    // be the original starting and ending position.
    if (foundBreakCount > 0) {
        int32_t totalBreaks = foundBreakCount;
        if (startPos < breaks.elementAti(0)) {
            totalBreaks += 1;
        }
        if (endPos > breaks.peeki()) {
            totalBreaks += 1;
        }
        fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
        if (fCachedBreakPositions != NULL) {
            int32_t out = 0;
            fNumCachedBreakPositions = totalBreaks;
            if (startPos < breaks.elementAti(0)) {
                fCachedBreakPositions[out++] = startPos;
            }
            for (int32_t i = 0; i < foundBreakCount; ++i) {
                fCachedBreakPositions[out++] = breaks.elementAti(i);
            }
            if (endPos > fCachedBreakPositions[out-1]) {
                fCachedBreakPositions[out] = endPos;
            }
            // If there are breaks, then by definition, we are replacing the original
            // proposed break by one of the breaks we found. Use following() and
            // preceding() to do the work. They should never recurse in this case.
            if (reverse) {
                return preceding(endPos - 1);
            }
            else {
                return following(startPos);
            }
        }
        // If the allocation failed, just fall through to the "no breaks found" case.
    }

    // If we get here, there were no language-based breaks. Set the text pointer
    // to the original proposed break.
    utext_setNativeIndex(fText, reverse ? startPos : endPos);
    return (reverse ? startPos : endPos);
#endif
}
/*
 * Segments a run of dictionary characters into words.
 *
 * The range is extracted, NFKC-normalized if it is not already, and then
 * segmented by dynamic programming: for every character position the
 * dictionary matches starting there are combined with the best cost found
 * so far, with an extra heuristic for runs of Katakana. The resulting
 * boundaries are mapped back to offsets in the raw text and pushed onto
 * foundBreaks.
 *
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output stack that receives the break positions
 * @return The number of breaks found (0 if the range is empty or an ICU
 *         call reported failure)
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    // Try the quick check first; only fall back to the full isNormalized()
    // test when the quick check does not answer UNORM_YES.
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        // Already normalized: charPositions[k] is the offset of the k-th
        // code point within inputString.
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        // Step a Normalizer over the raw input and record its source index
        // after each normalized character.
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while(index < normalizer.endIndex()){
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    // kuint32max marks a position that has not been reached yet.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for(int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for(int i = 0; i <= numChars; i++){
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    // NOTE(review): i counts characters, yet it is passed to
    // utext_setNativeIndex() as a native index; presumably these only agree
    // when the normalized text has no supplementary characters — confirm.
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        // Skip positions that no segmentation has reached.
        if (bestSnlp[i] == kuint32max)
            continue;

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);

        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if((count == 0 || lengths[0] != 1) &&
                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        // Relax the cost of every position reachable by one candidate word.
        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        // Only the first character of a Katakana run starts a candidate.
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        // Follow the prev[] chain from the end back towards position 0.
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_boundary[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_boundary and pushing values to foundBreaks.
    for (int i = numBreaks-1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
/*
 * Segments a run of dictionary characters into words using dictionary
 * lookups with up to KHMER_LOOKAHEAD words of lookahead.
 *
 * At each position the candidate words are collected. With several
 * candidates, the one that is followed by the most further dictionary words
 * is preferred. Text that is not in the dictionary is skipped up to a
 * plausible word boundary, and a word is never ended before a character in
 * fMarkSet. Each word end found is pushed onto foundBreaks.
 *
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output stack that receives the break positions
 * @return The number of words found (not counting one that ends at rangeEnd)
 */
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    // Ring of candidate sets, indexed by wordsFound modulo KHMER_LOOKAHEAD.
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            // Try each candidate for the first word in turn (backUp() moves
            // to the next one to try).
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            // Commit to whichever first-word candidate is currently marked.
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current+wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    // A boundary is plausible between a character that can
                    // end a word and one that can begin a word.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        // candidates() may have moved the text; put it back.
                        utext_setNativeIndex(text, current+wordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // (The suffix handling below is disabled.)
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current+wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
// Commit to the candidate selected by `mark`: place the text at
// offset + lengths[mark] and return that candidate's length.
inline int32_t
PossibleWord::acceptMarked( UText *text ) {
    const int32_t markedLength = lengths[mark];
    utext_setNativeIndex(text, offset + markedLength);
    return markedLength;
}
static void TestAPI(void) { UErrorCode status = U_ZERO_ERROR; UBool gFailed = FALSE; (void)gFailed; /* Suppress set but not used warning. */ /* Open */ { UText utLoc = UTEXT_INITIALIZER; const char * cString = "\x61\x62\x63\x64"; UChar uString[] = {0x41, 0x42, 0x43, 0}; UText *uta; UText *utb; UChar c; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); c = utext_next32(uta); TEST_ASSERT(c == 0x41); utb = utext_close(uta); TEST_ASSERT(utb == NULL); uta = utext_openUTF8(&utLoc, cString, -1, &status); TEST_SUCCESS(status); TEST_ASSERT(uta == &utLoc); uta = utext_close(&utLoc); TEST_ASSERT(uta == &utLoc); } /* utext_clone() */ { UChar uString[] = {0x41, 0x42, 0x43, 0}; int64_t len; UText *uta; UText *utb; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); utb = utext_clone(NULL, uta, FALSE, FALSE, &status); TEST_SUCCESS(status); TEST_ASSERT(utb != NULL); TEST_ASSERT(utb != uta); len = utext_nativeLength(uta); TEST_ASSERT(len == u_strlen(uString)); utext_close(uta); utext_close(utb); } /* basic access functions */ { UChar uString[] = {0x41, 0x42, 0x43, 0}; UText *uta; UChar32 c; int64_t len; UBool b; int64_t i; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_ASSERT(uta!=NULL); TEST_SUCCESS(status); b = utext_isLengthExpensive(uta); TEST_ASSERT(b==TRUE); len = utext_nativeLength(uta); TEST_ASSERT(len == u_strlen(uString)); b = utext_isLengthExpensive(uta); TEST_ASSERT(b==FALSE); c = utext_char32At(uta, 0); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[0]); c = utext_next32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[1]); c = utext_previous32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[0]); c = utext_next32From(uta, 1); TEST_ASSERT(c==uString[1]); c = utext_next32From(uta, u_strlen(uString)); TEST_ASSERT(c==U_SENTINEL); c = utext_previous32From(uta, 2); 
TEST_ASSERT(c==uString[1]); i = utext_getNativeIndex(uta); TEST_ASSERT(i == 1); utext_setNativeIndex(uta, 0); b = utext_moveIndex32(uta, 1); TEST_ASSERT(b==TRUE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==1); b = utext_moveIndex32(uta, u_strlen(uString)-1); TEST_ASSERT(b==TRUE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==u_strlen(uString)); b = utext_moveIndex32(uta, 1); TEST_ASSERT(b==FALSE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==u_strlen(uString)); utext_setNativeIndex(uta, 0); c = UTEXT_NEXT32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[1]); c = UTEXT_PREVIOUS32(uta); TEST_ASSERT(c==uString[0]); c = UTEXT_PREVIOUS32(uta); TEST_ASSERT(c==U_SENTINEL); utext_close(uta); } { /* * UText opened on a NULL string with zero length */ UText *uta; UChar32 c; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, NULL, 0, &status); TEST_SUCCESS(status); c = UTEXT_NEXT32(uta); TEST_ASSERT(c == U_SENTINEL); utext_close(uta); uta = utext_openUTF8(NULL, NULL, 0, &status); TEST_SUCCESS(status); c = UTEXT_NEXT32(uta); TEST_ASSERT(c == U_SENTINEL); utext_close(uta); } { /* * extract */ UText *uta; UChar uString[] = {0x41, 0x42, 0x43, 0}; UChar buf[100]; int32_t i; /* Test pinning of input bounds */ UChar uString2[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0}; UChar * uString2Ptr = uString2 + 5; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); status = U_ZERO_ERROR; i = utext_extract(uta, 0, 100, NULL, 0, &status); TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(i == u_strlen(uString)); status = U_ZERO_ERROR; memset(buf, 0, sizeof(buf)); i = utext_extract(uta, 0, 100, buf, 100, &status); TEST_SUCCESS(status); TEST_ASSERT(i == u_strlen(uString)); i = u_strcmp(uString, buf); TEST_ASSERT(i == 0); utext_close(uta); /* Test pinning of input bounds */ status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString2Ptr, -1, &status); TEST_SUCCESS(status); status = 
U_ZERO_ERROR; memset(buf, 0, sizeof(buf)); i = utext_extract(uta, -3, 20, buf, 100, &status); TEST_SUCCESS(status); TEST_ASSERT(i == u_strlen(uString2Ptr)); i = u_strcmp(uString2Ptr, buf); TEST_ASSERT(i == 0); utext_close(uta); } { /* * Copy, Replace, isWritable * Can't create an editable UText from plain C, so all we * can easily do is check that errors returned. */ UText *uta; UChar uString[] = {0x41, 0x42, 0x43, 0}; UBool b; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); b = utext_isWritable(uta); TEST_ASSERT(b == FALSE); b = utext_hasMetaData(uta); TEST_ASSERT(b == FALSE); utext_replace(uta, 0, 1, /* start, limit */ uString, -1, /* replacement, replacement length */ &status); TEST_ASSERT(status == U_NO_WRITE_PERMISSION); utext_copy(uta, 0, 1, /* start, limit */ 2, /* destination index */ FALSE, /* move flag */ &status); TEST_ASSERT(status == U_NO_WRITE_PERMISSION); utext_close(uta); } }