int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, int32_t *lengths, int32_t *cpLengths, int32_t *values, int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const { UCharsTrie uct(characters); int32_t startingTextIndex = utext_getNativeIndex(text); int32_t wordCount = 0; int32_t codePointsMatched = 0; for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex; codePointsMatched += 1; if (ignoreSet != NULL && ignoreSet->contains(c)) { continue; } if (USTRINGTRIE_HAS_VALUE(result)) { if (codePointsMatched < minLength) { continue; } if (wordCount < limit) { if (values != NULL) { values[wordCount] = uct.getValue(); } if (lengths != NULL) { lengths[wordCount] = lengthMatched; } if (cpLengths != NULL) { cpLengths[wordCount] = codePointsMatched; } ++wordCount; } if (result == USTRINGTRIE_FINAL_VALUE) { break; } } else if (result == USTRINGTRIE_NO_MATCH) { break; } if (lengthMatched >= maxLength) { break; } } if (prefix != NULL) { *prefix = codePointsMatched; } return wordCount; }
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, int32_t *lengths, int32_t *cpLengths, int32_t *values, int32_t *prefix) const { BytesTrie bt(characters); int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); int32_t wordCount = 0; int32_t codePointsMatched = 0; for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; codePointsMatched += 1; if (USTRINGTRIE_HAS_VALUE(result)) { if (wordCount < limit) { if (values != NULL) { values[wordCount] = bt.getValue(); } if (lengths != NULL) { lengths[wordCount] = lengthMatched; } if (cpLengths != NULL) { cpLengths[wordCount] = codePointsMatched; } ++wordCount; } if (result == USTRINGTRIE_FINAL_VALUE) { break; } } else if (result == USTRINGTRIE_NO_MATCH) { break; } if (lengthMatched >= maxLength) { break; } } if (prefix != NULL) { *prefix = codePointsMatched; } return wordCount; }
// Appends two entries to foundBreaks: the current native index of `text`
// followed by endPos. The break type is ignored, and the text position is
// not moved. Errors are reported only through `status`, which is handed to
// both addElement() calls.
void UnhandledEngine::findBreaks(UText *text,
                                 int32_t endPos,
                                 int32_t /* breakType */,
                                 UVector32 &foundBreaks,
                                 UErrorCode &status) const {
    foundBreaks.addElement((int32_t)utext_getNativeIndex(text), status);
    foundBreaks.addElement(endPos, status);
}
int32_t DictionaryBreakEngine::findBreaks( UText *text, int32_t startPos, int32_t endPos, UBool reverse, int32_t breakType, UStack &foundBreaks ) const { int32_t result = 0; // Find the span of characters included in the set. int32_t start = (int32_t)utext_getNativeIndex(text); int32_t current; int32_t rangeStart; int32_t rangeEnd; UChar32 c = utext_current32(text); if (reverse) { UBool isDict = fSet.contains(c); while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { c = utext_previous32(text); isDict = fSet.contains(c); } rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1); rangeEnd = start + 1; } else { while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { utext_next32(text); // TODO: recast loop for postincrement c = utext_current32(text); } rangeStart = start; rangeEnd = current; } if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); utext_setNativeIndex(text, current); } return result; }
// Skips over the run of characters that are members of the set registered
// for breakType in fHandled, starting at the current text position. No breaks
// are ever produced; the function always returns 0.
//
//   text       input; its position is moved past the run
//   startPos   lower bound when scanning in reverse
//   endPos     upper bound when scanning forwards
//   reverse    TRUE to scan backwards
//   breakType  index into fHandled; out-of-range values are ignored
//
// A slot of fHandled that holds no set is treated like an out-of-range break
// type: the text is left untouched instead of dereferencing a null pointer.
int32_t UnhandledEngine::findBreaks(UText *text,
                                    int32_t startPos,
                                    int32_t endPos,
                                    UBool reverse,
                                    int32_t breakType,
                                    UStack &/*foundBreaks*/) const {
    if (breakType < 0 || breakType >= (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
        return 0;
    }
    if (fHandled[breakType] == NULL) {
        // Nothing registered for this break type, so there is no run to skip.
        return 0;
    }

    UChar32 c = utext_current32(text);
    if (reverse) {
        while ((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
            c = utext_previous32(text);
        }
    } else {
        while ((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
            utext_next32(text);            // TODO: recast loop to work with post-increment operations.
            c = utext_current32(text);
        }
    }
    return 0;
}
// Advances `text` past the run of characters that are members of the set
// registered for breakType in fHandled, stopping at endPos. No breaks are
// ever produced; the function always returns 0.
//
//   text       input; its position is moved forward past the run
//   endPos     upper bound of the scan
//   breakType  index into fHandled; out-of-range values are ignored
//
// A slot of fHandled that holds no set is treated like an out-of-range break
// type: the text is left untouched instead of dereferencing a null pointer.
int32_t UnhandledEngine::findBreaks(UText *text,
                                    int32_t /* startPos */,
                                    int32_t endPos,
                                    int32_t breakType,
                                    UVector32 &/*foundBreaks*/) const {
    if (breakType < 0 || breakType >= UPRV_LENGTHOF(fHandled)) {
        return 0;
    }
    if (fHandled[breakType] == NULL) {
        // Nothing registered for this break type, so there is no run to skip.
        return 0;
    }

    UChar32 c = utext_current32(text);
    while ((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
        utext_next32(text);            // TODO: recast loop to work with post-increment operations.
        c = utext_current32(text);
    }
    return 0;
}
// Re-points the iterator's UText (m_text) at `input` by cloning `input` into
// it, then restores the native index the iterator had before the call.
//
//   input   replacement text; a null pointer yields U_ILLEGAL_ARGUMENT_ERROR
//   status  in/out error code; nothing is done when it already signals failure
//
// U_ILLEGAL_ARGUMENT_ERROR is also set when the saved index cannot be
// restored exactly in the refreshed text. Always returns *this.
CodePointBreakIterator& CodePointBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
    if (!U_FAILURE(status)) {
        if (!input) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
        } else {
            const int64_t savedIndex = utext_getNativeIndex(m_text);

            m_text = utext_clone(m_text, input, false, true, &status);
            if (!U_FAILURE(status)) {
                utext_setNativeIndex(m_text, savedIndex);
                if (utext_getNativeIndex(m_text) != savedIndex) {
                    // The new text could not take up the old position.
                    status = U_ILLEGAL_ARGUMENT_ERROR;
                }
            }
        }
    }
    return *this;
}
// Fills in the list of candidate words that start at the current position of
// `text` and returns how many there are.
//
// The dictionary is consulted only when the text position differs from the
// position of the previous lookup (offset); otherwise the stored results are
// reused. On return the text is positioned after the last (count-1) candidate
// when there is one, and `current` and `mark` both select that candidate.
//
// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
inline int PossibleWord::candidates(UText *text, DictionaryMatcher *dict, int32_t rangeEnd) {
    const int32_t begin = (int32_t)utext_getNativeIndex(text);
    const UBool freshLookup = (begin != offset);

    if (freshLookup) {
        offset = begin;
        prefix = dict->matches(text, rangeEnd - begin, lengths, count,
                               sizeof(lengths)/sizeof(lengths[0]));
    }

    if (count > 0) {
        utext_setNativeIndex(text, begin + lengths[count - 1]);
    } else if (freshLookup) {
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        utext_setNativeIndex(text, begin);
    }

    current = count - 1;
    mark = current;
    return count;
}
//----------------------------------------------------------------------------------- // // handlePrevious() // // Iterate backwards, according to the logic of the reverse rules. // This version handles the exact style backwards rules. // // The logic of this function is very similar to handleNext(), above. // //----------------------------------------------------------------------------------- int32_t BreakIterator::handlePrevious(const RBBIStateTable *statetable) { int32_t state; int16_t category = 0; RBBIRunMode mode; RBBIStateTableRow *row; UChar32 c; int32_t lookaheadStatus = 0; int32_t result = 0; int32_t initialPosition = 0; int32_t lookaheadResult = 0; UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; #ifdef RBBI_DEBUG if (fTrace) { RBBIDebugPuts("Handle Previous pos char state category"); } #endif // handlePrevious() never gets the rule status. // Flag the status as invalid; if the user ever asks for status, we will need // to back up, then re-find the break position using handleNext(), which does // get the status value. fLastStatusIndexValid = FALSE; fLastRuleStatusIndex = 0; // if we're already at the start of the text, return DONE. if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { return BreakIterator::DONE; } // Set up the starting char. initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); result = initialPosition; c = UTEXT_PREVIOUS32(fText); // Set the initial state for the state machine state = START_STATE; row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state)); category = 3; mode = RBBI_RUN; if (statetable->fFlags & RBBI_BOF_REQUIRED) { category = 2; mode = RBBI_START; } // loop until we reach the start of the text or transition to state 0 // for (;;) { if (c == U_SENTINEL) { // Reached end of input string. 
if (mode == RBBI_END || *(int32_t *)fHeader->fFormatVersion == 1 ) { // We have already run the loop one last time with the // character set to the psueudo {eof} value. Now it is time // to unconditionally bail out. // (Or we have an old format binary rule file that does not support {eof}.) if (lookaheadResult < result) { // We ran off the end of the string with a pending look-ahead match. // Treat this as if the look-ahead condition had been met, and return // the match at the / position from the look-ahead rule. result = lookaheadResult; lookaheadStatus = 0; } else if (result == initialPosition) { // Ran off start, no match found. // move one index one (towards the start, since we are doing a previous()) UTEXT_SETNATIVEINDEX(fText, initialPosition); UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. } break; } // Run the loop one last time with the fake end-of-input character category. mode = RBBI_END; category = 1; } // // Get the char category. An incoming category of 1 or 2 means that // we are preset for doing the beginning or end of input, and // that we shouldn't get a category from an actual text input character. // if (mode == RBBI_RUN) { // look up the current character's character category, which tells us // which column in the state table to look at. // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, // not the size of the character going in, which is a UChar32. // UTRIE_GET16(&fTrie, c, category); // Check the dictionary bit in the character's category. // Counter is only used by dictionary based iterators (subclasses). // Chars that need to be handled by a dictionary have a flag bit set // in their category values. // if ((category & 0x4000) != 0) { fDictionaryCharCount++; // And off the dictionary flag bit. 
category &= ~0x4000; } } #ifdef RBBI_DEBUG if (fTrace) { RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); if (0x20<=c && c<0x7f) { RBBIDebugPrintf("\"%c\" ", c); } else { RBBIDebugPrintf("%5x ", c); } RBBIDebugPrintf("%3d %3d\n", state, category); } #endif // State Transition - move machine to its next state // state = row->fNextState[category]; row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state)); if (row->fAccepting == -1) { // Match found, common case. result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } if (row->fLookAhead != 0) { if (lookaheadStatus != 0 && row->fAccepting == lookaheadStatus) { // Lookahead match is completed. result = lookaheadResult; lookaheadStatus = 0; // TODO: make a standalone hard break in a rule work. if (lookAheadHardBreak) { UTEXT_SETNATIVEINDEX(fText, result); return result; } // Look-ahead completed, but other rules may match further. Continue on // TODO: junk this feature? I don't think it's used anywhwere. goto continueOn; } int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); lookaheadResult = r; lookaheadStatus = row->fLookAhead; goto continueOn; } if (row->fAccepting != 0) { // Because this is an accepting state, any in-progress look-ahead match // is no longer relavant. Clear out the pending lookahead status. lookaheadStatus = 0; } continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. // We have advanced through the string until it is certain that no // longer match is possible, no matter what characters follow. break; } // Move (backwards) to the next character to process. // If this is a beginning-of-input loop iteration, don't advance // the input position. The next iteration will be processing the // first real input character. if (mode == RBBI_RUN) { c = UTEXT_PREVIOUS32(fText); } else { if (mode == RBBI_START) { mode = RBBI_RUN; } } } // The state machine is done. Check whether it found a match... 
// If the iterator failed to advance in the match engine, force it ahead by one. // (This really indicates a defect in the break rules. They should always match // at least one character.) if (result == initialPosition) { UTEXT_SETNATIVEINDEX(fText, initialPosition); UTEXT_PREVIOUS32(fText); result = (int32_t)UTEXT_GETNATIVEINDEX(fText); } // Leave the iterator at our result position. UTEXT_SETNATIVEINDEX(fText, result); #ifdef RBBI_DEBUG if (fTrace) { RBBIDebugPrintf("result = %d\n\n", result); } #endif return result; }
/** * Sets the iterator to refer to the first boundary position following * the specified position. * @offset The position from which to begin searching for a break position. * @return The position of the first break after the current position. */ int32_t BreakIterator::following(int32_t offset) { // if we have cached break positions and offset is in the range // covered by them, use them // TODO: could use binary search // TODO: what if offset is outside range, but break is not? if (fCachedBreakPositions != NULL) { if (offset >= fCachedBreakPositions[0] && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { fPositionInCache = 0; // We are guaranteed not to leave the array due to range test above while (offset >= fCachedBreakPositions[fPositionInCache]) { ++fPositionInCache; } int32_t pos = fCachedBreakPositions[fPositionInCache]; utext_setNativeIndex(fText, pos); return pos; } else { reset(); } } // if the offset passed in is already past the end of the text, // just return DONE; if it's before the beginning, return the // text's starting offset fLastRuleStatusIndex = 0; fLastStatusIndexValid = TRUE; if (fText == NULL || offset >= utext_nativeLength(fText)) { last(); return next(); } else if (offset < 0) { return first(); } // otherwise, set our internal iteration position (temporarily) // to the position passed in. If this is the _beginning_ position, // then we can just use next() to get our return value int32_t result = 0; if (fSafeRevTable != NULL) { // new rule syntax utext_setNativeIndex(fText, offset); // move forward one codepoint to prepare for moving back to a // safe point. 
// this handles offset being between a supplementary character UTEXT_NEXT32(fText); // handlePrevious will move most of the time to < 1 boundary away handlePrevious(fSafeRevTable); int32_t result = next(); while (result <= offset) { result = next(); } return result; } if (fSafeFwdTable != NULL) { // backup plan if forward safe table is not available utext_setNativeIndex(fText, offset); UTEXT_PREVIOUS32(fText); // handle next will give result >= offset handleNext(fSafeFwdTable); // previous will give result 0 or 1 boundary away from offset, // most of the time // we have to int32_t oldresult = previous(); while (oldresult > offset) { int32_t result = previous(); if (result <= offset) { return oldresult; } oldresult = result; } int32_t result = next(); if (result <= offset) { return next(); } return result; } // otherwise, we have to sync up first. Use handlePrevious() to back // up to a known break position before the specified position (if // we can determine that the specified position is a break position, // we don't back up at all). This may or may not be the last break // position at or before our starting position. Advance forward // from here until we've passed the starting position. The position // we stop on will be the first break position after the specified one. // old rule syntax utext_setNativeIndex(fText, offset); if (offset==0 || offset==1 && utext_getNativeIndex(fText)==0) { return next(); } result = previous(); while (result != BreakIterator::DONE && result <= offset) { result = next(); } return result; }
int32_t KhmerBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UStack &foundBreaks ) const { if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { return 0; // Not enough characters for two words } uint32_t wordsFound = 0; int32_t wordLength; int32_t current; UErrorCode status = U_ZERO_ERROR; PossibleWord words[KHMER_LOOKAHEAD]; UChar32 uc; utext_setNativeIndex(text, rangeStart); while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { wordLength = 0; // Look for candidate words at the current position int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); // If we found exactly one, use that if (candidates == 1) { wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); wordsFound += 1; } // If there was more than one, see which one can take us forward the most words else if (candidates > 1) { // If we're already at the end of the range, we're done if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { goto foundBest; } do { int wordsMatched = 1; if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { if (wordsMatched < 2) { // Followed by another dictionary word; mark first word as a good candidate words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); wordsMatched = 2; } // If we're already at the end of the range, we're done if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { goto foundBest; } // See if any of the possible second words is followed by a third word do { // If we find a third word, stop right away if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { words[wordsFound % KHMER_LOOKAHEAD].markCurrent(); goto foundBest; } } while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text)); } } while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text)); foundBest: wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); wordsFound += 1; } // We come here after 
having either found a word or not. We look ahead to the // next word. If it's not a dictionary word, we will combine it with the word we // just found (if there is one), but only if the preceding word does not exceed // the threshold. // The text iterator should now be positioned at the end of the word we found. if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) { // if it is a dictionary word, do nothing. If it isn't, then if there is // no preceding word, or the non-word shares less than the minimum threshold // of characters with a dictionary word, then scan to resynchronize if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 && (wordLength == 0 || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { // Look for a plausible word boundary //TODO: This section will need a rework for UText. int32_t remaining = rangeEnd - (current+wordLength); UChar32 pc = utext_current32(text); int32_t chars = 0; for (;;) { utext_next32(text); uc = utext_current32(text); // TODO: Here we're counting on the fact that the SA languages are all // in the BMP. This should get fixed with the UText rework. chars += 1; if (--remaining <= 0) { break; } if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { // Maybe. See if it's in the dictionary. int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); utext_setNativeIndex(text, current+wordLength+chars); if (candidates > 0) { break; } } pc = uc; } // Bump the word count if there wasn't already one if (wordLength <= 0) { wordsFound += 1; } // Update the length with the passed-over characters wordLength += chars; } else { // Back up to where we were for next iteration utext_setNativeIndex(text, current+wordLength); } } // Never stop before a combining mark. 
int32_t currPos; while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { utext_next32(text); wordLength += (int32_t)utext_getNativeIndex(text) - currPos; } // Look ahead for possible suffixes if a dictionary word does not follow. // We do this in code rather than using a rule so that the heuristic // resynch continues to function. For example, one of the suffix characters // could be a typo in the middle of a word. // if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { // if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 // && fSuffixSet.contains(uc = utext_current32(text))) { // if (uc == KHMER_PAIYANNOI) { // if (!fSuffixSet.contains(utext_previous32(text))) { // // Skip over previous end and PAIYANNOI // utext_next32(text); // utext_next32(text); // wordLength += 1; // Add PAIYANNOI to word // uc = utext_current32(text); // Fetch next character // } // else { // // Restore prior position // utext_next32(text); // } // } // if (uc == KHMER_MAIYAMOK) { // if (utext_previous32(text) != KHMER_MAIYAMOK) { // // Skip over previous end and MAIYAMOK // utext_next32(text); // utext_next32(text); // wordLength += 1; // Add MAIYAMOK to word // } // else { // // Restore prior position // utext_next32(text); // } // } // } // else { // utext_setNativeIndex(text, current+wordLength); // } // } // Did we find a word on this iteration? If so, push it on the break stack if (wordLength > 0) { foundBreaks.push((current+wordLength), status); } } // Don't return a break for the end of the dictionary range if there is one there. if (foundBreaks.peeki() >= rangeEnd) { (void) foundBreaks.popi(); wordsFound -= 1; } return wordsFound; }
static void TestAPI(void) { UErrorCode status = U_ZERO_ERROR; UBool gFailed = FALSE; (void)gFailed; /* Suppress set but not used warning. */ /* Open */ { UText utLoc = UTEXT_INITIALIZER; const char * cString = "\x61\x62\x63\x64"; UChar uString[] = {0x41, 0x42, 0x43, 0}; UText *uta; UText *utb; UChar c; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); c = utext_next32(uta); TEST_ASSERT(c == 0x41); utb = utext_close(uta); TEST_ASSERT(utb == NULL); uta = utext_openUTF8(&utLoc, cString, -1, &status); TEST_SUCCESS(status); TEST_ASSERT(uta == &utLoc); uta = utext_close(&utLoc); TEST_ASSERT(uta == &utLoc); } /* utext_clone() */ { UChar uString[] = {0x41, 0x42, 0x43, 0}; int64_t len; UText *uta; UText *utb; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); utb = utext_clone(NULL, uta, FALSE, FALSE, &status); TEST_SUCCESS(status); TEST_ASSERT(utb != NULL); TEST_ASSERT(utb != uta); len = utext_nativeLength(uta); TEST_ASSERT(len == u_strlen(uString)); utext_close(uta); utext_close(utb); } /* basic access functions */ { UChar uString[] = {0x41, 0x42, 0x43, 0}; UText *uta; UChar32 c; int64_t len; UBool b; int64_t i; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_ASSERT(uta!=NULL); TEST_SUCCESS(status); b = utext_isLengthExpensive(uta); TEST_ASSERT(b==TRUE); len = utext_nativeLength(uta); TEST_ASSERT(len == u_strlen(uString)); b = utext_isLengthExpensive(uta); TEST_ASSERT(b==FALSE); c = utext_char32At(uta, 0); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[0]); c = utext_next32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[1]); c = utext_previous32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[0]); c = utext_next32From(uta, 1); TEST_ASSERT(c==uString[1]); c = utext_next32From(uta, u_strlen(uString)); TEST_ASSERT(c==U_SENTINEL); c = utext_previous32From(uta, 2); 
TEST_ASSERT(c==uString[1]); i = utext_getNativeIndex(uta); TEST_ASSERT(i == 1); utext_setNativeIndex(uta, 0); b = utext_moveIndex32(uta, 1); TEST_ASSERT(b==TRUE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==1); b = utext_moveIndex32(uta, u_strlen(uString)-1); TEST_ASSERT(b==TRUE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==u_strlen(uString)); b = utext_moveIndex32(uta, 1); TEST_ASSERT(b==FALSE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==u_strlen(uString)); utext_setNativeIndex(uta, 0); c = UTEXT_NEXT32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[1]); c = UTEXT_PREVIOUS32(uta); TEST_ASSERT(c==uString[0]); c = UTEXT_PREVIOUS32(uta); TEST_ASSERT(c==U_SENTINEL); utext_close(uta); } { /* * UText opened on a NULL string with zero length */ UText *uta; UChar32 c; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, NULL, 0, &status); TEST_SUCCESS(status); c = UTEXT_NEXT32(uta); TEST_ASSERT(c == U_SENTINEL); utext_close(uta); uta = utext_openUTF8(NULL, NULL, 0, &status); TEST_SUCCESS(status); c = UTEXT_NEXT32(uta); TEST_ASSERT(c == U_SENTINEL); utext_close(uta); } { /* * extract */ UText *uta; UChar uString[] = {0x41, 0x42, 0x43, 0}; UChar buf[100]; int32_t i; /* Test pinning of input bounds */ UChar uString2[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0}; UChar * uString2Ptr = uString2 + 5; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); status = U_ZERO_ERROR; i = utext_extract(uta, 0, 100, NULL, 0, &status); TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(i == u_strlen(uString)); status = U_ZERO_ERROR; memset(buf, 0, sizeof(buf)); i = utext_extract(uta, 0, 100, buf, 100, &status); TEST_SUCCESS(status); TEST_ASSERT(i == u_strlen(uString)); i = u_strcmp(uString, buf); TEST_ASSERT(i == 0); utext_close(uta); /* Test pinning of input bounds */ status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString2Ptr, -1, &status); TEST_SUCCESS(status); status = 
U_ZERO_ERROR; memset(buf, 0, sizeof(buf)); i = utext_extract(uta, -3, 20, buf, 100, &status); TEST_SUCCESS(status); TEST_ASSERT(i == u_strlen(uString2Ptr)); i = u_strcmp(uString2Ptr, buf); TEST_ASSERT(i == 0); utext_close(uta); } { /* * Copy, Replace, isWritable * Can't create an editable UText from plain C, so all we * can easily do is check that errors returned. */ UText *uta; UChar uString[] = {0x41, 0x42, 0x43, 0}; UBool b; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); b = utext_isWritable(uta); TEST_ASSERT(b == FALSE); b = utext_hasMetaData(uta); TEST_ASSERT(b == FALSE); utext_replace(uta, 0, 1, /* start, limit */ uString, -1, /* replacement, replacement length */ &status); TEST_ASSERT(status == U_NO_WRITE_PERMISSION); utext_copy(uta, 0, 1, /* start, limit */ 2, /* destination index */ FALSE, /* move flag */ &status); TEST_ASSERT(status == U_NO_WRITE_PERMISSION); utext_close(uta); } }