int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t /* startPos */,
                             int32_t endPos,
                             int32_t breakType,
                             UVector32 &/*foundBreaks*/ ) const {
    if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
        UChar32 c = utext_current32(text);
        while ((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
            utext_next32(text);            // TODO: recast loop to work with post-increment operations.
            c = utext_current32(text);
        }
    }
    return 0;
}
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                   int32_t startPos,
                                   int32_t endPos,
                                   UBool reverse,
                                   int32_t breakType,
                                   UStack &foundBreaks ) const {
    int32_t result = 0;

    // Find the span of characters included in the set.
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    if (reverse) {
        UBool isDict = fSet.contains(c);
        while ((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
            c = utext_previous32(text);
            isDict = fSet.contains(c);
        }
        rangeStart = (current < startPos) ? startPos : current + (isDict ? 0 : 1);
        rangeEnd = start + 1;
    }
    else {
        while ((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
            utext_next32(text);            // TODO: recast loop for postincrement
            c = utext_current32(text);
        }
        rangeStart = start;
        rangeEnd = current;
    }
    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        utext_setNativeIndex(text, current);
    }

    return result;
}
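// Illustrative sketch (not part of the ICU sources above): the forward branch of
// DictionaryBreakEngine::findBreaks() simply spans the run of characters that are
// members of the engine's UnicodeSet. A minimal standalone version of that scan,
// assuming <unicode/uniset.h> and <unicode/utext.h> are included and the caller
// supplies the set and a positioned UText, might look like this.
static int32_t spanSetForward(UText *text, const icu::UnicodeSet &set, int32_t endPos) {
    UChar32 c = utext_current32(text);
    int32_t current;
    while ((current = (int32_t)utext_getNativeIndex(text)) < endPos && set.contains(c)) {
        utext_next32(text);
        c = utext_current32(text);
    }
    return current;    // native index where the run stops: a non-member, or endPos
}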
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t startPos,
                             int32_t endPos,
                             UBool reverse,
                             int32_t breakType,
                             UStack &/*foundBreaks*/ ) const {
    if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
        UChar32 c = utext_current32(text);
        if (reverse) {
            while ((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
                c = utext_previous32(text);
            }
        }
        else {
            while ((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
                utext_next32(text);            // TODO: recast loop to work with post-increment operations.
                c = utext_current32(text);
            }
        }
    }
    return 0;
}
//-------------------------------------------------------------------------------
//
//  checkDictionary       This function handles all processing of characters in
//                        the "dictionary" set. It will determine the appropriate
//                        course of action, and possibly set up a cache in the
//                        process.
//
//-------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
                                                int32_t endPos,
                                                UBool reverse) {
#if 1
    return reverse ? startPos : endPos;
#else
    // Reset the old break cache first.
    uint32_t dictionaryCount = fDictionaryCharCount;
    reset();

    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
        return (reverse ? startPos : endPos);
    }

    // Starting from the starting point, scan towards the proposed result,
    // looking for the first dictionary character (which may be the one
    // we're on, if we're starting in the middle of a range).
    utext_setNativeIndex(fText, reverse ? endPos : startPos);
    if (reverse) {
        UTEXT_PREVIOUS32(fText);
    }

    int32_t rangeStart = startPos;
    int32_t rangeEnd = endPos;

    uint16_t    category;
    int32_t     current;
    UErrorCode  status = U_ZERO_ERROR;
    UStack      breaks(status);
    int32_t     foundBreakCount = 0;
    UChar32     c = utext_current32(fText);

    UTRIE_GET16(&fData->fTrie, c, category);

    // Is the character we're starting on a dictionary character? If so, we
    // need to back up to include the entire run; otherwise the results of
    // the break algorithm will differ depending on where we start. Since
    // the result is cached and there is typically a non-dictionary break
    // within a small number of words, there should be little performance impact.
    if (category & 0x4000) {
        if (reverse) {
            do {
                utext_next32(fText);          // TODO: recast to work directly with postincrement.
                c = utext_current32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            } while (c != U_SENTINEL && (category & 0x4000));
            // Back up to the last dictionary character
            rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
            if (c == U_SENTINEL) {
                // c = fText->last32();
                // TODO: why was this if needed?
                c = UTEXT_PREVIOUS32(fText);
            }
            else {
                c = UTEXT_PREVIOUS32(fText);
            }
        }
        else {
            do {
                c = UTEXT_PREVIOUS32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            } while (c != U_SENTINEL && (category & 0x4000));
            // Back up to the last dictionary character
            if (c == U_SENTINEL) {
                // c = fText->first32();
                c = utext_current32(fText);
            }
            else {
                utext_next32(fText);
                c = utext_current32(fText);
            }
            rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        }
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.
    // Note: we always do this in the forward direction, so that the break
    // cache is built in the right order.
    if (reverse) {
        utext_setNativeIndex(fText, rangeStart);
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }
    while (U_SUCCESS(status)) {
        while ((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
            utext_next32(fText);           // TODO: tweak for post-increment operation
            c = utext_current32(fText);
            UTRIE_GET16(&fData->fTrie, c, category);
        }
        if (current >= rangeEnd) {
            break;
        }

        // We now have a dictionary character. Get the appropriate language object
        // to deal with it.
        const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);

        // Ask the language object if there are any breaks. It will leave the text
        // pointer on the other side of its range, ready to search for the next one.
        if (lbe != NULL) {
            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
        }

        // Reload the loop variables for the next go-round
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // If we found breaks, build a new break cache. The first and last entries must
    // be the original starting and ending position.
    if (foundBreakCount > 0) {
        int32_t totalBreaks = foundBreakCount;
        if (startPos < breaks.elementAti(0)) {
            totalBreaks += 1;
        }
        if (endPos > breaks.peeki()) {
            totalBreaks += 1;
        }
        fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
        if (fCachedBreakPositions != NULL) {
            int32_t out = 0;
            fNumCachedBreakPositions = totalBreaks;
            if (startPos < breaks.elementAti(0)) {
                fCachedBreakPositions[out++] = startPos;
            }
            for (int32_t i = 0; i < foundBreakCount; ++i) {
                fCachedBreakPositions[out++] = breaks.elementAti(i);
            }
            if (endPos > fCachedBreakPositions[out-1]) {
                fCachedBreakPositions[out] = endPos;
            }
            // If there are breaks, then by definition, we are replacing the original
            // proposed break by one of the breaks we found. Use following() and
            // preceding() to do the work. They should never recurse in this case.
            if (reverse) {
                return preceding(endPos - 1);
            }
            else {
                return following(startPos);
            }
        }
        // If the allocation failed, just fall through to the "no breaks found" case.
    }

    // If we get here, there were no language-based breaks. Set the text pointer
    // to the original proposed break.
    utext_setNativeIndex(fText, reverse ? startPos : endPos);
    return (reverse ? startPos : endPos);
#endif
}
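// Illustrative sketch (not ICU library code): dictionary-based breaking is normally
// exercised through the public BreakIterator API; checkDictionary() and the break
// engines above run behind the scenes. Assuming <unicode/brkiter.h>, <unicode/unistr.h>,
// <unicode/locid.h> and <cstdio> are available, a caller might do something like this
// for Thai (the sample string is only an example):
void demoThaiWordBreaks() {
    UErrorCode status = U_ZERO_ERROR;
    static const UChar thaiSample[] = {0x0E20, 0x0E32, 0x0E29, 0x0E32, 0x0E44, 0x0E17, 0x0E22, 0};  // "ภาษาไทย"
    icu::UnicodeString text(thaiSample);
    icu::BreakIterator *bi = icu::BreakIterator::createWordInstance(icu::Locale("th"), status);
    if (U_FAILURE(status)) {
        return;
    }
    bi->setText(text);
    for (int32_t p = bi->first(); p != icu::BreakIterator::DONE; p = bi->next()) {
        printf("word boundary at %d\n", (int)p);
    }
    delete bi;
}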
/*
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *text,
                                         int32_t rangeStart,
                                         int32_t rangeEnd,
                                         UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    const size_t defaultInputLength = 80;
    size_t inputLength = rangeEnd - rangeStart;
    // TODO: Replace by UnicodeString.
    AutoBuffer<UChar, defaultInputLength> charString(inputLength);

    // Normalize the input string and put it in normalizedText.
    // The map from the indices of the normalized input to the raw
    // input is kept in charPositions.
    UErrorCode status = U_ZERO_ERROR;
    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UnicodeString inputString(charString.elems(), inputLength);
    // TODO: Use Normalizer2.
    UNormalizationMode norm_mode = UNORM_NFKC;
    UBool isNormalized =
        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
        Normalizer::isNormalized(inputString, norm_mode, status);

    // TODO: Replace by UVector32.
    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
    int numChars = 0;
    UText normalizedText = UTEXT_INITIALIZER;
    // Needs to be declared here because normalizedText holds onto its buffer.
    UnicodeString normalizedString;
    if (isNormalized) {
        int32_t index = 0;
        charPositions[0] = 0;
        while (index < inputString.length()) {
            index = inputString.moveIndex32(index, 1);
            charPositions[++numChars] = index;
        }
        utext_openUnicodeString(&normalizedText, &inputString, &status);
    }
    else {
        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
        if (U_FAILURE(status)) {
            return 0;
        }
        charPositions.resize(normalizedString.length() + 1);
        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
        int32_t index = 0;
        charPositions[0] = 0;
        while (index < normalizer.endIndex()) {
            /* UChar32 uc = */ normalizer.next();
            charPositions[++numChars] = index = normalizer.getIndex();
        }
        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
    }

    if (U_FAILURE(status)) {
        return 0;
    }

    // From this point on, all the indices refer to the indices of
    // the normalized input string.

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // characters in the range to be matched.
    // TODO: Replace by UVector32.
    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
    bestSnlp[0] = 0;
    for (int i = 1; i <= numChars; i++) {
        bestSnlp[i] = kuint32max;
    }

    // prev[i] is the index of the last CJK character in the previous word in
    // the best segmentation of the first i characters.
    // TODO: Replace by UVector32.
    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
    for (int i = 0; i <= numChars; i++) {
        prev[i] = -1;
    }

    const size_t maxWordSize = 20;
    // TODO: Replace both with UVector32.
    AutoBuffer<int32_t, maxWordSize> values(numChars);
    AutoBuffer<int32_t, maxWordSize> lengths(numChars);

    // Dynamic programming to find the best segmentation.
    bool is_prev_katakana = false;
    for (int32_t i = 0; i < numChars; ++i) {
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        if (bestSnlp[i] == kuint32max)
            continue;

        int32_t count;
        // limit maximum word length matched to size of current substring
        int32_t maxSearchLength = (i + maxWordSize < (size_t)numChars) ?
            maxWordSize : (numChars - i);
        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(),
                             count, maxSearchLength, values.elems());

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if ((count == 0 || lengths[0] != 1) &&
            !fHangulWordSet.contains(utext_current32(&normalizedText))) {
            values[count] = maxSnlp;
            lengths[count++] = 1;
        }

        for (int j = 0; j < count; j++) {
            uint32_t newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i]) {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.
        //utext_setNativeIndex(text, rangeStart + i);
        utext_setNativeIndex(&normalizedText, i);
        bool is_katakana = isKatakana(utext_current32(&normalizedText));
        if (!is_prev_katakana && is_katakana) {
            int j = i + 1;
            utext_next32(&normalizedText);
            // Find the end of the continuous run of Katakana characters
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                   isKatakana(utext_current32(&normalizedText))) {
                utext_next32(&normalizedText);
                ++j;
            }
            if ((j - i) < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j]) {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numChars] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numChars, and afterwards do a swap.
    // TODO: Replace by UVector32.
    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);

    int numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (bestSnlp[numChars] == kuint32max) {
        t_boundary[numBreaks++] = numChars;
    } else {
        for (int i = numChars; i > 0; i = prev[i]) {
            t_boundary[numBreaks++] = i;
        }
        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Reverse offset index in t_boundary.
    // Don't add a break for the start of the dictionary range if there is one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary[numBreaks++] = 0;
    }

    // Now that we're done, convert positions in t_boundary[] (indices in
    // the normalized input string) back to indices in the raw input string
    // while reversing t_boundary and pushing values to foundBreaks.
    for (int i = numBreaks - 1; i >= 0; i--) {
        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
    }

    utext_close(&normalizedText);
    return numBreaks;
}
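// Illustrative sketch (not ICU code): the dynamic program above picks the segmentation
// with the smallest total cost ("snlp"). The same recurrence, stripped of normalization
// and the Katakana heuristic, over a hypothetical per-position candidate list; the
// Candidate struct, the matchesAt argument and the bestSegmentation name are all
// invented here for illustration.
//     bestSnlp[i + len] = min(bestSnlp[i + len], bestSnlp[i] + cost of a word of length len at i)
#include <vector>
#include <cstdint>
#include <limits>

struct Candidate { int32_t length; uint32_t cost; };

std::vector<int32_t> bestSegmentation(int32_t numChars,
                                      const std::vector<std::vector<Candidate> > &matchesAt) {
    const uint32_t kMax = std::numeric_limits<uint32_t>::max();
    std::vector<uint32_t> bestSnlp(numChars + 1, kMax);
    std::vector<int32_t>  prev(numChars + 1, -1);
    bestSnlp[0] = 0;
    for (int32_t i = 0; i < numChars; ++i) {
        if (bestSnlp[i] == kMax) {
            continue;                               // position i is unreachable
        }
        for (size_t j = 0; j < matchesAt[i].size(); ++j) {
            const Candidate &c = matchesAt[i][j];
            if (i + c.length > numChars) {
                continue;
            }
            uint32_t newSnlp = bestSnlp[i] + c.cost;
            if (newSnlp < bestSnlp[i + c.length]) {
                bestSnlp[i + c.length] = newSnlp;   // cheaper way to reach i + length
                prev[i + c.length] = i;             // remember where that word started
            }
        }
    }
    // Walk prev[] back from the end to recover the boundaries, then reverse them.
    std::vector<int32_t> boundaries;
    if (bestSnlp[numChars] != kMax) {
        for (int32_t i = numChars; i > 0; i = prev[i]) {
            boundaries.push_back(i);
        }
    }
    return std::vector<int32_t>(boundaries.rbegin(), boundaries.rend());
}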
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const {
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t wordLength;
    int32_t current;
    UErrorCode status = U_ZERO_ERROR;
    PossibleWord words[KHMER_LOOKAHEAD];
    UChar32 uc;

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    } while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            } while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (wordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                //TODO: This section will need a rework for UText.
                int32_t remaining = rangeEnd - (current + wordLength);
                UChar32 pc = utext_current32(text);
                int32_t chars = 0;
                for (;;) {
                    utext_next32(text);
                    uc = utext_current32(text);
                    // TODO: Here we're counting on the fact that the SA languages are all
                    // in the BMP. This should get fixed with the UText rework.
                    chars += 1;
                    if (--remaining <= 0) {
                        break;
                    }
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + wordLength + chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + wordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;                // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;                // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0) {
            foundBreaks.push((current + wordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void)foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}
static void TestAPI(void) {
    UErrorCode status = U_ZERO_ERROR;
    UBool  gFailed = FALSE;
    (void)gFailed;   /* Suppress set but not used warning. */

    /* Open */
    {
        UText utLoc = UTEXT_INITIALIZER;
        const char * cString = "\x61\x62\x63\x64";
        UChar uString[]  = {0x41, 0x42, 0x43, 0};
        UText *uta;
        UText *utb;
        UChar c;

        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        c = utext_next32(uta);
        TEST_ASSERT(c == 0x41);
        utb = utext_close(uta);
        TEST_ASSERT(utb == NULL);

        uta = utext_openUTF8(&utLoc, cString, -1, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(uta == &utLoc);

        uta = utext_close(&utLoc);
        TEST_ASSERT(uta == &utLoc);
    }

    /* utext_clone() */
    {
        UChar uString[]  = {0x41, 0x42, 0x43, 0};
        int64_t len;
        UText *uta;
        UText *utb;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        utb = utext_clone(NULL, uta, FALSE, FALSE, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(utb != NULL);
        TEST_ASSERT(utb != uta);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        utext_close(uta);
        utext_close(utb);
    }

    /* basic access functions */
    {
        UChar uString[]  = {0x41, 0x42, 0x43, 0};
        UText *uta;
        UChar32 c;
        int64_t len;
        UBool b;
        int64_t i;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_ASSERT(uta!=NULL);
        TEST_SUCCESS(status);
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==TRUE);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==FALSE);

        c = utext_char32At(uta, 0);
        TEST_ASSERT(c==uString[0]);

        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = utext_previous32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32From(uta, 1);
        TEST_ASSERT(c==uString[1]);
        c = utext_next32From(uta, u_strlen(uString));
        TEST_ASSERT(c==U_SENTINEL);

        c = utext_previous32From(uta, 2);
        TEST_ASSERT(c==uString[1]);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i == 1);

        utext_setNativeIndex(uta, 0);
        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==1);

        b = utext_moveIndex32(uta, u_strlen(uString)-1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==FALSE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        utext_setNativeIndex(uta, 0);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);
        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==uString[0]);
        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==U_SENTINEL);

        utext_close(uta);
    }

    {
        /*
         * UText opened on a NULL string with zero length
         */
        UText *uta;
        UChar32 c;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);

        uta = utext_openUTF8(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);
    }

    {
        /*
         * extract
         */
        UText *uta;
        UChar uString[]  = {0x41, 0x42, 0x43, 0};
        UChar buf[100];
        int32_t i;
        /* Test pinning of input bounds */
        UChar uString2[]  = {0x41, 0x42, 0x43, 0x44, 0x45,
                             0x46, 0x47, 0x48, 0x49, 0x4A, 0};
        UChar * uString2Ptr = uString2 + 5;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        status = U_ZERO_ERROR;
        i = utext_extract(uta, 0, 100, NULL, 0, &status);
        TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR);
        TEST_ASSERT(i == u_strlen(uString));

        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, 0, 100, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString));
        i = u_strcmp(uString, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);

        /* Test pinning of input bounds */
        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString2Ptr, -1, &status);
        TEST_SUCCESS(status);

        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, -3, 20, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString2Ptr));
        i = u_strcmp(uString2Ptr, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);
    }

    {
        /*
         * Copy, Replace, isWritable
         *    Can't create an editable UText from plain C, so all we
         *    can easily do is check that errors returned.
         */
        UText *uta;
        UChar uString[]  = {0x41, 0x42, 0x43, 0};
        UBool b;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        b = utext_isWritable(uta);
        TEST_ASSERT(b == FALSE);

        b = utext_hasMetaData(uta);
        TEST_ASSERT(b == FALSE);

        utext_replace(uta,
                      0, 1,             /* start, limit              */
                      uString, -1,      /* replacement, repl. length */
                      &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);

        utext_copy(uta,
                   0, 1,                /* start, limit      */
                   2,                   /* destination index */
                   FALSE,               /* move flag         */
                   &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);

        utext_close(uta);
    }
}
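/*
 * Illustrative sketch (not part of the test suite above): the tests drive UText
 * mostly over UTF-16 buffers. With utext_openUTF8() the native indexes are byte
 * offsets into the UTF-8 buffer, so they advance by more than one for a multi-byte
 * character. The sample string is "a", U+00E9, "b"; requires <stdio.h> and
 * "unicode/utext.h".
 */
static void demoUTF8NativeIndex(void) {
    UErrorCode status = U_ZERO_ERROR;
    const char *s = "a\xC3\xA9" "b";      /* "a", U+00E9 as two UTF-8 bytes, "b" */
    UChar32 c;
    UText *ut = utext_openUTF8(NULL, s, -1, &status);
    if (U_FAILURE(status)) {
        return;
    }
    for (c = utext_next32(ut); c != U_SENTINEL; c = utext_next32(ut)) {
        /* utext_getNativeIndex() now reports the byte offset just past c. */
        printf("U+%04X ends at native index %d\n", (int)c, (int)utext_getNativeIndex(ut));
    }
    utext_close(ut);
}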