/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Walks the code points of [offsets.start, offsets.limit) and replaces each
 * one in <code>text</code> with the mapping produced by u_internalToUpper()
 * whenever that function reports a result (non-negative length).
 *
 * @param text          the text being transliterated; modified in place
 * @param offsets       on input, the range to process and its context;
 *                      on output, limit and contextLimit are adjusted by the
 *                      accumulated length change and start is set to limit
 * @param isIncremental not used by this implementation
 */
void UppercaseTransliterator::handleTransliterate(Replaceable& text,
                                 UTransPosition& offsets,
                                 UBool isIncremental) const {
    int32_t textPos = offsets.start;
    if (textPos >= offsets.limit) return;

    // Snapshot of the context range. Code points are read from this copy so
    // that the walk is not disturbed by the replacements made in text.
    UnicodeString original;
    text.extractBetween(offsets.contextStart, offsets.contextLimit, original);

    // Context iterator over the live text, bounded by the context range.
    // start/index/limit are offsets into text, not into the snapshot.
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);
    iter.start = offsets.contextStart;
    iter.limit = offsets.contextLimit;

    // Walk through original string
    // If there is a case change, modify corresponding position in replaceable

    // i and limit index the snapshot (relative to contextStart);
    // textPos indexes text and tracks the same character after replacements.
    int32_t i = textPos - offsets.contextStart;
    int32_t limit = offsets.limit - offsets.contextStart;
    UChar32 cp;
    int32_t oldLen;

    for (; i < limit; ) {
        UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
        oldLen = UTF_CHAR_LENGTH(cp);
        i += oldLen;

        // Point _past_ current char. The iterator runs over text, so the
        // position must be expressed in text coordinates: i is relative to
        // contextStart and does not account for earlier length changes.
        iter.index = textPos + oldLen;

        int32_t newLen = u_internalToUpper(cp, &iter, buffer, u_getMaxCaseExpansion(), loc.getName());
        if (newLen >= 0) {
            UnicodeString temp(buffer, newLen);
            text.handleReplaceBetween(textPos, textPos + oldLen, temp);
            if (newLen != oldLen) {
                textPos += newLen;
                offsets.limit += newLen - oldLen;
                offsets.contextLimit += newLen - oldLen;
                // Keep the iterator's bound in step with the resized context.
                iter.limit = offsets.contextLimit;
                continue;
            }
        }
        textPos += oldLen;
    }
    offsets.start = offsets.limit;
}
void CharIterTest::TestUCharIterator() { // test string of length 8 UnicodeString s=UnicodeString("a \\U00010001b\\U0010fffdz", "").unescape(); const char *const moves= "0+++++++++" // 10 moves per line "----0-----" ">>|>>>>>>>" "<<|<<<<<<<" "22+>8>-8+2"; StringCharacterIterator sci(s), compareCI(s); UCharIterator sIter, cIter, rIter; uiter_setString(&sIter, s.getBuffer(), s.length()); uiter_setCharacterIterator(&cIter, &sci); uiter_setReplaceable(&rIter, &s); TestUCharIterator(&sIter, compareCI, moves, "uiter_setString"); compareCI.setIndex(0); TestUCharIterator(&cIter, compareCI, moves, "uiter_setCharacterIterator"); compareCI.setIndex(0); TestUCharIterator(&rIter, compareCI, moves, "uiter_setReplaceable"); // test move & getIndex some more sIter.start=2; sIter.index=3; sIter.limit=5; if( sIter.getIndex(&sIter, UITER_ZERO)!=0 || sIter.getIndex(&sIter, UITER_START)!=2 || sIter.getIndex(&sIter, UITER_CURRENT)!=3 || sIter.getIndex(&sIter, UITER_LIMIT)!=5 || sIter.getIndex(&sIter, UITER_LENGTH)!=s.length() ) { errln("error: UCharIterator(string).getIndex returns wrong index"); } if( sIter.move(&sIter, 4, UITER_ZERO)!=4 || sIter.move(&sIter, 1, UITER_START)!=3 || sIter.move(&sIter, 3, UITER_CURRENT)!=5 || sIter.move(&sIter, -1, UITER_LIMIT)!=4 || sIter.move(&sIter, -5, UITER_LENGTH)!=3 || sIter.move(&sIter, 0, UITER_CURRENT)!=sIter.getIndex(&sIter, UITER_CURRENT) || sIter.getIndex(&sIter, UITER_CURRENT)!=3 ) { errln("error: UCharIterator(string).move sets/returns wrong index"); } sci=StringCharacterIterator(s, 2, 5, 3); uiter_setCharacterIterator(&cIter, &sci); if( cIter.getIndex(&cIter, UITER_ZERO)!=0 || cIter.getIndex(&cIter, UITER_START)!=2 || cIter.getIndex(&cIter, UITER_CURRENT)!=3 || cIter.getIndex(&cIter, UITER_LIMIT)!=5 || cIter.getIndex(&cIter, UITER_LENGTH)!=s.length() ) { errln("error: UCharIterator(character iterator).getIndex returns wrong index"); } if( cIter.move(&cIter, 4, UITER_ZERO)!=4 || cIter.move(&cIter, 1, UITER_START)!=3 || cIter.move(&cIter, 3, 
UITER_CURRENT)!=5 || cIter.move(&cIter, -1, UITER_LIMIT)!=4 || cIter.move(&cIter, -5, UITER_LENGTH)!=3 || cIter.move(&cIter, 0, UITER_CURRENT)!=cIter.getIndex(&cIter, UITER_CURRENT) || cIter.getIndex(&cIter, UITER_CURRENT)!=3 ) { errln("error: UCharIterator(character iterator).move sets/returns wrong index"); } if(cIter.getIndex(&cIter, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(char iter).getIndex did not return error value"); } if(cIter.move(&cIter, 0, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(char iter).move did not return error value"); } if(rIter.getIndex(&rIter, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(repl iter).getIndex did not return error value"); } if(rIter.move(&rIter, 0, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(repl iter).move did not return error value"); } if(sIter.getIndex(&sIter, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(string iter).getIndex did not return error value"); } if(sIter.move(&sIter, 0, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(string iter).move did not return error value"); } /* Testing function coverage on bad input */ UErrorCode status = U_ZERO_ERROR; uiter_setString(&sIter, NULL, 1); uiter_setState(&sIter, 1, &status); if (status != U_UNSUPPORTED_ERROR) { errln("error: uiter_setState returned %s instead of U_UNSUPPORTED_ERROR", u_errorName(status)); } status = U_ZERO_ERROR; uiter_setState(NULL, 1, &status); if (status != U_ILLEGAL_ARGUMENT_ERROR) { errln("error: uiter_setState returned %s instead of U_ILLEGAL_ARGUMENT_ERROR", u_errorName(status)); } if (uiter_getState(&sIter) != UITER_NO_STATE) { errln("error: uiter_getState did not return UITER_NO_STATE on bad input"); } }
void CharIterTest::TestUCharIterator() { // test string of length 8 UnicodeString s=UnicodeString("a \\U00010001b\\U0010fffdz", "").unescape(); const char *const moves= "0+++++++++" // 10 moves per line "----0-----" ">>|>>>>>>>" "<<|<<<<<<<" "22+>8>-8+2"; StringCharacterIterator sci(s), compareCI(s); UCharIterator sIter, cIter, rIter; uiter_setString(&sIter, s.getBuffer(), s.length()); uiter_setCharacterIterator(&cIter, &sci); uiter_setReplaceable(&rIter, &s); TestUCharIterator(&sIter, compareCI, moves, "uiter_setString"); compareCI.setIndex(0); TestUCharIterator(&cIter, compareCI, moves, "uiter_setCharacterIterator"); compareCI.setIndex(0); TestUCharIterator(&rIter, compareCI, moves, "uiter_setReplaceable"); // test move & getIndex some more sIter.start=2; sIter.index=3; sIter.limit=5; if( sIter.getIndex(&sIter, UITER_ZERO)!=0 || sIter.getIndex(&sIter, UITER_START)!=2 || sIter.getIndex(&sIter, UITER_CURRENT)!=3 || sIter.getIndex(&sIter, UITER_LIMIT)!=5 || sIter.getIndex(&sIter, UITER_LENGTH)!=s.length() ) { errln("error: UCharIterator(string).getIndex returns wrong index"); } if( sIter.move(&sIter, 4, UITER_ZERO)!=4 || sIter.move(&sIter, 1, UITER_START)!=3 || sIter.move(&sIter, 3, UITER_CURRENT)!=5 || sIter.move(&sIter, -1, UITER_LIMIT)!=4 || sIter.move(&sIter, -5, UITER_LENGTH)!=3 || sIter.move(&sIter, 0, UITER_CURRENT)!=sIter.getIndex(&sIter, UITER_CURRENT) || sIter.getIndex(&sIter, UITER_CURRENT)!=3 ) { errln("error: UCharIterator(string).move sets/returns wrong index"); } sci=StringCharacterIterator(s, 2, 5, 3); uiter_setCharacterIterator(&cIter, &sci); if( cIter.getIndex(&cIter, UITER_ZERO)!=0 || cIter.getIndex(&cIter, UITER_START)!=2 || cIter.getIndex(&cIter, UITER_CURRENT)!=3 || cIter.getIndex(&cIter, UITER_LIMIT)!=5 || cIter.getIndex(&cIter, UITER_LENGTH)!=s.length() ) { errln("error: UCharIterator(character iterator).getIndex returns wrong index"); } if( cIter.move(&cIter, 4, UITER_ZERO)!=4 || cIter.move(&cIter, 1, UITER_START)!=3 || cIter.move(&cIter, 3, 
UITER_CURRENT)!=5 || cIter.move(&cIter, -1, UITER_LIMIT)!=4 || cIter.move(&cIter, -5, UITER_LENGTH)!=3 || cIter.move(&cIter, 0, UITER_CURRENT)!=cIter.getIndex(&cIter, UITER_CURRENT) || cIter.getIndex(&cIter, UITER_CURRENT)!=3 ) { errln("error: UCharIterator(character iterator).move sets/returns wrong index"); } if(cIter.getIndex(&cIter, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(char iter).getIndex did not return error value"); } if(cIter.move(&cIter, 0, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(char iter).move did not return error value"); } if(rIter.getIndex(&rIter, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(repl iter).getIndex did not return error value"); } if(rIter.move(&rIter, 0, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(repl iter).move did not return error value"); } if(sIter.getIndex(&sIter, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(string iter).getIndex did not return error value"); } if(sIter.move(&sIter, 0, (enum UCharIteratorOrigin)-1) != -1) { errln("error: UCharIterator(string iter).move did not return error value"); } }
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, UBool isIncremental) const { // start and limit of the input range int32_t start = offsets.start; int32_t limit = offsets.limit; int32_t length, delta; if(start >= limit) { return; } // a C code unit iterator, implemented around the Replaceable UCharIterator iter; uiter_setReplaceable(&iter, &text); // the output string and buffer pointer UnicodeString output; UChar *buffer; UBool neededToNormalize; UErrorCode errorCode; /* * Normalize as short chunks at a time as possible even in * bulk mode, so that styled text is minimally disrupted. * In incremental mode, a chunk that ends with offsets.limit * must not be normalized. * * If it was known that the input text is not styled, then * a bulk mode normalization could look like this: * UChar staticChars[256]; UnicodeString input; length = limit - start; input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias _Replaceable_extractBetween(text, start, limit, input.getBuffer(length)); input.releaseBuffer(length); UErrorCode status = U_ZERO_ERROR; Normalizer::normalize(input, fMode, options, output, status); text.handleReplaceBetween(start, limit, output); int32_t delta = output.length() - length; offsets.contextLimit += delta; offsets.limit += delta; offsets.start = limit + delta; * */ while(start < limit) { // set the iterator limits for the remaining input range // this is a moving target because of the replacements in the text object iter.start = iter.index = start; iter.limit = limit; // incrementally normalize a small chunk of the input buffer = output.getBuffer(-1); errorCode = U_ZERO_ERROR; length = unorm_next(&iter, buffer, output.getCapacity(), fMode, 0, TRUE, &neededToNormalize, &errorCode); output.releaseBuffer(U_SUCCESS(errorCode) ? 
length : 0); if(errorCode == U_BUFFER_OVERFLOW_ERROR) { // use a larger output string buffer and do it again from the start iter.index = start; buffer = output.getBuffer(length); errorCode = U_ZERO_ERROR; length = unorm_next(&iter, buffer, output.getCapacity(), fMode, 0, TRUE, &neededToNormalize, &errorCode); output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0); } if(U_FAILURE(errorCode)) { break; } limit = iter.index; if(isIncremental && limit == iter.limit) { // stop in incremental mode when we reach the input limit // in case there are additional characters that could change the // normalization result // UNLESS all characters in the result of the normalization of // the last run are in the skippable set const UChar *s=output.getBuffer(); int32_t i=0, outLength=output.length(); UChar32 c; while(i<outLength) { U16_NEXT(s, i, outLength, c); if(!unorm_isNFSkippable(c, fMode)) { outLength=-1; // I wish C++ had labeled loops and break outer; ... break; } } if (outLength<0) { break; } } if(neededToNormalize) { // replace the input chunk with its normalized form text.handleReplaceBetween(start, limit, output); // update all necessary indexes accordingly delta = length - (limit - start); // length change in the text object start = limit += delta; // the next chunk starts where this one ends, with adjustment limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range offsets.contextLimit += delta; } else { // delta == 0 start = limit; limit = offsets.limit; } } offsets.start = start; }
/** * Implements {@link Transliterator#handleTransliterate}. */ void TitlecaseTransliterator::handleTransliterate( Replaceable& text, UTransPosition& offsets, UBool isIncremental) const { if (SKIP == NULL) { return; } // Our mode; we are either converting letter toTitle or // toLower. UBool doTitle = TRUE; // Determine if there is a preceding context of CASED SKIP*, // in which case we want to start in toLower mode. If the // prior context is anything else (including empty) then start // in toTitle mode. UChar32 c; int32_t start; for (start = offsets.start - 1; start >= offsets.contextStart; start -= UTF_CHAR_LENGTH(c)) { c = text.char32At(start); if (SKIP->contains(c)) { continue; } doTitle = !CASED->contains(c); break; } // Convert things after a CASED character toLower; things // after a non-CASED, non-SKIP character toTitle. SKIP // characters are copied directly and do not change the mode. int32_t textPos = offsets.start; if (textPos >= offsets.limit) return; UnicodeString original; text.extractBetween(offsets.contextStart, offsets.contextLimit, original); UCharIterator iter; uiter_setReplaceable(&iter, &text); iter.start = offsets.contextStart; iter.limit = offsets.contextLimit; // Walk through original string // If there is a case change, modify corresponding position in replaceable int32_t i = textPos - offsets.contextStart; int32_t limit = offsets.limit - offsets.contextStart; UChar32 cp; int32_t oldLen; int32_t newLen; for (; i < limit; ) { UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp); oldLen = UTF_CHAR_LENGTH(cp); i += oldLen; iter.index = i; // Point _past_ current char if (!SKIP->contains(cp)) { if (doTitle) { newLen = u_internalToTitle(cp, &iter, buffer, u_getMaxCaseExpansion(), loc.getName()); } else { newLen = u_internalToLower(cp, &iter, buffer, u_getMaxCaseExpansion(), loc.getName()); } doTitle = !CASED->contains(cp); if (newLen >= 0) { UnicodeString temp(buffer, newLen); text.handleReplaceBetween(textPos, textPos + oldLen, 
temp); if (newLen != oldLen) { textPos += newLen; offsets.limit += newLen - oldLen; offsets.contextLimit += newLen - oldLen; continue; } } } textPos += oldLen; } offsets.start = offsets.limit; }