/*
 * Parse a list of hexadecimal code points and store them as a UTF-16 string
 * in dest[destCapacity].
 *
 * Parsing stops at the first ';' or at the terminating NUL of s. Every code
 * point must be followed by a space, a tab, ';' or the end of the string, and
 * must be below 0x110000; otherwise *pErrorCode is set to U_PARSE_ERROR and 0
 * is returned.
 *
 * If pFirst is not NULL, *pFirst receives the first code point parsed, or
 * 0xffffffff if no code point was parsed.
 *
 * Once a code point no longer fits into dest, nothing more is stored but the
 * length keeps being counted; at the end *pErrorCode is set to
 * U_STRING_NOT_TERMINATED_WARNING (length == capacity) or
 * U_BUFFER_OVERFLOW_ERROR (length > capacity).
 *
 * Illegal arguments (s==NULL, destCapacity<0, or dest==NULL with a positive
 * capacity) set U_ILLEGAL_ARGUMENT_ERROR and return 0 without parsing.
 *
 * @return The length of the string in numbers of UChars.
 */
U_CAPI int32_t U_EXPORT2
u_parseString(const char *s,
              UChar *dest, int32_t destCapacity,
              uint32_t *pFirst,
              UErrorCode *pErrorCode) {
    char *end;
    uint32_t value;
    int32_t destLength;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* initialize the output before any further early return */
    if(pFirst!=NULL) {
        *pFirst=0xffffffff;
    }

    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        /* do not fall through: s and dest must not be touched */
        return 0;
    }

    destLength=0;
    for(;;) {
        s=u_skipWhitespace(s);
        if(*s==';' || *s==0) {
            /* end of the list: terminate if possible, otherwise report */
            if(destLength<destCapacity) {
                dest[destLength]=0;
            } else if(destLength==destCapacity) {
                *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
            } else {
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            }
            return destLength;
        }

        /* read one code point */
        value=(uint32_t)uprv_strtoul(s, &end, 16);
        if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }

        /* store the first code point */
        if(destLength==0 && pFirst!=NULL) {
            *pFirst=value;
        }

        /* append it to the destination array, or only count it if it does not fit */
        if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) {
            UTF_APPEND_CHAR_UNSAFE(dest, destLength, value);
        } else {
            destLength+=UTF_CHAR_LENGTH(value);
        }

        /* go to the following characters */
        s=end;
    }
}
/*
 * Invariant-character conversion of a char* string to UChars, decoding
 * backslash escapes on the way.
 *
 * Runs of ordinary characters are handed to _appendUChars(); each escape
 * sequence is decoded with u_unescapeAt(). The returned length counts every
 * UChar of the result, whether or not it was actually stored. dest may be
 * NULL, in which case nothing is stored. The result is NUL-terminated only
 * if dest is not NULL and the length is smaller than destCapacity.
 *
 * If an escape sequence cannot be parsed, 0 is returned and, when dest has
 * room for at least one UChar, dest[0] is set to 0.
 */
U_CAPI int32_t U_EXPORT2
u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
    const char *runStart = src;  /* start of the pending run of literal chars */
    int32_t outLen = 0;

    while (*src != 0) {
        int32_t escLen = 0;
        UChar32 cp;

        /* '\\' intentionally written as compiler-specific
         * character constant to correspond to compiler-specific
         * char* constants. */
        if (*src != '\\') {
            ++src;
            continue;
        }

        /* flush the literal run that precedes the backslash */
        if (src != runStart) {
            if (dest != NULL) {
                _appendUChars(dest + outLen, destCapacity - outLen,
                              runStart, src - runStart);
            }
            outLen += src - runStart;
        }

        ++src; /* step over the backslash */
        cp = u_unescapeAt(_charPtr_charAt, &escLen, uprv_strlen(src), (void*) src);
        if (escLen == 0) {
            /* malformed escape: hand back an empty string */
            if (dest != NULL && destCapacity > 0) {
                *dest = 0;
            }
            return 0;
        }
        src += escLen; /* step over the escape sequence */

        if (dest != NULL && UTF_CHAR_LENGTH(cp) <= (destCapacity - outLen)) {
            UTF_APPEND_CHAR_UNSAFE(dest, outLen, cp);
        } else {
            outLen += UTF_CHAR_LENGTH(cp);
        }
        runStart = src;
    }

    /* flush the trailing literal run, if any */
    if (src != runStart) {
        if (dest != NULL) {
            _appendUChars(dest + outLen, destCapacity - outLen,
                          runStart, src - runStart);
        }
        outLen += src - runStart;
    }

    if (dest != NULL && outLen < destCapacity) {
        dest[outLen] = 0;
    }
    return outLen;
}
/**
 * Parse a Unicode identifier out of str, beginning at pos.
 *
 * The first code point must satisfy u_isIDStart(); every following code
 * point is accepted while it satisfies u_isIDPart().
 *
 * @param str the string to parse
 * @param pos INPUT-OUTPUT parameter. On INPUT, the index of the first
 * character to examine; callers are expected to pass pos < str.length()
 * pointing at a non-whitespace character. On OUTPUT, the index just past
 * the last parsed character. pos is left untouched when the first
 * character examined is not an identifier start.
 * @return the identifier, or an empty string if there is no valid
 * identifier at pos.
 */
UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
    UnicodeString ident;
    int cursor = pos;

    while (cursor < str.length()) {
        const UChar32 cp = str.char32At(cursor);
        const UBool atFirstChar = (ident.length() == 0);

        if (atFirstChar && !u_isIDStart(cp)) {
            // Not an identifier at all: report "nothing parsed".
            ident.truncate(0);
            return ident;
        }
        if (!atFirstChar && !u_isIDPart(cp)) {
            // End of the identifier.
            break;
        }

        ident.append(cp);
        cursor += UTF_CHAR_LENGTH(cp);
    }

    pos = cursor;
    return ident;
}
/**
 * Try the rules indexed under the low byte of the code point at pos.start,
 * in order, until one of them matches or partially matches.
 *
 * @param text the text to be transliterated
 * @param pos the position indices, which will be updated
 * @param incremental passed through to each rule's matchAndReplace()
 * @return FALSE if a rule reported U_PARTIAL_MATCH, meaning that
 * transliteration should stop until more text arrives; TRUE otherwise.
 * When no rule matches, pos.start is advanced past one code point
 * before TRUE is returned.
 */
UBool TransliterationRuleSet::transliterate(Replaceable& text,
                                            UTransPosition& pos,
                                            UBool incremental) {
    const int16_t bucket = (int16_t)(text.char32At(pos.start) & 0xFF);

    int32_t ruleIdx = index[bucket];
    while (ruleIdx < index[bucket + 1]) {
        const UMatchDegree degree = rules[ruleIdx]->matchAndReplace(text, pos, incremental);
        if (degree == U_MATCH) {
            _debugOut("match", rules[ruleIdx], text, pos);
            return TRUE;
        }
        if (degree == U_PARTIAL_MATCH) {
            _debugOut("partial match", rules[ruleIdx], text, pos);
            return FALSE;
        }
        ++ruleIdx;
    }

    // Nothing matched, not even partially: skip one code point.
    pos.start += UTF_CHAR_LENGTH(text.char32At(pos.start));
    _debugOut("no match", NULL, text, pos);
    return TRUE;
}
// Replace nonprintable characters with unicode escapes UnicodeString & _escape(const UnicodeString & source, UnicodeString & target) { for (int32_t i = 0; i < source.length();) { UChar32 ch = source.char32At(i); i += UTF_CHAR_LENGTH(ch); if (ch < 0x09 || (ch > 0x0A && ch < 0x20) || ch > 0x7E) { if (ch <= 0xFFFF) { target += "\\u"; _appendHex(ch, 4, target); } else { target += "\\U"; _appendHex(ch, 8, target); } } else { target += ch; } } return target; }
// Advance past pattern white space at the current position. Does nothing
// unless the SKIP_WHITESPACE bit is set in options.
void RuleCharacterIterator::skipIgnored(int32_t options) {
    if ((options & SKIP_WHITESPACE) == 0) {
        return;
    }
    UChar32 cp = _current();
    while (PatternProps::isWhiteSpace(cp)) {
        _advance(UTF_CHAR_LENGTH(cp));
        cp = _current();
    }
}
/**
 * Return the code point at the current position of the normalization
 * buffer and move the position past it (by one or two code units).
 * When the buffer is exhausted, nextNormalize() is asked for more;
 * if it reports none, {@link #DONE} is returned.
 */
UChar32 Normalizer::next() {
    if(bufferPos>=buffer.length() && !nextNormalize()) {
        return DONE;
    }
    const UChar32 cp=buffer.char32At(bufferPos);
    bufferPos+=UTF_CHAR_LENGTH(cp);
    return cp;
}
/**
 * Return the code point that ends just before the current position of the
 * normalization buffer and move the position back over it (by one or two
 * code units). When the position is at the start of the buffer,
 * previousNormalize() is asked for more; if it reports none,
 * {@link #DONE} is returned.
 */
UChar32 Normalizer::previous() {
    if(bufferPos<=0 && !previousNormalize()) {
        return DONE;
    }
    const UChar32 cp=buffer.char32At(bufferPos-1);
    bufferPos-=UTF_CHAR_LENGTH(cp);
    return cp;
}
/** * Implements {@link Transliterator#handleTransliterate}. * Ignore isIncremental since we don't need the context, and * we work on codepoints. */ void UnicodeNameTransliterator::handleTransliterate(Replaceable & text, UTransPosition & offsets, UBool /*isIncremental*/) const { // The failure mode, here and below, is to behave like Any-Null, // if either there is no name data (max len == 0) or there is no // memory (malloc() => NULL). int32_t maxLen = uprv_getMaxCharNameLength(); if (maxLen == 0) { offsets.start = offsets.limit; return; } // Accomodate the longest possible name plus padding char * buf = (char *) uprv_malloc(maxLen); if (buf == NULL) { offsets.start = offsets.limit; return; } int32_t cursor = offsets.start; int32_t limit = offsets.limit; UnicodeString str(FALSE, OPEN_DELIM, OPEN_DELIM_LEN); UErrorCode status; int32_t len; while (cursor < limit) { UChar32 c = text.char32At(cursor); int32_t clen = UTF_CHAR_LENGTH(c); status = U_ZERO_ERROR; if ((len = u_charName(c, U_EXTENDED_CHAR_NAME, buf, maxLen, &status)) > 0 && !U_FAILURE(status)) { str.truncate(OPEN_DELIM_LEN); str.append(UnicodeString(buf, len, US_INV)).append(CLOSE_DELIM); text.handleReplaceBetween(cursor, cursor + clen, str); len += OPEN_DELIM_LEN + 1; // adjust for delimiters cursor += len; // advance cursor and adjust for new text limit += len - clen; // change in length } else { cursor += clen; } } offsets.contextLimit += limit - offsets.limit; offsets.limit = limit; offsets.start = cursor; uprv_free(buf); }
/**
 * Implement UnicodeMatcher.
 * Walks the pattern code point by code point. A code point for which
 * data->lookupMatcher() returns a matcher contributes that matcher's
 * match set; any other code point is added to toUnionTo itself.
 */
void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    int32_t offset = 0;
    while (offset < pattern.length()) {
        const UChar32 cp = pattern.char32At(offset);
        const UnicodeMatcher* standIn = data->lookupMatcher(cp);
        if (standIn != NULL) {
            standIn->addMatchSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
        offset += UTF_CHAR_LENGTH(cp);
    }
}
/** * Implement UnicodeFunctor */ void StringMatcher::setData(const TransliterationRuleData* d) { data = d; int32_t i = 0; while (i<pattern.length()) { UChar32 c = pattern.char32At(i); UnicodeFunctor* f = data->lookup(c); if (f != NULL) { f->setData(data); } i += UTF_CHAR_LENGTH(c); } }
/**
 * Walks the output string code point by code point. A code point for
 * which data->lookupReplacer() returns a replacer contributes that
 * replacer's replacement set; any other code point is added to
 * toUnionTo itself.
 */
void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    int32_t offset = 0;
    while (offset < output.length()) {
        const UChar32 cp = output.char32At(offset);
        UnicodeReplacer* standIn = data->lookupReplacer(cp);
        if (standIn != NULL) {
            standIn->addReplacementSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
        offset += UTF_CHAR_LENGTH(cp);
    }
}
/**
 * Return the next character and advance past it, honoring the option bits:
 *  - PARSE_VARIABLES: a SYMBOL_REF encountered while no variable buffer is
 *    active (and a symbol table is present) is parsed as a variable
 *    reference; iteration then continues inside the variable's value.
 *    An isolated SYMBOL_REF (empty reference name) is returned as is, so
 *    the caller must be prepared for it.
 *  - SKIP_WHITESPACE: pattern white space is skipped.
 *  - PARSE_ESCAPES: a backslash starts an escape that is decoded.
 *
 * @param options bit mask of the flags above
 * @param isEscaped set to TRUE if the result came from an escape,
 * FALSE otherwise
 * @param ec set to U_UNDEFINED_VARIABLE if a referenced variable cannot be
 * looked up, or U_MALFORMED_UNICODE_ESCAPE if an escape cannot be decoded.
 * If ec already indicates a failure on entry, nothing is done.
 * @return the character, or DONE on any failure
 */
UChar32 RuleCharacterIterator::next(int32_t options, UBool& isEscaped, UErrorCode& ec) {
    if (U_FAILURE(ec)) {
        return DONE;
    }
    isEscaped = FALSE;

    const UBool expandVariables = (options & PARSE_VARIABLES) != 0;
    const UBool dropWhiteSpace = (options & SKIP_WHITESPACE) != 0;
    const UBool decodeEscapes = (options & PARSE_ESCAPES) != 0;

    for (;;) {
        UChar32 cp = _current();
        _advance(UTF_CHAR_LENGTH(cp));

        if (expandVariables && cp == SymbolTable::SYMBOL_REF && buf == 0 && sym != 0) {
            UnicodeString varName = sym->parseReference(text, pos, text.length());
            if (varName.length() == 0) {
                // Isolated SYMBOL_REF: hand it to the caller unchanged.
                return cp;
            }
            bufPos = 0;
            buf = sym->lookup(varName);
            if (buf == 0) {
                ec = U_UNDEFINED_VARIABLE;
                return DONE;
            }
            if (buf->length() == 0) {
                // An empty variable value contributes nothing.
                buf = 0;
            }
            continue;
        }

        if (dropWhiteSpace && PatternProps::isWhiteSpace(cp)) {
            continue;
        }

        if (decodeEscapes && cp == 0x5C /*'\\'*/) {
            UnicodeString scratch;
            int32_t consumed = 0;
            cp = lookahead(scratch, MAX_U_NOTATION_LEN).unescapeAt(consumed);
            jumpahead(consumed);
            isEscaped = TRUE;
            if (cp < 0) {
                ec = U_MALFORMED_UNICODE_ESCAPE;
                return DONE;
            }
        }

        return cp;
    }
}
/**
 * Union the set of all characters that may be modified by this rule
 * into the given set. Only the key part of the pattern is examined,
 * i.e. the anteContextLength..anteContextLength+keyLength range.
 * A code point for which data->lookupMatcher() returns a matcher
 * contributes that matcher's match set; any other code point is
 * added itself.
 */
void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
    const int32_t keyEnd = anteContextLength + keyLength;
    int32_t offset = anteContextLength;
    while (offset < keyEnd) {
        const UChar32 cp = pattern.char32At(offset);
        offset += UTF_CHAR_LENGTH(cp);

        const UnicodeMatcher* standIn = data->lookupMatcher(cp);
        if (standIn != NULL) {
            standIn->addMatchSetTo(toUnionTo);
        } else {
            toUnionTo.add(cp);
        }
    }
}
/**
 * Implements {@link Transliterator#handleTransliterate}.
 * Replaces every character in [pos.start, pos.limit) by
 * prefix + number + suffix. When grokSupplementals is set, the text is
 * read one code point at a time, otherwise one code unit at a time.
 * A value with any of the upper 16 bits set is formatted with
 * supplementalHandler's prefix, radix, minDigits and suffix when a
 * supplemental handler is present.
 */
void EscapeTransliterator::handleTransliterate(Replaceable& text,
                                               UTransPosition& pos,
                                               UBool /*isIncremental*/) const {
    /* TODO: Verify that isIncremental can be ignored */
    int32_t cursor = pos.start;
    int32_t end = pos.limit;

    // Scratch string; normally keeps 'prefix' in front between iterations.
    UnicodeString scratch(prefix);
    const int32_t prefixLen = prefix.length();
    // TRUE while scratch starts with the supplemental handler's prefix
    // instead of ours and therefore has to be rebuilt from nothing.
    UBool prefixClobbered = FALSE;

    while (cursor < end) {
        int32_t cp;
        int32_t cpUnits;
        if (grokSupplementals) {
            cp = text.char32At(cursor);
            cpUnits = UTF_CHAR_LENGTH(cp);
        } else {
            cp = text.charAt(cursor);
            cpUnits = 1;
        }

        const UBool useSupplemental =
            ((cp & 0xFFFF0000) != 0 && supplementalHandler != NULL);
        if (useSupplemental) {
            scratch.truncate(0);
            scratch.append(supplementalHandler->prefix);
            ICU_Utility::appendNumber(scratch, cp, supplementalHandler->radix,
                                      supplementalHandler->minDigits);
            scratch.append(supplementalHandler->suffix);
            prefixClobbered = TRUE;
        } else {
            if (prefixClobbered) {
                scratch.truncate(0);
                scratch.append(prefix);
                prefixClobbered = FALSE;
            } else {
                // Our prefix is still in place; drop only what follows it.
                scratch.truncate(prefixLen);
            }
            ICU_Utility::appendNumber(scratch, cp, radix, minDigits);
            scratch.append(suffix);
        }

        text.handleReplaceBetween(cursor, cursor + cpUnits, scratch);
        cursor += scratch.length();
        end += scratch.length() - cpUnits;
    }

    pos.contextLimit += end - pos.limit;
    pos.limit = end;
    pos.start = cursor;
}
/**
 * Implements {@link Transliterator#handleTransliterate}.
 * Reads the code points of [offsets.start, offsets.limit) from a copy of
 * the context range and passes each one to u_internalToUpper(). When that
 * call returns a non-negative length, the corresponding span of text is
 * replaced by the mapping and offsets.limit / offsets.contextLimit are
 * adjusted for the change in length; a negative result leaves the code
 * point untouched. On return offsets.start equals offsets.limit.
 *
 * NOTE(review): iter.index is assigned the offset into the extracted copy
 * (i.e. relative to offsets.contextStart) -- confirm that this is what the
 * iterator set up by uiter_setReplaceable() expects.
 */
void UppercaseTransliterator::handleTransliterate(Replaceable& text,
                                                  UTransPosition& offsets,
                                                  UBool isIncremental) const {
    int32_t writePos = offsets.start;
    if (writePos >= offsets.limit) {
        return;
    }

    // Snapshot of the context range; the loop reads from this copy while
    // it modifies 'text'.
    UnicodeString snapshot;
    text.extractBetween(offsets.contextStart, offsets.contextLimit, snapshot);

    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);
    iter.start = offsets.contextStart;
    iter.limit = offsets.contextLimit;

    // Read position and read limit are offsets into the snapshot.
    int32_t readPos = writePos - offsets.contextStart;
    const int32_t readLimit = offsets.limit - offsets.contextStart;

    while (readPos < readLimit) {
        UChar32 cp;
        UTF_GET_CHAR(snapshot.getBuffer(), 0, readPos, snapshot.length(), cp);
        const int32_t oldLen = UTF_CHAR_LENGTH(cp);
        readPos += oldLen;
        iter.index = readPos; // Point _past_ current char

        const int32_t newLen = u_internalToUpper(cp, &iter, buffer,
                                                 u_getMaxCaseExpansion(), loc.getName());
        if (newLen < 0) {
            // Nothing to replace for this code point.
            writePos += oldLen;
            continue;
        }

        UnicodeString mapped(buffer, newLen);
        text.handleReplaceBetween(writePos, writePos + oldLen, mapped);

        // growth is 0 when the mapping has the same length as the source.
        const int32_t growth = newLen - oldLen;
        writePos += newLen;
        offsets.limit += growth;
        offsets.contextLimit += growth;
    }

    offsets.start = offsets.limit;
}
/**
 * Base class implementation of toRules(): turns the ID into rule
 * syntax, that is: foo => ::foo followed by ID_DELIM.
 *
 * @param rulesSource receives the result; previous contents are discarded
 * @param escapeUnprintable if TRUE, every code point of the ID is passed
 * through ICU_Utility::escapeUnprintable(), and appended literally only
 * when that function returns FALSE
 * @return rulesSource
 */
UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
                                       UBool escapeUnprintable) const {
    if (!escapeUnprintable) {
        rulesSource = getID();
    } else {
        rulesSource.truncate(0);
        UnicodeString id = getID();
        int32_t offset = 0;
        while (offset < id.length()) {
            const UChar32 cp = id.char32At(offset);
            const UBool wroteEscape = ICU_Utility::escapeUnprintable(rulesSource, cp);
            if (!wroteEscape) {
                rulesSource.append(cp);
            }
            offset += UTF_CHAR_LENGTH(cp);
        }
    }

    // KEEP in sync with rbt_pars
    rulesSource.insert(0, UNICODE_STRING_SIMPLE("::"));
    rulesSource.append(ID_DELIM);
    return rulesSource;
}
/*
 * Convert a UTF-32 string to UTF-16.
 *
 * @param dest         output buffer; may be NULL only if destCapacity==0
 * @param destCapacity capacity of dest in UChars; must not be negative
 * @param pDestLength  if not NULL, receives the number of UChars that the
 *                     complete result needs, even if it did not fit
 * @param src          UTF-32 input; must not be NULL
 * @param srcLength    number of code points in src, or -1 if src is
 *                     NUL-terminated
 * @param pErrorCode   in/out error code. Nothing is done if it is NULL or
 *                     already indicates a failure. Set to
 *                     U_ILLEGAL_ARGUMENT_ERROR for bad arguments and to
 *                     U_INVALID_CHAR_FOUND for a code point above 0x10ffff.
 * @return dest, or NULL on error
 *
 * Code points above 0x10ffff are rejected everywhere in the input, including
 * the part that is only counted because dest is already full, so that the
 * outcome does not depend on destCapacity.
 */
U_CAPI UChar* U_EXPORT2
u_strFromUTF32(UChar *dest,
               int32_t destCapacity,
               int32_t *pDestLength,
               const UChar32 *src,
               int32_t srcLength,
               UErrorCode *pErrorCode) {
    int32_t reqLength = 0;
    uint32_t ch = 0;
    UChar *pDestLimit;
    UChar *pDest = dest;
    const uint32_t *pSrc = (const uint32_t *)src;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* compute the limit only once the arguments are known to be sane */
    pDestLimit = (dest!=NULL) ? dest+destCapacity : NULL;

    /* Check if the source is null terminated */
    if(srcLength == -1 ){
        /* copy while there is room */
        while(((ch=*pSrc)!=0) && (pDest < pDestLimit)){
            ++pSrc;
            if(ch<=0xFFFF){
                *(pDest++)=(UChar)ch;
            }else if(ch<=0x10ffff){
                *(pDest++)=UTF16_LEAD(ch);
                if(pDest<pDestLimit){
                    *(pDest++)=UTF16_TRAIL(ch);
                }else{
                    /* no room for the trail surrogate: count it and stop copying */
                    reqLength++;
                    break;
                }
            }else{
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
        /* preflight (only count) the rest of the input */
        while((ch=*pSrc++) != 0){
            if(ch>0x10ffff){
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
            reqLength+=UTF_CHAR_LENGTH(ch);
        }
    }else{
        const uint32_t* pSrcLimit = ((const uint32_t*)pSrc) + srcLength;
        /* copy while there is room */
        while((pSrc < pSrcLimit) && (pDest < pDestLimit)){
            ch = *pSrc++;
            if(ch<=0xFFFF){
                *(pDest++)=(UChar)ch;
            }else if(ch<=0x10FFFF){
                *(pDest++)=UTF16_LEAD(ch);
                if(pDest<pDestLimit){
                    *(pDest++)=UTF16_TRAIL(ch);
                }else{
                    /* no room for the trail surrogate: count it and stop copying */
                    reqLength++;
                    break;
                }
            }else{
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
        /* preflight (only count) the rest of the input */
        while(pSrc <pSrcLimit){
            ch = *pSrc++;
            if(ch>0x10FFFF){
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
            reqLength+=UTF_CHAR_LENGTH(ch);
        }
    }

    /* add what was actually written to what was only counted */
    reqLength += (int32_t)(pDest - dest);
    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}
/** * Implements {@link Transliterator#handleTransliterate}. */ void TitlecaseTransliterator::handleTransliterate( Replaceable& text, UTransPosition& offsets, UBool isIncremental) const { if (SKIP == NULL) { return; } // Our mode; we are either converting letter toTitle or // toLower. UBool doTitle = TRUE; // Determine if there is a preceding context of CASED SKIP*, // in which case we want to start in toLower mode. If the // prior context is anything else (including empty) then start // in toTitle mode. UChar32 c; int32_t start; for (start = offsets.start - 1; start >= offsets.contextStart; start -= UTF_CHAR_LENGTH(c)) { c = text.char32At(start); if (SKIP->contains(c)) { continue; } doTitle = !CASED->contains(c); break; } // Convert things after a CASED character toLower; things // after a non-CASED, non-SKIP character toTitle. SKIP // characters are copied directly and do not change the mode. int32_t textPos = offsets.start; if (textPos >= offsets.limit) return; UnicodeString original; text.extractBetween(offsets.contextStart, offsets.contextLimit, original); UCharIterator iter; uiter_setReplaceable(&iter, &text); iter.start = offsets.contextStart; iter.limit = offsets.contextLimit; // Walk through original string // If there is a case change, modify corresponding position in replaceable int32_t i = textPos - offsets.contextStart; int32_t limit = offsets.limit - offsets.contextStart; UChar32 cp; int32_t oldLen; int32_t newLen; for (; i < limit; ) { UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp); oldLen = UTF_CHAR_LENGTH(cp); i += oldLen; iter.index = i; // Point _past_ current char if (!SKIP->contains(cp)) { if (doTitle) { newLen = u_internalToTitle(cp, &iter, buffer, u_getMaxCaseExpansion(), loc.getName()); } else { newLen = u_internalToLower(cp, &iter, buffer, u_getMaxCaseExpansion(), loc.getName()); } doTitle = !CASED->contains(cp); if (newLen >= 0) { UnicodeString temp(buffer, newLen); text.handleReplaceBetween(textPos, textPos + oldLen, 
temp); if (newLen != oldLen) { textPos += newLen; offsets.limit += newLen - oldLen; offsets.contextLimit += newLen - oldLen; continue; } } } textPos += oldLen; } offsets.start = offsets.limit; }
// Return the index following the code point at pos: pos plus that code
// point's length in code units when pos is inside str, otherwise pos + 1.
static inline int32_t posAfter(const Replaceable& str, int32_t pos) {
    if (pos < 0 || pos >= str.length()) {
        return pos + 1;
    }
    return pos + UTF_CHAR_LENGTH(str.char32At(pos));
}
/**
 * Transliterate the range described by index, applying this
 * transliterator's filter and, optionally, rollback of incomplete
 * incremental passes, by delegating to handleTransliterate().
 *
 * @param text        the text to be transliterated
 * @param index       the position indices. start is advanced as text is
 *                    processed; limit is temporarily narrowed to each run
 *                    and restored on return, adjusted for insertions and
 *                    deletions.
 * @param incremental passed to handleTransliterate() for the last run; runs
 *                    that are followed by more text are always processed
 *                    non-incrementally
 * @param rollback    if TRUE, an incremental run is processed in passes of
 *                    growing length, and a pass that is not completely
 *                    transliterated is rolled back
 */
void Transliterator::filteredTransliterate(Replaceable& text,
                                           UTransPosition& index,
                                           UBool incremental,
                                           UBool rollback) const {
    // Short circuit path for transliterators with no filter when no
    // rollback is requested.
    if (filter == 0 && !rollback) {
        handleTransliterate(text, index, incremental);
        return;
    }

    //----------------------------------------------------------------------
    // This method processes text in two groupings:
    //
    // RUNS -- A run is a contiguous group of characters which are contained
    // in the filter for this transliterator (filter.contains(ch) == TRUE).
    // Text outside of runs may appear as context but it is not modified.
    // The start and limit Position values are narrowed to each run.
    //
    // PASSES (incremental only) -- To make incremental mode work correctly,
    // each run is broken up into n passes, where n is the length (in code
    // points) of the run.  Each pass contains the first n characters.  If a
    // pass is completely transliterated, it is committed, and further passes
    // include characters after the committed text.  If a pass is blocked,
    // and does not transliterate completely, then this method rolls back
    // the changes made during the pass, extends the pass by one code point,
    // and tries again.
    //----------------------------------------------------------------------

    // globalLimit is the limit value for the entire operation.  We
    // set index.limit to the end of each unfiltered run before
    // calling handleTransliterate(), so we need to maintain the real
    // value of index.limit here.  After each transliteration, we
    // update globalLimit for insertions or deletions that have
    // happened.
    int32_t globalLimit = index.limit;

    // If there is a non-null filter, then break the input text up.  Say the
    // input text has the form:
    //   xxxabcxxdefxx
    // where 'x' represents a filtered character (filter.contains('x') ==
    // false).  Then we break this up into:
    //   xxxabc xxdef xx
    // Each pass through the loop consumes a run of filtered
    // characters (which are ignored) and a subsequent run of
    // unfiltered characters (which are transliterated).

    for (;;) {

        if (filter != NULL) {
            // Narrow the range to be transliterated to the first segment
            // of unfiltered characters at or after index.start.

            // Advance past filtered chars
            UChar32 c;
            while (index.start < globalLimit &&
                   !filter->contains(c=text.char32At(index.start))) {
                index.start += UTF_CHAR_LENGTH(c);
            }

            // Find the end of this run of unfiltered chars
            index.limit = index.start;
            while (index.limit < globalLimit &&
                   filter->contains(c=text.char32At(index.limit))) {
                index.limit += UTF_CHAR_LENGTH(c);
            }
        }

        // Check to see if the unfiltered run is empty.  This only
        // happens at the end of the string when all the remaining
        // characters are filtered.
        if (index.limit == index.start) {
            // assert(index.start == globalLimit);
            break;
        }

        // Is this run incremental?  If there is additional
        // filtered text (if limit < globalLimit) then we pass in
        // an incremental value of FALSE to force the subclass to
        // complete the transliteration for this run.
        UBool isIncrementalRun =
            (index.limit < globalLimit ? FALSE : incremental);

        int32_t delta;

        // Implement rollback.  To understand the need for rollback,
        // consider the following transliterator:
        //
        //  "t" is "a > A;"
        //  "u" is "A > b;"
        //  "v" is a compound of "t; NFD; u" with a filter [:Ll:]
        //
        // Now apply "v" to the input text "a".  The result is "b".  But if
        // the transliteration is done incrementally, then the NFD holds
        // things up after "t" has already transformed "a" to "A".  When
        // finishTransliterate() is called, "A" is _not_ processed because
        // it gets excluded by the [:Ll:] filter, and the end result is "A"
        // -- incorrect.  The problem is that the filter is applied to a
        // partially-transliterated result, when we only want it to apply to
        // input text.  Although this example hinges on a compound
        // transliterator containing NFD and a specific filter, it can
        // actually happen with any transliterator which may do a partial
        // transformation in incremental mode into characters outside its
        // filter.
        //
        // To handle this, when in incremental mode we supply characters to
        // handleTransliterate() in several passes.  Each pass adds one more
        // input character to the input text.  That is, for input "ABCD", we
        // first try "A", then "AB", then "ABC", and finally "ABCD".  If at
        // any point we block (upon return, start < limit) then we roll
        // back.  If at any point we complete the run (upon return start ==
        // limit) then we commit that run.

        if (rollback && isIncrementalRun) {

            int32_t runStart = index.start;
            int32_t runLimit = index.limit;
            int32_t runLength = runLimit - runStart;

            // Make a rollback copy at the end of the string
            int32_t rollbackOrigin = text.length();
            text.copy(runStart, runLimit, rollbackOrigin);

            // Variables reflecting the commitment of completely
            // transliterated text.  passStart is the runStart, advanced
            // past committed text.  rollbackStart is the rollbackOrigin,
            // advanced past rollback text that corresponds to committed
            // text.
            int32_t passStart = runStart;
            int32_t rollbackStart = rollbackOrigin;

            // The limit for each pass; we advance by one code point with
            // each iteration.
            int32_t passLimit = index.start;

            // Total length, in 16-bit code units, of uncommitted text.
            // This is the length to be rolled back.
            int32_t uncommittedLength = 0;

            // Total delta (change in length) for all passes
            int32_t totalDelta = 0;

            // PASS MAIN LOOP -- Start with a single character, and extend
            // the text by one character at a time.  Roll back partial
            // transliterations and commit complete transliterations.
            for (;;) {
                // Length of additional code point, either one or two
                int32_t charLength = UTF_CHAR_LENGTH(text.char32At(passLimit));
                passLimit += charLength;
                if (passLimit > runLimit) {
                    break;
                }
                uncommittedLength += charLength;

                index.limit = passLimit;

                // Delegate to subclass for actual transliteration.  Upon
                // return, start will be updated to point after the
                // transliterated text, and limit and contextLimit will be
                // adjusted for length changes.
                handleTransliterate(text, index, TRUE);

                delta = index.limit - passLimit; // change in length

                // We failed to completely transliterate this pass.
                // Roll back the text.  Indices remain unchanged; reset
                // them where necessary.
                if (index.start != index.limit) {
                    // Find the rollbackStart, adjusted for length changes
                    // and the deletion of partially transliterated text.
                    int32_t rs = rollbackStart + delta - (index.limit - passStart);

                    // Delete the partially transliterated text
                    text.handleReplaceBetween(passStart, index.limit, EMPTY);

                    // Copy the rollback text back
                    text.copy(rs, rs + uncommittedLength, passStart);

                    // Restore indices to their original values
                    index.start = passStart;
                    index.limit = passLimit;
                    index.contextLimit -= delta;
                }

                // We did completely transliterate this pass.  Update the
                // commit indices to record how far we got.  Adjust indices
                // for length change.
                else {
                    // Move the pass indices past the committed text.
                    passStart = passLimit = index.start;

                    // Adjust the rollbackStart for length changes and move
                    // it past the committed text.  All characters we've
                    // processed to this point are committed now, so zero
                    // out the uncommittedLength.
                    rollbackStart += delta + uncommittedLength;
                    uncommittedLength = 0;

                    // Adjust indices for length changes.
                    runLimit += delta;
                    totalDelta += delta;
                }
            }

            // Adjust overall limit and rollbackOrigin for insertions and
            // deletions.  Don't need to worry about contextLimit because
            // handleTransliterate() maintains that.
            rollbackOrigin += totalDelta;
            globalLimit += totalDelta;

            // Delete the rollback copy
            text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, EMPTY);

            // Move start past committed text
            index.start = passStart;
        }

        else {
            // Delegate to subclass for actual transliteration.
            int32_t limit = index.limit;
            handleTransliterate(text, index, isIncrementalRun);
            delta = index.limit - limit; // change in length

            // In a properly written transliterator, start == limit after
            // handleTransliterate() returns when incremental is false.
            // Catch cases where the subclass doesn't do this, and throw
            // an exception.  (Just pinning start to limit is a bad idea,
            // because what's probably happening is that the subclass
            // isn't transliterating all the way to the end, and it should
            // in non-incremental mode.)
            if (!incremental && index.start != index.limit) {
                // We can't throw an exception, so just fudge things
                index.start = index.limit;
            }

            // Adjust overall limit for insertions/deletions.  Don't need
            // to worry about contextLimit because handleTransliterate()
            // maintains that.
            globalLimit += delta;
        }

        // Stop when there is no filter (the whole range was a single run)
        // or when this was the incremental, i.e. last, run.
        if (filter == NULL || isIncrementalRun) {
            break;
        }

        // If we did completely transliterate this
        // run, then repeat with the next unfiltered run.
    }

    // Start is valid where it is.  Limit needs to be put back where
    // it was, modulo adjustments for deletions/insertions.
    index.limit = globalLimit;
}
/** * Implements {@link Transliterator#handleTransliterate}. */ void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, UBool isIncremental) const { // The failure mode, here and below, is to behave like Any-Null, // if either there is no name data (max len == 0) or there is no // memory (malloc() => NULL). int32_t maxLen = uprv_getMaxCharNameLength(); if (maxLen == 0) { offsets.start = offsets.limit; return; } // Accomodate the longest possible name ++maxLen; // allow for temporary trailing space char* cbuf = (char*) uprv_malloc(maxLen); if (cbuf == NULL) { offsets.start = offsets.limit; return; } UnicodeString openPat(TRUE, OPEN, -1); UnicodeString str, name; int32_t cursor = offsets.start; int32_t limit = offsets.limit; // Modes: // 0 - looking for open delimiter // 1 - after open delimiter int32_t mode = 0; int32_t openPos = -1; // open delim candidate pos UChar32 c; while (cursor < limit) { c = text.char32At(cursor); switch (mode) { case 0: // looking for open delimiter if (c == OPEN_DELIM) { // quick check first openPos = cursor; int32_t i = ICU_Utility::parsePattern(openPat, text, cursor, limit); if (i >= 0 && i < limit) { mode = 1; name.truncate(0); cursor = i; continue; // *** reprocess char32At(cursor) } } break; case 1: // after open delimiter // Look for legal chars. If \s+ is found, convert it // to a single space. If closeDelimiter is found, exit // the loop. If any other character is found, exit the // loop. If the limit is reached, exit the loop. // Convert \s+ => SPACE. This assumes there are no // runs of >1 space characters in names. if (uprv_isRuleWhiteSpace(c)) { // Ignore leading whitespace if (name.length() > 0 && name.charAt(name.length()-1) != SPACE) { name.append(SPACE); // If we are too long then abort. maxLen includes // temporary trailing space, so use '>'. 
if (name.length() > maxLen) { mode = 0; } } break; } if (c == CLOSE_DELIM) { int32_t len = name.length(); // Delete trailing space, if any if (len > 0 && name.charAt(len-1) == SPACE) { --len; } if (uprv_isInvariantUString(name.getBuffer(), len)) { name.extract(0, len, cbuf, maxLen, US_INV); UErrorCode status = U_ZERO_ERROR; c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status); if (U_SUCCESS(status)) { // Lookup succeeded // assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1); cursor++; // advance over CLOSE_DELIM str.truncate(0); str.append(c); text.handleReplaceBetween(openPos, cursor, str); // Adjust indices for the change in the length of // the string. Do not assume that str.length() == // 1, in case of surrogates. int32_t delta = cursor - openPos - str.length(); cursor -= delta; limit -= delta; // assert(cursor == openPos + str.length()); } } // If the lookup failed, we leave things as-is and // still switch to mode 0 and continue. mode = 0; openPos = -1; // close off candidate continue; // *** reprocess char32At(cursor) } // Check if c is a legal char. We assume here that // legal.contains(OPEN_DELIM) is FALSE, so when we abort a // name, we don't have to go back to openPos+1. if (legal.contains(c)) { name.append(c); // If we go past the longest possible name then abort. // maxLen includes temporary trailing space, so use '>='. if (name.length() >= maxLen) { mode = 0; } } // Invalid character else { --cursor; // Backup and reprocess this character mode = 0; } break; } cursor += UTF_CHAR_LENGTH(c); } offsets.contextLimit += limit - offsets.limit; offsets.limit = limit; // In incremental mode, only advance the cursor up to the last // open delimiter candidate. offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor; uprv_free(cbuf); }
/**
 * Replace the text between start and limit by this replacer's output and,
 * if this replacer has a cursor, report the new cursor position.
 *
 * @param text   the text to modify
 * @param start  start of the text to be replaced (the key)
 * @param limit  limit of the text to be replaced
 * @param cursor written only if hasCursor is set: receives the cursor
 *               position as an index into text. It is also handed on to
 *               nested replacers.
 * @return the length of the replacement text, in code units
 */
int32_t StringReplacer::replace(Replaceable& text,
                                int32_t start,
                                int32_t limit,
                                int32_t& cursor) {
    int32_t outLen;
    int32_t newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower.  If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex) {
        text.handleReplaceBetween(start, limit, output);
        outLen = output.length();

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data.  Copy everything to the
         * end of the string, then copy them back over the key.  This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        UnicodeString buf;
        int32_t oOutput; // offset into 'output'
        // Recomputed below: set back to TRUE as soon as a nested replacer
        // is found in the output.
        isComplex = FALSE;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit.  The start of the buffer has a single
        // character from before the key.  This provides style
        // data when addition characters are filled into the
        // temporary buffer.  If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int32_t tempStart = text.length(); // start of temp buffer
        int32_t destStart = tempStart; // copy new text to here
        if (start > 0) {
            int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
            text.copy(start-len, start, tempStart);
            destStart += len;
        } else {
            UnicodeString str((UChar) 0xFFFF);
            text.handleReplaceBetween(tempStart, tempStart, str);
            destStart++;
        }
        int32_t destLimit = destStart;

        for (oOutput=0; oOutput<output.length(); ) {
            if (oOutput == cursorPos) {
                // Record the position of the cursor
                newStart = destLimit - destStart; // relative to start
            }
            UChar32 c = output.char32At(oOutput);
            UnicodeReplacer* r = data->lookupReplacer(c);
            if (r == NULL) {
                // Accumulate straight (non-segment) text.
                buf.append(c);
            } else {
                isComplex = TRUE;

                // Insert any accumulated straight text.
                if (buf.length() > 0) {
                    text.handleReplaceBetween(destLimit, destLimit, buf);
                    destLimit += buf.length();
                    buf.truncate(0);
                }

                // Delegate output generation to replacer object
                int32_t len = r->replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            oOutput += UTF_CHAR_LENGTH(c);
        }
        // Insert any accumulated straight text.
        if (buf.length() > 0) {
            text.handleReplaceBetween(destLimit, destLimit, buf);
            destLimit += buf.length();
        }
        // The cursor may also sit exactly at the end of the output.
        if (oOutput == cursorPos) {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        // (the copy inserts outLen units at start, so everything behind
        // it, including the temporary buffer, has moved up by outLen)
        text.copy(destStart, destLimit, start);
        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);

        // Delete the old text (the key)
        text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
    }

    if (hasCursor) {
        // Adjust the cursor for positions outside the key.  These
        // refer to code points rather than code units.  If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0) {
            newStart = start;
            int32_t n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0) {
                newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
                ++n;
            }
            // n is still negative here if the beginning of the text was
            // reached first; the remainder is applied as is.
            newStart += n;
        } else if (cursorPos > output.length()) {
            newStart = start + outLen;
            int32_t n = cursorPos - output.length();
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.length()) {
                newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
                --n;
            }
            // n is still positive here if the end of the text was
            // reached first; the remainder is applied as is.
            newStart += n;
        } else {
            // Cursor is within output string.  It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor = newStart;
    }

    return outLen;
}
/** * Implements {@link Transliterator#handleTransliterate}. */ void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, UBool isIncremental) const { int32_t start = pos.start; int32_t limit = pos.limit; int32_t i, j, ipat; while (start < limit) { // Loop over the forms in spec[]. Exit this loop when we // match one of the specs. Exit the outer loop if a // partial match is detected and isIncremental is true. for (j=0, ipat=0; spec[ipat] != END; ++j) { // Read the header int32_t prefixLen = spec[ipat++]; int32_t suffixLen = spec[ipat++]; int8_t radix = (int8_t) spec[ipat++]; int32_t minDigits = spec[ipat++]; int32_t maxDigits = spec[ipat++]; // s is a copy of start that is advanced over the // characters as we parse them. int32_t s = start; UBool match = TRUE; for (i=0; i<prefixLen; ++i) { if (s >= limit) { if (i > 0) { // We've already matched a character. This is // a partial match, so we return if in // incremental mode. In non-incremental mode, // go to the next spec. if (isIncremental) { goto exit; } match = FALSE; break; } } UChar c = text.charAt(s++); if (c != spec[ipat + i]) { match = FALSE; break; } } if (match) { UChar32 u = 0; int32_t digitCount = 0; for (;;) { if (s >= limit) { // Check for partial match in incremental mode. if (s > start && isIncremental) { goto exit; } break; } UChar32 ch = text.char32At(s); int32_t digit = u_digit(ch, radix); if (digit < 0) { break; } s += UTF_CHAR_LENGTH(ch); u = (u * radix) + digit; if (++digitCount == maxDigits) { break; } } match = (digitCount >= minDigits); if (match) { for (i=0; i<suffixLen; ++i) { if (s >= limit) { // Check for partial match in incremental mode. 
if (s > start && isIncremental) { goto exit; } match = FALSE; break; } UChar c = text.charAt(s++); if (c != spec[ipat + prefixLen + i]) { match = FALSE; break; } } if (match) { // At this point, we have a match UnicodeString str(u); text.handleReplaceBetween(start, s, str); limit -= s - start - str.length(); // The following break statement leaves the // loop that is traversing the forms in // spec[]. We then parse the next input // character. break; } } } ipat += prefixLen + suffixLen; } if (start < limit) { start += UTF_CHAR_LENGTH(text.char32At(start)); } } exit: pos.contextLimit += limit - pos.limit; pos.limit = limit; pos.start = start; }
/*
 * Convert a UTF-8 string to UTF-16.
 *
 * dest/destCapacity  output buffer and its capacity in UChars; dest may
 *                    be NULL only when destCapacity is 0
 * pDestLength        if not NULL, receives the number of UChars the full
 *                    output requires (whether or not it fit)
 * src/srcLength      UTF-8 input; srcLength == -1 means NUL-terminated
 * pErrorCode         must not be NULL and must not already indicate a
 *                    failure; set to U_ILLEGAL_ARGUMENT_ERROR for bad
 *                    arguments and to U_INVALID_CHAR_FOUND when the input
 *                    contains an invalid sequence
 *
 * Returns dest, or NULL when an error is detected here.  Termination of
 * the buffer is delegated to u_terminateUChars().
 *
 * No pointer arithmetic is done on dest before the arguments have been
 * validated; the destination is addressed by index, so a NULL dest with
 * zero capacity is never offset or dereferenced.
 */
U_CAPI UChar* U_EXPORT2
u_strFromUTF8(UChar *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,
              UErrorCode *pErrorCode){

    const uint8_t *pSrc;
    UChar32 ch=0;
    int32_t index = 0;      /* read position in src */
    int32_t destIndex = 0;  /* number of UChars written to dest */
    int32_t reqLength = 0;  /* UChars needed beyond those written */

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    pSrc = (const uint8_t *)src;

    if(srcLength == -1){
        srcLength = (int32_t)uprv_strlen(src);
    }

    /* convert while there is room in the destination */
    while((index < srcLength)&&(destIndex < destCapacity)){
        ch = pSrc[index++];
        if(ch <=0x7f){
            dest[destIndex++]=(UChar)ch;
        }else{
            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
            if(ch<0){
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }else if(ch<=0xFFFF){
                dest[destIndex++]=(UChar)ch;
            }else{
                dest[destIndex++]=UTF16_LEAD(ch);
                if(destIndex < destCapacity){
                    dest[destIndex++]=UTF16_TRAIL(ch);
                }else{
                    /* no room for the trail surrogate: count it and stop writing */
                    reqLength++;
                    break;
                }
            }
        }
    }

    /* do not fill the dest buffer, just count the UChars needed */
    while(index < srcLength){
        ch = pSrc[index++];
        if(ch <= 0x7f){
            reqLength++;
        }else{
            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
            if(ch<0){
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
            reqLength+=UTF_CHAR_LENGTH(ch);
        }
    }

    reqLength+=destIndex;

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}
/**
 * Return the index of the code point that precedes pos in str.
 * For pos <= 0 there is no preceding code point and pos - 1 is returned.
 */
static inline int32_t posBefore(const Replaceable& str, int32_t pos) {
    if (pos <= 0) {
        return pos - 1;
    }
    return pos - UTF_CHAR_LENGTH(str.char32At(pos-1));
}
void Transliterator::_transliterate(Replaceable& text, UTransPosition& index, const UnicodeString* insertion, UErrorCode &status) const { if (U_FAILURE(status)) { return; } if (!positionIsValid(index, text.length())) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } // int32_t originalStart = index.contextStart; if (insertion != 0) { text.handleReplaceBetween(index.limit, index.limit, *insertion); index.limit += insertion->length(); index.contextLimit += insertion->length(); } if (index.limit > 0 && UTF_IS_LEAD(text.charAt(index.limit - 1))) { // Oops, there is a dangling lead surrogate in the buffer. // This will break most transliterators, since they will // assume it is part of a pair. Don't transliterate until // more text comes in. return; } filteredTransliterate(text, index, TRUE, TRUE); #if 0 // TODO // I CAN'T DO what I'm attempting below now that the Kleene star // operator is supported. For example, in the rule // ([:Lu:]+) { x } > $1; // what is the maximum context length? getMaximumContextLength() // will return 1, but this is just the length of the ante context // part of the pattern string -- 1 character, which is a standin // for a Quantifier, which contains a StringMatcher, which // contains a UnicodeSet. // There is a complicated way to make this work again, and that's // to add a "maximum left context" protocol into the // UnicodeMatcher hierarchy. At present I'm not convinced this is // worth it. // --- // The purpose of the code below is to keep the context small // while doing incremental transliteration. When part of the left // context (between contextStart and start) is no longer needed, // we try to advance contextStart past that portion. We use the // maximum context length to do so. int32_t newCS = index.start; int32_t n = getMaximumContextLength(); while (newCS > originalStart && n-- > 0) { --newCS; newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1; } index.contextStart = uprv_max(newCS, originalStart); #endif }