コード例 #1
0
ファイル: uparse.c プロジェクト: andrewleech/firebird
/*
 * parse a list of code points
 * store them as a string in dest[destCapacity]
 * set the first code point in *pFirst
 * @return The length of the string in numbers of UChars.
 */
U_CAPI int32_t U_EXPORT2
u_parseString(const char *s,
              UChar *dest, int32_t destCapacity,
              uint32_t *pFirst,
              UErrorCode *pErrorCode) {
    char *end;
    uint32_t value;
    int32_t destLength;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    }

    if(pFirst!=NULL) {
        *pFirst=0xffffffff;
    }

    destLength=0;
    for(;;) {
        s=u_skipWhitespace(s);
        if(*s==';' || *s==0) {
            if(destLength<destCapacity) {
                dest[destLength]=0;
            } else if(destLength==destCapacity) {
                *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
            } else {
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            }
            return destLength;
        }

        /* read one code point */
        value=(uint32_t)uprv_strtoul(s, &end, 16);
        if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || value>=0x110000) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }

        /* store the first code point */
        if(destLength==0 && pFirst!=NULL) {
            *pFirst=value;
        }

        /* append it to the destination array */
        if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) {
            UTF_APPEND_CHAR_UNSAFE(dest, destLength, value);
        } else {
            destLength+=UTF_CHAR_LENGTH(value);
        }

        /* go to the following characters */
        s=end;
    }
}
コード例 #2
0
ファイル: ustring.c プロジェクト: mathtexts/uimacpp
/* Do an invariant conversion of char* -> UChar*, with escape parsing */
U_CAPI int32_t U_EXPORT2
u_unescape(const char* src, UChar* dest, int32_t destCapacity) {
    const char* segment = src;
    int32_t i = 0;
    char c;

    while ((c = *src) != 0) {
        /* '\\' intentionally written as compiler-specific
         * character constant to correspond to compiler-specific
         * char* constants. */
        if (c == '\\') {
            int32_t lenParsed = 0;
            UChar32 c32;
            if (src != segment) {
                if (dest != NULL) {
                    _appendUChars(dest + i, destCapacity - i,
                                  segment, src - segment);
                }
                i += src - segment;
            }
            ++src; /* advance past '\\' */
            c32 = u_unescapeAt(_charPtr_charAt, &lenParsed, uprv_strlen(src), (void*) src);
            if (lenParsed == 0) {
                goto err;
            }
            src += lenParsed; /* advance past escape seq. */
            if (dest != NULL && UTF_CHAR_LENGTH(c32) <= (destCapacity - i)) {
                UTF_APPEND_CHAR_UNSAFE(dest, i, c32);
            } else {
                i += UTF_CHAR_LENGTH(c32);
            }
            segment = src;
        } else {
            ++src;
        }
    }
    if (src != segment) {
        if (dest != NULL) {
            _appendUChars(dest + i, destCapacity - i,
                          segment, src - segment);
        }
        i += src - segment;
    }
    if (dest != NULL && i < destCapacity) {
        dest[i] = 0;
    }
    return i;

    err:
    if (dest != NULL && destCapacity > 0) {
        *dest = 0;
    }
    return 0;
}
コード例 #3
0
ファイル: util_props.cpp プロジェクト: venkatarajasekhar/Qt
/**
 * Parse a Unicode identifier from the given string at the given
 * position.  Return the identifier, or an empty string if there
 * is no identifier.
 * @param str the string to parse
 * @param pos INPUT-OUPUT parameter.  On INPUT, pos is the
 * first character to examine.  It must be less than str.length(),
 * and it must not point to a whitespace character.  That is, must
 * have pos < str.length() and
 * !uprv_isRuleWhiteSpace(str.char32At(pos)).  On
 * OUTPUT, the position after the last parsed character.
 * @return the Unicode identifier, or an empty string if there is
 * no valid identifier at pos.
 */
UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
    // assert(pos < str.length());
    // assert(!uprv_isRuleWhiteSpace(str.char32At(pos)));
    UnicodeString buf;
    int p = pos;
    while (p < str.length()) {
        UChar32 ch = str.char32At(p);
        if (buf.length() == 0) {
            if (u_isIDStart(ch)) {
                buf.append(ch);
            } else {
                buf.truncate(0);
                return buf;
            }
        } else {
            if (u_isIDPart(ch)) {
                buf.append(ch);
            } else {
                break;
            }
        }
        p += UTF_CHAR_LENGTH(ch);
    }
    pos = p;
    return buf;
}
コード例 #4
0
ファイル: rbt_set.cpp プロジェクト: Botyto/Core
/**
 * Transliterate the given text with the given UTransPosition
 * indices.  Return TRUE if the transliteration should continue
 * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
 * Note that FALSE is only ever returned if isIncremental is TRUE.
 * @param text the text to be transliterated
 * @param pos the position indices, which will be updated
 * @param incremental if TRUE, assume new text may be inserted
 * at index.limit, and return FALSE if thre is a partial match.
 * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
 * indicating that transliteration should stop until more text
 * arrives.
 */
UBool TransliterationRuleSet::transliterate(Replaceable & text,
        UTransPosition & pos,
        UBool incremental)
{
	int16_t indexByte = (int16_t)(text.char32At(pos.start) & 0xFF);
	for (int32_t i = index[indexByte]; i < index[indexByte + 1]; ++i)
	{
		UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);
		switch (m)
		{
		case U_MATCH:
			_debugOut("match", rules[i], text, pos);
			return TRUE;
		case U_PARTIAL_MATCH:
			_debugOut("partial match", rules[i], text, pos);
			return FALSE;
		default: /* Ram: added default to make GCC happy */
			break;
		}
	}
	// No match or partial match from any rule
	pos.start += UTF_CHAR_LENGTH(text.char32At(pos.start));
	_debugOut("no match", NULL, text, pos);
	return TRUE;
}
コード例 #5
0
ファイル: rbt_set.cpp プロジェクト: Botyto/Core
// Replace nonprintable characters with unicode escapes
UnicodeString & _escape(const UnicodeString & source,
                        UnicodeString & target)
{
	for (int32_t i = 0; i < source.length();)
	{
		UChar32 ch = source.char32At(i);
		i += UTF_CHAR_LENGTH(ch);
		if (ch < 0x09 || (ch > 0x0A && ch < 0x20) || ch > 0x7E)
		{
			if (ch <= 0xFFFF)
			{
				target += "\\u";
				_appendHex(ch, 4, target);
			}
			else
			{
				target += "\\U";
				_appendHex(ch, 8, target);
			}
		}
		else
		{
			target += ch;
		}
	}
	return target;
}
コード例 #6
0
ファイル: ruleiter.cpp プロジェクト: Abocer/android-4.2_r1
void RuleCharacterIterator::skipIgnored(int32_t options) {
    if ((options & SKIP_WHITESPACE) != 0) {
        for (;;) {
            UChar32 a = _current();
            if (!PatternProps::isWhiteSpace(a)) break;
            _advance(UTF_CHAR_LENGTH(a));
        }
    }
}
コード例 #7
0
ファイル: normlzr.cpp プロジェクト: venkatarajasekhar/Qt
/**
 * Return the next character in the normalized text and advance
 * the iteration position by one.  If the end
 * of the text has already been reached, {@link #DONE} is returned.
 */
UChar32 Normalizer::next() {
    if(bufferPos<buffer.length() ||  nextNormalize()) {
        UChar32 c=buffer.char32At(bufferPos);
        bufferPos+=UTF_CHAR_LENGTH(c);
        return c;
    } else {
        return DONE;
    }
}
コード例 #8
0
ファイル: normlzr.cpp プロジェクト: venkatarajasekhar/Qt
/**
 * Return the previous character in the normalized text and decrement
 * the iteration position by one.  If the beginning
 * of the text has already been reached, {@link #DONE} is returned.
 */
UChar32 Normalizer::previous() {
    if(bufferPos>0 || previousNormalize()) {
        UChar32 c=buffer.char32At(bufferPos-1);
        bufferPos-=UTF_CHAR_LENGTH(c);
        return c;
    } else {
        return DONE;
    }
}
コード例 #9
0
ファイル: uni2name.cpp プロジェクト: Botyto/Core
/**
 * Implements {@link Transliterator#handleTransliterate}.
 * Ignore isIncremental since we don't need the context, and
 * we work on codepoints.
 */
void UnicodeNameTransliterator::handleTransliterate(Replaceable & text, UTransPosition & offsets,
        UBool /*isIncremental*/) const
{
	// The failure mode, here and below, is to behave like Any-Null,
	// if either there is no name data (max len == 0) or there is no
	// memory (malloc() => NULL).

	int32_t maxLen = uprv_getMaxCharNameLength();
	if (maxLen == 0)
	{
		offsets.start = offsets.limit;
		return;
	}

	// Accomodate the longest possible name plus padding
	char * buf = (char *) uprv_malloc(maxLen);
	if (buf == NULL)
	{
		offsets.start = offsets.limit;
		return;
	}

	int32_t cursor = offsets.start;
	int32_t limit = offsets.limit;

	UnicodeString str(FALSE, OPEN_DELIM, OPEN_DELIM_LEN);
	UErrorCode status;
	int32_t len;

	while (cursor < limit)
	{
		UChar32 c = text.char32At(cursor);
		int32_t clen = UTF_CHAR_LENGTH(c);
		status = U_ZERO_ERROR;
		if ((len = u_charName(c, U_EXTENDED_CHAR_NAME, buf, maxLen, &status)) > 0 && !U_FAILURE(status))
		{
			str.truncate(OPEN_DELIM_LEN);
			str.append(UnicodeString(buf, len, US_INV)).append(CLOSE_DELIM);
			text.handleReplaceBetween(cursor, cursor + clen, str);
			len += OPEN_DELIM_LEN + 1; // adjust for delimiters
			cursor += len; // advance cursor and adjust for new text
			limit += len - clen; // change in length
		}
		else
		{
			cursor += clen;
		}
	}

	offsets.contextLimit += limit - offsets.limit;
	offsets.limit = limit;
	offsets.start = cursor;

	uprv_free(buf);
}
コード例 #10
0
ファイル: strmatch.cpp プロジェクト: 0x4d52/JavaScriptCore-X
/**
 * Implement UnicodeMatcher
 */
void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    UChar32 ch;
    for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) {
        ch = pattern.char32At(i);
        const UnicodeMatcher* matcher = data->lookupMatcher(ch);
        if (matcher == NULL) {
            toUnionTo.add(ch);
        } else {
            matcher->addMatchSetTo(toUnionTo);
        }
    }
}
コード例 #11
0
ファイル: strmatch.cpp プロジェクト: 0x4d52/JavaScriptCore-X
/**
 * Implement UnicodeFunctor
 */
void StringMatcher::setData(const TransliterationRuleData* d) {
    data = d;
    int32_t i = 0;
    while (i<pattern.length()) {
        UChar32 c = pattern.char32At(i);
        UnicodeFunctor* f = data->lookup(c);
        if (f != NULL) {
            f->setData(data);
        }
        i += UTF_CHAR_LENGTH(c);
    }    
}
コード例 #12
0
ファイル: strrepl.cpp プロジェクト: LittoCats/OT_4010D
void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    UChar32 ch;
    for (int32_t i=0; i<output.length(); i+=UTF_CHAR_LENGTH(ch)) {
    ch = output.char32At(i);
    UnicodeReplacer* r = data->lookupReplacer(ch);
    if (r == NULL) {
        toUnionTo.add(ch);
    } else {
        r->addReplacementSetTo(toUnionTo);
    }
    }
}
コード例 #13
0
ファイル: ruleiter.cpp プロジェクト: Abocer/android-4.2_r1
UChar32 RuleCharacterIterator::next(int32_t options, UBool& isEscaped, UErrorCode& ec) {
    if (U_FAILURE(ec)) return DONE;

    UChar32 c = DONE;
    isEscaped = FALSE;

    for (;;) {
        c = _current();
        _advance(UTF_CHAR_LENGTH(c));

        if (c == SymbolTable::SYMBOL_REF && buf == 0 &&
            (options & PARSE_VARIABLES) != 0 && sym != 0) {
            UnicodeString name = sym->parseReference(text, pos, text.length());
            // If name is empty there was an isolated SYMBOL_REF;
            // return it.  Caller must be prepared for this.
            if (name.length() == 0) {
                break;
            }
            bufPos = 0;
            buf = sym->lookup(name);
            if (buf == 0) {
                ec = U_UNDEFINED_VARIABLE;
                return DONE;
            }
            // Handle empty variable value
            if (buf->length() == 0) {
                buf = 0;
            }
            continue;
        }

        if ((options & SKIP_WHITESPACE) != 0 && PatternProps::isWhiteSpace(c)) {
            continue;
        }

        if (c == 0x5C /*'\\'*/ && (options & PARSE_ESCAPES) != 0) {
            UnicodeString tempEscape;
            int32_t offset = 0;
            c = lookahead(tempEscape, MAX_U_NOTATION_LEN).unescapeAt(offset);
            jumpahead(offset);
            isEscaped = TRUE;
            if (c < 0) {
                ec = U_MALFORMED_UNICODE_ESCAPE;
                return DONE;
            }
        }

        break;
    }

    return c;
}
コード例 #14
0
/**
 * Union the set of all characters that may be modified by this rule
 * into the given set.
 */
void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
    int32_t limit = anteContextLength + keyLength;
    for (int32_t i=anteContextLength; i<limit; ) {
        UChar32 ch = pattern.char32At(i);
        i += UTF_CHAR_LENGTH(ch);
        const UnicodeMatcher* matcher = data->lookupMatcher(ch);
        if (matcher == NULL) {
            toUnionTo.add(ch);
        } else {
            matcher->addMatchSetTo(toUnionTo);
        }
    }
}
コード例 #15
0
ファイル: esctrn.cpp プロジェクト: Botyto/Core
/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void EscapeTransliterator::handleTransliterate(Replaceable & text,
        UTransPosition & pos,
        UBool /*isIncremental*/) const
{
	/* TODO: Verify that isIncremental can be ignored */
	int32_t start = pos.start;
	int32_t limit = pos.limit;

	UnicodeString buf(prefix);
	int32_t prefixLen = prefix.length();
	UBool redoPrefix = FALSE;

	while (start < limit)
	{
		int32_t c = grokSupplementals ? text.char32At(start) : text.charAt(start);
		int32_t charLen = grokSupplementals ? UTF_CHAR_LENGTH(c) : 1;

		if ((c & 0xFFFF0000) != 0 && supplementalHandler != NULL)
		{
			buf.truncate(0);
			buf.append(supplementalHandler->prefix);
			ICU_Utility::appendNumber(buf, c, supplementalHandler->radix,
			                          supplementalHandler->minDigits);
			buf.append(supplementalHandler->suffix);
			redoPrefix = TRUE;
		}
		else
		{
			if (redoPrefix)
			{
				buf.truncate(0);
				buf.append(prefix);
				redoPrefix = FALSE;
			}
			else
			{
				buf.truncate(prefixLen);
			}
			ICU_Utility::appendNumber(buf, c, radix, minDigits);
			buf.append(suffix);
		}

		text.handleReplaceBetween(start, start + charLen, buf);
		start += buf.length();
		limit += buf.length() - charLen;
	}

	pos.contextLimit += limit - pos.limit;
	pos.limit = limit;
	pos.start = start;
}
コード例 #16
0
ファイル: toupptrn.cpp プロジェクト: gitpan/ponie
/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void UppercaseTransliterator::handleTransliterate(Replaceable& text,
                                 UTransPosition& offsets, 
                                 UBool isIncremental) const {
    int32_t textPos = offsets.start;
    if (textPos >= offsets.limit) return;

    // get string for context
    
    UnicodeString original;
    text.extractBetween(offsets.contextStart, offsets.contextLimit, original);
    
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);
    iter.start = offsets.contextStart;
    iter.limit = offsets.contextLimit;
            
    // Walk through original string
    // If there is a case change, modify corresponding position in replaceable
    
    int32_t i = textPos - offsets.contextStart;
    int32_t limit = offsets.limit - offsets.contextStart;
    UChar32 cp;
    int32_t oldLen;
    
    for (; i < limit; ) {
        UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
        oldLen = UTF_CHAR_LENGTH(cp);
        i += oldLen;
        iter.index = i; // Point _past_ current char
        int32_t newLen = u_internalToUpper(cp, &iter, buffer, u_getMaxCaseExpansion(), loc.getName());
        if (newLen >= 0) {
            UnicodeString temp(buffer, newLen);
            text.handleReplaceBetween(textPos, textPos + oldLen, temp);
            if (newLen != oldLen) {
                textPos += newLen;
                offsets.limit += newLen - oldLen;
                offsets.contextLimit += newLen - oldLen;
                continue;
            }
        }
        textPos += oldLen;
    }
    offsets.start = offsets.limit;
}
コード例 #17
0
UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
                                       UBool escapeUnprintable) const {
    // The base class implementation of toRules munges the ID into
    // the correct format.  That is: foo => ::foo
    if (escapeUnprintable) {
        rulesSource.truncate(0);
        UnicodeString id = getID();
        for (int32_t i=0; i<id.length();) {
            UChar32 c = id.char32At(i);
            if (!ICU_Utility::escapeUnprintable(rulesSource, c)) {
                rulesSource.append(c);
            }
            i += UTF_CHAR_LENGTH(c);
        }
    } else {
        rulesSource = getID();
    }
    // KEEP in sync with rbt_pars
    rulesSource.insert(0, UNICODE_STRING_SIMPLE("::"));
    rulesSource.append(ID_DELIM);
    return rulesSource;
}
コード例 #18
0
ファイル: ustrtrns.c プロジェクト: S0043640wipro/RiCRiPInt
U_CAPI UChar* U_EXPORT2
u_strFromUTF32(UChar   *dest,
               int32_t destCapacity,
               int32_t *pDestLength,
               const UChar32 *src,
               int32_t srcLength,
               UErrorCode *pErrorCode)
{
    int32_t reqLength = 0;
    uint32_t ch =0;
    UChar *pDestLimit =dest+destCapacity;
    UChar *pDest = dest;
    const uint32_t *pSrc = (const uint32_t *)src;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

     /* Check if the source is null terminated */
    if(srcLength == -1 ){
        while(((ch=*pSrc)!=0) && (pDest < pDestLimit)){
            ++pSrc;
            if(ch<=0xFFFF){
                *(pDest++)=(UChar)ch;
            }else if(ch<=0x10ffff){
                *(pDest++)=UTF16_LEAD(ch);
                if(pDest<pDestLimit){
                    *(pDest++)=UTF16_TRAIL(ch);
                }else{
                    reqLength++;
                    break;
                }
            }else{
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
        while((ch=*pSrc++) != 0){
            reqLength+=UTF_CHAR_LENGTH(ch);
        }
    }else{
        const uint32_t* pSrcLimit = ((const uint32_t*)pSrc) + srcLength;
        while((pSrc < pSrcLimit) && (pDest < pDestLimit)){
            ch = *pSrc++;
            if(ch<=0xFFFF){
                *(pDest++)=(UChar)ch;
            }else if(ch<=0x10FFFF){
                *(pDest++)=UTF16_LEAD(ch);
                if(pDest<pDestLimit){
                    *(pDest++)=UTF16_TRAIL(ch);
                }else{
                    reqLength++;
                    break;
                }
            }else{
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
        while(pSrc <pSrcLimit){
            ch = *pSrc++;
            reqLength+=UTF_CHAR_LENGTH(ch);
        }
    }

    reqLength += (int32_t)(pDest - dest);
    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}
コード例 #19
0
ファイル: titletrn.cpp プロジェクト: gitpan/ponie
/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void TitlecaseTransliterator::handleTransliterate(
                                  Replaceable& text, UTransPosition& offsets,
                                  UBool isIncremental) const {
    if (SKIP == NULL) {
        return;
    }

    // Our mode; we are either converting letter toTitle or
    // toLower.
    UBool doTitle = TRUE;
    
    // Determine if there is a preceding context of CASED SKIP*,
    // in which case we want to start in toLower mode.  If the
    // prior context is anything else (including empty) then start
    // in toTitle mode.
    UChar32 c;
    int32_t start;
    for (start = offsets.start - 1; start >= offsets.contextStart; start -= UTF_CHAR_LENGTH(c)) {
        c = text.char32At(start);
        if (SKIP->contains(c)) {
            continue;
        }
        doTitle = !CASED->contains(c);
        break;
    }
    
    // Convert things after a CASED character toLower; things
    // after a non-CASED, non-SKIP character toTitle.  SKIP
    // characters are copied directly and do not change the mode.
    int32_t textPos = offsets.start;
    if (textPos >= offsets.limit) return;

    UnicodeString original;
    text.extractBetween(offsets.contextStart, offsets.contextLimit, original);

    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);
    iter.start = offsets.contextStart;
    iter.limit = offsets.contextLimit;

    // Walk through original string
    // If there is a case change, modify corresponding position in replaceable

    int32_t i = textPos - offsets.contextStart;
    int32_t limit = offsets.limit - offsets.contextStart;
    UChar32 cp;
    int32_t oldLen;
    int32_t newLen;

    for (; i < limit; ) {
        UTF_GET_CHAR(original.getBuffer(), 0, i, original.length(), cp);
        oldLen = UTF_CHAR_LENGTH(cp);
        i += oldLen;
        iter.index = i; // Point _past_ current char
        if (!SKIP->contains(cp)) {
            if (doTitle) {
                newLen = u_internalToTitle(cp, &iter, buffer, u_getMaxCaseExpansion(), loc.getName());
            } else {
                newLen = u_internalToLower(cp, &iter, buffer, u_getMaxCaseExpansion(), loc.getName());
            }
            doTitle = !CASED->contains(cp);
            if (newLen >= 0) {
                UnicodeString temp(buffer, newLen);
                text.handleReplaceBetween(textPos, textPos + oldLen, temp);
                if (newLen != oldLen) {
                    textPos += newLen;
                    offsets.limit += newLen - oldLen;
                    offsets.contextLimit += newLen - oldLen;
                    continue;
                }
            }
        }
        textPos += oldLen;
    }
    offsets.start = offsets.limit;
}
コード例 #20
0
static inline int32_t posAfter(const Replaceable& str, int32_t pos) {
    return (pos >= 0 && pos < str.length()) ?
        pos + UTF_CHAR_LENGTH(str.char32At(pos)) :
        pos + 1;
}
コード例 #21
0
void Transliterator::filteredTransliterate(Replaceable& text,
                                           UTransPosition& index,
                                           UBool incremental,
                                           UBool rollback) const {
    // Short circuit path for transliterators with no filter in
    // non-incremental mode.
    if (filter == 0 && !rollback) {
        handleTransliterate(text, index, incremental);
        return;
    }

    //----------------------------------------------------------------------
    // This method processes text in two groupings:
    //
    // RUNS -- A run is a contiguous group of characters which are contained
    // in the filter for this transliterator (filter.contains(ch) == TRUE).
    // Text outside of runs may appear as context but it is not modified.
    // The start and limit Position values are narrowed to each run.
    //
    // PASSES (incremental only) -- To make incremental mode work correctly,
    // each run is broken up into n passes, where n is the length (in code
    // points) of the run.  Each pass contains the first n characters.  If a
    // pass is completely transliterated, it is committed, and further passes
    // include characters after the committed text.  If a pass is blocked,
    // and does not transliterate completely, then this method rolls back
    // the changes made during the pass, extends the pass by one code point,
    // and tries again.
    //----------------------------------------------------------------------
    
    // globalLimit is the limit value for the entire operation.  We
    // set index.limit to the end of each unfiltered run before
    // calling handleTransliterate(), so we need to maintain the real
    // value of index.limit here.  After each transliteration, we
    // update globalLimit for insertions or deletions that have
    // happened.
    int32_t globalLimit = index.limit;
    
    // If there is a non-null filter, then break the input text up.  Say the
    // input text has the form:
    //   xxxabcxxdefxx
    // where 'x' represents a filtered character (filter.contains('x') ==
    // false).  Then we break this up into:
    //   xxxabc xxdef xx
    // Each pass through the loop consumes a run of filtered
    // characters (which are ignored) and a subsequent run of
    // unfiltered characters (which are transliterated).
    
    for (;;) {

        if (filter != NULL) {
            // Narrow the range to be transliterated to the first segment
            // of unfiltered characters at or after index.start.

            // Advance past filtered chars
            UChar32 c;
            while (index.start < globalLimit &&
                   !filter->contains(c=text.char32At(index.start))) {
                index.start += UTF_CHAR_LENGTH(c);
            }

            // Find the end of this run of unfiltered chars
            index.limit = index.start;
            while (index.limit < globalLimit &&
                   filter->contains(c=text.char32At(index.limit))) {
                index.limit += UTF_CHAR_LENGTH(c);
            }
        }

        // Check to see if the unfiltered run is empty.  This only
        // happens at the end of the string when all the remaining
        // characters are filtered.
        if (index.limit == index.start) {
            // assert(index.start == globalLimit);
            break;
        }

        // Is this run incremental?  If there is additional
        // filtered text (if limit < globalLimit) then we pass in
        // an incremental value of FALSE to force the subclass to
        // complete the transliteration for this run.
        UBool isIncrementalRun =
            (index.limit < globalLimit ? FALSE : incremental);
        
        int32_t delta;

        // Implement rollback.  To understand the need for rollback,
        // consider the following transliterator:
        //
        //  "t" is "a > A;"
        //  "u" is "A > b;"
        //  "v" is a compound of "t; NFD; u" with a filter [:Ll:]
        //
        // Now apply "c" to the input text "a".  The result is "b".  But if
        // the transliteration is done incrementally, then the NFD holds
        // things up after "t" has already transformed "a" to "A".  When
        // finishTransliterate() is called, "A" is _not_ processed because
        // it gets excluded by the [:Ll:] filter, and the end result is "A"
        // -- incorrect.  The problem is that the filter is applied to a
        // partially-transliterated result, when we only want it to apply to
        // input text.  Although this example hinges on a compound
        // transliterator containing NFD and a specific filter, it can
        // actually happen with any transliterator which may do a partial
        // transformation in incremental mode into characters outside its
        // filter.
        //
        // To handle this, when in incremental mode we supply characters to
        // handleTransliterate() in several passes.  Each pass adds one more
        // input character to the input text.  That is, for input "ABCD", we
        // first try "A", then "AB", then "ABC", and finally "ABCD".  If at
        // any point we block (upon return, start < limit) then we roll
        // back.  If at any point we complete the run (upon return start ==
        // limit) then we commit that run.

        if (rollback && isIncrementalRun) {

            int32_t runStart = index.start;
            int32_t runLimit = index.limit;
            int32_t runLength =  runLimit - runStart;

            // Make a rollback copy at the end of the string
            int32_t rollbackOrigin = text.length();
            text.copy(runStart, runLimit, rollbackOrigin);

            // Variables reflecting the commitment of completely
            // transliterated text.  passStart is the runStart, advanced
            // past committed text.  rollbackStart is the rollbackOrigin,
            // advanced past rollback text that corresponds to committed
            // text.
            int32_t passStart = runStart;
            int32_t rollbackStart = rollbackOrigin;

            // The limit for each pass; we advance by one code point with
            // each iteration.
            int32_t passLimit = index.start;

            // Total length, in 16-bit code units, of uncommitted text.
            // This is the length to be rolled back.
            int32_t uncommittedLength = 0;

            // Total delta (change in length) for all passes
            int32_t totalDelta = 0;

            // PASS MAIN LOOP -- Start with a single character, and extend
            // the text by one character at a time.  Roll back partial
            // transliterations and commit complete transliterations.
            for (;;) {
                // Length of additional code point, either one or two
                int32_t charLength =
                    UTF_CHAR_LENGTH(text.char32At(passLimit));
                passLimit += charLength;
                if (passLimit > runLimit) {
                    break;
                }
                uncommittedLength += charLength;

                index.limit = passLimit;

                // Delegate to subclass for actual transliteration.  Upon
                // return, start will be updated to point after the
                // transliterated text, and limit and contextLimit will be
                // adjusted for length changes.
                handleTransliterate(text, index, TRUE);

                delta = index.limit - passLimit; // change in length

                // We failed to completely transliterate this pass.
                // Roll back the text.  Indices remain unchanged; reset
                // them where necessary.
                if (index.start != index.limit) {
                    // Find the rollbackStart, adjusted for length changes
                    // and the deletion of partially transliterated text.
                    int32_t rs = rollbackStart + delta - (index.limit - passStart);

                    // Delete the partially transliterated text
                    text.handleReplaceBetween(passStart, index.limit, EMPTY);

                    // Copy the rollback text back
                    text.copy(rs, rs + uncommittedLength, passStart);

                    // Restore indices to their original values
                    index.start = passStart;
                    index.limit = passLimit;
                    index.contextLimit -= delta;
                }

                // We did completely transliterate this pass.  Update the
                // commit indices to record how far we got.  Adjust indices
                // for length change.
                else {
                    // Move the pass indices past the committed text.
                    passStart = passLimit = index.start;

                    // Adjust the rollbackStart for length changes and move
                    // it past the committed text.  All characters we've
                    // processed to this point are committed now, so zero
                    // out the uncommittedLength.
                    rollbackStart += delta + uncommittedLength;
                    uncommittedLength = 0;

                    // Adjust indices for length changes.
                    runLimit += delta;
                    totalDelta += delta;
                }
            }

            // Adjust overall limit and rollbackOrigin for insertions and
            // deletions.  Don't need to worry about contextLimit because
            // handleTransliterate() maintains that.
            rollbackOrigin += totalDelta;
            globalLimit += totalDelta;

            // Delete the rollback copy
            text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, EMPTY);

            // Move start past committed text
            index.start = passStart;
        }

        else {
            // Delegate to subclass for actual transliteration.
            int32_t limit = index.limit;
            handleTransliterate(text, index, isIncrementalRun);
            delta = index.limit - limit; // change in length

            // In a properly written transliterator, start == limit after
            // handleTransliterate() returns when incremental is false.
            // Catch cases where the subclass doesn't do this, and throw
            // an exception.  (Just pinning start to limit is a bad idea,
            // because what's probably happening is that the subclass
            // isn't transliterating all the way to the end, and it should
            // in non-incremental mode.)
            if (!incremental && index.start != index.limit) {
                // We can't throw an exception, so just fudge things
                index.start = index.limit;
            }

            // Adjust overall limit for insertions/deletions.  Don't need
            // to worry about contextLimit because handleTransliterate()
            // maintains that.
            globalLimit += delta;
        }

        if (filter == NULL || isIncrementalRun) {
            break;
        }

        // If we did completely transliterate this
        // run, then repeat with the next unfiltered run.
    }

    // Start is valid where it is.  Limit needs to be put back where
    // it was, modulo adjustments for deletions/insertions.
    index.limit = globalLimit;
}
コード例 #22
0
/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                    UBool isIncremental) const {
    // The failure mode, here and below, is to behave like Any-Null,
    // if either there is no name data (max len == 0) or there is no
    // memory (malloc() => NULL).

    int32_t maxLen = uprv_getMaxCharNameLength();
    if (maxLen == 0) {
        offsets.start = offsets.limit;
        return;
    }

    // Accomodate the longest possible name
    ++maxLen; // allow for temporary trailing space
    char* cbuf = (char*) uprv_malloc(maxLen);
    if (cbuf == NULL) {
        offsets.start = offsets.limit;
        return;
    }

    UnicodeString openPat(TRUE, OPEN, -1);
    UnicodeString str, name;

    int32_t cursor = offsets.start;
    int32_t limit = offsets.limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int32_t mode = 0;
    int32_t openPos = -1; // open delim candidate pos

    UChar32 c;
    while (cursor < limit) {
        c = text.char32At(cursor);

        switch (mode) {
        case 0: // looking for open delimiter
            if (c == OPEN_DELIM) { // quick check first
                openPos = cursor;
                int32_t i =
                    ICU_Utility::parsePattern(openPat, text, cursor, limit);
                if (i >= 0 && i < limit) {
                    mode = 1;
                    name.truncate(0);
                    cursor = i;
                    continue; // *** reprocess char32At(cursor)
                }
            }
            break;

        case 1: // after open delimiter
            // Look for legal chars.  If \s+ is found, convert it
            // to a single space.  If closeDelimiter is found, exit
            // the loop.  If any other character is found, exit the
            // loop.  If the limit is reached, exit the loop.

            // Convert \s+ => SPACE.  This assumes there are no
            // runs of >1 space characters in names.
            if (uprv_isRuleWhiteSpace(c)) {
                // Ignore leading whitespace
                if (name.length() > 0 &&
                    name.charAt(name.length()-1) != SPACE) {
                    name.append(SPACE);
                    // If we are too long then abort.  maxLen includes
                    // temporary trailing space, so use '>'.
                    if (name.length() > maxLen) {
                        mode = 0;
                    }
                }
                break;
            }

            if (c == CLOSE_DELIM) {
                int32_t len = name.length();

                // Delete trailing space, if any
                if (len > 0 &&
                    name.charAt(len-1) == SPACE) {
                    --len;
                }

                if (uprv_isInvariantUString(name.getBuffer(), len)) {
                    name.extract(0, len, cbuf, maxLen, US_INV);

                    UErrorCode status = U_ZERO_ERROR;
                    c = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
                    if (U_SUCCESS(status)) {
                        // Lookup succeeded

                        // assert(UTF_CHAR_LENGTH(CLOSE_DELIM) == 1);
                        cursor++; // advance over CLOSE_DELIM

                        str.truncate(0);
                        str.append(c);
                        text.handleReplaceBetween(openPos, cursor, str);

                        // Adjust indices for the change in the length of
                        // the string.  Do not assume that str.length() ==
                        // 1, in case of surrogates.
                        int32_t delta = cursor - openPos - str.length();
                        cursor -= delta;
                        limit -= delta;
                        // assert(cursor == openPos + str.length());
                    }
                }
                // If the lookup failed, we leave things as-is and
                // still switch to mode 0 and continue.
                mode = 0;
                openPos = -1; // close off candidate
                continue; // *** reprocess char32At(cursor)
            }
            
            // Check if c is a legal char.  We assume here that
            // legal.contains(OPEN_DELIM) is FALSE, so when we abort a
            // name, we don't have to go back to openPos+1.
            if (legal.contains(c)) {
                name.append(c);
                // If we go past the longest possible name then abort.
                // maxLen includes temporary trailing space, so use '>='.
                if (name.length() >= maxLen) {
                    mode = 0;
                }
            }
            
            // Invalid character
            else {
                --cursor; // Backup and reprocess this character
                mode = 0;
            }

            break;
        }

        cursor += UTF_CHAR_LENGTH(c);
    }
        
    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter candidate.
    offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;

    uprv_free(cbuf);
}
コード例 #23
0
ファイル: strrepl.cpp プロジェクト: LittoCats/OT_4010D
int32_t StringReplacer::replace(Replaceable& text,
                                int32_t start,
                                int32_t limit,
                                int32_t& cursor) {
    int32_t outLen;
    int32_t newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower.  If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex) {
        text.handleReplaceBetween(start, limit, output);
        outLen = output.length();

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data.  Copy everything to the
         * end of the string, then copy them back over the key.  This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        UnicodeString buf;
        int32_t oOutput; // offset into 'output'
        isComplex = FALSE;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit.  The start of the buffer has a single
        // character from before the key.  This provides style
        // data when addition characters are filled into the
        // temporary buffer.  If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int32_t tempStart = text.length(); // start of temp buffer
        int32_t destStart = tempStart; // copy new text to here
        if (start > 0) {
            int32_t len = UTF_CHAR_LENGTH(text.char32At(start-1));
            text.copy(start-len, start, tempStart);
            destStart += len;
        } else {
            UnicodeString str((UChar) 0xFFFF);
            text.handleReplaceBetween(tempStart, tempStart, str);
            destStart++;
        }
        int32_t destLimit = destStart;

        for (oOutput=0; oOutput<output.length(); ) {
            if (oOutput == cursorPos) {
                // Record the position of the cursor
                newStart = destLimit - destStart; // relative to start
            }
            UChar32 c = output.char32At(oOutput);
            UnicodeReplacer* r = data->lookupReplacer(c);
            if (r == NULL) {
                // Accumulate straight (non-segment) text.
                buf.append(c);
            } else {
                isComplex = TRUE;

                // Insert any accumulated straight text.
                if (buf.length() > 0) {
                    text.handleReplaceBetween(destLimit, destLimit, buf);
                    destLimit += buf.length();
                    buf.truncate(0);
                }

                // Delegate output generation to replacer object
                int32_t len = r->replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            oOutput += UTF_CHAR_LENGTH(c);
        }
        // Insert any accumulated straight text.
        if (buf.length() > 0) {
            text.handleReplaceBetween(destLimit, destLimit, buf);
            destLimit += buf.length();
        }
        if (oOutput == cursorPos) {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        text.copy(destStart, destLimit, start);
        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, EMPTY);

        // Delete the old text (the key)
        text.handleReplaceBetween(start + outLen, limit + outLen, EMPTY);
    }        

    if (hasCursor) {
        // Adjust the cursor for positions outside the key.  These
        // refer to code points rather than code units.  If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0) {
            newStart = start;
            int32_t n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0) {
                newStart -= UTF_CHAR_LENGTH(text.char32At(newStart-1));
                ++n;
            }
            newStart += n;
        } else if (cursorPos > output.length()) {
            newStart = start + outLen;
            int32_t n = cursorPos - output.length();
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.length()) {
                newStart += UTF_CHAR_LENGTH(text.char32At(newStart));
                --n;
            }
            newStart += n;
        } else {
            // Cursor is within output string.  It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor = newStart;
    }

    return outLen;
}
コード例 #24
0
ファイル: unesctrn.cpp プロジェクト: 0x4d52/JavaScriptCore-X
/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                                 UBool isIncremental) const {
    int32_t start = pos.start;
    int32_t limit = pos.limit;
    int32_t i, j, ipat;

    while (start < limit) {
        // Loop over the forms in spec[].  Exit this loop when we
        // match one of the specs.  Exit the outer loop if a
        // partial match is detected and isIncremental is true.
        for (j=0, ipat=0; spec[ipat] != END; ++j) {

            // Read the header
            int32_t prefixLen = spec[ipat++];
            int32_t suffixLen = spec[ipat++];
            int8_t  radix     = (int8_t) spec[ipat++];
            int32_t minDigits = spec[ipat++];
            int32_t maxDigits = spec[ipat++];

            // s is a copy of start that is advanced over the
            // characters as we parse them.
            int32_t s = start;
            UBool match = TRUE;

            for (i=0; i<prefixLen; ++i) {
                if (s >= limit) {
                    if (i > 0) {
                        // We've already matched a character.  This is
                        // a partial match, so we return if in
                        // incremental mode.  In non-incremental mode,
                        // go to the next spec.
                        if (isIncremental) {
                            goto exit;
                        }
                        match = FALSE;
                        break;
                    }
                }
                UChar c = text.charAt(s++);
                if (c != spec[ipat + i]) {
                    match = FALSE;
                    break;
                }
            }

            if (match) {
                UChar32 u = 0;
                int32_t digitCount = 0;
                for (;;) {
                    if (s >= limit) {
                        // Check for partial match in incremental mode.
                        if (s > start && isIncremental) {
                            goto exit;
                        }
                        break;
                    }
                    UChar32 ch = text.char32At(s);
                    int32_t digit = u_digit(ch, radix);
                    if (digit < 0) {
                        break;
                    }
                    s += UTF_CHAR_LENGTH(ch);
                    u = (u * radix) + digit;
                    if (++digitCount == maxDigits) {
                        break;
                    }
                }

                match = (digitCount >= minDigits);

                if (match) {
                    for (i=0; i<suffixLen; ++i) {
                        if (s >= limit) {
                            // Check for partial match in incremental mode.
                            if (s > start && isIncremental) {
                                goto exit;
                            }
                            match = FALSE;
                            break;
                        }
                        UChar c = text.charAt(s++);
                        if (c != spec[ipat + prefixLen + i]) {
                            match = FALSE;
                            break;
                        }
                    }

                    if (match) {
                        // At this point, we have a match
                        UnicodeString str(u);
                        text.handleReplaceBetween(start, s, str);
                        limit -= s - start - str.length();
                        // The following break statement leaves the
                        // loop that is traversing the forms in
                        // spec[].  We then parse the next input
                        // character.
                        break;
                    }
                }
            }

            ipat += prefixLen + suffixLen;
        }

        if (start < limit) {
            start += UTF_CHAR_LENGTH(text.char32At(start));
        }
    }

  exit:
    pos.contextLimit += limit - pos.limit;
    pos.limit = limit;
    pos.start = start;
}
コード例 #25
0
ファイル: ustrtrns.c プロジェクト: S0043640wipro/RiCRiPInt
U_CAPI UChar* U_EXPORT2
u_strFromUTF8(UChar *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,
              UErrorCode *pErrorCode){

    UChar *pDest = dest;
    UChar *pDestLimit = dest+destCapacity;
    UChar32 ch=0;
    int32_t index = 0;
    int32_t reqLength = 0;
    uint8_t* pSrc = (uint8_t*) src;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(srcLength == -1){
       srcLength = (int32_t)uprv_strlen((char*)pSrc);
    }

    while((index < srcLength)&&(pDest<pDestLimit)){
        ch = pSrc[index++];
        if(ch <=0x7f){
            *pDest++=(UChar)ch;
        }else{
            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
            if(ch<0){
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }else if(ch<=0xFFFF){
                *(pDest++)=(UChar)ch;
            }else{
                *(pDest++)=UTF16_LEAD(ch);
                if(pDest<pDestLimit){
                    *(pDest++)=UTF16_TRAIL(ch);
                }else{
                    reqLength++;
                    break;
                }
            }
        }
    }
    /* donot fill the dest buffer just count the UChars needed */
    while(index < srcLength){
        ch = pSrc[index++];
        if(ch <= 0x7f){
            reqLength++;
        }else{
            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
            if(ch<0){
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
            reqLength+=UTF_CHAR_LENGTH(ch);
        }
    }

    reqLength+=(int32_t)(pDest - dest);

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}
コード例 #26
0
static inline int32_t posBefore(const Replaceable& str, int32_t pos) {
    return (pos > 0) ?
        pos - UTF_CHAR_LENGTH(str.char32At(pos-1)) :
        pos - 1;
}
コード例 #27
0
void Transliterator::_transliterate(Replaceable& text,
                                    UTransPosition& index,
                                    const UnicodeString* insertion,
                                    UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return;
    }

    if (!positionIsValid(index, text.length())) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

//    int32_t originalStart = index.contextStart;
    if (insertion != 0) {
        text.handleReplaceBetween(index.limit, index.limit, *insertion);
        index.limit += insertion->length();
        index.contextLimit += insertion->length();
    }

    if (index.limit > 0 &&
        UTF_IS_LEAD(text.charAt(index.limit - 1))) {
        // Oops, there is a dangling lead surrogate in the buffer.
        // This will break most transliterators, since they will
        // assume it is part of a pair.  Don't transliterate until
        // more text comes in.
        return;
    }

    filteredTransliterate(text, index, TRUE, TRUE);

#if 0
    // TODO
    // I CAN'T DO what I'm attempting below now that the Kleene star
    // operator is supported.  For example, in the rule

    //   ([:Lu:]+) { x } > $1;

    // what is the maximum context length?  getMaximumContextLength()
    // will return 1, but this is just the length of the ante context
    // part of the pattern string -- 1 character, which is a standin
    // for a Quantifier, which contains a StringMatcher, which
    // contains a UnicodeSet.

    // There is a complicated way to make this work again, and that's
    // to add a "maximum left context" protocol into the
    // UnicodeMatcher hierarchy.  At present I'm not convinced this is
    // worth it.

    // ---

    // The purpose of the code below is to keep the context small
    // while doing incremental transliteration.  When part of the left
    // context (between contextStart and start) is no longer needed,
    // we try to advance contextStart past that portion.  We use the
    // maximum context length to do so.
    int32_t newCS = index.start;
    int32_t n = getMaximumContextLength();
    while (newCS > originalStart && n-- > 0) {
        --newCS;
        newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
    }
    index.contextStart = uprv_max(newCS, originalStart);
#endif
}