示例#1
0
文件: UTF8.cpp 项目: 1833183060/wke
// Converts the UTF-8 bytes in [*sourceStart, sourceEnd) into UTF-16 code units
// stored in [*targetStart, targetEnd).
//
// On return, *sourceStart and *targetStart are set to the positions where the
// conversion stopped. The result is:
//   conversionOK     - all input was consumed;
//   sourceExhausted  - the input ends in the middle of a sequence;
//   sourceIllegal    - isLegalUTF8() rejected a sequence, or (strict mode only)
//                      a sequence decoded to a surrogate or to a value that is
//                      neither BMP nor supplementary;
//   targetExhausted  - there was no room left for the next code unit(s).
// In lenient mode (strict == false) surrogates and out-of-range values are
// written as replacementCharacter instead of stopping the conversion.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd,
    UChar** targetStart, UChar* targetEnd, bool strict)
{
    const char* in = *sourceStart;
    UChar* out = *targetStart;
    ConversionResult status = conversionOK;

    while (in < sourceEnd) {
        int sequenceLength = inlineUTF8SequenceLength(*in);

        if (sourceEnd - in < sequenceLength) {
            status = sourceExhausted;
            break;
        }

        // Legality is checked in both lenient and strict mode.
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(in), sequenceLength)) {
            status = sourceIllegal;
            break;
        }

        UChar32 codePoint = readUTF8Sequence(in, sequenceLength);

        if (out >= targetEnd) {
            in -= sequenceLength; // Back up so the unread sequence is reported to the caller.
            status = targetExhausted;
            break;
        }

        // Supplementary code points (above 0xFFFF, up to 0x10FFFF) need a surrogate pair.
        if (U_IS_SUPPLEMENTARY(codePoint)) {
            if (out + 1 >= targetEnd) {
                in -= sequenceLength; // Back up: the pair does not fit.
                status = targetExhausted;
                break;
            }
            *out++ = U16_LEAD(codePoint);
            *out++ = U16_TRAIL(codePoint);
            continue;
        }

        // Normal case: a BMP value that is not a surrogate.
        if (U_IS_BMP(codePoint) && !U_IS_SURROGATE(codePoint)) {
            *out++ = codePoint;
            continue;
        }

        // What is left is either a surrogate value or a value that is neither
        // BMP nor supplementary; both are handled the same way.
        if (strict) {
            in -= sequenceLength; // Return to the start of the offending sequence.
            status = sourceIllegal;
            break;
        }
        *out++ = replacementCharacter;
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
示例#2
0
/**
 * Pushes the code point ch back onto the read buffer of f.
 *
 * Nothing is written to the buffer: the read position is only moved
 * backwards, and the call succeeds only when the code unit(s) just before
 * the position match ch. A supplementary code point (above U+FFFF) occupies
 * two code units, so both its trail and lead surrogate must match.
 *
 * The previous implementation selected the two-unit path with U_IS_LEAD(ch),
 * which tests whether ch is itself a lead surrogate rather than whether it
 * needs a surrogate pair; supplementary code points therefore never matched.
 *
 * @param ch the code point to push back
 * @param f  the file whose buffer position is moved
 * @return ch on success, U_EOF otherwise. On a mismatch the position has
 *         already been moved back over the code unit(s) that were examined.
 */
U_CAPI UChar32 U_EXPORT2 /* U_CAPI ... U_EXPORT2 added by Peter Kirk 17 Nov 2001 */
u_fungetc(UChar32        ch,
          UFILE        *f)
{
    u_localized_string *str;

    str = &f->str;

    /* if we're at the beginning of the buffer, sorry! */
    /* (a supplementary code point needs two code units before the position) */
    if (str->fPos == str->fBuffer
            || (U_IS_SUPPLEMENTARY(ch) && (str->fPos - 1) == str->fBuffer))
    {
        ch = U_EOF;
    }
    else {
        /* otherwise, put the character back */
        /* Remember, read them back on in the reverse order. */
        if (U_IS_SUPPLEMENTARY(ch)) {
            if (*--(str->fPos) != U16_TRAIL(ch)
                    || *--(str->fPos) != U16_LEAD(ch))
            {
                ch = U_EOF;
            }
        }
        else if (*--(str->fPos) != ch) {
            ch = U_EOF;
        }
    }
    return ch;
}
示例#3
0
void XMLTreeBuilder::processHTMLEntity(const AtomicXMLToken& token)
{
    HTMLEntitySearch search;
    const AtomicString& name = token.name();
    for (size_t i = 0; i < name.length(); ++i) {
        search.advance(name[i]);
        if (!search.isEntityPrefix()) {
            m_parser->stopParsing();
            return;
        }
    }
    search.advance(';');
    if (!search.isEntityPrefix()) {
        m_parser->stopParsing();
        return;
    }
    UChar32 entityValue = search.mostRecentMatch()->firstValue;
    // FIXME: We need to account for secondValue if any XML entities are longer
    // than one unicode character.
    ASSERT_NOT_REACHED();
    // Darin Adler writes:
    //   You can see given the code above that this else is dead code. This code is in a strange state.
    //   And the reinterpret_cast to UChar* makes the code little-endian-specific. That is not good!
    if (entityValue <= 0xFFFF)
        appendToText(reinterpret_cast<UChar*>(&entityValue), 1);
    else {
        UChar utf16Pair[2] = { U16_LEAD(entityValue), U16_TRAIL(entityValue) };
        appendToText(utf16Pair, 2);
    }
}
示例#4
0
/*
 * "previous" callback of the lenient UTF-8 UCharIterator: steps one UTF-16
 * code unit backwards and returns it.
 *
 * A supplementary code point is delivered as two code units over two calls:
 * the first call returns the trail surrogate and remembers the code point in
 * iter->reservedField, the second call returns the lead surrogate.
 *
 * Returns U_SENTINEL when iter->start is 0 and no code point is pending,
 * and 0xfffd when L8_PREV yields a negative value.
 */
static UChar32 U_CALLCONV
lenient8IteratorPrevious(UCharIterator *iter) {
    int32_t index;

    if(iter->reservedField!=0) {
        /* second half of a supplementary code point: deliver the lead surrogate */
        UChar lead=U16_LEAD(iter->reservedField);
        iter->reservedField=0;
        iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
        /* index is only decremented while it is positive */
        if((index=iter->index)>0) {
            iter->index=index-1;
        }
        return lead;
    } else if(iter->start>0) {
        const uint8_t *s=(const uint8_t *)iter->context;
        UChar32 c;

        /* read the code point that ends at iter->start; L8_PREV updates iter->start */
        L8_PREV(s, 0, iter->start, c);
        if((index=iter->index)>0) {
            iter->index=index-1;
        } else if(iter->start<=1) {
            /* index was not positive and we are at (or next to) the start: recompute it from iter->start */
            /* NOTE(review): presumably a non-positive index means "not known" here - confirm against the setter */
            iter->index= c<=0xffff ? iter->start : iter->start+1;
        }
        if(c<0) {
            /* negative result from L8_PREV is reported as U+FFFD */
            return 0xfffd;
        } else if(c<=0xffff) {
            return c;
        } else {
            iter->start+=4; /* back to behind this supplementary code point for consistent state */
            iter->reservedField=c;
            return U16_TRAIL(c);
        }
    } else {
        return U_SENTINEL;
    }
}
示例#5
0
// Returns the system fallback font for |character|, consulting and filling the
// per-font cache obtained from systemFallbackCache().
//
// The null character is never cached: it is looked up directly as a single
// zero code unit. For every other character the result of
// FontCache::systemFallbackForCharacters() is stored in the cache entry keyed
// by (locale, character, isForPlatformFont); a font placed in the cache is
// flagged with m_isUsedInSystemFallbackCache.
RefPtr<Font> Font::systemFallbackFontForCharacter(UChar32 character, const FontDescription& description, bool isForPlatformFont) const
{
    auto perFontEntry = systemFallbackCache().add(this, CharacterFallbackMap());

    if (!character) {
        UChar nullCodeUnit = 0;
        return FontCache::singleton().systemFallbackForCharacters(description, this, isForPlatformFont, &nullCodeUnit, 1);
    }

    auto cacheKey = CharacterFallbackMapKey(description.locale(), character, isForPlatformFont);
    auto perCharacterEntry = perFontEntry.iterator->value.add(WTF::move(cacheKey), nullptr);

    // Reference into the map so that assigning below fills the cache slot.
    Font*& cachedFont = perCharacterEntry.iterator->value;
    if (cachedFont)
        return cachedFont;

    // Encode the character as UTF-16; BMP characters go through normalizeSpaces().
    UChar utf16[2];
    unsigned utf16Length;
    if (!U_IS_BMP(character)) {
        utf16[0] = U16_LEAD(character);
        utf16[1] = U16_TRAIL(character);
        utf16Length = 2;
    } else {
        utf16[0] = FontCascade::normalizeSpaces(character);
        utf16Length = 1;
    }

    cachedFont = FontCache::singleton().systemFallbackForCharacters(description, this, isForPlatformFont, utf16, utf16Length).get();
    if (cachedFont)
        cachedFont->m_isUsedInSystemFallbackCache = true;

    return cachedFont;
}
示例#6
0
/* Explain <xxxxx> tag to a native value
 *
 * Since <xxxxx> is always larger than the native value,
 * the operation will replace the tag directly in the buffer,
 * and, of course, will shift tail elements.
 */
/**
 * Replaces every "<hex digits>" tag in buf with the code point it names,
 * rewriting the string in place (BMP values become one code unit,
 * larger values a surrogate pair). Characters inside a tag that are not hex
 * digits are ignored.
 *
 * A tag that is not closed by '>' before the end of the string is decoded
 * from the digits that are present; the scan stops at the terminator instead
 * of reading past the end of the buffer.
 */
void IdnaConfTest::ExplainCodePointTag(UnicodeString& buf){
    buf.append((UChar)0);    // add a terminal NULL
    UChar* bufBase = buf.getBuffer(buf.length());
    UChar* p = bufBase;
    while (*p != 0){
        if ( *p != 0x3C){    // <
            *bufBase++ = *p++;
        } else {
            p++;    // skip <
            UChar32 cp = 0;
            // Also stop at the terminating NUL so that an unterminated tag
            // cannot make the scan run beyond the buffer.
            for ( ;*p != 0x3E && *p != 0; p++){   // >
                if (0x30 <= *p && *p <= 0x39){        // 0-9
                    cp = (cp * 16) + (*p - 0x30);
                } else if (0x61 <= *p && *p <= 0x66){ // a-f
                    cp = (cp * 16) + (*p - 0x61) + 10;
                } else if (0x41 <= *p && *p <= 0x46) {// A-F
                    cp = (cp * 16) + (*p - 0x41) + 10;
                }
                // no else. hope everything is good.
            }
            if (*p == 0x3E){
                p++;    // skip >
            }
            if (U_IS_BMP(cp)){
                *bufBase++ = cp;
            } else {
                *bufBase++ = U16_LEAD(cp);
                *bufBase++ = U16_TRAIL(cp);
            }
        }
    }
    *bufBase = 0;  // close our buffer
    buf.releaseBuffer();
}
示例#7
0
// Advances the trie by one code point. A value up to 0xffff is passed to
// next() as a single unit; anything larger is passed as lead surrogate
// followed by trail surrogate, and the trail is only attempted when the result
// for the lead satisfies USTRINGTRIE_HAS_NEXT (otherwise USTRINGTRIE_NO_MATCH).
UStringTrieResult
UCharsTrie::nextForCodePoint(UChar32 cp) {
    if(cp<=0xffff) {
        return next(cp);
    }
    if(!USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp)))) {
        return USTRINGTRIE_NO_MATCH;
    }
    return next(U16_TRAIL(cp));
}
示例#8
0
// Appends the code point c as UTF-16: one code unit when c is at most 0xffff,
// otherwise lead then trail surrogate. The trail is not appended when
// appending the lead fails. Returns the result of the appendCodeUnit() call(s).
UBool
Appendable::appendCodePoint(UChar32 c) {
    if(c>0xffff) {
        return appendCodeUnit(U16_LEAD(c)) && appendCodeUnit(U16_TRAIL(c));
    }
    return appendCodeUnit((UChar)c);
}
示例#9
0
/*
 * Leniently converts the NUL-terminated UTF-8 string src into UTF-16 in dest.
 *
 * length is the capacity of dest in wchar_t units, including the terminating
 * NUL that is always written. Conversion stops when src ends or when dest is
 * full. A multi-byte sequence that is cut short by the end of src, or a
 * four-byte sequence whose surrogate pair no longer fits, is written as
 * U+FFFD and ends the conversion. Trail bytes are not validated.
 */
void git__utf8_to_16(wchar_t *dest, size_t length, const char *src)
{
	const uint8_t *in = (uint8_t*) src;
	wchar_t *out = dest;
	uint32_t cp;

	assert(dest && src && length);

	/* keep one slot for the terminating NUL */
	length--;

	while(*in && length > 0) {
		cp = *in++;
		length--;

		if(cp < 0xc0) {
			/*
			 * ASCII, or a trail byte in lead position; the latter is copied
			 * like a single-byte sequence so that character boundaries are
			 * found again quickly after an illegal sequence.
			 */
			*out++ = (wchar_t)cp;
			continue;
		}

		if(cp < 0xe0) { /* two bytes: U+0080..U+07FF */
			if (in[0]) {
				/* 0x3080 = (0xc0 << 6) + 0x80 */
				*out++ = (wchar_t)((cp << 6) + in[0] - 0x3080);
				in += 1;
				continue;
			}
		} else if(cp < 0xf0) { /* three bytes: U+0800..U+FFFF */
			if (in[0] && in[1]) {
				/* (cp & 0xf) is not needed: the upper bits are cut off by the cast to a 16-bit unit */
				/* 0x2080 = (0x80 << 6) + 0x80 */
				cp = (cp << 12) + (in[0] << 6);
				*out++ = (wchar_t)(cp + in[1] - 0x2080);
				in += 2;
				continue;
			}
		} else { /* four bytes (f0..f4): U+10000..U+10FFFF */
			if (length >= 1 && in[0] && in[1] && in[2]) {
				/* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
				cp = (cp << 18) + (in[0] << 12);
				cp += in[1] << 6;
				cp += in[2] - 0x3c82080;
				in += 3;
				*out++ = U16_LEAD(cp);
				*out++ = U16_TRAIL(cp);
				length--; /* the pair uses a second output unit */
				continue;
			}
		}

		/* truncated character at the end */
		*out++ = 0xfffd;
		break;
	}

	*out = 0x0;
}
示例#10
0
// Reports whether the code point ending just before pos passes
// CollationFCD::hasTccc(). Only valid in state CHECK_BWD with pos != 0.
// pos itself is not changed.
UBool
FCDUTF8CollationIterator::previousHasTccc() const {
    U_ASSERT(state == CHECK_BWD && pos != 0);
    // A byte below 0x80 is answered immediately without decoding.
    if(u8[pos - 1] < 0x80) { return FALSE; }
    UChar32 cp;
    int32_t scanPos = pos;
    U8_PREV_OR_FFFD(u8, 0, scanPos, cp);
    // Supplementary code points are looked up through their lead surrogate.
    return CollationFCD::hasTccc(cp > 0xffff ? U16_LEAD(cp) : cp);
}
示例#11
0
// Reports whether the code point starting at pos passes
// CollationFCD::hasLccc(). Only valid in state CHECK_FWD with pos != length.
// pos itself is not changed.
UBool
FCDUTF8CollationIterator::nextHasLccc() const {
    U_ASSERT(state == CHECK_FWD && pos != length);
    // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
    // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
    UChar32 leadByte = u8[pos];
    if(leadByte < 0xcc) { return FALSE; }
    if(0xe4 <= leadByte && leadByte <= 0xed && leadByte != 0xea) { return FALSE; }
    UChar32 cp;
    int32_t scanPos = pos;
    U8_NEXT_OR_FFFD(u8, scanPos, length, cp);
    // Supplementary code points are looked up through their lead surrogate.
    return CollationFCD::hasLccc(cp > 0xffff ? U16_LEAD(cp) : cp);
}
示例#12
0
// Writes |character| as UTF-16 at |destination| (one code unit for BMP values,
// a surrogate pair otherwise) and returns the position just past what was
// written. |character| must be neither nonCharacter nor a surrogate.
static inline UChar* appendCharacter(UChar* destination, int character)
{
    ASSERT(character != nonCharacter);
    ASSERT(!U_IS_SURROGATE(character));
    if (!U_IS_BMP(character)) {
        destination[0] = U16_LEAD(character);
        destination[1] = U16_TRAIL(character);
        return destination + 2;
    }
    destination[0] = character;
    return destination + 1;
}
示例#13
0
// Stores |value| as UTF-16 at the start of |result| and returns the number of
// code units written: 1 for a BMP value, 2 (lead and trail surrogate) otherwise.
static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
{
    if (!U_IS_BMP(value)) {
        result[0] = U16_LEAD(value);
        result[1] = U16_TRAIL(value);
        return 2;
    }

    UChar codeUnit = static_cast<UChar>(value);
    ASSERT(codeUnit == value);
    result[0] = codeUnit;
    return 1;
}
示例#14
0
文件: ustr.c 项目: cyrusimap/icu4c
/*
 * Appends the code point c to dst via ustr_ucat(): one code unit when c is at
 * most 0xFFFF, otherwise lead then trail surrogate.
 * A value above 0x10FFFF sets *status to U_ILLEGAL_CHAR_FOUND and appends nothing.
 */
U_CFUNC void 
ustr_u32cat(struct UString *dst, UChar32 c, UErrorCode *status){
    if(c > 0x10FFFF){
        *status = U_ILLEGAL_CHAR_FOUND;
        return;
    }
    if(c <= 0xFFFF){
        ustr_ucat(dst, (UChar) c, status);
        return;
    }
    ustr_ucat(dst, U16_LEAD(c), status);
    ustr_ucat(dst, U16_TRAIL(c), status);
}
示例#15
0
// Builds a String holding only the code point |c|: a default-constructed
// String for 0, one code unit for values up to 0xffff, and a surrogate pair
// for anything larger.
static inline String singleCharacterString(UChar32 c) 
{
    if (!c)
        return String();
    if (c <= 0xffff) {
        UChar codeUnit = (UChar)c;
        return String(&codeUnit, 1);
    }
    UChar surrogatePair[2] = { U16_LEAD(c), U16_TRAIL(c) };
    return String(surrogatePair, 2);
}
示例#16
0
unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
{
    if (!data)
        return 0;

    StringHasher stringHasher;
    dataLength = 0;
    utf16Length = 0;

    while (data < dataEnd || (!dataEnd && *data)) {
        if (isASCII(*data)) {
            stringHasher.addCharacter(*data++);
            dataLength++;
            utf16Length++;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
        dataLength += utf8SequenceLength;

        if (!dataEnd) {
            for (int i = 1; i < utf8SequenceLength; ++i) {
                if (!data[i])
                    return 0;
            }
        } else if (dataEnd - data < utf8SequenceLength) {
            return 0;
        }

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
            return 0;

        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return 0;
            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
            utf16Length++;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));
            utf16Length += 2;
        } else {
            return 0;
        }
    }

    return stringHasher.hashWithTop8BitsMasked();
}
示例#17
0
std::vector<uint16_t> utf8ToUtf16(const std::string& text) {
  std::vector<uint16_t> result;
  int32_t i = 0;
  const int32_t textLength = static_cast<int32_t>(text.size());
  uint32_t c = 0;
  while (i < textLength) {
    U8_NEXT(text.c_str(), i, textLength, c);
    if (U16_LENGTH(c) == 1) {
      result.push_back(c);
    } else {
      result.push_back(U16_LEAD(c));
      result.push_back(U16_TRAIL(c));
    }
  }
  return result;
}
示例#18
0
// Hashes the UTF-16 form of the UTF-8 text in [data, dataEnd) without
// materialising it and stores the number of UTF-16 code units in |utf16Length|.
//
// Returns 0 when |data| is null or when the input is truncated, rejected by
// isLegalUTF8(), or decodes to a surrogate or to a value that is neither BMP
// nor supplementary. In those failure cases |utf16Length| holds the count
// accumulated so far.
unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length)
{
    if (!data)
        return 0;

    WTF::StringHasher stringHasher;
    utf16Length = 0;

    while (data < dataEnd) {
        if (isASCII(*data)) {
            stringHasher.addCharacter(*data++);
            utf16Length++;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);

        // Truncated sequence. This is the same failure value as every other
        // error path (the function returns unsigned, not bool).
        if (dataEnd - data < utf8SequenceLength)
            return 0;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
            return 0;

        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return 0;
            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
            utf16Length++;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
                                       static_cast<UChar>(U16_TRAIL(character)));
            utf16Length += 2;
        } else
            return 0;
    }

    return stringHasher.hash();
}
示例#19
0
/*
 * Finds the first occurrence of the code point c in the NUL-terminated
 * string s.
 *
 * A value up to U_BMP_MAX is delegated to u_strchr(). A value up to
 * UCHAR_MAX_VALUE is searched for as a lead/trail surrogate pair and a pointer
 * to the lead surrogate is returned. Anything else cannot occur in a string,
 * so NULL is returned, as it is when c is not found.
 */
U_CAPI UChar* U_EXPORT2
u_strchr32(const UChar* s, UChar32 c) {
    UChar lead, trail, unit;

    if ((uint32_t) c <= U_BMP_MAX) {
        /* BMP code point: single code unit search */
        return u_strchr(s, (UChar) c);
    }
    if ((uint32_t) c > UCHAR_MAX_VALUE) {
        /* not a Unicode code point, not findable */
        return NULL;
    }

    /* supplementary code point: look for its surrogate pair */
    lead = U16_LEAD(c);
    trail = U16_TRAIL(c);
    for (; (unit = *s) != 0; ++s) {
        if (unit == lead && s[1] == trail) {
            return (UChar*) s;
        }
    }
    return NULL;
}
示例#20
0
void XMLTreeBuilder::processHTMLEntity(const AtomicXMLToken& token)
{
    HTMLEntitySearch search;
    const AtomicString& name = token.name();
    for (size_t i = 0; i < name.length(); ++i) {
        search.advance(name[i]);
        if (!search.isEntityPrefix()) {
            m_parser->stopParsing();
            return;
        }
    }
    search.advance(';');
    UChar32 entityValue = search.currentValue();
    if (entityValue <= 0xFFFF)
       appendToText(reinterpret_cast<UChar*>(&entityValue), 1);
    else {
        UChar utf16Pair[2] = { U16_LEAD(entityValue), U16_TRAIL(entityValue) };
        appendToText(utf16Pair, 2);
    }
}
示例#21
0
/*
 * "current" callback of the lenient UTF-8 UCharIterator: returns the UTF-16
 * code unit at the current position without changing the iterator.
 *
 * When a supplementary code point is pending in iter->reservedField its trail
 * surrogate is returned. Otherwise the code point at iter->start is decoded
 * with L8_NEXT on a local copy of the position: a negative result yields
 * 0xfffd, a supplementary code point yields its lead surrogate.
 * Returns U_SENTINEL when iter->start is not below iter->limit.
 */
static UChar32 U_CALLCONV
lenient8IteratorCurrent(UCharIterator *iter) {
    const uint8_t *bytes;
    UChar32 cp;
    int32_t pos;

    if(iter->reservedField!=0) {
        return U16_TRAIL(iter->reservedField);
    }
    if(iter->start>=iter->limit) {
        return U_SENTINEL;
    }

    bytes=(const uint8_t *)iter->context;
    pos=iter->start;
    L8_NEXT(bytes, pos, iter->limit, cp);

    if(cp<0) {
        return 0xfffd;
    }
    return cp<=0xffff ? cp : U16_LEAD(cp);
}
示例#22
0
/*
 * Checks, for a handful of supplementary code points, that the surrogate
 * macros (UTF_FIRST_SURROGATE / UTF16_LEAD / U16_LEAD and
 * UTF_SECOND_SURROGATE / UTF16_TRAIL / U16_TRAIL) agree with each other and
 * with the surrogate values computed arithmetically below.
 */
static void TestSurrogate(){
    static UChar32 s[] = {0x10000, 0x10ffff, 0x50000, 0x100000, 0x1abcd};
    int i;

    for (i = 0; i < 5; ++i) {
        UChar first  = UTF_FIRST_SURROGATE(s[i]);
        UChar second = UTF_SECOND_SURROGATE(s[i]);
        /* algorithm from the Unicode consortium */
        UChar32 offset = s[i] - 0x10000;
        UChar firstresult  = (UChar)((offset / 0x400) + 0xD800);
        UChar secondresult = (UChar)((offset % 0x400) + 0xDC00);

        if (first != UTF16_LEAD(s[i]) || first != U16_LEAD(s[i]) || first != firstresult) {
            log_err("Failure in first surrogate in 0x%x expected to be 0x%x\n",
                    s[i], firstresult);
        }
        if (second != UTF16_TRAIL(s[i]) || second != U16_TRAIL(s[i]) || second != secondresult) {
            log_err("Failure in second surrogate in 0x%x expected to be 0x%x\n",
                    s[i], secondresult);
        }
    }
}
示例#23
0
// Returns a copy of the UTF-16 text |characters| (|length| code units) in
// which every code point has been passed through mirroredChar().
String SVGFontData::createStringWithMirroredCharacters(const UChar* characters, unsigned length) const
{
    StringBuilder builder;
    builder.reserveCapacity(length);

    for (unsigned offset = 0; offset < length; ) {
        // U16_NEXT decodes one code point and advances |offset| past it.
        UChar32 codePoint;
        U16_NEXT(characters, offset, length, codePoint);
        codePoint = mirroredChar(codePoint);

        if (U16_LENGTH(codePoint) != 1) {
            builder.append(U16_LEAD(codePoint));
            builder.append(U16_TRAIL(codePoint));
            continue;
        }
        builder.append(static_cast<UChar>(codePoint));
    }

    return builder.toString();
}
// Exercises the StringBuilder::append() overloads: String, C string, C string
// with explicit length, single character, LChar buffer with length, and
// UChar32, checking the accumulated content after each step.
TEST(StringBuilderTest, Append)
{
    StringBuilder builder;
    builder.append(String("0123456789"));
    expectBuilderContent("0123456789", builder);
    builder.append("abcd");
    expectBuilderContent("0123456789abcd", builder);
    // Only the first 3 characters of "efgh" are appended.
    builder.append("efgh", 3);
    expectBuilderContent("0123456789abcdefg", builder);
    // Appending an empty string leaves the content unchanged.
    builder.append("");
    expectBuilderContent("0123456789abcdefg", builder);
    builder.append('#');
    expectBuilderContent("0123456789abcdefg#", builder);

    builder.toString(); // Test after reifyString().
    StringBuilder builder1;
    // A zero-length append after toString() must not change the content.
    builder.append("", 0);
    expectBuilderContent("0123456789abcdefg#", builder);
    // Copy builder into builder1 through its 8-bit buffer, extend it, then
    // append the result back onto builder.
    builder1.append(builder.characters8(), builder.length());
    builder1.append("XYZ");
    builder.append(builder1.characters8(), builder1.length());
    expectBuilderContent("0123456789abcdefg#0123456789abcdefg#XYZ", builder);

    // With capacity reserved up front, a further append is expected to leave
    // the characters8() pointer unchanged.
    StringBuilder builder2;
    builder2.reserveCapacity(100);
    builder2.append("xyz");
    const LChar* characters = builder2.characters8();
    builder2.append("0123456789");
    EXPECT_EQ(characters, builder2.characters8());

    // Test appending UChar32 characters to StringBuilder.
    StringBuilder builderForUChar32Append;
    UChar32 frakturAChar = 0x1D504;
    builderForUChar32Append.append(frakturAChar); // The fraktur A is not in the BMP, so it's two UTF-16 code units long.
    EXPECT_EQ(2U, builderForUChar32Append.length());
    builderForUChar32Append.append(static_cast<UChar32>('A'));
    EXPECT_EQ(3U, builderForUChar32Append.length());
    // Expected content: the surrogate pair for the fraktur A followed by 'A'.
    const UChar resultArray[] = { U16_LEAD(frakturAChar), U16_TRAIL(frakturAChar), 'A' };
    expectBuilderContent(String(resultArray, WTF_ARRAY_LENGTH(resultArray)), builderForUChar32Append);
}
示例#25
0
// Compares the code units in [a, aEnd) with the UTF-16 form of the UTF-8 text
// in [b, bEnd) without materialising the conversion.
//
// Returns true only when both ranges encode the same text and are consumed
// completely. Returns false on any mismatch, when the UTF-8 input is
// truncated or rejected by isLegalUTF8(), when it decodes to a surrogate or to
// a value that is neither BMP nor supplementary, and when |a| runs out before
// |b| does. |a| is never read at or beyond |aEnd|.
ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
{
    while (b < bEnd) {
        if (isASCII(*b)) {
            if (a == aEnd || *a++ != *b++)
                return false;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

        if (bEnd - b < utf8SequenceLength)
            return false;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
            return false;

        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return false;
            if (a == aEnd || *a++ != character)
                return false;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // A surrogate pair needs two code units on the |a| side.
            if (aEnd - a < 2)
                return false;
            if (*a++ != U16_LEAD(character))
                return false;
            if (*a++ != U16_TRAIL(character))
                return false;
        } else {
            return false;
        }
    }

    return a == aEnd;
}
示例#26
0
// Returns the next code point and advances pos past it, or U_SENTINEL at the
// end of the text or when nextSegment() fails.
//
// What is read depends on the current state:
//   CHECK_FWD       - reads from u8; if the code point fails the checks below,
//                     pos is rewound and nextSegment() is called before retrying;
//   IN_FCD_SEGMENT  - reads from u8 while pos != limit;
//   IN_NORMALIZED   - reads from the `normalized` string while pos is not at its end;
//   anything else   - switchToForward() is called and the loop retries.
UChar32
FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state == CHECK_FWD) {
            // End of input: pos reached length, or a NUL byte was found while length is negative.
            if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
                return U_SENTINEL;
            }
            // Single bytes below 0x80 are returned directly.
            if(c < 0x80) {
                ++pos;
                return c;
            }
            U8_NEXT_OR_FFFD(u8, pos, length, c);
            // Supplementary code points are looked up via their lead surrogate.
            if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&
                    (CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != length && nextHasLccc()))) {
                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
                // and we can use U8_LENGTH() rather than a previous-position variable.
                pos -= U8_LENGTH(c);
                if(!nextSegment(errorCode)) {
                    return U_SENTINEL;
                }
                // Retry in whatever state nextSegment() left the iterator in.
                continue;
            }
            return c;
        } else if(state == IN_FCD_SEGMENT && pos != limit) {
            U8_NEXT_OR_FFFD(u8, pos, length, c);
            return c;
        } else if(state == IN_NORMALIZED && pos != normalized.length()) {
            // Here pos indexes the UTF-16 `normalized` string, hence U16_LENGTH.
            c = normalized.char32At(pos);
            pos += U16_LENGTH(c);
            return c;
        } else {
            switchToForward();
        }
    }
}
示例#27
0
// Returns the code point before pos and moves pos back over it, or U_SENTINEL
// at the start of the text or when previousSegment() fails.
//
// What is read depends on the current state:
//   CHECK_BWD         - reads from u8; if the code point fails the checks below,
//                       pos is restored and previousSegment() is called before retrying;
//   IN_FCD_SEGMENT    - reads from u8 while pos != start;
//   >= IN_NORMALIZED  - reads from the `normalized` string while pos != 0;
//   anything else     - switchToBackward() is called and the loop retries.
UChar32
FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state == CHECK_BWD) {
            if(pos == 0) {
                return U_SENTINEL;
            }
            // Single bytes below 0x80 are returned directly.
            if((c = u8[pos - 1]) < 0x80) {
                --pos;
                return c;
            }
            U8_PREV_OR_FFFD(u8, 0, pos, c);
            // Supplementary code points are looked up via their lead surrogate.
            if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&
                    (CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != 0 && previousHasTccc()))) {
                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
                // and we can use U8_LENGTH() rather than a previous-position variable.
                pos += U8_LENGTH(c);
                if(!previousSegment(errorCode)) {
                    return U_SENTINEL;
                }
                // Retry in whatever state previousSegment() left the iterator in.
                continue;
            }
            return c;
        } else if(state == IN_FCD_SEGMENT && pos != start) {
            U8_PREV_OR_FFFD(u8, 0, pos, c);
            return c;
        } else if(state >= IN_NORMALIZED && pos != 0) {
            // Here pos indexes the UTF-16 `normalized` string, hence U16_LENGTH.
            c = normalized.char32At(pos - 1);
            pos -= U16_LENGTH(c);
            return c;
        } else {
            switchToBackward();
        }
    }
}
示例#28
0
// Returns |string| with HTML entities replaced by the characters they denote.
//
// Each '&' is handed to PreloadScanner::consumeEntity(). A result above 0xFFFF
// is appended as a surrogate pair; any other non-zero result is appended as a
// single code unit. When nothing usable was decoded the '&' is kept as is.
// If |leaveUndecodableEntitiesUntouched| is set, a result of 0xFFFD also
// counts as not decoded, and the input is rewound to just after the '&' so
// that the entity text is copied through unchanged.
String XSSAuditor::decodeHTMLEntities(const String& string, bool leaveUndecodableEntitiesUntouched)
{
    SegmentedString input(string);
    SegmentedString rewindPoint;
    Vector<UChar> decoded;

    while (!input.isEmpty()) {
        UChar current = *input;
        input.advance();

        if (current != '&') {
            decoded.append(current);
            continue;
        }

        // Remember where the entity body starts in case it has to be replayed.
        if (leaveUndecodableEntitiesUntouched)
            rewindPoint = input;

        // notEnoughCharacters is deliberately ignored: this loop copies the
        // remaining characters into |decoded| anyway.
        bool notEnoughCharacters = false;
        unsigned entity = PreloadScanner::consumeEntity(input, notEnoughCharacters);

        if (entity > 0xFFFF) {
            decoded.append(U16_LEAD(entity));
            decoded.append(U16_TRAIL(entity));
            continue;
        }

        bool wasDecoded = entity && (!leaveUndecodableEntitiesUntouched || entity != 0xFFFD);
        if (wasDecoded) {
            decoded.append(entity);
            continue;
        }

        decoded.append('&');
        if (leaveUndecodableEntitiesUntouched)
            input = rewindPoint;
    }

    return String::adopt(decoded);
}
示例#29
0
/*
 * Finds the last occurrence of the code point c in the first count code
 * units of s.
 *
 * A value up to U_BMP_MAX is delegated to u_memrchr(). A value up to
 * UCHAR_MAX_VALUE is searched for as a lead/trail surrogate pair, scanning
 * backwards, and a pointer to the lead surrogate is returned. NULL is returned
 * when c is not found, when fewer than two code units are available for a
 * pair, or when c is above UCHAR_MAX_VALUE.
 */
U_CAPI UChar* U_EXPORT2
u_memrchr32(const UChar* s, UChar32 c, int32_t count) {
    const UChar* p;
    UChar lead, trail;

    if ((uint32_t) c <= U_BMP_MAX) {
        /* BMP code point: single code unit search */
        return u_memrchr(s, (UChar) c, count);
    }
    if (count < 2 || (uint32_t) c > UCHAR_MAX_VALUE) {
        /* too short for a surrogate pair, or not a Unicode code point */
        return NULL;
    }

    /* supplementary code point: p walks over candidate trail positions */
    lead = U16_LEAD(c);
    trail = U16_TRAIL(c);
    for (p = s + count - 1; p != s; --p) {
        if (p[0] == trail && p[-1] == lead) {
            return (UChar*) (p - 1);
        }
    }
    return NULL;
}
示例#30
0
/*
 * "next" callback of the lenient UTF-8 UCharIterator: returns the UTF-16 code
 * unit at the current position and steps past it.
 *
 * A supplementary code point is delivered as two code units over two calls:
 * the first call returns the lead surrogate and remembers the code point in
 * iter->reservedField, the second call returns the trail surrogate.
 *
 * Returns U_SENTINEL when iter->start is not below iter->limit and no code
 * point is pending, and 0xfffd when L8_NEXT yields a negative value.
 */
static UChar32 U_CALLCONV
lenient8IteratorNext(UCharIterator *iter) {
    int32_t index;

    if(iter->reservedField!=0) {
        /* second half of a supplementary code point: deliver the trail surrogate */
        UChar trail=U16_TRAIL(iter->reservedField);
        iter->reservedField=0;
        /* index is only incremented while it is non-negative */
        if((index=iter->index)>=0) {
            iter->index=index+1;
        }
        return trail;
    } else if(iter->start<iter->limit) {
        const uint8_t *s=(const uint8_t *)iter->context;
        UChar32 c;

        /* decode the code point at iter->start; L8_NEXT updates iter->start */
        L8_NEXT(s, iter->start, iter->limit, c);
        if((index=iter->index)>=0) {
            iter->index=++index;
            /* reached the limit with a negative length: derive the length from the index */
            if(iter->length<0 && iter->start==iter->limit) {
                iter->length= c<=0xffff ? index : index+1;
            }
        } else if(iter->start==iter->limit && iter->length>=0) {
            /* index was negative but the length is available: derive the index from it at the limit */
            /* NOTE(review): presumably negative index/length mean "not known" here - confirm against the setter */
            iter->index= c<=0xffff ? iter->length : iter->length-1;
        }
        if(c<0) {
            /* negative result from L8_NEXT is reported as U+FFFD */
            return 0xfffd;
        } else if(c<=0xffff) {
            return c;
        } else {
            /* remember the code point so the following call can return its trail surrogate */
            iter->reservedField=c;
            return U16_LEAD(c);
        }
    } else {
        return U_SENTINEL;
    }
}