Beispiel #1
0
/* Expand every <xxxxx> tag in buf into the code point it denotes.
 *
 * A tag is '<', a run of hexadecimal digits, then '>'. Characters inside
 * a tag that are not hexadecimal digits are silently ignored.
 *
 * Since a <xxxxx> tag is always at least as long as the UTF-16 encoding
 * of the value it denotes, the expansion is done in place: the write
 * position never overtakes the read position, and the tail elements are
 * shifted down as tags shrink.
 *
 * A tag that is missing its closing '>' is treated as ending at the end
 * of the string instead of scanning past the terminator.
 * The parsed value is not range-checked against U+10FFFF.
 *
 * @param buf  string to rewrite in place.
 */
void IdnaConfTest::ExplainCodePointTag(UnicodeString& buf){
    buf.append((UChar)0);    // add a terminal NUL so the scan can stop on it
    UChar* bufBase = buf.getBuffer(buf.length());
    if (!bufBase){
        // No writable buffer was handed out, so there is nothing to rewrite
        // and nothing to release; just drop the terminator added above.
        buf.truncate(buf.length() - 1);
        return;
    }
    UChar* p = bufBase;      // read position; bufBase doubles as write position
    while (*p != 0){
        if ( *p != 0x3C){    // <
            *bufBase++ = *p++;
            continue;
        }
        p++;    // skip <
        UChar32 cp = 0;
        // Stop on '>' or on the terminator, whichever comes first, so an
        // unterminated tag cannot walk off the end of the buffer.
        for ( ; *p != 0x3E && *p != 0; p++){          // >
            if (0x30 <= *p && *p <= 0x39){        // 0-9
                cp = (cp * 16) + (*p - 0x30);
            } else if (0x61 <= *p && *p <= 0x66){ // a-f
                cp = (cp * 16) + (*p - 0x61) + 10;
            } else if (0x41 <= *p && *p <= 0x46) {// A-F
                cp = (cp * 16) + (*p - 0x41) + 10;
            }
            // anything else inside the tag is skipped
        }
        if (*p == 0x3E){
            p++;    // skip >
        }
        if (U_IS_BMP(cp)){
            *bufBase++ = cp;
        } else {
            // Outside the BMP: store as a surrogate pair.
            *bufBase++ = U16_LEAD(cp);
            *bufBase++ = U16_TRAIL(cp);
        }
    }
    *bufBase = 0;  // close our buffer
    buf.releaseBuffer();   // new length is taken from the NUL written above
}
Beispiel #2
0
U_NAMESPACE_BEGIN

/**
 * Character-fetch callback: returns the code unit at `offset` in the text
 * held by the URegexUTextUnescapeCharContext passed through `ct`.
 *
 * The context remembers the last offset served so that re-reading the same
 * offset, or reading the one right after it, avoids repositioning the text.
 *
 * @param offset  index of the character wanted.
 * @param ct      opaque pointer to a URegexUTextUnescapeCharContext.
 * @return the character when it is in the BMP, otherwise 0.
 */
U_CFUNC UChar U_CALLCONV
uregex_utext_unescape_charAt(int32_t offset, void *ct) {
    struct URegexUTextUnescapeCharContext *state =
        static_cast<struct URegexUTextUnescapeCharContext *>(ct);
    UChar32 fetched;

    if (offset == state->lastOffset) {
        // Same character as last time: step back to read it, then step
        // forward again so the text ends up where it started.
        fetched = UTEXT_PREVIOUS32(state->text);
        UTEXT_NEXT32(state->text);
    } else if (offset == state->lastOffset + 1) {
        // Sequential access: just read the next character.
        fetched = UTEXT_NEXT32(state->text);
        state->lastOffset++;
    } else {
        // Random access: move by the distance from the last position.
        utext_moveIndex32(state->text, offset - state->lastOffset - 1);
        fetched = UTEXT_NEXT32(state->text);
        state->lastOffset = offset;
    }

    // !!!: Doesn't handle characters outside BMP
    return U_IS_BMP(fetched) ? (UChar)fetched : (UChar)0;
}
Beispiel #3
0
// Looks up the system fallback font for `character`, using a per-font cache
// keyed by locale, character and the isForPlatformFont flag.
//
// A zero character bypasses the per-character cache and queries the font
// cache directly with a single zero code unit. Otherwise the cached entry
// is returned if present; on a miss the font cache is queried with the
// character's UTF-16 encoding and a non-null result is stored and flagged
// as being used by the system fallback cache.
RefPtr<Font> Font::systemFallbackFontForCharacter(UChar32 character, const FontDescription& description, bool isForPlatformFont) const
{
    auto perFontEntry = systemFallbackCache().add(this, CharacterFallbackMap());

    if (!character) {
        UChar nullCodeUnit = 0;
        return FontCache::singleton().systemFallbackForCharacters(description, this, isForPlatformFont, &nullCodeUnit, 1);
    }

    auto& fallbacksForThisFont = perFontEntry.iterator->value;
    auto perCharacterEntry = fallbacksForThisFont.add(CharacterFallbackMapKey(description.locale(), character, isForPlatformFont), nullptr);

    // Reference into the map slot, so assigning below updates the cache.
    Font*& cachedFont = perCharacterEntry.iterator->value;
    if (cachedFont)
        return cachedFont;

    UChar utf16[2];
    unsigned utf16Length;
    if (!U_IS_BMP(character)) {
        utf16[0] = U16_LEAD(character);
        utf16[1] = U16_TRAIL(character);
        utf16Length = 2;
    } else {
        utf16[0] = FontCascade::normalizeSpaces(character);
        utf16Length = 1;
    }

    cachedFont = FontCache::singleton().systemFallbackForCharacters(description, this, isForPlatformFont, utf16, utf16Length).get();
    if (cachedFont)
        cachedFont->m_isUsedInSystemFallbackCache = true;

    return cachedFont;
}
Beispiel #4
0
// Converts UTF-8 in [*sourceStart, sourceEnd) to UTF-16 in
// [*targetStart, targetEnd).
//
// On return *sourceStart and *targetStart point just past the last input
// consumed and the last output written. When conversion stops early after
// a sequence has already been read, the source pointer is backed up to the
// start of that sequence.
//
// Results:
//   conversionOK     - all input was converted.
//   sourceExhausted  - the input ends in the middle of a sequence.
//   sourceIllegal    - a malformed sequence was found, or (strict only) a
//                      sequence decoded to a surrogate or an out-of-range value.
//   targetExhausted  - the output buffer is full.
//
// With strict == false, surrogates and out-of-range values are replaced by
// replacementCharacter instead of stopping the conversion.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd, 
    UChar** targetStart, UChar* targetEnd, bool strict)
{
    const char* in = *sourceStart;
    UChar* out = *targetStart;
    ConversionResult status = conversionOK;

    while (in < sourceEnd) {
        const int sequenceLength = inlineUTF8SequenceLength(*in);
        if (sourceEnd - in < sequenceLength)  {
            status = sourceExhausted;
            break;
        }
        // Legality is checked in both lenient and strict mode.
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(in), sequenceLength)) {
            status = sourceIllegal;
            break;
        }

        const UChar32 codePoint = readUTF8Sequence(in, sequenceLength);

        // Every failure past this point rewinds the input to the start of
        // the sequence just read, so the failure paths share one exit below.
        bool rejected = false;
        if (out >= targetEnd) {
            status = targetExhausted;
            rejected = true;
        } else if (U_IS_SUPPLEMENTARY(codePoint)) {
            // Needs a surrogate pair, i.e. two free output slots.
            if (out + 1 >= targetEnd) {
                status = targetExhausted;
                rejected = true;
            } else {
                *out++ = U16_LEAD(codePoint);
                *out++ = U16_TRAIL(codePoint);
            }
        } else if (U_IS_BMP(codePoint) && !U_IS_SURROGATE(codePoint)) {
            *out++ = codePoint; // normal case
        } else if (strict) {
            // Surrogate value or value outside the Unicode range.
            status = sourceIllegal;
            rejected = true;
        } else {
            *out++ = replacementCharacter;
        }

        if (rejected) {
            in -= sequenceLength; // Back up source pointer!
            break;
        }
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
 // Maps a numeric character reference value to the code point to emit.
 // Zero, values above 0x10FFFF and surrogate values (0xD800-0xDFFF) become
 // U+FFFD; BMP values go through adjustEntity(); everything else is
 // returned unchanged.
 inline static UChar32 legalEntityFor(UChar32 value)
 {
     // FIXME: A number of specific entity values generate parse errors.
     const bool isZero = value == 0;
     const bool isAboveUnicodeRange = value > 0x10FFFF;
     const bool isSurrogateValue = 0xD800 <= value && value <= 0xDFFF;
     if (isZero || isAboveUnicodeRange || isSurrogateValue)
         return 0xFFFD;

     if (!U_IS_BMP(value))
         return value;
     return adjustEntity(value);
 }
Beispiel #6
0
// Case-converts a null-terminated string in place, one code point at a
// time, by applying caseFunction to each code point.
//
// Known limitation: a case mapping can lengthen the string, and there is
// no room to grow here. If a BMP code point maps to a supplementary one
// and writing it would overwrite text that has not been read yet, the
// conversion stops and the string is terminated at that point.
static void
doCaseConvert(
            XMLCh*          convertString,
            FunctionType    caseFunction)
{
    const unsigned int  length =
            XMLString::stringLen(convertString);

    size_t  in = 0;     // index of the next code unit to read
    size_t  out = 0;    // index of the next code unit to write

    while(in < length)
    {
        // Decode the next code point; this advances `in`.
        UChar32     source;
        U16_NEXT_UNSAFE(convertString, in, source);

        const UChar32   mapped = caseFunction(source);

        // One unit was read but two would be written, with no slack
        // between the read and write positions.
        const bool  wouldClobberUnreadText =
            U_IS_BMP(source) && !U_IS_BMP(mapped) && in - out == 1;

        if (wouldClobberUnreadText)
        {
            break;
        }

        U16_APPEND_UNSAFE(convertString, out, mapped);
    }

    convertString[out] = 0;
}
Beispiel #7
0
// Writes `character` at `destination` as UTF-16 (one code unit for BMP
// values, a lead/trail surrogate pair otherwise) and returns the position
// just past what was written. The caller must provide enough room.
static inline UChar* appendCharacter(UChar* destination, int character)
{
    ASSERT(character != nonCharacter);
    ASSERT(!U_IS_SURROGATE(character));

    if (!U_IS_BMP(character)) {
        destination[0] = U16_LEAD(character);
        destination[1] = U16_TRAIL(character);
        return destination + 2;
    }

    destination[0] = character;
    return destination + 1;
}
Beispiel #8
0
// Stores `value` into `result` as UTF-16 and returns the number of code
// units written: 1 for a BMP value, 2 (lead + trail surrogate) otherwise.
// `result` must have room for two code units.
static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
{
    if (!U_IS_BMP(value)) {
        result[0] = U16_LEAD(value);
        result[1] = U16_TRAIL(value);
        return 2;
    }

    const UChar codeUnit = static_cast<UChar>(value);
    ASSERT(codeUnit == value);
    result[0] = codeUnit;
    return 1;
}
Beispiel #9
0
unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
{
    if (!data)
        return 0;

    StringHasher stringHasher;
    dataLength = 0;
    utf16Length = 0;

    while (data < dataEnd || (!dataEnd && *data)) {
        if (isASCII(*data)) {
            stringHasher.addCharacter(*data++);
            dataLength++;
            utf16Length++;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
        dataLength += utf8SequenceLength;

        if (!dataEnd) {
            for (int i = 1; i < utf8SequenceLength; ++i) {
                if (!data[i])
                    return 0;
            }
        } else if (dataEnd - data < utf8SequenceLength) {
            return 0;
        }

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
            return 0;

        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return 0;
            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
            utf16Length++;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));
            utf16Length += 2;
        } else {
            return 0;
        }
    }

    return stringHasher.hashWithTop8BitsMasked();
}
Beispiel #10
0
unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length)
{
    if (!data)
        return 0;

    WTF::StringHasher stringHasher;
    utf16Length = 0;

    while (data < dataEnd) {
        if (isASCII(*data)) {
            stringHasher.addCharacter(*data++);
            utf16Length++;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);

        if (dataEnd - data < utf8SequenceLength)
            return false;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
            return 0;

        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return 0;
            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
            utf16Length++;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
                                       static_cast<UChar>(U16_TRAIL(character)));
            utf16Length += 2;
        } else
            return 0;
    }

    return stringHasher.hash();
}
Beispiel #11
0
// Counts the characters of `self` by running its bytes through the
// converter for its encoding, one code point at a time.
//
// In ucs2_mode a code point outside the BMP counts as 2, otherwise every
// code point counts as 1. Bytes the converter rejects are counted in units
// of the encoding's minimum character size (rounded up) and mark the
// string as not validly encoded. The validity flag is stored on `self`
// before returning.
long
str_ucnv_length(rb_str_t *self, bool ucs2_mode)
{
    USE_CONVERTER(cnv, self->encoding);

    const char *cursor = self->bytes;
    const char *limit = cursor + self->length_in_bytes;
    long count = 0;
    bool well_formed = true;

    while (true) {
        const char *chunk_start = cursor;
        // pull the next Unicode code point out of the byte stream
        UErrorCode status = U_ZERO_ERROR;
        UChar32 code_point = ucnv_getNextUChar(cnv, &cursor, limit, &status);

        if (status == U_INDEX_OUTOFBOUNDS_ERROR) {
            // nothing left to read
            break;
        }

        if (U_FAILURE(status)) {
            // undecodable bytes: count what the converter skipped over
            well_formed = false;
            long unit_size = self->encoding->min_char_size;
            long skipped_bytes = cursor - chunk_start;
            count += div_round_up(skipped_bytes, unit_size);
            continue;
        }

        if (!ucs2_mode || U_IS_BMP(code_point)) {
            ++count;
        }
        else {
            count += 2;
        }
    }

    ucnv_close(cnv);

    str_set_valid_encoding(self, well_formed);

    return count;
}
Beispiel #12
0
// Compares the code units in [a, aEnd) with the UTF-16 form of the UTF-8
// bytes in [b, bEnd), decoding b on the fly.
//
// Returns true only when b is well-formed UTF-8 and decodes to exactly the
// code units of a. Returns false on any mismatch, on truncated or illegal
// UTF-8, on an encoded surrogate or out-of-range value, and when b decodes
// to more code units than a holds.
//
// Every read from a is bounds-checked against aEnd, so the function is safe
// to call without first verifying that the two lengths agree.
ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
{
    while (b < bEnd) {
        if (isASCII(*b)) {
            // a ran out while b still has characters: not equal.
            if (a >= aEnd || *a++ != *b++)
                return false;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

        // Truncated sequence at the end of b.
        if (bEnd - b < utf8SequenceLength)
            return false;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
            return false;

        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return false;
            if (a >= aEnd || *a++ != character)
                return false;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // A supplementary character occupies two code units in a.
            if (aEnd - a < 2)
                return false;
            if (*a++ != U16_LEAD(character))
                return false;
            if (*a++ != U16_TRAIL(character))
                return false;
        } else {
            return false;
        }
    }

    // Equal only if a was consumed completely as well.
    return a == aEnd;
}
Beispiel #13
0
// Builds the glyph page for page `pageNumber` of `font`.
//
// The page covers the GlyphPage::size consecutive code points starting at
// pageNumber * GlyphPage::size. Their UTF-16 encoding is written into a
// stack buffer (with some characters substituted, see below) and handed to
// fillGlyphPage() to resolve glyphs.
//
// Returns the filled page, marked immutable, or nullptr when fillGlyphPage()
// reports that it has no glyphs (and, on iOS, for page 6 of a font for which
// shouldNotBeUsedForArabic() is true).
static RefPtr<GlyphPage> createAndFillGlyphPage(unsigned pageNumber, const Font* font)
{
#if PLATFORM(IOS)
    // FIXME: Times New Roman contains Arabic glyphs, but Core Text doesn't know how to shape them. See <rdar://problem/9823975>.
    // Once we have the fix for <rdar://problem/9823975> then remove this code together with Font::shouldNotBeUsedForArabic()
    // in <rdar://problem/12096835>.
    if (pageNumber == 6 && font->shouldNotBeUsedForArabic())
        return nullptr;
#endif

    // First code point covered by this page.
    unsigned start = pageNumber * GlyphPage::size;
    // Sized for the worst case of two UTF-16 code units per code point.
    UChar buffer[GlyphPage::size * 2 + 2];
    unsigned bufferLength;
    // Fill in a buffer with the entire "page" of characters that we want to look up glyphs for.
    // NOTE(review): only `start` is tested, so this assumes a page never
    // straddles the BMP boundary - confirm against GlyphPage::size.
    if (U_IS_BMP(start)) {
        // BMP page: one code unit per code point.
        bufferLength = GlyphPage::size;
        for (unsigned i = 0; i < GlyphPage::size; i++)
            buffer[i] = start + i;

        // Substitute characters that need special rendering. The checks
        // below compare `start` with the start of the page containing each
        // special character (the character with its low page-index bits
        // masked off).
        if (!start) {
            // Control characters must not render at all.
            for (unsigned i = 0; i < 0x20; ++i)
                buffer[i] = zeroWidthSpace;
            for (unsigned i = 0x7F; i < 0xA0; i++)
                buffer[i] = zeroWidthSpace;
            buffer[softHyphen] = zeroWidthSpace;

            // \n, \t, and nonbreaking space must render as a space.
            buffer[(int)'\n'] = ' ';
            buffer[(int)'\t'] = ' ';
            buffer[noBreakSpace] = ' ';
        } else if (start == (leftToRightMark & ~(GlyphPage::size - 1))) {
            // LRM, RLM, LRE, RLE, ZWNJ, ZWJ, and PDF must not render at all.
            buffer[leftToRightMark - start] = zeroWidthSpace;
            buffer[rightToLeftMark - start] = zeroWidthSpace;
            buffer[leftToRightEmbed - start] = zeroWidthSpace;
            buffer[rightToLeftEmbed - start] = zeroWidthSpace;
            buffer[leftToRightOverride - start] = zeroWidthSpace;
            buffer[rightToLeftOverride - start] = zeroWidthSpace;
            buffer[zeroWidthNonJoiner - start] = zeroWidthSpace;
            buffer[zeroWidthJoiner - start] = zeroWidthSpace;
            buffer[popDirectionalFormatting - start] = zeroWidthSpace;
        } else if (start == (objectReplacementCharacter & ~(GlyphPage::size - 1))) {
            // Object replacement character must not render at all.
            buffer[objectReplacementCharacter - start] = zeroWidthSpace;
        } else if (start == (zeroWidthNoBreakSpace & ~(GlyphPage::size - 1))) {
            // ZWNBS/BOM must not render at all.
            buffer[zeroWidthNoBreakSpace - start] = zeroWidthSpace;
        }
    } else {
        // Page outside the BMP: each code point is stored as a lead/trail
        // surrogate pair, so the buffer holds twice as many code units.
        bufferLength = GlyphPage::size * 2;
        for (unsigned i = 0; i < GlyphPage::size; i++) {
            int c = i + start;
            buffer[i * 2] = U16_LEAD(c);
            buffer[i * 2 + 1] = U16_TRAIL(c);
        }
    }

    // Now that we have a buffer full of characters, we want to get back an array
    // of glyph indices. This part involves calling into the platform-specific
    // routine of our glyph map for actually filling in the page with the glyphs.
    // Success is not guaranteed. For example, Times fails to fill page 260, giving glyph data
    // for only 128 out of 256 characters.
    RefPtr<GlyphPage> glyphPage;
    // Pick the page representation before filling it.
    if (GlyphPage::mayUseMixedFontsWhenFilling(buffer, bufferLength, font))
        glyphPage = GlyphPage::createForMixedFonts();
    else
        glyphPage = GlyphPage::createForSingleFont(font);

    bool haveGlyphs = fillGlyphPage(*glyphPage, buffer, bufferLength, font);
    if (!haveGlyphs)
        return nullptr;

    // Freeze the page before handing it out.
    glyphPage->setImmutable();
    return glyphPage;
}