/**
 * Expand every <xxxxx> tag in buf into the code point it names.
 *
 * A tag is a run of hexadecimal digits wrapped in '<' and '>'. Since a tag is
 * always larger than the value it encodes, the tag is replaced directly in the
 * string's own buffer: the write cursor never overtakes the read cursor, and
 * the tail of the text is shifted down as tags shrink.
 *
 * Characters inside a tag that are not hex digits are ignored. A tag that is
 * still open when the text ends is treated as closed at the end of the text,
 * so the scan never runs past the terminator.
 *
 * @param buf  the string to rewrite in place.
 */
void IdnaConfTest::ExplainCodePointTag(UnicodeString& buf){
    buf.append((UChar)0); // add a terminating NUL; both loops below stop on it
    UChar* out = buf.getBuffer(buf.length());
    if (!out) {
        // No writable buffer was handed out, so there is nothing to rewrite
        // (and no open buffer to release).
        return;
    }
    const UChar* in = out;

    while (*in != 0) {
        if (*in != 0x3C) { // not '<': copy through unchanged
            *out++ = *in++;
            continue;
        }

        ++in; // skip '<'
        UChar32 cp = 0;
        // Accumulate hex digits until '>' or, for an unterminated tag, the NUL.
        for (; *in != 0 && *in != 0x3E; ++in) {
            if (0x30 <= *in && *in <= 0x39) {        // 0-9
                cp = (cp * 16) + (*in - 0x30);
            } else if (0x61 <= *in && *in <= 0x66) { // a-f
                cp = (cp * 16) + (*in - 0x61) + 10;
            } else if (0x41 <= *in && *in <= 0x46) { // A-F
                cp = (cp * 16) + (*in - 0x41) + 10;
            }
            // anything else inside the tag is silently skipped
        }
        if (*in == 0x3E) {
            ++in; // skip '>'
        }

        if (U_IS_BMP(cp)) {
            *out++ = (UChar)cp;
        } else {
            *out++ = U16_LEAD(cp);
            *out++ = U16_TRAIL(cp);
        }
    }

    *out = 0; // terminate the rewritten text
    buf.releaseBuffer(); // no length given: the new length comes from the NUL
}
U_NAMESPACE_BEGIN

/*
 * Character-access callback that reads through the UText held in a
 * URegexUTextUnescapeCharContext.
 *
 * offset  position being requested; compared against context->lastOffset to
 *         decide how the UText iterator has to be moved.
 * ct      opaque pointer, cast to struct URegexUTextUnescapeCharContext *.
 *
 * Returns the value read as a UChar when it is in the BMP, otherwise 0.
 */
U_CFUNC UChar U_CALLCONV
uregex_utext_unescape_charAt(int32_t offset, void *ct) {
    struct URegexUTextUnescapeCharContext *ctx =
        (struct URegexUTextUnescapeCharContext *)ct;
    UChar32 ch;

    if (offset == ctx->lastOffset + 1) {
        // Request is one past the last offset: just read forward.
        ch = UTEXT_NEXT32(ctx->text);
        ctx->lastOffset++;
    } else if (offset == ctx->lastOffset) {
        // Same offset as last time: step back to re-read, then step forward
        // again so the iterator ends where it started.
        ch = UTEXT_PREVIOUS32(ctx->text);
        UTEXT_NEXT32(ctx->text);
    } else {
        // Any other offset: move the iterator by the difference, then read.
        utext_moveIndex32(ctx->text, offset - ctx->lastOffset - 1);
        ch = UTEXT_NEXT32(ctx->text);
        ctx->lastOffset = offset;
    }

    // !!!: Doesn't handle characters outside BMP
    if (!U_IS_BMP(ch)) {
        return 0;
    }
    return (UChar)ch;
}
// Returns the system fallback font for |character|, looked up through
// FontCache and memoized in the per-font map held by systemFallbackCache().
// The null character bypasses the per-character map and is always looked up
// directly.
RefPtr<Font> Font::systemFallbackFontForCharacter(UChar32 character, const FontDescription& description, bool isForPlatformFont) const
{
    // The per-font map entry is created first, even on the null-character path.
    auto perFontEntry = systemFallbackCache().add(this, CharacterFallbackMap());

    if (!character) {
        UChar nullCodeUnit = 0;
        return FontCache::singleton().systemFallbackForCharacters(description, this, isForPlatformFont, &nullCodeUnit, 1);
    }

    auto perCharacterEntry = perFontEntry.iterator->value.add(CharacterFallbackMapKey(description.locale(), character, isForPlatformFont), nullptr);
    Font*& cachedFont = perCharacterEntry.iterator->value;
    if (cachedFont)
        return cachedFont;

    // Nothing stored for this key yet: encode the character as UTF-16 and ask the font cache.
    UChar utf16[2];
    unsigned utf16Length;
    if (!U_IS_BMP(character)) {
        utf16[0] = U16_LEAD(character);
        utf16[1] = U16_TRAIL(character);
        utf16Length = 2;
    } else {
        utf16[0] = FontCascade::normalizeSpaces(character);
        utf16Length = 1;
    }

    // The result is written through the reference, i.e. into the map slot itself.
    cachedFont = FontCache::singleton().systemFallbackForCharacters(description, this, isForPlatformFont, utf16, utf16Length).get();
    if (cachedFont)
        cachedFont->m_isUsedInSystemFallbackCache = true;

    return cachedFont;
}
// Converts UTF-8 in [*sourceStart, sourceEnd) to UTF-16 in [*targetStart, targetEnd).
//
// On return *sourceStart and *targetStart are advanced to where conversion stopped.
// The result is conversionOK when all input was consumed, sourceExhausted when the
// input ends inside a sequence, targetExhausted when the output is full, and
// sourceIllegal for a malformed sequence. Surrogate or out-of-range values are
// sourceIllegal when |strict| is set; otherwise replacementCharacter is written
// in their place.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd,
    UChar** targetStart, UChar* targetEnd, bool strict)
{
    ConversionResult status = conversionOK;
    const char* in = *sourceStart;
    UChar* out = *targetStart;

    while (in < sourceEnd) {
        int sequenceLength = inlineUTF8SequenceLength(*in);
        if (sourceEnd - in < sequenceLength) {
            status = sourceExhausted;
            break;
        }

        // Legality is checked whether lenient or strict.
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(in), sequenceLength)) {
            status = sourceIllegal;
            break;
        }

        UChar32 scalar = readUTF8Sequence(in, sequenceLength);

        if (target_is_full: false) { }
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
// Maps a numeric character reference value to the value that should be used:
// U+FFFD for zero, values above 0x10FFFF and surrogates; adjustEntity() for
// other BMP values; everything else unchanged.
inline static UChar32 legalEntityFor(UChar32 value)
{
    // FIXME: A number of specific entity values generate parse errors.
    const UChar32 replacement = 0xFFFD;

    if (!value)
        return replacement;
    if (value > 0x10FFFF)
        return replacement;
    if (value >= 0xD800 && value <= 0xDFFF)
        return replacement;

    return U_IS_BMP(value) ? adjustEntity(value) : value;
}
/**
 * Case-convert a null-terminated string in place, one code point at a time.
 *
 * The semantics of this function are broken: changing the case of a string
 * can increase its length, and there is no way to handle that here. If a
 * converted code point would need more room than is available without
 * overwriting text that has not been read yet, conversion stops and the
 * string is terminated at that point.
 *
 * @param convertString  the string to convert; rewritten in place
 * @param caseFunction   called with each code point; returns its replacement
 */
static void
doCaseConvert(
            XMLCh*          convertString,
            FunctionType    caseFunction)
{
    const unsigned int  length = XMLString::stringLen(convertString);

    size_t  in = 0;
    size_t  out = 0;

    while (in < length)
    {
        // Fetch the next code point; this advances the read index.
        UChar32  source;
        U16_NEXT_UNSAFE(convertString, in, source);

        const UChar32  result = caseFunction(source);

        // One code unit was read but two would have to be written.
        const bool  growsToPair = U_IS_BMP(source) && !U_IS_BMP(result);

        if (growsToPair && in - out == 1)
        {
            // The write index sits directly behind the read index, so the
            // second unit would overwrite the next character. Give up.
            break;
        }

        U16_APPEND_UNSAFE(convertString, out, result);
    }

    convertString[out] = 0;
}
// Writes |character| at |destination| as UTF-16 (one code unit for a BMP value,
// a lead/trail pair otherwise) and returns the position just past what was written.
static inline UChar* appendCharacter(UChar* destination, int character)
{
    ASSERT(character != nonCharacter);
    ASSERT(!U_IS_SURROGATE(character));

    if (!U_IS_BMP(character)) {
        destination[0] = U16_LEAD(character);
        destination[1] = U16_TRAIL(character);
        return destination + 2;
    }

    destination[0] = character;
    return destination + 1;
}
// Stores |value| at the start of |result| as UTF-16 and returns the number of
// code units written: 1 for a BMP value, 2 (lead + trail) otherwise.
static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
{
    if (!U_IS_BMP(value)) {
        result[0] = U16_LEAD(value);
        result[1] = U16_TRAIL(value);
        return 2;
    }

    UChar codeUnit = static_cast<UChar>(value);
    ASSERT(codeUnit == value);
    *result = codeUnit;
    return 1;
}
unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) { if (!data) return 0; StringHasher stringHasher; dataLength = 0; utf16Length = 0; while (data < dataEnd || (!dataEnd && *data)) { if (isASCII(*data)) { stringHasher.addCharacter(*data++); dataLength++; utf16Length++; continue; } int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); dataLength += utf8SequenceLength; if (!dataEnd) { for (int i = 1; i < utf8SequenceLength; ++i) { if (!data[i]) return 0; } } else if (dataEnd - data < utf8SequenceLength) { return 0; } if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) return 0; UChar32 character = readUTF8Sequence(data, utf8SequenceLength); ASSERT(!isASCII(character)); if (U_IS_BMP(character)) { // UTF-16 surrogate values are illegal in UTF-32 if (U_IS_SURROGATE(character)) return 0; stringHasher.addCharacter(static_cast<UChar>(character)); // normal case utf16Length++; } else if (U_IS_SUPPLEMENTARY(character)) { stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character))); utf16Length += 2; } else { return 0; } } return stringHasher.hashWithTop8BitsMasked(); }
unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length) { if (!data) return 0; WTF::StringHasher stringHasher; utf16Length = 0; while (data < dataEnd) { if (isASCII(*data)) { stringHasher.addCharacter(*data++); utf16Length++; continue; } int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); if (dataEnd - data < utf8SequenceLength) return false; if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) return 0; UChar32 character = readUTF8Sequence(data, utf8SequenceLength); ASSERT(!isASCII(character)); if (U_IS_BMP(character)) { // UTF-16 surrogate values are illegal in UTF-32 if (U_IS_SURROGATE(character)) return 0; stringHasher.addCharacter(static_cast<UChar>(character)); // normal case utf16Length++; } else if (U_IS_SUPPLEMENTARY(character)) { stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character))); utf16Length += 2; } else return 0; } return stringHasher.hash(); }
// Counts the characters in self by walking its bytes through the converter for
// self->encoding one code point at a time.
//
// ucs2_mode: when true, a code point outside the BMP counts as 2 instead of 1.
//
// Bytes that fail to convert are counted as
// div_round_up(bytes consumed, encoding->min_char_size) and mark the string as
// not validly encoded. The validity flag is stored on self via
// str_set_valid_encoding() before returning.
long
str_ucnv_length(rb_str_t *self, bool ucs2_mode)
{
    USE_CONVERTER(cnv, self->encoding);

    const char *cursor = self->bytes;
    const char *limit = cursor + self->length_in_bytes;
    long count = 0;
    bool well_formed = true;

    while (true) {
        const char *char_start = cursor;

        // read one code point; cursor is moved forward by the converter
        UErrorCode status = U_ZERO_ERROR;
        UChar32 c = ucnv_getNextUChar(cnv, &cursor, limit, &status);

        if (status == U_INDEX_OUTOFBOUNDS_ERROR) {
            // nothing left to read
            break;
        }

        if (U_FAILURE(status)) {
            well_formed = false;
            long unit_size = self->encoding->min_char_size;
            long consumed = cursor - char_start;
            count += div_round_up(consumed, unit_size);
            continue;
        }

        if (ucs2_mode && !U_IS_BMP(c)) {
            count += 2;
        }
        else {
            ++count;
        }
    }

    ucnv_close(cnv);

    str_set_valid_encoding(self, well_formed);

    return count;
}
// Compares the code units in [a, aEnd) with the UTF-8 text in [b, bEnd), decoding
// the UTF-8 to UTF-16 on the fly.
//
// Returns true only when every decoded code unit matches and both ranges end
// together. Returns false on any mismatch, and when the UTF-8 is truncated,
// illegal, a surrogate, or out of range.
//
// |a| is never read at or beyond |aEnd|: if it runs out while |b| still has
// text, the strings differ and the result is false.
//
// NOTE(review): CharType is presumably supplied by a template declaration that
// precedes this definition — confirm.
ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
{
    while (b < bEnd) {
        // Every iteration consumes at least one unit of |a|.
        if (a >= aEnd)
            return false;

        if (isASCII(*b)) {
            if (*a++ != *b++)
                return false;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

        if (bEnd - b < utf8SequenceLength)
            return false;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
            return false;

        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return false;
            if (*a++ != character)
                return false;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // A surrogate pair needs two units of |a|.
            if (aEnd - a < 2)
                return false;
            if (*a++ != U16_LEAD(character))
                return false;
            if (*a++ != U16_TRAIL(character))
                return false;
        } else {
            return false;
        }
    }

    return a == aEnd;
}
static RefPtr<GlyphPage> createAndFillGlyphPage(unsigned pageNumber, const Font* font) { #if PLATFORM(IOS) // FIXME: Times New Roman contains Arabic glyphs, but Core Text doesn't know how to shape them. See <rdar://problem/9823975>. // Once we have the fix for <rdar://problem/9823975> then remove this code together with Font::shouldNotBeUsedForArabic() // in <rdar://problem/12096835>. if (pageNumber == 6 && font->shouldNotBeUsedForArabic()) return nullptr; #endif unsigned start = pageNumber * GlyphPage::size; UChar buffer[GlyphPage::size * 2 + 2]; unsigned bufferLength; // Fill in a buffer with the entire "page" of characters that we want to look up glyphs for. if (U_IS_BMP(start)) { bufferLength = GlyphPage::size; for (unsigned i = 0; i < GlyphPage::size; i++) buffer[i] = start + i; if (!start) { // Control characters must not render at all. for (unsigned i = 0; i < 0x20; ++i) buffer[i] = zeroWidthSpace; for (unsigned i = 0x7F; i < 0xA0; i++) buffer[i] = zeroWidthSpace; buffer[softHyphen] = zeroWidthSpace; // \n, \t, and nonbreaking space must render as a space. buffer[(int)'\n'] = ' '; buffer[(int)'\t'] = ' '; buffer[noBreakSpace] = ' '; } else if (start == (leftToRightMark & ~(GlyphPage::size - 1))) { // LRM, RLM, LRE, RLE, ZWNJ, ZWJ, and PDF must not render at all. buffer[leftToRightMark - start] = zeroWidthSpace; buffer[rightToLeftMark - start] = zeroWidthSpace; buffer[leftToRightEmbed - start] = zeroWidthSpace; buffer[rightToLeftEmbed - start] = zeroWidthSpace; buffer[leftToRightOverride - start] = zeroWidthSpace; buffer[rightToLeftOverride - start] = zeroWidthSpace; buffer[zeroWidthNonJoiner - start] = zeroWidthSpace; buffer[zeroWidthJoiner - start] = zeroWidthSpace; buffer[popDirectionalFormatting - start] = zeroWidthSpace; } else if (start == (objectReplacementCharacter & ~(GlyphPage::size - 1))) { // Object replacement character must not render at all. 
buffer[objectReplacementCharacter - start] = zeroWidthSpace; } else if (start == (zeroWidthNoBreakSpace & ~(GlyphPage::size - 1))) { // ZWNBS/BOM must not render at all. buffer[zeroWidthNoBreakSpace - start] = zeroWidthSpace; } } else { bufferLength = GlyphPage::size * 2; for (unsigned i = 0; i < GlyphPage::size; i++) { int c = i + start; buffer[i * 2] = U16_LEAD(c); buffer[i * 2 + 1] = U16_TRAIL(c); } } // Now that we have a buffer full of characters, we want to get back an array // of glyph indices. This part involves calling into the platform-specific // routine of our glyph map for actually filling in the page with the glyphs. // Success is not guaranteed. For example, Times fails to fill page 260, giving glyph data // for only 128 out of 256 characters. RefPtr<GlyphPage> glyphPage; if (GlyphPage::mayUseMixedFontsWhenFilling(buffer, bufferLength, font)) glyphPage = GlyphPage::createForMixedFonts(); else glyphPage = GlyphPage::createForSingleFont(font); bool haveGlyphs = fillGlyphPage(*glyphPage, buffer, bufferLength, font); if (!haveGlyphs) return nullptr; glyphPage->setImmutable(); return glyphPage; }