/*
 * Compare the text delivered by two UCharIterators.
 *
 * Both iterators are first moved to their start. The comparison is in
 * code unit order, or in code point order when codePointOrder is set.
 * Returns 0 for NULL arguments, for two identical iterator pointers, and
 * for equal text; otherwise the difference of the first differing units
 * (after the code point order fix-up, if requested).
 * It is undefined where the iterators stand when the function returns.
 */
U_CAPI int32_t U_EXPORT2
u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
    UChar32 c1, c2;
    UBool inPair;

    /* bad arguments, or one and the same iterator: nothing to compare */
    if(iter1==NULL || iter2==NULL) {
        return 0;
    }
    if(iter1==iter2) {
        return 0;
    }

    /* always compare from the beginning of both texts */
    iter1->move(iter1, 0, UITER_START);
    iter2->move(iter2, 0, UITER_START);

    /* skip the common prefix - identical units never need a fix-up */
    do {
        c1=iter1->next(iter1);
        c2=iter2->next(iter2);
        if(c1==c2 && c1==-1) {
            /* both iterators are exhausted at the same time */
            return 0;
        }
    } while(c1==c2);

    /* only units in or above the surrogate range can be mis-ordered */
    if(codePointOrder && c1>=0xd800 && c2>=0xd800) {
        /*
         * A unit that is half of a surrogate pair stays >=d800.
         * Anything else (BMP code point, possibly an unpaired surrogate)
         * gets 0x2800 subtracted so that it sorts below supplementary ones.
         */
        inPair=(UBool)(c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1)));
        if(!inPair && U16_IS_TRAIL(c1)) {
            /* step back over c1 itself, then look at the unit before it */
            iter1->previous(iter1);
            inPair=(UBool)U16_IS_LEAD(iter1->previous(iter1));
        }
        if(!inPair) {
            c1-=0x2800;
        }

        inPair=(UBool)(c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2)));
        if(!inPair && U16_IS_TRAIL(c2)) {
            iter2->previous(iter2);
            inPair=(UBool)U16_IS_LEAD(iter2->previous(iter2));
        }
        if(!inPair) {
            c2-=0x2800;
        }
    }

    /* c1 and c2 are now in the requested (code unit or code point) order */
    return (int32_t)c1-(int32_t)c2;
}
/*
 * Test if a substring match inside a string is at code point boundaries,
 * i.e. neither edge of [match, matchLimit) splits a surrogate pair.
 *
 * All pointers refer to the same buffer.
 * The limit pointer may be NULL, all others must be real pointers.
 *
 * start      - beginning of the searched buffer
 * match      - first code unit of the match
 * matchLimit - one past the last code unit of the match (match is non-empty:
 *              *(matchLimit-1) is always read)
 * limit      - end of the searched buffer, or NULL
 *
 * Returns FALSE if either edge is in the middle of a surrogate pair,
 * TRUE otherwise.
 */
static U_INLINE UBool
isMatchAtCPBoundary(const UChar* start, const UChar* match, const UChar* matchLimit, const UChar* limit) {
    if (U16_IS_TRAIL(*match) && start != match && U16_IS_LEAD(*(match - 1))) {
        /* the leading edge of the match is in the middle of a surrogate pair */
        return FALSE;
    }
    /*
     * Only look at *matchLimit when the match does not end exactly at the
     * buffer limit; otherwise the read would be one past the buffer.
     * With limit==NULL the comparison is always true and *matchLimit is read
     * (presumably the buffer is NUL-terminated then - confirm with callers).
     */
    if (U16_IS_LEAD(*(matchLimit - 1)) && matchLimit != limit && U16_IS_TRAIL(*matchLimit)) {
        /* the trailing edge of the match is in the middle of a surrogate pair */
        return FALSE;
    }
    return TRUE;
}
unsigned Font::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion) { static bool expandAroundIdeographs = canExpandAroundIdeographsInComplexText(); unsigned count = 0; if (direction == LTR) { for (size_t i = 0; i < length; ++i) { UChar32 character = characters[i]; if (treatAsSpace(character)) { count++; isAfterExpansion = true; continue; } if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) { character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]); i++; } if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) { if (!isAfterExpansion) count++; count++; isAfterExpansion = true; continue; } isAfterExpansion = false; } } else { for (size_t i = length; i > 0; --i) { UChar32 character = characters[i - 1]; if (treatAsSpace(character)) { count++; isAfterExpansion = true; continue; } if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) { character = U16_GET_SUPPLEMENTARY(characters[i - 2], character); i--; } if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) { if (!isAfterExpansion) count++; count++; isAfterExpansion = true; continue; } isAfterExpansion = false; } } return count; }
unsigned Character::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion, const TextJustify textJustify) { unsigned count = 0; if (direction == LTR) { for (size_t i = 0; i < length; ++i) { UChar32 character = characters[i]; if (treatAsSpace(character)) { count++; isAfterExpansion = true; continue; } if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) { character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]); i++; } if (textJustify == TextJustify::TextJustifyAuto && isCJKIdeographOrSymbol(character)) { if (!isAfterExpansion) count++; count++; isAfterExpansion = true; continue; } isAfterExpansion = false; } } else { for (size_t i = length; i > 0; --i) { UChar32 character = characters[i - 1]; if (treatAsSpace(character)) { count++; isAfterExpansion = true; continue; } if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) { character = U16_GET_SUPPLEMENTARY(characters[i - 2], character); i--; } if (textJustify == TextJustify::TextJustifyAuto && isCJKIdeographOrSymbol(character)) { if (!isAfterExpansion) count++; count++; isAfterExpansion = true; continue; } isAfterExpansion = false; } } return count; }
static void TestCodeUnitValues() { static uint16_t codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0}; int16_t i; for(i=0; i<sizeof(codeunit)/sizeof(codeunit[0]); i++){ UChar c=codeunit[i]; log_verbose("Testing code unit value of %x\n", c); if(i<4){ if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c) || !U16_IS_SINGLE(c) || U16_IS_LEAD(c) || U16_IS_TRAIL(c)){ log_err("ERROR: %x is a single character\n", c); } } if(i >= 4 && i< 8){ if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c) || !U16_IS_LEAD(c) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c)){ log_err("ERROR: %x is a first surrogate\n", c); } } if(i >= 8 && i< 12){ if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || !U16_IS_TRAIL(c) || U16_IS_SINGLE(c) || U16_IS_LEAD(c)){ log_err("ERROR: %x is a second surrogate\n", c); } } } }
void MessagePattern::setParseError(UParseError *parseError, int32_t index) { if(parseError==NULL) { return; } parseError->offset=index; // Set preContext to some of msg before index. // Avoid splitting a surrogate pair. int32_t length=index; if(length>=U_PARSE_CONTEXT_LEN) { length=U_PARSE_CONTEXT_LEN-1; if(length>0 && U16_IS_TRAIL(msg[index-length])) { --length; } } msg.extract(index-length, length, parseError->preContext); parseError->preContext[length]=0; // Set postContext to some of msg starting at index. length=msg.length()-index; if(length>=U_PARSE_CONTEXT_LEN) { length=U_PARSE_CONTEXT_LEN-1; if(length>0 && U16_IS_LEAD(msg[index+length-1])) { --length; } } msg.extract(index, length, parseError->postContext); parseError->postContext[length]=0; }
/*
 * Returns the string for a string resource and, if pLength is not NULL,
 * stores its length there.
 *
 * URES_STRING_V2: the string lives in the 16-bit units. If its first unit
 * is not a trail surrogate the string is NUL-terminated; otherwise that
 * unit (and up to two more units) encode an explicit length prefix.
 * URES_STRING (res==offset): a 32-bit length followed by the characters,
 * with res==0 mapping to the shared empty string.
 * Any other resource type yields NULL and length 0.
 */
U_CAPI const UChar * U_EXPORT2
res_getString(const ResourceData *pResData, Resource res, int32_t *pLength) {
    const UChar *p=NULL;
    int32_t length=0;
    uint32_t offset=RES_GET_OFFSET(res);

    if(RES_GET_TYPE(res)==URES_STRING_V2) {
        int32_t head;
        p=(const UChar *)(pResData->p16BitUnits+offset);
        head=*p;
        if(!U16_IS_TRAIL(head)) {
            /* no length prefix: implicit length */
            length=u_strlen(p);
        } else if(head<0xdfef) {
            /* one prefix unit carrying a 10-bit length */
            length=head&0x3ff;
            p+=1;
        } else if(head<0xdfff) {
            /* two prefix units: high bits in the first, low 16 in the second */
            length=((head-0xdfef)<<16)|p[1];
            p+=2;
        } else {
            /* three prefix units: full 32-bit length in units 1 and 2 */
            length=((int32_t)p[1]<<16)|p[2];
            p+=3;
        }
    } else if(res==offset) /* RES_GET_TYPE(res)==URES_STRING */ {
        const int32_t *p32= res==0 ? &gEmptyString.length : pResData->pRoot+res;
        length=p32[0];
        p=(const UChar *)(p32+1);
    }

    if(pLength) {
        *pLength=length;
    }
    return p;
}
// FIXME: This function may not work if the emphasis mark uses a complex script, but none of the // standard emphasis marks do so. bool Font::getEmphasisMarkGlyphData(const AtomicString& mark, GlyphData& glyphData) const { if (mark.isEmpty()) return false; #if ENABLE(SVG_FONTS) // FIXME: Implement for SVG fonts. if (primaryFont()->isSVGFont()) return false; #endif UChar32 character = mark[0]; if (U16_IS_SURROGATE(character)) { if (!U16_IS_SURROGATE_LEAD(character)) return false; if (mark.length() < 2) return false; UChar low = mark[1]; if (!U16_IS_TRAIL(low)) return false; character = U16_GET_SUPPLEMENTARY(character, low); } glyphData = glyphDataForCharacter(character, false, EmphasisMarkVariant); return true; }
// Slow path for consuming the current character. May replace |character|
// with a composed kana or a full supplementary code point, in which case
// |clusterLength| is set to 2. Returns false when |character| is a
// surrogate that does not form a valid pair with the next code unit.
bool SurrogatePairAwareTextIterator::consumeSlowCase(UChar32& character, unsigned& clusterLength)
{
    if (character <= 0x30FE) {
        // Deal with Hiragana and Katakana voiced and semi-voiced syllables.
        // Normalize into composed form, and then look for glyph with base + combined mark.
        // The range check above keeps the performance impact minimal.
        UChar32 composed = normalizeVoicingMarks();
        if (composed) {
            character = composed;
            clusterLength = 2;
        }
        return true;
    }

    if (!U16_IS_SURROGATE(character))
        return true;

    // A pair must start with the high part and needs one more code unit
    // available after the current one.
    const bool startsWithLead = U16_IS_SURROGATE_LEAD(character);
    const bool hasFollowingUnit = m_currentCharacter + 1 < m_endCharacter;
    if (!startsWithLead || !hasFollowingUnit)
        return false;

    // The following unit must be a low surrogate; then compute the full
    // Unicode (32 bit) code point for the glyph lookup.
    UChar trail = m_characters[1];
    if (!U16_IS_TRAIL(trail))
        return false;

    character = U16_GET_SUPPLEMENTARY(character, trail);
    clusterLength = 2;
    return true;
}
// Reads the code unit at the current position and consumes it only if it is
// a trail surrogate. Returns that unit either way (for the UCharIterator
// path, a negative iterator result is returned truncated to UChar).
UChar FCDUIterCollationIterator::handleGetTrailSurrogate() {
    if(state > ITER_IN_FCD_SEGMENT) {
        // Reading from the normalized buffer.
        U_ASSERT(pos < normalized.length());
        UChar unit = normalized[pos];
        if(U16_IS_TRAIL(unit)) {
            ++pos;
        }
        return unit;
    }

    // Reading directly from the UCharIterator.
    UChar32 unit = iter.next(&iter);
    if(U16_IS_TRAIL(unit)) {
        // pos is only maintained while inside an FCD segment.
        if(state == ITER_IN_FCD_SEGMENT) {
            ++pos;
        }
    } else if(unit >= 0) {
        // Not a trail surrogate: put the unit back.
        iter.previous(&iter);
    }
    return (UChar)unit;
}
// Returns the code point that starts at code unit index |i|: the unit itself
// if it is not a surrogate, the combined supplementary code point if a
// lead/trail pair starts there, and 0 for an unpaired surrogate.
// |i| is not range-checked here.
UChar32 StringImpl::characterStartingAt(unsigned i)
{
    const UChar first = m_data[i];
    if (U16_IS_SINGLE(first))
        return first;

    const bool hasNextUnit = i + 1 < m_length;
    if (hasNextUnit && U16_IS_LEAD(first)) {
        const UChar second = m_data[i + 1];
        if (U16_IS_TRAIL(second))
            return U16_GET_SUPPLEMENTARY(first, second);
    }
    return 0;
}
// Returns 0 unless the iterator is reading from the normalized buffer.
// In that state, returns the code unit at pos and consumes it only if it is
// a trail surrogate.
UChar FCDUTF8CollationIterator::handleGetTrailSurrogate() {
    if(state == IN_NORMALIZED) {
        U_ASSERT(pos < normalized.length());
        UChar unit = normalized[pos];
        if(U16_IS_TRAIL(unit)) {
            ++pos;
        }
        return unit;
    }
    return 0;
}
// Returns the first code point of |characters|. A well-formed surrogate pair
// is combined into its supplementary code point; a malformed or truncated
// pair yields a space. characters[0] is read unconditionally.
static UChar32 surrogatePairAwareFirstCharacter(const UChar* characters, unsigned length)
{
    const UChar first = characters[0];
    if (!U16_IS_SURROGATE(first))
        return first;

    const bool isCompletePair = U16_IS_SURROGATE_LEAD(first) && length >= 2 && U16_IS_TRAIL(characters[1]);
    if (isCompletePair)
        return U16_GET_SUPPLEMENTARY(first, characters[1]);
    return ' ';
}
// Asks fontconfig for the best font covering |characters| and returns its
// family name. Returns a null String if a character cannot be added to the
// charset, if no match is found, or if the match has no file or family.
//
// Ownership notes:
//  - |cset|, |pattern| and |match| are all released on every path.
//  - FcPatternGetString() hands back a pointer into |match| rather than a
//    copy, so the family name is copied into a String *before* |match| is
//    destroyed.
static String getFontFamilyForCharacters(const UChar* characters, size_t numCharacters)
{
    FcCharSet* cset = FcCharSetCreate();
    for (size_t i = 0; i < numCharacters; ++i) {
        FcBool added;
        if (U16_IS_SURROGATE(characters[i])
            && U16_IS_SURROGATE_LEAD(characters[i])
            && i != numCharacters - 1
            && U16_IS_TRAIL(characters[i + 1])) {
            // Well-formed surrogate pair: add the supplementary code point
            // and skip the trail surrogate.
            added = FcCharSetAddChar(cset, U16_GET_SUPPLEMENTARY(characters[i], characters[i + 1]));
            i++;
        } else
            added = FcCharSetAddChar(cset, characters[i]);

        if (added == FcFalse) {
            // Do not leak the charset on failure.
            FcCharSetDestroy(cset);
            return String();
        }
    }

    FcPattern* pattern = FcPatternCreate();
    FcPatternAddCharSet(pattern, FC_CHARSET, cset);

    FcConfigSubstitute(0, pattern, FcMatchPattern);
    FcDefaultSubstitute(pattern);

    FcResult result;
    FcPattern* match = FcFontMatch(0, pattern, &result);

    String familyName;
    if (match) {
        FcChar8* filename;
        FcChar8* family;
        // Only accept matches that are backed by a font file and have a family.
        if (FcPatternGetString(match, FC_FILE, 0, &filename) == FcResultMatch
            && FcPatternGetString(match, FC_FAMILY, 0, &family) == FcResultMatch) {
            const char* charFamily = reinterpret_cast<char*>(family);
            // Copy while |match| (which owns the bytes) is still alive.
            familyName = String(charFamily);
        }
        FcPatternDestroy(match);
    }

    FcCharSetDestroy(cset);
    FcPatternDestroy(pattern);
    return familyName;
}
float ShapeResultSpacing::computeSpacing(const TextRun& run, size_t index, float& offset) { UChar32 character = run[index]; bool treatAsSpace = (Character::treatAsSpace(character) || (m_normalizeSpace && Character::isNormalizedCanvasSpaceCharacter(character))) && (character != '\t' || !m_allowTabs); if (treatAsSpace && character != noBreakSpaceCharacter) character = spaceCharacter; float spacing = 0; if (m_letterSpacing && !Character::treatAsZeroWidthSpace(character)) spacing += m_letterSpacing; if (treatAsSpace && (index || !isFirstRun(run) || character == noBreakSpaceCharacter)) spacing += m_wordSpacing; if (!hasExpansion()) return spacing; if (treatAsSpace) return spacing + nextExpansion(); if (run.is8Bit() || m_textJustify != TextJustify::TextJustifyAuto) return spacing; // isCJKIdeographOrSymbol() has expansion opportunities both before and // after each character. // http://www.w3.org/TR/jlreq/#line_adjustment if (U16_IS_LEAD(character) && index + 1 < run.length() && U16_IS_TRAIL(run[index + 1])) character = U16_GET_SUPPLEMENTARY(character, run[index + 1]); if (!Character::isCJKIdeographOrSymbol(character)) { m_isAfterExpansion = false; return spacing; } if (!m_isAfterExpansion) { // Take the expansion opportunity before this ideograph. float expandBefore = nextExpansion(); if (expandBefore) { offset += expandBefore; spacing += expandBefore; } if (!hasExpansion()) return spacing; } return spacing + nextExpansion(); }
/*
 * Count the code points in a UTF-16 string.
 * A lead surrogate immediately followed by a trail surrogate counts as one
 * code point; every other code unit, including an unpaired surrogate,
 * counts as one.
 *
 * s      - the string; NULL yields 0
 * length - number of code units, or -1 for a NUL-terminated string;
 *          values below -1 yield 0
 */
U_CAPI int32_t U_EXPORT2
u_countChar32(const UChar *s, int32_t length) {
    int32_t count=0;

    if(s==NULL || length<-1) {
        return 0;
    }

    if(length<0) {
        /* length==-1: NUL-terminated */
        UChar unit;
        while((unit=*s++)!=0) {
            ++count;
            /*
             * Looking one unit ahead is enough for UTF-16, and it is safe:
             * at worst that unit is the terminating NUL.
             */
            if(U16_IS_LEAD(unit) && U16_IS_TRAIL(*s)) {
                ++s;
            }
        }
    } else {
        const UChar *limit=s+length;
        while(s<limit) {
            ++count;
            /* only treat as a pair if the trail unit is inside the string */
            if(U16_IS_LEAD(*s) && (limit-s)>=2 && U16_IS_TRAIL(s[1])) {
                s+=2;
            } else {
                ++s;
            }
        }
    }
    return count;
}
// Moves backward by one code point and returns it, or U_SENTINEL at the
// start of the text or when building the previous segment fails.
// Depending on the state, reads from the UCharIterator with on-the-fly FCD
// checking, from the iterator inside an already checked segment, or from
// the normalized buffer; otherwise switches direction and retries.
UChar32 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
    for(;;) {
        if(state == ITER_CHECK_BWD) {
            UChar32 unit = iter.previous(&iter);
            if(unit < 0) {
                // Start of text.
                start = pos = 0;
                state = ITER_IN_FCD_SEGMENT;
                return U_SENTINEL;
            }
            if(!CollationFCD::hasLccc(unit)) {
                return unit;
            }

            // `before` is only fetched when the Tibetan check did not
            // already decide; it stays U_SENTINEL otherwise.
            UChar32 before = U_SENTINEL;
            if(CollationFCD::maybeTibetanCompositeVowel(unit) ||
                    CollationFCD::hasTccc(before = iter.previous(&iter))) {
                // Restore the iterator to just after `unit`, then let
                // previousSegment() handle the text and try again.
                iter.next(&iter);
                if(before >= 0) {
                    iter.next(&iter);
                }
                if(!previousSegment(errorCode)) {
                    return U_SENTINEL;
                }
                continue;
            }

            // hasLccc(trail)=true for all trail surrogates
            if(U16_IS_TRAIL(unit)) {
                if(before < 0) {
                    before = iter.previous(&iter);
                }
                if(U16_IS_LEAD(before)) {
                    return U16_GET_SUPPLEMENTARY(before, unit);
                }
            }
            // `before` is not part of the result: step forward over it again.
            if(before >= 0) {
                iter.next(&iter);
            }
            return unit;
        }

        if(state == ITER_IN_FCD_SEGMENT && pos != start) {
            UChar32 c = uiter_previous32(&iter);
            pos -= U16_LENGTH(c);
            U_ASSERT(c >= 0);
            return c;
        }

        if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
            UChar32 c = normalized.char32At(pos - 1);
            pos -= U16_LENGTH(c);
            return c;
        }

        switchToBackward();
    }
}
// Returns true when |character| is a high (lead) surrogate and the code unit
// after the current position exists and is a low (trail) surrogate, i.e. a
// full 32-bit code point can be formed before glyph lookup.
// |character| itself is not modified.
bool UTF16TextIterator::isValidSurrogatePair(UChar32& character)
{
    const bool startsWithLead = U16_IS_SURROGATE_LEAD(character);
    const bool hasNextUnit = m_characters + 1 < m_charactersEnd;
    if (!startsWithLead || !hasNextUnit)
        return false;

    return U16_IS_TRAIL(m_characters[1]);
}
// Moves forward by one code point and returns it, or a negative value at
// the end of the text or when building the next segment fails.
// Depending on the state, reads from the UCharIterator with on-the-fly FCD
// checking, from the iterator inside an already checked segment, or from
// the normalized buffer; otherwise switches direction and retries.
UChar32 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
    for(;;) {
        if(state == ITER_CHECK_FWD) {
            UChar32 unit = iter.next(&iter);
            if(unit < 0) {
                return unit;
            }

            if(CollationFCD::hasTccc(unit) &&
                    (CollationFCD::maybeTibetanCompositeVowel(unit) ||
                     CollationFCD::hasLccc(iter.current(&iter)))) {
                // Put `unit` back, let nextSegment() handle the text and
                // try again.
                iter.previous(&iter);
                if(!nextSegment(errorCode)) {
                    return U_SENTINEL;
                }
                continue;
            }

            if(U16_IS_LEAD(unit)) {
                UChar32 following = iter.next(&iter);
                if(U16_IS_TRAIL(following)) {
                    return U16_GET_SUPPLEMENTARY(unit, following);
                }
                if(following >= 0) {
                    // Not a trail surrogate: put it back.
                    iter.previous(&iter);
                }
            }
            return unit;
        }

        if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
            UChar32 c = uiter_next32(&iter);
            pos += U16_LENGTH(c);
            U_ASSERT(c >= 0);
            return c;
        }

        if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
            UChar32 c = normalized.char32At(pos);
            pos += U16_LENGTH(c);
            return c;
        }

        switchToForward();
    }
}
// Builds a fontconfig pattern that requests a scalable font covering all of
// |characters| (well-formed surrogate pairs are added as one supplementary
// code point), runs the standard config/default substitutions on it and
// returns it. The local charset reference is released before returning; the
// returned pattern is not destroyed here.
FcPattern* createFontConfigPatternForCharacters(const UChar* characters, int length)
{
    FcPattern* pattern = FcPatternCreate();
    FcCharSet* charSet = FcCharSetCreate();

    int index = 0;
    while (index < length) {
        const UChar unit = characters[index];
        const bool startsPair = U16_IS_SURROGATE(unit)
            && U16_IS_SURROGATE_LEAD(unit)
            && index != length - 1
            && U16_IS_TRAIL(characters[index + 1]);

        if (startsPair) {
            FcCharSetAddChar(charSet, U16_GET_SUPPLEMENTARY(unit, characters[index + 1]));
            index += 2;
        } else {
            FcCharSetAddChar(charSet, unit);
            ++index;
        }
    }

    FcPatternAddCharSet(pattern, FC_CHARSET, charSet);
    FcCharSetDestroy(charSet);

    FcPatternAddBool(pattern, FC_SCALABLE, FcTrue);
    FcConfigSubstitute(0, pattern, FcMatchPattern);
    FcDefaultSubstitute(pattern);
    return pattern;
}
// Finds the next break opportunity at or after |pos| under break-all rules.
// Returns the index of the first character that is a breakable space, that
// shouldBreakAfter() accepts, or that is a letter/number directly following
// another letter/number (moved back by one when that character is a trail
// surrogate past |pos|, so a pair is not split). Returns |length| if there
// is none. The characters before |pos| come from |str| when available and
// from the iterator's remembered prior context otherwise.
static inline int nextBreakablePositionBreakAllInternal(LazyLineBreakIterator& lazyBreakIterator, const CharacterType* str, unsigned length, int pos)
{
    const int end = static_cast<int>(length);

    CharacterType beforePrevious = pos > 1
        ? str[pos - 2]
        : static_cast<CharacterType>(lazyBreakIterator.secondToLastCharacter());
    CharacterType previous = pos > 0
        ? str[pos - 1]
        : static_cast<CharacterType>(lazyBreakIterator.lastCharacter());
    bool previousIsLetterOrNumber = isUnicodeCategoryLetterOrNumber(beforePrevious, previous);

    int i = pos;
    while (i < end) {
        const CharacterType current = str[i];

        if (isBreakableSpace(current) || shouldBreakAfter(beforePrevious, previous, current))
            return i;

        // A lead surrogate is classified together with its trail on the
        // next iteration.
        if (!U16_IS_LEAD(current)) {
            const bool currentIsLetterOrNumber = isUnicodeCategoryLetterOrNumber(previous, current);
            if (currentIsLetterOrNumber && previousIsLetterOrNumber) {
                if (i > pos && U16_IS_TRAIL(current))
                    return i - 1;
                return i;
            }
            previousIsLetterOrNumber = currentIsLetterOrNumber;
        }

        beforePrevious = previous;
        previous = current;
        ++i;
    }
    return end;
}
bool readUTFChar(const UChar* str, int* begin, int length, unsigned* codePoint) { if (U16_IS_SURROGATE(str[*begin])) { if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length || !U16_IS_TRAIL(str[*begin + 1])) { // Invalid surrogate pair. *codePoint = kUnicodeReplacementCharacter; return false; } // Valid surrogate pair. *codePoint = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]); (*begin)++; } else { // Not a surrogate, just one 16-bit word. *codePoint = str[*begin]; } if (U_IS_UNICODE_CHAR(*codePoint)) return true; // Invalid code point. *codePoint = kUnicodeReplacementCharacter; return false; }
// FIXME: This function may not work if the emphasis mark uses a complex script, but none of the // standard emphasis marks do so. bool Font::getEmphasisMarkGlyphData(const AtomicString& mark, GlyphData& glyphData) const { if (mark.isEmpty()) return false; UChar32 character = mark[0]; if (U16_IS_SURROGATE(character)) { if (!U16_IS_SURROGATE_LEAD(character)) return false; if (mark.length() < 2) return false; UChar low = mark[1]; if (!U16_IS_TRAIL(low)) return false; character = U16_GET_SUPPLEMENTARY(character, low); } glyphData = glyphDataForCharacter(character, false, EmphasisMarkVariant); return true; }
Font::CodePath Font::codePath(const TextRun& run) const { if (s_codePath != Auto) return s_codePath; #if ENABLE(SVG_FONTS) if (run.renderingContext()) return Simple; #endif #if PLATFORM(QT) && !HAVE(QRAWFONT) if (run.expansion() || run.rtl() || isSmallCaps() || wordSpacing() || letterSpacing()) return Complex; #endif if (m_fontDescription.featureSettings() && m_fontDescription.featureSettings()->size() > 0) return Complex; CodePath result = Simple; // Start from 0 since drawing and highlighting also measure the characters before run->from // FIXME: Should use a UnicodeSet in ports where ICU is used. Note that we // can't simply use UnicodeCharacter Property/class because some characters // are not 'combining', but still need to go to the complex path. // Alternatively, we may as well consider binary search over a sorted // list of ranges. for (int i = 0; i < run.length(); i++) { const UChar c = run[i]; if (c < 0x2E5) // U+02E5 through U+02E9 (Modifier Letters : Tone letters) continue; if (c <= 0x2E9) return Complex; if (c < 0x300) // U+0300 through U+036F Combining diacritical marks continue; if (c <= 0x36F) return Complex; if (c < 0x0591 || c == 0x05BE) // U+0591 through U+05CF excluding U+05BE Hebrew combining marks, Hebrew punctuation Paseq, Sof Pasuq and Nun Hafukha continue; if (c <= 0x05CF) return Complex; // U+0600 through U+109F Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, // Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada, // Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar if (c < 0x0600) continue; if (c <= 0x109F) return Complex; // U+1100 through U+11FF Hangul Jamo (only Ancient Korean should be left here if you precompose; // Modern Korean will be precomposed as a result of step A) if (c < 0x1100) continue; if (c <= 0x11FF) return Complex; if (c < 0x135D) // U+135D through U+135F Ethiopic combining marks continue; if (c <= 0x135F) return Complex; if (c < 0x1700) // U+1780 through U+18AF Tagalog, Hanunoo, Buhid, Taghanwa,Khmer, 
Mongolian continue; if (c <= 0x18AF) return Complex; if (c < 0x1900) // U+1900 through U+194F Limbu (Unicode 4.0) continue; if (c <= 0x194F) return Complex; if (c < 0x1980) // U+1980 through U+19DF New Tai Lue continue; if (c <= 0x19DF) return Complex; if (c < 0x1A00) // U+1A00 through U+1CFF Buginese, Tai Tham, Balinese, Batak, Lepcha, Vedic continue; if (c <= 0x1CFF) return Complex; if (c < 0x1DC0) // U+1DC0 through U+1DFF Comining diacritical mark supplement continue; if (c <= 0x1DFF) return Complex; // U+1E00 through U+2000 characters with diacritics and stacked diacritics if (c <= 0x2000) { result = SimpleWithGlyphOverflow; continue; } if (c < 0x20D0) // U+20D0 through U+20FF Combining marks for symbols continue; if (c <= 0x20FF) return Complex; if (c < 0x2CEF) // U+2CEF through U+2CF1 Combining marks for Coptic continue; if (c <= 0x2CF1) return Complex; if (c < 0x302A) // U+302A through U+302F Ideographic and Hangul Tone marks continue; if (c <= 0x302F) return Complex; if (c < 0xA67C) // U+A67C through U+A67D Combining marks for old Cyrillic continue; if (c <= 0xA67D) return Complex; if (c < 0xA6F0) // U+A6F0 through U+A6F1 Combining mark for Bamum continue; if (c <= 0xA6F1) return Complex; // U+A800 through U+ABFF Nagri, Phags-pa, Saurashtra, Devanagari Extended, // Hangul Jamo Ext. A, Javanese, Myanmar Extended A, Tai Viet, Meetei Mayek, if (c < 0xA800) continue; if (c <= 0xABFF) return Complex; if (c < 0xD7B0) // U+D7B0 through U+D7FF Hangul Jamo Ext. B continue; if (c <= 0xD7FF) return Complex; if (c <= 0xDBFF) { // High surrogate if (i == run.length() - 1) continue; UChar next = run[++i]; if (!U16_IS_TRAIL(next)) continue; UChar32 supplementaryCharacter = U16_GET_SUPPLEMENTARY(c, next); if (supplementaryCharacter < 0x1F1E6) // U+1F1E6 through U+1F1FF Regional Indicator Symbols continue; if (supplementaryCharacter <= 0x1F1FF) return Complex; if (supplementaryCharacter < 0xE0100) // U+E0100 through U+E01EF Unicode variation selectors. 
continue; if (supplementaryCharacter <= 0xE01EF) return Complex; // FIXME: Check for Brahmi (U+11000 block), Kaithi (U+11080 block) and other complex scripts // in plane 1 or higher. continue; } if (c < 0xFE00) // U+FE00 through U+FE0F Unicode variation selectors continue; if (c <= 0xFE0F) return Complex; if (c < 0xFE20) // U+FE20 through U+FE2F Combining half marks continue; if (c <= 0xFE2F) return Complex; } if (run.length() > 1 && typesettingFeatures()) return Complex; return result; }
static void _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; const UChar *source; char *target; int32_t *offsets; uint32_t targetCapacity, length, sourceIndex; UChar c, trail; char overflow[4]; source=pArgs->source; length=(int32_t)(pArgs->sourceLimit-source); if(length<=0) { /* no input, nothing to do */ return; } cnv=pArgs->converter; /* write the BOM if necessary */ if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { static const char bom[]={ (char)0xfe, (char)0xff }; ucnv_fromUWriteBytes(cnv, bom, 2, &pArgs->target, pArgs->targetLimit, &pArgs->offsets, -1, pErrorCode); cnv->fromUnicodeStatus=0; } target=pArgs->target; if(target >= pArgs->targetLimit) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; return; } targetCapacity=(uint32_t)(pArgs->targetLimit-target); offsets=pArgs->offsets; sourceIndex=0; /* c!=0 indicates in several places outside the main loops that a surrogate was found */ if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { /* the last buffer ended with a lead surrogate, output the surrogate pair */ ++source; --length; target[0]=(uint8_t)(c>>8); target[1]=(uint8_t)c; target[2]=(uint8_t)(trail>>8); target[3]=(uint8_t)trail; target+=4; targetCapacity-=4; if(offsets!=NULL) { *offsets++=-1; *offsets++=-1; *offsets++=-1; *offsets++=-1; } sourceIndex=1; cnv->fromUChar32=c=0; }
/*
 * internal function
 *
 * Compare two UTF-16 strings under full case folding, folding on the fly
 * one code point at a time instead of folding whole strings first.
 *
 * s1/length1, s2/length2 - the strings; a length of -1 means NUL-terminated
 * options     - passed through to ucase_toFullFolding(); additionally
 *               _STRNCMP_STYLE makes a NUL stop even a counted string, and
 *               U_COMPARE_CODE_POINT_ORDER requests code point order for
 *               the final difference
 * pErrorCode  - if it already indicates a failure, 0 is returned
 *
 * Returns 0 if the strings are equal under folding, -1/+1 if string 1/2
 * ends first, otherwise the difference of the first differing code units
 * (after the optional code point order fix-up).
 *
 * Each string has a two-state "level": level 0 reads from the source
 * string, level 1 reads from the fold buffer that holds the folding of the
 * code point most recently found to differ.
 */
U_CFUNC int32_t
u_strcmpFold(const UChar *s1, int32_t length1,
             const UChar *s2, int32_t length2,
             uint32_t options,
             UErrorCode *pErrorCode) {
    const UCaseProps *csp;

    /* current-level start/limit - s1/s2 as current */
    const UChar *start1, *start2, *limit1, *limit2;

    /* case folding variables */
    const UChar *p;
    int32_t length;

    /* stacks of previous-level start/current/limit */
    CmpEquivLevel stack1[2], stack2[2];

    /* case folding buffers, only use current-level start/limit */
    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];

    /* track which is the current level per string */
    int32_t level1, level2;

    /* current code units, and code points for lookups */
    UChar32 c1, c2, cp1, cp2;

    /* no argument error checking because this itself is not an API */

    /*
     * assume that at least the option U_COMPARE_IGNORE_CASE is set
     * otherwise this function would have to behave exactly as uprv_strCompare()
     */
    csp=ucase_getSingleton();
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* initialize */
    start1=s1;
    if(length1==-1) {
        /* NUL-terminated: no limit pointer */
        limit1=NULL;
    } else {
        limit1=s1+length1;
    }

    start2=s2;
    if(length2==-1) {
        /* NUL-terminated: no limit pointer */
        limit2=NULL;
    } else {
        limit2=s2+length2;
    }

    level1=level2=0;
    c1=c2=-1;

    /* comparison loop */
    for(;;) {
        /*
         * here a code unit value of -1 means "get another code unit"
         * below it will mean "this source is finished"
         */

        if(c1<0) {
            /* get next code unit from string 1, post-increment */
            for(;;) {
                if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
                    if(level1==0) {
                        /* end of the source string itself */
                        c1=-1;
                        break;
                    }
                } else {
                    ++s1;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level1;
                    start1=stack1[level1].start;
                } while(start1==NULL);
                s1=stack1[level1].s;
                limit1=stack1[level1].limit;
            }
        }

        if(c2<0) {
            /* get next code unit from string 2, post-increment */
            for(;;) {
                if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
                    if(level2==0) {
                        /* end of the source string itself */
                        c2=-1;
                        break;
                    }
                } else {
                    ++s2;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level2;
                    start2=stack2[level2].start;
                } while(start2==NULL);
                s2=stack2[level2].s;
                limit2=stack2[level2].limit;
            }
        }

        /*
         * compare c1 and c2
         * either variable c1, c2 is -1 only if the corresponding string is finished
         */
        if(c1==c2) {
            if(c1<0) {
                return 0;   /* c1==c2==-1 indicating end of strings */
            }
            c1=c2=-1;       /* make us fetch new code units */
            continue;
        } else if(c1<0) {
            return -1;      /* string 1 ends before string 2 */
        } else if(c2<0) {
            return 1;       /* string 2 ends before string 1 */
        }
        /* c1!=c2 && c1>=0 && c2>=0 */

        /* get complete code points for c1, c2 for lookups if either is a surrogate */
        cp1=c1;
        if(U_IS_SURROGATE(c1)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c1)) {
                /* s1 already points past c1, so *s1 is the candidate trail */
                if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
                    /* advance ++s1; only below if cp1 decomposes/case-folds */
                    cp1=U16_GET_SUPPLEMENTARY(c1, c);
                }
            } else /* isTrail(c1) */ {
                /* c1 is at s1-1, so the candidate lead is at s1-2 */
                if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
                    cp1=U16_GET_SUPPLEMENTARY(c, c1);
                }
            }
        }

        cp2=c2;
        if(U_IS_SURROGATE(c2)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c2)) {
                /* s2 already points past c2, so *s2 is the candidate trail */
                if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
                    /* advance ++s2; only below if cp2 decomposes/case-folds */
                    cp2=U16_GET_SUPPLEMENTARY(c2, c);
                }
            } else /* isTrail(c2) */ {
                /* c2 is at s2-1, so the candidate lead is at s2-2 */
                if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
                    cp2=U16_GET_SUPPLEMENTARY(c, c2);
                }
            }
        }

        /*
         * go down one level for each string
         * continue with the main loop as soon as there is a real change
         */

        if( level1==0 &&
            (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
        ) {
            /* cp1 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c1)) {
                if(U_IS_SURROGATE_LEAD(c1)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s1;
                } else /* isTrail(c1) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s2;
                    c2=*(s2-1);
                }
            }

            /* push current level pointers */
            stack1[0].start=start1;
            stack1[0].s=s1;
            stack1[0].limit=limit1;
            ++level1;

            /* copy the folding result to fold1[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                /* the folding is the string p[0..length[ */
                u_memcpy(fold1, p, length);
            } else {
                /* "length" is itself the folded code point: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold1, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start1=s1=fold1;
            limit1=fold1+length;

            /* get ready to read from decomposition, continue with loop */
            c1=-1;
            continue;
        }

        if( level2==0 &&
            (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
        ) {
            /* cp2 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c2)) {
                if(U_IS_SURROGATE_LEAD(c2)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s2;
                } else /* isTrail(c2) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s1;
                    c1=*(s1-1);
                }
            }

            /* push current level pointers */
            stack2[0].start=start2;
            stack2[0].s=s2;
            stack2[0].limit=limit2;
            ++level2;

            /* copy the folding result to fold2[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                /* the folding is the string p[0..length[ */
                u_memcpy(fold2, p, length);
            } else {
                /* "length" is itself the folded code point: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold2, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start2=s2=fold2;
            limit2=fold2+length;

            /* get ready to read from decomposition, continue with loop */
            c2=-1;
            continue;
        }

        /*
         * no decomposition/case folding, max level for both sides:
         * return difference result
         *
         * code point order comparison must not just return cp1-cp2
         * because when single surrogates are present then the surrogate pairs
         * that formed cp1 and cp2 may be from different string indexes
         *
         * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
         * c1=d800 cp1=10001 c2=dc00 cp2=10000
         * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
         *
         * therefore, use same fix-up as in ustring.c/uprv_strCompare()
         * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
         * so we have slightly different pointer/start/limit comparisons here
         */

        if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
            /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
            if(
                (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
                (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c1-=0x2800;
            }

            if(
                (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
                (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c2-=0x2800;
            }
        }

        return c1-c2;
    }
}
void WidthIterator::advance(int offset, GlyphBuffer* glyphBuffer) { if (offset > m_end) offset = m_end; int currentCharacter = m_currentCharacter; const UChar* cp = m_run.data(currentCharacter); bool rtl = m_run.rtl(); bool hasExtraSpacing = (m_font->letterSpacing() || m_font->wordSpacing() || m_padding) && !m_run.spacingDisabled(); float widthSinceLastRounding = m_runWidthSoFar; m_runWidthSoFar = floorf(m_runWidthSoFar); widthSinceLastRounding -= m_runWidthSoFar; float lastRoundingWidth = m_finalRoundingWidth; FloatRect bounds; const SimpleFontData* primaryFont = m_font->primaryFont(); const SimpleFontData* lastFontData = primaryFont; while (currentCharacter < offset) { UChar32 c = *cp; unsigned clusterLength = 1; if (c >= 0x3041) { if (c <= 0x30FE) { // Deal with Hiragana and Katakana voiced and semi-voiced syllables. // Normalize into composed form, and then look for glyph with base + combined mark. // Check above for character range to minimize performance impact. UChar32 normalized = normalizeVoicingMarks(currentCharacter); if (normalized) { c = normalized; clusterLength = 2; } } else if (U16_IS_SURROGATE(c)) { if (!U16_IS_SURROGATE_LEAD(c)) break; // Do we have a surrogate pair? If so, determine the full Unicode (32 bit) // code point before glyph lookup. // Make sure we have another character and it's a low surrogate. if (currentCharacter + 1 >= m_run.length()) break; UChar low = cp[1]; if (!U16_IS_TRAIL(low)) break; c = U16_GET_SUPPLEMENTARY(c, low); clusterLength = 2; } } const GlyphData& glyphData = m_font->glyphDataForCharacter(c, rtl); Glyph glyph = glyphData.glyph; const SimpleFontData* fontData = glyphData.fontData; ASSERT(fontData); // Now that we have a glyph and font data, get its width. 
float width; if (c == '\t' && m_run.allowTabs()) { float tabWidth = m_font->tabWidth(*fontData); width = tabWidth - fmodf(m_run.xPos() + m_runWidthSoFar + widthSinceLastRounding, tabWidth); } else { width = fontData->widthForGlyph(glyph); #if ENABLE(SVG) // SVG uses horizontalGlyphStretch(), when textLength is used to stretch/squeeze text. width *= m_run.horizontalGlyphStretch(); #endif // We special case spaces in two ways when applying word rounding. // First, we round spaces to an adjusted width in all fonts. // Second, in fixed-pitch fonts we ensure that all characters that // match the width of the space character have the same width as the space character. if (width == fontData->spaceWidth() && (fontData->pitch() == FixedPitch || glyph == fontData->spaceGlyph()) && m_run.applyWordRounding()) width = fontData->adjustedSpaceWidth(); } if (fontData != lastFontData && width) { lastFontData = fontData; if (m_fallbackFonts && fontData != primaryFont) { // FIXME: This does a little extra work that could be avoided if // glyphDataForCharacter() returned whether it chose to use a small caps font. if (!m_font->isSmallCaps() || c == toUpper(c)) m_fallbackFonts->add(fontData); else { const GlyphData& uppercaseGlyphData = m_font->glyphDataForCharacter(toUpper(c), rtl); if (uppercaseGlyphData.fontData != primaryFont) m_fallbackFonts->add(uppercaseGlyphData.fontData); } } } if (hasExtraSpacing) { // Account for letter-spacing. if (width && m_font->letterSpacing()) width += m_font->letterSpacing(); if (Font::treatAsSpace(c)) { // Account for padding. WebCore uses space padding to justify text. // We distribute the specified padding over the available spaces in the run. if (m_padding) { // Use left over padding if not evenly divisible by number of spaces. 
if (m_padding < m_padPerSpace) { width += m_padding; m_padding = 0; } else { float previousPadding = m_padding; m_padding -= m_padPerSpace; width += roundf(previousPadding) - roundf(m_padding); } } // Account for word spacing. // We apply additional space between "words" by adding width to the space character. if (currentCharacter != 0 && !Font::treatAsSpace(cp[-1]) && m_font->wordSpacing()) width += m_font->wordSpacing(); } } if (m_accountForGlyphBounds) { bounds = fontData->boundsForGlyph(glyph); if (!currentCharacter) m_firstGlyphOverflow = max<float>(0, -bounds.x()); } if (m_forTextEmphasis && !Font::canReceiveTextEmphasis(c)) glyph = 0; // Advance past the character we just dealt with. cp += clusterLength; currentCharacter += clusterLength; // Account for float/integer impedance mismatch between CG and KHTML. "Words" (characters // followed by a character defined by isRoundingHackCharacter()) are always an integer width. // We adjust the width of the last character of a "word" to ensure an integer width. // If we move KHTML to floats we can remove this (and related) hacks. float oldWidth = width; // Force characters that are used to determine word boundaries for the rounding hack // to be integer width, so following words will start on an integer boundary. if (m_run.applyWordRounding() && Font::isRoundingHackCharacter(c)) { width = ceilf(width); // Since widthSinceLastRounding can lose precision if we include measurements for // preceding whitespace, we bypass it here. m_runWidthSoFar += width; // Since this is a rounding hack character, we should have reset this sum on the previous // iteration. ASSERT(!widthSinceLastRounding); } else { // Check to see if the next character is a "rounding hack character", if so, adjust // width so that the total run width will be on an integer boundary. 
if ((m_run.applyWordRounding() && currentCharacter < m_run.length() && Font::isRoundingHackCharacter(*cp)) || (m_run.applyRunRounding() && currentCharacter >= m_end)) { float totalWidth = widthSinceLastRounding + width; widthSinceLastRounding = ceilf(totalWidth); width += widthSinceLastRounding - totalWidth; m_runWidthSoFar += widthSinceLastRounding; widthSinceLastRounding = 0; } else widthSinceLastRounding += width; } if (glyphBuffer) glyphBuffer->add(glyph, fontData, (rtl ? oldWidth + lastRoundingWidth : width)); lastRoundingWidth = width - oldWidth; if (m_accountForGlyphBounds) { m_maxGlyphBoundingBoxY = max(m_maxGlyphBoundingBoxY, bounds.bottom()); m_minGlyphBoundingBoxY = min(m_minGlyphBoundingBoxY, bounds.y()); m_lastGlyphOverflow = max<float>(0, bounds.right() - width); } } m_currentCharacter = currentCharacter; m_runWidthSoFar += widthSinceLastRounding; m_finalRoundingWidth = lastRoundingWidth; }
/*
 * Reads the next value from the underlying UCharIterator and returns it
 * truncated to a UChar. The value stays consumed when it is a trail surrogate
 * or negative (nothing was returned by the iterator); any other code unit is
 * pushed back with previous() so that it is read again later.
 */
UChar UIterCollationIterator::handleGetTrailSurrogate() {
    const UChar32 unit = iter.next(&iter);
    if(unit < 0 || U16_IS_TRAIL(unit)) {
        /* trail surrogate or no more input: nothing to undo */
        return static_cast<UChar>(unit);
    }
    /* not part of a surrogate pair: step back over it */
    iter.previous(&iter);
    return static_cast<UChar>(unit);
}
static void U_CALLCONV _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; const UChar *source, *sourceLimit; uint8_t *target; int32_t targetCapacity; int32_t *offsets; int32_t prev, c, diff; int32_t sourceIndex, nextSourceIndex; /* set up the local pointers */ cnv=pArgs->converter; source=pArgs->source; sourceLimit=pArgs->sourceLimit; target=(uint8_t *)pArgs->target; targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); offsets=pArgs->offsets; /* get the converter state from UConverter */ c=cnv->fromUChar32; prev=(int32_t)cnv->fromUnicodeStatus; if(prev==0) { prev=BOCU1_ASCII_PREV; } /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 0 : -1; nextSourceIndex=0; /* conversion loop */ if(c!=0 && targetCapacity>0) { goto getTrail; } fastSingle: /* fast loop for single-byte differences */ /* use only one loop counter variable, targetCapacity, not also source */ diff=(int32_t)(sourceLimit-source); if(targetCapacity>diff) { targetCapacity=diff; } while(targetCapacity>0 && (c=*source)<0x3000) { if(c<=0x20) { if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { diff=c-prev; if(DIFF_IS_SINGLE(diff)) { prev=BOCU1_SIMPLE_PREV(c); *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { break; } } } /* restore real values */ targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ /* regular loop for all cases */ while(source<sourceLimit) { if(targetCapacity>0) { c=*source++; ++nextSourceIndex; if(c<=0x20) { /* * ISO C0 control & space: * Encode directly for MIME compatibility, * and reset state except for space, to not disrupt compression. 
*/ if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; continue; } if(U16_IS_LEAD(c)) { getTrail: if(source<sourceLimit) { /* test the following code unit */ UChar trail=*source; if(U16_IS_TRAIL(trail)) { ++source; ++nextSourceIndex; c=U16_GET_SUPPLEMENTARY(c, trail); } } else { /* no more input */ c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ break; } } /* * all other Unicode code points c==U+0021..U+10ffff * are encoded with the difference c-prev * * a new prev is computed from c, * placed in the middle of a 0x80-block (for most small scripts) or * in the middle of the Unihan and Hangul blocks * to statistically minimize the following difference */ diff=c-prev; prev=BOCU1_PREV(c); if(DIFF_IS_SINGLE(diff)) { *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; if(c<0x3000) { goto fastSingle; } } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { /* optimize 2-byte case */ int32_t m; if(diff>=0) { diff-=BOCU1_REACH_POS_1+1; m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; diff+=BOCU1_START_POS_2; } else { diff-=BOCU1_REACH_NEG_1; NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); diff+=BOCU1_START_NEG_2; } *target++=(uint8_t)diff; *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); *offsets++=sourceIndex; *offsets++=sourceIndex; targetCapacity-=2; sourceIndex=nextSourceIndex; } else { int32_t length; /* will be 2..4 */ diff=packDiff(diff); length=BOCU1_LENGTH_FROM_PACKED(diff); /* write the output character bytes from diff and length */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { switch(length) { /* each branch falls through to the next one */ case 4: *target++=(uint8_t)(diff>>24); *offsets++=sourceIndex; U_FALLTHROUGH; case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; U_FALLTHROUGH; case 2: *target++=(uint8_t)(diff>>8); 
*offsets++=sourceIndex; /* case 1: handled above */ *target++=(uint8_t)diff; *offsets++=sourceIndex; U_FALLTHROUGH; default: /* will never occur */ break; } targetCapacity-=length; sourceIndex=nextSourceIndex; } else { uint8_t *charErrorBuffer; /* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target. */ /* we know that 1<=targetCapacity<length<=4 */ length-=targetCapacity; charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 3: *charErrorBuffer++=(uint8_t)(diff>>16); U_FALLTHROUGH; case 2: *charErrorBuffer++=(uint8_t)(diff>>8); U_FALLTHROUGH; case 1: *charErrorBuffer=(uint8_t)diff; U_FALLTHROUGH; default: /* will never occur */ break; } cnv->charErrorBufferLength=(int8_t)length; /* now output what fits into the regular target */ diff>>=8*length; /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; U_FALLTHROUGH; case 2: *target++=(uint8_t)(diff>>8); *offsets++=sourceIndex; U_FALLTHROUGH; case 1: *target++=(uint8_t)diff; *offsets++=sourceIndex; U_FALLTHROUGH; default: /* will never occur */ break; } /* target overflow */ targetCapacity=0; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } } } else {
// Decides which text code path a buffer of UTF-16 code units requires.
//
// characters: the code units to inspect.
// len: number of code units in |characters|.
//
// Returns ComplexPath as soon as a code unit lies in one of the intervals of
// complexCodePathRanges, or a surrogate pair decodes to a Regional Indicator Symbol
// (U+1F1E6..U+1F1FF) or a supplementary variation selector (U+E0100..U+E01EF).
// Otherwise returns SimpleWithGlyphOverflowPath when at least one code unit was in
// U+1E00..U+2000, and SimplePath when none was.
CodePath Character::characterRangeCodePath(const UChar* characters, unsigned len)
{
    // Inclusive [first, last] pairs, consumed by valueInIntervalList().
    static const UChar complexCodePathRanges[] = {
        // U+02E5 through U+02E9 (Modifier Letters : Tone letters)
        0x2E5, 0x2E9,
        // U+0300 through U+036F Combining diacritical marks
        0x300, 0x36F,
        // U+0591 through U+05CF excluding U+05BE Hebrew combining marks, ...
        0x0591, 0x05BD,
        // ... Hebrew punctuation Paseq, Sof Pasuq and Nun Hafukha
        0x05BF, 0x05CF,
        // U+0600 through U+109F Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic,
        // Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
        // Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar
        0x0600, 0x109F,
        // U+1100 through U+11FF Hangul Jamo (only Ancient Korean should be left
        // here if you precompose; Modern Korean will be precomposed as a result of step A)
        0x1100, 0x11FF,
        // U+135D through U+135F Ethiopic combining marks
        0x135D, 0x135F,
        // U+1700 through U+18AF Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian
        0x1700, 0x18AF,
        // U+1900 through U+194F Limbu (Unicode 4.0)
        0x1900, 0x194F,
        // U+1980 through U+19DF New Tai Lue
        0x1980, 0x19DF,
        // U+1A00 through U+1CFF Buginese, Tai Tham, Balinese, Batak, Lepcha, Vedic
        0x1A00, 0x1CFF,
        // U+1DC0 through U+1DFF Combining diacritical mark supplement
        0x1DC0, 0x1DFF,
        // U+20D0 through U+20FF Combining marks for symbols
        0x20D0, 0x20FF,
        // U+2CEF through U+2CF1 Combining marks for Coptic
        0x2CEF, 0x2CF1,
        // U+302A through U+302F Ideographic and Hangul Tone marks
        0x302A, 0x302F,
        // U+A67C through U+A67D Combining marks for old Cyrillic
        0xA67C, 0xA67D,
        // U+A6F0 through U+A6F1 Combining mark for Bamum
        0xA6F0, 0xA6F1,
        // U+A800 through U+ABFF Nagri, Phags-pa, Saurashtra, Devanagari Extended,
        // Hangul Jamo Ext. A, Javanese, Myanmar Extended A, Tai Viet, Meetei Mayek
        0xA800, 0xABFF,
        // U+D7B0 through U+D7FF Hangul Jamo Ext. B
        0xD7B0, 0xD7FF,
        // U+FE00 through U+FE0F Unicode variation selectors
        0xFE00, 0xFE0F,
        // U+FE20 through U+FE2F Combining half marks
        0xFE20, 0xFE2F
    };

    CodePath result = SimplePath;
    for (unsigned i = 0; i < len; i++) {
        const UChar c = characters[i];

        // Shortcut for common case
        if (c < 0x2E5)
            continue;

        // U+1E00 through U+2000 characters with diacritics and stacked diacritics
        if (c >= 0x1E00 && c <= 0x2000) {
            result = SimpleWithGlyphOverflowPath;
            continue;
        }

        // Surrogate pairs (c is a lead surrogate, U+D800..U+DBFF)
        if (c > 0xD7FF && c <= 0xDBFF) {
            // A lead surrogate in the last position has no partner.
            if (i == len - 1)
                continue;

            // Only peek at the following code unit here. If it does not complete the
            // pair it must still be classified on its own by the next iteration, so
            // it is consumed only once it is known to be a trail surrogate.
            const UChar next = characters[i + 1];
            if (!U16_IS_TRAIL(next))
                continue;
            ++i;

            UChar32 supplementaryCharacter = U16_GET_SUPPLEMENTARY(c, next);

            if (supplementaryCharacter < 0x1F1E6) // U+1F1E6 through U+1F1FF Regional Indicator Symbols
                continue;
            if (supplementaryCharacter <= 0x1F1FF)
                return ComplexPath;

            if (supplementaryCharacter < 0xE0100) // U+E0100 through U+E01EF Unicode variation selectors.
                continue;
            if (supplementaryCharacter <= 0xE01EF)
                return ComplexPath;

            // FIXME: Check for Brahmi (U+11000 block), Kaithi (U+11080 block) and other complex scripts
            // in plane 1 or higher.

            continue;
        }

        // Search for other Complex cases
        if (valueInIntervalList(complexCodePathRanges, c))
            return ComplexPath;
    }

    return result;
}