unsigned Font::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion) { static bool expandAroundIdeographs = canExpandAroundIdeographsInComplexText(); unsigned count = 0; if (direction == LTR) { for (size_t i = 0; i < length; ++i) { UChar32 character = characters[i]; if (treatAsSpace(character)) { count++; isAfterExpansion = true; continue; } if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) { character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]); i++; } if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) { if (!isAfterExpansion) count++; count++; isAfterExpansion = true; continue; } isAfterExpansion = false; } } else { for (size_t i = length; i > 0; --i) { UChar32 character = characters[i - 1]; if (treatAsSpace(character)) { count++; isAfterExpansion = true; continue; } if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) { character = U16_GET_SUPPLEMENTARY(characters[i - 2], character); i--; } if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) { if (!isAfterExpansion) count++; count++; isAfterExpansion = true; continue; } isAfterExpansion = false; } } return count; }
unsigned Character::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion, const TextJustify textJustify) { unsigned count = 0; if (direction == LTR) { for (size_t i = 0; i < length; ++i) { UChar32 character = characters[i]; if (treatAsSpace(character)) { count++; isAfterExpansion = true; continue; } if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) { character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]); i++; } if (textJustify == TextJustify::TextJustifyAuto && isCJKIdeographOrSymbol(character)) { if (!isAfterExpansion) count++; count++; isAfterExpansion = true; continue; } isAfterExpansion = false; } } else { for (size_t i = length; i > 0; --i) { UChar32 character = characters[i - 1]; if (treatAsSpace(character)) { count++; isAfterExpansion = true; continue; } if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) { character = U16_GET_SUPPLEMENTARY(characters[i - 2], character); i--; } if (textJustify == TextJustify::TextJustifyAuto && isCJKIdeographOrSymbol(character)) { if (!isAfterExpansion) count++; count++; isAfterExpansion = true; continue; } isAfterExpansion = false; } } return count; }
/*
 * Reads the next code point from the UFILE's UChar buffer into *c32.
 *
 * *c32 is set to U_EOF up front and stays U_EOF when nothing could be read.
 * A lead surrogate is combined with the code unit that follows it; if the
 * buffer ends right after a lead surrogate, *c32 is reset to U_EOF.
 *
 * Returns TRUE when a code point was stored in *c32, FALSE otherwise
 * (including when f is NULL).
 */
U_CFUNC UBool U_EXPORT2
ufile_getch32(UFILE *f, UChar32 *c32)
{
    UBool isValidChar = FALSE;
    u_localized_string *str;

    *c32 = U_EOF;

    /* Bail out before touching f: everything below dereferences it. */
    if (f == NULL) {
        return FALSE;
    }

    /* Fill the buffer if it is empty (or holds only one code unit, which
     * would not be enough for a surrogate pair) */
    str = &f->str;
    if (str->fPos + 1 >= str->fLimit) {
        ufile_fill_uchar_buffer(f);
    }

    /* Get the next character in the buffer */
    if (str->fPos < str->fLimit) {
        *c32 = *(str->fPos)++;
        if (U_IS_LEAD(*c32)) {
            if (str->fPos < str->fLimit) {
                UChar c16 = *(str->fPos)++;
                *c32 = U16_GET_SUPPLEMENTARY(*c32, c16);
                isValidChar = TRUE;
            }
            else {
                /* Lead surrogate with nothing after it: report end of input. */
                *c32 = U_EOF;
            }
        }
        else {
            isValidChar = TRUE;
        }
    }

    return isValidChar;
}
// Slow path of character consumption. On entry |character| holds the current
// UTF-16 code unit. Returns false when that unit is an unpaired surrogate;
// otherwise returns true, possibly after replacing |character| with a
// normalized or supplementary code point and setting |clusterLength| to 2.
bool SurrogatePairAwareTextIterator::consumeSlowCase(UChar32& character, unsigned& clusterLength)
{
    if (character <= 0x30FE) {
        // Hiragana and Katakana voiced and semi-voiced syllables: normalize into
        // composed form, and then look for a glyph with base + combined mark.
        // The range check above keeps the performance impact small.
        UChar32 composed = normalizeVoicingMarks();
        if (composed) {
            character = composed;
            clusterLength = 2;
        }
        return true;
    }

    if (!U16_IS_SURROGATE(character))
        return true;

    // A pair must start with its high (lead) half, and there must be another
    // code unit available to serve as the low half.
    const bool hasFollowingUnit = m_currentCharacter + 1 < m_endCharacter;
    if (!U16_IS_SURROGATE_LEAD(character) || !hasFollowingUnit)
        return false;

    UChar trail = m_characters[1];
    if (!U16_IS_TRAIL(trail))
        return false;

    // Determine the full Unicode (32 bit) code point before glyph lookup.
    character = U16_GET_SUPPLEMENTARY(character, trail);
    clusterLength = 2;
    return true;
}
SpanBackUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) { // Verify that the frozen set is equal to the unfrozen one. UnicodeSet set; UChar utf16[2]; UChar32 c, c2; for(c=0; c<=0xffff; ++c) { utf16[0]=(UChar)c; if(testcase.set.spanBack(utf16, 1, USET_SPAN_CONTAINED)==0) { set.add(c); } } for(c=0xd800; c<=0xdbff; ++c) { utf16[0]=(UChar)c; for(c2=0xdc00; c2<=0xdfff; ++c2) { utf16[1]=(UChar)c2; if(testcase.set.spanBack(utf16, 2, USET_SPAN_CONTAINED)==0) { set.add(U16_GET_SUPPLEMENTARY(c, c2)); } } } if(set!=testcase.set) { fprintf(stderr, "error: frozen set != original!\n"); } }
// FIXME: This function may not work if the emphasis mark uses a complex script, but none of the // standard emphasis marks do so. bool Font::getEmphasisMarkGlyphData(const AtomicString& mark, GlyphData& glyphData) const { if (mark.isEmpty()) return false; #if ENABLE(SVG_FONTS) // FIXME: Implement for SVG fonts. if (primaryFont()->isSVGFont()) return false; #endif UChar32 character = mark[0]; if (U16_IS_SURROGATE(character)) { if (!U16_IS_SURROGATE_LEAD(character)) return false; if (mark.length() < 2) return false; UChar low = mark[1]; if (!U16_IS_TRAIL(low)) return false; character = U16_GET_SUPPLEMENTARY(character, low); } glyphData = glyphDataForCharacter(character, false, EmphasisMarkVariant); return true; }
// Returns the code point that starts at code unit index |i|: the unit itself
// when it is not a surrogate, the combined supplementary code point when
// units |i| and |i + 1| form a lead/trail pair, and 0 for an unpaired
// surrogate. |i| is not range-checked here.
UChar32 StringImpl::characterStartingAt(unsigned i)
{
    if (!U16_IS_SINGLE(m_data[i])) {
        const bool formsPair = i + 1 < m_length && U16_IS_LEAD(m_data[i]) && U16_IS_TRAIL(m_data[i + 1]);
        if (formsPair)
            return U16_GET_SUPPLEMENTARY(m_data[i], m_data[i + 1]);
        return 0;
    }
    return m_data[i];
}
// Returns the first code point of |characters|. A non-surrogate first unit is
// returned as-is, a well-formed lead/trail pair is combined, and an unpaired
// surrogate yields ' '. characters[0] is read unconditionally, so the caller
// must pass at least one code unit.
static UChar32 surrogatePairAwareFirstCharacter(const UChar* characters, unsigned length)
{
    const UChar first = characters[0];
    if (!U16_IS_SURROGATE(first))
        return first;

    if (U16_IS_SURROGATE_LEAD(first) && length >= 2 && U16_IS_TRAIL(characters[1]))
        return U16_GET_SUPPLEMENTARY(first, characters[1]);

    return ' ';
}
float ShapeResultSpacing::computeSpacing(const TextRun& run, size_t index, float& offset) { UChar32 character = run[index]; bool treatAsSpace = (Character::treatAsSpace(character) || (m_normalizeSpace && Character::isNormalizedCanvasSpaceCharacter(character))) && (character != '\t' || !m_allowTabs); if (treatAsSpace && character != noBreakSpaceCharacter) character = spaceCharacter; float spacing = 0; if (m_letterSpacing && !Character::treatAsZeroWidthSpace(character)) spacing += m_letterSpacing; if (treatAsSpace && (index || !isFirstRun(run) || character == noBreakSpaceCharacter)) spacing += m_wordSpacing; if (!hasExpansion()) return spacing; if (treatAsSpace) return spacing + nextExpansion(); if (run.is8Bit() || m_textJustify != TextJustify::TextJustifyAuto) return spacing; // isCJKIdeographOrSymbol() has expansion opportunities both before and // after each character. // http://www.w3.org/TR/jlreq/#line_adjustment if (U16_IS_LEAD(character) && index + 1 < run.length() && U16_IS_TRAIL(run[index + 1])) character = U16_GET_SUPPLEMENTARY(character, run[index + 1]); if (!Character::isCJKIdeographOrSymbol(character)) { m_isAfterExpansion = false; return spacing; } if (!m_isAfterExpansion) { // Take the expansion opportunity before this ideograph. float expandBefore = nextExpansion(); if (expandBefore) { offset += expandBefore; spacing += expandBefore; } if (!hasExpansion()) return spacing; } return spacing + nextExpansion(); }
// Asks fontconfig for a font covering the UTF-16 text |characters| and returns
// that font's family name. Returns a null String when the character set cannot
// be built, when no font matches, or when the match has no file or no family
// name. Every fontconfig object created here is released on every path.
static String getFontFamilyForCharacters(const UChar* characters, size_t numCharacters)
{
    FcCharSet* cset = FcCharSetCreate();
    if (!cset)
        return String();

    for (size_t i = 0; i < numCharacters; ++i) {
        UChar32 codePoint = characters[i];
        // Combine a lead surrogate with the trail surrogate that follows it.
        if (U16_IS_SURROGATE(characters[i]) && U16_IS_SURROGATE_LEAD(characters[i])
            && i + 1 < numCharacters && U16_IS_TRAIL(characters[i + 1])) {
            codePoint = U16_GET_SUPPLEMENTARY(characters[i], characters[i + 1]);
            ++i;
        }
        if (FcCharSetAddChar(cset, codePoint) == FcFalse) {
            // Previously this path returned without releasing |cset|.
            FcCharSetDestroy(cset);
            return String();
        }
    }

    FcPattern* pattern = FcPatternCreate();
    FcPatternAddCharSet(pattern, FC_CHARSET, cset);

    FcConfigSubstitute(0, pattern, FcMatchPattern);
    FcDefaultSubstitute(pattern);

    String familyName;
    FcResult result;
    FcPattern* match = FcFontMatch(0, pattern, &result);
    if (match) {
        FcChar8* filename;
        FcChar8* family;
        // Only accept a match that is backed by a file and names a family.
        // |family| points into |match|, so copy it before destroying |match|.
        if (FcPatternGetString(match, FC_FILE, 0, &filename) == FcResultMatch
            && FcPatternGetString(match, FC_FAMILY, 0, &family) == FcResultMatch) {
            const char* charFamily = reinterpret_cast<char*>(family);
            familyName = String(charFamily);
        }
        FcPatternDestroy(match);
    }

    FcCharSetDestroy(cset);
    FcPatternDestroy(pattern);
    return familyName;
}
// Returns the code point before the current position and moves back over it,
// or U_SENTINEL at the start of the text or when previousSegment() fails.
// What happens depends on |state|: in ITER_CHECK_BWD the underlying UCharIterator
// is read directly and previousSegment() is invoked when the FCD data says a
// segment must be built; in ITER_IN_FCD_SEGMENT and the normalized states the
// code point is read from the iterator or from |normalized|; in any other
// situation the iterator is switched to backward mode and the loop retries.
UChar32 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
    while(true) {
        if(state == ITER_CHECK_BWD) {
            UChar32 c = iter.previous(&iter);
            if(c < 0) {
                // Start of text.
                start = pos = 0;
                state = ITER_IN_FCD_SEGMENT;
                return U_SENTINEL;
            }
            if(!CollationFCD::hasLccc(c)) {
                return c;
            }

            UChar32 prev = U_SENTINEL;
            if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                    CollationFCD::hasTccc(prev = iter.previous(&iter))) {
                // Undo the one or two previous() calls, then build the segment.
                iter.next(&iter);
                if(prev >= 0) {
                    iter.next(&iter);
                }
                if(!previousSegment(errorCode)) {
                    return U_SENTINEL;
                }
                continue;
            }

            // hasLccc(trail)=true for all trail surrogates
            if(U16_IS_TRAIL(c)) {
                if(prev < 0) {
                    prev = iter.previous(&iter);
                }
                if(U16_IS_LEAD(prev)) {
                    return U16_GET_SUPPLEMENTARY(prev, c);
                }
            }
            // |prev| was read but is not part of the result: step forward over it again.
            if(prev >= 0) {
                iter.next(&iter);
            }
            return c;
        }

        if(state == ITER_IN_FCD_SEGMENT && pos != start) {
            UChar32 c = uiter_previous32(&iter);
            pos -= U16_LENGTH(c);
            U_ASSERT(c >= 0);
            return c;
        }

        if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
            UChar32 c = normalized.char32At(pos - 1);
            pos -= U16_LENGTH(c);
            return c;
        }

        switchToBackward();
    }
}
bool GlyphPage::fill(unsigned offset, unsigned length, UChar* charBuffer, unsigned bufferLength, const SimpleFontData* fontData) { const bool isUtf16 = (bufferLength != length); for (unsigned i = 0; i < length; ++i) { Glyph character = isUtf16 ? U16_GET_SUPPLEMENTARY(charBuffer[i*2], charBuffer[i*2 + 1]) : charBuffer[i]; setGlyphDataForIndex(offset + i, character, fontData); } return true; }
// Resolves a surrogate held in |character|. A non-surrogate is left alone. A
// surrogate for which isValidSurrogatePair() fails becomes
// replacementCharacter. A valid pair is combined with the following code unit
// and m_currentGlyphLength is set to 2. Always returns true.
bool UTF16TextIterator::consumeSurrogatePair(UChar32& character)
{
    if (U16_IS_SURROGATE(character)) {
        if (isValidSurrogatePair(character)) {
            const UChar trail = m_characters[1];
            character = U16_GET_SUPPLEMENTARY(character, trail);
            m_currentGlyphLength = 2;
        } else
            character = replacementCharacter;
    }
    return true;
}
// Returns a fallback font for the single code point given as one or two UTF-16
// code units in |characters|. The font is built from the first family of
// |desc| and told which code point it is for; when that platform font is
// unavailable the last-resort fallback font is returned instead.
// |originalFontData| and |isPlatformFont| are not used here.
RefPtr<Font> FontCache::systemFallbackForCharacters(const FontDescription& desc, const Font* originalFontData, bool isPlatformFont, const UChar* characters, unsigned int length)
{
    ASSERT(characters && (length==1||length==2));

    // One unit is the code point itself; two units are a surrogate pair.
    const UChar32 c = (length == 1)
        ? characters[0]
        : U16_GET_SUPPLEMENTARY(characters[0], characters[1]);

    FontPlatformData alt(desc, desc.familyAt(0));
    if (!alt.font() || !alt.font()->font())
        return lastResortFallbackFont(desc);

    alt.font()->setSpecificUnicodeChar(c);
    return fontForPlatformData(alt);
}
// Returns the code point at the current position and advances past it, or a
// negative value at the end of the text or when nextSegment() fails. What
// happens depends on |state|: in ITER_CHECK_FWD the underlying UCharIterator is
// read directly and nextSegment() is invoked when the FCD data says a segment
// must be built; in ITER_IN_FCD_SEGMENT and the normalized states the code
// point is read from the iterator or from |normalized|; in any other situation
// the iterator is switched to forward mode and the loop retries.
UChar32 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
    while(true) {
        if(state == ITER_CHECK_FWD) {
            UChar32 c = iter.next(&iter);
            if(c < 0) {
                return c;
            }

            if(CollationFCD::hasTccc(c) &&
                    (CollationFCD::maybeTibetanCompositeVowel(c) ||
                     CollationFCD::hasLccc(iter.current(&iter)))) {
                // Step back over |c| and build the segment starting there.
                iter.previous(&iter);
                if(!nextSegment(errorCode)) {
                    return U_SENTINEL;
                }
                continue;
            }

            if(U16_IS_LEAD(c)) {
                UChar32 trail = iter.next(&iter);
                if(U16_IS_TRAIL(trail)) {
                    return U16_GET_SUPPLEMENTARY(c, trail);
                }
                // Not a pair: un-read the unit unless we hit the end.
                if(trail >= 0) {
                    iter.previous(&iter);
                }
            }
            return c;
        }

        if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
            UChar32 c = uiter_next32(&iter);
            pos += U16_LENGTH(c);
            U_ASSERT(c >= 0);
            return c;
        }

        if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
            UChar32 c = normalized.char32At(pos);
            pos += U16_LENGTH(c);
            return c;
        }

        switchToForward();
    }
}
bool GlyphPage::fill(unsigned offset, unsigned length, UChar* buffer, unsigned bufferLength, const SimpleFontData* fontData) { bool isUtf16 = bufferLength != length; for (unsigned i = 0; i < length; i++) { UChar32 character; if(isUtf16) { UChar lead = buffer[i * 2]; UChar trail = buffer[i * 2 + 1]; character = U16_GET_SUPPLEMENTARY(lead, trail); } else { character = buffer[i]; } setGlyphDataForIndex(offset + i, character, fontData); } return true; }
// Builds a fontconfig pattern that asks for a scalable font covering every
// code point in the UTF-16 text |characters|. Well-formed surrogate pairs are
// added as one supplementary code point; any other unit is added as-is. The
// pattern has the config and default substitutions applied and is returned
// to the caller.
FcPattern* createFontConfigPatternForCharacters(const UChar* characters, int length)
{
    FcPattern* pattern = FcPatternCreate();
    FcCharSet* fontConfigCharSet = FcCharSetCreate();

    int index = 0;
    while (index < length) {
        UChar32 codePoint = characters[index];
        const bool startsPair = U16_IS_SURROGATE(characters[index]) && U16_IS_SURROGATE_LEAD(characters[index])
            && index != length - 1 && U16_IS_TRAIL(characters[index + 1]);
        if (startsPair) {
            codePoint = U16_GET_SUPPLEMENTARY(characters[index], characters[index + 1]);
            ++index;
        }
        FcCharSetAddChar(fontConfigCharSet, codePoint);
        ++index;
    }

    FcPatternAddCharSet(pattern, FC_CHARSET, fontConfigCharSet);
    FcCharSetDestroy(fontConfigCharSet);

    FcPatternAddBool(pattern, FC_SCALABLE, FcTrue);
    FcConfigSubstitute(0, pattern, FcMatchPattern);
    FcDefaultSubstitute(pattern);
    return pattern;
}
/*
 * Get a UChar32 from the stream.
 *
 * Returns the next code point and advances past it. A lead surrogate is
 * combined with the following code unit only when that unit is inside the
 * buffer and is a trail surrogate; an unpaired lead surrogate is returned
 * as-is and only that one unit is consumed.
 *
 * Returns U_EOF when the buffer is exhausted with nothing remaining, or when
 * refilling the buffer fails.
 *
 * NOTE(review): when called with error==NULL or an already-failing *error the
 * function returns FALSE (0) rather than U_EOF. That is kept unchanged here
 * because callers may rely on it - confirm before altering.
 */
U_CAPI int32_t U_EXPORT2
ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){
    int32_t retVal = (int32_t)U_EOF;
    if(error==NULL || U_FAILURE(*error)){
        return FALSE;
    }
    /* Refill when fewer than two code units are left, so a pair can be read. */
    if(buf->currentPos+1>=buf->bufLimit){
        if(buf->remaining==0){
            return U_EOF;
        }
        buf=ucbuf_fillucbuf(buf,error);
        if(U_FAILURE(*error)){
            return U_EOF;
        }
    }
    if(U16_IS_LEAD(*(buf->currentPos)) &&
            buf->currentPos+1<buf->bufLimit &&
            U16_IS_TRAIL(buf->currentPos[1])){
        /* well-formed surrogate pair */
        retVal=U16_GET_SUPPLEMENTARY(buf->currentPos[0],buf->currentPos[1]);
        buf->currentPos+=2;
    }else{
        /* BMP code unit, or an unpaired surrogate passed through unchanged */
        retVal = *(buf->currentPos++);
    }
    return retVal;
}
bool readUTFChar(const UChar* str, int* begin, int length, unsigned* codePoint) { if (U16_IS_SURROGATE(str[*begin])) { if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length || !U16_IS_TRAIL(str[*begin + 1])) { // Invalid surrogate pair. *codePoint = kUnicodeReplacementCharacter; return false; } // Valid surrogate pair. *codePoint = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]); (*begin)++; } else { // Not a surrogate, just one 16-bit word. *codePoint = str[*begin]; } if (U_IS_UNICODE_CHAR(*codePoint)) return true; // Invalid code point. *codePoint = kUnicodeReplacementCharacter; return false; }
// FIXME: This function may not work if the emphasis mark uses a complex script, but none of the // standard emphasis marks do so. bool Font::getEmphasisMarkGlyphData(const AtomicString& mark, GlyphData& glyphData) const { if (mark.isEmpty()) return false; UChar32 character = mark[0]; if (U16_IS_SURROGATE(character)) { if (!U16_IS_SURROGATE_LEAD(character)) return false; if (mark.length() < 2) return false; UChar low = mark[1]; if (!U16_IS_TRAIL(low)) return false; character = U16_GET_SUPPLEMENTARY(character, low); } glyphData = glyphDataForCharacter(character, false, EmphasisMarkVariant); return true; }
static void U_CALLCONV _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; const UChar *source, *sourceLimit; uint8_t *target; int32_t targetCapacity; int32_t *offsets; int32_t prev, c, diff; int32_t sourceIndex, nextSourceIndex; /* set up the local pointers */ cnv=pArgs->converter; source=pArgs->source; sourceLimit=pArgs->sourceLimit; target=(uint8_t *)pArgs->target; targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); offsets=pArgs->offsets; /* get the converter state from UConverter */ c=cnv->fromUChar32; prev=(int32_t)cnv->fromUnicodeStatus; if(prev==0) { prev=BOCU1_ASCII_PREV; } /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 0 : -1; nextSourceIndex=0; /* conversion loop */ if(c!=0 && targetCapacity>0) { goto getTrail; } fastSingle: /* fast loop for single-byte differences */ /* use only one loop counter variable, targetCapacity, not also source */ diff=(int32_t)(sourceLimit-source); if(targetCapacity>diff) { targetCapacity=diff; } while(targetCapacity>0 && (c=*source)<0x3000) { if(c<=0x20) { if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { diff=c-prev; if(DIFF_IS_SINGLE(diff)) { prev=BOCU1_SIMPLE_PREV(c); *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { break; } } } /* restore real values */ targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ /* regular loop for all cases */ while(source<sourceLimit) { if(targetCapacity>0) { c=*source++; ++nextSourceIndex; if(c<=0x20) { /* * ISO C0 control & space: * Encode directly for MIME compatibility, * and reset state except for space, to not disrupt compression. 
*/ if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; continue; } if(U16_IS_LEAD(c)) { getTrail: if(source<sourceLimit) { /* test the following code unit */ UChar trail=*source; if(U16_IS_TRAIL(trail)) { ++source; ++nextSourceIndex; c=U16_GET_SUPPLEMENTARY(c, trail); } } else { /* no more input */ c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ break; } } /* * all other Unicode code points c==U+0021..U+10ffff * are encoded with the difference c-prev * * a new prev is computed from c, * placed in the middle of a 0x80-block (for most small scripts) or * in the middle of the Unihan and Hangul blocks * to statistically minimize the following difference */ diff=c-prev; prev=BOCU1_PREV(c); if(DIFF_IS_SINGLE(diff)) { *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; if(c<0x3000) { goto fastSingle; } } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { /* optimize 2-byte case */ int32_t m; if(diff>=0) { diff-=BOCU1_REACH_POS_1+1; m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; diff+=BOCU1_START_POS_2; } else { diff-=BOCU1_REACH_NEG_1; NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); diff+=BOCU1_START_NEG_2; } *target++=(uint8_t)diff; *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); *offsets++=sourceIndex; *offsets++=sourceIndex; targetCapacity-=2; sourceIndex=nextSourceIndex; } else { int32_t length; /* will be 2..4 */ diff=packDiff(diff); length=BOCU1_LENGTH_FROM_PACKED(diff); /* write the output character bytes from diff and length */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { switch(length) { /* each branch falls through to the next one */ case 4: *target++=(uint8_t)(diff>>24); *offsets++=sourceIndex; U_FALLTHROUGH; case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; U_FALLTHROUGH; case 2: *target++=(uint8_t)(diff>>8); 
*offsets++=sourceIndex; /* case 1: handled above */ *target++=(uint8_t)diff; *offsets++=sourceIndex; U_FALLTHROUGH; default: /* will never occur */ break; } targetCapacity-=length; sourceIndex=nextSourceIndex; } else { uint8_t *charErrorBuffer; /* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target. */ /* we know that 1<=targetCapacity<length<=4 */ length-=targetCapacity; charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 3: *charErrorBuffer++=(uint8_t)(diff>>16); U_FALLTHROUGH; case 2: *charErrorBuffer++=(uint8_t)(diff>>8); U_FALLTHROUGH; case 1: *charErrorBuffer=(uint8_t)diff; U_FALLTHROUGH; default: /* will never occur */ break; } cnv->charErrorBufferLength=(int8_t)length; /* now output what fits into the regular target */ diff>>=8*length; /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; U_FALLTHROUGH; case 2: *target++=(uint8_t)(diff>>8); *offsets++=sourceIndex; U_FALLTHROUGH; case 1: *target++=(uint8_t)diff; *offsets++=sourceIndex; U_FALLTHROUGH; default: /* will never occur */ break; } /* target overflow */ targetCapacity=0; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } } } else {
static int generateComponents(TextRunComponents* components, const Font &font, const TextRun &run) { int letterSpacing = font.letterSpacing(); int wordSpacing = font.wordSpacing(); int padding = run.expansion(); int numSpaces = 0; if (padding) { for (int i = 0; i < run.length(); i++) if (Font::treatAsSpace(run[i])) ++numSpaces; } int offset = 0; if (letterSpacing) { // need to draw every letter on it's own int start = 0; if (Font::treatAsSpace(run[0])) { int add = 0; if (numSpaces) { add = padding/numSpaces; padding -= add; --numSpaces; } components->append(TextRunComponent(1, font, offset)); offset += add + letterSpacing + components->last().m_width; start = 1; } for (int i = 1; i < run.length(); ++i) { UChar ch = run[i]; if (U16_IS_LEAD(ch) && U16_IS_TRAIL(run[i-1])) ch = U16_GET_SUPPLEMENTARY(ch, run[i-1]); if (U16_IS_TRAIL(ch) || U_GET_GC_MASK(ch) & U_GC_MN_MASK) continue; if (Font::treatAsSpace(run[i])) { int add = 0; if (i - start > 0) { components->append(TextRunComponent(run.characters16() + start, i - start, run, font, offset)); offset += components->last().m_width + letterSpacing; } if (numSpaces) { add = padding/numSpaces; padding -= add; --numSpaces; } components->append(TextRunComponent(1, font, offset)); offset += wordSpacing + add + components->last().m_width + letterSpacing; start = i + 1; continue; } if (i - start > 0) { components->append(TextRunComponent(run.characters16() + start, i - start, run, font, offset)); offset += components->last().m_width + letterSpacing; } start = i; } if (run.length() - start > 0) { components->append(TextRunComponent(run.characters16() + start, run.length() - start, run, font, offset)); offset += components->last().m_width; } offset += letterSpacing; } else { int start = 0; for (int i = 0; i < run.length(); ++i) { if (Font::treatAsSpace(run[i])) { if (i - start > 0) { components->append(TextRunComponent(run.characters16() + start, i - start, run, font, offset)); offset += components->last().m_width; } int add = 0; 
if (numSpaces) { add = padding/numSpaces; padding -= add; --numSpaces; } components->append(TextRunComponent(1, font, offset)); offset += add + components->last().m_width; if (i) offset += wordSpacing; start = i + 1; } } if (run.length() - start > 0) { components->append(TextRunComponent(run.characters16() + start, run.length() - start, run, font, offset)); offset += components->last().m_width; } } return offset; }
Font::CodePath Font::codePath(const TextRun& run) const { if (s_codePath != Auto) return s_codePath; #if ENABLE(SVG_FONTS) if (run.renderingContext()) return Simple; #endif #if PLATFORM(QT) && !HAVE(QRAWFONT) if (run.expansion() || run.rtl() || isSmallCaps() || wordSpacing() || letterSpacing()) return Complex; #endif if (m_fontDescription.featureSettings() && m_fontDescription.featureSettings()->size() > 0) return Complex; CodePath result = Simple; // Start from 0 since drawing and highlighting also measure the characters before run->from // FIXME: Should use a UnicodeSet in ports where ICU is used. Note that we // can't simply use UnicodeCharacter Property/class because some characters // are not 'combining', but still need to go to the complex path. // Alternatively, we may as well consider binary search over a sorted // list of ranges. for (int i = 0; i < run.length(); i++) { const UChar c = run[i]; if (c < 0x2E5) // U+02E5 through U+02E9 (Modifier Letters : Tone letters) continue; if (c <= 0x2E9) return Complex; if (c < 0x300) // U+0300 through U+036F Combining diacritical marks continue; if (c <= 0x36F) return Complex; if (c < 0x0591 || c == 0x05BE) // U+0591 through U+05CF excluding U+05BE Hebrew combining marks, Hebrew punctuation Paseq, Sof Pasuq and Nun Hafukha continue; if (c <= 0x05CF) return Complex; // U+0600 through U+109F Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, // Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada, // Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar if (c < 0x0600) continue; if (c <= 0x109F) return Complex; // U+1100 through U+11FF Hangul Jamo (only Ancient Korean should be left here if you precompose; // Modern Korean will be precomposed as a result of step A) if (c < 0x1100) continue; if (c <= 0x11FF) return Complex; if (c < 0x135D) // U+135D through U+135F Ethiopic combining marks continue; if (c <= 0x135F) return Complex; if (c < 0x1700) // U+1780 through U+18AF Tagalog, Hanunoo, Buhid, Taghanwa,Khmer, 
Mongolian continue; if (c <= 0x18AF) return Complex; if (c < 0x1900) // U+1900 through U+194F Limbu (Unicode 4.0) continue; if (c <= 0x194F) return Complex; if (c < 0x1980) // U+1980 through U+19DF New Tai Lue continue; if (c <= 0x19DF) return Complex; if (c < 0x1A00) // U+1A00 through U+1CFF Buginese, Tai Tham, Balinese, Batak, Lepcha, Vedic continue; if (c <= 0x1CFF) return Complex; if (c < 0x1DC0) // U+1DC0 through U+1DFF Comining diacritical mark supplement continue; if (c <= 0x1DFF) return Complex; // U+1E00 through U+2000 characters with diacritics and stacked diacritics if (c <= 0x2000) { result = SimpleWithGlyphOverflow; continue; } if (c < 0x20D0) // U+20D0 through U+20FF Combining marks for symbols continue; if (c <= 0x20FF) return Complex; if (c < 0x2CEF) // U+2CEF through U+2CF1 Combining marks for Coptic continue; if (c <= 0x2CF1) return Complex; if (c < 0x302A) // U+302A through U+302F Ideographic and Hangul Tone marks continue; if (c <= 0x302F) return Complex; if (c < 0xA67C) // U+A67C through U+A67D Combining marks for old Cyrillic continue; if (c <= 0xA67D) return Complex; if (c < 0xA6F0) // U+A6F0 through U+A6F1 Combining mark for Bamum continue; if (c <= 0xA6F1) return Complex; // U+A800 through U+ABFF Nagri, Phags-pa, Saurashtra, Devanagari Extended, // Hangul Jamo Ext. A, Javanese, Myanmar Extended A, Tai Viet, Meetei Mayek, if (c < 0xA800) continue; if (c <= 0xABFF) return Complex; if (c < 0xD7B0) // U+D7B0 through U+D7FF Hangul Jamo Ext. B continue; if (c <= 0xD7FF) return Complex; if (c <= 0xDBFF) { // High surrogate if (i == run.length() - 1) continue; UChar next = run[++i]; if (!U16_IS_TRAIL(next)) continue; UChar32 supplementaryCharacter = U16_GET_SUPPLEMENTARY(c, next); if (supplementaryCharacter < 0x1F1E6) // U+1F1E6 through U+1F1FF Regional Indicator Symbols continue; if (supplementaryCharacter <= 0x1F1FF) return Complex; if (supplementaryCharacter < 0xE0100) // U+E0100 through U+E01EF Unicode variation selectors. 
continue; if (supplementaryCharacter <= 0xE01EF) return Complex; // FIXME: Check for Brahmi (U+11000 block), Kaithi (U+11080 block) and other complex scripts // in plane 1 or higher. continue; } if (c < 0xFE00) // U+FE00 through U+FE0F Unicode variation selectors continue; if (c <= 0xFE0F) return Complex; if (c < 0xFE20) // U+FE20 through U+FE2F Combining half marks continue; if (c <= 0xFE2F) return Complex; } if (run.length() > 1 && typesettingFeatures()) return Complex; return result; }
/* Parse a single escape sequence.  Although this method deals in
 * UChars, it does not use C++ or UnicodeString.  This allows it to
 * be used from C contexts.
 *
 * On entry *offset indexes the first code unit AFTER the backslash;
 * code units are fetched through charAt(index, context) and indexes
 * must stay below length.
 *
 * Recognized forms, as implemented below:
 *   \uhhhh, \Uhhhhhhhh, \xh / \xhh, \x{h...} (1-8 hex digits), 1-3 octal
 *   digits, the C-style escapes in UNESCAPE_MAP, and \cX (X & 0x1F).
 * Any other character is returned as itself. A numeric escape that
 * yields a lead surrogate is joined with a following trail surrogate,
 * whether literal or escaped.
 *
 * Returns the code point and leaves *offset just past the escape.
 * On a malformed escape returns (UChar32)0xFFFFFFFF and restores
 * *offset to its value on entry.
 */
U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,
             int32_t* offset,
             int32_t length,
             void* context) {

    int32_t start = *offset;
    UChar c;
    UChar32 result = 0;
    int8_t n = 0;
    int8_t minDig = 0;
    int8_t maxDig = 0;
    int8_t bitsPerDigit = 4;
    int8_t dig;
    int32_t i;
    UBool braces = FALSE;

    /* Check that offset is in range */
    if (*offset < 0 || *offset >= length) {
        goto err;
    }

    /* Fetch first UChar after '\\' */
    c = charAt((*offset)++, context);

    /* Convert hexadecimal and octal escapes */
    switch (c) {
    case 0x0075 /*'u'*/:
        minDig = maxDig = 4;
        break;
    case 0x0055 /*'U'*/:
        minDig = maxDig = 8;
        break;
    case 0x0078 /*'x'*/:
        minDig = 1;
        if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
            ++(*offset);
            braces = TRUE;
            maxDig = 8;
        } else {
            maxDig = 2;
        }
        break;
    default:
        dig = _digit8(c);
        if (dig >= 0) {
            minDig = 1;
            maxDig = 3;
            n = 1; /* Already have first octal digit */
            bitsPerDigit = 3;
            result = dig;
        }
        break;
    }
    if (minDig != 0) {
        while (*offset < length && n < maxDig) {
            c = charAt(*offset, context);
            dig = (int8_t) ((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
            if (dig < 0) {
                break;
            }
            result = (result << bitsPerDigit) | dig;
            ++(*offset);
            ++n;
        }
        if (n < minDig) {
            goto err;
        }
        if (braces) {
            /* Look at the code unit at *offset rather than at c: when the
             * digit loop stops because maxDig digits were read, c still
             * holds the last digit and "\x{hhhhhhhh}" would be rejected. */
            if (*offset >= length || charAt(*offset, context) != 0x7D /*}*/) {
                goto err;
            }
            ++(*offset);
        }
        if (result < 0 || result >= 0x110000) {
            goto err;
        }
        /* If an escape sequence specifies a lead surrogate, see if
         * there is a trail surrogate after it, either as an escape or
         * as a literal.  If so, join them up into a supplementary.
         */
        if (*offset < length && U16_IS_LEAD(result)) {
            int32_t ahead = *offset + 1;
            /* Keep the full 32-bit value: truncating a nested escape to
             * UChar could turn a supplementary code point (e.g. U+1DC00)
             * into something that looks like a trail surrogate. */
            UChar32 trail = charAt(*offset, context);
            if (trail == 0x5C /*'\\'*/ && ahead < length) {
                trail = u_unescapeAt(charAt, &ahead, length, context);
            }
            if (U16_IS_TRAIL(trail)) {
                *offset = ahead;
                result = U16_GET_SUPPLEMENTARY(result, trail);
            }
        }
        return result;
    }

    /* Convert C-style escapes in table */
    for (i = 0; i < UNESCAPE_MAP_LENGTH; i += 2) {
        if (c == UNESCAPE_MAP[i]) {
            return UNESCAPE_MAP[i + 1];
        } else if (c < UNESCAPE_MAP[i]) {
            /* the early exit relies on the table being ordered by key */
            break;
        }
    }

    /* Map \cX to control-X: X & 0x1F */
    if (c == 0x0063 /*'c'*/ && *offset < length) {
        c = charAt((*offset)++, context);
        if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
            UChar c2 = charAt(*offset, context);
            if (UTF_IS_SECOND_SURROGATE(c2)) {
                ++(*offset);
                c = (UChar) UTF16_GET_PAIR_VALUE(c, c2); /* [sic] */
            }
        }
        return 0x1F & c;
    }

    /* If no special forms are recognized, then consider
     * the backslash to generically escape the next character.
     * Deal with surrogate pairs. */
    if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
        UChar c2 = charAt(*offset, context);
        if (UTF_IS_SECOND_SURROGATE(c2)) {
            ++(*offset);
            return UTF16_GET_PAIR_VALUE(c, c2);
        }
    }
    return c;

 err:
    /* Invalid escape sequence */
    *offset = start; /* Reset to initial value */
    return (UChar32) 0xFFFFFFFF;
}
/*
 * internal function
 *
 * Compares s1 and s2 while applying full case folding on the fly.
 *
 * Code units are compared directly; only when two units differ are their
 * code points looked up with ucase_toFullFolding(). If a code point folds,
 * its folding is copied into fold1[]/fold2[] and reading continues from that
 * buffer ("level 1") until it is exhausted, then resumes in the original
 * string ("level 0").
 *
 * length1/length2: number of code units, or -1 for NUL-terminated input.
 * options: passed to ucase_toFullFolding(); _STRNCMP_STYLE makes a NUL end
 *          the string even when a length is given, and
 *          U_COMPARE_CODE_POINT_ORDER enables the surrogate fix-up at the end.
 *
 * Returns 0 for equal strings (and also when *pErrorCode already indicates
 * failure), -1 / 1 when string 1 / string 2 ends first, otherwise the
 * difference of the first differing code units (after the optional fix-up).
 */
U_CFUNC int32_t
u_strcmpFold(const UChar *s1, int32_t length1,
             const UChar *s2, int32_t length2,
             uint32_t options,
             UErrorCode *pErrorCode) {
    const UCaseProps *csp;

    /* current-level start/limit - s1/s2 as current */
    const UChar *start1, *start2, *limit1, *limit2;

    /* case folding variables */
    const UChar *p;
    int32_t length;

    /* stacks of previous-level start/current/limit */
    CmpEquivLevel stack1[2], stack2[2];

    /* case folding buffers, only use current-level start/limit */
    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];

    /* track which is the current level per string */
    int32_t level1, level2;

    /* current code units, and code points for lookups */
    UChar32 c1, c2, cp1, cp2;

    /* no argument error checking because this itself is not an API */

    /*
     * assume that at least the option U_COMPARE_IGNORE_CASE is set
     * otherwise this function would have to behave exactly as uprv_strCompare()
     */
    csp=ucase_getSingleton();
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* initialize */
    /* a NULL limit marks a NUL-terminated string */
    start1=s1;
    if(length1==-1) {
        limit1=NULL;
    } else {
        limit1=s1+length1;
    }

    start2=s2;
    if(length2==-1) {
        limit2=NULL;
    } else {
        limit2=s2+length2;
    }

    level1=level2=0;
    c1=c2=-1;

    /* comparison loop */
    for(;;) {
        /*
         * here a code unit value of -1 means "get another code unit"
         * below it will mean "this source is finished"
         */

        if(c1<0) {
            /* get next code unit from string 1, post-increment */
            for(;;) {
                if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
                    if(level1==0) {
                        c1=-1;
                        break;
                    }
                } else {
                    ++s1;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level1;
                    start1=stack1[level1].start;
                } while(start1==NULL);
                s1=stack1[level1].s;
                limit1=stack1[level1].limit;
            }
        }

        if(c2<0) {
            /* get next code unit from string 2, post-increment */
            for(;;) {
                if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
                    if(level2==0) {
                        c2=-1;
                        break;
                    }
                } else {
                    ++s2;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level2;
                    start2=stack2[level2].start;
                } while(start2==NULL);
                s2=stack2[level2].s;
                limit2=stack2[level2].limit;
            }
        }

        /*
         * compare c1 and c2
         * either variable c1, c2 is -1 only if the corresponding string is finished
         */
        if(c1==c2) {
            if(c1<0) {
                return 0;   /* c1==c2==-1 indicating end of strings */
            }
            c1=c2=-1;       /* make us fetch new code units */
            continue;
        } else if(c1<0) {
            return -1;      /* string 1 ends before string 2 */
        } else if(c2<0) {
            return 1;       /* string 2 ends before string 1 */
        }
        /* c1!=c2 && c1>=0 && c2>=0 */

        /* get complete code points for c1, c2 for lookups if either is a surrogate */
        /* note: s1/s2 already point one past c1/c2 because of the post-increment above */
        cp1=c1;
        if(U_IS_SURROGATE(c1)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c1)) {
                if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
                    /* advance ++s1; only below if cp1 decomposes/case-folds */
                    cp1=U16_GET_SUPPLEMENTARY(c1, c);
                }
            } else /* isTrail(c1) */ {
                if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
                    cp1=U16_GET_SUPPLEMENTARY(c, c1);
                }
            }
        }

        cp2=c2;
        if(U_IS_SURROGATE(c2)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c2)) {
                if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
                    /* advance ++s2; only below if cp2 decomposes/case-folds */
                    cp2=U16_GET_SUPPLEMENTARY(c2, c);
                }
            } else /* isTrail(c2) */ {
                if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
                    cp2=U16_GET_SUPPLEMENTARY(c, c2);
                }
            }
        }

        /*
         * go down one level for each string
         * continue with the main loop as soon as there is a real change
         */

        /* only fold text from the original string (level 0), never text that is already a folding */
        if( level1==0 &&
            (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
        ) {
            /* cp1 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c1)) {
                if(U_IS_SURROGATE_LEAD(c1)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s1;
                } else /* isTrail(c1) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s2;
                    c2=*(s2-1);
                }
            }

            /* push current level pointers */
            stack1[0].start=start1;
            stack1[0].s=s1;
            stack1[0].limit=limit1;
            ++level1;

            /* copy the folding result to fold1[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                u_memcpy(fold1, p, length);
            } else {
                /* here "length" is itself the single folded code point; encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold1, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start1=s1=fold1;
            limit1=fold1+length;

            /* get ready to read from decomposition, continue with loop */
            c1=-1;
            continue;
        }

        /* same as above, for string 2 */
        if( level2==0 &&
            (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
        ) {
            /* cp2 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c2)) {
                if(U_IS_SURROGATE_LEAD(c2)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s2;
                } else /* isTrail(c2) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s1;
                    c1=*(s1-1);
                }
            }

            /* push current level pointers */
            stack2[0].start=start2;
            stack2[0].s=s2;
            stack2[0].limit=limit2;
            ++level2;

            /* copy the folding result to fold2[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                u_memcpy(fold2, p, length);
            } else {
                /* here "length" is itself the single folded code point; encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold2, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start2=s2=fold2;
            limit2=fold2+length;

            /* get ready to read from decomposition, continue with loop */
            c2=-1;
            continue;
        }

        /*
         * no decomposition/case folding, max level for both sides:
         * return difference result
         *
         * code point order comparison must not just return cp1-cp2
         * because when single surrogates are present then the surrogate pairs
         * that formed cp1 and cp2 may be from different string indexes
         *
         * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
         * c1=d800 cp1=10001 c2=dc00 cp2=10000
         * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
         *
         * therefore, use same fix-up as in ustring.c/uprv_strCompare()
         * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
         * so we have slightly different pointer/start/limit comparisons here
         */

        if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
            /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
            if(
                (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
                (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c1-=0x2800;
            }

            if(
                (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
                (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c2-=0x2800;
            }
        }

        return c1-c2;
    }
}
void WidthIterator::advance(int offset, GlyphBuffer* glyphBuffer) { if (offset > m_end) offset = m_end; int currentCharacter = m_currentCharacter; const UChar* cp = m_run.data(currentCharacter); bool rtl = m_run.rtl(); bool hasExtraSpacing = (m_font->letterSpacing() || m_font->wordSpacing() || m_padding) && !m_run.spacingDisabled(); float widthSinceLastRounding = m_runWidthSoFar; m_runWidthSoFar = floorf(m_runWidthSoFar); widthSinceLastRounding -= m_runWidthSoFar; float lastRoundingWidth = m_finalRoundingWidth; FloatRect bounds; const SimpleFontData* primaryFont = m_font->primaryFont(); const SimpleFontData* lastFontData = primaryFont; while (currentCharacter < offset) { UChar32 c = *cp; unsigned clusterLength = 1; if (c >= 0x3041) { if (c <= 0x30FE) { // Deal with Hiragana and Katakana voiced and semi-voiced syllables. // Normalize into composed form, and then look for glyph with base + combined mark. // Check above for character range to minimize performance impact. UChar32 normalized = normalizeVoicingMarks(currentCharacter); if (normalized) { c = normalized; clusterLength = 2; } } else if (U16_IS_SURROGATE(c)) { if (!U16_IS_SURROGATE_LEAD(c)) break; // Do we have a surrogate pair? If so, determine the full Unicode (32 bit) // code point before glyph lookup. // Make sure we have another character and it's a low surrogate. if (currentCharacter + 1 >= m_run.length()) break; UChar low = cp[1]; if (!U16_IS_TRAIL(low)) break; c = U16_GET_SUPPLEMENTARY(c, low); clusterLength = 2; } } const GlyphData& glyphData = m_font->glyphDataForCharacter(c, rtl); Glyph glyph = glyphData.glyph; const SimpleFontData* fontData = glyphData.fontData; ASSERT(fontData); // Now that we have a glyph and font data, get its width. 
float width; if (c == '\t' && m_run.allowTabs()) { float tabWidth = m_font->tabWidth(*fontData); width = tabWidth - fmodf(m_run.xPos() + m_runWidthSoFar + widthSinceLastRounding, tabWidth); } else { width = fontData->widthForGlyph(glyph); #if ENABLE(SVG) // SVG uses horizontalGlyphStretch(), when textLength is used to stretch/squeeze text. width *= m_run.horizontalGlyphStretch(); #endif // We special case spaces in two ways when applying word rounding. // First, we round spaces to an adjusted width in all fonts. // Second, in fixed-pitch fonts we ensure that all characters that // match the width of the space character have the same width as the space character. if (width == fontData->spaceWidth() && (fontData->pitch() == FixedPitch || glyph == fontData->spaceGlyph()) && m_run.applyWordRounding()) width = fontData->adjustedSpaceWidth(); } if (fontData != lastFontData && width) { lastFontData = fontData; if (m_fallbackFonts && fontData != primaryFont) { // FIXME: This does a little extra work that could be avoided if // glyphDataForCharacter() returned whether it chose to use a small caps font. if (!m_font->isSmallCaps() || c == toUpper(c)) m_fallbackFonts->add(fontData); else { const GlyphData& uppercaseGlyphData = m_font->glyphDataForCharacter(toUpper(c), rtl); if (uppercaseGlyphData.fontData != primaryFont) m_fallbackFonts->add(uppercaseGlyphData.fontData); } } } if (hasExtraSpacing) { // Account for letter-spacing. if (width && m_font->letterSpacing()) width += m_font->letterSpacing(); if (Font::treatAsSpace(c)) { // Account for padding. WebCore uses space padding to justify text. // We distribute the specified padding over the available spaces in the run. if (m_padding) { // Use left over padding if not evenly divisible by number of spaces. 
if (m_padding < m_padPerSpace) { width += m_padding; m_padding = 0; } else { float previousPadding = m_padding; m_padding -= m_padPerSpace; width += roundf(previousPadding) - roundf(m_padding); } } // Account for word spacing. // We apply additional space between "words" by adding width to the space character. if (currentCharacter != 0 && !Font::treatAsSpace(cp[-1]) && m_font->wordSpacing()) width += m_font->wordSpacing(); } } if (m_accountForGlyphBounds) { bounds = fontData->boundsForGlyph(glyph); if (!currentCharacter) m_firstGlyphOverflow = max<float>(0, -bounds.x()); } if (m_forTextEmphasis && !Font::canReceiveTextEmphasis(c)) glyph = 0; // Advance past the character we just dealt with. cp += clusterLength; currentCharacter += clusterLength; // Account for float/integer impedance mismatch between CG and KHTML. "Words" (characters // followed by a character defined by isRoundingHackCharacter()) are always an integer width. // We adjust the width of the last character of a "word" to ensure an integer width. // If we move KHTML to floats we can remove this (and related) hacks. float oldWidth = width; // Force characters that are used to determine word boundaries for the rounding hack // to be integer width, so following words will start on an integer boundary. if (m_run.applyWordRounding() && Font::isRoundingHackCharacter(c)) { width = ceilf(width); // Since widthSinceLastRounding can lose precision if we include measurements for // preceding whitespace, we bypass it here. m_runWidthSoFar += width; // Since this is a rounding hack character, we should have reset this sum on the previous // iteration. ASSERT(!widthSinceLastRounding); } else { // Check to see if the next character is a "rounding hack character", if so, adjust // width so that the total run width will be on an integer boundary. 
if ((m_run.applyWordRounding() && currentCharacter < m_run.length() && Font::isRoundingHackCharacter(*cp)) || (m_run.applyRunRounding() && currentCharacter >= m_end)) { float totalWidth = widthSinceLastRounding + width; widthSinceLastRounding = ceilf(totalWidth); width += widthSinceLastRounding - totalWidth; m_runWidthSoFar += widthSinceLastRounding; widthSinceLastRounding = 0; } else widthSinceLastRounding += width; } if (glyphBuffer) glyphBuffer->add(glyph, fontData, (rtl ? oldWidth + lastRoundingWidth : width)); lastRoundingWidth = width - oldWidth; if (m_accountForGlyphBounds) { m_maxGlyphBoundingBoxY = max(m_maxGlyphBoundingBoxY, bounds.bottom()); m_minGlyphBoundingBoxY = min(m_minGlyphBoundingBoxY, bounds.y()); m_lastGlyphOverflow = max<float>(0, bounds.right() - width); } } m_currentCharacter = currentCharacter; m_runWidthSoFar += widthSinceLastRounding; m_finalRoundingWidth = lastRoundingWidth; }
/*
 * Scan the NUL-terminated string code point by code point and test each one
 * for membership in the NUL-terminated matchSet.
 *
 * polarity != 0: return the index of the first code point that IS in matchSet.
 * polarity == 0: return the index of the first code point that is NOT in matchSet.
 * If no code point qualifies, return -(string length)-1.
 */
static int32_t
_matchFromSet(const UChar* string, const UChar* matchSet, UBool polarity)
{
    int32_t bmpCount, setLength, pos, unitStart, k;
    UChar32 codePoint, candidate;
    UChar unit, next;
    UBool found;

    /* leading part of matchSet: single (non-surrogate) code units only */
    bmpCount = 0;
    while (matchSet[bmpCount] != 0 && U16_IS_SINGLE(matchSet[bmpCount])) {
        ++bmpCount;
    }

    /* remainder of matchSet: may contain surrogate pairs */
    setLength = bmpCount;
    while (matchSet[setLength] != 0) {
        ++setLength;
    }

    pos = 0;
    while ((unit = string[pos]) != 0) {
        /* index of the first code unit of the current code point */
        unitStart = pos++;
        found = 0;

        if (U16_IS_SINGLE(unit)) {
            /* plain code unit comparison against the whole set */
            for (k = 0; k < setLength; ++k) {
                if (unit == matchSet[k]) {
                    found = 1;
                    break;
                }
            }
        } else {
            /* a surrogate: an unpaired one is treated as its own code point */
            codePoint = unit;
            if (U16_IS_SURROGATE_LEAD(unit)) {
                /*
                 * No length check is needed before reading the next unit:
                 * at worst it is the terminating NUL, which is not a trail.
                 */
                next = string[pos];
                if (U16_IS_TRAIL(next)) {
                    codePoint = U16_GET_SUPPLEMENTARY(unit, next);
                    ++pos;
                }
            }

            /* only the second part of matchSet can hold surrogates */
            for (k = bmpCount; k < setLength;) {
                U16_NEXT(matchSet, k, setLength, candidate);
                if (codePoint == candidate) {
                    found = 1;
                    break;
                }
            }
        }

        /* stop at the first code point whose membership agrees with polarity */
        if (polarity ? found : !found) {
            return unitStart;
        }
    }

    /* Didn't find it. */
    return -pos - 1;
}
/*
 * Encodes the UTF-16 string src as Punycode into dest.
 *
 * src/srcLength:      input text; srcLength==-1 means NUL-terminated.
 * dest/destCapacity:  output buffer; dest may be NULL only if destCapacity is 0.
 *                     Output units are written only while destLength<destCapacity,
 *                     but destLength keeps counting past the capacity.
 * caseFlags:          optional (may be NULL); indexed by source code unit index.
 *                     For basic code points it is passed to asciiCaseMap(), for
 *                     others it is carried in bit 31 of the cpBuffer entry and
 *                     handed to digitToBasic() for the last digit.
 * pErrorCode:         returns 0 immediately if NULL or already a failure.
 *
 * Errors set here: U_ILLEGAL_ARGUMENT_ERROR (bad arguments),
 * U_INDEX_OUTOFBOUNDS_ERROR (more than MAX_CP_COUNT input code points),
 * U_INVALID_CHAR_FOUND (unmatched surrogate), U_INTERNAL_PROGRAM_ERROR
 * (delta would overflow). Each of these returns 0.
 *
 * On success returns the result of u_terminateUChars() for destLength.
 */
U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
                UChar *dest, int32_t destCapacity,
                const UBool *caseFlags,
                UErrorCode *pErrorCode) {

    int32_t cpBuffer[MAX_CP_COUNT];
    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    UChar c, c2;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
     */
    srcCPCount=destLength=0;
    if(srcLength==-1) {
        /* NUL-terminated input */
        for(j=0; /* no condition */; ++j) {
            if((c=src[j])==0) {
                break;
            }
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            if(IS_BASIC(c)) {
                /* basic code points are output directly and recorded as 0 */
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                /* start with the case flag in bit 31, then OR in the code point */
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
                    /* src[j+1] is at worst the terminating NUL, so no bounds check here */
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    } else {
        /* length-specified input */
        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            c=src[j];
            if(IS_BASIC(c)) {
                /* basic code points are output directly and recorded as 0 */
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                /* start with the case flag in bit 31, then OR in the code point */
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
                    /* here the trail unit must lie inside srcLength */
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    }

    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=destLength;
    if(basicLength>0) {
        if(destLength<destCapacity) {
            dest[destLength]=DELIMITER;
        }
        ++destLength;
    }

    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     * destLength is the number of chars that have been output
     */

    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;

    /* Main encoding loop: */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }

        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;

        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                /*
                 * a smaller code point is skipped over: count it in delta
                 * (basic code points were stored as 0; presumably INITIAL_N>0
                 * so they always land here - confirm against INITIAL_N)
                 */
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {

                    /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(t>TMAX) {
                        t=TMAX;
                    }
                    */

                    /* threshold for this digit position, clamped to TMIN..TMAX */
                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }

                    if(q<t) {
                        /* the remainder fits into the final digit */
                        break;
                    }

                    if(destLength<destCapacity) {
                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
                    }
                    ++destLength;
                    q=(q-t)/(BASE-t);
                }

                /* final digit; the case flag comes from the sign bit of the buffer entry */
                if(destLength<destCapacity) {
                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
                }
                ++destLength;
                /* the third argument is true only for the first non-basic code point */
                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }

        ++delta;
        ++n;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
// Returns true when the code point ending at |ch| is in a Unicode Letter (L*) or
// Number (N*) general category. If |lastCh| and |ch| form a lead/trail surrogate
// pair they are combined into one supplementary code point; otherwise |ch| is
// classified on its own.
static inline bool isUnicodeCategoryLetterOrNumber(UChar lastCh, UChar ch)
{
    UChar32 codePoint = ch;
    if (U16_IS_LEAD(lastCh) && U16_IS_TRAIL(ch))
        codePoint = U16_GET_SUPPLEMENTARY(lastCh, ch);

    return (U_MASK(u_charType(codePoint)) & (U_GC_L_MASK | U_GC_N_MASK)) != 0;
}
// Scans |len| UTF-16 code units and picks the text code path they require:
//  - ComplexPath as soon as any unit falls in one of the ranges below, or a surrogate
//    pair encodes a Regional Indicator Symbol or a supplementary variation selector;
//  - SimpleWithGlyphOverflowPath if no such character exists but at least one unit is
//    in U+1E00..U+2000;
//  - SimplePath otherwise.
CodePath Character::characterRangeCodePath(const UChar* characters, unsigned len)
{
    // Inclusive [first, last] pairs, in ascending order.
    static const UChar complexCodePathRanges[] = {
        // U+02E5 through U+02E9 (Modifier Letters : Tone letters)
        0x2E5, 0x2E9,
        // U+0300 through U+036F Combining diacritical marks
        0x300, 0x36F,
        // U+0591 through U+05CF excluding U+05BE Hebrew combining marks, ...
        0x0591, 0x05BD,
        // ... Hebrew punctuation Paseq, Sof Pasuq and Nun Hafukha
        0x05BF, 0x05CF,
        // U+0600 through U+109F Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic,
        // Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
        // Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar
        0x0600, 0x109F,
        // U+1100 through U+11FF Hangul Jamo (only Ancient Korean should be left
        // here if you precompose; Modern Korean will be precomposed as a result of step A)
        0x1100, 0x11FF,
        // U+135D through U+135F Ethiopic combining marks
        0x135D, 0x135F,
        // U+1700 through U+18AF Tagalog, Hanunoo, Buhid, Taghanwa, Khmer, Mongolian
        0x1700, 0x18AF,
        // U+1900 through U+194F Limbu (Unicode 4.0)
        0x1900, 0x194F,
        // U+1980 through U+19DF New Tai Lue
        0x1980, 0x19DF,
        // U+1A00 through U+1CFF Buginese, Tai Tham, Balinese, Batak, Lepcha, Vedic
        0x1A00, 0x1CFF,
        // U+1DC0 through U+1DFF Combining diacritical mark supplement
        0x1DC0, 0x1DFF,
        // U+20D0 through U+20FF Combining marks for symbols
        0x20D0, 0x20FF,
        // U+2CEF through U+2CF1 Combining marks for Coptic
        0x2CEF, 0x2CF1,
        // U+302A through U+302F Ideographic and Hangul Tone marks
        0x302A, 0x302F,
        // U+A67C through U+A67D Combining marks for old Cyrillic
        0xA67C, 0xA67D,
        // U+A6F0 through U+A6F1 Combining mark for Bamum
        0xA6F0, 0xA6F1,
        // U+A800 through U+ABFF Nagri, Phags-pa, Saurashtra, Devanagari Extended,
        // Hangul Jamo Ext. A, Javanese, Myanmar Extended A, Tai Viet, Meetei Mayek
        0xA800, 0xABFF,
        // U+D7B0 through U+D7FF Hangul Jamo Ext. B
        0xD7B0, 0xD7FF,
        // U+FE00 through U+FE0F Unicode variation selectors
        0xFE00, 0xFE0F,
        // U+FE20 through U+FE2F Combining half marks
        0xFE20, 0xFE2F
    };

    CodePath result = SimplePath;
    for (unsigned i = 0; i < len; i++) {
        const UChar c = characters[i];

        // Shortcut for common case
        if (c < 0x2E5)
            continue;

        // U+1E00 through U+2000 characters with diacritics and stacked diacritics
        if (c >= 0x1E00 && c <= 0x2000) {
            result = SimpleWithGlyphOverflowPath;
            continue;
        }

        // Surrogate pairs (c is a lead surrogate, U+D800..U+DBFF)
        if (c > 0xD7FF && c <= 0xDBFF) {
            // Peek at the next unit and consume it only if it really is a trail
            // surrogate. An unpaired lead must not swallow the following unit,
            // otherwise a complex character right after it would never be examined.
            if (i + 1 >= len || !U16_IS_TRAIL(characters[i + 1]))
                continue;

            const UChar next = characters[++i];
            UChar32 supplementaryCharacter = U16_GET_SUPPLEMENTARY(c, next);

            if (supplementaryCharacter < 0x1F1E6) // U+1F1E6 through U+1F1FF Regional Indicator Symbols
                continue;
            if (supplementaryCharacter <= 0x1F1FF)
                return ComplexPath;

            if (supplementaryCharacter < 0xE0100) // U+E0100 through U+E01EF Unicode variation selectors.
                continue;
            if (supplementaryCharacter <= 0xE01EF)
                return ComplexPath;

            // FIXME: Check for Brahmi (U+11000 block), Kaithi (U+11080 block) and other complex scripts
            // in plane 1 or higher.

            continue;
        }

        // Search for other Complex cases
        if (valueInIntervalList(complexCodePathRanges, c))
            return ComplexPath;
    }

    return result;
}