Пример #1
0
void XMLTreeBuilder::processHTMLEntity(const AtomicXMLToken& token)
{
    HTMLEntitySearch search;
    const AtomicString& name = token.name();
    for (size_t i = 0; i < name.length(); ++i) {
        search.advance(name[i]);
        if (!search.isEntityPrefix()) {
            m_parser->stopParsing();
            return;
        }
    }
    search.advance(';');
    UChar32 entityValue = search.currentValue();
    if (entityValue <= 0xFFFF)
       appendToText(reinterpret_cast<UChar*>(&entityValue), 1);
    else {
        UChar utf16Pair[2] = { U16_LEAD(entityValue), U16_TRAIL(entityValue) };
        appendToText(utf16Pair, 2);
    }
}
Пример #2
0
/*
 * Returns a pointer to the first occurrence of code point c in the
 * NUL-terminated UTF-16 string s, or NULL if it does not occur.
 * BMP code points are delegated to u_strchr(); supplementary code points are
 * matched as a lead/trail surrogate pair; values above UCHAR_MAX_VALUE are
 * never found.
 */
U_CAPI UChar* U_EXPORT2
u_strchr32(const UChar* s, UChar32 c) {
    const uint32_t codePoint = (uint32_t) c;

    if (codePoint <= U_BMP_MAX) {
        /* a single code unit: the 16-bit search does the job */
        return u_strchr(s, (UChar) c);
    }
    if (codePoint > UCHAR_MAX_VALUE) {
        /* outside the Unicode range, cannot occur in a string */
        return NULL;
    }

    /* supplementary: look for the lead unit directly followed by the trail unit */
    const UChar lead = U16_LEAD(c);
    const UChar trail = U16_TRAIL(c);
    const UChar* p;
    for (p = s; *p != 0; ++p) {
        /* p[1] is at worst the terminating NUL, so reading it is safe */
        if (*p == lead && p[1] == trail) {
            return (UChar*) p;
        }
    }
    return NULL;
}
Пример #3
0
// Returns a copy of the UTF-16 text in [characters, characters + length) in
// which every code point has been replaced by the result of mirroredChar().
// Code points are decoded with U16_NEXT and re-encoded as one or two code
// units depending on U16_LENGTH of the mirrored value.
String SVGFontData::createStringWithMirroredCharacters(const UChar* characters, unsigned length) const
{
    StringBuilder result;
    result.reserveCapacity(length);

    for (unsigned offset = 0; offset < length;) {
        // U16_NEXT reads one code point and advances |offset| past it.
        UChar32 codePoint;
        U16_NEXT(characters, offset, length, codePoint);
        const UChar32 mirrored = mirroredChar(codePoint);

        if (U16_LENGTH(mirrored) != 1) {
            // Needs a surrogate pair.
            result.append(U16_LEAD(mirrored));
            result.append(U16_TRAIL(mirrored));
            continue;
        }
        result.append(static_cast<UChar>(mirrored));
    }

    return result.toString();
}
Пример #4
0
/*
 * Returns the UTF-16 code unit at the iterator's current position without
 * moving the iterator:
 *  - the trail surrogate of the code point held in reservedField, if any;
 *  - U_SENTINEL when start has reached limit;
 *  - otherwise the code point decoded by L8_NEXT at start, reported as
 *    0xfffd if negative, as itself if it fits in 16 bits, and as its lead
 *    surrogate otherwise.
 */
static UChar32 U_CALLCONV
lenient8IteratorCurrent(UCharIterator *iter) {
    const uint8_t *bytes;
    UChar32 codePoint;
    int32_t pos;

    if(iter->reservedField!=0) {
        return U16_TRAIL(iter->reservedField);
    }
    if(iter->start>=iter->limit) {
        return U_SENTINEL;
    }

    /* decode on a local copy of the position so that iter->start is untouched */
    bytes=(const uint8_t *)iter->context;
    pos=iter->start;
    L8_NEXT(bytes, pos, iter->limit, codePoint);

    if(codePoint<0) {
        return 0xfffd;
    }
    if(codePoint>0xffff) {
        return U16_LEAD(codePoint);
    }
    return codePoint;
}
Пример #5
0
static void TestSurrogate(){
    static UChar32 s[] = {0x10000, 0x10ffff, 0x50000, 0x100000, 0x1abcd};
    int i = 0;
    while (i < 5) {
        UChar first  = UTF_FIRST_SURROGATE(s[i]);
        UChar second = UTF_SECOND_SURROGATE(s[i]);
        /* algorithm from the Unicode consortium */
        UChar firstresult  = (UChar)(((s[i] - 0x10000) / 0x400) + 0xD800);
        UChar secondresult = (UChar)(((s[i] - 0x10000) % 0x400) + 0xDC00);

        if (first != UTF16_LEAD(s[i]) || first != U16_LEAD(s[i]) || first != firstresult) {
            log_err("Failure in first surrogate in 0x%x expected to be 0x%x\n",
                    s[i], firstresult);
        }
        if (second != UTF16_TRAIL(s[i]) || second != U16_TRAIL(s[i]) || second != secondresult) {
            log_err("Failure in second surrogate in 0x%x expected to be 0x%x\n",
                    s[i], secondresult);
        }
        i ++;
    }
}
// Exercises the StringBuilder::append() overloads used below: String, C string,
// (pointer, length), single character, 8-bit character buffer and UChar32.
TEST(StringBuilderTest, Append)
{
    StringBuilder accumulated;
    accumulated.append(String("0123456789"));
    expectBuilderContent("0123456789", accumulated);
    accumulated.append("abcd");
    expectBuilderContent("0123456789abcd", accumulated);
    // Only the first three characters of the literal are appended.
    accumulated.append("efgh", 3);
    expectBuilderContent("0123456789abcdefg", accumulated);
    // Appending an empty string leaves the content unchanged.
    accumulated.append("");
    expectBuilderContent("0123456789abcdefg", accumulated);
    accumulated.append('#');
    expectBuilderContent("0123456789abcdefg#", accumulated);

    accumulated.toString(); // Test after reifyString().
    StringBuilder copyPlusSuffix;
    accumulated.append("", 0);
    expectBuilderContent("0123456789abcdefg#", accumulated);
    // Round-trip the content through a second builder using the 8-bit buffer.
    copyPlusSuffix.append(accumulated.characters8(), accumulated.length());
    copyPlusSuffix.append("XYZ");
    accumulated.append(copyPlusSuffix.characters8(), copyPlusSuffix.length());
    expectBuilderContent("0123456789abcdefg#0123456789abcdefg#XYZ", accumulated);

    // With capacity reserved up front, a later append must not move the buffer.
    StringBuilder reserved;
    reserved.reserveCapacity(100);
    reserved.append("xyz");
    const LChar* bufferBeforeAppend = reserved.characters8();
    reserved.append("0123456789");
    EXPECT_EQ(bufferBeforeAppend, reserved.characters8());

    // Test appending UChar32 characters to StringBuilder.
    StringBuilder supplementaryBuilder;
    UChar32 frakturA = 0x1D504;
    supplementaryBuilder.append(frakturA); // The fraktur A is not in the BMP, so it's two UTF-16 code units long.
    EXPECT_EQ(2U, supplementaryBuilder.length());
    supplementaryBuilder.append(static_cast<UChar32>('A'));
    EXPECT_EQ(3U, supplementaryBuilder.length());
    const UChar expectedUnits[] = { U16_LEAD(frakturA), U16_TRAIL(frakturA), 'A' };
    expectBuilderContent(String(expectedUnits, WTF_ARRAY_LENGTH(expectedUnits)), supplementaryBuilder);
}
Пример #7
0
// Returns true when the code units in [a, aEnd) are exactly the UTF-16
// encoding of the UTF-8 bytes in [b, bEnd).
//
// Returns false on the first mismatch, on a truncated or illegal UTF-8
// sequence, on a decoded surrogate or out-of-range value, and when either
// buffer ends before the other. |a| is never read at or beyond |aEnd|.
ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
{
    while (b < bEnd) {
        // Every character decoded from |b| consumes at least one unit of |a|,
        // so running out of |a| here means the strings differ.
        if (a == aEnd)
            return false;

        if (isASCII(*b)) {
            if (*a++ != *b++)
                return false;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

        if (bEnd - b < utf8SequenceLength)
            return false;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
            return false;

        // NOTE(review): readUTF8Sequence is expected to advance |b| past the
        // sequence (nothing else in this branch moves it) — confirm.
        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return false;
            if (*a++ != character)
                return false;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // A supplementary character needs a full surrogate pair in |a|.
            if (aEnd - a < 2)
                return false;
            if (*a++ != U16_LEAD(character))
                return false;
            if (*a++ != U16_TRAIL(character))
                return false;
        } else {
            return false;
        }
    }

    return a == aEnd;
}
Пример #8
0
/*
 * Returns a pointer to the last occurrence of code point c in the first
 * count UTF-16 code units of s, or NULL if it does not occur.
 * BMP code points are delegated to u_memrchr(); supplementary code points are
 * matched as a lead/trail surrogate pair, which needs at least two units;
 * values above UCHAR_MAX_VALUE are never found.
 */
U_CAPI UChar* U_EXPORT2
u_memrchr32(const UChar* s, UChar32 c, int32_t count) {
    if ((uint32_t) c <= U_BMP_MAX) {
        return u_memrchr(s, (UChar) c, count);
    }
    if (count < 2 || (uint32_t) c > UCHAR_MAX_VALUE) {
        /* no room for a surrogate pair, or not a Unicode code point */
        return NULL;
    }

    const UChar lead = U16_LEAD(c);
    const UChar trail = U16_TRAIL(c);
    const UChar* p;

    /* p walks the candidate trail positions from the last unit down to s + 1 */
    for (p = s + count - 1; p != s; --p) {
        if (*p == trail && p[-1] == lead) {
            return (UChar*) (p - 1);
        }
    }
    return NULL;
}
Пример #9
0
// Returns |string| with its HTML entities replaced by the characters that
// PreloadScanner::consumeEntity() decodes them to.
//
// An '&' for which consumeEntity() yields 0 is copied through as a literal
// '&'. When |leaveUndecodableEntitiesUntouched| is true, an entity decoding to
// 0xFFFD is treated the same way, and in both cases the input is rewound to
// just after the '&' so the entity text is copied verbatim.
String XSSAuditor::decodeHTMLEntities(const String& string, bool leaveUndecodableEntitiesUntouched)
{
    SegmentedString input(string);
    SegmentedString rewindPoint;
    Vector<UChar> decoded;

    while (!input.isEmpty()) {
        UChar current = *input;
        input.advance();

        if (current != '&') {
            decoded.append(current);
            continue;
        }

        // Remember the position right after '&' in case we have to back out.
        if (leaveUndecodableEntitiesUntouched)
            rewindPoint = input;

        // notEnoughCharacters is deliberately ignored: this loop copies any
        // remaining characters into |decoded| anyway.
        bool notEnoughCharacters = false;
        unsigned entity = PreloadScanner::consumeEntity(input, notEnoughCharacters);

        if (entity > 0xFFFF) {
            decoded.append(U16_LEAD(entity));
            decoded.append(U16_TRAIL(entity));
            continue;
        }

        bool keepAsText = !entity || (leaveUndecodableEntitiesUntouched && entity == 0xFFFD);
        if (!keepAsText) {
            decoded.append(entity);
            continue;
        }

        decoded.append('&');
        if (leaveUndecodableEntitiesUntouched)
            input = rewindPoint;
    }

    return String::adopt(decoded);
}
Пример #10
0
/*
 * Returns the next UTF-16 code unit from the iterator and advances it.
 *
 * A code point above 0xffff is delivered over two calls: the first call
 * returns its lead surrogate and parks the code point in reservedField, the
 * second call returns the trail surrogate and clears reservedField.
 * A negative code point from L8_NEXT is reported as 0xfffd.
 * Returns U_SENTINEL once start has reached limit.
 *
 * iter->index and iter->length are only updated when they can be determined;
 * a negative value is treated as "not known" by the checks below.
 */
static UChar32 U_CALLCONV
lenient8IteratorNext(UCharIterator *iter) {
    int32_t index;

    if(iter->reservedField!=0) {
        /* second half of a supplementary code point started by the previous call */
        UChar trail=U16_TRAIL(iter->reservedField);
        iter->reservedField=0;
        if((index=iter->index)>=0) {
            iter->index=index+1;
        }
        return trail;
    } else if(iter->start<iter->limit) {
        const uint8_t *s=(const uint8_t *)iter->context;
        UChar32 c;

        /* decodes one code point into c and moves iter->start forward */
        L8_NEXT(s, iter->start, iter->limit, c);
        if((index=iter->index)>=0) {
            iter->index=++index;
            if(iter->length<0 && iter->start==iter->limit) {
                /* reached the end with a known index: the length is now known too
                   (one more unit if a trail surrogate is still pending) */
                iter->length= c<=0xffff ? index : index+1;
            }
        } else if(iter->start==iter->limit && iter->length>=0) {
            /* reached the end with a known length: derive the index from it */
            iter->index= c<=0xffff ? iter->length : iter->length-1;
        }
        if(c<0) {
            return 0xfffd;
        } else if(c<=0xffff) {
            return c;
        } else {
            /* remember the code point so the next call can return its trail surrogate */
            iter->reservedField=c;
            return U16_LEAD(c);
        }
    } else {
        return U_SENTINEL;
    }
}
Пример #11
0
/*
 * Returns a pointer to the first occurrence of code point c in the first
 * count UTF-16 code units of s, or NULL if it does not occur.
 * BMP code points are delegated to u_memchr(); supplementary code points are
 * matched as a lead/trail surrogate pair, which needs at least two units;
 * values above UCHAR_MAX_VALUE are never found.
 */
U_CAPI UChar* U_EXPORT2
u_memchr32(const UChar* s, UChar32 c, int32_t count) {
    if ((uint32_t) c <= U_BMP_MAX) {
        return u_memchr(s, (UChar) c, count);
    }
    if (count < 2 || (uint32_t) c > UCHAR_MAX_VALUE) {
        /* no room for a surrogate pair, or not a Unicode code point */
        return NULL;
    }

    const UChar lead = U16_LEAD(c);
    const UChar trail = U16_TRAIL(c);
    /* the last unit can only be a trail, so stop the lead scan one unit early */
    const UChar* const lastUnit = s + count - 1;
    const UChar* p;

    for (p = s; p != lastUnit; ++p) {
        if (*p == lead && p[1] == trail) {
            return (UChar*) p;
        }
    }
    return NULL;
}
Пример #12
0
// Creates the glyph page for |pageNumber| (the block of GlyphPage::size
// consecutive code points starting at pageNumber * GlyphPage::size) and fills
// it with glyphs from |font|.
//
// The page's characters are first written to a UTF-16 buffer, with characters
// that must not render replaced by zeroWidthSpace and newline/tab/no-break
// space replaced by ' ', and the buffer is then handed to fillGlyphPage().
// Returns nullptr when fillGlyphPage() reports that it produced no glyphs.
static RefPtr<GlyphPage> createAndFillGlyphPage(unsigned pageNumber, const Font* font)
{
#if PLATFORM(IOS)
    // FIXME: Times New Roman contains Arabic glyphs, but Core Text doesn't know how to shape them. See <rdar://problem/9823975>.
    // Once we have the fix for <rdar://problem/9823975> then remove this code together with Font::shouldNotBeUsedForArabic()
    // in <rdar://problem/12096835>.
    if (pageNumber == 6 && font->shouldNotBeUsedForArabic())
        return nullptr;
#endif

    const unsigned firstCharacter = pageNumber * GlyphPage::size;
    // Clears the within-page bits of a code point, leaving the page's first character.
    const auto pageMask = ~(GlyphPage::size - 1);

    // Room for a whole page encoded as surrogate pairs.
    UChar characters[GlyphPage::size * 2 + 2];
    unsigned characterCount;

    if (!U_IS_BMP(firstCharacter)) {
        // Supplementary page: every character takes two UTF-16 code units.
        characterCount = GlyphPage::size * 2;
        for (unsigned offset = 0; offset < GlyphPage::size; ++offset) {
            int c = offset + firstCharacter;
            characters[offset * 2] = U16_LEAD(c);
            characters[offset * 2 + 1] = U16_TRAIL(c);
        }
    } else {
        characterCount = GlyphPage::size;
        for (unsigned offset = 0; offset < GlyphPage::size; ++offset)
            characters[offset] = firstCharacter + offset;

        if (firstCharacter == 0) {
            // Control characters must not render at all.
            for (unsigned control = 0; control < 0x20; ++control)
                characters[control] = zeroWidthSpace;
            for (unsigned control = 0x7F; control < 0xA0; ++control)
                characters[control] = zeroWidthSpace;
            characters[softHyphen] = zeroWidthSpace;

            // \n, \t, and nonbreaking space must render as a space.
            characters[static_cast<int>('\n')] = ' ';
            characters[static_cast<int>('\t')] = ' ';
            characters[noBreakSpace] = ' ';
        } else if (firstCharacter == (leftToRightMark & pageMask)) {
            // LRM, RLM, LRE, RLE, ZWNJ, ZWJ, and PDF must not render at all.
            characters[leftToRightMark - firstCharacter] = zeroWidthSpace;
            characters[rightToLeftMark - firstCharacter] = zeroWidthSpace;
            characters[leftToRightEmbed - firstCharacter] = zeroWidthSpace;
            characters[rightToLeftEmbed - firstCharacter] = zeroWidthSpace;
            characters[leftToRightOverride - firstCharacter] = zeroWidthSpace;
            characters[rightToLeftOverride - firstCharacter] = zeroWidthSpace;
            characters[zeroWidthNonJoiner - firstCharacter] = zeroWidthSpace;
            characters[zeroWidthJoiner - firstCharacter] = zeroWidthSpace;
            characters[popDirectionalFormatting - firstCharacter] = zeroWidthSpace;
        } else if (firstCharacter == (objectReplacementCharacter & pageMask)) {
            // Object replacement character must not render at all.
            characters[objectReplacementCharacter - firstCharacter] = zeroWidthSpace;
        } else if (firstCharacter == (zeroWidthNoBreakSpace & pageMask)) {
            // ZWNBS/BOM must not render at all.
            characters[zeroWidthNoBreakSpace - firstCharacter] = zeroWidthSpace;
        }
    }

    // Turn the buffer of characters into glyph indices via the
    // platform-specific fill routine. Success is not guaranteed. For example,
    // Times fails to fill page 260, giving glyph data for only 128 out of 256
    // characters.
    RefPtr<GlyphPage> glyphPage;
    if (GlyphPage::mayUseMixedFontsWhenFilling(characters, characterCount, font))
        glyphPage = GlyphPage::createForMixedFonts();
    else
        glyphPage = GlyphPage::createForSingleFont(font);

    if (!fillGlyphPage(*glyphPage, characters, characterCount, font))
        return nullptr;

    glyphPage->setImmutable();
    return glyphPage;
}
Пример #13
0
// Given the desired base font, this will create a SimpleFontData for a specific
// font that can be used to render the given character.
//
// The character |c| is first passed to getFallbackFamily(), which proposes a
// family plus the character and script it based the choice on. If that family
// is unavailable or does not actually contain the character, a hard-coded list
// of wide-coverage fonts (a CJK-oriented list for Han, a general list
// otherwise) is tried in order. Returns 0 when no tried font covers the
// character.
PassRefPtr<SimpleFontData> FontCache::platformFallbackForCharacter(const FontDescription& fontDescription, UChar32 c, const SimpleFontData*, bool)
{
    // FIXME: We should fix getFallbackFamily to take a UChar32
    // and remove this split-to-UChar16 code.
    UChar codeUnits[2];
    int codeUnitsLength;
    if (c <= 0xFFFF) {
        codeUnits[0] = c;
        codeUnitsLength = 1;
    } else {
        codeUnits[0] = U16_LEAD(c);
        codeUnits[1] = U16_TRAIL(c);
        codeUnitsLength = 2;
    }

    // FIXME: Consider passing fontDescription.dominantScript()
    // to GetFallbackFamily here.
    // |fallbackCharacter| and |script| are outputs of getFallbackFamily().
    UChar32 fallbackCharacter;
    UScriptCode script;
    const wchar_t* family = getFallbackFamily(codeUnits, codeUnitsLength, fontDescription.genericFamily(), &fallbackCharacter, &script);
    FontPlatformData* data = 0;
    if (family)
        data = getFontPlatformData(fontDescription,  AtomicString(family, wcslen(family)));

    // Last resort font list : PanUnicode. CJK fonts have a pretty
    // large repertoire. Eventually, we need to scan all the fonts
    // on the system to have a Firefox-like coverage.
    // Make sure that all of them are lowercased.
    const static wchar_t* const cjkFonts[] = {
        L"arial unicode ms",
        L"ms pgothic",
        L"simsun",
        L"gulim",
        L"pmingliu",
        L"wenquanyi zen hei", // partial CJK Ext. A coverage but more
                              // widely known to Chinese users.
        L"ar pl shanheisun uni",
        L"ar pl zenkai uni",
        L"han nom a",  // Complete CJK Ext. A coverage
        L"code2000",   // Complete CJK Ext. A coverage
        // CJK Ext. B fonts are not listed here because it's of no use
        // with our current non-BMP character handling because we use
        // Uniscribe for it and that code path does not go through here.
    };

    const static wchar_t* const commonFonts[] = {
        L"tahoma",
        L"arial unicode ms",
        L"lucida sans unicode",
        L"microsoft sans serif",
        L"palatino linotype",
        // Six fonts below (and code2000 at the end) are not from MS, but
        // once installed, cover a very wide range of characters.
        L"dejavu serif",
        L"dejavu sans",
        L"freeserif",
        L"freesans",
        L"gentium",
        L"gentiumalt",
        L"ms pgothic",
        L"simsun",
        L"gulim",
        L"pmingliu",
        L"code2000",
    };

    const wchar_t* const* panUniFonts = 0;
    int numFonts = 0;
    if (script == USCRIPT_HAN) {
        panUniFonts = cjkFonts;
        numFonts = WTF_ARRAY_LENGTH(cjkFonts);
    } else {
        panUniFonts = commonFonts;
        numFonts = WTF_ARRAY_LENGTH(commonFonts);
    }

    // Font returned from GetFallbackFamily may not cover the character
    // because it's based on script to font mapping. This problem is
    // critical enough for non-Latin scripts (especially Han) to
    // warrant an additional (real coverage) check with fontContainsCharacter.
    //
    // Coverage is tracked explicitly instead of being inferred from the loop
    // index: the index alone cannot distinguish "the last font covers the
    // character" from "no font covers it".
    bool foundCoveringFont = data && fontContainsCharacter(data, family, fallbackCharacter);
    for (int i = 0; !foundCoveringFont && i < numFonts; ++i) {
        family = panUniFonts[i];
        data = getFontPlatformData(fontDescription, AtomicString(family, wcslen(family)));
        foundCoveringFont = data && fontContainsCharacter(data, family, fallbackCharacter);
    }

    if (foundCoveringFont)
        return fontDataFromPlatformData(data, DoNotRetain);

    return 0;

}
Пример #14
0
// Parses a compact textual description of a UTF-16 string into |buf|.
//
// src is of the form "U+1F431 | 'h' 'i'": space-separated items, each either a
// code point in U+XXXX hex syntax (4 to 6 hex digits, at most 0x10FFFF) or a
// single ASCII character in single quotes. Position of "|" gets saved to offset
// if non-null. Size is returned in an out parameter because gtest needs a void
// return for ASSERT to work.
//
// buf / buf_size: output buffer and its capacity in UTF-16 code units.
// result_size:    receives the number of code units written; must be non-null.
// offset:         receives the output index at which "|" appeared. If it is
//                 non-null, src must contain exactly one "|"; if it is null,
//                 src must contain none.
//
// Any violation of the syntax or of the buffer capacity is fatal
// (LOG_ALWAYS_FATAL / LOG_ALWAYS_FATAL_IF).
void ParseUnicode(uint16_t* buf,
                  size_t buf_size,
                  const char* src,
                  size_t* result_size,
                  size_t* offset) {
  size_t input_ix = 0;
  size_t output_ix = 0;
  bool seen_offset = false;

  while (src[input_ix] != 0) {
    switch (src[input_ix]) {
      case '\'':
        // single ASCII char: step over the opening quote first, then validate
        // the quoted character itself (not the quote) before storing it, so a
        // byte >= 0x80 can never be sign-extended into the output.
        input_ix++;
        LOG_ALWAYS_FATAL_IF(src[input_ix] == 0);
        LOG_ALWAYS_FATAL_IF(static_cast<uint8_t>(src[input_ix]) >= 0x80);
        LOG_ALWAYS_FATAL_IF(output_ix >= buf_size);
        buf[output_ix++] = (uint16_t)src[input_ix++];
        LOG_ALWAYS_FATAL_IF(src[input_ix] != '\'');
        input_ix++;
        break;
      case 'u':
      case 'U': {
        // Unicode codepoint in hex syntax
        input_ix++;
        LOG_ALWAYS_FATAL_IF(src[input_ix] != '+');
        input_ix++;
        // strtoul() wants a non-const end pointer; src itself is never written.
        char* endptr = const_cast<char*>(src) + input_ix;
        unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16);
        size_t num_hex_digits = endptr - (src + input_ix);

        // also triggers on invalid number syntax, digits = 0
        LOG_ALWAYS_FATAL_IF(num_hex_digits < 4u);
        LOG_ALWAYS_FATAL_IF(num_hex_digits > 6u);
        LOG_ALWAYS_FATAL_IF(codepoint > 0x10FFFFu);
        input_ix += num_hex_digits;
        if (U16_LENGTH(codepoint) == 1) {
          LOG_ALWAYS_FATAL_IF(output_ix + 1 > buf_size);
          buf[output_ix++] = codepoint;
        } else {
          // UTF-16 encoding
          LOG_ALWAYS_FATAL_IF(output_ix + 2 > buf_size);
          buf[output_ix++] = U16_LEAD(codepoint);
          buf[output_ix++] = U16_TRAIL(codepoint);
        }
        break;
      }
      case ' ':
        input_ix++;
        break;
      case '|':
        LOG_ALWAYS_FATAL_IF(seen_offset);
        LOG_ALWAYS_FATAL_IF(offset == nullptr);
        *offset = output_ix;
        seen_offset = true;
        input_ix++;
        break;
      default:
        LOG_ALWAYS_FATAL("Unexpected Character");
    }
  }
  LOG_ALWAYS_FATAL_IF(result_size == nullptr);
  *result_size = output_ix;
  LOG_ALWAYS_FATAL_IF(!seen_offset && offset != nullptr);
}
Пример #15
0
extern void
storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length,
             UStringPrepType type, UErrorCode* status){
    
 
    UChar* map = NULL;
    int16_t adjustedLen=0, i;
    uint16_t trieWord = 0;
    ValueStruct *value = NULL;
    uint32_t savedTrieWord = 0;

    /* initialize the hashtable */
    if(hashTable==NULL){
        hashTable = uhash_open(hashEntry, compareEntries, NULL, status);
        uhash_setValueDeleter(hashTable, valueDeleter);
    }
    
    /* figure out if the code point has type already stored */
    savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL);
    if(savedTrieWord!=0){
        if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){
            /* turn on the first bit in trie word */
            trieWord += 0x01;
        }else{
            /* 
             * the codepoint has value something other than prohibited
             * and a mapping .. error! 
             */
            fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint);
            exit(U_ILLEGAL_ARGUMENT_ERROR); 
        } 
    }

    /* figure out the real length */ 
    for(i=0; i<length; i++){
        if(mapping[i] > 0xFFFF){
            adjustedLen +=2;
        }else{
            adjustedLen++;
        }      
    }

    if(adjustedLen == 0){
        trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2);
        /* make sure that the value of trieWord is less than the threshold */
        if(trieWord < _SPREP_TYPE_THRESHOLD){   
            /* now set the value in the trie */
            if(!utrie_set32(sprepTrie,codepoint,trieWord)){
                fprintf(stderr,"Could not set the value for code point.\n");
                exit(U_ILLEGAL_ARGUMENT_ERROR);   
            }
            /* value is set so just return */
            return;
        }else{
            fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
            exit(U_ILLEGAL_CHAR_FOUND);
        }
    }

    if(adjustedLen == 1){
        /* calculate the delta */
        int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]);
        if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){

            trieWord = delta << 2;


            /* make sure that the second bit is OFF */
            if((trieWord & 0x02) != 0 ){
                fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n");
                exit(U_INTERNAL_PROGRAM_ERROR);
            }
            /* make sure that the value of trieWord is less than the threshold */
            if(trieWord < _SPREP_TYPE_THRESHOLD){   
                /* now set the value in the trie */
                if(!utrie_set32(sprepTrie,codepoint,trieWord)){
                    fprintf(stderr,"Could not set the value for code point.\n");
                    exit(U_ILLEGAL_ARGUMENT_ERROR);   
                }
                /* value is set so just return */
                return;
            }
        }
        /* 
         * if the delta is not in the given range or if the trieWord is larger than the threshold
         * just fall through for storing the mapping in the mapping table
         */
    }

    map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR);
    i=0;
    
    while(i<length){
        if(mapping[i] <= 0xFFFF){
            map[i] = (uint16_t)mapping[i];
        }else{
            map[i]   = U16_LEAD(mapping[i]);
            map[i+1] = U16_TRAIL(mapping[i]);
        }
        i++;
    }
    
    value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct));
    value->mapping = map;
    value->type   = type;
    value->length  = adjustedLen;
    if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){
        mappingDataCapacity++;
    }
    if(maxLength < value->length){
        maxLength = value->length;
    }
    uhash_iput(hashTable,codepoint,value,status);
    mappingDataCapacity += adjustedLen;

    if(U_FAILURE(*status)){
        fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status));
        exit(*status);
    }
}
Пример #16
0
unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
{
    if (!data)
        return 0;

    StringHasher stringHasher;
    dataLength = 0;
    utf16Length = 0;

    while (data < dataEnd || (!dataEnd && *data)) {
        if (isASCII(*data)) {
            stringHasher.addCharacter(*data++);
            dataLength++;
            utf16Length++;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
        dataLength += utf8SequenceLength;

        if (!dataEnd) {
            for (int i = 1; i < utf8SequenceLength; ++i) {
                if (!data[i])
                    return 0;
            }
        } else if (dataEnd - data < utf8SequenceLength) {
            return 0;
        }

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
            return 0;

        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return 0;
            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
            utf16Length++;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));
            utf16Length += 2;
        } else {
            return 0;
        }
    }

    return stringHasher.hashWithTop8BitsMasked();
}
Пример #17
0
/*
 * Decodes the Punycode string src into UTF-16 in dest.
 *
 * src, srcLength:      input code units; srcLength==-1 means NUL-terminated.
 * dest, destCapacity:  output buffer; dest may be NULL only if destCapacity is 0.
 * caseFlags:           optional; when non-NULL, receives one flag per output
 *                      code unit (see the assignments below).
 * pErrorCode:          in/out ICU error code; nothing is done if it already
 *                      indicates a failure.
 *
 * Returns the result of u_terminateUChars() for the computed output length.
 * When the output does not fit into destCapacity, code points are no longer
 * written but the length keeps being counted.
 * Returns 0 with *pErrorCode set on error:
 *   U_ILLEGAL_ARGUMENT_ERROR  bad arguments;
 *   U_INVALID_CHAR_FOUND      a non-basic code unit before the last delimiter,
 *                             or an invalid digit after it;
 *   U_ILLEGAL_CHAR_FOUND      truncated input, integer overflow, or a decoded
 *                             value that is a surrogate or above 0x10ffff.
 */
U_CFUNC int32_t
u_strFromPunycode(const UChar *src, int32_t srcLength,
                  UChar *dest, int32_t destCapacity,
                  UBool *caseFlags,
                  UErrorCode *pErrorCode) {
    int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
            destCPCount, firstSupplementaryIndex, cpLength;
    UChar b;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(srcLength==-1) {
        srcLength=u_strlen(src);
    }

    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src[--j]==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;
    U_ASSERT(destLength>=0);

    /* copy (and validate) the basic code points, back to front */
    while(j>0) {
        b=src[--j];
        if(!IS_BASIC(b)) {
            *pErrorCode=U_INVALID_CHAR_FOUND;
            return 0;
        }

        /* only write what fits; the length is still counted in full */
        if(j<destCapacity) {
            dest[j]=(UChar)b;

            if(caseFlags!=NULL) {
                caseFlags[j]=IS_BASIC_UPPERCASE(b);
            }
        }
    }

    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    /* larger than any index compared against it: "no supplementary code point yet" */
    firstSupplementaryIndex=1000000000;

    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i.  The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            /*
             * NOTE(review): the (uint8_t) cast keeps only the low 8 bits of
             * the UChar, so a code unit above 0xff is looked up by its low
             * byte -- confirm that this is intended.
             */
            digit=basicToDigit[(uint8_t)src[in++]];
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }

            i+=digit*w;
            /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(t>TMAX) {
                t=TMAX;
            }
            */
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            /* a digit below the threshold t ends this variable-length integer */
            if(digit<t) {
                break;
            }

            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                return 0;
            }
            w*=BASE-t;
        }

        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));

        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

        if(n>0x10ffff || U_IS_SURROGATE(n)) {
            /* Unicode code point overflow */
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0;
        }

        /* Insert n at position i of the output: */
        cpLength=U16_LENGTH(n);
        /* skipped entirely when the result would not fit; destLength still grows below */
        if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
            int32_t codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                /* only BMP code points before i: code point index == code unit index */
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                /* walk forward from the first supplementary code point to code point i */
                codeUnitIndex=firstSupplementaryIndex;
                U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                /* make room: shift the tail (and its case flags) up by cpLength units */
                uprv_memmove(dest+codeUnitIndex+cpLength,
                             dest+codeUnitIndex,
                             (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
                if(caseFlags!=NULL) {
                    uprv_memmove(caseFlags+codeUnitIndex+cpLength,
                                 caseFlags+codeUnitIndex,
                                 destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(UChar)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=U16_LEAD(n);
                dest[codeUnitIndex+1]=U16_TRAIL(n);
            }
            if(caseFlags!=NULL) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=FALSE;
                }
            }
        }
        destLength+=cpLength;
        U_ASSERT(destLength>=0);
        ++i;
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
Пример #18
0
// Returns the glyph and font data used to draw code point |c| with this font.
//
// c: the code point to look up. It is replaced by its uppercase form when small
//    caps apply, and by its mirrored form when |mirror| is set, before any lookup.
// mirror: when true, mirroredChar(c) is looked up instead of c.
// variant: requested font variant. AutoVariant is resolved here to SmallCapsVariant
//    if the font description enables small caps and uppercasing changes |c|, and to
//    NormalVariant otherwise.
//
// The glyph page tree is walked one fallback font at a time for the page holding
// |c|; the node reached is stored back into m_fontList after every step. If the
// walk arrives at the system fallback node without finding font data, the font
// cache is asked for a font covering the character, and if that fails as well the
// primary font's missing glyph is returned.
GlyphData Font::glyphDataForCharacter(UChar32 c, bool mirror, FontDataVariant variant) const
{
    ASSERT(isMainThread());

    // Turn AutoVariant into a concrete variant.
    if (variant == AutoVariant) {
        variant = NormalVariant;
        if (m_fontDescription.smallCaps()) {
            UChar32 upperCased = toUpper(c);
            if (upperCased != c) {
                c = upperCased;
                variant = SmallCapsVariant;
            }
        }
    }

    if (mirror)
        c = mirroredChar(c);

    unsigned pageIndex = c / GlyphPage::size;

    // Resume from the node stored for this page by an earlier lookup, if there is one.
    GlyphPageTreeNode* treeNode;
    if (pageIndex)
        treeNode = m_fontList->m_pages.get(pageIndex);
    else
        treeNode = m_fontList->m_pageZero;

    if (!treeNode) {
        treeNode = GlyphPageTreeNode::getRootChild(fontDataAt(0), pageIndex);
        if (!pageIndex)
            m_fontList->m_pageZero = treeNode;
        else
            m_fontList->m_pages.set(pageIndex, treeNode);
    }

    GlyphPage* nodePage;
    if (variant == NormalVariant) {
        // Common case: no variant font involved.
        for (;;) {
            nodePage = treeNode->page();
            if (nodePage) {
                GlyphData found = nodePage->glyphDataForCharacter(c);
                if (found.fontData) {
                    // Horizontal fonts and text-orientation fallback fonts are returned as they are.
                    if (found.fontData->platformData().orientation() == Horizontal || found.fontData->isTextOrientationFallback())
                        return found;

                    if (!isCJKIdeographOrSymbol(c)) {
                        if (m_fontDescription.textOrientation() == TextOrientationVerticalRight) {
                            const SimpleFontData* rotatedFont = found.fontData->verticalRightOrientationFontData();
                            const GlyphPage* rotatedPage = GlyphPageTreeNode::getRootChild(rotatedFont, pageIndex)->page();
                            if (rotatedPage) {
                                GlyphData rotatedData = rotatedPage->glyphDataForCharacter(c);
                                // Differing glyphs are taken to mean the font ships its own
                                // vertical-right glyph, so the original data is kept.
                                if (found.glyph != rotatedData.glyph)
                                    return found;
                                // Same glyph in both: use the horizontal glyph.
                                if (rotatedData.fontData)
                                    return rotatedData;
                            }
                        } else if (m_fontDescription.textOrientation() == TextOrientationUpright) {
                            const SimpleFontData* uprightFont = found.fontData->uprightOrientationFontData();
                            const GlyphPage* uprightPage = GlyphPageTreeNode::getRootChild(uprightFont, pageIndex)->page();
                            if (uprightPage) {
                                GlyphData uprightData = uprightPage->glyphDataForCharacter(c);
                                // Same glyph in both: the horizontal glyph can simply be rotated to stand upright.
                                if (found.glyph == uprightData.glyph)
                                    return found;
                                // Differing glyphs mean a vertical-right glyph is baked into the font;
                                // it cannot be used, so fall back to the upright data.
                                if (uprightData.fontData)
                                    return uprightData;
                            }
                        }

                        // None of the lookups above is expected to come up empty.
                        ASSERT_NOT_REACHED();
                    } else if (!found.fontData->hasVerticalGlyphs()) {
                        // No vertical glyphs: switch to the broken ideograph font data, which uses
                        // the horizontal glyph width so that the glyph still occupies a square
                        // (also for symbols used as punctuation).
                        const SimpleFontData* brokenIdeographFont = found.fontData->brokenIdeographFontData();
                        const GlyphPage* brokenIdeographPage = GlyphPageTreeNode::getRootChild(brokenIdeographFont, pageIndex)->page();
                        if (brokenIdeographPage) {
                            GlyphData brokenIdeographData = brokenIdeographPage->glyphDataForCharacter(c);
                            if (brokenIdeographData.fontData)
                                return brokenIdeographData;
                        }

                        // The broken ideograph lookup is not expected to fail.
                        ASSERT_NOT_REACHED();
                    }
                    return found;
                }

                if (treeNode->isSystemFallback())
                    break;
            }

            // Descend to the next font in the fallback list and remember where we are.
            treeNode = treeNode->getChild(fontDataAt(treeNode->level()), pageIndex);
            if (!pageIndex)
                m_fontList->m_pageZero = treeNode;
            else
                m_fontList->m_pages.set(pageIndex, treeNode);
        }
    } else {
        for (;;) {
            nodePage = treeNode->page();
            if (nodePage) {
                GlyphData baseData = nodePage->glyphDataForCharacter(c);
                if (baseData.fontData) {
                    // variantFontData() is not normally expected to return 0; if it does,
                    // the glyph from the base font is rendered instead.
                    const SimpleFontData* variantFont = baseData.fontData->variantFontData(m_fontDescription, variant);
                    if (!variantFont)
                        return baseData;

                    const GlyphPage* variantPage = GlyphPageTreeNode::getRootChild(variantFont, pageIndex)->page();
                    if (variantPage) {
                        GlyphData variantData = variantPage->glyphDataForCharacter(c);
                        if (variantData.fontData)
                            return variantData;
                    }

                    // The base font has the character but its variant font does not. This is
                    // very unlikely, and system fallback is deliberately not attempted for it.
                    return variantFont->missingGlyphData();
                }

                if (treeNode->isSystemFallback())
                    break;
            }

            // Descend to the next font in the fallback list and remember where we are.
            treeNode = treeNode->getChild(fontDataAt(treeNode->level()), pageIndex);
            if (!pageIndex)
                m_fontList->m_pageZero = treeNode;
            else
                m_fontList->m_pages.set(pageIndex, treeNode);
        }
    }

    ASSERT(nodePage);
    ASSERT(treeNode->isSystemFallback());

    // The system fallback page has no entry for this character yet, and system
    // fallback depends on the character. Encode it as UTF-16 and ask the font
    // cache for a font that covers it.
    UChar utf16[2];
    int utf16Length;
    if (c > 0xFFFF) {
        utf16[0] = U16_LEAD(c);
        utf16[1] = U16_TRAIL(c);
        utf16Length = 2;
    } else {
        utf16[0] = Font::normalizeSpaces(c);
        utf16Length = 1;
    }

    const SimpleFontData* systemFont = fontCache()->getFontDataForCharacters(*this, utf16, utf16Length);
    if (variant != NormalVariant && systemFont)
        systemFont = systemFont->variantFontData(m_fontDescription, variant);

    if (!systemFont) {
        // System fallback found nothing either; hand back the primary font's missing glyph.
        // FIXME: It would be nicer to use the missing glyph from the last resort font instead.
        GlyphData missing = primaryFont()->missingGlyphData();
        if (variant == NormalVariant) {
#if OS(WINCE)
            // On WINCE the glyph is always set to the character itself and GDI is left to
            // resolve it (see the explanation in the branch below).
            nodePage->setGlyphDataForCharacter(c, c, missing.fontData);
            return nodePage->glyphDataForCharacter(c);
#else
            nodePage->setGlyphDataForCharacter(c, missing.glyph, missing.fontData);
#endif
        }
        return missing;
    }

    // A fallback font was found; fetch its glyph for the character.
    GlyphPage* systemFontPage = GlyphPageTreeNode::getRootChild(systemFont, pageIndex)->page();
    bool pageHasCharacter = systemFontPage && systemFontPage->fontDataForCharacter(c);
    GlyphData result = pageHasCharacter ? systemFontPage->glyphDataForCharacter(c) : systemFont->missingGlyphData();
    // Store it in the system fallback page so the next lookup skips system fallback.
    if (variant == NormalVariant) {
#if OS(WINCE)
        // missingGlyphData returns a null character, which GDI cannot display. Also, a font
        // sometimes cannot be mapped for the character on WINCE although GDI can still draw
        // it, probably because the font package is not installed correctly. So the glyph is
        // always set to the character itself and GDI is left to resolve it.
        nodePage->setGlyphDataForCharacter(c, c, systemFont);
        return nodePage->glyphDataForCharacter(c);
#else
        nodePage->setGlyphDataForCharacter(c, result.glyph, result.fontData);
#endif
    }
    return result;
}
Пример #19
0
// Builds m_page, the glyph page this tree node exposes for page |pageNumber|.
//
// fontData: the font this node contributes; null requests the system fallback page.
// pageNumber: index of the GlyphPage::size-character block to populate.
//
// Three cases are handled:
//  - level 1 (child of the root): a "pure" page is filled directly from |fontData|;
//  - deeper levels: the parent's page is combined with |fontData|'s pure page, or one
//    of the two is referenced directly when the other contributes nothing;
//  - null |fontData|: the page starts as a copy of the parent's page, or as an empty
//    mixed-font page when the parent has none.
// m_page can end up null (no glyphs found at level 1, or a null page being referenced).
void GlyphPageTreeNode::initializePage(const FontData* fontData, unsigned pageNumber)
{
    ASSERT(!m_page);

    // This function must not be called for the root of the tree, because that
    // level does not contain any glyphs.
    ASSERT(m_level > 0 && m_parent);

    // The parent's page will be 0 if we are level one or the parent's font data
    // did not contain any glyphs for that page.
    GlyphPage* parentPage = m_parent->page();

    // NULL FontData means we're being asked for the system fallback font.
    if (fontData) {
        if (m_level == 1) {
            // Children of the root hold pure pages. These will cover only one
            // font data's glyphs, and will have glyph index 0 if the font data does not
            // contain the glyph.
            // First code point covered by this page.
            unsigned start = pageNumber * GlyphPage::size;
            // Large enough for the supplementary-plane case below, where every character
            // of the page occupies two UChars (a surrogate pair).
            UChar buffer[GlyphPage::size * 2 + 2];
            unsigned bufferLength;
            unsigned i;

            // Fill in a buffer with the entire "page" of characters that we want to look up glyphs for.
            if (start < 0x10000) {
                // BMP page: one UChar per character.
                bufferLength = GlyphPage::size;
                for (i = 0; i < GlyphPage::size; i++)
                    buffer[i] = start + i;

                // The comparisons below mask off the low bits of a character to get the first
                // code point of the page containing it.
                // NOTE(review): this assumes GlyphPage::size is a power of two — confirm.
                if (start == 0) {
                    // Control characters must not render at all.
                    for (i = 0; i < 0x20; ++i)
                        buffer[i] = zeroWidthSpace;
                    for (i = 0x7F; i < 0xA0; i++)
                        buffer[i] = zeroWidthSpace;
                    buffer[softHyphen] = zeroWidthSpace;

                    // \n, \t, and nonbreaking space must render as a space.
                    buffer[(int)'\n'] = ' ';
                    buffer[(int)'\t'] = ' ';
                    buffer[noBreakSpace] = ' ';
                } else if (start == (leftToRightMark & ~(GlyphPage::size - 1))) {
                    // LRM, RLM, LRE, RLE, ZWNJ, ZWJ, and PDF must not render at all.
                    buffer[leftToRightMark - start] = zeroWidthSpace;
                    buffer[rightToLeftMark - start] = zeroWidthSpace;
                    buffer[leftToRightEmbed - start] = zeroWidthSpace;
                    buffer[rightToLeftEmbed - start] = zeroWidthSpace;
                    buffer[leftToRightOverride - start] = zeroWidthSpace;
                    buffer[rightToLeftOverride - start] = zeroWidthSpace;
                    buffer[zeroWidthNonJoiner - start] = zeroWidthSpace;
                    buffer[zeroWidthJoiner - start] = zeroWidthSpace;
                    buffer[popDirectionalFormatting - start] = zeroWidthSpace;
                } else if (start == (objectReplacementCharacter & ~(GlyphPage::size - 1))) {
                    // Object replacement character must not render at all.
                    buffer[objectReplacementCharacter - start] = zeroWidthSpace;
                } else if (start == (zeroWidthNoBreakSpace & ~(GlyphPage::size - 1))) {
                    // ZWNBS/BOM must not render at all.
                    buffer[zeroWidthNoBreakSpace - start] = zeroWidthSpace;
                }
            } else {
                // Supplementary page: each character is stored as a lead/trail surrogate pair.
                bufferLength = GlyphPage::size * 2;
                for (i = 0; i < GlyphPage::size; i++) {
                    int c = i + start;
                    buffer[i * 2] = U16_LEAD(c);
                    buffer[i * 2 + 1] = U16_TRAIL(c);
                }
            }

            // Now that we have a buffer full of characters, we want to get back an array
            // of glyph indices.  This part involves calling into the platform-specific
            // routine of our glyph map for actually filling in the page with the glyphs.
            // Success is not guaranteed. For example, Times fails to fill page 260, giving glyph data
            // for only 128 out of 256 characters.
            // Set from the result of fill(); when it stays false the page is dropped below.
            bool haveGlyphs;
            if (!fontData->isSegmented()) {
                if (GlyphPage::mayUseMixedFontDataWhenFilling(buffer, bufferLength, static_cast<const SimpleFontData*>(fontData)))
                    m_page = GlyphPage::createForMixedFontData(this);
                else
                    m_page = GlyphPage::createForSingleFontData(this, static_cast<const SimpleFontData*>(fontData));
#if PLATFORM(IOS)
                // FIXME: Times New Roman contains Arabic glyphs, but Core Text doesn't know how to shape them. See <rdar://problem/9823975>.
                // Once we have the fix for <rdar://problem/9823975> then remove this code together with SimpleFontData::shouldNotBeUsedForArabic()
                // in <rdar://problem/12096835>.
                if (pageNumber == 6 && static_cast<const SimpleFontData*>(fontData)->shouldNotBeUsedForArabic())
                    haveGlyphs = false;
                else
#endif
                // On iOS this statement is the else-branch of the check above; elsewhere it runs unconditionally.
                haveGlyphs = fill(m_page.get(), 0, GlyphPage::size, buffer, bufferLength, static_cast<const SimpleFontData*>(fontData));
            } else {
                // Segmented font data: fill the page range by range.
                m_page = GlyphPage::createForMixedFontData(this);
                haveGlyphs = false;

                const SegmentedFontData* segmentedFontData = static_cast<const SegmentedFontData*>(fontData);
                unsigned numRanges = segmentedFontData->numRanges();
                // Set once the page has been cleared (or found not to need clearing).
                bool zeroFilled = false;
                // Created lazily; receives the fill results of every range after the first one that produced glyphs.
                RefPtr<GlyphPage> scratchPage;
                GlyphPage* pageToFill = m_page.get();
                for (unsigned i = 0; i < numRanges; i++) {
                    const FontDataRange& range = segmentedFontData->rangeAt(i);
                    // all this casting is to ensure all the parameters to min and max have the same type,
                    // to avoid ambiguous template parameter errors on Windows
                    // [from, to) is the part of this page covered by the range, as page-relative indices.
                    int from = std::max(0, static_cast<int>(range.from()) - static_cast<int>(start));
                    int to = 1 + std::min(static_cast<int>(range.to()) - static_cast<int>(start), static_cast<int>(GlyphPage::size) - 1);
                    // Skip ranges that do not intersect this page.
                    if (from < static_cast<int>(GlyphPage::size) && to > 0) {
                        // Once an earlier range has produced glyphs, fill into a scratch page and
                        // merge afterwards, so that entries already set in m_page are kept.
                        if (haveGlyphs && !scratchPage) {
                            scratchPage = GlyphPage::createForMixedFontData(this);
                            pageToFill = scratchPage.get();
                        }

                        // Clear the page at most once, and only when the first intersecting
                        // range does not cover the whole page.
                        if (!zeroFilled) {
                            if (from > 0 || to < static_cast<int>(GlyphPage::size)) {
                                for (unsigned i = 0; i < GlyphPage::size; i++)
                                    pageToFill->setGlyphDataForIndex(i, 0, 0);
                            }
                            zeroFilled = true;
                        }
                        // On supplementary pages every character takes two UChars in the buffer,
                        // so the buffer offset and length are doubled there.
                        haveGlyphs |= fill(pageToFill, from, to - from, buffer + from * (start < 0x10000 ? 1 : 2), (to - from) * (start < 0x10000 ? 1 : 2), range.fontData().get());
                        if (scratchPage) {
                            ASSERT_WITH_SECURITY_IMPLICATION(to <=  static_cast<int>(GlyphPage::size));
                            // Copy over only the glyphs for entries that m_page does not have yet.
                            for (int j = from; j < to; j++) {
                                if (!m_page->glyphAt(j) && pageToFill->glyphAt(j))
                                    m_page->setGlyphDataForIndex(j, pageToFill->glyphDataForIndex(j));
                            }
                        }
                    }
                }
            }

            // Nothing was filled in: expose no page at all for this node.
            if (!haveGlyphs)
                m_page = 0;
        } else if (parentPage && parentPage->owner() != m_parent) {
            // The page we're overriding may not be owned by our parent node.
            // This happens when our parent node provides no useful overrides
            // and just copies the pointer to an already-existing page (see
            // below).
            //
            // We want our override to be shared by all nodes that reference
            // that page to avoid duplication, and so standardize on having the
            // page's owner collect all the overrides.  Call getChild on the
            // page owner with the desired font data (this will populate
            // the page) and then reference it.
            m_page = parentPage->owner()->getChild(fontData, pageNumber)->page();
        } else {
            // Get the pure page for the fallback font (at level 1 with no
            // overrides). getRootChild will always create a page if one
            // doesn't exist, but the page doesn't necessarily have glyphs
            // (this pointer may be 0).
            GlyphPage* fallbackPage = getRootChild(fontData, pageNumber)->page();
            if (!parentPage) {
                // When the parent has no glyphs for this page, we can easily
                // override it just by supplying the glyphs from our font.
                m_page = fallbackPage;
            } else if (!fallbackPage) {
                // When our font has no glyphs for this page, we can just reference the
                // parent page.
                m_page = parentPage;
            } else {
                // Combine the parent's glyphs and ours to form a new more complete page.
                m_page = GlyphPage::createForMixedFontData(this);

                // Overlay the parent page on the fallback page. Check if the fallback font
                // has added anything.
                bool newGlyphs = false;
                for (unsigned i = 0; i < GlyphPage::size; i++) {
                    // The parent's glyph wins; the fallback font only fills entries the parent lacks.
                    if (parentPage->glyphAt(i))
                        m_page->setGlyphDataForIndex(i, parentPage->glyphDataForIndex(i));
                    else  if (fallbackPage->glyphAt(i)) {
                        m_page->setGlyphDataForIndex(i, fallbackPage->glyphDataForIndex(i));
                        newGlyphs = true;
                    } else
                        m_page->setGlyphDataForIndex(i, 0, 0);
                }

                if (!newGlyphs)
                    // We didn't override anything, so our override is just the parent page.
                    m_page = parentPage;
            }
        }
    } else {
        // System fallback. Initialized with the parent's page here, as individual
        // entries may use different fonts depending on character. If the Font
        // ever finds it needs a glyph out of the system fallback page, it will
        // ask the system for the best font to use and fill that glyph in for us.
        if (parentPage)
            m_page = parentPage->createCopiedSystemFallbackPage(this);
        else
            m_page = GlyphPage::createForMixedFontData(this);
    }
}
Пример #20
0
std::pair<GlyphData, GlyphPage*> FontGlyphs::glyphDataAndPageForCharacter(const FontDescription& description, UChar32 c, bool mirror, FontDataVariant variant) const
{
    ASSERT(isMainThread());

    if (variant == AutoVariant) {
        if (description.smallCaps() && !primarySimpleFontData(description)->isSVGFont()) {
            UChar32 upperC = u_toupper(c);
            if (upperC != c) {
                c = upperC;
                variant = SmallCapsVariant;
            } else
                variant = NormalVariant;
        } else
            variant = NormalVariant;
    }

    if (mirror)
        c = u_charMirror(c);

    unsigned pageNumber = (c / GlyphPage::size);

    GlyphPageTreeNode* node = pageNumber ? m_pages.get(pageNumber) : m_pageZero;
    if (!node) {
        node = GlyphPageTreeNode::getRootChild(realizeFontDataAt(description, 0), pageNumber);
        if (pageNumber)
            m_pages.set(pageNumber, node);
        else
            m_pageZero = node;
    }

    GlyphPage* page = 0;
    if (variant == NormalVariant) {
        // Fastest loop, for the common case (normal variant).
        while (true) {
            page = node->page();
            if (page) {
                GlyphData data = page->glyphDataForCharacter(c);
                if (data.fontData && (data.fontData->platformData().orientation() == Horizontal || data.fontData->isTextOrientationFallback()))
                    return std::make_pair(data, page);

                if (data.fontData) {
                    if (Font::isCJKIdeographOrSymbol(c)) {
                        if (!data.fontData->hasVerticalGlyphs()) {
                            // Use the broken ideograph font data. The broken ideograph font will use the horizontal width of glyphs
                            // to make sure you get a square (even for broken glyphs like symbols used for punctuation).
                            variant = BrokenIdeographVariant;
                            break;
                        }
#if PLATFORM(COCOA)
                        else if (data.fontData->platformData().syntheticOblique())
                            return glyphDataAndPageForCJKCharacterWithoutSyntheticItalic(c, data, page, pageNumber);
#endif
                    } else
                        return glyphDataAndPageForNonCJKCharacterWithGlyphOrientation(c, description.nonCJKGlyphOrientation(), data, page, pageNumber);

                    return std::make_pair(data, page);
                }

                if (node->isSystemFallback())
                    break;
            }

            node = node->getChild(realizeFontDataAt(description, node->level()), pageNumber);
            if (pageNumber)
                m_pages.set(pageNumber, node);
            else
                m_pageZero = node;
        }
    }
    if (variant != NormalVariant) {
        while (true) {
            page = node->page();
            if (page) {
                GlyphData data = page->glyphDataForCharacter(c);
                if (data.fontData) {
                    // The variantFontData function should not normally return 0.
                    // But if it does, we will just render the capital letter big.
                    RefPtr<SimpleFontData> variantFontData = data.fontData->variantFontData(description, variant);
                    if (!variantFontData)
                        return std::make_pair(data, page);

                    GlyphPageTreeNode* variantNode = GlyphPageTreeNode::getRootChild(variantFontData.get(), pageNumber);
                    GlyphPage* variantPage = variantNode->page();
                    if (variantPage) {
                        GlyphData data = variantPage->glyphDataForCharacter(c);
                        if (data.fontData)
                            return std::make_pair(data, variantPage);
                    }

                    // Do not attempt system fallback off the variantFontData. This is the very unlikely case that
                    // a font has the lowercase character but the small caps font does not have its uppercase version.
                    return std::make_pair(variantFontData->missingGlyphData(), page);
                }

                if (node->isSystemFallback())
                    break;
            }

            node = node->getChild(realizeFontDataAt(description, node->level()), pageNumber);
            if (pageNumber)
                m_pages.set(pageNumber, node);
            else
                m_pageZero = node;
        }
    }

    ASSERT(page);
    ASSERT(node->isSystemFallback());

    // System fallback is character-dependent. When we get here, we
    // know that the character in question isn't in the system fallback
    // font's glyph page. Try to lazily create it here.
    UChar codeUnits[2];
    int codeUnitsLength;
    if (c <= 0xFFFF) {
        codeUnits[0] = Font::normalizeSpaces(c);
        codeUnitsLength = 1;
    } else {
        codeUnits[0] = U16_LEAD(c);
        codeUnits[1] = U16_TRAIL(c);
        codeUnitsLength = 2;
    }
    const SimpleFontData* originalFontData = primaryFontData(description)->fontDataForCharacter(c);
    RefPtr<SimpleFontData> characterFontData = fontCache().systemFallbackForCharacters(description, originalFontData, m_isForPlatformFont, codeUnits, codeUnitsLength);
    if (characterFontData) {
        if (characterFontData->platformData().orientation() == Vertical && !characterFontData->hasVerticalGlyphs() && Font::isCJKIdeographOrSymbol(c))
            variant = BrokenIdeographVariant;
        if (variant != NormalVariant)
            characterFontData = characterFontData->variantFontData(description, variant);
    }
    if (characterFontData) {
        // Got the fallback glyph and font.
        GlyphPage* fallbackPage = GlyphPageTreeNode::getRootChild(characterFontData.get(), pageNumber)->page();
        GlyphData data = fallbackPage && fallbackPage->fontDataForCharacter(c) ? fallbackPage->glyphDataForCharacter(c) : characterFontData->missingGlyphData();
        // Cache it so we don't have to do system fallback again next time.
        if (variant == NormalVariant) {
#if OS(WINCE)
            // missingGlyphData returns a null character, which is not suitable for GDI to display.
            // Also, sometimes we cannot map a font for the character on WINCE, but GDI can still
            // display the character, probably because the font package is not installed correctly.
            // So we just always set the glyph to be same as the character, and let GDI solve it.
            page->setGlyphDataForCharacter(c, c, characterFontData.get());
            characterFontData->setMaxGlyphPageTreeLevel(std::max(characterFontData->maxGlyphPageTreeLevel(), node->level()));
            return std::make_pair(page->glyphDataForCharacter(c), page);
#else
            page->setGlyphDataForCharacter(c, data.glyph, data.fontData);
            data.fontData->setMaxGlyphPageTreeLevel(std::max(data.fontData->maxGlyphPageTreeLevel(), node->level()));
            if (!Font::isCJKIdeographOrSymbol(c) && data.fontData->platformData().orientation() != Horizontal && !data.fontData->isTextOrientationFallback())
                return glyphDataAndPageForNonCJKCharacterWithGlyphOrientation(c, description.nonCJKGlyphOrientation(), data, fallbackPage, pageNumber);
#endif
        }
        return std::make_pair(data, page);
    }

    // Even system fallback can fail; use the missing glyph in that case.
    // FIXME: It would be nicer to use the missing glyph from the last resort font instead.
    GlyphData data = primarySimpleFontData(description)->missingGlyphData();
    if (variant == NormalVariant) {
#if OS(WINCE)
        // See comment about WINCE GDI handling near setGlyphDataForCharacter above.
        page->setGlyphDataForCharacter(c, c, data.fontData);
        data.fontData->setMaxGlyphPageTreeLevel(std::max(data.fontData->maxGlyphPageTreeLevel(), node->level()));
        return std::make_pair(page->glyphDataForCharacter(c), page);
#else
        page->setGlyphDataForCharacter(c, data.glyph, data.fontData);
        data.fontData->setMaxGlyphPageTreeLevel(std::max(data.fontData->maxGlyphPageTreeLevel(), node->level()));
#endif
    }
    return std::make_pair(data, page);
}
Пример #21
0
// Returns the glyph and font data used to draw code point |c| with this font.
//
// c: the code point to look up. It is replaced by its uppercase form when the font
//    description enables small caps and uppercasing changes it, and by its mirrored
//    form when |mirror| is set, before any lookup.
// mirror: when true, mirroredChar(c) is looked up instead of c.
// forceSmallCaps: use the small caps font even if the small caps rule above did
//    not trigger.
//
// The glyph page tree is walked one fallback font at a time for the page holding
// |c|; the node reached is stored back into m_fontList after every step. If the
// walk arrives at the system fallback node without finding font data, the font
// cache is asked for a font covering the character, and if that fails as well the
// primary font's missing glyph is returned.
GlyphData Font::glyphDataForCharacter(UChar32 c, bool mirror, bool forceSmallCaps) const
{
    ASSERT(isMainThread());

    // Small caps are used when forced, or when enabled and |c| has a distinct uppercase form.
    bool wantSmallCaps = forceSmallCaps;
    if (m_fontDescription.smallCaps()) {
        UChar32 upperCased = toUpper(c);
        if (upperCased != c) {
            wantSmallCaps = true;
            c = upperCased;
        }
    }

    if (mirror)
        c = mirroredChar(c);

    unsigned pageIndex = c / GlyphPage::size;

    // Resume from the node stored for this page by an earlier lookup, if there is one.
    GlyphPageTreeNode* treeNode;
    if (pageIndex)
        treeNode = m_fontList->m_pages.get(pageIndex);
    else
        treeNode = m_fontList->m_pageZero;

    if (!treeNode) {
        treeNode = GlyphPageTreeNode::getRootChild(fontDataAt(0), pageIndex);
        if (!pageIndex)
            m_fontList->m_pageZero = treeNode;
        else
            m_fontList->m_pages.set(pageIndex, treeNode);
    }

    GlyphPage* nodePage;
    if (wantSmallCaps) {
        for (;;) {
            nodePage = treeNode->page();
            if (nodePage) {
                GlyphData baseData = nodePage->glyphDataForCharacter(c);
                if (baseData.fontData) {
                    // smallCapsFontData() is not normally expected to return 0; if it does,
                    // the capital letter is simply rendered at full size.
                    const SimpleFontData* smallCapsFont = baseData.fontData->smallCapsFontData(m_fontDescription);
                    if (!smallCapsFont)
                        return baseData;

                    const GlyphPage* smallCapsPage = GlyphPageTreeNode::getRootChild(smallCapsFont, pageIndex)->page();
                    if (smallCapsPage) {
                        GlyphData smallCapsData = smallCapsPage->glyphDataForCharacter(c);
                        if (smallCapsData.fontData)
                            return smallCapsData;
                    }

                    // The font has the lowercase character but its small caps font lacks the
                    // uppercase one. This is very unlikely, and system fallback is deliberately
                    // not attempted for it.
                    return smallCapsFont->missingGlyphData();
                }

                if (treeNode->isSystemFallback())
                    break;
            }

            // Descend to the next font in the fallback list and remember where we are.
            treeNode = treeNode->getChild(fontDataAt(treeNode->level()), pageIndex);
            if (!pageIndex)
                m_fontList->m_pageZero = treeNode;
            else
                m_fontList->m_pages.set(pageIndex, treeNode);
        }
    } else {
        // Common case (no small caps): the first page with font data for |c| wins.
        for (;;) {
            nodePage = treeNode->page();
            if (nodePage) {
                GlyphData found = nodePage->glyphDataForCharacter(c);
                if (found.fontData)
                    return found;
                if (treeNode->isSystemFallback())
                    break;
            }

            // Descend to the next font in the fallback list and remember where we are.
            treeNode = treeNode->getChild(fontDataAt(treeNode->level()), pageIndex);
            if (!pageIndex)
                m_fontList->m_pageZero = treeNode;
            else
                m_fontList->m_pages.set(pageIndex, treeNode);
        }
    }

    ASSERT(nodePage);
    ASSERT(treeNode->isSystemFallback());

    // The system fallback page has no entry for this character yet, and system
    // fallback depends on the character. Encode it as UTF-16 and ask the font
    // cache for a font that covers it.
    UChar utf16[2];
    int utf16Length;
    if (c > 0xFFFF) {
        utf16[0] = U16_LEAD(c);
        utf16[1] = U16_TRAIL(c);
        utf16Length = 2;
    } else {
        utf16[0] = Font::normalizeSpaces(c);
        utf16Length = 1;
    }

    const SimpleFontData* systemFont = fontCache()->getFontDataForCharacters(*this, utf16, utf16Length);
    if (wantSmallCaps && systemFont)
        systemFont = systemFont->smallCapsFontData(m_fontDescription);

    if (!systemFont) {
        // System fallback found nothing either; hand back the primary font's missing glyph.
        // FIXME: It would be nicer to use the missing glyph from the last resort font instead.
        GlyphData missing = primaryFont()->missingGlyphData();
        if (!wantSmallCaps) {
#if OS(WINCE)
            // On WINCE the glyph is always set to the character itself and GDI is left to
            // resolve it (see the explanation in the branch below).
            nodePage->setGlyphDataForCharacter(c, c, missing.fontData);
            return nodePage->glyphDataForCharacter(c);
#else
            nodePage->setGlyphDataForCharacter(c, missing.glyph, missing.fontData);
#endif
        }
        return missing;
    }

    // A fallback font was found; fetch its glyph for the character.
    GlyphPage* systemFontPage = GlyphPageTreeNode::getRootChild(systemFont, pageIndex)->page();
    bool pageHasCharacter = systemFontPage && systemFontPage->fontDataForCharacter(c);
    GlyphData result = pageHasCharacter ? systemFontPage->glyphDataForCharacter(c) : systemFont->missingGlyphData();
    // Store it in the system fallback page so the next lookup skips system fallback.
    if (!wantSmallCaps) {
#if OS(WINCE)
        // missingGlyphData returns a null character, which GDI cannot display. Also, a font
        // sometimes cannot be mapped for the character on WINCE although GDI can still draw
        // it, probably because the font package is not installed correctly. So the glyph is
        // always set to the character itself and GDI is left to resolve it.
        nodePage->setGlyphDataForCharacter(c, c, systemFont);
        return nodePage->glyphDataForCharacter(c);
#else
        nodePage->setGlyphDataForCharacter(c, result.glyph, result.fontData);
#endif
    }
    return result;
}
Пример #22
0
// Looks up the glyph and font data to use for code point |c|, walking this font's fallback
// list and finally asking the font cache for a character-specific (system fallback) font.
//
// Parameters:
//   c              - the code point to look up. If the font description requests small caps
//                    and |c| has a different uppercase form, the uppercase form is looked up
//                    and the small-caps variant of the matching font is used.
//   mirror         - when true, |c| is replaced by mirroredChar(c) before the lookup.
//   forceSmallCaps - when true, the small-caps path is taken even if |c| was not uppercased.
//
// Returns glyph data taken from the first glyph page in the fallback list that has font data
// for the character. If no page has it and the character-specific lookup also fails, the
// primary font's missing glyph data is returned.
//
// Side effects: the position reached in the fallback list is stored per glyph page number
// (m_pages / m_pageZero), and for non-small-caps lookups the outcome of system fallback is
// written into the system fallback page so the lookup is not repeated.
const GlyphData& Font::glyphDataForCharacter(UChar32 c, bool mirror, bool forceSmallCaps) const
{
    bool useSmallCapsFont = forceSmallCaps;
    if (m_fontDescription.smallCaps()) {
        UChar32 upperC = Unicode::toUpper(c);
        if (upperC != c) {
            c = upperC;
            useSmallCapsFont = true;
        }
    }

    if (mirror)
        c = mirroredChar(c);

    unsigned pageNumber = (c / GlyphPage::size);

    // Resume from the fallback node cached for this page number, or start at the root
    // child for the first font in the list when nothing is cached yet.
    GlyphPageTreeNode* node = pageNumber ? m_pages.get(pageNumber) : m_pageZero;
    if (!node) {
        node = GlyphPageTreeNode::getRootChild(fontDataAt(0), pageNumber);
        if (pageNumber)
            m_pages.set(pageNumber, node);
        else
            m_pageZero = node;
    }

    GlyphPage* page;
    if (!useSmallCapsFont) {
        // Fastest loop, for the common case (not small caps).
        while (true) {
            page = node->page();
            if (page) {
                const GlyphData& data = page->glyphDataForCharacter(c);
                if (data.fontData)
                    return data;
                if (node->isSystemFallback())
                    break;
            }

            // Proceed with the fallback list.
            node = node->getChild(fontDataAt(node->level()), pageNumber);
            if (pageNumber)
                m_pages.set(pageNumber, node);
            else
                m_pageZero = node;
        }
    } else {
        while (true) {
            page = node->page();
            if (page) {
                const GlyphData& data = page->glyphDataForCharacter(c);
                if (data.fontData) {
                    // The smallCapsFontData function should not normally return 0.
                    // But if it does, we will just render the capital letter big.
                    const SimpleFontData* smallCapsFontData = data.fontData->smallCapsFontData(m_fontDescription);
                    if (!smallCapsFontData)
                        return data;

                    GlyphPageTreeNode* smallCapsNode = GlyphPageTreeNode::getRootChild(smallCapsFontData, pageNumber);
                    const GlyphPage* smallCapsPage = smallCapsNode->page();
                    if (smallCapsPage) {
                        const GlyphData& smallCapsData = smallCapsPage->glyphDataForCharacter(c);
                        if (smallCapsData.fontData)
                            return smallCapsData;
                    }

                    // Do not attempt system fallback off the smallCapsFontData. This is the very unlikely case that
                    // a font has the lowercase character but the small caps font does not have its uppercase version.
                    return smallCapsFontData->missingGlyphData();
                }

                if (node->isSystemFallback())
                    break;
            }

            // Proceed with the fallback list.
            node = node->getChild(fontDataAt(node->level()), pageNumber);
            if (pageNumber)
                m_pages.set(pageNumber, node);
            else
                m_pageZero = node;
        }
    }

    ASSERT(page);
    ASSERT(node->isSystemFallback());

    // System fallback is character-dependent. When we get here, we
    // know that the character in question isn't in the system fallback
    // font's glyph page. Try to lazily create it here.
    UChar codeUnits[2];
    int codeUnitsLength;
    if (c <= 0xFFFF) {
        // Characters treated as (zero-width) spaces are looked up as the canonical
        // space / zero-width space character rather than as themselves.
        UChar c16 = c;
        if (Font::treatAsSpace(c16))
            codeUnits[0] = ' ';
        else if (Font::treatAsZeroWidthSpace(c16))
            codeUnits[0] = zeroWidthSpace;
        else
            codeUnits[0] = c16;
        codeUnitsLength = 1;
    } else {
        // Outside the BMP: pass the character as a UTF-16 surrogate pair.
        codeUnits[0] = U16_LEAD(c);
        codeUnits[1] = U16_TRAIL(c);
        codeUnitsLength = 2;
    }
    const SimpleFontData* characterFontData = FontCache::getFontDataForCharacters(*this, codeUnits, codeUnitsLength);
    // The font cache lookup can fail and return null; only ask for the small-caps
    // variant when there is a font to ask, otherwise fall through to the missing glyph.
    if (useSmallCapsFont && characterFontData)
        characterFontData = characterFontData->smallCapsFontData(m_fontDescription);
    if (characterFontData) {
        // Got the fallback glyph and font.
        GlyphPage* fallbackPage = GlyphPageTreeNode::getRootChild(characterFontData, pageNumber)->page();
        const GlyphData& data = fallbackPage && fallbackPage->glyphDataForCharacter(c).fontData ? fallbackPage->glyphDataForCharacter(c) : characterFontData->missingGlyphData();
        // Cache it so we don't have to do system fallback again next time.
        if (!useSmallCapsFont)
            page->setGlyphDataForCharacter(c, data.glyph, data.fontData);
        return data;
    }

    // Even system fallback can fail; use the missing glyph in that case.
    // FIXME: It would be nicer to use the missing glyph from the last resort font instead.
    const GlyphData& data = primaryFont()->missingGlyphData();
    if (!useSmallCapsFont)
        page->setGlyphDataForCharacter(c, data.glyph, data.fontData);
    return data;
}
Пример #23
0
// Converts UTF-8 bytes from [*sourceStart, sourceEnd) into UTF-16 code units written to
// [*targetStart, targetEnd).
//
// On return, *sourceStart and *targetStart are updated to the positions where conversion
// stopped. If sourceAllASCII is non-null, it receives true only when no code unit written
// had a bit set above 0x7f.
//
// Returns:
//   conversionOK    - the whole source range was consumed.
//   sourceExhausted - the source ends in the middle of a multi-byte sequence.
//   sourceIllegal   - a sequence failed isLegalUTF8(), or (strict mode only) it decoded to
//                     a surrogate or to a value that is neither BMP nor supplementary.
//   targetExhausted - there was no room left in the target for the next character.
//
// When strict is false, surrogates and out-of-range values are written as
// replacementCharacter instead of stopping the conversion.
ConversionResult convertUTF8ToUTF16(
    const char** sourceStart, const char* sourceEnd,
    UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
{
    const char* in = *sourceStart;
    UChar* out = *targetStart;
    UChar seenBits = 0; // Bitwise OR of everything written; feeds the all-ASCII report.
    ConversionResult status = conversionOK;

    while (in < sourceEnd) {
        int sequenceLength = inlineUTF8SequenceLength(*in);

        // The lead byte promises more bytes than the source still has.
        if (sourceEnd - in < sequenceLength) {
            status = sourceExhausted;
            break;
        }

        // Legality is verified in both lenient and strict mode.
        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(in), sequenceLength)) {
            status = sourceIllegal;
            break;
        }

        const UChar32 codePoint = readUTF8Sequence(in, sequenceLength);

        // No room for even a single code unit: rewind to the start of this sequence.
        if (out >= targetEnd) {
            in -= sequenceLength;
            status = targetExhausted;
            break;
        }

        if (U_IS_SUPPLEMENTARY(codePoint)) {
            // Code points above the BMP take two code units (a surrogate pair).
            if (out + 1 >= targetEnd) {
                in -= sequenceLength;
                status = targetExhausted;
                break;
            }
            *out++ = U16_LEAD(codePoint);
            *out++ = U16_TRAIL(codePoint);
            seenBits = 0xffff;
            continue;
        }

        if (U_IS_BMP(codePoint) && !U_IS_SURROGATE(codePoint)) {
            // The ordinary case: one BMP character, one code unit.
            *out++ = codePoint;
            seenBits |= codePoint;
            continue;
        }

        // What remains is either a surrogate value or a value that is neither BMP nor
        // supplementary. Both are rejected in strict mode and replaced otherwise.
        if (strict) {
            in -= sequenceLength; // Point back at the offending sequence.
            status = sourceIllegal;
            break;
        }
        *out++ = replacementCharacter;
        seenBits |= replacementCharacter;
    }

    *sourceStart = in;
    *targetStart = out;

    if (sourceAllASCII)
        *sourceAllASCII = !(seenBits & ~0x7f);

    return status;
}
Пример #24
0
 * then the buckets will be off.
 * There are hacks in the code to handle the known CJK tailorings of U+4E00.
 *
 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
 *
 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
 */
static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] =  {
    0x41, 0, 0x03B1, 0,
    0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
    0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
    0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
    0xAAF2, 0,  // Meetei Mayek
    0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
    U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0,  // Sharada
    U16_LEAD(0x11680), U16_TRAIL(0x11680), 0,  // Takri
    0x1B83, 0,
    0xD802, 0xDE00, 0, 0x0E01, 0,
    0x0EDE, 0,  // Lao
    0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
    0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
    U16_LEAD(0x11103), U16_TRAIL(0x11103), 0,  // Chakma
    0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
    0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
    0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
    U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0,  // Miao
    0xD800, 0xDE80, 0,
    0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
    0xD801, 0xDC80, 0,
    U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0,  // Sora Sompeng