Ejemplo n.º 1
0
/*
 * Three-way comparison of the text exposed by two UCharIterators.
 *
 * Both iterators are moved to their start before comparing.
 * The result is 0 if the texts are equal, otherwise the difference of the
 * first pair of differing code units, adjusted so that it reflects
 * code point order when codePointOrder is set.
 * NULL iterators, or the same iterator passed twice, yield 0.
 * Where the iterators are positioned on return is unspecified.
 */
U_CAPI int32_t U_EXPORT2
u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
    UChar32 unit1, unit2;

    /* bad arguments and identical iterators both compare as "equal" */
    if(iter1==NULL || iter2==NULL || iter1==iter2) {
        return 0;
    }

    /* always compare from the beginning of both texts */
    iter1->move(iter1, 0, UITER_START);
    iter2->move(iter2, 0, UITER_START);

    /* walk over the common prefix; it never needs any fix-up */
    do {
        unit1=iter1->next(iter1);
        unit2=iter2->next(iter2);
        if(unit1==unit2 && unit1==-1) {
            /* both texts ended without a difference */
            return 0;
        }
    } while(unit1==unit2);

    /* code point order only matters when both units are at or above the surrogate range */
    if(codePointOrder && unit1>=0xd800 && unit2>=0xd800) {
        /*
         * A unit that is not half of a complete surrogate pair is a BMP code point
         * (possibly an unpaired surrogate): shift it below d800 so that it sorts
         * before every supplementary code point.
         */
        if(!(
            (unit1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
            (U16_IS_TRAIL(unit1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
        )) {
            unit1-=0x2800;
        }

        if(!(
            (unit2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
            (U16_IS_TRAIL(unit2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
        )) {
            unit2-=0x2800;
        }
    }

    /* the adjusted values are now in the requested order */
    return (int32_t)unit1-(int32_t)unit2;
}
Ejemplo n.º 2
0
/*
 * Test if a substring match inside a string is at code point boundaries.
 * All pointers refer to the same buffer.
 * The limit pointer may be NULL, all others must be real pointers.
 *
 * start      - first code unit of the searched buffer
 * match      - first code unit of the match
 * matchLimit - one past the last code unit of the match
 * limit      - one past the last code unit of the buffer, or NULL
 *              (NOTE(review): NULL presumably means the buffer is NUL-terminated,
 *              so *matchLimit is then at worst the terminator - confirm with callers)
 *
 * Returns FALSE if either edge of the match falls between the two halves of a
 * surrogate pair, TRUE otherwise.
 */
static U_INLINE UBool
isMatchAtCPBoundary(const UChar* start, const UChar* match, const UChar* matchLimit, const UChar* limit) {
    if (U16_IS_TRAIL(*match) && start != match && U16_IS_LEAD(*(match - 1))) {
        /* the leading edge of the match is in the middle of a surrogate pair */
        return FALSE;
    }
    /*
     * Only dereference matchLimit when the match does not end exactly at the
     * buffer limit; comparing match (instead of matchLimit) against limit
     * would read one code unit past the end of the buffer.
     */
    if (U16_IS_LEAD(*(matchLimit - 1)) && matchLimit != limit && U16_IS_TRAIL(*matchLimit)) {
        /* the trailing edge of the match is in the middle of a surrogate pair */
        return FALSE;
    }
    return TRUE;
}
Ejemplo n.º 3
0
unsigned Font::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion)
{
    static bool expandAroundIdeographs = canExpandAroundIdeographsInComplexText();
    unsigned count = 0;
    if (direction == LTR) {
        for (size_t i = 0; i < length; ++i) {
            UChar32 character = characters[i];
            if (treatAsSpace(character)) {
                count++;
                isAfterExpansion = true;
                continue;
            }
            if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) {
                character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]);
                i++;
            }
            if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) {
                if (!isAfterExpansion)
                    count++;
                count++;
                isAfterExpansion = true;
                continue;
            }
            isAfterExpansion = false;
        }
    } else {
        for (size_t i = length; i > 0; --i) {
            UChar32 character = characters[i - 1];
            if (treatAsSpace(character)) {
                count++;
                isAfterExpansion = true;
                continue;
            }
            if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) {
                character = U16_GET_SUPPLEMENTARY(characters[i - 2], character);
                i--;
            }
            if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) {
                if (!isAfterExpansion)
                    count++;
                count++;
                isAfterExpansion = true;
                continue;
            }
            isAfterExpansion = false;
        }
    }
    return count;
}
Ejemplo n.º 4
0
unsigned Character::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion, const TextJustify textJustify)
{
    unsigned count = 0;
    if (direction == LTR) {
        for (size_t i = 0; i < length; ++i) {
            UChar32 character = characters[i];
            if (treatAsSpace(character)) {
                count++;
                isAfterExpansion = true;
                continue;
            }
            if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) {
                character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]);
                i++;
            }
            if (textJustify == TextJustify::TextJustifyAuto && isCJKIdeographOrSymbol(character)) {
                if (!isAfterExpansion)
                    count++;
                count++;
                isAfterExpansion = true;
                continue;
            }
            isAfterExpansion = false;
        }
    } else {
        for (size_t i = length; i > 0; --i) {
            UChar32 character = characters[i - 1];
            if (treatAsSpace(character)) {
                count++;
                isAfterExpansion = true;
                continue;
            }
            if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) {
                character = U16_GET_SUPPLEMENTARY(characters[i - 2], character);
                i--;
            }
            if (textJustify == TextJustify::TextJustifyAuto && isCJKIdeographOrSymbol(character)) {
                if (!isAfterExpansion)
                    count++;
                count++;
                isAfterExpansion = true;
                continue;
            }
            isAfterExpansion = false;
        }
    }
    return count;
}
Ejemplo n.º 5
0
// Looks up glyphs for the UTF-16 text in |buffer| through the platform Skia
// typeface and records every non-zero glyph in |pageToFill| at index
// |offset| + i, for i in [0, length).
//
// Returns true if at least one non-zero glyph was stored. Returns false
// without touching the page when the buffer is empty, when its last code unit
// is a lead (high) surrogate, or when there is no typeface.
bool SimpleFontData::fillGlyphPage(GlyphPage* pageToFill, unsigned offset, unsigned length, UChar* buffer, unsigned bufferLength) const
{
    // bufferLength is unsigned: with an empty buffer, bufferLength - 1 would
    // wrap around and the surrogate check below would read far out of bounds.
    if (!bufferLength)
        return false;

    if (U16_IS_LEAD(buffer[bufferLength-1])) {
        DLOG(ERROR) << "Last UTF-16 code unit is high-surrogate.";
        return false;
    }

    SkTypeface* typeface = platformData().typeface();
    if (!typeface) {
        DLOG(ERROR) << "fillGlyphPage called on an empty Skia typeface.";
        return false;
    }

    SkAutoSTMalloc<GlyphPage::size, uint16_t> glyphStorage(length);
    uint16_t* glyphs = glyphStorage.get();
    typeface->charsToGlyphs(buffer, SkTypeface::kUTF16_Encoding, glyphs, length);

    // A glyph id of 0 is skipped, so that page entry is left as it was.
    bool haveGlyphs = false;
    for (unsigned i = 0; i < length; i++) {
        if (glyphs[i]) {
            pageToFill->setGlyphDataForIndex(offset + i, glyphs[i], this);
            haveGlyphs = true;
        }
    }

    return haveGlyphs;
}
Ejemplo n.º 6
0
static void TestCodeUnitValues()
{
    static uint16_t codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0};
    
    int16_t i;
    for(i=0; i<sizeof(codeunit)/sizeof(codeunit[0]); i++){
        UChar c=codeunit[i];
        log_verbose("Testing code unit value of %x\n", c);
        if(i<4){
            if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c) || !U16_IS_SINGLE(c) || U16_IS_LEAD(c) || U16_IS_TRAIL(c)){
                log_err("ERROR: %x is a single character\n", c);
            }
        }
        if(i >= 4 && i< 8){
            if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c) || !U16_IS_LEAD(c) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c)){
                log_err("ERROR: %x is a first surrogate\n", c);
            }
        }
        if(i >= 8 && i< 12){
            if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || !U16_IS_TRAIL(c) || U16_IS_SINGLE(c) || U16_IS_LEAD(c)){
                log_err("ERROR: %x is a second surrogate\n", c);
            }
        }
    }
}
Ejemplo n.º 7
0
// Looks up glyphs for the UTF-16 text in |buffer| through the platform Skia
// typeface and records every non-zero glyph in |pageToFill| at index
// |offset| + i, for i in [0, length).
//
// Returns true if at least one non-zero glyph was stored. Returns false
// without touching the page when the buffer is empty, when its last code unit
// is a lead (high) surrogate, or when there is no typeface.
bool SimpleFontData::fillGlyphPage(GlyphPage* pageToFill,
                                   unsigned offset,
                                   unsigned length,
                                   UChar* buffer,
                                   unsigned bufferLength) const {
  // bufferLength is unsigned: with an empty buffer, bufferLength - 1 would
  // wrap around and the surrogate check below would read far out of bounds.
  if (!bufferLength)
    return false;

  if (U16_IS_LEAD(buffer[bufferLength - 1])) {
    SkDebugf("%s last char is high-surrogate", __FUNCTION__);
    return false;
  }

  // Bail out before allocating or dereferencing anything when the platform
  // data carries no typeface.
  SkTypeface* typeface = platformData().typeface();
  if (!typeface) {
    SkDebugf("%s called on an empty Skia typeface", __FUNCTION__);
    return false;
  }

  SkAutoSTMalloc<GlyphPage::size, uint16_t> glyphStorage(length);

  uint16_t* glyphs = glyphStorage.get();
  typeface->charsToGlyphs(buffer, SkTypeface::kUTF16_Encoding, glyphs, length);

  // A glyph id of 0 is skipped, so that page entry is left as it was.
  bool haveGlyphs = false;
  for (unsigned i = 0; i < length; i++) {
    if (glyphs[i]) {
      pageToFill->setGlyphDataForIndex(offset + i, glyphs[i], this);
      haveGlyphs = true;
    }
  }

  return haveGlyphs;
}
Ejemplo n.º 8
0
void
MessagePattern::setParseError(UParseError *parseError, int32_t index) {
    if(parseError==NULL) {
        return;
    }
    parseError->offset=index;

    // Set preContext to some of msg before index.
    // Avoid splitting a surrogate pair.
    int32_t length=index;
    if(length>=U_PARSE_CONTEXT_LEN) {
        length=U_PARSE_CONTEXT_LEN-1;
        if(length>0 && U16_IS_TRAIL(msg[index-length])) {
            --length;
        }
    }
    msg.extract(index-length, length, parseError->preContext);
    parseError->preContext[length]=0;

    // Set postContext to some of msg starting at index.
    length=msg.length()-index;
    if(length>=U_PARSE_CONTEXT_LEN) {
        length=U_PARSE_CONTEXT_LEN-1;
        if(length>0 && U16_IS_LEAD(msg[index+length-1])) {
            --length;
        }
    }
    msg.extract(index, length, parseError->postContext);
    parseError->postContext[length]=0;
}
Ejemplo n.º 9
0
// Returns the code point that starts at code unit index |i|: the unit itself
// when it is not a surrogate, the combined supplementary code point when a
// lead surrogate at |i| is followed by a trail surrogate, and 0 for any
// unpaired surrogate. |i| itself is not range-checked here.
UChar32 StringImpl::characterStartingAt(unsigned i)
{
    const UChar32 first = m_data[i];
    if (!U16_IS_SINGLE(first)) {
        // Surrogate code unit: only a complete lead+trail pair yields a value.
        if (i + 1 < m_length && U16_IS_LEAD(first) && U16_IS_TRAIL(m_data[i + 1]))
            return U16_GET_SUPPLEMENTARY(first, m_data[i + 1]);
        return 0;
    }
    return first;
}
Ejemplo n.º 10
0
// Computes the extra spacing to apply for the character at |index| in |run|.
//
// The result is the sum of, where applicable: the letter spacing, the word
// spacing (for space-like characters) and justification expansion obtained
// from nextExpansion(). When an expansion is taken *before* a CJK ideograph
// or symbol, the same amount is also added to |offset|.
// NOTE(review): nextExpansion()/hasExpansion() are defined elsewhere; the
// order in which they are called below presumably matters - confirm before
// reordering anything.
float ShapeResultSpacing::computeSpacing(const TextRun& run,
        size_t index,
        float& offset) {
    UChar32 character = run[index];
    // Space-like characters, except a tab when tabs are allowed.
    bool treatAsSpace =
        (Character::treatAsSpace(character) ||
         (m_normalizeSpace &&
          Character::isNormalizedCanvasSpaceCharacter(character))) &&
        (character != '\t' || !m_allowTabs);
    // Every space-like character other than no-break space is handled as a
    // plain space from here on.
    if (treatAsSpace && character != noBreakSpaceCharacter)
        character = spaceCharacter;

    float spacing = 0;
    if (m_letterSpacing && !Character::treatAsZeroWidthSpace(character))
        spacing += m_letterSpacing;

    // Word spacing is skipped only for a space that is the very first
    // character of the first run, unless that space is a no-break space.
    if (treatAsSpace &&
            (index || !isFirstRun(run) || character == noBreakSpaceCharacter))
        spacing += m_wordSpacing;

    if (!hasExpansion())
        return spacing;

    if (treatAsSpace)
        return spacing + nextExpansion();

    // Ideograph expansion is only considered for 16-bit runs with
    // TextJustifyAuto.
    if (run.is8Bit() || m_textJustify != TextJustify::TextJustifyAuto)
        return spacing;

    // isCJKIdeographOrSymbol() has expansion opportunities both before and
    // after each character.
    // http://www.w3.org/TR/jlreq/#line_adjustment
    // Combine a lead surrogate with the following trail surrogate first, so
    // that the check below sees the full code point.
    if (U16_IS_LEAD(character) && index + 1 < run.length() &&
            U16_IS_TRAIL(run[index + 1]))
        character = U16_GET_SUPPLEMENTARY(character, run[index + 1]);
    if (!Character::isCJKIdeographOrSymbol(character)) {
        m_isAfterExpansion = false;
        return spacing;
    }

    if (!m_isAfterExpansion) {
        // Take the expansion opportunity before this ideograph.
        float expandBefore = nextExpansion();
        if (expandBefore) {
            offset += expandBefore;
            spacing += expandBefore;
        }
        // Nothing may be left for the opportunity after the ideograph.
        if (!hasExpansion())
            return spacing;
    }

    // Take the expansion opportunity after this ideograph.
    return spacing + nextExpansion();
}
Ejemplo n.º 11
0
/*
 * Counts the code points in a UTF-16 string.
 *
 * s      - the string; NULL yields 0
 * length - number of code units, or -1 if s is NUL-terminated;
 *          any value below -1 yields 0
 *
 * A lead surrogate immediately followed by a trail surrogate counts as one
 * code point; every other code unit, including an unpaired surrogate,
 * counts as one code point by itself.
 */
U_CAPI int32_t U_EXPORT2
u_countChar32(const UChar *s, int32_t length) {
    int32_t codePoints=0;

    if(s==NULL || length<-1) {
        return 0;
    }

    if(length==-1) {
        /* NUL-terminated input */
        UChar unit;

        while((unit=*s++)!=0) {
            ++codePoints;

            /*
             * One unit of look-ahead suffices for UTF-16, and it is safe:
             * at worst it inspects the terminating NUL.
             */
            if(U16_IS_LEAD(unit) && U16_IS_TRAIL(*s)) {
                ++s;
            }
        }
    } else {
        /* explicit length */
        const UChar *limit=s+length;

        while(s<limit) {
            ++codePoints;
            /* consume a whole pair only if its trail lies inside the string */
            if(U16_IS_LEAD(*s) && (limit-s)>=2 && U16_IS_TRAIL(s[1])) {
                s+=2;
            } else {
                ++s;
            }
        }
    }
    return codePoints;
}
Ejemplo n.º 12
0
// Returns the code point before the current position and moves backward
// over it. Returns U_SENTINEL when iter.previous() yields a negative value
// in the backward-checking state, or when previousSegment() fails.
// Which branch handles the request depends on `state`; when no branch
// applies, switchToBackward() is called and the loop retries.
UChar32
FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state == ITER_CHECK_BWD) {
            c = iter.previous(&iter);
            if(c < 0) {
                // Nothing before the current position.
                start = pos = 0;
                state = ITER_IN_FCD_SEGMENT;
                return U_SENTINEL;
            }
            if(CollationFCD::hasLccc(c)) {
                // prev stays U_SENTINEL if the || below short-circuits,
                // i.e. if no second unit was read from the iterator.
                UChar32 prev = U_SENTINEL;
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        CollationFCD::hasTccc(prev = iter.previous(&iter))) {
                    // Undo the read of c, and of prev if one was actually read,
                    // then let previousSegment() handle the text from here.
                    iter.next(&iter);
                    if(prev >= 0) {
                        iter.next(&iter);
                    }
                    if(!previousSegment(errorCode)) {
                        return U_SENTINEL;
                    }
                    continue;
                }
                // hasLccc(trail)=true for all trail surrogates
                if(U16_IS_TRAIL(c)) {
                    if(prev < 0) {
                        prev = iter.previous(&iter);
                    }
                    if(U16_IS_LEAD(prev)) {
                        // The iterator now stands before the whole surrogate pair.
                        return U16_GET_SUPPLEMENTARY(prev, c);
                    }
                }
                // prev is not part of the result: step forward over it again.
                if(prev >= 0) {
                    iter.next(&iter);
                }
            }
            return c;
        } else if(state == ITER_IN_FCD_SEGMENT && pos != start) {
            // Read a whole code point from the iterator and track its length in pos.
            c = uiter_previous32(&iter);
            pos -= U16_LENGTH(c);
            U_ASSERT(c >= 0);
            return c;
        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
            // Read from the `normalized` string instead of the iterator.
            c = normalized.char32At(pos - 1);
            pos -= U16_LENGTH(c);
            return c;
        } else {
            switchToBackward();
        }
    }
}
Ejemplo n.º 13
0
        //
        // Check if a UTF16 string is valid according to the UTF16 standard
        // Specifically, check that we don't have any invalid surrogate pairs
        // If the string is valid, we return true.
        // If not, we set invalidIndex to the index of the first invalid char index
        // and return false
        // If the invalid char is a lead surrogate pair, we return its index
        // Otherwise, we treat the char before as the invalid one and return index - 1
        // This function has defined behavior only for null-terminated strings.
        // If the string is not null terminated, the behavior is undefined (likely hang)
        //
        static bool IsUtf16StringValid(const UChar* str, size_t length, size_t* invalidIndex)
        {
            Assert(invalidIndex != nullptr);
            *invalidIndex = -1;

            size_t i = 0;

            for (;;)
            {
                // Iterate through the UTF16-LE string
                // If we are at the end of the null terminated string, return true
                // since the string is valid
                // If not, check if the codepoint we have is a surrogate code unit.
                // If it is, the string is malformed since U16_NEXT would have returned
                // is the full codepoint if both code units in the surrogate pair were present
                UChar32 c;
                U16_NEXT(str, i, length, c);
                if (c == 0)
                {
                    return true;
                }
                if (U_IS_SURROGATE(c))
                {
                    if (U16_IS_LEAD(c))
                    {
                        *invalidIndex = i;
                    }
                    else
                    {
                        Assert(i > 0);
                        *invalidIndex = i - 1;
                    }

                    return false;
                }

                if (i >= length)
                {
                    return true;
                }
            }
        }
Ejemplo n.º 14
0
// Returns the code point at the current position and moves forward over it.
// Returns the negative value from iter.next() when the iterator has nothing
// more in the forward-checking state, and U_SENTINEL when nextSegment() fails.
// Which branch handles the request depends on `state`; when no branch
// applies, switchToForward() is called and the loop retries.
UChar32
FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state == ITER_CHECK_FWD) {
            c = iter.next(&iter);
            if(c < 0) {
                return c;
            }
            if(CollationFCD::hasTccc(c)) {
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        CollationFCD::hasLccc(iter.current(&iter))) {
                    // Undo the read of c, then let nextSegment() handle the
                    // text from here.
                    iter.previous(&iter);
                    if(!nextSegment(errorCode)) {
                        return U_SENTINEL;
                    }
                    continue;
                }
            }
            if(U16_IS_LEAD(c)) {
                // Try to complete a surrogate pair.
                UChar32 trail = iter.next(&iter);
                if(U16_IS_TRAIL(trail)) {
                    return U16_GET_SUPPLEMENTARY(c, trail);
                } else if(trail >= 0) {
                    // Not a trail surrogate: step back so it is read again later.
                    iter.previous(&iter);
                }
            }
            return c;
        } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
            // Read a whole code point from the iterator and track its length in pos.
            c = uiter_next32(&iter);
            pos += U16_LENGTH(c);
            U_ASSERT(c >= 0);
            return c;
        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
            // Read from the `normalized` string instead of the iterator.
            c = normalized.char32At(pos);
            pos += U16_LENGTH(c);
            return c;
        } else {
            switchToForward();
        }
    }
}
Ejemplo n.º 15
0
// Returns the leading part of |proposedValue| whose submission length does not
// exceed |maxLength|. The submission length counts every code unit except a
// '\r' that is directly followed by '\n', so a CR LF pair counts once.
// If the cut would leave a lead surrogate as the last code unit of the result,
// that code unit is dropped as well.
String HTMLTextAreaElement::sanitizeUserInputValue(const String& proposedValue,
                                                   unsigned maxLength) {
  const unsigned valueLength = proposedValue.length();
  unsigned counted = 0;
  unsigned cut = 0;
  while (cut < valueLength) {
    // The '\r' of a CR LF pair is not counted; the '\n' accounts for the pair.
    const bool isCRBeforeLF = proposedValue[cut] == '\r' &&
                              cut + 1 < valueLength &&
                              proposedValue[cut + 1] == '\n';
    if (!isCRBeforeLF) {
      ++counted;
      if (counted == maxLength) {
        // Keep the code unit that reached the limit.
        ++cut;
        break;
      }
      if (counted > maxLength)
        break;
    }
    ++cut;
  }
  // Never end the result on a lead surrogate.
  if (cut > 0 && U16_IS_LEAD(proposedValue[cut - 1]))
    --cut;
  return proposedValue.left(cut);
}
Ejemplo n.º 16
0
// Positions the iterator at newOffset in string_, or at the last "safe"
// offset not greater than newOffset when newOffset lies strictly inside the
// string and the character there is unsafe according to rbc_->isUnsafe().
// Does nothing if status already indicates a failure; returns early, without
// repositioning, if nextCE() reports a failure during the forward scan.
// On success also resets otherHalf_ to 0 and dir_ to 1.
void CollationElementIterator::setOffset(int32_t newOffset, 
                                         UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }
    // Offsets at or outside the string boundaries are used as given.
    if (0 < newOffset && newOffset < string_.length()) {
        int32_t offset = newOffset;
        // Back up while the character at offset is unsafe. For a lead surrogate
        // the decision is made on the full code point at that offset.
        do {
            UChar c = string_.charAt(offset);
            if (!rbc_->isUnsafe(c) ||
                    (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
                break;
            }
            // Back up to before this unsafe character.
            --offset;
        } while (offset > 0);
        if (offset < newOffset) {
            // We might have backed up more than necessary.
            // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
            // but for text "chu" setOffset(2) should remain at 2
            // although we initially back up to offset 0.
            // Find the last safe offset no greater than newOffset by iterating forward.
            int32_t lastSafeOffset = offset;
            do {
                iter_->resetToOffset(lastSafeOffset);
                // Fetch CEs until the iterator's offset moves past lastSafeOffset.
                do {
                    iter_->nextCE(status);
                    if (U_FAILURE(status)) { return; }
                } while ((offset = iter_->getOffset()) == lastSafeOffset);
                if (offset <= newOffset) {
                    lastSafeOffset = offset;
                }
            } while (offset < newOffset);
            newOffset = lastSafeOffset;
        }
    }
    iter_->resetToOffset(newOffset);
    otherHalf_ = 0;
    dir_ = 1;
}
Ejemplo n.º 17
0
/*
 * Reads the next code point from the buffered stream.
 *
 * Returns FALSE (0) if error is NULL or already indicates a failure.
 * Returns U_EOF if fewer than two code units are buffered and nothing
 * remains to be read, or if refilling the buffer fails.
 * A lead surrogate is combined with the code unit that follows it;
 * any other code unit is returned as is.
 */
U_CAPI int32_t U_EXPORT2
ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){
    int32_t codePoint;

    if(error==NULL || U_FAILURE(*error)){
        return FALSE;
    }

    /* make sure two code units are available, in case of a surrogate pair */
    if(buf->currentPos+1>=buf->bufLimit){
        if(buf->remaining==0){
            return U_EOF;
        }
        buf=ucbuf_fillucbuf(buf,error);
        if(U_FAILURE(*error)){
            return U_EOF;
        }
    }

    if(!U16_IS_LEAD(*(buf->currentPos))){
        /* single code unit */
        codePoint = *(buf->currentPos++);
        return codePoint;
    }

    /* lead surrogate: consume two code units */
    codePoint=U16_GET_SUPPLEMENTARY(buf->currentPos[0],buf->currentPos[1]);
    buf->currentPos+=2;
    return codePoint;
}
Ejemplo n.º 18
0
// Finds the next break opportunity at or after |pos| in |str|: the index of
// the first breakable space or position that shouldBreakAfter() accepts, or
// the first position where a letter/number follows another letter/number.
// In the latter case, if the current unit is a trail surrogate past |pos|,
// the index of the unit before it is returned so that the pair stays intact.
// Returns |length| if no such position exists.
// The characters before |pos| come from |str| when available, otherwise from
// |lazyBreakIterator|.
static inline int nextBreakablePositionBreakAllInternal(LazyLineBreakIterator& lazyBreakIterator, const CharacterType* str, unsigned length, int pos)
{
    const int end = static_cast<int>(length);
    CharacterType prevPrev = pos > 1 ? str[pos - 2] : static_cast<CharacterType>(lazyBreakIterator.secondToLastCharacter());
    CharacterType prev = pos > 0 ? str[pos - 1] : static_cast<CharacterType>(lazyBreakIterator.lastCharacter());
    bool prevWasLetterOrNumber = isUnicodeCategoryLetterOrNumber(prevPrev, prev);

    int i = pos;
    while (i < end) {
        const CharacterType current = str[i];

        if (isBreakableSpace(current) || shouldBreakAfter(prevPrev, prev, current))
            return i;

        // A lead surrogate is classified together with its trail on the next
        // iteration, so it is skipped here.
        if (!U16_IS_LEAD(current)) {
            const bool currentIsLetterOrNumber = isUnicodeCategoryLetterOrNumber(prev, current);
            if (currentIsLetterOrNumber && prevWasLetterOrNumber) {
                if (i > pos && U16_IS_TRAIL(current))
                    return i - 1;
                return i;
            }
            prevWasLetterOrNumber = currentIsLetterOrNumber;
        }

        prevPrev = prev;
        prev = current;
        ++i;
    }
    return end;
}
Ejemplo n.º 19
0
/* internal function */
/*
 * Compares two UTF-16 strings while applying full case folding on the fly.
 *
 * s1/length1, s2/length2 - the strings; a length of -1 leaves the limit
 *                          pointer NULL, and a 0 code unit then ends the string
 *                          (a 0 code unit also ends a string with an explicit
 *                          length when the _STRNCMP_STYLE option bit is set)
 * options                - passed to ucase_toFullFolding(); the
 *                          U_COMPARE_CODE_POINT_ORDER bit selects code point
 *                          order for the final difference
 * pErrorCode             - if it already indicates a failure, 0 is returned
 *
 * Returns 0 if the strings compare equal, -1 if string 1 ends first,
 * 1 if string 2 ends first, and otherwise the difference between the first
 * pair of code units that differ and cannot be folded any further.
 *
 * Each string has at most one extra "level": when a differing code point
 * case-folds, the current pointers are pushed on a small stack and reading
 * continues from a local buffer holding the folding result.
 */
U_CFUNC int32_t
u_strcmpFold(const UChar *s1, int32_t length1,
             const UChar *s2, int32_t length2,
             uint32_t options,
             UErrorCode *pErrorCode) {
    const UCaseProps *csp;

    /* current-level start/limit - s1/s2 as current */
    const UChar *start1, *start2, *limit1, *limit2;

    /* case folding variables */
    const UChar *p;
    int32_t length;

    /* stacks of previous-level start/current/limit */
    CmpEquivLevel stack1[2], stack2[2];

    /* case folding buffers, only use current-level start/limit */
    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];

    /* track which is the current level per string */
    int32_t level1, level2;

    /* current code units, and code points for lookups */
    UChar32 c1, c2, cp1, cp2;

    /* no argument error checking because this itself is not an API */

    /*
     * assume that at least the option U_COMPARE_IGNORE_CASE is set
     * otherwise this function would have to behave exactly as uprv_strCompare()
     */
    csp=ucase_getSingleton();
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* initialize */
    start1=s1;
    if(length1==-1) {
        limit1=NULL;
    } else {
        limit1=s1+length1;
    }

    start2=s2;
    if(length2==-1) {
        limit2=NULL;
    } else {
        limit2=s2+length2;
    }

    level1=level2=0;
    c1=c2=-1;

    /* comparison loop */
    for(;;) {
        /*
         * here a code unit value of -1 means "get another code unit"
         * below it will mean "this source is finished"
         */

        if(c1<0) {
            /* get next code unit from string 1, post-increment */
            for(;;) {
                if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
                    /* end of the current level; at level 0 the whole string is finished */
                    if(level1==0) {
                        c1=-1;
                        break;
                    }
                } else {
                    ++s1;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level1;
                    start1=stack1[level1].start;
                } while(start1==NULL);
                s1=stack1[level1].s;
                limit1=stack1[level1].limit;
            }
        }

        if(c2<0) {
            /* get next code unit from string 2, post-increment */
            for(;;) {
                if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
                    /* end of the current level; at level 0 the whole string is finished */
                    if(level2==0) {
                        c2=-1;
                        break;
                    }
                } else {
                    ++s2;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level2;
                    start2=stack2[level2].start;
                } while(start2==NULL);
                s2=stack2[level2].s;
                limit2=stack2[level2].limit;
            }
        }

        /*
         * compare c1 and c2
         * either variable c1, c2 is -1 only if the corresponding string is finished
         */
        if(c1==c2) {
            if(c1<0) {
                return 0;   /* c1==c2==-1 indicating end of strings */
            }
            c1=c2=-1;       /* make us fetch new code units */
            continue;
        } else if(c1<0) {
            return -1;      /* string 1 ends before string 2 */
        } else if(c2<0) {
            return 1;       /* string 2 ends before string 1 */
        }
        /* c1!=c2 && c1>=0 && c2>=0 */

        /* get complete code points for c1, c2 for lookups if either is a surrogate */
        cp1=c1;
        if(U_IS_SURROGATE(c1)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c1)) {
                /* s1 was already incremented, so *s1 is the unit after the lead */
                if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
                    /* advance ++s1; only below if cp1 decomposes/case-folds */
                    cp1=U16_GET_SUPPLEMENTARY(c1, c);
                }
            } else /* isTrail(c1) */ {
                /* s1 was already incremented, so *(s1-2) is the unit before the trail */
                if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
                    cp1=U16_GET_SUPPLEMENTARY(c, c1);
                }
            }
        }

        cp2=c2;
        if(U_IS_SURROGATE(c2)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c2)) {
                /* s2 was already incremented, so *s2 is the unit after the lead */
                if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
                    /* advance ++s2; only below if cp2 decomposes/case-folds */
                    cp2=U16_GET_SUPPLEMENTARY(c2, c);
                }
            } else /* isTrail(c2) */ {
                /* s2 was already incremented, so *(s2-2) is the unit before the trail */
                if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
                    cp2=U16_GET_SUPPLEMENTARY(c, c2);
                }
            }
        }

        /*
         * go down one level for each string
         * continue with the main loop as soon as there is a real change
         */

        if( level1==0 &&
            (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
        ) {
            /* cp1 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c1)) {
                if(U_IS_SURROGATE_LEAD(c1)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s1;
                } else /* isTrail(c1) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s2;
                    c2=*(s2-1);
                }
            }

            /* push current level pointers */
            stack1[0].start=start1;
            stack1[0].s=s1;
            stack1[0].limit=limit1;
            ++level1;

            /* copy the folding result to fold1[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                u_memcpy(fold1, p, length);
            } else {
                /* here "length" is itself the folded code point: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold1, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start1=s1=fold1;
            limit1=fold1+length;

            /* get ready to read from decomposition, continue with loop */
            c1=-1;
            continue;
        }

        if( level2==0 &&
            (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
        ) {
            /* cp2 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c2)) {
                if(U_IS_SURROGATE_LEAD(c2)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s2;
                } else /* isTrail(c2) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s1;
                    c1=*(s1-1);
                }
            }

            /* push current level pointers */
            stack2[0].start=start2;
            stack2[0].s=s2;
            stack2[0].limit=limit2;
            ++level2;

            /* copy the folding result to fold2[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                u_memcpy(fold2, p, length);
            } else {
                /* here "length" is itself the folded code point: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold2, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start2=s2=fold2;
            limit2=fold2+length;

            /* get ready to read from decomposition, continue with loop */
            c2=-1;
            continue;
        }

        /*
         * no decomposition/case folding, max level for both sides:
         * return difference result
         *
         * code point order comparison must not just return cp1-cp2
         * because when single surrogates are present then the surrogate pairs
         * that formed cp1 and cp2 may be from different string indexes
         *
         * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
         * c1=d800 cp1=10001 c2=dc00 cp2=10000
         * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
         *
         * therefore, use same fix-up as in ustring.c/uprv_strCompare()
         * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
         * so we have slightly different pointer/start/limit comparisons here
         */

        if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
            /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
            if(
                (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
                (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c1-=0x2800;
            }

            if(
                (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
                (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c2-=0x2800;
            }
        }

        return c1-c2;
    }
}
Ejemplo n.º 20
0
// Walks the direct children of |start|. SVG inline containers are descended into recursively;
// for every SVG inline text renderer a SVGTextLayoutAttributes object is built, stored on the
// renderer and appended to |allAttributes|.
// |atCharacter| is the running index into m_positioningLists and is advanced as text is consumed.
// |lastCharacter| carries the last non-skipped character across renderers for whitespace collapsing.
void SVGTextLayoutAttributesBuilder::propagateLayoutAttributes(RenderObject* start, Vector<SVGTextLayoutAttributes>& allAttributes, unsigned& atCharacter, UChar& lastCharacter) const
{
    for (RenderObject* node = start->firstChild(); node; node = node->nextSibling()) {
        if (!node->isSVGInlineText()) {
            // Only inline containers can hold further text; everything else is ignored.
            if (node->isSVGInline())
                propagateLayoutAttributes(node, allAttributes, atCharacter, lastCharacter);
            continue;
        }

        RenderSVGInlineText* text = toRenderSVGInlineText(node);
        const UChar* characters = text->characters();
        const unsigned textLength = text->textLength();
        const bool collapseWhiteSpace = !shouldPreserveAllWhiteSpace(text->style());

        SVGTextLayoutAttributes attributes(text);
        attributes.reserveCapacity(textLength);

        unsigned listPosition = atCharacter;
        unsigned unitsConsumed = 1;
        SVGTextMetrics previousRunMetrics(SVGTextMetrics::SkippedSpaceMetrics);

        for (unsigned offset = 0; offset < textLength; offset += unitsConsumed) {
            const UChar unit = characters[offset];

            // A lead surrogate immediately followed by a trail surrogate is measured as one unit of two UChars.
            const bool isSurrogatePair = U16_IS_LEAD(unit) && (offset + 1) < textLength && U16_IS_TRAIL(characters[offset + 1]);
            const unsigned unitsToMeasure = isSurrogatePair ? 2 : 1;

            SVGTextMetrics prefixMetrics = SVGTextMetrics::measureCharacterRange(text, 0, offset + unitsToMeasure);
            SVGTextMetrics glyphMetrics = SVGTextMetrics::measureCharacterRange(text, offset, unitsToMeasure);
            unitsConsumed = glyphMetrics.length();

            // Nothing was measured: stop, otherwise the loop would never advance.
            if (!unitsConsumed)
                break;

            // A surrogate pair consumes exactly one slot of the positioning lists,
            // any other measurement consumes as many slots as UChars it covers.
            const unsigned listAdvance = isSurrogatePair ? 1 : unitsConsumed;

            // Measuring a character in isolation may pick a different glyph than measuring it in
            // context (e.g. Arabic shaping). The in-context advance, i.e. the growth of the run
            // measured from the start of the text, takes precedence over the isolated width.
            const float widthInContext = prefixMetrics.width() - previousRunMetrics.width();
            if (widthInContext != glyphMetrics.width())
                glyphMetrics.setWidth(widthInContext);

            previousRunMetrics = prefixMetrics;

            // Collapsed whitespace: record a placeholder and do not consume positioning values.
            if (collapseWhiteSpace && characterIsSpace(unit) && characterIsSpaceOrNull(lastCharacter)) {
                attributes.positioningLists().appendEmptyValues();
                attributes.textMetricsValues().append(SVGTextMetrics(SVGTextMetrics::SkippedSpaceMetrics));
                continue;
            }

            SVGTextLayoutAttributes::PositioningLists& lists = attributes.positioningLists();
            lists.appendValuesFromPosition(m_positioningLists, listPosition);
            attributes.textMetricsValues().append(glyphMetrics);

            // Keep the x/y/dx/dy/rotate lists aligned with the text: one empty entry
            // for every additional UChar covered by these metrics.
            for (unsigned filler = 1; filler < unitsConsumed; ++filler)
                lists.appendEmptyValues();

            lastCharacter = unit;
            listPosition += listAdvance;
        }

#if DUMP_TEXT_LAYOUT_ATTRIBUTES > 0
        fprintf(stderr, "\nDumping layout attributes for RenderSVGInlineText, renderer=%p, node=%p (atCharacter: %i)\n", text, text->node(), atCharacter);
        fprintf(stderr, "BiDi properties: unicode-bidi=%i, block direction=%i\n", text->style()->unicodeBidi(), text->style()->direction());
        attributes.dump();
#endif

        text->storeLayoutAttributes(attributes);
        allAttributes.append(attributes);
        atCharacter = listPosition;
    }
}
Ejemplo n.º 21
0
 // Returns true when the UTF-16 code unit at |textPosition| is a lead surrogate
 // that is directly followed, inside the run, by a trail surrogate.
 bool characterStartsSurrogatePair(unsigned textPosition) const
 {
     if (!U16_IS_LEAD(m_run[textPosition]))
         return false;

     // A lead surrogate in the last position has no partner to pair with.
     if (!(textPosition + 1 < textLength()))
         return false;

     return U16_IS_TRAIL(m_run[textPosition + 1]);
 }
Ejemplo n.º 22
0
/*
 * Check whether the string contains more Unicode code points than "number",
 * without necessarily counting all of them.
 *
 * s       UTF-16 string; may be NULL (then the result is FALSE unless number<0)
 * length  number of UChars in s, or -1 if s is NUL-terminated
 * number  the code point count to compare against
 *
 * Returns TRUE if number<0 or if s holds more than "number" code points,
 * FALSE otherwise (including for s==NULL or length<-1).
 * A lead surrogate directly followed by a trail surrogate counts as one
 * code point; every other UChar, unpaired surrogates included, counts as one.
 */
U_CAPI UBool U_EXPORT2
u_strHasMoreChar32Than(const UChar* s, int32_t length, int32_t number) {

    if (number < 0) {
        return TRUE;
    }
    if (s == NULL || length < -1) {
        return FALSE;
    }

    if (length == -1) {
        /* s is NUL-terminated */
        UChar c;

        /* count code points until they exceed */
        for (; ;) {
            if ((c = *s++) == 0) {
                return FALSE;
            }
            if (number == 0) {
                return TRUE;
            }
            /* c!=0 here, so *s is at most the terminating NUL and safe to read */
            if (U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
                ++s;
            }
            --number;
        }
    } else {
        /* length>=0 known */
        const UChar* limit;
        int32_t maxSupplementary;

        /*
         * s contains at least ceil(length/2) code points: <=2 UChars per cp.
         * Written as length/2+length%2 rather than (length+1)/2 so that
         * length==INT32_MAX does not overflow a signed 32-bit integer.
         */
        if ((length / 2 + length % 2) > number) {
            return TRUE;
        }

        /* check if s does not even contain enough UChars */
        maxSupplementary = length - number;
        if (maxSupplementary <= 0) {
            return FALSE;
        }
        /* there are maxSupplementary=length-number more UChars than asked-for code points */

        /*
         * count code points until they exceed and also check that there are
         * no more than maxSupplementary supplementary code points (UChar pairs)
         */
        limit = s + length;
        for (; ;) {
            if (s == limit) {
                return FALSE;
            }
            if (number == 0) {
                return TRUE;
            }
            if (U16_IS_LEAD(*s++) && s != limit && U16_IS_TRAIL(*s)) {
                ++s;
                if (--maxSupplementary <= 0) {
                    /* too many pairs - too few code points */
                    return FALSE;
                }
            }
            --number;
        }
    }
}
Ejemplo n.º 23
0
// Returns true when the character ending at |ch| belongs to a Letter (L*) or Number (N*)
// general category. If |lastCh| and |ch| form a surrogate pair, the combined supplementary
// code point is classified; otherwise |ch| is classified on its own.
static inline bool isUnicodeCategoryLetterOrNumber(UChar lastCh, UChar ch)
{
    UChar32 codePoint = ch;
    if (U16_IS_LEAD(lastCh) && U16_IS_TRAIL(ch))
        codePoint = U16_GET_SUPPLEMENTARY(lastCh, ch);

    return (U_MASK(u_charType(codePoint)) & (U_GC_L_MASK | U_GC_N_MASK)) != 0;
}
// Truncates |proposedValue| to at most |maxLength| UTF-16 code units. If the cut would fall
// directly after a lead surrogate, one code unit less is kept so that the result does not
// end on that lead surrogate.
String HTMLTextAreaElement::sanitizeUserInputValue(const String& proposedValue, unsigned maxLength)
{
    const bool cutFollowsLeadSurrogate = maxLength > 0 && U16_IS_LEAD(proposedValue[maxLength - 1]);
    return proposedValue.left(cutFollowsLeadSurrogate ? maxLength - 1 : maxLength);
}
Ejemplo n.º 25
0
/*
 * Encode a UTF-16 string as Punycode.
 *
 * src           input UTF-16 string, must not be NULL
 * srcLength     number of UChars in src, or -1 if src is NUL-terminated
 * dest          output buffer; may be NULL only if destCapacity==0
 * destCapacity  capacity of dest in UChars
 * caseFlags     optional per-input-UChar flags (indexed like src), or NULL;
 *               passed to asciiCaseMap() for basic code points and to
 *               digitToBasic() for the last digit of each encoded delta
 * pErrorCode    in/out ICU error code; nothing is done if it is NULL or
 *               already indicates a failure
 *
 * Returns the value of u_terminateUChars() for the full output length.
 * Writes to dest are always guarded by destLength<destCapacity while
 * destLength keeps counting, so the length is computed even when dest is
 * too small. Returns 0 on any error detected here.
 *
 * Errors set by this function:
 *   U_ILLEGAL_ARGUMENT_ERROR   bad src/srcLength/dest/destCapacity combination
 *   U_INDEX_OUTOFBOUNDS_ERROR  more than MAX_CP_COUNT input code points
 *   U_INVALID_CHAR_FOUND       unpaired surrogate in the input
 *   U_INTERNAL_PROGRAM_ERROR   delta would overflow
 */
U_CFUNC int32_t
u_strToPunycode(const UChar *src, int32_t srcLength,
                UChar *dest, int32_t destCapacity,
                const UBool *caseFlags,
                UErrorCode *pErrorCode) {

    int32_t cpBuffer[MAX_CP_COUNT];
    int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    UChar c, c2;

    /* argument checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
     *
     * Basic code points are copied to dest right away and recorded as 0 in
     * cpBuffer, so that cpBuffer has one entry per input code point.
     * The two branches below differ only in how the end of input is detected.
     */
    srcCPCount=destLength=0;
    if(srcLength==-1) {
        /* NUL-terminated input */
        for(j=0; /* no condition */; ++j) {
            if((c=src[j])==0) {
                break;
            }
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                /* case flag of the first UChar of the code point goes into bit 31 */
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
                    /* c!=0, so src[j+1] is at most the terminating NUL */
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    } else {
        /* length-specified input */
        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            c=src[j];
            if(IS_BASIC(c)) {
                cpBuffer[srcCPCount++]=0;
                if(destLength<destCapacity) {
                    dest[destLength]=
                        caseFlags!=NULL ?
                            asciiCaseMap((char)c, caseFlags[j]) :
                            (char)c;
                }
                ++destLength;
            } else {
                /* case flag of the first UChar of the code point goes into bit 31 */
                n=(caseFlags!=NULL && caseFlags[j])<<31L;
                if(U16_IS_SINGLE(c)) {
                    n|=c;
                } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
                    /* the bounds check keeps the look-ahead inside the input */
                    ++j;
                    n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    *pErrorCode=U_INVALID_CHAR_FOUND;
                    return 0;
                }
                cpBuffer[srcCPCount++]=n;
            }
        }
    }

    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=destLength;
    if(basicLength>0) {
        if(destLength<destCapacity) {
            dest[destLength]=DELIMITER;
        }
        ++destLength;
    }

    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     * destLength is the number of chars that have been output
     */

    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;

    /* Main encoding loop: runs until every input code point has been handled */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }

        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
            return 0;
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;

        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                /* entries below n (basic ones are stored as 0) only advance delta */
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {

                    /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt

                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(t>TMAX) {
                        t=TMAX;
                    }
                    */

                    /* threshold t for this digit position */
                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }

                    /* the remaining value fits into the final digit */
                    if(q<t) {
                        break;
                    }

                    if(destLength<destCapacity) {
                        dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
                    }
                    ++destLength;
                    q=(q-t)/(BASE-t);
                }

                /* final digit; the sign bit of the cpBuffer entry is passed on as its case flag */
                if(destLength<destCapacity) {
                    dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
                }
                ++destLength;
                /* third argument is true only for the first code point encoded after the basic ones */
                bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }

        ++delta;
        ++n;
    }

    /* NOTE(review): u_terminateUChars presumably NUL-terminates dest and reports overflow via pErrorCode - confirm */
    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
// Returns true when the code unit at the current text position is a lead surrogate
// that is directly followed, inside the run, by a trail surrogate.
inline bool SVGTextMetricsBuilder::currentCharacterStartsSurrogatePair() const
{
    if (!U16_IS_LEAD(m_run[m_textPosition]))
        return false;

    // A lead surrogate in the last position of the run cannot start a pair.
    if (!(int(m_textPosition + 1) < m_run.charactersLength()))
        return false;

    return U16_IS_TRAIL(m_run[m_textPosition + 1]);
}
Ejemplo n.º 27
0
// Splits |run| into TextRunComponents appended to |components| and returns the accumulated
// horizontal offset after the last component.
//
// With letter spacing, every character cluster gets its own component. A cluster is a base
// character plus any following trail surrogates and non-spacing marks (general category Mn).
// Without letter spacing, the run is only split at characters that Font::treatAsSpace()
// reports as spaces.
//
// In both modes each space becomes a separate one-argument "space" component, and the run's
// expansion (justification padding) is distributed over the spaces as they are encountered.
static int generateComponents(TextRunComponents* components, const Font &font, const TextRun &run)
{
    int letterSpacing = font.letterSpacing();
    int wordSpacing = font.wordSpacing();
    int padding = run.expansion();
    int numSpaces = 0;
    // The number of spaces is only needed to distribute the padding.
    if (padding) {
        for (int i = 0; i < run.length(); i++)
            if (Font::treatAsSpace(run[i]))
                ++numSpaces;
    }

    int offset = 0;
    if (letterSpacing) {
        // need to draw every letter on its own
        int start = 0;
        // Guard against an empty run before looking at its first character.
        if (run.length() > 0 && Font::treatAsSpace(run[0])) {
            int add = 0;
            if (numSpaces) {
                add = padding/numSpaces;
                padding -= add;
                --numSpaces;
            }
            components->append(TextRunComponent(1, font, offset));
            offset += add + letterSpacing + components->last().m_width;
            start = 1;
        }
        for (int i = 1; i < run.length(); ++i) {
            UChar ch = run[i];

            // A trail surrogate never starts a cluster: it belongs to the preceding code unit.
            if (U16_IS_TRAIL(ch))
                continue;

            // Decode the complete code point that starts at i, so that combining marks outside
            // the BMP are recognized too. Kept in an int because a supplementary code point
            // (up to U+10FFFF) does not fit into a 16-bit UChar.
            int codePoint = ch;
            if (U16_IS_LEAD(ch) && i + 1 < run.length() && U16_IS_TRAIL(run[i + 1]))
                codePoint = U16_GET_SUPPLEMENTARY(ch, run[i + 1]);

            // Non-spacing marks stay attached to their base character.
            if (U_GET_GC_MASK(codePoint) & U_GC_MN_MASK)
                continue;

            if (Font::treatAsSpace(run[i])) {
                int add = 0;
                // Flush the cluster that precedes the space.
                if (i - start > 0) {
                    components->append(TextRunComponent(run.characters16() + start, i - start,
                                                        run, font, offset));
                    offset += components->last().m_width + letterSpacing;
                }
                if (numSpaces) {
                    add = padding/numSpaces;
                    padding -= add;
                    --numSpaces;
                }
                components->append(TextRunComponent(1, font, offset));
                offset += wordSpacing + add + components->last().m_width + letterSpacing;
                start = i + 1;
                continue;
            }
            // A new cluster begins at i: emit the previous one.
            if (i - start > 0) {
                components->append(TextRunComponent(run.characters16() + start, i - start,
                                                    run,
                                                    font, offset));
                offset += components->last().m_width + letterSpacing;
            }
            start = i;
        }
        // Emit whatever is left after the last cluster boundary.
        if (run.length() - start > 0) {
            components->append(TextRunComponent(run.characters16() + start, run.length() - start,
                                                run,
                                                font, offset));
            offset += components->last().m_width;
        }
        offset += letterSpacing;
    } else {
        int start = 0;
        for (int i = 0; i < run.length(); ++i) {
            if (Font::treatAsSpace(run[i])) {
                // Flush the text that precedes the space.
                if (i - start > 0) {
                    components->append(TextRunComponent(run.characters16() + start, i - start,
                                                        run,
                                                        font, offset));
                    offset += components->last().m_width;
                }
                int add = 0;
                if (numSpaces) {
                    add = padding/numSpaces;
                    padding -= add;
                    --numSpaces;
                }
                components->append(TextRunComponent(1, font, offset));
                offset += add + components->last().m_width;
                // No word spacing is added for a space at the very start of the run.
                if (i)
                    offset += wordSpacing;
                start = i + 1;
            }
        }
        // Emit the text after the last space.
        if (run.length() - start > 0) {
            components->append(TextRunComponent(run.characters16() + start, run.length() - start,
                                                run,
                                                font, offset));
            offset += components->last().m_width;
        }
    }
    return offset;
}
Ejemplo n.º 28
0
/*
 * Compare two UTF-16 strings in code unit order or, if codePointOrder is
 * set, in code point order.
 *
 * Three modes, selected by the arguments:
 * - length1<0 && length2<0: both strings are NUL-terminated (strcmp style)
 * - strncmpStyle:           compare at most length1 units, stop at NUL;
 *                           length2 is assumed to equal length1
 * - otherwise:              both strings are length-specified; a negative
 *                           length is replaced by u_strlen() (memcmp style)
 *
 * Returns 0 if equal, otherwise a negative/positive value: either the
 * difference of the first differing (fixed-up) code units, or -1/+1 when one
 * length-specified string is a proper prefix of the other.
 */
U_CFUNC int32_t U_EXPORT2
uprv_strCompare(const UChar *s1, int32_t length1,
                const UChar *s2, int32_t length2,
                UBool strncmpStyle, UBool codePointOrder) {
    const UChar *base1, *base2, *end1, *end2;
    UChar unit1, unit2;

    /* remember where the strings begin, the fix-up below may look backwards */
    base1=s1;
    base2=s2;

    /* skip the common prefix - equal units never need a fix-up */
    if(length1<0 && length2<0) {
        /* both NUL-terminated */
        if(s1==s2) {
            return 0;
        }

        for(;; ++s1, ++s2) {
            unit1=*s1;
            unit2=*s2;
            if(unit1!=unit2) {
                break;
            }
            if(unit1==0) {
                return 0;
            }
        }

        /* no limits: the look-ahead in the fix-up never matches a NULL limit */
        end1=end2=NULL;
    } else if(strncmpStyle) {
        /* bounded by length1 and additionally by NUL */
        if(s1==s2) {
            return 0;
        }

        end1=base1+length1;

        for(;; ++s1, ++s2) {
            /* equal lengths are assumed, so one limit check suffices */
            if(s1==end1) {
                return 0;
            }

            unit1=*s1;
            unit2=*s2;
            if(unit1!=unit2) {
                break;
            }
            if(unit1==0) {
                return 0;
            }
        }

        /* deliberately length1 again, which enforces the equal-length assumption */
        end2=base2+length1;
    } else {
        /* both length-specified */
        const UChar *sharedEnd;
        int32_t lengthOrder;

        if(length1<0) {
            length1=u_strlen(s1);
        }
        if(length2<0) {
            length2=u_strlen(s2);
        }

        /* sharedEnd=base1+min(length1, length2); lengthOrder=sign(length1-length2) */
        if(length1==length2) {
            lengthOrder=0;
            sharedEnd=base1+length1;
        } else if(length1<length2) {
            lengthOrder=-1;
            sharedEnd=base1+length1;
        } else {
            lengthOrder=1;
            sharedEnd=base1+length2;
        }

        if(s1==s2) {
            return lengthOrder;
        }

        for(;; ++s1, ++s2) {
            /* one string is a prefix of the other: the lengths decide */
            if(s1==sharedEnd) {
                return lengthOrder;
            }

            unit1=*s1;
            unit2=*s2;
            if(unit1!=unit2) {
                break;
            }
        }

        /* real limits for the fix-up */
        end1=base1+length1;
        end2=base2+length2;
    }

    /*
     * Code point order: if both differing units are in or above the surrogate
     * range, move BMP code points (including unpaired surrogates) below d800
     * so that they sort before supplementary code points.
     */
    if(codePointOrder && unit1>=0xd800 && unit2>=0xd800) {
        UBool inPair;

        if(unit1<=0xdbff) {
            /* lead surrogate: paired if a trail surrogate follows within the string */
            inPair=(UBool)((s1+1)!=end1 && U16_IS_TRAIL(*(s1+1)));
        } else if(U16_IS_TRAIL(unit1)) {
            /* trail surrogate: paired if a lead surrogate precedes it */
            inPair=(UBool)(base1!=s1 && U16_IS_LEAD(*(s1-1)));
        } else {
            inPair=FALSE;
        }
        if(!inPair) {
            /* BMP code point - may be surrogate code point - make <d800 */
            unit1-=0x2800;
        }

        if(unit2<=0xdbff) {
            inPair=(UBool)((s2+1)!=end2 && U16_IS_TRAIL(*(s2+1)));
        } else if(U16_IS_TRAIL(unit2)) {
            inPair=(UBool)(base2!=s2 && U16_IS_LEAD(*(s2-1)));
        } else {
            inPair=FALSE;
        }
        if(!inPair) {
            /* BMP code point - may be surrogate code point - make <d800 */
            unit2-=0x2800;
        }
    }

    /* unit1 and unit2 now compare in the requested order */
    return (int32_t)unit1-(int32_t)unit2;
}
Ejemplo n.º 29
0
static void U_CALLCONV
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                             UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    int32_t prev, c, diff;

    int32_t sourceIndex, nextSourceIndex;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev=(int32_t)cnv->fromUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    diff=(int32_t)(sourceLimit-source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(uint8_t)c;
            *offsets++=nextSourceIndex++;
            ++source;
            --targetCapacity;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=nextSourceIndex++;
                ++source;
                --targetCapacity;
            } else {
                break;
            }
        }
    }
    /* restore real values */
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;
            ++nextSourceIndex;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++=(uint8_t)c;
                *offsets++=sourceIndex;
                --targetCapacity;

                sourceIndex=nextSourceIndex;
                continue;
            }

            if(U16_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(U16_IS_TRAIL(trail)) {
                        ++source;
                        ++nextSourceIndex;
                        c=U16_GET_SUPPLEMENTARY(c, trail);
                    }
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=sourceIndex;
                --targetCapacity;
                sourceIndex=nextSourceIndex;
                if(c<0x3000) {
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                *target++=(uint8_t)diff;
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                *offsets++=sourceIndex;
                *offsets++=sourceIndex;
                targetCapacity-=2;
                sourceIndex=nextSourceIndex;
            } else {
                int32_t length; /* will be 2..4 */

                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(diff>>24);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    /* case 1: handled above */
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                    sourceIndex=nextSourceIndex;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++=(uint8_t)(diff>>16);
                        U_FALLTHROUGH;
                    case 2:
                        *charErrorBuffer++=(uint8_t)(diff>>8);
                        U_FALLTHROUGH;
                    case 1:
                        *charErrorBuffer=(uint8_t)diff;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength=(int8_t)length;

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    case 1:
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                        U_FALLTHROUGH;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
Ejemplo n.º 30
0
/* Parse a single escape sequence.  Although this method deals in
 * UChars, it does not use C++ or UnicodeString.  This allows it to
 * be used from C contexts.
 *
 * Recognized forms (the leading backslash has already been consumed
 * by the caller):
 *   uhhhh        exactly 4 hex digits
 *   Uhhhhhhhh    exactly 8 hex digits
 *   xh, xhh      1-2 hex digits
 *   x{h...}      1-8 hex digits enclosed in braces
 *   o, oo, ooo   1-3 octal digits
 *   cX           control-X, computed as X & 0x1F
 *   any code unit listed in UNESCAPE_MAP maps to its table value
 *   anything else stands for itself (surrogate pairs are joined)
 *
 * charAt   callback returning the code unit at a given index
 * offset   in/out; on entry the index of the first code unit after
 *          the backslash, on success the index just past the escape,
 *          on failure restored to its entry value
 * length   number of code units available through charAt
 * context  opaque pointer handed through to charAt
 *
 * Returns the resulting code point, or (UChar32)0xFFFFFFFF if the
 * escape sequence is invalid. */
U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,
             int32_t* offset,
             int32_t length,
             void* context) {

    int32_t start = *offset;
    UChar c;
    UChar32 result = 0;
    int8_t n = 0;
    int8_t minDig = 0;
    int8_t maxDig = 0;
    int8_t bitsPerDigit = 4;
    int8_t dig;
    int32_t i;
    UBool braces = FALSE;

    /* Check that offset is in range */
    if (*offset < 0 || *offset >= length) {
        goto err;
    }

    /* Fetch first UChar after '\\' */
    c = charAt((*offset)++, context);

    /* Convert hexadecimal and octal escapes */
    switch (c) {
        case 0x0075 /*'u'*/:
            minDig = maxDig = 4;
            break;
        case 0x0055 /*'U'*/:
            minDig = maxDig = 8;
            break;
        case 0x0078 /*'x'*/:
            minDig = 1;
            if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
                ++(*offset);
                braces = TRUE;
                maxDig = 8;
            } else {
                maxDig = 2;
            }
            break;
        default:
            dig = _digit8(c);
            if (dig >= 0) {
                minDig = 1;
                maxDig = 3;
                n = 1; /* Already have first octal digit */
                bitsPerDigit = 3;
                result = dig;
            }
            break;
    }
    if (minDig != 0) {
        /* Accumulate digits until a non-digit, end of input, or maxDig */
        while (*offset < length && n < maxDig) {
            c = charAt(*offset, context);
            dig = (int8_t) ((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
            if (dig < 0) {
                break;
            }
            result = (result << bitsPerDigit) | dig;
            ++(*offset);
            ++n;
        }
        if (n < minDig) {
            goto err;
        }
        if (braces) {
            /* Look at the code unit at *offset rather than at c: when
             * the loop stops because maxDig digits were consumed, c
             * still holds the last digit, not the closing brace. */
            if (*offset >= length || charAt(*offset, context) != 0x7D /*}*/) {
                goto err;
            }
            ++(*offset);
        }
        if (result < 0 || result >= 0x110000) {
            goto err;
        }
        /* If an escape sequence specifies a lead surrogate, see if
         * there is a trail surrogate after it, either as an escape or
         * as a literal.  If so, join them up into a supplementary.
         */
        if (*offset < length && U16_IS_LEAD(result)) {
            int32_t ahead = *offset + 1;
            /* Keep the full 32-bit value: truncating an escaped
             * supplementary code point (e.g. U+1DC00) to 16 bits would
             * make it look like a trail surrogate. */
            UChar32 trail = charAt(*offset, context);
            if (trail == 0x5C /*'\\'*/ && ahead < length) {
                /* Bound the look-ahead to the longest possible trail
                 * escape, 11 code units ("x{0000DFFF}"), so that a run
                 * of escaped lead surrogates cannot recurse once per
                 * escape and exhaust the stack. */
                int32_t tailLimit = (length - ahead > 11) ? ahead + 11 : length;
                trail = u_unescapeAt(charAt, &ahead, tailLimit, context);
            }
            if (U16_IS_TRAIL(trail)) {
                *offset = ahead;
                result = U16_GET_SUPPLEMENTARY(result, trail);
            }
        }
        return result;
    }

    /* Convert C-style escapes in table.  The early break on
     * c < UNESCAPE_MAP[i] presumes the table keys are sorted ascending. */
    for (i = 0; i < UNESCAPE_MAP_LENGTH; i += 2) {
        if (c == UNESCAPE_MAP[i]) {
            return UNESCAPE_MAP[i + 1];
        } else if (c < UNESCAPE_MAP[i]) {
            break;
        }
    }

    /* Map \cX to control-X: X & 0x1F */
    if (c == 0x0063 /*'c'*/ && *offset < length) {
        c = charAt((*offset)++, context);
        if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
            UChar c2 = charAt(*offset, context);
            if (UTF_IS_SECOND_SURROGATE(c2)) {
                ++(*offset);
                /* The pair is consumed, but only the low 16 bits of the
                 * joined value are kept before masking. */
                c = (UChar) UTF16_GET_PAIR_VALUE(c, c2); /* [sic] */
            }
        }
        return 0x1F & c;
    }

    /* If no special forms are recognized, then consider
     * the backslash to generically escape the next character.
     * Deal with surrogate pairs. */
    if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
        UChar c2 = charAt(*offset, context);
        if (UTF_IS_SECOND_SURROGATE(c2)) {
            ++(*offset);
            return UTF16_GET_PAIR_VALUE(c, c2);
        }
    }
    return c;

    err:
    /* Invalid escape sequence */
    *offset = start; /* Reset to initial value */
    return (UChar32) 0xFFFFFFFF;
}