示例#1
0
/*
 * Compare two strings as presented by UCharIterators.
 * Use code unit or code point order.
 * When the function returns, it is undefined where the iterators
 * have stopped.
 */
U_CAPI int32_t U_EXPORT2
u_strCompareIter(UCharIterator* iter1, UCharIterator* iter2, UBool codePointOrder) {
    UChar32 c1, c2;

    /* argument checking */
    if (iter1 == NULL || iter2 == NULL) {
        return 0; /* bad arguments */
    }
    if (iter1 == iter2) {
        return 0; /* identical iterators */
    }

    /* reset iterators to start? */
    iter1->move(iter1, 0, UITER_START);
    iter2->move(iter2, 0, UITER_START);

    /* compare identical prefixes - they do not need to be fixed up */
    for (; ;) {
        c1 = iter1->next(iter1);
        c2 = iter2->next(iter2);
        if (c1 != c2) {
            break;
        }
        if (c1 == -1) {
            return 0;
        }
    }

    /* if both values are in or above the surrogate range, fix them up */
    if (c1 >= 0xd800 && c2 >= 0xd800 && codePointOrder) {
        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
        if (
        (c1 <= 0xdbff && UTF_IS_TRAIL(iter1->current(iter1))) ||
        (UTF_IS_TRAIL(c1) && (iter1->previous(iter1), UTF_IS_LEAD(iter1->previous(iter1))))
        ) {
            /* part of a surrogate pair, leave >=d800 */
        } else {
            /* BMP code point - may be surrogate code point - make <d800 */
            c1 -= 0x2800;
        }

        if (
        (c2 <= 0xdbff && UTF_IS_TRAIL(iter2->current(iter2))) ||
        (UTF_IS_TRAIL(c2) && (iter2->previous(iter2), UTF_IS_LEAD(iter2->previous(iter2))))
        ) {
            /* part of a surrogate pair, leave >=d800 */
        } else {
            /* BMP code point - may be surrogate code point - make <d800 */
            c2 -= 0x2800;
        }
    }

    /* now c1 and c2 are in the requested (code unit or code point) order */
    return (int32_t) c1 - (int32_t) c2;
}
示例#2
0
U_CAPI int32_t U_EXPORT2
u_countChar32(const UChar* s, int32_t length) {
    int32_t count;

    if (s == NULL || length < -1) {
        return 0;
    }

    count = 0;
    if (length >= 0) {
        while (length > 0) {
            ++count;
            if (UTF_IS_LEAD(*s) && length >= 2 && UTF_IS_TRAIL(*(s + 1))) {
                s += 2;
                length -= 2;
            } else {
                ++s;
                --length;
            }
        }
    } else /* length==-1 */ {
        UChar c;

        for (; ;) {
            if ((c = *s++) == 0) {
                break;
            }
            ++count;

            /*
             * sufficient to look ahead one because of UTF-16;
             * safe to look ahead one because at worst that would be the terminating NUL
             */
            if (UTF_IS_LEAD(c) && UTF_IS_TRAIL(*s)) {
                ++s;
            }
        }
    }
    return count;
}
示例#3
0
/* get a UChar32 from the stream*/
U_CAPI int32_t U_EXPORT2
ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){
    int32_t retVal = (int32_t)U_EOF;
    if(error==NULL || U_FAILURE(*error)){
        return FALSE;
    }
    if(buf->currentPos+1>=buf->bufLimit){
        if(buf->remaining==0){
            return U_EOF;
        }
        buf=ucbuf_fillucbuf(buf,error);
        if(U_FAILURE(*error)){
            return U_EOF;
        }
    }
    if(UTF_IS_LEAD(*(buf->currentPos))){
        retVal=UTF16_GET_PAIR_VALUE(buf->currentPos[0],buf->currentPos[1]);
        buf->currentPos+=2;
    }else{
        retVal = *(buf->currentPos++);
    }
    return retVal;
}
示例#4
0
U_CAPI int32_t U_EXPORT2
uprv_strCompare(const UChar* s1, int32_t length1,
                const UChar* s2, int32_t length2,
                UBool strncmpStyle, UBool codePointOrder) {
    const UChar* start1, * start2, * limit1, * limit2;
    UChar c1, c2;

    /* setup for fix-up */
    start1 = s1;
    start2 = s2;

    /* compare identical prefixes - they do not need to be fixed up */
    if (length1 < 0 && length2 < 0) {
        /* strcmp style, both NUL-terminated */
        if (s1 == s2) {
            return 0;
        }

        for (; ;) {
            c1 = *s1;
            c2 = *s2;
            if (c1 != c2) {
                break;
            }
            if (c1 == 0) {
                return 0;
            }
            ++s1;
            ++s2;
        }

        /* setup for fix-up */
        limit1 = limit2 = NULL;
    } else if (strncmpStyle) {
        /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
        if (s1 == s2) {
            return 0;
        }

        limit1 = start1 + length1;

        for (; ;) {
            /* both lengths are same, check only one limit */
            if (s1 == limit1) {
                return 0;
            }

            c1 = *s1;
            c2 = *s2;
            if (c1 != c2) {
                break;
            }
            if (c1 == 0) {
                return 0;
            }
            ++s1;
            ++s2;
        }

        /* setup for fix-up */
        limit2 = start2 + length1; /* use length1 here, too, to enforce assumption */
    } else {
        /* memcmp/UnicodeString style, both length-specified */
        int32_t lengthResult;

        if (length1 < 0) {
            length1 = u_strlen(s1);
        }
        if (length2 < 0) {
            length2 = u_strlen(s2);
        }

        /* limit1=start1+min(lenght1, length2) */
        if (length1 < length2) {
            lengthResult = -1;
            limit1 = start1 + length1;
        } else if (length1 == length2) {
            lengthResult = 0;
            limit1 = start1 + length1;
        } else /* length1>length2 */ {
            lengthResult = 1;
            limit1 = start1 + length2;
        }

        if (s1 == s2) {
            return lengthResult;
        }

        for (; ;) {
            /* check pseudo-limit */
            if (s1 == limit1) {
                return lengthResult;
            }

            c1 = *s1;
            c2 = *s2;
            if (c1 != c2) {
                break;
            }
            ++s1;
            ++s2;
        }

        /* setup for fix-up */
        limit1 = start1 + length1;
        limit2 = start2 + length2;
    }

    /* if both values are in or above the surrogate range, fix them up */
    if (c1 >= 0xd800 && c2 >= 0xd800 && codePointOrder) {
        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
        if (
        (c1 <= 0xdbff && (s1 + 1) != limit1 && UTF_IS_TRAIL(*(s1 + 1))) ||
        (UTF_IS_TRAIL(c1) && start1 != s1 && UTF_IS_LEAD(*(s1 - 1)))
        ) {
            /* part of a surrogate pair, leave >=d800 */
        } else {
            /* BMP code point - may be surrogate code point - make <d800 */
            c1 -= 0x2800;
        }

        if (
        (c2 <= 0xdbff && (s2 + 1) != limit2 && UTF_IS_TRAIL(*(s2 + 1))) ||
        (UTF_IS_TRAIL(c2) && start2 != s2 && UTF_IS_LEAD(*(s2 - 1)))
        ) {
            /* part of a surrogate pair, leave >=d800 */
        } else {
            /* BMP code point - may be surrogate code point - make <d800 */
            c2 -= 0x2800;
        }
    }

    /* now c1 and c2 are in the requested (code unit or code point) order */
    return (int32_t) c1 - (int32_t) c2;
}
示例#5
0
void Transliterator::_transliterate(Replaceable& text,
                                    UTransPosition& index,
                                    const UnicodeString* insertion,
                                    UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return;
    }

    if (!positionIsValid(index, text.length())) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

//    int32_t originalStart = index.contextStart;
    if (insertion != 0) {
        text.handleReplaceBetween(index.limit, index.limit, *insertion);
        index.limit += insertion->length();
        index.contextLimit += insertion->length();
    }

    if (index.limit > 0 &&
        UTF_IS_LEAD(text.charAt(index.limit - 1))) {
        // Oops, there is a dangling lead surrogate in the buffer.
        // This will break most transliterators, since they will
        // assume it is part of a pair.  Don't transliterate until
        // more text comes in.
        return;
    }

    filteredTransliterate(text, index, TRUE, TRUE);

#if 0
    // TODO
    // I CAN'T DO what I'm attempting below now that the Kleene star
    // operator is supported.  For example, in the rule

    //   ([:Lu:]+) { x } > $1;

    // what is the maximum context length?  getMaximumContextLength()
    // will return 1, but this is just the length of the ante context
    // part of the pattern string -- 1 character, which is a standin
    // for a Quantifier, which contains a StringMatcher, which
    // contains a UnicodeSet.

    // There is a complicated way to make this work again, and that's
    // to add a "maximum left context" protocol into the
    // UnicodeMatcher hierarchy.  At present I'm not convinced this is
    // worth it.

    // ---

    // The purpose of the code below is to keep the context small
    // while doing incremental transliteration.  When part of the left
    // context (between contextStart and start) is no longer needed,
    // we try to advance contextStart past that portion.  We use the
    // maximum context length to do so.
    int32_t newCS = index.start;
    int32_t n = getMaximumContextLength();
    while (newCS > originalStart && n-- > 0) {
        --newCS;
        newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
    }
    index.contextStart = uprv_max(newCS, originalStart);
#endif
}
示例#6
0
static void
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                             UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    int32_t prev, c, diff;

    int32_t sourceIndex, nextSourceIndex;

U_ALIGN_CODE(16)

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev=(int32_t)cnv->fromUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    diff=(int32_t)(sourceLimit-source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(uint8_t)c;
            *offsets++=nextSourceIndex++;
            ++source;
            --targetCapacity;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=nextSourceIndex++;
                ++source;
                --targetCapacity;
            } else {
                break;
            }
        }
    }
    /* restore real values */
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;
            ++nextSourceIndex;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++=(uint8_t)c;
                *offsets++=sourceIndex;
                --targetCapacity;

                sourceIndex=nextSourceIndex;
                continue;
            }

            if(UTF_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(UTF_IS_SECOND_SURROGATE(trail)) {
                        ++source;
                        ++nextSourceIndex;
                        c=UTF16_GET_PAIR_VALUE(c, trail);
                    }
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=sourceIndex;
                --targetCapacity;
                sourceIndex=nextSourceIndex;
                if(c<0x3000) {
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                *target++=(uint8_t)diff;
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                *offsets++=sourceIndex;
                *offsets++=sourceIndex;
                targetCapacity-=2;
                sourceIndex=nextSourceIndex;
            } else {
                int32_t length; /* will be 2..4 */

                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(diff>>24);
                        *offsets++=sourceIndex;
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    /* case 1: handled above */
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                    sourceIndex=nextSourceIndex;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++=(uint8_t)(diff>>16);
                    case 2:
                        *charErrorBuffer++=(uint8_t)(diff>>8);
                    case 1:
                        *charErrorBuffer=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength=(int8_t)length;

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    case 1:
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
示例#7
0
U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 *dest,
             int32_t  destCapacity,
             int32_t  *pDestLength,
             const UChar *src,
             int32_t  srcLength,
             UErrorCode *pErrorCode)
{
    const UChar* pSrc = src;
    const UChar* pSrcLimit;
    int32_t reqLength=0;
    uint32_t ch=0;
    uint32_t *pDest = (uint32_t *)dest;
    uint32_t *pDestLimit = pDest + destCapacity;
    UChar ch2=0;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }


    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    if(srcLength==-1) {
        while((ch=*pSrc)!=0 && pDest!=pDestLimit) {
            ++pSrc;
            /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
            if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                ch=UTF16_GET_PAIR_VALUE(ch, ch2);
            }
            *(pDest++)= ch;
        }
        while((ch=*pSrc++)!=0) {
            if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
            }
            ++reqLength;
        }
    } else {
        pSrcLimit = pSrc+srcLength;
        while(pSrc<pSrcLimit && pDest<pDestLimit) {
            ch=*pSrc++;
            if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                ch=UTF16_GET_PAIR_VALUE(ch, ch2);
            }
            *(pDest++)= ch;
        }
        while(pSrc!=pSrcLimit) {
            ch=*pSrc++;
            if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
            }
            ++reqLength;
        }
    }

    reqLength+=(int32_t)(pDest - (uint32_t *)dest);
    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChar32s(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}