Exemple #1
0
static void
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                             UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    int32_t prev, c, diff;

    int32_t sourceIndex, nextSourceIndex;

U_ALIGN_CODE(16)

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev=(int32_t)cnv->fromUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    diff=(int32_t)(sourceLimit-source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(uint8_t)c;
            *offsets++=nextSourceIndex++;
            ++source;
            --targetCapacity;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=nextSourceIndex++;
                ++source;
                --targetCapacity;
            } else {
                break;
            }
        }
    }
    /* restore real values */
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;
            ++nextSourceIndex;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++=(uint8_t)c;
                *offsets++=sourceIndex;
                --targetCapacity;

                sourceIndex=nextSourceIndex;
                continue;
            }

            if(UTF_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(UTF_IS_SECOND_SURROGATE(trail)) {
                        ++source;
                        ++nextSourceIndex;
                        c=UTF16_GET_PAIR_VALUE(c, trail);
                    }
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=sourceIndex;
                --targetCapacity;
                sourceIndex=nextSourceIndex;
                if(c<0x3000) {
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                *target++=(uint8_t)diff;
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                *offsets++=sourceIndex;
                *offsets++=sourceIndex;
                targetCapacity-=2;
                sourceIndex=nextSourceIndex;
            } else {
                int32_t length; /* will be 2..4 */

                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(diff>>24);
                        *offsets++=sourceIndex;
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    /* case 1: handled above */
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                    sourceIndex=nextSourceIndex;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++=(uint8_t)(diff>>16);
                    case 2:
                        *charErrorBuffer++=(uint8_t)(diff>>8);
                    case 1:
                        *charErrorBuffer=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength=(int8_t)length;

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    case 1:
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
Exemple #2
0
/* Parse a single escape sequence.  Although this method deals in
 * UChars, it does not use C++ or UnicodeString.  This allows it to
 * be used from C contexts. */
U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,
             int32_t* offset,
             int32_t length,
             void* context) {

    int32_t start = *offset;
    UChar c;
    UChar32 result = 0;
    int8_t n = 0;
    int8_t minDig = 0;
    int8_t maxDig = 0;
    int8_t bitsPerDigit = 4;
    int8_t dig;
    int32_t i;
    UBool braces = FALSE;

    /* Check that offset is in range */
    if (*offset < 0 || *offset >= length) {
        goto err;
    }

    /* Fetch first UChar after '\\' */
    c = charAt((*offset)++, context);

    /* Convert hexadecimal and octal escapes */
    switch (c) {
        case 0x0075 /*'u'*/:
            minDig = maxDig = 4;
            break;
        case 0x0055 /*'U'*/:
            minDig = maxDig = 8;
            break;
        case 0x0078 /*'x'*/:
            minDig = 1;
            if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
                ++(*offset);
                braces = TRUE;
                maxDig = 8;
            } else {
                maxDig = 2;
            }
            break;
        default:
            dig = _digit8(c);
            if (dig >= 0) {
                minDig = 1;
                maxDig = 3;
                n = 1; /* Already have first octal digit */
                bitsPerDigit = 3;
                result = dig;
            }
            break;
    }
    if (minDig != 0) {
        while (*offset < length && n < maxDig) {
            c = charAt(*offset, context);
            dig = (int8_t) ((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
            if (dig < 0) {
                break;
            }
            result = (result << bitsPerDigit) | dig;
            ++(*offset);
            ++n;
        }
        if (n < minDig) {
            goto err;
        }
        if (braces) {
            if (c != 0x7D /*}*/) {
                goto err;
            }
            ++(*offset);
        }
        if (result < 0 || result >= 0x110000) {
            goto err;
        }
        /* If an escape sequence specifies a lead surrogate, see if
         * there is a trail surrogate after it, either as an escape or
         * as a literal.  If so, join them up into a supplementary.
         */
        if (*offset < length && U16_IS_LEAD(result)) {
            int32_t ahead = *offset + 1;
            c = charAt(*offset, context);
            if (c == 0x5C /*'\\'*/ && ahead < length) {
                c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
            }
            if (U16_IS_TRAIL(c)) {
                *offset = ahead;
                result = U16_GET_SUPPLEMENTARY(result, c);
            }
        }
        return result;
    }

    /* Convert C-style escapes in table */
    for (i = 0; i < UNESCAPE_MAP_LENGTH; i += 2) {
        if (c == UNESCAPE_MAP[i]) {
            return UNESCAPE_MAP[i + 1];
        } else if (c < UNESCAPE_MAP[i]) {
            break;
        }
    }

    /* Map \cX to control-X: X & 0x1F */
    if (c == 0x0063 /*'c'*/ && *offset < length) {
        c = charAt((*offset)++, context);
        if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
            UChar c2 = charAt(*offset, context);
            if (UTF_IS_SECOND_SURROGATE(c2)) {
                ++(*offset);
                c = (UChar) UTF16_GET_PAIR_VALUE(c, c2); /* [sic] */
            }
        }
        return 0x1F & c;
    }

    /* If no special forms are recognized, then consider
     * the backslash to generically escape the next character.
     * Deal with surrogate pairs. */
    if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
        UChar c2 = charAt(*offset, context);
        if (UTF_IS_SECOND_SURROGATE(c2)) {
            ++(*offset);
            return UTF16_GET_PAIR_VALUE(c, c2);
        }
    }
    return c;

    err:
    /* Invalid escape sequence */
    *offset = start; /* Reset to initial value */
    return (UChar32) 0xFFFFFFFF;
}
Exemple #3
0
static void
UConverter_fromUnicode_CompoundText_OFFSETS(UConverterFromUnicodeArgs* args, UErrorCode* err){
    UConverter *cnv = args->converter;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    /* int32_t* offsets = args->offsets; */
    UChar32 sourceChar;
    UBool useFallback = cnv->useFallback;
    uint8_t tmpTargetBuffer[7];
    int32_t tmpTargetBufferLength = 0;
    COMPOUND_TEXT_CONVERTERS currentState, tmpState;
    uint32_t pValue;
    int32_t pValueLength = 0;
    int32_t i, n;

    UConverterDataCompoundText *myConverterData = (UConverterDataCompoundText *) cnv->extraInfo;

    currentState = myConverterData->state;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        goto getTrail;
    }

    while( source < sourceLimit){
        if(target < targetLimit){

            sourceChar  = *(source++);
            /*check if the char is a First surrogate*/
             if(UTF_IS_SURROGATE(sourceChar)) {
                if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(UTF_IS_SECOND_SURROGATE(trail)) {
                            source++;
                            sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
                            cnv->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            cnv->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input */
                        cnv->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

             tmpTargetBufferLength = 0;
             tmpState = getState(sourceChar);

             if (tmpState != DO_SEARCH && currentState != tmpState) {
                 /* Get escape sequence if necessary */
                 currentState = tmpState;
                 for (i = 0; escSeqCompoundText[currentState][i] != 0; i++) {
                     tmpTargetBuffer[tmpTargetBufferLength++] = escSeqCompoundText[currentState][i];
                 }
             }

             if (tmpState == DO_SEARCH) {
                 /* Test all available converters */
                 for (i = 1; i < SEARCH_LENGTH; i++) {
                     pValueLength = ucnv_MBCSFromUChar32(myConverterData->myConverterArray[i], sourceChar, &pValue, useFallback);
                     if (pValueLength > 0) {
                         tmpState = (COMPOUND_TEXT_CONVERTERS)i;
                         if (currentState != tmpState) {
                             currentState = tmpState;
                             for (i = 0; escSeqCompoundText[currentState][i] != 0; i++) {
                                 tmpTargetBuffer[tmpTargetBufferLength++] = escSeqCompoundText[currentState][i];
                             }
                         }
                         for (n = (pValueLength - 1); n >= 0; n--) {
                             tmpTargetBuffer[tmpTargetBufferLength++] = (uint8_t)(pValue >> (n * 8));
                         }
                         break;
                     }
                 }
             } else if (tmpState == COMPOUND_TEXT_SINGLE_0) {