Exemplo n.º 1
0
/*
 * Verify that U8_NEXT() and U8_PREV() decode UTF-8 encoded noncharacters
 * to their code points instead of rejecting them: every code point read
 * from the table below, in either direction, must satisfy
 * U_IS_UNICODE_NONCHAR().
 */
static void TestNextPrevNonCharacters() {
    /* well-formed UTF-8 for a handful of noncharacters */
    static const uint8_t nonChars[]={
        0xef, 0xb7, 0x90,       /* U+fdd0 */
        0xef, 0xbf, 0xbf,       /* U+ffff */
        0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
        0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
        0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
    };
    const int32_t total=(int32_t)sizeof(nonChars);

    UChar32 cp;
    int32_t pos;

    /* forward pass: U8_NEXT() advances pos past each sequence */
    pos=0;
    while(pos<total) {
        U8_NEXT(nonChars, pos, sizeof(nonChars), cp);
        if(U_IS_UNICODE_NONCHAR(cp)) {
            continue;
        }
        log_err("U8_NEXT(before %d) failed to read a non-character\n", pos);
    }

    /* backward pass: U8_PREV() moves pos back to the start of each sequence */
    pos=total;
    while(pos>0) {
        U8_PREV(nonChars, 0, pos, cp);
        if(U_IS_UNICODE_NONCHAR(cp)) {
            continue;
        }
        log_err("U8_PREV(at %d) failed to read a non-character\n", pos);
    }
}
Exemplo n.º 2
0
/*
 * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
 * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
 *
 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
 *
 * The "strict" parameter controls the error behavior:
 * <0  "Safe" behavior of U8_NEXT():
 *     -1: All illegal byte sequences yield U_SENTINEL=-1.
 *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
 *         Some implementations use this for roundtripping of
 *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
 *         contain unpaired surrogates.
 *     -3: All illegal byte sequences yield U+FFFD.
 *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
 *     All illegal byte sequences yield a positive code point such that this
 *     result code point would be encoded with the same number of bytes as
 *     the illegal sequence.
 * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
 *     Same as the obsolete "safe" behavior, but non-characters are also treated
 *     like illegal sequences.
 *
 * Note that a UBool is the same as an int8_t.
 *
 * Parameters:
 *   s       The byte array being decoded.
 *   pi      In/out index into s. On entry it addresses the byte that is read
 *           as the first trail byte (c is used as the lead byte, so it has
 *           already been consumed by the caller). On return it is advanced
 *           past the trail bytes that were accepted, or, on error, past the
 *           trail bytes that were skipped.
 *   length  Limit index for s, or negative for a NUL-terminated string.
 *   c       The lead byte of the sequence.
 *   strict  Error-behavior selector, see above.
 *
 * Returns the decoded code point, or the value produced by errorValue()
 * for the number of skipped trail bytes and the given strict mode.
 */
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
    int32_t i=*pi;
    /* number of trail bytes announced by the lead byte */
    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */
    /* only decode if all announced trail bytes fit, or if the string is NUL-terminated */
    if(i+count<=length || length<0) {
        uint8_t trail;

        /* strip the length marker bits from the lead byte, leaving its payload bits in c */
        U8_MASK_LEAD_BYTE(c, count);
        /* support NUL-terminated strings: do not read beyond the first non-trail byte */
        switch(count) {
        /* each branch falls through to the next one */
        case 0:
            /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
        case 5:
        case 4:
            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
            break;
        case 3:
            /* a valid trail byte 0x80..0xbf becomes 0..0x3f; anything else wraps to >0x3f */
            trail=s[i++]-0x80;
            c=(c<<6)|trail;
            /* c>=0x110 would result in code point>0x10ffff, outside Unicode */
            if(c>=0x110 || trail>0x3f) { break; }
            /* fall through */
        case 2:
            trail=s[i++]-0x80;
            c=(c<<6)|trail;
            /*
             * test for a surrogate d800..dfff unless we are lenient:
             * before the last (c<<6), a surrogate is c=360..37f
             */
            if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; }
            /* fall through */
        case 1:
            trail=s[i++]-0x80;
            c=(c<<6)|trail;
            if(trail>0x3f) { break; }
            /* correct sequence - all trail bytes have (b7..b6)==(10) */
            /*
             * NOTE(review): utf8_minLegal is defined outside this block; presumably it
             * holds the smallest code point allowed for each trail byte count, so that
             * this comparison rejects overlong encodings - confirm against its definition.
             */
            if(c>=utf8_minLegal[count] &&
                    /* strict: forbid non-characters like U+fffe */
                    (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) {
                *pi=i;
                return c;
            }
            /* otherwise drop out of the switch into the error handling below */
        /* no default branch to optimize switch()  - all values are covered */
        }
    } else {
        /* too few bytes left */
        /* limit the skipping below to the bytes that remain before length */
        count=length-i;
    }

    /* error handling */
    /* restart at the original index and skip at most count trail bytes */
    i=*pi;
    while(count>0 && U8_IS_TRAIL(s[i])) {
        ++i;
        --count;
    }
    /* the error result is derived from the number of trail bytes skipped and the strict mode */
    c=errorValue(i-*pi, strict);
    *pi=i;
    return c;
}
Exemplo n.º 3
0
/*
 * Store an "age" version for code point c into age[0..3]:
 * - if u_isdefined(c) is true, whatever u_charAge() reports for c;
 * - otherwise, for a noncharacter, the version 0.0.0.1;
 * - otherwise all zeros.
 */
static void
getAgeIfAssigned(UChar32 c, UVersionInfo age) {
    if(u_isdefined(c)) {
        u_charAge(c, age);
        return;
    }

    /* not defined: start from 0.0.0.0 ... */
    memset(age, 0, 4);
    if(U_IS_UNICODE_NONCHAR(c)) {
        /* ... and mark noncharacters with 0.0.0.1 */
        age[3]=1;
    }
}
Exemplo n.º 4
0
/*
 * Exercise the UTF-8 "next" and "prev" macros on a byte string that mixes
 * well-formed and ill-formed sequences.
 *
 * result[] and movedOffset[] hold six columns per step:
 *   [0] unsafe next, [1] safe non-strict next, [2] safe strict next,
 *   [3] unsafe prev, [4] safe non-strict prev, [5] safe strict prev.
 * The forward loop consumes one row per start offset 0..sizeof(input)-1;
 * the backward loop consumes one row per start offset sizeof(input)..1.
 *
 * U8_NEXT()/U8_PREV() share the non-strict columns: where the expected
 * value is an error value (UTF_IS_ERROR), they only have to return a
 * negative code point.
 *
 * A final section checks that U8_NEXT()/U8_PREV() read noncharacters.
 */
static void TestNextPrevChar(){
    static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
    static const UChar32 result[]={
    /*next_unsafe    next_safe_ns        next_safe_s          prev_unsafe   prev_safe_ns         prev_safe_s*/
        0x0061,        0x0061,             0x0061,              0x0000,       0x0000,             0x0000,
        0x10401,       0x10401,            0x10401,             0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841410,    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xa1050,      UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841,       UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x00,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x61,         0x61,               0x61,
        0x80,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xc2,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0xfd,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x77e,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
        0xbe,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xfd,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0xa1,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x00,         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
        0x61,          0x61,               0x61,                0xc0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,      0x10401,            0x10401,
        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
        0x0840,        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x0000,        0x0000,             0x0000,              0x0061,       0x0061,             0x0061
    };
    static const int32_t movedOffset[]={
   /*next_unsafe    next_safe_ns  next_safe_s       prev_unsafe   prev_safe_ns     prev_safe_s*/
        1,            1,           1,                15,           15,               15,
        5,            5,           5,                14,           14 ,              14,
        3,            3,           3,                9,            13,               13,
        4,            4,           4,                9,            12,               12,
        5,            5,           5,                9,            11,               11,
        7,            7,           7,                10,           10,               10,
        7,            7,           7,                9,            9,                9,
        8,            9,           9,                7,            7,                7,
        9,            9,           9,                7,            7,                7,
        11,           10,          10,               5,            5,                5,
        11,           11,          11,               5,            5,                5,
        12,           12,          12,               1,            1,                1,
        13,           13,          13,               1,            1,                1,
        14,           14,          14,               1,            1,                1,
        14,           15,          15,               1,            1,                1,
        14,           16,          16,               0,            0,                0,


    };


    UChar32 c=0x0000;
    uint32_t i=0;           /* index of the current row's first column in result[]/movedOffset[] */
    uint32_t offset=0;      /* start offset into input[] for the current step */
    int32_t setOffset=0;    /* working offset handed to (and modified by) the macros */

    /* forward iteration: columns 0..2 of each row */
    for(offset=0; offset<sizeof(input); offset++){
         if (offset < sizeof(input) - 2) { /* Can't have it go off the end of the array based on input */
             setOffset=offset;
             UTF8_NEXT_CHAR_UNSAFE(input, setOffset, c);
             if(setOffset != movedOffset[i]){
                 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                     offset, movedOffset[i], setOffset);
             }
             if(c != result[i]){
                 /* 32-bit arguments: use %d/%x, not %ld/%lx */
                 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%d. Expected:%x Got:%x\n", offset, result[i], c);
             }

             setOffset=offset;
             U8_NEXT_UNSAFE(input, setOffset, c);
             if(setOffset != movedOffset[i]){
                 log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                     offset, movedOffset[i], setOffset);
             }
             if(c != result[i]){
                 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%d. Expected:%x Got:%x\n", offset, result[i], c);
             }
         }

         setOffset=offset;
         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         }
         if(c != result[i+1]){
             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%d. Expected:%x Got:%x\n", offset, result[i+1], c);
         }

         setOffset=offset;
         U8_NEXT(input, setOffset, sizeof(input), c);
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         }
         /* where the obsolete macro yields an error value, U8_NEXT must yield a negative value */
         if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
             log_err("ERROR: U8_NEXT failed for input=%d. Expected:%x Got:%x\n", offset, result[i+1], c);
         }

         setOffset=offset;
         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
         /* compare against the strict column, the same one that is reported on failure */
         if(setOffset != movedOffset[i+2]){
             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+2], setOffset);
         }
         if(c != result[i+2]){
             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%d. Expected:%x Got:%x\n", offset, result[i+2], c);
         }

         i=i+6;
    }

    /* backward iteration: columns 3..5 of each row, starting again at the first row */
    i=0;
    for(offset=sizeof(input); offset > 0; --offset){
         setOffset=offset;
         UTF8_PREV_CHAR_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i+3]){
             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+3], setOffset);
         }
         if(c != result[i+3]){
             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%d. Expected:%x Got:%x\n", offset, result[i+3], c);
         }

         setOffset=offset;
         UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
         if(setOffset != movedOffset[i+4]){
             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         }
         if(c != result[i+4]){
             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%d. Expected:%x Got:%x\n", offset, result[i+4], c);
         }

         setOffset=offset;
         U8_PREV(input, 0, setOffset, c);
         if(setOffset != movedOffset[i+4]){
             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         }
         /* where the obsolete macro yields an error value, U8_PREV must yield a negative value */
         if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
             log_err("ERROR: U8_PREV failed for input=%d. Expected:%x Got:%x\n", offset, result[i+4], c);
         }

         setOffset=offset;
         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
         if(setOffset != movedOffset[i+5]){
             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+5], setOffset);
         }
         if(c != result[i+5]){
             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%d. Expected:%x Got:%x\n", offset, result[i+5], c);
         }

         i=i+6;
    }

    {
        /* test non-characters */
        static const uint8_t nonChars[]={
            0xef, 0xb7, 0x90,       /* U+fdd0 */
            0xef, 0xbf, 0xbf,       /* U+ffff */
            0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
            0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
            0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */
        };

        UChar32 ch;
        int32_t idx;

        /* the macros advance/retreat idx themselves, hence no loop increment */
        for(idx=0; idx<(int32_t)sizeof(nonChars);) {
            U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
            if(!U_IS_UNICODE_NONCHAR(ch)) {
                log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
            }
        }
        for(idx=(int32_t)sizeof(nonChars); idx>0;) {
            U8_PREV(nonChars, 0, idx, ch);
            if(!U_IS_UNICODE_NONCHAR(ch)) {
                log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
            }
        }
    }
}