static void TestNextPrevCharUnsafe() { /* * Use a (mostly) well-formed UTF-8 string and test at code point boundaries. * The behavior of _UNSAFE macros for ill-formed strings is undefined. */ static const uint8_t input[]={ 0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, /* non-shortest form */ 0xe2, 0x82, 0xac, 0xc2, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, 0x00 }; static const UChar32 codePoints[]={ 0x61, 0x10401, 0, 0x20ac, 0xa1, 0x10ffff, 0 }; UChar32 c; int32_t i; uint32_t offset; for(i=0, offset=0; offset<sizeof(input); ++i) { UTF8_NEXT_CHAR_UNSAFE(input, offset, c); if(c != codePoints[i]){ log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, codePoints[i], c); } } for(i=0, offset=0; offset<sizeof(input); ++i) { U8_NEXT_UNSAFE(input, offset, c); if(c != codePoints[i]){ log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, codePoints[i], c); } } for(i=LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ UTF8_PREV_CHAR_UNSAFE(input, offset, c); if(c != codePoints[i]){ log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, codePoints[i], c); } } for(i=LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){ U8_PREV_UNSAFE(input, offset, c); if(c != codePoints[i]){ log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, codePoints[i], c); } } }
/*!	Decodes one code point from the UTF-8 text at \c *in using
	U8_NEXT_UNSAFE and moves \c *in forward by the number of bytes the
	macro consumed.

	\param in Address of the read pointer; updated in place.
	\return The decoded code point.
*/
uint32
BUnicodeChar::FromUTF8(const char **in)
{
	uint32 codePoint = 0;
	int consumed = 0;

	U8_NEXT_UNSAFE(*in, consumed, codePoint);

	// Step the caller's pointer over the bytes that were just decoded.
	*in += consumed;
	return codePoint;
}
/** Convert character vector to UTF-32
 *
 * Every non-NA string is decoded into an integer vector holding one code
 * point per element; the list slot of an NA string is left untouched.
 *
 * @param str character vector
 * @return list with integer vectors
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_enc_toutf32(SEXP str)
{
   str = stri_prepare_arg_string(str, "str");
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF8 str_cont(str, n);

   // Pass 1: find the longest non-NA string (in bytes).
   R_len_t longest = 0;
   for (R_len_t idx = 0; idx < n; ++idx) {
      if (!str_cont.isNA(idx)) {
         R_len_t curLen = str_cont.get(idx).length();
         longest = (curLen > longest) ? curLen : longest;
      }
   }

   // One shared scratch buffer sized by byte count: a string never decodes
   // to more code points than it has bytes, so this is at most 4 times
   // too large.
   R_len_t capacity = longest + 1;
   int* scratch = (int*)R_alloc(capacity, (int)sizeof(int));

   SEXP result = PROTECT(Rf_allocVector(VECSXP, n));

   // Pass 2: decode each string into the scratch buffer, then copy the
   // used prefix into a freshly allocated integer vector.
   for (R_len_t idx = str_cont.vectorize_init();
         idx != str_cont.vectorize_end();
         idx = str_cont.vectorize_next(idx))
   {
      if (!str_cont.isNA(idx)) {
         const char* bytes = str_cont.get(idx).c_str();
         R_len_t byteLen = str_cont.get(idx).length();

         UChar32 cp;
         R_len_t bytePos = 0;
         R_len_t count = 0;
         while (bytePos < byteLen) {
            U8_NEXT_UNSAFE(bytes, bytePos, cp);
            scratch[count] = (int)cp;
            ++count;
         }

         SEXP codes = PROTECT(Rf_allocVector(INTSXP, count));
         memcpy(INTEGER(codes), scratch, (size_t)sizeof(int)*count);
         SET_VECTOR_ELT(result, idx, codes);
         UNPROTECT(1);
      }
      // else: NA input, leave NULL
   }

   UNPROTECT(1);
   return result;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/*
 * Decodes INPUT with U8_NEXT_UNSAFE, PASSES times over. Each pass restarts
 * at byte index 0 and stops at the first decoded code point equal to 0.
 * The decoded values are discarded (presumably this is a timing loop --
 * confirm against the build that defines PASSES and INPUT).
 *
 * The pass counter is named `pass` so that it no longer collides with the
 * byte index `i` declared inside the loop body: redeclaring the for-init
 * variable in the loop's outermost block is ill-formed in C++ and is
 * confusing shadowing in C.
 */
int main() {
  for (int pass = 0; pass < PASSES; ++pass) {
    const char *p = INPUT;
    UChar32 u;
    int i = 0;  /* byte index into p, advanced by the macro */
    while (1) {
      U8_NEXT_UNSAFE(p, i, u);
      if (u == 0) {
        break;
      }
    }
  }
  return 0;
}
/*
 * Decodes the same byte string with three macro families -- U8_*_UNSAFE,
 * U8_* and L8_* -- first forward and then backward, and checks that:
 *   - the unsafe macro yields the code points listed in cp[],
 *   - U8_NEXT()/U8_PREV() yield a negative value where the unsafe macro
 *     yields a surrogate, and the same value otherwise,
 *   - L8_NEXT()/L8_PREV() yield the same value as the unsafe macro,
 *   - all three move the index to the same position.
 * Failures are reported through log_err().
 */
static void TestSurrogates() {
    static const uint8_t b[]={
        0xc3, 0x9f,             /*  00DF */
        0xed, 0x9f, 0xbf,       /*  D7FF */
        0xed, 0xa0, 0x81,       /*  D801 */
        0xed, 0xbf, 0xbe,       /*  DFFE */
        0xee, 0x80, 0x80,       /*  E000 */
        0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */
    };
    static const UChar32 cp[]={
        0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe
    };

    UChar32 unsafeCp, strictCp, lenientCp;
    int32_t pos, unsafePos, strictPos, lenientPos;
    int32_t length=LENGTHOF(b);
    int32_t cpIndex=0;  /* index into cp[] */
    int strictMismatch;

    /* Forward pass: every macro starts from the same position. */
    pos=0;
    while(pos<length) {
        unsafePos=pos;
        U8_NEXT_UNSAFE(b, unsafePos, unsafeCp);
        strictPos=pos;
        U8_NEXT(b, strictPos, length, strictCp);
        lenientPos=pos;
        L8_NEXT(b, lenientPos, length, lenientCp);

        if(unsafeCp!=cp[cpIndex]) {
            log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)pos, (long)unsafeCp, (long)cp[cpIndex]);
        }

        /* U8_NEXT() returns <0 for surrogate code points */
        if(U_IS_SURROGATE(unsafeCp)) {
            strictMismatch= strictCp>=0;
        } else {
            strictMismatch= strictCp!=unsafeCp;
        }
        if(strictMismatch) {
            log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)pos, (long)strictCp, (long)unsafeCp);
        }

        /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
        if(lenientCp!=unsafeCp) {
            log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)pos, (long)lenientCp, (long)unsafeCp);
        }

        if(strictPos!=unsafePos || lenientPos!=unsafePos) {
            log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)pos, (long)pos);
        }

        ++cpIndex;      /* next code point */
        pos=unsafePos;  /* advance by one UTF-8 sequence */
    }

    /* Backward pass: continues from where the forward pass stopped. */
    while(pos>0) {
        --cpIndex;  /* previous code point */

        unsafePos=pos;
        U8_PREV_UNSAFE(b, unsafePos, unsafeCp);
        strictPos=pos;
        U8_PREV(b, 0, strictPos, strictCp);
        lenientPos=pos;
        L8_PREV(b, 0, lenientPos, lenientCp);

        if(unsafeCp!=cp[cpIndex]) {
            log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)pos, (long)unsafeCp, (long)cp[cpIndex]);
        }

        /* U8_PREV() returns <0 for surrogate code points */
        if(U_IS_SURROGATE(unsafeCp)) {
            strictMismatch= strictCp>=0;
        } else {
            strictMismatch= strictCp!=unsafeCp;
        }
        if(strictMismatch) {
            log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)pos, (long)strictCp, (long)unsafeCp);
        }

        /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
        if(lenientCp!=unsafeCp) {
            log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)pos, (long)lenientCp, (long)unsafeCp);
        }

        if(strictPos!=unsafePos || lenientPos!=unsafePos) {
            log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)pos, (long)pos);
        }

        pos=unsafePos;  /* go back by one UTF-8 sequence */
    }
}
static void TestNextPrevChar(){ static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00}; static const UChar32 result[]={ /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/ 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000, 0x10401, 0x10401, 0x10401, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841410, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1050, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x2841, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x61, 0x61, 0x80, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xc2, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x77e, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0xbe, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xfd, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xa1, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x00, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x61, 0x61, 0x61, 0xc0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x81, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x10401, 0x10401, 0x10401, 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF_ERROR_VALUE, UTF_ERROR_VALUE, 0x90, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x410, UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2, 0x0840, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0xf0, UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1, 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061 }; static const int32_t movedOffset[]={ /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/ 1, 1, 1, 15, 15, 15, 5, 5, 5, 14, 14 , 14, 3, 3, 3, 9, 13, 13, 4, 4, 4, 9, 12, 12, 5, 5, 5, 9, 11, 11, 7, 7, 7, 10, 10, 10, 7, 7, 7, 9, 9, 9, 8, 9, 9, 7, 7, 7, 9, 9, 9, 7, 7, 7, 11, 10, 10, 5, 5, 5, 11, 11, 11, 5, 5, 5, 12, 12, 12, 1, 1, 1, 13, 13, 13, 1, 1, 1, 14, 14, 14, 1, 1, 1, 14, 15, 15, 1, 1, 1, 14, 16, 
16, 0, 0, 0, }; UChar32 c=0x0000; uint32_t i=0; uint32_t offset=0; int32_t setOffset=0; for(offset=0; offset<sizeof(input); offset++){ if (offset < sizeof(input) - 2) { /* Can't have it go off the end of the array based on input */ setOffset=offset; UTF8_NEXT_CHAR_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i]){ log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i], setOffset); } if(c != result[i]){ log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); } setOffset=offset; U8_NEXT_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i]){ log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i], setOffset); } if(c != result[i]){ log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); } } setOffset=offset; UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE); if(setOffset != movedOffset[i+1]){ log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } if(c != result[i+1]){ log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } setOffset=offset; U8_NEXT(input, setOffset, sizeof(input), c); if(setOffset != movedOffset[i+1]){ log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){ log_err("ERROR: U8_NEXT failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+1], c); } setOffset=offset; UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE); if(setOffset != movedOffset[i+1]){ log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+2], setOffset); } if(c != result[i+2]){ log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c); } i=i+6; } i=0; for(offset=sizeof(input); offset > 0; --offset){ setOffset=offset; UTF8_PREV_CHAR_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i+3]){ log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+3], setOffset); } if(c != result[i+3]){ log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c); } setOffset=offset; UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); if(setOffset != movedOffset[i+4]){ log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } if(c != result[i+4]){ log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); } setOffset=offset; U8_PREV(input, 0, setOffset, c); if(setOffset != movedOffset[i+4]){ log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){ log_err("ERROR: U8_PREV failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+4], c); } setOffset=offset; UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); if(setOffset != movedOffset[i+5]){ log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+5], setOffset); } if(c != result[i+5]){ log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c); } i=i+6; } { /* test non-characters */ static const uint8_t nonChars[]={ 0xef, 0xb7, 0x90, /* U+fdd0 */ 0xef, 0xbf, 0xbf, /* U+feff */ 0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */ 0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */ 0xf4, 0x8f, 0xbf, 0xbe /* U+10fffe */ }; UChar32 ch; int32_t idx; for(idx=0; idx<(int32_t)sizeof(nonChars);) { U8_NEXT(nonChars, idx, sizeof(nonChars), ch); if(!U_IS_UNICODE_NONCHAR(ch)) { log_err("U8_NEXT(before %d) failed to read a non-character\n", idx); } } for(idx=(int32_t)sizeof(nonChars); idx>0;) { U8_PREV(nonChars, 0, idx, ch); if(!U_IS_UNICODE_NONCHAR(ch)) { log_err("U8_PREV(at %d) failed to read a non-character\n", idx); } } } }
/* ** Compare two UTF-8 strings for equality where the first string is ** a "LIKE" expression. Return true (1) if they are the same and ** false (0) if they are different. */ static int icuLikeCompare( const uint8_t *zPattern, /* LIKE pattern */ const uint8_t *zString, /* The UTF-8 string to compare against */ const UChar32 uEsc /* The escape character */ ){ static const int MATCH_ONE = (UChar32)'_'; static const int MATCH_ALL = (UChar32)'%'; int iPattern = 0; /* Current byte index in zPattern */ int iString = 0; /* Current byte index in zString */ int prevEscape = 0; /* True if the previous character was uEsc */ while( zPattern[iPattern]!=0 ){ /* Read (and consume) the next character from the input pattern. */ UChar32 uPattern; U8_NEXT_UNSAFE(zPattern, iPattern, uPattern); assert(uPattern!=0); /* There are now 4 possibilities: ** ** 1. uPattern is an unescaped match-all character "%", ** 2. uPattern is an unescaped match-one character "_", ** 3. uPattern is an unescaped escape character, or ** 4. uPattern is to be handled as an ordinary character */ if( !prevEscape && uPattern==MATCH_ALL ){ /* Case 1. */ uint8_t c; /* Skip any MATCH_ALL or MATCH_ONE characters that follow a ** MATCH_ALL. For each MATCH_ONE, skip one character in the ** test string. */ while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){ if( c==MATCH_ONE ){ if( zString[iString]==0 ) return 0; U8_FWD_1_UNSAFE(zString, iString); } iPattern++; } if( zPattern[iPattern]==0 ) return 1; while( zString[iString] ){ if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){ return 1; } U8_FWD_1_UNSAFE(zString, iString); } return 0; }else if( !prevEscape && uPattern==MATCH_ONE ){ /* Case 2. */ if( zString[iString]==0 ) return 0; U8_FWD_1_UNSAFE(zString, iString); }else if( !prevEscape && uPattern==uEsc){ /* Case 3. */ prevEscape = 1; }else{ /* Case 4. 
*/ UChar32 uString; U8_NEXT_UNSAFE(zString, iString, uString); uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT); uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT); if( uString!=uPattern ){ return 0; } prevEscape = 0; } } return zString[iString]==0; }