Esempio n. 1
/*
 * Decodes input[] forward and then backward with the unchecked UTF-8 macros
 * (UTF8_NEXT_CHAR_UNSAFE, U8_NEXT_UNSAFE, UTF8_PREV_CHAR_UNSAFE, U8_PREV_UNSAFE)
 * and reports, via log_err(), every decoded value that differs from the
 * corresponding entry of codePoints[].
 *
 * NOTE(review): in this excerpt the delimiters of the comment below, the
 * closing braces and the codePoints[] initializer are not visible; confirm
 * against the complete file before changing any code here.
 */
static void TestNextPrevCharUnsafe() {
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
    static const uint8_t input[]={
        0xf0, 0x90, 0x90, 0x81,  /* UTF-8 for U+10401 */
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,  /* UTF-8 for U+20AC */
        0xc2, 0xa1,  /* UTF-8 for U+00A1 */
        0xf4, 0x8f, 0xbf, 0xbf,  /* UTF-8 for U+10FFFF */
    static const UChar32 codePoints[]={

    UChar32 c;
    int32_t i;
    uint32_t offset;
    /* Forward pass with UTF8_NEXT_CHAR_UNSAFE: offset is the byte index into
       input[], i is the index of the expected code point. */
    for(i=0, offset=0; offset<sizeof(input); ++i) {
        UTF8_NEXT_CHAR_UNSAFE(input, offset, c);
        if(c != codePoints[i]){
            /* NOTE(review): here and in the three log_err() calls below, "%ld"
               and "%lx" are paired with a uint32_t (offset) and UChar32 values;
               if log_err() is printf-like these arguments need casts to
               long / unsigned long -- confirm. */
            log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
                    offset, codePoints[i], c);
    /* Same forward pass with U8_NEXT_UNSAFE. */
    for(i=0, offset=0; offset<sizeof(input); ++i) {
        U8_NEXT_UNSAFE(input, offset, c);
        if(c != codePoints[i]){
            log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
                    offset, codePoints[i], c);

    /* Backward pass with UTF8_PREV_CHAR_UNSAFE: starts at the end of input[]
       and at the last entry of codePoints[], and stops once offset reaches 0. */
    for(i=LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
         UTF8_PREV_CHAR_UNSAFE(input, offset, c);
         if(c != codePoints[i]){
             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
                     offset, codePoints[i], c);
    /* Same backward pass with U8_PREV_UNSAFE. */
    for(i=LENGTHOF(codePoints)-1, offset=sizeof(input); offset > 0; --i){
         U8_PREV_UNSAFE(input, offset, c);
         if(c != codePoints[i]){
             log_err("ERROR: U8_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n",
                     offset, codePoints[i], c);
Esempio n. 2
// Decodes one code point from the UTF-8 text that *in points to, advances
// *in past the bytes that were consumed, and returns the decoded value.
//
// NOTE(review): decoding uses the unchecked U8_NEXT_UNSAFE macro, so *in is
// presumably required to point at well-formed UTF-8 -- confirm with callers.
// NOTE(review): the return type line and the braces of this definition are
// not visible in this excerpt.
BUnicodeChar::FromUTF8(const char **in)
	int i = 0;	// byte offset into *in; advanced by the macro
	uint32 c = 0;	// receives the decoded code point
	U8_NEXT_UNSAFE(*in, i, c);
	*in += i;	// move the caller's pointer past the decoded sequence

	return c;
/** Convert character vector to UTF-32
 * @param str character vector
 * @return list with integer vectors (one vector of code points per string;
 *         elements for NA strings are left NULL)
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
// Implementation outline: one scratch buffer, sized from the longest input
// string, is reused to decode each string; the decoded code points are then
// copied into a freshly allocated integer vector stored in the result list.
//
// NOTE(review): the braces, the opening error-handler macro and the
// UNPROTECT calls that pair with the PROTECT calls below are not visible in
// this excerpt -- confirm against the complete file.
SEXP stri_enc_toutf32(SEXP str)
   str = stri_prepare_arg_string(str, "str");
   R_len_t n = LENGTH(str);

   StriContainerUTF8 str_cont(str, n);

   // Pass 1: find the largest length() among the non-NA strings.
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<n; ++i) {
       if (str_cont.isNA(i)) continue;
       R_len_t ni = str_cont.get(i).length();
       if (ni > bufsize) bufsize = ni;

   // length() is used below as the byte bound of the decode loop, and each
   // decoded code point consumes at least one byte, so a buffer with one int
   // per byte is always large enough.
   bufsize = bufsize + 1; // at most 4 times too large... well, have to be
   int* buf = (int*)R_alloc(bufsize, (int)sizeof(int));

   SEXP ret;
   PROTECT(ret = Rf_allocVector(VECSXP, n));

   // Pass 2: decode each string into buf and copy the result out.
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i)) {

      if (str_cont.isNA(i)) continue; // leave NULL

//      deque<UChar32> chars; // this is slower than using a common, over-sized buf

      UChar32 c;
      const char* s = str_cont.get(i).c_str();
      R_len_t sn = str_cont.get(i).length();
      R_len_t j = 0; // read position in s
      R_len_t k = 0; // number of code points written to buf
      while (j < sn) {
         // NOTE(review): unchecked decoding; presumably s is already known to
         // be valid UTF-8 at this point -- confirm.
         U8_NEXT_UNSAFE(s, j, c);
         buf[k++] = (int)c;
//         chars.push_back(c);

      SEXP conv;
      PROTECT(conv = Rf_allocVector(INTSXP, k /*chars.size()*/));
      memcpy(INTEGER(conv), buf, (size_t)sizeof(int)*k);
//      for (deque<UChar32>::iterator it = chars.begin(); it != chars.end(); ++it)
//         *(conv_tab++) = (int)*it;
      SET_VECTOR_ELT(ret, i, conv);

   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
Esempio n. 4
/*
 * Repeats PASSES times: starting at the beginning of INPUT, decodes one code
 * point per iteration with U8_NEXT_UNSAFE and tests the decoded value
 * against 0. Returns 0.
 *
 * NOTE(review): the body of the `u == 0` check and the closing braces are not
 * visible in this excerpt; presumably the inner loop ends at the terminating
 * NUL -- confirm against the complete file.
 */
int main() {
	for (int i = 0; i < PASSES; ++i) {
		const char *p = INPUT;
		UChar32 u;

		/* NOTE(review): this declaration hides the pass counter `i` of the
		   enclosing for loop; from here on `i` is the byte offset into p.
		   Consider a distinct name for one of the two. */
		int i = 0;
		while (1) {
			U8_NEXT_UNSAFE(p, i, u);	/* decode at p[i] and advance i */

			if (u == 0) {

	return 0;
Esempio n. 5
/*
 * Checks how three macro families treat UTF-8-style encodings of surrogate
 * code points. b[] holds six encoded code points (two of them surrogates,
 * U+D801 and U+DFFE) and cp[] holds the matching expected values.
 *
 * Forward (U8_NEXT_UNSAFE, U8_NEXT, L8_NEXT) and backward (U8_PREV_UNSAFE,
 * U8_PREV, L8_PREV) the test expects, as stated by the checks below:
 *   - the _UNSAFE result equals cp[k];
 *   - U8_NEXT/U8_PREV yield a negative value for surrogates and otherwise the
 *     same value as the _UNSAFE macro;
 *   - L8_NEXT/L8_PREV yield the same value as the _UNSAFE macro;
 *   - all three macros move the index by the same amount.
 * Failures are reported through log_err().
 *
 * NOTE(review): in this excerpt there are no visible assignments to length,
 * j, iu, is or il, and the closing braces are missing; presumably those lines
 * were lost in extraction -- confirm against the complete file.
 */
static void
TestSurrogates() {
    static const uint8_t b[]={
        0xc3, 0x9f,             /*  00DF */
        0xed, 0x9f, 0xbf,       /*  D7FF */
        0xed, 0xa0, 0x81,       /*  D801 */
        0xed, 0xbf, 0xbe,       /*  DFFE */
        0xee, 0x80, 0x80,       /*  E000 */
        0xf0, 0x97, 0xbf, 0xbe  /* 17FFE */
    static const UChar32 cp[]={
        0xdf, 0xd7ff, 0xd801, 0xdffe, 0xe000, 0x17ffe

    /* cu/cs/cl: values produced by the _UNSAFE, U8_ and L8_ macros;
       iu/is/il: the index after each of those macros ran. */
    UChar32 cu, cs, cl;
    int32_t i, j, k, iu, is, il, length;

    /* Forward direction: i is the byte index of the current sequence. */
    k=0; /* index into cp[] */
    for(i=0; i<length;) {
        U8_NEXT_UNSAFE(b, j, cu);

        U8_NEXT(b, j, length, cs);

        L8_NEXT(b, j, length, cl);

        if(cu!=cp[k]) {
            log_err("U8_NEXT_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);

        /* U8_NEXT() returns <0 for surrogate code points */
        if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
            log_err("U8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);

        /* L8_NEXT() returns surrogate code points like U8_NEXT_UNSAFE() */
        if(cl!=cu) {
            log_err("L8_NEXT(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);

        if(is!=iu || il!=iu) {
            log_err("U8_NEXT(b[%ld]) or L8_NEXT(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);

        ++k;    /* next code point */
        i=iu;   /* advance by one UTF-8 sequence */

    /* Backward direction: continues from the i and k reached above and walks
       back until i reaches 0. */
    while(i>0) {
        --k; /* previous code point */

        U8_PREV_UNSAFE(b, j, cu);

        U8_PREV(b, 0, j, cs);

        L8_PREV(b, 0, j, cl);

        if(cu!=cp[k]) {
            log_err("U8_PREV_UNSAFE(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cu, (long)cp[k]);

        /* U8_PREV() returns <0 for surrogate code points */
        if(U_IS_SURROGATE(cu) ? cs>=0 : cs!=cu) {
            log_err("U8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cs, (long)cu);

        /* L8_PREV() returns surrogate code points like U8_PREV_UNSAFE() */
        if(cl!=cu) {
            log_err("L8_PREV(b[%ld])=U+%04lX != U+%04lX\n", (long)i, (long)cl, (long)cu);

        if(is!=iu || il !=iu) {
            log_err("U8_PREV(b[%ld]) or L8_PREV(b[%ld]) did not advance the index correctly\n", (long)i, (long)i);

        i=iu;   /* go back by one UTF-8 sequence */
Esempio n. 6
/*
 * Exercises the forward and backward UTF-8 iteration macros on input[], a
 * byte string for which the tables below expect both ordinary code points and
 * error values. result[] and movedOffset[] each hold six columns per row
 * (see the column headers inside the tables); the checks below read them at
 * i .. i+5 and report mismatches of the decoded value or of the resulting
 * offset through log_err(). A final section checks that U8_NEXT and U8_PREV
 * read Unicode non-characters.
 *
 * NOTE(review): in this excerpt there are no visible updates of i or
 * setOffset inside the loops, and the closing braces are missing; presumably
 * those lines were lost in extraction -- confirm against the complete file.
 * NOTE(review): several log_err() calls pair "%d"/"%ld"/"%lx" with a uint32_t
 * (offset) and UChar32 values; if log_err() is printf-like these arguments
 * need explicit casts -- confirm.
 */
static void TestNextPrevChar(){
    static const uint8_t input[]={0x61, 0xf0, 0x90, 0x90, 0x81, 0xc0, 0x80, 0xfd, 0xbe, 0xc2, 0x61, 0x81, 0x90, 0x90, 0xf0, 0x00};
    /* Expected decoded values, six per row. */
    static const UChar32 result[]={
    /*next_unsafe    next_safe_ns        next_safe_s          prev_unsafe   prev_safe_ns         prev_safe_s*/
        0x0061,        0x0061,             0x0061,              0x0000,       0x0000,             0x0000,
        0x10401,       0x10401,            0x10401,             0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841410,    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xa1050,      UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x2841,       UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x00,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x61,         0x61,               0x61,
        0x80,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xc2,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0xfd,          UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,  0x77e,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
        0xbe,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xfd,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0xa1,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x00,         UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
        0x61,          0x61,               0x61,                0xc0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x81,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x10401,      0x10401,            0x10401,
        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF_ERROR_VALUE,    UTF_ERROR_VALUE,
        0x90,          UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0x410,        UTF8_ERROR_VALUE_2, UTF8_ERROR_VALUE_2,
        0x0840,        UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,  0xf0,         UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_1,
        0x0000,        0x0000,             0x0000,              0x0061,       0x0061,             0x0061
    /* Expected offsets after each macro ran, same row/column layout as result[]. */
    static const int32_t movedOffset[]={
   /*next_unsafe    next_safe_ns  next_safe_s       prev_unsafe   prev_safe_ns     prev_safe_s*/
        1,            1,           1,                15,           15,               15,
        5,            5,           5,                14,           14 ,              14, 
        3,            3,           3,                9,            13,               13, 
        4,            4,           4,                9,            12,               12,
        5,            5,           5,                9,            11,               11, 
        7,            7,           7,                10,           10,               10,  
        7,            7,           7,                9,            9,                9,  
        8,            9,           9,                7,            7,                7, 
        9,            9,           9,                7,            7,                7,  
        11,           10,          10,               5,            5,                5,    
        11,           11,          11,               5,            5,                5,   
        12,           12,          12,               1,            1,                1, 
        13,           13,          13,               1,            1,                1,   
        14,           14,          14,               1,            1,                1,      
        14,           15,          15,               1,            1,                1,  
        14,           16,          16,               0,            0,                0, 
 

    UChar32 c=0x0000;
    uint32_t i=0;          /* index of the current row's first column in result[]/movedOffset[] */
    uint32_t offset=0;     /* input offset currently under test */
    int32_t setOffset=0;   /* working offset handed to (and modified by) each macro */
    /* Forward macros: columns 0..2 of the current row. */
    for(offset=0; offset<sizeof(input); offset++){
         if (offset < sizeof(input) - 2) { /* Can't have it go off the end of the array based on input */
             UTF8_NEXT_CHAR_UNSAFE(input, setOffset, c);
             if(setOffset != movedOffset[i]){
                 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                     offset, movedOffset[i], setOffset);
             if(c != result[i]){
                 log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);

             U8_NEXT_UNSAFE(input, setOffset, c);
             if(setOffset != movedOffset[i]){
                 log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                     offset, movedOffset[i], setOffset);
             if(c != result[i]){
                 log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);

         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         if(c != result[i+1]){
             log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);

         /* For U8_NEXT only the sign is checked where the table expects an error value. */
         U8_NEXT(input, setOffset, sizeof(input), c);
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
             log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);

         UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
         /* NOTE(review): the condition compares against movedOffset[i+1] while
            the message prints movedOffset[i+2]; the strict column is i+2. The
            two columns hold equal values in the table above, so results do not
            change, but the index should presumably be i+2 -- confirm. */
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+2], setOffset);
         if(c != result[i+2]){
             log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);


    /* Backward macros: columns 3..5 of the current row. */
    for(offset=sizeof(input); offset > 0; --offset){
         UTF8_PREV_CHAR_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i+3]){
             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+3], setOffset);
         if(c != result[i+3]){
             log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);

         UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
         if(setOffset != movedOffset[i+4]){
             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         if(c != result[i+4]){
             log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);

         /* For U8_PREV only the sign is checked where the table expects an error value. */
         U8_PREV(input, 0, setOffset, c);
         if(setOffset != movedOffset[i+4]){
             log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
             log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);

         UTF8_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
         if(setOffset != movedOffset[i+5]){
             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+5], setOffset);
         if(c != result[i+5]){
             log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);


        /* test non-characters */
        static const uint8_t nonChars[]={
            0xef, 0xb7, 0x90,       /* U+fdd0 */
            0xef, 0xbf, 0xbf,       /* U+ffff */
            0xf0, 0x9f, 0xbf, 0xbe, /* U+1fffe */
            0xf0, 0xbf, 0xbf, 0xbf, /* U+3ffff */
            0xf4, 0x8f, 0xbf, 0xbe  /* U+10fffe */

        UChar32 ch;
        int32_t idx;

        /* Forward over nonChars[]: every decoded value must be a non-character. */
        for(idx=0; idx<(int32_t)sizeof(nonChars);) {
            U8_NEXT(nonChars, idx, sizeof(nonChars), ch);
            if(!U_IS_UNICODE_NONCHAR(ch)) {
                log_err("U8_NEXT(before %d) failed to read a non-character\n", idx);
        /* Backward over nonChars[], same expectation. */
        for(idx=(int32_t)sizeof(nonChars); idx>0;) {
            U8_PREV(nonChars, 0, idx, ch);
            if(!U_IS_UNICODE_NONCHAR(ch)) {
                log_err("U8_PREV(at %d) failed to read a non-character\n", idx);
Esempio n. 7
/*
** Compare two UTF-8 strings for equality where the first string is
** a "LIKE" expression. Return true (1) if they are the same and 
** false (0) if they are different.
*/
/*
** icuLikeCompare(zPattern, zString, uEsc): match zString against the LIKE
** pattern zPattern. Both are read as 0-terminated UTF-8 byte strings.
**
**   '%' (MATCH_ALL)  handled by trying the rest of the pattern, recursively,
**                    at successive character positions of zString
**   '_' (MATCH_ONE)  skips one character of zString; fails at end of string
**   uEsc             when not itself escaped, sets prevEscape so that the
**                    next pattern character is not treated as special
**   anything else    compared with the next character of zString after both
**                    are case-folded with u_foldCase(.., U_FOLD_CASE_DEFAULT)
**
** Returns 1 on a match and 0 otherwise; after the pattern is exhausted the
** result is 1 only if zString is exhausted too.
**
** NOTE(review): characters are read with the unchecked U8_NEXT_UNSAFE and
** U8_FWD_1_UNSAFE macros; if zPattern or zString can hold ill-formed UTF-8
** this presumably needs the checked variants -- confirm.
** NOTE(review): in this excerpt two comments lack their terminator, the
** closing braces and the `else` before "Case 4" are missing, and no
** statement advancing iPattern is visible inside the skip loop of "Case 1";
** presumably those lines were lost in extraction -- confirm against the
** complete file.
*/
static int icuLikeCompare(
  const uint8_t *zPattern,   /* LIKE pattern */
  const uint8_t *zString,    /* The UTF-8 string to compare against */
  const UChar32 uEsc         /* The escape character */
  static const int MATCH_ONE = (UChar32)'_';
  static const int MATCH_ALL = (UChar32)'%';

  int iPattern = 0;       /* Current byte index in zPattern */
  int iString = 0;        /* Current byte index in zString */

  int prevEscape = 0;     /* True if the previous character was uEsc */

  while( zPattern[iPattern]!=0 ){

    /* Read (and consume) the next character from the input pattern. */
    UChar32 uPattern;
    U8_NEXT_UNSAFE(zPattern, iPattern, uPattern);

    /* There are now 4 possibilities:
    **     1. uPattern is an unescaped match-all character "%",
    **     2. uPattern is an unescaped match-one character "_",
    **     3. uPattern is an unescaped escape character, or
    **     4. uPattern is to be handled as an ordinary character
    if( !prevEscape && uPattern==MATCH_ALL ){
      /* Case 1. */
      uint8_t c;

      /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
      ** MATCH_ALL. For each MATCH_ONE, skip one character in the 
      ** test string.
      while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){
        if( c==MATCH_ONE ){
          if( zString[iString]==0 ) return 0;
          U8_FWD_1_UNSAFE(zString, iString);

      if( zPattern[iPattern]==0 ) return 1;

      while( zString[iString] ){
        if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){
          return 1;
        U8_FWD_1_UNSAFE(zString, iString);
      return 0;

    }else if( !prevEscape && uPattern==MATCH_ONE ){
      /* Case 2. */
      /* "_" needs one character to consume; none left means no match. */
      if( zString[iString]==0 ) return 0;
      U8_FWD_1_UNSAFE(zString, iString);

    }else if( !prevEscape && uPattern==uEsc){
      /* Case 3. */
      prevEscape = 1;

      /* Case 4. */
      /* Ordinary character: compare case-insensitively via case folding. */
      UChar32 uString;
      U8_NEXT_UNSAFE(zString, iString, uString);
      uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
      uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
      if( uString!=uPattern ){
        return 0;
      prevEscape = 0;

  /* Pattern exhausted: match only if the string is exhausted as well. */
  return zString[iString]==0;