/** Convert FORWARD UChar32-based index to UTF-8 based
 *
 * The outcome of the most recent query is remembered (string pointer,
 * code point index and the matching byte offset), so that a subsequent
 * query on the same string can resume from that position -- moving
 * either forward or backward -- instead of rescanning from the start.
 *
 * If the string consists of fewer than \code{wh} code points,
 * the byte length of the string is returned.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == first character in i-th string
 * @return UTF-8 (byte) index
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_fwd(R_len_t i, R_len_t wh)
{
   if (wh <= 0) return 0;

   // in an ASCII string, code point index == byte index (clamped to length)
   if (get(i).isASCII()) return std::min(wh, get(i).length());

   R_len_t cur_n = get(i).length();
   const char* cur_s = get(i).c_str();

#ifndef NDEBUG
   if (!cur_s)
      throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_fwd: NULL cur_s");
#endif


   if (last_ind_fwd_str != cur_s) {
      // starting search in a different string
      last_ind_fwd_codepoint = 0;
      last_ind_fwd_utf8 = 0;
      last_ind_fwd_str = cur_s;
   }

   R_len_t j = 0;    // code point index of the current position
   R_len_t jres = 0; // byte offset of the current position

   if (last_ind_fwd_codepoint > 0) {
      if (wh < last_ind_fwd_codepoint) {
         // check if it makes sense to go backwards from last position,
         // or it is better to start from scratch
         if ((last_ind_fwd_codepoint-wh) < (wh-0)) {
            // less code points will be considered when going backwards
            j    = last_ind_fwd_codepoint;
            jres = last_ind_fwd_utf8;
            while (j > wh && jres > 0) {
               U8_BACK_1((const uint8_t*)cur_s, 0, jres);
               --j;
            }

            // the cached pair is kept consistent below, hence j == wh here
            last_ind_fwd_codepoint = wh;
            last_ind_fwd_utf8 = jres;
            return jres; // stop right now
         }
         // else: start from the beginning of the string (j == jres == 0)
      }
      else { //if (wh >= last_ind_fwd_codepoint)  // continue last search
         j    = last_ind_fwd_codepoint;
         jres = last_ind_fwd_utf8;
      }
   }

   // go forward
   while (j < wh && jres < cur_n) {
      U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
      ++j;
   }

   // Remember the code point index that was actually reached (j), not the
   // requested one: if the string ended before wh code points were seen,
   // then j < wh. Caching wh here would mislabel the end-of-string offset,
   // and a later query for a smaller index would walk backwards from a
   // wrong starting point and yield an offset inside the string.
   last_ind_fwd_codepoint = j;
   last_ind_fwd_utf8 = jres;
   return jres;
}
/** Convert BACKWARD UChar32-based index to UTF-8 based
 *
 * The outcome of the most recent query is remembered (string pointer,
 * code point index counted from the end and the matching byte offset),
 * so that a subsequent query on the same string can resume from that
 * position instead of rescanning from the end of the string.
 *
 * If the string consists of fewer than \code{wh} code points,
 * 0 is returned.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == byte after last character in the i-th string
 * @return UTF-8 (byte) index
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_back(R_len_t i, R_len_t wh)
{
   R_len_t cur_n = get(i).length();
   if (wh <= 0) return cur_n;

   // in an ASCII string, each code point is one byte (clamped to 0)
   if (get(i).isASCII()) return std::max(cur_n-wh, 0);
   const char* cur_s = get(i).c_str();

#ifndef NDEBUG
   if (!cur_s)
      throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_back: NULL cur_s");
#endif

   if (last_ind_back_str != cur_s) {
      // starting search in a different string
      last_ind_back_codepoint = 0;
      last_ind_back_utf8 = cur_n;
      last_ind_back_str = cur_s;
   }


   R_len_t j = 0;        // code point index (from the end) of the current position
   R_len_t jres = cur_n; // byte offset of the current position

   if (last_ind_back_codepoint > 0) {
      if (wh < last_ind_back_codepoint) {
         // check if it makes sense to go towards the end of the string
         // or maybe it will be better to start from the end and move backwards
         if ((last_ind_back_codepoint-wh) < (wh-0)) {
            // less code points will be considered when going towards the end
            j    = last_ind_back_codepoint;
            jres = last_ind_back_utf8;
            while (j > wh && jres < cur_n) {
               U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
               --j;
            }

            // the cached pair is kept consistent below, hence j == wh here
            last_ind_back_codepoint = wh;
            last_ind_back_utf8 = jres;
            return jres; // stop right now
         }
         // else: start from the end of the string (j == 0, jres == cur_n)
      }
      else { //if (wh >= last_ind_back_codepoint)  // continue last search
         j    = last_ind_back_codepoint;
         jres = last_ind_back_utf8;
      }
   }

   // go backward
   while (j < wh && jres > 0) {
      U8_BACK_1((const uint8_t*)cur_s, 0, jres);
      ++j;
   }

   // Remember the code point index that was actually reached (j), not the
   // requested one: if the beginning of the string was hit before wh code
   // points were seen, then j < wh. Caching wh here would mislabel byte
   // offset 0, and a later query for a smaller index would move forward
   // from a wrong starting point and yield an incorrect offset.
   last_ind_back_codepoint = j;
   last_ind_back_utf8 = jres;

   return jres;
}
/** find last match - KMP
 *
 * The searched string is traversed from \code{startPos} towards its
 * beginning, while the pattern is consumed from its last element to its
 * first; \code{kmpNext} supplies the fallback position after a mismatch.
 *
 * On a match, \code{searchPos} and \code{searchEnd} are set to the byte
 * offsets of the start and of the end of the match, respectively.
 * Otherwise, both are set to \code{searchLen}.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *    use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosBack_KMP(R_len_t startPos)
{
   int cur = startPos;
   patternPos = 0;

   if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
      // exact matching: compare byte by byte
      while (cur > 0) {
         --cur;
         while (patternPos >= 0 && patternStr[patternLen-1-patternPos] != searchStr[cur])
            patternPos = kmpNext[patternPos];
         ++patternPos;

         if (patternPos != patternLen)
            continue; // pattern not fully matched yet

         searchPos = cur;
         searchEnd = cur+patternLen;
         return searchPos;
      }
   }
   else {
      // case-insensitive matching: compare upper-cased code points
      // (presumably patternStrCaseInsensitive holds the pattern already
      // case-mapped the same way -- to be confirmed where it is set up)
      while (cur > 0) {
         UChar32 ch;
         U8_PREV(searchStr, 0, cur, ch);
         ch = u_toupper(ch);

         while (patternPos >= 0 &&
               patternStrCaseInsensitive[patternLenCaseInsensitive-1-patternPos] != ch)
            patternPos = kmpNext[patternPos];
         ++patternPos;

         if (patternPos != patternLenCaseInsensitive)
            continue; // pattern not fully matched yet

         searchPos = cur;

         // the end of the match lies patternLenCaseInsensitive
         // code points ahead of its start
         searchEnd = cur;
         for (R_len_t left = patternLenCaseInsensitive; left > 0; --left) {
            U8_FWD_1((const uint8_t*)searchStr, searchEnd, searchLen);
         }

         return searchPos;
      }
   }

   // no match at all
   searchPos = searchEnd = searchLen;
   return USEARCH_DONE;
}
// Example #4
// 0
/*
 * Steps through a fixed byte sequence one code point (and then N code
 * points) at a time, forward and backward, with both the UTF8_* and the
 * U8_* macro families, and compares every resulting offset against the
 * expectation tables below. Any mismatch is reported via log_err().
 */
static void TestFwdBack(){ 
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
    /* expected offsets after each single step */
    static const uint16_t fwd_unsafe[] ={1, 5, 6, 7,  9, 10, 11, 13, 14, 15, 16,  20, };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    /* step widths for the *_N macros and the expected offsets after each step */
    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0};
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};   


    uint32_t offunsafe=0, offsafe=0;

    uint32_t i=0;
    while(offunsafe < sizeof(input)){
        UTF8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    /* restart from the beginning; otherwise the loop below is never entered */
    offunsafe=0;
    i=0;
    while(offunsafe < sizeof(input)){
        U8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    /* restart from the beginning; otherwise the loop below is never entered */
    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        UTF8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        U8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            /* report the value actually compared against (back_safe, not back_unsafe) */
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            /* report the value actually compared against (back_safe, not back_unsafe) */
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    /* the unsafe *_N variants below skip the last two Nvalue entries */
    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){  
        UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){  
        U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            /* offsafe is a uint32_t: use the same conversion as the sibling messages, not %ld */
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            /* offsafe is a uint32_t: use the same conversion as the sibling messages, not %ld */
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }
}
// Example #5
// 0
/* keep this in sync with utf16tst.c's TestNulTerminated() */
/*
 * Exercises the U8_* macros with -1 passed as the string length, i.e. on
 * input whose end is marked by the terminating NUL byte (see the ICU
 * utf8.h documentation for the negative-length convention).
 *
 * Part 1 walks the input with U8_NEXT and checks that U8_NEXT_OR_FFFD,
 * U8_FWD_1 and U8_FWD_N agree with it.
 * Part 2 walks the input again and checks, for every byte offset inside
 * each sequence consumed by U8_NEXT, that U8_GET, U8_GET_OR_FFFD and
 * U8_SET_CP_LIMIT agree with it.
 *
 * All mismatches are reported via log_err().
 */
static void TestNulTerminated() {
    /* test bytes; the leading comments give the byte offset of each row */
    static const uint8_t input[]={
        /*  0 */  0x61,
        /*  1 */  0xf0, 0x90, 0x90, 0x81,
        /*  5 */  0xc0, 0x80,
        /*  7 */  0xdf, 0x80,
        /*  9 */  0xc2,
        /* 10 */  0x62,
        /* 11 */  0xfd, 0xbe,
        /* 13 */  0xe0, 0xa0, 0x80,
        /* 16 */  0xe2, 0x82, 0xac,
        /* 19 */  0xf0, 0x90, 0x90,
        /* 22 */  0x00
        /* 23 */
    };
    /*
     * Code point expected from each successive U8_NEXT call;
     * U_SENTINEL marks the calls that are expected to report an error.
     */
    static const UChar32 result[]={
        0x61,
        0x10401,
        U_SENTINEL,
        0x7c0,
        U_SENTINEL,
        0x62,
        U_SENTINEL,
        0x800,
        0x20ac,
        U_SENTINEL,
        0
    };

    UChar32 c, c2, expected;
    int32_t i0, i=0, j, k, expectedIndex;
    int32_t cpIndex=0;
    /* Part 1: iterate with U8_NEXT until the NUL (c==0) has been read */
    do {
        i0=i; /* remember where this step started */
        U8_NEXT(input, i, -1, c);
        expected=result[cpIndex];
        if(c!=expected) {
            log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
        }
        /* same step with U8_NEXT_OR_FFFD: negative (error) results become U+FFFD */
        j=i0;
        U8_NEXT_OR_FFFD(input, j, -1, c);
        if(expected<0) { expected=0xfffd; }
        if(c!=expected) {
            log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
        }
        if(j!=i) {
            log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
        }
        /* same step with U8_FWD_1: must land on the same index as U8_NEXT */
        j=i0;
        U8_FWD_1(input, j, -1);
        if(j!=i) {
            log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
        }
        ++cpIndex;
        /*
         * Move by this many code points from the start.
         * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
         */
        expectedIndex= (c==0) ? i-1 : i;
        k=0;
        U8_FWD_N(input, k, -1, cpIndex);
        if(k!=expectedIndex) {
            log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
        }
    } while(c!=0);

    /* Part 2: for each sequence [i0, i) consumed by U8_NEXT, probe every offset j in it */
    i=0;
    do {
        j=i0=i;
        U8_NEXT(input, i, -1, c);
        do {
            /* random access at j must yield the same code point as U8_NEXT from i0 */
            U8_GET(input, 0, j, -1, c2);
            if(c2!=c) {
                log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
            }
            U8_GET_OR_FFFD(input, 0, j, -1, c2);
            /* negative (error) results are expected to be reported as U+FFFD */
            expected= (c>=0) ? c : 0xfffd;
            if(c2!=expected) {
                log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
            }
            /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
            k=j+1;
            U8_SET_CP_LIMIT(input, 0, k, -1);
            if(k!=i) {
                log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
            }
        } while(++j<i);
    } while(c!=0);
}
 /** Convert UTF8-byte indices to Unicode32 (code points)
 *
 * Both index arrays are overwritten in place: each converted entry
 * becomes the corresponding code point index plus the given adjustment.
 * For an ASCII string, only the adjustments are added.
 *
 * \code{i1} and \code{i2} must be sorted increasingly
 *
 * @param i element index
 * @param i1 indices, 1-based [in/out]
 * @param i2 indices, 1-based [in/out]
 * @param ni size of \code{i1} and \code{i2}
 * @param adj1 adjust for \code{i1}
 * @param adj2 adjust for \code{i2}
 *
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
void StriContainerUTF8_indexable::UTF8_to_UChar32_index(R_len_t i,
   int* i1, int* i2, const int ni, int adj1, int adj2)
{
   if (get(i).isASCII()) {
      // byte index == code point index, only the adjustments apply
      for (int k = 0; k < ni; ++k) {
         i1[k] += adj1;
         i2[k] += adj2;
      }
      return;
   }

   const char* str8 = get(i).c_str();
   const int len8 = get(i).length();

   int done1 = 0;   // number of i1 entries converted so far
   int done2 = 0;   // number of i2 entries converted so far
   int bytePos = 0; // current byte offset in the string
   int cpPos = 0;   // number of code points preceding bytePos

   // a single pass over the string, one code point per iteration
   for (; bytePos < len8 && !(done1 >= ni && done2 >= ni); ++cpPos) {

      if (done1 < ni && i1[done1] <= bytePos) {
#ifndef NDEBUG
         if (done1 < ni-1 && i1[done1] >= i1[done1+1])
            throw StriException("DEBUG: stri__UTF8_to_UChar32_index");
#endif
         i1[done1++] = cpPos + adj1;
      }

      if (done2 < ni && i2[done2] <= bytePos) {
#ifndef NDEBUG
         if (done2 < ni-1 && i2[done2] >= i2[done2+1])
            throw StriException("DEBUG: stri__UTF8_to_UChar32_index");
#endif
         i2[done2++] = cpPos + adj2;
      }

      // move on to the next code point
      U8_FWD_1(str8, bytePos, len8);
   }

   // an index may also refer to the position just past the last byte
   if (done1 < ni && i1[done1] <= len8) {
#ifndef NDEBUG
      if (done1 < ni-1 && i1[done1] >= i1[done1+1])
         throw StriException("DEBUG: stri__UTF8_to_UChar32_index");
#endif
      i1[done1++] = cpPos + adj1;
   }

   if (done2 < ni && i2[done2] <= len8) {
#ifndef NDEBUG
      if (done2 < ni-1 && i2[done2] >= i2[done2+1])
         throw StriException("DEBUG: stri__UTF8_to_UChar32_index");
#endif
      i2[done2++] = cpPos + adj2;
   }

   // sanity check: no index should be left unconverted at the end of the string
#ifndef NDEBUG
   if (bytePos >= len8 && (done1 < ni || done2 < ni))
      throw StriException("DEBUG: stri__UTF8_to_UChar32_index()");
#endif
}