/** Convert FORWARD UChar32-based index to UTF-8 based
 *
 * Translates a code point offset, counted from the beginning of the i-th
 * string, into the corresponding byte offset. The most recent result is
 * remembered in last_ind_fwd_str / last_ind_fwd_codepoint /
 * last_ind_fwd_utf8, so that a subsequent lookup in the same string
 * (identified by its data pointer) may resume from that position instead
 * of rescanning from the start.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == first character in i-th string
 * @return UTF-8 (byte) index; 0 if wh <= 0, and the byte length of the
 * string if wh is not smaller than the number of code points
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_fwd(R_len_t i, R_len_t wh)
{
   if (wh <= 0) return 0;

   // 1 byte == 1 code point: no scanning (and no caching) needed
   if (get(i).isASCII()) return std::min(wh, get(i).length());

   R_len_t cur_n = get(i).length();
   const char* cur_s = get(i).c_str();

#ifndef NDEBUG
   if (!cur_s)
      throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_fwd: NULL cur_s");
#endif


   if (last_ind_fwd_str != cur_s) {
      // starting search in a different string
      last_ind_fwd_codepoint = 0;
      last_ind_fwd_utf8 = 0;
      last_ind_fwd_str = cur_s;
   }

   R_len_t j = 0;    // code points consumed so far
   R_len_t jres = 0; // byte offset corresponding to j

   if (last_ind_fwd_codepoint > 0) {
      if (wh < last_ind_fwd_codepoint) {
         // check if it makes sense to go backwards from last position,
         // or it is better to start from scratch
         if ((last_ind_fwd_codepoint-wh) < (wh-0)) {
            // less code points will be considered when going backwards
            j    = last_ind_fwd_codepoint;
            jres = last_ind_fwd_utf8;
            while (j > wh && jres > 0) {
               U8_BACK_1((const uint8_t*)cur_s, 0, jres);
               --j;
            }

            last_ind_fwd_codepoint = wh;
            last_ind_fwd_utf8 = jres;
            return jres; // stop right now
         }
         // else: restart from the beginning of the string (j == jres == 0)
      }
      else { //if (wh >= last_ind_fwd_codepoint)  // continue last search
         j    = last_ind_fwd_codepoint;
         jres = last_ind_fwd_utf8;
      }
   }

   // go forward
   while (j < wh && jres < cur_n) {
      U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
      ++j;
   }

   // Remember the number of code points that were ACTUALLY traversed.
   // If the end of the string was hit before reaching wh, then j < wh;
   // recording wh here would pair the end-of-string byte offset with
   // a code point index that does not exist, and a later lookup that
   // steps backwards from the cached position would yield a wrong offset.
   last_ind_fwd_codepoint = j;
   last_ind_fwd_utf8 = jres;
   return jres;
}
/** Convert BACKWARD UChar32-based index to UTF-8 based
 *
 * Translates a code point offset, counted from the end of the i-th
 * string, into the corresponding byte offset. The most recent result is
 * remembered in last_ind_back_str / last_ind_back_codepoint /
 * last_ind_back_utf8, so that a subsequent lookup in the same string
 * (identified by its data pointer) may resume from that position instead
 * of rescanning from the end.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == byte after last character in the i-th string
 * @return UTF-8 (byte) index; the byte length of the string if wh <= 0,
 * and 0 if wh is not smaller than the number of code points
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_back(R_len_t i, R_len_t wh)
{
   R_len_t cur_n = get(i).length();
   if (wh <= 0) return cur_n;

   // 1 byte == 1 code point: no scanning (and no caching) needed
   if (get(i).isASCII()) return std::max(cur_n-wh, 0);
   const char* cur_s = get(i).c_str();

#ifndef NDEBUG
   if (!cur_s)
      throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_back: NULL cur_s");
#endif

   if (last_ind_back_str != cur_s) {
      // starting search in a different string
      last_ind_back_codepoint = 0;
      last_ind_back_utf8 = cur_n;
      last_ind_back_str = cur_s;
   }


   R_len_t j = 0;        // code points consumed so far (from the end)
   R_len_t jres = cur_n; // byte offset corresponding to j

   if (last_ind_back_codepoint > 0) {
      if (wh < last_ind_back_codepoint) {
         // check if it makes sense to go towards the end of the string
         // or maybe it will be better to start from the end and move backwards
         if ((last_ind_back_codepoint-wh) < (wh-0)) {
            // less code points will be considered when going towards the end
            j    = last_ind_back_codepoint;
            jres = last_ind_back_utf8;
            while (j > wh && jres < cur_n) {
               U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
               --j;
            }

            last_ind_back_codepoint = wh;
            last_ind_back_utf8 = jres;
            return jres; // stop right now
         }
         // else: restart from the end of the string (j == 0, jres == cur_n)
      }
      else { //if (wh >= last_ind_back_codepoint)  // continue last search
         j    = last_ind_back_codepoint;
         jres = last_ind_back_utf8;
      }
   }

   // go backward
   while (j < wh && jres > 0) {
      U8_BACK_1((const uint8_t*)cur_s, 0, jres);
      ++j;
   }

   // Remember the number of code points that were ACTUALLY traversed.
   // If the beginning of the string was hit before reaching wh, then
   // j < wh; recording wh here would pair byte offset 0 with a code point
   // index that does not exist, and a later lookup that steps towards the
   // end from the cached position would yield a wrong offset.
   last_ind_back_codepoint = j;
   last_ind_back_utf8 = jres;

   return jres;
}
/** find first match - KMP
 *
 * Scans the search string from startPos towards its end, following
 * the kmpNext table on every mismatch. When
 * BYTESEARCH_CASE_INSENSITIVE is set in flags, the text is decoded code
 * point by code point, each one is mapped with u_toupper and compared
 * against patternStrCaseInsensitive; otherwise raw bytes are compared
 * against patternStr.
 *
 * On success searchPos and searchEnd delimit the match; on failure both
 * are set to searchLen.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.1-?? (Bartek Tartanus, 2013-08-15)
 *          KMP - first approach
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *          KMP upgraded; separate method
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *    use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosFwd_KMP(R_len_t startPos)
{
   int pos = startPos;
   patternPos = 0;

   if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
      // exact matching: one byte of text per step
      for (; pos < searchLen; ) {
         // mismatch => fall back as dictated by the kmpNext table
         while (patternPos >= 0 && patternStr[patternPos] != searchStr[pos])
            patternPos = kmpNext[patternPos];
         ++patternPos;
         ++pos;

         if (patternPos != patternLen)
            continue; // pattern not completed yet

         searchEnd = pos;
         searchPos = pos-patternLen;
         return searchPos;
      }
   }
   else {
      // case-insensitive matching: one code point of text per step
      UChar32 cp = 0;
      while (pos < searchLen) {
         U8_NEXT(searchStr, pos, searchLen, cp);
         cp = u_toupper(cp);
         while (patternPos >= 0 && patternStrCaseInsensitive[patternPos] != cp)
            patternPos = kmpNext[patternPos];
         ++patternPos;

         if (patternPos != patternLenCaseInsensitive)
            continue; // pattern not completed yet

         searchEnd = pos;

         // the match spans patternLenCaseInsensitive code points, whose
         // byte widths may vary; step back one code point at a time
         searchPos = pos;
         for (R_len_t left = patternLenCaseInsensitive; left > 0; --left) {
            U8_BACK_1((const uint8_t*)searchStr, 0, searchPos);
         }
         return searchPos;
      }
   }

   // no match
   searchPos = searchEnd = searchLen;
   return USEARCH_DONE;
}
/* Example no. 4 */
/* 0 */
/**
 * Exercises the ICU UTF-8 "move by one" and "move by N" offset macros,
 * both the older UTF8_* names and their U8_* counterparts, in the
 * forward and backward directions.
 *
 * Each macro is stepped over a fixed byte array and the offset obtained
 * after every step is compared with a table of expected offsets; every
 * mismatch is reported through log_err. The same expected table is used
 * for a UTF8_* macro and its U8_* counterpart.
 */
static void TestFwdBack(){
    /* test bytes; the fwd/back tables below list the offset expected after each step */
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
    static const uint16_t fwd_unsafe[] ={1, 5, 6, 7,  9, 10, 11, 13, 14, 15, 16,  20, };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    /* step counts for the *_N macros and the offsets expected after each call */
    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0};
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};


    uint32_t offunsafe=0, offsafe=0;

    uint32_t i=0;
    while(offunsafe < sizeof(input)){
        UTF8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    /* rewind: without this the loop below never executes, because the
       previous loop has already moved the offset past the end of input */
    offunsafe=0;
    i=0;
    while(offunsafe < sizeof(input)){
        U8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    /* rewind: same reason as for the unsafe variant above */
    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        UTF8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        U8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            /* report the value actually compared against (back_safe);
               back_unsafe is shorter and must not be indexed with this i */
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    /* The unsafe *_N loops skip the last two Nvalue entries; presumably
       those steps would carry an unchecked macro outside of input
       (to be confirmed against the macro definitions). */
    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }

    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            /* offsafe is a uint32_t: use %d as everywhere else here, not %ld */
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }
}