/** Convert FORWARD UChar32-based index to UTF-8 based
 *
 * The position reached by the most recent non-ASCII lookup is remembered
 * in last_ind_fwd_str / last_ind_fwd_codepoint / last_ind_fwd_utf8
 * (keyed by the string's data pointer), so that a following lookup in
 * the same string may continue from there instead of rescanning
 * from byte 0.
 *
 * The cached code point index is always the index that was actually
 * reached, which may be smaller than wh if the end of the string was hit
 * first. This keeps the cached (code point, byte) pair consistent.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == first character in i-th string
 * @return UTF-8 (byte) index
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_fwd(R_len_t i, R_len_t wh)
{
    if (wh <= 0) return 0;

    // in an ASCII string code point index == byte index (clamped to the length)
    if (get(i).isASCII())
        return std::min(wh, get(i).length());

    R_len_t cur_n = get(i).length();
    const char* cur_s = get(i).c_str();

#ifndef NDEBUG
    if (!cur_s)
        throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_fwd: NULL cur_s");
#endif

    if (last_ind_fwd_str != cur_s) {
        // starting search in a different string
        last_ind_fwd_codepoint = 0;
        last_ind_fwd_utf8 = 0;
        last_ind_fwd_str = cur_s;
    }

    R_len_t j = 0;     // code point index reached so far
    R_len_t jres = 0;  // byte offset corresponding to j

    if (last_ind_fwd_codepoint > 0) {
        if (wh < last_ind_fwd_codepoint) {
            // check if it makes sense to go backwards from last position,
            // or it is better to start from scratch
            if ((last_ind_fwd_codepoint-wh) < (wh-0)) {
                // less code points will be considered when going backwards
                j    = last_ind_fwd_codepoint;
                jres = last_ind_fwd_utf8;
                while (j > wh && jres > 0) {
                    U8_BACK_1((const uint8_t*)cur_s, 0, jres);
                    --j;
                }

                // remember the position actually reached
                last_ind_fwd_codepoint = j;
                last_ind_fwd_utf8 = jres;
                return jres; // stop right now
            }
            // else: start from the beginning (j == 0, jres == 0)
        }
        else { //if (wh >= last_ind_fwd_codepoint)
            // continue last search
            j    = last_ind_fwd_codepoint;
            jres = last_ind_fwd_utf8;
        }
    }

    // go forward
    while (j < wh && jres < cur_n) {
        U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
        ++j;
    }

    // Store j, not wh: if the end of the string was reached before wh
    // code points were seen, caching wh would pair a code point index
    // that does not exist with byte offset cur_n, and a later lookup
    // stepping backwards from that pair would return a wrong offset.
    last_ind_fwd_codepoint = j;
    last_ind_fwd_utf8 = jres;

    return jres;
}
/** Convert BACKWARD UChar32-based index to UTF-8 based
 *
 * The position reached by the most recent non-ASCII lookup is remembered
 * in last_ind_back_str / last_ind_back_codepoint / last_ind_back_utf8
 * (keyed by the string's data pointer), so that a following lookup in
 * the same string may continue from there instead of rescanning
 * from the end.
 *
 * The cached code point index is always the index that was actually
 * reached, which may be smaller than wh if the beginning of the string
 * was hit first. This keeps the cached (code point, byte) pair consistent.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == byte after last character in the i-th string
 * @return UTF-8 (byte) index
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_back(R_len_t i, R_len_t wh)
{
    R_len_t cur_n = get(i).length();
    if (wh <= 0) return cur_n;

    // in an ASCII string one code point == one byte (clamped at 0)
    if (get(i).isASCII())
        return std::max(cur_n-wh, 0);

    const char* cur_s = get(i).c_str();

#ifndef NDEBUG
    if (!cur_s)
        throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_back: NULL cur_s");
#endif

    if (last_ind_back_str != cur_s) {
        // starting search in a different string
        last_ind_back_codepoint = 0;
        last_ind_back_utf8 = cur_n;
        last_ind_back_str = cur_s;
    }

    R_len_t j = 0;         // number of code points counted from the end so far
    R_len_t jres = cur_n;  // byte offset corresponding to j

    if (last_ind_back_codepoint > 0) {
        if (wh < last_ind_back_codepoint) {
            // check if it makes sense to go towards the end of the string
            // or maybe it will be better to start from the end and move backwards
            if ((last_ind_back_codepoint-wh) < (wh-0)) {
                // less code points will be considered when going forward
                // from the cached position
                j    = last_ind_back_codepoint;
                jres = last_ind_back_utf8;
                while (j > wh && jres < cur_n) {
                    U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
                    --j;
                }

                // remember the position actually reached
                last_ind_back_codepoint = j;
                last_ind_back_utf8 = jres;
                return jres; // stop right now
            }
            // else: start from the end (j == 0, jres == cur_n)
        }
        else { //if (wh >= last_ind_back_codepoint)
            // continue last search
            j    = last_ind_back_codepoint;
            jres = last_ind_back_utf8;
        }
    }

    // go backward
    while (j < wh && jres > 0) {
        U8_BACK_1((const uint8_t*)cur_s, 0, jres);
        ++j;
    }

    // Store j, not wh: if the beginning of the string was reached before
    // wh code points were seen, caching wh would pair a code point index
    // that does not exist with byte offset 0, and a later lookup stepping
    // forward from that pair would return a wrong offset.
    last_ind_back_codepoint = j;
    last_ind_back_utf8 = jres;

    return jres;
}
/** find first match - KMP
 *
 * Scans the search string forward, beginning at startPos, and falls back
 * through the kmpNext table on a mismatch.
 *
 * If BYTESEARCH_CASE_INSENSITIVE is set in flags, the text is decoded
 * one code point at a time, mapped with u_toupper and compared against
 * patternStrCaseInsensitive; otherwise raw bytes are compared against
 * patternStr.
 *
 * On a match, searchPos and searchEnd are set to the start and the end
 * (exclusive) byte offsets of the match. On no match, both are set
 * to searchLen.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.1-?? (Bartek Tartanus, 2013-08-15)
 *          KMP - first approach
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *          KMP upgraded; separate method
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *          use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosFwd_KMP(R_len_t startPos)
{
    int cursor = startPos;
    patternPos = 0;

    if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
        // byte-wise matching
        while (cursor < searchLen) {
            while (patternPos >= 0 && patternStr[patternPos] != searchStr[cursor])
                patternPos = kmpNext[patternPos];
            ++patternPos;
            ++cursor;

            if (patternPos != patternLen)
                continue; // pattern not complete yet

            searchEnd = cursor;
            searchPos = cursor-patternLen;
            return searchPos;
        }
    }
    else {
        // code point-wise matching on upper-cased code points
        UChar32 cp = 0;
        while (cursor < searchLen) {
            U8_NEXT(searchStr, cursor, searchLen, cp);
            cp = u_toupper(cp);

            while (patternPos >= 0 && patternStrCaseInsensitive[patternPos] != cp)
                patternPos = kmpNext[patternPos];
            ++patternPos;

            if (patternPos != patternLenCaseInsensitive)
                continue; // pattern not complete yet

            searchEnd = cursor;

            // the match start lies patternLenCaseInsensitive
            // code points before the current position
            searchPos = cursor;
            for (R_len_t left = patternLenCaseInsensitive; left > 0; --left) {
                U8_BACK_1((const uint8_t*)searchStr, 0, searchPos);
            }
            return searchPos;
        }
    }

    // not found
    searchEnd = searchLen;
    searchPos = searchEnd;
    return USEARCH_DONE;
}
static void TestFwdBack(){ static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00}; static const uint16_t fwd_unsafe[] ={1, 5, 6, 7, 9, 10, 11, 13, 14, 15, 16, 20, }; static const uint16_t fwd_safe[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0}; static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0}; static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5}; static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15}; static const uint16_t fwd_N_safe[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */ static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0}; static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0}; uint32_t offunsafe=0, offsafe=0; uint32_t i=0; while(offunsafe < sizeof(input)){ UTF8_FWD_1_UNSAFE(input, offunsafe); if(offunsafe != fwd_unsafe[i]){ log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe); } i++; } i=0; while(offunsafe < sizeof(input)){ U8_FWD_1_UNSAFE(input, offunsafe); if(offunsafe != fwd_unsafe[i]){ log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe); } i++; } i=0; while(offsafe < sizeof(input)){ UTF8_FWD_1_SAFE(input, offsafe, sizeof(input)); if(offsafe != fwd_safe[i]){ log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe); } i++; } i=0; while(offsafe < sizeof(input)){ U8_FWD_1(input, offsafe, sizeof(input)); if(offsafe != fwd_safe[i]){ log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe); } i++; } offunsafe=sizeof(input); i=0; while(offunsafe > 0){ UTF8_BACK_1_UNSAFE(input, offunsafe); if(offunsafe != back_unsafe[i]){ log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe); } i++; } offunsafe=sizeof(input); i=0; while(offunsafe > 0){ 
U8_BACK_1_UNSAFE(input, offunsafe); if(offunsafe != back_unsafe[i]){ log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe); } i++; } i=0; offsafe=sizeof(input); while(offsafe > 0){ UTF8_BACK_1_SAFE(input, 0, offsafe); if(offsafe != back_safe[i]){ log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_unsafe[i], offsafe); } i++; } i=0; offsafe=sizeof(input); while(offsafe > 0){ U8_BACK_1(input, 0, offsafe); if(offsafe != back_safe[i]){ log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_unsafe[i], offsafe); } i++; } offunsafe=0; for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]); if(offunsafe != fwd_N_unsafe[i]){ log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe); } } offunsafe=0; for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]); if(offunsafe != fwd_N_unsafe[i]){ log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe); } } offsafe=0; for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){ UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]); if(offsafe != fwd_N_safe[i]){ log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe); } } offsafe=0; for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){ U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]); if(offsafe != fwd_N_safe[i]){ log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe); } } offunsafe=sizeof(input); for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]); if(offunsafe != back_N_unsafe[i]){ log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe); } } offunsafe=sizeof(input); for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]); if(offunsafe != back_N_unsafe[i]){ 
log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe); } } offsafe=sizeof(input); for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){ UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]); if(offsafe != back_N_safe[i]){ log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe); } } offsafe=sizeof(input); for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){ U8_BACK_N(input, 0, offsafe, Nvalue[i]); if(offsafe != back_N_safe[i]){ log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe); } } }