/** Convert FORWARD UChar32-based index to UTF-8 based
 *
 * For an ASCII string the result is simply min(wh, length). Otherwise
 * the string is walked code point by code point. The position reached
 * by the previous call on the same string (identified by its c_str()
 * pointer) is cached in last_ind_fwd_str / last_ind_fwd_codepoint /
 * last_ind_fwd_utf8, so that a subsequent lookup can continue from
 * there (forwards or backwards) instead of restarting at byte 0.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == first character in i-th string
 * @return UTF-8 (byte) index; 0 if wh <= 0, and the string's byte
 * length if the string has fewer than wh code points
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_fwd(R_len_t i, R_len_t wh)
{
   if (wh <= 0) return 0;

   if (get(i).isASCII())
      return std::min(wh, get(i).length());

   R_len_t cur_n = get(i).length();
   const char* cur_s = get(i).c_str();

#ifndef NDEBUG
   if (!cur_s)
      throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_fwd: NULL cur_s");
#endif

   if (last_ind_fwd_str != cur_s) {
      // starting search in a different string
      last_ind_fwd_codepoint = 0;
      last_ind_fwd_utf8 = 0;
      last_ind_fwd_str = cur_s;
   }

   R_len_t j = 0;    // number of code points skipped so far
   R_len_t jres = 0; // byte offset corresponding to j

   if (last_ind_fwd_codepoint > 0) {
      if (wh < last_ind_fwd_codepoint) {
         // check if it makes sense to go backwards from last position,
         // or it is better to start from scratch
         if ((last_ind_fwd_codepoint-wh) < (wh-0)) {
            // less code points will be considered when going backwards
            j    = last_ind_fwd_codepoint;
            jres = last_ind_fwd_utf8;
            while (j > wh && jres > 0) {
               U8_BACK_1((const uint8_t*)cur_s, 0, jres);
               --j;
            }

            last_ind_fwd_codepoint = wh;
            last_ind_fwd_utf8 = jres;
            return jres; // stop right now
         }
         // else: fall through and start from the beginning (j == jres == 0)
      }
      else { //if (wh >= last_ind_fwd_codepoint)
         // continue last search
         j    = last_ind_fwd_codepoint;
         jres = last_ind_fwd_utf8;
      }
   }

   // go forward
   while (j < wh && jres < cur_n) {
      U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
      ++j;
   }

   // Cache the code point count that was actually reached. If the end of
   // the string stopped the loop, j < wh; caching wh together with
   // jres == cur_n would make a later backward walk count down from a
   // code point number that does not correspond to the cached byte offset.
   last_ind_fwd_codepoint = j;
   last_ind_fwd_utf8 = jres;

   return jres;
}
/** Convert BACKWARD UChar32-based index to UTF-8 based
 *
 * For an ASCII string the result is simply max(length-wh, 0). Otherwise
 * the string is walked code point by code point starting at its end.
 * The position reached by the previous call on the same string
 * (identified by its c_str() pointer) is cached in last_ind_back_str /
 * last_ind_back_codepoint / last_ind_back_utf8, so that a subsequent
 * lookup can continue from there instead of restarting at the end.
 *
 * @param i string index (in container)
 * @param wh UChar32 character's position to look for,
 * counting starts from 0 == byte after last character in the i-th string
 * @return UTF-8 (byte) index; the string's byte length if wh <= 0,
 * and 0 if the string has fewer than wh code points
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *          stri_sub
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          stri__UChar32_to_UTF8_index
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-01)
 *          moved to StriContainerUTF8
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-20)
 *          moved to StriContainerUTF8_indexable
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-14)
 *          use String8::isASCII
 */
R_len_t StriContainerUTF8_indexable::UChar32_to_UTF8_index_back(R_len_t i, R_len_t wh)
{
   R_len_t cur_n = get(i).length();
   if (wh <= 0) return cur_n;

   if (get(i).isASCII())
      return std::max(cur_n-wh, 0);

   const char* cur_s = get(i).c_str();

#ifndef NDEBUG
   if (!cur_s)
      throw StriException("StriContainerUTF8::UChar32_to_UTF8_index_back: NULL cur_s");
#endif

   if (last_ind_back_str != cur_s) {
      // starting search in a different string
      last_ind_back_codepoint = 0;
      last_ind_back_utf8 = cur_n;
      last_ind_back_str = cur_s;
   }

   R_len_t j = 0;        // number of code points skipped so far (from the end)
   R_len_t jres = cur_n; // byte offset corresponding to j

   if (last_ind_back_codepoint > 0) {
      if (wh < last_ind_back_codepoint) {
         // check if it makes sense to go towards the end of the string
         // or maybe it will be better to start from the end and move backwards
         if ((last_ind_back_codepoint-wh) < (wh-0)) {
            // less code points will be considered when going
            // from the cached position towards the end of the string
            j    = last_ind_back_codepoint;
            jres = last_ind_back_utf8;
            while (j > wh && jres < cur_n) {
               U8_FWD_1((const uint8_t*)cur_s, jres, cur_n);
               --j;
            }

            last_ind_back_codepoint = wh;
            last_ind_back_utf8 = jres;
            return jres; // stop right now
         }
         // else: fall through and start from the end (j == 0, jres == cur_n)
      }
      else { //if (wh >= last_ind_back_codepoint)
         // continue last search
         j    = last_ind_back_codepoint;
         jres = last_ind_back_utf8;
      }
   }

   // go backward
   while (j < wh && jres > 0) {
      U8_BACK_1((const uint8_t*)cur_s, 0, jres);
      ++j;
   }

   // Cache the code point count that was actually reached. If the start of
   // the string stopped the loop, j < wh; caching wh together with
   // jres == 0 would make a later walk towards the end count down from a
   // code point number that does not correspond to the cached byte offset.
   last_ind_back_codepoint = j;
   last_ind_back_utf8 = jres;

   return jres;
}
/** find last match - KMP
 *
 * Scans searchStr backwards starting just before byte offset startPos,
 * matching the pattern from its last element towards its first and
 * using kmpNext to fall back after a mismatch. On success searchPos and
 * searchEnd delimit the match; on failure both are set to searchLen.
 *
 * @param startPos where to start
 * @return USEARCH_DONE on no match, otherwise start index
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-11)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-07)
 *          use BYTESEARCH_CASE_INSENSITIVE
 */
R_len_t StriContainerByteSearch::findFromPosBack_KMP(R_len_t startPos)
{
   int pos = startPos;
   patternPos = 0;

   if (!(flags&BYTESEARCH_CASE_INSENSITIVE)) {
      // byte-wise comparison
      while (pos > 0) {
         --pos;

         while (patternPos >= 0
               && patternStr[patternLen-1-patternPos] != searchStr[pos])
            patternPos = kmpNext[patternPos];
         patternPos++;

         if (patternPos == patternLen) {
            searchEnd = pos+patternLen;
            searchPos = pos;
            return searchPos;
         }
      }
   }
   else {
      // code point-wise comparison on upper-cased code points
      while (pos > 0) {
         UChar32 cp;
         U8_PREV(searchStr, 0, pos, cp); // moves pos to the start of cp
         cp = u_toupper(cp);

         while (patternPos >= 0
               && patternStrCaseInsensitive[patternLenCaseInsensitive-1-patternPos] != cp)
            patternPos = kmpNext[patternPos];
         patternPos++;

         if (patternPos == patternLenCaseInsensitive) {
            searchPos = pos;

            // we need to go forward by patternLenCaseInsensitive code points
            searchEnd = pos;
            for (R_len_t remaining = patternLenCaseInsensitive; remaining > 0; remaining--) {
               U8_FWD_1((const uint8_t*)searchStr, searchEnd, searchLen);
            }

            return searchPos;
         }
      }
   }

   // else not found
   searchPos = searchEnd = searchLen;
   return USEARCH_DONE;
}
static void TestFwdBack(){ static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00}; static const uint16_t fwd_unsafe[] ={1, 5, 6, 7, 9, 10, 11, 13, 14, 15, 16, 20, }; static const uint16_t fwd_safe[] ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0}; static const uint16_t back_safe[] ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0}; static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5}; static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15}; static const uint16_t fwd_N_safe[] ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */ static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0}; static const uint16_t back_N_safe[] ={18, 17, 15, 12, 11, 9, 7, 0}; uint32_t offunsafe=0, offsafe=0; uint32_t i=0; while(offunsafe < sizeof(input)){ UTF8_FWD_1_UNSAFE(input, offunsafe); if(offunsafe != fwd_unsafe[i]){ log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe); } i++; } i=0; while(offunsafe < sizeof(input)){ U8_FWD_1_UNSAFE(input, offunsafe); if(offunsafe != fwd_unsafe[i]){ log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe); } i++; } i=0; while(offsafe < sizeof(input)){ UTF8_FWD_1_SAFE(input, offsafe, sizeof(input)); if(offsafe != fwd_safe[i]){ log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe); } i++; } i=0; while(offsafe < sizeof(input)){ U8_FWD_1(input, offsafe, sizeof(input)); if(offsafe != fwd_safe[i]){ log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe); } i++; } offunsafe=sizeof(input); i=0; while(offunsafe > 0){ UTF8_BACK_1_UNSAFE(input, offunsafe); if(offunsafe != back_unsafe[i]){ log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe); } i++; } offunsafe=sizeof(input); i=0; while(offunsafe > 0){ 
U8_BACK_1_UNSAFE(input, offunsafe); if(offunsafe != back_unsafe[i]){ log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe); } i++; } i=0; offsafe=sizeof(input); while(offsafe > 0){ UTF8_BACK_1_SAFE(input, 0, offsafe); if(offsafe != back_safe[i]){ log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_unsafe[i], offsafe); } i++; } i=0; offsafe=sizeof(input); while(offsafe > 0){ U8_BACK_1(input, 0, offsafe); if(offsafe != back_safe[i]){ log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_unsafe[i], offsafe); } i++; } offunsafe=0; for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]); if(offunsafe != fwd_N_unsafe[i]){ log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe); } } offunsafe=0; for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]); if(offunsafe != fwd_N_unsafe[i]){ log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe); } } offsafe=0; for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){ UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]); if(offsafe != fwd_N_safe[i]){ log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe); } } offsafe=0; for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){ U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]); if(offsafe != fwd_N_safe[i]){ log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe); } } offunsafe=sizeof(input); for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]); if(offunsafe != back_N_unsafe[i]){ log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe); } } offunsafe=sizeof(input); for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]); if(offunsafe != back_N_unsafe[i]){ 
log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe); } } offsafe=sizeof(input); for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){ UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]); if(offsafe != back_N_safe[i]){ log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe); } } offsafe=sizeof(input); for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){ U8_BACK_N(input, 0, offsafe, Nvalue[i]); if(offsafe != back_N_safe[i]){ log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe); } } }
/* keep this in sync with utf16tst.c's TestNulTerminated() */ static void TestNulTerminated() { static const uint8_t input[]={ /* 0 */ 0x61, /* 1 */ 0xf0, 0x90, 0x90, 0x81, /* 5 */ 0xc0, 0x80, /* 7 */ 0xdf, 0x80, /* 9 */ 0xc2, /* 10 */ 0x62, /* 11 */ 0xfd, 0xbe, /* 13 */ 0xe0, 0xa0, 0x80, /* 16 */ 0xe2, 0x82, 0xac, /* 19 */ 0xf0, 0x90, 0x90, /* 22 */ 0x00 /* 23 */ }; static const UChar32 result[]={ 0x61, 0x10401, U_SENTINEL, 0x7c0, U_SENTINEL, 0x62, U_SENTINEL, 0x800, 0x20ac, U_SENTINEL, 0 }; UChar32 c, c2, expected; int32_t i0, i=0, j, k, expectedIndex; int32_t cpIndex=0; do { i0=i; U8_NEXT(input, i, -1, c); expected=result[cpIndex]; if(c!=expected) { log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected); } j=i0; U8_NEXT_OR_FFFD(input, j, -1, c); if(expected<0) { expected=0xfffd; } if(c!=expected) { log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected); } if(j!=i) { log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i); } j=i0; U8_FWD_1(input, j, -1); if(j!=i) { log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i); } ++cpIndex; /* * Move by this many code points from the start. * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary. */ expectedIndex= (c==0) ? i-1 : i; k=0; U8_FWD_N(input, k, -1, cpIndex); if(k!=expectedIndex) { log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex); } } while(c!=0); i=0; do { j=i0=i; U8_NEXT(input, i, -1, c); do { U8_GET(input, 0, j, -1, c2); if(c2!=c) { log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j); } U8_GET_OR_FFFD(input, 0, j, -1, c2); expected= (c>=0) ? 
c : 0xfffd; if(c2!=expected) { log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j); } /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */ k=j+1; U8_SET_CP_LIMIT(input, 0, k, -1); if(k!=i) { log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k); } } while(++j<i); } while(c!=0); }
/** Convert UTF8-byte indices to Unicode32 (code points) * * \code{i1} and \code{i2} must be sorted increasingly * * @param i element index * @param i1 indices, 1-based [in/out] * @param i2 indices, 1-based [in/out] * @param ni size of \code{i1} and \code{i2} * @param adj1 adjust for \code{i1} * @param adj2 adjust for \code{i2} * * * @version 0.1-?? (Marek Gagolewski) * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * moved to StriContainerUTF8_indexable * * @version 0.5-1 (Marek Gagolewski, 2015-02-14) * use String8::isASCII */ void StriContainerUTF8_indexable::UTF8_to_UChar32_index(R_len_t i, int* i1, int* i2, const int ni, int adj1, int adj2) { if (get(i).isASCII()) { for (int i=0; i<ni; ++i) { i1[i] += adj1; i2[i] += adj2; } return; } const char* cstr = get(i).c_str(); const int nstr = get(i).length(); int j1 = 0; int j2 = 0; int i8 = 0; int i32 = 0; while (i8 < nstr && (j1 < ni || j2 < ni)) { if (j1 < ni && i1[j1] <= i8) { #ifndef NDEBUG if (j1 < ni-1 && i1[j1] >= i1[j1+1]) throw StriException("DEBUG: stri__UTF8_to_UChar32_index"); #endif i1[j1] = i32 + adj1; ++j1; } if (j2 < ni && i2[j2] <= i8) { #ifndef NDEBUG if (j2 < ni-1 && i2[j2] >= i2[j2+1]) throw StriException("DEBUG: stri__UTF8_to_UChar32_index"); #endif i2[j2] = i32 + adj2; ++j2; } // Next UChar32 U8_FWD_1(cstr, i8, nstr); ++i32; } // CONVERT LAST: if (j1 < ni && i1[j1] <= nstr) { #ifndef NDEBUG if (j1 < ni-1 && i1[j1] >= i1[j1+1]) throw StriException("DEBUG: stri__UTF8_to_UChar32_index"); #endif i1[j1] = i32 + adj1; ++j1; } if (j2 < ni && i2[j2] <= nstr) { #ifndef NDEBUG if (j2 < ni-1 && i2[j2] >= i2[j2+1]) throw StriException("DEBUG: stri__UTF8_to_UChar32_index"); #endif i2[j2] = i32 + adj2; ++j2; } // CHECK: #ifndef NDEBUG if (i8 >= nstr && (j1 < ni || j2 < ni)) throw StriException("DEBUG: stri__UTF8_to_UChar32_index()"); #endif }