/** internal function - replace multiple substrings in a single string
 *
 * The chunks to replace are described by `from` together with either `to`
 * or `length` (as prepared by stri__sub_prepare_from_to_length) and are
 * processed left to right; the i-th chunk is replaced with the
 * (i mod LENGTH(value))-th element of `value`.
 *
 * can raise Rf_error
 *
 * @param curs a CHARSXP in UTF-8, the string to operate on
 * @param from start positions (1-based; negative values count from the end)
 * @param to end positions, used when `length` is not in effect
 * @param length chunk lengths, used instead of `to`
 * @param omit_na_1 if false, a missing value in from/to/length/value makes
 *        the whole result NA_STRING; if true, such items are skipped
 * @param value replacement strings (re-encoded to UTF-8 here)
 * @return a CHARSXP: `curs` itself if there is nothing to replace,
 *         NA_STRING, or a newly created UTF-8 string
 *
 * @version 1.3.2 (Marek Gagolewski, 2019-02-23)
 *
 * @version 1.4.3 (Marek Gagolewski, 2019-03-12)
 *    #346: na_omit for `value`
 */
SEXP stri__sub_replacement_all_single(SEXP curs,
        SEXP from, SEXP to, SEXP length, bool omit_na_1, SEXP value)
{
    // curs is a CHARSXP in UTF-8
    PROTECT(value = stri_enc_toutf8(value, Rf_ScalarLogical(FALSE), Rf_ScalarLogical(FALSE)));
    R_len_t value_len = LENGTH(value);

    R_len_t from_len   = 0; // set by stri__sub_prepare_from_to_length
    R_len_t to_len     = 0; // set by stri__sub_prepare_from_to_length
    R_len_t length_len = 0; // set by stri__sub_prepare_from_to_length
    int* from_tab      = 0; // set by stri__sub_prepare_from_to_length
    int* to_tab        = 0; // set by stri__sub_prepare_from_to_length
    int* length_tab    = 0; // set by stri__sub_prepare_from_to_length

    // how many objects to UNPROTECT on return: `value` + whatever the helper protected
    R_len_t sub_protected = 1 +
        stri__sub_prepare_from_to_length(from, to, length,
            from_len, to_len, length_len, from_tab, to_tab, length_tab);

    // does not care about value_len
    R_len_t vectorize_len = stri__recycling_rule(true, 2,
        from_len, (to_len > length_len) ? to_len : length_len);

    if (vectorize_len <= 0) {
        // "nothing" is being replaced -> return the input as-is
        UNPROTECT(sub_protected);
        return curs;
    }

    if (value_len <= 0) {
        // things are supposed to be replaced with "nothing"...
        UNPROTECT(sub_protected);
        Rf_warning(MSG__REPLACEMENT_ZERO);
        return NA_STRING;
    }

    if (vectorize_len % value_len != 0)
        Rf_warning(MSG__WARN_RECYCLING_RULE2);

    const char* curs_s = CHAR(curs); // already in UTF-8
    R_len_t curs_n = LENGTH(curs);

    // first check for NAs: unless they are to be omitted, a single NA
    // anywhere makes the whole result missing
    if (!omit_na_1) {
        for (R_len_t i = 0; i < vectorize_len; ++i) {
            R_len_t cur_from = from_tab[i % from_len];
            R_len_t cur_to   = (to_tab) ? to_tab[i % to_len] : length_tab[i % length_len];
            if (cur_from == NA_INTEGER || cur_to == NA_INTEGER) {
                UNPROTECT(sub_protected);
                return NA_STRING;
            }
        }
        for (R_len_t i = 0; i < vectorize_len; ++i) {
            if (STRING_ELT(value, i % value_len) == NA_STRING) {
                UNPROTECT(sub_protected);
                return NA_STRING;
            }
        }
    }

    // get the number of code points in curs (needed for negative indexes
    // and for clamping)
    R_len_t curs_m = -1;
    if (IS_ASCII(curs)) {
        curs_m = curs_n;
    }
    else { // is UTF-8
        curs_m = 0;    // code points count
        R_len_t j = 0; // byte pos
        while (j < curs_n) {
            U8_FWD_1_UNSAFE(curs_s, j);
            ++curs_m;
        }
    }

    STRI__ERROR_HANDLER_BEGIN(sub_protected)
    std::vector<char> buf; // convenience >> speed

    R_len_t last_pos = 0; // code point index up to which curs has been consumed
    R_len_t byte_pos = 0; // byte offset corresponding to last_pos
    for (R_len_t i = 0; i < vectorize_len; ++i) {
        R_len_t cur_from = from_tab[i % from_len];
        R_len_t cur_to   = (to_tab) ? to_tab[i % to_len] : length_tab[i % length_len];
        SEXP value_cur   = STRING_ELT(value, i % value_len);

        if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || value_cur == NA_STRING) {
            continue; // only reachable when omit_na_1 is true
        }

        if (cur_from < 0) cur_from = curs_m + cur_from + 1;
        if (cur_from <= 0) cur_from = 1;
        cur_from--; // 1-based -> 0-based index
        if (cur_from >= curs_m) cur_from = curs_m; // cur_from is in [0, curs_m]

        if (length_tab) {
            if (cur_to < 0) cur_to = 0;
            // cur_from+cur_to could overflow for huge lengths, hence
            // compare against what is left of the string instead
            if (cur_to > curs_m - cur_from) cur_to = curs_m;
            else cur_to = cur_from + cur_to;
        }
        else {
            if (cur_to < 0) cur_to = curs_m + cur_to + 1;
            if (cur_to < cur_from) cur_to = cur_from; // insertion
        }
        if (cur_to >= curs_m) cur_to = curs_m;

        // the chunk to replace is at code points [cur_from, cur_to)
        // Rprintf("orig [%d,%d) repl [%d,%d)\n", last_pos, cur_from, cur_from, cur_to);

        if (last_pos > cur_from)
            throw StriException(MSG__OVERLAPPING_OR_UNSORTED_INDEXES);

        // first, copy [last_pos, cur_from)
        R_len_t byte_pos_last = byte_pos;
        while (last_pos < cur_from) {
            U8_FWD_1_UNSAFE(curs_s, byte_pos);
            ++last_pos;
        }
        buf.insert(buf.end(), curs_s + byte_pos_last, curs_s + byte_pos);

        // then, copy the corresponding replacement string
        const char* value_s = CHAR(value_cur);
        R_len_t value_n = LENGTH(value_cur);
        buf.insert(buf.end(), value_s, value_s + value_n);

        // lastly, skip the replaced chunk, i.e., move last_pos to cur_to
        while (last_pos < cur_to) {
            U8_FWD_1_UNSAFE(curs_s, byte_pos);
            ++last_pos;
        }
    }

    // finally, copy [last_pos, curs_m)
    // Rprintf("orig [%d,%d)\n", last_pos, curs_m);
    buf.insert(buf.end(), curs_s + byte_pos, curs_s + curs_n);

    // an empty vector may report a null data() pointer; pass a valid one
    const char* buf_s = buf.empty() ? "" : buf.data();

    SEXP ret;
    STRI__PROTECT(ret = Rf_mkCharLenCE(buf_s, buf.size(), CE_UTF8));
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/* ** Compare two UTF-8 strings for equality where the first string is ** a "LIKE" expression. Return true (1) if they are the same and ** false (0) if they are different. */ static int icuLikeCompare( const uint8_t *zPattern, /* LIKE pattern */ const uint8_t *zString, /* The UTF-8 string to compare against */ const UChar32 uEsc /* The escape character */ ){ static const int MATCH_ONE = (UChar32)'_'; static const int MATCH_ALL = (UChar32)'%'; int iPattern = 0; /* Current byte index in zPattern */ int iString = 0; /* Current byte index in zString */ int prevEscape = 0; /* True if the previous character was uEsc */ while( zPattern[iPattern]!=0 ){ /* Read (and consume) the next character from the input pattern. */ UChar32 uPattern; U8_NEXT_UNSAFE(zPattern, iPattern, uPattern); assert(uPattern!=0); /* There are now 4 possibilities: ** ** 1. uPattern is an unescaped match-all character "%", ** 2. uPattern is an unescaped match-one character "_", ** 3. uPattern is an unescaped escape character, or ** 4. uPattern is to be handled as an ordinary character */ if( !prevEscape && uPattern==MATCH_ALL ){ /* Case 1. */ uint8_t c; /* Skip any MATCH_ALL or MATCH_ONE characters that follow a ** MATCH_ALL. For each MATCH_ONE, skip one character in the ** test string. */ while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){ if( c==MATCH_ONE ){ if( zString[iString]==0 ) return 0; U8_FWD_1_UNSAFE(zString, iString); } iPattern++; } if( zPattern[iPattern]==0 ) return 1; while( zString[iString] ){ if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){ return 1; } U8_FWD_1_UNSAFE(zString, iString); } return 0; }else if( !prevEscape && uPattern==MATCH_ONE ){ /* Case 2. */ if( zString[iString]==0 ) return 0; U8_FWD_1_UNSAFE(zString, iString); }else if( !prevEscape && uPattern==uEsc){ /* Case 3. */ prevEscape = 1; }else{ /* Case 4. 
*/ UChar32 uString; U8_NEXT_UNSAFE(zString, iString, uString); uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT); uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT); if( uString!=uPattern ){ return 0; } prevEscape = 0; } } return zString[iString]==0; }
/*
 * Checks the offsets produced by the forward/backward iteration macros
 * (by one code point and by N code points), in both their old UTF8_* and
 * their U8_* spellings, against tables of expected offsets. Every mismatch
 * is reported via log_err().
 *
 * The input contains, among others, a 0xff byte, stray trail bytes and a
 * lead byte right before the terminating 0x00.
 */
static void TestFwdBack(){
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};

    /* expected offsets after each single step */
    static const uint16_t fwd_unsafe[] ={1, 5, 6, 7, 9, 10, 11, 13, 14, 15, 16, 20, };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    /* step widths and the expected offsets after each N-step */
    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0};
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};

    uint32_t offunsafe=0, offsafe=0;

    uint32_t i=0;
    while(offunsafe < sizeof(input)){
        UTF8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    /* start over from the beginning, otherwise this loop is never entered */
    offunsafe=0;
    i=0;
    while(offunsafe < sizeof(input)){
        U8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    /* start over from the beginning, otherwise this loop is never entered */
    offsafe=0;
    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        UTF8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        U8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            /* report the value that was actually compared against */
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0, offsafe);
        if(offsafe != back_safe[i]){
            /* report the value that was actually compared against */
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
        }
        i++;
    }

    /* the unsafe N-step loops skip the last two entries of Nvalue */
    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    /* offsafe is a uint32_t, hence %d (as everywhere else here), not %ld */
    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%d\n", i, back_N_safe[i], offsafe);
        }
    }
}
/*
 * Walks a fixed byte string with the _UNSAFE forward/backward macros (old
 * UTF8_* and U8_* spellings, single-step and N-step variants) and checks
 * that every resulting offset equals the expected code point boundary.
 * Mismatches are reported via log_err().
 */
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t bytes[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    /* offsets at which a code point starts, plus the end of the array */
    static const int8_t edges[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t pos;
    int32_t k;

    /* single steps forward, starting at offset 0 */
    k=1;
    pos=0;
    while(pos<LENGTHOF(bytes)) {
        UTF8_FWD_1_UNSAFE(bytes, pos);
        if(pos != edges[k]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", edges[k], pos);
        }
        ++k;
    }

    k=1;
    pos=0;
    while(pos<LENGTHOF(bytes)) {
        U8_FWD_1_UNSAFE(bytes, pos);
        if(pos != edges[k]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", edges[k], pos);
        }
        ++k;
    }

    /* single steps backward, starting at the end of the array */
    k=LENGTHOF(edges)-2;
    pos=LENGTHOF(bytes);
    while(pos>0) {
        UTF8_BACK_1_UNSAFE(bytes, pos);
        if(pos != edges[k]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", edges[k], pos);
        }
        --k;
    }

    k=LENGTHOF(edges)-2;
    pos=LENGTHOF(bytes);
    while(pos>0) {
        U8_BACK_1_UNSAFE(bytes, pos);
        if(pos != edges[k]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", edges[k], pos);
        }
        --k;
    }

    /* k steps forward from offset 0 must end at the k-th boundary */
    k=0;
    while(k<LENGTHOF(edges)) {
        int32_t want=edges[k];
        pos=0;
        UTF8_FWD_N_UNSAFE(bytes, pos, k);
        if(pos != want) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
        ++k;
    }

    k=0;
    while(k<LENGTHOF(edges)) {
        int32_t want=edges[k];
        pos=0;
        U8_FWD_N_UNSAFE(bytes, pos, k);
        if(pos != want) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", want, pos);
        }
        ++k;
    }

    /* k steps backward from the end must end at the k-th boundary from the end */
    k=0;
    while(k<LENGTHOF(edges)) {
        int32_t mirror=LENGTHOF(edges)-1-k;
        pos=LENGTHOF(bytes);
        UTF8_BACK_N_UNSAFE(bytes, pos, k);
        if(pos != edges[mirror]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", edges[mirror], pos);
        }
        ++k;
    }

    k=0;
    while(k<LENGTHOF(edges)) {
        int32_t mirror=LENGTHOF(edges)-1-k;
        pos=LENGTHOF(bytes);
        U8_BACK_N_UNSAFE(bytes, pos, k);
        if(pos != edges[mirror]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", edges[mirror], pos);
        }
        ++k;
    }
}