/** * Count the number of recurrences of \code{pattern} in \code{str} [fast but dummy bitewise compare] * * @param str strings to search in * @param pattern patterns to search for * @return integer vector * * @version 0.1 (Bartek Tartanus) * @version 0.2 (Marek Gagolewski) - use StriContainerUTF8 * @version 0.3 (Marek Gagolewski) - corrected behavior on empty str/pattern * @version 0.4 (Marek Gagolewski, 2013-06-23) make StriException-friendly, * use StriContainerByteSearch */ SEXP stri__count_fixed_byte(SEXP str, SEXP pattern) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); STRI__ERROR_HANDLER_BEGIN R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_INTEGER, ret_tab[i] = 0) pattern_cont.setupMatcher(i, str_cont.get(i).c_str(), str_cont.get(i).length()); ret_tab[i] = 0; while (USEARCH_DONE != pattern_cont.findNext()) ++ret_tab[i]; } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) }
/** * Split a string into parts. * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. * * @param str character vector * @param pattern character vector * @param n_max integer vector * @param opts_regex * @return list of character vectors * * @version 0.1 (Marek Gagolewski, 2013-06-21) * @version 0.2 (Marek Gagolewski, 2013-07-10) - BUGFIX: wrong behavior on empty str */ SEXP stri_split_regex(SEXP str, SEXP pattern, SEXP n_max, SEXP omit_empty, SEXP opts_regex) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); n_max = stri_prepare_arg_integer(n_max, "n_max"); omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty"); R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n_max), LENGTH(omit_empty)); uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_max_cont.isNA(i) || omit_empty_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, stri__vector_empty_strings((omit_empty_cur || n_max_cur == 0)?0:1));)
/** * Extract all capture groups of the first/last occurence of a regex pattern in each string * * @param str character vector * @param pattern character vector * @param opts_regex list * @param firs logical - search for the first or the last occurence? * @return character matrix * * @version 0.1 (Marek Gagolewski, 2013-06-22) */ SEXP stri__match_firstlast_regex(SEXP str, SEXP pattern, SEXP opts_regex, bool first) { str = stri_prepare_arg_string(str, "str"); // prepare string argument pattern = stri_prepare_arg_string(pattern, "pattern"); // prepare string argument R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); vector< vector<charptr_x2> > occurences(vectorize_length); // we don't know how many capture groups are there R_len_t occurences_max = 1; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, /*do nothing*/;, int pattern_cur_groups = pattern_cont.getMatcher(i)->groupCount(); if (occurences_max < pattern_cur_groups+1) occurences_max=pattern_cur_groups+1; )
/** * Split a string into parts [byte compare] * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. * * @param str character vector * @param pattern character vector * @param n_max integer vector * @param omit_empty logical vector * * * @version 0.1 (Bartek Tartanus) * @version 0.2 (Marek Gagolewski, 2013-06-25) StriException friendly, use StriContainerUTF8 * @version 0.3 (Marek Gagolewski, 2013-07-10) - BUGFIX: wrong behavior on empty str */ SEXP stri__split_fixed_byte(SEXP str, SEXP pattern, SEXP n_max, SEXP omit_empty) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); n_max = stri_prepare_arg_integer(n_max, "n_max"); omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty"); STRI__ERROR_HANDLER_BEGIN R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n_max), LENGTH(omit_empty)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length); StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_max_cont.isNA(i) || omit_empty_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); , SET_VECTOR_ELT(ret, i, stri__vector_empty_strings((omit_empty_cur || n_max_cur == 0)?0:1));)
/** * Detect if a pattern occurs in a string * * @param str R character vector * @param pattern R character vector containing regular expressions * @param opts_regex list * * @version 0.1 (Marcin Bujarski) * @version 0.2 (Marek Gagolewski) - use StriContainerUTF16 * @version 0.3 (Marek Gagolewski) - use StriContainerUTF16's vectorization * @version 0.4 (Marek Gagolewski, 2013-06-18) use StriContainerRegexPattern + opts_regex */ SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP opts_regex) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); // this will work for vectorize_length == 0: uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); STRI__ERROR_HANDLER_BEGIN StriContainerUTF16 str_cont(str, vectorize_length); // MG: tried StriContainerUTF8 + utext_openUTF8 - this was slower StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, ret_tab[i] = FALSE) RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically matcher->reset(str_cont.get(i)); ret_tab[i] = (int)matcher->find(); } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Detect if a character class occurs in a string * * @param str character vector * @param pattern character vector * @param omit_na single logical value * @return logical vector * * @version 0.3-1 (Bartek Tartanus, 2014-07-25) * * @version 0.3-1 (Marek Gagolewski, 2014-10-17) * using std::vector<int> to avoid mem-leaks * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-04) * FR #122: omit_na arg added * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * FR #216: `negate` arg added */ SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); bool omit_na1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); // BT: this cannot be done with deque, because pattern is reused so i does not // go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on // MG: agreed std::vector<int> which(vectorize_length); int result_counter = 0; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { if (omit_na1) which[i] = FALSE; else { which[i] = NA_LOGICAL; result_counter++; } continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); UChar32 chr = 0; which[i] = FALSE; for (R_len_t j=0; j<str_cur_n; ) { U8_NEXT(str_cur_s, j, str_cur_n, chr); if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { which[i] = TRUE; break; } } if (negate_1) which[i] = !which[i]; if (which[i]) result_counter++; } SEXP ret; STRI__PROTECT(ret = stri__subset_by_logical(str_cont, which, result_counter)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Detect if a pattern occurs in a string * * @param str R character vector * @param pattern R character vector containing regular expressions * @param negate single bool * @param max_count single int * @param opts_regex list * * @version 0.1-?? (Marcin Bujarski) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16 * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16's vectorization * * @version 0.1-?? (Marek Gagolewski, 2013-06-18) * use StriContainerRegexPattern + opts_regex * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * Issue #214: allow a regex pattern like `.*` to match an empty string * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * FR #216: `negate` arg added * * @version 1.3.1 (Marek Gagolewski, 2019-02-08) * #232: `max_count` arg added */ SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate, SEXP max_count, SEXP opts_regex) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count"); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF16 str_cont(str, vectorize_length); // StriContainerUTF8 str_cont(str, vectorize_length); // utext_openUTF8, see below StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (max_count_1 == 0) { ret_tab[i] = NA_LOGICAL; continue; } STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL) RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically matcher->reset(str_cont.get(i)); ret_tab[i] = (int)matcher->find(); // returns UBool if (negate_1) ret_tab[i] = !ret_tab[i]; if (max_count_1 > 0 && ret_tab[i]) --max_count_1; // // mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s // UText* str_text = NULL; // UErrorCode status = U_ZERO_ERROR; // RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically // str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status); // STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // matcher->reset(str_text); // ret_tab[i] = (int)matcher->find(); // returns UBool // utext_close(str_text); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Extract first or last occurences of a character class in each string * * @param str character vector * @param pattern character vector * @return character vector * * @version 0.1 (Marek Gagolewski, 2013-06-08) * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri__extract_firstlast_charclass(SEXP str, SEXP pattern, bool first) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { SET_STRING_ELT(ret, i, NA_STRING); if (str_cont.isNA(i) || pattern_cont.isNA(i)) continue; CharClass pattern_cur = pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j, jlast; UChar32 chr; if (first) { for (jlast=j=0; j<str_cur_n; ) { U8_NEXT(str_cur_s, j, str_cur_n, chr); if (pattern_cur.test(chr)) { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+jlast, j-jlast, CE_UTF8)); break; // that's enough for first } jlast = j; } } else { for (jlast=j=str_cur_n; j>0; ) { U8_PREV(str_cur_s, 0, j, chr); // go backwards if (pattern_cur.test(chr)) { SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+j, jlast-j, CE_UTF8)); break; // that's enough for last } jlast = j; } } } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Compare elements in 2 character vectors, with collation * * @param e1 character vector * @param e2 character vector * @param opts_collator passed to stri__ucol_open() * @param type [internal] vector of length 2, * type[0]: 0 for ==, -1 for < and 1 for >, * type[1]: 0 or 1 (whether to negate the results) * * @return logical vector * * @version 0.2-1 (Marek Gagolewski, 2014-03-19) * * @version 0.2-3 (Marek Gagolewski, 2014-05-07) * opts_collator == NA no longer allowed * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_cmp_logical(SEXP e1, SEXP e2, SEXP opts_collator, SEXP type) { // we'll perform a collator-based cmp // type is an internal arg, check manually, error() allowed here if (!Rf_isInteger(type) || LENGTH(type) != 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG); int _type = INTEGER(type)[0]; int _negate = INTEGER(type)[1]; if (_type > 1 || _type < -1 || _negate < 0 || _negate > 1) Rf_error(MSG__INCORRECT_INTERNAL_ARG); PROTECT(e1 = stri_prepare_arg_string(e1, "e1")); // prepare string argument PROTECT(e2 = stri_prepare_arg_string(e2, "e2")); // prepare string argument // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* col = NULL; col = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(e1), LENGTH(e2)); StriContainerUTF8 e1_cont(e1, vectorize_length); StriContainerUTF8 e2_cont(e2, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = 0; i < vectorize_length; ++i) { if (e1_cont.isNA(i) || e2_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } R_len_t cur1_n = e1_cont.get(i).length(); const char* cur1_s = e1_cont.get(i).c_str(); R_len_t cur2_n = e2_cont.get(i).length(); const char* cur2_s = e2_cont.get(i).c_str(); // with collation UErrorCode status = U_ZERO_ERROR; ret_tab[i] = (_type == (int)ucol_strcollUTF8(col, cur1_s, cur1_n, cur2_s, cur2_n, &status )); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (_negate) ret_tab[i] = !ret_tab[i]; }
/** * Detect if a character class occurs in a string * * @param str character vector * @param pattern character vector * @param negate single bool * @param max_count single int * @return logical vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-02) * Use StrContainerUTF8 and CharClass classes * * @version 0.1-?? (Marek Gagolewski, 2013-06-15) * Use StrContainerCharClass * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 1.0-3 (Marek Gagolewski, 2016-02-03) * FR #216: `negate` arg added * * @version 1.3.1 (Marek Gagolewski, 2019-02-08) * #232: `max_count` arg added */ SEXP stri_detect_charclass(SEXP str, SEXP pattern, SEXP negate, SEXP max_count) { bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate"); int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count"); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (max_count_1 == 0 || str_cont.isNA(i) || pattern_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); UChar32 chr = 0; ret_tab[i] = FALSE; for (R_len_t j=0; j<str_cur_n; ) { U8_NEXT(str_cur_s, j, str_cur_n, chr); if (chr < 0) // invalid UTF-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { ret_tab[i] = TRUE; break; } } if (negate_1) ret_tab[i] = !ret_tab[i]; if (max_count_1 > 0 && ret_tab[i]) --max_count_1; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Compare elements in 2 character vectors, without collation * * @param e1 character vector * @param e2 character vector * @param type [internal] integer; 0 or 1 (whether to negate the results) * * @return logical vector * * @version 0.2-3 (Marek Gagolewski, 2014-05-07) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_cmp_codepoints(SEXP e1, SEXP e2, SEXP type) { // type is an internal arg, check manually, error() allowed here if (!Rf_isInteger(type) || LENGTH(type) != 1) Rf_error(MSG__INCORRECT_INTERNAL_ARG); int _negate = INTEGER(type)[0]; if (_negate < 0 || _negate > 1) Rf_error(MSG__INCORRECT_INTERNAL_ARG); PROTECT(e1 = stri_prepare_arg_string(e1, "e1")); // prepare string argument PROTECT(e2 = stri_prepare_arg_string(e2, "e2")); // prepare string argument STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(e1), LENGTH(e2)); StriContainerUTF8 e1_cont(e1, vectorize_length); StriContainerUTF8 e2_cont(e2, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = 0; i < vectorize_length; ++i) { if (e1_cont.isNA(i) || e2_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } R_len_t cur1_n = e1_cont.get(i).length(); const char* cur1_s = e1_cont.get(i).c_str(); R_len_t cur2_n = e2_cont.get(i).length(); const char* cur2_s = e2_cont.get(i).c_str(); if (cur1_n != cur2_n) // different number of bytes => not equal ret_tab[i] = FALSE; else ret_tab[i] = (memcmp(cur1_s, cur2_s, cur1_n) == 0); if (_negate) ret_tab[i] = !ret_tab[i]; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({/* no-op on err */}) }
/** * Count pattern occurcess in a string [with collation] * * @param str character vector * @param pattern character vector * @param collator_opts passed to stri__ucol_open(), * if \code{NA}, then \code{stri_detect_fixed_byte} is called * @return integer vector * * @version 0.1 (Marek Gagolewski) * @version 0.2 (Marek Gagolewski) - corrected behavior on empty str/pattern * @version 0.3 (Marek Gagolewski, 2013-06-23) make StriException-friendly, * use StriContainerUStringSearch */ SEXP stri_count_fixed(SEXP str, SEXP pattern, SEXP collator_opts) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = stri__ucol_open(collator_opts); if (!collator) return stri__count_fixed_byte(str, pattern); STRI__ERROR_HANDLER_BEGIN R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_INTEGER, ret_tab[i] = 0) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; ret_tab[i] = 0; while (((int)usearch_next(matcher, &status) != USEARCH_DONE) && !U_FAILURE(status)) ++ret_tab[i]; if (U_FAILURE(status)) throw StriException(status); } if (collator) { ucol_close(collator); collator=NULL; } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) }
/** * Count pattern occurcess in a string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator passed to stri__ucol_open() * @return integer vector * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * corrected behavior on empty str/pattern * * @version 0.1-?? (Marek Gagolewski, 2013-06-23) * make StriException-friendly, * use StriContainerUStringSearch * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_count_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_count_coll(SEXP str, SEXP pattern, SEXP opts_collator) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_INTEGER, ret_tab[i] = 0) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; R_len_t found = 0; while (!U_FAILURE(status) && ((int)usearch_next(matcher, &status) != USEARCH_DONE)) ++found; STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) ret_tab[i] = found; } if (collator) { ucol_close(collator); collator=NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) }
/** * Detect if a character class occurs in a string * * @param str character vector * @param pattern character vector * @return logical vector * * @version 0.1 (Bartek Tartanus) * @version 0.2 (Marek Gagolewski, 2013-06-02) Use StrContainerUTF8 and CharClass classes * @version 0.3 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass * @version 0.4 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_detect_charclass(SEXP str, SEXP pattern) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { ret_tab[i] = NA_LOGICAL; continue; } CharClass pattern_cur = pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); ret_tab[i] = FALSE; R_len_t j; UChar32 chr; for (j=0; j<str_cur_n; ) { U8_NEXT(str_cur_s, j, str_cur_n, chr); if (pattern_cur.test(chr)) { ret_tab[i] = TRUE; break; } } } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Replace multiple substrings * * * @param str character vector * @param from integer vector (possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @param omit_na logical scalar * @param value character vector replacement * @return character vector * * @version 1.3.2 (Marek Gagolewski, 2019-02-22) * #30: new function * * * @version 1.4.3 (Marek Gagolewski, 2019-03-12) * #346: na_omit for `value` */ SEXP stri_sub_replacement_all(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value) { //PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(str = stri_enc_toutf8(str, Rf_ScalarLogical(FALSE), Rf_ScalarLogical(FALSE))); PROTECT(from = stri_prepare_arg_list(from, "from")); PROTECT(to = stri_prepare_arg_list(to, "to")); PROTECT(length = stri_prepare_arg_list(length, "length")); PROTECT(value = stri_prepare_arg_list(value, "value")); bool omit_na_1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); R_len_t str_len = LENGTH(str); R_len_t from_len = LENGTH(from); R_len_t value_len = LENGTH(value); R_len_t vectorize_len; if (!isNull(to)) vectorize_len = stri__recycling_rule(true, 4, str_len, from_len, value_len, LENGTH(to)); else if (!isNull(length)) vectorize_len = stri__recycling_rule(true, 4, str_len, from_len, value_len, LENGTH(length)); else vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, value_len); if (vectorize_len <= 0) { UNPROTECT(5); return Rf_allocVector(STRSXP, 0); } // no STRI__ERROR_HANDLER_BEGIN block ---- below we can longjmp with Rf_error... SEXP ret, curs, tmp; PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); // 6 for (R_len_t i = 0; i<vectorize_len; ++i) { curs = STRING_ELT(str, i%str_len); if (curs == NA_STRING) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (!isNull(to)) { PROTECT(tmp = stri__sub_replacement_all_single(curs, VECTOR_ELT(from, i%from_len), VECTOR_ELT(to, i%LENGTH(to)), R_NilValue, omit_na_1, VECTOR_ELT(value, i%value_len))); } else if (!isNull(length)) { PROTECT(tmp = stri__sub_replacement_all_single(curs, VECTOR_ELT(from, i%from_len), R_NilValue, VECTOR_ELT(length, i%LENGTH(length)), omit_na_1, VECTOR_ELT(value, i%value_len))); } else { PROTECT(tmp = stri__sub_replacement_all_single(curs, VECTOR_ELT(from, i%from_len), R_NilValue, R_NilValue, omit_na_1, VECTOR_ELT(value, i%value_len))); } SET_STRING_ELT(ret, i, tmp); UNPROTECT(1); //tmp } UNPROTECT(6); return ret; }
/** * Extract multiple substrings * * * @param str character vector * @param from list * @param to list * @param length list * @return list of character vectors * * @version 1.3.2 (Marek Gagolewski, 2019-02-21) * #30: new function */ SEXP stri_sub_all(SEXP str, SEXP from, SEXP to, SEXP length) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(from = stri_prepare_arg_list(from, "from")); PROTECT(to = stri_prepare_arg_list(to, "to")); PROTECT(length = stri_prepare_arg_list(length, "length")); R_len_t str_len = LENGTH(str); R_len_t from_len = LENGTH(from); // R_len_t to_len = LENGTH(to); // R_len_t length_len = LENGTH(length); R_len_t vectorize_len; if (!isNull(to)) vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, LENGTH(to)); else if (!isNull(length)) vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, LENGTH(length)); else vectorize_len = stri__recycling_rule(true, 2, str_len, from_len); if (vectorize_len <= 0) { UNPROTECT(4); return Rf_allocVector(VECSXP, 0); } // no STRI__ERROR_HANDLER_BEGIN block ---- stri_sub can longjmp with Rf_error... SEXP ret, str_tmp, tmp; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_len)); //5 PROTECT(str_tmp = Rf_allocVector(STRSXP, 1)); //6 for (R_len_t i = 0; i<vectorize_len; ++i) { PROTECT(tmp = STRING_ELT(str, i%str_len)); SET_STRING_ELT(str_tmp, 0, tmp); UNPROTECT(1); //tmp if (!isNull(to)) { PROTECT(tmp = stri_sub(str_tmp, VECTOR_ELT(from, i%from_len), VECTOR_ELT(to, i%LENGTH(to)), R_NilValue)); } else if (!isNull(length)) { PROTECT(tmp = stri_sub(str_tmp, VECTOR_ELT(from, i%from_len), R_NilValue, VECTOR_ELT(length, i%LENGTH(length)))); } else { PROTECT(tmp = stri_sub(str_tmp, VECTOR_ELT(from, i%from_len), R_NilValue, R_NilValue)); } SET_VECTOR_ELT(ret, i, tmp); UNPROTECT(1); //tmp } UNPROTECT(6); return ret; }
/** internal function - replace multiple substrings in a single string * can raise Rf_error * * @version 1.3.2 (Marek Gagolewski, 2019-02-23) * * @version 1.4.3 (Marek Gagolewski, 2019-03-12) * #346: na_omit for `value` */ SEXP stri__sub_replacement_all_single(SEXP curs, SEXP from, SEXP to, SEXP length, bool omit_na_1, SEXP value) { // curs is a CHARSXP in UTF-8 PROTECT(value = stri_enc_toutf8(value, Rf_ScalarLogical(FALSE), Rf_ScalarLogical(FALSE))); R_len_t value_len = LENGTH(value); R_len_t from_len = 0; // see below R_len_t to_len = 0; // see below R_len_t length_len = 0; // see below int* from_tab = 0; // see below int* to_tab = 0; // see below int* length_tab = 0; // see below R_len_t sub_protected = 1+ /* how many objects to PROTECT on ret? */ stri__sub_prepare_from_to_length(from, to, length, from_len, to_len, length_len, from_tab, to_tab, length_tab); R_len_t vectorize_len = stri__recycling_rule(true, 2, // does not care about value_len from_len, (to_len>length_len)?to_len:length_len); if (vectorize_len <= 0) { // "nothing" is being replaced -> return the input as-is UNPROTECT(sub_protected); return curs; } if (value_len <= 0) { // things are supposed to be replaced with "nothing"... UNPROTECT(sub_protected); Rf_warning(MSG__REPLACEMENT_ZERO); return NA_STRING; } if (vectorize_len % value_len != 0) Rf_warning(MSG__WARN_RECYCLING_RULE2); const char* curs_s = CHAR(curs); // already in UTF-8 R_len_t curs_n = LENGTH(curs); // first check for NAs.... if (!omit_na_1) { for (R_len_t i=0; i<vectorize_len; ++i) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (cur_from == NA_INTEGER || cur_to == NA_INTEGER) { UNPROTECT(sub_protected); if (omit_na_1) return curs; else return NA_STRING; } } for (R_len_t i=0; i<vectorize_len; ++i) { if (STRING_ELT(value, i%value_len) == NA_STRING) { UNPROTECT(sub_protected); return NA_STRING; } } } // get the number of code points in curs, if required (for negative indexes) R_len_t curs_m = -1; if (IS_ASCII(curs)) curs_m = curs_n; else { // is UTF-8 curs_m = 0; // code points count R_len_t j = 0; // byte pos while (j < curs_n) { U8_FWD_1_UNSAFE(curs_s, j); ++curs_m; } } STRI__ERROR_HANDLER_BEGIN(sub_protected) std::vector<char> buf; // convenience >> speed R_len_t buf_size; R_len_t last_pos = 0; R_len_t byte_pos = 0, byte_pos_last; for (R_len_t i=0; i<vectorize_len; ++i) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || STRING_ELT(value, i%value_len) == NA_STRING) { continue; } if (cur_from < 0) cur_from = curs_m+cur_from+1; if (cur_from <= 0) cur_from = 1; cur_from--; // 1-based -> 0-based index if (cur_from >= curs_m) cur_from = curs_m; // cur_from is in [0, curs_m] if (length_tab) { if (cur_to < 0) cur_to = 0; cur_to = cur_from+cur_to; } else { if (cur_to < 0) cur_to = curs_m+cur_to+1; if (cur_to < cur_from) cur_to = cur_from; // insertion } if (cur_to >= curs_m) cur_to = curs_m; // the chunk to replace is at code points [cur_from, cur_to) // Rprintf("orig [%d,%d) repl [%d,%d)\n", last_pos, cur_from, cur_from, cur_to); if (last_pos > cur_from) throw StriException(MSG__OVERLAPPING_OR_UNSORTED_INDEXES); // first, copy [last_pos, cur_from) byte_pos_last = byte_pos; while (last_pos < cur_from) { U8_FWD_1_UNSAFE(curs_s, byte_pos); ++last_pos; } buf_size = buf.size(); buf.resize(buf_size+byte_pos-byte_pos_last); memcpy(buf.data()+buf_size, curs_s+byte_pos_last, byte_pos-byte_pos_last); // then, copy the corresponding replacement string SEXP value_cur = STRING_ELT(value, i%value_len); const char* value_s = CHAR(value_cur); R_len_t value_n = LENGTH(value_cur); buf_size = buf.size(); buf.resize(buf_size+value_n); memcpy(buf.data()+buf_size, value_s, value_n); // lastly, update last_pos // ---> last_pos = cur_to; while (last_pos < cur_to) { U8_FWD_1_UNSAFE(curs_s, byte_pos); ++last_pos; } } // finally, copy [last_pos, curs_m) // Rprintf("orig [%d,%d)\n", last_pos, curs_m); buf_size = buf.size(); buf.resize(buf_size+curs_n-byte_pos); memcpy(buf.data()+buf_size, curs_s+byte_pos, curs_n-byte_pos); SEXP ret; STRI__PROTECT(ret = Rf_mkCharLenCE(buf.data(), buf.size(), CE_UTF8)); STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Get substring * * * @param str character vector * @param from integer vector (possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @return character vector * * @version 0.1-?? (Bartek Tartanus) * stri_sub * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8 and stri__UChar32_to_UTF8_index * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * use StriContainerUTF8's UChar32-to-UTF8 index * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * Use stri__sub_prepare_from_to_length() * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-9003 (Marek Gagolewski, 2015-08-05) * Bugfix #183: floating point exception when to or length is an empty vector */ SEXP stri_sub(SEXP str, SEXP from, SEXP to, SEXP length) { PROTECT(str = stri_prepare_arg_string(str, "str")); R_len_t str_len = LENGTH(str); R_len_t from_len = 0; R_len_t to_len = 0; R_len_t length_len = 0; int* from_tab = 0; int* to_tab = 0; int* length_tab = 0; R_len_t sub_protected = 1+ /* how many objects to PROTECT on ret? */ stri__sub_prepare_from_to_length(from, to, length, from_len, to_len, length_len, from_tab, to_tab, length_tab); R_len_t vectorize_len = stri__recycling_rule(true, 3, str_len, from_len, (to_len>length_len)?to_len:length_len); if (vectorize_len <= 0) { UNPROTECT(sub_protected); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(sub_protected) StriContainerUTF8_indexable str_cont(str, vectorize_len); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (str_cont.isNA(i) || cur_from == NA_INTEGER || cur_to == NA_INTEGER) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (length_tab) { if (cur_to <= 0) { SET_STRING_ELT(ret, i, R_BlankString); continue; } cur_to = cur_from + cur_to - 1; if (cur_from < 0 && cur_to >= 0) cur_to = -1; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t cur_from2; // UTF-8 byte indices R_len_t cur_to2; // UTF-8 byte indices stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2); if (cur_to2 > cur_from2) { // just copy SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+cur_from2, cur_to2-cur_from2, CE_UTF8)); } else { // maybe a warning here? SET_STRING_ELT(ret, i, Rf_mkCharLen(NULL, 0)); } } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Substring replacement function * * * @param str character vector * @param from integer vector (possibly with negative indices) * @param to integer vector (possibly with negative indices) or NULL * @param length integer vector or NULL * @param omit_na logical scalar * @param value character vector replacement * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF8 and stri__UChar32_to_UTF8_index * * @version 0.1-?? (Marek Gagolewski, 2013-06-01) * use StriContainerUTF8's UChar32-to-UTF8 index * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.2-1 (Marek Gagolewski, 2014-03-20) * Use StriContainerUTF8_indexable * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * Use stri__sub_prepare_from_to_length() * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-9003 (Marek Gagolewski, 2015-08-05) * Bugfix #183: floating point exception when to or length is an empty vector * * @version 1.0-2 (Marek Gagolewski, 2016-01-31) * FR #199: new arg: `omit_na` * FR #207: allow insertions * * * @version 1.4.3 (Marek Gagolewski, 2019-03-12) * #346: na_omit for `value` */ SEXP stri_sub_replacement(SEXP str, SEXP from, SEXP to, SEXP length, SEXP omit_na, SEXP value) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(value = stri_prepare_arg_string(value, "value")); bool omit_na_1 = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na"); R_len_t value_len = LENGTH(value); R_len_t str_len = LENGTH(str); R_len_t from_len = 0; // see below R_len_t to_len = 0; // see below R_len_t length_len = 0; // see below int* from_tab = 0; // see below int* to_tab = 0; // see below int* length_tab = 0; // see below R_len_t sub_protected = 2+ /* how many objects to PROTECT on ret? */ stri__sub_prepare_from_to_length(from, to, length, from_len, to_len, length_len, from_tab, to_tab, length_tab); R_len_t vectorize_len = stri__recycling_rule(true, 4, str_len, value_len, from_len, (to_len>length_len)?to_len:length_len); if (vectorize_len <= 0) { UNPROTECT(sub_protected); return Rf_allocVector(STRSXP, 0); } STRI__ERROR_HANDLER_BEGIN(sub_protected) StriContainerUTF8_indexable str_cont(str, vectorize_len); StriContainerUTF8 value_cont(value, vectorize_len); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_len)); String8buf buf(0); // @TODO: estimate bufsize a priori for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { R_len_t cur_from = from_tab[i % from_len]; R_len_t cur_to = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len]; if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || value_cont.isNA(i)) { if (omit_na_1) { SET_STRING_ELT(ret, i, str_cont.toR(i)); } else { SET_STRING_ELT(ret, i, NA_STRING); } continue; } if (length_tab) { if (cur_to <= 0) { // SET_STRING_ELT(ret, i, R_BlankString); // continue; cur_to = 0; } else { cur_to = cur_from + cur_to - 1; if (cur_from < 0 && cur_to >= 0) cur_to = -1; } } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); const char* value_cur_s = value_cont.get(i).c_str(); R_len_t value_cur_n = value_cont.get(i).length(); R_len_t cur_from2; // UTF-8 byte indices R_len_t cur_to2; // UTF-8 byte indices stri__sub_get_indices(str_cont, i, cur_from, cur_to, cur_from2, cur_to2); if (cur_to2 < cur_from2) cur_to2 = cur_from2; R_len_t buflen = str_cur_n-(cur_to2-cur_from2)+value_cur_n; buf.resize(buflen, false/*destroy contents*/); memcpy(buf.data(), str_cur_s, (size_t)cur_from2); memcpy(buf.data()+cur_from2, value_cur_s, (size_t)value_cur_n); memcpy(buf.data()+cur_from2+value_cur_n, str_cur_s+cur_to2, (size_t)str_cur_n-cur_to2); SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buflen, CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Trim characters from a charclass from left AND/OR right side of the string * * @param str character vector * @param pattern character vector * @param left from left? * @param right from left? * @return character vector * * @version 0.1-?? (Bartek Tartanus) * * @version 0.1-?? (Marek Gagolewski, 2013-06-04) * Use StriContainerUTF8 and CharClass * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly & Use StrContainerCharClass * * @version 0.2-1 (Marek Gagolewski, 2014-04-03) * detects invalid UTF-8 byte stream * * @version 0.2-1 (Marek Gagolewski, 2014-04-05) * StriContainerCharClass now relies on UnicodeSet * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); STRI__ERROR_HANDLER_BEGIN(2) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (str_cont.isNA(i) || pattern_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } const UnicodeSet* pattern_cur = &pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t jlast1 = 0; R_len_t jlast2 = str_cur_n; if (left) { UChar32 chr; for (R_len_t j=0; j<str_cur_n; ) { U8_NEXT(str_cur_s, j, str_cur_n, chr); // "look ahead" if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { break; // break at first occurrence } jlast1 = j; } } if (right && jlast1 < str_cur_n) { UChar32 chr; for (R_len_t j=str_cur_n; j>0; ) { U8_PREV(str_cur_s, 0, j, chr); // "look behind" if (chr < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (pattern_cur->contains(chr)) { break; // break at first occurrence } jlast2 = j; } } // now jlast is the index, from which we start copying SET_STRING_ELT(ret, i, Rf_mkCharLenCE(str_cur_s+jlast1, (jlast2-jlast1), CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Split a string into text lines * * @param str character vector * @param n_max integer vector * @param omit_empty logical vector * * @return list of character vectors * * @version 0.1 (Marek Gagolewski, 2013-08-04) */ SEXP stri_split_lines(SEXP str, SEXP n_max, SEXP omit_empty) { str = stri_prepare_arg_string(str, "str"); n_max = stri_prepare_arg_integer(n_max, "n_max"); omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty"); R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(n_max), LENGTH(omit_empty)); STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); if (n_max_cur < 0) n_max_cur = INT_MAX; else if (n_max_cur == 0) { SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); continue; } //#define STRI_INDEX_NEWLINE_CR 0 //#define STRI_INDEX_NEWLINE_LF 1 //#define STRI_INDEX_NEWLINE_CRLF 2 //#define STRI_INDEX_NEWLINE_NEL 3 //#define STRI_INDEX_NEWLINE_VT 4 //#define STRI_INDEX_NEWLINE_FF 5 //#define STRI_INDEX_NEWLINE_LS 6 //#define STRI_INDEX_NEWLINE_PS 7 //#define STRI_INDEX_NEWLINE_LAST 8 // int counts[STRI_INDEX_NEWLINE_LAST]; // for (R_len_t j=0; j<STRI_INDEX_NEWLINE_LAST; ++j) // counts[j] = 0; UChar32 c; R_len_t jlast, k=1; deque<R_len_t_x2> occurences; occurences.push_back(R_len_t_x2(0, 0)); for (R_len_t j=0; j < str_cur_n && k < n_max_cur; /* null */) { jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); switch (c) { case ASCII_CR: /* CR */ // counts[STRI_INDEX_NEWLINE_CR]++; /* check if next is LF */ if (str_cur_s[j] == ASCII_LF) { // look ahead one byte // counts[STRI_INDEX_NEWLINE_LF]++; // counts[STRI_INDEX_NEWLINE_CRLF]++; j++; // just one byte } break; case ASCII_LF: /* LF */ // counts[STRI_INDEX_NEWLINE_LF]++; break; case UCHAR_NEL: /* NEL */ // counts[STRI_INDEX_NEWLINE_NEL]++; break; case ASCII_VT: /* VT */ // counts[STRI_INDEX_NEWLINE_VT]++; break; case ASCII_FF: /* FF */ // counts[STRI_INDEX_NEWLINE_FF]++; break; case UCHAR_LS: /* LS */ // counts[STRI_INDEX_NEWLINE_LS]++; break; case UCHAR_PS: /* PS */ // counts[STRI_INDEX_NEWLINE_PS]++; break; default: /* not a newline character */ occurences.back().v2 = j; continue; } // if here, then at newline if (omit_empty_cur && occurences.back().v2 == occurences.back().v1) occurences.back().v1 = occurences.back().v2 = j; // don't start new field else { occurences.back().v2 = jlast; occurences.push_back(R_len_t_x2(j, j)); ++k; // another field } } if (k == n_max_cur) occurences.back().v2 = str_cur_n; if (omit_empty_cur && occurences.back().v1 == occurences.back().v2) occurences.pop_back(); SEXP ans; PROTECT(ans = Rf_allocVector(STRSXP, (R_len_t)occurences.size())); deque<R_len_t_x2>::iterator iter = occurences.begin(); for (R_len_t l = 0; iter != occurences.end(); ++iter, ++l) { R_len_t_x2 curoccur = *iter; SET_STRING_ELT(ans, l, Rf_mkCharLenCE(str_cur_s+curoccur.v1, curoccur.v2-curoccur.v1, CE_UTF8)); } SET_VECTOR_ELT(ret, i, ans); UNPROTECT(1); } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Pad a string * * vectorized over str, length and pad * if str or pad or length is NA the result will be NA * * @param str character vector * @param min_length integer vector * @param side [internal int] * @param pad character vector * @param use_length single logical value * @return character vector * * @version 0.1-?? (Bartlomiej Tartanus) * * @version 0.2-2 (Marek Gagolewski, 2014-04-20) * use stri_error_handler, pad should be a single code point, not byte * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.5-1 (Marek Gagolewski, 2015-04-22) * `use_length` arg added, * second argument renamed `width` */ SEXP stri_pad(SEXP str, SEXP width, SEXP side, SEXP pad, SEXP use_length) { // this is an internal arg, check manually, error() allowed here if (!Rf_isInteger(side) || LENGTH(side) != 1) Rf_error(MSG__INCORRECT_INTERNAL_ARG); int _side = INTEGER(side)[0]; if (_side < 0 || _side > 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG); bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length"); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(width = stri_prepare_arg_integer(width, "width")); PROTECT(pad = stri_prepare_arg_string(pad, "pad")); // side = stri_prepare_arg_string(side, "side"); // const char* side_opts[] = {"left", "right", "both", NULL}; R_len_t str_length = LENGTH(str); R_len_t width_length = LENGTH(width); // R_len_t side_length = LENGTH(side); R_len_t pad_length = LENGTH(pad); R_len_t vectorize_length = stri__recycling_rule(true, 3, str_length, width_length, /*side_length, */ pad_length); STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger width_cont(width, vectorize_length); // StriContainerUTF8 side_cont(side, vectorize_length); StriContainerUTF8 pad_cont(pad, vectorize_length); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length)); String8buf buf(0); // TODO: prealloc for (R_len_t i=0; i<vectorize_length; ++i) { if (str_cont.isNA(i) || pad_cont.isNA(i) || /*side_cont.isNA(i) ||*/ width_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } // get the current string R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_width; // get the width/length of padding code point(s) R_len_t pad_cur_n = pad_cont.get(i).length(); const char* pad_cur_s = pad_cont.get(i).c_str(); R_len_t pad_cur_width; if (use_length_val) { pad_cur_width = 1; str_cur_width = str_cont.get(i).countCodePoints(); R_len_t k = 0; UChar32 pad_cur = 0; U8_NEXT(pad_cur_s, k, pad_cur_n, pad_cur); if (pad_cur <= 0 || k < pad_cur_n) throw StriException(MSG__NOT_EQ_N_CODEPOINTS, "pad", 1); } else { pad_cur_width = stri__width_string(pad_cur_s, pad_cur_n); str_cur_width = stri__width_string(str_cur_s, str_cur_n); if (pad_cur_width != 1) throw StriException(MSG__NOT_EQ_N_WIDTH, "pad", 1); } // get the minimal width R_len_t width_cur = width_cont.get(i); if (str_cur_width >= width_cur) { // no padding at all SET_STRING_ELT(ret, i, str_cont.toR(i)); continue; } R_len_t padnum = width_cur-str_cur_width; buf.resize(str_cur_n+padnum*pad_cur_n, false); char* buftmp = buf.data(); R_len_t k = 0; switch(_side) { case 0: // left for (k=0; k<padnum; ++k) { memcpy(buftmp, pad_cur_s, pad_cur_n); buftmp += pad_cur_n; } memcpy(buftmp, str_cur_s, str_cur_n); buftmp += str_cur_n; break; case 1: // right memcpy(buftmp, str_cur_s, str_cur_n); buftmp += str_cur_n; for (k=0; k<padnum; ++k) { memcpy(buftmp, pad_cur_s, pad_cur_n); buftmp += pad_cur_n; } break; case 2: // both for (k=0; k<padnum/2; ++k) { memcpy(buftmp, pad_cur_s, pad_cur_n); buftmp += pad_cur_n; } memcpy(buftmp, str_cur_s, str_cur_n); buftmp += str_cur_n; for (; k<padnum; ++k) { memcpy(buftmp, pad_cur_s, pad_cur_n); buftmp += pad_cur_n; } break; } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), (int)(buftmp-buf.data()), CE_UTF8)); } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Extract all occurences of a character class in each string * * @param str character vector * @param pattern character vector * @return list of character vectors * * @version 0.1 (Marek Gagolewski, 2013-06-08) * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_extract_all_charclass(SEXP str, SEXP pattern, SEXP merge) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); merge = stri_prepare_arg_logical(merge, "merge"); R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(merge)); STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); StriContainerLogical merge_cont(merge, vectorize_length); SEXP notfound; // this vector will be set iff not found or NA PROTECT(notfound = stri__vector_NA_strings(1)); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i) || str_cont.isNA(i) || merge_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, notfound); continue; } bool merge_cur = merge_cont.get(i); CharClass pattern_cur = pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j, jlast; UChar32 chr; deque<R_len_t_x2> occurences; // codepoint based-indices for (jlast=j=0; j<str_cur_n; ) { U8_NEXT(str_cur_s, j, str_cur_n, chr); if (pattern_cur.test(chr)) { occurences.push_back(R_len_t_x2(jlast, j)); } jlast = j; } R_len_t noccurences = (R_len_t)occurences.size(); if (noccurences == 0) SET_VECTOR_ELT(ret, i, notfound); else if (merge_cur && noccurences > 1) { // do merge deque<R_len_t_x2> occurences2; deque<R_len_t_x2>::iterator iter = occurences.begin(); occurences2.push_back(*iter); for (++iter; iter != occurences.end(); ++iter) { R_len_t_x2 curoccur = *iter; if (occurences2.back().v2 == curoccur.v1) { // continue seq occurences2.back().v2 = curoccur.v2; // change `end` } else { // new seq occurences2.push_back(curoccur); } } // create resulting matrix from occurences2 R_len_t noccurences2 = (R_len_t)occurences2.size(); SEXP cur_res; PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences2)); iter = occurences2.begin(); for (R_len_t f = 0; iter != occurences2.end(); ++iter, ++f) { R_len_t_x2 curo = *iter; SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); UNPROTECT(1); } else { // do not merge SEXP cur_res; PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences)); deque<R_len_t_x2>::iterator iter = occurences.begin(); for (R_len_t f = 0; iter != occurences.end(); ++iter, ++f) { R_len_t_x2 curo = *iter; SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); UNPROTECT(1); } } UNPROTECT(2); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** Date-time artithmetic * * @param time * @param value * @param units * @param tz * @param locale * * @return POSIXst * * @version 0.5-1 (Marek Gagolewski, 2014-12-30) * @version 0.5-1 (Marek Gagolewski, 2015-03-06) tz arg added */ SEXP stri_datetime_add(SEXP time, SEXP value, SEXP units, SEXP tz, SEXP locale) { PROTECT(time = stri_prepare_arg_POSIXct(time, "time")); PROTECT(value = stri_prepare_arg_integer(value, "value")); if (!isNull(tz)) PROTECT(tz = stri_prepare_arg_string_1(tz, "tz")); else PROTECT(tz); /* needed to set tzone attrib */ R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(time), LENGTH(value)); const char* units_val = stri__prepare_arg_string_1_notNA(units, "units"); const char* units_opts[] = {"years", "months", "weeks", "days", "hours", "minutes", "seconds", "milliseconds", NULL}; int units_cur = stri__match_arg(units_val, units_opts); const char* locale_val = stri__prepare_arg_locale(locale, "locale", true); TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/); Calendar* cal = NULL; STRI__ERROR_HANDLER_BEGIN(3) StriContainerDouble time_cont(time, vectorize_length); StriContainerInteger value_cont(value, vectorize_length); UCalendarDateFields units_field; switch (units_cur) { case 0: units_field = UCAL_YEAR; break; case 1: units_field = UCAL_MONTH; break; case 2: units_field = UCAL_WEEK_OF_YEAR; break; case 3: units_field = UCAL_DAY_OF_MONTH; break; case 4: units_field = UCAL_HOUR_OF_DAY; break; case 5: units_field = UCAL_MINUTE; break; case 6: units_field = UCAL_SECOND; break; case 7: units_field = UCAL_MILLISECOND; break; default: throw StriException(MSG__INCORRECT_MATCH_OPTION, "units"); } UErrorCode status = U_ZERO_ERROR; cal = Calendar::createInstance(locale_val, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) cal->adoptTimeZone(tz_val); tz_val = NULL; /* The Calendar takes ownership of the TimeZone. */ SEXP ret; STRI__PROTECT(ret = Rf_allocVector(REALSXP, vectorize_length)); double* ret_val = REAL(ret); for (R_len_t i=0; i<vectorize_length; ++i) { if (time_cont.isNA(i) || value_cont.isNA(i)) { ret_val[i] = NA_REAL; continue; } status = U_ZERO_ERROR; cal->setTime((UDate)(time_cont.get(i)*1000.0), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; cal->add(units_field, value_cont.get(i), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; ret_val[i] = ((double)cal->getTime(status))/1000.0; STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) } if (!isNull(tz)) Rf_setAttrib(ret, Rf_ScalarString(Rf_mkChar("tzone")), tz); stri__set_class_POSIXct(ret); if (tz_val) { delete tz_val; tz_val = NULL; } if (cal) { delete cal; cal = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (tz_val) { delete tz_val; tz_val = NULL; } if (cal) { delete cal; cal = NULL; } }) }