/** * Split a string into parts [byte compare] * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. * * @param str character vector * @param pattern character vector * @param n_max integer vector * @param omit_empty logical vector * * * @version 0.1 (Bartek Tartanus) * @version 0.2 (Marek Gagolewski, 2013-06-25) StriException friendly, use StriContainerUTF8 * @version 0.3 (Marek Gagolewski, 2013-07-10) - BUGFIX: wrong behavior on empty str */ SEXP stri__split_fixed_byte(SEXP str, SEXP pattern, SEXP n_max, SEXP omit_empty) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); n_max = stri_prepare_arg_integer(n_max, "n_max"); omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty"); STRI__ERROR_HANDLER_BEGIN R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n_max), LENGTH(omit_empty)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length); StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_max_cont.isNA(i) || omit_empty_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); , SET_VECTOR_ELT(ret, i, stri__vector_empty_strings((omit_empty_cur || n_max_cur == 0)?0:1));)
/**
 * Test whether a regular expression matches somewhere in each string
 *
 * Vectorized over \code{str} and \code{pattern} (recycling rule).
 *
 * @param str R character vector with the strings to search in
 * @param pattern R character vector containing regular expressions
 * @param opts_regex list; translated into regex flags by
 *    StriContainerRegexPattern::getRegexFlags()
 * @return logical vector: result of RegexMatcher::find() for each pair
 *
 * @version 0.1 (Marcin Bujarski)
 * @version 0.2 (Marek Gagolewski) - use StriContainerUTF16
 * @version 0.3 (Marek Gagolewski) - use StriContainerUTF16's vectorization
 * @version 0.4 (Marek Gagolewski, 2013-06-18) use StriContainerRegexPattern + opts_regex
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP opts_regex)
{
    str = stri_prepare_arg_string(str, "str");
    pattern = stri_prepare_arg_string(pattern, "pattern");

    // a zero-length result is handled fine by everything below
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
    uint32_t regex_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

    STRI__ERROR_HANDLER_BEGIN
    // MG: StriContainerUTF8 + utext_openUTF8 was tried here and turned out slower
    StriContainerUTF16 str_cont(str, n_out);
    StriContainerRegexPattern pattern_cont(pattern, n_out, regex_flags);

    SEXP result = PROTECT(Rf_allocVector(LGLSXP, n_out));
    int* out = LOGICAL(result);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // NOTE(review): macro defined elsewhere; by its name it stores the
        // first expression for NA/empty-pattern input and the second one
        // for an empty string, and then continues with the next element
        STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
            out[i] = NA_LOGICAL,
            out[i] = FALSE)

        // the matcher is owned by pattern_cont ("will be deleted automatically")
        RegexMatcher* matcher = pattern_cont.getMatcher(i);
        matcher->reset(str_cont.get(i));
        const int found = (int)matcher->find();
        out[i] = found;
    }

    UNPROTECT(1);
    return result;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** * Split a string into parts. * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. * * @param str character vector * @param pattern character vector * @param n_max integer vector * @param opts_regex * @return list of character vectors * * @version 0.1 (Marek Gagolewski, 2013-06-21) * @version 0.2 (Marek Gagolewski, 2013-07-10) - BUGFIX: wrong behavior on empty str */ SEXP stri_split_regex(SEXP str, SEXP pattern, SEXP n_max, SEXP omit_empty, SEXP opts_regex) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); n_max = stri_prepare_arg_integer(n_max, "n_max"); omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty"); R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n_max), LENGTH(omit_empty)); uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_max_cont.isNA(i) || omit_empty_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, stri__vector_empty_strings((omit_empty_cur || n_max_cur == 0)?0:1));)
/** * Extract all capture groups of the first/last occurence of a regex pattern in each string * * @param str character vector * @param pattern character vector * @param opts_regex list * @param firs logical - search for the first or the last occurence? * @return character matrix * * @version 0.1 (Marek Gagolewski, 2013-06-22) */ SEXP stri__match_firstlast_regex(SEXP str, SEXP pattern, SEXP opts_regex, bool first) { str = stri_prepare_arg_string(str, "str"); // prepare string argument pattern = stri_prepare_arg_string(pattern, "pattern"); // prepare string argument R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); vector< vector<charptr_x2> > occurences(vectorize_length); // we don't know how many capture groups are there R_len_t occurences_max = 1; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, /*do nothing*/;, int pattern_cur_groups = pattern_cont.getMatcher(i)->groupCount(); if (occurences_max < pattern_cur_groups+1) occurences_max=pattern_cur_groups+1; )
/**
 * Count how many times \code{pattern} occurs in \code{str}
 * [fast but naive bytewise comparison]
 *
 * Vectorized over \code{str} and \code{pattern} (recycling rule).
 *
 * @param str strings to search in
 * @param pattern patterns to search for
 * @return integer vector with the number of matches reported by
 *    StriContainerByteSearch::findNext()
 *
 * @version 0.1 (Bartek Tartanus)
 * @version 0.2 (Marek Gagolewski) - use StriContainerUTF8
 * @version 0.3 (Marek Gagolewski) - corrected behavior on empty str/pattern
 * @version 0.4 (Marek Gagolewski, 2013-06-23) make StriException-friendly,
 *    use StriContainerByteSearch
 */
SEXP stri__count_fixed_byte(SEXP str, SEXP pattern)
{
    str = stri_prepare_arg_string(str, "str");
    pattern = stri_prepare_arg_string(pattern, "pattern");

    STRI__ERROR_HANDLER_BEGIN
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
    StriContainerUTF8 str_cont(str, n_out);
    StriContainerByteSearch pattern_cont(pattern, n_out);

    SEXP result = PROTECT(Rf_allocVector(INTSXP, n_out));
    int* counts = INTEGER(result);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // NOTE(review): macro defined elsewhere; by its name it handles
        // NA/empty inputs (NA_INTEGER resp. 0) and skips to the next element
        STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
            counts[i] = NA_INTEGER,
            counts[i] = 0)

        pattern_cont.setupMatcher(i, str_cont.get(i).c_str(), str_cont.get(i).length());

        // keep asking for the next match until the search is exhausted
        int hits = 0;
        for (;;) {
            if (pattern_cont.findNext() == USEARCH_DONE)
                break;
            ++hits;
        }
        counts[i] = hits;
    }

    UNPROTECT(1);
    return result;
    STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ )
}
/**
 * Select the strings that contain a code point from a character class
 *
 * Vectorized over \code{str} and \code{pattern} (recycling rule).
 *
 * @param str character vector
 * @param pattern character vector (character classes)
 * @param omit_na single logical value; if TRUE, elements for which
 *    str or pattern is NA are dropped, otherwise they are reported as NA
 * @param negate single logical value; if TRUE, the strings that do NOT
 *    contain a matching code point are selected instead
 * @return result of stri__subset_by_logical() for the computed selection
 *
 * @version 0.3-1 (Bartek Tartanus, 2014-07-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-17)
 *                using std::vector<int> to avoid mem-leaks
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-04)
 *    FR #122: omit_na arg added
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass(SEXP str, SEXP pattern, SEXP omit_na, SEXP negate)
{
    bool invert = stri__prepare_arg_logical_1_notNA(negate, "negate");
    bool drop_na = stri__prepare_arg_logical_1_notNA(omit_na, "omit_na");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    STRI__ERROR_HANDLER_BEGIN(2)
    StriContainerUTF8 str_cont(str, n_out);
    StriContainerCharClass pattern_cont(pattern, n_out);

    // BT: a deque cannot be used here, because pattern is reused, so i does
    // not go like 0,1,2...n but 0,pat_len,2*pat_len,1,pat_len+1 and so on
    // MG: agreed
    std::vector<int> selected(n_out);
    int n_selected = 0;

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
            if (!drop_na) {
                // an NA element still occupies a slot in the output
                selected[i] = NA_LOGICAL;
                ++n_selected;
            }
            else {
                selected[i] = FALSE;
            }
            continue;
        }

        const UnicodeSet& charset = pattern_cont.get(i);
        R_len_t nbytes = str_cont.get(i).length();
        const char* bytes = str_cont.get(i).c_str();

        // scan code point by code point until the first member of the set
        bool matched = false;
        UChar32 c = 0;
        R_len_t pos = 0;
        while (pos < nbytes) {
            U8_NEXT(bytes, pos, nbytes, c);
            if (c < 0) // invalid utf-8 sequence
                throw StriException(MSG__INVALID_UTF8);
            if (charset.contains(c)) {
                matched = true;
                break;
            }
        }

        const bool keep = (matched != invert);
        selected[i] = keep ? TRUE : FALSE;
        if (keep)
            ++n_selected;
    }

    SEXP ret;
    STRI__PROTECT(ret = stri__subset_by_logical(str_cont, selected, n_selected));
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Replace the strings that contain a code point from a character class
 *
 * Iterates over \code{str}; every selected element is replaced with the
 * next element of \code{value} (values are recycled), all other non-NA
 * elements are copied unchanged.
 *
 * @param str character vector
 * @param pattern character vector (prepared with stri_prepare_arg_string_1)
 * @param negate single logical value; if TRUE, the elements WITHOUT
 *    a matching code point are replaced instead
 * @param value character vector of replacements; must not be empty
 * @return character vector of the same length as str; NA where
 *    str or pattern is NA
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR#124
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 */
SEXP stri_subset_charclass_replacement(SEXP str, SEXP pattern, SEXP negate, SEXP value)
{
    bool invert = stri__prepare_arg_logical_1_notNA(negate, "negate");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string_1(pattern, "pattern"));
    PROTECT(value = stri_prepare_arg_string(value, "value"));

    int n_out = LENGTH(str);
    int n_values = LENGTH(value);
    if (n_values == 0)
        Rf_error(MSG__REPLACEMENT_ZERO);

    STRI__ERROR_HANDLER_BEGIN(3)
    StriContainerUTF8 str_cont(str, n_out);
    StriContainerUTF8 value_cont(value, n_values);
    StriContainerCharClass pattern_cont(pattern, n_out);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_out));

    R_len_t n_replaced = 0; // how many replacement values were consumed so far
    for (R_len_t i = str_cont.vectorize_init();
            i != str_cont.vectorize_end();
            i = str_cont.vectorize_next(i))
    {
        if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        const UnicodeSet& charset = pattern_cont.get(i);
        R_len_t nbytes = str_cont.get(i).length();
        const char* bytes = str_cont.get(i).c_str();

        bool matched = false;
        UChar32 c = 0;
        R_len_t pos = 0;
        while (pos < nbytes) {
            U8_NEXT(bytes, pos, nbytes, c);
            if (c < 0) // invalid utf-8 sequence
                throw StriException(MSG__INVALID_UTF8);
            if (charset.contains(c)) {
                matched = true;
                break;
            }
        }

        if (matched != invert) {
            // selected: take the next replacement, wrapping around
            SET_STRING_ELT(ret, i, value_cont.toR(n_replaced % n_values));
            ++n_replaced;
        }
        else {
            SET_STRING_ELT(ret, i, str_cont.toR(i));
        }
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Extract the first or the last code point of each string
 * that belongs to a character class
 *
 * Vectorized over \code{str} and \code{pattern} (recycling rule).
 *
 * @param str character vector
 * @param pattern character vector
 * @param first if true, scan forwards and report the first match,
 *    otherwise scan backwards and report the last one
 * @return character vector; NA where str or pattern is NA
 *    or where nothing matched
 *
 * @version 0.1 (Marek Gagolewski, 2013-06-08)
 * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri__extract_firstlast_charclass(SEXP str, SEXP pattern, bool first)
{
    str = stri_prepare_arg_string(str, "str");
    pattern = stri_prepare_arg_string(pattern, "pattern");
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    STRI__ERROR_HANDLER_BEGIN
    StriContainerUTF8 str_cont(str, n_out);
    StriContainerCharClass pattern_cont(pattern, n_out);

    SEXP result = PROTECT(Rf_allocVector(STRSXP, n_out));

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // NA is the answer unless a match is found below
        SET_STRING_ELT(result, i, NA_STRING);
        if (str_cont.isNA(i) || pattern_cont.isNA(i))
            continue;

        CharClass cls = pattern_cont.get(i);
        R_len_t nbytes = str_cont.get(i).length();
        const char* bytes = str_cont.get(i).c_str();
        UChar32 c;

        if (!first) {
            // walk backwards; [pos, upper) delimits the code point just read
            R_len_t pos = nbytes;
            R_len_t upper = nbytes;
            while (pos > 0) {
                U8_PREV(bytes, 0, pos, c);
                if (cls.test(c)) {
                    SET_STRING_ELT(result, i,
                        Rf_mkCharLenCE(bytes + pos, upper - pos, CE_UTF8));
                    break; // that's enough for last
                }
                upper = pos;
            }
        }
        else {
            // walk forwards; [lower, pos) delimits the code point just read
            R_len_t pos = 0;
            R_len_t lower = 0;
            while (pos < nbytes) {
                U8_NEXT(bytes, pos, nbytes, c);
                if (cls.test(c)) {
                    SET_STRING_ELT(result, i,
                        Rf_mkCharLenCE(bytes + lower, pos - lower, CE_UTF8));
                    break; // that's enough for first
                }
                lower = pos;
            }
        }
    }

    UNPROTECT(1);
    return result;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Test whether a regular expression matches somewhere in each string
 *
 * Vectorized over \code{str} and \code{pattern} (recycling rule).
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param negate single bool; if TRUE, each result is logically negated
 * @param max_count single int; every TRUE result decrements a positive
 *    budget and, once the budget is 0, all remaining elements are NA
 *    (a negative value is never decremented, i.e. there is no limit)
 * @param opts_regex list; translated into regex flags by
 *    StriContainerRegexPattern::getRegexFlags()
 * @return logical vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16's vectorization
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
 *          use StriContainerRegexPattern + opts_regex
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *    Issue #214: allow a regex pattern like `.*`  to match an empty string
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate,
    SEXP max_count, SEXP opts_regex)
{
    bool invert = stri__prepare_arg_logical_1_notNA(negate, "negate");
    int remaining = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    uint32_t regex_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

    STRI__ERROR_HANDLER_BEGIN(2)
    // UTF-16 input is used on purpose: in mbmark-regex-detect1.R it took
    // 0.07171792 s, whereas StriContainerUTF8 + utext_openUTF8 (UText)
    // took 0.10531605 s
    StriContainerUTF16 str_cont(str, n_out);
    StriContainerRegexPattern pattern_cont(pattern, n_out, regex_flags);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(LGLSXP, n_out));
    int* out = LOGICAL(ret);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        if (remaining == 0) {
            // budget of TRUE results used up
            out[i] = NA_LOGICAL;
            continue;
        }

        // NOTE(review): macro defined elsewhere; by its name it stores NA
        // for NA str/pattern or an empty pattern and skips to the next element
        STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont,
            out[i] = NA_LOGICAL)

        // the matcher is owned by pattern_cont ("will be deleted automatically")
        RegexMatcher* matcher = pattern_cont.getMatcher(i);
        matcher->reset(str_cont.get(i));

        int hit = (int)matcher->find(); // find() returns UBool
        if (invert)
            hit = !hit;
        out[i] = hit;

        if (hit && remaining > 0)
            --remaining;
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Test whether each string contains a code point from a character class
 *
 * Vectorized over \code{str} and \code{pattern} (recycling rule).
 *
 * @param str character vector
 * @param pattern character vector
 * @param negate single bool; if TRUE, each result is logically negated
 * @param max_count single int; every TRUE result decrements a positive
 *    budget and, once the budget is 0, all remaining elements are NA
 *    (a negative value is never decremented, i.e. there is no limit)
 * @return logical vector; NA where str or pattern is NA
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-02)
 *          Use StrContainerUTF8 and CharClass classes
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-15)
 *          Use StrContainerCharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern, SEXP negate, SEXP max_count)
{
    bool invert = stri__prepare_arg_logical_1_notNA(negate, "negate");
    int remaining = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    STRI__ERROR_HANDLER_BEGIN(2)
    StriContainerUTF8 str_cont(str, n_out);
    StriContainerCharClass pattern_cont(pattern, n_out);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(LGLSXP, n_out));
    int* out = LOGICAL(ret);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // exhausted budget and missing inputs all yield NA
        const bool no_answer =
            (remaining == 0) || str_cont.isNA(i) || pattern_cont.isNA(i);
        if (no_answer) {
            out[i] = NA_LOGICAL;
            continue;
        }

        const UnicodeSet& charset = pattern_cont.get(i);
        R_len_t nbytes = str_cont.get(i).length();
        const char* bytes = str_cont.get(i).c_str();

        // scan code point by code point until the first member of the set
        bool matched = false;
        UChar32 c = 0;
        R_len_t pos = 0;
        while (pos < nbytes) {
            U8_NEXT(bytes, pos, nbytes, c);
            if (c < 0) // invalid UTF-8 sequence
                throw StriException(MSG__INVALID_UTF8);
            if (charset.contains(c)) {
                matched = true;
                break;
            }
        }

        const bool answer = (matched != invert);
        out[i] = answer ? TRUE : FALSE;
        if (answer && remaining > 0)
            --remaining;
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Count pattern occurrences in a string [with collation]
 *
 * Vectorized over \code{str} and \code{pattern} (recycling rule).
 *
 * @param str character vector
 * @param pattern character vector
 * @param opts_collator passed to stri__ucol_open()
 * @return integer vector with the number of matches reported by
 *    usearch_next()
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          corrected behavior on empty str/pattern
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-23)
 *          make StriException-friendly,
 *          use StriContainerUStringSearch
 *
 * @version 0.2-3 (Marek Gagolewski, 2014-05-08)
 *          new fun: stri_count_coll (opts_collator == NA not allowed)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_count_coll(SEXP str, SEXP pattern, SEXP opts_collator)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));

    // The collator is opened only after the arguments have been prepared:
    // had prepare_arg failed afterwards, the collator would have leaked.
    UCollator* collator = NULL;
    collator = stri__ucol_open(opts_collator);

    STRI__ERROR_HANDLER_BEGIN(2)
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
    StriContainerUTF16 str_cont(str, n_out);
    // collator is not owned by pattern_cont, hence the explicit close below
    StriContainerUStringSearch pattern_cont(pattern, n_out, collator);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, n_out));
    int* counts = INTEGER(ret);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // NOTE(review): macro defined elsewhere; by its name it handles
        // NA/empty inputs (NA_INTEGER resp. 0) and skips to the next element
        STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
            counts[i] = NA_INTEGER,
            counts[i] = 0)

        UStringSearch* matcher = pattern_cont.getMatcher(i, str_cont.get(i));
        usearch_reset(matcher);

        UErrorCode status = U_ZERO_ERROR;
        R_len_t hits = 0;
        for (;;) {
            // stop on an ICU failure or as soon as the search is exhausted
            if (U_FAILURE(status))
                break;
            if ((int)usearch_next(matcher, &status) == USEARCH_DONE)
                break;
            ++hits;
        }
        STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

        counts[i] = hits;
    }

    if (collator) {
        ucol_close(collator);
        collator = NULL;
    }
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(
        if (collator) ucol_close(collator);
    )
}
/**
 * Count pattern occurrences in a string [with collation]
 *
 * Vectorized over \code{str} and \code{pattern} (recycling rule).
 *
 * @param str character vector
 * @param pattern character vector
 * @param collator_opts passed to stri__ucol_open(); if no collator is
 *    returned, the work is delegated to \code{stri__count_fixed_byte}
 * @return integer vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski) - corrected behavior on empty str/pattern
 * @version 0.3 (Marek Gagolewski, 2013-06-23) make StriException-friendly,
 *    use StriContainerUStringSearch
 */
SEXP stri_count_fixed(SEXP str, SEXP pattern, SEXP collator_opts)
{
    str = stri_prepare_arg_string(str, "str");
    pattern = stri_prepare_arg_string(pattern, "pattern");

    // The collator is opened only after the arguments have been prepared:
    // had prepare_arg failed afterwards, the collator would have leaked.
    UCollator* collator = stri__ucol_open(collator_opts);
    if (collator == NULL)
        return stri__count_fixed_byte(str, pattern);

    STRI__ERROR_HANDLER_BEGIN
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
    StriContainerUTF16 str_cont(str, n_out);
    // collator is not owned by pattern_cont, hence the explicit close below
    StriContainerUStringSearch pattern_cont(pattern, n_out, collator);

    SEXP result = PROTECT(Rf_allocVector(INTSXP, n_out));
    int* counts = INTEGER(result);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        // NOTE(review): macro defined elsewhere; by its name it handles
        // NA/empty inputs (NA_INTEGER resp. 0) and skips to the next element
        STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
            counts[i] = NA_INTEGER,
            counts[i] = 0)

        UStringSearch* matcher = pattern_cont.getMatcher(i, str_cont.get(i));
        usearch_reset(matcher);

        UErrorCode status = U_ZERO_ERROR;
        int hits = 0;
        for (;;) {
            // a result obtained together with a failure status is not counted
            if ((int)usearch_next(matcher, &status) == USEARCH_DONE)
                break;
            if (U_FAILURE(status))
                break;
            ++hits;
        }
        if (U_FAILURE(status))
            throw StriException(status);

        counts[i] = hits;
    }

    if (collator) {
        ucol_close(collator);
        collator = NULL;
    }
    UNPROTECT(1);
    return result;
    STRI__ERROR_HANDLER_END(
        if (collator) ucol_close(collator);
    )
}
/**
 * Test whether each string contains a code point from a character class
 *
 * Vectorized over \code{str} and \code{pattern} (recycling rule).
 *
 * @param str character vector
 * @param pattern character vector
 * @return logical vector; NA where str or pattern is NA
 *
 * @version 0.1 (Bartek Tartanus)
 * @version 0.2 (Marek Gagolewski, 2013-06-02) Use StrContainerUTF8 and CharClass classes
 * @version 0.3 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.4 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_detect_charclass(SEXP str, SEXP pattern)
{
    str = stri_prepare_arg_string(str, "str");
    pattern = stri_prepare_arg_string(pattern, "pattern");
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    STRI__ERROR_HANDLER_BEGIN
    StriContainerUTF8 str_cont(str, n_out);
    StriContainerCharClass pattern_cont(pattern, n_out);

    SEXP result = PROTECT(Rf_allocVector(LGLSXP, n_out));
    int* out = LOGICAL(result);

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
            out[i] = NA_LOGICAL;
            continue;
        }

        CharClass cls = pattern_cont.get(i);
        R_len_t nbytes = str_cont.get(i).length();
        const char* bytes = str_cont.get(i).c_str();

        // FALSE unless some code point passes the class test
        out[i] = FALSE;
        UChar32 c;
        R_len_t pos = 0;
        while (pos < nbytes) {
            U8_NEXT(bytes, pos, nbytes, c);
            if (!cls.test(c))
                continue;
            out[i] = TRUE;
            break;
        }
    }

    UNPROTECT(1);
    return result;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Trim code points from the left AND/OR right side of each string
 *
 * On each requested side, scanning stops at the first code point that is
 * contained in the pattern's character set; everything before it
 * (left side) or after it (right side) is dropped.
 *
 * @param str character vector
 * @param pattern character vector (character classes)
 * @param left trim from the left?
 * @param right trim from the right?
 * @return character vector; NA where str or pattern is NA
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-04)
 *          Use StriContainerUTF8 and CharClass
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly & Use StrContainerCharClass
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-03)
 *          detects invalid UTF-8 byte stream
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          StriContainerCharClass now relies on UnicodeSet
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri__trim_leftright(SEXP str, SEXP pattern, bool left, bool right)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
    R_len_t n_out = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

    STRI__ERROR_HANDLER_BEGIN(2)
    StriContainerUTF8 str_cont(str, n_out);
    StriContainerCharClass pattern_cont(pattern, n_out);

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_out));

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        if (str_cont.isNA(i) || pattern_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        const UnicodeSet& charset = pattern_cont.get(i);
        R_len_t nbytes = str_cont.get(i).length();
        const char* bytes = str_cont.get(i).c_str();

        // byte range [keep_from, keep_to) survives the trimming
        R_len_t keep_from = 0;
        R_len_t keep_to = nbytes;

        if (left) {
            UChar32 c;
            R_len_t pos = 0;
            while (pos < nbytes) {
                U8_NEXT(bytes, pos, nbytes, c); // "look ahead"
                if (c < 0) // invalid utf-8 sequence
                    throw StriException(MSG__INVALID_UTF8);
                if (charset.contains(c))
                    break; // break at first occurrence
                keep_from = pos;
            }
        }

        // nothing is left to scan if the left pass consumed the whole string
        if (right && keep_from < nbytes) {
            UChar32 c;
            R_len_t pos = nbytes;
            while (pos > 0) {
                U8_PREV(bytes, 0, pos, c); // "look behind"
                if (c < 0) // invalid utf-8 sequence
                    throw StriException(MSG__INVALID_UTF8);
                if (charset.contains(c))
                    break; // break at first occurrence
                keep_to = pos;
            }
        }

        SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(bytes + keep_from, keep_to - keep_from, CE_UTF8));
    }

    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Generate random strings
 *
 * Each output string consists of code points drawn (via unif_rand())
 * from the UnicodeSet given by the corresponding pattern.
 * \code{length} and \code{pattern} are recycled to \code{n} elements;
 * a warning is given if \code{n} is not a multiple of their lengths.
 *
 * @param n single integer; number of strings to generate,
 *    negative values are treated as 0
 * @param length integer vector (non-empty); desired number of code points,
 *    negative values are treated as 0
 * @param pattern character vector (non-empty); character classes
 * @return character vector of length n; NA where length or pattern is NA
 *
 * An exception (MSG__INTERNAL_ERROR) is raised if the largest requested
 * length is so big that the UTF-8 buffer size (4 bytes per code point)
 * would not fit in a 32-bit signed integer.
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          Use StriContainerCharClass which now contains UnicodeSets;
 *          vectorized also over pattern
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_rand_strings(SEXP n, SEXP length, SEXP pattern)
{
    int n_val = stri__prepare_arg_integer_1_notNA(n, "n");
    PROTECT(length = stri_prepare_arg_integer(length, "length"));
    PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));

    if (n_val < 0) n_val = 0; /* that's not NA for sure now */

    R_len_t length_len = LENGTH(length);
    if (length_len <= 0) {
        UNPROTECT(2);
        Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "length");
    }
    else if (length_len > n_val || n_val % length_len != 0)
        Rf_warning(MSG__WARN_RECYCLING_RULE2);

    R_len_t pattern_len = LENGTH(pattern);
    if (pattern_len <= 0) {
        UNPROTECT(2);
        Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "pattern");
    }
    else if (pattern_len > n_val || n_val % pattern_len != 0)
        Rf_warning(MSG__WARN_RECYCLING_RULE2);

    GetRNGstate();
    STRI__ERROR_HANDLER_BEGIN(2)
    StriContainerCharClass pattern_cont(pattern, max(n_val, pattern_len));
    StriContainerInteger length_cont(length, max(n_val, length_len));

    // get max required bufsize
    int* length_tab = INTEGER(length);
    R_len_t bufsize = 0;
    for (R_len_t i=0; i<length_len; ++i) {
        if (length_tab[i] != NA_INTEGER && length_tab[i] > bufsize)
            bufsize = length_tab[i];
    }

    // 1 UChar32 -> max. 4 UTF-8 bytes; refuse sizes for which the
    // multiplication below would overflow a signed 32-bit integer
    // (thrown inside the handler, so PutRNGstate() is still called)
    if (bufsize > INT32_MAX / 4)
        throw StriException(MSG__INTERNAL_ERROR);
    bufsize *= 4;
    String8buf buf(bufsize);
    char* bufdata = buf.data();

    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_val));

    for (R_len_t i=0; i<n_val; ++i) {
        if (length_cont.isNA(i) || pattern_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
        }

        int length_cur = length_cont.get(i);
        if (length_cur < 0) length_cur = 0;

        const UnicodeSet* uset = &(pattern_cont.get(i));
        int32_t uset_size = uset->size();

        // generate string:
        R_len_t j = 0; // number of bytes written to bufdata so far
        UBool err = FALSE;
        for (R_len_t k=0; k<length_cur; ++k) {
            int32_t idx = (int32_t)floor(unif_rand()*(double)uset_size); /* 0..uset_size-1 */
            UChar32 c = uset->charAt(idx);
            // a negative code point means that idx was not valid for this
            // set (e.g., the set is empty)
            if (c < 0) throw StriException(MSG__INTERNAL_ERROR);

            U8_APPEND((uint8_t*)bufdata, j, bufsize, c, err);
            if (err) throw StriException(MSG__INTERNAL_ERROR);
        }
        SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, j, CE_UTF8));
    }

    PutRNGstate();
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({
        PutRNGstate();
    })
}
/**
 * Extract all code points of each string that belong to a character class
 *
 * Vectorized over \code{str}, \code{pattern} and \code{merge}
 * (recycling rule).
 *
 * @param str character vector
 * @param pattern character vector
 * @param merge logical vector; if TRUE, runs of adjacent matching
 *    code points are glued together into single strings
 * @return list of character vectors; a single NA where str, pattern
 *    or merge is NA or where nothing matched
 *
 * @version 0.1 (Marek Gagolewski, 2013-06-08)
 * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_extract_all_charclass(SEXP str, SEXP pattern, SEXP merge)
{
    str = stri_prepare_arg_string(str, "str");
    pattern = stri_prepare_arg_string(pattern, "pattern");
    merge = stri_prepare_arg_logical(merge, "merge");
    R_len_t n_out = stri__recycling_rule(true, 3,
        LENGTH(str), LENGTH(pattern), LENGTH(merge));

    STRI__ERROR_HANDLER_BEGIN
    StriContainerUTF8 str_cont(str, n_out);
    StriContainerCharClass pattern_cont(pattern, n_out);
    StriContainerLogical merge_cont(merge, n_out);

    // shared answer for "not found" and for NA inputs
    SEXP na_item = PROTECT(stri__vector_NA_strings(1));
    SEXP result = PROTECT(Rf_allocVector(VECSXP, n_out));

    for (R_len_t i = pattern_cont.vectorize_init();
            i != pattern_cont.vectorize_end();
            i = pattern_cont.vectorize_next(i))
    {
        if (pattern_cont.isNA(i) || str_cont.isNA(i) || merge_cont.isNA(i)) {
            SET_VECTOR_ELT(result, i, na_item);
            continue;
        }

        bool do_merge = merge_cont.get(i);
        CharClass cls = pattern_cont.get(i);
        R_len_t nbytes = str_cont.get(i).length();
        const char* bytes = str_cont.get(i).c_str();

        // collect the byte range [start, end) of every matching code point
        deque<R_len_t_x2> hits;
        UChar32 c;
        R_len_t pos = 0;
        while (pos < nbytes) {
            const R_len_t start = pos;
            U8_NEXT(bytes, pos, nbytes, c);
            if (cls.test(c))
                hits.push_back(R_len_t_x2(start, pos));
        }

        if (hits.empty()) {
            SET_VECTOR_ELT(result, i, na_item);
            continue;
        }

        // decide which list of ranges is to be emitted
        const deque<R_len_t_x2>* emit = &hits;
        deque<R_len_t_x2> merged;
        if (do_merge && hits.size() > 1) {
            deque<R_len_t_x2>::iterator it = hits.begin();
            merged.push_back(*it);
            for (++it; it != hits.end(); ++it) {
                if (merged.back().v2 == it->v1)
                    merged.back().v2 = it->v2; // adjacent: extend current run
                else
                    merged.push_back(*it);     // gap: start a new run
            }
            emit = &merged;
        }

        // turn the chosen ranges into a character vector
        SEXP pieces = PROTECT(Rf_allocVector(STRSXP, (R_len_t)emit->size()));
        R_len_t slot = 0;
        for (deque<R_len_t_x2>::const_iterator it = emit->begin();
                it != emit->end(); ++it, ++slot)
        {
            SET_STRING_ELT(pieces, slot,
                Rf_mkCharLenCE(bytes + it->v1, it->v2 - it->v1, CE_UTF8));
        }
        SET_VECTOR_ELT(result, i, pieces);
        UNPROTECT(1);
    }

    UNPROTECT(2);
    return result;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}