/** * Detect if a pattern occurs in a string * * @param str R character vector * @param pattern R character vector containing regular expressions * @param opts_regex list * * @version 0.1 (Marcin Bujarski) * @version 0.2 (Marek Gagolewski) - use StriContainerUTF16 * @version 0.3 (Marek Gagolewski) - use StriContainerUTF16's vectorization * @version 0.4 (Marek Gagolewski, 2013-06-18) use StriContainerRegexPattern + opts_regex */ SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP opts_regex) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); // this will work for vectorize_length == 0: uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); STRI__ERROR_HANDLER_BEGIN StriContainerUTF16 str_cont(str, vectorize_length); // MG: tried StriContainerUTF8 + utext_openUTF8 - this was slower StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length)); int* ret_tab = LOGICAL(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, ret_tab[i] = FALSE) RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically matcher->reset(str_cont.get(i)); ret_tab[i] = (int)matcher->find(); } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Count the number of recurrences of \code{pattern} in \code{str} [fast but dummy bitewise compare] * * @param str strings to search in * @param pattern patterns to search for * @return integer vector * * @version 0.1 (Bartek Tartanus) * @version 0.2 (Marek Gagolewski) - use StriContainerUTF8 * @version 0.3 (Marek Gagolewski) - corrected behavior on empty str/pattern * @version 0.4 (Marek Gagolewski, 2013-06-23) make StriException-friendly, * use StriContainerByteSearch */ SEXP stri__count_fixed_byte(SEXP str, SEXP pattern) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); STRI__ERROR_HANDLER_BEGIN R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_INTEGER, ret_tab[i] = 0) pattern_cont.setupMatcher(i, str_cont.get(i).c_str(), str_cont.get(i).length()); ret_tab[i] = 0; while (USEARCH_DONE != pattern_cont.findNext()) ++ret_tab[i]; } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ ) }
/** * Count pattern occurcess in a string [with collation] * * @param str character vector * @param pattern character vector * @param opts_collator passed to stri__ucol_open() * @return integer vector * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * corrected behavior on empty str/pattern * * @version 0.1-?? (Marek Gagolewski, 2013-06-23) * make StriException-friendly, * use StriContainerUStringSearch * * @version 0.2-3 (Marek Gagolewski, 2014-05-08) * new fun: stri_count_coll (opts_collator == NA not allowed) * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_count_coll(SEXP str, SEXP pattern, SEXP opts_collator) { PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = NULL; collator = stri__ucol_open(opts_collator); STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; STRI__PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_INTEGER, ret_tab[i] = 0) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; R_len_t found = 0; while (!U_FAILURE(status) && ((int)usearch_next(matcher, &status) != USEARCH_DONE)) ++found; STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) ret_tab[i] = found; } if (collator) { ucol_close(collator); collator=NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) }
/** * Count pattern occurcess in a string [with collation] * * @param str character vector * @param pattern character vector * @param collator_opts passed to stri__ucol_open(), * if \code{NA}, then \code{stri_detect_fixed_byte} is called * @return integer vector * * @version 0.1 (Marek Gagolewski) * @version 0.2 (Marek Gagolewski) - corrected behavior on empty str/pattern * @version 0.3 (Marek Gagolewski, 2013-06-23) make StriException-friendly, * use StriContainerUStringSearch */ SEXP stri_count_fixed(SEXP str, SEXP pattern, SEXP collator_opts) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); // call stri__ucol_open after prepare_arg: // if prepare_arg had failed, we would have a mem leak UCollator* collator = stri__ucol_open(collator_opts); if (!collator) return stri__count_fixed_byte(str, pattern); STRI__ERROR_HANDLER_BEGIN R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); StriContainerUTF16 str_cont(str, vectorize_length); StriContainerUStringSearch pattern_cont(pattern, vectorize_length, collator); // collator is not owned by pattern_cont SEXP ret; PROTECT(ret = Rf_allocVector(INTSXP, vectorize_length)); int* ret_tab = INTEGER(ret); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_INTEGER, ret_tab[i] = 0) UStringSearch *matcher = pattern_cont.getMatcher(i, str_cont.get(i)); usearch_reset(matcher); UErrorCode status = U_ZERO_ERROR; ret_tab[i] = 0; while (((int)usearch_next(matcher, &status) != USEARCH_DONE) && !U_FAILURE(status)) ++ret_tab[i]; if (U_FAILURE(status)) throw StriException(status); } if (collator) { ucol_close(collator); collator=NULL; } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END( if (collator) ucol_close(collator); ) }