/** * Extract all capture groups of the first/last occurrence * of a regex pattern in each string * * @param str character vector * @param pattern character vector * @param opts_regex list * @param first logical - search for the first or the last occurrence? * @param cg_missing single string * @return character matrix * * @version 0.1-??? (Marek Gagolewski, 2013-06-22) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-06) * new arg: cg_missing * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * Issue #214: allow a regex pattern like `.*` to match an empty string */ SEXP stri__match_firstlast_regex(SEXP str, SEXP pattern, SEXP cg_missing, SEXP opts_regex, bool first) { // @TODO: capture_groups arg (integer vector/set - which capture groups to extract) PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); // prepare string argument PROTECT(cg_missing = stri_prepare_arg_string_1(cg_missing, "cg_missing")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); StriContainerUTF8 cg_missing_cont(cg_missing, 1); STRI__PROTECT(cg_missing = STRING_ELT(cg_missing, 0)); // we don't know how many capture groups are there: vector< vector< pair<const char*, const char*> > > occurrences(vectorize_length); R_len_t occurrences_max = 1; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, /*do nothing*/; ) UErrorCode status = U_ZERO_ERROR; RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically int pattern_cur_groups = matcher->groupCount(); if (occurrences_max < pattern_cur_groups+1) occurrences_max=pattern_cur_groups+1; str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})