bool t4p::FinderClass::FindNextRegularExpression(const UnicodeString& text, int32_t start) { if (U_SUCCESS(PatternErrorCode) && Pattern != NULL) { UnicodeString findText(text); if (start > 0 && start < text.length()) { findText.setTo(text, start); } else if (start > 0) { findText = UNICODE_STRING_SIMPLE(""); } int32_t foundPos = 0, length = 0, endPos = 0; UErrorCode error = U_ZERO_ERROR; RegexMatcher* matcher = Pattern->matcher(findText, error); if (U_SUCCESS(error) && matcher) { if (matcher->find()) { foundPos = matcher->start(error); endPos = matcher->end(error); if (U_SUCCESS(error) && U_SUCCESS(error)) { IsFound = true; length = endPos - foundPos; // end is the index after the match // if search was started from the middle of a string, // need to correct the found position LastPosition = start > 0 ? foundPos + start : foundPos; LastLength = length; } } delete matcher; } } return IsFound; }
/**
 * Detect if a pattern occurs in a string
 *
 * The arguments are recycled to a common length and a logical vector of that
 * length is returned. Elements for which the empty/NA check macro fires are
 * set to NA or FALSE by that macro; all other elements hold the result of
 * RegexMatcher::find() on the corresponding string.
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param opts_regex list
 * @return R logical vector
 *
 * @version 0.1 (Marcin Bujarski)
 * @version 0.2 (Marek Gagolewski) - use StriContainerUTF16
 * @version 0.3 (Marek Gagolewski) - use StriContainerUTF16's vectorization
 * @version 0.4 (Marek Gagolewski, 2013-06-18) use StriContainerRegexPattern + opts_regex
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP opts_regex)
{
   // Number of objects this function has pushed onto R's protection stack.
   // Declared before the error-handler block so that the clean-up statement
   // passed to STRI__ERROR_HANDLER_END can refer to it.
   int nprotect = 0;

   // The prepared arguments may be newly allocated R objects; they are
   // PROTECTed so that the allocations performed below cannot cause the
   // garbage collector to reclaim them.
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   ++nprotect;
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   ++nprotect;

   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
   // this will work for vectorize_length == 0:

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF16 str_cont(str, vectorize_length);
   // MG: tried StriContainerUTF8 + utext_openUTF8 - this was slower
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   ++nprotect;
   int* ret_tab = LOGICAL(ret);

   // NOTE: the loop index must stay named `i` and the loop must stay a `for`
   // loop - the macro below is invoked without the index and is expected to
   // `continue` to the next element.
   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
         ret_tab[i] = NA_LOGICAL,
         ret_tab[i] = FALSE)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));
      ret_tab[i] = static_cast<int>(matcher->find());
   }

   UNPROTECT(nprotect);
   return ret;
   STRI__ERROR_HANDLER_END(UNPROTECT(nprotect);/* rebalance the protection stack on error */)
}
/**
 * Detect if a pattern occurs in a string
 *
 * Walks the recycled (str, pattern) pairs and stores, for each pair, whether
 * the regular expression finds a match in the string (inverted when `negate`
 * is set). Once `max_count` reaches zero, every remaining element is set to
 * NA; a negative `max_count` is never decremented.
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param negate single bool
 * @param max_count single int
 * @param opts_regex list
 * @return R logical vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16's vectorization
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
 *          use StriContainerRegexPattern + opts_regex
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *          Issue #214: allow a regex pattern like `.*` to match an empty string
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *          FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *          #232: `max_count` arg added
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate, SEXP max_count, SEXP opts_regex)
{
   bool invert = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int remaining = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t n = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   uint32_t flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN(2)
   // A UTF-16 container is used on purpose: the UTF-8 container + utext_openUTF8
   // variant was measured to be slower
   // (mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s).
   StriContainerUTF16 str_cont(str, n);
   StriContainerRegexPattern pattern_cont(pattern, n, flags);

   SEXP result;
   STRI__PROTECT(result = Rf_allocVector(LGLSXP, n));
   int* out = LOGICAL(result);

   // The index is deliberately called `i` and the loop is a `for` loop: the
   // macro used in the body is invoked without the index and is expected to
   // `continue` to the next element.
   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (remaining == 0) {
         // the requested number of hits has been used up
         out[i] = NA_LOGICAL;
         continue;
      }

      STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont,
         out[i] = NA_LOGICAL)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));

      int detected = (int)matcher->find(); // find() returns UBool
      out[i] = invert ? !detected : detected;

      if (remaining > 0 && out[i])
         --remaining;
   }

   STRI__UNPROTECT_ALL
   return result;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Replaces every match of the search expression in `text` with
 * ReplaceExpression.
 *
 * An empty ReplaceExpression is allowed on purpose; it lets the user
 * 'delete' parts of a string.
 *
 * Nothing is done unless the finder has been prepared (IsPrepared). In EXACT
 * mode the expression is located with a literal substring search and `text`
 * is edited in place. Otherwise the compiled Pattern is used and the result
 * is built in a separate buffer; `text` is overwritten only if every
 * replacement succeeded, so a failing replacement (as reported by ICU through
 * the error code) leaves `text` untouched instead of truncated.
 *
 * @param text the string to modify in place
 * @return the number of replacements made; 0 when the finder is not prepared,
 *         nothing matched, or a regular expression replacement failed
 */
int t4p::FinderClass::ReplaceAllMatches(UnicodeString& text) const {
    int matches = 0;
    if (!IsPrepared) {
        return matches;
    }

    // private copy, so the replacement is unaffected by edits made to text
    const UnicodeString replacement(ReplaceExpression);

    // NOTE(review): a regular expression combined with an empty replacement
    // is searched for as a *literal* string here, not as a pattern. This
    // looks unintended, but is kept as-is - confirm against the callers.
    if (EXACT == Mode || (REGULAR_EXPRESSION == Mode && ReplaceExpression.isEmpty())) {
        int32_t pos = text.indexOf(Expression, 0);
        while (pos >= 0) {
            text.replaceBetween(pos, pos + Expression.length(), replacement);

            // resume after the inserted text so it is never searched again
            pos = text.indexOf(Expression, pos + replacement.length());
            ++matches;
        }
        return matches;
    }

    if (Pattern == NULL) {
        return matches;
    }

    UErrorCode error = U_ZERO_ERROR;
    RegexMatcher* matcher = Pattern->matcher(text, error);
    if (U_SUCCESS(error) && matcher != NULL) {
        UnicodeString dest(text.length(), ' ', 0);
        while (matcher->find()) {
            matcher->appendReplacement(dest, replacement, error);
            if (U_FAILURE(error)) {
                // dest is incomplete from here on; stop and discard it
                break;
            }
            ++matches;
        }
        if (U_SUCCESS(error)) {
            matcher->appendTail(dest);
            text = dest;
        } else {
            // text was not modified, so no replacement took effect
            matches = 0;
        }
    }

    // deleting NULL is a no-op, so no guard is needed
    delete matcher;
    return matches;
}
//------------------------------------------------------------------------------------------ // // main for ugrep // // Structurally, all use of the ICU Regular Expression API is in main(), // and all of the supporting stuff necessary to make a running program, but // not directly related to regular expressions, is factored out into these other // functions. // //------------------------------------------------------------------------------------------ int main(int argc, const char** argv) { UBool matchFound = FALSE; // // Process the commmand line options. // processOptions(argc, argv); // // Create a RegexPattern object from the user supplied pattern string. // UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure // in a status variable. UParseError parseErr; // In the event of a syntax error in the regex pattern, // this struct will contain the position of the // error. RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status); // Note that C++ is doing an automatic conversion // of the (char *) pattern to a temporary // UnicodeString object. if (U_FAILURE(status)) { fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n", u_errorName(status), parseErr.offset); exit(-1); } // // Create a RegexMatcher from the newly created pattern. // UnicodeString empty; RegexMatcher *matcher = rePat->matcher(empty, status); if (U_FAILURE(status)) { fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n", u_errorName(status)); exit(-1); } // // Loop, processing each of the input files. // for (int fileNum=firstFileNum; fileNum < argc; fileNum++) { readFile(argv[fileNum]); // // Loop through the lines of a file, trying to match the regex pattern on each. 
// for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) { UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart); matcher->reset(s); if (matcher->find()) { matchFound = TRUE; printMatch(); } } } // // Clean up // delete matcher; delete rePat; free(ucharBuf); free(charBuf); ucnv_close(outConverter); u_cleanup(); // shut down ICU, release any cached data it owns. return matchFound? 0: 1; }