/**
 * Detect if a pattern occurs in a string
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param opts_regex list
 * @return logical vector whose length is given by the recycling rule
 *         applied to the lengths of str and pattern
 *
 * @version 0.1 (Marcin Bujarski)
 * @version 0.2 (Marek Gagolewski) - use StriContainerUTF16
 * @version 0.3 (Marek Gagolewski) - use StriContainerUTF16's vectorization
 * @version 0.4 (Marek Gagolewski, 2013-06-18) use StriContainerRegexPattern + opts_regex
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP opts_regex)
{
   // The prepared arguments may be newly created R objects; keep them
   // PROTECTed so that subsequent allocations cannot trigger their collection.
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   // this will work for vectorize_length == 0:
   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   // NOTE(review): assumes the error handler finishes by raising an R error
   // (which unwinds the protection stack) -- confirm against the macro definition.
   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF16 str_cont(str, vectorize_length);
   // MG: tried StriContainerUTF8 + utext_openUTF8 - this was slower
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      // handles the NA / empty cases: stores NA_LOGICAL or FALSE and moves on
      STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
         ret_tab[i] = NA_LOGICAL,
         ret_tab[i] = FALSE)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));
      ret_tab[i] = (int)matcher->find();
   }

   UNPROTECT(3); // str, pattern, ret
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
bool t4p::FinderClass::FindNextRegularExpression(const UnicodeString& text, int32_t start) { if (U_SUCCESS(PatternErrorCode) && Pattern != NULL) { UnicodeString findText(text); if (start > 0 && start < text.length()) { findText.setTo(text, start); } else if (start > 0) { findText = UNICODE_STRING_SIMPLE(""); } int32_t foundPos = 0, length = 0, endPos = 0; UErrorCode error = U_ZERO_ERROR; RegexMatcher* matcher = Pattern->matcher(findText, error); if (U_SUCCESS(error) && matcher) { if (matcher->find()) { foundPos = matcher->start(error); endPos = matcher->end(error); if (U_SUCCESS(error) && U_SUCCESS(error)) { IsFound = true; length = endPos - foundPos; // end is the index after the match // if search was started from the middle of a string, // need to correct the found position LastPosition = start > 0 ? foundPos + start : foundPos; LastLength = length; } } delete matcher; } } return IsFound; }
bool t4p::FinderClass::GetLastReplacementText(const UnicodeString& text, UnicodeString& replacementText) const { UBool matchFound = FALSE; if (IsFound && (LastPosition + LastLength) <= text.length()) { UnicodeString matchedText(text, LastPosition, LastLength); UnicodeString replaceWith = ReplaceExpression; UErrorCode error = U_ZERO_ERROR; RegexMatcher* matcher = NULL; switch (Mode) { case EXACT: matchFound = Expression == matchedText; if (matchFound) { replacementText = replaceWith; } break; case REGULAR_EXPRESSION: matcher = Pattern->matcher(matchedText, error); if (U_SUCCESS(error) && matcher && matcher->matches(error) && U_SUCCESS(error)) { replacementText = matcher->replaceFirst(replaceWith, error); matchFound = TRUE; } break; } if (matcher) { delete matcher; } } return matchFound == TRUE; }
/** * Extract all capture groups of the first/last occurrence * of a regex pattern in each string * * @param str character vector * @param pattern character vector * @param opts_regex list * @param first logical - search for the first or the last occurrence? * @param cg_missing single string * @return character matrix * * @version 0.1-??? (Marek Gagolewski, 2013-06-22) * * @version 0.3-1 (Marek Gagolewski, 2014-11-05) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-06) * new arg: cg_missing * * @version 1.0-2 (Marek Gagolewski, 2016-01-29) * Issue #214: allow a regex pattern like `.*` to match an empty string */ SEXP stri__match_firstlast_regex(SEXP str, SEXP pattern, SEXP cg_missing, SEXP opts_regex, bool first) { // @TODO: capture_groups arg (integer vector/set - which capture groups to extract) PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); // prepare string argument PROTECT(cg_missing = stri_prepare_arg_string_1(cg_missing, "cg_missing")); R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern)); uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! 
STRI__ERROR_HANDLER_BEGIN(3) StriContainerUTF8 str_cont(str, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); StriContainerUTF8 cg_missing_cont(cg_missing, 1); STRI__PROTECT(cg_missing = STRING_ELT(cg_missing, 0)); // we don't know how many capture groups are there: vector< vector< pair<const char*, const char*> > > occurrences(vectorize_length); R_len_t occurrences_max = 1; for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont, /*do nothing*/; ) UErrorCode status = U_ZERO_ERROR; RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically int pattern_cur_groups = matcher->groupCount(); if (occurrences_max < pattern_cur_groups+1) occurrences_max=pattern_cur_groups+1; str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
//--------------------------------------------------------------------- // // matcher(UnicodeString, err) // //--------------------------------------------------------------------- RegexMatcher *RegexPattern::matcher(const UnicodeString &input, UErrorCode &status) const { RegexMatcher *retMatcher = matcher(status); if (retMatcher != NULL) { retMatcher->reset(input); } return retMatcher; };
// // matcher, UText mode // RegexMatcher *RegexPattern::matcher(UText *input, PatternIsUTextFlag /*flag*/, UErrorCode &status) const { RegexMatcher *retMatcher = matcher(status); if (retMatcher != NULL) { retMatcher->fDeferredStatus = status; retMatcher->reset(input); } return retMatcher; }
/**
 * Detect if a pattern occurs in a string
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param negate single bool
 * @param max_count single int
 * @param opts_regex list
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16's vectorization
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
 *          use StriContainerRegexPattern + opts_regex
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *    Issue #214: allow a regex pattern like `.*` to match an empty string
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate, SEXP max_count, SEXP opts_regex)
{
   bool invert = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int hits_left = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t result_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   uint32_t regex_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN(2)
   // A UTF-8 container combined with utext_openUTF8 was benchmarked as well
   // (mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s),
   // hence the UTF-16 container is used here.
   StriContainerUTF16 str_cont(str, result_length);
   StriContainerRegexPattern pattern_cont(pattern, result_length, regex_flags);

   SEXP result;
   STRI__PROTECT(result = Rf_allocVector(LGLSXP, result_length));
   int* out = LOGICAL(result);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      // the allowed number of positive results has been used up:
      // every remaining element becomes NA
      if (hits_left == 0) {
         out[i] = NA_LOGICAL;
         continue;
      }

      STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont,
         out[i] = NA_LOGICAL)

      RegexMatcher *cur_matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      cur_matcher->reset(str_cont.get(i));

      int detected = (int)cur_matcher->find(); // returns UBool
      out[i] = invert ? !detected : detected;

      // a negative max_count never reaches zero here, i.e., imposes no limit
      if (hits_left > 0 && out[i]) {
         --hits_left;
      }
   }

   STRI__UNPROTECT_ALL
   return result;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
//--------------------------------------------------------------------- // // matches Convenience function to test for a match, starting // with a pattern string and a data string. // //--------------------------------------------------------------------- UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex, const UnicodeString &input, UParseError &pe, UErrorCode &status) { if (U_FAILURE(status)) {return FALSE;} UBool retVal; RegexPattern *pat = NULL; RegexMatcher *matcher = NULL; pat = RegexPattern::compile(regex, 0, pe, status); matcher = pat->matcher(input, status); retVal = matcher->matches(status); delete matcher; delete pat; return retVal; }
// // matches, UText mode // UBool U_EXPORT2 RegexPattern::matches(UText *regex, UText *input, UParseError &pe, UErrorCode &status) { if (U_FAILURE(status)) {return FALSE;} UBool retVal; RegexPattern *pat = NULL; RegexMatcher *matcher = NULL; pat = RegexPattern::compile(regex, 0, pe, status); matcher = pat->matcher(input, PATTERN_IS_UTEXT, status); retVal = matcher->matches(status); delete matcher; delete pat; return retVal; }
// // matches, UText mode // UBool U_EXPORT2 RegexPattern::matches(UText *regex, UText *input, UParseError &pe, UErrorCode &status) { if (U_FAILURE(status)) {return FALSE;} UBool retVal = FALSE; RegexPattern *pat = NULL; RegexMatcher *matcher = NULL; pat = RegexPattern::compile(regex, 0, pe, status); matcher = pat->matcher(status); if (U_SUCCESS(status)) { matcher->reset(input); retVal = matcher->matches(status); } delete matcher; delete pat; return retVal; }
/**
 * Replaces every occurrence of the expression in text with the replace
 * expression, modifying text in place.
 *
 * An empty replace expression is deliberately allowed: it lets the user
 * 'delete' parts of a string.
 *
 * @param text the string to modify
 * @return the number of replacements counted; 0 when the finder is not prepared
 */
int t4p::FinderClass::ReplaceAllMatches(UnicodeString& text) const {
	if (!IsPrepared) {
		return 0;
	}
	int replacedCount = 0;
	UnicodeString substitute = ReplaceExpression;

	// exact mode, as well as regular expression mode with an empty
	// replacement, look for the expression text itself
	bool searchExpressionText = (EXACT == Mode) ||
		(REGULAR_EXPRESSION == Mode && ReplaceExpression.isEmpty());
	if (searchExpressionText) {
		for (int32_t hit = text.indexOf(Expression, 0);
				hit >= 0;
				hit = text.indexOf(Expression, hit + substitute.length())) {
			text.replaceBetween(hit, hit + Expression.length(), substitute);
			++replacedCount;
		}
		return replacedCount;
	}

	UErrorCode status = U_ZERO_ERROR;
	RegexMatcher* regexMatcher = Pattern->matcher(text, status);
	if (U_SUCCESS(status) && regexMatcher) {
		UnicodeString rebuilt(text.length(), ' ', 0);
		while (regexMatcher->find()) {
			// once a replacement has failed, the remaining matches are skipped
			if (U_FAILURE(status)) {
				continue;
			}
			regexMatcher->appendReplacement(rebuilt, substitute, status);
			if (U_SUCCESS(status)) {
				++replacedCount;
			}
		}
		regexMatcher->appendTail(rebuilt);
		text = rebuilt;
	}
	delete regexMatcher;
	return replacedCount;
}
void DecimalFormatTest::DataDrivenTests() { char tdd[2048]; const char *srcPath; UErrorCode status = U_ZERO_ERROR; int32_t lineNum = 0; // // Open and read the test data file. // srcPath=getPath(tdd, "dcfmtest.txt"); if(srcPath==NULL) { return; /* something went wrong, error already output */ } int32_t len; UChar *testData = ReadAndConvertFile(srcPath, len, status); if (U_FAILURE(status)) { return; /* something went wrong, error already output */ } // // Put the test data into a UnicodeString // UnicodeString testString(FALSE, testData, len); RegexMatcher parseLineMat(UnicodeString( "(?i)\\s*parse\\s+" "\"([^\"]*)\"\\s+" // Capture group 1: input text "([ild])\\s+" // Capture group 2: expected parsed type "\"([^\"]*)\"\\s+" // Capture group 3: expected parsed decimal "\\s*(?:#.*)?"), // Trailing comment 0, status); RegexMatcher formatLineMat(UnicodeString( "(?i)\\s*format\\s+" "(\\S+)\\s+" // Capture group 1: pattern "(ceiling|floor|down|up|halfeven|halfdown|halfup|default)\\s+" // Capture group 2: Rounding Mode "\"([^\"]*)\"\\s+" // Capture group 3: input "\"([^\"]*)\"" // Capture group 4: expected output "\\s*(?:#.*)?"), // Trailing comment 0, status); RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(?m)^(.*?)$"), testString, 0, status); if (U_FAILURE(status)){ dataerrln("Construct RegexMatcher() error."); delete [] testData; return; } // // Loop over the test data file, once per line. // while (lineMat.find()) { lineNum++; if (U_FAILURE(status)) { errln("File dcfmtest.txt, line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); } status = U_ZERO_ERROR; UnicodeString testLine = lineMat.group(1, status); // printf("%s\n", UnicodeStringPiece(testLine).data()); if (testLine.length() == 0) { continue; } // // Parse the test line. Skip blank and comment only lines. // Separate out the three main fields - pattern, flags, target. 
// commentMat.reset(testLine); if (commentMat.lookingAt(status)) { // This line is a comment, or blank. continue; } // // Handle "parse" test case line from file // parseLineMat.reset(testLine); if (parseLineMat.lookingAt(status)) { execParseTest(lineNum, parseLineMat.group(1, status), // input parseLineMat.group(2, status), // Expected Type parseLineMat.group(3, status), // Expected Decimal String status ); continue; } // // Handle "format" test case line // formatLineMat.reset(testLine); if (formatLineMat.lookingAt(status)) { execFormatTest(lineNum, formatLineMat.group(1, status), // Pattern formatLineMat.group(2, status), // rounding mode formatLineMat.group(3, status), // input decimal number formatLineMat.group(4, status), // expected formatted result status); continue; } // // Line is not a recognizable test case. // errln("Badly formed test case at line %d.\n%s\n", lineNum, UnicodeStringPiece(testLine).data()); } delete [] testData; }
//------------------------------------------------------------------------------------------ // // main for ugrep // // Structurally, all use of the ICU Regular Expression API is in main(), // and all of the supporting stuff necessary to make a running program, but // not directly related to regular expressions, is factored out into these other // functions. // //------------------------------------------------------------------------------------------ int main(int argc, const char** argv) { UBool matchFound = FALSE; // // Process the commmand line options. // processOptions(argc, argv); // // Create a RegexPattern object from the user supplied pattern string. // UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure // in a status variable. UParseError parseErr; // In the event of a syntax error in the regex pattern, // this struct will contain the position of the // error. RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status); // Note that C++ is doing an automatic conversion // of the (char *) pattern to a temporary // UnicodeString object. if (U_FAILURE(status)) { fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n", u_errorName(status), parseErr.offset); exit(-1); } // // Create a RegexMatcher from the newly created pattern. // UnicodeString empty; RegexMatcher *matcher = rePat->matcher(empty, status); if (U_FAILURE(status)) { fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n", u_errorName(status)); exit(-1); } // // Loop, processing each of the input files. // for (int fileNum=firstFileNum; fileNum < argc; fileNum++) { readFile(argv[fileNum]); // // Loop through the lines of a file, trying to match the regex pattern on each. 
// for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) { UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart); matcher->reset(s); if (matcher->find()) { matchFound = TRUE; printMatch(); } } } // // Clean up // delete matcher; delete rePat; free(ucharBuf); free(charBuf); ucnv_close(outConverter); u_cleanup(); // shut down ICU, release any cached data it owns. return matchFound? 0: 1; }