/**
 * Detect if a pattern occurs in a string
 *
 * For every (recycled) pair of elements of `str` and `pattern`, the result
 * holds the outcome of RegexMatcher::find() on that string; elements skipped
 * by STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN are set to NA_LOGICAL or FALSE
 * as passed to that macro below.
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param opts_regex list
 * @return R logical vector of length given by stri__recycling_rule
 *
 * @version 0.1 (Marcin Bujarski)
 * @version 0.2 (Marek Gagolewski) - use StriContainerUTF16
 * @version 0.3 (Marek Gagolewski) - use StriContainerUTF16's vectorization
 * @version 0.4 (Marek Gagolewski, 2013-06-18) use StriContainerRegexPattern + opts_regex
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP opts_regex)
{
   // The prepared arguments may be freshly created R objects, so they have to
   // be PROTECTed from the garbage collector before anything else allocates.
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
   // this will work for vectorize_length == 0:

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   // NOTE(review): no explicit UNPROTECT is done on the error path; this
   // assumes failures are ultimately reported as an R error, which makes R
   // unwind the protection stack by itself -- confirm against the macro.
   STRI__ERROR_HANDLER_BEGIN
   StriContainerUTF16 str_cont(str, vectorize_length); // MG: tried StriContainerUTF8 + utext_openUTF8 - this was slower
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   // NOTE: the loop index is deliberately called `i`; the CONTINUE_ON macro
   // may refer to it by name.
   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont,
         ret_tab[i] = NA_LOGICAL,
         ret_tab[i] = FALSE)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));
      ret_tab[i] = (int)matcher->find();
   }

   UNPROTECT(3); // str, pattern, ret
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
//--------------------------------------------------------------------- // // matcher(UnicodeString, err) // //--------------------------------------------------------------------- RegexMatcher *RegexPattern::matcher(const UnicodeString &input, UErrorCode &status) const { RegexMatcher *retMatcher = matcher(status); if (retMatcher != NULL) { retMatcher->reset(input); } return retMatcher; };
// // matcher, UText mode // RegexMatcher *RegexPattern::matcher(UText *input, PatternIsUTextFlag /*flag*/, UErrorCode &status) const { RegexMatcher *retMatcher = matcher(status); if (retMatcher != NULL) { retMatcher->fDeferredStatus = status; retMatcher->reset(input); } return retMatcher; }
/**
 * Detect if a pattern occurs in a string
 *
 * Each result element is the outcome of RegexMatcher::find() for the
 * corresponding (recycled) string/pattern pair, inverted when `negate`
 * is set. Once `max_count` TRUE results have been produced (for a positive
 * `max_count`), or immediately when `max_count` is 0, the remaining
 * elements are set to NA.
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param negate single bool
 * @param max_count single int
 * @param opts_regex list
 * @return R logical vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16's vectorization
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
 *          use StriContainerRegexPattern + opts_regex
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *    Issue #214: allow a regex pattern like `.*` to match an empty string
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate, SEXP max_count, SEXP opts_regex)
{
   bool invert = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int hits_left = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN(2)
   // UTF-16 input is used on purpose; a UTF-8 + UText variant was benchmarked
   // and found slower (see the note at the end of the loop body).
   StriContainerUTF16 str_cont(str, vectorize_length);
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP result;
   STRI__PROTECT(result = Rf_allocVector(LGLSXP, vectorize_length));
   int* out = LOGICAL(result);

   // NOTE: the loop index is deliberately called `i`; the CONTINUE_ON macro
   // may refer to it by name.
   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      // Budget of TRUE results exhausted (or zero from the start): emit NA.
      if (hits_left == 0) {
         out[i] = NA_LOGICAL;
         continue;
      }

      STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont,
         out[i] = NA_LOGICAL)

      RegexMatcher *m = pattern_cont.getMatcher(i); // will be deleted automatically
      m->reset(str_cont.get(i));

      int hit = static_cast<int>(m->find()); // find() returns UBool
      if (invert) {
         hit = !hit;
      }
      out[i] = hit;

      // A negative max_count means "no limit": only positive budgets count down.
      if (hit && hits_left > 0) {
         --hits_left;
      }

      // Alternative that was tried and rejected for speed
      // (mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s):
      //   StriContainerUTF8 str_cont(str, vectorize_length);
      //   UText* str_text = NULL;
      //   UErrorCode status = U_ZERO_ERROR;
      //   RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      //   str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
      //   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      //   matcher->reset(str_text);
      //   ret_tab[i] = (int)matcher->find(); // returns UBool
      //   utext_close(str_text);
   }

   STRI__UNPROTECT_ALL
   return result;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
// // matches, UText mode // UBool U_EXPORT2 RegexPattern::matches(UText *regex, UText *input, UParseError &pe, UErrorCode &status) { if (U_FAILURE(status)) {return FALSE;} UBool retVal = FALSE; RegexPattern *pat = NULL; RegexMatcher *matcher = NULL; pat = RegexPattern::compile(regex, 0, pe, status); matcher = pat->matcher(status); if (U_SUCCESS(status)) { matcher->reset(input); retVal = matcher->matches(status); } delete matcher; delete pat; return retVal; }
void DecimalFormatTest::DataDrivenTests() { char tdd[2048]; const char *srcPath; UErrorCode status = U_ZERO_ERROR; int32_t lineNum = 0; // // Open and read the test data file. // srcPath=getPath(tdd, "dcfmtest.txt"); if(srcPath==NULL) { return; /* something went wrong, error already output */ } int32_t len; UChar *testData = ReadAndConvertFile(srcPath, len, status); if (U_FAILURE(status)) { return; /* something went wrong, error already output */ } // // Put the test data into a UnicodeString // UnicodeString testString(FALSE, testData, len); RegexMatcher parseLineMat(UnicodeString( "(?i)\\s*parse\\s+" "\"([^\"]*)\"\\s+" // Capture group 1: input text "([ild])\\s+" // Capture group 2: expected parsed type "\"([^\"]*)\"\\s+" // Capture group 3: expected parsed decimal "\\s*(?:#.*)?"), // Trailing comment 0, status); RegexMatcher formatLineMat(UnicodeString( "(?i)\\s*format\\s+" "(\\S+)\\s+" // Capture group 1: pattern "(ceiling|floor|down|up|halfeven|halfdown|halfup|default)\\s+" // Capture group 2: Rounding Mode "\"([^\"]*)\"\\s+" // Capture group 3: input "\"([^\"]*)\"" // Capture group 4: expected output "\\s*(?:#.*)?"), // Trailing comment 0, status); RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(?m)^(.*?)$"), testString, 0, status); if (U_FAILURE(status)){ dataerrln("Construct RegexMatcher() error."); delete [] testData; return; } // // Loop over the test data file, once per line. // while (lineMat.find()) { lineNum++; if (U_FAILURE(status)) { errln("File dcfmtest.txt, line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); } status = U_ZERO_ERROR; UnicodeString testLine = lineMat.group(1, status); // printf("%s\n", UnicodeStringPiece(testLine).data()); if (testLine.length() == 0) { continue; } // // Parse the test line. Skip blank and comment only lines. // Separate out the three main fields - pattern, flags, target. 
// commentMat.reset(testLine); if (commentMat.lookingAt(status)) { // This line is a comment, or blank. continue; } // // Handle "parse" test case line from file // parseLineMat.reset(testLine); if (parseLineMat.lookingAt(status)) { execParseTest(lineNum, parseLineMat.group(1, status), // input parseLineMat.group(2, status), // Expected Type parseLineMat.group(3, status), // Expected Decimal String status ); continue; } // // Handle "format" test case line // formatLineMat.reset(testLine); if (formatLineMat.lookingAt(status)) { execFormatTest(lineNum, formatLineMat.group(1, status), // Pattern formatLineMat.group(2, status), // rounding mode formatLineMat.group(3, status), // input decimal number formatLineMat.group(4, status), // expected formatted result status); continue; } // // Line is not a recognizable test case. // errln("Badly formed test case at line %d.\n%s\n", lineNum, UnicodeStringPiece(testLine).data()); } delete [] testData; }
//------------------------------------------------------------------------------------------ // // main for ugrep // // Structurally, all use of the ICU Regular Expression API is in main(), // and all of the supporting stuff necessary to make a running program, but // not directly related to regular expressions, is factored out into these other // functions. // //------------------------------------------------------------------------------------------ int main(int argc, const char** argv) { UBool matchFound = FALSE; // // Process the commmand line options. // processOptions(argc, argv); // // Create a RegexPattern object from the user supplied pattern string. // UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure // in a status variable. UParseError parseErr; // In the event of a syntax error in the regex pattern, // this struct will contain the position of the // error. RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status); // Note that C++ is doing an automatic conversion // of the (char *) pattern to a temporary // UnicodeString object. if (U_FAILURE(status)) { fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n", u_errorName(status), parseErr.offset); exit(-1); } // // Create a RegexMatcher from the newly created pattern. // UnicodeString empty; RegexMatcher *matcher = rePat->matcher(empty, status); if (U_FAILURE(status)) { fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n", u_errorName(status)); exit(-1); } // // Loop, processing each of the input files. // for (int fileNum=firstFileNum; fileNum < argc; fileNum++) { readFile(argv[fileNum]); // // Loop through the lines of a file, trying to match the regex pattern on each. 
// for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) { UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart); matcher->reset(s); if (matcher->find()) { matchFound = TRUE; printMatch(); } } } // // Clean up // delete matcher; delete rePat; free(ucharBuf); free(charBuf); ucnv_close(outConverter); u_cleanup(); // shut down ICU, release any cached data it owns. return matchFound? 0: 1; }