/**
 * Detect if a pattern occurs in a string
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param opts_regex list
 *
 * @version 0.1 (Marcin Bujarski)
 * @version 0.2 (Marek Gagolewski) - use StriContainerUTF16
 * @version 0.3 (Marek Gagolewski) - use StriContainerUTF16's vectorization
 * @version 0.4 (Marek Gagolewski, 2013-06-18) use StriContainerRegexPattern + opts_regex
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP opts_regex)
{
   str = stri_prepare_arg_string(str, "str");
   pattern = stri_prepare_arg_string(pattern, "pattern");
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
   // this will work for vectorize_length == 0:

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN

   StriContainerUTF16 str_cont(str, vectorize_length);
   // MG: tried StriContainerUTF8 + utext_openUTF8 - this was slower
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, ret_tab[i] = FALSE)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));
      ret_tab[i] = (int)matcher->find();
   }

   UNPROTECT(1);
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Example #2
0
bool t4p::FinderClass::FindNextRegularExpression(const UnicodeString& text, int32_t start) {
    if (U_SUCCESS(PatternErrorCode) && Pattern != NULL) {
        UnicodeString findText(text);
        if (start > 0 && start < text.length()) {
            findText.setTo(text, start);
        } else if (start > 0) {
            findText = UNICODE_STRING_SIMPLE("");
        }
        int32_t foundPos = 0,
                length = 0,
                endPos = 0;
        UErrorCode error = U_ZERO_ERROR;
        RegexMatcher* matcher = Pattern->matcher(findText, error);
        if (U_SUCCESS(error) && matcher) {
            if (matcher->find()) {
                foundPos = matcher->start(error);
                endPos = matcher->end(error);
                if (U_SUCCESS(error) && U_SUCCESS(error)) {
                    IsFound = true;

                    length = endPos - foundPos;  // end is the index after the match

                    // if search was started from the middle of a string,
                    // need to correct the found position
                    LastPosition = start > 0 ? foundPos + start : foundPos;
                    LastLength = length;
                }
            }
            delete matcher;
        }
    }
    return IsFound;
}
Example #3
0
bool t4p::FinderClass::GetLastReplacementText(const UnicodeString& text, UnicodeString& replacementText) const {
    UBool matchFound = FALSE;
    if (IsFound && (LastPosition + LastLength) <= text.length()) {
        UnicodeString matchedText(text, LastPosition, LastLength);
        UnicodeString replaceWith = ReplaceExpression;
        UErrorCode error = U_ZERO_ERROR;
        RegexMatcher* matcher = NULL;
        switch (Mode) {
        case EXACT:
            matchFound = Expression == matchedText;
            if (matchFound) {
                replacementText = replaceWith;
            }
            break;
        case REGULAR_EXPRESSION:
            matcher = Pattern->matcher(matchedText, error);
            if (U_SUCCESS(error) && matcher && matcher->matches(error) && U_SUCCESS(error)) {
                replacementText = matcher->replaceFirst(replaceWith, error);
                matchFound = TRUE;
            }
            break;
        }
        if (matcher) {
            delete matcher;
        }
    }
    return matchFound == TRUE;
}
/**
 * Extract all capture groups of the first/last occurrence
 * of a regex pattern in each string
 *
 * @param str character vector
 * @param pattern character vector
 * @param opts_regex list
 * @param first logical - search for the first or the last occurrence?
 * @param cg_missing single string
 * @return character matrix
 *
 * @version 0.1-??? (Marek Gagolewski, 2013-06-22)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new arg: cg_missing
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *    Issue #214: allow a regex pattern like `.*`  to match an empty string
 */
SEXP stri__match_firstlast_regex(SEXP str, SEXP pattern, SEXP cg_missing, SEXP opts_regex, bool first)
{
   // @TODO: capture_groups arg (integer vector/set - which capture groups to extract)
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern")); // prepare string argument
   PROTECT(cg_missing = stri_prepare_arg_string_1(cg_missing, "cg_missing"));
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   UText* str_text = NULL; // may potentially be slower, but definitely is more convenient!
   STRI__ERROR_HANDLER_BEGIN(3)
   StriContainerUTF8 str_cont(str, vectorize_length);
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);
   StriContainerUTF8 cg_missing_cont(cg_missing, 1);
   STRI__PROTECT(cg_missing = STRING_ELT(cg_missing, 0));

   // we don't know how many capture groups are there:
   vector< vector< pair<const char*, const char*> > > occurrences(vectorize_length);
   R_len_t occurrences_max = 1;

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont, pattern_cont,
         /*do nothing*/;      )

      UErrorCode status = U_ZERO_ERROR;
      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      int pattern_cur_groups = matcher->groupCount();
      if (occurrences_max < pattern_cur_groups+1) occurrences_max=pattern_cur_groups+1;
      str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
Example #5
0
//---------------------------------------------------------------------
//
//   matcher(UnicodeString, err)
//
//---------------------------------------------------------------------
RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
                                    UErrorCode          &status)  const {
    RegexMatcher    *retMatcher = matcher(status);
    if (retMatcher != NULL) {
        retMatcher->reset(input);
    }
    return retMatcher;
};
Example #6
0
//
//   matcher, UText mode
//
RegexMatcher *RegexPattern::matcher(UText               *input,
                                    PatternIsUTextFlag  /*flag*/,
                                    UErrorCode          &status)  const {
    RegexMatcher    *retMatcher = matcher(status);
    if (retMatcher != NULL) {
        retMatcher->fDeferredStatus = status;
        retMatcher->reset(input);
    }
    return retMatcher;
}
/**
 * Detect if a pattern occurs in a string
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param negate single bool
 * @param max_count single int
 * @param opts_regex list
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16's vectorization
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
 *          use StriContainerRegexPattern + opts_regex
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *    Issue #214: allow a regex pattern like `.*`  to match an empty string
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate,
    SEXP max_count, SEXP opts_regex)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF16 str_cont(str, vectorize_length);
//   StriContainerUTF8 str_cont(str, vectorize_length); // utext_openUTF8, see below
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (max_count_1 == 0) {
          ret_tab[i] = NA_LOGICAL;
          continue;
      }

      STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont,
         pattern_cont, ret_tab[i] = NA_LOGICAL)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));
      ret_tab[i] = (int)matcher->find(); // returns UBool
      if (negate_1) ret_tab[i] = !ret_tab[i];
      if (max_count_1 > 0 && ret_tab[i]) --max_count_1;

//      // mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s
//      UText* str_text = NULL;
//      UErrorCode status = U_ZERO_ERROR;
//      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
//      str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
//      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
//      matcher->reset(str_text);
//      ret_tab[i] = (int)matcher->find(); // returns UBool
//      utext_close(str_text);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Example #8
0
//---------------------------------------------------------------------
//
//   matches        Convenience function to test for a match, starting
//                  with a pattern string and a data string.
//
//---------------------------------------------------------------------
UBool U_EXPORT2 RegexPattern::matches(const UnicodeString   &regex,
              const UnicodeString   &input,
                    UParseError     &pe,
                    UErrorCode      &status) {

    if (U_FAILURE(status)) {return FALSE;}

    UBool         retVal;
    RegexPattern *pat     = NULL;
    RegexMatcher *matcher = NULL;

    pat     = RegexPattern::compile(regex, 0, pe, status);
    matcher = pat->matcher(input, status);
    retVal  = matcher->matches(status);

    delete matcher;
    delete pat;
    return retVal;
}
Example #9
0
//
//   matches, UText mode
//
UBool U_EXPORT2 RegexPattern::matches(UText                *regex,
                    UText           *input,
                    UParseError     &pe,
                    UErrorCode      &status) {

    if (U_FAILURE(status)) {return FALSE;}

    UBool         retVal;
    RegexPattern *pat     = NULL;
    RegexMatcher *matcher = NULL;

    pat     = RegexPattern::compile(regex, 0, pe, status);
    matcher = pat->matcher(input, PATTERN_IS_UTEXT, status);
    retVal  = matcher->matches(status);

    delete matcher;
    delete pat;
    return retVal;
}
Example #10
0
//
//   matches, UText mode
//
UBool U_EXPORT2 RegexPattern::matches(UText                *regex,
                    UText           *input,
                    UParseError     &pe,
                    UErrorCode      &status) {

    if (U_FAILURE(status)) {return FALSE;}

    UBool         retVal  = FALSE;
    RegexPattern *pat     = NULL;
    RegexMatcher *matcher = NULL;

    pat     = RegexPattern::compile(regex, 0, pe, status);
    matcher = pat->matcher(status);
    if (U_SUCCESS(status)) {
        matcher->reset(input);
        retVal  = matcher->matches(status);
    }

    delete matcher;
    delete pat;
    return retVal;
}
Example #11
0
int t4p::FinderClass::ReplaceAllMatches(UnicodeString& text) const {
    int matches = 0;

    // no check for ReplaceExpression.isEmpty() allow for empty replacements
    // this allows the user to 'delete' parts of a strin
    if (IsPrepared) {
        UnicodeString replacement = ReplaceExpression;
        RegexMatcher* matcher = NULL;
        UErrorCode error = U_ZERO_ERROR;
        UnicodeString dest(text.length(), ' ', 0);
        int32_t pos = 0;
        if (EXACT == Mode || (REGULAR_EXPRESSION == Mode && ReplaceExpression.isEmpty())) {
            pos = text.indexOf(Expression, 0);
            while (pos >= 0) {
                text.replaceBetween(pos, pos + Expression.length(), replacement);
                pos = text.indexOf(Expression, pos + replacement.length());
                ++matches;
            }
        } else {
            matcher = Pattern->matcher(text, error);
            if (U_SUCCESS(error) && matcher) {
                while (matcher->find()) {
                    if (U_SUCCESS(error)) {
                        matcher->appendReplacement(dest, replacement, error);
                        if (U_SUCCESS(error)) {
                            ++matches;
                        }
                    }
                }
                matcher->appendTail(dest);
                text = dest;
            }
        }
        if (matcher) {
            delete matcher;
        }
    }
    return matches;
}
Example #12
0
void DecimalFormatTest::DataDrivenTests() {
    char tdd[2048];
    const char *srcPath;
    UErrorCode  status  = U_ZERO_ERROR;
    int32_t     lineNum = 0;

    //
    //  Open and read the test data file.
    //
    srcPath=getPath(tdd, "dcfmtest.txt");
    if(srcPath==NULL) {
        return; /* something went wrong, error already output */
    }

    int32_t    len;
    UChar *testData = ReadAndConvertFile(srcPath, len, status);
    if (U_FAILURE(status)) {
        return; /* something went wrong, error already output */
    }

    //
    //  Put the test data into a UnicodeString
    //
    UnicodeString testString(FALSE, testData, len);

    RegexMatcher    parseLineMat(UnicodeString(
            "(?i)\\s*parse\\s+"
            "\"([^\"]*)\"\\s+"           // Capture group 1: input text
            "([ild])\\s+"                // Capture group 2: expected parsed type
            "\"([^\"]*)\"\\s+"           // Capture group 3: expected parsed decimal
            "\\s*(?:#.*)?"),             // Trailing comment
         0, status);

    RegexMatcher    formatLineMat(UnicodeString(
            "(?i)\\s*format\\s+"
            "(\\S+)\\s+"                 // Capture group 1: pattern
            "(ceiling|floor|down|up|halfeven|halfdown|halfup|default)\\s+"  // Capture group 2: Rounding Mode
            "\"([^\"]*)\"\\s+"           // Capture group 3: input
            "\"([^\"]*)\""           // Capture group 4: expected output
            "\\s*(?:#.*)?"),             // Trailing comment
         0, status);

    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(?m)^(.*?)$"), testString, 0, status);

    if (U_FAILURE(status)){
        dataerrln("Construct RegexMatcher() error.");
        delete [] testData;
        return;
    }

    //
    //  Loop over the test data file, once per line.
    //
    while (lineMat.find()) {
        lineNum++;
        if (U_FAILURE(status)) {
            errln("File dcfmtest.txt, line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
        }

        status = U_ZERO_ERROR;
        UnicodeString testLine = lineMat.group(1, status);
        // printf("%s\n", UnicodeStringPiece(testLine).data());
        if (testLine.length() == 0) {
            continue;
        }

        //
        // Parse the test line.  Skip blank and comment only lines.
        // Separate out the three main fields - pattern, flags, target.
        //

        commentMat.reset(testLine);
        if (commentMat.lookingAt(status)) {
            // This line is a comment, or blank.
            continue;
        }


        //
        //  Handle "parse" test case line from file
        //
        parseLineMat.reset(testLine);
        if (parseLineMat.lookingAt(status)) {
            execParseTest(lineNum,
                          parseLineMat.group(1, status),    // input
                          parseLineMat.group(2, status),    // Expected Type
                          parseLineMat.group(3, status),    // Expected Decimal String
                          status
                          );
            continue;
        }

        //
        //  Handle "format" test case line
        //
        formatLineMat.reset(testLine);
        if (formatLineMat.lookingAt(status)) {
            execFormatTest(lineNum,
                           formatLineMat.group(1, status),    // Pattern
                           formatLineMat.group(2, status),    // rounding mode
                           formatLineMat.group(3, status),    // input decimal number
                           formatLineMat.group(4, status),    // expected formatted result
                           status);
            continue;
        }

        //
        //  Line is not a recognizable test case.
        //
        errln("Badly formed test case at line %d.\n%s\n", 
             lineNum, UnicodeStringPiece(testLine).data());

    }

    delete [] testData;
}
Example #13
0
//------------------------------------------------------------------------------------------
//
//   main          for ugrep
//
//           Structurally, all use of the ICU Regular Expression API is in main(),
//           and all of the supporting stuff necessary to make a running program, but
//           not directly related to regular expressions, is factored out into these other
//           functions.
//
//------------------------------------------------------------------------------------------
int main(int argc, const char** argv) {
    UBool     matchFound = FALSE;

    //
    //  Process the commmand line options.
    //
    processOptions(argc, argv);

    //
    // Create a RegexPattern object from the user supplied pattern string.
    //
    UErrorCode status = U_ZERO_ERROR;   // All ICU operations report success or failure
                                        //   in a status variable.

    UParseError    parseErr;            // In the event of a syntax error in the regex pattern,
                                        //   this struct will contain the position of the
                                        //   error.

    RegexPattern  *rePat = RegexPattern::compile(pattern, parseErr, status);
                                        // Note that C++ is doing an automatic conversion
                                        //  of the (char *) pattern to a temporary
                                        //  UnicodeString object.
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep:  error in pattern: \"%s\" at position %d\n",
            u_errorName(status), parseErr.offset);
        exit(-1);
    }

    //
    // Create a RegexMatcher from the newly created pattern.
    //
    UnicodeString empty;
    RegexMatcher *matcher = rePat->matcher(empty, status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep:  error in creating RegexMatcher: \"%s\"\n",
            u_errorName(status));
        exit(-1);
    }

    //
    // Loop, processing each of the input files.
    //
    for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
        readFile(argv[fileNum]);

        //
        //  Loop through the lines of a file, trying to match the regex pattern on each.
        //
        for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
            UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
            matcher->reset(s);
            if (matcher->find()) {
                matchFound = TRUE;
                printMatch();
            }
        }
    }

    //
    //  Clean up
    //
    delete matcher;
    delete rePat;
    free(ucharBuf);
    free(charBuf);
    ucnv_close(outConverter);
    
    u_cleanup();       // shut down ICU, release any cached data it owns.

    return matchFound? 0: 1;
}