Example #1
0
bool t4p::FinderClass::FindNextRegularExpression(const UnicodeString& text, int32_t start) {
    if (U_SUCCESS(PatternErrorCode) && Pattern != NULL) {
        UnicodeString findText(text);
        if (start > 0 && start < text.length()) {
            findText.setTo(text, start);
        } else if (start > 0) {
            findText = UNICODE_STRING_SIMPLE("");
        }
        int32_t foundPos = 0,
                length = 0,
                endPos = 0;
        UErrorCode error = U_ZERO_ERROR;
        RegexMatcher* matcher = Pattern->matcher(findText, error);
        if (U_SUCCESS(error) && matcher) {
            if (matcher->find()) {
                foundPos = matcher->start(error);
                endPos = matcher->end(error);
                if (U_SUCCESS(error) && U_SUCCESS(error)) {
                    IsFound = true;

                    length = endPos - foundPos;  // end is the index after the match

                    // if search was started from the middle of a string,
                    // need to correct the found position
                    LastPosition = start > 0 ? foundPos + start : foundPos;
                    LastLength = length;
                }
            }
            delete matcher;
        }
    }
    return IsFound;
}
/**
 * Detect if a pattern occurs in a string
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param opts_regex list
 *
 * @version 0.1 (Marcin Bujarski)
 * @version 0.2 (Marek Gagolewski) - use StriContainerUTF16
 * @version 0.3 (Marek Gagolewski) - use StriContainerUTF16's vectorization
 * @version 0.4 (Marek Gagolewski, 2013-06-18) use StriContainerRegexPattern + opts_regex
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP opts_regex)
{
   str = stri_prepare_arg_string(str, "str");
   pattern = stri_prepare_arg_string(pattern, "pattern");
   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));
   // this will work for vectorize_length == 0:

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN

   StriContainerUTF16 str_cont(str, vectorize_length);
   // MG: tried StriContainerUTF8 + utext_openUTF8 - this was slower
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, ret_tab[i] = NA_LOGICAL, ret_tab[i] = FALSE)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));
      ret_tab[i] = (int)matcher->find();
   }

   UNPROTECT(1);
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/**
 * Detect if a pattern occurs in a string
 *
 * @param str R character vector
 * @param pattern R character vector containing regular expressions
 * @param negate single bool
 * @param max_count single int
 * @param opts_regex list
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16's vectorization
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-18)
 *          use StriContainerRegexPattern + opts_regex
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-05)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.0-2 (Marek Gagolewski, 2016-01-29)
 *    Issue #214: allow a regex pattern like `.*`  to match an empty string
 *
 * @version 1.0-3 (Marek Gagolewski, 2016-02-03)
 *    FR #216: `negate` arg added
 *
 * @version 1.3.1 (Marek Gagolewski, 2019-02-08)
 *    #232: `max_count` arg added
 */
SEXP stri_detect_regex(SEXP str, SEXP pattern, SEXP negate,
    SEXP max_count, SEXP opts_regex)
{
   bool negate_1 = stri__prepare_arg_logical_1_notNA(negate, "negate");
   int max_count_1 = stri__prepare_arg_integer_1_notNA(max_count, "max_count");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   PROTECT(pattern = stri_prepare_arg_string(pattern, "pattern"));
   R_len_t vectorize_length =
      stri__recycling_rule(true, 2, LENGTH(str), LENGTH(pattern));

   uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex);

   STRI__ERROR_HANDLER_BEGIN(2)
   StriContainerUTF16 str_cont(str, vectorize_length);
//   StriContainerUTF8 str_cont(str, vectorize_length); // utext_openUTF8, see below
   StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(LGLSXP, vectorize_length));
   int* ret_tab = LOGICAL(ret);

   for (R_len_t i = pattern_cont.vectorize_init();
         i != pattern_cont.vectorize_end();
         i = pattern_cont.vectorize_next(i))
   {
      if (max_count_1 == 0) {
          ret_tab[i] = NA_LOGICAL;
          continue;
      }

      STRI__CONTINUE_ON_EMPTY_OR_NA_PATTERN(str_cont,
         pattern_cont, ret_tab[i] = NA_LOGICAL)

      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
      matcher->reset(str_cont.get(i));
      ret_tab[i] = (int)matcher->find(); // returns UBool
      if (negate_1) ret_tab[i] = !ret_tab[i];
      if (max_count_1 > 0 && ret_tab[i]) --max_count_1;

//      // mbmark-regex-detect1.R: UTF16 0.07171792 s; UText 0.10531605 s
//      UText* str_text = NULL;
//      UErrorCode status = U_ZERO_ERROR;
//      RegexMatcher *matcher = pattern_cont.getMatcher(i); // will be deleted automatically
//      str_text = utext_openUTF8(str_text, str_cont.get(i).c_str(), str_cont.get(i).length(), &status);
//      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
//      matcher->reset(str_text);
//      ret_tab[i] = (int)matcher->find(); // returns UBool
//      utext_close(str_text);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Example #4
0
int t4p::FinderClass::ReplaceAllMatches(UnicodeString& text) const {
    int matches = 0;

    // no check for ReplaceExpression.isEmpty() allow for empty replacements
    // this allows the user to 'delete' parts of a strin
    if (IsPrepared) {
        UnicodeString replacement = ReplaceExpression;
        RegexMatcher* matcher = NULL;
        UErrorCode error = U_ZERO_ERROR;
        UnicodeString dest(text.length(), ' ', 0);
        int32_t pos = 0;
        if (EXACT == Mode || (REGULAR_EXPRESSION == Mode && ReplaceExpression.isEmpty())) {
            pos = text.indexOf(Expression, 0);
            while (pos >= 0) {
                text.replaceBetween(pos, pos + Expression.length(), replacement);
                pos = text.indexOf(Expression, pos + replacement.length());
                ++matches;
            }
        } else {
            matcher = Pattern->matcher(text, error);
            if (U_SUCCESS(error) && matcher) {
                while (matcher->find()) {
                    if (U_SUCCESS(error)) {
                        matcher->appendReplacement(dest, replacement, error);
                        if (U_SUCCESS(error)) {
                            ++matches;
                        }
                    }
                }
                matcher->appendTail(dest);
                text = dest;
            }
        }
        if (matcher) {
            delete matcher;
        }
    }
    return matches;
}
Example #5
0
//------------------------------------------------------------------------------------------
//
//   main          for ugrep
//
//           Structurally, all use of the ICU Regular Expression API is in main(),
//           and all of the supporting stuff necessary to make a running program, but
//           not directly related to regular expressions, is factored out into these other
//           functions.
//
//------------------------------------------------------------------------------------------
int main(int argc, const char** argv) {
    UBool     matchFound = FALSE;

    //
    //  Process the commmand line options.
    //
    processOptions(argc, argv);

    //
    // Create a RegexPattern object from the user supplied pattern string.
    //
    UErrorCode status = U_ZERO_ERROR;   // All ICU operations report success or failure
                                        //   in a status variable.

    UParseError    parseErr;            // In the event of a syntax error in the regex pattern,
                                        //   this struct will contain the position of the
                                        //   error.

    RegexPattern  *rePat = RegexPattern::compile(pattern, parseErr, status);
                                        // Note that C++ is doing an automatic conversion
                                        //  of the (char *) pattern to a temporary
                                        //  UnicodeString object.
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep:  error in pattern: \"%s\" at position %d\n",
            u_errorName(status), parseErr.offset);
        exit(-1);
    }

    //
    // Create a RegexMatcher from the newly created pattern.
    //
    UnicodeString empty;
    RegexMatcher *matcher = rePat->matcher(empty, status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep:  error in creating RegexMatcher: \"%s\"\n",
            u_errorName(status));
        exit(-1);
    }

    //
    // Loop, processing each of the input files.
    //
    for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
        readFile(argv[fileNum]);

        //
        //  Loop through the lines of a file, trying to match the regex pattern on each.
        //
        for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
            UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
            matcher->reset(s);
            if (matcher->find()) {
                matchFound = TRUE;
                printMatch();
            }
        }
    }

    //
    //  Clean up
    //
    delete matcher;
    delete rePat;
    free(ucharBuf);
    free(charBuf);
    ucnv_close(outConverter);
    
    u_cleanup();       // shut down ICU, release any cached data it owns.

    return matchFound? 0: 1;
}