Пример #1
0
/**
 * Translates a delimited element (start delimiter, optional end delimiter,
 * optional escape sequence) into expressions of the automaton.
 *
 * Two strategies are used:
 *  - a single regular expression covering the whole delimited text, when the
 *    element is not the start of a State/Environment, is not multiline, both
 *    delimiters are one (escaped) character long and the end delimiter holds
 *    no references;
 *  - otherwise an additional "inner" state, entered when the start delimiter
 *    is matched and left when the end delimiter (or the end of the buffer)
 *    is matched.
 *
 * @param elem the delimited element to translate
 * @param state the state that receives the expression built for elem
 */
void
RegExpStateBuilder::build(DelimitedLangElem *elem, RegExpStatePointer state)
{
  const string &elem_name = elem->getName();

  StringDef *start = elem->getStart();
  StringDef *end = elem->getEnd();
  StringDef *escape = elem->getEscape();

  // textual form of each delimiter; left empty when the delimiter is missing
  string start_text = start ? start->toString() : string();
  string end_text = end ? end->toString() : string();
  string escape_text = escape ? escape->toString() : string();

  // Validate the references found in the end delimiter against the marked
  // subexpressions that the start delimiter actually provides.
  bool end_has_refs = false;
  if (end && end->isBackRef() && !end_text.empty()) {
    backreference_info refs = RegexPreProcessor::num_of_references(end_text);
    subexpressions_info subexps =
        RegexPreProcessor::num_of_marked_subexpressions(start_text, true, true);

    // e.g., unbalanced parenthesis in the start delimiter
    if (subexps.errors.size())
      exitError(subexps.errors, elem);

    // the highest reference number must not exceed the number of
    // marked subexpressions available
    const int highest_ref = refs.second;
    if (highest_ref > subexps.marked) {
      ostringstream msg;
      msg << highest_ref << " subexpressions requested, but only "
          << subexps.marked << " found ";
      exitError(msg.str(), elem);
    }

    end_has_refs = true;
  }

  const bool single_expression =
      !elem->getStateLangElem() &&
      !elem->isMultiline() &&
      escaped_string_size(start_text) == 1 &&
      escaped_string_size(end_text) == 1 &&
      !end_has_refs;

  // expression that will be added to the enclosing state
  string expression = start_text;

  // stays empty unless the multi-state strategy is chosen
  RegExpStatePtr inner;

  if (single_expression) {
    /*
    Shape of the generated expression:

    <startdelim>(everything but delimiters)<enddelim>

    With delimiters "<" and ">" this yields

    "<(?:[^<>])*>"

    When an escape sequence is given it takes part in the middle section;
    with a backslash as the escape the result is

    <(?:[^\\<\\>]|\\.)*>

    escape_text is empty when no escape was specified, so the very same
    concatenation serves both variants.
    */
    string middle = "[^" + escape_text + start_text;
    if (end_text != start_text)
      middle += escape_text + end_text;
    middle += "]";
    if (escape)
      middle += "|" + escape_text + ".";

    expression = start_text + non_marking_group(middle) + "*" + end_text;
  } else {
    /*
    One expression is not enough here: matching the start delimiter
    moves the automaton into a dedicated state, called "inner".
    */
    inner = RegExpStatePtr(new RegExpState);
    nested_states.push_back(inner);

    // the inner state contains references to be replaced at run-time
    if (end_has_refs)
      inner->setHasReferences();

    /*
    Whatever is found inside the delimiters and is not matched by
    anything else is formatted as this element.
    */
    inner->set_default_formatter(
        RegExpFormatterPtr(new RegExpFormatter(elem_name)));

    /*
    The inner state is left on the end delimiter, or on the end of the
    buffer when no end delimiter was specified. For

    comment delim "[*" "*]"

    the inner state contains the expression

    \*\]

    An element specified with "exit" must leave one more level: the inner
    state and the state the element belongs to.
    */
    const int levels_to_leave = elem->doExit() ? 2 : 1;
    RegExpFormatterPtr leave(new RegExpFormatter(
        elem_name, RegExpStatePtr(), levels_to_leave, elem->exitAll()));
    if (!end)
      inner->add_exp(buildex("\\z"), elem, leave);
    else
      add_exp(inner, end_text, elem, leave);

    /*
    With an escape sequence, whatever is prefixed by it is matched too:

    comment delim "[*" "*]" escape "\\"

    produces the inner state

    (\*\])|(\\.)
    */
    if (escape) {
      RegExpFormatterPtr escaped(new RegExpFormatter(elem_name));
      add_exp(inner, escape_text + ".", elem, escaped);
    }

    /*
    A nested element (e.g., C comments) needs start and end delimiters to
    be counted: the start delimiter is added to the inner state with a
    formatter whose next state is the inner state itself, which implements
    the stack of occurrences. For

    comment delim "[*" "*]" nested

    the inner state has the expressions

    (\*\])|(\[\*)

    where the first one exits and the second one enters the inner state
    again.
    */
    if (elem->isNested()) {
      RegExpFormatterPtr reenter(new RegExpFormatter(elem_name, inner));
      nested_formatters.push_back(reenter);
      add_exp(inner, start_text, elem, reenter);
    }

    // all the expressions of the inner state have been added
    inner->freeze();
  }

  // the formatter of the element leads to the inner state, if there is one
  RegExpFormatterPtr elem_formatter(new RegExpFormatter(elem_name, inner));
  add_exp(state, expression, elem, elem_formatter);

  // go on with the part shared by all the state-starting elements
  build(static_cast<StateStartLangElem *>(elem), state);
}
/**
 * Test driver: runs the checks on regular expression preprocessing
 * (subexpression counting, splitting, back references and reference
 * replacement) and on the back reference flag of StringDef.
 *
 * The outcome of the checks is accumulated into result, which is not
 * declared in this function (presumably a file-scope counter that is also
 * updated by the test* helpers -- TODO confirm).
 *
 * @return the accumulated value of result
 */
int main() {
    // print bool values as true/false
    cout << boolalpha;

    // preprocessing: expected output for each input expression
    testPreprocess("simple", "simple");
    testPreprocess("(inside)", "(?:inside)");
    testPreprocess("(dou(b)le)", "(?:dou(?:b)le)");

    // expected case-insensitive form of an expression
    testMakeNonSensitive("foo", "[Ff][Oo][Oo]");

    // expected number of marked subexpressions only
    testOnlyNumOfMarkedSubexpressions("none", 0);
    testOnlyNumOfMarkedSubexpressions("just (one)", 1);
    testOnlyNumOfMarkedSubexpressions("(3 of (them)) just (one)", 3);

    // escaped parenthesis and (?: groups are expected not to be counted
    testOnlyNumOfMarkedSubexpressions("none \\(", 0);
    testOnlyNumOfMarkedSubexpressions("(?: again) none \\(", 0);

    // expected number of marked subexpressions together with the expected
    // error; the trailing bool arguments presumably relax the checks on
    // what may appear outside marked subexpressions -- TODO confirm against
    // the helper's declaration
    testNumOfMarkedSubexpressions("none", 0,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);
    testNumOfMarkedSubexpressions("none", 0, "", true);
    testNumOfMarkedSubexpressions("just (one)", 0,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);
    testNumOfMarkedSubexpressions("just (one)", 1, "", true);
    testNumOfMarkedSubexpressions("(3 of (them)) just (one)", 1,
            subexpressions_info::ERR_NESTED_SUBEXP);
    
    // now some critical cases
    testNumOfMarkedSubexpressions("(\\((?:\\\\\\)|[^)])*\\))", 1);
    testNumOfMarkedSubexpressions("(\\[(?:\\\\\\]|[^\\]])*\\])", 1);
    testNumOfMarkedSubexpressions("(:[^:]+\\:)", 1);

    testNumOfMarkedSubexpressions("none \\(", 0,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);
    testNumOfMarkedSubexpressions("(?: again) none \\(", 0,
            subexpressions_info::ERR_OUTER_UNMARKED);
    testNumOfMarkedSubexpressions("(?: again) none \\(", 0, "", true, true);
    // outer nonmarked are allowed, but outer chars are not
    testNumOfMarkedSubexpressions("(?: again) none \\(", 0,
            subexpressions_info::ERR_OUTSIDE_SUBEXP, false, true);

    // a single marked subexpression, possibly containing unmarked groups
    // and escaped parenthesis
    testNumOfMarkedSubexpressions("(just one)", 1);
    testNumOfMarkedSubexpressions("(just one (?:some) and unmarked)", 1);
    testNumOfMarkedSubexpressions("(just one \\( and escapes)", 1);
    testNumOfMarkedSubexpressions("(just one \\( and \\) escapes)", 1);
    testNumOfMarkedSubexpressions("(one) ", 1,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);

    // unbalanced parenthesis: one missing and one in excess
    testNumOfMarkedSubexpressions("(one", 1,
            subexpressions_info::ERR_UNBALANCED_PAREN);
    testNumOfMarkedSubexpressions("(one))", 1,
            subexpressions_info::ERR_UNBALANCED_PAREN);

    testNumOfMarkedSubexpressions("(one)(two)((?:three)*)", 3);
    testNumOfMarkedSubexpressions("(one) (two)", 1,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);

    // splitting an expression into its marked subexpressions
    subexpressions_strings expected;
    expected.push_back("(this)");
    expected.push_back("(is)");
    expected.push_back("(one)");
    testSplit("(this)(is)(one)", expected);

    // same, with escaped parenthesis and a (?: group inside the parts
    expected.clear();
    expected.push_back("(this)");
    expected.push_back("(contains \\( some \\) other parenthesis)");
    expected.push_back("(and (?:non marked) ones)");
    testSplit(
            "(this)(contains \\( some \\) other parenthesis)(and (?:non marked) ones)",
            expected);

    // presence of back references: \N and (?(N)...) are expected to count,
    // (?(foo)...) is not
    testBackReference("this does not contain any", false);
    testBackReference("this does contain \\1 one", true);
    testBackReference("and also this one (?(2)...) does", true);
    testBackReference("while this one (?(foo)...) does NOT does", false);

    // back references: the two numbers are presumably how many there are
    // and the highest one -- TODO confirm against the helper's declaration
    testNumOfBackReferences("this does not contain any", 0, 0);
    testNumOfBackReferences("this does contain \\1 one", 1, 1);
    testNumOfBackReferences("this does \\3 contain \\1 two", 2, 3);
    testNumOfBackReferences("and also this one (?(2)...) does", 1, 2);
    testNumOfBackReferences("and also \\1 this one (?(2)...) does", 2, 2);
    testNumOfBackReferences("and also this one (?(2)...) \\3 does", 2, 3);
    testNumOfBackReferences("while this one (?(foo)...) does NOT does", 0, 0);

    // same as above for references written as @{N}; an escaped \\@{2} is
    // expected not to count
    testNumOfReferences("this does not contain any", 0, 0);
    testNumOfReferences("this does contain @{1} one", 1, 1);
    testNumOfReferences("this does @{3} contain @{1} two", 2, 3);
    testNumOfReferences("while this one \\@{2} does NOT does", 0, 0);
    
    // replacement of the @{N} references: going by the expected strings
    // below, @{N} is replaced with replace[N - 1]
    backreference_replacements replace(9);

    replace[1] = "SECOND";
    replace[2] = "THIRD";

    testReferenceReplace("this does not contain any", replace,
            "this does not contain any");

    // test for an empty replace string
    testReferenceReplace("this does contain @{1} one", replace,
            "this does contain  one");

    replace[0] = "FIRST";

    testReferenceReplace("this does contain @{1} one", replace,
            "this does contain FIRST one");

    // NOTE(review): same check as the previous one, performed twice
    testReferenceReplace("this does contain @{1} one", replace,
            "this does contain FIRST one");

    testReferenceReplace("and also this one (?(@{2})...) @{3} does", replace,
            "and also this one (?(SECOND)...) THIRD does");

    testReferenceReplace(
            "and (@{1}|@{3}@{1}) also this one (?((?(@{2})foo|bar))...) (@{3}) does",
            replace,
            "and (FIRST|THIRDFIRST) also this one (?((?(SECOND)foo|bar))...) (THIRD) does");

    // we test replacement when what we replace might be a special char in regex syntax
    replace[0] = "|";
    replace[1] = "$";
    replace[2] = "{";
    replace[3] = "=";
    
    // the expected string has | $ { escaped with a backslash, = is left as is
    testReferenceReplace("Here are special chars: @{1} @{2} @{3} @{4}", replace,
                "Here are special chars: \\| \\$ \\{ =");
    
    // now test substitutions using match results
    boost::regex test_regex("--\\[(=*)\\[");
    string to_match = "--[[";
    string to_substitute = "]]"; // between the ] ]
    regex_match_results what;

    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    // no substitution must take place
    testReferenceReplace(to_substitute, what, "]]");

    // skip one char and replace with the first subexp
    to_substitute = "]@{1}]"; // between the ] ]

    // start from empty match results for the new search
    what = regex_match_results();
    to_match = "--[=[";

    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    testReferenceReplace(to_substitute, what, "]=]");

    // two subexpressions, both used in the string to substitute
    test_regex = boost::regex("--\\[(=*)\\[(-*)\\[");
    what = regex_match_results();
    to_substitute = "]@{1}]@{2}]";
    to_match = "--[=[-[";

    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    testReferenceReplace(to_substitute, what, "]=]-]");

    what = regex_match_results();
    to_match = "--[=[-[";

    // references inside a conditional expression
    to_substitute = "](?(@{1})@{1}|@{2})]@{2}]";
    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    testReferenceReplace(to_substitute, what, "](?(=)=|-)]-]");

    what = regex_match_results();

    // here there is no '-' between the last two '[', and the expected
    // string has nothing in place of @{2}
    to_match = "--[=[[";

    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    testReferenceReplace(to_substitute, what, "](?(=)=|)]]");
    
    // check StringDef for backreferences
    
    StringDef s1("foo");
    StringDef s2("bar");
    
    // the flag is expected to be off until it is set explicitly
    result += assertTrue(!s1.isBackRef());
    
    s1 = StringDef("");
    s1.setBackRef(true);
    
    result += assertTrue(s1.isBackRef());
    result += assertTrue(!s2.isBackRef());

    // the concatenation is expected to be a back reference since s1 is one
    // NOTE(review): conc is never released in this function -- confirm who
    // owns the object returned by StringDef::concat
    StringDef *conc = StringDef::concat(&s1, &s2);
    
    result += assertTrue(conc->isBackRef());
    
    return result;
}