Пример #1
0
/**
 * Builds the regular expression(s) for a delimited element and adds them
 * to the given state.
 *
 * Two strategies are used:
 *  - if the element does not start a State/Environment, is not multiline,
 *    both delimiters are exactly one (escaped) character long and the end
 *    delimiter has no references, a single regular expression
 *    <startdelim>(everything but delimiters)<enddelim> is generated;
 *  - otherwise only the start delimiter is added to the state, and a
 *    dedicated "inner" state is created that is left when the end
 *    delimiter is matched.
 *
 * @param elem the delimited element to build
 * @param state the state the resulting expression is added to
 */
void
RegExpStateBuilder::build(DelimitedLangElem *elem, RegExpStatePointer state)
{
  const string &elem_name = elem->getName();

  StringDef *start_def = elem->getStart();
  StringDef *end_def = elem->getEnd();
  StringDef *escape_def = elem->getEscape();

  // a delimiter that was not specified is treated as the empty string
  string start_re = (start_def ? start_def->toString() : string());
  string end_re = (end_def ? end_def->toString() : string());
  string escape_re = (escape_def ? escape_def->toString() : string());

  // validate the back reference markers of the end delimiter (if any)
  // against the marked subexpressions of the start delimiter
  bool end_has_references = false;
  if (end_def && end_def->isBackRef() && !end_re.empty()) {
    backreference_info references =
        RegexPreProcessor::num_of_references(end_re);
    subexpressions_info subexps =
        RegexPreProcessor::num_of_marked_subexpressions(start_re, true, true);

    // e.g., unbalanced parenthesis
    if (subexps.errors.size())
      exitError(subexps.errors, elem);

    // the highest requested back reference must correspond to an
    // existing marked subexpression
    int highest_reference = references.second;
    if (highest_reference > subexps.marked) {
      ostringstream message;
      message << highest_reference << " subexpressions requested, but only "
              << subexps.marked << " found ";
      exitError(message.str(), elem);
    }

    end_has_references = true;
  }

  const bool fits_single_expression =
      ! elem->getStateLangElem() &&
      ! elem->isMultiline() &&
      escaped_string_size(start_re) == 1 &&
      escaped_string_size(end_re) == 1 &&
      ! end_has_references;

  // the expression added to "state"; by default only the start delimiter
  string exp_string = start_re;

  // stays empty when a single regular expression is enough
  RegExpStatePtr inner;

  if (fits_single_expression) {
    /*
    Build one expression of the shape

    <startdelim>(everything but delimiters)<enddelim>

    e.g., for the delimiters "<" and ">":

    "<(?:[^<>])*>"
    */
    string excluded; // content of the negated character class

    if (!escape_def) {
      excluded = start_re;
      if (end_re != start_re)
        excluded += end_re;

      exp_string = start_re +
          non_marking_group("[^" + excluded + "]") +
          "*" + end_re;
    } else {
      /*
      With an escape character, an escaped character is accepted as an
      alternative to a non-delimiter character; e.g., for the delimiters
      above and the backslash as escape character:

      <(?:[^\\<\\>]|\\.)*>
      */
      excluded = escape_re + start_re;
      if (end_re != start_re)
        excluded += escape_re + end_re;

      exp_string = start_re +
          non_marking_group("[^" + excluded + "]|" + escape_re + ".") +
          "*" + end_re;
    }
  } else {
    /*
    A single expression is not enough: matching the start delimiter
    enters a new state ("inner"), which holds the expressions that are
    valid inside the delimited element.
    */
    inner = RegExpStatePtr(new RegExpState);
    nested_states.push_back(inner);

    // the inner state contains references to be replaced at run-time
    if (end_has_references)
      inner->setHasReferences();

    // whatever is inside the element and matches nothing else is
    // formatted like the element itself
    inner->set_default_formatter
        (RegExpFormatterPtr(new RegExpFormatter(elem_name)));

    /*
    The inner state is left when the end delimiter is matched (or at the
    end of buffer, if no end delimiter was specified).  E.g., for

    comment delim "[*" "*]"

    the inner state contains \*\]

    If the element was specified with "exit", one more level must be
    exited: the inner state and the state the element belongs to.
    */
    const int exit_level = (elem->doExit() ? 2 : 1);
    RegExpFormatterPtr exit_formatter
        (new RegExpFormatter(elem_name, RegExpStatePtr(),
         exit_level, elem->exitAll()));

    if (end_def)
      add_exp(inner, end_re, elem, exit_formatter);
    else
      inner->add_exp(buildex("\\z"), elem, exit_formatter);

    /*
    With an escape character, anything prefixed by it must be matched
    too.  E.g.,

    comment delim "[*" "*]" escape "\\"

    generates the inner state (\*\])|(\\.)
    */
    if (escape_def) {
      RegExpFormatterPtr escaped_formatter(new RegExpFormatter(elem_name));
      add_exp(inner, escape_re + ".", elem, escaped_formatter);
    }

    /*
    For nested elements (e.g., C comments) the start delimiter is also
    added to the inner state, with a formatter whose next state is the
    inner state itself: this implements the stack of occurrences.  E.g.,

    comment delim "[*" "*]" nested

    generates the inner state (\*\])|(\[\*)
    where the first alternative exits and the second one enters the
    same inner state again.
    */
    if (elem->isNested()) {
      RegExpFormatterPtr nested(new RegExpFormatter(elem_name, inner));
      nested_formatters.push_back(nested);
      add_exp(inner, start_re, elem, nested);
    }

    // the inner state is complete
    inner->freeze();
  }

  RegExpFormatterPtr formatter(new RegExpFormatter(elem_name, inner));
  add_exp(state, exp_string, elem, formatter);

  // go on with the part that is common to all state-starting elements
  build(static_cast<StateStartLangElem *>(elem), state);
}
/**
 * Builds the highlight rule for a delimited element and adds it to the
 * given state.
 *
 * A multi line rule (i.e., a rule with an inner state) is created when the
 * element starts a state/environment, is multiline, or its end delimiter
 * contains back references; otherwise a line rule is created.
 *
 * @param elem the delimited element to build
 * @param state the state the resulting rule is added to
 * @throws HighlightBuilderException if a nested element has identical
 * delimiters, or if the back references in the end delimiter are not
 * consistent with the marked subexpressions of the start delimiter
 */
void HighlightStateBuilder::build(DelimitedLangElem *elem,
        HighlightState *state) {
    const string &elem_name = elem->getName();

    StringDef *start_def = elem->getStart();
    StringDef *end_def = elem->getEnd();
    StringDef *escape_def = elem->getEscape();

    // a delimiter that was not specified is treated as the empty string
    string start_re;
    if (start_def) {
        start_re = start_def->toString();
    }
    string end_re;
    if (end_def) {
        end_re = end_def->toString();
    }
    string escape_re;
    if (escape_def) {
        escape_re = escape_def->toString();
    }

    // nested elements need two different delimiters
    if (elem->isNested() && start_re == end_re) {
        throw HighlightBuilderException(
                "delimiters must be different for nested elements", elem);
    }

    // validate the back reference markers of the end delimiter (if any)
    // against the marked subexpressions of the start delimiter
    bool end_has_references = false;
    if (end_def && end_def->hasBackRef() && !end_re.empty()) {
        backreference_info references =
                RegexPreProcessor::num_of_references(end_re);
        subexpressions_info subexps =
                RegexPreProcessor::num_of_marked_subexpressions(start_re,
                        true, true);

        // e.g., unbalanced parenthesis
        if (subexps.errors.size()) {
            throw HighlightBuilderException(subexps.errors, elem);
        }

        // the highest requested back reference must correspond to an
        // existing marked subexpression
        unsigned int highest_reference = references.second;
        if (highest_reference > subexps.marked) {
            std::ostringstream message;
            message << highest_reference
                    << " subexpressions requested, but only "
                    << subexps.marked << " found";
            throw HighlightBuilderException(message.str(), elem);
        }

        end_has_references = true;
    }

    // an element that starts a new state/environment (or that spans more
    // lines, or that has references to replace) must be split using states
    const bool needs_inner_state = elem->getStateLangElem()
            || elem->isMultiline() || end_has_references;

    HighlightRulePtr rule;
    if (needs_inner_state) {
        rule = HighlightRulePtr(highlightRuleFactory->createMultiLineRule(
                elem_name, start_re, end_re, escape_re, elem->isNested()));
    } else {
        rule = HighlightRulePtr(highlightRuleFactory->createLineRule(
                elem_name, start_re, end_re, escape_re, elem->isNested()));
    }

    // references in the end delimiter always lead to the multi line rule
    // created above
    if (end_has_references) {
        // the inner state, and its first rule (representing the end),
        // need to have dynamic back references replaced
        rule->getNextState()->setNeedsReferenceReplacement();
        rule->getNextState()->getRuleList().front()->setNeedsReferenceReplacement();

        // the starting rule provides the subexpressions used for
        // replacing the dynamic back references
        rule->setHasSubexpressions();

        // for a nested element the last rule is a sort of copy of the
        // first one, so it has subexpressions too
        if (elem->isNested()) {
            rule->getNextState()->getRuleList().back()->setHasSubexpressions();
        }
    }

    rule->setAdditionalInfo(elem->toStringParserInfo());
    state->addRule(rule);

    if (!rule->getNextState().get()) {
        // no inner state: the exit level concerns the rule itself
        setExitLevel(elem, rule.get());
        return;
    }

    // The rule was split using states: the exit level is set on the first
    // rule of the next state (i.e., the end expression), incremented by
    // one: 1 is for exiting the inner state of the rule, and 1 for exiting
    // the state this rule belongs to.
    setExitLevel(elem, rule->getNextState()->getRuleList().front().get(), 1);

    // adjust the additional info of the exiting rule
    rule->getNextState()->getRuleList().front()->setAdditionalInfo(
            elem->toStringParserInfo());

    // since this is a delimited element, the default element of the inner
    // state is the element itself
    rule->getNextState()->setDefaultElement(elem_name);
}
// Test driver for the regular expression pre-processing utilities.
//
// Runs the checks below in sequence and returns `result`.
// NOTE(review): `result` and the test/assert helper functions called here
// are declared outside this block; presumably each helper adds its number
// of failures to `result`, so that 0 means success -- confirm.
int main() {
    // print bool values as "true"/"false"
    cout << boolalpha;

    // --- preprocessing: expected outputs use non-marking groups "(?:...)"
    testPreprocess("simple", "simple");
    testPreprocess("(inside)", "(?:inside)");
    testPreprocess("(dou(b)le)", "(?:dou(?:b)le)");

    // --- case insensitive version of an expression
    testMakeNonSensitive("foo", "[Ff][Oo][Oo]");

    // --- counting marked subexpressions only
    testOnlyNumOfMarkedSubexpressions("none", 0);
    testOnlyNumOfMarkedSubexpressions("just (one)", 1);
    testOnlyNumOfMarkedSubexpressions("(3 of (them)) just (one)", 3);

    // escaped parenthesis and non-marking groups are not counted
    testOnlyNumOfMarkedSubexpressions("none \\(", 0);
    testOnlyNumOfMarkedSubexpressions("(?: again) none \\(", 0);

    // --- counting marked subexpressions together with the expected error
    // NOTE(review): the trailing bool arguments are presumably the
    // "allow outer chars" / "allow outer non-marked groups" switches
    // (see the comment on the last call of the next group) -- confirm.
    testNumOfMarkedSubexpressions("none", 0,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);
    testNumOfMarkedSubexpressions("none", 0, "", true);
    testNumOfMarkedSubexpressions("just (one)", 0,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);
    testNumOfMarkedSubexpressions("just (one)", 1, "", true);
    testNumOfMarkedSubexpressions("(3 of (them)) just (one)", 1,
            subexpressions_info::ERR_NESTED_SUBEXP);
    
    // now some critical cases
    testNumOfMarkedSubexpressions("(\\((?:\\\\\\)|[^)])*\\))", 1);
    testNumOfMarkedSubexpressions("(\\[(?:\\\\\\]|[^\\]])*\\])", 1);
    testNumOfMarkedSubexpressions("(:[^:]+\\:)", 1);

    testNumOfMarkedSubexpressions("none \\(", 0,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);
    testNumOfMarkedSubexpressions("(?: again) none \\(", 0,
            subexpressions_info::ERR_OUTER_UNMARKED);
    testNumOfMarkedSubexpressions("(?: again) none \\(", 0, "", true, true);
    // outer nonmarked are allowed, but outer chars are not
    testNumOfMarkedSubexpressions("(?: again) none \\(", 0,
            subexpressions_info::ERR_OUTSIDE_SUBEXP, false, true);

    testNumOfMarkedSubexpressions("(just one)", 1);
    testNumOfMarkedSubexpressions("(just one (?:some) and unmarked)", 1);
    testNumOfMarkedSubexpressions("(just one \\( and escapes)", 1);
    testNumOfMarkedSubexpressions("(just one \\( and \\) escapes)", 1);
    // a trailing space is a character outside of the subexpression
    testNumOfMarkedSubexpressions("(one) ", 1,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);

    // unbalanced parenthesis (one missing, one too many)
    testNumOfMarkedSubexpressions("(one", 1,
            subexpressions_info::ERR_UNBALANCED_PAREN);
    testNumOfMarkedSubexpressions("(one))", 1,
            subexpressions_info::ERR_UNBALANCED_PAREN);

    testNumOfMarkedSubexpressions("(one)(two)((?:three)*)", 3);
    testNumOfMarkedSubexpressions("(one) (two)", 1,
            subexpressions_info::ERR_OUTSIDE_SUBEXP);

    // --- splitting an expression into its marked subexpressions
    subexpressions_strings expected;
    expected.push_back("(this)");
    expected.push_back("(is)");
    expected.push_back("(one)");
    testSplit("(this)(is)(one)", expected);

    // escaped parenthesis and non-marking groups do not split
    expected.clear();
    expected.push_back("(this)");
    expected.push_back("(contains \\( some \\) other parenthesis)");
    expected.push_back("(and (?:non marked) ones)");
    testSplit(
            "(this)(contains \\( some \\) other parenthesis)(and (?:non marked) ones)",
            expected);

    // --- detecting back references: \N and conditionals (?(N)...)
    testBackReference("this does not contain any", false);
    testBackReference("this does contain \\1 one", true);
    testBackReference("and also this one (?(2)...) does", true);
    testBackReference("while this one (?(foo)...) does NOT does", false);

    // --- counting back references: (expression, how many, highest number)
    testNumOfBackReferences("this does not contain any", 0, 0);
    testNumOfBackReferences("this does contain \\1 one", 1, 1);
    testNumOfBackReferences("this does \\3 contain \\1 two", 2, 3);
    testNumOfBackReferences("and also this one (?(2)...) does", 1, 2);
    testNumOfBackReferences("and also \\1 this one (?(2)...) does", 2, 2);
    testNumOfBackReferences("and also this one (?(2)...) \\3 does", 2, 3);
    testNumOfBackReferences("while this one (?(foo)...) does NOT does", 0, 0);

    // --- counting references of the shape @{N} (an escaped \@{N} is not one)
    testNumOfReferences("this does not contain any", 0, 0);
    testNumOfReferences("this does contain @{1} one", 1, 1);
    testNumOfReferences("this does @{3} contain @{1} two", 2, 3);
    testNumOfReferences("while this one \\@{2} does NOT does", 0, 0);
    
    // --- replacing references: as the expected strings below show,
    // replace[i] is the replacement for @{i+1}
    backreference_replacements replace(9);

    replace[1] = "SECOND";
    replace[2] = "THIRD";

    testReferenceReplace("this does not contain any", replace,
            "this does not contain any");

    // test for an empty replace string
    // (replace[0], used for @{1}, has not been set yet)
    testReferenceReplace("this does contain @{1} one", replace,
            "this does contain  one");

    replace[0] = "FIRST";

    testReferenceReplace("this does contain @{1} one", replace,
            "this does contain FIRST one");

    // NOTE(review): this call is identical to the previous one --
    // possibly an unintended duplicate
    testReferenceReplace("this does contain @{1} one", replace,
            "this does contain FIRST one");

    testReferenceReplace("and also this one (?(@{2})...) @{3} does", replace,
            "and also this one (?(SECOND)...) THIRD does");

    testReferenceReplace(
            "and (@{1}|@{3}@{1}) also this one (?((?(@{2})foo|bar))...) (@{3}) does",
            replace,
            "and (FIRST|THIRDFIRST) also this one (?((?(SECOND)foo|bar))...) (THIRD) does");

    // we test replacement when what we replace might be a special char in regex syntax
    // (in the expected string "|", "$" and "{" are escaped, "=" is not)
    replace[0] = "|";
    replace[1] = "$";
    replace[2] = "{";
    replace[3] = "=";
    
    testReferenceReplace("Here are special chars: @{1} @{2} @{3} @{4}", replace,
                "Here are special chars: \\| \\$ \\{ =");
    
    // now test substitutions using match results
    boost::regex test_regex("--\\[(=*)\\[");
    string to_match = "--[[";
    string to_substitute = "]]"; // nothing to replace between the ] ]
    regex_match_results what;

    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    // no substitution must take place
    testReferenceReplace(to_substitute, what, "]]");

    // skip one char and replace with the first subexp
    to_substitute = "]@{1}]"; // a reference between the ] ]

    what = regex_match_results();
    to_match = "--[=[";

    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    testReferenceReplace(to_substitute, what, "]=]");

    // two subexpressions, both referenced
    test_regex = boost::regex("--\\[(=*)\\[(-*)\\[");
    what = regex_match_results();
    to_substitute = "]@{1}]@{2}]";
    to_match = "--[=[-[";

    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    testReferenceReplace(to_substitute, what, "]=]-]");

    what = regex_match_results();
    to_match = "--[=[-[";

    // references inside a conditional expression
    to_substitute = "](?(@{1})@{1}|@{2})]@{2}]";
    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    testReferenceReplace(to_substitute, what, "](?(=)=|-)]-]");

    what = regex_match_results();

    // the second subexpression matches the empty string here, so @{2}
    // is replaced with nothing
    to_match = "--[=[[";

    result += assertEquals(true,
            boost::regex_search(to_match, what, test_regex));
    testReferenceReplace(to_substitute, what, "](?(=)=|)]]");
    
    // check StringDef for backreferences
    
    StringDef s1("foo");
    StringDef s2("bar");
    
    // not a back reference unless explicitly set
    result += assertTrue(!s1.isBackRef());
    
    s1 = StringDef("");
    s1.setBackRef(true);
    
    result += assertTrue(s1.isBackRef());
    result += assertTrue(!s2.isBackRef());

    // the concatenation is expected to be a back reference since one of
    // its operands (s1) is
    // NOTE(review): conc is never released in this function; if
    // StringDef::concat returns a heap allocated object this is a
    // (harmless, at program exit) leak -- confirm ownership
    StringDef *conc = StringDef::concat(&s1, &s2);
    
    result += assertTrue(conc->isBackRef());
    
    return result;
}