/**
 * Case of a delimited element.
 *
 * Either a single regular expression covering the whole delimited text is
 * added to the given state, or only the start delimiter is added, paired
 * with a newly created inner state that recognizes the end delimiter (and,
 * when requested, escape sequences and nested start delimiters).
 *
 * @param elem the delimited element to build
 * @param state the state that receives the expression for the element
 */
void RegExpStateBuilder::build(DelimitedLangElem *elem, RegExpStatePointer state) {
    const string &name = elem->getName();

    StringDef *start = elem->getStart();
    StringDef *end = elem->getEnd();
    StringDef *escape = elem->getEscape();

    // a missing delimiter/escape definition is represented by an empty string
    const string start_string = (start ? start->toString() : "");
    const string end_string = (end ? end->toString() : "");
    const string escape_string = (escape ? escape->toString() : "");

    // the expression added to "state"; unless the single expression form
    // below applies, it is just the start delimiter
    string exp_string = start_string;

    // validate the back reference markers of the end delimiter (if any)
    // against the marked subexpressions of the start delimiter
    bool end_string_has_references = false;
    if (end && end->isBackRef() && end_string.size()) {
        backreference_info references =
                RegexPreProcessor::num_of_references(end_string);
        subexpressions_info start_info =
                RegexPreProcessor::num_of_marked_subexpressions(start_string,
                        true, true);

        // e.g., unbalanced parenthesis in the start delimiter
        if (start_info.errors.size()) {
            exitError(start_info.errors, elem);
        }

        // the highest reference number must not exceed the number of
        // marked subexpressions that are actually available
        int highest_reference = references.second;
        if (highest_reference > start_info.marked) {
            ostringstream message;
            message << highest_reference
                    << " subexpressions requested, but only "
                    << start_info.marked << " found ";
            exitError(message.str(), elem);
        }

        end_string_has_references = true;
    }

    /*
     The single expression form is possible only if the element does not
     start a State/Environment, must not span multiple lines, both
     delimiters are one character long and the end delimiter has no
     references.
     */
    const bool single_expression = !elem->getStateLangElem()
            && !elem->isMultiline()
            && escaped_string_size(start_string) == 1
            && escaped_string_size(end_string) == 1
            && !end_string_has_references;

    RegExpStatePtr inner;

    if (single_expression) {
        /*
         Build <startdelim>(everything but delimiters)<enddelim>.
         For instance, with delimiters "<" and ">" the expression is
         "<(?:[^<>])*>"
         */
        const bool different_delimiters = (end_string != start_string);
        string alternatives;

        if (!escape) {
            alternatives = "[^" + start_string;
            if (different_delimiters)
                alternatives += end_string;
            alternatives += "]";
        } else {
            /*
             With an escape character, the "everything but delimiters"
             part also accepts any escaped character. For instance, if in
             the example above the escape character is the backslash, the
             generated expression is <(?:[^\\<\\>]|\\.)*>
             */
            alternatives = "[^" + escape_string + start_string;
            if (different_delimiters)
                alternatives += escape_string + end_string;
            alternatives += "]|" + escape_string + ".";
        }

        exp_string = start_string + non_marking_group(alternatives) + "*"
                + end_string;
    } else {
        /*
         A single expression is not enough: matching the start delimiter
         enters a new state of the automaton, called here "inner".
         */
        inner = RegExpStatePtr(new RegExpState);
        nested_states.push_back(inner);

        // the inner state has references that are replaced at run-time
        if (end_string_has_references)
            inner->setHasReferences();

        /*
         Whatever is inside the delimited element and matches nothing
         else is formatted like the element itself.
         */
        inner->set_default_formatter(
                RegExpFormatterPtr(new RegExpFormatter(name)));

        /*
         The inner state is left when the end delimiter is matched (or at
         the end of buffer when no end delimiter was specified). For
         instance, for

         comment delim "[*" "*]"

         the inner state contains the expression for "*]".

         If the element was specified with "exit", the exit level is one
         more: one level for the inner state and one for the state the
         element belongs to.
         */
        const int exit_level = 1 + (elem->doExit() ? 1 : 0);
        RegExpFormatterPtr exit_formatter(new RegExpFormatter(name,
                RegExpStatePtr(), exit_level, elem->exitAll()));

        if (end) {
            add_exp(inner, end_string, elem, exit_formatter);
        } else {
            inner->add_exp(buildex("\\z"), elem, exit_formatter);
        }

        /*
         With an escape character, anything prefixed by it must be
         matched inside the inner state as well, e.g.,

         comment delim "[*" "*]" escape "\\"

         adds the escape expression followed by "." to the inner state.
         */
        if (escape) {
            add_exp(inner, escape_string + ".", elem,
                    RegExpFormatterPtr(new RegExpFormatter(name)));
        }

        /*
         A nested element (e.g., C comments) needs counted occurrences of
         start and end delimiters: the start delimiter is added to the
         inner state too, with a "nested" formatter whose next state is
         the inner state itself. Matching the end delimiter exits,
         matching the start delimiter enters the same inner state again.
         */
        if (elem->isNested()) {
            RegExpFormatterPtr nested(new RegExpFormatter(name, inner));
            nested_formatters.push_back(nested);
            add_exp(inner, start_string, elem, nested);
        }

        // the inner state is complete
        inner->freeze();
    }

    // "inner" is empty when the single expression form was used
    RegExpFormatterPtr formatter(new RegExpFormatter(name, inner));
    add_exp(state, exp_string, elem, formatter);

    build(static_cast<StateStartLangElem *>(elem), state);
}
void HighlightStateBuilder::build(DelimitedLangElem *elem, HighlightState *state) { const string &name = elem->getName(); StringDef *start = elem->getStart(); StringDef *end = elem->getEnd(); StringDef *escape = elem->getEscape(); string start_string = (start ? start->toString() : ""); string end_string = (end ? end->toString() : ""); string escape_string = (escape ? escape->toString() : ""); if (elem->isNested() && start_string == end_string) { // the two delimiters must be different for nested elements throw HighlightBuilderException( "delimiters must be different for nested elements", elem); } bool end_string_has_references = false; // check possible back reference markers and their correctness if (end && end->hasBackRef() && end_string.size()) { backreference_info ref_info = RegexPreProcessor::num_of_references( end_string); subexpressions_info info = RegexPreProcessor::num_of_marked_subexpressions(start_string, true, true); // possible errors, e.g., unbalanced parenthesis if (info.errors.size()) { throw HighlightBuilderException(info.errors, elem); } // check that there are enough subexpressions as requested by the maximal // back reference number unsigned int max = ref_info.second; if (max > info.marked) { std::ostringstream error; error << max << " subexpressions requested, but only " << info.marked << " found"; throw HighlightBuilderException(error.str(), elem); } end_string_has_references = true; } HighlightRulePtr rule; // if this element starts a new state/environment, we must split it if (elem->getStateLangElem() || elem->isMultiline() || end_string_has_references) { rule = HighlightRulePtr(highlightRuleFactory->createMultiLineRule(name, start_string, end_string, escape_string, elem->isNested())); if (end_string_has_references) { // record that the state (and the rule representing the end) // need to have dynamic back references replaced rule->getNextState()->setNeedsReferenceReplacement(); 
rule->getNextState()->getRuleList().front()->setNeedsReferenceReplacement(); // and that the starting rule has sub expressions // (that will be used for replacing dynamic back references) rule->setHasSubexpressions(); // if the element is nested, then the last rule is a sort of copy // of the first one, so we need to record that it has subexpressions too if (elem->isNested()) { rule->getNextState()->getRuleList().back()->setHasSubexpressions(); } } } else { rule = HighlightRulePtr(highlightRuleFactory->createLineRule(name, start_string, end_string, escape_string, elem->isNested())); } rule->setAdditionalInfo(elem->toStringParserInfo()); state->addRule(rule); if (rule->getNextState().get()) { // as for exit level, if the rule was split using states, we must set // the exit level of the first rule of the next state (i.e., the end expression) of the rule // this exit level must be incremented by one: 1 is for exiting the inner state // of the rule, and 1 for exiting the state this rule belongs to setExitLevel(elem, rule->getNextState()->getRuleList().front().get(), 1); // adjust the additional info of the exiting rule rule->getNextState()->getRuleList().front()->setAdditionalInfo( elem->toStringParserInfo()); // since this is a delimited element, we must set the default element for // the inner state to the name of the element itself rule->getNextState()->setDefaultElement(name); } else { setExitLevel(elem, rule.get()); } }
int main() { cout << boolalpha; testPreprocess("simple", "simple"); testPreprocess("(inside)", "(?:inside)"); testPreprocess("(dou(b)le)", "(?:dou(?:b)le)"); testMakeNonSensitive("foo", "[Ff][Oo][Oo]"); testOnlyNumOfMarkedSubexpressions("none", 0); testOnlyNumOfMarkedSubexpressions("just (one)", 1); testOnlyNumOfMarkedSubexpressions("(3 of (them)) just (one)", 3); testOnlyNumOfMarkedSubexpressions("none \\(", 0); testOnlyNumOfMarkedSubexpressions("(?: again) none \\(", 0); testNumOfMarkedSubexpressions("none", 0, subexpressions_info::ERR_OUTSIDE_SUBEXP); testNumOfMarkedSubexpressions("none", 0, "", true); testNumOfMarkedSubexpressions("just (one)", 0, subexpressions_info::ERR_OUTSIDE_SUBEXP); testNumOfMarkedSubexpressions("just (one)", 1, "", true); testNumOfMarkedSubexpressions("(3 of (them)) just (one)", 1, subexpressions_info::ERR_NESTED_SUBEXP); // now some critic cases testNumOfMarkedSubexpressions("(\\((?:\\\\\\)|[^)])*\\))", 1); testNumOfMarkedSubexpressions("(\\[(?:\\\\\\]|[^\\]])*\\])", 1); testNumOfMarkedSubexpressions("(:[^:]+\\:)", 1); testNumOfMarkedSubexpressions("none \\(", 0, subexpressions_info::ERR_OUTSIDE_SUBEXP); testNumOfMarkedSubexpressions("(?: again) none \\(", 0, subexpressions_info::ERR_OUTER_UNMARKED); testNumOfMarkedSubexpressions("(?: again) none \\(", 0, "", true, true); // outer nonmarked are allowed, but outer chars are not testNumOfMarkedSubexpressions("(?: again) none \\(", 0, subexpressions_info::ERR_OUTSIDE_SUBEXP, false, true); testNumOfMarkedSubexpressions("(just one)", 1); testNumOfMarkedSubexpressions("(just one (?:some) and unmarked)", 1); testNumOfMarkedSubexpressions("(just one \\( and escapes)", 1); testNumOfMarkedSubexpressions("(just one \\( and \\) escapes)", 1); testNumOfMarkedSubexpressions("(one) ", 1, subexpressions_info::ERR_OUTSIDE_SUBEXP); testNumOfMarkedSubexpressions("(one", 1, subexpressions_info::ERR_UNBALANCED_PAREN); testNumOfMarkedSubexpressions("(one))", 1, subexpressions_info::ERR_UNBALANCED_PAREN); 
testNumOfMarkedSubexpressions("(one)(two)((?:three)*)", 3); testNumOfMarkedSubexpressions("(one) (two)", 1, subexpressions_info::ERR_OUTSIDE_SUBEXP); subexpressions_strings expected; expected.push_back("(this)"); expected.push_back("(is)"); expected.push_back("(one)"); testSplit("(this)(is)(one)", expected); expected.clear(); expected.push_back("(this)"); expected.push_back("(contains \\( some \\) other parenthesis)"); expected.push_back("(and (?:non marked) ones)"); testSplit( "(this)(contains \\( some \\) other parenthesis)(and (?:non marked) ones)", expected); testBackReference("this does not contain any", false); testBackReference("this does contain \\1 one", true); testBackReference("and also this one (?(2)...) does", true); testBackReference("while this one (?(foo)...) does NOT does", false); testNumOfBackReferences("this does not contain any", 0, 0); testNumOfBackReferences("this does contain \\1 one", 1, 1); testNumOfBackReferences("this does \\3 contain \\1 two", 2, 3); testNumOfBackReferences("and also this one (?(2)...) does", 1, 2); testNumOfBackReferences("and also \\1 this one (?(2)...) does", 2, 2); testNumOfBackReferences("and also this one (?(2)...) \\3 does", 2, 3); testNumOfBackReferences("while this one (?(foo)...) 
does NOT does", 0, 0); testNumOfReferences("this does not contain any", 0, 0); testNumOfReferences("this does contain @{1} one", 1, 1); testNumOfReferences("this does @{3} contain @{1} two", 2, 3); testNumOfReferences("while this one \\@{2} does NOT does", 0, 0); backreference_replacements replace(9); replace[1] = "SECOND"; replace[2] = "THIRD"; testReferenceReplace("this does not contain any", replace, "this does not contain any"); // test for an empty replace string testReferenceReplace("this does contain @{1} one", replace, "this does contain one"); replace[0] = "FIRST"; testReferenceReplace("this does contain @{1} one", replace, "this does contain FIRST one"); testReferenceReplace("this does contain @{1} one", replace, "this does contain FIRST one"); testReferenceReplace("and also this one (?(@{2})...) @{3} does", replace, "and also this one (?(SECOND)...) THIRD does"); testReferenceReplace( "and (@{1}|@{3}@{1}) also this one (?((?(@{2})foo|bar))...) (@{3}) does", replace, "and (FIRST|THIRDFIRST) also this one (?((?(SECOND)foo|bar))...) 
(THIRD) does"); // we test replacement when what we replace might be a special char in regex syntax replace[0] = "|"; replace[1] = "$"; replace[2] = "{"; replace[3] = "="; testReferenceReplace("Here are special chars: @{1} @{2} @{3} @{4}", replace, "Here are special chars: \\| \\$ \\{ ="); // now test substitutions using match results boost::regex test_regex("--\\[(=*)\\["); string to_match = "--[["; string to_substitute = "]]"; // between the ] ] regex_match_results what; result += assertEquals(true, boost::regex_search(to_match, what, test_regex)); // no substitution must take place testReferenceReplace(to_substitute, what, "]]"); // skip one char and replace with the first subexp to_substitute = "]@{1}]"; // between the ] ] what = regex_match_results(); to_match = "--[=["; result += assertEquals(true, boost::regex_search(to_match, what, test_regex)); testReferenceReplace(to_substitute, what, "]=]"); test_regex = boost::regex("--\\[(=*)\\[(-*)\\["); what = regex_match_results(); to_substitute = "]@{1}]@{2}]"; to_match = "--[=[-["; result += assertEquals(true, boost::regex_search(to_match, what, test_regex)); testReferenceReplace(to_substitute, what, "]=]-]"); what = regex_match_results(); to_match = "--[=[-["; to_substitute = "](?(@{1})@{1}|@{2})]@{2}]"; result += assertEquals(true, boost::regex_search(to_match, what, test_regex)); testReferenceReplace(to_substitute, what, "](?(=)=|-)]-]"); what = regex_match_results(); to_match = "--[=[["; result += assertEquals(true, boost::regex_search(to_match, what, test_regex)); testReferenceReplace(to_substitute, what, "](?(=)=|)]]"); // check StringDef for backreferences StringDef s1("foo"); StringDef s2("bar"); result += assertTrue(!s1.isBackRef()); s1 = StringDef(""); s1.setBackRef(true); result += assertTrue(s1.isBackRef()); result += assertTrue(!s2.isBackRef()); StringDef *conc = StringDef::concat(&s1, &s2); result += assertTrue(conc->isBackRef()); return result; }