/** Returns the minimum width in bytes of an input that will match the given
 * graph. */
static
depth findMinWidth(const NGHolder &h, const SpecialEdgeFilter &filter) {
    depth startDepth = findMinWidth(h, filter, h.start);
    depth dotstarDepth = findMinWidth(h, filter, h.startDs);
    DEBUG_PRINTF("startDepth=%s, dotstarDepth=%s\n", startDepth.str().c_str(),
                 dotstarDepth.str().c_str());

    if (startDepth.is_unreachable()) {
        assert(dotstarDepth.is_finite());
        return dotstarDepth;
    } else if (dotstarDepth.is_unreachable()) {
        assert(startDepth.is_finite());
        return startDepth;
    } else {
        assert(min(startDepth, dotstarDepth).is_finite());
        return min(startDepth, dotstarDepth);
    }
}
void SmallWriteBuildImpl::add(const NGWrapper &w) {
    // If the graph is poisoned (i.e. we can't build a SmallWrite version),
    // we don't even try.
    if (poisoned) {
        return;
    }

    if (w.som || w.min_length || isVacuous(w)) { /* cannot support in smwr */
        poisoned = true;
        return;
    }

    DEBUG_PRINTF("w=%p\n", &w);

    // make a copy of the graph so that we can modify it for our purposes
    unique_ptr<NGHolder> h = cloneHolder(w);

    reduceGraph(*h, SOM_NONE, w.utf8, cc);

    // If the earliest match location is outside the small write region,
    // then we don't need to build a SmallWrite version. However, we don't
    // poison this case either, since it is simply a case where we know the
    // resulting graph won't match.
    if (findMinWidth(*h) > depth(cc.grey.smallWriteLargestBuffer)) {
        return;
    }

    // Now we can actually build the McClellan DFA
    assert(h->kind == NFA_OUTFIX);
    auto r = buildMcClellan(*h, &rm, cc.grey);

    // If we couldn't build a McClellan DFA for this portion, we won't be able
    // to build a smwr which represents the pattern set
    if (!r) {
        DEBUG_PRINTF("failed to determinise\n");
        poisoned = true;
        return;
    }

    prune_overlong(*r, cc.grey.smallWriteLargestBuffer);

    if (rdfa) {
        // do a merge of the new dfa with the existing dfa
        auto merged = mergeTwoDfas(rdfa.get(), r.get(), DFA_MERGE_MAX_STATES,
                                   &rm, cc.grey);
        if (!merged) {
            DEBUG_PRINTF("merge failed\n");
            poisoned = true;
            return;
        }
        DEBUG_PRINTF("merge succeeded, built %p\n", merged.get());
        rdfa = move(merged);
    } else {
        rdfa = move(r);
    }
}
void handleExtendedParams(ReportManager &rm, NGWrapper &g,
                          UNUSED const CompileContext &cc) {
    if (!hasExtParams(g)) {
        return;
    }

    depth minWidth = findMinWidth(g);
    depth maxWidth = findMaxWidth(g);
    bool is_anchored = !has_proper_successor(g.startDs, g)
                       && out_degree(g.start, g);
    bool has_offset_adj = hasOffsetAdjustments(rm, g);

    DEBUG_PRINTF("minWidth=%s, maxWidth=%s, anchored=%d, offset_adj=%d\n",
                 minWidth.str().c_str(), maxWidth.str().c_str(), is_anchored,
                 has_offset_adj);

    DepthMinMax match_depths = findMatchLengths(rm, g);
    DEBUG_PRINTF("match depths %s\n", match_depths.str().c_str());

    if (is_anchored && maxWidth.is_finite() && g.min_offset > maxWidth) {
        ostringstream oss;
        oss << "Expression is anchored and cannot satisfy min_offset="
            << g.min_offset << " as it can only produce matches of length "
            << maxWidth << " bytes at most.";
        throw CompileError(g.expressionIndex, oss.str());
    }

    if (minWidth > g.max_offset) {
        ostringstream oss;
        oss << "Expression has max_offset=" << g.max_offset << " but requires "
            << minWidth << " bytes to match.";
        throw CompileError(g.expressionIndex, oss.str());
    }

    if (maxWidth.is_finite() && match_depths.max < g.min_length) {
        ostringstream oss;
        oss << "Expression has min_length=" << g.min_length << " but can "
               "only produce matches of length " << match_depths.max
            << " bytes at most.";
        throw CompileError(g.expressionIndex, oss.str());
    }

    if (g.min_length && g.min_length <= match_depths.min) {
        DEBUG_PRINTF("min_length=%llu constraint is unnecessary\n",
                     g.min_length);
        g.min_length = 0;
    }

    if (!hasExtParams(g)) {
        return;
    }

    pruneVacuousEdges(g);
    pruneUnmatchable(g, rm);

    if (!has_offset_adj) {
        pruneExtUnreachable(g);
    }

    // We may have removed all the edges to accept, in which case this
    // expression cannot match.
    if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) {
        throw CompileError(g.expressionIndex, "Extended parameter "
                           "constraints can not be satisfied for any match "
                           "from this expression.");
    }

    // Remove reports on vertices without an edge to accept (which have been
    // pruned above).
    clearReports(g);

    // Recalc.
    minWidth = findMinWidth(g);
    maxWidth = findMaxWidth(g);
    is_anchored = proper_out_degree(g.startDs, g) == 0 &&
                  out_degree(g.start, g);
    has_offset_adj = hasOffsetAdjustments(rm, g);

    // If the pattern is completely anchored and has a min_length set, this
    // can be converted to a min_offset.
    if (g.min_length && (g.min_offset <= g.min_length) && is_anchored) {
        DEBUG_PRINTF("converting min_length to min_offset=%llu for "
                     "anchored case\n", g.min_length);
        g.min_offset = g.min_length;
        g.min_length = 0;
    }

    if (g.min_offset && g.min_offset <= minWidth && !has_offset_adj) {
        DEBUG_PRINTF("min_offset=%llu constraint is unnecessary\n",
                     g.min_offset);
        g.min_offset = 0;
    }

    if (!hasExtParams(g)) {
        return;
    }

    // If the pattern has a min_length and is of "ratchet" form with one
    // unbounded repeat, that repeat can become a bounded repeat.
    // e.g. /foo.*bar/{min_length=100} --> /foo.{94,}bar/
    if (g.min_length && transformMinLengthToRepeat(rm, g)) {
        DEBUG_PRINTF("converted min_length to bounded repeat\n");
        // recalc
        minWidth = findMinWidth(g);
    }

    // If the pattern is unanchored, has a max_offset and has not asked for
    // SOM, we can use that knowledge to anchor it which will limit its
    // lifespan. Note that we can't use this transformation if there's a
    // min_length, as it's currently handled using "sly SOM".
    //
    // Note that it is possible to handle graphs that have a combination of
    // anchored and unanchored paths, but it's too tricky for the moment.
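    //
    // Rough illustration of this step (hypothetical values, not taken from
    // the code above): an unanchored /foo/ with max_offset=10 must finish
    // matching by offset 10, so it can be handled along the lines of the
    // anchored pattern /^.{0,7}foo/, whose state does not need to be kept
    // alive beyond that window.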
    if (g.max_offset != MAX_OFFSET && !g.som && !g.min_length &&
        !has_offset_adj && isUnanchored(g)) {
        if (anchorPatternWithBoundedRepeat(g, minWidth, maxWidth)) {
            DEBUG_PRINTF("minWidth=%s, maxWidth=%s\n", minWidth.str().c_str(),
                         maxWidth.str().c_str());
            if (minWidth == maxWidth) {
                // For a fixed width pattern, we can retire the offsets as
                // they are implicit in the graph now.
                g.min_offset = 0;
                g.max_offset = MAX_OFFSET;
            }
        }
    }

    //dumpGraph("final.dot", g.g);

    if (!hasExtParams(g)) {
        return;
    }

    set<NFAVertex> done;
    updateReportBounds(rm, g, g.accept, done);
    updateReportBounds(rm, g, g.acceptEod, done);
}
depth findMinWidth(const NGHolder &h, u32 top) {
    return findMinWidth(h, SpecialEdgeFilter(h, top));
}
depth findMinWidth(const NGHolder &h) {
    return findMinWidth(h, SpecialEdgeFilter(h));
}
static
depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter,
                   NFAVertex src) {
    if (isLeafNode(src, h.g)) {
        return depth::unreachable();
    }

    if (hasReachableCycle(h, src)) {
        // There's a cycle reachable from this src, so we have inf width.
        return depth::infinity();
    }

    boost::filtered_graph<NFAGraph, SpecialEdgeFilter> g(h.g, filter);

    assert(hasCorrectlyNumberedVertices(h));
    const size_t num = num_vertices(h);
    vector<int> distance(num);
    vector<boost::default_color_type> colors(num);

    auto index_map = get(&NFAGraphVertexProps::index, g);

    // DAG shortest paths with negative edge weights.
    dag_shortest_paths(
        g, src,
        distance_map(make_iterator_property_map(distance.begin(), index_map))
            .weight_map(boost::make_constant_property<NFAEdge>(-1))
            .vertex_index_map(index_map)
            .color_map(make_iterator_property_map(colors.begin(), index_map)));

    depth acceptDepth, acceptEodDepth;
    if (colors.at(NODE_ACCEPT) == boost::white_color) {
        acceptDepth = depth::unreachable();
    } else {
        acceptDepth = -1 * distance.at(NODE_ACCEPT);
    }
    if (colors.at(NODE_ACCEPT_EOD) == boost::white_color) {
        acceptEodDepth = depth::unreachable();
    } else {
        acceptEodDepth = -1 * distance.at(NODE_ACCEPT_EOD);
    }

    depth d;
    if (acceptDepth.is_unreachable()) {
        d = acceptEodDepth;
    } else if (acceptEodDepth.is_unreachable()) {
        d = acceptDepth;
    } else {
        d = max(acceptDepth, acceptEodDepth);
    }

    if (d.is_unreachable()) {
        // If we're actually reachable, we'll have a min width, so we can
        // return infinity in this case.
        if (findMinWidth(h, filter, src).is_reachable()) {
            return depth::infinity();
        }
        return d;
    }

    // Sign was already inverted above; subtract one for the start transition.
    assert(d.is_finite() && d > depth(0));
    return d - depth(1);
}
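// Note on findMaxWidth() above: once hasReachableCycle() has ruled out a
// reachable cycle, the filtered graph seen from src is a DAG, so running
// dag_shortest_paths() with a constant edge weight of -1 leaves each
// reachable vertex holding minus the edge count of the longest path from src.
// Negating that value recovers the longest path in edges; exactly one edge on
// an accepting path consumes no input (the code attributes it to the start
// transition), hence the final "d - depth(1)".
//
// Hypothetical worked example: for a path start -> v -> accept, the longest
// path to accept has two edges, so distance[NODE_ACCEPT] is -2, d becomes 2,
// and the function returns a width of 1 byte, as expected for a
// one-character pattern.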