Beispiel #1
0
/**
 * Returns the minimum width in bytes of an input that will match the given
 * graph.
 *
 * The width is computed separately from h.start and from h.startDs; whichever
 * of the two is reachable is returned, or the smaller one when both are.
 * Debug builds assert that the returned depth is finite.
 */
depth findMinWidth(const NGHolder &h) {
    depth fromStart = findMinWidth(h, h.start);
    depth fromStartDs = findMinWidth(h, h.startDs);
    DEBUG_PRINTF("startDepth=%s, dotstarDepth=%s\n", fromStart.str().c_str(),
                 fromStartDs.str().c_str());

    // Only one of the two starts leads anywhere: use the other one.
    if (fromStart.is_unreachable()) {
        assert(fromStartDs.is_finite());
        return fromStartDs;
    }
    if (fromStartDs.is_unreachable()) {
        assert(fromStart.is_finite());
        return fromStart;
    }

    // Both starts are reachable: the shorter path wins.
    depth best = min(fromStart, fromStartDs);
    assert(best.is_finite());
    return best;
}
Beispiel #2
0
void SmallWriteBuildImpl::add(const NGWrapper &w) {
    // If the graph is poisoned (i.e. we can't build a SmallWrite version),
    // we don't even try.
    if (poisoned) {
        return;
    }

    if (w.som || w.min_length || isVacuous(w)) { /* cannot support in smwr */
        poisoned = true;
        return;
    }

    DEBUG_PRINTF("w=%p\n", &w);

    // make a copy of the graph so that we can modify it for our purposes
    unique_ptr<NGHolder> h = cloneHolder(w);

    reduceGraph(*h, SOM_NONE, w.utf8, cc);

    // If the earliest match location is outside the small write region,
    // then we don't need to build a SmallWrite version.
    // However, we don't poison this case either, since it is simply a case,
    // where we know the resulting graph won't match.
    if (findMinWidth(*h) > depth(cc.grey.smallWriteLargestBuffer)) {
        return;
    }

    // Now we can actually build the McClellan DFA
    assert(h->kind == NFA_OUTFIX);
    auto r = buildMcClellan(*h, &rm, cc.grey);

    // If we couldn't build a McClellan DFA for this portion, we won't be able
    // build a smwr which represents the pattern set
    if (!r) {
        DEBUG_PRINTF("failed to determinise\n");
        poisoned = true;
        return;
    }

    prune_overlong(*r, cc.grey.smallWriteLargestBuffer);

    if (rdfa) {
        // do a merge of the new dfa with the existing dfa
        auto merged = mergeTwoDfas(rdfa.get(), r.get(), DFA_MERGE_MAX_STATES,
                                   &rm, cc.grey);
        if (!merged) {
            DEBUG_PRINTF("merge failed\n");
            poisoned = true;
            return;
        }
        DEBUG_PRINTF("merge succeeded, built %p\n", merged.get());
        rdfa = move(merged);
    } else {
        rdfa = move(r);
    }
}
Beispiel #3
0
/**
 * Validates and applies the extended parameters (min_offset, max_offset,
 * min_length) carried by g, rewriting the graph where a constraint can be
 * expressed structurally and dropping constraints that are redundant.
 *
 * The function returns early at several points as soon as hasExtParams(g)
 * reports that nothing is left to handle. Otherwise it finishes by calling
 * updateReportBounds() for both accept vertices.
 *
 * \param rm report manager, passed through to the helpers that inspect or
 *           update reports
 * \param g  graph wrapper; its edges, reports and its min_offset / max_offset /
 *           min_length fields may be modified
 * \param cc compile context (marked UNUSED; not read here)
 *
 * \throws CompileError when the constraints cannot be satisfied: an anchored
 *         expression too short for min_offset, a minimum width larger than
 *         max_offset, a maximum match length below min_length, or no edges
 *         left to accept after pruning.
 */
void handleExtendedParams(ReportManager &rm, NGWrapper &g,
                          UNUSED const CompileContext &cc) {
    if (!hasExtParams(g)) {
        return;
    }

    depth minWidth = findMinWidth(g);
    depth maxWidth = findMaxWidth(g);
    // Treated as anchored when startDs has no proper successor and start has
    // at least one out-edge.
    bool is_anchored = !has_proper_successor(g.startDs, g)
                     && out_degree(g.start, g);
    bool has_offset_adj = hasOffsetAdjustments(rm, g);

    DEBUG_PRINTF("minWidth=%s, maxWidth=%s, anchored=%d, offset_adj=%d\n",
                 minWidth.str().c_str(), maxWidth.str().c_str(), is_anchored,
                 has_offset_adj);

    DepthMinMax match_depths = findMatchLengths(rm, g);
    DEBUG_PRINTF("match depths %s\n", match_depths.str().c_str());

    // Reject: anchored pattern whose longest match ends before min_offset.
    if (is_anchored && maxWidth.is_finite() && g.min_offset > maxWidth) {
        ostringstream oss;
        oss << "Expression is anchored and cannot satisfy min_offset="
            << g.min_offset << " as it can only produce matches of length "
            << maxWidth << " bytes at most.";
        throw CompileError(g.expressionIndex, oss.str());
    }

    // Reject: even the shortest match would end after max_offset.
    if (minWidth > g.max_offset) {
        ostringstream oss;
        oss << "Expression has max_offset=" << g.max_offset << " but requires "
             << minWidth << " bytes to match.";
        throw CompileError(g.expressionIndex, oss.str());
    }

    // Reject: the longest possible match is shorter than min_length.
    // NOTE(review): the guard tests maxWidth for finiteness while the
    // comparison uses match_depths.max -- presumably the two are finite
    // together; confirm.
    if (maxWidth.is_finite() && match_depths.max < g.min_length) {
        ostringstream oss;
        oss << "Expression has min_length=" << g.min_length << " but can "
            "only produce matches of length " << match_depths.max <<
            " bytes at most.";
        throw CompileError(g.expressionIndex, oss.str());
    }

    // Every match is already at least min_length long, so drop the constraint.
    if (g.min_length && g.min_length <= match_depths.min) {
        DEBUG_PRINTF("min_length=%llu constraint is unnecessary\n",
                     g.min_length);
        g.min_length = 0;
    }

    // Clearing min_length above may have removed the last extended parameter.
    if (!hasExtParams(g)) {
        return;
    }

    pruneVacuousEdges(g);
    pruneUnmatchable(g, rm);

    // pruneExtUnreachable() is skipped when reports carry offset adjustments.
    if (!has_offset_adj) {
        pruneExtUnreachable(g);
    }

    // We may have removed all the edges to accept, in which case this
    // expression cannot match.
    // NOTE(review): acceptEod is expected to keep exactly one in-edge here --
    // presumably the structural accept -> acceptEod edge; confirm.
    if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) {
        throw CompileError(g.expressionIndex, "Extended parameter "
                "constraints can not be satisfied for any match from "
                "this expression.");
    }

    // Remove reports on vertices without an edge to accept (which have been
    // pruned above).
    clearReports(g);

    // Recalc.
    // The pruning above may have changed the graph, so the values computed at
    // the top of the function are refreshed before further use.
    minWidth = findMinWidth(g);
    maxWidth = findMaxWidth(g);
    is_anchored = proper_out_degree(g.startDs, g) == 0 &&
                  out_degree(g.start, g);
    has_offset_adj = hasOffsetAdjustments(rm, g);

    // If the pattern is completely anchored and has a min_length set, this can
    // be converted to a min_offset.
    if (g.min_length && (g.min_offset <= g.min_length) && is_anchored) {
        DEBUG_PRINTF("converting min_length to min_offset=%llu for "
                     "anchored case\n", g.min_length);
        g.min_offset = g.min_length;
        g.min_length = 0;
    }

    // No match can end before minWidth, so a min_offset at or below it adds
    // nothing (only when no offset adjustments are present).
    if (g.min_offset && g.min_offset <= minWidth && !has_offset_adj) {
        DEBUG_PRINTF("min_offset=%llu constraint is unnecessary\n",
                     g.min_offset);
        g.min_offset = 0;
    }

    if (!hasExtParams(g)) {
        return;
    }

    // If the pattern has a min_length and is of "ratchet" form with one
    // unbounded repeat, that repeat can become a bounded repeat.
    // e.g. /foo.*bar/{min_length=100} --> /foo.{94,}bar/
    if (g.min_length && transformMinLengthToRepeat(rm, g)) {
        DEBUG_PRINTF("converted min_length to bounded repeat\n");
        // recalc
        minWidth = findMinWidth(g);
    }

    // If the pattern is unanchored, has a max_offset and has not asked for
    // SOM, we can use that knowledge to anchor it which will limit its
    // lifespan. Note that we can't use this transformation if there's a
    // min_length, as it's currently handled using "sly SOM".

    // Note that it is possible to handle graphs that have a combination of
    // anchored and unanchored paths, but it's too tricky for the moment.

    if (g.max_offset != MAX_OFFSET && !g.som && !g.min_length &&
                !has_offset_adj && isUnanchored(g)) {
        if (anchorPatternWithBoundedRepeat(g, minWidth, maxWidth)) {
            DEBUG_PRINTF("minWidth=%s, maxWidth=%s\n", minWidth.str().c_str(),
                         maxWidth.str().c_str());
            if (minWidth == maxWidth) {
                // For a fixed width pattern, we can retire the offsets as they
                // are implicit in the graph now.
                g.min_offset = 0;
                g.max_offset = MAX_OFFSET;
            }
        }
    }
    //dumpGraph("final.dot", g.g);

    if (!hasExtParams(g)) {
        return;
    }

    // Whatever extended parameters remain are applied through
    // updateReportBounds(), once per accept vertex; 'done' is shared between
    // the two calls.
    set<NFAVertex> done;
    updateReportBounds(rm, g, g.accept, done);
    updateReportBounds(rm, g, g.acceptEod, done);
}
Beispiel #4
0
/**
 * Returns the minimum width for graph h with respect to the given top, by
 * delegating to the filter-based overload with a SpecialEdgeFilter built from
 * h and top.
 */
depth findMinWidth(const NGHolder &h, u32 top) {
    const SpecialEdgeFilter topFilter(h, top);
    return findMinWidth(h, topFilter);
}
Beispiel #5
0
/**
 * Returns the minimum width for graph h, by delegating to the filter-based
 * overload with a SpecialEdgeFilter built from h alone.
 */
depth findMinWidth(const NGHolder &h) {
    const SpecialEdgeFilter edgeFilter(h);
    return findMinWidth(h, edgeFilter);
}
Beispiel #6
0
/**
 * Returns the maximum width of a match starting from src, considering only
 * the edges of h that pass the given filter.
 *
 * The result is unreachable when src is a leaf node, and infinity when a cycle
 * is reachable from src. Otherwise the longest path to either accept vertex is
 * found by running a DAG shortest-path search with every edge weighted -1.
 */
static
depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter,
                   NFAVertex src) {
    if (isLeafNode(src, h.g)) {
        return depth::unreachable();
    }

    // A cycle reachable from src means matches can be arbitrarily long.
    if (hasReachableCycle(h, src)) {
        return depth::infinity();
    }

    boost::filtered_graph<NFAGraph, SpecialEdgeFilter> filtered(h.g, filter);

    assert(hasCorrectlyNumberedVertices(h));
    const size_t vertexCount = num_vertices(h);
    vector<int> dist(vertexCount);
    vector<boost::default_color_type> colour(vertexCount);

    auto idx_map = get(&NFAGraphVertexProps::index, filtered);

    // Shortest paths over weights of -1 are longest paths in edge count.
    dag_shortest_paths(
        filtered, src,
        distance_map(make_iterator_property_map(dist.begin(), idx_map))
            .weight_map(boost::make_constant_property<NFAEdge>(-1))
            .vertex_index_map(idx_map)
            .color_map(make_iterator_property_map(colour.begin(), idx_map)));

    // A vertex left white was never visited by the search; otherwise the
    // stored (negative) distance is negated to give a path length.
    const auto longestTo = [&](size_t vertexIdx) {
        depth len = depth::unreachable();
        if (colour.at(vertexIdx) != boost::white_color) {
            len = -1 * dist.at(vertexIdx);
        }
        return len;
    };

    depth acceptDepth = longestTo(NODE_ACCEPT);
    depth acceptEodDepth = longestTo(NODE_ACCEPT_EOD);

    // Take whichever accept is reachable, or the larger when both are.
    depth d = acceptDepth;
    if (acceptDepth.is_unreachable()) {
        d = acceptEodDepth;
    } else if (!acceptEodDepth.is_unreachable()) {
        d = max(acceptDepth, acceptEodDepth);
    }

    if (!d.is_unreachable()) {
        // Subtract one for the start transition.
        assert(d.is_finite() && d > depth(0));
        return d - depth(1);
    }

    // If we're actually reachable, we'll have a min width, so we can return
    // infinity in this case.
    if (findMinWidth(h, filter, src).is_reachable()) {
        return depth::infinity();
    }
    return d;
}