Exemplo n.º 1
0
void SmallWriteBuildImpl::add(const NGWrapper &w) {
    // If the graph is poisoned (i.e. we can't build a SmallWrite version),
    // we don't even try.
    if (poisoned) {
        return;
    }

    if (w.som || w.min_length || isVacuous(w)) { /* cannot support in smwr */
        poisoned = true;
        return;
    }

    DEBUG_PRINTF("w=%p\n", &w);

    // make a copy of the graph so that we can modify it for our purposes
    unique_ptr<NGHolder> h = cloneHolder(w);

    reduceGraph(*h, SOM_NONE, w.utf8, cc);

    // If the earliest match location is outside the small write region,
    // then we don't need to build a SmallWrite version.
    // However, we don't poison this case either, since it is simply a case,
    // where we know the resulting graph won't match.
    if (findMinWidth(*h) > depth(cc.grey.smallWriteLargestBuffer)) {
        return;
    }

    // Now we can actually build the McClellan DFA
    assert(h->kind == NFA_OUTFIX);
    auto r = buildMcClellan(*h, &rm, cc.grey);

    // If we couldn't build a McClellan DFA for this portion, we won't be able
    // build a smwr which represents the pattern set
    if (!r) {
        DEBUG_PRINTF("failed to determinise\n");
        poisoned = true;
        return;
    }

    prune_overlong(*r, cc.grey.smallWriteLargestBuffer);

    if (rdfa) {
        // do a merge of the new dfa with the existing dfa
        auto merged = mergeTwoDfas(rdfa.get(), r.get(), DFA_MERGE_MAX_STATES,
                                   &rm, cc.grey);
        if (!merged) {
            DEBUG_PRINTF("merge failed\n");
            poisoned = true;
            return;
        }
        DEBUG_PRINTF("merge succeeded, built %p\n", merged.get());
        rdfa = move(merged);
    } else {
        rdfa = move(r);
    }
}
Exemplo n.º 2
0
/**
 * True if the graphs have mergeable starts.
 *
 * Nowadays, this means that any vacuous edges must have the same tops. In
 * addition, mixed-accept cases need to have matching reports.
 */
static
bool mergeableStarts(const NGHolder &h1, const NGHolder &h2) {
    if (!isVacuous(h1) || !isVacuous(h2)) {
        return true;
    }

    // Vacuous edges from startDs should not occur: we have better ways to
    // implement true dot-star relationships. Just in case they do, ban them
    // from being merged unless they have identical reports.
    if (is_match_vertex(h1.startDs, h1) || is_match_vertex(h2.startDs, h2)) {
        assert(0);
        return false;
    }

    // If both graphs have edge (start, accept), the tops must match.
    auto e1_accept = edge(h1.start, h1.accept, h1);
    auto e2_accept = edge(h2.start, h2.accept, h2);
    if (e1_accept.second && e2_accept.second &&
        h1[e1_accept.first].top != h2[e2_accept.first].top) {
        return false;
    }

    // If both graphs have edge (start, acceptEod), the tops must match.
    auto e1_eod = edge(h1.start, h1.acceptEod, h1);
    auto e2_eod = edge(h2.start, h2.acceptEod, h2);
    if (e1_eod.second && e2_eod.second &&
        h1[e1_eod.first].top != h2[e2_eod.first].top) {
        return false;
    }

    // If one graph has an edge to accept and the other has an edge to
    // acceptEod, the reports must match for the merge to be safe.
    if ((e1_accept.second && e2_eod.second) ||
        (e2_accept.second && e1_eod.second)) {
        if (h1[h1.start].reports != h2[h2.start].reports) {
            return false;
        }
    }

    return true;
}
Exemplo n.º 3
0
/**
 * Anchor an unanchored pattern by prepending a bounded run of dots, i.e.
 * rewriting it as ^.{min_bound,max_bound} followed by the original pattern.
 *
 * This applies to patterns that have a max_offset and have not asked for SOM;
 * anchoring them limits their lifespan. Callers must not use this
 * transformation when a min_length is present, as that is currently handled
 * using "sly SOM".
 *
 * Graphs that mix anchored and unanchored paths could in principle be
 * handled, but that is too tricky for the moment.
 *
 * \param g        graph to rewrite in place.
 * \param minWidth minimum width of the pattern.
 * \param maxWidth maximum width of the pattern.
 * \return true if the graph was rewritten, false if it was left alone.
 */
static
bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth,
                                    const depth &maxWidth) {
    assert(!g.som);
    assert(g.max_offset != MAX_OFFSET);
    assert(minWidth <= maxWidth);
    assert(maxWidth.is_reachable());

    DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n",
                 minWidth.str().c_str(), maxWidth.str().c_str(), g.min_offset,
                 g.max_offset);

    // Offsets above this threshold are not anchored.
    if (g.max_offset > MAX_MAXOFFSET_TO_ANCHOR) {
        return false;
    }

    // A max_offset smaller than the minimum width is not expected here.
    if (g.max_offset < minWidth) {
        assert(0);
        return false;
    }

    // Patterns with virtual starts are probably best left untouched.
    if (hasVirtualStarts(g)) {
        DEBUG_PRINTF("virtual starts, bailing\n");
        return false;
    }

    // Vacuous patterns are skipped too. TODO: this could be done, we would
    // just need to be a little careful with reports.
    if (isVacuous(g)) {
        DEBUG_PRINTF("vacuous, bailing\n");
        return false;
    }

    // The lower bound is only non-zero when the maximum width is not infinite
    // and min_offset exceeds it; the upper bound is the same in every case.
    u32 min_bound = 0;
    if (!maxWidth.is_infinite() && g.min_offset > maxWidth) {
        min_bound = g.min_offset - maxWidth;
    }
    u32 max_bound = g.max_offset - minWidth;

    DEBUG_PRINTF("prepending ^.{%u,%u}\n", min_bound, max_bound);

    // Collect every successor of startDs other than startDs itself.
    vector<NFAVertex> initials;
    for (auto succ : adjacent_vertices_range(g.startDs, g)) {
        if (succ != g.startDs) {
            initials.push_back(succ);
        }
    }
    if (initials.empty()) {
        DEBUG_PRINTF("no initial vertices\n");
        return false;
    }

    // Creates a new vertex that accepts any character.
    auto make_dot = [&g]() -> NFAVertex {
        NFAVertex dot = add_vertex(g);
        g[dot].char_reach.setall();
        return dot;
    };

    // Chain min_bound mandatory dots off the anchored start.
    NFAVertex tail = g.start;
    for (u32 made = 0; made < min_bound; made++) {
        NFAVertex dot = make_dot();
        add_edge(tail, dot, g);
        tail = dot;
    }

    NFAVertex head = tail;

    // Append (max_bound - min_bound) optional dots. Each one is reachable
    // from its predecessor in the chain and also directly from head.
    const u32 num_optional = max_bound - min_bound;
    for (u32 made = 0; made < num_optional; made++) {
        NFAVertex dot = make_dot();
        if (head != tail) {
            add_edge(head, dot, g);
        }
        add_edge(tail, dot, g);
        tail = dot;
    }

    // Detach the initials from both starts and hang them off head and the end
    // of the dot chain instead (just one edge when the two coincide).
    const bool distinct_ends = head != tail;
    for (NFAVertex target : initials) {
        remove_edge(g.startDs, target, g);
        remove_edge(g.start, target, g);

        if (distinct_ends) {
            add_edge(head, target, g);
        }
        add_edge(tail, target, g);
    }

    g.renumberVertices();
    g.renumberEdges();

    return true;
}