Пример #1
0
/** Some squash states are clearly not advantageous in the NFA, as they do
 * incur the cost of an exception:
 * -# acyclic states
 * -# squash only a few acyclic states
 */
void filterSquashers(const NGHolder &g,
                     map<NFAVertex, NFAStateSet> &squash) {
    DEBUG_PRINTF("filtering\n");
    map<u32, NFAVertex> rev; /* vertex_index -> vertex */
    for (auto v : vertices_range(g)) {
        rev[g[v].index] = v;
    }

    for (auto v : vertices_range(g)) {
        if (!contains(squash, v)) {
            continue;
        }
        DEBUG_PRINTF("looking at squash set for vertex %u\n",
                     g[v].index);

        if (!hasSelfLoop(v, g)) {
            DEBUG_PRINTF("acyclic\n");
            squash.erase(v);
            continue;
        }

        NFAStateSet squashed = squash[v];
        squashed.flip(); /* default sense for mask of survivors */
        for (NFAStateSet::size_type sq = squashed.find_first();
             sq != squashed.npos; sq = squashed.find_next(sq)) {
            NFAVertex u = rev[sq];
            if (hasSelfLoop(u, g)) {
                DEBUG_PRINTF("squashing a cyclic (%zu) is always good\n", sq);
                goto next_vertex;
            }
        }

        if (squashed.count() < MIN_PURE_ACYCLIC_SQUASH) {
            DEBUG_PRINTF("squash set too small\n");
            squash.erase(v);
            continue;
        }

    next_vertex:;
        DEBUG_PRINTF("squash set ok\n");
    }
}
Пример #2
0
bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
                       const ue2::unordered_map<NFAVertex, u32> &region_map,
                       smgb_cache &cache) {
    /* Need to ensure all matches of the graph g up to u contain no infixes
     * which are also matches of the graph to u.
     *
     * This is basically the same as firstMatchIsFirst except we g is not
     * always a dag. As we haven't gotten around to writing an execute_graph
     * that operates on general graphs, we take some (hopefully) conservative
     * short cuts.
     *
     * Note: if the u can be jumped we will take jump edges
     * into account as a possibility of som going backwards
     *
     * TODO: write a generalised ng_execute_graph/make this less hacky
     */
    assert(&g == &cache.g);
    if (contains(cache.smgb, u)) {
        return cache.smgb[u];
    }

    DEBUG_PRINTF("checking if som can go backwards on %u\n",
                  g[u].index);

    set<NFAEdge> be;
    BackEdges<set<NFAEdge>> backEdgeVisitor(be);
    depth_first_search(
        g.g, visitor(backEdgeVisitor)
                 .root_vertex(g.start)
                 .vertex_index_map(get(&NFAGraphVertexProps::index, g.g)));

    bool rv;
    if (0) {
    exit:
        DEBUG_PRINTF("using cached result\n");
        cache.smgb[u] = rv;
        return rv;
    }

    assert(contains(region_map, u));
    const u32 u_region = region_map.at(u);

    for (const auto &e : be) {
        NFAVertex s = source(e, g);
        NFAVertex t = target(e, g);
        /* only need to worry about big cycles including/before u */
        DEBUG_PRINTF("back edge %u %u\n", g[s].index,
                      g[t].index);
        if (s != t && region_map.at(s) <= u_region) {
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy;
    NGHolder c_g;
    cloneHolder(c_g, g, &orig_to_copy);

    for (NFAVertex v : vertices_range(g)) {
        if (!is_virtual_start(v, g)) {
            continue;
        }
        NFAVertex c_v = orig_to_copy[v];
        orig_to_copy[v] = c_g.startDs;
        for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) {
            add_edge_if_not_present(c_g.startDs, c_w, c_g);
        }
        clear_vertex(c_v, c_g);
    }

    NFAVertex c_u = orig_to_copy[u];
    clear_in_edges(c_g.acceptEod, c_g);
    add_edge(c_g.accept, c_g.acceptEod, c_g);
    clear_in_edges(c_g.accept, c_g);
    clear_out_edges(c_u, c_g);
    if (hasSelfLoop(u, g)) {
        add_edge(c_u, c_u, c_g);
    }
    add_edge(c_u, c_g.accept, c_g);

    set<NFAVertex> u_succ;
    insert(&u_succ, adjacent_vertices(u, g));
    u_succ.erase(u);

    for (auto t : inv_adjacent_vertices_range(u, g)) {
        if (t == u) {
            continue;
        }
        for (auto v : adjacent_vertices_range(t, g)) {
            if (contains(u_succ, v)) {
                add_edge(orig_to_copy[t], c_g.accept, c_g);
                break;
            }
        }
    }

    pruneUseless(c_g);

    be.clear();
    depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start).
                       vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g)));

    for (const auto &e : be) {
        NFAVertex s = source(e, c_g);
        NFAVertex t = target(e, c_g);
        DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index);
        if (s != t) {
            assert(0);
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    DEBUG_PRINTF("checking acyclic+selfloop graph\n");

    rv = !firstMatchIsFirst(c_g);
    DEBUG_PRINTF("som may regress? %d\n", (int)rv);
    goto exit;
}
Пример #3
0
map<NFAVertex, NFAStateSet> findSquashers(const NGHolder &g, som_type som) {
    map<NFAVertex, NFAStateSet> squash;

    // Number of bits to use for all our masks. If we're a triggered graph,
    // tops have already been assigned, so we don't have to account for them.
    const u32 numStates = num_vertices(g);

    // Build post-dominator tree.
    PostDomTree pdom_tree;
    buildPDomTree(g, pdom_tree);

    // Build list of vertices by state ID and a set of init states.
    vector<NFAVertex> vByIndex(numStates, NFAGraph::null_vertex());
    NFAStateSet initStates(numStates);
    smgb_cache cache(g);

    // Mappings used for SOM mode calculations, otherwise left empty.
    unordered_map<NFAVertex, u32> region_map;
    vector<DepthMinMax> som_depths;
    if (som) {
        region_map = assignRegions(g);
        som_depths = getDistancesFromSOM(g);
    }

    for (auto v : vertices_range(g)) {
        const u32 vert_id = g[v].index;
        DEBUG_PRINTF("vertex %u/%u\n", vert_id, numStates);
        assert(vert_id < numStates);
        vByIndex[vert_id] = v;

        if (is_any_start(v, g) || !in_degree(v, g)) {
            initStates.set(vert_id);
        }
    }

    for (u32 i = 0; i < numStates; i++) {
        NFAVertex v = vByIndex[i];
        assert(v != NFAGraph::null_vertex());
        const CharReach &cr = g[v].char_reach;

        /* only non-init cyclics can be squashers */
        if (!hasSelfLoop(v, g) || initStates.test(i)) {
            continue;
        }

        DEBUG_PRINTF("state %u is cyclic\n", i);

        NFAStateSet mask(numStates), succ(numStates), pred(numStates);
        buildSquashMask(mask, g, v, cr, initStates, vByIndex, pdom_tree, som,
                        som_depths, region_map, cache);
        buildSucc(succ, g, v);
        buildPred(pred, g, v);
        const auto &reports = g[v].reports;

        for (size_t j = succ.find_first(); j != succ.npos;
             j = succ.find_next(j)) {
            NFAVertex vj = vByIndex[j];
            NFAStateSet pred2(numStates);
            buildPred(pred2, g, vj);
            if (pred2 == pred) {
                DEBUG_PRINTF("adding the sm from %zu to %u's sm\n", j, i);
                NFAStateSet tmp(numStates);
                buildSquashMask(tmp, g, vj, cr, initStates, vByIndex, pdom_tree,
                                som, som_depths, region_map, cache);
                mask &= tmp;
            }
        }

        for (size_t j = pred.find_first(); j != pred.npos;
             j = pred.find_next(j)) {
            NFAVertex vj = vByIndex[j];
            NFAStateSet succ2(numStates);
            buildSucc(succ2, g, vj);
            /* we can use j as a basis for squashing if its succs are a subset
             * of ours */
            if ((succ2 & ~succ).any()) {
                continue;
            }

            if (som) {
                /* We cannot use j to add to the squash mask of v if it may
                 * have an earlier start of match offset. ie for us j as a
                 * basis for the squash mask of v we require:
                 * maxSomDist(j) <= minSomDist(v)
                 */

                /* ** TODO ** */

                const depth &max_som_dist_j =
                    som_depths[g[vj].index].max;
                const depth &min_som_dist_v =
                    som_depths[g[v].index].min;
                if (max_som_dist_j > min_som_dist_v ||
                    max_som_dist_j.is_infinite()) {
                    /* j can't be used as it may be storing an earlier SOM */
                    continue;
                }
            }

            const CharReach &crv = g[vj].char_reach;

            /* we also require that j's report information be a subset of ours
             */
            bool seen_special = false;
            for (auto w : adjacent_vertices_range(vj, g)) {
                if (is_special(w, g)) {
                    if (!edge(v, w, g).second) {
                        goto next_j;
                    }
                    seen_special = true;
                }
            }

            // FIXME: should be subset check?
            if (seen_special && g[vj].reports != reports) {
                continue;
            }

            /* ok we can use j */
            if ((crv & ~cr).none()) {
                NFAStateSet tmp(numStates);
                buildSquashMask(tmp, g, vj, cr, initStates, vByIndex, pdom_tree,
                                som, som_depths, region_map, cache);
                mask &= tmp;
                mask.reset(j);
            }

        next_j:;
        }

        mask.set(i); /* never clear ourselves */

        if ((~mask).any()) { // i.e. some bits unset in mask
            DEBUG_PRINTF("%u squashes %zu other states\n", i, (~mask).count());
            squash.emplace(v, mask);
        }
    }

    findDerivedSquashers(g, vByIndex, pdom_tree, initStates, &squash, som,
                         som_depths, region_map, cache);

    clearMutualSquashers(g, vByIndex, squash);

    return squash;
}
Пример #4
0
static
void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) {
    set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */
    for (auto v : inv_adjacent_vertices_range(h.accept, h)) {
        const CharReach &cr = h[v].char_reach;
        if (!isutf8ascii(cr) && !isutf8start(cr)) {
            bad.insert(v);
        }
    }

    for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) {
        const CharReach &cr = h[v].char_reach;
        if (!isutf8ascii(cr) && !isutf8start(cr)) {
            bad.insert(v);
        }
    }

    // we want to be careful with asserts connected to starts
    // as well as they may not finish a code point
    for (auto v : vertices_range(h)) {
        if (is_virtual_start(v, h)) {
            bad.insert(v);
            insert(&bad, adjacent_vertices(v, h));
        }
    }

    /* we cannot handle vertices connected to accept as would report matches in
     * the middle of codepoints. acceptEod is not a problem as the input must
     * end at a codepoint boundary */
    bad.insert(h.accept);

    // If we're in SOM mode, we don't want to mess with vertices that have a
    // direct edge from startDs.
    if (som) {
        insert(&bad, adjacent_vertices(h.startDs, h));
    }

    set<NFAVertex> already_seeds; /* already marked as seeds */
    for (auto v : vertices_range(h)) {
        const CharReach &cr = h[v].char_reach;

        if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) {
            continue;
        }

        if (hasSuccInSet(h, v, bad)) {
            continue;
        }

        // Skip vertices that are directly connected to other vertices already
        // in the seeds list: we can't collapse two of these directly next to
        // each other.
        if (hasPredInSet(h, v, already_seeds) ||
            hasSuccInSet(h, v, already_seeds)) {
            continue;
        }

        DEBUG_PRINTF("%zu is a seed\n", h[v].index);
        seeds->push_back(v);
        already_seeds.insert(v);
    }
}