Пример #1
0
// Returns one DepthMinMax per vertex of g_orig, stored at that vertex's index.
// Depths are computed with calcDepthsFrom() starting at 'start' on a scratch
// clone in which the successors of startDs and of every virtual start have
// been wired to start. startDs and virtual starts themselves are always
// reported with depth (0, 0).
vector<DepthMinMax> getDistancesFromSOM(const NGHolder &g_orig) {
    // All graph surgery happens on a clone so that the caller's graph is left
    // untouched. 'vmap' maps each vertex of g_orig to its twin in the clone.
    NGHolder g;
    ue2::unordered_map<NFAVertex, NFAVertex> vmap;
    cloneHolder(g, g_orig, &vmap);

    // Gather every virtual start of the clone, then startDs.
    vector<NFAVertex> start_like;
    for (auto v : vertices_range(g)) {
        if (!is_virtual_start(v, g)) {
            continue;
        }
        start_like.push_back(v);
    }
    start_like.push_back(g.startDs);

    // First pass: hook the successors of each of these vertices up to start.
    // This must complete for all of them before any in-edges are removed.
    for (auto v : start_like) {
        wireSuccessorsToStart(g, v);
    }

    // Second pass: strip the in-edges of these vertices so that they play no
    // part in the depth calculation.
    for (auto v : start_like) {
        clear_in_edges(v, g);
    }

    //dumpGraph("som_depth.dot", g.g);

    // Depths on the clone, addressed by vertex index in the clone.
    vector<DepthMinMax> clone_depths;
    calcDepthsFrom(g, g.start, clone_depths);

    // Result, addressed by vertex index in g_orig.
    vector<DepthMinMax> depths(num_vertices(g_orig));

    for (auto v_orig : vertices_range(g_orig)) {
        assert(contains(vmap, v_orig));
        NFAVertex v_clone = vmap[v_orig];

        u32 orig_idx = g_orig[v_orig].index;
        DepthMinMax &slot = depths.at(orig_idx);

        if (v_orig != g_orig.startDs && !is_virtual_start(v_orig, g_orig)) {
            // Ordinary vertex: copy across the depth found on the clone.
            u32 clone_idx = g[v_clone].index;
            slot = clone_depths.at(clone_idx);
        } else {
            // startDs and virtual starts are pinned to zero depth.
            slot = DepthMinMax(0, 0);
        }
    }

    return depths;
}
Пример #2
0
bool firstMatchIsFirst(const NGHolder &p) {
    /* A lock can only be created after p if the first match by end offset is
     * also the first match by start offset.
     *
     * Counter-example: 4009:/(foobar|ob).*bugger/s
     *
     * Races on the final byte do not matter, as they are easy to resolve at
     * runtime: /(foobar|obar).*hi/
     *
     * One match being a prefix of another is not a concern either, since both
     * then share the same start offset.
     *
     * So the case in which we cannot show that the som never regresses is
     * when the language of p contains two strings s1 and s2 such that s2 is a
     * proper infix of s1.
     *
     * It is tempting to further require that no prefix of s1 is in the
     * language of p (in which case the lock would presumably already have
     * been set). However, we cannot tell whether some characters can clear
     * the lock and therefore whether it is still set. TODO: if we knew the
     * lock's escapes, we could verify that the remainder of s1 does not clear
     * the lock. (1)
     */

    DEBUG_PRINTF("entry\n");

    /* Big cycles are not handled: give up immediately. */
    if (hasBigCycles(p)) {
        DEBUG_PRINTF("fail, big cycles\n");
        return false;
    }

    /* Switch on every non-special state; leaving the starts off avoids suffix
     * matches. */
    /* Were we doing (1), states leading to accepts would be left off as well,
       to avoid prefix matches. */
    set<NFAVertex> states;
    for (auto v : vertices_range(p)) {
        assert(!is_virtual_start(v, p));
        if (is_special(v, p)) {
            continue;
        }
        DEBUG_PRINTF("turning on %u\n", p[v].index);
        states.insert(v);
    }

    /* Run the prefix graph p over itself; 'states' is updated in place. */
    execute_graph(p, p, &states);

    for (auto v : states) {
        /* A state that is still on and has no edge to accept may represent an
         * infix match, which is exactly what we must rule out. */
        DEBUG_PRINTF("check %u\n", p[v].index);
        const bool leads_to_accept = edge(v, p.accept, p).second;
        if (leads_to_accept) {
            continue;
        }
        DEBUG_PRINTF("fail %u\n", p[v].index);
        return false;
    }

    DEBUG_PRINTF("done first is first check\n");
    return true;
}
Пример #3
0
bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
                       const ue2::unordered_map<NFAVertex, u32> &region_map,
                       smgb_cache &cache) {
    /* Need to ensure all matches of the graph g up to u contain no infixes
     * which are also matches of the graph to u.
     *
     * This is basically the same as firstMatchIsFirst except that g is not
     * always a dag. As we haven't gotten around to writing an execute_graph
     * that operates on general graphs, we take some (hopefully) conservative
     * short cuts.
     *
     * Note: if u can be jumped we will take jump edges into account as a
     * possibility of som going backwards.
     *
     * Returns true if som may go backwards (the conservative answer). The
     * result for each u is memoised in cache.smgb; cache must have been built
     * for this same graph g.
     *
     * TODO: write a generalised ng_execute_graph/make this less hacky
     */
    assert(&g == &cache.g);

    // Cache hit: a single lookup, and the only path on which a previously
    // stored answer is returned.
    auto cached = cache.smgb.find(u);
    if (cached != cache.smgb.end()) {
        DEBUG_PRINTF("using cached result\n");
        return cached->second;
    }

    // Every computed answer leaves through this helper so that it is always
    // recorded in the cache before being returned.
    const auto remember = [&cache, u](bool may_regress) {
        DEBUG_PRINTF("caching result\n");
        cache.smgb[u] = may_regress;
        return may_regress;
    };

    DEBUG_PRINTF("checking if som can go backwards on %u\n",
                  g[u].index);

    // Collect the back edges of g as seen by a DFS rooted at start.
    set<NFAEdge> be;
    BackEdges<set<NFAEdge>> backEdgeVisitor(be);
    depth_first_search(
        g.g, visitor(backEdgeVisitor)
                 .root_vertex(g.start)
                 .vertex_index_map(get(&NFAGraphVertexProps::index, g.g)));

    assert(contains(region_map, u));
    const u32 u_region = region_map.at(u);

    for (const auto &e : be) {
        NFAVertex s = source(e, g);
        NFAVertex t = target(e, g);
        /* only need to worry about big cycles including/before u */
        DEBUG_PRINTF("back edge %u %u\n", g[s].index,
                      g[t].index);
        if (s != t && region_map.at(s) <= u_region) {
            DEBUG_PRINTF("eek big cycle\n");
            return remember(true); /* big cycle -> eek */
        }
    }

    // Work on a clone from here on; orig_to_copy maps vertices of g to the
    // corresponding vertices of c_g.
    ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy;
    NGHolder c_g;
    cloneHolder(c_g, g, &orig_to_copy);

    // Fold every virtual start into startDs in the clone: startDs inherits its
    // successors, the mapping is redirected to startDs and the cloned virtual
    // start is disconnected.
    for (NFAVertex v : vertices_range(g)) {
        if (!is_virtual_start(v, g)) {
            continue;
        }
        NFAVertex c_v = orig_to_copy[v];
        orig_to_copy[v] = c_g.startDs;
        for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) {
            add_edge_if_not_present(c_g.startDs, c_w, c_g);
        }
        clear_vertex(c_v, c_g);
    }

    // Make the copy of u the sole accepting vertex: acceptEod is fed only by
    // accept, accept is fed only by c_u, and c_u keeps none of its original
    // out-edges other than its self-loop (if u has one in g).
    NFAVertex c_u = orig_to_copy[u];
    clear_in_edges(c_g.acceptEod, c_g);
    add_edge(c_g.accept, c_g.acceptEod, c_g);
    clear_in_edges(c_g.accept, c_g);
    clear_out_edges(c_u, c_g);
    if (hasSelfLoop(u, g)) {
        add_edge(c_u, c_u, c_g);
    }
    add_edge(c_u, c_g.accept, c_g);

    // Successors of u in g, excluding u itself.
    set<NFAVertex> u_succ;
    insert(&u_succ, adjacent_vertices(u, g));
    u_succ.erase(u);

    // Jump edges: a predecessor t of u that also has an edge straight to one
    // of u's successors can skip u, so its copy is made accepting as well.
    for (auto t : inv_adjacent_vertices_range(u, g)) {
        if (t == u) {
            continue;
        }
        for (auto v : adjacent_vertices_range(t, g)) {
            if (contains(u_succ, v)) {
                add_edge(orig_to_copy[t], c_g.accept, c_g);
                break;
            }
        }
    }

    pruneUseless(c_g);

    // Re-run the back edge search on the pruned clone; anything other than a
    // self-loop is unexpected at this point.
    be.clear();
    depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start).
                       vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g)));

    for (const auto &e : be) {
        NFAVertex s = source(e, c_g);
        NFAVertex t = target(e, c_g);
        DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index);
        if (s != t) {
            assert(0);
            DEBUG_PRINTF("eek big cycle\n");
            return remember(true); /* big cycle -> eek */
        }
    }

    DEBUG_PRINTF("checking acyclic+selfloop graph\n");

    const bool may_regress = !firstMatchIsFirst(c_g);
    DEBUG_PRINTF("som may regress? %d\n", static_cast<int>(may_regress));
    return remember(may_regress);
}
Пример #4
0
static
void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) {
    set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */
    for (auto v : inv_adjacent_vertices_range(h.accept, h)) {
        const CharReach &cr = h[v].char_reach;
        if (!isutf8ascii(cr) && !isutf8start(cr)) {
            bad.insert(v);
        }
    }

    for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) {
        const CharReach &cr = h[v].char_reach;
        if (!isutf8ascii(cr) && !isutf8start(cr)) {
            bad.insert(v);
        }
    }

    // we want to be careful with asserts connected to starts
    // as well as they may not finish a code point
    for (auto v : vertices_range(h)) {
        if (is_virtual_start(v, h)) {
            bad.insert(v);
            insert(&bad, adjacent_vertices(v, h));
        }
    }

    /* we cannot handle vertices connected to accept as would report matches in
     * the middle of codepoints. acceptEod is not a problem as the input must
     * end at a codepoint boundary */
    bad.insert(h.accept);

    // If we're in SOM mode, we don't want to mess with vertices that have a
    // direct edge from startDs.
    if (som) {
        insert(&bad, adjacent_vertices(h.startDs, h));
    }

    set<NFAVertex> already_seeds; /* already marked as seeds */
    for (auto v : vertices_range(h)) {
        const CharReach &cr = h[v].char_reach;

        if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) {
            continue;
        }

        if (hasSuccInSet(h, v, bad)) {
            continue;
        }

        // Skip vertices that are directly connected to other vertices already
        // in the seeds list: we can't collapse two of these directly next to
        // each other.
        if (hasPredInSet(h, v, already_seeds) ||
            hasSuccInSet(h, v, already_seeds)) {
            continue;
        }

        DEBUG_PRINTF("%zu is a seed\n", h[v].index);
        seeds->push_back(v);
        already_seeds.insert(v);
    }
}