/** Returns true if graphs \p h1 and \p h2 can (and should) be merged. */ static bool shouldMerge(NGHolder &ha, const ue2::unordered_map<NFAVertex, u32> &a_state_ids, NGHolder &hb, const ue2::unordered_map<NFAVertex, u32> &b_state_ids, size_t cpl, const ReportManager *rm, const CompileContext &cc) { size_t combinedStateCount = countStates(ha, a_state_ids) + countStates(hb, b_state_ids) - cpl; if (combinedStateCount > FAST_STATE_LIMIT) { // More complex implementability check. NGHolder h_temp; cloneHolder(h_temp, ha); assert(h_temp.kind == hb.kind); mergeNfaComponent(h_temp, hb, cpl); reduceImplementableGraph(h_temp, SOM_NONE, rm, cc); u32 numStates = isImplementableNFA(h_temp, rm, cc); DEBUG_PRINTF("isImplementableNFA returned %u states\n", numStates); if (!numStates) { DEBUG_PRINTF("not implementable\n"); return false; } else if (numStates > FAST_STATE_LIMIT) { DEBUG_PRINTF("too many states to merge\n"); return false; } } return true; }
void SmallWriteBuildImpl::add(const NGWrapper &w) { // If the graph is poisoned (i.e. we can't build a SmallWrite version), // we don't even try. if (poisoned) { return; } if (w.som || w.min_length || isVacuous(w)) { /* cannot support in smwr */ poisoned = true; return; } DEBUG_PRINTF("w=%p\n", &w); // make a copy of the graph so that we can modify it for our purposes unique_ptr<NGHolder> h = cloneHolder(w); reduceGraph(*h, SOM_NONE, w.utf8, cc); // If the earliest match location is outside the small write region, // then we don't need to build a SmallWrite version. // However, we don't poison this case either, since it is simply a case, // where we know the resulting graph won't match. if (findMinWidth(*h) > depth(cc.grey.smallWriteLargestBuffer)) { return; } // Now we can actually build the McClellan DFA assert(h->kind == NFA_OUTFIX); auto r = buildMcClellan(*h, &rm, cc.grey); // If we couldn't build a McClellan DFA for this portion, we won't be able // build a smwr which represents the pattern set if (!r) { DEBUG_PRINTF("failed to determinise\n"); poisoned = true; return; } prune_overlong(*r, cc.grey.smallWriteLargestBuffer); if (rdfa) { // do a merge of the new dfa with the existing dfa auto merged = mergeTwoDfas(rdfa.get(), r.get(), DFA_MERGE_MAX_STATES, &rm, cc.grey); if (!merged) { DEBUG_PRINTF("merge failed\n"); poisoned = true; return; } DEBUG_PRINTF("merge succeeded, built %p\n", merged.get()); rdfa = move(merged); } else { rdfa = move(r); } }
/** \brief Compute min/max depths from the start of match for every vertex in
 * \p g_orig, returned indexed by vertex index in \p g_orig. */
vector<DepthMinMax> getDistancesFromSOM(const NGHolder &g_orig) {
    // Work on a scratch copy: we rewire starts and cut edges, and must not
    // mutate the caller's graph.
    NGHolder g;
    ue2::unordered_map<NFAVertex, NFAVertex> vmap; // vertex in g_orig to vertex in g
    cloneHolder(g, g_orig, &vmap);

    // Gather every virtual start in the copy, plus startDs itself.
    vector<NFAVertex> vstarts;
    for (auto v : vertices_range(g)) {
        if (is_virtual_start(v, g)) {
            vstarts.push_back(v);
        }
    }
    vstarts.push_back(g.startDs);

    // Redirect the successors of each gathered start to g.start, then strip
    // the gathered vertices' in-edges so they are excluded from the depth
    // calculation.
    for (auto v : vstarts) {
        wireSuccessorsToStart(g, v);
    }
    for (auto v : vstarts) {
        clear_in_edges(v, g);
    }

    //dumpGraph("som_depth.dot", g.g);

    // Depths within the mutated copy, indexed by vertex index in g.
    vector<DepthMinMax> copy_depths;
    calcDepthsFrom(g, g.start, copy_depths);

    // Translate into a result indexed by vertex index in g_orig.
    vector<DepthMinMax> depths(num_vertices(g_orig));
    for (auto v_orig : vertices_range(g_orig)) {
        assert(contains(vmap, v_orig));
        NFAVertex v_copy = vmap[v_orig];
        DepthMinMax &out = depths.at(g_orig[v_orig].index);

        if (v_orig == g_orig.startDs || is_virtual_start(v_orig, g_orig)) {
            // StartDs and virtual starts always have zero depth.
            out = DepthMinMax(0, 0);
        } else {
            out = copy_depths.at(g[v_copy].index);
        }
    }

    return depths;
}
/** Populates squash masks for states that can be switched off by highlander
 * (single match) reporters. */
map<NFAVertex, NFAStateSet> findHighlanderSquashers(const NGHolder &g,
                                                    const ReportManager &rm) {
    map<NFAVertex, NFAStateSet> squash;

    set<NFAVertex> reporters;
    getHighlanderReporters(g, g.accept, rm, reporters);
    getHighlanderReporters(g, g.acceptEod, rm, reporters);
    if (reporters.empty()) {
        DEBUG_PRINTF("no highlander reports\n");
        return squash;
    }

    const u32 numStates = num_vertices(g);

    for (auto v : reporters) {
        DEBUG_PRINTF("vertex %u with %zu reports\n", g[v].index,
                     g[v].reports.size());

        // Find the set of vertices that lead only to v (or to another
        // reporter with a subset of v's reports): clone the graph, cut the
        // appropriate out-edges to accept, and see which vertices become
        // unreachable.
        ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy;
        NGHolder h;
        cloneHolder(h, g, &orig_to_copy);
        removeEdgesToAccept(h, orig_to_copy[v]);

        vector<NFAVertex> unreach = findUnreachable(h);
        DEBUG_PRINTF("can squash %zu vertices\n", unreach.size());
        if (unreach.empty()) {
            continue;
        }

        // Lazily create v's mask: all bits set, then cleared per squashed
        // state below.
        auto ms = squash.find(v);
        if (ms == squash.end()) {
            ms = squash.insert(make_pair(v, NFAStateSet(numStates))).first;
            ms->second.set();
        }

        NFAStateSet &mask = ms->second;
        for (auto uv : unreach) {
            DEBUG_PRINTF("squashes index %u\n", h[uv].index);
            mask.reset(h[uv].index);
        }
    }

    return squash;
}
/** \brief Populate \p h with a graph representing role \p s1.
 *
 * Returns the number of vertices \p h held before the role's vertices were
 * added (for a castle), or the total vertex count of the cloned graph (for a
 * graph role).
 */
static
u32 prepareRoleGraph(NGHolder &h, const role_id &s1) {
    if (s1.castle()) {
        // Castle role: model it as a single self-looping vertex hanging off
        // startDs, carrying the castle's reachability.
        u32 num = num_vertices(h);
        NFAVertex v = add_vertex(h);
        h[v].char_reach = s1.castle()->reach();
        add_edge(h.startDs, v, h);
        // self loop to repeat characters
        add_edge(v, v, h);
        return num;
    }

    if (s1.graph()) {
        // Graph role: work on a clone of the role's own graph.
        cloneHolder(h, *s1.graph());
        return num_vertices(h);
    }

    // Only infixes and suffixes with graph properties are possible
    // candidates; other cases were already filtered out before exclusive
    // analysis.
    assert(0);
    return 0;
}
/** \brief Returns true if a match reported at \p u may have a start-of-match
 * offset earlier than an already-seen match of the graph up to \p u.
 *
 * Results are memoised in \p cache (which must have been built for \p g).
 * NOTE(review): the control flow relies on the `if (0) { exit: ... }` idiom —
 * all paths funnel through `goto exit`, which records the result in the cache
 * before returning. Do not restructure casually.
 */
bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
                       const ue2::unordered_map<NFAVertex, u32> &region_map,
                       smgb_cache &cache) {
    /* Need to ensure all matches of the graph g up to u contain no infixes
     * which are also matches of the graph to u.
     *
     * This is basically the same as firstMatchIsFirst except we g is not
     * always a dag. As we haven't gotten around to writing an execute_graph
     * that operates on general graphs, we take some (hopefully) conservative
     * short cuts.
     *
     * Note: if the u can be jumped we will take jump edges
     * into account as a possibility of som going backwards
     *
     * TODO: write a generalised ng_execute_graph/make this less hacky
     */
    assert(&g == &cache.g);
    // Memoised result available?
    if (contains(cache.smgb, u)) {
        return cache.smgb[u];
    }

    DEBUG_PRINTF("checking if som can go backwards on %u\n", g[u].index);

    // Collect the back edges of g via DFS from start.
    set<NFAEdge> be;
    BackEdges<set<NFAEdge>> backEdgeVisitor(be);
    depth_first_search(
        g.g, visitor(backEdgeVisitor)
                 .root_vertex(g.start)
                 .vertex_index_map(get(&NFAGraphVertexProps::index, g.g)));

    bool rv;
    // Single exit point: every `goto exit` lands here to cache and return rv.
    if (0) {
    exit:
        DEBUG_PRINTF("using cached result\n");
        cache.smgb[u] = rv;
        return rv;
    }

    assert(contains(region_map, u));
    const u32 u_region = region_map.at(u);

    for (const auto &e : be) {
        NFAVertex s = source(e, g);
        NFAVertex t = target(e, g);
        /* only need to worry about big cycles including/before u */
        DEBUG_PRINTF("back edge %u %u\n", g[s].index, g[t].index);
        // A non-self-loop back edge in or before u's region means a large
        // cycle; conservatively assume SOM may regress.
        if (s != t && region_map.at(s) <= u_region) {
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    // Build a modified copy c_g of g for the firstMatchIsFirst check.
    ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy;
    NGHolder c_g;
    cloneHolder(c_g, g, &orig_to_copy);

    // Fold every virtual start into startDs in the copy: its successors are
    // rehomed onto startDs and the virtual start itself is disconnected.
    for (NFAVertex v : vertices_range(g)) {
        if (!is_virtual_start(v, g)) {
            continue;
        }
        NFAVertex c_v = orig_to_copy[v];
        orig_to_copy[v] = c_g.startDs;
        for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) {
            add_edge_if_not_present(c_g.startDs, c_w, c_g);
        }
        clear_vertex(c_v, c_g);
    }

    // Rewire the copy so that u is the sole accept: strip existing accepts,
    // cut u's out-edges (keeping a self-loop if u had one) and connect u to
    // accept.
    NFAVertex c_u = orig_to_copy[u];
    clear_in_edges(c_g.acceptEod, c_g);
    add_edge(c_g.accept, c_g.acceptEod, c_g);
    clear_in_edges(c_g.accept, c_g);
    clear_out_edges(c_u, c_g);
    if (hasSelfLoop(u, g)) {
        add_edge(c_u, c_u, c_g);
    }
    add_edge(c_u, c_g.accept, c_g);

    // Successors of u in the original graph, excluding u itself.
    set<NFAVertex> u_succ;
    insert(&u_succ, adjacent_vertices(u, g));
    u_succ.erase(u);

    // Any predecessor of u that can also reach a successor of u (i.e. can
    // "jump" u) is also wired to accept in the copy.
    for (auto t : inv_adjacent_vertices_range(u, g)) {
        if (t == u) {
            continue;
        }
        for (auto v : adjacent_vertices_range(t, g)) {
            if (contains(u_succ, v)) {
                add_edge(orig_to_copy[t], c_g.accept, c_g);
                break;
            }
        }
    }

    pruneUseless(c_g);

    // After pruning, the copy should be acyclic apart from self-loops; verify
    // via another back-edge search.
    be.clear();
    depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start).
                       vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g)));

    for (const auto &e : be) {
        NFAVertex s = source(e, c_g);
        NFAVertex t = target(e, c_g);
        DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index);
        if (s != t) {
            // Should be unreachable: big cycles were rejected above.
            assert(0);
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    DEBUG_PRINTF("checking acyclic+selfloop graph\n");

    // SOM may regress iff some match of c_g to u is not the first match.
    rv = !firstMatchIsFirst(c_g);
    DEBUG_PRINTF("som may regress? %d\n", (int)rv);
    goto exit;
}
/** \brief Compute an upper bound on how many matches of the literals in
 * \p lits can fit inside the infix graph \p h.
 *
 * Returns 0 when no literal can match inside the infix at all, and
 * NO_MATCH_LIMIT when no finite bound can be established (multiple tops,
 * an empty/anchored literal, or a cycle through a literal-terminating
 * vertex).
 */
static
u32 findMaxInfixMatches(const NGHolder &h, const set<ue2_literal> &lits) {
    DEBUG_PRINTF("h=%p, %zu literals\n", &h, lits.size());
    //dumpGraph("infix.dot", h.g);

    if (!onlyOneTop(h)) {
        // Fix: the original format string was missing the backslash in the
        // trailing newline escape ("...top!n").
        DEBUG_PRINTF("more than one top!\n");
        return NO_MATCH_LIMIT;
    }

    // Indices of vertices that could terminate any of the literals in 'lits'.
    set<u32> terms;
    for (const auto &s : lits) {
        DEBUG_PRINTF("lit s='%s'\n", escapeString(s).c_str());
        if (s.empty()) {
            // Likely an anchored case, be conservative here.
            return NO_MATCH_LIMIT;
        }

        for (auto v : vertices_range(h)) {
            if (is_special(v, h)) {
                continue;
            }
            if (couldEndLiteral(s, v, h)) {
                u32 idx = h[v].index;
                DEBUG_PRINTF("vertex %u could terminate lit\n", idx);
                terms.insert(idx);
            }
        }
    }

    if (terms.empty()) {
        DEBUG_PRINTF("literals cannot match inside infix\n");
        return 0;
    }

    // Contract away every non-terminating vertex in a copy of the graph; the
    // remaining graph's max width bounds the number of literal matches.
    NGHolder g;
    cloneHolder(g, h);
    vector<NFAVertex> dead;

    // The set of all edges in the graph is used for existence checks in
    // contractVertex.
    ue2::unordered_set<pair<NFAVertex, NFAVertex>> all_edges;
    for (const auto &e : edges_range(g)) {
        all_edges.emplace(source(e, g), target(e, g));
    }

    for (auto v : vertices_range(g)) {
        if (is_special(v, g)) {
            continue;
        }
        if (contains(terms, g[v].index)) {
            continue;
        }
        contractVertex(g, v, all_edges);
        dead.push_back(v);
    }

    remove_vertices(dead, g);
    //dumpGraph("relaxed.dot", g.g);

    depth maxWidth = findMaxWidth(g);
    DEBUG_PRINTF("maxWidth=%s\n", maxWidth.str().c_str());
    assert(maxWidth.is_reachable());

    if (maxWidth.is_infinite()) {
        // Cycle detected, so we can likely squeeze an unlimited number of
        // matches into this graph.
        return NO_MATCH_LIMIT;
    }

    assert(terms.size() >= maxWidth);
    return maxWidth;
}