/** Some squash states are clearly not advantageous in the NFA, as they do * incur the cost of an exception: * -# acyclic states * -# squash only a few acyclic states */ void filterSquashers(const NGHolder &g, map<NFAVertex, NFAStateSet> &squash) { DEBUG_PRINTF("filtering\n"); map<u32, NFAVertex> rev; /* vertex_index -> vertex */ for (auto v : vertices_range(g)) { rev[g[v].index] = v; } for (auto v : vertices_range(g)) { if (!contains(squash, v)) { continue; } DEBUG_PRINTF("looking at squash set for vertex %u\n", g[v].index); if (!hasSelfLoop(v, g)) { DEBUG_PRINTF("acyclic\n"); squash.erase(v); continue; } NFAStateSet squashed = squash[v]; squashed.flip(); /* default sense for mask of survivors */ for (NFAStateSet::size_type sq = squashed.find_first(); sq != squashed.npos; sq = squashed.find_next(sq)) { NFAVertex u = rev[sq]; if (hasSelfLoop(u, g)) { DEBUG_PRINTF("squashing a cyclic (%zu) is always good\n", sq); goto next_vertex; } } if (squashed.count() < MIN_PURE_ACYCLIC_SQUASH) { DEBUG_PRINTF("squash set too small\n"); squash.erase(v); continue; } next_vertex:; DEBUG_PRINTF("squash set ok\n"); } }
bool somMayGoBackwards(NFAVertex u, const NGHolder &g, const ue2::unordered_map<NFAVertex, u32> ®ion_map, smgb_cache &cache) { /* Need to ensure all matches of the graph g up to u contain no infixes * which are also matches of the graph to u. * * This is basically the same as firstMatchIsFirst except we g is not * always a dag. As we haven't gotten around to writing an execute_graph * that operates on general graphs, we take some (hopefully) conservative * short cuts. * * Note: if the u can be jumped we will take jump edges * into account as a possibility of som going backwards * * TODO: write a generalised ng_execute_graph/make this less hacky */ assert(&g == &cache.g); if (contains(cache.smgb, u)) { return cache.smgb[u]; } DEBUG_PRINTF("checking if som can go backwards on %u\n", g[u].index); set<NFAEdge> be; BackEdges<set<NFAEdge>> backEdgeVisitor(be); depth_first_search( g.g, visitor(backEdgeVisitor) .root_vertex(g.start) .vertex_index_map(get(&NFAGraphVertexProps::index, g.g))); bool rv; if (0) { exit: DEBUG_PRINTF("using cached result\n"); cache.smgb[u] = rv; return rv; } assert(contains(region_map, u)); const u32 u_region = region_map.at(u); for (const auto &e : be) { NFAVertex s = source(e, g); NFAVertex t = target(e, g); /* only need to worry about big cycles including/before u */ DEBUG_PRINTF("back edge %u %u\n", g[s].index, g[t].index); if (s != t && region_map.at(s) <= u_region) { DEBUG_PRINTF("eek big cycle\n"); rv = true; /* big cycle -> eek */ goto exit; } } ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy; NGHolder c_g; cloneHolder(c_g, g, &orig_to_copy); for (NFAVertex v : vertices_range(g)) { if (!is_virtual_start(v, g)) { continue; } NFAVertex c_v = orig_to_copy[v]; orig_to_copy[v] = c_g.startDs; for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) { add_edge_if_not_present(c_g.startDs, c_w, c_g); } clear_vertex(c_v, c_g); } NFAVertex c_u = orig_to_copy[u]; clear_in_edges(c_g.acceptEod, c_g); add_edge(c_g.accept, c_g.acceptEod, c_g); 
clear_in_edges(c_g.accept, c_g); clear_out_edges(c_u, c_g); if (hasSelfLoop(u, g)) { add_edge(c_u, c_u, c_g); } add_edge(c_u, c_g.accept, c_g); set<NFAVertex> u_succ; insert(&u_succ, adjacent_vertices(u, g)); u_succ.erase(u); for (auto t : inv_adjacent_vertices_range(u, g)) { if (t == u) { continue; } for (auto v : adjacent_vertices_range(t, g)) { if (contains(u_succ, v)) { add_edge(orig_to_copy[t], c_g.accept, c_g); break; } } } pruneUseless(c_g); be.clear(); depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start). vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g))); for (const auto &e : be) { NFAVertex s = source(e, c_g); NFAVertex t = target(e, c_g); DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index); if (s != t) { assert(0); DEBUG_PRINTF("eek big cycle\n"); rv = true; /* big cycle -> eek */ goto exit; } } DEBUG_PRINTF("checking acyclic+selfloop graph\n"); rv = !firstMatchIsFirst(c_g); DEBUG_PRINTF("som may regress? %d\n", (int)rv); goto exit; }
/**
 * Computes, for each non-initial cyclic state of \p g, a mask of the states
 * that survive when that state is on; states with a cleared bit are squashed.
 * Only vertices that squash at least one other state get an entry.
 *
 * The per-vertex masks are then extended by findDerivedSquashers() and
 * cleaned up by clearMutualSquashers() before being returned.
 *
 * \param g   graph to analyse.
 * \param som SOM mode; when set, region and SOM-distance information is
 *            computed and used to restrict which states may be squashed.
 * \return map from squashing vertex to its survivor mask.
 */
map<NFAVertex, NFAStateSet> findSquashers(const NGHolder &g, som_type som) {
    map<NFAVertex, NFAStateSet> result;

    // Width of every mask. For a triggered graph the tops have already been
    // assigned, so they need no extra bits here.
    const u32 state_count = num_vertices(g);

    // Post-dominator tree.
    PostDomTree pdom_tree;
    buildPDomTree(g, pdom_tree);

    // Vertices indexed by state ID, plus the set of initial states.
    vector<NFAVertex> by_index(state_count, NFAGraph::null_vertex());
    NFAStateSet init_states(state_count);

    smgb_cache cache(g);

    // Only populated in SOM mode; left empty otherwise.
    unordered_map<NFAVertex, u32> region_map;
    vector<DepthMinMax> som_depths;
    if (som) {
        region_map = assignRegions(g);
        som_depths = getDistancesFromSOM(g);
    }

    for (auto v : vertices_range(g)) {
        const u32 vert_id = g[v].index;
        DEBUG_PRINTF("vertex %u/%u\n", vert_id, state_count);
        assert(vert_id < state_count);
        by_index[vert_id] = v;

        if (is_any_start(v, g) || in_degree(v, g) == 0) {
            init_states.set(vert_id);
        }
    }

    for (u32 idx = 0; idx < state_count; ++idx) {
        NFAVertex v = by_index[idx];
        assert(v != NFAGraph::null_vertex());
        const CharReach &cr = g[v].char_reach;

        /* only non-init cyclics can be squashers */
        const bool candidate = hasSelfLoop(v, g) && !init_states.test(idx);
        if (!candidate) {
            continue;
        }

        DEBUG_PRINTF("state %u is cyclic\n", idx);

        NFAStateSet mask(state_count);
        NFAStateSet succ(state_count);
        NFAStateSet pred(state_count);
        buildSquashMask(mask, g, v, cr, init_states, by_index, pdom_tree, som,
                        som_depths, region_map, cache);
        buildSucc(succ, g, v);
        buildPred(pred, g, v);
        const auto &reports = g[v].reports;

        /* Successors sharing exactly our predecessor set contribute their
         * own squash mask (computed with our reach). */
        for (size_t j = succ.find_first(); j != succ.npos;
             j = succ.find_next(j)) {
            NFAVertex succ_v = by_index[j];
            NFAStateSet succ_preds(state_count);
            buildPred(succ_preds, g, succ_v);
            if (succ_preds != pred) {
                continue;
            }
            DEBUG_PRINTF("adding the sm from %zu to %u's sm\n", j, idx);
            NFAStateSet extra(state_count);
            buildSquashMask(extra, g, succ_v, cr, init_states, by_index,
                            pdom_tree, som, som_depths, region_map, cache);
            mask &= extra;
        }

        for (size_t j = pred.find_first(); j != pred.npos;
             j = pred.find_next(j)) {
            NFAVertex pred_v = by_index[j];
            NFAStateSet pred_succs(state_count);
            buildSucc(pred_succs, g, pred_v);

            /* j is only usable as a basis for squashing if its successors
             * are a subset of ours */
            if ((pred_succs & ~succ).any()) {
                continue;
            }

            if (som) {
                /* j must not contribute to the squash mask of v if it may
                 * hold an earlier start of match offset. To use j as a basis
                 * for the squash mask of v we require:
                 * maxSomDist(j) <= minSomDist(v)
                 */

                /* ** TODO ** */

                const depth &max_som_dist_j = som_depths[g[pred_v].index].max;
                const depth &min_som_dist_v = som_depths[g[v].index].min;
                if (max_som_dist_j > min_som_dist_v ||
                    max_som_dist_j.is_infinite()) {
                    /* j can't be used as it may be storing an earlier SOM */
                    continue;
                }
            }

            /* j's report information must also be a subset of ours: every
             * special vertex j leads to must be reachable directly from v */
            bool seen_special = false;
            bool specials_covered = true;
            for (auto w : adjacent_vertices_range(pred_v, g)) {
                if (!is_special(w, g)) {
                    continue;
                }
                if (!edge(v, w, g).second) {
                    specials_covered = false;
                    break;
                }
                seen_special = true;
            }
            if (!specials_covered) {
                continue;
            }

            // FIXME: should be subset check?
            if (seen_special && g[pred_v].reports != reports) {
                continue;
            }

            /* ok we can use j, provided its reach is covered by ours */
            const CharReach &pred_cr = g[pred_v].char_reach;
            if ((pred_cr & ~cr).none()) {
                NFAStateSet extra(state_count);
                buildSquashMask(extra, g, pred_v, cr, init_states, by_index,
                                pdom_tree, som, som_depths, region_map, cache);
                mask &= extra;
                mask.reset(j);
            }
        }

        mask.set(idx); /* never clear ourselves */

        if ((~mask).any()) { // i.e. some bits unset in mask
            DEBUG_PRINTF("%u squashes %zu other states\n", idx,
                         (~mask).count());
            result.emplace(v, mask);
        }
    }

    findDerivedSquashers(g, by_index, pdom_tree, init_states, &result, som,
                         som_depths, region_map, cache);

    clearMutualSquashers(g, by_index, result);

    return result;
}
static void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) { set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */ for (auto v : inv_adjacent_vertices_range(h.accept, h)) { const CharReach &cr = h[v].char_reach; if (!isutf8ascii(cr) && !isutf8start(cr)) { bad.insert(v); } } for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { const CharReach &cr = h[v].char_reach; if (!isutf8ascii(cr) && !isutf8start(cr)) { bad.insert(v); } } // we want to be careful with asserts connected to starts // as well as they may not finish a code point for (auto v : vertices_range(h)) { if (is_virtual_start(v, h)) { bad.insert(v); insert(&bad, adjacent_vertices(v, h)); } } /* we cannot handle vertices connected to accept as would report matches in * the middle of codepoints. acceptEod is not a problem as the input must * end at a codepoint boundary */ bad.insert(h.accept); // If we're in SOM mode, we don't want to mess with vertices that have a // direct edge from startDs. if (som) { insert(&bad, adjacent_vertices(h.startDs, h)); } set<NFAVertex> already_seeds; /* already marked as seeds */ for (auto v : vertices_range(h)) { const CharReach &cr = h[v].char_reach; if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) { continue; } if (hasSuccInSet(h, v, bad)) { continue; } // Skip vertices that are directly connected to other vertices already // in the seeds list: we can't collapse two of these directly next to // each other. if (hasPredInSet(h, v, already_seeds) || hasSuccInSet(h, v, already_seeds)) { continue; } DEBUG_PRINTF("%zu is a seed\n", h[v].index); seeds->push_back(v); already_seeds.insert(v); } }