vector<DepthMinMax> getDistancesFromSOM(const NGHolder &g_orig) {
    // Work on a throwaway clone so that the caller's graph is left
    // untouched.
    NGHolder g;
    ue2::unordered_map<NFAVertex, NFAVertex> vmap; // g_orig vertex -> clone vertex
    cloneHolder(g, g_orig, &vmap);

    // Collect every virtual start in the clone, plus startDs.
    vector<NFAVertex> starts;
    for (auto v : vertices_range(g)) {
        if (is_virtual_start(v, g)) {
            starts.push_back(v);
        }
    }
    starts.push_back(g.startDs);

    // First pass: connect each start's successors directly to g.start. All
    // wiring happens before any in-edges are removed, since clearing one
    // start's in-edges could alter another start's successor set.
    for (auto v : starts) {
        wireSuccessorsToStart(g, v);
    }

    // Second pass: detach the starts so they take no part in the depth
    // calculation.
    for (auto v : starts) {
        clear_in_edges(v, g);
    }

    //dumpGraph("som_depth.dot", g.g);

    // Depths computed over the clone, indexed by clone vertex index.
    vector<DepthMinMax> clone_depths;
    calcDepthsFrom(g, g.start, clone_depths);

    // Map the results back onto g_orig's vertex indices.
    vector<DepthMinMax> depths(num_vertices(g_orig));

    for (auto v_orig : vertices_range(g_orig)) {
        assert(contains(vmap, v_orig));
        NFAVertex v_new = vmap[v_orig];
        u32 orig_idx = g_orig[v_orig].index;
        DepthMinMax &d = depths.at(orig_idx);

        if (v_orig == g_orig.startDs || is_virtual_start(v_orig, g_orig)) {
            // StartDs and virtual starts are always at depth zero.
            d = DepthMinMax(0, 0);
        } else {
            u32 new_idx = g[v_new].index;
            d = clone_depths.at(new_idx);
        }
    }

    return depths;
}
bool firstMatchIsFirst(const NGHolder &p) { /* If the first match (by end offset) is not the first match (by start * offset) then we can't create a lock after it. * * Consider: 4009:/(foobar|ob).*bugger/s * * We don't care about races on the last byte as they can be resolved easily * at runtime /(foobar|obar).*hi/ * * It should be obvious we don't care about one match being a prefix * of another as they share the same start offset. * * Therefore, the case were we cannot establish that the som does not * regress is when there exists s1 and s2 in the language of p and s2 is a * proper infix of s1. * * It is tempting to add the further restriction that there does not exist a * prefix of s1 that is in the language of p (as in which case we would * presume, the lock has already been set). However, we have no way of * knowing if the lock can be cleared by some characters, and if so, if it * is still set. TODO: if we knew the lock's escapes where we could verify * that the rest of s1 does not clear the lock. (1) */ DEBUG_PRINTF("entry\n"); /* If there are any big cycles throw up our hands in despair */ if (hasBigCycles(p)) { DEBUG_PRINTF("fail, big cycles\n"); return false; } set<NFAVertex> states; /* turn on all states (except starts - avoid suffix matches) */ /* If we were doing (1) we would also except states leading to accepts - avoid prefix matches */ for (auto v : vertices_range(p)) { assert(!is_virtual_start(v, p)); if (!is_special(v, p)) { DEBUG_PRINTF("turning on %u\n", p[v].index); states.insert(v); } } /* run the prefix the main graph */ execute_graph(p, p, &states); for (auto v : states) { /* need to check if this vertex may represent an infix match - ie * it does not have an edge to accept. */ DEBUG_PRINTF("check %u\n", p[v].index); if (!edge(v, p.accept, p).second) { DEBUG_PRINTF("fail %u\n", p[v].index); return false; } } DEBUG_PRINTF("done first is first check\n"); return true; }
bool somMayGoBackwards(NFAVertex u, const NGHolder &g,
                       const ue2::unordered_map<NFAVertex, u32> &region_map,
                       smgb_cache &cache) {
    /* Need to ensure all matches of the graph g up to u contain no infixes
     * which are also matches of the graph to u.
     *
     * This is basically the same as firstMatchIsFirst except that g is not
     * always a dag. As we haven't gotten around to writing an execute_graph
     * that operates on general graphs, we take some (hopefully) conservative
     * short cuts.
     *
     * Note: if u can be jumped we will take jump edges into account as a
     * possibility of som going backwards.
     *
     * TODO: write a generalised ng_execute_graph/make this less hacky
     */
    assert(&g == &cache.g);
    if (contains(cache.smgb, u)) {
        return cache.smgb[u];
    }

    DEBUG_PRINTF("checking if som can go backwards on %u\n", g[u].index);

    /* Gather the back edges of g via DFS from start. */
    set<NFAEdge> be;
    BackEdges<set<NFAEdge>> backEdgeVisitor(be);
    depth_first_search(
        g.g, visitor(backEdgeVisitor)
                 .root_vertex(g.start)
                 .vertex_index_map(get(&NFAGraphVertexProps::index, g.g)));

    bool rv;
    /* The dead "if (0)" guard makes the exit label reachable only by goto;
     * every path below funnels through it to cache and return rv. */
    if (0) {
    exit:
        DEBUG_PRINTF("using cached result\n");
        cache.smgb[u] = rv;
        return rv;
    }

    assert(contains(region_map, u));
    const u32 u_region = region_map.at(u);

    for (const auto &e : be) {
        NFAVertex s = source(e, g);
        NFAVertex t = target(e, g);
        /* only need to worry about big cycles including/before u */
        DEBUG_PRINTF("back edge %u %u\n", g[s].index, g[t].index);
        if (s != t && region_map.at(s) <= u_region) {
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    /* Build a copy c_g of g in which all virtual starts are folded into
     * startDs, then rewire the accepts so that c_g accepts exactly the
     * paths of g ending at u. */
    ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy;
    NGHolder c_g;
    cloneHolder(c_g, g, &orig_to_copy);

    /* Replace each virtual start in the copy with startDs: redirect its
     * successors to startDs, then isolate the vertex. */
    for (NFAVertex v : vertices_range(g)) {
        if (!is_virtual_start(v, g)) {
            continue;
        }
        NFAVertex c_v = orig_to_copy[v];
        orig_to_copy[v] = c_g.startDs;
        for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) {
            add_edge_if_not_present(c_g.startDs, c_w, c_g);
        }
        clear_vertex(c_v, c_g);
    }

    NFAVertex c_u = orig_to_copy[u];

    /* Make accept the only way into acceptEod. */
    clear_in_edges(c_g.acceptEod, c_g);
    add_edge(c_g.accept, c_g.acceptEod, c_g);
    clear_in_edges(c_g.accept, c_g);

    /* u becomes the sole accepting state, retaining a self-loop if it had
     * one in g. */
    clear_out_edges(c_u, c_g);
    if (hasSelfLoop(u, g)) {
        add_edge(c_u, c_u, c_g);
    }
    add_edge(c_u, c_g.accept, c_g);

    set<NFAVertex> u_succ;
    insert(&u_succ, adjacent_vertices(u, g));
    u_succ.erase(u);

    /* A predecessor of u that also reaches a (non-u) successor of u may
     * "jump" over u; treat such predecessors as accepting too. */
    for (auto t : inv_adjacent_vertices_range(u, g)) {
        if (t == u) {
            continue;
        }
        for (auto v : adjacent_vertices_range(t, g)) {
            if (contains(u_succ, v)) {
                add_edge(orig_to_copy[t], c_g.accept, c_g);
                break;
            }
        }
    }

    pruneUseless(c_g);

    /* After pruning, the only back edges remaining should be self-loops
     * (anything else was rejected by the big-cycle scan above). */
    be.clear();
    depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start).
                       vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g)));

    for (const auto &e : be) {
        NFAVertex s = source(e, c_g);
        NFAVertex t = target(e, c_g);
        DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index);
        if (s != t) {
            assert(0);
            DEBUG_PRINTF("eek big cycle\n");
            rv = true; /* big cycle -> eek */
            goto exit;
        }
    }

    DEBUG_PRINTF("checking acyclic+selfloop graph\n");

    /* som may regress exactly when some match of c_g is not first. */
    rv = !firstMatchIsFirst(c_g);
    DEBUG_PRINTF("som may regress? %d\n", (int)rv);
    goto exit;
}
static void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) { set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */ for (auto v : inv_adjacent_vertices_range(h.accept, h)) { const CharReach &cr = h[v].char_reach; if (!isutf8ascii(cr) && !isutf8start(cr)) { bad.insert(v); } } for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { const CharReach &cr = h[v].char_reach; if (!isutf8ascii(cr) && !isutf8start(cr)) { bad.insert(v); } } // we want to be careful with asserts connected to starts // as well as they may not finish a code point for (auto v : vertices_range(h)) { if (is_virtual_start(v, h)) { bad.insert(v); insert(&bad, adjacent_vertices(v, h)); } } /* we cannot handle vertices connected to accept as would report matches in * the middle of codepoints. acceptEod is not a problem as the input must * end at a codepoint boundary */ bad.insert(h.accept); // If we're in SOM mode, we don't want to mess with vertices that have a // direct edge from startDs. if (som) { insert(&bad, adjacent_vertices(h.startDs, h)); } set<NFAVertex> already_seeds; /* already marked as seeds */ for (auto v : vertices_range(h)) { const CharReach &cr = h[v].char_reach; if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) { continue; } if (hasSuccInSet(h, v, bad)) { continue; } // Skip vertices that are directly connected to other vertices already // in the seeds list: we can't collapse two of these directly next to // each other. if (hasPredInSet(h, v, already_seeds) || hasSuccInSet(h, v, already_seeds)) { continue; } DEBUG_PRINTF("%zu is a seed\n", h[v].index); seeds->push_back(v); already_seeds.insert(v); } }