/**
 * Returns the vertices of g that carry a real state index (anything other
 * than NO_STATE), ordered by ascending state index.
 *
 * Every vertex of g is expected to have an entry in state_ids.
 */
static
vector<NFAVertex> getSortedVA(const NGHolder &g,
                const ue2::unordered_map<NFAVertex, u32> &state_ids) {
    vector<NFAVertex> sorted;
    sorted.reserve(num_vertices(g));

    // Keep only the vertices that were actually assigned a state.
    for (auto v : vertices_range(g)) {
        assert(contains(state_ids, v));
        if (state_ids.at(v) != NO_STATE) {
            sorted.push_back(v);
        }
    }

    // Arrange the survivors by their state indices.
    const auto by_state = [&state_ids](NFAVertex lhs, NFAVertex rhs) {
        return state_ids.at(lhs) < state_ids.at(rhs);
    };
    sort(sorted.begin(), sorted.end(), by_state);

#ifndef NDEBUG
    // The position of each vertex in the vector should equal its state index.
    u32 expected = 0;
    for (const auto &v : sorted) {
        assert(state_ids.at(v) == expected);
        expected++;
    }
#endif

    return sorted;
}
/**
 * Counts the states described by state_ids: one more than the largest state
 * index that is not NO_STATE, or zero when the map is empty.
 *
 * When addTops is set and g.start has a real state, the single start state is
 * replaced in the count by one state per distinct top value found on the
 * out-edges of g.start.
 */
u32 countStates(const NGHolder &g,
                const ue2::unordered_map<NFAVertex, u32> &state_ids,
                bool addTops) {
    if (state_ids.empty()) {
        return 0;
    }

    // Find the largest state index actually in use.
    u32 highest = 0;
    for (const auto &entry : state_ids) {
        const u32 id = entry.second;
        if (id == NO_STATE) {
            continue;
        }
        if (id > highest) {
            highest = id;
        }
    }

    u32 total = highest + 1;

    assert(contains(state_ids, g.start));
    if (!addTops || state_ids.at(g.start) == NO_STATE) {
        return total;
    }

    // Swap the start state for one state per distinct top.
    set<u32> distinct_tops;
    for (auto e : out_edges_range(g.start, g)) {
        distinct_tops.insert(g[e].top);
    }
    total--;
    total += distinct_tops.size();

    return total;
}
/**
 * Walks up the chain of entries in dom starting from v and returns true as
 * soon as it meets a vertex that has an edge to g.accept and whose report set
 * contains report_id. Returns false once the chain runs out.
 */
static
bool isDominatedByReporter(const NGHolder &g,
                const ue2::unordered_map<NFAVertex, NFAVertex> &dom,
                NFAVertex v, ReportID report_id) {
    auto it = dom.find(v);
    while (it != dom.end()) {
        const NFAVertex u = it->second;

        // Note: reporters with edges only to acceptEod are not considered to
        // dominate.
        if (edge(u, g.accept, g).second
            && contains(g[u].reports, report_id)) {
            DEBUG_PRINTF("%u is dominated by %u, and both report %u\n",
                         g[v].index, g[u].index, report_id);
            return true;
        }

        // Step one level further up the chain.
        v = u;
        it = dom.find(v);
    }

    return false;
}
bool sentClearsTail(const NGHolder &g, const ue2::unordered_map<NFAVertex, u32> ®ion_map, const NGHolder &sent, u32 last_head_region, u32 *bad_region) { /* if a subsequent match from the prefix clears the rest of the pattern * we can just keep track of the last match of the prefix. * To see if this property holds, we could: * * 1A: turn on all states in the tail and run all strings that may * match the prefix past the tail, if we are still in any states then * this property does not hold. * * 1B: we turn on the initial states of the tail and run any strings which * may finish any partial matches in the prefix and see if we end up with * anything which would also imply that this property does not hold. * * OR * * 2: we just turn everything and run the prefix inputs past it and see what * we are left with. I think that is equivalent to scheme 1 and is easier to * implement. TODO: ponder * * Anyway, we are going with scheme 2 until further notice. */ u32 first_bad_region = ~0U; set<NFAVertex> states; /* turn on all states */ DEBUG_PRINTF("region %u is cutover\n", last_head_region); for (auto v : vertices_range(g)) { if (v != g.accept && v != g.acceptEod) { states.insert(v); } } for (UNUSED auto v : states) { DEBUG_PRINTF("start state: %u\n", g[v].index); } /* run the prefix the main graph */ execute_graph(g, sent, &states); /* .. and check if we are left with anything in the tail region */ for (auto v : states) { if (v == g.start || v == g.startDs) { continue; /* not in tail */ } DEBUG_PRINTF("v %u is still on\n", g[v].index); assert(v != g.accept && v != g.acceptEod); /* no cr */ assert(contains(region_map, v)); const u32 v_region = region_map.at(v); if (v_region > last_head_region) { DEBUG_PRINTF("bailing, %u > %u\n", v_region, last_head_region); first_bad_region = min(first_bad_region, v_region); } } if (first_bad_region != ~0U) { DEBUG_PRINTF("first bad region is %u\n", first_bad_region); *bad_region = first_bad_region; return false; } return true; }
bool somMayGoBackwards(NFAVertex u, const NGHolder &g, const ue2::unordered_map<NFAVertex, u32> ®ion_map, smgb_cache &cache) { /* Need to ensure all matches of the graph g up to u contain no infixes * which are also matches of the graph to u. * * This is basically the same as firstMatchIsFirst except we g is not * always a dag. As we haven't gotten around to writing an execute_graph * that operates on general graphs, we take some (hopefully) conservative * short cuts. * * Note: if the u can be jumped we will take jump edges * into account as a possibility of som going backwards * * TODO: write a generalised ng_execute_graph/make this less hacky */ assert(&g == &cache.g); if (contains(cache.smgb, u)) { return cache.smgb[u]; } DEBUG_PRINTF("checking if som can go backwards on %u\n", g[u].index); set<NFAEdge> be; BackEdges<set<NFAEdge>> backEdgeVisitor(be); depth_first_search( g.g, visitor(backEdgeVisitor) .root_vertex(g.start) .vertex_index_map(get(&NFAGraphVertexProps::index, g.g))); bool rv; if (0) { exit: DEBUG_PRINTF("using cached result\n"); cache.smgb[u] = rv; return rv; } assert(contains(region_map, u)); const u32 u_region = region_map.at(u); for (const auto &e : be) { NFAVertex s = source(e, g); NFAVertex t = target(e, g); /* only need to worry about big cycles including/before u */ DEBUG_PRINTF("back edge %u %u\n", g[s].index, g[t].index); if (s != t && region_map.at(s) <= u_region) { DEBUG_PRINTF("eek big cycle\n"); rv = true; /* big cycle -> eek */ goto exit; } } ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy; NGHolder c_g; cloneHolder(c_g, g, &orig_to_copy); for (NFAVertex v : vertices_range(g)) { if (!is_virtual_start(v, g)) { continue; } NFAVertex c_v = orig_to_copy[v]; orig_to_copy[v] = c_g.startDs; for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) { add_edge_if_not_present(c_g.startDs, c_w, c_g); } clear_vertex(c_v, c_g); } NFAVertex c_u = orig_to_copy[u]; clear_in_edges(c_g.acceptEod, c_g); add_edge(c_g.accept, c_g.acceptEod, c_g); 
clear_in_edges(c_g.accept, c_g); clear_out_edges(c_u, c_g); if (hasSelfLoop(u, g)) { add_edge(c_u, c_u, c_g); } add_edge(c_u, c_g.accept, c_g); set<NFAVertex> u_succ; insert(&u_succ, adjacent_vertices(u, g)); u_succ.erase(u); for (auto t : inv_adjacent_vertices_range(u, g)) { if (t == u) { continue; } for (auto v : adjacent_vertices_range(t, g)) { if (contains(u_succ, v)) { add_edge(orig_to_copy[t], c_g.accept, c_g); break; } } } pruneUseless(c_g); be.clear(); depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start). vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g))); for (const auto &e : be) { NFAVertex s = source(e, c_g); NFAVertex t = target(e, c_g); DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index); if (s != t) { assert(0); DEBUG_PRINTF("eek big cycle\n"); rv = true; /* big cycle -> eek */ goto exit; } } DEBUG_PRINTF("checking acyclic+selfloop graph\n"); rv = !firstMatchIsFirst(c_g); DEBUG_PRINTF("som may regress? %d\n", (int)rv); goto exit; }
/**
 * Merges the graph vic into dest, given that the first common_len states of
 * both graphs (as listed in vicStateMap / destStateMap) form a shared prefix.
 *
 * Vertices in the shared prefix are mapped onto their dest counterparts and
 * only contribute their reports. Vertices past the prefix are copied into
 * dest and given fresh state indices, and then the edges of vic are added
 * wherever dest does not already have them. Finally the edges and vertices
 * of dest are renumbered.
 */
static never_inline
void mergeNfa(NGHolder &dest, vector<NFAVertex> &destStateMap,
              ue2::unordered_map<NFAVertex, u32> &dest_state_ids,
              NGHolder &vic, vector<NFAVertex> &vicStateMap,
              size_t common_len) {
    // Mapping from vertices of vic to the matching vertices of dest.
    map<NFAVertex, NFAVertex> vic_to_dest;

    vic_to_dest[vic.start] = dest.start;
    vic_to_dest[vic.startDs] = dest.startDs;
    vic_to_dest[vic.accept] = dest.accept;
    vic_to_dest[vic.acceptEod] = dest.acceptEod;
    vic_to_dest[nullptr] = nullptr;

    // First state index available for newly added vertices.
    u32 next_state = countStates(dest, dest_state_ids);

    // Copies the reports of a vic vertex onto a dest vertex.
    const auto merge_reports = [&dest, &vic](NFAVertex from, NFAVertex to) {
        const auto &src_reports = vic[from].reports;
        dest[to].reports.insert(src_reports.begin(), src_reports.end());
    };

    // Shared prefix: pair each vic vertex with the dest vertex holding the
    // same state index and fold in its reports, if any.
    for (u32 i = 0; i < common_len; i++) {
        NFAVertex vic_v = vicStateMap[i];
        NFAVertex dest_v = destStateMap[i];
        vic_to_dest[vic_v] = dest_v;
        merge_reports(vic_v, dest_v);
    }

    // Everything past the prefix becomes a new vertex in dest, numbered from
    // next_state upwards.
    for (u32 i = common_len; i < vicStateMap.size(); i++) {
        NFAVertex vic_v = vicStateMap[i];

        if (is_special(vic_v, vic)) {
            // Dest already has start vertices, just merge the reports.
            u32 special_idx = vic[vic_v].index;
            NFAVertex dest_v = dest.getSpecialVertex(special_idx);
            merge_reports(vic_v, dest_v);
            continue;
        }

        NFAVertex dest_v = add_vertex(vic[vic_v], dest);
        dest_state_ids[dest_v] = next_state++;
        vic_to_dest[vic_v] = dest_v;
    }

    /* add edges */
    DEBUG_PRINTF("common_len=%zu\n", common_len);
    for (const auto &e : edges_range(vic)) {
        NFAVertex src_old = source(e, vic);
        NFAVertex tgt_old = target(e, vic);
        NFAVertex u = vic_to_dest[src_old];
        NFAVertex v = vic_to_dest[tgt_old];
        bool uspecial = is_special(u, dest);
        bool vspecial = is_special(v, dest);

        // Stylised edges between specials that dest already has need no
        // further work.
        if (uspecial && vspecial && edge(u, v, dest).second) {
            continue;
        }

        // The edge belongs to the common region when v's state index is low
        // enough; for a special v (an accept) u's state index is consulted as
        // well.
        assert(contains(dest_state_ids, v));
        bool in_common_region = dest_state_ids.at(v) < common_len;
        if (vspecial && dest_state_ids.at(u) < common_len) {
            in_common_region = true;
        }

        DEBUG_PRINTF("adding idx=%u (state %u) -> idx=%u (state %u)%s\n",
                     dest[u].index, dest_state_ids.at(u),
                     dest[v].index, dest_state_ids.at(v),
                     in_common_region ? " [common]" : "");

        if (in_common_region) {
            if (is_special(v, dest)) {
                assert(is_any_accept(v, dest));
                // If the edge exists in both graphs, skip it.
                if (edge(u, v, dest).second) {
                    DEBUG_PRINTF("skipping common edge to accept\n");
                    continue;
                }
            } else {
                DEBUG_PRINTF("skipping common edge\n");
                assert(edge(u, v, dest).second);
                // Should never merge edges with different top values.
                assert(vic[e].top == dest[edge(u, v, dest).first].top);
                continue;
            }
        }

        assert(!edge(u, v, dest).second);
        add_edge(u, v, vic[e], dest);
    }

    dest.renumberEdges();
    dest.renumberVertices();
}
/**
 * Computes the length of the common prefix region of two graphs: the largest
 * n such that the first n states of ga and gb (ordered by state index) can be
 * treated as the same.
 *
 * Starts from the upper bound supplied by cplCommonReachAndSimple() and
 * lowers it until (a) no vertex in the region has an in-edge from a state
 * outside the region and (b) paired vertices have matching out-edges, with
 * matching tops, to vertices inside the region. Returns 0 when no region
 * survives.
 */
u32 commonPrefixLength(const NGHolder &ga,
                       const ue2::unordered_map<NFAVertex, u32> &a_state_ids,
                       const NGHolder &gb,
                       const ue2::unordered_map<NFAVertex, u32> &b_state_ids) {
    vector<NFAVertex> a = getSortedVA(ga, a_state_ids);
    vector<NFAVertex> b = getSortedVA(gb, b_state_ids);

    /* upper bound on the common region based on local properties */
    u32 cpl = cplCommonReachAndSimple(ga, a, gb, b);
    DEBUG_PRINTF("cpl upper bound %u\n", cpl);

    /* true if v has a predecessor whose state index is at or past limit */
    const auto fed_from_outside =
        [](NFAVertex v, const NGHolder &h,
           const ue2::unordered_map<NFAVertex, u32> &ids, u32 limit) {
            for (auto u : inv_adjacent_vertices_range(v, h)) {
                u32 state_id = ids.at(u);
                if (state_id != NO_STATE && state_id >= limit) {
                    return true;
                }
            }
            return false;
        };

    while (cpl > 0) {
        bool ok = true;

        /* shrink the region based on in-edges from outside the region; the
         * a side is consulted first and the b side only if a is clean */
        for (size_t j = cpl; j > 0; j--) {
            if (fed_from_outside(a[j - 1], ga, a_state_ids, cpl)
                || fed_from_outside(b[j - 1], gb, b_state_ids, cpl)) {
                cpl = j - 1;
                DEBUG_PRINTF("lowering max to %u\n", cpl);
            }
        }

        /* Ensure that every pair of vertices has same out-edges to vertices
         * in the region. */
        for (size_t i = 0; ok && i < cpl; i++) {
            size_t a_edges_in_region = 0;
            size_t b_edges_in_region = 0;

            NFAGraph::out_edge_iterator ei, ee;
            for (tie(ei, ee) = out_edges(a[i], ga); ok && ei != ee; ++ei) {
                u32 sid = a_state_ids.at(target(*ei, ga));
                if (sid == NO_STATE || sid >= cpl) {
                    continue;
                }

                a_edges_in_region++;

                /* gb must have the corresponding edge ... */
                NFAEdge b_edge;
                bool has_b_edge;
                tie(b_edge, has_b_edge) = edge(b[i], b[sid], gb);

                if (!has_b_edge) {
                    cpl = i;
                    ok = false;
                    DEBUG_PRINTF("lowering max to %u due to edge %zu->%u\n",
                                 cpl, i, sid);
                    break;
                }

                /* ... and it must carry the same top */
                if (ga[*ei].top != gb[b_edge].top) {
                    cpl = i;
                    ok = false;
                    DEBUG_PRINTF("tops don't match on edge %zu->%u\n",
                                 i, sid);
                }
            }

            NFAGraph::adjacency_iterator ai, ae;
            for (tie(ai, ae) = adjacent_vertices(b[i], gb); ok && ai != ae;
                 ++ai) {
                u32 sid = b_state_ids.at(*ai);
                if (sid == NO_STATE || sid >= cpl) {
                    continue;
                }
                b_edges_in_region++;
            }

            if (a_edges_in_region != b_edges_in_region) {
                cpl = i;
                DEBUG_PRINTF("lowering max to %u due to a,b count "
                             "(a_count=%zu, b_count=%zu)\n", cpl,
                             a_edges_in_region, b_edges_in_region);
                ok = false;
            }
        }

        if (ok) {
            DEBUG_PRINTF("survived checks, returning cpl %u\n", cpl);
            return cpl;
        }
    }

    DEBUG_PRINTF("failed to find any common region\n");
    return 0;
}