static void getHighlanderReporters(const NGHolder &g, const NFAVertex accept, const ReportManager &rm, set<NFAVertex> &verts) { for (auto v : inv_adjacent_vertices_range(accept, g)) { if (v == g.accept) { continue; } const auto &reports = g[v].reports; if (reports.empty()) { assert(0); continue; } // Must be _all_ highlander callback reports. for (auto report : reports) { const Report &ir = rm.getReport(report); if (ir.ekey == INVALID_EKEY || ir.type != EXTERNAL_CALLBACK) { goto next_vertex; } // If there's any bounds, these are handled outside the NFA and // probably shouldn't be pre-empted. if (ir.hasBounds()) { goto next_vertex; } } verts.insert(v); next_vertex: continue; } }
static void getBackwardReach(const NGHolder &g, ReportID report, u32 lag, map<s32, CharReach> &look) { ue2::flat_set<NFAVertex> curr, next; for (auto v : inv_adjacent_vertices_range(g.accept, g)) { if (contains(g[v].reports, report)) { curr.insert(v); } } for (u32 i = lag + 1; i <= MAX_BACK_LEN; i++) { if (curr.empty() || contains(curr, g.start) || contains(curr, g.startDs)) { break; } next.clear(); CharReach cr; for (auto v : curr) { assert(!is_special(v, g)); cr |= g[v].char_reach; insert(&next, inv_adjacent_vertices(v, g)); } assert(cr.any()); look[0 - i] |= cr; curr.swap(next); } }
/**
 * \brief Set, in \p pred, the bit for the index of every non-special
 * predecessor of \p v.
 */
static
void buildPred(NFAStateSet &pred, const NGHolder &g, NFAVertex v) {
    for (auto src : inv_adjacent_vertices_range(v, g)) {
        if (is_special(src, g)) {
            continue; // special vertices are not recorded
        }
        pred.set(g[src].index);
    }
}
/** \brief True if at least one predecessor of \p v is a member of \p s. */
static
bool hasPredInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
    bool found = false;
    for (auto pred : inv_adjacent_vertices_range(v, g)) {
        if (contains(s, pred)) {
            found = true;
            break;
        }
    }
    return found;
}
/**
 * \brief True if \p v has exactly one predecessor that is neither a special
 * vertex nor \p v itself (i.e. self-loops are ignored).
 */
static
bool inIsIrreducible(NFAVertex &v, const NGHolder &g) {
    unsigned count = 0;
    for (auto pred : inv_adjacent_vertices_range(v, g)) {
        if (pred == v || is_special(pred, g)) {
            continue;
        }
        ++count;
    }
    return count == 1;
}
static bool getTransientPrefixReach(const NGHolder &g, u32 lag, map<s32, CharReach> &look) { if (in_degree(g.accept, g) != 1) { DEBUG_PRINTF("more than one accept\n"); return false; } // Must be a floating chain wired to startDs. if (!hasSingleFloatingStart(g)) { DEBUG_PRINTF("not a single floating start\n"); return false; } NFAVertex v = *(inv_adjacent_vertices(g.accept, g).first); u32 i = lag + 1; while (v != g.startDs) { DEBUG_PRINTF("i=%u, v=%u\n", i, g[v].index); if (is_special(v, g)) { DEBUG_PRINTF("special\n"); return false; } look[0 - i] = g[v].char_reach; NFAVertex next = NGHolder::null_vertex(); for (auto u : inv_adjacent_vertices_range(v, g)) { if (u == g.start) { continue; // Benign, checked by hasSingleFloatingStart } if (next == NGHolder::null_vertex()) { next = u; continue; } DEBUG_PRINTF("branch\n"); return false; } if (next == NGHolder::null_vertex() || next == v) { DEBUG_PRINTF("no predecessor or only self-loop\n"); // This graph is malformed -- all vertices in a graph that makes it // to this analysis should have predecessors. assert(0); return false; } v = next; i++; } DEBUG_PRINTF("done\n"); return true; }
static bool requiresDedupe(const NGHolder &h, const flat_set<ReportID> &reports, const Grey &grey) { /* TODO: tighten */ NFAVertex seen_vert = NGHolder::null_vertex(); for (auto v : inv_adjacent_vertices_range(h.accept, h)) { if (has_intersection(h[v].reports, reports)) { if (seen_vert != NGHolder::null_vertex()) { return true; } seen_vert = v; } } for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { if (has_intersection(h[v].reports, reports)) { if (seen_vert != NGHolder::null_vertex()) { return true; } seen_vert = v; } } if (seen_vert) { /* if the reporting vertex is part of of a terminal repeat, the * construction process may reform the graph splitting it into two * vertices (pos, cyclic) and hence require dedupe */ vector<GraphRepeatInfo> repeats; findRepeats(h, grey.minExtBoundedRepeatSize, &repeats); for (const auto &repeat : repeats) { if (find(repeat.vertices.begin(), repeat.vertices.end(), seen_vert) != repeat.vertices.end()) { return true; } } } return false; }
/**
 * \brief Find the (min, max) length of any match for the given holder.
 *
 * For every vertex feeding accept, and every vertex other than accept feeding
 * acceptEod, takes its distance range from getDistancesFromSOM(), shifts it
 * by the offset adjustment returned by getMinMaxOffsetAdjust(), and unions
 * the result into the returned range.
 */
static
DepthMinMax findMatchLengths(const ReportManager &rm, const NGHolder &g) {
    const vector<DepthMinMax> depths = getDistancesFromSOM(g);

    // Gather the reporting vertices first: accept's predecessors, followed by
    // acceptEod's predecessors (minus the accept -> acceptEod edge).
    vector<NFAVertex> reporters;
    for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
        reporters.push_back(v);
    }
    for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) {
        if (v != g.accept) {
            reporters.push_back(v);
        }
    }

    DepthMinMax match_depths;
    for (auto v : reporters) {
        u32 idx = g[v].index;
        DepthMinMax d = depths[idx]; // copy, adjusted below
        pair<s32, s32> adj = getMinMaxOffsetAdjust(rm, g, v);
        DEBUG_PRINTF("vertex %u: depths=%s, adj=[%d,%d]\n", idx,
                     d.str().c_str(), adj.first, adj.second);
        d.min += adj.first;
        d.max += adj.second;
        match_depths = unionDepthMinMax(match_depths, d);
    }

    DEBUG_PRINTF("match_depths=%s\n", match_depths.str().c_str());

    assert(match_depths.min.is_reachable());
    assert(match_depths.max.is_reachable());
    return match_depths;
}
/** \brief Replace the graph's reports with new reports that specify bounds. */ static void updateReportBounds(ReportManager &rm, NGWrapper &g, NFAVertex accept, set<NFAVertex> &done) { for (auto v : inv_adjacent_vertices_range(accept, g)) { // Don't operate on g.accept itself. if (v == g.accept) { assert(accept == g.acceptEod); continue; } // Don't operate on a vertex we've already done. if (contains(done, v)) { continue; } done.insert(v); flat_set<ReportID> new_reports; auto &reports = g[v].reports; for (auto id : reports) { Report ir = rm.getReport(id); // make a copy assert(!ir.hasBounds()); // Note that we need to cope with offset adjustment here. ir.minOffset = g.min_offset - ir.offsetAdjust; if (g.max_offset == MAX_OFFSET) { ir.maxOffset = MAX_OFFSET; } else { ir.maxOffset = g.max_offset - ir.offsetAdjust; } assert(ir.maxOffset >= ir.minOffset); ir.minLength = g.min_length; if (g.min_length && !g.som) { ir.quashSom = true; } DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, " "min_length=%llu\n", id, ir.minOffset, ir.maxOffset, ir.minLength); new_reports.insert(rm.getInternalId(ir)); } DEBUG_PRINTF("swapping reports on vertex %u\n", g[v].index); reports.swap(new_reports); } }
static void findDerivedSquashers(const NGHolder &g, const vector<NFAVertex> &vByIndex, const PostDomTree &pdom_tree, const NFAStateSet &init, map<NFAVertex, NFAStateSet> *squash, som_type som, const vector<DepthMinMax> &som_depths, const ue2::unordered_map<NFAVertex, u32> ®ion_map, smgb_cache &cache) { deque<NFAVertex> remaining; for (const auto &m : *squash) { remaining.push_back(m.first); } while (!remaining.empty()) { NFAVertex v = remaining.back(); remaining.pop_back(); for (auto u : inv_adjacent_vertices_range(v, g)) { if (is_special(u, g)) { continue; } if (g[v].char_reach != g[u].char_reach) { continue; } if (out_degree(u, g) != 1) { continue; } NFAStateSet u_squash(init.size()); u32 u_index = g[u].index; buildSquashMask(u_squash, g, u, g[u].char_reach, init, vByIndex, pdom_tree, som, som_depths, region_map, cache); u_squash.set(u_index); /* never clear ourselves */ if ((~u_squash).any()) { // i.e. some bits unset in mask DEBUG_PRINTF("%u is an upstream squasher of %u\n", u_index, g[v].index); (*squash)[u] = u_squash; remaining.push_back(u); } } } }
/**
 * Remove any edges from vertices that generate accepts (for Highlander
 * graphs).
 *
 * Only applies when every report in the graph is an external report with a
 * valid exhaustion key and no bounds; otherwise the graph is left untouched.
 * For each non-special predecessor of accept, all out-edges whose target is
 * not an accept vertex are removed, after which pruneUseless() is run.
 */
void pruneHighlanderAccepts(NGHolder &g, const ReportManager &rm) {
    // Safety check: all reports must be simple exhaustible reports, or this is
    // not safe. This optimisation should be called early enough that no
    // internal reports have been added.
    for (auto report_id : all_reports(g)) {
        const Report &ir = rm.getReport(report_id);
        const bool prunable = ir.ekey != INVALID_EKEY && !ir.hasBounds() &&
                              isExternalReport(ir);
        if (!prunable) {
            DEBUG_PRINTF("report %u is not external highlander with "
                         "no bounds\n", report_id);
            return;
        }
    }

    vector<NFAEdge> dead;
    for (auto u : inv_adjacent_vertices_range(g.accept, g)) {
        if (!is_special(u, g)) {
            // Once u has reported, nothing downstream of it other than the
            // accepts matters: collect every non-accept out-edge.
            for (const auto &e : out_edges_range(u, g)) {
                if (is_any_accept(target(e, g), g)) {
                    continue;
                }
                dead.push_back(e);
            }
        }
    }

    if (dead.empty()) {
        return;
    }

    DEBUG_PRINTF("found %zu removable edges due to single match\n",
                 dead.size());
    remove_edges(dead, g);
    pruneUseless(g);
}
/**
 * \brief Contract vertex \p v out of the graph: connect each of its
 * predecessors directly to each of its successors, then clear \p v.
 *
 * \param g         graph to modify
 * \param v         vertex to contract; left in the graph with no edges
 * \param all_edges set of (source, target) pairs used to avoid adding
 *                  duplicate edges; updated with every edge added here
 */
static
void contractVertex(NGHolder &g, NFAVertex v,
                    ue2::unordered_set<pair<NFAVertex, NFAVertex>> &all_edges) {
    for (auto src : inv_adjacent_vertices_range(v, g)) {
        if (src != v) { // ignore the self-edge on the in side
            for (auto dst : adjacent_vertices_range(v, g)) {
                // Ignore the self-edge on the out side, and construct edge
                // (src, dst) only if it doesn't already exist. The all_edges
                // container is consulted rather than the graph, as checking
                // existence inside the graph is expensive when the endpoints
                // have large degree.
                const bool is_new_edge =
                    dst != v && all_edges.emplace(src, dst).second;
                if (is_new_edge) {
                    add_edge(src, dst, g);
                }
            }
        }
    }

    // Note that edges to/from v will remain in all_edges.
    clear_vertex(v, g);
}
static void findSeeds(const NGHolder &h, const bool som, vector<NFAVertex> *seeds) { set<NFAVertex> bad; /* from zero-width asserts near accepts, etc */ for (auto v : inv_adjacent_vertices_range(h.accept, h)) { const CharReach &cr = h[v].char_reach; if (!isutf8ascii(cr) && !isutf8start(cr)) { bad.insert(v); } } for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { const CharReach &cr = h[v].char_reach; if (!isutf8ascii(cr) && !isutf8start(cr)) { bad.insert(v); } } // we want to be careful with asserts connected to starts // as well as they may not finish a code point for (auto v : vertices_range(h)) { if (is_virtual_start(v, h)) { bad.insert(v); insert(&bad, adjacent_vertices(v, h)); } } /* we cannot handle vertices connected to accept as would report matches in * the middle of codepoints. acceptEod is not a problem as the input must * end at a codepoint boundary */ bad.insert(h.accept); // If we're in SOM mode, we don't want to mess with vertices that have a // direct edge from startDs. if (som) { insert(&bad, adjacent_vertices(h.startDs, h)); } set<NFAVertex> already_seeds; /* already marked as seeds */ for (auto v : vertices_range(h)) { const CharReach &cr = h[v].char_reach; if (!isutf8ascii(cr) || !hasSelfLoop(v, h)) { continue; } if (hasSuccInSet(h, v, bad)) { continue; } // Skip vertices that are directly connected to other vertices already // in the seeds list: we can't collapse two of these directly next to // each other. if (hasPredInSet(h, v, already_seeds) || hasSuccInSet(h, v, already_seeds)) { continue; } DEBUG_PRINTF("%zu is a seed\n", h[v].index); seeds->push_back(v); already_seeds.insert(v); } }
bool somMayGoBackwards(NFAVertex u, const NGHolder &g, const ue2::unordered_map<NFAVertex, u32> ®ion_map, smgb_cache &cache) { /* Need to ensure all matches of the graph g up to u contain no infixes * which are also matches of the graph to u. * * This is basically the same as firstMatchIsFirst except we g is not * always a dag. As we haven't gotten around to writing an execute_graph * that operates on general graphs, we take some (hopefully) conservative * short cuts. * * Note: if the u can be jumped we will take jump edges * into account as a possibility of som going backwards * * TODO: write a generalised ng_execute_graph/make this less hacky */ assert(&g == &cache.g); if (contains(cache.smgb, u)) { return cache.smgb[u]; } DEBUG_PRINTF("checking if som can go backwards on %u\n", g[u].index); set<NFAEdge> be; BackEdges<set<NFAEdge>> backEdgeVisitor(be); depth_first_search( g.g, visitor(backEdgeVisitor) .root_vertex(g.start) .vertex_index_map(get(&NFAGraphVertexProps::index, g.g))); bool rv; if (0) { exit: DEBUG_PRINTF("using cached result\n"); cache.smgb[u] = rv; return rv; } assert(contains(region_map, u)); const u32 u_region = region_map.at(u); for (const auto &e : be) { NFAVertex s = source(e, g); NFAVertex t = target(e, g); /* only need to worry about big cycles including/before u */ DEBUG_PRINTF("back edge %u %u\n", g[s].index, g[t].index); if (s != t && region_map.at(s) <= u_region) { DEBUG_PRINTF("eek big cycle\n"); rv = true; /* big cycle -> eek */ goto exit; } } ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy; NGHolder c_g; cloneHolder(c_g, g, &orig_to_copy); for (NFAVertex v : vertices_range(g)) { if (!is_virtual_start(v, g)) { continue; } NFAVertex c_v = orig_to_copy[v]; orig_to_copy[v] = c_g.startDs; for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) { add_edge_if_not_present(c_g.startDs, c_w, c_g); } clear_vertex(c_v, c_g); } NFAVertex c_u = orig_to_copy[u]; clear_in_edges(c_g.acceptEod, c_g); add_edge(c_g.accept, c_g.acceptEod, c_g); 
clear_in_edges(c_g.accept, c_g); clear_out_edges(c_u, c_g); if (hasSelfLoop(u, g)) { add_edge(c_u, c_u, c_g); } add_edge(c_u, c_g.accept, c_g); set<NFAVertex> u_succ; insert(&u_succ, adjacent_vertices(u, g)); u_succ.erase(u); for (auto t : inv_adjacent_vertices_range(u, g)) { if (t == u) { continue; } for (auto v : adjacent_vertices_range(t, g)) { if (contains(u_succ, v)) { add_edge(orig_to_copy[t], c_g.accept, c_g); break; } } } pruneUseless(c_g); be.clear(); depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start). vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g))); for (const auto &e : be) { NFAVertex s = source(e, c_g); NFAVertex t = target(e, c_g); DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index); if (s != t) { assert(0); DEBUG_PRINTF("eek big cycle\n"); rv = true; /* big cycle -> eek */ goto exit; } } DEBUG_PRINTF("checking acyclic+selfloop graph\n"); rv = !firstMatchIsFirst(c_g); DEBUG_PRINTF("som may regress? %d\n", (int)rv); goto exit; }
/** If the pattern has a min_length and is of "ratchet" form with one unbounded
 * repeat, that repeat can become a bounded repeat.
 *
 *     /foo.*bar/{min_length=100}  -->  /foo.{94,}bar/
 *
 * The graph must be a simple chain: start -> ... -> cyclic vertex -> ... ->
 * accept, with a single cyclic vertex. If the chain is already at least
 * min_length wide, min_length is simply cleared. Otherwise, extra vertices
 * with the cyclic vertex's reach are spliced in ahead of the cyclic vertex.
 *
 * \param rm report manager, used to look up the reports' offset adjustment
 * \param g  graph wrapper; modified in place on success
 * \return true if min_length was discharged (g.min_length is then zero),
 *         false if the graph is not of a suitable form (g is unchanged).
 */
static
bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) {
    assert(g.min_length);

    if (g.min_length > MAX_MINLENGTH_TO_CONVERT) {
        return false;
    }

    // If the pattern has virtual starts, we probably don't want to touch it.
    if (hasVirtualStarts(g)) {
        DEBUG_PRINTF("virtual starts, bailing\n");
        return false;
    }

    // The graph must contain a single cyclic vertex (other than startDs), and
    // that vertex can have one pred and one successor.
    NFAVertex cyclic = findSingleCyclic(g);
    if (cyclic == NGHolder::null_vertex()) {
        return false;
    }

    // Find the single initial vertex: the successor of start other than
    // startDs.
    // NOTE(review): only the first successor is tested against startDs, and
    // *ai is dereferenced without an end check -- presumably start always has
    // startDs first plus at least one other successor here; confirm.
    NGHolder::adjacency_iterator ai, ae;
    tie(ai, ae) = adjacent_vertices(g.start, g);
    if (*ai == g.startDs) {
        ++ai;
    }
    NFAVertex v = *ai;
    if (++ai != ae) {
        DEBUG_PRINTF("more than one initial vertex\n");
        return false;
    }

    // Number of chain vertices walked (excluding the cyclic vertex itself).
    u32 width = 0;

    // Walk from the start vertex to the cyclic state and ensure we have a
    // chain of vertices.
    while (v != cyclic) {
        DEBUG_PRINTF("vertex %u\n", g[v].index);
        width++;
        tie(ai, ae) = adjacent_vertices(v, g);
        set<NFAVertex> succ(ai, ae);
        if (contains(succ, cyclic)) {
            if (succ.size() == 1) {
                v = cyclic;
            } else if (succ.size() == 2) {
                // Cyclic and jump edge.
                succ.erase(cyclic);
                NFAVertex v2 = *succ.begin();
                // The jump target must also be a successor of the cyclic
                // vertex.
                if (!edge(cyclic, v2, g).second) {
                    DEBUG_PRINTF("bad form\n");
                    return false;
                }
                v = cyclic;
            } else {
                DEBUG_PRINTF("bad form\n");
                return false;
            }
        } else {
            if (succ.size() != 1) {
                DEBUG_PRINTF("bad form\n");
                return false;
            }
            v = *succ.begin();
        }
    }

    // Check the cyclic state is A-OK.
    v = getSoleDestVertex(g, cyclic);
    if (v == NGHolder::null_vertex()) {
        DEBUG_PRINTF("cyclic has more than one successor\n");
        return false;
    }

    // Walk from the cyclic state to an accept and ensure we have a chain of
    // vertices.
    while (!is_any_accept(v, g)) {
        DEBUG_PRINTF("vertex %u\n", g[v].index);
        width++;
        tie(ai, ae) = adjacent_vertices(v, g);
        set<NFAVertex> succ(ai, ae);
        if (succ.size() != 1) {
            DEBUG_PRINTF("bad form\n");
            return false;
        }
        v = *succ.begin();
    }

    // Fold the reports' offset adjustment into the chain width.
    int offsetAdjust = 0;
    if (!hasOffsetAdjust(rm, g, &offsetAdjust)) {
        return false;
    }
    DEBUG_PRINTF("adjusting width by %d\n", offsetAdjust);
    width += offsetAdjust;

    DEBUG_PRINTF("width=%u, vertex %u is cyclic\n", width,
                  g[cyclic].index);

    // The fixed part of the pattern already satisfies min_length: nothing to
    // add to the graph.
    if (width >= g.min_length) {
        DEBUG_PRINTF("min_length=%llu is guaranteed, as width=%u\n",
                      g.min_length, width);
        g.min_length = 0;
        return true;
    }

    // Detach the cyclic vertex from its (non-self) predecessors; they will be
    // reconnected through the new chain below.
    vector<NFAVertex> preds;
    vector<NFAEdge> dead;
    for (auto u : inv_adjacent_vertices_range(cyclic, g)) {
        DEBUG_PRINTF("pred %u\n", g[u].index);
        if (u == cyclic) {
            continue;
        }
        preds.push_back(u);

        // We want to delete the out-edges of each predecessor, but need to
        // make sure we don't delete the startDs self loop.
        for (const auto &e : out_edges_range(u, g)) {
            if (target(e, g) != g.startDs) {
                dead.push_back(e);
            }
        }
    }

    remove_edges(dead, g);

    assert(!preds.empty());

    // NOTE(review): cr refers into g's vertex properties and is read after
    // add_vertex() below; this relies on add_vertex() not invalidating
    // references to existing vertex properties -- confirm.
    const CharReach &cr = g[cyclic].char_reach;

    // Insert (min_length - width - 1) new vertices with the cyclic vertex's
    // reach, chained one after another from the old predecessors.
    for (u32 i = 0; i < g.min_length - width - 1; ++i) {
        v = add_vertex(g);
        g[v].char_reach = cr;

        for (auto u : preds) {
            add_edge(u, v, g);
        }

        preds.clear();
        preds.push_back(v);
    }
    assert(!preds.empty());
    // Reattach the end of the chain (or the original predecessors, if no
    // vertices were added) to the cyclic vertex.
    for (auto u : preds) {
        add_edge(u, cyclic, g);
    }

    g.renumberVertices();
    g.renumberEdges();
    clearReports(g);

    g.min_length = 0;
    return true;
}
/**
 * \brief Compute the length of the common prefix region shared by two graphs.
 *
 * Both graphs' vertices are ordered with getSortedVA(); the result is a count
 * `max` such that the first `max` vertices of each ordering survive the
 * pairwise checks below. An initial upper bound comes from
 * cplCommonReachAndSimple() and is lowered repeatedly until every check
 * passes or it reaches zero.
 *
 * \param ga          first graph
 * \param a_state_ids vertex -> state id for \p ga (NO_STATE for vertices
 *                    without a state)
 * \param gb          second graph
 * \param b_state_ids vertex -> state id for \p gb
 * \return the common prefix length, or 0 if there is no common region.
 */
u32 commonPrefixLength(const NGHolder &ga,
                       const ue2::unordered_map<NFAVertex, u32> &a_state_ids,
                       const NGHolder &gb,
                       const ue2::unordered_map<NFAVertex, u32> &b_state_ids) {
    vector<NFAVertex> a = getSortedVA(ga, a_state_ids);
    vector<NFAVertex> b = getSortedVA(gb, b_state_ids);

    /* upper bound on the common region based on local properties */
    u32 max = cplCommonReachAndSimple(ga, a, gb, b);
    DEBUG_PRINTF("cpl upper bound %u\n", max);

    // Each pass either returns (all checks passed) or strictly lowers max.
    while (max > 0) {
        bool ok = true;

        /* shrink max region based on in-edges from outside the region */
        // Walk positions from the top of the region down; a vertex with a
        // predecessor whose state id is >= max is cut out of the region along
        // with everything above it.
        for (size_t j = max; j > 0; j--) {
            for (auto u : inv_adjacent_vertices_range(a[j - 1], ga)) {
                u32 state_id = a_state_ids.at(u);
                if (state_id != NO_STATE && state_id >= max) {
                    max = j - 1;
                    DEBUG_PRINTF("lowering max to %u\n", max);
                    goto next_vertex;
                }
            }

            for (auto u : inv_adjacent_vertices_range(b[j - 1], gb)) {
                u32 state_id = b_state_ids.at(u);
                if (state_id != NO_STATE && state_id >= max) {
                    max = j - 1;
                    DEBUG_PRINTF("lowering max to %u\n", max);
                    goto next_vertex;
                }
            }

        next_vertex:;
        }

        /* Ensure that every pair of vertices has same out-edges to vertices in
           the region. */
        for (size_t i = 0; ok && i < max; i++) {
            size_t a_count = 0;
            size_t b_count = 0;

            // Every in-region out-edge of a[i] must have a counterpart from
            // b[i] to the vertex at the same position, with the same top.
            NFAGraph::out_edge_iterator ei, ee;
            for (tie(ei, ee) = out_edges(a[i], ga); ok && ei != ee; ++ei) {
                u32 sid = a_state_ids.at(target(*ei, ga));
                if (sid == NO_STATE || sid >= max) {
                    continue;
                }

                a_count++;

                NFAEdge b_edge;
                bool has_b_edge;
                tie(b_edge, has_b_edge) = edge(b[i], b[sid], gb);

                if (!has_b_edge) {
                    max = i;
                    ok = false;
                    DEBUG_PRINTF("lowering max to %u due to edge %zu->%u\n",
                                 max, i, sid);
                    break;
                }

                if (ga[*ei].top != gb[b_edge].top) {
                    max = i;
                    ok = false;
                    DEBUG_PRINTF("tops don't match on edge %zu->%u\n",
                                 i, sid);
                }
            }

            // Count b[i]'s in-region successors; skipped (b_count stays 0) if
            // the check above has already failed.
            NFAGraph::adjacency_iterator ai, ae;
            for (tie(ai, ae) = adjacent_vertices(b[i], gb); ok && ai != ae;
                 ++ai) {
                u32 sid = b_state_ids.at(*ai);
                if (sid == NO_STATE || sid >= max) {
                    continue;
                }

                b_count++;
            }

            // Differing counts mean b[i] has an in-region edge that a[i]
            // lacks (or a failure was already recorded above).
            if (a_count != b_count) {
                max = i;
                DEBUG_PRINTF("lowering max to %u due to a,b count "
                             "(a_count=%zu, b_count=%zu)\n", max, a_count,
                             b_count);
                ok = false;
            }
        }

        if (ok) {
            DEBUG_PRINTF("survived checks, returning cpl %u\n", max);
            return max;
        }
    }

    DEBUG_PRINTF("failed to find any common region\n");
    return 0;
}
/**
 * \brief Try to absorb multi-byte UTF-8 sequences that run alongside the
 * cyclic vertex \p v into \p v's own reach.
 *
 * Looks for "start siblings" (successors of v with the same predecessors as v
 * and a UTF-8 start-byte reach) that lead, through the right number of
 * continuation vertices, to an "end sibling" (a predecessor of v with the
 * same successors as v and reach UTF_CONT_CR). Each such start sibling has
 * its reach merged into v and its edges cleared.
 *
 * \param h graph to modify
 * \param v vertex to expand; per the comments below it is assumed to have a
 *          self-loop
 * \return true if at least one sibling was absorbed.
 */
static
bool expandCyclic(NGHolder &h, NFAVertex v) {
    DEBUG_PRINTF("inspecting %zu\n", h[v].index);
    bool changes = false;

    auto v_preds = preds(v, h);
    auto v_succs = succs(v, h);

    set<NFAVertex> start_siblings;
    set<NFAVertex> end_siblings;

    // Mutable reference: v's reach is widened in place below.
    CharReach &v_cr = h[v].char_reach;

    /* We need to find start vertices which have all of our preds.
     * As we have a self loop, it must be one of our succs. */
    for (auto a : adjacent_vertices_range(v, h)) {
        auto a_preds = preds(a, h);

        if (a_preds == v_preds && isutf8start(h[a].char_reach)) {
            DEBUG_PRINTF("%zu is a start v\n", h[a].index);
            start_siblings.insert(a);
        }
    }

    /* We also need to find full cont vertices which have all our own succs;
     * As we have a self loop, it must be one of our preds. */
    for (auto a : inv_adjacent_vertices_range(v, h)) {
        auto a_succs = succs(a, h);

        if (a_succs == v_succs && h[a].char_reach == UTF_CONT_CR) {
            DEBUG_PRINTF("%zu is a full tail cont\n", h[a].index);
            end_siblings.insert(a);
        }
    }

    for (auto s : start_siblings) {
        // The start byte must lead to exactly one vertex.
        if (out_degree(s, h) != 1) {
            continue;
        }

        const CharReach &cr = h[s].char_reach;
        if (cr.isSubsetOf(UTF_TWO_START_CR)) {
            // Two-byte form: s must lead straight to an end sibling.
            if (end_siblings.find(*adjacent_vertices(s, h).first)
                == end_siblings.end()) {
                DEBUG_PRINTF("%zu is odd\n", h[s].index);
                continue;
            }
        } else if (cr.isSubsetOf(UTF_THREE_START_CR)) {
            // Three-byte form: one full continuation vertex, then an end
            // sibling.
            NFAVertex m = *adjacent_vertices(s, h).first;

            if (h[m].char_reach != UTF_CONT_CR
                || out_degree(m, h) != 1) {
                continue;
            }
            if (end_siblings.find(*adjacent_vertices(m, h).first)
                == end_siblings.end()) {
                DEBUG_PRINTF("%zu is odd\n", h[s].index);
                continue;
            }
        } else if (cr.isSubsetOf(UTF_FOUR_START_CR)) {
            // Four-byte form: two full continuation vertices, then an end
            // sibling.
            NFAVertex m1 = *adjacent_vertices(s, h).first;

            if (h[m1].char_reach != UTF_CONT_CR
                || out_degree(m1, h) != 1) {
                continue;
            }

            NFAVertex m2 = *adjacent_vertices(m1, h).first;

            if (h[m2].char_reach != UTF_CONT_CR
                || out_degree(m2, h) != 1) {
                continue;
            }

            if (end_siblings.find(*adjacent_vertices(m2, h).first)
                == end_siblings.end()) {
                DEBUG_PRINTF("%zu is odd\n", h[s].index);
                continue;
            }
        } else {
            DEBUG_PRINTF("%zu is bad\n", h[s].index);
            continue;
        }

        // Absorb the start byte's reach into v and disconnect s. The rest of
        // its chain is not removed here.
        v_cr |= cr;
        clear_vertex(s, h);
        changes = true;
    }

    if (changes) {
        v_cr |= UTF_CONT_CR; /* we need to add in cont reach */
        v_cr.set(0xc0); /* we can also add in the forbidden bytes as we require
                         * valid unicode data */
        v_cr.set(0xc1);
        v_cr |= CharReach(0xf5, 0xff);
    }

    return changes;
}
/**
 * \brief Decide whether the given set of reports needs dedupe support, i.e.
 * whether the same report could be raised more than once at one offset.
 *
 * Only reports present in live_reports are considered. The reports are mapped
 * to the roles, suffixes, outfixes and puffettes that raise them (via
 * vert_map, suffix_map, outfix_map and puff_map), and true is returned if any
 * of the following holds:
 *  - hasSafeMultiReports() rejects the report set;
 *  - a literal is attached to more than one reporting role (counting the
 *    predecessors of EOD-accept roles), or two such literals could race;
 *  - more than one kind of source (role, suffix, outfix/mpv, boundary EOD
 *    report) can raise the reports, or there is more than one suffix or
 *    outfix;
 *  - a single suffix/outfix is a haig, or its graph/castle itself requires
 *    dedupe.
 *
 * \param reports_in report IDs to check (may include dead reports)
 * \return true if dedupe is required, false otherwise.
 */
bool RoseDedupeAuxImpl::requiresDedupeSupport(
    const flat_set<ReportID> &reports_in) const {
    /* TODO: this could be expanded to check for offset or character
       constraints */

    // We don't want to consider dead reports (tracked by ReportManager but no
    // longer used) for the purposes of assigning dupe keys.
    flat_set<ReportID> reports;
    for (auto id : reports_in) {
        if (contains(live_reports, id)) {
            reports.insert(id);
        }
    }

    DEBUG_PRINTF("live reports: %s\n", as_string_list(reports).c_str());

    const RoseGraph &g = build.g;

    // Track which non-role source kinds have been seen so far; has_outfix is
    // also set for mpv puffettes below.
    bool has_suffix = false;
    bool has_outfix = false;

    if (!hasSafeMultiReports(reports)) {
        DEBUG_PRINTF("multiple reports not safe\n");
        return true;
    }

    // Gather everything that can raise one of these reports.
    set<RoseVertex> roles;
    set<suffix_id> suffixes;
    set<const OutfixInfo *> outfixes;
    set<const raw_puff *> puffettes;
    for (ReportID r : reports) {
        if (contains(vert_map, r)) {
            insert(&roles, vert_map.at(r));
        }
        if (contains(suffix_map, r)) {
            insert(&suffixes, suffix_map.at(r));
        }

        if (contains(outfix_map, r)) {
            insert(&outfixes, outfix_map.at(r));
        }

        if (contains(puff_map, r)) {
            insert(&puffettes, puff_map.at(r));
        }
    }

    /* roles */

    map<u32, u32> lits; // Literal ID -> count of occurrences.

    const bool has_role = !roles.empty();
    for (auto v : roles) {
        for (const auto &lit : g[v].literals) {
            lits[lit]++;
        }
        if (g[v].eod_accept) {
            // Literals plugged into this EOD accept must be taken into account
            // as well.
            for (auto u : inv_adjacent_vertices_range(v, g)) {
                for (const auto &lit : g[u].literals) {
                    lits[lit]++;
                }
            }
        }
    }

    /* literals */

    for (const auto &m : lits) {
        if (m.second > 1) {
            DEBUG_PRINTF("lit %u used by >1 reporting roles\n", m.first);
            return true;
        }
    }

    // Check every unordered pair of distinct literals for a possible race.
    for (auto it = begin(lits); it != end(lits); ++it) {
        const auto &lit1 = build.literals.at(it->first);
        for (auto jt = next(it); jt != end(lits); ++jt) {
            const auto &lit2 = build.literals.at(jt->first);
            if (literalsCouldRace(lit1, lit2)) {
                DEBUG_PRINTF("literals could race\n");
                return true;
            }
        }
    }

    /* suffixes */

    for (const auto &suffix : suffixes) {
        // A second suffix, or a suffix alongside a role, is enough.
        if (has_suffix || has_role) {
            return true; /* scope for badness */
        }

        has_suffix = true;

        /* some lesser suffix engines (nfas, haig, castle) can raise multiple
         * matches for a report id at the same offset if there are multiple
         * report states live. */
        if (suffix.haig()) {
            return true;
        }
        if (suffix.graph() &&
            requiresDedupe(*suffix.graph(), reports, build.cc.grey)) {
            return true;
        }
        if (suffix.castle() && requiresDedupe(*suffix.castle(), reports)) {
            return true;
        }
    }

    /* outfixes */

    for (const auto &outfix_ptr : outfixes) {
        assert(outfix_ptr);
        const OutfixInfo &out = *outfix_ptr;

        if (has_outfix || has_role || has_suffix) {
            return true;
        }
        has_outfix = true;

        if (out.haig()) {
            return true; /* haig may report matches with different SOM at the
                            same offset */
        }

        if (out.holder() &&
            requiresDedupe(*out.holder(), reports, build.cc.grey)) {
            return true;
        }
    }

    /* mpv */
    // Puffettes are treated like outfixes for the purpose of the source-kind
    // check; the puff itself is not inspected.
    for (UNUSED const auto &puff : puffettes) {
        if (has_outfix || has_role || has_suffix) {
            return true;
        }
        has_outfix = true;
    }

    /* boundary */
    if (has_intersection(build.boundary.report_at_eod, reports)) {
        if (has_outfix || has_role || has_suffix) {
            return true;
        }
    }

    return false;
}
/**
 * \brief Remove simple exhaustible reports (and self-loops) that are made
 * redundant by other reporters in the graph.
 *
 * Two reductions are applied to vertices that raise at least one simple
 * exhaustible report:
 *  - a simple exhaustible report is dropped from a vertex if
 *    isDominatedByReporter() holds for it; a vertex left with no reports has
 *    its edges to accept and acceptEod removed;
 *  - the self-loop of a vertex is removed if
 *    hasOnlySelfLoopAndExhaustibleAccepts() holds for it.
 * If any edge was removed, pruneUseless() is run and edges are renumbered.
 */
void pruneHighlanderDominated(NGHolder &g, const ReportManager &rm) {
    // Collect every vertex, feeding either accept, that has at least one
    // simple exhaustible report.
    vector<NFAVertex> reporters;
    const NFAVertex accepts[] = {g.accept, g.acceptEod};
    for (NFAVertex acc : accepts) {
        for (auto v : inv_adjacent_vertices_range(acc, g)) {
            for (const auto &report_id : g[v].reports) {
                if (isSimpleExhaustible(rm.getReport(report_id))) {
                    reporters.push_back(v);
                    break;
                }
            }
        }
    }

    if (reporters.empty()) {
        return;
    }

    // A vertex connected to both accepts was collected twice; dedupe.
    sort(begin(reporters), end(reporters), make_index_ordering(g));
    reporters.erase(unique(begin(reporters), end(reporters)), end(reporters));

    DEBUG_PRINTF("%zu vertices have simple exhaustible reports\n",
                 reporters.size());

    const auto &dom = findDominators(g);

    bool modified = false;

    // If a reporter vertex is dominated by another with the same report, we
    // can remove that report; if all reports are removed, we can remove the
    // vertex entirely.
    for (const auto v : reporters) {
        // Iterate over a snapshot, since the live set is erased from below.
        const auto snapshot = g[v].reports;
        for (const auto &report_id : snapshot) {
            if (isSimpleExhaustible(rm.getReport(report_id)) &&
                isDominatedByReporter(g, dom, v, report_id)) {
                DEBUG_PRINTF("removed dominated report %u from vertex %u\n",
                             report_id, g[v].index);
                g[v].reports.erase(report_id);
            }
        }

        if (!g[v].reports.empty()) {
            continue;
        }

        DEBUG_PRINTF("removed edges to accepts from %u, no reports left\n",
                     g[v].index);
        remove_edge(v, g.accept, g);
        remove_edge(v, g.acceptEod, g);
        modified = true;
    }

    // If a reporter vertex has a self-loop, but otherwise only leads to accept
    // (note: NOT acceptEod) and has simple exhaustible reports, we can delete
    // the self-loop.
    for (const auto v : reporters) {
        if (!hasOnlySelfLoopAndExhaustibleAccepts(g, rm, v)) {
            continue;
        }
        remove_edge(v, v, g);
        modified = true;
        DEBUG_PRINTF("removed self-loop on %u\n", g[v].index);
    }

    if (!modified) {
        return;
    }

    pruneUseless(g);

    // We may have only removed self-loops, in which case pruneUseless wouldn't
    // renumber, so we do edge renumbering explicitly here.
    g.renumberEdges();
}