static bool hasSingleFloatingStart(const NGHolder &g) { NFAVertex initial = NGHolder::null_vertex(); for (auto v : adjacent_vertices_range(g.startDs, g)) { if (v == g.startDs) { continue; } if (initial != NGHolder::null_vertex()) { DEBUG_PRINTF("more than one start\n"); return false; } initial = v; } if (initial == NGHolder::null_vertex()) { DEBUG_PRINTF("no floating starts\n"); return false; } // Anchored start must have no successors other than startDs and initial. for (auto v : adjacent_vertices_range(g.start, g)) { if (v != initial && v != g.startDs) { DEBUG_PRINTF("anchored start\n"); return false; } } return true; }
/**
 * \brief Resolve POS_FLAG_MULTILINE_START on the successors of start.
 *
 * For each flagged vertex, every edge into accept is replaced by a path
 * through a fresh vertex with full character reach, so that ^ doesn't match a
 * trailing newline. The flag is cleared on the vertex once handled.
 */
static
void checkForMultilineStart(ReportManager &rm, NGWrapper &g) {
    vector<NFAEdge> accept_edges;

    for (auto v : adjacent_vertices_range(g.start, g)) {
        if ((g[v].assert_flags & POS_FLAG_MULTILINE_START) == 0) {
            continue;
        }

        DEBUG_PRINTF("mls %u %08x\n", g[v].index, g[v].assert_flags);

        /* There may be more than one multi-line start; remember each of their
         * edges into accept so a dummy dot can be interposed below. */
        for (const auto &e : out_edges_range(v, g)) {
            if (target(e, g) != g.accept) {
                continue;
            }
            accept_edges.push_back(e);
        }

        /* the assert is now resolved, so drop the flag */
        g[v].assert_flags &= ~POS_FLAG_MULTILINE_START;
    }

    for (const auto &e : accept_edges) {
        NFAVertex dot = add_vertex(g);
        g[dot].char_reach.setall();
        setReportId(rm, g, dot, -1);
        /* the new edge into the dot inherits the old edge's properties */
        add_edge(source(e, g), dot, g[e], g);
        add_edge(dot, g.accept, g);
    }

    remove_edges(accept_edges, g);
}
/** \brief Set, in \p succ, the index bit of every non-special successor of
 * \p v. Bits already set in \p succ are left untouched. */
static
void buildSucc(NFAStateSet &succ, const NGHolder &g, NFAVertex v) {
    for (auto next : adjacent_vertices_range(v, g)) {
        if (is_special(next, g)) {
            continue;
        }
        succ.set(g[next].index);
    }
}
/** \brief True if any successor of the anchored start carries
 * POS_FLAG_VIRTUAL_START in its assert flags. */
static
bool hasVirtualStarts(const NGHolder &g) {
    for (auto succ : adjacent_vertices_range(g.start, g)) {
        const bool is_virtual =
            (g[succ].assert_flags & POS_FLAG_VIRTUAL_START) != 0;
        if (is_virtual) {
            return true;
        }
    }
    return false;
}
/** \brief True if at least one successor of \p v is a member of \p s. */
static
bool hasSuccInSet(const NGHolder &g, NFAVertex v, const set<NFAVertex> &s) {
    for (auto next : adjacent_vertices_range(v, g)) {
        if (s.find(next) != s.end()) {
            return true;
        }
    }
    return false;
}
/** \brief True if \p v has exactly one successor that is neither a special
 * vertex nor \p v itself. */
static
bool outIsIrreducible(NFAVertex &v, const NGHolder &g) {
    unsigned ordinary_succs = 0;
    for (auto next : adjacent_vertices_range(v, g)) {
        if (next == v || is_special(next, g)) {
            continue; // self-loops and specials are not counted
        }
        ++ordinary_succs;
    }
    return ordinary_succs == 1;
}
/** \brief True if every successor of the anchored start also has an edge
 * from startDs. */
static
bool isUnanchored(const NGHolder &g) {
    for (auto succ : adjacent_vertices_range(g.start, g)) {
        const bool also_floating = edge(g.startDs, succ, g).second;
        if (also_floating) {
            continue;
        }
        DEBUG_PRINTF("fail, %u is anchored vertex\n", g[succ].index);
        return false;
    }
    return true;
}
static void step(const NGHolder &g, const vector<StateInfo> &info, const dynamic_bitset<> &in, dynamic_bitset<> *out) { out->reset(); for (size_t i = in.find_first(); i != in.npos; i = in.find_next(i)) { NFAVertex u = info[i].vertex; for (auto v : adjacent_vertices_range(u, g)) { out->set(g[v].index); } } }
/**
 * \brief Loose hash of an NGHolder; equal if is_equal would return true.
 *
 * Folds in, for each vertex in iteration order, its index, its character
 * reach and the indices of its successors.
 */
u64a hash_holder(const NGHolder &g) {
    size_t seed = 0;

    for (auto v : vertices_range(g)) {
        boost::hash_combine(seed, g[v].index);
        boost::hash_combine(seed, g[v].char_reach);

        for (auto succ : adjacent_vertices_range(v, g)) {
            boost::hash_combine(seed, g[succ].index);
        }
    }

    return seed;
}
/**
 * \brief Add prefix literals to the engine graph.
 *
 * Each entry in \p triggers becomes a chain of new vertices, one per
 * CharReach. The index of the last vertex of each chain is inserted into
 * \p tailId. Every chain tail is then wired to each existing successor of
 * start other than startDs, start's out-edges are replaced by a self-loop
 * and edges to the chain heads.
 *
 * Returns false as soon as an empty literal is met; chains built for earlier
 * literals are left in the graph in that case.
 */
static
bool addPrefixLiterals(NGHolder &h, unordered_set<u32> &tailId,
                       const vector<vector<CharReach>> &triggers) {
    DEBUG_PRINTF("add literals to graph\n");

    NFAVertex start = h.start;
    vector<NFAVertex> heads;
    vector<NFAVertex> tails;

    for (const auto &lit : triggers) {
        if (lit.empty()) {
            return false;
        }

        NFAVertex last = start;
        bool at_head = true;
        for (const auto &c : lit) {
            DEBUG_PRINTF("lit:%s \n", c.to_string().c_str());
            NFAVertex u = add_vertex(h);
            h[u].char_reach = c;
            if (at_head) {
                /* the head is connected to start only once all of start's
                 * old out-edges have been cleared, below */
                heads.push_back(u);
                at_head = false;
            } else {
                add_edge(last, u, h);
            }
            last = u;
        }

        tails.push_back(last);
        tailId.insert(h[last].index);
    }

    /* whatever used to follow start now follows every literal tail */
    for (auto v : adjacent_vertices_range(start, h)) {
        if (v == h.startDs) {
            continue;
        }
        for (auto &t : tails) {
            add_edge(t, v, h);
        }
    }

    clear_out_edges(start, h);
    add_edge(h.start, h.start, h);
    for (auto &head : heads) {
        add_edge(start, head, h);
    }

    DEBUG_PRINTF("literals addition done\n");
    return true;
}
// populate VertexInfo table static ptr_vector<VertexInfo> getVertexInfos(const NGHolder &g) { const size_t num_verts = num_vertices(g); ptr_vector<VertexInfo> infos; infos.reserve(num_verts * 2); vector<VertexInfo *> vertex_map; // indexed by vertex_index property vertex_map.resize(num_verts); for (auto v : vertices_range(g)) { VertexInfo *vi = new VertexInfo(v, g); // insert our new shiny VertexInfo into the info map infos.push_back(vi); vertex_map[g[v].index] = vi; } // now, go through each vertex and populate its predecessor and successor lists for (VertexInfo &cur_vi : infos) { // find predecessors for (const auto &e : in_edges_range(cur_vi.v, g)) { NFAVertex u = source(e, g); VertexInfo *vmi = vertex_map[g[u].index]; cur_vi.pred_cr |= vmi->cr; cur_vi.pred.insert(vmi); // also set up edge tops if (is_triggered(g) && u == g.start) { cur_vi.edge_top = g[e].top; } } // find successors for (auto w : adjacent_vertices_range(cur_vi.v, g)) { VertexInfo *vmi = vertex_map[g[w].index]; cur_vi.succ_cr |= vmi->cr; cur_vi.succ.insert(vmi); } assert(!hasEdgeAsserts(cur_vi.v, g)); } return infos; }
/** \brief Relax forbidden UTF-8 sequences.
 *
 * Certain byte sequences never occur in valid UTF-8, either because they
 * encode code points above \\x{10ffff} or because they are overlong
 * encodings. Valid UTF-8 input is required, so behaviour on such sequences is
 * undefined and we are free to accept them when that simplifies the graph.
 *
 * Does nothing unless \p expr is a UTF-8 expression. Otherwise, for each
 * vertex whose reach is exactly 0xe0, 0xf0 or 0xf4, allowIllegal() is applied
 * to each of its successors.
 */
void relaxForbiddenUtf8(NGHolder &g, const ExpressionInfo &expr) {
    if (!expr.utf8) {
        return;
    }

    const CharReach e0(0xe0);
    const CharReach f0(0xf0);
    const CharReach f4(0xf4);

    for (auto v : vertices_range(g)) {
        const CharReach &cr = g[v].char_reach;
        const bool is_lead = (cr == e0) || (cr == f0) || (cr == f4);
        if (!is_lead) {
            continue;
        }

        u8 pred_char = cr.find_first();
        for (auto succ : adjacent_vertices_range(v, g)) {
            allowIllegal(g, succ, pred_char);
        }
    }
}
/**
 * True if the vertex (a) has a self-loop, (b) has out-edges only to itself
 * and to accept, and (c) carries only simple exhaustible reports.
 */
static
bool hasOnlySelfLoopAndExhaustibleAccepts(const NGHolder &g,
                                          const ReportManager &rm,
                                          NFAVertex v) {
    const bool self_loop = edge(v, v, g).second;
    if (!self_loop) {
        return false;
    }

    for (auto succ : adjacent_vertices_range(v, g)) {
        const bool allowed = (succ == v) || (succ == g.accept);
        if (!allowed) {
            return false;
        }
    }

    for (const auto &id : g[v].reports) {
        if (isSimpleExhaustible(rm.getReport(id))) {
            continue;
        }
        return false;
    }

    return true;
}
/**
 * \brief Remove \p v from the flow of the graph by connecting each of its
 * predecessors directly to each of its successors, then clearing its edges.
 *
 * \p all_edges is used to decide whether an edge already exists and is
 * updated with every edge added here.
 */
static
void contractVertex(NGHolder &g, NFAVertex v,
                    ue2::unordered_set<pair<NFAVertex, NFAVertex>> &all_edges) {
    for (auto pred : inv_adjacent_vertices_range(v, g)) {
        if (pred == v) {
            continue; // self-edge
        }

        for (auto succ : adjacent_vertices_range(v, g)) {
            if (succ == v) {
                continue; // self-edge
            }

            // Only add (pred, succ) if it is new. Membership is tested in
            // all_edges rather than in the graph, because an existence check
            // in the graph is expensive for vertices of large degree.
            const bool is_new = all_edges.emplace(pred, succ).second;
            if (is_new) {
                add_edge(pred, succ, g);
            }
        }
    }

    // Edges to/from v are deliberately left behind in all_edges.
    clear_vertex(v, g);
}
/**
 * \brief Build the squash masks for the graph.
 *
 * Returns a map from squashing vertex to its mask. Only vertices with a
 * self-loop that are not init states are considered here. A bit cleared in a
 * mask denotes a state that the vertex squashes; masks with no cleared bits
 * are not recorded.
 */
map<NFAVertex, NFAStateSet> findSquashers(const NGHolder &g, som_type som) {
    map<NFAVertex, NFAStateSet> squash;

    // Width of every mask. For triggered graphs the tops have already been
    // assigned, so they need no extra bits.
    const u32 numStates = num_vertices(g);

    // Post-dominator tree.
    PostDomTree pdom_tree;
    buildPDomTree(g, pdom_tree);

    // Vertices by state ID, plus the set of init states.
    vector<NFAVertex> vByIndex(numStates, NFAGraph::null_vertex());
    NFAStateSet initStates(numStates);

    smgb_cache cache(g);

    // Only filled in for SOM mode; left empty otherwise.
    unordered_map<NFAVertex, u32> region_map;
    vector<DepthMinMax> som_depths;
    if (som) {
        region_map = assignRegions(g);
        som_depths = getDistancesFromSOM(g);
    }

    for (auto v : vertices_range(g)) {
        const u32 vert_id = g[v].index;
        DEBUG_PRINTF("vertex %u/%u\n", vert_id, numStates);
        assert(vert_id < numStates);
        vByIndex[vert_id] = v;

        // Starts and vertices with no in-edges count as init states.
        if (is_any_start(v, g) || !in_degree(v, g)) {
            initStates.set(vert_id);
        }
    }

    for (u32 i = 0; i < numStates; i++) {
        NFAVertex v = vByIndex[i];
        assert(v != NFAGraph::null_vertex());
        const CharReach &cr = g[v].char_reach;

        /* squashers must be cyclic and must not be init states */
        if (!hasSelfLoop(v, g) || initStates.test(i)) {
            continue;
        }

        DEBUG_PRINTF("state %u is cyclic\n", i);

        NFAStateSet mask(numStates);
        NFAStateSet succ(numStates);
        NFAStateSet pred(numStates);
        buildSquashMask(mask, g, v, cr, initStates, vByIndex, pdom_tree, som,
                        som_depths, region_map, cache);
        buildSucc(succ, g, v);
        buildPred(pred, g, v);
        const auto &reports = g[v].reports;

        /* successors whose predecessor set equals ours contribute their own
         * mask */
        for (size_t j = succ.find_first(); j != succ.npos;
             j = succ.find_next(j)) {
            NFAVertex vj = vByIndex[j];
            NFAStateSet pred2(numStates);
            buildPred(pred2, g, vj);
            if (pred2 != pred) {
                continue;
            }

            DEBUG_PRINTF("adding the sm from %zu to %u's sm\n", j, i);
            NFAStateSet tmp(numStates);
            buildSquashMask(tmp, g, vj, cr, initStates, vByIndex, pdom_tree,
                            som, som_depths, region_map, cache);
            mask &= tmp;
        }

        for (size_t j = pred.find_first(); j != pred.npos;
             j = pred.find_next(j)) {
            NFAVertex vj = vByIndex[j];
            NFAStateSet succ2(numStates);
            buildSucc(succ2, g, vj);

            /* j is only usable as a basis for squashing when its succs are a
             * subset of ours */
            if ((succ2 & ~succ).any()) {
                continue;
            }

            if (som) {
                /* j must not be able to hold an earlier start of match
                 * offset than v, i.e. we need
                 *     maxSomDist(j) <= minSomDist(v)
                 */
                /* ** TODO ** */
                const depth &max_som_dist_j = som_depths[g[vj].index].max;
                const depth &min_som_dist_v = som_depths[g[v].index].min;
                if (max_som_dist_j > min_som_dist_v ||
                    max_som_dist_j.is_infinite()) {
                    /* j may be storing an earlier SOM; skip it */
                    continue;
                }
            }

            const CharReach &crv = g[vj].char_reach;

            /* j's report information must also be a subset of ours: every
             * special successor of j must be a successor of v too */
            bool seen_special = false;
            bool specials_shared = true;
            for (auto w : adjacent_vertices_range(vj, g)) {
                if (!is_special(w, g)) {
                    continue;
                }
                if (!edge(v, w, g).second) {
                    specials_shared = false;
                    break;
                }
                seen_special = true;
            }
            if (!specials_shared) {
                continue;
            }

            // FIXME: should be subset check?
            if (seen_special && g[vj].reports != reports) {
                continue;
            }

            /* j is usable, provided its reach lies within ours */
            if ((crv & ~cr).none()) {
                NFAStateSet tmp(numStates);
                buildSquashMask(tmp, g, vj, cr, initStates, vByIndex, pdom_tree,
                                som, som_depths, region_map, cache);
                mask &= tmp;
                mask.reset(j);
            }
        }

        mask.set(i); /* never clear ourselves */

        if ((~mask).any()) { // i.e. at least one bit is unset in mask
            DEBUG_PRINTF("%u squashes %zu other states\n", i, (~mask).count());
            squash.emplace(v, mask);
        }
    }

    findDerivedSquashers(g, vByIndex, pdom_tree, initStates, &squash, som,
                         som_depths, region_map, cache);

    clearMutualSquashers(g, vByIndex, squash);

    return squash;
}
static bool expandCyclic(NGHolder &h, NFAVertex v) { DEBUG_PRINTF("inspecting %zu\n", h[v].index); bool changes = false; auto v_preds = preds(v, h); auto v_succs = succs(v, h); set<NFAVertex> start_siblings; set<NFAVertex> end_siblings; CharReach &v_cr = h[v].char_reach; /* We need to find start vertices which have all of our preds. * As we have a self loop, it must be one of our succs. */ for (auto a : adjacent_vertices_range(v, h)) { auto a_preds = preds(a, h); if (a_preds == v_preds && isutf8start(h[a].char_reach)) { DEBUG_PRINTF("%zu is a start v\n", h[a].index); start_siblings.insert(a); } } /* We also need to find full cont vertices which have all our own succs; * As we have a self loop, it must be one of our preds. */ for (auto a : inv_adjacent_vertices_range(v, h)) { auto a_succs = succs(a, h); if (a_succs == v_succs && h[a].char_reach == UTF_CONT_CR) { DEBUG_PRINTF("%zu is a full tail cont\n", h[a].index); end_siblings.insert(a); } } for (auto s : start_siblings) { if (out_degree(s, h) != 1) { continue; } const CharReach &cr = h[s].char_reach; if (cr.isSubsetOf(UTF_TWO_START_CR)) { if (end_siblings.find(*adjacent_vertices(s, h).first) == end_siblings.end()) { DEBUG_PRINTF("%zu is odd\n", h[s].index); continue; } } else if (cr.isSubsetOf(UTF_THREE_START_CR)) { NFAVertex m = *adjacent_vertices(s, h).first; if (h[m].char_reach != UTF_CONT_CR || out_degree(m, h) != 1) { continue; } if (end_siblings.find(*adjacent_vertices(m, h).first) == end_siblings.end()) { DEBUG_PRINTF("%zu is odd\n", h[s].index); continue; } } else if (cr.isSubsetOf(UTF_FOUR_START_CR)) { NFAVertex m1 = *adjacent_vertices(s, h).first; if (h[m1].char_reach != UTF_CONT_CR || out_degree(m1, h) != 1) { continue; } NFAVertex m2 = *adjacent_vertices(m1, h).first; if (h[m2].char_reach != UTF_CONT_CR || out_degree(m2, h) != 1) { continue; } if (end_siblings.find(*adjacent_vertices(m2, h).first) == end_siblings.end()) { DEBUG_PRINTF("%zu is odd\n", h[s].index); continue; } } else { DEBUG_PRINTF("%zu 
is bad\n", h[s].index); continue; } v_cr |= cr; clear_vertex(s, h); changes = true; } if (changes) { v_cr |= UTF_CONT_CR; /* we need to add in cont reach */ v_cr.set(0xc0); /* we can also add in the forbidden bytes as we require * valid unicode data */ v_cr.set(0xc1); v_cr |= CharReach(0xf5, 0xff); } return changes; }
/** * Builds a squash mask based on the pdom tree of v and the given char reach. * The built squash mask is a bit conservative for non-dot cases and could * be improved with a bit of thought. */ static void buildSquashMask(NFAStateSet &mask, const NGHolder &g, NFAVertex v, const CharReach &cr, const NFAStateSet &init, const vector<NFAVertex> &vByIndex, const PostDomTree &tree, som_type som, const vector<DepthMinMax> &som_depths, const ue2::unordered_map<NFAVertex, u32> ®ion_map, smgb_cache &cache) { DEBUG_PRINTF("build base squash mask for vertex %u)\n", g[v].index); vector<NFAVertex> q; PostDomTree::const_iterator it = tree.find(v); if (it != tree.end()) { q.insert(q.end(), it->second.begin(), it->second.end()); } const u32 v_index = g[v].index; while (!q.empty()) { NFAVertex u = q.back(); q.pop_back(); const CharReach &cru = g[u].char_reach; if ((cru & ~cr).any()) { /* bail: bad cr on vertex u */ /* TODO: this could be better * * we still need to ensure that we record any paths leading to u. * Hence all vertices R which can reach u must be excluded from the * squash mask. Note: R != pdom(u) and there may exist an x in (R - * pdom(u)) which is in pdom(y) where y is in q. Clear ? */ mask.set(); return; } const u32 u_index = g[u].index; if (som) { /* We cannot add a state u to the squash mask of v if it may have an * earlier start of match offset. 
ie for us to add a state u to v * maxSomDist(u) <= minSomDist(v) */ const depth &max_som_dist_u = som_depths[u_index].max; const depth &min_som_dist_v = som_depths[v_index].min; if (max_som_dist_u.is_infinite()) { /* it is hard to tell due to the INF if u can actually store an * earlier SOM than w (state we are building the squash mask * for) - need to think more deeply */ if (mustBeSetBefore(u, v, g, cache) && !somMayGoBackwards(u, g, region_map, cache)) { DEBUG_PRINTF("u %u v %u\n", u_index, v_index); goto squash_ok; } } if (max_som_dist_u > min_som_dist_v) { /* u can't be squashed as it may be storing an earlier SOM */ goto add_children_to_queue; } } squash_ok: mask.set(u_index); DEBUG_PRINTF("pdom'ed %u\n", u_index); add_children_to_queue: it = tree.find(u); if (it != tree.end()) { q.insert(q.end(), it->second.begin(), it->second.end()); } } if (cr.all()) { /* the init states aren't in the pdom tree. If all their succ states * are set (or v), we can consider them post dominated */ /* Note: init states will always result in a later som */ for (size_t i = init.find_first(); i != init.npos; i = init.find_next(i)) { /* Yes vacuous patterns do exist */ NFAVertex iv = vByIndex[i]; for (auto w : adjacent_vertices_range(iv, g)) { if (w == g.accept || w == g.acceptEod) { DEBUG_PRINTF("skipping %zu due to vacuous accept\n", i); goto next_init_state; } u32 vert_id = g[w].index; if (w != iv && w != v && !mask.test(vert_id)) { DEBUG_PRINTF("skipping %zu due to %u\n", i, vert_id); goto next_init_state; } } DEBUG_PRINTF("pdom'ed %zu\n", i); mask.set(i); next_init_state:; } } mask.flip(); }
/** \brief Ensure the anchored start has an edge to every successor of \p u;
 * edges that already exist are not duplicated. */
static
void wireSuccessorsToStart(NGHolder &g, NFAVertex u) {
    for (auto succ : adjacent_vertices_range(u, g)) {
        add_edge_if_not_present(g.start, succ, g);
    }
}
bool somMayGoBackwards(NFAVertex u, const NGHolder &g, const ue2::unordered_map<NFAVertex, u32> ®ion_map, smgb_cache &cache) { /* Need to ensure all matches of the graph g up to u contain no infixes * which are also matches of the graph to u. * * This is basically the same as firstMatchIsFirst except we g is not * always a dag. As we haven't gotten around to writing an execute_graph * that operates on general graphs, we take some (hopefully) conservative * short cuts. * * Note: if the u can be jumped we will take jump edges * into account as a possibility of som going backwards * * TODO: write a generalised ng_execute_graph/make this less hacky */ assert(&g == &cache.g); if (contains(cache.smgb, u)) { return cache.smgb[u]; } DEBUG_PRINTF("checking if som can go backwards on %u\n", g[u].index); set<NFAEdge> be; BackEdges<set<NFAEdge>> backEdgeVisitor(be); depth_first_search( g.g, visitor(backEdgeVisitor) .root_vertex(g.start) .vertex_index_map(get(&NFAGraphVertexProps::index, g.g))); bool rv; if (0) { exit: DEBUG_PRINTF("using cached result\n"); cache.smgb[u] = rv; return rv; } assert(contains(region_map, u)); const u32 u_region = region_map.at(u); for (const auto &e : be) { NFAVertex s = source(e, g); NFAVertex t = target(e, g); /* only need to worry about big cycles including/before u */ DEBUG_PRINTF("back edge %u %u\n", g[s].index, g[t].index); if (s != t && region_map.at(s) <= u_region) { DEBUG_PRINTF("eek big cycle\n"); rv = true; /* big cycle -> eek */ goto exit; } } ue2::unordered_map<NFAVertex, NFAVertex> orig_to_copy; NGHolder c_g; cloneHolder(c_g, g, &orig_to_copy); for (NFAVertex v : vertices_range(g)) { if (!is_virtual_start(v, g)) { continue; } NFAVertex c_v = orig_to_copy[v]; orig_to_copy[v] = c_g.startDs; for (NFAVertex c_w : adjacent_vertices_range(c_v, c_g)) { add_edge_if_not_present(c_g.startDs, c_w, c_g); } clear_vertex(c_v, c_g); } NFAVertex c_u = orig_to_copy[u]; clear_in_edges(c_g.acceptEod, c_g); add_edge(c_g.accept, c_g.acceptEod, c_g); 
clear_in_edges(c_g.accept, c_g); clear_out_edges(c_u, c_g); if (hasSelfLoop(u, g)) { add_edge(c_u, c_u, c_g); } add_edge(c_u, c_g.accept, c_g); set<NFAVertex> u_succ; insert(&u_succ, adjacent_vertices(u, g)); u_succ.erase(u); for (auto t : inv_adjacent_vertices_range(u, g)) { if (t == u) { continue; } for (auto v : adjacent_vertices_range(t, g)) { if (contains(u_succ, v)) { add_edge(orig_to_copy[t], c_g.accept, c_g); break; } } } pruneUseless(c_g); be.clear(); depth_first_search(c_g.g, visitor(backEdgeVisitor).root_vertex(c_g.start). vertex_index_map(get(&NFAGraphVertexProps::index, c_g.g))); for (const auto &e : be) { NFAVertex s = source(e, c_g); NFAVertex t = target(e, c_g); DEBUG_PRINTF("back edge %u %u\n", c_g[s].index, c_g[t].index); if (s != t) { assert(0); DEBUG_PRINTF("eek big cycle\n"); rv = true; /* big cycle -> eek */ goto exit; } } DEBUG_PRINTF("checking acyclic+selfloop graph\n"); rv = !firstMatchIsFirst(c_g); DEBUG_PRINTF("som may regress? %d\n", (int)rv); goto exit; }
/** If the pattern is unanchored, has a max_offset and has not asked for SOM, * we can use that knowledge to anchor it which will limit its lifespan. Note * that we can't use this transformation if there's a min_length, as it's * currently handled using "sly SOM". * * Note that it is possible to handle graphs that have a combination of * anchored and unanchored paths, but it's too tricky for the moment. */ static bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth, const depth &maxWidth) { assert(!g.som); assert(g.max_offset != MAX_OFFSET); assert(minWidth <= maxWidth); assert(maxWidth.is_reachable()); DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n", minWidth.str().c_str(), maxWidth.str().c_str(), g.min_offset, g.max_offset); if (g.max_offset > MAX_MAXOFFSET_TO_ANCHOR) { return false; } if (g.max_offset < minWidth) { assert(0); return false; } // If the pattern has virtual starts, we probably don't want to touch it. if (hasVirtualStarts(g)) { DEBUG_PRINTF("virtual starts, bailing\n"); return false; } // Similarly, bail if the pattern is vacuous. TODO: this could be done, we // would just need to be a little careful with reports. if (isVacuous(g)) { DEBUG_PRINTF("vacuous, bailing\n"); return false; } u32 min_bound, max_bound; if (maxWidth.is_infinite()) { min_bound = 0; max_bound = g.max_offset - minWidth; } else { min_bound = g.min_offset > maxWidth ? g.min_offset - maxWidth : 0; max_bound = g.max_offset - minWidth; } DEBUG_PRINTF("prepending ^.{%u,%u}\n", min_bound, max_bound); vector<NFAVertex> initials; for (auto v : adjacent_vertices_range(g.startDs, g)) { if (v == g.startDs) { continue; } initials.push_back(v); } if (initials.empty()) { DEBUG_PRINTF("no initial vertices\n"); return false; } // Wire up 'min_offset' mandatory dots from anchored start. 
NFAVertex u = g.start; for (u32 i = 0; i < min_bound; i++) { NFAVertex v = add_vertex(g); g[v].char_reach.setall(); add_edge(u, v, g); u = v; } NFAVertex head = u; // Wire up optional dots for (max_offset - min_offset). for (u32 i = 0; i < max_bound - min_bound; i++) { NFAVertex v = add_vertex(g); g[v].char_reach.setall(); if (head != u) { add_edge(head, v, g); } add_edge(u, v, g); u = v; } // Remove edges from starts and wire both head and u to our initials. for (auto v : initials) { remove_edge(g.startDs, v, g); remove_edge(g.start, v, g); if (head != u) { add_edge(head, v, g); } add_edge(u, v, g); } g.renumberVertices(); g.renumberEdges(); return true; }