/**
 * \brief Consider pattern \p w for inclusion in the SmallWrite engine.
 *
 * Attempts to build a McClellan DFA for the pattern and merge it into the
 * accumulated SmallWrite DFA (\p rdfa). If the pattern cannot be supported
 * (SOM, min_length, vacuousness) or determinisation/merging fails, the whole
 * SmallWrite build is poisoned and no engine will be produced.
 */
void SmallWriteBuildImpl::add(const NGWrapper &w) {
    // If the graph is poisoned (i.e. we can't build a SmallWrite version),
    // we don't even try.
    if (poisoned) {
        return;
    }

    if (w.som || w.min_length || isVacuous(w)) { /* cannot support in smwr */
        poisoned = true;
        return;
    }

    DEBUG_PRINTF("w=%p\n", &w);

    // make a copy of the graph so that we can modify it for our purposes
    unique_ptr<NGHolder> h = cloneHolder(w);

    reduceGraph(*h, SOM_NONE, w.utf8, cc);

    // If the earliest match location is outside the small write region,
    // then we don't need to build a SmallWrite version.
    // However, we don't poison this case either, since it is simply a case,
    // where we know the resulting graph won't match.
    if (findMinWidth(*h) > depth(cc.grey.smallWriteLargestBuffer)) {
        return;
    }

    // Now we can actually build the McClellan DFA
    assert(h->kind == NFA_OUTFIX);
    auto r = buildMcClellan(*h, &rm, cc.grey);

    // If we couldn't build a McClellan DFA for this portion, we won't be able
    // build a smwr which represents the pattern set
    if (!r) {
        DEBUG_PRINTF("failed to determinise\n");
        poisoned = true;
        return;
    }

    // Trim away portions of the DFA that lie beyond the small write buffer
    // limit before attempting any merge.
    prune_overlong(*r, cc.grey.smallWriteLargestBuffer);

    if (rdfa) {
        // do a merge of the new dfa with the existing dfa
        auto merged = mergeTwoDfas(rdfa.get(), r.get(), DFA_MERGE_MAX_STATES,
                                   &rm, cc.grey);
        if (!merged) {
            // Merge failure (e.g. state count blowout) means we cannot
            // represent the whole pattern set: poison the build.
            DEBUG_PRINTF("merge failed\n");
            poisoned = true;
            return;
        }
        DEBUG_PRINTF("merge succeeded, built %p\n", merged.get());
        rdfa = move(merged);
    } else {
        // First pattern: adopt its DFA directly.
        rdfa = move(r);
    }
}
/** * True if the graphs have mergeable starts. * * Nowadays, this means that any vacuous edges must have the same tops. In * addition, mixed-accept cases need to have matching reports. */ static bool mergeableStarts(const NGHolder &h1, const NGHolder &h2) { if (!isVacuous(h1) || !isVacuous(h2)) { return true; } // Vacuous edges from startDs should not occur: we have better ways to // implement true dot-star relationships. Just in case they do, ban them // from being merged unless they have identical reports. if (is_match_vertex(h1.startDs, h1) || is_match_vertex(h2.startDs, h2)) { assert(0); return false; } // If both graphs have edge (start, accept), the tops must match. auto e1_accept = edge(h1.start, h1.accept, h1); auto e2_accept = edge(h2.start, h2.accept, h2); if (e1_accept.second && e2_accept.second && h1[e1_accept.first].top != h2[e2_accept.first].top) { return false; } // If both graphs have edge (start, acceptEod), the tops must match. auto e1_eod = edge(h1.start, h1.acceptEod, h1); auto e2_eod = edge(h2.start, h2.acceptEod, h2); if (e1_eod.second && e2_eod.second && h1[e1_eod.first].top != h2[e2_eod.first].top) { return false; } // If one graph has an edge to accept and the other has an edge to // acceptEod, the reports must match for the merge to be safe. if ((e1_accept.second && e2_eod.second) || (e2_accept.second && e1_eod.second)) { if (h1[h1.start].reports != h2[h2.start].reports) { return false; } } return true; }
/** If the pattern is unanchored, has a max_offset and has not asked for SOM,
 * we can use that knowledge to anchor it which will limit its lifespan. Note
 * that we can't use this transformation if there's a min_length, as it's
 * currently handled using "sly SOM".
 *
 * Note that it is possible to handle graphs that have a combination of
 * anchored and unanchored paths, but it's too tricky for the moment.
 *
 * Returns true if the graph was modified (anchored), false if it was left
 * untouched.
 */
static bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth,
                                           const depth &maxWidth) {
    assert(!g.som);
    assert(g.max_offset != MAX_OFFSET);
    assert(minWidth <= maxWidth);
    assert(maxWidth.is_reachable());

    DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n",
                 minWidth.str().c_str(), maxWidth.str().c_str(), g.min_offset,
                 g.max_offset);

    // Bail on very large max offsets: the prepended bounded repeat would be
    // too big to be worthwhile.
    if (g.max_offset > MAX_MAXOFFSET_TO_ANCHOR) {
        return false;
    }

    // A max_offset below the pattern's own minimum width should be
    // impossible; treat it as a bug but fail safe.
    if (g.max_offset < minWidth) {
        assert(0);
        return false;
    }

    // If the pattern has virtual starts, we probably don't want to touch it.
    if (hasVirtualStarts(g)) {
        DEBUG_PRINTF("virtual starts, bailing\n");
        return false;
    }

    // Similarly, bail if the pattern is vacuous. TODO: this could be done, we
    // would just need to be a little careful with reports.
    if (isVacuous(g)) {
        DEBUG_PRINTF("vacuous, bailing\n");
        return false;
    }

    // Compute the bounds {min_bound,max_bound} of the anchored dot repeat
    // ^.{min_bound,max_bound} we will prepend to the graph.
    u32 min_bound, max_bound;
    if (maxWidth.is_infinite()) {
        min_bound = 0;
        max_bound = g.max_offset - minWidth;
    } else {
        min_bound = g.min_offset > maxWidth ? g.min_offset - maxWidth : 0;
        max_bound = g.max_offset - minWidth;
    }

    DEBUG_PRINTF("prepending ^.{%u,%u}\n", min_bound, max_bound);

    // Collect the successors of startDs (other than startDs itself): the
    // pattern's initial vertices, which will be rewired below.
    vector<NFAVertex> initials;
    for (auto v : adjacent_vertices_range(g.startDs, g)) {
        if (v == g.startDs) {
            continue;
        }
        initials.push_back(v);
    }
    if (initials.empty()) {
        DEBUG_PRINTF("no initial vertices\n");
        return false;
    }

    // Wire up 'min_offset' mandatory dots from anchored start.
    NFAVertex u = g.start;
    for (u32 i = 0; i < min_bound; i++) {
        NFAVertex v = add_vertex(g);
        g[v].char_reach.setall();
        add_edge(u, v, g);
        u = v;
    }

    // 'head' marks the end of the mandatory chain; each optional dot also
    // gets an edge from here so the repeat can be exited early.
    NFAVertex head = u;

    // Wire up optional dots for (max_offset - min_offset).
    for (u32 i = 0; i < max_bound - min_bound; i++) {
        NFAVertex v = add_vertex(g);
        g[v].char_reach.setall();
        if (head != u) {
            add_edge(head, v, g);
        }
        add_edge(u, v, g);
        u = v;
    }

    // Remove edges from starts and wire both head and u to our initials.
    for (auto v : initials) {
        remove_edge(g.startDs, v, g);
        remove_edge(g.start, v, g);
        if (head != u) {
            add_edge(head, v, g);
        }
        add_edge(u, v, g);
    }

    // Vertices/edges were added and removed: renumber to keep indices dense.
    g.renumberVertices();
    g.renumberEdges();

    return true;
}