template <class LbrStruct> static void fillNfa(NFA *nfa, lbr_common *c, ReportID report, const depth &repeatMin, const depth &repeatMax, u32 minPeriod, enum RepeatType rtype) { assert(nfa); RepeatStateInfo rsi(rtype, repeatMin, repeatMax, minPeriod); DEBUG_PRINTF("selected %s model for {%s,%s} repeat\n", repeatTypeName(rtype), repeatMin.str().c_str(), repeatMax.str().c_str()); // Fill the lbr_common structure first. Note that the RepeatInfo structure // directly follows the LbrStruct. const u32 info_offset = sizeof(LbrStruct); c->repeatInfoOffset = info_offset; c->report = report; RepeatInfo *info = (RepeatInfo *)((char *)c + info_offset); info->type = verify_u8(rtype); info->repeatMin = depth_to_u32(repeatMin); info->repeatMax = depth_to_u32(repeatMax); info->stateSize = rsi.stateSize; info->packedCtrlSize = rsi.packedCtrlSize; info->horizon = rsi.horizon; info->minPeriod = minPeriod; copy_bytes(&info->packedFieldSizes, rsi.packedFieldSizes); info->patchCount = rsi.patchCount; info->patchSize = rsi.patchSize; info->encodingSize = rsi.encodingSize; info->patchesOffset = rsi.patchesOffset; // Fill the NFA structure. nfa->nPositions = repeatMin; nfa->streamStateSize = verify_u32(rsi.packedCtrlSize + rsi.stateSize); nfa->scratchStateSize = (u32)sizeof(lbr_state); nfa->minWidth = verify_u32(repeatMin); nfa->maxWidth = repeatMax.is_finite() ? verify_u32(repeatMax) : 0; // Fill the lbr table for sparse lbr model. if (rtype == REPEAT_SPARSE_OPTIMAL_P) { u64a *table = getTable<LbrStruct>(nfa); // Adjust table length according to the optimal patch length. size_t len = nfa->length; assert((u32)repeatMax >= rsi.patchSize); len -= sizeof(u64a) * ((u32)repeatMax - rsi.patchSize); nfa->length = verify_u32(len); info->length = verify_u32(sizeof(RepeatInfo) + sizeof(u64a) * (rsi.patchSize + 1)); copy_bytes(table, rsi.table); } }
/**
 * Build an LBR engine for the character class \p cr repeated between
 * \p repeatMin and \p repeatMax times.
 *
 * The specialised builders are tried in a fixed order (Dot, Verm, NVerm,
 * Shuf, Truf) and the first one that yields an engine wins. If every builder
 * declines, this asserts in debug builds and returns nullptr.
 */
static
aligned_unique_ptr<NFA> constructLBR(const CharReach &cr,
                                     const depth &repeatMin,
                                     const depth &repeatMax, u32 minPeriod,
                                     bool is_reset, ReportID report) {
    DEBUG_PRINTF("bounds={%s,%s}, cr=%s (count %zu), report=%u\n",
                 repeatMin.str().c_str(), repeatMax.str().c_str(),
                 describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count(),
                 report);
    assert(repeatMin <= repeatMax);
    assert(repeatMax.is_reachable());

    aligned_unique_ptr<NFA> engine =
        buildLbrDot(cr, repeatMin, repeatMax, minPeriod, is_reset, report);
    if (engine) {
        return engine;
    }

    engine =
        buildLbrVerm(cr, repeatMin, repeatMax, minPeriod, is_reset, report);
    if (engine) {
        return engine;
    }

    engine =
        buildLbrNVerm(cr, repeatMin, repeatMax, minPeriod, is_reset, report);
    if (engine) {
        return engine;
    }

    engine =
        buildLbrShuf(cr, repeatMin, repeatMax, minPeriod, is_reset, report);
    if (engine) {
        return engine;
    }

    engine =
        buildLbrTruf(cr, repeatMin, repeatMax, minPeriod, is_reset, report);
    if (engine) {
        return engine;
    }

    // No builder accepted this repeat.
    assert(0);
    return nullptr;
}
/**
 * Compute the per-character stop table for a castle.
 *
 * The result has N_CHARS entries. Every character outside the castle's reach
 * gets a mask with the low d bits set, where d is the castle's maximum width
 * capped at MAX_STOP_DEPTH; characters inside the reach stay zero. The
 * \p som argument is unused.
 */
vector<u8> findLeftOffsetStopAlphabet(const CastleProto &castle,
                                      UNUSED som_type som) {
    const depth max_width = findMaxWidth(castle);
    DEBUG_PRINTF("castle has reach %s and max width %s\n",
                 describeClass(castle.reach()).c_str(),
                 max_width.str().c_str());

    // Stop characters are exactly those the castle cannot match.
    const CharReach stoppers = ~castle.reach();

    u32 bits = min(max_width, depth(MAX_STOP_DEPTH));
    const u8 depth_mask = verify_u8((1U << bits) - 1);

    vector<u8> table(N_CHARS, 0);
    size_t ch = stoppers.find_first();
    while (ch != stoppers.npos) {
        table[ch] |= depth_mask;
        ch = stoppers.find_next(ch);
    }

    return table;
}
/** If the pattern is unanchored, has a max_offset and has not asked for SOM, * we can use that knowledge to anchor it which will limit its lifespan. Note * that we can't use this transformation if there's a min_length, as it's * currently handled using "sly SOM". * * Note that it is possible to handle graphs that have a combination of * anchored and unanchored paths, but it's too tricky for the moment. */ static bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth, const depth &maxWidth) { assert(!g.som); assert(g.max_offset != MAX_OFFSET); assert(minWidth <= maxWidth); assert(maxWidth.is_reachable()); DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n", minWidth.str().c_str(), maxWidth.str().c_str(), g.min_offset, g.max_offset); if (g.max_offset > MAX_MAXOFFSET_TO_ANCHOR) { return false; } if (g.max_offset < minWidth) { assert(0); return false; } // If the pattern has virtual starts, we probably don't want to touch it. if (hasVirtualStarts(g)) { DEBUG_PRINTF("virtual starts, bailing\n"); return false; } // Similarly, bail if the pattern is vacuous. TODO: this could be done, we // would just need to be a little careful with reports. if (isVacuous(g)) { DEBUG_PRINTF("vacuous, bailing\n"); return false; } u32 min_bound, max_bound; if (maxWidth.is_infinite()) { min_bound = 0; max_bound = g.max_offset - minWidth; } else { min_bound = g.min_offset > maxWidth ? g.min_offset - maxWidth : 0; max_bound = g.max_offset - minWidth; } DEBUG_PRINTF("prepending ^.{%u,%u}\n", min_bound, max_bound); vector<NFAVertex> initials; for (auto v : adjacent_vertices_range(g.startDs, g)) { if (v == g.startDs) { continue; } initials.push_back(v); } if (initials.empty()) { DEBUG_PRINTF("no initial vertices\n"); return false; } // Wire up 'min_offset' mandatory dots from anchored start. 
NFAVertex u = g.start; for (u32 i = 0; i < min_bound; i++) { NFAVertex v = add_vertex(g); g[v].char_reach.setall(); add_edge(u, v, g); u = v; } NFAVertex head = u; // Wire up optional dots for (max_offset - min_offset). for (u32 i = 0; i < max_bound - min_bound; i++) { NFAVertex v = add_vertex(g); g[v].char_reach.setall(); if (head != u) { add_edge(head, v, g); } add_edge(u, v, g); u = v; } // Remove edges from starts and wire both head and u to our initials. for (auto v : initials) { remove_edge(g.startDs, v, g); remove_edge(g.start, v, g); if (head != u) { add_edge(head, v, g); } add_edge(u, v, g); } g.renumberVertices(); g.renumberEdges(); return true; }