static really_inline void rawStreamExec(struct hs_stream *stream_state, struct hs_scratch *scratch) { assert(stream_state); assert(scratch); char *state = getMultiState(stream_state); u8 broken = getBroken(state); if (unlikely(broken)) { assert(broken == BROKEN_FROM_USER || broken == BROKEN_EXHAUSTED); scratch->core_info.broken = broken; return; } DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", stream_state->offset, scratch->core_info.len); const struct RoseEngine *rose = stream_state->rose; assert(rose); u8 *rose_state = (u8 *)state; roseStreamExec(rose, rose_state, scratch, selectAdaptor(rose), selectSomAdaptor(rose), scratch); if (!told_to_stop_matching(scratch) && isAllExhausted(rose, scratch->core_info.exhaustionVector)) { DEBUG_PRINTF("stream exhausted\n"); scratch->core_info.broken = BROKEN_EXHAUSTED; } }
static never_inline void soleOutfixStreamExec(struct hs_stream *stream_state, struct hs_scratch *scratch) { assert(stream_state); assert(scratch); const struct RoseEngine *t = stream_state->rose; assert(t->outfixEndQueue == 1); assert(!t->amatcherOffset); assert(!t->ematcherOffset); assert(!t->fmatcherOffset); const struct NFA *nfa = getNfaByQueue(t, 0); struct mq *q = scratch->queues; initQueue(q, 0, t, scratch); if (!scratch->core_info.buf_offset) { nfaQueueInitState(nfa, q); pushQueueAt(q, 0, MQE_START, 0); pushQueueAt(q, 1, MQE_TOP, 0); pushQueueAt(q, 2, MQE_END, scratch->core_info.len); } else { nfaExpandState(nfa, q->state, q->streamState, q->offset, queue_prev_byte(q, 0)); pushQueueAt(q, 0, MQE_START, 0); pushQueueAt(q, 1, MQE_END, scratch->core_info.len); } if (nfaQueueExec(q->nfa, q, scratch->core_info.len)) { nfaQueueCompressState(nfa, q, scratch->core_info.len); } else if (!told_to_stop_matching(scratch)) { scratch->core_info.broken = BROKEN_EXHAUSTED; } }
/**
 * Deliver end-of-data matches for a stream that is being closed.
 *
 * Does nothing for a stream whose state is marked broken. Otherwise core info
 * is populated with an empty buffer plus the stream's history, boundary
 * report lists are processed (the zero-offset EOD list when nothing was ever
 * written, else the EOD list), and the engine-specific EOD routine is run
 * when the engine requires an EOD check. Finally, for SOM engines, stored SOM
 * matches are flushed; a halt request from that flush marks scratch as
 * BROKEN_FROM_USER.
 */
static really_inline
void report_eod_matches(hs_stream_t *id, hs_scratch_t *scratch,
                        match_event_handler onEvent, void *context) {
    DEBUG_PRINTF("--- report eod matches at offset %llu\n", id->offset);
    assert(onEvent);

    const struct RoseEngine *engine = id->rose;
    char *stream_state = getMultiState(id);

    if (getBroken(stream_state)) {
        DEBUG_PRINTF("stream is broken, just freeing storage\n");
        return;
    }

    populateCoreInfo(scratch, engine, stream_state, onEvent, context, NULL, 0,
                     getHistory(stream_state, engine, id->offset),
                     getHistoryAmount(engine, id->offset), id->offset, 0);

    if (engine->somLocationCount) {
        loadSomFromStream(scratch, id->offset);
    }

    if (id->offset) {
        if (engine->boundary.reportEodOffset) {
            processReportList(engine, engine->boundary.reportEodOffset,
                              id->offset, scratch);
        }

        if (engine->requiresEodCheck) {
            switch (engine->runtimeImpl) {
            default:
            case ROSE_RUNTIME_PURE_LITERAL:
                assert(0);
                /* fallthrough */
            case ROSE_RUNTIME_FULL_ROSE:
                rawEodExec(id, scratch);
                break;
            case ROSE_RUNTIME_SINGLE_OUTFIX:
                soleOutfixEodExec(id, scratch);
                break;
            }
        }
    } else if (engine->boundary.reportZeroEodOffset) {
        /* Stream closed without any data ever being written. */
        processReportList(engine, engine->boundary.reportZeroEodOffset, 0,
                          scratch);
    }

    if (!engine->hasSom || told_to_stop_matching(scratch)) {
        return;
    }

    if (flushStoredSomMatches(scratch, ~0ULL)) {
        DEBUG_PRINTF("told to stop matching\n");
        scratch->core_info.broken = BROKEN_FROM_USER;
        DEBUG_PRINTF("broken = %hhd\n", scratch->core_info.broken);
    }
}
/**
 * Vectored scan: treat `count` buffers as one logical stream.
 *
 * A temporary stream is built in scratch->bstate, each (data[i], length[i])
 * block is fed through hs_scan_stream_internal(), and finally end-of-data
 * matches are reported when a callback was supplied.
 *
 * Returns HS_INVALID for a missing scratch/data/length argument, a
 * misaligned bytecode pointer or a scratch that fails validScratch();
 * the error from validDatabase() when the database is rejected;
 * HS_DB_MODE_ERROR when the database is not in vectored mode; the first
 * non-success code returned for a block; HS_SCAN_TERMINATED when matching was
 * stopped during EOD reporting; HS_SUCCESS otherwise.
 */
HS_PUBLIC_API
hs_error_t hs_scan_vector(const hs_database_t *db, const char * const * data,
                          const unsigned int *length, unsigned int count,
                          UNUSED unsigned int flags, hs_scratch_t *scratch,
                          match_event_handler onEvent, void *context) {
    if (unlikely(!scratch || !data || !length)) {
        return HS_INVALID;
    }

    hs_error_t db_status = validDatabase(db);
    if (unlikely(db_status != HS_SUCCESS)) {
        return db_status;
    }

    const struct RoseEngine *engine = hs_get_bytecode(db);
    if (unlikely(!ISALIGNED_16(engine))) {
        return HS_INVALID;
    }

    if (unlikely(engine->mode != HS_MODE_VECTORED)) {
        return HS_DB_MODE_ERROR;
    }

    if (unlikely(!validScratch(engine, scratch))) {
        return HS_INVALID;
    }

    /* "Open" a stream that lives inside the scratch block state. */
    hs_stream_t *id = (hs_stream_t *)(scratch->bstate);
    init_stream(id, engine);

    for (u32 blk = 0; blk < count; blk++) {
        DEBUG_PRINTF("block %u/%u offset=%llu len=%u\n", blk, count,
                     id->offset, length[blk]);
#ifdef DEBUG
        dumpData(data[blk], length[blk]);
#endif
        hs_error_t blk_status = hs_scan_stream_internal(
            id, data[blk], length[blk], 0, scratch, onEvent, context);
        if (blk_status != HS_SUCCESS) {
            return blk_status;
        }
    }

    /* "Close" the stream: EOD reporting only makes sense with a callback. */
    if (!onEvent) {
        return HS_SUCCESS;
    }

    report_eod_matches(id, scratch, onEvent, context);

    return told_to_stop_matching(scratch) ? HS_SCAN_TERMINATED : HS_SUCCESS;
}
static really_inline void pureLiteralStreamExec(struct hs_stream *stream_state, struct hs_scratch *scratch) { assert(stream_state); assert(scratch); char *state = getMultiState(stream_state); u8 broken = getBroken(state); if (unlikely(broken)) { assert(broken == BROKEN_FROM_USER || broken == BROKEN_EXHAUSTED); scratch->core_info.broken = broken; return; } const struct RoseEngine *rose = stream_state->rose; const struct HWLM *ftable = getFLiteralMatcher(rose); size_t len2 = scratch->core_info.len; u8 *hwlm_stream_state; if (rose->floatingStreamState) { hwlm_stream_state = getFloatingMatcherState(rose, (u8 *)state); } else { hwlm_stream_state = NULL; } DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", stream_state->offset, scratch->core_info.len); // Pure literal cases don't have floatingMinDistance set, so we always // start the match region at zero. const size_t start = 0; hwlmExecStreaming(ftable, scratch, len2, start, selectHwlmAdaptor(rose), scratch, rose->initialGroups, hwlm_stream_state); if (!told_to_stop_matching(scratch) && isAllExhausted(rose, scratch->core_info.exhaustionVector)) { DEBUG_PRINTF("stream exhausted\n"); scratch->core_info.broken = BROKEN_EXHAUSTED; } }
static really_inline void rawStreamExec(struct hs_stream *stream_state, struct hs_scratch *scratch) { assert(stream_state); assert(scratch); assert(!can_stop_matching(scratch)); DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", stream_state->offset, scratch->core_info.len); const struct RoseEngine *rose = stream_state->rose; assert(rose); roseStreamExec(rose, scratch); if (!told_to_stop_matching(scratch) && isAllExhausted(rose, scratch->core_info.exhaustionVector)) { DEBUG_PRINTF("stream exhausted\n"); scratch->core_info.status |= STATUS_EXHAUSTED; } }
/**
 * Run the engine's end-of-data program for a stream ending at `offset`.
 *
 * Preconditions (asserted): the engine requires an EOD check, matching has
 * not been terminated, and no delayed literal slots are filled once EOD state
 * has been initialised. Returns early without running anything when the
 * stream is already past the engine's maximum bi-anchored width or when the
 * engine has no EOD program.
 */
void roseStreamEodExec(const struct RoseEngine *t, u64a offset,
                       struct hs_scratch *scratch) {
    assert(scratch);
    assert(t->requiresEodCheck);
    DEBUG_PRINTF("ci buf %p/%zu his %p/%zu\n", scratch->core_info.buf,
                 scratch->core_info.len, scratch->core_info.hbuf,
                 scratch->core_info.hlen);

    /* Callers must not reach this point after a termination request. */
    assert(!told_to_stop_matching(scratch));

    if (t->maxBiAnchoredWidth != ROSE_BOUND_INF &&
        offset > t->maxBiAnchoredWidth) {
        DEBUG_PRINTF("bailing, we are beyond max width\n");
        /* Some of the history/state may also be stale by now. */
        return;
    }

    if (!t->eodProgramOffset) {
        DEBUG_PRINTF("no eod program\n");
        return;
    }

    roseStreamInitEod(t, offset, scratch);

    DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset);

    /* No delayed literals may still be pending at this stage. */
    assert(!scratch->tctxt.filledDelayedSlots);

    const u64a eod_som = 0;
    const size_t eod_match_len = 0;
    const u8 prog_flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP;

    /* The program's return value is deliberately dropped: nothing else runs
     * on this scan after the EOD program. */
    roseRunProgram(t, scratch, t->eodProgramOffset, eod_som, offset,
                   eod_match_len, prog_flags);
}
int flushStoredSomMatches_i(struct hs_scratch *scratch, u64a offset) { DEBUG_PRINTF("flush som matches\n"); int halt = 0; assert(!told_to_stop_matching(scratch)); if (scratch->deduper.current_report_offset == ~0ULL) { /* no matches recorded yet; just need to clear the logs */ fatbit_clear(scratch->deduper.som_log[0]); fatbit_clear(scratch->deduper.som_log[1]); scratch->deduper.som_log_dirty = 0; return 0; } /* fire any reports from the logs and clear them */ if (offset == scratch->deduper.current_report_offset + 1) { struct fatbit *done_log = scratch->deduper.som_log[offset % 2]; u64a *done_starts = scratch->deduper.som_start_log[offset % 2]; halt = clearSomLog(scratch, scratch->deduper.current_report_offset - 1, done_log, done_starts); scratch->deduper.som_log_dirty >>= 1; } else {
static really_inline void pureLiteralStreamExec(struct hs_stream *stream_state, struct hs_scratch *scratch) { assert(stream_state); assert(scratch); assert(!can_stop_matching(scratch)); char *state = getMultiState(stream_state); const struct RoseEngine *rose = stream_state->rose; const struct HWLM *ftable = getFLiteralMatcher(rose); size_t len2 = scratch->core_info.len; u8 *hwlm_stream_state; if (rose->floatingStreamState) { hwlm_stream_state = getFloatingMatcherState(rose, state); } else { hwlm_stream_state = NULL; } DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", stream_state->offset, scratch->core_info.len); // Pure literal cases don't have floatingMinDistance set, so we always // start the match region at zero. const size_t start = 0; hwlmExecStreaming(ftable, scratch, len2, start, rosePureLiteralCallback, scratch, rose->initialGroups, hwlm_stream_state); if (!told_to_stop_matching(scratch) && isAllExhausted(rose, scratch->core_info.exhaustionVector)) { DEBUG_PRINTF("stream exhausted\n"); scratch->core_info.status |= STATUS_EXHAUSTED; } }
/**
 * Fire every report in the MO_INVALID_IDX-terminated ReportID list located
 * `base_offset` bytes into the engine, at stream offset `stream_offset`.
 *
 * Nothing is reported once matching has been terminated. For SOM engines
 * whose deduper has not yet seen a report, the SOM logs are cleared first.
 * Reports go through roseSimpleAdaptor() when the engine uses the simple
 * callback, and through roseAdaptor() otherwise.
 */
static never_inline
void processReportList(const struct RoseEngine *rose, u32 base_offset,
                       u64a stream_offset, hs_scratch_t *scratch) {
    DEBUG_PRINTF("running report list at offset %u\n", base_offset);

    if (told_to_stop_matching(scratch)) {
        DEBUG_PRINTF("matching has been terminated\n");
        return;
    }

    if (rose->hasSom && scratch->deduper.current_report_offset == ~0ULL) {
        /* Matches are about to be reported, so initialisation of the SOM
         * deduper logs can no longer be deferred. It is done by hand here
         * because the vacuous-report path bypasses the usual SOM handling:
         * these reports are known to come from non-SOM patterns. */
        fatbit_clear(scratch->deduper.som_log[0]);
        fatbit_clear(scratch->deduper.som_log[1]);
        scratch->deduper.som_log_dirty = 0;
    }

    const ReportID *cursor =
        (const ReportID *)((const char *)rose + base_offset);

    /* Vacuous reports are always external, so SOM is never needed here. */
    if (rose->simpleCallback) {
        while (*cursor != MO_INVALID_IDX) {
            roseSimpleAdaptor(stream_offset, *cursor, scratch);
            cursor++;
        }
    } else {
        while (*cursor != MO_INVALID_IDX) {
            roseAdaptor(stream_offset, *cursor, scratch);
            cursor++;
        }
    }
}
void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { DEBUG_PRINTF("OH HAI [%llu, %llu)\n", scratch->core_info.buf_offset, scratch->core_info.buf_offset + (u64a)scratch->core_info.len); assert(t); assert(scratch->core_info.hbuf); assert(scratch->core_info.buf); // We should not have been called if we've already been told to terminate // matching. assert(!told_to_stop_matching(scratch)); assert(mmbit_sparse_iter_state_size(t->rolesWithStateCount) < MAX_SPARSE_ITER_STATES); size_t length = scratch->core_info.len; u64a offset = scratch->core_info.buf_offset; // We may have a maximum width (for engines constructed entirely // of bi-anchored patterns). If this write would result in us progressing // beyond this point, we cannot possibly match. if (t->maxBiAnchoredWidth != ROSE_BOUND_INF && offset + length > t->maxBiAnchoredWidth) { DEBUG_PRINTF("bailing, write would progress beyond maxBAWidth\n"); return; } char *state = scratch->core_info.state; struct RoseContext *tctxt = &scratch->tctxt; tctxt->mpv_inactive = 0; tctxt->groups = loadGroups(t, state); tctxt->lit_offset_adjust = offset + 1; // index after last byte tctxt->delayLastEndOffset = offset; tctxt->lastEndOffset = offset; tctxt->filledDelayedSlots = 0; tctxt->lastMatchOffset = 0; tctxt->minMatchOffset = offset; tctxt->minNonMpvMatchOffset = offset; tctxt->next_mpv_offset = 0; DEBUG_PRINTF("BEGIN: history len=%zu, buffer len=%zu groups=%016llx\n", scratch->core_info.hlen, scratch->core_info.len, tctxt->groups); fatbit_clear(scratch->aqa); scratch->al_log_sum = 0; scratch->catchup_pq.qm_size = 0; if (t->outfixBeginQueue != t->outfixEndQueue) { streamInitSufPQ(t, state, scratch); } runEagerPrefixesStream(t, scratch); u32 alen = t->anchoredDistance > offset ? 
MIN(length + offset, t->anchoredDistance) - offset : 0; const struct anchored_matcher_info *atable = getALiteralMatcher(t); if (atable && alen) { DEBUG_PRINTF("BEGIN ANCHORED %zu/%u\n", scratch->core_info.hlen, alen); runAnchoredTableStream(t, atable, alen, offset, scratch); if (can_stop_matching(scratch)) { goto exit; } } const struct HWLM *ftable = getFLiteralMatcher(t); if (ftable) { if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); goto flush_delay_and_exit; } size_t flen = length; if (t->floatingDistance != ROSE_BOUND_INF) { flen = t->floatingDistance > offset ? MIN(t->floatingDistance, length + offset) - offset : 0; } size_t hlength = scratch->core_info.hlen; char rebuild = hlength && (scratch->core_info.status & STATUS_DELAY_DIRTY) && (t->maxFloatingDelayedMatch == ROSE_BOUND_INF || offset < t->maxFloatingDelayedMatch); DEBUG_PRINTF("**rebuild %hhd status %hhu mfdm %u, offset %llu\n", rebuild, scratch->core_info.status, t->maxFloatingDelayedMatch, offset); if (!flen) { if (rebuild) { /* rebuild floating delayed match stuff */ do_rebuild(t, ftable, scratch); } goto flush_delay_and_exit; } if (rebuild) { /* rebuild floating delayed match stuff */ do_rebuild(t, ftable, scratch); } if (flen + offset <= t->floatingMinDistance) { DEBUG_PRINTF("skip FLOATING: before floating min\n"); goto flush_delay_and_exit; } size_t start = 0; if (offset < t->floatingMinDistance) { // This scan crosses the floating min distance, so we can use that // to set HWLM's "start" offset. 
start = t->floatingMinDistance - offset; } DEBUG_PRINTF("start=%zu\n", start); u8 *stream_state; if (t->floatingStreamState) { stream_state = getFloatingMatcherState(t, state); } else { stream_state = NULL; } DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); hwlmExecStreaming(ftable, scratch, flen, start, roseFloatingCallback, scratch, tctxt->groups & t->floating_group_mask, stream_state); } flush_delay_and_exit: DEBUG_PRINTF("flushing floating\n"); if (cleanUpDelayed(t, scratch, length, offset) == HWLM_TERMINATE_MATCHING) { return; } exit: DEBUG_PRINTF("CLEAN UP TIME\n"); if (!can_stop_matching(scratch)) { ensureStreamNeatAndTidy(t, state, scratch, length, offset); } DEBUG_PRINTF("DONE STREAMING SCAN, status = %u\n", scratch->core_info.status); return; }
/**
 * Block-mode scan of a single buffer.
 *
 * After argument, database and scratch validation, the buffer is rejected
 * cheaply when it is shorter than the engine's minimum width. Otherwise core
 * info is populated, boundary report lists are fired, and the buffer is
 * handed to the small-write engine (when present and the buffer is below its
 * largest supported size) or to the block executor selected by the engine's
 * runtime implementation. Stored SOM matches are flushed and the EOD boundary
 * reports fired before returning.
 *
 * Returns HS_INVALID for a missing scratch/data argument, a misaligned
 * bytecode pointer or a scratch that fails validScratch(); the error from
 * validDatabase() when the database is rejected; HS_DB_MODE_ERROR when the
 * database is not in block mode; HS_SCAN_TERMINATED when matching was
 * stopped; HS_SUCCESS otherwise.
 */
HS_PUBLIC_API
hs_error_t hs_scan(const hs_database_t *db, const char *data, unsigned length,
                   unsigned flags, hs_scratch_t *scratch,
                   match_event_handler onEvent, void *userCtx) {
    if (unlikely(!scratch || !data)) {
        return HS_INVALID;
    }

    hs_error_t db_status = validDatabase(db);
    if (unlikely(db_status != HS_SUCCESS)) {
        return db_status;
    }

    const struct RoseEngine *rose = hs_get_bytecode(db);
    if (unlikely(!ISALIGNED_16(rose))) {
        return HS_INVALID;
    }

    if (unlikely(rose->mode != HS_MODE_BLOCK)) {
        return HS_DB_MODE_ERROR;
    }

    if (unlikely(!validScratch(rose, scratch))) {
        return HS_INVALID;
    }

    /* Too short for anything in the database to match. */
    if (rose->minWidth > length) {
        DEBUG_PRINTF("minwidth=%u > length=%u\n", rose->minWidth, length);
        return HS_SUCCESS;
    }

    prefetch_data(data, length);

    /* Set up the scratch core info for this block, with no history. */
    populateCoreInfo(scratch, rose, scratch->bstate, onEvent, userCtx, data,
                     length, NULL, 0, 0, flags);

    clearEvec(scratch->core_info.exhaustionVector, rose);

    if (!length) {
        /* Empty buffer: only the zero-length EOD reports can fire. */
        if (rose->boundary.reportZeroEodOffset) {
            processReportList(rose, rose->boundary.reportZeroEodOffset, 0,
                              scratch);
        }
        goto set_retval;
    }

    if (rose->boundary.reportZeroOffset) {
        processReportList(rose, rose->boundary.reportZeroOffset, 0, scratch);
    }

    if (rose->minWidthExcludingBoundaries > length) {
        DEBUG_PRINTF("minWidthExcludingBoundaries=%u > length=%u\n",
                     rose->minWidthExcludingBoundaries, length);
        goto done_scan;
    }

    /* Engines built purely from bi-anchored patterns also have a maximum
     * width beyond which the block cannot match. */
    if (rose->maxBiAnchoredWidth != ROSE_BOUND_INF
        && length > rose->maxBiAnchoredWidth) {
        DEBUG_PRINTF("block len=%u longer than maxBAWidth=%u\n", length,
                     rose->maxBiAnchoredWidth);
        goto done_scan;
    }

    /* Small-write path: used only when the engine has one and the buffer is
     * below its largest supported size; larger buffers go to Rose proper. */
    if (rose->smallWriteOffset) {
        const struct SmallWriteEngine *small_write = getSmallWrite(rose);
        assert(small_write);

        if (length < small_write->largestBuffer) {
            DEBUG_PRINTF("Attempting small write of block %u bytes long.\n",
                         length);
            runSmallWriteEngine(small_write, scratch);
            goto done_scan;
        }
    }

    switch (rose->runtimeImpl) {
    default:
        assert(0);
        /* fallthrough */
    case ROSE_RUNTIME_FULL_ROSE:
        rawBlockExec(rose, scratch);
        break;
    case ROSE_RUNTIME_PURE_LITERAL:
        pureLiteralBlockExec(rose, scratch);
        break;
    case ROSE_RUNTIME_SINGLE_OUTFIX:
        soleOutfixBlockExec(rose, scratch);
        break;
    }

done_scan:
    if (told_to_stop_matching(scratch)) {
        return HS_SCAN_TERMINATED;
    }

    if (rose->hasSom) {
        if (flushStoredSomMatches(scratch, ~0ULL)) {
            return HS_SCAN_TERMINATED;
        }
    }

    if (rose->boundary.reportEodOffset) {
        processReportList(rose, rose->boundary.reportEodOffset, length,
                          scratch);
    }

set_retval:
    DEBUG_PRINTF("done. told_to_stop_matching=%d\n",
                 told_to_stop_matching(scratch));
    if (told_to_stop_matching(scratch)) {
        return HS_SCAN_TERMINATED;
    }
    return HS_SUCCESS;
}
static inline hs_error_t hs_scan_stream_internal(hs_stream_t *id, const char *data, unsigned length, UNUSED unsigned flags, hs_scratch_t *scratch, match_event_handler onEvent, void *context) { if (unlikely(!id || !scratch || !data || !validScratch(id->rose, scratch))) { return HS_INVALID; } const struct RoseEngine *rose = id->rose; char *state = getMultiState(id); u8 broken = getBroken(state); if (broken) { DEBUG_PRINTF("stream is broken, halting scan\n"); if (broken == BROKEN_FROM_USER) { return HS_SCAN_TERMINATED; } else { assert(broken == BROKEN_EXHAUSTED); return HS_SUCCESS; } } // We avoid doing any work if the user has given us zero bytes of data to // scan. Arguably we should define some semantics for how we treat vacuous // cases here. if (unlikely(length == 0)) { DEBUG_PRINTF("zero length block\n"); assert(getBroken(state) != BROKEN_FROM_USER); return HS_SUCCESS; } u32 historyAmount = getHistoryAmount(rose, id->offset); populateCoreInfo(scratch, rose, state, onEvent, context, data, length, getHistory(state, rose, id->offset), historyAmount, id->offset, flags); assert(scratch->core_info.hlen <= id->offset && scratch->core_info.hlen <= rose->historyRequired); prefetch_data(data, length); if (rose->somLocationCount) { loadSomFromStream(scratch, id->offset); } if (!id->offset && rose->boundary.reportZeroOffset) { DEBUG_PRINTF("zero reports\n"); processReportList(rose, rose->boundary.reportZeroOffset, 0, scratch); } switch (rose->runtimeImpl) { default: assert(0); case ROSE_RUNTIME_FULL_ROSE: rawStreamExec(id, scratch); break; case ROSE_RUNTIME_PURE_LITERAL: pureLiteralStreamExec(id, scratch); break; case ROSE_RUNTIME_SINGLE_OUTFIX: soleOutfixStreamExec(id, scratch); } if (rose->hasSom && !told_to_stop_matching(scratch)) { int halt = flushStoredSomMatches(scratch, ~0ULL); if (halt) { setBroken(state, BROKEN_FROM_USER); scratch->core_info.broken = BROKEN_FROM_USER; } } if (likely(!can_stop_matching(scratch))) { maintainHistoryBuffer(id->rose, getMultiState(id), 
data, length); id->offset += length; /* maintain offset */ if (rose->somLocationCount) { storeSomToStream(scratch, id->offset); } } else if (told_to_stop_matching(scratch)) { return HS_SCAN_TERMINATED; } else { /* exhausted */ setBroken(state, BROKEN_EXHAUSTED); } return HS_SUCCESS; }