/* record the TSC at the start of a profiled region */
static inline void
rdtsc_prof_start(struct rdtsc_prof *p)
{
	p->clk_start = rte_rdtsc_precise();
}
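/*
 * For illustration only: a hypothetical companion that closes a measurement
 * started by rdtsc_prof_start(). Only clk_start is visible above; the
 * clk_min/clk_max/clk_avgc/clk_avg fields are assumptions, but this is the
 * usual shape of a TSC-based profiler: read the counter again, take the
 * delta, and fold it into running statistics.
 */
static inline void
rdtsc_prof_end_sketch(struct rdtsc_prof *p)
{
	uint64_t clk_diff = rte_rdtsc_precise() - p->clk_start;

	if (clk_diff < p->clk_min)
		p->clk_min = clk_diff;
	if (clk_diff > p->clk_max)
		p->clk_max = clk_diff;
	/* incremental running average avoids accumulating a huge sum */
	p->clk_avgc++;
	p->clk_avg += ((double)clk_diff - p->clk_avg) / p->clk_avgc;
}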
/* run benchmark per burst size */
static inline int
pmd_cyclecount_bench_burst_sz(
		struct pmd_cyclecount_state *state, uint16_t test_burst_size)
{
	uint64_t tsc_start;
	uint64_t tsc_end;
	uint64_t tsc_op;
	uint64_t tsc_enq;
	uint64_t tsc_deq;
	uint32_t cur_op;

	/* reset all counters */
	tsc_enq = 0;
	tsc_deq = 0;
	state->ops_enqd = 0;
	state->ops_enq_retries = 0;
	state->ops_deqd = 0;
	state->ops_deq_retries = 0;

	/*
	 * Benchmark crypto op alloc-build-free separately.
	 */
	tsc_start = rte_rdtsc_precise();

	for (cur_op = 0; cur_op < state->opts->total_ops;
			cur_op += state->opts->nb_descriptors) {
		if (unlikely(pmd_cyclecount_bench_ops(
				state, cur_op, test_burst_size)))
			return -1;
	}

	tsc_end = rte_rdtsc_precise();
	tsc_op = tsc_end - tsc_start;

	/*
	 * Hardware acceleration cyclecount benchmarking loop.
	 *
	 * We're benchmarking raw enq/deq performance by filling up the device
	 * queue, so we never get any failed enqs unless the driver won't
	 * accept the exact number of descriptors we requested, or the driver
	 * won't wrap around the end of the TX ring. However, since we're only
	 * dequeueing once we've filled up the queue, we have to benchmark it
	 * piecemeal and then average out the results.
	 */
	cur_op = 0;
	while (cur_op < state->opts->total_ops) {
		uint32_t iter_ops_left = state->opts->total_ops - cur_op;
		uint32_t iter_ops_needed = RTE_MIN(
				state->opts->nb_descriptors, iter_ops_left);
		uint32_t iter_ops_allocd = iter_ops_needed;

		/* allocate and build ops */
		if (unlikely(pmd_cyclecount_build_ops(state, iter_ops_needed,
				test_burst_size)))
			return -1;

		tsc_start = rte_rdtsc_precise();

		/* fill up TX ring */
		iter_ops_needed = pmd_cyclecount_bench_enq(state,
				iter_ops_needed, test_burst_size);

		tsc_end = rte_rdtsc_precise();
		tsc_enq += tsc_end - tsc_start;

		/* allow for HW to catch up */
		if (state->delay)
			rte_delay_us_block(state->delay);

		tsc_start = rte_rdtsc_precise();

		/* drain RX ring */
		pmd_cyclecount_bench_deq(state, iter_ops_needed,
				test_burst_size);

		tsc_end = rte_rdtsc_precise();
		tsc_deq += tsc_end - tsc_start;

		cur_op += iter_ops_needed;

		/*
		 * we may not have processed all ops that we allocated, so
		 * free everything we've allocated.
		 */
		rte_mempool_put_bulk(state->ctx->pool,
				(void **)state->ctx->ops, iter_ops_allocd);
	}

	state->cycles_per_build = (double)tsc_op / state->opts->total_ops;
	state->cycles_per_enq = (double)tsc_enq / state->ops_enqd;
	state->cycles_per_deq = (double)tsc_deq / state->ops_deqd;

	return 0;
}
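/*
 * Sketch only, not the actual implementation: the enqueue helper called
 * above, pmd_cyclecount_bench_enq(), is expected to split the iteration
 * into bursts, call rte_cryptodev_enqueue_burst(), and count a retry
 * whenever the device accepts fewer ops than offered. The ctx field names
 * used here (dev_id, qp_id, ops) are assumptions for illustration; the
 * return value is the number of ops actually enqueued, which the caller
 * then dequeues.
 */
static uint32_t
bench_enq_sketch(struct pmd_cyclecount_state *state, uint32_t iter_ops_needed,
		uint16_t test_burst_size)
{
	uint32_t cur_iter_op = 0;

	while (cur_iter_op < iter_ops_needed) {
		uint32_t iter_ops_left = iter_ops_needed - cur_iter_op;
		uint16_t burst_size = RTE_MIN(test_burst_size, iter_ops_left);
		struct rte_crypto_op **ops = &state->ctx->ops[cur_iter_op];
		uint16_t burst_enqd;

		burst_enqd = rte_cryptodev_enqueue_burst(state->ctx->dev_id,
				state->ctx->qp_id, ops, burst_size);

		/* a short enqueue means the ring pushed back; count a retry */
		if (burst_enqd < burst_size)
			state->ops_enq_retries++;
		state->ops_enqd += burst_enqd;
		cur_iter_op += burst_enqd;

		/*
		 * stop offering ops once the ring stops accepting them, so
		 * the caller only dequeues what was actually enqueued
		 */
		if (burst_enqd < burst_size)
			break;
	}

	return cur_iter_op;
}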