Example #1
static inline void rdtsc_prof_start(struct rdtsc_prof *p)
{
	p->clk_start = rte_rdtsc_precise();
}
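A minimal sketch of how such a start helper is typically paired with a stop helper around a measured section. Only rte_rdtsc_precise() (from <rte_cycles.h>) and the clk_start field appear in the example above; the remaining struct fields, rdtsc_prof_end() and measure_section() are assumptions for illustration, not taken from the original source.

#include <stdint.h>
#include <rte_cycles.h>

/* Sketch only: every field except clk_start is assumed here. */
struct rdtsc_prof {
	uint64_t clk_start;    /* TSC value captured at section entry */
	uint64_t total_cycles; /* cycles accumulated over all sections */
	uint64_t num_calls;    /* how many sections were measured */
};

/* hypothetical counterpart to rdtsc_prof_start() above */
static inline void rdtsc_prof_end(struct rdtsc_prof *p)
{
	p->total_cycles += rte_rdtsc_precise() - p->clk_start;
	p->num_calls++;
}

/* usage: bracket the code of interest with the start/end pair */
static void measure_section(struct rdtsc_prof *p)
{
	rdtsc_prof_start(p);
	/* ... code under measurement ... */
	rdtsc_prof_end(p);
}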
/* run benchmark per burst size */
static inline int
pmd_cyclecount_bench_burst_sz(
		struct pmd_cyclecount_state *state, uint16_t test_burst_size)
{
	uint64_t tsc_start;
	uint64_t tsc_end;
	uint64_t tsc_op;
	uint64_t tsc_enq;
	uint64_t tsc_deq;
	uint32_t cur_op;

	/* reset all counters */
	tsc_enq = 0;
	tsc_deq = 0;
	state->ops_enqd = 0;
	state->ops_enq_retries = 0;
	state->ops_deqd = 0;
	state->ops_deq_retries = 0;

	/*
	 * Benchmark crypto op alloc-build-free separately.
	 */
	tsc_start = rte_rdtsc_precise();

	for (cur_op = 0; cur_op < state->opts->total_ops;
			cur_op += state->opts->nb_descriptors) {
		if (unlikely(pmd_cyclecount_bench_ops(
				state, cur_op, test_burst_size)))
			return -1;
	}

	tsc_end = rte_rdtsc_precise();
	tsc_op = tsc_end - tsc_start;

	/*
	 * Hardware acceleration cyclecount benchmarking loop.
	 *
	 * We're benchmarking raw enq/deq performance by filling up the device
	 * queue, so we never get any failed enqs unless the driver won't accept
	 * the exact number of descriptors we requested, or the driver won't
	 * wrap around the end of the TX ring. However, since we're only
	 * dequeueing once we've filled up the queue, we have to benchmark it
	 * piecemeal and then average out the results.
	 */
	cur_op = 0;
	while (cur_op < state->opts->total_ops) {
		uint32_t iter_ops_left = state->opts->total_ops - cur_op;
		uint32_t iter_ops_needed = RTE_MIN(
				state->opts->nb_descriptors, iter_ops_left);
		uint32_t iter_ops_allocd = iter_ops_needed;

		/* allocate and build ops */
		if (unlikely(pmd_cyclecount_build_ops(state, iter_ops_needed,
				test_burst_size)))
			return -1;

		tsc_start = rte_rdtsc_precise();

		/* fill up TX ring */
		iter_ops_needed = pmd_cyclecount_bench_enq(state,
				iter_ops_needed, test_burst_size);

		tsc_end = rte_rdtsc_precise();

		tsc_enq += tsc_end - tsc_start;

		/* allow for HW to catch up */
		if (state->delay)
			rte_delay_us_block(state->delay);

		tsc_start = rte_rdtsc_precise();

		/* drain RX ring */
		pmd_cyclecount_bench_deq(state, iter_ops_needed,
				test_burst_size);

		tsc_end = rte_rdtsc_precise();

		tsc_deq += tsc_end - tsc_start;

		cur_op += iter_ops_needed;

		/*
		 * we may not have processed all ops that we allocated, so
		 * free everything we've allocated.
		 */
		rte_mempool_put_bulk(state->ctx->pool,
				(void **)state->ctx->ops, iter_ops_allocd);
	}

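	/* average cycles per op over the whole run */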
	state->cycles_per_build = (double)tsc_op / state->opts->total_ops;
	state->cycles_per_enq = (double)tsc_enq / state->ops_enqd;
	state->cycles_per_deq = (double)tsc_deq / state->ops_deqd;

	return 0;
}
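To illustrate how this per-burst-size benchmark could be driven, here is a hedged sketch of a caller that sweeps a range of burst sizes and prints the averaged cycle counts computed above. The function name, the min/max burst-size parameters and the output format are assumptions for illustration; they are not taken from the example above, which only defines pmd_cyclecount_bench_burst_sz() and the cycles_per_* fields it fills in.

#include <stdint.h>
#include <stdio.h>

/* Sketch only: builds on pmd_cyclecount_bench_burst_sz() above.
 * Assumes min_burst_size >= 1; the sweep doubles the burst size
 * on each step. */
static int
pmd_cyclecount_sweep_burst_sizes(struct pmd_cyclecount_state *state,
		uint16_t min_burst_size, uint16_t max_burst_size)
{
	uint32_t burst_size;

	for (burst_size = min_burst_size; burst_size <= max_burst_size;
			burst_size *= 2) {
		if (pmd_cyclecount_bench_burst_sz(state,
				(uint16_t)burst_size) < 0)
			return -1;

		printf("burst %u: %.2f cycles/build, %.2f cycles/enq, "
				"%.2f cycles/deq\n",
				burst_size,
				state->cycles_per_build,
				state->cycles_per_enq,
				state->cycles_per_deq);
	}

	return 0;
}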