// Benchmark driver.
//
// Usage: <program> n
//
// Allocates a 16-byte aligned buffer of n floats, fills it with rand() values
// and runs minmax(), minmax_vec() and minmax_vec2() on it, each wrapped in the
// BENCH_START/BENCH_END macros. The min/max values and indices reported by
// each variant are printed so that the results can be compared by eye.
//
// Returns 0 on success, 1 on invalid arguments or allocation failure.
int main(int argc, char** argv)
{
	if (argc < 2) {
		fprintf(stderr, "Usage: %s n\n", argv[0]);
		return 1;
	}

	// Parse n strictly: atol() would silently turn garbage into 0 and let
	// negative or oversized values wrap when narrowed to uint32_t.
	char* end = NULL;
	const unsigned long parsed = strtoul(argv[1], &end, 10);
	const uint32_t n = static_cast<uint32_t>(parsed);
	if (end == argv[1] || *end != '\0' || argv[1][0] == '-' || n != parsed || n == 0) {
		// n == 0 is rejected as well: the min/max of an empty buffer is undefined.
		fprintf(stderr, "Invalid element count '%s': expected an integer in [1, 2^32-1]\n", argv[1]);
		return 1;
	}

	// 16-byte alignment as requested by the original allocation (presumably
	// for aligned SSE loads in the vectorized variants).
	void* raw = NULL;
	if (posix_memalign(&raw, 16, sizeof(float)*n) != 0) {
		fprintf(stderr, "Failed to allocate a buffer of %u floats\n", n);
		return 1;
	}
	float* buf = static_cast<float*>(raw);

	printf("Initialize a random buffer of %u floats...\n", n);
	srand(time(NULL));
	for (uint32_t i = 0; i < n; i++) {
		buf[i] = (float) rand();
	}
	printf("Done!\n");

	// Reference implementation.
	{
		float min, max;
		uint32_t min_idx, max_idx;

		BENCH_START(org);
		minmax(n, buf, &min_idx, &max_idx, &min, &max);
		BENCH_END(org, "org", sizeof(float), n, 1, 1);

		printf("Min (idx): %0.4f (%u)\n", min, min_idx);
		printf("Max (idx): %0.4f (%u)\n", max, max_idx);
	}

	// First vectorized variant.
	{
		float min, max;
		uint32_t min_idx, max_idx;

		BENCH_START(sse);
		minmax_vec(n, buf, &min_idx, &max_idx, &min, &max);
		BENCH_END(sse, "sse", sizeof(float), n, 1, 1);

		printf("Min (idx): %0.4f (%u)\n", min, min_idx);
		printf("Max (idx): %0.4f (%u)\n", max, max_idx);
	}

	// Second vectorized variant.
	{
		float min, max;
		uint32_t min_idx, max_idx;

		BENCH_START(sse);
		minmax_vec2(n, buf, &min_idx, &max_idx, &min, &max);
		BENCH_END(sse, "sse2", sizeof(float), n, 1, 1);

		printf("Min (idx): %0.4f (%u)\n", min, min_idx);
		printf("Max (idx): %0.4f (%u)\n", max, max_idx);
	}

	// Memory obtained from posix_memalign() is released with free().
	free(buf);

	return 0;
}
// Builds a histogram of `pts` in parallel with OpenMP.
//
// Each thread of the team accumulates into its own private histogram, so the
// counting loop needs no synchronization; the partial histograms are then
// summed serially into the first one.
//
// Parameters:
//   pts               - input values (n of them).
//   n                 - number of elements in `pts`.
//   size_interval_u32 - number of bins in the histogram.
//   size_interval     - divisor applied to each value; the bin of pts[i] is
//                       floor(pts[i]/size_interval).
//
// Returns a buffer of size_interval_u32 counters allocated with
// posix_memalign(); the caller owns it and must release it with free().
// Returns NULL if a per-thread histogram could not be allocated.
uint32_t* compute(float const* pts, const size_t n, const uint32_t size_interval_u32, const float size_interval)
{
	// Team size requested for the parallel region below. The slot table must be
	// sized from this value: omp_get_num_threads() returns 1 when called outside
	// of a parallel region, which would leave a single slot for up to 4 writers.
	const int max_threads = 4;
	const size_t hist_bytes = sizeof(uint32_t)*size_interval_u32;

	std::vector<uint32_t*> hist_locals;
	hist_locals.resize(max_threads, NULL);

	// Set by any thread whose allocation fails; read by all after the barrier.
	bool alloc_failed = false;

	BENCH_START(b);

#pragma omp parallel num_threads(max_threads)
	{
		void* raw = NULL;
		uint32_t* hist_local = NULL;
		if (posix_memalign(&raw, 16, hist_bytes) == 0 && raw != NULL) {
			hist_local = static_cast<uint32_t*>(raw);
			memset(hist_local, 0, hist_bytes);
			// Each thread writes only its own slot, so no locking is needed.
			hist_locals[omp_get_thread_num()] = hist_local;
		}
		else {
#pragma omp critical
			alloc_failed = true;
		}

		// Every thread must take the same decision about entering the
		// worksharing loop, so wait until all allocation results are known.
#pragma omp barrier
		if (!alloc_failed) {
#pragma omp for
			for (size_t i = 0; i < n; i++) {
				// NOTE(review): assumes 0 <= pts[i]/size_interval < size_interval_u32;
				// values outside that range index out of bounds - confirm with callers.
				const size_t idx = floor(pts[i]/size_interval);
				hist_local[idx]++;
			}
		}
	}

	if (alloc_failed) {
		// Release whatever the other threads managed to allocate.
		for (size_t t = 0; t < hist_locals.size(); t++) {
			free(hist_locals[t]);
		}
		return NULL;
	}

	std::cerr << "Final reduction..." << std::endl;
	uint32_t* final_hist = NULL;

	// The first non-null partial histogram becomes the result; every other one
	// is added into it and then released. Slots stay NULL when the runtime
	// provided fewer threads than requested.
	for (size_t t = 0; t < hist_locals.size(); t++) {
		uint32_t* partial = hist_locals[t];
		if (partial == NULL) {
			continue;
		}
		if (final_hist == NULL) {
			final_hist = partial;
			continue;
		}
		for (uint32_t i = 0; i < size_interval_u32; i++) {
			final_hist[i] += partial[i];
		}
		free(partial);
		hist_locals[t] = NULL;
	}

	if (final_hist == NULL) {
		return NULL;
	}

	BENCH_END(b, "compute_omp", sizeof(float), n, sizeof(uint32_t), n);

	return final_hist;
}