int main(int argc, char** argv) { if (argc < 2) { fprintf(stderr, "Usage: %s n", argv[0]); return 1; } const uint32_t n = atol(argv[1]); float *buf; posix_memalign((void**) &buf, 16, sizeof(float)*n); printf("Initialize a random buffer of %u floats...\n", n); srand(time(NULL)); for (uint32_t i = 0; i < n; i++) { buf[i] = (float) rand(); } printf("Done!\n"); { float min, max; uint32_t min_idx, max_idx; BENCH_START(org); minmax(n, buf, &min_idx, &max_idx, &min, &max); BENCH_END(org, "org", sizeof(float), n, 1, 1); printf("Min (idx): %0.4f (%u)\n", min, min_idx); printf("Max (idx): %0.4f (%u)\n", max, max_idx); } { float min, max; uint32_t min_idx, max_idx; BENCH_START(sse); minmax_vec(n, buf, &min_idx, &max_idx, &min, &max); BENCH_END(sse, "sse", sizeof(float), n, 1, 1); printf("Min (idx): %0.4f (%u)\n", min, min_idx); printf("Max (idx): %0.4f (%u)\n", max, max_idx); } { float min, max; uint32_t min_idx, max_idx; BENCH_START(sse); minmax_vec2(n, buf, &min_idx, &max_idx, &min, &max); BENCH_END(sse, "sse2", sizeof(float), n, 1, 1); printf("Min (idx): %0.4f (%u)\n", min, min_idx); printf("Max (idx): %0.4f (%u)\n", max, max_idx); } return 0; }
uint32_t* compute(float const* pts, const size_t n, const uint32_t size_interval_u32, const float size_interval) { std::vector<uint32_t*> hist_locals; hist_locals.resize(omp_get_num_threads(), NULL); BENCH_START(b); #pragma omp parallel num_threads(4) { uint32_t* hist_local; posix_memalign((void**) &hist_local, 16, sizeof(uint32_t)*size_interval_u32); memset(hist_local, 0, sizeof(uint32_t)*size_interval_u32); hist_locals[omp_get_thread_num()] = hist_local; #pragma omp for for (size_t i = 0; i < n; i++) { const size_t idx = floor(pts[i]/size_interval); hist_local[idx]++; } } std::cerr << "Final reduction..." << std::endl; uint32_t* final_hist = NULL; // Find the first non-null partial histogram std::vector<uint32_t*>::const_iterator it; for (it = hist_locals.begin(); it != hist_locals.end(); it++) { if (*it != NULL) { final_hist = *it; it++; break; } } if (final_hist == NULL) { return NULL; } // Final reduction for (; it != hist_locals.end(); it++) { uint32_t* hist_local = *it; if (hist_local == NULL) { continue; } for (uint32_t i = 0; i < size_interval_u32; i++) { final_hist[i] += hist_local[i]; } } BENCH_END(b, "compute_omp", sizeof(float), n, sizeof(uint32_t), n); return final_hist; }