int benchmark(int num_elements, int num_directions, OpType op_type, int loc, int precision, bool evaluate_2d, int niter, double& time_taken) { int status = 0; int type = precision | OSKAR_COMPLEX; oskar_Mem *beam = 0, *signal = 0, *z = 0, *z_i = 0; oskar_Mem *x = oskar_mem_create(precision, loc, num_directions, &status); oskar_Mem *y = oskar_mem_create(precision, loc, num_directions, &status); oskar_Mem *x_i = oskar_mem_create(precision, loc, num_elements, &status); oskar_Mem *y_i = oskar_mem_create(precision, loc, num_elements, &status); oskar_Mem *weights = oskar_mem_create(type, loc, num_elements, &status); if (!evaluate_2d) { z = oskar_mem_create(precision, loc, num_directions, &status); z_i = oskar_mem_create(precision, loc, num_elements, &status); } if (op_type == O2C) beam = oskar_mem_create(type, loc, num_directions, &status); else if (op_type == C2C || op_type == M2M) { int num_signals = num_directions * num_elements; if (op_type == C2C) { beam = oskar_mem_create(type, loc, num_directions, &status); signal = oskar_mem_create(type, loc, num_signals, &status); } else { type |= OSKAR_MATRIX; beam = oskar_mem_create(type, loc, num_directions, &status); signal = oskar_mem_create(type, loc, num_signals, &status); } } oskar_Timer *tmr = oskar_timer_create(OSKAR_TIMER_NATIVE); if (!status) { oskar_timer_start(tmr); for (int i = 0; i < niter; ++i) { oskar_dftw(num_elements, 2.0 * M_PI, x_i, y_i, z_i, weights, num_directions, x, y, z, signal, beam, &status); } time_taken = oskar_timer_elapsed(tmr); } // Free memory. oskar_timer_free(tmr); oskar_mem_free(x, &status); oskar_mem_free(y, &status); oskar_mem_free(z, &status); oskar_mem_free(x_i, &status); oskar_mem_free(y_i, &status); oskar_mem_free(z_i, &status); oskar_mem_free(weights, &status); oskar_mem_free(beam, &status); oskar_mem_free(signal, &status); return status; }
/*
 * Returns 'part' as a percentage of 'total', or 0 when 'total' is not
 * positive (so that no division by zero takes place).
 */
static double percent_of(double part, double total)
{
    return (total > 0.0) ? (part / total) * 100.0 : 0.0;
}

/*
 * Writes the "Simulation timing" section to the log held by the simulator.
 *
 * The section contains the total wall time, the compute time of each device,
 * the file write time, and the share of the summed compute time spent in each
 * compute component (copy, horizon clip, Jones E, Jones K, Jones join,
 * correlate), with the unaccounted remainder reported as "Other".
 *
 * If the per-device scratch array cannot be allocated, an error is logged
 * and no timing section is written.
 */
static void record_timing(oskar_Simulator* h)
{
    int i;
    double t_copy = 0., t_clip = 0., t_E = 0., t_K = 0., t_join = 0.;
    double t_correlate = 0., t_compute = 0., t_components = 0.;
    double *compute_times = 0;

    /* Only allocate (and only treat NULL as failure) if there are devices:
     * calloc() may legitimately return NULL for a zero-sized request. */
    if (h->num_devices > 0)
    {
        compute_times = (double*) calloc((size_t) h->num_devices,
                sizeof *compute_times);
        if (!compute_times)
        {
            oskar_log_error(h->log,
                    "Unable to allocate memory for timing report.");
            return;
        }
    }

    /* Obtain component times, summed over all devices. */
    for (i = 0; i < h->num_devices; ++i)
    {
        compute_times[i] = oskar_timer_elapsed(h->d[i].tmr_compute);
        t_copy += oskar_timer_elapsed(h->d[i].tmr_copy);
        t_clip += oskar_timer_elapsed(h->d[i].tmr_clip);
        t_join += oskar_timer_elapsed(h->d[i].tmr_join);
        t_E += oskar_timer_elapsed(h->d[i].tmr_E);
        t_K += oskar_timer_elapsed(h->d[i].tmr_K);
        t_correlate += oskar_timer_elapsed(h->d[i].tmr_correlate);
        t_compute += compute_times[i];
    }
    t_components = t_copy + t_clip + t_E + t_K + t_join + t_correlate;

    /* Record time taken. */
    oskar_log_section(h->log, 'M', "Simulation timing");
    oskar_log_value(h->log, 'M', 0, "Total wall time", "%.3f s",
            oskar_timer_elapsed(h->tmr_sim));
    for (i = 0; i < h->num_devices; ++i)
    {
        oskar_log_value(h->log, 'M', 0, "Compute", "%.3f s [Device %i]",
                compute_times[i], i);
    }
    oskar_log_value(h->log, 'M', 0, "Write", "%.3f s",
            oskar_timer_elapsed(h->tmr_write));

    /* Component breakdown, as percentages of the summed compute time. */
    oskar_log_message(h->log, 'M', 0, "Compute components:");
    oskar_log_value(h->log, 'M', 1, "Copy", "%4.1f%%",
            percent_of(t_copy, t_compute));
    oskar_log_value(h->log, 'M', 1, "Horizon clip", "%4.1f%%",
            percent_of(t_clip, t_compute));
    oskar_log_value(h->log, 'M', 1, "Jones E", "%4.1f%%",
            percent_of(t_E, t_compute));
    oskar_log_value(h->log, 'M', 1, "Jones K", "%4.1f%%",
            percent_of(t_K, t_compute));
    oskar_log_value(h->log, 'M', 1, "Jones join", "%4.1f%%",
            percent_of(t_join, t_compute));
    oskar_log_value(h->log, 'M', 1, "Jones correlate", "%4.1f%%",
            percent_of(t_correlate, t_compute));
    oskar_log_value(h->log, 'M', 1, "Other", "%4.1f%%",
            percent_of(t_compute - t_components, t_compute));

    free(compute_times);
}
int benchmark(int num_stations, int num_sources, int type, int jones_type, int loc, int use_extended, int use_time_ave, int niter, std::vector<double>& times) { int status = 0; oskar_Timer* timer; timer = oskar_timer_create(loc == OSKAR_GPU ? OSKAR_TIMER_CUDA : OSKAR_TIMER_OMP); // Set up a test sky model, telescope model and Jones matrices. oskar_Telescope* tel = oskar_telescope_create(type, loc, num_stations, &status); oskar_Sky* sky = oskar_sky_create(type, loc, num_sources, &status); oskar_Jones* J = oskar_jones_create(jones_type, loc, num_stations, num_sources, &status); oskar_telescope_set_channel_bandwidth(tel, 1e6); oskar_telescope_set_time_average(tel, (double) use_time_ave); oskar_sky_set_use_extended(sky, use_extended); // Memory for visibility coordinates and output visibility slice. oskar_Mem *vis, *u, *v, *w; vis = oskar_mem_create(jones_type, loc, oskar_telescope_num_baselines(tel), &status); u = oskar_mem_create(type, loc, num_stations, &status); v = oskar_mem_create(type, loc, num_stations, &status); w = oskar_mem_create(type, loc, num_stations, &status); // Run benchmark. times.resize(niter); for (int i = 0; i < niter; ++i) { oskar_timer_start(timer); oskar_cross_correlate(vis, oskar_sky_num_sources(sky), J, sky, tel, u, v, w, 0.0, 100e6, &status); times[i] = oskar_timer_elapsed(timer); } // Free memory. oskar_mem_free(u, &status); oskar_mem_free(v, &status); oskar_mem_free(w, &status); oskar_mem_free(vis, &status); oskar_jones_free(J, &status); oskar_telescope_free(tel, &status); oskar_sky_free(sky, &status); oskar_timer_free(timer); return status; }
/*
 * Runs the simulation described by the handle and writes its output.
 *
 * Steps, in order:
 *  - Return immediately if *status is already set.
 *  - Fail with OSKAR_ERR_FILE_IO if no output file name has been given.
 *  - Call oskar_simulator_check_init().
 *  - Log memory usage, start the simulation timer and process all
 *    visibility blocks inside an OpenMP parallel region (see below).
 *  - Log final memory usage, a warning about noise without beam
 *    normalisation (when applicable), the timing summary and output names.
 *  - Store the log text in the output file(s) and call
 *    oskar_simulator_finalise().
 *
 * h       Simulator handle.
 * status  Status code, read on entry and updated on error.
 */
void oskar_simulator_run(oskar_Simulator* h, int* status)
{
    int i, num_threads = 1, num_vis_blocks;
    if (*status) return;

    /* Check the visibilities are going somewhere.
     * When built with OSKAR_NO_MS only the binary file name counts, and an
     * extra error is logged if a Measurement Set name was supplied. */
    if (!h->vis_name
#ifndef OSKAR_NO_MS
            && !h->ms_name
#endif
    )
    {
        oskar_log_error(h->log, "No output file specified.");
#ifdef OSKAR_NO_MS
        if (h->ms_name)
            oskar_log_error(h->log, "OSKAR was compiled without Measurement Set support.");
#endif
        *status = OSKAR_ERR_FILE_IO;
        return;
    }

    /* Initialise if required. */
    oskar_simulator_check_init(h, status);

    /* Get the number of visibility blocks to be processed. */
    num_vis_blocks = oskar_simulator_num_vis_blocks(h);

    /* Record memory usage (skipped if there is no log or an error is set). */
    if (h->log && !*status)
    {
        oskar_log_section(h->log, 'M', "Initial memory usage");
#ifdef OSKAR_HAVE_CUDA
        for (i = 0; i < h->num_gpus; ++i)
            oskar_cuda_mem_log(h->log, 0, h->gpu_ids[i]);
#endif
        system_mem_log(h->log);
        oskar_log_section(h->log, 'M', "Starting simulation...");
    }

    /* Start simulation timer. */
    oskar_timer_start(h->tmr_sim);

    /*-----------------------------------------------------------------------
     *-- START OF MULTITHREADED SIMULATION CODE -----------------------------
     *-----------------------------------------------------------------------*/
    /* Loop over blocks of observation time, running simulation and file
     * writing one block at a time. Simulation and file output are overlapped
     * by using double buffering, and a dedicated thread is used for file
     * output.
     *
     * Thread 0 is used for file writes.
     * Threads 1 to n (mapped to compute devices) do the simulation.
     *
     * Note that no write is launched on the first loop counter (as no
     * data are ready yet) and no simulation is performed for the last loop
     * counter (which corresponds to the last block + 1) as this iteration
     * simply writes the last block.
     *
     * Without OpenMP, num_threads stays at 1 and the single thread (ID 0,
     * device ID 0) both simulates and writes each block.
     */
#ifdef _OPENMP
    num_threads = h->num_devices + 1;
    omp_set_num_threads(num_threads);
    omp_set_nested(0);
#else
    oskar_log_warning(h->log, "OpenMP not found: Using one compute device.");
#endif
    oskar_simulator_reset_work_unit_index(h);
#pragma omp parallel
    {
        int b, thread_id = 0, device_id = 0;

        /* Get host thread ID and device ID.
         * With OpenMP the device ID is the thread ID minus one, so the
         * writer thread (ID 0) has device ID -1. */
#ifdef _OPENMP
        thread_id = omp_get_thread_num();
        device_id = thread_id - 1;
#endif

        /* Loop over simulation time blocks (+1, for the last write). */
        for (b = 0; b < num_vis_blocks + 1; ++b)
        {
            /* Compute threads (or the only thread) simulate block b. */
            if ((thread_id > 0 || num_threads == 1) && b < num_vis_blocks)
                oskar_simulator_run_block(h, b, device_id, status);

            /* Thread 0 finalises and writes the previous block. */
            if (thread_id == 0 && b > 0)
            {
                oskar_VisBlock* block;
                block = oskar_simulator_finalise_block(h, b - 1, status);
                oskar_simulator_write_block(h, block, b - 1, status);
            }

            /* Barrier 1: Reset work unit index. */
#pragma omp barrier
            if (thread_id == 0) oskar_simulator_reset_work_unit_index(h);

            /* Barrier 2: Synchronise before moving to the next block. */
#pragma omp barrier
            /* Progress message, printed by thread 0 only. */
            if (thread_id == 0 && b < num_vis_blocks && h->log && !*status)
                oskar_log_message(h->log, 'S', 0, "Block %*i/%i (%3.0f%%) "
                        "complete. Simulation time elapsed: %.3f s",
                        disp_width(num_vis_blocks), b+1, num_vis_blocks,
                        100.0 * (b+1) / (double)num_vis_blocks,
                        oskar_timer_elapsed(h->tmr_sim));
        }
    }
    /*-----------------------------------------------------------------------
     *-- END OF MULTITHREADED SIMULATION CODE -------------------------------
     *-----------------------------------------------------------------------*/

    /* Record memory usage. */
    if (h->log && !*status)
    {
        oskar_log_section(h->log, 'M', "Final memory usage");
#ifdef OSKAR_HAVE_CUDA
        for (i = 0; i < h->num_gpus; ++i)
            oskar_cuda_mem_log(h->log, 0, h->gpu_ids[i]);
#endif
        system_mem_log(h->log);
    }

    /* If there are sources in the simulation and the station beam is not
     * normalised to 1.0 at the phase centre, the values of noise RMS
     * may give a very unexpected S/N ratio!
     * The alternative would be to scale the noise to match the station
     * beam gain but that would require knowledge of the station beam
     * amplitude at the phase centre for each time and channel.
     *
     * Only the first sky chunk and station 0 are inspected here. */
    if (h->log && oskar_telescope_noise_enabled(h->tel) && !*status)
    {
        int have_sources, amp_calibrated;
        have_sources = (h->num_sky_chunks > 0 &&
                oskar_sky_num_sources(h->sky_chunks[0]) > 0);
        amp_calibrated = oskar_station_normalise_final_beam(
                oskar_telescope_station_const(h->tel, 0));
        if (have_sources && !amp_calibrated)
        {
            const char* a = "WARNING: System noise added to visibilities";
            const char* b = "without station beam normalisation enabled.";
            const char* c = "This will give an invalid signal to noise ratio.";
            oskar_log_line(h->log, 'W', ' ');
            oskar_log_line(h->log, 'W', '*');
            oskar_log_message(h->log, 'W', -1, a);
            oskar_log_message(h->log, 'W', -1, b);
            oskar_log_message(h->log, 'W', -1, c);
            oskar_log_line(h->log, 'W', '*');
            oskar_log_line(h->log, 'W', ' ');
        }
    }

    /* Record times and summarise output files. */
    if (h->log && !*status)
    {
        size_t log_size = 0;
        char* log_data;
        oskar_log_set_value_width(h->log, 25);
        record_timing(h);
        oskar_log_section(h->log, 'M', "Simulation complete");
        oskar_log_message(h->log, 'M', 0, "Output(s):");
        if (h->vis_name)
            oskar_log_value(h->log, 'M', 1,
                    "OSKAR binary file", "%s", h->vis_name);
        if (h->ms_name)
            oskar_log_value(h->log, 'M', 1,
                    "Measurement Set", "%s", h->ms_name);

        /* Write simulation log to the output files.
         * The buffer returned by oskar_log_file_data() is released here
         * with free(). */
        log_data = oskar_log_file_data(h->log, &log_size);
#ifndef OSKAR_NO_MS
        if (h->ms)
            oskar_ms_add_history(h->ms, "OSKAR_LOG", log_data, log_size);
#endif
        if (h->vis)
            oskar_binary_write(h->vis, OSKAR_CHAR, OSKAR_TAG_GROUP_RUN,
                    OSKAR_TAG_RUN_LOG, 0, log_size, log_data, status);
        free(log_data);
    }

    /* Finalise. */
    oskar_simulator_finalise(h, status);
}
static void* run_blocks(void* arg) { oskar_Interferometer* h; int b, thread_id, device_id, num_blocks, num_threads, *status; /* Get thread function arguments. */ h = ((ThreadArgs*)arg)->h; num_threads = ((ThreadArgs*)arg)->num_threads; thread_id = ((ThreadArgs*)arg)->thread_id; device_id = thread_id - 1; status = &(h->status); #ifdef _OPENMP /* Disable any nested parallelism. */ omp_set_nested(0); omp_set_num_threads(1); #endif /* Loop over blocks of observation time, running simulation and file * writing one block at a time. Simulation and file output are overlapped * by using double buffering, and a dedicated thread is used for file * output. * * Thread 0 is used for file writes. * Threads 1 to n (mapped to compute devices) do the simulation. * * Note that no write is launched on the first loop counter (as no * data are ready yet) and no simulation is performed for the last loop * counter (which corresponds to the last block + 1) as this iteration * simply writes the last block. */ num_blocks = oskar_interferometer_num_vis_blocks(h); for (b = 0; b < num_blocks + 1; ++b) { if ((thread_id > 0 || num_threads == 1) && b < num_blocks) oskar_interferometer_run_block(h, b, device_id, status); if (thread_id == 0 && b > 0) { oskar_VisBlock* block; block = oskar_interferometer_finalise_block(h, b - 1, status); oskar_interferometer_write_block(h, block, b - 1, status); } /* Barrier 1: Reset work unit index and print status. */ oskar_barrier_wait(h->barrier); if (thread_id == 0) { oskar_interferometer_reset_work_unit_index(h); if (b < num_blocks && h->log && !*status) oskar_log_message(h->log, 'S', 0, "Block %*i/%i (%3.0f%%) " "complete. Simulation time elapsed: %.3f s", disp_width(num_blocks), b+1, num_blocks, 100.0 * (b+1) / (double)num_blocks, oskar_timer_elapsed(h->tmr_sim)); } /* Barrier 2: Synchronise before moving to the next block. */ oskar_barrier_wait(h->barrier); } return 0; }
// Fills single and double precision arrays with uniform random numbers on
// both CPU and GPU using the same seed and counters, reports the time of
// each call, and checks that:
//  - CPU and GPU results agree (to 1e-5 in single, 1e-10 in double);
//  - single and double precision CPU results agree to 1e-5.
// When 'save' is set, all four arrays are written to random_uniform.txt.
TEST(Mem, random_uniform)
{
    int seed = 1;
    int c1 = 437;
    int c2 = 0;
    int c3 = 0xDECAFBAD;
    int n = 544357;
    int status = 0;
    double max_err = 0.0, avg_err = 0.0;
    oskar_Mem* v_cpu_f = oskar_mem_create(OSKAR_SINGLE, OSKAR_CPU, n, &status);
    oskar_Mem* v_gpu_f = oskar_mem_create(OSKAR_SINGLE, OSKAR_GPU, n, &status);
    oskar_Mem* v_cpu_d = oskar_mem_create(OSKAR_DOUBLE, OSKAR_CPU, n, &status);
    oskar_Mem* v_gpu_d = oskar_mem_create(OSKAR_DOUBLE, OSKAR_GPU, n, &status);
    oskar_Timer* tmr = oskar_timer_create(OSKAR_TIMER_CUDA);

    // Run in single precision.
    oskar_timer_start(tmr);
    oskar_mem_random_uniform(v_cpu_f, seed, c1, c2, c3, &status);
    report_time(n, "uniform", "single", "CPU", oskar_timer_elapsed(tmr));
    ASSERT_EQ(0, status) << oskar_get_error_string(status);
    oskar_timer_start(tmr);
    oskar_mem_random_uniform(v_gpu_f, seed, c1, c2, c3, &status);
    report_time(n, "uniform", "single", "GPU", oskar_timer_elapsed(tmr));
    ASSERT_EQ(0, status) << oskar_get_error_string(status);

    // Check consistency between CPU and GPU results.
    oskar_mem_evaluate_relative_error(v_gpu_f, v_cpu_f, 0,
            &max_err, &avg_err, 0, &status);
    EXPECT_LT(max_err, 1e-5);
    EXPECT_LT(avg_err, 1e-5);

    // Run in double precision.
    oskar_timer_start(tmr);
    oskar_mem_random_uniform(v_cpu_d, seed, c1, c2, c3, &status);
    report_time(n, "uniform", "double", "CPU", oskar_timer_elapsed(tmr));
    ASSERT_EQ(0, status) << oskar_get_error_string(status);
    oskar_timer_start(tmr);
    oskar_mem_random_uniform(v_gpu_d, seed, c1, c2, c3, &status);
    report_time(n, "uniform", "double", "GPU", oskar_timer_elapsed(tmr));
    ASSERT_EQ(0, status) << oskar_get_error_string(status);

    // Check consistency between CPU and GPU results.
    oskar_mem_evaluate_relative_error(v_gpu_d, v_cpu_d, 0,
            &max_err, &avg_err, 0, &status);
    EXPECT_LT(max_err, 1e-10);
    EXPECT_LT(avg_err, 1e-10);

    // Check consistency between single and double precision.
    oskar_mem_evaluate_relative_error(v_cpu_f, v_cpu_d, 0,
            &max_err, &avg_err, 0, &status);
    EXPECT_LT(max_err, 1e-5);
    EXPECT_LT(avg_err, 1e-5);

    if (save)
    {
        // Only write and close the file if it could actually be opened;
        // a failed fopen() is reported as a non-fatal test failure so the
        // clean-up below still runs.
        FILE* fhan = fopen("random_uniform.txt", "w");
        if (fhan)
        {
            oskar_mem_save_ascii(fhan, 4, n, &status,
                    v_cpu_f, v_gpu_f, v_cpu_d, v_gpu_d);
            fclose(fhan);
        }
        else
        {
            ADD_FAILURE() << "Unable to open random_uniform.txt for writing";
        }
    }

    // Free memory.
    oskar_mem_free(v_cpu_f, &status);
    oskar_mem_free(v_gpu_f, &status);
    oskar_mem_free(v_cpu_d, &status);
    oskar_mem_free(v_gpu_d, &status);
    oskar_timer_free(tmr);
}
void runTest(int prec1, int prec2, int loc1, int loc2, int matrix, int extended, double time_average) { int num_baselines, status = 0, type; oskar_Mem *vis1, *vis2; oskar_Timer *timer1, *timer2; double time1, time2, frequency = 100e6; // Create the timers. timer1 = oskar_timer_create(loc1 == OSKAR_GPU ? OSKAR_TIMER_CUDA : OSKAR_TIMER_NATIVE); timer2 = oskar_timer_create(loc2 == OSKAR_GPU ? OSKAR_TIMER_CUDA : OSKAR_TIMER_NATIVE); // Run first part. createTestData(prec1, loc1, matrix); num_baselines = oskar_telescope_num_baselines(tel); type = prec1 | OSKAR_COMPLEX; if (matrix) type |= OSKAR_MATRIX; vis1 = oskar_mem_create(type, loc1, num_baselines, &status); oskar_mem_clear_contents(vis1, &status); ASSERT_EQ(0, status) << oskar_get_error_string(status); oskar_sky_set_use_extended(sky, extended); oskar_telescope_set_channel_bandwidth(tel, bandwidth); oskar_telescope_set_time_average(tel, time_average); oskar_timer_start(timer1); oskar_cross_correlate(vis1, oskar_sky_num_sources(sky), jones, sky, tel, u_, v_, w_, 1.0, frequency, &status); time1 = oskar_timer_elapsed(timer1); destroyTestData(); ASSERT_EQ(0, status) << oskar_get_error_string(status); // Run second part. createTestData(prec2, loc2, matrix); num_baselines = oskar_telescope_num_baselines(tel); type = prec2 | OSKAR_COMPLEX; if (matrix) type |= OSKAR_MATRIX; vis2 = oskar_mem_create(type, loc2, num_baselines, &status); oskar_mem_clear_contents(vis2, &status); ASSERT_EQ(0, status) << oskar_get_error_string(status); oskar_sky_set_use_extended(sky, extended); oskar_telescope_set_channel_bandwidth(tel, bandwidth); oskar_telescope_set_time_average(tel, time_average); oskar_timer_start(timer2); oskar_cross_correlate(vis2, oskar_sky_num_sources(sky), jones, sky, tel, u_, v_, w_, 1.0, frequency, &status); time2 = oskar_timer_elapsed(timer2); destroyTestData(); ASSERT_EQ(0, status) << oskar_get_error_string(status); // Destroy the timers. oskar_timer_free(timer1); oskar_timer_free(timer2); // Compare results. 
check_values(vis1, vis2); // Free memory. oskar_mem_free(vis1, &status); oskar_mem_free(vis2, &status); ASSERT_EQ(0, status) << oskar_get_error_string(status); // Record properties for test. RecordProperty("SourceType", extended ? "Gaussian" : "Point"); RecordProperty("JonesType", matrix ? "Matrix" : "Scalar"); RecordProperty("TimeSmearing", time_average == 0.0 ? "off" : "on"); RecordProperty("Prec1", prec1 == OSKAR_SINGLE ? "Single" : "Double"); RecordProperty("Loc1", loc1 == OSKAR_CPU ? "CPU" : "GPU"); RecordProperty("Time1_ms", int(time1 * 1000)); RecordProperty("Prec2", prec2 == OSKAR_SINGLE ? "Single" : "Double"); RecordProperty("Loc2", loc2 == OSKAR_CPU ? "CPU" : "GPU"); RecordProperty("Time2_ms", int(time2 * 1000)); #ifdef ALLOW_PRINTING // Print times. printf(" > %s. %s sources. Time smearing %s.\n", matrix ? "Matrix" : "Scalar", extended ? "Gaussian" : "Point", time_average == 0.0 ? "off" : "on"); printf(" %s precision %s: %.2f ms, %s precision %s: %.2f ms\n", prec1 == OSKAR_SINGLE ? "Single" : "Double", loc1 == OSKAR_CPU ? "CPU" : "GPU", time1 * 1000.0, prec2 == OSKAR_SINGLE ? "Single" : "Double", loc2 == OSKAR_CPU ? "CPU" : "GPU", time2 * 1000.0); #endif }
TEST(prefix_sum, test) { int n = 100000, status = 0, exclusive = 1; oskar_Mem* in_cpu = oskar_mem_create(OSKAR_INT, OSKAR_CPU, n, &status); oskar_Mem* out_cpu = oskar_mem_create(OSKAR_INT, OSKAR_CPU, n, &status); oskar_Timer* tmr = oskar_timer_create(OSKAR_TIMER_NATIVE); // Fill input with random integers from 0 to 9. int* t = oskar_mem_int(in_cpu, &status); srand(1556); for (int i = 0; i < n; ++i) t[i] = (int) (10.0 * rand() / ((double) RAND_MAX)); t[0] = 3; // Run on CPU. oskar_timer_start(tmr); oskar_prefix_sum(n, in_cpu, out_cpu, 0, exclusive, &status); EXPECT_EQ(0, status); printf("Prefix sum on CPU took %.3f sec\n", oskar_timer_elapsed(tmr)); #ifdef OSKAR_HAVE_CUDA // Run on GPU with CUDA. oskar_Mem* in_gpu = oskar_mem_create_copy(in_cpu, OSKAR_GPU, &status); oskar_Mem* out_gpu = oskar_mem_create(OSKAR_INT, OSKAR_GPU, n, &status); oskar_timer_start(tmr); oskar_prefix_sum(n, in_gpu, out_gpu, 0, exclusive, &status); EXPECT_EQ(0, status); printf("Prefix sum on GPU took %.3f sec\n", oskar_timer_elapsed(tmr)); // Check consistency between CPU and GPU results. oskar_Mem* out_cmp_gpu = oskar_mem_create_copy(out_gpu, OSKAR_CPU, &status); EXPECT_EQ(0, oskar_mem_different(out_cpu, out_cmp_gpu, n, &status)); #endif #ifdef OSKAR_HAVE_OPENCL // Run on OpenCL. oskar_Mem* in_cl = oskar_mem_create_copy(in_cpu, OSKAR_CL, &status); oskar_Mem* out_cl = oskar_mem_create(OSKAR_INT, OSKAR_CL, n, &status); oskar_timer_start(tmr); printf("Using %s\n", oskar_cl_device_name()); oskar_prefix_sum(n, in_cl, out_cl, 0, exclusive, &status); EXPECT_EQ(0, status); printf("Prefix sum on OpenCL took %.3f sec\n", oskar_timer_elapsed(tmr)); // Check consistency between CPU and OpenCL results. 
oskar_Mem* out_cmp_cl = oskar_mem_create_copy(out_cl, OSKAR_CPU, &status); EXPECT_EQ(0, oskar_mem_different(out_cpu, out_cmp_cl, n, &status)); #endif if (save) { size_t num_mem = 1; FILE* fhan = fopen("prefix_sum_test.txt", "w"); #ifdef OSKAR_HAVE_CUDA num_mem += 1; #endif #ifdef OSKAR_HAVE_OPENCL num_mem += 1; #endif oskar_mem_save_ascii(fhan, num_mem, n, &status, out_cpu #ifdef OSKAR_HAVE_CUDA , out_cmp_gpu #endif #ifdef OSKAR_HAVE_OPENCL , out_cmp_cl #endif ); fclose(fhan); } // Clean up. oskar_timer_free(tmr); oskar_mem_free(in_cpu, &status); oskar_mem_free(out_cpu, &status); #ifdef OSKAR_HAVE_CUDA oskar_mem_free(in_gpu, &status); oskar_mem_free(out_gpu, &status); oskar_mem_free(out_cmp_gpu, &status); #endif #ifdef OSKAR_HAVE_OPENCL oskar_mem_free(in_cl, &status); oskar_mem_free(out_cl, &status); oskar_mem_free(out_cmp_cl, &status); #endif }
void runTest(int prec1, int prec2, int loc1, int loc2, int matrix) { int status = 0, type; oskar_Mem *beam1, *beam2; oskar_Timer *timer1, *timer2; double time1, time2; // Create the timers. timer1 = oskar_timer_create(loc1 == OSKAR_GPU ? OSKAR_TIMER_CUDA : OSKAR_TIMER_NATIVE); timer2 = oskar_timer_create(loc2 == OSKAR_GPU ? OSKAR_TIMER_CUDA : OSKAR_TIMER_NATIVE); // Run first part. type = prec1 | OSKAR_COMPLEX; if (matrix) type |= OSKAR_MATRIX; beam1 = oskar_mem_create(type, loc1, num_sources, &status); oskar_mem_clear_contents(beam1, &status); ASSERT_EQ(0, status) << oskar_get_error_string(status); createTestData(prec1, loc1, matrix); oskar_timer_start(timer1); oskar_evaluate_cross_power(num_sources, num_stations, jones, 0, beam1, &status); time1 = oskar_timer_elapsed(timer1); destroyTestData(); ASSERT_EQ(0, status) << oskar_get_error_string(status); // Run second part. type = prec2 | OSKAR_COMPLEX; if (matrix) type |= OSKAR_MATRIX; beam2 = oskar_mem_create(type, loc2, num_sources, &status); oskar_mem_clear_contents(beam2, &status); ASSERT_EQ(0, status) << oskar_get_error_string(status); createTestData(prec2, loc2, matrix); oskar_timer_start(timer2); oskar_evaluate_cross_power(num_sources, num_stations, jones, 0, beam2, &status); time2 = oskar_timer_elapsed(timer2); destroyTestData(); ASSERT_EQ(0, status) << oskar_get_error_string(status); // Destroy the timers. oskar_timer_free(timer1); oskar_timer_free(timer2); // Compare results. check_values(beam1, beam2); // Free memory. oskar_mem_free(beam1, &status); oskar_mem_free(beam2, &status); ASSERT_EQ(0, status) << oskar_get_error_string(status); // Record properties for test. RecordProperty("JonesType", matrix ? "Matrix" : "Scalar"); RecordProperty("Prec1", prec1 == OSKAR_SINGLE ? "Single" : "Double"); RecordProperty("Loc1", loc1 == OSKAR_CPU ? "CPU" : "GPU"); RecordProperty("Time1_ms", int(time1 * 1000)); RecordProperty("Prec2", prec2 == OSKAR_SINGLE ? 
"Single" : "Double"); RecordProperty("Loc2", loc2 == OSKAR_CPU ? "CPU" : "GPU"); RecordProperty("Time2_ms", int(time2 * 1000)); #ifdef ALLOW_PRINTING // Print times. printf(" > %s.\n", matrix ? "Matrix" : "Scalar"); printf(" %s precision %s: %.2f ms, %s precision %s: %.2f ms\n", prec1 == OSKAR_SINGLE ? "Single" : "Double", loc1 == OSKAR_CPU ? "CPU" : "GPU", time1 * 1000.0, prec2 == OSKAR_SINGLE ? "Single" : "Double", loc2 == OSKAR_CPU ? "CPU" : "GPU", time2 * 1000.0); #endif }
int benchmark(int num_elements, int num_directions, OpType op_type, int loc, int precision, bool evaluate_2d, int niter, double& time_taken) { int status = 0; // Create the timer. oskar_Timer *tmr = oskar_timer_create(OSKAR_TIMER_CUDA); oskar_Station* station = oskar_station_create(precision, loc, num_elements, &status); if (status) return status; station->array_is_3d = (evaluate_2d) ? OSKAR_FALSE : OSKAR_TRUE; oskar_Mem *x, *y, *z, *weights = 0, *beam = 0, *signal = 0; x = oskar_mem_create(precision, loc, num_directions, &status); y = oskar_mem_create(precision, loc, num_directions, &status); z = oskar_mem_create(precision, loc, num_directions, &status); if (status) return status; if (op_type == O2C) { int type = precision | OSKAR_COMPLEX; beam = oskar_mem_create(type, loc, num_directions, &status); weights = oskar_mem_create(type, loc, num_elements, &status); if (status) return status; oskar_timer_start(tmr); for (int i = 0; i < niter; ++i) { oskar_evaluate_array_pattern(beam, 2.0 * M_PI, station, num_directions, x, y, z, weights, &status); } time_taken = oskar_timer_elapsed(tmr); } else if (op_type == C2C || op_type == M2M) { int type = precision | OSKAR_COMPLEX; int num_signals = num_directions * num_elements; weights = oskar_mem_create(type, loc, num_elements, &status); if (op_type == C2C) { beam = oskar_mem_create(type, loc, num_directions, &status); signal = oskar_mem_create(type, loc, num_signals, &status); } else { type |= OSKAR_MATRIX; beam = oskar_mem_create(type, loc, num_directions, &status); signal = oskar_mem_create(type, loc, num_signals, &status); } if (status) return status; oskar_timer_start(tmr); for (int i = 0; i < niter; ++i) { oskar_evaluate_array_pattern_hierarchical(beam, 2.0 * M_PI, station, num_directions, x, y, z, signal, weights, &status); } time_taken = oskar_timer_elapsed(tmr); } // Destroy the timer. oskar_timer_free(tmr); // Free memory. 
oskar_station_free(station, &status); oskar_mem_free(x, &status); oskar_mem_free(y, &status); oskar_mem_free(z, &status); oskar_mem_free(weights, &status); oskar_mem_free(beam, &status); oskar_mem_free(signal, &status); return status; }