int main (int argc, const char * argv[]) { printf("4. OpenCL Profile No Overhead in the Loop\n"); float range = BIG_RANGE; float *in, *out; // ======== Initialize init_all_perfs(); create_data(&in, &out); start_perf_measurement(&total_perf); // ======== Setup OpenCL setup_cl(argc, argv, &opencl_device, &opencl_context, &opencl_queue); // ======== Setup the computation setup_cl_compute(); start_perf_measurement(&write_perf); copy_data_to_device(in, out); stop_perf_measurement(&write_perf); // ======== Compute while (range > LIMIT) { // Calculation start_perf_measurement(&update_perf); update_cl(get_in_buffer(), get_out_buffer()); stop_perf_measurement(&update_perf); // Read back the data start_perf_measurement(&read_perf); read_back_data(get_out_buffer(), out); stop_perf_measurement(&read_perf); // Compute Range start_perf_measurement(&range_perf); range = find_range(out, SIZE*SIZE); stop_perf_measurement(&range_perf); iterations++; printf("Iteration %d, range=%f.\n", iterations, range); } // ======== Finish and cleanup OpenCL start_perf_measurement(&finish_perf); clFinish(opencl_queue); stop_perf_measurement(&finish_perf); start_perf_measurement(&cleanup_perf); cleanup_cl(); stop_perf_measurement(&cleanup_perf); stop_perf_measurement(&total_perf); print_perfs(); free(in); free(out); }
/*
 * One-shot OpenCL smoke test: calls init_opencl(), load_cl_kernels() and
 * allocate_cl_buffers(), transfers the buffers to the device, runs a single
 * run_cl_advect_density() step, transfers the buffers back and finally
 * calls cleanup_cl(). The queue is flushed after each transfer/compute stage.
 *
 * use_gpu: forwarded unchanged to init_opencl().
 *
 * The parameter is now declared as `int` explicitly; the previous
 * `(use_gpu)` form relied on implicit int, which is not valid since C99.
 */
void run_opencl_test(int use_gpu){
    init_opencl(use_gpu);
    load_cl_kernels(&clData);
    allocate_cl_buffers(&clData);

    transfer_buffers_to_gpu();
    flush_cl_queue();

    run_cl_advect_density(&clData, dt);
    flush_cl_queue();

    transfer_buffers_to_cpu();
    flush_cl_queue();

    /* Disabled debugging output, kept for reference:
     *
     *   printf("dens[%d] = %3.2f\n",IX(16,3,0),g_dens[IX(16,3,0)]);
     *
     *   if(g_dens[IX(16,3,0)] > 0.0f)
     *   {
     *       printf("Success!!\n");
     *   }
     *
     *   for (int i = 0; i < clData.n; ++i)
     *   {
     *       if(i == 112) {
     *           int j = i*clData.dn;
     *           printf("debug_data1[%d] = %3.2f, %3.2f, %3.2f, %3.2f\n",i,clData.debug_data1[j], clData.debug_data1[j+1], clData.debug_data1[j+2], clData.debug_data1[j+3]);
     *       }
     *   }
     */

    cleanup_cl(&clData);
}
void runTimings(int use_gpu){ int ntrips = 10; char device_name[256]; timestamp_type time1, time2; //////////////////////////////////////////////////// ///GPU TIMINGS //////////////////////////////////////////////////// init_opencl(use_gpu); load_cl_kernels(&clData); allocate_cl_buffers(&clData); print_device_info_from_queue(clData.queue); get_device_name_from_queue(clData.queue, device_name, 256); transfer_buffers_to_gpu(); double advectionVelocityTimeGPU, advectionDensityTimeGPU, divergenceTimeGPU, projectJacobiTimeGPU, projectCGTimeGPU, pressureApplyTimeGPU; transfer_buffers_to_gpu(); get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_advect_velocity(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); advectionVelocityTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_calculate_divergence(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); divergenceTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; transfer_buffers_to_cpu(); flush_cl_queue(); //This needs ntrips different divergence matrices to get accurate timings. 
//This is because by the time the second time it is called it will detect //the system is solved and exit after one matrix get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { transfer_cl_float_buffer_from_device(&clData,clData.buf_pressure,g_pressure,clData.n,true); transfer_cl_float_buffer_from_device(&clData,clData.buf_divergence,g_divergence,clData.n,true); run_cl_cg_no_mtx(&clData,g_pressure, g_divergence, g_cg_r, g_cg_d, g_cg_q, clData.n, 10, 0.0001f); flush_cl_queue(); transfer_cl_float_buffer_to_device(&clData,clData.buf_pressure,g_pressure,clData.n,true); } flush_cl_queue(); get_timestamp(&time2); projectCGTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { for(int i = 0; i < 20; ++i) { run_cl_pressure_solve(&clData, dt); } } flush_cl_queue(); get_timestamp(&time2); projectJacobiTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_pressure_apply(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); pressureApplyTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_advect_density(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); advectionDensityTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU","Advection Velocity",advectionVelocityTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionVelocityTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU","Advection Density",advectionDensityTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionDensityTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU", "Divergence",divergenceTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/divergenceTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU", "Projection 
Jacobi",projectJacobiTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectJacobiTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t",device_name,NX,NY,NZ,"GPU", "Projection Conjugate Gradient",projectCGTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectCGTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU","Pressure Apply",pressureApplyTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/pressureApplyTimeGPU); cleanup_cl(&clData); //////////////////////////////////////////////////// ///CPU TIMINGS //////////////////////////////////////////////////// double advectionVelocityTimeCPU, advectionDensityTimeCPU, divergenceTimeCPU, projectJacobiTimeCPU, projectCGTimeCPU, pressureApplyTimeCPU; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { advect_velocity_RK2(dt, g_u, g_v, g_w, g_u_prev, g_v_prev, g_w_prev); } get_timestamp(&time2); advectionVelocityTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; //project(dt,g_u,g_v, g_w, g_divergence, g_pressure, g_pressure_prev, g_laplacian_matrix,useCG); get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { calculate_divergence(g_divergence, g_u, g_v, g_w, dt); } get_timestamp(&time2); divergenceTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; //This needs ntrips different divergence matrices to get accurate timings. 
//This is because by the time the second time it is called it will detect //the system is solved and exit after one matrix get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { pressure_solve_cg_no_matrix(g_pressure, g_divergence, g_cg_r, g_cg_d, g_cg_q); } get_timestamp(&time2); projectCGTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { pressure_solve(g_pressure,g_pressure_prev, g_divergence, dt); } get_timestamp(&time2); projectJacobiTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { pressure_apply(g_u, g_v, g_w, g_pressure, dt); } get_timestamp(&time2); pressureApplyTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { advectRK2(dt,g_dens,g_dens_prev, g_u, g_v, g_w); } get_timestamp(&time2); advectionDensityTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Advection Velocity",advectionVelocityTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionVelocityTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Advection Density",advectionDensityTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionDensityTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Divergence",divergenceTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/divergenceTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Projection Jacobi",projectJacobiTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectJacobiTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Projection Conjugate Gradient",projectCGTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectCGTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Pressure Apply",pressureApplyTimeCPU); 
printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/pressureApplyTimeCPU); }
int main ( int argc, char ** argv ) { // Parse command line options // int use_gpu = 1; int use_interop = 0; for(int i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; else if(strstr(argv[i], "interop")) use_interop = 1; } printf("Parameter detect %s device (%s)\n",use_gpu==1?"GPU":"CPU",use_interop==1?"Share OpenGL":"Not Sharing OpenGL"); OPENCL_SHARE_WITH_OPENGL = use_interop; //testCG(); win_x = 512; win_y = 512; glutInit ( &argc, argv ); open_glut_window (); //test_opencl_opengl_interop(); dt = 0.1f; force = 10.0f; source = 10.0f; printf ( "\n\nHow to use this demo:\n\n" ); printf ( "\t Add densities with the left mouse button\n" ); printf ( "\t Add velocities with the left mouse button and dragging the mouse\n" ); printf ( "\t Toggle density/velocity display with the 'v' key\n" ); printf ( "\t Clear the simulation by pressing the 'x' key\n" ); printf ( "\t switch poisson solvers from jacobi to conjugate gradient by pressing the 'c' key\n" ); printf ( "\t switch advection scheme from RK2 to MacCormack by pressing the 'm' key\n" ); printf ( "\t toggle vorticity confinement by pressing the 'o' key\n" ); printf ( "\t Quit by pressing the 'q' key\n" ); dvel = 0; step = 0; maccormack = 0; vorticity = 0; useCG = 0; if ( !allocate_data () ) exit ( 1 ); clear_data (); //setupMatrix(g_laplacian_matrix); // FOR_EACH_FACE // { // //if(i < NX - NX*0.4 && i > NX*0.4 // // && // // j < NY - NY*0.4 && j > NY*0.4 ) // { // g_u_prev[IX(i,j,0)] = -0.01 * cosf(3.14159 * 2.0 * i/NX); // g_v_prev[IX(i,j,0)] = 0.01 * sinf(3.14159 * 2.0 * j/NY); // } // } #if RUN_TIMINGS runTimings(use_gpu); exit(0); #endif copy_grid(g_u_prev, g_u); copy_grid(g_v_prev, g_v); g_dens_prev[IX(16,3,0)] = 10.0f; //g_u_prev[IX(16,3,0)] = 10.0f; /* calculate_divergence(g_divergence, g_u_prev, g_v_prev, g_w_prev, dt); pressure_solve(g_pressure,g_pressure_prev, g_divergence, dt); pressure_apply(g_u_prev, g_v_prev, 
g_w_prev, g_pressure, dt); //project(dt,g_u_prev,g_v_prev, g_w_prev, g_divergence, g_pressure, g_pressure_prev); SWAP(g_u_prev,g_u); SWAP(g_v_prev,g_v); SWAP(g_w_prev,g_w); if(!check_divergence(g_u_prev, g_v_prev, g_w_prev)) { printf("Initial field wasn't divergence free!\n"); } */ //print_platforms_devices(); // run_opencl_test(use_gpu); // run_tests(); #if USE_OPENCL init_opencl(use_gpu); load_cl_kernels(&clData); allocate_cl_buffers(&clData); transfer_buffers_to_gpu(); flush_cl_queue(); #endif glutMainLoop (); #if USE_OPENCL cleanup_cl(&clData); #endif exit ( 0 ); }