//---------------------------------------------------------------------
// Driver for the NPB SP benchmark (OpenCL multi-device port).
// Reads the optional run configuration, warms up the code path with one
// ADI step, runs `niter` timed ADI steps, verifies and prints results.
// NOTE(review): the #if branch matching the #endif below begins before
// this chunk — presumably an alternate entry-point signature.
//---------------------------------------------------------------------
int main(int argc, char **argv)
#endif
{
  int i, niter, step;
  double mflops, t, tmax;
  logical verified;
  char class;
  // Per-timer statistics; only consumed by the (currently commented-out)
  // timing report at the bottom of this function.
  double tsum[t_last+2], t1[t_last+2], tming[t_last+2], tmaxg[t_last+2];
  char *t_recs[t_last+2] = {
      "total", "rhs", "xsolve", "ysolve", "zsolve",
      "bpack", "exch", "xcomm", "ycomm", "zcomm",
      " totcomp", " totcomm" };

  //---------------------------------------------------------------------
  // Root node reads input file (if it exists) else takes
  // defaults from parameters
  //---------------------------------------------------------------------
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL-MD) - SP Benchmark\n\n");

  // Timers are enabled by the mere existence of a "timer.flag" file.
  FILE *fp;
  fp = fopen("timer.flag", "r");
  timeron = false;
  if (fp != NULL) {
    timeron = true;
    fclose(fp);
  }

  if ((fp = fopen("inputsp.data", "r")) != NULL) {
    int result;
    printf(" Reading from input file inputsp.data\n");
    // Line 1: iteration count; the fgetc loop skips the rest of the line.
    result = fscanf(fp, "%d", &niter);
    while (fgetc(fp) != '\n');
    // Line 2: a value this port ignores (%*f suppresses assignment).
    result = fscanf(fp, "%*f");
    while (fgetc(fp) != '\n');
    // Line 3: the three grid dimensions.
    result = fscanf(fp, "%d%d%d",
                    &grid_points[0], &grid_points[1], &grid_points[2]);
    fclose(fp);
  } else {
    printf(" No input file inputsp.data. Using compiled defaults\n");
    niter = NITER_DEFAULT;
    grid_points[0] = PROBLEM_SIZE;
    grid_points[1] = PROBLEM_SIZE;
    grid_points[2] = PROBLEM_SIZE;
  }

  // Creates the OpenCL context/queues and sets num_devices (defined
  // elsewhere in this file).
  setup_opencl(argc, argv);

  printf(" Size: %4dx%4dx%4d\n",
         grid_points[0], grid_points[1], grid_points[2]);
  printf(" Iterations: %4d", niter);
  if (num_devices != MAXCELLS*MAXCELLS)
    printf(" WARNING: compiled for %5d devices \n", MAXCELLS*MAXCELLS);
  printf(" Number of active devices: %5d\n\n", num_devices);

  make_set();

  for (i = 0; i < t_last; i++) {
    timer_clear(i);
  }

  // Problem setup: constants, initial field, LHS factors, exact RHS,
  // device buffer sizing and kernel argument binding.
  set_constants();
  initialize();
  lhsinit();
  exact_rhs();
  compute_buffer_size(5);
  set_kernel_args();

  //---------------------------------------------------------------------
  // do one time step to touch all code, and reinitialize
  //---------------------------------------------------------------------
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
  // set cmd queue property (SnuCL extension) for the warm-up step only
  for(i = 0; i < num_devices; i++) {
    clSetCommandQueueProperty(cmd_queue[i],
        CL_QUEUE_AUTO_DEVICE_SELECTION |
        //CL_QUEUE_ITERATIVE |
        CL_QUEUE_COMPUTE_INTENSIVE,
        true, NULL);
  }
#endif
  adi();
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
  // Restore default queue properties after the warm-up step.
  for(i = 0; i < num_devices; i++) {
    clSetCommandQueueProperty(cmd_queue[i], 0, true, NULL);
  }
#endif
  initialize();

  //---------------------------------------------------------------------
  // Synchronize before placing time stamp
  //---------------------------------------------------------------------
  for (i = 0; i < t_last; i++) {
    timer_clear(i);
  }

  timer_clear(0);
  timer_start(0);

  // Timed main loop: one ADI sweep per time step; progress every 20 steps.
  for (step = 1; step <= niter; step++) {
    if ((step % 20) == 0 || step == 1) {
      printf(" Time step %4d\n", step);
    }
    adi();
  }

  timer_stop(0);
  t = timer_read(0);

  verify(niter, &class, &verified);

  tmax = t;

  if( tmax != 0.0 ) {
    // MFLOP/s from the SP operation-count model (cubic in PROBLEM_SIZE).
    mflops = (881.174*(double)( PROBLEM_SIZE*PROBLEM_SIZE*PROBLEM_SIZE )
             -4683.91*(double)( PROBLEM_SIZE*PROBLEM_SIZE )
             +11484.5*(double)( PROBLEM_SIZE )
             -19272.4) * (double)( niter ) / (tmax*1000000.0);
  } else {
    mflops = 0.0;
  }

  c_print_results("SP", class, grid_points[0], grid_points[1],
                  grid_points[2], niter, tmax, mflops,
                  " floating point", verified, NPBVERSION,COMPILETIME,
                  CS1, CS2, CS3, CS4, CS5, CS6, CS7,
                  clu_GetDeviceTypeName(device_type),
                  device_name, num_devices);

  if (timeron) {
    // Per-phase timing report inherited from the MPI version; disabled in
    // this OpenCL port (kept for reference).
    /*
    for (i = 0; i < t_last; i++) {
      t1[i] = timer_read(i);
    }
    t1[t_xsolve] = t1[t_xsolve] - t1[t_xcomm];
    t1[t_ysolve] = t1[t_ysolve] - t1[t_ycomm];
    t1[t_zsolve] = t1[t_zsolve] - t1[t_zcomm];
    t1[t_last+2] = t1[t_xcomm]+t1[t_ycomm]+t1[t_zcomm]+t1[t_exch];
    t1[t_last+1] = t1[t_total] - t1[t_last+2];

    MPI_Reduce(&t1, tsum, t_last+2, dp_type, MPI_SUM, 0, comm_setup);
    MPI_Reduce(&t1, tming, t_last+2, dp_type, MPI_MIN, 0, comm_setup);
    MPI_Reduce(&t1, tmaxg, t_last+2, dp_type, MPI_MAX, 0, comm_setup);

    if (node == 0) {
      printf(" nprocs =%6d minimum maximum average\n", total_nodes);
      for (i = 0; i < t_last+2; i++) {
        tsum[i] = tsum[i] / total_nodes;
        printf(" timer %2d(%8s) : %10.4f %10.4f %10.4f\n",
               i+1, t_recs[i], tming[i], tmaxg[i], tsum[i]);
      }
    }
    */
  }

  release_opencl();

  return 0;
}
/**
 * @brief Runs one full SPH simulation step on the device.
 *
 * Uploads @p in_particles, rebuilds the spatial grid, sorts particles by
 * Z-curve cell index, then runs the density/pressure, forces and
 * advection/collision kernels. The advected particles are read back into
 * @p out_particles.
 *
 * @param in_particles  Host array of parameters.particles_count particles (input state).
 * @param out_particles Host array receiving the post-step particle state.
 */
void sph_simulation::simulate_single_frame(particle* in_particles,
                                           particle* out_particles) {
  // Calculate the optimal size for workgroups: start at the device
  // maximum and halve until it evenly divides the particle count
  // (the global size must be a multiple of the local size).
  // Optimally parameters.particles_count should be divisible by
  // CL_DEVICE_MAX_WORK_GROUP_SIZE.
  // Refer to CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE.
  unsigned int size_of_groups =
      running_device->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
  while (parameters.particles_count % size_of_groups != 0) {
    size_of_groups /= 2;
  }

  // Make sure that the workgroups are small enough and that the particle
  // data will fit in local memory.
  assert(size_of_groups <=
         running_device->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
  assert(size_of_groups * sizeof(particle) <=
         running_device->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>());

  // Initial (blocking) transfer of the particle state to the GPU.
  check_cl_error(queue_.enqueueWriteBuffer(
      front_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      in_particles));

  // Recalculate the boundaries of the grid since the particles probably
  // moved since the last frame.
  cl_float min_x, max_x, min_y, max_y, min_z, max_z;
  cl_float grid_cell_side_length = (parameters.h * 2);

  // FIX: seed the running bounds with cl_float limits; the previous code
  // used std::numeric_limits<cl_int>, which silently excludes any
  // coordinate outside the int range from the min/max scan.
  min_x = min_y = min_z = std::numeric_limits<cl_float>::max();
  max_x = max_y = max_z = std::numeric_limits<cl_float>::lowest();

  for (size_t i = 0; i < parameters.particles_count; ++i) {
    cl_float3 pos = in_particles[i].position;

    if (pos.s[0] < min_x) min_x = pos.s[0];
    if (pos.s[1] < min_y) min_y = pos.s[1];
    if (pos.s[2] < min_z) min_z = pos.s[2];

    if (pos.s[0] > max_x) max_x = pos.s[0];
    if (pos.s[1] > max_y) max_y = pos.s[1];
    if (pos.s[2] > max_z) max_z = pos.s[2];
  }

  // Pad every side by two cell lengths; this simplifies the neighbor
  // lookups further down the line.
  min_x -= grid_cell_side_length * 2;
  min_y -= grid_cell_side_length * 2;
  min_z -= grid_cell_side_length * 2;

  max_x += grid_cell_side_length * 2;
  max_y += grid_cell_side_length * 2;
  max_z += grid_cell_side_length * 2;

  parameters.min_point.s[0] = min_x;
  parameters.min_point.s[1] = min_y;
  parameters.min_point.s[2] = min_z;

  parameters.max_point.s[0] = max_x;
  parameters.max_point.s[1] = max_y;
  parameters.max_point.s[2] = max_z;

  parameters.grid_size_x =
      static_cast<cl_uint>((max_x - min_x) / grid_cell_side_length);
  parameters.grid_size_y =
      static_cast<cl_uint>((max_y - min_y) / grid_cell_side_length);
  parameters.grid_size_z =
      static_cast<cl_uint>((max_z - min_z) / grid_cell_side_length);

  // The Z-curve uses interleaving of bits in a uint to calculate the
  // index. This means we have floor(32/dimension_count) bits to
  // represent each dimension.
  assert(parameters.grid_size_x < 1024);
  assert(parameters.grid_size_y < 1024);
  assert(parameters.grid_size_z < 1024);

  parameters.grid_cell_count = get_grid_index_z_curve(
      parameters.grid_size_x, parameters.grid_size_y, parameters.grid_size_z);

  // Locate each particle in the grid and build the grid count table.
  // NOTE(review): this raw new[] leaks if any cl call below throws before
  // the delete[]; consider std::vector<unsigned int> here.
  unsigned int* cell_table = new unsigned int[parameters.grid_cell_count];

  set_kernel_args(kernel_locate_in_grid_, front_buffer_, back_buffer_,
                  parameters);
  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_locate_in_grid_, cl::NullRange,
      cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups)));
  check_cl_error(queue_.enqueueReadBuffer(
      back_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      out_particles));

  // Sort on the host+device by grid index; fills cell_table with the
  // start index of every cell in the sorted array.
  sort_particles(out_particles, back_buffer_, front_buffer_, cell_table);

  cl::Buffer cell_table_buffer(
      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(unsigned int) * parameters.grid_cell_count);

  check_cl_error(queue_.enqueueWriteBuffer(
      cell_table_buffer, CL_TRUE, 0,
      sizeof(unsigned int) * parameters.grid_cell_count, cell_table));
  check_cl_error(queue_.enqueueWriteBuffer(
      front_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      out_particles));

  // Compute the density and the pressure term at every particle.
  check_cl_error(kernel_density_pressure_.setArg(0, front_buffer_));
  check_cl_error(kernel_density_pressure_.setArg(
      1, size_of_groups * sizeof(particle),
      nullptr));  // Declare local memory in arguments
  check_cl_error(kernel_density_pressure_.setArg(2, back_buffer_));
  check_cl_error(kernel_density_pressure_.setArg(3, parameters));
  check_cl_error(kernel_density_pressure_.setArg(4, precomputed_terms));
  check_cl_error(kernel_density_pressure_.setArg(5, cell_table_buffer));

  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_density_pressure_, cl::NullRange,
      cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups)));

  // Upload the collision geometry of the current scene.
  cl::Buffer face_normals_buffer(
      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(float) * current_scene.face_normals.size());
  cl::Buffer vertices_buffer(context_,
                             CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
                             sizeof(float) * current_scene.vertices.size());
  cl::Buffer indices_buffer(
      context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(unsigned int) * current_scene.indices.size());

  check_cl_error(queue_.enqueueWriteBuffer(
      face_normals_buffer, CL_TRUE, 0,
      sizeof(float) * current_scene.face_normals.size(),
      current_scene.face_normals.data()));
  check_cl_error(
      queue_.enqueueWriteBuffer(vertices_buffer, CL_TRUE, 0,
                                sizeof(float) * current_scene.vertices.size(),
                                current_scene.vertices.data()));
  check_cl_error(queue_.enqueueWriteBuffer(
      indices_buffer, CL_TRUE, 0,
      sizeof(unsigned int) * current_scene.indices.size(),
      current_scene.indices.data()));

  // Compute the density-forces at every particle.
  set_kernel_args(kernel_forces_, back_buffer_, front_buffer_, parameters,
                  precomputed_terms, cell_table_buffer);
  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_forces_, cl::NullRange, cl::NDRange(parameters.particles_count),
      cl::NDRange(size_of_groups)));

  // Advect particles and resolve collisions with scene geometry.
  set_kernel_args(kernel_advection_collision_, front_buffer_, back_buffer_,
                  parameters, precomputed_terms, cell_table_buffer,
                  face_normals_buffer, vertices_buffer, indices_buffer,
                  current_scene.face_count);
  check_cl_error(queue_.enqueueNDRangeKernel(
      kernel_advection_collision_, cl::NullRange,
      cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups)));

  // Blocking read-back of the final particle state for this frame.
  check_cl_error(queue_.enqueueReadBuffer(
      back_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count,
      out_particles));

  delete[] cell_table;
}
// Binds the kernel arguments for a constant_vector terminal by forwarding
// to the argument-setting overload for the wrapped vector rv.v.
// NOTE(review): part of a kernel-generator dispatch whose enclosing
// template/class begins outside this chunk; parameters follow that
// framework's convention (target kernel, device index, argument offset,
// shared generator state).
static void set(const constant_vector<T> &rv, backend::kernel &kernel,
                unsigned device, size_t index_offset,
                detail::kernel_generator_state_ptr state) {
  set_kernel_args(rv.v, kernel, device, index_offset, state);
}
/** * @brief Sorts the particles according to their grid index using Radix Sort * * @param[in,out] particles The array that contains the actual particle *data * @param[in] first_buffer The first OpenCL buffer used * @param[in] second_buffer The second OpenCL buffer used * @param[out] cell_table The array that contains the start indexes of *the cell in the sorted array * */ void sph_simulation::sort_particles(particle* particles, cl::Buffer& first_buffer, cl::Buffer& second_buffer, unsigned int* cell_table) { cl::Buffer* current_input_buffer = &first_buffer; cl::Buffer* current_output_buffer = &second_buffer; for (int pass_number = 0; pass_number < 4; ++pass_number) { unsigned int zero = 0; check_cl_error(queue_.enqueueFillBuffer( sort_count_buffer_, zero, 0, kSortThreadCount * kBucketCount * sizeof(int))); set_kernel_args(kernel_sort_count_, *current_input_buffer, sort_count_buffer_, parameters, kSortThreadCount, pass_number, kRadixWidth); check_cl_error(queue_.enqueueNDRangeKernel( kernel_sort_count_, cl::NullRange, cl::NDRange(kSortThreadCount), cl::NullRange)); check_cl_error(queue_.enqueueReadBuffer( sort_count_buffer_, CL_TRUE, 0, sizeof(unsigned int) * kSortThreadCount * kBucketCount, sort_count_array_.data())); unsigned int running_count = 0; for (int i = 0; i < kSortThreadCount * kBucketCount; ++i) { unsigned int tmp = sort_count_array_[i]; sort_count_array_[i] = running_count; running_count += tmp; } check_cl_error(queue_.enqueueWriteBuffer( sort_count_buffer_, CL_TRUE, 0, sizeof(unsigned int) * kSortThreadCount * kBucketCount, sort_count_array_.data())); set_kernel_args(kernel_sort_, *current_input_buffer, *current_output_buffer, sort_count_buffer_, parameters, kSortThreadCount, pass_number, kRadixWidth); check_cl_error(queue_.enqueueNDRangeKernel(kernel_sort_, cl::NullRange, cl::NDRange(kSortThreadCount), cl::NullRange)); cl::Buffer* tmp = current_input_buffer; current_input_buffer = current_output_buffer; current_output_buffer = tmp; } 
check_cl_error(queue_.enqueueReadBuffer( *current_input_buffer, CL_TRUE, 0, sizeof(particle) * parameters.particles_count, particles)); // Build the cell table by computing the cumulative sum at every cell. unsigned int current_index = 0; for (unsigned int i = 0; i < parameters.grid_cell_count; ++i) { cell_table[i] = current_index; while (current_index != parameters.particles_count && particles[current_index].grid_index == i) { current_index++; } } }