//------------------------------------------------------------------------------ void release_clenv(CLEnv& e) { check_cl_error(clReleaseCommandQueue(e.commandQueue), "clReleaseCommandQueue"); check_cl_error(clReleaseKernel(e.kernel), "clReleaseKernel"); check_cl_error(clReleaseProgram(e.program), "clReleaseProgram"); check_cl_error(clReleaseContext(e.context), "clReleaseContext"); }
void sph_simulation::simulate(int frame_count) { if (frame_count == 0) { frame_count = (int)ceil(parameters.simulation_time * parameters.target_fps); } cl_int cl_error; std::vector<cl::Device> device_array; check_cl_error(init_cl_single_device(&context_, device_array, "", "", true)); queue_ = cl::CommandQueue(context_, device_array[0], 0, &cl_error); check_cl_error(cl_error); running_device = &device_array[0]; std::string source = readKernelFile(BUFFER_KERNEL_FILE_NAME); cl::Program program; check_cl_error(make_program(&program, context_, device_array, source, true, "-I ./kernels/ -I ./common/")); kernel_density_pressure_ = make_kernel(program, "density_pressure"); kernel_advection_collision_ = make_kernel(program, "advection_collision"); kernel_forces_ = make_kernel(program, "forces"); kernel_locate_in_grid_ = make_kernel(program, "locate_in_grid"); kernel_sort_count_ = make_kernel(program, "sort_count"); kernel_sort_ = make_kernel(program, "sort"); front_buffer_ = cl::Buffer(context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(particle) * parameters.particles_count); back_buffer_ = cl::Buffer(context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(particle) * parameters.particles_count); sort_count_buffer_ = cl::Buffer(context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned int) * kSortThreadCount * kBucketCount); particle* particles = new particle[parameters.particles_count]; init_particles(particles, parameters); for (int i = 0; i < frame_count; ++i) { if (pre_frame) { pre_frame(particles, parameters, true); } for (int j = 0; (float)j < (1.f / parameters.simulation_scale); ++j) { if (pre_frame) pre_frame(particles, parameters, false); simulate_single_frame(particles, particles); if (post_frame) post_frame(particles, parameters, false); } if (post_frame) { post_frame(particles, parameters, true); } } delete[] particles; }
cl::Kernel make_kernel(cl::Program& p, const char* name) { cl_int cl_error; cl::Kernel k = cl::Kernel(p, name, &cl_error); check_cl_error(cl_error); return k; }
//------------------------------------------------------------------------------ double get_cl_time(cl_event ev) { cl_ulong startTime = cl_ulong(0); cl_ulong endTime = cl_ulong(0); size_t retBytes = size_t(0); cl_int status = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &startTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); status = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); //event timing is reported in nanoseconds: divide by 1e6 to get //time in milliseconds return double((endTime - startTime) / 1E6); }
//------------------------------------------------------------------------------ cl_device_id get_device_id(cl_context ctx) { cl_device_id deviceID; // retrieve actual device id from context cl_int status = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &deviceID, 0); check_cl_error(status, "clGetContextInfo"); return deviceID; }
//------------------------------------------------------------------------------ double timeEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list) { cl_int status = clFinish(command_queue); check_cl_error(status, "clFinish"); cl_event profilingEvent; status = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, &profilingEvent); check_cl_error(status, "clEnqueueNDRangeKernel"); status = clFinish(command_queue); //ensure kernel execution is //terminated; used for timing purposes only; there is no need to enforce //termination when issuing a subsequent blocking data transfer operation check_cl_error(status, "clFinish"); status = clWaitForEvents(1, &profilingEvent); check_cl_error(status, "clWaitForEvents"); cl_ulong kernelStartTime = cl_ulong(0); cl_ulong kernelEndTime = cl_ulong(0); size_t retBytes = size_t(0); double kernelElapsedTime_ms = double(0); status = clGetEventProfilingInfo(profilingEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &kernelStartTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); status = clGetEventProfilingInfo(profilingEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); //event timing is reported in nano seconds: divide by 1e6 to get //time in milliseconds kernelElapsedTime_ms = (double)(kernelEndTime - kernelStartTime) / 1E6; return kernelElapsedTime_ms; }
//------------------------------------------------------------------------------ int main(int argc, char** argv) { if(argc < 9) { std::cerr << "usage: " << argv[0] << " <platform name> <device type = default | cpu | gpu " "| acc | all> <device num> <OpenCL source file path>" " <kernel name> <size> <local size> <vec element width>" << std::endl; exit(EXIT_FAILURE); } const int SIZE = atoi(argv[argc - 3]); // number of elements const int CL_ELEMENT_SIZE = atoi(argv[argc - 1]); // number of per-element // components const int CPU_BLOCK_SIZE = 16384; //use block dot product if SIZE divisible //by this value const size_t BYTE_SIZE = SIZE * sizeof(real_t); const int BLOCK_SIZE = atoi(argv[argc - 2]); //local cache for reduction //equal to local workgroup size const int REDUCED_SIZE = SIZE / BLOCK_SIZE; const int REDUCED_BYTE_SIZE = REDUCED_SIZE * sizeof(real_t); //setup text header that will be prefixed to opencl code std::ostringstream clheaderStream; clheaderStream << "#define BLOCK_SIZE " << BLOCK_SIZE << '\n'; clheaderStream << "#define VEC_WIDTH " << CL_ELEMENT_SIZE << '\n'; #ifdef USE_DOUBLE clheaderStream << "#define DOUBLE\n"; const double EPS = 0.000000001; #else const float EPS = 0.00001; #endif const bool PROFILE_ENABLE_OPTION = true; CLEnv clenv = create_clenv(argv[1], argv[2], atoi(argv[3]), PROFILE_ENABLE_OPTION, argv[4], argv[5], clheaderStream.str()); cl_int status; //create input and output matrices std::vector<real_t> V1 = create_vector(SIZE); std::vector<real_t> V2 = create_vector(SIZE); real_t hostDot = std::numeric_limits< real_t >::quiet_NaN(); real_t deviceDot = std::numeric_limits< real_t >::quiet_NaN(); //ALLOCATE DATA AND COPY TO DEVICE //allocate output buffer on OpenCL device //the partialReduction array contains a sequence of dot products //computed on sub-arrays of size BLOCK_SIZE cl_mem partialReduction = clCreateBuffer(clenv.context, CL_MEM_WRITE_ONLY, REDUCED_BYTE_SIZE, 0, &status); check_cl_error(status, "clCreateBuffer"); //allocate input buffers on OpenCL devices and copy data cl_mem devV1 = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BYTE_SIZE, &V1[0], //<-- copy data from V1 &status); check_cl_error(status, "clCreateBuffer"); cl_mem devV2 = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BYTE_SIZE, &V2[0], //<-- copy data from V2 &status); check_cl_error(status, "clCreateBuffer"); //set kernel parameters status = clSetKernelArg(clenv.kernel, //kernel 0, //parameter id sizeof(cl_mem), //size of parameter &devV1); //pointer to parameter check_cl_error(status, "clSetKernelArg(V1)"); status = clSetKernelArg(clenv.kernel, //kernel 1, //parameter id sizeof(cl_mem), //size of parameter &devV2); //pointer to parameter check_cl_error(status, "clSetKernelArg(V2)"); status = clSetKernelArg(clenv.kernel, //kernel 2, //parameter id sizeof(cl_mem), //size of parameter &partialReduction); //pointer to parameter check_cl_error(status, "clSetKernelArg(devOut)"); //setup kernel launch configuration //total number of threads == number of array elements const size_t globalWorkSize[1] = {SIZE / CL_ELEMENT_SIZE}; //number of per-workgroup local threads const size_t localWorkSize[1] = {BLOCK_SIZE}; //LAUNCH KERNEL // make sure all work on the OpenCL device is finished status = clFinish(clenv.commandQueue); check_cl_error(status, "clFinish"); cl_event profilingEvent; timespec kernelStart = {0, 0}; timespec kernelEnd = {0, 0}; clock_gettime(CLOCK_MONOTONIC, &kernelStart); //launch kernel status = clEnqueueNDRangeKernel(clenv.commandQueue, //queue clenv.kernel, //kernel 1, //number of dimensions for work-items 0, //global work offset globalWorkSize, //total number of threads localWorkSize, //threads per workgroup 0, //number of events that need to //complete before kernel executed 0, //list of events that need to complete //before kernel executed &profilingEvent); //event object associated // with this particular // kernel execution // instance check_cl_error(status, "clEnqueueNDRangeKernel"); status = clFinish(clenv.commandQueue); //ensure kernel execution is //terminated; used for timing purposes only; there is no need to enforce //termination when issuing a subsequent blocking data transfer operation check_cl_error(status, "clFinish"); status = clWaitForEvents(1, &profilingEvent); clock_gettime(CLOCK_MONOTONIC, &kernelEnd); check_cl_error(status, "clWaitForEvents"); //get_cl_time(profilingEvent); //gives similar results to the following const double kernelElapsedTime_ms = time_diff_ms(kernelStart, kernelEnd); //READ DATA FROM DEVICE //read back and print results std::vector< real_t > partialDot(REDUCED_SIZE); status = clEnqueueReadBuffer(clenv.commandQueue, partialReduction, CL_TRUE, //blocking read 0, //offset REDUCED_BYTE_SIZE, //byte size of data &partialDot[0], //destination buffer in host //memory 0, //number of events that need to //complete before transfer executed 0, //list of events that need to complete //before transfer executed &profilingEvent); //event identifying this //specific operation check_cl_error(status, "clEnqueueReadBuffer"); const double dataTransferTime_ms = get_cl_time(profilingEvent); timespec accStart = {0, 0}; timespec accEnd = {0, 0}; //FINAL REDUCTION ON HOST clock_gettime(CLOCK_MONOTONIC, &accStart); deviceDot = std::accumulate(partialDot.begin(), partialDot.end(), real_t(0)); clock_gettime(CLOCK_MONOTONIC, &accEnd); const double accTime_ms = time_diff_ms(accStart, accEnd); //COMPUTE DOT PRODUCT ON HOST timespec hostStart = {0, 0}; timespec hostEnd = {0, 0}; clock_gettime(CLOCK_MONOTONIC, &hostStart); if(true || SIZE % CPU_BLOCK_SIZE != 0) hostDot = host_dot_product(V1, V2); else hostDot = host_dot_block(&V1[0], &V2[0], SIZE, CPU_BLOCK_SIZE); clock_gettime(CLOCK_MONOTONIC, &hostEnd); const double host_time = time_diff_ms(hostStart, hostEnd); //PRINT RESULTS std::cout << deviceDot << ' ' << hostDot << std::endl; if(check_result(hostDot, deviceDot, EPS)) { std::cout << "PASSED" << std::endl; std::cout << "kernel: " << kernelElapsedTime_ms << "ms\n" << "host reduction: " << accTime_ms << "ms\n" << "total: " << (kernelElapsedTime_ms + accTime_ms) << "ms" << std::endl; std::cout << "transfer: " << dataTransferTime_ms << "ms\n" << std::endl; if(true || SIZE % CPU_BLOCK_SIZE != 0) { std::cout << "host: " << host_time << "ms" << std::endl; } else { std::cout << "host (16k blocks): " << host_time << "ms" << std::endl; } } else { std::cout << "FAILED" << std::endl; } check_cl_error(clReleaseMemObject(devV1), "clReleaseMemObject"); check_cl_error(clReleaseMemObject(devV2), "clReleaseMemObject"); check_cl_error(clReleaseMemObject(partialReduction), "clReleaseMemObject"); release_clenv(clenv); return 0; }
void set_kernel_args_internal(int index, cl::Kernel& kernel, T1 a, TArgs... args) { check_cl_error(kernel.setArg(index, a)); set_kernel_args_internal(index + 1, kernel, args...); }
void sph_simulation::simulate_single_frame(particle* in_particles, particle* out_particles) { // Calculate the optimal size for workgroups // Start groups size at their maximum, make them smaller if necessary // Optimally parameters.particles_count should be devisible by // CL_DEVICE_MAX_WORK_GROUP_SIZE // Refer to CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE unsigned int size_of_groups = running_device->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); while (parameters.particles_count % size_of_groups != 0) { size_of_groups /= 2; } // Make sure that the workgroups are small enough and that the particle data // will fit in local memory assert(size_of_groups <= running_device->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); assert(size_of_groups * sizeof(particle) <= running_device->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()); // Initial transfer to the GPU check_cl_error(queue_.enqueueWriteBuffer( front_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count, in_particles)); // Recalculate the boundaries of the grid since the particles probably moved // since the last frame. cl_float min_x, max_x, min_y, max_y, min_z, max_z; cl_float grid_cell_side_length = (parameters.h * 2); min_x = min_y = min_z = std::numeric_limits<cl_int>::max(); max_x = max_y = max_z = std::numeric_limits<cl_int>::min(); for (size_t i = 0; i < parameters.particles_count; ++i) { cl_float3 pos = in_particles[i].position; if (pos.s[0] < min_x) min_x = pos.s[0]; if (pos.s[1] < min_y) min_y = pos.s[1]; if (pos.s[2] < min_z) min_z = pos.s[2]; if (pos.s[0] > max_x) max_x = pos.s[0]; if (pos.s[1] > max_y) max_y = pos.s[1]; if (pos.s[2] > max_z) max_z = pos.s[2]; } // Add or subtracts a cell length to all sides to create a padding layer // This simplifies calculations further down the line min_x -= grid_cell_side_length * 2; min_y -= grid_cell_side_length * 2; min_z -= grid_cell_side_length * 2; max_x += grid_cell_side_length * 2; max_y += grid_cell_side_length * 2; max_z += grid_cell_side_length * 2; parameters.min_point.s[0] = min_x; parameters.min_point.s[1] = min_y; parameters.min_point.s[2] = min_z; parameters.max_point.s[0] = max_x; parameters.max_point.s[1] = max_y; parameters.max_point.s[2] = max_z; parameters.grid_size_x = static_cast<cl_uint>((max_x - min_x) / grid_cell_side_length); parameters.grid_size_y = static_cast<cl_uint>((max_y - min_y) / grid_cell_side_length); parameters.grid_size_z = static_cast<cl_uint>((max_z - min_z) / grid_cell_side_length); // The Z-curve uses interleaving of bits in a uint to caculate the index. // This means we have floor(32/dimension_count) bits to represent each // dimension. assert(parameters.grid_size_x < 1024); assert(parameters.grid_size_y < 1024); assert(parameters.grid_size_z < 1024); parameters.grid_cell_count = get_grid_index_z_curve( parameters.grid_size_x, parameters.grid_size_y, parameters.grid_size_z); // Locate each particle in the grid and build the grid count table unsigned int* cell_table = new unsigned int[parameters.grid_cell_count]; set_kernel_args(kernel_locate_in_grid_, front_buffer_, back_buffer_, parameters); check_cl_error(queue_.enqueueNDRangeKernel( kernel_locate_in_grid_, cl::NullRange, cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups))); check_cl_error(queue_.enqueueReadBuffer( back_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count, out_particles)); sort_particles(out_particles, back_buffer_, front_buffer_, cell_table); cl::Buffer cell_table_buffer( context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned int) * parameters.grid_cell_count); check_cl_error(queue_.enqueueWriteBuffer( cell_table_buffer, CL_TRUE, 0, sizeof(unsigned int) * parameters.grid_cell_count, cell_table)); check_cl_error(queue_.enqueueWriteBuffer( front_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count, out_particles)); // Compute the density and the pressure term at every particle. check_cl_error(kernel_density_pressure_.setArg(0, front_buffer_)); check_cl_error(kernel_density_pressure_.setArg( 1, size_of_groups * sizeof(particle), nullptr)); // Declare local memory in arguments check_cl_error(kernel_density_pressure_.setArg(2, back_buffer_)); check_cl_error(kernel_density_pressure_.setArg(3, parameters)); check_cl_error(kernel_density_pressure_.setArg(4, precomputed_terms)); check_cl_error(kernel_density_pressure_.setArg(5, cell_table_buffer)); check_cl_error(queue_.enqueueNDRangeKernel( kernel_density_pressure_, cl::NullRange, cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups))); cl::Buffer face_normals_buffer( context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * current_scene.face_normals.size()); cl::Buffer vertices_buffer(context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * current_scene.vertices.size()); cl::Buffer indices_buffer( context_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned int) * current_scene.indices.size()); check_cl_error(queue_.enqueueWriteBuffer( face_normals_buffer, CL_TRUE, 0, sizeof(float) * current_scene.face_normals.size(), current_scene.face_normals.data())); check_cl_error( queue_.enqueueWriteBuffer(vertices_buffer, CL_TRUE, 0, sizeof(float) * current_scene.vertices.size(), current_scene.vertices.data())); check_cl_error(queue_.enqueueWriteBuffer( indices_buffer, CL_TRUE, 0, sizeof(unsigned int) * current_scene.indices.size(), current_scene.indices.data())); // Compute the density-forces at every particle. set_kernel_args(kernel_forces_, back_buffer_, front_buffer_, parameters, precomputed_terms, cell_table_buffer); check_cl_error(queue_.enqueueNDRangeKernel( kernel_forces_, cl::NullRange, cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups))); // Advect particles and resolve collisions with scene geometry. set_kernel_args(kernel_advection_collision_, front_buffer_, back_buffer_, parameters, precomputed_terms, cell_table_buffer, face_normals_buffer, vertices_buffer, indices_buffer, current_scene.face_count); check_cl_error(queue_.enqueueNDRangeKernel( kernel_advection_collision_, cl::NullRange, cl::NDRange(parameters.particles_count), cl::NDRange(size_of_groups))); check_cl_error(queue_.enqueueReadBuffer( back_buffer_, CL_TRUE, 0, sizeof(particle) * parameters.particles_count, out_particles)); delete[] cell_table; }
/** * @brief Sorts the particles according to their grid index using Radix Sort * * @param[in,out] particles The array that contains the actual particle *data * @param[in] first_buffer The first OpenCL buffer used * @param[in] second_buffer The second OpenCL buffer used * @param[out] cell_table The array that contains the start indexes of *the cell in the sorted array * */ void sph_simulation::sort_particles(particle* particles, cl::Buffer& first_buffer, cl::Buffer& second_buffer, unsigned int* cell_table) { cl::Buffer* current_input_buffer = &first_buffer; cl::Buffer* current_output_buffer = &second_buffer; for (int pass_number = 0; pass_number < 4; ++pass_number) { unsigned int zero = 0; check_cl_error(queue_.enqueueFillBuffer( sort_count_buffer_, zero, 0, kSortThreadCount * kBucketCount * sizeof(int))); set_kernel_args(kernel_sort_count_, *current_input_buffer, sort_count_buffer_, parameters, kSortThreadCount, pass_number, kRadixWidth); check_cl_error(queue_.enqueueNDRangeKernel( kernel_sort_count_, cl::NullRange, cl::NDRange(kSortThreadCount), cl::NullRange)); check_cl_error(queue_.enqueueReadBuffer( sort_count_buffer_, CL_TRUE, 0, sizeof(unsigned int) * kSortThreadCount * kBucketCount, sort_count_array_.data())); unsigned int running_count = 0; for (int i = 0; i < kSortThreadCount * kBucketCount; ++i) { unsigned int tmp = sort_count_array_[i]; sort_count_array_[i] = running_count; running_count += tmp; } check_cl_error(queue_.enqueueWriteBuffer( sort_count_buffer_, CL_TRUE, 0, sizeof(unsigned int) * kSortThreadCount * kBucketCount, sort_count_array_.data())); set_kernel_args(kernel_sort_, *current_input_buffer, *current_output_buffer, sort_count_buffer_, parameters, kSortThreadCount, pass_number, kRadixWidth); check_cl_error(queue_.enqueueNDRangeKernel(kernel_sort_, cl::NullRange, cl::NDRange(kSortThreadCount), cl::NullRange)); cl::Buffer* tmp = current_input_buffer; current_input_buffer = current_output_buffer; current_output_buffer = tmp; } check_cl_error(queue_.enqueueReadBuffer( *current_input_buffer, CL_TRUE, 0, sizeof(particle) * parameters.particles_count, particles)); // Build the cell table by computing the cumulative sum at every cell. unsigned int current_index = 0; for (unsigned int i = 0; i < parameters.grid_cell_count; ++i) { cell_table[i] = current_index; while (current_index != parameters.particles_count && particles[current_index].grid_index == i) { current_index++; } } }
//------------------------------------------------------------------------------ int main(int argc, char** argv) { if(argc < 6) { std::cerr << "usage: " << argv[0] << " <platform name> <device type = default | cpu | gpu " "| acc | all> <device num> <OpenCL source file path>" " <kernel name>" << std::endl; exit(EXIT_FAILURE); } std::string platformName = argv[ 1 ]; std::string deviceType = argv[2]; int deviceNum = atoi(argv[3]); log_thread_count("\n\nstart"); #ifdef _OPENMP { // const int CHUNKSIZE = 100; // const int N = 1000; // int i, chunk; // float a[N], b[N], c[N]; // for (i=0; i < N; i++) // a[i] = b[i] = i * 1.0; // chunk = CHUNKSIZE; // printf("%d", omp_get_max_threads()); #pragma omp parallel //shared(a,b,c,chunk) private(i) { if(omp_get_thread_num() == 0) { log_thread_count("OMP PARALLEL"); } // printf("%d", omp_get_max_threads()); // #pragma omp for schedule(dynamic,chunk) nowait // for (i=0; i < N; i++) { // c[i] = a[i] + b[i]; // } } #pragma omp barrier } #endif //1)create context cl_context ctx = create_cl_context(platformName, deviceType, deviceNum); std::cout << "OpenCL context created" << std::endl; log_thread_count("CL context created"); //2)load kernel source const std::string programSource = load_text(argv[4]); std::cout << "OpenCL source code loaded" << std::endl; const char* src = programSource.c_str(); const size_t sourceLength = programSource.length(); //3)build program and create kernel cl_int status; cl_program program = clCreateProgramWithSource(ctx, //context 1, //number of strings &src, //source &sourceLength, // size &status); // status check_cl_error(status, "clCreateProgramWithSource"); cl_device_id deviceID; //only a single device was selected // retrieve actual device id from context status = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &deviceID, 0); check_cl_error(status, "clGetContextInfo"); cl_int buildStatus = clBuildProgram(program, //program 1, //number of devices &deviceID, //array of device ids 0, //program options as passed on //the command line to regualar //compilers e.g. -DUSE_DOUBLE 0, 0); //log output if any char buffer[0x10000] = ""; size_t len = 0; status = clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); check_cl_error(status, "clBuildProgramInfo"); if(len > 1) std::cout << "Build output: " << buffer << std::endl; check_cl_error(buildStatus, "clBuildProgram"); std::cout << "Built OpenCL program" << std::endl; const char* kernelName = argv[5]; cl_kernel kernel = clCreateKernel(program, kernelName, &status); check_cl_error(status, "clCreateKernel"); //4)allocate output buffer on OpenCL device typedef float real_t; const size_t ARRAY_LENGTH = 16; const size_t ARRAY_BYTE_LENGTH = ARRAY_LENGTH * sizeof(real_t); cl_mem outputCLBuffer = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, ARRAY_BYTE_LENGTH, 0, &status); check_cl_error(status, "clCreateBuffer"); //5)create command queue cl_command_queue commands = clCreateCommandQueue(ctx, deviceID, 0, &status); check_cl_error(status, "clCreateCommandQueue"); //6)set kernel parameters const real_t value = real_t(3.14); //first parameter: output array status = clSetKernelArg(kernel, //kernel 0, //parameter id sizeof(cl_mem), //size of parameter &outputCLBuffer); //pointer to parameter check_cl_error(status, "clSetKernelArg(0)"); //second parameter: value to assign to each array element status = clSetKernelArg(kernel, //kernel 1, //parameter id sizeof(real_t), //size of parameter &value); //pointer to parameter check_cl_error(status, "clSetKernelArg(1)"); //7)setup kernel launch configuration //total number of threads == number of array elements const size_t globalWorkSize[1] = {ARRAY_LENGTH}; //number of per-workgroup local threads const size_t localWorkSize[1] = {1}; //8)launch kernel status = clEnqueueNDRangeKernel(commands, //queue kernel, //kernel 1, //number of dimensions for work-items 0, //global work offset globalWorkSize, //total number of threads localWorkSize, //threads per workgroup 0, //number of events that need to //complete before kernel executed 0, //list of events that need to complete //before kernel executed 0); //event object identifying this //particular kernel execution instance check_cl_error(status, "clEnqueueNDRangeKernel"); log_thread_count("kernel launched"); std::cout << "Lunched OpenCL kernel - setting all array elements to " << value << std::endl; //9)read back and print results std::vector< real_t > hostArray(ARRAY_LENGTH, real_t(0)); status = clEnqueueReadBuffer(commands, outputCLBuffer, CL_TRUE, //blocking read 0, //offset ARRAY_BYTE_LENGTH, //byte size of data &hostArray[0], //destination buffer in host // memory 0, //number of events that need to //complete before transfer executed 0, //list of events that need to complete //before transfer executed 0); //event identifying this specific operation check_cl_error(status, "clEnqueueReadBuffer"); log_thread_count("device -> host transfer"); std::cout << "Output array: " << std::endl; std::ostream_iterator<real_t> out_it(std::cout, " "); std::copy(hostArray.begin(), hostArray.end(), out_it); std::cout << std::endl; //10)release resources check_cl_error(clReleaseMemObject(outputCLBuffer), "clReleaseMemObject"); check_cl_error(clReleaseCommandQueue(commands),"clReleaseCommandQueue"); check_cl_error(clReleaseKernel(kernel), "clReleaseKernel"); check_cl_error(clReleaseProgram(program), "clReleaseProgram"); check_cl_error(clReleaseContext(ctx), "clReleaseContext"); std::cout << "Released OpenCL resources" << std::endl; log_thread_count("released resources"); print_thread_count(threadlog.begin(), threadlog.end()); std::cout << std::endl; return 0; }
//------------------------------------------------------------------------------ // returns context associated with single device only, // to make it support multiple devices, a list of // <device type, device num> pairs is required cl_context create_cl_context(const std::string& platformName, const std::string& deviceTypeName, int deviceNum) { cl_int status = 0; //1) get platfors and search for platform matching platformName cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, 0, &numPlatforms); check_cl_error(status, "clGetPlatformIDs"); if(numPlatforms < 1) { std::cout << "No OpenCL platforms found" << std::endl; exit(EXIT_SUCCESS); } typedef std::vector< cl_platform_id > PlatformIDs; PlatformIDs platformIDs(numPlatforms); status = clGetPlatformIDs(numPlatforms, &platformIDs[0], 0); check_cl_error(status, "clGetPlatformIDs"); std::vector< char > buf(0x10000, char(0)); cl_platform_id platformID; PlatformIDs::const_iterator pi = platformIDs.begin(); for(; pi != platformIDs.end(); ++pi) { status = clGetPlatformInfo(*pi, CL_PLATFORM_NAME, buf.size(), &buf[0], 0); check_cl_error(status, "clGetPlatformInfo"); if(platformName == &buf[0]) { platformID = *pi; break; } } if(pi == platformIDs.end()) { std::cerr << "ERROR - Couldn't find platform " << platformName << std::endl; exit(EXIT_FAILURE); } //2) get devices of deviceTypeName type and store their ids into // an array then select device id at position deviceNum cl_device_type deviceType; if(deviceTypeName == "default") deviceType = CL_DEVICE_TYPE_DEFAULT; else if(deviceTypeName == "cpu") deviceType = CL_DEVICE_TYPE_CPU; else if(deviceTypeName == "gpu") deviceType = CL_DEVICE_TYPE_GPU; else if(deviceTypeName == "acc") deviceType = CL_DEVICE_TYPE_ACCELERATOR; else if(deviceTypeName == "all") deviceType = CL_DEVICE_TYPE_CPU; else { std::cerr << "ERROR - device type " << deviceTypeName << " unknown" << std::endl; exit(EXIT_FAILURE); } cl_uint numDevices = 0; status = clGetDeviceIDs(platformID, deviceType, 0, 0, &numDevices); check_cl_error(status, "clGetDeviceIDs"); if(numDevices < 1) { std::cerr << "ERROR - Cannot find device of type " << deviceTypeName << std::endl; exit(EXIT_FAILURE); } typedef std::vector< cl_device_id > DeviceIDs; DeviceIDs deviceIDs(numDevices); status = clGetDeviceIDs(platformID, deviceType, numDevices, &deviceIDs[0], 0); check_cl_error(status, "clGetDeviceIDs"); if(deviceNum < 0 || deviceNum >= numDevices) { std::cerr << "ERROR - device number out of range: [0," << (numDevices - 1) << ']' << std::endl; exit(EXIT_FAILURE); } cl_device_id deviceID = deviceIDs[deviceNum]; //3) create and return context cl_context_properties ctxProps[] = { CL_CONTEXT_PLATFORM, cl_context_properties(platformID), 0 }; //only a single device supported cl_context ctx = clCreateContext(ctxProps, 1, &deviceID, &context_callback, 0, &status); check_cl_error(status, "clCreateContext"); return ctx; }
//------------------------------------------------------------------------------ int main(int argc, char** argv) { if(argc < 6) { std::cerr << "usage: " << argv[0] << " <platform name> <device type = default | cpu | gpu " "| acc | all> <device num> <OpenCL source file path>" " <kernel name>" << std::endl; exit(EXIT_FAILURE); } const int SIZE = 256; const size_t BYTE_SIZE = SIZE * sizeof(real_t); const int BLOCK_SIZE = 16; const int REDUCED_SIZE = SIZE / BLOCK_SIZE; const int REDUCED_BYTE_SIZE = REDUCED_SIZE * sizeof(real_t); //setup text header that will be prefixed to opencl code std::ostringstream clheaderStream; clheaderStream << "#define BLOCK_SIZE " << BLOCK_SIZE << '\n'; #ifdef USE_DOUBLE clheaderStream << "#define DOUBLE\n"; const double EPS = 0.000000001; #else const double EPS = 0.00001; #endif CLEnv clenv = create_clenv(argv[1], argv[2], atoi(argv[3]), false, argv[4], argv[5], clheaderStream.str()); cl_int status; //create input and output matrices std::vector<real_t> V1 = create_vector(SIZE); std::vector<real_t> V2 = create_vector(SIZE); real_t hostDot = std::numeric_limits< real_t >::quiet_NaN(); real_t deviceDot = std::numeric_limits< real_t >::quiet_NaN(); //allocate output buffer on OpenCL device //the partialReduction array contains a sequence of dot products //computed on sub-arrays of size BLOCK_SIZE cl_mem partialReduction = clCreateBuffer(clenv.context, CL_MEM_WRITE_ONLY, REDUCED_BYTE_SIZE, 0, &status); check_cl_error(status, "clCreateBuffer"); //allocate input buffers on OpenCL devices and copy data cl_mem devV1 = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BYTE_SIZE, &V1[0], //<-- copy data from V1 &status); check_cl_error(status, "clCreateBuffer"); cl_mem devV2 = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BYTE_SIZE, &V2[0], //<-- copy data from V2 &status); check_cl_error(status, "clCreateBuffer"); //set kernel parameters status = clSetKernelArg(clenv.kernel, //kernel 0, //parameter id sizeof(cl_mem), //size of parameter &devV1); //pointer to parameter check_cl_error(status, "clSetKernelArg(V1)"); status = clSetKernelArg(clenv.kernel, //kernel 1, //parameter id sizeof(cl_mem), //size of parameter &devV2); //pointer to parameter check_cl_error(status, "clSetKernelArg(V2)"); status = clSetKernelArg(clenv.kernel, //kernel 2, //parameter id sizeof(cl_mem), //size of parameter &partialReduction); //pointer to parameter check_cl_error(status, "clSetKernelArg(devOut)"); //setup kernel launch configuration //total number of threads == number of array elements const size_t globalWorkSize[1] = {SIZE}; //number of per-workgroup local threads const size_t localWorkSize[1] = {BLOCK_SIZE}; //launch kernel status = clEnqueueNDRangeKernel(clenv.commandQueue, //queue clenv.kernel, //kernel 1, //number of dimensions for work-items 0, //global work offset globalWorkSize, //total number of threads localWorkSize, //threads per workgroup 0, //number of events that need to //complete before kernel executed 0, //list of events that need to complete //before kernel executed 0); //event object identifying this //particular kernel execution instance check_cl_error(status, "clEnqueueNDRangeKernel"); //read back and print results std::vector< real_t > partialDot(REDUCED_SIZE); status = clEnqueueReadBuffer(clenv.commandQueue, partialReduction, CL_TRUE, //blocking read 0, //offset REDUCED_BYTE_SIZE, //byte size of data &partialDot[0], //destination buffer in host memory 0, //number of events that need to //complete before transfer executed 0, //list of events that need to complete //before transfer executed 0); //event identifying this specific operation check_cl_error(status, "clEnqueueReadBuffer"); deviceDot = std::accumulate(partialDot.begin(), partialDot.end(), real_t(0)); hostDot = host_dot_product(V1, V2); std::cout << deviceDot << ' ' << hostDot << std::endl; if(check_result(hostDot, deviceDot, EPS)) { std::cout << "PASSED" << std::endl; } else { std::cout << "FAILED" << std::endl; } check_cl_error(clReleaseMemObject(devV1), "clReleaseMemObject"); check_cl_error(clReleaseMemObject(devV2), "clReleaseMemObject"); check_cl_error(clReleaseMemObject(partialReduction), "clReleaseMemObject"); release_clenv(clenv); return 0; }
//------------------------------------------------------------------------------ CLEnv create_clenv(const std::string& platformName, const std::string& deviceType, int deviceNum, bool enableProfiling, const char* clSourcePath, const char* kernelName, const std::string& clSourcePrefix, const std::string& buildOptions) { CLEnv rt; cl_int status; cl_device_id deviceID; //1)create context rt.context = create_cl_context(platformName, deviceType, deviceNum); //only a single device was selected //retrieve actual device id from context status = clGetContextInfo(rt.context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &deviceID, 0); check_cl_error(status, "clGetContextInfo"); //2)load kernel source if(clSourcePath != 0) { const std::string programSource = clSourcePrefix + "\n" + load_text(clSourcePath); const char* src = programSource.c_str(); const size_t sourceLength = programSource.length(); //3)build program and create kernel rt.program = clCreateProgramWithSource(rt.context, //context 1, //number of strings &src, //lines &sourceLength, // size &status); // status check_cl_error(status, "clCreateProgramWithSource"); cl_int buildStatus = buildOptions.size() ? clBuildProgram(rt.program, 1, &deviceID, buildOptions.c_str(), 0, 0) : clBuildProgram(rt.program, 1, &deviceID, 0, 0, 0); //log output if any char buffer[0x10000] = ""; size_t len = 0; status = clGetProgramBuildInfo(rt.program, deviceID, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); check_cl_error(status, "clBuildProgramInfo"); if(len > 1) std::cout << "Build output: " << buffer << std::endl; check_cl_error(buildStatus, "clBuildProgram"); if(kernelName != 0) { rt.kernel = clCreateKernel(rt.program, kernelName, &status); check_cl_error(status, "clCreateKernel"); } } rt.commandQueue = enableProfiling ? clCreateCommandQueue(rt.context, deviceID, CL_QUEUE_PROFILING_ENABLE, &status) : clCreateCommandQueue(rt.context, deviceID, 0, &status); check_cl_error(status, "clCreateCommandQueue"); return rt; }
//------------------------------------------------------------------------------ int main(int argc, char** argv) { if(argc < 8) { std::cerr << "usage: " << argv[0] << " <platform name> <device type = default | cpu | gpu " "| acc | all> <device num> <OpenCL source file path>" " <kernel name> <matrix size> <workgroup size>" << std::endl; exit(EXIT_FAILURE); } const int SIZE = atoi(argv[6]); const size_t BYTE_SIZE = SIZE * SIZE * sizeof(real_t); const int BLOCK_SIZE = atoi(argv[7]); //4 x 4 tiles if( SIZE < 1 || BLOCK_SIZE < 1 || (SIZE % BLOCK_SIZE) != 0) { std::cerr << "ERROR - size and block size *must* be greater than zero " "and size *must* be evenly divsible by block size" << std::endl; exit(EXIT_FAILURE); } //setup text header that will be prefixed to opencl code std::ostringstream clheaderStream; clheaderStream << "#define BLOCK_SIZE " << BLOCK_SIZE << '\n'; #ifdef USE_DOUBLE clheaderStream << "#define DOUBLE\n"; const double EPS = 0.000000001; #else const double EPS = 0.00001; #endif //enable profiling on queue CLEnv clenv = create_clenv(argv[1], argv[2], atoi(argv[3]), true, argv[4], argv[5], clheaderStream.str()); cl_int status; //create input and output matrices std::vector<real_t> A = create_matrix(SIZE, SIZE); std::vector<real_t> B = create_matrix(SIZE, SIZE); std::vector<real_t> C(SIZE * SIZE,real_t(0)); std::vector<real_t> refC(SIZE * SIZE,real_t(0)); //allocate output buffer on OpenCL device cl_mem devC = clCreateBuffer(clenv.context, CL_MEM_WRITE_ONLY, BYTE_SIZE, 0, &status); check_cl_error(status, "clCreateBuffer"); //allocate input buffers on OpenCL devices and copy data cl_mem devA = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BYTE_SIZE, &A[0], //<-- copy data from A &status); check_cl_error(status, "clCreateBuffer"); cl_mem devB = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BYTE_SIZE, &B[0], //<-- copy data from B &status); check_cl_error(status, "clCreateBuffer"); //set kernel parameters status = clSetKernelArg(clenv.kernel, //kernel 0, //parameter id sizeof(cl_mem), //size of parameter &devA); //pointer to parameter check_cl_error(status, "clSetKernelArg(A)"); status = clSetKernelArg(clenv.kernel, //kernel 1, //parameter id sizeof(cl_mem), //size of parameter &devB); //pointer to parameter check_cl_error(status, "clSetKernelArg(B)"); status = clSetKernelArg(clenv.kernel, //kernel 2, //parameter id sizeof(cl_mem), //size of parameter &devC); //pointer to parameter check_cl_error(status, "clSetKernelArg(C)"); status = clSetKernelArg(clenv.kernel, //kernel 3, //parameter id sizeof(int), //size of parameter &SIZE); //pointer to parameter check_cl_error(status, "clSetKernelArg(SIZE)"); //setup kernel launch configuration //total number of threads == number of array elements const size_t globalWorkSize[2] = {SIZE, SIZE}; //number of per-workgroup local threads const size_t localWorkSize[2] = {BLOCK_SIZE, BLOCK_SIZE}; //launch kernel //to make sure there are no pending commands in the queue do wait //for any commands to finish execution status = clFinish(clenv.commandQueue); check_cl_error(status, "clFinish"); cl_event profilingEvent; status = clEnqueueNDRangeKernel(clenv.commandQueue, //queue clenv.kernel, //kernel 2, //number of dimensions for work-items 0, //global work offset globalWorkSize, //total number of threads localWorkSize, //threads per workgroup 0, //number of events that need to //complete before kernel executed 0, //list of events that need to complete //before kernel executed &profilingEvent); //event object //identifying this //particular kernel //execution instance check_cl_error(status, "clEnqueueNDRangeKernel"); status = clFinish(clenv.commandQueue); //ensure kernel execution is //terminated; used for timing purposes only; there is no need to enforce //termination when issuing a subsequent blocking data transfer operation check_cl_error(status, "clFinish"); status = clWaitForEvents(1, &profilingEvent); check_cl_error(status, "clWaitForEvents"); cl_ulong kernelStartTime = cl_ulong(0); cl_ulong kernelEndTime = cl_ulong(0); size_t retBytes = size_t(0); double kernelElapsedTime_ms = double(0); status = clGetEventProfilingInfo(profilingEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &kernelStartTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); status = clGetEventProfilingInfo(profilingEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); //event timing is reported in nano seconds: divide by 1e6 to get //time in milliseconds kernelElapsedTime_ms = (double)(kernelEndTime - kernelStartTime) / 1E6; //read back and check results status = clEnqueueReadBuffer(clenv.commandQueue, devC, CL_TRUE, //blocking read 0, //offset BYTE_SIZE, //byte size of data &C[0], //destination buffer in host memory 0, //number of events that need to //complete before transfer executed 0, //list of events that need to complete //before transfer executed 0); //event identifying this specific operation check_cl_error(status, "clEnqueueReadBuffer"); host_matmul(A, B, refC, SIZE, SIZE); if(check_result(refC, C, EPS)) { std::cout << "PASSED" << std::endl; std::cout << "Elapsed time(ms): " << kernelElapsedTime_ms << std::endl; } else { std::cout << "FAILED" << std::endl; } check_cl_error(clReleaseMemObject(devA), "clReleaseMemObject"); check_cl_error(clReleaseMemObject(devB), "clReleaseMemObject"); check_cl_error(clReleaseMemObject(devC), "clReleaseMemObject"); release_clenv(clenv); return 0; }