int main(int argc, char** argv) { if (argc != 2) { std::cout << "Usage: ./pi_vocl num\n" << "\twhere num = 1, 4 or 8\n"; return EXIT_FAILURE; } int vector_size = atoi(argv[1]); // Define some vector size specific constants unsigned int ITERS, WGS; if (vector_size == 1) { ITERS = 262144; WGS = 8; } else if (vector_size == 4) { ITERS = 262144 / 4; WGS = 32; } else if (vector_size == 8) { ITERS = 262144 / 8; WGS = 64; } else { std::cerr << "Invalid vector size\n"; return EXIT_FAILURE; } // Set some default values: // Default number of steps (updated later to device preferable) unsigned int in_nsteps = INSTEPS; // Default number of iterations unsigned int niters = ITERS; unsigned int work_group_size = WGS; try { // Create context, queue and build program cl::Context context(DEVICE); cl::CommandQueue queue(context); cl::Program program(context, util::loadProgram("../pi_vocl.cl"), true); cl::Kernel kernel; // Now that we know the size of the work_groups, we can set the number of work // groups, the actual number of steps, and the step size unsigned int nwork_groups = in_nsteps/(work_group_size*niters); // Get the max work group size for the kernel pi on our device unsigned int max_size; std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); if (vector_size == 1) { kernel = cl::Kernel(program, "pi"); max_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0]); } else if (vector_size == 4) { kernel = cl::Kernel(program, "pi_vec4"); max_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0]); } else if (vector_size == 8) { kernel = cl::Kernel(program, "pi_vec8"); max_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0]); } if (max_size > work_group_size) { work_group_size = max_size; nwork_groups = in_nsteps/(nwork_groups*niters); } if (nwork_groups < 1) { nwork_groups = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); work_group_size = in_nsteps/(nwork_groups*niters); } unsigned int nsteps = work_group_size * niters * nwork_groups; float step_size = 1.0f / (float) nsteps; // Vector to hold partial sum std::vector<float> h_psum(nwork_groups); std::cout << nwork_groups << " work groups of size " << work_group_size << ".\n" << nsteps << " Integration steps\n"; cl::Buffer d_partial_sums(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups); // Start the timer util::Timer timer; // Execute the kernel over the entire range of our 1d input data et // using the maximum number of work group items for this device cl::NDRange global(nwork_groups * work_group_size); cl::NDRange local(work_group_size); kernel.setArg(0, niters); kernel.setArg(1, step_size); cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * work_group_size); kernel.setArg(2, localmem); kernel.setArg(3, d_partial_sums); queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local); cl::copy(queue, d_partial_sums, begin(h_psum), end(h_psum)); // Complete the sum and compute the final integral value float pi_res = 0.0; for (float x : h_psum) pi_res += x; pi_res *= step_size; // Stop the timer double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.; std::cout << "The calculation ran in " << rtime << " seconds\n" << " pi = " << pi_res << " for " << nsteps << " steps\n"; return EXIT_SUCCESS; } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; return EXIT_FAILURE; } }
int main(int argc, char *argv[]) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device prefereable) int niters = ITERS; // number of iterations int nsteps; float step_size; ::size_t nwork_groups; ::size_t max_size, work_group_size = 8; float pi_res; cl::Buffer d_partial_sums; try { cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices std::vector<cl::Device> devices; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { std::cout << "Invalid device index (try '--list')\n"; return EXIT_FAILURE; } cl::Device device = devices[deviceIndex]; std::string name; getDeviceName(device, name); std::cout << "\nUsing OpenCL device: " << name << "\n"; std::vector<cl::Device> chosen_device; chosen_device.push_back(device); cl::Context context(chosen_device); cl::CommandQueue queue(context, device); // Create the program object cl::Program program(context, util::loadProgram("../pi_ocl.cl"), true); // Create the kernel object for quering information cl::Kernel ko_pi(program, "pi"); // Get the work group size work_group_size = ko_pi.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device); //printf("wgroup_size = %lu\n", work_group_size); cl::make_kernel<int, float, cl::LocalSpaceArg, cl::Buffer> pi(program, "pi"); // Now that we know the size of the work_groups, we can set the number of work // groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if ( nwork_groups < 1) { nwork_groups = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); work_group_size=in_nsteps / (nwork_groups*niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/static_cast<float>(nsteps); std::vector<float> h_psum(nwork_groups); printf( " %d work groups of size %d. %d Integration steps\n", (int)nwork_groups, (int)work_group_size, nsteps); d_partial_sums = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups); util::Timer timer; // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device pi( cl::EnqueueArgs( queue, cl::NDRange(nsteps / niters), cl::NDRange(work_group_size)), niters, step_size, cl::Local(sizeof(float) * work_group_size), d_partial_sums); cl::copy(queue, d_partial_sums, h_psum.begin(), h_psum.end()); // complete the sum and compute final integral value pi_res = 0.0f; for (unsigned int i = 0; i< nwork_groups; i++) { pi_res += h_psum[i]; } pi_res = pi_res * step_size; //rtime = wtime() - rtime; double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } }
int main(void) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device prefereable) int niters = ITERS; // number of iterations int nsteps; float step_size; ::size_t nwork_groups; ::size_t max_size, work_group_size = 8; float pi_res; cl::Buffer d_partial_sums; try { // Create a context cl::Context context(DEVICE); // Create the program object cl::Program program(context, util::loadProgram("pi_ocl.cl"), true); // Get the command queue cl::CommandQueue queue(context); // Create the kernel object for quering information cl::Kernel ko_pi(program, "pi"); // Get the device we are using std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); cl::Device device = devices[0]; // Get the work group size work_group_size = ko_pi.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device); //printf("wgroup_size = %lu\n", work_group_size); auto pi = cl::make_kernel<int, float, cl::LocalSpaceArg, cl::Buffer>(program, "pi"); // Now that we know the size of the work_groups, we can set the number of work // groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if ( nwork_groups < 1) { nwork_groups = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); work_group_size=in_nsteps / (nwork_groups*niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/static_cast<float>(nsteps); std::vector<float> h_psum(nwork_groups); printf( " %d work groups of size %d. %d Integration steps\n", (int)nwork_groups, (int)work_group_size, nsteps); d_partial_sums = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups); util::Timer timer; // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device pi( cl::EnqueueArgs( queue, cl::NDRange(nwork_groups * work_group_size), cl::NDRange(work_group_size)), niters, step_size, cl::Local(sizeof(float) * work_group_size), d_partial_sums); cl::copy(queue, d_partial_sums, begin(h_psum), end(h_psum)); // complete the sum and compute final integral value pi_res = 0.0f; for (unsigned int i = 0; i< nwork_groups; i++) { pi_res += h_psum[i]; } pi_res = pi_res * step_size; //rtime = wtime() - rtime; double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } }