/* Compute the 2D global work size for the split kernel.
 *
 * CPU devices get a small fixed 64x64 size (empirically faster there).
 * Otherwise the size is derived from the device's maximum single
 * allocation, optionally clamped by the debug memory limit, then capped
 * at 2GB, and converted to a roughly-square 2D launch grid.
 */
virtual int2 split_kernel_global_size(device_memory& kg,
                                      device_memory& data,
                                      DeviceTask * /*task*/)
{
	const cl_device_type device_type = OpenCLInfo::get_device_type(device->cdDevice);
	/* Use small global size on CPU devices as it seems to be much faster. */
	if(device_type == CL_DEVICE_TYPE_CPU) {
		VLOG(1) << "Global size: (64, 64).";
		return make_int2(64, 64);
	}
	/* Query the largest single buffer the device can allocate. */
	cl_ulong max_alloc_size;
	clGetDeviceInfo(device->cdDevice,
	                CL_DEVICE_MAX_MEM_ALLOC_SIZE,
	                sizeof(cl_ulong),
	                &max_alloc_size,
	                NULL);
	/* Respect the user-imposed debug memory limit, accounting for what is
	 * already in use. */
	if(DebugFlags().opencl.mem_limit) {
		max_alloc_size = min(max_alloc_size,
		                     cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
	}
	VLOG(1) << "Maximum device allocation size: "
	        << string_human_readable_number(max_alloc_size) << " bytes. ("
	        << string_human_readable_size(max_alloc_size) << ").";
	/* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
	max_alloc_size = min(max_alloc_size / 2, (cl_ulong)2l*1024*1024*1024);
	const size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_alloc_size);
	/* Make a near-square grid; the x dimension is rounded down to a
	 * multiple of 64 (but kept at least 64). */
	const int side = (int)sqrt(num_elements);
	const int2 global_size = make_int2(max(round_down(side, 64), 64), side);
	VLOG(1) << "Global size: " << global_size << ".";
	return global_size;
}
//------------------------------------------------------------------------------ double timeEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list) { cl_int status = clFinish(command_queue); check_cl_error(status, "clFinish"); cl_event profilingEvent; status = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, &profilingEvent); check_cl_error(status, "clEnqueueNDRangeKernel"); status = clFinish(command_queue); //ensure kernel execution is //terminated; used for timing purposes only; there is no need to enforce //termination when issuing a subsequent blocking data transfer operation check_cl_error(status, "clFinish"); status = clWaitForEvents(1, &profilingEvent); check_cl_error(status, "clWaitForEvents"); cl_ulong kernelStartTime = cl_ulong(0); cl_ulong kernelEndTime = cl_ulong(0); size_t retBytes = size_t(0); double kernelElapsedTime_ms = double(0); status = clGetEventProfilingInfo(profilingEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &kernelStartTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); status = clGetEventProfilingInfo(profilingEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); //event timing is reported in nano seconds: divide by 1e6 to get //time in milliseconds kernelElapsedTime_ms = (double)(kernelEndTime - kernelStartTime) / 1E6; return kernelElapsedTime_ms; }
//------------------------------------------------------------------------------ double get_cl_time(cl_event ev) { cl_ulong startTime = cl_ulong(0); cl_ulong endTime = cl_ulong(0); size_t retBytes = size_t(0); cl_int status = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &startTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); status = clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); //event timing is reported in nanoseconds: divide by 1e6 to get //time in milliseconds return double((endTime - startTime) / 1E6); }
// Advance the debug particle system by one step: re-seed the generation
// kernel, acquire the GL-shared buffers for OpenCL, enqueue the selected
// kernel (spiral or random positions), and advance simulation time.
void DebugParticleSystem::update_impl(float time_step) {
  if (m_cl_timer.isReady()) m_cl_timer.start();

  size_t part_cnt = m_sim_params->particleCount();
  float sim_time = m_sim_params->time();

  // Seed argument = wall-clock time plus accumulated simulation time.
  // Note the seed sits at a different argument index in each kernel.
  if (m_spiral)
    m_polar_spiral_kernel.set_arg(3, cl_ulong(time(nullptr) + sim_time));
  else
    m_gen_rand_particles_kernel.set_arg(2, cl_ulong(time(nullptr) + sim_time));

  // Synchronize the GL-shared position/colour buffers for OpenCL access.
  // Presumably GLSyncHandler releases them on scope exit — verify.
  cl_mem buffers[] = { m_particle_pos_buf.getCLID(), m_particle_col_buf.getCLID() };
  utils::ocl::GLSyncHandler sync(m_queue, sizeof(buffers) / sizeof(buffers[0]), buffers);
  // NOTE(review): this early return skips m_cl_timer.stop() after start()
  // was already called — confirm the timer tolerates an unmatched start.
  if (!sync) return;

  cl_int err = CL_SUCCESS;
  // 1D launch, one work-item per particle; local size left to the runtime.
  if (m_spiral) {
    err = clEnqueueNDRangeKernel(m_queue, m_polar_spiral_kernel, 1, nullptr, &part_cnt, nullptr, 0, nullptr, m_stats.event("polar_spiral"));
  } else {
    err = clEnqueueNDRangeKernel(m_queue, m_gen_rand_particles_kernel, 1, nullptr, &part_cnt, nullptr, 0, nullptr, m_stats.event("gen_part_positions"));
  }

  // Enqueue failure is logged but not fatal; time still advances below.
  if (err != CL_SUCCESS) {
    WARNM("DebugParticleSystem: Failed to enqueue test simulation kernel: " << utils::ocl::errorToStr(err));
  }

  m_sim_params->advanceTime(time_step); // 3.0f;

  if (m_cl_timer.isReady()) m_cl_timer.stop();
}
// Query a single profiling counter (e.g. CL_PROFILING_COMMAND_START) from
// the given event.
//
// Returns the counter value in the unit OpenCL reports (nanoseconds for the
// timing counters), or 0 after reporting an error when the event handle is
// null or the query fails.
cl_ulong eventProfilingInfo(cl_event id, cl_profiling_info info) {
    // BUGFIX: initialize so the value is never read uninitialized on any path.
    cl_ulong value = cl_ulong(0);
    cl_int error = CL_SUCCESS;
    // When id is null, clGetEventProfilingInfo is never called and error
    // stays CL_SUCCESS — the report then only flags the null handle.
    if(!id ||
       (error = clGetEventProfilingInfo(id, info, sizeof(cl_ulong),
                                        &value, nullptr)) != CL_SUCCESS) {
        // BUGFIX: the message previously said "eventInfo()", pointing the
        // reader at the wrong function.
        reportError("eventProfilingInfo(): ", error);
        return cl_ulong(0);
    }
    return value;
}
//------------------------------------------------------------------------------ int main(int argc, char** argv) { if(argc < 8) { std::cerr << "usage: " << argv[0] << " <platform name> <device type = default | cpu | gpu " "| acc | all> <device num> <OpenCL source file path>" " <kernel name> <matrix size> <workgroup size>" << std::endl; exit(EXIT_FAILURE); } const int SIZE = atoi(argv[6]); const size_t BYTE_SIZE = SIZE * SIZE * sizeof(real_t); const int BLOCK_SIZE = atoi(argv[7]); //4 x 4 tiles if( SIZE < 1 || BLOCK_SIZE < 1 || (SIZE % BLOCK_SIZE) != 0) { std::cerr << "ERROR - size and block size *must* be greater than zero " "and size *must* be evenly divsible by block size" << std::endl; exit(EXIT_FAILURE); } //setup text header that will be prefixed to opencl code std::ostringstream clheaderStream; clheaderStream << "#define BLOCK_SIZE " << BLOCK_SIZE << '\n'; #ifdef USE_DOUBLE clheaderStream << "#define DOUBLE\n"; const double EPS = 0.000000001; #else const double EPS = 0.00001; #endif //enable profiling on queue CLEnv clenv = create_clenv(argv[1], argv[2], atoi(argv[3]), true, argv[4], argv[5], clheaderStream.str()); cl_int status; //create input and output matrices std::vector<real_t> A = create_matrix(SIZE, SIZE); std::vector<real_t> B = create_matrix(SIZE, SIZE); std::vector<real_t> C(SIZE * SIZE,real_t(0)); std::vector<real_t> refC(SIZE * SIZE,real_t(0)); //allocate output buffer on OpenCL device cl_mem devC = clCreateBuffer(clenv.context, CL_MEM_WRITE_ONLY, BYTE_SIZE, 0, &status); check_cl_error(status, "clCreateBuffer"); //allocate input buffers on OpenCL devices and copy data cl_mem devA = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BYTE_SIZE, &A[0], //<-- copy data from A &status); check_cl_error(status, "clCreateBuffer"); cl_mem devB = clCreateBuffer(clenv.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, BYTE_SIZE, &B[0], //<-- copy data from B &status); check_cl_error(status, "clCreateBuffer"); //set kernel parameters 
status = clSetKernelArg(clenv.kernel, //kernel 0, //parameter id sizeof(cl_mem), //size of parameter &devA); //pointer to parameter check_cl_error(status, "clSetKernelArg(A)"); status = clSetKernelArg(clenv.kernel, //kernel 1, //parameter id sizeof(cl_mem), //size of parameter &devB); //pointer to parameter check_cl_error(status, "clSetKernelArg(B)"); status = clSetKernelArg(clenv.kernel, //kernel 2, //parameter id sizeof(cl_mem), //size of parameter &devC); //pointer to parameter check_cl_error(status, "clSetKernelArg(C)"); status = clSetKernelArg(clenv.kernel, //kernel 3, //parameter id sizeof(int), //size of parameter &SIZE); //pointer to parameter check_cl_error(status, "clSetKernelArg(SIZE)"); //setup kernel launch configuration //total number of threads == number of array elements const size_t globalWorkSize[2] = {SIZE, SIZE}; //number of per-workgroup local threads const size_t localWorkSize[2] = {BLOCK_SIZE, BLOCK_SIZE}; //launch kernel //to make sure there are no pending commands in the queue do wait //for any commands to finish execution status = clFinish(clenv.commandQueue); check_cl_error(status, "clFinish"); cl_event profilingEvent; status = clEnqueueNDRangeKernel(clenv.commandQueue, //queue clenv.kernel, //kernel 2, //number of dimensions for work-items 0, //global work offset globalWorkSize, //total number of threads localWorkSize, //threads per workgroup 0, //number of events that need to //complete before kernel executed 0, //list of events that need to complete //before kernel executed &profilingEvent); //event object //identifying this //particular kernel //execution instance check_cl_error(status, "clEnqueueNDRangeKernel"); status = clFinish(clenv.commandQueue); //ensure kernel execution is //terminated; used for timing purposes only; there is no need to enforce //termination when issuing a subsequent blocking data transfer operation check_cl_error(status, "clFinish"); status = clWaitForEvents(1, &profilingEvent); check_cl_error(status, 
"clWaitForEvents"); cl_ulong kernelStartTime = cl_ulong(0); cl_ulong kernelEndTime = cl_ulong(0); size_t retBytes = size_t(0); double kernelElapsedTime_ms = double(0); status = clGetEventProfilingInfo(profilingEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &kernelStartTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); status = clGetEventProfilingInfo(profilingEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, &retBytes); check_cl_error(status, "clGetEventProfilingInfo"); //event timing is reported in nano seconds: divide by 1e6 to get //time in milliseconds kernelElapsedTime_ms = (double)(kernelEndTime - kernelStartTime) / 1E6; //read back and check results status = clEnqueueReadBuffer(clenv.commandQueue, devC, CL_TRUE, //blocking read 0, //offset BYTE_SIZE, //byte size of data &C[0], //destination buffer in host memory 0, //number of events that need to //complete before transfer executed 0, //list of events that need to complete //before transfer executed 0); //event identifying this specific operation check_cl_error(status, "clEnqueueReadBuffer"); host_matmul(A, B, refC, SIZE, SIZE); if(check_result(refC, C, EPS)) { std::cout << "PASSED" << std::endl; std::cout << "Elapsed time(ms): " << kernelElapsedTime_ms << std::endl; } else { std::cout << "FAILED" << std::endl; } check_cl_error(clReleaseMemObject(devA), "clReleaseMemObject"); check_cl_error(clReleaseMemObject(devB), "clReleaseMemObject"); check_cl_error(clReleaseMemObject(devC), "clReleaseMemObject"); release_clenv(clenv); return 0; }