static void CL_CALLBACK kernel_profiler_cb (cl_event event, cl_int event_command_exec_status, void *user_data) {
  cl_ulong tstart, tstop, len;
  cl_int refcnt;
  struct ld_kernel_s *ldKernel = (struct ld_kernel_s *) user_data;

  pthread_mutex_lock(&stats_lock);

  clCheck(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(tstop), &tstop, NULL));
  clCheck(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(tstart), &tstart, NULL));
  clCheck(clGetEventInfo(event, CL_EVENT_REFERENCE_COUNT, sizeof(refcnt), &refcnt, NULL));

  // releases the reference taken in clEnqueueNDRangeKernel, *after* the
  // profiling queries above -- querying an already-released event is invalid
  clReleaseEvent(event);

  len = tstop - tstart;
  if (tstart > tstop) {
    len = tstart - tstop;
  }
  if (tstart == 0ul || tstop == 0ul) {
    // invalid timestamps
    len = 0;
  }
  ldKernel->exec_span_ns += len;

  pthread_mutex_unlock(&stats_lock);
}
cl_int clEnqueueNDRangeKernel (cl_command_queue command_queue,
                               cl_kernel kernel,
                               cl_uint work_dim,
                               const size_t *global_work_offset,
                               const size_t *global_work_size,
                               const size_t *local_work_size,
                               cl_uint num_events_in_wait_list,
                               const cl_event *event_wait_list,
                               cl_event *event) {
  static struct work_size_s work_sizes;
  struct ld_kernel_s *ldKernel = find_kernel_entry(kernel);
  cl_uint i;
  cl_int errcode;

  if (num_events_in_wait_list) {
    clCheck(clWaitForEvents(num_events_in_wait_list, event_wait_list));
  }

  assert(ldKernel);

  // records the launch geometry: work_sizes.global holds the number of
  // work-groups per dimension, not the number of work-items
  for (i = 0; i < work_dim; i++) {
    work_sizes.local[i] = local_work_size[i];
    work_sizes.global[i] = global_work_size[i] / work_sizes.local[i];
  }

#if ENABLE_KERNEL_PROFILING == 1
  static cl_event kern_event;
  if (!event) {
    // the caller did not ask for an event; use static storage so the
    // profiling callback still receives a valid event to query
    event = &kern_event;
  }
#endif

  kernel_executed_event(ldKernel, &work_sizes, work_dim);

  errcode = real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
                                        global_work_offset, global_work_size, local_work_size,
                                        num_events_in_wait_list, event_wait_list, event);

#if ENABLE_KERNEL_PROFILING == 1
  clCheck(errcode);
  // retains the event so it stays valid until kernel_profiler_cb releases it
  clRetainEvent(*event);
  clSetEventCallback(*event, CL_COMPLETE, kernel_profiler_cb, ldKernel);
#endif

#if FORCE_FINISH_KERNEL
  real_clFinish(command_queue);
#endif

  kernel_finished_event(ldKernel, &work_sizes, work_dim);

  return errcode;
}
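// Note on the wrapper above: it shadows the real OpenCL entry point, so the
// vendor implementation has to be resolved at load time. A minimal sketch of
// how real_clEnqueueNDRangeKernel could be obtained (an assumption for
// illustration; the actual lookup lives elsewhere in this library):
#include <dlfcn.h>  // requires _GNU_SOURCE for RTLD_NEXT

typedef cl_int (*clEnqueueNDRangeKernel_t)(cl_command_queue, cl_kernel, cl_uint,
                                           const size_t *, const size_t *, const size_t *,
                                           cl_uint, const cl_event *, cl_event *);
static clEnqueueNDRangeKernel_t real_clEnqueueNDRangeKernel;

__attribute__((constructor))
static void resolve_real_symbols (void) {
  // RTLD_NEXT skips this interposing library and returns the next
  // definition of the symbol, i.e. the real OpenCL implementation
  real_clEnqueueNDRangeKernel =
    (clEnqueueNDRangeKernel_t) dlsym(RTLD_NEXT, "clEnqueueNDRangeKernel");
}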
// copies real array from CPU host to GPU device
void gpuCreateCopy_todevice_realw (gpu_realw_mem *d_array_addr_ptr, realw *h_array, int size) {

  TRACE ("gpuCreateCopy_todevice_realw");

#ifdef USE_OPENCL
  if (run_opencl) {
    cl_int errcode;

    // allocates memory on GPU
    d_array_addr_ptr->ocl = clCreateBuffer (mocl.context, CL_MEM_READ_WRITE,
                                            size * sizeof (realw), NULL, clck_(&errcode));

    // copies values onto GPU
    clCheck (clEnqueueWriteBuffer (mocl.command_queue, d_array_addr_ptr->ocl, CL_TRUE, 0,
                                   size * sizeof (realw), h_array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    // allocates memory on GPU
    print_CUDA_error_if_any(cudaMalloc((void**)&d_array_addr_ptr->cuda, size*sizeof(realw)), 22001);

    // copies values onto GPU
    print_CUDA_error_if_any(cudaMemcpy((realw*) d_array_addr_ptr->cuda, h_array,
                                       size*sizeof(realw), cudaMemcpyHostToDevice), 22002);
  }
#endif
}
// copies integer array from CPU host to GPU device
void gpuCreateCopy_todevice_int (gpu_int_mem *d_array_addr_ptr, int *h_array, int size) {

  TRACE ("gpuCreateCopy_todevice_int");

#ifdef USE_OPENCL
  if (run_opencl) {
    cl_int errcode;

    // allocates memory on GPU
    d_array_addr_ptr->ocl = clCreateBuffer (mocl.context, CL_MEM_READ_WRITE,
                                            size * sizeof (int), NULL, clck_(&errcode));

    // copies values onto GPU
    clCheck (clEnqueueWriteBuffer (mocl.command_queue, d_array_addr_ptr->ocl, CL_TRUE, 0,
                                   size * sizeof (int), h_array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    // allocates memory on GPU
    //
    // note: cudaMalloc uses a double-pointer, such that it can return an error code in case it fails;
    //       we thus pass the address of the pointer above (as a void double-pointer) to have it
    //       point to the correct array pointer here
    print_CUDA_error_if_any(cudaMalloc((void**)&d_array_addr_ptr->cuda, size*sizeof(int)), 12001);

    // copies values onto GPU
    //
    // note: cudaMemcpy uses the pointer to the array; we thus re-cast the value of
    //       the double-pointer above to get the correct pointer to the array
    print_CUDA_error_if_any(cudaMemcpy((int*) d_array_addr_ptr->cuda, h_array,
                                       size*sizeof(int), cudaMemcpyHostToDevice), 12002);
  }
#endif
}
void clChoosePlatform(cl_device_id*& devices, cl_platform_id& platform) {
  // Choose the first available platform
  cl_uint numPlatforms;
  clCheck(clGetPlatformIDs(0, NULL, &numPlatforms));
  if (numPlatforms > 0) {
    cl_platform_id* platforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id));
    clCheck(clGetPlatformIDs(numPlatforms, platforms, NULL));
    platform = platforms[0];
    free(platforms);
  }

  // Choose a device from the platform according to DEVICE_PREFERENCE
  cl_uint numCpus = 0;
  cl_uint numGpus = 0;
  cl_uint numAccelerators = 0;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numCpus);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numGpus);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numAccelerators);

  DEBUG << std::endl
        << "Devices available: " << std::endl
        << "CPU: " << numCpus << std::endl
        << "GPU: " << numGpus << std::endl
        << "Accelerators: " << numAccelerators << std::endl;

  // Allocate the device list once the matching device count is known
  // (sizing it to numAccelerators regardless of the chosen type would
  // overflow the buffer whenever another type has more devices).
  if (DEVICE_PREFERENCE == DEVICE_CPU && numCpus > 0) {
    DEBUG << "Choosing CPU" << std::endl;
    devices = (cl_device_id*) malloc(numCpus * sizeof(cl_device_id));
    clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numCpus, devices, NULL));
  } else if (DEVICE_PREFERENCE == DEVICE_GPU && numGpus > 0) {
    DEBUG << "Choosing GPU" << std::endl;
    devices = (cl_device_id*) malloc(numGpus * sizeof(cl_device_id));
    clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numGpus, devices, NULL));
  } else if (DEVICE_PREFERENCE == DEVICE_ACCELERATOR && numAccelerators > 0) {
    DEBUG << "Choosing accelerator" << std::endl;
    devices = (cl_device_id*) malloc(numAccelerators * sizeof(cl_device_id));
    clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, numAccelerators, devices, NULL));
  } else {
    // We couldn't match the preference.
    // Let's try the first device that appears available.
    cl_uint numDevices = 0;
    clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices));
    if (numDevices > 0) {
      DEBUG << "Preferred device couldn't be matched" << std::endl
            << "Choosing an available OpenCL capable device" << std::endl;
      devices = (cl_device_id*) malloc(numDevices * sizeof(cl_device_id));
      clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL));
    } else {
      DEBUG << "No OpenCL capable device detected" << std::endl
            << "Check that drivers, an OpenCL runtime or ICDs are available" << std::endl;
      exit(-1);
    }
  }
  DEBUG << std::endl;
}
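// Hypothetical usage sketch for the selection helper above. clCheck is the
// same error-checking wrapper used throughout; the context creation is plain
// OpenCL API, not something this file provides:
cl_platform_id platform;
cl_device_id* devices = NULL;
clChoosePlatform(devices, platform);

cl_int errcode;
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, &errcode);
clCheck(errcode);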
void moclEnqueueFillBuffer (cl_mem *buffer, int val, size_t size_byte) {

  // creates/gets OpenCL memset kernel
  cl_kernel *memset_kern = setup_ocl_memset(1);

  // value to fill buffer
  cl_int value = val;

  // gets size as number of integer values
  int size = size_byte / sizeof(cl_int);

  size_t global_work_size[2];
  size_t local_work_size[2];
  cl_uint idx = 0;

  clCheck (clSetKernelArg (*memset_kern, idx++, sizeof (cl_mem), (void *) buffer));
  clCheck (clSetKernelArg (*memset_kern, idx++, sizeof (cl_int), (void *) &size));
  clCheck (clSetKernelArg (*memset_kern, idx++, sizeof (cl_int), (void *) &value));

  int blocksize = BLOCKSIZE_TRANSFER;
  int size_padded = ((int) ceil ((double) size / (double) blocksize)) * blocksize;

  int num_blocks_x, num_blocks_y;
  get_blocks_xy (size_padded/blocksize, &num_blocks_x, &num_blocks_y);

  local_work_size[0] = blocksize;
  local_work_size[1] = 1;
  global_work_size[0] = num_blocks_x * blocksize;
  global_work_size[1] = num_blocks_y;

  //debug
  //printf("moclEnqueueFillBuffer: size %i value %i - work_size %zu %zu \n",size,value,local_work_size[0],global_work_size[0]);

  clCheck (clEnqueueNDRangeKernel (mocl.command_queue, *memset_kern, 2, NULL,
                                   global_work_size, local_work_size, 0, NULL, NULL));

  // synchronizes
  clFinish (mocl.command_queue);
}
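// get_blocks_xy() is defined elsewhere in these sources. A minimal sketch of
// the intended behavior (an assumption for illustration; MAXIMUM_GRID_DIM
// stands for the largest supported first grid dimension): the 1D block count
// is folded into a 2D grid whenever it exceeds that limit.
static void get_blocks_xy_sketch (int num_blocks, int *num_blocks_x, int *num_blocks_y) {
  *num_blocks_x = num_blocks;
  *num_blocks_y = 1;

  // halve x and double y until x fits within the grid limit
  while (*num_blocks_x > MAXIMUM_GRID_DIM) {
    *num_blocks_x = (int) ceil (*num_blocks_x * 0.5);
    *num_blocks_y = *num_blocks_y * 2;
  }
}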
cl_kernel *setup_ocl_memset (int do_setup) {

  static int inited = 0;
  static cl_kernel memset_kern;
  cl_int errcode;

  if (do_setup) {
    if (!inited) {
      // creates openCL kernel
      cl_program memset_program = clCreateProgramWithSource(mocl.context, 1,
                                                            memset_kern_code, 0,
                                                            clck_(&errcode));
      clCheck (clBuildProgram (memset_program, 0, NULL, NULL, NULL, NULL));
      memset_kern = clCreateKernel (memset_program, "memset_uint4", clck_(&errcode));
      inited = 1;
    }
  } else {
    // releases kernel
    if (inited) {
      clCheck(clReleaseKernel (memset_kern));
      inited = 0;  // allows re-creation and prevents a double release
    }
  }

  return &memset_kern;
}
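// memset_kern_code is defined elsewhere in these sources. A plausible
// definition (an assumption for illustration; only the kernel name
// "memset_uint4" and the argument order set in moclEnqueueFillBuffer above
// are taken from this file):
const char *memset_kern_code[] = { "\
__kernel void memset_uint4 (__global int *buffer, const int size, const int value) { \n\
  /* linearizes the 2D NDRange set up via get_blocks_xy() */ \n\
  int idx = get_global_id(0) + get_global_id(1) * get_global_size(0); \n\
  if (idx < size) { \n\
    buffer[idx] = value; \n\
  } \n\
}\n" };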
int ocl_getAndReleaseParameterValue (struct ld_kernel_s *ldKernel,
                                     struct ld_kern_param_s *ldParam,
                                     void *buffer_handle,
                                     void *buffer, size_t size) {
  if (buffer_handle == (void *) -1) {
    return 1;
  }

  if (size == 0) {
    goto do_release;
  }

  clCheck(real_clEnqueueReadBuffer(ldOclEnv.command_queue, (cl_mem) buffer_handle, CL_TRUE,
                                   0, size, buffer, 0, NULL, NULL));

do_release:
  clCheck(real_clReleaseMemObject((cl_mem) buffer_handle));

  return 1;
}
void *ocl_setParameterValue (struct ld_kernel_s *ldKernel,
                             struct ld_kern_param_s *ldParam,
                             void *buffer, size_t size) {
  cl_mem mem_obj = (cl_mem) (void *) -1;
  cl_int errcode_ret;

  if (ldParam->is_pointer) {
    DEFAULT_SIZE(size)
    mem_obj = real_clCreateBuffer(ldOclEnv.context, CL_MEM_READ_WRITE, size,
                                  NULL, clck_(&errcode_ret));
    clCheck(real_clEnqueueWriteBuffer(ldOclEnv.command_queue, mem_obj, CL_TRUE,
                                      0, size, buffer, 0, NULL, NULL));
    buffer = &mem_obj;
    size = sizeof(cl_mem);
  }

  if (size == 0 && strstr(ldParam->name, "_tex")) {
    // creates a dummy 100x1 image for unused texture parameters
    cl_image_format format = {CL_R, CL_UNSIGNED_INT32};
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
    // host_ptr must be NULL here: neither CL_MEM_USE_HOST_PTR nor
    // CL_MEM_COPY_HOST_PTR is set, so a non-NULL host_ptr is invalid
    mem_obj = clCreateImage2D (ldOclEnv.context, CL_MEM_READ_ONLY, &format,
                               100, 1, 0, NULL, clck_(&errcode_ret));
#pragma GCC diagnostic pop
    buffer = &mem_obj;
    size = sizeof(cl_mem);
  }

  clCheck(real_clSetKernelArg ((cl_kernel) ldKernel->handle, ldParam->index, size, buffer));

  return mem_obj;
}
// copies array from GPU to CPU
void gpuCopy_from_device_realw (gpu_realw_mem *d_array_addr_ptr, realw *h_array, int size) {

  TRACE ("gpuCopy_from_device_realw");

  // copies memory from GPU back to CPU
#ifdef USE_OPENCL
  if (run_opencl) {
    // blocking copy
    clCheck (clEnqueueReadBuffer (mocl.command_queue, d_array_addr_ptr->ocl, CL_TRUE, 0,
                                  sizeof (realw) * size, h_array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    // note: cudaMemcpy implicitly synchronizes all other cuda operations
    print_CUDA_error_if_any(cudaMemcpy(h_array, d_array_addr_ptr->cuda,
                                       sizeof(realw)*size, cudaMemcpyDeviceToHost), 33001);
  }
#endif
}
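// Usage sketch combining the transfer helpers above (h_field and NGLOB are
// hypothetical placeholders for a host array and its length):
gpu_realw_mem d_field;
gpuCreateCopy_todevice_realw (&d_field, h_field, NGLOB);  // allocates on GPU and uploads
// ... kernel launches operating on d_field ...
gpuCopy_from_device_realw (&d_field, h_field, NGLOB);     // blocking download back to host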
// setup functions
void gpuSetConst (gpu_realw_mem *buffer, size_t size, realw *array) {

  TRACE ("gpuSetConst");

  // allocates array on GPU
#ifdef USE_OPENCL
  if (run_opencl) {
    cl_int errcode;
    buffer->ocl = clCreateBuffer (mocl.context, CL_MEM_READ_ONLY,
                                  size * sizeof(realw), NULL, clck_(&errcode));
    clCheck (clEnqueueWriteBuffer (mocl.command_queue, buffer->ocl, CL_TRUE, 0,
                                   size * sizeof(realw), array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    print_CUDA_error_if_any(cudaMalloc(&buffer->cuda, size * sizeof(realw)), 1400);
    print_CUDA_error_if_any(cudaMemcpy(buffer->cuda, array, size * sizeof(realw),
                                       cudaMemcpyHostToDevice), 1401);
  }
#endif
}
// copies integer array from CPU host to GPU device
void gpuCopy_todevice_int (gpu_int_mem *d_array_addr_ptr, int *h_array, int size) {

  TRACE ("gpuCopy_todevice_int");

  // copies memory from CPU to GPU
  // uses blocking copies
#ifdef USE_OPENCL
  if (run_opencl) {
    // copies values onto GPU
    clCheck (clEnqueueWriteBuffer (mocl.command_queue, d_array_addr_ptr->ocl, CL_TRUE, 0,
                                   size * sizeof (int), h_array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    // copies values onto GPU
    print_CUDA_error_if_any(cudaMemcpy((int*) d_array_addr_ptr->cuda, h_array,
                                       size*sizeof(int), cudaMemcpyHostToDevice), 22003);
  }
#endif
}
int ocl_triggerKernelExecution (struct ld_kernel_s *ldKernel,
                                const struct work_size_s *work_sizes,
                                unsigned int work_dim) {
  static size_t global_work_size[MAX_WORK_DIM], local_work_size[MAX_WORK_DIM];
  unsigned int i;

  // reconstructs the NDRange from the recorded geometry: work_sizes->global
  // stores work-group counts, so the OpenCL global size is groups * local size
  for (i = 0; i < work_dim; i++) {
    local_work_size[i] = work_sizes->local[i];
    global_work_size[i] = work_sizes->global[i] * work_sizes->local[i];
  }

  clCheck(real_clEnqueueNDRangeKernel(ldOclEnv.command_queue,
                                      (cl_kernel) ldKernel->handle,
                                      work_dim, NULL,
                                      global_work_size, local_work_size,
                                      0, NULL, NULL));
  return 1;
}
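// Worked example of the size convention shared with the clEnqueueNDRangeKernel
// wrapper earlier: a 1D launch of 1280 work-items with local size 128 is
// recorded as work_sizes.local[0] = 128 and work_sizes.global[0] = 1280/128 = 10
// work-groups; replaying it here gives global_work_size[0] = 10 * 128 = 1280 again.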