Example #1
static void CL_CALLBACK  kernel_profiler_cb (cl_event event,
                                             cl_int event_command_exec_status,
                                             void *user_data)
{
  static cl_ulong tstart, tstop, len;
  cl_int refcnt;
  struct ld_kernel_s *ldKernel = (struct ld_kernel_s *) user_data;
  
  pthread_mutex_lock(&stats_lock);
  // query the event before dropping the reference taken when the callback was registered
  clCheck(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(tstop), &tstop, NULL));
  clCheck(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(tstart), &tstart, NULL));
  clCheck(clGetEventInfo(event, CL_EVENT_REFERENCE_COUNT, sizeof(refcnt), &refcnt, NULL));
  clReleaseEvent(event);
  
  len = tstop - tstart;
  if (tstart > tstop) {
    len = tstart - tstop;
  }

  if (tstart == 0ul || tstop == 0ul) {
    // invalid timestamps
    len = 0;
  }

  ldKernel->exec_span_ns += len;
  pthread_mutex_unlock(&stats_lock);
}
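
A note on the profiling queries above: CL_PROFILING_COMMAND_START/END are only valid when the command queue the kernel was enqueued on was created with profiling enabled. A minimal sketch of such a queue setup, assuming a context and device are already available (the helper name and variables are illustrative, not from the original code):

#include <CL/cl.h>

/* hypothetical helper: create a queue whose events can feed kernel_profiler_cb */
static cl_command_queue create_profiling_queue(cl_context context, cl_device_id device)
{
  cl_int errcode;
  /* CL_QUEUE_PROFILING_ENABLE is required for clGetEventProfilingInfo to
     return valid CL_PROFILING_COMMAND_START/END timestamps */
  cl_command_queue queue = clCreateCommandQueue(context, device,
                                                CL_QUEUE_PROFILING_ENABLE, &errcode);
  clCheck(errcode);
  return queue;
}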
Example #2
cl_int clEnqueueNDRangeKernel (cl_command_queue command_queue,
                               cl_kernel kernel,
                               cl_uint work_dim,
                               const size_t *global_work_offset,
                               const size_t *global_work_size,
                               const size_t *local_work_size,
                               cl_uint num_events_in_wait_list,
                               const cl_event *event_wait_list,
                               cl_event *event)
{
  static struct work_size_s work_sizes;
  
  struct ld_kernel_s *ldKernel = find_kernel_entry(kernel);
  cl_uint i;
  cl_int errcode;

  if (num_events_in_wait_list) {
    clCheck(clWaitForEvents(num_events_in_wait_list, event_wait_list));
  }
  
  assert(ldKernel);
  for (i = 0; i < work_dim; i++) {
    // local_work_size may legally be NULL; fall back to a work-group size of 1
    work_sizes.local[i] = local_work_size ? local_work_size[i] : 1;
    work_sizes.global[i] = global_work_size[i] / work_sizes.local[i];
  }

#if ENABLE_KERNEL_PROFILING == 1
  static cl_event kern_event;
  
  if (!event) {
    // the caller did not request an event: use our own handle so the
    // profiling callback below can still be attached to this enqueue
    event = &kern_event;
  }
#endif
  
  kernel_executed_event(ldKernel, &work_sizes, work_dim);
  
  errcode = real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
                                        global_work_offset, global_work_size,
                                        local_work_size, num_events_in_wait_list,
                                        event_wait_list, event);
#if ENABLE_KERNEL_PROFILING == 1
  clCheck(errcode);
  
  clCheck(clRetainEvent(*event));
  clCheck(clSetEventCallback(*event, CL_COMPLETE, kernel_profiler_cb, ldKernel));
#endif

#if FORCE_FINISH_KERNEL
  real_clFinish(command_queue);
#endif
  
  kernel_finished_event(ldKernel, &work_sizes, work_dim);
  
  return errcode;
}
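
The real_clEnqueueNDRangeKernel and real_clFinish symbols indicate this wrapper sits in front of the vendor library and forwards to it. A minimal sketch of how such a passthrough pointer could be resolved, assuming dlsym/LD_PRELOAD-style interposition (the project's actual dispatch mechanism is not shown in this listing):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <CL/cl.h>

typedef cl_int (*clEnqueueNDRangeKernel_fn)(cl_command_queue, cl_kernel, cl_uint,
                                            const size_t *, const size_t *, const size_t *,
                                            cl_uint, const cl_event *, cl_event *);

static clEnqueueNDRangeKernel_fn real_clEnqueueNDRangeKernel;

static void resolve_real_symbols(void)
{
  /* RTLD_NEXT skips the interposing library and finds the vendor implementation */
  real_clEnqueueNDRangeKernel =
    (clEnqueueNDRangeKernel_fn) dlsym(RTLD_NEXT, "clEnqueueNDRangeKernel");
}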
Example #3
// copies real array from CPU host to GPU device
void gpuCreateCopy_todevice_realw (gpu_realw_mem *d_array_addr_ptr, realw *h_array, int size) {

  TRACE ("gpuCreateCopy_todevice_realw");

  // allocates memory on GPU
#ifdef USE_OPENCL
  if (run_opencl) {
    cl_int errcode;

    d_array_addr_ptr->ocl = clCreateBuffer (mocl.context, CL_MEM_READ_WRITE, size * sizeof (realw),
                                            NULL, clck_(&errcode));

    // copies values onto GPU
    clCheck (clEnqueueWriteBuffer (mocl.command_queue, d_array_addr_ptr->ocl, CL_TRUE, 0,
                                   size * sizeof (realw), h_array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    // allocates memory on GPU
    print_CUDA_error_if_any(cudaMalloc((void**)&d_array_addr_ptr->cuda,size*sizeof(realw)),22001);
    // copies values onto GPU
    print_CUDA_error_if_any(cudaMemcpy((realw*) d_array_addr_ptr->cuda,h_array,size*sizeof(realw),cudaMemcpyHostToDevice),22002);
  }
#endif
}
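
The examples above wrap status-returning calls in clCheck(...) and pass clck_(&errcode) where the API reports errors through an output pointer. Neither helper is part of this listing; a minimal sketch of what they might look like, assuming the project simply aborts on any OpenCL error (the real clck_ presumably checks the stored code after the call returns):

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

/* abort with file/line information if an OpenCL call returned an error */
#define clCheck(errcode) cl_check_error((errcode), __FILE__, __LINE__)

static inline void cl_check_error(cl_int errcode, const char *file, int line)
{
  if (errcode != CL_SUCCESS) {
    fprintf(stderr, "OpenCL error %d at %s:%d\n", (int) errcode, file, line);
    exit(1);
  }
}

/* hypothetical clck_: hand the API a place to store its error code */
static inline cl_int *clck_(cl_int *errcode_ptr)
{
  return errcode_ptr;
}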
Example #4
// copies integer array from CPU host to GPU device
void gpuCreateCopy_todevice_int (gpu_int_mem *d_array_addr_ptr, int *h_array, int size) {

  TRACE ("gpuCreateCopy_todevice_int");

#ifdef USE_OPENCL
  if (run_opencl) {
    cl_int errcode;

    // allocates memory on GPU
    d_array_addr_ptr->ocl = clCreateBuffer (mocl.context, CL_MEM_READ_WRITE,
                                            size * sizeof (int), NULL, clck_(&errcode));

    // copies values onto GPU
    clCheck (clEnqueueWriteBuffer (mocl.command_queue, d_array_addr_ptr->ocl, CL_TRUE, 0,
                                   size*sizeof (int), h_array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    // allocates memory on GPU
    //
    // note: cudaMalloc uses a double-pointer, such that it can return an error code in case it fails
    //          we thus pass the address to the pointer above (as void double-pointer) to have it
    //          pointing to the correct pointer of the array here
    print_CUDA_error_if_any(cudaMalloc((void**)&d_array_addr_ptr->cuda,size*sizeof(int)),12001);

    // copies values onto GPU
    //
    // note: cudaMemcpy uses the pointer to the array, we thus re-cast the value of
    //          the double-pointer above to have the correct pointer to the array
    print_CUDA_error_if_any(cudaMemcpy((int*) d_array_addr_ptr->cuda,h_array,size*sizeof(int),cudaMemcpyHostToDevice),12002);
  }
#endif
}
Example #5
void clChoosePlatform(cl_device_id*& devices, cl_platform_id& platform) {
  // Choose the first available platform
  cl_uint numPlatforms;
  clCheck(clGetPlatformIDs(0, NULL, &numPlatforms));
  if (numPlatforms > 0)
  {
    cl_platform_id* platforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id));
    clCheck(clGetPlatformIDs(numPlatforms, platforms, NULL));
    platform = platforms[0];
    free(platforms);
  }
  else
  {
    DEBUG << "No OpenCL platform detected" << std::endl
          << "Check the drivers, OpenCL runtime or ICDs are available" << std::endl;
    exit(-1);
  }

  // Choose a device from the platform according to DEVICE_PREFERENCE
  cl_uint numCpus = 0;
  cl_uint numGpus = 0;
  cl_uint numAccelerators = 0;
  cl_uint numDevicesAll = 0;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numCpus);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numGpus);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numAccelerators);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevicesAll);
  // allocate enough room for whichever device list is requested below
  devices = (cl_device_id*) malloc(numDevicesAll * sizeof(cl_device_id));

  DEBUG << std::endl << "Devices available: " << std::endl
    << "CPU: " << numCpus << std::endl
    << "GPU: " << numGpus << std::endl
    << "Accelerators: " << numAccelerators << std::endl;

  if (DEVICE_PREFERENCE == DEVICE_CPU && numCpus > 0) {
    DEBUG << "Choosing CPU" << std::endl;
    clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numCpus, devices, NULL));
  }
  else if (DEVICE_PREFERENCE == DEVICE_GPU && numGpus > 0) {
    DEBUG << "Choosing GPU" << std::endl;
    clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numGpus, devices, NULL));
  }
  else if (DEVICE_PREFERENCE == DEVICE_ACCELERATOR && numAccelerators > 0) {
    DEBUG << "Choosing accelerator" << std::endl;
    clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, numAccelerators, devices, NULL));
  }
  else {
    // We couldn't match the preference.
    // Let's try the first device that appears available.
    cl_uint numDevices = 0;
    clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices));
    if (numDevices > 0) {
      DEBUG << "Preference device couldn't be met" << std::endl
            << "Choosing an available OpenCL capable device" << std::endl;
      clCheck(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL));
    }
    else {
      DEBUG << "No OpenCL capable device detected" << std::endl
            << "Check the drivers, OpenCL runtime or ICDs are available" << std::endl;
      exit(-1);
    }
  }
  DEBUG << std::endl;
}
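
clChoosePlatform only selects a platform and fills the device list; a minimal sketch of how a caller might turn that into a context and command queue (illustrative only, not taken from the original code):

cl_device_id* devices = NULL;
cl_platform_id platform = NULL;
clChoosePlatform(devices, platform);

cl_context_properties props[] = {
  CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0
};

cl_int errcode;
// use the first device of the preferred type returned above
cl_context context = clCreateContext(props, 1, devices, NULL, NULL, &errcode);
clCheck(errcode);

cl_command_queue queue = clCreateCommandQueue(context, devices[0], 0, &errcode);
clCheck(errcode);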
Example #6
void moclEnqueueFillBuffer (cl_mem *buffer, int val, size_t size_byte) {

  // creates/gets OpenCL memset kernel
  cl_kernel *memset_kern = setup_ocl_memset(1);

  // value to fill buffer
  cl_int value = val;

  // gets size as number of integer values
  int size;
  size = size_byte / sizeof(cl_int);

  size_t global_work_size[2];
  size_t local_work_size[2];
  cl_uint idx = 0;

  clCheck (clSetKernelArg (*memset_kern, idx++, sizeof (cl_mem), (void *) buffer));
  clCheck (clSetKernelArg (*memset_kern, idx++, sizeof (cl_int), (void *) &size));
  clCheck (clSetKernelArg (*memset_kern, idx++, sizeof (cl_int), (void *) &value));

  int blocksize = BLOCKSIZE_TRANSFER;
  int size_padded = ((int) ceil ((double) size / (double) blocksize)) * blocksize;

  int num_blocks_x, num_blocks_y;
  get_blocks_xy (size_padded/blocksize, &num_blocks_x, &num_blocks_y);

  local_work_size[0] = blocksize;
  local_work_size[1] = 1;
  global_work_size[0] = num_blocks_x * blocksize;
  global_work_size[1] = num_blocks_y;

  //debug
  //printf("moclEnqueueFillBuffer: size %i value %i - work_size %zu %zu \n",size,value,local_work_size[0],global_work_size[0]);

  clCheck (clEnqueueNDRangeKernel (mocl.command_queue, *memset_kern, 2, NULL,
                                   global_work_size, local_work_size, 0, NULL, NULL));
  // synchronizes
  clFinish (mocl.command_queue);
}
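
get_blocks_xy is not part of this listing; judging from its use above, it splits a 1-D block count into a 2-D grid so that no single dimension exceeds the device's limit. A sketch under that assumption (the 65535 cap is a typical value, not taken from the project):

// hypothetical reconstruction: fold num_blocks into an x/y grid
static void get_blocks_xy(int num_blocks, int *num_blocks_x, int *num_blocks_y)
{
  *num_blocks_x = num_blocks;
  *num_blocks_y = 1;

  // halve x and double y until x fits within the per-dimension cap;
  // the resulting grid may be slightly larger than num_blocks, so the
  // kernel is expected to guard its index against the real size
  while (*num_blocks_x > 65535) {
    *num_blocks_x = (*num_blocks_x + 1) / 2;
    *num_blocks_y = *num_blocks_y * 2;
  }
}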
Example #7
cl_kernel *setup_ocl_memset (int do_setup) {

  static int inited = 0;
  static cl_kernel memset_kern;
  cl_int errcode;

  if (do_setup) {
    if (!inited) {
      // creates openCL kernel
      cl_program memset_program = clCreateProgramWithSource(mocl.context, 1,
                                                            memset_kern_code, 0,
                                                            clck_(&errcode));
      clCheck (clBuildProgram (memset_program, 0, NULL, NULL, NULL, NULL));
      memset_kern = clCreateKernel (memset_program, "memset_uint4", clck_(&errcode));
      inited = 1;
    }
  } else {
    // releases kernel
    if (inited) {
      clCheck(clReleaseKernel (memset_kern));
      inited = 0;
    }
  }

  return &memset_kern;
}
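
memset_kern_code is only referenced here; the kernel name "memset_uint4" and the three arguments set in moclEnqueueFillBuffer (buffer, size, value) suggest a source string along these lines (a sketch, not the project's actual kernel source):

const char *memset_kern_code[] = { "\
__kernel void memset_uint4(__global int *buffer, const int size, const int value) {\n\
  int i = get_global_id(0) + get_global_id(1) * get_global_size(0);\n\
  /* guard against the padding added when the grid was rounded up to full blocks */\n\
  if (i < size) {\n\
    buffer[i] = value;\n\
  }\n\
}\n" };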
Example #8
int ocl_getAndReleaseParameterValue (struct ld_kernel_s *ldKernel,
                                       struct ld_kern_param_s *ldParam,
                                       void *buffer_handle,
                                       void *buffer,  size_t size)
{
  if (buffer_handle == (void *) -1) {
    return 1;
  }
  
  if (size == 0) {
    goto do_release;
  }
  
  clCheck(real_clEnqueueReadBuffer(ldOclEnv.command_queue,
                                   (cl_mem) buffer_handle,
                                   CL_TRUE,
                                   0, size, buffer,
                                   0, NULL, NULL));
do_release:
  clCheck(real_clReleaseMemObject((cl_mem) buffer_handle));
  
  return 1;
}
Example #9
void *ocl_setParameterValue (struct ld_kernel_s *ldKernel,
                             struct ld_kern_param_s *ldParam,
                             void *buffer,  size_t size)
{
  cl_mem mem_obj = (void *) -1;
  cl_int errcode_ret;

  if (ldParam->is_pointer) {
    DEFAULT_SIZE(size)
    
    mem_obj = real_clCreateBuffer(ldOclEnv.context, CL_MEM_READ_WRITE, size, NULL, clck_(&errcode_ret));

    clCheck(real_clEnqueueWriteBuffer(ldOclEnv.command_queue,
                                      (cl_mem) mem_obj,
                                      CL_TRUE,
                                      0, size, buffer,
                                      0, NULL, NULL));
    buffer = &mem_obj;
    size = sizeof(cl_mem);
  }

  if (size == 0 && strstr(ldParam->name, "_tex")) {
    cl_image_format format = {CL_R, CL_UNSIGNED_INT32};
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
    mem_obj = clCreateImage2D (ldOclEnv.context, CL_MEM_READ_ONLY, &format, 100, 1, 0, NULL, clck_(&errcode_ret));
#pragma GCC diagnostic pop
    buffer = &mem_obj;
    size = sizeof(cl_mem);
  }
  
  clCheck(real_clSetKernelArg ((cl_kernel) ldKernel->handle, ldParam->index,
                               size, buffer));

  return mem_obj;
}
Example #10
// copies array from GPU to CPU
void gpuCopy_from_device_realw (gpu_realw_mem *d_array_addr_ptr, realw *h_array, int size) {

  TRACE ("gpuCopy_from_device_realw");

  // copies memory from GPU back to CPU
#ifdef USE_OPENCL
  if (run_opencl) {
    // blocking copy
    clCheck (clEnqueueReadBuffer (mocl.command_queue, d_array_addr_ptr->ocl, CL_TRUE, 0, sizeof (realw) * size, h_array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    // note: cudaMemcpy implicitly synchronizes all other cuda operations
    print_CUDA_error_if_any(cudaMemcpy(h_array,d_array_addr_ptr->cuda, sizeof(realw)*size, cudaMemcpyDeviceToHost),33001);
  }
#endif
}
Example #11
// setup functions
void gpuSetConst (gpu_realw_mem *buffer, size_t size, realw *array) {

  TRACE ("gpuSetConst");

  // allocates array on GPU
#ifdef USE_OPENCL
  if (run_opencl) {
    cl_int errcode;
    buffer->ocl = clCreateBuffer (mocl.context, CL_MEM_READ_ONLY, size * sizeof(realw), NULL, clck_(&errcode));
    clCheck (clEnqueueWriteBuffer (mocl.command_queue, buffer->ocl, CL_TRUE, 0, size * sizeof(realw), array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    print_CUDA_error_if_any(cudaMalloc(&buffer->cuda, size * sizeof(realw)), 1400);
    print_CUDA_error_if_any(cudaMemcpy(buffer->cuda, array, size * sizeof(realw), cudaMemcpyHostToDevice),1401);
  }
#endif
}
Example #12
// copies integer array from CPU host to GPU device
void gpuCopy_todevice_int (gpu_int_mem *d_array_addr_ptr, int *h_array, int size) {

  TRACE ("gpuCopy_todevice_int");

  // copies memory from CPU to GPU
  // uses blocking copies
#ifdef USE_OPENCL
  if (run_opencl) {
    // copies values onto GPU
    clCheck (clEnqueueWriteBuffer (mocl.command_queue, d_array_addr_ptr->ocl, CL_TRUE, 0, size * sizeof (int), h_array, 0, NULL, NULL));
  }
#endif
#ifdef USE_CUDA
  if (run_cuda) {
    // copies values onto GPU
    print_CUDA_error_if_any(cudaMemcpy((int*) d_array_addr_ptr->cuda,h_array,size*sizeof(int),cudaMemcpyHostToDevice),22003);
  }
#endif
}
Example #13
int ocl_triggerKernelExecution (struct ld_kernel_s *ldKernel,
                                const struct work_size_s *work_sizes,
                                unsigned int work_dim)
{
  static size_t global_work_size[MAX_WORK_DIM], local_work_size[MAX_WORK_DIM];
  unsigned int i;
  
  for (i = 0; i < work_dim; i++) {
    local_work_size[i] = work_sizes->local[i];
    global_work_size[i] = work_sizes->global[i] * work_sizes->local[i];
  }
  
  clCheck(real_clEnqueueNDRangeKernel(ldOclEnv.command_queue, (cl_kernel) ldKernel->handle,
                                      work_dim, NULL,
                                      global_work_size,
                                      local_work_size,
                                      0, NULL, NULL));
  return 1;
}