d_void * acc_malloc_(size_t device_idx, size_t n) {
#if PRINT_INFO
  printf("[info]  acc_malloc_(size_t device_idx = %u, size_t n = %d)\n", device_idx, n);
#endif

  cl_int status;
	
  cl_mem buffer = clCreateBuffer(
    /* cl_context context  */ acc_runtime.opencl_data->devices_data[device_idx]->context,
    /* cl_mem_flags flags  */ CL_MEM_READ_WRITE,
    /* size_t size         */ n,
    /* void *host_ptr      */ NULL,
    /* cl_int *errcode_ret */ &status
  );
  if (status != CL_SUCCESS) {
    const char * status_str = acc_ocl_status_to_char(status);
    printf("[fatal]   clCreateBuffer return %s for device %u and size %u.\n", status_str, device_idx, (unsigned)n);
    exit(-1); /// \todo error code
  }

#if PRINT_INFO
  printf("[info]      return %X\n", buffer);
#endif

  return (d_void *)buffer;
}
void acc_free_(size_t device_idx, d_void * dev_ptr) {
  cl_int status = clReleaseMemObject((cl_mem)dev_ptr);
  if (status != CL_SUCCESS) {
    const char * status_str = acc_ocl_status_to_char(status);
    printf("[fatal]   clReleaseMemObject return %s for device ptr = %x.\n", status_str, dev_ptr);
    exit(-1); /// \todo error code
  }
}
void acc_memcpy_from_device_(size_t device_idx, h_void * dest, d_void * src, size_t bytes) {
  cl_event event;

  cl_int status = clEnqueueReadBuffer (
    /* cl_command_queue command_queue */ acc_runtime.opencl_data->devices_data[device_idx]->command_queue,
    /* cl_mem buffer */ (cl_mem)src,
    /* cl_bool blocking_read */ CL_FALSE,
    /* size_t offset */ 0,
    /* size_t cb */ bytes,
    /* void *ptr */ dest,
    /* cl_uint num_events_in_wait_list */ 0,
    /* const cl_event *event_wait_list */ NULL,
    /* cl_event *event */ &event
  );
  if (status != CL_SUCCESS) {
    char * status_str = acc_ocl_status_to_char(status);
    printf("[fatal] clEnqueueReadBuffer return %s for device ptr = %X to host ptr = %X of size %u.\n", status_str, (unsigned int)src, (unsigned int)dest, (unsigned int)bytes);
    exit(-1); /// \todo error code
  }

  acc_profiling_register_memcpy_from_device(event, device_idx, dest, src, bytes);
}
void acc_enqueue_kernel(acc_region_t region, acc_kernel_t kernel) {
#if PRINT_INFO
  printf("[info]  acc_enqueue_kernel\n");
#endif

  unsigned dev_idx;
  for (dev_idx = 0; dev_idx < region->num_devices; dev_idx++) {
    assert(region->devices[dev_idx].num_gang > 0);
    assert(region->devices[dev_idx].num_worker > 0);
    assert(region->devices[dev_idx].vector_length > 0);

    size_t device_idx = region->devices[dev_idx].device_idx;
    assert(acc_runtime.opencl_data->devices_data[device_idx] != NULL);

    // Create a default context
    acc_context_t context = acc_create_context(region, kernel, device_idx);

    // If nothing have to be done on this device the context is NULL.
    if (context == NULL) continue;

    // Look for a matching ‭version of the kernel, fill the context according to the selected version
    cl_kernel ocl_kernel = acc_build_ocl_kernel(region, kernel, context, device_idx);

    cl_int status;
    cl_uint idx = 0;
    unsigned i, j, k, l;

    // Set params kernel arguments 
    for (i = 0; i < kernel->desc->num_params; i++) {
      status = clSetKernelArg(ocl_kernel, idx, kernel->desc->size_params[i], kernel->param_ptrs[i]);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u (scalar #%u).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Set scalar kernel arguments 
    for (i = 0; i < kernel->desc->num_scalars; i++) {
      status = clSetKernelArg(ocl_kernel, idx, kernel->desc->size_scalars[i], kernel->scalar_ptrs[i]);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u (scalar #%u).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Set data kernel argument
    for (i = 0; i < kernel->desc->num_datas; i++) {
      assert(kernel->data_ptrs[i] != NULL);

      h_void * h_data_ptr = kernel->data_ptrs[i];
      size_t n = kernel->data_size[i];

      acc_distributed_data(region, device_idx, &h_data_ptr, &n);

      d_void * d_data_ptr = acc_deviceptr_(device_idx, h_data_ptr);
      assert(d_data_ptr != NULL);
      status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &(d_data_ptr));
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u (data #%u).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;

      // if data is distributed need to provide the offset
      for (j = 0; j < region->desc->num_distributed_data; j++)
        if (kernel->data_ptrs[i] == region->distributed_data[j].ptr)
          break;
      if (j < region->desc->num_distributed_data && region->desc->distributed_data[j].mode != e_all) {
#if PRINT_INFO
        printf("[info]    region[%u].kernel[%u] on device #%u  data #%u is distributed.\n",
                    region->desc->id, kernel->desc->id, device_idx, i
                );
#endif
        assert( region->desc->distributed_data[j].mode == e_contiguous &&
                region->desc->distributed_data[j].nbr_dev == region->num_devices &&
                region->desc->distributed_data[j].portions != NULL
              );

        for (k = 0; k < region->num_devices; k++)
          if (region->devices[k].device_idx == device_idx)
            break;
        assert(k < region->num_devices);

        unsigned sum_portions = 0;
        unsigned prev_portion = 0;
        for (l = 0; l < region->num_devices; l++) {
          sum_portions += region->desc->distributed_data[j].portions[l];
          if (l < k)
            prev_portion += region->desc->distributed_data[j].portions[l];
        };

        int offset = (region->distributed_data[j].size * prev_portion) / sum_portions;

#if PRINT_INFO
        printf("[info]        sum_portions = %d\n", sum_portions);
        printf("[info]        prev_portion = %d\n", prev_portion);
        printf("[info]        offset       = %d\n", offset);
#endif

        status = clSetKernelArg(ocl_kernel, idx, sizeof(int), &offset);
        if (status != CL_SUCCESS) {
          const char * status_str = acc_ocl_status_to_char(status);
          printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u: offset for distributed data %u.\n",
                    status_str, region->desc->id, kernel->desc->id, idx, i
                );
          exit(-1); /// \todo error code
        }
        idx++;
      }
    }

    // Allocate/copy context in constant memory \todo alloc only copy before launch with event wait
    cl_mem ocl_context = clCreateBuffer( acc_runtime.opencl_data->devices_data[device_idx]->context,
                                         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         sizeof(struct acc_context_t_) + context->num_loop * sizeof(struct acc_kernel_loop_t_),
                                         context, &status );
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clCreateBuffer return %s for region[%u].kernel[%u] when call to build the kernel copy of context.\n",
                status_str, region->desc->id, kernel->desc->id
            );
      exit(-1); /// \todo error code
    }

    // Set context of the kernel
    status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &ocl_context);
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u (context).\n",
                status_str, region->desc->id, kernel->desc->id, idx, i
            );
      exit(-1); /// \todo error code
    }
    idx++;

    assert(acc_runtime.opencl_data->devices_data[device_idx]->command_queue != NULL);

    // Launch the kernel
    size_t global_work_size[1] = { region->devices[dev_idx].num_gang * region->devices[dev_idx].num_worker };
    size_t local_work_size[1] = { region->devices[dev_idx].num_worker };

    cl_event event;

    status = clEnqueueNDRangeKernel(
      acc_runtime.opencl_data->devices_data[device_idx]->command_queue,
      ocl_kernel,
      /* cl_uint work_dim                  = */ 1,
      /* const size_t * global_work_offset = */ NULL,
      /* const size_t * global_work_size   = */ global_work_size,
      /* const size_t * local_work_size    = */ local_work_size,
      /* cl_uint num_events_in_wait_list   = */ 0,
      /* const cl_event * event_wait_list  = */ NULL,
      /* cl_event * event                  = */ &event
    );
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clEnqueueNDRangeKernel return %s for region[%u].kernel[%u].\n",
                status_str, region->desc->id, kernel->desc->id
            );
      exit(-1); /// \todo error code
    }

    acc_profiling_register_kernel_launch(event, device_idx, region->desc->id, kernel->desc->id);
  }
}
Esempio n. 5
0
void acc_enqueue_kernel(acc_region_t region, acc_kernel_t kernel) {
#if DBG_KERNEL
  printf("[debug] acc_enqueue_kernel #%zd\n", kernel->desc->id);
#endif

  size_t i, j, k, l, dev_idx;
  for (dev_idx = 0; dev_idx < region->desc->num_devices; dev_idx++) {
    assert(region->devices[dev_idx].num_gang[0] > 0);
    assert(region->devices[dev_idx].num_gang[1] > 0);
    assert(region->devices[dev_idx].num_gang[2] > 0);
    assert(region->devices[dev_idx].num_worker[0] > 0);
    assert(region->devices[dev_idx].num_worker[1] > 0);
    assert(region->devices[dev_idx].num_worker[2] > 0);
    assert(region->devices[dev_idx].vector_length > 0);

    for (i = 0; i < kernel->desc->num_loops; i++)
      assert(kernel->loops[i].stride != 0);

    size_t device_idx = region->devices[dev_idx].device_idx;
    assert(acc_runtime.opencl_data->devices_data[device_idx] != NULL);

    // Create a default context
    acc_context_t context;

    // Look for a matching ‭version of the kernel, fill the context according to the selected version
    cl_kernel ocl_kernel = acc_build_ocl_kernel(region, kernel, &context, device_idx);

    cl_int status;
    cl_uint idx = 0;

    // Set params kernel arguments 
    for (i = 0; i < kernel->desc->num_params; i++) {
      size_t size_param = region->desc->size_params[kernel->desc->param_ids[i]];
      status = clSetKernelArg(ocl_kernel, idx, size_param, kernel->param_ptrs[i]);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (scalar #%zd).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Set scalar kernel arguments 
    for (i = 0; i < kernel->desc->num_scalars; i++) {
      size_t size_scalar = region->desc->size_scalars[kernel->desc->scalar_ids[i]];
      status = clSetKernelArg(ocl_kernel, idx, size_scalar, kernel->scalar_ptrs[i]);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (scalar #%zd).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Set data kernel argument
    for (i = 0; i < kernel->desc->num_datas; i++) {
      assert(kernel->data_ptrs[i] != NULL);

      h_void * h_data_ptr = kernel->data_ptrs[i];
      size_t n = kernel->data_size[i];

      acc_distributed_data(region, device_idx, &h_data_ptr, &n);

      d_void * d_data_ptr = acc_deviceptr_(device_idx, h_data_ptr);
      if (d_data_ptr == NULL) {
        printf("[fatal]   Cannot find device pointer for %016" PRIxPTR " (%016" PRIxPTR ") on device #%zd for region[%zd].kernel[%zd] argument %u (data #%zd).\n",
                  (uintptr_t)h_data_ptr, (uintptr_t)kernel->data_ptrs[i], device_idx, region->desc->id, kernel->desc->id, idx, i);
        exit(-1); /// \todo error code
      }
      status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &(d_data_ptr));
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (data #%zd).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;

      // if data is distributed need to provide the offset
      for (j = 0; j < region->desc->num_distributed_data; j++)
        if (kernel->desc->data_ids[i] == region->desc->distributed_data[j].id)
          break;
      if (j < region->desc->num_distributed_data) {
#if DBG_KERNEL
        printf("[debug]   region[%zd].kernel[%zd] on device #%zd  data #%zd is distributed.\n",
                    region->desc->id, kernel->desc->id, device_idx, i
                );
#endif
        assert( region->desc->distributed_data[j].mode == e_contiguous &&
                region->desc->distributed_data[j].nbr_dev == region->desc->num_devices &&
                region->desc->distributed_data[j].portions != NULL
              );

        for (k = 0; k < region->desc->num_devices; k++)
          if (region->devices[k].device_idx == device_idx)
            break;
        assert(k < region->desc->num_devices);

        unsigned sum_portions = 0;
        unsigned prev_portion = 0;
        for (l = 0; l < region->desc->num_devices; l++) {
          sum_portions += region->desc->distributed_data[j].portions[l];
          if (l < k)
            prev_portion += region->desc->distributed_data[j].portions[l];
        };

        int offset = (region->data[kernel->desc->data_ids[i]].nbr_elements_dominant_dimension * prev_portion) / sum_portions;

#if DBG_KERNEL
        printf("[debug]       sum_portions = %d\n", sum_portions);
        printf("[debug]       prev_portion = %d\n", prev_portion);
        printf("[debug]       offset       = %d\n", offset);
#endif

        status = clSetKernelArg(ocl_kernel, idx, sizeof(int), &offset);
        if (status != CL_SUCCESS) {
          const char * status_str = acc_ocl_status_to_char(status);
          printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u: offset for distributed data %zd.\n",
                    status_str, region->desc->id, kernel->desc->id, idx, i
                );
          exit(-1); /// \todo error code
        }
        idx++;
      }
    }

    // Set private data kernel argument
    for (i = 0; i < kernel->desc->num_privates; i++) {
      status = clSetKernelArg(ocl_kernel, idx, kernel->private_size[i], NULL);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (privatedata #%zd).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Allocate/copy context in constant memory \todo alloc only copy before launch with event wait
    cl_mem ocl_context = clCreateBuffer( acc_runtime.opencl_data->devices_data[device_idx]->context,
                                         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         sizeof(struct acc_context_t_) + 2 * (context->num_loops + context->num_tiles) * sizeof(long),
                                         context, &status );
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clCreateBuffer return %s for region[%zd].kernel[%zd] when call to build the kernel copy of context.\n",
                status_str, region->desc->id, kernel->desc->id
            );
      exit(-1); /// \todo error code
    }

    free(context); // Not needed anymore

    // Set context of the kernel
    status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &ocl_context);
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (context).\n",
                status_str, region->desc->id, kernel->desc->id, idx, i
            );
      exit(-1); /// \todo error code
    }
    idx++;

    assert(acc_runtime.opencl_data->devices_data[device_idx]->command_queue != NULL);

    // Launch the kernel
    size_t work_dim = 0;
    if      (region->devices[dev_idx].num_gang[2] > 1 || region->devices[dev_idx].num_worker[2] > 1) work_dim = 3;
    else if (region->devices[dev_idx].num_gang[1] > 1 || region->devices[dev_idx].num_worker[1] > 1) work_dim = 2;
    else if (region->devices[dev_idx].num_gang[0] > 1 || region->devices[dev_idx].num_worker[0] > 1) work_dim = 1;
    assert(work_dim > 0);

    size_t global_work_size[3] = {
                                   region->devices[dev_idx].num_gang[0] * region->devices[dev_idx].num_worker[0],
                                   region->devices[dev_idx].num_gang[1] * region->devices[dev_idx].num_worker[1],
                                   region->devices[dev_idx].num_gang[2] * region->devices[dev_idx].num_worker[2]
                                 };
    size_t local_work_size[3] =  {
                                   region->devices[dev_idx].num_worker[0],
                                   region->devices[dev_idx].num_worker[1],
                                   region->devices[dev_idx].num_worker[2]
                                 };

#if DBG_KERNEL
    printf("[debug]              work_dim = %zd\n", work_dim);
    printf("[debug]   global_work_size[3] = {%zd,%zd,%zd} (= %zd)\n", global_work_size[0], global_work_size[1], global_work_size[2], global_work_size[0] * global_work_size[1] * global_work_size[2]);
    printf("[debug]    local_work_size[3] = {%zd,%zd,%zd} (= %zd)\n", local_work_size[0], local_work_size[1], local_work_size[2], local_work_size[0] * local_work_size[1] * local_work_size[2]);

    cl_ulong kernel_local_mem_size = 0;
    status = clGetKernelWorkGroupInfo(ocl_kernel, acc_runtime.opencl_data->devices[0][device_idx], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &kernel_local_mem_size, NULL);
    printf("[debug]   kernel_local_mem_size = %lu\n", kernel_local_mem_size);

    size_t kernel_work_group_size = 0;
    status = clGetKernelWorkGroupInfo(ocl_kernel, acc_runtime.opencl_data->devices[0][device_idx], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_work_group_size, NULL);
    printf("[debug]   kernel_work_group_size = %lu\n", kernel_work_group_size);
#endif

    cl_event event;

    status = clEnqueueNDRangeKernel(
      acc_runtime.opencl_data->devices_data[device_idx]->command_queue,
      ocl_kernel,
      /* cl_uint work_dim                  = */ work_dim,
      /* const size_t * global_work_offset = */ NULL,
      /* const size_t * global_work_size   = */ global_work_size,
      /* const size_t * local_work_size    = */ local_work_size,
      /* cl_uint num_events_in_wait_list   = */ 0,
      /* const cl_event * event_wait_list  = */ NULL,
      /* cl_event * event                  = */ &event
    );
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clEnqueueNDRangeKernel return %s for region[%zd].kernel[%zd].\n",
                status_str, region->desc->id, kernel->desc->id
            );
      assert(0); /// \todo error code
    }

    acc_profiling_register_kernel_launch(event, device_idx, region->desc->id, kernel->desc->id);

    clReleaseMemObject(ocl_context);

    clReleaseKernel(ocl_kernel);
  }
}
Esempio n. 6
0
void acc_region_init(struct acc_region_t_ * region) {
#if DBG_REGION
  printf("[debug]  acc_region_init\n");
#endif

  size_t region_id;
  for (region_id = 0; region_id < compiler_data.num_regions; region_id++)
    if (compiler_data.regions[region_id]->id == region->desc->id)
      break;
  assert(region_id < compiler_data.num_regions);

  char * ocl_sources[2] = {
    acc_runtime.opencl_data->runtime_sources,
    acc_runtime.opencl_data->region_sources[region_id]
  };

  char build_options[1024];
  build_options[0] = '\0';
  strcpy(build_options, "-I");
  strcat(build_options, compiler_data.acc_inc_path);
  strcat(build_options, " ");

  assert(compiler_data.regions[region_id]->num_options == 0 || compiler_data.regions[region_id]->options != NULL);

  unsigned i;
  for (i = 0; i < compiler_data.regions[region_id]->num_options; i++) {
    assert(strlen(build_options) < 512);
    strcat(build_options, compiler_data.regions[region_id]->options[i]);
    strcat(build_options, " ");
  }

  cl_int status;

  unsigned idx;
  for (idx = 0; idx < region->desc->num_devices; idx ++) {
    size_t device_idx = region->devices[idx].device_idx;

    assert(acc_runtime.opencl_data->devices_data[device_idx] != NULL);

    if (acc_runtime.opencl_data->devices_data[device_idx]->programs[region_id] == NULL) {
      cl_context * context = &(acc_runtime.opencl_data->devices_data[device_idx]->context);
      cl_program * program = &(acc_runtime.opencl_data->devices_data[device_idx]->programs[region_id]);

      *program = clCreateProgramWithSource(*context, 2, (const char **)ocl_sources, NULL, &status);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clCreateProgramWithSource on %s (#%u) for region %zd return %s : failed\n",
               acc_device_name[acc_runtime.curr_device_type], acc_runtime.curr_device_num, region_id, status_str);
        exit(-1);
      }

      status = clBuildProgram(*program, 1, &(acc_runtime.opencl_data->devices[0][device_idx]), build_options, NULL, NULL);
      if (status == CL_BUILD_PROGRAM_FAILURE)
        acc_dbg_ocl_build_log(device_idx, *program);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clBuildProgram on %s (#%u) for region %zd return %s\n",
               acc_device_name[acc_runtime.curr_device_type], acc_runtime.curr_device_num, region_id, status_str);
        exit(-1);
      }

      cl_build_status build_status;
      status = clGetProgramBuildInfo(*program, acc_runtime.opencl_data->devices[0][device_idx], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clGetProgramBuildInfo return %s\n", status_str);
        exit(-1); /// \todo error code
      }
      switch (build_status) {
        case CL_BUILD_NONE:
          printf("[fatal]   clGetProgramBuildInfo: build status: CL_BUILD_NONE.\n");
          exit(-1);
        case CL_BUILD_ERROR:
          printf("[fatal]   clGetProgramBuildInfo: build status: CL_BUILD_ERROR.\n");
          acc_dbg_ocl_build_log(device_idx, *program);
          exit(-1);
        case CL_BUILD_SUCCESS:
#if BUILD_LOG
          printf("[info]    clGetProgramBuildInfo: build status: CL_BUILD_SUCCESS.\n");
          acc_dbg_ocl_build_log(device_idx, *program);
#endif
          break;
        case CL_BUILD_IN_PROGRESS: // Should not append as we do not provide a callback function to clBuildProgram
          printf("[fatal]   clGetProgramBuildInfo: build status: CL_BUILD_IN_PROGRESS.\n"\
                 "[fatal]                          It should not append as we do not provide a callback function to clBuildProgram");
        default:
          assert(0);
      }
    }
  }
}