void acc_update_self_(size_t device_idx, h_void * host_ptr, size_t n) {
  assert(acc_is_present_(device_idx, host_ptr, n));

  d_void * dev_ptr = acc_deviceptr_(device_idx, host_ptr);

  acc_memcpy_from_device_(device_idx, host_ptr, dev_ptr, n);
}
void acc_delete_(size_t device_idx, h_void * host_ptr, size_t n) {
  assert(acc_is_present_(device_idx, host_ptr, n));

  d_void * dev_ptr = acc_deviceptr_(device_idx, host_ptr);

  acc_unmap_data_(device_idx, host_ptr);

  acc_free_(device_idx, dev_ptr);
}
void acc_enqueue_kernel(acc_region_t region, acc_kernel_t kernel) {
#if PRINT_INFO
  printf("[info]  acc_enqueue_kernel\n");
#endif

  unsigned dev_idx;
  for (dev_idx = 0; dev_idx < region->num_devices; dev_idx++) {
    assert(region->devices[dev_idx].num_gang > 0);
    assert(region->devices[dev_idx].num_worker > 0);
    assert(region->devices[dev_idx].vector_length > 0);

    size_t device_idx = region->devices[dev_idx].device_idx;
    assert(acc_runtime.opencl_data->devices_data[device_idx] != NULL);

    // Create a default context
    acc_context_t context = acc_create_context(region, kernel, device_idx);

    // If nothing have to be done on this device the context is NULL.
    if (context == NULL) continue;

    // Look for a matching ‭version of the kernel, fill the context according to the selected version
    cl_kernel ocl_kernel = acc_build_ocl_kernel(region, kernel, context, device_idx);

    cl_int status;
    cl_uint idx = 0;
    unsigned i, j, k, l;

    // Set params kernel arguments 
    for (i = 0; i < kernel->desc->num_params; i++) {
      status = clSetKernelArg(ocl_kernel, idx, kernel->desc->size_params[i], kernel->param_ptrs[i]);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u (scalar #%u).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Set scalar kernel arguments 
    for (i = 0; i < kernel->desc->num_scalars; i++) {
      status = clSetKernelArg(ocl_kernel, idx, kernel->desc->size_scalars[i], kernel->scalar_ptrs[i]);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u (scalar #%u).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Set data kernel argument
    for (i = 0; i < kernel->desc->num_datas; i++) {
      assert(kernel->data_ptrs[i] != NULL);

      h_void * h_data_ptr = kernel->data_ptrs[i];
      size_t n = kernel->data_size[i];

      acc_distributed_data(region, device_idx, &h_data_ptr, &n);

      d_void * d_data_ptr = acc_deviceptr_(device_idx, h_data_ptr);
      assert(d_data_ptr != NULL);
      status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &(d_data_ptr));
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u (data #%u).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;

      // if data is distributed need to provide the offset
      for (j = 0; j < region->desc->num_distributed_data; j++)
        if (kernel->data_ptrs[i] == region->distributed_data[j].ptr)
          break;
      if (j < region->desc->num_distributed_data && region->desc->distributed_data[j].mode != e_all) {
#if PRINT_INFO
        printf("[info]    region[%u].kernel[%u] on device #%u  data #%u is distributed.\n",
                    region->desc->id, kernel->desc->id, device_idx, i
                );
#endif
        assert( region->desc->distributed_data[j].mode == e_contiguous &&
                region->desc->distributed_data[j].nbr_dev == region->num_devices &&
                region->desc->distributed_data[j].portions != NULL
              );

        for (k = 0; k < region->num_devices; k++)
          if (region->devices[k].device_idx == device_idx)
            break;
        assert(k < region->num_devices);

        unsigned sum_portions = 0;
        unsigned prev_portion = 0;
        for (l = 0; l < region->num_devices; l++) {
          sum_portions += region->desc->distributed_data[j].portions[l];
          if (l < k)
            prev_portion += region->desc->distributed_data[j].portions[l];
        };

        int offset = (region->distributed_data[j].size * prev_portion) / sum_portions;

#if PRINT_INFO
        printf("[info]        sum_portions = %d\n", sum_portions);
        printf("[info]        prev_portion = %d\n", prev_portion);
        printf("[info]        offset       = %d\n", offset);
#endif

        status = clSetKernelArg(ocl_kernel, idx, sizeof(int), &offset);
        if (status != CL_SUCCESS) {
          const char * status_str = acc_ocl_status_to_char(status);
          printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u: offset for distributed data %u.\n",
                    status_str, region->desc->id, kernel->desc->id, idx, i
                );
          exit(-1); /// \todo error code
        }
        idx++;
      }
    }

    // Allocate/copy context in constant memory \todo alloc only copy before launch with event wait
    cl_mem ocl_context = clCreateBuffer( acc_runtime.opencl_data->devices_data[device_idx]->context,
                                         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         sizeof(struct acc_context_t_) + context->num_loop * sizeof(struct acc_kernel_loop_t_),
                                         context, &status );
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clCreateBuffer return %s for region[%u].kernel[%u] when call to build the kernel copy of context.\n",
                status_str, region->desc->id, kernel->desc->id
            );
      exit(-1); /// \todo error code
    }

    // Set context of the kernel
    status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &ocl_context);
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clSetKernelArg return %s for region[%u].kernel[%u] argument %u (context).\n",
                status_str, region->desc->id, kernel->desc->id, idx, i
            );
      exit(-1); /// \todo error code
    }
    idx++;

    assert(acc_runtime.opencl_data->devices_data[device_idx]->command_queue != NULL);

    // Launch the kernel
    size_t global_work_size[1] = { region->devices[dev_idx].num_gang * region->devices[dev_idx].num_worker };
    size_t local_work_size[1] = { region->devices[dev_idx].num_worker };

    cl_event event;

    status = clEnqueueNDRangeKernel(
      acc_runtime.opencl_data->devices_data[device_idx]->command_queue,
      ocl_kernel,
      /* cl_uint work_dim                  = */ 1,
      /* const size_t * global_work_offset = */ NULL,
      /* const size_t * global_work_size   = */ global_work_size,
      /* const size_t * local_work_size    = */ local_work_size,
      /* cl_uint num_events_in_wait_list   = */ 0,
      /* const cl_event * event_wait_list  = */ NULL,
      /* cl_event * event                  = */ &event
    );
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clEnqueueNDRangeKernel return %s for region[%u].kernel[%u].\n",
                status_str, region->desc->id, kernel->desc->id
            );
      exit(-1); /// \todo error code
    }

    acc_profiling_register_kernel_launch(event, device_idx, region->desc->id, kernel->desc->id);
  }
}
d_void * acc_present_or_copyout_(size_t device_idx, h_void * host_ptr, size_t n) {
  /// \todo incorrect need to check if {host_ptr, n} is allocated on top of the data-env
  if (acc_is_present_(device_idx, host_ptr, n))
    acc_copyout_(device_idx, host_ptr, n);
  return acc_deviceptr_(device_idx, host_ptr); /// should return NULL if previous executed
}
d_void * acc_present_or_create_(size_t device_idx, h_void * host_ptr, size_t n) {
  if (acc_is_present_(device_idx, host_ptr, n))
    return acc_deviceptr_(device_idx, host_ptr);
  else
    return acc_create_(device_idx, host_ptr, n);
}
Example #6
0
void acc_enqueue_kernel(acc_region_t region, acc_kernel_t kernel) {
#if DBG_KERNEL
  printf("[debug] acc_enqueue_kernel #%zd\n", kernel->desc->id);
#endif

  size_t i, j, k, l, dev_idx;
  for (dev_idx = 0; dev_idx < region->desc->num_devices; dev_idx++) {
    assert(region->devices[dev_idx].num_gang[0] > 0);
    assert(region->devices[dev_idx].num_gang[1] > 0);
    assert(region->devices[dev_idx].num_gang[2] > 0);
    assert(region->devices[dev_idx].num_worker[0] > 0);
    assert(region->devices[dev_idx].num_worker[1] > 0);
    assert(region->devices[dev_idx].num_worker[2] > 0);
    assert(region->devices[dev_idx].vector_length > 0);

    for (i = 0; i < kernel->desc->num_loops; i++)
      assert(kernel->loops[i].stride != 0);

    size_t device_idx = region->devices[dev_idx].device_idx;
    assert(acc_runtime.opencl_data->devices_data[device_idx] != NULL);

    // Create a default context
    acc_context_t context;

    // Look for a matching ‭version of the kernel, fill the context according to the selected version
    cl_kernel ocl_kernel = acc_build_ocl_kernel(region, kernel, &context, device_idx);

    cl_int status;
    cl_uint idx = 0;

    // Set params kernel arguments 
    for (i = 0; i < kernel->desc->num_params; i++) {
      size_t size_param = region->desc->size_params[kernel->desc->param_ids[i]];
      status = clSetKernelArg(ocl_kernel, idx, size_param, kernel->param_ptrs[i]);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (scalar #%zd).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Set scalar kernel arguments 
    for (i = 0; i < kernel->desc->num_scalars; i++) {
      size_t size_scalar = region->desc->size_scalars[kernel->desc->scalar_ids[i]];
      status = clSetKernelArg(ocl_kernel, idx, size_scalar, kernel->scalar_ptrs[i]);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (scalar #%zd).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Set data kernel argument
    for (i = 0; i < kernel->desc->num_datas; i++) {
      assert(kernel->data_ptrs[i] != NULL);

      h_void * h_data_ptr = kernel->data_ptrs[i];
      size_t n = kernel->data_size[i];

      acc_distributed_data(region, device_idx, &h_data_ptr, &n);

      d_void * d_data_ptr = acc_deviceptr_(device_idx, h_data_ptr);
      if (d_data_ptr == NULL) {
        printf("[fatal]   Cannot find device pointer for %016" PRIxPTR " (%016" PRIxPTR ") on device #%zd for region[%zd].kernel[%zd] argument %u (data #%zd).\n",
                  (uintptr_t)h_data_ptr, (uintptr_t)kernel->data_ptrs[i], device_idx, region->desc->id, kernel->desc->id, idx, i);
        exit(-1); /// \todo error code
      }
      status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &(d_data_ptr));
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (data #%zd).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;

      // if data is distributed need to provide the offset
      for (j = 0; j < region->desc->num_distributed_data; j++)
        if (kernel->desc->data_ids[i] == region->desc->distributed_data[j].id)
          break;
      if (j < region->desc->num_distributed_data) {
#if DBG_KERNEL
        printf("[debug]   region[%zd].kernel[%zd] on device #%zd  data #%zd is distributed.\n",
                    region->desc->id, kernel->desc->id, device_idx, i
                );
#endif
        assert( region->desc->distributed_data[j].mode == e_contiguous &&
                region->desc->distributed_data[j].nbr_dev == region->desc->num_devices &&
                region->desc->distributed_data[j].portions != NULL
              );

        for (k = 0; k < region->desc->num_devices; k++)
          if (region->devices[k].device_idx == device_idx)
            break;
        assert(k < region->desc->num_devices);

        unsigned sum_portions = 0;
        unsigned prev_portion = 0;
        for (l = 0; l < region->desc->num_devices; l++) {
          sum_portions += region->desc->distributed_data[j].portions[l];
          if (l < k)
            prev_portion += region->desc->distributed_data[j].portions[l];
        };

        int offset = (region->data[kernel->desc->data_ids[i]].nbr_elements_dominant_dimension * prev_portion) / sum_portions;

#if DBG_KERNEL
        printf("[debug]       sum_portions = %d\n", sum_portions);
        printf("[debug]       prev_portion = %d\n", prev_portion);
        printf("[debug]       offset       = %d\n", offset);
#endif

        status = clSetKernelArg(ocl_kernel, idx, sizeof(int), &offset);
        if (status != CL_SUCCESS) {
          const char * status_str = acc_ocl_status_to_char(status);
          printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u: offset for distributed data %zd.\n",
                    status_str, region->desc->id, kernel->desc->id, idx, i
                );
          exit(-1); /// \todo error code
        }
        idx++;
      }
    }

    // Set private data kernel argument
    for (i = 0; i < kernel->desc->num_privates; i++) {
      status = clSetKernelArg(ocl_kernel, idx, kernel->private_size[i], NULL);
      if (status != CL_SUCCESS) {
        const char * status_str = acc_ocl_status_to_char(status);
        printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (privatedata #%zd).\n",
                  status_str, region->desc->id, kernel->desc->id, idx, i
              );
        exit(-1); /// \todo error code
      }
      idx++;
    }

    // Allocate/copy context in constant memory \todo alloc only copy before launch with event wait
    cl_mem ocl_context = clCreateBuffer( acc_runtime.opencl_data->devices_data[device_idx]->context,
                                         CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         sizeof(struct acc_context_t_) + 2 * (context->num_loops + context->num_tiles) * sizeof(long),
                                         context, &status );
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clCreateBuffer return %s for region[%zd].kernel[%zd] when call to build the kernel copy of context.\n",
                status_str, region->desc->id, kernel->desc->id
            );
      exit(-1); /// \todo error code
    }

    free(context); // Not needed anymore

    // Set context of the kernel
    status = clSetKernelArg(ocl_kernel, idx, sizeof(cl_mem), &ocl_context);
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clSetKernelArg return %s for region[%zd].kernel[%zd] argument %u (context).\n",
                status_str, region->desc->id, kernel->desc->id, idx, i
            );
      exit(-1); /// \todo error code
    }
    idx++;

    assert(acc_runtime.opencl_data->devices_data[device_idx]->command_queue != NULL);

    // Launch the kernel
    size_t work_dim = 0;
    if      (region->devices[dev_idx].num_gang[2] > 1 || region->devices[dev_idx].num_worker[2] > 1) work_dim = 3;
    else if (region->devices[dev_idx].num_gang[1] > 1 || region->devices[dev_idx].num_worker[1] > 1) work_dim = 2;
    else if (region->devices[dev_idx].num_gang[0] > 1 || region->devices[dev_idx].num_worker[0] > 1) work_dim = 1;
    assert(work_dim > 0);

    size_t global_work_size[3] = {
                                   region->devices[dev_idx].num_gang[0] * region->devices[dev_idx].num_worker[0],
                                   region->devices[dev_idx].num_gang[1] * region->devices[dev_idx].num_worker[1],
                                   region->devices[dev_idx].num_gang[2] * region->devices[dev_idx].num_worker[2]
                                 };
    size_t local_work_size[3] =  {
                                   region->devices[dev_idx].num_worker[0],
                                   region->devices[dev_idx].num_worker[1],
                                   region->devices[dev_idx].num_worker[2]
                                 };

#if DBG_KERNEL
    printf("[debug]              work_dim = %zd\n", work_dim);
    printf("[debug]   global_work_size[3] = {%zd,%zd,%zd} (= %zd)\n", global_work_size[0], global_work_size[1], global_work_size[2], global_work_size[0] * global_work_size[1] * global_work_size[2]);
    printf("[debug]    local_work_size[3] = {%zd,%zd,%zd} (= %zd)\n", local_work_size[0], local_work_size[1], local_work_size[2], local_work_size[0] * local_work_size[1] * local_work_size[2]);

    cl_ulong kernel_local_mem_size = 0;
    status = clGetKernelWorkGroupInfo(ocl_kernel, acc_runtime.opencl_data->devices[0][device_idx], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &kernel_local_mem_size, NULL);
    printf("[debug]   kernel_local_mem_size = %lu\n", kernel_local_mem_size);

    size_t kernel_work_group_size = 0;
    status = clGetKernelWorkGroupInfo(ocl_kernel, acc_runtime.opencl_data->devices[0][device_idx], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernel_work_group_size, NULL);
    printf("[debug]   kernel_work_group_size = %lu\n", kernel_work_group_size);
#endif

    cl_event event;

    status = clEnqueueNDRangeKernel(
      acc_runtime.opencl_data->devices_data[device_idx]->command_queue,
      ocl_kernel,
      /* cl_uint work_dim                  = */ work_dim,
      /* const size_t * global_work_offset = */ NULL,
      /* const size_t * global_work_size   = */ global_work_size,
      /* const size_t * local_work_size    = */ local_work_size,
      /* cl_uint num_events_in_wait_list   = */ 0,
      /* const cl_event * event_wait_list  = */ NULL,
      /* cl_event * event                  = */ &event
    );
    if (status != CL_SUCCESS) {
      const char * status_str = acc_ocl_status_to_char(status);
      printf("[fatal]   clEnqueueNDRangeKernel return %s for region[%zd].kernel[%zd].\n",
                status_str, region->desc->id, kernel->desc->id
            );
      assert(0); /// \todo error code
    }

    acc_profiling_register_kernel_launch(event, device_idx, region->desc->id, kernel->desc->id);

    clReleaseMemObject(ocl_context);

    clReleaseKernel(ocl_kernel);
  }
}