예제 #1
0
파일: devices.c 프로젝트: NatTuck/pocl
void 
pocl_init_devices()
{
  const char *device_list;
  char *ptr, *tofree, *token, *saveptr;
  int i, devcount;
  if (pocl_num_devices > 0)
    return;
  
  if (getenv(POCL_DEVICES_ENV) != NULL) 
    {
      device_list = getenv(POCL_DEVICES_ENV);
    }
  else
    {
      device_list = "pthread";
    }
  
  ptr = tofree = strdup(device_list);
  while ((token = strtok_r (ptr, " ", &saveptr)) != NULL)
    {
      ++pocl_num_devices;
      ptr = NULL;
    }
  free (tofree);

  pocl_devices = malloc (pocl_num_devices * sizeof *pocl_devices);

  ptr = tofree = strdup(device_list);
  devcount = 0;
  while ((token = strtok_r (ptr, " ", &saveptr)) != NULL)
    {
      struct _cl_device_id* device_type = NULL;

      for (i = 0; i < POCL_NUM_DEVICE_TYPES; ++i)
        {
          if (strcmp(pocl_device_types[i].name, token) == 0)
            {
              /* Check if there are device-specific parameters set in the
                 POCL_DEVICEn_PARAMETERS env. */
              char env_name[1024];
              
              if (snprintf (env_name, 1024, "POCL_DEVICE%d_PARAMETERS", devcount) < 0)
                POCL_ABORT("Unable to generate the env string.");

              device_type = &pocl_device_types[i];
              memcpy (&pocl_devices[devcount], device_type, sizeof(struct _cl_device_id));
              pocl_devices[devcount].init(&pocl_devices[devcount], getenv(env_name));
              pocl_devices[devcount].dev_id = devcount;
              devcount++;
              break;
            }
        }
      if (device_type == NULL) 
          POCL_ABORT("device type not found\n");
      ptr = NULL;
    }
  free (tofree);
}
예제 #2
0
파일: pocl_cache.c 프로젝트: glupescu/pocl
void pocl_cache_init_topdir() {

    if (cache_topdir_initialized)
        return;

    const char *tmp_path = pocl_get_string_option("POCL_CACHE_DIR", NULL);
    int needed;

    if (tmp_path && (pocl_exists(tmp_path))) {
        needed = snprintf(cache_topdir, POCL_FILENAME_LENGTH, "%s", tmp_path);
    } else     {
#ifdef POCL_ANDROID
        char* process_name = pocl_get_process_name();
        needed = snprintf(cache_topdir, POCL_FILENAME_LENGTH,
                          "/data/data/%s/cache/", process_name);
        free(process_name);

        if (!pocl_exists(cache_topdir))
            needed = snprintf(cache_topdir,
                              POCL_FILENAME_LENGTH,
                              "/sdcard/pocl/kcache");
#elif defined(_MSC_VER) || defined(__MINGW32__)
        tmp_path = getenv("LOCALAPPDATA");
        if (!tmp_path)
            tmp_path = getenv("TEMP");
        assert(tmp_path);
        needed = snprintf(cache_topdir, POCL_FILENAME_LENGTH,
                          "%s\\pocl", tmp_path);
#else
        // "If $XDG_CACHE_HOME is either not set or empty, a default equal to
        // $HOME/.cache should be used."
        // http://standards.freedesktop.org/basedir-spec/latest/
        tmp_path = getenv("XDG_CACHE_HOME");

        if (tmp_path && tmp_path[0] != '\0') {
            needed = snprintf(cache_topdir, POCL_FILENAME_LENGTH,
                              "%s/pocl/kcache", tmp_path);
        }
        else if ((tmp_path = getenv("HOME")) != NULL) {
            needed = snprintf(cache_topdir, POCL_FILENAME_LENGTH,
                              "%s/.cache/pocl/kcache", tmp_path);
        }
        else {
            needed = snprintf(cache_topdir, POCL_FILENAME_LENGTH,
                              "/tmp/pocl/kcache");
        }
#endif
    }

    if (needed >= POCL_FILENAME_LENGTH) {
        POCL_ABORT("pocl: cache path longer than maximum filename length");
    }

    assert(strlen(cache_topdir) > 0);
    if (pocl_mkdir_p(cache_topdir))
        POCL_ABORT("Could not create topdir for cache");
    cache_topdir_initialized = 1;

}
예제 #3
0
파일: pocl-hsa.c 프로젝트: larsmans/pocl
/* TODO: there's not much to do here, just build the kernel for HSA.
   Perhaps share the same function for all WG sizes in case it's an
   SPMD target. */
static void compile (_cl_command_node *cmd)
{
  int error;
  char bytecode[POCL_FILENAME_LENGTH];
  char objfile[POCL_FILENAME_LENGTH];
  FILE *file;
  char *elf_blob;
  size_t file_size, got_size;
  hsa_runtime_caller_t caller;

  error = snprintf (bytecode, POCL_FILENAME_LENGTH,
                    "%s/%s", cmd->command.run.tmp_dir,
                    POCL_PARALLEL_BC_FILENAME);
  assert (error >= 0);

  error = snprintf (objfile, POCL_FILENAME_LENGTH,
                    "%s/%s.o", cmd->command.run.tmp_dir,
                    POCL_PARALLEL_BC_FILENAME);
  assert (error >= 0);

  error = pocl_llvm_codegen (cmd->command.run.kernel, cmd->device, bytecode, objfile);
  assert (error == 0);

  /* Load the built AMDGPU ELF file. */
  file = fopen (objfile, "rb");
  assert (file != NULL);

  cmd->command.run.device_data = (void**)malloc (sizeof(void*)*2);
  cmd->command.run.device_data[0] = malloc (sizeof(hsa_amd_code_unit_t));
  cmd->command.run.device_data[1] = malloc (sizeof(hsa_amd_code_t));

  file_size = pocl_file_size (file);
  elf_blob = (char*)malloc (file_size);
  got_size = fread (elf_blob, 1, file_size, file);

  if (file_size != got_size)
    POCL_ABORT ("pocl-hsa: could not read the AMD ELF.");

  caller.caller = 0;
  if (hsa_ext_code_unit_load
      (caller, NULL, 0, elf_blob, file_size, NULL, NULL,
       (hsa_amd_code_unit_t*)cmd->command.run.device_data[0]) != HSA_STATUS_SUCCESS)
    {
      POCL_ABORT ("pocl-hsa: error while loading the built AMDGPU ELF binary.");
    }

  if (hsa_ext_code_unit_get_info
      (*(hsa_amd_code_unit_t*)cmd->command.run.device_data[0],
       HSA_EXT_CODE_UNIT_INFO_CODE_ENTITY_CODE, 0,
       (hsa_amd_code_t*)cmd->command.run.device_data[1]) != HSA_STATUS_SUCCESS)
    {
      POCL_ABORT ("pocl-hsa: unable to get the code handle to the kernel.");
    }
  free (elf_blob);
  fclose (file);
}
예제 #4
0
파일: pocl-hsa.c 프로젝트: larsmans/pocl
unsigned int
pocl_hsa_probe(struct pocl_device_ops *ops)
{
  int env_count = pocl_device_get_env_count(ops->device_name);

  POCL_MSG_PRINT_INFO("pocl-hsa: found %d env devices with %s.\n",
                      env_count, ops->device_name);

  /* No hsa env specified, the user did not request for HSA agents. */
  if (env_count <= 0)
    return 0;

  if (hsa_init() != HSA_STATUS_SUCCESS)
    {
      POCL_ABORT("pocl-hsa: hsa_init() failed.");
    }

  if (hsa_iterate_agents(pocl_hsa_get_agents, NULL) !=
      HSA_STATUS_SUCCESS)
    {
      assert (0 && "pocl-hsa: could not get agents.");
    }
  POCL_MSG_PRINT_INFO("pocl-hsa: found %d agents.\n", found_hsa_agents);
  return found_hsa_agents;
}
예제 #5
0
파일: pocl_tracing.c 프로젝트: MoKarma/pocl
static void
text_tracer_init ()
{
  const char *text_tracer_output;

  text_tracer_output = pocl_get_string_option ("POCL_TRACE_EVENT_OPT",
                                          "pocl_trace_events.log");
  text_tracer_file = fopen (text_tracer_output, "w");
  if (!text_tracer_file)
    POCL_ABORT ("Failed to open text tracer output\n");
}
예제 #6
0
파일: pocl-cuda.c 프로젝트: jrprice/pocl
static void
pocl_cuda_abort_on_error (CUresult result, unsigned line, const char *func,
                          const char *code, const char *api)
{
  if (result != CUDA_SUCCESS)
    {
      const char *err_name;
      const char *err_string;
      cuGetErrorName (result, &err_name);
      cuGetErrorString (result, &err_string);
      POCL_MSG_PRINT2 (func, line, "Error during %s\n", api);
      POCL_ABORT ("%s: %s\n", err_name, err_string);
    }
}
예제 #7
0
void pocl_init_queue_list()
{
  POCL_INIT_LOCK(queue_lock);

  POCL_LOCK(queue_lock);
  // will probably never need a realloc, but still
  queue_alloc = QUEUE_ALLOC_SIZE;

  queue_list = calloc(queue_alloc, sizeof(cl_command_queue));

  if (!queue_list)
    POCL_ABORT("unable to allocate queue list!");

  //atexit(pocl_finish_all_queues);

  POCL_UNLOCK(queue_lock);

}
예제 #8
0
void pocl_queue_list_insert(cl_command_queue q)
{
  POCL_LOCK(queue_lock);
  if (queue_size == queue_alloc) {
    // queue is full, try and compact it by removing the deleted queues
    pocl_compact_queue_list();
  }

  if (queue_size == queue_alloc) {
    // compaction failed to give us room
    cl_command_queue *resized = realloc(queue_list, queue_alloc + 256);
    if (!resized)
      POCL_ABORT("failed to enlarge queue list!");
    queue_list = resized;
    queue_alloc += 256;
  }

  queue_list[queue_size++] = q;
  POCL_UNLOCK(queue_lock);
}
예제 #9
0
파일: pocl-hsa.c 프로젝트: larsmans/pocl
void
pocl_hsa_init (cl_device_id device, const char* parameters)
{
  struct data *d;
  static int global_mem_id;
  static int first_hsa_init = 1;
  hsa_device_type_t dev_type;
  hsa_status_t status;

  if (first_hsa_init)
    {
      first_hsa_init = 0;
      global_mem_id = device->dev_id;
    }
  device->global_mem_id = global_mem_id;

  d = (struct data *) calloc (1, sizeof (struct data));

  d->current_kernel = NULL;
  device->data = d;

  assert (found_hsa_agents > 0);

  /* TODO: support controlling multiple agents.
     Now all pocl devices control the same one. */
  d->agent = &hsa_agents[0];

  if (hsa_queue_create(*d->agent, 1, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
                       &d->queue) != HSA_STATUS_SUCCESS)
    {
      POCL_ABORT("pocl-hsa: could not create the queue.");
    }

  /* TODO: replace with HSA calls: */
#if 0
  pocl_topology_detect_device_info(device);
  pocl_cpuinfo_detect_device_info(device);
#endif
  /* TODO: detect with HSA calls: */
  device->max_compute_units = 1;
}
예제 #10
0
파일: devices.c 프로젝트: ochafik/pocl
void 
pocl_init_devices()
{
  static unsigned int init_done = 0;
  static pocl_lock_t pocl_init_lock = POCL_LOCK_INITIALIZER;

  int i, j, dev_index;
  char env_name[1024];
  char dev_name[MAX_DEV_NAME_LEN] = {0};
  unsigned int device_count[POCL_NUM_DEVICE_TYPES];

  if (init_done == 0)
    POCL_INIT_LOCK(pocl_init_lock);
  POCL_LOCK(pocl_init_lock);
  if (init_done) 
    {
      POCL_UNLOCK(pocl_init_lock);
      return;
    }

  /* Set a global debug flag, so we don't have to call pocl_get_bool_option
   * everytime we use the debug macros */
#ifdef POCL_DEBUG_MESSAGES
  pocl_debug_messages = pocl_get_bool_option("POCL_DEBUG", 0);
#endif

  /* Init operations */
  for (i = 0; i < POCL_NUM_DEVICE_TYPES; ++i)
    {
      pocl_devices_init_ops[i](&pocl_device_ops[i]);
      assert(pocl_device_ops[i].device_name != NULL);

      /* Probe and add the result to the number of probbed devices */
      assert(pocl_device_ops[i].probe);
      device_count[i] = pocl_device_ops[i].probe(&pocl_device_ops[i]);
      pocl_num_devices += device_count[i];
    }

  assert(pocl_num_devices > 0);
  pocl_devices = calloc(pocl_num_devices, sizeof(struct _cl_device_id));
  if (pocl_devices == NULL)
    POCL_ABORT("Can not allocate memory for devices\n");

  dev_index = 0;
  /* Init infos for each probbed devices */
  for (i = 0; i < POCL_NUM_DEVICE_TYPES; ++i)
    {
      assert(pocl_device_ops[i].init);
      for (j = 0; j < device_count[i]; ++j)
        {
          pocl_devices[dev_index].ops = &pocl_device_ops[i];
          /* The default value for the global memory space identifier is
             the same as the device id. The device instance can then override 
             it to point to some other device's global memory id in case of
             a shared global memory. */
          pocl_devices[dev_index].global_mem_id = dev_index;
          
          pocl_device_ops[i].init_device_infos(&pocl_devices[dev_index]);

          pocl_device_common_init(&pocl_devices[dev_index]);

          str_toupper(dev_name, pocl_device_ops[i].device_name);
          /* Check if there are device-specific parameters set in the
             POCL_DEVICEn_PARAMETERS env. */
          if (snprintf (env_name, 1024, "POCL_%s%d_PARAMETERS", dev_name, j) < 0)
            POCL_ABORT("Unable to generate the env string.");

          pocl_devices[dev_index].ops->init(&pocl_devices[dev_index], getenv(env_name));

          if (dev_index == 0)
            pocl_devices[dev_index].type |= CL_DEVICE_TYPE_DEFAULT;
          
          ++dev_index;
        }
    }

  init_done = 1;
  POCL_UNLOCK(pocl_init_lock);
}
예제 #11
0
파일: devices.c 프로젝트: larsmans/pocl
void 
pocl_init_devices()
{
  static unsigned int init_done = 0;
  static unsigned int init_in_progress = 0;
  static pocl_lock_t pocl_init_lock = POCL_LOCK_INITIALIZER;

  unsigned i, j, dev_index;
  char env_name[1024];
  char dev_name[MAX_DEV_NAME_LEN] = {0};
  unsigned int device_count[POCL_NUM_DEVICE_TYPES];

  /* This is a workaround to a nasty problem with libhwloc: When
     initializing basic, it calls libhwloc to query device info.
     In case libhwloc has the OpenCL plugin installed, it initializes
     it and it leads to initializing pocl again which leads to an
     infinite loop. */

  if (init_in_progress)
      return;
  init_in_progress = 1;

  if (init_done == 0)
    POCL_INIT_LOCK(pocl_init_lock);
  POCL_LOCK(pocl_init_lock);
  if (init_done) 
    {
      POCL_UNLOCK(pocl_init_lock);
      return;
    }

  /* Set a global debug flag, so we don't have to call pocl_get_bool_option
   * everytime we use the debug macros */
#ifdef POCL_DEBUG_MESSAGES
  pocl_debug_messages = pocl_get_bool_option("POCL_DEBUG", 0);
#endif

  pocl_cache_init_topdir();

  pocl_init_queue_list();

  /* Init operations */
  for (i = 0; i < POCL_NUM_DEVICE_TYPES; ++i)
    {
      pocl_devices_init_ops[i](&pocl_device_ops[i]);
      assert(pocl_device_ops[i].device_name != NULL);

      /* Probe and add the result to the number of probbed devices */
      assert(pocl_device_ops[i].probe);
      device_count[i] = pocl_device_ops[i].probe(&pocl_device_ops[i]);
      pocl_num_devices += device_count[i];
    }

  assert(pocl_num_devices > 0);
  pocl_devices = (struct _cl_device_id*) calloc(pocl_num_devices, sizeof(struct _cl_device_id));
  if (pocl_devices == NULL)
    POCL_ABORT("Can not allocate memory for devices\n");

  dev_index = 0;
  /* Init infos for each probed devices */
  for (i = 0; i < POCL_NUM_DEVICE_TYPES; ++i)
    {
      assert(pocl_device_ops[i].init);
      for (j = 0; j < device_count[i]; ++j)
        {
          pocl_devices[dev_index].ops = &pocl_device_ops[i];
          pocl_devices[dev_index].dev_id = dev_index;
          /* The default value for the global memory space identifier is
             the same as the device id. The device instance can then override 
             it to point to some other device's global memory id in case of
             a shared global memory. */
          pocl_devices[dev_index].global_mem_id = dev_index;
          
          pocl_device_ops[i].init_device_infos(&pocl_devices[dev_index]);

          pocl_device_common_init(&pocl_devices[dev_index]);

          str_toupper(dev_name, pocl_device_ops[i].device_name);
          /* Check if there are device-specific parameters set in the
             POCL_DEVICEn_PARAMETERS env. */
          if (snprintf (env_name, 1024, "POCL_%s%d_PARAMETERS", dev_name, j) < 0)
            POCL_ABORT("Unable to generate the env string.");

          pocl_devices[dev_index].ops->init(&pocl_devices[dev_index], getenv(env_name));

          if (dev_index == 0)
            pocl_devices[dev_index].type |= CL_DEVICE_TYPE_DEFAULT;

          pocl_devices[dev_index].cache_dir_name = strdup(pocl_devices[dev_index].long_name);
          pocl_string_to_dirname(pocl_devices[dev_index].cache_dir_name);
          
          ++dev_index;
        }
    }

  init_done = 1;
  POCL_UNLOCK(pocl_init_lock);
}
예제 #12
0
파일: pocl-hsa.c 프로젝트: larsmans/pocl
void
pocl_hsa_run
(void *data,
 _cl_command_node* cmd)
{
  struct data *d;
  struct pocl_argument *al;
  unsigned i;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  hsa_signal_value_t initial_value = 1;
#if 0
  /* Not yet supported by the reference library. */
  hsa_kernel_dispatch_packet_t kernel_packet;
#else
  hsa_dispatch_packet_t kernel_packet;
#endif
  hsa_signal_t kernel_completion_signal = 0;
  hsa_region_t region;
  int error;
  amdgpu_args_t *args;
  /* 32b word offset in the kernel arguments buffer we can push the next
     argument to. */
  int args_offset = 0;

  assert (data != NULL);
  d = (struct data *) data;

  d->current_kernel = kernel;

  memset (&kernel_packet, 0, sizeof (hsa_dispatch_packet_t));

#if 0
  /* TODO: not yet supported by the open source runtime implementation.
     Assume the HSA Full profile so we can simply use host malloc().
   */
  hsa_agent_iterate_regions(kernel_agent, pocl_hsa_get_kernarg, &region);

  if (hsa_memory_allocate(region, sizeof(amdgpu_args_t),
                          (void**)&args) !=
      HSA_STATUS_SUCCESS)
    {
      assert (0 && "hsa_memory_allocate() failed.");
    }
#else
  args = (amdgpu_args_t*)malloc(sizeof(amdgpu_args_t));
#endif

  kernel_packet.kernarg_address = (uint64_t)args;

  /* Process the kernel arguments. Convert the opaque buffer
     pointers to real device pointers, allocate dynamic local
     memory buffers, etc. */
  for (i = 0; i < kernel->num_args; ++i)
    {
      al = &(cmd->command.run.arguments[i]);
      if (kernel->arg_info[i].is_local)
        {
          POCL_ABORT_UNIMPLEMENTED("pocl-hsa: local buffers not implemented.");
#if 0
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = pocl_hsa_malloc(data, 0, al->size, NULL);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
        {
          if (args_offset + 1 >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT("pocl-hsa: too many kernel arguments!");
          /* Assuming the pointers are 64b (or actually the same as in
             host) due to HSA. TODO: the 32b profile. */
          if (al->value == NULL)
            {
              args->kernel_args[args_offset] = 0;
              args->kernel_args[args_offset + 1] = 0;
            }
          else
            {
              *(uint64_t*)&args->kernel_args[args_offset] =
                (uint64_t)(*(cl_mem *) (al->value))->
                device_ptrs[cmd->device->dev_id].mem_ptr;
            }
          args_offset += 2;

#if 0
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            {
              arguments[i] = malloc (sizeof (void *));
              *(void **)arguments[i] = NULL;
            }
          else
            arguments[i] =
              &((*(cl_mem *) (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
          POCL_ABORT_UNIMPLEMENTED("hsa: image arguments not implemented.");
#if 0
          dev_image_t di;
          fill_dev_image_t (&di, al, cmd->device);

          void* devptr = pocl_hsa_malloc (data, 0, sizeof(dev_image_t), NULL);
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = devptr;
          pocl_hsa_write (data, &di, devptr, 0, sizeof(dev_image_t));
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
        {
          POCL_ABORT_UNIMPLEMENTED("hsa: sampler arguments not implemented.");
#if 0
          dev_sampler_t ds;
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = pocl_hsa_malloc
            (data, 0, sizeof(dev_sampler_t), NULL);
          pocl_hsa_write (data, &ds, *(void**)arguments[i], 0,
                            sizeof(dev_sampler_t));
#endif
        }
      else
        {
          if (args_offset >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT("pocl-hsa: too many kernel arguments!");

          /* Assuming the scalar fits to a 32b slot. TODO! */
          assert (al->size <= 4);
          args->kernel_args[args_offset] = *(uint32_t*)al->value;
          ++args_offset;
        }
    }

  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    {
      POCL_ABORT_UNIMPLEMENTED("hsa: automatic local buffers not implemented.");
#if 0
      al = &(cmd->command.run.arguments[i]);
      arguments[i] = malloc (sizeof (void *));
      *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, al->size, NULL);
#endif
    }


  args->workgroup_size_x = kernel_packet.workgroup_size_x = cmd->command.run.local_x;
  args->workgroup_size_y = kernel_packet.workgroup_size_y = cmd->command.run.local_y;
  args->workgroup_size_z = kernel_packet.workgroup_size_z = cmd->command.run.local_z;

  kernel_packet.grid_size_x = pc->num_groups[0] * cmd->command.run.local_x;
  kernel_packet.grid_size_y = pc->num_groups[1] * cmd->command.run.local_y;
  kernel_packet.grid_size_z = pc->num_groups[2] * cmd->command.run.local_z;

  /* AMDGPU specific OpenCL argument data. */

  args->wgs_x = pc->num_groups[0];
  args->wgs_y = pc->num_groups[1];
  args->wgs_z = pc->num_groups[2];

  kernel_packet.dimensions = 1;
  if (cmd->command.run.local_y > 1) kernel_packet.dimensions = 2;
  if (cmd->command.run.local_z > 1) kernel_packet.dimensions = 3;

  kernel_packet.header.type = HSA_PACKET_TYPE_DISPATCH;
  kernel_packet.header.acquire_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.release_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.barrier = 1;

  kernel_packet.kernel_object_address =
    *(hsa_amd_code_t*)cmd->command.run.device_data[1];

  error =  hsa_signal_create(initial_value, 0, NULL, &kernel_completion_signal);
  assert (error == HSA_STATUS_SUCCESS);

  kernel_packet.completion_signal = kernel_completion_signal;

  {
    /* Launch the kernel by allocating a slot in the queue, writing the
       command to it, signaling the update with a door bell and finally,
       block waiting until finish signalled with the completion_signal. */
    const uint32_t queue_mask = d->queue->size - 1;
    uint64_t queue_index = hsa_queue_load_write_index_relaxed(d->queue);
    hsa_signal_value_t sigval;
    ((hsa_dispatch_packet_t*)(d->queue->base_address))[queue_index & queue_mask] =
      kernel_packet;
    hsa_queue_store_write_index_relaxed(d->queue, queue_index + 1);
    hsa_signal_store_relaxed(d->queue->doorbell_signal, queue_index);

    sigval = hsa_signal_wait_acquire(kernel_completion_signal, HSA_EQ, 0,
                                     (uint64_t)(-1), HSA_WAIT_EXPECTANCY_UNKNOWN);
  }

  for (i = 0; i < kernel->num_args; ++i)
    {
      if (kernel->arg_info[i].is_local)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **)(arguments[i]));
          POCL_MEM_FREE(arguments[i]);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **)(arguments[i]));
          POCL_MEM_FREE(arguments[i]);
#endif
        }
#if 0
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER ||
               (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER &&
                *(void**)args->kernel_args[i] == NULL))
        {
          POCL_MEM_FREE(arguments[i]);
        }
#endif
    }
  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    {
#if 0
      pocl_hsa_free(data, 0, *(void **)(arguments[i]));
      POCL_MEM_FREE(arguments[i]);
#endif
    }
  free(args);
}
예제 #13
0
파일: cellspu.c 프로젝트: clachan/pocl
void
pocl_cellspu_run 
(void *data, 
 _cl_command_node* cmd)
{
  struct data *d;
  int error;
  char bytecode[POCL_FILENAME_LENGTH];
  char assembly[POCL_FILENAME_LENGTH];
  char module[POCL_FILENAME_LENGTH];
  char command[COMMAND_LENGTH];
  char workgroup_string[WORKGROUP_STRING_LENGTH];
  unsigned device;
  struct pocl_argument *al;
  size_t x, y, z;
  unsigned i;
  pocl_workgroup w;
  char* tmpdir = cmd->command.run.tmp_dir;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  const char* kern_func = kernel->function_name;
  unsigned int entry = SPE_DEFAULT_ENTRY;

  assert (data != NULL);
  d = (struct data *) data;

  error = snprintf 
    (module, POCL_FILENAME_LENGTH,
     "%s/parallel.so", tmpdir);
  assert (error >= 0);

  // This is the entry to the kenrel. We currently hard-code it
  // into the SPU binary. Resulting in only one entry-point per 
  // SPU image.
  // TODO: figure out which function to call given what conditions
  snprintf (workgroup_string, WORKGROUP_STRING_LENGTH,
            "_%s_workgroup_fast", kernel->function_name);


  if ( access (module, F_OK) != 0)
    {
      char *llvm_ld;
      error = snprintf (bytecode, POCL_FILENAME_LENGTH,
                        "%s/linked.bc", tmpdir);
      assert (error >= 0);
      
      if (getenv("POCL_BUILDING") != NULL)
        llvm_ld = BUILDDIR "/tools/llvm-ld/pocl-llvm-ld";
      else if (access(PKGLIBEXECDIR "/pocl-llvm-ld", X_OK) == 0)
        llvm_ld = PKGLIBEXECDIR "/pocl-llvm-ld";
      else
        llvm_ld = "pocl-llvm-ld";

      error = snprintf (command, COMMAND_LENGTH,
			"%s --disable-opt -link-as-library -o %s %s/%s",
                        llvm_ld, bytecode, tmpdir, POCL_PARALLEL_BC_FILENAME);
      assert (error >= 0);
      
      error = system(command);
      assert (error == 0);
      
      error = snprintf (assembly, POCL_FILENAME_LENGTH,
			"%s/parallel.s",
			tmpdir);
      assert (error >= 0);
      
      // "-relocation-model=dynamic-no-pic" is a magic string,
      // I do not know why it has to be there to produce valid
      // sos on x86_64
      error = snprintf (command, COMMAND_LENGTH,
			LLC " " HOST_LLC_FLAGS " -o %s %s",
			assembly,
			bytecode);
      assert (error >= 0);
      error = system (command);
      assert (error == 0);
           

      // Compile the assembly version of the OCL kernel with the
      // C wrapper to get a spulet
      error = snprintf (command, COMMAND_LENGTH,
			"spu-gcc lib/CL/devices/cellspu/spe_wrap.c -o %s %s "
			" -Xlinker --defsym -Xlinker _ocl_buffer=%d"
			" -Xlinker --defsym -Xlinker kernel_command=%d"
			" -I . -D_KERNEL=%s -std=c99",
			module,
			assembly, 
			CELLSPU_OCL_BUFFERS_START,
			CELLSPU_KERNEL_CMD_ADDR,
			workgroup_string);
      assert (error >= 0);
#ifdef DEBUG_CELLSPU_DRIVER
      printf("compiling: %s\n", command); fflush(stdout); 
#endif
      error = system (command);
      assert (error == 0);

    }
      
    // Load the SPU with the newly generated binary
    hello_spu = spe_image_open( (const char*)module );
    if( spe_program_load( spe_context, hello_spu) )
        perror("spe_program_load fails");
    
//
//  /* Find which device number within the context correspond
//     to current device.  */
//  for (i = 0; i < kernel->context->num_devices; ++i)
//    {
//      if (kernel->context->devices[i]->data == data)
//	{
//	  device = i;
//	  break;
//	}
//    }
//

  // This structure gets passed to the device.
  // It contains all the info needed to run a kernel  
  __kernel_exec_cmd dev_cmd;
  dev_cmd.work_dim = cmd->command.run.pc.work_dim;
  dev_cmd.num_groups[0] = cmd->command.run.pc.num_groups[0];
  dev_cmd.num_groups[1] = cmd->command.run.pc.num_groups[1];
  dev_cmd.num_groups[2] = cmd->command.run.pc.num_groups[2];

  dev_cmd.global_offset[0] = cmd->command.run.pc.global_offset[0];
  dev_cmd.global_offset[1] = cmd->command.run.pc.global_offset[1];
  dev_cmd.global_offset[2] = cmd->command.run.pc.global_offset[2];


  // the code below is lifted from pthreads :) 
  uint32_t *arguments = dev_cmd.args;

  for (i = 0; i < kernel->num_args; ++i)
    {
      al = &(kernel->dyn_arguments[i]);
      if (kernel->arg_is_local[i])
        {
          chunk_info_t* local_chunk = cellspu_malloc_local (d, al->size);
          if (local_chunk == NULL)
            POCL_ABORT ("Could not allocate memory for a local argument. Out of local mem?\n");

          dev_cmd.args[i] = local_chunk->start_address;

        }
      else if (kernel->arg_is_pointer[i])
        {
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In 
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            arguments[i] = (uint32_t)NULL;
          else
            arguments[i] = \
              ((chunk_info_t*)((*(cl_mem *)\
                (al->value))->device_ptrs[0]))->start_address;
		//TODO: '0' above is the device number... don't hard-code!
        }
      else if (kernel->arg_is_image[i])
        {
          POCL_ABORT_UNIMPLEMENTED();
//          dev_image2d_t di;      
//          cl_mem mem = *(cl_mem*)al->value;
//          di.data = &((*(cl_mem *) (al->value))->device_ptrs[device]);
//          di.data = ((*(cl_mem *) (al->value))->device_ptrs[device]);
//          di.width = mem->image_width;
//          di.height = mem->image_height;
//          di.rowpitch = mem->image_row_pitch;
//          di.order = mem->image_channel_order;
//          di.data_type = mem->image_channel_data_type;
//          void* devptr = pocl_cellspu_malloc(data, 0, sizeof(dev_image2d_t), NULL);
//          arguments[i] = malloc (sizeof (void *));
//          *(void **)(arguments[i]) = devptr; 
//          pocl_cellspu_write (data, &di, devptr, sizeof(dev_image2d_t));
        }
      else if (kernel->arg_is_sampler[i])
        {
          POCL_ABORT_UNIMPLEMENTED();
//          dev_sampler_t ds;
//          
//          arguments[i] = malloc (sizeof (void *));
//          *(void **)(arguments[i]) = pocl_cellspu_malloc(data, 0, sizeof(dev_sampler_t), NULL);
//          pocl_cellspu_write (data, &ds, *(void**)arguments[i], sizeof(dev_sampler_t));
        }
      else
        {
          arguments[i] = (uint32_t)al->value;
        }
    }

  // allocate memory for kernel local variables
  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    {
      al = &(kernel->dyn_arguments[i]);
      arguments[i] = (uint32_t)malloc (sizeof (void *));
      *(void **)(arguments[i]) = cellspu_malloc_local(data, al->size);
    }

  // the main loop on the spe needs an auxiliary struct for to get the 
  // number of arguments and such. 
  __kernel_metadata kmd;
  strncpy( kmd.name, workgroup_string, sizeof( kmd.name ) );  
  kmd.num_args = kernel->num_args;
  kmd.num_locals = kernel->num_locals;
  // TODO: fill in the rest, if used by the spu main function.

  // TODO malloc_local should be given the 'device data'. as long as teh 
  // spu context is global this is ok.
  void *chunk = cellspu_malloc_local( NULL, sizeof(__kernel_metadata) ); 
  void *kernel_area = ((chunk_info_t*)chunk)->start_address;
  cellspu_memwrite( kernel_area, &kmd, sizeof(__kernel_metadata) );
  dev_cmd.kernel = kernel_area;
  
  // finish up the command, send it to SPE
  dev_cmd.status =POCL_KST_READY;
  cellspu_memwrite( (void*)CELLSPU_KERNEL_CMD_ADDR, &dev_cmd, sizeof(__kernel_exec_cmd) );
       
  // Execute code on SPU. This starts with the main() in the spu - see spe_wrap.c
  if (spe_context_run(spe_context,&entry,0,NULL,NULL,NULL) < 0)
    perror("context_run error");

//  for (z = 0; z < pc->num_groups[2]; ++z)
//    {
//      for (y = 0; y < pc->num_groups[1]; ++y)
//        {
//          for (x = 0; x < pc->num_groups[0]; ++x)
//            {
//              pc->group_id[0] = x;
//              pc->group_id[1] = y;
//              pc->group_id[2] = z;
//
//              w (arguments, pc);
//
//            }
//        }
//    }


  // Clean-up ? 
  for (i = 0; i < kernel->num_args; ++i)
    {
      if (kernel->arg_is_local[i])
        pocl_cellspu_free(data, 0, *(void **)(arguments[i]));
    }
  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    pocl_cellspu_free(data, 0, *(void **)(arguments[i]));
}