Exemple #1
0
extern void
pocl_get_image_information (cl_mem        image,
                          int* channels_out,
                          int* elem_size_out)
  {
    cl_channel_order order = image->image_channel_order;
    cl_channel_type type = image->image_channel_data_type;
    
    int host_elem_size;
    if (type == CL_FLOAT)
      host_elem_size=4;
    else if (type==CL_UNORM_INT8)
      host_elem_size=1;
    else
      POCL_ABORT_UNIMPLEMENTED();
    if (elem_size_out != NULL) 
      *elem_size_out = host_elem_size;
    
    int host_channels;
    if (order == CL_RGBA)
      host_channels=4;
    else if (order == CL_R) 
      host_channels=1;
    else
      POCL_ABORT_UNIMPLEMENTED();
    if (channels_out != NULL) 
      *channels_out = host_channels;
  }
Exemple #2
0
void
pocl_cellspu_free (void *data, cl_mem_flags flags, void *ptr)
{
  POCL_ABORT_UNIMPLEMENTED();

  if (flags & CL_MEM_USE_HOST_PTR)
    return;
  
  free (ptr);
}
Exemple #3
0
static void exec_commands (_cl_command_node *node_list)
{
  int i;
  cl_event *event;
  _cl_command_node *node;
  cl_command_queue command_queue = NULL;
  LL_FOREACH (node_list, node)
    {
      event = &(node->event);
      /* Command queue is needed for POCL_UPDATE_EVENT macros */
      if (node->event)
        command_queue = node->event->queue;

      switch (node->type)
        {
        case CL_COMMAND_READ_BUFFER:
          POCL_UPDATE_EVENT_RUNNING;
          node->device->read
            (node->command.read.data, 
             node->command.read.host_ptr, 
             node->command.read.device_ptr, 
             node->command.read.cb); 
          POCL_UPDATE_EVENT_COMPLETE;
          POname(clReleaseMemObject) (node->command.read.buffer);
          break;
        case CL_COMMAND_WRITE_BUFFER:
          POCL_UPDATE_EVENT_RUNNING;
          node->device->write
            (node->command.write.data, 
             node->command.write.host_ptr, 
             node->command.write.device_ptr, 
             node->command.write.cb);
          POCL_UPDATE_EVENT_COMPLETE;
          POname(clReleaseMemObject) (node->command.write.buffer);
          break;
        case CL_COMMAND_COPY_BUFFER:
          POCL_UPDATE_EVENT_RUNNING;
          node->device->copy
            (node->command.copy.data, 
             node->command.copy.src_ptr, 
             node->command.copy.dst_ptr,
             node->command.copy.cb);
          POCL_UPDATE_EVENT_COMPLETE;
          POname(clReleaseMemObject) (node->command.copy.src_buffer);
          POname(clReleaseMemObject) (node->command.copy.dst_buffer);
          break;
        case CL_COMMAND_MAP_BUFFER: 
          POCL_UPDATE_EVENT_RUNNING;            
          pocl_map_mem_cmd (node->device, node->command.map.buffer, 
                            node->command.map.mapping);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_MAP_IMAGE:
          POCL_UPDATE_EVENT_RUNNING; 
          node->device->read_rect 
            (node->command.map_image.data, node->command.map_image.map_ptr,
             node->command.map_image.device_ptr, node->command.map_image.origin,
             node->command.map_image.origin, node->command.map_image.region, 
             node->command.map_image.rowpitch, 
             node->command.map_image.slicepitch,
             node->command.map_image.rowpitch,
             node->command.map_image.slicepitch);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_WRITE_IMAGE:
          POCL_UPDATE_EVENT_RUNNING; 
          node->device->write_rect 
            (node->command.map_image.data, node->command.map_image.map_ptr,
             node->command.map_image.device_ptr, node->command.map_image.origin,
             node->command.map_image.origin, node->command.map_image.region, 
             node->command.map_image.rowpitch, 
             node->command.map_image.slicepitch,
             node->command.map_image.rowpitch,
             node->command.map_image.slicepitch);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_READ_IMAGE:
          POCL_UPDATE_EVENT_RUNNING; 
          node->device->read_rect 
            (node->command.map_image.data, node->command.map_image.map_ptr,
             node->command.map_image.device_ptr, node->command.map_image.origin,
             node->command.map_image.origin, node->command.map_image.region, 
             node->command.map_image.rowpitch, 
             node->command.map_image.slicepitch,
             node->command.map_image.rowpitch,
             node->command.map_image.slicepitch);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_UNMAP_MEM_OBJECT:
          POCL_UPDATE_EVENT_RUNNING;
          if ((node->command.unmap.memobj)->flags & 
              (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))
            {
              /* TODO: should we ensure the device global region is updated from
                 the host memory? How does the specs define it,
                 can the host_ptr be assumed to point to the host and the
                 device accessible memory or just point there until the
                 kernel(s) get executed or similar? */
              /* Assume the region is automatically up to date. */
            } else 
            {
              /* TODO: fixme. The offset computation must be done at the device 
                 driver. */
              if (node->device->unmap_mem != NULL)        
                node->device->unmap_mem
                  (node->device->data, 
                   (node->command.unmap.mapping)->host_ptr, 
                   (node->command.unmap.memobj)->device_ptrs[node->device->dev_id], 
                   (node->command.unmap.mapping)->size);
            }
          DL_DELETE((node->command.unmap.memobj)->mappings, 
                    node->command.unmap.mapping);
          (node->command.unmap.memobj)->map_count--;
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_NDRANGE_KERNEL:
          assert (*event == node->event);
          POCL_UPDATE_EVENT_RUNNING;
          node->device->run(node->command.run.data, node);
          POCL_UPDATE_EVENT_COMPLETE;
          for (i = 0; i < node->command.run.arg_buffer_count; ++i)
            {
              cl_mem buf = node->command.run.arg_buffers[i];
              if (buf == NULL) continue;
              /*printf ("### releasing arg %d - the buffer %x of kernel %s\n", i, 
                buf,  node->command.run.kernel->function_name); */
              POname(clReleaseMemObject) (buf);
            }
          free (node->command.run.arg_buffers);
          free (node->command.run.tmp_dir);
          for (i = 0; i < node->command.run.kernel->num_args + 
                 node->command.run.kernel->num_locals; ++i)
            {
              pocl_aligned_free (node->command.run.arguments[i].value);
            }
          free (node->command.run.arguments);
      
          POname(clReleaseKernel)(node->command.run.kernel);
          break;
        case CL_COMMAND_FILL_IMAGE:
          POCL_UPDATE_EVENT_RUNNING;
          node->device->fill_rect 
            (node->command.fill_image.data, 
             node->command.fill_image.device_ptr,
             node->command.fill_image.buffer_origin,
             node->command.fill_image.region,
             node->command.fill_image.rowpitch, 
             node->command.fill_image.slicepitch,
             node->command.fill_image.fill_pixel,
             node->command.fill_image.pixel_size);
          free(node->command.fill_image.fill_pixel);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_MARKER:
          POCL_UPDATE_EVENT_RUNNING;
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        default:
          POCL_ABORT_UNIMPLEMENTED();
          break;
        }   
    }
Exemple #4
0
void
pocl_hsa_run
(void *data,
 _cl_command_node* cmd)
{
  struct data *d;
  struct pocl_argument *al;
  unsigned i;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  hsa_signal_value_t initial_value = 1;
#if 0
  /* Not yet supported by the reference library. */
  hsa_kernel_dispatch_packet_t kernel_packet;
#else
  hsa_dispatch_packet_t kernel_packet;
#endif
  hsa_signal_t kernel_completion_signal = 0;
  hsa_region_t region;
  int error;
  amdgpu_args_t *args;
  /* 32b word offset in the kernel arguments buffer we can push the next
     argument to. */
  int args_offset = 0;

  assert (data != NULL);
  d = (struct data *) data;

  d->current_kernel = kernel;

  memset (&kernel_packet, 0, sizeof (hsa_dispatch_packet_t));

#if 0
  /* TODO: not yet supported by the open source runtime implementation.
     Assume the HSA Full profile so we can simply use host malloc().
   */
  hsa_agent_iterate_regions(kernel_agent, pocl_hsa_get_kernarg, &region);

  if (hsa_memory_allocate(region, sizeof(amdgpu_args_t),
                          (void**)&args) !=
      HSA_STATUS_SUCCESS)
    {
      assert (0 && "hsa_memory_allocate() failed.");
    }
#else
  args = (amdgpu_args_t*)malloc(sizeof(amdgpu_args_t));
#endif

  kernel_packet.kernarg_address = (uint64_t)args;

  /* Process the kernel arguments. Convert the opaque buffer
     pointers to real device pointers, allocate dynamic local
     memory buffers, etc. */
  for (i = 0; i < kernel->num_args; ++i)
    {
      al = &(cmd->command.run.arguments[i]);
      if (kernel->arg_info[i].is_local)
        {
          POCL_ABORT_UNIMPLEMENTED("pocl-hsa: local buffers not implemented.");
#if 0
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = pocl_hsa_malloc(data, 0, al->size, NULL);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
        {
          if (args_offset + 1 >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT("pocl-hsa: too many kernel arguments!");
          /* Assuming the pointers are 64b (or actually the same as in
             host) due to HSA. TODO: the 32b profile. */
          if (al->value == NULL)
            {
              args->kernel_args[args_offset] = 0;
              args->kernel_args[args_offset + 1] = 0;
            }
          else
            {
              *(uint64_t*)&args->kernel_args[args_offset] =
                (uint64_t)(*(cl_mem *) (al->value))->
                device_ptrs[cmd->device->dev_id].mem_ptr;
            }
          args_offset += 2;

#if 0
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            {
              arguments[i] = malloc (sizeof (void *));
              *(void **)arguments[i] = NULL;
            }
          else
            arguments[i] =
              &((*(cl_mem *) (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
          POCL_ABORT_UNIMPLEMENTED("hsa: image arguments not implemented.");
#if 0
          dev_image_t di;
          fill_dev_image_t (&di, al, cmd->device);

          void* devptr = pocl_hsa_malloc (data, 0, sizeof(dev_image_t), NULL);
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = devptr;
          pocl_hsa_write (data, &di, devptr, 0, sizeof(dev_image_t));
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
        {
          POCL_ABORT_UNIMPLEMENTED("hsa: sampler arguments not implemented.");
#if 0
          dev_sampler_t ds;
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = pocl_hsa_malloc
            (data, 0, sizeof(dev_sampler_t), NULL);
          pocl_hsa_write (data, &ds, *(void**)arguments[i], 0,
                            sizeof(dev_sampler_t));
#endif
        }
      else
        {
          if (args_offset >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT("pocl-hsa: too many kernel arguments!");

          /* Assuming the scalar fits to a 32b slot. TODO! */
          assert (al->size <= 4);
          args->kernel_args[args_offset] = *(uint32_t*)al->value;
          ++args_offset;
        }
    }

  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    {
      POCL_ABORT_UNIMPLEMENTED("hsa: automatic local buffers not implemented.");
#if 0
      al = &(cmd->command.run.arguments[i]);
      arguments[i] = malloc (sizeof (void *));
      *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, al->size, NULL);
#endif
    }


  args->workgroup_size_x = kernel_packet.workgroup_size_x = cmd->command.run.local_x;
  args->workgroup_size_y = kernel_packet.workgroup_size_y = cmd->command.run.local_y;
  args->workgroup_size_z = kernel_packet.workgroup_size_z = cmd->command.run.local_z;

  kernel_packet.grid_size_x = pc->num_groups[0] * cmd->command.run.local_x;
  kernel_packet.grid_size_y = pc->num_groups[1] * cmd->command.run.local_y;
  kernel_packet.grid_size_z = pc->num_groups[2] * cmd->command.run.local_z;

  /* AMDGPU specific OpenCL argument data. */

  args->wgs_x = pc->num_groups[0];
  args->wgs_y = pc->num_groups[1];
  args->wgs_z = pc->num_groups[2];

  kernel_packet.dimensions = 1;
  if (cmd->command.run.local_y > 1) kernel_packet.dimensions = 2;
  if (cmd->command.run.local_z > 1) kernel_packet.dimensions = 3;

  kernel_packet.header.type = HSA_PACKET_TYPE_DISPATCH;
  kernel_packet.header.acquire_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.release_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.barrier = 1;

  kernel_packet.kernel_object_address =
    *(hsa_amd_code_t*)cmd->command.run.device_data[1];

  error =  hsa_signal_create(initial_value, 0, NULL, &kernel_completion_signal);
  assert (error == HSA_STATUS_SUCCESS);

  kernel_packet.completion_signal = kernel_completion_signal;

  {
    /* Launch the kernel by allocating a slot in the queue, writing the
       command to it, signaling the update with a door bell and finally,
       block waiting until finish signalled with the completion_signal. */
    const uint32_t queue_mask = d->queue->size - 1;
    uint64_t queue_index = hsa_queue_load_write_index_relaxed(d->queue);
    hsa_signal_value_t sigval;
    ((hsa_dispatch_packet_t*)(d->queue->base_address))[queue_index & queue_mask] =
      kernel_packet;
    hsa_queue_store_write_index_relaxed(d->queue, queue_index + 1);
    hsa_signal_store_relaxed(d->queue->doorbell_signal, queue_index);

    sigval = hsa_signal_wait_acquire(kernel_completion_signal, HSA_EQ, 0,
                                     (uint64_t)(-1), HSA_WAIT_EXPECTANCY_UNKNOWN);
  }

  for (i = 0; i < kernel->num_args; ++i)
    {
      if (kernel->arg_info[i].is_local)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **)(arguments[i]));
          POCL_MEM_FREE(arguments[i]);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **)(arguments[i]));
          POCL_MEM_FREE(arguments[i]);
#endif
        }
#if 0
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER ||
               (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER &&
                *(void**)args->kernel_args[i] == NULL))
        {
          POCL_MEM_FREE(arguments[i]);
        }
#endif
    }
  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    {
#if 0
      pocl_hsa_free(data, 0, *(void **)(arguments[i]));
      POCL_MEM_FREE(arguments[i]);
#endif
    }
  free(args);
}
Exemple #5
0
void
pocl_cellspu_run 
(void *data, 
 _cl_command_node* cmd)
{
  struct data *d;
  int error;
  char bytecode[POCL_FILENAME_LENGTH];
  char assembly[POCL_FILENAME_LENGTH];
  char module[POCL_FILENAME_LENGTH];
  char command[COMMAND_LENGTH];
  char workgroup_string[WORKGROUP_STRING_LENGTH];
  unsigned device;
  struct pocl_argument *al;
  size_t x, y, z;
  unsigned i;
  pocl_workgroup w;
  char* tmpdir = cmd->command.run.tmp_dir;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  const char* kern_func = kernel->function_name;
  unsigned int entry = SPE_DEFAULT_ENTRY;

  assert (data != NULL);
  d = (struct data *) data;

  error = snprintf 
    (module, POCL_FILENAME_LENGTH,
     "%s/parallel.so", tmpdir);
  assert (error >= 0);

  // This is the entry to the kenrel. We currently hard-code it
  // into the SPU binary. Resulting in only one entry-point per 
  // SPU image.
  // TODO: figure out which function to call given what conditions
  snprintf (workgroup_string, WORKGROUP_STRING_LENGTH,
            "_%s_workgroup_fast", kernel->function_name);


  if ( access (module, F_OK) != 0)
    {
      char *llvm_ld;
      error = snprintf (bytecode, POCL_FILENAME_LENGTH,
                        "%s/linked.bc", tmpdir);
      assert (error >= 0);
      
      if (getenv("POCL_BUILDING") != NULL)
        llvm_ld = BUILDDIR "/tools/llvm-ld/pocl-llvm-ld";
      else if (access(PKGLIBEXECDIR "/pocl-llvm-ld", X_OK) == 0)
        llvm_ld = PKGLIBEXECDIR "/pocl-llvm-ld";
      else
        llvm_ld = "pocl-llvm-ld";

      error = snprintf (command, COMMAND_LENGTH,
			"%s --disable-opt -link-as-library -o %s %s/%s",
                        llvm_ld, bytecode, tmpdir, POCL_PARALLEL_BC_FILENAME);
      assert (error >= 0);
      
      error = system(command);
      assert (error == 0);
      
      error = snprintf (assembly, POCL_FILENAME_LENGTH,
			"%s/parallel.s",
			tmpdir);
      assert (error >= 0);
      
      // "-relocation-model=dynamic-no-pic" is a magic string,
      // I do not know why it has to be there to produce valid
      // sos on x86_64
      error = snprintf (command, COMMAND_LENGTH,
			LLC " " HOST_LLC_FLAGS " -o %s %s",
			assembly,
			bytecode);
      assert (error >= 0);
      error = system (command);
      assert (error == 0);
           

      // Compile the assembly version of the OCL kernel with the
      // C wrapper to get a spulet
      error = snprintf (command, COMMAND_LENGTH,
			"spu-gcc lib/CL/devices/cellspu/spe_wrap.c -o %s %s "
			" -Xlinker --defsym -Xlinker _ocl_buffer=%d"
			" -Xlinker --defsym -Xlinker kernel_command=%d"
			" -I . -D_KERNEL=%s -std=c99",
			module,
			assembly, 
			CELLSPU_OCL_BUFFERS_START,
			CELLSPU_KERNEL_CMD_ADDR,
			workgroup_string);
      assert (error >= 0);
#ifdef DEBUG_CELLSPU_DRIVER
      printf("compiling: %s\n", command); fflush(stdout); 
#endif
      error = system (command);
      assert (error == 0);

    }
      
    // Load the SPU with the newly generated binary
    hello_spu = spe_image_open( (const char*)module );
    if( spe_program_load( spe_context, hello_spu) )
        perror("spe_program_load fails");
    
//
//  /* Find which device number within the context correspond
//     to current device.  */
//  for (i = 0; i < kernel->context->num_devices; ++i)
//    {
//      if (kernel->context->devices[i]->data == data)
//	{
//	  device = i;
//	  break;
//	}
//    }
//

  // This structure gets passed to the device.
  // It contains all the info needed to run a kernel  
  __kernel_exec_cmd dev_cmd;
  dev_cmd.work_dim = cmd->command.run.pc.work_dim;
  dev_cmd.num_groups[0] = cmd->command.run.pc.num_groups[0];
  dev_cmd.num_groups[1] = cmd->command.run.pc.num_groups[1];
  dev_cmd.num_groups[2] = cmd->command.run.pc.num_groups[2];

  dev_cmd.global_offset[0] = cmd->command.run.pc.global_offset[0];
  dev_cmd.global_offset[1] = cmd->command.run.pc.global_offset[1];
  dev_cmd.global_offset[2] = cmd->command.run.pc.global_offset[2];


  // the code below is lifted from pthreads :) 
  uint32_t *arguments = dev_cmd.args;

  for (i = 0; i < kernel->num_args; ++i)
    {
      al = &(kernel->dyn_arguments[i]);
      if (kernel->arg_is_local[i])
        {
          chunk_info_t* local_chunk = cellspu_malloc_local (d, al->size);
          if (local_chunk == NULL)
            POCL_ABORT ("Could not allocate memory for a local argument. Out of local mem?\n");

          dev_cmd.args[i] = local_chunk->start_address;

        }
      else if (kernel->arg_is_pointer[i])
        {
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In 
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            arguments[i] = (uint32_t)NULL;
          else
            arguments[i] = \
              ((chunk_info_t*)((*(cl_mem *)\
                (al->value))->device_ptrs[0]))->start_address;
		//TODO: '0' above is the device number... don't hard-code!
        }
      else if (kernel->arg_is_image[i])
        {
          POCL_ABORT_UNIMPLEMENTED();
//          dev_image2d_t di;      
//          cl_mem mem = *(cl_mem*)al->value;
//          di.data = &((*(cl_mem *) (al->value))->device_ptrs[device]);
//          di.data = ((*(cl_mem *) (al->value))->device_ptrs[device]);
//          di.width = mem->image_width;
//          di.height = mem->image_height;
//          di.rowpitch = mem->image_row_pitch;
//          di.order = mem->image_channel_order;
//          di.data_type = mem->image_channel_data_type;
//          void* devptr = pocl_cellspu_malloc(data, 0, sizeof(dev_image2d_t), NULL);
//          arguments[i] = malloc (sizeof (void *));
//          *(void **)(arguments[i]) = devptr; 
//          pocl_cellspu_write (data, &di, devptr, sizeof(dev_image2d_t));
        }
      else if (kernel->arg_is_sampler[i])
        {
          POCL_ABORT_UNIMPLEMENTED();
//          dev_sampler_t ds;
//          
//          arguments[i] = malloc (sizeof (void *));
//          *(void **)(arguments[i]) = pocl_cellspu_malloc(data, 0, sizeof(dev_sampler_t), NULL);
//          pocl_cellspu_write (data, &ds, *(void**)arguments[i], sizeof(dev_sampler_t));
        }
      else
        {
          arguments[i] = (uint32_t)al->value;
        }
    }

  // allocate memory for kernel local variables
  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    {
      al = &(kernel->dyn_arguments[i]);
      arguments[i] = (uint32_t)malloc (sizeof (void *));
      *(void **)(arguments[i]) = cellspu_malloc_local(data, al->size);
    }

  // the main loop on the spe needs an auxiliary struct for to get the 
  // number of arguments and such. 
  __kernel_metadata kmd;
  strncpy( kmd.name, workgroup_string, sizeof( kmd.name ) );  
  kmd.num_args = kernel->num_args;
  kmd.num_locals = kernel->num_locals;
  // TODO: fill in the rest, if used by the spu main function.

  // TODO malloc_local should be given the 'device data'. as long as teh 
  // spu context is global this is ok.
  void *chunk = cellspu_malloc_local( NULL, sizeof(__kernel_metadata) ); 
  void *kernel_area = ((chunk_info_t*)chunk)->start_address;
  cellspu_memwrite( kernel_area, &kmd, sizeof(__kernel_metadata) );
  dev_cmd.kernel = kernel_area;
  
  // finish up the command, send it to SPE
  dev_cmd.status =POCL_KST_READY;
  cellspu_memwrite( (void*)CELLSPU_KERNEL_CMD_ADDR, &dev_cmd, sizeof(__kernel_exec_cmd) );
       
  // Execute code on SPU. This starts with the main() in the spu - see spe_wrap.c
  if (spe_context_run(spe_context,&entry,0,NULL,NULL,NULL) < 0)
    perror("context_run error");

//  for (z = 0; z < pc->num_groups[2]; ++z)
//    {
//      for (y = 0; y < pc->num_groups[1]; ++y)
//        {
//          for (x = 0; x < pc->num_groups[0]; ++x)
//            {
//              pc->group_id[0] = x;
//              pc->group_id[1] = y;
//              pc->group_id[2] = z;
//
//              w (arguments, pc);
//
//            }
//        }
//    }


  // Clean-up ? 
  for (i = 0; i < kernel->num_args; ++i)
    {
      if (kernel->arg_is_local[i])
        pocl_cellspu_free(data, 0, *(void **)(arguments[i]));
    }
  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    pocl_cellspu_free(data, 0, *(void **)(arguments[i]));
}
Exemple #6
0
extern cl_int
pocl_write_image    (cl_mem               image,
                   cl_device_id         device_id,
                   const size_t *       origin_, /*[3]*/
                   const size_t *       region_, /*[3]*/
                   size_t               host_row_pitch,
                   size_t               host_slice_pitch, 
                   const void *         ptr)
  {
    if (image == NULL)
      return CL_INVALID_MEM_OBJECT;

    if ((ptr == NULL) ||
        (region_ == NULL))
      return CL_INVALID_VALUE;
    
    int width = image->image_width;
    int height = image->image_height;
    cl_channel_order order = image->image_channel_order;
    cl_channel_type type = image->image_channel_data_type;
    
    size_t dev_elem_size = sizeof(cl_float);
    int dev_channels = 4;
    
    int host_elem_size;
    int host_channels;
    pocl_get_image_information (image, &host_channels, &host_elem_size);
    
    size_t origin[3] = { origin_[0]*dev_elem_size*dev_channels, origin_[1], origin_[2] };
    size_t region[3] = { region_[0]*dev_elem_size*dev_channels, region_[1], region_[2] };
    
    size_t image_row_pitch = width*dev_elem_size*dev_channels;
    size_t image_slice_pitch = 0;
    
    if ((region[0]*region[1]*region[2] > 0) &&
        (region[0]-1 +
        image_row_pitch * (region[1]-1) +
        image_slice_pitch * (region[2]-1) >= image->size))
      return CL_INVALID_VALUE;
    
    cl_float* temp = malloc( width*height*dev_channels*dev_elem_size );
    
    if (temp == NULL) 
      return CL_OUT_OF_HOST_MEMORY;
    
    int x, y, k;
    
    for (y=0; y<height; y++)
      for (x=0; x<width*dev_channels; x++)
        temp[x+y*width*dev_channels] = 0.f;
    
    for (y=0; y<height; y++)
      {
        for (x=0; x<width; x++)
          {
            cl_float elem[4]; //TODO 0,0,0,0 for some modes?
            
            for (k=0; k<host_channels; k++) 
              {
                if (type == CL_FLOAT)
                  elem[k] = ((float*)ptr)[k+(x+y*width)*host_channels];
                else if (type==CL_UNORM_INT8) 
                  {
                    cl_uchar foo = ((cl_uchar*)ptr)[k+(x+y*width)*host_channels];
                    elem[k] = (float)(foo) * (1.f/255.f);
                  }
                else
                  POCL_ABORT_UNIMPLEMENTED();
              }
          
            if (order == CL_RGBA) 
              for (k=0; k<4; k++)
                temp[(x+y*width)*dev_channels+k] = elem[k];
            else if (order == CL_R) 
              {
                temp[(x+y*width)*dev_channels+0] = elem[0];
                temp[(x+y*width)*dev_channels+1] = 0.f;
                temp[(x+y*width)*dev_channels+2] = 0.f;
                temp[(x+y*width)*dev_channels+3] = 1.f;
              }
          }
      }
      
    
    device_id->write_rect(device_id->data, temp, 
                        image->device_ptrs[device_id->dev_id],
                        origin, origin, region,
                        image_row_pitch, image_slice_pitch,
                        image_row_pitch, image_slice_pitch);
    
    free (temp);
    return CL_SUCCESS;
  }
Exemple #7
0
extern cl_int         
pocl_read_image   (cl_mem               image,
                   cl_device_id         device_id,
                   const size_t *       origin_, /*[3]*/
                   const size_t *       region_, /*[3]*/
                   size_t               host_row_pitch,
                   size_t               host_slice_pitch, 
                   void *               ptr) 
  {
    
    if (image == NULL)
      return CL_INVALID_MEM_OBJECT;

    if ((ptr == NULL) ||
        (region_ == NULL))
      return CL_INVALID_VALUE;
    
    int width = image->image_width;
    int height = image->image_height;
    int dev_elem_size = sizeof(cl_float);
    int dev_channels = 4;
    size_t origin[3] = { origin_[0]*dev_elem_size*dev_channels, origin_[1], origin_[2] };
    size_t region[3] = { region_[0]*dev_elem_size*dev_channels, region_[1], region_[2] };
    
    size_t image_row_pitch = width*dev_elem_size*dev_channels;
    size_t image_slice_pitch = 0;
    
    if ((region[0]*region[1]*region[2] > 0) &&
        (region[0]-1 +
        image_row_pitch * (region[1]-1) +
        image_slice_pitch * (region[2]-1) >= image->size))
      return CL_INVALID_VALUE;

  
    int i, j, k;
  
    cl_channel_order order = image->image_channel_order;
    cl_channel_type type = image->image_channel_data_type;
    
    cl_float* temp = malloc( width*height*dev_channels*dev_elem_size );
    
    if (temp == NULL)
      return CL_OUT_OF_HOST_MEMORY;
      
    int host_channels, host_elem_size;
      
    pocl_get_image_information(image, &host_channels, &host_elem_size);
      
    if (host_row_pitch == 0) {
      host_row_pitch = width*host_channels;
    }
    
    size_t buffer_origin[3] = { 0, 0, 0 };
    
    device_id->read_rect(device_id->data, temp, 
                        image->device_ptrs[device_id->dev_id],
                        origin, origin, region,
                        image_row_pitch, image_slice_pitch,
                        image_row_pitch, image_slice_pitch);
    
    for (j=0; j<height; j++) {
      for (i=0; i<width; i++) {
        cl_float elem[4];
        
        for (k=0; k<4; k++)
          elem[k]=0;
        
        if (order == CL_RGBA) {
          for (k=0; k<4; k++) 
            elem[k] = temp[i*dev_channels + j*width*dev_channels + k];
        }
        else if (order == CL_R) { // host_channels == 1
          elem[0] = temp[i*dev_channels + j*width*dev_channels + 0];
        }
        
        if (type == CL_UNORM_INT8) 
          { // host_channels == 4
            for (k=0; k<host_channels; k++)
              {
                ((cl_uchar*)ptr)[i*host_channels + j*host_row_pitch + k] 
                  = (unsigned char)(255*elem[k]);
              }
          }
        else if (type == CL_FLOAT) 
          {
            for (k=0; k<host_channels; k++)
              {
                POCL_ABORT_UNIMPLEMENTED();
                ((cl_float*)ptr)[i*host_channels + j*host_row_pitch + k] 
                  = elem[k];
              }
          }
        else
          POCL_ABORT_UNIMPLEMENTED();
      }
    }
    
    free (temp);
    
    return CL_SUCCESS;
  }