/* Execute one kernel command on the host CPU ("basic") device.

   Marshals the kernel arguments into an argument array understood by the
   compiled work-group function, then runs the work-group function once per
   work-group of the ND-range, and finally releases every temporary buffer
   allocated for the launch.  Argument slots are either:
     - a malloc'd "box" holding a pointer to a freshly allocated buffer
       (local args, images, samplers, NULL pointer args), or
     - a pointer straight into existing storage (buffer args, scalars),
   and the cleanup pass below mirrors exactly that ownership split.  */
void
pocl_basic_run (void *data, _cl_command_node *cmd)
{
  struct data *d;
  struct pocl_argument *arg;
  size_t gx, gy, gz;
  unsigned ai;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  unsigned num_total = kernel->num_args + kernel->num_locals;

  assert (data != NULL);
  d = (struct data *) data;
  d->current_kernel = kernel;

  /* One slot per explicit kernel argument plus one per automatic local.  */
  void **arg_list = (void **) malloc (sizeof (void *) * num_total);

  /* Convert the opaque per-API argument records into the raw pointers the
     work-group function expects; allocate dynamic local buffers etc.  */
  for (ai = 0; ai < kernel->num_args; ++ai)
    {
      arg = &(cmd->command.run.arguments[ai]);
      if (kernel->arg_info[ai].is_local)
        {
          /* Dynamic local buffer: box a freshly aligned allocation.  */
          void **box = (void **) malloc (sizeof (void *));
          *box = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, arg->size);
          arg_list[ai] = box;
        }
      else if (kernel->arg_info[ai].type == POCL_ARG_TYPE_POINTER)
        {
          /* It's legal to pass a NULL pointer to clSetKernelArguments.
             In that case we must pass the same NULL forward to the
             kernel.  Otherwise, the user must have created a buffer with
             per-device pointers stored in the cl_mem.  */
          if (arg->value == NULL)
            {
              void **box = (void **) malloc (sizeof (void *));
              *box = NULL;
              arg_list[ai] = box;
            }
          else
            {
              arg_list[ai] = &((*(cl_mem *) (arg->value))
                               ->device_ptrs[cmd->device->dev_id].mem_ptr);
            }
        }
      else if (kernel->arg_info[ai].type == POCL_ARG_TYPE_IMAGE)
        {
          /* Images are passed as a device-side descriptor struct that is
             filled on the host and copied to a device allocation.  */
          dev_image_t di;
          fill_dev_image_t (&di, arg, cmd->device);
          void *devptr = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT,
                                              sizeof (dev_image_t));
          void **box = (void **) malloc (sizeof (void *));
          *box = devptr;
          arg_list[ai] = box;
          pocl_basic_write (data, &di, devptr, 0, sizeof (dev_image_t));
        }
      else if (kernel->arg_info[ai].type == POCL_ARG_TYPE_SAMPLER)
        {
          /* Samplers likewise travel as a small device-side struct.  */
          dev_sampler_t ds;
          fill_dev_sampler_t (&ds, arg);
          void *devptr = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT,
                                              sizeof (dev_sampler_t));
          void **box = (void **) malloc (sizeof (void *));
          *box = devptr;
          arg_list[ai] = box;
          pocl_basic_write (data, &ds, devptr, 0, sizeof (dev_sampler_t));
        }
      else
        {
          /* Scalar (passed by value): point directly at the stored value.  */
          arg_list[ai] = arg->value;
        }
    }

  /* Automatic (compiler-introduced) local buffers follow the explicit
     arguments in the same array.  */
  for (ai = kernel->num_args; ai < num_total; ++ai)
    {
      arg = &(cmd->command.run.arguments[ai]);
      void **box = (void **) malloc (sizeof (void *));
      *box = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, arg->size);
      arg_list[ai] = box;
    }

  /* Run the work-group function for every group in the ND-range, x varying
     fastest, updating the group id in the pocl_context before each call.  */
  for (gz = 0; gz < pc->num_groups[2]; ++gz)
    for (gy = 0; gy < pc->num_groups[1]; ++gy)
      for (gx = 0; gx < pc->num_groups[0]; ++gx)
        {
          pc->group_id[0] = gx;
          pc->group_id[1] = gy;
          pc->group_id[2] = gz;
          cmd->command.run.wg (arg_list, pc);
        }

  /* Release everything this launch allocated.  Boxed slots (locals,
     images, samplers, and boxed NULL pointers) own both the box and, where
     non-NULL, the buffer it points to; direct slots own nothing.  */
  for (ai = 0; ai < kernel->num_args; ++ai)
    {
      if (kernel->arg_info[ai].is_local
          || kernel->arg_info[ai].type == POCL_ARG_TYPE_IMAGE
          || kernel->arg_info[ai].type == POCL_ARG_TYPE_SAMPLER)
        {
          POCL_MEM_FREE (*(void **) (arg_list[ai]));
          POCL_MEM_FREE (arg_list[ai]);
        }
      else if (kernel->arg_info[ai].type == POCL_ARG_TYPE_POINTER
               && *(void **) arg_list[ai] == NULL)
        {
          /* Only the boxed-NULL case allocated anything for pointer args.  */
          POCL_MEM_FREE (arg_list[ai]);
        }
    }
  for (ai = kernel->num_args; ai < num_total; ++ai)
    {
      POCL_MEM_FREE (*(void **) (arg_list[ai]));
      POCL_MEM_FREE (arg_list[ai]);
    }
  free (arg_list);
}
/* Execute one kernel command on an HSA agent.

   Packs the kernel arguments into an AMDGPU-style kernarg buffer, builds an
   HSA dispatch packet, enqueues it on the agent's queue, rings the doorbell
   and blocks until the completion signal fires.

   Review fixes applied:
     - scalar arguments are copied with memcpy of exactly al->size bytes
       into a zeroed 32-bit slot instead of an out-of-bounds, aliasing
       *(uint32_t*) read;
     - buffer device pointers are stored with memcpy instead of a possibly
       misaligned *(uint64_t*) store into the 32-bit word array;
     - the per-dispatch completion signal is destroyed after the wait
       (previously leaked on every launch);
     - mojibake "&reg;ion" in the disabled region-query code restored to
       "&region"; unused 'sigval' local removed.  */
void
pocl_hsa_run (void *data, _cl_command_node *cmd)
{
  struct data *d;
  struct pocl_argument *al;
  unsigned i;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  hsa_signal_value_t initial_value = 1;
#if 0
  /* Not yet supported by the reference library. */
  hsa_kernel_dispatch_packet_t kernel_packet;
#else
  hsa_dispatch_packet_t kernel_packet;
#endif
  hsa_signal_t kernel_completion_signal = 0;
  hsa_region_t region;
  int error;
  amdgpu_args_t *args;
  /* 32b word offset in the kernel arguments buffer we can push the next
     argument to. */
  int args_offset = 0;

  assert (data != NULL);
  d = (struct data *) data;
  d->current_kernel = kernel;

  memset (&kernel_packet, 0, sizeof (hsa_dispatch_packet_t));

#if 0
  /* TODO: not yet supported by the open source runtime implementation.
     Assume the HSA Full profile so we can simply use host malloc(). */
  hsa_agent_iterate_regions (kernel_agent, pocl_hsa_get_kernarg, &region);

  if (hsa_memory_allocate (region, sizeof (amdgpu_args_t),
                           (void **) &args) != HSA_STATUS_SUCCESS)
    {
      assert (0 && "hsa_memory_allocate() failed.");
    }
#else
  args = malloc (sizeof (amdgpu_args_t));
  assert (args != NULL && "kernarg buffer allocation failed");
#endif

  kernel_packet.kernarg_address = (uint64_t) args;

  /* Process the kernel arguments. Convert the opaque buffer pointers to
     real device pointers, allocate dynamic local memory buffers, etc. */
  for (i = 0; i < kernel->num_args; ++i)
    {
      al = &(cmd->command.run.arguments[i]);
      if (kernel->arg_info[i].is_local)
        {
          POCL_ABORT_UNIMPLEMENTED ("pocl-hsa: local buffers not implemented.");
#if 0
          arguments[i] = malloc (sizeof (void *));
          *(void **) (arguments[i]) = pocl_hsa_malloc (data, 0, al->size, NULL);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
        {
          if (args_offset + 1 >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT ("pocl-hsa: too many kernel arguments!");
          /* Assuming the pointers are 64b (or actually the same as in
             host) due to HSA. TODO: the 32b profile. */
          if (al->value == NULL)
            {
              args->kernel_args[args_offset] = 0;
              args->kernel_args[args_offset + 1] = 0;
            }
          else
            {
              /* memcpy instead of a *(uint64_t*) store: kernel_args is an
                 array of 32-bit words, so a direct 64-bit store would be
                 misaligned whenever args_offset is odd (UB).  */
              uint64_t dev_ptr
                  = (uint64_t) (*(cl_mem *) (al->value))
                        ->device_ptrs[cmd->device->dev_id].mem_ptr;
              memcpy (&args->kernel_args[args_offset], &dev_ptr,
                      sizeof (dev_ptr));
            }
          args_offset += 2;
#if 0
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            {
              arguments[i] = malloc (sizeof (void *));
              *(void **) arguments[i] = NULL;
            }
          else
            arguments[i] =
              &((*(cl_mem *) (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
          POCL_ABORT_UNIMPLEMENTED ("hsa: image arguments not implemented.");
#if 0
          dev_image_t di;
          fill_dev_image_t (&di, al, cmd->device);
          void *devptr = pocl_hsa_malloc (data, 0, sizeof (dev_image_t), NULL);
          arguments[i] = malloc (sizeof (void *));
          *(void **) (arguments[i]) = devptr;
          pocl_hsa_write (data, &di, devptr, 0, sizeof (dev_image_t));
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
        {
          POCL_ABORT_UNIMPLEMENTED ("hsa: sampler arguments not implemented.");
#if 0
          dev_sampler_t ds;
          arguments[i] = malloc (sizeof (void *));
          *(void **) (arguments[i]) =
            pocl_hsa_malloc (data, 0, sizeof (dev_sampler_t), NULL);
          pocl_hsa_write (data, &ds, *(void **) arguments[i], 0,
                          sizeof (dev_sampler_t));
#endif
        }
      else
        {
          if (args_offset >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT ("pocl-hsa: too many kernel arguments!");
          /* Assuming the scalar fits to a 32b slot. TODO! */
          assert (al->size <= 4);
          /* Copy only al->size bytes into a zeroed slot: the previous
             *(uint32_t*)al->value read 4 bytes regardless of the actual
             scalar size (out-of-bounds read for char/short scalars) and
             violated alignment/aliasing rules.  */
          args->kernel_args[args_offset] = 0;
          memcpy (&args->kernel_args[args_offset], al->value, al->size);
          ++args_offset;
        }
    }

  for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i)
    {
      POCL_ABORT_UNIMPLEMENTED ("hsa: automatic local buffers not implemented.");
#if 0
      al = &(cmd->command.run.arguments[i]);
      arguments[i] = malloc (sizeof (void *));
      *(void **) (arguments[i]) = pocl_hsa_malloc (data, 0, al->size, NULL);
#endif
    }

  args->workgroup_size_x = kernel_packet.workgroup_size_x
      = cmd->command.run.local_x;
  args->workgroup_size_y = kernel_packet.workgroup_size_y
      = cmd->command.run.local_y;
  args->workgroup_size_z = kernel_packet.workgroup_size_z
      = cmd->command.run.local_z;

  kernel_packet.grid_size_x = pc->num_groups[0] * cmd->command.run.local_x;
  kernel_packet.grid_size_y = pc->num_groups[1] * cmd->command.run.local_y;
  kernel_packet.grid_size_z = pc->num_groups[2] * cmd->command.run.local_z;

  /* AMDGPU specific OpenCL argument data. */
  args->wgs_x = pc->num_groups[0];
  args->wgs_y = pc->num_groups[1];
  args->wgs_z = pc->num_groups[2];

  /* Infer the dimensionality from the local sizes: a >1 extent in z (or y)
     implies a 3D (or 2D) launch. */
  kernel_packet.dimensions = 1;
  if (cmd->command.run.local_y > 1)
    kernel_packet.dimensions = 2;
  if (cmd->command.run.local_z > 1)
    kernel_packet.dimensions = 3;

  kernel_packet.header.type = HSA_PACKET_TYPE_DISPATCH;
  kernel_packet.header.acquire_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.release_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.barrier = 1;

  kernel_packet.kernel_object_address
      = *(hsa_amd_code_t *) cmd->command.run.device_data[1];

  error = hsa_signal_create (initial_value, 0, NULL,
                             &kernel_completion_signal);
  assert (error == HSA_STATUS_SUCCESS);

  kernel_packet.completion_signal = kernel_completion_signal;

  {
    /* Launch the kernel by allocating a slot in the queue, writing the
       command to it, signaling the update with a door bell and finally,
       block waiting until finish signalled with the completion_signal. */
    const uint32_t queue_mask = d->queue->size - 1;
    uint64_t queue_index = hsa_queue_load_write_index_relaxed (d->queue);
    ((hsa_dispatch_packet_t *) (d->queue->base_address))[queue_index
                                                         & queue_mask]
        = kernel_packet;
    hsa_queue_store_write_index_relaxed (d->queue, queue_index + 1);
    hsa_signal_store_relaxed (d->queue->doorbell_signal, queue_index);
    /* Block until the packet has completed; the return value (the signal's
       value at wake-up) is not needed. */
    (void) hsa_signal_wait_acquire (kernel_completion_signal, HSA_EQ, 0,
                                    (uint64_t) (-1),
                                    HSA_WAIT_EXPECTANCY_UNKNOWN);
  }

  for (i = 0; i < kernel->num_args; ++i)
    {
      if (kernel->arg_info[i].is_local)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **) (arguments[i]));
          POCL_MEM_FREE (arguments[i]);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **) (arguments[i]));
          POCL_MEM_FREE (arguments[i]);
#endif
        }
#if 0
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER
               || (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER
                   && *(void **) args->kernel_args[i] == NULL))
        {
          POCL_MEM_FREE (arguments[i]);
        }
#endif
    }
  for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i)
    {
#if 0
      pocl_hsa_free (data, 0, *(void **) (arguments[i]));
      POCL_MEM_FREE (arguments[i]);
#endif
    }

  /* Previously the completion signal was leaked on every dispatch. */
  hsa_signal_destroy (kernel_completion_signal);
  free (args);
}