/* Report the host-visible channel count and per-channel element size
   (in bytes) of IMAGE.  Either output pointer may be NULL when the
   caller does not need that value.  Channel data types other than
   CL_FLOAT / CL_UNORM_INT8 and channel orders other than CL_RGBA /
   CL_R are not supported yet and abort. */
extern void
pocl_get_image_information (cl_mem image, int* channels_out,
                            int* elem_size_out)
{
  int elem_size;
  switch (image->image_channel_data_type)
    {
    case CL_FLOAT:      elem_size = 4; break;
    case CL_UNORM_INT8: elem_size = 1; break;
    default:            POCL_ABORT_UNIMPLEMENTED();
    }
  if (elem_size_out != NULL)
    *elem_size_out = elem_size;

  int channels;
  switch (image->image_channel_order)
    {
    case CL_RGBA: channels = 4; break;
    case CL_R:    channels = 1; break;
    default:      POCL_ABORT_UNIMPLEMENTED();
    }
  if (channels_out != NULL)
    *channels_out = channels;
}
/* Release a device-side buffer allocation for the Cell SPU driver.
   Currently unimplemented: the abort below always fires first, so the
   tentative logic after it is never reached. */
void
pocl_cellspu_free (void *data, cl_mem_flags flags, void *ptr)
{
  POCL_ABORT_UNIMPLEMENTED();
  /* Dead until the abort above is removed: memory owned by the host
     application (CL_MEM_USE_HOST_PTR) must not be freed by the
     driver. */
  if ((flags & CL_MEM_USE_HOST_PTR) == 0)
    free (ptr);
}
/* Execute every command node in NODE_LIST, in list order.
   Each command is dispatched to the matching device-driver hook, with
   the command's event transitioned to RUNNING before and COMPLETE
   after the call (via the POCL_UPDATE_EVENT_* macros, which rely on
   the local variables 'event' and 'command_queue' being in scope).
   References to buffers/kernels taken at enqueue time are dropped here
   once the command has finished. */
static void
exec_commands (_cl_command_node *node_list)
{
  int i;
  cl_event *event;
  _cl_command_node *node;
  cl_command_queue command_queue = NULL;

  LL_FOREACH (node_list, node)
    {
      event = &(node->event);
      /* Command queue is needed for POCL_UPDATE_EVENT macros */
      if (node->event)
        command_queue = node->event->queue;

      switch (node->type)
        {
        case CL_COMMAND_READ_BUFFER:
          POCL_UPDATE_EVENT_RUNNING;
          node->device->read
            (node->command.read.data,
             node->command.read.host_ptr,
             node->command.read.device_ptr,
             node->command.read.cb);
          POCL_UPDATE_EVENT_COMPLETE;
          /* Drop the reference taken when the command was enqueued. */
          POname(clReleaseMemObject) (node->command.read.buffer);
          break;
        case CL_COMMAND_WRITE_BUFFER:
          POCL_UPDATE_EVENT_RUNNING;
          node->device->write
            (node->command.write.data,
             node->command.write.host_ptr,
             node->command.write.device_ptr,
             node->command.write.cb);
          POCL_UPDATE_EVENT_COMPLETE;
          POname(clReleaseMemObject) (node->command.write.buffer);
          break;
        case CL_COMMAND_COPY_BUFFER:
          POCL_UPDATE_EVENT_RUNNING;
          node->device->copy
            (node->command.copy.data,
             node->command.copy.src_ptr,
             node->command.copy.dst_ptr,
             node->command.copy.cb);
          POCL_UPDATE_EVENT_COMPLETE;
          POname(clReleaseMemObject) (node->command.copy.src_buffer);
          POname(clReleaseMemObject) (node->command.copy.dst_buffer);
          break;
        case CL_COMMAND_MAP_BUFFER:
          POCL_UPDATE_EVENT_RUNNING;
          pocl_map_mem_cmd (node->device, node->command.map.buffer,
                            node->command.map.mapping);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_MAP_IMAGE:
          /* Mapping an image is implemented as a rectangular read into
             the map pointer. */
          POCL_UPDATE_EVENT_RUNNING;
          node->device->read_rect
            (node->command.map_image.data,
             node->command.map_image.map_ptr,
             node->command.map_image.device_ptr,
             node->command.map_image.origin,
             node->command.map_image.origin,
             node->command.map_image.region,
             node->command.map_image.rowpitch,
             node->command.map_image.slicepitch,
             node->command.map_image.rowpitch,
             node->command.map_image.slicepitch);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_WRITE_IMAGE:
          /* NOTE(review): reuses the map_image union member for its
             parameters rather than a dedicated write_image one --
             appears intentional, but verify against the enqueue side. */
          POCL_UPDATE_EVENT_RUNNING;
          node->device->write_rect
            (node->command.map_image.data,
             node->command.map_image.map_ptr,
             node->command.map_image.device_ptr,
             node->command.map_image.origin,
             node->command.map_image.origin,
             node->command.map_image.region,
             node->command.map_image.rowpitch,
             node->command.map_image.slicepitch,
             node->command.map_image.rowpitch,
             node->command.map_image.slicepitch);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_READ_IMAGE:
          POCL_UPDATE_EVENT_RUNNING;
          node->device->read_rect
            (node->command.map_image.data,
             node->command.map_image.map_ptr,
             node->command.map_image.device_ptr,
             node->command.map_image.origin,
             node->command.map_image.origin,
             node->command.map_image.region,
             node->command.map_image.rowpitch,
             node->command.map_image.slicepitch,
             node->command.map_image.rowpitch,
             node->command.map_image.slicepitch);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_UNMAP_MEM_OBJECT:
          POCL_UPDATE_EVENT_RUNNING;
          if ((node->command.unmap.memobj)->flags &
              (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))
            {
              /* TODO: should we ensure the device global region is
                 updated from the host memory? How does the specs define
                 it, can the host_ptr be assumed to point to the host and
                 the device accessible memory or just point there until
                 the kernel(s) get executed or similar? */
              /* Assume the region is automatically up to date. */
            }
          else
            {
              /* TODO: fixme. The offset computation must be done at the
                 device driver. */
              if (node->device->unmap_mem != NULL)
                node->device->unmap_mem
                  (node->device->data,
                   (node->command.unmap.mapping)->host_ptr,
                   (node->command.unmap.memobj)->device_ptrs[node->device->dev_id],
                   (node->command.unmap.mapping)->size);
            }
          /* Remove the mapping from the mem object's bookkeeping. */
          DL_DELETE((node->command.unmap.memobj)->mappings,
                    node->command.unmap.mapping);
          (node->command.unmap.memobj)->map_count--;
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_NDRANGE_KERNEL:
          assert (*event == node->event);
          POCL_UPDATE_EVENT_RUNNING;
          node->device->run(node->command.run.data, node);
          POCL_UPDATE_EVENT_COMPLETE;
          /* Release the buffer arguments retained at enqueue time. */
          for (i = 0; i < node->command.run.arg_buffer_count; ++i)
            {
              cl_mem buf = node->command.run.arg_buffers[i];
              if (buf == NULL) continue;
              /*printf ("### releasing arg %d - the buffer %x of kernel %s\n", i, buf, node->command.run.kernel->function_name);*/
              POname(clReleaseMemObject) (buf);
            }
          free (node->command.run.arg_buffers);
          free (node->command.run.tmp_dir);
          /* Free the snapshotted argument values (both explicit args
             and automatic locals). */
          for (i = 0; i < node->command.run.kernel->num_args +
                 node->command.run.kernel->num_locals; ++i)
            {
              pocl_aligned_free (node->command.run.arguments[i].value);
            }
          free (node->command.run.arguments);
          POname(clReleaseKernel)(node->command.run.kernel);
          break;
        case CL_COMMAND_FILL_IMAGE:
          POCL_UPDATE_EVENT_RUNNING;
          node->device->fill_rect
            (node->command.fill_image.data,
             node->command.fill_image.device_ptr,
             node->command.fill_image.buffer_origin,
             node->command.fill_image.region,
             node->command.fill_image.rowpitch,
             node->command.fill_image.slicepitch,
             node->command.fill_image.fill_pixel,
             node->command.fill_image.pixel_size);
          /* fill_pixel was heap-allocated by the enqueue path. */
          free(node->command.fill_image.fill_pixel);
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        case CL_COMMAND_MARKER:
          /* A marker carries no work; it only transitions its event. */
          POCL_UPDATE_EVENT_RUNNING;
          POCL_UPDATE_EVENT_COMPLETE;
          break;
        default:
          POCL_ABORT_UNIMPLEMENTED();
          break;
        }
    }
/* NOTE(review): the closing brace of exec_commands is not visible in
   this chunk of the file -- it appears truncated here; confirm against
   the full source. */
/* Synchronously execute one ND-range kernel command on the HSA device.
   Builds an AMD dispatch packet plus an amdgpu_args_t kernel-argument
   block, writes the packet into the device's user-mode queue, rings the
   doorbell and blocks until the completion signal fires.

   data - driver-private 'struct data' for this device (must be non-NULL).
   cmd  - the CL_COMMAND_NDRANGE_KERNEL node to run.

   Supported arguments: 64b global buffer pointers (two 32b words each)
   and scalars that fit a single 32b word.  Local buffers, images,
   samplers and automatic locals abort as unimplemented (their intended
   implementations are kept in #if 0 blocks below). */
void
pocl_hsa_run (void *data, _cl_command_node* cmd)
{
  struct data *d;
  struct pocl_argument *al;
  unsigned i;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  hsa_signal_value_t initial_value = 1;
#if 0
  /* Not yet supported by the reference library. */
  hsa_kernel_dispatch_packet_t kernel_packet;
#else
  hsa_dispatch_packet_t kernel_packet;
#endif
  hsa_signal_t kernel_completion_signal = 0;
  hsa_region_t region;
  int error;
  amdgpu_args_t *args;
  /* 32b word offset in the kernel arguments buffer we can push the next
     argument to. */
  int args_offset = 0;

  assert (data != NULL);
  d = (struct data *) data;
  d->current_kernel = kernel;

  memset (&kernel_packet, 0, sizeof (hsa_dispatch_packet_t));

#if 0
  /* TODO: not yet supported by the open source runtime implementation.
     Assume the HSA Full profile so we can simply use host malloc(). */
  hsa_agent_iterate_regions(kernel_agent, pocl_hsa_get_kernarg, &region);
  if (hsa_memory_allocate(region, sizeof(amdgpu_args_t),
                          (void**)&args) != HSA_STATUS_SUCCESS)
    {
      assert (0 && "hsa_memory_allocate() failed.");
    }
#else
  /* NOTE(review): malloc result is not checked before use. */
  args = (amdgpu_args_t*)malloc(sizeof(amdgpu_args_t));
#endif

  kernel_packet.kernarg_address = (uint64_t)args;

  /* Process the kernel arguments. Convert the opaque buffer pointers to
     real device pointers, allocate dynamic local memory buffers, etc. */
  for (i = 0; i < kernel->num_args; ++i)
    {
      al = &(cmd->command.run.arguments[i]);
      if (kernel->arg_info[i].is_local)
        {
          POCL_ABORT_UNIMPLEMENTED("pocl-hsa: local buffers not implemented.");
#if 0
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = pocl_hsa_malloc(data, 0, al->size, NULL);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
        {
          /* A pointer consumes two 32b words in the argument block. */
          if (args_offset + 1 >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT("pocl-hsa: too many kernel arguments!");
          /* Assuming the pointers are 64b (or actually the same as in
             host) due to HSA. TODO: the 32b profile. */
          if (al->value == NULL)
            {
              /* NULL argument passed via clSetKernelArg: forward it. */
              args->kernel_args[args_offset] = 0;
              args->kernel_args[args_offset + 1] = 0;
            }
          else
            {
              *(uint64_t*)&args->kernel_args[args_offset] =
                (uint64_t)(*(cl_mem *) (al->value))->
                  device_ptrs[cmd->device->dev_id].mem_ptr;
            }
          args_offset += 2;
#if 0
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            {
              arguments[i] = malloc (sizeof (void *));
              *(void **)arguments[i] = NULL;
            }
          else
            arguments[i] =
              &((*(cl_mem *) (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
          POCL_ABORT_UNIMPLEMENTED("hsa: image arguments not implemented.");
#if 0
          dev_image_t di;
          fill_dev_image_t (&di, al, cmd->device);
          void* devptr = pocl_hsa_malloc (data, 0, sizeof(dev_image_t), NULL);
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) = devptr;
          pocl_hsa_write (data, &di, devptr, 0, sizeof(dev_image_t));
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
        {
          POCL_ABORT_UNIMPLEMENTED("hsa: sampler arguments not implemented.");
#if 0
          dev_sampler_t ds;
          arguments[i] = malloc (sizeof (void *));
          *(void **)(arguments[i]) =
            pocl_hsa_malloc (data, 0, sizeof(dev_sampler_t), NULL);
          pocl_hsa_write (data, &ds, *(void**)arguments[i], 0,
                          sizeof(dev_sampler_t));
#endif
        }
      else
        {
          /* Plain scalar: one 32b word. */
          if (args_offset >= MAX_KERNEL_ARG_WORDS)
            POCL_ABORT("pocl-hsa: too many kernel arguments!");
          /* Assuming the scalar fits to a 32b slot. TODO! */
          assert (al->size <= 4);
          args->kernel_args[args_offset] = *(uint32_t*)al->value;
          ++args_offset;
        }
    }
  for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i)
    {
      POCL_ABORT_UNIMPLEMENTED("hsa: automatic local buffers not implemented.");
#if 0
      al = &(cmd->command.run.arguments[i]);
      arguments[i] = malloc (sizeof (void *));
      *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, al->size, NULL);
#endif
    }

  /* Work-group geometry goes both into the dispatch packet and the
     AMDGPU argument block. */
  args->workgroup_size_x = kernel_packet.workgroup_size_x =
    cmd->command.run.local_x;
  args->workgroup_size_y = kernel_packet.workgroup_size_y =
    cmd->command.run.local_y;
  args->workgroup_size_z = kernel_packet.workgroup_size_z =
    cmd->command.run.local_z;

  kernel_packet.grid_size_x = pc->num_groups[0] * cmd->command.run.local_x;
  kernel_packet.grid_size_y = pc->num_groups[1] * cmd->command.run.local_y;
  kernel_packet.grid_size_z = pc->num_groups[2] * cmd->command.run.local_z;

  /* AMDGPU specific OpenCL argument data. */
  args->wgs_x = pc->num_groups[0];
  args->wgs_y = pc->num_groups[1];
  args->wgs_z = pc->num_groups[2];

  /* Dimensionality is inferred from the local sizes. */
  kernel_packet.dimensions = 1;
  if (cmd->command.run.local_y > 1) kernel_packet.dimensions = 2;
  if (cmd->command.run.local_z > 1) kernel_packet.dimensions = 3;

  kernel_packet.header.type = HSA_PACKET_TYPE_DISPATCH;
  kernel_packet.header.acquire_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.release_fence_scope = HSA_FENCE_SCOPE_SYSTEM;
  kernel_packet.header.barrier = 1;

  kernel_packet.kernel_object_address =
    *(hsa_amd_code_t*)cmd->command.run.device_data[1];

  error = hsa_signal_create(initial_value, 0, NULL,
                            &kernel_completion_signal);
  assert (error == HSA_STATUS_SUCCESS);

  kernel_packet.completion_signal = kernel_completion_signal;

  {
    /* Launch the kernel by allocating a slot in the queue, writing the
       command to it, signaling the update with a door bell and finally,
       block waiting until finish signalled with the
       completion_signal. */
    const uint32_t queue_mask = d->queue->size - 1;
    uint64_t queue_index = hsa_queue_load_write_index_relaxed(d->queue);
    hsa_signal_value_t sigval;
    ((hsa_dispatch_packet_t*)(d->queue->base_address))[queue_index & queue_mask] =
      kernel_packet;
    hsa_queue_store_write_index_relaxed(d->queue, queue_index + 1);
    hsa_signal_store_relaxed(d->queue->doorbell_signal, queue_index);
    /* NOTE(review): sigval is assigned but never inspected. */
    sigval = hsa_signal_wait_acquire(kernel_completion_signal, HSA_EQ, 0,
                                     (uint64_t)(-1),
                                     HSA_WAIT_EXPECTANCY_UNKNOWN);
  }

  /* Per-argument cleanup; currently all paths are disabled because the
     corresponding argument kinds abort above. */
  for (i = 0; i < kernel->num_args; ++i)
    {
      if (kernel->arg_info[i].is_local)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **)(arguments[i]));
          POCL_MEM_FREE(arguments[i]);
#endif
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
#if 0
          pocl_hsa_free (data, 0, *(void **)(arguments[i]));
          POCL_MEM_FREE(arguments[i]);
#endif
        }
#if 0
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER ||
               (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER &&
                *(void**)args->kernel_args[i] == NULL))
        {
          POCL_MEM_FREE(arguments[i]);
        }
#endif
    }
  for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i)
    {
#if 0
      pocl_hsa_free(data, 0, *(void **)(arguments[i]));
      POCL_MEM_FREE(arguments[i]);
#endif
    }
  free(args);
}
/* Execute one ND-range kernel command on the Cell SPU device.
   If no compiled SPU module exists yet in the command's temp dir, it is
   built by shelling out to pocl-llvm-ld / llc / spu-gcc; the resulting
   binary is then loaded into the SPE, the kernel arguments and metadata
   are written to SPE memory, and the SPE program is run synchronously.

   data - driver-private 'struct data' (must be non-NULL).
   cmd  - the CL_COMMAND_NDRANGE_KERNEL node to run. */
void
pocl_cellspu_run (void *data, _cl_command_node* cmd)
{
  struct data *d;
  int error;
  char bytecode[POCL_FILENAME_LENGTH];
  char assembly[POCL_FILENAME_LENGTH];
  char module[POCL_FILENAME_LENGTH];
  char command[COMMAND_LENGTH];
  char workgroup_string[WORKGROUP_STRING_LENGTH];
  unsigned device;
  struct pocl_argument *al;
  size_t x, y, z;
  unsigned i;
  pocl_workgroup w;
  char* tmpdir = cmd->command.run.tmp_dir;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;
  const char* kern_func = kernel->function_name;
  unsigned int entry = SPE_DEFAULT_ENTRY;

  assert (data != NULL);
  d = (struct data *) data;

  error = snprintf (module, POCL_FILENAME_LENGTH, "%s/parallel.so", tmpdir);
  assert (error >= 0);

  // This is the entry to the kernel. We currently hard-code it
  // into the SPU binary. Resulting in only one entry-point per
  // SPU image.
  // TODO: figure out which function to call given what conditions
  snprintf (workgroup_string, WORKGROUP_STRING_LENGTH,
            "_%s_workgroup_fast", kernel->function_name);

  /* Build the SPU module only if it is not already cached on disk. */
  if ( access (module, F_OK) != 0)
    {
      char *llvm_ld;
      error = snprintf (bytecode, POCL_FILENAME_LENGTH,
                        "%s/linked.bc", tmpdir);
      assert (error >= 0);

      /* Pick the linker: in-tree build, installed libexec copy, or PATH. */
      if (getenv("POCL_BUILDING") != NULL)
        llvm_ld = BUILDDIR "/tools/llvm-ld/pocl-llvm-ld";
      else if (access(PKGLIBEXECDIR "/pocl-llvm-ld", X_OK) == 0)
        llvm_ld = PKGLIBEXECDIR "/pocl-llvm-ld";
      else
        llvm_ld = "pocl-llvm-ld";

      error = snprintf (command, COMMAND_LENGTH,
                        "%s --disable-opt -link-as-library -o %s %s/%s",
                        llvm_ld, bytecode, tmpdir,
                        POCL_PARALLEL_BC_FILENAME);
      assert (error >= 0);
      error = system(command);
      assert (error == 0);

      error = snprintf (assembly, POCL_FILENAME_LENGTH,
                        "%s/parallel.s", tmpdir);
      assert (error >= 0);

      // "-relocation-model=dynamic-no-pic" is a magic string,
      // I do not know why it has to be there to produce valid
      // sos on x86_64
      error = snprintf (command, COMMAND_LENGTH,
                        LLC " " HOST_LLC_FLAGS " -o %s %s",
                        assembly, bytecode);
      assert (error >= 0);
      error = system (command);
      assert (error == 0);

      // Compile the assembly version of the OCL kernel with the
      // C wrapper to get a spulet
      error = snprintf (command, COMMAND_LENGTH,
                        "spu-gcc lib/CL/devices/cellspu/spe_wrap.c -o %s %s "
                        " -Xlinker --defsym -Xlinker _ocl_buffer=%d"
                        " -Xlinker --defsym -Xlinker kernel_command=%d"
                        " -I . -D_KERNEL=%s -std=c99",
                        module, assembly,
                        CELLSPU_OCL_BUFFERS_START,
                        CELLSPU_KERNEL_CMD_ADDR,
                        workgroup_string);
      assert (error >= 0);
#ifdef DEBUG_CELLSPU_DRIVER
      printf("compiling: %s\n", command);
      fflush(stdout);
#endif
      error = system (command);
      assert (error == 0);
    }

  // Load the SPU with the newly generated binary
  hello_spu = spe_image_open( (const char*)module );

  if( spe_program_load( spe_context, hello_spu) )
    perror("spe_program_load fails");

//
//	/* Find which device number within the context correspond
//	   to current device.  */
//	for (i = 0; i < kernel->context->num_devices; ++i)
//	  {
//	    if (kernel->context->devices[i]->data == data)
//	      {
//	        device = i;
//	        break;
//	      }
//	  }
//

  // This structure gets passed to the device.
  // It contains all the info needed to run a kernel
  __kernel_exec_cmd dev_cmd;
  dev_cmd.work_dim = cmd->command.run.pc.work_dim;
  dev_cmd.num_groups[0] = cmd->command.run.pc.num_groups[0];
  dev_cmd.num_groups[1] = cmd->command.run.pc.num_groups[1];
  dev_cmd.num_groups[2] = cmd->command.run.pc.num_groups[2];

  dev_cmd.global_offset[0] = cmd->command.run.pc.global_offset[0];
  dev_cmd.global_offset[1] = cmd->command.run.pc.global_offset[1];
  dev_cmd.global_offset[2] = cmd->command.run.pc.global_offset[2];

  // the code below is lifted from pthreads :)
  uint32_t *arguments = dev_cmd.args;

  /* Marshal the kernel arguments into the 32b word array sent to the
     SPE.  Locals get SPE-local allocations, buffers get their device
     chunk start addresses, scalars are passed by (host) pointer. */
  for (i = 0; i < kernel->num_args; ++i)
    {
      al = &(kernel->dyn_arguments[i]);
      if (kernel->arg_is_local[i])
        {
          chunk_info_t* local_chunk = cellspu_malloc_local (d, al->size);
          if (local_chunk == NULL)
            POCL_ABORT ("Could not allocate memory for a local argument. Out of local mem?\n");

          dev_cmd.args[i] = local_chunk->start_address;
        }
      else if (kernel->arg_is_pointer[i])
        {
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            arguments[i] = (uint32_t)NULL;
          else
            arguments[i] =
              ((chunk_info_t*)((*(cl_mem *)
                (al->value))->device_ptrs[0]))->start_address;
          //TODO: '0' above is the device number... don't hard-code!
        }
      else if (kernel->arg_is_image[i])
        {
          POCL_ABORT_UNIMPLEMENTED();
//          dev_image2d_t di;
//          cl_mem mem = *(cl_mem*)al->value;
//          di.data = &((*(cl_mem *) (al->value))->device_ptrs[device]);
//          di.data = ((*(cl_mem *) (al->value))->device_ptrs[device]);
//          di.width = mem->image_width;
//          di.height = mem->image_height;
//          di.rowpitch = mem->image_row_pitch;
//          di.order = mem->image_channel_order;
//          di.data_type = mem->image_channel_data_type;
//          void* devptr = pocl_cellspu_malloc(data, 0, sizeof(dev_image2d_t), NULL);
//          arguments[i] = malloc (sizeof (void *));
//          *(void **)(arguments[i]) = devptr;
//          pocl_cellspu_write (data, &di, devptr, sizeof(dev_image2d_t));
        }
      else if (kernel->arg_is_sampler[i])
        {
          POCL_ABORT_UNIMPLEMENTED();
//          dev_sampler_t ds;
//
//          arguments[i] = malloc (sizeof (void *));
//          *(void **)(arguments[i]) = pocl_cellspu_malloc(data, 0, sizeof(dev_sampler_t), NULL);
//          pocl_cellspu_write (data, &ds, *(void**)arguments[i], sizeof(dev_sampler_t));
        }
      else
        {
          arguments[i] = (uint32_t)al->value;
        }
    }

  // allocate memory for kernel local variables
  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    {
      al = &(kernel->dyn_arguments[i]);
      arguments[i] = (uint32_t)malloc (sizeof (void *));
      *(void **)(arguments[i]) = cellspu_malloc_local(data, al->size);
    }

  // the main loop on the spe needs an auxiliary struct for to get the
  // number of arguments and such.
  __kernel_metadata kmd;
  strncpy( kmd.name, workgroup_string, sizeof( kmd.name ) );
  kmd.num_args = kernel->num_args;
  kmd.num_locals = kernel->num_locals;
  // TODO: fill in the rest, if used by the spu main function.

  // TODO malloc_local should be given the 'device data'. as long as the
  // spu context is global this is ok.
  void *chunk = cellspu_malloc_local( NULL, sizeof(__kernel_metadata) );
  void *kernel_area = ((chunk_info_t*)chunk)->start_address;
  cellspu_memwrite( kernel_area, &kmd, sizeof(__kernel_metadata) );
  dev_cmd.kernel = kernel_area;

  // finish up the command, send it to SPE
  dev_cmd.status = POCL_KST_READY;
  cellspu_memwrite( (void*)CELLSPU_KERNEL_CMD_ADDR, &dev_cmd,
                    sizeof(__kernel_exec_cmd) );

  // Execute code on SPU. This starts with the main() in the spu - see spe_wrap.c
  if (spe_context_run(spe_context,&entry,0,NULL,NULL,NULL) < 0)
    perror("context_run error");

//	for (z = 0; z < pc->num_groups[2]; ++z)
//	  {
//	    for (y = 0; y < pc->num_groups[1]; ++y)
//	      {
//	        for (x = 0; x < pc->num_groups[0]; ++x)
//	          {
//	            pc->group_id[0] = x;
//	            pc->group_id[1] = y;
//	            pc->group_id[2] = z;
//
//	            w (arguments, pc);
//
//	          }
//	      }
//	  }

  // Clean-up ?
  for (i = 0; i < kernel->num_args; ++i)
    {
      if (kernel->arg_is_local[i])
        pocl_cellspu_free(data, 0, *(void **)(arguments[i]));
    }
  for (i = kernel->num_args;
       i < kernel->num_args + kernel->num_locals;
       ++i)
    pocl_cellspu_free(data, 0, *(void **)(arguments[i]));
}
/* Write a host-side image described by PTR (layout given by the image's
   channel order/type) into the device backing store of IMAGE.

   The device-side layout is hard-coded to four cl_float channels per
   pixel: host pixels are converted into that layout in a temporary
   buffer which is then transferred with the device's write_rect hook.
   Only CL_FLOAT / CL_UNORM_INT8 data types and CL_RGBA / CL_R channel
   orders are supported; anything else aborts (here or in
   pocl_get_image_information).

   NOTE(review): host_row_pitch and host_slice_pitch are currently
   ignored -- the host buffer is assumed tightly packed.  TODO: honor
   them per the clEnqueueWriteImage contract.

   Returns CL_SUCCESS, or CL_INVALID_MEM_OBJECT / CL_INVALID_VALUE /
   CL_OUT_OF_HOST_MEMORY on error. */
extern cl_int
pocl_write_image (cl_mem image,
                  cl_device_id device_id,
                  const size_t * origin_, /*[3]*/
                  const size_t * region_, /*[3]*/
                  size_t host_row_pitch,
                  size_t host_slice_pitch,
                  const void * ptr)
{
  if (image == NULL)
    return CL_INVALID_MEM_OBJECT;
  if ((ptr == NULL) || (region_ == NULL))
    return CL_INVALID_VALUE;

  int width = image->image_width;
  int height = image->image_height;
  cl_channel_order order = image->image_channel_order;
  cl_channel_type type = image->image_channel_data_type;

  /* Device pixels are always stored as 4 x cl_float. */
  size_t dev_elem_size = sizeof(cl_float);
  int dev_channels = 4;

  int host_elem_size;
  int host_channels;
  pocl_get_image_information (image, &host_channels, &host_elem_size);

  /* Scale the x coordinate into device bytes; y/z stay in rows/slices. */
  size_t origin[3] = { origin_[0]*dev_elem_size*dev_channels,
                       origin_[1], origin_[2] };
  size_t region[3] = { region_[0]*dev_elem_size*dev_channels,
                       region_[1], region_[2] };

  size_t image_row_pitch = width*dev_elem_size*dev_channels;
  size_t image_slice_pitch = 0;

  /* Reject a region whose last addressed byte falls outside the
     device buffer. */
  if ((region[0]*region[1]*region[2] > 0) &&
      (region[0]-1 +
       image_row_pitch * (region[1]-1) +
       image_slice_pitch * (region[2]-1) >= image->size))
    return CL_INVALID_VALUE;

  /* calloc replaces the previous malloc + O(width*height) manual
     zero-fill loop: the conversion loop below overwrites every element
     of the full image anyway, and calloc additionally checks the size
     multiplication for overflow. */
  cl_float* temp = calloc ((size_t)width * height * dev_channels,
                           dev_elem_size);
  if (temp == NULL)
    return CL_OUT_OF_HOST_MEMORY;

  int x, y, k;
  /* Convert every host pixel into the device's RGBA-float layout. */
  for (y=0; y<height; y++)
    {
      for (x=0; x<width; x++)
        {
          cl_float elem[4]; //TODO 0,0,0,0 for some modes?
          for (k=0; k<host_channels; k++)
            {
              if (type == CL_FLOAT)
                elem[k] = ((float*)ptr)[k+(x+y*width)*host_channels];
              else if (type==CL_UNORM_INT8)
                {
                  /* Normalize 0..255 into 0.0..1.0. */
                  cl_uchar foo = ((cl_uchar*)ptr)[k+(x+y*width)*host_channels];
                  elem[k] = (float)(foo) * (1.f/255.f);
                }
              else
                POCL_ABORT_UNIMPLEMENTED();
            }
          if (order == CL_RGBA)
            {
              for (k=0; k<4; k++)
                temp[(x+y*width)*dev_channels+k] = elem[k];
            }
          else if (order == CL_R)
            {
              /* Single-channel image: expand to (r, 0, 0, 1). */
              temp[(x+y*width)*dev_channels+0] = elem[0];
              temp[(x+y*width)*dev_channels+1] = 0.f;
              temp[(x+y*width)*dev_channels+2] = 0.f;
              temp[(x+y*width)*dev_channels+3] = 1.f;
            }
        }
    }

  device_id->write_rect(device_id->data, temp,
                        image->device_ptrs[device_id->dev_id],
                        origin, origin, region,
                        image_row_pitch, image_slice_pitch,
                        image_row_pitch, image_slice_pitch);

  free (temp);
  return CL_SUCCESS;
}
/* Read the device backing store of IMAGE (stored as 4 x cl_float per
   pixel) into the host buffer PTR, converting to the image's host-side
   channel order/type.

   NOTE(review): host_row_pitch is applied as an *element* index into
   ptr, not a byte pitch, and defaults to a tightly-packed row; this
   matches bytes only for 1-byte channel types -- confirm against the
   clEnqueueReadImage pitch semantics.  host_slice_pitch is ignored.

   Returns CL_SUCCESS, or CL_INVALID_MEM_OBJECT / CL_INVALID_VALUE /
   CL_OUT_OF_HOST_MEMORY on error. */
extern cl_int
pocl_read_image (cl_mem image,
                 cl_device_id device_id,
                 const size_t * origin_, /*[3]*/
                 const size_t * region_, /*[3]*/
                 size_t host_row_pitch,
                 size_t host_slice_pitch,
                 void * ptr)
{
  if (image == NULL)
    return CL_INVALID_MEM_OBJECT;
  if ((ptr == NULL) || (region_ == NULL))
    return CL_INVALID_VALUE;

  int width = image->image_width;
  int height = image->image_height;
  /* Device pixels are always 4 x cl_float. */
  int dev_elem_size = sizeof(cl_float);
  int dev_channels = 4;

  /* Scale the x coordinate into device bytes; y/z stay in rows/slices. */
  size_t origin[3] = { origin_[0]*dev_elem_size*dev_channels,
                       origin_[1], origin_[2] };
  size_t region[3] = { region_[0]*dev_elem_size*dev_channels,
                       region_[1], region_[2] };

  size_t image_row_pitch = width*dev_elem_size*dev_channels;
  size_t image_slice_pitch = 0;

  /* Reject a region whose last addressed byte falls outside the
     device buffer. */
  if ((region[0]*region[1]*region[2] > 0) &&
      (region[0]-1 +
       image_row_pitch * (region[1]-1) +
       image_slice_pitch * (region[2]-1) >= image->size))
    return CL_INVALID_VALUE;

  int i, j, k;
  cl_channel_order order = image->image_channel_order;
  cl_channel_type type = image->image_channel_data_type;

  cl_float* temp = malloc( width*height*dev_channels*dev_elem_size );
  if (temp == NULL)
    return CL_OUT_OF_HOST_MEMORY;

  int host_channels, host_elem_size;
  pocl_get_image_information(image, &host_channels, &host_elem_size);

  /* Default: tightly packed host rows (in elements, see NOTE above). */
  if (host_row_pitch == 0)
    {
      host_row_pitch = width*host_channels;
    }

  size_t buffer_origin[3] = { 0, 0, 0 };

  /* Pull the requested region from the device into temp. */
  device_id->read_rect(device_id->data, temp,
                       image->device_ptrs[device_id->dev_id],
                       origin, origin, region,
                       image_row_pitch, image_slice_pitch,
                       image_row_pitch, image_slice_pitch);

  /* Convert each device RGBA-float pixel to the host format. */
  for (j=0; j<height; j++)
    {
      for (i=0; i<width; i++)
        {
          cl_float elem[4];
          for (k=0; k<4; k++)
            elem[k]=0;

          if (order == CL_RGBA)
            {
              for (k=0; k<4; k++)
                elem[k] = temp[i*dev_channels + j*width*dev_channels + k];
            }
          else if (order == CL_R)
            {
              // host_channels == 1
              elem[0] = temp[i*dev_channels + j*width*dev_channels + 0];
            }

          if (type == CL_UNORM_INT8)
            {
              /* NOTE(review): the original "host_channels == 4" comment
                 here was misleading -- host_channels follows the channel
                 order, not the data type. */
              for (k=0; k<host_channels; k++)
                {
                  ((cl_uchar*)ptr)[i*host_channels + j*host_row_pitch + k] =
                    (unsigned char)(255*elem[k]);
                }
            }
          else if (type == CL_FLOAT)
            {
              for (k=0; k<host_channels; k++)
                {
                  /* NOTE(review): this abort fires before the store
                     below, so CL_FLOAT reads always abort even though
                     conversion code exists -- possibly left in place
                     because of the pitch-unit ambiguity noted above.
                     Confirm before removing. */
                  POCL_ABORT_UNIMPLEMENTED();
                  ((cl_float*)ptr)[i*host_channels + j*host_row_pitch + k] =
                    elem[k];
                }
            }
          else
            POCL_ABORT_UNIMPLEMENTED();
        }
    }
  free (temp);
  return CL_SUCCESS;
}