/**
 * Provide the backing storage for a buffer.
 *
 * Behaviour by flag:
 *  - CL_MEM_COPY_HOST_PTR: a new block aligned to MAX_EXTENDED_ALIGNMENT is
 *    allocated and `size` bytes are copied into it from `host_ptr`.  The copy
 *    reads `host_ptr` unconditionally, so the caller must pass a readable
 *    pointer when this flag is set.
 *  - CL_MEM_USE_HOST_PTR together with a non-NULL `host_ptr` (and no copy
 *    flag): `host_ptr` itself is returned, nothing is allocated.
 *  - anything else: a new, uninitialised aligned block is returned.
 *
 * @param device_data  not used by this function.
 * @param flags        buffer creation flags.
 * @param size         number of bytes to allocate (and copy, if requested).
 * @param host_ptr     host memory to copy from or to hand back.
 * @return the storage pointer, or NULL when the allocation fails.
 */
void *
pocl_hsa_malloc (void *device_data, cl_mem_flags flags, size_t size,
                 void *host_ptr)
{
  const int wants_copy = (flags & CL_MEM_COPY_HOST_PTR) != 0;
  const int wants_host = (flags & CL_MEM_USE_HOST_PTR) != 0;

  /* The copy flag wins over the use-host flag, so host memory is only
     handed back when no copy was requested. */
  if (!wants_copy && wants_host && host_ptr != NULL)
    return host_ptr;

  void *storage = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, size);

  if (storage != NULL && wants_copy)
    memcpy (storage, host_ptr, size);

  return storage;
}
/**
 * Allocate `size` bytes whose address is a multiple of `alignment`.
 *
 * `size` must be a multiple of `alignment`, and `alignment` must be a power
 * of two; otherwise the call fails with errno set to EINVAL.
 *
 * With HAVE_POSIX_MEMALIGN the request is forwarded to pocl_memalign_alloc
 * after raising `alignment` to at least sizeof(void *).
 *
 * Without it, a plain malloc block is over-allocated and the pointer that
 * malloc returned is stored in the sizeof(void *) bytes immediately before
 * the address handed to the caller.  That stored pointer, not the returned
 * address, is what must eventually be given to free().  In this mode a
 * zero `size` is accepted with any `alignment`.
 *
 * @param alignment  required alignment in bytes.
 * @param size       number of bytes requested.
 * @return the aligned pointer, or NULL on failure.  errno is EINVAL for a
 *         rejected alignment/size combination and ENOMEM when the padded
 *         request would exceed SIZE_MAX or (posix path) the allocator
 *         returned NULL.
 */
void *
pocl_aligned_malloc (size_t alignment, size_t size)
{
#ifdef HAVE_POSIX_MEMALIGN
  /* make sure that size is a multiple of alignment, as posix_memalign
   * does not perform this test, whereas aligned_alloc does */
  if ((size & (alignment - 1)) != 0)
    {
      errno = EINVAL;
      return NULL;
    }

  /* posix_memalign requires alignment to be at least sizeof(void *) */
  if (alignment < sizeof (void *))
    alignment = sizeof (void *);

  /* Reject non-power-of-two alignments here so that the caller sees the
   * same EINVAL as in the fallback path instead of a generic failure. */
  if ((alignment & (alignment - 1)) != 0)
    {
      errno = EINVAL;
      return NULL;
    }

  void *result = pocl_memalign_alloc (alignment, size);
  if (result == NULL)
    {
      /* errno must hold a valid (positive) error code; the parameters were
       * validated above, so report the failure as out-of-memory. */
      errno = ENOMEM;
      return NULL;
    }

  return result;
#else
  /* allow zero-sized allocations, force alignment to 1 */
  if (!size)
    alignment = 1;

  /* make sure alignment is a non-zero power of two and that
   * size is a multiple of alignment */
  size_t mask = alignment - 1;
  if (!alignment || ((alignment & mask) != 0) || ((size & mask) != 0))
    {
      errno = EINVAL;
      return NULL;
    }

  /* The padded request must not wrap around SIZE_MAX: a wrapped value would
   * make malloc return a tiny block and the header store below would write
   * outside of it. */
  if (size > SIZE_MAX - mask - sizeof (void *))
    {
      errno = ENOMEM;
      return NULL;
    }

  /* allocate memory plus space for alignment header */
  uintptr_t address = (uintptr_t) malloc (size + mask + sizeof (void *));
  if (!address)
    return NULL;

  /* align the address, and store original pointer for future use
   * with free in the preceding bytes */
  uintptr_t aligned_address = (address + mask + sizeof (void *)) & ~mask;
  void **address_ptr = (void **) (aligned_address - sizeof (void *));
  *address_ptr = (void *) address;

  return (void *) aligned_address;
#endif
}
/**
 * Run one kernel command: build the argument array, call the work-group
 * function once for every group id, then release the temporary storage.
 *
 * The argument array has kernel->num_args + kernel->num_locals slots.  Each
 * slot holds a pointer to the argument value:
 *  - local arguments and the trailing automatic locals get a freshly
 *    allocated pointer cell that points at a newly allocated aligned buffer
 *    of al->size bytes;
 *  - buffer (pointer) arguments point at the mem_ptr stored in the cl_mem for
 *    this device, or at a freshly allocated cell containing NULL when the
 *    user passed NULL;
 *  - image and sampler arguments get a freshly allocated cell that points at
 *    an aligned block filled from a dev_image_t / dev_sampler_t;
 *  - every other argument points directly at al->value.
 *
 * NOTE(review): none of the allocations below are checked for failure and
 * the function has no way to report an error to its caller.
 *
 * @param data  driver data (struct data *); must not be NULL.
 * @param cmd   command node carrying the kernel, its arguments, the
 *              pocl_context and the work-group function to call.
 */
void
pocl_basic_run (void *data, _cl_command_node *cmd)
{
  struct data *d;
  struct pocl_argument *al;
  size_t x, y, z;
  unsigned i;
  cl_kernel kernel = cmd->command.run.kernel;
  struct pocl_context *pc = &cmd->command.run.pc;

  assert (data != NULL);
  d = (struct data *) data;

  d->current_kernel = kernel;

  void **arguments = (void **) malloc (
      sizeof (void *) * (kernel->num_args + kernel->num_locals));

  /* Process the kernel arguments. Convert the opaque buffer
     pointers to real device pointers, allocate dynamic local
     memory buffers, etc. */
  for (i = 0; i < kernel->num_args; ++i)
    {
      al = &(cmd->command.run.arguments[i]);
      if (kernel->arg_info[i].is_local)
        {
          arguments[i] = malloc (sizeof (void *));
          *(void **) (arguments[i])
              = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, al->size);
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
        {
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            {
              arguments[i] = malloc (sizeof (void *));
              *(void **) arguments[i] = NULL;
            }
          else
            {
              /* Not owned by this function: the slot aliases a field inside
                 the cl_mem and must never be freed here. */
              arguments[i]
                  = &((*(cl_mem *) (al->value))
                          ->device_ptrs[cmd->device->dev_id]
                          .mem_ptr);
            }
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
        {
          dev_image_t di;
          fill_dev_image_t (&di, al, cmd->device);

          void *devptr = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT,
                                              sizeof (dev_image_t));
          arguments[i] = malloc (sizeof (void *));
          *(void **) (arguments[i]) = devptr;
          /* presumably copies the descriptor into devptr -- confirm against
             pocl_basic_write */
          pocl_basic_write (data, &di, devptr, 0, sizeof (dev_image_t));
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
        {
          dev_sampler_t ds;
          fill_dev_sampler_t (&ds, al);

          void *devptr = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT,
                                              sizeof (dev_sampler_t));
          arguments[i] = malloc (sizeof (void *));
          *(void **) (arguments[i]) = devptr;
          pocl_basic_write (data, &ds, devptr, 0, sizeof (dev_sampler_t));
        }
      else
        {
          arguments[i] = al->value;
        }
    }

  /* Automatic locals occupy the slots after the regular arguments. */
  for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i)
    {
      al = &(cmd->command.run.arguments[i]);
      arguments[i] = malloc (sizeof (void *));
      *(void **) (arguments[i])
          = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, al->size);
    }

  /* Call the work-group function once per group id, x varying fastest. */
  for (z = 0; z < pc->num_groups[2]; ++z)
    {
      for (y = 0; y < pc->num_groups[1]; ++y)
        {
          for (x = 0; x < pc->num_groups[0]; ++x)
            {
              pc->group_id[0] = x;
              pc->group_id[1] = y;
              pc->group_id[2] = z;

              cmd->command.run.wg (arguments, pc);
            }
        }
    }

  /* Release exactly what the setup loop allocated. */
  for (i = 0; i < kernel->num_args; ++i)
    {
      if (kernel->arg_info[i].is_local)
        {
          POCL_MEM_FREE (*(void **) (arguments[i]));
          POCL_MEM_FREE (arguments[i]);
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE
               || kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
        {
          POCL_MEM_FREE (*(void **) (arguments[i]));
          POCL_MEM_FREE (arguments[i]);
        }
      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER
               && cmd->command.run.arguments[i].value == NULL)
        {
          /* Only the NULL-argument case allocated a cell.  Testing the
             original argument value (the same condition used during setup)
             rather than the pointed-to value avoids freeing the address of a
             cl_mem's mem_ptr field when that field happens to hold NULL. */
          POCL_MEM_FREE (arguments[i]);
        }
    }

  for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i)
    {
      POCL_MEM_FREE (*(void **) (arguments[i]));
      POCL_MEM_FREE (arguments[i]);
    }

  free (arguments);
}