void* LoaderContext::AgentAlloc(hsa_agent_t agent, size_t size, size_t align, bool zero) { assert(agent.handle); assert(size); assert(align); hsa_status_t hsa_status = HSA_STATUS_SUCCESS; void *result = NULL; Agent2RegionMap::iterator used_region = agent2region_.end(); { std::lock_guard<std::mutex> lock(agent2region_mutex_); used_region = agent2region_.find(agent); if (used_region == agent2region_.end()) { hsa_region_t agent_region = {0}; hsa_status = hsa_agent_iterate_regions(agent, FindGlobalRegion, &agent_region); if (HSA_STATUS_SUCCESS != hsa_status) { return NULL; } used_region = agent2region_.insert(used_region, std::make_pair(agent, agent_region)); } } assert(used_region != agent2region_.end()); assert(used_region->first.handle == agent.handle); assert(used_region->second.handle); hsa_status = hsa_memory_allocate(used_region->second, size, &result); if (HSA_STATUS_SUCCESS != hsa_status) { return NULL; } assert(result); // @todo(runtime): need more efficient way of allocating zero-initialized // memory. if (zero) { void *zero_initialized = calloc(size, 1); if (!zero_initialized) { hsa_memory_free(result); return NULL; } hsa_status = hsa_memory_copy(result, zero_initialized, size); if (HSA_STATUS_SUCCESS != hsa_status) { hsa_memory_free(result); free(zero_initialized); return NULL; } free(zero_initialized); } return result; }
TemplateDispatch<T>::TemplateDispatch(HSAContext::Kernel* p_kernel) : m_p_kernel(p_kernel) { hsa_region_t region; HSAContextKaveriImpl::KernelImpl *p_impl = (HSAContextKaveriImpl::KernelImpl *) m_p_kernel; run_kernel_arg_buffer_size = p_impl->hsaCodeDescriptor->kernarg_segment_byte_size; hsa_agent_iterate_regions(p_impl->context->device, get_kernarg, ®ion); hsa_memory_allocate(region, run_kernel_arg_buffer_size, &run_kernel_arg_buffer); }
void MultiKernelDispatch<T>::addKernel(HSAContext::Kernel* p_kernel) { void *run_kernel_arg_buffer; hsa_region_t region; HSAContextKaveriImpl::KernelImpl *p_impl = (HSAContextKaveriImpl::KernelImpl *) p_kernel; size_t run_kernel_arg_buffer_size = p_impl->hsaCodeDescriptor->kernarg_segment_byte_size; hsa_agent_iterate_regions(p_impl->context->device, get_kernarg, ®ion); hsa_memory_allocate(region, run_kernel_arg_buffer_size, &run_kernel_arg_buffer); p_impl->m_run_arg_buffer= run_kernel_arg_buffer; }
int main(int argc, char **argv) { struct timespec timer_1, timer_2; hsa_status_t err; err = hsa_init(); check(Initializing the hsa runtime, err); /* * Iterate over the agents and pick the gpu agent using * the get_gpu_agent callback. */ hsa_agent_t agent; err = hsa_iterate_agents(get_gpu_agent, &agent); if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; } check(Getting a gpu agent, err); /* * Query the name of the agent. */ char name[64] = { 0 }; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); check(Querying the agent name, err); printf("The agent name is %s.\n", name); /* * Query the maximum size of the queue. */ uint32_t queue_size = 0; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); check(Querying the agent maximum queue size, err); printf("The maximum queue size is %u.\n", (unsigned int) queue_size); /* * Create a queue using the maximum size. */ hsa_queue_t* queue; err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); check(Creating the queue, err); /* * Load the BRIG binary. */ hsa_ext_module_t module; load_module_from_file("vector_copy.brig",&module); /* * Create hsa program. */ hsa_ext_program_t program; memset(&program,0,sizeof(hsa_ext_program_t)); err = hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program); check(Create the program, err); /* * Add the BRIG module to hsa program. */ err = hsa_ext_program_add_module(program, module); check(Adding the brig module to the program, err); /* * Determine the agents ISA. */ hsa_isa_t isa; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa); check(Query the agents isa, err); /* * Finalize the program and extract the code object. */ hsa_ext_control_directives_t control_directives; memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t)); hsa_code_object_t code_object; err = hsa_ext_program_finalize(program, isa, 0, control_directives, "-O0", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object); check(Finalizing the program, err); /* * Destroy the program, it is no longer needed. */ err=hsa_ext_program_destroy(program); check(Destroying the program, err); /* * Create the empty executable. */ hsa_executable_t executable; err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable); check(Create the executable, err); /* * Load the code<F3> object. */ err = hsa_executable_load_code_object(executable, agent, code_object, ""); check(Loading the code object, err); /* * Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); check(Freeze the executable, err); /* * Extract the symbol from the executable. */ hsa_executable_symbol_t symbol; err = hsa_executable_get_symbol(executable, "", "&__OpenCL_vector_copy_kernel", agent, 0, &symbol); check(Extract the symbol from the executable, err); /* * Extract dispatch information from the symbol */ uint64_t kernel_object; uint32_t kernarg_segment_size; uint32_t group_segment_size; uint32_t private_segment_size; err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); check(Extracting the symbol from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); check(Extracting the kernarg segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); check(Extracting the group segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); check(Extracting the private segment from the executable, err); /* * Create a signal to wait for the dispatch to finish. */ hsa_signal_t signal; err=hsa_signal_create(1, 0, NULL, &signal); check(Creating a HSA signal, err); /* * Allocate and initialize the kernel arguments and data. */ int* in=(int*)malloc(SIZE); int i; for(i=0;i<ELEMENT;i++) in[i]=(rand()%50000+1); err=hsa_memory_register(in, SIZE); check(Registering argument memory for input parameter, err); int* out=(int*)malloc(SIZE); memset(out, 0, SIZE); err=hsa_memory_register(out, SIZE); check(Registering argument memory for output parameter, err); int element = ELEMENT; int iter = ITER; struct __attribute__ ((aligned(16))) args_t { uint64_t global_offset_0; uint64_t global_offset_1; uint64_t global_offset_2; uint64_t printf_buffer; uint64_t vqueue_pointer; uint64_t aqlwrap_pointer; void* in; void* out; int iter; int element; } args; memset(&args, 0, sizeof(args)); args.in=in; args.out=out; args.element=element; args.iter=iter; /* * Find a memory region that supports kernel arguments. */ hsa_region_t kernarg_region; kernarg_region.handle=(uint64_t)-1; hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region); err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; check(Finding a kernarg memory region, err); void* kernarg_address = NULL; /* * Allocate the kernel argument buffer from the correct region. */ err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address); check(Allocating kernel argument memory buffer, err); memcpy(kernarg_address, &args, sizeof(args)); /* * Obtain the current queue write index. */ uint64_t index = hsa_queue_load_write_index_relaxed(queue); /* * Write the aql packet at the calculated queue index address. */ const uint32_t queueMask = queue->size - 1; hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]); dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatch_packet->workgroup_size_x = (uint16_t)LOCAL_SIZE; dispatch_packet->workgroup_size_y = (uint16_t)1; dispatch_packet->workgroup_size_z = (uint16_t)1; dispatch_packet->grid_size_x = (uint32_t) (GLOBAL_SIZE); dispatch_packet->grid_size_y = 1; dispatch_packet->grid_size_z = 1; dispatch_packet->completion_signal = signal; dispatch_packet->kernel_object = kernel_object; dispatch_packet->kernarg_address = (void*) kernarg_address; dispatch_packet->private_segment_size = private_segment_size; dispatch_packet->group_segment_size = group_segment_size; __atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE); /* * Increment the write index and ring the doorbell to dispatch the kernel. */ tic(&timer_1); hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); check(Dispatching the kernel, err); /* * Wait on the dispatch completion signal until the kernel is finished. */ hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); toc("Execution Period", &timer_1, &timer_2); /* * Validate the data in the output buffer. */ int temp = 0; for(i=0;i<element;i++) { if(temp<in[i]) temp = in[i]; } if(temp==out[GLOBAL_SIZE]) printf("PASS \n"); else printf("FAIL out=%d in=%d \n",out[GLOBAL_SIZE],temp); /* * Cleanup all allocated resources. */ err=hsa_signal_destroy(signal); check(Destroying the signal, err); err=hsa_executable_destroy(executable); check(Destroying the executable, err); err=hsa_code_object_destroy(code_object); check(Destroying the code object, err); err=hsa_queue_destroy(queue); check(Destroying the queue, err); err=hsa_shut_down(); check(Shutting down the runtime, err); free(in); free(out); //printf("kernarg_segment_size:%d group_segment_size:%d private_segment_size:%d",kernarg_segment_size,group_segment_size,private_segment_size); return 0; }
int main(int argc, char **argv) { hsa_status_t err; err = hsa_init(); check(Initializing the hsa runtime, err); /* * Determine if the finalizer 1.0 extension is supported. */ bool support; err = hsa_system_extension_supported(HSA_EXTENSION_FINALIZER, 1, 0, &support); check(Checking finalizer 1.0 extension support, err); /* * Generate the finalizer function table. */ hsa_ext_finalizer_1_00_pfn_t table_1_00; err = hsa_system_get_extension_table(HSA_EXTENSION_FINALIZER, 1, 0, &table_1_00); check(Generating function table for finalizer, err); /* * Iterate over the agents and pick the gpu agent using * the get_gpu_agent callback. */ hsa_agent_t agent; err = hsa_iterate_agents(get_gpu_agent, &agent); if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; } check(Getting a gpu agent, err); /* * Query the name of the agent. */ char name[64] = { 0 }; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); check(Querying the agent name, err); printf("The agent name is %s.\n", name); /* * Query the maximum size of the queue. */ uint32_t queue_size = 0; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); check(Querying the agent maximum queue size, err); printf("The maximum queue size is %u.\n", (unsigned int) queue_size); /* * Create a queue using the maximum size. */ hsa_queue_t* queue; err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); check(Creating the queue, err); /* * Load the BRIG binary. */ hsa_ext_module_t module; load_module_from_file("vector_copy.brig",&module); /* * Create hsa program. */ hsa_ext_program_t program; memset(&program,0,sizeof(hsa_ext_program_t)); err = table_1_00.hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program); check(Create the program, err); /* * Add the BRIG module to hsa program. */ err = table_1_00.hsa_ext_program_add_module(program, module); check(Adding the brig module to the program, err); /* * Determine the agents ISA. */ hsa_isa_t isa; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa); check(Query the agents isa, err); /* * Finalize the program and extract the code object. */ hsa_ext_control_directives_t control_directives; memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t)); hsa_code_object_t code_object; err = table_1_00.hsa_ext_program_finalize(program, isa, 0, control_directives, "", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object); check(Finalizing the program, err); /* * Destroy the program, it is no longer needed. */ err=table_1_00.hsa_ext_program_destroy(program); check(Destroying the program, err); /* * Create the empty executable. */ hsa_executable_t executable; err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable); check(Create the executable, err); /* * Load the code object. */ err = hsa_executable_load_code_object(executable, agent, code_object, ""); check(Loading the code object, err); /* * Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); check(Freeze the executable, err); /* * Extract the symbol from the executable. */ hsa_executable_symbol_t symbol; err = hsa_executable_get_symbol(executable, NULL, "&__vector_copy_kernel", agent, 0, &symbol); check(Extract the symbol from the executable, err); /* * Extract dispatch information from the symbol */ uint64_t kernel_object; uint32_t kernarg_segment_size; uint32_t group_segment_size; uint32_t private_segment_size; err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); check(Extracting the symbol from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); check(Extracting the kernarg segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); check(Extracting the group segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); check(Extracting the private segment from the executable, err); /* * Create a signal to wait for the dispatch to finish. */ hsa_signal_t signal; err=hsa_signal_create(1, 0, NULL, &signal); check(Creating a HSA signal, err); /* * Allocate and initialize the kernel arguments and data. */ char* in=(char*)malloc(1024*1024*4); memset(in, 1, 1024*1024*4); err=hsa_memory_register(in, 1024*1024*4); check(Registering argument memory for input parameter, err); char* out=(char*)malloc(1024*1024*4); memset(out, 0, 1024*1024*4); err=hsa_memory_register(out, 1024*1024*4); check(Registering argument memory for output parameter, err); struct __attribute__ ((aligned(16))) args_t { void* in; void* out; } args; args.in=in; args.out=out; /* * Find a memory region that supports kernel arguments. */ hsa_region_t kernarg_region; kernarg_region.handle=(uint64_t)-1; hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region); err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; check(Finding a kernarg memory region, err); void* kernarg_address = NULL; /* * Allocate the kernel argument buffer from the correct region. */ err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address); check(Allocating kernel argument memory buffer, err); memcpy(kernarg_address, &args, sizeof(args)); /* * Obtain the current queue write index. */ uint64_t index = hsa_queue_load_write_index_relaxed(queue); /* * Write the aql packet at the calculated queue index address. */ const uint32_t queueMask = queue->size - 1; hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]); dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatch_packet->workgroup_size_x = (uint16_t)256; dispatch_packet->workgroup_size_y = (uint16_t)1; dispatch_packet->workgroup_size_z = (uint16_t)1; dispatch_packet->grid_size_x = (uint32_t) (1024*1024); dispatch_packet->grid_size_y = 1; dispatch_packet->grid_size_z = 1; dispatch_packet->completion_signal = signal; dispatch_packet->kernel_object = kernel_object; dispatch_packet->kernarg_address = (void*) kernarg_address; dispatch_packet->private_segment_size = private_segment_size; dispatch_packet->group_segment_size = group_segment_size; uint16_t header = 0; header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; __atomic_store_n((uint16_t*)(&dispatch_packet->header), header, __ATOMIC_RELEASE); /* * Increment the write index and ring the doorbell to dispatch the kernel. */ hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); check(Dispatching the kernel, err); /* * Wait on the dispatch completion signal until the kernel is finished. */ hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); /* * Validate the data in the output buffer. */ int valid=1; int fail_index=0; for(int i=0; i<1024*1024; i++) { if(out[i]!=in[i]) { fail_index=i; valid=0; break; } } if(valid) { printf("Passed validation.\n"); } else { printf("VALIDATION FAILED!\nBad index: %d\n", fail_index); } /* * Cleanup all allocated resources. */ err = hsa_memory_free(kernarg_address); check(Freeing kernel argument memory buffer, err); err=hsa_signal_destroy(signal); check(Destroying the signal, err); err=hsa_executable_destroy(executable); check(Destroying the executable, err); err=hsa_code_object_destroy(code_object); check(Destroying the code object, err); err=hsa_queue_destroy(queue); check(Destroying the queue, err); err=hsa_shut_down(); check(Shutting down the runtime, err); free(in); free(out); return 0; }
void initial_kernel() { hsa_status_t err; err = hsa_init(); check(Initializing the hsa runtime, err); /* * Iterate over the agents and pick the gpu agent using * the get_gpu_agent callback. */ hsa_agent_t agent; err = hsa_iterate_agents(get_gpu_agent, &agent); if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; } check(Getting a gpu agent, err); /* * Query the name of the agent. */ char name[64] = { 0 }; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); check(Querying the agent name, err); fprintf(stderr, "The agent name is %s.\n", name); /* * Query the maximum size of the queue. */ uint32_t queue_size = 0; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); check(Querying the agent maximum queue size, err); fprintf(stderr, "The maximum queue size is %u.\n", (unsigned int) queue_size); /* * Create a queue using the maximum size. */ err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); check(Creating the queue, err); /* * Load the BRIG binary. */ hsa_ext_module_t module; load_module_from_file("shader_hsa.brig",&module); /* * Create hsa program. */ hsa_ext_program_t program; memset(&program,0,sizeof(hsa_ext_program_t)); err = hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program); check(Create the program, err); /* * Add the BRIG module to hsa program. */ err = hsa_ext_program_add_module(program, module); check(Adding the brig module to the program, err); /* * Determine the agents ISA. */ hsa_isa_t isa; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa); check(Query the agents isa, err); /* * Finalize the program and extract the code object. */ hsa_ext_control_directives_t control_directives; memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t)); hsa_code_object_t code_object; err = hsa_ext_program_finalize(program, isa, 0, control_directives, "", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object); check(Finalizing the program, err); /* * Destroy the program, it is no longer needed. */ err=hsa_ext_program_destroy(program); check(Destroying the program, err); /* * Create the empty executable. */ hsa_executable_t executable; err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable); check(Create the executable, err); /* * Load the code object. */ err = hsa_executable_load_code_object(executable, agent, code_object, ""); check(Loading the code object, err); /* * Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); check(Freeze the executable, err); /* * Extract the symbol from the executable. */ hsa_executable_symbol_t symbol; err = hsa_executable_get_symbol(executable, "", "&__OpenCL_cal_diskernel_kernel", agent, 0, &symbol); check(Extract the symbol from the executable, err); /* * Extract dispatch information from the symbol */ err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); check(Extracting the symbol from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); check(Extracting the kernarg segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); check(Extracting the group segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); check(Extracting the private segment from the executable, err); /* * Create a signal to wait for the dispatch to finish. */ err=hsa_signal_create(1, 0, NULL, &signal); check(Creating a HSA signal, err); /* * Allocate and initialize the kernel arguments and data. */ err |= hsa_memory_register(data, sizeof(float)*N_DCNT*N_DIM); err |= hsa_memory_register(cent, sizeof(float)*N_K*N_DIM); err |= hsa_memory_register(table, sizeof(int)*N_DCNT); err |= hsa_memory_register(chpt, sizeof(int)*N_DCNT); err |= hsa_memory_register(cent_c, sizeof(int)*N_DCNT); err |= hsa_memory_register(min_dis, sizeof(float)*N_DCNT); check(Registering argument memory for output parameter, err); struct __attribute__ ((aligned(16))) args_t { uint64_t global_offset_0; uint64_t global_offset_1; uint64_t global_offset_2; uint64_t printf_buffer; uint64_t vqueue_pointer; uint64_t aqlwrap_pointer; void * data_ker; void * cent_ker; void * table; unsigned int K; unsigned int DIM; unsigned int DCNT; void * chpt; void * cent_c_ker; void * min_dis; } args; memset(&args, 0, sizeof(args)); args.data_ker = data; args.cent_ker = cent; args.table = table; args.K = N_K; args.DIM = N_DIM; args.DCNT = N_DCNT; args.chpt = chpt; args.cent_c_ker = cent_c_ker; args.min_dis = min_dis; /* * Find a memory region that supports kernel arguments. */ hsa_region_t kernarg_region; kernarg_region.handle=(uint64_t)-1; hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region); err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; check(Finding a kernarg memory region, err); /* * Allocate the kernel argument buffer from the correct region. */ err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address); check(Allocating kernel argument memory buffer, err); memcpy(kernarg_address, &args, sizeof(args)); }
void pocl_hsa_run (void *data, _cl_command_node* cmd) { struct data *d; struct pocl_argument *al; unsigned i; cl_kernel kernel = cmd->command.run.kernel; struct pocl_context *pc = &cmd->command.run.pc; hsa_signal_value_t initial_value = 1; #if 0 /* Not yet supported by the reference library. */ hsa_kernel_dispatch_packet_t kernel_packet; #else hsa_dispatch_packet_t kernel_packet; #endif hsa_signal_t kernel_completion_signal = 0; hsa_region_t region; int error; amdgpu_args_t *args; /* 32b word offset in the kernel arguments buffer we can push the next argument to. */ int args_offset = 0; assert (data != NULL); d = (struct data *) data; d->current_kernel = kernel; memset (&kernel_packet, 0, sizeof (hsa_dispatch_packet_t)); #if 0 /* TODO: not yet supported by the open source runtime implementation. Assume the HSA Full profile so we can simply use host malloc(). */ hsa_agent_iterate_regions(kernel_agent, pocl_hsa_get_kernarg, ®ion); if (hsa_memory_allocate(region, sizeof(amdgpu_args_t), (void**)&args) != HSA_STATUS_SUCCESS) { assert (0 && "hsa_memory_allocate() failed."); } #else args = (amdgpu_args_t*)malloc(sizeof(amdgpu_args_t)); #endif kernel_packet.kernarg_address = (uint64_t)args; /* Process the kernel arguments. Convert the opaque buffer pointers to real device pointers, allocate dynamic local memory buffers, etc. */ for (i = 0; i < kernel->num_args; ++i) { al = &(cmd->command.run.arguments[i]); if (kernel->arg_info[i].is_local) { POCL_ABORT_UNIMPLEMENTED("pocl-hsa: local buffers not implemented."); #if 0 arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_hsa_malloc(data, 0, al->size, NULL); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER) { if (args_offset + 1 >= MAX_KERNEL_ARG_WORDS) POCL_ABORT("pocl-hsa: too many kernel arguments!"); /* Assuming the pointers are 64b (or actually the same as in host) due to HSA. TODO: the 32b profile. */ if (al->value == NULL) { args->kernel_args[args_offset] = 0; args->kernel_args[args_offset + 1] = 0; } else { *(uint64_t*)&args->kernel_args[args_offset] = (uint64_t)(*(cl_mem *) (al->value))-> device_ptrs[cmd->device->dev_id].mem_ptr; } args_offset += 2; #if 0 /* It's legal to pass a NULL pointer to clSetKernelArguments. In that case we must pass the same NULL forward to the kernel. Otherwise, the user must have created a buffer with per device pointers stored in the cl_mem. */ if (al->value == NULL) { arguments[i] = malloc (sizeof (void *)); *(void **)arguments[i] = NULL; } else arguments[i] = &((*(cl_mem *) (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE) { POCL_ABORT_UNIMPLEMENTED("hsa: image arguments not implemented."); #if 0 dev_image_t di; fill_dev_image_t (&di, al, cmd->device); void* devptr = pocl_hsa_malloc (data, 0, sizeof(dev_image_t), NULL); arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = devptr; pocl_hsa_write (data, &di, devptr, 0, sizeof(dev_image_t)); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER) { POCL_ABORT_UNIMPLEMENTED("hsa: sampler arguments not implemented."); #if 0 dev_sampler_t ds; arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, sizeof(dev_sampler_t), NULL); pocl_hsa_write (data, &ds, *(void**)arguments[i], 0, sizeof(dev_sampler_t)); #endif } else { if (args_offset >= MAX_KERNEL_ARG_WORDS) POCL_ABORT("pocl-hsa: too many kernel arguments!"); /* Assuming the scalar fits to a 32b slot. TODO! */ assert (al->size <= 4); args->kernel_args[args_offset] = *(uint32_t*)al->value; ++args_offset; } } for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) { POCL_ABORT_UNIMPLEMENTED("hsa: automatic local buffers not implemented."); #if 0 al = &(cmd->command.run.arguments[i]); arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, al->size, NULL); #endif } args->workgroup_size_x = kernel_packet.workgroup_size_x = cmd->command.run.local_x; args->workgroup_size_y = kernel_packet.workgroup_size_y = cmd->command.run.local_y; args->workgroup_size_z = kernel_packet.workgroup_size_z = cmd->command.run.local_z; kernel_packet.grid_size_x = pc->num_groups[0] * cmd->command.run.local_x; kernel_packet.grid_size_y = pc->num_groups[1] * cmd->command.run.local_y; kernel_packet.grid_size_z = pc->num_groups[2] * cmd->command.run.local_z; /* AMDGPU specific OpenCL argument data. */ args->wgs_x = pc->num_groups[0]; args->wgs_y = pc->num_groups[1]; args->wgs_z = pc->num_groups[2]; kernel_packet.dimensions = 1; if (cmd->command.run.local_y > 1) kernel_packet.dimensions = 2; if (cmd->command.run.local_z > 1) kernel_packet.dimensions = 3; kernel_packet.header.type = HSA_PACKET_TYPE_DISPATCH; kernel_packet.header.acquire_fence_scope = HSA_FENCE_SCOPE_SYSTEM; kernel_packet.header.release_fence_scope = HSA_FENCE_SCOPE_SYSTEM; kernel_packet.header.barrier = 1; kernel_packet.kernel_object_address = *(hsa_amd_code_t*)cmd->command.run.device_data[1]; error = hsa_signal_create(initial_value, 0, NULL, &kernel_completion_signal); assert (error == HSA_STATUS_SUCCESS); kernel_packet.completion_signal = kernel_completion_signal; { /* Launch the kernel by allocating a slot in the queue, writing the command to it, signaling the update with a door bell and finally, block waiting until finish signalled with the completion_signal. */ const uint32_t queue_mask = d->queue->size - 1; uint64_t queue_index = hsa_queue_load_write_index_relaxed(d->queue); hsa_signal_value_t sigval; ((hsa_dispatch_packet_t*)(d->queue->base_address))[queue_index & queue_mask] = kernel_packet; hsa_queue_store_write_index_relaxed(d->queue, queue_index + 1); hsa_signal_store_relaxed(d->queue->doorbell_signal, queue_index); sigval = hsa_signal_wait_acquire(kernel_completion_signal, HSA_EQ, 0, (uint64_t)(-1), HSA_WAIT_EXPECTANCY_UNKNOWN); } for (i = 0; i < kernel->num_args; ++i) { if (kernel->arg_info[i].is_local) { #if 0 pocl_hsa_free (data, 0, *(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE) { #if 0 pocl_hsa_free (data, 0, *(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); #endif } #if 0 else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER || (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER && *(void**)args->kernel_args[i] == NULL)) { POCL_MEM_FREE(arguments[i]); } #endif } for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) { #if 0 pocl_hsa_free(data, 0, *(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); #endif } free(args); }