void* LoaderContext::AgentAlloc(hsa_agent_t agent, size_t size, size_t align, bool zero) { assert(agent.handle); assert(size); assert(align); hsa_status_t hsa_status = HSA_STATUS_SUCCESS; void *result = NULL; Agent2RegionMap::iterator used_region = agent2region_.end(); { std::lock_guard<std::mutex> lock(agent2region_mutex_); used_region = agent2region_.find(agent); if (used_region == agent2region_.end()) { hsa_region_t agent_region = {0}; hsa_status = hsa_agent_iterate_regions(agent, FindGlobalRegion, &agent_region); if (HSA_STATUS_SUCCESS != hsa_status) { return NULL; } used_region = agent2region_.insert(used_region, std::make_pair(agent, agent_region)); } } assert(used_region != agent2region_.end()); assert(used_region->first.handle == agent.handle); assert(used_region->second.handle); hsa_status = hsa_memory_allocate(used_region->second, size, &result); if (HSA_STATUS_SUCCESS != hsa_status) { return NULL; } assert(result); // @todo(runtime): need more efficient way of allocating zero-initialized // memory. if (zero) { void *zero_initialized = calloc(size, 1); if (!zero_initialized) { hsa_memory_free(result); return NULL; } hsa_status = hsa_memory_copy(result, zero_initialized, size); if (HSA_STATUS_SUCCESS != hsa_status) { hsa_memory_free(result); free(zero_initialized); return NULL; } free(zero_initialized); } return result; }
int main(int argc, char **argv) { hsa_status_t err; err = hsa_init(); check(Initializing the hsa runtime, err); /* * Determine if the finalizer 1.0 extension is supported. */ bool support; err = hsa_system_extension_supported(HSA_EXTENSION_FINALIZER, 1, 0, &support); check(Checking finalizer 1.0 extension support, err); /* * Generate the finalizer function table. */ hsa_ext_finalizer_1_00_pfn_t table_1_00; err = hsa_system_get_extension_table(HSA_EXTENSION_FINALIZER, 1, 0, &table_1_00); check(Generating function table for finalizer, err); /* * Iterate over the agents and pick the gpu agent using * the get_gpu_agent callback. */ hsa_agent_t agent; err = hsa_iterate_agents(get_gpu_agent, &agent); if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; } check(Getting a gpu agent, err); /* * Query the name of the agent. */ char name[64] = { 0 }; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); check(Querying the agent name, err); printf("The agent name is %s.\n", name); /* * Query the maximum size of the queue. */ uint32_t queue_size = 0; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); check(Querying the agent maximum queue size, err); printf("The maximum queue size is %u.\n", (unsigned int) queue_size); /* * Create a queue using the maximum size. */ hsa_queue_t* queue; err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); check(Creating the queue, err); /* * Load the BRIG binary. */ hsa_ext_module_t module; load_module_from_file("vector_copy.brig",&module); /* * Create hsa program. */ hsa_ext_program_t program; memset(&program,0,sizeof(hsa_ext_program_t)); err = table_1_00.hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program); check(Create the program, err); /* * Add the BRIG module to hsa program. */ err = table_1_00.hsa_ext_program_add_module(program, module); check(Adding the brig module to the program, err); /* * Determine the agents ISA. */ hsa_isa_t isa; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa); check(Query the agents isa, err); /* * Finalize the program and extract the code object. */ hsa_ext_control_directives_t control_directives; memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t)); hsa_code_object_t code_object; err = table_1_00.hsa_ext_program_finalize(program, isa, 0, control_directives, "", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object); check(Finalizing the program, err); /* * Destroy the program, it is no longer needed. */ err=table_1_00.hsa_ext_program_destroy(program); check(Destroying the program, err); /* * Create the empty executable. */ hsa_executable_t executable; err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable); check(Create the executable, err); /* * Load the code object. */ err = hsa_executable_load_code_object(executable, agent, code_object, ""); check(Loading the code object, err); /* * Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); check(Freeze the executable, err); /* * Extract the symbol from the executable. */ hsa_executable_symbol_t symbol; err = hsa_executable_get_symbol(executable, NULL, "&__vector_copy_kernel", agent, 0, &symbol); check(Extract the symbol from the executable, err); /* * Extract dispatch information from the symbol */ uint64_t kernel_object; uint32_t kernarg_segment_size; uint32_t group_segment_size; uint32_t private_segment_size; err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); check(Extracting the symbol from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); check(Extracting the kernarg segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); check(Extracting the group segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); check(Extracting the private segment from the executable, err); /* * Create a signal to wait for the dispatch to finish. */ hsa_signal_t signal; err=hsa_signal_create(1, 0, NULL, &signal); check(Creating a HSA signal, err); /* * Allocate and initialize the kernel arguments and data. */ char* in=(char*)malloc(1024*1024*4); memset(in, 1, 1024*1024*4); err=hsa_memory_register(in, 1024*1024*4); check(Registering argument memory for input parameter, err); char* out=(char*)malloc(1024*1024*4); memset(out, 0, 1024*1024*4); err=hsa_memory_register(out, 1024*1024*4); check(Registering argument memory for output parameter, err); struct __attribute__ ((aligned(16))) args_t { void* in; void* out; } args; args.in=in; args.out=out; /* * Find a memory region that supports kernel arguments. */ hsa_region_t kernarg_region; kernarg_region.handle=(uint64_t)-1; hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region); err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; check(Finding a kernarg memory region, err); void* kernarg_address = NULL; /* * Allocate the kernel argument buffer from the correct region. */ err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address); check(Allocating kernel argument memory buffer, err); memcpy(kernarg_address, &args, sizeof(args)); /* * Obtain the current queue write index. */ uint64_t index = hsa_queue_load_write_index_relaxed(queue); /* * Write the aql packet at the calculated queue index address. */ const uint32_t queueMask = queue->size - 1; hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]); dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatch_packet->workgroup_size_x = (uint16_t)256; dispatch_packet->workgroup_size_y = (uint16_t)1; dispatch_packet->workgroup_size_z = (uint16_t)1; dispatch_packet->grid_size_x = (uint32_t) (1024*1024); dispatch_packet->grid_size_y = 1; dispatch_packet->grid_size_z = 1; dispatch_packet->completion_signal = signal; dispatch_packet->kernel_object = kernel_object; dispatch_packet->kernarg_address = (void*) kernarg_address; dispatch_packet->private_segment_size = private_segment_size; dispatch_packet->group_segment_size = group_segment_size; uint16_t header = 0; header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; __atomic_store_n((uint16_t*)(&dispatch_packet->header), header, __ATOMIC_RELEASE); /* * Increment the write index and ring the doorbell to dispatch the kernel. */ hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); check(Dispatching the kernel, err); /* * Wait on the dispatch completion signal until the kernel is finished. */ hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); /* * Validate the data in the output buffer. */ int valid=1; int fail_index=0; for(int i=0; i<1024*1024; i++) { if(out[i]!=in[i]) { fail_index=i; valid=0; break; } } if(valid) { printf("Passed validation.\n"); } else { printf("VALIDATION FAILED!\nBad index: %d\n", fail_index); } /* * Cleanup all allocated resources. */ err = hsa_memory_free(kernarg_address); check(Freeing kernel argument memory buffer, err); err=hsa_signal_destroy(signal); check(Destroying the signal, err); err=hsa_executable_destroy(executable); check(Destroying the executable, err); err=hsa_code_object_destroy(code_object); check(Destroying the code object, err); err=hsa_queue_destroy(queue); check(Destroying the queue, err); err=hsa_shut_down(); check(Shutting down the runtime, err); free(in); free(out); return 0; }
TemplateDispatch<T>::~TemplateDispatch() { hsa_memory_free(run_kernel_arg_buffer); }
void LoaderContext::AgentFree(void *ptr, size_t size) { assert(ptr); assert(size); hsa_memory_free(ptr); }