//--- UnpinnedCopyEngine::~UnpinnedCopyEngine() { for (int i=0; i<_numBuffers; i++) { if (_pinnedStagingBuffer[i]) { hsa_amd_memory_pool_free(_pinnedStagingBuffer[i]); _pinnedStagingBuffer[i] = NULL; } hsa_signal_destroy(_completionSignal[i]); hsa_signal_destroy(_completionSignal2[i]); } }
void TemplateDispatch<T>::waitComplete(hsa_signal_t& signal) { /* Wait on the dispatch signal until the kernel is finished. */ hsa_signal_wait_acquire(signal, HSA_LT, 1, (uint64_t) -1, HSA_WAIT_EXPECTANCY_UNKNOWN); hsa_signal_destroy(signal); }
int main(int argc, char **argv) { struct timespec timer_1, timer_2; hsa_status_t err; err = hsa_init(); check(Initializing the hsa runtime, err); /* * Iterate over the agents and pick the gpu agent using * the get_gpu_agent callback. */ hsa_agent_t agent; err = hsa_iterate_agents(get_gpu_agent, &agent); if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; } check(Getting a gpu agent, err); /* * Query the name of the agent. */ char name[64] = { 0 }; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); check(Querying the agent name, err); printf("The agent name is %s.\n", name); /* * Query the maximum size of the queue. */ uint32_t queue_size = 0; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); check(Querying the agent maximum queue size, err); printf("The maximum queue size is %u.\n", (unsigned int) queue_size); /* * Create a queue using the maximum size. */ hsa_queue_t* queue; err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); check(Creating the queue, err); /* * Load the BRIG binary. */ hsa_ext_module_t module; load_module_from_file("vector_copy.brig",&module); /* * Create hsa program. */ hsa_ext_program_t program; memset(&program,0,sizeof(hsa_ext_program_t)); err = hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program); check(Create the program, err); /* * Add the BRIG module to hsa program. */ err = hsa_ext_program_add_module(program, module); check(Adding the brig module to the program, err); /* * Determine the agents ISA. */ hsa_isa_t isa; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa); check(Query the agents isa, err); /* * Finalize the program and extract the code object. */ hsa_ext_control_directives_t control_directives; memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t)); hsa_code_object_t code_object; err = hsa_ext_program_finalize(program, isa, 0, control_directives, "-O0", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object); check(Finalizing the program, err); /* * Destroy the program, it is no longer needed. */ err=hsa_ext_program_destroy(program); check(Destroying the program, err); /* * Create the empty executable. */ hsa_executable_t executable; err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable); check(Create the executable, err); /* * Load the code<F3> object. */ err = hsa_executable_load_code_object(executable, agent, code_object, ""); check(Loading the code object, err); /* * Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); check(Freeze the executable, err); /* * Extract the symbol from the executable. */ hsa_executable_symbol_t symbol; err = hsa_executable_get_symbol(executable, "", "&__OpenCL_vector_copy_kernel", agent, 0, &symbol); check(Extract the symbol from the executable, err); /* * Extract dispatch information from the symbol */ uint64_t kernel_object; uint32_t kernarg_segment_size; uint32_t group_segment_size; uint32_t private_segment_size; err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); check(Extracting the symbol from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); check(Extracting the kernarg segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); check(Extracting the group segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); check(Extracting the private segment from the executable, err); /* * Create a signal to wait for the dispatch to finish. */ hsa_signal_t signal; err=hsa_signal_create(1, 0, NULL, &signal); check(Creating a HSA signal, err); /* * Allocate and initialize the kernel arguments and data. */ int* in=(int*)malloc(SIZE); int i; for(i=0;i<ELEMENT;i++) in[i]=(rand()%50000+1); err=hsa_memory_register(in, SIZE); check(Registering argument memory for input parameter, err); int* out=(int*)malloc(SIZE); memset(out, 0, SIZE); err=hsa_memory_register(out, SIZE); check(Registering argument memory for output parameter, err); int element = ELEMENT; int iter = ITER; struct __attribute__ ((aligned(16))) args_t { uint64_t global_offset_0; uint64_t global_offset_1; uint64_t global_offset_2; uint64_t printf_buffer; uint64_t vqueue_pointer; uint64_t aqlwrap_pointer; void* in; void* out; int iter; int element; } args; memset(&args, 0, sizeof(args)); args.in=in; args.out=out; args.element=element; args.iter=iter; /* * Find a memory region that supports kernel arguments. */ hsa_region_t kernarg_region; kernarg_region.handle=(uint64_t)-1; hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region); err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; check(Finding a kernarg memory region, err); void* kernarg_address = NULL; /* * Allocate the kernel argument buffer from the correct region. */ err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address); check(Allocating kernel argument memory buffer, err); memcpy(kernarg_address, &args, sizeof(args)); /* * Obtain the current queue write index. */ uint64_t index = hsa_queue_load_write_index_relaxed(queue); /* * Write the aql packet at the calculated queue index address. */ const uint32_t queueMask = queue->size - 1; hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]); dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatch_packet->workgroup_size_x = (uint16_t)LOCAL_SIZE; dispatch_packet->workgroup_size_y = (uint16_t)1; dispatch_packet->workgroup_size_z = (uint16_t)1; dispatch_packet->grid_size_x = (uint32_t) (GLOBAL_SIZE); dispatch_packet->grid_size_y = 1; dispatch_packet->grid_size_z = 1; dispatch_packet->completion_signal = signal; dispatch_packet->kernel_object = kernel_object; dispatch_packet->kernarg_address = (void*) kernarg_address; dispatch_packet->private_segment_size = private_segment_size; dispatch_packet->group_segment_size = group_segment_size; __atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE); /* * Increment the write index and ring the doorbell to dispatch the kernel. */ tic(&timer_1); hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); check(Dispatching the kernel, err); /* * Wait on the dispatch completion signal until the kernel is finished. */ hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); toc("Execution Period", &timer_1, &timer_2); /* * Validate the data in the output buffer. */ int temp = 0; for(i=0;i<element;i++) { if(temp<in[i]) temp = in[i]; } if(temp==out[GLOBAL_SIZE]) printf("PASS \n"); else printf("FAIL out=%d in=%d \n",out[GLOBAL_SIZE],temp); /* * Cleanup all allocated resources. */ err=hsa_signal_destroy(signal); check(Destroying the signal, err); err=hsa_executable_destroy(executable); check(Destroying the executable, err); err=hsa_code_object_destroy(code_object); check(Destroying the code object, err); err=hsa_queue_destroy(queue); check(Destroying the queue, err); err=hsa_shut_down(); check(Shutting down the runtime, err); free(in); free(out); //printf("kernarg_segment_size:%d group_segment_size:%d private_segment_size:%d",kernarg_segment_size,group_segment_size,private_segment_size); return 0; }
int main(int argc, char **argv) { hsa_status_t err; err = hsa_init(); check(Initializing the hsa runtime, err); /* * Determine if the finalizer 1.0 extension is supported. */ bool support; err = hsa_system_extension_supported(HSA_EXTENSION_FINALIZER, 1, 0, &support); check(Checking finalizer 1.0 extension support, err); /* * Generate the finalizer function table. */ hsa_ext_finalizer_1_00_pfn_t table_1_00; err = hsa_system_get_extension_table(HSA_EXTENSION_FINALIZER, 1, 0, &table_1_00); check(Generating function table for finalizer, err); /* * Iterate over the agents and pick the gpu agent using * the get_gpu_agent callback. */ hsa_agent_t agent; err = hsa_iterate_agents(get_gpu_agent, &agent); if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; } check(Getting a gpu agent, err); /* * Query the name of the agent. */ char name[64] = { 0 }; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); check(Querying the agent name, err); printf("The agent name is %s.\n", name); /* * Query the maximum size of the queue. */ uint32_t queue_size = 0; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); check(Querying the agent maximum queue size, err); printf("The maximum queue size is %u.\n", (unsigned int) queue_size); /* * Create a queue using the maximum size. */ hsa_queue_t* queue; err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); check(Creating the queue, err); /* * Load the BRIG binary. */ hsa_ext_module_t module; load_module_from_file("vector_copy.brig",&module); /* * Create hsa program. */ hsa_ext_program_t program; memset(&program,0,sizeof(hsa_ext_program_t)); err = table_1_00.hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program); check(Create the program, err); /* * Add the BRIG module to hsa program. */ err = table_1_00.hsa_ext_program_add_module(program, module); check(Adding the brig module to the program, err); /* * Determine the agents ISA. */ hsa_isa_t isa; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa); check(Query the agents isa, err); /* * Finalize the program and extract the code object. */ hsa_ext_control_directives_t control_directives; memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t)); hsa_code_object_t code_object; err = table_1_00.hsa_ext_program_finalize(program, isa, 0, control_directives, "", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object); check(Finalizing the program, err); /* * Destroy the program, it is no longer needed. */ err=table_1_00.hsa_ext_program_destroy(program); check(Destroying the program, err); /* * Create the empty executable. */ hsa_executable_t executable; err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable); check(Create the executable, err); /* * Load the code object. */ err = hsa_executable_load_code_object(executable, agent, code_object, ""); check(Loading the code object, err); /* * Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); check(Freeze the executable, err); /* * Extract the symbol from the executable. */ hsa_executable_symbol_t symbol; err = hsa_executable_get_symbol(executable, NULL, "&__vector_copy_kernel", agent, 0, &symbol); check(Extract the symbol from the executable, err); /* * Extract dispatch information from the symbol */ uint64_t kernel_object; uint32_t kernarg_segment_size; uint32_t group_segment_size; uint32_t private_segment_size; err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); check(Extracting the symbol from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); check(Extracting the kernarg segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); check(Extracting the group segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); check(Extracting the private segment from the executable, err); /* * Create a signal to wait for the dispatch to finish. */ hsa_signal_t signal; err=hsa_signal_create(1, 0, NULL, &signal); check(Creating a HSA signal, err); /* * Allocate and initialize the kernel arguments and data. */ char* in=(char*)malloc(1024*1024*4); memset(in, 1, 1024*1024*4); err=hsa_memory_register(in, 1024*1024*4); check(Registering argument memory for input parameter, err); char* out=(char*)malloc(1024*1024*4); memset(out, 0, 1024*1024*4); err=hsa_memory_register(out, 1024*1024*4); check(Registering argument memory for output parameter, err); struct __attribute__ ((aligned(16))) args_t { void* in; void* out; } args; args.in=in; args.out=out; /* * Find a memory region that supports kernel arguments. */ hsa_region_t kernarg_region; kernarg_region.handle=(uint64_t)-1; hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region); err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; check(Finding a kernarg memory region, err); void* kernarg_address = NULL; /* * Allocate the kernel argument buffer from the correct region. */ err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address); check(Allocating kernel argument memory buffer, err); memcpy(kernarg_address, &args, sizeof(args)); /* * Obtain the current queue write index. */ uint64_t index = hsa_queue_load_write_index_relaxed(queue); /* * Write the aql packet at the calculated queue index address. */ const uint32_t queueMask = queue->size - 1; hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]); dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatch_packet->workgroup_size_x = (uint16_t)256; dispatch_packet->workgroup_size_y = (uint16_t)1; dispatch_packet->workgroup_size_z = (uint16_t)1; dispatch_packet->grid_size_x = (uint32_t) (1024*1024); dispatch_packet->grid_size_y = 1; dispatch_packet->grid_size_z = 1; dispatch_packet->completion_signal = signal; dispatch_packet->kernel_object = kernel_object; dispatch_packet->kernarg_address = (void*) kernarg_address; dispatch_packet->private_segment_size = private_segment_size; dispatch_packet->group_segment_size = group_segment_size; uint16_t header = 0; header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; __atomic_store_n((uint16_t*)(&dispatch_packet->header), header, __ATOMIC_RELEASE); /* * Increment the write index and ring the doorbell to dispatch the kernel. */ hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); check(Dispatching the kernel, err); /* * Wait on the dispatch completion signal until the kernel is finished. */ hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); /* * Validate the data in the output buffer. */ int valid=1; int fail_index=0; for(int i=0; i<1024*1024; i++) { if(out[i]!=in[i]) { fail_index=i; valid=0; break; } } if(valid) { printf("Passed validation.\n"); } else { printf("VALIDATION FAILED!\nBad index: %d\n", fail_index); } /* * Cleanup all allocated resources. */ err = hsa_memory_free(kernarg_address); check(Freeing kernel argument memory buffer, err); err=hsa_signal_destroy(signal); check(Destroying the signal, err); err=hsa_executable_destroy(executable); check(Destroying the executable, err); err=hsa_code_object_destroy(code_object); check(Destroying the code object, err); err=hsa_queue_destroy(queue); check(Destroying the queue, err); err=hsa_shut_down(); check(Shutting down the runtime, err); free(in); free(out); return 0; }
void MultiKernelDispatch<T>::queueKernel(HSAContext::Kernel* p_kernel, T kern_args, Launch_params_t lparm, bool wait) { if (wait) { hsa_status_t err; m_expected_value =1; err = hsa_signal_create(m_expected_value, 0, NULL, &m_signal); } HSAContextKaveriImpl* p_ctx = (HSAContextKaveriImpl*) m_pcontext; /* Obtain the current queue write index. increases with each call to kernel */ uint64_t index = hsa_queue_load_write_index_relaxed(p_ctx->commandQueue); /* printf("DEBUG:Call #%d to kernel \"%s\" \n",(int) index+1,"run"); */ HSAContextKaveriImpl::KernelImpl *p_impl = (HSAContextKaveriImpl::KernelImpl *) p_kernel; /* Setup this call to this kernel dispatch packet from scratch. */ memset(&m_aql, 0, sizeof(m_aql)); /* Set the dimensions passed from the application */ m_aql.dimensions = (uint16_t) lparm.ndim; m_aql.grid_size_x = lparm.gdims[0]; m_aql.workgroup_size_x = lparm.ldims[0]; m_aql.completion_signal = m_signal; if (lparm.ndim > 1) { m_aql.grid_size_y = lparm.gdims[1]; m_aql.workgroup_size_y = lparm.ldims[1]; } else { m_aql.grid_size_y = 1; m_aql.workgroup_size_y = 1; } if (lparm.ndim > 2) { m_aql.grid_size_z = lparm.gdims[2]; m_aql.workgroup_size_z = lparm.ldims[2]; } else { m_aql.grid_size_z = 1; m_aql.workgroup_size_z = 1; } /* In the future, we may use environment variables for some of these */ m_aql.header.type = HSA_PACKET_TYPE_DISPATCH; m_aql.header.acquire_fence_scope = 2; m_aql.header.release_fence_scope = 2; m_aql.header.barrier = 1; m_aql.group_segment_size = p_impl->hsaCodeDescriptor->workgroup_group_segment_byte_size; m_aql.private_segment_size = p_impl->hsaCodeDescriptor->workitem_private_segment_byte_size; void * run_kernel_arg_buffer = p_impl->m_run_arg_buffer; /* copy args from the custom run_args structure */ /* FIXME We should align kernel_arg_buffer because run_args is aligned */ memcpy(run_kernel_arg_buffer, &kern_args, sizeof(kern_args)); /* Bind kernelcode to the packet. */ m_aql.kernel_object_address = p_impl->hsaCodeDescriptor->code.handle; /* Bind kernel argument buffer to the aql packet. */ m_aql.kernarg_address = (uint64_t) run_kernel_arg_buffer; const uint32_t queueMask = p_ctx->commandQueue->size - 1; const uint32_t pos = (index) & queueMask; ((hsa_dispatch_packet_t*) (p_ctx->commandQueue->base_address))[pos] = m_aql; if (wait) hsa_signal_store_relaxed(p_ctx->commandQueue->doorbell_signal, index); /* Increment the write index and ring the doorbell to dispatch the kernel. */ hsa_queue_store_write_index_relaxed(p_ctx->commandQueue, index + 1); if (wait) { hsa_signal_wait_acquire(m_signal, HSA_LT, m_expected_value, (uint64_t)-1, HSA_WAIT_EXPECTANCY_SHORT); hsa_signal_destroy(m_signal); m_signal = 0; } m_expected_value--; }