//--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from host CPU. //IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void UnpinnedCopyEngine::CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { { std::lock_guard<std::mutex> l (_copyLock); const char *srcp = static_cast<const char*> (src); char *dstp = static_cast<char*> (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT); } int bufferIndex = 0; for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); // TODO - use uncached memcpy, someday. memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory, hsa_status); } srcp += theseBytes; dstp += theseBytes; if (++bufferIndex >= _numBuffers) { bufferIndex = 0; } // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 waitFor = NULL; } for (int i=0; i<_numBuffers; i++) { hsa_signal_wait_acquire(_completionSignal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } } }
void TemplateDispatch<T>::waitComplete(hsa_signal_t& signal) { /* Wait on the dispatch signal until the kernel is finished. */ hsa_signal_wait_acquire(signal, HSA_LT, 1, (uint64_t) -1, HSA_WAIT_EXPECTANCY_UNKNOWN); hsa_signal_destroy(signal); }
//--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). //IN: src - src pointer for copy. Must be accessible from host CPU. //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void UnpinnedCopyEngine::CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { { std::lock_guard<std::mutex> l (_copyLock); const char *srcp0 = static_cast<const char*> (src); char *dstp1 = static_cast<char*> (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT); } int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer. int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest while (bytesRemaining1 > 0) { // First launch the async copies to copy from dest to host for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory, hsa_status); } srcp0 += theseBytes; // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 waitFor = NULL; } // Now unload the staging buffers: for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; tprintf (DB_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (DB_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); dstp1 += theseBytes; } } } }
//--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from host CPU. //IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { std::lock_guard<std::mutex> l (_copyLock); const char *srcp = static_cast<const char*> (src); char *dstp = static_cast<char*> (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT); } int bufferIndex = 0; size_t theseBytes= sizeBytes; //tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); //hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); //void * masked_srcp = (void*) ((uintptr_t)srcp & (uintptr_t)(~0x3f)) ; // TODO void *locked_srcp; //hsa_status_t hsa_status = hsa_amd_memory_lock(masked_srcp, theseBytes, &_hsaAgent, 1, &locked_srcp); hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast<char*> (srcp), theseBytes, &_hsaAgent, 1, &locked_srcp); //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: pin-in-place:%p+%zu bufferIndex[%d]\n", bytesRemaining, srcp, theseBytes, bufferIndex); //printf ("status=%x srcp=%p, masked_srcp=%p, locked_srcp=%p\n", hsa_status, srcp, masked_srcp, locked_srcp); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory, hsa_status); } hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, locked_srcp, _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory, hsa_status); } tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); hsa_amd_memory_unlock(const_cast<char*> (srcp)); // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 waitFor = NULL; }
void hsa_enqueue() { hsa_status_t err = 0; /* * Obtain the current queue write index. */ uint64_t index = hsa_queue_load_write_index_relaxed(queue); /* * Write the aql packet at the calculated queue index address. */ const uint32_t queueMask = queue->size - 1; hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]); dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatch_packet->workgroup_size_x = (uint16_t)LOCAL_SIZE; dispatch_packet->workgroup_size_y = (uint16_t)1; dispatch_packet->workgroup_size_z = (uint16_t)1; dispatch_packet->grid_size_x = (uint32_t) (N_DCNT); dispatch_packet->grid_size_y = 1; dispatch_packet->grid_size_z = 1; dispatch_packet->completion_signal = signal; dispatch_packet->kernel_object = kernel_object; dispatch_packet->kernarg_address = (void*) kernarg_address; dispatch_packet->private_segment_size = private_segment_size; dispatch_packet->group_segment_size = group_segment_size; __atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE); /* * Increment the write index and ring the doorbell to dispatch the kernel. */ hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); check(Dispatching the kernel, err); /* * Wait on the dispatch completion signal until the kernel is finished. */ hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); }
void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { std::lock_guard<std::mutex> l (_copyLock); const char *srcp = static_cast<const char*> (src); char *dstp = static_cast<char*> (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT); } int bufferIndex = 0; size_t theseBytes= sizeBytes; void *locked_destp; hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast<char*> (dstp), theseBytes, &_hsaAgent, 1, &locked_destp); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory, hsa_status); } hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpuAgent , srcp, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory, hsa_status); } tprintf (DB_COPY2, "D2H: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); hsa_amd_memory_unlock(const_cast<char*> (dstp)); // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 waitFor = NULL; }
int main(int argc, char **argv) { struct timespec timer_1, timer_2; hsa_status_t err; err = hsa_init(); check(Initializing the hsa runtime, err); /* * Iterate over the agents and pick the gpu agent using * the get_gpu_agent callback. */ hsa_agent_t agent; err = hsa_iterate_agents(get_gpu_agent, &agent); if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; } check(Getting a gpu agent, err); /* * Query the name of the agent. */ char name[64] = { 0 }; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); check(Querying the agent name, err); printf("The agent name is %s.\n", name); /* * Query the maximum size of the queue. */ uint32_t queue_size = 0; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); check(Querying the agent maximum queue size, err); printf("The maximum queue size is %u.\n", (unsigned int) queue_size); /* * Create a queue using the maximum size. */ hsa_queue_t* queue; err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); check(Creating the queue, err); /* * Load the BRIG binary. */ hsa_ext_module_t module; load_module_from_file("vector_copy.brig",&module); /* * Create hsa program. */ hsa_ext_program_t program; memset(&program,0,sizeof(hsa_ext_program_t)); err = hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program); check(Create the program, err); /* * Add the BRIG module to hsa program. */ err = hsa_ext_program_add_module(program, module); check(Adding the brig module to the program, err); /* * Determine the agents ISA. */ hsa_isa_t isa; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa); check(Query the agents isa, err); /* * Finalize the program and extract the code object. */ hsa_ext_control_directives_t control_directives; memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t)); hsa_code_object_t code_object; err = hsa_ext_program_finalize(program, isa, 0, control_directives, "-O0", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object); check(Finalizing the program, err); /* * Destroy the program, it is no longer needed. */ err=hsa_ext_program_destroy(program); check(Destroying the program, err); /* * Create the empty executable. */ hsa_executable_t executable; err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable); check(Create the executable, err); /* * Load the code<F3> object. */ err = hsa_executable_load_code_object(executable, agent, code_object, ""); check(Loading the code object, err); /* * Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); check(Freeze the executable, err); /* * Extract the symbol from the executable. */ hsa_executable_symbol_t symbol; err = hsa_executable_get_symbol(executable, "", "&__OpenCL_vector_copy_kernel", agent, 0, &symbol); check(Extract the symbol from the executable, err); /* * Extract dispatch information from the symbol */ uint64_t kernel_object; uint32_t kernarg_segment_size; uint32_t group_segment_size; uint32_t private_segment_size; err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); check(Extracting the symbol from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); check(Extracting the kernarg segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); check(Extracting the group segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); check(Extracting the private segment from the executable, err); /* * Create a signal to wait for the dispatch to finish. */ hsa_signal_t signal; err=hsa_signal_create(1, 0, NULL, &signal); check(Creating a HSA signal, err); /* * Allocate and initialize the kernel arguments and data. */ int* in=(int*)malloc(SIZE); int i; for(i=0;i<ELEMENT;i++) in[i]=(rand()%50000+1); err=hsa_memory_register(in, SIZE); check(Registering argument memory for input parameter, err); int* out=(int*)malloc(SIZE); memset(out, 0, SIZE); err=hsa_memory_register(out, SIZE); check(Registering argument memory for output parameter, err); int element = ELEMENT; int iter = ITER; struct __attribute__ ((aligned(16))) args_t { uint64_t global_offset_0; uint64_t global_offset_1; uint64_t global_offset_2; uint64_t printf_buffer; uint64_t vqueue_pointer; uint64_t aqlwrap_pointer; void* in; void* out; int iter; int element; } args; memset(&args, 0, sizeof(args)); args.in=in; args.out=out; args.element=element; args.iter=iter; /* * Find a memory region that supports kernel arguments. */ hsa_region_t kernarg_region; kernarg_region.handle=(uint64_t)-1; hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region); err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; check(Finding a kernarg memory region, err); void* kernarg_address = NULL; /* * Allocate the kernel argument buffer from the correct region. */ err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address); check(Allocating kernel argument memory buffer, err); memcpy(kernarg_address, &args, sizeof(args)); /* * Obtain the current queue write index. */ uint64_t index = hsa_queue_load_write_index_relaxed(queue); /* * Write the aql packet at the calculated queue index address. */ const uint32_t queueMask = queue->size - 1; hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]); dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatch_packet->workgroup_size_x = (uint16_t)LOCAL_SIZE; dispatch_packet->workgroup_size_y = (uint16_t)1; dispatch_packet->workgroup_size_z = (uint16_t)1; dispatch_packet->grid_size_x = (uint32_t) (GLOBAL_SIZE); dispatch_packet->grid_size_y = 1; dispatch_packet->grid_size_z = 1; dispatch_packet->completion_signal = signal; dispatch_packet->kernel_object = kernel_object; dispatch_packet->kernarg_address = (void*) kernarg_address; dispatch_packet->private_segment_size = private_segment_size; dispatch_packet->group_segment_size = group_segment_size; __atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE); /* * Increment the write index and ring the doorbell to dispatch the kernel. */ tic(&timer_1); hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); check(Dispatching the kernel, err); /* * Wait on the dispatch completion signal until the kernel is finished. */ hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); toc("Execution Period", &timer_1, &timer_2); /* * Validate the data in the output buffer. */ int temp = 0; for(i=0;i<element;i++) { if(temp<in[i]) temp = in[i]; } if(temp==out[GLOBAL_SIZE]) printf("PASS \n"); else printf("FAIL out=%d in=%d \n",out[GLOBAL_SIZE],temp); /* * Cleanup all allocated resources. */ err=hsa_signal_destroy(signal); check(Destroying the signal, err); err=hsa_executable_destroy(executable); check(Destroying the executable, err); err=hsa_code_object_destroy(code_object); check(Destroying the code object, err); err=hsa_queue_destroy(queue); check(Destroying the queue, err); err=hsa_shut_down(); check(Shutting down the runtime, err); free(in); free(out); //printf("kernarg_segment_size:%d group_segment_size:%d private_segment_size:%d",kernarg_segment_size,group_segment_size,private_segment_size); return 0; }
//--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). //IN: src - src pointer for copy. Must be accessible from host CPU. //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor) { std::lock_guard<std::mutex> l (_copyLock); const char *srcp0 = static_cast<const char*> (src); char *dstp1 = static_cast<char*> (dst); for (int i=0; i<_numBuffers; i++) { hsa_signal_store_relaxed(_completionSignal[i], 0); hsa_signal_store_relaxed(_completionSignal2[i], 0); } if (sizeBytes >= UINT64_MAX/2) { THROW_ERROR (hipErrorInvalidValue, HSA_STATUS_ERROR_INVALID_ARGUMENT); } int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer. int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest while (bytesRemaining1 > 0) { // First launch the async copies to copy from dest to host for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; // Wait to make sure we are not overwriting a buffer before it has been drained: hsa_signal_wait_acquire(_completionSignal2[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory, hsa_status); } srcp0 += theseBytes; // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 waitFor = NULL; } // Now unload the staging buffers: for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; tprintf (DB_COPY2, "P2P: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); bool hostWait = 0; // TODO - remove me if (hostWait) { // Host-side wait, should not be necessary: hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); hsa_signal_store_relaxed(_completionSignal2[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent /*not used*/, theseBytes, hostWait ? 0:1, hostWait ? NULL : &_completionSignal[bufferIndex], _completionSignal2[bufferIndex]); dstp1 += theseBytes; } } // Wait for the staging-buffer to dest copies to complete: for (int i=0; i<_numBuffers; i++) { hsa_signal_wait_acquire(_completionSignal2[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } }
int main(int argc, char **argv) { hsa_status_t err; err = hsa_init(); check(Initializing the hsa runtime, err); /* * Determine if the finalizer 1.0 extension is supported. */ bool support; err = hsa_system_extension_supported(HSA_EXTENSION_FINALIZER, 1, 0, &support); check(Checking finalizer 1.0 extension support, err); /* * Generate the finalizer function table. */ hsa_ext_finalizer_1_00_pfn_t table_1_00; err = hsa_system_get_extension_table(HSA_EXTENSION_FINALIZER, 1, 0, &table_1_00); check(Generating function table for finalizer, err); /* * Iterate over the agents and pick the gpu agent using * the get_gpu_agent callback. */ hsa_agent_t agent; err = hsa_iterate_agents(get_gpu_agent, &agent); if(err == HSA_STATUS_INFO_BREAK) { err = HSA_STATUS_SUCCESS; } check(Getting a gpu agent, err); /* * Query the name of the agent. */ char name[64] = { 0 }; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name); check(Querying the agent name, err); printf("The agent name is %s.\n", name); /* * Query the maximum size of the queue. */ uint32_t queue_size = 0; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); check(Querying the agent maximum queue size, err); printf("The maximum queue size is %u.\n", (unsigned int) queue_size); /* * Create a queue using the maximum size. */ hsa_queue_t* queue; err = hsa_queue_create(agent, queue_size, HSA_QUEUE_TYPE_SINGLE, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); check(Creating the queue, err); /* * Load the BRIG binary. */ hsa_ext_module_t module; load_module_from_file("vector_copy.brig",&module); /* * Create hsa program. */ hsa_ext_program_t program; memset(&program,0,sizeof(hsa_ext_program_t)); err = table_1_00.hsa_ext_program_create(HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, &program); check(Create the program, err); /* * Add the BRIG module to hsa program. */ err = table_1_00.hsa_ext_program_add_module(program, module); check(Adding the brig module to the program, err); /* * Determine the agents ISA. */ hsa_isa_t isa; err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &isa); check(Query the agents isa, err); /* * Finalize the program and extract the code object. */ hsa_ext_control_directives_t control_directives; memset(&control_directives, 0, sizeof(hsa_ext_control_directives_t)); hsa_code_object_t code_object; err = table_1_00.hsa_ext_program_finalize(program, isa, 0, control_directives, "", HSA_CODE_OBJECT_TYPE_PROGRAM, &code_object); check(Finalizing the program, err); /* * Destroy the program, it is no longer needed. */ err=table_1_00.hsa_ext_program_destroy(program); check(Destroying the program, err); /* * Create the empty executable. */ hsa_executable_t executable; err = hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &executable); check(Create the executable, err); /* * Load the code object. */ err = hsa_executable_load_code_object(executable, agent, code_object, ""); check(Loading the code object, err); /* * Freeze the executable; it can now be queried for symbols. */ err = hsa_executable_freeze(executable, ""); check(Freeze the executable, err); /* * Extract the symbol from the executable. */ hsa_executable_symbol_t symbol; err = hsa_executable_get_symbol(executable, NULL, "&__vector_copy_kernel", agent, 0, &symbol); check(Extract the symbol from the executable, err); /* * Extract dispatch information from the symbol */ uint64_t kernel_object; uint32_t kernarg_segment_size; uint32_t group_segment_size; uint32_t private_segment_size; err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); check(Extracting the symbol from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); check(Extracting the kernarg segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); check(Extracting the group segment size from the executable, err); err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); check(Extracting the private segment from the executable, err); /* * Create a signal to wait for the dispatch to finish. */ hsa_signal_t signal; err=hsa_signal_create(1, 0, NULL, &signal); check(Creating a HSA signal, err); /* * Allocate and initialize the kernel arguments and data. */ char* in=(char*)malloc(1024*1024*4); memset(in, 1, 1024*1024*4); err=hsa_memory_register(in, 1024*1024*4); check(Registering argument memory for input parameter, err); char* out=(char*)malloc(1024*1024*4); memset(out, 0, 1024*1024*4); err=hsa_memory_register(out, 1024*1024*4); check(Registering argument memory for output parameter, err); struct __attribute__ ((aligned(16))) args_t { void* in; void* out; } args; args.in=in; args.out=out; /* * Find a memory region that supports kernel arguments. */ hsa_region_t kernarg_region; kernarg_region.handle=(uint64_t)-1; hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &kernarg_region); err = (kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS; check(Finding a kernarg memory region, err); void* kernarg_address = NULL; /* * Allocate the kernel argument buffer from the correct region. */ err = hsa_memory_allocate(kernarg_region, kernarg_segment_size, &kernarg_address); check(Allocating kernel argument memory buffer, err); memcpy(kernarg_address, &args, sizeof(args)); /* * Obtain the current queue write index. */ uint64_t index = hsa_queue_load_write_index_relaxed(queue); /* * Write the aql packet at the calculated queue index address. */ const uint32_t queueMask = queue->size - 1; hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(queue->base_address))[index&queueMask]); dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; dispatch_packet->workgroup_size_x = (uint16_t)256; dispatch_packet->workgroup_size_y = (uint16_t)1; dispatch_packet->workgroup_size_z = (uint16_t)1; dispatch_packet->grid_size_x = (uint32_t) (1024*1024); dispatch_packet->grid_size_y = 1; dispatch_packet->grid_size_z = 1; dispatch_packet->completion_signal = signal; dispatch_packet->kernel_object = kernel_object; dispatch_packet->kernarg_address = (void*) kernarg_address; dispatch_packet->private_segment_size = private_segment_size; dispatch_packet->group_segment_size = group_segment_size; uint16_t header = 0; header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; __atomic_store_n((uint16_t*)(&dispatch_packet->header), header, __ATOMIC_RELEASE); /* * Increment the write index and ring the doorbell to dispatch the kernel. */ hsa_queue_store_write_index_relaxed(queue, index+1); hsa_signal_store_relaxed(queue->doorbell_signal, index); check(Dispatching the kernel, err); /* * Wait on the dispatch completion signal until the kernel is finished. */ hsa_signal_value_t value = hsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); /* * Validate the data in the output buffer. */ int valid=1; int fail_index=0; for(int i=0; i<1024*1024; i++) { if(out[i]!=in[i]) { fail_index=i; valid=0; break; } } if(valid) { printf("Passed validation.\n"); } else { printf("VALIDATION FAILED!\nBad index: %d\n", fail_index); } /* * Cleanup all allocated resources. */ err = hsa_memory_free(kernarg_address); check(Freeing kernel argument memory buffer, err); err=hsa_signal_destroy(signal); check(Destroying the signal, err); err=hsa_executable_destroy(executable); check(Destroying the executable, err); err=hsa_code_object_destroy(code_object); check(Destroying the code object, err); err=hsa_queue_destroy(queue); check(Destroying the queue, err); err=hsa_shut_down(); check(Shutting down the runtime, err); free(in); free(out); return 0; }
void pocl_hsa_run (void *data, _cl_command_node* cmd) { struct data *d; struct pocl_argument *al; unsigned i; cl_kernel kernel = cmd->command.run.kernel; struct pocl_context *pc = &cmd->command.run.pc; hsa_signal_value_t initial_value = 1; #if 0 /* Not yet supported by the reference library. */ hsa_kernel_dispatch_packet_t kernel_packet; #else hsa_dispatch_packet_t kernel_packet; #endif hsa_signal_t kernel_completion_signal = 0; hsa_region_t region; int error; amdgpu_args_t *args; /* 32b word offset in the kernel arguments buffer we can push the next argument to. */ int args_offset = 0; assert (data != NULL); d = (struct data *) data; d->current_kernel = kernel; memset (&kernel_packet, 0, sizeof (hsa_dispatch_packet_t)); #if 0 /* TODO: not yet supported by the open source runtime implementation. Assume the HSA Full profile so we can simply use host malloc(). */ hsa_agent_iterate_regions(kernel_agent, pocl_hsa_get_kernarg, ®ion); if (hsa_memory_allocate(region, sizeof(amdgpu_args_t), (void**)&args) != HSA_STATUS_SUCCESS) { assert (0 && "hsa_memory_allocate() failed."); } #else args = (amdgpu_args_t*)malloc(sizeof(amdgpu_args_t)); #endif kernel_packet.kernarg_address = (uint64_t)args; /* Process the kernel arguments. Convert the opaque buffer pointers to real device pointers, allocate dynamic local memory buffers, etc. */ for (i = 0; i < kernel->num_args; ++i) { al = &(cmd->command.run.arguments[i]); if (kernel->arg_info[i].is_local) { POCL_ABORT_UNIMPLEMENTED("pocl-hsa: local buffers not implemented."); #if 0 arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_hsa_malloc(data, 0, al->size, NULL); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER) { if (args_offset + 1 >= MAX_KERNEL_ARG_WORDS) POCL_ABORT("pocl-hsa: too many kernel arguments!"); /* Assuming the pointers are 64b (or actually the same as in host) due to HSA. TODO: the 32b profile. */ if (al->value == NULL) { args->kernel_args[args_offset] = 0; args->kernel_args[args_offset + 1] = 0; } else { *(uint64_t*)&args->kernel_args[args_offset] = (uint64_t)(*(cl_mem *) (al->value))-> device_ptrs[cmd->device->dev_id].mem_ptr; } args_offset += 2; #if 0 /* It's legal to pass a NULL pointer to clSetKernelArguments. In that case we must pass the same NULL forward to the kernel. Otherwise, the user must have created a buffer with per device pointers stored in the cl_mem. */ if (al->value == NULL) { arguments[i] = malloc (sizeof (void *)); *(void **)arguments[i] = NULL; } else arguments[i] = &((*(cl_mem *) (al->value))->device_ptrs[cmd->device->dev_id].mem_ptr); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE) { POCL_ABORT_UNIMPLEMENTED("hsa: image arguments not implemented."); #if 0 dev_image_t di; fill_dev_image_t (&di, al, cmd->device); void* devptr = pocl_hsa_malloc (data, 0, sizeof(dev_image_t), NULL); arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = devptr; pocl_hsa_write (data, &di, devptr, 0, sizeof(dev_image_t)); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER) { POCL_ABORT_UNIMPLEMENTED("hsa: sampler arguments not implemented."); #if 0 dev_sampler_t ds; arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, sizeof(dev_sampler_t), NULL); pocl_hsa_write (data, &ds, *(void**)arguments[i], 0, sizeof(dev_sampler_t)); #endif } else { if (args_offset >= MAX_KERNEL_ARG_WORDS) POCL_ABORT("pocl-hsa: too many kernel arguments!"); /* Assuming the scalar fits to a 32b slot. TODO! */ assert (al->size <= 4); args->kernel_args[args_offset] = *(uint32_t*)al->value; ++args_offset; } } for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) { POCL_ABORT_UNIMPLEMENTED("hsa: automatic local buffers not implemented."); #if 0 al = &(cmd->command.run.arguments[i]); arguments[i] = malloc (sizeof (void *)); *(void **)(arguments[i]) = pocl_hsa_malloc (data, 0, al->size, NULL); #endif } args->workgroup_size_x = kernel_packet.workgroup_size_x = cmd->command.run.local_x; args->workgroup_size_y = kernel_packet.workgroup_size_y = cmd->command.run.local_y; args->workgroup_size_z = kernel_packet.workgroup_size_z = cmd->command.run.local_z; kernel_packet.grid_size_x = pc->num_groups[0] * cmd->command.run.local_x; kernel_packet.grid_size_y = pc->num_groups[1] * cmd->command.run.local_y; kernel_packet.grid_size_z = pc->num_groups[2] * cmd->command.run.local_z; /* AMDGPU specific OpenCL argument data. */ args->wgs_x = pc->num_groups[0]; args->wgs_y = pc->num_groups[1]; args->wgs_z = pc->num_groups[2]; kernel_packet.dimensions = 1; if (cmd->command.run.local_y > 1) kernel_packet.dimensions = 2; if (cmd->command.run.local_z > 1) kernel_packet.dimensions = 3; kernel_packet.header.type = HSA_PACKET_TYPE_DISPATCH; kernel_packet.header.acquire_fence_scope = HSA_FENCE_SCOPE_SYSTEM; kernel_packet.header.release_fence_scope = HSA_FENCE_SCOPE_SYSTEM; kernel_packet.header.barrier = 1; kernel_packet.kernel_object_address = *(hsa_amd_code_t*)cmd->command.run.device_data[1]; error = hsa_signal_create(initial_value, 0, NULL, &kernel_completion_signal); assert (error == HSA_STATUS_SUCCESS); kernel_packet.completion_signal = kernel_completion_signal; { /* Launch the kernel by allocating a slot in the queue, writing the command to it, signaling the update with a door bell and finally, block waiting until finish signalled with the completion_signal. */ const uint32_t queue_mask = d->queue->size - 1; uint64_t queue_index = hsa_queue_load_write_index_relaxed(d->queue); hsa_signal_value_t sigval; ((hsa_dispatch_packet_t*)(d->queue->base_address))[queue_index & queue_mask] = kernel_packet; hsa_queue_store_write_index_relaxed(d->queue, queue_index + 1); hsa_signal_store_relaxed(d->queue->doorbell_signal, queue_index); sigval = hsa_signal_wait_acquire(kernel_completion_signal, HSA_EQ, 0, (uint64_t)(-1), HSA_WAIT_EXPECTANCY_UNKNOWN); } for (i = 0; i < kernel->num_args; ++i) { if (kernel->arg_info[i].is_local) { #if 0 pocl_hsa_free (data, 0, *(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); #endif } else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE) { #if 0 pocl_hsa_free (data, 0, *(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); #endif } #if 0 else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER || (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER && *(void**)args->kernel_args[i] == NULL)) { POCL_MEM_FREE(arguments[i]); } #endif } for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) { #if 0 pocl_hsa_free(data, 0, *(void **)(arguments[i])); POCL_MEM_FREE(arguments[i]); #endif } free(args); }
void MultiKernelDispatch<T>::queueKernel(HSAContext::Kernel* p_kernel, T kern_args, Launch_params_t lparm, bool wait) { if (wait) { hsa_status_t err; m_expected_value =1; err = hsa_signal_create(m_expected_value, 0, NULL, &m_signal); } HSAContextKaveriImpl* p_ctx = (HSAContextKaveriImpl*) m_pcontext; /* Obtain the current queue write index. increases with each call to kernel */ uint64_t index = hsa_queue_load_write_index_relaxed(p_ctx->commandQueue); /* printf("DEBUG:Call #%d to kernel \"%s\" \n",(int) index+1,"run"); */ HSAContextKaveriImpl::KernelImpl *p_impl = (HSAContextKaveriImpl::KernelImpl *) p_kernel; /* Setup this call to this kernel dispatch packet from scratch. */ memset(&m_aql, 0, sizeof(m_aql)); /* Set the dimensions passed from the application */ m_aql.dimensions = (uint16_t) lparm.ndim; m_aql.grid_size_x = lparm.gdims[0]; m_aql.workgroup_size_x = lparm.ldims[0]; m_aql.completion_signal = m_signal; if (lparm.ndim > 1) { m_aql.grid_size_y = lparm.gdims[1]; m_aql.workgroup_size_y = lparm.ldims[1]; } else { m_aql.grid_size_y = 1; m_aql.workgroup_size_y = 1; } if (lparm.ndim > 2) { m_aql.grid_size_z = lparm.gdims[2]; m_aql.workgroup_size_z = lparm.ldims[2]; } else { m_aql.grid_size_z = 1; m_aql.workgroup_size_z = 1; } /* In the future, we may use environment variables for some of these */ m_aql.header.type = HSA_PACKET_TYPE_DISPATCH; m_aql.header.acquire_fence_scope = 2; m_aql.header.release_fence_scope = 2; m_aql.header.barrier = 1; m_aql.group_segment_size = p_impl->hsaCodeDescriptor->workgroup_group_segment_byte_size; m_aql.private_segment_size = p_impl->hsaCodeDescriptor->workitem_private_segment_byte_size; void * run_kernel_arg_buffer = p_impl->m_run_arg_buffer; /* copy args from the custom run_args structure */ /* FIXME We should align kernel_arg_buffer because run_args is aligned */ memcpy(run_kernel_arg_buffer, &kern_args, sizeof(kern_args)); /* Bind kernelcode to the packet. */ m_aql.kernel_object_address = p_impl->hsaCodeDescriptor->code.handle; /* Bind kernel argument buffer to the aql packet. */ m_aql.kernarg_address = (uint64_t) run_kernel_arg_buffer; const uint32_t queueMask = p_ctx->commandQueue->size - 1; const uint32_t pos = (index) & queueMask; ((hsa_dispatch_packet_t*) (p_ctx->commandQueue->base_address))[pos] = m_aql; if (wait) hsa_signal_store_relaxed(p_ctx->commandQueue->doorbell_signal, index); /* Increment the write index and ring the doorbell to dispatch the kernel. */ hsa_queue_store_write_index_relaxed(p_ctx->commandQueue, index + 1); if (wait) { hsa_signal_wait_acquire(m_signal, HSA_LT, m_expected_value, (uint64_t)-1, HSA_WAIT_EXPECTANCY_SHORT); hsa_signal_destroy(m_signal); m_signal = 0; } m_expected_value--; }