hsa_status_t HSA_API hsa_amd_queue_sdma_create(
    hsa_agent_t agent_handle, size_t buffer_size, void* buffer_addr,
    uint64_t* queue_id, uint32_t** read_ptr, uint32_t** write_ptr,
    uint32_t** doorbell) {
  IS_OPEN();

  static const size_t kPageSize = 4096;

  // The ring buffer must be page aligned and a whole number of pages,
  // and all output pointers must be valid.
  if (!IsMultipleOf(buffer_size, kPageSize) ||
      !IsMultipleOf(buffer_addr, kPageSize) || queue_id == NULL ||
      read_ptr == NULL || write_ptr == NULL || doorbell == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  core::Agent* agent = core::Agent::Convert(agent_handle);
  IS_VALID(agent);

  // SDMA queues exist only on AMD GPU agents.
  if (agent->device_type() != core::Agent::kAmdGpuDevice) {
    return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION;
  }

  amd::GpuAgentInt* gpu_agent = static_cast<amd::GpuAgentInt*>(agent);

  HsaQueueResource queue_resource = {0};
  const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
  // Create the queue with 100% queue percentage and maximum priority.
  if (HSAKMT_STATUS_SUCCESS !=
      hsaKmtCreateQueue(gpu_agent->node_id(), kQueueType_, 100,
                        HSA_QUEUE_PRIORITY_MAXIMUM, buffer_addr, buffer_size,
                        NULL, &queue_resource)) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  // Return the kernel-assigned queue id and the mapped ring control
  // pointers to the caller.
  *queue_id = queue_resource.QueueId;
  *read_ptr = queue_resource.Queue_read_ptr;
  *write_ptr = queue_resource.Queue_write_ptr;
  *doorbell = queue_resource.Queue_DoorBell;

  return HSA_STATUS_SUCCESS;
}
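A minimal usage sketch of this entry point, not taken from the runtime: it assumes an already-initialized HSA runtime, a valid GPU agent handle, the usual hsa.h/hsa_ext_amd.h headers, and a POSIX host for posix_memalign. Whether the ring buffer must first be registered with the runtime for device access is outside this sketch's scope; the ring size is an arbitrary example and the helper name is illustrative.

#include <stdlib.h>  // posix_memalign, free

// Hypothetical helper: create an SDMA queue on `gpu_agent` backed by a
// page-aligned ring buffer.
static hsa_status_t example_create_sdma_queue(hsa_agent_t gpu_agent) {
  const size_t kRingBytes = 64 * 4096;  // a whole number of 4KB pages
  void* ring = NULL;
  if (posix_memalign(&ring, 4096, kRingBytes) != 0)
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;

  uint64_t queue_id = 0;
  uint32_t *read_ptr = NULL, *write_ptr = NULL, *doorbell = NULL;
  hsa_status_t status =
      hsa_amd_queue_sdma_create(gpu_agent, kRingBytes, ring, &queue_id,
                                &read_ptr, &write_ptr, &doorbell);
  if (status != HSA_STATUS_SUCCESS) {
    free(ring);
    return status;
  }
  // The caller now drives the queue: write SDMA packets into `ring`,
  // advance *write_ptr, and signal the hardware through *doorbell.
  return HSA_STATUS_SUCCESS;
}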
MemoryRegion::MemoryRegion(bool fine_grain, const core::Agent& owner,
                           const HsaMemoryProperties& mem_props)
    : core::MemoryRegion(fine_grain),
      owner_(&owner),
      mem_props_(mem_props),
      max_single_alloc_size_(0),
      virtual_size_(0) {
  virtual_size_ = GetPhysicalSize();

  mem_flag_.Value = 0;
  if (IsLocalMemory()) {
    // Device-local (VRAM) region: pinned and not host accessible.
    mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    mem_flag_.ui32.NoSubstitute = 1;
    mem_flag_.ui32.HostAccess = 0;
    mem_flag_.ui32.NonPaged = 1;

    // The maximum single allocation size may be overridden via
    // HSA_LOCAL_MEMORY_MAX_ALLOC; it is clamped to
    // [physical / 4, physical] and rounded down to a page boundary.
    char* char_end = NULL;
    HSAuint64 max_alloc_size = static_cast<HSAuint64>(strtoull(
        os::GetEnvVar("HSA_LOCAL_MEMORY_MAX_ALLOC").c_str(), &char_end, 10));
    max_alloc_size = std::max(max_alloc_size, GetPhysicalSize() / 4);
    max_alloc_size = std::min(max_alloc_size, GetPhysicalSize());

    max_single_alloc_size_ =
        AlignDown(static_cast<size_t>(max_alloc_size), kPageSize_);

    static const HSAuint64 kGpuVmSize = (1ULL << 40);
    virtual_size_ = kGpuVmSize;
  } else if (IsSystem()) {
    // Host (system) region: cached and host accessible.
    mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    mem_flag_.ui32.NoSubstitute = 1;
    mem_flag_.ui32.HostAccess = 1;
    mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED;

    max_single_alloc_size_ =
        AlignDown(static_cast<size_t>(GetPhysicalSize()), kPageSize_);

    virtual_size_ = os::GetUserModeVirtualMemorySize();
  }

  assert(GetVirtualSize() != 0);
  assert(GetPhysicalSize() <= GetVirtualSize());
  assert(IsMultipleOf(max_single_alloc_size_, kPageSize_));
}
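A standalone sketch of the clamping rule used above for local memory, with illustrative names (ClampMaxAlloc is not a runtime function): an HSA_LOCAL_MEMORY_MAX_ALLOC override is raised to at least a quarter of the physical size, capped at the full physical size, then rounded down to a page boundary.

#include <algorithm>
#include <cstdint>

// Mirrors the max-single-allocation clamp in MemoryRegion's constructor.
static uint64_t ClampMaxAlloc(uint64_t requested, uint64_t physical,
                              uint64_t page_size) {
  uint64_t v = std::max(requested, physical / 4);  // floor: physical / 4
  v = std::min(v, physical);                       // cap: full physical size
  return v - (v % page_size);                      // AlignDown to a page
}
// Example: with 4 GiB of local memory and no override (requested == 0),
// the result is 1 GiB; an 8 GiB request is capped at 4 GiB.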
GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props,
                   const std::vector<HsaCacheProperties>& cache_props)
    : node_id_(node),
      properties_(node_props),
      cache_props_(cache_props),
      ape1_base_(0),
      ape1_size_(0),
      current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
      blit_(NULL) {
  // Sample the clock counters once so later timestamp queries have a
  // baseline.
  HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id_, &t0_);
  t1_ = t0_;
  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtGetClockCounters error");

  HsaMemFlags flags;
  flags.Value = 0;
  flags.ui32.Scratch = 1;
  flags.ui32.HostAccess = 1;

  // Per-thread scratch size may be overridden via HSA_SCRATCH_MEM.
  scratch_per_thread_ = atoi(os::GetEnvVar("HSA_SCRATCH_MEM").c_str());
  if (scratch_per_thread_ == 0)
    scratch_per_thread_ = DEFAULT_SCRATCH_BYTES_PER_THREAD;

  int queues = atoi(os::GetEnvVar("HSA_MAX_QUEUES").c_str());
#if !defined(HSA_LARGE_MODEL) || !defined(__linux__)
  if (queues == 0) queues = 10;
#endif

  // Total scratch length is: waves/CU * threads/wave * queues * #CUs *
  // scratch/thread, with the per-queue portion rounded up to 64KB.
  queue_scratch_len_ = AlignUp(32 * 64 * 8 * scratch_per_thread_, 65536);
  size_t scratchLen = queue_scratch_len_ * queues;

// For 64-bit Linux, use max queues unless otherwise specified.
#if defined(HSA_LARGE_MODEL) && defined(__linux__)
  if ((scratchLen == 0) || (scratchLen > 4294967296))
    scratchLen = 4294967296;  // 4GB aperture max
#endif

  void* scratchBase;
  err = hsaKmtAllocMemory(node_id_, scratchLen, flags, &scratchBase);
  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(Scratch) failed");
  assert(IsMultipleOf(scratchBase, 0x1000) &&
         "Scratch base is not page aligned!");

  // Rebuild the scratch pool heap in place over the new allocation.
  scratch_pool_.~SmallHeap();
  new (&scratch_pool_) SmallHeap(scratchBase, scratchLen);

  if (sizeof(void*) == 8) {
    // 64-bit only. Set up the APE1 memory region, which contains
    // non-coherent memory.
    static const size_t kApe1Alignment = 64 * 1024;
    static const size_t kApe1Size = kApe1Alignment;
    const HsaMemoryProperties ape1_prop =
        ReserveApe1(node_id_, kApe1Size, kApe1Alignment);
    if (ape1_prop.SizeInBytes > 0) {
      SetApe1BaseAndSize((uintptr_t)ape1_prop.VirtualBaseAddress,
                         (size_t)ape1_prop.SizeInBytes);
    }
  }
}
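A worked example of the scratch sizing arithmetic above. The 2048 bytes/lane figure below is an assumed illustrative value, not necessarily the runtime's actual DEFAULT_SCRATCH_BYTES_PER_THREAD; everything else follows the constructor's formula.

#include <cstddef>
#include <cstdio>

// Round up to the next 64KB boundary, as AlignUp(..., 65536) does above.
static size_t AlignUp64K(size_t v) { return (v + 65535) & ~size_t(65535); }

int main() {
  const size_t bytes_per_lane = 2048;  // assumed example value
  // Per-queue scratch: 32 waves/CU * 64 lanes/wave * 8 * bytes/lane.
  size_t per_queue = AlignUp64K(32 * 64 * 8 * bytes_per_lane);
  std::printf("per-queue scratch: %zu bytes (32 MiB)\n", per_queue);
  // With 10 queues (the non-large-model default) the pool would be
  // 10 * 32 MiB = 320 MiB; on 64-bit Linux the pool is capped at 4GB.
  return 0;
}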