hsa_status_t HSA_API
hsa_amd_queue_sdma_create(hsa_agent_t agent_handle, size_t buffer_size,
                          void* buffer_addr, uint64_t* queue_id,
                          uint32_t** read_ptr, uint32_t** write_ptr,
                          uint32_t** doorbell) {
  IS_OPEN();  // Fail fast if the runtime has not been initialized.

  static const size_t kPageSize = 4096;

  if (!IsMultipleOf(buffer_size, kPageSize) ||
      !IsMultipleOf(buffer_addr, kPageSize) || queue_id == NULL ||
      read_ptr == NULL || write_ptr == NULL || doorbell == NULL) {
    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  core::Agent* agent = core::Agent::Convert(agent_handle);

  IS_VALID(agent);

  if (agent->device_type() != core::Agent::kAmdGpuDevice) {
    return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION;
  }

  amd::GpuAgentInt* gpu_agent = static_cast<amd::GpuAgentInt*>(agent);

  HsaQueueResource queue_resource = {0};
  const HSA_QUEUE_TYPE kQueueType = HSA_QUEUE_SDMA;
  if (HSAKMT_STATUS_SUCCESS !=
      hsaKmtCreateQueue(gpu_agent->node_id(), kQueueType,
                        /*QueuePercentage=*/100, HSA_QUEUE_PRIORITY_MAXIMUM,
                        buffer_addr, buffer_size, NULL, &queue_resource)) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  *queue_id = queue_resource.QueueId;
  *read_ptr = queue_resource.Queue_read_ptr;
  *write_ptr = queue_resource.Queue_write_ptr;
  *doorbell = queue_resource.Queue_DoorBell;

  return HSA_STATUS_SUCCESS;
}
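
// Example (illustrative only, not part of the runtime): a minimal caller
// sketch for the entry point above. The page-size multiples mirror the
// validation at the top of hsa_amd_queue_sdma_create; `gpu_agent` is assumed
// to be an hsa_agent_t obtained from the usual hsa_iterate_agents enumeration.
//
//   size_t buffer_size = 4096;  // one page, a multiple of kPageSize
//   void* buffer_addr = NULL;
//   posix_memalign(&buffer_addr, 4096, buffer_size);  // page-aligned ring
//
//   uint64_t queue_id;
//   uint32_t* read_ptr;
//   uint32_t* write_ptr;
//   uint32_t* doorbell;
//   hsa_status_t status =
//       hsa_amd_queue_sdma_create(gpu_agent, buffer_size, buffer_addr,
//                                 &queue_id, &read_ptr, &write_ptr, &doorbell);
//   assert(status == HSA_STATUS_SUCCESS);
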
MemoryRegion::MemoryRegion(bool fine_grain, const core::Agent& owner,
                           const HsaMemoryProperties& mem_props)
    : core::MemoryRegion(fine_grain),
      owner_(&owner),
      mem_props_(mem_props),
      max_single_alloc_size_(0),
      virtual_size_(0) {
  virtual_size_ = GetPhysicalSize();

  mem_flag_.Value = 0;
  if (IsLocalMemory()) {
    mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    mem_flag_.ui32.NoSubstitute = 1;
    mem_flag_.ui32.HostAccess = 0;
    mem_flag_.ui32.NonPaged = 1;

    // Optional user override of the largest single allocation, clamped to
    // [physical size / 4, physical size].
    char* char_end = NULL;
    HSAuint64 max_alloc_size = static_cast<HSAuint64>(strtoull(
        os::GetEnvVar("HSA_LOCAL_MEMORY_MAX_ALLOC").c_str(), &char_end, 10));
    max_alloc_size = std::max(max_alloc_size, GetPhysicalSize() / 4);
    max_alloc_size = std::min(max_alloc_size, GetPhysicalSize());
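    // Worked example (illustrative): with 1 GiB of local memory and
    // HSA_LOCAL_MEMORY_MAX_ALLOC unset, strtoull() yields 0, std::max()
    // raises that to 1 GiB / 4 = 256 MiB, and std::min() leaves it there,
    // so single allocations are capped at a quarter of VRAM by default.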

    max_single_alloc_size_ =
        AlignDown(static_cast<size_t>(max_alloc_size), kPageSize_);

    // Local memory is addressed through a fixed 1 TiB GPU VM aperture.
    static const HSAuint64 kGpuVmSize = (1ULL << 40);
    virtual_size_ = kGpuVmSize;
  } else if (IsSystem()) {
    mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB;
    mem_flag_.ui32.NoSubstitute = 1;
    mem_flag_.ui32.HostAccess = 1;
    mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED;

    max_single_alloc_size_ =
        AlignDown(static_cast<size_t>(GetPhysicalSize()), kPageSize_);

    virtual_size_ = os::GetUserModeVirtualMemorySize();
  }

  assert(GetVirtualSize() != 0);
  assert(GetPhysicalSize() <= GetVirtualSize());
  assert(IsMultipleOf(max_single_alloc_size_, kPageSize_));
}
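
// Illustrative note: a user can raise the local-memory allocation cap before
// process start, e.g. HSA_LOCAL_MEMORY_MAX_ALLOC=536870912 for 512 MiB;
// values above the physical VRAM size are clamped back down by the
// std::min() above.
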
GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props,
                   const std::vector<HsaCacheProperties>& cache_props)
    : node_id_(node),
      properties_(node_props),
      cache_props_(cache_props),
      ape1_base_(0),
      ape1_size_(0),
      current_coherency_type_(HSA_AMD_COHERENCY_TYPE_COHERENT),
      blit_(NULL) {
  HSAKMT_STATUS err = hsaKmtGetClockCounters(node_id_, &t0_);
  t1_ = t0_;
  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtGetClockCounters error");

  HsaMemFlags flags;
  flags.Value = 0;
  flags.ui32.Scratch = 1;
  flags.ui32.HostAccess = 1;

  // Bytes of scratch per work-item, overridable via the HSA_SCRATCH_MEM
  // environment variable.
  scratch_per_thread_ = atoi(os::GetEnvVar("HSA_SCRATCH_MEM").c_str());
  if (scratch_per_thread_ == 0)
    scratch_per_thread_ = DEFAULT_SCRATCH_BYTES_PER_THREAD;

  int queues = atoi(os::GetEnvVar("HSA_MAX_QUEUES").c_str());
#if !defined(HSA_LARGE_MODEL) || !defined(__linux__)
  if (queues == 0) queues = 10;
#endif

  // Per-queue scratch length is: waves/CU (32) * threads/wave (64) *
  // #CUs (8) * scratch/thread; the total is that times the queue count.
  queue_scratch_len_ = AlignUp(32 * 64 * 8 * scratch_per_thread_, 65536);
  size_t scratchLen = queue_scratch_len_ * queues;
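  // Worked example (illustrative, assuming DEFAULT_SCRATCH_BYTES_PER_THREAD
  // is 2 KiB): 32 * 64 * 8 * 2048 = 32 MiB per queue, already 64 KiB aligned,
  // so ten default queues reserve 320 MiB of scratch address space.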

// For 64-bit linux use max queues unless otherwise specified
#if defined(HSA_LARGE_MODEL) && defined(__linux__)
  if ((scratchLen == 0) || (scratchLen > 4294967296))
    scratchLen = 4294967296;  // 4GB aperture max
#endif

  void* scratchBase;
  err = hsaKmtAllocMemory(node_id_, scratchLen, flags, &scratchBase);
  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAllocMemory(Scratch) failed");
  assert(IsMultipleOf(scratchBase, 0x1000) &&
         "Scratch base is not page aligned!");

  // Re-seat the scratch pool over the new aperture: explicitly destroy the
  // default-constructed heap, then placement-new it in place.
  scratch_pool_.~SmallHeap();
  new (&scratch_pool_) SmallHeap(scratchBase, scratchLen);

  if (sizeof(void*) == 8) {
    // 64-bit only: set up the APE1 memory region, which contains
    // non-coherent memory.

    static const size_t kApe1Alignment = 64 * 1024;
    static const size_t kApe1Size = kApe1Alignment;

    const HsaMemoryProperties ape1_prop =
        ReserveApe1(node_id_, kApe1Size, kApe1Alignment);

    if (ape1_prop.SizeInBytes > 0) {
      SetApe1BaseAndSize(static_cast<uintptr_t>(ape1_prop.VirtualBaseAddress),
                         static_cast<size_t>(ape1_prop.SizeInBytes));
    }
  }
}
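
// Illustrative note (assumption, not confirmed by this excerpt): APE1 is the
// GPU address aperture that the HSA memory model treats as non-coherent, and
// the base and size cached via SetApe1BaseAndSize above are presumed to back
// the coherency switching exposed through hsa_amd_coherency_get_type /
// hsa_amd_coherency_set_type, matching the current_coherency_type_ member
// initialized in this constructor.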