static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
		struct queue_properties *q)
{
	uint64_t addr;
	struct cik_mqd *m;
	int retval;

	BUG_ON(!mm || !q || !mqd || !mqd_mem_obj);

	pr_debug("kfd: In func %s\n", __func__);

	retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd),
					mqd_mem_obj);

	if (retval != 0)
		return -ENOMEM;

	m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr;
	addr = (*mqd_mem_obj)->gpu_addr;

	memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256));

	m->header = 0xC0310800;
	m->compute_pipelinestat_enable = 1;
	m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;

	m->cp_hqd_persistent_state = DEFAULT_CP_HQD_PERSISTENT_STATE |
					PRELOAD_REQ;
	m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS |
				QUANTUM_DURATION(10);

	m->cp_mqd_control             = MQD_CONTROL_PRIV_STATE_EN;
	m->cp_mqd_base_addr_lo        = lower_32_bits(addr);
	m->cp_mqd_base_addr_hi        = upper_32_bits(addr);

	m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE;

	/*
	 * Pipe Priority
	 * Identifies the pipe relative priority when this queue is connected
	 * to the pipeline. The pipe priority is against the GFX pipe and HP3D.
	 * In KFD we are using a fixed pipe priority set to CS_MEDIUM.
	 * 0 = CS_LOW (typically below GFX)
	 * 1 = CS_MEDIUM (typically between HP3D and GFX)
	 * 2 = CS_HIGH (typically above HP3D)
	 */
	m->cp_hqd_pipe_priority = 1;
	m->cp_hqd_queue_priority = 15;

	*mqd = m;
	if (gart_addr)
		*gart_addr = addr;
	retval = mm->update_mqd(mm, m, q);

	return retval;
}
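
The MQD object allocated above has a matching release path; as a minimal sketch (the name uninit_mqd_sketch is illustrative, but kfd_gtt_sa_free() is the same counterpart used later in these examples), the uninit side only has to hand the kfd_mem_obj back to the sub-allocator:

static void uninit_mqd_sketch(struct mqd_manager *mm, void *mqd,
				struct kfd_mem_obj *mqd_mem_obj)
{
	/* Sketch only: return the GTT sub-allocation that backs both the
	 * CPU and GPU mappings of the MQD.
	 */
	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
}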
Example #2
static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
			struct queue_properties *q)
{
	int retval;
	struct cik_sdma_rlc_registers *m;

	retval = kfd_gtt_sa_allocate(mm->dev,
					sizeof(struct cik_sdma_rlc_registers),
					mqd_mem_obj);

	if (retval != 0)
		return -ENOMEM;

	m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr;

	memset(m, 0, sizeof(struct cik_sdma_rlc_registers));

	*mqd = m;
	if (gart_addr)
		*gart_addr = (*mqd_mem_obj)->gpu_addr;

	retval = mm->update_mqd(mm, m, q);

	return retval;
}
Example #3
static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev,
			enum kfd_queue_type type, unsigned int queue_size)
{
	int retval;

	retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem);
	if (retval)
		return false;

	kq->eop_gpu_addr = kq->eop_mem->gpu_addr;
	kq->eop_kernel_addr = kq->eop_mem->cpu_ptr;

	memset(kq->eop_kernel_addr, 0, PAGE_SIZE);

	return true;
}
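
A matching teardown for the v9 kernel queue only needs to return the EOP buffer; a minimal sketch under that assumption (uninitialize_v9_sketch is a hypothetical name):

static void uninitialize_v9_sketch(struct kernel_queue *kq)
{
	/* Sketch only: free the per-queue EOP buffer allocated in
	 * initialize_v9() back to the GTT sub-allocator.
	 */
	kfd_gtt_sa_free(kq->dev, kq->eop_mem);
}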
Example #4
static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
		struct queue_properties *q)
{
	int retval;
	struct kfd_mem_obj *mqd_mem_obj = NULL;

	if (q->type == KFD_QUEUE_TYPE_HIQ)
		return allocate_hiq_mqd(kfd);

	/* From V9 on, for CWSR, the control stack is located on the next page
	 * boundary after the MQD, so we use the GTT allocation function
	 * instead of the sub-allocation function.
	 */
	if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
		mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
		if (!mqd_mem_obj)
			return NULL;
		retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd,
			ALIGN(q->ctl_stack_size, PAGE_SIZE) +
				ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
			&(mqd_mem_obj->gtt_mem),
			&(mqd_mem_obj->gpu_addr),
			(void *)&(mqd_mem_obj->cpu_ptr), true);
	} else {
		retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v9_mqd),
				&mqd_mem_obj);
	}

	if (retval) {
		kfree(mqd_mem_obj);
		return NULL;
	}

	return mqd_mem_obj;

}
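
Since allocate_mqd() may return either a full GTT allocation (CWSR compute queues) or a sub-allocation, the free path has to tell the two apart. A minimal sketch, assuming amdgpu_amdkfd_free_gtt_mem() is the counterpart of the allocator used above and that gtt_mem stays NULL on the sub-allocation path (free_mqd_sketch is a hypothetical name):

static void free_mqd_sketch(struct kfd_dev *kfd,
		struct kfd_mem_obj *mqd_mem_obj)
{
	if (mqd_mem_obj->gtt_mem) {
		/* kzalloc + amdgpu_amdkfd_alloc_gtt_mem() branch */
		amdgpu_amdkfd_free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem);
		kfree(mqd_mem_obj);
	} else {
		/* sub-allocated from the device GTT pool */
		kfd_gtt_sa_free(kfd, mqd_mem_obj);
	}
}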
Example #5
static int init_mqd(struct mqd_manager *mm, void **mqd,
		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
		struct queue_properties *q)
{
	uint64_t addr;
	struct cik_mqd *m;
	int retval;

	retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd),
					mqd_mem_obj);

	if (retval != 0)
		return -ENOMEM;

	m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr;
	addr = (*mqd_mem_obj)->gpu_addr;

	memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256));

	m->header = 0xC0310800;
	m->compute_pipelinestat_enable = 1;
	m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;

	/*
	 * Make sure to use the last queue state saved in the MQD when the CP
	 * reassigns the queue, so that when the queue is switched on/off
	 * (e.g. oversubscription or quantum timeout) the context stays
	 * consistent.
	 */
	m->cp_hqd_persistent_state =
				DEFAULT_CP_HQD_PERSISTENT_STATE | PRELOAD_REQ;

	m->cp_mqd_control             = MQD_CONTROL_PRIV_STATE_EN;
	m->cp_mqd_base_addr_lo        = lower_32_bits(addr);
	m->cp_mqd_base_addr_hi        = upper_32_bits(addr);

	m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS |
				QUANTUM_DURATION(10);

	/*
	 * Pipe Priority
	 * Identifies the pipe relative priority when this queue is connected
	 * to the pipeline. The pipe priority is against the GFX pipe and HP3D.
	 * In KFD we are using a fixed pipe priority set to CS_MEDIUM.
	 * 0 = CS_LOW (typically below GFX)
	 * 1 = CS_MEDIUM (typically between HP3D and GFX)
	 * 2 = CS_HIGH (typically above HP3D)
	 */
	m->cp_hqd_pipe_priority = 1;
	m->cp_hqd_queue_priority = 15;

	if (q->format == KFD_QUEUE_FORMAT_AQL)
		m->cp_hqd_iq_rptr = AQL_ENABLE;

	*mqd = m;
	if (gart_addr)
		*gart_addr = addr;
	retval = mm->update_mqd(mm, m, q);

	return retval;
}
Example #6
static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
					struct dbg_wave_control_info *wac_info)
{

	int status;
	union SQ_CMD_BITS reg_sq_cmd;
	union GRBM_GFX_INDEX_BITS reg_gfx_index;
	struct kfd_mem_obj *mem_obj;
	uint32_t *packet_buff_uint;
	struct pm4__set_config_reg *packets_vec;
	size_t ib_size = sizeof(struct pm4__set_config_reg) * 3;

	BUG_ON(!dbgdev || !wac_info);

	reg_sq_cmd.u32All = 0;

	status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd,
							&reg_gfx_index);
	if (status) {
		pr_err("amdkfd: Failed to set wave control registers\n");
		return status;
	}

	/* we do not control the VMID in DIQ, so reset it to a known value */
	reg_sq_cmd.bits.vm_id = 0;

	pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");

	pr_debug("\t\t mode      is: %u\n", wac_info->mode);
	pr_debug("\t\t operand   is: %u\n", wac_info->operand);
	pr_debug("\t\t trap id   is: %u\n", wac_info->trapId);
	pr_debug("\t\t msg value is: %u\n",
			wac_info->dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
	pr_debug("\t\t vmid      is: N/A\n");

	pr_debug("\t\t chk_vmid  is : %u\n", reg_sq_cmd.bitfields.check_vmid);
	pr_debug("\t\t command   is : %u\n", reg_sq_cmd.bitfields.cmd);
	pr_debug("\t\t queue id  is : %u\n", reg_sq_cmd.bitfields.queue_id);
	pr_debug("\t\t simd id   is : %u\n", reg_sq_cmd.bitfields.simd_id);
	pr_debug("\t\t mode      is : %u\n", reg_sq_cmd.bitfields.mode);
	pr_debug("\t\t vm_id     is : %u\n", reg_sq_cmd.bitfields.vm_id);
	pr_debug("\t\t wave_id   is : %u\n", reg_sq_cmd.bitfields.wave_id);

	pr_debug("\t\t ibw       is : %u\n",
			reg_gfx_index.bitfields.instance_broadcast_writes);
	pr_debug("\t\t ii        is : %u\n",
			reg_gfx_index.bitfields.instance_index);
	pr_debug("\t\t sebw      is : %u\n",
			reg_gfx_index.bitfields.se_broadcast_writes);
	pr_debug("\t\t se_ind    is : %u\n", reg_gfx_index.bitfields.se_index);
	pr_debug("\t\t sh_ind    is : %u\n", reg_gfx_index.bitfields.sh_index);
	pr_debug("\t\t sbw       is : %u\n",
			reg_gfx_index.bitfields.sh_broadcast_writes);

	pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");

	status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);

	if (status != 0) {
		pr_err("amdkfd: Failed to allocate GART memory\n");
		return status;
	}

	packet_buff_uint = mem_obj->cpu_ptr;

	memset(packet_buff_uint, 0, ib_size);

	packets_vec =  (struct pm4__set_config_reg *) packet_buff_uint;
	packets_vec[0].header.count = 1;
	packets_vec[0].header.opcode = IT_SET_UCONFIG_REG;
	packets_vec[0].header.type = PM4_TYPE_3;
	packets_vec[0].bitfields2.reg_offset =
			GRBM_GFX_INDEX / (sizeof(uint32_t)) -
				USERCONFIG_REG_BASE;

	packets_vec[0].bitfields2.insert_vmid = 0;
	packets_vec[0].reg_data[0] = reg_gfx_index.u32All;

	packets_vec[1].header.count = 1;
	packets_vec[1].header.opcode = IT_SET_CONFIG_REG;
	packets_vec[1].header.type = PM4_TYPE_3;
	packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) -
						AMD_CONFIG_REG_BASE;

	packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET;
	packets_vec[1].bitfields2.insert_vmid = 1;
	packets_vec[1].reg_data[0] = reg_sq_cmd.u32All;

	/* Restore the GRBM_GFX_INDEX register */

	reg_gfx_index.u32All = 0;
	reg_gfx_index.bits.sh_broadcast_writes = 1;
	reg_gfx_index.bits.instance_broadcast_writes = 1;
	reg_gfx_index.bits.se_broadcast_writes = 1;


	packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
	packets_vec[2].bitfields2.reg_offset =
				GRBM_GFX_INDEX / (sizeof(uint32_t)) -
					USERCONFIG_REG_BASE;

	packets_vec[2].bitfields2.insert_vmid = 0;
	packets_vec[2].reg_data[0] = reg_gfx_index.u32All;

	status = dbgdev_diq_submit_ib(
			dbgdev,
			wac_info->process->pasid,
			mem_obj->gpu_addr,
			packet_buff_uint,
			ib_size);

	if (status != 0)
		pr_err("amdkfd: Failed to submit IB to DIQ\n");

	kfd_gtt_sa_free(dbgdev->dev, mem_obj);

	return status;
}
Example #7
static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
				unsigned int pasid, uint64_t vmid0_address,
				uint32_t *packet_buff, size_t size_in_bytes)
{
	struct pm4__release_mem *rm_packet;
	struct pm4__indirect_buffer_pasid *ib_packet;
	struct kfd_mem_obj *mem_obj;
	size_t pq_packets_size_in_bytes;
	union ULARGE_INTEGER *largep;
	union ULARGE_INTEGER addr;
	struct kernel_queue *kq;
	uint64_t *rm_state;
	unsigned int *ib_packet_buff;
	int status;

	BUG_ON(!dbgdev || !dbgdev->kq || !packet_buff || !size_in_bytes);

	kq = dbgdev->kq;

	pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) +
				sizeof(struct pm4__indirect_buffer_pasid);

	/*
	 * We acquire a packet buffer from the DIQ.
	 * The received packet buffer sits in the indirect buffer, and in the
	 * PQ we put the IB packet plus the sync packet(s).
	 */
	status = kq->ops.acquire_packet_buffer(kq,
				pq_packets_size_in_bytes / sizeof(uint32_t),
				&ib_packet_buff);
	if (status != 0) {
		pr_err("amdkfd: acquire_packet_buffer failed\n");
		return status;
	}

	memset(ib_packet_buff, 0, pq_packets_size_in_bytes);

	ib_packet = (struct pm4__indirect_buffer_pasid *) (ib_packet_buff);

	ib_packet->header.count = 3;
	ib_packet->header.opcode = IT_INDIRECT_BUFFER_PASID;
	ib_packet->header.type = PM4_TYPE_3;

	largep = (union ULARGE_INTEGER *) &vmid0_address;

	ib_packet->bitfields2.ib_base_lo = largep->u.low_part >> 2;
	ib_packet->bitfields3.ib_base_hi = largep->u.high_part;

	ib_packet->control = (1 << 23) | (1 << 31) |
			((size_in_bytes / sizeof(uint32_t)) & 0xfffff);

	ib_packet->bitfields5.pasid = pasid;

	/*
	 * For now we use RELEASE_MEM for GPU-CPU synchronization
	 * (WAIT_REG_MEM + WRITE_DATA would be a better alternative).
	 * We get a GART allocation (GPU/CPU mapping) for the sync variable
	 * and wait until:
	 * (a) we are in sync with the HW, and
	 * (b) the sync variable has been written to memory by the CP.
	 */
	rm_packet = (struct pm4__release_mem *) (ib_packet_buff +
			(sizeof(struct pm4__indirect_buffer_pasid) /
					sizeof(unsigned int)));

	status = kfd_gtt_sa_allocate(dbgdev->dev, sizeof(uint64_t),
					&mem_obj);

	if (status != 0) {
		pr_err("amdkfd: Failed to allocate GART memory\n");
		kq->ops.rollback_packet(kq);
		return status;
	}

	rm_state = (uint64_t *) mem_obj->cpu_ptr;

	*rm_state = QUEUESTATE__ACTIVE_COMPLETION_PENDING;

	rm_packet->header.opcode = IT_RELEASE_MEM;
	rm_packet->header.type = PM4_TYPE_3;
	rm_packet->header.count = sizeof(struct pm4__release_mem) /
					sizeof(unsigned int) - 2;

	rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
	rm_packet->bitfields2.event_index =
				event_index___release_mem__end_of_pipe;

	rm_packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
	rm_packet->bitfields2.atc = 0;
	rm_packet->bitfields2.tc_wb_action_ena = 1;

	addr.quad_part = mem_obj->gpu_addr;

	rm_packet->bitfields4.address_lo_32b = addr.u.low_part >> 2;
	rm_packet->address_hi = addr.u.high_part;

	rm_packet->bitfields3.data_sel =
				data_sel___release_mem__send_64_bit_data;

	rm_packet->bitfields3.int_sel =
			int_sel___release_mem__send_data_after_write_confirm;

	rm_packet->bitfields3.dst_sel =
			dst_sel___release_mem__memory_controller;

	rm_packet->data_lo = QUEUESTATE__ACTIVE;

	kq->ops.submit_packet(kq);

	/* Wait till CP writes sync code: */
	status = amdkfd_fence_wait_timeout(
			(unsigned int *) rm_state,
			QUEUESTATE__ACTIVE, 1500);

	kfd_gtt_sa_free(dbgdev->dev, mem_obj);

	return status;
}
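
The RELEASE_MEM packet built above makes the CP write QUEUESTATE__ACTIVE to rm_state once the IB has been consumed, and amdkfd_fence_wait_timeout() waits for that write. A minimal polling sketch of that handshake (an illustration, not the in-tree implementation; fence_wait_sketch is a hypothetical name):

static int fence_wait_sketch(unsigned int *fence_addr,
				unsigned int fence_value,
				unsigned long timeout_ms)
{
	unsigned long end_jiffies = jiffies + msecs_to_jiffies(timeout_ms);

	/* Busy-wait until the CP writes the expected fence value,
	 * or give up once the timeout expires.
	 */
	while (READ_ONCE(*fence_addr) != fence_value) {
		if (time_after(jiffies, end_jiffies))
			return -ETIME;
		cpu_relax();
	}
	return 0;
}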
Example #8
static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev,
					struct dbg_address_watch_info *adw_info)
{
	struct pm4__set_config_reg *packets_vec;
	union TCP_WATCH_ADDR_H_BITS addrHi;
	union TCP_WATCH_ADDR_L_BITS addrLo;
	union TCP_WATCH_CNTL_BITS cntl;
	struct kfd_mem_obj *mem_obj;
	unsigned int aw_reg_add_dword;
	uint32_t *packet_buff_uint;
	unsigned int i;
	int status;
	size_t ib_size = sizeof(struct pm4__set_config_reg) * 4;
	/* we do not control the vmid in DIQ mode, just a placeholder */
	unsigned int vmid = 0;

	BUG_ON(!dbgdev || !dbgdev->dev || !adw_info);

	addrHi.u32All = 0;
	addrLo.u32All = 0;
	cntl.u32All = 0;

	if ((adw_info->num_watch_points > MAX_WATCH_ADDRESSES) ||
			(adw_info->num_watch_points == 0)) {
		pr_err("amdkfd: num_watch_points is invalid\n");
		return -EINVAL;
	}

	if (!adw_info->watch_mode || !adw_info->watch_address) {
		pr_err("amdkfd: adw_info fields are not valid\n");
		return -EINVAL;
	}

	status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);

	if (status != 0) {
		pr_err("amdkfd: Failed to allocate GART memory\n");
		return status;
	}

	packet_buff_uint = mem_obj->cpu_ptr;

	memset(packet_buff_uint, 0, ib_size);

	packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint);

	packets_vec[0].header.count = 1;
	packets_vec[0].header.opcode = IT_SET_CONFIG_REG;
	packets_vec[0].header.type = PM4_TYPE_3;
	packets_vec[0].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET;
	packets_vec[0].bitfields2.insert_vmid = 1;
	packets_vec[1].ordinal1 = packets_vec[0].ordinal1;
	packets_vec[1].bitfields2.insert_vmid = 0;
	packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
	packets_vec[2].bitfields2.insert_vmid = 0;
	packets_vec[3].ordinal1 = packets_vec[0].ordinal1;
	packets_vec[3].bitfields2.vmid_shift = ADDRESS_WATCH_CNTL_OFFSET;
	packets_vec[3].bitfields2.insert_vmid = 1;

	for (i = 0; i < adw_info->num_watch_points; i++) {
		dbgdev_address_watch_set_registers(adw_info,
						&addrHi,
						&addrLo,
						&cntl,
						i,
						vmid);

		pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
		pr_debug("\t\t%20s %08x\n", "register index :", i);
		pr_debug("\t\t%20s %08x\n", "vmid is :", vmid);
		pr_debug("\t\t%20s %p\n", "Add ptr is :",
				adw_info->watch_address);
		pr_debug("\t\t%20s %08llx\n", "Add     is :",
				adw_info->watch_address[i]);
		pr_debug("\t\t%20s %08x\n", "Address Low is :",
				addrLo.bitfields.addr);
		pr_debug("\t\t%20s %08x\n", "Address high is :",
				addrHi.bitfields.addr);
		pr_debug("\t\t%20s %08x\n", "Control Mask is :",
				cntl.bitfields.mask);
		pr_debug("\t\t%20s %08x\n", "Control Mode is :",
				cntl.bitfields.mode);
		pr_debug("\t\t%20s %08x\n", "Control Vmid is :",
				cntl.bitfields.vmid);
		pr_debug("\t\t%20s %08x\n", "Control atc  is :",
				cntl.bitfields.atc);
		pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");

		aw_reg_add_dword =
				dbgdev->dev->kfd2kgd->address_watch_get_offset(
					dbgdev->dev->kgd,
					i,
					ADDRESS_WATCH_REG_CNTL);

		aw_reg_add_dword /= sizeof(uint32_t);

		packets_vec[0].bitfields2.reg_offset =
					aw_reg_add_dword - AMD_CONFIG_REG_BASE;

		packets_vec[0].reg_data[0] = cntl.u32All;

		aw_reg_add_dword =
				dbgdev->dev->kfd2kgd->address_watch_get_offset(
					dbgdev->dev->kgd,
					i,
					ADDRESS_WATCH_REG_ADDR_HI);

		aw_reg_add_dword /= sizeof(uint32_t);

		packets_vec[1].bitfields2.reg_offset =
					aw_reg_add_dword - AMD_CONFIG_REG_BASE;
		packets_vec[1].reg_data[0] = addrHi.u32All;

		aw_reg_add_dword =
				dbgdev->dev->kfd2kgd->address_watch_get_offset(
					dbgdev->dev->kgd,
					i,
					ADDRESS_WATCH_REG_ADDR_LO);

		aw_reg_add_dword /= sizeof(uint32_t);

		packets_vec[2].bitfields2.reg_offset =
				aw_reg_add_dword - AMD_CONFIG_REG_BASE;
		packets_vec[2].reg_data[0] = addrLo.u32All;

		/* enable the watch flag if the address is not zero */
		if (adw_info->watch_address[i] > 0)
			cntl.bitfields.valid = 1;
		else
			cntl.bitfields.valid = 0;

		aw_reg_add_dword =
				dbgdev->dev->kfd2kgd->address_watch_get_offset(
					dbgdev->dev->kgd,
					i,
					ADDRESS_WATCH_REG_CNTL);

		aw_reg_add_dword /= sizeof(uint32_t);

		packets_vec[3].bitfields2.reg_offset =
					aw_reg_add_dword - AMD_CONFIG_REG_BASE;
		packets_vec[3].reg_data[0] = cntl.u32All;

		status = dbgdev_diq_submit_ib(
					dbgdev,
					adw_info->process->pasid,
					mem_obj->gpu_addr,
					packet_buff_uint,
					ib_size);

		if (status != 0) {
			pr_err("amdkfd: Failed to submit IB to DIQ\n");
			break;
		}
	}

	kfd_gtt_sa_free(dbgdev->dev, mem_obj);
	return status;
}
Example #9
static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
		enum kfd_queue_type type, unsigned int queue_size)
{
	struct queue_properties prop;
	int retval;
	union PM4_MES_TYPE_3_HEADER nop;

	BUG_ON(!kq || !dev);
	BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ);

	pr_debug("amdkfd: In func %s initializing queue type %d size %d\n",
			__func__, KFD_QUEUE_TYPE_HIQ, queue_size);

	memset(&prop, 0, sizeof(prop));
	memset(&nop, 0, sizeof(nop));

	nop.opcode = IT_NOP;
	nop.type = PM4_TYPE_3;
	nop.u32all |= PM4_COUNT_ZERO;

	kq->dev = dev;
	kq->nop_packet = nop.u32all;
	switch (type) {
	case KFD_QUEUE_TYPE_DIQ:
	case KFD_QUEUE_TYPE_HIQ:
		kq->mqd = dev->dqm->ops.get_mqd_manager(dev->dqm,
						KFD_MQD_TYPE_HIQ);
		break;
	default:
		BUG();
		break;
	}

	if (kq->mqd == NULL)
		return false;

	prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off);

	if (prop.doorbell_ptr == NULL) {
		pr_err("amdkfd: error init doorbell");
		goto err_get_kernel_doorbell;
	}

	retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq);
	if (retval != 0) {
		pr_err("amdkfd: error init pq queues size (%d)\n", queue_size);
		goto err_pq_allocate_vidmem;
	}

	kq->pq_kernel_addr = kq->pq->cpu_ptr;
	kq->pq_gpu_addr = kq->pq->gpu_addr;

	if (!kq->ops_asic_specific.initialize(kq, dev, type, queue_size))
		goto err_eop_allocate_vidmem;

	retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel),
					&kq->rptr_mem);

	if (retval != 0)
		goto err_rptr_allocate_vidmem;

	kq->rptr_kernel = kq->rptr_mem->cpu_ptr;
	kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr;

	retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel),
					&kq->wptr_mem);

	if (retval != 0)
		goto err_wptr_allocate_vidmem;

	kq->wptr_kernel = kq->wptr_mem->cpu_ptr;
	kq->wptr_gpu_addr = kq->wptr_mem->gpu_addr;

	memset(kq->pq_kernel_addr, 0, queue_size);
	memset(kq->rptr_kernel, 0, sizeof(*kq->rptr_kernel));
	memset(kq->wptr_kernel, 0, sizeof(*kq->wptr_kernel));

	prop.queue_size = queue_size;
	prop.is_interop = false;
	prop.priority = 1;
	prop.queue_percent = 100;
	prop.type = type;
	prop.vmid = 0;
	prop.queue_address = kq->pq_gpu_addr;
	prop.read_ptr = (uint32_t *) kq->rptr_gpu_addr;
	prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr;
	prop.eop_ring_buffer_address = kq->eop_gpu_addr;
	prop.eop_ring_buffer_size = PAGE_SIZE;

	if (init_queue(&kq->queue, &prop) != 0)
		goto err_init_queue;

	kq->queue->device = dev;
	kq->queue->process = kfd_get_process(current);

	retval = kq->mqd->init_mqd(kq->mqd, &kq->queue->mqd,
					&kq->queue->mqd_mem_obj,
					&kq->queue->gart_mqd_addr,
					&kq->queue->properties);
	if (retval != 0)
		goto err_init_mqd;

	/* assign HIQ to HQD */
	if (type == KFD_QUEUE_TYPE_HIQ) {
		pr_debug("assigning hiq to hqd\n");
		kq->queue->pipe = KFD_CIK_HIQ_PIPE;
		kq->queue->queue = KFD_CIK_HIQ_QUEUE;
		kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe,
					kq->queue->queue, NULL, NULL);
	} else {
		/* allocate fence for DIQ */

		retval = kfd_gtt_sa_allocate(dev, sizeof(uint32_t),
						&kq->fence_mem_obj);

		if (retval != 0)
			goto err_alloc_fence;

		kq->fence_kernel_address = kq->fence_mem_obj->cpu_ptr;
		kq->fence_gpu_addr = kq->fence_mem_obj->gpu_addr;
	}

	print_queue(kq->queue);

	return true;
err_alloc_fence:
err_init_mqd:
	uninit_queue(kq->queue);
err_init_queue:
	kfd_gtt_sa_free(dev, kq->wptr_mem);
err_wptr_allocate_vidmem:
	kfd_gtt_sa_free(dev, kq->rptr_mem);
err_rptr_allocate_vidmem:
	kfd_gtt_sa_free(dev, kq->eop_mem);
err_eop_allocate_vidmem:
	kfd_gtt_sa_free(dev, kq->pq);
err_pq_allocate_vidmem:
	kfd_release_kernel_doorbell(dev, prop.doorbell_ptr);
err_get_kernel_doorbell:
	return false;

}
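
The error labels above already spell out the release order; a full teardown would mirror it, freeing in reverse order of allocation. A rough sketch using only helpers visible in this example (uninitialize_sketch is a hypothetical name; it assumes init_queue() copied prop into kq->queue->properties, and destroying a loaded HIQ MQD is omitted):

static void uninitialize_sketch(struct kernel_queue *kq)
{
	/* Sketch only: release the DIQ fence, read/write pointers, EOP
	 * buffer, packet queue and kernel doorbell, then drop the queue.
	 */
	if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
		kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj);

	kfd_gtt_sa_free(kq->dev, kq->rptr_mem);
	kfd_gtt_sa_free(kq->dev, kq->wptr_mem);
	kfd_gtt_sa_free(kq->dev, kq->eop_mem);
	kfd_gtt_sa_free(kq->dev, kq->pq);
	kfd_release_kernel_doorbell(kq->dev, kq->queue->properties.doorbell_ptr);
	uninit_queue(kq->queue);
}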
Example #10
static int init_mqd(struct mqd_manager *mm, void **mqd,
			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
			struct queue_properties *q)
{
	int retval;
	uint64_t addr;
	struct v9_mqd *m;
	struct kfd_dev *kfd = mm->dev;

	/* From V9 on, for CWSR, the control stack is located on the next page
	 * boundary after the MQD, so we use the GTT allocation function
	 * instead of the sub-allocation function.
	 */
	if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
		*mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
		if (!*mqd_mem_obj)
			return -ENOMEM;
		retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd,
			ALIGN(q->ctl_stack_size, PAGE_SIZE) +
				ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
			&((*mqd_mem_obj)->gtt_mem),
			&((*mqd_mem_obj)->gpu_addr),
			(void *)&((*mqd_mem_obj)->cpu_ptr), true);
	} else {
		retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
				mqd_mem_obj);
	}
	if (retval != 0)
		return -ENOMEM;

	m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr;
	addr = (*mqd_mem_obj)->gpu_addr;

	memset(m, 0, sizeof(struct v9_mqd));

	m->header = 0xC0310800;
	m->compute_pipelinestat_enable = 1;
	m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;

	m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
			0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;

	m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;

	m->cp_mqd_base_addr_lo        = lower_32_bits(addr);
	m->cp_mqd_base_addr_hi        = upper_32_bits(addr);

	m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
			10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;

	m->cp_hqd_pipe_priority = 1;
	m->cp_hqd_queue_priority = 15;

	if (q->format == KFD_QUEUE_FORMAT_AQL) {
		m->cp_hqd_aql_control =
			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
	}

	if (q->tba_addr) {
		m->compute_pgm_rsrc2 |=
			(1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
	}

	if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) {
		m->cp_hqd_persistent_state |=
			(1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
		m->cp_hqd_ctx_save_base_addr_lo =
			lower_32_bits(q->ctx_save_restore_area_address);
		m->cp_hqd_ctx_save_base_addr_hi =
			upper_32_bits(q->ctx_save_restore_area_address);
		m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
		m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
		m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
		m->cp_hqd_wg_state_offset = q->ctl_stack_size;
	}

	*mqd = m;
	if (gart_addr)
		*gart_addr = addr;
	retval = mm->update_mqd(mm, m, q);

	return retval;
}
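
Given the layout described in the comment above (in the CWSR case the control stack sits on the next page boundary after the MQD), the control stack's CPU address can be derived from the same kfd_mem_obj; a sketch under that assumption (ctl_stack_from_mqd_sketch is a hypothetical helper):

static void *ctl_stack_from_mqd_sketch(struct kfd_mem_obj *mqd_mem_obj)
{
	/* Assumption taken from the comment above: the v9 MQD sits at the
	 * start of the combined allocation and the control stack begins on
	 * the first page boundary after it.
	 */
	return (char *)mqd_mem_obj->cpu_ptr +
			ALIGN(sizeof(struct v9_mqd), PAGE_SIZE);
}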