Exemple #1
0
/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr)
			| S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_USER_SHADER));
}
Exemple #2
0
static void si_emit_cp_dma_copy_buffer(struct radv_cmd_buffer *cmd_buffer,
				       uint64_t dst_va, uint64_t src_va,
				       unsigned size, unsigned flags)
{
	struct radeon_winsys_cs *cs = cmd_buffer->cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
			   S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
			   S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);

	if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag | sel);	/* CP_SYNC [31] */
		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
Exemple #3
0
/**
 * Append a CONTEXT_CONTROL packet to \p ib.
 *
 * The first payload dword sets LOAD_ENABLE and LOAD_CE_RAM, the second one
 * sets SHADOW_ENABLE.
 *
 * \param ib	command stream that receives the packet
 */
void si_ce_enable_loads(struct radeon_winsys_cs *ib)
{
	const uint32_t load_control = CONTEXT_CONTROL_LOAD_ENABLE(1) |
	                              CONTEXT_CONTROL_LOAD_CE_RAM(1);
	const uint32_t shadow_control = CONTEXT_CONTROL_SHADOW_ENABLE(1);

	radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
	radeon_emit(ib, load_control);
	radeon_emit(ib, shadow_control);
}
Exemple #4
0
/**
 * Reload a descriptor list into CE RAM from its backing buffer.
 *
 * If the descriptor set has a buffer, a LOAD_CONST_RAM packet is emitted
 * (to the CE preamble IB when one exists, otherwise to the CE IB) and the
 * buffer is added to the gfx buffer list for reading. The ce_ram_dirty flag
 * is cleared whether or not a buffer was present.
 *
 * \param sctx	context owning the CE command streams
 * \param desc	descriptor set to reload
 */
static void si_reinitialize_ce_ram(struct si_context *sctx,
                            struct si_descriptors *desc)
{
	struct r600_resource *backing = (struct r600_resource*)desc->buffer;

	if (backing) {
		/* Prefer the CE preamble IB; fall back to the regular CE IB. */
		struct radeon_winsys_cs *ce_cs = sctx->ce_preamble_ib ?
						 sctx->ce_preamble_ib : sctx->ce_ib;
		uint64_t src_va = backing->gpu_address + desc->buffer_offset;
		/* Size of the list in bytes, rounded up to a multiple of 32. */
		unsigned bytes = align(desc->num_elements *
				       desc->element_dw_size * 4, 32);

		radeon_emit(ce_cs, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
		radeon_emit(ce_cs, src_va);
		radeon_emit(ce_cs, src_va >> 32);
		radeon_emit(ce_cs, bytes / 4);
		radeon_emit(ce_cs, desc->ce_offset);

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
					  RADEON_USAGE_READ,
					  RADEON_PRIO_DESCRIPTORS);
	}

	desc->ce_ram_dirty = false;
}
Exemple #5
0
static bool si_upload_descriptors(struct si_context *sctx,
				  struct si_descriptors *desc,
				  struct r600_atom * atom)
{
	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;

	if (!desc->dirty_mask)
		return true;

	if (sctx->ce_ib) {
		uint32_t const* list = (uint32_t const*)desc->list;

		if (desc->ce_ram_dirty)
			si_reinitialize_ce_ram(sctx, desc);

		while(desc->dirty_mask) {
			int begin, count;
			u_bit_scan_consecutive_range(&desc->dirty_mask, &begin,
						     &count);

			begin *= desc->element_dw_size;
			count *= desc->element_dw_size;

			radeon_emit(sctx->ce_ib,
			            PKT3(PKT3_WRITE_CONST_RAM, count, 0));
			radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
			radeon_emit_array(sctx->ce_ib, list + begin, count);
		}

		if (!si_ce_upload(sctx, desc->ce_offset, list_size,
		                           &desc->buffer_offset, &desc->buffer))
			return false;
	} else {
		void *ptr;

		u_upload_alloc(sctx->b.uploader, 0, list_size, 256,
			&desc->buffer_offset,
			(struct pipe_resource**)&desc->buffer, &ptr);
		if (!desc->buffer)
			return false; /* skip the draw call */

		util_memcpy_cpu_to_le32(ptr, desc->list, list_size);

		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
	                            RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
	}
	desc->pointer_dirty = true;
	desc->dirty_mask = 0;

	if (atom)
		si_mark_atom_dirty(sctx, atom);

	return true;
}
Exemple #6
0
/**
 * Emit the MSAA configuration registers (PA_SC_LINE_CNTL, PA_SC_AA_CONFIG,
 * DB_EQAA and PA_SC_MODE_CNTL_1).
 *
 * The effective sample count is nr_samples when it is greater than 1,
 * otherwise overrast_samples when that is greater than 1, otherwise 0
 * (multisampling disabled).
 *
 * \param cs			command stream to emit into
 * \param nr_samples		number of samples
 * \param ps_iter_samples	number of per-sample shading iterations
 * \param overrast_samples	number of overrasterization samples
 */
void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
			     int ps_iter_samples, int overrast_samples)
{
	int setup_samples = 0;

	if (nr_samples > 1)
		setup_samples = nr_samples;
	else if (overrast_samples > 1)
		setup_samples = overrast_samples;

	if (setup_samples <= 1) {
		/* Multisampling disabled. */
		radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
		/* CM_R_028BDC_PA_SC_LINE_CNTL */
		radeon_emit(cs, S_028BDC_LAST_PIXEL(1));
		/* CM_R_028BE0_PA_SC_AA_CONFIG */
		radeon_emit(cs, 0);

		radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
	} else {
		/* indexed by log2(nr_samples) */
		unsigned max_dist[] = {
			0,
			eg_max_dist_2x,
			eg_max_dist_4x,
			cm_max_dist_8x,
			cm_max_dist_16x
		};
		unsigned log_samples = util_logbase2(setup_samples);
		unsigned log_ps_iter_samples =
			util_logbase2(util_next_power_of_two(ps_iter_samples));

		radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
		/* CM_R_028BDC_PA_SC_LINE_CNTL */
		radeon_emit(cs, S_028BDC_LAST_PIXEL(1) |
				S_028BDC_EXPAND_LINE_WIDTH(1));
		/* CM_R_028BE0_PA_SC_AA_CONFIG */
		radeon_emit(cs, S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
				S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
				S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples));

		if (nr_samples > 1) {
			radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
					       S_028804_MAX_ANCHOR_SAMPLES(log_samples) |
					       S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
					       S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
					       S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples) |
					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
			radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
					       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
		} else if (overrast_samples > 1) {
			radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
					       S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
			radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
		}
	}
}
Exemple #7
0
/**
 * Enable or disable streamout through VGT_STRMOUT_CONFIG.
 *
 * A zero mask writes only VGT_STRMOUT_CONFIG with STREAMOUT_0_EN = 0.
 * A non-zero mask writes VGT_STRMOUT_CONFIG with STREAMOUT_0_EN = 1 followed
 * by VGT_STRMOUT_BUFFER_CONFIG with the mask as STREAM_0_BUFFER_EN.
 *
 * \param rctx			context; packets go to rctx->rings.gfx.cs
 * \param buffer_enable_bit	buffer enable mask for stream 0
 */
static void evergreen_set_streamout_enable(struct r600_common_context *rctx, unsigned buffer_enable_bit)
{
	struct radeon_winsys_cs *gfx_cs = rctx->rings.gfx.cs;

	if (!buffer_enable_bit) {
		r600_write_context_reg(gfx_cs, R_028B94_VGT_STRMOUT_CONFIG,
				       S_028B94_STREAMOUT_0_EN(0));
		return;
	}

	r600_write_context_reg_seq(gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
	/* R_028B94_VGT_STRMOUT_CONFIG */
	radeon_emit(gfx_cs, S_028B94_STREAMOUT_0_EN(1));
	/* R_028B98_VGT_STRMOUT_BUFFER_CONFIG */
	radeon_emit(gfx_cs, S_028B98_STREAM_0_BUFFER_EN(buffer_enable_bit));
}
Exemple #8
0
/**
 * Flush VGT streamout and wait for the offset update to complete.
 *
 * Clears CP_STRMOUT_CNTL, emits an SO_VGTSTREAMOUT_FLUSH event and then a
 * WAIT_REG_MEM packet that polls the same register until the
 * OFFSET_UPDATE_DONE bit is set.
 *
 * \param rctx	context; packets go to rctx->rings.gfx.cs
 */
static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
{
    struct radeon_winsys_cs *gfx_cs = rctx->rings.gfx.cs;
    unsigned strmout_cntl_reg;

    /* The register lives at a different offset on different ASICs, and
     * CIK and later write it through the uconfig path. */
    if (rctx->chip_class >= CIK) {
        strmout_cntl_reg = R_0300FC_CP_STRMOUT_CNTL;
        cik_write_uconfig_reg(gfx_cs, strmout_cntl_reg, 0);
    } else {
        strmout_cntl_reg = rctx->chip_class >= EVERGREEN ?
                           R_0084FC_CP_STRMOUT_CNTL :
                           R_008490_CP_STRMOUT_CNTL;
        r600_write_config_reg(gfx_cs, strmout_cntl_reg, 0);
    }

    radeon_emit(gfx_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
    radeon_emit(gfx_cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

    radeon_emit(gfx_cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
    /* wait until the register is equal to the reference value */
    radeon_emit(gfx_cs, WAIT_REG_MEM_EQUAL);
    /* register */
    radeon_emit(gfx_cs, strmout_cntl_reg >> 2);
    radeon_emit(gfx_cs, 0);
    /* reference value */
    radeon_emit(gfx_cs, S_008490_OFFSET_UPDATE_DONE(1));
    /* mask */
    radeon_emit(gfx_cs, S_008490_OFFSET_UPDATE_DONE(1));
    /* poll interval */
    radeon_emit(gfx_cs, 4);
}
Exemple #9
0
static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom)
{
	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
	struct r600_so_target **t = rctx->streamout.targets;
	unsigned *stride_in_dw = rctx->streamout.stride_in_dw;
	unsigned i, update_flags = 0;

	r600_flush_vgt_streamout(rctx);

	if (rctx->chip_class >= EVERGREEN) {
		evergreen_set_streamout_enable(rctx, rctx->streamout.enabled_mask);
	} else {
		r600_set_streamout_enable(rctx, rctx->streamout.enabled_mask);
	}

	for (i = 0; i < rctx->streamout.num_targets; i++) {
		if (!t[i])
			continue;

		t[i]->stride_in_dw = stride_in_dw[i];

		if (rctx->chip_class >= SI) {
			/* SI binds streamout buffers as shader resources.
			 * VGT only counts primitives and tells the shader
			 * through SGPRs what to do. */
			r600_write_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
			radeon_emit(cs, (t[i]->b.buffer_offset +
					 t[i]->b.buffer_size) >> 2);	/* BUFFER_SIZE (in DW) */
			radeon_emit(cs, stride_in_dw[i]);		/* VTX_STRIDE (in DW) */
		} else {
Exemple #10
0
/**
 * Emit viewport scissor rectangles.
 *
 * Writes count pairs of (top-left, bottom-right) registers starting at
 * PA_SC_VPORT_SCISSOR_0_TL + first * 8. Nothing is emitted when count is 0.
 *
 * \param cs		command stream to emit into
 * \param first		index of the first scissor slot to write
 * \param count		number of rectangles in \p scissors
 * \param scissors	array of at least \p count rectangles
 */
void
si_write_scissors(struct radeon_winsys_cs *cs, int first,
                  int count, const VkRect2D *scissors)
{
	int idx;

	if (count == 0)
		return;

	radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + first * 4 * 2, count * 2);

	for (idx = 0; idx < count; idx++) {
		const VkRect2D *rect = &scissors[idx];

		/* Top-left corner, with the window offset disabled. */
		radeon_emit(cs, S_028250_TL_X(rect->offset.x) |
				S_028250_TL_Y(rect->offset.y) |
				S_028250_WINDOW_OFFSET_DISABLE(1));
		/* Bottom-right corner = offset + extent. */
		radeon_emit(cs, S_028254_BR_X(rect->offset.x + rect->extent.width) |
				S_028254_BR_Y(rect->offset.y + rect->extent.height));
	}
}
Exemple #11
0
/* Emit a CP DMA packet to do a copy from one buffer to another.
 * The size must fit in bits [20:0].
 */
static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
				       uint64_t dst_va, uint64_t src_va,
				       unsigned size, unsigned flags)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? PKT3_CP_DMA_CP_SYNC : 0;
	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? PKT3_CP_DMA_CMD_RAW_WAIT : 0;

	assert(size);
	assert((size & ((1<<21)-1)) == size);

	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, sync_flag);		/* CP_SYNC [31] */
		radeon_emit(cs, src_va);		/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
	} else {
/**
 * Append a NOP dword (0xf0000000) to the DMA command stream on Evergreen
 * and later chips. Nothing is emitted on older chips.
 *
 * \param rctx	context; the dword goes to rctx->dma.cs
 */
static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
{
	/* TODO: R600-R700 should use the FENCE packet.
	 * CS checker support is required. */
	if (rctx->chip_class < EVERGREEN)
		return;

	radeon_emit(rctx->dma.cs, 0xf0000000); /* NOP */
}
Exemple #13
0
static void si_initialize_compute(struct si_context *sctx)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	uint64_t bc_va;

	radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
	/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
	radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
	radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) | S_00B85C_SH1_CU_EN(0xffff));

	if (sctx->b.chip_class >= CIK) {
		/* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
		radeon_set_sh_reg_seq(cs,
		                     R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, 2);
		radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) |
		                S_00B864_SH1_CU_EN(0xffff));
		radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) |
		                S_00B868_SH1_CU_EN(0xffff));
	}

	/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
	 * and is now per pipe, so it should be handled in the
	 * kernel if we want to use something other than the default value,
	 * which is now 0x22f.
	 */
	if (sctx->b.chip_class <= SI) {
		/* XXX: This should be:
		 * (number of compute units) * 4 * (waves per simd) - 1 */

		radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
		                  0x190 /* Default value */);
	}

	/* Set the pointer to border colors. */
	bc_va = sctx->border_color_buffer->gpu_address;

	if (sctx->b.chip_class >= CIK) {
		radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
		radeon_emit(cs, bc_va >> 8);  /* R_030E00_TA_CS_BC_BASE_ADDR */
		radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
	} else {
Exemple #14
0
/**
 * Copy a buffer range with DMA COPY packets.
 *
 * The byte size is converted to dwords and split into packets of at most
 * R600_DMA_COPY_MAX_SIZE_DW dwords each. A wait-idle is emitted after the
 * last packet.
 *
 * \param rctx		context; packets go to rctx->b.dma.cs
 * \param dst		destination resource
 * \param src		source resource
 * \param dst_offset	destination offset, written into the packet as the address
 * \param src_offset	source offset, written into the packet as the address
 * \param size		number of bytes to copy
 */
void r600_dma_copy_buffer(struct r600_context *rctx,
			  struct pipe_resource *dst,
			  struct pipe_resource *src,
			  uint64_t dst_offset,
			  uint64_t src_offset,
			  uint64_t size)
{
	struct radeon_winsys_cs *dma_cs = rctx->b.dma.cs;
	struct r600_resource *dst_res = (struct r600_resource*)dst;
	struct r600_resource *src_res = (struct r600_resource*)src;
	unsigned pkt, num_pkts, dwords;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&dst_res->valid_buffer_range, dst_offset,
		       dst_offset + size);

	/* From here on, size is counted in dwords. */
	size >>= 2;

	/* Number of packets: full chunks plus one for any remainder. */
	num_pkts = size / R600_DMA_COPY_MAX_SIZE_DW;
	if (size % R600_DMA_COPY_MAX_SIZE_DW)
		num_pkts++;

	/* Each copy packet is 5 dwords long. */
	r600_need_dma_space(&rctx->b, num_pkts * 5, dst_res, src_res);

	for (pkt = 0; pkt < num_pkts; pkt++) {
		dwords = R600_DMA_COPY_MAX_SIZE_DW;
		if (size < R600_DMA_COPY_MAX_SIZE_DW)
			dwords = size;

		/* emit reloc before writing cs so that cs is always in consistent state */
		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, src_res,
					  RADEON_USAGE_READ,
					  RADEON_PRIO_SDMA_BUFFER);
		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, dst_res,
					  RADEON_USAGE_WRITE,
					  RADEON_PRIO_SDMA_BUFFER);

		radeon_emit(dma_cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, dwords));
		radeon_emit(dma_cs, dst_offset & 0xfffffffc);
		radeon_emit(dma_cs, src_offset & 0xfffffffc);
		radeon_emit(dma_cs, (dst_offset >> 32UL) & 0xff);
		radeon_emit(dma_cs, (src_offset >> 32UL) & 0xff);

		/* Advance by the number of bytes just queued. */
		dst_offset += dwords << 2;
		src_offset += dwords << 2;
		size -= dwords;
	}

	r600_dma_emit_wait_idle(&rctx->b);
}
/**
 * Write an EOP event.
 *
 * Emits one EVENT_WRITE_EOP packet to the gfx command stream and, when a
 * buffer is supplied, a write relocation for it.
 *
 * \param ctx		Context; the packet is appended to ctx->gfx.cs
 * \param event		EVENT_TYPE_*
 * \param event_flags	Optional cache flush flags (TC)
 * \param data_sel	1 = fence, 3 = timestamp
 * \param buf		Buffer; may be NULL, in which case no relocation is emitted
 * \param va		GPU address
 * \param new_fence	Fence value to write for this event (immediate data)
 * \param query_type	Not read by this function
 */
void r600_gfx_write_event_eop(struct r600_common_context *ctx,
			      unsigned event, unsigned event_flags,
			      unsigned data_sel,
			      struct r600_resource *buf, uint64_t va,
			      uint32_t new_fence, unsigned query_type)
{
	struct radeon_winsys_cs *gfx_cs = ctx->gfx.cs;

	radeon_emit(gfx_cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
	radeon_emit(gfx_cs, EVENT_TYPE(event) | EVENT_INDEX(5) | event_flags);
	/* Low address dword, then address bits [47:32] combined with DATA_SEL. */
	radeon_emit(gfx_cs, va);
	radeon_emit(gfx_cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel));
	/* immediate data */
	radeon_emit(gfx_cs, new_fence);
	/* unused */
	radeon_emit(gfx_cs, 0);

	if (!buf)
		return;

	r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_WRITE,
			RADEON_PRIO_QUERY);
}
Exemple #16
0
/**
 * Dump a range of CE RAM into a suballocated buffer.
 *
 * Allocates \p size bytes from the CE suballocator, emits a DUMP_CONST_RAM
 * packet to the CE IB that targets the allocation, adds the buffer to the
 * gfx buffer list and flags that CE synchronization is needed.
 *
 * \param sctx		context owning the CE IB and the suballocator
 * \param ce_offset	offset in CE RAM to dump from
 * \param size		number of bytes to dump
 * \param out_offset	receives the offset of the allocation inside *out_buf
 * \param out_buf	receives the buffer the allocation lives in
 * \return false if no buffer was obtained, true otherwise
 */
static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
			 unsigned *out_offset, struct r600_resource **out_buf) {
	uint64_t va;

	u_suballocator_alloc(sctx->ce_suballocator, size, out_offset,
			     (struct pipe_resource**)out_buf);
	/* Check the buffer that was handed back, not the out-parameter
	 * itself: out_buf always points at the caller's variable, so testing
	 * it could never detect a failed allocation and the dereference
	 * below would hit a NULL buffer.
	 * NOTE(review): relies on u_suballocator_alloc leaving *out_buf NULL
	 * on failure - confirm against the suballocator implementation. */
	if (!*out_buf)
		return false;

	va = (*out_buf)->gpu_address + *out_offset;

	radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
	radeon_emit(sctx->ce_ib, ce_offset);
	radeon_emit(sctx->ce_ib, size / 4);	/* size in dwords */
	radeon_emit(sctx->ce_ib, va);
	radeon_emit(sctx->ce_ib, va >> 32);

	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, *out_buf,
	                       RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);

	sctx->ce_need_synchronization = true;
	return true;
}
/**
 * Emit function for r600_cs_shader_state atom
 *
 * Writes the three registers starting at R_0288D0_SQ_PGM_START_LS for the
 * bound compute shader and then a NOP packet carrying the relocation of the
 * code buffer.
 *
 * \param rctx	context; packets are appended to rctx->b.rings.gfx.cs
 * \param atom	atom being emitted, cast to r600_cs_shader_state
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	/* When HAVE_LLVM < 0x0306 the code buffer and bytecode info are taken
	 * from the kernel selected by state->kernel_index; otherwise they are
	 * taken from the shader itself and state->pc is added to the address. */
#if HAVE_LLVM < 0x0306
	struct r600_kernel *kernel = &shader->kernels[state->kernel_index];
	code_bo = kernel->code_bo;
	va = kernel->code_bo->gpu_address;
	ngpr = kernel->bc.ngpr;
	nstack = kernel->bc.nstack;
#else
	code_bo = shader->code_bo;
	va = shader->code_bo->gpu_address + state->pc;
	ngpr = shader->bc.ngpr;
	nstack = shader->bc.nstack;
#endif

	r600_write_compute_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	/* The start address is written shifted right by 8. */
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr)
			| S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	/* NOP packet whose payload is the relocation of the code buffer. */
	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
					      code_bo, RADEON_USAGE_READ,
					      RADEON_PRIO_SHADER_DATA));
}
Exemple #18
0
/* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
 * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, src_va is a 32-bit
 * clear value.
 */
static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,
			   uint64_t src_va, unsigned size, unsigned flags,
			   enum r600_coherency coher)
{
	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
	uint32_t header = 0, command = S_414_BYTE_COUNT(size);

	assert(size);
	assert(size <= CP_DMA_MAX_BYTE_COUNT);

	/* Sync flags. */
	if (flags & CP_DMA_SYNC)
		header |= S_411_CP_SYNC(1);
	else
		command |= S_414_DISABLE_WR_CONFIRM(1);

	if (flags & CP_DMA_RAW_WAIT)
		command |= S_414_RAW_WAIT(1);

	/* Src and dst flags. */
	if (flags & CP_DMA_USE_L2)
		header |= S_411_DSL_SEL(V_411_DST_ADDR_TC_L2);

	if (flags & CP_DMA_CLEAR)
		header |= S_411_SRC_SEL(V_411_DATA);
	else if (flags & CP_DMA_USE_L2)
		header |= S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2);

	if (sctx->b.chip_class >= CIK) {
		radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
		radeon_emit(cs, header);
		radeon_emit(cs, src_va);	/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, src_va >> 32);	/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);	/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);	/* DST_ADDR_HI [31:0] */
		radeon_emit(cs, command);
	} else {
Exemple #19
0
/* Queue a wait-for-idle on the DMA ring, or flush the ring when its memory
 * usage is already high.
 * This is required to prevent read-after-write hazards. */
void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
{
	struct radeon_winsys_cs *dma_cs = rctx->dma.cs;

	/* done at the end of DMA calls, so increment this. */
	rctx->num_dma_calls++;

	/* IBs using too little memory are limited by the IB submission overhead.
	 * IBs using too much memory are limited by the kernel/TTM overhead.
	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
	 *
	 * This heuristic makes sure that DMA requests are executed
	 * very soon after the call is made and lowers memory usage.
	 * It improves texture upload performance by keeping the DMA
	 * engine busy while uploads are being submitted.
	 */
	if (rctx->ws->cs_query_memory_usage(dma_cs) > 64 * 1024 * 1024) {
		rctx->dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
		return;
	}

	r600_need_dma_space(rctx, 1, NULL, NULL);

	/* Only a non-empty queue gets a NOP. */
	if (radeon_emitted(dma_cs, 0)) {
		/* NOP waits for idle on Evergreen and later. */
		if (rctx->chip_class >= CIK) {
			radeon_emit(dma_cs, 0x00000000); /* NOP */
		} else if (rctx->chip_class >= EVERGREEN) {
			radeon_emit(dma_cs, 0xf0000000); /* NOP */
		} else {
			/* TODO: R600-R700 should use the FENCE packet.
			 * CS checker support is required. */
		}
	}
}
Exemple #20
0
/**
 * Emit callback for the r600_cs_shader_state atom.
 *
 * Programs the three registers starting at R_0288D0_SQ_PGM_START_LS from
 * the kernel selected by the state's kernel_index, then emits a NOP packet
 * carrying the relocation of that kernel's code buffer.
 *
 * \param rctx	context; packets are appended to rctx->b.rings.gfx.cs
 * \param atom	atom being emitted, cast to r600_cs_shader_state
 */
void evergreen_emit_cs_shader(
		struct r600_context *rctx,
		struct r600_atom *atom)
{
	struct r600_cs_shader_state *cs_state =
					(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *compute = cs_state->shader;
	struct r600_kernel *active = &compute->kernels[cs_state->kernel_index];
	struct radeon_winsys_cs *cmd = rctx->b.rings.gfx.cs;
	uint64_t code_va = r600_resource_va(&rctx->screen->b.b,
					    &active->code_bo->b.b);

	r600_write_compute_context_reg_seq(cmd, R_0288D0_SQ_PGM_START_LS, 3);
	/* R_0288D0_SQ_PGM_START_LS: the address is written shifted right by 8. */
	radeon_emit(cmd, code_va >> 8);
	/* R_0288D4_SQ_PGM_RESOURCES_LS */
	radeon_emit(cmd, S_0288D4_NUM_GPRS(active->bc.ngpr) |
			 S_0288D4_STACK_SIZE(active->bc.nstack));
	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
	radeon_emit(cmd, 0);

	/* NOP packet whose payload is the relocation of the code buffer. */
	radeon_emit(cmd, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cmd, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
					       active->code_bo,
					       RADEON_USAGE_READ));
}
Exemple #21
0
/**
 * Emit a PM4 state object into the gfx command stream.
 *
 * All buffers referenced by the state are first added to the buffer list.
 * The state's dwords are then either copied inline or, when the state has
 * an indirect buffer, referenced through an INDIRECT_BUFFER_CIK packet.
 *
 * \param sctx	context; packets go to sctx->b.gfx.cs
 * \param state	PM4 state to emit
 */
void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
{
	struct radeon_winsys_cs *gfx_cs = sctx->b.gfx.cs;
	struct r600_resource *indirect;
	int n;

	for (n = 0; n < state->nbo; ++n) {
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[n],
					  state->bo_usage[n],
					  state->bo_priority[n]);
	}

	indirect = state->indirect_buffer;

	if (indirect) {
		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, indirect,
					  RADEON_USAGE_READ,
					  RADEON_PRIO_IB2);

		radeon_emit(gfx_cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
		radeon_emit(gfx_cs, indirect->gpu_address);
		radeon_emit(gfx_cs, indirect->gpu_address >> 32);
		/* Buffer width divided by 4, masked to 20 bits. */
		radeon_emit(gfx_cs, (indirect->b.b.width0 >> 2) & 0xfffff);
	} else {
		radeon_emit_array(gfx_cs, state->pm4, state->ndw);
	}
}
/**
 * Emit a WAIT_REG_MEM packet that waits on a memory location.
 *
 * The packet uses the EQUAL comparison with MEM_SPACE set to 1 and a poll
 * interval of 4.
 *
 * \param ctx	context; the packet is appended to ctx->gfx.cs
 * \param va	GPU address to poll
 * \param ref	reference value to compare against
 * \param mask	mask written into the packet's mask field
 */
void r600_gfx_wait_fence(struct r600_common_context *ctx,
			 uint64_t va, uint32_t ref, uint32_t mask)
{
	struct radeon_winsys_cs *gfx_cs = ctx->gfx.cs;
	const uint32_t wait_op = WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1);
	const uint32_t poll_interval = 4;

	radeon_emit(gfx_cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
	radeon_emit(gfx_cs, wait_op);
	/* Address, low dword first. */
	radeon_emit(gfx_cs, va);
	radeon_emit(gfx_cs, va >> 32);
	/* reference value */
	radeon_emit(gfx_cs, ref);
	/* mask */
	radeon_emit(gfx_cs, mask);
	radeon_emit(gfx_cs, poll_interval);
}
Exemple #23
0
/**
 * Emit the initial compute register state.
 *
 * Zeroes COMPUTE_START_X and the two registers after it, writes
 * COMPUTE_RESOURCE_LIMITS = 0 together with the SE0/SE1 static thread
 * management masks, writes the SE2/SE3 masks on CIK and later, and sets
 * COMPUTE_MAX_WAVE_ID on SI and earlier.
 *
 * \param physical_device	device whose chip_class selects the paths
 * \param cs			command stream to emit into
 */
static void
si_init_compute(struct radv_physical_device *physical_device,
                struct radeon_winsys_cs *cs)
{
	int dim;

	/* COMPUTE_START_X and the two registers that follow it. */
	radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
	for (dim = 0; dim < 3; dim++)
		radeon_emit(cs, 0);

	radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
	radeon_emit(cs, 0);
	/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
	radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) |
			S_00B858_SH1_CU_EN(0xffff));
	radeon_emit(cs, S_00B85C_SH0_CU_EN(0xffff) |
			S_00B85C_SH1_CU_EN(0xffff));

	if (physical_device->rad_info.chip_class >= CIK) {
		/* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
		radeon_set_sh_reg_seq(cs,
				      R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2,
				      2);
		radeon_emit(cs, S_00B864_SH0_CU_EN(0xffff) |
				S_00B864_SH1_CU_EN(0xffff));
		radeon_emit(cs, S_00B868_SH0_CU_EN(0xffff) |
				S_00B868_SH1_CU_EN(0xffff));
	}

	/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
	 * and is now per pipe, so it should be handled in the
	 * kernel if we want to use something other than the default value,
	 * which is now 0x22f.
	 */
	if (physical_device->rad_info.chip_class > SI)
		return;

	/* XXX: This should be:
	 * (number of compute units) * 4 * (waves per simd) - 1 */
	radeon_set_sh_reg(cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
			  0x190 /* Default value */);
}
Exemple #24
0
/**
 * Emit the initial compute register state and reset the tracked compute
 * shader state.
 *
 * Zeroes COMPUTE_START_X and the two registers after it, writes the SE0/SE1
 * static thread management masks, the SE2/SE3 masks on CIK and later, and
 * COMPUTE_MAX_WAVE_ID on SI and earlier. Finally clears emitted_program and
 * marks the compute state as initialized.
 *
 * \param sctx	context; packets go to sctx->b.gfx.cs
 */
static void si_initialize_compute(struct si_context *sctx)
{
	struct radeon_winsys_cs *gfx_cs = sctx->b.gfx.cs;
	int dim;

	/* COMPUTE_START_X and the two registers that follow it. */
	radeon_set_sh_reg_seq(gfx_cs, R_00B810_COMPUTE_START_X, 3);
	for (dim = 0; dim < 3; dim++)
		radeon_emit(gfx_cs, 0);

	/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
	radeon_set_sh_reg_seq(gfx_cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
	radeon_emit(gfx_cs, S_00B858_SH0_CU_EN(0xffff) |
			    S_00B858_SH1_CU_EN(0xffff));
	radeon_emit(gfx_cs, S_00B85C_SH0_CU_EN(0xffff) |
			    S_00B85C_SH1_CU_EN(0xffff));

	if (sctx->b.chip_class >= CIK) {
		/* Also set R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE2 / SE3 */
		radeon_set_sh_reg_seq(gfx_cs,
				      R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2,
				      2);
		radeon_emit(gfx_cs, S_00B864_SH0_CU_EN(0xffff) |
				    S_00B864_SH1_CU_EN(0xffff));
		radeon_emit(gfx_cs, S_00B868_SH0_CU_EN(0xffff) |
				    S_00B868_SH1_CU_EN(0xffff));
	}

	/* This register has been moved to R_00CD20_COMPUTE_MAX_WAVE_ID
	 * and is now per pipe, so it should be handled in the
	 * kernel if we want to use something other than the default value,
	 * which is now 0x22f.
	 */
	if (sctx->b.chip_class <= SI) {
		/* XXX: This should be:
		 * (number of compute units) * 4 * (waves per simd) - 1 */
		radeon_set_sh_reg(gfx_cs, R_00B82C_COMPUTE_MAX_WAVE_ID,
				  0x190 /* Default value */);
	}

	/* Forget which program was last emitted and flag the state as set up. */
	sctx->cs_shader_state.emitted_program = NULL;
	sctx->cs_shader_state.initialized = true;
}
Exemple #25
0
/**
 * Emit the register state and the DISPATCH_DIRECT packet for one compute
 * launch.
 *
 * Programs the thread-group size, the compute start offsets (all zero), the
 * per-dimension thread counts and the LDS allocation, then dispatches the
 * grid.
 *
 * \param rctx		context; packets go to rctx->b.gfx.cs
 * \param block_layout	three thread-block dimensions (x, y, z)
 * \param grid_layout	three grid dimensions, passed to DISPATCH_DIRECT
 */
static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	/* LDS size in dwords: local_size / 4 plus the bytecode's nlds_dw. */
	unsigned lds_size = shader->local_size / 4 +
#if HAVE_LLVM < 0x0306
		shader->active_kernel->bc.nlds_dw;
#else
		shader->bc.nlds_dw;
#endif


	/* Calculate group_size: total number of threads in one block. */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	/* LDS size in the low bits, wave count starting at bit 14. */
	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, grid_layout[0]);
	radeon_emit(cs, grid_layout[1]);
	radeon_emit(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}
Exemple #26
0
/**
 * Copy a buffer range using CP DMA packets.
 *
 * The copy is split into packets of at most CP_DMA_MAX_BYTE_COUNT bytes.
 * Cache flush flags are set before the copy and read-cache invalidation
 * flags are set again after it.
 *
 * \param rctx		context; packets go to rctx->b.gfx.cs
 * \param dst		destination resource
 * \param dst_offset	byte offset into \p dst
 * \param src		source resource
 * \param src_offset	byte offset into \p src
 * \param size		number of bytes to copy; must be non-zero
 */
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
			     struct pipe_resource *dst, uint64_t dst_offset,
			     struct pipe_resource *src, uint64_t src_offset,
			     unsigned size)
{
	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;

	assert(size);
	assert(rctx->screen->b.has_cp_dma);

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&r600_resource(dst)->valid_buffer_range, dst_offset,
		       dst_offset + size);

	/* From here on the offsets are absolute GPU addresses. */
	dst_offset += r600_resource(dst)->gpu_address;
	src_offset += r600_resource(src)->gpu_address;

	/* Flush the caches where the resources are bound. */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE |
			 R600_CONTEXT_FLUSH_AND_INV |
			 R600_CONTEXT_FLUSH_AND_INV_CB |
			 R600_CONTEXT_FLUSH_AND_INV_DB |
			 R600_CONTEXT_FLUSH_AND_INV_CB_META |
			 R600_CONTEXT_FLUSH_AND_INV_DB_META |
			 R600_CONTEXT_STREAMOUT_FLUSH |
			 R600_CONTEXT_WAIT_3D_IDLE;

	/* There are differences between R700 and EG in CP DMA,
	 * but we only use the common bits here. */
	while (size) {
		unsigned sync = 0;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned src_reloc, dst_reloc;

		/* Reserve 10 dwords for the packets emitted below (6 for
		 * CP_DMA plus two 2-dword NOPs), plus room for the flush
		 * when flags are pending. */
		r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE);

		/* Flush the caches for the first copy only. */
		if (rctx->b.flags) {
			r600_flush_emit(rctx);
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count) {
			sync = PKT3_CP_DMA_CP_SYNC;
		}

		/* This must be done after r600_need_cs_space. */
		src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src,
						  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
		dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst,
						  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);

		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_offset);	/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync | ((src_offset >> 32) & 0xff));		/* CP_SYNC [31] | SRC_ADDR_HI [7:0] */
		radeon_emit(cs, dst_offset);	/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_offset >> 32) & 0xff);		/* DST_ADDR_HI [7:0] */
		radeon_emit(cs, byte_count);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */

		/* One NOP packet per relocation: source first, then destination. */
		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, src_reloc);
		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, dst_reloc);

		/* Advance to the next chunk. */
		size -= byte_count;
		src_offset += byte_count;
		dst_offset += byte_count;
	}

	/* Invalidate the read caches. */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
}
Exemple #27
0
/**
 * Emit the packets for the flushes requested in
 * cmd_buffer->state.flush_bits, then clear flush_bits.
 *
 * Cache-invalidation and CB/DB flush bits are accumulated into a
 * CP_COHER_CNTL value that is emitted at the end in a SURFACE_SYNC packet;
 * the remaining bits are turned into EVENT_WRITE packets as they are
 * encountered.
 *
 * \param cmd_buffer	command buffer whose cs receives the packets
 */
void
si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
{
	enum chip_class chip_class = cmd_buffer->device->instance->physicalDevice.rad_info.chip_class;
	unsigned cp_coher_cntl = 0;

	/* Reserve 128 dwords up front for everything emitted below. */
	radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 128);

	/* Shader-visible cache invalidations only set CP_COHER_CNTL bits. */
	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_ICACHE)
		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_SMEM_L1)
		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_VMEM_L1)
		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) {
		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
		/* VI and later additionally set TC_WB_ACTION_ENA. */
		if (chip_class >= VI)
			cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
	}

	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB) {
		cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
			S_0085F0_CB0_DEST_BASE_ENA(1) |
			S_0085F0_CB1_DEST_BASE_ENA(1) |
			S_0085F0_CB2_DEST_BASE_ENA(1) |
			S_0085F0_CB3_DEST_BASE_ENA(1) |
			S_0085F0_CB4_DEST_BASE_ENA(1) |
			S_0085F0_CB5_DEST_BASE_ENA(1) |
			S_0085F0_CB6_DEST_BASE_ENA(1) |
			S_0085F0_CB7_DEST_BASE_ENA(1);

		/* Necessary for DCC */
		if (cmd_buffer->device->instance->physicalDevice.rad_info.chip_class >= VI) {
			/* EOP event with all address/data dwords set to zero. */
			radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
			radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) |
			                            EVENT_INDEX(5));
			radeon_emit(cmd_buffer->cs, 0);
			radeon_emit(cmd_buffer->cs, 0);
			radeon_emit(cmd_buffer->cs, 0);
			radeon_emit(cmd_buffer->cs, 0);
		}
	}

	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB) {
		cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
			S_0085F0_DB_DEST_BASE_ENA(1);
	}

	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_CB_META) {
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
	}

	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_FLUSH_AND_INV_DB_META) {
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
	}

	/* PS/VS partial flushes are emitted only when neither a CB nor a DB
	 * flush was requested; a PS partial flush takes precedence over a VS
	 * one. */
	if (!(cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
					      RADV_CMD_FLAG_FLUSH_AND_INV_DB))) {
		if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) {
			radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
			radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		} else if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_VS_PARTIAL_FLUSH) {
			radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
			radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		}
	}

	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH) {
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
	}

	/* VGT state sync */
	if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_VGT_FLUSH) {
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
	}

	/* Make sure ME is idle (it executes most packets) before continuing.
	 * This prevents read-after-write hazards between PFP and ME.
	 */
	if (cp_coher_cntl || (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
		radeon_emit(cmd_buffer->cs, 0);
	}

	/* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
	 * Therefore, it should be last. Done in PFP.
	 */
	if (cp_coher_cntl) {
		/* ACQUIRE_MEM is only required on a compute ring. */
		radeon_emit(cmd_buffer->cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
		radeon_emit(cmd_buffer->cs, cp_coher_cntl);   /* CP_COHER_CNTL */
		radeon_emit(cmd_buffer->cs, 0xffffffff);      /* CP_COHER_SIZE */
		radeon_emit(cmd_buffer->cs, 0);               /* CP_COHER_BASE */
		radeon_emit(cmd_buffer->cs, 0x0000000A);      /* POLL_INTERVAL */
	}

	/* All requested flushes have been emitted. */
	cmd_buffer->state.flush_bits = 0;
}
Exemple #28
0
/**
 * Write the MSAA sample-location registers for a given sample count.
 *
 * For 2x and 4x, each of the four per-pixel registers (X0Y0, X1Y0, X0Y1,
 * X1Y1) gets one dword from the matching eg_sample_locs_* table.  For 8x and
 * 16x, the values are written as one register sequence that starts at
 * PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0.  Any other sample count emits nothing.
 *
 * \param cs          command stream that receives the register writes
 * \param nr_samples  samples per pixel; only 2, 4, 8 and 16 are handled
 */
void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples)
{
	/* First sample-location register of each pixel, in emission order. */
	const unsigned pixel_regs[4] = {
		CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
		CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
		CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
		CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
	};
	unsigned pixel, group;

	switch (nr_samples) {
	case 2:
		for (pixel = 0; pixel < 4; pixel++)
			radeon_set_context_reg(cs, pixel_regs[pixel],
					       eg_sample_locs_2x[pixel]);
		break;
	case 4:
		for (pixel = 0; pixel < 4; pixel++)
			radeon_set_context_reg(cs, pixel_regs[pixel],
					       eg_sample_locs_4x[pixel]);
		break;
	case 8:
		/* 14 dwords: two table entries per pixel, each followed by two
		 * zero dwords, except after the last pixel where the sequence
		 * simply ends.
		 */
		radeon_set_context_reg_seq(cs, pixel_regs[0], 14);
		for (pixel = 0; pixel < 4; pixel++) {
			radeon_emit(cs, cm_sample_locs_8x[pixel]);
			radeon_emit(cs, cm_sample_locs_8x[pixel + 4]);
			if (pixel != 3) {
				radeon_emit(cs, 0);
				radeon_emit(cs, 0);
			}
		}
		break;
	case 16:
		/* 16 dwords: four table entries per pixel, taken with a stride
		 * of 4 through the table.
		 */
		radeon_set_context_reg_seq(cs, pixel_regs[0], 16);
		for (pixel = 0; pixel < 4; pixel++) {
			for (group = 0; group < 4; group++)
				radeon_emit(cs, cm_sample_locs_16x[pixel + 4 * group]);
		}
		break;
	}
}
Exemple #29
0
/**
 * Emit the full command sequence for one compute dispatch into the gfx
 * command stream.
 *
 * The order is: initial compute register setup, config state (Evergreen
 * only), a flush, colorbuffer registers, CB_TARGET_MASK, the compute state
 * atoms, the dispatch itself, a cache-invalidating flush and, on Cayman and
 * newer, a CS_PARTIAL_FLUSH event followed by DEALLOC_STATE.
 *
 * \param ctx           context whose gfx command stream is written
 * \param block_layout  forwarded unchanged to evergreen_emit_direct_dispatch()
 * \param grid_layout   forwarded unchanged to evergreen_emit_direct_dispatch()
 */
static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
		const uint *grid_layout)
{
	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
	unsigned slot;

	/* Only the gfx ring may be active: flush the DMA command stream if it
	 * holds any dwords.
	 */
	if (ctx->b.dma.cs && ctx->b.dma.cs->cdw)
		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &ctx->start_compute_cs_cmd);

	/* The config state atom is emitted on Evergreen only. */
	if (ctx->b.chip_class == EVERGREEN)
		r600_emit_atom(ctx, &ctx->config_state.atom);

	ctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(ctx);

	/* Colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	slot = 0;
	while (slot < 8 && slot < ctx->framebuffer.state.nr_cbufs) {
		struct r600_surface *surf =
			(struct r600_surface*)ctx->framebuffer.state.cbufs[slot];
		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
						       (struct r600_resource*)surf->base.texture,
						       RADEON_USAGE_READWRITE,
						       RADEON_PRIO_SHADER_RW_BUFFER);

		/* Seven consecutive registers starting at CB_COLORn_BASE. */
		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + slot * 0x3C, 7);
		radeon_emit(cs, surf->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, surf->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, surf->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, surf->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, surf->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, surf->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, surf->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		/* Relocation NOPs, each carrying the buffer-list index. */
		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		/* The CB_COLOR0_INFO relocation is skipped when tiling flags
		 * are kept.
		 */
		if (!ctx->keep_tiling_flags) {
			radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C70_CB_COLOR0_INFO */
			radeon_emit(cs, reloc);
		}

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);

		slot++;
	}

	/* With tiling flags kept, every remaining slot up to CB11 gets the
	 * invalid color format.  CB0-7 and CB8-11 live in different register
	 * ranges with different strides.
	 */
	if (ctx->keep_tiling_flags) {
		for (; slot < 12; slot++) {
			unsigned info_reg = slot < 8 ?
				R_028C70_CB_COLOR0_INFO + slot * 0x3C :
				R_028E50_CB_COLOR8_INFO + (slot - 8) * 0x1C;

			radeon_compute_set_context_reg(cs, info_reg,
						       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
		}
	}

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					ctx->compute_cb_target_mask);

	/* Vertex buffer state: the atom size is 12 dwords per dirty buffer. */
	ctx->cs_vertex_buffer_state.atom.num_dw =
		12 * util_bitcount(ctx->cs_vertex_buffer_state.dirty_mask);
	r600_emit_atom(ctx, &ctx->cs_vertex_buffer_state.atom);

	/* Constant buffers, sampler states, sampler views (texture
	 * resources) and the compute shader, in that order.
	 */
	r600_emit_atom(ctx, &ctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
	r600_emit_atom(ctx, &ctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
	r600_emit_atom(ctx, &ctx->cs_shader_state.atom);

	/* Dispatch state and the dispatch packet itself. */
	evergreen_emit_direct_dispatch(ctx, block_layout, grid_layout);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	ctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
		      R600_CONTEXT_INV_VERTEX_CACHE |
		      R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(ctx);
	ctx->b.flags = 0;

	if (ctx->b.chip_class >= CAYMAN) {
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4);
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		cs->buf[cs->cdw++] = PKT3C(PKT3_DEALLOC_STATE, 0, 0);
		cs->buf[cs->cdw++] = 0;
	}

#if 0
	/* Disabled debugging aid: dump every dword of the command stream. */
	COMPUTE_DBG(ctx->screen, "cdw: %i\n", cs->cdw);
	for (slot = 0; slot < cs->cdw; slot++) {
		COMPUTE_DBG(ctx->screen, "%4i : 0x%08X\n", slot, cs->buf[slot]);
	}
#endif

}
Exemple #30
0
static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= GFX6) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements
         * r6xx, requires at least 4 dw alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
       fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty or overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.r600_has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.r600_has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & PIPE_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & PIPE_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}