Пример #1
0
static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
{
    struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
    unsigned reg_strmout_cntl;

    /* The register is at different places on different ASICs. */
    if (rctx->chip_class >= CIK) {
        reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
    } else if (rctx->chip_class >= EVERGREEN) {
        reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
    } else {
        reg_strmout_cntl = R_008490_CP_STRMOUT_CNTL;
    }

    if (rctx->chip_class >= CIK) {
        cik_write_uconfig_reg(cs, reg_strmout_cntl, 0);
    } else {
        r600_write_config_reg(cs, reg_strmout_cntl, 0);
    }

    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
    radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

    radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
    radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
    radeon_emit(cs, reg_strmout_cntl >> 2);  /* register */
    radeon_emit(cs, 0);
    radeon_emit(cs, S_008490_OFFSET_UPDATE_DONE(1)); /* reference value */
    radeon_emit(cs, S_008490_OFFSET_UPDATE_DONE(1)); /* mask */
    radeon_emit(cs, 4); /* poll interval */
}
Пример #2
0
void evergreen_flush_vgt_streamout(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;

	r600_write_config_reg(cs, R_0084FC_CP_STRMOUT_CNTL, 0);

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0);

	cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
	cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
	cs->buf[cs->cdw++] = R_0084FC_CP_STRMOUT_CNTL >> 2;  /* register */
	cs->buf[cs->cdw++] = 0;
	cs->buf[cs->cdw++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* reference value */
	cs->buf[cs->cdw++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* mask */
	cs->buf[cs->cdw++] = 4; /* poll interval */
}
Пример #3
0
static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->cs;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	/* XXX: Enable lds and get size from cs_shader_state */
	unsigned lds_size = 0;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG("Using %u pipes, there are %u wavefronts per thread block\n",
							num_pipes, num_waves);

	/* XXX: Partition the LDS between PS/CS.  By default half (4096 dwords
	 * on Evergreen) oes to Pixel Shaders and half goes to Compute Shaders.
	 * We may need to allocat the entire LDS space for Compute Shaders.
	 *
	 * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords)
	 * CM: CM_R_0286FC_SPI_LDS_MGMT :=  S_0286FC_NUM_LS_LDS(lds_dwords)
	 */

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	r600_write_value(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	r600_write_value(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	r600_write_value(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	r600_write_value(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	r600_write_value(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	r600_write_value(cs, grid_layout[0]);
	r600_write_value(cs, grid_layout[1]);
	r600_write_value(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	r600_write_value(cs, 1);
}
Пример #4
0
static void evergreen_emit_direct_dispatch(
		struct r600_context *rctx,
		const uint *block_layout, const uint *grid_layout)
{
	int i;
	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	int grid_size = 1;
	unsigned lds_size = shader->local_size / 4 + shader->active_kernel->bc.nlds_dw;

	/* Calculate group_size/grid_size */
	for (i = 0; i < 3; i++) {
		group_size *= block_layout[i];
	}

	for (i = 0; i < 3; i++)	{
		grid_size *= grid_layout[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y, tg_size.z) / (16 * num_pipes)) */
	num_waves = (block_layout[0] * block_layout[1] * block_layout[2] +
			wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				"%u wavefronts per thread block, "
				"allocating %u dwords lds.\n",
				num_pipes, num_waves, lds_size);

	r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	r600_write_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	r600_write_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
								group_size);

	r600_write_compute_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, block_layout[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.chip_class < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC,
					lds_size | (num_waves << 14));

	/* Dispatch packet */
	radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, 0));
	radeon_emit(cs, grid_layout[0]);
	radeon_emit(cs, grid_layout[1]);
	radeon_emit(cs, grid_layout[2]);
	/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
	radeon_emit(cs, 1);
}