Exemplo n.º 1
0
/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream.  Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function.  The registers
 * intialized by the start_cs_cmd atom can be found in evereen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *ctx)
{
	struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialised in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(ctx, cb, 1, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	switch (ctx->family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* Config Registers */
	evergreen_init_common_regs(cb, ctx->chip_class
			, ctx->family, ctx->screen->info.drm_minor);

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
						V_008958_DI_PT_POINTLIST);

	if (ctx->chip_class < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage.  It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resouce
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Contol Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}

	/* Context Registers */

	if (ctx->chip_class < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				S_028838_PS_GPRS(0x1e) |
				S_028838_VS_GPRS(0x1e) |
				S_028838_GS_GPRS(0x1e) |
				S_028838_ES_GPRS(0x1e) |
				S_028838_HS_GPRS(0x1e) |
				S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
						S_0286E8_TID_IN_GROUP_ENA
						| S_0286E8_TGID_ENA
						| S_0286E8_DISABLE_INDEX_PACK)
						;

	/* The LOOP_CONST registers are an optimizations for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code.  We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops.  However, the
	 * hardware will still uses this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to the 4095 (0xfff) which
	 * is the maximum value allowed.  This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute before some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}
Exemplo n.º 2
0
/**
 * Record the register configuration used for compute in the
 * COMPUTE_RESOURCE_CONFIG resource of the context's compute shader.
 *
 * The resource is obtained with get_empty_res() and then filled, one
 * evergreen_reg_set() call per register, with the SQ, SPI, VGT and DB
 * register values below.  Each register is written exactly once.
 *
 * \param ctx  context providing the GPU family, the chip class and the
 *             compute shader (ctx->cs_shader) that owns the resource.
 */
void evergreen_compute_init_config(struct r600_context *ctx)
{
    struct evergreen_compute_resource *res =
        get_empty_res(ctx->cs_shader, COMPUTE_RESOURCE_CONFIG, 0);

    /* These two values are identical for every family handled here. */
    const int num_temp_gprs = 4;
    const int num_threads = 128;
    int num_stack_entries;

    enum radeon_family family;
    unsigned tmp;

    family = ctx->family;

    /* Only the stack size differs between families. */
    switch (family) {
    case CHIP_JUNIPER:
    case CHIP_CYPRESS:
    case CHIP_HEMLOCK:
    case CHIP_SUMO2:
    case CHIP_BARTS:
        num_stack_entries = 512;
        break;
    default:
        /* CEDAR, REDWOOD, PALM, SUMO, TURKS, CAICOS and any family
         * that is not listed explicitly. */
        num_stack_entries = 256;
        break;
    }

    /* SQ_CONFIG: VC_ENABLE is set for every family except the ones
     * listed in the first case group. */
    tmp = 0x00000000;
    switch (family) {
    case CHIP_CEDAR:
    case CHIP_PALM:
    case CHIP_SUMO:
    case CHIP_SUMO2:
    case CHIP_CAICOS:
        break;
    default:
        tmp |= S_008C00_VC_ENABLE(1);
        break;
    }
    tmp |= S_008C00_EXPORT_SRC_C(1);
    /* All stage priorities are left at 0; spelled out for clarity. */
    tmp |= S_008C00_CS_PRIO(0);
    tmp |= S_008C00_LS_PRIO(0);
    tmp |= S_008C00_HS_PRIO(0);
    tmp |= S_008C00_PS_PRIO(0);
    tmp |= S_008C00_VS_PRIO(0);
    tmp |= S_008C00_GS_PRIO(0);
    tmp |= S_008C00_ES_PRIO(0);

    evergreen_reg_set(res, R_008C00_SQ_CONFIG, tmp);

    /* GPR resource management. */
    evergreen_reg_set(res, R_008C04_SQ_GPR_RESOURCE_MGMT_1,
                      S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs));
    if (ctx->chip_class < CAYMAN) {
        evergreen_reg_set(res, R_008C08_SQ_GPR_RESOURCE_MGMT_2, 0);
    }
    evergreen_reg_set(res, R_008C10_SQ_GLOBAL_GPR_RESOURCE_MGMT_1, 0);
    evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0);
    evergreen_reg_set(res, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));

    /* workaround for hw issues with dyn gpr - must set all limits to 240
     * instead of 0, 0x1e == 240/8 */
    if (ctx->chip_class < CAYMAN) {
        evergreen_reg_set(res, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
                          S_028838_PS_GPRS(0x1e) |
                          S_028838_VS_GPRS(0x1e) |
                          S_028838_GS_GPRS(0x1e) |
                          S_028838_ES_GPRS(0x1e) |
                          S_028838_HS_GPRS(0x1e) |
                          S_028838_LS_GPRS(0x1e));
    } else {
        /* NOTE(review): raw register offset 0x286f8 has no named define
         * here and is filled using the S_028838_* field macros;
         * presumably the Cayman counterpart of the register above -
         * confirm against the register documentation. */
        evergreen_reg_set(res, 0x286f8,
                          S_028838_PS_GPRS(0x1e) |
                          S_028838_VS_GPRS(0x1e) |
                          S_028838_GS_GPRS(0x1e) |
                          S_028838_ES_GPRS(0x1e) |
                          S_028838_HS_GPRS(0x1e) |
                          S_028838_LS_GPRS(0x1e));
    }

    if (ctx->chip_class < CAYMAN) {
        /* Static thread management masks: all bits set. */
        evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, 0xFFFFFFFF);
        evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, 0xFFFFFFFF);
        evergreen_reg_set(res, R_008E28_SQ_STATIC_THREAD_MGMT3, 0xFFFFFFFF);

        /* Thread resources: only the LS thread count is non-zero. */
        evergreen_reg_set(res, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 0);
        tmp = S_008C1C_NUM_LS_THREADS(num_threads);
        evergreen_reg_set(res, R_008C1C_SQ_THREAD_RESOURCE_MGMT_2, tmp);

        /* Stack resources: only the LS stack entry count is non-zero. */
        evergreen_reg_set(res, R_008C20_SQ_STACK_RESOURCE_MGMT_1, 0);
        evergreen_reg_set(res, R_008C24_SQ_STACK_RESOURCE_MGMT_2, 0);
        tmp = S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries);
        evergreen_reg_set(res, R_008C28_SQ_STACK_RESOURCE_MGMT_3, tmp);
    }

    /* SPI setup. */
    evergreen_reg_set(res, R_0286CC_SPI_PS_IN_CONTROL_0,
                      S_0286CC_LINEAR_GRADIENT_ENA(1));
    evergreen_reg_set(res, R_0286D0_SPI_PS_IN_CONTROL_1, 0);
    evergreen_reg_set(res, R_0286E4_SPI_PS_IN_CONTROL_2, 0);
    evergreen_reg_set(res, R_0286D8_SPI_INPUT_Z, 0);
    evergreen_reg_set(res, R_0286E0_SPI_BARYC_CNTL, 1 << 20);
    tmp = S_0286E8_TID_IN_GROUP_ENA
        | S_0286E8_TGID_ENA
        | S_0286E8_DISABLE_INDEX_PACK;
    evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, tmp);

    /* VGT setup: compute mode, CS stage enabled. */
    tmp = S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1);
    evergreen_reg_set(res, R_028A40_VGT_GS_MODE, tmp);
    evergreen_reg_set(res, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

    /* DB setup. */
    evergreen_reg_set(res, R_028800_DB_DEPTH_CONTROL, 0);
    evergreen_reg_set(res, R_02880C_DB_SHADER_CONTROL, 0);
    evergreen_reg_set(res, R_028000_DB_RENDER_CONTROL,
                      S_028000_COLOR_DISABLE(1));
    evergreen_reg_set(res, R_02800C_DB_RENDER_OVERRIDE, 0);
}