/** * This function initializes all the compute specific registers that need to * be initialized for each compute command stream. Registers that are common * to both compute and 3D will be initialized at the beginning of each compute * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG * packet requires that the shader type bit be set, we must initialize all * context registers needed for compute in this function. The registers * intialized by the start_cs_cmd atom can be found in evereen_state.c in the * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending * on the GPU family. */ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx) { struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd; int num_threads; int num_stack_entries; /* since all required registers are initialised in the * start_compute_cs_cmd atom, we can EMIT_EARLY here. */ r600_init_command_buffer(ctx, cb, 1, 256); cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE; switch (ctx->family) { case CHIP_CEDAR: default: num_threads = 128; num_stack_entries = 256; break; case CHIP_REDWOOD: num_threads = 128; num_stack_entries = 256; break; case CHIP_JUNIPER: num_threads = 128; num_stack_entries = 512; break; case CHIP_CYPRESS: case CHIP_HEMLOCK: num_threads = 128; num_stack_entries = 512; break; case CHIP_PALM: num_threads = 128; num_stack_entries = 256; break; case CHIP_SUMO: num_threads = 128; num_stack_entries = 256; break; case CHIP_SUMO2: num_threads = 128; num_stack_entries = 512; break; case CHIP_BARTS: num_threads = 128; num_stack_entries = 512; break; case CHIP_TURKS: num_threads = 128; num_stack_entries = 256; break; case CHIP_CAICOS: num_threads = 128; num_stack_entries = 256; break; } /* Config Registers */ evergreen_init_common_regs(cb, ctx->chip_class , ctx->family, ctx->screen->info.drm_minor); /* The primitive type always needs to be POINTLIST for compute. */ r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE, V_008958_DI_PT_POINTLIST); if (ctx->chip_class < CAYMAN) { /* These registers control which simds can be used by each stage. * The default for these registers is 0xffffffff, which means * all simds are available for each stage. It's possible we may * want to play around with these in the future, but for now * the default value is fine. * * R_008E20_SQ_STATIC_THREAD_MGMT1 * R_008E24_SQ_STATIC_THREAD_MGMT2 * R_008E28_SQ_STATIC_THREAD_MGMT3 */ /* XXX: We may need to adjust the thread and stack resouce * values for 3D/compute interop */ r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5); /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1 * Set the number of threads used by the PS/VS/GS/ES stage to * 0. */ r600_store_value(cb, 0); /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2 * Set the number of threads used by the CS (aka LS) stage to * the maximum number of threads and set the number of threads * for the HS stage to 0. */ r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads)); /* R_008C20_SQ_STACK_RESOURCE_MGMT_1 * Set the Control Flow stack entries to 0 for PS/VS stages */ r600_store_value(cb, 0); /* R_008C24_SQ_STACK_RESOURCE_MGMT_2 * Set the Control Flow stack entries to 0 for GS/ES stages */ r600_store_value(cb, 0); /* R_008C28_SQ_STACK_RESOURCE_MGMT_3 * Set the Contol Flow stack entries to 0 for the HS stage, and * set it to the maximum value for the CS (aka LS) stage. */ r600_store_value(cb, S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries)); } /* Context Registers */ if (ctx->chip_class < CAYMAN) { /* workaround for hw issues with dyn gpr - must set all limits * to 240 instead of 0, 0x1e == 240 / 8 */ r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, S_028838_PS_GPRS(0x1e) | S_028838_VS_GPRS(0x1e) | S_028838_GS_GPRS(0x1e) | S_028838_ES_GPRS(0x1e) | S_028838_HS_GPRS(0x1e) | S_028838_LS_GPRS(0x1e)); } /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */ r600_store_context_reg(cb, R_028A40_VGT_GS_MODE, S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1)); r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/); r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL, S_0286E8_TID_IN_GROUP_ENA | S_0286E8_TGID_ENA | S_0286E8_DISABLE_INDEX_PACK) ; /* The LOOP_CONST registers are an optimizations for loops that allows * you to store the initial counter, increment value, and maximum * counter value in a register so that hardware can calculate the * correct number of iterations for the loop, so that you don't need * to have the loop counter in your shader code. We don't currently use * this optimization, so we must keep track of the counter in the * shader and use a break instruction to exit loops. However, the * hardware will still uses this register to determine when to exit a * loop, so we need to initialize the counter to 0, set the increment * value to 1 and the maximum counter value to the 4095 (0xfff) which * is the maximum value allowed. This gives us a maximum of 4096 * iterations for our loops, but hopefully our break instruction will * execute before some time before the 4096th iteration. */ eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF); }
void evergreen_compute_init_config(struct r600_context *ctx) { struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader, COMPUTE_RESOURCE_CONFIG, 0); int num_threads; int num_stack_entries; int num_temp_gprs; enum radeon_family family; unsigned tmp; family = ctx->family; switch (family) { case CHIP_CEDAR: default: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 256; break; case CHIP_REDWOOD: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 256; break; case CHIP_JUNIPER: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 512; break; case CHIP_CYPRESS: case CHIP_HEMLOCK: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 512; break; case CHIP_PALM: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 256; break; case CHIP_SUMO: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 256; break; case CHIP_SUMO2: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 512; break; case CHIP_BARTS: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 512; break; case CHIP_TURKS: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 256; break; case CHIP_CAICOS: num_temp_gprs = 4; num_threads = 128; num_stack_entries = 256; break; } tmp = 0x00000000; switch (family) { case CHIP_CEDAR: case CHIP_PALM: case CHIP_SUMO: case CHIP_SUMO2: case CHIP_CAICOS: break; default: tmp |= S_008C00_VC_ENABLE(1); break; } tmp |= S_008C00_EXPORT_SRC_C(1); tmp |= S_008C00_CS_PRIO(0); tmp |= S_008C00_LS_PRIO(0); tmp |= S_008C00_HS_PRIO(0); tmp |= S_008C00_PS_PRIO(0); tmp |= S_008C00_VS_PRIO(0); tmp |= S_008C00_GS_PRIO(0); tmp |= S_008C00_ES_PRIO(0); evergreen_reg_set(res, R_008C00_SQ_CONFIG, tmp); evergreen_reg_set(res, R_008C04_SQ_GPR_RESOURCE_MGMT_1, S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs)); if (ctx->chip_class < CAYMAN) { evergreen_reg_set(res, R_008C08_SQ_GPR_RESOURCE_MGMT_2, 0); } evergreen_reg_set(res, R_008C10_SQ_GLOBAL_GPR_RESOURCE_MGMT_1, 0); evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0); evergreen_reg_set(res, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8)); /* workaround for hw issues with dyn gpr - must set all limits to 240 * instead of 0, 0x1e == 240/8 */ if (ctx->chip_class < CAYMAN) { evergreen_reg_set(res, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, S_028838_PS_GPRS(0x1e) | S_028838_VS_GPRS(0x1e) | S_028838_GS_GPRS(0x1e) | S_028838_ES_GPRS(0x1e) | S_028838_HS_GPRS(0x1e) | S_028838_LS_GPRS(0x1e)); } else { evergreen_reg_set(res, 0x286f8, S_028838_PS_GPRS(0x1e) | S_028838_VS_GPRS(0x1e) | S_028838_GS_GPRS(0x1e) | S_028838_ES_GPRS(0x1e) | S_028838_HS_GPRS(0x1e) | S_028838_LS_GPRS(0x1e)); } if (ctx->chip_class < CAYMAN) { evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, 0xFFFFFFFF); evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, 0xFFFFFFFF); evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, 0xFFFFFFFF); evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, 0xFFFFFFFF); evergreen_reg_set(res, R_008E28_SQ_STATIC_THREAD_MGMT3, 0xFFFFFFFF); evergreen_reg_set(res, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 0); tmp = S_008C1C_NUM_LS_THREADS(num_threads); evergreen_reg_set(res, R_008C1C_SQ_THREAD_RESOURCE_MGMT_2, tmp); evergreen_reg_set(res, R_008C20_SQ_STACK_RESOURCE_MGMT_1, 0); evergreen_reg_set(res, R_008C24_SQ_STACK_RESOURCE_MGMT_2, 0); tmp = S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries); evergreen_reg_set(res, R_008C28_SQ_STACK_RESOURCE_MGMT_3, tmp); } evergreen_reg_set(res, R_0286CC_SPI_PS_IN_CONTROL_0, S_0286CC_LINEAR_GRADIENT_ENA(1)); evergreen_reg_set(res, R_0286D0_SPI_PS_IN_CONTROL_1, 0); evergreen_reg_set(res, R_0286E4_SPI_PS_IN_CONTROL_2, 0); evergreen_reg_set(res, R_0286D8_SPI_INPUT_Z, 0); evergreen_reg_set(res, R_0286E0_SPI_BARYC_CNTL, 1 << 20); tmp = S_0286E8_TID_IN_GROUP_ENA | S_0286E8_TGID_ENA | S_0286E8_DISABLE_INDEX_PACK; evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, tmp); tmp = S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1); evergreen_reg_set(res, R_028A40_VGT_GS_MODE, tmp); evergreen_reg_set(res, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/); evergreen_reg_set(res, R_028800_DB_DEPTH_CONTROL, 0); evergreen_reg_set(res, R_02880C_DB_SHADER_CONTROL, 0); evergreen_reg_set(res, R_028000_DB_RENDER_CONTROL, S_028000_COLOR_DISABLE(1)); evergreen_reg_set(res, R_02800C_DB_RENDER_OVERRIDE, 0); evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, S_0286E8_TID_IN_GROUP_ENA | S_0286E8_TGID_ENA | S_0286E8_DISABLE_INDEX_PACK) ; }