static void brw_upload_initial_gpu_state(struct brw_context *brw) { /* On platforms with hardware contexts, we can set our initial GPU state * right away rather than doing it via state atoms. This saves a small * amount of overhead on every draw call. */ if (!brw->hw_ctx) return; if (brw->gen == 6) brw_emit_post_sync_nonzero_flush(brw); brw_upload_invariant_state(brw); /* Recommended optimization for Victim Cache eviction in pixel backend. */ if (brw->gen >= 9) { BEGIN_BATCH(3); OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); OUT_BATCH(GEN7_CACHE_MODE_1); OUT_BATCH(REG_MASK(GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC) | GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC); ADVANCE_BATCH(); } if (brw->gen >= 8) { gen8_emit_3dstate_sample_pattern(brw); } }
/** * Program the hardware to use the specified L3 configuration. */ static void setup_l3_config(struct brw_context *brw, const struct gen_l3_config *cfg) { const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL]; const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] || cfg->n[GEN_L3P_ALL]; const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] || cfg->n[GEN_L3P_ALL]; const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] || cfg->n[GEN_L3P_ALL]; const bool has_slm = cfg->n[GEN_L3P_SLM]; /* According to the hardware docs, the L3 partitioning can only be changed * while the pipeline is completely drained and the caches are flushed, * which involves a first PIPE_CONTROL flush which stalls the pipeline... */ brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_CS_STALL); /* ...followed by a second pipelined PIPE_CONTROL that initiates * invalidation of the relevant caches. Note that because RO invalidation * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL * command is processed by the CS) we cannot combine it with the previous * stalling flush as the hardware documentation suggests, because that * would cause the CS to stall on previous rendering *after* RO * invalidation and wouldn't prevent the RO caches from being polluted by * concurrent rendering before the stall completes. This intentionally * doesn't implement the SKL+ hardware workaround suggesting to enable CS * stall on PIPE_CONTROLs with the texture cache invalidation bit set for * GPGPU workloads because the previous and subsequent PIPE_CONTROLs * already guarantee that there is no concurrent GPGPU kernel execution * (see SKL HSD 2132585). */ brw_emit_pipe_control_flush(brw, PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | PIPE_CONTROL_CONST_CACHE_INVALIDATE | PIPE_CONTROL_INSTRUCTION_INVALIDATE | PIPE_CONTROL_STATE_CACHE_INVALIDATE | PIPE_CONTROL_NO_WRITE); /* Now send a third stalling flush to make sure that invalidation is * complete when the L3 configuration registers are modified. */ brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_CS_STALL); if (brw->gen >= 8) { assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]); BEGIN_BATCH(3); OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); /* Set up the L3 partitioning. */ OUT_BATCH(GEN8_L3CNTLREG); OUT_BATCH((has_slm ? GEN8_L3CNTLREG_SLM_ENABLE : 0) | SET_FIELD(cfg->n[GEN_L3P_URB], GEN8_L3CNTLREG_URB_ALLOC) | SET_FIELD(cfg->n[GEN_L3P_RO], GEN8_L3CNTLREG_RO_ALLOC) | SET_FIELD(cfg->n[GEN_L3P_DC], GEN8_L3CNTLREG_DC_ALLOC) | SET_FIELD(cfg->n[GEN_L3P_ALL], GEN8_L3CNTLREG_ALL_ALLOC)); ADVANCE_BATCH(); } else { assert(!cfg->n[GEN_L3P_ALL]); /* When enabled SLM only uses a portion of the L3 on half of the banks, * the matching space on the remaining banks has to be allocated to a * client (URB for all validated configurations) set to the * lower-bandwidth 2-bank address hashing mode. */ const bool urb_low_bw = has_slm && !brw->is_baytrail; assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]); /* Minimum number of ways that can be allocated to the URB. */ const unsigned n0_urb = (brw->is_baytrail ? 32 : 0); assert(cfg->n[GEN_L3P_URB] >= n0_urb); BEGIN_BATCH(7); OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2)); /* Demote any clients with no ways assigned to LLC. */ OUT_BATCH(GEN7_L3SQCREG1); OUT_BATCH((brw->is_haswell ? HSW_L3SQCREG1_SQGHPCI_DEFAULT : brw->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT : IVB_L3SQCREG1_SQGHPCI_DEFAULT) | (has_dc ? 0 : GEN7_L3SQCREG1_CONV_DC_UC) | (has_is ? 0 : GEN7_L3SQCREG1_CONV_IS_UC) | (has_c ? 0 : GEN7_L3SQCREG1_CONV_C_UC) | (has_t ? 0 : GEN7_L3SQCREG1_CONV_T_UC)); /* Set up the L3 partitioning. */ OUT_BATCH(GEN7_L3CNTLREG2); OUT_BATCH((has_slm ? GEN7_L3CNTLREG2_SLM_ENABLE : 0) | SET_FIELD(cfg->n[GEN_L3P_URB] - n0_urb, GEN7_L3CNTLREG2_URB_ALLOC) | (urb_low_bw ? GEN7_L3CNTLREG2_URB_LOW_BW : 0) | SET_FIELD(cfg->n[GEN_L3P_ALL], GEN7_L3CNTLREG2_ALL_ALLOC) | SET_FIELD(cfg->n[GEN_L3P_RO], GEN7_L3CNTLREG2_RO_ALLOC) | SET_FIELD(cfg->n[GEN_L3P_DC], GEN7_L3CNTLREG2_DC_ALLOC)); OUT_BATCH(GEN7_L3CNTLREG3); OUT_BATCH(SET_FIELD(cfg->n[GEN_L3P_IS], GEN7_L3CNTLREG3_IS_ALLOC) | SET_FIELD(cfg->n[GEN_L3P_C], GEN7_L3CNTLREG3_C_ALLOC) | SET_FIELD(cfg->n[GEN_L3P_T], GEN7_L3CNTLREG3_T_ALLOC)); ADVANCE_BATCH(); if (brw->is_haswell && brw->screen->cmd_parser_version >= 4) { /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep * them disabled to avoid crashing the system hard. */ BEGIN_BATCH(5); OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2)); OUT_BATCH(HSW_SCRATCH1); OUT_BATCH(has_dc ? 0 : HSW_SCRATCH1_L3_ATOMIC_DISABLE); OUT_BATCH(HSW_ROW_CHICKEN3); OUT_BATCH(REG_MASK(HSW_ROW_CHICKEN3_L3_ATOMIC_DISABLE) | (has_dc ? 0 : HSW_ROW_CHICKEN3_L3_ATOMIC_DISABLE)); ADVANCE_BATCH(); } } }
static void brw_upload_initial_gpu_state(struct brw_context *brw) { const struct gen_device_info *devinfo = &brw->screen->devinfo; const struct brw_compiler *compiler = brw->screen->compiler; /* On platforms with hardware contexts, we can set our initial GPU state * right away rather than doing it via state atoms. This saves a small * amount of overhead on every draw call. */ if (!brw->hw_ctx) return; if (devinfo->gen == 6) brw_emit_post_sync_nonzero_flush(brw); brw_upload_invariant_state(brw); if (devinfo->gen == 11) { /* The default behavior of bit 5 "Headerless Message for Pre-emptable * Contexts" in SAMPLER MODE register is set to 0, which means * headerless sampler messages are not allowed for pre-emptable * contexts. Set the bit 5 to 1 to allow them. */ brw_load_register_imm32(brw, GEN11_SAMPLER_MODE, HEADERLESS_MESSAGE_FOR_PREEMPTABLE_CONTEXTS_MASK | HEADERLESS_MESSAGE_FOR_PREEMPTABLE_CONTEXTS); /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in * HALF_SLICE_CHICKEN7 register. */ brw_load_register_imm32(brw, HALF_SLICE_CHICKEN7, TEXEL_OFFSET_FIX_MASK | TEXEL_OFFSET_FIX_ENABLE); /* WA_1406697149: Bit 9 "Error Detection Behavior Control" must be set * in L3CNTLREG register. The default setting of the bit is not the * desirable behavior. */ brw_load_register_imm32(brw, GEN8_L3CNTLREG, GEN8_L3CNTLREG_EDBC_NO_HANG); /* WA_2204188704: Pixel Shader Panic dispatch must be disabled. */ brw_load_register_imm32(brw, COMMON_SLICE_CHICKEN3, PS_THREAD_PANIC_DISPATCH_MASK | PS_THREAD_PANIC_DISPATCH); /* WaEnableStateCacheRedirectToCS:icl */ brw_load_register_imm32(brw, SLICE_COMMON_ECO_CHICKEN1, GEN11_STATE_CACHE_REDIRECT_TO_CS_SECTION_ENABLE | REG_MASK(GEN11_STATE_CACHE_REDIRECT_TO_CS_SECTION_ENABLE)); } if (devinfo->gen == 10 || devinfo->gen == 11) { /* From gen10 workaround table in h/w specs: * * "On 3DSTATE_3D_MODE, driver must always program bits 31:16 of DW1 * a value of 0xFFFF" * * This means that we end up setting the entire 3D_MODE state. Bits * in this register control things such as slice hashing and we want * the default values of zero at the moment. */ BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_3D_MODE << 16 | (2 - 2)); OUT_BATCH(0xFFFF << 16); ADVANCE_BATCH(); } if (devinfo->gen == 9) { /* Recommended optimizations for Victim Cache eviction and floating * point blending. */ brw_load_register_imm32(brw, GEN7_CACHE_MODE_1, REG_MASK(GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE) | REG_MASK(GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC) | GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE | GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC); if (gen_device_info_is_9lp(devinfo)) { brw_load_register_imm32(brw, GEN7_GT_MODE, GEN9_SUBSLICE_HASHING_MASK_BITS | GEN9_SUBSLICE_HASHING_16x16); } } if (devinfo->gen >= 8) { gen8_emit_3dstate_sample_pattern(brw); BEGIN_BATCH(5); OUT_BATCH(_3DSTATE_WM_HZ_OP << 16 | (5 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_WM_CHROMAKEY << 16 | (2 - 2)); OUT_BATCH(0); ADVANCE_BATCH(); } /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address. * * This is only safe on kernels with context isolation support. */ if (!compiler->constant_buffer_0_is_relative) { if (devinfo->gen >= 9) { BEGIN_BATCH(3); OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); OUT_BATCH(CS_DEBUG_MODE2); OUT_BATCH(REG_MASK(CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE) | CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE); ADVANCE_BATCH(); } else if (devinfo->gen == 8) { BEGIN_BATCH(3); OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2)); OUT_BATCH(INSTPM); OUT_BATCH(REG_MASK(INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE) | INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE); ADVANCE_BATCH(); } } brw->object_preemption = false; if (devinfo->gen >= 10) brw_enable_obj_preemption(brw, true); }