Code example #1
static void
brw_upload_initial_gpu_state(struct brw_context *brw)
{
   /* On platforms with hardware contexts, we can set our initial GPU state
    * right away rather than doing it via state atoms.  This saves a small
    * amount of overhead on every draw call.
    */
   if (!brw->hw_ctx)
      return;

   if (brw->gen == 6)
      brw_emit_post_sync_nonzero_flush(brw);

   brw_upload_invariant_state(brw);

   /* Recommended optimization for Victim Cache eviction in pixel backend. */
   if (brw->gen >= 9) {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
      OUT_BATCH(GEN7_CACHE_MODE_1);
      OUT_BATCH(REG_MASK(GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC) |
                GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC);
      ADVANCE_BATCH();
   }

   if (brw->gen >= 8) {
      gen8_emit_3dstate_sample_pattern(brw);
   }
}
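
A note on the idiom above, since it recurs in all three examples: the length field of an MI command encodes the total DWord count minus two, hence MI_LOAD_REGISTER_IMM | (3 - 2) for a three-DWord packet. The REG_MASK(x) | x payload targets a masked register such as CACHE_MODE_1, where bits 31:16 are per-bit write enables for bits 15:0, so a single write can flip one chicken bit without disturbing its neighbors. Below is a minimal self-contained sketch of those semantics; REG_MASK is modeled on Mesa's macro, and masked_register_write() is purely illustrative, standing in for what the hardware does when it latches the value:

#include <stdint.h>

#define REG_MASK(bit) ((uint32_t)(bit) << 16)   /* write-enable for "bit" */

/* Model of how a masked register latches an MI_LOAD_REGISTER_IMM payload:
 * only bits whose write-enable (bits 31:16) is set take the new value;
 * everything else keeps its previous state. */
static uint32_t
masked_register_write(uint32_t old, uint32_t payload)
{
   uint32_t mask  = payload >> 16;      /* per-bit write enables */
   uint32_t value = payload & 0xffff;   /* new bit values */
   return (old & ~mask) | (value & mask);
}

For example, masked_register_write(old, REG_MASK(1u << 5) | (1u << 5)) sets bit 5 and leaves the other fifteen writable bits of old untouched.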
Code example #2
File: gen7_l3_state.c    Project: etnaviv/mesa
/**
 * Program the hardware to use the specified L3 configuration.
 */
static void
setup_l3_config(struct brw_context *brw, const struct gen_l3_config *cfg)
{
   const bool has_dc = cfg->n[GEN_L3P_DC] || cfg->n[GEN_L3P_ALL];
   const bool has_is = cfg->n[GEN_L3P_IS] || cfg->n[GEN_L3P_RO] ||
                       cfg->n[GEN_L3P_ALL];
   const bool has_c = cfg->n[GEN_L3P_C] || cfg->n[GEN_L3P_RO] ||
                      cfg->n[GEN_L3P_ALL];
   const bool has_t = cfg->n[GEN_L3P_T] || cfg->n[GEN_L3P_RO] ||
                      cfg->n[GEN_L3P_ALL];
   const bool has_slm = cfg->n[GEN_L3P_SLM];

   /* According to the hardware docs, the L3 partitioning can only be changed
    * while the pipeline is completely drained and the caches are flushed,
    * which involves a first PIPE_CONTROL flush that stalls the pipeline...
    */
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_DATA_CACHE_FLUSH |
                               PIPE_CONTROL_NO_WRITE |
                               PIPE_CONTROL_CS_STALL);

   /* ...followed by a second pipelined PIPE_CONTROL that initiates
    * invalidation of the relevant caches.  Note that because RO invalidation
    * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
    * command is processed by the CS) we cannot combine it with the previous
    * stalling flush as the hardware documentation suggests, because that
    * would cause the CS to stall on previous rendering *after* RO
    * invalidation and wouldn't prevent the RO caches from being polluted by
    * concurrent rendering before the stall completes.  This intentionally
    * doesn't implement the SKL+ hardware workaround suggesting to enable CS
    * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
    * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
    * already guarantee that there is no concurrent GPGPU kernel execution
    * (see SKL HSD 2132585).
    */
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                               PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                               PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                               PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                               PIPE_CONTROL_NO_WRITE);

   /* Now send a third stalling flush to make sure that invalidation is
    * complete when the L3 configuration registers are modified.
    */
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_DATA_CACHE_FLUSH |
                               PIPE_CONTROL_NO_WRITE |
                               PIPE_CONTROL_CS_STALL);

   if (brw->gen >= 8) {
      assert(!cfg->n[GEN_L3P_IS] && !cfg->n[GEN_L3P_C] && !cfg->n[GEN_L3P_T]);

      BEGIN_BATCH(3);
      OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));

      /* Set up the L3 partitioning. */
      OUT_BATCH(GEN8_L3CNTLREG);
      OUT_BATCH((has_slm ? GEN8_L3CNTLREG_SLM_ENABLE : 0) |
                SET_FIELD(cfg->n[GEN_L3P_URB], GEN8_L3CNTLREG_URB_ALLOC) |
                SET_FIELD(cfg->n[GEN_L3P_RO], GEN8_L3CNTLREG_RO_ALLOC) |
                SET_FIELD(cfg->n[GEN_L3P_DC], GEN8_L3CNTLREG_DC_ALLOC) |
                SET_FIELD(cfg->n[GEN_L3P_ALL], GEN8_L3CNTLREG_ALL_ALLOC));

      ADVANCE_BATCH();

   } else {
      assert(!cfg->n[GEN_L3P_ALL]);

      /* When enabled, SLM only uses a portion of the L3 on half of the
       * banks; the matching space on the remaining banks has to be allocated
       * to a client (URB for all validated configurations) set to the
       * lower-bandwidth 2-bank address hashing mode.
       */
      const bool urb_low_bw = has_slm && !brw->is_baytrail;
      assert(!urb_low_bw || cfg->n[GEN_L3P_URB] == cfg->n[GEN_L3P_SLM]);

      /* Minimum number of ways that can be allocated to the URB. */
      const unsigned n0_urb = (brw->is_baytrail ? 32 : 0);
      assert(cfg->n[GEN_L3P_URB] >= n0_urb);

      BEGIN_BATCH(7);
      OUT_BATCH(MI_LOAD_REGISTER_IMM | (7 - 2));

      /* Demote any clients with no ways assigned to LLC. */
      OUT_BATCH(GEN7_L3SQCREG1);
      OUT_BATCH((brw->is_haswell ? HSW_L3SQCREG1_SQGHPCI_DEFAULT :
                 brw->is_baytrail ? VLV_L3SQCREG1_SQGHPCI_DEFAULT :
                 IVB_L3SQCREG1_SQGHPCI_DEFAULT) |
                (has_dc ? 0 : GEN7_L3SQCREG1_CONV_DC_UC) |
                (has_is ? 0 : GEN7_L3SQCREG1_CONV_IS_UC) |
                (has_c ? 0 : GEN7_L3SQCREG1_CONV_C_UC) |
                (has_t ? 0 : GEN7_L3SQCREG1_CONV_T_UC));

      /* Set up the L3 partitioning. */
      OUT_BATCH(GEN7_L3CNTLREG2);
      OUT_BATCH((has_slm ? GEN7_L3CNTLREG2_SLM_ENABLE : 0) |
                SET_FIELD(cfg->n[GEN_L3P_URB] - n0_urb, GEN7_L3CNTLREG2_URB_ALLOC) |
                (urb_low_bw ? GEN7_L3CNTLREG2_URB_LOW_BW : 0) |
                SET_FIELD(cfg->n[GEN_L3P_ALL], GEN7_L3CNTLREG2_ALL_ALLOC) |
                SET_FIELD(cfg->n[GEN_L3P_RO], GEN7_L3CNTLREG2_RO_ALLOC) |
                SET_FIELD(cfg->n[GEN_L3P_DC], GEN7_L3CNTLREG2_DC_ALLOC));
      OUT_BATCH(GEN7_L3CNTLREG3);
      OUT_BATCH(SET_FIELD(cfg->n[GEN_L3P_IS], GEN7_L3CNTLREG3_IS_ALLOC) |
                SET_FIELD(cfg->n[GEN_L3P_C], GEN7_L3CNTLREG3_C_ALLOC) |
                SET_FIELD(cfg->n[GEN_L3P_T], GEN7_L3CNTLREG3_T_ALLOC));

      ADVANCE_BATCH();

      if (brw->is_haswell && brw->screen->cmd_parser_version >= 4) {
         /* Enable L3 atomics on HSW if we have a DC partition; otherwise keep
          * them disabled to avoid crashing the system hard.
          */
         BEGIN_BATCH(5);
         OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
         OUT_BATCH(HSW_SCRATCH1);
         OUT_BATCH(has_dc ? 0 : HSW_SCRATCH1_L3_ATOMIC_DISABLE);
         OUT_BATCH(HSW_ROW_CHICKEN3);
         OUT_BATCH(REG_MASK(HSW_ROW_CHICKEN3_L3_ATOMIC_DISABLE) |
                   (has_dc ? 0 : HSW_ROW_CHICKEN3_L3_ATOMIC_DISABLE));
         ADVANCE_BATCH();
      }
   }
}
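
The register payloads in both branches above are packed with Mesa's SET_FIELD() macro, which pastes _SHIFT and _MASK suffixes onto the field name to place a value into its bit range. Here is a minimal sketch of the same operation written out as an explicit function; the URB_ALLOC layout below is illustrative only, not the real GEN7_L3CNTLREG2 encoding:

#include <assert.h>
#include <stdint.h>

/* Illustrative field: a 7-bit URB way count in register bits 7:1.
 * Real layouts come from the hardware documentation. */
#define URB_ALLOC_SHIFT 1
#define URB_ALLOC_MASK  (0x7fu << URB_ALLOC_SHIFT)

/* Shift a value into its field and check that it fits, mirroring what
 * SET_FIELD(value, URB_ALLOC) expands to. */
static uint32_t
set_field(uint32_t value, uint32_t mask, unsigned shift)
{
   uint32_t bits = value << shift;
   assert((bits & ~mask) == 0);   /* value must not overflow the field */
   return bits;
}

Building the full register image is then just OR-ing the packed fields together, as in the OUT_BATCH() calls above.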
Code example #3
static void
brw_upload_initial_gpu_state(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_compiler *compiler = brw->screen->compiler;

   /* On platforms with hardware contexts, we can set our initial GPU state
    * right away rather than doing it via state atoms.  This saves a small
    * amount of overhead on every draw call.
    */
   if (!brw->hw_ctx)
      return;

   if (devinfo->gen == 6)
      brw_emit_post_sync_nonzero_flush(brw);

   brw_upload_invariant_state(brw);

   if (devinfo->gen == 11) {
      /* Bit 5 "Headerless Message for Pre-emptable Contexts" in the SAMPLER
       * MODE register defaults to 0, which means headerless sampler messages
       * are not allowed for pre-emptable contexts.  Set bit 5 to 1 to allow
       * them.
       */
      brw_load_register_imm32(brw, GEN11_SAMPLER_MODE,
                              HEADERLESS_MESSAGE_FOR_PREEMPTABLE_CONTEXTS_MASK |
                              HEADERLESS_MESSAGE_FOR_PREEMPTABLE_CONTEXTS);

      /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
       * HALF_SLICE_CHICKEN7 register.
       */
      brw_load_register_imm32(brw, HALF_SLICE_CHICKEN7,
                              TEXEL_OFFSET_FIX_MASK |
                              TEXEL_OFFSET_FIX_ENABLE);

      /* WA_1406697149: Bit 9 "Error Detection Behavior Control" must be set
       * in L3CNTLREG register. The default setting of the bit is not the
       * desirable behavior.
       */
      brw_load_register_imm32(brw, GEN8_L3CNTLREG,
                              GEN8_L3CNTLREG_EDBC_NO_HANG);

      /* WA_2204188704: Pixel Shader Panic dispatch must be disabled. */
      brw_load_register_imm32(brw, COMMON_SLICE_CHICKEN3,
                              PS_THREAD_PANIC_DISPATCH_MASK |
                              PS_THREAD_PANIC_DISPATCH);

      /* WaEnableStateCacheRedirectToCS:icl */
      brw_load_register_imm32(brw, SLICE_COMMON_ECO_CHICKEN1,
                              GEN11_STATE_CACHE_REDIRECT_TO_CS_SECTION_ENABLE |
                              REG_MASK(GEN11_STATE_CACHE_REDIRECT_TO_CS_SECTION_ENABLE));
   }

   if (devinfo->gen == 10 || devinfo->gen == 11) {
      /* From gen10 workaround table in h/w specs:
       *
       *    "On 3DSTATE_3D_MODE, driver must always program bits 31:16 of DW1
       *     a value of 0xFFFF"
       *
       * This means that we end up setting the entire 3D_MODE state. Bits
       * in this register control things such as slice hashing and we want
       * the default values of zero at the moment.
       */
      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_3D_MODE << 16 | (2 - 2));
      OUT_BATCH(0xFFFF << 16);
      ADVANCE_BATCH();
   }

   if (devinfo->gen == 9) {
      /* Recommended optimizations for Victim Cache eviction and floating
       * point blending.
       */
      brw_load_register_imm32(brw, GEN7_CACHE_MODE_1,
                              REG_MASK(GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE) |
                              REG_MASK(GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC) |
                              GEN9_FLOAT_BLEND_OPTIMIZATION_ENABLE |
                              GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC);

      if (gen_device_info_is_9lp(devinfo)) {
         brw_load_register_imm32(brw, GEN7_GT_MODE,
                                 GEN9_SUBSLICE_HASHING_MASK_BITS |
                                 GEN9_SUBSLICE_HASHING_16x16);
      }
   }

   if (devinfo->gen >= 8) {
      gen8_emit_3dstate_sample_pattern(brw);

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_WM_HZ_OP << 16 | (5 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();

      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_WM_CHROMAKEY << 16 | (2 - 2));
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
    * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
    *
    * This is only safe on kernels with context isolation support.
    */
   if (!compiler->constant_buffer_0_is_relative) {
      if (devinfo->gen >= 9) {
         BEGIN_BATCH(3);
         OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
         OUT_BATCH(CS_DEBUG_MODE2);
         OUT_BATCH(REG_MASK(CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE) |
                   CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE);
         ADVANCE_BATCH();
      } else if (devinfo->gen == 8) {
         BEGIN_BATCH(3);
         OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
         OUT_BATCH(INSTPM);
         OUT_BATCH(REG_MASK(INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE) |
                   INSTPM_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE);
         ADVANCE_BATCH();
      }
   }

   brw->object_preemption = false;

   if (devinfo->gen >= 10)
      brw_enable_obj_preemption(brw, true);
}
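
Compared with code example #1, this newer revision replaces most of the open-coded BEGIN_BATCH()/OUT_BATCH() sequences with the brw_load_register_imm32() helper. A plausible sketch of that helper, under the assumption that it simply wraps the same three-DWord MI_LOAD_REGISTER_IMM packet emitted by hand in the earlier examples (the actual implementation lives in Mesa's batchbuffer code):

static void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));   /* DWord count - 2 */
   OUT_BATCH(reg);                              /* register offset */
   OUT_BATCH(imm);                              /* immediate value */
   ADVANCE_BATCH();
}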