Example #1
0
static void
gen8_upload_3dstate_so_buffers(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;
   struct brw_transform_feedback_object *brw_obj =
      (struct brw_transform_feedback_object *) xfb_obj;

   /* Set up the up to 4 output buffers.  These are the ranges defined in the
    * gl_transform_feedback_object.
    */
   for (int i = 0; i < 4; i++) {
      struct intel_buffer_object *bufferobj =
         intel_buffer_object(xfb_obj->Buffers[i]);

      if (!bufferobj) {
         BEGIN_BATCH(8);
         OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2));
         OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT));
         OUT_BATCH(0);
         OUT_BATCH(0);
         OUT_BATCH(0);
         OUT_BATCH(0);
         OUT_BATCH(0);
         OUT_BATCH(0);
         ADVANCE_BATCH();
         continue;
      }

      uint32_t start = xfb_obj->Offset[i];
      assert(start % 4 == 0);
      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
      drm_intel_bo *bo =
         intel_bufferobj_buffer(brw, bufferobj, start, end - start);
      assert(end <= bo->size);

      perf_debug("Missing MOCS setup for 3DSTATE_SO_BUFFER.");

      BEGIN_BATCH(8);
      OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2));
      OUT_BATCH(GEN8_SO_BUFFER_ENABLE | (i << SO_BUFFER_INDEX_SHIFT) |
                GEN8_SO_BUFFER_OFFSET_WRITE_ENABLE |
                GEN8_SO_BUFFER_OFFSET_ADDRESS_ENABLE |
                (BDW_MOCS_WB << 22));
      OUT_RELOC64(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start);
      OUT_BATCH(xfb_obj->Size[i] / 4 - 1);
      OUT_RELOC64(brw_obj->offset_bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  i * sizeof(uint32_t));
      if (brw_obj->zero_offsets)
         OUT_BATCH(0); /* Zero out the offset and write that to offset_bo */
      else
         OUT_BATCH(0xFFFFFFFF); /* Use offset_bo as the "Stream Offset." */
      ADVANCE_BATCH();
   }
   brw_obj->zero_offsets = false;
}
Example #2
0
static void
gen8_upload_ds_state(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   const struct brw_stage_state *stage_state = &brw->tes.base;
   /* BRW_NEW_TESS_PROGRAMS */
   bool active = brw->tess_eval_program;

   /* BRW_NEW_TES_PROG_DATA */
   const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data;
   const struct brw_vue_prog_data *vue_prog_data = &tes_prog_data->base;
   const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;

   if (active) {
      BEGIN_BATCH(9);
      OUT_BATCH(_3DSTATE_DS << 16 | (9 - 2));
      OUT_BATCH(stage_state->prog_offset);
      OUT_BATCH(0);
      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
                          GEN7_DS_SAMPLER_COUNT) |
                SET_FIELD(prog_data->binding_table.size_bytes / 4,
                          GEN7_DS_BINDING_TABLE_ENTRY_COUNT));
      if (prog_data->total_scratch) {
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                     ffs(prog_data->total_scratch) - 11);
      } else {
         OUT_BATCH(0);
         OUT_BATCH(0);
      }
      OUT_BATCH(SET_FIELD(prog_data->dispatch_grf_start_reg,
                          GEN7_DS_DISPATCH_START_GRF) |
                SET_FIELD(vue_prog_data->urb_read_length,
                          GEN7_DS_URB_READ_LENGTH));

      OUT_BATCH(GEN7_DS_ENABLE |
                GEN7_DS_STATISTICS_ENABLE |
                (brw->max_ds_threads - 1) << HSW_DS_MAX_THREADS_SHIFT |
                (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ?
                 GEN7_DS_SIMD8_DISPATCH_ENABLE : 0) |
                (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ?
                 GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0));
      OUT_BATCH(SET_FIELD(ctx->Transform.ClipPlanesEnabled,
                          GEN8_DS_USER_CLIP_DISTANCE));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(9);
      OUT_BATCH(_3DSTATE_DS << 16 | (9 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
   brw->tes.enabled = active;
}
Example #3
0
static void
upload_vs_state(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   const struct brw_stage_state *stage_state = &brw->vs.base;
   uint32_t floating_point_mode = 0;

   /* CACHE_NEW_VS_PROG */
   const struct brw_vec4_prog_data *prog_data = &brw->vs.prog_data->base;

   /* CACHE_NEW_SAMPLER */
   BEGIN_BATCH(2);
   OUT_BATCH(_3DSTATE_SAMPLER_STATE_POINTERS_VS << 16 | (2 - 2));
   OUT_BATCH(stage_state->sampler_offset);
   ADVANCE_BATCH();

   gen8_upload_constant_state(brw, stage_state, true /* active */,
                              _3DSTATE_CONSTANT_VS);

   /* Use ALT floating point mode for ARB vertex programs, because they
    * require 0^0 == 1.
    */
   if (ctx->Shader.CurrentProgram[MESA_SHADER_VERTEX] == NULL)
      floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT;

   BEGIN_BATCH(9);
   OUT_BATCH(_3DSTATE_VS << 16 | (9 - 2));
   OUT_BATCH(stage_state->prog_offset);
   OUT_BATCH(0);
   OUT_BATCH(floating_point_mode |
             ((ALIGN(stage_state->sampler_count, 4) / 4) <<
               GEN6_VS_SAMPLER_COUNT_SHIFT) |
             ((prog_data->base.binding_table.size_bytes / 4) <<
               GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));

   if (prog_data->total_scratch) {
      OUT_RELOC64(stage_state->scratch_bo,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                  ffs(prog_data->total_scratch) - 11);
   } else {
      OUT_BATCH(0);
      OUT_BATCH(0);
   }

   OUT_BATCH((prog_data->dispatch_grf_start_reg <<
              GEN6_VS_DISPATCH_START_GRF_SHIFT) |
             (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
             (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));

   OUT_BATCH(((brw->max_vs_threads - 1) << HSW_VS_MAX_THREADS_SHIFT) |
             GEN6_VS_STATISTICS_ENABLE |
             GEN6_VS_ENABLE);

   /* _NEW_TRANSFORM */
   OUT_BATCH((ctx->Transform.ClipPlanesEnabled <<
              GEN8_VS_USER_CLIP_DISTANCE_SHIFT));
   ADVANCE_BATCH();
}
Example #4
0
/**
 * Used to initialize the alpha value of an ARGB8888 miptree after copying
 * into it from an XRGB8888 source.
 *
 * This is very common with glCopyTexImage2D().  Note that the coordinates are
 * relative to the start of the miptree, not relative to a slice within the
 * miptree.
 */
static void
intel_miptree_set_alpha_to_one(struct brw_context *brw,
                              struct intel_mipmap_tree *mt,
                              int x, int y, int width, int height)
{
   uint32_t BR13, CMD;
   int pitch, cpp;
   drm_intel_bo *aper_array[2];

   pitch = mt->pitch;
   cpp = mt->cpp;

   DBG("%s dst:buf(%p)/%d %d,%d sz:%dx%d\n",
       __FUNCTION__, mt->bo, pitch, x, y, width, height);

   BR13 = br13_for_cpp(cpp) | 0xf0 << 16;
   CMD = XY_COLOR_BLT_CMD;
   CMD |= XY_BLT_WRITE_ALPHA;

   if (mt->tiling != I915_TILING_NONE) {
      CMD |= XY_DST_TILED;
      pitch /= 4;
   }
   BR13 |= pitch;

   /* do space check before going any further */
   aper_array[0] = brw->batch.bo;
   aper_array[1] = mt->bo;

   if (drm_intel_bufmgr_check_aperture_space(aper_array,
					     ARRAY_SIZE(aper_array)) != 0) {
      intel_batchbuffer_flush(brw);
   }

   unsigned length = brw->gen >= 8 ? 7 : 6;
   bool dst_y_tiled = mt->tiling == I915_TILING_Y;

   BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, false);
   OUT_BATCH(CMD | (length - 2));
   OUT_BATCH(BR13);
   OUT_BATCH(SET_FIELD(y, BLT_Y) | SET_FIELD(x, BLT_X));
   OUT_BATCH(SET_FIELD(y + height, BLT_Y) | SET_FIELD(x + width, BLT_X));
   if (brw->gen >= 8) {
      OUT_RELOC64(mt->bo,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                  0);
   } else {
      OUT_RELOC(mt->bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                0);
   }
   OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
   ADVANCE_BATCH_TILED(dst_y_tiled, false);

   intel_batchbuffer_emit_mi_flush(brw);
}
Example #5
0
static void
gen8_upload_hs_state(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_stage_state *stage_state = &brw->tcs.base;
   /* BRW_NEW_TESS_PROGRAMS */
   bool active = brw->tess_eval_program;
   /* BRW_NEW_TCS_PROG_DATA */
   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
   const struct brw_tcs_prog_data *tcs_prog_data =
      brw_tcs_prog_data(stage_state->prog_data);

   if (active) {
      BEGIN_BATCH(9);
      OUT_BATCH(_3DSTATE_HS << 16 | (9 - 2));
      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
                          GEN7_HS_SAMPLER_COUNT) |
                SET_FIELD(prog_data->binding_table.size_bytes / 4,
                          GEN7_HS_BINDING_TABLE_ENTRY_COUNT));
      OUT_BATCH(GEN7_HS_ENABLE |
                GEN7_HS_STATISTICS_ENABLE |
                (devinfo->max_tcs_threads - 1) << GEN8_HS_MAX_THREADS_SHIFT |
                SET_FIELD(tcs_prog_data->instances - 1,
                          GEN7_HS_INSTANCE_COUNT));
      OUT_BATCH(stage_state->prog_offset);
      OUT_BATCH(0);
      if (prog_data->total_scratch) {
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                     ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0);
         OUT_BATCH(0);
      }
      OUT_BATCH(GEN7_HS_INCLUDE_VERTEX_HANDLES |
                SET_FIELD(prog_data->dispatch_grf_start_reg,
                          GEN7_HS_DISPATCH_START_GRF));
      OUT_BATCH(0); /* MBZ */
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(9);
      OUT_BATCH(_3DSTATE_HS << 16 | (9 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
   brw->tcs.enabled = active;
}
Example #6
0
static void
upload_vs_state(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   const struct brw_stage_state *stage_state = &brw->vs.base;
   uint32_t floating_point_mode = 0;

   /* BRW_NEW_VS_PROG_DATA */
   const struct brw_vue_prog_data *prog_data = &brw->vs.prog_data->base;

   assert(prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
          prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);

   if (prog_data->base.use_alt_mode)
      floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT;

   BEGIN_BATCH(9);
   OUT_BATCH(_3DSTATE_VS << 16 | (9 - 2));
   OUT_BATCH(stage_state->prog_offset);
   OUT_BATCH(0);
   OUT_BATCH(floating_point_mode |
             ((ALIGN(stage_state->sampler_count, 4) / 4) <<
               GEN6_VS_SAMPLER_COUNT_SHIFT) |
             ((prog_data->base.binding_table.size_bytes / 4) <<
               GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));

   if (prog_data->base.total_scratch) {
      OUT_RELOC64(stage_state->scratch_bo,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                  ffs(prog_data->base.total_scratch) - 11);
   } else {
      OUT_BATCH(0);
      OUT_BATCH(0);
   }

   OUT_BATCH((prog_data->base.dispatch_grf_start_reg <<
              GEN6_VS_DISPATCH_START_GRF_SHIFT) |
             (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
             (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));

   uint32_t simd8_enable = prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ?
      GEN8_VS_SIMD8_ENABLE : 0;
   OUT_BATCH(((brw->max_vs_threads - 1) << HSW_VS_MAX_THREADS_SHIFT) |
             GEN6_VS_STATISTICS_ENABLE |
             simd8_enable |
             GEN6_VS_ENABLE);

   /* _NEW_TRANSFORM */
   OUT_BATCH(prog_data->cull_distance_mask |
             (ctx->Transform.ClipPlanesEnabled <<
              GEN8_VS_USER_CLIP_DISTANCE_SHIFT));
   ADVANCE_BATCH();
}
Example #7
0
/**
 * Enable hardware binding tables and set up the binding table pool.
 */
void
gen7_enable_hw_binding_tables(struct brw_context *brw)
{
   if (!brw->use_resource_streamer)
      return;

   if (!brw->hw_bt_pool.bo) {
      /* We use a single re-usable buffer object for the lifetime of the
       * context and size it to maximum allowed binding tables that can be
       * programmed per batch:
       *
       * From the Haswell PRM, Volume 7: 3D Media GPGPU,
       * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note:
       * "A maximum of 16,383 Binding tables are allowed in any batch buffer"
       */
      static const int max_size = 16383 * 4;
      brw->hw_bt_pool.bo = drm_intel_bo_alloc(brw->bufmgr, "hw_bt",
                                              max_size, 64);
      brw->hw_bt_pool.next_offset = 0;
   }

   /* From the Haswell PRM, Volume 7: 3D Media GPGPU,
    * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note:
    *
    * "When switching between HW and SW binding table generation, SW must
    * issue a state cache invalidate."
    */
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE);

   int pkt_len = brw->gen >= 8 ? 4 : 3;
   uint32_t dw1 = BRW_HW_BINDING_TABLE_ENABLE;
   if (brw->is_haswell) {
      dw1 |= SET_FIELD(GEN7_MOCS_L3, GEN7_HW_BT_POOL_MOCS) |
             HSW_BT_POOL_ALLOC_MUST_BE_ONE;
   } else if (brw->gen >= 8) {
      dw1 |= BDW_MOCS_WB;
   }

   BEGIN_BATCH(pkt_len);
   OUT_BATCH(_3DSTATE_BINDING_TABLE_POOL_ALLOC << 16 | (pkt_len - 2));
   if (brw->gen >= 8) {
      OUT_RELOC64(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1);
      OUT_BATCH(brw->hw_bt_pool.bo->size);
   } else {
      OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1);
      OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0,
             brw->hw_bt_pool.bo->size);
   }
   ADVANCE_BATCH();
}
Example #8
0
/**
 * Define the base addresses which some state is referenced from.
 */
static void upload_state_base_address(struct brw_context *brw)
{
    perf_debug("Missing MOCS setup for STATE_BASE_ADDRESS.");

    BEGIN_BATCH(16);
    OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (16 - 2));
    /* General state base address: stateless DP read/write requests */
    OUT_BATCH(BDW_MOCS_WB << 4 | 1);
    OUT_BATCH(0);
    OUT_BATCH(BDW_MOCS_WB << 16);
    /* Surface state base address: */
    OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
                BDW_MOCS_WB << 4 | 1);
    /* Dynamic state base address: */
    OUT_RELOC64(brw->batch.bo,
                I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
                BDW_MOCS_WB << 4 | 1);
    /* Indirect object base address: MEDIA_OBJECT data */
    OUT_BATCH(BDW_MOCS_WB << 4 | 1);
    OUT_BATCH(0);
    /* Instruction base address: shader kernels (incl. SIP) */
    OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
                BDW_MOCS_WB << 4 | 1);

    /* General state buffer size */
    OUT_BATCH(0xfffff001);
    /* Dynamic state buffer size */
    OUT_BATCH(ALIGN(brw->batch.bo->size, 4096) | 1);
    /* Indirect object upper bound */
    OUT_BATCH(0xfffff001);
    /* Instruction access upper bound */
    OUT_BATCH(ALIGN(brw->cache.bo->size, 4096) | 1);
    ADVANCE_BATCH();

    brw->state.dirty.brw |= BRW_NEW_STATE_BASE_ADDRESS;
}
Example #9
0
static void
gen8_emit_index_buffer(struct brw_context *brw)
{
   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
   uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;

   if (index_buffer == NULL)
      return;

   BEGIN_BATCH(5);
   OUT_BATCH(CMD_INDEX_BUFFER << 16 | (5 - 2));
   OUT_BATCH(brw_get_index_type(index_buffer->type) | mocs_wb);
   OUT_RELOC64(brw->ib.bo, I915_GEM_DOMAIN_VERTEX, 0, 0);
   OUT_BATCH(brw->ib.bo->size);
   ADVANCE_BATCH();
}
Example #10
0
static void
gen6_blorp_emit_vertex_buffer_state(struct brw_context *brw,
                                    unsigned num_elems,
                                    unsigned vbo_size,
                                    uint32_t vertex_offset)
{
   /* 3DSTATE_VERTEX_BUFFERS */
   const int num_buffers = 1;
   const int batch_length = 1 + 4 * num_buffers;

   uint32_t dw0 = GEN6_VB0_ACCESS_VERTEXDATA |
                  (num_elems * sizeof(float)) << BRW_VB0_PITCH_SHIFT;

   if (brw->gen >= 7)
      dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;

   switch (brw->gen) {
   case 7:
      dw0 |= GEN7_MOCS_L3 << 16;
      break;
   case 8:
      dw0 |= BDW_MOCS_WB << 16;
      break;
   case 9:
      dw0 |= SKL_MOCS_WB << 16;
      break;
   }

   BEGIN_BATCH(batch_length);
   OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (batch_length - 2));
   OUT_BATCH(dw0);
   if (brw->gen >= 8) {
      OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_VERTEX, 0, vertex_offset);
      OUT_BATCH(vbo_size);
   } else {
      /* start address */
      OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_VERTEX, 0,
                vertex_offset);
      /* end address */
      OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_VERTEX, 0,
                vertex_offset + vbo_size - 1);
      OUT_BATCH(0);
   }
   ADVANCE_BATCH();
}
Example #11
0
/**
 * Emit a PIPE_CONTROL that writes to a buffer object.
 *
 * \p flags should contain one of the following items:
 *  - PIPE_CONTROL_WRITE_IMMEDIATE
 *  - PIPE_CONTROL_WRITE_TIMESTAMP
 *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
 */
void
brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                            drm_intel_bo *bo, uint32_t offset,
                            uint32_t imm_lower, uint32_t imm_upper)
{
   if (brw->gen >= 8) {
      if (brw->gen == 8)
         gen8_add_cs_stall_workaround_bits(&flags);

      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
      OUT_BATCH(flags);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);

      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
       * on later platforms.  We always use PPGTT on Gen7+.
       */
      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(flags);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                gen6_gtt | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   }
}
Example #12
0
static void
brw_upload_cs_state(struct brw_context *brw)
{
   if (!brw->cs.prog_data)
      return;

   uint32_t offset;
   uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
                                                8 * 4, 64, &offset);
   struct brw_stage_state *stage_state = &brw->cs.base;
   struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
   struct brw_stage_prog_data *prog_data = &cs_prog_data->base;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      brw->vtbl.emit_buffer_surface_state(
         brw, &stage_state->surf_offset[
                 prog_data->binding_table.shader_time_start],
         brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW,
         brw->shader_time.bo->size, 1, true);
   }

   uint32_t *bind = (uint32_t*) brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
                                            prog_data->binding_table.size_bytes,
                                            32, &stage_state->bind_bo_offset);

   unsigned threads = get_cs_thread_count(cs_prog_data);

   uint32_t dwords = brw->gen < 8 ? 8 : 9;
   BEGIN_BATCH(dwords);
   OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));

   if (prog_data->total_scratch) {
      if (brw->gen >= 8)
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                     ffs(prog_data->total_scratch) - 11);
      else
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   ffs(prog_data->total_scratch) - 11);
   } else {
      OUT_BATCH(0);
      if (brw->gen >= 8)
         OUT_BATCH(0);
   }

   const uint32_t vfe_num_urb_entries = brw->gen >= 8 ? 2 : 0;
   const uint32_t vfe_gpgpu_mode =
      brw->gen == 7 ? SET_FIELD(1, GEN7_MEDIA_VFE_STATE_GPGPU_MODE) : 0;
   OUT_BATCH(SET_FIELD(brw->max_cs_threads - 1, MEDIA_VFE_STATE_MAX_THREADS) |
             SET_FIELD(vfe_num_urb_entries, MEDIA_VFE_STATE_URB_ENTRIES) |
             SET_FIELD(1, MEDIA_VFE_STATE_RESET_GTW_TIMER) |
             SET_FIELD(1, MEDIA_VFE_STATE_BYPASS_GTW) |
             vfe_gpgpu_mode);

   OUT_BATCH(0);
   const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0;
   OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC));
   OUT_BATCH(0);
   OUT_BATCH(0);
   OUT_BATCH(0);
   ADVANCE_BATCH();

   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
   memcpy(bind, stage_state->surf_offset,
          prog_data->binding_table.size_bytes);

   memset(desc, 0, 8 * 4);

   int dw = 0;
   desc[dw++] = brw->cs.base.prog_offset;
   if (brw->gen >= 8)
      desc[dw++] = 0; /* Kernel Start Pointer High */
   desc[dw++] = 0;
   desc[dw++] = 0;
   desc[dw++] = stage_state->bind_bo_offset;
   desc[dw++] = 0;
   const uint32_t media_threads =
      brw->gen >= 8 ?
      SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
      SET_FIELD(threads, MEDIA_GPGPU_THREAD_COUNT);
   assert(threads <= brw->max_cs_threads);
   desc[dw++] = media_threads;

   BEGIN_BATCH(4);
   OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
   OUT_BATCH(0);
   OUT_BATCH(8 * 4);
   OUT_BATCH(offset);
   ADVANCE_BATCH();
}
Example #13
0
/* Copy BitBlt
 */
static bool
emit_copy_blit(struct brw_context *brw,
               GLuint cpp,
               int32_t src_pitch,
               struct brw_bo *src_buffer,
               GLuint src_offset,
               enum isl_tiling src_tiling,
               int32_t dst_pitch,
               struct brw_bo *dst_buffer,
               GLuint dst_offset,
               enum isl_tiling dst_tiling,
               GLshort src_x, GLshort src_y,
               GLshort dst_x, GLshort dst_y,
               GLshort w, GLshort h,
               enum gl_logicop_mode logic_op)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   GLuint CMD, BR13;
   int dst_y2 = dst_y + h;
   int dst_x2 = dst_x + w;
   bool dst_y_tiled = dst_tiling == ISL_TILING_Y0;
   bool src_y_tiled = src_tiling == ISL_TILING_Y0;
   uint32_t src_tile_w, src_tile_h;
   uint32_t dst_tile_w, dst_tile_h;

   if ((dst_y_tiled || src_y_tiled) && devinfo->gen < 6)
      return false;

   const unsigned bo_sizes = dst_buffer->size + src_buffer->size;

   /* do space check before going any further */
   if (!brw_batch_has_aperture_space(brw, bo_sizes))
      intel_batchbuffer_flush(brw);

   if (!brw_batch_has_aperture_space(brw, bo_sizes))
      return false;

   unsigned length = devinfo->gen >= 8 ? 10 : 8;

   intel_batchbuffer_require_space(brw, length * 4);
   DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
       __func__,
       src_buffer, src_pitch, src_offset, src_x, src_y,
       dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);

   intel_get_tile_dims(src_tiling, cpp, &src_tile_w, &src_tile_h);
   intel_get_tile_dims(dst_tiling, cpp, &dst_tile_w, &dst_tile_h);

   /* For Tiled surfaces, the pitch has to be a multiple of the Tile width
    * (X direction width of the Tile). This is ensured while allocating the
    * buffer object.
    */
   assert(src_tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0);
   assert(dst_tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0);

   /* For big formats (such as floating point), do the copy using 16 or
    * 32bpp and multiply the coordinates.
    */
   if (cpp > 4) {
      if (cpp % 4 == 2) {
         dst_x *= cpp / 2;
         dst_x2 *= cpp / 2;
         src_x *= cpp / 2;
         cpp = 2;
      } else {
         assert(cpp % 4 == 0);
         dst_x *= cpp / 4;
         dst_x2 *= cpp / 4;
         src_x *= cpp / 4;
         cpp = 4;
      }
   }

   if (!alignment_valid(brw, dst_offset, dst_tiling))
      return false;
   if (!alignment_valid(brw, src_offset, src_tiling))
      return false;

   /* Blit pitch must be dword-aligned.  Otherwise, the hardware appears to drop
    * the low bits.  Offsets must be naturally aligned.
    */
   if (src_pitch % 4 != 0 || src_offset % cpp != 0 ||
       dst_pitch % 4 != 0 || dst_offset % cpp != 0)
      return false;

   assert(cpp <= 4);
   BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16;

   CMD = xy_blit_cmd(src_tiling, dst_tiling, cpp);

   /* For tiled source and destination, pitch value should be specified
    * as a number of Dwords.
    */
   if (dst_tiling != ISL_TILING_LINEAR)
      dst_pitch /= 4;

   if (src_tiling != ISL_TILING_LINEAR)
      src_pitch /= 4;

   if (dst_y2 <= dst_y || dst_x2 <= dst_x)
      return true;

   assert(dst_x < dst_x2);
   assert(dst_y < dst_y2);

   BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, src_y_tiled);
   OUT_BATCH(CMD | (length - 2));
   OUT_BATCH(BR13 | (uint16_t)dst_pitch);
   OUT_BATCH(SET_FIELD(dst_y, BLT_Y) | SET_FIELD(dst_x, BLT_X));
   OUT_BATCH(SET_FIELD(dst_y2, BLT_Y) | SET_FIELD(dst_x2, BLT_X));
   if (devinfo->gen >= 8) {
      OUT_RELOC64(dst_buffer, RELOC_WRITE, dst_offset);
   } else {
      OUT_RELOC(dst_buffer, RELOC_WRITE, dst_offset);
   }
   OUT_BATCH(SET_FIELD(src_y, BLT_Y) | SET_FIELD(src_x, BLT_X));
   OUT_BATCH((uint16_t)src_pitch);
   if (devinfo->gen >= 8) {
      OUT_RELOC64(src_buffer, 0, src_offset);
   } else {
      OUT_RELOC(src_buffer, 0, src_offset);
   }

   ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled);

   brw_emit_mi_flush(brw);

   return true;
}
Example #14
0
void
gen8_upload_ps_state(struct brw_context *brw,
                     const struct brw_stage_state *stage_state,
                     const struct brw_wm_prog_data *prog_data,
                     uint32_t fast_clear_op)
{
   uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0;

   /* Initialize the execution mask with VMask.  Otherwise, derivatives are
    * incorrect for subspans where some of the pixels are unlit.  We believe
    * the bit just didn't take effect in previous generations.
    */
   dw3 |= GEN7_PS_VECTOR_MASK_ENABLE;

   const unsigned sampler_count =
      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
   dw3 |= SET_FIELD(sampler_count, GEN7_PS_SAMPLER_COUNT);

   /* BRW_NEW_FS_PROG_DATA */
   dw3 |=
      ((prog_data->base.binding_table.size_bytes / 4) <<
       GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT);

   if (prog_data->base.use_alt_mode)
      dw3 |= GEN7_PS_FLOATING_POINT_MODE_ALT;

   /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
    * it implicitly scales for different GT levels (which have some # of PSDs).
    *
    * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
    */
   if (brw->gen >= 9)
      dw6 |= (64 - 1) << HSW_PS_MAX_THREADS_SHIFT;
   else
      dw6 |= (64 - 2) << HSW_PS_MAX_THREADS_SHIFT;

   if (prog_data->base.nr_params > 0)
      dw6 |= GEN7_PS_PUSH_CONSTANT_ENABLE;

   /* From the documentation for this packet:
    * "If the PS kernel does not need the Position XY Offsets to
    *  compute a Position Value, then this field should be programmed
    *  to POSOFFSET_NONE."
    *
    * "SW Recommendation: If the PS kernel needs the Position Offsets
    *  to compute a Position XY value, this field should match Position
    *  ZW Interpolation Mode to ensure a consistent position.xyzw
    *  computation."
    *
    * We only require XY sample offsets. So, this recommendation doesn't
    * look useful at the moment. We might need this in future.
    */
   if (prog_data->uses_pos_offset)
      dw6 |= GEN7_PS_POSOFFSET_SAMPLE;
   else
      dw6 |= GEN7_PS_POSOFFSET_NONE;

   dw6 |= fast_clear_op;

   if (prog_data->dispatch_8)
      dw6 |= GEN7_PS_8_DISPATCH_ENABLE;

   if (prog_data->dispatch_16)
      dw6 |= GEN7_PS_16_DISPATCH_ENABLE;

   dw7 |= prog_data->base.dispatch_grf_start_reg <<
          GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
   dw7 |= prog_data->dispatch_grf_start_reg_2 <<
          GEN7_PS_DISPATCH_START_GRF_SHIFT_2;

   ksp0 = stage_state->prog_offset;
   ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;

   BEGIN_BATCH(12);
   OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2));
   OUT_BATCH(ksp0);
   OUT_BATCH(0);
   OUT_BATCH(dw3);
   if (prog_data->base.total_scratch) {
      OUT_RELOC64(stage_state->scratch_bo,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                  ffs(stage_state->per_thread_scratch) - 11);
   } else {
      OUT_BATCH(0);
      OUT_BATCH(0);
   }
   OUT_BATCH(dw6);
   OUT_BATCH(dw7);
   OUT_BATCH(0); /* kernel 1 pointer */
   OUT_BATCH(0);
   OUT_BATCH(ksp2);
   OUT_BATCH(0);
   ADVANCE_BATCH();
}
Example #15
0
static void
brw_upload_cs_state(struct brw_context *brw)
{
   if (!brw->cs.prog_data)
      return;

   uint32_t offset;
   uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
                                                8 * 4, 64, &offset);
   struct brw_stage_state *stage_state = &brw->cs.base;
   struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
   struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
   const struct brw_device_info *devinfo = brw->intelScreen->devinfo;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      brw->vtbl.emit_buffer_surface_state(
         brw, &stage_state->surf_offset[
                 prog_data->binding_table.shader_time_start],
         brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW,
         brw->shader_time.bo->size, 1, true);
   }

   uint32_t *bind = (uint32_t*) brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
                                            prog_data->binding_table.size_bytes,
                                            32, &stage_state->bind_bo_offset);

   uint32_t dwords = brw->gen < 8 ? 8 : 9;
   BEGIN_BATCH(dwords);
   OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));

   if (prog_data->total_scratch) {
      if (brw->gen >= 8) {
         /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
          * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
          */
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                     ffs(stage_state->per_thread_scratch) - 11);
      } else if (brw->is_haswell) {
         /* Haswell's Per Thread Scratch Space is in the range [0, 10]
          * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
          */
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   ffs(stage_state->per_thread_scratch) - 12);
      } else {
         /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
          * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
          */
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   stage_state->per_thread_scratch / 1024 - 1);
      }
   } else {
      OUT_BATCH(0);
      if (brw->gen >= 8)
         OUT_BATCH(0);
   }

   const uint32_t vfe_num_urb_entries = brw->gen >= 8 ? 2 : 0;
   const uint32_t vfe_gpgpu_mode =
      brw->gen == 7 ? SET_FIELD(1, GEN7_MEDIA_VFE_STATE_GPGPU_MODE) : 0;
   const uint32_t subslices = MAX2(brw->intelScreen->subslice_total, 1);
   OUT_BATCH(SET_FIELD(brw->max_cs_threads * subslices - 1,
                       MEDIA_VFE_STATE_MAX_THREADS) |
             SET_FIELD(vfe_num_urb_entries, MEDIA_VFE_STATE_URB_ENTRIES) |
             SET_FIELD(1, MEDIA_VFE_STATE_RESET_GTW_TIMER) |
             SET_FIELD(1, MEDIA_VFE_STATE_BYPASS_GTW) |
             vfe_gpgpu_mode);

   OUT_BATCH(0);
   const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0;

   /* We are uploading duplicated copies of push constant uniforms for each
    * thread. Although the local id data needs to vary per thread, it won't
    * change for other uniform data. Unfortunately this duplication is
    * required for gen7. As of Haswell, this duplication can be avoided, but
    * this older mechanism with duplicated data continues to work.
    *
    * FINISHME: As of Haswell, we could make use of the
    * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length" field
    * to only store one copy of uniform data.
    *
    * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
    * which is described in the GPGPU_WALKER command and in the Broadwell PRM
    * Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
    * Operations => GPGPU Mode => Indirect Payload Storage.
    *
    * Note: The constant data is built in brw_upload_cs_push_constants below.
    */
   const uint32_t vfe_curbe_allocation =
      ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
            cs_prog_data->push.cross_thread.regs, 2);
   OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) |
             SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC));
   OUT_BATCH(0);
   OUT_BATCH(0);
   OUT_BATCH(0);
   ADVANCE_BATCH();

   if (cs_prog_data->push.total.size > 0) {
      BEGIN_BATCH(4);
      OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(ALIGN(cs_prog_data->push.total.size, 64));
      OUT_BATCH(stage_state->push_const_offset);
      ADVANCE_BATCH();
   }

   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
   memcpy(bind, stage_state->surf_offset,
          prog_data->binding_table.size_bytes);

   memset(desc, 0, 8 * 4);

   int dw = 0;
   desc[dw++] = brw->cs.base.prog_offset;
   if (brw->gen >= 8)
      desc[dw++] = 0; /* Kernel Start Pointer High */
   desc[dw++] = 0;
   desc[dw++] = stage_state->sampler_offset |
      ((stage_state->sampler_count + 3) / 4);
   desc[dw++] = stage_state->bind_bo_offset;
   desc[dw++] = SET_FIELD(cs_prog_data->push.per_thread.regs,
                          MEDIA_CURBE_READ_LENGTH);
   const uint32_t media_threads =
      brw->gen >= 8 ?
      SET_FIELD(cs_prog_data->threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
      SET_FIELD(cs_prog_data->threads, MEDIA_GPGPU_THREAD_COUNT);
   assert(cs_prog_data->threads <= brw->max_cs_threads);

   const uint32_t slm_size =
      encode_slm_size(devinfo->gen, prog_data->total_shared);

   desc[dw++] =
      SET_FIELD(cs_prog_data->uses_barrier, MEDIA_BARRIER_ENABLE) |
      SET_FIELD(slm_size, MEDIA_SHARED_LOCAL_MEMORY_SIZE) |
      media_threads;

   desc[dw++] =
      SET_FIELD(cs_prog_data->push.cross_thread.regs, CROSS_THREAD_READ_LENGTH);

   BEGIN_BATCH(4);
   OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
   OUT_BATCH(0);
   OUT_BATCH(8 * 4);
   OUT_BATCH(offset);
   ADVANCE_BATCH();
}
Example #16
0
/**
 * Used to initialize the alpha value of an ARGB8888 miptree after copying
 * into it from an XRGB8888 source.
 *
 * This is very common with glCopyTexImage2D().  Note that the coordinates are
 * relative to the start of the miptree, not relative to a slice within the
 * miptree.
 */
static void
intel_miptree_set_alpha_to_one(struct brw_context *brw,
                              struct intel_mipmap_tree *mt,
                              int x, int y, int width, int height)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint32_t BR13, CMD;
   int pitch, cpp;

   pitch = mt->surf.row_pitch_B;
   cpp = mt->cpp;

   DBG("%s dst:buf(%p)/%d %d,%d sz:%dx%d\n",
       __func__, mt->bo, pitch, x, y, width, height);

   /* Note: Currently only handles 8 bit alpha channel. Extension to < 8 Bit
    * alpha channel would be likely possible via ROP code 0xfa instead of 0xf0
    * and writing a suitable bit-mask instead of 0xffffffff.
    */
   BR13 = br13_for_cpp(cpp) | 0xf0 << 16;
   CMD = XY_COLOR_BLT_CMD;
   CMD |= XY_BLT_WRITE_ALPHA;

   if (mt->surf.tiling != ISL_TILING_LINEAR) {
      CMD |= XY_DST_TILED;
      pitch /= 4;
   }
   BR13 |= pitch;

   /* do space check before going any further */
   if (!brw_batch_has_aperture_space(brw, mt->bo->size))
      intel_batchbuffer_flush(brw);

   unsigned length = devinfo->gen >= 8 ? 7 : 6;
   const bool dst_y_tiled = mt->surf.tiling == ISL_TILING_Y0;

   /* We need to split the blit into chunks that each fit within the blitter's
    * restrictions.  We can't use a chunk size of 32768 because we need to
    * ensure that src_tile_x + chunk_size fits.  We choose 16384 because it's
    * a nice round power of two, big enough that performance won't suffer, and
    * small enough to guarantee everything fits.
    */
   const uint32_t max_chunk_size = 16384;

   for (uint32_t chunk_x = 0; chunk_x < width; chunk_x += max_chunk_size) {
      for (uint32_t chunk_y = 0; chunk_y < height; chunk_y += max_chunk_size) {
         const uint32_t chunk_w = MIN2(max_chunk_size, width - chunk_x);
         const uint32_t chunk_h = MIN2(max_chunk_size, height - chunk_y);

         uint32_t offset, tile_x, tile_y;
         get_blit_intratile_offset_el(brw, mt,
                                      x + chunk_x, y + chunk_y,
                                      &offset, &tile_x, &tile_y);

         BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, false);
         OUT_BATCH(CMD | (length - 2));
         OUT_BATCH(BR13);
         OUT_BATCH(SET_FIELD(y + chunk_y, BLT_Y) |
                   SET_FIELD(x + chunk_x, BLT_X));
         OUT_BATCH(SET_FIELD(y + chunk_y + chunk_h, BLT_Y) |
                   SET_FIELD(x + chunk_x + chunk_w, BLT_X));
         if (devinfo->gen >= 8) {
            OUT_RELOC64(mt->bo, RELOC_WRITE, mt->offset + offset);
         } else {
            OUT_RELOC(mt->bo, RELOC_WRITE, mt->offset + offset);
         }
         OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
         ADVANCE_BATCH_TILED(dst_y_tiled, false);
      }
   }

   brw_emit_mi_flush(brw);
}
Example #17
0
/**
 * Helper function to emit depth related command packets.
 */
static void
emit_depth_packets(struct brw_context *brw,
                   struct intel_mipmap_tree *depth_mt,
                   uint32_t depthbuffer_format,
                   uint32_t depth_surface_type,
                   bool depth_writable,
                   struct intel_mipmap_tree *stencil_mt,
                   bool stencil_writable,
                   uint32_t stencil_offset,
                   bool hiz,
                   uint32_t width,
                   uint32_t height,
                   uint32_t depth,
                   uint32_t lod,
                   uint32_t min_array_element)
{
   /* Skip repeated NULL depth/stencil emits (think 2D rendering). */
   if (!depth_mt && !stencil_mt && brw->no_depth_or_stencil) {
      assert(brw->hw_ctx);
      return;
   }

   intel_emit_depth_stall_flushes(brw);

   /* _NEW_BUFFERS, _NEW_DEPTH, _NEW_STENCIL */
   BEGIN_BATCH(8);
   OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (8 - 2));
   OUT_BATCH(depth_surface_type << 29 |
             (depth_writable ? (1 << 28) : 0) |
             (stencil_mt != NULL && stencil_writable) << 27 |
             (hiz ? 1 : 0) << 22 |
             depthbuffer_format << 18 |
             (depth_mt ? depth_mt->region->pitch - 1 : 0));
   if (depth_mt) {
      OUT_RELOC64(depth_mt->region->bo,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
   } else {
      OUT_BATCH(0);
      OUT_BATCH(0);
   }
   OUT_BATCH(((width - 1) << 4) | ((height - 1) << 18) | lod);
   OUT_BATCH(((depth - 1) << 21) | (min_array_element << 10) | BDW_MOCS_WB);
   OUT_BATCH(0);
   OUT_BATCH(depth_mt ? depth_mt->qpitch >> 2 : 0);
   ADVANCE_BATCH();

   if (!hiz) {
      BEGIN_BATCH(5);
      OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (5 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(5);
      OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (5 - 2));
      OUT_BATCH((depth_mt->hiz_mt->region->pitch - 1) | BDW_MOCS_WB << 25);
      OUT_RELOC64(depth_mt->hiz_mt->region->bo,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
      OUT_BATCH(depth_mt->hiz_mt->qpitch >> 2);
      ADVANCE_BATCH();
   }

   if (stencil_mt == NULL) {
      BEGIN_BATCH(5);
      OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(5);
      OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2));
      /* The stencil buffer has quirky pitch requirements.  From the Graphics
       * BSpec: vol2a.11 3D Pipeline Windower > Early Depth/Stencil Processing
       * > Depth/Stencil Buffer State > 3DSTATE_STENCIL_BUFFER [DevIVB+],
       * field "Surface Pitch":
       *
       *    The pitch must be set to 2x the value computed based on width, as
       *    the stencil buffer is stored with two rows interleaved.
       *
       * (Note that it is not 100% clear whether this intended to apply to
       * Gen7; the BSpec flags this comment as "DevILK,DevSNB" (which would
       * imply that it doesn't), however the comment appears on a "DevIVB+"
       * page (which would imply that it does).  Experiments with the hardware
       * indicate that it does.
       */
      OUT_BATCH(HSW_STENCIL_ENABLED | BDW_MOCS_WB << 22 |
                (2 * stencil_mt->region->pitch - 1));
      OUT_RELOC64(stencil_mt->region->bo,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                  stencil_offset);
      OUT_BATCH(stencil_mt ? stencil_mt->qpitch >> 2 : 0);
      ADVANCE_BATCH();
   }

   BEGIN_BATCH(3);
   OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2));
   OUT_BATCH(depth_mt ? depth_mt->depth_clear_value : 0);
   OUT_BATCH(1);
   ADVANCE_BATCH();

   brw->no_depth_or_stencil = !depth_mt && !stencil_mt;
}
Example #18
0
/* Copy BitBlt
 */
bool
intelEmitCopyBlit(struct brw_context *brw,
		  GLuint cpp,
		  GLshort src_pitch,
		  drm_intel_bo *src_buffer,
		  GLuint src_offset,
		  uint32_t src_tiling,
		  GLshort dst_pitch,
		  drm_intel_bo *dst_buffer,
		  GLuint dst_offset,
		  uint32_t dst_tiling,
		  GLshort src_x, GLshort src_y,
		  GLshort dst_x, GLshort dst_y,
		  GLshort w, GLshort h,
		  GLenum logic_op)
{
   GLuint CMD, BR13, pass = 0;
   int dst_y2 = dst_y + h;
   int dst_x2 = dst_x + w;
   drm_intel_bo *aper_array[3];
   bool dst_y_tiled = dst_tiling == I915_TILING_Y;
   bool src_y_tiled = src_tiling == I915_TILING_Y;

   if (dst_tiling != I915_TILING_NONE) {
      if (dst_offset & 4095)
	 return false;
   }
   if (src_tiling != I915_TILING_NONE) {
      if (src_offset & 4095)
	 return false;
   }
   if ((dst_y_tiled || src_y_tiled) && brw->gen < 6)
      return false;

   /* do space check before going any further */
   do {
       aper_array[0] = brw->batch.bo;
       aper_array[1] = dst_buffer;
       aper_array[2] = src_buffer;

       if (dri_bufmgr_check_aperture_space(aper_array, 3) != 0) {
           intel_batchbuffer_flush(brw);
           pass++;
       } else
           break;
   } while (pass < 2);

   if (pass >= 2)
      return false;

   intel_batchbuffer_require_space(brw, 8 * 4, BLT_RING);
   DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
       __FUNCTION__,
       src_buffer, src_pitch, src_offset, src_x, src_y,
       dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);

   /* Blit pitch must be dword-aligned.  Otherwise, the hardware appears to drop
    * the low bits.
    */
   if (src_pitch % 4 != 0 || dst_pitch % 4 != 0)
      return false;

   /* For big formats (such as floating point), do the copy using 16 or 32bpp
    * and multiply the coordinates.
    */
   if (cpp > 4) {
      if (cpp % 4 == 2) {
         dst_x *= cpp / 2;
         dst_x2 *= cpp / 2;
         src_x *= cpp / 2;
         cpp = 2;
      } else {
         assert(cpp % 4 == 0);
         dst_x *= cpp / 4;
         dst_x2 *= cpp / 4;
         src_x *= cpp / 4;
         cpp = 4;
      }
   }

   BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16;

   switch (cpp) {
   case 1:
   case 2:
      CMD = XY_SRC_COPY_BLT_CMD;
      break;
   case 4:
      CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
      break;
   default:
      return false;
   }

   if (dst_tiling != I915_TILING_NONE) {
      CMD |= XY_DST_TILED;
      dst_pitch /= 4;
   }
   if (src_tiling != I915_TILING_NONE) {
      CMD |= XY_SRC_TILED;
      src_pitch /= 4;
   }

   if (dst_y2 <= dst_y || dst_x2 <= dst_x) {
      return true;
   }

   assert(dst_x < dst_x2);
   assert(dst_y < dst_y2);
   assert(src_offset + (src_y + h - 1) * abs(src_pitch) +
          (w * cpp) <= src_buffer->size);
   assert(dst_offset + (dst_y + h - 1) * abs(dst_pitch) +
          (w * cpp) <= dst_buffer->size);

   unsigned length = brw->gen >= 8 ? 10 : 8;

   BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, src_y_tiled);
   OUT_BATCH(CMD | (length - 2));
   OUT_BATCH(BR13 | (uint16_t)dst_pitch);
   OUT_BATCH(SET_FIELD(dst_y, BLT_Y) | SET_FIELD(dst_x, BLT_X));
   OUT_BATCH(SET_FIELD(dst_y2, BLT_Y) | SET_FIELD(dst_x2, BLT_X));
   if (brw->gen >= 8) {
      OUT_RELOC64(dst_buffer,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                  dst_offset);
   } else {
      OUT_RELOC(dst_buffer,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                dst_offset);
   }
   OUT_BATCH(SET_FIELD(src_y, BLT_Y) | SET_FIELD(src_x, BLT_X));
   OUT_BATCH((uint16_t)src_pitch);
   if (brw->gen >= 8) {
      OUT_RELOC64(src_buffer,
                  I915_GEM_DOMAIN_RENDER, 0,
                  src_offset);
   } else {
      OUT_RELOC(src_buffer,
                I915_GEM_DOMAIN_RENDER, 0,
                src_offset);
   }

   ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled);

   intel_batchbuffer_emit_mi_flush(brw);

   return true;
}
Example #19
0
/**
 * Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
 */
uint32_t *
brw_emit_vertex_buffer_state(struct brw_context *brw,
                             unsigned buffer_nr,
                             drm_intel_bo *bo,
                             unsigned start_offset,
                             unsigned end_offset,
                             unsigned stride,
                             unsigned step_rate,
                             uint32_t *__map)
{
   struct gl_context *ctx = &brw->ctx;
   uint32_t dw0;

   if (brw->gen >= 8) {
      dw0 = buffer_nr << GEN6_VB0_INDEX_SHIFT;
   } else if (brw->gen >= 6) {
      dw0 = (buffer_nr << GEN6_VB0_INDEX_SHIFT) |
            (step_rate ? GEN6_VB0_ACCESS_INSTANCEDATA
                       : GEN6_VB0_ACCESS_VERTEXDATA);
   } else {
      dw0 = (buffer_nr << BRW_VB0_INDEX_SHIFT) |
            (step_rate ? BRW_VB0_ACCESS_INSTANCEDATA
                       : BRW_VB0_ACCESS_VERTEXDATA);
   }

   if (brw->gen >= 7)
      dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;

   switch (brw->gen) {
   case 7:
      dw0 |= GEN7_MOCS_L3 << 16;
      break;
   case 8:
      dw0 |= BDW_MOCS_WB << 16;
      break;
   case 9:
      dw0 |= SKL_MOCS_WB << 16;
      break;
   }

   WARN_ONCE(stride >= (brw->gen >= 5 ? 2048 : 2047),
             "VBO stride %d too large, bad rendering may occur\n",
             stride);
   OUT_BATCH(dw0 | (stride << BRW_VB0_PITCH_SHIFT));
   if (brw->gen >= 8) {
      OUT_RELOC64(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset);
      /* From the BSpec: 3D Pipeline Stages - 3D Pipeline Geometry -
       *                 Vertex Fetch (VF) Stage - State
       *
       * Instead of "VBState.StartingBufferAddress + VBState.MaxIndex x
       * VBState.BufferPitch", the address of the byte immediately beyond the
       * last valid byte of the buffer is determined by
       * "VBState.StartingBufferAddress + VBState.BufferSize".
       */
      OUT_BATCH(end_offset - start_offset);
   } else if (brw->gen >= 5) {
      OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset);
      /* From the BSpec: 3D Pipeline Stages - 3D Pipeline Geometry -
       *                 Vertex Fetch (VF) Stage - State
       *
       *  Instead of "VBState.StartingBufferAddress + VBState.MaxIndex x
       *  VBState.BufferPitch", the address of the byte immediately beyond the
       *  last valid byte of the buffer is determined by
       *  "VBState.EndAddress + 1".
       */
      OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, end_offset - 1);
      OUT_BATCH(step_rate);
   } else {
      OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset);
      OUT_BATCH(0);
      OUT_BATCH(step_rate);
   }

   return __map;
}
Example #20
0
static void
gen8_emit_vertices(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
   bool uses_edge_flag;

   brw_prepare_vertices(brw);
   brw_prepare_shader_draw_parameters(brw);

   uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
                     ctx->Polygon.BackMode != GL_FILL);

   if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid) {
      unsigned vue = brw->vb.nr_enabled;

      /* The element for the edge flags must always be last, so we have to
       * insert the SGVS before it in that case.
       */
      if (uses_edge_flag) {
         assert(vue > 0);
         vue--;
      }

      WARN_ONCE(vue >= 33,
                "Trying to insert VID/IID past 33rd vertex element, "
                "need to reorder the vertex attrbutes.");

      unsigned dw1 = 0;
      if (brw->vs.prog_data->uses_vertexid) {
         dw1 |= GEN8_SGVS_ENABLE_VERTEX_ID |
                (2 << GEN8_SGVS_VERTEX_ID_COMPONENT_SHIFT) |  /* .z channel */
                (vue << GEN8_SGVS_VERTEX_ID_ELEMENT_OFFSET_SHIFT);
      }

      if (brw->vs.prog_data->uses_instanceid) {
         dw1 |= GEN8_SGVS_ENABLE_INSTANCE_ID |
                (3 << GEN8_SGVS_INSTANCE_ID_COMPONENT_SHIFT) | /* .w channel */
                (vue << GEN8_SGVS_INSTANCE_ID_ELEMENT_OFFSET_SHIFT);
      }

      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2));
      OUT_BATCH(dw1);
      ADVANCE_BATCH();

      BEGIN_BATCH(3);
      OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
      OUT_BATCH(vue | GEN8_VF_INSTANCING_ENABLE);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2));
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* If the VS doesn't read any inputs (calculating vertex position from
    * a state variable for some reason, for example), emit a single pad
    * VERTEX_ELEMENT struct and bail.
    *
    * The stale VB state stays in place, but they don't do anything unless
    * a VE loads from them.
    */
   if (brw->vb.nr_enabled == 0) {
      BEGIN_BATCH(3);
      OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (3 - 2));
      OUT_BATCH((0 << GEN6_VE0_INDEX_SHIFT) |
                GEN6_VE0_VALID |
                (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
                (0 << BRW_VE0_SRC_OFFSET_SHIFT));
      OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
                (BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
      ADVANCE_BATCH();
      return;
   }

   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
   unsigned nr_buffers = brw->vb.nr_buffers + brw->vs.prog_data->uses_vertexid;
   if (nr_buffers) {
      assert(nr_buffers <= 33);

      BEGIN_BATCH(1 + 4 * nr_buffers);
      OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
         struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
         uint32_t dw0 = 0;

         dw0 |= i << GEN6_VB0_INDEX_SHIFT;
         dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;
         dw0 |= buffer->stride << BRW_VB0_PITCH_SHIFT;
         dw0 |= mocs_wb << 16;

         OUT_BATCH(dw0);
         OUT_RELOC64(buffer->bo, I915_GEM_DOMAIN_VERTEX, 0, buffer->offset);
         OUT_BATCH(buffer->bo->size);
      }

      if (brw->vs.prog_data->uses_vertexid) {
         OUT_BATCH(brw->vb.nr_buffers << GEN6_VB0_INDEX_SHIFT |
                   GEN7_VB0_ADDRESS_MODIFYENABLE |
                   mocs_wb << 16);
         OUT_RELOC64(brw->draw.draw_params_bo, I915_GEM_DOMAIN_VERTEX, 0,
                     brw->draw.draw_params_offset);
         OUT_BATCH(brw->draw.draw_params_bo->size);
      }
      ADVANCE_BATCH();
   }

   /* Normally we don't need an element for the SGVS attribute because the
    * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an
    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if the
    * vertex ID is used then it needs an element for the base vertex buffer.
    * Additionally if there is an edge flag element then the SGVS can't be
    * inserted past that so we need a dummy element to ensure that the edge
    * flag is the last one.
    */
   bool needs_sgvs_element = (brw->vs.prog_data->uses_vertexid ||
                              (brw->vs.prog_data->uses_instanceid &&
                               uses_edge_flag));
   unsigned nr_elements = brw->vb.nr_enabled + needs_sgvs_element;

   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
    * presumably for VertexID/InstanceID.
    */
   assert(nr_elements <= 34);

   struct brw_vertex_element *gen6_edgeflag_input = NULL;

   BEGIN_BATCH(1 + nr_elements * 2);
   OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (2 * nr_elements - 1));
   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
      struct brw_vertex_element *input = brw->vb.enabled[i];
      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
      uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
      uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
      uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC;
      uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC;

      /* The gen4 driver expects edgeflag to come in as a float, and passes
       * that float on to the tests in the clipper.  Mesa's current vertex
       * attribute value for EdgeFlag is stored as a float, which works out.
       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
       * integer ubyte.  Just rewrite that to convert to a float.
       */
      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
         /* Gen6+ passes edgeflag as sideband along with the vertex, instead
          * of in the VUE.  We have to upload it sideband as the last vertex
          * element according to the B-Spec.
          */
         gen6_edgeflag_input = input;
         continue;
      }

      switch (input->glarray->Size) {
      case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
      case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
      case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
      case 3: comp3 = input->glarray->Integer ? BRW_VE1_COMPONENT_STORE_1_INT
                                              : BRW_VE1_COMPONENT_STORE_1_FLT;
         break;
      }

      OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) |
                GEN6_VE0_VALID |
                (format << BRW_VE0_FORMAT_SHIFT) |
                (input->offset << BRW_VE0_SRC_OFFSET_SHIFT));

      OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
                (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
                (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
                (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
   }

   if (needs_sgvs_element) {
      if (brw->vs.prog_data->uses_vertexid) {
         OUT_BATCH(GEN6_VE0_VALID |
                   brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT |
                   BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
         OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
      } else {
         OUT_BATCH(GEN6_VE0_VALID);
         OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
      }
   }

   if (gen6_edgeflag_input) {
      uint32_t format =
         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);

      OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) |
                GEN6_VE0_VALID |
                GEN6_VE0_EDGE_FLAG_ENABLE |
                (format << BRW_VE0_FORMAT_SHIFT) |
                (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT));
      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
   }
   ADVANCE_BATCH();

   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
      const struct brw_vertex_element *input = brw->vb.enabled[i];
      const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
      unsigned element_index;

      /* The edge flag element is reordered to be the last one in the code
       * above so we need to compensate for that in the element indices used
       * below.
       */
      if (input == gen6_edgeflag_input)
         element_index = nr_elements - 1;
      else
         element_index = j++;

      BEGIN_BATCH(3);
      OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
      OUT_BATCH(element_index |
                (buffer->step_rate ? GEN8_VF_INSTANCING_ENABLE : 0));
      OUT_BATCH(buffer->step_rate);
      ADVANCE_BATCH();
   }
}
Example #21
0
static void
gen8_upload_gs_state(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   const struct brw_stage_state *stage_state = &brw->gs.base;
   /* BRW_NEW_GEOMETRY_PROGRAM */
   bool active = brw->geometry_program;
   /* CACHE_NEW_GS_PROG */
   const struct brw_vec4_prog_data *prog_data = &brw->gs.prog_data->base;

   if (active) {
      int urb_entry_write_offset = 1;
      uint32_t urb_entry_output_length =
         ((prog_data->vue_map.num_slots + 1) / 2 - urb_entry_write_offset);

      if (urb_entry_output_length == 0)
         urb_entry_output_length = 1;

      BEGIN_BATCH(10);
      OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2));
      OUT_BATCH(stage_state->prog_offset);
      OUT_BATCH(0);
      OUT_BATCH(GEN6_GS_VECTOR_MASK_ENABLE |
                brw->geometry_program->VerticesIn |
                ((ALIGN(stage_state->sampler_count, 4)/4) <<
                 GEN6_GS_SAMPLER_COUNT_SHIFT) |
                ((prog_data->base.binding_table.size_bytes / 4) <<
                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));

      if (brw->gs.prog_data->base.base.total_scratch) {
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                     ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
         WARN_ONCE(true,
                   "May need to implement a temporary workaround: GS Number of "
                   "URB Entries must be less than or equal to the GS Maximum "
                   "Number of Threads.\n");
      } else {
         OUT_BATCH(0);
         OUT_BATCH(0);
      }

      /* DW6 */
      OUT_BATCH(((brw->gs.prog_data->output_vertex_size_hwords * 2 - 1) <<
                 GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT) |
                (brw->gs.prog_data->output_topology <<
                 GEN7_GS_OUTPUT_TOPOLOGY_SHIFT) |
                (prog_data->urb_read_length <<
                 GEN6_GS_URB_READ_LENGTH_SHIFT) |
                (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) |
                (prog_data->base.dispatch_grf_start_reg <<
                 GEN6_GS_DISPATCH_START_GRF_SHIFT));

      /* DW7 */
      OUT_BATCH(((brw->max_gs_threads / 2 - 1) << HSW_GS_MAX_THREADS_SHIFT) |
                (brw->gs.prog_data->control_data_header_size_hwords <<
                 GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
                brw->gs.prog_data->dispatch_mode |
                GEN6_GS_STATISTICS_ENABLE |
                (brw->gs.prog_data->include_primitive_id ?
                 GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) |
                GEN7_GS_REORDER_TRAILING |
                GEN7_GS_ENABLE);

      /* DW8 */
      OUT_BATCH(brw->gs.prog_data->control_data_format <<
                HSW_GS_CONTROL_DATA_FORMAT_SHIFT);

      /* DW9 / _NEW_TRANSFORM */
      OUT_BATCH((ctx->Transform.ClipPlanesEnabled <<
                 GEN8_GS_USER_CLIP_DISTANCE_SHIFT) |
                (urb_entry_output_length << GEN8_GS_URB_OUTPUT_LENGTH_SHIFT) |
                (urb_entry_write_offset <<
                 GEN8_GS_URB_ENTRY_OUTPUT_OFFSET_SHIFT));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(10);
      OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2));
      OUT_BATCH(0); /* prog_bo */
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0); /* scratch space base offset */
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(GEN6_GS_STATISTICS_ENABLE);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
}
Example #22
0
bool
intelEmitImmediateColorExpandBlit(struct brw_context *brw,
				  GLuint cpp,
				  GLubyte *src_bits, GLuint src_size,
				  GLuint fg_color,
				  GLshort dst_pitch,
				  struct brw_bo *dst_buffer,
				  GLuint dst_offset,
				  enum isl_tiling dst_tiling,
				  GLshort x, GLshort y,
				  GLshort w, GLshort h,
				  enum gl_logicop_mode logic_op)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   int dwords = ALIGN(src_size, 8) / 4;
   uint32_t opcode, br13, blit_cmd;

   if (dst_tiling != ISL_TILING_LINEAR) {
      if (dst_offset & 4095)
	 return false;
      if (dst_tiling == ISL_TILING_Y0)
	 return false;
   }

   assert((unsigned) logic_op <= 0x0f);
   assert(dst_pitch > 0);

   if (w < 0 || h < 0)
      return true;

   DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d, %d bytes %d dwords\n",
       __func__,
       dst_buffer, dst_pitch, dst_offset, x, y, w, h, src_size, dwords);

   unsigned xy_setup_blt_length = devinfo->gen >= 8 ? 10 : 8;
   intel_batchbuffer_require_space(brw, (xy_setup_blt_length * 4) +
                                        (3 * 4) + dwords * 4);

   opcode = XY_SETUP_BLT_CMD;
   if (cpp == 4)
      opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
   if (dst_tiling != ISL_TILING_LINEAR) {
      opcode |= XY_DST_TILED;
      dst_pitch /= 4;
   }

   br13 = dst_pitch | (translate_raster_op(logic_op) << 16) | (1 << 29);
   br13 |= br13_for_cpp(cpp);

   blit_cmd = XY_TEXT_IMMEDIATE_BLIT_CMD | XY_TEXT_BYTE_PACKED; /* packing? */
   if (dst_tiling != ISL_TILING_LINEAR)
      blit_cmd |= XY_DST_TILED;

   BEGIN_BATCH_BLT(xy_setup_blt_length + 3);
   OUT_BATCH(opcode | (xy_setup_blt_length - 2));
   OUT_BATCH(br13);
   OUT_BATCH((0 << 16) | 0); /* clip x1, y1 */
   OUT_BATCH((100 << 16) | 100); /* clip x2, y2 */
   if (devinfo->gen >= 8) {
      OUT_RELOC64(dst_buffer, RELOC_WRITE, dst_offset);
   } else {
      OUT_RELOC(dst_buffer, RELOC_WRITE, dst_offset);
   }
   OUT_BATCH(0); /* bg */
   OUT_BATCH(fg_color); /* fg */
   OUT_BATCH(0); /* pattern base addr */
   if (devinfo->gen >= 8)
      OUT_BATCH(0);

   OUT_BATCH(blit_cmd | ((3 - 2) + dwords));
   OUT_BATCH(SET_FIELD(y, BLT_Y) | SET_FIELD(x, BLT_X));
   OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X));
   ADVANCE_BATCH();

   intel_batchbuffer_data(brw, src_bits, dwords * 4);

   brw_emit_mi_flush(brw);

   return true;
}
Example #23
0
/**
 * Define the base addresses which some state is referenced from.
 *
 * This allows us to avoid having to emit relocations for the objects,
 * and is actually required for binding table pointers on gen6.
 *
 * Surface state base address covers binding table pointers and
 * surface state objects, but not the surfaces that the surface state
 * objects point to.
 */
void
brw_upload_state_base_address(struct brw_context *brw)
{
   if (brw->batch.state_base_address_emitted)
      return;

   /* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of
    * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be
    * programmed prior to STATE_BASE_ADDRESS.
    *
    * However, given that the instruction SBA (general state base
    * address) on this chipset is always set to 0 across X and GL,
    * maybe this isn't required for us in particular.
    */

   if (brw->gen >= 8) {
      uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
      int pkt_len = brw->gen >= 9 ? 19 : 16;

      BEGIN_BATCH(pkt_len);
      OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (pkt_len - 2));
      /* General state base address: stateless DP read/write requests */
      OUT_BATCH(mocs_wb << 4 | 1);
      OUT_BATCH(0);
      OUT_BATCH(mocs_wb << 16);
      /* Surface state base address: */
      OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
                  mocs_wb << 4 | 1);
      /* Dynamic state base address: */
      OUT_RELOC64(brw->batch.bo,
                  I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0,
                  mocs_wb << 4 | 1);
      /* Indirect object base address: MEDIA_OBJECT data */
      OUT_BATCH(mocs_wb << 4 | 1);
      OUT_BATCH(0);
      /* Instruction base address: shader kernels (incl. SIP) */
      OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
                  mocs_wb << 4 | 1);

      /* General state buffer size */
      OUT_BATCH(0xfffff001);
      /* Dynamic state buffer size */
      OUT_BATCH(ALIGN(brw->batch.bo->size, 4096) | 1);
      /* Indirect object upper bound */
      OUT_BATCH(0xfffff001);
      /* Instruction access upper bound */
      OUT_BATCH(ALIGN(brw->cache.bo->size, 4096) | 1);
      if (brw->gen >= 9) {
         OUT_BATCH(1);
         OUT_BATCH(0);
         OUT_BATCH(0);
      }
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      uint8_t mocs = brw->gen == 7 ? GEN7_MOCS_L3 : 0;

       BEGIN_BATCH(10);
       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
       OUT_BATCH(mocs << 8 | /* General State Memory Object Control State */
                 mocs << 4 | /* Stateless Data Port Access Memory Object Control State */
                 1); /* General State Base Address Modify Enable */
       /* Surface state base address:
	* BINDING_TABLE_STATE
	* SURFACE_STATE
	*/
       OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1);
        /* Dynamic state base address:
	 * SAMPLER_STATE
	 * SAMPLER_BORDER_COLOR_STATE
	 * CLIP, SF, WM/CC viewport state
	 * COLOR_CALC_STATE
	 * DEPTH_STENCIL_STATE
	 * BLEND_STATE
	 * Push constants (when INSTPM: CONSTANT_BUFFER Address Offset
	 * Disable is clear, which we rely on)
	 */
       OUT_RELOC(brw->batch.bo, (I915_GEM_DOMAIN_RENDER |
				   I915_GEM_DOMAIN_INSTRUCTION), 0, 1);

       OUT_BATCH(1); /* Indirect object base address: MEDIA_OBJECT data */
       OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		 1); /* Instruction base address: shader kernels (incl. SIP) */

       OUT_BATCH(1); /* General state upper bound */
       /* Dynamic state upper bound.  Although the documentation says that
	* programming it to zero will cause it to be ignored, that is a lie.
	* If this isn't programmed to a real bound, the sampler border color
	* pointer is rejected, causing border color to mysteriously fail.
	*/
       OUT_BATCH(0xfffff001);
       OUT_BATCH(1); /* Indirect object upper bound */
       OUT_BATCH(1); /* Instruction access upper bound */
       ADVANCE_BATCH();
   } else if (brw->gen == 5) {
       BEGIN_BATCH(8);
       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
       OUT_BATCH(1); /* General state base address */
       OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
		 1); /* Surface state base address */
       OUT_BATCH(1); /* Indirect object base address */
       OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
		 1); /* Instruction base address */
       OUT_BATCH(0xfffff001); /* General state upper bound */
       OUT_BATCH(1); /* Indirect object upper bound */
       OUT_BATCH(1); /* Instruction access upper bound */
       ADVANCE_BATCH();
   } else {
       BEGIN_BATCH(6);
       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
       OUT_BATCH(1); /* General state base address */
       OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
		 1); /* Surface state base address */
       OUT_BATCH(1); /* Indirect object base address */
       OUT_BATCH(1); /* General state upper bound */
       OUT_BATCH(1); /* Indirect object upper bound */
       ADVANCE_BATCH();
   }

   /* According to section 3.6.1 of VOL1 of the 965 PRM,
    * STATE_BASE_ADDRESS updates require a reissue of:
    *
    * 3DSTATE_PIPELINE_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * and this continues through Ironlake.  The Sandy Bridge PRM, vol
    * 1 part 1 says that the folowing packets must be reissued:
    *
    * 3DSTATE_CC_POINTERS
    * 3DSTATE_BINDING_TABLE_POINTERS
    * 3DSTATE_SAMPLER_STATE_POINTERS
    * 3DSTATE_VIEWPORT_STATE_POINTERS
    * MEDIA_STATE_POINTERS
    *
    * Those are always reissued following SBA updates anyway (new
    * batch time), except in the case of the program cache BO
    * changing.  Having a separate state flag makes the sequence more
    * obvious.
    */

   brw->ctx.NewDriverState |= BRW_NEW_STATE_BASE_ADDRESS;
   brw->batch.state_base_address_emitted = true;
}
Example #24
0
bool
intelEmitImmediateColorExpandBlit(struct brw_context *brw,
				  GLuint cpp,
				  GLubyte *src_bits, GLuint src_size,
				  GLuint fg_color,
				  GLshort dst_pitch,
				  drm_intel_bo *dst_buffer,
				  GLuint dst_offset,
				  uint32_t dst_tiling,
				  GLshort x, GLshort y,
				  GLshort w, GLshort h,
				  GLenum logic_op)
{
   int dwords = ALIGN(src_size, 8) / 4;
   uint32_t opcode, br13, blit_cmd;

   if (dst_tiling != I915_TILING_NONE) {
      if (dst_offset & 4095)
	 return false;
      if (dst_tiling == I915_TILING_Y)
	 return false;
   }

   assert((logic_op >= GL_CLEAR) && (logic_op <= (GL_CLEAR + 0x0f)));
   assert(dst_pitch > 0);

   if (w < 0 || h < 0)
      return true;

   DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d, %d bytes %d dwords\n",
       __FUNCTION__,
       dst_buffer, dst_pitch, dst_offset, x, y, w, h, src_size, dwords);

   intel_batchbuffer_require_space(brw, (8 * 4) + (3 * 4) + dwords * 4, BLT_RING);

   opcode = XY_SETUP_BLT_CMD;
   if (cpp == 4)
      opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
   if (dst_tiling != I915_TILING_NONE) {
      opcode |= XY_DST_TILED;
      dst_pitch /= 4;
   }

   br13 = dst_pitch | (translate_raster_op(logic_op) << 16) | (1 << 29);
   br13 |= br13_for_cpp(cpp);

   blit_cmd = XY_TEXT_IMMEDIATE_BLIT_CMD | XY_TEXT_BYTE_PACKED; /* packing? */
   if (dst_tiling != I915_TILING_NONE)
      blit_cmd |= XY_DST_TILED;

   unsigned xy_setup_blt_length = brw->gen >= 8 ? 10 : 8;

   BEGIN_BATCH_BLT(xy_setup_blt_length + 3);
   OUT_BATCH(opcode | (xy_setup_blt_length - 2));
   OUT_BATCH(br13);
   OUT_BATCH((0 << 16) | 0); /* clip x1, y1 */
   OUT_BATCH((100 << 16) | 100); /* clip x2, y2 */
   if (brw->gen >= 8) {
      OUT_RELOC64(dst_buffer,
                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                  dst_offset);
   } else {
      OUT_RELOC(dst_buffer,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                dst_offset);
   }
   OUT_BATCH(0); /* bg */
   OUT_BATCH(fg_color); /* fg */
   OUT_BATCH(0); /* pattern base addr */
   if (brw->gen >= 8)
      OUT_BATCH(0);

   OUT_BATCH(blit_cmd | ((3 - 2) + dwords));
   OUT_BATCH(SET_FIELD(y, BLT_Y) | SET_FIELD(x, BLT_X));
   OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X));
   ADVANCE_BATCH();

   intel_batchbuffer_data(brw, src_bits, dwords * 4, BLT_RING);

   intel_batchbuffer_emit_mi_flush(brw);

   return true;
}