static inline void
brw_upload_programs(struct brw_context *brw,
                    enum brw_pipeline pipeline)
{
   struct gl_context *ctx = &brw->ctx;
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (pipeline == BRW_RENDER_PIPELINE) {
      brw_upload_vs_prog(brw);
      brw_upload_tess_programs(brw);

      if (brw->programs[MESA_SHADER_GEOMETRY]) {
         brw_upload_gs_prog(brw);
      } else {
         brw->gs.base.prog_data = NULL;
         if (devinfo->gen < 7)
            brw_upload_ff_gs_prog(brw);
      }

      /* Update the VUE map for data exiting the GS stage of the pipeline.
       * This comes from the last enabled shader stage.
       */
      GLbitfield64 old_slots = brw->vue_map_geom_out.slots_valid;
      bool old_separate = brw->vue_map_geom_out.separate;
      struct brw_vue_prog_data *vue_prog_data;
      if (brw->programs[MESA_SHADER_GEOMETRY])
         vue_prog_data = brw_vue_prog_data(brw->gs.base.prog_data);
      else if (brw->programs[MESA_SHADER_TESS_EVAL])
         vue_prog_data = brw_vue_prog_data(brw->tes.base.prog_data);
      else
         vue_prog_data = brw_vue_prog_data(brw->vs.base.prog_data);

      brw->vue_map_geom_out = vue_prog_data->vue_map;

      /* If the layout has changed, signal BRW_NEW_VUE_MAP_GEOM_OUT. */
      if (old_slots != brw->vue_map_geom_out.slots_valid ||
          old_separate != brw->vue_map_geom_out.separate)
         brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;

      if ((old_slots ^ brw->vue_map_geom_out.slots_valid) &
          VARYING_BIT_VIEWPORT) {
         ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT;
         brw->clip.viewport_count =
            (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ?
            ctx->Const.MaxViewports : 1;
      }

      brw_upload_wm_prog(brw);

      if (devinfo->gen < 6) {
         brw_upload_clip_prog(brw);
         brw_upload_sf_prog(brw);
      }

      brw_disk_cache_write_render_programs(brw);
   } else if (pipeline == BRW_COMPUTE_PIPELINE) {
      brw_upload_cs_prog(brw);
      brw_disk_cache_write_compute_program(brw);
   }
}
Exemple #2
0
static void
upload_urb(struct brw_context *brw)
{
   /* BRW_NEW_VS_PROG_DATA */
   const struct brw_vue_prog_data *vs_vue_prog_data =
      brw_vue_prog_data(brw->vs.base.prog_data);
   const unsigned vs_size = MAX2(vs_vue_prog_data->urb_entry_size, 1);

   /* BRW_NEW_GEOMETRY_PROGRAM, BRW_NEW_GS_PROG_DATA */
   const bool gs_present = brw->ff_gs.prog_active || brw->geometry_program;

   /* Whe using GS to do transform feedback only we use the same VUE layout for
    * VS outputs and GS outputs (as it's what the SF and Clipper expect), so we
    * can simply make the GS URB entry size the same as for the VS.  This may
    * technically be too large in cases where we have few vertex attributes and
    * a lot of varyings, since the VS size is determined by the larger of the
    * two. For now, it's safe.
    *
    * For user-provided GS the assumption above does not hold since the GS
    * outputs can be different from the VS outputs.
    */
   unsigned gs_size = vs_size;
   if (brw->geometry_program) {
      const struct brw_vue_prog_data *gs_vue_prog_data =
         brw_vue_prog_data(brw->gs.base.prog_data);
      gs_size = gs_vue_prog_data->urb_entry_size;
      assert(gs_size >= 1);
   }

   gen6_upload_urb(brw, vs_size, gs_present, gs_size);
}
Exemple #3
0
static inline void
brw_upload_programs(struct brw_context *brw,
                    enum brw_pipeline pipeline)
{
   struct gl_context *ctx = &brw->ctx;

   if (pipeline == BRW_RENDER_PIPELINE) {
      brw_upload_vs_prog(brw);
      brw_upload_tess_programs(brw);

      if (brw->gen < 6)
         brw_upload_ff_gs_prog(brw);
      else
         brw_upload_gs_prog(brw);

      /* Update the VUE map for data exiting the GS stage of the pipeline.
       * This comes from the last enabled shader stage.
       */
      GLbitfield64 old_slots = brw->vue_map_geom_out.slots_valid;
      bool old_separate = brw->vue_map_geom_out.separate;
      struct brw_vue_prog_data *vue_prog_data;
      if (brw->geometry_program)
         vue_prog_data = brw_vue_prog_data(brw->gs.base.prog_data);
      else if (brw->tess_eval_program)
         vue_prog_data = brw_vue_prog_data(brw->tes.base.prog_data);
      else
         vue_prog_data = brw_vue_prog_data(brw->vs.base.prog_data);

      brw->vue_map_geom_out = vue_prog_data->vue_map;

      /* If the layout has changed, signal BRW_NEW_VUE_MAP_GEOM_OUT. */
      if (old_slots != brw->vue_map_geom_out.slots_valid ||
          old_separate != brw->vue_map_geom_out.separate)
         brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;

      if ((old_slots ^ brw->vue_map_geom_out.slots_valid) &
          VARYING_BIT_VIEWPORT) {
         ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT;
         brw->clip.viewport_count =
            (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ?
            ctx->Const.MaxViewports : 1;
      }

      brw_upload_wm_prog(brw);

      if (brw->gen < 6) {
         brw_upload_clip_prog(brw);
         brw_upload_sf_prog(brw);
      }
   } else if (pipeline == BRW_COMPUTE_PIPELINE) {
      brw_upload_cs_prog(brw);
   }
}
Exemple #4
0
static void
gen7_upload_ds_state(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_stage_state *stage_state = &brw->tes.base;
   /* BRW_NEW_TESS_PROGRAMS */
   bool active = brw->tess_eval_program;

   /* BRW_NEW_TES_PROG_DATA */
   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_state->prog_data);
   const struct brw_tes_prog_data *tes_prog_data =
      brw_tes_prog_data(stage_state->prog_data);

   const unsigned thread_count = (devinfo->max_tes_threads - 1) <<
      (brw->is_haswell ? HSW_DS_MAX_THREADS_SHIFT : GEN7_DS_MAX_THREADS_SHIFT);

   if (active) {
      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
      OUT_BATCH(stage_state->prog_offset);
      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
                          GEN7_DS_SAMPLER_COUNT) |
                SET_FIELD(prog_data->binding_table.size_bytes / 4,
                          GEN7_DS_BINDING_TABLE_ENTRY_COUNT));
      if (prog_data->total_scratch) {
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0);
      }
      OUT_BATCH(SET_FIELD(prog_data->dispatch_grf_start_reg,
                          GEN7_DS_DISPATCH_START_GRF) |
                SET_FIELD(vue_prog_data->urb_read_length,
                          GEN7_DS_URB_READ_LENGTH));

      OUT_BATCH(GEN7_DS_ENABLE |
                GEN7_DS_STATISTICS_ENABLE |
                thread_count |
                (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ?
                 GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
   brw->tes.enabled = active;
}
Exemple #5
0
static void
upload_gs_state(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_stage_state *stage_state = &brw->gs.base;
   const int max_threads_shift = brw->is_haswell ?
      HSW_GS_MAX_THREADS_SHIFT : GEN6_GS_MAX_THREADS_SHIFT;
   /* BRW_NEW_GEOMETRY_PROGRAM */
   bool active = brw->geometry_program;
   /* BRW_NEW_GS_PROG_DATA */
   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_state->prog_data);
   const struct brw_gs_prog_data *gs_prog_data =
      brw_gs_prog_data(stage_state->prog_data);

   /**
    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
    * Geometry > Geometry Shader > State:
    *
    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
    *     whole fixed function pipeline when the GS enable changes value in
    *     the 3DSTATE_GS."
    *
    * The hardware architects have clarified that in this context "flush the
    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
    * Stall" bit set.
    */
   if (!brw->is_haswell && brw->gt == 2 && brw->gs.enabled != active)
      gen7_emit_cs_stall_flush(brw);

   if (active) {
      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
      OUT_BATCH(stage_state->prog_offset);
      OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) <<
                 GEN6_GS_SAMPLER_COUNT_SHIFT) |
                ((prog_data->binding_table.size_bytes / 4) <<
                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));

      if (prog_data->total_scratch) {
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   ffs(stage_state->per_thread_scratch) - 11);
      } else {
         OUT_BATCH(0);
      }

      uint32_t dw4 =
         ((gs_prog_data->output_vertex_size_hwords * 2 - 1) <<
          GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT) |
         (gs_prog_data->output_topology << GEN7_GS_OUTPUT_TOPOLOGY_SHIFT) |
         (vue_prog_data->urb_read_length <<
          GEN6_GS_URB_READ_LENGTH_SHIFT) |
         (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) |
         (prog_data->dispatch_grf_start_reg <<
          GEN6_GS_DISPATCH_START_GRF_SHIFT);

      /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
       * Ivy Bridge and Haswell.
       *
       * On Ivy Bridge, setting this bit causes the vertices of a triangle
       * strip to be delivered to the geometry shader in an order that does
       * not strictly follow the OpenGL spec, but preserves triangle
       * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
       * the geometry shader sees triangles:
       *
       * (1, 2, 3), (2, 4, 3), (3, 4, 5)
       *
       * (Clearing the bit is even worse, because it fails to preserve
       * orientation).
       *
       * Triangle strips with adjacency always ordered in a way that preserves
       * triangle orientation but does not strictly follow the OpenGL spec,
       * regardless of the setting of this bit.
       *
       * On Haswell, both triangle strips and triangle strips with adjacency
       * are always ordered in a way that preserves triangle orientation.
       * Setting this bit causes the ordering to strictly follow the OpenGL
       * spec.
       *
       * So in either case we want to set the bit.  Unfortunately on Ivy
       * Bridge this will get the order close to correct but not perfect.
       */
      uint32_t dw5 =
         ((devinfo->max_gs_threads - 1) << max_threads_shift) |
         (gs_prog_data->control_data_header_size_hwords <<
          GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
         ((gs_prog_data->invocations - 1) <<
          GEN7_GS_INSTANCE_CONTROL_SHIFT) |
         SET_FIELD(vue_prog_data->dispatch_mode, GEN7_GS_DISPATCH_MODE) |
         GEN6_GS_STATISTICS_ENABLE |
         (gs_prog_data->include_primitive_id ?
          GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) |
         GEN7_GS_REORDER_TRAILING |
         GEN7_GS_ENABLE;
      uint32_t dw6 = 0;

      if (brw->is_haswell) {
         dw6 |= gs_prog_data->control_data_format <<
            HSW_GS_CONTROL_DATA_FORMAT_SHIFT;
      } else {
         dw5 |= gs_prog_data->control_data_format <<
            IVB_GS_CONTROL_DATA_FORMAT_SHIFT;
      }

      OUT_BATCH(dw4);
      OUT_BATCH(dw5);
      OUT_BATCH(dw6);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
      OUT_BATCH(0); /* prog_bo */
      OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
                (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
      OUT_BATCH(0); /* scratch space base offset */
      OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
                (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) |
                GEN7_GS_INCLUDE_VERTEX_HANDLES |
                (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT));
      OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) |
                GEN6_GS_STATISTICS_ENABLE);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
   brw->gs.enabled = active;
}
Exemple #6
0
void
brw_codegen_ff_gs_prog(struct brw_context *brw,
                       struct brw_ff_gs_prog_key *key)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct brw_ff_gs_compile c;
   const GLuint *program;
   void *mem_ctx;
   GLuint program_size;

   memset(&c, 0, sizeof(c));

   c.key = *key;
   c.vue_map = brw_vue_prog_data(brw->vs.base.prog_data)->vue_map;
   c.nr_regs = (c.vue_map.num_slots + 1)/2;

   mem_ctx = ralloc_context(NULL);

   /* Begin the compilation:
    */
   brw_init_codegen(&brw->screen->devinfo, &c.func, mem_ctx);

   c.func.single_program_flow = 1;

   /* For some reason the thread is spawned with only 4 channels
    * unmasked.
    */
   brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE);

   if (devinfo->gen >= 6) {
      unsigned num_verts;
      bool check_edge_flag;
      /* On Sandybridge, we use the GS for implementing transform feedback
       * (called "Stream Out" in the PRM).
       */
      switch (key->primitive) {
      case _3DPRIM_POINTLIST:
         num_verts = 1;
         check_edge_flag = false;
	 break;
      case _3DPRIM_LINELIST:
      case _3DPRIM_LINESTRIP:
      case _3DPRIM_LINELOOP:
         num_verts = 2;
         check_edge_flag = false;
	 break;
      case _3DPRIM_TRILIST:
      case _3DPRIM_TRIFAN:
      case _3DPRIM_TRISTRIP:
      case _3DPRIM_RECTLIST:
	 num_verts = 3;
         check_edge_flag = false;
         break;
      case _3DPRIM_QUADLIST:
      case _3DPRIM_QUADSTRIP:
      case _3DPRIM_POLYGON:
         num_verts = 3;
         check_edge_flag = true;
         break;
      default:
	 unreachable("Unexpected primitive type in Gen6 SOL program.");
      }
      gen6_sol_program(&c, key, num_verts, check_edge_flag);
   } else {
      /* On Gen4-5, we use the GS to decompose certain types of primitives.
       * Note that primitives which don't require a GS program have already
       * been weeded out by now.
       */
      switch (key->primitive) {
      case _3DPRIM_QUADLIST:
	 brw_ff_gs_quads( &c, key );
	 break;
      case _3DPRIM_QUADSTRIP:
	 brw_ff_gs_quad_strip( &c, key );
	 break;
      case _3DPRIM_LINELOOP:
	 brw_ff_gs_lines( &c );
	 break;
      default:
	 ralloc_free(mem_ctx);
	 return;
      }
   }

   brw_compact_instructions(&c.func, 0, NULL);

   /* get the program
    */
   program = brw_get_program(&c.func, &program_size);

   if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
      fprintf(stderr, "gs:\n");
      brw_disassemble(&brw->screen->devinfo, c.func.store,
                      0, program_size, stderr);
      fprintf(stderr, "\n");
    }

   brw_upload_cache(&brw->cache, BRW_CACHE_FF_GS_PROG,
		    &c.key, sizeof(c.key),
		    program, program_size,
		    &c.prog_data, sizeof(c.prog_data),
		    &brw->ff_gs.prog_offset, &brw->ff_gs.prog_data);
   ralloc_free(mem_ctx);
}
Exemple #7
0
static void
brw_ff_gs_populate_key(struct brw_context *brw,
                       struct brw_ff_gs_prog_key *key)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   static const unsigned swizzle_for_offset[4] = {
      BRW_SWIZZLE4(0, 1, 2, 3),
      BRW_SWIZZLE4(1, 2, 3, 3),
      BRW_SWIZZLE4(2, 3, 3, 3),
      BRW_SWIZZLE4(3, 3, 3, 3)
   };

   struct gl_context *ctx = &brw->ctx;

   assert(devinfo->gen < 7);

   memset(key, 0, sizeof(*key));

   /* BRW_NEW_VS_PROG_DATA (part of VUE map) */
   key->attrs = brw_vue_prog_data(brw->vs.base.prog_data)->vue_map.slots_valid;

   /* BRW_NEW_PRIMITIVE */
   key->primitive = brw->primitive;

   /* _NEW_LIGHT */
   key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
   if (key->primitive == _3DPRIM_QUADLIST && ctx->Light.ShadeModel != GL_FLAT) {
      /* Provide consistent primitive order with brw_set_prim's
       * optimization of single quads to trifans.
       */
      key->pv_first = true;
   }

   if (devinfo->gen == 6) {
      /* On Gen6, GS is used for transform feedback. */
      /* BRW_NEW_TRANSFORM_FEEDBACK */
      if (_mesa_is_xfb_active_and_unpaused(ctx)) {
         const struct gl_program *prog =
            ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
         const struct gl_transform_feedback_info *linked_xfb_info =
            prog->sh.LinkedTransformFeedback;
         int i;

         /* Make sure that the VUE slots won't overflow the unsigned chars in
          * key->transform_feedback_bindings[].
          */
         STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);

         /* Make sure that we don't need more binding table entries than we've
          * set aside for use in transform feedback.  (We shouldn't, since we
          * set aside enough binding table entries to have one per component).
          */
         assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);

         key->need_gs_prog = true;
         key->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
         for (i = 0; i < key->num_transform_feedback_bindings; ++i) {
            key->transform_feedback_bindings[i] =
               linked_xfb_info->Outputs[i].OutputRegister;
            key->transform_feedback_swizzles[i] =
               swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
         }
      }
   } else {
      /* Pre-gen6, GS is used to transform QUADLIST, QUADSTRIP, and LINELOOP
       * into simpler primitives.
       */
      key->need_gs_prog = (brw->primitive == _3DPRIM_QUADLIST ||
                           brw->primitive == _3DPRIM_QUADSTRIP ||
                           brw->primitive == _3DPRIM_LINELOOP);
   }
}
Exemple #8
0
static void
upload_vs_state(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_stage_state *stage_state = &brw->vs.base;
   const struct brw_stage_prog_data *prog_data = stage_state->prog_data;
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_state->prog_data);
   uint32_t floating_point_mode = 0;

   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
    *   command that causes the VS Function Enable to toggle. Pipeline
    *   flush can be executed by sending a PIPE_CONTROL command with CS
    *   stall bit set and a post sync operation.
    *
    * We've already done such a flush at the start of state upload, so we
    * don't need to do another one here.
    */

   if (stage_state->push_const_size == 0) {
      /* Disable the push constant buffers. */
      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (5 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 |
		GEN6_CONSTANT_BUFFER_0_ENABLE |
		(5 - 2));
      /* Pointer to the VS constant buffer.  Covered by the set of
       * state flags from gen6_upload_vs_constants
       */
      OUT_BATCH(stage_state->push_const_offset +
		stage_state->push_const_size - 1);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   if (prog_data->use_alt_mode)
      floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT;

   BEGIN_BATCH(6);
   OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
   OUT_BATCH(stage_state->prog_offset);
   OUT_BATCH(floating_point_mode |
	     ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_VS_SAMPLER_COUNT_SHIFT) |
             ((prog_data->binding_table.size_bytes / 4) <<
              GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));

   if (prog_data->total_scratch) {
      OUT_RELOC(stage_state->scratch_bo,
		I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
		ffs(stage_state->per_thread_scratch) - 11);
   } else {
      OUT_BATCH(0);
   }

   OUT_BATCH((prog_data->dispatch_grf_start_reg <<
              GEN6_VS_DISPATCH_START_GRF_SHIFT) |
	     (vue_prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
	     (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));

   OUT_BATCH(((devinfo->max_vs_threads - 1) << GEN6_VS_MAX_THREADS_SHIFT) |
	     GEN6_VS_STATISTICS_ENABLE |
	     GEN6_VS_ENABLE);
   ADVANCE_BATCH();

   /* Based on my reading of the simulator, the VS constants don't get
    * pulled into the VS FF unit until an appropriate pipeline flush
    * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
    * references to them into a little FIFO.  The flushes are common,
    * but don't reliably happen between this and a 3DPRIMITIVE, causing
    * the primitive to use the wrong constants.  Then the FIFO
    * containing the constant setup gets added to again on the next
    * constants change, and eventually when a flush does happen the
    * unit is overwhelmed by constant changes and dies.
    *
    * To avoid this, send a PIPE_CONTROL down the line that will
    * update the unit immediately loading the constants.  The flush
    * type bits here were those set by the STATE_BASE_ADDRESS whose
    * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
    * bug reports that led to this workaround, and may be more than
    * what is strictly required to avoid the issue.
    */
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_DEPTH_STALL |
                               PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                               PIPE_CONTROL_STATE_CACHE_INVALIDATE);
}