Example #1
void
brw_upload_cs_prog(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   struct brw_cs_prog_key key;
   struct brw_compute_program *cp = (struct brw_compute_program *)
      brw->compute_program;

   if (!cp)
      return;

   if (!brw_state_dirty(brw, _NEW_TEXTURE, BRW_NEW_COMPUTE_PROGRAM))
      return;

   brw->cs.base.sampler_count =
      _mesa_fls(ctx->ComputeProgram._Current->Base.SamplersUsed);

   brw_cs_populate_key(brw, &key);

   if (!brw_search_cache(&brw->cache, BRW_CACHE_CS_PROG,
                         &key, sizeof(key),
                         &brw->cs.base.prog_offset, &brw->cs.prog_data)) {
      bool success =
         brw_codegen_cs_prog(brw,
                             ctx->Shader.CurrentProgram[MESA_SHADER_COMPUTE],
                             cp, &key);
      (void) success;
      assert(success);
   }
   brw->cs.base.prog_data = &brw->cs.prog_data->base;
}
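The idiom to notice here, which recurs in the examples below, is sampler_count = _mesa_fls(SamplersUsed). _mesa_fls ("find last set") returns the 1-based position of the most significant set bit, so applied to a usage bitmask it yields the highest used sampler index plus one, i.e. the number of table slots needed. A minimal sketch of that contract (Mesa's real implementation is commonly built on __builtin_clz; fls_sketch here is an illustrative stand-in, not the real function):

#include <assert.h>

/* Illustrative reimplementation of the _mesa_fls() contract: return the
 * 1-based position of the most significant set bit, or 0 when n == 0.
 */
static unsigned
fls_sketch(unsigned n)
{
   unsigned pos = 0;
   while (n) {
      pos++;
      n >>= 1;
   }
   return pos;
}

int
main(void)
{
   /* SamplersUsed with samplers 0 and 4 set: five slots (0..4) needed. */
   assert(fls_sketch((1u << 0) | (1u << 4)) == 5);
   /* No samplers used: zero slots. */
   assert(fls_sketch(0) == 0);
   return 0;
}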
Example #2
static void
gen7_upload_samplers(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->intel.ctx;
   struct gen7_sampler_state *samplers;

   /* BRW_NEW_VERTEX_PROGRAM and BRW_NEW_FRAGMENT_PROGRAM */
   struct gl_program *vs = (struct gl_program *) brw->vertex_program;
   struct gl_program *fs = (struct gl_program *) brw->fragment_program;

   GLbitfield SamplersUsed = vs->SamplersUsed | fs->SamplersUsed;

   brw->sampler.count = _mesa_fls(SamplersUsed);

   if (brw->sampler.count == 0)
      return;

   samplers = brw_state_batch(brw, AUB_TRACE_SAMPLER_STATE,
			      brw->sampler.count * sizeof(*samplers),
			      32, &brw->sampler.offset);
   memset(samplers, 0, brw->sampler.count * sizeof(*samplers));

   for (unsigned s = 0; s < brw->sampler.count; s++) {
      if (SamplersUsed & (1 << s)) {
         const unsigned unit = (fs->SamplersUsed & (1 << s)) ?
            fs->SamplerUnits[s] : vs->SamplerUnits[s];
         if (ctx->Texture.Unit[unit]._ReallyEnabled)
            gen7_update_sampler_state(brw, unit, s, &samplers[s]);
      }
   }

   brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
}
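Note why gen7_upload_samplers sizes its table with _mesa_fls rather than a bit count: the table is indexed by sampler number, and the mask can be sparse (as a comment in Example #4 below notes, ARB programs use the texture unit number as the sampler index), so the allocation has to cover every index up to the highest set bit, not just the number of set bits. A small sketch of the difference, assuming GCC/Clang builtins:

#include <stdio.h>

int
main(void)
{
   /* Sparse mask: only samplers 2 and 4 are used. */
   unsigned SamplersUsed = (1u << 2) | (1u << 4);

   /* A population count gives 2 entries -- too few; indexing the
    * state table with sampler 4 would run off the end.
    */
   int bit_count = __builtin_popcount(SamplersUsed);

   /* Find-last-set gives 5 entries, so indices 0..4 are all
    * addressable (unused slots are simply memset to zero, as above).
    */
   int last_set = 32 - __builtin_clz(SamplersUsed);

   printf("popcount=%d fls=%d\n", bit_count, last_set); /* popcount=2 fls=5 */
   return 0;
}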
Example #3
bool
brw_vs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_vs_prog_key key;
   uint32_t old_prog_offset = brw->vs.base.prog_offset;
   struct brw_vs_prog_data *old_prog_data = brw->vs.prog_data;
   bool success;

   if (!prog->_LinkedShaders[MESA_SHADER_VERTEX])
      return true;

   struct gl_vertex_program *vp = (struct gl_vertex_program *)
      prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program;
   struct brw_vertex_program *bvp = brw_vertex_program(vp);

   memset(&key, 0, sizeof(key));

   key.base.program_string_id = bvp->id;
   key.base.clamp_vertex_color = ctx->API == API_OPENGL_COMPAT;

   unsigned sampler_count = _mesa_fls(vp->Base.SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (vp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.base.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.base.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   success = do_vs_prog(brw, prog, bvp, &key);

   brw->vs.base.prog_offset = old_prog_offset;
   brw->vs.prog_data = old_prog_data;

   return success;
}
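The swizzle keys built above pack four channel selectors into a single integer. A sketch of that packing at three bits per channel, mirroring Mesa's MAKE_SWIZZLE4 and SWIZZLE_* names (the constant values below are assumptions modeled on the Mesa headers, not verified against them):

#include <stdio.h>

/* Assumed channel selector values, modeled on Mesa's SWIZZLE_* enums. */
enum { SWZ_X = 0, SWZ_Y = 1, SWZ_Z = 2, SWZ_W = 3, SWZ_ZERO = 4, SWZ_ONE = 5 };

/* Three bits per channel, low bits first, in the style of MAKE_SWIZZLE4. */
#define MAKE_SWZ4(a, b, c, d) \
   (((a) << 0) | ((b) << 3) | ((c) << 6) | ((d) << 9))

int
main(void)
{
   /* Shadow sampler default X, X, X, 1: the depth comparison result is
    * broadcast to RGB and alpha is forced to one.
    */
   unsigned shadow = MAKE_SWZ4(SWZ_X, SWZ_X, SWZ_X, SWZ_ONE);
   /* Identity swizzle, analogous to SWIZZLE_XYZW in the key above. */
   unsigned xyzw = MAKE_SWZ4(SWZ_X, SWZ_Y, SWZ_Z, SWZ_W);

   printf("shadow=0x%x xyzw=0x%x\n", shadow, xyzw);
   return 0;
}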
Example #4
/* May fail if out of video memory for texture or vbo upload, or on
 * fallback conditions.
 */
static void
brw_try_draw_prims(struct gl_context *ctx,
                   const struct gl_client_array *arrays[],
                   const struct _mesa_prim *prims,
                   GLuint nr_prims,
                   const struct _mesa_index_buffer *ib,
                   GLuint min_index,
                   GLuint max_index,
                   struct gl_buffer_object *indirect)
{
   struct brw_context *brw = brw_context(ctx);
   GLuint i;
   bool fail_next = false;

   if (ctx->NewState)
      _mesa_update_state(ctx);

   /* Find the highest sampler unit used by each shader program.  A bit-count
    * won't work since ARB programs use the texture unit number as the sampler
    * index.
    */
   brw->wm.base.sampler_count =
      _mesa_fls(ctx->FragmentProgram._Current->Base.SamplersUsed);
   brw->gs.base.sampler_count = ctx->GeometryProgram._Current ?
      _mesa_fls(ctx->GeometryProgram._Current->Base.SamplersUsed) : 0;
   brw->vs.base.sampler_count =
      _mesa_fls(ctx->VertexProgram._Current->Base.SamplersUsed);

   /* We have to validate the textures *before* checking for fallbacks;
    * otherwise, the software fallback won't be able to rely on the
    * texture state, the firstLevel and lastLevel fields won't be
    * set in the intel texture object (they'll both be 0), and the
    * software fallback will segfault if it attempts to access any
    * texture level other than level 0.
    */
   brw_validate_textures(brw);

   intel_prepare_render(brw);

   /* This workaround has to happen outside of brw_upload_render_state()
    * because it may flush the batchbuffer for a blit, affecting the state
    * flags.
    */
   brw_workaround_depthstencil_alignment(brw, 0);

   /* Bind all inputs, derive varying and size information:
    */
   brw_merge_inputs(brw, arrays);

   brw->ib.ib = ib;
   brw->ctx.NewDriverState |= BRW_NEW_INDICES;

   brw->vb.min_index = min_index;
   brw->vb.max_index = max_index;
   brw->ctx.NewDriverState |= BRW_NEW_VERTICES;

   for (i = 0; i < nr_prims; i++) {
      int estimated_max_prim_size;
      const int sampler_state_size = 16;

      estimated_max_prim_size = 512; /* batchbuffer commands */
      estimated_max_prim_size += BRW_MAX_TEX_UNIT *
         (sampler_state_size + sizeof(struct gen5_sampler_default_color));
      estimated_max_prim_size += 1024; /* gen6 VS push constants */
      estimated_max_prim_size += 1024; /* gen6 WM push constants */
      estimated_max_prim_size += 512; /* misc. pad */

      /* Flush the batch if it's approaching full, so that we don't wrap while
       * we've got validated state that needs to be in the same batch as the
       * primitives.
       */
      intel_batchbuffer_require_space(brw, estimated_max_prim_size, RENDER_RING);
      intel_batchbuffer_save_state(brw);

      if (brw->num_instances != prims[i].num_instances ||
          brw->basevertex != prims[i].basevertex) {
         brw->num_instances = prims[i].num_instances;
         brw->basevertex = prims[i].basevertex;
         if (i > 0) { /* For i == 0 we just did this before the loop */
            brw->ctx.NewDriverState |= BRW_NEW_VERTICES;
            brw_merge_inputs(brw, arrays);
         }
      }

      brw->draw.gl_basevertex =
         prims[i].indexed ? prims[i].basevertex : prims[i].start;

      drm_intel_bo_unreference(brw->draw.draw_params_bo);

      if (prims[i].is_indirect) {
         /* Point draw_params_bo at the indirect buffer. */
         brw->draw.draw_params_bo =
            intel_buffer_object(ctx->DrawIndirectBuffer)->buffer;
         drm_intel_bo_reference(brw->draw.draw_params_bo);
         brw->draw.draw_params_offset =
            prims[i].indirect_offset + (prims[i].indexed ? 12 : 8);
      } else {
         /* Set draw_params_bo to NULL so brw_prepare_vertices knows it
          * has to upload gl_BaseVertex and such if they're needed.
          */
         brw->draw.draw_params_bo = NULL;
         brw->draw.draw_params_offset = 0;
      }

      if (brw->gen < 6)
	 brw_set_prim(brw, &prims[i]);
      else
	 gen6_set_prim(brw, &prims[i]);

retry:

      /* Note that before the loop, brw->ctx.NewDriverState was set to a
       * nonzero value, and that the only state updated elsewhere in the
       * loop is in *_set_prim or intel_batchbuffer_flush(), which only
       * affects brw->ctx.NewDriverState.
       */
      if (brw->ctx.NewDriverState) {
	 brw->no_batch_wrap = true;
	 brw_upload_render_state(brw);
      }

      brw_emit_prim(brw, &prims[i], brw->primitive);

      brw->no_batch_wrap = false;

      if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
	 if (!fail_next) {
	    intel_batchbuffer_reset_to_saved(brw);
	    intel_batchbuffer_flush(brw);
	    fail_next = true;
	    goto retry;
	 } else {
            int ret = intel_batchbuffer_flush(brw);
            WARN_ONCE(ret == -ENOSPC,
                      "i965: Single primitive emit exceeded "
                      "available aperture space\n");
	 }
      }

      /* Now that we know we haven't run out of aperture space, we can safely
       * reset the dirty bits.
       */
      if (brw->ctx.NewDriverState)
         brw_render_state_finished(brw);
   }

   if (brw->always_flush_batch)
      intel_batchbuffer_flush(brw);

   brw_state_cache_check_size(brw);
   brw_postdraw_set_buffers_need_resolve(brw);

   return;
}
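The heart of this function is the retry block: batch state is saved before the primitive is emitted, and on aperture overflow the batch is rewound to the save point, flushed, and the emit retried exactly once into an empty batch. A stripped-down sketch of that pattern with hypothetical helpers (the stubs simulate one overflow followed by success; none of these names are the real Mesa API):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the batchbuffer helpers used above. */
static int attempts = 0;
static void save_state(void)           { puts("save"); }
static void reset_to_saved(void)       { puts("rewind"); }
static void flush(void)                { puts("flush"); }
static void emit_primitive(void)       { attempts++; puts("emit"); }
static bool aperture_overflowed(void)  { return attempts < 2; }

static void
emit_with_retry(void)
{
   bool fail_next = false;

   save_state();
retry:
   emit_primitive();

   if (aperture_overflowed()) {
      if (!fail_next) {
         /* First failure: rewind to the saved point, flush what did
          * fit, then retry the primitive in a fresh, empty batch.
          */
         reset_to_saved();
         flush();
         fail_next = true;
         goto retry;
      }
      /* Second failure: even an empty batch can't hold the primitive;
       * flush anyway and warn, as the WARN_ONCE above does.
       */
      flush();
      fprintf(stderr, "primitive exceeded aperture\n");
   }
}

int
main(void)
{
   emit_with_retry();
   return 0;
}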
Example #5
/* May fail if out of video memory for texture or vbo upload, or on
 * fallback conditions.
 */
static bool brw_try_draw_prims( struct gl_context *ctx,
				     const struct gl_client_array *arrays[],
				     const struct _mesa_prim *prims,
				     GLuint nr_prims,
				     const struct _mesa_index_buffer *ib,
				     GLuint min_index,
				     GLuint max_index,
				     struct gl_buffer_object *indirect)
{
   struct brw_context *brw = brw_context(ctx);
   bool retval = true;
   GLuint i;
   bool fail_next = false;

   if (ctx->NewState)
      _mesa_update_state( ctx );

   /* Find the highest sampler unit used by each shader program.  A bit-count
    * won't work since ARB programs use the texture unit number as the sampler
    * index.
    */
   brw->wm.base.sampler_count =
      _mesa_fls(ctx->FragmentProgram._Current->Base.SamplersUsed);
   brw->gs.base.sampler_count = ctx->GeometryProgram._Current ?
      _mesa_fls(ctx->GeometryProgram._Current->Base.SamplersUsed) : 0;
   brw->vs.base.sampler_count =
      _mesa_fls(ctx->VertexProgram._Current->Base.SamplersUsed);

   /* We have to validate the textures *before* checking for fallbacks;
    * otherwise, the software fallback won't be able to rely on the
    * texture state, the firstLevel and lastLevel fields won't be
    * set in the intel texture object (they'll both be 0), and the 
    * software fallback will segfault if it attempts to access any
    * texture level other than level 0.
    */
   brw_validate_textures( brw );

   intel_prepare_render(brw);

   /* This workaround has to happen outside of brw_upload_state() because it
    * may flush the batchbuffer for a blit, affecting the state flags.
    */
   brw_workaround_depthstencil_alignment(brw, 0);

   /* Resolves must occur after updating renderbuffers, updating context state,
    * and finalizing textures but before setting up any hardware state for
    * this draw call.
    */
   brw_predraw_resolve_buffers(brw);

   /* Bind all inputs, derive varying and size information:
    */
   brw_merge_inputs( brw, arrays );

   brw->ib.ib = ib;
   brw->state.dirty.brw |= BRW_NEW_INDICES;

   brw->vb.min_index = min_index;
   brw->vb.max_index = max_index;
   brw->state.dirty.brw |= BRW_NEW_VERTICES;

   for (i = 0; i < nr_prims; i++) {
      int estimated_max_prim_size;

      estimated_max_prim_size = 512; /* batchbuffer commands */
      estimated_max_prim_size += (BRW_MAX_TEX_UNIT *
				  (sizeof(struct brw_sampler_state) +
				   sizeof(struct gen5_sampler_default_color)));
      estimated_max_prim_size += 1024; /* gen6 VS push constants */
      estimated_max_prim_size += 1024; /* gen6 WM push constants */
      estimated_max_prim_size += 512; /* misc. pad */

      /* Flush the batch if it's approaching full, so that we don't wrap while
       * we've got validated state that needs to be in the same batch as the
       * primitives.
       */
      intel_batchbuffer_require_space(brw, estimated_max_prim_size, RENDER_RING);
      intel_batchbuffer_save_state(brw);

      if (brw->num_instances != prims[i].num_instances) {
         brw->num_instances = prims[i].num_instances;
         brw->state.dirty.brw |= BRW_NEW_VERTICES;
         brw_merge_inputs(brw, arrays);
      }
      if (brw->basevertex != prims[i].basevertex) {
         brw->basevertex = prims[i].basevertex;
         brw->state.dirty.brw |= BRW_NEW_VERTICES;
         brw_merge_inputs(brw, arrays);
      }
      if (brw->gen < 6)
	 brw_set_prim(brw, &prims[i]);
      else
	 gen6_set_prim(brw, &prims[i]);

retry:
      /* Note that before the loop, brw->state.dirty.brw was set to a
       * nonzero value, and that the only state updated elsewhere in the
       * loop is in *_set_prim or intel_batchbuffer_flush(), which only
       * affects brw->state.dirty.brw.
       */
      if (brw->state.dirty.brw) {
	 brw->no_batch_wrap = true;
	 brw_upload_state(brw);
      }

      brw_emit_prim(brw, &prims[i], brw->primitive);

      brw->no_batch_wrap = false;

      if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
	 if (!fail_next) {
	    intel_batchbuffer_reset_to_saved(brw);
	    intel_batchbuffer_flush(brw);
	    fail_next = true;
	    goto retry;
	 } else {
	    if (intel_batchbuffer_flush(brw) == -ENOSPC) {
	       static bool warned = false;

	       if (!warned) {
		  fprintf(stderr, "i965: Single primitive emit exceeded "
			  "available aperture space\n");
		  warned = true;
	       }

	       retval = false;
	    }
	 }
      }
   }

   if (brw->always_flush_batch)
      intel_batchbuffer_flush(brw);

   brw_state_cache_check_size(brw);
   brw_postdraw_set_buffers_need_resolve(brw);

   return retval;
}
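This older revision open-codes its one-shot warning with a static flag; the newer revision in Example #4 wraps the same pattern in Mesa's WARN_ONCE macro. A sketch of a do-once warning macro in that spirit (not Mesa's exact definition):

#include <stdbool.h>
#include <stdio.h>

/* One-shot warning: the static flag lives inside the macro expansion,
 * so each use site warns at most once per process.
 */
#define WARN_ONCE_SKETCH(cond, ...)                   \
   do {                                               \
      if (cond) {                                     \
         static bool _warned = false;                 \
         if (!_warned) {                              \
            fprintf(stderr, "WARNING: " __VA_ARGS__); \
            _warned = true;                           \
         }                                            \
      }                                               \
   } while (0)

int
main(void)
{
   for (int i = 0; i < 3; i++)
      WARN_ONCE_SKETCH(i >= 0, "fires only once, i=%d\n", i);
   return 0;
}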
Example #6
/**
 * Write out a batch of 32 control data bits from the control_data_bits
 * register to the URB.
 *
 * The current value of the vertex_count register determines which DWORD in
 * the URB receives the control data bits.  The control_data_bits register is
 * assumed to contain the correct data for the vertex that was most recently
 * output, and all previous vertices that share the same DWORD.
 *
 * This function takes care of ensuring that if no vertices have been output
 * yet, no control bits are emitted.
 */
void
vec4_gs_visitor::emit_control_data_bits()
{
   assert(c->control_data_bits_per_vertex != 0);

   /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
    * granularity, we need to use two tricks to ensure that the batch of 32
    * control data bits is written to the appropriate DWORD in the URB.  To
    * select which vec4 we are writing to, we use the "slot {0,1} offset"
    * fields of the message header.  To select which DWORD in the vec4 we are
    * writing to, we use the channel mask fields of the message header.  To
    * avoid penalizing geometry shaders that emit a small number of vertices
    * with extra bookkeeping, we only do each of these tricks when
    * c->prog_data.control_data_header_size_bits is large enough to make it
    * necessary.
    *
    * Note: this means that if we're outputting just a single DWORD of control
    * data bits, we'll actually replicate it four times since we won't do any
    * channel masking.  But that's not a problem since in this case the
    * hardware only pays attention to the first DWORD.
    */
   enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
   if (c->control_data_header_size_bits > 32)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
   if (c->control_data_header_size_bits > 128)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;

   /* If vertex_count is 0, then no control data bits have been accumulated
    * yet, so we should do nothing.
    */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* If we are using either channel masks or a per-slot offset, then we
       * need to figure out which DWORD we are trying to write to, using the
       * formula:
       *
       *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
       *
       * Since bits_per_vertex is a power of two, and is known at compile
       * time, this can be optimized to:
       *
       *     dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
       *
       * Note that _mesa_fls() returns the 1-based position of the highest
       * set bit (log2(bits_per_vertex) + 1 for a power of two), which is
       * why the shift below is written as 6 - log2_bits_per_vertex.
       */
      src_reg dword_index(this, glsl_type::uint_type);
      if (urb_write_flags) {
         src_reg prev_count(this, glsl_type::uint_type);
         emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
         unsigned log2_bits_per_vertex =
            _mesa_fls(c->control_data_bits_per_vertex);
         emit(SHR(dst_reg(dword_index), prev_count,
                  (uint32_t) (6 - log2_bits_per_vertex)));
      }

      /* Start building the URB write message.  The first MRF gets a copy of
       * R0.
       */
      int base_mrf = 1;
      dst_reg mrf_reg(MRF, base_mrf);
      src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      vec4_instruction *inst = emit(MOV(mrf_reg, r0));
      inst->force_writemask_all = true;

      if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
         /* Set the per-slot offset to dword_index / 4, so that we'll write
          * to the appropriate OWORD within the control data header.
          */
         src_reg per_slot_offset(this, glsl_type::uint_type);
         emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
         emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
      }

      if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
         /* Set the channel masks to 1 << (dword_index % 4), so that we'll
          * write to the appropriate DWORD within the OWORD.  We need to do
          * this computation with force_writemask_all, otherwise garbage data
          * from invocation 0 might clobber the mask for invocation 1 when
          * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
          * together.
          */
         src_reg channel(this, glsl_type::uint_type);
         inst = emit(AND(dst_reg(channel), dword_index, 3u));
         inst->force_writemask_all = true;
         src_reg one(this, glsl_type::uint_type);
         inst = emit(MOV(dst_reg(one), 1u));
         inst->force_writemask_all = true;
         src_reg channel_mask(this, glsl_type::uint_type);
         inst = emit(SHL(dst_reg(channel_mask), one, channel));
         inst->force_writemask_all = true;
         emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
                                               channel_mask);
         emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
      }

      /* Store the control data bits in the message payload and send it. */
      dst_reg mrf_reg2(MRF, base_mrf + 1);
      inst = emit(MOV(mrf_reg2, this->control_data_bits));
      inst->force_writemask_all = true;
      inst = emit(GS_OPCODE_URB_WRITE);
      inst->urb_write_flags = urb_write_flags;
      /* We need to increment Global Offset by 256-bits to make room for
       * Broadwell's extra "Vertex Count" payload at the beginning of the
       * URB entry.  Since this is an OWord message, Global Offset is counted
       * in 128-bit units, so we must set it to 2.
       */
      if (brw->gen >= 8)
         inst->offset = 2;
      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   emit(BRW_OPCODE_ENDIF);
}
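To make the offset arithmetic concrete: a small check, in plain C, that the shift the code emits matches the unoptimized dword_index formula from the comment (fls_sketch stands in for _mesa_fls, as in the first sketch above; bits_per_vertex must be a power of two, as the comment requires):

#include <assert.h>

/* 1-based position of the most significant set bit, as _mesa_fls(). */
static unsigned
fls_sketch(unsigned n)
{
   unsigned pos = 0;
   while (n) {
      pos++;
      n >>= 1;
   }
   return pos;
}

int
main(void)
{
   /* All power-of-two control data widths up to 32 bits per vertex. */
   for (unsigned bits_per_vertex = 1; bits_per_vertex <= 32;
        bits_per_vertex <<= 1) {
      for (unsigned vertex_count = 1; vertex_count <= 96; vertex_count++) {
         /* Unoptimized formula from the comment. */
         unsigned ref = (vertex_count - 1) * bits_per_vertex / 32;
         /* Shift the code emits: 6 - _mesa_fls(bits_per_vertex). */
         unsigned opt =
            (vertex_count - 1) >> (6 - fls_sketch(bits_per_vertex));
         assert(ref == opt);
      }
   }
   return 0;
}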