Example #1
/**
 * Implementation of up or downsampling for window-system MSAA miptrees.
 */
void
brw_meta_updownsample(struct brw_context *brw,
                      struct intel_mipmap_tree *src_mt,
                      struct intel_mipmap_tree *dst_mt)
{
   struct gl_context *ctx = &brw->ctx;
   GLuint fbos[2], src_rbo, dst_rbo, src_fbo, dst_fbo;
   GLenum drawbuffer;
   GLbitfield attachment, blit_bit;

   if (_mesa_get_format_base_format(src_mt->format) == GL_DEPTH_COMPONENT ||
       _mesa_get_format_base_format(src_mt->format) == GL_DEPTH_STENCIL) {
      attachment = GL_DEPTH_ATTACHMENT;
      drawbuffer = GL_NONE;
      blit_bit = GL_DEPTH_BUFFER_BIT;
   } else {
      attachment = GL_COLOR_ATTACHMENT0;
      drawbuffer = GL_COLOR_ATTACHMENT0;
      blit_bit = GL_COLOR_BUFFER_BIT;
   }

   brw_emit_mi_flush(brw);

   _mesa_meta_begin(ctx, MESA_META_ALL);
   _mesa_GenFramebuffers(2, fbos);
   src_rbo = brw_get_rb_for_slice(brw, src_mt, 0, 0, false);
   dst_rbo = brw_get_rb_for_slice(brw, dst_mt, 0, 0, false);
   src_fbo = fbos[0];
   dst_fbo = fbos[1];

   _mesa_BindFramebuffer(GL_READ_FRAMEBUFFER, src_fbo);
   _mesa_FramebufferRenderbuffer(GL_READ_FRAMEBUFFER, attachment,
                                 GL_RENDERBUFFER, src_rbo);
   _mesa_ReadBuffer(drawbuffer);

   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_fbo);
   _mesa_FramebufferRenderbuffer(GL_DRAW_FRAMEBUFFER, attachment,
                                 GL_RENDERBUFFER, dst_rbo);
   _mesa_DrawBuffer(drawbuffer);

   _mesa_BlitFramebuffer(0, 0,
                         src_mt->logical_width0, src_mt->logical_height0,
                         0, 0,
                         dst_mt->logical_width0, dst_mt->logical_height0,
                         blit_bit, GL_NEAREST);

   _mesa_DeleteRenderbuffers(1, &src_rbo);
   _mesa_DeleteRenderbuffers(1, &dst_rbo);
   _mesa_DeleteFramebuffers(2, fbos);

   _mesa_meta_end(ctx);

   brw_emit_mi_flush(brw);
}
Example #2
static void
intel_texture_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);

   brw_emit_mi_flush(brw);
}
Example #3
/**
 * When the GS is not in use, we assign the entire URB space to the VS.  When
 * the GS is in use, we split the URB space evenly between the VS and the GS.
 * This is not ideal, but it's simple.
 *
 *           URB size / 2                   URB size / 2
 *   _____________-______________   _____________-______________
 *  /                            \ /                            \
 * +-------------------------------------------------------------+
 * | Vertex Shader Entries        | Geometry Shader Entries      |
 * +-------------------------------------------------------------+
 *
 * Sandybridge GT1 has 32kB of URB space, while GT2 has 64kB.
 * (See the Sandybridge PRM, Volume 2, Part 1, Section 1.4.7: 3DSTATE_URB.)
 */
void
gen6_upload_urb(struct brw_context *brw, unsigned vs_size,
                bool gs_present, unsigned gs_size)
{
   int nr_vs_entries, nr_gs_entries;
   int total_urb_size = brw->urb.size * 1024; /* in bytes */
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* Calculate how many entries fit in each stage's section of the URB */
   if (gs_present) {
      nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
      nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
   } else {
      nr_vs_entries = total_urb_size / (vs_size * 128);
      nr_gs_entries = 0;
   }

   /* Then clamp to the maximum allowed by the hardware */
   if (nr_vs_entries > devinfo->urb.max_vs_entries)
      nr_vs_entries = devinfo->urb.max_vs_entries;

   if (nr_gs_entries > devinfo->urb.max_gs_entries)
      nr_gs_entries = devinfo->urb.max_gs_entries;

   /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
   brw->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);

   assert(brw->urb.nr_vs_entries >= devinfo->urb.min_vs_entries);
   assert(brw->urb.nr_vs_entries % 4 == 0);
   assert(brw->urb.nr_gs_entries % 4 == 0);
   assert(vs_size <= 5);
   assert(gs_size <= 5);

   BEGIN_BATCH(3);
   OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
   OUT_BATCH(((vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) |
	     ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT));
   OUT_BATCH(((gs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) |
	     ((brw->urb.nr_gs_entries) << GEN6_URB_GS_ENTRIES_SHIFT));
   ADVANCE_BATCH();

   /* From the PRM Volume 2 part 1, section 1.4.7:
    *
    *   Because of a urb corruption caused by allocating a previous gsunit’s
    *   urb entry to vsunit software is required to send a "GS NULL
    *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
    *   a dummy DRAW call before any case where VS will be taking over GS URB
    *   space.
    *
    * It is not clear exactly what this means ("URB fence" is a command that
    * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
    * a workaround.
    */
   if (brw->urb.gs_present && !gs_present)
      brw_emit_mi_flush(brw);
   brw->urb.gs_present = gs_present;
}
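
To make the URB split above concrete, here is a worked example with illustrative numbers (not taken from the source): assume a 32 kB URB (Sandybridge GT1), a GS present, and vs_size = gs_size = 2, i.e. 256-byte entries.

   /* Illustrative calculation only:
    *    nr_vs_entries = (32768 / 2) / (2 * 128) = 64
    *    nr_gs_entries = (32768 / 2) / (2 * 128) = 64
    * Both values are then clamped against devinfo->urb.max_vs_entries and
    * max_gs_entries and rounded down to a multiple of 4 (64 already is one)
    * before being packed into 3DSTATE_URB.
    */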
Example #4
void
brw_meta_resolve_color(struct brw_context *brw,
                       struct intel_mipmap_tree *mt)
{
    struct gl_context *ctx = &brw->ctx;
    GLuint fbo;
    struct gl_renderbuffer *rb;
    struct rect rect;

    brw_emit_mi_flush(brw);

    _mesa_meta_begin(ctx, MESA_META_ALL);

    _mesa_GenFramebuffers(1, &fbo);
    rb = brw_get_rb_for_slice(brw, mt, 0, 0, false);

    _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo);
    _mesa_framebuffer_renderbuffer(ctx, ctx->DrawBuffer, GL_COLOR_ATTACHMENT0,
                                   rb);
    _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0);

    brw_fast_clear_init(brw);

    use_rectlist(brw, true);

    brw_bind_rep_write_shader(brw, (float *) fast_clear_color);

    /* SKL+ also has a resolve mode for compressed render targets and thus more
     * bits to let us select the type of resolve.  For fast clear resolves, it
     * turns out we can use the same value as pre-SKL though.
     */
    if (intel_miptree_is_lossless_compressed(brw, mt))
        set_fast_clear_op(brw, GEN9_PS_RENDER_TARGET_RESOLVE_FULL);
    else
        set_fast_clear_op(brw, GEN7_PS_RENDER_TARGET_RESOLVE_ENABLE);

    mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
    get_resolve_rect(brw, mt, &rect);

    brw_draw_rectlist(brw, &rect, 1);

    set_fast_clear_op(brw, 0);
    use_rectlist(brw, false);

    _mesa_reference_renderbuffer(&rb, NULL);
    _mesa_DeleteFramebuffers(1, &fbo);

    _mesa_meta_end(ctx);

    /* We're typically called from intel_update_state() and we're supposed to
     * return with the state all updated to what it was before
     * brw_meta_resolve_color() was called.  The meta rendering will have
     * messed up the state and we need to call _mesa_update_state() again to
     * get back to where we were supposed to be when resolve was called.
     */
    if (ctx->NewState)
        _mesa_update_state(ctx);
}
Example #5
static void
brw_fence_insert(struct brw_context *brw, struct brw_fence *fence)
{
   assert(!fence->batch_bo);
   assert(!fence->signalled);

   brw_emit_mi_flush(brw);
   fence->batch_bo = brw->batch.bo;
   drm_intel_bo_reference(fence->batch_bo);
   intel_batchbuffer_flush(brw);
}
Example #6
void
brw_end_transform_feedback(struct gl_context *ctx,
                           struct gl_transform_feedback_object *obj)
{
   /* After EndTransformFeedback, it's likely that the client program will try
    * to draw using the contents of the transform feedback buffer as vertex
    * input.  In order for this to work, we need to flush the data through at
    * least the GS stage of the pipeline, and flush out the render cache.  For
    * simplicity, just do a full flush.
    */
   struct brw_context *brw = brw_context(ctx);
   brw_emit_mi_flush(brw);
}
Example #7
void
brw_meta_resolve_color(struct brw_context *brw,
                       struct intel_mipmap_tree *mt)
{
   struct gl_context *ctx = &brw->ctx;
   GLuint fbo, rbo;
   struct rect rect;

   brw_emit_mi_flush(brw);

   _mesa_meta_begin(ctx, MESA_META_ALL);

   _mesa_GenFramebuffers(1, &fbo);
   rbo = brw_get_rb_for_slice(brw, mt, 0, 0, false);

   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo);
   _mesa_FramebufferRenderbuffer(GL_DRAW_FRAMEBUFFER,
                                 GL_COLOR_ATTACHMENT0,
                                 GL_RENDERBUFFER, rbo);
   _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0);

   brw_fast_clear_init(brw);

   use_rectlist(brw, true);

   brw_bind_rep_write_shader(brw, (float *) fast_clear_color);

   set_fast_clear_op(brw, GEN7_PS_RENDER_TARGET_RESOLVE_ENABLE);

   mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
   get_resolve_rect(brw, mt, &rect);

   brw_draw_rectlist(ctx, &rect, 1);

   set_fast_clear_op(brw, 0);
   use_rectlist(brw, false);

   _mesa_DeleteRenderbuffers(1, &rbo);
   _mesa_DeleteFramebuffers(1, &fbo);

   _mesa_meta_end(ctx);

   /* We're typically called from intel_update_state() and we're supposed to
    * return with the state all updated to what it was before
    * brw_meta_resolve_color() was called.  The meta rendering will have
    * messed up the state and we need to call _mesa_update_state() again to
    * get back to where we were supposed to be when resolve was called.
    */
   if (ctx->NewState)
      _mesa_update_state(ctx);
}
Example #8
/**
 * Store the SO_NUM_PRIMS_WRITTEN counters for each stream (4 uint64_t values)
 * to prim_count_bo.
 */
static void
save_prim_start_values(struct brw_context *brw,
                       struct brw_transform_feedback_object *obj)
{
   /* Flush any drawing so that the counters have the right values. */
   brw_emit_mi_flush(brw);

   /* Emit MI_STORE_REGISTER_MEM commands to write the values. */
   for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
      brw_store_register_mem64(brw, obj->prim_count_bo,
                               GEN7_SO_NUM_PRIMS_WRITTEN(i),
                               START_OFFSET + i * sizeof(uint64_t));
   }
}
Example #9
static void
intel_texture_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (devinfo->gen >= 6) {
      brw_emit_pipe_control_flush(brw,
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

      brw_emit_pipe_control_flush(brw,
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
   } else {
      brw_emit_mi_flush(brw);
   }
}
Example #10
static void
intel_get_tex_sub_image(struct gl_context *ctx,
                        GLint xoffset, GLint yoffset, GLint zoffset,
                        GLsizei width, GLsizei height, GLint depth,
                        GLenum format, GLenum type, GLvoid *pixels,
                        struct gl_texture_image *texImage)
{
   struct brw_context *brw = brw_context(ctx);
   bool ok;

   DBG("%s\n", __func__);

   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
      if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage,
                                        xoffset, yoffset, zoffset,
                                        width, height, depth, format, type,
                                        pixels, &ctx->Pack)) {
         /* Flush to guarantee coherency between the render cache and other
          * caches the PBO could potentially be bound to after this point.
          * See the related comment in intelReadPixels() for a more detailed
          * explanation.
          */
         brw_emit_mi_flush(brw);
         return;
      }

      perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
   }

   ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, xoffset, yoffset,
                                          width, height,
                                          format, type, pixels, &ctx->Pack);

   if (ok)
      return;

   _mesa_meta_GetTexSubImage(ctx, xoffset, yoffset, zoffset,
                             width, height, depth,
                             format, type, pixels, texImage);

   DBG("%s - DONE\n", __func__);
}
Example #11
static void
brw_emit_prim(struct brw_context *brw,
              const struct _mesa_prim *prim,
              uint32_t hw_prim)
{
   int verts_per_instance;
   int vertex_access_type;
   int indirect_flag;

   DBG("PRIM: %s %d %d\n", _mesa_enum_to_string(prim->mode),
       prim->start, prim->count);

   int start_vertex_location = prim->start;
   int base_vertex_location = prim->basevertex;

   if (prim->indexed) {
      vertex_access_type = brw->gen >= 7 ?
         GEN7_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM :
         GEN4_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM;
      start_vertex_location += brw->ib.start_vertex_offset;
      base_vertex_location += brw->vb.start_vertex_bias;
   } else {
      vertex_access_type = brw->gen >= 7 ?
         GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL :
         GEN4_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL;
      start_vertex_location += brw->vb.start_vertex_bias;
   }

   /* We only need to trim the primitive count on pre-Gen6. */
   if (brw->gen < 6)
      verts_per_instance = trim(prim->mode, prim->count);
   else
      verts_per_instance = prim->count;

   /* If nothing to emit, just return. */
   if (verts_per_instance == 0 && !prim->is_indirect)
      return;

   /* If we're set to always flush, do it before and after the primitive emit.
    * We want to catch both missed flushes that hurt instruction/state cache
    * and missed flushes of the render cache as it heads to other parts of
    * the GPU besides the draw code.
    */
   if (brw->always_flush_cache)
      brw_emit_mi_flush(brw);

   /* If indirect, emit a bunch of loads from the indirect BO. */
   if (prim->is_indirect) {
      struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer;
      drm_intel_bo *bo = intel_bufferobj_buffer(brw,
            intel_buffer_object(indirect_buffer),
            prim->indirect_offset, 5 * sizeof(GLuint));

      indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE;

      brw_load_register_mem(brw, GEN7_3DPRIM_VERTEX_COUNT, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            prim->indirect_offset + 0);
      brw_load_register_mem(brw, GEN7_3DPRIM_INSTANCE_COUNT, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            prim->indirect_offset + 4);

      brw_load_register_mem(brw, GEN7_3DPRIM_START_VERTEX, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            prim->indirect_offset + 8);
      if (prim->indexed) {
         brw_load_register_mem(brw, GEN7_3DPRIM_BASE_VERTEX, bo,
                               I915_GEM_DOMAIN_VERTEX, 0,
                               prim->indirect_offset + 12);
         brw_load_register_mem(brw, GEN7_3DPRIM_START_INSTANCE, bo,
                               I915_GEM_DOMAIN_VERTEX, 0,
                               prim->indirect_offset + 16);
      } else {
         brw_load_register_mem(brw, GEN7_3DPRIM_START_INSTANCE, bo,
                               I915_GEM_DOMAIN_VERTEX, 0,
                               prim->indirect_offset + 12);
         BEGIN_BATCH(3);
         OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
         OUT_BATCH(GEN7_3DPRIM_BASE_VERTEX);
         OUT_BATCH(0);
         ADVANCE_BATCH();
      }
   } else {
      indirect_flag = 0;
   }

   BEGIN_BATCH(brw->gen >= 7 ? 7 : 6);

   if (brw->gen >= 7) {
      const int predicate_enable =
         (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT)
         ? GEN7_3DPRIM_PREDICATE_ENABLE : 0;

      OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag | predicate_enable);
      OUT_BATCH(hw_prim | vertex_access_type);
   } else {
      OUT_BATCH(CMD_3D_PRIM << 16 | (6 - 2) |
                hw_prim << GEN4_3DPRIM_TOPOLOGY_TYPE_SHIFT |
                vertex_access_type);
   }
   OUT_BATCH(verts_per_instance);
   OUT_BATCH(start_vertex_location);
   OUT_BATCH(prim->num_instances);
   OUT_BATCH(prim->base_instance);
   OUT_BATCH(base_vertex_location);
   ADVANCE_BATCH();

   if (brw->always_flush_cache)
      brw_emit_mi_flush(brw);
}
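
For orientation, the indirect-offset loads above line up with the GL indirect draw command layouts; the structs below are a reference sketch, not part of the driver source:

typedef struct {
   GLuint count;         /* offset 0  -> GEN7_3DPRIM_VERTEX_COUNT   */
   GLuint instanceCount; /* offset 4  -> GEN7_3DPRIM_INSTANCE_COUNT */
   GLuint first;         /* offset 8  -> GEN7_3DPRIM_START_VERTEX   */
   GLuint baseInstance;  /* offset 12 -> GEN7_3DPRIM_START_INSTANCE */
} DrawArraysIndirectCommand;

typedef struct {
   GLuint count;         /* offset 0  -> GEN7_3DPRIM_VERTEX_COUNT   */
   GLuint instanceCount; /* offset 4  -> GEN7_3DPRIM_INSTANCE_COUNT */
   GLuint firstIndex;    /* offset 8  -> GEN7_3DPRIM_START_VERTEX   */
   GLint  baseVertex;    /* offset 12 -> GEN7_3DPRIM_BASE_VERTEX    */
   GLuint baseInstance;  /* offset 16 -> GEN7_3DPRIM_START_INSTANCE */
} DrawElementsIndirectCommand;

The non-indexed layout has no baseVertex field, which is why the code zeroes GEN7_3DPRIM_BASE_VERTEX with MI_LOAD_REGISTER_IMM instead of loading it from the buffer.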
Example #12
void
brw_blorp_exec(struct brw_context *brw, const brw_blorp_params *params)
{
   struct gl_context *ctx = &brw->ctx;
   uint32_t estimated_max_batch_usage = 1500;
   bool check_aperture_failed_once = false;

   /* Flush the sampler and render caches.  We definitely need to flush the
    * sampler cache so that we get updated contents from the render cache for
    * the glBlitFramebuffer() source.  Also, we are sometimes warned in the
    * docs to flush the cache between reinterpretations of the same surface
    * data with different formats, which blorp does for stencil and depth
    * data.
    */
   brw_emit_mi_flush(brw);

retry:
   intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
   intel_batchbuffer_save_state(brw);
   drm_intel_bo *saved_bo = brw->batch.bo;
   uint32_t saved_used = USED_BATCH(brw->batch);
   uint32_t saved_state_batch_offset = brw->batch.state_batch_offset;

   switch (brw->gen) {
   case 6:
      gen6_blorp_exec(brw, params);
      break;
   case 7:
      gen7_blorp_exec(brw, params);
      break;
   default:
      /* BLORP is not supported before Gen6. */
      unreachable("not reached");
   }

   /* Make sure we didn't wrap the batch unintentionally, and make sure we
    * reserved enough space that a wrap will never happen.
    */
   assert(brw->batch.bo == saved_bo);
   assert((USED_BATCH(brw->batch) - saved_used) * 4 +
          (saved_state_batch_offset - brw->batch.state_batch_offset) <
          estimated_max_batch_usage);
   /* Shut up compiler warnings on release build */
   (void)saved_bo;
   (void)saved_used;
   (void)saved_state_batch_offset;

   /* Check if the blorp op we just did would make our batch likely to fail to
    * map all the BOs into the GPU at batch exec time later.  If so, flush the
    * batch and try again with nothing else in the batch.
    */
   if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
      if (!check_aperture_failed_once) {
         check_aperture_failed_once = true;
         intel_batchbuffer_reset_to_saved(brw);
         intel_batchbuffer_flush(brw);
         goto retry;
      } else {
         int ret = intel_batchbuffer_flush(brw);
         WARN_ONCE(ret == -ENOSPC,
                   "i965: blorp emit exceeded available aperture space\n");
      }
   }

   if (unlikely(brw->always_flush_batch))
      intel_batchbuffer_flush(brw);

   /* We've smashed all state compared to what the normal 3D pipeline
    * rendering tracks for GL.
    */
   brw->ctx.NewDriverState = ~0ull;
   brw->no_depth_or_stencil = false;
   brw->ib.type = -1;

   /* Flush the sampler cache so any texturing from the destination is
    * coherent.
    */
   brw_emit_mi_flush(brw);
}
Example #13
bool
brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
                    GLbitfield buffers, bool partial_clear)
{
   struct gl_context *ctx = &brw->ctx;
   mesa_format format;
   enum { FAST_CLEAR, REP_CLEAR, PLAIN_CLEAR } clear_type;
   GLbitfield plain_clear_buffers, meta_save, rep_clear_buffers, fast_clear_buffers;
   struct rect fast_clear_rect, clear_rect;
   int layers;

   fast_clear_buffers = rep_clear_buffers = plain_clear_buffers = 0;

   /* First we loop through the color draw buffers and determine which ones
    * can be fast cleared, which ones can use the replicated write and which
    * ones have to fall back to regular color clear.
    */
   for (unsigned buf = 0; buf < fb->_NumColorDrawBuffers; buf++) {
      struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[buf];
      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
      int index = fb->_ColorDrawBufferIndexes[buf];

      /* Only clear the buffers present in the provided mask */
      if (((1 << index) & buffers) == 0)
         continue;

      /* If this is an ES2 context or GL_ARB_ES2_compatibility is supported,
       * the framebuffer can be complete with some attachments missing.  In
       * this case the _ColorDrawBuffers pointer will be NULL.
       */
      if (rb == NULL)
         continue;

      clear_type = FAST_CLEAR;

      /* We don't have fast clear until gen7. */
      if (brw->gen < 7)
         clear_type = REP_CLEAR;

      if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_NO_MCS)
         clear_type = REP_CLEAR;

      /* We can't do scissored fast clears because of the restrictions on the
       * fast clear rectangle size.
       */
      if (partial_clear)
         clear_type = REP_CLEAR;

      /* Fast clear is only supported for colors where all components are
       * either 0 or 1.
       */
      format = _mesa_get_render_format(ctx, irb->mt->format);
      if (!is_color_fast_clear_compatible(brw, format, &ctx->Color.ClearColor))
         clear_type = REP_CLEAR;

      /* From the SNB PRM (Vol4_Part1):
       *
       *     "Replicated data (Message Type = 111) is only supported when
       *      accessing tiled memory.  Using this Message Type to access
       *      linear (untiled) memory is UNDEFINED."
       */
      if (irb->mt->tiling == I915_TILING_NONE) {
         perf_debug("Falling back to plain clear because %dx%d buffer is untiled\n",
                    irb->mt->logical_width0, irb->mt->logical_height0);
         clear_type = PLAIN_CLEAR;
      }

      /* Constant color writes ignore everything in blend and color calculator
       * state.  This is not documented.
       */
      GLubyte *color_mask = ctx->Color.ColorMask[buf];
      for (int i = 0; i < 4; i++) {
         if (_mesa_format_has_color_component(irb->mt->format, i) &&
             !color_mask[i]) {
            perf_debug("Falling back to plain clear on %dx%d buffer because of color mask\n",
                       irb->mt->logical_width0, irb->mt->logical_height0);
            clear_type = PLAIN_CLEAR;
         }
      }

      /* Allocate the MCS for non MSRT surfaces now if we're doing a fast
       * clear and we don't have the MCS yet.  On failure, fall back to
       * replicated clear.
       */
      if (clear_type == FAST_CLEAR && irb->mt->mcs_mt == NULL)
         if (!intel_miptree_alloc_non_msrt_mcs(brw, irb->mt))
            clear_type = REP_CLEAR;

      switch (clear_type) {
      case FAST_CLEAR:
         irb->mt->fast_clear_color_value =
            compute_fast_clear_color_bits(&ctx->Color.ClearColor);
         irb->need_downsample = true;

         /* If the buffer is already in INTEL_FAST_CLEAR_STATE_CLEAR, the
          * clear is redundant and can be skipped.  Only skip after we've
          * updated the fast clear color above though.
          */
         if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_CLEAR)
            continue;

         /* Set fast_clear_state to RESOLVED so we don't try resolve them when
          * we draw, in case the mt is also bound as a texture.
          */
         irb->mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
         irb->need_downsample = true;
         fast_clear_buffers |= 1 << index;
         get_fast_clear_rect(brw, fb, irb, &fast_clear_rect);
         break;

      case REP_CLEAR:
         rep_clear_buffers |= 1 << index;
         get_buffer_rect(brw, fb, irb, &clear_rect);
         break;

      case PLAIN_CLEAR:
         plain_clear_buffers |= 1 << index;
         get_buffer_rect(brw, fb, irb, &clear_rect);
         continue;
      }
   }

   if (!(fast_clear_buffers | rep_clear_buffers)) {
      if (plain_clear_buffers)
         /* If we only have plain clears, skip the meta save/restore. */
         goto out;
      else
         /* Nothing left to do.  This happens when we hit the redundant fast
          * clear case above and nothing else.
          */
         return true;
   }

   meta_save =
      MESA_META_ALPHA_TEST |
      MESA_META_BLEND |
      MESA_META_DEPTH_TEST |
      MESA_META_RASTERIZATION |
      MESA_META_SHADER |
      MESA_META_STENCIL_TEST |
      MESA_META_VERTEX |
      MESA_META_VIEWPORT |
      MESA_META_CLIP |
      MESA_META_CLAMP_FRAGMENT_COLOR |
      MESA_META_MULTISAMPLE |
      MESA_META_OCCLUSION_QUERY |
      MESA_META_DRAW_BUFFERS;

   _mesa_meta_begin(ctx, meta_save);

   if (!brw_fast_clear_init(brw)) {
      /* This is going to be hard to recover from, most likely out of memory.
       * Bail and let meta try and (probably) fail for us.
       */
      plain_clear_buffers = buffers;
      goto bail_to_meta;
   }

   /* Clears never have the color clamped. */
   if (ctx->Extensions.ARB_color_buffer_float)
      _mesa_ClampColor(GL_CLAMP_FRAGMENT_COLOR, GL_FALSE);

   _mesa_set_enable(ctx, GL_DEPTH_TEST, GL_FALSE);
   _mesa_DepthMask(GL_FALSE);
   _mesa_set_enable(ctx, GL_STENCIL_TEST, GL_FALSE);

   use_rectlist(brw, true);

   layers = MAX2(1, fb->MaxNumLayers);
   if (fast_clear_buffers) {
      _mesa_meta_drawbuffers_from_bitfield(fast_clear_buffers);
      brw_bind_rep_write_shader(brw, (float *) fast_clear_color);
      set_fast_clear_op(brw, GEN7_PS_RENDER_TARGET_FAST_CLEAR_ENABLE);
      brw_draw_rectlist(ctx, &fast_clear_rect, layers);
      set_fast_clear_op(brw, 0);
   }

   if (rep_clear_buffers) {
      _mesa_meta_drawbuffers_from_bitfield(rep_clear_buffers);
      brw_bind_rep_write_shader(brw, ctx->Color.ClearColor.f);
      brw_draw_rectlist(ctx, &clear_rect, layers);
   }

   /* Now set the mts we cleared to INTEL_FAST_CLEAR_STATE_CLEAR so we'll
    * resolve them eventually.
    */
   for (unsigned buf = 0; buf < fb->_NumColorDrawBuffers; buf++) {
      struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[buf];
      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
      int index = fb->_ColorDrawBufferIndexes[buf];

      if ((1 << index) & fast_clear_buffers)
         irb->mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_CLEAR;
   }

 bail_to_meta:
   /* Dirty _NEW_BUFFERS so we reemit SURFACE_STATE which sets the fast clear
    * color before resolve and sets irb->mt->fast_clear_state to UNRESOLVED if
    * we render to it.
    */
   brw->NewGLState |= _NEW_BUFFERS;


   /* Set the custom state back to normal and dirty the same bits as above */
   use_rectlist(brw, false);

   _mesa_meta_end(ctx);

   /* From BSpec: Render Target Fast Clear:
    *
    *     After Render target fast clear, pipe-control with color cache
    *     write-flush must be issued before sending any DRAW commands on that
    *     render target.
    */
   brw_emit_mi_flush(brw);

   /* If we had to fall back to plain clear for any buffers, clear those now
    * by calling into meta.
    */
 out:
   if (plain_clear_buffers)
      _mesa_meta_glsl_Clear(&brw->ctx, plain_clear_buffers);

   return true;
}
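
The eligibility checks above call is_color_fast_clear_compatible(), which is not shown on this page. A minimal sketch of the test the comment describes ("all components either 0 or 1") might look like the following; it is illustrative only, not the driver's actual helper:

static bool
is_color_fast_clear_compatible(struct brw_context *brw,
                               mesa_format format,
                               const union gl_color_union *color)
{
   /* Fast clear encodes each channel as a single bit, so every component the
    * format actually has must be exactly 0.0 or 1.0.
    */
   for (int i = 0; i < 4; i++) {
      if (!_mesa_format_has_color_component(format, i))
         continue;
      if (color->f[i] != 0.0f && color->f[i] != 1.0f)
         return false;
   }
   return true;
}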
Example #14
/**
 * When the GS is not in use, we assign the entire URB space to the VS.  When
 * the GS is in use, we split the URB space evenly between the VS and the GS.
 * This is not ideal, but it's simple.
 *
 *           URB size / 2                   URB size / 2
 *   _____________-______________   _____________-______________
 *  /                            \ /                            \
 * +-------------------------------------------------------------+
 * | Vertex Shader Entries        | Geometry Shader Entries      |
 * +-------------------------------------------------------------+
 *
 * Sandybridge GT1 has 32kB of URB space, while GT2 has 64kB.
 * (See the Sandybridge PRM, Volume 2, Part 1, Section 1.4.7: 3DSTATE_URB.)
 */
static void
gen6_upload_urb( struct brw_context *brw )
{
    int nr_vs_entries, nr_gs_entries;
    int total_urb_size = brw->urb.size * 1024; /* in bytes */

    bool gs_present = brw->ff_gs.prog_active || brw->geometry_program;

    /* BRW_NEW_VS_PROG_DATA */
    unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1);

    /* When using the GS to do transform feedback only, we use the same VUE layout for
     * VS outputs and GS outputs (as it's what the SF and Clipper expect), so we
     * can simply make the GS URB entry size the same as for the VS.  This may
     * technically be too large in cases where we have few vertex attributes and
     * a lot of varyings, since the VS size is determined by the larger of the
     * two. For now, it's safe.
     *
     * For user-provided GS the assumption above does not hold since the GS
     * outputs can be different from the VS outputs.
     */
    unsigned gs_size = vs_size;
    if (brw->geometry_program) {
        gs_size = brw->gs.prog_data->base.urb_entry_size;
        assert(gs_size >= 1);
    }

    /* Calculate how many entries fit in each stage's section of the URB */
    if (gs_present) {
        nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
        nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
    } else {
        nr_vs_entries = total_urb_size / (vs_size * 128);
        nr_gs_entries = 0;
    }

    /* Then clamp to the maximum allowed by the hardware */
    if (nr_vs_entries > brw->urb.max_vs_entries)
        nr_vs_entries = brw->urb.max_vs_entries;

    if (nr_gs_entries > brw->urb.max_gs_entries)
        nr_gs_entries = brw->urb.max_gs_entries;

    /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
    brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
    brw->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);

    assert(brw->urb.nr_vs_entries >= brw->urb.min_vs_entries);
    assert(brw->urb.nr_vs_entries % 4 == 0);
    assert(brw->urb.nr_gs_entries % 4 == 0);
    assert(vs_size <= 5);
    assert(gs_size <= 5);

    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
    OUT_BATCH(((vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) |
              ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT));
    OUT_BATCH(((gs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) |
              ((brw->urb.nr_gs_entries) << GEN6_URB_GS_ENTRIES_SHIFT));
    ADVANCE_BATCH();

    /* From the PRM Volume 2 part 1, section 1.4.7:
     *
     *   Because of a urb corruption caused by allocating a previous gsunit’s
     *   urb entry to vsunit software is required to send a "GS NULL
     *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
     *   a dummy DRAW call before any case where VS will be taking over GS URB
     *   space.
     *
     * It is not clear exactly what this means ("URB fence" is a command that
     * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
     * a workaround.
     */
    if (brw->urb.gs_present && !gs_present)
        brw_emit_mi_flush(brw);
    brw->urb.gs_present = gs_present;
}
Example #15
bool
intelEmitImmediateColorExpandBlit(struct brw_context *brw,
				  GLuint cpp,
				  GLubyte *src_bits, GLuint src_size,
				  GLuint fg_color,
				  GLshort dst_pitch,
				  struct brw_bo *dst_buffer,
				  GLuint dst_offset,
				  enum isl_tiling dst_tiling,
				  GLshort x, GLshort y,
				  GLshort w, GLshort h,
				  enum gl_logicop_mode logic_op)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   int dwords = ALIGN(src_size, 8) / 4;
   uint32_t opcode, br13, blit_cmd;

   if (dst_tiling != ISL_TILING_LINEAR) {
      if (dst_offset & 4095)
	 return false;
      if (dst_tiling == ISL_TILING_Y0)
	 return false;
   }

   assert((unsigned) logic_op <= 0x0f);
   assert(dst_pitch > 0);

   if (w < 0 || h < 0)
      return true;

   DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d, %d bytes %d dwords\n",
       __func__,
       dst_buffer, dst_pitch, dst_offset, x, y, w, h, src_size, dwords);

   unsigned xy_setup_blt_length = devinfo->gen >= 8 ? 10 : 8;
   intel_batchbuffer_require_space(brw, (xy_setup_blt_length * 4) +
                                        (3 * 4) + dwords * 4);

   opcode = XY_SETUP_BLT_CMD;
   if (cpp == 4)
      opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
   if (dst_tiling != ISL_TILING_LINEAR) {
      opcode |= XY_DST_TILED;
      dst_pitch /= 4;
   }

   br13 = dst_pitch | (translate_raster_op(logic_op) << 16) | (1 << 29);
   br13 |= br13_for_cpp(cpp);

   blit_cmd = XY_TEXT_IMMEDIATE_BLIT_CMD | XY_TEXT_BYTE_PACKED; /* packing? */
   if (dst_tiling != ISL_TILING_LINEAR)
      blit_cmd |= XY_DST_TILED;

   BEGIN_BATCH_BLT(xy_setup_blt_length + 3);
   OUT_BATCH(opcode | (xy_setup_blt_length - 2));
   OUT_BATCH(br13);
   OUT_BATCH((0 << 16) | 0); /* clip x1, y1 */
   OUT_BATCH((100 << 16) | 100); /* clip x2, y2 */
   if (devinfo->gen >= 8) {
      OUT_RELOC64(dst_buffer, RELOC_WRITE, dst_offset);
   } else {
      OUT_RELOC(dst_buffer, RELOC_WRITE, dst_offset);
   }
   OUT_BATCH(0); /* bg */
   OUT_BATCH(fg_color); /* fg */
   OUT_BATCH(0); /* pattern base addr */
   if (devinfo->gen >= 8)
      OUT_BATCH(0);

   OUT_BATCH(blit_cmd | ((3 - 2) + dwords));
   OUT_BATCH(SET_FIELD(y, BLT_Y) | SET_FIELD(x, BLT_X));
   OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X));
   ADVANCE_BATCH();

   intel_batchbuffer_data(brw, src_bits, dwords * 4);

   brw_emit_mi_flush(brw);

   return true;
}
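
A quick sizing example for the immediate blit above, with an illustrative src_size:

   /* src_size = 30 bytes of bitmap data:
    *    dwords = ALIGN(30, 8) / 4 = 8
    * so intel_batchbuffer_require_space() reserves
    *    xy_setup_blt_length * 4 + 3 * 4 + 8 * 4 bytes,
    * and the XY_TEXT_IMMEDIATE_BLIT length field is (3 - 2) + 8.
    */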
Example #16
static bool MUST_CHECK
brw_fence_insert_locked(struct brw_context *brw, struct brw_fence *fence)
{
   __DRIcontext *driContext = brw->driContext;
   __DRIdrawable *driDrawable = driContext->driDrawablePriv;

   /*
    * From KHR_fence_sync:
    *
    *   When the condition of the sync object is satisfied by the fence
    *   command, the sync is signaled by the associated client API context,
    *   causing any eglClientWaitSyncKHR commands (see below) blocking on
    *   <sync> to unblock. The only condition currently supported is
    *   EGL_SYNC_PRIOR_COMMANDS_COMPLETE_KHR, which is satisfied by
    *   completion of the fence command corresponding to the sync object,
    *   and all preceding commands in the associated client API context's
    *   command stream. The sync object will not be signaled until all
    *   effects from these commands on the client API's internal and
    *   framebuffer state are fully realized. No other state is affected by
    *   execution of the fence command.
    *
    * Note the emphasis there on ensuring that the framebuffer is fully
    * realised before the fence is signaled. We cannot just flush the batch,
    * but must also resolve the drawable first. The importance of this is,
    * for example, in creating a fence for a frame to be passed to a
    * remote compositor. Without us flushing the drawable explicitly, the
    * resolve will be in a following batch (when the client finally calls
    * SwapBuffers, or triggers a resolve via some other path) and so the
    * compositor may read the incomplete framebuffer instead.
    */
   if (driDrawable)
      intel_resolve_for_dri2_flush(brw, driDrawable);
   brw_emit_mi_flush(brw);

   switch (fence->type) {
   case BRW_FENCE_TYPE_BO_WAIT:
      assert(!fence->batch_bo);
      assert(!fence->signalled);

      fence->batch_bo = brw->batch.bo;
      brw_bo_reference(fence->batch_bo);

      if (intel_batchbuffer_flush(brw) < 0) {
         brw_bo_unreference(fence->batch_bo);
         fence->batch_bo = NULL;
         return false;
      }
      break;
   case BRW_FENCE_TYPE_SYNC_FD:
      assert(!fence->signalled);

      if (fence->sync_fd == -1) {
         /* Create an out-fence that signals after all pending commands
          * complete.
          */
         if (intel_batchbuffer_flush_fence(brw, -1, &fence->sync_fd) < 0)
            return false;
         assert(fence->sync_fd != -1);
      } else {
         /* Wait on the in-fence before executing any subsequently submitted
          * commands.
          */
         if (intel_batchbuffer_flush(brw) < 0)
            return false;

         /* Emit a dummy batch just for the fence. */
         brw_emit_mi_flush(brw);
         if (intel_batchbuffer_flush_fence(brw, fence->sync_fd, NULL) < 0)
            return false;
      }
      break;
   }

   return true;
}
Example #17
/**
 * Implements fast depth clears on gen6+.
 *
 * Fast clears basically work by setting a flag in each of the subspans
 * represented in the HiZ buffer that says "When you need the depth values for
 * this subspan, it's the hardware's current clear value."  Then later rendering
 * can just use the static clear value instead of referencing memory.
 *
 * The tricky part of the implementation is that you have to have the clear
 * value that was used on the depth buffer in place for all further rendering,
 * at least until a resolve to the real depth buffer happens.
 */
static bool
brw_fast_clear_depth(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   struct gl_framebuffer *fb = ctx->DrawBuffer;
   struct intel_renderbuffer *depth_irb =
      intel_get_renderbuffer(fb, BUFFER_DEPTH);
   struct intel_mipmap_tree *mt = depth_irb->mt;
   struct gl_renderbuffer_attachment *depth_att = &fb->Attachment[BUFFER_DEPTH];

   if (brw->gen < 6)
      return false;

   if (!intel_renderbuffer_has_hiz(depth_irb))
      return false;

   /* We only handle full buffer clears -- otherwise you'd have to track whether
    * a previous clear had happened at a different clear value and resolve it
    * first.
    */
   if ((ctx->Scissor.EnableFlags & 1) && !noop_scissor(ctx, fb)) {
      perf_debug("Failed to fast clear %dx%d depth because of scissors.  "
                 "Possible 5%% performance win if avoided.\n",
                 mt->logical_width0, mt->logical_height0);
      return false;
   }

   uint32_t depth_clear_value;
   switch (mt->format) {
   case MESA_FORMAT_Z32_FLOAT_S8X24_UINT:
   case MESA_FORMAT_Z24_UNORM_S8_UINT:
      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
       *
       *     "[DevSNB+]: Several cases exist where Depth Buffer Clear cannot be
       *      enabled (the legacy method of clearing must be performed):
       *
       *      - If the depth buffer format is D32_FLOAT_S8X24_UINT or
       *        D24_UNORM_S8_UINT.
       */
      return false;

   case MESA_FORMAT_Z_FLOAT32:
      depth_clear_value = float_as_int(ctx->Depth.Clear);
      break;

   case MESA_FORMAT_Z_UNORM16:
      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
       *
       *     "[DevSNB+]: Several cases exist where Depth Buffer Clear cannot be
       *      enabled (the legacy method of clearing must be performed):
       *
       *      - DevSNB{W/A}]: When depth buffer format is D16_UNORM and the
       *        width of the map (LOD0) is not multiple of 16, fast clear
       *        optimization must be disabled.
       */
      if (brw->gen == 6 &&
          (minify(mt->physical_width0,
                  depth_irb->mt_level - mt->first_level) % 16) != 0)
	 return false;
      /* FALLTHROUGH */

   default:
      if (brw->gen >= 8)
         depth_clear_value = float_as_int(ctx->Depth.Clear);
      else
         depth_clear_value = fb->_DepthMax * ctx->Depth.Clear;
      break;
   }

   /* If we're clearing to a new clear value, then we need to resolve any clear
    * flags out of the HiZ buffer into the real depth buffer.
    */
   if (mt->depth_clear_value != depth_clear_value) {
      intel_miptree_all_slices_resolve_depth(brw, mt);
      mt->depth_clear_value = depth_clear_value;
   }

   /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
    *
    *     "If other rendering operations have preceded this clear, a
    *      PIPE_CONTROL with write cache flush enabled and Z-inhibit disabled
    *      must be issued before the rectangle primitive used for the depth
    *      buffer clear operation.
    */
   brw_emit_mi_flush(brw);

   if (fb->MaxNumLayers > 0) {
      for (unsigned layer = 0; layer < depth_irb->layer_count; layer++) {
         intel_hiz_exec(brw, mt, depth_irb->mt_level,
                        depth_irb->mt_layer + layer,
                        GEN6_HIZ_OP_DEPTH_CLEAR);
      }
   } else {
      intel_hiz_exec(brw, mt, depth_irb->mt_level, depth_irb->mt_layer,
                     GEN6_HIZ_OP_DEPTH_CLEAR);
   }

   if (brw->gen == 6) {
      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
       *
       *     "DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be followed
       *      by a PIPE_CONTROL command with DEPTH_STALL bit set and Then
       *      followed by Depth FLUSH'
      */
      brw_emit_mi_flush(brw);
   }

   /* Now, the HiZ buffer contains data that needs to be resolved to the depth
    * buffer.
    */
   intel_renderbuffer_att_set_needs_depth_resolve(depth_att);

   return true;
}
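
float_as_int() above is a plain bit-cast helper defined elsewhere in Mesa; a plausible sketch of it (assumed here, not the exact source definition) is:

static inline uint32_t
float_as_int(float f)
{
   union { float f; uint32_t u; } fi;

   fi.f = f;
   return fi.u;   /* reinterpret the float's bits, no numeric conversion */
}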
Example #18
void
intelReadPixels(struct gl_context * ctx,
                GLint x, GLint y, GLsizei width, GLsizei height,
                GLenum format, GLenum type,
                const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
{
   bool ok;

   struct brw_context *brw = brw_context(ctx);
   bool dirty;

   DBG("%s\n", __func__);

   if (_mesa_is_bufferobj(pack->BufferObj)) {
      if (_mesa_meta_pbo_GetTexSubImage(ctx, 2, NULL, x, y, 0, width, height, 1,
                                        format, type, pixels, pack)) {
         /* _mesa_meta_pbo_GetTexSubImage() implements PBO transfers by
          * binding the user-provided BO as a fake framebuffer and rendering
          * to it.  This breaks the invariant of the GL that nothing is able
          * to render to a BO, causing nondeterministic corruption issues
          * because the render cache is not coherent with a number of other
          * caches that the BO could potentially be bound to afterwards.
          *
          * This could be solved in the same way that we guarantee texture
          * coherency after a texture is attached to a framebuffer and
          * rendered to, but that would involve checking *all* BOs bound to
          * the pipeline for the case we need to emit a cache flush due to
          * previous rendering to any of them -- Including vertex, index,
          * uniform, atomic counter, shader image, transform feedback,
          * indirect draw buffers, etc.
          *
          * That would increase the per-draw call overhead even though it's
          * very unlikely that any of the BOs bound to the pipeline has been
          * rendered to via a PBO at any point, so it seems better to just
          * flush here unconditionally.
          */
         brw_emit_mi_flush(brw);
         return;
      }

      perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
   }

   ok = intel_readpixels_tiled_memcpy(ctx, x, y, width, height,
                                      format, type, pixels, pack);
   if (ok)
      return;

   /* glReadPixels() won't dirty the front buffer, so reset the dirty
    * flag after calling intel_prepare_render(). */
   dirty = brw->front_buffer_dirty;
   intel_prepare_render(brw);
   brw->front_buffer_dirty = dirty;

   /* Update Mesa state before calling _mesa_readpixels().
    * XXX this may not be needed since ReadPixels no longer uses the
    * span code.
    */

   if (ctx->NewState)
      _mesa_update_state(ctx);

   _mesa_readpixels(ctx, x, y, width, height, format, type, pack, pixels);

   /* There's an intel_prepare_render() call in intelSpanRenderStart(). */
   brw->front_buffer_dirty = dirty;
}
Example #19
/**
 * Used to initialize the alpha value of an ARGB8888 miptree after copying
 * into it from an XRGB8888 source.
 *
 * This is very common with glCopyTexImage2D().  Note that the coordinates are
 * relative to the start of the miptree, not relative to a slice within the
 * miptree.
 */
static void
intel_miptree_set_alpha_to_one(struct brw_context *brw,
                              struct intel_mipmap_tree *mt,
                              int x, int y, int width, int height)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint32_t BR13, CMD;
   int pitch, cpp;

   pitch = mt->surf.row_pitch_B;
   cpp = mt->cpp;

   DBG("%s dst:buf(%p)/%d %d,%d sz:%dx%d\n",
       __func__, mt->bo, pitch, x, y, width, height);

   /* Note: Currently only handles an 8 bit alpha channel. Extension to a < 8 bit
    * alpha channel would likely be possible via ROP code 0xfa instead of 0xf0
    * and writing a suitable bit-mask instead of 0xffffffff.
    */
   BR13 = br13_for_cpp(cpp) | 0xf0 << 16;
   CMD = XY_COLOR_BLT_CMD;
   CMD |= XY_BLT_WRITE_ALPHA;

   if (mt->surf.tiling != ISL_TILING_LINEAR) {
      CMD |= XY_DST_TILED;
      pitch /= 4;
   }
   BR13 |= pitch;

   /* do space check before going any further */
   if (!brw_batch_has_aperture_space(brw, mt->bo->size))
      intel_batchbuffer_flush(brw);

   unsigned length = devinfo->gen >= 8 ? 7 : 6;
   const bool dst_y_tiled = mt->surf.tiling == ISL_TILING_Y0;

   /* We need to split the blit into chunks that each fit within the blitter's
    * restrictions.  We can't use a chunk size of 32768 because we need to
    * ensure that src_tile_x + chunk_size fits.  We choose 16384 because it's
    * a nice round power of two, big enough that performance won't suffer, and
    * small enough to guarantee everything fits.
    */
   const uint32_t max_chunk_size = 16384;

   for (uint32_t chunk_x = 0; chunk_x < width; chunk_x += max_chunk_size) {
      for (uint32_t chunk_y = 0; chunk_y < height; chunk_y += max_chunk_size) {
         const uint32_t chunk_w = MIN2(max_chunk_size, width - chunk_x);
         const uint32_t chunk_h = MIN2(max_chunk_size, height - chunk_y);

         uint32_t offset, tile_x, tile_y;
         get_blit_intratile_offset_el(brw, mt,
                                      x + chunk_x, y + chunk_y,
                                      &offset, &tile_x, &tile_y);

         BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, false);
         OUT_BATCH(CMD | (length - 2));
         OUT_BATCH(BR13);
         OUT_BATCH(SET_FIELD(y + chunk_y, BLT_Y) |
                   SET_FIELD(x + chunk_x, BLT_X));
         OUT_BATCH(SET_FIELD(y + chunk_y + chunk_h, BLT_Y) |
                   SET_FIELD(x + chunk_x + chunk_w, BLT_X));
         if (devinfo->gen >= 8) {
            OUT_RELOC64(mt->bo, RELOC_WRITE, mt->offset + offset);
         } else {
            OUT_RELOC(mt->bo, RELOC_WRITE, mt->offset + offset);
         }
         OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
         ADVANCE_BATCH_TILED(dst_y_tiled, false);
      }
   }

   brw_emit_mi_flush(brw);
}
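
A worked example of the chunking loop above, with illustrative dimensions:

   /* Clearing a 40000 x 300 region: the outer loop runs three times with
    * chunk widths 16384, 16384 and 7232; the inner loop runs once per column
    * with a chunk height of 300, so every individual blit stays within the
    * blitter's coordinate limits.
    */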
Example #20
/**
 * Compute the number of primitives written during our most recent
 * transform feedback activity (the current SO_NUM_PRIMS_WRITTEN value
 * minus the stashed "start" value), and add it to our running tally.
 *
 * If \p finalize is true, also compute the number of vertices written
 * (by multiplying by the number of vertices per primitive), and store
 * that to the "final" location.
 *
 * Otherwise, just overwrite the old tally with the new one.
 */
static void
tally_prims_written(struct brw_context *brw,
                    struct brw_transform_feedback_object *obj,
                    bool finalize)
{
   /* Flush any drawing so that the counters have the right values. */
   brw_emit_mi_flush(brw);

   for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
      /* GPR0 = Tally */
      brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0);
      brw_load_register_mem(brw, HSW_CS_GPR(0), obj->prim_count_bo,
                            I915_GEM_DOMAIN_INSTRUCTION,
                            I915_GEM_DOMAIN_INSTRUCTION,
                            TALLY_OFFSET + i * sizeof(uint32_t));
      if (!obj->base.Paused) {
         /* GPR1 = Start Snapshot */
         brw_load_register_mem64(brw, HSW_CS_GPR(1), obj->prim_count_bo,
                                 I915_GEM_DOMAIN_INSTRUCTION,
                                 I915_GEM_DOMAIN_INSTRUCTION,
                                 START_OFFSET + i * sizeof(uint64_t));
         /* GPR2 = Ending Snapshot */
         brw_load_register_reg64(brw, GEN7_SO_NUM_PRIMS_WRITTEN(i), HSW_CS_GPR(2));

         BEGIN_BATCH(9);
         OUT_BATCH(HSW_MI_MATH | (9 - 2));
         /* GPR1 = GPR2 (End) - GPR1 (Start) */
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2));
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
         OUT_BATCH(MI_MATH_ALU0(SUB));
         OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
         /* GPR0 = GPR0 (Tally) + GPR1 (Diff) */
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
         OUT_BATCH(MI_MATH_ALU0(ADD));
         OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
         ADVANCE_BATCH();
      }

      if (!finalize) {
         /* Write back the new tally */
         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
                                  TALLY_OFFSET + i * sizeof(uint32_t));
      } else {
         /* Convert the number of primitives to the number of vertices. */
         if (obj->primitive_mode == GL_LINES) {
            /* Double R0 (R0 = R0 + R0) */
            BEGIN_BATCH(5);
            OUT_BATCH(HSW_MI_MATH | (5 - 2));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
            OUT_BATCH(MI_MATH_ALU0(ADD));
            OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
            ADVANCE_BATCH();
         } else if (obj->primitive_mode == GL_TRIANGLES) {
            /* Triple R0 (R1 = R0 + R0, R0 = R0 + R1) */
            BEGIN_BATCH(9);
            OUT_BATCH(HSW_MI_MATH | (9 - 2));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
            OUT_BATCH(MI_MATH_ALU0(ADD));
            OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
            OUT_BATCH(MI_MATH_ALU0(ADD));
            OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
            ADVANCE_BATCH();
         }
         /* Store it to the final result */
         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
                                  i * sizeof(uint32_t));
      }
   }
}
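
The net effect of the MI_MATH sequences above can be summarized per stream i as follows (pseudo-math, not driver code):

   /* if not Paused:   tally += SO_NUM_PRIMS_WRITTEN(i) - start_snapshot[i]
    * if finalize:     result = tally * verts_per_prim
    *
    * where verts_per_prim is 1 for GL_POINTS (no extra MI_MATH), 2 for
    * GL_LINES (the doubling block) and 3 for GL_TRIANGLES (the tripling
    * block).
    */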
Example #21
/* Copy BitBlt
 */
static bool
emit_copy_blit(struct brw_context *brw,
               GLuint cpp,
               int32_t src_pitch,
               struct brw_bo *src_buffer,
               GLuint src_offset,
               enum isl_tiling src_tiling,
               int32_t dst_pitch,
               struct brw_bo *dst_buffer,
               GLuint dst_offset,
               enum isl_tiling dst_tiling,
               GLshort src_x, GLshort src_y,
               GLshort dst_x, GLshort dst_y,
               GLshort w, GLshort h,
               enum gl_logicop_mode logic_op)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   GLuint CMD, BR13;
   int dst_y2 = dst_y + h;
   int dst_x2 = dst_x + w;
   bool dst_y_tiled = dst_tiling == ISL_TILING_Y0;
   bool src_y_tiled = src_tiling == ISL_TILING_Y0;
   uint32_t src_tile_w, src_tile_h;
   uint32_t dst_tile_w, dst_tile_h;

   if ((dst_y_tiled || src_y_tiled) && devinfo->gen < 6)
      return false;

   const unsigned bo_sizes = dst_buffer->size + src_buffer->size;

   /* do space check before going any further */
   if (!brw_batch_has_aperture_space(brw, bo_sizes))
      intel_batchbuffer_flush(brw);

   if (!brw_batch_has_aperture_space(brw, bo_sizes))
      return false;

   unsigned length = devinfo->gen >= 8 ? 10 : 8;

   intel_batchbuffer_require_space(brw, length * 4);
   DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
       __func__,
       src_buffer, src_pitch, src_offset, src_x, src_y,
       dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);

   intel_get_tile_dims(src_tiling, cpp, &src_tile_w, &src_tile_h);
   intel_get_tile_dims(dst_tiling, cpp, &dst_tile_w, &dst_tile_h);

   /* For Tiled surfaces, the pitch has to be a multiple of the Tile width
    * (X direction width of the Tile). This is ensured while allocating the
    * buffer object.
    */
   assert(src_tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0);
   assert(dst_tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0);

   /* For big formats (such as floating point), do the copy using 16 or
    * 32bpp and multiply the coordinates.
    */
   if (cpp > 4) {
      if (cpp % 4 == 2) {
         dst_x *= cpp / 2;
         dst_x2 *= cpp / 2;
         src_x *= cpp / 2;
         cpp = 2;
      } else {
         assert(cpp % 4 == 0);
         dst_x *= cpp / 4;
         dst_x2 *= cpp / 4;
         src_x *= cpp / 4;
         cpp = 4;
      }
   }

   if (!alignment_valid(brw, dst_offset, dst_tiling))
      return false;
   if (!alignment_valid(brw, src_offset, src_tiling))
      return false;

   /* Blit pitch must be dword-aligned.  Otherwise, the hardware appears to drop
    * the low bits.  Offsets must be naturally aligned.
    */
   if (src_pitch % 4 != 0 || src_offset % cpp != 0 ||
       dst_pitch % 4 != 0 || dst_offset % cpp != 0)
      return false;

   assert(cpp <= 4);
   BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16;

   CMD = xy_blit_cmd(src_tiling, dst_tiling, cpp);

   /* For tiled source and destination, pitch value should be specified
    * as a number of Dwords.
    */
   if (dst_tiling != ISL_TILING_LINEAR)
      dst_pitch /= 4;

   if (src_tiling != ISL_TILING_LINEAR)
      src_pitch /= 4;

   if (dst_y2 <= dst_y || dst_x2 <= dst_x)
      return true;

   assert(dst_x < dst_x2);
   assert(dst_y < dst_y2);

   BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, src_y_tiled);
   OUT_BATCH(CMD | (length - 2));
   OUT_BATCH(BR13 | (uint16_t)dst_pitch);
   OUT_BATCH(SET_FIELD(dst_y, BLT_Y) | SET_FIELD(dst_x, BLT_X));
   OUT_BATCH(SET_FIELD(dst_y2, BLT_Y) | SET_FIELD(dst_x2, BLT_X));
   if (devinfo->gen >= 8) {
      OUT_RELOC64(dst_buffer, RELOC_WRITE, dst_offset);
   } else {
      OUT_RELOC(dst_buffer, RELOC_WRITE, dst_offset);
   }
   OUT_BATCH(SET_FIELD(src_y, BLT_Y) | SET_FIELD(src_x, BLT_X));
   OUT_BATCH((uint16_t)src_pitch);
   if (devinfo->gen >= 8) {
      OUT_RELOC64(src_buffer, 0, src_offset);
   } else {
      OUT_RELOC(src_buffer, 0, src_offset);
   }

   ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled);

   brw_emit_mi_flush(brw);

   return true;
}
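
Two illustrative cases for the cpp > 4 handling above (values not from the source):

   /* cpp = 16 (e.g. RGBA32F): cpp % 4 == 0, so X coordinates are multiplied
    *                          by 4 and the copy proceeds at 4 bytes per pixel.
    * cpp = 6  (e.g. RGB16F):  cpp % 4 == 2, so X coordinates are multiplied
    *                          by 3 and the copy proceeds at 2 bytes per pixel.
    */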