Example #1
/**
 * Allocates a block of space in the batchbuffer for indirect state.
 *
 * We don't want to allocate separate BOs for every bit of indirect
 * state in the driver.  It means overallocating by a significant
 * margin (4096 bytes, even if the object is just a 20-byte surface
 * state), and more buffers to walk and count for aperture size checking.
 *
 * However, due to the restrictions imposed by the aperture size
 * checking performance hacks, we can't have the batch point at a
 * separate indirect state buffer, because once the batch points at
 * it, no more relocations can be added to it.  So, we sneak these
 * buffers in at the top of the batchbuffer.
 */
void *
brw_state_batch(struct brw_context *brw,
                int size,
                int alignment,
                uint32_t *out_offset)
{
   struct intel_batchbuffer *batch = &brw->batch;
   uint32_t offset;

   assert(size < batch->bo->size);
   offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);

   /* If allocating from the top would wrap below the batchbuffer, or
    * if the batch's used space (plus the reserved pad) collides with our
    * space, then flush and try again.
    */
   if (batch->state_batch_offset < size ||
       offset < 4 * USED_BATCH(*batch) + batch->reserved_space) {
      intel_batchbuffer_flush(brw);
      offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
   }

   batch->state_batch_offset = offset;

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      _mesa_hash_table_insert(batch->state_batch_sizes,
                              (void *) (uintptr_t) offset,
                              (void *) (uintptr_t) size);
   }

   *out_offset = offset;
   return batch->map + (offset>>2);
}
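
The allocator above is just pointer arithmetic inside one buffer: indirect state grows down from the top of the batch while commands grow up from the bottom, and a flush is forced whenever the two regions would collide. The following standalone sketch walks through that math with made-up sizes; the ROUND_DOWN_TO definition is an illustrative assumption (power-of-two alignments only), not the driver's own macro.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ROUND_DOWN_TO(value, alignment) ((value) & ~((alignment) - 1))

int main(void)
{
   uint32_t batch_size = 16384;              /* hypothetical batchbuffer size */
   uint32_t state_batch_offset = batch_size; /* state grows down from the top */
   uint32_t used_dwords = 100;               /* commands already emitted */
   uint32_t reserved_space = 64;             /* pad kept free at the batch tail */

   /* Allocate a 20-byte surface state aligned to 32 bytes. */
   uint32_t size = 20, alignment = 32;
   uint32_t offset = ROUND_DOWN_TO(state_batch_offset - size, alignment);

   /* Same collision check as brw_state_batch(): a flush would be needed if
    * the state area dipped below the command stream plus the reserved pad.
    */
   assert(state_batch_offset >= size);
   assert(offset >= 4 * used_dwords + reserved_space);

   state_batch_offset = offset;
   printf("state allocated at byte offset %u\n", (unsigned) offset);
   return 0;
}
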
/**
 * Allocates a block of space in the batchbuffer for indirect state.
 *
 * We don't want to allocate separate BOs for every bit of indirect
 * state in the driver.  It means overallocating by a significant
 * margin (4096 bytes, even if the object is just a 20-byte surface
 * state), and more buffers to walk and count for aperture size checking.
 *
 * However, due to the restrictions imposed by the aperture size
 * checking performance hacks, we can't have the batch point at a
 * separate indirect state buffer, because once the batch points at
 * it, no more relocations can be added to it.  So, we sneak these
 * buffers in at the top of the batchbuffer.
 */
void *
brw_state_batch(struct brw_context *brw,
		enum state_struct_type type,
		int size,
		int alignment,
		uint32_t *out_offset)
{
   struct intel_batchbuffer *batch = &brw->intel.batch;
   uint32_t offset;

   assert(size < batch->bo->size);
   offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);

   /* If allocating from the top would wrap below the batchbuffer, or
    * if the batch's used space (plus the reserved pad) collides with our
    * space, then flush and try again.
    */
   if (batch->state_batch_offset < size ||
       offset < 4*batch->used + batch->reserved_space) {
      intel_batchbuffer_flush(&brw->intel);
      offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
   }

   batch->state_batch_offset = offset;

   if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_AUB)))
      brw_track_state_batch(brw, type, offset, size);

   *out_offset = offset;
   return batch->map + (offset>>2);
}
/**
 * Allocates a block of space in the batchbuffer for indirect state.
 *
 * We don't want to allocate separate BOs for every bit of indirect
 * state in the driver.  It means overallocating by a significant
 * margin (4096 bytes, even if the object is just a 20-byte surface
 * state), and more buffers to walk and count for aperture size checking.
 *
 * However, due to the restrictions imposed by the aperture size
 * checking performance hacks, we can't have the batch point at a
 * separate indirect state buffer, because once the batch points at
 * it, no more relocations can be added to it.  So, we sneak these
 * buffers in at the top of the batchbuffer.
 */
void *
brw_state_batch(struct brw_context *brw,
		int size,
		int alignment,
		drm_intel_bo **out_bo,
		uint32_t *out_offset)
{
   struct intel_batchbuffer *batch = brw->intel.batch;
   uint32_t offset;

   assert(size < batch->buf->size);
   offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);

   /* If allocating from the top would wrap below the batchbuffer, or
    * if the batch's used space (plus the reserved pad) collides with our
    * space, then flush and try again.
    */
   if (batch->state_batch_offset < size ||
       offset < batch->ptr - batch->map + batch->reserved_space) {
      intel_batchbuffer_flush(batch);
      offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
   }

   batch->state_batch_offset = offset;

   if (*out_bo != batch->buf) {
      drm_intel_bo_unreference(*out_bo);
      drm_intel_bo_reference(batch->buf);
      *out_bo = batch->buf;
   }

   *out_offset = offset;
   return batch->map + offset;
}
Example #4
/**
 * When the GS is not in use, we assign the entire URB space to the VS.  When
 * the GS is in use, we split the URB space evenly between the VS and the GS.
 * This is not ideal, but it's simple.
 *
 *           URB size / 2                   URB size / 2
 *   _____________-______________   _____________-______________
 *  /                            \ /                            \
 * +-------------------------------------------------------------+
 * | Vertex Shader Entries        | Geometry Shader Entries      |
 * +-------------------------------------------------------------+
 *
 * Sandybridge GT1 has 32kB of URB space, while GT2 has 64kB.
 * (See the Sandybridge PRM, Volume 2, Part 1, Section 1.4.7: 3DSTATE_URB.)
 */
void
gen6_upload_urb(struct brw_context *brw, unsigned vs_size,
                bool gs_present, unsigned gs_size)
{
   int nr_vs_entries, nr_gs_entries;
   int total_urb_size = brw->urb.size * 1024; /* in bytes */
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* Calculate how many entries fit in each stage's section of the URB */
   if (gs_present) {
      nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
      nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
   } else {
      nr_vs_entries = total_urb_size / (vs_size * 128);
      nr_gs_entries = 0;
   }

   /* Then clamp to the maximum allowed by the hardware */
   if (nr_vs_entries > devinfo->urb.max_vs_entries)
      nr_vs_entries = devinfo->urb.max_vs_entries;

   if (nr_gs_entries > devinfo->urb.max_gs_entries)
      nr_gs_entries = devinfo->urb.max_gs_entries;

   /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
   brw->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);

   assert(brw->urb.nr_vs_entries >= devinfo->urb.min_vs_entries);
   assert(brw->urb.nr_vs_entries % 4 == 0);
   assert(brw->urb.nr_gs_entries % 4 == 0);
   assert(vs_size <= 5);
   assert(gs_size <= 5);

   BEGIN_BATCH(3);
   OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
   OUT_BATCH(((vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) |
	     ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT));
   OUT_BATCH(((gs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) |
	     ((brw->urb.nr_gs_entries) << GEN6_URB_GS_ENTRIES_SHIFT));
   ADVANCE_BATCH();

   /* From the PRM Volume 2 part 1, section 1.4.7:
    *
    *   Because of a urb corruption caused by allocating a previous gsunit’s
    *   urb entry to vsunit software is required to send a "GS NULL
    *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
    *   a dummy DRAW call before any case where VS will be taking over GS URB
    *   space.
    *
    * It is not clear exactly what this means ("URB fence" is a command that
    * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
    * a workaround.
    */
   if (brw->urb.gs_present && !gs_present)
      brw_emit_mi_flush(brw);
   brw->urb.gs_present = gs_present;
}
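
As a concrete illustration of the half-and-half split above, here is a standalone sketch that runs the same arithmetic with assumed numbers (a 32kB URB, entry sizes of 2 units of 128 bytes each, and made-up per-stage entry limits); ROUND_DOWN_TO is defined locally for power-of-two alignments.

#include <stdio.h>

#define ROUND_DOWN_TO(value, alignment) ((value) & ~((alignment) - 1))

int main(void)
{
   int total_urb_size = 32 * 1024;                /* e.g. a 32kB URB */
   int vs_size = 2, gs_size = 2;                  /* entry sizes, in 128-byte units */
   int max_vs_entries = 24, max_gs_entries = 21;  /* assumed device limits */

   /* GS present: each stage gets half of the URB. */
   int nr_vs_entries = (total_urb_size / 2) / (vs_size * 128);   /* 64 */
   int nr_gs_entries = (total_urb_size / 2) / (gs_size * 128);   /* 64 */

   /* Clamp to the device limits, then round down to a multiple of 4. */
   if (nr_vs_entries > max_vs_entries)
      nr_vs_entries = max_vs_entries;                            /* 24 */
   if (nr_gs_entries > max_gs_entries)
      nr_gs_entries = max_gs_entries;                            /* 21 */
   nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);              /* 24 */
   nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);              /* 20 */

   printf("VS entries: %d, GS entries: %d\n", nr_vs_entries, nr_gs_entries);
   return 0;
}
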
/**
 * Stencil is mapped as Y-tiled render target and the dimensions need to be
 * adjusted in order for the Y-tiled rectangle to cover the entire linear
 * memory space of the original W-tiled rectangle.
 */
static void
adjust_tiling(struct blit_dims *dims, int num_samples)
{
   const unsigned x_align = 8, y_align = num_samples > 2 ? 8 : 4;

   dims->dst_x0 = ROUND_DOWN_TO(dims->dst_x0, x_align) * 2;
   dims->dst_y0 = ROUND_DOWN_TO(dims->dst_y0, y_align) / 2;
   dims->dst_x1 = ALIGN(dims->dst_x1, x_align) * 2;
   dims->dst_y1 = ALIGN(dims->dst_y1, y_align) / 2;
}
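
A quick numeric check of the W-to-Y remapping above, as a standalone sketch with a made-up rectangle and a single-sampled surface: the width doubles and the height halves, so the Y-tiled rectangle spans the same linear memory as the original W-tiled one. ROUND_DOWN_TO and ALIGN are defined locally as power-of-two helpers.

#include <stdio.h>

#define ROUND_DOWN_TO(v, a) ((v) & ~((a) - 1))
#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

struct blit_dims { int dst_x0, dst_y0, dst_x1, dst_y1; };

int main(void)
{
   struct blit_dims dims = { 10, 10, 100, 70 };   /* made-up W-tiled rect */
   const int num_samples = 0;                     /* single sampled */
   const int x_align = 8, y_align = num_samples > 2 ? 8 : 4;

   dims.dst_x0 = ROUND_DOWN_TO(dims.dst_x0, x_align) * 2;   /* 16 */
   dims.dst_y0 = ROUND_DOWN_TO(dims.dst_y0, y_align) / 2;   /* 4 */
   dims.dst_x1 = ALIGN(dims.dst_x1, x_align) * 2;           /* 208 */
   dims.dst_y1 = ALIGN(dims.dst_y1, y_align) / 2;           /* 36 */

   printf("Y-tiled rect: (%d,%d) - (%d,%d)\n",
          dims.dst_x0, dims.dst_y0, dims.dst_x1, dims.dst_y1);
   return 0;
}
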
/**
 * Samples in the stencil buffer are interleaved, and unfortunately the data
 * port does not support it as a render target. Therefore the surface is set
 * up as single sampled and the program handles the interleaving.
 * In the case of single-sampled stencil, the render buffer is adjusted to
 * twice the base level height so that the program is able to write any
 * mip level. (Used to set the drawing rectangle for the hw.)
 */
static void
adjust_msaa(struct blit_dims *dims, int num_samples)
{
   if (num_samples == 2) {
      dims->dst_x0 *= 2;
      dims->dst_x1 *= 2;
   } else if (num_samples) {
      const int x_num_samples = num_samples / 2;
      dims->dst_x0 = ROUND_DOWN_TO(dims->dst_x0 * x_num_samples, num_samples);
      dims->dst_y0 = ROUND_DOWN_TO(dims->dst_y0 * 2, 4);
      dims->dst_x1 = ALIGN(dims->dst_x1 * x_num_samples, num_samples);
      dims->dst_y1 = ALIGN(dims->dst_y1 * 2, 4);
   }
}
static void
gen7_upload_urb(struct brw_context *brw)
{
   struct intel_context *intel = &brw->intel;
   const int push_size_kB = intel->is_haswell && intel->gt == 3 ? 32 : 16;

   /* Total space for entries is URB size - 16kB for push constants */
   int handle_region_size = (brw->urb.size - push_size_kB) * 1024; /* bytes */

   /* CACHE_NEW_VS_PROG */
   unsigned vs_size = MAX2(brw->vs.prog_data->urb_entry_size, 1);

   int nr_vs_entries = handle_region_size / (vs_size * 64);
   if (nr_vs_entries > brw->urb.max_vs_entries)
      nr_vs_entries = brw->urb.max_vs_entries;

   /* According to volume 2a, nr_vs_entries must be a multiple of 8. */
   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 8);

   /* URB Starting Addresses are specified in multiples of 8kB. */
   brw->urb.vs_start = push_size_kB / 8; /* skip over push constants */

   assert(brw->urb.nr_vs_entries % 8 == 0);
   assert(brw->urb.nr_gs_entries % 8 == 0);
   /* GS requirement */
   assert(!brw->gs.prog_active);

   gen7_emit_vs_workaround_flush(intel);
   gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start);
}
Example #8
/* We don't have a memmove-type blit like some other hardware, so we'll do a
 * rectangular blit covering a large space, then emit a 1-scanline blit at the
 * end to cover the remainder if needed.
 */
void
intel_emit_linear_blit(struct intel_context *intel,
		       drm_intel_bo *dst_bo,
		       unsigned int dst_offset,
		       drm_intel_bo *src_bo,
		       unsigned int src_offset,
		       unsigned int size)
{
   struct gl_context *ctx = &intel->ctx;
   GLuint pitch, height;
   bool ok;

   /* The pitch given to the GPU must be DWORD aligned, and
    * we want width to match pitch.  Max width is (1 << 15) - 1,
    * rounding that down to the nearest DWORD is (1 << 15) - 4.
    */
   pitch = ROUND_DOWN_TO(MIN2(size, (1 << 15) - 1), 4);
   height = (pitch == 0) ? 1 : size / pitch;
   ok = intelEmitCopyBlit(intel, 1,
			  pitch, src_bo, src_offset, I915_TILING_NONE,
			  pitch, dst_bo, dst_offset, I915_TILING_NONE,
			  0, 0, /* src x/y */
			  0, 0, /* dst x/y */
			  pitch, height, /* w, h */
			  GL_COPY);
   if (!ok)
      _mesa_problem(ctx, "Failed to linear blit %dx%d\n", pitch, height);

   src_offset += pitch * height;
   dst_offset += pitch * height;
   size -= pitch * height;
   assert (size < (1 << 15));
   pitch = ALIGN(size, 4);
   if (size != 0) {
      ok = intelEmitCopyBlit(intel, 1,
			     pitch, src_bo, src_offset, I915_TILING_NONE,
			     pitch, dst_bo, dst_offset, I915_TILING_NONE,
			     0, 0, /* src x/y */
			     0, 0, /* dst x/y */
			     size, 1, /* w, h */
			     GL_COPY);
      if (!ok)
         _mesa_problem(ctx, "Failed to linear blit %dx%d\n", size, 1);
   }
}
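
The split into a rectangular pass plus a one-scanline pass is easiest to see with numbers. The sketch below reproduces just that arithmetic for an assumed 100000-byte copy; MIN2 and ROUND_DOWN_TO are defined locally for illustration.

#include <assert.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define ROUND_DOWN_TO(v, a) ((v) & ~((a) - 1))

int main(void)
{
   unsigned size = 100000;   /* bytes to copy */

   /* Rectangle pass: width == pitch, DWORD aligned and below 2^15. */
   unsigned pitch = ROUND_DOWN_TO(MIN2(size, (1u << 15) - 1), 4u);   /* 32764 */
   unsigned height = (pitch == 0) ? 1 : size / pitch;                /* 3 */
   printf("rect pass: %u x %u covers %u bytes\n", pitch, height, pitch * height);

   /* Scanline pass: whatever is left fits in a single row. */
   size -= pitch * height;                                           /* 1708 */
   assert(size < (1u << 15));
   if (size != 0)
      printf("scanline pass: %u x 1\n", size);
   return 0;
}
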
static void
get_fast_clear_rect(struct brw_context *brw, struct gl_framebuffer *fb,
                    struct intel_renderbuffer *irb, struct rect *rect)
{
   unsigned int x_align, y_align;
   unsigned int x_scaledown, y_scaledown;

   if (irb->mt->msaa_layout == INTEL_MSAA_LAYOUT_NONE) {
      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
       *
       *     Clear pass must have a clear rectangle that must follow
       *     alignment rules in terms of pixels and lines as shown in the
       *     table below. Further, the clear-rectangle height and width
       *     must be multiple of the following dimensions. If the height
       *     and width of the render target being cleared do not meet these
       *     requirements, an MCS buffer can be created such that it
       *     follows the requirement and covers the RT.
       *
       * The alignment size in the table that follows is related to the
       * alignment size returned by intel_get_non_msrt_mcs_alignment(), but
       * with X alignment multiplied by 16 and Y alignment multiplied by 32.
       */
      intel_get_non_msrt_mcs_alignment(brw, irb->mt, &x_align, &y_align);
      x_align *= 16;
      y_align *= 32;

      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
       *
       *     In order to optimize the performance MCS buffer (when bound to
       *     1X RT) clear similarly to MCS buffer clear for MSRT case,
       *     clear rect is required to be scaled by the following factors
       *     in the horizontal and vertical directions:
       *
       * The X and Y scale down factors in the table that follows are each
       * equal to half the alignment value computed above.
       */
      x_scaledown = x_align / 2;
      y_scaledown = y_align / 2;

      /* From BSpec: 3D-Media-GPGPU Engine > 3D Pipeline > Pixel > Pixel
       * Backend > MCS Buffer for Render Target(s) [DevIVB+] > Table "Color
       * Clear of Non-MultiSampled Render Target Restrictions":
       *
       *   Clear rectangle must be aligned to two times the number of
       *   pixels in the table shown below due to 16x16 hashing across the
       *   slice.
       */
      x_align *= 2;
      y_align *= 2;
   } else {
      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
       * Target(s)", beneath the "MSAA Compression" bullet (p326):
       *
       *     Clear pass for this case requires that scaled down primitive
       *     is sent down with upper left co-ordinate to coincide with
       *     actual rectangle being cleared. For MSAA, clear rectangle’s
       *     height and width need to as show in the following table in
       *     terms of (width,height) of the RT.
       *
       *     MSAA  Width of Clear Rect  Height of Clear Rect
       *      4X     Ceil(1/8*width)      Ceil(1/2*height)
       *      8X     Ceil(1/2*width)      Ceil(1/2*height)
       *
       * The text "with upper left co-ordinate to coincide with actual
       * rectangle being cleared" is a little confusing--it seems to imply
       * that to clear a rectangle from (x,y) to (x+w,y+h), one needs to
       * feed the pipeline using the rectangle (x,y) to
       * (x+Ceil(w/N),y+Ceil(h/2)), where N is either 2 or 8 depending on
       * the number of samples.  Experiments indicate that this is not
       * quite correct; actually, what the hardware appears to do is to
       * align whatever rectangle is sent down the pipeline to the nearest
       * multiple of 2x2 blocks, and then scale it up by a factor of N
       * horizontally and 2 vertically.  So the resulting alignment is 4
       * vertically and either 4 or 16 horizontally, and the scaledown
       * factor is 2 vertically and either 2 or 8 horizontally.
       */
      switch (irb->mt->num_samples) {
      case 2:
      case 4:
         x_scaledown = 8;
         break;
      case 8:
         x_scaledown = 2;
         break;
      default:
         unreachable("Unexpected sample count for fast clear");
      }
      y_scaledown = 2;
      x_align = x_scaledown * 2;
      y_align = y_scaledown * 2;
   }

   rect->x0 = fb->_Xmin;
   rect->x1 = fb->_Xmax;
   if (fb->Name != 0) {
      rect->y0 = fb->_Ymin;
      rect->y1 = fb->_Ymax;
   } else {
      rect->y0 = fb->Height - fb->_Ymax;
      rect->y1 = fb->Height - fb->_Ymin;
   }

   rect->x0 = ROUND_DOWN_TO(rect->x0,  x_align) / x_scaledown;
   rect->y0 = ROUND_DOWN_TO(rect->y0, y_align) / y_scaledown;
   rect->x1 = ALIGN(rect->x1, x_align) / x_scaledown;
   rect->y1 = ALIGN(rect->y1, y_align) / y_scaledown;
}
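
To make the final align-and-scaledown step concrete, here is a standalone sketch using the 8X MSAA factors from the switch above (x_scaledown = y_scaledown = 2, so x_align = y_align = 4) on a made-up rectangle; the ALIGN and ROUND_DOWN_TO definitions are illustrative power-of-two helpers.

#include <stdio.h>

#define ROUND_DOWN_TO(v, a) ((v) & ~((a) - 1))
#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

int main(void)
{
   /* 8X MSAA factors from the switch above. */
   unsigned x_scaledown = 2, y_scaledown = 2;
   unsigned x_align = x_scaledown * 2, y_align = y_scaledown * 2;

   /* Made-up framebuffer rectangle to be cleared. */
   unsigned x0 = 3, y0 = 5, x1 = 1000, y1 = 700;

   x0 = ROUND_DOWN_TO(x0, x_align) / x_scaledown;   /* 0 */
   y0 = ROUND_DOWN_TO(y0, y_align) / y_scaledown;   /* 2 */
   x1 = ALIGN(x1, x_align) / x_scaledown;           /* 500 */
   y1 = ALIGN(y1, y_align) / y_scaledown;           /* 350 */

   printf("rect sent down the pipeline: (%u,%u) - (%u,%u)\n", x0, y0, x1, y1);
   return 0;
}
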
Example #10
/*
 * Allocate memory from the given pool.  Grow the pool if needed and if
 * possible.
 */
static unsigned long
AllocFromPool(ScrnInfoPtr pScrn, I830MemRange *result, I830MemPool *pool,
	      unsigned long size, unsigned long alignment, int flags)
{
   I830Ptr pI830 = I830PTR(pScrn);
   unsigned long needed, start, end;
   Bool dryrun = ((flags & ALLOCATE_DRY_RUN) != 0);

   if (!result || !pool || !size)
      return 0;

   /* Calculate how much space is needed. */
   if (alignment <= GTT_PAGE_SIZE)
      needed = size;
   else {
      if (flags & ALLOCATE_AT_BOTTOM) {
	 start = ROUND_TO(pool->Free.Start, alignment);
	 if (flags & ALIGN_BOTH_ENDS)
	    end = ROUND_TO(start + size, alignment);
	 else
	    end = start + size;
	 needed = end - pool->Free.Start;
      } else {				/* allocate at top */
	 if (flags & ALIGN_BOTH_ENDS)
	    end = ROUND_DOWN_TO(pool->Free.End, alignment);
	 else
	    end = pool->Free.End;

	 start = ROUND_DOWN_TO(end - size, alignment);
	 needed = pool->Free.End - start;
      }
   }
   if (needed > pool->Free.Size) {
      unsigned long extra;
      /* See if the pool can be grown. */
      if (pI830->StolenOnly && !dryrun)
	 return 0;
      extra = needed - pool->Free.Size;
      extra = ROUND_TO_PAGE(extra);
      if (extra > pI830->FreeMemory) {
	 if (dryrun)
	    pI830->FreeMemory = extra;
	 else
	    return 0;
      }

      if (!dryrun && (extra > pI830->MemoryAperture.Size))
	 return 0;

      pool->Free.Size += extra;
      pool->Free.End += extra;
      pool->Total.Size += extra;
      pool->Total.End += extra;
      pI830->FreeMemory -= extra;
      pI830->MemoryAperture.Start += extra;
      pI830->MemoryAperture.Size -= extra;
   }
   if (flags & ALLOCATE_AT_BOTTOM) {
      result->Start = ROUND_TO(pool->Free.Start, alignment);
      pool->Free.Start += needed;
      result->End = pool->Free.Start;
   } else {
      result->Start = ROUND_DOWN_TO(pool->Free.End - size, alignment) -
			pool->Total.End;
      result->End = pool->Free.End - pool->Total.End;
      pool->Free.End -= needed;
   }
   pool->Free.Size = pool->Free.End - pool->Free.Start;
   result->Size = result->End - result->Start;
   result->Pool = pool;
   result->Alignment = alignment;
   return needed;
}
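
The "needed" computation above differs for bottom and top allocations because an aligned top allocation also charges the gap up to the old end of the pool. The standalone sketch below shows both cases with made-up pool bounds; ROUND_TO and ROUND_DOWN_TO are defined locally for power-of-two alignments only, whereas the driver's macros also handle other alignments.

#include <stdio.h>

#define ROUND_DOWN_TO(v, a) ((v) & ~((a) - 1))
#define ROUND_TO(v, a) (((v) + (a) - 1) & ~((a) - 1))

int main(void)
{
   unsigned long free_start = 0x1000, free_end = 0x9000;  /* made-up pool */
   unsigned long size = 0x1800, alignment = 0x1000;

   /* Bottom: round the start up, then allocate 'size' bytes after it. */
   unsigned long b_start = ROUND_TO(free_start, alignment);            /* 0x1000 */
   unsigned long b_needed = (b_start + size) - free_start;             /* 0x1800 */

   /* Top: round (end - size) down; the gap up to the old end is charged
    * to the allocation as well.
    */
   unsigned long t_start = ROUND_DOWN_TO(free_end - size, alignment);  /* 0x7000 */
   unsigned long t_needed = free_end - t_start;                        /* 0x2000 */

   printf("bottom: start=0x%lx, needed=0x%lx\n", b_start, b_needed);
   printf("top:    start=0x%lx, needed=0x%lx\n", t_start, t_needed);
   return 0;
}
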
Example #11
Bool
I830Allocate2DMemory(ScrnInfoPtr pScrn, const int flags)
{
   I830Ptr pI830 = I830PTR(pScrn);
   unsigned long size, alloced;
   Bool dryrun = ((flags & ALLOCATE_DRY_RUN) != 0);
   int verbosity = dryrun ? 4 : 1;
   const char *s = dryrun ? "[dryrun] " : "";
   Bool tileable;
   int align, alignflags;

   DPRINTF(PFX, "I830Allocate2DMemory: initial is %s\n",
	   BOOLTOSTRING(flags & ALLOC_INITIAL));

   if (!pI830->StolenOnly &&
       (!xf86AgpGARTSupported() || !xf86AcquireGART(pScrn->scrnIndex))) {
      if (!dryrun) {
	 xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
		    "AGP GART support is either not available or cannot "
		    "be used.\n"
		    "\tMake sure your kernel has agpgart support or has the\n"
		    "\tagpgart module loaded.\n");
      }
      return FALSE;
   }


   /*
    * The I830 is slightly different from the I810/I815: it has no
    * dcache and it has stolen memory by default in its gtt.  All
    * additional memory must go after it.
    */

   DPRINTF(PFX,
	   "size == %luk (%lu bytes == pScrn->videoRam)\n"
	   "pI830->StolenSize == %luk (%lu bytes)\n",
	   pScrn->videoRam, pScrn->videoRam * 1024,
	   pI830->StolenPool.Free.Size / 1024,
	   pI830->StolenPool.Free.Size);

   if (flags & ALLOC_INITIAL) {
      unsigned long minspace, avail, lineSize;
      int cacheLines, maxCacheLines;

      if (pI830->NeedRingBufferLow)
	 AllocateRingBuffer(pScrn, flags | FORCE_LOW);

      /* Clear everything first. */
      memset(&(pI830->FbMemBox), 0, sizeof(pI830->FbMemBox));
      memset(&(pI830->FrontBuffer), 0, sizeof(pI830->FrontBuffer));
      pI830->FrontBuffer.Key = -1;

      pI830->FbMemBox.x1 = 0;
      pI830->FbMemBox.x2 = pScrn->displayWidth;
      pI830->FbMemBox.y1 = 0;
      pI830->FbMemBox.y2 = pScrn->virtualY;

      /*
       * Calculate how much framebuffer memory to allocate.  For the
       * initial allocation, calculate a reasonable minimum.  This is
       * enough for the virtual screen size, plus some pixmap cache
       * space.
       */

      lineSize = pScrn->displayWidth * pI830->cpp;
      minspace = lineSize * pScrn->virtualY;
      avail = pScrn->videoRam * 1024;
      maxCacheLines = (avail - minspace) / lineSize;
      /* This shouldn't happen. */
      if (maxCacheLines < 0) {
	 xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
		    "Internal Error: "
		    "maxCacheLines < 0 in I830Allocate2DMemory()\n");
	 maxCacheLines = 0;
      }
      if (maxCacheLines > (MAX_DISPLAY_HEIGHT - pScrn->virtualY))
	 maxCacheLines = MAX_DISPLAY_HEIGHT - pScrn->virtualY;

      if (pI830->CacheLines >= 0) {
	 cacheLines = pI830->CacheLines;
      } else {
#if 1
	 /* Make sure there is enough for two DVD sized YUV buffers */
	 cacheLines = (pScrn->depth == 24) ? 256 : 384;
	 if (pScrn->displayWidth <= 1024)
	    cacheLines *= 2;
#else
	 /*
	  * Make sure there is enough for two DVD sized YUV buffers.
	  * Make that 1.5MB, which is around what was allocated with
	  * the old algorithm
	  */
	 cacheLines = (MB(1) + KB(512)) / pI830->cpp / pScrn->displayWidth;
#endif
      }
      if (cacheLines > maxCacheLines)
	 cacheLines = maxCacheLines;

      pI830->FbMemBox.y2 += cacheLines;

      xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
		     "%sAllocating at least %d scanlines for pixmap cache\n",
		     s, cacheLines);

      tileable = !(flags & ALLOC_NO_TILING) && pI830->allowPageFlip &&
		 IsTileable(pScrn->displayWidth * pI830->cpp);
      if (tileable) {
	 align = KB(512);
	 alignflags = ALIGN_BOTH_ENDS;
      } else {
	 align = KB(64);
	 alignflags = 0;
      }

      size = lineSize * (pScrn->virtualY + cacheLines);
      size = ROUND_TO_PAGE(size);
      xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
		     "%sInitial framebuffer allocation size: %d kByte\n", s,
		     size / 1024);
      alloced = I830AllocVidMem(pScrn, &(pI830->FrontBuffer),
				&(pI830->StolenPool), size, align,
				flags | alignflags |
				FROM_ANYWHERE | ALLOCATE_AT_BOTTOM);
      if (alloced < size) {
	 if (!dryrun) {
	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
		       "Failed to allocate framebuffer.\n");
	 }
	 return FALSE;
      }
   } else {
      unsigned long lineSize;
      unsigned long extra = 0;
      unsigned long maxFb = 0;

      /*
       * XXX Need to "free" up any 3D allocations if the DRI ended up
       * and make them available for 2D.  The best way to do this would
       * be position all of those regions contiguously at the end of the
       * StolenPool.
       */
      extra = GetFreeSpace(pScrn);

      if (extra == 0)
	 return TRUE;

      maxFb = pI830->FrontBuffer.Size + extra;
      lineSize = pScrn->displayWidth * pI830->cpp;
      maxFb = ROUND_DOWN_TO(maxFb, lineSize);
      if (maxFb > lineSize * MAX_DISPLAY_HEIGHT)
	 maxFb = lineSize * MAX_DISPLAY_HEIGHT;
      if (maxFb > pI830->FrontBuffer.Size) {
	 unsigned long oldsize;
	 /*
	  * Sanity check -- the fb should be the last thing allocated at
	  * the bottom of the stolen pool.
	  */
	 if (pI830->StolenPool.Free.Start != pI830->FrontBuffer.End) {
	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
		       "Internal error in I830Allocate2DMemory():\n\t"
		       "Framebuffer isn't the last allocation at the bottom"
		       " of StolenPool\n\t(%x != %x).\n",
		       pI830->FrontBuffer.End,
		       pI830->StolenPool.Free.Start);
	    return FALSE;
	 }
	 /*
	  * XXX Maybe should have a "Free" function.  This should be
	  * the only place where a region is resized, and we know that
	  * the fb is always at the bottom of the aperture/stolen pool,
	  * and is the only region that is allocated bottom-up.
	  * Allowing for more general reallocation would require a smarter
	  * allocation system.
	  */
	 oldsize = pI830->FrontBuffer.Size;
	 pI830->StolenPool.Free.Size += pI830->FrontBuffer.Size;
	 pI830->StolenPool.Free.Start -= pI830->FrontBuffer.Size;
	 xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
			"%sUpdated framebuffer allocation size from %d "
			"to %d kByte\n", s, oldsize / 1024, maxFb / 1024);
	 xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
			"%sUpdated pixmap cache from %d scanlines to %d "
			"scanlines\n", s,
			oldsize / lineSize - pScrn->virtualY,
			maxFb / lineSize - pScrn->virtualY);
	 pI830->FbMemBox.y2 = maxFb / lineSize;
	 tileable = !(flags & ALLOC_NO_TILING) && pI830->allowPageFlip &&
		 IsTileable(pScrn->displayWidth * pI830->cpp);
	 if (tileable) {
	    align = KB(512);
	    alignflags = ALIGN_BOTH_ENDS;
	 } else {
	    align = KB(64);
	    alignflags = 0;
	 }
	 alloced = I830AllocVidMem(pScrn, &(pI830->FrontBuffer),
				   &(pI830->StolenPool), maxFb, align,
				   flags | alignflags |
				   FROM_ANYWHERE | ALLOCATE_AT_BOTTOM);
	 if (alloced < maxFb) {
	    if (!dryrun) {
	       xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
			  "Failed to re-allocate framebuffer\n");
	    }
	    return FALSE;
	 }
      }
      return TRUE;
   }

#if REMAP_RESERVED
   /*
    * Allocate a dummy page to pass when attempting to rebind the
    * pre-allocated region.
    */
   if (!dryrun) {
      memset(&(pI830->Dummy), 0, sizeof(pI830->Dummy));
      pI830->Dummy.Key =
	   xf86AllocateGARTMemory(pScrn->scrnIndex, size, 0, NULL);
      pI830->Dummy.Offset = 0;
   }
#endif

   /* Clear cursor info */
   memset(&(pI830->CursorMem), 0, sizeof(pI830->CursorMem));
   pI830->CursorMem.Key = -1;

   if (!pI830->SWCursor) {
      int cursFlags = 0;
      /*
       * Mouse cursor -- The i810-i830 need a physical address in system
       * memory from which to upload the cursor.  We get this from
       * the agpgart module using a special memory type.
       */

      size = HWCURSOR_SIZE;
      cursFlags = FROM_ANYWHERE | ALLOCATE_AT_TOP;
      if (pI830->CursorNeedsPhysical)
	 cursFlags |= NEED_PHYSICAL_ADDR;

      alloced = I830AllocVidMem(pScrn, &(pI830->CursorMem),
				&(pI830->StolenPool), size,
				GTT_PAGE_SIZE, flags | cursFlags);
      if (alloced < size) {
	 if (!dryrun) {
	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
		       "Failed to allocate HW cursor space.\n");
	 }
      } else {
	 xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
			"%sAllocated %d kB for HW cursor at 0x%x", s,
			alloced / 1024, pI830->CursorMem.Start);
	 if (pI830->CursorNeedsPhysical)
	    xf86ErrorFVerb(verbosity, " (0x%08x)", pI830->CursorMem.Physical);
	 xf86ErrorFVerb(verbosity, "\n");
      }
   }

#ifdef I830_XV
   AllocateOverlay(pScrn, flags);
#endif

   if (!pI830->NeedRingBufferLow)
      AllocateRingBuffer(pScrn, flags);

   /* Clear scratch info */
   memset(&(pI830->Scratch), 0, sizeof(pI830->Scratch));
   pI830->Scratch.Key = -1;

   if (!pI830->noAccel) {
      size = MAX_SCRATCH_BUFFER_SIZE;
      alloced = I830AllocVidMem(pScrn, &(pI830->Scratch), &(pI830->StolenPool),
				size, GTT_PAGE_SIZE,
				flags | FROM_ANYWHERE | ALLOCATE_AT_TOP);
      if (alloced < size) {
	 size = MIN_SCRATCH_BUFFER_SIZE;
         alloced = I830AllocVidMem(pScrn, &(pI830->Scratch),
				   &(pI830->StolenPool), size,
				   GTT_PAGE_SIZE,
				   flags | FROM_ANYWHERE | ALLOCATE_AT_TOP);
      }
      if (alloced < size) {
	 if (!dryrun) {
	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
		       "Failed to allocate scratch buffer space\n");
	 }
	 return FALSE;
      }
      xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, verbosity,
		    "%sAllocated %d kB for the scratch buffer at 0x%x\n", s,
		    alloced / 1024, pI830->Scratch.Start);
   }
   return TRUE;
}
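
The initial framebuffer sizing in the ALLOC_INITIAL path is plain arithmetic: one scanline's worth of bytes times (virtual height plus pixmap-cache scanlines), rounded up to a page. The sketch below reproduces it with assumed mode values (1024x768 at 32bpp, 8 MB of video RAM, depth 24) and omits the MAX_DISPLAY_HEIGHT clamp for brevity.

#include <stdio.h>

#define ROUND_TO_PAGE(v) (((v) + 4095UL) & ~4095UL)   /* assumes 4kB pages */

int main(void)
{
   unsigned long displayWidth = 1024, virtualY = 768, cpp = 4;
   unsigned long videoRam = 8192;                          /* in kB */
   int depth = 24;

   unsigned long lineSize = displayWidth * cpp;            /* 4096 bytes */
   unsigned long minspace = lineSize * virtualY;           /* 3 MB */
   unsigned long avail = videoRam * 1024;                  /* 8 MB */
   long maxCacheLines = (long) ((avail - minspace) / lineSize);   /* 1280 */

   /* Default pixmap cache: room for two DVD sized YUV buffers. */
   long cacheLines = (depth == 24) ? 256 : 384;
   if (displayWidth <= 1024)
      cacheLines *= 2;                                     /* 512 */
   if (cacheLines > maxCacheLines)
      cacheLines = maxCacheLines;

   unsigned long size = ROUND_TO_PAGE(lineSize * (virtualY + cacheLines));
   printf("initial framebuffer allocation: %lu kB\n", size / 1024);   /* 5120 */
   return 0;
}
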
Example #12
static unsigned long
AllocFromAGP(ScrnInfoPtr pScrn, I830MemRange *result, unsigned long size,
	     unsigned long alignment, int flags)
{
   I830Ptr pI830 = I830PTR(pScrn);
   unsigned long start, end;
   unsigned long newApStart, newApEnd;
   Bool dryrun = ((flags & ALLOCATE_DRY_RUN) != 0);

   if (!result || !size)
      return 0;

   if ((flags & ALLOCATE_AT_BOTTOM) && pI830->StolenMemory.Size != 0) {
      xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
		 "AllocFromAGP(): can't allocate from "
		 "bottom when there is stolen memory\n");
      return 0;
   }

   if (size > pI830->FreeMemory) {
      if (dryrun)
	 pI830->FreeMemory = size;
      else
	 return 0;
   }

   /* Calculate offset */
   if (flags & ALLOCATE_AT_BOTTOM) {
      start = ROUND_TO(pI830->MemoryAperture.Start, alignment);
      if (flags & ALIGN_BOTH_ENDS)
	 end = ROUND_TO(start + size, alignment);
      else
	 end = start + size;
      newApStart = end;
      newApEnd = pI830->MemoryAperture.End;
   } else {
      if (flags & ALIGN_BOTH_ENDS)
	 end = ROUND_DOWN_TO(pI830->MemoryAperture.End, alignment);
      else
	 end = pI830->MemoryAperture.End;
      start = ROUND_DOWN_TO(end - size, alignment);
      newApStart = pI830->MemoryAperture.Start;
      newApEnd = start;
   }

   if (!dryrun) {
      if (newApStart > newApEnd)
	 return 0;

      if (flags & NEED_PHYSICAL_ADDR) {
	 result->Key = xf86AllocateGARTMemory(pScrn->scrnIndex, size, 2,
					      &(result->Physical));
      } else {
	 result->Key = xf86AllocateGARTMemory(pScrn->scrnIndex, size, 0, NULL);
      }
      if (result->Key == -1)
	 return 0;
   }

   pI830->allocatedMemory += size;
   pI830->MemoryAperture.Start = newApStart;
   pI830->MemoryAperture.End = newApEnd;
   pI830->MemoryAperture.Size = newApEnd - newApStart;
   pI830->FreeMemory -= size;
   result->Start = start;
   result->End = start + size;
   result->Size = size;
   result->Offset = start;
   result->Alignment = alignment;
   result->Pool = NULL;

   return size;
}
Example #13
brw_blorp_clear_params::brw_blorp_clear_params(struct brw_context *brw,
                                               struct gl_framebuffer *fb,
                                               struct gl_renderbuffer *rb,
                                               GLubyte *color_mask,
                                               bool partial_clear,
                                               unsigned layer)
{
   struct gl_context *ctx = &brw->ctx;
   struct intel_renderbuffer *irb = intel_renderbuffer(rb);

   dst.set(brw, irb->mt, irb->mt_level, layer, true);

   /* Override the surface format according to the context's sRGB rules. */
   mesa_format format = _mesa_get_render_format(ctx, irb->mt->format);
   dst.brw_surfaceformat = brw->render_target_format[format];

   x0 = fb->_Xmin;
   x1 = fb->_Xmax;
   if (rb->Name != 0) {
      y0 = fb->_Ymin;
      y1 = fb->_Ymax;
   } else {
      y0 = rb->Height - fb->_Ymax;
      y1 = rb->Height - fb->_Ymin;
   }

   float *push_consts = (float *)&wm_push_consts;

   push_consts[0] = ctx->Color.ClearColor.f[0];
   push_consts[1] = ctx->Color.ClearColor.f[1];
   push_consts[2] = ctx->Color.ClearColor.f[2];
   push_consts[3] = ctx->Color.ClearColor.f[3];

   use_wm_prog = true;

   memset(&wm_prog_key, 0, sizeof(wm_prog_key));

   wm_prog_key.use_simd16_replicated_data = true;

   /* From the SNB PRM (Vol4_Part1):
    *
    *     "Replicated data (Message Type = 111) is only supported when
    *      accessing tiled memory.  Using this Message Type to access linear
    *      (untiled) memory is UNDEFINED."
    */
   if (irb->mt->tiling == I915_TILING_NONE)
      wm_prog_key.use_simd16_replicated_data = false;

   /* Constant color writes ignore everything in blend and color calculator
    * state.  This is not documented.
    */
   for (int i = 0; i < 4; i++) {
      if (_mesa_format_has_color_component(irb->mt->format, i) &&
          !color_mask[i]) {
         color_write_disable[i] = true;
         wm_prog_key.use_simd16_replicated_data = false;
      }
   }

   /* If we can do this as a fast color clear, do so.
    *
    * Note that the condition "!partial_clear" means we only try to do full
    * buffer clears using fast color clear logic.  This is necessary because
    * the fast color clear alignment requirements mean that we typically have
    * to clear a larger rectangle than (x0, y0) to (x1, y1).  Restricting fast
    * color clears to the full-buffer condition guarantees that the extra
    * memory locations that get written to are outside the image boundary (and
    * hence irrelevant).  Note that the rectangle alignment requirements are
    * never larger than the size of a tile, so there is no danger of
    * overflowing beyond the memory belonging to the region.
    */
   if (irb->mt->fast_clear_state != INTEL_FAST_CLEAR_STATE_NO_MCS &&
       !partial_clear && wm_prog_key.use_simd16_replicated_data &&
       is_color_fast_clear_compatible(brw, format, &ctx->Color.ClearColor)) {
      memset(push_consts, 0xff, 4*sizeof(float));
      fast_clear_op = GEN7_FAST_CLEAR_OP_FAST_CLEAR;

      /* Figure out what the clear rectangle needs to be aligned to, and how
       * much it needs to be scaled down.
       */
      unsigned x_align, y_align, x_scaledown, y_scaledown;

      if (irb->mt->msaa_layout == INTEL_MSAA_LAYOUT_NONE) {
         /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
          * Target(s)", beneath the "Fast Color Clear" bullet (p327):
          *
          *     Clear pass must have a clear rectangle that must follow
          *     alignment rules in terms of pixels and lines as shown in the
          *     table below. Further, the clear-rectangle height and width
          *     must be multiple of the following dimensions. If the height
          *     and width of the render target being cleared do not meet these
          *     requirements, an MCS buffer can be created such that it
          *     follows the requirement and covers the RT.
          *
          * The alignment size in the table that follows is related to the
          * alignment size returned by intel_get_non_msrt_mcs_alignment(), but
          * with X alignment multiplied by 16 and Y alignment multiplied by 32.
          */
         intel_get_non_msrt_mcs_alignment(brw, irb->mt, &x_align, &y_align);
         x_align *= 16;
         y_align *= 32;

         /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
          * Target(s)", beneath the "Fast Color Clear" bullet (p327):
          *
          *     In order to optimize the performance MCS buffer (when bound to
          *     1X RT) clear similarly to MCS buffer clear for MSRT case,
          *     clear rect is required to be scaled by the following factors
          *     in the horizontal and vertical directions:
          *
          * The X and Y scale down factors in the table that follows are each
          * equal to half the alignment value computed above.
          */
         x_scaledown = x_align / 2;
         y_scaledown = y_align / 2;

         /* From BSpec: 3D-Media-GPGPU Engine > 3D Pipeline > Pixel > Pixel
          * Backend > MCS Buffer for Render Target(s) [DevIVB+] > Table "Color
          * Clear of Non-MultiSampled Render Target Restrictions":
          *
          *   Clear rectangle must be aligned to two times the number of
          *   pixels in the table shown below due to 16x16 hashing across the
          *   slice.
          */
         x_align *= 2;
         y_align *= 2;
      } else {
         /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
          * Target(s)", beneath the "MSAA Compression" bullet (p326):
          *
          *     Clear pass for this case requires that scaled down primitive
          *     is sent down with upper left co-ordinate to coincide with
          *     actual rectangle being cleared. For MSAA, clear rectangle’s
          *     height and width need to as show in the following table in
          *     terms of (width,height) of the RT.
          *
          *     MSAA  Width of Clear Rect  Height of Clear Rect
          *      4X     Ceil(1/8*width)      Ceil(1/2*height)
          *      8X     Ceil(1/2*width)      Ceil(1/2*height)
          *
          * The text "with upper left co-ordinate to coincide with actual
          * rectangle being cleared" is a little confusing--it seems to imply
          * that to clear a rectangle from (x,y) to (x+w,y+h), one needs to
          * feed the pipeline using the rectangle (x,y) to
          * (x+Ceil(w/N),y+Ceil(h/2)), where N is either 2 or 8 depending on
          * the number of samples.  Experiments indicate that this is not
          * quite correct; actually, what the hardware appears to do is to
          * align whatever rectangle is sent down the pipeline to the nearest
          * multiple of 2x2 blocks, and then scale it up by a factor of N
          * horizontally and 2 vertically.  So the resulting alignment is 4
          * vertically and either 4 or 16 horizontally, and the scaledown
          * factor is 2 vertically and either 2 or 8 horizontally.
          */
         switch (irb->mt->num_samples) {
         case 4:
            x_scaledown = 8;
            break;
         case 8:
            x_scaledown = 2;
            break;
         default:
            assert(!"Unexpected sample count for fast clear");
            break;
         }
         y_scaledown = 2;
         x_align = x_scaledown * 2;
         y_align = y_scaledown * 2;
      }

      /* Do the alignment and scaledown. */
      x0 = ROUND_DOWN_TO(x0,  x_align) / x_scaledown;
      y0 = ROUND_DOWN_TO(y0, y_align) / y_scaledown;
      x1 = ALIGN(x1, x_align) / x_scaledown;
      y1 = ALIGN(y1, y_align) / y_scaledown;
   }
}
Example #14
brw_blorp_clear_params::brw_blorp_clear_params(struct brw_context *brw,
                                               struct gl_framebuffer *fb,
                                               struct gl_renderbuffer *rb,
                                               GLubyte *color_mask,
                                               bool partial_clear)
{
   struct gl_context *ctx = &brw->ctx;
   struct intel_renderbuffer *irb = intel_renderbuffer(rb);

   dst.set(brw, irb->mt, irb->mt_level, irb->mt_layer);

   /* Override the surface format according to the context's sRGB rules. */
   gl_format format = _mesa_get_render_format(ctx, irb->mt->format);
   dst.brw_surfaceformat = brw->render_target_format[format];

   x0 = fb->_Xmin;
   x1 = fb->_Xmax;
   if (rb->Name != 0) {
      y0 = fb->_Ymin;
      y1 = fb->_Ymax;
   } else {
      y0 = rb->Height - fb->_Ymax;
      y1 = rb->Height - fb->_Ymin;
   }

   float *push_consts = (float *)&wm_push_consts;

   push_consts[0] = ctx->Color.ClearColor.f[0];
   push_consts[1] = ctx->Color.ClearColor.f[1];
   push_consts[2] = ctx->Color.ClearColor.f[2];
   push_consts[3] = ctx->Color.ClearColor.f[3];

   use_wm_prog = true;

   memset(&wm_prog_key, 0, sizeof(wm_prog_key));

   wm_prog_key.use_simd16_replicated_data = true;

   /* From the SNB PRM (Vol4_Part1):
    *
    *     "Replicated data (Message Type = 111) is only supported when
    *      accessing tiled memory.  Using this Message Type to access linear
    *      (untiled) memory is UNDEFINED."
    */
   if (irb->mt->region->tiling == I915_TILING_NONE)
      wm_prog_key.use_simd16_replicated_data = false;

   /* Constant color writes ignore everything in blend and color calculator
    * state.  This is not documented.
    */
   for (int i = 0; i < 4; i++) {
      if (!color_mask[i]) {
         color_write_disable[i] = true;
         wm_prog_key.use_simd16_replicated_data = false;
      }
   }

   /* If we can do this as a fast color clear, do so. */
   if (irb->mt->mcs_state != INTEL_MCS_STATE_NONE && !partial_clear &&
       wm_prog_key.use_simd16_replicated_data &&
       is_color_fast_clear_compatible(brw, format, &ctx->Color.ClearColor)) {
      memset(push_consts, 0xff, 4*sizeof(float));
      fast_clear_op = GEN7_FAST_CLEAR_OP_FAST_CLEAR;

      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
       *
       *     Clear pass must have a clear rectangle that must follow alignment
       *     rules in terms of pixels and lines as shown in the table
       *     below. Further, the clear-rectangle height and width must be
       *     multiple of the following dimensions. If the height and width of
       *     the render target being cleared do not meet these requirements,
       *     an MCS buffer can be created such that it follows the requirement
       *     and covers the RT.
       *
       * The alignment size in the table that follows is related to the
       * alignment size returned by intel_get_non_msrt_mcs_alignment(), but
       * with X alignment multiplied by 16 and Y alignment multiplied by 32.
       */
      unsigned x_align, y_align;
      intel_get_non_msrt_mcs_alignment(brw, irb->mt, &x_align, &y_align);
      x_align *= 16;
      y_align *= 32;

      /* From BSpec: 3D-Media-GPGPU Engine > 3D Pipeline > Pixel > Pixel
       * Backend > MCS Buffer for Render Target(s) [DevIVB+] > Table "Color
       * Clear of Non-MultiSampled Render Target Restrictions":
       *
       *   Clear rectangle must be aligned to two times the number of pixels in
       *   the table shown below due to 16x16 hashing across the slice.
       */
      x0 = ROUND_DOWN_TO(x0, 2 * x_align);
      y0 = ROUND_DOWN_TO(y0, 2 * y_align);
      x1 = ALIGN(x1, 2 * x_align);
      y1 = ALIGN(y1, 2 * y_align);

      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
       *
       *     In order to optimize the performance MCS buffer (when bound to 1X
       *     RT) clear similarly to MCS buffer clear for MSRT case, clear rect
       *     is required to be scaled by the following factors in the
       *     horizontal and vertical directions:
       *
       * The X and Y scale down factors in the table that follows are each
       * equal to half the alignment value computed above.
       */
      unsigned x_scaledown = x_align / 2;
      unsigned y_scaledown = y_align / 2;
      x0 /= x_scaledown;
      y0 /= y_scaledown;
      x1 /= x_scaledown;
      y1 /= y_scaledown;
   }
}
Example #15
static void
gen7_upload_urb(struct brw_context *brw)
{
   const int push_size_kB =
      (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16;

   /* BRW_NEW_VS_PROG_DATA */
   unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1);
   unsigned vs_entry_size_bytes = vs_size * 64;
   /* BRW_NEW_GEOMETRY_PROGRAM, BRW_NEW_GS_PROG_DATA */
   bool gs_present = brw->geometry_program;
   unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
   unsigned gs_entry_size_bytes = gs_size * 64;

   /* If we're just switching between programs with the same URB requirements,
    * skip the rest of the logic.
    */
   if (!(brw->ctx.NewDriverState & BRW_NEW_CONTEXT) &&
       brw->urb.vsize == vs_size &&
       brw->urb.gs_present == gs_present &&
       brw->urb.gsize == gs_size) {
      return;
   }
   brw->urb.vsize = vs_size;
   brw->urb.gs_present = gs_present;
   brw->urb.gsize = gs_size;

   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
    *
    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
    *     Allocation Size is less than 9 512-bit URB entries.
    *
    * Similar text exists for GS.
    */
   unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
   unsigned gs_granularity = (gs_size < 9) ? 8 : 1;

   /* URB allocations must be done in 8k chunks. */
   unsigned chunk_size_bytes = 8192;

   /* Determine the size of the URB in chunks.
    */
   unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes;

   /* Reserve space for push constants */
   unsigned push_constant_bytes = 1024 * push_size_kB;
   unsigned push_constant_chunks =
      push_constant_bytes / chunk_size_bytes;

   /* Initially, assign each stage the minimum amount of URB space it needs,
    * and make a note of how much additional space it "wants" (the amount of
    * additional space it could actually make use of).
    */

   /* VS has a lower limit on the number of URB entries */
   unsigned vs_chunks =
      ALIGN(brw->urb.min_vs_entries * vs_entry_size_bytes, chunk_size_bytes) /
      chunk_size_bytes;
   unsigned vs_wants =
      ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
            chunk_size_bytes) / chunk_size_bytes - vs_chunks;

   unsigned gs_chunks = 0;
   unsigned gs_wants = 0;
   if (gs_present) {
      /* There are two constraints on the minimum amount of URB space we can
       * allocate:
       *
       * (1) We need room for at least 2 URB entries, since we always operate
       * the GS in DUAL_OBJECT mode.
       *
       * (2) We can't allocate less than nr_gs_entries_granularity.
       */
      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
                        chunk_size_bytes) / chunk_size_bytes;
      gs_wants =
         ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes,
               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
   }

   /* There should always be enough URB space to satisfy the minimum
    * requirements of each stage.
    */
   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
   assert(total_needs <= urb_chunks);

   /* Mete out remaining space (if any) in proportion to "wants". */
   unsigned total_wants = vs_wants + gs_wants;
   unsigned remaining_space = urb_chunks - total_needs;
   if (remaining_space > total_wants)
      remaining_space = total_wants;
   if (remaining_space > 0) {
      unsigned vs_additional = (unsigned)
         roundf(vs_wants * (((float) remaining_space) / total_wants));
      vs_chunks += vs_additional;
      remaining_space -= vs_additional;
      gs_chunks += remaining_space;
   }

   /* Sanity check that we haven't over-allocated. */
   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);

   /* Finally, compute the number of entries that can fit in the space
    * allocated to each stage.
    */
   unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
   unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;

   /* Since we rounded up when computing *_wants, this may be slightly more
    * than the maximum allowed amount, so correct for that.
    */
   nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries);
   nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries);

   /* Ensure that we program a multiple of the granularity. */
   nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
   nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);

   /* Finally, sanity check to make sure we have at least the minimum number
    * of entries needed for each stage.
    */
   assert(nr_vs_entries >= brw->urb.min_vs_entries);
   if (gs_present)
      assert(nr_gs_entries >= 2);

   /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems
    * better to put reasonable data in there rather than leave them
    * uninitialized.
    */
   brw->urb.nr_vs_entries = nr_vs_entries;
   brw->urb.nr_gs_entries = nr_gs_entries;

   /* Lay out the URB in the following order:
    * - push constants
    * - VS
    * - GS
    */
   brw->urb.vs_start = push_constant_chunks;
   brw->urb.gs_start = push_constant_chunks + vs_chunks;

   if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
      gen7_emit_vs_workaround_flush(brw);
   gen7_emit_urb_state(brw,
                       brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
                       brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start);
}
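
The interesting part above is how leftover chunks are handed out in proportion to each stage's "wants". The standalone sketch below reruns that distribution with assumed sizes and limits chosen so the numbers come out even (a 128kB URB, 16kB of push constants, 128-byte VS and GS entries); ALIGN is defined locally for power-of-two alignments.

#include <assert.h>
#include <math.h>
#include <stdio.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

int main(void)
{
   const unsigned chunk = 8192;                     /* URB allocated in 8kB chunks */
   unsigned urb_chunks = 128 * 1024 / chunk;        /* 16 */
   unsigned push_chunks = 16 * 1024 / chunk;        /* 2  */

   unsigned vs_entry = 2 * 64, gs_entry = 2 * 64;   /* bytes per URB entry */
   unsigned min_vs = 32, max_vs = 512, max_gs = 192;   /* assumed limits */
   unsigned granularity = 8;                        /* entries per allocation granule */

   /* Minimum chunks each stage needs, and how many more it could use. */
   unsigned vs_chunks = ALIGN(min_vs * vs_entry, chunk) / chunk;               /* 1 */
   unsigned vs_wants = ALIGN(max_vs * vs_entry, chunk) / chunk - vs_chunks;    /* 7 */
   unsigned gs_chunks = ALIGN(granularity * gs_entry, chunk) / chunk;          /* 1 */
   unsigned gs_wants = ALIGN(max_gs * gs_entry, chunk) / chunk - gs_chunks;    /* 2 */

   unsigned needs = push_chunks + vs_chunks + gs_chunks;                       /* 4 */
   assert(needs <= urb_chunks);

   /* Hand out what is left in proportion to the wants. */
   unsigned wants = vs_wants + gs_wants;                                       /* 9 */
   unsigned remaining = urb_chunks - needs;                                    /* 12 */
   if (remaining > wants)
      remaining = wants;                                                       /* 9 */
   unsigned vs_extra =
      (unsigned) roundf(vs_wants * ((float) remaining / wants));               /* 7 */
   vs_chunks += vs_extra;                                                      /* 8 */
   gs_chunks += remaining - vs_extra;                                          /* 3 */

   printf("VS entries: %u, GS entries: %u\n",
          vs_chunks * chunk / vs_entry, gs_chunks * chunk / gs_entry);         /* 512, 192 */
   return 0;
}
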
Example #16
/**
 * When the GS is not in use, we assign the entire URB space to the VS.  When
 * the GS is in use, we split the URB space evenly between the VS and the GS.
 * This is not ideal, but it's simple.
 *
 *           URB size / 2                   URB size / 2
 *   _____________-______________   _____________-______________
 *  /                            \ /                            \
 * +-------------------------------------------------------------+
 * | Vertex Shader Entries        | Geometry Shader Entries      |
 * +-------------------------------------------------------------+
 *
 * Sandybridge GT1 has 32kB of URB space, while GT2 has 64kB.
 * (See the Sandybridge PRM, Volume 2, Part 1, Section 1.4.7: 3DSTATE_URB.)
 */
static void
gen6_upload_urb( struct brw_context *brw )
{
   struct intel_context *intel = &brw->intel;
   int nr_vs_entries, nr_gs_entries;
   int total_urb_size = brw->urb.size * 1024; /* in bytes */

   /* CACHE_NEW_VS_PROG */
   brw->urb.vs_size = MAX2(brw->vs.prog_data->urb_entry_size, 1);

   /* We use the same VUE layout for VS outputs and GS outputs (as it's what
    * the SF and Clipper expect), so we can simply make the GS URB entry size
    * the same as for the VS.  This may technically be too large in cases
    * where we have few vertex attributes and a lot of varyings, since the VS
    * size is determined by the larger of the two.  For now, it's safe.
    */
   brw->urb.gs_size = brw->urb.vs_size;

   /* Calculate how many entries fit in each stage's section of the URB */
   if (brw->gs.prog_active) {
      nr_vs_entries = (total_urb_size/2) / (brw->urb.vs_size * 128);
      nr_gs_entries = (total_urb_size/2) / (brw->urb.gs_size * 128);
   } else {
      nr_vs_entries = total_urb_size / (brw->urb.vs_size * 128);
      nr_gs_entries = 0;
   }

   /* Then clamp to the maximum allowed by the hardware */
   if (nr_vs_entries > brw->urb.max_vs_entries)
      nr_vs_entries = brw->urb.max_vs_entries;

   if (nr_gs_entries > brw->urb.max_gs_entries)
      nr_gs_entries = brw->urb.max_gs_entries;

   /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
   brw->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);

   assert(brw->urb.nr_vs_entries >= 24);
   assert(brw->urb.nr_vs_entries % 4 == 0);
   assert(brw->urb.nr_gs_entries % 4 == 0);
   assert(brw->urb.vs_size < 5);
   assert(brw->urb.gs_size < 5);

   BEGIN_BATCH(3);
   OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
   OUT_BATCH(((brw->urb.vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) |
	     ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT));
   OUT_BATCH(((brw->urb.gs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) |
	     ((brw->urb.nr_gs_entries) << GEN6_URB_GS_ENTRIES_SHIFT));
   ADVANCE_BATCH();

   /* From the PRM Volume 2 part 1, section 1.4.7:
    *
    *   Because of a urb corruption caused by allocating a previous gsunit’s
    *   urb entry to vsunit software is required to send a "GS NULL
    *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
    *   a dummy DRAW call before any case where VS will be taking over GS URB
    *   space.
    *
    * It is not clear exactly what this means ("URB fence" is a command that
    * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
    * a workaround.
    */
   if (brw->urb.gen6_gs_previously_active && !brw->gs.prog_active)
      intel_batchbuffer_emit_mi_flush(intel);
   brw->urb.gen6_gs_previously_active = brw->gs.prog_active;
}
Example #17
/* The x0, y0, x1, and y1 parameters must already be populated with the render
 * area of the framebuffer to be cleared.
 */
static void
get_fast_clear_rect(const struct isl_device *dev,
                    const struct isl_surf *aux_surf,
                    unsigned *x0, unsigned *y0,
                    unsigned *x1, unsigned *y1)
{
   unsigned int x_align, y_align;
   unsigned int x_scaledown, y_scaledown;

   /* Only single sampled surfaces need to (and actually can) be resolved. */
   if (aux_surf->usage == ISL_SURF_USAGE_CCS_BIT) {
      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
       *
       *     Clear pass must have a clear rectangle that must follow
       *     alignment rules in terms of pixels and lines as shown in the
       *     table below. Further, the clear-rectangle height and width
       *     must be multiple of the following dimensions. If the height
       *     and width of the render target being cleared do not meet these
       *     requirements, an MCS buffer can be created such that it
       *     follows the requirement and covers the RT.
       *
       * The alignment size in the table that follows is related to the
       * alignment size that is baked into the CCS surface format but with X
       * alignment multiplied by 16 and Y alignment multiplied by 32.
       */
      x_align = isl_format_get_layout(aux_surf->format)->bw;
      y_align = isl_format_get_layout(aux_surf->format)->bh;

      x_align *= 16;

      /* SKL+ line alignment requirements for Y-tiled are half those of the prior
       * generations.
       */
      if (dev->info->gen >= 9)
         y_align *= 16;
      else
         y_align *= 32;

      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
       * Target(s)", beneath the "Fast Color Clear" bullet (p327):
       *
       *     In order to optimize the performance MCS buffer (when bound to
       *     1X RT) clear similarly to MCS buffer clear for MSRT case,
       *     clear rect is required to be scaled by the following factors
       *     in the horizontal and vertical directions:
       *
       * The X and Y scale down factors in the table that follows are each
       * equal to half the alignment value computed above.
       */
      x_scaledown = x_align / 2;
      y_scaledown = y_align / 2;

      /* From BSpec: 3D-Media-GPGPU Engine > 3D Pipeline > Pixel > Pixel
       * Backend > MCS Buffer for Render Target(s) [DevIVB+] > Table "Color
       * Clear of Non-MultiSampled Render Target Restrictions":
       *
       *   Clear rectangle must be aligned to two times the number of
       *   pixels in the table shown below due to 16x16 hashing across the
       *   slice.
       */
      x_align *= 2;
      y_align *= 2;
   } else {
      assert(aux_surf->usage == ISL_SURF_USAGE_MCS_BIT);

      /* From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render
       * Target(s)", beneath the "MSAA Compression" bullet (p326):
       *
       *     Clear pass for this case requires that scaled down primitive
       *     is sent down with upper left co-ordinate to coincide with
       *     actual rectangle being cleared. For MSAA, clear rectangle’s
       *     height and width need to as show in the following table in
       *     terms of (width,height) of the RT.
       *
       *     MSAA  Width of Clear Rect  Height of Clear Rect
       *      2X     Ceil(1/8*width)      Ceil(1/2*height)
       *      4X     Ceil(1/8*width)      Ceil(1/2*height)
       *      8X     Ceil(1/2*width)      Ceil(1/2*height)
       *     16X         width            Ceil(1/2*height)
       *
       * The text "with upper left co-ordinate to coincide with actual
       * rectangle being cleared" is a little confusing--it seems to imply
       * that to clear a rectangle from (x,y) to (x+w,y+h), one needs to
       * feed the pipeline using the rectangle (x,y) to
       * (x+Ceil(w/N),y+Ceil(h/2)), where N is either 2 or 8 depending on
       * the number of samples.  Experiments indicate that this is not
       * quite correct; actually, what the hardware appears to do is to
       * align whatever rectangle is sent down the pipeline to the nearest
       * multiple of 2x2 blocks, and then scale it up by a factor of N
       * horizontally and 2 vertically.  So the resulting alignment is 4
       * vertically and either 4 or 16 horizontally, and the scaledown
       * factor is 2 vertically and either 2 or 8 horizontally.
       */
      switch (aux_surf->format) {
      case ISL_FORMAT_MCS_2X:
      case ISL_FORMAT_MCS_4X:
         x_scaledown = 8;
         break;
      case ISL_FORMAT_MCS_8X:
         x_scaledown = 2;
         break;
      case ISL_FORMAT_MCS_16X:
         x_scaledown = 1;
         break;
      default:
         unreachable("Unexpected MCS format for fast clear");
      }
      y_scaledown = 2;
      x_align = x_scaledown * 2;
      y_align = y_scaledown * 2;
   }
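
   /* For illustration (assumed numbers, not from the PRM): with
    * ISL_FORMAT_MCS_8X we get x_scaledown = 2, y_scaledown = 2 and
    * x_align = y_align = 4, so a clear rectangle of
    * (x0, y0, x1, y1) = (5, 3, 13, 10) is first snapped outward to
    * (4, 0, 16, 12) and then scaled down to (2, 0, 8, 6) below.
    */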

   *x0 = ROUND_DOWN_TO(*x0, x_align) / x_scaledown;
   *y0 = ROUND_DOWN_TO(*y0, y_align) / y_scaledown;
   *x1 = ALIGN(*x1, x_align) / x_scaledown;
   *y1 = ALIGN(*y1, y_align) / y_scaledown;
}
Example #18
0
/**
 * When the GS is not in use, we assign the entire URB space to the VS.  When
 * the GS is in use, we split the URB space evenly between the VS and the GS.
 * This is not ideal, but it's simple.
 *
 *           URB size / 2                   URB size / 2
 *   _____________-______________   _____________-______________
 *  /                            \ /                            \
 * +-------------------------------------------------------------+
 * | Vertex Shader Entries        | Geometry Shader Entries      |
 * +-------------------------------------------------------------+
 *
 * Sandybridge GT1 has 32kB of URB space, while GT2 has 64kB.
 * (See the Sandybridge PRM, Volume 2, Part 1, Section 1.4.7: 3DSTATE_URB.)
 */
static void
gen6_upload_urb(struct brw_context *brw)
{
    int nr_vs_entries, nr_gs_entries;
    int total_urb_size = brw->urb.size * 1024; /* in bytes */

    bool gs_present = brw->ff_gs.prog_active || brw->geometry_program;

    /* BRW_NEW_VS_PROG_DATA */
    unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1);

    /* When using the GS only for transform feedback, we use the same VUE
     * layout for VS outputs and GS outputs (as that is what the SF and Clipper
     * expect), so we can simply make the GS URB entry size the same as for the
     * VS.  This may technically be too large in cases where we have few vertex
     * attributes and a lot of varyings, since the VS size is determined by the
     * larger of the two.  For now, it's safe.
     *
     * For user-provided GS the assumption above does not hold since the GS
     * outputs can be different from the VS outputs.
     */
    unsigned gs_size = vs_size;
    if (brw->geometry_program) {
        gs_size = brw->gs.prog_data->base.urb_entry_size;
        assert(gs_size >= 1);
    }

    /* Calculate how many entries fit in each stage's section of the URB */
    if (gs_present) {
        nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
        nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
    } else {
        nr_vs_entries = total_urb_size / (vs_size * 128);
        nr_gs_entries = 0;
    }
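
    /* As a rough illustration (assumed numbers, not taken from a specific
     * part): with a 32 kB URB, vs_size == gs_size == 2 (256 bytes per entry)
     * and a GS present, each stage gets 16384 bytes, i.e. 64 entries, before
     * the hardware clamps below are applied.
     */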

    /* Then clamp to the maximum allowed by the hardware */
    if (nr_vs_entries > brw->urb.max_vs_entries)
        nr_vs_entries = brw->urb.max_vs_entries;

    if (nr_gs_entries > brw->urb.max_gs_entries)
        nr_gs_entries = brw->urb.max_gs_entries;

    /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
    brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
    brw->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);

    assert(brw->urb.nr_vs_entries >= brw->urb.min_vs_entries);
    assert(brw->urb.nr_vs_entries % 4 == 0);
    assert(brw->urb.nr_gs_entries % 4 == 0);
    assert(vs_size <= 5);
    assert(gs_size <= 5);

    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
    OUT_BATCH(((vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) |
              ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT));
    OUT_BATCH(((gs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) |
              ((brw->urb.nr_gs_entries) << GEN6_URB_GS_ENTRIES_SHIFT));
    ADVANCE_BATCH();
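
    /* For illustration (hypothetical values): with vs_size == 2 and
     * brw->urb.nr_vs_entries == 64, the packet above programs a VS size field
     * of 1 (the field holds size - 1) and a VS entry count of 64.
     */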

    /* From the PRM Volume 2 part 1, section 1.4.7:
     *
     *   Because of a urb corruption caused by allocating a previous gsunit’s
     *   urb entry to vsunit software is required to send a "GS NULL
     *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
     *   a dummy DRAW call before any case where VS will be taking over GS URB
     *   space.
     *
     * It is not clear exactly what this means ("URB fence" is a command that
     * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
     * a workaround.
     */
    if (brw->urb.gs_present && !gs_present)
        brw_emit_mi_flush(brw);
    brw->urb.gs_present = gs_present;
}
Example #19
0
/**
 * Decide how to partition the URB among the various stages.
 *
 * \param[in] push_constant_bytes - space allocated for push constants.
 * \param[in] urb_size_bytes - total size of the URB (from L3 config).
 * \param[in] tess_present - are tessellation shaders active?
 * \param[in] gs_present - are geometry shaders active?
 * \param[in] entry_size - the URB entry size (from the shader compiler)
 * \param[out] entries - the number of URB entries for each stage
 * \param[out] start - the starting offset for each stage
 */
void
gen_get_urb_config(const struct gen_device_info *devinfo,
                   unsigned push_constant_bytes, unsigned urb_size_bytes,
                   bool tess_present, bool gs_present,
                   const unsigned entry_size[4],
                   unsigned entries[4], unsigned start[4])
{
   const bool active[4] = { true, tess_present, tess_present, gs_present };

   /* URB allocations must be done in 8k chunks. */
   const unsigned chunk_size_bytes = 8192;

   const unsigned push_constant_chunks =
      push_constant_bytes / chunk_size_bytes;
   const unsigned urb_chunks = urb_size_bytes / chunk_size_bytes;
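
   /* For example (assumed values): 16 kB of push constants and a 192 kB URB
    * give push_constant_chunks == 2 and urb_chunks == 24.
    */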

   /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
    *
    *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
    *     Allocation Size is less than 9 512-bit URB entries.
    *
    * Similar text exists for HS, DS and GS.
    */
   unsigned granularity[4];
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      granularity[i] = (entry_size[i] < 9) ? 8 : 1;
   }
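
   /* E.g. an entry size of 4 (in 512-bit units) gives a granularity of 8,
    * while an entry size of 10 gives a granularity of 1.
    */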

   unsigned min_entries[4] = {
      /* VS has a lower limit on the number of URB entries.
       *
       * From the Broadwell PRM, 3DSTATE_URB_VS instruction:
       * "When tessellation is enabled, the VS Number of URB Entries must be
       *  greater than or equal to 192."
       */
      [MESA_SHADER_VERTEX] = tess_present && devinfo->gen == 8 ?
         192 : devinfo->urb.min_entries[MESA_SHADER_VERTEX],

      /* There are two constraints on the minimum amount of URB space we can
       * allocate:
       *
       * (1) We need room for at least 2 URB entries, since we always operate
       * the GS in DUAL_OBJECT mode.
       *
       * (2) We can't allocate fewer entries than the GS granularity computed
       * above.
       */
      [MESA_SHADER_GEOMETRY] = gs_present ? 2 : 0,

      [MESA_SHADER_TESS_CTRL] = tess_present ? 1 : 0,

      [MESA_SHADER_TESS_EVAL] = tess_present ?
         devinfo->urb.min_entries[MESA_SHADER_TESS_EVAL] : 0,
   };

   /* The minimum number of VS entries isn't a multiple of 8 on
    * Cherryview/Broxton, so round all of the minimums up to their granularity.
    */
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      min_entries[i] = ALIGN(min_entries[i], granularity[i]);
   }
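
   /* For instance, a hypothetical minimum of 34 VS entries with a granularity
    * of 8 becomes ALIGN(34, 8) == 40.
    */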

   unsigned entry_size_bytes[4];
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      entry_size_bytes[i] = 64 * entry_size[i];
   }
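
   /* The entry_size[] values from the compiler are in 512-bit (64-byte) units,
    * so e.g. an entry_size of 4 corresponds to 256 bytes per URB entry.
    */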

   /* Initially, assign each stage the minimum amount of URB space it needs,
    * and make a note of how much additional space it "wants" (the amount of
    * additional space it could actually make use of).
    */
   unsigned chunks[4];
   unsigned wants[4];
   unsigned total_needs = push_constant_chunks;
   unsigned total_wants = 0;

   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      if (active[i]) {
         chunks[i] = DIV_ROUND_UP(min_entries[i] * entry_size_bytes[i],
                                  chunk_size_bytes);

         wants[i] =
            DIV_ROUND_UP(devinfo->urb.max_entries[i] * entry_size_bytes[i],
                         chunk_size_bytes) - chunks[i];
      } else {
         chunks[i] = 0;
         wants[i] = 0;
      }

      total_needs += chunks[i];
      total_wants += wants[i];
   }

   assert(total_needs <= urb_chunks);

   /* Mete out remaining space (if any) in proportion to "wants". */
   unsigned remaining_space = MIN2(urb_chunks - total_needs, total_wants);

   if (remaining_space > 0) {
      for (int i = MESA_SHADER_VERTEX;
           total_wants > 0 && i <= MESA_SHADER_TESS_EVAL; i++) {
         unsigned additional = (unsigned)
            roundf(wants[i] * (((float) remaining_space) / total_wants));
         chunks[i] += additional;
         remaining_space -= additional;
         total_wants -= wants[i];
      }

      chunks[MESA_SHADER_GEOMETRY] += remaining_space;
   }
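
   /* Worked example (assumed numbers): remaining_space == 6 with
    * wants == { 6, 0, 0, 3 } gives the VS roundf(6 * 6/9) == 4 extra chunks;
    * the 2 chunks left over then go to the GS, roughly matching the 6:3 ratio
    * of the wants.
    */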

   /* Sanity check that we haven't over-allocated. */
   unsigned total_chunks = push_constant_chunks;
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      total_chunks += chunks[i];
   }
   assert(total_chunks <= urb_chunks);

   /* Finally, compute the number of entries that can fit in the space
    * allocated to each stage.
    */
   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
      entries[i] = chunks[i] * chunk_size_bytes / entry_size_bytes[i];

      /* Since we rounded up when computing wants[], this may be slightly
       * more than the maximum allowed amount, so correct for that.
       */
      entries[i] = MIN2(entries[i], devinfo->urb.max_entries[i]);

      /* Ensure that we program a multiple of the granularity. */
      entries[i] = ROUND_DOWN_TO(entries[i], granularity[i]);

      /* Finally, sanity check to make sure we have at least the minimum
       * number of entries needed for each stage.
       */
      assert(entries[i] >= min_entries[i]);
   }
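
   /* For illustration (assumed numbers): 12 chunks allocated to the VS with
    * 256-byte entries gives 12 * 8192 / 256 == 384 entries, which is then
    * clamped to the device maximum and rounded down to a multiple of the
    * granularity.
    */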

   /* Lay out the URB in pipeline order: push constants, VS, HS, DS, GS. */
   start[0] = push_constant_chunks;
   for (int i = MESA_SHADER_TESS_CTRL; i <= MESA_SHADER_GEOMETRY; i++) {
      start[i] = start[i - 1] + chunks[i - 1];
   }
}
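
A minimal usage sketch follows; the numbers and the surrounding variables
(such as devinfo) are assumptions for illustration, not taken from a real
caller.

/* Hypothetical example: 16 kB of push constants, a 192 kB URB, a VS-only
 * pipeline, and entry sizes in 512-bit units as reported by the compiler.
 */
const unsigned entry_size[4] = { 4, 1, 1, 1 };
unsigned entries[4], start[4];

gen_get_urb_config(devinfo, 16 * 1024, 192 * 1024,
                   false /* tess_present */, false /* gs_present */,
                   entry_size, entries, start);

/* entries[MESA_SHADER_VERTEX] and start[MESA_SHADER_VERTEX] can now be
 * programmed into the corresponding 3DSTATE_URB_VS packet.
 */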