Example #1
/**
 * Replace data in a subrange of buffer object.  If the data range
 * specified by size + offset extends beyond the end of the buffer or
 * if data is NULL, no copy is performed.
 * Called via glBufferSubDataARB().
 */
static void
intel_bufferobj_subdata(struct gl_context * ctx,
                        GLintptrARB offset,
                        GLsizeiptrARB size,
                        const GLvoid * data, struct gl_buffer_object *obj)
{
   struct intel_context *intel = intel_context(ctx);
   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
   bool busy;

   if (size == 0)
      return;

   assert(intel_obj);

   /* If we have a single copy in system memory, update that */
   if (intel_obj->sys_buffer) {
      if (intel_obj->source)
	 release_buffer(intel_obj);

      if (intel_obj->buffer == NULL) {
	 memcpy((char *)intel_obj->sys_buffer + offset, data, size);
	 return;
      }

      free(intel_obj->sys_buffer);
      intel_obj->sys_buffer = NULL;
   }

   /* Otherwise we need to update the copy in video memory. */
   busy =
      drm_intel_bo_busy(intel_obj->buffer) ||
      drm_intel_bo_references(intel->batch.bo, intel_obj->buffer);

   if (busy) {
      if (size == intel_obj->Base.Size) {
	 /* Replace the current busy bo with fresh data. */
	 drm_intel_bo_unreference(intel_obj->buffer);
	 intel_bufferobj_alloc_buffer(intel, intel_obj);
	 drm_intel_bo_subdata(intel_obj->buffer, 0, size, data);
      } else {
         perf_debug("Using a blit copy to avoid stalling on %ldb "
                    "glBufferSubData() to a busy buffer object.\n",
                    (long)size);
	 drm_intel_bo *temp_bo =
	    drm_intel_bo_alloc(intel->bufmgr, "subdata temp", size, 64);

	 drm_intel_bo_subdata(temp_bo, 0, size, data);

	 intel_emit_linear_blit(intel,
				intel_obj->buffer, offset,
				temp_bo, 0,
				size);

	 drm_intel_bo_unreference(temp_bo);
      }
   } else {
      drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
   }
}
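A minimal application-side sketch of the situation the driver path above optimizes: glBufferSubData() on a buffer that previously queued draw calls may still be reading. The buffer name, sizes and draw call are illustrative, and it assumes Mesa's GL headers with GL_GLEXT_PROTOTYPES defined; without the replace/blit paths above, every such update would stall until the GPU finished with the buffer.

#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
#include <GL/glext.h>

static void
stream_vertices(GLuint vbo, const void *verts, GLsizeiptr size)
{
   glBindBuffer(GL_ARRAY_BUFFER, vbo);
   /* Draws issued earlier may still reference this range. */
   glBufferSubData(GL_ARRAY_BUFFER, 0, size, verts);
   glDrawArrays(GL_TRIANGLES, 0, 3);   /* queues more GPU work against the buffer */
}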
Example #2
static boolean
i915_drm_buffer_is_busy(struct i915_winsys *iws,
                        struct i915_winsys_buffer *buffer)
{
   struct i915_drm_buffer* i915_buffer = i915_drm_buffer(buffer);
   if (!i915_buffer)
      return FALSE;
   return drm_intel_bo_busy(i915_buffer->bo);
}
Example #3
static void intel_check_sync(GLcontext *ctx, struct gl_sync_object *s)
{
   struct intel_sync_object *sync = (struct intel_sync_object *)s;

   if (sync->bo && !drm_intel_bo_busy(sync->bo)) {
      drm_intel_bo_unreference(sync->bo);
      sync->bo = NULL;
      s->StatusFlag = 1;
   }
}
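A hedged sketch of the non-blocking status poll a driver check like the one above can answer without blocking: the application creates a fence after submitting work and queries GL_SYNC_STATUS without waiting. The helper name is illustrative, error handling is omitted, and it assumes GL 3.2-level headers.

#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
#include <GL/glext.h>
#include <stdbool.h>

static bool
work_finished(GLsync fence)
{
   GLint status = GL_UNSIGNALED;

   /* Non-blocking: returns the current status instead of waiting on the fence. */
   glGetSynciv(fence, GL_SYNC_STATUS, 1, NULL, &status);
   return status == GL_SIGNALED;
}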
Example #4
static void
intelTexImage(struct gl_context * ctx,
              GLuint dims,
              struct gl_texture_image *texImage,
              GLenum format, GLenum type, const void *pixels,
              const struct gl_pixelstore_attrib *unpack)
{
   struct intel_texture_image *intelImage = intel_texture_image(texImage);
   bool ok;

   bool tex_busy = intelImage->mt && drm_intel_bo_busy(intelImage->mt->bo);

   DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
       __func__, _mesa_get_format_name(texImage->TexFormat),
       _mesa_enum_to_string(texImage->TexObject->Target),
       _mesa_enum_to_string(format), _mesa_enum_to_string(type),
       texImage->Level, texImage->Width, texImage->Height, texImage->Depth);

   /* Allocate storage for texture data. */
   if (!ctx->Driver.AllocTextureImageBuffer(ctx, texImage)) {
      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage%uD", dims);
      return;
   }

   assert(intelImage->mt);

   if (intelImage->mt->format == MESA_FORMAT_S_UINT8)
      intelImage->mt->r8stencil_needs_update = true;

   ok = _mesa_meta_pbo_TexSubImage(ctx, dims, texImage, 0, 0, 0,
                                   texImage->Width, texImage->Height,
                                   texImage->Depth,
                                   format, type, pixels,
                                   tex_busy, unpack);
   if (ok)
      return;

   ok = intel_texsubimage_tiled_memcpy(ctx, dims, texImage,
                                       0, 0, 0, /*x,y,z offsets*/
                                       texImage->Width,
                                       texImage->Height,
                                       texImage->Depth,
                                       format, type, pixels, unpack,
                                       false /*allocate_storage*/);
   if (ok)
      return;

   DBG("%s: upload image %dx%dx%d pixels %p\n",
       __func__, texImage->Width, texImage->Height, texImage->Depth,
       pixels);

   _mesa_store_teximage(ctx, dims, texImage,
                        format, type, pixels, unpack);
}
Example #5
static int
i915_drm_fence_signalled(struct i915_winsys *iws,
                          struct pipe_fence_handle *fence)
{
   struct i915_drm_fence *f = (struct i915_drm_fence *)fence;

   /* fence already expired */
   if (!f->bo)
	   return 1;

   return !drm_intel_bo_busy(f->bo);
}
Example #6
int
brw_bo_map_gtt(struct brw_context *brw, drm_intel_bo *bo, const char *bo_name)
{
   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
      return drm_intel_gem_bo_map_gtt(bo);

   float start_time = get_time();

   int ret = drm_intel_gem_bo_map_gtt(bo);

   perf_debug("GTT mapping a busy %s BO stalled and took %.03f ms.\n",
              bo_name, (get_time() - start_time) * 1000);

   return ret;
}
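A hypothetical caller of the wrapper above (the helper name, BO label and arguments are illustrative; assumes the surrounding driver headers): map a possibly busy BO through the GTT, write into it, and unmap. When the BO is busy and perf_debug is enabled, the wrapper reports how long the resulting stall took.

static void
fill_bo_through_gtt(struct brw_context *brw, drm_intel_bo *bo,
                    const void *data, size_t size)
{
   if (brw_bo_map_gtt(brw, bo, "example data") != 0)
      return;                          /* mapping failed */

   memcpy(bo->virtual, data, size);    /* write through the write-combined GTT view */
   drm_intel_bo_unmap(bo);
}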
Example #7
static bool
brw_fence_has_completed(struct brw_fence *fence)
{
   if (fence->signalled)
      return true;

   if (fence->batch_bo && !drm_intel_bo_busy(fence->batch_bo)) {
      drm_intel_bo_unreference(fence->batch_bo);
      fence->batch_bo = NULL;
      fence->signalled = true;
      return true;
   }

   return false;
}
Example #8
/**
 * Map a buffer object; issue performance warnings if mapping causes stalls.
 *
 * This matches the drm_intel_bo_map API, but takes an additional human-readable
 * name for the buffer object to use in the performance debug message.
 */
int
brw_bo_map(struct brw_context *brw,
           drm_intel_bo *bo, int write_enable,
           const char *bo_name)
{
   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
      return drm_intel_bo_map(bo, write_enable);

   double start_time = get_time();

   int ret = drm_intel_bo_map(bo, write_enable);

   perf_debug("CPU mapping a busy %s BO stalled and took %.03f ms.\n",
              bo_name, (get_time() - start_time) * 1000);

   return ret;
}
Example #9
void *
intel_miptree_map_raw(struct intel_context *intel, struct intel_mipmap_tree *mt)
{
    drm_intel_bo *bo = mt->region->bo;

    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
        if (drm_intel_bo_busy(bo)) {
            perf_debug("Mapping a busy BO, causing a stall on the GPU.\n");
        }
    }

    intel_flush(&intel->ctx);

    if (mt->region->tiling != I915_TILING_NONE)
        drm_intel_gem_bo_map_gtt(bo);
    else
        drm_intel_bo_map(bo, true);

    return bo->virtual;
}
Example #10
/**
 * Replace data in a subrange of buffer object.  If the data range
 * specified by size + offset extends beyond the end of the buffer or
 * if data is NULL, no copy is performed.
 * Called via glBufferSubDataARB().
 */
static void
intel_bufferobj_subdata(GLcontext * ctx,
                        GLenum target,
                        GLintptrARB offset,
                        GLsizeiptrARB size,
                        const GLvoid * data, struct gl_buffer_object *obj)
{
   struct intel_context *intel = intel_context(ctx);
   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);

   if (size == 0)
      return;

   assert(intel_obj);

   if (intel_obj->region)
      intel_bufferobj_cow(intel, intel_obj);

   if (intel_obj->sys_buffer)
      memcpy((char *)intel_obj->sys_buffer + offset, data, size);
   else {
      /* Flush any existing batchbuffer that might reference this data. */
      if (drm_intel_bo_busy(intel_obj->buffer) ||
	  drm_intel_bo_references(intel->batch->buf, intel_obj->buffer)) {
	 drm_intel_bo *temp_bo;

	 temp_bo = drm_intel_bo_alloc(intel->bufmgr, "subdata temp", size, 64);

	 drm_intel_bo_subdata(temp_bo, 0, size, data);

	 intel_emit_linear_blit(intel,
				intel_obj->buffer, offset,
				temp_bo, 0,
				size);

	 drm_intel_bo_unreference(temp_bo);
      } else {
	 drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
      }
   }
}
Example #11
bool
brw_codegen_vs_prog(struct brw_context *brw,
                    struct gl_shader_program *prog,
                    struct brw_vertex_program *vp,
                    struct brw_vs_prog_key *key)
{
   GLuint program_size;
   const GLuint *program;
   struct brw_vs_prog_data prog_data;
   struct brw_stage_prog_data *stage_prog_data = &prog_data.base.base;
   void *mem_ctx;
   int i;
   struct brw_shader *vs = NULL;
   bool start_busy = false;
   double start_time = 0;

   if (prog)
      vs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];

   memset(&prog_data, 0, sizeof(prog_data));

   /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
   if (!prog)
      stage_prog_data->use_alt_mode = true;

   mem_ctx = ralloc_context(NULL);

   brw_assign_common_binding_table_offsets(MESA_SHADER_VERTEX,
                                           brw->intelScreen->devinfo,
                                           prog, &vp->program.Base,
                                           &prog_data.base.base, 0);

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count = vp->program.Base.nir->num_uniforms;
   if (!brw->intelScreen->compiler->scalar_vs)
      param_count *= 4;

   if (vs)
      prog_data.base.base.nr_image_params = vs->base.NumImages;

   /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
    * planes as uniforms.
    */
   param_count += key->nr_userclip_plane_consts * 4;

   stage_prog_data->param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   stage_prog_data->pull_param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   stage_prog_data->image_param =
      rzalloc_array(NULL, struct brw_image_param,
                    stage_prog_data->nr_image_params);
   stage_prog_data->nr_params = param_count;

   if (prog) {
      brw_nir_setup_glsl_uniforms(vp->program.Base.nir, prog, &vp->program.Base,
                                  &prog_data.base.base,
                                  brw->intelScreen->compiler->scalar_vs);
   } else {
      brw_nir_setup_arb_uniforms(vp->program.Base.nir, &vp->program.Base,
                                 &prog_data.base.base);
   }

   GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
   prog_data.inputs_read = vp->program.Base.InputsRead;

   if (key->copy_edgeflag) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
      prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
   }

   if (brw->gen < 6) {
      /* Put dummy slots into the VUE for the SF to put the replaced
       * point sprite coords in.  We shouldn't need these dummy slots,
       * which take up precious URB space, but it would mean that the SF
       * doesn't get nice aligned pairs of input coords into output
       * coords, which would be a pain to handle.
       */
      for (i = 0; i < 8; i++) {
         if (key->point_coord_replace & (1 << i))
            outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
      }

      /* if back colors are written, allocate slots for front colors too */
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0))
         outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0);
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1))
         outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1);
   }

   /* In order for legacy clipping to work, we need to populate the clip
    * distance varying slots whenever clipping is enabled, even if the vertex
    * shader doesn't write to gl_ClipDistance.
    */
   if (key->nr_userclip_plane_consts > 0) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
   }

   brw_compute_vue_map(brw->intelScreen->devinfo,
                       &prog_data.base.vue_map, outputs_written,
                       prog ? prog->SeparateShader : false);

   if (0) {
      _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
			       true);
   }

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   if (unlikely(INTEL_DEBUG & DEBUG_VS))
      brw_dump_ir("vertex", prog, vs ? &vs->base : NULL, &vp->program.Base);

   int st_index = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      st_index = brw_get_shader_time_index(brw, prog, &vp->program.Base, ST_VS);

   /* Emit GEN4 code.
    */
   char *error_str;
   program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx, key,
                            &prog_data, vp->program.Base.nir,
                            brw_select_clip_planes(&brw->ctx),
                            !_mesa_is_gles3(&brw->ctx),
                            st_index, &program_size, &error_str);
   if (program == NULL) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, error_str);
      }

      _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", error_str);

      ralloc_free(mem_ctx);
      return false;
   }

   if (unlikely(brw->perf_debug) && vs) {
      if (vs->compiled_once) {
         brw_vs_debug_recompile(brw, prog, key);
      }
      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("VS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      vs->compiled_once = true;
   }

   /* Scratch space is used for register spilling */
   if (prog_data.base.base.total_scratch) {
      brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo,
			 prog_data.base.base.total_scratch *
                         brw->max_vs_threads);
   }

   brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG,
		    key, sizeof(struct brw_vs_prog_key),
		    program, program_size,
		    &prog_data, sizeof(prog_data),
		    &brw->vs.base.prog_offset, &brw->vs.prog_data);
   ralloc_free(mem_ctx);

   return true;
}
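The perf_debug bookkeeping above (and in the GS/FS/CS codegen paths below) follows one pattern: sample drm_intel_bo_busy() on the last batch BO before a long CPU-side compile, and report a stall only if the GPU drained while the CPU was busy. A distilled sketch of just that pattern, with a placeholder compile callback, assuming the driver's usual helpers (get_time(), perf_debug(), unlikely()):

static void
time_job_against_gpu(struct brw_context *brw,
                     void (*compile_fn)(void *), void *job)
{
   bool start_busy = false;
   double start_time = 0;

   if (unlikely(brw->perf_debug)) {
      start_busy = brw->batch.last_bo &&
                   drm_intel_bo_busy(brw->batch.last_bo);
      start_time = get_time();
   }

   compile_fn(job);   /* the expensive CPU-side work being measured */

   if (unlikely(brw->perf_debug) && start_busy &&
       !drm_intel_bo_busy(brw->batch.last_bo)) {
      perf_debug("compile took %.03f ms and stalled the GPU\n",
                 (get_time() - start_time) * 1000);
   }
}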
Example #12
/**
 * Called via glMapBufferRange().
 *
 * The goal of this extension is to allow apps to accumulate their rendering
 * at the same time as they accumulate their buffer object.  Without it,
 * you'd end up blocking on execution of rendering every time you mapped
 * the buffer to put new data in.
 *
 * We support it in 3 ways: If unsynchronized, then don't bother
 * flushing the batchbuffer before mapping the buffer, which can save blocking
 * in many cases.  If we would still block, and they allow the whole buffer
 * to be invalidated, then just allocate a new buffer to replace the old one.
 * If not, and we'd block, and they allow the subrange of the buffer to be
 * invalidated, then we can make a new little BO, let them write into that,
 * and blit it into the real BO at unmap time.
 */
static void *
intel_bufferobj_map_range(GLcontext * ctx,
			  GLenum target, GLintptr offset, GLsizeiptr length,
			  GLbitfield access, struct gl_buffer_object *obj)
{
   struct intel_context *intel = intel_context(ctx);
   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);

   assert(intel_obj);

   /* _mesa_MapBufferRange (GL entrypoint) sets these, but the vbo module also
    * internally uses our functions directly.
    */
   obj->Offset = offset;
   obj->Length = length;
   obj->AccessFlags = access;

   if (intel_obj->sys_buffer) {
      obj->Pointer = intel_obj->sys_buffer + offset;
      return obj->Pointer;
   }

   if (intel_obj->region)
      intel_bufferobj_cow(intel, intel_obj);

   /* If the mapping is synchronized with other GL operations, flush
    * the batchbuffer so that GEM knows about the buffer access for later
    * syncing.
    */
   if (!(access & GL_MAP_UNSYNCHRONIZED_BIT) &&
       drm_intel_bo_references(intel->batch->buf, intel_obj->buffer))
      intelFlush(ctx);

   if (intel_obj->buffer == NULL) {
      obj->Pointer = NULL;
      return NULL;
   }

   /* If the user doesn't care about existing buffer contents and mapping
    * would cause us to block, then throw out the old buffer.
    */
   if (!(access & GL_MAP_UNSYNCHRONIZED_BIT) &&
       (access & GL_MAP_INVALIDATE_BUFFER_BIT) &&
       drm_intel_bo_busy(intel_obj->buffer)) {
      drm_intel_bo_unreference(intel_obj->buffer);
      intel_obj->buffer = dri_bo_alloc(intel->bufmgr, "bufferobj",
				       intel_obj->Base.Size, 64);
   }

   /* If the user is mapping a range of an active buffer object but
    * doesn't require the current contents of that range, make a new
    * BO, and we'll copy what they put in there out at unmap or
    * FlushRange time.
    */
   if ((access & GL_MAP_INVALIDATE_RANGE_BIT) &&
       drm_intel_bo_busy(intel_obj->buffer)) {
      if (access & GL_MAP_FLUSH_EXPLICIT_BIT) {
	 intel_obj->range_map_buffer = _mesa_malloc(length);
	 obj->Pointer = intel_obj->range_map_buffer;
      } else {
	 intel_obj->range_map_bo = drm_intel_bo_alloc(intel->bufmgr,
						      "range map",
						      length, 64);
	 if (!(access & GL_MAP_READ_BIT) &&
	     intel->intelScreen->kernel_exec_fencing) {
	    drm_intel_gem_bo_map_gtt(intel_obj->range_map_bo);
	    intel_obj->mapped_gtt = GL_TRUE;
	 } else {
	    drm_intel_bo_map(intel_obj->range_map_bo,
			     (access & GL_MAP_WRITE_BIT) != 0);
	    intel_obj->mapped_gtt = GL_FALSE;
	 }
	 obj->Pointer = intel_obj->range_map_bo->virtual;
      }
      return obj->Pointer;
   }

   if (!(access & GL_MAP_READ_BIT) &&
       intel->intelScreen->kernel_exec_fencing) {
      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
      intel_obj->mapped_gtt = GL_TRUE;
   } else {
      drm_intel_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
      intel_obj->mapped_gtt = GL_FALSE;
   }

   obj->Pointer = intel_obj->buffer->virtual + offset;
   return obj->Pointer;
}
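An application-side sketch of the mapping flags the comment above is written around (buffer name, offsets and sizes are illustrative; assumes Mesa's GL headers with GL_GLEXT_PROTOTYPES defined). GL_MAP_UNSYNCHRONIZED_BIT skips the batch flush, and GL_MAP_INVALIDATE_RANGE_BIT lets the driver hand back a small scratch BO instead of stalling on the busy one.

#define GL_GLEXT_PROTOTYPES
#include <GL/gl.h>
#include <GL/glext.h>
#include <string.h>

static void
append_vertices(GLuint vbo, GLintptr offset, const void *verts, GLsizeiptr len)
{
   glBindBuffer(GL_ARRAY_BUFFER, vbo);

   void *dst = glMapBufferRange(GL_ARRAY_BUFFER, offset, len,
                                GL_MAP_WRITE_BIT |
                                GL_MAP_INVALIDATE_RANGE_BIT |  /* old contents not needed */
                                GL_MAP_UNSYNCHRONIZED_BIT);    /* don't wait for the GPU  */
   if (dst) {
      memcpy(dst, verts, len);
      glUnmapBuffer(GL_ARRAY_BUFFER);
   }
}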
Example #13
bool
brw_codegen_gs_prog(struct brw_context *brw,
                    struct gl_shader_program *prog,
                    struct brw_geometry_program *gp,
                    struct brw_gs_prog_key *key)
{
   struct brw_compiler *compiler = brw->intelScreen->compiler;
   struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
   struct brw_stage_state *stage_state = &brw->gs.base;
   struct brw_gs_prog_data prog_data;
   bool start_busy = false;
   double start_time = 0;

   memset(&prog_data, 0, sizeof(prog_data));

   assign_gs_binding_table_offsets(brw->intelScreen->devinfo, prog,
                                   &gp->program.Base, &prog_data);

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    *
    * Note: param_count needs to be num_uniform_components * 4, since we add
    * padding around uniform values below vec4 size, so the worst case is that
    * every uniform is a float which gets padded to the size of a vec4.
    */
   struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
   struct brw_shader *bgs = (struct brw_shader *) gs;
   int param_count = gp->program.Base.nir->num_uniforms;
   if (!compiler->scalar_stage[MESA_SHADER_GEOMETRY])
      param_count *= 4;

   prog_data.base.base.param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   prog_data.base.base.pull_param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   prog_data.base.base.image_param =
      rzalloc_array(NULL, struct brw_image_param, gs->NumImages);
   prog_data.base.base.nr_params = param_count;
   prog_data.base.base.nr_image_params = gs->NumImages;

   brw_nir_setup_glsl_uniforms(gp->program.Base.nir, prog, &gp->program.Base,
                               &prog_data.base.base,
                               compiler->scalar_stage[MESA_SHADER_GEOMETRY]);

   GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;

   brw_compute_vue_map(brw->intelScreen->devinfo,
                       &prog_data.base.vue_map, outputs_written,
                       prog ? prog->SeparateShader : false);

   if (unlikely(INTEL_DEBUG & DEBUG_GS))
      brw_dump_ir("geometry", prog, gs, NULL);

   int st_index = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      st_index = brw_get_shader_time_index(brw, prog, NULL, ST_GS);

   if (unlikely(brw->perf_debug)) {
      start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo);
      start_time = get_time();
   }

   void *mem_ctx = ralloc_context(NULL);
   unsigned program_size;
   char *error_str;
   const unsigned *program =
      brw_compile_gs(brw->intelScreen->compiler, brw, mem_ctx, key,
                     &prog_data, shader->Program->nir, prog,
                     st_index, &program_size, &error_str);
   if (program == NULL) {
      ralloc_free(mem_ctx);
      return false;
   }

   if (unlikely(brw->perf_debug)) {
      if (bgs->compiled_once) {
         brw_gs_debug_recompile(brw, prog, key);
      }
      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("GS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      bgs->compiled_once = true;
   }

   /* Scratch space is used for register spilling */
   if (prog_data.base.base.total_scratch) {
      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
			 prog_data.base.base.total_scratch *
                         brw->max_gs_threads);
   }

   brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG,
                    key, sizeof(*key),
                    program, program_size,
                    &prog_data, sizeof(prog_data),
                    &stage_state->prog_offset, &brw->gs.prog_data);
   ralloc_free(mem_ctx);

   return true;
}
Example #14
/**
 * All Mesa program -> GPU code generation goes through this function.
 * Depending on the instructions used (i.e. flow control instructions)
 * we'll use one of two code generators.
 */
bool
brw_codegen_wm_prog(struct brw_context *brw,
                    struct gl_shader_program *prog,
                    struct brw_fragment_program *fp,
                    struct brw_wm_prog_key *key)
{
   struct gl_context *ctx = &brw->ctx;
   void *mem_ctx = ralloc_context(NULL);
   struct brw_wm_prog_data prog_data;
   const GLuint *program;
   struct brw_shader *fs = NULL;
   GLuint program_size;
   bool start_busy = false;
   double start_time = 0;

   if (prog)
      fs = (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   memset(&prog_data, 0, sizeof(prog_data));

   /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
   if (!prog)
      prog_data.base.use_alt_mode = true;

   assign_fs_binding_table_offsets(brw->intelScreen->devinfo, prog,
                                   &fp->program.Base, key, &prog_data);

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count = fp->program.Base.nir->num_uniforms;
   if (fs)
      prog_data.base.nr_image_params = fs->base.NumImages;
   /* The backend also sometimes adds params for texture size. */
   param_count += 2 * ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits;
   prog_data.base.param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   prog_data.base.pull_param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   prog_data.base.image_param =
      rzalloc_array(NULL, struct brw_image_param,
                    prog_data.base.nr_image_params);
   prog_data.base.nr_params = param_count;

   if (prog) {
      brw_nir_setup_glsl_uniforms(fp->program.Base.nir, prog, &fp->program.Base,
                                  &prog_data.base, true);
   } else {
      brw_nir_setup_arb_uniforms(fp->program.Base.nir, &fp->program.Base,
                                 &prog_data.base);
   }

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      brw_dump_ir("fragment", prog, fs ? &fs->base : NULL, &fp->program.Base);

   int st_index8 = -1, st_index16 = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      st_index8 = brw_get_shader_time_index(brw, prog, &fp->program.Base, ST_FS8);
      st_index16 = brw_get_shader_time_index(brw, prog, &fp->program.Base, ST_FS16);
   }

   char *error_str = NULL;
   program = brw_compile_fs(brw->intelScreen->compiler, brw, mem_ctx,
                            key, &prog_data, fp->program.Base.nir,
                            &fp->program.Base, st_index8, st_index16,
                            brw->use_rep_send, &program_size, &error_str);
   if (program == NULL) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, error_str);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", error_str);

      ralloc_free(mem_ctx);
      return false;
   }

   if (unlikely(brw->perf_debug) && fs) {
      if (fs->compiled_once)
         brw_wm_debug_recompile(brw, prog, key);
      fs->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   if (prog_data.base.total_scratch) {
      brw_get_scratch_bo(brw, &brw->wm.base.scratch_bo,
			 prog_data.base.total_scratch * brw->max_wm_threads);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      fprintf(stderr, "\n");

   brw_upload_cache(&brw->cache, BRW_CACHE_FS_PROG,
		    key, sizeof(struct brw_wm_prog_key),
		    program, program_size,
		    &prog_data, sizeof(prog_data),
		    &brw->wm.base.prog_offset, &brw->wm.prog_data);

   ralloc_free(mem_ctx);

   return true;
}
Example #15
static const unsigned *
brw_cs_emit(struct brw_context *brw,
            void *mem_ctx,
            const struct brw_cs_prog_key *key,
            struct brw_cs_prog_data *prog_data,
            struct gl_compute_program *cp,
            struct gl_shader_program *prog,
            unsigned *final_assembly_size)
{
   bool start_busy = false;
   double start_time = 0;

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader =
      (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE];

   if (unlikely(INTEL_DEBUG & DEBUG_CS))
      brw_dump_ir("compute", prog, &shader->base, &cp->Base);

   prog_data->local_size[0] = cp->LocalSize[0];
   prog_data->local_size[1] = cp->LocalSize[1];
   prog_data->local_size[2] = cp->LocalSize[2];
   unsigned local_workgroup_size =
      cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2];

   cfg_t *cfg = NULL;
   const char *fail_msg = NULL;

   int st_index = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      st_index = brw_get_shader_time_index(brw, prog, &cp->Base, ST_CS);

   /* Now the main event: Visit the shader IR and generate our CS IR for it.
    */
   fs_visitor v8(brw->intelScreen->compiler, brw,
                 mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
                 &cp->Base, 8, st_index);
   if (!v8.run_cs()) {
      fail_msg = v8.fail_msg;
   } else if (local_workgroup_size <= 8 * brw->max_cs_threads) {
      cfg = v8.cfg;
      prog_data->simd_size = 8;
   }

   fs_visitor v16(brw->intelScreen->compiler, brw,
                  mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
                  &cp->Base, 16, st_index);
   if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
       !fail_msg && !v8.simd16_unsupported &&
       local_workgroup_size <= 16 * brw->max_cs_threads) {
      /* Try a SIMD16 compile */
      v16.import_uniforms(&v8);
      if (!v16.run_cs()) {
         perf_debug("SIMD16 shader failed to compile: %s", v16.fail_msg);
         if (!cfg) {
            fail_msg =
               "Couldn't generate SIMD16 program and not "
               "enough threads for SIMD8";
         }
      } else {
         cfg = v16.cfg;
         prog_data->simd_size = 16;
      }
   }

   if (unlikely(cfg == NULL)) {
      assert(fail_msg);
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, fail_msg);
      _mesa_problem(NULL, "Failed to compile compute shader: %s\n",
                    fail_msg);
      return NULL;
   }

   fs_generator g(brw->intelScreen->compiler, brw,
                  mem_ctx, (void*) key, &prog_data->base, &cp->Base,
                  v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
   if (INTEL_DEBUG & DEBUG_CS) {
      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d",
                                   prog->Label ? prog->Label : "unnamed",
                                   prog->Name);
      g.enable_debug(name);
   }

   g.generate_code(cfg, prog_data->simd_size);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once) {
         _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles");
      }
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("CS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return g.get_assembly(final_assembly_size);
}
Example #16
static bool
brw_codegen_cs_prog(struct brw_context *brw,
                    struct gl_shader_program *prog,
                    struct brw_compute_program *cp,
                    struct brw_cs_prog_key *key)
{
   struct gl_context *ctx = &brw->ctx;
   const GLuint *program;
   void *mem_ctx = ralloc_context(NULL);
   GLuint program_size;
   struct brw_cs_prog_data prog_data;
   bool start_busy = false;
   double start_time = 0;

   struct brw_shader *cs =
      (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE];
   assert (cs);

   memset(&prog_data, 0, sizeof(prog_data));

   if (prog->Comp.SharedSize > 64 * 1024) {
      prog->LinkStatus = false;
      const char *error_str =
         "Compute shader used more than 64KB of shared variables";
      ralloc_strcat(&prog->InfoLog, error_str);
      _mesa_problem(NULL, "Failed to link compute shader: %s\n", error_str);

      ralloc_free(mem_ctx);
      return false;
   } else {
      prog_data.base.total_shared = prog->Comp.SharedSize;
   }

   assign_cs_binding_table_offsets(brw->intelScreen->devinfo, prog,
                                   &cp->program.Base, &prog_data);

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count = cp->program.Base.nir->num_uniforms;

   /* The backend also sometimes adds params for texture size. */
   param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
   prog_data.base.param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   prog_data.base.pull_param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   prog_data.base.image_param =
      rzalloc_array(NULL, struct brw_image_param, cs->base.NumImages);
   prog_data.base.nr_params = param_count;
   prog_data.base.nr_image_params = cs->base.NumImages;

   brw_nir_setup_glsl_uniforms(cp->program.Base.nir, prog, &cp->program.Base,
                               &prog_data.base, true);

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   if (unlikely(INTEL_DEBUG & DEBUG_CS))
      brw_dump_ir("compute", prog, &cs->base, &cp->program.Base);

   int st_index = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      st_index = brw_get_shader_time_index(brw, prog, &cp->program.Base, ST_CS);

   char *error_str;
   program = brw_compile_cs(brw->intelScreen->compiler, brw, mem_ctx,
                            key, &prog_data, cp->program.Base.nir,
                            st_index, &program_size, &error_str);
   if (program == NULL) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, error_str);
      _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str);

      ralloc_free(mem_ctx);
      return false;
   }

   if (unlikely(brw->perf_debug) && cs) {
      if (cs->compiled_once) {
         _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles");
      }
      cs->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("CS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   if (prog_data.base.total_scratch) {
      brw_get_scratch_bo(brw, &brw->cs.base.scratch_bo,
                         prog_data.base.total_scratch * brw->max_cs_threads);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_CS))
      fprintf(stderr, "\n");

   brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG,
                    key, sizeof(*key),
                    program, program_size,
                    &prog_data, sizeof(prog_data),
                    &brw->cs.base.prog_offset, &brw->cs.prog_data);
   ralloc_free(mem_ctx);

   return true;
}
Example #17
/**
 * Called via glMapBufferRange and glMapBuffer
 *
 * The goal of this extension is to allow apps to accumulate their rendering
 * at the same time as they accumulate their buffer object.  Without it,
 * you'd end up blocking on execution of rendering every time you mapped
 * the buffer to put new data in.
 *
 * We support it in 3 ways: If unsynchronized, then don't bother
 * flushing the batchbuffer before mapping the buffer, which can save blocking
 * in many cases.  If we would still block, and they allow the whole buffer
 * to be invalidated, then just allocate a new buffer to replace the old one.
 * If not, and we'd block, and they allow the subrange of the buffer to be
 * invalidated, then we can make a new little BO, let them write into that,
 * and blit it into the real BO at unmap time.
 */
static void *
intel_bufferobj_map_range(struct gl_context * ctx,
			  GLintptr offset, GLsizeiptr length,
			  GLbitfield access, struct gl_buffer_object *obj)
{
   struct intel_context *intel = intel_context(ctx);
   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);

   assert(intel_obj);

   /* _mesa_MapBufferRange (GL entrypoint) sets these, but the vbo module also
    * internally uses our functions directly.
    */
   obj->Offset = offset;
   obj->Length = length;
   obj->AccessFlags = access;

   if (intel_obj->sys_buffer) {
      const bool read_only =
	 (access & (GL_MAP_READ_BIT | GL_MAP_WRITE_BIT)) == GL_MAP_READ_BIT;

      if (!read_only && intel_obj->source)
	 release_buffer(intel_obj);

      if (!intel_obj->buffer || intel_obj->source) {
	 obj->Pointer = intel_obj->sys_buffer + offset;
	 return obj->Pointer;
      }

      free(intel_obj->sys_buffer);
      intel_obj->sys_buffer = NULL;
   }

   if (intel_obj->buffer == NULL) {
      obj->Pointer = NULL;
      return NULL;
   }

   /* If the access is synchronized (like a normal buffer mapping), then get
    * things flushed out so the later mapping syncs appropriately through GEM.
    * If the user doesn't care about existing buffer contents and mapping would
    * cause us to block, then throw out the old buffer.
    *
    * If they set INVALIDATE_BUFFER, we can pitch the current contents to
    * achieve the required synchronization.
    */
   if (!(access & GL_MAP_UNSYNCHRONIZED_BIT)) {
      if (drm_intel_bo_references(intel->batch.bo, intel_obj->buffer)) {
	 if (access & GL_MAP_INVALIDATE_BUFFER_BIT) {
	    drm_intel_bo_unreference(intel_obj->buffer);
	    intel_bufferobj_alloc_buffer(intel, intel_obj);
	 } else {
            perf_debug("Stalling on the GPU for mapping a busy buffer "
                       "object\n");
	    intel_flush(ctx);
	 }
      } else if (drm_intel_bo_busy(intel_obj->buffer) &&
		 (access & GL_MAP_INVALIDATE_BUFFER_BIT)) {
	 drm_intel_bo_unreference(intel_obj->buffer);
	 intel_bufferobj_alloc_buffer(intel, intel_obj);
      }
   }

   /* If the user is mapping a range of an active buffer object but
    * doesn't require the current contents of that range, make a new
    * BO, and we'll copy what they put in there out at unmap or
    * FlushRange time.
    */
   if ((access & GL_MAP_INVALIDATE_RANGE_BIT) &&
       drm_intel_bo_busy(intel_obj->buffer)) {
      if (access & GL_MAP_FLUSH_EXPLICIT_BIT) {
	 intel_obj->range_map_buffer = malloc(length);
	 obj->Pointer = intel_obj->range_map_buffer;
      } else {
	 intel_obj->range_map_bo = drm_intel_bo_alloc(intel->bufmgr,
						      "range map",
						      length, 64);
	 if (!(access & GL_MAP_READ_BIT)) {
	    drm_intel_gem_bo_map_gtt(intel_obj->range_map_bo);
	 } else {
	    drm_intel_bo_map(intel_obj->range_map_bo,
			     (access & GL_MAP_WRITE_BIT) != 0);
	 }
	 obj->Pointer = intel_obj->range_map_bo->virtual;
      }
      return obj->Pointer;
   }

   if (access & GL_MAP_UNSYNCHRONIZED_BIT)
      drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
   else if (!(access & GL_MAP_READ_BIT)) {
      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
   } else {
      drm_intel_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
   }

   obj->Pointer = intel_obj->buffer->virtual + offset;
   return obj->Pointer;
}
Example #18
/**
 * Replace data in a subrange of buffer object.  If the data range
 * specified by size + offset extends beyond the end of the buffer or
 * if data is NULL, no copy is performed.
 * Called via glBufferSubDataARB().
 */
static void
intel_bufferobj_subdata(struct gl_context * ctx,
                        GLintptrARB offset,
                        GLsizeiptrARB size,
                        const GLvoid * data, struct gl_buffer_object *obj)
{
   struct intel_context *intel = intel_context(ctx);
   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
   bool busy;

   if (size == 0)
      return;

   assert(intel_obj);

   /* If we have a single copy in system memory, update that */
   if (intel_obj->sys_buffer) {
      if (intel_obj->source)
	 release_buffer(intel_obj);

      if (intel_obj->buffer == NULL) {
	 memcpy((char *)intel_obj->sys_buffer + offset, data, size);
	 return;
      }

      free(intel_obj->sys_buffer);
      intel_obj->sys_buffer = NULL;
   }

   /* Otherwise we need to update the copy in video memory. */
   busy =
      drm_intel_bo_busy(intel_obj->buffer) ||
      drm_intel_bo_references(intel->batch.bo, intel_obj->buffer);

   /* replace the current busy bo with fresh data */
   if (busy && size == intel_obj->Base.Size) {
      drm_intel_bo_unreference(intel_obj->buffer);
      intel_bufferobj_alloc_buffer(intel, intel_obj);
      drm_intel_bo_subdata(intel_obj->buffer, 0, size, data);
   } else if (intel->gen < 6) {
      if (busy) {
	 drm_intel_bo *temp_bo;

	 temp_bo = drm_intel_bo_alloc(intel->bufmgr, "subdata temp", size, 64);

	 drm_intel_bo_subdata(temp_bo, 0, size, data);

	 intel_emit_linear_blit(intel,
				intel_obj->buffer, offset,
				temp_bo, 0,
				size);

	 drm_intel_bo_unreference(temp_bo);
      } else {
	 drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
      }
   } else {
      /* Can't use the blit to modify the buffer in the middle of batch. */
      if (drm_intel_bo_references(intel->batch.bo, intel_obj->buffer)) {
	 intel_batchbuffer_flush(intel);
      }
      drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
   }
}
Example #19
    */
   if (brw->has_llc) {
      if (offset + size <= intel_obj->gpu_active_start ||
          intel_obj->gpu_active_end <= offset) {
         drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
         memcpy(intel_obj->buffer->virtual + offset, data, size);
         drm_intel_bo_unmap(intel_obj->buffer);

         if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
            intel_obj->prefer_stall_to_blit = true;
         return;
      }
   }

   busy =
      drm_intel_bo_busy(intel_obj->buffer) ||
      drm_intel_bo_references(brw->batch.bo, intel_obj->buffer);

   if (busy) {
      if (size == intel_obj->Base.Size) {
	 /* Replace the current busy bo so the subdata doesn't stall. */
	 drm_intel_bo_unreference(intel_obj->buffer);
	 intel_bufferobj_alloc_buffer(brw, intel_obj);
      } else if (!intel_obj->prefer_stall_to_blit) {
         perf_debug("Using a blit copy to avoid stalling on "
                    "glBufferSubData(%ld, %ld) (%ldkb) to a busy "
                    "(%d-%d) buffer object.\n",
                    (long)offset, (long)offset + size, (long)(size/1024),
                    intel_obj->gpu_active_start,
                    intel_obj->gpu_active_end);
	 drm_intel_bo *temp_bo =
Example #20
static bool
intel_blit_texsubimage(struct gl_context * ctx,
		       struct gl_texture_image *texImage,
		       GLint xoffset, GLint yoffset,
		       GLint width, GLint height,
		       GLenum format, GLenum type, const void *pixels,
		       const struct gl_pixelstore_attrib *packing)
{
   struct intel_context *intel = intel_context(ctx);
   struct intel_texture_image *intelImage = intel_texture_image(texImage);

   /* Try to do a blit upload of the subimage if the texture is
    * currently busy.
    */
   if (!intelImage->mt)
      return false;

   /* The blitter can't handle Y tiling */
   if (intelImage->mt->region->tiling == I915_TILING_Y)
      return false;

   if (texImage->TexObject->Target != GL_TEXTURE_2D)
      return false;

   if (!drm_intel_bo_busy(intelImage->mt->region->bo))
      return false;

   DBG("BLT subimage %s target %s level %d offset %d,%d %dx%d\n",
       __func__,
       _mesa_enum_to_string(texImage->TexObject->Target),
       texImage->Level, xoffset, yoffset, width, height);

   pixels = _mesa_validate_pbo_teximage(ctx, 2, width, height, 1,
					format, type, pixels, packing,
					"glTexSubImage");
   if (!pixels)
      return false;

   struct intel_mipmap_tree *temp_mt =
      intel_miptree_create(intel, GL_TEXTURE_2D, texImage->TexFormat,
                           0, 0,
                           width, height, 1,
                           false, INTEL_MIPTREE_TILING_NONE);
   if (!temp_mt)
      goto err;

   GLubyte *dst = intel_miptree_map_raw(intel, temp_mt);
   if (!dst)
      goto err;

   if (!_mesa_texstore(ctx, 2, texImage->_BaseFormat,
		       texImage->TexFormat,
		       temp_mt->region->pitch,
		       &dst,
		       width, height, 1,
		       format, type, pixels, packing)) {
      _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
   }

   intel_miptree_unmap_raw(temp_mt);

   bool ret;

   ret = intel_miptree_blit(intel,
                            temp_mt, 0, 0,
                            0, 0, false,
                            intelImage->mt, texImage->Level, texImage->Face,
                            xoffset, yoffset, false,
                            width, height, COLOR_LOGICOP_COPY);
   assert(ret);

   intel_miptree_release(&temp_mt);
   _mesa_unmap_teximage_pbo(ctx, packing);

   return ret;

err:
   _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
   intel_miptree_release(&temp_mt);
   _mesa_unmap_teximage_pbo(ctx, packing);
   return false;
}
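An application-side sketch of the case this blit path exists for: patching a texture the GPU may still be sampling from (texture name, level and region are illustrative; uses only core GL 1.1 entry points). When drm_intel_bo_busy() reports the miptree busy, the driver stages the data in a temporary linear miptree and blits it in, rather than mapping the busy BO and stalling.

#include <GL/gl.h>

static void
patch_texture(GLuint tex, GLint x, GLint y, GLsizei w, GLsizei h,
              const void *pixels)
{
   glBindTexture(GL_TEXTURE_2D, tex);
   /* The driver above turns this into a staging blit when the texture is busy. */
   glTexSubImage2D(GL_TEXTURE_2D, 0, x, y, w, h,
                   GL_RGBA, GL_UNSIGNED_BYTE, pixels);
}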
Example #21
static bool
brw_codegen_cs_prog(struct brw_context *brw,
                    struct gl_shader_program *prog,
                    struct brw_program *cp,
                    struct brw_cs_prog_key *key)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;
   const GLuint *program;
   void *mem_ctx = ralloc_context(NULL);
   GLuint program_size;
   struct brw_cs_prog_data prog_data;
   bool start_busy = false;
   double start_time = 0;

   struct brw_shader *cs =
      (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE];
   assert (cs);

   memset(&prog_data, 0, sizeof(prog_data));

   if (prog->Comp.SharedSize > 64 * 1024) {
      prog->LinkStatus = false;
      const char *error_str =
         "Compute shader used more than 64KB of shared variables";
      ralloc_strcat(&prog->InfoLog, error_str);
      _mesa_problem(NULL, "Failed to link compute shader: %s\n", error_str);

      ralloc_free(mem_ctx);
      return false;
   } else {
      prog_data.base.total_shared = prog->Comp.SharedSize;
   }

   assign_cs_binding_table_offsets(devinfo, prog, &cp->program, &prog_data);

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count = cp->program.nir->num_uniforms / 4;

   /* The backend also sometimes adds a param for the thread local id. */
   prog_data.thread_local_id_index = param_count++;

   /* The backend also sometimes adds params for texture size. */
   param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
   prog_data.base.param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   prog_data.base.pull_param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   prog_data.base.image_param =
      rzalloc_array(NULL, struct brw_image_param, cs->base.NumImages);
   prog_data.base.nr_params = param_count;
   prog_data.base.nr_image_params = cs->base.NumImages;

   brw_nir_setup_glsl_uniforms(cp->program.nir, prog, &cp->program,
                               &prog_data.base, true);

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   if (unlikely(INTEL_DEBUG & DEBUG_CS))
      brw_dump_ir("compute", prog, &cs->base, &cp->program);

   int st_index = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      st_index = brw_get_shader_time_index(brw, prog, &cp->program, ST_CS);

   char *error_str;
   program = brw_compile_cs(brw->screen->compiler, brw, mem_ctx, key,
                            &prog_data, cp->program.nir, st_index,
                            &program_size, &error_str);
   if (program == NULL) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, error_str);
      _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str);

      ralloc_free(mem_ctx);
      return false;
   }

   if (unlikely(brw->perf_debug) && cs) {
      if (cs->compiled_once) {
         _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles");
      }
      cs->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("CS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   const unsigned subslices = MAX2(brw->screen->subslice_total, 1);

   /* WaCSScratchSize:hsw
    *
    * Haswell's scratch space address calculation appears to be sparse
    * rather than tightly packed.  The Thread ID has bits indicating
    * which subslice, EU within a subslice, and thread within an EU
    * it is.  There's a maximum of two slices and two subslices, so these
    * can be stored with a single bit.  Even though there are only 10 EUs
    * per subslice, this is stored in 4 bits, so there's an effective
    * maximum value of 16 EUs.  Similarly, although there are only 7
    * threads per EU, this is stored in a 3 bit number, giving an effective
    * maximum value of 8 threads per EU.
    *
    * This means that we need to use 16 * 8 instead of 10 * 7 for the
    * number of threads per subslice.
    */
   const unsigned scratch_ids_per_subslice =
      brw->is_haswell ? 16 * 8 : devinfo->max_cs_threads;

   brw_alloc_stage_scratch(brw, &brw->cs.base,
                           prog_data.base.total_scratch,
                           scratch_ids_per_subslice * subslices);

   if (unlikely(INTEL_DEBUG & DEBUG_CS))
      fprintf(stderr, "\n");

   brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG,
                    key, sizeof(*key),
                    program, program_size,
                    &prog_data, sizeof(prog_data),
                    &brw->cs.base.prog_offset, &brw->cs.base.prog_data);
   ralloc_free(mem_ctx);

   return true;
}