/**
 * Replace data in a subrange of buffer object. If the data range
 * specified by size + offset extends beyond the end of the buffer or
 * if data is NULL, no copy is performed.
 * Called via glBufferSubDataARB().
 */
static void
intel_bufferobj_subdata(struct gl_context *ctx,
                        GLintptrARB offset,
                        GLsizeiptrARB size,
                        const GLvoid *data,
                        struct gl_buffer_object *obj)
{
   struct intel_context *intel = intel_context(ctx);
   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
   bool busy;

   if (size == 0)
      return;

   assert(intel_obj);

   /* If we have a single copy in system memory, update that */
   if (intel_obj->sys_buffer) {
      if (intel_obj->source)
         release_buffer(intel_obj);

      if (intel_obj->buffer == NULL) {
         memcpy((char *)intel_obj->sys_buffer + offset, data, size);
         return;
      }

      free(intel_obj->sys_buffer);
      intel_obj->sys_buffer = NULL;
   }

   /* Otherwise we need to update the copy in video memory. */
   busy =
      drm_intel_bo_busy(intel_obj->buffer) ||
      drm_intel_bo_references(intel->batch.bo, intel_obj->buffer);

   if (busy) {
      if (size == intel_obj->Base.Size) {
         /* Replace the current busy bo with fresh data. */
         drm_intel_bo_unreference(intel_obj->buffer);
         intel_bufferobj_alloc_buffer(intel, intel_obj);
         drm_intel_bo_subdata(intel_obj->buffer, 0, size, data);
      } else {
         perf_debug("Using a blit copy to avoid stalling on %ldb "
                    "glBufferSubData() to a busy buffer object.\n",
                    (long)size);
         drm_intel_bo *temp_bo =
            drm_intel_bo_alloc(intel->bufmgr, "subdata temp", size, 64);

         drm_intel_bo_subdata(temp_bo, 0, size, data);

         intel_emit_linear_blit(intel,
                                intel_obj->buffer, offset,
                                temp_bo, 0,
                                size);

         drm_intel_bo_unreference(temp_bo);
      }
   } else {
      drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
   }
}
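The two branches of the busy path above correspond to two common application patterns. A hypothetical GL-client sketch follows; the buffer name and sizes are made up, and only the GL entry points themselves (assuming <GL/gl.h>) are relied on.

/* Hypothetical sketch of the two glBufferSubData() cases the driver code
 * above distinguishes. */
void update_vertex_data(GLuint vbo, const float *verts,
                        GLsizeiptr buf_size, GLsizeiptr dirty_size)
{
   glBindBuffer(GL_ARRAY_BUFFER, vbo);

   /* Whole-buffer rewrite while the GPU may still be reading the old
    * contents: size == Base.Size above, so the driver swaps in a freshly
    * allocated BO instead of waiting. */
   glBufferSubData(GL_ARRAY_BUFFER, 0, buf_size, verts);

   /* Partial rewrite of a busy buffer: the driver stages the bytes in a
    * "subdata temp" BO and blits them into place, avoiding the stall the
    * perf_debug message warns about. */
   glBufferSubData(GL_ARRAY_BUFFER, 0, dirty_size, verts);
}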
static boolean
i915_drm_buffer_is_busy(struct i915_winsys *iws,
                        struct i915_winsys_buffer *buffer)
{
   struct i915_drm_buffer *i915_buffer = i915_drm_buffer(buffer);
   if (!i915_buffer)
      return FALSE;
   return drm_intel_bo_busy(i915_buffer->bo);
}
static void
intel_check_sync(GLcontext *ctx, struct gl_sync_object *s)
{
   struct intel_sync_object *sync = (struct intel_sync_object *)s;

   if (sync->bo && !drm_intel_bo_busy(sync->bo)) {
      drm_intel_bo_unreference(sync->bo);
      sync->bo = NULL;

      s->StatusFlag = 1;
   }
}
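This CheckSync hook is what a non-blocking fence query from the application eventually reaches: the sync object's batch BO is polled with drm_intel_bo_busy() and the sync is marked signalled once the BO goes idle. Below is a minimal client-side sketch of that polling pattern, assuming a GL 3.2 / ARB_sync context (the exact header providing GLsync varies); the do_other_work() helper in the usage comment is a placeholder.

/* Hypothetical sketch: poll a fence without blocking, so the driver's
 * CheckSync hook above is hit instead of a wait. */
static int
frame_finished(GLsync fence)
{
   GLint status = GL_UNSIGNALED;
   GLsizei len = 0;

   /* Queries sync status; on this driver it boils down to a
    * drm_intel_bo_busy() check on the batch that contained the fence. */
   glGetSynciv(fence, GL_SYNC_STATUS, sizeof(status), &len, &status);
   return status == GL_SIGNALED;
}

/* Usage (hypothetical):
 *    GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
 *    glFlush();
 *    while (!frame_finished(fence))
 *       do_other_work();
 *    glDeleteSync(fence);
 */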
static void intelTexImage(struct gl_context * ctx, GLuint dims, struct gl_texture_image *texImage, GLenum format, GLenum type, const void *pixels, const struct gl_pixelstore_attrib *unpack) { struct intel_texture_image *intelImage = intel_texture_image(texImage); bool ok; bool tex_busy = intelImage->mt && drm_intel_bo_busy(intelImage->mt->bo); DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n", __func__, _mesa_get_format_name(texImage->TexFormat), _mesa_enum_to_string(texImage->TexObject->Target), _mesa_enum_to_string(format), _mesa_enum_to_string(type), texImage->Level, texImage->Width, texImage->Height, texImage->Depth); /* Allocate storage for texture data. */ if (!ctx->Driver.AllocTextureImageBuffer(ctx, texImage)) { _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage%uD", dims); return; } assert(intelImage->mt); if (intelImage->mt->format == MESA_FORMAT_S_UINT8) intelImage->mt->r8stencil_needs_update = true; ok = _mesa_meta_pbo_TexSubImage(ctx, dims, texImage, 0, 0, 0, texImage->Width, texImage->Height, texImage->Depth, format, type, pixels, tex_busy, unpack); if (ok) return; ok = intel_texsubimage_tiled_memcpy(ctx, dims, texImage, 0, 0, 0, /*x,y,z offsets*/ texImage->Width, texImage->Height, texImage->Depth, format, type, pixels, unpack, false /*allocate_storage*/); if (ok) return; DBG("%s: upload image %dx%dx%d pixels %p\n", __func__, texImage->Width, texImage->Height, texImage->Depth, pixels); _mesa_store_teximage(ctx, dims, texImage, format, type, pixels, unpack); }
static int
i915_drm_fence_signalled(struct i915_winsys *iws,
                         struct pipe_fence_handle *fence)
{
   struct i915_drm_fence *f = (struct i915_drm_fence *)fence;

   /* fence already expired */
   if (!f->bo)
      return 1;

   return !drm_intel_bo_busy(f->bo);
}
int
brw_bo_map_gtt(struct brw_context *brw, drm_intel_bo *bo, const char *bo_name)
{
   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
      return drm_intel_gem_bo_map_gtt(bo);

   double start_time = get_time();

   int ret = drm_intel_gem_bo_map_gtt(bo);

   perf_debug("GTT mapping a busy %s BO stalled and took %.03f ms.\n",
              bo_name, (get_time() - start_time) * 1000);

   return ret;
}
static bool
brw_fence_has_completed(struct brw_fence *fence)
{
   if (fence->signalled)
      return true;

   if (fence->batch_bo && !drm_intel_bo_busy(fence->batch_bo)) {
      drm_intel_bo_unreference(fence->batch_bo);
      fence->batch_bo = NULL;
      fence->signalled = true;
      return true;
   }

   return false;
}
/**
 * Map a buffer object; issue performance warnings if mapping causes stalls.
 *
 * This matches the drm_intel_bo_map API, but takes an additional human-readable
 * name for the buffer object to use in the performance debug message.
 */
int
brw_bo_map(struct brw_context *brw, drm_intel_bo *bo, int write_enable,
           const char *bo_name)
{
   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
      return drm_intel_bo_map(bo, write_enable);

   double start_time = get_time();

   int ret = drm_intel_bo_map(bo, write_enable);

   perf_debug("CPU mapping a busy %s BO stalled and took %.03f ms.\n",
              bo_name, (get_time() - start_time) * 1000);

   return ret;
}
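The wrapper only adds instrumentation: the up-front drm_intel_bo_busy() check decides whether the map is worth timing at all. A hedged sketch of a typical in-driver call site follows; the query-readback scenario and the result layout are assumptions for illustration, while brw_bo_map() itself and the libdrm calls come from the code above.

/* Hypothetical call site: read back query results through the instrumented
 * mapping helper instead of calling drm_intel_bo_map() directly, so stalls
 * show up in perf_debug output. */
static uint64_t
read_query_result(struct brw_context *brw, drm_intel_bo *query_bo)
{
   uint64_t result = 0;

   /* write_enable = false: we only read the GPU-written data. */
   if (brw_bo_map(brw, query_bo, false, "query object") != 0)
      return 0;   /* mapping failed; real code would report an error */

   /* The layout of the BO contents is an assumption for this sketch. */
   result = ((uint64_t *) query_bo->virtual)[0];

   drm_intel_bo_unmap(query_bo);
   return result;
}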
void *
intel_miptree_map_raw(struct intel_context *intel, struct intel_mipmap_tree *mt)
{
   drm_intel_bo *bo = mt->region->bo;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      if (drm_intel_bo_busy(bo)) {
         perf_debug("Mapping a busy BO, causing a stall on the GPU.\n");
      }
   }

   intel_flush(&intel->ctx);

   if (mt->region->tiling != I915_TILING_NONE)
      drm_intel_gem_bo_map_gtt(bo);
   else
      drm_intel_bo_map(bo, true);

   return bo->virtual;
}
/**
 * Replace data in a subrange of buffer object. If the data range
 * specified by size + offset extends beyond the end of the buffer or
 * if data is NULL, no copy is performed.
 * Called via glBufferSubDataARB().
 */
static void
intel_bufferobj_subdata(GLcontext *ctx,
                        GLenum target,
                        GLintptrARB offset,
                        GLsizeiptrARB size,
                        const GLvoid *data,
                        struct gl_buffer_object *obj)
{
   struct intel_context *intel = intel_context(ctx);
   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);

   if (size == 0)
      return;

   assert(intel_obj);

   if (intel_obj->region)
      intel_bufferobj_cow(intel, intel_obj);

   if (intel_obj->sys_buffer)
      memcpy((char *)intel_obj->sys_buffer + offset, data, size);
   else {
      /* Flush any existing batchbuffer that might reference this data. */
      if (drm_intel_bo_busy(intel_obj->buffer) ||
          drm_intel_bo_references(intel->batch->buf, intel_obj->buffer)) {
         drm_intel_bo *temp_bo;

         temp_bo = drm_intel_bo_alloc(intel->bufmgr, "subdata temp", size, 64);

         drm_intel_bo_subdata(temp_bo, 0, size, data);

         intel_emit_linear_blit(intel,
                                intel_obj->buffer, offset,
                                temp_bo, 0,
                                size);

         drm_intel_bo_unreference(temp_bo);
      } else {
         drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
      }
   }
}
bool brw_codegen_vs_prog(struct brw_context *brw, struct gl_shader_program *prog, struct brw_vertex_program *vp, struct brw_vs_prog_key *key) { GLuint program_size; const GLuint *program; struct brw_vs_prog_data prog_data; struct brw_stage_prog_data *stage_prog_data = &prog_data.base.base; void *mem_ctx; int i; struct brw_shader *vs = NULL; bool start_busy = false; double start_time = 0; if (prog) vs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX]; memset(&prog_data, 0, sizeof(prog_data)); /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */ if (!prog) stage_prog_data->use_alt_mode = true; mem_ctx = ralloc_context(NULL); brw_assign_common_binding_table_offsets(MESA_SHADER_VERTEX, brw->intelScreen->devinfo, prog, &vp->program.Base, &prog_data.base.base, 0); /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed * by the state cache. */ int param_count = vp->program.Base.nir->num_uniforms; if (!brw->intelScreen->compiler->scalar_vs) param_count *= 4; if (vs) prog_data.base.base.nr_image_params = vs->base.NumImages; /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip * planes as uniforms. */ param_count += key->nr_userclip_plane_consts * 4; stage_prog_data->param = rzalloc_array(NULL, const gl_constant_value *, param_count); stage_prog_data->pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); stage_prog_data->image_param = rzalloc_array(NULL, struct brw_image_param, stage_prog_data->nr_image_params); stage_prog_data->nr_params = param_count; if (prog) { brw_nir_setup_glsl_uniforms(vp->program.Base.nir, prog, &vp->program.Base, &prog_data.base.base, brw->intelScreen->compiler->scalar_vs); } else { brw_nir_setup_arb_uniforms(vp->program.Base.nir, &vp->program.Base, &prog_data.base.base); } GLbitfield64 outputs_written = vp->program.Base.OutputsWritten; prog_data.inputs_read = vp->program.Base.InputsRead; if (key->copy_edgeflag) { outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE); prog_data.inputs_read |= VERT_BIT_EDGEFLAG; } if (brw->gen < 6) { /* Put dummy slots into the VUE for the SF to put the replaced * point sprite coords in. We shouldn't need these dummy slots, * which take up precious URB space, but it would mean that the SF * doesn't get nice aligned pairs of input coords into output * coords, which would be a pain to handle. */ for (i = 0; i < 8; i++) { if (key->point_coord_replace & (1 << i)) outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i); } /* if back colors are written, allocate slots for front colors too */ if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0)) outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0); if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1)) outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1); } /* In order for legacy clipping to work, we need to populate the clip * distance varying slots whenever clipping is enabled, even if the vertex * shader doesn't write to gl_ClipDistance. */ if (key->nr_userclip_plane_consts > 0) { outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0); outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); } brw_compute_vue_map(brw->intelScreen->devinfo, &prog_data.base.vue_map, outputs_written, prog ? 
prog->SeparateShader : false); if (0) { _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG, true); } if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); start_time = get_time(); } if (unlikely(INTEL_DEBUG & DEBUG_VS)) brw_dump_ir("vertex", prog, vs ? &vs->base : NULL, &vp->program.Base); int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &vp->program.Base, ST_VS); /* Emit GEN4 code. */ char *error_str; program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx, key, &prog_data, vp->program.Base.nir, brw_select_clip_planes(&brw->ctx), !_mesa_is_gles3(&brw->ctx), st_index, &program_size, &error_str); if (program == NULL) { if (prog) { prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, error_str); } _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } if (unlikely(brw->perf_debug) && vs) { if (vs->compiled_once) { brw_vs_debug_recompile(brw, prog, key); } if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("VS compile took %.03f ms and stalled the GPU\n", (get_time() - start_time) * 1000); } vs->compiled_once = true; } /* Scratch space is used for register spilling */ if (prog_data.base.base.total_scratch) { brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo, prog_data.base.base.total_scratch * brw->max_vs_threads); } brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG, key, sizeof(struct brw_vs_prog_key), program, program_size, &prog_data, sizeof(prog_data), &brw->vs.base.prog_offset, &brw->vs.prog_data); ralloc_free(mem_ctx); return true; }
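The busy/idle bracketing around the compile is the same in every codegen function in this file: note whether the last batch BO was busy before a long CPU task, and if it has gone idle by the time the task finishes, the task probably starved the GPU. Below is a condensed sketch of just that pattern, distilled from brw_codegen_vs_prog() above; the compile_something callback is a placeholder, not a real Mesa function.

/* Hypothetical distillation of the perf_debug stall-detection pattern used
 * by the VS/GS/FS/CS codegen paths in this file. */
static void
timed_compile(struct brw_context *brw, void (*compile_something)(void))
{
   bool start_busy = false;
   double start_time = 0;

   if (unlikely(brw->perf_debug)) {
      start_busy = brw->batch.last_bo &&
                   drm_intel_bo_busy(brw->batch.last_bo);
      start_time = get_time();
   }

   compile_something();   /* placeholder for the actual compile */

   /* If the GPU was busy when we started and is idle now, the compile took
    * long enough that the GPU ran out of work waiting for us. */
   if (unlikely(brw->perf_debug) &&
       start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
      perf_debug("compile took %.03f ms and stalled the GPU\n",
                 (get_time() - start_time) * 1000);
   }
}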
/** * Called via glMapBufferRange(). * * The goal of this extension is to allow apps to accumulate their rendering * at the same time as they accumulate their buffer object. Without it, * you'd end up blocking on execution of rendering every time you mapped * the buffer to put new data in. * * We support it in 3 ways: If unsynchronized, then don't bother * flushing the batchbuffer before mapping the buffer, which can save blocking * in many cases. If we would still block, and they allow the whole buffer * to be invalidated, then just allocate a new buffer to replace the old one. * If not, and we'd block, and they allow the subrange of the buffer to be * invalidated, then we can make a new little BO, let them write into that, * and blit it into the real BO at unmap time. */ static void * intel_bufferobj_map_range(GLcontext * ctx, GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access, struct gl_buffer_object *obj) { struct intel_context *intel = intel_context(ctx); struct intel_buffer_object *intel_obj = intel_buffer_object(obj); assert(intel_obj); /* _mesa_MapBufferRange (GL entrypoint) sets these, but the vbo module also * internally uses our functions directly. */ obj->Offset = offset; obj->Length = length; obj->AccessFlags = access; if (intel_obj->sys_buffer) { obj->Pointer = intel_obj->sys_buffer + offset; return obj->Pointer; } if (intel_obj->region) intel_bufferobj_cow(intel, intel_obj); /* If the mapping is synchronized with other GL operations, flush * the batchbuffer so that GEM knows about the buffer access for later * syncing. */ if (!(access & GL_MAP_UNSYNCHRONIZED_BIT) && drm_intel_bo_references(intel->batch->buf, intel_obj->buffer)) intelFlush(ctx); if (intel_obj->buffer == NULL) { obj->Pointer = NULL; return NULL; } /* If the user doesn't care about existing buffer contents and mapping * would cause us to block, then throw out the old buffer. */ if (!(access & GL_MAP_UNSYNCHRONIZED_BIT) && (access & GL_MAP_INVALIDATE_BUFFER_BIT) && drm_intel_bo_busy(intel_obj->buffer)) { drm_intel_bo_unreference(intel_obj->buffer); intel_obj->buffer = dri_bo_alloc(intel->bufmgr, "bufferobj", intel_obj->Base.Size, 64); } /* If the user is mapping a range of an active buffer object but * doesn't require the current contents of that range, make a new * BO, and we'll copy what they put in there out at unmap or * FlushRange time. */ if ((access & GL_MAP_INVALIDATE_RANGE_BIT) && drm_intel_bo_busy(intel_obj->buffer)) { if (access & GL_MAP_FLUSH_EXPLICIT_BIT) { intel_obj->range_map_buffer = _mesa_malloc(length); obj->Pointer = intel_obj->range_map_buffer; } else { intel_obj->range_map_bo = drm_intel_bo_alloc(intel->bufmgr, "range map", length, 64); if (!(access & GL_MAP_READ_BIT) && intel->intelScreen->kernel_exec_fencing) { drm_intel_gem_bo_map_gtt(intel_obj->range_map_bo); intel_obj->mapped_gtt = GL_TRUE; } else { drm_intel_bo_map(intel_obj->range_map_bo, (access & GL_MAP_WRITE_BIT) != 0); intel_obj->mapped_gtt = GL_FALSE; } obj->Pointer = intel_obj->range_map_bo->virtual; } return obj->Pointer; } if (!(access & GL_MAP_READ_BIT) && intel->intelScreen->kernel_exec_fencing) { drm_intel_gem_bo_map_gtt(intel_obj->buffer); intel_obj->mapped_gtt = GL_TRUE; } else { drm_intel_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0); intel_obj->mapped_gtt = GL_FALSE; } obj->Pointer = intel_obj->buffer->virtual + offset; return obj->Pointer; }
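The comment at the top of this function (and of the newer variant further down) describes three mapping strategies keyed off the access flags. A hypothetical client-side sketch of how those flags are exercised follows; the buffer name, offset, and size are made up, and only standard GL entry points are assumed.

/* Hypothetical sketch of the glMapBufferRange() flags the comment above is
 * written around. */
void stream_into_buffer(GLuint buf, const void *src,
                        GLsizeiptr size, GLintptr offset)
{
   glBindBuffer(GL_ARRAY_BUFFER, buf);

   /* INVALIDATE_RANGE on a busy buffer: the driver hands back a small
    * temporary BO ("range map") and blits it into the real buffer at unmap
    * (or FlushMappedBufferRange) time. */
   void *dst = glMapBufferRange(GL_ARRAY_BUFFER, offset, size,
                                GL_MAP_WRITE_BIT |
                                GL_MAP_INVALIDATE_RANGE_BIT);
   if (dst) {
      memcpy(dst, src, size);
      glUnmapBuffer(GL_ARRAY_BUFFER);
   }

   /* GL_MAP_UNSYNCHRONIZED_BIT would instead skip the flush and busy checks
    * and map the real BO, leaving synchronization to the application;
    * GL_MAP_INVALIDATE_BUFFER_BIT lets the driver discard the old BO rather
    * than stall on it. */
}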
bool brw_codegen_gs_prog(struct brw_context *brw, struct gl_shader_program *prog, struct brw_geometry_program *gp, struct brw_gs_prog_key *key) { struct brw_compiler *compiler = brw->intelScreen->compiler; struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; struct brw_stage_state *stage_state = &brw->gs.base; struct brw_gs_prog_data prog_data; bool start_busy = false; double start_time = 0; memset(&prog_data, 0, sizeof(prog_data)); assign_gs_binding_table_offsets(brw->intelScreen->devinfo, prog, &gp->program.Base, &prog_data); /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed * by the state cache. * * Note: param_count needs to be num_uniform_components * 4, since we add * padding around uniform values below vec4 size, so the worst case is that * every uniform is a float which gets padded to the size of a vec4. */ struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; struct brw_shader *bgs = (struct brw_shader *) gs; int param_count = gp->program.Base.nir->num_uniforms; if (!compiler->scalar_stage[MESA_SHADER_GEOMETRY]) param_count *= 4; prog_data.base.base.param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.base.image_param = rzalloc_array(NULL, struct brw_image_param, gs->NumImages); prog_data.base.base.nr_params = param_count; prog_data.base.base.nr_image_params = gs->NumImages; brw_nir_setup_glsl_uniforms(gp->program.Base.nir, prog, &gp->program.Base, &prog_data.base.base, compiler->scalar_stage[MESA_SHADER_GEOMETRY]); GLbitfield64 outputs_written = gp->program.Base.OutputsWritten; brw_compute_vue_map(brw->intelScreen->devinfo, &prog_data.base.vue_map, outputs_written, prog ? prog->SeparateShader : false); if (unlikely(INTEL_DEBUG & DEBUG_GS)) brw_dump_ir("geometry", prog, gs, NULL); int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, NULL, ST_GS); if (unlikely(brw->perf_debug)) { start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo); start_time = get_time(); } void *mem_ctx = ralloc_context(NULL); unsigned program_size; char *error_str; const unsigned *program = brw_compile_gs(brw->intelScreen->compiler, brw, mem_ctx, key, &prog_data, shader->Program->nir, prog, st_index, &program_size, &error_str); if (program == NULL) { ralloc_free(mem_ctx); return false; } if (unlikely(brw->perf_debug)) { if (bgs->compiled_once) { brw_gs_debug_recompile(brw, prog, key); } if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("GS compile took %.03f ms and stalled the GPU\n", (get_time() - start_time) * 1000); } bgs->compiled_once = true; } /* Scratch space is used for register spilling */ if (prog_data.base.base.total_scratch) { brw_get_scratch_bo(brw, &stage_state->scratch_bo, prog_data.base.base.total_scratch * brw->max_gs_threads); } brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG, key, sizeof(*key), program, program_size, &prog_data, sizeof(prog_data), &stage_state->prog_offset, &brw->gs.prog_data); ralloc_free(mem_ctx); return true; }
/** * All Mesa program -> GPU code generation goes through this function. * Depending on the instructions used (i.e. flow control instructions) * we'll use one of two code generators. */ bool brw_codegen_wm_prog(struct brw_context *brw, struct gl_shader_program *prog, struct brw_fragment_program *fp, struct brw_wm_prog_key *key) { struct gl_context *ctx = &brw->ctx; void *mem_ctx = ralloc_context(NULL); struct brw_wm_prog_data prog_data; const GLuint *program; struct brw_shader *fs = NULL; GLuint program_size; bool start_busy = false; double start_time = 0; if (prog) fs = (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; memset(&prog_data, 0, sizeof(prog_data)); /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */ if (!prog) prog_data.base.use_alt_mode = true; assign_fs_binding_table_offsets(brw->intelScreen->devinfo, prog, &fp->program.Base, key, &prog_data); /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed * by the state cache. */ int param_count = fp->program.Base.nir->num_uniforms; if (fs) prog_data.base.nr_image_params = fs->base.NumImages; /* The backend also sometimes adds params for texture size. */ param_count += 2 * ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits; prog_data.base.param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.image_param = rzalloc_array(NULL, struct brw_image_param, prog_data.base.nr_image_params); prog_data.base.nr_params = param_count; if (prog) { brw_nir_setup_glsl_uniforms(fp->program.Base.nir, prog, &fp->program.Base, &prog_data.base, true); } else { brw_nir_setup_arb_uniforms(fp->program.Base.nir, &fp->program.Base, &prog_data.base); } if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); start_time = get_time(); } if (unlikely(INTEL_DEBUG & DEBUG_WM)) brw_dump_ir("fragment", prog, fs ? &fs->base : NULL, &fp->program.Base); int st_index8 = -1, st_index16 = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) { st_index8 = brw_get_shader_time_index(brw, prog, &fp->program.Base, ST_FS8); st_index16 = brw_get_shader_time_index(brw, prog, &fp->program.Base, ST_FS16); } char *error_str = NULL; program = brw_compile_fs(brw->intelScreen->compiler, brw, mem_ctx, key, &prog_data, fp->program.Base.nir, &fp->program.Base, st_index8, st_index16, brw->use_rep_send, &program_size, &error_str); if (program == NULL) { if (prog) { prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, error_str); } _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } if (unlikely(brw->perf_debug) && fs) { if (fs->compiled_once) brw_wm_debug_recompile(brw, prog, key); fs->compiled_once = true; if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("FS compile took %.03f ms and stalled the GPU\n", (get_time() - start_time) * 1000); } } if (prog_data.base.total_scratch) { brw_get_scratch_bo(brw, &brw->wm.base.scratch_bo, prog_data.base.total_scratch * brw->max_wm_threads); } if (unlikely(INTEL_DEBUG & DEBUG_WM)) fprintf(stderr, "\n"); brw_upload_cache(&brw->cache, BRW_CACHE_FS_PROG, key, sizeof(struct brw_wm_prog_key), program, program_size, &prog_data, sizeof(prog_data), &brw->wm.base.prog_offset, &brw->wm.prog_data); ralloc_free(mem_ctx); return true; }
static const unsigned * brw_cs_emit(struct brw_context *brw, void *mem_ctx, const struct brw_cs_prog_key *key, struct brw_cs_prog_data *prog_data, struct gl_compute_program *cp, struct gl_shader_program *prog, unsigned *final_assembly_size) { bool start_busy = false; double start_time = 0; if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); start_time = get_time(); } struct brw_shader *shader = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE]; if (unlikely(INTEL_DEBUG & DEBUG_CS)) brw_dump_ir("compute", prog, &shader->base, &cp->Base); prog_data->local_size[0] = cp->LocalSize[0]; prog_data->local_size[1] = cp->LocalSize[1]; prog_data->local_size[2] = cp->LocalSize[2]; unsigned local_workgroup_size = cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2]; cfg_t *cfg = NULL; const char *fail_msg = NULL; int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &cp->Base, ST_CS); /* Now the main event: Visit the shader IR and generate our CS IR for it. */ fs_visitor v8(brw->intelScreen->compiler, brw, mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog, &cp->Base, 8, st_index); if (!v8.run_cs()) { fail_msg = v8.fail_msg; } else if (local_workgroup_size <= 8 * brw->max_cs_threads) { cfg = v8.cfg; prog_data->simd_size = 8; } fs_visitor v16(brw->intelScreen->compiler, brw, mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog, &cp->Base, 16, st_index); if (likely(!(INTEL_DEBUG & DEBUG_NO16)) && !fail_msg && !v8.simd16_unsupported && local_workgroup_size <= 16 * brw->max_cs_threads) { /* Try a SIMD16 compile */ v16.import_uniforms(&v8); if (!v16.run_cs()) { perf_debug("SIMD16 shader failed to compile: %s", v16.fail_msg); if (!cfg) { fail_msg = "Couldn't generate SIMD16 program and not " "enough threads for SIMD8"; } } else { cfg = v16.cfg; prog_data->simd_size = 16; } } if (unlikely(cfg == NULL)) { assert(fail_msg); prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, fail_msg); _mesa_problem(NULL, "Failed to compile compute shader: %s\n", fail_msg); return NULL; } fs_generator g(brw->intelScreen->compiler, brw, mem_ctx, (void*) key, &prog_data->base, &cp->Base, v8.promoted_constants, v8.runtime_check_aads_emit, "CS"); if (INTEL_DEBUG & DEBUG_CS) { char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d", prog->Label ? prog->Label : "unnamed", prog->Name); g.enable_debug(name); } g.generate_code(cfg, prog_data->simd_size); if (unlikely(brw->perf_debug) && shader) { if (shader->compiled_once) { _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles"); } shader->compiled_once = true; if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("CS compile took %.03f ms and stalled the GPU\n", (get_time() - start_time) * 1000); } } return g.get_assembly(final_assembly_size); }
static bool brw_codegen_cs_prog(struct brw_context *brw, struct gl_shader_program *prog, struct brw_compute_program *cp, struct brw_cs_prog_key *key) { struct gl_context *ctx = &brw->ctx; const GLuint *program; void *mem_ctx = ralloc_context(NULL); GLuint program_size; struct brw_cs_prog_data prog_data; bool start_busy = false; double start_time = 0; struct brw_shader *cs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE]; assert (cs); memset(&prog_data, 0, sizeof(prog_data)); if (prog->Comp.SharedSize > 64 * 1024) { prog->LinkStatus = false; const char *error_str = "Compute shader used more than 64KB of shared variables"; ralloc_strcat(&prog->InfoLog, error_str); _mesa_problem(NULL, "Failed to link compute shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } else { prog_data.base.total_shared = prog->Comp.SharedSize; } assign_cs_binding_table_offsets(brw->intelScreen->devinfo, prog, &cp->program.Base, &prog_data); /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed * by the state cache. */ int param_count = cp->program.Base.nir->num_uniforms; /* The backend also sometimes adds params for texture size. */ param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits; prog_data.base.param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.image_param = rzalloc_array(NULL, struct brw_image_param, cs->base.NumImages); prog_data.base.nr_params = param_count; prog_data.base.nr_image_params = cs->base.NumImages; brw_nir_setup_glsl_uniforms(cp->program.Base.nir, prog, &cp->program.Base, &prog_data.base, true); if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); start_time = get_time(); } if (unlikely(INTEL_DEBUG & DEBUG_CS)) brw_dump_ir("compute", prog, &cs->base, &cp->program.Base); int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &cp->program.Base, ST_CS); char *error_str; program = brw_compile_cs(brw->intelScreen->compiler, brw, mem_ctx, key, &prog_data, cp->program.Base.nir, st_index, &program_size, &error_str); if (program == NULL) { prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, error_str); _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } if (unlikely(brw->perf_debug) && cs) { if (cs->compiled_once) { _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles"); } cs->compiled_once = true; if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("CS compile took %.03f ms and stalled the GPU\n", (get_time() - start_time) * 1000); } } if (prog_data.base.total_scratch) { brw_get_scratch_bo(brw, &brw->cs.base.scratch_bo, prog_data.base.total_scratch * brw->max_cs_threads); } if (unlikely(INTEL_DEBUG & DEBUG_CS)) fprintf(stderr, "\n"); brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG, key, sizeof(*key), program, program_size, &prog_data, sizeof(prog_data), &brw->cs.base.prog_offset, &brw->cs.prog_data); ralloc_free(mem_ctx); return true; }
/** * Called via glMapBufferRange and glMapBuffer * * The goal of this extension is to allow apps to accumulate their rendering * at the same time as they accumulate their buffer object. Without it, * you'd end up blocking on execution of rendering every time you mapped * the buffer to put new data in. * * We support it in 3 ways: If unsynchronized, then don't bother * flushing the batchbuffer before mapping the buffer, which can save blocking * in many cases. If we would still block, and they allow the whole buffer * to be invalidated, then just allocate a new buffer to replace the old one. * If not, and we'd block, and they allow the subrange of the buffer to be * invalidated, then we can make a new little BO, let them write into that, * and blit it into the real BO at unmap time. */ static void * intel_bufferobj_map_range(struct gl_context * ctx, GLintptr offset, GLsizeiptr length, GLbitfield access, struct gl_buffer_object *obj) { struct intel_context *intel = intel_context(ctx); struct intel_buffer_object *intel_obj = intel_buffer_object(obj); assert(intel_obj); /* _mesa_MapBufferRange (GL entrypoint) sets these, but the vbo module also * internally uses our functions directly. */ obj->Offset = offset; obj->Length = length; obj->AccessFlags = access; if (intel_obj->sys_buffer) { const bool read_only = (access & (GL_MAP_READ_BIT | GL_MAP_WRITE_BIT)) == GL_MAP_READ_BIT; if (!read_only && intel_obj->source) release_buffer(intel_obj); if (!intel_obj->buffer || intel_obj->source) { obj->Pointer = intel_obj->sys_buffer + offset; return obj->Pointer; } free(intel_obj->sys_buffer); intel_obj->sys_buffer = NULL; } if (intel_obj->buffer == NULL) { obj->Pointer = NULL; return NULL; } /* If the access is synchronized (like a normal buffer mapping), then get * things flushed out so the later mapping syncs appropriately through GEM. * If the user doesn't care about existing buffer contents and mapping would * cause us to block, then throw out the old buffer. * * If they set INVALIDATE_BUFFER, we can pitch the current contents to * achieve the required synchronization. */ if (!(access & GL_MAP_UNSYNCHRONIZED_BIT)) { if (drm_intel_bo_references(intel->batch.bo, intel_obj->buffer)) { if (access & GL_MAP_INVALIDATE_BUFFER_BIT) { drm_intel_bo_unreference(intel_obj->buffer); intel_bufferobj_alloc_buffer(intel, intel_obj); } else { perf_debug("Stalling on the GPU for mapping a busy buffer " "object\n"); intel_flush(ctx); } } else if (drm_intel_bo_busy(intel_obj->buffer) && (access & GL_MAP_INVALIDATE_BUFFER_BIT)) { drm_intel_bo_unreference(intel_obj->buffer); intel_bufferobj_alloc_buffer(intel, intel_obj); } } /* If the user is mapping a range of an active buffer object but * doesn't require the current contents of that range, make a new * BO, and we'll copy what they put in there out at unmap or * FlushRange time. 
*/ if ((access & GL_MAP_INVALIDATE_RANGE_BIT) && drm_intel_bo_busy(intel_obj->buffer)) { if (access & GL_MAP_FLUSH_EXPLICIT_BIT) { intel_obj->range_map_buffer = malloc(length); obj->Pointer = intel_obj->range_map_buffer; } else { intel_obj->range_map_bo = drm_intel_bo_alloc(intel->bufmgr, "range map", length, 64); if (!(access & GL_MAP_READ_BIT)) { drm_intel_gem_bo_map_gtt(intel_obj->range_map_bo); } else { drm_intel_bo_map(intel_obj->range_map_bo, (access & GL_MAP_WRITE_BIT) != 0); } obj->Pointer = intel_obj->range_map_bo->virtual; } return obj->Pointer; } if (access & GL_MAP_UNSYNCHRONIZED_BIT) drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer); else if (!(access & GL_MAP_READ_BIT)) { drm_intel_gem_bo_map_gtt(intel_obj->buffer); } else { drm_intel_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0); } obj->Pointer = intel_obj->buffer->virtual + offset; return obj->Pointer; }
/** * Replace data in a subrange of buffer object. If the data range * specified by size + offset extends beyond the end of the buffer or * if data is NULL, no copy is performed. * Called via glBufferSubDataARB(). */ static void intel_bufferobj_subdata(struct gl_context * ctx, GLintptrARB offset, GLsizeiptrARB size, const GLvoid * data, struct gl_buffer_object *obj) { struct intel_context *intel = intel_context(ctx); struct intel_buffer_object *intel_obj = intel_buffer_object(obj); bool busy; if (size == 0) return; assert(intel_obj); /* If we have a single copy in system memory, update that */ if (intel_obj->sys_buffer) { if (intel_obj->source) release_buffer(intel_obj); if (intel_obj->buffer == NULL) { memcpy((char *)intel_obj->sys_buffer + offset, data, size); return; } free(intel_obj->sys_buffer); intel_obj->sys_buffer = NULL; } /* Otherwise we need to update the copy in video memory. */ busy = drm_intel_bo_busy(intel_obj->buffer) || drm_intel_bo_references(intel->batch.bo, intel_obj->buffer); /* replace the current busy bo with fresh data */ if (busy && size == intel_obj->Base.Size) { drm_intel_bo_unreference(intel_obj->buffer); intel_bufferobj_alloc_buffer(intel, intel_obj); drm_intel_bo_subdata(intel_obj->buffer, 0, size, data); } else if (intel->gen < 6) { if (busy) { drm_intel_bo *temp_bo; temp_bo = drm_intel_bo_alloc(intel->bufmgr, "subdata temp", size, 64); drm_intel_bo_subdata(temp_bo, 0, size, data); intel_emit_linear_blit(intel, intel_obj->buffer, offset, temp_bo, 0, size); drm_intel_bo_unreference(temp_bo); } else { drm_intel_bo_subdata(intel_obj->buffer, offset, size, data); } } else { /* Can't use the blit to modify the buffer in the middle of batch. */ if (drm_intel_bo_references(intel->batch.bo, intel_obj->buffer)) { intel_batchbuffer_flush(intel); } drm_intel_bo_subdata(intel_obj->buffer, offset, size, data); } }
   */
   if (brw->has_llc) {
      if (offset + size <= intel_obj->gpu_active_start ||
          intel_obj->gpu_active_end <= offset) {
         drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
         memcpy(intel_obj->buffer->virtual + offset, data, size);
         drm_intel_bo_unmap(intel_obj->buffer);

         if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
            intel_obj->prefer_stall_to_blit = true;
         return;
      }
   }

   busy =
      drm_intel_bo_busy(intel_obj->buffer) ||
      drm_intel_bo_references(brw->batch.bo, intel_obj->buffer);

   if (busy) {
      if (size == intel_obj->Base.Size) {
         /* Replace the current busy bo so the subdata doesn't stall. */
         drm_intel_bo_unreference(intel_obj->buffer);
         intel_bufferobj_alloc_buffer(brw, intel_obj);
      } else if (!intel_obj->prefer_stall_to_blit) {
         perf_debug("Using a blit copy to avoid stalling on "
                    "glBufferSubData(%ld, %ld) (%ldkb) to a busy "
                    "(%d-%d) buffer object.\n",
                    (long)offset, (long)offset + size, (long)(size/1024),
                    intel_obj->gpu_active_start,
                    intel_obj->gpu_active_end);
         drm_intel_bo *temp_bo =
static bool intel_blit_texsubimage(struct gl_context * ctx, struct gl_texture_image *texImage, GLint xoffset, GLint yoffset, GLint width, GLint height, GLenum format, GLenum type, const void *pixels, const struct gl_pixelstore_attrib *packing) { struct intel_context *intel = intel_context(ctx); struct intel_texture_image *intelImage = intel_texture_image(texImage); /* Try to do a blit upload of the subimage if the texture is * currently busy. */ if (!intelImage->mt) return false; /* The blitter can't handle Y tiling */ if (intelImage->mt->region->tiling == I915_TILING_Y) return false; if (texImage->TexObject->Target != GL_TEXTURE_2D) return false; if (!drm_intel_bo_busy(intelImage->mt->region->bo)) return false; DBG("BLT subimage %s target %s level %d offset %d,%d %dx%d\n", __func__, _mesa_enum_to_string(texImage->TexObject->Target), texImage->Level, xoffset, yoffset, width, height); pixels = _mesa_validate_pbo_teximage(ctx, 2, width, height, 1, format, type, pixels, packing, "glTexSubImage"); if (!pixels) return false; struct intel_mipmap_tree *temp_mt = intel_miptree_create(intel, GL_TEXTURE_2D, texImage->TexFormat, 0, 0, width, height, 1, false, INTEL_MIPTREE_TILING_NONE); if (!temp_mt) goto err; GLubyte *dst = intel_miptree_map_raw(intel, temp_mt); if (!dst) goto err; if (!_mesa_texstore(ctx, 2, texImage->_BaseFormat, texImage->TexFormat, temp_mt->region->pitch, &dst, width, height, 1, format, type, pixels, packing)) { _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage"); } intel_miptree_unmap_raw(temp_mt); bool ret; ret = intel_miptree_blit(intel, temp_mt, 0, 0, 0, 0, false, intelImage->mt, texImage->Level, texImage->Face, xoffset, yoffset, false, width, height, COLOR_LOGICOP_COPY); assert(ret); intel_miptree_release(&temp_mt); _mesa_unmap_teximage_pbo(ctx, packing); return ret; err: _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage"); intel_miptree_release(&temp_mt); _mesa_unmap_teximage_pbo(ctx, packing); return false; }
static bool brw_codegen_cs_prog(struct brw_context *brw, struct gl_shader_program *prog, struct brw_program *cp, struct brw_cs_prog_key *key) { const struct gen_device_info *devinfo = &brw->screen->devinfo; struct gl_context *ctx = &brw->ctx; const GLuint *program; void *mem_ctx = ralloc_context(NULL); GLuint program_size; struct brw_cs_prog_data prog_data; bool start_busy = false; double start_time = 0; struct brw_shader *cs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE]; assert (cs); memset(&prog_data, 0, sizeof(prog_data)); if (prog->Comp.SharedSize > 64 * 1024) { prog->LinkStatus = false; const char *error_str = "Compute shader used more than 64KB of shared variables"; ralloc_strcat(&prog->InfoLog, error_str); _mesa_problem(NULL, "Failed to link compute shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } else { prog_data.base.total_shared = prog->Comp.SharedSize; } assign_cs_binding_table_offsets(devinfo, prog, &cp->program, &prog_data); /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed * by the state cache. */ int param_count = cp->program.nir->num_uniforms / 4; /* The backend also sometimes add a param for the thread local id. */ prog_data.thread_local_id_index = param_count++; /* The backend also sometimes adds params for texture size. */ param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits; prog_data.base.param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); prog_data.base.image_param = rzalloc_array(NULL, struct brw_image_param, cs->base.NumImages); prog_data.base.nr_params = param_count; prog_data.base.nr_image_params = cs->base.NumImages; brw_nir_setup_glsl_uniforms(cp->program.nir, prog, &cp->program, &prog_data.base, true); if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); start_time = get_time(); } if (unlikely(INTEL_DEBUG & DEBUG_CS)) brw_dump_ir("compute", prog, &cs->base, &cp->program); int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &cp->program, ST_CS); char *error_str; program = brw_compile_cs(brw->screen->compiler, brw, mem_ctx, key, &prog_data, cp->program.nir, st_index, &program_size, &error_str); if (program == NULL) { prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, error_str); _mesa_problem(NULL, "Failed to compile compute shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } if (unlikely(brw->perf_debug) && cs) { if (cs->compiled_once) { _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles"); } cs->compiled_once = true; if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("CS compile took %.03f ms and stalled the GPU\n", (get_time() - start_time) * 1000); } } const unsigned subslices = MAX2(brw->screen->subslice_total, 1); /* WaCSScratchSize:hsw * * Haswell's scratch space address calculation appears to be sparse * rather than tightly packed. The Thread ID has bits indicating * which subslice, EU within a subslice, and thread within an EU * it is. There's a maximum of two slices and two subslices, so these * can be stored with a single bit. Even though there are only 10 EUs * per subslice, this is stored in 4 bits, so there's an effective * maximum value of 16 EUs. 
Similarly, although there are only 7 * threads per EU, this is stored in a 3 bit number, giving an effective * maximum value of 8 threads per EU. * * This means that we need to use 16 * 8 instead of 10 * 7 for the * number of threads per subslice. */ const unsigned scratch_ids_per_subslice = brw->is_haswell ? 16 * 8 : devinfo->max_cs_threads; brw_alloc_stage_scratch(brw, &brw->cs.base, prog_data.base.total_scratch, scratch_ids_per_subslice * subslices); if (unlikely(INTEL_DEBUG & DEBUG_CS)) fprintf(stderr, "\n"); brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG, key, sizeof(*key), program, program_size, &prog_data, sizeof(prog_data), &brw->cs.base.prog_offset, &brw->cs.base.prog_data); ralloc_free(mem_ctx); return true; }
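The sparse-ID point in the WaCSScratchSize comment is easiest to see with numbers. The small illustrative calculation below uses an assumed two-subslice Haswell part and an assumed 2 KB of scratch per thread; only the 16 * 8 versus 10 * 7 ID counts come from the comment above.

/* Worked example of the scratch sizing described above (device counts and
 * per-thread size are assumptions for illustration only). */
const unsigned subslices = 2;
const unsigned scratch_ids_per_subslice = 16 * 8;   /* 128 sparse IDs, not 10 * 7 = 70 */
const unsigned per_thread_scratch = 2 * 1024;       /* 2 KB, hypothetical */
const unsigned total_scratch = per_thread_scratch *
                               scratch_ids_per_subslice * subslices;   /* 512 KiB */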