/**
 * Implementation of up or downsampling for window-system MSAA miptrees.
 */
void
brw_meta_updownsample(struct brw_context *brw,
                      struct intel_mipmap_tree *src_mt,
                      struct intel_mipmap_tree *dst_mt)
{
   struct gl_context *ctx = &brw->ctx;
   GLuint fbos[2], src_rbo, dst_rbo, src_fbo, dst_fbo;
   GLenum drawbuffer;
   GLbitfield attachment, blit_bit;

   if (_mesa_get_format_base_format(src_mt->format) == GL_DEPTH_COMPONENT ||
       _mesa_get_format_base_format(src_mt->format) == GL_DEPTH_STENCIL) {
      attachment = GL_DEPTH_ATTACHMENT;
      drawbuffer = GL_NONE;
      blit_bit = GL_DEPTH_BUFFER_BIT;
   } else {
      attachment = GL_COLOR_ATTACHMENT0;
      drawbuffer = GL_COLOR_ATTACHMENT0;
      blit_bit = GL_COLOR_BUFFER_BIT;
   }

   brw_emit_mi_flush(brw);

   _mesa_meta_begin(ctx, MESA_META_ALL);
   _mesa_GenFramebuffers(2, fbos);
   src_rbo = brw_get_rb_for_slice(brw, src_mt, 0, 0, false);
   dst_rbo = brw_get_rb_for_slice(brw, dst_mt, 0, 0, false);
   src_fbo = fbos[0];
   dst_fbo = fbos[1];

   _mesa_BindFramebuffer(GL_READ_FRAMEBUFFER, src_fbo);
   _mesa_FramebufferRenderbuffer(GL_READ_FRAMEBUFFER, attachment,
                                 GL_RENDERBUFFER, src_rbo);
   _mesa_ReadBuffer(drawbuffer);

   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_fbo);
   _mesa_FramebufferRenderbuffer(GL_DRAW_FRAMEBUFFER, attachment,
                                 GL_RENDERBUFFER, dst_rbo);
   _mesa_DrawBuffer(drawbuffer);

   _mesa_BlitFramebuffer(0, 0, src_mt->logical_width0, src_mt->logical_height0,
                         0, 0, dst_mt->logical_width0, dst_mt->logical_height0,
                         blit_bit, GL_NEAREST);

   _mesa_DeleteRenderbuffers(1, &src_rbo);
   _mesa_DeleteRenderbuffers(1, &dst_rbo);
   _mesa_DeleteFramebuffers(2, fbos);

   _mesa_meta_end(ctx);

   brw_emit_mi_flush(brw);
}
static void
intel_texture_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);

   brw_emit_mi_flush(brw);
}
/**
 * When the GS is not in use, we assign the entire URB space to the VS.  When
 * the GS is in use, we split the URB space evenly between the VS and the GS.
 * This is not ideal, but it's simple.
 *
 *           URB size / 2                     URB size / 2
 *   _____________-______________   _____________-______________
 *  /                             \ /                            \
 * +-------------------------------------------------------------+
 * |     Vertex Shader Entries     |   Geometry Shader Entries    |
 * +-------------------------------------------------------------+
 *
 * Sandybridge GT1 has 32kB of URB space, while GT2 has 64kB.
 * (See the Sandybridge PRM, Volume 2, Part 1, Section 1.4.7: 3DSTATE_URB.)
 */
void
gen6_upload_urb(struct brw_context *brw, unsigned vs_size,
                bool gs_present, unsigned gs_size)
{
   int nr_vs_entries, nr_gs_entries;
   int total_urb_size = brw->urb.size * 1024; /* in bytes */
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* Calculate how many entries fit in each stage's section of the URB */
   if (gs_present) {
      nr_vs_entries = (total_urb_size / 2) / (vs_size * 128);
      nr_gs_entries = (total_urb_size / 2) / (gs_size * 128);
   } else {
      nr_vs_entries = total_urb_size / (vs_size * 128);
      nr_gs_entries = 0;
   }

   /* Then clamp to the maximum allowed by the hardware */
   if (nr_vs_entries > devinfo->urb.max_vs_entries)
      nr_vs_entries = devinfo->urb.max_vs_entries;

   if (nr_gs_entries > devinfo->urb.max_gs_entries)
      nr_gs_entries = devinfo->urb.max_gs_entries;

   /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
   brw->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);

   assert(brw->urb.nr_vs_entries >= devinfo->urb.min_vs_entries);
   assert(brw->urb.nr_vs_entries % 4 == 0);
   assert(brw->urb.nr_gs_entries % 4 == 0);
   assert(vs_size <= 5);
   assert(gs_size <= 5);

   BEGIN_BATCH(3);
   OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
   OUT_BATCH(((vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) |
             ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT));
   OUT_BATCH(((gs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) |
             ((brw->urb.nr_gs_entries) << GEN6_URB_GS_ENTRIES_SHIFT));
   ADVANCE_BATCH();

   /* From the PRM Volume 2 part 1, section 1.4.7:
    *
    *   Because of a urb corruption caused by allocating a previous gsunit's
    *   urb entry to vsunit software is required to send a "GS NULL
    *   Fence" (Send URB fence with VS URB size == 1 and GS URB size == 0)
    *   plus a dummy DRAW call before any case where VS will be taking over
    *   GS URB space.
    *
    * It is not clear exactly what this means ("URB fence" is a command that
    * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
    * a workaround.
    */
   if (brw->urb.gs_present && !gs_present)
      brw_emit_mi_flush(brw);
   brw->urb.gs_present = gs_present;
}
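/* Worked example of the URB arithmetic above (a sketch for illustration only,
 * not part of the driver): on a Sandybridge GT1 with a 32kB URB and 128-byte
 * URB rows, a VS entry size of 2 rows with no GS gives
 * 32768 / (2 * 128) = 128 VS entries before clamping; with a GS of the same
 * entry size, each stage gets 16384 / 256 = 64 entries.  The hypothetical
 * helper below just restates that per-stage division.
 */
static inline int
urb_entries_for_stage(int stage_bytes, unsigned entry_size_rows)
{
   /* Each URB entry occupies entry_size_rows rows of 128 bytes. */
   return stage_bytes / (entry_size_rows * 128);
}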
void
brw_meta_resolve_color(struct brw_context *brw,
                       struct intel_mipmap_tree *mt)
{
   struct gl_context *ctx = &brw->ctx;
   GLuint fbo;
   struct gl_renderbuffer *rb;
   struct rect rect;

   brw_emit_mi_flush(brw);

   _mesa_meta_begin(ctx, MESA_META_ALL);

   _mesa_GenFramebuffers(1, &fbo);
   rb = brw_get_rb_for_slice(brw, mt, 0, 0, false);

   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo);
   _mesa_framebuffer_renderbuffer(ctx, ctx->DrawBuffer, GL_COLOR_ATTACHMENT0, rb);
   _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0);

   brw_fast_clear_init(brw);

   use_rectlist(brw, true);

   brw_bind_rep_write_shader(brw, (float *) fast_clear_color);

   /* SKL+ also has a resolve mode for compressed render targets and thus more
    * bits to let us select the type of resolve.  For fast clear resolves, it
    * turns out we can use the same value as pre-SKL though.
    */
   if (intel_miptree_is_lossless_compressed(brw, mt))
      set_fast_clear_op(brw, GEN9_PS_RENDER_TARGET_RESOLVE_FULL);
   else
      set_fast_clear_op(brw, GEN7_PS_RENDER_TARGET_RESOLVE_ENABLE);

   mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
   get_resolve_rect(brw, mt, &rect);

   brw_draw_rectlist(brw, &rect, 1);

   set_fast_clear_op(brw, 0);
   use_rectlist(brw, false);

   _mesa_reference_renderbuffer(&rb, NULL);
   _mesa_DeleteFramebuffers(1, &fbo);

   _mesa_meta_end(ctx);

   /* We're typically called from intel_update_state() and we're supposed to
    * return with the state all updated to what it was before
    * brw_meta_resolve_color() was called.  The meta rendering will have
    * messed up the state and we need to call _mesa_update_state() again to
    * get back to where we were supposed to be when resolve was called.
    */
   if (ctx->NewState)
      _mesa_update_state(ctx);
}
static void
brw_fence_insert(struct brw_context *brw, struct brw_fence *fence)
{
   assert(!fence->batch_bo);
   assert(!fence->signalled);

   brw_emit_mi_flush(brw);
   fence->batch_bo = brw->batch.bo;
   drm_intel_bo_reference(fence->batch_bo);
   intel_batchbuffer_flush(brw);
}
void
brw_end_transform_feedback(struct gl_context *ctx,
                           struct gl_transform_feedback_object *obj)
{
   /* After EndTransformFeedback, it's likely that the client program will
    * try to draw using the contents of the transform feedback buffer as
    * vertex input.  In order for this to work, we need to flush the data
    * through at least the GS stage of the pipeline, and flush out the
    * render cache.  For simplicity, just do a full flush.
    */
   struct brw_context *brw = brw_context(ctx);

   brw_emit_mi_flush(brw);
}
void
brw_meta_resolve_color(struct brw_context *brw,
                       struct intel_mipmap_tree *mt)
{
   struct gl_context *ctx = &brw->ctx;
   GLuint fbo, rbo;
   struct rect rect;

   brw_emit_mi_flush(brw);

   _mesa_meta_begin(ctx, MESA_META_ALL);

   _mesa_GenFramebuffers(1, &fbo);
   rbo = brw_get_rb_for_slice(brw, mt, 0, 0, false);

   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo);
   _mesa_FramebufferRenderbuffer(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
                                 GL_RENDERBUFFER, rbo);
   _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0);

   brw_fast_clear_init(brw);

   use_rectlist(brw, true);

   brw_bind_rep_write_shader(brw, (float *) fast_clear_color);
   set_fast_clear_op(brw, GEN7_PS_RENDER_TARGET_RESOLVE_ENABLE);
   mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;

   get_resolve_rect(brw, mt, &rect);

   brw_draw_rectlist(ctx, &rect, 1);

   set_fast_clear_op(brw, 0);
   use_rectlist(brw, false);

   _mesa_DeleteRenderbuffers(1, &rbo);
   _mesa_DeleteFramebuffers(1, &fbo);

   _mesa_meta_end(ctx);

   /* We're typically called from intel_update_state() and we're supposed to
    * return with the state all updated to what it was before
    * brw_meta_resolve_color() was called.  The meta rendering will have
    * messed up the state and we need to call _mesa_update_state() again to
    * get back to where we were supposed to be when resolve was called.
    */
   if (ctx->NewState)
      _mesa_update_state(ctx);
}
/**
 * Store the SO_NUM_PRIMS_WRITTEN counters for each stream (4 uint64_t values)
 * to prim_count_bo.
 */
static void
save_prim_start_values(struct brw_context *brw,
                       struct brw_transform_feedback_object *obj)
{
   /* Flush any drawing so that the counters have the right values. */
   brw_emit_mi_flush(brw);

   /* Emit MI_STORE_REGISTER_MEM commands to write the values. */
   for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
      brw_store_register_mem64(brw, obj->prim_count_bo,
                               GEN7_SO_NUM_PRIMS_WRITTEN(i),
                               START_OFFSET + i * sizeof(uint64_t));
   }
}
static void
intel_texture_barrier(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (devinfo->gen >= 6) {
      brw_emit_pipe_control_flush(brw,
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

      brw_emit_pipe_control_flush(brw,
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
   } else {
      brw_emit_mi_flush(brw);
   }
}
static void
intel_get_tex_sub_image(struct gl_context *ctx,
                        GLint xoffset, GLint yoffset, GLint zoffset,
                        GLsizei width, GLsizei height, GLint depth,
                        GLenum format, GLenum type, GLvoid *pixels,
                        struct gl_texture_image *texImage)
{
   struct brw_context *brw = brw_context(ctx);
   bool ok;

   DBG("%s\n", __func__);

   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
      if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage,
                                        xoffset, yoffset, zoffset,
                                        width, height, depth, format, type,
                                        pixels, &ctx->Pack)) {
         /* Flush to guarantee coherency between the render cache and other
          * caches the PBO could potentially be bound to after this point.
          * See the related comment in intelReadPixels() for a more detailed
          * explanation.
          */
         brw_emit_mi_flush(brw);
         return;
      }

      perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
   }

   ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, xoffset, yoffset,
                                          width, height,
                                          format, type, pixels, &ctx->Pack);
   if (ok)
      return;

   _mesa_meta_GetTexSubImage(ctx, xoffset, yoffset, zoffset,
                             width, height, depth,
                             format, type, pixels, texImage);

   DBG("%s - DONE\n", __func__);
}
static void
brw_emit_prim(struct brw_context *brw,
              const struct _mesa_prim *prim,
              uint32_t hw_prim)
{
   int verts_per_instance;
   int vertex_access_type;
   int indirect_flag;

   DBG("PRIM: %s %d %d\n", _mesa_enum_to_string(prim->mode),
       prim->start, prim->count);

   int start_vertex_location = prim->start;
   int base_vertex_location = prim->basevertex;

   if (prim->indexed) {
      vertex_access_type = brw->gen >= 7 ?
         GEN7_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM :
         GEN4_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM;
      start_vertex_location += brw->ib.start_vertex_offset;
      base_vertex_location += brw->vb.start_vertex_bias;
   } else {
      vertex_access_type = brw->gen >= 7 ?
         GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL :
         GEN4_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL;
      start_vertex_location += brw->vb.start_vertex_bias;
   }

   /* We only need to trim the primitive count on pre-Gen6. */
   if (brw->gen < 6)
      verts_per_instance = trim(prim->mode, prim->count);
   else
      verts_per_instance = prim->count;

   /* If nothing to emit, just return. */
   if (verts_per_instance == 0 && !prim->is_indirect)
      return;

   /* If we're set to always flush, do it before and after the primitive emit.
    * We want to catch both missed flushes that hurt instruction/state cache
    * and missed flushes of the render cache as it heads to other parts of
    * the hardware besides the draw code.
    */
   if (brw->always_flush_cache)
      brw_emit_mi_flush(brw);

   /* If indirect, emit a bunch of loads from the indirect BO. */
   if (prim->is_indirect) {
      struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer;
      drm_intel_bo *bo = intel_bufferobj_buffer(brw,
            intel_buffer_object(indirect_buffer),
            prim->indirect_offset, 5 * sizeof(GLuint));

      indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE;

      brw_load_register_mem(brw, GEN7_3DPRIM_VERTEX_COUNT, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            prim->indirect_offset + 0);
      brw_load_register_mem(brw, GEN7_3DPRIM_INSTANCE_COUNT, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            prim->indirect_offset + 4);

      brw_load_register_mem(brw, GEN7_3DPRIM_START_VERTEX, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            prim->indirect_offset + 8);
      if (prim->indexed) {
         brw_load_register_mem(brw, GEN7_3DPRIM_BASE_VERTEX, bo,
                               I915_GEM_DOMAIN_VERTEX, 0,
                               prim->indirect_offset + 12);
         brw_load_register_mem(brw, GEN7_3DPRIM_START_INSTANCE, bo,
                               I915_GEM_DOMAIN_VERTEX, 0,
                               prim->indirect_offset + 16);
      } else {
         brw_load_register_mem(brw, GEN7_3DPRIM_START_INSTANCE, bo,
                               I915_GEM_DOMAIN_VERTEX, 0,
                               prim->indirect_offset + 12);
         BEGIN_BATCH(3);
         OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
         OUT_BATCH(GEN7_3DPRIM_BASE_VERTEX);
         OUT_BATCH(0);
         ADVANCE_BATCH();
      }
   } else {
      indirect_flag = 0;
   }

   BEGIN_BATCH(brw->gen >= 7 ? 7 : 6);

   if (brw->gen >= 7) {
      const int predicate_enable =
         (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT)
         ? GEN7_3DPRIM_PREDICATE_ENABLE : 0;

      OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag | predicate_enable);
      OUT_BATCH(hw_prim | vertex_access_type);
   } else {
      OUT_BATCH(CMD_3D_PRIM << 16 | (6 - 2) |
                hw_prim << GEN4_3DPRIM_TOPOLOGY_TYPE_SHIFT |
                vertex_access_type);
   }
   OUT_BATCH(verts_per_instance);
   OUT_BATCH(start_vertex_location);
   OUT_BATCH(prim->num_instances);
   OUT_BATCH(prim->base_instance);
   OUT_BATCH(base_vertex_location);
   ADVANCE_BATCH();

   if (brw->always_flush_cache)
      brw_emit_mi_flush(brw);
}
void
brw_blorp_exec(struct brw_context *brw, const brw_blorp_params *params)
{
   struct gl_context *ctx = &brw->ctx;
   uint32_t estimated_max_batch_usage = 1500;
   bool check_aperture_failed_once = false;

   /* Flush the sampler and render caches.  We definitely need to flush the
    * sampler cache so that we get updated contents from the render cache for
    * the glBlitFramebuffer() source.  Also, we are sometimes warned in the
    * docs to flush the cache between reinterpretations of the same surface
    * data with different formats, which blorp does for stencil and depth
    * data.
    */
   brw_emit_mi_flush(brw);

retry:
   intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
   intel_batchbuffer_save_state(brw);
   drm_intel_bo *saved_bo = brw->batch.bo;
   uint32_t saved_used = USED_BATCH(brw->batch);
   uint32_t saved_state_batch_offset = brw->batch.state_batch_offset;

   switch (brw->gen) {
   case 6:
      gen6_blorp_exec(brw, params);
      break;
   case 7:
      gen7_blorp_exec(brw, params);
      break;
   default:
      /* BLORP is not supported before Gen6. */
      unreachable("not reached");
   }

   /* Make sure we didn't wrap the batch unintentionally, and make sure we
    * reserved enough space that a wrap will never happen.
    */
   assert(brw->batch.bo == saved_bo);
   assert((USED_BATCH(brw->batch) - saved_used) * 4 +
          (saved_state_batch_offset - brw->batch.state_batch_offset) <
          estimated_max_batch_usage);
   /* Shut up compiler warnings on release build */
   (void)saved_bo;
   (void)saved_used;
   (void)saved_state_batch_offset;

   /* Check if the blorp op we just did would make our batch likely to fail to
    * map all the BOs into the GPU at batch exec time later.  If so, flush the
    * batch and try again with nothing else in the batch.
    */
   if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
      if (!check_aperture_failed_once) {
         check_aperture_failed_once = true;
         intel_batchbuffer_reset_to_saved(brw);
         intel_batchbuffer_flush(brw);
         goto retry;
      } else {
         int ret = intel_batchbuffer_flush(brw);
         WARN_ONCE(ret == -ENOSPC,
                   "i965: blorp emit exceeded available aperture space\n");
      }
   }

   if (unlikely(brw->always_flush_batch))
      intel_batchbuffer_flush(brw);

   /* We've smashed all state compared to what the normal 3D pipeline
    * rendering tracks for GL.
    */
   brw->ctx.NewDriverState = ~0ull;
   brw->no_depth_or_stencil = false;
   brw->ib.type = -1;

   /* Flush the sampler cache so any texturing from the destination is
    * coherent.
    */
   brw_emit_mi_flush(brw);
}
bool
brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
                    GLbitfield buffers, bool partial_clear)
{
   struct gl_context *ctx = &brw->ctx;
   mesa_format format;
   enum { FAST_CLEAR, REP_CLEAR, PLAIN_CLEAR } clear_type;
   GLbitfield plain_clear_buffers, meta_save, rep_clear_buffers,
      fast_clear_buffers;
   struct rect fast_clear_rect, clear_rect;
   int layers;

   fast_clear_buffers = rep_clear_buffers = plain_clear_buffers = 0;

   /* First we loop through the color draw buffers and determine which ones
    * can be fast cleared, which ones can use the replicated write and which
    * ones have to fall back to regular color clear.
    */
   for (unsigned buf = 0; buf < fb->_NumColorDrawBuffers; buf++) {
      struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[buf];
      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
      int index = fb->_ColorDrawBufferIndexes[buf];

      /* Only clear the buffers present in the provided mask */
      if (((1 << index) & buffers) == 0)
         continue;

      /* If this is an ES2 context or GL_ARB_ES2_compatibility is supported,
       * the framebuffer can be complete with some attachments missing.  In
       * this case the _ColorDrawBuffers pointer will be NULL.
       */
      if (rb == NULL)
         continue;

      clear_type = FAST_CLEAR;

      /* We don't have fast clear until gen7. */
      if (brw->gen < 7)
         clear_type = REP_CLEAR;

      if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_NO_MCS)
         clear_type = REP_CLEAR;

      /* We can't do scissored fast clears because of the restrictions on the
       * fast clear rectangle size.
       */
      if (partial_clear)
         clear_type = REP_CLEAR;

      /* Fast clear is only supported for colors where all components are
       * either 0 or 1.
       */
      format = _mesa_get_render_format(ctx, irb->mt->format);
      if (!is_color_fast_clear_compatible(brw, format, &ctx->Color.ClearColor))
         clear_type = REP_CLEAR;

      /* From the SNB PRM (Vol4_Part1):
       *
       *     "Replicated data (Message Type = 111) is only supported when
       *      accessing tiled memory.  Using this Message Type to access
       *      linear (untiled) memory is UNDEFINED."
       */
      if (irb->mt->tiling == I915_TILING_NONE) {
         perf_debug("Falling back to plain clear because %dx%d buffer is untiled\n",
                    irb->mt->logical_width0, irb->mt->logical_height0);
         clear_type = PLAIN_CLEAR;
      }

      /* Constant color writes ignore everything in blend and color calculator
       * state.  This is not documented.
       */
      GLubyte *color_mask = ctx->Color.ColorMask[buf];
      for (int i = 0; i < 4; i++) {
         if (_mesa_format_has_color_component(irb->mt->format, i) &&
             !color_mask[i]) {
            perf_debug("Falling back to plain clear on %dx%d buffer because of color mask\n",
                       irb->mt->logical_width0, irb->mt->logical_height0);
            clear_type = PLAIN_CLEAR;
         }
      }

      /* Allocate the MCS for non MSRT surfaces now if we're doing a fast
       * clear and we don't have the MCS yet.  On failure, fall back to
       * replicated clear.
       */
      if (clear_type == FAST_CLEAR && irb->mt->mcs_mt == NULL)
         if (!intel_miptree_alloc_non_msrt_mcs(brw, irb->mt))
            clear_type = REP_CLEAR;

      switch (clear_type) {
      case FAST_CLEAR:
         irb->mt->fast_clear_color_value =
            compute_fast_clear_color_bits(&ctx->Color.ClearColor);
         irb->need_downsample = true;

         /* If the buffer is already in INTEL_FAST_CLEAR_STATE_CLEAR, the
          * clear is redundant and can be skipped.  Only skip after we've
          * updated the fast clear color above though.
          */
         if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_CLEAR)
            continue;

         /* Set fast_clear_state to RESOLVED so we don't try resolve them when
          * we draw, in case the mt is also bound as a texture.
          */
         irb->mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
         irb->need_downsample = true;
         fast_clear_buffers |= 1 << index;
         get_fast_clear_rect(brw, fb, irb, &fast_clear_rect);
         break;

      case REP_CLEAR:
         rep_clear_buffers |= 1 << index;
         get_buffer_rect(brw, fb, irb, &clear_rect);
         break;

      case PLAIN_CLEAR:
         plain_clear_buffers |= 1 << index;
         get_buffer_rect(brw, fb, irb, &clear_rect);
         continue;
      }
   }

   if (!(fast_clear_buffers | rep_clear_buffers)) {
      if (plain_clear_buffers)
         /* If we only have plain clears, skip the meta save/restore. */
         goto out;
      else
         /* Nothing left to do.  This happens when we hit the redundant fast
          * clear case above and nothing else.
          */
         return true;
   }

   meta_save =
      MESA_META_ALPHA_TEST |
      MESA_META_BLEND |
      MESA_META_DEPTH_TEST |
      MESA_META_RASTERIZATION |
      MESA_META_SHADER |
      MESA_META_STENCIL_TEST |
      MESA_META_VERTEX |
      MESA_META_VIEWPORT |
      MESA_META_CLIP |
      MESA_META_CLAMP_FRAGMENT_COLOR |
      MESA_META_MULTISAMPLE |
      MESA_META_OCCLUSION_QUERY |
      MESA_META_DRAW_BUFFERS;

   _mesa_meta_begin(ctx, meta_save);

   if (!brw_fast_clear_init(brw)) {
      /* This is going to be hard to recover from, most likely out of memory.
       * Bail and let meta try and (probably) fail for us.
       */
      plain_clear_buffers = buffers;
      goto bail_to_meta;
   }

   /* Clears never have the color clamped. */
   if (ctx->Extensions.ARB_color_buffer_float)
      _mesa_ClampColor(GL_CLAMP_FRAGMENT_COLOR, GL_FALSE);

   _mesa_set_enable(ctx, GL_DEPTH_TEST, GL_FALSE);
   _mesa_DepthMask(GL_FALSE);
   _mesa_set_enable(ctx, GL_STENCIL_TEST, GL_FALSE);

   use_rectlist(brw, true);

   layers = MAX2(1, fb->MaxNumLayers);
   if (fast_clear_buffers) {
      _mesa_meta_drawbuffers_from_bitfield(fast_clear_buffers);
      brw_bind_rep_write_shader(brw, (float *) fast_clear_color);
      set_fast_clear_op(brw, GEN7_PS_RENDER_TARGET_FAST_CLEAR_ENABLE);
      brw_draw_rectlist(ctx, &fast_clear_rect, layers);
      set_fast_clear_op(brw, 0);
   }

   if (rep_clear_buffers) {
      _mesa_meta_drawbuffers_from_bitfield(rep_clear_buffers);
      brw_bind_rep_write_shader(brw, ctx->Color.ClearColor.f);
      brw_draw_rectlist(ctx, &clear_rect, layers);
   }

   /* Now set the mts we cleared to INTEL_FAST_CLEAR_STATE_CLEAR so we'll
    * resolve them eventually.
    */
   for (unsigned buf = 0; buf < fb->_NumColorDrawBuffers; buf++) {
      struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[buf];
      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
      int index = fb->_ColorDrawBufferIndexes[buf];

      if ((1 << index) & fast_clear_buffers)
         irb->mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_CLEAR;
   }

bail_to_meta:
   /* Dirty _NEW_BUFFERS so we reemit SURFACE_STATE which sets the fast clear
    * color before resolve and sets irb->mt->fast_clear_state to UNRESOLVED if
    * we render to it.
    */
   brw->NewGLState |= _NEW_BUFFERS;

   /* Set the custom state back to normal and dirty the same bits as above */
   use_rectlist(brw, false);
   _mesa_meta_end(ctx);

   /* From BSpec: Render Target Fast Clear:
    *
    *     After Render target fast clear, pipe-control with color cache
    *     write-flush must be issued before sending any DRAW commands on that
    *     render target.
    */
   brw_emit_mi_flush(brw);

   /* If we had to fall back to plain clear for any buffers, clear those now
    * by calling into meta.
    */
out:
   if (plain_clear_buffers)
      _mesa_meta_glsl_Clear(&brw->ctx, plain_clear_buffers);

   return true;
}
/**
 * When the GS is not in use, we assign the entire URB space to the VS.  When
 * the GS is in use, we split the URB space evenly between the VS and the GS.
 * This is not ideal, but it's simple.
 *
 *           URB size / 2                     URB size / 2
 *   _____________-______________   _____________-______________
 *  /                             \ /                            \
 * +-------------------------------------------------------------+
 * |     Vertex Shader Entries     |   Geometry Shader Entries    |
 * +-------------------------------------------------------------+
 *
 * Sandybridge GT1 has 32kB of URB space, while GT2 has 64kB.
 * (See the Sandybridge PRM, Volume 2, Part 1, Section 1.4.7: 3DSTATE_URB.)
 */
static void
gen6_upload_urb(struct brw_context *brw)
{
   int nr_vs_entries, nr_gs_entries;
   int total_urb_size = brw->urb.size * 1024; /* in bytes */
   bool gs_present = brw->ff_gs.prog_active || brw->geometry_program;

   /* BRW_NEW_VS_PROG_DATA */
   unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, 1);

   /* When using GS to do transform feedback only, we use the same VUE layout
    * for VS outputs and GS outputs (as it's what the SF and Clipper expect),
    * so we can simply make the GS URB entry size the same as for the VS.
    * This may technically be too large in cases where we have few vertex
    * attributes and a lot of varyings, since the VS size is determined by
    * the larger of the two.  For now, it's safe.
    *
    * For user-provided GS the assumption above does not hold since the GS
    * outputs can be different from the VS outputs.
    */
   unsigned gs_size = vs_size;
   if (brw->geometry_program) {
      gs_size = brw->gs.prog_data->base.urb_entry_size;
      assert(gs_size >= 1);
   }

   /* Calculate how many entries fit in each stage's section of the URB */
   if (gs_present) {
      nr_vs_entries = (total_urb_size / 2) / (vs_size * 128);
      nr_gs_entries = (total_urb_size / 2) / (gs_size * 128);
   } else {
      nr_vs_entries = total_urb_size / (vs_size * 128);
      nr_gs_entries = 0;
   }

   /* Then clamp to the maximum allowed by the hardware */
   if (nr_vs_entries > brw->urb.max_vs_entries)
      nr_vs_entries = brw->urb.max_vs_entries;

   if (nr_gs_entries > brw->urb.max_gs_entries)
      nr_gs_entries = brw->urb.max_gs_entries;

   /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
   brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
   brw->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);

   assert(brw->urb.nr_vs_entries >= brw->urb.min_vs_entries);
   assert(brw->urb.nr_vs_entries % 4 == 0);
   assert(brw->urb.nr_gs_entries % 4 == 0);
   assert(vs_size <= 5);
   assert(gs_size <= 5);

   BEGIN_BATCH(3);
   OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
   OUT_BATCH(((vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) |
             ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT));
   OUT_BATCH(((gs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) |
             ((brw->urb.nr_gs_entries) << GEN6_URB_GS_ENTRIES_SHIFT));
   ADVANCE_BATCH();

   /* From the PRM Volume 2 part 1, section 1.4.7:
    *
    *   Because of a urb corruption caused by allocating a previous gsunit's
    *   urb entry to vsunit software is required to send a "GS NULL
    *   Fence" (Send URB fence with VS URB size == 1 and GS URB size == 0)
    *   plus a dummy DRAW call before any case where VS will be taking over
    *   GS URB space.
    *
    * It is not clear exactly what this means ("URB fence" is a command that
    * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
    * a workaround.
    */
   if (brw->urb.gs_present && !gs_present)
      brw_emit_mi_flush(brw);
   brw->urb.gs_present = gs_present;
}
bool
intelEmitImmediateColorExpandBlit(struct brw_context *brw,
                                  GLuint cpp,
                                  GLubyte *src_bits, GLuint src_size,
                                  GLuint fg_color,
                                  GLshort dst_pitch,
                                  struct brw_bo *dst_buffer,
                                  GLuint dst_offset,
                                  enum isl_tiling dst_tiling,
                                  GLshort x, GLshort y,
                                  GLshort w, GLshort h,
                                  enum gl_logicop_mode logic_op)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   int dwords = ALIGN(src_size, 8) / 4;
   uint32_t opcode, br13, blit_cmd;

   if (dst_tiling != ISL_TILING_LINEAR) {
      if (dst_offset & 4095)
         return false;
      if (dst_tiling == ISL_TILING_Y0)
         return false;
   }

   assert((unsigned) logic_op <= 0x0f);
   assert(dst_pitch > 0);

   if (w < 0 || h < 0)
      return true;

   DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d, %d bytes %d dwords\n",
       __func__,
       dst_buffer, dst_pitch, dst_offset, x, y, w, h, src_size, dwords);

   unsigned xy_setup_blt_length = devinfo->gen >= 8 ? 10 : 8;
   intel_batchbuffer_require_space(brw, (xy_setup_blt_length * 4) +
                                        (3 * 4) + dwords * 4);

   opcode = XY_SETUP_BLT_CMD;
   if (cpp == 4)
      opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
   if (dst_tiling != ISL_TILING_LINEAR) {
      opcode |= XY_DST_TILED;
      dst_pitch /= 4;
   }

   br13 = dst_pitch | (translate_raster_op(logic_op) << 16) | (1 << 29);
   br13 |= br13_for_cpp(cpp);

   blit_cmd = XY_TEXT_IMMEDIATE_BLIT_CMD | XY_TEXT_BYTE_PACKED; /* packing? */
   if (dst_tiling != ISL_TILING_LINEAR)
      blit_cmd |= XY_DST_TILED;

   BEGIN_BATCH_BLT(xy_setup_blt_length + 3);
   OUT_BATCH(opcode | (xy_setup_blt_length - 2));
   OUT_BATCH(br13);
   OUT_BATCH((0 << 16) | 0); /* clip x1, y1 */
   OUT_BATCH((100 << 16) | 100); /* clip x2, y2 */
   if (devinfo->gen >= 8) {
      OUT_RELOC64(dst_buffer, RELOC_WRITE, dst_offset);
   } else {
      OUT_RELOC(dst_buffer, RELOC_WRITE, dst_offset);
   }
   OUT_BATCH(0); /* bg */
   OUT_BATCH(fg_color); /* fg */
   OUT_BATCH(0); /* pattern base addr */
   if (devinfo->gen >= 8)
      OUT_BATCH(0);

   OUT_BATCH(blit_cmd | ((3 - 2) + dwords));
   OUT_BATCH(SET_FIELD(y, BLT_Y) | SET_FIELD(x, BLT_X));
   OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X));
   ADVANCE_BATCH();

   intel_batchbuffer_data(brw, src_bits, dwords * 4);

   brw_emit_mi_flush(brw);

   return true;
}
static bool MUST_CHECK
brw_fence_insert_locked(struct brw_context *brw, struct brw_fence *fence)
{
   __DRIcontext *driContext = brw->driContext;
   __DRIdrawable *driDrawable = driContext->driDrawablePriv;

   /*
    * From KHR_fence_sync:
    *
    *   When the condition of the sync object is satisfied by the fence
    *   command, the sync is signaled by the associated client API context,
    *   causing any eglClientWaitSyncKHR commands (see below) blocking on
    *   <sync> to unblock. The only condition currently supported is
    *   EGL_SYNC_PRIOR_COMMANDS_COMPLETE_KHR, which is satisfied by
    *   completion of the fence command corresponding to the sync object,
    *   and all preceding commands in the associated client API context's
    *   command stream. The sync object will not be signaled until all
    *   effects from these commands on the client API's internal and
    *   framebuffer state are fully realized. No other state is affected by
    *   execution of the fence command.
    *
    * Note the emphasis there on ensuring that the framebuffer is fully
    * realised before the fence is signaled. We cannot just flush the batch,
    * but must also resolve the drawable first. The importance of this is,
    * for example, in creating a fence for a frame to be passed to a
    * remote compositor. Without us flushing the drawable explicitly, the
    * resolve will be in a following batch (when the client finally calls
    * SwapBuffers, or triggers a resolve via some other path) and so the
    * compositor may read the incomplete framebuffer instead.
    */
   if (driDrawable)
      intel_resolve_for_dri2_flush(brw, driDrawable);
   brw_emit_mi_flush(brw);

   switch (fence->type) {
   case BRW_FENCE_TYPE_BO_WAIT:
      assert(!fence->batch_bo);
      assert(!fence->signalled);

      fence->batch_bo = brw->batch.bo;
      brw_bo_reference(fence->batch_bo);

      if (intel_batchbuffer_flush(brw) < 0) {
         brw_bo_unreference(fence->batch_bo);
         fence->batch_bo = NULL;
         return false;
      }
      break;
   case BRW_FENCE_TYPE_SYNC_FD:
      assert(!fence->signalled);

      if (fence->sync_fd == -1) {
         /* Create an out-fence that signals after all pending commands
          * complete.
          */
         if (intel_batchbuffer_flush_fence(brw, -1, &fence->sync_fd) < 0)
            return false;
         assert(fence->sync_fd != -1);
      } else {
         /* Wait on the in-fence before executing any subsequently submitted
          * commands.
          */
         if (intel_batchbuffer_flush(brw) < 0)
            return false;

         /* Emit a dummy batch just for the fence. */
         brw_emit_mi_flush(brw);
         if (intel_batchbuffer_flush_fence(brw, fence->sync_fd, NULL) < 0)
            return false;
      }
      break;
   }

   return true;
}
/**
 * Implements fast depth clears on gen6+.
 *
 * Fast clears basically work by setting a flag in each of the subspans
 * represented in the HiZ buffer that says "When you need the depth values for
 * this subspan, it's the hardware's current clear value."  Then later rendering
 * can just use the static clear value instead of referencing memory.
 *
 * The tricky part of the implementation is that you have to have the clear
 * value that was used on the depth buffer in place for all further rendering,
 * at least until a resolve to the real depth buffer happens.
 */
static bool
brw_fast_clear_depth(struct gl_context *ctx)
{
   struct brw_context *brw = brw_context(ctx);
   struct gl_framebuffer *fb = ctx->DrawBuffer;
   struct intel_renderbuffer *depth_irb =
      intel_get_renderbuffer(fb, BUFFER_DEPTH);
   struct intel_mipmap_tree *mt = depth_irb->mt;
   struct gl_renderbuffer_attachment *depth_att = &fb->Attachment[BUFFER_DEPTH];

   if (brw->gen < 6)
      return false;

   if (!intel_renderbuffer_has_hiz(depth_irb))
      return false;

   /* We only handle full buffer clears -- otherwise you'd have to track
    * whether a previous clear had happened at a different clear value and
    * resolve it first.
    */
   if ((ctx->Scissor.EnableFlags & 1) && !noop_scissor(ctx, fb)) {
      perf_debug("Failed to fast clear %dx%d depth because of scissors.  "
                 "Possible 5%% performance win if avoided.\n",
                 mt->logical_width0, mt->logical_height0);
      return false;
   }

   uint32_t depth_clear_value;
   switch (mt->format) {
   case MESA_FORMAT_Z32_FLOAT_S8X24_UINT:
   case MESA_FORMAT_Z24_UNORM_S8_UINT:
      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
       *
       *     "[DevSNB+]: Several cases exist where Depth Buffer Clear cannot be
       *      enabled (the legacy method of clearing must be performed):
       *
       *      - If the depth buffer format is D32_FLOAT_S8X24_UINT or
       *        D24_UNORM_S8_UINT.
       */
      return false;

   case MESA_FORMAT_Z_FLOAT32:
      depth_clear_value = float_as_int(ctx->Depth.Clear);
      break;

   case MESA_FORMAT_Z_UNORM16:
      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
       *
       *     "[DevSNB+]: Several cases exist where Depth Buffer Clear cannot be
       *      enabled (the legacy method of clearing must be performed):
       *
       *      - DevSNB{W/A}]: When depth buffer format is D16_UNORM and the
       *        width of the map (LOD0) is not multiple of 16, fast clear
       *        optimization must be disabled.
       */
      if (brw->gen == 6 &&
          (minify(mt->physical_width0,
                  depth_irb->mt_level - mt->first_level) % 16) != 0)
         return false;
      /* FALLTHROUGH */

   default:
      if (brw->gen >= 8)
         depth_clear_value = float_as_int(ctx->Depth.Clear);
      else
         depth_clear_value = fb->_DepthMax * ctx->Depth.Clear;
      break;
   }

   /* If we're clearing to a new clear value, then we need to resolve any clear
    * flags out of the HiZ buffer into the real depth buffer.
    */
   if (mt->depth_clear_value != depth_clear_value) {
      intel_miptree_all_slices_resolve_depth(brw, mt);
      mt->depth_clear_value = depth_clear_value;
   }

   /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
    *
    *     "If other rendering operations have preceded this clear, a
    *      PIPE_CONTROL with write cache flush enabled and Z-inhibit disabled
    *      must be issued before the rectangle primitive used for the depth
    *      buffer clear operation.
    */
   brw_emit_mi_flush(brw);

   if (fb->MaxNumLayers > 0) {
      for (unsigned layer = 0; layer < depth_irb->layer_count; layer++) {
         intel_hiz_exec(brw, mt, depth_irb->mt_level,
                        depth_irb->mt_layer + layer,
                        GEN6_HIZ_OP_DEPTH_CLEAR);
      }
   } else {
      intel_hiz_exec(brw, mt, depth_irb->mt_level, depth_irb->mt_layer,
                     GEN6_HIZ_OP_DEPTH_CLEAR);
   }

   if (brw->gen == 6) {
      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
       *
       *     "DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be followed
       *      by a PIPE_CONTROL command with DEPTH_STALL bit set and Then
       *      followed by Depth FLUSH'
       */
      brw_emit_mi_flush(brw);
   }

   /* Now, the HiZ buffer contains data that needs to be resolved to the depth
    * buffer.
    */
   intel_renderbuffer_att_set_needs_depth_resolve(depth_att);

   return true;
}
void
intelReadPixels(struct gl_context *ctx,
                GLint x, GLint y, GLsizei width, GLsizei height,
                GLenum format, GLenum type,
                const struct gl_pixelstore_attrib *pack, GLvoid *pixels)
{
   bool ok;

   struct brw_context *brw = brw_context(ctx);
   bool dirty;

   DBG("%s\n", __func__);

   if (_mesa_is_bufferobj(pack->BufferObj)) {
      if (_mesa_meta_pbo_GetTexSubImage(ctx, 2, NULL, x, y, 0,
                                        width, height, 1,
                                        format, type, pixels, pack)) {
         /* _mesa_meta_pbo_GetTexSubImage() implements PBO transfers by
          * binding the user-provided BO as a fake framebuffer and rendering
          * to it.  This breaks the invariant of the GL that nothing is able
          * to render to a BO, causing nondeterministic corruption issues
          * because the render cache is not coherent with a number of other
          * caches that the BO could potentially be bound to afterwards.
          *
          * This could be solved in the same way that we guarantee texture
          * coherency after a texture is attached to a framebuffer and
          * rendered to, but that would involve checking *all* BOs bound to
          * the pipeline for the case we need to emit a cache flush due to
          * previous rendering to any of them -- Including vertex, index,
          * uniform, atomic counter, shader image, transform feedback,
          * indirect draw buffers, etc.
          *
          * That would increase the per-draw call overhead even though it's
          * very unlikely that any of the BOs bound to the pipeline has been
          * rendered to via a PBO at any point, so it seems better to just
          * flush here unconditionally.
          */
         brw_emit_mi_flush(brw);
         return;
      }

      perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
   }

   ok = intel_readpixels_tiled_memcpy(ctx, x, y, width, height,
                                      format, type, pixels, pack);
   if (ok)
      return;

   /* glReadPixels() won't dirty the front buffer, so reset the dirty
    * flag after calling intel_prepare_render().
    */
   dirty = brw->front_buffer_dirty;
   intel_prepare_render(brw);
   brw->front_buffer_dirty = dirty;

   /* Update Mesa state before calling _mesa_readpixels().
    * XXX this may not be needed since ReadPixels no longer uses the
    * span code.
    */
   if (ctx->NewState)
      _mesa_update_state(ctx);

   _mesa_readpixels(ctx, x, y, width, height, format, type, pack, pixels);

   /* There's an intel_prepare_render() call in intelSpanRenderStart(). */
   brw->front_buffer_dirty = dirty;
}
/**
 * Used to initialize the alpha value of an ARGB8888 miptree after copying
 * into it from an XRGB8888 source.
 *
 * This is very common with glCopyTexImage2D().  Note that the coordinates are
 * relative to the start of the miptree, not relative to a slice within the
 * miptree.
 */
static void
intel_miptree_set_alpha_to_one(struct brw_context *brw,
                               struct intel_mipmap_tree *mt,
                               int x, int y, int width, int height)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint32_t BR13, CMD;
   int pitch, cpp;

   pitch = mt->surf.row_pitch_B;
   cpp = mt->cpp;

   DBG("%s dst:buf(%p)/%d %d,%d sz:%dx%d\n",
       __func__, mt->bo, pitch, x, y, width, height);

   /* Note: Currently only handles 8 bit alpha channel.  Extension to < 8 Bit
    * alpha channel would be likely possible via ROP code 0xfa instead of 0xf0
    * and writing a suitable bit-mask instead of 0xffffffff.
    */
   BR13 = br13_for_cpp(cpp) | 0xf0 << 16;
   CMD = XY_COLOR_BLT_CMD;
   CMD |= XY_BLT_WRITE_ALPHA;

   if (mt->surf.tiling != ISL_TILING_LINEAR) {
      CMD |= XY_DST_TILED;
      pitch /= 4;
   }
   BR13 |= pitch;

   /* do space check before going any further */
   if (!brw_batch_has_aperture_space(brw, mt->bo->size))
      intel_batchbuffer_flush(brw);

   unsigned length = devinfo->gen >= 8 ? 7 : 6;
   const bool dst_y_tiled = mt->surf.tiling == ISL_TILING_Y0;

   /* We need to split the blit into chunks that each fit within the blitter's
    * restrictions.  We can't use a chunk size of 32768 because we need to
    * ensure that src_tile_x + chunk_size fits.  We choose 16384 because it's
    * a nice round power of two, big enough that performance won't suffer, and
    * small enough to guarantee everything fits.
    */
   const uint32_t max_chunk_size = 16384;

   for (uint32_t chunk_x = 0; chunk_x < width; chunk_x += max_chunk_size) {
      for (uint32_t chunk_y = 0; chunk_y < height; chunk_y += max_chunk_size) {
         const uint32_t chunk_w = MIN2(max_chunk_size, width - chunk_x);
         const uint32_t chunk_h = MIN2(max_chunk_size, height - chunk_y);

         uint32_t offset, tile_x, tile_y;
         get_blit_intratile_offset_el(brw, mt,
                                      x + chunk_x, y + chunk_y,
                                      &offset, &tile_x, &tile_y);

         BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, false);
         OUT_BATCH(CMD | (length - 2));
         OUT_BATCH(BR13);
         OUT_BATCH(SET_FIELD(y + chunk_y, BLT_Y) |
                   SET_FIELD(x + chunk_x, BLT_X));
         OUT_BATCH(SET_FIELD(y + chunk_y + chunk_h, BLT_Y) |
                   SET_FIELD(x + chunk_x + chunk_w, BLT_X));
         if (devinfo->gen >= 8) {
            OUT_RELOC64(mt->bo, RELOC_WRITE, mt->offset + offset);
         } else {
            OUT_RELOC(mt->bo, RELOC_WRITE, mt->offset + offset);
         }
         OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
         ADVANCE_BATCH_TILED(dst_y_tiled, false);
      }
   }

   brw_emit_mi_flush(brw);
}
/**
 * Compute the number of primitives written during our most recent
 * transform feedback activity (the current SO_NUM_PRIMS_WRITTEN value
 * minus the stashed "start" value), and add it to our running tally.
 *
 * If \p finalize is true, also compute the number of vertices written
 * (by multiplying by the number of vertices per primitive), and store
 * that to the "final" location.
 *
 * Otherwise, just overwrite the old tally with the new one.
 */
static void
tally_prims_written(struct brw_context *brw,
                    struct brw_transform_feedback_object *obj,
                    bool finalize)
{
   /* Flush any drawing so that the counters have the right values. */
   brw_emit_mi_flush(brw);

   for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
      /* GPR0 = Tally */
      brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0);
      brw_load_register_mem(brw, HSW_CS_GPR(0), obj->prim_count_bo,
                            I915_GEM_DOMAIN_INSTRUCTION,
                            I915_GEM_DOMAIN_INSTRUCTION,
                            TALLY_OFFSET + i * sizeof(uint32_t));
      if (!obj->base.Paused) {
         /* GPR1 = Start Snapshot */
         brw_load_register_mem64(brw, HSW_CS_GPR(1), obj->prim_count_bo,
                                 I915_GEM_DOMAIN_INSTRUCTION,
                                 I915_GEM_DOMAIN_INSTRUCTION,
                                 START_OFFSET + i * sizeof(uint64_t));
         /* GPR2 = Ending Snapshot */
         brw_load_register_reg64(brw, GEN7_SO_NUM_PRIMS_WRITTEN(i),
                                 HSW_CS_GPR(2));

         BEGIN_BATCH(9);
         OUT_BATCH(HSW_MI_MATH | (9 - 2));
         /* GPR1 = GPR2 (End) - GPR1 (Start) */
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2));
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
         OUT_BATCH(MI_MATH_ALU0(SUB));
         OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
         /* GPR0 = GPR0 (Tally) + GPR1 (Diff) */
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
         OUT_BATCH(MI_MATH_ALU0(ADD));
         OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
         ADVANCE_BATCH();
      }

      if (!finalize) {
         /* Write back the new tally */
         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
                                  TALLY_OFFSET + i * sizeof(uint32_t));
      } else {
         /* Convert the number of primitives to the number of vertices. */
         if (obj->primitive_mode == GL_LINES) {
            /* Double R0 (R0 = R0 + R0) */
            BEGIN_BATCH(5);
            OUT_BATCH(HSW_MI_MATH | (5 - 2));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
            OUT_BATCH(MI_MATH_ALU0(ADD));
            OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
            ADVANCE_BATCH();
         } else if (obj->primitive_mode == GL_TRIANGLES) {
            /* Triple R0 (R1 = R0 + R0, R0 = R0 + R1) */
            BEGIN_BATCH(9);
            OUT_BATCH(HSW_MI_MATH | (9 - 2));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
            OUT_BATCH(MI_MATH_ALU0(ADD));
            OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
            OUT_BATCH(MI_MATH_ALU0(ADD));
            OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
            ADVANCE_BATCH();
         }

         /* Store it to the final result */
         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
                                  i * sizeof(uint32_t));
      }
   }
}
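/* For reference only: a CPU-side sketch (not part of the driver) of the
 * per-stream arithmetic the MI_MATH program above performs on the GPU.  The
 * names tally, start_snapshot and end_snapshot are hypothetical stand-ins for
 * the values the driver keeps in GPR0, GPR1 and GPR2; paused mirrors
 * obj->base.Paused, in which case no new primitives are added.
 */
static uint32_t
tally_prims_written_sketch(uint32_t tally, uint64_t start_snapshot,
                           uint64_t end_snapshot, bool paused,
                           GLenum primitive_mode, bool finalize)
{
   /* GPR0 += GPR2 (End) - GPR1 (Start), skipped while paused. */
   if (!paused)
      tally += (uint32_t)(end_snapshot - start_snapshot);

   if (!finalize)
      return tally;

   /* On finalize, convert primitives to vertices: two vertices per line,
    * three per triangle; points are left unchanged.
    */
   if (primitive_mode == GL_LINES)
      tally *= 2;
   else if (primitive_mode == GL_TRIANGLES)
      tally *= 3;

   return tally;
}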
/* Copy BitBlt */
static bool
emit_copy_blit(struct brw_context *brw,
               GLuint cpp,
               int32_t src_pitch,
               struct brw_bo *src_buffer,
               GLuint src_offset,
               enum isl_tiling src_tiling,
               int32_t dst_pitch,
               struct brw_bo *dst_buffer,
               GLuint dst_offset,
               enum isl_tiling dst_tiling,
               GLshort src_x, GLshort src_y,
               GLshort dst_x, GLshort dst_y,
               GLshort w, GLshort h,
               enum gl_logicop_mode logic_op)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   GLuint CMD, BR13;
   int dst_y2 = dst_y + h;
   int dst_x2 = dst_x + w;
   bool dst_y_tiled = dst_tiling == ISL_TILING_Y0;
   bool src_y_tiled = src_tiling == ISL_TILING_Y0;
   uint32_t src_tile_w, src_tile_h;
   uint32_t dst_tile_w, dst_tile_h;

   if ((dst_y_tiled || src_y_tiled) && devinfo->gen < 6)
      return false;

   const unsigned bo_sizes = dst_buffer->size + src_buffer->size;

   /* do space check before going any further */
   if (!brw_batch_has_aperture_space(brw, bo_sizes))
      intel_batchbuffer_flush(brw);

   if (!brw_batch_has_aperture_space(brw, bo_sizes))
      return false;

   unsigned length = devinfo->gen >= 8 ? 10 : 8;

   intel_batchbuffer_require_space(brw, length * 4);
   DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
       __func__,
       src_buffer, src_pitch, src_offset, src_x, src_y,
       dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);

   intel_get_tile_dims(src_tiling, cpp, &src_tile_w, &src_tile_h);
   intel_get_tile_dims(dst_tiling, cpp, &dst_tile_w, &dst_tile_h);

   /* For Tiled surfaces, the pitch has to be a multiple of the Tile width
    * (X direction width of the Tile).  This is ensured while allocating the
    * buffer object.
    */
   assert(src_tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0);
   assert(dst_tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0);

   /* For big formats (such as floating point), do the copy using 16 or
    * 32bpp and multiply the coordinates.
    */
   if (cpp > 4) {
      if (cpp % 4 == 2) {
         dst_x *= cpp / 2;
         dst_x2 *= cpp / 2;
         src_x *= cpp / 2;
         cpp = 2;
      } else {
         assert(cpp % 4 == 0);
         dst_x *= cpp / 4;
         dst_x2 *= cpp / 4;
         src_x *= cpp / 4;
         cpp = 4;
      }
   }

   if (!alignment_valid(brw, dst_offset, dst_tiling))
      return false;
   if (!alignment_valid(brw, src_offset, src_tiling))
      return false;

   /* Blit pitch must be dword-aligned.  Otherwise, the hardware appears to
    * drop the low bits.  Offsets must be naturally aligned.
    */
   if (src_pitch % 4 != 0 || src_offset % cpp != 0 ||
       dst_pitch % 4 != 0 || dst_offset % cpp != 0)
      return false;

   assert(cpp <= 4);
   BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16;

   CMD = xy_blit_cmd(src_tiling, dst_tiling, cpp);

   /* For tiled source and destination, pitch value should be specified
    * as a number of Dwords.
    */
   if (dst_tiling != ISL_TILING_LINEAR)
      dst_pitch /= 4;

   if (src_tiling != ISL_TILING_LINEAR)
      src_pitch /= 4;

   if (dst_y2 <= dst_y || dst_x2 <= dst_x)
      return true;

   assert(dst_x < dst_x2);
   assert(dst_y < dst_y2);

   BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, src_y_tiled);
   OUT_BATCH(CMD | (length - 2));
   OUT_BATCH(BR13 | (uint16_t)dst_pitch);
   OUT_BATCH(SET_FIELD(dst_y, BLT_Y) | SET_FIELD(dst_x, BLT_X));
   OUT_BATCH(SET_FIELD(dst_y2, BLT_Y) | SET_FIELD(dst_x2, BLT_X));
   if (devinfo->gen >= 8) {
      OUT_RELOC64(dst_buffer, RELOC_WRITE, dst_offset);
   } else {
      OUT_RELOC(dst_buffer, RELOC_WRITE, dst_offset);
   }
   OUT_BATCH(SET_FIELD(src_y, BLT_Y) | SET_FIELD(src_x, BLT_X));
   OUT_BATCH((uint16_t)src_pitch);
   if (devinfo->gen >= 8) {
      OUT_RELOC64(src_buffer, 0, src_offset);
   } else {
      OUT_RELOC(src_buffer, 0, src_offset);
   }
   ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled);

   brw_emit_mi_flush(brw);

   return true;
}