static void gen8_upload_3dstate_so_buffers(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; /* BRW_NEW_TRANSFORM_FEEDBACK */ struct gl_transform_feedback_object *xfb_obj = ctx->TransformFeedback.CurrentObject; struct brw_transform_feedback_object *brw_obj = (struct brw_transform_feedback_object *) xfb_obj; /* Set up the up to 4 output buffers. These are the ranges defined in the * gl_transform_feedback_object. */ for (int i = 0; i < 4; i++) { struct intel_buffer_object *bufferobj = intel_buffer_object(xfb_obj->Buffers[i]); if (!bufferobj) { BEGIN_BATCH(8); OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2)); OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); continue; } uint32_t start = xfb_obj->Offset[i]; assert(start % 4 == 0); uint32_t end = ALIGN(start + xfb_obj->Size[i], 4); drm_intel_bo *bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start); assert(end <= bo->size); perf_debug("Missing MOCS setup for 3DSTATE_SO_BUFFER."); BEGIN_BATCH(8); OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2)); OUT_BATCH(GEN8_SO_BUFFER_ENABLE | (i << SO_BUFFER_INDEX_SHIFT) | GEN8_SO_BUFFER_OFFSET_WRITE_ENABLE | GEN8_SO_BUFFER_OFFSET_ADDRESS_ENABLE | (BDW_MOCS_WB << 22)); OUT_RELOC64(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start); OUT_BATCH(xfb_obj->Size[i] / 4 - 1); OUT_RELOC64(brw_obj->offset_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, i * sizeof(uint32_t)); if (brw_obj->zero_offsets) OUT_BATCH(0); /* Zero out the offset and write that to offset_bo */ else OUT_BATCH(0xFFFFFFFF); /* Use offset_bo as the "Stream Offset." */ ADVANCE_BATCH(); } brw_obj->zero_offsets = false; }
static void gen8_upload_ds_state(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct brw_stage_state *stage_state = &brw->tes.base; /* BRW_NEW_TESS_PROGRAMS */ bool active = brw->tess_eval_program; /* BRW_NEW_TES_PROG_DATA */ const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data; const struct brw_vue_prog_data *vue_prog_data = &tes_prog_data->base; const struct brw_stage_prog_data *prog_data = &vue_prog_data->base; if (active) { BEGIN_BATCH(9); OUT_BATCH(_3DSTATE_DS << 16 | (9 - 2)); OUT_BATCH(stage_state->prog_offset); OUT_BATCH(0); OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4), GEN7_DS_SAMPLER_COUNT) | SET_FIELD(prog_data->binding_table.size_bytes / 4, GEN7_DS_BINDING_TABLE_ENTRY_COUNT)); if (prog_data->total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(prog_data->total_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0); } OUT_BATCH(SET_FIELD(prog_data->dispatch_grf_start_reg, GEN7_DS_DISPATCH_START_GRF) | SET_FIELD(vue_prog_data->urb_read_length, GEN7_DS_URB_READ_LENGTH)); OUT_BATCH(GEN7_DS_ENABLE | GEN7_DS_STATISTICS_ENABLE | (brw->max_ds_threads - 1) << HSW_DS_MAX_THREADS_SHIFT | (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ? GEN7_DS_SIMD8_DISPATCH_ENABLE : 0) | (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ? GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0)); OUT_BATCH(SET_FIELD(ctx->Transform.ClipPlanesEnabled, GEN8_DS_USER_CLIP_DISTANCE)); ADVANCE_BATCH(); } else { BEGIN_BATCH(9); OUT_BATCH(_3DSTATE_DS << 16 | (9 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } brw->tes.enabled = active; }
static void upload_vs_state(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct brw_stage_state *stage_state = &brw->vs.base; uint32_t floating_point_mode = 0; /* CACHE_NEW_VS_PROG */ const struct brw_vec4_prog_data *prog_data = &brw->vs.prog_data->base; /* CACHE_NEW_SAMPLER */ BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_SAMPLER_STATE_POINTERS_VS << 16 | (2 - 2)); OUT_BATCH(stage_state->sampler_offset); ADVANCE_BATCH(); gen8_upload_constant_state(brw, stage_state, true /* active */, _3DSTATE_CONSTANT_VS); /* Use ALT floating point mode for ARB vertex programs, because they * require 0^0 == 1. */ if (ctx->Shader.CurrentProgram[MESA_SHADER_VERTEX] == NULL) floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT; BEGIN_BATCH(9); OUT_BATCH(_3DSTATE_VS << 16 | (9 - 2)); OUT_BATCH(stage_state->prog_offset); OUT_BATCH(0); OUT_BATCH(floating_point_mode | ((ALIGN(stage_state->sampler_count, 4) / 4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (prog_data->total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(prog_data->total_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0); } OUT_BATCH((prog_data->dispatch_grf_start_reg << GEN6_VS_DISPATCH_START_GRF_SHIFT) | (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) | (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT)); OUT_BATCH(((brw->max_vs_threads - 1) << HSW_VS_MAX_THREADS_SHIFT) | GEN6_VS_STATISTICS_ENABLE | GEN6_VS_ENABLE); /* _NEW_TRANSFORM */ OUT_BATCH((ctx->Transform.ClipPlanesEnabled << GEN8_VS_USER_CLIP_DISTANCE_SHIFT)); ADVANCE_BATCH(); }
/** * Used to initialize the alpha value of an ARGB8888 miptree after copying * into it from an XRGB8888 source. * * This is very common with glCopyTexImage2D(). Note that the coordinates are * relative to the start of the miptree, not relative to a slice within the * miptree. */ static void intel_miptree_set_alpha_to_one(struct brw_context *brw, struct intel_mipmap_tree *mt, int x, int y, int width, int height) { uint32_t BR13, CMD; int pitch, cpp; drm_intel_bo *aper_array[2]; pitch = mt->pitch; cpp = mt->cpp; DBG("%s dst:buf(%p)/%d %d,%d sz:%dx%d\n", __FUNCTION__, mt->bo, pitch, x, y, width, height); BR13 = br13_for_cpp(cpp) | 0xf0 << 16; CMD = XY_COLOR_BLT_CMD; CMD |= XY_BLT_WRITE_ALPHA; if (mt->tiling != I915_TILING_NONE) { CMD |= XY_DST_TILED; pitch /= 4; } BR13 |= pitch; /* do space check before going any further */ aper_array[0] = brw->batch.bo; aper_array[1] = mt->bo; if (drm_intel_bufmgr_check_aperture_space(aper_array, ARRAY_SIZE(aper_array)) != 0) { intel_batchbuffer_flush(brw); } unsigned length = brw->gen >= 8 ? 7 : 6; bool dst_y_tiled = mt->tiling == I915_TILING_Y; BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, false); OUT_BATCH(CMD | (length - 2)); OUT_BATCH(BR13); OUT_BATCH(SET_FIELD(y, BLT_Y) | SET_FIELD(x, BLT_X)); OUT_BATCH(SET_FIELD(y + height, BLT_Y) | SET_FIELD(x + width, BLT_X)); if (brw->gen >= 8) { OUT_RELOC64(mt->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); } else { OUT_RELOC(mt->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); } OUT_BATCH(0xffffffff); /* white, but only alpha gets written */ ADVANCE_BATCH_TILED(dst_y_tiled, false); intel_batchbuffer_emit_mi_flush(brw); }
static void gen8_upload_hs_state(struct brw_context *brw) { const struct gen_device_info *devinfo = &brw->screen->devinfo; const struct brw_stage_state *stage_state = &brw->tcs.base; /* BRW_NEW_TESS_PROGRAMS */ bool active = brw->tess_eval_program; /* BRW_NEW_TCS_PROG_DATA */ const struct brw_stage_prog_data *prog_data = stage_state->prog_data; const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(stage_state->prog_data); if (active) { BEGIN_BATCH(9); OUT_BATCH(_3DSTATE_HS << 16 | (9 - 2)); OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4), GEN7_HS_SAMPLER_COUNT) | SET_FIELD(prog_data->binding_table.size_bytes / 4, GEN7_HS_BINDING_TABLE_ENTRY_COUNT)); OUT_BATCH(GEN7_HS_ENABLE | GEN7_HS_STATISTICS_ENABLE | (devinfo->max_tcs_threads - 1) << GEN8_HS_MAX_THREADS_SHIFT | SET_FIELD(tcs_prog_data->instances - 1, GEN7_HS_INSTANCE_COUNT)); OUT_BATCH(stage_state->prog_offset); OUT_BATCH(0); if (prog_data->total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0); } OUT_BATCH(GEN7_HS_INCLUDE_VERTEX_HANDLES | SET_FIELD(prog_data->dispatch_grf_start_reg, GEN7_HS_DISPATCH_START_GRF)); OUT_BATCH(0); /* MBZ */ ADVANCE_BATCH(); } else { BEGIN_BATCH(9); OUT_BATCH(_3DSTATE_HS << 16 | (9 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } brw->tcs.enabled = active; }
static void upload_vs_state(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct brw_stage_state *stage_state = &brw->vs.base; uint32_t floating_point_mode = 0; /* BRW_NEW_VS_PROG_DATA */ const struct brw_vue_prog_data *prog_data = &brw->vs.prog_data->base; assert(prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 || prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT); if (prog_data->base.use_alt_mode) floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT; BEGIN_BATCH(9); OUT_BATCH(_3DSTATE_VS << 16 | (9 - 2)); OUT_BATCH(stage_state->prog_offset); OUT_BATCH(0); OUT_BATCH(floating_point_mode | ((ALIGN(stage_state->sampler_count, 4) / 4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(prog_data->base.total_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0); } OUT_BATCH((prog_data->base.dispatch_grf_start_reg << GEN6_VS_DISPATCH_START_GRF_SHIFT) | (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) | (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT)); uint32_t simd8_enable = prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ? GEN8_VS_SIMD8_ENABLE : 0; OUT_BATCH(((brw->max_vs_threads - 1) << HSW_VS_MAX_THREADS_SHIFT) | GEN6_VS_STATISTICS_ENABLE | simd8_enable | GEN6_VS_ENABLE); /* _NEW_TRANSFORM */ OUT_BATCH(prog_data->cull_distance_mask | (ctx->Transform.ClipPlanesEnabled << GEN8_VS_USER_CLIP_DISTANCE_SHIFT)); ADVANCE_BATCH(); }
/** * Enable hardware binding tables and set up the binding table pool. */ void gen7_enable_hw_binding_tables(struct brw_context *brw) { if (!brw->use_resource_streamer) return; if (!brw->hw_bt_pool.bo) { /* We use a single re-usable buffer object for the lifetime of the * context and size it to maximum allowed binding tables that can be * programmed per batch: * * From the Haswell PRM, Volume 7: 3D Media GPGPU, * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note: * "A maximum of 16,383 Binding tables are allowed in any batch buffer" */ static const int max_size = 16383 * 4; brw->hw_bt_pool.bo = drm_intel_bo_alloc(brw->bufmgr, "hw_bt", max_size, 64); brw->hw_bt_pool.next_offset = 0; } /* From the Haswell PRM, Volume 7: 3D Media GPGPU, * 3DSTATE_BINDING_TABLE_POOL_ALLOC > Programming Note: * * "When switching between HW and SW binding table generation, SW must * issue a state cache invalidate." */ brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE); int pkt_len = brw->gen >= 8 ? 4 : 3; uint32_t dw1 = BRW_HW_BINDING_TABLE_ENABLE; if (brw->is_haswell) { dw1 |= SET_FIELD(GEN7_MOCS_L3, GEN7_HW_BT_POOL_MOCS) | HSW_BT_POOL_ALLOC_MUST_BE_ONE; } else if (brw->gen >= 8) { dw1 |= BDW_MOCS_WB; } BEGIN_BATCH(pkt_len); OUT_BATCH(_3DSTATE_BINDING_TABLE_POOL_ALLOC << 16 | (pkt_len - 2)); if (brw->gen >= 8) { OUT_RELOC64(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1); OUT_BATCH(brw->hw_bt_pool.bo->size); } else { OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, dw1); OUT_RELOC(brw->hw_bt_pool.bo, I915_GEM_DOMAIN_SAMPLER, 0, brw->hw_bt_pool.bo->size); } ADVANCE_BATCH(); }
/** * Define the base addresses which some state is referenced from. */ static void upload_state_base_address(struct brw_context *brw) { perf_debug("Missing MOCS setup for STATE_BASE_ADDRESS."); BEGIN_BATCH(16); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (16 - 2)); /* General state base address: stateless DP read/write requests */ OUT_BATCH(BDW_MOCS_WB << 4 | 1); OUT_BATCH(0); OUT_BATCH(BDW_MOCS_WB << 16); /* Surface state base address: */ OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, BDW_MOCS_WB << 4 | 1); /* Dynamic state base address: */ OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0, BDW_MOCS_WB << 4 | 1); /* Indirect object base address: MEDIA_OBJECT data */ OUT_BATCH(BDW_MOCS_WB << 4 | 1); OUT_BATCH(0); /* Instruction base address: shader kernels (incl. SIP) */ OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BDW_MOCS_WB << 4 | 1); /* General state buffer size */ OUT_BATCH(0xfffff001); /* Dynamic state buffer size */ OUT_BATCH(ALIGN(brw->batch.bo->size, 4096) | 1); /* Indirect object upper bound */ OUT_BATCH(0xfffff001); /* Instruction access upper bound */ OUT_BATCH(ALIGN(brw->cache.bo->size, 4096) | 1); ADVANCE_BATCH(); brw->state.dirty.brw |= BRW_NEW_STATE_BASE_ADDRESS; }
static void gen8_emit_index_buffer(struct brw_context *brw) { const struct _mesa_index_buffer *index_buffer = brw->ib.ib; uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; if (index_buffer == NULL) return; BEGIN_BATCH(5); OUT_BATCH(CMD_INDEX_BUFFER << 16 | (5 - 2)); OUT_BATCH(brw_get_index_type(index_buffer->type) | mocs_wb); OUT_RELOC64(brw->ib.bo, I915_GEM_DOMAIN_VERTEX, 0, 0); OUT_BATCH(brw->ib.bo->size); ADVANCE_BATCH(); }
static void gen6_blorp_emit_vertex_buffer_state(struct brw_context *brw, unsigned num_elems, unsigned vbo_size, uint32_t vertex_offset) { /* 3DSTATE_VERTEX_BUFFERS */ const int num_buffers = 1; const int batch_length = 1 + 4 * num_buffers; uint32_t dw0 = GEN6_VB0_ACCESS_VERTEXDATA | (num_elems * sizeof(float)) << BRW_VB0_PITCH_SHIFT; if (brw->gen >= 7) dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE; switch (brw->gen) { case 7: dw0 |= GEN7_MOCS_L3 << 16; break; case 8: dw0 |= BDW_MOCS_WB << 16; break; case 9: dw0 |= SKL_MOCS_WB << 16; break; } BEGIN_BATCH(batch_length); OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (batch_length - 2)); OUT_BATCH(dw0); if (brw->gen >= 8) { OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_VERTEX, 0, vertex_offset); OUT_BATCH(vbo_size); } else { /* start address */ OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_VERTEX, 0, vertex_offset); /* end address */ OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_VERTEX, 0, vertex_offset + vbo_size - 1); OUT_BATCH(0); } ADVANCE_BATCH(); }
/** * Emit a PIPE_CONTROL that writes to a buffer object. * * \p flags should contain one of the following items: * - PIPE_CONTROL_WRITE_IMMEDIATE * - PIPE_CONTROL_WRITE_TIMESTAMP * - PIPE_CONTROL_WRITE_DEPTH_COUNT */ void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags, drm_intel_bo *bo, uint32_t offset, uint32_t imm_lower, uint32_t imm_upper) { if (brw->gen >= 8) { if (brw->gen == 8) gen8_add_cs_stall_workaround_bits(&flags); BEGIN_BATCH(6); OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2)); OUT_BATCH(flags); OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, offset); OUT_BATCH(imm_lower); OUT_BATCH(imm_upper); ADVANCE_BATCH(); } else if (brw->gen >= 6) { flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags); /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24 * on later platforms. We always use PPGTT on Gen7+. */ unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0; BEGIN_BATCH(5); OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2)); OUT_BATCH(flags); OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, gen6_gtt | offset); OUT_BATCH(imm_lower); OUT_BATCH(imm_upper); ADVANCE_BATCH(); } else { BEGIN_BATCH(4); OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2)); OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, PIPE_CONTROL_GLOBAL_GTT_WRITE | offset); OUT_BATCH(imm_lower); OUT_BATCH(imm_upper); ADVANCE_BATCH(); } }
static void brw_upload_cs_state(struct brw_context *brw) { if (!brw->cs.prog_data) return; uint32_t offset; uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 8 * 4, 64, &offset); struct brw_stage_state *stage_state = &brw->cs.base; struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data; struct brw_stage_prog_data *prog_data = &cs_prog_data->base; if (INTEL_DEBUG & DEBUG_SHADER_TIME) { brw->vtbl.emit_buffer_surface_state( brw, &stage_state->surf_offset[ prog_data->binding_table.shader_time_start], brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW, brw->shader_time.bo->size, 1, true); } uint32_t *bind = (uint32_t*) brw_state_batch(brw, AUB_TRACE_BINDING_TABLE, prog_data->binding_table.size_bytes, 32, &stage_state->bind_bo_offset); unsigned threads = get_cs_thread_count(cs_prog_data); uint32_t dwords = brw->gen < 8 ? 8 : 9; BEGIN_BATCH(dwords); OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2)); if (prog_data->total_scratch) { if (brw->gen >= 8) OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(prog_data->total_scratch) - 11); else OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(prog_data->total_scratch) - 11); } else { OUT_BATCH(0); if (brw->gen >= 8) OUT_BATCH(0); } const uint32_t vfe_num_urb_entries = brw->gen >= 8 ? 2 : 0; const uint32_t vfe_gpgpu_mode = brw->gen == 7 ? SET_FIELD(1, GEN7_MEDIA_VFE_STATE_GPGPU_MODE) : 0; OUT_BATCH(SET_FIELD(brw->max_cs_threads - 1, MEDIA_VFE_STATE_MAX_THREADS) | SET_FIELD(vfe_num_urb_entries, MEDIA_VFE_STATE_URB_ENTRIES) | SET_FIELD(1, MEDIA_VFE_STATE_RESET_GTW_TIMER) | SET_FIELD(1, MEDIA_VFE_STATE_BYPASS_GTW) | vfe_gpgpu_mode); OUT_BATCH(0); const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0; OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */ memcpy(bind, stage_state->surf_offset, prog_data->binding_table.size_bytes); memset(desc, 0, 8 * 4); int dw = 0; desc[dw++] = brw->cs.base.prog_offset; if (brw->gen >= 8) desc[dw++] = 0; /* Kernel Start Pointer High */ desc[dw++] = 0; desc[dw++] = 0; desc[dw++] = stage_state->bind_bo_offset; desc[dw++] = 0; const uint32_t media_threads = brw->gen >= 8 ? SET_FIELD(threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) : SET_FIELD(threads, MEDIA_GPGPU_THREAD_COUNT); assert(threads <= brw->max_cs_threads); desc[dw++] = media_threads; BEGIN_BATCH(4); OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2)); OUT_BATCH(0); OUT_BATCH(8 * 4); OUT_BATCH(offset); ADVANCE_BATCH(); }
/* Copy BitBlt */ static bool emit_copy_blit(struct brw_context *brw, GLuint cpp, int32_t src_pitch, struct brw_bo *src_buffer, GLuint src_offset, enum isl_tiling src_tiling, int32_t dst_pitch, struct brw_bo *dst_buffer, GLuint dst_offset, enum isl_tiling dst_tiling, GLshort src_x, GLshort src_y, GLshort dst_x, GLshort dst_y, GLshort w, GLshort h, enum gl_logicop_mode logic_op) { const struct gen_device_info *devinfo = &brw->screen->devinfo; GLuint CMD, BR13; int dst_y2 = dst_y + h; int dst_x2 = dst_x + w; bool dst_y_tiled = dst_tiling == ISL_TILING_Y0; bool src_y_tiled = src_tiling == ISL_TILING_Y0; uint32_t src_tile_w, src_tile_h; uint32_t dst_tile_w, dst_tile_h; if ((dst_y_tiled || src_y_tiled) && devinfo->gen < 6) return false; const unsigned bo_sizes = dst_buffer->size + src_buffer->size; /* do space check before going any further */ if (!brw_batch_has_aperture_space(brw, bo_sizes)) intel_batchbuffer_flush(brw); if (!brw_batch_has_aperture_space(brw, bo_sizes)) return false; unsigned length = devinfo->gen >= 8 ? 10 : 8; intel_batchbuffer_require_space(brw, length * 4); DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", __func__, src_buffer, src_pitch, src_offset, src_x, src_y, dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h); intel_get_tile_dims(src_tiling, cpp, &src_tile_w, &src_tile_h); intel_get_tile_dims(dst_tiling, cpp, &dst_tile_w, &dst_tile_h); /* For Tiled surfaces, the pitch has to be a multiple of the Tile width * (X direction width of the Tile). This is ensured while allocating the * buffer object. */ assert(src_tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0); assert(dst_tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0); /* For big formats (such as floating point), do the copy using 16 or * 32bpp and multiply the coordinates. */ if (cpp > 4) { if (cpp % 4 == 2) { dst_x *= cpp / 2; dst_x2 *= cpp / 2; src_x *= cpp / 2; cpp = 2; } else { assert(cpp % 4 == 0); dst_x *= cpp / 4; dst_x2 *= cpp / 4; src_x *= cpp / 4; cpp = 4; } } if (!alignment_valid(brw, dst_offset, dst_tiling)) return false; if (!alignment_valid(brw, src_offset, src_tiling)) return false; /* Blit pitch must be dword-aligned. Otherwise, the hardware appears to drop * the low bits. Offsets must be naturally aligned. */ if (src_pitch % 4 != 0 || src_offset % cpp != 0 || dst_pitch % 4 != 0 || dst_offset % cpp != 0) return false; assert(cpp <= 4); BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16; CMD = xy_blit_cmd(src_tiling, dst_tiling, cpp); /* For tiled source and destination, pitch value should be specified * as a number of Dwords. */ if (dst_tiling != ISL_TILING_LINEAR) dst_pitch /= 4; if (src_tiling != ISL_TILING_LINEAR) src_pitch /= 4; if (dst_y2 <= dst_y || dst_x2 <= dst_x) return true; assert(dst_x < dst_x2); assert(dst_y < dst_y2); BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, src_y_tiled); OUT_BATCH(CMD | (length - 2)); OUT_BATCH(BR13 | (uint16_t)dst_pitch); OUT_BATCH(SET_FIELD(dst_y, BLT_Y) | SET_FIELD(dst_x, BLT_X)); OUT_BATCH(SET_FIELD(dst_y2, BLT_Y) | SET_FIELD(dst_x2, BLT_X)); if (devinfo->gen >= 8) { OUT_RELOC64(dst_buffer, RELOC_WRITE, dst_offset); } else { OUT_RELOC(dst_buffer, RELOC_WRITE, dst_offset); } OUT_BATCH(SET_FIELD(src_y, BLT_Y) | SET_FIELD(src_x, BLT_X)); OUT_BATCH((uint16_t)src_pitch); if (devinfo->gen >= 8) { OUT_RELOC64(src_buffer, 0, src_offset); } else { OUT_RELOC(src_buffer, 0, src_offset); } ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled); brw_emit_mi_flush(brw); return true; }
void gen8_upload_ps_state(struct brw_context *brw, const struct brw_stage_state *stage_state, const struct brw_wm_prog_data *prog_data, uint32_t fast_clear_op) { uint32_t dw3 = 0, dw6 = 0, dw7 = 0, ksp0, ksp2 = 0; /* Initialize the execution mask with VMask. Otherwise, derivatives are * incorrect for subspans where some of the pixels are unlit. We believe * the bit just didn't take effect in previous generations. */ dw3 |= GEN7_PS_VECTOR_MASK_ENABLE; const unsigned sampler_count = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); dw3 |= SET_FIELD(sampler_count, GEN7_PS_SAMPLER_COUNT); /* BRW_NEW_FS_PROG_DATA */ dw3 |= ((prog_data->base.binding_table.size_bytes / 4) << GEN7_PS_BINDING_TABLE_ENTRY_COUNT_SHIFT); if (prog_data->base.use_alt_mode) dw3 |= GEN7_PS_FLOATING_POINT_MODE_ALT; /* 3DSTATE_PS expects the number of threads per PSD, which is always 64; * it implicitly scales for different GT levels (which have some # of PSDs). * * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1. */ if (brw->gen >= 9) dw6 |= (64 - 1) << HSW_PS_MAX_THREADS_SHIFT; else dw6 |= (64 - 2) << HSW_PS_MAX_THREADS_SHIFT; if (prog_data->base.nr_params > 0) dw6 |= GEN7_PS_PUSH_CONSTANT_ENABLE; /* From the documentation for this packet: * "If the PS kernel does not need the Position XY Offsets to * compute a Position Value, then this field should be programmed * to POSOFFSET_NONE." * * "SW Recommendation: If the PS kernel needs the Position Offsets * to compute a Position XY value, this field should match Position * ZW Interpolation Mode to ensure a consistent position.xyzw * computation." * * We only require XY sample offsets. So, this recommendation doesn't * look useful at the moment. We might need this in future. */ if (prog_data->uses_pos_offset) dw6 |= GEN7_PS_POSOFFSET_SAMPLE; else dw6 |= GEN7_PS_POSOFFSET_NONE; dw6 |= fast_clear_op; if (prog_data->dispatch_8) dw6 |= GEN7_PS_8_DISPATCH_ENABLE; if (prog_data->dispatch_16) dw6 |= GEN7_PS_16_DISPATCH_ENABLE; dw7 |= prog_data->base.dispatch_grf_start_reg << GEN7_PS_DISPATCH_START_GRF_SHIFT_0; dw7 |= prog_data->dispatch_grf_start_reg_2 << GEN7_PS_DISPATCH_START_GRF_SHIFT_2; ksp0 = stage_state->prog_offset; ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; BEGIN_BATCH(12); OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2)); OUT_BATCH(ksp0); OUT_BATCH(0); OUT_BATCH(dw3); if (prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); OUT_BATCH(0); } OUT_BATCH(dw6); OUT_BATCH(dw7); OUT_BATCH(0); /* kernel 1 pointer */ OUT_BATCH(0); OUT_BATCH(ksp2); OUT_BATCH(0); ADVANCE_BATCH(); }
static void brw_upload_cs_state(struct brw_context *brw) { if (!brw->cs.prog_data) return; uint32_t offset; uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 8 * 4, 64, &offset); struct brw_stage_state *stage_state = &brw->cs.base; struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data; struct brw_stage_prog_data *prog_data = &cs_prog_data->base; const struct brw_device_info *devinfo = brw->intelScreen->devinfo; if (INTEL_DEBUG & DEBUG_SHADER_TIME) { brw->vtbl.emit_buffer_surface_state( brw, &stage_state->surf_offset[ prog_data->binding_table.shader_time_start], brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW, brw->shader_time.bo->size, 1, true); } uint32_t *bind = (uint32_t*) brw_state_batch(brw, AUB_TRACE_BINDING_TABLE, prog_data->binding_table.size_bytes, 32, &stage_state->bind_bo_offset); uint32_t dwords = brw->gen < 8 ? 8 : 9; BEGIN_BATCH(dwords); OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2)); if (prog_data->total_scratch) { if (brw->gen >= 8) { /* Broadwell's Per Thread Scratch Space is in the range [0, 11] * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. */ OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(stage_state->per_thread_scratch) - 11); } else if (brw->is_haswell) { /* Haswell's Per Thread Scratch Space is in the range [0, 10] * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. */ OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(stage_state->per_thread_scratch) - 12); } else { /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB] * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. */ OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, stage_state->per_thread_scratch / 1024 - 1); } } else { OUT_BATCH(0); if (brw->gen >= 8) OUT_BATCH(0); } const uint32_t vfe_num_urb_entries = brw->gen >= 8 ? 2 : 0; const uint32_t vfe_gpgpu_mode = brw->gen == 7 ? SET_FIELD(1, GEN7_MEDIA_VFE_STATE_GPGPU_MODE) : 0; const uint32_t subslices = MAX2(brw->intelScreen->subslice_total, 1); OUT_BATCH(SET_FIELD(brw->max_cs_threads * subslices - 1, MEDIA_VFE_STATE_MAX_THREADS) | SET_FIELD(vfe_num_urb_entries, MEDIA_VFE_STATE_URB_ENTRIES) | SET_FIELD(1, MEDIA_VFE_STATE_RESET_GTW_TIMER) | SET_FIELD(1, MEDIA_VFE_STATE_BYPASS_GTW) | vfe_gpgpu_mode); OUT_BATCH(0); const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0; /* We are uploading duplicated copies of push constant uniforms for each * thread. Although the local id data needs to vary per thread, it won't * change for other uniform data. Unfortunately this duplication is * required for gen7. As of Haswell, this duplication can be avoided, but * this older mechanism with duplicated data continues to work. * * FINISHME: As of Haswell, we could make use of the * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length" field * to only store one copy of uniform data. * * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage" * which is described in the GPGPU_WALKER command and in the Broadwell PRM * Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of * Operations => GPGPU Mode => Indirect Payload Storage. * * Note: The constant data is built in brw_upload_cs_push_constants below. */ const uint32_t vfe_curbe_allocation = ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + cs_prog_data->push.cross_thread.regs, 2); OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) | SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); if (cs_prog_data->push.total.size > 0) { BEGIN_BATCH(4); OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2)); OUT_BATCH(0); OUT_BATCH(ALIGN(cs_prog_data->push.total.size, 64)); OUT_BATCH(stage_state->push_const_offset); ADVANCE_BATCH(); } /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */ memcpy(bind, stage_state->surf_offset, prog_data->binding_table.size_bytes); memset(desc, 0, 8 * 4); int dw = 0; desc[dw++] = brw->cs.base.prog_offset; if (brw->gen >= 8) desc[dw++] = 0; /* Kernel Start Pointer High */ desc[dw++] = 0; desc[dw++] = stage_state->sampler_offset | ((stage_state->sampler_count + 3) / 4); desc[dw++] = stage_state->bind_bo_offset; desc[dw++] = SET_FIELD(cs_prog_data->push.per_thread.regs, MEDIA_CURBE_READ_LENGTH); const uint32_t media_threads = brw->gen >= 8 ? SET_FIELD(cs_prog_data->threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) : SET_FIELD(cs_prog_data->threads, MEDIA_GPGPU_THREAD_COUNT); assert(cs_prog_data->threads <= brw->max_cs_threads); const uint32_t slm_size = encode_slm_size(devinfo->gen, prog_data->total_shared); desc[dw++] = SET_FIELD(cs_prog_data->uses_barrier, MEDIA_BARRIER_ENABLE) | SET_FIELD(slm_size, MEDIA_SHARED_LOCAL_MEMORY_SIZE) | media_threads; desc[dw++] = SET_FIELD(cs_prog_data->push.cross_thread.regs, CROSS_THREAD_READ_LENGTH); BEGIN_BATCH(4); OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2)); OUT_BATCH(0); OUT_BATCH(8 * 4); OUT_BATCH(offset); ADVANCE_BATCH(); }
/** * Used to initialize the alpha value of an ARGB8888 miptree after copying * into it from an XRGB8888 source. * * This is very common with glCopyTexImage2D(). Note that the coordinates are * relative to the start of the miptree, not relative to a slice within the * miptree. */ static void intel_miptree_set_alpha_to_one(struct brw_context *brw, struct intel_mipmap_tree *mt, int x, int y, int width, int height) { const struct gen_device_info *devinfo = &brw->screen->devinfo; uint32_t BR13, CMD; int pitch, cpp; pitch = mt->surf.row_pitch_B; cpp = mt->cpp; DBG("%s dst:buf(%p)/%d %d,%d sz:%dx%d\n", __func__, mt->bo, pitch, x, y, width, height); /* Note: Currently only handles 8 bit alpha channel. Extension to < 8 Bit * alpha channel would be likely possible via ROP code 0xfa instead of 0xf0 * and writing a suitable bit-mask instead of 0xffffffff. */ BR13 = br13_for_cpp(cpp) | 0xf0 << 16; CMD = XY_COLOR_BLT_CMD; CMD |= XY_BLT_WRITE_ALPHA; if (mt->surf.tiling != ISL_TILING_LINEAR) { CMD |= XY_DST_TILED; pitch /= 4; } BR13 |= pitch; /* do space check before going any further */ if (!brw_batch_has_aperture_space(brw, mt->bo->size)) intel_batchbuffer_flush(brw); unsigned length = devinfo->gen >= 8 ? 7 : 6; const bool dst_y_tiled = mt->surf.tiling == ISL_TILING_Y0; /* We need to split the blit into chunks that each fit within the blitter's * restrictions. We can't use a chunk size of 32768 because we need to * ensure that src_tile_x + chunk_size fits. We choose 16384 because it's * a nice round power of two, big enough that performance won't suffer, and * small enough to guarantee everything fits. */ const uint32_t max_chunk_size = 16384; for (uint32_t chunk_x = 0; chunk_x < width; chunk_x += max_chunk_size) { for (uint32_t chunk_y = 0; chunk_y < height; chunk_y += max_chunk_size) { const uint32_t chunk_w = MIN2(max_chunk_size, width - chunk_x); const uint32_t chunk_h = MIN2(max_chunk_size, height - chunk_y); uint32_t offset, tile_x, tile_y; get_blit_intratile_offset_el(brw, mt, x + chunk_x, y + chunk_y, &offset, &tile_x, &tile_y); BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, false); OUT_BATCH(CMD | (length - 2)); OUT_BATCH(BR13); OUT_BATCH(SET_FIELD(y + chunk_y, BLT_Y) | SET_FIELD(x + chunk_x, BLT_X)); OUT_BATCH(SET_FIELD(y + chunk_y + chunk_h, BLT_Y) | SET_FIELD(x + chunk_x + chunk_w, BLT_X)); if (devinfo->gen >= 8) { OUT_RELOC64(mt->bo, RELOC_WRITE, mt->offset + offset); } else { OUT_RELOC(mt->bo, RELOC_WRITE, mt->offset + offset); } OUT_BATCH(0xffffffff); /* white, but only alpha gets written */ ADVANCE_BATCH_TILED(dst_y_tiled, false); } } brw_emit_mi_flush(brw); }
/** * Helper function to emit depth related command packets. */ static void emit_depth_packets(struct brw_context *brw, struct intel_mipmap_tree *depth_mt, uint32_t depthbuffer_format, uint32_t depth_surface_type, bool depth_writable, struct intel_mipmap_tree *stencil_mt, bool stencil_writable, uint32_t stencil_offset, bool hiz, uint32_t width, uint32_t height, uint32_t depth, uint32_t lod, uint32_t min_array_element) { /* Skip repeated NULL depth/stencil emits (think 2D rendering). */ if (!depth_mt && !stencil_mt && brw->no_depth_or_stencil) { assert(brw->hw_ctx); return; } intel_emit_depth_stall_flushes(brw); /* _NEW_BUFFERS, _NEW_DEPTH, _NEW_STENCIL */ BEGIN_BATCH(8); OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (8 - 2)); OUT_BATCH(depth_surface_type << 29 | (depth_writable ? (1 << 28) : 0) | (stencil_mt != NULL && stencil_writable) << 27 | (hiz ? 1 : 0) << 22 | depthbuffer_format << 18 | (depth_mt ? depth_mt->region->pitch - 1 : 0)); if (depth_mt) { OUT_RELOC64(depth_mt->region->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); } else { OUT_BATCH(0); OUT_BATCH(0); } OUT_BATCH(((width - 1) << 4) | ((height - 1) << 18) | lod); OUT_BATCH(((depth - 1) << 21) | (min_array_element << 10) | BDW_MOCS_WB); OUT_BATCH(0); OUT_BATCH(depth_mt ? depth_mt->qpitch >> 2 : 0); ADVANCE_BATCH(); if (!hiz) { BEGIN_BATCH(5); OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (5 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } else { BEGIN_BATCH(5); OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (5 - 2)); OUT_BATCH((depth_mt->hiz_mt->region->pitch - 1) | BDW_MOCS_WB << 25); OUT_RELOC64(depth_mt->hiz_mt->region->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); OUT_BATCH(depth_mt->hiz_mt->qpitch >> 2); ADVANCE_BATCH(); } if (stencil_mt == NULL) { BEGIN_BATCH(5); OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } else { BEGIN_BATCH(5); OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2)); /* The stencil buffer has quirky pitch requirements. From the Graphics * BSpec: vol2a.11 3D Pipeline Windower > Early Depth/Stencil Processing * > Depth/Stencil Buffer State > 3DSTATE_STENCIL_BUFFER [DevIVB+], * field "Surface Pitch": * * The pitch must be set to 2x the value computed based on width, as * the stencil buffer is stored with two rows interleaved. * * (Note that it is not 100% clear whether this intended to apply to * Gen7; the BSpec flags this comment as "DevILK,DevSNB" (which would * imply that it doesn't), however the comment appears on a "DevIVB+" * page (which would imply that it does). Experiments with the hardware * indicate that it does. */ OUT_BATCH(HSW_STENCIL_ENABLED | BDW_MOCS_WB << 22 | (2 * stencil_mt->region->pitch - 1)); OUT_RELOC64(stencil_mt->region->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, stencil_offset); OUT_BATCH(stencil_mt ? stencil_mt->qpitch >> 2 : 0); ADVANCE_BATCH(); } BEGIN_BATCH(3); OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2)); OUT_BATCH(depth_mt ? depth_mt->depth_clear_value : 0); OUT_BATCH(1); ADVANCE_BATCH(); brw->no_depth_or_stencil = !depth_mt && !stencil_mt; }
/* Copy BitBlt */ bool intelEmitCopyBlit(struct brw_context *brw, GLuint cpp, GLshort src_pitch, drm_intel_bo *src_buffer, GLuint src_offset, uint32_t src_tiling, GLshort dst_pitch, drm_intel_bo *dst_buffer, GLuint dst_offset, uint32_t dst_tiling, GLshort src_x, GLshort src_y, GLshort dst_x, GLshort dst_y, GLshort w, GLshort h, GLenum logic_op) { GLuint CMD, BR13, pass = 0; int dst_y2 = dst_y + h; int dst_x2 = dst_x + w; drm_intel_bo *aper_array[3]; bool dst_y_tiled = dst_tiling == I915_TILING_Y; bool src_y_tiled = src_tiling == I915_TILING_Y; if (dst_tiling != I915_TILING_NONE) { if (dst_offset & 4095) return false; } if (src_tiling != I915_TILING_NONE) { if (src_offset & 4095) return false; } if ((dst_y_tiled || src_y_tiled) && brw->gen < 6) return false; /* do space check before going any further */ do { aper_array[0] = brw->batch.bo; aper_array[1] = dst_buffer; aper_array[2] = src_buffer; if (dri_bufmgr_check_aperture_space(aper_array, 3) != 0) { intel_batchbuffer_flush(brw); pass++; } else break; } while (pass < 2); if (pass >= 2) return false; intel_batchbuffer_require_space(brw, 8 * 4, BLT_RING); DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", __FUNCTION__, src_buffer, src_pitch, src_offset, src_x, src_y, dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h); /* Blit pitch must be dword-aligned. Otherwise, the hardware appears to drop * the low bits. */ if (src_pitch % 4 != 0 || dst_pitch % 4 != 0) return false; /* For big formats (such as floating point), do the copy using 16 or 32bpp * and multiply the coordinates. */ if (cpp > 4) { if (cpp % 4 == 2) { dst_x *= cpp / 2; dst_x2 *= cpp / 2; src_x *= cpp / 2; cpp = 2; } else { assert(cpp % 4 == 0); dst_x *= cpp / 4; dst_x2 *= cpp / 4; src_x *= cpp / 4; cpp = 4; } } BR13 = br13_for_cpp(cpp) | translate_raster_op(logic_op) << 16; switch (cpp) { case 1: case 2: CMD = XY_SRC_COPY_BLT_CMD; break; case 4: CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; break; default: return false; } if (dst_tiling != I915_TILING_NONE) { CMD |= XY_DST_TILED; dst_pitch /= 4; } if (src_tiling != I915_TILING_NONE) { CMD |= XY_SRC_TILED; src_pitch /= 4; } if (dst_y2 <= dst_y || dst_x2 <= dst_x) { return true; } assert(dst_x < dst_x2); assert(dst_y < dst_y2); assert(src_offset + (src_y + h - 1) * abs(src_pitch) + (w * cpp) <= src_buffer->size); assert(dst_offset + (dst_y + h - 1) * abs(dst_pitch) + (w * cpp) <= dst_buffer->size); unsigned length = brw->gen >= 8 ? 10 : 8; BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, src_y_tiled); OUT_BATCH(CMD | (length - 2)); OUT_BATCH(BR13 | (uint16_t)dst_pitch); OUT_BATCH(SET_FIELD(dst_y, BLT_Y) | SET_FIELD(dst_x, BLT_X)); OUT_BATCH(SET_FIELD(dst_y2, BLT_Y) | SET_FIELD(dst_x2, BLT_X)); if (brw->gen >= 8) { OUT_RELOC64(dst_buffer, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, dst_offset); } else { OUT_RELOC(dst_buffer, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, dst_offset); } OUT_BATCH(SET_FIELD(src_y, BLT_Y) | SET_FIELD(src_x, BLT_X)); OUT_BATCH((uint16_t)src_pitch); if (brw->gen >= 8) { OUT_RELOC64(src_buffer, I915_GEM_DOMAIN_RENDER, 0, src_offset); } else { OUT_RELOC(src_buffer, I915_GEM_DOMAIN_RENDER, 0, src_offset); } ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled); intel_batchbuffer_emit_mi_flush(brw); return true; }
/** * Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS). */ uint32_t * brw_emit_vertex_buffer_state(struct brw_context *brw, unsigned buffer_nr, drm_intel_bo *bo, unsigned start_offset, unsigned end_offset, unsigned stride, unsigned step_rate, uint32_t *__map) { struct gl_context *ctx = &brw->ctx; uint32_t dw0; if (brw->gen >= 8) { dw0 = buffer_nr << GEN6_VB0_INDEX_SHIFT; } else if (brw->gen >= 6) { dw0 = (buffer_nr << GEN6_VB0_INDEX_SHIFT) | (step_rate ? GEN6_VB0_ACCESS_INSTANCEDATA : GEN6_VB0_ACCESS_VERTEXDATA); } else { dw0 = (buffer_nr << BRW_VB0_INDEX_SHIFT) | (step_rate ? BRW_VB0_ACCESS_INSTANCEDATA : BRW_VB0_ACCESS_VERTEXDATA); } if (brw->gen >= 7) dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE; switch (brw->gen) { case 7: dw0 |= GEN7_MOCS_L3 << 16; break; case 8: dw0 |= BDW_MOCS_WB << 16; break; case 9: dw0 |= SKL_MOCS_WB << 16; break; } WARN_ONCE(stride >= (brw->gen >= 5 ? 2048 : 2047), "VBO stride %d too large, bad rendering may occur\n", stride); OUT_BATCH(dw0 | (stride << BRW_VB0_PITCH_SHIFT)); if (brw->gen >= 8) { OUT_RELOC64(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset); /* From the BSpec: 3D Pipeline Stages - 3D Pipeline Geometry - * Vertex Fetch (VF) Stage - State * * Instead of "VBState.StartingBufferAddress + VBState.MaxIndex x * VBState.BufferPitch", the address of the byte immediately beyond the * last valid byte of the buffer is determined by * "VBState.StartingBufferAddress + VBState.BufferSize". */ OUT_BATCH(end_offset - start_offset); } else if (brw->gen >= 5) { OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset); /* From the BSpec: 3D Pipeline Stages - 3D Pipeline Geometry - * Vertex Fetch (VF) Stage - State * * Instead of "VBState.StartingBufferAddress + VBState.MaxIndex x * VBState.BufferPitch", the address of the byte immediately beyond the * last valid byte of the buffer is determined by * "VBState.EndAddress + 1". */ OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, end_offset - 1); OUT_BATCH(step_rate); } else { OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset); OUT_BATCH(0); OUT_BATCH(step_rate); } return __map; }
static void gen8_emit_vertices(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; bool uses_edge_flag; brw_prepare_vertices(brw); brw_prepare_shader_draw_parameters(brw); uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL || ctx->Polygon.BackMode != GL_FILL); if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid) { unsigned vue = brw->vb.nr_enabled; /* The element for the edge flags must always be last, so we have to * insert the SGVS before it in that case. */ if (uses_edge_flag) { assert(vue > 0); vue--; } WARN_ONCE(vue >= 33, "Trying to insert VID/IID past 33rd vertex element, " "need to reorder the vertex attrbutes."); unsigned dw1 = 0; if (brw->vs.prog_data->uses_vertexid) { dw1 |= GEN8_SGVS_ENABLE_VERTEX_ID | (2 << GEN8_SGVS_VERTEX_ID_COMPONENT_SHIFT) | /* .z channel */ (vue << GEN8_SGVS_VERTEX_ID_ELEMENT_OFFSET_SHIFT); } if (brw->vs.prog_data->uses_instanceid) { dw1 |= GEN8_SGVS_ENABLE_INSTANCE_ID | (3 << GEN8_SGVS_INSTANCE_ID_COMPONENT_SHIFT) | /* .w channel */ (vue << GEN8_SGVS_INSTANCE_ID_ELEMENT_OFFSET_SHIFT); } BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2)); OUT_BATCH(dw1); ADVANCE_BATCH(); BEGIN_BATCH(3); OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2)); OUT_BATCH(vue | GEN8_VF_INSTANCING_ENABLE); OUT_BATCH(0); ADVANCE_BATCH(); } else { BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2)); OUT_BATCH(0); ADVANCE_BATCH(); } /* If the VS doesn't read any inputs (calculating vertex position from * a state variable for some reason, for example), emit a single pad * VERTEX_ELEMENT struct and bail. * * The stale VB state stays in place, but they don't do anything unless * a VE loads from them. */ if (brw->vb.nr_enabled == 0) { BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (3 - 2)); OUT_BATCH((0 << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) | (0 << BRW_VE0_SRC_OFFSET_SHIFT)); OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT)); ADVANCE_BATCH(); return; } /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */ unsigned nr_buffers = brw->vb.nr_buffers + brw->vs.prog_data->uses_vertexid; if (nr_buffers) { assert(nr_buffers <= 33); BEGIN_BATCH(1 + 4 * nr_buffers); OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1)); for (unsigned i = 0; i < brw->vb.nr_buffers; i++) { struct brw_vertex_buffer *buffer = &brw->vb.buffers[i]; uint32_t dw0 = 0; dw0 |= i << GEN6_VB0_INDEX_SHIFT; dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE; dw0 |= buffer->stride << BRW_VB0_PITCH_SHIFT; dw0 |= mocs_wb << 16; OUT_BATCH(dw0); OUT_RELOC64(buffer->bo, I915_GEM_DOMAIN_VERTEX, 0, buffer->offset); OUT_BATCH(buffer->bo->size); } if (brw->vs.prog_data->uses_vertexid) { OUT_BATCH(brw->vb.nr_buffers << GEN6_VB0_INDEX_SHIFT | GEN7_VB0_ADDRESS_MODIFYENABLE | mocs_wb << 16); OUT_RELOC64(brw->draw.draw_params_bo, I915_GEM_DOMAIN_VERTEX, 0, brw->draw.draw_params_offset); OUT_BATCH(brw->draw.draw_params_bo->size); } ADVANCE_BATCH(); } /* Normally we don't need an element for the SGVS attribute because the * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if the * vertex ID is used then it needs an element for the base vertex buffer. * Additionally if there is an edge flag element then the SGVS can't be * inserted past that so we need a dummy element to ensure that the edge * flag is the last one. */ bool needs_sgvs_element = (brw->vs.prog_data->uses_vertexid || (brw->vs.prog_data->uses_instanceid && uses_edge_flag)); unsigned nr_elements = brw->vb.nr_enabled + needs_sgvs_element; /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS, * presumably for VertexID/InstanceID. */ assert(nr_elements <= 34); struct brw_vertex_element *gen6_edgeflag_input = NULL; BEGIN_BATCH(1 + nr_elements * 2); OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (2 * nr_elements - 1)); for (unsigned i = 0; i < brw->vb.nr_enabled; i++) { struct brw_vertex_element *input = brw->vb.enabled[i]; uint32_t format = brw_get_vertex_surface_type(brw, input->glarray); uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC; uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC; uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC; uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC; /* The gen4 driver expects edgeflag to come in as a float, and passes * that float on to the tests in the clipper. Mesa's current vertex * attribute value for EdgeFlag is stored as a float, which works out. * glEdgeFlagPointer, on the other hand, gives us an unnormalized * integer ubyte. Just rewrite that to convert to a float. */ if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) { /* Gen6+ passes edgeflag as sideband along with the vertex, instead * of in the VUE. We have to upload it sideband as the last vertex * element according to the B-Spec. */ gen6_edgeflag_input = input; continue; } switch (input->glarray->Size) { case 0: comp0 = BRW_VE1_COMPONENT_STORE_0; case 1: comp1 = BRW_VE1_COMPONENT_STORE_0; case 2: comp2 = BRW_VE1_COMPONENT_STORE_0; case 3: comp3 = input->glarray->Integer ? BRW_VE1_COMPONENT_STORE_1_INT : BRW_VE1_COMPONENT_STORE_1_FLT; break; } OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | (format << BRW_VE0_FORMAT_SHIFT) | (input->offset << BRW_VE0_SRC_OFFSET_SHIFT)); OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) | (comp1 << BRW_VE1_COMPONENT_1_SHIFT) | (comp2 << BRW_VE1_COMPONENT_2_SHIFT) | (comp3 << BRW_VE1_COMPONENT_3_SHIFT)); } if (needs_sgvs_element) { if (brw->vs.prog_data->uses_vertexid) { OUT_BATCH(GEN6_VE0_VALID | brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT | BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT); OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT)); } else { OUT_BATCH(GEN6_VE0_VALID); OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT)); } } if (gen6_edgeflag_input) { uint32_t format = brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray); OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | GEN6_VE0_EDGE_FLAG_ENABLE | (format << BRW_VE0_FORMAT_SHIFT) | (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT)); OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT)); } ADVANCE_BATCH(); for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) { const struct brw_vertex_element *input = brw->vb.enabled[i]; const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer]; unsigned element_index; /* The edge flag element is reordered to be the last one in the code * above so we need to compensate for that in the element indices used * below. */ if (input == gen6_edgeflag_input) element_index = nr_elements - 1; else element_index = j++; BEGIN_BATCH(3); OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2)); OUT_BATCH(element_index | (buffer->step_rate ? GEN8_VF_INSTANCING_ENABLE : 0)); OUT_BATCH(buffer->step_rate); ADVANCE_BATCH(); } }
static void gen8_upload_gs_state(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct brw_stage_state *stage_state = &brw->gs.base; /* BRW_NEW_GEOMETRY_PROGRAM */ bool active = brw->geometry_program; /* CACHE_NEW_GS_PROG */ const struct brw_vec4_prog_data *prog_data = &brw->gs.prog_data->base; if (active) { int urb_entry_write_offset = 1; uint32_t urb_entry_output_length = ((prog_data->vue_map.num_slots + 1) / 2 - urb_entry_write_offset); if (urb_entry_output_length == 0) urb_entry_output_length = 1; BEGIN_BATCH(10); OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2)); OUT_BATCH(stage_state->prog_offset); OUT_BATCH(0); OUT_BATCH(GEN6_GS_VECTOR_MASK_ENABLE | brw->geometry_program->VerticesIn | ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (brw->gs.prog_data->base.base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(brw->gs.prog_data->base.base.total_scratch) - 11); WARN_ONCE(true, "May need to implement a temporary workaround: GS Number of " "URB Entries must be less than or equal to the GS Maximum " "Number of Threads.\n"); } else { OUT_BATCH(0); OUT_BATCH(0); } /* DW6 */ OUT_BATCH(((brw->gs.prog_data->output_vertex_size_hwords * 2 - 1) << GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT) | (brw->gs.prog_data->output_topology << GEN7_GS_OUTPUT_TOPOLOGY_SHIFT) | (prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT) | (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) | (prog_data->base.dispatch_grf_start_reg << GEN6_GS_DISPATCH_START_GRF_SHIFT)); /* DW7 */ OUT_BATCH(((brw->max_gs_threads / 2 - 1) << HSW_GS_MAX_THREADS_SHIFT) | (brw->gs.prog_data->control_data_header_size_hwords << GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) | brw->gs.prog_data->dispatch_mode | GEN6_GS_STATISTICS_ENABLE | (brw->gs.prog_data->include_primitive_id ? GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) | GEN7_GS_REORDER_TRAILING | GEN7_GS_ENABLE); /* DW8 */ OUT_BATCH(brw->gs.prog_data->control_data_format << HSW_GS_CONTROL_DATA_FORMAT_SHIFT); /* DW9 / _NEW_TRANSFORM */ OUT_BATCH((ctx->Transform.ClipPlanesEnabled << GEN8_GS_USER_CLIP_DISTANCE_SHIFT) | (urb_entry_output_length << GEN8_GS_URB_OUTPUT_LENGTH_SHIFT) | (urb_entry_write_offset << GEN8_GS_URB_ENTRY_OUTPUT_OFFSET_SHIFT)); ADVANCE_BATCH(); } else { BEGIN_BATCH(10); OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2)); OUT_BATCH(0); /* prog_bo */ OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); /* scratch space base offset */ OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(GEN6_GS_STATISTICS_ENABLE); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } }
bool intelEmitImmediateColorExpandBlit(struct brw_context *brw, GLuint cpp, GLubyte *src_bits, GLuint src_size, GLuint fg_color, GLshort dst_pitch, struct brw_bo *dst_buffer, GLuint dst_offset, enum isl_tiling dst_tiling, GLshort x, GLshort y, GLshort w, GLshort h, enum gl_logicop_mode logic_op) { const struct gen_device_info *devinfo = &brw->screen->devinfo; int dwords = ALIGN(src_size, 8) / 4; uint32_t opcode, br13, blit_cmd; if (dst_tiling != ISL_TILING_LINEAR) { if (dst_offset & 4095) return false; if (dst_tiling == ISL_TILING_Y0) return false; } assert((unsigned) logic_op <= 0x0f); assert(dst_pitch > 0); if (w < 0 || h < 0) return true; DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d, %d bytes %d dwords\n", __func__, dst_buffer, dst_pitch, dst_offset, x, y, w, h, src_size, dwords); unsigned xy_setup_blt_length = devinfo->gen >= 8 ? 10 : 8; intel_batchbuffer_require_space(brw, (xy_setup_blt_length * 4) + (3 * 4) + dwords * 4); opcode = XY_SETUP_BLT_CMD; if (cpp == 4) opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; if (dst_tiling != ISL_TILING_LINEAR) { opcode |= XY_DST_TILED; dst_pitch /= 4; } br13 = dst_pitch | (translate_raster_op(logic_op) << 16) | (1 << 29); br13 |= br13_for_cpp(cpp); blit_cmd = XY_TEXT_IMMEDIATE_BLIT_CMD | XY_TEXT_BYTE_PACKED; /* packing? */ if (dst_tiling != ISL_TILING_LINEAR) blit_cmd |= XY_DST_TILED; BEGIN_BATCH_BLT(xy_setup_blt_length + 3); OUT_BATCH(opcode | (xy_setup_blt_length - 2)); OUT_BATCH(br13); OUT_BATCH((0 << 16) | 0); /* clip x1, y1 */ OUT_BATCH((100 << 16) | 100); /* clip x2, y2 */ if (devinfo->gen >= 8) { OUT_RELOC64(dst_buffer, RELOC_WRITE, dst_offset); } else { OUT_RELOC(dst_buffer, RELOC_WRITE, dst_offset); } OUT_BATCH(0); /* bg */ OUT_BATCH(fg_color); /* fg */ OUT_BATCH(0); /* pattern base addr */ if (devinfo->gen >= 8) OUT_BATCH(0); OUT_BATCH(blit_cmd | ((3 - 2) + dwords)); OUT_BATCH(SET_FIELD(y, BLT_Y) | SET_FIELD(x, BLT_X)); OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X)); ADVANCE_BATCH(); intel_batchbuffer_data(brw, src_bits, dwords * 4); brw_emit_mi_flush(brw); return true; }
/** * Define the base addresses which some state is referenced from. * * This allows us to avoid having to emit relocations for the objects, * and is actually required for binding table pointers on gen6. * * Surface state base address covers binding table pointers and * surface state objects, but not the surfaces that the surface state * objects point to. */ void brw_upload_state_base_address(struct brw_context *brw) { if (brw->batch.state_base_address_emitted) return; /* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be * programmed prior to STATE_BASE_ADDRESS. * * However, given that the instruction SBA (general state base * address) on this chipset is always set to 0 across X and GL, * maybe this isn't required for us in particular. */ if (brw->gen >= 8) { uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; int pkt_len = brw->gen >= 9 ? 19 : 16; BEGIN_BATCH(pkt_len); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (pkt_len - 2)); /* General state base address: stateless DP read/write requests */ OUT_BATCH(mocs_wb << 4 | 1); OUT_BATCH(0); OUT_BATCH(mocs_wb << 16); /* Surface state base address: */ OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, mocs_wb << 4 | 1); /* Dynamic state base address: */ OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0, mocs_wb << 4 | 1); /* Indirect object base address: MEDIA_OBJECT data */ OUT_BATCH(mocs_wb << 4 | 1); OUT_BATCH(0); /* Instruction base address: shader kernels (incl. SIP) */ OUT_RELOC64(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, mocs_wb << 4 | 1); /* General state buffer size */ OUT_BATCH(0xfffff001); /* Dynamic state buffer size */ OUT_BATCH(ALIGN(brw->batch.bo->size, 4096) | 1); /* Indirect object upper bound */ OUT_BATCH(0xfffff001); /* Instruction access upper bound */ OUT_BATCH(ALIGN(brw->cache.bo->size, 4096) | 1); if (brw->gen >= 9) { OUT_BATCH(1); OUT_BATCH(0); OUT_BATCH(0); } ADVANCE_BATCH(); } else if (brw->gen >= 6) { uint8_t mocs = brw->gen == 7 ? GEN7_MOCS_L3 : 0; BEGIN_BATCH(10); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2)); OUT_BATCH(mocs << 8 | /* General State Memory Object Control State */ mocs << 4 | /* Stateless Data Port Access Memory Object Control State */ 1); /* General State Base Address Modify Enable */ /* Surface state base address: * BINDING_TABLE_STATE * SURFACE_STATE */ OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1); /* Dynamic state base address: * SAMPLER_STATE * SAMPLER_BORDER_COLOR_STATE * CLIP, SF, WM/CC viewport state * COLOR_CALC_STATE * DEPTH_STENCIL_STATE * BLEND_STATE * Push constants (when INSTPM: CONSTANT_BUFFER Address Offset * Disable is clear, which we rely on) */ OUT_RELOC(brw->batch.bo, (I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION), 0, 1); OUT_BATCH(1); /* Indirect object base address: MEDIA_OBJECT data */ OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); /* Instruction base address: shader kernels (incl. SIP) */ OUT_BATCH(1); /* General state upper bound */ /* Dynamic state upper bound. Although the documentation says that * programming it to zero will cause it to be ignored, that is a lie. * If this isn't programmed to a real bound, the sampler border color * pointer is rejected, causing border color to mysteriously fail. */ OUT_BATCH(0xfffff001); OUT_BATCH(1); /* Indirect object upper bound */ OUT_BATCH(1); /* Instruction access upper bound */ ADVANCE_BATCH(); } else if (brw->gen == 5) { BEGIN_BATCH(8); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2)); OUT_BATCH(1); /* General state base address */ OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1); /* Surface state base address */ OUT_BATCH(1); /* Indirect object base address */ OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); /* Instruction base address */ OUT_BATCH(0xfffff001); /* General state upper bound */ OUT_BATCH(1); /* Indirect object upper bound */ OUT_BATCH(1); /* Instruction access upper bound */ ADVANCE_BATCH(); } else { BEGIN_BATCH(6); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2)); OUT_BATCH(1); /* General state base address */ OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1); /* Surface state base address */ OUT_BATCH(1); /* Indirect object base address */ OUT_BATCH(1); /* General state upper bound */ OUT_BATCH(1); /* Indirect object upper bound */ ADVANCE_BATCH(); } /* According to section 3.6.1 of VOL1 of the 965 PRM, * STATE_BASE_ADDRESS updates require a reissue of: * * 3DSTATE_PIPELINE_POINTERS * 3DSTATE_BINDING_TABLE_POINTERS * MEDIA_STATE_POINTERS * * and this continues through Ironlake. The Sandy Bridge PRM, vol * 1 part 1 says that the folowing packets must be reissued: * * 3DSTATE_CC_POINTERS * 3DSTATE_BINDING_TABLE_POINTERS * 3DSTATE_SAMPLER_STATE_POINTERS * 3DSTATE_VIEWPORT_STATE_POINTERS * MEDIA_STATE_POINTERS * * Those are always reissued following SBA updates anyway (new * batch time), except in the case of the program cache BO * changing. Having a separate state flag makes the sequence more * obvious. */ brw->ctx.NewDriverState |= BRW_NEW_STATE_BASE_ADDRESS; brw->batch.state_base_address_emitted = true; }
bool intelEmitImmediateColorExpandBlit(struct brw_context *brw, GLuint cpp, GLubyte *src_bits, GLuint src_size, GLuint fg_color, GLshort dst_pitch, drm_intel_bo *dst_buffer, GLuint dst_offset, uint32_t dst_tiling, GLshort x, GLshort y, GLshort w, GLshort h, GLenum logic_op) { int dwords = ALIGN(src_size, 8) / 4; uint32_t opcode, br13, blit_cmd; if (dst_tiling != I915_TILING_NONE) { if (dst_offset & 4095) return false; if (dst_tiling == I915_TILING_Y) return false; } assert((logic_op >= GL_CLEAR) && (logic_op <= (GL_CLEAR + 0x0f))); assert(dst_pitch > 0); if (w < 0 || h < 0) return true; DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d, %d bytes %d dwords\n", __FUNCTION__, dst_buffer, dst_pitch, dst_offset, x, y, w, h, src_size, dwords); intel_batchbuffer_require_space(brw, (8 * 4) + (3 * 4) + dwords * 4, BLT_RING); opcode = XY_SETUP_BLT_CMD; if (cpp == 4) opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; if (dst_tiling != I915_TILING_NONE) { opcode |= XY_DST_TILED; dst_pitch /= 4; } br13 = dst_pitch | (translate_raster_op(logic_op) << 16) | (1 << 29); br13 |= br13_for_cpp(cpp); blit_cmd = XY_TEXT_IMMEDIATE_BLIT_CMD | XY_TEXT_BYTE_PACKED; /* packing? */ if (dst_tiling != I915_TILING_NONE) blit_cmd |= XY_DST_TILED; unsigned xy_setup_blt_length = brw->gen >= 8 ? 10 : 8; BEGIN_BATCH_BLT(xy_setup_blt_length + 3); OUT_BATCH(opcode | (xy_setup_blt_length - 2)); OUT_BATCH(br13); OUT_BATCH((0 << 16) | 0); /* clip x1, y1 */ OUT_BATCH((100 << 16) | 100); /* clip x2, y2 */ if (brw->gen >= 8) { OUT_RELOC64(dst_buffer, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, dst_offset); } else { OUT_RELOC(dst_buffer, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, dst_offset); } OUT_BATCH(0); /* bg */ OUT_BATCH(fg_color); /* fg */ OUT_BATCH(0); /* pattern base addr */ if (brw->gen >= 8) OUT_BATCH(0); OUT_BATCH(blit_cmd | ((3 - 2) + dwords)); OUT_BATCH(SET_FIELD(y, BLT_Y) | SET_FIELD(x, BLT_X)); OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X)); ADVANCE_BATCH(); intel_batchbuffer_data(brw, src_bits, dwords * 4, BLT_RING); intel_batchbuffer_emit_mi_flush(brw); return true; }