/**
 * Compute the number of primitives written during our most recent
 * transform feedback activity (the current SO_NUM_PRIMS_WRITTEN value
 * minus the stashed "start" value), and add it to our running tally.
 *
 * If \p finalize is true, also compute the number of vertices written
 * (by multiplying by the number of vertices per primitive), and store
 * that to the "final" location.
 *
 * Otherwise, just overwrite the old tally with the new one.
 */
static void
tally_prims_written(struct brw_context *brw,
                    struct brw_transform_feedback_object *obj,
                    bool finalize)
{
   /* Flush any drawing so that the counters have the right values. */
   brw_emit_mi_flush(brw);

   for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
      /* GPR0 = Tally (zero the high dword, then load the 32-bit tally). */
      brw_load_register_imm32(brw, HSW_CS_GPR(0) + 4, 0);
      brw_load_register_mem(brw, HSW_CS_GPR(0), obj->prim_count_bo,
                            I915_GEM_DOMAIN_INSTRUCTION,
                            I915_GEM_DOMAIN_INSTRUCTION,
                            TALLY_OFFSET + i * sizeof(uint32_t));

      if (!obj->base.Paused) {
         /* GPR1 = Start Snapshot */
         brw_load_register_mem64(brw, HSW_CS_GPR(1), obj->prim_count_bo,
                                 I915_GEM_DOMAIN_INSTRUCTION,
                                 I915_GEM_DOMAIN_INSTRUCTION,
                                 START_OFFSET + i * sizeof(uint64_t));

         /* GPR2 = Ending Snapshot */
         brw_load_register_reg64(brw, GEN7_SO_NUM_PRIMS_WRITTEN(i),
                                 HSW_CS_GPR(2));

         BEGIN_BATCH(9);
         OUT_BATCH(HSW_MI_MATH | (9 - 2));
         /* GPR1 = GPR2 (End) - GPR1 (Start) */
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R2));
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
         OUT_BATCH(MI_MATH_ALU0(SUB));
         OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
         /* GPR0 = GPR0 (Tally) + GPR1 (Diff) */
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
         OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
         OUT_BATCH(MI_MATH_ALU0(ADD));
         OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
         ADVANCE_BATCH();
      }

      if (!finalize) {
         /* Write back the new tally */
         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
                                  TALLY_OFFSET + i * sizeof(uint32_t));
      } else {
         /* Convert the number of primitives to the number of vertices. */
         if (obj->primitive_mode == GL_LINES) {
            /* Double R0 (R0 = R0 + R0) */
            BEGIN_BATCH(5);
            OUT_BATCH(HSW_MI_MATH | (5 - 2));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
            OUT_BATCH(MI_MATH_ALU0(ADD));
            OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
            ADVANCE_BATCH();
         } else if (obj->primitive_mode == GL_TRIANGLES) {
            /* Triple R0 (R1 = R0 + R0, R0 = R0 + R1) */
            BEGIN_BATCH(9);
            OUT_BATCH(HSW_MI_MATH | (9 - 2));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R0));
            OUT_BATCH(MI_MATH_ALU0(ADD));
            OUT_BATCH(MI_MATH_ALU2(STORE, R1, ACCU));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCA, R0));
            OUT_BATCH(MI_MATH_ALU2(LOAD, SRCB, R1));
            OUT_BATCH(MI_MATH_ALU0(ADD));
            OUT_BATCH(MI_MATH_ALU2(STORE, R0, ACCU));
            ADVANCE_BATCH();
         }
         /* Store it to the final result */
         brw_store_register_mem32(brw, obj->prim_count_bo, HSW_CS_GPR(0),
                                  i * sizeof(uint32_t));
      }
   }
}
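/* For illustration only: a hedged CPU-side sketch of what the MI_MATH
 * program above computes for a single stream.  The function name and
 * parameters are hypothetical; they stand in for the GPU-side snapshots
 * kept in prim_count_bo.  In the real path all of this runs on the GPU
 * so the CPU never has to stall waiting for the counter values.
 */
static inline uint32_t
tally_prims_written_sketch(uint32_t tally, uint64_t start_snapshot,
                           uint64_t end_snapshot, GLenum primitive_mode,
                           bool finalize)
{
   /* GPR0 = Tally + (End - Start) */
   uint32_t result = tally + (uint32_t)(end_snapshot - start_snapshot);

   if (finalize) {
      /* Convert primitives to vertices: points stay as-is, lines are
       * doubled, triangles are tripled.  This mirrors the ADD-based
       * MI_MATH sequences above, which exist because the MI_MATH ALU
       * has no multiply instruction.
       */
      if (primitive_mode == GL_LINES)
         result *= 2;
      else if (primitive_mode == GL_TRIANGLES)
         result *= 3;
   }
   return result;
}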
static void
brw_emit_prim(struct brw_context *brw,
              const struct _mesa_prim *prim,
              uint32_t hw_prim)
{
   int verts_per_instance;
   int vertex_access_type;
   int indirect_flag;

   DBG("PRIM: %s %d %d\n", _mesa_enum_to_string(prim->mode),
       prim->start, prim->count);

   int start_vertex_location = prim->start;
   int base_vertex_location = prim->basevertex;

   if (prim->indexed) {
      vertex_access_type = brw->gen >= 7 ?
         GEN7_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM :
         GEN4_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM;
      start_vertex_location += brw->ib.start_vertex_offset;
      base_vertex_location += brw->vb.start_vertex_bias;
   } else {
      vertex_access_type = brw->gen >= 7 ?
         GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL :
         GEN4_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL;
      start_vertex_location += brw->vb.start_vertex_bias;
   }

   /* We only need to trim the primitive count on pre-Gen6. */
   if (brw->gen < 6)
      verts_per_instance = trim(prim->mode, prim->count);
   else
      verts_per_instance = prim->count;

   /* If nothing to emit, just return. */
   if (verts_per_instance == 0 && !prim->is_indirect)
      return;

   /* If we're set to always flush, do it before and after the primitive emit.
    * We want to catch both missed flushes that hurt instruction/state cache
    * and missed flushes of the render cache as it heads to other parts of
    * the GPU besides the draw code.
    */
   if (brw->always_flush_cache)
      brw_emit_mi_flush(brw);

   /* If indirect, emit a bunch of loads from the indirect BO. */
   if (prim->is_indirect) {
      struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer;
      drm_intel_bo *bo =
         intel_bufferobj_buffer(brw, intel_buffer_object(indirect_buffer),
                                prim->indirect_offset, 5 * sizeof(GLuint));

      indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE;

      brw_load_register_mem(brw, GEN7_3DPRIM_VERTEX_COUNT, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            prim->indirect_offset + 0);
      brw_load_register_mem(brw, GEN7_3DPRIM_INSTANCE_COUNT, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            prim->indirect_offset + 4);

      brw_load_register_mem(brw, GEN7_3DPRIM_START_VERTEX, bo,
                            I915_GEM_DOMAIN_VERTEX, 0,
                            prim->indirect_offset + 8);
      if (prim->indexed) {
         brw_load_register_mem(brw, GEN7_3DPRIM_BASE_VERTEX, bo,
                               I915_GEM_DOMAIN_VERTEX, 0,
                               prim->indirect_offset + 12);
         brw_load_register_mem(brw, GEN7_3DPRIM_START_INSTANCE, bo,
                               I915_GEM_DOMAIN_VERTEX, 0,
                               prim->indirect_offset + 16);
      } else {
         brw_load_register_mem(brw, GEN7_3DPRIM_START_INSTANCE, bo,
                               I915_GEM_DOMAIN_VERTEX, 0,
                               prim->indirect_offset + 12);

         /* Non-indexed indirect draws carry no base vertex, so force the
          * register to zero explicitly.
          */
         BEGIN_BATCH(3);
         OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
         OUT_BATCH(GEN7_3DPRIM_BASE_VERTEX);
         OUT_BATCH(0);
         ADVANCE_BATCH();
      }
   } else {
      indirect_flag = 0;
   }

   BEGIN_BATCH(brw->gen >= 7 ? 7 : 6);

   if (brw->gen >= 7) {
      const int predicate_enable =
         (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT) ?
         GEN7_3DPRIM_PREDICATE_ENABLE : 0;

      OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag |
                predicate_enable);
      OUT_BATCH(hw_prim | vertex_access_type);
   } else {
      OUT_BATCH(CMD_3D_PRIM << 16 | (6 - 2) |
                hw_prim << GEN4_3DPRIM_TOPOLOGY_TYPE_SHIFT |
                vertex_access_type);
   }
   OUT_BATCH(verts_per_instance);
   OUT_BATCH(start_vertex_location);
   OUT_BATCH(prim->num_instances);
   OUT_BATCH(prim->base_instance);
   OUT_BATCH(base_vertex_location);
   ADVANCE_BATCH();

   if (brw->always_flush_cache)
      brw_emit_mi_flush(brw);
}
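/* For reference, the indirect register loads above follow the parameter
 * layouts defined by ARB_draw_indirect / GL 4.x.  The struct definitions
 * below are a sketch reproduced from the GL spec (the typedef names come
 * from the spec, not from this driver); they explain the 0/4/8/12/16 byte
 * offsets and why the non-indexed path has to zero 3DPRIM_BASE_VERTEX by
 * hand: DrawArraysIndirectCommand simply has no baseVertex field.
 */
typedef struct {
   GLuint count;          /* -> GEN7_3DPRIM_VERTEX_COUNT   (offset +0)  */
   GLuint instanceCount;  /* -> GEN7_3DPRIM_INSTANCE_COUNT (offset +4)  */
   GLuint first;          /* -> GEN7_3DPRIM_START_VERTEX   (offset +8)  */
   GLuint baseInstance;   /* -> GEN7_3DPRIM_START_INSTANCE (offset +12) */
} DrawArraysIndirectCommand;

typedef struct {
   GLuint count;          /* -> GEN7_3DPRIM_VERTEX_COUNT   (offset +0)  */
   GLuint instanceCount;  /* -> GEN7_3DPRIM_INSTANCE_COUNT (offset +4)  */
   GLuint firstIndex;     /* -> GEN7_3DPRIM_START_VERTEX   (offset +8)  */
   GLint  baseVertex;     /* -> GEN7_3DPRIM_BASE_VERTEX    (offset +12) */
   GLuint baseInstance;   /* -> GEN7_3DPRIM_START_INSTANCE (offset +16) */
} DrawElementsIndirectCommand;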