void brw_debug_compact_uncompact(struct intel_context *intel, struct brw_instruction *orig, struct brw_instruction *uncompacted) { fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n", intel->gen); fprintf(stderr, " before: "); brw_disasm(stderr, orig, intel->gen); fprintf(stderr, " after: "); brw_disasm(stderr, uncompacted, intel->gen); uint32_t *before_bits = (uint32_t *)orig; uint32_t *after_bits = (uint32_t *)uncompacted; printf(" changed bits:\n"); for (int i = 0; i < 128; i++) { uint32_t before = before_bits[i / 32] & (1 << (i & 31)); uint32_t after = after_bits[i / 32] & (1 << (i & 31)); if (before != after) { printf(" bit %d, %s to %s\n", i, before ? "set" : "unset", after ? "set" : "unset"); } } }
static void wm_src_sample_argb(struct brw_compile *p) { static const uint32_t fragment[][4] = { #include "exa_wm_src_affine.g6b" #include "exa_wm_src_sample_argb.g6b" #include "exa_wm_write.g6b" }; int n; brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_set_compression_control(p, BRW_COMPRESSION_NONE); brw_MOV(p, retype(brw_vec1_grf(0,2), BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); brw_pop_insn_state(p); brw_SAMPLE(p, retype(vec16(brw_vec8_grf(14, 0)), BRW_REGISTER_TYPE_UW), 1, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD), 1, 0, WRITEMASK_XYZW, GEN5_SAMPLER_MESSAGE_SAMPLE, 8, 5, true, BRW_SAMPLER_SIMD_MODE_SIMD16); for (n = 0; n < p->nr_insn; n++) { brw_disasm(stdout, &p->store[n], 60); } printf("\n\n"); for (n = 0; n < ARRAY_SIZE(fragment); n++) { brw_disasm(stdout, (const struct brw_instruction *)&fragment[n][0], 60); } }
void brw_dump_compile(struct brw_compile *p, FILE *out, int start, int end) { struct brw_context *brw = p->brw; struct intel_context *intel = &brw->intel; void *store = p->store; bool dump_hex = false; for (int offset = start; offset < end;) { struct brw_instruction *insn = store + offset; struct brw_instruction uncompacted; printf("0x%08x: ", offset); if (insn->header.cmpt_control) { struct brw_compact_instruction *compacted = (void *)insn; if (dump_hex) { printf("0x%08x 0x%08x ", ((uint32_t *)insn)[1], ((uint32_t *)insn)[0]); } brw_uncompact_instruction(intel, &uncompacted, compacted); insn = &uncompacted; offset += 8; } else { if (dump_hex) { printf("0x%08x 0x%08x 0x%08x 0x%08x ", ((uint32_t *)insn)[3], ((uint32_t *)insn)[2], ((uint32_t *)insn)[1], ((uint32_t *)insn)[0]); } offset += 16; } brw_disasm(stdout, insn, p->brw->intel.gen); } }
static void compile_gs_prog( struct brw_context *brw, struct brw_gs_prog_key *key ) { struct intel_context *intel = &brw->intel; struct brw_gs_compile c; const GLuint *program; void *mem_ctx; GLuint program_size; /* Gen6: VF has already converted into polygon, and LINELOOP is * converted to LINESTRIP at the beginning of the 3D pipeline. */ if (intel->gen >= 6) return; memset(&c, 0, sizeof(c)); c.key = *key; /* Need to locate the two positions present in vertex + header. * These are currently hardcoded: */ c.nr_attrs = brw_count_bits(c.key.attrs); if (intel->gen >= 5) c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */ else c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */ c.nr_bytes = c.nr_regs * REG_SIZE; mem_ctx = NULL; /* Begin the compilation: */ brw_init_compile(brw, &c.func, mem_ctx); c.func.single_program_flow = 1; /* For some reason the thread is spawned with only 4 channels * unmasked. */ brw_set_mask_control(&c.func, BRW_MASK_DISABLE); /* Note that primitives which don't require a GS program have * already been weeded out by this stage: */ switch (key->primitive) { case GL_QUADS: brw_gs_quads( &c, key ); break; case GL_QUAD_STRIP: brw_gs_quad_strip( &c, key ); break; case GL_LINE_LOOP: brw_gs_lines( &c ); break; default: ralloc_free(mem_ctx); return; } /* get the program */ program = brw_get_program(&c.func, &program_size); if (unlikely(INTEL_DEBUG & DEBUG_GS)) { int i; printf("gs:\n"); for (i = 0; i < program_size / sizeof(struct brw_instruction); i++) brw_disasm(stdout, &((struct brw_instruction *)program)[i], intel->gen); printf("\n"); } brw_upload_cache(&brw->cache, BRW_GS_PROG, &c.key, sizeof(c.key), program, program_size, &c.prog_data, sizeof(c.prog_data), &brw->gs.prog_offset, &brw->gs.prog_data); ralloc_free(mem_ctx); }
static void compile_ff_gs_prog(struct brw_context *brw, struct brw_ff_gs_prog_key *key) { struct brw_ff_gs_compile c; const GLuint *program; void *mem_ctx; GLuint program_size; memset(&c, 0, sizeof(c)); c.key = *key; c.vue_map = brw->vs.prog_data->base.vue_map; c.nr_regs = (c.vue_map.num_slots + 1)/2; mem_ctx = ralloc_context(NULL); /* Begin the compilation: */ brw_init_compile(brw, &c.func, mem_ctx); c.func.single_program_flow = 1; /* For some reason the thread is spawned with only 4 channels * unmasked. */ brw_set_mask_control(&c.func, BRW_MASK_DISABLE); if (brw->gen >= 6) { unsigned num_verts; bool check_edge_flag; /* On Sandybridge, we use the GS for implementing transform feedback * (called "Stream Out" in the PRM). */ switch (key->primitive) { case _3DPRIM_POINTLIST: num_verts = 1; check_edge_flag = false; break; case _3DPRIM_LINELIST: case _3DPRIM_LINESTRIP: case _3DPRIM_LINELOOP: num_verts = 2; check_edge_flag = false; break; case _3DPRIM_TRILIST: case _3DPRIM_TRIFAN: case _3DPRIM_TRISTRIP: case _3DPRIM_RECTLIST: num_verts = 3; check_edge_flag = false; break; case _3DPRIM_QUADLIST: case _3DPRIM_QUADSTRIP: case _3DPRIM_POLYGON: num_verts = 3; check_edge_flag = true; break; default: assert(!"Unexpected primitive type in Gen6 SOL program."); return; } gen6_sol_program(&c, key, num_verts, check_edge_flag); } else { /* On Gen4-5, we use the GS to decompose certain types of primitives. * Note that primitives which don't require a GS program have already * been weeded out by now. */ switch (key->primitive) { case _3DPRIM_QUADLIST: brw_ff_gs_quads( &c, key ); break; case _3DPRIM_QUADSTRIP: brw_ff_gs_quad_strip( &c, key ); break; case _3DPRIM_LINELOOP: brw_ff_gs_lines( &c ); break; default: ralloc_free(mem_ctx); return; } } /* get the program */ program = brw_get_program(&c.func, &program_size); if (unlikely(INTEL_DEBUG & DEBUG_GS)) { int i; printf("gs:\n"); for (i = 0; i < program_size / sizeof(struct brw_instruction); i++) brw_disasm(stdout, &((struct brw_instruction *)program)[i], brw->gen); printf("\n"); } brw_upload_cache(&brw->cache, BRW_FF_GS_PROG, &c.key, sizeof(c.key), program, program_size, &c.prog_data, sizeof(c.prog_data), &brw->ff_gs.prog_offset, &brw->ff_gs.prog_data); ralloc_free(mem_ctx); }
static void compile_sf_prog( struct brw_context *brw, struct brw_sf_prog_key *key ) { struct brw_sf_compile c; const GLuint *program; void *mem_ctx; GLuint program_size; GLuint i; memset(&c, 0, sizeof(c)); mem_ctx = ralloc_context(NULL); /* Begin the compilation: */ brw_init_compile(brw, &c.func, mem_ctx); c.key = *key; c.vue_map = brw->vue_map_geom_out; if (c.key.do_point_coord) { /* * gl_PointCoord is a FS instead of VS builtin variable, thus it's * not included in c.vue_map generated in VS stage. Here we add * it manually to let SF shader generate the needed interpolation * coefficient for FS shader. */ c.vue_map.varying_to_slot[BRW_VARYING_SLOT_PNTC] = c.vue_map.num_slots; c.vue_map.slot_to_varying[c.vue_map.num_slots++] = BRW_VARYING_SLOT_PNTC; } c.urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET; c.nr_attr_regs = (c.vue_map.num_slots + 1)/2 - c.urb_entry_read_offset; c.nr_setup_regs = c.nr_attr_regs; c.prog_data.urb_read_length = c.nr_attr_regs; c.prog_data.urb_entry_size = c.nr_setup_regs * 2; c.has_flat_shading = brw_any_flat_varyings(&key->interpolation_mode); /* Which primitive? Or all three? */ switch (key->primitive) { case SF_TRIANGLES: c.nr_verts = 3; brw_emit_tri_setup( &c, true ); break; case SF_LINES: c.nr_verts = 2; brw_emit_line_setup( &c, true ); break; case SF_POINTS: c.nr_verts = 1; if (key->do_point_sprite) brw_emit_point_sprite_setup( &c, true ); else brw_emit_point_setup( &c, true ); break; case SF_UNFILLED_TRIS: c.nr_verts = 3; brw_emit_anyprim_setup( &c ); break; default: assert(0); return; } /* get the program */ program = brw_get_program(&c.func, &program_size); if (unlikely(INTEL_DEBUG & DEBUG_SF)) { printf("sf:\n"); for (i = 0; i < program_size / sizeof(struct brw_instruction); i++) brw_disasm(stdout, &((struct brw_instruction *)program)[i], brw->gen); printf("\n"); } brw_upload_cache(&brw->cache, BRW_SF_PROG, &c.key, sizeof(c.key), program, program_size, &c.prog_data, sizeof(c.prog_data), &brw->sf.prog_offset, &brw->sf.prog_data); ralloc_free(mem_ctx); }
/* Emit the fragment program instructions here. */ void brw_wm_emit( struct brw_wm_compile *c ) { struct brw_compile *p = &c->func; struct intel_context *intel = &p->brw->intel; GLuint insn; brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED); if (intel->gen >= 6) brw_set_acc_write_control(p, 1); /* Check if any of the payload regs need to be spilled: */ spill_values(c, c->payload.depth, 4); spill_values(c, c->creg, c->nr_creg); spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX); for (insn = 0; insn < c->nr_insns; insn++) { struct brw_wm_instruction *inst = &c->instruction[insn]; struct brw_reg args[3][4], dst[4]; GLuint i, dst_flags; /* Get argument regs: */ for (i = 0; i < 3; i++) get_argument_regs(c, inst->src[i], args[i]); /* Get dest regs: */ for (i = 0; i < 4; i++) if (inst->dst[i]) dst[i] = inst->dst[i]->hw_reg; else dst[i] = brw_null_reg(); /* Flags */ dst_flags = inst->writemask; if (inst->saturate) dst_flags |= SATURATE; switch (inst->opcode) { /* Generated instructions for calculating triangle interpolants: */ case WM_PIXELXY: emit_pixel_xy(c, dst, dst_flags); break; case WM_DELTAXY: emit_delta_xy(p, dst, dst_flags, args[0]); break; case WM_WPOSXY: emit_wpos_xy(c, dst, dst_flags, args[0]); break; case WM_PIXELW: emit_pixel_w(c, dst, dst_flags, args[0], args[1]); break; case WM_LINTERP: emit_linterp(p, dst, dst_flags, args[0], args[1]); break; case WM_PINTERP: emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]); break; case WM_CINTERP: emit_cinterp(p, dst, dst_flags, args[0]); break; case WM_FB_WRITE: emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot); break; case WM_FRONTFACING: emit_frontfacing(p, dst, dst_flags); break; /* Straightforward arithmetic: */ case OPCODE_ADD: emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]); break; case OPCODE_FRC: emit_alu1(p, brw_FRC, dst, dst_flags, args[0]); break; case OPCODE_FLR: emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]); break; case OPCODE_DDX: emit_ddxy(p, dst, dst_flags, true, args[0]); break; case OPCODE_DDY: emit_ddxy(p, dst, dst_flags, false, args[0]); break; case OPCODE_DP2: emit_dp2(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_DP3: emit_dp3(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_DP4: emit_dp4(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_DPH: emit_dph(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_TRUNC: for (i = 0; i < 4; i++) { if (dst_flags & (1<<i)) { brw_RNDZ(p, dst[i], args[0][i]); } } break; case OPCODE_LRP: emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]); break; case OPCODE_MAD: emit_mad(p, dst, dst_flags, args[0], args[1], args[2]); break; case OPCODE_MOV: case OPCODE_SWZ: emit_alu1(p, brw_MOV, dst, dst_flags, args[0]); break; case OPCODE_MUL: emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]); break; case OPCODE_XPD: emit_xpd(p, dst, dst_flags, args[0], args[1]); break; /* Higher math functions: */ case OPCODE_RCP: emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]); break; case OPCODE_RSQ: emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]); break; case OPCODE_SIN: emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]); break; case OPCODE_COS: emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]); break; case OPCODE_EX2: emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]); break; case OPCODE_LG2: emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]); break; case OPCODE_SCS: /* There is an scs math function, but it would need some * fixup for 16-element execution. */ if (dst_flags & WRITEMASK_X) emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]); if (dst_flags & WRITEMASK_Y) emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]); break; case OPCODE_POW: emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]); break; /* Comparisons: */ case OPCODE_CMP: emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]); break; case OPCODE_MAX: emit_max(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_MIN: emit_min(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_SLT: emit_slt(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_SLE: emit_sle(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_SGT: emit_sgt(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_SGE: emit_sge(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_SEQ: emit_seq(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_SNE: emit_sne(p, dst, dst_flags, args[0], args[1]); break; case OPCODE_SSG: emit_sign(p, dst, dst_flags, args[0]); break; case OPCODE_LIT: emit_lit(c, dst, dst_flags, args[0]); break; /* Texturing operations: */ case OPCODE_TEX: emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg, inst->tex_idx, inst->tex_unit, inst->tex_shadow); break; case OPCODE_TXB: emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg, inst->tex_idx, inst->tex_unit); break; case OPCODE_KIL: emit_kil(c, args[0]); break; default: printf("Unsupported opcode %i (%s) in fragment shader\n", inst->opcode, inst->opcode < MAX_OPCODE ? _mesa_opcode_string(inst->opcode) : "unknown"); } for (i = 0; i < 4; i++) if (inst->dst[i] && inst->dst[i]->spill_slot) emit_spill(c, inst->dst[i]->hw_reg, inst->dst[i]->spill_slot); } /* Only properly tested on ILK */ if (p->brw->intel.gen == 5) { brw_remove_duplicate_mrf_moves(p); if (c->dispatch_width == 16) brw_remove_grf_to_mrf_moves(p); } if (unlikely(INTEL_DEBUG & DEBUG_WM)) { int i; printf("wm-native:\n"); for (i = 0; i < p->nr_insn; i++) brw_disasm(stdout, &p->store[i], p->brw->intel.gen); printf("\n"); } }
static void compile_clip_prog( struct brw_context *brw, struct brw_clip_prog_key *key ) { struct brw_clip_compile c; const GLuint *program; void *mem_ctx; GLuint program_size; GLuint i; memset(&c, 0, sizeof(c)); mem_ctx = ralloc_context(NULL); /* Begin the compilation: */ brw_init_compile(brw, &c.func, mem_ctx); c.func.single_program_flow = 1; c.key = *key; c.vue_map = brw->vue_map_geom_out; c.has_flat_shading = brw_any_flat_varyings(&key->interpolation_mode); c.has_noperspective_shading = brw_any_noperspective_varyings(&key->interpolation_mode); /* nr_regs is the number of registers filled by reading data from the VUE. * This program accesses the entire VUE, so nr_regs needs to be the size of * the VUE (measured in pairs, since two slots are stored in each * register). */ c.nr_regs = (c.vue_map.num_slots + 1)/2; c.prog_data.clip_mode = c.key.clip_mode; /* XXX */ /* For some reason the thread is spawned with only 4 channels * unmasked. */ brw_set_mask_control(&c.func, BRW_MASK_DISABLE); /* Would ideally have the option of producing a program which could * do all three: */ switch (key->primitive) { case GL_TRIANGLES: if (key->do_unfilled) brw_emit_unfilled_clip( &c ); else brw_emit_tri_clip( &c ); break; case GL_LINES: brw_emit_line_clip( &c ); break; case GL_POINTS: brw_emit_point_clip( &c ); break; default: assert(0); return; } /* get the program */ program = brw_get_program(&c.func, &program_size); if (unlikely(INTEL_DEBUG & DEBUG_CLIP)) { printf("clip:\n"); for (i = 0; i < program_size / sizeof(struct brw_instruction); i++) brw_disasm(stdout, &((struct brw_instruction *)program)[i], brw->gen); printf("\n"); } brw_upload_cache(&brw->cache, BRW_CLIP_PROG, &c.key, sizeof(c.key), program, program_size, &c.prog_data, sizeof(c.prog_data), &brw->clip.prog_offset, &brw->clip.prog_data); ralloc_free(mem_ctx); }
static void compile_clip_prog( struct brw_context *brw, struct brw_clip_prog_key *key ) { struct intel_context *intel = &brw->intel; struct brw_clip_compile c; const GLuint *program; GLuint program_size; GLuint delta; GLuint i; GLuint header_regs; memset(&c, 0, sizeof(c)); /* Begin the compilation: */ brw_init_compile(brw, &c.func); c.func.single_program_flow = 1; c.key = *key; /* Need to locate the two positions present in vertex + header. * These are currently hardcoded: */ c.header_position_offset = ATTR_SIZE; if (intel->gen == 5) header_regs = 3; else header_regs = 1; delta = header_regs * REG_SIZE; for (i = 0; i < VERT_RESULT_MAX; i++) { if (c.key.attrs & BITFIELD64_BIT(i)) { c.offset[i] = delta; delta += ATTR_SIZE; c.idx_to_attr[c.nr_attrs] = i; c.nr_attrs++; } } /* The vertex attributes start at a URB row-aligned offset after * the 8-20 dword vertex header, and continue for a URB row-aligned * length. nr_regs determines the urb_read_length from the start * of the header to the end of the vertex data. */ c.nr_regs = header_regs + (c.nr_attrs + 1) / 2; c.nr_bytes = c.nr_regs * REG_SIZE; c.prog_data.clip_mode = c.key.clip_mode; /* XXX */ /* For some reason the thread is spawned with only 4 channels * unmasked. */ brw_set_mask_control(&c.func, BRW_MASK_DISABLE); /* Would ideally have the option of producing a program which could * do all three: */ switch (key->primitive) { case GL_TRIANGLES: if (key->do_unfilled) brw_emit_unfilled_clip( &c ); else brw_emit_tri_clip( &c ); break; case GL_LINES: brw_emit_line_clip( &c ); break; case GL_POINTS: brw_emit_point_clip( &c ); break; default: assert(0); return; } /* get the program */ program = brw_get_program(&c.func, &program_size); if (INTEL_DEBUG & DEBUG_CLIP) { printf("clip:\n"); for (i = 0; i < program_size / sizeof(struct brw_instruction); i++) brw_disasm(stdout, &((struct brw_instruction *)program)[i], intel->gen); printf("\n"); } /* Upload */ drm_intel_bo_unreference(brw->clip.prog_bo); brw->clip.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_CLIP_PROG, &c.key, sizeof(c.key), NULL, 0, program, program_size, &c.prog_data, sizeof(c.prog_data), &brw->clip.prog_data); }