static void emit_vtx_fetch(struct fd_ringbuffer *ring, struct fd_shader *shader, struct fd_parameters *attr, uint32_t first) { uint32_t i; for (i = 0; i < shader->ir->attributes_count; i++) { bool switchnext = (i != (shader->ir->attributes_count - 1)); struct ir3_attribute *a = shader->ir->attributes[i]; struct fd_param *p = find_param(attr, a->name); uint32_t s = fmt2size(p->fmt); OUT_PKT0(ring, REG_A3XX_VFD_FETCH(i), 2); OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(s - 1) | A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(s) | COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) | A3XX_VFD_FETCH_INSTR_0_INDEXCODE(i) | A3XX_VFD_FETCH_INSTR_0_STEPRATE(1)); OUT_RELOC(ring, p->bo, s * first, 0); /* VFD_FETCH[i].INSTR_1 */ OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(i), 1); OUT_RING(ring, A3XX_VFD_DECODE_INSTR_WRITEMASK(regmask(a->num)) | A3XX_VFD_DECODE_INSTR_CONSTFILL | A3XX_VFD_DECODE_INSTR_FORMAT(p->fmt) | A3XX_VFD_DECODE_INSTR_REGID(a->rstart->num) | A3XX_VFD_DECODE_INSTR_SHIFTCNT(s) | A3XX_VFD_DECODE_INSTR_LASTCOMPVALID | COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT)); } }
/*
 * Generate the code for a setjmp() call, implemented as a call to the RTL
 * routine __setjmp3(environment, flag, ...).
 *
 * The arguments are pushed right to left; which extra arguments follow
 * 'flag' depends on the kind of exception frame the calling function has:
 *
 *   flag 3 (C++ try/catch, SCPP builds only):
 *      __setjmp3(environment,3,__cpp_longjmp_unwind,trylevel,funcdata);
 *   flag 2 (NT SEH try/except):
 *      __setjmp3(environment,2,__seh_longjmp_unwind,trylevel);
 *   flag 0 (neither):
 *      _setjmp3(environment,0)
 *
 * After the call, everything pushed here is popped again by adjusting ESP,
 * and the global 'stackpush' is restored to its value on entry.
 *
 * e        - the setjmp expression; e->E1 is the environment argument
 * pretregs - passed through to fixresult() for the result registers
 * Returns the generated code list.
 */
code *cdsetjmp(elem *e,regm_t *pretregs)
{
    code insn;
    code *cl = NULL;
    regm_t resultregs;
    unsigned pushbase = stackpush;      // stackpush on entry, restored at exit
    unsigned flag = 0;                  // default: no try..except / try..catch

#if SCPP
    if (CPP && (funcsym_p->Sfunc->Fflags3 & Fcppeh || usednteh & NTEHcpp))
    {
        /* In a C++ try block.
           __cpp_longjmp_unwind is a stdcall routine in the RTL that deals
           with unwinding for C++ frames.
           trylevel is the value that gets incremented at each catch and
           constructor invocation.
           funcdata is the same value that is put into EAX prior to
           cppframehandler getting called.
         */
        symbol *s = except_gensym();

        // Without a symbol from except_gensym() nothing extra is pushed
        // and flag stays 0.
        if (s)
        {
            cl = gencs(cl,0x68,0,FLextern,s);           // PUSH &scope_table
            stackpush += 4;
            genadjesp(cl,4);

            cl = genc1(cl,0xFF,modregrm(1,6,BP),FLconst,(targ_uns)-4);
                                                        // PUSH trylevel
            stackpush += 4;
            genadjesp(cl,4);

            insn.Iop = 0x68;
            insn.Iflags = CFoff;
            insn.Irex = 0;
            insn.IFL2 = FLextern;
            insn.IEVsym2 = rtlsym[RTLSYM_CPP_LONGJMP];
            insn.IEVoffset2 = 0;
            cl = gen(cl,&insn);                 // PUSH &_cpp_longjmp_unwind
            stackpush += 4;
            genadjesp(cl,4);

            flag = 3;
        }
    }
    else
#endif
    if (funcsym_p->Sfunc->Fflags3 & Fnteh)
    {
        /* In an NT SEH try block.
           __seh_longjmp_unwind is supplied by the RTL and is a stdcall
           function. It is the name that MSOFT uses, we should probably
           use the same one.
           trylevel is the value that is incremented at each try and
           decremented at the close of the try. This corresponds to the
           index field of the ehrec.
         */
        int sindex_off = 20;                    // offset of __context.sindex

        insn.Iop = 0xFF;
        insn.Irm = modregrm(2,6,BPRM);
        insn.Iflags = 0;
        insn.Irex = 0;
        insn.IFL1 = FLbprel;
        insn.IEVsym1 = nteh_contextsym();
        insn.IEVoffset1 = sindex_off;
        cl = gen(cl,&insn);                     // PUSH scope_index
        stackpush += 4;
        genadjesp(cl,4);

        insn.Iop = 0x68;
        insn.Iflags = CFoff;
        insn.Irex = 0;
        insn.IFL2 = FLextern;
        insn.IEVsym2 = rtlsym[RTLSYM_LONGJMP];
        insn.IEVoffset2 = 0;
        cl = gen(cl,&insn);                     // PUSH &_seh_longjmp_unwind
        stackpush += 4;
        genadjesp(cl,4);

        flag = 2;
    }

    insn.Iop = 0x68;
    insn.Iflags = 0;
    insn.Irex = 0;
    insn.IFL2 = FLconst;
    insn.IEV2.Vint = flag;
    cl = gen(cl,&insn);                         // PUSH flag
    stackpush += 4;
    genadjesp(cl,4);

    // Push the environment argument
    cl = cat(cl,params(e->E1,REGSIZE));

    // Free up the registers the callee does not preserve
    cl = cat(cl,getregs(~rtlsym[RTLSYM_SETJMP3]->Sregsaved & (ALLREGS | mES)));

    // NOTE(review): the result of gencs() is not assigned here; presumably it
    // appends to the (non-empty) list cl - confirm.
    gencs(cl,0xE8,0,FLfunc,rtlsym[RTLSYM_SETJMP3]);    // CALL __setjmp3

    // ADD ESP,<number of bytes pushed since entry>
    cl = genc2(cl,0x81,modregrm(3,0,SP),stackpush - pushbase);
    genadjesp(cl,-(stackpush - pushbase));

    stackpush = pushbase;
    resultregs = regmask(e->Ety, TYnfunc);
    return cat(cl,fixresult(e,resultregs,pretregs));
}
void fd_program_emit_state(struct fd_program *program, uint32_t first, struct fd_parameters *uniforms, struct fd_parameters *attr, struct fd_ringbuffer *ring) { struct fd_shader *vs = get_shader(program, FD_SHADER_VERTEX); struct fd_shader *fs = get_shader(program, FD_SHADER_FRAGMENT); struct ir3_shader_info *vsi = &vs->info; struct ir3_shader_info *fsi = &fs->info; uint32_t vsconstlen = constlen(vs); uint32_t fsconstlen = constlen(fs); uint32_t i, outloc; uint32_t posregid = getpos(vs, "gl_Position", 0); uint32_t psizeregid = getpos(vs, "gl_PointSize", (63 << 2)); uint32_t colorregid = getpos(fs, "gl_FragColor", 0); uint32_t numvar = totalvar(fs); assert (vs->ir->varyings_count == fs->ir->varyings_count); OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6); OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART | A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE); OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31)); OUT_RING(ring, 0x00000000); /* HLSQ_CONTROL_3_REG */ OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vsconstlen) | A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) | A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(instrlen(vs))); OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fsconstlen) | A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) | A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(instrlen(fs))); OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(0) | A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | // XXX "resolve" (?) bit set on gmem->mem pass.. COND(!uniforms, A3XX_SP_SP_CTRL_REG_RESOLVE) | // XXX sometimes 0, sometimes 1: A3XX_SP_SP_CTRL_REG_LOMODE(1)); /* emit unknown sequence of writes to 0x0ec4/0x0ec8 that the blob * emits as part of the program state (it seems).. 
*/ for (i = 0; i < 6; i++) { OUT_PKT0(ring, REG_A3XX_SP_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* SP_PERFCOUNTER0_SELECT */ OUT_PKT0(ring, REG_A3XX_SP_PERFCOUNTER3_SELECT, 1); OUT_RING(ring, 0x00000000); /* SP_PERFCOUNTER3_SELECT */ } OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1); OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(instrlen(vs))); OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3); OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) | A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) | A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) | A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE | A3XX_SP_VS_CTRL_REG0_LENGTH(instrlen(vs))); OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vsconstlen) | A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(totalattr(vs)) | A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(max(vsi->max_const, 0))); OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(posregid) | A3XX_SP_VS_PARAM_REG_PSIZEREGID(psizeregid) | A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fs->ir->varyings_count)); for (i = 0; i < vs->ir->varyings_count; ) { struct ir3_varying *v; uint32_t reg = 0; OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i/2), 1); v = vs->ir->varyings[i++]; if (v) { reg |= A3XX_SP_VS_OUT_REG_A_REGID(v->rstart->num); reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(regmask(v->num)); } v = vs->ir->varyings[i++]; if (v) { reg |= A3XX_SP_VS_OUT_REG_B_REGID(v->rstart->num); reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(regmask(v->num)); } OUT_RING(ring, reg); } outloc = 8; /* I assume 0 and 4 are gl_Position/gl_PointSize? */ for (i = 0; i < vs->ir->varyings_count; ) { struct ir3_varying *v; uint32_t reg = 0; OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i/4), 1); /* note: if we supported anything other than vec4 varyings, we'd * actually be incrementing outloc by the actual varying size in * units of scalar registers (ie. 
vec3 -> 3) */ v = vs->ir->varyings[i++]; if (v) { reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(outloc); outloc += v->num; } v = vs->ir->varyings[i++]; if (v) { reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(outloc); outloc += v->num; } v = vs->ir->varyings[i++]; if (v) { reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(outloc); outloc += v->num; } v = vs->ir->varyings[i++]; if (v) { reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(outloc); outloc += v->num; } OUT_RING(ring, reg); } // TODO SP_VS_OBJ_OFFSET_REG / SP_VS_OBJ_START_REG OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(instrlen(fs))); OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER) | A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | COND(fs->ir->samplers_count > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | A3XX_SP_FS_CTRL_REG0_LENGTH(instrlen(fs))); OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fsconstlen) | A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(0) | A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(max(fsi->max_const, 0)) | A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); // TODO SP_FS_OBJ_OFFSET_REG / SP_FS_OBJ_START_REG OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2); OUT_RING(ring, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_0 */ OUT_RING(ring, 0x00000000); /* SP_FS_FLAT_SHAD_MODE_REG_1 */ OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1); OUT_RING(ring, 0x00000000); /* SP_FS_OUTPUT_REG */ OUT_PKT0(ring, REG_A3XX_SP_FS_MRT_REG(0), 4); OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(colorregid) | /* SP_FS_MRT[0].REG */ A3XX_SP_FS_MRT_REG_HALF_PRECISION); OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); /* SP_FS_MRT[1].REG */ OUT_RING(ring, A3XX_SP_FS_MRT_REG_REGID(0)); /* SP_FS_MRT[2].REG */ OUT_RING(ring, 
A3XX_SP_FS_MRT_REG_REGID(0)); /* SP_FS_MRT[3].REG */ OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(numvar) | A3XX_VPC_ATTR_THRDASSIGN(1) | A3XX_VPC_ATTR_LMSIZE(1)); OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(numvar) | A3XX_VPC_PACK_NUMNONPOSVSVAR(numvar)); OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); OUT_RING(ring, 0x00000000); /* VPC_VARYING_INTERP[0].MODE */ OUT_RING(ring, 0x00000000); /* VPC_VARYING_INTERP[1].MODE */ OUT_RING(ring, 0x00000000); /* VPC_VARYING_INTERP[2].MODE */ OUT_RING(ring, 0x00000000); /* VPC_VARYING_INTERP[3].MODE */ OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); OUT_RING(ring, 0x00000000); /* VPC_VARYING_PS_REPL[0].MODE */ OUT_RING(ring, 0x00000000); /* VPC_VARYING_PS_REPL[1].MODE */ OUT_RING(ring, 0x00000000); /* VPC_VARYING_PS_REPL[2].MODE */ OUT_RING(ring, 0x00000000); /* VPC_VARYING_PS_REPL[3].MODE */ OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1); OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) | A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(252)); emit_shader(ring, vs, SB_VERT_SHADER); OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ emit_shader(ring, fs, SB_FRAG_SHADER); OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2); OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(totalattr(vs)) | A3XX_VFD_CONTROL_0_PACKETSIZE(2) | A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(vs->ir->attributes_count) | A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(vs->ir->attributes_count)); OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX A3XX_VFD_CONTROL_1_REGID4VTX(63 << 2) | A3XX_VFD_CONTROL_1_REGID4INST(63 << 2)); emit_vtx_fetch(ring, vs, attr, first); /* we have this sometimes, not others.. 
perhaps we could be clever * and figure out actually when we need to invalidate cache: */ OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); /* for RB_RESOLVE_PASS, I think the consts are not needed: */ if (uniforms) { emit_uniconst(ring, vs, uniforms, SB_VERT_SHADER); emit_uniconst(ring, fs, uniforms, SB_FRAG_SHADER); } }