static __forceinline void DCT_8_INV_ROW(const uint8_t * const ecx,const uint8_t * const esi,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7) { xmm0=_mm_shufflelo_epi16(xmm0, 0xD8 ); xmm1=_mm_shuffle_epi32( xmm0, 0 ); pmaddwd (xmm1, esi); xmm3=_mm_shuffle_epi32( xmm0, 0x55); xmm0=_mm_shufflehi_epi16( xmm0, 0xD8 ); pmaddwd( xmm3, esi+32 ); xmm2=_mm_shuffle_epi32( xmm0, 0xAA ); xmm0=_mm_shuffle_epi32( xmm0, 0xFF ); pmaddwd( xmm2, esi+16 ); xmm4=_mm_shufflehi_epi16( xmm4, 0xD8 ); paddd (xmm1, M128_round_inv_row); xmm4=_mm_shufflelo_epi16 (xmm4, 0xD8 ); pmaddwd (xmm0, esi+48 ); xmm5=_mm_shuffle_epi32( xmm4, 0 ); xmm6=_mm_shuffle_epi32( xmm4, 0xAA ); pmaddwd (xmm5, ecx ); paddd (xmm1, xmm2 ); movdqa (xmm2, xmm1 ); xmm7=_mm_shuffle_epi32( xmm4, 0x55 ); pmaddwd (xmm6, ecx+16 ); paddd (xmm0, xmm3 ); xmm4=_mm_shuffle_epi32( xmm4, 0xFF ); psubd (xmm2, xmm0 ); pmaddwd (xmm7, ecx+32 ); paddd (xmm0, xmm1 ); psrad (xmm2, 12 ); paddd (xmm5, M128_round_inv_row); pmaddwd (xmm4, ecx+48 ); paddd (xmm5, xmm6 ); movdqa (xmm6, xmm5 ); psrad (xmm0, 12 ); xmm2=_mm_shuffle_epi32( xmm2, 0x1B ); packssdw (xmm0, xmm2 ); paddd (xmm4, xmm7 ); psubd (xmm6, xmm4 ); paddd (xmm4, xmm5 ); psrad (xmm6, 12 ); psrad (xmm4, 12 ); xmm6=_mm_shuffle_epi32( xmm6, 0x1B ); packssdw (xmm4, xmm6 ); }
int main() { int rval; mmx_t ma; mmx_t mb; movq_r2r(mm0, mm1); rval = mmx_ok(); /* Announce return value of mmx_ok() */ // printf("Value returned from init was %x.", rval); // printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not"); // fflush(stdout); fflush(stderr); // if(rval) { /* PADD *****************************************************/ ma.q = 0x1111111180000000LL; mb.q = 0x7fffffff00000001LL; paddd(ma, mb); fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddd: mb.q is 9111111080000001\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddw(ma, mb); fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddw: mb.q is 8001800000000002\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddw(ma, mb); fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddw: mb.q is 8001800000000001\n"); fflush(stdout); fflush(stderr); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddb(ma, mb); fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddb: mb.q is 8180000281800002\n"); fflush(stdout); fflush(stderr); /* PADDS ****************************************************/ ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddsw(ma, mb); fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n"); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddsw(ma, mb); fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n"); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddsb(ma, mb); fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n"); fflush(stdout); fflush(stderr); /* PADDUS ***************************************************/ ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddusw(ma, mb); fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddusw(ma, mb); fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddusb(ma, mb); fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n"); fflush(stdout); fflush(stderr); /* PSUB *****************************************************/ ma.q = 0x7fffffff00000001LL; mb.q = 0x1111111180000000LL; psubd(ma, mb); fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubd: mb.q is 911111127fffffff\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubw(ma, mb); fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubw: mb.q is 8001800200020000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubw(ma, mb); fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; psubb(ma, mb); fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n"); fflush(stdout); fflush(stderr); /* PSUBS ****************************************************/ ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubsw(ma, mb); fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubsw(ma, mb); fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; psubsb(ma, mb); fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n"); fflush(stdout); fflush(stderr); /* PSUBUS ***************************************************/ ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubusw(ma, mb); fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubusw(ma, mb); fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; psubusb(ma, mb); fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n"); fflush(stdout); fflush(stderr); /* PMUL *****************************************************/ ma.q = 0x8000ffff00ff0000LL; mb.q = 0x0200ffff00ffffffLL; pmulhw(ma, mb); fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0200ffff00ffffffLL; pmullw(ma, mb); fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n"); fflush(stdout); fflush(stderr); /* PMADD ****************************************************/ ma.q = 0x8000345680007f34LL; mb.q = 0x93234a27ffff1707LL; pmaddwd(ma, mb); fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n"); fflush(stdout); fflush(stderr); /* PCMPEQ ***************************************************/ ma.q = 0x800034568f237f34LL; mb.q = 0x93009a568f237f34LL; pcmpeqd(ma, mb); fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x93009a568f237f34LL; pcmpeqw(ma, mb); fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpeqw: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x93009a568f237f34LL; pcmpeqb(ma, mb); fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n"); fflush(stdout); fflush(stderr); /* PCMPGT ***************************************************/ ma.q = 0x666688884477aaffLL; mb.q = 0x1234567890abcdefLL; pcmpgtd(ma, mb); fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x1234567890abcdefLL; pcmpgtw(ma, mb); fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x1234567890abcdefLL; pcmpgtb(ma, mb); fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n"); fflush(stdout); fflush(stderr); /* PACKSS ***************************************************/ ma.q = 0x00012222000abbbbLL; mb.q = 0x0000888800003333LL; packssdw(ma, mb); fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q); fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n"); fflush(stdout); fflush(stderr); ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; packsswb(ma, mb); fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q); fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n"); fflush(stdout); fflush(stderr); /* PACKUS ***************************************************/ ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; packuswb(ma, mb); fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q); fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n"); fflush(stdout); fflush(stderr); /* PUNPCKH **************************************************/ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; punpckhdq(ma, mb); fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpckhwd(ma, mb); fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpckhbw(ma, mb); fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n"); fflush(stdout); fflush(stderr); /* PUNPCKL **************************************************/ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; punpckldq(ma, mb); fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpcklwd(ma, mb); fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpcklbw(ma, mb); fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n"); fflush(stdout); fflush(stderr); /* PAND, PANDN, POR, PXOR ***********************************/ ma.q = 0x5555555555555555LL; mb.q = 0x3333333333333333LL; pand(ma, mb); fprintf(stdout, "pand: mb.q is %016llx\n", mb.q); fprintf(stderr, "pand: mb.q is 1111111111111111\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; pandn(ma, mb); fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q); fprintf(stderr, "pandn: mb.q is 4444444444444444\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; por(ma, mb); fprintf(stdout, "por: mb.q is %016llx\n", mb.q); fprintf(stderr, "por: mb.q is 7777777777777777\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; pxor(ma, mb); fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q); fprintf(stderr, "pxor: mb.q is 6666666666666666\n"); fflush(stdout); fflush(stderr); /* PSLL *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psllq(ma, mb); fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q); fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; pslld(ma, mb); fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q); fprintf(stderr, "pslld: mb.q is 67000000ef000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psllw(ma, mb); fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psllw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); /* PSRL *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psrlq(ma, mb); fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrlq: mb.q is 0000000123456789\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psrld(ma, mb); fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrld: mb.q is 0000000100000089\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psrlw(ma, mb); fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrlw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); /* PSRA *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psrad(ma, mb); fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psraw(ma, mb); fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); /* Exit MXX *************************************************/ emms(); } /* Clean-up and exit nicely */ exit(0); }
void GSSetupPrimCodeGenerator::Color() { if(!m_en.c) { return; } if(m_env.sel.iip) { // GSVector4 c = dscan.c; movaps(xmm0, xmmword[edx]); movaps(xmm1, xmm0); // m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); movaps(xmm2, xmm0); mulps(xmm2, xmm3); cvttps2dq(xmm2, xmm2); pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0)); packssdw(xmm2, xmm2); movdqa(xmmword[&m_env.d4.c], xmm2); // xmm3 is not needed anymore // GSVector4 dr = c.xxxx(); // GSVector4 db = c.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); for(int i = 0; i < 4; i++) { // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); movaps(xmm2, xmm0); mulps(xmm2, Xmm(4 + i)); cvttps2dq(xmm2, xmm2); packssdw(xmm2, xmm2); // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); movaps(xmm3, xmm1); mulps(xmm3, Xmm(4 + i)); cvttps2dq(xmm3, xmm3); packssdw(xmm3, xmm3); // m_env.d[i].rb = r.upl16(b); punpcklwd(xmm2, xmm3); movdqa(xmmword[&m_env.d[i].rb], xmm2); } // GSVector4 c = dscan.c; movaps(xmm0, xmmword[edx]); // not enough regs, have to reload it movaps(xmm1, xmm0); // GSVector4 dg = c.yyyy(); // GSVector4 da = c.wwww(); shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); for(int i = 0; i < 4; i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); movaps(xmm2, xmm0); mulps(xmm2, Xmm(4 + i)); cvttps2dq(xmm2, xmm2); packssdw(xmm2, xmm2); // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); movaps(xmm3, xmm1); mulps(xmm3, Xmm(4 + i)); cvttps2dq(xmm3, xmm3); packssdw(xmm3, xmm3); // m_env.d[i].ga = g.upl16(a); punpcklwd(xmm2, xmm3); movdqa(xmmword[&m_env.d[i].ga], xmm2); } } else { // GSVector4i c = GSVector4i(vertices[0].c); movaps(xmm0, xmmword[ecx]); cvttps2dq(xmm0, xmm0); // c = c.upl16(c.zwxy()); movdqa(xmm1, xmm0); pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2)); punpcklwd(xmm0, xmm1); // if(!tme) c = c.srl16(7); if(m_env.sel.tfx == TFX_NONE) { psrlw(xmm0, 7); } // m_env.c.rb = c.xxxx(); // m_env.c.ga = c.zzzz(); movdqa(xmm1, xmm0); pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); movdqa(xmmword[&m_env.c.rb], xmm0); movdqa(xmmword[&m_env.c.ga], xmm1); } }
void GSDrawScanlineCodeGenerator::SampleTexture() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { return; } mov(ebx, dword[&m_env.tex]); // ebx = tex if(!m_sel.fst) { // TODO: move these into Init/Step too? cvttps2dq(xmm2, xmm2); cvttps2dq(xmm3, xmm3); if(m_sel.ltf) { // u -= 0x8000; // v -= 0x8000; mov(eax, 0x8000); movd(xmm4, eax); pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); psubd(xmm2, xmm4); psubd(xmm3, xmm4); } } // xmm2 = u // xmm3 = v if(m_sel.ltf) { // GSVector4i uf = u.xxzzlh().srl16(1); movdqa(xmm0, xmm2); pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xmm0, 1); movdqa(xmmword[&m_env.temp.uf], xmm0); if(!m_sel.sprite) { // GSVector4i vf = v.xxzzlh().srl16(1); movdqa(xmm1, xmm3); pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xmm1, 1); movdqa(xmmword[&m_env.temp.vf], xmm1); } } // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); psrad(xmm2, 16); psrad(xmm3, 16); packssdw(xmm2, xmm3); if(m_sel.ltf) { // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); movdqa(xmm3, xmm2); pcmpeqd(xmm1, xmm1); psrlw(xmm1, 15); paddw(xmm3, xmm1); // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); Wrap(xmm2, xmm3); } else { // uv0 = Wrap(uv0); Wrap(xmm2); } // xmm2 = uv0 // xmm3 = uv1 (ltf) // xmm0, xmm1, xmm4, xmm5, xmm6 = free // xmm7 = used // GSVector4i y0 = uv0.uph16() << tw; // GSVector4i x0 = uv0.upl16(); pxor(xmm0, xmm0); movd(xmm1, ptr[&m_env.tw]); movdqa(xmm4, xmm2); punpckhwd(xmm2, xmm0); punpcklwd(xmm4, xmm0); pslld(xmm2, xmm1); // xmm0 = 0 // xmm1 = tw // xmm2 = y0 // xmm3 = uv1 (ltf) // xmm4 = x0 // xmm5, xmm6 = free // xmm7 = used if(m_sel.ltf) { // GSVector4i y1 = uv1.uph16() << tw; // GSVector4i x1 = uv1.upl16(); movdqa(xmm6, xmm3); punpckhwd(xmm3, xmm0); punpcklwd(xmm6, xmm0); pslld(xmm3, xmm1); // xmm2 = y0 // xmm3 = y1 // xmm4 = x0 // xmm6 = x1 // xmm0, xmm5, xmm6 = free // xmm7 = used // GSVector4i addr00 = y0 + x0; // GSVector4i addr01 = y0 + x1; // GSVector4i addr10 = y1 + x0; // GSVector4i addr11 = y1 + x1; movdqa(xmm5, xmm2); paddd(xmm5, xmm4); paddd(xmm2, xmm6); movdqa(xmm0, xmm3); paddd(xmm0, xmm4); paddd(xmm3, xmm6); // xmm5 = addr00 // xmm2 = addr01 // xmm0 = addr10 // xmm3 = addr11 // xmm1, xmm4, xmm6 = free // xmm7 = used // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(xmm6, xmm5, xmm1, xmm4); // xmm2, xmm5, xmm1 = free ReadTexel(xmm4, xmm2, xmm5, xmm1); // xmm0, xmm2, xmm5 = free ReadTexel(xmm1, xmm0, xmm2, xmm5); // xmm3, xmm0, xmm2 = free ReadTexel(xmm5, xmm3, xmm0, xmm2); // xmm6 = c00 // xmm4 = c01 // xmm1 = c10 // xmm5 = c11 // xmm0, xmm2, xmm3 = free // xmm7 = used movdqa(xmm0, xmmword[&m_env.temp.uf]); // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; movdqa(xmm2, xmm6); psllw(xmm2, 8); psrlw(xmm2, 8); psrlw(xmm6, 8); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; movdqa(xmm3, xmm4); psllw(xmm3, 8); psrlw(xmm3, 8); psrlw(xmm4, 8); // xmm0 = uf // xmm2 = rb00 // xmm3 = rb01 // xmm6 = ga00 // xmm4 = ga01 // xmm1 = c10 // xmm5 = c11 // xmm7 = used // rb00 = rb00.lerp16<0>(rb01, uf); // ga00 = ga00.lerp16<0>(ga01, uf); lerp16<0>(xmm3, xmm2, xmm0); lerp16<0>(xmm4, xmm6, xmm0); // xmm0 = uf // xmm3 = rb00 // xmm4 = ga00 // xmm1 = c10 // xmm5 = c11 // xmm2, xmm6 = free // xmm7 = used // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; movdqa(xmm2, xmm1); psllw(xmm1, 8); psrlw(xmm1, 8); psrlw(xmm2, 8); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; movdqa(xmm6, xmm5); psllw(xmm5, 8); psrlw(xmm5, 8); psrlw(xmm6, 8); // xmm0 = uf // xmm3 = rb00 // xmm4 = ga00 // xmm1 = rb10 // xmm5 = rb11 // xmm2 = ga10 // xmm6 = ga11 // xmm7 = used // rb10 = rb10.lerp16<0>(rb11, uf); // ga10 = ga10.lerp16<0>(ga11, uf); lerp16<0>(xmm5, xmm1, xmm0); lerp16<0>(xmm6, xmm2, xmm0); // xmm3 = rb00 // xmm4 = ga00 // xmm5 = rb10 // xmm6 = ga10 // xmm0, xmm1, xmm2 = free // xmm7 = used // rb00 = rb00.lerp16<0>(rb10, vf); // ga00 = ga00.lerp16<0>(ga10, vf); movdqa(xmm0, xmmword[&m_env.temp.vf]); lerp16<0>(xmm5, xmm3, xmm0); lerp16<0>(xmm6, xmm4, xmm0); } else { // GSVector4i addr00 = y0 + x0; paddd(xmm2, xmm4); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(xmm5, xmm2, xmm0, xmm1); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; movdqa(xmm6, xmm5); psllw(xmm5, 8); psrlw(xmm5, 8); psrlw(xmm6, 8); } }
void GSDrawScanlineCodeGenerator::Generate() { push(ebx); push(esi); push(edi); push(ebp); const int params = 16; Init(params); if(!m_sel.edge) { align(16); } L("loop"); // ecx = steps // esi = fzbr // edi = fzbc // xmm0 = z/zi // xmm2 = u (tme) // xmm3 = v (tme) // xmm5 = rb (!tme) // xmm6 = ga (!tme) // xmm7 = test bool tme = m_sel.tfx != TFX_NONE; TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3); // ecx = steps // esi = fzbr // edi = fzbc // - xmm0 // xmm2 = u (tme) // xmm3 = v (tme) // xmm5 = rb (!tme) // xmm6 = ga (!tme) // xmm7 = test SampleTexture(); // ecx = steps // esi = fzbr // edi = fzbc // ebp = za // - xmm2 // - xmm3 // - xmm4 // xmm5 = rb // xmm6 = ga // xmm7 = test AlphaTFX(); // ecx = steps // esi = fzbr // edi = fzbc // ebp = za // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) // xmm5 = rb // xmm6 = ga // xmm7 = test if(m_sel.fwrite) { movdqa(xmm3, xmmword[&m_env.fm]); } if(m_sel.zwrite) { movdqa(xmm4, xmmword[&m_env.zm]); } // ecx = steps // esi = fzbr // edi = fzbc // ebp = za // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) // xmm3 = fm // xmm4 = zm // xmm5 = rb // xmm6 = ga // xmm7 = test TestAlpha(); // ecx = steps // esi = fzbr // edi = fzbc // ebp = za // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc) // xmm3 = fm // xmm4 = zm // xmm5 = rb // xmm6 = ga // xmm7 = test ColorTFX(); // ecx = steps // esi = fzbr // edi = fzbc // ebp = za // xmm3 = fm // xmm4 = zm // xmm5 = rb // xmm6 = ga // xmm7 = test Fog(); // ecx = steps // esi = fzbr // edi = fzbc // ebp = za // xmm3 = fm // xmm4 = zm // xmm5 = rb // xmm6 = ga // xmm7 = test ReadFrame(); // ecx = steps // esi = fzbr // edi = fzbc // ebp = za // xmm2 = fd // xmm3 = fm // xmm4 = zm // xmm5 = rb // xmm6 = ga // xmm7 = test TestDestAlpha(); // fm |= test; // zm |= test; if(m_sel.fwrite) { por(xmm3, xmm7); } if(m_sel.zwrite) { por(xmm4, xmm7); } // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); pcmpeqd(xmm1, xmm1); if(m_sel.fwrite && m_sel.zwrite) { movdqa(xmm0, xmm1); pcmpeqd(xmm1, xmm3); pcmpeqd(xmm0, xmm4); packssdw(xmm1, xmm0); } else if(m_sel.fwrite) { pcmpeqd(xmm1, xmm3); packssdw(xmm1, xmm1); } else if(m_sel.zwrite) { pcmpeqd(xmm1, xmm4); packssdw(xmm1, xmm1); } pmovmskb(edx, xmm1); not(edx); // ebx = fa // ecx = steps // edx = fzm // esi = fzbr // edi = fzbc // ebp = za // xmm2 = fd // xmm3 = fm // xmm4 = zm // xmm5 = rb // xmm6 = ga WriteZBuf(); // ebx = fa // ecx = steps // edx = fzm // esi = fzbr // edi = fzbc // - ebp // xmm2 = fd // xmm3 = fm // - xmm4 // xmm5 = rb // xmm6 = ga AlphaBlend(); // ebx = fa // ecx = steps // edx = fzm // esi = fzbr // edi = fzbc // xmm2 = fd // xmm3 = fm // xmm5 = rb // xmm6 = ga WriteFrame(params); L("step"); // if(steps <= 0) break; if(!m_sel.edge) { test(ecx, ecx); jle("exit", T_NEAR); Step(); jmp("loop", T_NEAR); } L("exit"); pop(ebp); pop(edi); pop(esi); pop(ebx); ret(8); }