template<class _mm> void TimgFilterNoiseMplayer::Tprocess::lineNoiseAvg_simd(uint8_t *dst,const uint8_t *src, int len, int8_t **shift_) { if (_mm::align && (intptr_t(src)&15 || intptr_t(dst)&15)) { lineNoiseAvg_simd<typename _mm::T64>(dst,src,len,shift_); return; } const int mmx_len=len&(~(_mm::size-1)); int8_t *shift2[3]= {shift_[0]+mmx_len, shift_[1]+mmx_len, shift_[2]+mmx_len}; for (int x=-mmx_len; x<0; x+=_mm::size) { //".balign 16 \n\t" typename _mm::__m mm0,mm1,mm2,mm3; movdqu (mm1,shift2[0]+mmx_len+x); movq (mm0,src+mmx_len+x); typename _mm::__m shift1_8; movVqu(shift1_8, shift2[1]+mmx_len+x); paddb (mm1,shift1_8); typename _mm::__m shift2_8; movVqu(shift2_8, shift2[2]+mmx_len+x); paddb (mm1,shift2_8); movq (mm2,mm0); movq (mm3,mm1); punpcklbw (mm0,mm0); punpckhbw (mm2,mm2); punpcklbw (mm1,mm1); punpckhbw (mm3,mm3); pmulhw (mm1,mm0); pmulhw (mm3,mm2); paddw (mm1,mm1); paddw (mm3,mm3); paddw (mm1,mm0); paddw (mm3,mm2); psrlw (mm1,8); psrlw (mm3,8); packuswb (mm1,mm3); movq (dst+mmx_len+x,mm1); } if (mmx_len!=len) { lineNoiseAvg_C(dst+mmx_len, src+mmx_len, len-mmx_len, shift2); } }
// Emits a 16-bit-lane interpolation sequence: a -= b; modulate16(a, f, shift); a += b.
// The result is left in a; b and f are only read by the instructions emitted here.
// The VEX-encoded mnemonics are selected when _M_SSE >= 0x500, the legacy SSE ones otherwise.
void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift)
{
	// a = a - b

#if _M_SSE >= 0x500
	vpsubw(a, b);
#else
	psubw(a, b);
#endif

	// scale the difference by f (modulate16 is defined elsewhere and is shared by both encodings)

	modulate16(a, f, shift);

	// a = a + b

#if _M_SSE >= 0x500
	vpaddw(a, b);
#else
	paddw(a, b);
#endif
}
// Emits a 16-bit-lane interpolation with a 4-bit factor:
// a -= b; a *= f (low 16 bits of each product); a >>= 4 (arithmetic); a += b.
// The result is left in a.
void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f)
{
#if _M_SSE < 0x500

	// legacy SSE encoding

	psubw(a, b);
	pmullw(a, f);
	psraw(a, 4);
	paddw(a, b);

#else

	// VEX encoding, same four steps

	vpsubw(a, b);
	vpmullw(a, f);
	vpsraw(a, 4);
	vpaddw(a, b);

#endif
}
/*
 * Test driver for the MMX operation wrappers (paddd, psubw, pmulhw, ...).
 *
 * Every test follows the same pattern: operands are stored in ma.q / mb.q, one
 * operation is applied to (ma, mb), and mb.q is printed to stdout.  A hard-coded
 * line in the same format is printed to stderr (presumably the expected result,
 * so the two streams can be compared).  Both streams are flushed after most tests.
 *
 * NOTE: the "if(rval)" guard is commented out, so the tests run regardless of the
 * value returned by mmx_ok(); rval is assigned but not otherwise used.
 * The program always terminates through exit(0).
 */
int main()
{
	int rval;
	mmx_t ma;
	mmx_t mb;

	/* Issued before mmx_ok() is consulted. */
	movq_r2r(mm0, mm1);

	rval = mmx_ok();

	/* Announce return value of mmx_ok() */
//	printf("Value returned from init was %x.", rval);
//	printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not");
//	fflush(stdout); fflush(stderr);

//	if(rval)
	{
		/* PADD *****************************************************/
		ma.q = 0x1111111180000000LL;
		mb.q = 0x7fffffff00000001LL;
		paddd(ma, mb);
		fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddd: mb.q is 9111111080000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000002\n");
		fflush(stdout); fflush(stderr);

		/* same operand values as above, with ma and mb exchanged */
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddb(ma, mb);
		fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddb: mb.q is 8180000281800002\n");
		fflush(stdout); fflush(stderr);

		/* PADDS ****************************************************/
		/* Note: the two paddsw tests do not flush; the flush after paddsb covers them. */
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n");

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n");

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddsb(ma, mb);
		fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n");
		fflush(stdout); fflush(stderr);

		/* PADDUS ***************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddusb(ma, mb);
		fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n");
		fflush(stdout); fflush(stderr);

		/* PSUB *****************************************************/
		ma.q = 0x7fffffff00000001LL;
		mb.q = 0x1111111180000000LL;
		psubd(ma, mb);
		fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubd: mb.q is 911111127fffffff\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 8001800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubb(ma, mb);
		fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n");
		fflush(stdout); fflush(stderr);

		/* PSUBS ****************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubsb(ma, mb);
		fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n");
		fflush(stdout); fflush(stderr);

		/* PSUBUS ***************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubusb(ma, mb);
		fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n");
		fflush(stdout); fflush(stderr);

		/* PMUL *****************************************************/
		/* ma is set once; mb is re-initialised before the second multiply. */
		ma.q = 0x8000ffff00ff0000LL;
		mb.q = 0x0200ffff00ffffffLL;
		pmulhw(ma, mb);
		fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0200ffff00ffffffLL;
		pmullw(ma, mb);
		fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n");
		fflush(stdout); fflush(stderr);

		/* PMADD ****************************************************/
		ma.q = 0x8000345680007f34LL;
		mb.q = 0x93234a27ffff1707LL;
		pmaddwd(ma, mb);
		fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n");
		fflush(stdout); fflush(stderr);

		/* PCMPEQ ***************************************************/
		/* ma is kept for all three compares; mb is reloaded each time. */
		ma.q = 0x800034568f237f34LL;
		mb.q = 0x93009a568f237f34LL;
		pcmpeqd(ma, mb);
		fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqw(ma, mb);
		fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqb(ma, mb);
		fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n");
		fflush(stdout); fflush(stderr);

		/* PCMPGT ***************************************************/
		ma.q = 0x666688884477aaffLL;
		mb.q = 0x1234567890abcdefLL;
		pcmpgtd(ma, mb);
		fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtw(ma, mb);
		fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtb(ma, mb);
		fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n");
		fflush(stdout); fflush(stderr);

		/* PACKSS ***************************************************/
		ma.q = 0x00012222000abbbbLL;
		mb.q = 0x0000888800003333LL;
		packssdw(ma, mb);
		fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;
		packsswb(ma, mb);
		fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n");
		fflush(stdout); fflush(stderr);

		/* PACKUS ***************************************************/
		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;
		packuswb(ma, mb);
		fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n");
		fflush(stdout); fflush(stderr);

		/* PUNPCKH **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;
		punpckhdq(ma, mb);
		fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhwd(ma, mb);
		fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhbw(ma, mb);
		fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n");
		fflush(stdout); fflush(stderr);

		/* PUNPCKL **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;
		punpckldq(ma, mb);
		fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklwd(ma, mb);
		fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklbw(ma, mb);
		fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n");
		fflush(stdout); fflush(stderr);

		/* PAND, PANDN, POR, PXOR ***********************************/
		ma.q = 0x5555555555555555LL;
		mb.q = 0x3333333333333333LL;
		pand(ma, mb);
		fprintf(stdout, "pand: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pand: mb.q is 1111111111111111\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pandn(ma, mb);
		fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pandn: mb.q is 4444444444444444\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		por(ma, mb);
		fprintf(stdout, "por: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "por: mb.q is 7777777777777777\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pxor(ma, mb);
		fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pxor: mb.q is 6666666666666666\n");
		fflush(stdout); fflush(stderr);

		/* PSLL *****************************************************/
		/* ma carries the shift count (0x18) for all three shifts. */
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;
		psllq(ma, mb);
		fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		pslld(ma, mb);
		fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pslld: mb.q is 67000000ef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psllw(ma, mb);
		fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		/* PSRL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;
		psrlq(ma, mb);
		fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlq: mb.q is 0000000123456789\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrld(ma, mb);
		fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrld: mb.q is 0000000100000089\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrlw(ma, mb);
		fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		/* PSRA *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;
		psrad(ma, mb);
		fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psraw(ma, mb);
		fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		/* Exit MMX *************************************************/
		emms();
	}

	/* Clean-up and exit nicely */
	exit(0);
}
void GPUDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Operand& f) { psubw(a, b); modulate16<shift>(a, f); paddw(a, b); }
// Emits the texture sampling code. Nothing is emitted when m_sel.tme is not set.
//
// Register contract, as tracked by the comments in the body:
//   in : xmm2 = s, xmm3 = t, xmm7 = test, xmm1 = in use by the caller
//   out: xmm4 = r, xmm5 = g, xmm6 = b, xmm3 = a, xmm7 = test (updated), xmm1 preserved
//
// With m_sel.ltf four texels are read and blended with lerp16<0> using the fractions
// stored in m_local.temp.uf / vf; otherwise a single texel is read and its channels
// are extracted. m_sel.twin selects mask-and-add addressing via m_local.twin[0]/[1]
// instead of the pminsw limit against m_local.twin[2].
void GPUDrawScanlineCodeGenerator::SampleTexture()
{
	if(!m_sel.tme)
	{
		return;
	}

	if(m_sel.tlu)
	{
		// edx = clut pointer (presumably consumed by ReadTexel, which is defined elsewhere)
		mov(edx, ptr[&m_local.gd->clut]);
	}

	// xmm2 = s
	// xmm3 = t
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f
		// GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f

		mov(eax, 0x00200020);
		movd(xmm0, eax);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));

		psubw(xmm2, xmm0);
		psubw(xmm3, xmm0);

		// GSVector4i uf = (u & GSVector4i::x00ff()) << 7;
		// GSVector4i vf = (v & GSVector4i::x00ff()) << 7;

		// (shift left by 8 then right by 1 keeps the low byte of each word, moved up by 7)
		movdqa(xmm0, xmm2);
		psllw(xmm0, 8);
		psrlw(xmm0, 1);
		movdqa(ptr[&m_local.temp.uf], xmm0);

		// vf is only computed here when m_sel.sprite is not set
		if(!m_sel.sprite)
		{
			movdqa(xmm0, xmm3);
			psllw(xmm0, 8);
			psrlw(xmm0, 1);
			movdqa(ptr[&m_local.temp.vf], xmm0);
		}
	}

	// GSVector4i u0 = s.srl16(8);
	// GSVector4i v0 = t.srl16(8);

	psrlw(xmm2, 8);
	psrlw(xmm3, 8);

	// xmm2 = u
	// xmm3 = v
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u1 = u0.add16(GSVector4i::x0001());
		// GSVector4i v1 = v0.add16(GSVector4i::x0001());

		movdqa(xmm4, xmm2);
		movdqa(xmm5, xmm3);
		// xmm0 = 0x0001 in every word (all ones >> 15)
		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 15);
		paddw(xmm4, xmm0);
		paddw(xmm5, xmm0);

		if(m_sel.twin)
		{
			// u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
			// u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);

			movdqa(xmm0, ptr[&m_local.twin[0].u]);
			movdqa(xmm6, ptr[&m_local.twin[1].u]);

			pand(xmm2, xmm0);
			paddw(xmm2, xmm6);
			pand(xmm4, xmm0);
			paddw(xmm4, xmm6);

			movdqa(xmm0, ptr[&m_local.twin[0].v]);
			movdqa(xmm6, ptr[&m_local.twin[1].v]);

			pand(xmm3, xmm0);
			paddw(xmm3, xmm6);
			pand(xmm5, xmm0);
			paddw(xmm5, xmm6);
		}
		else
		{
			// u0 = u0.min_i16(m_local.twin[2].u);
			// v0 = v0.min_i16(m_local.twin[2].v);
			// u1 = u1.min_i16(m_local.twin[2].u);
			// v1 = v1.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			movdqa(xmm0, ptr[&m_local.twin[2].u]);
			movdqa(xmm6, ptr[&m_local.twin[2].v]);

			pminsw(xmm2, xmm0);
			pminsw(xmm3, xmm6);
			pminsw(xmm4, xmm0);
			pminsw(xmm5, xmm6);
		}

		// xmm2 = u0
		// xmm3 = v0
		// xmm4 = u1
		// xmm5 = v1
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// GSVector4i addr00 = v0.sll16(8) | u0;
		// GSVector4i addr01 = v0.sll16(8) | u1;
		// GSVector4i addr10 = v1.sll16(8) | u0;
		// GSVector4i addr11 = v1.sll16(8) | u1;

		psllw(xmm3, 8);
		movdqa(xmm0, xmm3);
		por(xmm3, xmm2);
		por(xmm0, xmm4);

		psllw(xmm5, 8);
		movdqa(xmm6, xmm5);
		por(xmm5, xmm2);
		por(xmm6, xmm4);

		// xmm3 = addr00
		// xmm0 = addr01
		// xmm5 = addr10
		// xmm6 = addr11
		// xmm7 = test
		// xmm2, xmm4 = free
		// xmm1 = used

		// ReadTexel(dst, addr): each destination is a register whose address has already been consumed
		ReadTexel(xmm2, xmm3);
		ReadTexel(xmm4, xmm0);
		ReadTexel(xmm3, xmm5);
		ReadTexel(xmm5, xmm6);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// spill (TODO)

		// xmm1 and xmm7 are saved so they can be used as temporaries; both are reloaded below
		movdqa(ptr[&m_local.temp.fd], xmm1);
		movdqa(ptr[&m_local.temp.test], xmm7);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm1, xmm6, xmm7 = free

		// top row (c00, c01), blended horizontally with uf

		// r: bits 0-4 of each word, moved to bits 3-7
		movdqa(xmm1, xmm2);
		psllw(xmm1, 11);
		psrlw(xmm1, 8);
		movdqa(xmm0, xmm4);
		psllw(xmm0, 11);
		psrlw(xmm0, 8);
		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]);

		// g: bits 5-9 of each word, moved to bits 3-7
		movdqa(xmm6, xmm2);
		psllw(xmm6, 6);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);
		movdqa(xmm1, xmm4);
		psllw(xmm1, 6);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);
		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]);

		// b: bits 10-14 of each word, moved to bits 3-7
		movdqa(xmm7, xmm2);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);
		movdqa(xmm6, xmm4);
		psllw(xmm6, 1);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);
		lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]);

		// a: bit 15 of each word expanded to 0x00ff / 0x0000
		psraw(xmm2, 15);
		psrlw(xmm2, 8);
		psraw(xmm4, 15);
		psrlw(xmm4, 8);
		lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]);

		// xmm0 = r00
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm2, xmm7 = free

		// bottom row (c10, c11): blended horizontally with uf, then against the top row with vf

		movdqa(xmm7, xmm3);
		psllw(xmm7, 11);
		psrlw(xmm7, 8);
		movdqa(xmm2, xmm5);
		psllw(xmm2, 11);
		psrlw(xmm2, 8);
		lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm7 = free

		movdqa(xmm7, xmm3);
		psllw(xmm7, 6);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);
		movdqa(xmm0, xmm5);
		psllw(xmm0, 6);
		psrlw(xmm0, 11);
		psllw(xmm0, 3);
		lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm1, xmm7 = free

		movdqa(xmm7, xmm3);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);
		movdqa(xmm1, xmm5);
		psllw(xmm1, 1);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);
		lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm6, xmm7 = free

		psraw(xmm3, 15);
		psrlw(xmm3, 8);
		psraw(xmm5, 15);
		psrlw(xmm5, 8);
		lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm5 = a
		// xmm3, xmm4, xmm6, xmm7 = free

		// TODO

		// move the results into the output registers (same layout as the non-ltf path)
		movdqa(xmm3, xmm5); // a
		movdqa(xmm4, xmm2); // r
		movdqa(xmm6, xmm1); // b
		movdqa(xmm5, xmm0); // g

		// reload test

		movdqa(xmm7, ptr[&m_local.temp.test]);

		// xmm4 = r
		// xmm5 = g
		// xmm6 = b
		// xmm3 = a
		// xmm7 = test
		// xmm0, xmm1, xmm2 = free

		// test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect)

		movdqa(xmm1, xmm3);
		por(xmm1, xmm4);
		movdqa(xmm2, xmm5);
		por(xmm2, xmm6);
		por(xmm1, xmm2);

		pxor(xmm0, xmm0);
		pcmpeqw(xmm1, xmm0);
		por(xmm7, xmm1);

		// a = a.gt16(GSVector4i::zero());

		pcmpgtw(xmm3, xmm0);

		// reload fd

		movdqa(xmm1, ptr[&m_local.temp.fd]);
	}
	else
	{
		if(m_sel.twin)
		{
			// u = (u & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v = (v & m_local.twin[0].v).add16(m_local.twin[1].v);

			pand(xmm2, ptr[&m_local.twin[0].u]);
			paddw(xmm2, ptr[&m_local.twin[1].u]);
			pand(xmm3, ptr[&m_local.twin[0].v]);
			paddw(xmm3, ptr[&m_local.twin[1].v]);
		}
		else
		{
			// u = u.min_i16(m_local.twin[2].u);
			// v = v.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			pminsw(xmm2, ptr[&m_local.twin[2].u]);
			pminsw(xmm3, ptr[&m_local.twin[2].v]);
		}

		// xmm2 = u
		// xmm3 = v
		// xmm7 = test
		// xmm0, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		// GSVector4i addr = v.sll16(8) | u;

		psllw(xmm3, 8);
		por(xmm3, xmm2);

		// xmm3 = addr
		// xmm7 = test
		// xmm0, xmm2, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		ReadTexel(xmm6, xmm3);

		// xmm6 = c00 (the texel is read into xmm6; xmm3 is overwritten below)
		// xmm7 = test
		// xmm0, xmm2, xmm3, xmm4, xmm5 = free
		// xmm1 = used

		// test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels

		pxor(xmm0, xmm0);
		pcmpeqw(xmm0, xmm6);
		por(xmm7, xmm0);

		// c[0] = (c00 << 3) & 0x00f800f8;
		// c[1] = (c00 >> 2) & 0x00f800f8;
		// c[2] = (c00 >> 7) & 0x00f800f8;
		// c[3] = c00.sra16(15);

		movdqa(xmm3, xmm6);
		psraw(xmm3, 15); // a

		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 11);
		psllw(xmm0, 3); // 0x00f8

		movdqa(xmm4, xmm6);
		psllw(xmm4, 3);
		pand(xmm4, xmm0); // r

		movdqa(xmm5, xmm6);
		psrlw(xmm5, 2);
		pand(xmm5, xmm0); // g

		psrlw(xmm6, 7);
		pand(xmm6, xmm0); // b
	}
}
void GPUDrawScanlineCodeGenerator::Step() { // steps -= 8; sub(ecx, 8); // fb += 8; add(edi, 8 * sizeof(uint16)); if(m_sel.tme) { // GSVector4i st = m_local.d8.st; movdqa(xmm4, ptr[&m_local.d8.st]); // s = s.add16(st.xxxx()); // t = t.add16(st.yyyy()); pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); paddw(xmm2, ptr[&m_local.temp.s]); movdqa(ptr[&m_local.temp.s], xmm2); // TODO: if(!sprite) ... else reload t pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); paddw(xmm3, ptr[&m_local.temp.t]); movdqa(ptr[&m_local.temp.t], xmm3); } if(m_sel.tfx != 3) // != decal { if(m_sel.iip) { // GSVector4i c = m_local.d8.c; // r = r.add16(c.xxxx()); // g = g.add16(c.yyyy()); // b = b.add16(c.zzzz()); movdqa(xmm6, ptr[&m_local.d8.c]); pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1)); pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); paddw(xmm4, ptr[&m_local.temp.r]); paddw(xmm5, ptr[&m_local.temp.g]); paddw(xmm6, ptr[&m_local.temp.b]); movdqa(ptr[&m_local.temp.r], xmm4); movdqa(ptr[&m_local.temp.g], xmm5); movdqa(ptr[&m_local.temp.b], xmm6); } else { movdqa(xmm4, ptr[&m_local.temp.r]); movdqa(xmm5, ptr[&m_local.temp.g]); movdqa(xmm6, ptr[&m_local.temp.b]); } } }
void GPUDrawScanlineCodeGenerator::Init() { mov(eax, dword[esp + _top]); // uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left; mov(edi, eax); shl(edi, 10 + m_sel.scalex); add(edi, edx); lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]); // int steps = pixels - 8; sub(ecx, 8); if(m_sel.dtd) { // dither = GSVector4i::load<false>(&m_dither[top & 3][left & 3]); and(eax, 3); shl(eax, 5); and(edx, 3); shl(edx, 1); movdqu(xmm0, ptr[eax + edx + (size_t)m_dither]); movdqa(ptr[&m_local.temp.dither], xmm0); } mov(edx, dword[esp + _v]); if(m_sel.tme) { mov(esi, dword[&m_local.gd->tex]); // GSVector4i vt = GSVector4i(v.t).xxzzl(); cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]); pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); // s = vt.xxxx().add16(m_local.d.s); // t = vt.yyyy().add16(m_local.d.t); pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); paddw(xmm2, ptr[&m_local.d.s]); if(!m_sel.sprite) { paddw(xmm3, ptr[&m_local.d.t]); } else { if(m_sel.ltf) { movdqa(xmm0, xmm3); psllw(xmm0, 8); psrlw(xmm0, 1); movdqa(ptr[&m_local.temp.vf], xmm0); } } movdqa(ptr[&m_local.temp.s], xmm2); movdqa(ptr[&m_local.temp.t], xmm3); } if(m_sel.tfx != 3) // != decal { // GSVector4i vc = GSVector4i(v.c).xxzzlh(); cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]); pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); // r = vc.xxxx(); // g = vc.yyyy(); // b = vc.zzzz(); pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1)); pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); if(m_sel.iip) { // r = r.add16(m_local.d.r); // g = g.add16(m_local.d.g); // b = b.add16(m_local.d.b); paddw(xmm4, ptr[&m_local.d.r]); paddw(xmm5, ptr[&m_local.d.g]); paddw(xmm6, ptr[&m_local.d.b]); } movdqa(ptr[&m_local.temp.r], xmm4); movdqa(ptr[&m_local.temp.g], xmm5); movdqa(ptr[&m_local.temp.b], xmm6); } }
// Emits the texture sampling code. Nothing is emitted when m_sel.fb is not set or
// when m_sel.tfx == TFX_NONE.
//
// Register contract, as tracked by the comments in the body:
//   in : xmm2 = u, xmm3 = v (converted from float here unless m_sel.fst), xmm7 = in use
//   out: xmm5 = rb, xmm6 = ga (two 8-bit channels per 32-bit pixel, one per 16-bit word)
//   ebx is loaded with m_env.tex.
//
// With m_sel.ltf four texels are fetched and blended with lerp16<0> using the
// fractions stored in m_env.temp.uf / vf; otherwise a single texel is fetched.
void GSDrawScanlineCodeGenerator::SampleTexture()
{
	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
	{
		return;
	}

	mov(ebx, dword[&m_env.tex]);

	// ebx = tex

	if(!m_sel.fst)
	{
		// TODO: move these into Init/Step too?

		cvttps2dq(xmm2, xmm2);
		cvttps2dq(xmm3, xmm3);

		if(m_sel.ltf)
		{
			// u -= 0x8000;
			// v -= 0x8000;

			mov(eax, 0x8000);
			movd(xmm4, eax);
			pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
			psubd(xmm2, xmm4);
			psubd(xmm3, xmm4);
		}
	}

	// xmm2 = u
	// xmm3 = v

	if(m_sel.ltf)
	{
		// GSVector4i uf = u.xxzzlh().srl16(1);

		movdqa(xmm0, xmm2);
		pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
		pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
		psrlw(xmm0, 1);
		movdqa(xmmword[&m_env.temp.uf], xmm0);

		// vf is only computed here when m_sel.sprite is not set
		if(!m_sel.sprite)
		{
			// GSVector4i vf = v.xxzzlh().srl16(1);

			movdqa(xmm1, xmm3);
			pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
			psrlw(xmm1, 1);
			movdqa(xmmword[&m_env.temp.vf], xmm1);
		}
	}

	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));

	// after the pack: low four words of xmm2 = u, high four words = v
	psrad(xmm2, 16);
	psrad(xmm3, 16);
	packssdw(xmm2, xmm3);

	if(m_sel.ltf)
	{
		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());

		movdqa(xmm3, xmm2);
		// xmm1 = 0x0001 in every word (all ones >> 15)
		pcmpeqd(xmm1, xmm1);
		psrlw(xmm1, 15);
		paddw(xmm3, xmm1);

		// uv0 = Wrap(uv0);
		// uv1 = Wrap(uv1);

		Wrap(xmm2, xmm3);
	}
	else
	{
		// uv0 = Wrap(uv0);

		Wrap(xmm2);
	}

	// xmm2 = uv0
	// xmm3 = uv1 (ltf)
	// xmm0, xmm1, xmm4, xmm5, xmm6 = free
	// xmm7 = used

	// GSVector4i y0 = uv0.uph16() << tw;
	// GSVector4i x0 = uv0.upl16();

	pxor(xmm0, xmm0);
	movd(xmm1, ptr[&m_env.tw]);

	movdqa(xmm4, xmm2);
	punpckhwd(xmm2, xmm0);
	punpcklwd(xmm4, xmm0);
	pslld(xmm2, xmm1);

	// xmm0 = 0
	// xmm1 = tw
	// xmm2 = y0
	// xmm3 = uv1 (ltf)
	// xmm4 = x0
	// xmm5, xmm6 = free
	// xmm7 = used

	if(m_sel.ltf)
	{
		// GSVector4i y1 = uv1.uph16() << tw;
		// GSVector4i x1 = uv1.upl16();

		movdqa(xmm6, xmm3);
		punpckhwd(xmm3, xmm0);
		punpcklwd(xmm6, xmm0);
		pslld(xmm3, xmm1);

		// xmm2 = y0
		// xmm3 = y1
		// xmm4 = x0
		// xmm6 = x1
		// xmm0, xmm5 = free (xmm0 is overwritten below)
		// xmm7 = used

		// GSVector4i addr00 = y0 + x0;
		// GSVector4i addr01 = y0 + x1;
		// GSVector4i addr10 = y1 + x0;
		// GSVector4i addr11 = y1 + x1;

		movdqa(xmm5, xmm2);
		paddd(xmm5, xmm4);
		paddd(xmm2, xmm6);

		movdqa(xmm0, xmm3);
		paddd(xmm0, xmm4);
		paddd(xmm3, xmm6);

		// xmm5 = addr00
		// xmm2 = addr01
		// xmm0 = addr10
		// xmm3 = addr11
		// xmm1, xmm4, xmm6 = free
		// xmm7 = used

		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);

		// ReadTexel(dst, addr, temp1, temp2): destination and temporaries are drawn from
		// registers that are free at that point.
		// NOTE(review): the three "= free" notes between the calls are kept verbatim from
		// the original; confirm which call each one describes.

		ReadTexel(xmm6, xmm5, xmm1, xmm4);

		// xmm2, xmm5, xmm1 = free

		ReadTexel(xmm4, xmm2, xmm5, xmm1);

		// xmm0, xmm2, xmm5 = free

		ReadTexel(xmm1, xmm0, xmm2, xmm5);

		// xmm3, xmm0, xmm2 = free

		ReadTexel(xmm5, xmm3, xmm0, xmm2);

		// xmm6 = c00
		// xmm4 = c01
		// xmm1 = c10
		// xmm5 = c11
		// xmm0, xmm2, xmm3 = free
		// xmm7 = used

		movdqa(xmm0, xmmword[&m_env.temp.uf]);

		// GSVector4i rb00 = c00 & mask;
		// GSVector4i ga00 = (c00 >> 8) & mask;

		// (psllw 8 / psrlw 8 keeps the low byte of each word; psrlw 8 keeps the high byte)
		movdqa(xmm2, xmm6);
		psllw(xmm2, 8);
		psrlw(xmm2, 8);
		psrlw(xmm6, 8);

		// GSVector4i rb01 = c01 & mask;
		// GSVector4i ga01 = (c01 >> 8) & mask;

		movdqa(xmm3, xmm4);
		psllw(xmm3, 8);
		psrlw(xmm3, 8);
		psrlw(xmm4, 8);

		// xmm0 = uf
		// xmm2 = rb00
		// xmm3 = rb01
		// xmm6 = ga00
		// xmm4 = ga01
		// xmm1 = c10
		// xmm5 = c11
		// xmm7 = used

		// rb00 = rb00.lerp16<0>(rb01, uf);
		// ga00 = ga00.lerp16<0>(ga01, uf);

		lerp16<0>(xmm3, xmm2, xmm0);
		lerp16<0>(xmm4, xmm6, xmm0);

		// xmm0 = uf
		// xmm3 = rb00
		// xmm4 = ga00
		// xmm1 = c10
		// xmm5 = c11
		// xmm2, xmm6 = free
		// xmm7 = used

		// GSVector4i rb10 = c10 & mask;
		// GSVector4i ga10 = (c10 >> 8) & mask;

		movdqa(xmm2, xmm1);
		psllw(xmm1, 8);
		psrlw(xmm1, 8);
		psrlw(xmm2, 8);

		// GSVector4i rb11 = c11 & mask;
		// GSVector4i ga11 = (c11 >> 8) & mask;

		movdqa(xmm6, xmm5);
		psllw(xmm5, 8);
		psrlw(xmm5, 8);
		psrlw(xmm6, 8);

		// xmm0 = uf
		// xmm3 = rb00
		// xmm4 = ga00
		// xmm1 = rb10
		// xmm5 = rb11
		// xmm2 = ga10
		// xmm6 = ga11
		// xmm7 = used

		// rb10 = rb10.lerp16<0>(rb11, uf);
		// ga10 = ga10.lerp16<0>(ga11, uf);

		lerp16<0>(xmm5, xmm1, xmm0);
		lerp16<0>(xmm6, xmm2, xmm0);

		// xmm3 = rb00
		// xmm4 = ga00
		// xmm5 = rb10
		// xmm6 = ga10
		// xmm0, xmm1, xmm2 = free
		// xmm7 = used

		// rb00 = rb00.lerp16<0>(rb10, vf);
		// ga00 = ga00.lerp16<0>(ga10, vf);

		// the final results are left in xmm5 (rb) and xmm6 (ga), matching the non-ltf path
		movdqa(xmm0, xmmword[&m_env.temp.vf]);

		lerp16<0>(xmm5, xmm3, xmm0);
		lerp16<0>(xmm6, xmm4, xmm0);
	}
	else
	{
		// GSVector4i addr00 = y0 + x0;

		paddd(xmm2, xmm4);

		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);

		ReadTexel(xmm5, xmm2, xmm0, xmm1);

		// GSVector4i mask = GSVector4i::x00ff();

		// c[0] = c00 & mask;
		// c[1] = (c00 >> 8) & mask;

		movdqa(xmm6, xmm5);

		psllw(xmm5, 8);
		psrlw(xmm5, 8);
		psrlw(xmm6, 8);
	}
}
void GSDrawScanlineCodeGenerator::Step() { // steps -= 4; sub(ecx, 4); // fza_offset++; add(edi, 8); if(!m_sel.sprite) { // z += m_env.d4.z; if(m_sel.zb) { movaps(xmm0, xmmword[&m_env.temp.z]); addps(xmm0, xmmword[&m_env.d4.z]); movaps(xmmword[&m_env.temp.z], xmm0); } // f = f.add16(m_env.d4.f); if(m_sel.fwrite && m_sel.fge) { movdqa(xmm1, xmmword[&m_env.temp.f]); paddw(xmm1, xmmword[&m_env.d4.f]); movdqa(xmmword[&m_env.temp.f], xmm1); } } else { if(m_sel.ztest) { movdqa(xmm0, xmmword[&m_env.p.z]); } } if(m_sel.fb) { if(m_sel.tfx != TFX_NONE) { if(m_sel.fst) { // GSVector4i st = m_env.d4.st; // si += st.xxxx(); // if(!sprite) ti += st.yyyy(); movdqa(xmm4, xmmword[&m_env.d4.st]); pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); paddd(xmm2, xmmword[&m_env.temp.s]); movdqa(xmmword[&m_env.temp.s], xmm2); if(!m_sel.sprite) { pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); paddd(xmm3, xmmword[&m_env.temp.t]); movdqa(xmmword[&m_env.temp.t], xmm3); } else { movdqa(xmm3, xmmword[&m_env.temp.t]); } } else { // GSVector4 stq = m_env.d4.stq; // s += stq.xxxx(); // t += stq.yyyy(); // q += stq.zzzz(); movaps(xmm2, xmmword[&m_env.d4.stq]); movaps(xmm3, xmm2); movaps(xmm4, xmm2); shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); addps(xmm2, xmmword[&m_env.temp.s]); addps(xmm3, xmmword[&m_env.temp.t]); addps(xmm4, xmmword[&m_env.temp.q]); movaps(xmmword[&m_env.temp.s], xmm2); movaps(xmmword[&m_env.temp.t], xmm3); movaps(xmmword[&m_env.temp.q], xmm4); rcpps(xmm4, xmm4); mulps(xmm2, xmm4); mulps(xmm3, xmm4); } } if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) { if(m_sel.iip) { // GSVector4i c = m_env.d4.c; // rb = rb.add16(c.xxxx()); // ga = ga.add16(c.yyyy()); movdqa(xmm7, xmmword[&m_env.d4.c]); pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1)); paddw(xmm5, xmmword[&m_env.temp.rb]); paddw(xmm6, xmmword[&m_env.temp.ga]); movdqa(xmmword[&m_env.temp.rb], xmm5); 
movdqa(xmmword[&m_env.temp.ga], xmm6); } else { if(m_sel.tfx == TFX_NONE) { movdqa(xmm5, xmmword[&m_env.c.rb]); movdqa(xmm6, xmmword[&m_env.c.ga]); } } } } // test = m_test[7 + (steps & (steps >> 31))]; mov(edx, ecx); sar(edx, 31); and(edx, ecx); shl(edx, 4); movdqa(xmm7, xmmword[edx + (size_t)&m_test[7]]); }
// Emits the scanline prologue: derives the loop counter, the initial write
// mask and the frame/z address pointers, then seeds every enabled interpolant
// (fog, z, coverage, texture coordinates, colour) from the vertex plus the
// per-skip offsets in m_env.d[skip].
//
// params is the stack offset used to locate the arguments: "top" is read from
// [esp + params + 4] and the vertex pointer "v" from [esp + params + 8].
//
// Going by the inline pseudo-code, the generated code expects edx = left and
// ecx = right on entry (NOTE(review): set up by the caller, not visible here).
// After the prologue:
//   ecx = steps, esi = fza_base, edi = fza_offset, xmm7 = test mask,
//   edx = &m_env.d[skip] and ebx = &v (only when some section below needs them).
//
// Slots of m_env.d[skip] as addressed below (16 bytes each, relative to edx):
//   0 = z, 1 = s, 2 = t, 3 = q, 4 = rb, 5 = ga, 6 = f, 7 = si, 8 = ti
//
// The instruction order is significant: edx and ebx are repurposed half way
// through and later sections depend on their new contents.
void GSDrawScanlineCodeGenerator::Init(int params)
{
    const int _top = params + 4;
    const int _v = params + 8;

    // int skip = left & 3;

    mov(ebx, edx);
    and(edx, 3);

    // left -= skip;

    sub(ebx, edx);

    // int steps = right - left - 4;

    sub(ecx, ebx);
    sub(ecx, 4);

    // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];

    // edx = skip * 16, the byte offset of m_test[skip]
    shl(edx, 4);

    movdqa(xmm7, xmmword[edx + (size_t)&m_test[0]]);

    // eax = (steps < 0 ? steps : 0) * 16
    mov(eax, ecx);
    sar(eax, 31);
    and(eax, ecx);
    shl(eax, 4);

    por(xmm7, xmmword[eax + (size_t)&m_test[7]]);

    // GSVector2i* fza_base = &m_env.fzbr[top];

    mov(esi, dword[esp + _top]);
    lea(esi, ptr[esi * 8]);
    add(esi, dword[&m_env.fzbr]);

    // GSVector2i* fza_offset = &m_env.fzbc[left >> 2];

    // left * 2 == (left >> 2) * 8 because the low two bits of left were
    // removed above
    lea(edi, ptr[ebx * 2]);
    add(edi, dword[&m_env.fzbc]);

    // edx / ebx are only set up when one of the sections below reads
    // m_env.d[skip] or the vertex (the condition appears to mirror the guards
    // of those sections).
    if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
    {
        // edx = &m_env.d[skip]

        // edx already holds skip * 16, so this makes it skip * 256
        // (presumably sizeof(m_env.d[0]) -- TODO confirm against the struct)
        shl(edx, 4);

        lea(edx, ptr[edx + (size_t)m_env.d]);

        // ebx = &v

        mov(ebx, dword[esp + _v]);
    }

    if(!m_sel.sprite)
    {
        if(m_sel.fwrite && m_sel.fge || m_sel.zb)
        {
            movaps(xmm0, xmmword[ebx + 16]); // v.p

            if(m_sel.fwrite && m_sel.fge)
            {
                // f = GSVector4i(vp).zzzzh().zzzz().add16(m_env.d[skip].f);

                cvttps2dq(xmm1, xmm0);
                pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
                pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
                paddw(xmm1, xmmword[edx + 16 * 6]);

                movdqa(xmmword[&m_env.temp.f], xmm1);
            }

            if(m_sel.zb)
            {
                // z = vp.zzzz() + m_env.d[skip].z;

                shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
                addps(xmm0, xmmword[edx]);

                movaps(xmmword[&m_env.temp.z], xmm0);
            }
        }
    }
    else
    {
        // sprites: z is just loaded from m_env.p.z when the depth test is on
        if(m_sel.ztest)
        {
            movdqa(xmm0, xmmword[&m_env.p.z]);
        }
    }

    if(m_sel.fb)
    {
        if(m_sel.edge || m_sel.tfx != TFX_NONE)
        {
            movaps(xmm4, xmmword[ebx + 32]); // v.t
        }

        if(m_sel.edge)
        {
            // Broadcasts one 16-bit word taken from the top dword of v.t,
            // shifts it right by 9 and stores it as the coverage value.
            pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
            pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3));
            psrlw(xmm3, 9);

            movdqa(xmmword[&m_env.temp.cov], xmm3);
        }

        if(m_sel.tfx != TFX_NONE)
        {
            if(m_sel.fst)
            {
                // GSVector4i vti(vt);

                cvttps2dq(xmm4, xmm4);

                // si = vti.xxxx() + m_env.d[skip].si;
                // ti = vti.yyyy(); if(!sprite) ti += m_env.d[skip].ti;

                pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));

                paddd(xmm2, xmmword[edx + 16 * 7]);

                if(!m_sel.sprite)
                {
                    paddd(xmm3, xmmword[edx + 16 * 8]);
                }
                else
                {
                    if(m_sel.ltf)
                    {
                        // Derives m_env.temp.vf from the low word of each ti
                        // lane (duplicated into both halves of the lane and
                        // shifted right by one).
                        movdqa(xmm4, xmm3);
                        pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
                        pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
                        psrlw(xmm4, 1);
                        movdqa(xmmword[&m_env.temp.vf], xmm4);
                    }
                }

                movdqa(xmmword[&m_env.temp.s], xmm2);
                movdqa(xmmword[&m_env.temp.t], xmm3);
            }
            else
            {
                // s = vt.xxxx() + m_env.d[skip].s;
                // t = vt.yyyy() + m_env.d[skip].t;
                // q = vt.zzzz() + m_env.d[skip].q;

                movaps(xmm2, xmm4);
                movaps(xmm3, xmm4);

                shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
                shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
                shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));

                addps(xmm2, xmmword[edx + 16 * 1]);
                addps(xmm3, xmmword[edx + 16 * 2]);
                addps(xmm4, xmmword[edx + 16 * 3]);

                movaps(xmmword[&m_env.temp.s], xmm2);
                movaps(xmmword[&m_env.temp.t], xmm3);
                movaps(xmmword[&m_env.temp.q], xmm4);

                // xmm2 = s / q, xmm3 = t / q (approximate reciprocal)
                rcpps(xmm4, xmm4);
                mulps(xmm2, xmm4);
                mulps(xmm3, xmm4);
            }
        }

        if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
        {
            if(m_sel.iip)
            {
                // GSVector4i vc = GSVector4i(v.c);

                cvttps2dq(xmm6, xmmword[ebx]); // v.c

                // vc = vc.upl16(vc.zwxy());

                pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2));
                punpcklwd(xmm6, xmm5);

                // rb = vc.xxxx().add16(m_env.d[skip].rb);
                // ga = vc.zzzz().add16(m_env.d[skip].ga);

                pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));

                paddw(xmm5, xmmword[edx + 16 * 4]);
                paddw(xmm6, xmmword[edx + 16 * 5]);

                movdqa(xmmword[&m_env.temp.rb], xmm5);
                movdqa(xmmword[&m_env.temp.ga], xmm6);
            }
            else
            {
                // flat colour without texturing: load the constant colour
                if(m_sel.tfx == TFX_NONE)
                {
                    movdqa(xmm5, xmmword[&m_env.c.rb]);
                    movdqa(xmm6, xmmword[&m_env.c.ga]);
                }
            }
        }
    }
}
void GSDrawScanlineCodeGenerator::WriteFrame(int params) { const int _top = params + 4; if(!m_sel.fwrite) { return; } if(m_sel.colclamp == 0) { // c[0] &= 0x000000ff; // c[1] &= 0x000000ff; pcmpeqd(xmm7, xmm7); psrlw(xmm7, 8); pand(xmm5, xmm7); pand(xmm6, xmm7); } if(m_sel.fpsm == 2 && m_sel.dthe) { mov(eax, dword[esp + _top]); and(eax, 3); shl(eax, 5); paddw(xmm5, xmmword[eax + (size_t)&m_env.dimx[0]]); paddw(xmm6, xmmword[eax + (size_t)&m_env.dimx[1]]); } // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); movdqa(xmm7, xmm5); punpcklwd(xmm5, xmm6); punpckhwd(xmm7, xmm6); packuswb(xmm5, xmm7); if(m_sel.fba && m_sel.fpsm != 1) { // fs |= 0x80000000; pcmpeqd(xmm7, xmm7); pslld(xmm7, 31); por(xmm5, xmm7); } if(m_sel.fpsm == 2) { // GSVector4i rb = fs & 0x00f800f8; // GSVector4i ga = fs & 0x8000f800; mov(eax, 0x00f800f8); movd(xmm6, eax); pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); mov(eax, 0x8000f800); movd(xmm7, eax); pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); movdqa(xmm4, xmm5); pand(xmm4, xmm6); pand(xmm5, xmm7); // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); movdqa(xmm6, xmm4); movdqa(xmm7, xmm5); psrld(xmm4, 3); psrld(xmm6, 9); psrld(xmm5, 6); psrld(xmm7, 16); por(xmm5, xmm4); por(xmm7, xmm6); por(xmm5, xmm7); } if(m_sel.rfb) { // fs = fs.blend(fd, fm); blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm } bool fast = m_sel.rfb && m_sel.fpsm < 2; WritePixel(xmm5, xmm0, ebx, dl, fast, m_sel.fpsm); }
// Emits the alpha blending stage.
//
// Following the inline pseudo-code, the result is ((A - B) * C) + D computed
// separately on the rb pair (xmm5) and the ga pair (xmm6), where
//   A, B, D are picked by m_sel.aba / abb / abd: 0 = source colour,
//            1 = destination colour, 2 = zero
//   C       is picked by m_sel.abc: 0 = source alpha, 1 = destination alpha,
//            2 = m_env.afix
// When aba == abb the (A - B) * C term is dropped and the result is just D.
//
// Register contract as tracked by the comments below:
//   in : xmm5 / xmm6 = source rb / ga, xmm2 = destination pixel (fd)
//   out: xmm5 / xmm6 = blended rb / ga
//   xmm0, xmm1, xmm4, xmm7 are used as scratch; xmm2 and xmm3 are not written.
//
// Nothing is emitted when m_sel.fwrite is clear, or when both m_sel.abe and
// m_sel.aa1 are zero.
//
// The switch statements intentionally have no default: a selector value with
// no case emits nothing. Instruction order matters (registers are reused
// between the rb and ga halves), so do not reorder.
void GSDrawScanlineCodeGenerator::AlphaBlend()
{
    if(!m_sel.fwrite)
    {
        return;
    }

    if(m_sel.abe == 0 && m_sel.aa1 == 0)
    {
        return;
    }

    // The destination colour is only unpacked when one of the selectors
    // actually refers to it.
    if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
    {
        switch(m_sel.fpsm)
        {
        case 0:
        case 1:

            // c[2] = fd & mask;
            // c[3] = (fd >> 8) & mask;

            movdqa(xmm0, xmm2);
            movdqa(xmm1, xmm2);

            // shifting left then right by 8 keeps the low byte of each word
            psllw(xmm0, 8);
            psrlw(xmm0, 8);
            psrlw(xmm1, 8);

            break;

        case 2:

            // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
            // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);

            movdqa(xmm0, xmm2);
            movdqa(xmm1, xmm2);
            movdqa(xmm4, xmm2);

            // xmm7 starts as all ones and is shifted step by step into each
            // of the four channel masks
            pcmpeqd(xmm7, xmm7);

            psrld(xmm7, 27); // 0x0000001f
            pand(xmm0, xmm7);
            pslld(xmm0, 3);

            pslld(xmm7, 10); // 0x00007c00
            pand(xmm4, xmm7);
            pslld(xmm4, 9);

            por(xmm0, xmm4);

            movdqa(xmm4, xmm1);

            psrld(xmm7, 5); // 0x000003e0
            pand(xmm1, xmm7);
            psrld(xmm1, 2);

            // word shift on purpose: bits pushed past bit 15 are discarded
            psllw(xmm7, 10); // 0x00008000
            pand(xmm4, xmm7);
            pslld(xmm4, 8);

            por(xmm1, xmm4);

            break;
        }
    }

    // xmm5, xmm6 = src rb, ga
    // xmm0, xmm1 = dst rb, ga
    // xmm2, xmm3 = used
    // xmm4, xmm7 = free

    // keep a copy of the source rb when it is needed after xmm5 is overwritten
    if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
    {
        movdqa(xmm4, xmm5);
    }

    if(m_sel.aba != m_sel.abb)
    {
        // rb = c[aba * 2 + 0];

        switch(m_sel.aba)
        {
        case 0: break;
        case 1: movdqa(xmm5, xmm0); break;
        case 2: pxor(xmm5, xmm5); break;
        }

        // rb = rb.sub16(c[abb * 2 + 0]);

        switch(m_sel.abb)
        {
        case 0: psubw(xmm5, xmm4); break;
        case 1: psubw(xmm5, xmm0); break;
        case 2: break;
        }

        // no multiply is emitted for fpsm == 1 with abc == 1
        if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
        {
            // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_env.afix;

            switch(m_sel.abc)
            {
            case 0:
            case 1:
                movdqa(xmm7, m_sel.abc ? xmm1 : xmm6);
                pshuflw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
                pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
                psllw(xmm7, 7);
                break;
            case 2:
                movdqa(xmm7, xmmword[&m_env.afix]);
                break;
            }

            // rb = rb.modulate16<1>(a);

            modulate16<1>(xmm5, xmm7);
        }

        // rb = rb.add16(c[abd * 2 + 0]);

        switch(m_sel.abd)
        {
        case 0: paddw(xmm5, xmm4); break;
        case 1: paddw(xmm5, xmm0); break;
        case 2: break;
        }
    }
    else
    {
        // rb = c[abd * 2 + 0];

        switch(m_sel.abd)
        {
        case 0: break;
        case 1: movdqa(xmm5, xmm0); break;
        case 2: pxor(xmm5, xmm5); break;
        }
    }

    if(m_sel.pabe)
    {
        // mask = (c[1] << 8).sra32(31);

        movdqa(xmm0, xmm6);
        pslld(xmm0, 8);
        psrad(xmm0, 31);

        // rb = c[0].blend8(rb, mask);

        blend8r(xmm5, xmm4);
    }

    // xmm6 = src ga
    // xmm1 = dst ga
    // xmm5 = rb
    // xmm7 = a
    // xmm2, xmm3 = used
    // xmm0, xmm4 = free

    movdqa(xmm4, xmm6);

    if(m_sel.aba != m_sel.abb)
    {
        // ga = c[aba * 2 + 1];

        switch(m_sel.aba)
        {
        case 0: break;
        case 1: movdqa(xmm6, xmm1); break;
        case 2: pxor(xmm6, xmm6); break;
        }

        // ga = ga.sub16(c[abb * 2 + 1]);

        switch(m_sel.abb)
        {
        case 0: psubw(xmm6, xmm4); break;
        case 1: psubw(xmm6, xmm1); break;
        case 2: break;
        }

        if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
        {
            // ga = ga.modulate16<1>(a);

            // xmm7 still holds the factor loaded for the rb half
            modulate16<1>(xmm6, xmm7);
        }

        // ga = ga.add16(c[abd * 2 + 1]);

        switch(m_sel.abd)
        {
        case 0: paddw(xmm6, xmm4); break;
        case 1: paddw(xmm6, xmm1); break;
        case 2: break;
        }
    }
    else
    {
        // ga = c[abd * 2 + 1];

        switch(m_sel.abd)
        {
        case 0: break;
        case 1: movdqa(xmm6, xmm1); break;
        case 2: pxor(xmm6, xmm6); break;
        }
    }

    // xmm4 = src ga
    // xmm5 = rb
    // xmm6 = ga
    // xmm2, xmm3 = used
    // xmm0, xmm1, xmm7 = free

    if(m_sel.pabe)
    {
        if(!m_cpu.has(util::Cpu::tSSE41))
        {
            // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)

            // rebuild the mask from the saved source ga
            movdqa(xmm0, xmm4);
            pslld(xmm0, 8);
            psrad(xmm0, 31);
        }

        psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)

        // ga = c[1].blend8(ga, mask).mix16(c[1]);

        blend8r(xmm6, xmm4);
    }
    else
    {
        if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
        {
            mix16(xmm6, xmm4, xmm7);
        }
    }
}
void GSDrawScanlineCodeGenerator::ColorTFX() { if(!m_sel.fwrite) { return; } switch(m_sel.tfx) { case TFX_MODULATE: // GSVector4i rb = iip ? rbf : m_env.c.rb; // rbt = rbt.modulate16<1>(rb).clamp8(); modulate16<1>(xmm5, xmmword[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]); clamp16(xmm5, xmm1); break; case TFX_DECAL: break; case TFX_HIGHLIGHT: case TFX_HIGHLIGHT2: if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) { // GSVector4i ga = iip ? gaf : m_env.c.ga; movdqa(xmm2, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]); } // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); movdqa(xmm1, xmm6); modulate16<1>(xmm6, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); psrlw(xmm2, 7); paddw(xmm6, xmm2); clamp16(xmm6, xmm0); mix16(xmm6, xmm1, xmm0); // GSVector4i rb = iip ? rbf : m_env.c.rb; // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); modulate16<1>(xmm5, xmmword[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]); paddw(xmm5, xmm2); clamp16(xmm5, xmm0); break; case TFX_NONE: // rbt = iip ? rb.srl16(7) : rb; if(m_sel.iip) { psrlw(xmm5, 7); } break; } }
int main() { int rval, i, co, tmp; //, j; mmx_t ma, mb; //, mm0, mm1, *pm0, *pm1; tmp = calc_cpu_speed(); printf(" Calculating CPU-Speed....running at %d MHz\n", tmp); printf(" Calculating CPU-OFFSET...found : "); co = calc_cpu_ofs(); printf(" %d\n",co); rval = 1; //mmx_ok(); /* Announce return value of mmx_ok() */ printf(" *** Bogus message since we are emulating so MMX does allways exist ***\n"); printf(" Value returned from init was %x.", rval); printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not"); if(rval) { /* PADD *****************************************************/ ma.q = 0x1111111180000000LL; mb.q = 0x7fffffff00000001LL; mmx_regdump(ma); mmx_regdump(mb); paddd( &ma, &mb); printf("paddd: mb.q is %016llx\n", mb.q); mmx_regdump(mb); paddw( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); ma.q = 0x0000000000000008LL; mb.q = 0x0001000200030004LL; mmx_regdump(ma); mmx_regdump(mb); psllw( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); ma.q = 0x0000000000000000LL; mb.q = 0x888044a87f06fe80LL; mmx_regdump(ma); mmx_regdump(mb); // packuswb( &ma,&mb); ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; mmx_regdump(ma); mmx_regdump(mb); packuswb( &ma, &mb); // punpckhdq(&ma,&mb); mmx_regdump(ma); mmx_regdump(mb); /* punpckhdq */ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; mmx_regdump(ma); mmx_regdump(mb); punpckhdq( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); /* punpckhwd */ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; mmx_regdump(ma); mmx_regdump(mb); punpckhwd( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); /* punpckhbw */ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; mmx_regdump(ma); mmx_regdump(mb); punpckhbw( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); /* punpckldq */ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; mmx_regdump(ma); mmx_regdump(mb); punpckldq( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); /* punpcklwd */ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; mmx_regdump(ma); 
mmx_regdump(mb); punpcklwd( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); /* punpcklbw */ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; mmx_regdump(ma); mmx_regdump(mb); punpcklbw( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); /* pmaddwd */ ma.q = 0x8000800080008000LL; mb.q = 0x8000800080008000LL; mmx_regdump(ma); mmx_regdump(mb); pmaddwd( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); /* packsswb */ ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; mmx_regdump(ma); mmx_regdump(mb); packsswb( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); /* packsswb */ ma.q = 0x007e7f00ef9dff88LL; mb.q = 0xff020085007e81cfLL; mmx_regdump(ma); mmx_regdump(mb); packsswb( &ma, &mb); mmx_regdump(ma); mmx_regdump(mb); for (i=0; i< 0x0F; i++){ printf ("%d. Test running\n", i); GET_TSC(tsc1); mb.w[0] = i; ma.w[0] = i; GET_TSC(tsc2); tick_dump(co); GET_TSC(tsc1); /* Let's test some real asm */ asm(" // pushl %eax\n // movl $0, %eax\n // cpuid\n packsswb %mm0, %mm1\n pmaddwd %mm0, %mm1\n punpcklbw %mm0, %mm1\n punpcklbw %mm0, %mm1\n pmaddwd %mm0, %mm1\n punpcklbw %mm0, %mm1\n // popl %eax\n "); GET_TSC(tsc2); tick_dump(co); ma.q = 0x8000800080008000LL; mb.q = 0x8000800080008000LL; GET_TSC(tsc1); /* Here some emulated routines */ pmaddwd( &ma, &mb); // packsswb( &ma, &mb); GET_TSC(tsc2); tick_dump(co); // mmx_regdump(mb); } } exit(0); /* Clean-up and exit nicely */ }