void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) { // xmm0, xmm1, xmm2, xmm3 = free int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; if(wms_clamp == wmt_clamp) { if(wms_clamp) { if(region) { vpmaxsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); } else { vpxor(xmm0, xmm0); vpmaxsw(uv, xmm0); } vpminsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); } else { vpand(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); if(region) { vpor(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); } } } else { vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]); vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]); vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]); // GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max; vpand(xmm1, uv, xmm2); if(region) { vpor(xmm1, xmm3); } // GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max); vpmaxsw(uv, xmm2); vpminsw(uv, xmm3); // clamp.blend8(repeat, m_local.gd->t.mask); vpblendvb(uv, xmm1, xmm0); } }
// Emits a bitwise select into b:
//
//   b = (b & mask) | (a & ~mask)
//
// The mask register is overwritten by the emitted code (it ends up holding
// a & ~mask), so callers must not rely on its contents afterwards.
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
{
#if _M_SSE < 0x500

	// Legacy SSE encodings.
	pand(b, mask);
	pandn(mask, a);
	por(b, mask);

#else

	// AVX encodings, destination repeated as the first source.
	vpand(b, b, mask);
	vpandn(mask, mask, a);
	vpor(b, b, mask);

#endif
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) { if(!m_sel.zb) { return; } // int za = fza_base.y + fza_offset->y; movsxd(rbp, dword[rsi + 4]); movsxd(rax, dword[rdi + 4]); add(rbp, rax); // GSVector4i zs = zi; if(!m_sel.sprite) { if(m_sel.zoverflow) { // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); mov(rax, (size_t)&GSVector4::m_half); vbroadcastss(xmm0, ptr[rax]); vmulps(xmm0, xmm8); vcvttps2dq(xmm0, xmm0); vpslld(xmm0, 1); vcvttps2dq(xmm1, xmm8); vpcmpeqd(xmm2, xmm2); vpsrld(xmm2, 31); vpand(xmm1, xmm2); vpor(xmm0, xmm1); } else { // zs = GSVector4i(z); vcvttps2dq(xmm0, xmm8); } if(m_sel.zwrite) { vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)], xmm0); } } if(m_sel.ztest) { ReadPixel(xmm1, rbp); if(m_sel.zwrite && m_sel.zpsm < 2) { vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm1); } // zd &= 0xffffffff >> m_sel.zpsm * 8; if(m_sel.zpsm) { vpslld(xmm1, m_sel.zpsm * 8); vpsrld(xmm1, m_sel.zpsm * 8); } if(m_sel.zoverflow || m_sel.zpsm == 0) { // GSVector4i off = GSVector4i::x80000000(); vpcmpeqd(xmm2, xmm2); vpslld(xmm2, 31); // GSVector4i zso = zs - off; // GSVector4i zdo = zd - off; vpsubd(xmm0, xmm2); vpsubd(xmm1, xmm2); } switch(m_sel.ztst) { case ZTST_GEQUAL: // test |= zso < zdo; // ~(zso >= zdo) vpcmpgtd(xmm1, xmm0); vpor(xmm15, xmm1); break; case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL // test |= zso <= zdo; // ~(zso > zdo) vpcmpgtd(xmm0, xmm1); vpcmpeqd(xmm2, xmm2); vpxor(xmm0, xmm2); vpor(xmm15, xmm0); break; } alltrue(); } }
void GSDrawScanlineCodeGenerator::WriteFrame() { if(!m_sel.fwrite) { return; } if(m_sel.colclamp == 0) { // c[0] &= 0x00ff00ff; // c[1] &= 0x00ff00ff; vpcmpeqd(xmm15, xmm15); vpsrlw(xmm15, 8); vpand(xmm2, xmm15); vpand(xmm3, xmm15); } if(m_sel.fpsm == 2 && m_sel.dthe) { mov(rax, r8); and(rax, 3); shl(rax, 5); vpaddw(xmm2, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[0])]); vpaddw(xmm3, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[1])]); } // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); vpunpckhwd(xmm15, xmm2, xmm3); vpunpcklwd(xmm2, xmm3); vpackuswb(xmm2, xmm15); if(m_sel.fba && m_sel.fpsm != 1) { // fs |= 0x80000000; vpcmpeqd(xmm15, xmm15); vpslld(xmm15, 31); vpor(xmm2, xmm15); } // xmm2 = fs // xmm4 = fm // xmm6 = fd if(m_sel.fpsm == 2) { // GSVector4i rb = fs & 0x00f800f8; // GSVector4i ga = fs & 0x8000f800; mov(eax, 0x00f800f8); vmovd(xmm0, eax); vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); mov(eax, 0x8000f800); vmovd(xmm1, eax); vpshufd(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); vpand(xmm0, xmm2); vpand(xmm1, xmm2); // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); vpsrld(xmm2, xmm0, 9); vpsrld(xmm0, 3); vpsrld(xmm3, xmm1, 16); vpsrld(xmm1, 6); vpor(xmm0, xmm1); vpor(xmm2, xmm3); vpor(xmm2, xmm0); } if(m_sel.rfb) { // fs = fs.blend(fd, fm); blend(xmm2, xmm6, xmm4); // TODO: could be skipped in certain cases, depending on fpsm and fm } bool fast = m_sel.rfb && m_sel.fpsm < 2; WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0); }
void GSDrawScanlineCodeGenerator::AlphaBlend() { if(!m_sel.fwrite) { return; } if(m_sel.abe == 0 && m_sel.aa1 == 0) { return; } if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) { switch(m_sel.fpsm) { case 0: case 1: // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; vpsllw(xmm0, xmm6, 8); vpsrlw(xmm0, 8); vpsrlw(xmm1, xmm6, 8); break; case 2: // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); vpcmpeqd(xmm15, xmm15); vpsrld(xmm15, 27); // 0x0000001f vpand(xmm0, xmm6, xmm15); vpslld(xmm0, 3); vpslld(xmm15, 10); // 0x00007c00 vpand(xmm5, xmm6, xmm15); vpslld(xmm5, 9); vpor(xmm0, xmm1); vpsrld(xmm15, 5); // 0x000003e0 vpand(xmm1, xmm6, xmm15); vpsrld(xmm1, 2); vpsllw(xmm15, 10); // 0x00008000 vpand(xmm5, xmm6, xmm15); vpslld(xmm5, 8); vpor(xmm1, xmm5); break; } } // xmm2, xmm3 = src rb, ga // xmm0, xmm1 = dst rb, ga // xmm5, xmm15 = free if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) { vmovdqa(xmm5, xmm2); } if(m_sel.aba != m_sel.abb) { // rb = c[aba * 2 + 0]; switch(m_sel.aba) { case 0: break; case 1: vmovdqa(xmm2, xmm0); break; case 2: vpxor(xmm2, xmm2); break; } // rb = rb.sub16(c[abb * 2 + 0]); switch(m_sel.abb) { case 0: vpsubw(xmm2, xmm5); break; case 1: vpsubw(xmm2, xmm0); break; case 2: break; } if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) { // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; switch(m_sel.abc) { case 0: case 1: vpshuflw(xmm15, m_sel.abc ? 
xmm1 : xmm3, _MM_SHUFFLE(3, 3, 1, 1)); vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1)); vpsllw(xmm15, 7); break; case 2: vmovdqa(xmm15, ptr[r12 + offsetof(GSScanlineGlobalData, afix)]); break; } // rb = rb.modulate16<1>(a); modulate16(xmm2, xmm15, 1); } // rb = rb.add16(c[abd * 2 + 0]); switch(m_sel.abd) { case 0: vpaddw(xmm2, xmm5); break; case 1: vpaddw(xmm2, xmm0); break; case 2: break; } } else { // rb = c[abd * 2 + 0]; switch(m_sel.abd) { case 0: break; case 1: vmovdqa(xmm2, xmm0); break; case 2: vpxor(xmm2, xmm2); break; } } if(m_sel.pabe) { // mask = (c[1] << 8).sra32(31); vpslld(xmm0, xmm3, 8); vpsrad(xmm0, 31); // rb = c[0].blend8(rb, mask); vpblendvb(xmm2, xmm5, xmm2, xmm0); } // xmm0 = pabe mask // xmm3 = src ga // xmm1 = dst ga // xmm2 = rb // xmm15 = a // xmm5 = free vmovdqa(xmm5, xmm3); if(m_sel.aba != m_sel.abb) { // ga = c[aba * 2 + 1]; switch(m_sel.aba) { case 0: break; case 1: vmovdqa(xmm3, xmm1); break; case 2: vpxor(xmm3, xmm3); break; } // ga = ga.sub16(c[abeb * 2 + 1]); switch(m_sel.abb) { case 0: vpsubw(xmm3, xmm5); break; case 1: vpsubw(xmm3, xmm1); break; case 2: break; } if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) { // ga = ga.modulate16<1>(a); modulate16(xmm3, xmm15, 1); } // ga = ga.add16(c[abd * 2 + 1]); switch(m_sel.abd) { case 0: vpaddw(xmm3, xmm5); break; case 1: vpaddw(xmm3, xmm1); break; case 2: break; } } else { // ga = c[abd * 2 + 1]; switch(m_sel.abd) { case 0: break; case 1: vmovdqa(xmm3, xmm1); break; case 2: vpxor(xmm3, xmm3); break; } } // xmm0 = pabe mask // xmm5 = src ga // xmm2 = rb // xmm3 = ga // xmm1, xmm15 = free if(m_sel.pabe) { vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) // ga = c[1].blend8(ga, mask).mix16(c[1]); vpblendvb(xmm3, xmm5, xmm3, xmm0); } else { if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx { mix16(xmm3, xmm5, xmm15); } } }
// Ymm-register overload of blendr. Emits a bitwise select into b:
//
//   b = (b & mask) | (a & ~mask)
//
// The mask register is overwritten by the emitted code (it ends up holding
// a & ~mask), so callers must not rely on its contents afterwards.
void GSDrawScanlineCodeGenerator::blendr(const Ymm& b, const Ymm& a, const Ymm& mask)
{
	vpand(b, b, mask);      // b    = b & mask
	vpandn(mask, mask, a);  // mask = ~mask & a
	vpor(b, b, mask);       // b    = b | mask
}