void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift) { #if _M_SSE >= 0x500 if(shift == 0) { vpmulhrsw(a, f); } else { vpsllw(a, shift + 1); vpmulhw(a, f); } #else if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3)) { pmulhrsw(a, f); } else { psllw(a, shift + 1); pmulhw(a, f); } #endif }
void GSDrawScanlineCodeGenerator::modulate16(const Ymm& a, const Operand& f, int shift) { if(shift == 0) { vpmulhrsw(a, f); } else { vpsllw(a, (uint8)(shift + 1)); vpmulhw(a, f); } }
void GSDrawScanlineCodeGenerator::SampleTexture() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { return; } mov(rbx, ptr[r12 + offsetof(GSScanlineGlobalData, tex)]); // ebx = tex if(!m_sel.fst) { vrcpps(xmm0, xmm12); vmulps(xmm4, xmm10, xmm0); vmulps(xmm5, xmm11, xmm0); vcvttps2dq(xmm4, xmm4); vcvttps2dq(xmm5, xmm5); if(m_sel.ltf) { // u -= 0x8000; // v -= 0x8000; mov(eax, 0x8000); vmovd(xmm0, eax); vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpsubd(xmm4, xmm0); vpsubd(xmm5, xmm0); } } else { vmovdqa(xmm4, xmm10); vmovdqa(xmm5, xmm11); } if(m_sel.ltf) { // GSVector4i uf = u.xxzzlh().srl16(1); vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); vpsrlw(xmm6, 1); if(!m_sel.sprite) { // GSVector4i vf = v.xxzzlh().srl16(1); vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); vpsrlw(xmm7, 1); } } // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); vpsrad(xmm4, 16); vpsrad(xmm5, 16); vpackssdw(xmm4, xmm5); if(m_sel.ltf) { // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); vpcmpeqd(xmm0, xmm0); vpsrlw(xmm0, 15); vpaddw(xmm5, xmm4, xmm0); // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); Wrap(xmm4, xmm5); } else { // uv0 = Wrap(uv0); Wrap(xmm4); } // xmm4 = uv0 // xmm5 = uv1 (ltf) // xmm6 = uf // xmm7 = vf // GSVector4i x0 = uv0.upl16(); // GSVector4i y0 = uv0.uph16() << tw; vpxor(xmm0, xmm0); vpunpcklwd(xmm2, xmm4, xmm0); vpunpckhwd(xmm3, xmm4, xmm0); vpslld(xmm3, m_sel.tw + 3); // xmm0 = 0 // xmm2 = x0 // xmm3 = y0 // xmm5 = uv1 (ltf) // xmm6 = uf // xmm7 = vf if(m_sel.ltf) { // GSVector4i x1 = uv1.upl16(); // GSVector4i y1 = uv1.uph16() << tw; vpunpcklwd(xmm4, xmm5, xmm0); vpunpckhwd(xmm5, xmm5, xmm0); vpslld(xmm5, m_sel.tw + 3); // xmm2 = x0 // xmm3 = y0 // xmm4 = x1 // xmm5 = y1 // xmm6 = uf // xmm7 = vf // GSVector4i addr00 = y0 + x0; // GSVector4i addr01 = y0 + x1; // GSVector4i addr10 = y1 + x0; // GSVector4i addr11 = y1 + x1; vpaddd(xmm0, xmm3, xmm2); vpaddd(xmm1, xmm3, xmm4); vpaddd(xmm2, xmm5, xmm2); vpaddd(xmm3, xmm5, xmm4); // xmm0 = addr00 // xmm1 = addr01 // xmm2 = addr10 // xmm3 = addr11 // xmm6 = uf // xmm7 = vf // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(4, 0); // xmm0 = c00 // xmm1 = c01 // xmm2 = c10 // xmm3 = c11 // xmm6 = uf // xmm7 = vf // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; vpsllw(xmm4, xmm0, 8); vpsrlw(xmm4, 8); vpsrlw(xmm5, xmm0, 8); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; vpsllw(xmm0, xmm1, 8); vpsrlw(xmm0, 8); vpsrlw(xmm1, 8); // xmm0 = rb01 // xmm1 = ga01 // xmm2 = c10 // xmm3 = c11 // xmm4 = rb00 // xmm5 = ga00 // xmm6 = uf // xmm7 = vf // rb00 = rb00.lerp16<0>(rb01, uf); // ga00 = ga00.lerp16<0>(ga01, uf); lerp16(xmm0, xmm4, xmm6, 0); lerp16(xmm1, xmm5, xmm6, 0); // xmm0 = rb00 // xmm1 = ga00 // xmm2 = c10 // xmm3 = c11 // xmm6 = uf // xmm7 = vf // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; vpsrlw(xmm5, xmm2, 8); vpsllw(xmm2, 8); vpsrlw(xmm4, xmm2, 8); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; vpsrlw(xmm2, xmm3, 8); vpsllw(xmm3, 8); vpsrlw(xmm3, 8); // xmm0 = rb00 // xmm1 = ga00 // xmm2 = rb11 // xmm3 = ga11 // xmm4 = rb10 // xmm5 = ga10 // xmm6 = uf // xmm7 = vf // rb10 = rb10.lerp16<0>(rb11, uf); // ga10 = ga10.lerp16<0>(ga11, uf); lerp16(xmm2, xmm4, xmm6, 0); lerp16(xmm3, xmm5, xmm6, 0); // xmm0 = rb00 // xmm1 = ga00 // xmm2 = rb10 // xmm3 = ga10 // xmm7 = vf // rb00 = rb00.lerp16<0>(rb10, vf); // ga00 = ga00.lerp16<0>(ga10, vf); lerp16(xmm2, xmm0, xmm7, 0); lerp16(xmm3, xmm1, xmm7, 0); } else { // GSVector4i addr00 = y0 + x0; vpaddd(xmm3, xmm2); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(1, 0); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; vpsrlw(xmm3, xmm2, 8); vpsllw(xmm2, 8); vpsrlw(xmm2, 8); } // xmm2 = rb // xmm3 = ga }
void GSDrawScanlineCodeGenerator::AlphaBlend() { if(!m_sel.fwrite) { return; } if(m_sel.abe == 0 && m_sel.aa1 == 0) { return; } if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) { switch(m_sel.fpsm) { case 0: case 1: // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; vpsllw(xmm0, xmm6, 8); vpsrlw(xmm0, 8); vpsrlw(xmm1, xmm6, 8); break; case 2: // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); vpcmpeqd(xmm15, xmm15); vpsrld(xmm15, 27); // 0x0000001f vpand(xmm0, xmm6, xmm15); vpslld(xmm0, 3); vpslld(xmm15, 10); // 0x00007c00 vpand(xmm5, xmm6, xmm15); vpslld(xmm5, 9); vpor(xmm0, xmm1); vpsrld(xmm15, 5); // 0x000003e0 vpand(xmm1, xmm6, xmm15); vpsrld(xmm1, 2); vpsllw(xmm15, 10); // 0x00008000 vpand(xmm5, xmm6, xmm15); vpslld(xmm5, 8); vpor(xmm1, xmm5); break; } } // xmm2, xmm3 = src rb, ga // xmm0, xmm1 = dst rb, ga // xmm5, xmm15 = free if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) { vmovdqa(xmm5, xmm2); } if(m_sel.aba != m_sel.abb) { // rb = c[aba * 2 + 0]; switch(m_sel.aba) { case 0: break; case 1: vmovdqa(xmm2, xmm0); break; case 2: vpxor(xmm2, xmm2); break; } // rb = rb.sub16(c[abb * 2 + 0]); switch(m_sel.abb) { case 0: vpsubw(xmm2, xmm5); break; case 1: vpsubw(xmm2, xmm0); break; case 2: break; } if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) { // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix; switch(m_sel.abc) { case 0: case 1: vpshuflw(xmm15, m_sel.abc ? xmm1 : xmm3, _MM_SHUFFLE(3, 3, 1, 1)); vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1)); vpsllw(xmm15, 7); break; case 2: vmovdqa(xmm15, ptr[r12 + offsetof(GSScanlineGlobalData, afix)]); break; } // rb = rb.modulate16<1>(a); modulate16(xmm2, xmm15, 1); } // rb = rb.add16(c[abd * 2 + 0]); switch(m_sel.abd) { case 0: vpaddw(xmm2, xmm5); break; case 1: vpaddw(xmm2, xmm0); break; case 2: break; } } else { // rb = c[abd * 2 + 0]; switch(m_sel.abd) { case 0: break; case 1: vmovdqa(xmm2, xmm0); break; case 2: vpxor(xmm2, xmm2); break; } } if(m_sel.pabe) { // mask = (c[1] << 8).sra32(31); vpslld(xmm0, xmm3, 8); vpsrad(xmm0, 31); // rb = c[0].blend8(rb, mask); vpblendvb(xmm2, xmm5, xmm2, xmm0); } // xmm0 = pabe mask // xmm3 = src ga // xmm1 = dst ga // xmm2 = rb // xmm15 = a // xmm5 = free vmovdqa(xmm5, xmm3); if(m_sel.aba != m_sel.abb) { // ga = c[aba * 2 + 1]; switch(m_sel.aba) { case 0: break; case 1: vmovdqa(xmm3, xmm1); break; case 2: vpxor(xmm3, xmm3); break; } // ga = ga.sub16(c[abeb * 2 + 1]); switch(m_sel.abb) { case 0: vpsubw(xmm3, xmm5); break; case 1: vpsubw(xmm3, xmm1); break; case 2: break; } if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) { // ga = ga.modulate16<1>(a); modulate16(xmm3, xmm15, 1); } // ga = ga.add16(c[abd * 2 + 1]); switch(m_sel.abd) { case 0: vpaddw(xmm3, xmm5); break; case 1: vpaddw(xmm3, xmm1); break; case 2: break; } } else { // ga = c[abd * 2 + 1]; switch(m_sel.abd) { case 0: break; case 1: vmovdqa(xmm3, xmm1); break; case 2: vpxor(xmm3, xmm3); break; } } // xmm0 = pabe mask // xmm5 = src ga // xmm2 = rb // xmm3 = ga // xmm1, xmm15 = free if(m_sel.pabe) { vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) // ga = c[1].blend8(ga, mask).mix16(c[1]); vpblendvb(xmm3, xmm5, xmm3, xmm0); } else { if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx { mix16(xmm3, xmm5, xmm15); } } }