void GPUDrawScanlineCodeGenerator::ColorTFX() { switch(m_sel.tfx) { case 0: // none (tfx = 0) case 1: // none (tfx = tge) // c[0] = r.srl16(7); // c[1] = g.srl16(7); // c[2] = b.srl16(7); psrlw(xmm4, 7); psrlw(xmm5, 7); psrlw(xmm6, 7); break; case 2: // modulate (tfx = tme | tge) // c[0] = c[0].modulate16<1>(r).clamp8(); // c[1] = c[1].modulate16<1>(g).clamp8(); // c[2] = c[2].modulate16<1>(b).clamp8(); pcmpeqd(xmm0, xmm0); psrlw(xmm0, 8); modulate16<1>(xmm4, ptr[&m_local.temp.r]); pminsw(xmm4, xmm0); modulate16<1>(xmm5, ptr[&m_local.temp.g]); pminsw(xmm5, xmm0); modulate16<1>(xmm6, ptr[&m_local.temp.b]); pminsw(xmm6, xmm0); break; case 3: // decal (tfx = tme) break; } }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; if(wms_clamp == wmt_clamp) { if(wms_clamp) { if(region) { movdqa(xmm4, xmmword[&m_env.t.min]); pmaxsw(uv0, xmm4); pmaxsw(uv1, xmm4); } else { pxor(xmm0, xmm0); pmaxsw(uv0, xmm0); pmaxsw(uv1, xmm0); } movdqa(xmm5, xmmword[&m_env.t.max]); pminsw(uv0, xmm5); pminsw(uv1, xmm5); } else { movdqa(xmm4, xmmword[&m_env.t.min]); pand(uv0, xmm4); pand(uv1, xmm4); if(region) { movdqa(xmm5, xmmword[&m_env.t.max]); por(uv0, xmm5); por(uv1, xmm5); } } } else { movdqa(xmm1, uv0); movdqa(xmm6, uv1); movdqa(xmm4, xmmword[&m_env.t.min]); movdqa(xmm5, xmmword[&m_env.t.max]); // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max); pmaxsw(uv0, xmm4); pmaxsw(uv1, xmm4); pminsw(uv0, xmm5); pminsw(uv1, xmm5); // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max; pand(xmm1, xmm4); pand(xmm6, xmm4); if(region) { por(xmm1, xmm5); por(xmm6, xmm5); } // clamp.blend8(repeat, m_env.t.mask); if(m_cpu.has(util::Cpu::tSSE41)) { movdqa(xmm0, xmmword[&m_env.t.mask]); pblendvb(uv0, xmm1); pblendvb(uv1, xmm6); } else { movdqa(xmm0, xmmword[&m_env.t.invmask]); movdqa(xmm4, xmm0); pand(uv0, xmm0); pandn(xmm0, xmm1); por(uv0, xmm0); pand(uv1, xmm4); pandn(xmm4, xmm6); por(uv1, xmm4); } } }
// Emits the texture-sampling stage of the scanline routine.
//
// Nothing is emitted when texturing is disabled (m_sel.tme == 0).
//
// Register contract of the emitted code, as used below:
//   in : xmm2 = s, xmm3 = t (16-bit lanes, integer part in the high byte),
//        xmm7 = test mask, xmm1 = in use by the caller
//   out: xmm4 = r, xmm5 = g, xmm6 = b (5-bit channels placed in bits 3..7 of
//        each 16-bit lane), xmm3 = a (per-lane mask), xmm7 = test mask with
//        blank (all-zero) texels ORed in
//
// Two variants are generated depending on m_sel.ltf:
//   ltf != 0 : four texels are fetched (u0/u1 x v0/v1) and blended with the
//              fractional weights kept in m_local.temp.uf / m_local.temp.vf
//   ltf == 0 : a single texel is fetched and unpacked
//
// m_sel.twin selects how coordinates are confined: AND with m_local.twin[0]
// plus add of m_local.twin[1] when set, otherwise a signed min against
// m_local.twin[2].
//
// NOTE(review): ReadTexel() and lerp16<>() are defined elsewhere; from their
// use here ReadTexel(dst, addr) appears to leave the fetched texel in dst and
// lerp16<0>(a, b, f) appears to compute a = b + (a - b) * f - confirm.
void GPUDrawScanlineCodeGenerator::SampleTexture()
{
	if(!m_sel.tme)
	{
		return;
	}

	if(m_sel.tlu)
	{
		// load the clut pointer into edx (presumably consumed by ReadTexel - confirm)

		mov(edx, ptr[&m_local.gd->clut]);
	}

	// xmm2 = s
	// xmm3 = t
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f
		// GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f

		// broadcast 0x0020 to every 16-bit lane of xmm0

		mov(eax, 0x00200020);
		movd(xmm0, eax);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));

		psubw(xmm2, xmm0);
		psubw(xmm3, xmm0);

		// GSVector4i uf = (u & GSVector4i::x00ff()) << 7;
		// GSVector4i vf = (v & GSVector4i::x00ff()) << 7;

		// "<< 8, >> 1" drops the high byte and leaves the low byte shifted left by 7

		movdqa(xmm0, xmm2);
		psllw(xmm0, 8);
		psrlw(xmm0, 1);
		movdqa(ptr[&m_local.temp.uf], xmm0);

		// vf is only written here for non-sprites; the blends below still read
		// m_local.temp.vf for sprites (presumably initialised elsewhere - confirm)

		if(!m_sel.sprite)
		{
			movdqa(xmm0, xmm3);
			psllw(xmm0, 8);
			psrlw(xmm0, 1);
			movdqa(ptr[&m_local.temp.vf], xmm0);
		}
	}

	// GSVector4i u0 = s.srl16(8);
	// GSVector4i v0 = t.srl16(8);

	psrlw(xmm2, 8);
	psrlw(xmm3, 8);

	// xmm2 = u
	// xmm3 = v
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u1 = u0.add16(GSVector4i::x0001());
		// GSVector4i v1 = v0.add16(GSVector4i::x0001());

		movdqa(xmm4, xmm2);
		movdqa(xmm5, xmm3);

		// xmm0 = 0x0001 in every 16-bit lane

		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 15);
		paddw(xmm4, xmm0);
		paddw(xmm5, xmm0);

		if(m_sel.twin)
		{
			// u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
			// u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);

			movdqa(xmm0, ptr[&m_local.twin[0].u]);
			movdqa(xmm6, ptr[&m_local.twin[1].u]);

			pand(xmm2, xmm0);
			paddw(xmm2, xmm6);
			pand(xmm4, xmm0);
			paddw(xmm4, xmm6);

			movdqa(xmm0, ptr[&m_local.twin[0].v]);
			movdqa(xmm6, ptr[&m_local.twin[1].v]);

			pand(xmm3, xmm0);
			paddw(xmm3, xmm6);
			pand(xmm5, xmm0);
			paddw(xmm5, xmm6);
		}
		else
		{
			// u0 = u0.min_i16(m_local.twin[2].u);
			// v0 = v0.min_i16(m_local.twin[2].v);
			// u1 = u1.min_i16(m_local.twin[2].u);
			// v1 = v1.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			movdqa(xmm0, ptr[&m_local.twin[2].u]);
			movdqa(xmm6, ptr[&m_local.twin[2].v]);

			pminsw(xmm2, xmm0);
			pminsw(xmm3, xmm6);
			pminsw(xmm4, xmm0);
			pminsw(xmm5, xmm6);
		}

		// xmm2 = u0
		// xmm3 = v0
		// xmm4 = u1
		// xmm5 = v1
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// GSVector4i addr00 = v0.sll16(8) | u0;
		// GSVector4i addr01 = v0.sll16(8) | u1;
		// GSVector4i addr10 = v1.sll16(8) | u0;
		// GSVector4i addr11 = v1.sll16(8) | u1;

		psllw(xmm3, 8);
		movdqa(xmm0, xmm3);
		por(xmm3, xmm2);
		por(xmm0, xmm4);

		psllw(xmm5, 8);
		movdqa(xmm6, xmm5);
		por(xmm5, xmm2);
		por(xmm6, xmm4);

		// xmm3 = addr00
		// xmm0 = addr01
		// xmm5 = addr10
		// xmm6 = addr11
		// xmm7 = test
		// xmm2, xmm4 = free
		// xmm1 = used

		// the destination of each fetch is a register whose address value is no longer needed

		ReadTexel(xmm2, xmm3);
		ReadTexel(xmm4, xmm0);
		ReadTexel(xmm3, xmm5);
		ReadTexel(xmm5, xmm6);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// spill (TODO)

		// xmm1 and xmm7 are saved so they can serve as scratch during the blend;
		// both are reloaded before this branch ends

		movdqa(ptr[&m_local.temp.fd], xmm1);
		movdqa(ptr[&m_local.temp.test], xmm7);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm1, xmm6, xmm7 = free

		// top row (c00, c01), blended horizontally with uf

		// red: bits 0..4 moved to bits 3..7

		movdqa(xmm1, xmm2);
		psllw(xmm1, 11);
		psrlw(xmm1, 8);

		movdqa(xmm0, xmm4);
		psllw(xmm0, 11);
		psrlw(xmm0, 8);

		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]);

		// green: bits 5..9 moved to bits 3..7

		movdqa(xmm6, xmm2);
		psllw(xmm6, 6);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		movdqa(xmm1, xmm4);
		psllw(xmm1, 6);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]);

		// blue: bits 10..14 moved to bits 3..7

		movdqa(xmm7, xmm2);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm6, xmm4);
		psllw(xmm6, 1);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]);

		// alpha: bit 15 expanded to 0x00ff / 0x0000

		psraw(xmm2, 15);
		psrlw(xmm2, 8);
		psraw(xmm4, 15);
		psrlw(xmm4, 8);

		lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]);

		// the "00" values below are the horizontally blended top row

		// xmm0 = r00
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm2, xmm7 = free

		// bottom row (c10, c11): blend horizontally with uf, then against the top row with vf

		movdqa(xmm7, xmm3);
		psllw(xmm7, 11);
		psrlw(xmm7, 8);

		movdqa(xmm2, xmm5);
		psllw(xmm2, 11);
		psrlw(xmm2, 8);

		lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm7 = free

		movdqa(xmm7, xmm3);
		psllw(xmm7, 6);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm0, xmm5);
		psllw(xmm0, 6);
		psrlw(xmm0, 11);
		psllw(xmm0, 3);

		lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm1, xmm7 = free

		movdqa(xmm7, xmm3);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm1, xmm5);
		psllw(xmm1, 1);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm6, xmm7 = free

		psraw(xmm3, 15);
		psrlw(xmm3, 8);
		psraw(xmm5, 15);
		psrlw(xmm5, 8);

		lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm5 = a
		// xmm3, xmm4, xmm6, xmm7 = free

		// TODO

		// move the results into the output registers; a is copied out of xmm5
		// first because xmm5 is overwritten with g by the last move

		movdqa(xmm3, xmm5); // a
		movdqa(xmm4, xmm2); // r
		movdqa(xmm6, xmm1); // b
		movdqa(xmm5, xmm0); // g

		// reload test

		movdqa(xmm7, ptr[&m_local.temp.test]);

		// xmm4 = r
		// xmm5 = g
		// xmm6 = b
		// xmm3 = a
		// xmm7 = test
		// xmm0, xmm1, xmm2 = free

		// test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect)

		movdqa(xmm1, xmm3);
		por(xmm1, xmm4);
		movdqa(xmm2, xmm5);
		por(xmm2, xmm6);
		por(xmm1, xmm2);

		pxor(xmm0, xmm0);
		pcmpeqw(xmm1, xmm0);
		por(xmm7, xmm1);

		// a = a.gt16(GSVector4i::zero());

		pcmpgtw(xmm3, xmm0);

		// reload fd

		movdqa(xmm1, ptr[&m_local.temp.fd]);
	}
	else
	{
		if(m_sel.twin)
		{
			// u = (u & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v = (v & m_local.twin[0].v).add16(m_local.twin[1].v);

			pand(xmm2, ptr[&m_local.twin[0].u]);
			paddw(xmm2, ptr[&m_local.twin[1].u]);
			pand(xmm3, ptr[&m_local.twin[0].v]);
			paddw(xmm3, ptr[&m_local.twin[1].v]);
		}
		else
		{
			// u = u.min_i16(m_local.twin[2].u);
			// v = v.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			pminsw(xmm2, ptr[&m_local.twin[2].u]);
			pminsw(xmm3, ptr[&m_local.twin[2].v]);
		}

		// xmm2 = u
		// xmm3 = v
		// xmm7 = test
		// xmm0, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		// GSVector4i addr = v.sll16(8) | u;

		psllw(xmm3, 8);
		por(xmm3, xmm2);

		// xmm3 = addr
		// xmm7 = test
		// xmm0, xmm2, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		ReadTexel(xmm6, xmm3);

		// xmm6 = c00 (the texel is fetched into xmm6; xmm3 is overwritten with a below)
		// xmm7 = test
		// xmm0, xmm2, xmm4, xmm5 = free
		// xmm1 = used

		// test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels

		pxor(xmm0, xmm0);
		pcmpeqw(xmm0, xmm6);
		por(xmm7, xmm0);

		// c[0] = (c00 << 3) & 0x00f800f8;
		// c[1] = (c00 >> 2) & 0x00f800f8;
		// c[2] = (c00 >> 7) & 0x00f800f8;
		// c[3] = c00.sra16(15);

		movdqa(xmm3, xmm6);
		psraw(xmm3, 15); // a

		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 11);
		psllw(xmm0, 3); // 0x00f8

		movdqa(xmm4, xmm6);
		psllw(xmm4, 3);
		pand(xmm4, xmm0); // r

		movdqa(xmm5, xmm6);
		psrlw(xmm5, 2);
		pand(xmm5, xmm0); // g

		// xmm6 is converted in place, so b has to be extracted last

		psrlw(xmm6, 7);
		pand(xmm6, xmm0); // b
	}
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; if(wms_clamp == wmt_clamp) { if(wms_clamp) { if(region) { pmaxsw(uv, xmmword[&m_env.t.min]); } else { pxor(xmm0, xmm0); pmaxsw(uv, xmm0); } pminsw(uv, xmmword[&m_env.t.max]); } else { pand(uv, xmmword[&m_env.t.min]); if(region) { por(uv, xmmword[&m_env.t.max]); } } } else { movdqa(xmm1, uv); movdqa(xmm4, xmmword[&m_env.t.min]); movdqa(xmm5, xmmword[&m_env.t.max]); // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max); pmaxsw(uv, xmm4); pminsw(uv, xmm5); // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max; pand(xmm1, xmm4); if(region) { por(xmm1, xmm5); } // clamp.blend8(repeat, m_env.t.mask); movdqa(xmm0, xmmword[&m_env.t.mask]); blend8(uv, xmm1); } }