void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) { if(m_cpu.has(util::Cpu::tSSE41)) { ReadTexel(dst, addr, 0); ReadTexel(dst, addr, 1); ReadTexel(dst, addr, 2); ReadTexel(dst, addr, 3); } else { ReadTexel(dst, addr, 0); psrldq(addr, 4); // shuffle instead? (1 2 3 0 ~ rotation) ReadTexel(temp1, addr, 0); psrldq(addr, 4); punpckldq(dst, temp1); ReadTexel(temp1, addr, 0); psrldq(addr, 4); ReadTexel(temp2, addr, 0); // psrldq(addr, 4); punpckldq(temp1, temp2); punpcklqdq(dst, temp1); } }
void GSDrawScanlineCodeGenerator::SampleTexture() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { return; } mov(rbx, ptr[r12 + offsetof(GSScanlineGlobalData, tex)]); // ebx = tex if(!m_sel.fst) { vrcpps(xmm0, xmm12); vmulps(xmm4, xmm10, xmm0); vmulps(xmm5, xmm11, xmm0); vcvttps2dq(xmm4, xmm4); vcvttps2dq(xmm5, xmm5); if(m_sel.ltf) { // u -= 0x8000; // v -= 0x8000; mov(eax, 0x8000); vmovd(xmm0, eax); vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpsubd(xmm4, xmm0); vpsubd(xmm5, xmm0); } } else { vmovdqa(xmm4, xmm10); vmovdqa(xmm5, xmm11); } if(m_sel.ltf) { // GSVector4i uf = u.xxzzlh().srl16(1); vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); vpsrlw(xmm6, 1); if(!m_sel.sprite) { // GSVector4i vf = v.xxzzlh().srl16(1); vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0)); vpsrlw(xmm7, 1); } } // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); vpsrad(xmm4, 16); vpsrad(xmm5, 16); vpackssdw(xmm4, xmm5); if(m_sel.ltf) { // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); vpcmpeqd(xmm0, xmm0); vpsrlw(xmm0, 15); vpaddw(xmm5, xmm4, xmm0); // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); Wrap(xmm4, xmm5); } else { // uv0 = Wrap(uv0); Wrap(xmm4); } // xmm4 = uv0 // xmm5 = uv1 (ltf) // xmm6 = uf // xmm7 = vf // GSVector4i x0 = uv0.upl16(); // GSVector4i y0 = uv0.uph16() << tw; vpxor(xmm0, xmm0); vpunpcklwd(xmm2, xmm4, xmm0); vpunpckhwd(xmm3, xmm4, xmm0); vpslld(xmm3, m_sel.tw + 3); // xmm0 = 0 // xmm2 = x0 // xmm3 = y0 // xmm5 = uv1 (ltf) // xmm6 = uf // xmm7 = vf if(m_sel.ltf) { // GSVector4i x1 = uv1.upl16(); // GSVector4i y1 = uv1.uph16() << tw; vpunpcklwd(xmm4, xmm5, xmm0); vpunpckhwd(xmm5, xmm5, xmm0); vpslld(xmm5, m_sel.tw + 3); // xmm2 = x0 // xmm3 = y0 // xmm4 = x1 // xmm5 = y1 // xmm6 = uf // xmm7 = vf // GSVector4i addr00 = y0 + x0; // GSVector4i addr01 = y0 + x1; // GSVector4i addr10 = y1 + x0; // GSVector4i addr11 = y1 + x1; vpaddd(xmm0, xmm3, xmm2); vpaddd(xmm1, xmm3, xmm4); vpaddd(xmm2, xmm5, xmm2); vpaddd(xmm3, xmm5, xmm4); // xmm0 = addr00 // xmm1 = addr01 // xmm2 = addr10 // xmm3 = addr11 // xmm6 = uf // xmm7 = vf // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(4, 0); // xmm0 = c00 // xmm1 = c01 // xmm2 = c10 // xmm3 = c11 // xmm6 = uf // xmm7 = vf // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; vpsllw(xmm4, xmm0, 8); vpsrlw(xmm4, 8); vpsrlw(xmm5, xmm0, 8); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; vpsllw(xmm0, xmm1, 8); vpsrlw(xmm0, 8); vpsrlw(xmm1, 8); // xmm0 = rb01 // xmm1 = ga01 // xmm2 = c10 // xmm3 = c11 // xmm4 = rb00 // xmm5 = ga00 // xmm6 = uf // xmm7 = vf // rb00 = rb00.lerp16<0>(rb01, uf); // ga00 = ga00.lerp16<0>(ga01, uf); lerp16(xmm0, xmm4, xmm6, 0); lerp16(xmm1, xmm5, xmm6, 0); // xmm0 = rb00 // xmm1 = ga00 // xmm2 = c10 // xmm3 = c11 // xmm6 = uf // xmm7 = vf // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; vpsrlw(xmm5, xmm2, 8); vpsllw(xmm2, 8); vpsrlw(xmm4, xmm2, 8); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; vpsrlw(xmm2, xmm3, 8); vpsllw(xmm3, 8); vpsrlw(xmm3, 8); // xmm0 = rb00 // xmm1 = ga00 // xmm2 = rb11 // xmm3 = ga11 // xmm4 = rb10 // xmm5 = ga10 // xmm6 = uf // xmm7 = vf // rb10 = rb10.lerp16<0>(rb11, uf); // ga10 = ga10.lerp16<0>(ga11, uf); lerp16(xmm2, xmm4, xmm6, 0); lerp16(xmm3, xmm5, xmm6, 0); // xmm0 = rb00 // xmm1 = ga00 // xmm2 = rb10 // xmm3 = ga10 // xmm7 = vf // rb00 = rb00.lerp16<0>(rb10, vf); // ga00 = ga00.lerp16<0>(ga10, vf); lerp16(xmm2, xmm0, xmm7, 0); lerp16(xmm3, xmm1, xmm7, 0); } else { // GSVector4i addr00 = y0 + x0; vpaddd(xmm3, xmm2); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(1, 0); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; vpsrlw(xmm3, xmm2, 8); vpsllw(xmm2, 8); vpsrlw(xmm2, 8); } // xmm2 = rb // xmm3 = ga }
void GPUDrawScanlineCodeGenerator::SampleTexture() { if(!m_sel.tme) { return; } if(m_sel.tlu) { mov(edx, ptr[&m_local.gd->clut]); } // xmm2 = s // xmm3 = t // xmm7 = test // xmm0, xmm4, xmm5, xmm6 = free // xmm1 = used if(m_sel.ltf) { // GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f // GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f mov(eax, 0x00200020); movd(xmm0, eax); pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); psubw(xmm2, xmm0); psubw(xmm3, xmm0); // GSVector4i uf = (u & GSVector4i::x00ff()) << 7; // GSVector4i vf = (v & GSVector4i::x00ff()) << 7; movdqa(xmm0, xmm2); psllw(xmm0, 8); psrlw(xmm0, 1); movdqa(ptr[&m_local.temp.uf], xmm0); if(!m_sel.sprite) { movdqa(xmm0, xmm3); psllw(xmm0, 8); psrlw(xmm0, 1); movdqa(ptr[&m_local.temp.vf], xmm0); } } // GSVector4i u0 = s.srl16(8); // GSVector4i v0 = t.srl16(8); psrlw(xmm2, 8); psrlw(xmm3, 8); // xmm2 = u // xmm3 = v // xmm7 = test // xmm0, xmm4, xmm5, xmm6 = free // xmm1 = used if(m_sel.ltf) { // GSVector4i u1 = u0.add16(GSVector4i::x0001()); // GSVector4i v1 = v0.add16(GSVector4i::x0001()); movdqa(xmm4, xmm2); movdqa(xmm5, xmm3); pcmpeqd(xmm0, xmm0); psrlw(xmm0, 15); paddw(xmm4, xmm0); paddw(xmm5, xmm0); if(m_sel.twin) { // u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u); // v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v); // u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u); // v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v); movdqa(xmm0, ptr[&m_local.twin[0].u]); movdqa(xmm6, ptr[&m_local.twin[1].u]); pand(xmm2, xmm0); paddw(xmm2, xmm6); pand(xmm4, xmm0); paddw(xmm4, xmm6); movdqa(xmm0, ptr[&m_local.twin[0].v]); movdqa(xmm6, ptr[&m_local.twin[1].v]); pand(xmm3, xmm0); paddw(xmm3, xmm6); pand(xmm5, xmm0); paddw(xmm5, xmm6); } else { // u0 = u0.min_i16(m_local.twin[2].u); // v0 = v0.min_i16(m_local.twin[2].v); // u1 = u1.min_i16(m_local.twin[2].u); // v1 = v1.min_i16(m_local.twin[2].v); // TODO: if(!sprite) clamp16 else: movdqa(xmm0, ptr[&m_local.twin[2].u]); movdqa(xmm6, ptr[&m_local.twin[2].v]); pminsw(xmm2, xmm0); pminsw(xmm3, xmm6); pminsw(xmm4, xmm0); pminsw(xmm5, xmm6); } // xmm2 = u0 // xmm3 = v0 // xmm4 = u1 // xmm5 = v1 // xmm7 = test // xmm0, xmm6 = free // xmm1 = used // GSVector4i addr00 = v0.sll16(8) | u0; // GSVector4i addr01 = v0.sll16(8) | u1; // GSVector4i addr10 = v1.sll16(8) | u0; // GSVector4i addr11 = v1.sll16(8) | u1; psllw(xmm3, 8); movdqa(xmm0, xmm3); por(xmm3, xmm2); por(xmm0, xmm4); psllw(xmm5, 8); movdqa(xmm6, xmm5); por(xmm5, xmm2); por(xmm6, xmm4); // xmm3 = addr00 // xmm0 = addr01 // xmm5 = addr10 // xmm6 = addr11 // xmm7 = test // xmm2, xmm4 = free // xmm1 = used ReadTexel(xmm2, xmm3); ReadTexel(xmm4, xmm0); ReadTexel(xmm3, xmm5); ReadTexel(xmm5, xmm6); // xmm2 = c00 // xmm4 = c01 // xmm3 = c10 // xmm5 = c11 // xmm7 = test // xmm0, xmm6 = free // xmm1 = used // spill (TODO) movdqa(ptr[&m_local.temp.fd], xmm1); movdqa(ptr[&m_local.temp.test], xmm7); // xmm2 = c00 // xmm4 = c01 // xmm3 = c10 // xmm5 = c11 // xmm0, xmm1, xmm6, xmm7 = free movdqa(xmm1, xmm2); psllw(xmm1, 11); psrlw(xmm1, 8); movdqa(xmm0, xmm4); psllw(xmm0, 11); psrlw(xmm0, 8); lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]); movdqa(xmm6, xmm2); psllw(xmm6, 6); psrlw(xmm6, 11); psllw(xmm6, 3); movdqa(xmm1, xmm4); psllw(xmm1, 6); psrlw(xmm1, 11); psllw(xmm1, 3); lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]); movdqa(xmm7, xmm2); psllw(xmm7, 1); psrlw(xmm7, 11); psllw(xmm7, 3); movdqa(xmm6, xmm4); psllw(xmm6, 1); psrlw(xmm6, 11); psllw(xmm6, 3); lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]); psraw(xmm2, 15); psrlw(xmm2, 8); psraw(xmm4, 15); psrlw(xmm4, 8); lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]); // xmm0 = r00 // xmm1 = g00 // xmm6 = b00 // xmm4 = a00 // xmm3 = c10 // xmm5 = c11 // xmm2, xmm7 = free movdqa(xmm7, xmm3); psllw(xmm7, 11); psrlw(xmm7, 8); movdqa(xmm2, xmm5); psllw(xmm2, 11); psrlw(xmm2, 8); lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]); lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]); // xmm2 = r // xmm1 = g00 // xmm6 = b00 // xmm4 = a00 // xmm3 = c10 // xmm5 = c11 // xmm0, xmm7 = free movdqa(xmm7, xmm3); psllw(xmm7, 6); psrlw(xmm7, 11); psllw(xmm7, 3); movdqa(xmm0, xmm5); psllw(xmm0, 6); psrlw(xmm0, 11); psllw(xmm0, 3); lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]); lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]); // xmm2 = r // xmm0 = g // xmm6 = b00 // xmm4 = a00 // xmm3 = c10 // xmm5 = c11 // xmm1, xmm7 = free movdqa(xmm7, xmm3); psllw(xmm7, 1); psrlw(xmm7, 11); psllw(xmm7, 3); movdqa(xmm1, xmm5); psllw(xmm1, 1); psrlw(xmm1, 11); psllw(xmm1, 3); lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]); lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]); // xmm2 = r // xmm0 = g // xmm1 = b // xmm4 = a00 // xmm3 = c10 // xmm5 = c11 // xmm6, xmm7 = free psraw(xmm3, 15); psrlw(xmm3, 8); psraw(xmm5, 15); psrlw(xmm5, 8); lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]); lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]); // xmm2 = r // xmm0 = g // xmm1 = b // xmm5 = a // xmm3, xmm4, xmm6, xmm7 = free // TODO movdqa(xmm3, xmm5); // a movdqa(xmm4, xmm2); // r movdqa(xmm6, xmm1); // b movdqa(xmm5, xmm0); // g // reload test movdqa(xmm7, ptr[&m_local.temp.test]); // xmm4 = r // xmm5 = g // xmm6 = b // xmm3 = a // xmm7 = test // xmm0, xmm1, xmm2 = free // test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect) movdqa(xmm1, xmm3); por(xmm1, xmm4); movdqa(xmm2, xmm5); por(xmm2, xmm6); por(xmm1, xmm2); pxor(xmm0, xmm0); pcmpeqw(xmm1, xmm0); por(xmm7, xmm1); // a = a.gt16(GSVector4i::zero()); pcmpgtw(xmm3, xmm0); // reload fd movdqa(xmm1, ptr[&m_local.temp.fd]); } else { if(m_sel.twin) { // u = (u & m_local.twin[0].u).add16(m_local.twin[1].u); // v = (v & m_local.twin[0].v).add16(m_local.twin[1].v); pand(xmm2, ptr[&m_local.twin[0].u]); paddw(xmm2, ptr[&m_local.twin[1].u]); pand(xmm3, ptr[&m_local.twin[0].v]); paddw(xmm3, ptr[&m_local.twin[1].v]); } else { // u = u.min_i16(m_local.twin[2].u); // v = v.min_i16(m_local.twin[2].v); // TODO: if(!sprite) clamp16 else: pminsw(xmm2, ptr[&m_local.twin[2].u]); pminsw(xmm3, ptr[&m_local.twin[2].v]); } // xmm2 = u // xmm3 = v // xmm7 = test // xmm0, xmm4, xmm5, xmm6 = free // xmm1 = used // GSVector4i addr = v.sll16(8) | u; psllw(xmm3, 8); por(xmm3, xmm2); // xmm3 = addr // xmm7 = test // xmm0, xmm2, xmm4, xmm5, xmm6 = free // xmm1 = used ReadTexel(xmm6, xmm3); // xmm3 = c00 // xmm7 = test // xmm0, xmm2, xmm4, xmm5, xmm6 = free // xmm1 = used // test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels pxor(xmm0, xmm0); pcmpeqw(xmm0, xmm6); por(xmm7, xmm0); // c[0] = (c00 << 3) & 0x00f800f8; // c[1] = (c00 >> 2) & 0x00f800f8; // c[2] = (c00 >> 7) & 0x00f800f8; // c[3] = c00.sra16(15); movdqa(xmm3, xmm6); psraw(xmm3, 15); // a pcmpeqd(xmm0, xmm0); psrlw(xmm0, 11); psllw(xmm0, 3); // 0x00f8 movdqa(xmm4, xmm6); psllw(xmm4, 3); pand(xmm4, xmm0); // r movdqa(xmm5, xmm6); psrlw(xmm5, 2); pand(xmm5, xmm0); // g psrlw(xmm6, 7); pand(xmm6, xmm0); // b } }
void GSDrawScanlineCodeGenerator::SampleTexture() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { return; } mov(ebx, dword[&m_env.tex]); // ebx = tex if(!m_sel.fst) { // TODO: move these into Init/Step too? cvttps2dq(xmm2, xmm2); cvttps2dq(xmm3, xmm3); if(m_sel.ltf) { // u -= 0x8000; // v -= 0x8000; mov(eax, 0x8000); movd(xmm4, eax); pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); psubd(xmm2, xmm4); psubd(xmm3, xmm4); } } // xmm2 = u // xmm3 = v if(m_sel.ltf) { // GSVector4i uf = u.xxzzlh().srl16(1); movdqa(xmm0, xmm2); pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xmm0, 1); movdqa(xmmword[&m_env.temp.uf], xmm0); if(!m_sel.sprite) { // GSVector4i vf = v.xxzzlh().srl16(1); movdqa(xmm1, xmm3); pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xmm1, 1); movdqa(xmmword[&m_env.temp.vf], xmm1); } } // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); psrad(xmm2, 16); psrad(xmm3, 16); packssdw(xmm2, xmm3); if(m_sel.ltf) { // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); movdqa(xmm3, xmm2); pcmpeqd(xmm1, xmm1); psrlw(xmm1, 15); paddw(xmm3, xmm1); // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); Wrap(xmm2, xmm3); } else { // uv0 = Wrap(uv0); Wrap(xmm2); } // xmm2 = uv0 // xmm3 = uv1 (ltf) // xmm0, xmm1, xmm4, xmm5, xmm6 = free // xmm7 = used // GSVector4i y0 = uv0.uph16() << tw; // GSVector4i x0 = uv0.upl16(); pxor(xmm0, xmm0); movd(xmm1, ptr[&m_env.tw]); movdqa(xmm4, xmm2); punpckhwd(xmm2, xmm0); punpcklwd(xmm4, xmm0); pslld(xmm2, xmm1); // xmm0 = 0 // xmm1 = tw // xmm2 = y0 // xmm3 = uv1 (ltf) // xmm4 = x0 // xmm5, xmm6 = free // xmm7 = used if(m_sel.ltf) { // GSVector4i y1 = uv1.uph16() << tw; // GSVector4i x1 = uv1.upl16(); movdqa(xmm6, xmm3); punpckhwd(xmm3, xmm0); punpcklwd(xmm6, xmm0); pslld(xmm3, xmm1); // xmm2 = y0 // xmm3 = y1 // xmm4 = x0 // xmm6 = x1 // xmm0, xmm5, xmm6 = free // xmm7 = used // GSVector4i addr00 = y0 + x0; // GSVector4i addr01 = y0 + x1; // GSVector4i addr10 = y1 + x0; // GSVector4i addr11 = y1 + x1; movdqa(xmm5, xmm2); paddd(xmm5, xmm4); paddd(xmm2, xmm6); movdqa(xmm0, xmm3); paddd(xmm0, xmm4); paddd(xmm3, xmm6); // xmm5 = addr00 // xmm2 = addr01 // xmm0 = addr10 // xmm3 = addr11 // xmm1, xmm4, xmm6 = free // xmm7 = used // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(xmm6, xmm5, xmm1, xmm4); // xmm2, xmm5, xmm1 = free ReadTexel(xmm4, xmm2, xmm5, xmm1); // xmm0, xmm2, xmm5 = free ReadTexel(xmm1, xmm0, xmm2, xmm5); // xmm3, xmm0, xmm2 = free ReadTexel(xmm5, xmm3, xmm0, xmm2); // xmm6 = c00 // xmm4 = c01 // xmm1 = c10 // xmm5 = c11 // xmm0, xmm2, xmm3 = free // xmm7 = used movdqa(xmm0, xmmword[&m_env.temp.uf]); // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; movdqa(xmm2, xmm6); psllw(xmm2, 8); psrlw(xmm2, 8); psrlw(xmm6, 8); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; movdqa(xmm3, xmm4); psllw(xmm3, 8); psrlw(xmm3, 8); psrlw(xmm4, 8); // xmm0 = uf // xmm2 = rb00 // xmm3 = rb01 // xmm6 = ga00 // xmm4 = ga01 // xmm1 = c10 // xmm5 = c11 // xmm7 = used // rb00 = rb00.lerp16<0>(rb01, uf); // ga00 = ga00.lerp16<0>(ga01, uf); lerp16<0>(xmm3, xmm2, xmm0); lerp16<0>(xmm4, xmm6, xmm0); // xmm0 = uf // xmm3 = rb00 // xmm4 = ga00 // xmm1 = c10 // xmm5 = c11 // xmm2, xmm6 = free // xmm7 = used // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; movdqa(xmm2, xmm1); psllw(xmm1, 8); psrlw(xmm1, 8); psrlw(xmm2, 8); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; movdqa(xmm6, xmm5); psllw(xmm5, 8); psrlw(xmm5, 8); psrlw(xmm6, 8); // xmm0 = uf // xmm3 = rb00 // xmm4 = ga00 // xmm1 = rb10 // xmm5 = rb11 // xmm2 = ga10 // xmm6 = ga11 // xmm7 = used // rb10 = rb10.lerp16<0>(rb11, uf); // ga10 = ga10.lerp16<0>(ga11, uf); lerp16<0>(xmm5, xmm1, xmm0); lerp16<0>(xmm6, xmm2, xmm0); // xmm3 = rb00 // xmm4 = ga00 // xmm5 = rb10 // xmm6 = ga10 // xmm0, xmm1, xmm2 = free // xmm7 = used // rb00 = rb00.lerp16<0>(rb10, vf); // ga00 = ga00.lerp16<0>(ga10, vf); movdqa(xmm0, xmmword[&m_env.temp.vf]); lerp16<0>(xmm5, xmm3, xmm0); lerp16<0>(xmm6, xmm4, xmm0); } else { // GSVector4i addr00 = y0 + x0; paddd(xmm2, xmm4); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(xmm5, xmm2, xmm0, xmm1); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; movdqa(xmm6, xmm5); psllw(xmm5, 8); psrlw(xmm5, 8); psrlw(xmm6, 8); } }