void GSSetupPrimCodeGenerator::Depth() { if(!m_en.z && !m_en.f) { return; } if(!m_env.sel.sprite) { // GSVector4 t = dscan.p; movaps(xmm0, xmmword[edx + 16]); if(m_en.f) { // GSVector4 df = p.wwww(); movaps(xmm1, xmm0); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); // m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh(); movaps(xmm2, xmm1); mulps(xmm2, xmm3); cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(xmmword[&m_env.d4.f], xmm2); for(int i = 0; i < 4; i++) { // m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); movaps(xmm2, xmm1); mulps(xmm2, Xmm(4 + i)); cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(xmmword[&m_env.d[i].f], xmm2); } } if(m_en.z) { // GSVector4 dz = p.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); // m_env.d4.z = dz * 4.0f; movaps(xmm1, xmm0); mulps(xmm1, xmm3); movdqa(xmmword[&m_env.d4.z], xmm1); for(int i = 0; i < 4; i++) { // m_env.d[i].z = dz * m_shift[i]; movaps(xmm1, xmm0); mulps(xmm1, Xmm(4 + i)); movdqa(xmmword[&m_env.d[i].z], xmm1); } } } else { // GSVector4 p = vertices[0].p; movaps(xmm0, xmmword[ecx + 16]); if(m_en.f) { // m_env.p.f = GSVector4i(p).zzzzh().zzzz(); movaps(xmm1, xmm0); cvttps2dq(xmm1, xmm1); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); movdqa(xmmword[&m_env.p.f], xmm1); } if(m_en.z) { // GSVector4 z = p.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); if(m_env.sel.zoverflow) { // m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); static const float half = 0.5f; movss(xmm1, dword[&half]); shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); mulps(xmm1, xmm0); cvttps2dq(xmm1, xmm1); pslld(xmm1, 1); cvttps2dq(xmm0, xmm0); pcmpeqd(xmm2, xmm2); psrld(xmm2, 31); pand(xmm0, xmm2); por(xmm0, xmm1); } else { // m_env.p.z = GSVector4i(z); cvttps2dq(xmm0, xmm0); } movdqa(xmmword[&m_env.p.z], xmm0); } } }
void GPUDrawScanlineCodeGenerator::Init() { mov(eax, dword[esp + _top]); // uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left; mov(edi, eax); shl(edi, 10 + m_sel.scalex); add(edi, edx); lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]); // int steps = pixels - 8; sub(ecx, 8); if(m_sel.dtd) { // dither = GSVector4i::load<false>(&m_dither[top & 3][left & 3]); and(eax, 3); shl(eax, 5); and(edx, 3); shl(edx, 1); movdqu(xmm0, ptr[eax + edx + (size_t)m_dither]); movdqa(ptr[&m_local.temp.dither], xmm0); } mov(edx, dword[esp + _v]); if(m_sel.tme) { mov(esi, dword[&m_local.gd->tex]); // GSVector4i vt = GSVector4i(v.t).xxzzl(); cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]); pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); // s = vt.xxxx().add16(m_local.d.s); // t = vt.yyyy().add16(m_local.d.t); pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); paddw(xmm2, ptr[&m_local.d.s]); if(!m_sel.sprite) { paddw(xmm3, ptr[&m_local.d.t]); } else { if(m_sel.ltf) { movdqa(xmm0, xmm3); psllw(xmm0, 8); psrlw(xmm0, 1); movdqa(ptr[&m_local.temp.vf], xmm0); } } movdqa(ptr[&m_local.temp.s], xmm2); movdqa(ptr[&m_local.temp.t], xmm3); } if(m_sel.tfx != 3) // != decal { // GSVector4i vc = GSVector4i(v.c).xxzzlh(); cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]); pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); // r = vc.xxxx(); // g = vc.yyyy(); // b = vc.zzzz(); pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1)); pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); if(m_sel.iip) { // r = r.add16(m_local.d.r); // g = g.add16(m_local.d.g); // b = b.add16(m_local.d.b); paddw(xmm4, ptr[&m_local.d.r]); paddw(xmm5, ptr[&m_local.d.g]); paddw(xmm6, ptr[&m_local.d.b]); } movdqa(ptr[&m_local.temp.r], xmm4); movdqa(ptr[&m_local.temp.g], xmm5); movdqa(ptr[&m_local.temp.b], xmm6); } }
void GSDrawScanlineCodeGenerator::SampleTexture() { if(!m_sel.fb || m_sel.tfx == TFX_NONE) { return; } mov(ebx, dword[&m_env.tex]); // ebx = tex if(!m_sel.fst) { // TODO: move these into Init/Step too? cvttps2dq(xmm2, xmm2); cvttps2dq(xmm3, xmm3); if(m_sel.ltf) { // u -= 0x8000; // v -= 0x8000; mov(eax, 0x8000); movd(xmm4, eax); pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); psubd(xmm2, xmm4); psubd(xmm3, xmm4); } } // xmm2 = u // xmm3 = v if(m_sel.ltf) { // GSVector4i uf = u.xxzzlh().srl16(1); movdqa(xmm0, xmm2); pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xmm0, 1); movdqa(xmmword[&m_env.temp.uf], xmm0); if(!m_sel.sprite) { // GSVector4i vf = v.xxzzlh().srl16(1); movdqa(xmm1, xmm3); pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xmm1, 1); movdqa(xmmword[&m_env.temp.vf], xmm1); } } // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); psrad(xmm2, 16); psrad(xmm3, 16); packssdw(xmm2, xmm3); if(m_sel.ltf) { // GSVector4i uv1 = uv0.add16(GSVector4i::x0001()); movdqa(xmm3, xmm2); pcmpeqd(xmm1, xmm1); psrlw(xmm1, 15); paddw(xmm3, xmm1); // uv0 = Wrap(uv0); // uv1 = Wrap(uv1); Wrap(xmm2, xmm3); } else { // uv0 = Wrap(uv0); Wrap(xmm2); } // xmm2 = uv0 // xmm3 = uv1 (ltf) // xmm0, xmm1, xmm4, xmm5, xmm6 = free // xmm7 = used // GSVector4i y0 = uv0.uph16() << tw; // GSVector4i x0 = uv0.upl16(); pxor(xmm0, xmm0); movd(xmm1, ptr[&m_env.tw]); movdqa(xmm4, xmm2); punpckhwd(xmm2, xmm0); punpcklwd(xmm4, xmm0); pslld(xmm2, xmm1); // xmm0 = 0 // xmm1 = tw // xmm2 = y0 // xmm3 = uv1 (ltf) // xmm4 = x0 // xmm5, xmm6 = free // xmm7 = used if(m_sel.ltf) { // GSVector4i y1 = uv1.uph16() << tw; // GSVector4i x1 = uv1.upl16(); movdqa(xmm6, xmm3); punpckhwd(xmm3, xmm0); punpcklwd(xmm6, xmm0); pslld(xmm3, xmm1); // xmm2 = y0 // xmm3 = y1 // xmm4 = x0 // xmm6 = x1 // xmm0, xmm5, xmm6 = free // xmm7 = used // GSVector4i addr00 = y0 + x0; // GSVector4i addr01 = y0 + x1; // GSVector4i addr10 = y1 + x0; // GSVector4i addr11 = y1 + x1; movdqa(xmm5, xmm2); paddd(xmm5, xmm4); paddd(xmm2, xmm6); movdqa(xmm0, xmm3); paddd(xmm0, xmm4); paddd(xmm3, xmm6); // xmm5 = addr00 // xmm2 = addr01 // xmm0 = addr10 // xmm3 = addr11 // xmm1, xmm4, xmm6 = free // xmm7 = used // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(xmm6, xmm5, xmm1, xmm4); // xmm2, xmm5, xmm1 = free ReadTexel(xmm4, xmm2, xmm5, xmm1); // xmm0, xmm2, xmm5 = free ReadTexel(xmm1, xmm0, xmm2, xmm5); // xmm3, xmm0, xmm2 = free ReadTexel(xmm5, xmm3, xmm0, xmm2); // xmm6 = c00 // xmm4 = c01 // xmm1 = c10 // xmm5 = c11 // xmm0, xmm2, xmm3 = free // xmm7 = used movdqa(xmm0, xmmword[&m_env.temp.uf]); // GSVector4i rb00 = c00 & mask; // GSVector4i ga00 = (c00 >> 8) & mask; movdqa(xmm2, xmm6); psllw(xmm2, 8); psrlw(xmm2, 8); psrlw(xmm6, 8); // GSVector4i rb01 = c01 & mask; // GSVector4i ga01 = (c01 >> 8) & mask; movdqa(xmm3, xmm4); psllw(xmm3, 8); psrlw(xmm3, 8); psrlw(xmm4, 8); // xmm0 = uf // xmm2 = rb00 // xmm3 = rb01 // xmm6 = ga00 // xmm4 = ga01 // xmm1 = c10 // xmm5 = c11 // xmm7 = used // rb00 = rb00.lerp16<0>(rb01, uf); // ga00 = ga00.lerp16<0>(ga01, uf); lerp16<0>(xmm3, xmm2, xmm0); lerp16<0>(xmm4, xmm6, xmm0); // xmm0 = uf // xmm3 = rb00 // xmm4 = ga00 // xmm1 = c10 // xmm5 = c11 // xmm2, xmm6 = free // xmm7 = used // GSVector4i rb10 = c10 & mask; // GSVector4i ga10 = (c10 >> 8) & mask; movdqa(xmm2, xmm1); psllw(xmm1, 8); psrlw(xmm1, 8); psrlw(xmm2, 8); // GSVector4i rb11 = c11 & mask; // GSVector4i ga11 = (c11 >> 8) & mask; movdqa(xmm6, xmm5); psllw(xmm5, 8); psrlw(xmm5, 8); psrlw(xmm6, 8); // xmm0 = uf // xmm3 = rb00 // xmm4 = ga00 // xmm1 = rb10 // xmm5 = rb11 // xmm2 = ga10 // xmm6 = ga11 // xmm7 = used // rb10 = rb10.lerp16<0>(rb11, uf); // ga10 = ga10.lerp16<0>(ga11, uf); lerp16<0>(xmm5, xmm1, xmm0); lerp16<0>(xmm6, xmm2, xmm0); // xmm3 = rb00 // xmm4 = ga00 // xmm5 = rb10 // xmm6 = ga10 // xmm0, xmm1, xmm2 = free // xmm7 = used // rb00 = rb00.lerp16<0>(rb10, vf); // ga00 = ga00.lerp16<0>(ga10, vf); movdqa(xmm0, xmmword[&m_env.temp.vf]); lerp16<0>(xmm5, xmm3, xmm0); lerp16<0>(xmm6, xmm4, xmm0); } else { // GSVector4i addr00 = y0 + x0; paddd(xmm2, xmm4); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); ReadTexel(xmm5, xmm2, xmm0, xmm1); // GSVector4i mask = GSVector4i::x00ff(); // c[0] = c00 & mask; // c[1] = (c00 >> 8) & mask; movdqa(xmm6, xmm5); psllw(xmm5, 8); psrlw(xmm5, 8); psrlw(xmm6, 8); } }
void GSDrawScanlineCodeGenerator::Init(int params) { const int _top = params + 4; const int _v = params + 8; // int skip = left & 3; mov(ebx, edx); and(edx, 3); // left -= skip; sub(ebx, edx); // int steps = right - left - 4; sub(ecx, ebx); sub(ecx, 4); // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; shl(edx, 4); movdqa(xmm7, xmmword[edx + (size_t)&m_test[0]]); mov(eax, ecx); sar(eax, 31); and(eax, ecx); shl(eax, 4); por(xmm7, xmmword[eax + (size_t)&m_test[7]]); // GSVector2i* fza_base = &m_env.fzbr[top]; mov(esi, dword[esp + _top]); lea(esi, ptr[esi * 8]); add(esi, dword[&m_env.fzbr]); // GSVector2i* fza_offset = &m_env.fzbc[left >> 2]; lea(edi, ptr[ebx * 2]); add(edi, dword[&m_env.fzbc]); if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) { // edx = &m_env.d[skip] shl(edx, 4); lea(edx, ptr[edx + (size_t)m_env.d]); // ebx = &v mov(ebx, dword[esp + _v]); } if(!m_sel.sprite) { if(m_sel.fwrite && m_sel.fge || m_sel.zb) { movaps(xmm0, xmmword[ebx + 16]); // v.p if(m_sel.fwrite && m_sel.fge) { // f = GSVector4i(vp).zzzzh().zzzz().add16(m_env.d[skip].f); cvttps2dq(xmm1, xmm0); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); paddw(xmm1, xmmword[edx + 16 * 6]); movdqa(xmmword[&m_env.temp.f], xmm1); } if(m_sel.zb) { // z = vp.zzzz() + m_env.d[skip].z; shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); addps(xmm0, xmmword[edx]); movaps(xmmword[&m_env.temp.z], xmm0); } } } else { if(m_sel.ztest) { movdqa(xmm0, xmmword[&m_env.p.z]); } } if(m_sel.fb) { if(m_sel.edge || m_sel.tfx != TFX_NONE) { movaps(xmm4, xmmword[ebx + 32]); // v.t } if(m_sel.edge) { pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); psrlw(xmm3, 9); movdqa(xmmword[&m_env.temp.cov], xmm3); } if(m_sel.tfx != TFX_NONE) { if(m_sel.fst) { // GSVector4i vti(vt); cvttps2dq(xmm4, xmm4); // si = vti.xxxx() + m_env.d[skip].si; // ti = vti.yyyy(); if(!sprite) ti += m_env.d[skip].ti; pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); paddd(xmm2, xmmword[edx + 16 * 7]); if(!m_sel.sprite) { paddd(xmm3, xmmword[edx + 16 * 8]); } else { if(m_sel.ltf) { movdqa(xmm4, xmm3); pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xmm4, 1); movdqa(xmmword[&m_env.temp.vf], xmm4); } } movdqa(xmmword[&m_env.temp.s], xmm2); movdqa(xmmword[&m_env.temp.t], xmm3); } else { // s = vt.xxxx() + m_env.d[skip].s; // t = vt.yyyy() + m_env.d[skip].t; // q = vt.zzzz() + m_env.d[skip].q; movaps(xmm2, xmm4); movaps(xmm3, xmm4); shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); addps(xmm2, xmmword[edx + 16 * 1]); addps(xmm3, xmmword[edx + 16 * 2]); addps(xmm4, xmmword[edx + 16 * 3]); movaps(xmmword[&m_env.temp.s], xmm2); movaps(xmmword[&m_env.temp.t], xmm3); movaps(xmmword[&m_env.temp.q], xmm4); rcpps(xmm4, xmm4); mulps(xmm2, xmm4); mulps(xmm3, xmm4); } } if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) { if(m_sel.iip) { // GSVector4i vc = GSVector4i(v.c); cvttps2dq(xmm6, xmmword[ebx]); // v.c // vc = vc.upl16(vc.zwxy()); pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2)); punpcklwd(xmm6, xmm5); // rb = vc.xxxx().add16(m_env.d[skip].rb); // ga = vc.zzzz().add16(m_env.d[skip].ga); pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); paddw(xmm5, xmmword[edx + 16 * 4]); paddw(xmm6, xmmword[edx + 16 * 5]); movdqa(xmmword[&m_env.temp.rb], xmm5); movdqa(xmmword[&m_env.temp.ga], xmm6); } else { if(m_sel.tfx == TFX_NONE) { movdqa(xmm5, xmmword[&m_env.c.rb]); movdqa(xmm6, xmmword[&m_env.c.ga]); } } } } }
void GSDrawScanlineCodeGenerator::AlphaBlend() { if(!m_sel.fwrite) { return; } if(m_sel.abe == 0 && m_sel.aa1 == 0) { return; } if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1) { switch(m_sel.fpsm) { case 0: case 1: // c[2] = fd & mask; // c[3] = (fd >> 8) & mask; movdqa(xmm0, xmm2); movdqa(xmm1, xmm2); psllw(xmm0, 8); psrlw(xmm0, 8); psrlw(xmm1, 8); break; case 2: // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3); // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2); movdqa(xmm0, xmm2); movdqa(xmm1, xmm2); movdqa(xmm4, xmm2); pcmpeqd(xmm7, xmm7); psrld(xmm7, 27); // 0x0000001f pand(xmm0, xmm7); pslld(xmm0, 3); pslld(xmm7, 10); // 0x00007c00 pand(xmm4, xmm7); pslld(xmm4, 9); por(xmm0, xmm4); movdqa(xmm4, xmm1); psrld(xmm7, 5); // 0x000003e0 pand(xmm1, xmm7); psrld(xmm1, 2); psllw(xmm7, 10); // 0x00008000 pand(xmm4, xmm7); pslld(xmm4, 8); por(xmm1, xmm4); break; } } // xmm5, xmm6 = src rb, ga // xmm0, xmm1 = dst rb, ga // xmm2, xmm3 = used // xmm4, xmm7 = free if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)) { movdqa(xmm4, xmm5); } if(m_sel.aba != m_sel.abb) { // rb = c[aba * 2 + 0]; switch(m_sel.aba) { case 0: break; case 1: movdqa(xmm5, xmm0); break; case 2: pxor(xmm5, xmm5); break; } // rb = rb.sub16(c[abb * 2 + 0]); switch(m_sel.abb) { case 0: psubw(xmm5, xmm4); break; case 1: psubw(xmm5, xmm0); break; case 2: break; } if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) { // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_env.afix; switch(m_sel.abc) { case 0: case 1: movdqa(xmm7, m_sel.abc ? xmm1 : xmm6); pshuflw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1)); psllw(xmm7, 7); break; case 2: movdqa(xmm7, xmmword[&m_env.afix]); break; } // rb = rb.modulate16<1>(a); modulate16<1>(xmm5, xmm7); } // rb = rb.add16(c[abd * 2 + 0]); switch(m_sel.abd) { case 0: paddw(xmm5, xmm4); break; case 1: paddw(xmm5, xmm0); break; case 2: break; } } else { // rb = c[abd * 2 + 0]; switch(m_sel.abd) { case 0: break; case 1: movdqa(xmm5, xmm0); break; case 2: pxor(xmm5, xmm5); break; } } if(m_sel.pabe) { // mask = (c[1] << 8).sra32(31); movdqa(xmm0, xmm6); pslld(xmm0, 8); psrad(xmm0, 31); // rb = c[0].blend8(rb, mask); blend8r(xmm5, xmm4); } // xmm6 = src ga // xmm1 = dst ga // xmm5 = rb // xmm7 = a // xmm2, xmm3 = used // xmm0, xmm4 = free movdqa(xmm4, xmm6); if(m_sel.aba != m_sel.abb) { // ga = c[aba * 2 + 1]; switch(m_sel.aba) { case 0: break; case 1: movdqa(xmm6, xmm1); break; case 2: pxor(xmm6, xmm6); break; } // ga = ga.sub16(c[abeb * 2 + 1]); switch(m_sel.abb) { case 0: psubw(xmm6, xmm4); break; case 1: psubw(xmm6, xmm1); break; case 2: break; } if(!(m_sel.fpsm == 1 && m_sel.abc == 1)) { // ga = ga.modulate16<1>(a); modulate16<1>(xmm6, xmm7); } // ga = ga.add16(c[abd * 2 + 1]); switch(m_sel.abd) { case 0: paddw(xmm6, xmm4); break; case 1: paddw(xmm6, xmm1); break; case 2: break; } } else { // ga = c[abd * 2 + 1]; switch(m_sel.abd) { case 0: break; case 1: movdqa(xmm6, xmm1); break; case 2: pxor(xmm6, xmm6); break; } } // xmm4 = src ga // xmm5 = rb // xmm6 = ga // xmm2, xmm3 = used // xmm0, xmm1, xmm7 = free if(m_sel.pabe) { if(!m_cpu.has(util::Cpu::tSSE41)) { // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb) movdqa(xmm0, xmm4); pslld(xmm0, 8); psrad(xmm0, 31); } psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16) // ga = c[1].blend8(ga, mask).mix16(c[1]); blend8r(xmm6, xmm4); } else { if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx { mix16(xmm6, xmm4, xmm7); } } }
void GSDrawScanlineCodeGenerator::ColorTFX() { if(!m_sel.fwrite) { return; } switch(m_sel.tfx) { case TFX_MODULATE: // GSVector4i rb = iip ? rbf : m_env.c.rb; // rbt = rbt.modulate16<1>(rb).clamp8(); modulate16<1>(xmm5, xmmword[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]); clamp16(xmm5, xmm1); break; case TFX_DECAL: break; case TFX_HIGHLIGHT: case TFX_HIGHLIGHT2: if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc) { // GSVector4i ga = iip ? gaf : m_env.c.ga; movdqa(xmm2, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]); } // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); movdqa(xmm1, xmm6); modulate16<1>(xmm6, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1)); psrlw(xmm2, 7); paddw(xmm6, xmm2); clamp16(xmm6, xmm0); mix16(xmm6, xmm1, xmm0); // GSVector4i rb = iip ? rbf : m_env.c.rb; // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); modulate16<1>(xmm5, xmmword[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]); paddw(xmm5, xmm2); clamp16(xmm5, xmm0); break; case TFX_NONE: // rbt = iip ? rb.srl16(7) : rb; if(m_sel.iip) { psrlw(xmm5, 7); } break; } }