void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Xmm& temp, const Reg32& addr, uint8 i, int psm)
{
    // Emits the store of 32-bit lane i of src to video memory.
    //
    //   src  - register holding the four pixels
    //   temp - scratch register, only touched when SSE4.1 is unavailable and i != 0
    //   addr - register holding the pixel index; it is scaled by 2 in the address
    //   i    - lane to write (0..3), also selects the entry of offsets[]
    //   psm  - 0: store all 32 bits, 1: replace only the low 24 bits, 2: store 16 bits
    //
    // eax is clobbered for psm 1 and 2. Any other psm value emits nothing.

    static const int offsets[4] = {0, 2, 8, 10};

    Address dst = ptr[addr * 2 + (size_t)m_env.vm + offsets[i] * 2];

    // pextrd needs SSE4.1; without it the lane is broadcast into temp and read with movd
    const bool sse41 = m_cpu.has(util::Cpu::tSSE41);

    switch(psm)
    {
    case 0:
        if(i == 0)
        {
            movd(dst, src);
        }
        else if(sse41)
        {
            pextrd(dst, src, i);
        }
        else
        {
            pshufd(temp, src, _MM_SHUFFLE(i, i, i, i));
            movd(dst, temp);
        }
        break;

    case 1:
        if(i == 0)
        {
            movd(eax, src);
        }
        else if(sse41)
        {
            pextrd(eax, src, i);
        }
        else
        {
            pshufd(temp, src, _MM_SHUFFLE(i, i, i, i));
            movd(eax, temp);
        }

        // dst = (dst & 0xff000000) | (pixel & 0x00ffffff), done as a masked xor
        xor(eax, dst);
        and(eax, 0xffffff);
        xor(dst, eax);
        break;

    case 2:
        // low 16 bits of lane i are word i * 2; identical with and without SSE4.1
        pextrw(eax, src, i * 2);
        mov(dst, ax);
        break;
    }
}
void GSSetupPrimCodeGenerator::Depth()
{
	if(!m_en.z && !m_en.f)
	{
		return;
	}

	if(!m_env.sel.sprite)
	{
		// GSVector4 t = dscan.p;

		movaps(xmm0, xmmword[edx + 16]);

		if(m_en.f)
		{
			// GSVector4 df = p.wwww();

			movaps(xmm1, xmm0);
			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

			// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();

			movaps(xmm2, xmm1);
			mulps(xmm2, xmm3);
			cvttps2dq(xmm2, xmm2);
			pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			movdqa(xmmword[&m_env.d4.f], xmm2);

			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();

				movaps(xmm2, xmm1);
				mulps(xmm2, Xmm(4 + i));
				cvttps2dq(xmm2, xmm2);
				pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				movdqa(xmmword[&m_env.d[i].f], xmm2);
			}
		}

		if(m_en.z)
		{
			// GSVector4 dz = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			// m_env.d4.z = dz * 4.0f;

			movaps(xmm1, xmm0);
			mulps(xmm1, xmm3);
			movdqa(xmmword[&m_env.d4.z], xmm1);

			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].z = dz * m_shift[i];

				movaps(xmm1, xmm0);
				mulps(xmm1, Xmm(4 + i));
				movdqa(xmmword[&m_env.d[i].z], xmm1);
			}
		}
	}
	else
	{
		// GSVector4 p = vertices[0].p;

		movaps(xmm0, xmmword[ecx + 16]);

		if(m_en.f)
		{
			// m_env.p.f = GSVector4i(p).zzzzh().zzzz();

			movaps(xmm1, xmm0);
			cvttps2dq(xmm1, xmm1);
			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			movdqa(xmmword[&m_env.p.f], xmm1);
		}

		if(m_en.z)
		{
			// GSVector4 z = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			if(m_env.sel.zoverflow)
			{
				// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

				static const float half = 0.5f;

				movss(xmm1, dword[&half]);
				shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
				mulps(xmm1, xmm0);
				cvttps2dq(xmm1, xmm1);
				pslld(xmm1, 1);

				cvttps2dq(xmm0, xmm0);
				pcmpeqd(xmm2, xmm2);
				psrld(xmm2, 31);
				pand(xmm0, xmm2);
				
				por(xmm0, xmm1);
			}
			else
			{
				// m_env.p.z = GSVector4i(z);

				cvttps2dq(xmm0, xmm0);
			}

			movdqa(xmmword[&m_env.p.z], xmm0);
		}
	}
}
void GSSetupPrimCodeGenerator::Color()
{
	// Emits the per-primitive color setup.
	//
	// With m_env.sel.iip set, the color gradients from dscan.c (read through edx)
	// are written to m_env.d4.c and m_env.d[0..3].rb / .ga. Otherwise the color of
	// vertices[0] (read through ecx) is written once to m_env.c.rb / m_env.c.ga.
	// NOTE(review): relies on xmm3 holding the 4.0f factor and xmm4..xmm7 holding
	// m_shift[0..3] on entry; that setup is outside this function - confirm.

	if(!m_en.c)
	{
		return;
	}

	if(!m_env.sel.iip)
	{
		// GSVector4i c = GSVector4i(vertices[0].c);

		movaps(xmm0, xmmword[ecx]);
		cvttps2dq(xmm0, xmm0);

		// c = c.upl16(c.zwxy());

		movdqa(xmm1, xmm0);
		pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2));
		punpcklwd(xmm0, xmm1);

		// if(!tme) c = c.srl16(7);

		if(m_env.sel.tfx == TFX_NONE)
		{
			psrlw(xmm0, 7);
		}

		// m_env.c.rb = c.xxxx();
		// m_env.c.ga = c.zzzz();

		movdqa(xmm1, xmm0);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
		pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
		movdqa(xmmword[&m_env.c.rb], xmm0);
		movdqa(xmmword[&m_env.c.ga], xmm1);

		return;
	}

	// GSVector4 c = dscan.c;

	movaps(xmm0, xmmword[edx]);
	movaps(xmm1, xmm0);

	// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();

	movaps(xmm2, xmm0);
	mulps(xmm2, xmm3);
	cvttps2dq(xmm2, xmm2);
	pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
	packssdw(xmm2, xmm2);
	movdqa(xmmword[&m_env.d4.c], xmm2);

	// the 4.0f factor in xmm3 is dead from here on, so xmm3 becomes a scratch register

	// GSVector4 dr = c.xxxx();
	// GSVector4 db = c.zzzz();

	shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
	shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));

	for(int k = 0; k != 4; ++k)
	{
		// GSVector4i r = GSVector4i(dr * m_shift[k]).ps32();

		movaps(xmm2, xmm0);
		mulps(xmm2, Xmm(4 + k));
		cvttps2dq(xmm2, xmm2);
		packssdw(xmm2, xmm2);

		// GSVector4i b = GSVector4i(db * m_shift[k]).ps32();

		movaps(xmm3, xmm1);
		mulps(xmm3, Xmm(4 + k));
		cvttps2dq(xmm3, xmm3);
		packssdw(xmm3, xmm3);

		// m_env.d[k].rb = r.upl16(b);

		punpcklwd(xmm2, xmm3);
		movdqa(xmmword[&m_env.d[k].rb], xmm2);
	}

	// xmm0/xmm1 were overwritten with dr/db above and no register is left to keep
	// a copy, so dscan.c is fetched a second time

	movaps(xmm0, xmmword[edx]);
	movaps(xmm1, xmm0);

	// GSVector4 dg = c.yyyy();
	// GSVector4 da = c.wwww();

	shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
	shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

	for(int k = 0; k != 4; ++k)
	{
		// GSVector4i g = GSVector4i(dg * m_shift[k]).ps32();

		movaps(xmm2, xmm0);
		mulps(xmm2, Xmm(4 + k));
		cvttps2dq(xmm2, xmm2);
		packssdw(xmm2, xmm2);

		// GSVector4i a = GSVector4i(da * m_shift[k]).ps32();

		movaps(xmm3, xmm1);
		mulps(xmm3, Xmm(4 + k));
		cvttps2dq(xmm3, xmm3);
		packssdw(xmm3, xmm3);

		// m_env.d[k].ga = g.upl16(a);

		punpcklwd(xmm2, xmm3);
		movdqa(xmmword[&m_env.d[k].ga], xmm2);
	}
}
// Emits the texture sampling code for one group of pixels.
//
// Nothing is emitted when m_sel.tme is clear. Register usage, as tracked by the
// inline notes below:
//   entry: xmm2 = s, xmm3 = t, xmm7 = test mask, xmm1 = in use (must survive)
//   exit:  xmm4 = r, xmm5 = g, xmm6 = b, xmm3 = a, xmm7 = updated test mask,
//          xmm1 = same value as on entry
// With m_sel.ltf set, four neighbouring texels are read and blended with lerp16
// using the fractions stored in m_local.temp.uf / temp.vf; otherwise a single
// texel is read. eax is clobbered when ltf is set and edx when m_sel.tlu is set.
void GPUDrawScanlineCodeGenerator::SampleTexture()
{
	if(!m_sel.tme)
	{
		return;
	}

	if(m_sel.tlu)
	{
		// edx = clut pointer, loaded from m_local.gd->clut at run time
		// NOTE(review): presumably consumed by ReadTexel - confirm

		mov(edx, ptr[&m_local.gd->clut]);
	}

	// xmm2 = s
	// xmm3 = t
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f
		// GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f

		mov(eax, 0x00200020);
		movd(xmm0, eax);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));

		psubw(xmm2, xmm0);
		psubw(xmm3, xmm0);

		// GSVector4i uf = (u & GSVector4i::x00ff()) << 7;
		// GSVector4i vf = (v & GSVector4i::x00ff()) << 7;
		// (the mask-and-shift is done as a left shift by 8 followed by a right shift by 1)

		movdqa(xmm0, xmm2);
		psllw(xmm0, 8);
		psrlw(xmm0, 1);
		movdqa(ptr[&m_local.temp.uf], xmm0);

		// vf is only stored here for non-sprites; for sprites this function reads
		// m_local.temp.vf without writing it

		if(!m_sel.sprite)
		{
			movdqa(xmm0, xmm3);
			psllw(xmm0, 8);
			psrlw(xmm0, 1);
			movdqa(ptr[&m_local.temp.vf], xmm0);
		}
	}

	// GSVector4i u0 = s.srl16(8);
	// GSVector4i v0 = t.srl16(8);

	psrlw(xmm2, 8);
	psrlw(xmm3, 8);

	// xmm2 = u
	// xmm3 = v
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u1 = u0.add16(GSVector4i::x0001());
		// GSVector4i v1 = v0.add16(GSVector4i::x0001());

		movdqa(xmm4, xmm2);
		movdqa(xmm5, xmm3);

		// all ones shifted right by 15 gives 0x0001 in every 16-bit lane

		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 15);
		paddw(xmm4, xmm0);
		paddw(xmm5, xmm0);

		if(m_sel.twin)
		{
			// u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
			// u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);

			movdqa(xmm0, ptr[&m_local.twin[0].u]);
			movdqa(xmm6, ptr[&m_local.twin[1].u]);

			pand(xmm2, xmm0);
			paddw(xmm2, xmm6);
			pand(xmm4, xmm0);
			paddw(xmm4, xmm6);

			movdqa(xmm0, ptr[&m_local.twin[0].v]);
			movdqa(xmm6, ptr[&m_local.twin[1].v]);

			pand(xmm3, xmm0);
			paddw(xmm3, xmm6);
			pand(xmm5, xmm0);
			paddw(xmm5, xmm6);
		}
		else
		{
			// u0 = u0.min_i16(m_local.twin[2].u);
			// v0 = v0.min_i16(m_local.twin[2].v);
			// u1 = u1.min_i16(m_local.twin[2].u);
			// v1 = v1.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			movdqa(xmm0, ptr[&m_local.twin[2].u]);
			movdqa(xmm6, ptr[&m_local.twin[2].v]);

			pminsw(xmm2, xmm0);
			pminsw(xmm3, xmm6);
			pminsw(xmm4, xmm0);
			pminsw(xmm5, xmm6);
		}

		// xmm2 = u0
		// xmm3 = v0
		// xmm4 = u1
		// xmm5 = v1
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// GSVector4i addr00 = v0.sll16(8) | u0;
		// GSVector4i addr01 = v0.sll16(8) | u1;
		// GSVector4i addr10 = v1.sll16(8) | u0;
		// GSVector4i addr11 = v1.sll16(8) | u1;

		psllw(xmm3, 8);
		movdqa(xmm0, xmm3);
		por(xmm3, xmm2);
		por(xmm0, xmm4);

		psllw(xmm5, 8);
		movdqa(xmm6, xmm5);
		por(xmm5, xmm2);
		por(xmm6, xmm4);

		// xmm3 = addr00
		// xmm0 = addr01
		// xmm5 = addr10
		// xmm6 = addr11
		// xmm7 = test
		// xmm2, xmm4 = free
		// xmm1 = used

		// ReadTexel(dst, addr): the destination of each call reuses a register
		// whose address has already been consumed by an earlier call

		ReadTexel(xmm2, xmm3);
		ReadTexel(xmm4, xmm0);
		ReadTexel(xmm3, xmm5);
		ReadTexel(xmm5, xmm6);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// spill (TODO)
		// xmm1 and xmm7 are parked in m_local.temp so they can serve as scratch
		// registers during the blend; both are reloaded further down

		movdqa(ptr[&m_local.temp.fd], xmm1);
		movdqa(ptr[&m_local.temp.test], xmm7);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm1, xmm6, xmm7 = free

		// top row, red: bits 0-4 of each texel moved to bits 3-7, then blended by uf

		movdqa(xmm1, xmm2);
		psllw(xmm1, 11);
		psrlw(xmm1, 8);

		movdqa(xmm0, xmm4);
		psllw(xmm0, 11);
		psrlw(xmm0, 8);

		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]);

		// top row, green: bits 5-9 moved to bits 3-7

		movdqa(xmm6, xmm2);
		psllw(xmm6, 6);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		movdqa(xmm1, xmm4);
		psllw(xmm1, 6);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]);

		// top row, blue: bits 10-14 moved to bits 3-7

		movdqa(xmm7, xmm2);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm6, xmm4);
		psllw(xmm6, 1);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]);

		// top row, alpha: bit 15 expanded to 0x00ff / 0x0000

		psraw(xmm2, 15);
		psrlw(xmm2, 8);
		psraw(xmm4, 15);
		psrlw(xmm4, 8);

		lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]);

		// xmm0 = r00
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm2, xmm7 = free

		// bottom row, red, then blended with the top row by vf

		movdqa(xmm7, xmm3);
		psllw(xmm7, 11);
		psrlw(xmm7, 8);

		movdqa(xmm2, xmm5);
		psllw(xmm2, 11);
		psrlw(xmm2, 8);

		lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm7 = free

		// bottom row, green, then blended with the top row by vf

		movdqa(xmm7, xmm3);
		psllw(xmm7, 6);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm0, xmm5);
		psllw(xmm0, 6);
		psrlw(xmm0, 11);
		psllw(xmm0, 3);

		lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm1, xmm7 = free

		// bottom row, blue, then blended with the top row by vf

		movdqa(xmm7, xmm3);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm1, xmm5);
		psllw(xmm1, 1);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm6, xmm7 = free

		// bottom row, alpha, then blended with the top row by vf

		psraw(xmm3, 15);
		psrlw(xmm3, 8);
		psraw(xmm5, 15);
		psrlw(xmm5, 8);

		lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm5 = a
		// xmm3, xmm4, xmm6, xmm7 = free

		// TODO
		// shuffle the results into the output registers; a must be copied out of
		// xmm5 first because g is written into xmm5 last
		movdqa(xmm3, xmm5); // a
		movdqa(xmm4, xmm2); // r
		movdqa(xmm6, xmm1); // b
		movdqa(xmm5, xmm0); // g

		// reload test

		movdqa(xmm7, ptr[&m_local.temp.test]);

		// xmm4 = r
		// xmm5 = g
		// xmm6 = b
		// xmm3 = a
		// xmm7 = test
		// xmm0, xmm1, xmm2 = free

		// test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect)

		movdqa(xmm1, xmm3);
		por(xmm1, xmm4);
		movdqa(xmm2, xmm5);
		por(xmm2, xmm6);
		por(xmm1, xmm2);

		pxor(xmm0, xmm0);
		pcmpeqw(xmm1, xmm0);
		por(xmm7, xmm1);

		// a = a.gt16(GSVector4i::zero());

		pcmpgtw(xmm3, xmm0);

		// reload fd

		movdqa(xmm1, ptr[&m_local.temp.fd]);
	}
	else
	{
		if(m_sel.twin)
		{
			// u = (u & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v = (v & m_local.twin[0].v).add16(m_local.twin[1].v);

			pand(xmm2, ptr[&m_local.twin[0].u]);
			paddw(xmm2, ptr[&m_local.twin[1].u]);
			pand(xmm3, ptr[&m_local.twin[0].v]);
			paddw(xmm3, ptr[&m_local.twin[1].v]);
		}
		else
		{
			// u = u.min_i16(m_local.twin[2].u);
			// v = v.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			pminsw(xmm2, ptr[&m_local.twin[2].u]);
			pminsw(xmm3, ptr[&m_local.twin[2].v]);
		}

		// xmm2 = u
		// xmm3 = v
		// xmm7 = test
		// xmm0, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		// GSVector4i addr = v.sll16(8) | u;

		psllw(xmm3, 8);
		por(xmm3, xmm2);

		// xmm3 = addr
		// xmm7 = test
		// xmm0, xmm2, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		ReadTexel(xmm6, xmm3);

		// xmm6 = c00 (ReadTexel's first argument is the destination)
		// xmm7 = test
		// xmm0, xmm2, xmm3, xmm4, xmm5 = free
		// xmm1 = used

		// test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels

		pxor(xmm0, xmm0);
		pcmpeqw(xmm0, xmm6);
		por(xmm7, xmm0);

		// c[0] = (c00 << 3) & 0x00f800f8;
		// c[1] = (c00 >> 2) & 0x00f800f8;
		// c[2] = (c00 >> 7) & 0x00f800f8;
		// c[3] = c00.sra16(15);

		movdqa(xmm3, xmm6);
		psraw(xmm3, 15); // a

		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 11);
		psllw(xmm0, 3); // 0x00f8

		movdqa(xmm4, xmm6);
		psllw(xmm4, 3);
		pand(xmm4, xmm0); // r

		movdqa(xmm5, xmm6);
		psrlw(xmm5, 2);
		pand(xmm5, xmm0); // g

		// b is produced in place, so xmm6 no longer holds c00 after this

		psrlw(xmm6, 7);
		pand(xmm6, xmm0); // b
	}
}
void GPUDrawScanlineCodeGenerator::Step()
{
	// Emits the code that advances the scanline state by one group of eight
	// pixels: the remaining count in ecx, the frame buffer pointer in edi, and
	// the running texture coordinates / colors kept in m_local.temp.
	// On exit xmm2/xmm3 hold s/t (when tme) and xmm4/xmm5/xmm6 hold r/g/b
	// (when tfx is not decal).

	// steps -= 8;

	sub(ecx, 8);

	// fb += 8; (eight 16-bit pixels)

	add(edi, 8 * sizeof(uint16));

	if(m_sel.tme)
	{
		// GSVector4i st = m_local.d8.st;

		movdqa(xmm4, ptr[&m_local.d8.st]);

		// s = s.add16(st.xxxx()); written back to temp.s

		pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
		paddw(xmm2, ptr[&m_local.temp.s]);
		movdqa(ptr[&m_local.temp.s], xmm2);

		// t = t.add16(st.yyyy()); written back to temp.t

		// TODO: if(!sprite) ... else reload t

		pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
		paddw(xmm3, ptr[&m_local.temp.t]);
		movdqa(ptr[&m_local.temp.t], xmm3);
	}

	if(m_sel.tfx == 3) // decal: the color registers are left alone
	{
		return;
	}

	if(!m_sel.iip)
	{
		// constant color: only reload r, g, b from temp

		movdqa(xmm4, ptr[&m_local.temp.r]);
		movdqa(xmm5, ptr[&m_local.temp.g]);
		movdqa(xmm6, ptr[&m_local.temp.b]);

		return;
	}

	// GSVector4i c = m_local.d8.c;

	movdqa(xmm6, ptr[&m_local.d8.c]);

	// r = r.add16(c.xxxx());
	// g = g.add16(c.yyyy());
	// b = b.add16(c.zzzz());

	pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
	pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
	pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));

	paddw(xmm4, ptr[&m_local.temp.r]);
	paddw(xmm5, ptr[&m_local.temp.g]);
	paddw(xmm6, ptr[&m_local.temp.b]);

	// keep the advanced values for the next step

	movdqa(ptr[&m_local.temp.r], xmm4);
	movdqa(ptr[&m_local.temp.g], xmm5);
	movdqa(ptr[&m_local.temp.b], xmm6);
}
void GPUDrawScanlineCodeGenerator::Init()
{
	// Emits the scanline prologue: computes the frame buffer pointer (edi) and
	// the step counter (ecx), and seeds the running values in m_local.temp
	// (dither, s/t, r/g/b) from the start vertex.
	// Per the expressions below, edx carries "left" and ecx carries the pixel
	// count on entry; "top" and the vertex pointer are read from the stack.

	mov(eax, dword[esp + _top]);

	// uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left;

	mov(edi, eax);
	shl(edi, 10 + m_sel.scalex);
	add(edi, edx);
	lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]);

	// int steps = pixels - 8;

	sub(ecx, 8);

	if(m_sel.dtd)
	{
		// dither = GSVector4i::load<false>(&m_dither[top & 3][left & 3]);
		// row offset is (top & 3) * 32 bytes, column offset (left & 3) * 2 bytes;
		// the load is unaligned (movdqu), eax and edx are consumed here

		and(eax, 3);
		shl(eax, 5);
		and(edx, 3);
		shl(edx, 1);
		movdqu(xmm0, ptr[eax + edx + (size_t)m_dither]);
		movdqa(ptr[&m_local.temp.dither], xmm0);
	}

	// from here on edx points at the start vertex

	mov(edx, dword[esp + _v]);

	if(m_sel.tme)
	{
		mov(esi, dword[&m_local.gd->tex]);

		// GSVector4i vt = GSVector4i(v.t).xxzzl();

		cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]);
		pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));

		// s = vt.xxxx().add16(m_local.d.s);
		// t = vt.yyyy().add16(m_local.d.t);

		pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
		pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));

		paddw(xmm2, ptr[&m_local.d.s]);

		if(!m_sel.sprite)
		{
			paddw(xmm3, ptr[&m_local.d.t]);
		}
		else if(m_sel.ltf)
		{
			// sprites get no per-pixel t delta here, so the vertical fraction
			// is computed once: vf = (t << 8) >> 1 per 16-bit lane

			movdqa(xmm0, xmm3);
			psllw(xmm0, 8);
			psrlw(xmm0, 1);
			movdqa(ptr[&m_local.temp.vf], xmm0);
		}

		movdqa(ptr[&m_local.temp.s], xmm2);
		movdqa(ptr[&m_local.temp.t], xmm3);
	}

	if(m_sel.tfx == 3) // decal: no color setup is emitted
	{
		return;
	}

	// GSVector4i vc = GSVector4i(v.c).xxzzlh();

	cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]);
	pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
	pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));

	// r = vc.xxxx();
	// g = vc.yyyy();
	// b = vc.zzzz();

	pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
	pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
	pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));

	if(m_sel.iip)
	{
		// r = r.add16(m_local.d.r);
		// g = g.add16(m_local.d.g);
		// b = b.add16(m_local.d.b);

		paddw(xmm4, ptr[&m_local.d.r]);
		paddw(xmm5, ptr[&m_local.d.g]);
		paddw(xmm6, ptr[&m_local.d.b]);
	}

	movdqa(ptr[&m_local.temp.r], xmm4);
	movdqa(ptr[&m_local.temp.g], xmm5);
	movdqa(ptr[&m_local.temp.b], xmm6);
}
// Emits the instruction sequence of the fast_log intrinsic (two-temporary overload).
//
// Input:  the double in xmm0 (also spilled to [rsp + 0]).
// Output: the result in xmm0 when control reaches B1_5.
// Scratch: xmm1-xmm7, eax, ecx, edx, tmp1, tmp2 and a 24-byte stack frame that is
// released again before the sequence ends.
// The lookup table and coefficients come from _L_tbl, _log2 and _coeff, which are
// defined outside this view.
// NOTE(review): the body also names rax, rdx and rcx directly, so the eax/edx/ecx
// arguments are presumably expected to be those same registers - confirm with the caller.
// NOTE(review): L_2TAG_PACKET_12_0_2 and L_2TAG_PACKET_13_0_2 are declared but never
// bound or targeted here, and B1_3 is bound but never jumped to.
void MacroAssembler::fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp1, Register tmp2) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2;
  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp1, tmp2, eax, ecx, edx);
  jmp(start);
  address L_tbl = (address)_L_tbl;
  address log2 = (address)_log2;
  address coeff = (address)_coeff;

  bind(start);
  // stack frame: [rsp + 0] = argument, [rsp + 8] = special-case result, [rsp + 16] = status code
  subq(rsp, 24);
  movsd(Address(rsp, 0), xmm0);
  mov64(rax, 0x3ff0000000000000);
  movdq(xmm2, rax);
  mov64(rdx, 0x77f0000000000000);
  movdq(xmm3, rdx);
  movl(ecx, 32768);
  movdl(xmm4, rcx);
  mov64(tmp1, 0xffffe00000000000);
  movdq(xmm5, tmp1);
  movdqu(xmm1, xmm0);
  // eax = top 16 bits of the argument (sign, 11 exponent bits, 4 mantissa bits)
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  // ecx = 16352 (0x3fe0); subtracted from the masked exponent word at subl(eax, ecx) below
  movl(ecx, 16352);
  psrlq(xmm0, 27);
  lea(tmp2, ExternalAddress(L_tbl));
  psrld(xmm0, 2);
  rcpps(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  // unsigned range check: the main path is taken only when 16 <= eax < 32752,
  // i.e. sign clear and exponent field neither all zeros nor all ones
  subl(eax, 16);
  cmpl(eax, 32736);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);

  // main path; also re-entered from L_2TAG_PACKET_3_0_2 after rescaling
  bind(L_2TAG_PACKET_1_0_2);
  paddd(xmm0, xmm4);
  por(xmm1, xmm3);
  movdl(edx, xmm0);
  psllq(xmm0, 29);
  pand(xmm5, xmm1);
  pand(xmm0, xmm6);
  subsd(xmm1, xmm5);
  mulpd(xmm5, xmm0);
  andl(eax, 32752);
  subl(eax, ecx);
  cvtsi2sdl(xmm7, eax);
  mulsd(xmm1, xmm0);
  movq(xmm6, ExternalAddress(log2));       // 0xfefa3800UL, 0x3fa62e42UL
  movdqu(xmm3, ExternalAddress(coeff));    // 0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL
  subsd(xmm5, xmm2);
  // edx becomes a byte offset into the table addressed by tmp2
  andl(edx, 16711680);
  shrl(edx, 12);
  movdqu(xmm0, Address(tmp2, edx));
  movdqu(xmm4, ExternalAddress(16 + coeff)); // 0x3d6fb175UL, 0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL
  addsd(xmm1, xmm5);
  movdqu(xmm2, ExternalAddress(32 + coeff)); // 0x9999999aUL, 0x3fc99999UL, 0x00000000UL, 0xbfe00000UL
  mulsd(xmm6, xmm7);
  movddup(xmm5, xmm1);
  mulsd(xmm7, ExternalAddress(8 + log2));    // 0x93c76730UL, 0x3ceef357UL
  mulsd(xmm3, xmm1);
  addsd(xmm0, xmm6);
  mulpd(xmm4, xmm5);
  mulpd(xmm5, xmm5);
  movddup(xmm6, xmm0);
  addsd(xmm0, xmm1);
  addpd(xmm4, xmm2);
  mulpd(xmm3, xmm5);
  subsd(xmm6, xmm0);
  mulsd(xmm4, xmm1);
  pshufd(xmm2, xmm0, 238);
  addsd(xmm1, xmm6);
  mulsd(xmm5, xmm5);
  addsd(xmm7, xmm2);
  addpd(xmm4, xmm3);
  addsd(xmm1, xmm7);
  mulpd(xmm4, xmm5);
  addsd(xmm1, xmm4);
  pshufd(xmm5, xmm4, 238);
  addsd(xmm1, xmm5);
  addsd(xmm0, xmm1);
  jmp(B1_5);

  // special inputs: reload the argument and classify it by its top word
  bind(L_2TAG_PACKET_0_0_2);
  movq(xmm0, Address(rsp, 0));
  movq(xmm1, Address(rsp, 0));
  addl(eax, 16);
  // eax >= 32768: sign bit set
  cmpl(eax, 32768);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_2_0_2);
  // eax < 16: exponent field is zero (zero or subnormal)
  cmpl(eax, 16);
  jcc(Assembler::below, L_2TAG_PACKET_3_0_2);

  // remaining case (sign clear, exponent all ones), and NaNs routed here from
  // L_2TAG_PACKET_5_0_2: the result is x + x
  bind(L_2TAG_PACKET_4_0_2);
  addsd(xmm0, xmm0);
  jmp(B1_5);

  // only reached from the jcc after cmpl(ecx, -2097152) in L_2TAG_PACKET_2_0_2, so
  // the first jcc below still tests the flags of that compare
  bind(L_2TAG_PACKET_5_0_2);
  jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
  cmpl(edx, 0);
  jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
  jmp(L_2TAG_PACKET_6_0_2);

  // exponent field zero: an all-zero bit pattern goes to L_2TAG_PACKET_7_0_2; anything
  // else is multiplied by the double whose top word is 18416 (0x47f0), the setup
  // sequence is repeated with ecx = 18416, and the main path is re-entered
  bind(L_2TAG_PACKET_3_0_2);
  xorpd(xmm1, xmm1);
  addsd(xmm1, xmm0);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);
  xorpd(xmm1, xmm1);
  movl(eax, 18416);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movdqu(xmm1, xmm0);
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  psrlq(xmm0, 27);
  movl(ecx, 18416);
  psrld(xmm0, 2);
  rcpps(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  jmp(L_2TAG_PACKET_1_0_2);

  // sign bit set: edx = low dword, ecx = high dword shifted left by one (sign dropped)
  bind(L_2TAG_PACKET_2_0_2);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  addl(ecx, ecx);
  // -2097152 == 0xffe00000: exponent field all ones
  cmpl(ecx, -2097152);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_5_0_2);
  // all remaining bits zero: the argument is -0.0
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);

  // result = 0.0 * (double with top word 32752 = 0x7ff0); status code 3
  bind(L_2TAG_PACKET_6_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 32752);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movl(Address(rsp, 16), 3);
  jmp(L_2TAG_PACKET_8_0_2);
  // result = (double with top word 49136 = 0xbff0) / 0.0; status code 2
  bind(L_2TAG_PACKET_7_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 49136);
  pinsrw(xmm0, eax, 3);
  divsd(xmm0, xmm1);
  movl(Address(rsp, 16), 2);

  // the special-case result makes a round trip through [rsp + 8]; the status code
  // stored at [rsp + 16] is not read back anywhere in this sequence
  bind(L_2TAG_PACKET_8_0_2);
  movq(Address(rsp, 8), xmm0);

  bind(B1_3);
  movq(xmm0, Address(rsp, 8));

  // common exit: release the stack frame
  bind(B1_5);
  addq(rsp, 24);
}
// Emits the instruction sequence of the fast_exp intrinsic.
//
// Input:  the double in xmm0 (also spilled to [rsp + 8]).
// Output: the result in xmm0 when control reaches B1_5.
// Scratch: xmm1-xmm7, eax, ecx, edx, tmp and a 24-byte stack frame that is released
// again before the sequence ends.
// All constants (_cv, _shifter, _mmask, _bias, _Tbl_addr, _ALLONES, _ebias, _XMAX,
// _XMIN, _INF, _ZERO, _ONE_val) are defined outside this view; the hex values quoted
// next to each load are the original annotations.
// NOTE(review): B1_3 is bound but never jumped to in this sequence.
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
  Label L_2TAG_PACKET_12_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp, eax, ecx, edx);
  jmp(start);
  address cv = (address)_cv;
  address Shifter = (address)_shifter;
  address mmask = (address)_mmask;
  address bias = (address)_bias;
  address Tbl_addr = (address)_Tbl_addr;
  address ALLONES = (address)_ALLONES;
  address ebias = (address)_ebias;
  address XMAX = (address)_XMAX;
  address XMIN = (address)_XMIN;
  address INF = (address)_INF;
  address ZERO = (address)_ZERO;
  address ONE_val = (address)_ONE_val;

  bind(start);
  // stack frame: [rsp + 0] = status code, [rsp + 8] = argument, [rsp + 16] = special-case result
  subq(rsp, 24);
  movsd(Address(rsp, 8), xmm0);
  unpcklpd(xmm0, xmm0);
  movdqu(xmm1, ExternalAddress(cv));       // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
  movdqu(xmm6, ExternalAddress(Shifter));  // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
  movdqu(xmm2, ExternalAddress(16+cv));    // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
  movdqu(xmm3, ExternalAddress(32+cv));    // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
  // eax = top 16 bits of the argument with the sign bit masked off
  pextrw(eax, xmm0, 3);
  andl(eax, 32767);
  // range check: (16527 - eax) | (eax - 15504) has its sign bit set when either
  // difference is negative, so the main path requires 15504 <= eax <= 16527
  movl(edx, 16527);
  subl(edx, eax);
  subl(eax, 15504);
  orl(edx, eax);
  cmpl(edx, INT_MIN);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
  mulpd(xmm1, xmm0);
  addpd(xmm1, xmm6);
  movapd(xmm7, xmm1);
  subpd(xmm1, xmm6);
  mulpd(xmm2, xmm1);
  movdqu(xmm4, ExternalAddress(64+cv));    // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
  mulpd(xmm3, xmm1);
  movdqu(xmm5, ExternalAddress(80+cv));    // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
  subpd(xmm0, xmm2);
  // low dword of xmm7 is split: low 6 bits * 16 -> table byte offset in ecx,
  // remaining bits (arithmetic shift by 6) -> eax and edx
  movdl(eax, xmm7);
  movl(ecx, eax);
  andl(ecx, 63);
  shll(ecx, 4);
  sarl(eax, 6);
  movl(edx, eax);
  movdqu(xmm6, ExternalAddress(mmask));    // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
  pand(xmm7, xmm6);
  movdqu(xmm6, ExternalAddress(bias));     // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
  paddq(xmm7, xmm6);
  psllq(xmm7, 46);
  subpd(xmm0, xmm3);
  lea(tmp, ExternalAddress(Tbl_addr));
  movdqu(xmm2, Address(ecx,tmp));
  mulpd(xmm4, xmm0);
  movapd(xmm6, xmm0);
  movapd(xmm1, xmm0);
  mulpd(xmm6, xmm6);
  mulpd(xmm0, xmm6);
  addpd(xmm5, xmm4);
  mulsd(xmm0, xmm6);
  mulpd(xmm6, ExternalAddress(48+cv));     // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
  addsd(xmm1, xmm2);
  unpckhpd(xmm2, xmm2);
  mulpd(xmm0, xmm5);
  addsd(xmm1, xmm0);
  por(xmm2, xmm7);
  unpckhpd(xmm0, xmm0);
  addsd(xmm0, xmm1);
  addsd(xmm0, xmm6);
  // unsigned check on edx + 894: values above 1916 take the scaling path below
  addl(edx, 894);
  cmpl(edx, 1916);
  jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
  mulsd(xmm0, xmm2);
  addsd(xmm0, xmm2);
  jmp (B1_5);

  // scaling path for results outside the range handled directly above
  bind(L_2TAG_PACKET_1_0_2);
  xorpd(xmm3, xmm3);
  movdqu(xmm4, ExternalAddress(ALLONES));  // 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
  movl(edx, -1022);
  subl(edx, eax);
  movdl(xmm5, edx);
  psllq(xmm4, xmm5);
  movl(ecx, eax);
  sarl(eax, 1);
  pinsrw(xmm3, eax, 3);
  movdqu(xmm6, ExternalAddress(ebias));    // 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x3ff00000UL
  psllq(xmm3, 4);
  psubd(xmm2, xmm3);
  mulsd(xmm0, xmm2);
  cmpl(edx, 52);
  jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
  pand(xmm4, xmm2);
  paddd(xmm3, xmm6);
  subsd(xmm2, xmm4);
  addsd(xmm0, xmm2);
  cmpl(ecx, 1023);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32768);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
  movapd(xmm6, xmm0);
  addsd(xmm0, xmm4);
  mulsd(xmm0, xmm3);
  // an exponent field of zero in the product sends control to L_2TAG_PACKET_5_0_2
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  cmpl(ecx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_5_0_2);
  jmp(B1_5);

  // status code 15 is stored before leaving through L_2TAG_PACKET_6_0_2
  bind(L_2TAG_PACKET_5_0_2);
  mulsd(xmm6, xmm3);
  mulsd(xmm4, xmm3);
  movdqu(xmm0, xmm6);
  pxor(xmm6, xmm4);
  psrad(xmm6, 31);
  pshufd(xmm6, xmm6, 85);
  psllq(xmm0, 1);
  psrlq(xmm0, 1);
  pxor(xmm0, xmm6);
  psrlq(xmm6, 63);
  paddq(xmm0, xmm6);
  paddq(xmm0, xmm4);
  movl(Address(rsp,0), 15);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  addsd(xmm0, xmm4);
  mulsd(xmm0, xmm3);
  jmp(B1_5);

  // an all-ones exponent field in the product sends control to L_2TAG_PACKET_7_0_2 (status 14)
  bind(L_2TAG_PACKET_3_0_2);
  addsd(xmm0, xmm4);
  mulsd(xmm0, xmm3);
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  cmpl(ecx, 32752);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
  jmp(B1_5);

  bind(L_2TAG_PACKET_2_0_2);
  paddd(xmm3, xmm6);
  addpd(xmm0, xmm2);
  mulsd(xmm0, xmm3);
  movl(Address(rsp,0), 15);
  jmp(L_2TAG_PACKET_6_0_2);

  // reached from L_2TAG_PACKET_0_0_2 with eax = high dword of |x| >= 0x40900000.
  // 2146435072 == 0x7ff00000: exponent all ones goes to L_2TAG_PACKET_9_0_2.
  // Otherwise the sign of the argument picks XMIN * XMIN (sign set) or XMAX * XMAX.
  bind(L_2TAG_PACKET_8_0_2);
  cmpl(eax, 2146435072);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);
  movl(eax, Address(rsp,12));
  cmpl(eax, INT_MIN);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_10_0_2);
  movsd(xmm0, ExternalAddress(XMAX));      // 0xffffffffUL, 0x7fefffffUL
  mulsd(xmm0, xmm0);

  // status code 14; also the fall-through from the XMAX * XMAX case above
  bind(L_2TAG_PACKET_7_0_2);
  movl(Address(rsp,0), 14);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_10_0_2);
  movsd(xmm0, ExternalAddress(XMIN));      // 0x00000000UL, 0x00100000UL
  mulsd(xmm0, xmm0);
  movl(Address(rsp,0), 15);
  jmp(L_2TAG_PACKET_6_0_2);

  // exponent all ones: a non-zero mantissa (high part above 0x7ff00000 or low dword
  // non-zero) goes to L_2TAG_PACKET_11_0_2; otherwise the sign selects INF or ZERO
  bind(L_2TAG_PACKET_9_0_2);
  movl(edx, Address(rsp,8));
  cmpl(eax, 2146435072);
  jcc(Assembler::above, L_2TAG_PACKET_11_0_2);
  cmpl(edx, 0);
  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
  movl(eax, Address(rsp,12));
  cmpl(eax, 2146435072);
  jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_2);
  movsd(xmm0, ExternalAddress(INF));       // 0x00000000UL, 0x7ff00000UL
  jmp(B1_5);

  bind(L_2TAG_PACKET_12_0_2);
  movsd(xmm0, ExternalAddress(ZERO));      // 0x00000000UL, 0x00000000UL
  jmp(B1_5);

  // result = x + x, computed from the saved argument
  bind(L_2TAG_PACKET_11_0_2);
  movsd(xmm0, Address(rsp, 8));
  addsd(xmm0, xmm0);
  jmp(B1_5);

  // argument outside the main-path range: eax = high dword of |x|.
  // 1083179008 == 0x40900000 sends large magnitudes to L_2TAG_PACKET_8_0_2;
  // everything else returns x + ONE_val
  bind(L_2TAG_PACKET_0_0_2);
  movl(eax, Address(rsp, 12));
  andl(eax, 2147483647);
  cmpl(eax, 1083179008);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_8_0_2);
  movsd(Address(rsp, 8), xmm0);
  addsd(xmm0, ExternalAddress(ONE_val));   // 0x00000000UL, 0x3ff00000UL
  jmp(B1_5);

  // the special-case result makes a round trip through [rsp + 16]; the status code
  // stored at [rsp + 0] is not read back anywhere in this sequence
  bind(L_2TAG_PACKET_6_0_2);
  movq(Address(rsp, 16), xmm0);

  bind(B1_3);
  movq(xmm0, Address(rsp, 16));

  // common exit: release the stack frame
  bind(B1_5);
  addq(rsp, 24);
}
// Emits the instruction sequence of the fast_log intrinsic (single-temporary overload).
//
// Input:  the double at Address(rsp, 112), addressed after 104 bytes have been reserved.
// Output: the result is pushed onto the x87 stack with fld_d.
// Scratch: xmm0-xmm7, eax, ecx, edx. tmp is saved at [rsp + 40], used as the base of
// _static_const_table_log (defined outside this view) and restored at the end.
// NOTE(review): the 104 bytes reserved by subl(rsp, 104) are not released in this
// sequence; presumably the caller's epilogue restores rsp - confirm.
// NOTE(review): edx is loaded with the codes 3 and 2 on the special paths but is not
// read afterwards in this sequence.
void MacroAssembler::fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2;
  Label L_2TAG_PACKET_10_0_2, start;

  assert_different_registers(tmp, eax, ecx, edx);
  jmp(start);
  address static_const_table = (address)_static_const_table_log;

  bind(start);
  subl(rsp, 104);
  // preserve tmp; it is reloaded at L_2TAG_PACKET_10_0_2
  movl(Address(rsp, 40), tmp);
  lea(tmp, ExternalAddress(static_const_table));
  // xmm2 = double with top word 16368 (0x3ff0), xmm3 = double with top word 30704 (0x77f0)
  xorpd(xmm2, xmm2);
  movl(eax, 16368);
  pinsrw(xmm2, eax, 3);
  xorpd(xmm3, xmm3);
  movl(edx, 30704);
  pinsrw(xmm3, edx, 3);
  movsd(xmm0, Address(rsp, 112));
  movapd(xmm1, xmm0);
  movl(ecx, 32768);
  movdl(xmm4, ecx);
  movsd(xmm5, Address(tmp, 2128));         // 0x00000000UL, 0xffffe000UL
  // eax = top 16 bits of the argument (sign, 11 exponent bits, 4 mantissa bits)
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  psllq(xmm0, 5);
  // ecx = 16352 (0x3fe0); subtracted from the masked exponent word at subl(eax, ecx) below
  movl(ecx, 16352);
  psrlq(xmm0, 34);
  rcpss(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  // unsigned range check: the main path is taken only when 16 <= eax < 32752,
  // i.e. sign clear and exponent field neither all zeros nor all ones
  subl(eax, 16);
  cmpl(eax, 32736);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);

  // main path; also re-entered from L_2TAG_PACKET_4_0_2 after rescaling
  bind(L_2TAG_PACKET_1_0_2);
  paddd(xmm0, xmm4);
  por(xmm1, xmm3);
  movdl(edx, xmm0);
  psllq(xmm0, 29);
  pand(xmm5, xmm1);
  pand(xmm0, xmm6);
  subsd(xmm1, xmm5);
  mulpd(xmm5, xmm0);
  andl(eax, 32752);
  subl(eax, ecx);
  cvtsi2sdl(xmm7, eax);
  mulsd(xmm1, xmm0);
  movsd(xmm6, Address(tmp, 2064));         // 0xfefa3800UL, 0x3fa62e42UL
  movdqu(xmm3, Address(tmp, 2080));        // 0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL
  subsd(xmm5, xmm2);
  // edx becomes a byte offset into the table addressed by tmp
  andl(edx, 16711680);
  shrl(edx, 12);
  movdqu(xmm0, Address(tmp, edx));
  movdqu(xmm4, Address(tmp, 2096));        // 0x3d6fb175UL, 0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL
  addsd(xmm1, xmm5);
  movdqu(xmm2, Address(tmp, 2112));        // 0x9999999aUL, 0x3fc99999UL, 0x00000000UL, 0xbfe00000UL
  mulsd(xmm6, xmm7);
  pshufd(xmm5, xmm1, 68);
  mulsd(xmm7, Address(tmp, 2072));         // 0x93c76730UL, 0x3ceef357UL, 0x92492492UL, 0x3fc24924UL
  mulsd(xmm3, xmm1);
  addsd(xmm0, xmm6);
  mulpd(xmm4, xmm5);
  mulpd(xmm5, xmm5);
  pshufd(xmm6, xmm0, 228);
  addsd(xmm0, xmm1);
  addpd(xmm4, xmm2);
  mulpd(xmm3, xmm5);
  subsd(xmm6, xmm0);
  mulsd(xmm4, xmm1);
  pshufd(xmm2, xmm0, 238);
  addsd(xmm1, xmm6);
  mulsd(xmm5, xmm5);
  addsd(xmm7, xmm2);
  addpd(xmm4, xmm3);
  addsd(xmm1, xmm7);
  mulpd(xmm4, xmm5);
  addsd(xmm1, xmm4);
  pshufd(xmm5, xmm4, 238);
  addsd(xmm1, xmm5);
  addsd(xmm0, xmm1);
  jmp(L_2TAG_PACKET_2_0_2);

  // special inputs: reload the argument and classify it by its top word
  bind(L_2TAG_PACKET_0_0_2);
  movsd(xmm0, Address(rsp, 112));
  movdqu(xmm1, xmm0);
  addl(eax, 16);
  // eax >= 32768: sign bit set
  cmpl(eax, 32768);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_3_0_2);
  // eax < 16: exponent field is zero (zero or subnormal)
  cmpl(eax, 16);
  jcc(Assembler::below, L_2TAG_PACKET_4_0_2);

  // remaining case (sign clear, exponent all ones), and NaNs routed here from
  // L_2TAG_PACKET_6_0_2: the result is x + x
  bind(L_2TAG_PACKET_5_0_2);
  addsd(xmm0, xmm0);
  jmp(L_2TAG_PACKET_2_0_2);

  // only reached from the jcc after cmpl(ecx, -2097152) in L_2TAG_PACKET_3_0_2, so
  // the first jcc below still tests the flags of that compare
  bind(L_2TAG_PACKET_6_0_2);
  jcc(Assembler::above, L_2TAG_PACKET_5_0_2);
  cmpl(edx, 0);
  jcc(Assembler::above, L_2TAG_PACKET_5_0_2);
  jmp(L_2TAG_PACKET_7_0_2);

  // sign bit set: edx = low dword, ecx = high dword shifted left by one (sign dropped)
  bind(L_2TAG_PACKET_3_0_2);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  addl(ecx, ecx);
  // -2097152 == 0xffe00000: exponent field all ones
  cmpl(ecx, -2097152);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_6_0_2);
  // all remaining bits zero: the argument is -0.0
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_8_0_2);

  // result = 0.0 * (double with top word 32752 = 0x7ff0); code 3 in edx
  bind(L_2TAG_PACKET_7_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 32752);
  pinsrw(xmm1, eax, 3);
  movl(edx, 3);
  mulsd(xmm0, xmm1);

  // special-case exit: the result goes through [rsp + 0] onto the x87 stack and
  // xmm0 is reloaded with the original argument
  bind(L_2TAG_PACKET_9_0_2);
  movsd(Address(rsp, 0), xmm0);
  movsd(xmm0, Address(rsp, 112));
  fld_d(Address(rsp, 0));
  jmp(L_2TAG_PACKET_10_0_2);

  // result = (double with top word 49136 = 0xbff0) / 0.0; code 2 in edx
  bind(L_2TAG_PACKET_8_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 49136);
  pinsrw(xmm0, eax, 3);
  divsd(xmm0, xmm1);
  movl(edx, 2);
  jmp(L_2TAG_PACKET_9_0_2);

  // exponent field zero: an all-zero bit pattern goes to L_2TAG_PACKET_8_0_2; anything
  // else is multiplied by the double whose top word is 18416 (0x47f0), the setup
  // sequence is repeated with ecx = 18416, and the main path is re-entered
  bind(L_2TAG_PACKET_4_0_2);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_8_0_2);
  xorpd(xmm1, xmm1);
  movl(eax, 18416);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movapd(xmm1, xmm0);
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  psllq(xmm0, 5);
  movl(ecx, 18416);
  psrlq(xmm0, 34);
  rcpss(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  jmp(L_2TAG_PACKET_1_0_2);

  // normal exit: move the result from xmm0 to the x87 stack through [rsp + 24]
  bind(L_2TAG_PACKET_2_0_2);
  movsd(Address(rsp, 24), xmm0);
  fld_d(Address(rsp, 24));

  // common exit: restore tmp
  bind(L_2TAG_PACKET_10_0_2);
  movl(tmp, Address(rsp, 40));
}
// Emits the texture sampling code for one group of pixels.
//
// Nothing is emitted when the selector has no frame buffer (fb) or no texture
// function (tfx == TFX_NONE).
//
// Register usage, as tracked by the inline comments below:
//   in:  xmm2 = u, xmm3 = v (converted from float with cvttps2dq when !fst)
//   ebx is loaded with m_env.tex
//   xmm7 is marked "used" throughout and is never written here
//
// With ltf set, four texels are fetched through ReadTexel and combined with
// three lerp16<0> stages using the fractions stored in m_env.temp.uf/vf.
// Without ltf a single texel is fetched and split into xmm5 (low byte of
// every 16-bit lane) and xmm6 (high byte of every 16-bit lane).
// NOTE(review): the ltf path presumably leaves its result in xmm5/xmm6 as
// well, which depends on lerp16 writing its first operand - confirm against
// lerp16.
void GSDrawScanlineCodeGenerator::SampleTexture()
{
    if(!m_sel.fb || m_sel.tfx == TFX_NONE)
    {
        return;
    }

    mov(ebx, dword[&m_env.tex]);

    // ebx = tex

    if(!m_sel.fst)
    {
        // TODO: move these into Init/Step too?

        // u and v arrive as floats on this path; truncate them to integers

        cvttps2dq(xmm2, xmm2);
        cvttps2dq(xmm3, xmm3);

        if(m_sel.ltf)
        {
            // u -= 0x8000;
            // v -= 0x8000;

            // broadcast the constant 0x8000 to all four dwords of xmm4

            mov(eax, 0x8000);
            movd(xmm4, eax);
            pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
            psubd(xmm2, xmm4);
            psubd(xmm3, xmm4);
        }
    }

    // xmm2 = u
    // xmm3 = v

    if(m_sel.ltf)
    {
        // GSVector4i uf = u.xxzzlh().srl16(1);

        // duplicate the low word of each dword into both halves, then drop one bit

        movdqa(xmm0, xmm2);
        pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
        pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
        psrlw(xmm0, 1);
        movdqa(xmmword[&m_env.temp.uf], xmm0);

        // vf is only computed here for non-sprites; for sprites Init() stores
        // m_env.temp.vf on its fst + ltf path

        if(!m_sel.sprite)
        {
            // GSVector4i vf = v.xxzzlh().srl16(1);

            movdqa(xmm1, xmm3);
            pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
            pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
            psrlw(xmm1, 1);
            movdqa(xmmword[&m_env.temp.vf], xmm1);
        }
    }

    // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));

    // after the pack: low four words = integer u, high four words = integer v

    psrad(xmm2, 16);
    psrad(xmm3, 16);
    packssdw(xmm2, xmm3);

    if(m_sel.ltf)
    {
        // GSVector4i uv1 = uv0.add16(GSVector4i::x0001());

        // pcmpeqd + psrlw 15 builds the constant 0x0001 in every word

        movdqa(xmm3, xmm2);
        pcmpeqd(xmm1, xmm1);
        psrlw(xmm1, 15);
        paddw(xmm3, xmm1);

        // uv0 = Wrap(uv0);
        // uv1 = Wrap(uv1);

        Wrap(xmm2, xmm3);
    }
    else
    {
        // uv0 = Wrap(uv0);

        Wrap(xmm2);
    }

    // xmm2 = uv0
    // xmm3 = uv1 (ltf)
    // xmm0, xmm1, xmm4, xmm5, xmm6 = free
    // xmm7 = used

    // GSVector4i y0 = uv0.uph16() << tw;
    // GSVector4i x0 = uv0.upl16();

    // xmm0 is zeroed so the unpacks below zero-extend words to dwords

    pxor(xmm0, xmm0);
    movd(xmm1, ptr[&m_env.tw]);

    movdqa(xmm4, xmm2);
    punpckhwd(xmm2, xmm0);
    punpcklwd(xmm4, xmm0);
    pslld(xmm2, xmm1);

    // xmm0 = 0
    // xmm1 = tw
    // xmm2 = y0
    // xmm3 = uv1 (ltf)
    // xmm4 = x0
    // xmm5, xmm6 = free
    // xmm7 = used

    if(m_sel.ltf)
    {
        // GSVector4i y1 = uv1.uph16() << tw;
        // GSVector4i x1 = uv1.upl16();

        movdqa(xmm6, xmm3);
        punpckhwd(xmm3, xmm0);
        punpcklwd(xmm6, xmm0);
        pslld(xmm3, xmm1);

        // xmm2 = y0
        // xmm3 = y1
        // xmm4 = x0
        // xmm6 = x1
        // xmm0, xmm5 = free
        // xmm7 = used

        // GSVector4i addr00 = y0 + x0;
        // GSVector4i addr01 = y0 + x1;
        // GSVector4i addr10 = y1 + x0;
        // GSVector4i addr11 = y1 + x1;

        movdqa(xmm5, xmm2);
        paddd(xmm5, xmm4);
        paddd(xmm2, xmm6);

        movdqa(xmm0, xmm3);
        paddd(xmm0, xmm4);
        paddd(xmm3, xmm6);

        // xmm5 = addr00
        // xmm2 = addr01
        // xmm0 = addr10
        // xmm3 = addr11
        // xmm1, xmm4, xmm6 = free
        // xmm7 = used

        // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
        // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
        // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
        // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);

        // NOTE(review): each "= free" list between the ReadTexel calls names
        // the last three registers handed to the call that follows it, so it
        // appears to describe what that call may consume - confirm against
        // ReadTexel's parameter roles.

        ReadTexel(xmm6, xmm5, xmm1, xmm4);

        // xmm2, xmm5, xmm1 = free

        ReadTexel(xmm4, xmm2, xmm5, xmm1);

        // xmm0, xmm2, xmm5 = free

        ReadTexel(xmm1, xmm0, xmm2, xmm5);

        // xmm3, xmm0, xmm2 = free

        ReadTexel(xmm5, xmm3, xmm0, xmm2);

        // xmm6 = c00
        // xmm4 = c01
        // xmm1 = c10
        // xmm5 = c11
        // xmm0, xmm2, xmm3 = free
        // xmm7 = used

        movdqa(xmm0, xmmword[&m_env.temp.uf]);

        // GSVector4i rb00 = c00 & mask;
        // GSVector4i ga00 = (c00 >> 8) & mask;

        // the psllw/psrlw pair keeps the low byte of every word (the "& mask"),
        // the lone psrlw keeps the high byte

        movdqa(xmm2, xmm6);
        psllw(xmm2, 8);
        psrlw(xmm2, 8);
        psrlw(xmm6, 8);

        // GSVector4i rb01 = c01 & mask;
        // GSVector4i ga01 = (c01 >> 8) & mask;

        movdqa(xmm3, xmm4);
        psllw(xmm3, 8);
        psrlw(xmm3, 8);
        psrlw(xmm4, 8);

        // xmm0 = uf
        // xmm2 = rb00
        // xmm3 = rb01
        // xmm6 = ga00
        // xmm4 = ga01
        // xmm1 = c10
        // xmm5 = c11
        // xmm7 = used

        // rb00 = rb00.lerp16<0>(rb01, uf);
        // ga00 = ga00.lerp16<0>(ga01, uf);

        lerp16<0>(xmm3, xmm2, xmm0);
        lerp16<0>(xmm4, xmm6, xmm0);

        // xmm0 = uf
        // xmm3 = rb00
        // xmm4 = ga00
        // xmm1 = c10
        // xmm5 = c11
        // xmm2, xmm6 = free
        // xmm7 = used

        // GSVector4i rb10 = c10 & mask;
        // GSVector4i ga10 = (c10 >> 8) & mask;

        movdqa(xmm2, xmm1);
        psllw(xmm1, 8);
        psrlw(xmm1, 8);
        psrlw(xmm2, 8);

        // GSVector4i rb11 = c11 & mask;
        // GSVector4i ga11 = (c11 >> 8) & mask;

        movdqa(xmm6, xmm5);
        psllw(xmm5, 8);
        psrlw(xmm5, 8);
        psrlw(xmm6, 8);

        // xmm0 = uf
        // xmm3 = rb00
        // xmm4 = ga00
        // xmm1 = rb10
        // xmm5 = rb11
        // xmm2 = ga10
        // xmm6 = ga11
        // xmm7 = used

        // rb10 = rb10.lerp16<0>(rb11, uf);
        // ga10 = ga10.lerp16<0>(ga11, uf);

        lerp16<0>(xmm5, xmm1, xmm0);
        lerp16<0>(xmm6, xmm2, xmm0);

        // xmm3 = rb00
        // xmm4 = ga00
        // xmm5 = rb10
        // xmm6 = ga10
        // xmm0, xmm1, xmm2 = free
        // xmm7 = used

        // rb00 = rb00.lerp16<0>(rb10, vf);
        // ga00 = ga00.lerp16<0>(ga10, vf);

        movdqa(xmm0, xmmword[&m_env.temp.vf]);

        lerp16<0>(xmm5, xmm3, xmm0);
        lerp16<0>(xmm6, xmm4, xmm0);
    }
    else
    {
        // GSVector4i addr00 = y0 + x0;

        paddd(xmm2, xmm4);

        // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);

        ReadTexel(xmm5, xmm2, xmm0, xmm1);

        // GSVector4i mask = GSVector4i::x00ff();

        // c[0] = c00 & mask;
        // c[1] = (c00 >> 8) & mask;

        // xmm5 ends up as c[0], xmm6 as c[1]

        movdqa(xmm6, xmm5);

        psllw(xmm5, 8);
        psrlw(xmm5, 8);
        psrlw(xmm6, 8);
    }
}
// Emits the per-iteration advance of the scanline loop: the step counter and
// the buffer offset move on by one group of four pixels, every interpolant
// cached in m_env.temp is bumped by its m_env.d4 delta, and xmm7 is reloaded
// with the test mask that matches the number of steps left.
void GSDrawScanlineCodeGenerator::Step()
{
    // Selector-derived switches, all resolved while generating code.
    const bool has_fog = m_sel.fwrite && m_sel.fge;
    const bool has_texture = m_sel.tfx != TFX_NONE;
    const bool decal_tcc = m_sel.tfx == TFX_DECAL && m_sel.tcc;

    sub(ecx, 4); // steps -= 4;
    add(edi, 8); // fza_offset++;

    if(m_sel.sprite)
    {
        // Sprites take no per-step z delta; z is just reloaded when tested.
        if(m_sel.ztest)
        {
            movdqa(xmm0, xmmword[&m_env.p.z]);
        }
    }
    else
    {
        if(m_sel.zb)
        {
            // z += m_env.d4.z;
            movaps(xmm0, xmmword[&m_env.temp.z]);
            addps(xmm0, xmmword[&m_env.d4.z]);
            movaps(xmmword[&m_env.temp.z], xmm0);
        }

        if(has_fog)
        {
            // f = f.add16(m_env.d4.f);
            movdqa(xmm1, xmmword[&m_env.temp.f]);
            paddw(xmm1, xmmword[&m_env.d4.f]);
            movdqa(xmmword[&m_env.temp.f], xmm1);
        }
    }

    if(m_sel.fb)
    {
        if(has_texture)
        {
            if(m_sel.fst)
            {
                // Integer texture coordinates.
                //
                // GSVector4i st = m_env.d4.st;
                // si += st.xxxx();
                // if(!sprite) ti += st.yyyy();

                movdqa(xmm4, xmmword[&m_env.d4.st]);

                pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
                paddd(xmm2, xmmword[&m_env.temp.s]);
                movdqa(xmmword[&m_env.temp.s], xmm2);

                if(m_sel.sprite)
                {
                    // t is not stepped for sprites, only fetched again
                    movdqa(xmm3, xmmword[&m_env.temp.t]);
                }
                else
                {
                    pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
                    paddd(xmm3, xmmword[&m_env.temp.t]);
                    movdqa(xmmword[&m_env.temp.t], xmm3);
                }
            }
            else
            {
                // Floating point s, t, q.
                //
                // GSVector4 stq = m_env.d4.stq;
                // s += stq.xxxx();
                // t += stq.yyyy();
                // q += stq.zzzz();

                movaps(xmm2, xmmword[&m_env.d4.stq]);
                movaps(xmm3, xmm2);
                movaps(xmm4, xmm2);

                shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
                shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
                shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));

                addps(xmm2, xmmword[&m_env.temp.s]);
                addps(xmm3, xmmword[&m_env.temp.t]);
                addps(xmm4, xmmword[&m_env.temp.q]);

                movaps(xmmword[&m_env.temp.s], xmm2);
                movaps(xmmword[&m_env.temp.t], xmm3);
                movaps(xmmword[&m_env.temp.q], xmm4);

                // xmm2 = s * rcp(q), xmm3 = t * rcp(q)
                rcpps(xmm4, xmm4);
                mulps(xmm2, xmm4);
                mulps(xmm3, xmm4);
            }
        }

        if(!decal_tcc)
        {
            if(m_sel.iip)
            {
                // GSVector4i c = m_env.d4.c;
                // rb = rb.add16(c.xxxx());
                // ga = ga.add16(c.yyyy());

                movdqa(xmm7, xmmword[&m_env.d4.c]);

                pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1));

                paddw(xmm5, xmmword[&m_env.temp.rb]);
                paddw(xmm6, xmmword[&m_env.temp.ga]);

                movdqa(xmmword[&m_env.temp.rb], xmm5);
                movdqa(xmmword[&m_env.temp.ga], xmm6);
            }
            else if(!has_texture)
            {
                // No interpolation and no texture: reload the constant colour.
                movdqa(xmm5, xmmword[&m_env.c.rb]);
                movdqa(xmm6, xmmword[&m_env.c.ga]);
            }
        }
    }

    // test = m_test[7 + (steps & (steps >> 31))];
    //
    // (steps >> 31) is all ones only for negative steps, so edx ends up as
    // either 0 or steps; the shift by 4 scales it to 16-byte table entries.

    mov(edx, ecx);
    sar(edx, 31);
    and(edx, ecx);
    shl(edx, 4);

    movdqa(xmm7, xmmword[edx + (size_t)&m_test[7]]);
}
void GSDrawScanlineCodeGenerator::Init(int params)
{
    const int _top = params + 4;
    const int _v = params + 8;

    // int skip = left & 3;

    mov(ebx, edx);
    and(edx, 3);

    // left -= skip;

    sub(ebx, edx);

    // int steps = right - left - 4;

    sub(ecx, ebx);
    sub(ecx, 4);

    // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];

    shl(edx, 4);

    movdqa(xmm7, xmmword[edx + (size_t)&m_test[0]]);

    mov(eax, ecx);
    sar(eax, 31);
    and(eax, ecx);
    shl(eax, 4);

    por(xmm7, xmmword[eax + (size_t)&m_test[7]]);

    // GSVector2i* fza_base = &m_env.fzbr[top];

    mov(esi, dword[esp + _top]);
    lea(esi, ptr[esi * 8]);
    add(esi, dword[&m_env.fzbr]);

    // GSVector2i* fza_offset = &m_env.fzbc[left >> 2];

    lea(edi, ptr[ebx * 2]);
    add(edi, dword[&m_env.fzbc]);

    if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
    {
        // edx = &m_env.d[skip]

        shl(edx, 4);
        lea(edx, ptr[edx + (size_t)m_env.d]);

        // ebx = &v

        mov(ebx, dword[esp + _v]);
    }

    if(!m_sel.sprite)
    {
        if(m_sel.fwrite && m_sel.fge || m_sel.zb)
        {
            movaps(xmm0, xmmword[ebx + 16]); // v.p

            if(m_sel.fwrite && m_sel.fge)
            {
                // f = GSVector4i(vp).zzzzh().zzzz().add16(m_env.d[skip].f);

                cvttps2dq(xmm1, xmm0);
                pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
                pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
                paddw(xmm1, xmmword[edx + 16 * 6]);

                movdqa(xmmword[&m_env.temp.f], xmm1);
            }

            if(m_sel.zb)
            {
                // z = vp.zzzz() + m_env.d[skip].z;

                shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
                addps(xmm0, xmmword[edx]);

                movaps(xmmword[&m_env.temp.z], xmm0);
            }
        }
    }
    else
    {
        if(m_sel.ztest)
        {
            movdqa(xmm0, xmmword[&m_env.p.z]);
        }
    }

    if(m_sel.fb)
    {
        if(m_sel.edge || m_sel.tfx != TFX_NONE)
        {
            movaps(xmm4, xmmword[ebx + 32]); // v.t
        }

        if(m_sel.edge)
        {
            pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
            pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3));
            psrlw(xmm3, 9);

            movdqa(xmmword[&m_env.temp.cov], xmm3);
        }

        if(m_sel.tfx != TFX_NONE)
        {
            if(m_sel.fst)
            {
                // GSVector4i vti(vt);

                cvttps2dq(xmm4, xmm4);

                // si = vti.xxxx() + m_env.d[skip].si;
                // ti = vti.yyyy(); if(!sprite) ti += m_env.d[skip].ti;

                pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));

                paddd(xmm2, xmmword[edx + 16 * 7]);

                if(!m_sel.sprite)
                {
                    paddd(xmm3, xmmword[edx + 16 * 8]);
                }
                else
                {
                    if(m_sel.ltf)
                    {
                        movdqa(xmm4, xmm3);
                        pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
                        pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
                        psrlw(xmm4, 1);
                        movdqa(xmmword[&m_env.temp.vf], xmm4);
                    }
                }

                movdqa(xmmword[&m_env.temp.s], xmm2);
                movdqa(xmmword[&m_env.temp.t], xmm3);
            }
            else
            {
                // s = vt.xxxx() + m_env.d[skip].s;
                // t = vt.yyyy() + m_env.d[skip].t;
                // q = vt.zzzz() + m_env.d[skip].q;

                movaps(xmm2, xmm4);
                movaps(xmm3, xmm4);

                shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
                shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
                shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));

                addps(xmm2, xmmword[edx + 16 * 1]);
                addps(xmm3, xmmword[edx + 16 * 2]);
                addps(xmm4, xmmword[edx + 16 * 3]);

                movaps(xmmword[&m_env.temp.s], xmm2);
                movaps(xmmword[&m_env.temp.t], xmm3);
                movaps(xmmword[&m_env.temp.q], xmm4);

                rcpps(xmm4, xmm4);
                mulps(xmm2, xmm4);
                mulps(xmm3, xmm4);
            }
        }

        if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
        {
            if(m_sel.iip)
            {
                // GSVector4i vc = GSVector4i(v.c);

                cvttps2dq(xmm6, xmmword[ebx]); // v.c

                // vc = vc.upl16(vc.zwxy());

                pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2));
                punpcklwd(xmm6, xmm5);

                // rb = vc.xxxx().add16(m_env.d[skip].rb);
                // ga = vc.zzzz().add16(m_env.d[skip].ga);

                pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));

                paddw(xmm5, xmmword[edx + 16 * 4]);
                paddw(xmm6, xmmword[edx + 16 * 5]);

                movdqa(xmmword[&m_env.temp.rb], xmm5);
                movdqa(xmmword[&m_env.temp.ga], xmm6);
            }
            else
            {
                if(m_sel.tfx == TFX_NONE)
                {
                    movdqa(xmm5, xmmword[&m_env.c.rb]);
                    movdqa(xmm6, xmmword[&m_env.c.ga]);
                }
            }
        }
    }
}
// Emits the frame buffer write-back. The colour arrives split across xmm5
// (c[0]) and xmm6 (c[1]) as 16-bit lanes; it is optionally wrapped to 8 bits,
// dithered, packed to one dword per pixel, reduced to the 16-bit layout when
// fpsm == 2, blended with the destination (xmm2) under the mask (xmm3) when
// rfb is set, and finally handed to WritePixel.
//
// params is the base displacement from esp of the stack arguments; `top` is
// read at params + 4 to select the dither row.
void GSDrawScanlineCodeGenerator::WriteFrame(int params)
{
    if(!m_sel.fwrite)
    {
        return; // frame writes disabled: nothing to emit
    }

    const int top_disp = params + 4;
    const bool is_fpsm2 = m_sel.fpsm == 2;

    if(!m_sel.colclamp)
    {
        // c[0] &= 0x000000ff;
        // c[1] &= 0x000000ff;
        //
        // pcmpeqd + psrlw 8 builds 0x00ff in every word
        pcmpeqd(xmm7, xmm7);
        psrlw(xmm7, 8);
        pand(xmm5, xmm7);
        pand(xmm6, xmm7);
    }

    if(is_fpsm2 && m_sel.dthe)
    {
        // Add the dither entries for row (top & 3); one row spans 32 bytes,
        // i.e. the dimx[0] / dimx[1] pair.
        mov(eax, dword[esp + top_disp]);
        and(eax, 3);
        shl(eax, 5);
        paddw(xmm5, xmmword[eax + (size_t)&m_env.dimx[0]]);
        paddw(xmm6, xmmword[eax + (size_t)&m_env.dimx[1]]);
    }

    // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
    movdqa(xmm7, xmm5);
    punpcklwd(xmm5, xmm6);
    punpckhwd(xmm7, xmm6);
    packuswb(xmm5, xmm7);

    if(m_sel.fba && m_sel.fpsm != 1)
    {
        // fs |= 0x80000000;
        //
        // pcmpeqd + pslld 31 leaves only the top bit of every dword set
        pcmpeqd(xmm7, xmm7);
        pslld(xmm7, 31);
        por(xmm5, xmm7);
    }

    if(is_fpsm2)
    {
        // GSVector4i rb = fs & 0x00f800f8;
        // GSVector4i ga = fs & 0x8000f800;
        mov(eax, 0x00f800f8);
        movd(xmm6, eax);
        pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0));

        mov(eax, 0x8000f800);
        movd(xmm7, eax);
        pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0));

        movdqa(xmm4, xmm5);
        pand(xmm4, xmm6); // xmm4 = rb
        pand(xmm5, xmm7); // xmm5 = ga

        // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
        movdqa(xmm6, xmm4);
        movdqa(xmm7, xmm5);

        psrld(xmm4, 3);
        psrld(xmm6, 9);
        psrld(xmm5, 6);
        psrld(xmm7, 16);

        por(xmm5, xmm4);
        por(xmm7, xmm6);
        por(xmm5, xmm7);
    }

    if(m_sel.rfb)
    {
        // fs = fs.blend(fd, fm);
        blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
    }

    // The "fast" flag is passed only when the destination was read (rfb) and
    // fpsm is below 2.
    WritePixel(xmm5, xmm0, ebx, dl, m_sel.rfb && m_sel.fpsm < 2, m_sel.fpsm);
}