void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Xmm& temp, const Reg32& addr, const Reg8& mask, bool fast, int psm)
{
    if(fast)
    {
        // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
        // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);

        test(mask, 0x0f);
        je("@f");
        movq(qword[addr * 2 + (size_t)m_env.vm], src);
        L("@@");

        test(mask, 0xf0);
        je("@f");
        movhps(qword[addr * 2 + (size_t)m_env.vm + 8 * 2], src);
        L("@@");
    }
    else
    {
        // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
        // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
        // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
        // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());

        test(mask, 0x03);
        je("@f");
        WritePixel(src, temp, addr, 0, psm);
        L("@@");

        test(mask, 0x0c);
        je("@f");
        WritePixel(src, temp, addr, 1, psm);
        L("@@");

        test(mask, 0x30);
        je("@f");
        WritePixel(src, temp, addr, 2, psm);
        L("@@");

        test(mask, 0xc0);
        je("@f");
        WritePixel(src, temp, addr, 3, psm);
        L("@@");
    }
}
// Emits the code that packs the colour channels held in xmm4 (c[0], r),
// xmm5 (c[1], g) and xmm6 (c[2], b) into 16-bit pixels, optionally ORs in the
// top (0x8000) bit, merges the result with the existing frame data in xmm1
// and stores 16 bytes to [edi].
//
// The top bit is selected at code-generation time:
//   m_sel.md  - always set
//   m_sel.tme - taken from xmm3 (c[3]) shifted left by 8
//   neither   - left clear
//
// Clobbers xmm0 and xmm2-xmm6 as scratch.
void GPUDrawScanlineCodeGenerator::WriteFrame()
{
	// GSVector4i fs = r | g | b | (m_sel.md ? GSVector4i(0x80008000) : m_sel.tme ? a : 0);

	// xmm0 = all ones; both constant masks below are derived from it by shifting
	pcmpeqd(xmm0, xmm0);

	if(m_sel.md || m_sel.tme)
	{
		// The 0x8000 mask is only needed when the top bit is going to be written.
		movdqa(xmm2, xmm0);
		psllw(xmm2, 15);
	}

	// 0xffff >> 11 = 0x001f, then << 3 = 0x00f8 in every 16-bit word
	psrlw(xmm0, 11);
	psllw(xmm0, 3);

	// xmm0 = 0x00f8
	// xmm2 = 0x8000 (md)

	// GSVector4i r = (c[0] & 0x00f800f8) >> 3;

	pand(xmm4, xmm0);
	psrlw(xmm4, 3);

	// GSVector4i g = (c[1] & 0x00f800f8) << 2;

	pand(xmm5, xmm0);
	psllw(xmm5, 2);
	por(xmm4, xmm5);

	// GSVector4i b = (c[2] & 0x00f800f8) << 7;

	pand(xmm6, xmm0);
	psllw(xmm6, 7);
	por(xmm4, xmm6);

	// xmm4 now holds r | g | b

	if(m_sel.md)
	{
		// GSVector4i a = GSVector4i(0x80008000);

		por(xmm4, xmm2);
	}
	else if(m_sel.tme)
	{
		// GSVector4i a = (c[3] << 8) & 0x80008000;

		psllw(xmm3, 8);
		pand(xmm3, xmm2);
		por(xmm4, xmm3);
	}

	// fs = fs.blend8(fd, test);

	// NOTE(review): blend8 presumably takes its byte selector implicitly from
	// xmm0 (as pblendvb does), which is why test (xmm7) is copied there first
	// - confirm against the blend8 helper.
	movdqa(xmm0, xmm7);
	blend8(xmm4, xmm1);

	// GSVector4i::store<false>(fb, fs);

	// movdqu(ptr[edi], xmm4);

	// The 16-byte store is issued as two 64-bit halves in place of the
	// commented-out movdqu above.
	movq(qword[edi], xmm4);
	movhps(qword[edi + 8], xmm4);
}
// Emits the complete scanline routine.
//
// Layout of the generated code:
//   prologue - saves esi and edi, then emits Init()
//   "loop"   - 16-byte aligned; loads the test mask for the current step count
//              (ecx), reads the existing frame data from [edi], then emits the
//              pipeline stages TestMask, SampleTexture, ColorTFX, AlphaBlend,
//              Dither and WriteFrame
//   "step"   - leaves the loop when ecx <= 0, otherwise emits Step() and jumps
//              back to "loop"
//   "exit"   - restores edi and esi and returns with `ret 8`
void GPUDrawScanlineCodeGenerator::Generate()
{
	push(esi);
	push(edi);

	Init();

	align(16);

L("loop");

	// GSVector4i test = m_test[7 + (steps & (steps >> 31))];

	// edx = (ecx >> 31) & ecx: equal to ecx when ecx is negative, 0 otherwise.
	// The shift by 4 scales it to a byte offset (assumes m_test entries are
	// 16 bytes, matching the 16-byte movdqa load - TODO confirm).
	mov(edx, ecx);
	sar(edx, 31);
	and(edx, ecx);
	shl(edx, 4);

	movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);

	// movdqu(xmm1, ptr[edi]);

	// The 16-byte load is issued as two 64-bit halves in place of the
	// commented-out movdqu above.
	movq(xmm1, qword[edi]);
	movhps(xmm1, qword[edi + 8]);

	// ecx = steps
	// esi = tex (tme)
	// edi = fb
	// xmm1 = fd
	// xmm2 = s
	// xmm3 = t
	// xmm4 = r
	// xmm5 = g
	// xmm6 = b
	// xmm7 = test

	TestMask();

	SampleTexture();

	// xmm1 = fd
	// xmm3 = a
	// xmm4 = r
	// xmm5 = g
	// xmm6 = b
	// xmm7 = test
	// xmm0, xmm2 = free

	ColorTFX();

	AlphaBlend();

	Dither();

	WriteFrame();

	// NOTE(review): nothing in this function jumps to "step"; presumably one of
	// the stage emitters above (e.g. TestMask) branches here to skip the rest
	// of the pipeline - confirm against those helpers.
L("step");

	// if(steps <= 0) break;

	test(ecx, ecx);
	jle("exit", T_NEAR);

	Step();

	jmp("loop", T_NEAR);

L("exit");

	// Restore the registers saved in the prologue, in reverse order.
	pop(edi);
	pop(esi);

	// `ret 8` also removes 8 extra bytes from the stack after the return address.
	ret(8);
}
void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
{
    movq(dst, qword[addr * 2 + (size_t)m_env.vm]);
    movhps(dst, qword[addr * 2 + (size_t)m_env.vm + 8 * 2]);
}