void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
{
	// xmm0, xmm1, xmm2, xmm3 = free

	int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
	int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;

	int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;

	if(wms_clamp == wmt_clamp)
	{
		if(wms_clamp)
		{
			if(region)
			{
				vpmaxsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]);
			}
			else
			{
				vpxor(xmm0, xmm0);
				vpmaxsw(uv, xmm0);
			}

			vpminsw(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]);
		}
		else
		{
			vpand(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]);

			if(region)
			{
				vpor(uv, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]);
			}
		}
	}
	else
	{
		vmovdqa(xmm2, ptr[r12 + offsetof(GSScanlineGlobalData, t.min)]);
		vmovdqa(xmm3, ptr[r12 + offsetof(GSScanlineGlobalData, t.max)]);
		vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, t.mask)]);

		// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;

		vpand(xmm1, uv, xmm2);

		if(region)
		{
			vpor(xmm1, xmm3);
		}

		// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);

		vpmaxsw(uv, xmm2);
		vpminsw(uv, xmm3);

		// clamp.blend8(repeat, m_local.gd->t.mask);

		vpblendvb(uv, xmm1, xmm0);
	}
}
void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
{
#if _M_SSE >= 0x500

    vpblendvb(a, a, b, xmm0);

#elif _M_SSE >= 0x401

    pblendvb(a, b);

#else

    blend(a, b, xmm0);

#endif
}
void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
{
#if _M_SSE >= 0x500

    vpblendvb(b, a, b, xmm0);

#elif _M_SSE >= 0x401

    pblendvb(a, b);
    movdqa(b, a);

#else

    blendr(b, a, xmm0);

#endif
}
void GSDrawScanlineCodeGenerator::WriteZBuf()
{
	if(!m_sel.zwrite)
	{
		return;
	}

	bool fast = m_sel.ztest && m_sel.zpsm < 2;

	vmovdqa(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zs)]);

	if(fast)
	{
		// zs = zs.blend8(zd, zm);

		vpblendvb(xmm1, ptr[r11 + offsetof(GSScanlineLocalData, temp.zd)], xmm4);
	}

	WritePixel(xmm1, rbp, dh, fast, m_sel.zpsm, 1);
}
void GSDrawScanlineCodeGenerator::AlphaBlend()
{
	if(!m_sel.fwrite)
	{
		return;
	}

	if(m_sel.abe == 0 && m_sel.aa1 == 0)
	{
		return;
	}

	if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
	{
		switch(m_sel.fpsm)
		{
		case 0:
		case 1:

			// c[2] = fd & mask;
			// c[3] = (fd >> 8) & mask;

			vpsllw(xmm0, xmm6, 8);
			vpsrlw(xmm0, 8);
			vpsrlw(xmm1, xmm6, 8);

			break;

		case 2:

			// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
			// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);

			vpcmpeqd(xmm15, xmm15);

			vpsrld(xmm15, 27); // 0x0000001f
			vpand(xmm0, xmm6, xmm15);
			vpslld(xmm0, 3);

			vpslld(xmm15, 10); // 0x00007c00
			vpand(xmm5, xmm6, xmm15);
			vpslld(xmm5, 9);

			vpor(xmm0, xmm1);

			vpsrld(xmm15, 5); // 0x000003e0
			vpand(xmm1, xmm6, xmm15);
			vpsrld(xmm1, 2);

			vpsllw(xmm15, 10); // 0x00008000
			vpand(xmm5, xmm6, xmm15);
			vpslld(xmm5, 8);

			vpor(xmm1, xmm5);

			break;
		}
	}

	// xmm2, xmm3 = src rb, ga
	// xmm0, xmm1 = dst rb, ga
	// xmm5, xmm15 = free

	if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
	{
		vmovdqa(xmm5, xmm2);
	}

	if(m_sel.aba != m_sel.abb)
	{
		// rb = c[aba * 2 + 0];

		switch(m_sel.aba)
		{
		case 0: break;
		case 1: vmovdqa(xmm2, xmm0); break;
		case 2: vpxor(xmm2, xmm2); break;
		}

		// rb = rb.sub16(c[abb * 2 + 0]);

		switch(m_sel.abb)
		{
		case 0: vpsubw(xmm2, xmm5); break;
		case 1: vpsubw(xmm2, xmm0); break;
		case 2: break;
		}

		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
		{
			// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix;

			switch(m_sel.abc)
			{
			case 0:
			case 1:
				vpshuflw(xmm15, m_sel.abc ? xmm1 : xmm3, _MM_SHUFFLE(3, 3, 1, 1));
				vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1));
				vpsllw(xmm15, 7);
				break;
			case 2:
				vmovdqa(xmm15, ptr[r12 + offsetof(GSScanlineGlobalData, afix)]);
				break;
			}

			// rb = rb.modulate16<1>(a);

			modulate16(xmm2, xmm15, 1);
		}

		// rb = rb.add16(c[abd * 2 + 0]);

		switch(m_sel.abd)
		{
		case 0: vpaddw(xmm2, xmm5); break;
		case 1: vpaddw(xmm2, xmm0); break;
		case 2: break;
		}
	}
	else
	{
		// rb = c[abd * 2 + 0];

		switch(m_sel.abd)
		{
		case 0: break;
		case 1: vmovdqa(xmm2, xmm0); break;
		case 2: vpxor(xmm2, xmm2); break;
		}
	}

	if(m_sel.pabe)
	{
		// mask = (c[1] << 8).sra32(31);

		vpslld(xmm0, xmm3, 8);
		vpsrad(xmm0, 31);

		// rb = c[0].blend8(rb, mask);

		vpblendvb(xmm2, xmm5, xmm2, xmm0);
	}

	// xmm0 = pabe mask
	// xmm3 = src ga
	// xmm1 = dst ga
	// xmm2 = rb
	// xmm15 = a
	// xmm5 = free

	vmovdqa(xmm5, xmm3);

	if(m_sel.aba != m_sel.abb)
	{
		// ga = c[aba * 2 + 1];

		switch(m_sel.aba)
		{
		case 0: break;
		case 1: vmovdqa(xmm3, xmm1); break;
		case 2: vpxor(xmm3, xmm3); break;
		}

		// ga = ga.sub16(c[abeb * 2 + 1]);

		switch(m_sel.abb)
		{
		case 0: vpsubw(xmm3, xmm5); break;
		case 1: vpsubw(xmm3, xmm1); break;
		case 2: break;
		}

		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
		{
			// ga = ga.modulate16<1>(a);

			modulate16(xmm3, xmm15, 1);
		}

		// ga = ga.add16(c[abd * 2 + 1]);

		switch(m_sel.abd)
		{
		case 0: vpaddw(xmm3, xmm5); break;
		case 1: vpaddw(xmm3, xmm1); break;
		case 2: break;
		}
	}
	else
	{
		// ga = c[abd * 2 + 1];

		switch(m_sel.abd)
		{
		case 0: break;
		case 1: vmovdqa(xmm3, xmm1); break;
		case 2: vpxor(xmm3, xmm3); break;
		}
	}

	// xmm0 = pabe mask
	// xmm5 = src ga
	// xmm2 = rb
	// xmm3 = ga
	// xmm1, xmm15 = free

	if(m_sel.pabe)
	{
		vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)

		// ga = c[1].blend8(ga, mask).mix16(c[1]);

		vpblendvb(xmm3, xmm5, xmm3, xmm0);
	}
	else
	{
		if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
		{
			mix16(xmm3, xmm5, xmm15);
		}
	}
}
void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a)
{
    vpblendvb(b, a, b, xmm0);
}
void GSDrawScanlineCodeGenerator::blend8(const Ymm& a, const Ymm& b)
{
    vpblendvb(a, a, b, xmm0);
}