void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
{
#if _M_SSE >= 0x500

    vpackuswb(a, a);
    vpmovzxbw(a, a);

#elif _M_SSE >= 0x401

    packuswb(a, a);
    pmovzxbw(a, a);

#else

    packuswb(a, a);
    pxor(temp, temp);
    punpcklbw(a, temp);

#endif
}
void GSDrawScanlineCodeGenerator::WriteFrame()
{
	if(!m_sel.fwrite)
	{
		return;
	}

	if(m_sel.colclamp == 0)
	{
		// c[0] &= 0x00ff00ff;
		// c[1] &= 0x00ff00ff;

		vpcmpeqd(xmm15, xmm15);
		vpsrlw(xmm15, 8);
		vpand(xmm2, xmm15);
		vpand(xmm3, xmm15);
	}

	if(m_sel.fpsm == 2 && m_sel.dthe)
	{
		mov(rax, r8);
		and(rax, 3);
		shl(rax, 5);
		vpaddw(xmm2, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[0])]);
		vpaddw(xmm3, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[1])]);
	}

	// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));

	vpunpckhwd(xmm15, xmm2, xmm3);
	vpunpcklwd(xmm2, xmm3);
	vpackuswb(xmm2, xmm15);

	if(m_sel.fba && m_sel.fpsm != 1)
	{
		// fs |= 0x80000000;

		vpcmpeqd(xmm15, xmm15);
		vpslld(xmm15, 31);
		vpor(xmm2, xmm15);
	}

	// xmm2 = fs
	// xmm4 = fm
	// xmm6 = fd

	if(m_sel.fpsm == 2)
	{
		// GSVector4i rb = fs & 0x00f800f8;
		// GSVector4i ga = fs & 0x8000f800;

		mov(eax, 0x00f800f8);
		vmovd(xmm0, eax);
		vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));

		mov(eax, 0x8000f800);
		vmovd(xmm1, eax);
		vpshufd(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));

		vpand(xmm0, xmm2);
		vpand(xmm1, xmm2);

		// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);

		vpsrld(xmm2, xmm0, 9);
		vpsrld(xmm0, 3);
		vpsrld(xmm3, xmm1, 16);
		vpsrld(xmm1, 6);

		vpor(xmm0, xmm1);
		vpor(xmm2, xmm3);
		vpor(xmm2, xmm0);
	}

	if(m_sel.rfb)
	{
		// fs = fs.blend(fd, fm);

		blend(xmm2, xmm6, xmm4); // TODO: could be skipped in certain cases, depending on fpsm and fm
	}

	bool fast = m_sel.rfb && m_sel.fpsm < 2;

	WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0);
}
void GSDrawScanlineCodeGenerator::clamp16(const Ymm& a, const Ymm& temp)
{
    vpackuswb(a, a);
    vpermq(a, a, _MM_SHUFFLE(3, 1, 2, 0)); // this sucks
    vpmovzxbw(a, a);
}