// Emits a byte-granular blend of b into a (result left in a).
// The selection mask is expected in xmm0: the SSE4.1 PBLENDVB form reads
// xmm0 implicitly, and the fallback helper is handed xmm0 explicitly.
void GPUDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
{
	// No SSE4.1 on the target CPU: delegate to the generic blend() helper.
	if(!m_cpu.has(util::Cpu::tSSE41))
	{
		blend(a, b, xmm0);

		return;
	}

	// SSE4.1 available: a single variable-blend instruction does the job.
	pblendvb(a, b);
}
// "Reversed" byte blend: emits code that blends b into a under the mask in
// xmm0 and leaves the result in b.
// NOTE: on the SSE4.1 path register a is overwritten with the result as well.
void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
{
    // No SSE4.1 on the target CPU: delegate to the generic blendr() helper,
    // passing xmm0 as the mask register.
    if(!m_cpu.has(util::Cpu::tSSE41))
    {
        blendr(b, a, xmm0);

        return;
    }

    // PBLENDVB can only write its first operand, so blend into a first ...
    pblendvb(a, b);

    // ... and then copy the blended value to the requested destination.
    movdqa(b, a);
}
// Emits a byte-granular blend of b into a (result left in a), with the
// selection mask in xmm0. The instruction form is chosen at compile time from
// _M_SSE (presumably 0x401 = SSE4.1 and 0x500 = AVX, judging by the
// instructions each level emits).
void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
{
#if _M_SSE < 0x401

    // Baseline build: go through the generic blend() helper.
    blend(a, b, xmm0);

#elif _M_SSE < 0x500

    // Two-operand variable blend; xmm0 is the implicit mask operand.
    pblendvb(a, b);

#else

    // VEX-encoded form: destination, both sources and mask are explicit.
    vpblendvb(a, a, b, xmm0);

#endif
}
// "Reversed" byte blend: emits code that blends b into a under the mask in
// xmm0 and leaves the result in b. The instruction form is chosen at compile
// time from _M_SSE (presumably 0x401 = SSE4.1 and 0x500 = AVX, judging by the
// instructions each level emits).
// NOTE: in the middle (pblendvb) variant register a is overwritten too.
void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
{
#if _M_SSE < 0x401

    // Baseline build: go through the generic blendr() helper.
    blendr(b, a, xmm0);

#elif _M_SSE < 0x500

    // PBLENDVB can only write its first operand: blend into a, then copy
    // the blended value over to b.
    pblendvb(a, b);
    movdqa(b, a);

#else

    // VEX-encoded form can target b directly, leaving a untouched.
    vpblendvb(b, a, b, xmm0);

#endif
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
{
    // xmm0, xmm1, xmm4, xmm5, xmm6 = free

    int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
    int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;

    int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;

    if(wms_clamp == wmt_clamp)
    {
        if(wms_clamp)
        {
            if(region)
            {
                movdqa(xmm4, xmmword[&m_env.t.min]);

                pmaxsw(uv0, xmm4);
                pmaxsw(uv1, xmm4);
            }
            else
            {
                pxor(xmm0, xmm0);
                pmaxsw(uv0, xmm0);
                pmaxsw(uv1, xmm0);
            }

            movdqa(xmm5, xmmword[&m_env.t.max]);

            pminsw(uv0, xmm5);
            pminsw(uv1, xmm5);
        }
        else
        {
            movdqa(xmm4, xmmword[&m_env.t.min]);

            pand(uv0, xmm4);
            pand(uv1, xmm4);

            if(region)
            {
                movdqa(xmm5, xmmword[&m_env.t.max]);

                por(uv0, xmm5);
                por(uv1, xmm5);
            }
        }
    }
    else
    {
        movdqa(xmm1, uv0);
        movdqa(xmm6, uv1);

        movdqa(xmm4, xmmword[&m_env.t.min]);
        movdqa(xmm5, xmmword[&m_env.t.max]);

        // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max);

        pmaxsw(uv0, xmm4);
        pmaxsw(uv1, xmm4);
        pminsw(uv0, xmm5);
        pminsw(uv1, xmm5);

        // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max;

        pand(xmm1, xmm4);
        pand(xmm6, xmm4);

        if(region)
        {
            por(xmm1, xmm5);
            por(xmm6, xmm5);
        }

        // clamp.blend8(repeat, m_env.t.mask);

        if(m_cpu.has(util::Cpu::tSSE41))
        {
            movdqa(xmm0, xmmword[&m_env.t.mask]);

            pblendvb(uv0, xmm1);
            pblendvb(uv1, xmm6);
        }
        else
        {
            movdqa(xmm0, xmmword[&m_env.t.invmask]);
            movdqa(xmm4, xmm0);

            pand(uv0, xmm0);
            pandn(xmm0, xmm1);
            por(uv0, xmm0);

            pand(uv1, xmm4);
            pandn(xmm4, xmm6);
            por(uv1, xmm4);
        }
    }
}