void GSDrawScanlineCodeGenerator::WriteMask()
{
	// fm |= test;
	// zm |= test;
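	// xmm4 = fm, xmm5 = zm, xmm15 = test (per the comments above)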

	if(m_sel.fwrite)
	{
		vpor(xmm4, xmm15);
	}

	if(m_sel.zwrite)
	{
		vpor(xmm5, xmm15);
	}

	// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();

	vpcmpeqd(xmm1, xmm1); // xmm1 = all ones (0xffffffff in each lane)

	if(m_sel.fwrite && m_sel.zwrite)
	{
		vpcmpeqd(xmm0, xmm1, xmm5);
		vpcmpeqd(xmm1, xmm4);
		vpackssdw(xmm1, xmm0);
	}
	else if(m_sel.fwrite)
	{
		vpcmpeqd(xmm1, xmm4);
		vpackssdw(xmm1, xmm1);
	}
	else if(m_sel.zwrite)
	{
		vpcmpeqd(xmm1, xmm5);
		vpackssdw(xmm1, xmm1);
	}

	vpmovmskb(edx, xmm1);

	not(edx); // edx = fzm
}

void GSDrawScanlineCodeGenerator::SampleTexture()
{
	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
	{
		return;
	}

	mov(rbx, ptr[r12 + offsetof(GSScanlineGlobalData, tex)]);

	// rbx = tex

	if(!m_sel.fst)
	{
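		// Perspective divide: approximate 1/q with rcpps, then u = s/q and
		// v = t/q, truncated to integers (assuming xmm10 = s, xmm11 = t,
		// xmm12 = q in this generator's register layout).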
		vrcpps(xmm0, xmm12);

		vmulps(xmm4, xmm10, xmm0);
		vmulps(xmm5, xmm11, xmm0);

		vcvttps2dq(xmm4, xmm4);
		vcvttps2dq(xmm5, xmm5);

		if(m_sel.ltf)
		{
			// u -= 0x8000;
			// v -= 0x8000;

			mov(eax, 0x8000);
			vmovd(xmm0, eax);
			vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));

			vpsubd(xmm4, xmm0);
			vpsubd(xmm5, xmm0);
		}
	}
	else
	{
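		// fst: u/v are already provided as fixed-point integers, just copy them.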
		vmovdqa(xmm4, xmm10);
		vmovdqa(xmm5, xmm11);
	}

	if(m_sel.ltf)
	{
		// GSVector4i uf = u.xxzzlh().srl16(1);
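		// uf/vf hold the fractional parts of u/v, shifted right once so they
		// can be used as signed 16-bit blend weights in lerp16 below.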

		vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
		vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
		vpsrlw(xmm6, 1);

		if(!m_sel.sprite)
		{
			// GSVector4i vf = v.xxzzlh().srl16(1);

			vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0));
			vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0));
			vpsrlw(xmm7, 1);
		}
	}

	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));

	vpsrad(xmm4, 16);
	vpsrad(xmm5, 16);
	vpackssdw(xmm4, xmm5);

	if(m_sel.ltf)
	{
		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
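		// (x0001() is built in-register: all-ones >> 15 in each 16-bit lane)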

		vpcmpeqd(xmm0, xmm0);
		vpsrlw(xmm0, 15);
		vpaddw(xmm5, xmm4, xmm0);

		// uv0 = Wrap(uv0);
		// uv1 = Wrap(uv1);

		Wrap(xmm4, xmm5);
	}
	else
	{
		// uv0 = Wrap(uv0);

		Wrap(xmm4);
	}

	// xmm4 = uv0
	// xmm5 = uv1 (ltf)
	// xmm6 = uf
	// xmm7 = vf

	// GSVector4i x0 = uv0.upl16();
	// GSVector4i y0 = uv0.uph16() << tw;

	vpxor(xmm0, xmm0);

	vpunpcklwd(xmm2, xmm4, xmm0);
	vpunpckhwd(xmm3, xmm4, xmm0);
	vpslld(xmm3, m_sel.tw + 3);
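	// (+3 because m_sel.tw stores log2(texture width) - 3)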

	// xmm0 = 0
	// xmm2 = x0
	// xmm3 = y0
	// xmm5 = uv1 (ltf)
	// xmm6 = uf
	// xmm7 = vf

	if(m_sel.ltf)
	{
		// GSVector4i x1 = uv1.upl16();
		// GSVector4i y1 = uv1.uph16() << tw;

		vpunpcklwd(xmm4, xmm5, xmm0);
		vpunpckhwd(xmm5, xmm5, xmm0);
		vpslld(xmm5, m_sel.tw + 3);

		// xmm2 = x0
		// xmm3 = y0
		// xmm4 = x1
		// xmm5 = y1
		// xmm6 = uf
		// xmm7 = vf

		// GSVector4i addr00 = y0 + x0;
		// GSVector4i addr01 = y0 + x1;
		// GSVector4i addr10 = y1 + x0;
		// GSVector4i addr11 = y1 + x1;

		vpaddd(xmm0, xmm3, xmm2);
		vpaddd(xmm1, xmm3, xmm4);
		vpaddd(xmm2, xmm5, xmm2);
		vpaddd(xmm3, xmm5, xmm4);

		// xmm0 = addr00
		// xmm1 = addr01
		// xmm2 = addr10
		// xmm3 = addr11
		// xmm6 = uf
		// xmm7 = vf

		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
		
		ReadTexel(4, 0);

		// xmm0 = c00
		// xmm1 = c01
		// xmm2 = c10
		// xmm3 = c11
		// xmm6 = uf
		// xmm7 = vf

		// GSVector4i rb00 = c00 & mask;
		// GSVector4i ga00 = (c00 >> 8) & mask;
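	// (the "& mask" is done as << 8 then >> 8 within each 16-bit lane instead of loading a constant)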

		vpsllw(xmm4, xmm0, 8);
		vpsrlw(xmm4, 8);
		vpsrlw(xmm5, xmm0, 8);

		// GSVector4i rb01 = c01 & mask;
		// GSVector4i ga01 = (c01 >> 8) & mask;

		vpsllw(xmm0, xmm1, 8);
		vpsrlw(xmm0, 8);
		vpsrlw(xmm1, 8);

		// xmm0 = rb01
		// xmm1 = ga01
		// xmm2 = c10
		// xmm3 = c11
		// xmm4 = rb00
		// xmm5 = ga00
		// xmm6 = uf
		// xmm7 = vf

		// rb00 = rb00.lerp16<0>(rb01, uf);
		// ga00 = ga00.lerp16<0>(ga01, uf);

		lerp16(xmm0, xmm4, xmm6, 0);
		lerp16(xmm1, xmm5, xmm6, 0);

		// xmm0 = rb00
		// xmm1 = ga00
		// xmm2 = c10
		// xmm3 = c11
		// xmm6 = uf
		// xmm7 = vf

		// GSVector4i rb10 = c10 & mask;
		// GSVector4i ga10 = (c10 >> 8) & mask;

		vpsrlw(xmm5, xmm2, 8);
		vpsllw(xmm2, 8);
		vpsrlw(xmm4, xmm2, 8);

		// GSVector4i rb11 = c11 & mask;
		// GSVector4i ga11 = (c11 >> 8) & mask;

		vpsllw(xmm2, xmm3, 8);
		vpsrlw(xmm2, 8);
		vpsrlw(xmm3, 8);

		// xmm0 = rb00
		// xmm1 = ga00
		// xmm2 = rb11
		// xmm3 = ga11
		// xmm4 = rb10
		// xmm5 = ga10
		// xmm6 = uf
		// xmm7 = vf

		// rb10 = rb10.lerp16<0>(rb11, uf);
		// ga10 = ga10.lerp16<0>(ga11, uf);

		lerp16(xmm2, xmm4, xmm6, 0);
		lerp16(xmm3, xmm5, xmm6, 0);

		// xmm0 = rb00
		// xmm1 = ga00
		// xmm2 = rb10
		// xmm3 = ga10
		// xmm7 = vf

		// rb00 = rb00.lerp16<0>(rb10, vf);
		// ga00 = ga00.lerp16<0>(ga10, vf);

		lerp16(xmm2, xmm0, xmm7, 0);
		lerp16(xmm3, xmm1, xmm7, 0);
	}
	else
	{
		// GSVector4i addr00 = y0 + x0;

		vpaddd(xmm3, xmm2);

		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);

		ReadTexel(1, 0);

		// GSVector4i mask = GSVector4i::x00ff();

		// c[0] = c00 & mask;
		// c[1] = (c00 >> 8) & mask;

		vpsrlw(xmm3, xmm2, 8);
		vpsllw(xmm2, 8);
		vpsrlw(xmm2, 8);
	}

	// xmm2 = rb
	// xmm3 = ga
}
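
    // Outer loop over the source offset (reg_soff): each mb_sp_loop iteration
    // processes one full set of channels via channel_loop (the first lambda
    // handles whole channel blocks, the second lambda handles the tail).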
    void forward_avx2() {
        xor_(reg_soff, reg_soff);
        Label mb_sp_loop;
        L(mb_sp_loop); {

            channel_loop([=](size_t unroll) {
                        // Process 32 channels (two C16 blocks) per stored ymm:
                        // the work is split in half, and each half is spread
                        // over two registers holding 8 channels each. When
                        // down-converting, keep the 1st iteration's result in
                        // a temp register, combine it with the 2nd iteration's
                        // result and store a ymm with all 32 channels.
                        // If there are only 16 channels, do just one half and
                        // store the result with a mask.
                        Vmm v0 = Vmm(0);
                        Vmm v1 = Vmm(1);
                        Vmm vscale0 = Vmm(2);
                        Vmm vshift0 = Vmm(3);
                        Vmm vmean0 = Vmm(4);
                        Vmm vsqrtvar0 = Vmm(5);
                        Vmm vscale1 = Vmm(6);
                        Vmm vshift1 = Vmm(7);
                        Vmm vmean1 = Vmm(8);
                        Vmm vsqrtvar1 = Vmm(9);
                        Vmm tmp = Vmm(10);

                        for (size_t i = 0; i < unroll; i++) {
                            compute_vscaleshift(vscale0, vshift0, vmean0,
                                    vsqrtvar0, i * c_in_xmm_ * sizeof(float));
                            compute_vscaleshift(vscale1, vshift1, vmean1,
                                    vsqrtvar1, i * c_in_xmm_ * sizeof(float)
                                    + simd_w_ * sizeof(float));

                            vpmovsxbd(v0, src_ptr(i*c_in_xmm_)); // 8 s8 -> s32
                            vpmovsxbd(v1, src_ptr(i*c_in_xmm_ + simd_w_));
                            vcvtdq2ps(v0, v0); // s32 -> f32
                            vcvtdq2ps(v1, v1);

                            uni_vfmadd213ps(v0, vscale0, vshift0); // x*scale + shift
                            uni_vfmadd213ps(v1, vscale1, vshift1);
                            if (with_relu_) {
                                uni_vmaxps(v0, v0, vzero);
                                uni_vmaxps(v1, v1, vzero);
                            }

                            vcvtps2dq(v0, v0); // BA
                            vcvtps2dq(v1, v1); // DC
                            vpackssdw(v0, v0, v1); // BA + DC -> DBCA
                            vpermq(v0, v0, 0xD8); // DBCA -> DCBA
                            vperm2i128(v1, v0, v0, 0x1); // DCBA -> BADC
                            vpacksswb(v0, v0, v1); // DCBA + BADC -> badcDCBA
                            if (i == 0 && unroll != 1)
                                uni_vmovups(tmp, v0); // keep 1st iteration's result for the merge
                            else if (i == 1) {
                                // badcDCBA + fehgHGFE -> HGFEDCBA
                                vperm2i128(v0, v0, tmp, 0x2);
                            }
                        }

                        if (unroll == 1)
                            vmaskmovps(dst_ptr(), vbody_mask, v0);
                        else
                            uni_vmovups(dst_ptr(), v0);
                    },
                    [=]() {
                        // Handle the first 8 channels. If the tail is bigger,
                        // handle the second part separately. There is no way
                        // to get performance here, as one has to work with
                        // individual bytes via xmm; vzeroupper kills all the
                        // perf.
                        Xmm x0 = Xmm(0);
                        Vmm v0 = Vmm(0);
                        Vmm vscale0 = Vmm(1);
                        Vmm vshift0 = Vmm(2);
                        Vmm vmean0 = Vmm(3);
                        Vmm vsqrtvar0 = Vmm(4);

                        size_t tail = nstl::min(c_tail_, simd_w_);
                        size_t num_iters = c_tail_ > simd_w_ ? 2 : 1;

                        for (size_t i = 0; i < num_iters; i++) {
                            if (i > 0)
                                tail = c_tail_ - simd_w_;

                            for (size_t tl = 0; tl < tail; tl++)
                                vpinsrb(x0, x0, src_ptr(8*i + tl), tl); // gather tail bytes one at a time

                            if (tail == simd_w_)
                                compute_vscaleshift(vscale0, vshift0, vmean0,
                                        vsqrtvar0, 32*i);
                            else
                                compute_vscaleshift(vscale0, vshift0, vmean0,
                                        vsqrtvar0, 32*i, true);

                            vpmovsxbd(v0, x0);
                            vcvtdq2ps(v0, v0);
                            uni_vfmadd213ps(v0, vscale0, vshift0);
                            if (with_relu_)
                                uni_vmaxps(v0, v0, vzero);
                            vcvtps2dq(v0, v0); // f32 -> s32
                            vpackssdw(v0, v0, vzero); // s32 -> s16 (saturating)
                            vpermq(v0, v0, 0xD8);
                            vpacksswb(v0, v0, vzero); // s16 -> s8 (saturating)

                            for (size_t tl = 0; tl < tail; tl++)
                                vpextrb(dst_ptr(8*i + tl), x0, tl);
                        }
                    });

            add(reg_soff, reg_coff_max);
            cmp(reg_soff, reg_soff_max);
            jl(mb_sp_loop);
        }
    }