void GSDrawScanlineCodeGenerator::Fog()
{
	// Emits the fog stage. Nothing is generated unless the frame buffer is
	// written (fwrite) and fogging is enabled (fge).

	if(!m_sel.fwrite)
	{
		return;
	}

	if(!m_sel.fge)
	{
		return;
	}

	// Register use, as read from the calls below:
	// xmm2 = rb, xmm3 = ga, xmm9 = f (also handed to mix16 as its last argument)
	// xmm0 = frb, xmm1 = fga, xmm6 = copy of ga taken before the lerp
	// NOTE(review): r12 is presumably the GSScanlineGlobalData base pointer - confirm

	vmovdqa(xmm0, ptr[r12 + offsetof(GSScanlineGlobalData, frb)]);
	vmovdqa(xmm1, ptr[r12 + offsetof(GSScanlineGlobalData, fga)]);

	vmovdqa(xmm6, xmm3);

	// rb = m_local.gd->frb.lerp16<0>(rb, f);

	lerp16(xmm2, xmm0, xmm9, 0);

	// ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);

	lerp16(xmm3, xmm1, xmm9, 0);

	mix16(xmm3, xmm6, xmm9);
}
void GSDrawScanlineCodeGenerator::Fog()
{
    // Emits the fog stage. Nothing is generated unless the frame buffer is
    // written (fwrite) and fogging is enabled (fge).

    if(!m_sel.fwrite)
    {
        return;
    }

    if(!m_sel.fge)
    {
        return;
    }

    // xmm0 = f, read from m_env.p.f for sprites and from m_env.temp.f otherwise
    // xmm1 = copy of ga (xmm6) taken before the lerp

    movdqa(xmm0, xmmword[m_sel.sprite ? &m_env.p.f : &m_env.temp.f]);
    movdqa(xmm1, xmm6);

    // rb = m_env.frb.lerp16<0>(rb, f);

    movdqa(xmm2, xmmword[&m_env.frb]);
    lerp16<0>(xmm5, xmm2, xmm0);

    // ga = m_env.fga.lerp16<0>(ga, f).mix16(ga);

    movdqa(xmm2, xmmword[&m_env.fga]);
    lerp16<0>(xmm6, xmm2, xmm0);

    mix16(xmm6, xmm1, xmm0);
}
void GSDrawScanlineCodeGenerator::AlphaBlend()
{
	// Emits the alpha blending stage:
	//
	//   rb = (c[aba * 2 + 0] - c[abb * 2 + 0]).modulate16<1>(a) + c[abd * 2 + 0]
	//   ga = (c[aba * 2 + 1] - c[abb * 2 + 1]).modulate16<1>(a) + c[abd * 2 + 1]
	//
	// where c[0], c[1] are the source rb, ga (xmm2, xmm3), c[2], c[3] are the
	// destination rb, ga unpacked from the frame data in xmm6 (into xmm0, xmm1)
	// and selector value 2 stands for zero. On exit xmm2 = rb and xmm3 = ga.
	//
	// Nothing is emitted when the frame buffer is not written, or when both
	// abe and aa1 are off.

	if(!m_sel.fwrite)
	{
		return;
	}

	if(m_sel.abe == 0 && m_sel.aa1 == 0)
	{
		return;
	}

	// The destination color only has to be unpacked when one of the selectors
	// actually picks it (value 1).

	if(((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1)) || m_sel.abd == 1)
	{
		switch(m_sel.fpsm)
		{
		case 0:
		case 1:

			// c[2] = fd & mask;
			// c[3] = (fd >> 8) & mask;

			vpsllw(xmm0, xmm6, 8);
			vpsrlw(xmm0, 8);
			vpsrlw(xmm1, xmm6, 8);

			break;

		case 2:

			// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
			// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);

			vpcmpeqd(xmm15, xmm15);

			vpsrld(xmm15, 27); // 0x0000001f
			vpand(xmm0, xmm6, xmm15);
			vpslld(xmm0, 3);

			vpslld(xmm15, 10); // 0x00007c00
			vpand(xmm5, xmm6, xmm15);
			vpslld(xmm5, 9);

			// merge the two halves of c[2]; the shifted 0x7c00 part lives in
			// xmm5 (xmm1 has not been written yet at this point)

			vpor(xmm0, xmm5);

			vpsrld(xmm15, 5); // 0x000003e0
			vpand(xmm1, xmm6, xmm15);
			vpsrld(xmm1, 2);

			vpsllw(xmm15, 10); // 0x00008000
			vpand(xmm5, xmm6, xmm15);
			vpslld(xmm5, 8);

			vpor(xmm1, xmm5);

			break;
		}
	}

	// xmm2, xmm3 = src rb, ga
	// xmm0, xmm1 = dst rb, ga
	// xmm5, xmm15 = free

	// keep a copy of the source rb when it is still needed after xmm2 gets
	// overwritten (pabe blend, or abb/abd selecting the source)

	if(m_sel.pabe || ((m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0)))
	{
		vmovdqa(xmm5, xmm2);
	}

	if(m_sel.aba != m_sel.abb)
	{
		// rb = c[aba * 2 + 0];

		switch(m_sel.aba)
		{
		case 0: break;
		case 1: vmovdqa(xmm2, xmm0); break;
		case 2: vpxor(xmm2, xmm2); break;
		}

		// rb = rb.sub16(c[abb * 2 + 0]);

		switch(m_sel.abb)
		{
		case 0: vpsubw(xmm2, xmm5); break;
		case 1: vpsubw(xmm2, xmm0); break;
		case 2: break;
		}

		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
		{
			// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix;

			switch(m_sel.abc)
			{
			case 0:
			case 1:
				vpshuflw(xmm15, m_sel.abc ? xmm1 : xmm3, _MM_SHUFFLE(3, 3, 1, 1));
				vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1));
				vpsllw(xmm15, 7);
				break;
			case 2:
				vmovdqa(xmm15, ptr[r12 + offsetof(GSScanlineGlobalData, afix)]);
				break;
			}

			// rb = rb.modulate16<1>(a);

			modulate16(xmm2, xmm15, 1);
		}

		// rb = rb.add16(c[abd * 2 + 0]);

		switch(m_sel.abd)
		{
		case 0: vpaddw(xmm2, xmm5); break;
		case 1: vpaddw(xmm2, xmm0); break;
		case 2: break;
		}
	}
	else
	{
		// aba == abb, the subtraction cancels out
		// rb = c[abd * 2 + 0];

		switch(m_sel.abd)
		{
		case 0: break;
		case 1: vmovdqa(xmm2, xmm0); break;
		case 2: vpxor(xmm2, xmm2); break;
		}
	}

	if(m_sel.pabe)
	{
		// mask = (c[1] << 8).sra32(31);

		vpslld(xmm0, xmm3, 8);
		vpsrad(xmm0, 31);

		// rb = c[0].blend8(rb, mask);

		vpblendvb(xmm2, xmm5, xmm2, xmm0);
	}

	// xmm0 = pabe mask
	// xmm3 = src ga
	// xmm1 = dst ga
	// xmm2 = rb
	// xmm15 = a
	// xmm5 = free

	vmovdqa(xmm5, xmm3);

	if(m_sel.aba != m_sel.abb)
	{
		// ga = c[aba * 2 + 1];

		switch(m_sel.aba)
		{
		case 0: break;
		case 1: vmovdqa(xmm3, xmm1); break;
		case 2: vpxor(xmm3, xmm3); break;
		}

		// ga = ga.sub16(c[abb * 2 + 1]);

		switch(m_sel.abb)
		{
		case 0: vpsubw(xmm3, xmm5); break;
		case 1: vpsubw(xmm3, xmm1); break;
		case 2: break;
		}

		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
		{
			// ga = ga.modulate16<1>(a);

			modulate16(xmm3, xmm15, 1);
		}

		// ga = ga.add16(c[abd * 2 + 1]);

		switch(m_sel.abd)
		{
		case 0: vpaddw(xmm3, xmm5); break;
		case 1: vpaddw(xmm3, xmm1); break;
		case 2: break;
		}
	}
	else
	{
		// ga = c[abd * 2 + 1];

		switch(m_sel.abd)
		{
		case 0: break;
		case 1: vmovdqa(xmm3, xmm1); break;
		case 2: vpxor(xmm3, xmm3); break;
		}
	}

	// xmm0 = pabe mask
	// xmm5 = src ga
	// xmm2 = rb
	// xmm3 = ga
	// xmm1, xmm15 = free

	if(m_sel.pabe)
	{
		vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)

		// ga = c[1].blend8(ga, mask).mix16(c[1]);

		vpblendvb(xmm3, xmm5, xmm3, xmm0);
	}
	else
	{
		if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
		{
			mix16(xmm3, xmm5, xmm15);
		}
	}
}
void GSDrawScanlineCodeGenerator::ColorTFX()
{
	// Emits the color part of the texture function selected by m_sel.tfx.
	// Registers as used below: xmm2 = rbt, xmm3 = gat, xmm13 = rb, xmm14 = ga.
	// TFX_DECAL (and any value not listed) emits nothing.

	if(!m_sel.fwrite)
	{
		return;
	}

	if(m_sel.tfx == TFX_MODULATE)
	{
		// rbt = rbt.modulate16<1>(rb).clamp8();

		modulate16(xmm2, xmm13, 1);

		clamp16(xmm2, xmm0);
	}
	else if(m_sel.tfx == TFX_HIGHLIGHT || m_sel.tfx == TFX_HIGHLIGHT2)
	{
		// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);

		vmovdqa(xmm1, xmm3);

		modulate16(xmm3, xmm14, 1);

		// xmm6 = af, built from words 1 and 3 of each half of ga, shifted right by 7

		vpshuflw(xmm6, xmm14, _MM_SHUFFLE(3, 3, 1, 1));
		vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1));
		vpsrlw(xmm6, 7);

		vpaddw(xmm3, xmm6);

		clamp16(xmm3, xmm0);

		mix16(xmm3, xmm1, xmm0);

		// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();

		modulate16(xmm2, xmm13, 1);

		vpaddw(xmm2, xmm6);

		clamp16(xmm2, xmm0);
	}
	else if(m_sel.tfx == TFX_NONE && m_sel.iip)
	{
		// rbt = iip ? rb.srl16(7) : rb;

		vpsrlw(xmm2, xmm13, 7);
	}
}
void GSDrawScanlineCodeGenerator::AlphaTFX()
{
	// Emits the alpha part of the texture function selected by m_sel.tfx.
	// Registers as used below: xmm3 = gat, xmm14 = ga, xmm0 and xmm1 = scratch.

	if(!m_sel.fb)
	{
		return;
	}

	switch(m_sel.tfx)
	{
	case TFX_MODULATE:

		// gat = gat.modulate16<1>(ga).clamp8();

		modulate16(xmm3, xmm14, 1);

		clamp16(xmm3, xmm0);

		// intentional fall through: the !tcc tail is shared with DECAL and HIGHLIGHT2

	case TFX_DECAL:
	case TFX_HIGHLIGHT2:

		// if(!tcc) gat = gat.mix16(ga.srl16(7));

		if(!m_sel.tcc)
		{
			vpsrlw(xmm1, xmm14, 7);

			mix16(xmm3, xmm1, xmm0);
		}

		break;

	case TFX_HIGHLIGHT:

		// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));

		vpsrlw(xmm1, xmm14, 7);

		if(m_sel.tcc)
		{
			vpaddusb(xmm1, xmm3);
		}

		mix16(xmm3, xmm1, xmm0);

		break;

	case TFX_NONE:

		// gat = iip ? ga.srl16(7) : ga;

		if(m_sel.iip)
		{
			vpsrlw(xmm3, xmm14, 7);
		}

		break;
	}

	// TODO: aa1
}
void GSDrawScanlineCodeGenerator::AlphaBlend()
{
    // Emits the alpha blending stage:
    //
    //   rb = (c[aba * 2 + 0] - c[abb * 2 + 0]).modulate16<1>(a) + c[abd * 2 + 0]
    //   ga = (c[aba * 2 + 1] - c[abb * 2 + 1]).modulate16<1>(a) + c[abd * 2 + 1]
    //
    // where c[0], c[1] are the source rb, ga (xmm5, xmm6), c[2], c[3] are the
    // destination rb, ga unpacked from the frame data in xmm2 (into xmm0, xmm1)
    // and selector value 2 stands for zero. On exit xmm5 = rb and xmm6 = ga.
    //
    // Nothing is emitted when the frame buffer is not written, or when both
    // abe and aa1 are off.

    if(!m_sel.fwrite)
    {
        return;
    }

    if(m_sel.abe == 0 && m_sel.aa1 == 0)
    {
        return;
    }

    // The destination color only has to be unpacked when one of the selectors
    // actually picks it (value 1). Note that && binds tighter than || here.

    if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
    {
        switch(m_sel.fpsm)
        {
        case 0:
        case 1:

            // c[2] = fd & mask;
            // c[3] = (fd >> 8) & mask;

            movdqa(xmm0, xmm2);
            movdqa(xmm1, xmm2);

            psllw(xmm0, 8);
            psrlw(xmm0, 8);
            psrlw(xmm1, 8);

            break;

        case 2:

            // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
            // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);

            movdqa(xmm0, xmm2);
            movdqa(xmm1, xmm2);
            movdqa(xmm4, xmm2);

            pcmpeqd(xmm7, xmm7);
            psrld(xmm7, 27); // 0x0000001f
            pand(xmm0, xmm7);
            pslld(xmm0, 3);

            pslld(xmm7, 10); // 0x00007c00
            pand(xmm4, xmm7);
            pslld(xmm4, 9);

            por(xmm0, xmm4);

            // xmm1 still holds the untouched frame data, reuse it to refill xmm4

            movdqa(xmm4, xmm1);

            psrld(xmm7, 5); // 0x000003e0
            pand(xmm1, xmm7);
            psrld(xmm1, 2);

            psllw(xmm7, 10); // 0x00008000
            pand(xmm4, xmm7);
            pslld(xmm4, 8);

            por(xmm1, xmm4);

            break;
        }
    }

    // xmm5, xmm6 = src rb, ga
    // xmm0, xmm1 = dst rb, ga
    // xmm2, xmm3 = used
    // xmm4, xmm7 = free

    // keep a copy of the source rb when it is still needed after xmm5 gets
    // overwritten (pabe blend, or abb/abd selecting the source)

    if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
    {
        movdqa(xmm4, xmm5);
    }

    if(m_sel.aba != m_sel.abb)
    {
        // rb = c[aba * 2 + 0];

        switch(m_sel.aba)
        {
        case 0:
            break;
        case 1:
            movdqa(xmm5, xmm0);
            break;
        case 2:
            pxor(xmm5, xmm5);
            break;
        }

        // rb = rb.sub16(c[abb * 2 + 0]);

        switch(m_sel.abb)
        {
        case 0:
            psubw(xmm5, xmm4);
            break;
        case 1:
            psubw(xmm5, xmm0);
            break;
        case 2:
            break;
        }

        if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
        {
            // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_env.afix;

            switch(m_sel.abc)
            {
            case 0:
            case 1:
                movdqa(xmm7, m_sel.abc ? xmm1 : xmm6);
                pshuflw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
                pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
                psllw(xmm7, 7);
                break;
            case 2:
                movdqa(xmm7, xmmword[&m_env.afix]);
                break;
            }

            // rb = rb.modulate16<1>(a);

            modulate16<1>(xmm5, xmm7);
        }

        // rb = rb.add16(c[abd * 2 + 0]);

        switch(m_sel.abd)
        {
        case 0:
            paddw(xmm5, xmm4);
            break;
        case 1:
            paddw(xmm5, xmm0);
            break;
        case 2:
            break;
        }
    }
    else
    {
        // aba == abb, the subtraction cancels out
        // rb = c[abd * 2 + 0];

        switch(m_sel.abd)
        {
        case 0:
            break;
        case 1:
            movdqa(xmm5, xmm0);
            break;
        case 2:
            pxor(xmm5, xmm5);
            break;
        }
    }

    if(m_sel.pabe)
    {
        // mask = (c[1] << 8).sra32(31);

        movdqa(xmm0, xmm6);
        pslld(xmm0, 8);
        psrad(xmm0, 31);

        // rb = c[0].blend8(rb, mask);
        // NOTE(review): blend8r presumably takes its mask implicitly from xmm0 - confirm in the helper

        blend8r(xmm5, xmm4);
    }

    // xmm6 = src ga
    // xmm1 = dst ga
    // xmm5 = rb
    // xmm7 = a
    // xmm2, xmm3 = used
    // xmm0, xmm4 = free

    movdqa(xmm4, xmm6);

    if(m_sel.aba != m_sel.abb)
    {
        // ga = c[aba * 2 + 1];

        switch(m_sel.aba)
        {
        case 0:
            break;
        case 1:
            movdqa(xmm6, xmm1);
            break;
        case 2:
            pxor(xmm6, xmm6);
            break;
        }

        // ga = ga.sub16(c[abb * 2 + 1]);

        switch(m_sel.abb)
        {
        case 0:
            psubw(xmm6, xmm4);
            break;
        case 1:
            psubw(xmm6, xmm1);
            break;
        case 2:
            break;
        }

        if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
        {
            // ga = ga.modulate16<1>(a);

            modulate16<1>(xmm6, xmm7);
        }

        // ga = ga.add16(c[abd * 2 + 1]);

        switch(m_sel.abd)
        {
        case 0:
            paddw(xmm6, xmm4);
            break;
        case 1:
            paddw(xmm6, xmm1);
            break;
        case 2:
            break;
        }
    }
    else
    {
        // ga = c[abd * 2 + 1];

        switch(m_sel.abd)
        {
        case 0:
            break;
        case 1:
            movdqa(xmm6, xmm1);
            break;
        case 2:
            pxor(xmm6, xmm6);
            break;
        }
    }

    // xmm4 = src ga
    // xmm5 = rb
    // xmm6 = ga
    // xmm2, xmm3 = used
    // xmm0, xmm1, xmm7 = free

    if(m_sel.pabe)
    {
        if(!m_cpu.has(util::Cpu::tSSE41))
        {
            // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)
            // rebuild the mask from the saved source ga in xmm4

            movdqa(xmm0, xmm4);
            pslld(xmm0, 8);
            psrad(xmm0, 31);
        }

        psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)

        // ga = c[1].blend8(ga, mask).mix16(c[1]);

        blend8r(xmm6, xmm4);
    }
    else
    {
        if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
        {
            mix16(xmm6, xmm4, xmm7);
        }
    }
}
void GSDrawScanlineCodeGenerator::ColorTFX()
{
    // Emits the color part of the texture function selected by m_sel.tfx.
    // Registers as used below: xmm5 = rbt, xmm6 = gat, xmm2 = ga.
    // TFX_DECAL (and any value not listed) emits nothing.

    if(!m_sel.fwrite)
    {
        return;
    }

    if(m_sel.tfx == TFX_MODULATE)
    {
        // GSVector4i rb = iip ? rbf : m_env.c.rb;

        // rbt = rbt.modulate16<1>(rb).clamp8();

        modulate16<1>(xmm5, xmmword[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]);

        clamp16(xmm5, xmm1);
    }
    else if(m_sel.tfx == TFX_HIGHLIGHT || m_sel.tfx == TFX_HIGHLIGHT2)
    {
        // ga is loaded into xmm2 here only for HIGHLIGHT2 with tcc set
        // NOTE(review): the other highlight cases presumably arrive with xmm2 already holding ga - confirm against AlphaTFX

        if(m_sel.tfx == TFX_HIGHLIGHT2 && m_sel.tcc)
        {
            // GSVector4i ga = iip ? gaf : m_env.c.ga;

            movdqa(xmm2, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
        }

        // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);

        movdqa(xmm1, xmm6);

        modulate16<1>(xmm6, xmm2);

        // xmm2 = af, built in place from words 1 and 3 of each half of ga, shifted right by 7

        pshuflw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
        pshufhw(xmm2, xmm2, _MM_SHUFFLE(3, 3, 1, 1));
        psrlw(xmm2, 7);

        paddw(xmm6, xmm2);

        clamp16(xmm6, xmm0);

        mix16(xmm6, xmm1, xmm0);

        // GSVector4i rb = iip ? rbf : m_env.c.rb;

        // rbt = rbt.modulate16<1>(rb).add16(af).clamp8();

        modulate16<1>(xmm5, xmmword[m_sel.iip ? &m_env.temp.rb : &m_env.c.rb]);

        paddw(xmm5, xmm2);

        clamp16(xmm5, xmm0);
    }
    else if(m_sel.tfx == TFX_NONE && m_sel.iip)
    {
        // rbt = iip ? rb.srl16(7) : rb;

        psrlw(xmm5, 7);
    }
}
void GSDrawScanlineCodeGenerator::AlphaTFX()
{
    // Emits the alpha part of the texture function selected by m_sel.tfx,
    // followed by the aa1 (anti-aliasing) alpha adjustment when enabled.
    // Registers as used below: xmm6 = gat, xmm4 = ga, xmm2 = copy of ga made
    // in the highlight cases, xmm0, xmm1 and xmm3 = scratch.

    if(!m_sel.fb)
    {
        return;
    }

    if(m_sel.tfx == TFX_MODULATE)
    {
        // GSVector4i ga = iip ? gaf : m_env.c.ga;

        movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);

        // gat = gat.modulate16<1>(ga).clamp8();

        modulate16<1>(xmm6, xmm4);

        clamp16(xmm6, xmm3);

        // if(!tcc) gat = gat.mix16(ga.srl16(7));

        if(!m_sel.tcc)
        {
            psrlw(xmm4, 7);

            mix16(xmm6, xmm4, xmm3);
        }
    }
    else if(m_sel.tfx == TFX_DECAL)
    {
        // if(!tcc) gat = gat.mix16(ga.srl16(7));

        if(!m_sel.tcc)
        {
            // GSVector4i ga = iip ? gaf : m_env.c.ga;

            movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);

            psrlw(xmm4, 7);

            mix16(xmm6, xmm4, xmm3);
        }
    }
    else if(m_sel.tfx == TFX_HIGHLIGHT)
    {
        // GSVector4i ga = iip ? gaf : m_env.c.ga;

        movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
        movdqa(xmm2, xmm4);

        // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));

        psrlw(xmm4, 7);

        if(m_sel.tcc)
        {
            paddusb(xmm4, xmm6);
        }

        mix16(xmm6, xmm4, xmm3);
    }
    else if(m_sel.tfx == TFX_HIGHLIGHT2)
    {
        // if(!tcc) gat = gat.mix16(ga.srl16(7));

        if(!m_sel.tcc)
        {
            // GSVector4i ga = iip ? gaf : m_env.c.ga;

            movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
            movdqa(xmm2, xmm4);

            psrlw(xmm4, 7);

            mix16(xmm6, xmm4, xmm3);
        }
    }
    else if(m_sel.tfx == TFX_NONE)
    {
        // gat = iip ? ga.srl16(7) : ga;

        if(m_sel.iip)
        {
            psrlw(xmm6, 7);
        }
    }

    if(!m_sel.aa1)
    {
        return;
    }

    // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha

    // FIXME: bios config screen cubes

    if(m_sel.abe)
    {
        // a = a == 0x80 ? cov : a

        // xmm0 = 0x0080 in every word

        pcmpeqd(xmm0, xmm0);
        psllw(xmm0, 15);
        psrlw(xmm0, 8);

        if(m_sel.edge)
        {
            movdqa(xmm1, xmmword[&m_env.temp.cov]);
        }
        else
        {
            movdqa(xmm1, xmm0);
        }

        // compare against 0x80 and keep only the high word of each dword as the mask

        pcmpeqw(xmm0, xmm6);
        psrld(xmm0, 16);
        pslld(xmm0, 16);

        blend8(xmm6, xmm1);
    }
    else
    {
        // a = cov

        if(m_sel.edge)
        {
            movdqa(xmm0, xmmword[&m_env.temp.cov]);
        }
        else
        {
            // no edge coverage, use 0x0080 in every word

            pcmpeqd(xmm0, xmm0);
            psllw(xmm0, 15);
            psrlw(xmm0, 8);
        }

        mix16(xmm6, xmm0, xmm1);
    }
}