Exemplo n.º 1
0
// Row stage of an 8-point inverse DCT (per the function name). Two independent
// 128-bit inputs are processed per call, with their instruction streams
// interleaved:
//   - "row A" enters in xmm0 and is multiplied against the table at esi
//     (16-byte slices at offsets 0, 16, 32 and 48);
//   - "row B" enters in xmm4 and is multiplied against the table at ecx
//     (same four offsets).
// For each row the 16-bit words are permuted with 0xD8 in both halves, each
// 32-bit lane is broadcast and fed to pmaddwd with one table slice,
// M128_round_inv_row is added, a sum and a difference are formed, both are
// shifted right arithmetically by 12, the difference has its dword order
// reversed (0x1B) and the two are packed back to 16-bit words with signed
// saturation.
// Results are left in xmm0 (row A) and xmm4 (row B); xmm1-xmm3 and xmm5-xmm7
// are overwritten as scratch.
// NOTE(review): pmaddwd/paddd/psubd/psrad/movdqa/packssdw and
// M128_round_inv_row are defined outside this block; the comments below assume
// each helper updates its first argument like the SSE2 instruction of the same
// name and that a pointer operand is read as a 128-bit value -- confirm.
static __forceinline void DCT_8_INV_ROW(const uint8_t * const ecx,const uint8_t * const esi,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7)
{
     // Row A: 0xD8 selects words (0,2,1,3), i.e. swaps the two middle words of the low half.
     xmm0=_mm_shufflelo_epi16(xmm0, 0xD8 );
     // Broadcast dword 0 of row A and multiply-accumulate with table slice esi+0.
     xmm1=_mm_shuffle_epi32( xmm0, 0 );
     pmaddwd (xmm1, esi);
     // Broadcast dword 1 (0x55) for table slice esi+32.
     xmm3=_mm_shuffle_epi32( xmm0, 0x55);
     // Same word swap for the high half of row A.
     xmm0=_mm_shufflehi_epi16( xmm0, 0xD8 );
     pmaddwd( xmm3, esi+32 );
     // Broadcast dword 2 (0xAA) and dword 3 (0xFF) of the permuted row A.
     xmm2=_mm_shuffle_epi32( xmm0, 0xAA );
     xmm0=_mm_shuffle_epi32( xmm0, 0xFF );
     pmaddwd( xmm2, esi+16 );
     // Row B starts here, interleaved with the remainder of row A.
     xmm4=_mm_shufflehi_epi16( xmm4, 0xD8 );
     // Rounding constant is added before the final shift by 12.
     paddd (xmm1, M128_round_inv_row);
     xmm4=_mm_shufflelo_epi16 (xmm4, 0xD8 );
     pmaddwd (xmm0, esi+48 );
     xmm5=_mm_shuffle_epi32( xmm4, 0 );
     xmm6=_mm_shuffle_epi32( xmm4, 0xAA );
     pmaddwd (xmm5, ecx );
     // Row A: xmm1 = rounded (slice 0 + slice 16) term; keep a copy in xmm2 for the difference.
     paddd (xmm1, xmm2 );
     movdqa (xmm2, xmm1 );
     xmm7=_mm_shuffle_epi32( xmm4, 0x55 );
     pmaddwd (xmm6, ecx+16 );
     // Row A: xmm0 = (slice 48 + slice 32) term.
     paddd (xmm0, xmm3 );
     xmm4=_mm_shuffle_epi32( xmm4, 0xFF );
     // Row A: xmm2 = difference of the two terms, xmm0 = their sum.
     psubd (xmm2, xmm0 );
     pmaddwd (xmm7, ecx+32 );
     paddd (xmm0, xmm1 );
     psrad (xmm2, 12 );
     paddd (xmm5, M128_round_inv_row);
     pmaddwd (xmm4, ecx+48 );
     // Row B: same sum/difference construction using xmm5/xmm6 and xmm4/xmm7.
     paddd (xmm5, xmm6 );
     movdqa (xmm6, xmm5 );
     psrad (xmm0, 12 );
     // Row A: reverse the dword order of the difference half (0x1B), then pack sum and difference.
     xmm2=_mm_shuffle_epi32( xmm2, 0x1B );
     packssdw (xmm0, xmm2 );
     paddd (xmm4, xmm7 );
     psubd (xmm6, xmm4 );
     paddd (xmm4, xmm5 );
     psrad (xmm6, 12 );
     psrad (xmm4, 12 );
     // Row B: reverse the difference half and pack; the result stays in xmm4.
     xmm6=_mm_shuffle_epi32( xmm6, 0x1B );
     packssdw (xmm4, xmm6 );
}
Exemplo n.º 2
0
int main()
{
	int rval;
	mmx_t ma;
	mmx_t mb;

	movq_r2r(mm0, mm1);

	rval = mmx_ok();

	/* Announce return value of mmx_ok() */
//	printf("Value returned from init was %x.", rval);
//	printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not");
//	fflush(stdout); fflush(stderr);

//	if(rval)
	{
		/* PADD *****************************************************/
		ma.q = 0x1111111180000000LL;
		mb.q = 0x7fffffff00000001LL;
		paddd(ma, mb);
		fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddd: mb.q is 9111111080000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddb(ma, mb);
		fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddb: mb.q is 8180000281800002\n");
		fflush(stdout); fflush(stderr);


		/* PADDS ****************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n");

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n");

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddsb(ma, mb);
		fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n");
		fflush(stdout); fflush(stderr);


		/* PADDUS ***************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddusb(ma, mb);
		fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n");
		fflush(stdout); fflush(stderr);


		/* PSUB *****************************************************/
		ma.q = 0x7fffffff00000001LL;
		mb.q = 0x1111111180000000LL;
		psubd(ma, mb);
		fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubd: mb.q is 911111127fffffff\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 8001800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubb(ma, mb);
		fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PSUBS ****************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubsb(ma, mb);
		fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n");
		fflush(stdout); fflush(stderr);
 

		/* PSUBUS ***************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubusb(ma, mb);
		fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PMUL *****************************************************/
		ma.q = 0x8000ffff00ff0000LL;
		mb.q = 0x0200ffff00ffffffLL;
		pmulhw(ma, mb);
		fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0200ffff00ffffffLL;
		pmullw(ma, mb);
		fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n");
		fflush(stdout); fflush(stderr);


		/* PMADD ****************************************************/
		ma.q = 0x8000345680007f34LL;
		mb.q = 0x93234a27ffff1707LL;

		pmaddwd(ma, mb);
		fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n");
		fflush(stdout); fflush(stderr);


		/* PCMPEQ ***************************************************/
		ma.q = 0x800034568f237f34LL;
		mb.q = 0x93009a568f237f34LL;

		pcmpeqd(ma, mb);
		fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqw(ma, mb);
		fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqb(ma, mb);
		fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n");
		fflush(stdout); fflush(stderr);



		/* PCMPGT ***************************************************/
		ma.q = 0x666688884477aaffLL;
		mb.q = 0x1234567890abcdefLL;

		pcmpgtd(ma, mb);
		fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtw(ma, mb);
		fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtb(ma, mb);
		fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n");
		fflush(stdout); fflush(stderr);


		/* PACKSS ***************************************************/
		ma.q = 0x00012222000abbbbLL;
		mb.q = 0x0000888800003333LL;

		packssdw(ma, mb);
		fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packsswb(ma, mb);
		fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n");
		fflush(stdout); fflush(stderr);


		/* PACKUS ***************************************************/
		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packuswb(ma, mb);
		fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKH **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckhdq(ma, mb);
		fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhwd(ma, mb);
		fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhbw(ma, mb);
		fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKL **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckldq(ma, mb);
		fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklwd(ma, mb);
		fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklbw(ma, mb);
		fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n");
		fflush(stdout); fflush(stderr);



		/* PAND, PANDN, POR, PXOR ***********************************/
		ma.q = 0x5555555555555555LL;
		mb.q = 0x3333333333333333LL;

		pand(ma, mb);
		fprintf(stdout, "pand: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pand: mb.q is 1111111111111111\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pandn(ma, mb);
		fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pandn: mb.q is 4444444444444444\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		por(ma, mb);
		fprintf(stdout, "por: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "por: mb.q is 7777777777777777\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pxor(ma, mb);
		fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pxor: mb.q is 6666666666666666\n");
		fflush(stdout); fflush(stderr);



		/* PSLL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psllq(ma, mb);
		fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		pslld(ma, mb);
		fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pslld: mb.q is 67000000ef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psllw(ma, mb);
		fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrlq(ma, mb);
		fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlq: mb.q is 0000000123456789\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrld(ma, mb);
		fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrld: mb.q is 0000000100000089\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrlw(ma, mb);
		fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRA *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrad(ma, mb);
		fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psraw(ma, mb);
		fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		/* Exit MXX *************************************************/
		emms();
	}

	/* Clean-up and exit nicely */
	exit(0);
}
Exemplo n.º 3
0
void GSSetupPrimCodeGenerator::Color()
{
	if(!m_en.c)
	{
		return;
	}

	if(m_env.sel.iip)
	{
		// GSVector4 c = dscan.c;

		movaps(xmm0, xmmword[edx]);
		movaps(xmm1, xmm0);

		// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();

		movaps(xmm2, xmm0);
		mulps(xmm2, xmm3);
		cvttps2dq(xmm2, xmm2);
		pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
		packssdw(xmm2, xmm2);
		movdqa(xmmword[&m_env.d4.c], xmm2);

		// xmm3 is not needed anymore

		// GSVector4 dr = c.xxxx();
		// GSVector4 db = c.zzzz();

		shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
		shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));

		for(int i = 0; i < 4; i++)
		{
			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();

			movaps(xmm2, xmm0);
			mulps(xmm2, Xmm(4 + i));
			cvttps2dq(xmm2, xmm2);
			packssdw(xmm2, xmm2);

			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();

			movaps(xmm3, xmm1);
			mulps(xmm3, Xmm(4 + i));
			cvttps2dq(xmm3, xmm3);
			packssdw(xmm3, xmm3);

			// m_env.d[i].rb = r.upl16(b);

			punpcklwd(xmm2, xmm3);
			movdqa(xmmword[&m_env.d[i].rb], xmm2);
		}

		// GSVector4 c = dscan.c;

		movaps(xmm0, xmmword[edx]); // not enough regs, have to reload it
		movaps(xmm1, xmm0);

		// GSVector4 dg = c.yyyy();
		// GSVector4 da = c.wwww();

		shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
		shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

		for(int i = 0; i < 4; i++)
		{
			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

			movaps(xmm2, xmm0);
			mulps(xmm2, Xmm(4 + i));
			cvttps2dq(xmm2, xmm2);
			packssdw(xmm2, xmm2);

			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();

			movaps(xmm3, xmm1);
			mulps(xmm3, Xmm(4 + i));
			cvttps2dq(xmm3, xmm3);
			packssdw(xmm3, xmm3);

			// m_env.d[i].ga = g.upl16(a);

			punpcklwd(xmm2, xmm3);
			movdqa(xmmword[&m_env.d[i].ga], xmm2);
		}
	}
	else
	{
		// GSVector4i c = GSVector4i(vertices[0].c);

		movaps(xmm0, xmmword[ecx]);
		cvttps2dq(xmm0, xmm0);

		// c = c.upl16(c.zwxy());

		movdqa(xmm1, xmm0);
		pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2));
		punpcklwd(xmm0, xmm1);

		// if(!tme) c = c.srl16(7);

		if(m_env.sel.tfx == TFX_NONE)
		{
			psrlw(xmm0, 7);
		}

		// m_env.c.rb = c.xxxx();
		// m_env.c.ga = c.zzzz();

		movdqa(xmm1, xmm0);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
		pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
		movdqa(xmmword[&m_env.c.rb], xmm0);
		movdqa(xmmword[&m_env.c.ga], xmm1);
	}
}
// Emits the texture sampling code for the scanline loop.
//
// Nothing is emitted when m_sel.fb is clear or m_sel.tfx == TFX_NONE.
// Otherwise the emitted code loads m_env.tex into ebx, turns the u/v values in
// xmm2/xmm3 into texel addresses (wrapping them through Wrap and shifting the
// row part left by m_env.tw), fetches texels through ReadTexel and leaves the
// result split into two 16-bit-per-channel registers: xmm5 = rb, xmm6 = ga.
//
// With m_sel.ltf set, four texels (c00, c01, c10, c11) are fetched and blended
// with lerp16 using the fractions stored in m_env.temp.uf / m_env.temp.vf;
// without it a single texel is fetched.
//
// The register-usage comments between the steps are the allocation plan; the
// statement order is the emitted instruction order and must not be changed.
void GSDrawScanlineCodeGenerator::SampleTexture()
{
    if(!m_sel.fb || m_sel.tfx == TFX_NONE)
    {
        return;
    }

    mov(ebx, dword[&m_env.tex]);

    // ebx = tex

    if(!m_sel.fst)
    {
        // TODO: move these into Init/Step too?

        // u and v are converted to integers here only when m_sel.fst is clear.

        cvttps2dq(xmm2, xmm2);
        cvttps2dq(xmm3, xmm3);

        if(m_sel.ltf)
        {
            // u -= 0x8000;
            // v -= 0x8000;

            mov(eax, 0x8000);
            movd(xmm4, eax);
            pshufd(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
            psubd(xmm2, xmm4);
            psubd(xmm3, xmm4);
        }
    }

    // xmm2 = u
    // xmm3 = v

    if(m_sel.ltf)
    {
        // GSVector4i uf = u.xxzzlh().srl16(1);

        movdqa(xmm0, xmm2);
        pshuflw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
        pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
        psrlw(xmm0, 1);
        movdqa(xmmword[&m_env.temp.uf], xmm0);

        // NOTE(review): vf is stored here only when m_sel.sprite is clear, yet
        // the ltf path below always loads m_env.temp.vf; presumably it is
        // written elsewhere for sprites -- confirm.

        if(!m_sel.sprite)
        {
            // GSVector4i vf = v.xxzzlh().srl16(1);

            movdqa(xmm1, xmm3);
            pshuflw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
            pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 0, 0));
            psrlw(xmm1, 1);
            movdqa(xmmword[&m_env.temp.vf], xmm1);
        }
    }

    // GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));

    psrad(xmm2, 16);
    psrad(xmm3, 16);
    packssdw(xmm2, xmm3);

    if(m_sel.ltf)
    {
        // GSVector4i uv1 = uv0.add16(GSVector4i::x0001());

        // pcmpeqd sets every bit of xmm1; psrlw by 15 leaves 0x0001 in each word.

        movdqa(xmm3, xmm2);
        pcmpeqd(xmm1, xmm1);
        psrlw(xmm1, 15);
        paddw(xmm3, xmm1);

        // uv0 = Wrap(uv0);
        // uv1 = Wrap(uv1);

        Wrap(xmm2, xmm3);
    }
    else
    {
        // uv0 = Wrap(uv0);

        Wrap(xmm2);
    }

    // xmm2 = uv0
    // xmm3 = uv1 (ltf)
    // xmm0, xmm1, xmm4, xmm5, xmm6 = free
    // xmm7 = used

    // GSVector4i y0 = uv0.uph16() << tw;
    // GSVector4i x0 = uv0.upl16();

    pxor(xmm0, xmm0);
    movd(xmm1, ptr[&m_env.tw]);

    movdqa(xmm4, xmm2);
    punpckhwd(xmm2, xmm0);
    punpcklwd(xmm4, xmm0);
    pslld(xmm2, xmm1);

    // xmm0 = 0
    // xmm1 = tw
    // xmm2 = y0
    // xmm3 = uv1 (ltf)
    // xmm4 = x0
    // xmm5, xmm6 = free
    // xmm7 = used

    if(m_sel.ltf)
    {
        // GSVector4i y1 = uv1.uph16() << tw;
        // GSVector4i x1 = uv1.upl16();

        movdqa(xmm6, xmm3);
        punpckhwd(xmm3, xmm0);
        punpcklwd(xmm6, xmm0);
        pslld(xmm3, xmm1);

        // xmm2 = y0
        // xmm3 = y1
        // xmm4 = x0
        // xmm6 = x1
        // xmm0, xmm1, xmm5 = free
        // xmm7 = used

        // GSVector4i addr00 = y0 + x0;
        // GSVector4i addr01 = y0 + x1;
        // GSVector4i addr10 = y1 + x0;
        // GSVector4i addr11 = y1 + x1;

        movdqa(xmm5, xmm2);
        paddd(xmm5, xmm4);
        paddd(xmm2, xmm6);

        movdqa(xmm0, xmm3);
        paddd(xmm0, xmm4);
        paddd(xmm3, xmm6);

        // xmm5 = addr00
        // xmm2 = addr01
        // xmm0 = addr10
        // xmm3 = addr11
        // xmm1, xmm4, xmm6 = free
        // xmm7 = used

        // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
        // c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
        // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
        // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);

        // NOTE(review): ReadTexel is defined elsewhere; from the usage here its
        // arguments appear to be (destination, address, temp, temp) -- confirm.

        ReadTexel(xmm6, xmm5, xmm1, xmm4);

        // xmm2, xmm5, xmm1 = free

        ReadTexel(xmm4, xmm2, xmm5, xmm1);

        // xmm0, xmm2, xmm5 = free

        ReadTexel(xmm1, xmm0, xmm2, xmm5);

        // xmm3, xmm0, xmm2 = free

        ReadTexel(xmm5, xmm3, xmm0, xmm2);

        // xmm6 = c00
        // xmm4 = c01
        // xmm1 = c10
        // xmm5 = c11
        // xmm0, xmm2, xmm3 = free
        // xmm7 = used

        movdqa(xmm0, xmmword[&m_env.temp.uf]);

        // GSVector4i rb00 = c00 & mask;
        // GSVector4i ga00 = (c00 >> 8) & mask;

        // The "& mask" is done with shifts: psllw 8 then psrlw 8 keeps the low
        // byte of every word, psrlw 8 alone keeps the high byte.

        movdqa(xmm2, xmm6);
        psllw(xmm2, 8);
        psrlw(xmm2, 8);
        psrlw(xmm6, 8);

        // GSVector4i rb01 = c01 & mask;
        // GSVector4i ga01 = (c01 >> 8) & mask;

        movdqa(xmm3, xmm4);
        psllw(xmm3, 8);
        psrlw(xmm3, 8);
        psrlw(xmm4, 8);

        // xmm0 = uf
        // xmm2 = rb00
        // xmm3 = rb01
        // xmm6 = ga00
        // xmm4 = ga01
        // xmm1 = c10
        // xmm5 = c11
        // xmm7 = used

        // rb00 = rb00.lerp16<0>(rb01, uf);
        // ga00 = ga00.lerp16<0>(ga01, uf);

        lerp16<0>(xmm3, xmm2, xmm0);
        lerp16<0>(xmm4, xmm6, xmm0);

        // xmm0 = uf
        // xmm3 = rb00
        // xmm4 = ga00
        // xmm1 = c10
        // xmm5 = c11
        // xmm2, xmm6 = free
        // xmm7 = used

        // GSVector4i rb10 = c10 & mask;
        // GSVector4i ga10 = (c10 >> 8) & mask;

        movdqa(xmm2, xmm1);
        psllw(xmm1, 8);
        psrlw(xmm1, 8);
        psrlw(xmm2, 8);

        // GSVector4i rb11 = c11 & mask;
        // GSVector4i ga11 = (c11 >> 8) & mask;

        movdqa(xmm6, xmm5);
        psllw(xmm5, 8);
        psrlw(xmm5, 8);
        psrlw(xmm6, 8);

        // xmm0 = uf
        // xmm3 = rb00
        // xmm4 = ga00
        // xmm1 = rb10
        // xmm5 = rb11
        // xmm2 = ga10
        // xmm6 = ga11
        // xmm7 = used

        // rb10 = rb10.lerp16<0>(rb11, uf);
        // ga10 = ga10.lerp16<0>(ga11, uf);

        lerp16<0>(xmm5, xmm1, xmm0);
        lerp16<0>(xmm6, xmm2, xmm0);

        // xmm3 = rb00
        // xmm4 = ga00
        // xmm5 = rb10
        // xmm6 = ga10
        // xmm0, xmm1, xmm2 = free
        // xmm7 = used

        // rb00 = rb00.lerp16<0>(rb10, vf);
        // ga00 = ga00.lerp16<0>(ga10, vf);

        movdqa(xmm0, xmmword[&m_env.temp.vf]);

        // The final blend leaves rb in xmm5 and ga in xmm6.

        lerp16<0>(xmm5, xmm3, xmm0);
        lerp16<0>(xmm6, xmm4, xmm0);
    }
    else
    {
        // GSVector4i addr00 = y0 + x0;

        paddd(xmm2, xmm4);

        // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);

        ReadTexel(xmm5, xmm2, xmm0, xmm1);

        // GSVector4i mask = GSVector4i::x00ff();

        // c[0] = c00 & mask;
        // c[1] = (c00 >> 8) & mask;

        // Result: xmm5 = low byte of each word, xmm6 = high byte of each word.

        movdqa(xmm6, xmm5);

        psllw(xmm5, 8);
        psrlw(xmm5, 8);
        psrlw(xmm6, 8);
    }
}
// Emits the complete scanline drawing routine.
//
// Layout of the generated code:
//   prologue : push ebx, esi, edi, ebp, then Init(params)
//   "loop"   : TestZ, SampleTexture, AlphaTFX, load fm/zm, TestAlpha, ColorTFX,
//              Fog, ReadFrame, TestDestAlpha, build the fzm write mask in edx,
//              WriteZBuf, AlphaBlend, WriteFrame(params)
//   "step"   : unless m_sel.edge is set, exit when ecx <= 0, otherwise Step()
//              and jump back to "loop"
//   "exit"   : pop the four saved registers and ret(8)
//
// The register comments between the stages record which registers hold live
// values at that point; each stage helper is defined elsewhere and emits its
// own instructions, so the call order here is the emitted code order.
void GSDrawScanlineCodeGenerator::Generate()
{
    push(ebx);
    push(esi);
    push(edi);
    push(ebp);

    // NOTE(review): presumably the byte size of the four registers pushed
    // above (4 * 4), used by Init/WriteFrame to locate stack arguments -- confirm.
    const int params = 16;

    Init(params);

    // The loop head is aligned only when a loop is actually emitted (see "step").
    if(!m_sel.edge)
    {
        align(16);
    }

    L("loop");

    // ecx = steps
    // esi = fzbr
    // edi = fzbc
    // xmm0 = z/zi
    // xmm2 = u (tme)
    // xmm3 = v (tme)
    // xmm5 = rb (!tme)
    // xmm6 = ga (!tme)
    // xmm7 = test

    bool tme = m_sel.tfx != TFX_NONE;

    // TestZ is handed the register pair that is not live at this point:
    // xmm5/xmm6 when texturing (u/v occupy xmm2/xmm3), xmm2/xmm3 otherwise.
    TestZ(tme ? xmm5 : xmm2, tme ? xmm6 : xmm3);

    // ecx = steps
    // esi = fzbr
    // edi = fzbc
    // - xmm0
    // xmm2 = u (tme)
    // xmm3 = v (tme)
    // xmm5 = rb (!tme)
    // xmm6 = ga (!tme)
    // xmm7 = test

    SampleTexture();

    // ecx = steps
    // esi = fzbr
    // edi = fzbc
    // ebp = za
    // - xmm2
    // - xmm3
    // - xmm4
    // xmm5 = rb
    // xmm6 = ga
    // xmm7 = test

    AlphaTFX();

    // ecx = steps
    // esi = fzbr
    // edi = fzbc
    // ebp = za
    // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
    // xmm5 = rb
    // xmm6 = ga
    // xmm7 = test

    // The frame and z masks are loaded only when the corresponding write is enabled.
    if(m_sel.fwrite)
    {
        movdqa(xmm3, xmmword[&m_env.fm]);
    }

    if(m_sel.zwrite)
    {
        movdqa(xmm4, xmmword[&m_env.zm]);
    }

    // ecx = steps
    // esi = fzbr
    // edi = fzbc
    // ebp = za
    // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
    // xmm3 = fm
    // xmm4 = zm
    // xmm5 = rb
    // xmm6 = ga
    // xmm7 = test

    TestAlpha();

    // ecx = steps
    // esi = fzbr
    // edi = fzbc
    // ebp = za
    // xmm2 = gaf (TFX_HIGHLIGHT || TFX_HIGHLIGHT2 && !tcc)
    // xmm3 = fm
    // xmm4 = zm
    // xmm5 = rb
    // xmm6 = ga
    // xmm7 = test

    ColorTFX();

    // ecx = steps
    // esi = fzbr
    // edi = fzbc
    // ebp = za
    // xmm3 = fm
    // xmm4 = zm
    // xmm5 = rb
    // xmm6 = ga
    // xmm7 = test

    Fog();

    // ecx = steps
    // esi = fzbr
    // edi = fzbc
    // ebp = za
    // xmm3 = fm
    // xmm4 = zm
    // xmm5 = rb
    // xmm6 = ga
    // xmm7 = test

    ReadFrame();

    // ecx = steps
    // esi = fzbr
    // edi = fzbc
    // ebp = za
    // xmm2 = fd
    // xmm3 = fm
    // xmm4 = zm
    // xmm5 = rb
    // xmm6 = ga
    // xmm7 = test

    TestDestAlpha();

    // fm |= test;
    // zm |= test;

    if(m_sel.fwrite)
    {
        por(xmm3, xmm7);
    }

    if(m_sel.zwrite)
    {
        por(xmm4, xmm7);
    }

    // int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();

    // xmm1 = all ones, the comparison value for "mask fully set".
    pcmpeqd(xmm1, xmm1);

    if(m_sel.fwrite && m_sel.zwrite)
    {
        // Both masks: frame comparison in the low half, z comparison in the high half.
        movdqa(xmm0, xmm1);
        pcmpeqd(xmm1, xmm3);
        pcmpeqd(xmm0, xmm4);
        packssdw(xmm1, xmm0);
    }
    else if(m_sel.fwrite)
    {
        pcmpeqd(xmm1, xmm3);
        packssdw(xmm1, xmm1);
    }
    else if(m_sel.zwrite)
    {
        pcmpeqd(xmm1, xmm4);
        packssdw(xmm1, xmm1);
    }

    // With neither write enabled xmm1 stays all ones, so the inverted mask has
    // its low 16 bits clear.
    pmovmskb(edx, xmm1);
    not(edx);

    // ebx = fa
    // ecx = steps
    // edx = fzm
    // esi = fzbr
    // edi = fzbc
    // ebp = za
    // xmm2 = fd
    // xmm3 = fm
    // xmm4 = zm
    // xmm5 = rb
    // xmm6 = ga

    WriteZBuf();

    // ebx = fa
    // ecx = steps
    // edx = fzm
    // esi = fzbr
    // edi = fzbc
    // - ebp
    // xmm2 = fd
    // xmm3 = fm
    // - xmm4
    // xmm5 = rb
    // xmm6 = ga

    AlphaBlend();

    // ebx = fa
    // ecx = steps
    // edx = fzm
    // esi = fzbr
    // edi = fzbc
    // xmm2 = fd
    // xmm3 = fm
    // xmm5 = rb
    // xmm6 = ga

    WriteFrame(params);

    L("step");

    // if(steps <= 0) break;

    // With m_sel.edge set no loop is emitted: execution falls through to "exit".
    if(!m_sel.edge)
    {
        test(ecx, ecx);
        jle("exit", T_NEAR);

        Step();

        jmp("loop", T_NEAR);
    }

    L("exit");

    // Restore the saved registers in reverse push order.
    pop(ebp);
    pop(edi);
    pop(esi);
    pop(ebx);

    // Return and release 8 bytes of caller-pushed stack arguments.
    ret(8);
}