// Emits the texture-sampling stage of the generated scanline routine.
//
// Nothing is emitted when texturing is off (m_sel.tme == 0). Otherwise the
// emitted code:
//   - takes the texture coordinates from xmm2 (s) and xmm3 (t), 16-bit lanes
//     with the texel index in the high byte and the fraction in the low byte,
//   - applies either the mask/offset pair twin[0]/twin[1] (m_sel.twin) or the
//     upper bound twin[2] to the integer coordinates,
//   - builds texel addresses as (v << 8) | u and fetches them via ReadTexel,
//   - with m_sel.ltf set, fetches four neighbouring texels and blends their
//     channels with lerp16 using the fractions stored in temp.uf / temp.vf,
//   - ORs into xmm7 (test) a mask of the lanes whose texel is zero.
//
// Register contract, as described by the notes inside the body:
//   in : xmm2 = s, xmm3 = t, xmm7 = test, xmm1 = live value to be preserved
//   out: xmm4 = r, xmm5 = g, xmm6 = b, xmm3 = a, xmm7 = updated test,
//        xmm1 unchanged (spilled to temp.fd and reloaded in the ltf path)
void GPUDrawScanlineCodeGenerator::SampleTexture()
{
	if(!m_sel.tme)
	{
		return;
	}

	if(m_sel.tlu)
	{
		// edx = clut pointer; not used again in this function
		// (presumably consumed by ReadTexel for the palette lookup - confirm)
		mov(edx, ptr[&m_local.gd->clut]);
	}

	// xmm2 = s
	// xmm3 = t
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f
		// GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f

		// broadcast the 16-bit constant 0x0020 to every lane of xmm0
		mov(eax, 0x00200020);
		movd(xmm0, eax);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));

		psubw(xmm2, xmm0);
		psubw(xmm3, xmm0);

		// GSVector4i uf = (u & GSVector4i::x00ff()) << 7;
		// GSVector4i vf = (v & GSVector4i::x00ff()) << 7;

		// shift left 8 then right 1: drops the high byte and leaves the low
		// byte at bits 7..14, which equals (x & 0x00ff) << 7 without a mask
		movdqa(xmm0, xmm2);
		psllw(xmm0, 8);
		psrlw(xmm0, 1);
		movdqa(ptr[&m_local.temp.uf], xmm0);

		// NOTE(review): temp.vf is not written here for sprites although it is
		// read by the lerp16 calls below - presumably it is set up elsewhere
		// for that case; confirm
		if(!m_sel.sprite)
		{
			movdqa(xmm0, xmm3);
			psllw(xmm0, 8);
			psrlw(xmm0, 1);
			movdqa(ptr[&m_local.temp.vf], xmm0);
		}
	}

	// GSVector4i u0 = s.srl16(8);
	// GSVector4i v0 = t.srl16(8);

	// keep only the integer part (high byte) of each coordinate
	psrlw(xmm2, 8);
	psrlw(xmm3, 8);

	// xmm2 = u
	// xmm3 = v
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u1 = u0.add16(GSVector4i::x0001());
		// GSVector4i v1 = v0.add16(GSVector4i::x0001());

		movdqa(xmm4, xmm2);
		movdqa(xmm5, xmm3);

		// all ones >> 15 = 0x0001 in every 16-bit lane
		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 15);
		paddw(xmm4, xmm0);
		paddw(xmm5, xmm0);

		if(m_sel.twin)
		{
			// u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
			// u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);

			// load mask/offset once per axis and apply to both neighbours
			movdqa(xmm0, ptr[&m_local.twin[0].u]);
			movdqa(xmm6, ptr[&m_local.twin[1].u]);

			pand(xmm2, xmm0);
			paddw(xmm2, xmm6);
			pand(xmm4, xmm0);
			paddw(xmm4, xmm6);

			movdqa(xmm0, ptr[&m_local.twin[0].v]);
			movdqa(xmm6, ptr[&m_local.twin[1].v]);

			pand(xmm3, xmm0);
			paddw(xmm3, xmm6);
			pand(xmm5, xmm0);
			paddw(xmm5, xmm6);
		}
		else
		{
			// u0 = u0.min_i16(m_local.twin[2].u);
			// v0 = v0.min_i16(m_local.twin[2].v);
			// u1 = u1.min_i16(m_local.twin[2].u);
			// v1 = v1.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			// only the upper bound is enforced here (signed 16-bit minimum)
			movdqa(xmm0, ptr[&m_local.twin[2].u]);
			movdqa(xmm6, ptr[&m_local.twin[2].v]);

			pminsw(xmm2, xmm0);
			pminsw(xmm3, xmm6);
			pminsw(xmm4, xmm0);
			pminsw(xmm5, xmm6);
		}

		// xmm2 = u0
		// xmm3 = v0
		// xmm4 = u1
		// xmm5 = v1
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// GSVector4i addr00 = v0.sll16(8) | u0;
		// GSVector4i addr01 = v0.sll16(8) | u1;
		// GSVector4i addr10 = v1.sll16(8) | u0;
		// GSVector4i addr11 = v1.sll16(8) | u1;

		// each shifted row value is copied before the OR so it can be
		// combined with both u0 and u1
		psllw(xmm3, 8);
		movdqa(xmm0, xmm3);
		por(xmm3, xmm2);
		por(xmm0, xmm4);

		psllw(xmm5, 8);
		movdqa(xmm6, xmm5);
		por(xmm5, xmm2);
		por(xmm6, xmm4);

		// xmm3 = addr00
		// xmm0 = addr01
		// xmm5 = addr10
		// xmm6 = addr11
		// xmm7 = test
		// xmm2, xmm4 = free
		// xmm1 = used

		// the call order matters: xmm3 and xmm5 are reused as destinations
		// only after their own address has been consumed
		ReadTexel(xmm2, xmm3);
		ReadTexel(xmm4, xmm0);
		ReadTexel(xmm3, xmm5);
		ReadTexel(xmm5, xmm6);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// spill (TODO)

		// the blend below needs all eight registers, so xmm1 and the test
		// mask are parked in memory and reloaded at the end of this branch
		movdqa(ptr[&m_local.temp.fd], xmm1);
		movdqa(ptr[&m_local.temp.test], xmm7);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm1, xmm6, xmm7 = free

		// Channel extraction used throughout this branch (per 16-bit texel):
		//   << 11, >> 8          -> (c & 0x1f) << 3          (bits 0..4,   r)
		//   << 6, >> 11, << 3    -> ((c >> 5) & 0x1f) << 3   (bits 5..9,   g)
		//   << 1, >> 11, << 3    -> ((c >> 10) & 0x1f) << 3  (bits 10..14, b)
		//   sra 15, srl 8        -> bit 15 ? 0x00ff : 0      (a)
		// The register notes below imply that lerp16 leaves its result in the
		// first operand (semantics of lerp16 are defined elsewhere - confirm).

		// r of c00 / c01, blended with uf
		movdqa(xmm1, xmm2);
		psllw(xmm1, 11);
		psrlw(xmm1, 8);

		movdqa(xmm0, xmm4);
		psllw(xmm0, 11);
		psrlw(xmm0, 8);

		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]);

		// g of c00 / c01, blended with uf
		movdqa(xmm6, xmm2);
		psllw(xmm6, 6);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		movdqa(xmm1, xmm4);
		psllw(xmm1, 6);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]);

		// b of c00 / c01, blended with uf
		movdqa(xmm7, xmm2);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm6, xmm4);
		psllw(xmm6, 1);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]);

		// a of c00 / c01 (done in place, c00 and c01 are dead afterwards)
		psraw(xmm2, 15);
		psrlw(xmm2, 8);
		psraw(xmm4, 15);
		psrlw(xmm4, 8);

		lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]);

		// xmm0 = r00
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm2, xmm7 = free

		// r of c10 / c11: blend with uf, then with the first-row result by vf
		movdqa(xmm7, xmm3);
		psllw(xmm7, 11);
		psrlw(xmm7, 8);

		movdqa(xmm2, xmm5);
		psllw(xmm2, 11);
		psrlw(xmm2, 8);

		lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm7 = free

		// g of c10 / c11
		movdqa(xmm7, xmm3);
		psllw(xmm7, 6);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm0, xmm5);
		psllw(xmm0, 6);
		psrlw(xmm0, 11);
		psllw(xmm0, 3);

		lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm1, xmm7 = free

		// b of c10 / c11
		movdqa(xmm7, xmm3);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm1, xmm5);
		psllw(xmm1, 1);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm6, xmm7 = free

		// a of c10 / c11 (in place)
		psraw(xmm3, 15);
		psrlw(xmm3, 8);
		psraw(xmm5, 15);
		psrlw(xmm5, 8);

		lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm5 = a
		// xmm3, xmm4, xmm6, xmm7 = free

		// TODO
		// shuffle the results into the same registers the unfiltered branch
		// produces; xmm5 (a) must be copied out before it is overwritten by g
		movdqa(xmm3, xmm5); // a
		movdqa(xmm4, xmm2); // r
		movdqa(xmm6, xmm1); // b
		movdqa(xmm5, xmm0); // g

		// reload test

		movdqa(xmm7, ptr[&m_local.temp.test]);

		// xmm4 = r
		// xmm5 = g
		// xmm6 = b
		// xmm3 = a
		// xmm7 = test
		// xmm0, xmm1, xmm2 = free

		// test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect)

		movdqa(xmm1, xmm3);
		por(xmm1, xmm4);
		movdqa(xmm2, xmm5);
		por(xmm2, xmm6);
		por(xmm1, xmm2);

		// xmm0 stays zero and is reused by the alpha compare below
		pxor(xmm0, xmm0);
		pcmpeqw(xmm1, xmm0);
		por(xmm7, xmm1);

		// a = a.gt16(GSVector4i::zero());

		pcmpgtw(xmm3, xmm0);

		// reload fd

		movdqa(xmm1, ptr[&m_local.temp.fd]);
	}
	else
	{
		if(m_sel.twin)
		{
			// u = (u & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v = (v & m_local.twin[0].v).add16(m_local.twin[1].v);

			pand(xmm2, ptr[&m_local.twin[0].u]);
			paddw(xmm2, ptr[&m_local.twin[1].u]);
			pand(xmm3, ptr[&m_local.twin[0].v]);
			paddw(xmm3, ptr[&m_local.twin[1].v]);
		}
		else
		{
			// u = u.min_i16(m_local.twin[2].u);
			// v = v.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			pminsw(xmm2, ptr[&m_local.twin[2].u]);
			pminsw(xmm3, ptr[&m_local.twin[2].v]);
		}

		// xmm2 = u
		// xmm3 = v
		// xmm7 = test
		// xmm0, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		// GSVector4i addr = v.sll16(8) | u;

		psllw(xmm3, 8);
		por(xmm3, xmm2);

		// xmm3 = addr
		// xmm7 = test
		// xmm0, xmm2, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		ReadTexel(xmm6, xmm3);

		// xmm6 = c00
		// xmm7 = test
		// xmm0, xmm2, xmm3, xmm4, xmm5 = free
		// xmm1 = used

		// test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels

		pxor(xmm0, xmm0);
		pcmpeqw(xmm0, xmm6);
		por(xmm7, xmm0);

		// c[0] = (c00 << 3) & 0x00f800f8;
		// c[1] = (c00 >> 2) & 0x00f800f8;
		// c[2] = (c00 >> 7) & 0x00f800f8;
		// c[3] = c00.sra16(15);

		// arithmetic shift replicates bit 15 across the lane: 0xffff or 0
		movdqa(xmm3, xmm6);
		psraw(xmm3, 15); // a

		// all ones >> 11 = 0x001f, << 3 = 0x00f8 in every 16-bit lane
		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 11);
		psllw(xmm0, 3); // 0x00f8

		movdqa(xmm4, xmm6);
		psllw(xmm4, 3);
		pand(xmm4, xmm0); // r

		movdqa(xmm5, xmm6);
		psrlw(xmm5, 2);
		pand(xmm5, xmm0); // g

		// c00 is no longer needed, so blue is extracted in place
		psrlw(xmm6, 7);
		pand(xmm6, xmm0); // b
	}
}
// Example #2
// 0
int main()
{
	int rval;
	mmx_t ma;
	mmx_t mb;

	movq_r2r(mm0, mm1);

	rval = mmx_ok();

	/* Announce return value of mmx_ok() */
//	printf("Value returned from init was %x.", rval);
//	printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not");
//	fflush(stdout); fflush(stderr);

//	if(rval)
	{
		/* PADD *****************************************************/
		ma.q = 0x1111111180000000LL;
		mb.q = 0x7fffffff00000001LL;
		paddd(ma, mb);
		fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddd: mb.q is 9111111080000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddb(ma, mb);
		fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddb: mb.q is 8180000281800002\n");
		fflush(stdout); fflush(stderr);


		/* PADDS ****************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n");

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n");

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddsb(ma, mb);
		fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n");
		fflush(stdout); fflush(stderr);


		/* PADDUS ***************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddusb(ma, mb);
		fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n");
		fflush(stdout); fflush(stderr);


		/* PSUB *****************************************************/
		ma.q = 0x7fffffff00000001LL;
		mb.q = 0x1111111180000000LL;
		psubd(ma, mb);
		fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubd: mb.q is 911111127fffffff\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 8001800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubb(ma, mb);
		fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PSUBS ****************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubsb(ma, mb);
		fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n");
		fflush(stdout); fflush(stderr);
 

		/* PSUBUS ***************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubusb(ma, mb);
		fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PMUL *****************************************************/
		ma.q = 0x8000ffff00ff0000LL;
		mb.q = 0x0200ffff00ffffffLL;
		pmulhw(ma, mb);
		fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0200ffff00ffffffLL;
		pmullw(ma, mb);
		fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n");
		fflush(stdout); fflush(stderr);


		/* PMADD ****************************************************/
		ma.q = 0x8000345680007f34LL;
		mb.q = 0x93234a27ffff1707LL;

		pmaddwd(ma, mb);
		fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n");
		fflush(stdout); fflush(stderr);


		/* PCMPEQ ***************************************************/
		ma.q = 0x800034568f237f34LL;
		mb.q = 0x93009a568f237f34LL;

		pcmpeqd(ma, mb);
		fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqw(ma, mb);
		fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqb(ma, mb);
		fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n");
		fflush(stdout); fflush(stderr);



		/* PCMPGT ***************************************************/
		ma.q = 0x666688884477aaffLL;
		mb.q = 0x1234567890abcdefLL;

		pcmpgtd(ma, mb);
		fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtw(ma, mb);
		fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtb(ma, mb);
		fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n");
		fflush(stdout); fflush(stderr);


		/* PACKSS ***************************************************/
		ma.q = 0x00012222000abbbbLL;
		mb.q = 0x0000888800003333LL;

		packssdw(ma, mb);
		fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packsswb(ma, mb);
		fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n");
		fflush(stdout); fflush(stderr);


		/* PACKUS ***************************************************/
		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packuswb(ma, mb);
		fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKH **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckhdq(ma, mb);
		fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhwd(ma, mb);
		fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhbw(ma, mb);
		fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKL **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckldq(ma, mb);
		fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklwd(ma, mb);
		fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklbw(ma, mb);
		fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n");
		fflush(stdout); fflush(stderr);



		/* PAND, PANDN, POR, PXOR ***********************************/
		ma.q = 0x5555555555555555LL;
		mb.q = 0x3333333333333333LL;

		pand(ma, mb);
		fprintf(stdout, "pand: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pand: mb.q is 1111111111111111\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pandn(ma, mb);
		fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pandn: mb.q is 4444444444444444\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		por(ma, mb);
		fprintf(stdout, "por: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "por: mb.q is 7777777777777777\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pxor(ma, mb);
		fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pxor: mb.q is 6666666666666666\n");
		fflush(stdout); fflush(stderr);



		/* PSLL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psllq(ma, mb);
		fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		pslld(ma, mb);
		fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pslld: mb.q is 67000000ef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psllw(ma, mb);
		fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrlq(ma, mb);
		fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlq: mb.q is 0000000123456789\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrld(ma, mb);
		fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrld: mb.q is 0000000100000089\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrlw(ma, mb);
		fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRA *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrad(ma, mb);
		fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psraw(ma, mb);
		fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		/* Exit MXX *************************************************/
		emms();
	}

	/* Clean-up and exit nicely */
	exit(0);
}
// Emits the alpha half of the texture function (TFX) stage.
//
// Nothing is emitted when m_sel.fb is clear. Otherwise the emitted code
// updates xmm6 ("gat" in the reference expressions below) according to
// m_sel.tfx, combining it with the "ga" value loaded from m_env.temp.ga
// (when m_sel.iip is set) or from m_env.c.ga. With m_sel.aa1 set, the
// alpha is afterwards replaced, or conditionally replaced, by the
// coverage value.
//
// Registers written by the code emitted here: xmm6 (result), xmm4, xmm2,
// xmm0 and xmm1; xmm3 is handed to clamp16 / mix16 as an extra operand
// (NOTE(review): presumably a scratch register - confirm against those
// helpers).
void GSDrawScanlineCodeGenerator::AlphaTFX()
{
    if(!m_sel.fb)
    {
        return;
    }

    switch(m_sel.tfx)
    {
    case TFX_MODULATE:

        // GSVector4i ga = iip ? gaf : m_env.c.ga;

        movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);

        // gat = gat.modulate16<1>(ga).clamp8();

        modulate16<1>(xmm6, xmm4);

        clamp16(xmm6, xmm3);

        // if(!tcc) gat = gat.mix16(ga.srl16(7));

        if(!m_sel.tcc)
        {
            // xmm4 still holds the unmodified ga loaded above
            psrlw(xmm4, 7);

            mix16(xmm6, xmm4, xmm3);
        }

        break;

    case TFX_DECAL:

        // if(!tcc) gat = gat.mix16(ga.srl16(7));

        if(!m_sel.tcc)
        {
            // GSVector4i ga = iip ? gaf : m_env.c.ga;

            movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);

            psrlw(xmm4, 7);

            mix16(xmm6, xmm4, xmm3);
        }

        break;

    case TFX_HIGHLIGHT:

        // GSVector4i ga = iip ? gaf : m_env.c.ga;

        movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
        // keep an unshifted copy of ga in xmm2; it is not read again in this
        // function (presumably consumed by code emitted later - confirm)
        movdqa(xmm2, xmm4);

        // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));

        psrlw(xmm4, 7);

        if(m_sel.tcc)
        {
            // unsigned saturating byte add: xmm4 = addus8(ga >> 7, gat)
            paddusb(xmm4, xmm6);
        }

        mix16(xmm6, xmm4, xmm3);

        break;

    case TFX_HIGHLIGHT2:

        // if(!tcc) gat = gat.mix16(ga.srl16(7));

        if(!m_sel.tcc)
        {
            // GSVector4i ga = iip ? gaf : m_env.c.ga;

            movdqa(xmm4, xmmword[m_sel.iip ? &m_env.temp.ga : &m_env.c.ga]);
            // unshifted copy of ga, as in TFX_HIGHLIGHT; note that it is only
            // produced on this !tcc path
            movdqa(xmm2, xmm4);

            psrlw(xmm4, 7);

            mix16(xmm6, xmm4, xmm3);
        }

        break;

    case TFX_NONE:

        // gat = iip ? ga.srl16(7) : ga;

        if(m_sel.iip)
        {
            psrlw(xmm6, 7);
        }

        break;
    }

    if(m_sel.aa1)
    {
        // gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha

        // FIXME: bios config screen cubes

        if(!m_sel.abe)
        {
            // a = cov

            if(m_sel.edge)
            {
                movdqa(xmm0, xmmword[&m_env.temp.cov]);
            }
            else
            {
                // all ones << 15 = 0x8000, >> 8 = 0x0080 in every 16-bit lane
                pcmpeqd(xmm0, xmm0);
                psllw(xmm0, 15);
                psrlw(xmm0, 8);
            }

            mix16(xmm6, xmm0, xmm1);
        }
        else
        {
            // a = a == 0x80 ? cov : a

            // xmm0 = 0x0080 in every 16-bit lane (same construction as above)
            pcmpeqd(xmm0, xmm0);
            psllw(xmm0, 15);
            psrlw(xmm0, 8);

            if(m_sel.edge)
            {
                movdqa(xmm1, xmmword[&m_env.temp.cov]);
            }
            else
            {
                // no coverage value available: use the constant 0x0080 instead
                movdqa(xmm1, xmm0);
            }

            // per-lane compare against 0x0080, then clear the low 16 bits of
            // every dword so only the upper word of each pair keeps its
            // compare result
            pcmpeqw(xmm0, xmm6);
            psrld(xmm0, 16);
            pslld(xmm0, 16);

            // NOTE(review): blend8 is presumably driven by the mask left in
            // xmm0 (implicit operand) - confirm against its definition
            blend8(xmm6, xmm1);
        }
    }
}