Exemplo n.º 1
1
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
{
#if _M_SSE >= 0x500

    vpand(b, mask);
    vpandn(mask, a);
    vpor(b, mask);

#else

    pand(b, mask);
    pandn(mask, a);
    por(b, mask);

#endif
}
void GPUDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
{
	pand(b, mask);
	pandn(mask, a);
	por(b, mask);
	movdqa(a, b);
}
Exemplo n.º 3
0
  // Cycle through integer types
  TEST(Exp, VarySubgroup) {
    const int pbits = 2048;

    for(int qbits = 160; qbits<pbits; qbits+=64) {
      // Generate prime
      CryptoPP::AutoSeededX917RNG<CryptoPP::AES> rng(false, true);
      CryptoPP::PrimeAndGenerator pand(1, rng, pbits, qbits);

      const CryptoPP::Integer two(2);

      double total = 0.0f;
      // Do 1000 exps
      for(int i=0; i<1000; i++) {
        CryptoPP::Integer v = a_exp_b_mod_c(
            pand.Generator(),
            CryptoPP::Integer(rng, two, pand.SubPrime(), CryptoPP::Integer::ANY),
            pand.Prime());
        CryptoPP::Integer e(rng, two, pand.SubPrime(), CryptoPP::Integer::ANY);

        double start = QDateTime::currentMSecsSinceEpoch();
        CryptoPP::Integer r = a_exp_b_mod_c(v, e, pand.Prime());
        double end = QDateTime::currentMSecsSinceEpoch();

        total += (end-start);
      }

      qDebug() << qbits << total;

    }
  }
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
{
    if(m_cpu.has(util::Cpu::tSSE41))
    {
        pblendw(a, b, 0xaa);
    }
    else
    {
        pcmpeqd(temp, temp);
        psrld(temp, 16);
        pand(a, temp);
        pandn(temp, b);
        por(a, temp);
    }
}
Exemplo n.º 5
0
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
{
#if _M_SSE >= 0x500

    vpblendw(a, b, 0xaa);

#elif _M_SSE >= 0x401

    pblendw(a, b, 0xaa);

#else

    pcmpeqd(temp, temp);
    psrld(temp, 16);
    pand(a, temp);
    pandn(temp, b);
    por(a, temp);

#endif
}
Exemplo n.º 6
0
void GSSetupPrimCodeGenerator::Depth()
{
	if(!m_en.z && !m_en.f)
	{
		return;
	}

	if(!m_env.sel.sprite)
	{
		// GSVector4 t = dscan.p;

		movaps(xmm0, xmmword[edx + 16]);

		if(m_en.f)
		{
			// GSVector4 df = p.wwww();

			movaps(xmm1, xmm0);
			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

			// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();

			movaps(xmm2, xmm1);
			mulps(xmm2, xmm3);
			cvttps2dq(xmm2, xmm2);
			pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			movdqa(xmmword[&m_env.d4.f], xmm2);

			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();

				movaps(xmm2, xmm1);
				mulps(xmm2, Xmm(4 + i));
				cvttps2dq(xmm2, xmm2);
				pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				movdqa(xmmword[&m_env.d[i].f], xmm2);
			}
		}

		if(m_en.z)
		{
			// GSVector4 dz = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			// m_env.d4.z = dz * 4.0f;

			movaps(xmm1, xmm0);
			mulps(xmm1, xmm3);
			movdqa(xmmword[&m_env.d4.z], xmm1);

			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].z = dz * m_shift[i];

				movaps(xmm1, xmm0);
				mulps(xmm1, Xmm(4 + i));
				movdqa(xmmword[&m_env.d[i].z], xmm1);
			}
		}
	}
	else
	{
		// GSVector4 p = vertices[0].p;

		movaps(xmm0, xmmword[ecx + 16]);

		if(m_en.f)
		{
			// m_env.p.f = GSVector4i(p).zzzzh().zzzz();

			movaps(xmm1, xmm0);
			cvttps2dq(xmm1, xmm1);
			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			movdqa(xmmword[&m_env.p.f], xmm1);
		}

		if(m_en.z)
		{
			// GSVector4 z = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			if(m_env.sel.zoverflow)
			{
				// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

				static const float half = 0.5f;

				movss(xmm1, dword[&half]);
				shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
				mulps(xmm1, xmm0);
				cvttps2dq(xmm1, xmm1);
				pslld(xmm1, 1);

				cvttps2dq(xmm0, xmm0);
				pcmpeqd(xmm2, xmm2);
				psrld(xmm2, 31);
				pand(xmm0, xmm2);
				
				por(xmm0, xmm1);
			}
			else
			{
				// m_env.p.z = GSVector4i(z);

				cvttps2dq(xmm0, xmm0);
			}

			movdqa(xmmword[&m_env.p.z], xmm0);
		}
	}
}
void GPUDrawScanlineCodeGenerator::AlphaBlend()
{
	if(!m_sel.abe)
	{
		return;
	}

	// xmm1 = fd
	// xmm3 = a
	// xmm4 = r
	// xmm5 = g
	// xmm6 = b
	// xmm7 = test
	// xmm0, xmm2 = free

	// GSVector4i r = (fd & 0x001f001f) << 3;

	pcmpeqd(xmm0, xmm0);
	psrlw(xmm0, 11); // 0x001f
	movdqa(xmm2, xmm1);
	pand(xmm2, xmm0);
	psllw(xmm2, 3);

	switch(m_sel.abr)
	{
	case 0:
		// r = r.avg8(c[0]);
		pavgb(xmm2, xmm4);
		break;
	case 1:
		// r = r.addus8(c[0]);
		paddusb(xmm2, xmm4);
		break;
	case 2:
		// r = r.subus8(c[0]);
		psubusb(xmm2, xmm4);
		break;
	case 3:
		// r = r.addus8(c[0].srl16(2));
		movdqa(xmm0, xmm4);
		psrlw(xmm0, 2);
		paddusb(xmm2, xmm0);
		break;
	}

	if(m_sel.tme)
	{
		movdqa(xmm0, xmm3);
		blend8(xmm4, xmm2);
	}
	else
	{
		movdqa(xmm4, xmm2);
	}

	// GSVector4i g = (d & 0x03e003e0) >> 2;

	pcmpeqd(xmm0, xmm0);
	psrlw(xmm0, 11);
	psllw(xmm0, 5); // 0x03e0
	movdqa(xmm2, xmm1);
	pand(xmm2, xmm0);
	psrlw(xmm2, 2);

	switch(m_sel.abr)
	{
	case 0:
		// g = g.avg8(c[2]);
		pavgb(xmm2, xmm5);
		break;
	case 1:
		// g = g.addus8(c[2]);
		paddusb(xmm2, xmm5);
		break;
	case 2:
		// g = g.subus8(c[2]);
		psubusb(xmm2, xmm5);
		break;
	case 3:
		// g = g.addus8(c[2].srl16(2));
		movdqa(xmm0, xmm5);
		psrlw(xmm0, 2);
		paddusb(xmm2, xmm0);
		break;
	}

	if(m_sel.tme)
	{
		movdqa(xmm0, xmm3);
		blend8(xmm5, xmm2);
	}
	else
	{
		movdqa(xmm5, xmm2);
	}

	// GSVector4i b = (d & 0x7c007c00) >> 7;

	pcmpeqd(xmm0, xmm0);
	psrlw(xmm0, 11);
	psllw(xmm0, 10); // 0x7c00
	movdqa(xmm2, xmm1);
	pand(xmm2, xmm0);
	psrlw(xmm2, 7);

	switch(m_sel.abr)
	{
	case 0:
		// b = b.avg8(c[2]);
		pavgb(xmm2, xmm6);
		break;
	case 1:
		// b = b.addus8(c[2]);
		paddusb(xmm2, xmm6);
		break;
	case 2:
		// b = b.subus8(c[2]);
		psubusb(xmm2, xmm6);
		break;
	case 3:
		// b = b.addus8(c[2].srl16(2));
		movdqa(xmm0, xmm6);
		psrlw(xmm0, 2);
		paddusb(xmm2, xmm0);
		break;
	}

	if(m_sel.tme)
	{
		movdqa(xmm0, xmm3);
		blend8(xmm6, xmm2);
	}
	else
	{
		movdqa(xmm6, xmm2);
	}
}
void GPUDrawScanlineCodeGenerator::WriteFrame()
{
	// GSVector4i fs = r | g | b | (m_sel.md ? GSVector4i(0x80008000) : m_sel.tme ? a : 0);

	pcmpeqd(xmm0, xmm0);

	if(m_sel.md || m_sel.tme)
	{
		movdqa(xmm2, xmm0);
		psllw(xmm2, 15);
	}

	psrlw(xmm0, 11);
	psllw(xmm0, 3);

	// xmm0 = 0x00f8
	// xmm2 = 0x8000 (md)

	// GSVector4i r = (c[0] & 0x00f800f8) >> 3;

	pand(xmm4, xmm0);
	psrlw(xmm4, 3);

	// GSVector4i g = (c[1] & 0x00f800f8) << 2;

	pand(xmm5, xmm0);
	psllw(xmm5, 2);
	por(xmm4, xmm5);

	// GSVector4i b = (c[2] & 0x00f800f8) << 7;

	pand(xmm6, xmm0);
	psllw(xmm6, 7);
	por(xmm4, xmm6);

	if(m_sel.md)
	{
		// GSVector4i a = GSVector4i(0x80008000);

		por(xmm4, xmm2);
	}
	else if(m_sel.tme)
	{
		// GSVector4i a = (c[3] << 8) & 0x80008000;

		psllw(xmm3, 8);
		pand(xmm3, xmm2);
		por(xmm4, xmm3);
	}

	// fs = fs.blend8(fd, test);

	movdqa(xmm0, xmm7);
	blend8(xmm4, xmm1);

	// GSVector4i::store<false>(fb, fs);

	// movdqu(ptr[edi], xmm4);

	movq(qword[edi], xmm4);
	movhps(qword[edi + 8], xmm4);
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv)
{
    // xmm0, xmm1, xmm4, xmm5, xmm6 = free

    int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
    int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;

    int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;

    if(wms_clamp == wmt_clamp)
    {
        if(wms_clamp)
        {
            if(region)
            {
                pmaxsw(uv, xmmword[&m_env.t.min]);
            }
            else
            {
                pxor(xmm0, xmm0);
                pmaxsw(uv, xmm0);
            }

            pminsw(uv, xmmword[&m_env.t.max]);
        }
        else
        {
            pand(uv, xmmword[&m_env.t.min]);

            if(region)
            {
                por(uv, xmmword[&m_env.t.max]);
            }
        }
    }
    else
    {
        movdqa(xmm1, uv);

        movdqa(xmm4, xmmword[&m_env.t.min]);
        movdqa(xmm5, xmmword[&m_env.t.max]);

        // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max);

        pmaxsw(uv, xmm4);
        pminsw(uv, xmm5);

        // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max;

        pand(xmm1, xmm4);

        if(region)
        {
            por(xmm1, xmm5);
        }

        // clamp.blend8(repeat, m_env.t.mask);

        movdqa(xmm0, xmmword[&m_env.t.mask]);
        blend8(uv, xmm1);
    }
}
Exemplo n.º 10
0
void GPUDrawScanlineCodeGenerator::SampleTexture()
{
	if(!m_sel.tme)
	{
		return;
	}

	if(m_sel.tlu)
	{
		mov(edx, ptr[&m_local.gd->clut]);
	}

	// xmm2 = s
	// xmm3 = t
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f
		// GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f

		mov(eax, 0x00200020);
		movd(xmm0, eax);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));

		psubw(xmm2, xmm0);
		psubw(xmm3, xmm0);

		// GSVector4i uf = (u & GSVector4i::x00ff()) << 7;
		// GSVector4i vf = (v & GSVector4i::x00ff()) << 7;

		movdqa(xmm0, xmm2);
		psllw(xmm0, 8);
		psrlw(xmm0, 1);
		movdqa(ptr[&m_local.temp.uf], xmm0);

		if(!m_sel.sprite)
		{
			movdqa(xmm0, xmm3);
			psllw(xmm0, 8);
			psrlw(xmm0, 1);
			movdqa(ptr[&m_local.temp.vf], xmm0);
		}
	}

	// GSVector4i u0 = s.srl16(8);
	// GSVector4i v0 = t.srl16(8);

	psrlw(xmm2, 8);
	psrlw(xmm3, 8);

	// xmm2 = u
	// xmm3 = v
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u1 = u0.add16(GSVector4i::x0001());
		// GSVector4i v1 = v0.add16(GSVector4i::x0001());

		movdqa(xmm4, xmm2);
		movdqa(xmm5, xmm3);

		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 15);
		paddw(xmm4, xmm0);
		paddw(xmm5, xmm0);

		if(m_sel.twin)
		{
			// u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
			// u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);

			movdqa(xmm0, ptr[&m_local.twin[0].u]);
			movdqa(xmm6, ptr[&m_local.twin[1].u]);

			pand(xmm2, xmm0);
			paddw(xmm2, xmm6);
			pand(xmm4, xmm0);
			paddw(xmm4, xmm6);

			movdqa(xmm0, ptr[&m_local.twin[0].v]);
			movdqa(xmm6, ptr[&m_local.twin[1].v]);

			pand(xmm3, xmm0);
			paddw(xmm3, xmm6);
			pand(xmm5, xmm0);
			paddw(xmm5, xmm6);
		}
		else
		{
			// u0 = u0.min_i16(m_local.twin[2].u);
			// v0 = v0.min_i16(m_local.twin[2].v);
			// u1 = u1.min_i16(m_local.twin[2].u);
			// v1 = v1.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			movdqa(xmm0, ptr[&m_local.twin[2].u]);
			movdqa(xmm6, ptr[&m_local.twin[2].v]);

			pminsw(xmm2, xmm0);
			pminsw(xmm3, xmm6);
			pminsw(xmm4, xmm0);
			pminsw(xmm5, xmm6);
		}

		// xmm2 = u0
		// xmm3 = v0
		// xmm4 = u1
		// xmm5 = v1
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// GSVector4i addr00 = v0.sll16(8) | u0;
		// GSVector4i addr01 = v0.sll16(8) | u1;
		// GSVector4i addr10 = v1.sll16(8) | u0;
		// GSVector4i addr11 = v1.sll16(8) | u1;

		psllw(xmm3, 8);
		movdqa(xmm0, xmm3);
		por(xmm3, xmm2);
		por(xmm0, xmm4);

		psllw(xmm5, 8);
		movdqa(xmm6, xmm5);
		por(xmm5, xmm2);
		por(xmm6, xmm4);

		// xmm3 = addr00
		// xmm0 = addr01
		// xmm5 = addr10
		// xmm6 = addr11
		// xmm7 = test
		// xmm2, xmm4 = free
		// xmm1 = used

		ReadTexel(xmm2, xmm3);
		ReadTexel(xmm4, xmm0);
		ReadTexel(xmm3, xmm5);
		ReadTexel(xmm5, xmm6);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// spill (TODO)

		movdqa(ptr[&m_local.temp.fd], xmm1);
		movdqa(ptr[&m_local.temp.test], xmm7);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm1, xmm6, xmm7 = free

		movdqa(xmm1, xmm2);
		psllw(xmm1, 11);
		psrlw(xmm1, 8);

		movdqa(xmm0, xmm4);
		psllw(xmm0, 11);
		psrlw(xmm0, 8);

		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]);

		movdqa(xmm6, xmm2);
		psllw(xmm6, 6);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		movdqa(xmm1, xmm4);
		psllw(xmm1, 6);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]);

		movdqa(xmm7, xmm2);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm6, xmm4);
		psllw(xmm6, 1);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]);

		psraw(xmm2, 15);
		psrlw(xmm2, 8);
		psraw(xmm4, 15);
		psrlw(xmm4, 8);

		lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]);

		// xmm0 = r00
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm2, xmm7 = free

		movdqa(xmm7, xmm3);
		psllw(xmm7, 11);
		psrlw(xmm7, 8);

		movdqa(xmm2, xmm5);
		psllw(xmm2, 11);
		psrlw(xmm2, 8);

		lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm7 = free

		movdqa(xmm7, xmm3);
		psllw(xmm7, 6);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm0, xmm5);
		psllw(xmm0, 6);
		psrlw(xmm0, 11);
		psllw(xmm0, 3);

		lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm1, xmm7 = free

		movdqa(xmm7, xmm3);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm1, xmm5);
		psllw(xmm1, 1);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm6, xmm7 = free

		psraw(xmm3, 15);
		psrlw(xmm3, 8);
		psraw(xmm5, 15);
		psrlw(xmm5, 8);

		lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm5 = a
		// xmm3, xmm4, xmm6, xmm7 = free

		// TODO
		movdqa(xmm3, xmm5); // a
		movdqa(xmm4, xmm2); // r
		movdqa(xmm6, xmm1); // b
		movdqa(xmm5, xmm0); // g

		// reload test

		movdqa(xmm7, ptr[&m_local.temp.test]);

		// xmm4 = r
		// xmm5 = g
		// xmm6 = b
		// xmm3 = a
		// xmm7 = test
		// xmm0, xmm1, xmm2 = free

		// test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect)

		movdqa(xmm1, xmm3);
		por(xmm1, xmm4);
		movdqa(xmm2, xmm5);
		por(xmm2, xmm6);
		por(xmm1, xmm2);

		pxor(xmm0, xmm0);
		pcmpeqw(xmm1, xmm0);
		por(xmm7, xmm1);

		// a = a.gt16(GSVector4i::zero());

		pcmpgtw(xmm3, xmm0);

		// reload fd

		movdqa(xmm1, ptr[&m_local.temp.fd]);
	}
	else
	{
		if(m_sel.twin)
		{
			// u = (u & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v = (v & m_local.twin[0].v).add16(m_local.twin[1].v);

			pand(xmm2, ptr[&m_local.twin[0].u]);
			paddw(xmm2, ptr[&m_local.twin[1].u]);
			pand(xmm3, ptr[&m_local.twin[0].v]);
			paddw(xmm3, ptr[&m_local.twin[1].v]);
		}
		else
		{
			// u = u.min_i16(m_local.twin[2].u);
			// v = v.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			pminsw(xmm2, ptr[&m_local.twin[2].u]);
			pminsw(xmm3, ptr[&m_local.twin[2].v]);
		}

		// xmm2 = u
		// xmm3 = v
		// xmm7 = test
		// xmm0, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		// GSVector4i addr = v.sll16(8) | u;

		psllw(xmm3, 8);
		por(xmm3, xmm2);

		// xmm3 = addr
		// xmm7 = test
		// xmm0, xmm2, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		ReadTexel(xmm6, xmm3);

		// xmm3 = c00
		// xmm7 = test
		// xmm0, xmm2, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		// test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels

		pxor(xmm0, xmm0);
		pcmpeqw(xmm0, xmm6);
		por(xmm7, xmm0);

		// c[0] = (c00 << 3) & 0x00f800f8;
		// c[1] = (c00 >> 2) & 0x00f800f8;
		// c[2] = (c00 >> 7) & 0x00f800f8;
		// c[3] = c00.sra16(15);

		movdqa(xmm3, xmm6);
		psraw(xmm3, 15); // a

		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 11);
		psllw(xmm0, 3); // 0x00f8

		movdqa(xmm4, xmm6);
		psllw(xmm4, 3);
		pand(xmm4, xmm0); // r

		movdqa(xmm5, xmm6);
		psrlw(xmm5, 2);
		pand(xmm5, xmm0); // g

		psrlw(xmm6, 7);
		pand(xmm6, xmm0); // b
	}
}
Exemplo n.º 11
0
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp, eax, ecx, edx);
  jmp(start);
  address static_const_table = (address)_static_const_table;

  bind(start);
  subl(rsp, 120);
  movl(Address(rsp, 64), tmp);
  lea(tmp, ExternalAddress(static_const_table));
  movdqu(xmm0, Address(rsp, 128));
  unpcklpd(xmm0, xmm0);
  movdqu(xmm1, Address(tmp, 64));          // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
  movdqu(xmm6, Address(tmp, 48));          // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
  movdqu(xmm2, Address(tmp, 80));          // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
  movdqu(xmm3, Address(tmp, 96));          // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
  pextrw(eax, xmm0, 3);
  andl(eax, 32767);
  movl(edx, 16527);
  subl(edx, eax);
  subl(eax, 15504);
  orl(edx, eax);
  cmpl(edx, INT_MIN);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
  mulpd(xmm1, xmm0);
  addpd(xmm1, xmm6);
  movapd(xmm7, xmm1);
  subpd(xmm1, xmm6);
  mulpd(xmm2, xmm1);
  movdqu(xmm4, Address(tmp, 128));         // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
  mulpd(xmm3, xmm1);
  movdqu(xmm5, Address(tmp, 144));         // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
  subpd(xmm0, xmm2);
  movdl(eax, xmm7);
  movl(ecx, eax);
  andl(ecx, 63);
  shll(ecx, 4);
  sarl(eax, 6);
  movl(edx, eax);
  movdqu(xmm6, Address(tmp, 16));          // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
  pand(xmm7, xmm6);
  movdqu(xmm6, Address(tmp, 32));          // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
  paddq(xmm7, xmm6);
  psllq(xmm7, 46);
  subpd(xmm0, xmm3);
  movdqu(xmm2, Address(tmp, ecx, Address::times_1, 160));
  mulpd(xmm4, xmm0);
  movapd(xmm6, xmm0);
  movapd(xmm1, xmm0);
  mulpd(xmm6, xmm6);
  mulpd(xmm0, xmm6);
  addpd(xmm5, xmm4);
  mulsd(xmm0, xmm6);
  mulpd(xmm6, Address(tmp, 112));          // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
  addsd(xmm1, xmm2);
  unpckhpd(xmm2, xmm2);
  mulpd(xmm0, xmm5);
  addsd(xmm1, xmm0);
  por(xmm2, xmm7);
  unpckhpd(xmm0, xmm0);
  addsd(xmm0, xmm1);
  addsd(xmm0, xmm6);
  addl(edx, 894);
  cmpl(edx, 1916);
  jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
  mulsd(xmm0, xmm2);
  addsd(xmm0, xmm2);
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_1_0_2);
  fnstcw(Address(rsp, 24));
  movzwl(edx, Address(rsp, 24));
  orl(edx, 768);
  movw(Address(rsp, 28), edx);
  fldcw(Address(rsp, 28));
  movl(edx, eax);
  sarl(eax, 1);
  subl(edx, eax);
  movdqu(xmm6, Address(tmp, 0));           // 0x00000000UL, 0xfff00000UL, 0x00000000UL, 0xfff00000UL
  pandn(xmm6, xmm2);
  addl(eax, 1023);
  movdl(xmm3, eax);
  psllq(xmm3, 52);
  por(xmm6, xmm3);
  addl(edx, 1023);
  movdl(xmm4, edx);
  psllq(xmm4, 52);
  movsd(Address(rsp, 8), xmm0);
  fld_d(Address(rsp, 8));
  movsd(Address(rsp, 16), xmm6);
  fld_d(Address(rsp, 16));
  fmula(1);
  faddp(1);
  movsd(Address(rsp, 8), xmm4);
  fld_d(Address(rsp, 8));
  fmulp(1);
  fstp_d(Address(rsp, 8));
  movsd(xmm0,Address(rsp, 8));
  fldcw(Address(rsp, 24));
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  cmpl(ecx, 32752);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
  cmpl(ecx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
  jmp(L_2TAG_PACKET_2_0_2);
  cmpl(ecx, INT_MIN);
  jcc(Assembler::less, L_2TAG_PACKET_3_0_2);
  cmpl(ecx, -1064950997);
  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
  jcc(Assembler::greater, L_2TAG_PACKET_4_0_2);
  movl(edx, Address(rsp, 128));
  cmpl(edx ,-17155601);
  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
  jmp(L_2TAG_PACKET_4_0_2);

  bind(L_2TAG_PACKET_3_0_2);
  movl(edx, 14);
  jmp(L_2TAG_PACKET_5_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  movl(edx, 15);

  bind(L_2TAG_PACKET_5_0_2);
  movsd(Address(rsp, 0), xmm0);
  movsd(xmm0, Address(rsp, 128));
  fld_d(Address(rsp, 0));
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_7_0_2);
  cmpl(eax, 2146435072);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_8_0_2);
  movl(eax, Address(rsp, 132));
  cmpl(eax, INT_MIN);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_9_0_2);
  movsd(xmm0, Address(tmp, 1208));         // 0xffffffffUL, 0x7fefffffUL
  mulsd(xmm0, xmm0);
  movl(edx, 14);
  jmp(L_2TAG_PACKET_5_0_2);

  bind(L_2TAG_PACKET_9_0_2);
  movsd(xmm0, Address(tmp, 1216));
  mulsd(xmm0, xmm0);
  movl(edx, 15);
  jmp(L_2TAG_PACKET_5_0_2);

  bind(L_2TAG_PACKET_8_0_2);
  movl(edx, Address(rsp, 128));
  cmpl(eax, 2146435072);
  jcc(Assembler::above, L_2TAG_PACKET_10_0_2);
  cmpl(edx, 0);
  jcc(Assembler::notEqual, L_2TAG_PACKET_10_0_2);
  movl(eax, Address(rsp, 132));
  cmpl(eax, 2146435072);
  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
  movsd(xmm0, Address(tmp, 1192));         // 0x00000000UL, 0x7ff00000UL
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_11_0_2);
  movsd(xmm0, Address(tmp, 1200));         // 0x00000000UL, 0x00000000UL
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_10_0_2);
  movsd(xmm0, Address(rsp, 128));
  addsd(xmm0, xmm0);
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_0_0_2);
  movl(eax, Address(rsp, 132));
  andl(eax, 2147483647);
  cmpl(eax, 1083179008);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
  movsd(xmm0, Address(rsp, 128));
  addsd(xmm0, Address(tmp, 1184));         // 0x00000000UL, 0x3ff00000UL
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_2_0_2);
  movsd(Address(rsp, 48), xmm0);
  fld_d(Address(rsp, 48));

  bind(L_2TAG_PACKET_6_0_2);
  movl(tmp, Address(rsp, 64));
}
Exemplo n.º 12
0
void MacroAssembler::fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp1, Register tmp2) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2;
  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp1, tmp2, eax, ecx, edx);
  jmp(start);
  address L_tbl = (address)_L_tbl;
  address log2 = (address)_log2;
  address coeff = (address)_coeff;

  bind(start);
  subq(rsp, 24);
  movsd(Address(rsp, 0), xmm0);
  mov64(rax, 0x3ff0000000000000);
  movdq(xmm2, rax);
  mov64(rdx, 0x77f0000000000000);
  movdq(xmm3, rdx);
  movl(ecx, 32768);
  movdl(xmm4, rcx);
  mov64(tmp1, 0xffffe00000000000);
  movdq(xmm5, tmp1);
  movdqu(xmm1, xmm0);
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  movl(ecx, 16352);
  psrlq(xmm0, 27);
  lea(tmp2, ExternalAddress(L_tbl));
  psrld(xmm0, 2);
  rcpps(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  subl(eax, 16);
  cmpl(eax, 32736);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);

  bind(L_2TAG_PACKET_1_0_2);
  paddd(xmm0, xmm4);
  por(xmm1, xmm3);
  movdl(edx, xmm0);
  psllq(xmm0, 29);
  pand(xmm5, xmm1);
  pand(xmm0, xmm6);
  subsd(xmm1, xmm5);
  mulpd(xmm5, xmm0);
  andl(eax, 32752);
  subl(eax, ecx);
  cvtsi2sdl(xmm7, eax);
  mulsd(xmm1, xmm0);
  movq(xmm6, ExternalAddress(log2));       // 0xfefa3800UL, 0x3fa62e42UL
  movdqu(xmm3, ExternalAddress(coeff));    // 0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL
  subsd(xmm5, xmm2);
  andl(edx, 16711680);
  shrl(edx, 12);
  movdqu(xmm0, Address(tmp2, edx));
  movdqu(xmm4, ExternalAddress(16 + coeff)); // 0x3d6fb175UL, 0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL
  addsd(xmm1, xmm5);
  movdqu(xmm2, ExternalAddress(32 + coeff)); // 0x9999999aUL, 0x3fc99999UL, 0x00000000UL, 0xbfe00000UL
  mulsd(xmm6, xmm7);
  movddup(xmm5, xmm1);
  mulsd(xmm7, ExternalAddress(8 + log2));    // 0x93c76730UL, 0x3ceef357UL
  mulsd(xmm3, xmm1);
  addsd(xmm0, xmm6);
  mulpd(xmm4, xmm5);
  mulpd(xmm5, xmm5);
  movddup(xmm6, xmm0);
  addsd(xmm0, xmm1);
  addpd(xmm4, xmm2);
  mulpd(xmm3, xmm5);
  subsd(xmm6, xmm0);
  mulsd(xmm4, xmm1);
  pshufd(xmm2, xmm0, 238);
  addsd(xmm1, xmm6);
  mulsd(xmm5, xmm5);
  addsd(xmm7, xmm2);
  addpd(xmm4, xmm3);
  addsd(xmm1, xmm7);
  mulpd(xmm4, xmm5);
  addsd(xmm1, xmm4);
  pshufd(xmm5, xmm4, 238);
  addsd(xmm1, xmm5);
  addsd(xmm0, xmm1);
  jmp(B1_5);

  bind(L_2TAG_PACKET_0_0_2);
  movq(xmm0, Address(rsp, 0));
  movq(xmm1, Address(rsp, 0));
  addl(eax, 16);
  cmpl(eax, 32768);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_2_0_2);
  cmpl(eax, 16);
  jcc(Assembler::below, L_2TAG_PACKET_3_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  addsd(xmm0, xmm0);
  jmp(B1_5);

  bind(L_2TAG_PACKET_5_0_2);
  jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
  cmpl(edx, 0);
  jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_3_0_2);
  xorpd(xmm1, xmm1);
  addsd(xmm1, xmm0);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);
  xorpd(xmm1, xmm1);
  movl(eax, 18416);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movdqu(xmm1, xmm0);
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  psrlq(xmm0, 27);
  movl(ecx, 18416);
  psrld(xmm0, 2);
  rcpps(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  jmp(L_2TAG_PACKET_1_0_2);

  bind(L_2TAG_PACKET_2_0_2);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  addl(ecx, ecx);
  cmpl(ecx, -2097152);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_5_0_2);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);

  bind(L_2TAG_PACKET_6_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 32752);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movl(Address(rsp, 16), 3);
  jmp(L_2TAG_PACKET_8_0_2);
  bind(L_2TAG_PACKET_7_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 49136);
  pinsrw(xmm0, eax, 3);
  divsd(xmm0, xmm1);
  movl(Address(rsp, 16), 2);

  bind(L_2TAG_PACKET_8_0_2);
  movq(Address(rsp, 8), xmm0);

  bind(B1_3);
  movq(xmm0, Address(rsp, 8));

  bind(B1_5);
  addq(rsp, 24);
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
{
    // xmm0, xmm1, xmm4, xmm5, xmm6 = free

    int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
    int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;

    int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;

    if(wms_clamp == wmt_clamp)
    {
        if(wms_clamp)
        {
            if(region)
            {
                movdqa(xmm4, xmmword[&m_env.t.min]);

                pmaxsw(uv0, xmm4);
                pmaxsw(uv1, xmm4);
            }
            else
            {
                pxor(xmm0, xmm0);
                pmaxsw(uv0, xmm0);
                pmaxsw(uv1, xmm0);
            }

            movdqa(xmm5, xmmword[&m_env.t.max]);

            pminsw(uv0, xmm5);
            pminsw(uv1, xmm5);
        }
        else
        {
            movdqa(xmm4, xmmword[&m_env.t.min]);

            pand(uv0, xmm4);
            pand(uv1, xmm4);

            if(region)
            {
                movdqa(xmm5, xmmword[&m_env.t.max]);

                por(uv0, xmm5);
                por(uv1, xmm5);
            }
        }
    }
    else
    {
        movdqa(xmm1, uv0);
        movdqa(xmm6, uv1);

        movdqa(xmm4, xmmword[&m_env.t.min]);
        movdqa(xmm5, xmmword[&m_env.t.max]);

        // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max);

        pmaxsw(uv0, xmm4);
        pmaxsw(uv1, xmm4);
        pminsw(uv0, xmm5);
        pminsw(uv1, xmm5);

        // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max;

        pand(xmm1, xmm4);
        pand(xmm6, xmm4);

        if(region)
        {
            por(xmm1, xmm5);
            por(xmm6, xmm5);
        }

        // clamp.blend8(repeat, m_env.t.mask);

        if(m_cpu.has(util::Cpu::tSSE41))
        {
            movdqa(xmm0, xmmword[&m_env.t.mask]);

            pblendvb(uv0, xmm1);
            pblendvb(uv1, xmm6);
        }
        else
        {
            movdqa(xmm0, xmmword[&m_env.t.invmask]);
            movdqa(xmm4, xmm0);

            pand(uv0, xmm0);
            pandn(xmm0, xmm1);
            por(uv0, xmm0);

            pand(uv1, xmm4);
            pandn(xmm4, xmm6);
            por(uv1, xmm4);
        }
    }
}
Exemplo n.º 14
0
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
  Label L_2TAG_PACKET_12_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp, eax, ecx, edx);
  jmp(start);
  address cv = (address)_cv;
  address Shifter = (address)_shifter;
  address mmask = (address)_mmask;
  address bias = (address)_bias;
  address Tbl_addr = (address)_Tbl_addr;
  address ALLONES = (address)_ALLONES;
  address ebias = (address)_ebias;
  address XMAX = (address)_XMAX;
  address XMIN = (address)_XMIN;
  address INF = (address)_INF;
  address ZERO = (address)_ZERO;
  address ONE_val = (address)_ONE_val;

  bind(start);
  subq(rsp, 24);
  movsd(Address(rsp, 8), xmm0);
  unpcklpd(xmm0, xmm0);
  movdqu(xmm1, ExternalAddress(cv));       // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
  movdqu(xmm6, ExternalAddress(Shifter));  // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
  movdqu(xmm2, ExternalAddress(16+cv));    // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
  movdqu(xmm3, ExternalAddress(32+cv));    // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
  pextrw(eax, xmm0, 3);
  andl(eax, 32767);
  movl(edx, 16527);
  subl(edx, eax);
  subl(eax, 15504);
  orl(edx, eax);
  cmpl(edx, INT_MIN);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
  mulpd(xmm1, xmm0);
  addpd(xmm1, xmm6);
  movapd(xmm7, xmm1);
  subpd(xmm1, xmm6);
  mulpd(xmm2, xmm1);
  movdqu(xmm4, ExternalAddress(64+cv));    // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
  mulpd(xmm3, xmm1);
  movdqu(xmm5, ExternalAddress(80+cv));    // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
  subpd(xmm0, xmm2);
  movdl(eax, xmm7);
  movl(ecx, eax);
  andl(ecx, 63);
  shll(ecx, 4);
  sarl(eax, 6);
  movl(edx, eax);
  movdqu(xmm6, ExternalAddress(mmask));    // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
  pand(xmm7, xmm6);
  movdqu(xmm6, ExternalAddress(bias));     // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
  paddq(xmm7, xmm6);
  psllq(xmm7, 46);
  subpd(xmm0, xmm3);
  lea(tmp, ExternalAddress(Tbl_addr));
  movdqu(xmm2, Address(ecx,tmp));
  mulpd(xmm4, xmm0);
  movapd(xmm6, xmm0);
  movapd(xmm1, xmm0);
  mulpd(xmm6, xmm6);
  mulpd(xmm0, xmm6);
  addpd(xmm5, xmm4);
  mulsd(xmm0, xmm6);
  mulpd(xmm6, ExternalAddress(48+cv));     // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
  addsd(xmm1, xmm2);
  unpckhpd(xmm2, xmm2);
  mulpd(xmm0, xmm5);
  addsd(xmm1, xmm0);
  por(xmm2, xmm7);
  unpckhpd(xmm0, xmm0);
  addsd(xmm0, xmm1);
  addsd(xmm0, xmm6);
  addl(edx, 894);
  cmpl(edx, 1916);
  jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
  mulsd(xmm0, xmm2);
  addsd(xmm0, xmm2);
  jmp (B1_5);

  bind(L_2TAG_PACKET_1_0_2);
  xorpd(xmm3, xmm3);
  movdqu(xmm4, ExternalAddress(ALLONES));  // 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
  movl(edx, -1022);
  subl(edx, eax);
  movdl(xmm5, edx);
  psllq(xmm4, xmm5);
  movl(ecx, eax);
  sarl(eax, 1);
  pinsrw(xmm3, eax, 3);
  movdqu(xmm6, ExternalAddress(ebias));    // 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x3ff00000UL
  psllq(xmm3, 4);
  psubd(xmm2, xmm3);
  mulsd(xmm0, xmm2);
  cmpl(edx, 52);
  jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
  pand(xmm4, xmm2);
  paddd(xmm3, xmm6);
  subsd(xmm2, xmm4);
  addsd(xmm0, xmm2);
  cmpl(ecx, 1023);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32768);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
  movapd(xmm6, xmm0);
  addsd(xmm0, xmm4);
  mulsd(xmm0, xmm3);
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  cmpl(ecx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_5_0_2);
  jmp(B1_5);

  bind(L_2TAG_PACKET_5_0_2);
  mulsd(xmm6, xmm3);
  mulsd(xmm4, xmm3);
  movdqu(xmm0, xmm6);
  pxor(xmm6, xmm4);
  psrad(xmm6, 31);
  pshufd(xmm6, xmm6, 85);
  psllq(xmm0, 1);
  psrlq(xmm0, 1);
  pxor(xmm0, xmm6);
  psrlq(xmm6, 63);
  paddq(xmm0, xmm6);
  paddq(xmm0, xmm4);
  movl(Address(rsp,0), 15);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  addsd(xmm0, xmm4);
  mulsd(xmm0, xmm3);
  jmp(B1_5);

  bind(L_2TAG_PACKET_3_0_2);
  addsd(xmm0, xmm4);
  mulsd(xmm0, xmm3);
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  cmpl(ecx, 32752);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
  jmp(B1_5);

  bind(L_2TAG_PACKET_2_0_2);
  paddd(xmm3, xmm6);
  addpd(xmm0, xmm2);
  mulsd(xmm0, xmm3);
  movl(Address(rsp,0), 15);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_8_0_2);
  cmpl(eax, 2146435072);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);
  movl(eax, Address(rsp,12));
  cmpl(eax, INT_MIN);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_10_0_2);
  movsd(xmm0, ExternalAddress(XMAX));      // 0xffffffffUL, 0x7fefffffUL
  mulsd(xmm0, xmm0);

  bind(L_2TAG_PACKET_7_0_2);
  movl(Address(rsp,0), 14);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_10_0_2);
  movsd(xmm0, ExternalAddress(XMIN));      // 0x00000000UL, 0x00100000UL
  mulsd(xmm0, xmm0);
  movl(Address(rsp,0), 15);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_9_0_2);
  movl(edx, Address(rsp,8));
  cmpl(eax, 2146435072);
  jcc(Assembler::above, L_2TAG_PACKET_11_0_2);
  cmpl(edx, 0);
  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
  movl(eax, Address(rsp,12));
  cmpl(eax, 2146435072);
  jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_2);
  movsd(xmm0, ExternalAddress(INF));       // 0x00000000UL, 0x7ff00000UL
  jmp(B1_5);

  bind(L_2TAG_PACKET_12_0_2);
  movsd(xmm0, ExternalAddress(ZERO));      // 0x00000000UL, 0x00000000UL
  jmp(B1_5);

  bind(L_2TAG_PACKET_11_0_2);
  movsd(xmm0, Address(rsp, 8));
  addsd(xmm0, xmm0);
  jmp(B1_5);

  bind(L_2TAG_PACKET_0_0_2);
  movl(eax, Address(rsp, 12));
  andl(eax, 2147483647);
  cmpl(eax, 1083179008);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_8_0_2);
  movsd(Address(rsp, 8), xmm0);
  addsd(xmm0, ExternalAddress(ONE_val));   // 0x00000000UL, 0x3ff00000UL
  jmp(B1_5);

  bind(L_2TAG_PACKET_6_0_2);
  movq(Address(rsp, 16), xmm0);

  bind(B1_3);
  movq(xmm0, Address(rsp, 16));

  bind(B1_5);
  addq(rsp, 24);
}
Exemplo n.º 15
0
template <class _mm, int src_aligned, int dst_aligned> void TffdshowConverters2::convert_NV12toYV12(
    const uint8_t* srcY,
    const uint8_t* srcCbCr,
    uint8_t* dstY,
    uint8_t* dstCb,
    uint8_t* dstCr,
    int dx,
    int dy,
    stride_t stride_Y,
    stride_t stride_CbCr,
    stride_t stride_dstY,
    stride_t stride_dstCbCr)
{
    int xCount = dx / (_mm::size*2);
    if (xCount <= 0)
        return;

    _mm::__m _mm0,_mm1,_mm2,_mm3,_mm_00ff;

    // fill with 0xff
    pxor(_mm_00ff,_mm_00ff);
    pcmpeqb(_mm_00ff,_mm_00ff);
    psllw(_mm_00ff, 8);
    psrlw(_mm_00ff, 8);

    for (int y = 0 ; y < dy ; y++) {
        const uint8_t *src = srcY + y * stride_Y;
        uint8_t *dst = dstY + y * stride_dstY;
        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, src);
                movVqa(_mm1, src + _mm::size);
            } else {
                movVqu(_mm0, src);
                movVqu(_mm1, src + _mm::size);
            }
            src += _mm::size * 2;
            if (dst_aligned) {
                _mm::movntVq(dst, _mm0);
                _mm::movntVq(dst + _mm::size, _mm1);
            } else {
                movVqu(dst, _mm0);
                movVqu(dst + _mm::size, _mm1);
            }
            dst += _mm::size * 2;
        } while(--x);
    }
    int dyCbCr = dy/2;
    for (int y = 0 ; y < dyCbCr ; y++) {
        const uint8_t *srcCbCrLn = srcCbCr + y * stride_CbCr;
        uint8_t *dstCbLn = dstCb + y * stride_dstCbCr;
        uint8_t *dstCrLn = dstCr + y * stride_dstCbCr;
        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, srcCbCrLn);
                movVqa(_mm1, srcCbCrLn + _mm::size);
            } else {
                movVqu(_mm0, srcCbCrLn);
                movVqu(_mm1, srcCbCrLn + _mm::size);
            }
            _mm2 = _mm0;
            _mm3 = _mm1;
            srcCbCrLn += _mm::size * 2;
            pand(_mm0, _mm_00ff);
            pand(_mm1, _mm_00ff);
            psrlw(_mm2,8);
            psrlw(_mm3,8);
            packuswb(_mm0, _mm1);
            packuswb(_mm2, _mm3);

            if (dst_aligned) {
                _mm::movntVq(dstCbLn, _mm0);
                _mm::movntVq(dstCrLn, _mm2);
            } else {
                movVqu(dstCbLn, _mm0);
                movVqu(dstCrLn, _mm2);
            }
            dstCbLn += _mm::size;
            dstCrLn += _mm::size;
        } while(--x);
    }

    if (xCount * (int)_mm::size * 2 < dx && dx > _mm::size * 2) {
        int dxDone = dx - _mm::size * 2;
        srcY  += dxDone;
        srcCbCr += dxDone;
        dstY    += dxDone;
        dstCb   += dxDone/2;
        dstCr   += dxDone/2;
        convert_NV12toYV12<_mm, 0, 0>(srcY, srcCbCr, dstY, dstCb, dstCr, _mm::size * 2, dy, stride_Y, stride_CbCr, stride_dstY, stride_dstCbCr);
    }

    _mm::empty();
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
{
    if(!m_sel.zb)
    {
        return;
    }

    // int za = fza_base.y + fza_offset->y;

    mov(ebp, dword[esi + 4]);
    add(ebp, dword[edi + 4]);

    // GSVector4i zs = zi;

    if(!m_sel.sprite)
    {
        if(m_sel.zoverflow)
        {
            // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

            static float half = 0.5f;

            movss(temp1, dword[&half]);
            shufps(temp1, temp1, _MM_SHUFFLE(0, 0, 0, 0));
            mulps(temp1, xmm0);
            cvttps2dq(temp1, temp1);
            pslld(temp1, 1);

            cvttps2dq(xmm0, xmm0);
            pcmpeqd(temp2, temp2);
            psrld(temp2, 31);
            pand(xmm0, temp2);

            por(xmm0, temp1);
        }
        else
        {
            // zs = GSVector4i(z);

            cvttps2dq(xmm0, xmm0);
        }

        if(m_sel.zwrite)
        {
            movdqa(xmmword[&m_env.temp.zs], xmm0);
        }
    }

    if(m_sel.ztest)
    {
        ReadPixel(xmm1, ebp);

        if(m_sel.zwrite && m_sel.zpsm < 2)
        {
            movdqa(xmmword[&m_env.temp.zd], xmm1);
        }

        // zd &= 0xffffffff >> m_sel.zpsm * 8;

        if(m_sel.zpsm)
        {
            pslld(xmm1, m_sel.zpsm * 8);
            psrld(xmm1, m_sel.zpsm * 8);
        }

        if(m_sel.zoverflow || m_sel.zpsm == 0)
        {
            // GSVector4i o = GSVector4i::x80000000();

            pcmpeqd(xmm4, xmm4);
            pslld(xmm4, 31);

            // GSVector4i zso = zs - o;

            psubd(xmm0, xmm4);

            // GSVector4i zdo = zd - o;

            psubd(xmm1, xmm4);
        }

        switch(m_sel.ztst)
        {
        case ZTST_GEQUAL:
            // test |= zso < zdo; // ~(zso >= zdo)
            pcmpgtd(xmm1, xmm0);
            por(xmm7, xmm1);
            break;

        case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
            // test |= zso <= zdo; // ~(zso > zdo)
            pcmpgtd(xmm0, xmm1);
            pcmpeqd(xmm4, xmm4);
            pxor(xmm0, xmm4);
            por(xmm7, xmm0);
            break;
        }

        alltrue();
    }
}
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
{
    pand(b, mask);
    pandn(mask, a);
    por(b, mask);
}
void GSDrawScanlineCodeGenerator::AlphaBlend()
{
    if(!m_sel.fwrite)
    {
        return;
    }

    if(m_sel.abe == 0 && m_sel.aa1 == 0)
    {
        return;
    }

    if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
    {
        switch(m_sel.fpsm)
        {
        case 0:
        case 1:

            // c[2] = fd & mask;
            // c[3] = (fd >> 8) & mask;

            movdqa(xmm0, xmm2);
            movdqa(xmm1, xmm2);

            psllw(xmm0, 8);
            psrlw(xmm0, 8);
            psrlw(xmm1, 8);

            break;

        case 2:

            // c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
            // c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);

            movdqa(xmm0, xmm2);
            movdqa(xmm1, xmm2);
            movdqa(xmm4, xmm2);

            pcmpeqd(xmm7, xmm7);
            psrld(xmm7, 27); // 0x0000001f
            pand(xmm0, xmm7);
            pslld(xmm0, 3);

            pslld(xmm7, 10); // 0x00007c00
            pand(xmm4, xmm7);
            pslld(xmm4, 9);

            por(xmm0, xmm4);

            movdqa(xmm4, xmm1);

            psrld(xmm7, 5); // 0x000003e0
            pand(xmm1, xmm7);
            psrld(xmm1, 2);

            psllw(xmm7, 10); // 0x00008000
            pand(xmm4, xmm7);
            pslld(xmm4, 8);

            por(xmm1, xmm4);

            break;
        }
    }

    // xmm5, xmm6 = src rb, ga
    // xmm0, xmm1 = dst rb, ga
    // xmm2, xmm3 = used
    // xmm4, xmm7 = free

    if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
    {
        movdqa(xmm4, xmm5);
    }

    if(m_sel.aba != m_sel.abb)
    {
        // rb = c[aba * 2 + 0];

        switch(m_sel.aba)
        {
        case 0:
            break;
        case 1:
            movdqa(xmm5, xmm0);
            break;
        case 2:
            pxor(xmm5, xmm5);
            break;
        }

        // rb = rb.sub16(c[abb * 2 + 0]);

        switch(m_sel.abb)
        {
        case 0:
            psubw(xmm5, xmm4);
            break;
        case 1:
            psubw(xmm5, xmm0);
            break;
        case 2:
            break;
        }

        if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
        {
            // GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_env.afix;

            switch(m_sel.abc)
            {
            case 0:
            case 1:
                movdqa(xmm7, m_sel.abc ? xmm1 : xmm6);
                pshuflw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
                pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
                psllw(xmm7, 7);
                break;
            case 2:
                movdqa(xmm7, xmmword[&m_env.afix]);
                break;
            }

            // rb = rb.modulate16<1>(a);

            modulate16<1>(xmm5, xmm7);
        }

        // rb = rb.add16(c[abd * 2 + 0]);

        switch(m_sel.abd)
        {
        case 0:
            paddw(xmm5, xmm4);
            break;
        case 1:
            paddw(xmm5, xmm0);
            break;
        case 2:
            break;
        }
    }
    else
    {
        // rb = c[abd * 2 + 0];

        switch(m_sel.abd)
        {
        case 0:
            break;
        case 1:
            movdqa(xmm5, xmm0);
            break;
        case 2:
            pxor(xmm5, xmm5);
            break;
        }
    }

    if(m_sel.pabe)
    {
        // mask = (c[1] << 8).sra32(31);

        movdqa(xmm0, xmm6);
        pslld(xmm0, 8);
        psrad(xmm0, 31);

        // rb = c[0].blend8(rb, mask);

        blend8r(xmm5, xmm4);
    }

    // xmm6 = src ga
    // xmm1 = dst ga
    // xmm5 = rb
    // xmm7 = a
    // xmm2, xmm3 = used
    // xmm0, xmm4 = free

    movdqa(xmm4, xmm6);

    if(m_sel.aba != m_sel.abb)
    {
        // ga = c[aba * 2 + 1];

        switch(m_sel.aba)
        {
        case 0:
            break;
        case 1:
            movdqa(xmm6, xmm1);
            break;
        case 2:
            pxor(xmm6, xmm6);
            break;
        }

        // ga = ga.sub16(c[abeb * 2 + 1]);

        switch(m_sel.abb)
        {
        case 0:
            psubw(xmm6, xmm4);
            break;
        case 1:
            psubw(xmm6, xmm1);
            break;
        case 2:
            break;
        }

        if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
        {
            // ga = ga.modulate16<1>(a);

            modulate16<1>(xmm6, xmm7);
        }

        // ga = ga.add16(c[abd * 2 + 1]);

        switch(m_sel.abd)
        {
        case 0:
            paddw(xmm6, xmm4);
            break;
        case 1:
            paddw(xmm6, xmm1);
            break;
        case 2:
            break;
        }
    }
    else
    {
        // ga = c[abd * 2 + 1];

        switch(m_sel.abd)
        {
        case 0:
            break;
        case 1:
            movdqa(xmm6, xmm1);
            break;
        case 2:
            pxor(xmm6, xmm6);
            break;
        }
    }

    // xmm4 = src ga
    // xmm5 = rb
    // xmm6 = ga
    // xmm2, xmm3 = used
    // xmm0, xmm1, xmm7 = free

    if(m_sel.pabe)
    {
        if(!m_cpu.has(util::Cpu::tSSE41))
        {
            // doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)

            movdqa(xmm0, xmm4);
            pslld(xmm0, 8);
            psrad(xmm0, 31);
        }

        psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)

        // ga = c[1].blend8(ga, mask).mix16(c[1]);

        blend8r(xmm6, xmm4);
    }
    else
    {
        if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
        {
            mix16(xmm6, xmm4, xmm7);
        }
    }
}
void GSDrawScanlineCodeGenerator::WriteFrame(int params)
{
    const int _top = params + 4;

    if(!m_sel.fwrite)
    {
        return;
    }

    if(m_sel.colclamp == 0)
    {
        // c[0] &= 0x000000ff;
        // c[1] &= 0x000000ff;

        pcmpeqd(xmm7, xmm7);
        psrlw(xmm7, 8);
        pand(xmm5, xmm7);
        pand(xmm6, xmm7);
    }

    if(m_sel.fpsm == 2 && m_sel.dthe)
    {
        mov(eax, dword[esp + _top]);
        and(eax, 3);
        shl(eax, 5);
        paddw(xmm5, xmmword[eax + (size_t)&m_env.dimx[0]]);
        paddw(xmm6, xmmword[eax + (size_t)&m_env.dimx[1]]);
    }

    // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));

    movdqa(xmm7, xmm5);
    punpcklwd(xmm5, xmm6);
    punpckhwd(xmm7, xmm6);
    packuswb(xmm5, xmm7);

    if(m_sel.fba && m_sel.fpsm != 1)
    {
        // fs |= 0x80000000;

        pcmpeqd(xmm7, xmm7);
        pslld(xmm7, 31);
        por(xmm5, xmm7);
    }

    if(m_sel.fpsm == 2)
    {
        // GSVector4i rb = fs & 0x00f800f8;
        // GSVector4i ga = fs & 0x8000f800;

        mov(eax, 0x00f800f8);
        movd(xmm6, eax);
        pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0));

        mov(eax, 0x8000f800);
        movd(xmm7, eax);
        pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0));

        movdqa(xmm4, xmm5);
        pand(xmm4, xmm6);
        pand(xmm5, xmm7);

        // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);

        movdqa(xmm6, xmm4);
        movdqa(xmm7, xmm5);

        psrld(xmm4, 3);
        psrld(xmm6, 9);
        psrld(xmm5, 6);
        psrld(xmm7, 16);

        por(xmm5, xmm4);
        por(xmm7, xmm6);
        por(xmm5, xmm7);
    }

    if(m_sel.rfb)
    {
        // fs = fs.blend(fd, fm);

        blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
    }

    bool fast = m_sel.rfb && m_sel.fpsm < 2;

    WritePixel(xmm5, xmm0, ebx, dl, fast, m_sel.fpsm);
}
Exemplo n.º 20
0
int main()
{
	int rval;
	mmx_t ma;
	mmx_t mb;

	movq_r2r(mm0, mm1);

	rval = mmx_ok();

	/* Announce return value of mmx_ok() */
//	printf("Value returned from init was %x.", rval);
//	printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not");
//	fflush(stdout); fflush(stderr);

//	if(rval)
	{
		/* PADD *****************************************************/
		ma.q = 0x1111111180000000LL;
		mb.q = 0x7fffffff00000001LL;
		paddd(ma, mb);
		fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddd: mb.q is 9111111080000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddb(ma, mb);
		fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddb: mb.q is 8180000281800002\n");
		fflush(stdout); fflush(stderr);


		/* PADDS ****************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n");

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n");

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddsb(ma, mb);
		fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n");
		fflush(stdout); fflush(stderr);


		/* PADDUS ***************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddusb(ma, mb);
		fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n");
		fflush(stdout); fflush(stderr);


		/* PSUB *****************************************************/
		ma.q = 0x7fffffff00000001LL;
		mb.q = 0x1111111180000000LL;
		psubd(ma, mb);
		fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubd: mb.q is 911111127fffffff\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 8001800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubb(ma, mb);
		fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PSUBS ****************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubsb(ma, mb);
		fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n");
		fflush(stdout); fflush(stderr);
 

		/* PSUBUS ***************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubusb(ma, mb);
		fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PMUL *****************************************************/
		ma.q = 0x8000ffff00ff0000LL;
		mb.q = 0x0200ffff00ffffffLL;
		pmulhw(ma, mb);
		fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0200ffff00ffffffLL;
		pmullw(ma, mb);
		fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n");
		fflush(stdout); fflush(stderr);


		/* PMADD ****************************************************/
		ma.q = 0x8000345680007f34LL;
		mb.q = 0x93234a27ffff1707LL;

		pmaddwd(ma, mb);
		fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n");
		fflush(stdout); fflush(stderr);


		/* PCMPEQ ***************************************************/
		ma.q = 0x800034568f237f34LL;
		mb.q = 0x93009a568f237f34LL;

		pcmpeqd(ma, mb);
		fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqw(ma, mb);
		fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqb(ma, mb);
		fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n");
		fflush(stdout); fflush(stderr);



		/* PCMPGT ***************************************************/
		ma.q = 0x666688884477aaffLL;
		mb.q = 0x1234567890abcdefLL;

		pcmpgtd(ma, mb);
		fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtw(ma, mb);
		fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtb(ma, mb);
		fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n");
		fflush(stdout); fflush(stderr);


		/* PACKSS ***************************************************/
		ma.q = 0x00012222000abbbbLL;
		mb.q = 0x0000888800003333LL;

		packssdw(ma, mb);
		fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packsswb(ma, mb);
		fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n");
		fflush(stdout); fflush(stderr);


		/* PACKUS ***************************************************/
		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packuswb(ma, mb);
		fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKH **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckhdq(ma, mb);
		fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhwd(ma, mb);
		fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhbw(ma, mb);
		fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKL **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckldq(ma, mb);
		fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklwd(ma, mb);
		fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklbw(ma, mb);
		fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n");
		fflush(stdout); fflush(stderr);



		/* PAND, PANDN, POR, PXOR ***********************************/
		ma.q = 0x5555555555555555LL;
		mb.q = 0x3333333333333333LL;

		pand(ma, mb);
		fprintf(stdout, "pand: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pand: mb.q is 1111111111111111\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pandn(ma, mb);
		fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pandn: mb.q is 4444444444444444\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		por(ma, mb);
		fprintf(stdout, "por: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "por: mb.q is 7777777777777777\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pxor(ma, mb);
		fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pxor: mb.q is 6666666666666666\n");
		fflush(stdout); fflush(stderr);



		/* PSLL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psllq(ma, mb);
		fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		pslld(ma, mb);
		fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pslld: mb.q is 67000000ef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psllw(ma, mb);
		fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrlq(ma, mb);
		fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlq: mb.q is 0000000123456789\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrld(ma, mb);
		fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrld: mb.q is 0000000100000089\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrlw(ma, mb);
		fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRA *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrad(ma, mb);
		fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psraw(ma, mb);
		fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		/* Exit MXX *************************************************/
		emms();
	}

	/* Clean-up and exit nicely */
	exit(0);
}
Exemplo n.º 21
0
void MacroAssembler::fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2;
  Label L_2TAG_PACKET_10_0_2, start;

  assert_different_registers(tmp, eax, ecx, edx);
  jmp(start);
  address static_const_table = (address)_static_const_table_log;

  bind(start);
  subl(rsp, 104);
  movl(Address(rsp, 40), tmp);
  lea(tmp, ExternalAddress(static_const_table));
  xorpd(xmm2, xmm2);
  movl(eax, 16368);
  pinsrw(xmm2, eax, 3);
  xorpd(xmm3, xmm3);
  movl(edx, 30704);
  pinsrw(xmm3, edx, 3);
  movsd(xmm0, Address(rsp, 112));
  movapd(xmm1, xmm0);
  movl(ecx, 32768);
  movdl(xmm4, ecx);
  movsd(xmm5, Address(tmp, 2128));         // 0x00000000UL, 0xffffe000UL
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  psllq(xmm0, 5);
  movl(ecx, 16352);
  psrlq(xmm0, 34);
  rcpss(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  subl(eax, 16);
  cmpl(eax, 32736);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);

  bind(L_2TAG_PACKET_1_0_2);
  paddd(xmm0, xmm4);
  por(xmm1, xmm3);
  movdl(edx, xmm0);
  psllq(xmm0, 29);
  pand(xmm5, xmm1);
  pand(xmm0, xmm6);
  subsd(xmm1, xmm5);
  mulpd(xmm5, xmm0);
  andl(eax, 32752);
  subl(eax, ecx);
  cvtsi2sdl(xmm7, eax);
  mulsd(xmm1, xmm0);
  movsd(xmm6, Address(tmp, 2064));         // 0xfefa3800UL, 0x3fa62e42UL
  movdqu(xmm3, Address(tmp, 2080));        // 0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL
  subsd(xmm5, xmm2);
  andl(edx, 16711680);
  shrl(edx, 12);
  movdqu(xmm0, Address(tmp, edx));
  movdqu(xmm4, Address(tmp, 2096));        // 0x3d6fb175UL, 0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL
  addsd(xmm1, xmm5);
  movdqu(xmm2, Address(tmp, 2112));        // 0x9999999aUL, 0x3fc99999UL, 0x00000000UL, 0xbfe00000UL
  mulsd(xmm6, xmm7);
  pshufd(xmm5, xmm1, 68);
  mulsd(xmm7, Address(tmp, 2072));         // 0x93c76730UL, 0x3ceef357UL, 0x92492492UL, 0x3fc24924UL
  mulsd(xmm3, xmm1);
  addsd(xmm0, xmm6);
  mulpd(xmm4, xmm5);
  mulpd(xmm5, xmm5);
  pshufd(xmm6, xmm0, 228);
  addsd(xmm0, xmm1);
  addpd(xmm4, xmm2);
  mulpd(xmm3, xmm5);
  subsd(xmm6, xmm0);
  mulsd(xmm4, xmm1);
  pshufd(xmm2, xmm0, 238);
  addsd(xmm1, xmm6);
  mulsd(xmm5, xmm5);
  addsd(xmm7, xmm2);
  addpd(xmm4, xmm3);
  addsd(xmm1, xmm7);
  mulpd(xmm4, xmm5);
  addsd(xmm1, xmm4);
  pshufd(xmm5, xmm4, 238);
  addsd(xmm1, xmm5);
  addsd(xmm0, xmm1);
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_0_0_2);
  movsd(xmm0, Address(rsp, 112));
  movdqu(xmm1, xmm0);
  addl(eax, 16);
  cmpl(eax, 32768);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_3_0_2);
  cmpl(eax, 16);
  jcc(Assembler::below, L_2TAG_PACKET_4_0_2);

  bind(L_2TAG_PACKET_5_0_2);
  addsd(xmm0, xmm0);
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_6_0_2);
  jcc(Assembler::above, L_2TAG_PACKET_5_0_2);
  cmpl(edx, 0);
  jcc(Assembler::above, L_2TAG_PACKET_5_0_2);
  jmp(L_2TAG_PACKET_7_0_2);

  bind(L_2TAG_PACKET_3_0_2);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  addl(ecx, ecx);
  cmpl(ecx, -2097152);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_6_0_2);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_8_0_2);

  bind(L_2TAG_PACKET_7_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 32752);
  pinsrw(xmm1, eax, 3);
  movl(edx, 3);
  mulsd(xmm0, xmm1);

  bind(L_2TAG_PACKET_9_0_2);
  movsd(Address(rsp, 0), xmm0);
  movsd(xmm0, Address(rsp, 112));
  fld_d(Address(rsp, 0));
  jmp(L_2TAG_PACKET_10_0_2);

  bind(L_2TAG_PACKET_8_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 49136);
  pinsrw(xmm0, eax, 3);
  divsd(xmm0, xmm1);
  movl(edx, 2);
  jmp(L_2TAG_PACKET_9_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_8_0_2);
  xorpd(xmm1, xmm1);
  movl(eax, 18416);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movapd(xmm1, xmm0);
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  psllq(xmm0, 5);
  movl(ecx, 18416);
  psrlq(xmm0, 34);
  rcpss(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  jmp(L_2TAG_PACKET_1_0_2);

  bind(L_2TAG_PACKET_2_0_2);
  movsd(Address(rsp, 24), xmm0);
  fld_d(Address(rsp, 24));

  bind(L_2TAG_PACKET_10_0_2);
  movl(tmp, Address(rsp, 40));
}