// Emits a three-instruction bitwise select.
// With the SSE encodings the result is b = (b & mask) | (a & ~mask).
// mask is overwritten along the way; a is only read.
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
{
#if _M_SSE < 0x500

	// Legacy SSE encodings.
	pand(b, mask);   // b &= mask
	pandn(mask, a);  // mask = ~mask & a
	por(b, mask);    // b |= mask

#else

	// Same sequence using the VEX-prefixed mnemonics for AVX builds.
	vpand(b, mask);
	vpandn(mask, a);
	vpor(b, mask);

#endif
}
// Emits a bitwise select between a and b under mask and leaves the result in
// both operands: b = (b & mask) | (a & ~mask), followed by a = b.
// mask is clobbered (it ends up holding a & ~mask).
void GPUDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
{
	// keep the bits of b that mask selects
	pand(b, mask);

	// mask = ~mask & a: the bits of a where mask was clear
	pandn(mask, a);

	// merge the two halves, then mirror the result into a
	por(b, mask);
	movdqa(a, b);
}
// Cycle through integer types TEST(Exp, VarySubgroup) { const int pbits = 2048; for(int qbits = 160; qbits<pbits; qbits+=64) { // Generate prime CryptoPP::AutoSeededX917RNG<CryptoPP::AES> rng(false, true); CryptoPP::PrimeAndGenerator pand(1, rng, pbits, qbits); const CryptoPP::Integer two(2); double total = 0.0f; // Do 1000 exps for(int i=0; i<1000; i++) { CryptoPP::Integer v = a_exp_b_mod_c( pand.Generator(), CryptoPP::Integer(rng, two, pand.SubPrime(), CryptoPP::Integer::ANY), pand.Prime()); CryptoPP::Integer e(rng, two, pand.SubPrime(), CryptoPP::Integer::ANY); double start = QDateTime::currentMSecsSinceEpoch(); CryptoPP::Integer r = a_exp_b_mod_c(v, e, pand.Prime()); double end = QDateTime::currentMSecsSinceEpoch(); total += (end-start); } qDebug() << qbits << total; } }
// Emits code that replaces the odd 16-bit words of a with those of b and keeps
// the even words of a. temp is only touched (and clobbered) on the fallback
// path taken when the CPU does not report SSE4.1.
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
{
	if(m_cpu.has(util::Cpu::tSSE41))
	{
		// 0xaa = 10101010b: take words 1, 3, 5, 7 from b
		pblendw(a, b, 0xaa);

		return;
	}

	// temp = 0x0000ffff in every dword
	pcmpeqd(temp, temp);
	psrld(temp, 16);

	// a keeps its low words, temp picks up the high words of b
	pand(a, temp);
	pandn(temp, b);

	por(a, temp);
}
// Emits code that replaces the odd 16-bit words of a with those of b and keeps
// the even words of a. The instruction form is chosen at compile time from
// _M_SSE; temp is only used (and clobbered) by the pre-SSE4.1 fallback.
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
{
#if _M_SSE < 0x401

	// No pblendw available: build the select by hand.
	// temp = 0x0000ffff in every dword
	pcmpeqd(temp, temp);
	psrld(temp, 16);

	// a keeps its low words, temp picks up the high words of b
	pand(a, temp);
	pandn(temp, b);

	por(a, temp);

#elif _M_SSE < 0x500

	// SSE4.1: 0xaa = 10101010b selects words 1, 3, 5, 7 from b
	pblendw(a, b, 0xaa);

#else

	// AVX build: same blend with the VEX-prefixed mnemonic
	vpblendw(a, b, 0xaa);

#endif
}
void GSSetupPrimCodeGenerator::Depth() { if(!m_en.z && !m_en.f) { return; } if(!m_env.sel.sprite) { // GSVector4 t = dscan.p; movaps(xmm0, xmmword[edx + 16]); if(m_en.f) { // GSVector4 df = p.wwww(); movaps(xmm1, xmm0); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); // m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh(); movaps(xmm2, xmm1); mulps(xmm2, xmm3); cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(xmmword[&m_env.d4.f], xmm2); for(int i = 0; i < 4; i++) { // m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); movaps(xmm2, xmm1); mulps(xmm2, Xmm(4 + i)); cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(xmmword[&m_env.d[i].f], xmm2); } } if(m_en.z) { // GSVector4 dz = p.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); // m_env.d4.z = dz * 4.0f; movaps(xmm1, xmm0); mulps(xmm1, xmm3); movdqa(xmmword[&m_env.d4.z], xmm1); for(int i = 0; i < 4; i++) { // m_env.d[i].z = dz * m_shift[i]; movaps(xmm1, xmm0); mulps(xmm1, Xmm(4 + i)); movdqa(xmmword[&m_env.d[i].z], xmm1); } } } else { // GSVector4 p = vertices[0].p; movaps(xmm0, xmmword[ecx + 16]); if(m_en.f) { // m_env.p.f = GSVector4i(p).zzzzh().zzzz(); movaps(xmm1, xmm0); cvttps2dq(xmm1, xmm1); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); movdqa(xmmword[&m_env.p.f], xmm1); } if(m_en.z) { // GSVector4 z = p.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); if(m_env.sel.zoverflow) { // m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); static const float half = 0.5f; movss(xmm1, dword[&half]); shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); mulps(xmm1, xmm0); cvttps2dq(xmm1, xmm1); pslld(xmm1, 1); cvttps2dq(xmm0, xmm0); pcmpeqd(xmm2, xmm2); psrld(xmm2, 31); pand(xmm0, xmm2); por(xmm0, xmm1); } else { // m_env.p.z = GSVector4i(z); cvttps2dq(xmm0, xmm0); } movdqa(xmmword[&m_env.p.z], xmm0); } } }
void GPUDrawScanlineCodeGenerator::AlphaBlend() { if(!m_sel.abe) { return; } // xmm1 = fd // xmm3 = a // xmm4 = r // xmm5 = g // xmm6 = b // xmm7 = test // xmm0, xmm2 = free // GSVector4i r = (fd & 0x001f001f) << 3; pcmpeqd(xmm0, xmm0); psrlw(xmm0, 11); // 0x001f movdqa(xmm2, xmm1); pand(xmm2, xmm0); psllw(xmm2, 3); switch(m_sel.abr) { case 0: // r = r.avg8(c[0]); pavgb(xmm2, xmm4); break; case 1: // r = r.addus8(c[0]); paddusb(xmm2, xmm4); break; case 2: // r = r.subus8(c[0]); psubusb(xmm2, xmm4); break; case 3: // r = r.addus8(c[0].srl16(2)); movdqa(xmm0, xmm4); psrlw(xmm0, 2); paddusb(xmm2, xmm0); break; } if(m_sel.tme) { movdqa(xmm0, xmm3); blend8(xmm4, xmm2); } else { movdqa(xmm4, xmm2); } // GSVector4i g = (d & 0x03e003e0) >> 2; pcmpeqd(xmm0, xmm0); psrlw(xmm0, 11); psllw(xmm0, 5); // 0x03e0 movdqa(xmm2, xmm1); pand(xmm2, xmm0); psrlw(xmm2, 2); switch(m_sel.abr) { case 0: // g = g.avg8(c[2]); pavgb(xmm2, xmm5); break; case 1: // g = g.addus8(c[2]); paddusb(xmm2, xmm5); break; case 2: // g = g.subus8(c[2]); psubusb(xmm2, xmm5); break; case 3: // g = g.addus8(c[2].srl16(2)); movdqa(xmm0, xmm5); psrlw(xmm0, 2); paddusb(xmm2, xmm0); break; } if(m_sel.tme) { movdqa(xmm0, xmm3); blend8(xmm5, xmm2); } else { movdqa(xmm5, xmm2); } // GSVector4i b = (d & 0x7c007c00) >> 7; pcmpeqd(xmm0, xmm0); psrlw(xmm0, 11); psllw(xmm0, 10); // 0x7c00 movdqa(xmm2, xmm1); pand(xmm2, xmm0); psrlw(xmm2, 7); switch(m_sel.abr) { case 0: // b = b.avg8(c[2]); pavgb(xmm2, xmm6); break; case 1: // b = b.addus8(c[2]); paddusb(xmm2, xmm6); break; case 2: // b = b.subus8(c[2]); psubusb(xmm2, xmm6); break; case 3: // b = b.addus8(c[2].srl16(2)); movdqa(xmm0, xmm6); psrlw(xmm0, 2); paddusb(xmm2, xmm0); break; } if(m_sel.tme) { movdqa(xmm0, xmm3); blend8(xmm6, xmm2); } else { movdqa(xmm6, xmm2); } }
void GPUDrawScanlineCodeGenerator::WriteFrame() { // GSVector4i fs = r | g | b | (m_sel.md ? GSVector4i(0x80008000) : m_sel.tme ? a : 0); pcmpeqd(xmm0, xmm0); if(m_sel.md || m_sel.tme) { movdqa(xmm2, xmm0); psllw(xmm2, 15); } psrlw(xmm0, 11); psllw(xmm0, 3); // xmm0 = 0x00f8 // xmm2 = 0x8000 (md) // GSVector4i r = (c[0] & 0x00f800f8) >> 3; pand(xmm4, xmm0); psrlw(xmm4, 3); // GSVector4i g = (c[1] & 0x00f800f8) << 2; pand(xmm5, xmm0); psllw(xmm5, 2); por(xmm4, xmm5); // GSVector4i b = (c[2] & 0x00f800f8) << 7; pand(xmm6, xmm0); psllw(xmm6, 7); por(xmm4, xmm6); if(m_sel.md) { // GSVector4i a = GSVector4i(0x80008000); por(xmm4, xmm2); } else if(m_sel.tme) { // GSVector4i a = (c[3] << 8) & 0x80008000; psllw(xmm3, 8); pand(xmm3, xmm2); por(xmm4, xmm3); } // fs = fs.blend8(fd, test); movdqa(xmm0, xmm7); blend8(xmm4, xmm1); // GSVector4i::store<false>(fb, fs); // movdqu(ptr[edi], xmm4); movq(qword[edi], xmm4); movhps(qword[edi + 8], xmm4); }
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; if(wms_clamp == wmt_clamp) { if(wms_clamp) { if(region) { pmaxsw(uv, xmmword[&m_env.t.min]); } else { pxor(xmm0, xmm0); pmaxsw(uv, xmm0); } pminsw(uv, xmmword[&m_env.t.max]); } else { pand(uv, xmmword[&m_env.t.min]); if(region) { por(uv, xmmword[&m_env.t.max]); } } } else { movdqa(xmm1, uv); movdqa(xmm4, xmmword[&m_env.t.min]); movdqa(xmm5, xmmword[&m_env.t.max]); // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max); pmaxsw(uv, xmm4); pminsw(uv, xmm5); // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max; pand(xmm1, xmm4); if(region) { por(xmm1, xmm5); } // clamp.blend8(repeat, m_env.t.mask); movdqa(xmm0, xmmword[&m_env.t.mask]); blend8(uv, xmm1); } }
// Emits the texture sampling stage. Nothing is emitted when m_sel.tme is clear.
//
// On entry (per the register notes below): xmm2 = s, xmm3 = t, xmm7 = test,
// xmm1 is live and must be preserved.
// On exit: xmm4 = r, xmm5 = g, xmm6 = b, xmm3 = a, and xmm7 additionally has
// all-zero texels ORed in so that they are masked out.
//
// m_sel.ltf selects the four-texel filtered path, otherwise a single texel is
// read. m_sel.twin selects the and/add texture window formula instead of the
// min() clamp against m_local.twin[2].
void GPUDrawScanlineCodeGenerator::SampleTexture()
{
	if(!m_sel.tme)
	{
		return;
	}

	if(m_sel.tlu)
	{
		// edx = clut pointer; not referenced again in this function,
		// presumably consumed by ReadTexel - confirm
		mov(edx, ptr[&m_local.gd->clut]);
	}

	// xmm2 = s
	// xmm3 = t
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f
		// GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f

		mov(eax, 0x00200020);
		movd(xmm0, eax);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));

		psubw(xmm2, xmm0);
		psubw(xmm3, xmm0);

		// GSVector4i uf = (u & GSVector4i::x00ff()) << 7;
		// GSVector4i vf = (v & GSVector4i::x00ff()) << 7;

		// (x << 8) >> 1 on 16-bit lanes keeps the low byte and leaves it at bits 7..14
		movdqa(xmm0, xmm2);
		psllw(xmm0, 8);
		psrlw(xmm0, 1);
		movdqa(ptr[&m_local.temp.uf], xmm0);

		// NOTE(review): temp.vf is only written here for non-sprites, yet the
		// filtered path below always reads it - presumably it is set up
		// elsewhere for sprites; confirm
		if(!m_sel.sprite)
		{
			movdqa(xmm0, xmm3);
			psllw(xmm0, 8);
			psrlw(xmm0, 1);
			movdqa(ptr[&m_local.temp.vf], xmm0);
		}
	}

	// GSVector4i u0 = s.srl16(8);
	// GSVector4i v0 = t.srl16(8);

	psrlw(xmm2, 8);
	psrlw(xmm3, 8);

	// xmm2 = u
	// xmm3 = v
	// xmm7 = test
	// xmm0, xmm4, xmm5, xmm6 = free
	// xmm1 = used

	if(m_sel.ltf)
	{
		// GSVector4i u1 = u0.add16(GSVector4i::x0001());
		// GSVector4i v1 = v0.add16(GSVector4i::x0001());

		movdqa(xmm4, xmm2);
		movdqa(xmm5, xmm3);

		// xmm0 = 0x0001 per word
		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 15);
		paddw(xmm4, xmm0);
		paddw(xmm5, xmm0);

		if(m_sel.twin)
		{
			// u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
			// u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);

			movdqa(xmm0, ptr[&m_local.twin[0].u]);
			movdqa(xmm6, ptr[&m_local.twin[1].u]);

			pand(xmm2, xmm0);
			paddw(xmm2, xmm6);
			pand(xmm4, xmm0);
			paddw(xmm4, xmm6);

			movdqa(xmm0, ptr[&m_local.twin[0].v]);
			movdqa(xmm6, ptr[&m_local.twin[1].v]);

			pand(xmm3, xmm0);
			paddw(xmm3, xmm6);
			pand(xmm5, xmm0);
			paddw(xmm5, xmm6);
		}
		else
		{
			// u0 = u0.min_i16(m_local.twin[2].u);
			// v0 = v0.min_i16(m_local.twin[2].v);
			// u1 = u1.min_i16(m_local.twin[2].u);
			// v1 = v1.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			movdqa(xmm0, ptr[&m_local.twin[2].u]);
			movdqa(xmm6, ptr[&m_local.twin[2].v]);

			pminsw(xmm2, xmm0);
			pminsw(xmm3, xmm6);
			pminsw(xmm4, xmm0);
			pminsw(xmm5, xmm6);
		}

		// xmm2 = u0
		// xmm3 = v0
		// xmm4 = u1
		// xmm5 = v1
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// GSVector4i addr00 = v0.sll16(8) | u0;
		// GSVector4i addr01 = v0.sll16(8) | u1;
		// GSVector4i addr10 = v1.sll16(8) | u0;
		// GSVector4i addr11 = v1.sll16(8) | u1;

		psllw(xmm3, 8);
		movdqa(xmm0, xmm3);
		por(xmm3, xmm2);
		por(xmm0, xmm4);

		psllw(xmm5, 8);
		movdqa(xmm6, xmm5);
		por(xmm5, xmm2);
		por(xmm6, xmm4);

		// xmm3 = addr00
		// xmm0 = addr01
		// xmm5 = addr10
		// xmm6 = addr11
		// xmm7 = test
		// xmm2, xmm4 = free
		// xmm1 = used

		// first argument receives the texel, second is the address; the order
		// below lets each destination reuse a register whose address is
		// already consumed
		ReadTexel(xmm2, xmm3);
		ReadTexel(xmm4, xmm0);
		ReadTexel(xmm3, xmm5);
		ReadTexel(xmm5, xmm6);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm7 = test
		// xmm0, xmm6 = free
		// xmm1 = used

		// spill (TODO)

		// fd and test are parked in memory so xmm1 and xmm7 can be used as
		// scratch during the interpolation; both are reloaded further down
		movdqa(ptr[&m_local.temp.fd], xmm1);
		movdqa(ptr[&m_local.temp.test], xmm7);

		// xmm2 = c00
		// xmm4 = c01
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm1, xmm6, xmm7 = free

		// top row (c00, c01): extract each channel and interpolate with uf

		// r: bits 0..4 moved to bits 3..7
		movdqa(xmm1, xmm2);
		psllw(xmm1, 11);
		psrlw(xmm1, 8);

		movdqa(xmm0, xmm4);
		psllw(xmm0, 11);
		psrlw(xmm0, 8);

		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]);

		// g: bits 5..9 moved to bits 3..7
		movdqa(xmm6, xmm2);
		psllw(xmm6, 6);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		movdqa(xmm1, xmm4);
		psllw(xmm1, 6);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]);

		// b: bits 10..14 moved to bits 3..7
		movdqa(xmm7, xmm2);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm6, xmm4);
		psllw(xmm6, 1);
		psrlw(xmm6, 11);
		psllw(xmm6, 3);

		lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]);

		// a: bit 15 expanded to 0x00ff / 0x0000
		psraw(xmm2, 15);
		psrlw(xmm2, 8);
		psraw(xmm4, 15);
		psrlw(xmm4, 8);

		lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]);

		// xmm0 = r00
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm2, xmm7 = free

		// bottom row (c10, c11): same extraction, interpolated with uf and then
		// combined with the top row result using vf

		movdqa(xmm7, xmm3);
		psllw(xmm7, 11);
		psrlw(xmm7, 8);

		movdqa(xmm2, xmm5);
		psllw(xmm2, 11);
		psrlw(xmm2, 8);

		lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm1 = g00
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm0, xmm7 = free

		movdqa(xmm7, xmm3);
		psllw(xmm7, 6);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm0, xmm5);
		psllw(xmm0, 6);
		psrlw(xmm0, 11);
		psllw(xmm0, 3);

		lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm6 = b00
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm1, xmm7 = free

		movdqa(xmm7, xmm3);
		psllw(xmm7, 1);
		psrlw(xmm7, 11);
		psllw(xmm7, 3);

		movdqa(xmm1, xmm5);
		psllw(xmm1, 1);
		psrlw(xmm1, 11);
		psllw(xmm1, 3);

		lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm4 = a00
		// xmm3 = c10
		// xmm5 = c11
		// xmm6, xmm7 = free

		psraw(xmm3, 15);
		psrlw(xmm3, 8);
		psraw(xmm5, 15);
		psrlw(xmm5, 8);

		lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]);
		lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]);

		// xmm2 = r
		// xmm0 = g
		// xmm1 = b
		// xmm5 = a
		// xmm3, xmm4, xmm6, xmm7 = free

		// TODO

		// shuffle the results into the registers the later stages expect;
		// a must leave xmm5 before g is moved into it
		movdqa(xmm3, xmm5); // a
		movdqa(xmm4, xmm2); // r
		movdqa(xmm6, xmm1); // b
		movdqa(xmm5, xmm0); // g

		// reload test

		movdqa(xmm7, ptr[&m_local.temp.test]);

		// xmm4 = r
		// xmm5 = g
		// xmm6 = b
		// xmm3 = a
		// xmm7 = test
		// xmm0, xmm1, xmm2 = free

		// test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect)

		movdqa(xmm1, xmm3);
		por(xmm1, xmm4);
		movdqa(xmm2, xmm5);
		por(xmm2, xmm6);
		por(xmm1, xmm2);

		pxor(xmm0, xmm0);
		pcmpeqw(xmm1, xmm0);
		por(xmm7, xmm1);

		// a = a.gt16(GSVector4i::zero());

		pcmpgtw(xmm3, xmm0);

		// reload fd

		movdqa(xmm1, ptr[&m_local.temp.fd]);
	}
	else
	{
		if(m_sel.twin)
		{
			// u = (u & m_local.twin[0].u).add16(m_local.twin[1].u);
			// v = (v & m_local.twin[0].v).add16(m_local.twin[1].v);

			pand(xmm2, ptr[&m_local.twin[0].u]);
			paddw(xmm2, ptr[&m_local.twin[1].u]);
			pand(xmm3, ptr[&m_local.twin[0].v]);
			paddw(xmm3, ptr[&m_local.twin[1].v]);
		}
		else
		{
			// u = u.min_i16(m_local.twin[2].u);
			// v = v.min_i16(m_local.twin[2].v);

			// TODO: if(!sprite) clamp16 else:

			pminsw(xmm2, ptr[&m_local.twin[2].u]);
			pminsw(xmm3, ptr[&m_local.twin[2].v]);
		}

		// xmm2 = u
		// xmm3 = v
		// xmm7 = test
		// xmm0, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		// GSVector4i addr = v.sll16(8) | u;

		psllw(xmm3, 8);
		por(xmm3, xmm2);

		// xmm3 = addr
		// xmm7 = test
		// xmm0, xmm2, xmm4, xmm5, xmm6 = free
		// xmm1 = used

		ReadTexel(xmm6, xmm3);

		// xmm6 = c00 (the texel is read into xmm6; all code below takes it from there)
		// xmm7 = test
		// xmm0, xmm2, xmm3, xmm4, xmm5 = free
		// xmm1 = used

		// test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels

		pxor(xmm0, xmm0);
		pcmpeqw(xmm0, xmm6);
		por(xmm7, xmm0);

		// c[0] = (c00 << 3) & 0x00f800f8;
		// c[1] = (c00 >> 2) & 0x00f800f8;
		// c[2] = (c00 >> 7) & 0x00f800f8;
		// c[3] = c00.sra16(15);

		movdqa(xmm3, xmm6);
		psraw(xmm3, 15); // a

		pcmpeqd(xmm0, xmm0);
		psrlw(xmm0, 11);
		psllw(xmm0, 3); // 0x00f8

		movdqa(xmm4, xmm6);
		psllw(xmm4, 3);
		pand(xmm4, xmm0); // r

		movdqa(xmm5, xmm6);
		psrlw(xmm5, 2);
		pand(xmm5, xmm0); // g

		// b is extracted in place, xmm6 no longer holds the raw texel afterwards
		psrlw(xmm6, 7);
		pand(xmm6, xmm0); // b
	}
}
// Emits a table-driven double-precision exp() stub, 32-bit variant.
//
// As emitted here:
//  - 120 bytes of stack are reserved; tmp is saved at [rsp + 64] and reloaded
//    at the end, and is used in between as the base of _static_const_table.
//  - The input is read from [rsp + 128] (after the stack adjustment).
//  - Every exit path finishes with an fld_d, i.e. the result is left on the
//    x87 stack.
//  - No instruction that undoes the subl(rsp, 120) is emitted in this function;
//    presumably the caller's frame teardown restores rsp - confirm.
//
// The XMM and general purpose register parameters are the scratch registers
// the emitted code clobbers. The hex comments on the loads are the table
// contents as 32-bit words, low word first.
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
  // L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3 and B1_5 are declared but
  // neither bound nor jumped to in this function.
  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp, eax, ecx, edx);
  jmp(start);
  address static_const_table = (address)_static_const_table;

  bind(start);
  subl(rsp, 120);
  movl(Address(rsp, 64), tmp);        // save tmp, restored at L_2TAG_PACKET_6_0_2
  lea(tmp, ExternalAddress(static_const_table));
  movdqu(xmm0, Address(rsp, 128));    // input value
  unpcklpd(xmm0, xmm0);               // duplicate it into both lanes
  movdqu(xmm1, Address(tmp, 64));     // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
  movdqu(xmm6, Address(tmp, 48));     // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
  movdqu(xmm2, Address(tmp, 80));     // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
  movdqu(xmm3, Address(tmp, 96));     // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
  // Range check on the top 16 bits of x with the sign masked off: the OR of
  // (16527 - e) and (e - 15504) has its sign bit set exactly when e is outside
  // [15504, 16527] (0x3c90..0x408f); those inputs go to the special-case path.
  pextrw(eax, xmm0, 3);
  andl(eax, 32767);
  movl(edx, 16527);
  subl(edx, eax);
  subl(eax, 15504);
  orl(edx, eax);
  cmpl(edx, INT_MIN);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
  // Main path. Adding and then subtracting xmm6 leaves the rounded product in
  // xmm1, while xmm7 keeps the biased sum whose low dword is picked apart below.
  mulpd(xmm1, xmm0);
  addpd(xmm1, xmm6);
  movapd(xmm7, xmm1);
  subpd(xmm1, xmm6);
  mulpd(xmm2, xmm1);
  movdqu(xmm4, Address(tmp, 128));    // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
  mulpd(xmm3, xmm1);
  movdqu(xmm5, Address(tmp, 144));    // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
  subpd(xmm0, xmm2);
  movdl(eax, xmm7);
  movl(ecx, eax);
  andl(ecx, 63);                      // low 6 bits: table index ...
  shll(ecx, 4);                       // ... times 16 bytes per entry
  sarl(eax, 6);                       // remaining bits, kept in eax and edx
  movl(edx, eax);
  movdqu(xmm6, Address(tmp, 16));     // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
  pand(xmm7, xmm6);
  movdqu(xmm6, Address(tmp, 32));     // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
  paddq(xmm7, xmm6);
  psllq(xmm7, 46);
  subpd(xmm0, xmm3);
  movdqu(xmm2, Address(tmp, ecx, Address::times_1, 160));   // table entry for this index
  mulpd(xmm4, xmm0);
  movapd(xmm6, xmm0);
  movapd(xmm1, xmm0);
  mulpd(xmm6, xmm6);
  mulpd(xmm0, xmm6);
  addpd(xmm5, xmm4);
  mulsd(xmm0, xmm6);
  mulpd(xmm6, Address(tmp, 112));     // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
  addsd(xmm1, xmm2);
  unpckhpd(xmm2, xmm2);
  mulpd(xmm0, xmm5);
  addsd(xmm1, xmm0);
  por(xmm2, xmm7);
  unpckhpd(xmm0, xmm0);
  addsd(xmm0, xmm1);
  addsd(xmm0, xmm6);
  // edx + 894 above 1916 (unsigned) sends the result through the x87 path below
  addl(edx, 894);
  cmpl(edx, 1916);
  jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
  mulsd(xmm0, xmm2);
  addsd(xmm0, xmm2);
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_1_0_2);
  // Save the x87 control word at [rsp + 24] and load a copy with bits 8 and 9
  // (768 = 0x300) set; the saved word is reloaded after the x87 arithmetic.
  fnstcw(Address(rsp, 24));
  movzwl(edx, Address(rsp, 24));
  orl(edx, 768);
  movw(Address(rsp, 28), edx);
  fldcw(Address(rsp, 28));
  // Split the value in eax into two parts (eax >> 1 and the remainder); each
  // gets 1023 added and is shifted to bit 52, the exponent field of a double.
  movl(edx, eax);
  sarl(eax, 1);
  subl(edx, eax);
  movdqu(xmm6, Address(tmp, 0));      // 0x00000000UL, 0xfff00000UL, 0x00000000UL, 0xfff00000UL
  pandn(xmm6, xmm2);
  addl(eax, 1023);
  movdl(xmm3, eax);
  psllq(xmm3, 52);
  por(xmm6, xmm3);
  addl(edx, 1023);
  movdl(xmm4, edx);
  psllq(xmm4, 52);
  // The values travel from SSE to x87 through stack slots at rsp + 8 / rsp + 16
  movsd(Address(rsp, 8), xmm0);
  fld_d(Address(rsp, 8));
  movsd(Address(rsp, 16), xmm6);
  fld_d(Address(rsp, 16));
  fmula(1);
  faddp(1);
  movsd(Address(rsp, 8), xmm4);
  fld_d(Address(rsp, 8));
  fmulp(1);
  fstp_d(Address(rsp, 8));
  movsd(xmm0,Address(rsp, 8));
  fldcw(Address(rsp, 24));            // restore the original control word
  // Classify the result by its exponent field (32752 = 0x7ff0)
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  cmpl(ecx, 32752);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
  cmpl(ecx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
  jmp(L_2TAG_PACKET_2_0_2);
  // NOTE(review): the instructions from here up to the next bind() follow an
  // unconditional jmp and no label is bound in between, so they appear to be
  // unreachable.
  cmpl(ecx, INT_MIN);
  jcc(Assembler::less, L_2TAG_PACKET_3_0_2);
  cmpl(ecx, -1064950997);
  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
  jcc(Assembler::greater, L_2TAG_PACKET_4_0_2);
  movl(edx, Address(rsp, 128));
  cmpl(edx ,-17155601);
  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
  jmp(L_2TAG_PACKET_4_0_2);

  // The 14 / 15 written to edx on the following paths is not read again in this
  // function - presumably an error code left over from the original routine.
  bind(L_2TAG_PACKET_3_0_2);
  movl(edx, 14);
  jmp(L_2TAG_PACKET_5_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  movl(edx, 15);

  bind(L_2TAG_PACKET_5_0_2);
  movsd(Address(rsp, 0), xmm0);
  movsd(xmm0, Address(rsp, 128));
  fld_d(Address(rsp, 0));             // result -> x87 stack
  jmp(L_2TAG_PACKET_6_0_2);

  // Special cases. eax = high dword of |x| on entry to L_2TAG_PACKET_7_0_2.
  bind(L_2TAG_PACKET_7_0_2);
  cmpl(eax, 2146435072);              // 0x7ff00000: exponent field all ones (Inf / NaN)
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_8_0_2);
  movl(eax, Address(rsp, 132));       // high dword of x, sign included
  cmpl(eax, INT_MIN);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_9_0_2);
  movsd(xmm0, Address(tmp, 1208));    // 0xffffffffUL, 0x7fefffffUL
  mulsd(xmm0, xmm0);                  // the loaded bit pattern is the largest finite double; squaring overflows
  movl(edx, 14);
  jmp(L_2TAG_PACKET_5_0_2);

  bind(L_2TAG_PACKET_9_0_2);
  movsd(xmm0, Address(tmp, 1216));    // table value not annotated in the source
  mulsd(xmm0, xmm0);
  movl(edx, 15);
  jmp(L_2TAG_PACKET_5_0_2);

  bind(L_2TAG_PACKET_8_0_2);
  movl(edx, Address(rsp, 128));       // low dword of x
  cmpl(eax, 2146435072);
  jcc(Assembler::above, L_2TAG_PACKET_10_0_2);
  cmpl(edx, 0);
  jcc(Assembler::notEqual, L_2TAG_PACKET_10_0_2);
  // exponent all ones and mantissa zero: x is an infinity, the sign decides
  movl(eax, Address(rsp, 132));
  cmpl(eax, 2146435072);
  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
  movsd(xmm0, Address(tmp, 1192));    // 0x00000000UL, 0x7ff00000UL
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_11_0_2);
  movsd(xmm0, Address(tmp, 1200));    // 0x00000000UL, 0x00000000UL
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_10_0_2);
  // non-zero mantissa with exponent all ones: return x + x
  movsd(xmm0, Address(rsp, 128));
  addsd(xmm0, xmm0);
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_0_0_2);
  // Out-of-range exponent. 1083179008 = 0x40900000 is the high dword of 1024.0:
  // |x| at or above that goes to the special cases, anything else reaching this
  // label returns x + 1.0.
  movl(eax, Address(rsp, 132));
  andl(eax, 2147483647);
  cmpl(eax, 1083179008);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
  movsd(xmm0, Address(rsp, 128));
  addsd(xmm0, Address(tmp, 1184));    // 0x00000000UL, 0x3ff00000UL
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_2_0_2);
  movsd(Address(rsp, 48), xmm0);
  fld_d(Address(rsp, 48));            // result -> x87 stack

  bind(L_2TAG_PACKET_6_0_2);
  movl(tmp, Address(rsp, 64));        // restore tmp
}
// Emits a table-driven double-precision log() stub, 64-bit variant.
//
// As emitted here: the input is taken from xmm0 (it is spilled to [rsp + 0]
// first) and the result is left in xmm0. 24 bytes of stack are reserved at the
// start and released at B1_5, which every path reaches. tmp1 is used to
// materialise a 64-bit constant, tmp2 as the base of _L_tbl.
//
// NOTE(review): besides the eax/ecx/edx parameters, the code names rax, rdx and
// rcx directly (mov64(rax, ...), movdq(xmm3, rdx), movdl(xmm4, rcx)); the
// movl(ecx, 32768) / movdl(xmm4, rcx) pair only works if the ecx parameter is
// rcx - presumably callers always pass the matching registers; confirm.
void MacroAssembler::fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp1, Register tmp2) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2;
  // L_2TAG_PACKET_12_0_2 and L_2TAG_PACKET_13_0_2 are declared but never used
  // in this function; B1_3 is bound but never jumped to.
  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp1, tmp2, eax, ecx, edx);
  jmp(start);
  address L_tbl = (address)_L_tbl;
  address log2 = (address)_log2;
  address coeff = (address)_coeff;

  bind(start);
  subq(rsp, 24);
  movsd(Address(rsp, 0), xmm0);       // keep the original input for the special cases
  mov64(rax, 0x3ff0000000000000);     // bit pattern of 1.0
  movdq(xmm2, rax);
  mov64(rdx, 0x77f0000000000000);
  movdq(xmm3, rdx);
  movl(ecx, 32768);
  movdl(xmm4, rcx);
  mov64(tmp1, 0xffffe00000000000);    // mask keeping the top 19 bits of a 64-bit value
  movdq(xmm5, tmp1);
  movdqu(xmm1, xmm0);
  pextrw(eax, xmm0, 3);               // eax = top 16 bits of x (sign + exponent + 4 mantissa bits)
  por(xmm0, xmm2);
  movl(ecx, 16352);                   // 0x3fe0, subtracted from the exponent bits below
  psrlq(xmm0, 27);
  lea(tmp2, ExternalAddress(L_tbl));
  psrld(xmm0, 2);
  rcpps(xmm0, xmm0);                  // approximate reciprocal (single precision)
  psllq(xmm1, 12);                    // shift left/right by 12 clears sign and exponent of xmm1
  pshufd(xmm6, xmm5, 228);            // 228 = 0xe4 is the identity shuffle: xmm6 = xmm5
  psrlq(xmm1, 12);
  // eax - 16 >= 32736 (unsigned) routes the input to the special-case code
  subl(eax, 16);
  cmpl(eax, 32736);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);

  // Main path; also re-entered from L_2TAG_PACKET_3_0_2 after rescaling.
  bind(L_2TAG_PACKET_1_0_2);
  paddd(xmm0, xmm4);
  por(xmm1, xmm3);
  movdl(edx, xmm0);
  psllq(xmm0, 29);
  pand(xmm5, xmm1);
  pand(xmm0, xmm6);
  subsd(xmm1, xmm5);
  mulpd(xmm5, xmm0);
  andl(eax, 32752);                   // 0x7ff0: isolate the exponent bits
  subl(eax, ecx);
  cvtsi2sdl(xmm7, eax);
  mulsd(xmm1, xmm0);
  movq(xmm6, ExternalAddress(log2));       // 0xfefa3800UL, 0x3fa62e42UL
  movdqu(xmm3, ExternalAddress(coeff));    // 0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL
  subsd(xmm5, xmm2);
  andl(edx, 16711680);                // 0x00ff0000 ...
  shrl(edx, 12);                      // ... shifted down to a byte offset into L_tbl (multiple of 16)
  movdqu(xmm0, Address(tmp2, edx));
  movdqu(xmm4, ExternalAddress(16 + coeff)); // 0x3d6fb175UL, 0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL
  addsd(xmm1, xmm5);
  movdqu(xmm2, ExternalAddress(32 + coeff)); // 0x9999999aUL, 0x3fc99999UL, 0x00000000UL, 0xbfe00000UL
  mulsd(xmm6, xmm7);
  movddup(xmm5, xmm1);
  mulsd(xmm7, ExternalAddress(8 + log2));    // 0x93c76730UL, 0x3ceef357UL
  mulsd(xmm3, xmm1);
  addsd(xmm0, xmm6);
  mulpd(xmm4, xmm5);
  mulpd(xmm5, xmm5);
  movddup(xmm6, xmm0);
  addsd(xmm0, xmm1);
  addpd(xmm4, xmm2);
  mulpd(xmm3, xmm5);
  subsd(xmm6, xmm0);
  mulsd(xmm4, xmm1);
  pshufd(xmm2, xmm0, 238);            // 238 = 0xee: copy the upper qword into the lower one
  addsd(xmm1, xmm6);
  mulsd(xmm5, xmm5);
  addsd(xmm7, xmm2);
  addpd(xmm4, xmm3);
  addsd(xmm1, xmm7);
  mulpd(xmm4, xmm5);
  addsd(xmm1, xmm4);
  pshufd(xmm5, xmm4, 238);
  addsd(xmm1, xmm5);
  addsd(xmm0, xmm1);
  jmp(B1_5);

  // Special cases: reload the original input and restore eax (top 16 bits of x)
  bind(L_2TAG_PACKET_0_0_2);
  movq(xmm0, Address(rsp, 0));
  movq(xmm1, Address(rsp, 0));
  addl(eax, 16);
  cmpl(eax, 32768);                   // sign bit set
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_2_0_2);
  cmpl(eax, 16);                      // exponent field zero: zero or denormal
  jcc(Assembler::below, L_2TAG_PACKET_3_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  addsd(xmm0, xmm0);                  // result is x + x
  jmp(B1_5);

  bind(L_2TAG_PACKET_5_0_2);
  // the first jcc consumes the flags of the cmpl(ecx, -2097152) executed right
  // before the only jump to this label
  jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
  cmpl(edx, 0);
  jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_3_0_2);
  // branch to the divide-by-zero result when all bits of 0.0 + x are clear
  xorpd(xmm1, xmm1);
  addsd(xmm1, xmm0);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);
  // Otherwise scale x by the double whose top 16 bits are 18416 (0x47f0), then
  // redo the preamble with ecx = 18416 instead of 16352 to compensate, and
  // rejoin the main path.
  xorpd(xmm1, xmm1);
  movl(eax, 18416);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movdqu(xmm1, xmm0);
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  psrlq(xmm0, 27);
  movl(ecx, 18416);
  psrld(xmm0, 2);
  rcpps(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  jmp(L_2TAG_PACKET_1_0_2);

  bind(L_2TAG_PACKET_2_0_2);
  // sign bit set: edx = low dword, ecx = high dword doubled (drops the sign bit)
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  addl(ecx, ecx);
  cmpl(ecx, -2097152);                // 0xffe00000: exponent field all ones
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_5_0_2);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);

  bind(L_2TAG_PACKET_6_0_2);
  // xmm1 gets 32752 (0x7ff0) in its top word, i.e. +Inf; 0.0 * Inf yields NaN
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 32752);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movl(Address(rsp, 16), 3);          // stored but not read back in this function
  jmp(L_2TAG_PACKET_8_0_2);

  bind(L_2TAG_PACKET_7_0_2);
  // xmm0 gets 49136 (0xbff0) in its top word, i.e. -1.0; -1.0 / 0.0 yields -Inf
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 49136);
  pinsrw(xmm0, eax, 3);
  divsd(xmm0, xmm1);
  movl(Address(rsp, 16), 2);          // stored but not read back in this function

  bind(L_2TAG_PACKET_8_0_2);
  movq(Address(rsp, 8), xmm0);

  bind(B1_3);
  movq(xmm0, Address(rsp, 8));

  bind(B1_5);
  addq(rsp, 24);
}
void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) { // xmm0, xmm1, xmm4, xmm5, xmm6 = free int wms_clamp = ((m_sel.wms + 1) >> 1) & 1; int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1; int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1; if(wms_clamp == wmt_clamp) { if(wms_clamp) { if(region) { movdqa(xmm4, xmmword[&m_env.t.min]); pmaxsw(uv0, xmm4); pmaxsw(uv1, xmm4); } else { pxor(xmm0, xmm0); pmaxsw(uv0, xmm0); pmaxsw(uv1, xmm0); } movdqa(xmm5, xmmword[&m_env.t.max]); pminsw(uv0, xmm5); pminsw(uv1, xmm5); } else { movdqa(xmm4, xmmword[&m_env.t.min]); pand(uv0, xmm4); pand(uv1, xmm4); if(region) { movdqa(xmm5, xmmword[&m_env.t.max]); por(uv0, xmm5); por(uv1, xmm5); } } } else { movdqa(xmm1, uv0); movdqa(xmm6, uv1); movdqa(xmm4, xmmword[&m_env.t.min]); movdqa(xmm5, xmmword[&m_env.t.max]); // GSVector4i clamp = t.sat_i16(m_env.t.min, m_env.t.max); pmaxsw(uv0, xmm4); pmaxsw(uv1, xmm4); pminsw(uv0, xmm5); pminsw(uv1, xmm5); // GSVector4i repeat = (t & m_env.t.min) | m_env.t.max; pand(xmm1, xmm4); pand(xmm6, xmm4); if(region) { por(xmm1, xmm5); por(xmm6, xmm5); } // clamp.blend8(repeat, m_env.t.mask); if(m_cpu.has(util::Cpu::tSSE41)) { movdqa(xmm0, xmmword[&m_env.t.mask]); pblendvb(uv0, xmm1); pblendvb(uv1, xmm6); } else { movdqa(xmm0, xmmword[&m_env.t.invmask]); movdqa(xmm4, xmm0); pand(uv0, xmm0); pandn(xmm0, xmm1); por(uv0, xmm0); pand(uv1, xmm4); pandn(xmm4, xmm6); por(uv1, xmm4); } } }
// Emits a table-driven double-precision exp() stub, 64-bit variant.
//
// As emitted here: the input is taken from xmm0 (it is spilled to [rsp + 8]
// first) and the result is left in xmm0. 24 bytes of stack are reserved at the
// start and released at B1_5, which every path reaches. tmp is used as the base
// address of _Tbl_addr. The hex comments on the loads are the constants as
// 32-bit words, low word first.
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
  // B1_3 is bound below but never jumped to.
  Label L_2TAG_PACKET_12_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp, eax, ecx, edx);
  jmp(start);
  address cv = (address)_cv;
  address Shifter = (address)_shifter;
  address mmask = (address)_mmask;
  address bias = (address)_bias;
  address Tbl_addr = (address)_Tbl_addr;
  address ALLONES = (address)_ALLONES;
  address ebias = (address)_ebias;
  address XMAX = (address)_XMAX;
  address XMIN = (address)_XMIN;
  address INF = (address)_INF;
  address ZERO = (address)_ZERO;
  address ONE_val = (address)_ONE_val;

  bind(start);
  subq(rsp, 24);
  movsd(Address(rsp, 8), xmm0);       // keep the original input for the special cases
  unpcklpd(xmm0, xmm0);               // duplicate it into both lanes
  movdqu(xmm1, ExternalAddress(cv));       // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
  movdqu(xmm6, ExternalAddress(Shifter));  // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
  movdqu(xmm2, ExternalAddress(16+cv));    // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
  movdqu(xmm3, ExternalAddress(32+cv));    // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
  // Range check on the top 16 bits of x with the sign masked off: the OR of
  // (16527 - e) and (e - 15504) has its sign bit set exactly when e is outside
  // [15504, 16527] (0x3c90..0x408f); those inputs go to the special-case path.
  pextrw(eax, xmm0, 3);
  andl(eax, 32767);
  movl(edx, 16527);
  subl(edx, eax);
  subl(eax, 15504);
  orl(edx, eax);
  cmpl(edx, INT_MIN);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
  // Main path. Adding and then subtracting Shifter leaves the rounded product
  // in xmm1, while xmm7 keeps the biased sum whose low dword is picked apart
  // below.
  mulpd(xmm1, xmm0);
  addpd(xmm1, xmm6);
  movapd(xmm7, xmm1);
  subpd(xmm1, xmm6);
  mulpd(xmm2, xmm1);
  movdqu(xmm4, ExternalAddress(64+cv));    // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
  mulpd(xmm3, xmm1);
  movdqu(xmm5, ExternalAddress(80+cv));    // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
  subpd(xmm0, xmm2);
  movdl(eax, xmm7);
  movl(ecx, eax);
  andl(ecx, 63);                      // low 6 bits: table index ...
  shll(ecx, 4);                       // ... times 16 bytes per entry
  sarl(eax, 6);                       // remaining bits, kept in eax and edx
  movl(edx, eax);
  movdqu(xmm6, ExternalAddress(mmask));    // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
  pand(xmm7, xmm6);
  movdqu(xmm6, ExternalAddress(bias));     // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
  paddq(xmm7, xmm6);
  psllq(xmm7, 46);
  subpd(xmm0, xmm3);
  lea(tmp, ExternalAddress(Tbl_addr));
  movdqu(xmm2, Address(ecx,tmp));     // table entry for this index
  mulpd(xmm4, xmm0);
  movapd(xmm6, xmm0);
  movapd(xmm1, xmm0);
  mulpd(xmm6, xmm6);
  mulpd(xmm0, xmm6);
  addpd(xmm5, xmm4);
  mulsd(xmm0, xmm6);
  mulpd(xmm6, ExternalAddress(48+cv));     // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
  addsd(xmm1, xmm2);
  unpckhpd(xmm2, xmm2);
  mulpd(xmm0, xmm5);
  addsd(xmm1, xmm0);
  por(xmm2, xmm7);
  unpckhpd(xmm0, xmm0);
  addsd(xmm0, xmm1);
  addsd(xmm0, xmm6);
  // edx + 894 above 1916 (unsigned) sends the result through the careful
  // scaling path below
  addl(edx, 894);
  cmpl(edx, 1916);
  jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
  mulsd(xmm0, xmm2);
  addsd(xmm0, xmm2);
  jmp (B1_5);

  bind(L_2TAG_PACKET_1_0_2);
  xorpd(xmm3, xmm3);
  movdqu(xmm4, ExternalAddress(ALLONES));  // 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
  movl(edx, -1022);
  subl(edx, eax);                     // edx = -1022 - eax, used as a bit count
  movdl(xmm5, edx);
  psllq(xmm4, xmm5);                  // all-ones mask with the low edx bits cleared
  movl(ecx, eax);
  sarl(eax, 1);
  pinsrw(xmm3, eax, 3);
  movdqu(xmm6, ExternalAddress(ebias));    // 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x3ff00000UL
  psllq(xmm3, 4);
  psubd(xmm2, xmm3);
  mulsd(xmm0, xmm2);
  cmpl(edx, 52);
  jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
  pand(xmm4, xmm2);
  paddd(xmm3, xmm6);
  subsd(xmm2, xmm4);
  addsd(xmm0, xmm2);
  cmpl(ecx, 1023);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32768);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
  movapd(xmm6, xmm0);
  addsd(xmm0, xmm4);
  mulsd(xmm0, xmm3);
  // exponent field of the result (32752 = 0x7ff0) all zero -> fix-up path
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  cmpl(ecx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_5_0_2);
  jmp(B1_5);

  // The 14 / 15 written to [rsp + 0] on the paths below is not read back in this
  // function - presumably an error code left over from the original routine.
  bind(L_2TAG_PACKET_5_0_2);
  mulsd(xmm6, xmm3);
  mulsd(xmm4, xmm3);
  movdqu(xmm0, xmm6);
  pxor(xmm6, xmm4);
  psrad(xmm6, 31);
  pshufd(xmm6, xmm6, 85);
  psllq(xmm0, 1);
  psrlq(xmm0, 1);
  pxor(xmm0, xmm6);
  psrlq(xmm6, 63);
  paddq(xmm0, xmm6);
  paddq(xmm0, xmm4);
  movl(Address(rsp,0), 15);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  addsd(xmm0, xmm4);
  mulsd(xmm0, xmm3);
  jmp(B1_5);

  bind(L_2TAG_PACKET_3_0_2);
  addsd(xmm0, xmm4);
  mulsd(xmm0, xmm3);
  // exponent field all ones after scaling: the result overflowed
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  cmpl(ecx, 32752);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
  jmp(B1_5);

  bind(L_2TAG_PACKET_2_0_2);
  paddd(xmm3, xmm6);
  addpd(xmm0, xmm2);
  mulsd(xmm0, xmm3);
  movl(Address(rsp,0), 15);
  jmp(L_2TAG_PACKET_6_0_2);

  // Special cases. eax = high dword of |x| on entry to L_2TAG_PACKET_8_0_2.
  bind(L_2TAG_PACKET_8_0_2);
  cmpl(eax, 2146435072);              // 0x7ff00000: exponent field all ones (Inf / NaN)
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);
  movl(eax, Address(rsp,12));         // high dword of x, sign included
  cmpl(eax, INT_MIN);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_10_0_2);
  movsd(xmm0, ExternalAddress(XMAX));      // 0xffffffffUL, 0x7fefffffUL
  mulsd(xmm0, xmm0);                  // XMAX is the largest finite double; squaring overflows

  bind(L_2TAG_PACKET_7_0_2);
  movl(Address(rsp,0), 14);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_10_0_2);
  movsd(xmm0, ExternalAddress(XMIN));      // 0x00000000UL, 0x00100000UL
  mulsd(xmm0, xmm0);                  // XMIN is the smallest normal double; squaring underflows
  movl(Address(rsp,0), 15);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_9_0_2);
  movl(edx, Address(rsp,8));          // low dword of x
  cmpl(eax, 2146435072);
  jcc(Assembler::above, L_2TAG_PACKET_11_0_2);
  cmpl(edx, 0);
  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
  // exponent all ones and mantissa zero: x is an infinity, the sign decides
  movl(eax, Address(rsp,12));
  cmpl(eax, 2146435072);
  jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_2);
  movsd(xmm0, ExternalAddress(INF));       // 0x00000000UL, 0x7ff00000UL
  jmp(B1_5);

  bind(L_2TAG_PACKET_12_0_2);
  movsd(xmm0, ExternalAddress(ZERO));      // 0x00000000UL, 0x00000000UL
  jmp(B1_5);

  bind(L_2TAG_PACKET_11_0_2);
  // non-zero mantissa with exponent all ones: return x + x
  movsd(xmm0, Address(rsp, 8));
  addsd(xmm0, xmm0);
  jmp(B1_5);

  bind(L_2TAG_PACKET_0_0_2);
  // Out-of-range exponent. 1083179008 = 0x40900000 is the high dword of 1024.0:
  // |x| at or above that goes to the special cases, anything else reaching this
  // label returns x + 1.0.
  movl(eax, Address(rsp, 12));
  andl(eax, 2147483647);
  cmpl(eax, 1083179008);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_8_0_2);
  movsd(Address(rsp, 8), xmm0);
  addsd(xmm0, ExternalAddress(ONE_val));   // 0x00000000UL, 0x3ff00000UL
  jmp(B1_5);

  bind(L_2TAG_PACKET_6_0_2);
  movq(Address(rsp, 16), xmm0);

  bind(B1_3);
  movq(xmm0, Address(rsp, 16));

  bind(B1_5);
  addq(rsp, 24);
}
/**
 * Splits an NV12 picture (Y plane + one plane of interleaved Cb/Cr bytes)
 * into three separate planes (Y, Cb, Cr).
 *
 * Template parameters:
 *   _mm          SIMD traits type; supplies the vector type (_mm::__m), its
 *                size in bytes (_mm::size) and the store/cleanup helpers.
 *   src_aligned  non-zero: sources are read with aligned loads (movVqa).
 *   dst_aligned  non-zero: destinations are written with _mm::movntVq
 *                (presumably a non-temporal aligned store - confirm in _mm).
 *
 * Parameters:
 *   srcY, srcCbCr            source luma plane and interleaved chroma plane.
 *   dstY, dstCb, dstCr       destination planes.
 *   dx, dy                   picture width in bytes of luma and height in rows.
 *   stride_*                 distance in bytes between consecutive rows.
 *
 * Rows are processed in blocks of 2 * _mm::size bytes. When dx is not a
 * multiple of the block size, the last block of every row is redone by a
 * recursive call using unaligned accesses, positioned so that it ends exactly
 * at dx (it overlaps bytes that were already converted).
 *
 * Nothing is converted when dx is smaller than one block.
 *
 * NOTE(review): the tail handling advances srcCbCr by (dx - block) and the
 * chroma destinations by half of that; this keeps Cb/Cr in phase only when dx
 * is even - confirm callers never pass an odd width.
 * NOTE(review): only dy / 2 chroma rows are converted, so an odd dy drops the
 * last chroma row - confirm this is intended.
 */
template <class _mm, int src_aligned, int dst_aligned>
void TffdshowConverters2::convert_NV12toYV12(
    const uint8_t* srcY, const uint8_t* srcCbCr,
    uint8_t* dstY, uint8_t* dstCb, uint8_t* dstCr,
    int dx, int dy,
    stride_t stride_Y, stride_t stride_CbCr,
    stride_t stride_dstY, stride_t stride_dstCbCr)
{
    // Do all block arithmetic in int: mixing the signed width with an
    // unsigned _mm::size would turn a negative dx into a huge block count.
    const int vecBytes = static_cast<int>(_mm::size);
    const int blockBytes = vecBytes * 2;

    const int xCount = dx / blockBytes;
    if (xCount <= 0) {
        return;
    }

    // _mm::__m is a dependent type name, so it needs 'typename'.
    typename _mm::__m _mm0, _mm1, _mm2, _mm3, _mm_00ff;

    // Build 0x00ff in every 16-bit lane (mask selecting the low byte of each word).
    pxor(_mm_00ff, _mm_00ff);
    pcmpeqb(_mm_00ff, _mm_00ff);
    psllw(_mm_00ff, 8);
    psrlw(_mm_00ff, 8);

    // Luma: straight copy, two vectors per iteration.
    for (int y = 0; y < dy; y++) {
        const uint8_t *src = srcY + y * stride_Y;
        uint8_t *dst = dstY + y * stride_dstY;
        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, src);
                movVqa(_mm1, src + vecBytes);
            } else {
                movVqu(_mm0, src);
                movVqu(_mm1, src + vecBytes);
            }
            src += blockBytes;

            if (dst_aligned) {
                _mm::movntVq(dst, _mm0);
                _mm::movntVq(dst + vecBytes, _mm1);
            } else {
                movVqu(dst, _mm0);
                movVqu(dst + vecBytes, _mm1);
            }
            dst += blockBytes;
        } while (--x);
    }

    // Chroma: de-interleave. Each iteration consumes two vectors of CbCr
    // pairs and produces one vector of Cb and one vector of Cr.
    int dyCbCr = dy / 2;
    for (int y = 0; y < dyCbCr; y++) {
        const uint8_t *srcCbCrLn = srcCbCr + y * stride_CbCr;
        uint8_t *dstCbLn = dstCb + y * stride_dstCbCr;
        uint8_t *dstCrLn = dstCr + y * stride_dstCbCr;
        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, srcCbCrLn);
                movVqa(_mm1, srcCbCrLn + vecBytes);
            } else {
                movVqu(_mm0, srcCbCrLn);
                movVqu(_mm1, srcCbCrLn + vecBytes);
            }
            _mm2 = _mm0;
            _mm3 = _mm1;
            srcCbCrLn += blockBytes;

            // Low byte of every word (even source bytes) -> Cb.
            pand(_mm0, _mm_00ff);
            pand(_mm1, _mm_00ff);
            // High byte of every word (odd source bytes) -> Cr.
            psrlw(_mm2, 8);
            psrlw(_mm3, 8);
            // Every word is now <= 0xff, so the saturating pack is a plain narrowing.
            packuswb(_mm0, _mm1);
            packuswb(_mm2, _mm3);

            if (dst_aligned) {
                _mm::movntVq(dstCbLn, _mm0);
                _mm::movntVq(dstCrLn, _mm2);
            } else {
                movVqu(dstCbLn, _mm0);
                movVqu(dstCrLn, _mm2);
            }
            dstCbLn += vecBytes;
            dstCrLn += vecBytes;
        } while (--x);
    }

    // Width not a multiple of the block size: redo the right-most block of
    // every row with unaligned accesses.
    if (xCount * blockBytes < dx && dx > blockBytes) {
        int dxDone = dx - blockBytes;
        srcY += dxDone;
        srcCbCr += dxDone;
        dstY += dxDone;
        dstCb += dxDone / 2;
        dstCr += dxDone / 2;
        convert_NV12toYV12<_mm, 0, 0>(srcY, srcCbCr, dstY, dstCb, dstCr,
                                      blockBytes, dy,
                                      stride_Y, stride_CbCr, stride_dstY, stride_dstCbCr);
    }

    // Presumably clears MMX state (emms) for the MMX variant - see _mm::empty.
    _mm::empty();
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) { if(!m_sel.zb) { return; } // int za = fza_base.y + fza_offset->y; mov(ebp, dword[esi + 4]); add(ebp, dword[edi + 4]); // GSVector4i zs = zi; if(!m_sel.sprite) { if(m_sel.zoverflow) { // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); static float half = 0.5f; movss(temp1, dword[&half]); shufps(temp1, temp1, _MM_SHUFFLE(0, 0, 0, 0)); mulps(temp1, xmm0); cvttps2dq(temp1, temp1); pslld(temp1, 1); cvttps2dq(xmm0, xmm0); pcmpeqd(temp2, temp2); psrld(temp2, 31); pand(xmm0, temp2); por(xmm0, temp1); } else { // zs = GSVector4i(z); cvttps2dq(xmm0, xmm0); } if(m_sel.zwrite) { movdqa(xmmword[&m_env.temp.zs], xmm0); } } if(m_sel.ztest) { ReadPixel(xmm1, ebp); if(m_sel.zwrite && m_sel.zpsm < 2) { movdqa(xmmword[&m_env.temp.zd], xmm1); } // zd &= 0xffffffff >> m_sel.zpsm * 8; if(m_sel.zpsm) { pslld(xmm1, m_sel.zpsm * 8); psrld(xmm1, m_sel.zpsm * 8); } if(m_sel.zoverflow || m_sel.zpsm == 0) { // GSVector4i o = GSVector4i::x80000000(); pcmpeqd(xmm4, xmm4); pslld(xmm4, 31); // GSVector4i zso = zs - o; psubd(xmm0, xmm4); // GSVector4i zdo = zd - o; psubd(xmm1, xmm4); } switch(m_sel.ztst) { case ZTST_GEQUAL: // test |= zso < zdo; // ~(zso >= zdo) pcmpgtd(xmm1, xmm0); por(xmm7, xmm1); break; case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL // test |= zso <= zdo; // ~(zso > zdo) pcmpgtd(xmm0, xmm1); pcmpeqd(xmm4, xmm4); pxor(xmm0, xmm4); por(xmm7, xmm0); break; } alltrue(); } }
// Emits a bitwise select: b = (b & mask) | (a & ~mask).
// The result is left in b; the mask register is overwritten with (a & ~mask),
// so callers must not rely on its contents afterwards. a is only read.
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
{
	pand(b, mask);   // b    = b & mask
	pandn(mask, a);  // mask = ~mask & a
	por(b, mask);    // b    = (b & mask) | (a & ~mask)
}
// Emits the alpha-blending stage of the scanline routine.
//
// Going by the inline pseudo-code, each colour half (rb and ga) is computed as
//     out = (c[aba] - c[abb]) * a + c[abd]
// where the operands are chosen by m_sel.aba / abb / abc / abd, and the whole
// term collapses to c[abd] when aba == abb.
//
// Register usage visible in this function:
//   in : xmm5 / xmm6 = source rb / ga, xmm2 = destination pixel (fd)
//   out: xmm5 / xmm6 = blended rb / ga
//   xmm0, xmm1, xmm4, xmm7 are used as scratch; xmm2 and xmm3 are left untouched.
//
// Nothing is emitted when the frame is not written (m_sel.fwrite == 0) or when
// both m_sel.abe and m_sel.aa1 are 0.
void GSDrawScanlineCodeGenerator::AlphaBlend()
{
	if(!m_sel.fwrite)
	{
		return;
	}

	if(m_sel.abe == 0 && m_sel.aa1 == 0)
	{
		return;
	}

	// The destination colour is only unpacked when one of the selected operands is "1"
	// (&& binds tighter than ||: the abd == 1 test stands on its own)

	if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
	{
		switch(m_sel.fpsm)
		{
		case 0:
		case 1:

			// c[2] = fd & mask;
			// c[3] = (fd >> 8) & mask;

			movdqa(xmm0, xmm2);
			movdqa(xmm1, xmm2);

			psllw(xmm0, 8);
			psrlw(xmm0, 8);
			psrlw(xmm1, 8);

			break;

		case 2:

			// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
			// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);

			movdqa(xmm0, xmm2);
			movdqa(xmm1, xmm2);
			movdqa(xmm4, xmm2);

			// xmm7 is reused as the mask; each shift below turns it into the next constant

			pcmpeqd(xmm7, xmm7);
			psrld(xmm7, 27); // 0x0000001f
			pand(xmm0, xmm7);
			pslld(xmm0, 3);

			pslld(xmm7, 10); // 0x00007c00
			pand(xmm4, xmm7);
			pslld(xmm4, 9);

			por(xmm0, xmm4);

			movdqa(xmm4, xmm1);

			psrld(xmm7, 5); // 0x000003e0
			pand(xmm1, xmm7);
			psrld(xmm1, 2);

			psllw(xmm7, 10); // 0x00008000
			pand(xmm4, xmm7);
			pslld(xmm4, 8);

			por(xmm1, xmm4);

			break;
		}
	}

	// xmm5, xmm6 = src rb, ga
	// xmm0, xmm1 = dst rb, ga
	// xmm2, xmm3 = used
	// xmm4, xmm7 = free

	// Keep a copy of the source rb when it is needed again after xmm5 is overwritten

	if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
	{
		movdqa(xmm4, xmm5);
	}

	if(m_sel.aba != m_sel.abb)
	{
		// rb = c[aba * 2 + 0];

		switch(m_sel.aba)
		{
		case 0: break;
		case 1: movdqa(xmm5, xmm0); break;
		case 2: pxor(xmm5, xmm5); break;
		}

		// rb = rb.sub16(c[abb * 2 + 0]);

		switch(m_sel.abb)
		{
		case 0: psubw(xmm5, xmm4); break;
		case 1: psubw(xmm5, xmm0); break;
		case 2: break;
		}

		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
		{
			// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_env.afix;

			switch(m_sel.abc)
			{
			case 0:
			case 1:
				movdqa(xmm7, m_sel.abc ? xmm1 : xmm6);
				pshuflw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
				pshufhw(xmm7, xmm7, _MM_SHUFFLE(3, 3, 1, 1));
				psllw(xmm7, 7);
				break;
			case 2:
				movdqa(xmm7, xmmword[&m_env.afix]);
				break;
			}

			// rb = rb.modulate16<1>(a);

			modulate16<1>(xmm5, xmm7);
		}

		// rb = rb.add16(c[abd * 2 + 0]);

		switch(m_sel.abd)
		{
		case 0: paddw(xmm5, xmm4); break;
		case 1: paddw(xmm5, xmm0); break;
		case 2: break;
		}
	}
	else
	{
		// rb = c[abd * 2 + 0];

		switch(m_sel.abd)
		{
		case 0: break;
		case 1: movdqa(xmm5, xmm0); break;
		case 2: pxor(xmm5, xmm5); break;
		}
	}

	if(m_sel.pabe)
	{
		// mask = (c[1] << 8).sra32(31);

		movdqa(xmm0, xmm6);
		pslld(xmm0, 8);
		psrad(xmm0, 31);

		// rb = c[0].blend8(rb, mask);

		blend8r(xmm5, xmm4);
	}

	// xmm6 = src ga
	// xmm1 = dst ga
	// xmm5 = rb
	// xmm7 = a
	// xmm2, xmm3 = used
	// xmm0, xmm4 = free

	movdqa(xmm4, xmm6);

	if(m_sel.aba != m_sel.abb)
	{
		// ga = c[aba * 2 + 1];

		switch(m_sel.aba)
		{
		case 0: break;
		case 1: movdqa(xmm6, xmm1); break;
		case 2: pxor(xmm6, xmm6); break;
		}

		// ga = ga.sub16(c[abb * 2 + 1]);

		switch(m_sel.abb)
		{
		case 0: psubw(xmm6, xmm4); break;
		case 1: psubw(xmm6, xmm1); break;
		case 2: break;
		}

		if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
		{
			// ga = ga.modulate16<1>(a);

			modulate16<1>(xmm6, xmm7);
		}

		// ga = ga.add16(c[abd * 2 + 1]);

		switch(m_sel.abd)
		{
		case 0: paddw(xmm6, xmm4); break;
		case 1: paddw(xmm6, xmm1); break;
		case 2: break;
		}
	}
	else
	{
		// ga = c[abd * 2 + 1];

		switch(m_sel.abd)
		{
		case 0: break;
		case 1: movdqa(xmm6, xmm1); break;
		case 2: pxor(xmm6, xmm6); break;
		}
	}

	// xmm4 = src ga
	// xmm5 = rb
	// xmm6 = ga
	// xmm2, xmm3 = used
	// xmm0, xmm1, xmm7 = free

	if(m_sel.pabe)
	{
		if(!m_cpu.has(util::Cpu::tSSE41))
		{
			// doh, previous blend8r overwrote xmm0 (sse41 uses pblendvb)

			movdqa(xmm0, xmm4);
			pslld(xmm0, 8);
			psrad(xmm0, 31);
		}

		psrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)

		// ga = c[1].blend8(ga, mask).mix16(c[1]);

		blend8r(xmm6, xmm4);
	}
	else
	{
		if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
		{
			mix16(xmm6, xmm4, xmm7);
		}
	}
}
// Emits the final colour write of the scanline routine.
//
// Steps generated (each one only when the corresponding m_sel flag asks for it):
//   1. keep only the low 8 bits of every 16-bit colour lane (colclamp == 0)
//   2. add the dither offsets from m_env.dimx (16-bit format with dthe set)
//   3. interleave rb (xmm5) and ga (xmm6) and pack them to bytes -> fs in xmm5
//   4. force bit 31 of every pixel (fba, not for fpsm 1)
//   5. reduce to the 16-bit pixel layout (fpsm == 2)
//   6. merge with the destination pixel under the frame mask (rfb)
//   7. store through WritePixel
//
// params: stack offset used to locate a value at [esp + params + 4]; only its
//         low two bits are used, to pick a row of m_env.dimx
//         (presumably the y coordinate - confirm against the caller).
//
// Registers read: xmm5 / xmm6 = rb / ga, xmm2 = fd, xmm3 = fm (per the blend
// comment), ebx and dl are forwarded to WritePixel. eax, xmm4, xmm6 and xmm7
// are clobbered on some paths.
void GSDrawScanlineCodeGenerator::WriteFrame(int params)
{
	const int _top = params + 4;

	if(!m_sel.fwrite)
	{
		return;
	}

	if(m_sel.colclamp == 0)
	{
		// c[0] &= 0x000000ff;
		// c[1] &= 0x000000ff;

		pcmpeqd(xmm7, xmm7);
		psrlw(xmm7, 8); // 0x00ff in every 16-bit lane
		pand(xmm5, xmm7);
		pand(xmm6, xmm7);
	}

	if(m_sel.fpsm == 2 && m_sel.dthe)
	{
		// eax = (value & 3) * 32: byte offset of one 32-byte row holding dimx[0] and dimx[1]

		mov(eax, dword[esp + _top]);
		and(eax, 3);
		shl(eax, 5);

		paddw(xmm5, xmmword[eax + (size_t)&m_env.dimx[0]]);
		paddw(xmm6, xmmword[eax + (size_t)&m_env.dimx[1]]);
	}

	// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));

	movdqa(xmm7, xmm5);
	punpcklwd(xmm5, xmm6);
	punpckhwd(xmm7, xmm6);
	packuswb(xmm5, xmm7);

	if(m_sel.fba && m_sel.fpsm != 1)
	{
		// fs |= 0x80000000;

		pcmpeqd(xmm7, xmm7);
		pslld(xmm7, 31);
		por(xmm5, xmm7);
	}

	if(m_sel.fpsm == 2)
	{
		// GSVector4i rb = fs & 0x00f800f8;
		// GSVector4i ga = fs & 0x8000f800;

		mov(eax, 0x00f800f8);
		movd(xmm6, eax);
		pshufd(xmm6, xmm6, _MM_SHUFFLE(0, 0, 0, 0));

		mov(eax, 0x8000f800);
		movd(xmm7, eax);
		pshufd(xmm7, xmm7, _MM_SHUFFLE(0, 0, 0, 0));

		movdqa(xmm4, xmm5);
		pand(xmm4, xmm6); // xmm4 = rb
		pand(xmm5, xmm7); // xmm5 = ga

		// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);

		movdqa(xmm6, xmm4);
		movdqa(xmm7, xmm5);

		psrld(xmm4, 3);
		psrld(xmm6, 9);
		psrld(xmm5, 6);
		psrld(xmm7, 16);

		por(xmm5, xmm4);
		por(xmm7, xmm6);
		por(xmm5, xmm7);
	}

	if(m_sel.rfb)
	{
		// fs = fs.blend(fd, fm);

		blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
	}

	// "fast" is passed on to WritePixel; it is set only when the destination was read
	// back (rfb) and the format is not the 16-bit one

	bool fast = m_sel.rfb && m_sel.fpsm < 2;

	WritePixel(xmm5, xmm0, ebx, dl, fast, m_sel.fpsm);
}
int main() { int rval; mmx_t ma; mmx_t mb; movq_r2r(mm0, mm1); rval = mmx_ok(); /* Announce return value of mmx_ok() */ // printf("Value returned from init was %x.", rval); // printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not"); // fflush(stdout); fflush(stderr); // if(rval) { /* PADD *****************************************************/ ma.q = 0x1111111180000000LL; mb.q = 0x7fffffff00000001LL; paddd(ma, mb); fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddd: mb.q is 9111111080000001\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddw(ma, mb); fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddw: mb.q is 8001800000000002\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddw(ma, mb); fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddw: mb.q is 8001800000000001\n"); fflush(stdout); fflush(stderr); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddb(ma, mb); fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddb: mb.q is 8180000281800002\n"); fflush(stdout); fflush(stderr); /* PADDS ****************************************************/ ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddsw(ma, mb); fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n"); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddsw(ma, mb); fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n"); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddsb(ma, mb); fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n"); fflush(stdout); fflush(stderr); /* PADDUS ***************************************************/ ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddusw(ma, mb); fprintf(stdout, "paddusw: 
mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddusw(ma, mb); fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddusb(ma, mb); fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n"); fflush(stdout); fflush(stderr); /* PSUB *****************************************************/ ma.q = 0x7fffffff00000001LL; mb.q = 0x1111111180000000LL; psubd(ma, mb); fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubd: mb.q is 911111127fffffff\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubw(ma, mb); fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubw: mb.q is 8001800200020000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubw(ma, mb); fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; psubb(ma, mb); fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n"); fflush(stdout); fflush(stderr); /* PSUBS ****************************************************/ ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubsw(ma, mb); fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubsw(ma, mb); fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; 
psubsb(ma, mb); fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n"); fflush(stdout); fflush(stderr); /* PSUBUS ***************************************************/ ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubusw(ma, mb); fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubusw(ma, mb); fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; psubusb(ma, mb); fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n"); fflush(stdout); fflush(stderr); /* PMUL *****************************************************/ ma.q = 0x8000ffff00ff0000LL; mb.q = 0x0200ffff00ffffffLL; pmulhw(ma, mb); fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0200ffff00ffffffLL; pmullw(ma, mb); fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n"); fflush(stdout); fflush(stderr); /* PMADD ****************************************************/ ma.q = 0x8000345680007f34LL; mb.q = 0x93234a27ffff1707LL; pmaddwd(ma, mb); fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n"); fflush(stdout); fflush(stderr); /* PCMPEQ ***************************************************/ ma.q = 0x800034568f237f34LL; mb.q = 0x93009a568f237f34LL; pcmpeqd(ma, mb); fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x93009a568f237f34LL; pcmpeqw(ma, mb); fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q); fprintf(stderr, 
"pcmpeqw: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x93009a568f237f34LL; pcmpeqb(ma, mb); fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n"); fflush(stdout); fflush(stderr); /* PCMPGT ***************************************************/ ma.q = 0x666688884477aaffLL; mb.q = 0x1234567890abcdefLL; pcmpgtd(ma, mb); fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x1234567890abcdefLL; pcmpgtw(ma, mb); fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x1234567890abcdefLL; pcmpgtb(ma, mb); fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n"); fflush(stdout); fflush(stderr); /* PACKSS ***************************************************/ ma.q = 0x00012222000abbbbLL; mb.q = 0x0000888800003333LL; packssdw(ma, mb); fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q); fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n"); fflush(stdout); fflush(stderr); ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; packsswb(ma, mb); fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q); fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n"); fflush(stdout); fflush(stderr); /* PACKUS ***************************************************/ ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; packuswb(ma, mb); fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q); fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n"); fflush(stdout); fflush(stderr); /* PUNPCKH **************************************************/ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; punpckhdq(ma, mb); fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n"); fflush(stdout); fflush(stderr); mb.q = 
0x0102030405060708LL; punpckhwd(ma, mb); fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpckhbw(ma, mb); fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n"); fflush(stdout); fflush(stderr); /* PUNPCKL **************************************************/ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; punpckldq(ma, mb); fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpcklwd(ma, mb); fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpcklbw(ma, mb); fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n"); fflush(stdout); fflush(stderr); /* PAND, PANDN, POR, PXOR ***********************************/ ma.q = 0x5555555555555555LL; mb.q = 0x3333333333333333LL; pand(ma, mb); fprintf(stdout, "pand: mb.q is %016llx\n", mb.q); fprintf(stderr, "pand: mb.q is 1111111111111111\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; pandn(ma, mb); fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q); fprintf(stderr, "pandn: mb.q is 4444444444444444\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; por(ma, mb); fprintf(stdout, "por: mb.q is %016llx\n", mb.q); fprintf(stderr, "por: mb.q is 7777777777777777\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; pxor(ma, mb); fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q); fprintf(stderr, "pxor: mb.q is 6666666666666666\n"); fflush(stdout); fflush(stderr); /* PSLL *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psllq(ma, mb); 
fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q); fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; pslld(ma, mb); fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q); fprintf(stderr, "pslld: mb.q is 67000000ef000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psllw(ma, mb); fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psllw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); /* PSRL *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psrlq(ma, mb); fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrlq: mb.q is 0000000123456789\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psrld(ma, mb); fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrld: mb.q is 0000000100000089\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psrlw(ma, mb); fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrlw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); /* PSRA *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psrad(ma, mb); fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psraw(ma, mb); fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); /* Exit MXX *************************************************/ emms(); } /* Clean-up and exit nicely */ exit(0); }
// Emits the 32-bit x86 instruction sequence for a table-driven log(x)
// (presumably the natural logarithm, judging by the ln2-like table constants -
// confirm against _static_const_table_log).
//
// Contract visible in this function:
//  - The double argument is read from Address(rsp, 112) after 104 bytes of
//    scratch stack have been reserved, i.e. from [rsp + 8] at entry of the
//    emitted code.
//  - The result is left on the x87 stack (fld_d).
//  - tmp is saved to [rsp + 40] on entry and restored on exit; in between it
//    holds the address of the constant table.
//  - xmm0-xmm7, eax, ecx and edx are clobbered.
//  - NOTE(review): the 104 bytes subtracted from rsp are not added back here;
//    presumably the caller of fast_log emits that adjustment - confirm.
//
// The parameter names mirror the registers the algorithm was written for.
// tmp, eax, ecx and edx must all be different registers (asserted below).
void MacroAssembler::fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2;
  Label L_2TAG_PACKET_10_0_2, start;

  assert_different_registers(tmp, eax, ecx, edx);
  // Jump to the label bound right below; nothing is emitted in between
  // (the table address is only a C++ local).
  jmp(start);
  address static_const_table = (address)_static_const_table_log;

  bind(start);
  subl(rsp, 104);
  movl(Address(rsp, 40), tmp);                 // save tmp, restored at the exit label
  lea(tmp, ExternalAddress(static_const_table));
  xorpd(xmm2, xmm2);
  movl(eax, 16368);                            // 0x3ff0: high word of 1.0
  pinsrw(xmm2, eax, 3);
  xorpd(xmm3, xmm3);
  movl(edx, 30704);                            // 0x77f0
  pinsrw(xmm3, edx, 3);
  movsd(xmm0, Address(rsp, 112));              // load the argument x
  movapd(xmm1, xmm0);
  movl(ecx, 32768);
  movdl(xmm4, ecx);
  movsd(xmm5, Address(tmp, 2128));    // 0x00000000UL, 0xffffe000UL
  pextrw(eax, xmm0, 3);                        // eax = sign/exponent word of x
  por(xmm0, xmm2);
  psllq(xmm0, 5);
  movl(ecx, 16352);
  psrlq(xmm0, 34);
  rcpss(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  // Unsigned range check on the exponent word: zero/denormal, Inf/NaN and
  // negative inputs all leave the main path here.
  subl(eax, 16);
  cmpl(eax, 32736);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);

  // Main path (also re-entered after scaling a denormal input).
  bind(L_2TAG_PACKET_1_0_2);
  paddd(xmm0, xmm4);
  por(xmm1, xmm3);
  movdl(edx, xmm0);
  psllq(xmm0, 29);
  pand(xmm5, xmm1);
  pand(xmm0, xmm6);
  subsd(xmm1, xmm5);
  mulpd(xmm5, xmm0);
  andl(eax, 32752);                            // 0x7ff0: isolate the exponent field
  subl(eax, ecx);                              // remove the bias held in ecx
  cvtsi2sdl(xmm7, eax);
  mulsd(xmm1, xmm0);
  movsd(xmm6, Address(tmp, 2064));    // 0xfefa3800UL, 0x3fa62e42UL
  movdqu(xmm3, Address(tmp, 2080));    // 0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL
  subsd(xmm5, xmm2);
  andl(edx, 16711680);
  shrl(edx, 12);                               // edx = byte offset of the table entry
  movdqu(xmm0, Address(tmp, edx));
  movdqu(xmm4, Address(tmp, 2096));    // 0x3d6fb175UL, 0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL
  addsd(xmm1, xmm5);
  movdqu(xmm2, Address(tmp, 2112));    // 0x9999999aUL, 0x3fc99999UL, 0x00000000UL, 0xbfe00000UL
  mulsd(xmm6, xmm7);
  pshufd(xmm5, xmm1, 68);
  mulsd(xmm7, Address(tmp, 2072));    // 0x93c76730UL, 0x3ceef357UL, 0x92492492UL, 0x3fc24924UL
  mulsd(xmm3, xmm1);
  addsd(xmm0, xmm6);
  mulpd(xmm4, xmm5);
  mulpd(xmm5, xmm5);
  pshufd(xmm6, xmm0, 228);
  addsd(xmm0, xmm1);
  addpd(xmm4, xmm2);
  mulpd(xmm3, xmm5);
  subsd(xmm6, xmm0);
  mulsd(xmm4, xmm1);
  pshufd(xmm2, xmm0, 238);
  addsd(xmm1, xmm6);
  mulsd(xmm5, xmm5);
  addsd(xmm7, xmm2);
  addpd(xmm4, xmm3);
  addsd(xmm1, xmm7);
  mulpd(xmm4, xmm5);
  addsd(xmm1, xmm4);
  pshufd(xmm5, xmm4, 238);
  addsd(xmm1, xmm5);
  addsd(xmm0, xmm1);
  jmp(L_2TAG_PACKET_2_0_2);

  // Special inputs: reload x and classify it by its sign/exponent word.
  bind(L_2TAG_PACKET_0_0_2);
  movsd(xmm0, Address(rsp, 112));
  movdqu(xmm1, xmm0);
  addl(eax, 16);                               // undo the bias applied before the range check
  cmpl(eax, 32768);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_3_0_2);   // sign bit set
  cmpl(eax, 16);
  jcc(Assembler::below, L_2TAG_PACKET_4_0_2);        // exponent field is zero

  // Remaining case: exponent field all ones; the result is x + x.
  bind(L_2TAG_PACKET_5_0_2);
  addsd(xmm0, xmm0);
  jmp(L_2TAG_PACKET_2_0_2);

  // Reached from L_2TAG_PACKET_3_0_2 with the flags of its cmpl still live.
  bind(L_2TAG_PACKET_6_0_2);
  jcc(Assembler::above, L_2TAG_PACKET_5_0_2);
  cmpl(edx, 0);
  jcc(Assembler::above, L_2TAG_PACKET_5_0_2);
  jmp(L_2TAG_PACKET_7_0_2);

  // Sign bit set: edx = low 32 bits of x, ecx = high 32 bits with the sign shifted out.
  bind(L_2TAG_PACKET_3_0_2);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  addl(ecx, ecx);
  cmpl(ecx, -2097152);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_6_0_2);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_8_0_2);        // all remaining bits zero

  // Result = 0.0 * Inf (0x7ff0 is the high word of +Inf), which is NaN.
  bind(L_2TAG_PACKET_7_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 32752);
  pinsrw(xmm1, eax, 3);
  movl(edx, 3);                                // not read again in this function (presumably an error code)
  mulsd(xmm0, xmm1);

  // Common exit of the error paths: result goes to the x87 stack, xmm0 gets x back.
  bind(L_2TAG_PACKET_9_0_2);
  movsd(Address(rsp, 0), xmm0);
  movsd(xmm0, Address(rsp, 112));
  fld_d(Address(rsp, 0));
  jmp(L_2TAG_PACKET_10_0_2);

  // Result = -1.0 / 0.0 (0xbff0 is the high word of -1.0), which is -Inf.
  bind(L_2TAG_PACKET_8_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 49136);
  pinsrw(xmm0, eax, 3);
  divsd(xmm0, xmm1);
  movl(edx, 2);                                // not read again in this function (presumably an error code)
  jmp(L_2TAG_PACKET_9_0_2);

  // Exponent field zero: an all-zero x goes to the -Inf path, anything else is
  // multiplied by the constant with high word 0x47f0 and fed back into the
  // main path with ecx adjusted to match.
  bind(L_2TAG_PACKET_4_0_2);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_8_0_2);
  xorpd(xmm1, xmm1);
  movl(eax, 18416);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movapd(xmm1, xmm0);
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  psllq(xmm0, 5);
  movl(ecx, 18416);
  psrlq(xmm0, 34);
  rcpss(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  jmp(L_2TAG_PACKET_1_0_2);

  // Normal exit: move the result from xmm0 to the x87 stack through memory.
  bind(L_2TAG_PACKET_2_0_2);
  movsd(Address(rsp, 24), xmm0);
  fld_d(Address(rsp, 24));

  bind(L_2TAG_PACKET_10_0_2);
  movl(tmp, Address(rsp, 40));                 // restore tmp
}