/// Emits code that clamps every 16-bit lane of `a` to the range [0, 255].
///
/// The clamp is done by packing the words down to bytes with unsigned
/// saturation and then widening the low eight bytes back to words with
/// zero extension.
///
/// @param a     register holding eight 16-bit lanes; overwritten with the
///              clamped result.
/// @param temp  scratch register. It is only written on the plain SSE2 path
///              (where it is zeroed); the SSE4.1 and AVX paths leave it alone.
void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
{
#if _M_SSE >= 0x500

    // AVX encoding: saturating pack, then zero-extend bytes to words.
    vpackuswb(a, a);
    vpmovzxbw(a, a);

#else

    // Both legacy encodings start with the same saturating pack.
    packuswb(a, a);

#if _M_SSE >= 0x401
    // SSE4.1 has a dedicated zero-extend instruction.
    pmovzxbw(a, a);
#else
    // SSE2 fallback: interleave the low bytes with zeroes to widen them.
    pxor(temp, temp);
    punpcklbw(a, temp);
#endif

#endif
}
void GSDrawScanlineCodeGenerator::WriteFrame() { if(!m_sel.fwrite) { return; } if(m_sel.colclamp == 0) { // c[0] &= 0x00ff00ff; // c[1] &= 0x00ff00ff; vpcmpeqd(xmm15, xmm15); vpsrlw(xmm15, 8); vpand(xmm2, xmm15); vpand(xmm3, xmm15); } if(m_sel.fpsm == 2 && m_sel.dthe) { mov(rax, r8); and(rax, 3); shl(rax, 5); vpaddw(xmm2, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[0])]); vpaddw(xmm3, ptr[r12 + rax + offsetof(GSScanlineGlobalData, dimx[1])]); } // GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); vpunpckhwd(xmm15, xmm2, xmm3); vpunpcklwd(xmm2, xmm3); vpackuswb(xmm2, xmm15); if(m_sel.fba && m_sel.fpsm != 1) { // fs |= 0x80000000; vpcmpeqd(xmm15, xmm15); vpslld(xmm15, 31); vpor(xmm2, xmm15); } // xmm2 = fs // xmm4 = fm // xmm6 = fd if(m_sel.fpsm == 2) { // GSVector4i rb = fs & 0x00f800f8; // GSVector4i ga = fs & 0x8000f800; mov(eax, 0x00f800f8); vmovd(xmm0, eax); vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); mov(eax, 0x8000f800); vmovd(xmm1, eax); vpshufd(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); vpand(xmm0, xmm2); vpand(xmm1, xmm2); // fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3); vpsrld(xmm2, xmm0, 9); vpsrld(xmm0, 3); vpsrld(xmm3, xmm1, 16); vpsrld(xmm1, 6); vpor(xmm0, xmm1); vpor(xmm2, xmm3); vpor(xmm2, xmm0); } if(m_sel.rfb) { // fs = fs.blend(fd, fm); blend(xmm2, xmm6, xmm4); // TODO: could be skipped in certain cases, depending on fpsm and fm } bool fast = m_sel.rfb && m_sel.fpsm < 2; WritePixel(xmm2, rbx, dl, fast, m_sel.fpsm, 0); }
void GSDrawScanlineCodeGenerator::clamp16(const Ymm& a, const Ymm& temp) { vpackuswb(a, a); vpermq(a, a, _MM_SHUFFLE(3, 1, 2, 0)); // this sucks vpmovzxbw(a, a); }