void GSDrawScanlineCodeGenerator::WriteMask()
{
	// fm |= test;
	// zm |= test;

	if(m_sel.fwrite)
	{
		vpor(xmm4, xmm15);
	}

	if(m_sel.zwrite)
	{
		vpor(xmm5, xmm15);
	}

	// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();

	vpcmpeqd(xmm1, xmm1);

	if(m_sel.fwrite && m_sel.zwrite)
	{
		vpcmpeqd(xmm0, xmm1, xmm5);
		vpcmpeqd(xmm1, xmm4);
		vpackssdw(xmm1, xmm0);
	}
	else if(m_sel.fwrite)
	{
		vpcmpeqd(xmm1, xmm4);
		vpackssdw(xmm1, xmm1);
	}
	else if(m_sel.zwrite)
	{
		vpcmpeqd(xmm1, xmm5);
		vpackssdw(xmm1, xmm1);
	}

	vpmovmskb(edx, xmm1);

	not(edx);
}
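// For reference only: an intrinsics-level sketch of what the emitted sequence
// computes, assuming both fwrite and zwrite are enabled and the register
// mapping used above (fm in xmm4, zm in xmm5, test in xmm15). This is an
// illustration of the commented pseudo-code, not part of the code generator.

#include <emmintrin.h> // SSE2; would live at file scope in real code

static int WriteMaskSketch(__m128i fm, __m128i zm, __m128i test)
{
	fm = _mm_or_si128(fm, test);                  // fm |= test
	zm = _mm_or_si128(zm, test);                  // zm |= test

	__m128i ones = _mm_set1_epi32(-1);            // GSVector4i::xffffffff()
	__m128i fcmp = _mm_cmpeq_epi32(fm, ones);     // which fm lanes are fully masked?
	__m128i zcmp = _mm_cmpeq_epi32(zm, ones);     // which zm lanes are fully masked?
	__m128i packed = _mm_packs_epi32(fcmp, zcmp); // .ps32(...): fm result low, zm high

	return ~_mm_movemask_epi8(packed);            // ~(...).mask() -> fzm in edx
}

// A set bit in the result marks a byte of a pixel whose frame or depth value
// still needs to be written, which is what the later write paths test.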
void GSDrawScanlineCodeGenerator::SampleTexture()
{
	if(!m_sel.fb || m_sel.tfx == TFX_NONE)
	{
		return;
	}

	mov(rbx, ptr[r12 + offsetof(GSScanlineGlobalData, tex)]); // rbx = tex

	if(!m_sel.fst)
	{
		vrcpps(xmm0, xmm12);

		vmulps(xmm4, xmm10, xmm0);
		vmulps(xmm5, xmm11, xmm0);

		vcvttps2dq(xmm4, xmm4);
		vcvttps2dq(xmm5, xmm5);

		if(m_sel.ltf)
		{
			// u -= 0x8000;
			// v -= 0x8000;

			mov(eax, 0x8000);
			vmovd(xmm0, eax);
			vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));

			vpsubd(xmm4, xmm0);
			vpsubd(xmm5, xmm0);
		}
	}
	else
	{
		vmovdqa(xmm4, xmm10);
		vmovdqa(xmm5, xmm11);
	}

	if(m_sel.ltf)
	{
		// GSVector4i uf = u.xxzzlh().srl16(1);

		vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
		vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
		vpsrlw(xmm6, 1);

		if(!m_sel.sprite)
		{
			// GSVector4i vf = v.xxzzlh().srl16(1);

			vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0));
			vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0));
			vpsrlw(xmm7, 1);
		}
	}

	// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));

	vpsrad(xmm4, 16);
	vpsrad(xmm5, 16);
	vpackssdw(xmm4, xmm5);

	if(m_sel.ltf)
	{
		// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());

		vpcmpeqd(xmm0, xmm0);
		vpsrlw(xmm0, 15);
		vpaddw(xmm5, xmm4, xmm0);

		// uv0 = Wrap(uv0);
		// uv1 = Wrap(uv1);

		Wrap(xmm4, xmm5);
	}
	else
	{
		// uv0 = Wrap(uv0);

		Wrap(xmm4);
	}

	// xmm4 = uv0
	// xmm5 = uv1 (ltf)
	// xmm6 = uf
	// xmm7 = vf

	// GSVector4i x0 = uv0.upl16();
	// GSVector4i y0 = uv0.uph16() << tw;

	vpxor(xmm0, xmm0);

	vpunpcklwd(xmm2, xmm4, xmm0);
	vpunpckhwd(xmm3, xmm4, xmm0);
	vpslld(xmm3, m_sel.tw + 3);

	// xmm0 = 0
	// xmm2 = x0
	// xmm3 = y0
	// xmm5 = uv1 (ltf)
	// xmm6 = uf
	// xmm7 = vf

	if(m_sel.ltf)
	{
		// GSVector4i x1 = uv1.upl16();
		// GSVector4i y1 = uv1.uph16() << tw;

		vpunpcklwd(xmm4, xmm5, xmm0);
		vpunpckhwd(xmm5, xmm5, xmm0);
		vpslld(xmm5, m_sel.tw + 3);

		// xmm2 = x0
		// xmm3 = y0
		// xmm4 = x1
		// xmm5 = y1
		// xmm6 = uf
		// xmm7 = vf

		// GSVector4i addr00 = y0 + x0;
		// GSVector4i addr01 = y0 + x1;
		// GSVector4i addr10 = y1 + x0;
		// GSVector4i addr11 = y1 + x1;

		vpaddd(xmm0, xmm3, xmm2);
		vpaddd(xmm1, xmm3, xmm4);
		vpaddd(xmm2, xmm5, xmm2);
		vpaddd(xmm3, xmm5, xmm4);

		// xmm0 = addr00
		// xmm1 = addr01
		// xmm2 = addr10
		// xmm3 = addr11
		// xmm6 = uf
		// xmm7 = vf

		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
		// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
		// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
		// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);

		ReadTexel(4, 0);

		// xmm0 = c00
		// xmm1 = c01
		// xmm2 = c10
		// xmm3 = c11
		// xmm6 = uf
		// xmm7 = vf

		// GSVector4i rb00 = c00 & mask;
		// GSVector4i ga00 = (c00 >> 8) & mask;

		vpsllw(xmm4, xmm0, 8);
		vpsrlw(xmm4, 8);
		vpsrlw(xmm5, xmm0, 8);

		// GSVector4i rb01 = c01 & mask;
		// GSVector4i ga01 = (c01 >> 8) & mask;

		vpsllw(xmm0, xmm1, 8);
		vpsrlw(xmm0, 8);
		vpsrlw(xmm1, 8);

		// xmm0 = rb01
		// xmm1 = ga01
		// xmm2 = c10
		// xmm3 = c11
		// xmm4 = rb00
		// xmm5 = ga00
		// xmm6 = uf
		// xmm7 = vf

		// rb00 = rb00.lerp16<0>(rb01, uf);
		// ga00 = ga00.lerp16<0>(ga01, uf);

		lerp16(xmm0, xmm4, xmm6, 0);
		lerp16(xmm1, xmm5, xmm6, 0);

		// xmm0 = rb00
		// xmm1 = ga00
		// xmm2 = c10
		// xmm3 = c11
		// xmm6 = uf
		// xmm7 = vf

		// GSVector4i rb10 = c10 & mask;
		// GSVector4i ga10 = (c10 >> 8) & mask;

		vpsrlw(xmm5, xmm2, 8);
		vpsllw(xmm2, 8);
		vpsrlw(xmm4, xmm2, 8);

		// GSVector4i rb11 = c11 & mask;
		// GSVector4i ga11 = (c11 >> 8) & mask;

		vpsllw(xmm2, xmm3, 8);
		vpsrlw(xmm2, 8);
		vpsrlw(xmm3, 8);

		// xmm0 = rb00
		// xmm1 = ga00
		// xmm2 = rb11
		// xmm3 = ga11
		// xmm4 = rb10
		// xmm5 = ga10
		// xmm6 = uf
		// xmm7 = vf

		// rb10 = rb10.lerp16<0>(rb11, uf);
		// ga10 = ga10.lerp16<0>(ga11, uf);

		lerp16(xmm2, xmm4, xmm6, 0);
		lerp16(xmm3, xmm5, xmm6, 0);

		// xmm0 = rb00
		// xmm1 = ga00
		// xmm2 = rb10
		// xmm3 = ga10
		// xmm7 = vf

		// rb00 = rb00.lerp16<0>(rb10, vf);
		// ga00 = ga00.lerp16<0>(ga10, vf);

		lerp16(xmm2, xmm0, xmm7, 0);
		lerp16(xmm3, xmm1, xmm7, 0);
	}
	else
	{
		// GSVector4i addr00 = y0 + x0;

		vpaddd(xmm3, xmm2);

		// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);

		ReadTexel(1, 0);

		// GSVector4i mask = GSVector4i::x00ff();

		// c[0] = c00 & mask;
		// c[1] = (c00 >> 8) & mask;

		vpsrlw(xmm3, xmm2, 8);
		vpsllw(xmm2, 8);
		vpsrlw(xmm2, 8);
	}

	// xmm2 = rb
	// xmm3 = ga
}
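// The four-texel (ltf) path above is plain bilinear filtering performed
// separately on the split rb/ga channel pairs. A scalar sketch of the math,
// assuming lerp16(a, b, f) computes b + ((a - b) * f >> 15) with uf/vf as
// Q1.15 fractions; this is inferred from the commented pseudo-code and the
// srl16(1) that builds uf/vf, not a drop-in replacement for the helper.

static int Lerp16Sketch(int a, int b, int f)
{
	return b + (((a - b) * f) >> 15); // f in 0..0x7fff, result lands where a was
}

static int BilerpChannelSketch(int c00, int c01, int c10, int c11, int uf, int vf)
{
	int top = Lerp16Sketch(c01, c00, uf);    // rb00 = rb00.lerp16<0>(rb01, uf)
	int bottom = Lerp16Sketch(c11, c10, uf); // rb10 = rb10.lerp16<0>(rb11, uf)
	return Lerp16Sketch(bottom, top, vf);    // rb00 = rb00.lerp16<0>(rb10, vf)
}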
void forward_avx2() {
    xor_(reg_soff, reg_soff);
    Label mb_sp_loop;
    L(mb_sp_loop);
    {
        channel_loop(
                [=](size_t unroll) {
                    // Load 32 channels (two C16_blocks) into ymm, then
                    // split the work in half, each half split across two
                    // regs with 8 channels per reg. When down-converting,
                    // put the result in a temp register on the 1st
                    // iteration, combine the results on the 2nd iteration
                    // and store one ymm with 32 channels.
                    // If there are only 16 channels, do just one half and
                    // store the result with a mask.
                    Vmm v0 = Vmm(0);
                    Vmm v1 = Vmm(1);
                    Vmm vscale0 = Vmm(2);
                    Vmm vshift0 = Vmm(3);
                    Vmm vmean0 = Vmm(4);
                    Vmm vsqrtvar0 = Vmm(5);
                    Vmm vscale1 = Vmm(6);
                    Vmm vshift1 = Vmm(7);
                    Vmm vmean1 = Vmm(8);
                    Vmm vsqrtvar1 = Vmm(9);
                    Vmm tmp = Vmm(10);

                    for (size_t i = 0; i < unroll; i++) {
                        compute_vscaleshift(vscale0, vshift0, vmean0,
                                vsqrtvar0, i * c_in_xmm_ * sizeof(float));
                        compute_vscaleshift(vscale1, vshift1, vmean1,
                                vsqrtvar1,
                                i * c_in_xmm_ * sizeof(float)
                                        + simd_w_ * sizeof(float));

                        vpmovsxbd(v0, src_ptr(i * c_in_xmm_));
                        vpmovsxbd(v1, src_ptr(i * c_in_xmm_ + simd_w_));
                        vcvtdq2ps(v0, v0);
                        vcvtdq2ps(v1, v1);

                        uni_vfmadd213ps(v0, vscale0, vshift0);
                        uni_vfmadd213ps(v1, vscale1, vshift1);
                        if (with_relu_) {
                            uni_vmaxps(v0, v0, vzero);
                            uni_vmaxps(v1, v1, vzero);
                        }

                        vcvtps2dq(v0, v0); // BA
                        vcvtps2dq(v1, v1); // DC
                        vpackssdw(v0, v0, v1); // BA + DC -> DBCA
                        vpermq(v0, v0, 0xD8); // DBCA -> DCBA
                        vperm2i128(v1, v0, v0, 0x1); // DCBA -> BADC
                        vpacksswb(v0, v0, v1); // DCBA + BADC -> badcDCBA

                        if (i == 0 && unroll != 1)
                            uni_vmovups(tmp, v0);
                        else if (i == 1) {
                            // badcDCBA + fehgHGFE -> HGFEDCBA
                            vperm2i128(v0, v0, tmp, 0x2);
                        }
                    }

                    if (unroll == 1)
                        vmaskmovps(dst_ptr(), vbody_mask, v0);
                    else
                        uni_vmovups(dst_ptr(), v0);
                },
                [=]() {
                    // Handle the first 8 channels. If the tail is bigger,
                    // handle the second part separately. There is no way
                    // to get full performance here, as one has to work
                    // with bytes via xmm; vzeroupper kills all the perf.
                    Xmm x0 = Xmm(0);
                    Vmm v0 = Vmm(0);
                    Vmm vscale0 = Vmm(1);
                    Vmm vshift0 = Vmm(2);
                    Vmm vmean0 = Vmm(3);
                    Vmm vsqrtvar0 = Vmm(4);

                    size_t tail = nstl::min(c_tail_, simd_w_);
                    size_t num_iters = c_tail_ > simd_w_ ? 2 : 1;

                    for (size_t i = 0; i < num_iters; i++) {
                        if (i > 0) tail = c_tail_ - simd_w_;

                        for (size_t tl = 0; tl < tail; tl++)
                            vpinsrb(x0, x0, src_ptr(8 * i + tl), tl);

                        if (tail == simd_w_)
                            compute_vscaleshift(vscale0, vshift0, vmean0,
                                    vsqrtvar0, 32 * i);
                        else
                            compute_vscaleshift(vscale0, vshift0, vmean0,
                                    vsqrtvar0, 32 * i, true);

                        vpmovsxbd(v0, x0);
                        vcvtdq2ps(v0, v0);
                        uni_vfmadd213ps(v0, vscale0, vshift0);
                        if (with_relu_) uni_vmaxps(v0, v0, vzero);
                        vcvtps2dq(v0, v0);
                        vpackssdw(v0, v0, vzero);
                        vpermq(v0, v0, 0xD8);
                        vpacksswb(v0, v0, vzero);

                        for (size_t tl = 0; tl < tail; tl++)
                            vpextrb(dst_ptr(8 * i + tl), x0, tl);
                    }
                });

        add(reg_soff, reg_coff_max);
        cmp(reg_soff, reg_soff_max);
        jl(mb_sp_loop);
    }
}
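// Scalar reference for the per-element math the JIT kernel above vectorizes:
// int8 -> f32, fused scale/shift, optional ReLU, then saturating conversion
// back to int8. A sketch under the assumption that compute_vscaleshift folds
// mean/variance into vscale/vshift in the standard batchnorm way
// (scale = gamma / sqrt(var + eps), shift = beta - mean * scale); the
// function name and parameters here are illustrative, not the library's API.

#include <algorithm>
#include <cmath>
#include <cstdint>

static int8_t bnorm_s8_ref(int8_t src, float mean, float var, float eps,
        float gamma, float beta, bool with_relu) {
    float scale = gamma / std::sqrt(var + eps); // assumed vscale contents
    float shift = beta - mean * scale;          // assumed vshift contents
    float v = (float)src * scale + shift;       // vpmovsxbd + vcvtdq2ps + vfmadd213ps
    if (with_relu) v = std::max(v, 0.f);        // uni_vmaxps against vzero
    int q = (int)std::nearbyintf(v);            // vcvtps2dq (round to nearest even)
    q = std::max(-128, std::min(127, q));       // vpackssdw/vpacksswb saturation
    return (int8_t)q;
}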