/* @param y [out] the value of f(var) @param var [in] table of input variables func(double *y, const double var[]); @note func does not return double to avoid difference of compiler */ FuncGen(const std::vector<std::string>& varTbl) : constTblPos_(0) , regIdx_(-1) #ifdef XBYAK32 , valTbl_(eax) , tbl_(edx) #elif defined(XBYAK64_WIN) , valTbl_(rcx) , tbl_(rdx) #else , valTbl_(rdi) , tbl_(rsi) #endif { #ifdef XBYAK32 mov(valTbl_, ptr[esp+8]); // eax == varTbl mov(tbl_, (size_t)constTbl_); #else #ifdef XBYAK64_WIN movaps(ptr [rsp + 8], xm6); // save xm6, xm7 movaps(ptr [rsp + 8 + 16], xm7); #endif mov(tbl_, (size_t)constTbl_); #endif for (int i = 0, n = static_cast<int>(varTbl.size()); i < n; i++) { varMap_[varTbl[i]] = i; } }
/*
	movaps with a register-or-memory source operand.
	Forwards to the register overload when the source reports itself as an
	XMM register, and to the 128-bit memory overload otherwise.
*/
Encoding *RegisterAllocator::movaps(OperandXMMREG r128, OperandR_M128 r_m128)
{
	const bool sourceIsRegister = r_m128.isSubtypeOf(Operand::OPERAND_XMMREG);

	if(!sourceIsRegister)
	{
		return movaps(r128, (OperandMEM128)r_m128);
	}

	return movaps(r128, (OperandXMMREG)r_m128);
}
void complete() { #ifdef XBYAK32 mov(eax, ptr [esp + 4]); // eax = valTbl movsd(ptr [eax], xm0); #else #ifdef XBYAK64_WIN movaps(xm6, ptr [rsp + 8]); movaps(xm7, ptr [rsp + 8 + 16]); #endif #endif ret(); }
void complete() { #ifdef XBYAK32 sub(esp, 8); movsd(ptr [esp], xm0); fld(qword [esp]); add(esp, 8); #else #ifdef XBYAK64_WIN movaps(xm6, ptr [rsp + 8]); movaps(xm7, ptr [rsp + 8 + 16]); #endif #endif ret(); }
void GSSetupPrimCodeGenerator::Generate() { const int params = 0; const int _vertices = params + 4; const int _dscan = params + 8; mov(ecx, dword[esp + _vertices]); mov(edx, dword[esp + _dscan]); if((m_en.z || m_en.f) && !m_env.sel.sprite || m_en.t || m_en.c && m_env.sel.iip) { for(int i = 0; i < 5; i++) { movaps(Xmm(3 + i), xmmword[&m_shift[i]]); } } Depth(); Texture(); Color(); ret(); }
/*
	Releases XMM register i, writing its value back to memory when needed.

	@param i index into XMM[]
	@return the emitted store (movss for a partial register, movaps
	        otherwise), or 0 when nothing was stored

	When load elimination is enabled and the register still has a pending
	load instruction, that load is reserved and the allocation is cleared
	without emitting a store.
*/
Encoding *RegisterAllocator::spill128(int i)
{
	// Register loaded but not used, eliminate load and don't spill
	if(XMM[i].loadInstruction && loadElimination)
	{
		XMM[i].loadInstruction->reserve();
		XMM[i].loadInstruction = 0;

		XMM[i].reference = 0;
		XMM[i].priority = 0;
		XMM[i].partial = 0;
		XMM[i].copyInstruction = 0;
	//	XMM[i].spillInstruction = 0;   // NOTE: Keep previous spill info

		return 0;
	}

	Encoding *spillInstruction = 0;

	// Store only a register that is bound to a reference and is either
	// modified or not allowed to be dropped unmodified
	if(XMM[i].reference != 0 && (XMM[i].modified || !dropUnmodified))
	{
		if(XMM[i].partial)
		{
			spillInstruction = movss(dword_ptr [XMM[i].reference], OperandXMMREG(i));
		}
		else
		{
			spillInstruction = movaps(xword_ptr [XMM[i].reference], OperandXMMREG(i));
		}
	}

	XMM[i].free();

	return spillInstruction;
}
/* double jit(double x); @note 32bit: x : [esp+4], return fp0 64bit: x [rcx](win), xmm0(gcc), return xmm0 */ Jit() : negConst_(0x8000000000000000ULL) , constTblPos_(0) , regIdx_(-1) #ifdef XBYAK32 , varTbl_(eax) , tbl_(edx) #elif defined(XBYAK64_WIN) , tbl_(rcx) #else , tbl_(rdi) #endif { #ifdef XBYAK32 lea(varTbl_, ptr[esp+4]); #else #ifdef XBYAK64_WIN movaps(ptr [rsp + 8], xm6); // save xm6, xm7 movaps(ptr [rsp + 8 + 16], xm7); #endif movaps(xm7, xm0); // save xm0 #endif mov(tbl_, (size_t)constTbl_); }
/*
	Emits the whole setup-prim routine:
	  - xmm3..xmm7 are loaded from m_shift[0..4] when one of the enabled
	    sections needs them,
	  - the Depth, Texture and Color sections follow in that order,
	  - ret ends the routine.
*/
void GSSetupPrimCodeGenerator::Generate()
{
	const bool depthUsesShift = (m_en.z || m_en.f) && !m_sel.sprite;
	const bool colorUsesShift = m_en.c && m_sel.iip;

	if(depthUsesShift || m_en.t || colorUsesShift)
	{
		for(int slot = 0; slot != 5; ++slot)
		{
			movaps(Xmm(3 + slot), xmmword[&m_shift[slot]]);
		}
	}

	Depth();

	Texture();

	Color();

	ret();
}
/*
	Binds XMM register i to ref and optionally loads the referenced value.

	@param i    index into XMM[]
	@param ref  reference the register is bound to
	@param copy when true a load from ref is emitted (movss or movaps)
	@param ss   when true the register is marked partial (4) and the load,
	            if any, is a movss
	@return the register operand for i

	The spill instruction and spill data of the slot are read before the
	optional load and written back after it.
*/
OperandXMMREG RegisterAllocator::allocate128(int i, const OperandREF &ref, bool copy, bool ss)
{
	XMM[i].reference = ref;
	XMM[i].partial = ss ? 4 : 0;

	prioritize128(i);

	Encoding *emittedLoad = 0;
	Encoding *savedSpillInstruction = XMM[i].spillInstruction;
	AllocationData savedSpill = XMM[i].spill;

	if(copy)
	{
		if(ss)
		{
			emittedLoad = movss(OperandXMMREG(i), dword_ptr [ref]);
		}
		else
		{
			emittedLoad = movaps(OperandXMMREG(i), xword_ptr [ref]);
		}
	}

	XMM[i].loadInstruction = emittedLoad;
	XMM[i].spillInstruction = savedSpillInstruction;
	XMM[i].spill = savedSpill;
	XMM[i].modified = false;

	return OperandXMMREG(i);
}
/*
 * GSSetupPrimCodeGenerator::Depth
 *
 * Emits the instructions that fill the fog (f) and depth (z) fields of m_env.
 * Nothing is emitted when both m_en.z and m_en.f are zero.
 *
 * Registers the emitted code reads without writing them: edx (non-sprite path)
 * or ecx (sprite path) as base pointers, and xmm3 plus xmm4..xmm7 as
 * multipliers. xmm0..xmm2 are used as scratch.
 * NOTE(review): Generate() loads xmm3..xmm7 from m_shift[0..4] before calling
 * this - confirm there is no other caller.
 *
 * m_env.sel.sprite == 0:
 *   xmm0 = [edx + 16] (dscan.p per the inline comment).
 *   fog (m_en.f): lane 3 is broadcast. Its product with xmm3 is stored to
 *     m_env.d4.f and its products with xmm4..xmm7 to m_env.d[0..3].f; each
 *     product is truncated to integers (cvttps2dq) and word-shuffled with
 *     pshuflw/pshufhw before the store.
 *   z (m_en.z): lane 2 is broadcast. Its product with xmm3 is stored to
 *     m_env.d4.z and its products with xmm4..xmm7 to m_env.d[0..3].z, without
 *     integer conversion.
 *
 * m_env.sel.sprite != 0:
 *   xmm0 = [ecx + 16] (vertices[0].p per the inline comment).
 *   fog (m_en.f): truncated to integers, shuffled with pshufhw/pshufd and
 *     stored to m_env.p.f.
 *   z (m_en.z): lane 2 is broadcast and truncated to integers into m_env.p.z.
 *     With m_env.sel.zoverflow the stored value is instead
 *     (int(z * 0.5f) << 1) | (int(z) & 1); the constant 1 is built in xmm2
 *     with pcmpeqd + psrld 31.
 */
void GSSetupPrimCodeGenerator::Depth() { if(!m_en.z && !m_en.f) { return; } if(!m_env.sel.sprite) { // GSVector4 t = dscan.p; movaps(xmm0, xmmword[edx + 16]); if(m_en.f) { // GSVector4 df = p.wwww(); movaps(xmm1, xmm0); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); // m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh(); movaps(xmm2, xmm1); mulps(xmm2, xmm3); cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(xmmword[&m_env.d4.f], xmm2); for(int i = 0; i < 4; i++) { // m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); movaps(xmm2, xmm1); mulps(xmm2, Xmm(4 + i)); cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(xmmword[&m_env.d[i].f], xmm2); } } if(m_en.z) { // GSVector4 dz = p.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); // m_env.d4.z = dz * 4.0f; movaps(xmm1, xmm0); mulps(xmm1, xmm3); movdqa(xmmword[&m_env.d4.z], xmm1); for(int i = 0; i < 4; i++) { // m_env.d[i].z = dz * m_shift[i]; movaps(xmm1, xmm0); mulps(xmm1, Xmm(4 + i)); movdqa(xmmword[&m_env.d[i].z], xmm1); } } } else { // GSVector4 p = vertices[0].p; movaps(xmm0, xmmword[ecx + 16]); if(m_en.f) { // m_env.p.f = GSVector4i(p).zzzzh().zzzz(); movaps(xmm1, xmm0); cvttps2dq(xmm1, xmm1); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); movdqa(xmmword[&m_env.p.f], xmm1); } if(m_en.z) { // GSVector4 z = p.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); if(m_env.sel.zoverflow) { // m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); static const float half = 0.5f; movss(xmm1, dword[&half]); shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); mulps(xmm1, xmm0); cvttps2dq(xmm1, xmm1); pslld(xmm1, 1); cvttps2dq(xmm0, xmm0); pcmpeqd(xmm2, xmm2); psrld(xmm2, 31); pand(xmm0, xmm2); por(xmm0, xmm1); } else { // m_env.p.z = GSVector4i(z); cvttps2dq(xmm0, xmm0); } movdqa(xmmword[&m_env.p.z], xmm0); } } }
/*
 * GSSetupPrimCodeGenerator::Color
 *
 * Emits the instructions that fill the colour fields of m_env.
 * Nothing is emitted when m_en.c is zero.
 *
 * m_env.sel.iip != 0 (reads edx, xmm3 and xmm4..xmm7):
 *   c = [edx] (dscan.c per the inline comment). c * xmm3 is truncated to
 *   integers, reordered with pshufd (3, 1, 2, 0), packed with signed
 *   saturation and stored to m_env.d4.c.
 *   From that point xmm3 is overwritten as scratch, so code emitted after this
 *   section cannot rely on the value xmm3 had on entry.
 *   First loop: lanes 0 and 2 of c are broadcast, multiplied by xmm4..xmm7,
 *   truncated, packed and interleaved with punpcklwd into m_env.d[0..3].rb.
 *   c is then reloaded from [edx] and the second loop does the same with
 *   lanes 1 and 3 into m_env.d[0..3].ga.
 *
 * m_env.sel.iip == 0 (reads ecx):
 *   c = [ecx] truncated to integers (vertices[0].c per the inline comment),
 *   interleaved word-wise with its (1, 0, 3, 2) dword permutation. When
 *   m_env.sel.tfx == TFX_NONE every word is shifted right by 7. Dword 0 is
 *   then broadcast into m_env.c.rb and dword 2 into m_env.c.ga.
 */
void GSSetupPrimCodeGenerator::Color() { if(!m_en.c) { return; } if(m_env.sel.iip) { // GSVector4 c = dscan.c; movaps(xmm0, xmmword[edx]); movaps(xmm1, xmm0); // m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32(); movaps(xmm2, xmm0); mulps(xmm2, xmm3); cvttps2dq(xmm2, xmm2); pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0)); packssdw(xmm2, xmm2); movdqa(xmmword[&m_env.d4.c], xmm2); // xmm3 is not needed anymore // GSVector4 dr = c.xxxx(); // GSVector4 db = c.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); for(int i = 0; i < 4; i++) { // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); movaps(xmm2, xmm0); mulps(xmm2, Xmm(4 + i)); cvttps2dq(xmm2, xmm2); packssdw(xmm2, xmm2); // GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); movaps(xmm3, xmm1); mulps(xmm3, Xmm(4 + i)); cvttps2dq(xmm3, xmm3); packssdw(xmm3, xmm3); // m_env.d[i].rb = r.upl16(b); punpcklwd(xmm2, xmm3); movdqa(xmmword[&m_env.d[i].rb], xmm2); } // GSVector4 c = dscan.c; movaps(xmm0, xmmword[edx]); // not enough regs, have to reload it movaps(xmm1, xmm0); // GSVector4 dg = c.yyyy(); // GSVector4 da = c.wwww(); shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); for(int i = 0; i < 4; i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); movaps(xmm2, xmm0); mulps(xmm2, Xmm(4 + i)); cvttps2dq(xmm2, xmm2); packssdw(xmm2, xmm2); // GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); movaps(xmm3, xmm1); mulps(xmm3, Xmm(4 + i)); cvttps2dq(xmm3, xmm3); packssdw(xmm3, xmm3); // m_env.d[i].ga = g.upl16(a); punpcklwd(xmm2, xmm3); movdqa(xmmword[&m_env.d[i].ga], xmm2); } } else { // GSVector4i c = GSVector4i(vertices[0].c); movaps(xmm0, xmmword[ecx]); cvttps2dq(xmm0, xmm0); // c = c.upl16(c.zwxy()); movdqa(xmm1, xmm0); pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2)); punpcklwd(xmm0, xmm1); // if(!tme) c = c.srl16(7); if(m_env.sel.tfx == TFX_NONE) { psrlw(xmm0, 7); } // m_env.c.rb = c.xxxx(); // m_env.c.ga = c.zzzz(); movdqa(xmm1, 
xmm0); pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); movdqa(xmmword[&m_env.c.rb], xmm0); movdqa(xmmword[&m_env.c.ga], xmm1); } }
/*
 * GSSetupPrimCodeGenerator::Texture
 *
 * Emits the instructions that fill the texture coordinate deltas of m_env.
 * Nothing is emitted when m_en.t is zero.
 *
 * The emitted code reads edx as base pointer and xmm3, xmm4..xmm7 as
 * multipliers; xmm0..xmm2 are scratch.
 *
 *   t = [edx + 32] (dscan.t per the inline comment), kept in xmm0.
 *   t * xmm3 is stored to m_env.d4.st after truncation to integers when
 *   m_env.sel.fst is set, otherwise to m_env.d4.stq as floats.
 *
 *   Then, for lane j = 0, 1 (fst) or j = 0, 1, 2 (otherwise), lane j of t is
 *   broadcast and multiplied by xmm4..xmm7:
 *     fst:       truncated results go to m_env.d[i].si (j == 0) and
 *                m_env.d[i].ti (j == 1);
 *     otherwise: float results go to m_env.d[i].s, .t and .q for j = 0, 1, 2.
 *
 * j and i are code-generation-time loop counters: the shuffle immediate and
 * the store address of each emitted instruction are fixed when it is emitted.
 */
void GSSetupPrimCodeGenerator::Texture() { if(!m_en.t) { return; } // GSVector4 t = dscan.t; movaps(xmm0, xmmword[edx + 32]); movaps(xmm1, xmm0); mulps(xmm1, xmm3); if(m_env.sel.fst) { // m_env.d4.st = GSVector4i(t * 4.0f); cvttps2dq(xmm1, xmm1); movdqa(xmmword[&m_env.d4.st], xmm1); } else { // m_env.d4.stq = t * 4.0f; movaps(xmmword[&m_env.d4.stq], xmm1); } for(int j = 0, k = m_env.sel.fst ? 2 : 3; j < k; j++) { // GSVector4 ds = t.xxxx(); // GSVector4 dt = t.yyyy(); // GSVector4 dq = t.zzzz(); movaps(xmm1, xmm0); shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); for(int i = 0; i < 4; i++) { // GSVector4 v = ds/dt * m_shift[i]; movaps(xmm2, xmm1); mulps(xmm2, Xmm(4 + i)); if(m_env.sel.fst) { // m_env.d[i].si/ti = GSVector4i(v); cvttps2dq(xmm2, xmm2); switch(j) { case 0: movdqa(xmmword[&m_env.d[i].si], xmm2); break; case 1: movdqa(xmmword[&m_env.d[i].ti], xmm2); break; } } else { // m_env.d[i].s/t/q = v; switch(j) { case 0: movaps(xmmword[&m_env.d[i].s], xmm2); break; case 1: movaps(xmmword[&m_env.d[i].t], xmm2); break; case 2: movaps(xmmword[&m_env.d[i].q], xmm2); break; } } } } }
struct code encode(struct instruction instr) { switch (instr.opcode) { case INSTR_ADD: return add(instr.optype, instr.source, instr.dest); case INSTR_NOT: return not(instr.optype, instr.source); case INSTR_MUL: return mul(instr.optype, instr.source); case INSTR_XOR: return xor(instr.optype, instr.source, instr.dest); case INSTR_DIV: return encode_div(instr.optype, instr.source); case INSTR_AND: return and(instr.optype, instr.source, instr.dest); case INSTR_OR: return or(instr.optype, instr.source, instr.dest); case INSTR_SHL: return shl(instr.optype, instr.source, instr.dest); case INSTR_SHR: return shr(instr.optype, instr.source, instr.dest); case INSTR_SAR: return sar(instr.optype, instr.source, instr.dest); case INSTR_CALL: return call(instr.optype, instr.source); case INSTR_CMP: return cmp(instr.optype, instr.source, instr.dest); case INSTR_MOV: return mov(instr.optype, instr.source, instr.dest); case INSTR_MOVSX: return movsx(instr.optype, instr.source, instr.dest); case INSTR_MOVZX: return movzx(instr.optype, instr.source, instr.dest); case INSTR_MOVAPS: return movaps(instr.optype, instr.source, instr.dest); case INSTR_PUSH: return push(instr.optype, instr.source); case INSTR_SUB: return sub(instr.optype, instr.source, instr.dest); case INSTR_LEA: return lea(instr.optype, instr.source, instr.dest); case INSTR_LEAVE: return leave(); case INSTR_REP_MOVSQ: assert(instr.optype == OPT_NONE); return rep_movsq(); case INSTR_RET: return ret(); case INSTR_JMP: return jmp(instr.optype, instr.source); case INSTR_JA: return jcc(instr.optype, TEST_A, instr.source); case INSTR_JG: return jcc(instr.optype, TEST_G, instr.source); case INSTR_JZ: return jcc(instr.optype, TEST_Z, instr.source); case INSTR_JAE: return jcc(instr.optype, TEST_AE, instr.source); case INSTR_JGE: return jcc(instr.optype, TEST_GE, instr.source); case INSTR_SETZ: return setcc(instr.optype, TEST_Z, instr.source); case INSTR_SETA: return setcc(instr.optype, TEST_A, instr.source); case INSTR_SETG: return 
setcc(instr.optype, TEST_G, instr.source); case INSTR_SETAE: return setcc(instr.optype, TEST_AE, instr.source); case INSTR_SETGE: return setcc(instr.optype, TEST_GE, instr.source); case INSTR_TEST: return test(instr.optype, instr.source, instr.dest); default: return nop(); } }
/*
 * GSDrawScanlineCodeGenerator::Step
 *
 * Emits the code that advances the scanline state. Which instructions are
 * emitted is decided at code-generation time from m_sel.
 *
 * Always: ecx -= 4 (steps) and edi += 8 (fza_offset, per the inline comments).
 *
 * !m_sel.sprite:
 *   zb:            m_env.temp.z += m_env.d4.z (addps), result also left in xmm0.
 *   fwrite && fge: m_env.temp.f += m_env.d4.f (paddw), result also left in xmm1.
 * m_sel.sprite:
 *   ztest:         xmm0 = m_env.p.z.
 *
 * m_sel.fb:
 *   tfx != TFX_NONE, fst:
 *     m_env.temp.s += lane 0 of m_env.d4.st (result in xmm2);
 *     non-sprite: m_env.temp.t += lane 1 of m_env.d4.st (result in xmm3),
 *     sprite:     xmm3 = m_env.temp.t unchanged.
 *   tfx != TFX_NONE, !fst:
 *     m_env.temp.s/t/q += lanes 0/1/2 of m_env.d4.stq; after the stores
 *     xmm4 = rcpps(q) and xmm2, xmm3 are multiplied by it.
 *   unless (tfx == TFX_DECAL && tcc):
 *     iip:  m_env.temp.rb += lane 0 of m_env.d4.c (xmm5) and
 *           m_env.temp.ga += lane 1 of m_env.d4.c (xmm6), word-wise adds.
 *     !iip: when tfx == TFX_NONE, xmm5 = m_env.c.rb and xmm6 = m_env.c.ga.
 *
 * Always last: edx = (ecx & (ecx >> 31)) << 4 and
 * xmm7 = 16 bytes at &m_test[7] + edx, i.e. m_test[7 + (steps & (steps >> 31))]
 * as the inline comment states (assuming 16-byte m_test entries - TODO confirm).
 */
void GSDrawScanlineCodeGenerator::Step() { // steps -= 4; sub(ecx, 4); // fza_offset++; add(edi, 8); if(!m_sel.sprite) { // z += m_env.d4.z; if(m_sel.zb) { movaps(xmm0, xmmword[&m_env.temp.z]); addps(xmm0, xmmword[&m_env.d4.z]); movaps(xmmword[&m_env.temp.z], xmm0); } // f = f.add16(m_env.d4.f); if(m_sel.fwrite && m_sel.fge) { movdqa(xmm1, xmmword[&m_env.temp.f]); paddw(xmm1, xmmword[&m_env.d4.f]); movdqa(xmmword[&m_env.temp.f], xmm1); } } else { if(m_sel.ztest) { movdqa(xmm0, xmmword[&m_env.p.z]); } } if(m_sel.fb) { if(m_sel.tfx != TFX_NONE) { if(m_sel.fst) { // GSVector4i st = m_env.d4.st; // si += st.xxxx(); // if(!sprite) ti += st.yyyy(); movdqa(xmm4, xmmword[&m_env.d4.st]); pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); paddd(xmm2, xmmword[&m_env.temp.s]); movdqa(xmmword[&m_env.temp.s], xmm2); if(!m_sel.sprite) { pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); paddd(xmm3, xmmword[&m_env.temp.t]); movdqa(xmmword[&m_env.temp.t], xmm3); } else { movdqa(xmm3, xmmword[&m_env.temp.t]); } } else { // GSVector4 stq = m_env.d4.stq; // s += stq.xxxx(); // t += stq.yyyy(); // q += stq.zzzz(); movaps(xmm2, xmmword[&m_env.d4.stq]); movaps(xmm3, xmm2); movaps(xmm4, xmm2); shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); addps(xmm2, xmmword[&m_env.temp.s]); addps(xmm3, xmmword[&m_env.temp.t]); addps(xmm4, xmmword[&m_env.temp.q]); movaps(xmmword[&m_env.temp.s], xmm2); movaps(xmmword[&m_env.temp.t], xmm3); movaps(xmmword[&m_env.temp.q], xmm4); rcpps(xmm4, xmm4); mulps(xmm2, xmm4); mulps(xmm3, xmm4); } } if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) { if(m_sel.iip) { // GSVector4i c = m_env.d4.c; // rb = rb.add16(c.xxxx()); // ga = ga.add16(c.yyyy()); movdqa(xmm7, xmmword[&m_env.d4.c]); pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1)); paddw(xmm5, xmmword[&m_env.temp.rb]); paddw(xmm6, xmmword[&m_env.temp.ga]); movdqa(xmmword[&m_env.temp.rb], xmm5); 
movdqa(xmmword[&m_env.temp.ga], xmm6); } else { if(m_sel.tfx == TFX_NONE) { movdqa(xmm5, xmmword[&m_env.c.rb]); movdqa(xmm6, xmmword[&m_env.c.ga]); } } } } // test = m_test[7 + (steps & (steps >> 31))]; mov(edx, ecx); sar(edx, 31); and(edx, ecx); shl(edx, 4); movdqa(xmm7, xmmword[edx + (size_t)&m_test[7]]); }
/*
 * GSDrawScanlineCodeGenerator::Init
 *
 * Emits the code that sets up the scanline state before the first step.
 * Which instructions are emitted is decided at code-generation time from m_sel.
 *
 * @param params offset added to the stack displacements: "top" is read from
 *               [esp + params + 4] and "v" from [esp + params + 8].
 *
 * On entry the emitted code reads edx and ecx (left and right according to the
 * inline comments - NOTE(review): the calling convention is not visible here).
 *
 * Emitted unconditionally:
 *   edx = skip = left & 3, ebx = left - skip, ecx = steps = right - ebx - 4;
 *   xmm7 = 16 bytes at &m_test[0] + skip * 16, OR-ed with the 16 bytes at
 *          &m_test[7] + (steps & (steps >> 31)) * 16;
 *   esi = top * 8 + [m_env.fzbr];
 *   edi = ebx * 2 + [m_env.fzbc].
 *
 * When any later section needs them:
 *   edx is shifted left by 4 a second time (skip * 256 in total) and has the
 *   address of m_env.d added; ebx = v (from the stack).
 *   The displacements used on edx below pair up with the inline comments as
 *   +0 z, +16 s, +32 t, +48 q, +64 rb, +80 ga, +96 f, +112 si, +128 ti.
 *
 * !m_sel.sprite, (fwrite && fge) or zb: xmm0 = [ebx + 16] (v.p);
 *   fwrite && fge: m_env.temp.f = shuffled int(v.p) + [edx + 96] (paddw);
 *   zb:            m_env.temp.z = lane 2 of v.p + [edx] (addps).
 * m_sel.sprite, ztest: xmm0 = m_env.p.z.
 *
 * m_sel.fb:
 *   edge or tfx != TFX_NONE: xmm4 = [ebx + 32] (v.t).
 *   edge: m_env.temp.cov = shuffled v.t shifted right by 9 per word.
 *   tfx != TFX_NONE, fst: v.t is truncated to integers;
 *     m_env.temp.s = lane 0 + [edx + 112];
 *     m_env.temp.t = lane 1, plus [edx + 128] when not a sprite;
 *     sprite && ltf: m_env.temp.vf = word-shuffled lane 1 shifted right by 1.
 *   tfx != TFX_NONE, !fst: m_env.temp.s/t/q = lanes 0/1/2 of v.t plus
 *     [edx + 16] / [edx + 32] / [edx + 48]; afterwards xmm4 = rcpps(q) and
 *     xmm2, xmm3 are multiplied by it.
 *   unless (tfx == TFX_DECAL && tcc):
 *     iip:  int([ebx]) is interleaved with its (1, 0, 3, 2) permutation;
 *           m_env.temp.rb = dword 0 broadcast + [edx + 64],
 *           m_env.temp.ga = dword 2 broadcast + [edx + 80] (paddw).
 *     !iip: when tfx == TFX_NONE, xmm5 = m_env.c.rb and xmm6 = m_env.c.ga.
 */
void GSDrawScanlineCodeGenerator::Init(int params) { const int _top = params + 4; const int _v = params + 8; // int skip = left & 3; mov(ebx, edx); and(edx, 3); // left -= skip; sub(ebx, edx); // int steps = right - left - 4; sub(ecx, ebx); sub(ecx, 4); // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; shl(edx, 4); movdqa(xmm7, xmmword[edx + (size_t)&m_test[0]]); mov(eax, ecx); sar(eax, 31); and(eax, ecx); shl(eax, 4); por(xmm7, xmmword[eax + (size_t)&m_test[7]]); // GSVector2i* fza_base = &m_env.fzbr[top]; mov(esi, dword[esp + _top]); lea(esi, ptr[esi * 8]); add(esi, dword[&m_env.fzbr]); // GSVector2i* fza_offset = &m_env.fzbc[left >> 2]; lea(edi, ptr[ebx * 2]); add(edi, dword[&m_env.fzbc]); if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip)) { // edx = &m_env.d[skip] shl(edx, 4); lea(edx, ptr[edx + (size_t)m_env.d]); // ebx = &v mov(ebx, dword[esp + _v]); } if(!m_sel.sprite) { if(m_sel.fwrite && m_sel.fge || m_sel.zb) { movaps(xmm0, xmmword[ebx + 16]); // v.p if(m_sel.fwrite && m_sel.fge) { // f = GSVector4i(vp).zzzzh().zzzz().add16(m_env.d[skip].f); cvttps2dq(xmm1, xmm0); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); paddw(xmm1, xmmword[edx + 16 * 6]); movdqa(xmmword[&m_env.temp.f], xmm1); } if(m_sel.zb) { // z = vp.zzzz() + m_env.d[skip].z; shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); addps(xmm0, xmmword[edx]); movaps(xmmword[&m_env.temp.z], xmm0); } } } else { if(m_sel.ztest) { movdqa(xmm0, xmmword[&m_env.p.z]); } } if(m_sel.fb) { if(m_sel.edge || m_sel.tfx != TFX_NONE) { movaps(xmm4, xmmword[ebx + 32]); // v.t } if(m_sel.edge) { pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3)); psrlw(xmm3, 9); movdqa(xmmword[&m_env.temp.cov], xmm3); } if(m_sel.tfx != TFX_NONE) { if(m_sel.fst) { // GSVector4i vti(vt); cvttps2dq(xmm4, xmm4); // si = vti.xxxx() + m_env.d[skip].si; // ti = 
vti.yyyy(); if(!sprite) ti += m_env.d[skip].ti; pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1)); paddd(xmm2, xmmword[edx + 16 * 7]); if(!m_sel.sprite) { paddd(xmm3, xmmword[edx + 16 * 8]); } else { if(m_sel.ltf) { movdqa(xmm4, xmm3); pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0)); psrlw(xmm4, 1); movdqa(xmmword[&m_env.temp.vf], xmm4); } } movdqa(xmmword[&m_env.temp.s], xmm2); movdqa(xmmword[&m_env.temp.t], xmm3); } else { // s = vt.xxxx() + m_env.d[skip].s; // t = vt.yyyy() + m_env.d[skip].t; // q = vt.zzzz() + m_env.d[skip].q; movaps(xmm2, xmm4); movaps(xmm3, xmm4); shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2)); addps(xmm2, xmmword[edx + 16 * 1]); addps(xmm3, xmmword[edx + 16 * 2]); addps(xmm4, xmmword[edx + 16 * 3]); movaps(xmmword[&m_env.temp.s], xmm2); movaps(xmmword[&m_env.temp.t], xmm3); movaps(xmmword[&m_env.temp.q], xmm4); rcpps(xmm4, xmm4); mulps(xmm2, xmm4); mulps(xmm3, xmm4); } } if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc)) { if(m_sel.iip) { // GSVector4i vc = GSVector4i(v.c); cvttps2dq(xmm6, xmmword[ebx]); // v.c // vc = vc.upl16(vc.zwxy()); pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2)); punpcklwd(xmm6, xmm5); // rb = vc.xxxx().add16(m_env.d[skip].rb); // ga = vc.zzzz().add16(m_env.d[skip].ga); pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0)); pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2)); paddw(xmm5, xmmword[edx + 16 * 4]); paddw(xmm6, xmmword[edx + 16 * 5]); movdqa(xmmword[&m_env.temp.rb], xmm5); movdqa(xmmword[&m_env.temp.ga], xmm6); } else { if(m_sel.tfx == TFX_NONE) { movdqa(xmm5, xmmword[&m_env.c.rb]); movdqa(xmm6, xmmword[&m_env.c.ga]); } } } } }
float TfirFilter::vec_inner_prod_sse(const float *eax, const float *edi, int ecx) { __m128 xmm3,xmm4,xmm0,xmm1,xmm5,xmm6; xorps (xmm3, xmm3); xorps (xmm4, xmm4); ecx-=8;// sub $8, %%ecx if (ecx<0) goto //jb mul8_skip; mul8_loop: movups (xmm0,eax); movups (xmm1,edi); movups (xmm5,16/sizeof(float)+eax); movups (xmm6,16/sizeof(float)+edi); eax+=32/sizeof(float); edi+=32/sizeof(float); mulps (xmm1,xmm0); mulps (xmm6,xmm5); addps (xmm3,xmm1); addps (xmm4,xmm6); ecx-=8; if (ecx>=0) //jae goto mul8_loop; mul8_skip: addps (xmm3,xmm4); ecx+=4; if (ecx<0) //jl goto mul4_skip; movups (xmm0,eax); movups (xmm1,edi); eax+=16/sizeof(float); edi+=16/sizeof(float); mulps (xmm1, xmm0); addps (xmm3, xmm1); ecx-=4; mul4_skip: ecx+=4; goto cond1; mul1_loop: movss (xmm0,eax); movss (xmm1,edi); eax+=4/sizeof(float); edi+=4/sizeof(float); mulss (xmm1,xmm0); addss (xmm3,xmm1); cond1: ecx-=1; if (ecx>=0) // jae goto mul1_loop; movhlps (xmm4,xmm3); addps (xmm3,xmm4); movaps (xmm4,xmm3); //FIXME: which one? xmm4=_mm_shuffle_ps(xmm4,xmm4,0x55);// shufps $0x55, xmm4, xmm4 // shufps $33, xmm4, xmm4 addss (xmm3, xmm4); float sum; movss (&sum , xmm3); return sum; }