void GSDrawScanlineCodeGenerator::Init(int params)
{
    const int _top = params + 4;
    const int _v = params + 8;

    // int skip = left & 3;

    mov(ebx, edx);
    and(edx, 3);

    // left -= skip;

    sub(ebx, edx);

    // int steps = right - left - 4;

    sub(ecx, ebx);
    sub(ecx, 4);

    // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];

    shl(edx, 4);

    movdqa(xmm7, xmmword[edx + (size_t)&m_test[0]]);

    mov(eax, ecx);
    sar(eax, 31);
    and(eax, ecx);
    shl(eax, 4);

    por(xmm7, xmmword[eax + (size_t)&m_test[7]]);

    // GSVector2i* fza_base = &m_env.fzbr[top];

    mov(esi, dword[esp + _top]);
    lea(esi, ptr[esi * 8]);
    add(esi, dword[&m_env.fzbr]);

    // GSVector2i* fza_offset = &m_env.fzbc[left >> 2];

    lea(edi, ptr[ebx * 2]);
    add(edi, dword[&m_env.fzbc]);

    if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
    {
        // edx = &m_env.d[skip]

        shl(edx, 4);
        lea(edx, ptr[edx + (size_t)m_env.d]);

        // ebx = &v

        mov(ebx, dword[esp + _v]);
    }

    if(!m_sel.sprite)
    {
        if(m_sel.fwrite && m_sel.fge || m_sel.zb)
        {
            movaps(xmm0, xmmword[ebx + 16]); // v.p

            if(m_sel.fwrite && m_sel.fge)
            {
                // f = GSVector4i(vp).zzzzh().zzzz().add16(m_env.d[skip].f);

                cvttps2dq(xmm1, xmm0);
                pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
                pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
                paddw(xmm1, xmmword[edx + 16 * 6]);

                movdqa(xmmword[&m_env.temp.f], xmm1);
            }

            if(m_sel.zb)
            {
                // z = vp.zzzz() + m_env.d[skip].z;

                shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
                addps(xmm0, xmmword[edx]);

                movaps(xmmword[&m_env.temp.z], xmm0);
            }
        }
    }
    else
    {
        if(m_sel.ztest)
        {
            movdqa(xmm0, xmmword[&m_env.p.z]);
        }
    }

    if(m_sel.fb)
    {
        if(m_sel.edge || m_sel.tfx != TFX_NONE)
        {
            movaps(xmm4, xmmword[ebx + 32]); // v.t
        }

        if(m_sel.edge)
        {
            pshufhw(xmm3, xmm4, _MM_SHUFFLE(2, 2, 2, 2));
            pshufd(xmm3, xmm3, _MM_SHUFFLE(3, 3, 3, 3));
            psrlw(xmm3, 9);

            movdqa(xmmword[&m_env.temp.cov], xmm3);
        }

        if(m_sel.tfx != TFX_NONE)
        {
            if(m_sel.fst)
            {
                // GSVector4i vti(vt);

                cvttps2dq(xmm4, xmm4);

                // si = vti.xxxx() + m_env.d[skip].si;
                // ti = vti.yyyy(); if(!sprite) ti += m_env.d[skip].ti;

                pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));

                paddd(xmm2, xmmword[edx + 16 * 7]);

                if(!m_sel.sprite)
                {
                    paddd(xmm3, xmmword[edx + 16 * 8]);
                }
                else
                {
                    if(m_sel.ltf)
                    {
                        movdqa(xmm4, xmm3);
                        pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
                        pshufhw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
                        psrlw(xmm4, 1);
                        movdqa(xmmword[&m_env.temp.vf], xmm4);
                    }
                }

                movdqa(xmmword[&m_env.temp.s], xmm2);
                movdqa(xmmword[&m_env.temp.t], xmm3);
            }
            else
            {
                // s = vt.xxxx() + m_env.d[skip].s;
                // t = vt.yyyy() + m_env.d[skip].t;
                // q = vt.zzzz() + m_env.d[skip].q;

                movaps(xmm2, xmm4);
                movaps(xmm3, xmm4);

                shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
                shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
                shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));

                addps(xmm2, xmmword[edx + 16 * 1]);
                addps(xmm3, xmmword[edx + 16 * 2]);
                addps(xmm4, xmmword[edx + 16 * 3]);

                movaps(xmmword[&m_env.temp.s], xmm2);
                movaps(xmmword[&m_env.temp.t], xmm3);
                movaps(xmmword[&m_env.temp.q], xmm4);

                rcpps(xmm4, xmm4);
                mulps(xmm2, xmm4);
                mulps(xmm3, xmm4);
            }
        }

        if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
        {
            if(m_sel.iip)
            {
                // GSVector4i vc = GSVector4i(v.c);

                cvttps2dq(xmm6, xmmword[ebx]); // v.c

                // vc = vc.upl16(vc.zwxy());

                pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 0, 3, 2));
                punpcklwd(xmm6, xmm5);

                // rb = vc.xxxx().add16(m_env.d[skip].rb);
                // ga = vc.zzzz().add16(m_env.d[skip].ga);

                pshufd(xmm5, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));

                paddw(xmm5, xmmword[edx + 16 * 4]);
                paddw(xmm6, xmmword[edx + 16 * 5]);

                movdqa(xmmword[&m_env.temp.rb], xmm5);
                movdqa(xmmword[&m_env.temp.ga], xmm6);
            }
            else
            {
                if(m_sel.tfx == TFX_NONE)
                {
                    movdqa(xmm5, xmmword[&m_env.c.rb]);
                    movdqa(xmm6, xmmword[&m_env.c.ga]);
                }
            }
        }
    }
}
void GSDrawScanlineCodeGenerator::Step()
{
    // steps -= 4;

    sub(ecx, 4);

    // fza_offset++;

    add(edi, 8);

    if(!m_sel.sprite)
    {
        // z += m_env.d4.z;

        if(m_sel.zb)
        {
            movaps(xmm0, xmmword[&m_env.temp.z]);
            addps(xmm0, xmmword[&m_env.d4.z]);
            movaps(xmmword[&m_env.temp.z], xmm0);
        }

        // f = f.add16(m_env.d4.f);

        if(m_sel.fwrite && m_sel.fge)
        {
            movdqa(xmm1, xmmword[&m_env.temp.f]);
            paddw(xmm1, xmmword[&m_env.d4.f]);
            movdqa(xmmword[&m_env.temp.f], xmm1);
        }
    }
    else
    {
        if(m_sel.ztest)
        {
            movdqa(xmm0, xmmword[&m_env.p.z]);
        }
    }

    if(m_sel.fb)
    {
        if(m_sel.tfx != TFX_NONE)
        {
            if(m_sel.fst)
            {
                // GSVector4i st = m_env.d4.st;

                // si += st.xxxx();
                // if(!sprite) ti += st.yyyy();

                movdqa(xmm4, xmmword[&m_env.d4.st]);

                pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
                paddd(xmm2, xmmword[&m_env.temp.s]);
                movdqa(xmmword[&m_env.temp.s], xmm2);

                if(!m_sel.sprite)
                {
                    pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
                    paddd(xmm3, xmmword[&m_env.temp.t]);
                    movdqa(xmmword[&m_env.temp.t], xmm3);
                }
                else
                {
                    movdqa(xmm3, xmmword[&m_env.temp.t]);
                }
            }
            else
            {
                // GSVector4 stq = m_env.d4.stq;

                // s += stq.xxxx();
                // t += stq.yyyy();
                // q += stq.zzzz();

                movaps(xmm2, xmmword[&m_env.d4.stq]);
                movaps(xmm3, xmm2);
                movaps(xmm4, xmm2);

                shufps(xmm2, xmm2, _MM_SHUFFLE(0, 0, 0, 0));
                shufps(xmm3, xmm3, _MM_SHUFFLE(1, 1, 1, 1));
                shufps(xmm4, xmm4, _MM_SHUFFLE(2, 2, 2, 2));

                addps(xmm2, xmmword[&m_env.temp.s]);
                addps(xmm3, xmmword[&m_env.temp.t]);
                addps(xmm4, xmmword[&m_env.temp.q]);

                movaps(xmmword[&m_env.temp.s], xmm2);
                movaps(xmmword[&m_env.temp.t], xmm3);
                movaps(xmmword[&m_env.temp.q], xmm4);

                rcpps(xmm4, xmm4);
                mulps(xmm2, xmm4);
                mulps(xmm3, xmm4);
            }
        }

        if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
        {
            if(m_sel.iip)
            {
                // GSVector4i c = m_env.d4.c;

                // rb = rb.add16(c.xxxx());
                // ga = ga.add16(c.yyyy());

                movdqa(xmm7, xmmword[&m_env.d4.c]);

                pshufd(xmm5, xmm7, _MM_SHUFFLE(0, 0, 0, 0));
                pshufd(xmm6, xmm7, _MM_SHUFFLE(1, 1, 1, 1));

                paddw(xmm5, xmmword[&m_env.temp.rb]);
                paddw(xmm6, xmmword[&m_env.temp.ga]);

                movdqa(xmmword[&m_env.temp.rb], xmm5);
                movdqa(xmmword[&m_env.temp.ga], xmm6);
            }
            else
            {
                if(m_sel.tfx == TFX_NONE)
                {
                    movdqa(xmm5, xmmword[&m_env.c.rb]);
                    movdqa(xmm6, xmmword[&m_env.c.ga]);
                }
            }
        }
    }

    // test = m_test[7 + (steps & (steps >> 31))];

    mov(edx, ecx);
    sar(edx, 31);
    and(edx, ecx);
    shl(edx, 4);

    movdqa(xmm7, xmmword[edx + (size_t)&m_test[7]]);
}
Esempio n. 3
0
float TfirFilter::vec_inner_prod_sse(const float *eax, const float *edi, int ecx)
{
  __m128 xmm3,xmm4,xmm0,xmm1,xmm5,xmm6;
  xorps (xmm3, xmm3);
  xorps (xmm4, xmm4);

  ecx-=8;// sub $8, %%ecx
  if (ecx<0) goto //jb
   mul8_skip;

mul8_loop:
  movups (xmm0,eax);
  movups (xmm1,edi);
  movups (xmm5,16/sizeof(float)+eax);
  movups (xmm6,16/sizeof(float)+edi);
  eax+=32/sizeof(float);
  edi+=32/sizeof(float);
  mulps (xmm1,xmm0);
  mulps (xmm6,xmm5);
  addps (xmm3,xmm1);
  addps (xmm4,xmm6);

  ecx-=8;
  if (ecx>=0) //jae
   goto mul8_loop;

mul8_skip:

  addps (xmm3,xmm4);

  ecx+=4;
  if (ecx<0) //jl
   goto mul4_skip;

  movups (xmm0,eax);
  movups (xmm1,edi);
  eax+=16/sizeof(float);
  edi+=16/sizeof(float);
  mulps (xmm1, xmm0);
  addps (xmm3, xmm1);

  ecx-=4;

mul4_skip:

  ecx+=4;

  goto cond1;

mul1_loop:
  movss (xmm0,eax);
  movss (xmm1,edi);
  eax+=4/sizeof(float);
  edi+=4/sizeof(float);
  mulss (xmm1,xmm0);
  addss (xmm3,xmm1);

cond1:
  ecx-=1;
  if (ecx>=0) // jae
   goto mul1_loop;

  movhlps  (xmm4,xmm3);
  addps    (xmm3,xmm4);
  movaps   (xmm4,xmm3);
  //FIXME: which one?
  xmm4=_mm_shuffle_ps(xmm4,xmm4,0x55);// shufps $0x55, xmm4, xmm4
                                      // shufps $33, xmm4, xmm4
  addss    (xmm3, xmm4);
  float sum;
  movss    (&sum , xmm3);
  return sum;
}