示例#1
0
void Predict_16x16_C(const NEW_GMC_DATA * const This,
                     uint8_t *dst,
                     const uint8_t *src,
                     int dststride,
                     int srcstride,
                     int x,
                     int y,
                     int rounding)
{
    const int W       = This->sW;
    const int H	    = This->sH;
    const int rho     = 3 - This->accuracy;
    const int Rounder = ( (1<<7) - (rounding<<(2*rho)) ) << 16;

    const int dUx = This->dU[0];
    const int dVx = This->dV[0];
    const int dUy = This->dU[1];
    const int dVy = This->dV[1];

    int Uo = This->Uo + 16*(dUy*y + dUx*x);
    int Vo = This->Vo + 16*(dVy*y + dVx*x);

    int i, j;

    dst += 16;
    {
        unsigned int ri = 16;
        unsigned int rj = 16;
        int Offset;
        int u,v;

        uint8_t *srctmp;
        uint32_t tmpf = 0;

        S32I2M(xr15,dUx);
        S32I2M(xr14,dVx);
        S32I2M(xr13,dUy);
        S32I2M(xr12,dVy);

        S32I2M(xr11,Uo); // Uo 11
        S32I2M(xr10,Vo); // Vo 10

        S32I2M(xr5, Rounder);

        for (j = 16; j>0; --j)
        {
            D32SLL(xr9,xr11,xr10,xr8, 0x0); // U 9 ,V 8
            D32ASUM_AA(xr11,xr13,xr12,xr10); // += dUy; +=dVy;


            for (i = -16; i<0; ++i)
            {
                ri = 16;
                rj = 16;

                // ( U >> 16 ) ,( V >> 16 )
                D32SAR(xr7,xr9,xr8,xr6, 0x8);
                D32SAR(xr7,xr7,xr6,xr6, 0x8);

                D32SLLV(xr7,xr6, rho); // << rho

                u = S32M2I(xr7);
                v = S32M2I(xr6);
                D32ASUM_AA(xr9,xr15,xr14,xr8); // U += dUx; V += dVx;

                if (u > 0 && u <= W)
                {
                    ri = MTab[u&15];
                    Offset = u>>4;
                }
                else
                {
                    if (u > W)
                        Offset = W>>4;
                    else
                        Offset = 0;

                    ri = MTab[0];
                }

                if (v > 0 && v <= H)
                {
                    rj      = MTab[v&15];
                    Offset += (v>>4)*srcstride;
                }
示例#2
0
static void rv40_dequant4x4(DCTELEM *block,uint32_t *dst, int n)
{
  int i;
  uint32_t src=block-4;
  uint32_t dst_t = dst-4;
#if 0
  for(i = 0; i < n; i++){   
    S32LDI(xr1,src,0x8);
    S32LDD(xr2,src,0x4);
    S32LDI(xr7,src,0x8);
    S32LDD(xr8,src,0x4);

    D16MUL_LW(xr4,xr12,xr1,xr3);
    D16MUL_LW(xr10,xr12,xr7,xr9);
    D16MUL_LW(xr14,xr12,xr8,xr15);
    D32ASUM_AA(xr3,xr13,xr13,xr4);
    D16MUL_LW(xr6,xr12,xr2,xr5);
    D32SLR(xr3,xr3,xr4,xr4,4);
    D32ASUM_AA(xr5,xr13,xr13,xr6);
    D32ASUM_AA(xr9,xr13,xr13,xr10);
    D32SLR(xr5,xr5,xr6,xr6,4);
    D32SLR(xr9,xr9,xr10,xr10,4);
    D32ASUM_AA(xr15,xr13,xr13,xr14);

    S32SDI(xr3,dst_t,0x10);
    S32STD(xr4,dst_t,0x4);
    S32STD(xr5,dst_t,0x8);
    S32STD(xr6,dst_t,0xc);

    D32SLR(xr15,xr15,xr14,xr14,4);
    S32SDI(xr9,dst_t,0x10);
    S32STD(xr10,dst_t,0x4);
    S32STD(xr15,dst_t,0x8);
    S32STD(xr14,dst_t,0xc);      
  }
#else
  /////////////////////     
  if(n == 1){
    S32LDI(xr1,src,0x8);
    S32LDI(xr2,src,0x8);
    S32LDI(xr7,src,0x8);
    S32LDI(xr8,src,0x8);
	  
    D16MUL_XW(xr4,xr12,xr1,xr3);
    D16MUL_LW(xr10,xr12,xr7,xr9);
    D16MUL_LW(xr14,xr12,xr8,xr15);
    D32ASUM_AA(xr3,xr13,xr13,xr4);
    D16MUL_LW(xr6,xr12,xr2,xr5);
    D32SLR(xr3,xr3,xr4,xr4,4);
    D32ASUM_AA(xr5,xr13,xr13,xr6);
    D32ASUM_AA(xr9,xr13,xr13,xr10);
    D32SLR(xr5,xr5,xr6,xr6,4);
    D32SLR(xr9,xr9,xr10,xr10,4);
    D32ASUM_AA(xr15,xr13,xr13,xr14);

    S32SDI(xr3,dst_t,0x10);
    S32STD(xr4,dst_t,0x4);
    S32STD(xr0,dst_t,0x8);
    S32STD(xr0,dst_t,0xc);

    S32SDI(xr5,dst_t,0x10);
    S32STD(xr6,dst_t,0x4);
    S32STD(xr0,dst_t,0x8);
    S32STD(xr0,dst_t,0xc);

    D32SLR(xr15,xr15,xr14,xr14,4);
    S32SDI(xr9,dst_t,0x10);
    S32STD(xr10,dst_t,0x4);
    S32STD(xr0,dst_t,0x8);
    S32STD(xr0,dst_t,0xc);
    //S32STD(xr15,dst_t,0x8);
    //S32STD(xr14,dst_t,0xc);
    S32SDI(xr15,dst_t,0x10);
    S32STD(xr14,dst_t,0x4);
    S32STD(xr0,dst_t,0x8);
    S32STD(xr0,dst_t,0xc);
  }
  else if(n==2)
    {
      S32LDI(xr1,src,0x8);
      S32LDD(xr2,src,0x4);
      S32LDI(xr7,src,0x8);
      S32LDD(xr8,src,0x4);
	  
      D16MUL_XW(xr4,xr12,xr1,xr3);
      D16MUL_LW(xr10,xr12,xr7,xr9);
      D16MUL_LW(xr14,xr12,xr8,xr15);
      D32ASUM_AA(xr3,xr13,xr13,xr4);
      D16MUL_LW(xr6,xr12,xr2,xr5);
      D32SLR(xr3,xr3,xr4,xr4,4);
      D32ASUM_AA(xr5,xr13,xr13,xr6);
      D32ASUM_AA(xr9,xr13,xr13,xr10);
      D32SLR(xr5,xr5,xr6,xr6,4);
      D32SLR(xr9,xr9,xr10,xr10,4);
      D32ASUM_AA(xr15,xr13,xr13,xr14);

      S32SDI(xr3,dst_t,0x10);
      S32STD(xr4,dst_t,0x4);
      S32STD(xr5,dst_t,0x8);
      S32STD(xr6,dst_t,0xc);

      D32SLR(xr15,xr15,xr14,xr14,4);
      S32SDI(xr9,dst_t,0x10);
      S32STD(xr10,dst_t,0x4);
      S32STD(xr15,dst_t,0x8);
      S32STD(xr14,dst_t,0xc);

      S32SDI(xr0,dst_t,0x10);
      S32STD(xr0,dst_t,0x4);
      S32STD(xr0,dst_t,0x8);
      S32STD(xr0,dst_t,0xc);
	  
      S32SDI(xr0,dst_t,0x10);
      S32STD(xr0,dst_t,0x4);
      S32STD(xr0,dst_t,0x8);
      S32STD(xr0,dst_t,0xc);	  
    }

  else
    {
      S32LDI(xr1,src,0x8);
      S32LDD(xr2,src,0x4);
      S32LDI(xr7,src,0x8);
      S32LDD(xr8,src,0x4);

      D16MUL_XW(xr4,xr12,xr1,xr3);
      D16MUL_LW(xr10,xr12,xr7,xr9);
      D16MUL_LW(xr14,xr12,xr8,xr15);
      D32ASUM_AA(xr3,xr13,xr13,xr4);
      D16MUL_LW(xr6,xr12,xr2,xr5);
      D32SLR(xr3,xr3,xr4,xr4,4);
      D32ASUM_AA(xr5,xr13,xr13,xr6);
      D32ASUM_AA(xr9,xr13,xr13,xr10);
      D32SLR(xr5,xr5,xr6,xr6,4);
      D32SLR(xr9,xr9,xr10,xr10,4);
      D32ASUM_AA(xr15,xr13,xr13,xr14);

      S32SDI(xr3,dst_t,0x10);
      S32STD(xr4,dst_t,0x4);
      S32STD(xr5,dst_t,0x8);
      S32STD(xr6,dst_t,0xc);

      D32SLR(xr15,xr15,xr14,xr14,4);
      S32SDI(xr9,dst_t,0x10);
      S32STD(xr10,dst_t,0x4);
      S32STD(xr15,dst_t,0x8);
      S32STD(xr14,dst_t,0xc);

      S32LDI(xr1,src,0x8);
      S32LDD(xr2,src,0x4);
      S32LDI(xr7,src,0x8);
      S32LDD(xr8,src,0x4);

      D16MUL_LW(xr4,xr12,xr1,xr3);
      D16MUL_LW(xr10,xr12,xr7,xr9);
      D16MUL_LW(xr14,xr12,xr8,xr15);
      D32ASUM_AA(xr3,xr13,xr13,xr4);
      D16MUL_LW(xr6,xr12,xr2,xr5);
      D32SLR(xr3,xr3,xr4,xr4,4);
      D32ASUM_AA(xr5,xr13,xr13,xr6);
      D32ASUM_AA(xr9,xr13,xr13,xr10);
      D32SLR(xr5,xr5,xr6,xr6,4);
      D32SLR(xr9,xr9,xr10,xr10,4);
      D32ASUM_AA(xr15,xr13,xr13,xr14);

      S32SDI(xr3,dst_t,0x10);
      S32STD(xr4,dst_t,0x4);
      S32STD(xr5,dst_t,0x8);
      S32STD(xr6,dst_t,0xc);

      D32SLR(xr15,xr15,xr14,xr14,4);
      S32SDI(xr9,dst_t,0x10);
      S32STD(xr10,dst_t,0x4);
      S32STD(xr15,dst_t,0x8);
      S32STD(xr14,dst_t,0xc);      
    }
#endif

}