void Predict_16x16_C(const NEW_GMC_DATA * const This, uint8_t *dst, const uint8_t *src, int dststride, int srcstride, int x, int y, int rounding) { const int W = This->sW; const int H = This->sH; const int rho = 3 - This->accuracy; const int Rounder = ( (1<<7) - (rounding<<(2*rho)) ) << 16; const int dUx = This->dU[0]; const int dVx = This->dV[0]; const int dUy = This->dU[1]; const int dVy = This->dV[1]; int Uo = This->Uo + 16*(dUy*y + dUx*x); int Vo = This->Vo + 16*(dVy*y + dVx*x); int i, j; dst += 16; { unsigned int ri = 16; unsigned int rj = 16; int Offset; int u,v; uint8_t *srctmp; uint32_t tmpf = 0; S32I2M(xr15,dUx); S32I2M(xr14,dVx); S32I2M(xr13,dUy); S32I2M(xr12,dVy); S32I2M(xr11,Uo); // Uo 11 S32I2M(xr10,Vo); // Vo 10 S32I2M(xr5, Rounder); for (j = 16; j>0; --j) { D32SLL(xr9,xr11,xr10,xr8, 0x0); // U 9 ,V 8 D32ASUM_AA(xr11,xr13,xr12,xr10); // += dUy; +=dVy; for (i = -16; i<0; ++i) { ri = 16; rj = 16; // ( U >> 16 ) ,( V >> 16 ) D32SAR(xr7,xr9,xr8,xr6, 0x8); D32SAR(xr7,xr7,xr6,xr6, 0x8); D32SLLV(xr7,xr6, rho); // << rho u = S32M2I(xr7); v = S32M2I(xr6); D32ASUM_AA(xr9,xr15,xr14,xr8); // U += dUx; V += dVx; if (u > 0 && u <= W) { ri = MTab[u&15]; Offset = u>>4; } else { if (u > W) Offset = W>>4; else Offset = 0; ri = MTab[0]; } if (v > 0 && v <= H) { rj = MTab[v&15]; Offset += (v>>4)*srcstride; }
/**
 * Dequantise a 4x4 coefficient block with the MXU media macros, widening
 * each coefficient of @p block to one 32-bit word of @p dst.
 *
 * Every value is produced by the same three macro steps: a D16MUL_* against
 * xr12, a D32ASUM_AA with xr13, and a D32SLR by 4.  xr12 and xr13 are read
 * but never written here, so the caller must have loaded them beforehand
 * (presumably the quantiser pair and the rounding constant -- TODO confirm
 * against the caller).  Only the very first word of the block goes through
 * D16MUL_XW instead of D16MUL_LW (presumably to apply a separate DC
 * quantiser to coefficient 0 -- TODO confirm).
 *
 * NOTE(review): D32SLR looks like a logical (not arithmetic) right shift;
 * confirm that negative products are handled as the consumer expects.
 *
 * @param block  source coefficients; 32 bytes starting at block are read,
 *               except for n == 1 where only the words at byte offsets
 *               0, 8, 16 and 24 are read
 * @param dst    destination; 64 bytes (four 16-byte rows) are always written
 * @param n      selects how much of the block is processed:
 *               - 1: the words at byte offsets 0/8/16/24 of block are
 *                    dequantised into the first two words of each dst row,
 *                    and the last two words of each row are stored from xr0
 *               - 2: the first 16 bytes of block are dequantised into dst
 *                    rows 0-1, and rows 2-3 are stored from xr0
 *               - any other value: all 32 bytes are dequantised into rows 0-3
 *               (xr0 is presumably the hard-wired zero register -- confirm)
 */
static void rv40_dequant4x4(DCTELEM *block, uint32_t *dst, int n)
{
    /*
     * The S32LDI / S32SDI macros are used with a base that is advanced
     * before each access (0x8 bytes for loads, 0x10 bytes for stores), so
     * both cursors start one step before the data.  The addresses are
     * computed as integers: this avoids both the implicit pointer-to-integer
     * conversion and forming a pointer in front of the buffer.
     */
    uint32_t src   = (uint32_t)((uintptr_t)block - 4 * sizeof(*block));
    uint32_t dst_t = (uint32_t)((uintptr_t)dst - 4 * sizeof(*dst));

    if (n == 1) {
        /* One word from each 8-byte step of the source. */
        S32LDI(xr1, src, 0x8);
        S32LDI(xr2, src, 0x8);
        S32LDI(xr7, src, 0x8);
        S32LDI(xr8, src, 0x8);

        /* Multiply by xr12, add xr13, shift right by 4. */
        D16MUL_XW(xr4, xr12, xr1, xr3);
        D16MUL_LW(xr10, xr12, xr7, xr9);
        D16MUL_LW(xr14, xr12, xr8, xr15);
        D32ASUM_AA(xr3, xr13, xr13, xr4);
        D16MUL_LW(xr6, xr12, xr2, xr5);
        D32SLR(xr3, xr3, xr4, xr4, 4);
        D32ASUM_AA(xr5, xr13, xr13, xr6);
        D32ASUM_AA(xr9, xr13, xr13, xr10);
        D32SLR(xr5, xr5, xr6, xr6, 4);
        D32SLR(xr9, xr9, xr10, xr10, 4);
        D32ASUM_AA(xr15, xr13, xr13, xr14);

        /* Row 0: two results, then two words from xr0. */
        S32SDI(xr3, dst_t, 0x10);
        S32STD(xr4, dst_t, 0x4);
        S32STD(xr0, dst_t, 0x8);
        S32STD(xr0, dst_t, 0xc);

        /* Row 1. */
        S32SDI(xr5, dst_t, 0x10);
        S32STD(xr6, dst_t, 0x4);
        S32STD(xr0, dst_t, 0x8);
        S32STD(xr0, dst_t, 0xc);

        D32SLR(xr15, xr15, xr14, xr14, 4);

        /* Row 2. */
        S32SDI(xr9, dst_t, 0x10);
        S32STD(xr10, dst_t, 0x4);
        S32STD(xr0, dst_t, 0x8);
        S32STD(xr0, dst_t, 0xc);

        /* Row 3. */
        S32SDI(xr15, dst_t, 0x10);
        S32STD(xr14, dst_t, 0x4);
        S32STD(xr0, dst_t, 0x8);
        S32STD(xr0, dst_t, 0xc);
    } else {
        /*
         * First 16 source bytes -> dst rows 0-1.  This sequence is the same
         * for n == 2 and for the full-block case, so it is shared.
         */
        S32LDI(xr1, src, 0x8);
        S32LDD(xr2, src, 0x4);
        S32LDI(xr7, src, 0x8);
        S32LDD(xr8, src, 0x4);

        D16MUL_XW(xr4, xr12, xr1, xr3);
        D16MUL_LW(xr10, xr12, xr7, xr9);
        D16MUL_LW(xr14, xr12, xr8, xr15);
        D32ASUM_AA(xr3, xr13, xr13, xr4);
        D16MUL_LW(xr6, xr12, xr2, xr5);
        D32SLR(xr3, xr3, xr4, xr4, 4);
        D32ASUM_AA(xr5, xr13, xr13, xr6);
        D32ASUM_AA(xr9, xr13, xr13, xr10);
        D32SLR(xr5, xr5, xr6, xr6, 4);
        D32SLR(xr9, xr9, xr10, xr10, 4);
        D32ASUM_AA(xr15, xr13, xr13, xr14);

        S32SDI(xr3, dst_t, 0x10);
        S32STD(xr4, dst_t, 0x4);
        S32STD(xr5, dst_t, 0x8);
        S32STD(xr6, dst_t, 0xc);

        D32SLR(xr15, xr15, xr14, xr14, 4);

        S32SDI(xr9, dst_t, 0x10);
        S32STD(xr10, dst_t, 0x4);
        S32STD(xr15, dst_t, 0x8);
        S32STD(xr14, dst_t, 0xc);

        if (n == 2) {
            /* Rows 2-3 are filled from xr0 without reading the source. */
            S32SDI(xr0, dst_t, 0x10);
            S32STD(xr0, dst_t, 0x4);
            S32STD(xr0, dst_t, 0x8);
            S32STD(xr0, dst_t, 0xc);

            S32SDI(xr0, dst_t, 0x10);
            S32STD(xr0, dst_t, 0x4);
            S32STD(xr0, dst_t, 0x8);
            S32STD(xr0, dst_t, 0xc);
        } else {
            /*
             * Remaining 16 source bytes -> dst rows 2-3.  Same steps as
             * above, except every word uses D16MUL_LW.
             */
            S32LDI(xr1, src, 0x8);
            S32LDD(xr2, src, 0x4);
            S32LDI(xr7, src, 0x8);
            S32LDD(xr8, src, 0x4);

            D16MUL_LW(xr4, xr12, xr1, xr3);
            D16MUL_LW(xr10, xr12, xr7, xr9);
            D16MUL_LW(xr14, xr12, xr8, xr15);
            D32ASUM_AA(xr3, xr13, xr13, xr4);
            D16MUL_LW(xr6, xr12, xr2, xr5);
            D32SLR(xr3, xr3, xr4, xr4, 4);
            D32ASUM_AA(xr5, xr13, xr13, xr6);
            D32ASUM_AA(xr9, xr13, xr13, xr10);
            D32SLR(xr5, xr5, xr6, xr6, 4);
            D32SLR(xr9, xr9, xr10, xr10, 4);
            D32ASUM_AA(xr15, xr13, xr13, xr14);

            S32SDI(xr3, dst_t, 0x10);
            S32STD(xr4, dst_t, 0x4);
            S32STD(xr5, dst_t, 0x8);
            S32STD(xr6, dst_t, 0xc);

            D32SLR(xr15, xr15, xr14, xr14, 4);

            S32SDI(xr9, dst_t, 0x10);
            S32STD(xr10, dst_t, 0x4);
            S32STD(xr15, dst_t, 0x8);
            S32STD(xr14, dst_t, 0xc);
        }
    }
}