// MODE 3
static void pred4x4_down_left_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
                                  uint8_t *top, uint8_t *topleft)
{
    // load
    S32LDD(xr1, top, 0x0);            // xr1 <- t3, t2, t1, t0 ;
    S32LDD(xr2, topright, 0x0);       // xr2 <- t7, t6, t5, t4 ;
    S32LDDR(xr15, topright, 0x0);     // xr15 <- t4, t5, t6, t7 ;
    S32ALNI(xr3, xr2, xr1, ptn2);     // xr3: t5, t4, t3, t2 ;
    Q8AVG(xr4, xr1, xr3);
    S32ALNI(xr5, xr2, xr1, ptn3);     // xr5: t4, t3, t2, t1 ;
    Q8AVGR(xr6, xr4, xr5);
    S32ALNI(xr7, xr2, xr1, ptn1);     // xr7: t6, t5, t4, t3 ;
    S32ALNI(xr8, xr15, xr2, ptn3);    // xr8: t7, t7, t6, t5 ;
    Q8AVG(xr9, xr7, xr8);
    Q8AVGR(xr10, xr9, xr2);
    D32SLL(xr11, xr6, xr0, xr0, 0x8);
    S32ALNI(xr12, xr10, xr11, ptn1);
    S32ALNI(xr13, xr10, xr11, ptn2);
    // store
    S32STD(xr6, dst, 0x0);
    S32SDIV(xr13, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIV(xr12, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIV(xr10, dst, MB_LUMA_EDGED_WIDTH, 0x0);
}
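/*
 * Assumed scalar reference for the kernel above (a sketch, not part of the
 * original file): H.264 builds its diagonal predictions from 2-tap and 3-tap
 * filters.  avg2/avg3 are hypothetical helpers reused by the sketches that
 * follow; the Q8AVG/Q8AVGR pair above computes (((a + c) >> 1) + b + 1) >> 1
 * per byte, a close equivalent of avg3 (exact whenever a + c is even).
 */
#define avg2(a, b)    (((a) + (b) + 1) >> 1)
#define avg3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)

/* Down-left: pred(x,y) = avg3(t[x+y], t[x+y+1], t[x+y+2]), with t[] clamped
 * at t7 (hence the "t7, t7, t6, t5" alignment above). */
static void pred4x4_down_left_ref(uint8_t *p, int stride, const uint8_t t[8])
{
    int x, y;
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            int i = x + y;
            p[y * stride + x] = avg3(t[i], t[i + 1], t[i + 2 > 7 ? 7 : i + 2]);
        }
}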
// MODE 5
static void pred4x4_vertical_right_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
                                       uint8_t *top, uint8_t *topleft)
{
    uint8_t *src_left;   // left address
    src_left = src - 0x4;
    // load top
    S32LDD(xr8, top, 0x0);       // xr8: t3, t2, t1, t0 ; high -> low, [31->0];
    // load left
    S32LDD(xr1, topleft, -0x4);  // xr1[31:24] <- src_topleft[3] (lt) ;
    S32LDD(xr2, src_left, 0x0);  // xr2[31:24] <- src_topleft[stride+3] (l0) ;
    S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_topleft[2*stride+3] (l1) ;
    S32LDIV(xr4, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr4[31:24] <- src_topleft[3*stride+3] (l2) ;
    S32SFL(xr5, xr2, xr1, xr0, ptn2); // xr5[31:16] <- l0, lt ;
    S32SFL(xr6, xr4, xr3, xr0, ptn2); // xr6[31:16] <- l2, l1 ;
    S32SFL(xr7, xr3, xr2, xr0, ptn2); // xr7[31:16] <- l1, l0 ;
    // alni
    S32ALNI(xr3, xr8, xr1, ptn1); // xr3: t2, t1, t0, lt ;
    S32ALNI(xr4, xr3, xr2, ptn1); // xr4: t1, t0, lt, l0 ;
    // cal
    Q8AVGR(xr1, xr3, xr8);        // xr1: src[3,0], src[2,0], src[1,0], src[0,0] (row 0) ;
    Q8AVG(xr9, xr4, xr8);
    Q8AVGR(xr2, xr9, xr3);        // xr2: src[3,1], src[2,1], src[1,1], src[0,1] (row 1) ;
    Q8AVG(xr10, xr5, xr6);
    Q8AVGR(xr11, xr10, xr7);      // xr11: src[0,3], src[0,2], ~, ~ ;
    // alni
    S32ALNI(xr12, xr2, xr11, ptn1);
    D32SLL(xr13, xr11, xr0, xr0, 0x8);
    S32ALNI(xr14, xr1, xr13, ptn1);
    // store
    S32STD(xr1, dst, 0x0);
    S32SDIV(xr2, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIV(xr14, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIV(xr12, dst, MB_LUMA_EDGED_WIDTH, 0x0);
}
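/*
 * Assumed reference for mode 5 (vertical-right), reusing the avg2/avg3
 * helpers sketched after MODE 3 above: row 0 is the 2-tap average of
 * adjacent top samples, row 1 the 3-tap filter, and rows 2/3 repeat rows
 * 0/1 shifted right by one pixel with left-column samples filtered in.
 */
static void pred4x4_vertical_right_ref(uint8_t *p, int stride, uint8_t lt,
                                       const uint8_t t[4], const uint8_t l[4])
{
    p[0] = avg2(lt, t[0]);    p[1] = avg2(t[0], t[1]);
    p[2] = avg2(t[1], t[2]);  p[3] = avg2(t[2], t[3]);
    p[stride + 0] = avg3(l[0], lt, t[0]);    p[stride + 1] = avg3(lt, t[0], t[1]);
    p[stride + 2] = avg3(t[0], t[1], t[2]);  p[stride + 3] = avg3(t[1], t[2], t[3]);
    p[2 * stride + 0] = avg3(lt, l[0], l[1]);
    p[2 * stride + 1] = p[0];  p[2 * stride + 2] = p[1];  p[2 * stride + 3] = p[2];
    p[3 * stride + 0] = avg3(l[0], l[1], l[2]);
    p[3 * stride + 1] = p[stride + 0];
    p[3 * stride + 2] = p[stride + 1];
    p[3 * stride + 3] = p[stride + 2];
}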
// MODE 6
static void pred4x4_horizontal_down_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
                                        uint8_t *top, uint8_t *topleft)
{
    uint8_t *src_left;   // left address
    src_left = src - 0x4;
    // load TOP
    S32LDDR(xr8, top, 0x0);         // xr8[31:0]: t0, t1, t2, t3 ;
    S32LDDR(xr15, topleft, -0x4);   // xr15[7:0]: lt ;
    S32LDD(xr9, topleft, -0x4);     // xr9[31:24]: lt ;
    S32ALNI(xr10, xr15, xr8, ptn3); // xr10[31:0]: lt, t0, t1, t2 ;
    // load LEFT
    S32LDDR(xr1, src_left, 0x0);    // xr1[7:0] <- src_left[3] (l0) ;
    S32LDIVR(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr2[7:0] <- src_left[stride+3] (l1) ;
    S32LDIVR(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[7:0] <- src_left[2*stride+3] (l2) ;
    S32LDIVR(xr4, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr4[7:0] <- src_left[3*stride+3] (l3) ;
    S32SFL(xr0, xr2, xr1, xr5, ptn2); // xr5[15:0] <- l1, l0 ;
    S32SFL(xr0, xr4, xr3, xr6, ptn2); // xr6[15:0] <- l3, l2 ;
    S32SFL(xr0, xr6, xr5, xr7, ptn3); // xr7[31:0] <- l3, l2, l1, l0 ;
    // ALNI for CAL
    S32ALNI(xr11, xr7, xr9, ptn1);    // xr11: l2, l1, l0, lt ;
    S32ALNI(xr12, xr1, xr10, ptn3);   // xr12: l0, lt, t0, t1 ;
    D32SLL(xr0, xr0, xr11, xr13, 0x8);// xr13: l1, l0, lt, 0 ;
    // CAL
    Q8AVGR(xr1, xr11, xr7);  // xr1: src[0,3], src[0,2]/src[2,3], src[0,1]/src[2,2], src[0,0]/src[2,1] ;
    Q8AVG(xr2, xr12, xr8);
    Q8AVGR(xr3, xr2, xr10);  // xr3: src[1,0]/src[3,1], src[2,0], src[3,0], ~ ;
    Q8AVG(xr4, xr13, xr7);
    Q8AVGR(xr5, xr4, xr11);  // xr5: src[1,3], src[1,2]/src[3,3], src[1,1]/src[3,2], ~ ;
    // ALNI for STORE
    S32ALNI(xr8, xr1, xr3, ptn3);      // xr8: src[0,0]/src[2,1], src[1,0]/src[3,1], src[2,0], src[3,0] ;
    S32SFL(xr9, xr1, xr5, xr10, ptn0); // xr9: src[0,3], src[1,3], src[0,2]/src[2,3], src[1,2]/src[3,3] ;
                                       // xr10: src[0,1]/src[2,2], src[1,1]/src[3,2], src[0,0]/src[2,1], ~ ;
    S32SFL(xr11, xr10, xr8, xr0, ptn3);// xr11: src[0,1]/src[2,2], src[1,1]/src[3,2],
                                       //       src[0,0]/src[2,1], src[1,0]/src[3,1] ;
    S32ALNI(xr12, xr9, xr10, ptn2);    // xr12: src[0,2]/src[2,3], src[1,2]/src[3,3],
                                       //       src[0,1]/src[2,2], src[1,1]/src[3,2] ;
    // STORE
    S32STDR(xr8, dst, 0x0);
    S32SDIVR(xr11, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIVR(xr12, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIVR(xr9, dst, MB_LUMA_EDGED_WIDTH, 0x0);
}
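/*
 * Assumed reference for mode 6 (horizontal-down), matching the src[x,y]
 * duplication noted in the register comments above: column 0 holds 2-tap
 * averages of the left samples, column 1 the 3-tap filters, and columns
 * 2/3 repeat the row above shifted down (avg2/avg3 from the MODE 3 sketch).
 */
static void pred4x4_horizontal_down_ref(uint8_t *p, int stride, uint8_t lt,
                                        const uint8_t t[4], const uint8_t l[4])
{
    p[0*stride+0] = avg2(lt,  l[0]);   p[0*stride+1] = avg3(t[0], lt,  l[0]);
    p[0*stride+2] = avg3(t[1], t[0], lt);
    p[0*stride+3] = avg3(t[2], t[1], t[0]);
    p[1*stride+0] = avg2(l[0], l[1]);  p[1*stride+1] = avg3(lt,  l[0], l[1]);
    p[1*stride+2] = p[0*stride+0];     p[1*stride+3] = p[0*stride+1];
    p[2*stride+0] = avg2(l[1], l[2]);  p[2*stride+1] = avg3(l[0], l[1], l[2]);
    p[2*stride+2] = p[1*stride+0];     p[2*stride+3] = p[1*stride+1];
    p[3*stride+0] = avg2(l[2], l[3]);  p[3*stride+1] = avg3(l[1], l[2], l[3]);
    p[3*stride+2] = p[2*stride+0];     p[3*stride+3] = p[2*stride+1];
}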
// MODE 4
static void pred4x4_down_right_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
                                   uint8_t *top, uint8_t *topleft)
{
    uint8_t *src_left;   // left address
    src_left = src - 0x4;
    // load top
    S32LDDR(xr8, top, 0x0);      // xr8: t0, t1, t2, t3 ; high -> low, [31->0];
    S32LDDR(xr9, topleft, -0x4); // xr9[7:0]: lt ;
    // load left
    S32LDD(xr7, topleft, -0x4);  // xr7[31:24]: lt ;
    S32LDD(xr1, src_left, 0x0);  // xr1[31:24] <- src_left[3] (l0) ;
    S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr2[31:24] <- src_left[stride+3] (l1) ;
    S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_left[2*stride+3] (l2) ;
    S32LDIV(xr4, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr4[31:24] <- src_left[3*stride+3] (l3) ;
    S32SFL(xr5, xr2, xr1, xr0, ptn2); // xr5[31:16] <- l1, l0 ;
    S32SFL(xr6, xr4, xr3, xr0, ptn2); // xr6[31:16] <- l3, l2 ;
    S32SFL(xr1, xr6, xr5, xr0, ptn3); // xr1[31: 0] <- l3, l2, l1, l0 ;
    // alni
    S32ALNI(xr10, xr9, xr8, ptn3);  // xr10: lt, t0, t1, t2 ;
    S32ALNI(xr11, xr1, xr7, ptn1);  // xr11: l2, l1, l0, lt ;
    S32ALNI(xr12, xr11, xr8, ptn2); // xr12: l0, lt, t0, t1 ;
    S32ALNI(xr13, xr1, xr10, ptn2); // xr13: l1, l0, lt, t0 ;
    // cal
    Q8AVG(xr3, xr1, xr13);
    Q8AVGR(xr4, xr3, xr11); // xr4: src[0,3], src[0,2]/src[1,3], src[0,1]/src[1,2]/src[2,3],
                            //      src[0,0]/src[1,1]/src[2,2]/src[3,3] ;
    Q8AVG(xr5, xr8, xr12);
    Q8AVGR(xr6, xr5, xr10); // xr6: src[0,0]/src[1,1]/src[2,2]/src[3,3],
                            //      src[1,0]/src[2,1]/src[3,2], src[2,0]/src[3,1], src[3,0] ;
    // alni for store
    D32SLL(xr7, xr6, xr0, xr0, 0x8); // xr7: src[1,0]/src[2,1]/src[3,2], src[2,0]/src[3,1], src[3,0], 0 ;
    S32ALNI(xr8, xr4, xr7, ptn1);
    S32ALNI(xr9, xr4, xr7, ptn2);
    // store
    S32STDR(xr6, dst, 0x0);
    S32SDIVR(xr9, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIVR(xr8, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIVR(xr4, dst, MB_LUMA_EDGED_WIDTH, 0x0);
}
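/*
 * Assumed reference for mode 4 (down-right), reusing avg3 from the MODE 3
 * sketch: with the edge packed as e[] = { l3, l2, l1, l0, lt, t0, t1, t2, t3 },
 * pred(x,y) = avg3(e[3+x-y], e[4+x-y], e[5+x-y]), which produces exactly the
 * diagonal value-sharing listed in the register comments above.
 */
static void pred4x4_down_right_ref(uint8_t *p, int stride, const uint8_t e[9])
{
    int x, y;
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++)
            p[y * stride + x] = avg3(e[3 + x - y], e[4 + x - y], e[5 + x - y]);
}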
// MODE 8
static void pred4x4_horizontal_up_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
                                      uint8_t *top, uint8_t *topleft)
{
    uint8_t *src_left;   // left address
    src_left = src - 0x4;
    // load
    S32LDD(xr1, src_left, 0x0);  // xr1[31:24] <- src_left[3] (l0) ;
    S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr2[31:24] <- src_left[stride+3] (l1) ;
    S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_left[2*stride+3] (l2) ;
    S32LDIV(xr4, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr4[31:24] <- src_left[3*stride+3] (l3) ;
    S32SFL(xr5, xr2, xr1, xr0, ptn2); // xr5[31:16] <- l1, l0 ;
    S32SFL(xr6, xr4, xr3, xr0, ptn2); // xr6[31:16] <- l3, l2 ;
    S32SFL(xr1, xr6, xr5, xr0, ptn3); // xr1[31: 0] <- l3, l2, l1, l0 ;
    D32SLL(xr2, xr1, xr0, xr0, 0x8);  // xr2: l2, l1, l0, 0 ;
    S32SFL(xr3, xr1, xr1, xr0, ptn0); // xr3: l3, l3, l2, l2 ;
    Q8AVGR(xr4, xr1, xr2);  // xr4: src[2,1]/src[0,2], src[2,0]/src[0,1], src[0,0], ~ ;
    Q8AVG(xr5, xr2, xr3);
    Q8AVGR(xr6, xr5, xr1);  // xr6: src[3,1]/src[1,2], src[3,0]/src[1,1], src[1,0], ~ ;
    S32SFL(xr7, xr6, xr4, xr0, ptn0); // xr7: src[3,1]/src[1,2], src[2,1]/src[0,2],
                                      //      src[3,0]/src[1,1], src[2,0]/src[0,1] ;
    D32SLR(xr8, xr4, xr6, xr9, 0x8);  // xr8: 0, src[2,1]/src[0,2], src[2,0]/src[0,1], src[0,0] ;
                                      // xr9: 0, src[3,1]/src[1,2], src[3,0]/src[1,1], src[1,0] ;
    S32SFL(xr0, xr9, xr8, xr10, ptn0);// xr10: src[3,0], src[2,0], src[1,0], src[0,0] ;
    S32SFL(xr11, xr3, xr7, xr0, ptn3);// xr11: l3, l3, src[3,1]/src[1,2], src[2,1]/src[0,2] ;
    S32SFL(xr12, xr3, xr3, xr0, ptn3);// xr12: l3, l3, l3, l3 ;
    // store
    S32STD(xr10, dst, 0x0);
    S32SDIV(xr7, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIV(xr11, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SDIV(xr12, dst, MB_LUMA_EDGED_WIDTH, 0x0);
}
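/*
 * Assumed reference for mode 8 (horizontal-up), reusing avg2/avg3 from the
 * MODE 3 sketch: the prediction interpolates down the left column, and every
 * pixel at or past the lower-right boundary saturates to l3 (the
 * "l3, l3, l3, l3" row stored last above).
 */
static void pred4x4_horizontal_up_ref(uint8_t *p, int stride, const uint8_t l[4])
{
    int x, y;
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            int i = y + (x >> 1);   /* index of the first left sample used */
            if (x + 2 * y >= 6)
                p[y * stride + x] = l[3];
            else if (x & 1)
                p[y * stride + x] = avg3(l[i], l[i + 1], l[i + 2 > 3 ? 3 : i + 2]);
            else
                p[y * stride + x] = avg2(l[i], l[i + 1]);
        }
}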
void imdct_half_fix_c(MDCTContext_fix *s, FFTSample_fix *output, const FFTSample_fix *input)
{
    //PMON_ON(qmf);
    int k, n8, n4, n2, n, j, j1;
    const FFTSample_fix *in1, *in2;
    const unsigned short *revtab = s->fft.revtab;
    const FFTSample_fix *tcos = s->tcos;
    const FFTSample_fix *tsin = s->tsin;
    FFTComplex_fix *z = (FFTComplex_fix *)output;

    n = 1 << s->nbits; // 64
    n2 = n >> 1;       // 32
    n4 = n >> 2;       // 16
    n8 = n >> 3;       // 8

    /* pre rotation */
    in1 = input;           // head
    in2 = input + n2 - 1;  // tail
    for (k = 0; k < n8; k++) {
#if 0
        j = revtab[k];
        FFT_CMUL_fix(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
        in1 += 2;
        in2 -= 2;
#else
        FFTSample_fix _are, _bre, _aim, _bim, are, aim;
        _are = *in2;
        _bre = tcos[k];
        _aim = *in1;
        _bim = tsin[k];
        j = revtab[k];
        n = n4 - k - 1;
        j1 = revtab[n];
        S32MUL(xr1, xr2, _are, _bre);
        S32MUL(xr3, xr4, _are, _bim);
        in2--;
        are = *in2;
        S32MUL(xr7, xr8, are, _bre);
        S32MUL(xr9, xr10, are, _bim);
        S32MSUB(xr1, xr2, _aim, _bim);
        S32MADD(xr3, xr4, _aim, _bre);
        in1++;
        aim = *in1;
        D32SLL(xr5, xr1, xr3, xr6, 1);
        S32MSUB(xr7, xr8, aim, _bim);
        S32MADD(xr9, xr10, aim, _bre);
        z[j].re = S32M2I(xr5);
        D32SLL(xr11, xr7, xr9, xr12, 1);
        z[j].im = S32M2I(xr6);
        in1++;
        in2--;
        z[j1].re = S32M2I(xr11);
        z[j1].im = S32M2I(xr12);
#endif
    }
    s->fft.fft_calc(&s->fft, z);

    /* post rotation + reordering */
    /* XXX: optimize */
    for (k = 0; k < n8; k++) {
        FFTSample_fix r0, i0, r1, i1;
        FFT_CMUL_fix(r0, i1, z[n8-k-1].im, z[n8-k-1].re, tsin[n8-k-1], tcos[n8-k-1]);
        FFT_CMUL_fix(r1, i0, z[n8+k  ].im, z[n8+k  ].re, tsin[n8+k  ], tcos[n8+k  ]);
        z[n8-k-1].re = r0;
        z[n8-k-1].im = i0;
        z[n8+k  ].re = r1;
        z[n8+k  ].im = i1;
    }
    //PMON_OFF(qmf);
}
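/*
 * What each S32MUL/S32MSUB/S32MADD + D32SLL group above computes (a sketch
 * of the assumed FFT_CMUL_fix semantics for Q31 samples, not part of the
 * original file): a 64-bit accumulated complex multiply whose high words,
 * shifted left by one, yield the fixed-point product.
 */
static inline void fft_cmul_fix_ref(FFTSample_fix *pre, FFTSample_fix *pim,
                                    FFTSample_fix are, FFTSample_fix aim,
                                    FFTSample_fix bre, FFTSample_fix bim)
{
    /* hi32(product) << 1, matching S32MUL/S32MSUB then D32SLL(..., 1) */
    *pre = (FFTSample_fix)((((long long)are * bre - (long long)aim * bim) >> 32) << 1);
    *pim = (FFTSample_fix)((((long long)are * bim + (long long)aim * bre) >> 32) << 1);
}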
void Predict_16x16_C(const NEW_GMC_DATA * const This,
                     uint8_t *dst, const uint8_t *src,
                     int dststride, int srcstride, int x, int y, int rounding)
{
    const int W = This->sW;
    const int H = This->sH;
    const int rho = 3 - This->accuracy;
    const int Rounder = ((1 << 7) - (rounding << (2 * rho))) << 16;
    const int dUx = This->dU[0];
    const int dVx = This->dV[0];
    const int dUy = This->dU[1];
    const int dVy = This->dV[1];
    int Uo = This->Uo + 16 * (dUy * y + dUx * x);
    int Vo = This->Vo + 16 * (dVy * y + dVx * x);
    int i, j;

    dst += 16;
    {
        unsigned int ri = 16;
        unsigned int rj = 16;
        int Offset;
        int u, v;
        uint8_t *srctmp;
        uint32_t tmpf = 0;

        S32I2M(xr15, dUx);
        S32I2M(xr14, dVx);
        S32I2M(xr13, dUy);
        S32I2M(xr12, dVy);
        S32I2M(xr11, Uo);    // Uo -> xr11
        S32I2M(xr10, Vo);    // Vo -> xr10
        S32I2M(xr5, Rounder);

        for (j = 16; j > 0; --j) {
            D32SLL(xr9, xr11, xr10, xr8, 0x0);   // U -> xr9, V -> xr8
            D32ASUM_AA(xr11, xr13, xr12, xr10);  // Uo += dUy; Vo += dVy;
            for (i = -16; i < 0; ++i) {
                ri = 16;
                rj = 16;
                // (U >> 16), (V >> 16)
                D32SAR(xr7, xr9, xr8, xr6, 0x8);
                D32SAR(xr7, xr7, xr6, xr6, 0x8);
                D32SLLV(xr7, xr6, rho);           // << rho
                u = S32M2I(xr7);
                v = S32M2I(xr6);
                D32ASUM_AA(xr9, xr15, xr14, xr8); // U += dUx; V += dVx;

                if (u > 0 && u <= W) {
                    ri = MTab[u & 15];
                    Offset = u >> 4;
                } else {
                    if (u > W)
                        Offset = W >> 4;
                    else
                        Offset = 0;
                    ri = MTab[0];
                }

                if (v > 0 && v <= H) {
                    rj = MTab[v & 15];
                    Offset += (v >> 4) * srcstride;
                }
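/*
 * The inner loop above derives a 16.16 source coordinate (u,v) per pixel and
 * fractional weights from MTab.  A plain-C illustration of the bilinear blend
 * it feeds (an assumed sketch with unpacked 4-bit weights wu/wv in [0,16];
 * XviD's actual MTab packing and rounding differ in detail):
 */
static inline uint8_t gmc_blend_sketch(const uint8_t *s, int stride,
                                       int wu, int wv)
{
    int top = s[0]      * (16 - wu) + s[1]          * wu;  /* row v      */
    int bot = s[stride] * (16 - wu) + s[stride + 1] * wu;  /* row v + 1  */
    return (uint8_t)((top * (16 - wv) + bot * wv + 128) >> 8);
}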
uint32_t dequant_h263_intra_mxu(int16_t *data, uint8_t yuv_len, const uint32_t quant,
                                const uint32_t dcscalar, const uint16_t *mpeg_quant_matrices)
{
    uint32_t i = 0;

    S32LUI(xr9, 1, 0);
    S32I2M(xr1, quant);
    D32SLL(xr5, xr1, xr0, xr0, 1); // quant_m_2
    /* quant_add */
    S32AND(xr15, xr1, xr9);
    S32MOVN(xr2, xr15, xr1);
    D32ADD_SS(xr1, xr1, xr9, xr3);
    S32MOVZ(xr2, xr15, xr1);
    S32I2M(xr3, -2048);
    S32I2M(xr4, 2047);

    /* part 1 */
    //S32MUL(xr4, xr6, *data, dcscalar);
    S32MUL(xr0, xr6, (int32_t)data[0], dcscalar);
    D16MUL_WW(xr0, xr6, xr9, xr6);
    S32MIN(xr6, xr6, xr4);
    S32MAX(xr6, xr6, xr3);

    /* part 2 */
    yuv_len = ((yuv_len & ~1) + 3) >> 1;
    data -= 2;
    for (i = 0; i < yuv_len; i++) {
        S32LDI(xr1, data, 4);
        D16MUL_LW(xr13, xr9, xr1, xr14); // save the signs of data[i] and data[i+1]
        D16CPS(xr1, xr1, xr1);
        /* quant_m_2 * acLevel + quant_add */
        D16MUL_LW(xr7, xr5, xr1, xr8);
        D32ADD_AA(xr7, xr7, xr2, xr0);
        D32ADD_AA(xr8, xr8, xr2, xr0);
#if 0
        /* -2048 < data[i+1] < 2047 */
        S32CPS(xr7, xr7, xr13);
        S32MAX(xr10, xr7, xr3);
        S32MIN(xr10, xr10, xr4);
        S32MOVZ(xr10, xr13, xr13);
        /* -2048 < data[i] < 2047 */
        S32CPS(xr8, xr8, xr14);
        S32MAX(xr11, xr8, xr3);
        S32MIN(xr11, xr11, xr4);
        S32MOVZ(xr11, xr14, xr14);
#else
        /* -2048 < data[i+1] < 2047 */
        S32AND(xr7, xr7, xr4);
        S32CPS(xr10, xr7, xr13);
        S32MOVZ(xr10, xr13, xr13);
        /* -2048 < data[i] < 2047 */
        S32AND(xr8, xr8, xr4);
        S32CPS(xr11, xr8, xr14);
        S32MOVZ(xr11, xr14, xr14);
#endif
        S32SFL(xr0, xr10, xr11, xr12, 3);
        S32STD(xr12, data, 0);
    }
    S16STD(xr6, data - (yuv_len * 2 - 2), 0, 0); // data[0]
    return 0;
}
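/*
 * Scalar equivalent of the routine above (a sketch, following the #if 0
 * clamping path and the xr3/xr4 constants): H.263 inverse quantisation with
 * quant_add = quant for odd quant and quant - 1 for even quant, a separately
 * scaled DC coefficient, and results clamped to [-2048, 2047].
 */
static void dequant_h263_intra_ref(int16_t *data, int len,
                                   uint32_t quant, uint32_t dcscalar)
{
    int32_t add = (quant & 1) ? (int32_t)quant : (int32_t)quant - 1;
    int32_t dc  = (int32_t)data[0] * (int32_t)dcscalar;
    int i;

    data[0] = (int16_t)(dc < -2048 ? -2048 : dc > 2047 ? 2047 : dc);
    for (i = 1; i < len; i++) {
        int level = data[i];
        if (level) {                      /* zero levels pass through (S32MOVZ) */
            int v = 2 * (int)quant * (level < 0 ? -level : level) + add;
            if (level < 0)
                v = -v;
            data[i] = (int16_t)(v < -2048 ? -2048 : v > 2047 ? 2047 : v);
        }
    }
}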
void fft_calc_fix_inverse(FFTContext_fix *s, FFTComplex_fix *z)
{
    int ln = s->nbits;
    int j, np, np2;
    int nblocks, nloops;
    register FFTComplex_fix *p, *q;
    FFTComplex_fix *exptab = s->exptab;
    int l;
    FFTSample_fix tmp_re, tmp_im;

    np = 1 << ln;
    /* butterflies for all 4 passes, e.g. N = 16 */
    /* pass 0 */
#if 0
    p = &z[0];
    j = (np >> 1);
    do {
        /* X(k) = G(k) + H(k)*W  (W = e^(j*0) = 1) */
        FFT_BF_fix(p[0].re, p[0].im, p[1].re, p[1].im,
                   p[0].re, p[0].im, p[1].re, p[1].im);
        p += 2;
    } while (--j);
#endif

    /* pass 1 */
    p = &z[0];
    j = np >> 2;
    do {
#if 1
        S32LDD(xr1, p, 0);   S32LDD(xr2, p, 4);
        S32LDD(xr3, p, 8);   S32LDD(xr4, p, 12);
        S32LDD(xr5, p, 16);  S32LDD(xr6, p, 20);
        S32LDD(xr7, p, 24);  S32LDD(xr8, p, 28);
        D32ADD_AS(xr1, xr1, xr3, xr3);
        D32ADD_AS(xr2, xr2, xr4, xr4);
        D32ADD_AS(xr5, xr5, xr7, xr7);
        D32ADD_AS(xr6, xr6, xr8, xr8);
        D32ADD_AS(xr1, xr1, xr5, xr5);
        D32ADD_AS(xr2, xr2, xr6, xr6);
        D32ADD_SA(xr3, xr3, xr8, xr9);
        D32ADD_AS(xr4, xr4, xr7, xr8);
        S32STD(xr1, p, 0);   S32STD(xr2, p, 4);
        S32STD(xr3, p, 8);   S32STD(xr4, p, 12);
        S32STD(xr5, p, 16);  S32STD(xr6, p, 20);
        S32STD(xr9, p, 24);  S32STD(xr8, p, 28);
#else
        FFT_BF_fix(p[0].re, p[0].im, p[1].re, p[1].im,
                   p[0].re, p[0].im, p[1].re, p[1].im);
        FFT_BF_fix(p[2].re, p[2].im, p[3].re, p[3].im,
                   p[2].re, p[2].im, p[3].re, p[3].im);
        FFT_BF_fix(p[0].re, p[0].im, p[2].re, p[2].im,
                   p[0].re, p[0].im, p[2].re, p[2].im);
        FFT_BF_fix(p[1].re, p[1].im, p[3].re, p[3].im,
                   p[1].re, p[1].im, -p[3].im, p[3].re);
#endif
        p += 4;
    } while (--j);

    /* passes 2 .. ln-1 */
    nblocks = np >> 3;
    nloops = 1 << 2;
    np2 = np >> 1;
    do {
        p = z;
        q = z + nloops;
        for (j = 0; j < nblocks; ++j) {
#if 1
            S32LDD(xr1, p, 0);  S32LDD(xr2, p, 4);
            S32LDD(xr3, q, 0);  S32LDD(xr4, q, 4);
            D32ADD_AS(xr1, xr1, xr3, xr3);
            D32ADD_AS(xr2, xr2, xr4, xr4);
            S32STD(xr1, p, 0);  S32STD(xr2, p, 4);
            S32STD(xr3, q, 0);  S32STD(xr4, q, 4);
#else
            FFT_BF_fix(p->re, p->im, q->re, q->im,
                       p->re, p->im, q->re, q->im);
#endif
            p++;
            q++;
            for (l = nblocks; l < np2; l += nblocks) {
                /* FFT_CMUL_fix(): H(i) * e^(-j*2*PI*km/N) */
#if 1
                FFTSample_fix _are = exptab[l].re;
                FFTSample_fix _bre = q->re;
                FFTSample_fix _aim = exptab[l].im;
                FFTSample_fix _bim = q->im;
                S32MUL(xr1, xr2, _are, _bre);
                S32MUL(xr5, xr6, _are, _bim);
                S32LDD(xr7, p, 0);
                S32MSUB(xr1, xr2, _aim, _bim);
                S32MADD(xr5, xr6, _aim, _bre);
                S32LDD(xr8, p, 4);
                D32SLL(xr1, xr1, xr5, xr5, 1);
                D32ADD_AS(xr7, xr7, xr1, xr1);
                D32ADD_AS(xr8, xr8, xr5, xr5);
                S32STD(xr7, p, 0);
                S32STD(xr8, p, 4);
                S32STD(xr1, q, 0);
                S32STD(xr5, q, 4);
#else
                FFT_CMUL_fix(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
                FFT_BF_fix(p->re, p->im, q->re, q->im,
                           p->re, p->im, tmp_re, tmp_im);
#endif
                p++;
                q++;
            }
            p += nloops;
            q += nloops;
        }
        nblocks = nblocks >> 1;
        nloops = nloops << 1;
    } while (nblocks);
}
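/*
 * The D32ADD_AS groups above are the radix-2 butterfly; a scalar sketch of
 * the assumed FFT_BF_fix semantics (P' = P + Q, Q' = P - Q on re/im), with
 * the pass-1 special case folding in the -j twiddle (swap re/im, negate)
 * that the D32ADD_SA/D32ADD_AS pair encodes:
 */
static inline void fft_bf_ref(FFTComplex_fix *p, FFTComplex_fix *q)
{
    FFTSample_fix re = q->re, im = q->im;
    q->re = p->re - re;   q->im = p->im - im;   /* Q' = P - Q */
    p->re += re;          p->im += im;          /* P' = P + Q */
}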
/*
 * NAME:        layer->II()
 * DESCRIPTION: decode a single Layer II frame
 */
int mad_layer_II(struct mad_stream *stream, struct mad_frame *frame)
{
  struct mad_header *header = &frame->header;
  struct mad_bitptr start;
  unsigned int index, sblimit, nbal, nch, bound, gr, ch, s, sb;
  unsigned char const *offsets;
  unsigned char allocation[2][32], scfsi[2][32], scalefactor[2][32][3];
  mad_fixed_t samples[3];

  nch = MAD_NCHANNELS(header);

  if (header->flags & MAD_FLAG_LSF_EXT)
    index = 4;
  else if (header->flags & MAD_FLAG_FREEFORMAT)
    goto freeformat;
  else {
    unsigned long bitrate_per_channel;

    bitrate_per_channel = header->bitrate;
    if (nch == 2) {
      bitrate_per_channel /= 2;

# if defined(OPT_STRICT)
      /*
       * ISO/IEC 11172-3 allows only single channel mode for 32, 48, 56, and
       * 80 kbps bitrates in Layer II, but some encoders ignore this
       * restriction. We enforce it if OPT_STRICT is defined.
       */
      if (bitrate_per_channel <= 28000 || bitrate_per_channel == 40000) {
        stream->error = MAD_ERROR_BADMODE;
        return -1;
      }
# endif
    }
    else {  /* nch == 1 */
      if (bitrate_per_channel > 192000 && bitrate_per_channel != 320000) {
        /*
         * ISO/IEC 11172-3 does not allow single channel mode for 224, 256,
         * 320, or 384 kbps bitrates in Layer II.
         */
        stream->error = MAD_ERROR_BADMODE;
        return -1;
      }
    }

    if (bitrate_per_channel <= 48000)
      index = (header->samplerate == 32000) ? 3 : 2;
    else if (bitrate_per_channel <= 80000)
      index = 0;
    else {
    freeformat:
      index = (header->samplerate == 48000) ? 0 : 1;
    }
  }

  sblimit = sbquant_table[index].sblimit;
  offsets = sbquant_table[index].offsets;

  bound = 32;
  if (header->mode == MAD_MODE_JOINT_STEREO) {
    header->flags |= MAD_FLAG_I_STEREO;
    bound = 4 + header->mode_extension * 4;
  }

  if (bound > sblimit)
    bound = sblimit;

  start = stream->ptr;

  /* decode bit allocations */
  for (sb = 0; sb < bound; ++sb) {
    nbal = bitalloc_table[offsets[sb]].nbal;
    for (ch = 0; ch < nch; ++ch)
      allocation[ch][sb] = mad_bit_read(&stream->ptr, nbal);
  }

  for (sb = bound; sb < sblimit; ++sb) {
    nbal = bitalloc_table[offsets[sb]].nbal;
    allocation[0][sb] = allocation[1][sb] = mad_bit_read(&stream->ptr, nbal);
  }

  /* decode scalefactor selection info */
  for (sb = 0; sb < sblimit; ++sb) {
    for (ch = 0; ch < nch; ++ch) {
      if (allocation[ch][sb])
        scfsi[ch][sb] = mad_bit_read(&stream->ptr, 2);
    }
  }

  /* check CRC word */
  if (header->flags & MAD_FLAG_PROTECTION) {
    header->crc_check =
      mad_bit_crc(start, mad_bit_length(&start, &stream->ptr),
                  header->crc_check);
    if (header->crc_check != header->crc_target &&
        !(frame->options & MAD_OPTION_IGNORECRC)) {
      stream->error = MAD_ERROR_BADCRC;
      return -1;
    }
  }

  /* decode scalefactors */
  for (sb = 0; sb < sblimit; ++sb) {
    for (ch = 0; ch < nch; ++ch) {
      if (allocation[ch][sb]) {
        scalefactor[ch][sb][0] = mad_bit_read(&stream->ptr, 6);

        switch (scfsi[ch][sb]) {
        case 2:
          scalefactor[ch][sb][2] =
          scalefactor[ch][sb][1] =
          scalefactor[ch][sb][0];
          break;

        case 0:
          scalefactor[ch][sb][1] = mad_bit_read(&stream->ptr, 6);
          /* fall through */

        case 1:
        case 3:
          scalefactor[ch][sb][2] = mad_bit_read(&stream->ptr, 6);
        }

        if (scfsi[ch][sb] & 1)
          scalefactor[ch][sb][1] = scalefactor[ch][sb][scfsi[ch][sb] - 1];

# if defined(OPT_STRICT)
        /*
         * Scalefactor index 63 does not appear in Table B.1 of
         * ISO/IEC 11172-3. Nonetheless, other implementations accept it,
         * so we only reject it if OPT_STRICT is defined.
         */
        if (scalefactor[ch][sb][0] == 63 ||
            scalefactor[ch][sb][1] == 63 ||
            scalefactor[ch][sb][2] == 63) {
          stream->error = MAD_ERROR_BADSCALEFACTOR;
          return -1;
        }
# endif
      }
    }
  }

  /* decode samples */
  for (gr = 0; gr < 12; ++gr) {
    for (sb = 0; sb < bound; ++sb) {
      for (ch = 0; ch < nch; ++ch) {
        if ((index = allocation[ch][sb])) {
#ifdef JZ4750_OPT
          mad_fixed_t sf_val;
          mad_fixed_t *sb_ptr;

          /* one row early: S32SDIV pre-increments by 32*4 bytes (one
             32-sample subband row) before each store */
          sb_ptr = &(frame->sbsample[ch][3*gr-1][sb]);
          sf_val = sf_table[scalefactor[ch][sb][gr/4]];

          index = offset_table[bitalloc_table[offsets[sb]].offset][index - 1];
          II_samples(&stream->ptr, &qc_table[index], samples);

          S32MUL(xr1, xr2, samples[0], sf_val);
          S32MUL(xr3, xr4, samples[1], sf_val);
          S32MUL(xr5, xr6, samples[2], sf_val);
          S32EXTR(xr1, xr2, (32 - MAD_F_SCALEBITS), 31);
          S32EXTR(xr3, xr4, (32 - MAD_F_SCALEBITS), 31);
          S32EXTR(xr5, xr6, (32 - MAD_F_SCALEBITS), 31);
          D32SLL(xr1, xr1, xr3, xr3, 1);
          D32SLL(xr5, xr5, xr0, xr0, 1);
          S32SDIV(xr1, sb_ptr, 32, 2);
          S32SDIV(xr3, sb_ptr, 32, 2);
          S32SDIV(xr5, sb_ptr, 32, 2);
#else
          index = offset_table[bitalloc_table[offsets[sb]].offset][index - 1];
          II_samples(&stream->ptr, &qc_table[index], samples);

          for (s = 0; s < 3; ++s) {
            frame->sbsample[ch][3 * gr + s][sb] =
              mad_f_mul(samples[s], sf_table[scalefactor[ch][sb][gr / 4]]);
          }
#endif
        }
        else {
          for (s = 0; s < 3; ++s)
            frame->sbsample[ch][3 * gr + s][sb] = 0;
        }
      }
    }

    for (sb = bound; sb < sblimit; ++sb) {
      if ((index = allocation[0][sb])) {
        index = offset_table[bitalloc_table[offsets[sb]].offset][index - 1];
        II_samples(&stream->ptr, &qc_table[index], samples);

        for (ch = 0; ch < nch; ++ch) {
#ifdef JZ4750_OPT
          mad_fixed_t sf_val;
          mad_fixed_t *sb_ptr;

          sb_ptr = &(frame->sbsample[ch][3*gr-1][sb]);
          sf_val = sf_table[scalefactor[ch][sb][gr/4]];

          S32MUL(xr1, xr2, samples[0], sf_val);
          S32MUL(xr3, xr4, samples[1], sf_val);
          S32MUL(xr5, xr6, samples[2], sf_val);
          S32EXTR(xr1, xr2, (32 - MAD_F_SCALEBITS), 31);
          S32EXTR(xr3, xr4, (32 - MAD_F_SCALEBITS), 31);
          S32EXTR(xr5, xr6, (32 - MAD_F_SCALEBITS), 31);
          D32SLL(xr1, xr1, xr3, xr3, 1);
          D32SLL(xr5, xr5, xr0, xr0, 1);
          S32SDIV(xr1, sb_ptr, 32, 2);
          S32SDIV(xr3, sb_ptr, 32, 2);
          S32SDIV(xr5, sb_ptr, 32, 2);
#else
          for (s = 0; s < 3; ++s) {
            frame->sbsample[ch][3 * gr + s][sb] =
              mad_f_mul(samples[s], sf_table[scalefactor[ch][sb][gr / 4]]);
          }
#endif
        }
      }
      else {
        for (ch = 0; ch < nch; ++ch) {
          for (s = 0; s < 3; ++s)
            frame->sbsample[ch][3 * gr + s][sb] = 0;
        }
      }
    }

    for (ch = 0; ch < nch; ++ch) {
      for (s = 0; s < 3; ++s) {
        for (sb = sblimit; sb < 32; ++sb)
          frame->sbsample[ch][3 * gr + s][sb] = 0;
      }
    }
  }

  return 0;
}
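/*
 * What the JZ4750_OPT S32MUL/S32EXTR/D32SLL sequence above replaces (a
 * sketch of libmad's fixed-point scaling, ignoring mad_f_mul()'s optional
 * rounding): extract the 32-bit window of the 64-bit product starting at
 * bit MAD_F_SCALEBITS.
 */
static inline mad_fixed_t mad_f_mul_ref(mad_fixed_t x, mad_fixed_t y)
{
  return (mad_fixed_t)(((long long)x * y) >> MAD_F_SCALEBITS);
}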
// MODE 3
static void pred16x16_plane_mxu(uint8_t *dst, uint8_t *src, uint8_t *top)
{
    int i;
    uint8_t *src_top;                 // top address
    uint8_t *src_topleft, *src_left;  // left address
    src_top = top;
    src_topleft = src_top - 0x14;
    src_left = src - 0x4;
    //----- H, LOAD -----
    S32LDD(xr1, src_top, -0x14);  // xr1 <- src_top[-4]; xr1: lt, 0, 0, 0 ;
    S32LDD(xr5, src_top, 0x0);    // xr5 <- src_top[0] ; xr5: t3, t2, t1, t0 ;
    S32LDD(xr2, src_top, 0x4);    // xr2 <- src_top[4] ; xr2: t7, t6, t5, t4 ;
    S32LDDR(xr3, src_top, 0x8);   // xr3 <- src_top[8] ; xr3: t8, t9, t10, t11 ;
    S32LDDR(xr4, src_top, 0xc);   // xr4 <- src_top[12]; xr4: t12, t13, t14, t15 ;
    S32ALNI(xr1, xr5, xr1, ptn1); // xr1: t2, t1, t0, lt ;
    S32ALNI(xr2, xr2, xr5, ptn1); // xr2: t6, t5, t4, t3 ; --- xr5 is free to use ;
    S32I2M(xr9, MUL_12);          // xr9 : 0x00010002 ;
    S32I2M(xr10, MUL_34);         // xr10: 0x00030004 ;
    //----- H, SUM -----
    Q8ADDE_SS(xr5, xr3, xr2, xr6); // xr5[31:16] <- t8-t6 ; xr5[15:0] <- t9-t5 ;
                                   // xr6[31:16] <- t10-t4; xr6[15:0] <- t11-t3;
    S32I2M(xr11, MUL_56);          // xr11: 0x00050006 ;
    D16MUL_WW(xr13, xr9, xr5, xr14);     // xr13 <- 1*(t8-t6) ; xr14 <- 2*(t9-t5) ;
    D16MAC_AA_WW(xr13, xr10, xr6, xr14); // xr13 <- 1*(t8-t6)+3*(t10-t4) ;
                                         // xr14 <- 2*(t9-t5)+4*(t11-t3) ;
    Q8ADDE_SS(xr5, xr4, xr1, xr6); // xr5[31:16] <- t12-t2; xr5[15:0] <- t13-t1;
                                   // xr6[31:16] <- t14-t0; xr6[15:0] <- t15-lt;
    S32I2M(xr12, MUL_78);          // xr12: 0x00070008 ;
    D16MAC_AA_WW(xr13, xr11, xr5, xr14); // xr13 <- 1*(t8-t6)+3*(t10-t4)+5*(t12-t2) ;
                                         // xr14 <- 2*(t9-t5)+4*(t11-t3)+6*(t13-t1) ;
    D16MAC_AA_WW(xr13, xr12, xr6, xr14); // xr13 <- 1*(t8-t6)+3*(t10-t4)+5*(t12-t2)+7*(t14-t0) ;
                                         // xr14 <- 2*(t9-t5)+4*(t11-t3)+6*(t13-t1)+8*(t15-lt) ;
    S32LDD(xr1, src_topleft, 0x0); // xr1[31:24] <- src_topleft[3] (lt) ;
    S32LDD(xr2, src_left, 0x0);    // xr2[31:24] <- src_topleft[stride+3] (l0) ;
    D32ADD_AA(xr15, xr13, xr14, xr0); // xr15 <- 1*(t8-t6)+3*(t10-t4)+5*(t12-t2)+7*(t14-t0)
                                      //       + 2*(t9-t5)+4*(t11-t3)+6*(t13-t1)+8*(t15-lt) ;
    //----- V, LOAD -----
    // S32LDD(xr1, src_topleft, 0x0);          // xr1[31:24] <- src_topleft[3] (lt) ;
    // S32LDIV(xr2, src_topleft, stride, 0x0); // xr2[31:24] <- src_topleft[stride+3] (l0) ;
    S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_topleft[2*stride+3] (l1) ;
    S32LDIV(xr8, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr8[31:24] <- src_topleft[3*stride+3] (l2) ;
    S32SFL(xr5, xr2, xr1, xr0, ptn2);  // xr5[31:16] <- l0, lt ;
    S32SFL(xr6, xr8, xr3, xr0, ptn2);  // xr6[31:16] <- l2, l1 ;
    S32SFL(xr7, xr6, xr5, xr0, ptn3);  // xr7[31: 0] <- l2, l1, l0, lt ;
    S32LDIV(xr1, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32LDIV(xr8, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SFL(xr5, xr2, xr1, xr0, ptn2);
    S32SFL(xr6, xr8, xr3, xr0, ptn2);
    S32SFL(xr13, xr6, xr5, xr0, ptn3); // xr13[31:0] <- l6, l5, l4, l3 ;
    src_left += MB_LUMA_EDGED_WIDTH;   // skip l7 (unused by the plane sums)
    S32LDIV(xr8, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32LDIV(xr1, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SFL(xr6, xr8, xr3, xr0, ptn2);
    S32SFL(xr5, xr2, xr1, xr0, ptn2);
    S32SFL(xr14, xr6, xr5, xr0, ptn3); // xr14[31:0] <- l8, l9, l10, l11 ;
    S32LDIV(xr8, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32LDIV(xr1, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
    S32SFL(xr6, xr8, xr3, xr0, ptn2);
    S32SFL(xr5, xr2, xr1, xr0, ptn2);
    S32SFL(xr1, xr6, xr5, xr0, ptn3);  // xr1[31:0] <- l12, l13, l14, l15 ;
    //----- V, SUM -----
    Q8ADDE_SS(xr5, xr14, xr13, xr6);
    Q8ADDE_SS(xr2, xr1, xr7, xr3);
    D16MUL_WW(xr13, xr9, xr5, xr14);
    D16MAC_AA_WW(xr13, xr10, xr6, xr14);
    D16MAC_AA_WW(xr13, xr11, xr2, xr14);
    D16MAC_AA_WW(xr13, xr12, xr3, xr14);
    D32SLR(xr2, xr11, xr12, xr3, 0x8);  // xr2: 0x00000500 ; xr3: 0x00000700 ;
    D32SLR(xr11, xr2, xr3, xr12, 0x8);  // xr11: 0x00000005 ; xr12: 0x00000007 ;
    D32ADD_AA(xr14, xr13, xr14, xr0);   // xr14 <- 1*(l8-l6)+3*(l10-l4)+5*(l12-l2)+7*(l14-l0)
                                        //       + 2*(l9-l5)+4*(l11-l3)+6*(l13-l1)+8*(l15-lt) ;
    //----- P, CAL -----
    // D32SLR(xr2, xr11, xr12, xr3, 0x8); // xr2: 0x00000500 ; xr3: 0x00000700 ;
    // D32SLR(xr11, xr2, xr3, xr12, 0x8); // xr11: 0x00000005 ; xr12: 0x00000007 ;
    D16MUL_WW(xr0, xr15, xr11, xr2);  // xr2: 5*H ;
    D16MUL_WW(xr0, xr14, xr11, xr3);  // xr3: 5*V ;
    D32SLR(xr8, xr11, xr0, xr0, 0x2); // xr8: 0x00000001 ;
    D32SLL(xr13, xr8, xr0, xr0, 0x5); // xr13: 0x00000020 ;
    Q8ACCE_AA(xr0, xr1, xr4, xr8);    // xr8[15:0]: src1[0] + src2[16] + 1
    D32ADD_AA(xr5, xr2, xr13, xr0);   // xr5: 5*H+32 ;
    D32ADD_AA(xr6, xr3, xr13, xr0);   // xr6: 5*V+32 ;
    D32SLR(xr2, xr5, xr6, xr3, 0x6);  // xr2: ( 5*H+32 ) >> 6 ; xr3: ( 5*V+32 ) >> 6 ;
    // Q8ACCE_AA(xr0, xr1, xr4, xr8); // xr8[15:0]: src1[0] + src2[16] + 1
    D32SLL(xr5, xr8, xr0, xr0, 0x4);  // xr5[15:0]: 16*(src1[0] + src2[16] + 1)
    Q16ADD_AA_WW(xr7, xr2, xr3, xr0); // xr7: V+H
    // S32NOR(xr0, xr0, xr0);         // idle
    S32I2M(xr4, MUX_H16);             // xr4: 0x0000ffff ;
    D16MUL_WW(xr0, xr7, xr12, xr8);   // xr8: 7*(V+H)
    S32SFL(xr0, xr3, xr3, xr14, ptn3);// xr14[31:16]: V ; xr14[15:0]: V ;
    D32SLL(xr7, xr2, xr0, xr0, 0x1);
    Q16ADD_SS_WW(xr9, xr5, xr8, xr0); // xr9: 16*(src1[0] + src2[16] + 1) - 7*(V+H)
    S32SFL(xr0, xr9, xr9, xr5, ptn3); // xr5[31:16]: a ; xr5[15:0]: a ;
    // S32SFL(xr0, xr3, xr3, xr14, ptn3); // xr14[31:16]: V ; xr14[15:0]: V ;
    // D32SLL(xr7, xr2, xr0, xr0, 0x1);
    S32SFL(xr0, xr7, xr7, xr8, ptn3); // xr8[31:16]: 2H ; xr8[15:0]: 2H ;
    S32AND(xr2, xr4, xr2);
    Q16ADD_AA_WW(xr15, xr5, xr2, xr0);// xr15[31:16]: a ; xr15[15:0]: a + H ;
    dst -= MB_LUMA_EDGED_WIDTH;
    //----- SRC, STORE -----
    // two pixels per 16-bit lane: xr8 = (2H, 2H) steps x, xr14 = (V, V)
    // steps y; Q16SAR >> 5 then Q16SAT clamps to 0..255 and packs bytes
    for (i = 0; i < 16; i++) {
        Q16ADD_AA_WW(xr1, xr15, xr8, xr0);
        Q16ADD_AA_WW(xr2, xr1, xr8, xr0);
        Q16SAR(xr9, xr15, xr1, xr1, 0x5);
        Q16ADD_AA_WW(xr3, xr2, xr8, xr0);
        Q16SAT(xr10, xr9, xr1);
        Q16ADD_AA_WW(xr4, xr3, xr8, xr0);
        Q16SAR(xr2, xr2, xr3, xr3, 0x5);
        Q16ADD_AA_WW(xr5, xr4, xr8, xr0);
        Q16SAT(xr11, xr2, xr3);
        Q16ADD_AA_WW(xr6, xr5, xr8, xr0);
        Q16SAR(xr4, xr4, xr5, xr5, 0x5);
        Q16ADD_AA_WW(xr7, xr6, xr8, xr0);
        Q16SAR(xr6, xr6, xr7, xr7, 0x5);
        Q16SAT(xr12, xr4, xr5);
        Q16SAT(xr13, xr6, xr7);
        S32SDIVR(xr10, dst, MB_LUMA_EDGED_WIDTH, 0x0);
        S32STDR(xr11, dst, 0x4);
        S32STDR(xr12, dst, 0x8);
        // S32STDR(xr13, dst, 0xc);
        Q16ADD_AA_WW(xr15, xr15, xr14, xr0);
        S32STDR(xr13, dst, 0xc);
    }
}
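/*
 * Scalar reference for the plane predictor above (assumed, per the H.264
 * spec): with t[] the row above, l[] the column to the left, and lt the
 * corner sample,
 *   H = sum_{i=1..8} i * (t[7+i] - t[7-i])   (t[-1] taken as lt)
 *   V = sum_{i=1..8} i * (l[7+i] - l[7-i])   (l[-1] taken as lt)
 *   a = 16 * (l[15] + t[15] + 1),  b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
 *   pred(x,y) = clip255((a + b*(x-7) + c*(y-7)) >> 5)
 * which folds the spec's +16 rounding into a, as the MXU code does.
 */
static void pred16x16_plane_ref(uint8_t *p, int stride, uint8_t lt,
                                const uint8_t t[16], const uint8_t l[16])
{
    int H = 0, V = 0, i, x, y;
    for (i = 1; i <= 8; i++) {
        H += i * (t[7 + i] - (i == 8 ? lt : t[7 - i]));
        V += i * (l[7 + i] - (i == 8 ? lt : l[7 - i]));
    }
    {
        int a = 16 * (l[15] + t[15] + 1);
        int b = (5 * H + 32) >> 6, c = (5 * V + 32) >> 6;
        for (y = 0; y < 16; y++)
            for (x = 0; x < 16; x++) {
                int v = (a + b * (x - 7) + c * (y - 7)) >> 5;
                p[y * stride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
    }
}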
// MODE 3
static void pred8x8_plane_mxu(uint8_t *dst, uint8_t *src, uint8_t *top)
{
    unsigned int i;
    uint8_t *src_top;                 // top address
    uint8_t *src_topleft, *src_left;  // left address
    src_top = top;
    src_topleft = src_top - 0x1c;
    src_left = src - 0x4;
    //----- H, LOAD -----
    S32LDD(xr1, src_top, -0x1c);  // xr1 <- src_top[-4]; xr1: lt, 0, 0, 0 ;
    S32LDD(xr3, src_top, 0x0);    // xr3 <- src_top[0] ; xr3: t3, t2, t1, t0 ;
    S32LDDR(xr2, src_top, 0x4);   // xr2 <- src_top[4] ; xr2: t4, t5, t6, t7 ;
    S32ALNI(xr1, xr3, xr1, ptn1); // xr1: t2, t1, t0, lt ;
    S32I2M(xr8, MUL_12);          // xr8: 0x00010002 ;
    S32I2M(xr9, MUL_34);          // xr9: 0x00030004 ;
    //----- H, SUM -----
    Q8ADDE_SS(xr3, xr2, xr1, xr4); // xr3[31:16] <- t4-t2 ; xr3[15:0] <- t5-t1 ;
                                   // xr4[31:16] <- t6-t0 ; xr4[15:0] <- t7-lt;
    S32LDD(xr1, src_topleft, 0x0); // xr1[31:24] <- src_topleft[3] (lt) ;
    D16MUL_WW(xr5, xr8, xr3, xr6);    // xr5 <- 1*(t4-t2) ; xr6 <- 2*(t5-t1) ;
    D16MAC_AA_WW(xr5, xr9, xr4, xr6); // xr5 <- 1*(t4-t2)+3*(t6-t0) ; xr6 <- 2*(t5-t1)+4*(t7-lt) ;
    S32LDD(xr12, src_left, 0x0);   // xr12[31:24] <- src_topleft[stride+3] (l0) ;
    S32LDIV(xr3, src_left, MB_CHROM_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_topleft[2*stride+3] (l1) ;
    D32ADD_AA(xr7, xr5, xr6, xr0); // xr7 <- 1*(t4-t2)+3*(t6-t0)+2*(t5-t1)+4*(t7-lt) ;
    //----- V, LOAD -----
    // S32LDD(xr1, src_topleft, 0x0);           // xr1[31:24] <- src_topleft[3] (lt) ;
    // S32LDIV(xr12, src_topleft, stride, 0x0); // xr12[31:24] <- src_topleft[stride+3] (l0) ;
    // S32LDIV(xr3, src_topleft, stride, 0x0);  // xr3[31:24] <- src_topleft[2*stride+3] (l1) ;
    S32LDIV(xr4, src_left, MB_CHROM_EDGED_WIDTH, 0x0); // xr4[31:24] <- src_topleft[3*stride+3] (l2) ;
    S32SFL(xr5, xr12, xr1, xr0, ptn2); // xr5[31:16] <- l0, lt ;
    S32SFL(xr6, xr4, xr3, xr0, ptn2);  // xr6[31:16] <- l2, l1 ;
    S32SFL(xr10, xr6, xr5, xr0, ptn3); // xr10[31:0] <- l2, l1, l0, lt ;
    src_left += MB_CHROM_EDGED_WIDTH;  // skip l3 (unused by the plane sums)
    S32LDIV(xr4, src_left, MB_CHROM_EDGED_WIDTH, 0x0);
    S32LDIV(xr3, src_left, MB_CHROM_EDGED_WIDTH, 0x0);
    S32LDIV(xr12, src_left, MB_CHROM_EDGED_WIDTH, 0x0);
    S32LDIV(xr1, src_left, MB_CHROM_EDGED_WIDTH, 0x0);
    S32SFL(xr6, xr4, xr3, xr0, ptn2);
    S32SFL(xr5, xr12, xr1, xr0, ptn2);
    S32SFL(xr11, xr6, xr5, xr0, ptn3); // xr11[31:0] <- l4, l5, l6, l7 ;
    //----- V, SUM -----
    Q8ADDE_SS(xr3, xr11, xr10, xr4);
    S32LUI(xr1, 0x1, ptn0);            // xr1[31:0]: 0x00000001 ;
    D16MUL_WW(xr5, xr8, xr3, xr6);
    D16MAC_AA_WW(xr5, xr9, xr4, xr6);
    D32ADD_AA(xr13, xr5, xr6, xr0);    // xr13 <- 1*(l4-l2)+3*(l6-l0)+2*(l5-l1)+4*(l7-lt) ;
    //----- P, CAL ----- useful XRs: xr13, xr7, xr2, xr11 ;
    // S32LUI(xr1, 0x1, ptn0);         // xr1[31:0]: 0x00000001 ;
    D32SLL(xr5, xr1, xr1, xr6, 0x4);   // xr5: 0x00000010; xr6: 0x00000010;
    D32SLL(xr3, xr13, xr7, xr4, 0x4);
    D32ACC_AA(xr5, xr13, xr3, xr0);    // xr5: 17*V+16
    D32ACC_AA(xr6, xr7, xr4, xr0);     // xr6: 17*H+16
    Q8ACCE_AA(xr0, xr2, xr11, xr1);    // xr1[15:0]: src1[0] + src2[8] + 1
    D32SLR(xr8, xr5, xr6, xr9, 0x5);   // xr8: (17*V+16) >> 5 ; xr9: (17*H+16) >> 5 ;
    // Q8ACCE_AA(xr0, xr2, xr11, xr1); // xr1[15:0]: src1[0] + src2[8] + 1
    D32SLL(xr2, xr1, xr0, xr0, 0x4);   // xr2[15:0]: 16*(src1[0] + src2[8] + 1)
    Q16ADD_AA_WW(xr7, xr8, xr9, xr0);  // xr7: V+H
    S32I2M(xr4, MUX_H16);              // xr4: 0x0000ffff ;
    D32SLL(xr12, xr7, xr0, xr0, 0x1);
    D32ADD_AA(xr5, xr12, xr7, xr0);    // xr5: 3*(V+H)
    // S32LUI(xr12, 0x3, ptn0);        // xr12[31:0]: 0x00000003 ;
    // D16MUL_WW(xr0, xr7, xr12, xr5); // xr5: 3*(V+H)
    Q16ADD_SS_WW(xr6, xr2, xr5, xr0);  // xr6: 16*(src1[0] + src2[8] + 1) - 3*(V+H)
    S32SFL(xr0, xr8, xr8, xr14, ptn3); // xr14[31:16]: V ; xr14[15:0]: V ;
    S32SFL(xr0, xr6, xr6, xr5, ptn3);  // xr5[31:16]: a ; xr5[15:0]: a ;
    D32SLL(xr7, xr9, xr0, xr0, 0x1);
    S32SFL(xr0, xr7, xr7, xr8, ptn3);  // xr8[31:16]: 2H ; xr8[15:0]: 2H ;
    S32AND(xr9, xr4, xr9);
    Q16ADD_AA_WW(xr15, xr5, xr9, xr0); // xr15[31:16]: a ; xr15[15:0]: a + H ;
    dst -= MB_CHROM_EDGED_WIDTH;
    //----- SRC, STORE -----
    for (i = 0; i < 8; i++) {
        Q16ADD_AA_WW(xr1, xr15, xr8, xr0);
        Q16ADD_AA_WW(xr2, xr1, xr8, xr0);
        Q16SAR(xr9, xr15, xr1, xr1, 0x5);
        Q16ADD_AA_WW(xr3, xr2, xr8, xr0);
        Q16SAT(xr10, xr9, xr1);
        // Q16SAR(xr9, xr15, xr1, xr1, 0x5);
        Q16SAR(xr2, xr2, xr3, xr3, 0x5);
        // Q16SAT(xr10, xr9, xr1);
        Q16SAT(xr11, xr2, xr3);
        S32SDIVR(xr10, dst, MB_CHROM_EDGED_WIDTH, 0x0);
        Q16ADD_AA_WW(xr15, xr15, xr14, xr0);
        S32STDR(xr11, dst, 0x4);
    }
}
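/*
 * The 8x8 chroma plane predictor differs from the 16x16 reference above only
 * in its constants (assumed, per the spec, and matching the "17*V+16 >> 5"
 * and "3*(V+H)" register comments):
 */
static void pred8x8_plane_ref(uint8_t *p, int stride, uint8_t lt,
                              const uint8_t t[8], const uint8_t l[8])
{
    int H = 0, V = 0, i, x, y;
    for (i = 1; i <= 4; i++) {
        H += i * (t[3 + i] - (i == 4 ? lt : t[3 - i]));
        V += i * (l[3 + i] - (i == 4 ? lt : l[3 - i]));
    }
    {
        int a = 16 * (l[7] + t[7] + 1);    /* +16 rounding folded in */
        int b = (17 * H + 16) >> 5, c = (17 * V + 16) >> 5;
        for (y = 0; y < 8; y++)
            for (x = 0; x < 8; x++) {
                int v = (a + b * (x - 3) + c * (y - 3)) >> 5;
                p[y * stride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
    }
}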