Example 1
// MODE 3
static void pred4x4_down_left_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
				  uint8_t *top, uint8_t *topleft){
  //load
  S32LDD(xr1, top, 0x0);        // xr1 <- t3, t2, t1, t0 ;
  S32LDD(xr2, topright, 0x0);   // xr2 <- t7, t6, t5, t4 ;
  S32LDDR(xr15, topright, 0x0); // xr15 <- t4, t5, t6, t7 ;

  S32ALNI(xr3, xr2, xr1, ptn2); // xr3: t5, t4, t3, t2 ;
  Q8AVG(xr4, xr1, xr3);         // xr4: (t[i] + t[i+2]) >> 1 ;
  S32ALNI(xr5, xr2, xr1, ptn3); // xr5: t4, t3, t2, t1 ;
  Q8AVGR(xr6, xr4, xr5);        // xr6: row 0 -- d3, d2, d1, d0, with d[k] = (t[k] + 2*t[k+1] + t[k+2] + 2) >> 2 ;

  S32ALNI(xr7, xr2, xr1, ptn1); // xr7: t6, t5, t4, t3 ;
  S32ALNI(xr8, xr15, xr2, ptn3);// xr8: t7, t7, t6, t5 ;
  Q8AVG(xr9, xr7, xr8);         // xr9: (t[i+3] + t[i+5]) >> 1, t7 duplicated at the edge ;
  Q8AVGR(xr10, xr9, xr2);       // xr10: row 3 -- d6, d5, d4, d3 ;

  D32SLL(xr11, xr6, xr0, xr0, 0x8); // xr11: d2, d1, d0, 0 ;
  S32ALNI(xr12, xr10, xr11, ptn1);  // xr12: row 2 -- d5, d4, d3, d2 ;
  S32ALNI(xr13, xr10, xr11, ptn2);  // xr13: row 1 -- d4, d3, d2, d1 ;

  //store
  S32STD(xr6, dst, 0x0);                        // row 0
  S32SDIV(xr13, dst, MB_LUMA_EDGED_WIDTH, 0x0); // row 1
  S32SDIV(xr12, dst, MB_LUMA_EDGED_WIDTH, 0x0); // row 2
  S32SDIV(xr10, dst, MB_LUMA_EDGED_WIDTH, 0x0); // row 3
}
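
Since (((a + c) >> 1) + b + 1) >> 1 equals (a + 2*b + c + 2) >> 2 for 8-bit inputs, the Q8AVG/Q8AVGR pair above evaluates the H.264 3-tap filter exactly. For reference, a scalar sketch of the same mode under the standard down-left formula; the function name is hypothetical and stride stands in for MB_LUMA_EDGED_WIDTH:

// Hypothetical scalar reference for MODE 3 (down-left).
// t[0..7] are the above and above-right neighbours (t0..t7).
static void pred4x4_down_left_ref(uint8_t *dst, const uint8_t *t, int stride)
{
  for (int y = 0; y < 4; y++)
    for (int x = 0; x < 4; x++) {
      int i = x + y;
      dst[y*stride + x] = (i == 6)
        ? (uint8_t)((t[6] + 3*t[7] + 2) >> 2)            // bottom-right corner
        : (uint8_t)((t[i] + 2*t[i+1] + t[i+2] + 2) >> 2);
    }
}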
Example 2
// MODE 5
static void pred4x4_vertical_right_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
				       uint8_t *top, uint8_t *topleft){
  uint8_t *src_left;  // left address
  src_left = src - 0x4;
  // load right
  S32LDD(xr8, top, 0x0); // xr8: t3, t2, t1, t0 ;  high -> low, [31->0];
  // load left
  S32LDD(xr1, topleft, -0x4); // xr1[31:24] <- src_topleft[3] (lt) ;
  S32LDD(xr2, src_left, 0x0); // xr2[31:24] <- src_topleft[stride+3] (l0) ;
  S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_topleft[2*stride+3] (l1) ;
  S32LDIV(xr4, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr4[31:24] <- src_topleft[3*stride+3] (l2) ;
  S32SFL(xr5, xr2, xr1, xr0, ptn2);       // xr5[31:16] <- l0, lt ;
  S32SFL(xr6, xr4, xr3, xr0, ptn2);       // xr6[31:16] <- l2, l1 ;
  S32SFL(xr7, xr3, xr2, xr0, ptn2);       // xr7[31:16] <- l1, l0 ;
  // alni
  S32ALNI(xr3, xr8, xr1, ptn1); // xr3: t2, t1, t0, lt ;
  S32ALNI(xr4, xr3, xr2, ptn1); // xr4: t1, t0, lt, l0 ;
  // cal
  Q8AVGR(xr1, xr3, xr8); // xr1: row 0 -- 2-tap : src[3,0], src[2,0], src[1,0], src[0,0] ;
  Q8AVG(xr9, xr4, xr8);
  Q8AVGR(xr2, xr9, xr3); // xr2: row 1 -- 3-tap : src[3,1], src[2,1], src[1,1], src[0,1] ;
  Q8AVG(xr10, xr5, xr6);
  Q8AVGR(xr11, xr10, xr7); // xr11: src[0,3], src[0,2], ~, ~ ;
  // alni
  S32ALNI(xr12, xr2, xr11, ptn1);
  D32SLL(xr13, xr11, xr0, xr0, 0x8);
  S32ALNI(xr14, xr1, xr13, ptn1);
  // store
  S32STD(xr1, dst, 0x0);
  S32SDIV(xr2, dst, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SDIV(xr14, dst, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SDIV(xr12, dst, MB_LUMA_EDGED_WIDTH, 0x0);
}
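
Mode 5 builds even rows from 2-tap averages of the top edge, odd rows from 3-tap filters, and the first column of rows 2 and 3 from the left edge (the xr11 lane). A scalar sketch following the spec's zVR = 2x - y case split; the name is hypothetical, with t[0..3] above, l[0..3] left, lt the corner:

// Hypothetical scalar reference for MODE 5 (vertical-right).
static void pred4x4_vertical_right_ref(uint8_t *dst, int stride, uint8_t lt,
                                       const uint8_t *t, const uint8_t *l)
{
  for (int y = 0; y < 4; y++)
    for (int x = 0; x < 4; x++) {
      int z = 2*x - y, i = x - (y >> 1), v;
      if (z >= 0 && !(z & 1))       // even diagonals: 2-tap from the top row
        v = ((i > 0 ? t[i-1] : lt) + t[i] + 1) >> 1;
      else if (z > 0)               // odd diagonals: 3-tap from the top row
        v = ((i > 1 ? t[i-2] : lt) + 2*t[i-1] + t[i] + 2) >> 2;
      else if (z == -1)             // corner tap mixing both edges
        v = (l[0] + 2*lt + t[0] + 2) >> 2;
      else                          // first column: 3-tap down the left edge
        v = (l[y-1] + 2*l[y-2] + (y > 2 ? l[y-3] : lt) + 2) >> 2;
      dst[y*stride + x] = (uint8_t)v;
    }
}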
Example 3
// MODE 6
static void pred4x4_horizontal_down_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
					uint8_t *top, uint8_t *topleft){
  uint8_t *src_left; // left address
  src_left = src - 0x4;
  // load TOP
  S32LDDR(xr8, top, 0x0);      // xr8[31:0]: t0, t1, t2, t3 ;
  S32LDDR(xr15, topleft, -0x4); // xr15[7:0]: lt ;
  S32LDD(xr9, topleft, -0x4);  // xr9[31:24]: lt ;
  S32ALNI(xr10, xr15, xr8, ptn3); //xr10[31:0]: lt, t0, t1, t2 ;
  // load LEFT
  S32LDDR(xr1, src_left, 0x0);          // xr1[7:0] <- src_left[3] (l0) ;
  S32LDIVR(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr2[7:0] <- src_left[stride+3] (l1) ;
  S32LDIVR(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[7:0] <- src_left[2*stride+3] (l2) ;
  S32LDIVR(xr4, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr4[7:0] <- src_left[3*stride+3] (l3) ;
  S32SFL(xr0, xr2, xr1, xr5, ptn2);    // xr5[15:0] <- l1, l0 ;
  S32SFL(xr0, xr4, xr3, xr6, ptn2);    // xr6[15:0] <- l3, l2 ;
  S32SFL(xr0, xr6, xr5, xr7, ptn3);    // xr7[31:0] <- l3, l2, l1, l0 ;
  // ALNI for CAL
  S32ALNI(xr11, xr7, xr9, ptn1);  // xr11: l2, l1, l0, lt ;
  S32ALNI(xr12, xr1, xr10, ptn3); // xr12: l0, lt, t0, t1 ;
  D32SLL(xr0, xr0, xr11, xr13, 0x8); // xr13: l1, l0, lt, 0 ;
  // CAL
  Q8AVGR(xr1, xr11, xr7); // xr1: src[0,3], src[0,2]/src[2,3], src[0,1]/src[2,2], src[0,0]/src[2,1] ;
  Q8AVG(xr2, xr12, xr8);
  Q8AVGR(xr3, xr2, xr10); // xr3: src[1,0]/src[3,1], src[2,0], src[3,0], ~ ;
  Q8AVG(xr4, xr13, xr7);
  Q8AVGR(xr5, xr4, xr11); // xr5: src[1,3], src[1,2]/src[3,3], src[1,1]/src[3,2], ~ ;
  // ALNI for STORE
  S32ALNI(xr8, xr1, xr3, ptn3); // xr8: src[0,0]/src[2,1], src[1,0]/src[3,1], src[2,0], src[3,0] ;
  S32SFL(xr9, xr1, xr5, xr10, ptn0); // xr9: src[0,3], src[1,3], src[0,2]/src[2,3], src[1,2]/src[3,3] ;
                                     //xr10: src[0,1]/src[2,2], src[1,1]/src[3,2], src[0,0]/src[2,1], ~ ;
  S32SFL(xr11, xr10, xr8, xr0, ptn3); // xr11: src[0,1]/src[2,2], src[1,1]/src[3,2],
                                      //       src[0,0]/src[2,1], src[1,0]/src[3,1] ;
  S32ALNI(xr12, xr9, xr10, ptn2); // xr12: src[0,2]/src[2,3], src[1,2]/src[3,3],
                                  //       src[0,1]/src[2,2], src[1,1]/src[3,2] ;
  // STORE
  S32STDR(xr8, dst, 0x0);
  S32SDIVR(xr11, dst, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SDIVR(xr12, dst, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SDIVR(xr9, dst, MB_LUMA_EDGED_WIDTH, 0x0);
}
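
Horizontal-down is the transpose of vertical-right: the case split runs on zHD = 2y - x and the roles of the top and left edges swap. A scalar sketch (hypothetical name, same neighbour conventions as above):

// Hypothetical scalar reference for MODE 6 (horizontal-down).
static void pred4x4_horizontal_down_ref(uint8_t *dst, int stride, uint8_t lt,
                                        const uint8_t *t, const uint8_t *l)
{
  for (int y = 0; y < 4; y++)
    for (int x = 0; x < 4; x++) {
      int z = 2*y - x, i = y - (x >> 1), v;
      if (z >= 0 && !(z & 1))       // even diagonals: 2-tap from the left edge
        v = ((i > 0 ? l[i-1] : lt) + l[i] + 1) >> 1;
      else if (z > 0)               // odd diagonals: 3-tap from the left edge
        v = ((i > 1 ? l[i-2] : lt) + 2*l[i-1] + l[i] + 2) >> 2;
      else if (z == -1)             // corner tap mixing both edges
        v = (t[0] + 2*lt + l[0] + 2) >> 2;
      else                          // first row: 3-tap along the top edge
        v = (t[x-1] + 2*t[x-2] + (x > 2 ? t[x-3] : lt) + 2) >> 2;
      dst[y*stride + x] = (uint8_t)v;
    }
}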
Example 4
// MODE 4
static void pred4x4_down_right_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
				   uint8_t *top, uint8_t *topleft){
  uint8_t *src_left; // left address
  src_left = src - 0x4;
  // load right
  S32LDDR(xr8, top, 0x0);  // xr8: t0, t1, t2, t3 ;  high -> low, [31->0];
  S32LDDR(xr9, topleft, -0x4); // xr9[7:0]: lt ;
  // load left
  S32LDD(xr7, topleft, -0x4);          // xr7[31:24]: lt ;
  S32LDD(xr1, src_left, 0x0);          // xr1[31:24] <- src_left[3] (l0) ;
  S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr2[31:24] <- src_left[stride+3] (l1) ;
  S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_left[2*stride+3] (l2) ;
  S32LDIV(xr4, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr4[31:24] <- src_left[3*stride+3] (l3) ;
  S32SFL(xr5, xr2, xr1, xr0, ptn2);    // xr5[31:16] <- l1, l0 ;
  S32SFL(xr6, xr4, xr3, xr0, ptn2);    // xr6[31:16] <- l3, l2 ;
  S32SFL(xr1, xr6, xr5, xr0, ptn3);    // xr1[31: 0] <- l3, l2, l1, l0 ;
  // alni
  S32ALNI(xr10, xr9, xr8, ptn3); // xr10: lt, t0, t1, t2 ;
  S32ALNI(xr11, xr1, xr7, ptn1); // xr11: l2, l1, l0, lt ;
  S32ALNI(xr12, xr11, xr8, ptn2);// xr12: l0, lt, t0, t1 ;
  S32ALNI(xr13, xr1, xr10, ptn2);// xr13: l1, l0, lt, t0 ;
  // cal
  Q8AVG(xr3, xr1, xr13);
  Q8AVGR(xr4, xr3, xr11); // xr4: src[0,3], src[0,2]/src[1,3], src[0,1]/src[1,2]/src[2,3],
                          //      src[0,0]/src[1,1]/src[2,2]/src[3,3] ;
  Q8AVG(xr5, xr8, xr12);
  Q8AVGR(xr6, xr5, xr10); // xr6: src[0,0]/src[1,1]/src[2,2]/src[3,3],
                          //      src[1,0]/src[2,1]/src[3,2], src[2,0]/src[3,1], src[3,0] ;
  // alni for store
  D32SLL(xr7, xr6, xr0, xr0, 0x8); // xr7: src[1,0]/src[2,1]/src[3,2], src[2,0]/src[3,1], src[3,0], 0 ;
  S32ALNI(xr8, xr4, xr7, ptn1);    // xr8: row 2 ;
  S32ALNI(xr9, xr4, xr7, ptn2);    // xr9: row 1 ;
  //store
  S32STDR(xr6, dst, 0x0);
  S32SDIVR(xr9, dst, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SDIVR(xr8, dst, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SDIVR(xr4, dst, MB_LUMA_EDGED_WIDTH, 0x0);
}
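
Every sample on a given diagonal x - y shares one 3-tap value, which is why xr4 and xr6 each carry four diagonal values and the final shift/ALNI steps merely realign them per row. A scalar sketch (hypothetical name):

// Hypothetical scalar reference for MODE 4 (down-right).
static void pred4x4_down_right_ref(uint8_t *dst, int stride, uint8_t lt,
                                   const uint8_t *t, const uint8_t *l)
{
  for (int y = 0; y < 4; y++)
    for (int x = 0; x < 4; x++) {
      int v;
      if (x > y) {                  // above the main diagonal: top taps
        int i = x - y;
        v = ((i > 1 ? t[i-2] : lt) + 2*t[i-1] + t[i] + 2) >> 2;
      } else if (x < y) {           // below the main diagonal: left taps
        int i = y - x;
        v = ((i > 1 ? l[i-2] : lt) + 2*l[i-1] + l[i] + 2) >> 2;
      } else {                      // main diagonal: corner tap
        v = (t[0] + 2*lt + l[0] + 2) >> 2;
      }
      dst[y*stride + x] = (uint8_t)v;
    }
}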
Example 5
// MODE 8
static void pred4x4_horizontal_up_mxu(uint8_t *dst, uint8_t *src, uint8_t *topright,
				      uint8_t *top, uint8_t *topleft){
  uint8_t *src_left; // left address
  src_left = src - 0x4;
  //load
  S32LDD(xr1, src_left, 0x0);          // xr1[31:24] <- src_left[3] (l0) ;
  S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr2[31:24] <- src_left[stride+3] (l1) ;
  S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_left[2*stride+3] (l2) ;
  S32LDIV(xr4, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr4[31:24] <- src_left[3*stride+3] (l3) ;
  S32SFL(xr5, xr2, xr1, xr0, ptn2);    // xr5[31:16] <- l1, l0 ;
  S32SFL(xr6, xr4, xr3, xr0, ptn2);    // xr6[31:16] <- l3, l2 ;
  S32SFL(xr1, xr6, xr5, xr0, ptn3);    // xr1[31: 0] <- l3, l2, l1, l0 ;

  D32SLL(xr2, xr1, xr0, xr0, 0x8);  // xr2: l2, l1, l0, 0 ;
  S32SFL(xr3, xr1, xr1, xr0, ptn0); // xr3: l3, l3, l2, l2;

  Q8AVGR(xr4, xr1, xr2); // xr4: src[2,1]/src[0,2], src[2,0]/src[0,1], src[0,0], ~ ;

  Q8AVG(xr5, xr2, xr3);
  Q8AVGR(xr6, xr5, xr1); // xr6: src[3,1]/src[1,2], src[3,0]/src[1,1], src[1,0], ~ ;

  S32SFL(xr7, xr6, xr4, xr0, ptn0); // xr7: src[3,1]/src[1,2], src[2,1]/src[0,2],
                                    //      src[3,0]/src[1,1], src[2,0]/src[0,1];

  D32SLR(xr8, xr4, xr6, xr9, 0x8); // xr8: 0, src[2,1]/src[0,2], src[2,0]/src[0,1], src[0,0] ;
                                   // xr9: 0, src[3,1]/src[1,2], src[3,0]/src[1,1], src[1,0] ;
  S32SFL(xr0, xr9, xr8, xr10, ptn0); // xr10: src[3,0], src[2,0], src[1,0], src[0,0] ;

  S32SFL(xr11, xr3, xr7, xr0, ptn3); // xr11: l3, l3, src[3,1]/src[1,2], src[2,1]/src[0,2] ;

  S32SFL(xr12, xr3, xr3, xr0, ptn3); // xr12: l3, l3, l3, l3 ;

  //store
  S32STD(xr10, dst, 0x0);
  S32SDIV(xr7, dst, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SDIV(xr11, dst, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SDIV(xr12, dst, MB_LUMA_EDGED_WIDTH, 0x0);
}
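
Horizontal-up interpolates down the left edge and saturates to l3 once zHU = x + 2y passes 5, which is why the last stored row (xr12) is four copies of l3. A scalar sketch (hypothetical name; only the left neighbours are used):

// Hypothetical scalar reference for MODE 8 (horizontal-up).
static void pred4x4_horizontal_up_ref(uint8_t *dst, int stride, const uint8_t *l)
{
  for (int y = 0; y < 4; y++)
    for (int x = 0; x < 4; x++) {
      int z = x + 2*y, i = y + (x >> 1), v;
      if (z > 5)        v = l[3];                          // replicate l3
      else if (z == 5)  v = (l[2] + 3*l[3] + 2) >> 2;      // blend into l3
      else if (z & 1)   v = (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2;
      else              v = (l[i] + l[i+1] + 1) >> 1;
      dst[y*stride + x] = (uint8_t)v;
    }
}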
Example 6
void imdct_half_fix_c(MDCTContext_fix *s, FFTSample_fix *output,
                      const FFTSample_fix *input)
{
    //PMON_ON(qmf);
    int k, n8, n4, n2, n, j, j1;
    const FFTSample_fix *in1, *in2;
    const unsigned short *revtab = s->fft.revtab;
    const FFTSample_fix *tcos = s->tcos;
    const FFTSample_fix *tsin = s->tsin;
    FFTComplex_fix *z = (FFTComplex_fix *)output;

    n = 1 << s->nbits;    // 64
    n2 = n >> 1;          // 32
    n4 = n >> 2;          // 16
    n8 = n >> 3;          // 8
    /* pre rotation */
    in1 = input;          // head
    in2 = input + n2 - 1; // tail
    for(k = 0; k < n8; k++) {
#if 0
        j=revtab[k];
        FFT_CMUL_fix(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
        in1 += 2;
        in2 -= 2;
#else
        FFTSample_fix _are, _bre, _aim, _bim, are, aim;
        _are = *in2;
        _bre = tcos[k];
        _aim = *in1;
        _bim = tsin[k];
        j = revtab[k];
        n = n4 - k - 1;   /* n reused here as the mirrored index */
        j1 = revtab[n];
        S32MUL(xr1,xr2, _are, _bre);
        S32MUL(xr3, xr4, _are, _bim);
        in2--;
        are = *in2;
        S32MUL(xr7,xr8, are, _bre);
        S32MUL(xr9, xr10, are, _bim);
        S32MSUB(xr1, xr2, _aim, _bim);
        S32MADD(xr3, xr4, _aim, _bre);
        in1++;
        aim = *in1;
        D32SLL(xr5,xr1,xr3,xr6,1);
        S32MSUB(xr7, xr8, aim, _bim);
        S32MADD(xr9, xr10, aim, _bre);
        z[j].re=S32M2I(xr5);
        D32SLL(xr11,xr7,xr9,xr12,1);
        z[j].im=S32M2I(xr6);
        in1++;
        in2--;
        z[j1].re=S32M2I(xr11);
        z[j1].im=S32M2I(xr12);
#endif
    }

    s->fft.fft_calc(&s->fft, z);

    /* post rotation + reordering */
    /* XXX: optimize */
    for(k = 0; k < n8; k++) {
        FFTSample_fix r0, i0, r1, i1;
        FFT_CMUL_fix(r0, i1, z[n8-k-1].im, z[n8-k-1].re, tsin[n8-k-1], tcos[n8-k-1]);
        FFT_CMUL_fix(r1, i0, z[n8+k  ].im, z[n8+k  ].re, tsin[n8+k  ], tcos[n8+k  ]);
        z[n8-k-1].re = r0;
        z[n8-k-1].im = i0;
        z[n8+k  ].re = r1;
        z[n8+k  ].im = i1;
    }
    //PMON_OFF(qmf);
}
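
The S32MUL/S32MSUB/S32MADD accumulator pairs above form both halves of a fixed-point complex multiply, and the D32SLL by 1 drops the duplicated sign bit of the 64-bit product before S32M2I reads back the high word. A scalar model of one such multiply, assuming FFTSample_fix is a Q31 int32_t (the helper name is hypothetical):

// Hypothetical scalar model of the MXU complex multiply used above.
static inline void cmul_q31_ref(int32_t *dre, int32_t *dim,
                                int32_t are, int32_t aim,
                                int32_t bre, int32_t bim)
{
    int64_t re = (int64_t)are*bre - (int64_t)aim*bim;  // S32MUL + S32MSUB
    int64_t im = (int64_t)are*bim + (int64_t)aim*bre;  // S32MUL + S32MADD
    *dre = (int32_t)((uint32_t)(re >> 32) << 1);       // hi word, then D32SLL by 1
    *dim = (int32_t)((uint32_t)(im >> 32) << 1);
}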
Example 7
void Predict_16x16_C(const NEW_GMC_DATA * const This,
                     uint8_t *dst,
                     const uint8_t *src,
                     int dststride,
                     int srcstride,
                     int x,
                     int y,
                     int rounding)
{
    const int W       = This->sW;
    const int H       = This->sH;
    const int rho     = 3 - This->accuracy;
    const int Rounder = ( (1<<7) - (rounding<<(2*rho)) ) << 16;

    const int dUx = This->dU[0];
    const int dVx = This->dV[0];
    const int dUy = This->dU[1];
    const int dVy = This->dV[1];

    int Uo = This->Uo + 16*(dUy*y + dUx*x);
    int Vo = This->Vo + 16*(dVy*y + dVx*x);

    int i, j;

    dst += 16;
    {
        unsigned int ri = 16;
        unsigned int rj = 16;
        int Offset;
        int u,v;

        uint8_t *srctmp;
        uint32_t tmpf = 0;

        S32I2M(xr15,dUx);
        S32I2M(xr14,dVx);
        S32I2M(xr13,dUy);
        S32I2M(xr12,dVy);

        S32I2M(xr11,Uo); // Uo 11
        S32I2M(xr10,Vo); // Vo 10

        S32I2M(xr5, Rounder);

        for (j = 16; j > 0; --j)
        {
            D32SLL(xr9,xr11,xr10,xr8, 0x0);  // U -> xr9, V -> xr8
            D32ASUM_AA(xr11,xr13,xr12,xr10); // Uo += dUy; Vo += dVy;

            for (i = -16; i < 0; ++i)
            {
                ri = 16;
                rj = 16;

                // (U >> 16), (V >> 16)
                D32SAR(xr7,xr9,xr8,xr6, 0x8);
                D32SAR(xr7,xr7,xr6,xr6, 0x8);

                D32SLLV(xr7,xr6, rho); // << rho

                u = S32M2I(xr7);
                v = S32M2I(xr6);
                D32ASUM_AA(xr9,xr15,xr14,xr8); // U += dUx; V += dVx;

                if (u > 0 && u <= W)
                {
                    ri = MTab[u&15];
                    Offset = u>>4;
                }
                else
                {
                    if (u > W)
                        Offset = W>>4;
                    else
                        Offset = 0;

                    ri = MTab[0];
                }

                if (v > 0 && v <= H)
                {
                    rj      = MTab[v&15];
                    Offset += (v>>4)*srcstride;
                }
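
The listing breaks off here, inside the inner loop, before the v clamp completes and the source pixels are blended; the missing tail is left as-is. For orientation only, a generic sketch of the bilinear blend that GMC prediction performs once Offset and the fractional weights are known. The name and the plain 4-bit weights wu, wv are hypothetical; the real code routes its weights through MTab and folds in the Rounder term:

// Hypothetical sketch of a GMC bilinear blend; NOT the original tail.
static inline uint8_t gmc_bilinear_ref(const uint8_t *src, int srcstride,
                                       int Offset, int wu, int wv, int rounder)
{
    int a = src[Offset],             b = src[Offset + 1];
    int c = src[Offset + srcstride], d = src[Offset + srcstride + 1];
    int top = (16 - wu)*a + wu*b;    // horizontal blend, 4-bit phase
    int bot = (16 - wu)*c + wu*d;
    return (uint8_t)(((16 - wv)*top + wv*bot + rounder) >> 8);
}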
Example 8
uint32_t
dequant_h263_intra_mxu(int16_t *data, uint8_t yuv_len,
                       const uint32_t quant,
                       const uint32_t dcscalar,
                       const uint16_t *mpeg_quant_matrices)
{
    uint32_t i = 0;

    S32LUI(xr9, 1, 0);
    S32I2M(xr1, quant);

    D32SLL(xr5, xr1, xr0, xr0, 1);  // quant_m_2 = 2*quant

    /* quant_add = (quant & 1) ? quant : quant - 1 */
    S32AND(xr15, xr1, xr9);
    S32MOVN(xr2, xr15, xr1);
    D32ADD_SS(xr1, xr1, xr9, xr3);
    S32MOVZ(xr2, xr15, xr1);

    S32I2M(xr3, -2048);
    S32I2M(xr4, 2047);

    /* part 1: DC coefficient, scaled by dcscalar and clamped */
    //S32MUL(xr4, xr6, *data, dcscalar);
    S32MUL(xr0, xr6, (int32_t)data[0], dcscalar);
    D16MUL_WW(xr0, xr6, xr9, xr6);

    S32MIN(xr6, xr6, xr4);
    S32MAX(xr6, xr6, xr3);

    /* part 2: AC coefficients, two per iteration */
    yuv_len = ((yuv_len & ~1) + 3) >> 1;
    data -= 2;
    for (i = 0; i < yuv_len; i++) {
        S32LDI(xr1, data, 4);

        D16MUL_LW(xr13, xr9, xr1, xr14); // preserve the signs of data[i] and data[i+1]
        D16CPS(xr1, xr1, xr1);

        /* quant_m_2 * acLevel + quant_add */
        D16MUL_LW(xr7, xr5, xr1, xr8);
        D32ADD_AA(xr7, xr7, xr2, xr0);
        D32ADD_AA(xr8, xr8, xr2, xr0);

#if 0
        /* clamp data[i+1] to [-2048, 2047] */
        S32CPS(xr7, xr7, xr13);
        S32MAX(xr10, xr7, xr3);
        S32MIN(xr10, xr10, xr4);
        S32MOVZ(xr10, xr13, xr13);

        /* clamp data[i] to [-2048, 2047] */
        S32CPS(xr8, xr8, xr14);
        S32MAX(xr11, xr8, xr3);
        S32MIN(xr11, xr11, xr4);
        S32MOVZ(xr11, xr14, xr14);
#else
        /* keep data[i+1] in [-2048, 2047] (mask instead of clamp) */
        S32AND(xr7, xr7, xr4);
        S32CPS(xr10, xr7, xr13);
        S32MOVZ(xr10, xr13, xr13);

        /* keep data[i] in [-2048, 2047] (mask instead of clamp) */
        S32AND(xr8, xr8, xr4);
        S32CPS(xr11, xr8, xr14);
        S32MOVZ(xr11, xr14, xr14);
#endif

        S32SFL(xr0, xr10, xr11, xr12, 3);

        S32STD(xr12, data, 0);
    }
    S16STD(xr6, data - (yuv_len*2 - 2), 0, 0); // data[0]

    return 0;
}
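
A scalar sketch of the same inverse quantization, assuming the usual H.263 rule level' = sign(level) * (2*quant*|level| + quant_add) with quant_add = quant for odd quant and quant - 1 otherwise. The #else path above masks with 2047 instead of clamping, which only matches while the product stays in range; the sketch clamps, like the #if 0 path (the function name is hypothetical):

// Hypothetical scalar reference for the MXU routine above.
static uint32_t dequant_h263_intra_ref(int16_t *data, uint32_t len,
                                       uint32_t quant, uint32_t dcscalar)
{
    int32_t dc = (int32_t)data[0] * (int32_t)dcscalar;
    int32_t add = (quant & 1) ? (int32_t)quant : (int32_t)quant - 1;
    uint32_t i;

    for (i = 1; i < len; i++) {
        int level = data[i];
        if (level) {
            int mag = (level < 0 ? -level : level);
            mag = 2 * (int)quant * mag + add;
            if (mag > 2047) mag = 2047;
            data[i] = (int16_t)(level < 0 ? -mag : mag);
        }
    }
    data[0] = (int16_t)(dc > 2047 ? 2047 : dc < -2048 ? -2048 : dc);
    return 0;
}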
Example 9
void fft_calc_fix_inverse(FFTContext_fix *s, FFTComplex_fix *z)
{
    int ln = s->nbits;
    int j, np, np2;
    int nblocks, nloops;
    register FFTComplex_fix *p, *q;
    FFTComplex_fix *exptab = s->exptab;
    int l;
    FFTSample_fix tmp_re, tmp_im;
    np = 1 << ln;
    /* all the butterfly passes; for N = 16 there are 4 */
    /* pass 0 */
#if 0
    p=&z[0];
    j=(np >> 1);
    do {
      /*
         X(k) = G(k) + H(k)*W, where W = e^(j*0) = 1 in pass 0
      */
        FFT_BF_fix(p[0].re, p[0].im, p[1].re, p[1].im,
           p[0].re, p[0].im, p[1].re, p[1].im);

        p+=2;
    } while (--j);
#endif

    /* pass 1 */
    p=&z[0];
    j=np >> 2;
    do {
#if 1
      S32LDD(xr1,p,0);
      S32LDD(xr2,p,4);
      S32LDD(xr3,p,8);
      S32LDD(xr4,p,12);
      S32LDD(xr5,p,16);
      S32LDD(xr6,p,20);
      S32LDD(xr7,p,24);
      S32LDD(xr8,p,28);
      D32ADD_AS(xr1,xr1,xr3,xr3);
      D32ADD_AS(xr2,xr2,xr4,xr4);
      D32ADD_AS(xr5,xr5,xr7,xr7);
      D32ADD_AS(xr6,xr6,xr8,xr8);
      D32ADD_AS(xr1,xr1,xr5,xr5);
      D32ADD_AS(xr2,xr2,xr6,xr6);
      D32ADD_SA(xr3,xr3,xr8,xr9);
      D32ADD_AS(xr4,xr4,xr7,xr8);
      S32STD(xr1,p,0);
      S32STD(xr2,p,4);
      S32STD(xr3,p,8);
      S32STD(xr4,p,12);
      S32STD(xr5,p,16);
      S32STD(xr6,p,20);
      S32STD(xr9,p,24);
      S32STD(xr8,p,28);
#else
      FFT_BF_fix(p[0].re, p[0].im, p[1].re, p[1].im,
		 p[0].re, p[0].im, p[1].re, p[1].im);
      FFT_BF_fix(p[2].re, p[2].im, p[3].re, p[3].im,
		 p[2].re, p[2].im, p[3].re, p[3].im);

      FFT_BF_fix(p[0].re, p[0].im, p[2].re, p[2].im,
		 p[0].re, p[0].im, p[2].re, p[2].im);
      FFT_BF_fix(p[1].re, p[1].im, p[3].re, p[3].im,
		 p[1].re, p[1].im, -p[3].im, p[3].re);
#endif
      p+=4;
    } while (--j);

    /* pass 2 .. ln-1 */
    nblocks = np >> 3;
    nloops = 1 << 2;
    np2 = np >> 1;
    do {
        p = z;
        q = z + nloops;
        for (j = 0; j < nblocks; ++j) {
#if 1
	  S32LDD(xr1,p,0);
	  S32LDD(xr2,p,4);
	  S32LDD(xr3,q,0);
	  S32LDD(xr4,q,4);
	  D32ADD_AS(xr1,xr1,xr3,xr3);
	  D32ADD_AS(xr2,xr2,xr4,xr4);
	  S32STD(xr1,p,0);
	  S32STD(xr2,p,4);
	  S32STD(xr3,q,0);
	  S32STD(xr4,q,4);
#else
	  FFT_BF_fix(p->re, p->im, q->re, q->im,
		     p->re, p->im, q->re, q->im);
#endif

	  p++;
	  q++;
	  for(l = nblocks; l < np2; l += nblocks) {
	    /* FFT_CMUL_fix() computes the twiddle product
	       H(i) * e^(-j*2*PI*k*m/N) */
#if 1
	    FFTSample_fix _are = exptab[l].re;
	    FFTSample_fix _bre = q->re;
	    FFTSample_fix _aim = exptab[l].im;
	    FFTSample_fix _bim = q->im;

	    S32MUL(xr1, xr2, _are, _bre);
	    S32MUL(xr5, xr6, _are, _bim);
	    S32LDD(xr7,p,0);
	    S32MSUB(xr1, xr2, _aim, _bim);
	    S32MADD(xr5, xr6, _aim, _bre);
	    S32LDD(xr8,p,4);
	    D32SLL(xr1, xr1, xr5, xr5, 1);

	    D32ADD_AS(xr7,xr7,xr1,xr1);
	    D32ADD_AS(xr8,xr8,xr5,xr5);
	    S32STD(xr7,p,0);
	    S32STD(xr8,p,4);
	    S32STD(xr1,q,0);
	    S32STD(xr5,q,4);

#else
	    FFT_CMUL_fix(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
	    FFT_BF_fix(p->re, p->im, q->re, q->im,
		       p->re, p->im, tmp_re, tmp_im);
#endif
	    p++;
	    q++;
	  }
	  p += nloops;
	  q += nloops;
        }
        nblocks = nblocks >> 1;
        nloops = nloops << 1;
    } while (nblocks);
}
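
Each D32ADD_AS above delivers a sum and a difference in one instruction, i.e. one butterfly leg per re/im pair, and the single D32ADD_SA in pass 1 realizes the multiply by -j that the #else branch writes as (-p[3].im, p[3].re). A scalar model of the basic butterfly, assuming FFT_BF_fix with identical in/out operands is the usual in-place p' = p + q, q' = p - q (hypothetical helper name):

// Hypothetical scalar model of one radix-2 butterfly.
static inline void fft_bf_ref(FFTComplex_fix *p, FFTComplex_fix *q)
{
    FFTSample_fix re = p->re, im = p->im;
    p->re = re + q->re;  p->im = im + q->im;   // the D32ADD_AS sums
    q->re = re - q->re;  q->im = im - q->im;   // the D32ADD_AS differences
}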
Example 10
/*
 * NAME:	layer->II()
 * DESCRIPTION:	decode a single Layer II frame
 */
int mad_layer_II(struct mad_stream *stream, struct mad_frame *frame)
{
  struct mad_header *header = &frame->header;
  struct mad_bitptr start;
  unsigned int index, sblimit, nbal, nch, bound, gr, ch, s, sb;
  unsigned char const *offsets;
  unsigned char allocation[2][32], scfsi[2][32], scalefactor[2][32][3];
  mad_fixed_t samples[3];

  nch = MAD_NCHANNELS(header);

  if (header->flags & MAD_FLAG_LSF_EXT)
    index = 4;
  else if (header->flags & MAD_FLAG_FREEFORMAT)
    goto freeformat;
  else {
    unsigned long bitrate_per_channel;

    bitrate_per_channel = header->bitrate;
    if (nch == 2) {
      bitrate_per_channel /= 2;

# if defined(OPT_STRICT)
      /*
       * ISO/IEC 11172-3 allows only single channel mode for 32, 48, 56, and
       * 80 kbps bitrates in Layer II, but some encoders ignore this
       * restriction. We enforce it if OPT_STRICT is defined.
       */
      if (bitrate_per_channel <= 28000 || bitrate_per_channel == 40000) {
	stream->error = MAD_ERROR_BADMODE;
	return -1;
      }
# endif
    }
    else {  /* nch == 1 */
      if (bitrate_per_channel > 192000 && bitrate_per_channel != 320000) {
	/*
	 * ISO/IEC 11172-3 does not allow single channel mode for 224, 256,
	 * 320, or 384 kbps bitrates in Layer II.
	 */
	stream->error = MAD_ERROR_BADMODE;
	return -1;
      }
    }

    if (bitrate_per_channel <= 48000)
      index = (header->samplerate == 32000) ? 3 : 2;
    else if (bitrate_per_channel <= 80000)
      index = 0;
    else {
    freeformat:
      index = (header->samplerate == 48000) ? 0 : 1;
    }
  }

  sblimit = sbquant_table[index].sblimit;
  offsets = sbquant_table[index].offsets;

  bound = 32;
  if (header->mode == MAD_MODE_JOINT_STEREO) {
    header->flags |= MAD_FLAG_I_STEREO;
    bound = 4 + header->mode_extension * 4;
  }

  if (bound > sblimit)
    bound = sblimit;

  start = stream->ptr;

  /* decode bit allocations */

  for (sb = 0; sb < bound; ++sb) {
    nbal = bitalloc_table[offsets[sb]].nbal;

    for (ch = 0; ch < nch; ++ch)
      allocation[ch][sb] = mad_bit_read(&stream->ptr, nbal);
  }

  for (sb = bound; sb < sblimit; ++sb) {
    nbal = bitalloc_table[offsets[sb]].nbal;

    allocation[0][sb] =
    allocation[1][sb] = mad_bit_read(&stream->ptr, nbal);
  }

  /* decode scalefactor selection info */

  for (sb = 0; sb < sblimit; ++sb) {
    for (ch = 0; ch < nch; ++ch) {
      if (allocation[ch][sb])
	scfsi[ch][sb] = mad_bit_read(&stream->ptr, 2);
    }
  }

  /* check CRC word */

  if (header->flags & MAD_FLAG_PROTECTION) {
    header->crc_check =
      mad_bit_crc(start, mad_bit_length(&start, &stream->ptr),
		  header->crc_check);

    if (header->crc_check != header->crc_target &&
	!(frame->options & MAD_OPTION_IGNORECRC)) {
      stream->error = MAD_ERROR_BADCRC;
      return -1;
    }
  }

  /* decode scalefactors */

  for (sb = 0; sb < sblimit; ++sb) {
    for (ch = 0; ch < nch; ++ch) {
      if (allocation[ch][sb]) {
	scalefactor[ch][sb][0] = mad_bit_read(&stream->ptr, 6);

	switch (scfsi[ch][sb]) {
	case 2:
	  scalefactor[ch][sb][2] =
	  scalefactor[ch][sb][1] =
	  scalefactor[ch][sb][0];
	  break;

	case 0:
	  scalefactor[ch][sb][1] = mad_bit_read(&stream->ptr, 6);
	  /* fall through */

	case 1:
	case 3:
	  scalefactor[ch][sb][2] = mad_bit_read(&stream->ptr, 6);
	}

	if (scfsi[ch][sb] & 1)
	  scalefactor[ch][sb][1] = scalefactor[ch][sb][scfsi[ch][sb] - 1];

# if defined(OPT_STRICT)
	/*
	 * Scalefactor index 63 does not appear in Table B.1 of
	 * ISO/IEC 11172-3. Nonetheless, other implementations accept it,
	 * so we only reject it if OPT_STRICT is defined.
	 */
	if (scalefactor[ch][sb][0] == 63 ||
	    scalefactor[ch][sb][1] == 63 ||
	    scalefactor[ch][sb][2] == 63) {
	  stream->error = MAD_ERROR_BADSCALEFACTOR;
	  return -1;
	}
# endif
      }
    }
  }

  /* decode samples */

  for (gr = 0; gr < 12; ++gr) {
    for (sb = 0; sb < bound; ++sb) {
      for (ch = 0; ch < nch; ++ch) {
	if ((index = allocation[ch][sb])) {
#ifdef JZ4750_OPT
          mad_fixed_t sf_val;
          mad_fixed_t *sb_ptr;
          sb_ptr = &(frame->sbsample[ch][3*gr-1][sb]); /* one row back: S32SDIV steps one 32-sample row before each store */
          sf_val = sf_table[scalefactor[ch][sb][gr/4]];
	  index = offset_table[bitalloc_table[offsets[sb]].offset][index - 1];

	  II_samples(&stream->ptr, &qc_table[index], samples);
          S32MUL(xr1,xr2, samples[0], sf_val);
          S32MUL(xr3,xr4, samples[1], sf_val);
          S32MUL(xr5,xr6, samples[2], sf_val);
          S32EXTR(xr1,xr2,(32 - MAD_F_SCALEBITS), 31);
          S32EXTR(xr3,xr4,(32 - MAD_F_SCALEBITS), 31);
          S32EXTR(xr5,xr6,(32 - MAD_F_SCALEBITS), 31);
          D32SLL(xr1,xr1,xr3,xr3,1);
          D32SLL(xr5,xr5,xr0,xr0,1);
          S32SDIV(xr1, sb_ptr, 32, 2);
          S32SDIV(xr3, sb_ptr, 32, 2);
          S32SDIV(xr5, sb_ptr, 32, 2);
#else
	  index = offset_table[bitalloc_table[offsets[sb]].offset][index - 1];

	  II_samples(&stream->ptr, &qc_table[index], samples);

	  for (s = 0; s < 3; ++s) {
	    frame->sbsample[ch][3 * gr + s][sb] =
	      mad_f_mul(samples[s], sf_table[scalefactor[ch][sb][gr / 4]]);
	  }
#endif
	}
	else {
	  for (s = 0; s < 3; ++s)
	    frame->sbsample[ch][3 * gr + s][sb] = 0;
	}
      }
    }

    for (sb = bound; sb < sblimit; ++sb) {
      if ((index = allocation[0][sb])) {
	index = offset_table[bitalloc_table[offsets[sb]].offset][index - 1];

	II_samples(&stream->ptr, &qc_table[index], samples);

	for (ch = 0; ch < nch; ++ch) {
#ifdef JZ4750_OPT
          mad_fixed_t sf_val;
          mad_fixed_t *sb_ptr;
          sb_ptr = &(frame->sbsample[ch][3*gr-1][sb]); /* one row back: S32SDIV steps one 32-sample row before each store */
          sf_val = sf_table[scalefactor[ch][sb][gr/4]];
          S32MUL(xr1,xr2, samples[0], sf_val);
          S32MUL(xr3,xr4, samples[1], sf_val);
          S32MUL(xr5,xr6, samples[2], sf_val);
          S32EXTR(xr1,xr2,(32 - MAD_F_SCALEBITS), 31);
          S32EXTR(xr3,xr4,(32 - MAD_F_SCALEBITS), 31);
          S32EXTR(xr5,xr6,(32 - MAD_F_SCALEBITS), 31);
          D32SLL(xr1,xr1,xr3,xr3,1);
          D32SLL(xr5,xr5,xr0,xr0,1);
          S32SDIV(xr1, sb_ptr, 32, 2);
          S32SDIV(xr3, sb_ptr, 32, 2);
          S32SDIV(xr5, sb_ptr, 32, 2);
#else
	  for (s = 0; s < 3; ++s) {
	    frame->sbsample[ch][3 * gr + s][sb] =
	      mad_f_mul(samples[s], sf_table[scalefactor[ch][sb][gr / 4]]);
	  }
#endif
	}
      }
      else {
	for (ch = 0; ch < nch; ++ch) {
	  for (s = 0; s < 3; ++s)
	    frame->sbsample[ch][3 * gr + s][sb] = 0;
	}
      }
    }

    for (ch = 0; ch < nch; ++ch) {
      for (s = 0; s < 3; ++s) {
	for (sb = sblimit; sb < 32; ++sb)
	  frame->sbsample[ch][3 * gr + s][sb] = 0;
      }
    }
  }

  return 0;
}
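
In the JZ4750_OPT blocks, S32MUL forms the full 64-bit product, S32EXTR extracts a 31-bit field starting at bit 32 - MAD_F_SCALEBITS, and D32SLL restores the last bit of headroom, netting out to the reference mad_f_mul under libmad's default fixed-point layout. A scalar model, assuming truncating arithmetic (the name is hypothetical):

// Hypothetical scalar model of the S32MUL/S32EXTR/D32SLL sample scaling.
static inline mad_fixed_t mxu_f_mul_ref(mad_fixed_t x, mad_fixed_t y)
{
  return (mad_fixed_t)(((int64_t)x * y) >> MAD_F_SCALEBITS);
}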
Example 11
// MODE 3
static void pred16x16_plane_mxu(uint8_t *dst, uint8_t *src, uint8_t *top){
  int i;
  uint8_t *src_top;  // top address
  uint8_t *src_topleft, *src_left;  // left address
  src_top = top;
  src_topleft = src_top - 0x14;
  src_left = src - 0x4;

  //----- H, LOAD -----
  S32LDD(xr1, src_top, -0x14);  // xr1 <- src_top[-4];  xr1: lt, 0, 0, 0 ;
  S32LDD(xr5, src_top, 0x0);   // xr5 <- src_top[0] ;  xr5: t3, t2, t1, t0 ;
  S32LDD(xr2, src_top, 0x4);   // xr2 <- src_top[4] ;  xr2: t7, t6, t5, t4 ;
  S32LDDR(xr3, src_top, 0x8);  // xr3 <- src_top[8] ;  xr3: t8, t9, t10, t11 ;
  S32LDDR(xr4, src_top, 0xc);  // xr4 <- src_top[12];  xr4: t12, t13, t14, t15 ;
  S32ALNI(xr1, xr5, xr1, ptn1);  //                    xr1: t2, t1, t0, lt ;
  S32ALNI(xr2, xr2, xr5, ptn1);  //                    xr2: t6, t5, t4, t3 ;   ---xr5 is free to use ;
  S32I2M(xr9, MUL_12);  // xr9 : 0x00010002 ;
  S32I2M(xr10, MUL_34); // xr10: 0x00030004 ;

  //----- H, SUM -----
  Q8ADDE_SS(xr5, xr3, xr2, xr6);  // xr5[31:16] <- t8-t6 ;  xr5[15:0] <- t9-t5 ;
                                  // xr6[31:16] <- t10-t4;  xr6[15:0] <- t11-t3;

  S32I2M(xr11, MUL_56); // xr11: 0x00050006 ;

  D16MUL_WW(xr13, xr9, xr5, xr14);     // xr13 <- 1*(t8-t6) ;  xr14 <- 2*(t9-t5) ;
  D16MAC_AA_WW(xr13, xr10, xr6, xr14); // xr13 <- 1*(t8-t6)+3*(t10-t4) ; xr14 <- 2*(t9-t5)+4*(t11-t3) ;
  Q8ADDE_SS(xr5, xr4, xr1, xr6);  // xr5[31:16] <- t12-t2;  xr5[15:0] <- t13-t1;
                                  // xr6[31:16] <- t14-t0;  xr6[15:0] <- t15-lt;

  S32I2M(xr12, MUL_78); // xr12: 0x00070008 ;

  D16MAC_AA_WW(xr13, xr11, xr5, xr14); // xr13 <- 1*(t8-t6)+3*(t10-t4)+5*(t12-t2) ;
                                       // xr14 <- 2*(t9-t5)+4*(t11-t3)+6*(t13-t1) ;
  D16MAC_AA_WW(xr13, xr12, xr6, xr14); // xr13 <- 1*(t8-t6)+3*(t10-t4)+5*(t12-t2)+7*(t14-t0) ;
                                       // xr14 <- 2*(t9-t5)+4*(t11-t3)+6*(t13-t1)+8*(t15-lt) ;
  S32LDD(xr1, src_topleft, 0x0);          // xr1[31:24] <- src_topleft[3] (lt) ;
  S32LDD(xr2, src_left, 0x0); // xr2[31:24] <- src_topleft[stride+3] (l0) ;
  D32ADD_AA(xr15, xr13, xr14, xr0); // xr15 <- 1*(t8-t6)+3*(t10-t4)+5*(t12-t2)+7*(t14-t0)
                                    //       + 2*(t9-t5)+4*(t11-t3)+6*(t13-t1)+8*(t15-lt) ;
  //----- V, LOAD -----
  //  S32LDD(xr1, src_topleft, 0x0);          // xr1[31:24] <- src_topleft[3] (lt) ;
  //  S32LDIV(xr2, src_topleft, stride, 0x0); // xr2[31:24] <- src_topleft[stride+3] (l0) ;
  S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_topleft[2*stride+3] (l1) ;
  S32LDIV(xr8, src_left, MB_LUMA_EDGED_WIDTH, 0x0); // xr8[31:24] <- src_topleft[3*stride+3] (l2) ;
  S32SFL(xr5, xr2, xr1, xr0, ptn2);       // xr5[31:16] <- l0, lt ;
  S32SFL(xr6, xr8, xr3, xr0, ptn2);       // xr6[31:16] <- l2, l1 ;
  S32SFL(xr7, xr6, xr5, xr0, ptn3);       // xr7[31: 0] <- l2, l1, l0, lt ;

  S32LDIV(xr1, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32LDIV(xr8, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SFL(xr5, xr2, xr1, xr0, ptn2);
  S32SFL(xr6, xr8, xr3, xr0, ptn2);
  S32SFL(xr13, xr6, xr5, xr0, ptn3); // xr13[31:0] <- l6, l5, l4, l3 ;

  src_left += MB_LUMA_EDGED_WIDTH;

  S32LDIV(xr8, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32LDIV(xr1, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SFL(xr6, xr8, xr3, xr0, ptn2);
  S32SFL(xr5, xr2, xr1, xr0, ptn2);
  S32SFL(xr14, xr6, xr5, xr0, ptn3); // xr14[31:0] <- l8, l9, l10, l11 ;

  S32LDIV(xr8, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32LDIV(xr3, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32LDIV(xr2, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32LDIV(xr1, src_left, MB_LUMA_EDGED_WIDTH, 0x0);
  S32SFL(xr6, xr8, xr3, xr0, ptn2);
  S32SFL(xr5, xr2, xr1, xr0, ptn2);
  S32SFL(xr1, xr6, xr5, xr0, ptn3); // xr1[31: 0] <- l12, l13, l14, l15 ;

  //----- V, SUM -----
  Q8ADDE_SS(xr5, xr14, xr13, xr6);
  Q8ADDE_SS(xr2, xr1, xr7, xr3);

  D16MUL_WW(xr13, xr9, xr5, xr14);
  D16MAC_AA_WW(xr13, xr10, xr6, xr14);

  D16MAC_AA_WW(xr13, xr11, xr2, xr14);
  D16MAC_AA_WW(xr13, xr12, xr3, xr14);

  D32SLR(xr2, xr11, xr12, xr3, 0x8); // xr2: 0x00000500 ;  xr3: 0x00000700 ;
  D32SLR(xr11, xr2, xr3, xr12, 0x8); //xr11: 0x00000005 ; xr12: 0x00000007 ;

  D32ADD_AA(xr14, xr13, xr14, xr0); // xr14 <- 1*(l8-l6)+3*(l10-l4)+5*(l12-l2)+7*(l14-l0)
                                    //       + 2*(l9-l5)+4*(l11-l3)+6*(l13-l1)+8*(l15-lt) ;
  //----- P, CAL -----
  //  D32SLR(xr2, xr11, xr12, xr3, 0x8); // xr2: 0x00000500 ;  xr3: 0x00000700 ;
  //  D32SLR(xr11, xr2, xr3, xr12, 0x8); //xr11: 0x00000005 ; xr12: 0x00000007 ;

  D16MUL_WW(xr0, xr15, xr11, xr2); // xr2: 5*H ;
  D16MUL_WW(xr0, xr14, xr11, xr3); // xr3: 5*V ;

  D32SLR(xr8, xr11, xr0, xr0, 0x2); // xr8: 0x00000001 ;
  D32SLL(xr13, xr8, xr0, xr0, 0x5); //xr13: 0x00000020 ;

  Q8ACCE_AA(xr0, xr1, xr4, xr8);   // xr8[15:0]: src1[0] + src2[16] + 1

  D32ADD_AA(xr5, xr2, xr13, xr0); // xr5: 5*H+32 ;
  D32ADD_AA(xr6, xr3, xr13, xr0); // xr6: 5*V+32 ;

  D32SLR(xr2, xr5, xr6, xr3, 0x6); // xr2: ( 5*H+32 ) >> 6 ;  xr3: ( 5*V+32 ) >> 6 ;

  //  Q8ACCE_AA(xr0, xr1, xr4, xr8);   // xr8[15:0]: src1[0] + src2[16] + 1
  D32SLL(xr5, xr8, xr0, xr0, 0x4); // xr5[15:0]: 16*(src1[0] + src2[16] + 1)

  Q16ADD_AA_WW(xr7, xr2, xr3, xr0); // xr7: V+H
  //  S32NOR(xr0, xr0, xr0); // idle
  S32I2M(xr4, MUX_H16); // xr4: 0x0000ffff ;
  D16MUL_WW(xr0, xr7, xr12, xr8);   // xr8: 7*(V+H)

  S32SFL(xr0, xr3, xr3, xr14, ptn3); // xr14[31:16]: V ;  xr14[15:0]: V ;
  D32SLL(xr7, xr2, xr0, xr0, 0x1);

  Q16ADD_SS_WW(xr9, xr5, xr8, xr0); // xr9: 16*(src1[0] + src2[16] + 1) - 7*(V+H)
  S32SFL(xr0, xr9, xr9, xr5, ptn3); // xr5[31:16]: a ;  xr5[15:0]: a ;
  //  S32SFL(xr0, xr3, xr3, xr14, ptn3); // xr14[31:16]: V ;  xr14[15:0]: V ;
  //  D32SLL(xr7, xr2, xr0, xr0, 0x1);
  S32SFL(xr0, xr7, xr7, xr8, ptn3);  // xr8[31:16]: 2H ;  xr8[15:0]: 2H ;

  S32AND(xr2, xr4, xr2);

  Q16ADD_AA_WW(xr15, xr5, xr2, xr0); // xr15[31:16]: a ;  xr15[15:0]: a + H ;

  dst -= MB_LUMA_EDGED_WIDTH;
  //----- SRC, STORE -----
  for (i=0; i<16; i++) {
    Q16ADD_AA_WW(xr1, xr15, xr8, xr0);
    Q16ADD_AA_WW(xr2, xr1, xr8, xr0);
    Q16SAR(xr9, xr15, xr1, xr1, 0x5);
    Q16ADD_AA_WW(xr3, xr2, xr8, xr0);
    Q16SAT(xr10, xr9, xr1);
    Q16ADD_AA_WW(xr4, xr3, xr8, xr0);
    Q16SAR(xr2, xr2, xr3, xr3, 0x5);
    Q16ADD_AA_WW(xr5, xr4, xr8, xr0);
    Q16SAT(xr11, xr2, xr3);
    Q16ADD_AA_WW(xr6, xr5, xr8, xr0);
    Q16SAR(xr4, xr4, xr5, xr5, 0x5);
    Q16ADD_AA_WW(xr7, xr6, xr8, xr0);
    Q16SAR(xr6, xr6, xr7, xr7, 0x5);
    Q16SAT(xr12, xr4, xr5);
    Q16SAT(xr13, xr6, xr7);

    S32SDIVR(xr10, dst, MB_LUMA_EDGED_WIDTH, 0x0);
    S32STDR(xr11, dst, 0x4);
    S32STDR(xr12, dst, 0x8);
    //    S32STDR(xr13, dst, 0xc);

    Q16ADD_AA_WW(xr15, xr15, xr14, xr0);

    S32STDR(xr13, dst, 0xc);
  }

}
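
The arithmetic above is the standard H.264 Intra_16x16 plane fit, evaluated incrementally: xr15 carries the running pair (a, a + H), xr8 adds 2H per two-pixel step, xr14 adds V per line, and Q16SAR/Q16SAT perform the >> 5 and the clip; the code folds b*(x-7) + c*(y-7) into its starting value xr9. A scalar sketch under the spec formula (hypothetical name; t[0..15] above, l[0..15] left, lt the corner):

// Hypothetical scalar reference for Intra_16x16 plane prediction.
static void pred16x16_plane_ref(uint8_t *dst, int stride, uint8_t lt,
                                const uint8_t *t, const uint8_t *l)
{
  int H = 0, V = 0;
  for (int i = 1; i <= 8; i++) {
    H += i * (t[7+i] - (i == 8 ? lt : t[7-i]));   // i = 8 reaches back to lt
    V += i * (l[7+i] - (i == 8 ? lt : l[7-i]));
  }
  int b = (5*H + 32) >> 6;
  int c = (5*V + 32) >> 6;
  int a = 16 * (l[15] + t[15] + 1);               // +16 rounding folded in
  for (int y = 0; y < 16; y++)
    for (int x = 0; x < 16; x++) {
      int v = (a + b*(x - 7) + c*(y - 7)) >> 5;
      dst[y*stride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
}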
Example 12
// MODE 3
static void pred8x8_plane_mxu(uint8_t *dst, uint8_t *src, uint8_t *top){
  unsigned int i;
  uint8_t *src_top;  // top address
  uint8_t *src_topleft, *src_left;  // left address
  src_top = top;
  src_topleft = src_top - 0x1c;
  src_left = src - 0x4;

  //----- H, LOAD -----
  S32LDD(xr1, src_top, -0x1c);  // xr1 <- src_top[-4];  xr1: lt, 0, 0, 0 ;
  S32LDD(xr3, src_top, 0x0);   // xr3 <- src_top[0] ;  xr3: t3, t2, t1, t0 ;
  S32LDDR(xr2, src_top, 0x4);  // xr2 <- src_top[4] ;  xr2: t4, t5, t6, t7 ;
  S32ALNI(xr1, xr3, xr1, ptn1);//                      xr1: t2, t1, t0, lt ;
  S32I2M(xr8, MUL_12); // xr8: 0x00010002 ;
  S32I2M(xr9, MUL_34); // xr9: 0x00030004 ;
  //----- H, SUM -----
  Q8ADDE_SS(xr3, xr2, xr1, xr4);  // xr3[31:16] <- t4-t2 ;  xr3[15:0] <- t5-t1 ;
                                  // xr4[31:16] <- t6-t0 ;  xr4[15:0] <- t7-lt;

  S32LDD(xr1, src_topleft, 0x0);          // xr1[31:24] <- src_topleft[3] (lt) ;

  D16MUL_WW(xr5, xr8, xr3, xr6);    // xr5 <- 1*(t4-t2) ;  xr6 <- 2*(t5-t1) ;
  D16MAC_AA_WW(xr5, xr9, xr4, xr6); // xr5 <- 1*(t4-t2)+3*(t6-t0) ; xr6 <- 2*(t5-t1)+4*(t7-lt) ;

  S32LDD(xr12, src_left, 0x0);//xr12[31:24] <- src_topleft[stride+3] (l0) ;
  S32LDIV(xr3, src_left, MB_CHROM_EDGED_WIDTH, 0x0); // xr3[31:24] <- src_topleft[2*stride+3] (l1) ;

  D32ADD_AA(xr7, xr5, xr6, xr0); // xr7 <- 1*(t4-t2)+3*(t6-t0)+2*(t5-t1)+4*(t7-lt) ;
  //----- V, LOAD -----
  //  S32LDD(xr1, src_topleft, 0x0);          // xr1[31:24] <- src_topleft[3] (lt) ;
  //  S32LDIV(xr12, src_topleft, stride, 0x0);//xr12[31:24] <- src_topleft[stride+3] (l0) ;
  //  S32LDIV(xr3, src_topleft, stride, 0x0); // xr3[31:24] <- src_topleft[2*stride+3] (l1) ;
  S32LDIV(xr4, src_left, MB_CHROM_EDGED_WIDTH, 0x0); // xr4[31:24] <- src_topleft[3*stride+3] (l2) ;
  S32SFL(xr5, xr12, xr1, xr0, ptn2);      // xr5[31:16] <- l0, lt ;
  S32SFL(xr6, xr4, xr3, xr0, ptn2);       // xr6[31:16] <- l2, l1 ;
  S32SFL(xr10, xr6, xr5, xr0, ptn3);      // xr10[31:0] <- l2, l1, l0, lt ;
  src_left += MB_CHROM_EDGED_WIDTH;
  S32LDIV(xr4, src_left, MB_CHROM_EDGED_WIDTH, 0x0);
  S32LDIV(xr3, src_left, MB_CHROM_EDGED_WIDTH, 0x0);
  S32LDIV(xr12, src_left, MB_CHROM_EDGED_WIDTH, 0x0);
  S32LDIV(xr1, src_left, MB_CHROM_EDGED_WIDTH, 0x0);
  S32SFL(xr6, xr4, xr3, xr0, ptn2);
  S32SFL(xr5, xr12, xr1, xr0, ptn2);
  S32SFL(xr11, xr6, xr5, xr0, ptn3); // xr11[31:0] <- l4, l5, l6, l7 ;
  //----- V, SUM -----
  Q8ADDE_SS(xr3, xr11, xr10, xr4);

  S32LUI(xr1, 0x1, ptn0); // xr1[31:0]: 0x00000001 ;

  D16MUL_WW(xr5, xr8, xr3, xr6);
  D16MAC_AA_WW(xr5, xr9, xr4, xr6);

  D32ADD_AA(xr13, xr5, xr6, xr0); // xr13 <- 1*(l4-l2)+3*(l6-l0)+2*(l5-l1)+4*(l7-lt) ;

  //----- P, CAL ----- useful XRs:xr13, xr7, xr2, xr11;
  //  S32LUI(xr1, 0x1, ptn0); // xr1[31:0]: 0x00000001 ;
  D32SLL(xr5, xr1, xr1, xr6, 0x4); // xr5: 0x00000010;  xr6: 0x00000010; 
  D32SLL(xr3, xr13, xr7, xr4, 0x4);
  D32ACC_AA(xr5, xr13, xr3, xr0); // xr5: 17*V+16
  D32ACC_AA(xr6, xr7, xr4, xr0);  // xr6: 17*H+16

  Q8ACCE_AA(xr0, xr2, xr11, xr1);  // xr1[15:0]: src1[0] + src2[8] + 1

  D32SLR(xr8, xr5, xr6, xr9, 0x5); // xr8: (17*V+16) >> 5 ;  xr9: (17*H+16) >> 5 ;

  //  Q8ACCE_AA(xr0, xr2, xr11, xr1);  // xr1[15:0]: src1[0] + src2[8] + 1
  D32SLL(xr2, xr1, xr0, xr0, 0x4); // xr2[15:0]: 16*(src1[0] + src2[8] + 1)

  Q16ADD_AA_WW(xr7, xr8, xr9, xr0); // xr7: V+H
  S32I2M(xr4, MUX_H16); // xr4: 0x0000ffff ;
  D32SLL(xr12, xr7, xr0, xr0, 0x1);
  D32ADD_AA(xr5, xr12, xr7, xr0);   // xr5: 3*(V+H)
  //  S32LUI(xr12, 0x3, ptn0); // xr12[31:0]: 0x00000003 ;
  //  D16MUL_WW(xr0, xr7, xr12, xr5);   // xr5: 3*(V+H)

  //  S32I2M(xr4, MUX_H16); // xr4: 0x0000ffff ;

  Q16ADD_SS_WW(xr6, xr2, xr5, xr0); // xr6: 16*(src1[0] + src2[8] + 1) - 3*(V+H)

  //  S32I2M(xr4, MUX_H16); // xr4: 0x0000ffff ;

  S32SFL(xr0, xr8, xr8, xr14, ptn3);// xr14[31:16]: V ;  xr14[15:0]: V ;
  S32SFL(xr0, xr6, xr6, xr5, ptn3); // xr5[31:16]: a ;  xr5[15:0]: a ;
  D32SLL(xr7, xr9, xr0, xr0, 0x1);
  S32SFL(xr0, xr7, xr7, xr8, ptn3); // xr8[31:16]: 2H ;  xr8[15:0]: 2H ;

  //  S32I2M(xr4, MUX_H16); // xr4: 0x0000ffff ;
  S32AND(xr9, xr4, xr9);

  Q16ADD_AA_WW(xr15, xr5, xr9, xr0);   // xr15[31:16]: a ;  xr15[15:0]: a + H ;

  dst -= MB_CHROM_EDGED_WIDTH;
  //----- SRC, STORE -----
  for (i=0; i<8; i++) {
    Q16ADD_AA_WW(xr1, xr15, xr8, xr0);
    Q16ADD_AA_WW(xr2, xr1, xr8, xr0);
    Q16SAR(xr9, xr15, xr1, xr1, 0x5);
    Q16ADD_AA_WW(xr3, xr2, xr8, xr0);

    Q16SAT(xr10, xr9, xr1);
    //    Q16SAR(xr9, xr15, xr1, xr1, 0x5);
    Q16SAR(xr2, xr2, xr3, xr3, 0x5);

    //    Q16SAT(xr10, xr9, xr1);
    Q16SAT(xr11, xr2, xr3);

    S32SDIVR(xr10, dst, MB_CHROM_EDGED_WIDTH, 0x0);

    Q16ADD_AA_WW(xr15, xr15, xr14, xr0);

    S32STDR(xr11, dst, 0x4);
  }

}
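
The chroma-sized plane fit follows the same pattern with 4 taps per direction and (17*H + 16) >> 5 scaling, exactly the xr5/xr6 terms above. A scalar sketch (hypothetical name; t[0..7] above, l[0..7] left, lt the corner):

// Hypothetical scalar reference for 8x8 chroma plane prediction.
static void pred8x8_plane_ref(uint8_t *dst, int stride, uint8_t lt,
                              const uint8_t *t, const uint8_t *l)
{
  int H = 0, V = 0;
  for (int i = 1; i <= 4; i++) {
    H += i * (t[3+i] - (i == 4 ? lt : t[3-i]));   // i = 4 reaches back to lt
    V += i * (l[3+i] - (i == 4 ? lt : l[3-i]));
  }
  int b = (17*H + 16) >> 5;
  int c = (17*V + 16) >> 5;
  int a = 16 * (l[7] + t[7] + 1);                 // +16 rounding folded in
  for (int y = 0; y < 8; y++)
    for (int x = 0; x < 8; x++) {
      int v = (a + b*(x - 3) + c*(y - 3)) >> 5;
      dst[y*stride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
}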