Example #1
0
/**
 * 8x8 VP3 inverse DCT whose result is combined with the pixels at src,
 * written with MXU media-register macros (xr0..xr15).
 *
 * The function works in three passes:
 *   1. Row pass: the first idct_row rows of input are transformed in place.
 *      All-zero rows are skipped; rows with only their first coefficient
 *      set take a short path.
 *   2. Column pass: 4 iterations, each handling one 32-bit word (two
 *      packed coefficients) per row, transformed in place with a rounding
 *      constant of 8 and a final shift by 4.
 *   3. Add pass: for 8 rows, 8 bytes are loaded from src, combined with the
 *      transformed coefficients, and 8 bytes are stored back to src.
 *
 * @param src       pixel buffer; read and overwritten, 8 bytes per row for
 *                  8 rows.
 * @param stride    step between consecutive rows of src, as handed to
 *                  S32LDIV (presumably a byte count -- confirm).
 * @param input     coefficient block; overwritten by passes 1 and 2.
 * @param idct_row  number of leading rows processed by the row pass. Rows
 *                  beyond it are not touched by pass 1 but are still read
 *                  by pass 2 -- presumably the caller guarantees they are
 *                  zero; TODO confirm.
 *
 * NOTE(review): whirl_idct, wxr5 and wxr6 are not declared in this function
 * and must come from elsewhere in the file. The MXU macro definitions are
 * also not visible here; comments on what an individual macro does follow
 * the macro names and the original annotations and should be confirmed
 * against the MXU manual. Offsets in comments assume DCTELEM is 16 bits wide.
 */
static void ff_vp3_idct_add_mxu(uint8_t *src, int stride, DCTELEM *input, uint8_t idct_row)
{
    int i;
    DCTELEM *blk;
    /* Address of the coefficient table kept as an integer for the load
     * macros; the cast to int32_t assumes 32-bit pointers. */
    int32_t wf = (int32_t)whirl_idct;

    /* Load the six table words. The (hi, lo) labels are the ones implied by
     * the product annotations in the row pass below. */
    S32LDD(xr5, wf, 0x0);         // xr5 (c4, c2)
    S32LDD(xr6, wf, 0x4);         // xr6 (c4, c6)
    S32LDD(xr7, wf, 0x8);         // xr7 (c1, c3)
    S32LDD(xr8, wf, 0xc);         // xr8 (c7, c5)
    S32LDD(xr9, wf, 0x10);        // xr9 (c3, c7)
    S32LDD(xr10,wf, 0x14);        // xr10(c5, c1)
    /* Start one row (8 elements) before the block: the stepping load at the
     * top of the loop moves blk forward by 0x10 before it reads. */
    blk = input - 8;
    /* Inverse DCT on the rows now */
    for (i=0; i<idct_row; i++) {
        S32LDI(xr1, blk, 0x10);       //  xr1 (x4, x0)
	S32LDD(xr2, blk, 0x4);        //  xr2 (x7, x3)
	S32LDD(xr3, blk, 0x8);        //  xr3 (x6, x1)
	S32LDD(xr4, blk, 0xc);        //  xr4 (x5, x2)
	/* OR the four words together to test the whole row for zero. */
	S32OR(xr12, xr2,xr3);
	S32OR(xr11,xr12,xr4);
	S32OR(xr12,xr11,xr1);
	if (S32M2I(xr12) == 0) {
	    continue;            //blk[0]= blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=0
	}
	/* Split xr1 so that xr13 holds the first coefficient alone and xr11
	 * ends up covering every other coefficient of the row (per the note
	 * on the continue below; depends on the ptn3 shuffle -- confirm). */
	S32SFL(xr12,xr0,xr1,xr13,ptn3);
	S32OR(xr11,xr11,xr12);
	if (S32M2I(xr11) == 0 && S32M2I(xr13) != 0) {
	    /* Short path: scale the single coefficient by a table entry from
	     * xr5, shift by 15, replicate it and fill the whole row with it. */
	    D16MUL_HW(xr0,xr5,xr13,xr13);
	    D32SAR(xr0,xr0,xr13,xr13,15);
	    S32SFL(xr0,xr13,xr13,xr13,ptn3);
	    S32STD(xr13,blk, 0x0);
	    S32STD(xr13,blk, 0x4);
	    S32STD(xr13,blk, 0x8);
	    S32STD(xr13,blk, 0xc);
	    continue;            //blk[0]!=0, and blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=0
	}

	/* General path: regroup the inputs into odd/even pairs. */
	S32SFL(xr1,xr1,xr2,xr2, ptn3);  //xr1:s1, s3, xr2: s0, s2
	S32SFL(xr3,xr3,xr4,xr4, ptn3);  //xr3:s5, s7, xr4: s4, s6

	/* Even part. */
	D16MUL_WW(xr11, xr2, xr5, xr12);//xr11: s0*c4, xr12: s2*c2
	D16MAC_AA_WW(xr11,xr4,xr6,xr12);//xr11: s0*c4+s4*c4, xr12: s2*c2+s6*c6

	D16MUL_WW(xr13, xr2, xr6, xr14);//xr13: s0*c4, xr14: s2*c6
	D16MAC_SS_WW(xr13,xr4,xr5,xr14);//xr13: s0*c4 - s4*c4, xr14: s2*c6-s6*c2

	/* Odd part, first two sums. */
	D16MUL_HW(xr2, xr1, xr7, xr4);  //xr2: s1*c1, xr4: s1*c3 
	D16MAC_AS_LW(xr2,xr1,xr9,xr4);  //xr2: s1*c1+s3*c3, xr4: s1*c3-s3*c7
	D16MAC_AS_HW(xr2,xr3,xr10,xr4); //xr2: s1*c1+s3*c3+s5*c5,
                                      // xr4: s1*c3-s3*c7-s5*c1
	D16MAC_AS_LW(xr2,xr3,xr8,xr4);  //xr2: s1*c1+s3*c3+s5*c5+s7*c7,
                                      //xr4: s1*c3-s3*c7-s5*c1-s7*c5
	/* Each D32SAR/S32SFL pair shifts two 32-bit sums right by 15 and
	 * repacks them as two 16-bit lanes in one register. */
	D32SAR(xr11, xr11,xr13,xr13,15);
	S32SFL(xr0, xr11,xr13,xr11,ptn3);
	D32SAR(xr12,xr12,xr14,xr14,15);
	S32SFL(xr0, xr12,xr14,xr12,ptn3);
	D32SAR(xr2, xr2,xr4,xr4,15);
	S32SFL(xr0, xr2,xr4,xr2,ptn3);

	/* Odd part, remaining two sums. */
	D16MUL_HW(xr4, xr1, xr8, xr15);     //xr4: s1*c7, xr15:s1*c5
	D16MAC_SS_LW(xr4,xr1,xr10,xr15);    //xr4: s1*c7-s3*c5, xr15: s1*c5-s3*c1
	D16MAC_AA_HW(xr4,xr3,xr9,xr15);     //xr4: s1*c7-s3*c5+s5*c3, xr15: s1*c5-s3*c1+s5*c7
	D16MAC_SA_LW(xr4,xr3,xr7,xr15);     //xr4: s1*c7-s3*c5+s5*c3-s7*c1
	                                    //xr15: s1*c5-s3*c1+s5*c7+s7*c3
	Q16ADD_AS_WW(xr11,xr11,xr12,xr12);  //xr11: rnd(s0*c4+s4*c4)>>15+rnd(s2*c2+s6*c6)>>15
                                          //      rnd(s0*c4-s4*c4)>>15+rnd(s2*c6-s6*c2)>>15
                                          //xr12: rnd(s0*c4+s4*c4)>>15-rnd(s2*c2+s6*c6)>>15
                                          //      rnd(s0*c4-s4*c4)>>15-rnd(s2*c6-s6*c2)>>15
	D32SAR(xr15,xr15,xr4,xr4,15);
	S32SFL(xr0,xr15,xr4,xr15,ptn3);
	Q16ADD_AS_WW(xr11, xr11, xr2, xr2);
              //xr11: rnd(s0*c4+s4*c4)>>15+rnd(s2*c2+s6*c6)>>15 + rnd(s1*c1+s3*c3+s5*c5+s7*c7)>>15
              //    : rnd(s0*c4-s4*c4)>>15+rnd(s2*c6-s6*c2)>>15 + rnd(s1*c3-s3*c7-s5*c1-s7*c5)>>15
              //xr2: rnd(s0*c4+s4*c4)>>15+rnd(s2*c2+s6*c6)>>15 - rnd(s1*c1+s3*c3+s5*c5+s7*c7)>>15
              //   : rnd(s0*c4-s4*c4)>>15+rnd(s2*c6-s6*c2)>>15 - rnd(s1*c3-s3*c7-s5*c1-s7*c5)>>15

	Q16ADD_AS_XW(xr12, xr12, xr15, xr15);
              //xr12: rnd(s0*c4+s4*c4)>>15-rnd(s2*c2+s6*c6)>>15+rnd(s1*c5-s3*c1+s5*c7+s7*c3)>>15
              //    : rnd(s0*c4-s4*c4)>>15+rnd(s2*c6-s6*c2)>>15+rnd(s1*c7-s3*c5+s5*c3-s7*c1)>>15
              //xr15: rnd(s0*c4+s4*c4)>>15-rnd(s2*c2+s6*c6)>>15-rnd(s1*c5-s3*c1+s5*c7+s7*c3)>>15
              //    : rnd(s0*c4-s4*c4)>>15+rnd(s2*c6-s6*c2)>>15-rnd(s1*c7-s3*c5+s5*c3-s7*c1)>>15

	/* Two shuffles put the results into storage order for the first two
	 * words of the row (NOTE(review): the annotations below were written
	 * with >>16 although the shifts above are by 15). */
	S32SFL(xr11,xr11,xr12,xr12, ptn3);
              //xr11: rnd(s0*c4+s4*c4)>>15+rnd(s2*c2+s6*c6)>>15 + rnd(s1*c1+s3*c3+s5*c5+s7*c7)>>15
              //    : rnd(s0*c4+s4*c4)>>15-rnd(s2*c2+s6*c6)>>15+rnd(s1*c5-s3*c1+s5*c7+s7*c3)>>15
              //xr12: rnd(s0*c4-s4*c4)>>15+rnd(s2*c6-s6*c2)>>15 + rnd(s1*c3-s3*c7-s5*c1-s7*c5)>>15
              //    : rnd(s0*c4-s4*c4)>>15+rnd(s2*c6-s6*c2)>>15+rnd(s1*c7-s3*c5+s5*c3-s7*c1)>>15
	S32SFL(xr12,xr12,xr11,xr11, ptn3);

              //xr12: rnd(s0*c4-s4*c4)>>16+rnd(s2*c6-s6*c2)>>16 + rnd(s1*c3-s3*c7-s5*c1-s7*c5)>>16
              //    : rnd(s0*c4+s4*c4)>>16+rnd(s2*c2+s6*c6)>>16 + rnd(s1*c1+s3*c3+s5*c5+s7*c7)>>16
              //xr11: rnd(s0*c4-s4*c4)>>16+rnd(s2*c6-s6*c2)>>16+rnd(s1*c7-s3*c5+s5*c3-s7*c1)>>16
              //    : rnd(s0*c4+s4*c4)>>16-rnd(s2*c2+s6*c6)>>16+rnd(s1*c5-s3*c1+s5*c7+s7*c3)>>16
	/* Write the transformed row back over the input row. */
	S32STD(xr12, blk, 0x0);
	S32STD(xr11, blk, 0x4);
	S32STD(xr15, blk, 0x8);
	S32STD(xr2, blk, 0xc);
    }
      
    /* Column pass. blk starts one 32-bit word before the block because the
     * stepping load below advances it by 4 before reading; each iteration
     * therefore handles the next pair of packed columns. Row n of the pair
     * sits at offset n*0x10. In the annotations, "ssN" and "sN" are the two
     * 16-bit lanes of row N. */
    blk = input - 2;
    for (i=0; i<4; i++)               /* idct columns */
    {
        /* xr5 and xr6 are overwritten later in this loop body (used as
         * scratch destinations), so they are reloaded every iteration. */
        S32I2M(xr5,wxr5);        //xr5: c4 , c2
	S32I2M(xr6,wxr6);        //xr6: c4 , c6
	S32LDI(xr1, blk, 0x4);   //xr1: ss0, s0
	S32LDD(xr3, blk, 0x20);  //xr3: ss2, s2
	S32LDD(xr11, blk, 0x40); //xr11: ss4, s4
	S32LDD(xr13, blk, 0x60); //xr13: ss6, s6

	D16MUL_HW(xr15, xr5, xr1, xr2);    //xr15: ss0*c4, xr2: s0*c4
	D16MAC_AA_HW(xr15,xr5,xr11,xr2);   //xr15: ss0*c4+ss4*c4, xr2: s0*c4+s4*c4
	D16MUL_LW(xr10, xr5, xr3, xr9);    //xr10: ss2*c2, xr9: s2*c2
	D16MAC_AA_LW(xr10,xr6,xr13,xr9);   //xr10: ss2*c2+ss6*c6, xr9: s2*c2+s6*c6
	D32SAR(xr15,xr15,xr2,xr2,15);      
	S32SFL(xr0,xr15,xr2,xr15,ptn3);    //xr15: (ss0*c4+ss4*c4)>>15
	                                   //    : (s0*c4+s4*c4)>>15
	D32SAR(xr10,xr10,xr9,xr9,15);      
	S32SFL(xr0,xr10,xr9,xr10,ptn3);    //xr10: (ss2*c2+ss6*c6)>>15
	                                   //    : (s2*c2+s6*c6)>>15

	S32LDD(xr2, blk, 0x10);            //xr2: ss1, s1
	S32LDD(xr4, blk, 0x30);            //xr4: ss3, s3
	Q16ADD_AS_WW(xr15,xr15,xr10,xr9);  //xr15: rnd(ss0*c4+ss4*c4)>>15+rnd(ss2*c2+ss6*c6)>>15
                                         //    :rnd(s0*c4+s4*c4)>>15 + rnd(s2*c2 + s6*c6)>>15
                                         //xr9: rnd(ss0*c4+ss4*c4)>>15 - rnd(ss2*c2+ss6*c6)>>15
                                         //   : rnd(s0*c4+s4*c4)>>15 - rnd(s2*c2 + s6*c6)>>15
	D16MUL_HW(xr10, xr5, xr1, xr1);    //xr10: ss0*c4, xr1: s0*c4
	D16MAC_SS_HW(xr10,xr5,xr11,xr1);   //xr10: ss0*c4-ss4*c4, xr1: s0*c4 - s4*c4
	D16MUL_LW(xr11, xr6, xr3, xr12);    //xr11: ss2*c6, xr12: s2*c6
	D16MAC_SS_LW(xr11,xr5,xr13,xr12);   //xr11: ss2*c6-ss6*c2, xr12: s2*c6-s6*c2
	D32SAR(xr10,xr10,xr1,xr1,15);
	S32SFL(xr0,xr10,xr1,xr10,ptn3);    //xr10: (ss0*c4-ss4*c4)>>15
	                                   //    : (s0*c4 - s4*c4)>>15
	D32SAR(xr11,xr11,xr12,xr12,15);      
	S32SFL(xr0,xr11,xr12,xr11,ptn3);    //xr11:(ss2*c6-ss6*c2)>>15
                                         //    :(s2*c6-s6*c2)>>15

	S32LDD(xr12, blk, 0x50);           //xr12: ss5, s5
	S32LDD(xr14, blk, 0x70);           //xr14: ss7, s7
	Q16ADD_AS_WW(xr10,xr10,xr11,xr1);  //xr10: rnd(ss0*c4-ss4*c4)>>15)+rnd(ss2*c6-ss6*c2)>>15
                                         //    : rnd(s0*c4 - s4*c4)>>15 +rnd(s2*c6 - s6*c2)>>15
                                         //xr1 : rnd(ss0*c4-ss4*c4)>>15-rnd(ss2*c6-ss6*c2)>>15
                                         //    : rnd(s0*c4 - s4*c4)>>15-rnd(s2*c6 - s6*c2)>>15

	/* Odd part: four sums over rows 1, 3, 5, 7. From here on xr5 and xr6
	 * no longer hold table values. */
	D16MUL_HW(xr11, xr7, xr2, xr13);   //xr11: ss1*c1, xr13: s1*c1
	D16MAC_AA_LW(xr11,xr7,xr4,xr13);   //xr11: ss1*c1+ss3*c3, xr13: s1*c1+s3*c3
	D16MAC_AA_LW(xr11,xr8,xr12,xr13);  //xr11: ss1*c1+ss3*c3+ss5*c5
	                                   //xr13: s1*c1+s3*c3+s5*c5
	D16MAC_AA_HW(xr11,xr8,xr14,xr13);  //xr11: ss1*c1+ss3*c3+ss5*c5+ss7*c7
                                         //xr13: s1*c1+s3*c3+s5*c5+s7*c7
	D16MUL_LW(xr3, xr7, xr2, xr5);    //xr3: ss1*c3, xr5: s1*c3
	D16MAC_SS_HW(xr3,xr8,xr4,xr5);    //xr3: ss1*c3-ss3*c7, xr5: s1*c3-s3*c7
	D16MAC_SS_HW(xr3,xr7,xr12,xr5);   //xr3: ss1*c3-ss3*c7-ss5*c1
                                         //xr5: s1*c3-s3*c7-s5*c1
	D16MAC_SS_LW(xr3,xr8,xr14,xr5);   //xr3: ss1*c3-ss3*c7-ss5*c1-ss7*c5
                                         //xr5: s1*c3-s3*c7-s5*c1-s7*c5
	D32SAR(xr11,xr11,xr13,xr13,15); 
	S32SFL(xr0,xr11,xr13,xr11,ptn3);   //xr11: (ss1*c1+ss3*c3+ss5*c5+ss7*c7)>>15
	                                   //    : (s1*c1+s3*c3+s5*c5+s7*c7)>>15
	D32SAR(xr3,xr3,xr5,xr5,15);
	S32SFL(xr0,xr3,xr5,xr3,ptn3);     //xr3: (ss1*c3-ss3*c7-ss5*c1-ss7*c5)>>15
                                         //   : (s1*c3-s3*c7-s5*c1-s7*c5)>>15
	D16MUL_LW(xr5, xr8, xr2, xr13);    //xr5: ss1*c5, xr13:s1*c5
	D16MAC_SS_HW(xr5,xr7,xr4,xr13);    //xr5: ss1*c5-ss3*c1, xr13:s1*c5-s3*c1
	D16MAC_AA_HW(xr5,xr8,xr12,xr13);   //xr5: ss1*c5-ss3*c1+ss5*c7
                                         //xr13: s1*c5 - s3*c1+ s5*c7
	D16MAC_AA_LW(xr5,xr7,xr14,xr13);   //xr5: ss1*c5-ss3*c1+ss5*c7+ss7*c3
                                         //xr13: s1*c5 - s3*c1+ s5*c7+ s7*c3
	D16MUL_HW(xr2, xr8, xr2, xr6);    //xr2: ss1*c7, xr6: s1*c7
	D16MAC_SS_LW(xr2,xr8,xr4,xr6);    //xr2: ss1*c7-ss3*c5, xr6: s1*c7-s3*c5
	D16MAC_AA_LW(xr2,xr7,xr12,xr6);   //xr2: ss1*c7-ss3*c5+ss5*c3
	                                  //xr6: s1*c7-s3*c5+s5*c3
	D16MAC_SS_HW(xr2,xr7,xr14,xr6);   //xr2: ss1*c7-ss3*c5+ss5*c3-ss7*c1
                                         //xr6: s1*c7-s3*c5+s5*c3-s7*c1
	D32SAR(xr5,xr5,xr13,xr13,15);
	S32SFL(xr0,xr5,xr13,xr5,ptn3);     //xr5: (ss1*c5-ss3*c1+ss5*c7+ss7*c3)>>15
	                                   //   : (s1*c5 - s3*c1+ s5*c7+ s7*c3)>>15
	D32SAR(xr2,xr2,xr6,xr6,15);
	S32SFL(xr0,xr2,xr6,xr2,ptn3);     //xr2:(ss1*c7-ss3*c5+ss5*c3-ss7*c1)>>15
                                         //   :(s1*c7-s3*c5+s5*c3-s7*c1)>>15

	/* Final butterflies. Shorthand for the packed values built above
	 * (each applies to both 16-bit lanes):
	 *   e0 = xr15 = (0/4 sum) + (2/6 term),  e3 = xr9 = (0/4 sum) - (2/6 term)
	 *   e1 = xr10 = (0/4 diff) + (2/6 term), e2 = xr1 = (0/4 diff) - (2/6 term)
	 *   o0 = xr11, o1 = xr3, o2 = xr5, o3 = xr2  (the four odd sums)
	 * Following the original annotations, Q16ADD_AS_WW leaves the sum in
	 * its first operand and the difference in its last operand. */
	S32I2M(xr4, 0x00080008);//round value 8;
	Q16ADD_AS_WW(xr15,xr15,xr11,xr11); //xr15: e0 + o0, xr11: e0 - o0
	Q16ADD_AS_WW(xr10,xr10,xr3,xr3);   //xr10: e1 + o1, xr3:  e1 - o1
	Q16ADD_AS_WW(xr1,xr1,xr5,xr5);     //xr1:  e2 + o2, xr5:  e2 - o2
	Q16ADD_AS_WW(xr9,xr9,xr2,xr2);     //xr9:  e3 + o3, xr2:  e3 - o3

	/* Add the rounding constant held in xr4 to all eight result registers,
	 * then shift every 16-bit lane right by 4.
	 * NOTE(review): operand roles of Q16ACCM_AA are inferred -- confirm. */
	Q16ACCM_AA(xr15,xr4,xr4,xr10);
	Q16ACCM_AA(xr11,xr4,xr4,xr1);
	Q16ACCM_AA(xr9,xr4,xr4,xr2);
	Q16ACCM_AA(xr5,xr4,xr4,xr3);
	Q16SAR(xr15,xr15,xr10,xr10,4);
	Q16SAR(xr11,xr11,xr1,xr1,4);
	Q16SAR(xr9,xr9,xr2,xr2,4);
	Q16SAR(xr5,xr5,xr3,xr3,4);
	
	/* Store rows 0..7 of this column pair; the "sum" results go to rows
	 * 0..3 and the matching "difference" results to rows 7..4. */
	S32STD(xr15, blk, 0x00);
	S32STD(xr10, blk, 0x10);
	S32STD(xr1, blk, 0x20);
	S32STD(xr9, blk, 0x30);
	S32STD(xr2, blk, 0x40);
	S32STD(xr5, blk, 0x50);
	S32STD(xr3, blk, 0x60);
	S32STD(xr11, blk, 0x70);
    }

    /* Add pass. Both pointers are backed up by one step because the
     * stepping loads at the top of the loop advance them before reading. */
    blk = input - 8;
    src -= stride;
    for (i=0; i<8; i++) {
        S32LDIV(xr1, src, stride, 0x0);  // step src by stride, load bytes 0..3 of the row
	S32LDI(xr3, blk, 0x10);          // step blk to the next row, load its first word
	S32LDD(xr4, blk, 0x4);
	/* Presumably widens the four bytes in xr1 and adds them onto the four
	 * 16-bit values held in xr4/xr3 -- confirm against the MXU manual. */
	Q8ACCE_AA(xr4, xr1, xr0, xr3);
	S32LDD(xr2, src, 0x4);           // bytes 4..7 of the row
	S32LDD(xr5, blk, 0x8);
	S32LDD(xr6, blk, 0xc);
	Q8ACCE_AA(xr6, xr2, xr0, xr5);
	/* Going by the macro name, Q16SAT saturates the 16-bit sums and packs
	 * them back into four bytes for the store. */
	Q16SAT(xr1, xr4, xr3);
	S32STD(xr1, src, 0x0);
	Q16SAT(xr2, xr6, xr5);
	S32STD(xr2, src, 0x4);
    }
}
Example #2
0
void Predict_16x16_C(const NEW_GMC_DATA * const This,
                     uint8_t *dst,
                     const uint8_t *src,
                     int dststride,
                     int srcstride,
                     int x,
                     int y,
                     int rounding)
{
    const int W       = This->sW;
    const int H	    = This->sH;
    const int rho     = 3 - This->accuracy;
    const int Rounder = ( (1<<7) - (rounding<<(2*rho)) ) << 16;

    const int dUx = This->dU[0];
    const int dVx = This->dV[0];
    const int dUy = This->dU[1];
    const int dVy = This->dV[1];

    int Uo = This->Uo + 16*(dUy*y + dUx*x);
    int Vo = This->Vo + 16*(dVy*y + dVx*x);

    int i, j;

    dst += 16;
    {
        unsigned int ri = 16;
        unsigned int rj = 16;
        int Offset;
        int u,v;

        uint8_t *srctmp;
        uint32_t tmpf = 0;

        S32I2M(xr15,dUx);
        S32I2M(xr14,dVx);
        S32I2M(xr13,dUy);
        S32I2M(xr12,dVy);

        S32I2M(xr11,Uo); // Uo 11
        S32I2M(xr10,Vo); // Vo 10

        S32I2M(xr5, Rounder);

        for (j = 16; j>0; --j)
        {
            D32SLL(xr9,xr11,xr10,xr8, 0x0); // U 9 ,V 8
            D32ASUM_AA(xr11,xr13,xr12,xr10); // += dUy; +=dVy;


            for (i = -16; i<0; ++i)
            {
                ri = 16;
                rj = 16;

                // ( U >> 16 ) ,( V >> 16 )
                D32SAR(xr7,xr9,xr8,xr6, 0x8);
                D32SAR(xr7,xr7,xr6,xr6, 0x8);

                D32SLLV(xr7,xr6, rho); // << rho

                u = S32M2I(xr7);
                v = S32M2I(xr6);
                D32ASUM_AA(xr9,xr15,xr14,xr8); // U += dUx; V += dVx;

                if (u > 0 && u <= W)
                {
                    ri = MTab[u&15];
                    Offset = u>>4;
                }
                else
                {
                    if (u > W)
                        Offset = W>>4;
                    else
                        Offset = 0;

                    ri = MTab[0];
                }

                if (v > 0 && v <= H)
                {
                    rj      = MTab[v&15];
                    Offset += (v>>4)*srcstride;
                }