예제 #1
0
/*
 * 4-point butterfly pass applied in place to the eight 32-bit words that
 * start at address `buf`, written with MXU media-register macros.
 *
 * buf: address of the data; it is converted to a 32-bit integer and used as
 *      the base address of every load and store below.
 *      NOTE(review): the parameter has no declared type (old-style
 *      definition, implicit int) — confirm what callers pass and give it an
 *      explicit type.
 *
 * Reading of the macros assumed by the comments below (inferred from their
 * names and usage, not defined in this file — TODO confirm against the MXU
 * manual):
 *   S32LDI(x, base, off)    base += off, then x = 32-bit word at base
 *   S32SDI(x, base, off)    base += off, then 32-bit word at base = x
 *   D32ADD_AS(a, b, c, d)   a = b + c and d = b - c
 *
 * Presumably the words are four complex samples stored as interleaved
 * re/im pairs (even words in the first half, odd words in the second half);
 * verify against the caller.
 *
 * Registers xr1..xr14 are overwritten.
 */
static inline void ifft4(buf)
{ 
   uint32_t tm4=(uint32_t)(buf);
   /* First half: words at byte offsets 0, 8, 16, 24. */
   S32LDI(xr1, tm4, 0);        
   S32LDI(xr2, tm4, 8);       
   S32LDI(xr3, tm4, 8);      
   S32LDI(xr4, tm4, 8);     
                        
   /* xr6 and xr8 (the differences) are kept for the cross terms further down. */
   D32ADD_AS(xr5, xr1, xr2, xr6);   
   D32ADD_AS(xr7, xr4, xr3, xr8);  
   D32ADD_AS(xr9, xr5, xr7, xr10);
                              
   /* tm4 is at offset 24 here: store at offset 16, then rewind to offset 0. */
   S32SDI(xr10,tm4,-8);         
   S32SDI(xr9,tm4,-16);        
                           
   /* Second half: words at byte offsets 4, 12, 20, 28. */
   S32LDI(xr1, tm4, 4);      
   S32LDI(xr2, tm4, 8);     
   S32LDI(xr3, tm4, 8);    
   S32LDI(xr4, tm4, 8);   
                      
   D32ADD_AS(xr5, xr1, xr2, xr9);   
   D32ADD_AS(xr7, xr3, xr4, xr10);  
   D32ADD_AS(xr1,xr5,xr7,xr2);      
                                 
   /* Cross terms: mix the saved differences of the first half (xr6, xr8)
      with the differences of the second half (xr10, xr9). */
   D32ADD_AS(xr11,xr6,xr10,xr12);   
   D32ADD_AS(xr13,xr9,xr8,xr14);    
                                 
   /* Write back walking downwards from offset 28: 28, 24, 20, 12, 8, 4.
      Offsets 0 and 16 were already written above. */
   S32SDI(xr14,tm4,0);              
   S32SDI(xr12,tm4,-4);             
   S32SDI(xr2, tm4,-4);             
   S32SDI(xr13,tm4,-8);             
   S32SDI(xr11,tm4,-4);             
   S32SDI(xr1, tm4,-4);     
         
} 
예제 #2
0
/*
 * Dequantise coefficients read from `block` and write the results as sixteen
 * 32-bit words starting at `dst`, using MXU media-register macros.
 *
 * block: source coefficients. The first load advances the biased address by
 *        8 bytes, which cancels the `block - 4` bias only if DCTELEM is
 *        2 bytes wide — presumably int16_t; TODO confirm.
 * dst:   destination; 16 words (4 groups of 4) are always written.
 * n:     selects how much of the source is processed:
 *          n == 1  one 32-bit word is loaded at source byte offsets
 *                  0, 8, 16 and 24; each output group gets two computed
 *                  words followed by two words taken from xr0;
 *          n == 2  source bytes 0..15 are processed into the first two
 *                  output groups; the last two groups are filled from xr0;
 *          else    source bytes 0..31 are processed into all four groups.
 *
 * Register contract: xr12 and xr13 are read but never written here, so the
 * caller must have loaded them beforehand — presumably the quantiser pair and
 * the rounding term; verify against the caller. xr0 is assumed to read as
 * zero (TODO confirm). xr1..xr10, xr14 and xr15 are overwritten.
 *
 * Reading of the macros assumed by the comments (inferred from names and
 * usage, not defined in this file — TODO confirm against the MXU manual):
 *   S32LDI / S32SDI   add the offset to the base, then load / store there
 *   S32LDD / S32STD   load / store at base + offset, base unchanged
 *   D16MUL_*          16x16 multiplies of the packed halves by xr12
 *   D32ASUM_AA        adds xr13 to both products
 *   D32SLR            shifts both results right by 4
 * NOTE(review): by its name D32SLR is a logical, not arithmetic, shift —
 * confirm that negative products are handled as intended downstream.
 *
 * The very first multiply of the block uses D16MUL_XW while every other one
 * uses D16MUL_LW; presumably this applies a different quantiser to the first
 * coefficient — confirm.
 *
 * The order of the macro invocations is deliberate (loads, multiplies and
 * stores are interleaved) and must not be rearranged without checking the
 * register dependencies.
 */
static void rv40_dequant4x4(DCTELEM *block,uint32_t *dst, int n)
{
  /* The update-form macros adjust the address before accessing memory, so
     both cursors start one step ahead of the data. The explicit casts make
     the pointer-to-integer conversion intentional. */
  uint32_t src = (uint32_t)(block - 4);
  uint32_t dst_t = (uint32_t)(dst - 4);

  if(n == 1){
    /* One word from each 8-byte step of the source. */
    S32LDI(xr1,src,0x8);
    S32LDI(xr2,src,0x8);
    S32LDI(xr7,src,0x8);
    S32LDI(xr8,src,0x8);

    D16MUL_XW(xr4,xr12,xr1,xr3);
    D16MUL_LW(xr10,xr12,xr7,xr9);
    D16MUL_LW(xr14,xr12,xr8,xr15);
    D32ASUM_AA(xr3,xr13,xr13,xr4);
    D16MUL_LW(xr6,xr12,xr2,xr5);
    D32SLR(xr3,xr3,xr4,xr4,4);
    D32ASUM_AA(xr5,xr13,xr13,xr6);
    D32ASUM_AA(xr9,xr13,xr13,xr10);
    D32SLR(xr5,xr5,xr6,xr6,4);
    D32SLR(xr9,xr9,xr10,xr10,4);
    D32ASUM_AA(xr15,xr13,xr13,xr14);

    /* Group 0: two results, two words from xr0. */
    S32SDI(xr3,dst_t,0x10);
    S32STD(xr4,dst_t,0x4);
    S32STD(xr0,dst_t,0x8);
    S32STD(xr0,dst_t,0xc);

    /* Group 1. */
    S32SDI(xr5,dst_t,0x10);
    S32STD(xr6,dst_t,0x4);
    S32STD(xr0,dst_t,0x8);
    S32STD(xr0,dst_t,0xc);

    /* Group 2; the shift for group 3 is issued first. */
    D32SLR(xr15,xr15,xr14,xr14,4);
    S32SDI(xr9,dst_t,0x10);
    S32STD(xr10,dst_t,0x4);
    S32STD(xr0,dst_t,0x8);
    S32STD(xr0,dst_t,0xc);

    /* Group 3. */
    S32SDI(xr15,dst_t,0x10);
    S32STD(xr14,dst_t,0x4);
    S32STD(xr0,dst_t,0x8);
    S32STD(xr0,dst_t,0xc);
  }
  else if(n==2)
    {
      /* Four consecutive words: source bytes 0..15. */
      S32LDI(xr1,src,0x8);
      S32LDD(xr2,src,0x4);
      S32LDI(xr7,src,0x8);
      S32LDD(xr8,src,0x4);

      D16MUL_XW(xr4,xr12,xr1,xr3);
      D16MUL_LW(xr10,xr12,xr7,xr9);
      D16MUL_LW(xr14,xr12,xr8,xr15);
      D32ASUM_AA(xr3,xr13,xr13,xr4);
      D16MUL_LW(xr6,xr12,xr2,xr5);
      D32SLR(xr3,xr3,xr4,xr4,4);
      D32ASUM_AA(xr5,xr13,xr13,xr6);
      D32ASUM_AA(xr9,xr13,xr13,xr10);
      D32SLR(xr5,xr5,xr6,xr6,4);
      D32SLR(xr9,xr9,xr10,xr10,4);
      D32ASUM_AA(xr15,xr13,xr13,xr14);

      /* Group 0: four results. */
      S32SDI(xr3,dst_t,0x10);
      S32STD(xr4,dst_t,0x4);
      S32STD(xr5,dst_t,0x8);
      S32STD(xr6,dst_t,0xc);

      /* Group 1: four results. */
      D32SLR(xr15,xr15,xr14,xr14,4);
      S32SDI(xr9,dst_t,0x10);
      S32STD(xr10,dst_t,0x4);
      S32STD(xr15,dst_t,0x8);
      S32STD(xr14,dst_t,0xc);

      /* Groups 2 and 3 are filled from xr0. */
      S32SDI(xr0,dst_t,0x10);
      S32STD(xr0,dst_t,0x4);
      S32STD(xr0,dst_t,0x8);
      S32STD(xr0,dst_t,0xc);

      S32SDI(xr0,dst_t,0x10);
      S32STD(xr0,dst_t,0x4);
      S32STD(xr0,dst_t,0x8);
      S32STD(xr0,dst_t,0xc);
    }
  else
    {
      /* First half: source bytes 0..15 into groups 0 and 1. */
      S32LDI(xr1,src,0x8);
      S32LDD(xr2,src,0x4);
      S32LDI(xr7,src,0x8);
      S32LDD(xr8,src,0x4);

      D16MUL_XW(xr4,xr12,xr1,xr3);
      D16MUL_LW(xr10,xr12,xr7,xr9);
      D16MUL_LW(xr14,xr12,xr8,xr15);
      D32ASUM_AA(xr3,xr13,xr13,xr4);
      D16MUL_LW(xr6,xr12,xr2,xr5);
      D32SLR(xr3,xr3,xr4,xr4,4);
      D32ASUM_AA(xr5,xr13,xr13,xr6);
      D32ASUM_AA(xr9,xr13,xr13,xr10);
      D32SLR(xr5,xr5,xr6,xr6,4);
      D32SLR(xr9,xr9,xr10,xr10,4);
      D32ASUM_AA(xr15,xr13,xr13,xr14);

      S32SDI(xr3,dst_t,0x10);
      S32STD(xr4,dst_t,0x4);
      S32STD(xr5,dst_t,0x8);
      S32STD(xr6,dst_t,0xc);

      D32SLR(xr15,xr15,xr14,xr14,4);
      S32SDI(xr9,dst_t,0x10);
      S32STD(xr10,dst_t,0x4);
      S32STD(xr15,dst_t,0x8);
      S32STD(xr14,dst_t,0xc);

      /* Second half: source bytes 16..31 into groups 2 and 3. Every
         multiply here is the _LW form, including the first one. */
      S32LDI(xr1,src,0x8);
      S32LDD(xr2,src,0x4);
      S32LDI(xr7,src,0x8);
      S32LDD(xr8,src,0x4);

      D16MUL_LW(xr4,xr12,xr1,xr3);
      D16MUL_LW(xr10,xr12,xr7,xr9);
      D16MUL_LW(xr14,xr12,xr8,xr15);
      D32ASUM_AA(xr3,xr13,xr13,xr4);
      D16MUL_LW(xr6,xr12,xr2,xr5);
      D32SLR(xr3,xr3,xr4,xr4,4);
      D32ASUM_AA(xr5,xr13,xr13,xr6);
      D32ASUM_AA(xr9,xr13,xr13,xr10);
      D32SLR(xr5,xr5,xr6,xr6,4);
      D32SLR(xr9,xr9,xr10,xr10,4);
      D32ASUM_AA(xr15,xr13,xr13,xr14);

      S32SDI(xr3,dst_t,0x10);
      S32STD(xr4,dst_t,0x4);
      S32STD(xr5,dst_t,0x8);
      S32STD(xr6,dst_t,0xc);

      D32SLR(xr15,xr15,xr14,xr14,4);
      S32SDI(xr9,dst_t,0x10);
      S32STD(xr10,dst_t,0x4);
      S32STD(xr15,dst_t,0x8);
      S32STD(xr14,dst_t,0xc);
    }
}
예제 #3
0
/*
 * Fill the run-time CABAC tables ff_h264_lps_range and ff_h264_mlps_state
 * from the source tables lps_range, mps_state and lps_state, using MXU
 * media-register macros. Each of the 16 loop iterations handles four entries
 * of every source table.
 *
 * c: not used by this function.
 *
 * Reading of the macros assumed by the comments below (inferred from their
 * names and usage, not defined in this file — TODO confirm against the MXU
 * manual):
 *   S32LDI(x, p, off)   p += off, then x = 32-bit word at p
 *   S32SDI(x, p, off)   p += off, then 32-bit word at p = x
 *   Q8ADD_AA(a, b, c)   four independent byte-wise additions a = b + c
 *   S32SFL(..., 0)      byte interleave of two registers into two registers
 *   S32ALN(a, b, c, 2)  with b == c, a rotation of the word by two bytes
 *
 * Because the load/store macros adjust the pointer before the access, every
 * cursor below is initialised one step away from the first element it
 * touches.
 */
void ff_init_cabac_states_mxu(CABACContext *c){
    uint32_t i = 0;
    uint8_t *p_lps_range = lps_range[0]-4;
    uint8_t *p_mps_state = mps_state-4;
    uint8_t *p_lps_state = lps_state-4;
    uint8_t *p_ff_h264_mlps_state1 = &ff_h264_mlps_state[0]+124;
    uint8_t *p_ff_h264_mlps_state2 = &ff_h264_mlps_state[0]+128;// walks downwards: first store lands at byte 124
    uint8_t *p_ff_h264_lps_range = ff_h264_lps_range-128;
    /* 0x01 in every byte lane: the "+1" operand of the byte-wise adds below. */
    S32I2M(xr15,0x1010101);

    for(i=0; i<16; i++){
      
      /* part1: load four 32-bit words (16 bytes) of lps_range */
      S32LDI(xr1,p_lps_range,4);
      S32LDI(xr2,p_lps_range,4);
      S32LDI(xr3,p_lps_range,4);
      S32LDI(xr4,p_lps_range,4);

      /* Two shuffle stages; the second shuffles each register with itself,
         which presumably duplicates every byte so each source value fills
         two adjacent output bytes — confirm. */
      S32SFL(xr7,xr2,xr1,xr8,0);
      S32SFL(xr9,xr4,xr3,xr10,0);

      S32SFL(xr11,xr7,xr7,xr12,0);
      S32SFL(xr13,xr8,xr8,xr14,0);
      S32SFL(xr1,xr9,xr9,xr2,0);
      S32SFL(xr3,xr10,xr10,xr4,0);

      /* Store 8 bytes into each of four regions that lie 128 bytes apart
         (byte offsets 0, 128, 256 and 384 on the first iteration). */
      S32SDI(xr14,p_ff_h264_lps_range,128); 
      S32SDI(xr4,p_ff_h264_lps_range,4); 
      S32SDI(xr13,p_ff_h264_lps_range,124);
      S32SDI(xr3,p_ff_h264_lps_range,4); 
      S32SDI(xr12,p_ff_h264_lps_range,124);
      S32SDI(xr2,p_ff_h264_lps_range,4); 
      S32SDI(xr11,p_ff_h264_lps_range,124);
      S32SDI(xr1,p_ff_h264_lps_range,4);
      /* Rewind so the next iteration's first "+128" store lands 8 bytes
         after this iteration's first store. */
      p_ff_h264_lps_range-=128*4-4;

     
      /* part2: four mps_state bytes -> eight bytes of ff_h264_mlps_state,
         written upwards starting at byte 128 */
      S32LDI(xr1,p_mps_state,4); 
     
      Q8ADD_AA(xr3,xr1,xr1);// 2*mps_state[i]+0
      Q8ADD_AA(xr4,xr3,xr15);//2*mps_state[i]+1  
      S32SFL(xr11,xr4,xr3,xr12,0);     
      
      /* Presumably yields ff_h264_mlps_state[128+2*i+0] = 2*mps_state[i]+0
         and ff_h264_mlps_state[128+2*i+1] = 2*mps_state[i]+1; the exact byte
         order depends on the shuffle — confirm. */
      S32SDI(xr12,p_ff_h264_mlps_state1,4);
      S32SDI(xr11,p_ff_h264_mlps_state1,4);
         
      /* part3: four lps_state bytes -> eight bytes of ff_h264_mlps_state,
         written downwards starting at bytes 124..127 */
      S32LDI(xr1,p_lps_state,4);   

      Q8ADD_AA(xr3,xr1,xr1);// 2*lps_state[i]+0
      Q8ADD_AA(xr4,xr3,xr15);//2*lps_state[i]+1
    
      S32SFL(xr11,xr3,xr4,xr12,0);
         
      /* Presumably reverses the pair order inside each word because this
         half of the table is filled from high addresses to low — confirm. */
      S32ALN(xr11,xr11,xr11,2);
      S32ALN(xr12,xr12,xr12,2);
    
      S32SDI(xr12,p_ff_h264_mlps_state2,-4);
      S32SDI(xr11,p_ff_h264_mlps_state2,-4);
     
   }
    
   /* Bytes 126 and 127 were written by the first loop iteration (part3);
      they are replaced with fixed values here. */
   ff_h264_mlps_state[127]= 1;
   ff_h264_mlps_state[126]= 0;    

}