/*
 * ifft4 - in-place 4-point butterfly on the buffer addressed by `buf`,
 * carried out entirely in MXU registers.
 *
 * `buf` has no declared type (old-style parameter list), so it defaults to
 * int; it is cast to uint32_t and used as a byte address.
 * NOTE(review): implicit int is not valid since C99 -- give `buf` an explicit
 * type once the argument type used by the callers has been confirmed.
 *
 * The MXU macros are defined outside this chunk.  The comments below assume
 * the usual MXU semantics -- TODO confirm against the macro header:
 *   S32LDI(xr, base, off)   base += off; xr = 32-bit word at base
 *   S32SDI(xr, base, off)   base += off; 32-bit word at base = xr
 *   D32ADD_AS(a, b, c, d)   a = b + c;  d = b - c
 * Under that assumption the eight 32-bit words at byte offsets 0..28 are
 * read and all eight are overwritten.  Presumably they are four {re, im}
 * pairs (even words = re, odd words = im) -- verify against the caller.
 *
 * Registers xr1..xr14 are overwritten.
 */
static inline void ifft4(buf) {
    uint32_t tm4=(uint32_t)(buf);

    /* Load the words at byte offsets 0, 8, 16, 24 (cursor ends at +24). */
    S32LDI(xr1, tm4, 0);
    S32LDI(xr2, tm4, 8);
    S32LDI(xr3, tm4, 8);
    S32LDI(xr4, tm4, 8);

    /* First butterfly stage; the differences xr6 and xr8 are kept for later. */
    D32ADD_AS(xr5, xr1, xr2, xr6);
    D32ADD_AS(xr7, xr4, xr3, xr8);
    D32ADD_AS(xr9, xr5, xr7, xr10);

    /* Write back offset 16, then offset 0; the cursor is back at +0. */
    S32SDI(xr10,tm4,-8);
    S32SDI(xr9,tm4,-16);

    /* Load the words at byte offsets 4, 12, 20, 28 (cursor ends at +28). */
    S32LDI(xr1, tm4, 4);
    S32LDI(xr2, tm4, 8);
    S32LDI(xr3, tm4, 8);
    S32LDI(xr4, tm4, 8);

    /* Same first stage for the second set; xr9/xr10 are reused for differences. */
    D32ADD_AS(xr5, xr1, xr2, xr9);
    D32ADD_AS(xr7, xr3, xr4, xr10);
    D32ADD_AS(xr1,xr5,xr7,xr2);

    /* Cross terms combining the saved differences of both sets. */
    D32ADD_AS(xr11,xr6,xr10,xr12);
    D32ADD_AS(xr13,xr9,xr8,xr14);

    /* Walk the cursor backwards: offsets 28, 24, 20, 12, 8, 4. */
    S32SDI(xr14,tm4,0);
    S32SDI(xr12,tm4,-4);
    S32SDI(xr2, tm4,-4);
    S32SDI(xr13,tm4,-8);
    S32SDI(xr11,tm4,-4);
    S32SDI(xr1, tm4,-4);
}
/*
 * rv40_dequant4x4 - dequantize a 4x4 coefficient block with MXU instructions.
 *
 * block: source coefficients, read as 32-bit words (two coefficients per
 *        word -- assumes DCTELEM is 16 bits wide, TODO confirm).
 * dst:   destination; sixteen 32-bit words are always written.
 * n:     selects how much of the block is dequantized:
 *          n == 1  the first word (two coefficients) of every 8-byte step of
 *                  `block`, i.e. presumably the two leftmost columns; the
 *                  other two words of each output row are stored from xr0
 *          n == 2  the first eight coefficients (dst[0..7]); dst[8..15] are
 *                  stored from xr0
 *          other   all sixteen coefficients
 *
 * Register contract (visible from the code): xr12 and xr13 are read but never
 * written here, so the caller must load them beforehand.  Presumably xr12
 * holds the two quantizers (DC in one half, AC in the other) and xr13 the
 * rounding term added before the shift by 4 -- verify against the caller.
 * xr0 is stored wherever an output must be cleared; presumably it is the MXU
 * always-zero register.  xr1-xr10, xr14 and xr15 are overwritten.
 *
 * The MXU macros are defined outside this chunk.  Assumed semantics -- TODO
 * confirm against the macro header:
 *   S32LDI / S32SDI   add the offset to the base variable, then load / store
 *   S32LDD / S32STD   load / store at base + offset, base unchanged
 *   D16MUL_xW         two 16x16 multiplies into two 32-bit registers; the XW
 *                     form pairs the halves of xr12 crosswise so that only the
 *                     very first coefficient gets the other quantizer
 *   D32ASUM_AA        adds xr13 to both products
 *   D32SLR            shifts both values right by 4
 * NOTE(review): D32SLR looks like a logical (not arithmetic) shift; confirm
 * that this is intended for negative products.
 */
static void rv40_dequant4x4(DCTELEM *block,uint32_t *dst, int n)
{
    /*
     * Both cursors start one step before the data because the first access
     * of each group pre-increments them (by 8 bytes for the source, by 16
     * bytes for the destination).  The MXU macros take the address as an
     * integer, hence the explicit pointer-to-integer casts.
     */
    uint32_t src = (uint32_t)(block - 4);
    uint32_t dst_t = (uint32_t)(dst - 4);

    if (n == 1) {
        /* One word from the start of each 8-byte step of the source. */
        S32LDI(xr1, src, 0x8);
        S32LDI(xr2, src, 0x8);
        S32LDI(xr7, src, 0x8);
        S32LDI(xr8, src, 0x8);

        /* Multiply, add xr13, shift right by 4 (interleaved as in the original). */
        D16MUL_XW(xr4, xr12, xr1, xr3);
        D16MUL_LW(xr10, xr12, xr7, xr9);
        D16MUL_LW(xr14, xr12, xr8, xr15);
        D32ASUM_AA(xr3, xr13, xr13, xr4);
        D16MUL_LW(xr6, xr12, xr2, xr5);
        D32SLR(xr3, xr3, xr4, xr4, 4);
        D32ASUM_AA(xr5, xr13, xr13, xr6);
        D32ASUM_AA(xr9, xr13, xr13, xr10);
        D32SLR(xr5, xr5, xr6, xr6, 4);
        D32SLR(xr9, xr9, xr10, xr10, 4);
        D32ASUM_AA(xr15, xr13, xr13, xr14);

        /* Each output row: two results followed by two words from xr0. */
        S32SDI(xr3, dst_t, 0x10);
        S32STD(xr4, dst_t, 0x4);
        S32STD(xr0, dst_t, 0x8);
        S32STD(xr0, dst_t, 0xc);

        S32SDI(xr5, dst_t, 0x10);
        S32STD(xr6, dst_t, 0x4);
        S32STD(xr0, dst_t, 0x8);
        S32STD(xr0, dst_t, 0xc);

        D32SLR(xr15, xr15, xr14, xr14, 4);

        S32SDI(xr9, dst_t, 0x10);
        S32STD(xr10, dst_t, 0x4);
        S32STD(xr0, dst_t, 0x8);
        S32STD(xr0, dst_t, 0xc);

        S32SDI(xr15, dst_t, 0x10);
        S32STD(xr14, dst_t, 0x4);
        S32STD(xr0, dst_t, 0x8);
        S32STD(xr0, dst_t, 0xc);
    } else {
        /*
         * First eight coefficients -> dst[0..7].  This sequence is identical
         * for n == 2 and for the full-block case, so it is shared.
         */
        S32LDI(xr1, src, 0x8);
        S32LDD(xr2, src, 0x4);
        S32LDI(xr7, src, 0x8);
        S32LDD(xr8, src, 0x4);

        D16MUL_XW(xr4, xr12, xr1, xr3);
        D16MUL_LW(xr10, xr12, xr7, xr9);
        D16MUL_LW(xr14, xr12, xr8, xr15);
        D32ASUM_AA(xr3, xr13, xr13, xr4);
        D16MUL_LW(xr6, xr12, xr2, xr5);
        D32SLR(xr3, xr3, xr4, xr4, 4);
        D32ASUM_AA(xr5, xr13, xr13, xr6);
        D32ASUM_AA(xr9, xr13, xr13, xr10);
        D32SLR(xr5, xr5, xr6, xr6, 4);
        D32SLR(xr9, xr9, xr10, xr10, 4);
        D32ASUM_AA(xr15, xr13, xr13, xr14);

        S32SDI(xr3, dst_t, 0x10);
        S32STD(xr4, dst_t, 0x4);
        S32STD(xr5, dst_t, 0x8);
        S32STD(xr6, dst_t, 0xc);

        D32SLR(xr15, xr15, xr14, xr14, 4);

        S32SDI(xr9, dst_t, 0x10);
        S32STD(xr10, dst_t, 0x4);
        S32STD(xr15, dst_t, 0x8);
        S32STD(xr14, dst_t, 0xc);

        if (n == 2) {
            /* Remaining eight outputs dst[8..15] are stored from xr0. */
            S32SDI(xr0, dst_t, 0x10);
            S32STD(xr0, dst_t, 0x4);
            S32STD(xr0, dst_t, 0x8);
            S32STD(xr0, dst_t, 0xc);

            S32SDI(xr0, dst_t, 0x10);
            S32STD(xr0, dst_t, 0x4);
            S32STD(xr0, dst_t, 0x8);
            S32STD(xr0, dst_t, 0xc);
        } else {
            /*
             * Last eight coefficients -> dst[8..15].  Every multiply here is
             * the LW form: the crosswise XW form is only used for the word
             * holding the very first coefficient.
             */
            S32LDI(xr1, src, 0x8);
            S32LDD(xr2, src, 0x4);
            S32LDI(xr7, src, 0x8);
            S32LDD(xr8, src, 0x4);

            D16MUL_LW(xr4, xr12, xr1, xr3);
            D16MUL_LW(xr10, xr12, xr7, xr9);
            D16MUL_LW(xr14, xr12, xr8, xr15);
            D32ASUM_AA(xr3, xr13, xr13, xr4);
            D16MUL_LW(xr6, xr12, xr2, xr5);
            D32SLR(xr3, xr3, xr4, xr4, 4);
            D32ASUM_AA(xr5, xr13, xr13, xr6);
            D32ASUM_AA(xr9, xr13, xr13, xr10);
            D32SLR(xr5, xr5, xr6, xr6, 4);
            D32SLR(xr9, xr9, xr10, xr10, 4);
            D32ASUM_AA(xr15, xr13, xr13, xr14);

            S32SDI(xr3, dst_t, 0x10);
            S32STD(xr4, dst_t, 0x4);
            S32STD(xr5, dst_t, 0x8);
            S32STD(xr6, dst_t, 0xc);

            D32SLR(xr15, xr15, xr14, xr14, 4);

            S32SDI(xr9, dst_t, 0x10);
            S32STD(xr10, dst_t, 0x4);
            S32STD(xr15, dst_t, 0x8);
            S32STD(xr14, dst_t, 0xc);
        }
    }
}
/*
 * ff_init_cabac_states_mxu - fill the ff_h264_lps_range and
 * ff_h264_mlps_state lookup tables from lps_range, mps_state and lps_state,
 * four source states per loop iteration, using MXU byte-shuffle instructions.
 *
 * c: not used by this function.
 *
 * The tables and the MXU macros are defined outside this chunk.  Assumed
 * macro semantics -- TODO confirm against the macro header:
 *   S32LDI / S32SDI   add the offset to the pointer, then load / store a word
 *   S32SFL(.., 0)     interleave the bytes of the two source registers
 *   Q8ADD_AA          byte-wise add of four packed bytes
 *   S32ALN(.., 2)     rotate the word by two bytes
 *   S32I2M            load an immediate into an MXU register
 * Every pointer starts one step before its first access because each
 * load/store updates the pointer first.
 */
void ff_init_cabac_states_mxu(CABACContext *c){
    uint32_t i = 0;
    uint8_t *p_lps_range = lps_range[0]-4;
    uint8_t *p_mps_state = mps_state-4;
    uint8_t *p_lps_state = lps_state-4;
    uint8_t *p_ff_h264_mlps_state1 = &ff_h264_mlps_state[0]+124;
    uint8_t *p_ff_h264_mlps_state2 = &ff_h264_mlps_state[0]+128;
    uint8_t *p_ff_h264_lps_range = ff_h264_lps_range-128;

    /* 0x01 in every byte lane: turns 2*x into 2*x+1 for four bytes at once. */
    S32I2M(xr15,0x1010101);

    for(i=0; i<16; i++){
        /* part 1: load 16 bytes of lps_range (presumably 4 states x 4 ranges) */
        S32LDI(xr1,p_lps_range,4);
        S32LDI(xr2,p_lps_range,4);
        S32LDI(xr3,p_lps_range,4);
        S32LDI(xr4,p_lps_range,4);

        /*
         * Two shuffle rounds: the first mixes bytes of different words, the
         * second shuffles each register with itself, which presumably
         * duplicates every byte.
         */
        S32SFL(xr7,xr2,xr1,xr8,0);
        S32SFL(xr9,xr4,xr3,xr10,0);
        S32SFL(xr11,xr7,xr7,xr12,0);
        S32SFL(xr13,xr8,xr8,xr14,0);
        S32SFL(xr1,xr9,xr9,xr2,0);
        S32SFL(xr3,xr10,xr10,xr4,0);

        /*
         * Eight bytes go to each of four regions of ff_h264_lps_range that
         * lie 128 bytes apart (+128/+4, then +124/+4 three times).
         */
        S32SDI(xr14,p_ff_h264_lps_range,128);
        S32SDI(xr4,p_ff_h264_lps_range,4);
        S32SDI(xr13,p_ff_h264_lps_range,124);
        S32SDI(xr3,p_ff_h264_lps_range,4);
        S32SDI(xr12,p_ff_h264_lps_range,124);
        S32SDI(xr2,p_ff_h264_lps_range,4);
        S32SDI(xr11,p_ff_h264_lps_range,124);
        S32SDI(xr1,p_ff_h264_lps_range,4);
        /* Rewind: the stores advanced by 516, so the net step is +8 bytes. */
        p_ff_h264_lps_range-=128*4-4;

        /* part 2: four mps_state bytes -> eight bytes written upwards from index 128 */
        S32LDI(xr1,p_mps_state,4);
        Q8ADD_AA(xr3,xr1,xr1);   /* 2*mps_state[..]+0 in each byte */
        Q8ADD_AA(xr4,xr3,xr15);  /* 2*mps_state[..]+1 in each byte */
        S32SFL(xr11,xr4,xr3,xr12,0);
        /* Presumably the interleaved +0/+1 values for ff_h264_mlps_state[128+...]. */
        S32SDI(xr12,p_ff_h264_mlps_state1,4);
        S32SDI(xr11,p_ff_h264_mlps_state1,4);

        /* part 3: four lps_state bytes -> eight bytes written downwards from index 127 */
        S32LDI(xr1,p_lps_state,4);
        Q8ADD_AA(xr3,xr1,xr1);   /* 2*lps_state[..]+0 in each byte */
        Q8ADD_AA(xr4,xr3,xr15);  /* 2*lps_state[..]+1 in each byte */
        /* Operands are swapped relative to part 2, and the halves are then
         * rotated, presumably to reverse the byte order for the descending
         * layout. */
        S32SFL(xr11,xr3,xr4,xr12,0);
        S32ALN(xr11,xr11,xr11,2);
        S32ALN(xr12,xr12,xr12,2);
        S32SDI(xr12,p_ff_h264_mlps_state2,-4);
        S32SDI(xr11,p_ff_h264_mlps_state2,-4);
    }

    /* Fixed values for the two entries just below index 128; they overwrite
     * what the first loop iteration stored there. */
    ff_h264_mlps_state[127]= 1;
    ff_h264_mlps_state[126]= 0;
}