static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst, int32_t i_src_stride ) { v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3; v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; v4i32 hor_res0, hor_res1, hor_res2, hor_res3; v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r; LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); UNPCK_R_SH_SW( src0, src0_r ); UNPCK_R_SH_SW( src1, src1_r ); UNPCK_R_SH_SW( src2, src2_r ); UNPCK_R_SH_SW( src3, src3_r ); BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, tmp0, tmp3, tmp2, tmp1 ); BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, hor_res0, hor_res3, hor_res2, hor_res1 ); TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3, hor_res0, hor_res1, hor_res2, hor_res3 ); BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1, tmp0, tmp3, tmp2, tmp1 ); BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r ); SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 ); PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r, ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r, ver_res0, ver_res1, ver_res2, ver_res3 ); PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 ); ST_SH2( ver_res0, ver_res2, p_dst, 8 ); }
void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0, in1, in2, in3; v8i16 temp0, temp1, tmp0, tmp1; v8i16 const0, const1, const2; v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; v8i16 zero = { 0 }; v4i32 vec0_w, vec1_w, vec2_w, vec3_w; LD_SH4(input, pitch / 2, in0, in1, in2, in3); TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); SLLI_4V(temp0, temp1, in1, in3, 3); in0 = temp0 + temp1; in2 = temp0 - temp1; SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); temp0 = __msa_splati_h(coeff, 3); vec1_w = (v4i32)__msa_ilvev_h(zero, temp0); coeff = __msa_ilvl_h(zero, coeff); vec3_w = __msa_splati_w((v4i32)coeff, 0); ILVRL_H2_SH(in3, in1, tmp1, tmp0); vec0_w = vec1_w; vec2_w = vec3_w; DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, vec1_w, vec2_w, vec3_w); SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12); PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); in0 = temp0 + temp1 + 7; in2 = temp0 - temp1 + 7; in0 >>= 4; in2 >>= 4; SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w); vec3_w += vec1_w; vec1_w = __msa_splati_w((v4i32)coeff, 1); const0 = RET_1_IF_NZERO_H(in3); ILVRL_H2_SH(in3, in1, tmp1, tmp0); vec0_w = vec1_w; vec2_w = vec3_w; DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, vec1_w, vec2_w, vec3_w); SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16); PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); in1 += const0; PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1); ST_SH2(temp0, temp1, output, 8); PCKOD_D2_SH(in1, in0, in3, in2, in0, in2); ST_SH2(in0, in2, output + 16, 8); }