static int TTransform(const uint8_t* in, const uint16_t* w) { int sum; uint32_t in0_m, in1_m, in2_m, in3_m; v16i8 src0; v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3; v4i32 dst0, dst1; const v16i8 zero = { 0 }; const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; LW4(in, BPS, in0_m, in1_m, in2_m, in3_m); INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0); ILVRL_B2_SH(zero, src0, tmp0, tmp1); VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1); ADDSUB2(in0, in1, tmp0, tmp1); VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); ADDSUB2(tmp2, tmp3, tmp0, tmp1); VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1); ADDSUB2(in0, in1, tmp0, tmp1); VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); ADDSUB2(tmp2, tmp3, tmp0, tmp1); tmp0 = __msa_add_a_h(tmp0, (v8i16)zero); tmp1 = __msa_add_a_h(tmp1, (v8i16)zero); LD_SH2(w, 8, tmp2, tmp3); DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1); dst0 = dst0 + dst1; sum = HADD_SW_S32(dst0); return sum; }
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_dst_stride, int16_t *p_dst ) { uint32_t i_src0, i_src1, i_src2, i_src3; uint32_t i_ref0, i_ref1, i_ref2, i_ref3; v16i8 src = { 0 }; v16i8 ref = { 0 }; v16u8 inp0, inp1; v8i16 diff0, diff1, diff2, diff3; v8i16 temp0, temp1, temp2, temp3; LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 ); INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref ); ILVRL_B2_UB( src, ref, inp0, inp1 ); HSUB_UB2_SH( inp0, inp1, diff0, diff2 ); diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 ); diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 ); BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); diff0 = temp0 + temp1; diff1 = ( temp3 << 1 ) + temp2; diff2 = temp0 - temp1; diff3 = temp3 - ( temp2 << 1 ); TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 ); temp0 = diff0 + diff1; temp1 = ( diff3 << 1 ) + diff2; temp2 = diff0 - diff1; temp3 = diff3 - ( diff2 << 1 ); ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 ); ST_UB2( inp0, inp1, p_dst, 8 ); }
static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *pred_ptr, int32_t i_pred_stride ) { int16_t i_sum; uint32_t i_src0, i_src1, i_src2, i_src3; uint32_t i_pred0, i_pred1, i_pred2, i_pred3; v16i8 src = { 0 }; v16i8 pred = { 0 }; v16u8 src_l0, src_l1; v8i16 diff0, diff1; LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 ); INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred ); ILVRL_B2_UB( src, pred, src_l0, src_l1 ); HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 ); i_sum = HADD_UH_U32( diff0 + diff1 ); return i_sum; }