// Forward 4x4 transform of the residual (src - ref), reading both blocks at
// stride BPS and writing 16 coefficients to 'out'.  The constant pairs
// {2217, 5352} and the rounding biases {1812, 937, 12000, 51000} are the
// fixed-point transform constants — presumably the VP8 forward DCT; TODO
// confirm against the codec spec this file belongs to.
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  v16u8 srcl0, srcl1, src0, src1;
  // Shuffle masks: mask0/mask1 gather the transposed column pairs,
  // mask2/mask3 interleave them back for the dot-product stage.
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  // Fixed-point multiplier pairs for the odd-frequency outputs.
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

  // Load the 4x4 source and reference blocks (one 32-bit word per row).
  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);
  // Interleave src/ref bytes, then horizontal-subtract to get the
  // 16-bit residuals in t0/t1.
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);

  // ---- First (horizontal) pass ----
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  t0 = SRLI_H(t0, 3);  // logical >>3 on the even terms (values were scaled <<3)
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);  // even outputs: pairwise sums
  tmp2 = __msa_hsub_s_w(t3, t3);  // even outputs: pairwise differences
  // Odd outputs: dot products with cnst0/cnst1 plus rounding biases,
  // then arithmetic >>9.
  FILL_W2_SW(1812, 937, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);

  // ---- Second (vertical) pass ----
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);  // round even terms (+7)
  SRAI_W2_SW(tmp0, tmp2, 4);                  // then >>4
  FILL_W2_SW(12000, 51000, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);
  // Where the corresponding input term (low half of t1) is nonzero,
  // add 1 to tmp1: build an all-ones mask from (term != 0), AND with 1.
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;

  // Pack the four coefficient rows back to 16-bit and store 4x8 bytes.
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  SD4(out0, out1, out2, out3, out, 8);
}
// Hadamard-style transform of a 4x4 block read at stride BPS, returning the
// sum of the absolute transformed coefficients weighted by the 16 entries
// of 'w'.  Two shuffle/add-sub passes implement the row and column
// butterflies; no normalization shift is applied here (the caller
// presumably scales the result — TODO confirm at the call site).
static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum;
  uint32_t in0_m, in1_m, in2_m, in3_m;
  v16i8 src0;
  v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
  v4i32 dst0, dst1;
  const v16i8 zero = { 0 };
  // mask0/mask1 split even/odd column pairs; mask2/mask3 transpose.
  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };

  // Load the 4x4 block and widen bytes to 16-bit lanes.
  LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
  INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
  ILVRL_B2_SH(zero, src0, tmp0, tmp1);

  // First butterfly pass (rows).
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, tmp0, tmp1);

  // Second butterfly pass (columns).
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, tmp0, tmp1);

  // abs(coeff) via add_a with zero, then weighted sum: dot with w,
  // add the two 32-bit partial vectors, horizontal-add to a scalar.
  tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
  tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
  LD_SH2(w, 8, tmp2, tmp3);
  DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
  dst0 = dst0 + dst1;
  sum = HADD_SW_S32(dst0);
  return sum;
}
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_dst_stride, int16_t *p_dst ) { uint32_t i_src0, i_src1, i_src2, i_src3; uint32_t i_ref0, i_ref1, i_ref2, i_ref3; v16i8 src = { 0 }; v16i8 ref = { 0 }; v16u8 inp0, inp1; v8i16 diff0, diff1, diff2, diff3; v8i16 temp0, temp1, temp2, temp3; LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 ); INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref ); ILVRL_B2_UB( src, ref, inp0, inp1 ); HSUB_UB2_SH( inp0, inp1, diff0, diff2 ); diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 ); diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 ); BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); diff0 = temp0 + temp1; diff1 = ( temp3 << 1 ) + temp2; diff2 = temp0 - temp1; diff3 = temp3 - ( temp2 << 1 ); TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 ); temp0 = diff0 + diff1; temp1 = ( diff3 << 1 ) + diff2; temp2 = diff0 - diff1; temp3 = diff3 - ( diff2 << 1 ); ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 ); ST_UB2( inp0, inp1, p_dst, 8 ); }
/* Sum of the 16 byte differences (p_src - pred_ptr) over a 4x4 block.
 * The unsigned horizontal add is folded through an int16_t on purpose:
 * the true sum fits in 16 bits, so the truncation recovers the sign. */
static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                    uint8_t *pred_ptr, int32_t i_pred_stride )
{
    int16_t i_sum;
    uint32_t w_src0, w_src1, w_src2, w_src3;
    uint32_t w_prd0, w_prd1, w_prd2, w_prd3;
    v16i8 vec_src = { 0 };
    v16i8 vec_prd = { 0 };
    v16u8 ilv_lo, ilv_hi;
    v8i16 res_lo, res_hi;

    /* One 32-bit word per row of each 4x4 block. */
    LW4( p_src, i_src_stride, w_src0, w_src1, w_src2, w_src3 );
    LW4( pred_ptr, i_pred_stride, w_prd0, w_prd1, w_prd2, w_prd3 );
    INSERT_W4_SB( w_src0, w_src1, w_src2, w_src3, vec_src );
    INSERT_W4_SB( w_prd0, w_prd1, w_prd2, w_prd3, vec_prd );

    /* Interleave, horizontal-subtract, then reduce to a scalar. */
    ILVRL_B2_UB( vec_src, vec_prd, ilv_lo, ilv_hi );
    HSUB_UB2_SH( ilv_lo, ilv_hi, res_lo, res_hi );
    i_sum = HADD_UH_U32( res_lo + res_hi );

    return i_sum;
}
static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) { uint32_t dst0, dst1, dst2, dst3; v8i16 dst_r0, dst_l0, in0, in1; v4i32 dst_vec = { 0 }; v16u8 zeros = { 0 }; LD_SH2(coeffs, 8, in0, in1); LW4(dst, stride, dst0, dst1, dst2, dst3); INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec); ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0); ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0); CLIP_SH2_0_255(dst_r0, dst_l0); dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0); ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride); }
uint32_t vp10_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { uint32_t sum_out; uint32_t src0, src1, src2, src3; v16u8 vec = { 0 }; v8u16 sum0; v4u32 sum1; v2u64 sum2; LW4(src, src_stride, src0, src1, src2, src3); INSERT_W4_UB(src0, src1, src2, src3, vec); sum0 = __msa_hadd_u_h(vec, vec); sum1 = __msa_hadd_u_w(sum0, sum0); sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); sum1 = __msa_hadd_u_w(sum0, sum0); sum2 = __msa_hadd_u_d(sum1, sum1); sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); sum_out = __msa_copy_u_w((v4i32)sum1, 0); return sum_out; }
static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src, int32_t i_dst_stride ) { int16_t i_dc; uint32_t i_src0, i_src1, i_src2, i_src3; v16u8 pred = { 0 }; v16i8 out; v8i16 input_dc, pred_r, pred_l; i_dc = ( p_src[0] + 32 ) >> 6; input_dc = __msa_fill_h( i_dc ); p_src[ 0 ] = 0; LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 ); INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred ); UNPCK_UB_SH( pred, pred_r, pred_l ); pred_r += input_dc; pred_l += input_dc; CLIP_SH2_0_255( pred_r, pred_l ); out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r ); ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride ); }