static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16u8 dst0, dst1, dst2, dst3, out; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; v16i8 src10998, filt0, filt1, filt2, filt3; v8i16 filt, out10, out32; src -= (3 * src_stride); filt = LD_SH(filter); SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); src += (7 * src_stride); ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, src54_r, src21_r); ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, src4332, src6554); XORI_B3_128_SB(src2110, src4332, src6554); for (loop_cnt = (height >> 2); loop_cnt--;) {
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; mask0 = LD_UB(&mc_filt_mask_arr[16]); src -= (3 + 3 * src_stride); /* rearranging filter */ filt = LD_SH(filter_horiz); SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); mask1 = mask0 + 2; mask2 = mask0 + 4; mask3 = mask0 + 6; LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); src += (7 * src_stride); hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); filt = LD_SH(filter_vert); SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); for (loop_cnt = (height >> 2); loop_cnt--;) {