/* Blend a 16x16 block of src into dst as a fixed-point weighted average:
 *   dst = (src * src_weight + dst * dst_weight + round) >> MFQE_PRECISION
 * where dst_weight is the complement of src_weight in (1 << MFQE_PRECISION).
 * NOTE(review): MFQE_PRECISION and the LD_SB4/UNPCK_UB_SH/SRARI_H2_SH/
 * PCKEV_ST_SB macros are defined elsewhere (presumably libvpx macros_msa.h);
 * their semantics below are inferred from their names — confirm against that
 * header.
 *
 * src_ptr/src_stride: source 16x16 block (read-only).
 * dst_ptr/dst_stride: destination 16x16 block, blended in place.
 * src_weight: weight applied to src, in units of 1/(1 << MFQE_PRECISION).
 */
static void filter_by_weight16x16_msa(const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, int32_t src_weight) {
  /* Weights sum to (1 << MFQE_PRECISION) so the blend is a true average. */
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

  /* Broadcast both scalar weights across all 8 halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  /* 4 iterations x 4 rows per iteration = 16 rows; each vector row is 16
   * bytes wide, covering the full 16x16 block. */
  for (row = 4; row--;) {
    /* Load 4 source and 4 destination rows at once. */
    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);

    /* Row 0: widen bytes to halfwords (right/left halves), blend, then
     * round-shift back down and pack/store the 16 result bytes. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    /* Rounding arithmetic shift right by MFQE_PRECISION. */
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 1: identical pipeline on the second loaded row. */
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 2. */
    UNPCK_UB_SH(src2, src_r, src_l);
    UNPCK_UB_SH(dst2, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 3. */
    UNPCK_UB_SH(src3, src_r, src_l);
    UNPCK_UB_SH(dst3, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
  }
}
/* 32x32 TM (TrueMotion) intra predictor: each output pixel is
 * clip(top[x] + left[y] - top_left), computed row by row with MSA vectors.
 * NOTE(review): the ILVR/ILVL + HADD_UB pipeline is assumed to interleave
 * the broadcast left pixel with the top row and sum the byte pairs into
 * halfwords (top + left); IPRED_SUBS_UH2_UH then subtracts top_left with
 * unsigned saturation at 0 and SAT_UH4_UH(..., 7) clamps to the 8-bit max.
 * These macro semantics are inferred from names — confirm against the
 * project's MSA macro header.
 *
 * src_top: 32 above pixels; src_top[-1] is the top-left corner pixel.
 * src_left: 32 left-column pixels.
 * dst/dst_stride: 32x32 output block.
 */
static void intra_predict_tm_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t dst_stride) {
  uint8_t top_left = src_top[-1];
  uint32_t loop_cnt;
  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

  /* The 32-wide top row occupies two 16-byte vectors. */
  LD_SB2(src_top, 16, src_top0, src_top1);
  /* Broadcast the corner pixel across all halfword lanes. */
  src_top_left = (v8u16)__msa_fill_h(top_left);

  /* 8 iterations x 4 rows per iteration = 32 output rows. */
  for (loop_cnt = 8; loop_cnt--;) {
    /* Each output row uses a single left pixel, splatted across a vector. */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* Row 0: interleave left with both top halves, sum byte pairs to
     * halfwords, subtract top_left (saturating), clamp, pack and store the
     * two 16-byte halves of the 32-pixel row. */
    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 1: same pipeline with the next left pixel. */
    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 2. */
    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    /* Row 3. */
    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;
  }
}
/* 16x16 TM (TrueMotion) intra predictor: each output pixel is
 * clip(top[x] + left[y] - top_left). Same pipeline as the 32x32 variant but
 * the top row fits in a single 16-byte vector, so one ILVRL/HADD/SUBS/SAT
 * sequence produces a full output row.
 * NOTE(review): macro semantics (interleave, horizontal byte-pair add,
 * saturating subtract, clamp to 8-bit via SAT_UH2_UH(..., 7)) are inferred
 * from their names — confirm against the project's MSA macro header.
 *
 * src_top_ptr: 16 above pixels; src_top_ptr[-1] is the top-left corner.
 * src_left: 16 left-column pixels.
 * dst/dst_stride: 16x16 output block.
 */
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, const uint8_t *src_left, uint8_t *dst, int32_t dst_stride) {
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r, res_l;

  src_top = LD_SB(src_top_ptr);
  /* Broadcast the corner pixel across all halfword lanes. */
  src_top_left = (v8u16)__msa_fill_h(top_left);

  /* 4 iterations x 4 rows per iteration = 16 output rows. */
  for (loop_cnt = 4; loop_cnt--;) {
    /* One splatted left pixel per output row. */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* Row 0: interleave left with top, sum to halfwords, subtract top_left
     * (saturating at 0), clamp, pack and store 16 bytes. */
    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    /* Row 1. */
    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    /* Row 2. */
    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    /* Row 3. */
    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;
  }
}