static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, const uint8_t *src_left, uint8_t *dst, int32_t dst_stride) { uint32_t val; uint8_t top_left = src_top_ptr[-1]; v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; v16u8 src0, src1, src2, src3; v8u16 src_top_left, vec0, vec1, vec2, vec3; src_top_left = (v8u16)__msa_fill_h(top_left); val = LW(src_top_ptr); src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); src_left0 = __msa_fill_b(src_left[0]); src_left1 = __msa_fill_b(src_left[1]); src_left2 = __msa_fill_b(src_left[2]); src_left3 = __msa_fill_b(src_left[3]); ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, src_left3, src_top, src0, src1, src2, src3); HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); }
static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, int32_t dst_stride) { uint32_t val0; v16i8 store, data = { 0 }; v8u16 sum_h; v4u32 sum_w; val0 = LW(src); data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); sum_w = __msa_hadd_u_w(sum_h, sum_h); sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); store = __msa_splati_b((v16i8)sum_w, 0); val0 = __msa_copy_u_w((v4i32)store, 0); SW4(val0, val0, val0, val0, dst, dst_stride); }