void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

    LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
    LD_UB8(src - 4 + (8 * pitch), pitch,
           row8, row9, row10, row11, row12, row13, row14, row15);
    TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh  = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit   = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
    ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
    ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

    src -= 2;
    ST4x8_UB(tmp2, tmp3, src, pitch);
    src += (8 * pitch);
    ST4x8_UB(tmp4, tmp5, src, pitch);
}
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint64_t val;
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1,
      src_top = { 0 };
  v8u16 src_top_left, vec0, vec1, vec2, vec3;
  v16u8 src0, src1, src2, src3;

  val = LD(src_top_ptr);
  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 2; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  uint8_t inp0, inp1, inp2, inp3;
  v16u8 src0, src1, src2, src3;

  for (row = 8; row--;) {
    inp0 = src[0];
    inp1 = src[1];
    inp2 = src[2];
    inp3 = src[3];
    src += 4;

    src0 = (v16u8)__msa_fill_b(inp0);
    src1 = (v16u8)__msa_fill_b(inp1);
    src2 = (v16u8)__msa_fill_b(inp2);
    src3 = (v16u8)__msa_fill_b(inp3);

    ST_UB2(src0, src0, dst, 16);
    dst += dst_stride;
    ST_UB2(src1, src1, dst, 16);
    dst += dst_stride;
    ST_UB2(src2, src2, dst, 16);
    dst += dst_stride;
    ST_UB2(src3, src3, dst, 16);
    dst += dst_stride;
  }
}
static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride) {
  uint32_t row;
  uint8_t inp0, inp1, inp2, inp3;
  v16u8 src0, src1, src2, src3;

  for (row = 4; row--;) {
    inp0 = src[0];
    src += src_stride;
    inp1 = src[0];
    src += src_stride;
    inp2 = src[0];
    src += src_stride;
    inp3 = src[0];
    src += src_stride;

    src0 = (v16u8)__msa_fill_b(inp0);
    src1 = (v16u8)__msa_fill_b(inp1);
    src2 = (v16u8)__msa_fill_b(inp2);
    src3 = (v16u8)__msa_fill_b(inp3);

    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
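/*
 * For reference: the horizontal predictors above replicate each
 * left-neighbour pixel across its output row; __msa_fill_b performs the
 * replication 16 bytes at a time.  A minimal scalar sketch (illustrative
 * only; the function name and size parameter are hypothetical, not part
 * of the original source):
 */
static void intra_predict_horiz_ref(const uint8_t *left, int32_t left_stride,
                                    uint8_t *dst, int32_t dst_stride,
                                    int32_t size) {
  int32_t row, col;

  for (row = 0; row < size; ++row) {
    /* each output row is one left-column pixel, repeated */
    for (col = 0; col < size; ++col) dst[col] = left[0];
    left += left_stride;
    dst += dst_stride;
  }
}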
void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr, int32_t count) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

  (void)count;

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                     q1_out);

  p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
  p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
  q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
  q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}
void vp9_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr, int32_t count) {
  v16u8 mask, hev, flat, limit, thresh, b_limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v8i16 vec0, vec1, vec2, vec3;

  (void)count;

  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1,
                     q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
  ILVRL_H2_SH(vec1, vec0, vec2, vec3);

  src -= 2;
  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
  src += 4 * pitch;
  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
}
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint32_t val;
  uint8_t top_left = src_top_ptr[-1];
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1,
      src_top = { 0 };
  v16u8 src0, src1, src2, src3;
  v8u16 src_top_left, vec0, vec1, vec2, vec3;

  src_top_left = (v8u16)__msa_fill_h(top_left);
  val = LW(src_top_ptr);
  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);

  src_left0 = __msa_fill_b(src_left[0]);
  src_left1 = __msa_fill_b(src_left[1]);
  src_left2 = __msa_fill_b(src_left[2]);
  src_left3 = __msa_fill_b(src_left[3]);

  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
             src_left3, src_top, src0, src1, src2, src3);
  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}
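/*
 * The TM ("true motion") predictors compute, per output pixel,
 * clip(left[y] + top[x] - top_left) to [0, 255].  The vector code builds
 * left[y] + top[x] with a horizontal add over interleaved bytes, subtracts
 * top_left with an unsigned saturating subtract (IPRED_SUBS_UH2_UH, which
 * clamps at 0), and clamps at 255 via SAT_UH with a 7-bit position
 * (saturating the 16-bit sums to 8-bit range).  A scalar sketch
 * (illustrative; the function name is hypothetical):
 */
static void intra_predict_tm_ref(const uint8_t *top, const uint8_t *left,
                                 uint8_t *dst, int32_t dst_stride,
                                 int32_t size) {
  int32_t x, y, pred;
  const int32_t top_left = top[-1];

  for (y = 0; y < size; ++y) {
    for (x = 0; x < size; ++x) {
      pred = left[y] + top[x] - top_left;
      dst[x] = (uint8_t)(pred < 0 ? 0 : (pred > 255 ? 255 : pred));
    }
    dst += dst_stride;
  }
}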
static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  uint8_t top_left = src_top[-1];
  uint32_t loop_cnt;
  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

  LD_SB2(src_top, 16, src_top0, src_top1);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 8; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
                res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;
  }
}
static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u,
                                                 uint8_t *src_v, int32_t pitch,
                                                 const uint8_t b_limit_in,
                                                 const uint8_t limit_in,
                                                 const uint8_t thresh_in) {
  uint8_t *temp_src;
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  temp_src = src_u - (pitch << 2);
  LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  temp_src = src_v - (pitch << 2);
  LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

  /* U plane in the low 8 lanes, V plane in the high 8 lanes */
  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  p2_d = __msa_copy_u_d((v2i64)p2, 0);
  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  q2_d = __msa_copy_u_d((v2i64)q2, 0);

  src_u -= (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
  src_u += 4 * pitch;
  SD(q1_d, src_u);
  src_u += pitch;
  SD(q2_d, src_u);

  p2_d = __msa_copy_u_d((v2i64)p2, 1);
  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  q2_d = __msa_copy_u_d((v2i64)q2, 1);

  src_v -= (pitch * 3);
  SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
  src_v += 4 * pitch;
  SD(q1_d, src_v);
  src_v += pitch;
  SD(q2_d, src_v);
}
void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v,
                                 ptrdiff_t pitch, int b_limit_in,
                                 int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit   = (v16u8) __msa_fill_b(limit_in);
    thresh  = (v16u8) __msa_fill_b(thresh_in);

    temp_src = src_u - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

    /* right 8 elements of p3 are u pixels and left 8 elements of p3 are v pixels */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    p2_d = __msa_copy_u_d((v2i64) p2, 0);
    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    q2_d = __msa_copy_u_d((v2i64) q2, 0);

    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);

    p2_d = __msa_copy_u_d((v2i64) p2, 1);
    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    q2_d = __msa_copy_u_d((v2i64) q2, 1);

    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
}
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r, res_l;

  src_top = LD_SB(src_top_ptr);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 4; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;
  }
}
void vp8_loop_filter_simple_vertical_edge_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit_ptr) {
  uint8_t *temp_src;
  v16u8 p1, p0, q1, q0;
  v16u8 mask, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1;

  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  temp_src = src - 2;
  LD_UB8(temp_src, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, row8, row9, row10, row11, row12, row13, row14,
         row15);
  TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p1, p0,
                      q0, q1);
  VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
  ILVRL_B2_SH(q0, p0, tmp1, tmp0);

  src -= 1;
  ST2x4_UB(tmp1, 0, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp1, 4, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp0, 0, src, pitch);
  src += 4 * pitch;
  ST2x4_UB(tmp0, 4, src, pitch);
  src += 4 * pitch;
}
static void intra_predict_dc_8x8_msa(uint8_t *src_top, uint8_t *src_left,
                                     int32_t src_stride_left, uint8_t *dst,
                                     int32_t dst_stride, uint8_t is_above,
                                     uint8_t is_left) {
  uint32_t row, addition = 0;
  uint64_t out;
  v16u8 src_above, store;
  v8u16 sum_above;
  v4u32 sum_top;
  v2u64 sum;

  if (is_left && is_above) {
    src_above = LD_UB(src_top);
    sum_above = __msa_hadd_u_h(src_above, src_above);
    sum_top = __msa_hadd_u_w(sum_above, sum_above);
    sum = __msa_hadd_u_d(sum_top, sum_top);
    addition = __msa_copy_u_w((v4i32)sum, 0);
    for (row = 0; row < 8; ++row) {
      addition += src_left[row * src_stride_left];
    }
    addition = (addition + 8) >> 4;
    store = (v16u8)__msa_fill_b(addition);
  } else if (is_left) {
    /* The excerpt is truncated after the branch above; the remaining
     * branches and the final store loop are a hedged reconstruction that
     * follows the usual DC-prediction pattern: average whichever edge is
     * available, else fall back to 128. */
    for (row = 0; row < 8; ++row) {
      addition += src_left[row * src_stride_left];
    }
    addition = (addition + 4) >> 3;
    store = (v16u8)__msa_fill_b(addition);
  } else if (is_above) {
    for (row = 0; row < 8; ++row) {
      addition += src_top[row];
    }
    addition = (addition + 4) >> 3;
    store = (v16u8)__msa_fill_b(addition);
  } else {
    store = (v16u8)__msa_fill_b(128);
  }

  out = __msa_copy_u_d((v2i64)store, 0);
  for (row = 8; row--;) {
    SD(out, dst);
    dst += dst_stride;
  }
}
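/*
 * DC prediction fills the block with the rounded average of the available
 * neighbours (top row and/or left column), falling back to 128 when
 * neither is present.  A scalar reference for the 8x8 case, useful for
 * cross-checking the vector path (illustrative only; name hypothetical):
 */
static void intra_predict_dc_8x8_ref(const uint8_t *top, const uint8_t *left,
                                     int32_t left_stride, uint8_t *dst,
                                     int32_t dst_stride, int is_above,
                                     int is_left) {
  int32_t row, col, sum = 0, count = 0, dc = 128;

  if (is_above) {
    for (col = 0; col < 8; ++col) sum += top[col];
    count += 8;
  }
  if (is_left) {
    for (row = 0; row < 8; ++row) sum += left[row * left_stride];
    count += 8;
  }
  if (count) dc = (sum + (count >> 1)) / count;  /* rounded average */

  for (row = 0; row < 8; ++row) {
    for (col = 0; col < 8; ++col) dst[col] = (uint8_t)dc;
    dst += dst_stride;
  }
}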
void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
                             char blackclamp[16], char whiteclamp[16],
                             char bothclamp[16], uint32_t width,
                             uint32_t height, int32_t pitch) {
  uint32_t i, j;

  for (i = 0; i < height / 2; ++i) {
    uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
    int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
    uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
    int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));

    for (j = width / 16; j--;) {
      v16i8 temp00_s, temp01_s;
      v16u8 temp00, temp01, black_clamp, white_clamp;
      v16u8 pos0, ref0, pos1, ref1;
      v16i8 const127 = __msa_ldi_b(127);

      pos0 = LD_UB(pos0_ptr);
      ref0 = LD_UB(ref0_ptr);
      pos1 = LD_UB(pos1_ptr);
      ref1 = LD_UB(ref1_ptr);
      black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
      white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);

      /* clamp pixels below at blackclamp */
      temp00 = (pos0 < black_clamp);
      pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
      temp01 = (pos1 < black_clamp);
      pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);

      /* clamp above, working in the signed domain */
      XORI_B2_128_UB(pos0, pos1);
      temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
      temp00 = (v16u8)(temp00_s < pos0);
      pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
      temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
      temp01 = (v16u8)(temp01_s < pos1);
      pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
      XORI_B2_128_UB(pos0, pos1);

      pos0 += ref0;
      ST_UB(pos0, pos0_ptr);
      pos1 += ref1;
      ST_UB(pos1, pos1_ptr);
      pos0_ptr += 16;
      pos1_ptr += 16;
      ref0_ptr += 16;
      ref1_ptr += 16;
    }
  }
}
static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
  if (top != NULL) {
    const v16u8 out = LD_UB(top);
    STORE16x16(out, dst);
  } else {
    const v16u8 out = (v16u8)__msa_fill_b(0x7f);
    STORE16x16(out, dst);
  }
}
static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
                                            const uint8_t* left) {
  if (left != NULL) {
    int j;
    for (j = 0; j < 16; j += 4) {
      const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
      const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
      const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
      const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
      ST_UB4(L0, L1, L2, L3, dst, BPS);
      dst += 4 * BPS;
      left += 4;
    }
  } else {
    const v16u8 out = (v16u8)__msa_fill_b(0x81);
    STORE16x16(out, dst);
  }
}
void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh  = (v16u8) __msa_fill_b(h);
    b_limit = (v16u8) __msa_fill_b(e);
    limit   = (v16u8) __msa_fill_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                               int32_t pitch,
                                               const uint8_t b_limit_in,
                                               const uint8_t limit_in,
                                               const uint8_t thresh_in) {
  uint64_t p1_d, p0_d, q0_d, q1_d;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
  v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  src_u = src_u - (pitch << 2);
  LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
  src_u += (5 * pitch);
  src_v = src_v - (pitch << 2);
  LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
  src_v += (5 * pitch);

  /* right 8 elements of p3 are u pixels and left 8 elements of p3 are v pixels */
  ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
  ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  p1_d = __msa_copy_u_d((v2i64)p1, 0);
  p0_d = __msa_copy_u_d((v2i64)p0, 0);
  q0_d = __msa_copy_u_d((v2i64)q0, 0);
  q1_d = __msa_copy_u_d((v2i64)q1, 0);
  SD4(q1_d, q0_d, p0_d, p1_d, src_u, (-pitch));

  p1_d = __msa_copy_u_d((v2i64)p1, 1);
  p0_d = __msa_copy_u_d((v2i64)p0, 1);
  q0_d = __msa_copy_u_d((v2i64)q0, 1);
  q1_d = __msa_copy_u_d((v2i64)q1, 1);
  SD4(q1_d, q0_d, p0_d, p1_d, src_v, (-pitch));
}
static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v,
                                             int32_t pitch,
                                             const uint8_t b_limit_in,
                                             const uint8_t limit_in,
                                             const uint8_t thresh_in) {
  uint8_t *temp_src_u, *temp_src_v;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
  v16u8 row9, row10, row11, row12, row13, row14, row15;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14,
         row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
  tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1);
  tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0);
  ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);

  temp_src_u = src_u - 2;
  ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
  temp_src_u += 4 * pitch;
  ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);

  temp_src_v = src_v - 2;
  ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
  temp_src_v += 4 * pitch;
  ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
}
static void copy_8bit_value_width8_msa(uint8_t *src, uint8_t val,
                                       int32_t src_stride, int32_t height) {
  int32_t cnt;
  uint64_t dst0;
  v16u8 val0;

  val0 = (v16u8) __msa_fill_b(val);
  dst0 = __msa_copy_u_d((v2i64) val0, 0);

  for (cnt = (height >> 2); cnt--;) {
    /* The loop body is missing from the excerpt; the two lines below are a
     * hedged reconstruction: store the replicated 8-byte value to four
     * consecutive rows per iteration, matching the (height >> 2) count. */
    SD4(dst0, dst0, dst0, dst0, src, src_stride);
    src += (4 * src_stride);
  }
}
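/*
 * Example usage (hypothetical): fill an 8-pixel-wide, 16-row region of a
 * frame with mid-gray, assuming `frame` points at the region's top-left
 * pixel and `stride` is the frame stride in bytes:
 *
 *     copy_8bit_value_width8_msa(frame, 128, stride, 16);
 */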
void vp8_loop_filter_simple_horizontal_edge_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t *b_limit_ptr) {
  v16u8 p1, p0, q1, q0;
  v16u8 mask, b_limit;

  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);
  VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
  VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);
  ST_UB2(p0, q0, (src - pitch), pitch);
}
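/*
 * What VP8_SIMPLE_MASK and VP8_SIMPLE_FILT compute for one pixel column,
 * following the simple loop filter of the VP8 spec (RFC 6386): filter only
 * where 2*|p0-q0| + |p1-q1|/2 <= b_limit, then apply the common adjustment
 * in the signed domain (the vector code maps to/from that domain with
 * XORI_..._128).  A scalar sketch; the names below are illustrative:
 */
static int8_t vp8_clamp8(int32_t v) {
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

static void vp8_simple_filter_ref(uint8_t *p1, uint8_t *p0,
                                  uint8_t *q0, uint8_t *q1,
                                  int32_t b_limit) {
  const int32_t d0 = *p0 - *q0, d1 = *p1 - *q1;

  if (2 * (d0 < 0 ? -d0 : d0) + ((d1 < 0 ? -d1 : d1) >> 1) <= b_limit) {
    const int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
    const int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);
    const int8_t a = vp8_clamp8(vp8_clamp8(ps1 - qs1) + 3 * (qs0 - ps0));
    const int8_t f1 = vp8_clamp8(a + 4) >> 3;
    const int8_t f2 = vp8_clamp8(a + 3) >> 3;

    *q0 = (uint8_t)(vp8_clamp8(qs0 - f1) ^ 0x80);
    *p0 = (uint8_t)(vp8_clamp8(ps0 + f2) ^ 0x80);
  }
}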
static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t b_limit_in,
                                                const uint8_t limit_in,
                                                const uint8_t thresh_in) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;

  b_limit = (v16u8)__msa_fill_b(b_limit_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  thresh = (v16u8)__msa_fill_b(thresh_in);

  temp_src = src - (pitch << 2);
  LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  temp_src = src - 3 * pitch;
  ST_UB4(p2, p1, p0, q0, temp_src, pitch);
  temp_src += (4 * pitch);
  ST_UB2(q1, q2, temp_src, pitch);
}
void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0_ptr,
                                   const uint8_t *limit0_ptr,
                                   const uint8_t *thresh0_ptr,
                                   const uint8_t *b_limit1_ptr,
                                   const uint8_t *limit1_ptr,
                                   const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit0_ptr,
                                              const uint8_t *limit0_ptr,
                                              const uint8_t *thresh0_ptr,
                                              const uint8_t *b_limit1_ptr,
                                              const uint8_t *limit1_ptr,
                                              const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat;
  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
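/*
 * The *_dual variants above filter two adjacent 8-wide edges in one pass,
 * so each threshold vector carries the first edge's value in its low
 * 8 lanes and the second edge's value in its high 8 lanes.  That is all
 * the __msa_fill_b + __msa_ilvr_d pairs construct.  A scalar sketch of
 * the resulting lane layout (illustrative only; name hypothetical):
 */
static void pack_dual_threshold_ref(uint8_t out[16], uint8_t t0, uint8_t t1) {
  int32_t i;

  for (i = 0; i < 8; ++i) out[i] = t0;   /* lanes 0..7:  first edge  */
  for (i = 8; i < 16; ++i) out[i] = t1;  /* lanes 8..15: second edge */
}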
void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit   = (v16u8) __msa_fill_b(limit_in);
    thresh  = (v16u8) __msa_fill_b(thresh_in);

    /* load vector elements */
    temp_src = src - (pitch << 2);
    LD_UB8(temp_src, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    /* store vector elements */
    temp_src = src - 3 * pitch;
    ST_UB4(p2, p1, p0, q0, temp_src, pitch);
    temp_src += (4 * pitch);
    ST_UB2(q1, q2, temp_src, pitch);
}
void ff_vp8_v_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
                                     int b_limit_ptr)
{
    v16u8 p1, p0, q1, q0;
    v16u8 mask, b_limit;

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);

    /* load vector elements */
    LD_UB4(src - (pitch << 1), pitch, p1, p0, q0, q1);

    VP8_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
    VP8_SIMPLE_FILT(p1, p0, q0, q1, mask);

    ST_UB2(p0, q0, (src - pitch), pitch);
}
void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0_ptr,
                                 const uint8_t *limit0_ptr,
                                 const uint8_t *thresh0_ptr,
                                 const uint8_t *b_limit1_ptr,
                                 const uint8_t *limit1_ptr,
                                 const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat;
  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12,
         row13, row14, row15);
  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
                      p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

  src -= 2;
  ST4x8_UB(tmp2, tmp3, src, pitch);
  src += (8 * pitch);
  ST4x8_UB(tmp4, tmp5, src, pitch);
}
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
      v8i16 d1, d2;
      const v16i8 zero = { 0 };
      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
      const v16u8 T = LD_UB(top);

      ILVRL_B2_SH(zero, T, d1, d2);
      SUB2(d1, TL, d2, TL, d1, d2);
      for (j = 0; j < 16; j += 4) {
        v16i8 t0, t1, t2, t3;
        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
        CLIP_SH4_0_255(r0, r1, r2, r3);
        CLIP_SH4_0_255(r4, r5, r6, r7);
        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
        ST_SB4(t0, t1, t2, t3, dst, BPS);
        dst += 4 * BPS;
      }
    } else {
      HorizontalPred16x16(dst, left);
    }
  } else {
    if (top != NULL) {
      VerticalPred16x16(dst, top);
    } else {
      const v16u8 out = (v16u8)__msa_fill_b(0x81);
      STORE16x16(out, dst);
    }
  }
}
void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *b_limit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                     q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    src -= 3 * pitch;
    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
    src += (2 * pitch);
  }
}
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1,
                                 const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  temp_src = src - 4;

  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
                      q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                     q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}
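/*
 * On lanes where `flat` is set, the dual 8-pixel filters above replace the
 * filter4 outputs with a wider smoothing across p2..q2.  A scalar sketch of
 * the taps that VP9_FILTER8 is expected to apply (cf. the filter8 C
 * reference in vpx_dsp; rounding is (sum + 4) >> 3).  The function name
 * and array-based signature are illustrative, not from the source:
 */
static void vp9_filter8_ref(const uint8_t p[4], const uint8_t q[4],
                            uint8_t out_p[3], uint8_t out_q[3]) {
  /* p[0..3] = p0..p3 outward from the edge, q[0..3] = q0..q3 */
  out_p[2] = (uint8_t)((3 * p[3] + 2 * p[2] + p[1] + p[0] + q[0] + 4) >> 3);
  out_p[1] =
      (uint8_t)((2 * p[3] + p[2] + 2 * p[1] + p[0] + q[0] + q[1] + 4) >> 3);
  out_p[0] =
      (uint8_t)((p[3] + p[2] + p[1] + 2 * p[0] + q[0] + q[1] + q[2] + 4) >> 3);
  out_q[0] =
      (uint8_t)((p[2] + p[1] + p[0] + 2 * q[0] + q[1] + q[2] + q[3] + 4) >> 3);
  out_q[1] =
      (uint8_t)((p[1] + p[0] + q[0] + 2 * q[1] + q[2] + 2 * q[3] + 4) >> 3);
  out_q[2] = (uint8_t)((p[0] + q[0] + q[1] + 2 * q[2] + 3 * q[3] + 4) >> 3);
}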