static void loop_filter_vertical_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, const uint8_t limit_in, const uint8_t thresh_in) { uint8_t *temp_src_u, *temp_src_v; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 mask, hev, flat, thresh, limit, b_limit; v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8; v16u8 row9, row10, row11, row12, row13, row14, row15; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; thresh = (v16u8)__msa_fill_b(thresh_in); limit = (v16u8)__msa_fill_b(limit_in); b_limit = (v16u8)__msa_fill_b(b_limit_in); LD_UB8(src_u - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); LD_UB8(src_v - 4, pitch, row8, row9, row10, row11, row12, row13, row14, row15); TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, row9, row10, row11, row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, q3); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1); ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3); tmp0 = (v4i32)__msa_ilvl_b((v16i8)p0, (v16i8)p1); tmp1 = (v4i32)__msa_ilvl_b((v16i8)q1, (v16i8)q0); ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5); temp_src_u = src_u - 2; ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch); temp_src_u += 4 * pitch; ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch); temp_src_v = src_v - 2; ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch); temp_src_v += 4 * pitch; ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch); }
/*
 * In-place 4x4 inverse transform of a coefficient block.
 *
 * coeffs  16 int16_t values, read as two 8-element vectors and overwritten
 *         with the result at the same locations.
 *
 * The block goes through HEVC_IDCT4x4_COL twice with a 4x4 transpose in
 * between; the last macro argument is 7 for the first pass and 12 for the
 * second (presumably the rounding shift of each pass -- the macro is defined
 * outside this block).  The 32-bit results are then packed back to 16 bit
 * and interleaved into the final layout before being stored.
 */
static void hevc_idct_4x4_msa(int16_t *coeffs)
{
    v8i16 zero_h = { 0 };
    v8i16 coef_a, coef_b;
    v8i16 packed_a, packed_b;
    v8i16 res_a, res_b;
    v4i32 wide_r0, wide_l0, wide_r1, wide_l1;
    v4i32 pass1_0, pass1_1, pass1_2, pass1_3;
    v4i32 tr0, tr1, tr2, tr3;
    v4i32 pass2_0, pass2_1, pass2_2, pass2_3;
    v4i32 mix_a, mix_b;

    LD_SH2(coeffs, 8, coef_a, coef_b);

    /* Pair every coefficient halfword with a zero halfword. */
    ILVRL_H2_SW(zero_h, coef_a, wide_r0, wide_l0);
    ILVRL_H2_SW(zero_h, coef_b, wide_r1, wide_l1);

    /* First pass. */
    HEVC_IDCT4x4_COL(wide_r0, wide_l0, wide_r1, wide_l1,
                     pass1_0, pass1_1, pass1_2, pass1_3, 7);

    /* Second pass runs on the transposed result of the first. */
    TRANSPOSE4x4_SW_SW(pass1_0, pass1_1, pass1_2, pass1_3,
                       tr0, tr1, tr2, tr3);
    HEVC_IDCT4x4_COL(tr0, tr1, tr2, tr3,
                     pass2_0, pass2_1, pass2_2, pass2_3, 12);

    /* Pack and transpose */
    PCKEV_H2_SH(pass2_2, pass2_0, pass2_3, pass2_1, packed_a, packed_b);
    ILVRL_H2_SW(packed_b, packed_a, mix_a, mix_b);
    ILVRL_W2_SH(mix_b, mix_a, res_a, res_b);

    ST_SH2(res_a, res_b, coeffs, 8);
}
/*
 * In-place 4x4 inverse transform using the luma-specific column kernel.
 *
 * coeffs  16 int16_t values, read as two 8-element vectors and overwritten
 *         with the result at the same locations.
 *
 * The coefficients are widened with UNPCK_SH_SW, run through
 * HEVC_IDCT_LUMA4x4_COL twice with a 4x4 transpose in between (last macro
 * argument 7, then 12 -- presumably the per-pass rounding shift; the macro
 * is defined outside this block), and finally packed back to 16 bit and
 * interleaved into the output layout.
 */
static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
{
    v8i16 coef_a, coef_b;
    v8i16 packed_a, packed_b;
    v8i16 res_a, res_b;
    v4i32 wide_r0, wide_l0, wide_r1, wide_l1;
    v4i32 pass1_0, pass1_1, pass1_2, pass1_3;
    v4i32 tr0, tr1, tr2, tr3;
    v4i32 pass2_0, pass2_1, pass2_2, pass2_3;
    v4i32 mix_a, mix_b;

    LD_SH2(coeffs, 8, coef_a, coef_b);

    /* Widen each 16-bit coefficient vector into two 32-bit vectors. */
    UNPCK_SH_SW(coef_a, wide_r0, wide_l0);
    UNPCK_SH_SW(coef_b, wide_r1, wide_l1);

    /* First pass. */
    HEVC_IDCT_LUMA4x4_COL(wide_r0, wide_l0, wide_r1, wide_l1,
                          pass1_0, pass1_1, pass1_2, pass1_3, 7);

    /* Second pass runs on the transposed result of the first. */
    TRANSPOSE4x4_SW_SW(pass1_0, pass1_1, pass1_2, pass1_3,
                       tr0, tr1, tr2, tr3);
    HEVC_IDCT_LUMA4x4_COL(tr0, tr1, tr2, tr3,
                          pass2_0, pass2_1, pass2_2, pass2_3, 12);

    /* Pack and transpose */
    PCKEV_H2_SH(pass2_2, pass2_0, pass2_3, pass2_1, packed_a, packed_b);
    ILVRL_H2_SW(packed_b, packed_a, mix_a, mix_b);
    ILVRL_W2_SH(mix_b, mix_a, res_a, res_b);

    ST_SH2(res_a, res_b, coeffs, 8);
}