void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0_ptr,
                                   const uint8_t *limit0_ptr,
                                   const uint8_t *thresh0_ptr,
                                   const uint8_t *b_limit1_ptr,
                                   const uint8_t *limit1_ptr,
                                   const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
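
/* Illustrative sketch (not part of libvpx): portable model of the
 * fill_b + ilvr_d packing used above.  Splatting the first edge's value
 * into lanes 0..7 and the second edge's value into lanes 8..15 lets one
 * 16-lane filter invocation apply different thresh/b_limit/limit values
 * to two adjacent 8-pixel edges. */
static void pack_dual_param_model(uint8_t out[16], uint8_t val0,
                                  uint8_t val1) {
  int i;

  for (i = 0; i < 8; i++) out[i] = val0;   /* low half: first edge */
  for (i = 8; i < 16; i++) out[i] = val1;  /* high half: second edge */
}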
static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
                                              const uint8_t *b_limit0_ptr,
                                              const uint8_t *limit0_ptr,
                                              const uint8_t *thresh0_ptr,
                                              const uint8_t *b_limit1_ptr,
                                              const uint8_t *limit1_ptr,
                                              const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat;
  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
               mask, flat);
  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}
void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0_ptr,
                                 const uint8_t *limit0_ptr,
                                 const uint8_t *thresh0_ptr,
                                 const uint8_t *b_limit1_ptr,
                                 const uint8_t *limit1_ptr,
                                 const uint8_t *thresh1_ptr) {
  v16u8 mask, hev, flat;
  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;

  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  LD_UB8(src - 4 + (8 * pitch), pitch,
         row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
                      row8, row9, row10, row11, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);

  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);

  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
               hev, mask, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);

  src -= 2;

  ST4x8_UB(tmp2, tmp3, src, pitch);
  src += (8 * pitch);
  ST4x8_UB(tmp4, tmp5, src, pitch);
}
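
/* Illustrative sketch (not part of libvpx): the vertical filter transposes
 * a 16x8 pixel tile on load so that each column of neighbors p3..q3 lands
 * in a full vector register, letting the horizontal filter math be reused
 * unchanged.  Scalar model of TRANSPOSE16x8_UB_UB: */
static void transpose_16x8_model(const uint8_t in[16][8], uint8_t out[8][16]) {
  int r, c;

  for (r = 0; r < 16; r++) {
    for (c = 0; c < 8; c++) {
      out[c][r] = in[r][c];
    }
  }
}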
void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *b_limit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    src -= 3 * pitch;

    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
    src += (2 * pitch);
  }
}
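
/* Illustrative sketch (not part of libvpx): per-byte model of
 * __msa_bmnz_v(dst, src, mask) as used for the flat blend above -- bytes
 * whose mask bits are set take the filter8 result, the rest keep the
 * filter4 result. */
static uint8_t bmnz_model(uint8_t dst, uint8_t src, uint8_t mask) {
  return (uint8_t)((dst & ~mask) | (src & mask));
}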
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1,
                                 const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  temp_src = src - 4;

  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
                      q3, q2, q1, q0, row12, row13, row14, row15,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}
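
/* Illustrative sketch (not part of libvpx): __msa_test_bz_v(flat) branches
 * when the whole vector is zero, i.e. no pixel on either edge qualified
 * for the wider filter8 path, so the cheaper 4-tap store path is taken.
 * Scalar model: */
static int test_bz_model(const uint8_t v[16]) {
  int i;
  uint8_t acc = 0;

  for (i = 0; i < 16; i++) acc |= v[i];
  return acc == 0;
}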
void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4;

  /* load vector elements */
  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
                     p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* Store 4 pixels p1 - q1 */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* Store 6 pixels p2 - q2 */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src -= 3;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 4, src + 4, pitch);
  }
}
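
/* Illustrative sketch (not part of libvpx): the 8-wide paths above compute
 * only 8 valid columns, so flat's upper doubleword is cleared with
 * ilvr_d(zero, flat) before the branch-if-zero test -- stale high lanes
 * must not force the filter8 path.  Scalar model: */
static void clear_high_half_model(uint8_t flat[16]) {
  int i;

  for (i = 8; i < 16; i++) flat[i] = 0;
}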
void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
  v16i8 zero = { 0 };

  /* load vector elements */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
               hev, mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
               q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                q0_filter8);
    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

    src -= 3 * pitch;

    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
    src += (4 * pitch);
    SD(q1_d, src);
    src += pitch;
    SD(q2_d, src);
  }
}
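
/* Illustrative sketch (not part of libvpx): with only 8 valid output bytes
 * per row, the 8-wide horizontal filter extracts the low doubleword of each
 * result (__msa_copy_u_d(..., 0)) and writes it with one 64-bit scalar
 * store (SD/SD4).  Scalar model: */
static void store_low8_model(uint8_t *dst, const uint8_t vec[16]) {
  int i;

  for (i = 0; i < 8; i++) dst[i] = vec[i];
}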
static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
    v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7;
    v2f64 src_b10, src_b11, src_b15;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
    LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
    LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
    LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);

    if (bk)
    {
        BLASLONG i, pref_offset;
        FLOAT *pa0_pref;
        v2f64 src_a0, src_a1, src_a2, src_a3, src_b;

        pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);

        if (pref_offset)
        {
            pref_offset = L1_DATA_LINESIZE - pref_offset;
            pref_offset = pref_offset / sizeof(FLOAT);
        }

        pa0_pref = a + pref_offset;

        for (i = (bk >> 1); i--;)
        {
            PREF_OFFSET(pa0_pref, 128);
            PREF_OFFSET(pa0_pref, 160);
            PREF_OFFSET(pa0_pref, 192);
            PREF_OFFSET(pa0_pref, 224);

            LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
            LD_DP2_INC(b, 2, src_b0, src_b1);

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
            src_c8  -= src_a0 * src_b;
            src_c9  -= src_a1 * src_b;
            src_c10 -= src_a2 * src_b;
            src_c11 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
            src_c12 -= src_a0 * src_b;
            src_c13 -= src_a1 * src_b;
            src_c14 -= src_a2 * src_b;
            src_c15 -= src_a3 * src_b;

            LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
            LD_DP2_INC(b, 2, src_b0, src_b1);

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
            src_c8  -= src_a0 * src_b;
            src_c9  -= src_a1 * src_b;
            src_c10 -= src_a2 * src_b;
            src_c11 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
            src_c12 -= src_a0 * src_b;
            src_c13 -= src_a1 * src_b;
            src_c14 -= src_a2 * src_b;
            src_c15 -= src_a3 * src_b;

            pa0_pref += 16;
        }

        if (bk & 1)
        {
            LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
            LD_DP2_INC(b, 2, src_b0, src_b1);

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
            src_c8  -= src_a0 * src_b;
            src_c9  -= src_a1 * src_b;
            src_c10 -= src_a2 * src_b;
            src_c11 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
            src_c12 -= src_a0 * src_b;
            src_c13 -= src_a1 * src_b;
            src_c14 -= src_a2 * src_b;
            src_c15 -= src_a3 * src_b;
        }
    }

    /* (the triangular solve against B and the final stores back to b and c
       follow here in the full RN kernel; omitted in this excerpt) */
}
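
/* Illustrative sketch (not part of OpenBLAS): scalar model of one k-step of
 * the update loop in dsolve_8x4_rn_msa.  Each element of the B row is
 * broadcast (ilvr_d/ilvl_d of src_b with itself) and a rank-1 update
 * C -= a_col * b_row is subtracted from the 8x4 tile of C. */
static void rank1_update_8x4_model(double c[4][8], const double a_col[8],
                                   const double b_row[4])
{
    BLASLONG i, j;

    for (j = 0; j < 4; j++)
    {
        for (i = 0; i < 8; i++)
        {
            c[j][i] -= a_col[i] * b_row[j];
        }
    }
}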
static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
    v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13;
    v2f64 src_a14, src_a15;

    LD_DP2(c, 2, src_c0, src_c1);
    LD_DP2(c + ldc, 2, src_c2, src_c3);

    if (bk > 0)
    {
        BLASLONG i;
        FLOAT *aa = a, *bb = b;
        v2f64 src_a0, src_a1, src_b, src_b0;

        for (i = bk; i--;)
        {
            LD_DP2(aa, 2, src_a0, src_a1);
            src_b0 = LD_DP(bb);

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c2 -= src_a0 * src_b;
            src_c3 -= src_a1 * src_b;

            aa += 4;
            bb += 2;
        }
    }

    a -= 16;
    b -= 8;

    ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
    ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);

    src_a14 = LD_DP(a + 14);
    src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
    src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);

    src_a12 = LD_DP(a + 12);
    src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
    src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);

    src_a9 = LD_DP(a + 9);
    src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);

    src_a8 = __msa_cast_to_vector_double(*(a + 8));
    src_a0 = __msa_cast_to_vector_double(*(a + 0));

    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);

    src_a4 = LD_DP(a + 4);
    src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
    src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);

    res_c3 *= src_a15;

    res_c2 -= res_c3 * src_a14;
    res_c2 *= src_a10;

    res_c1 -= res_c3 * src_a13;
    res_c1 -= res_c2 * src_a9;
    res_c1 *= src_a5;

    res_c0 -= res_c3 * src_a12;
    res_c0 -= res_c2 * src_a8;
    res_c0 -= res_c1 * src_a4;
    res_c0 *= src_a0;

    ST_DP(res_c3, b + 6);
    ST_DP(res_c2, b + 4);
    ST_DP(res_c1, b + 2);
    ST_DP(res_c0, b + 0);

    ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
    ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);

    ST_DP2(src_c0, src_c1, c, 2);
    ST_DP2(src_c2, src_c3, c + ldc, 2);
}
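
/* Illustrative sketch (not part of OpenBLAS): scalar model of the 4-step
 * substitution above.  a[] is the packed 4x4 triangular factor with its
 * diagonal entries (a[0], a[5], a[10], a[15]) stored pre-inverted, so each
 * divide becomes a multiply; x[] is one right-hand-side column, solved from
 * the last entry upward exactly as res_c3..res_c0 are. */
static void solve_4_model(const double a[16], double x[4])
{
    x[3] *= a[15];
    x[2] = (x[2] - x[3] * a[14]) * a[10];
    x[1] = (x[1] - x[3] * a[13] - x[2] * a[9]) * a[5];
    x[0] = (x[0] - x[3] * a[12] - x[2] * a[8] - x[1] * a[4]) * a[0];
}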
static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
    v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
    v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
    v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
    v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
    v2f64 src_a61, src_a62, src_a63;

    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
    LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);

    if (bk > 0)
    {
        BLASLONG i;
        FLOAT *pba = a, *pbb = b;
        v2f64 src_b, src_b0, src_b1;

        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
        src_b0 = LD_DP(pbb);

        for (i = bk - 1; i--;)
        {
            pba += 8;
            pbb += 2;

            LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
            src_b1 = LD_DP(pbb);

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            src_a0 = src_a8;
            src_a1 = src_a9;
            src_a2 = src_a16;
            src_a3 = src_a17;
            src_b0 = src_b1;
        }

        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
        src_c0 -= src_a0 * src_b;
        src_c1 -= src_a1 * src_b;
        src_c2 -= src_a2 * src_b;
        src_c3 -= src_a3 * src_b;

        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
        src_c4 -= src_a0 * src_b;
        src_c5 -= src_a1 * src_b;
        src_c6 -= src_a2 * src_b;
        src_c7 -= src_a3 * src_b;
    }

    ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
    ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
    ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
    ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);

    src_a56 = LD_DP(a - 8);
    src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
    src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
    src_a58 = LD_DP(a - 6);
    src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
    src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);
    src_a60 = LD_DP(a - 4);
    src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
    src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
    src_a62 = LD_DP(a - 2);
    src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
    src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);

    res_c7 *= src_a63;
    res_c6 -= res_c7 * src_a62;
    res_c5 -= res_c7 * src_a61;
    res_c4 -= res_c7 * src_a60;
    res_c3 -= res_c7 * src_a59;
    res_c2 -= res_c7 * src_a58;
    res_c1 -= res_c7 * src_a57;
    res_c0 -= res_c7 * src_a56;

    src_a48 = LD_DP(a - 16);
    src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
    src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
    src_a50 = LD_DP(a - 14);
    src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
    src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
    src_a52 = LD_DP(a - 12);
    src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
    src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
    src_a54 = __msa_cast_to_vector_double(*(a - 10));
    src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);

    src_a40 = LD_DP(a - 24);
    src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
    src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);
    src_a42 = LD_DP(a - 22);
    src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
    src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
    src_a44 = LD_DP(a - 20);
    src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
    src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);

    res_c6 *= src_a54;
    res_c5 -= res_c6 * src_a53;
    res_c4 -= res_c6 * src_a52;
    res_c3 -= res_c6 * src_a51;
    res_c2 -= res_c6 * src_a50;
    res_c1 -= res_c6 * src_a49;
    res_c0 -= res_c6 * src_a48;

    res_c5 *= src_a45;
    res_c4 -= res_c5 * src_a44;
    res_c3 -= res_c5 * src_a43;
    res_c2 -= res_c5 * src_a42;
    res_c1 -= res_c5 * src_a41;
    res_c0 -= res_c5 * src_a40;

    ST_DP(res_c7, b - 2);
    ST_DP(res_c6, b - 4);
    ST_DP(res_c5, b - 6);

    src_a32 = LD_DP(a - 32);
    src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
    src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
    src_a34 = LD_DP(a - 30);
    src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
    src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
    src_a36 = __msa_cast_to_vector_double(*(a - 28));
    src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);

    res_c4 *= src_a36;
    res_c3 -= res_c4 * src_a35;
    res_c2 -= res_c4 * src_a34;
    res_c1 -= res_c4 * src_a33;
    res_c0 -= res_c4 * src_a32;

    src_a24 = LD_DP(a - 40);
    src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
    src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
    src_a26 = LD_DP(a - 38);
    src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
    src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
    src_a16 = LD_DP(a - 48);
    src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
    src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
    src_a18 = __msa_cast_to_vector_double(*(a - 46));
    src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
    src_a0 = __msa_cast_to_vector_double(*(a - 64));
    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
    src_a8 = LD_DP(a - 56);
    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);

    res_c3 *= src_a27;
    res_c2 -= res_c3 * src_a26;
    res_c1 -= res_c3 * src_a25;
    res_c0 -= res_c3 * src_a24;

    res_c2 *= src_a18;
    res_c1 -= res_c2 * src_a17;
    res_c0 -= res_c2 * src_a16;

    res_c1 *= src_a9;
    res_c0 -= res_c1 * src_a8;

    res_c0 *= src_a0;

    ST_DP(res_c4, b - 8);
    ST_DP(res_c3, b - 10);
    ST_DP(res_c2, b - 12);
    ST_DP(res_c1, b - 14);
    ST_DP(res_c0, b - 16);

    ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
    ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
    ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
    ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);

    ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
    ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
}
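
/* Illustrative sketch (not part of OpenBLAS): ILVRL_D2_DP is a 2x2 double
 * transpose.  On entry it pairs the same row of two C columns into one
 * vector so each substitution step updates both right-hand sides at once;
 * on exit the same interleave restores column order for the stores. */
static void ilvrl_d2_model(const double col0[2], const double col1[2],
                           double row0[2], double row1[2])
{
    row0[0] = col0[0];   /* ilvr: low doublewords  */
    row0[1] = col1[0];
    row1[0] = col0[1];   /* ilvl: high doublewords */
    row1[1] = col1[1];
}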
static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
    v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
    v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
    v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
    v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
    v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
    v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
    v2f64 src_a61, src_a62, src_a63;
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
    LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
    LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
    LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);

    if (bk > 0)
    {
        BLASLONG i;
        FLOAT *pba = a, *pbb = b;
        v2f64 src_b, src_b0, src_b1, src_b2, src_b3;

        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
        LD_DP2(pbb, 2, src_b0, src_b1);

        for (i = (bk - 1); i--;)
        {
            pba += 8;
            pbb += 4;

            LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
            LD_DP2(pbb, 2, src_b2, src_b3);

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
            src_c8  -= src_a0 * src_b;
            src_c9  -= src_a1 * src_b;
            src_c10 -= src_a2 * src_b;
            src_c11 -= src_a3 * src_b;

            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
            src_c12 -= src_a0 * src_b;
            src_c13 -= src_a1 * src_b;
            src_c14 -= src_a2 * src_b;
            src_c15 -= src_a3 * src_b;

            src_a0 = src_a8;
            src_a1 = src_a9;
            src_a2 = src_a16;
            src_a3 = src_a17;
            src_b0 = src_b2;
            src_b1 = src_b3;
        }

        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
        src_c0 -= src_a0 * src_b;
        src_c1 -= src_a1 * src_b;
        src_c2 -= src_a2 * src_b;
        src_c3 -= src_a3 * src_b;

        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
        src_c4 -= src_a0 * src_b;
        src_c5 -= src_a1 * src_b;
        src_c6 -= src_a2 * src_b;
        src_c7 -= src_a3 * src_b;

        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
        src_c8  -= src_a0 * src_b;
        src_c9  -= src_a1 * src_b;
        src_c10 -= src_a2 * src_b;
        src_c11 -= src_a3 * src_b;

        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
        src_c12 -= src_a0 * src_b;
        src_c13 -= src_a1 * src_b;
        src_c14 -= src_a2 * src_b;
        src_c15 -= src_a3 * src_b;
    }

    a -= 64;
    b -= 32;

    ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
    ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
    ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
    ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
    ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
    ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
    ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
    ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);

    src_a54 = __msa_cast_to_vector_double(*(a + 54));
    src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
    src_a62 = LD_DP(a + 62);
    src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
    src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
    src_a60 = LD_DP(a + 60);
    src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
    src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
    src_a52 = LD_DP(a + 52);
    src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
    src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
    src_a44 = LD_DP(a + 44);
    src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
    src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
    src_a36 = __msa_cast_to_vector_double(*(a + 36));
    src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);

    res_c7 *= src_a63;
    res_c6 -= res_c7 * src_a62;
    res_c6 *= src_a54;

    res_c15 *= src_a63;
    res_c14 -= res_c15 * src_a62;
    res_c14 *= src_a54;

    ST_DP(res_c7, b + 28);
    ST_DP(res_c6, b + 24);
    ST_DP(res_c15, b + 30);
    ST_DP(res_c14, b + 26);
    ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
    ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
    ST_DP(src_c3, c + 6);
    ST_DP(src_c7, c_nxt1line + 6);
    ST_DP(src_c11, c_nxt2line + 6);
    ST_DP(src_c15, c_nxt3line + 6);

    res_c5 -= res_c7 * src_a61;
    res_c5 -= res_c6 * src_a53;
    res_c5 *= src_a45;

    res_c4 -= res_c7 * src_a60;
    res_c4 -= res_c6 * src_a52;
    res_c4 -= res_c5 * src_a44;
    res_c4 *= src_a36;

    res_c13 -= res_c15 * src_a61;
    res_c13 -= res_c14 * src_a53;
    res_c13 *= src_a45;

    res_c12 -= res_c15 * src_a60;
    res_c12 -= res_c14 * src_a52;
    res_c12 -= res_c13 * src_a44;
    res_c12 *= src_a36;

    src_a56 = LD_DP(a + 56);
    src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
    src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
    src_a58 = LD_DP(a + 58);
    src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
    src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);

    ST_DP(res_c4, b + 16);
    ST_DP(res_c5, b + 20);
    ST_DP(res_c12, b + 18);
    ST_DP(res_c13, b + 22);

    ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
    ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
    ST_DP(src_c2, c + 4);
    ST_DP(src_c6, c_nxt1line + 4);
    ST_DP(src_c10, c_nxt2line + 4);
    ST_DP(src_c14, c_nxt3line + 4);

    src_a50 = LD_DP(a + 50);
    src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
    src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
    src_a42 = LD_DP(a + 42);
    src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
    src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
    src_a34 = LD_DP(a + 34);
    src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
    src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
    src_a26 = LD_DP(a + 26);
    src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
    src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
    src_a18 = __msa_cast_to_vector_double(*(a + 18));
    src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);

    res_c3 -= res_c7 * src_a59;
    res_c2 -= res_c7 * src_a58;
    res_c1 -= res_c7 * src_a57;
    res_c0 -= res_c7 * src_a56;

    res_c11 -= res_c15 * src_a59;
    res_c10 -= res_c15 * src_a58;
    res_c9 -= res_c15 * src_a57;
    res_c8 -= res_c15 * src_a56;

    res_c3 -= res_c6 * src_a51;
    res_c3 -= res_c5 * src_a43;
    res_c3 -= res_c4 * src_a35;
    res_c3 *= src_a27;

    res_c2 -= res_c6 * src_a50;
    res_c2 -= res_c5 * src_a42;
    res_c2 -= res_c4 * src_a34;
    res_c2 -= res_c3 * src_a26;
    res_c2 *= src_a18;

    res_c11 -= res_c14 * src_a51;
    res_c11 -= res_c13 * src_a43;
    res_c11 -= res_c12 * src_a35;
    res_c11 *= src_a27;

    res_c10 -= res_c14 * src_a50;
    res_c10 -= res_c13 * src_a42;
    res_c10 -= res_c12 * src_a34;
    res_c10 -= res_c11 * src_a26;
    res_c10 *= src_a18;

    src_a48 = LD_DP(a + 48);
    src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
    src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
    src_a40 = LD_DP(a + 40);
    src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
    src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);

    ST_DP(res_c2, b + 8);
    ST_DP(res_c3, b + 12);
    ST_DP(res_c10, b + 10);
    ST_DP(res_c11, b + 14);

    src_a32 = LD_DP(a + 32);
    src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
    src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
    src_a24 = LD_DP(a + 24);
    src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
    src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);

    ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
    ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
    ST_DP(src_c1, c + 2);
    ST_DP(src_c5, c_nxt1line + 2);
    ST_DP(src_c9, c_nxt2line + 2);
    ST_DP(src_c13, c_nxt3line + 2);

    res_c1 -= res_c6 * src_a49;
    res_c1 -= res_c5 * src_a41;
    res_c1 -= res_c4 * src_a33;
    res_c1 -= res_c3 * src_a25;

    res_c0 -= res_c6 * src_a48;
    res_c0 -= res_c5 * src_a40;
    res_c0 -= res_c4 * src_a32;
    res_c0 -= res_c3 * src_a24;

    res_c9 -= res_c14 * src_a49;
    res_c9 -= res_c13 * src_a41;
    res_c9 -= res_c12 * src_a33;
    res_c9 -= res_c11 * src_a25;

    res_c8 -= res_c14 * src_a48;
    res_c8 -= res_c13 * src_a40;
    res_c8 -= res_c12 * src_a32;
    res_c8 -= res_c11 * src_a24;

    src_a16 = LD_DP(a + 16);
    src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
    src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
    src_a8 = LD_DP(a + 8);
    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
    src_a0 = __msa_cast_to_vector_double(*(a + 0));
    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);

    res_c1 -= res_c2 * src_a17;
    res_c1 *= src_a9;

    res_c9 -= res_c10 * src_a17;
    res_c9 *= src_a9;

    res_c0 -= res_c2 * src_a16;
    res_c0 -= res_c1 * src_a8;
    res_c0 *= src_a0;

    res_c8 -= res_c10 * src_a16;
    res_c8 -= res_c9 * src_a8;
    res_c8 *= src_a0;

    ST_DP(res_c0, b + 0);
    ST_DP(res_c8, b + 2);
    ST_DP(res_c1, b + 4);
    ST_DP(res_c9, b + 6);

    ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
    ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);

    ST_DP(src_c0, c);
    ST_DP(src_c4, c_nxt1line);
    ST_DP(src_c8, c_nxt2line);
    ST_DP(src_c12, c_nxt3line);
}
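
/* Illustrative sketch (not part of OpenBLAS): the coefficient loads above
 * all follow one pattern -- LD_DP fetches two consecutive A entries, then
 * splati_d broadcasts each across a full vector so it can multiply both
 * packed right-hand sides.  Scalar model: */
static void load_splat_pair_model(const double *a, double even[2],
                                  double odd[2])
{
    even[0] = even[1] = a[0];   /* splati_d(..., 0) */
    odd[0]  = odd[1]  = a[1];   /* splati_d(..., 1) */
}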
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
    XORI_B2_128_SB(src8776, src10998);
    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                filt1, filt2, filt3);
    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                filt1, filt2, filt3);
    SRARI_H2_SH(out10, out32, FILTER_BITS);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);

    dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
    out = __msa_aver_u_b(out, dst0);

    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    src2110 = src6554;
    src4332 = src8776;
    src6554 = src10998;
    src6 = src10;
  }
}

static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,