src54_r, src21_r);
  /* Combine row pairs (src3,src4) and (src5,src6) into src43_r / src65_r.
   * NOTE(review): ILVR_B* presumably byte-interleaves the right (low) halves
   * of each pair - confirm against the macro header. */
  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  /* Merge two row-pair vectors into one register each; going by the names,
   * src2110 holds pairs 21|10, src4332 holds 43|32, src6554 holds 65|54. */
  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
             src4332, src6554);
  /* XOR with 128 - presumably re-biases unsigned pixels to signed bytes for
   * the signed dot-product filter; undone by PCKEV_XORI128_UB below. */
  XORI_B3_128_SB(src2110, src4332, src6554);

  /* Each iteration consumes four new source rows and produces four output
   * rows; height is processed in multiples of four (height >> 2 passes). */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Fetch the next four source rows and step the source pointer past them. */
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    /* Fetch the four destination rows that the result is averaged with. */
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* Build the new adjacent-row pairs (6,7) (7,8) (8,9) (9,10). */
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
    XORI_B2_128_SB(src8776, src10998);
    /* Two filter evaluations over a sliding window of four packed vectors;
     * the second window is shifted by one vector relative to the first. */
    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                filt1, filt2, filt3);
    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                filt1, filt2, filt3);
    /* Round by FILTER_BITS, then saturate (the literal 7 is presumably the
     * saturation bit position, i.e. a signed 8-bit range - TODO confirm). */
    SRARI_H2_SH(out10, out32, FILTER_BITS);
    SAT_SH2_SH(out10, out32, 7);
    /* Pack both results into one byte vector and remove the 128 bias. */
    out = PCKEV_XORI128_UB(out10, out32);
    /* Gather the four loaded destination rows into a single vector (dst0)
     * so they can be averaged against `out` in one operation. */
    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);

    dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
    /* Unsigned byte average of the filtered pixels with the existing
     * destination pixels. */
    out = __msa_aver_u_b(out, dst0);

    /* Write the four words of `out` to four consecutive destination rows
     * (presumably one 4-byte word per row) and advance by four rows. */
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newer packed row vectors over as the oldest filter inputs
     * for the next iteration. */
    src2110 = src6554;
    src4332 = src8776;
  /* Load the vertical filter coefficients and broadcast halfword elements
   * 0..3 into filt_vt0..filt_vt3 (presumably one coefficient pair each -
   * confirm against SPLATI_H4_SH). */
  filt = LD_SH(filter_vert);
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* Pair up the already-computed horizontal results (0,1), (2,3) and (4,5)
   * by even-byte interleave to form the initial vertical filter inputs. */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  /* Each iteration consumes four new source rows and stores four output
   * rows; height is processed in multiples of four (height >> 2 passes). */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Load four rows, XOR them with 128 (presumably unsigned -> signed bias)
     * and step the source pointer past them. */
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    /* Horizontal pass over rows 7 and 8 in a single call. */
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    /* Derive hz_out6 by an 8-byte slide across hz_out5 and hz_out7, so the
     * intermediate vector is built from the two neighbouring results rather
     * than filtered again. */
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    /* Vertical pass for the first half of this iteration's output. */
    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Same sequence for rows 9 and 10, with the window shifted by one. */
    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    /* Round by FILTER_BITS, saturate, pack to bytes and remove the 128 bias. */
    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    /* Store straight to the destination (no averaging with existing pixels)
     * and advance by four rows. */
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Carry the newest horizontal result and paired vectors over as the
     * oldest inputs of the next iteration. */
    hz_out5 = hz_out9;
    out0 = out2;
  /* Broadcast halfword elements 0..3 of `filt` into the four vertical
   * filter registers (presumably one coefficient pair each - confirm
   * against SPLATI_H4_SH). */
  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

  /* Pair up the already-computed horizontal results (0,1), (2,3) and (4,5)
   * by even-byte interleave to form the initial vertical filter inputs. */
  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);

  /* Each iteration consumes four new source rows and stores four output
   * rows; height is processed in multiples of four (height >> 2 passes). */
  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Load four rows, XOR them with 128 (presumably unsigned -> signed bias)
     * and step the source pointer past them. */
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    XORI_B4_128_SB(src7, src8, src9, src10);
    src += (4 * src_stride);

    /* Fetch the four destination rows that the result is averaged with. */
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* Horizontal pass over rows 7 and 8; hz_out6 is derived by an 8-byte
     * slide across hz_out5 and hz_out7 instead of being filtered again. */
    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
    vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
    /* Vertical pass for the first half of this iteration's output. */
    res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);

    /* Same sequence for rows 9 and 10, with the window shifted by one. */
    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
    vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
    res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                               filt_vt2, filt_vt3);
    /* Combine destination rows (0,1) into dst0 and rows (2,3) into dst2 so
     * they line up with tmp0 / tmp1 for the average below. */
    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);

    /* Round by FILTER_BITS and saturate the two filter results. */
    SRARI_H2_SH(res0, res1, FILTER_BITS);
    SAT_SH2_SH(res0, res1, 7);
    /* Pack each result to bytes in its own vector, then remove the 128
     * bias applied to the source rows on load. */
    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
    XORI_B2_128_UB(tmp0, tmp1);
    /* Average the filtered pixels with the existing destination pixels. */
    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
    /* Store words 0 and 1 of tmp0, then words 0 and 1 of tmp1, to four
     * consecutive destination rows (presumably 4 bytes per row). */
    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);