void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit0_ptr, const uint8_t *limit0_ptr, const uint8_t *thresh0_ptr, const uint8_t *b_limit1_ptr, const uint8_t *limit1_ptr, const uint8_t *thresh1_ptr) { v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; /* load vector elements */ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); limit0 = (v16u8)__msa_fill_b(*limit0_ptr); limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, mask, flat); VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); }
static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit0_ptr, const uint8_t *limit0_ptr, const uint8_t *thresh0_ptr, const uint8_t *b_limit1_ptr, const uint8_t *limit1_ptr, const uint8_t *thresh1_ptr) { v16u8 mask, hev, flat; v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); limit0 = (v16u8)__msa_fill_b(*limit0_ptr); limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, mask, flat); VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); }
void vp9_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit0_ptr, const uint8_t *limit0_ptr, const uint8_t *thresh0_ptr, const uint8_t *b_limit1_ptr, const uint8_t *limit1_ptr, const uint8_t *thresh1_ptr) { v16u8 mask, hev, flat; v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 row0, row1, row2, row3, row4, row5, row6, row7; v16u8 row8, row9, row10, row11, row12, row13, row14, row15; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, row14, row15); TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, row9, row10, row11, row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, q3); thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); limit0 = (v16u8)__msa_fill_b(*limit0_ptr); limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, mask, flat); VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); src -= 2; ST4x8_UB(tmp2, tmp3, src, pitch); src += (8 * pitch); ST4x8_UB(tmp4, tmp5, src, pitch); }
/* Dual horizontal 8-tap loop filter: filters two adjacent 8-pixel edges in
 * one 16-lane pass (lanes 0-7 use the *0 limits, lanes 8-15 the *1 limits).
 * Applies the 4-tap filter everywhere and, where the "flat" mask is set,
 * overrides those lanes with the wider filter8 result. */
void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *b_limit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements: 8 rows straddling the edge */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* splat both limit sets and join them: low dword = set 0, high = set 1 */
  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                     q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no flat lanes anywhere: the 4-tap result is final */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    /* widen the right (low) halves to 16 bit and run filter8 on them */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* same for the left (high) halves */
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per lane, pick filter8 where flat else filter4
       (p2/q2 keep the original pixel in non-flat lanes) */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    src -= 3 * pitch;
    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
    src += (2 * pitch);
  }
}
/* Dual vertical 8-tap loop filter: transposes a 16x8 block so the vertical
 * edge becomes horizontal, applies filter4 everywhere and filter8 where the
 * flat mask is set, then transposes the modified columns back to memory. */
void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
                                 const uint8_t *b_limit0,
                                 const uint8_t *limit0,
                                 const uint8_t *thresh0,
                                 const uint8_t *b_limit1,
                                 const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *temp_src;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

  /* 16 rows of 8 pixels around the vertical edge; p/q registers double as
     row scratch before the transpose */
  temp_src = src - 4;
  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
  temp_src += (8 * pitch);
  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);

  /* transpose 16x8 matrix into 8x16 */
  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
                      q3);

  /* splat both limit sets; low dword = set 0, high dword = set 1 */
  thresh = (v16u8)__msa_fill_b(*thresh0);
  vec0 = (v8i16)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  vec0 = (v8i16)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  vec0 = (v8i16)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                     q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no flat lanes: transpose the 4 filtered vectors back and store
       4-pixel-wide columns */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src -= 2;
    ST4x8_UB(vec2, vec3, src, pitch);
    src += 8 * pitch;
    ST4x8_UB(vec4, vec5, src, pitch);
  } else {
    /* widen right (low) halves to 16 bit and run filter8 */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);

    /* filter8 */
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per lane choose filter8 where flat, else the
       filter4 output (p2/q2 keep their original value in non-flat lanes) */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* transpose the 6 modified vectors back into 6-pixel-wide columns */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    /* each store pair writes 4 + 2 pixels per row, 4 rows at a time */
    src -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec2, 4, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec5, 4, src + 4, pitch);
  }
}
/* Single vertical 8-tap loop filter over 8 rows: transposes an 8x8 block,
 * applies filter4 everywhere and filter8 where the flat mask is set (only
 * the low 8 lanes are meaningful here), then transposes back to memory. */
void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                            const uint8_t *b_limit_ptr,
                            const uint8_t *limit_ptr,
                            const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p1_out, p0_out, q0_out, q1_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16u8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4;

  /* load vector elements */
  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* in-place 8x8 transpose so the vertical edge becomes horizontal */
  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1,
                     q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                     q1_out);

  /* clear the unused high doubleword of flat so the bz test only sees the
     8 valid lanes */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* no flat lanes: store the 4 filter4 pixels p1..q1 as 4-wide columns */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);

    src -= 2;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    src += 4 * pitch;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
  } else {
    /* widen to 16 bit and run filter8 on the low halves */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per lane select filter8 where flat, else the
       filter4 output (p2/q2 keep the original pixel in non-flat lanes) */
    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* transpose the 6 pixels p2..q2 back and store 4 + 2 per row */
    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src -= 3;
    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 0, src + 4, pitch);
    src += (4 * pitch);
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
    ST2x4_UB(vec4, 4, src + 4, pitch);
  }
}
/* Single horizontal 8-tap loop filter over 8 pixels: applies filter4 and,
 * where the flat mask is set, overrides with the filter8 result. Only the
 * low 8 lanes of each vector are meaningful; stores go out as 64-bit
 * scalar copies. */
void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                              const uint8_t *b_limit_ptr,
                              const uint8_t *limit_ptr,
                              const uint8_t *thresh_ptr) {
  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
  v16u8 mask, hev, flat, thresh, b_limit, limit;
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
  v16i8 zero = { 0 };

  /* load vector elements: 8 rows straddling the edge */
  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                     q1_out);

  /* zero the unused high doubleword of flat before the bz test */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* no flat lanes: store the four filter4 rows as 8-byte scalars */
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
  } else {
    /* widen to 16 bit and run filter8 */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

    /* store pixel values: per lane select filter8 where flat, else the
       filter4 output (p2/q2 keep the original pixel in non-flat lanes) */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

    /* extract the low 8 bytes of each result and store six rows */
    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

    src -= 3 * pitch;
    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
    src += (4 * pitch);
    SD(q1_d, src);
    src += pitch;
    SD(q2_d, src);
  }
}
/* 8x4 double-precision solve kernel (RN variant): loads the 8x4 block of C,
 * then subtracts the accumulated A*B partial products over bk iterations,
 * unrolled two at a time with software prefetch of the A panel.
 *
 * NOTE(review): this function appears truncated in this file — the
 * triangular-solve/store phase that should follow the update loop (and the
 * function's closing brace) is missing. The declared-but-unused src_b*
 * variables below presumably belonged to that missing phase. Recover the
 * full body from the upstream source before building. */
void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) {
  v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
  v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7;
  v2f64 src_b10, src_b11, src_b15;
  FLOAT *c_nxt1line = c + ldc;
  FLOAT *c_nxt2line = c + 2 * ldc;
  FLOAT *c_nxt3line = c + 3 * ldc;

  /* load the 8x4 tile of C: two doubles per vector, four vectors per row */
  LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
  LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
  LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
  LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);

  if (bk) {
    BLASLONG i, pref_offset;
    FLOAT *pa0_pref;
    v2f64 src_a0, src_a1, src_a2, src_a3, src_b;

    /* align the prefetch pointer to the next L1 cache-line boundary */
    pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
    if (pref_offset) {
      pref_offset = L1_DATA_LINESIZE - pref_offset;
      pref_offset = pref_offset / sizeof(FLOAT);
    }
    pa0_pref = a + pref_offset;

    /* main update loop, unrolled x2: C -= A * B for each k */
    for (i = (bk >> 1); i--;) {
      PREF_OFFSET(pa0_pref, 128);
      PREF_OFFSET(pa0_pref, 160);
      PREF_OFFSET(pa0_pref, 192);
      PREF_OFFSET(pa0_pref, 224);

      /* first k: broadcast each of the 4 B values and fold into C */
      LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
      LD_DP2_INC(b, 2, src_b0, src_b1);

      src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
      src_c0 -= src_a0 * src_b;
      src_c1 -= src_a1 * src_b;
      src_c2 -= src_a2 * src_b;
      src_c3 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
      src_c4 -= src_a0 * src_b;
      src_c5 -= src_a1 * src_b;
      src_c6 -= src_a2 * src_b;
      src_c7 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
      src_c8 -= src_a0 * src_b;
      src_c9 -= src_a1 * src_b;
      src_c10 -= src_a2 * src_b;
      src_c11 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
      src_c12 -= src_a0 * src_b;
      src_c13 -= src_a1 * src_b;
      src_c14 -= src_a2 * src_b;
      src_c15 -= src_a3 * src_b;

      /* second k of the unrolled pair */
      LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
      LD_DP2_INC(b, 2, src_b0, src_b1);

      src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
      src_c0 -= src_a0 * src_b;
      src_c1 -= src_a1 * src_b;
      src_c2 -= src_a2 * src_b;
      src_c3 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
      src_c4 -= src_a0 * src_b;
      src_c5 -= src_a1 * src_b;
      src_c6 -= src_a2 * src_b;
      src_c7 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
      src_c8 -= src_a0 * src_b;
      src_c9 -= src_a1 * src_b;
      src_c10 -= src_a2 * src_b;
      src_c11 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
      src_c12 -= src_a0 * src_b;
      src_c13 -= src_a1 * src_b;
      src_c14 -= src_a2 * src_b;
      src_c15 -= src_a3 * src_b;

      pa0_pref += 16;
    }

    /* odd remainder iteration */
    if (bk & 1) {
      LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
      LD_DP2_INC(b, 2, src_b0, src_b1);

      src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
      src_c0 -= src_a0 * src_b;
      src_c1 -= src_a1 * src_b;
      src_c2 -= src_a2 * src_b;
      src_c3 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
      src_c4 -= src_a0 * src_b;
      src_c5 -= src_a1 * src_b;
      src_c6 -= src_a2 * src_b;
      src_c7 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
      src_c8 -= src_a0 * src_b;
      src_c9 -= src_a1 * src_b;
      src_c10 -= src_a2 * src_b;
      src_c11 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
      src_c12 -= src_a0 * src_b;
      src_c13 -= src_a1 * src_b;
      src_c14 -= src_a2 * src_b;
      src_c15 -= src_a3 * src_b;
    }
  }
/* 4x2 double-precision solve kernel (LN variant): subtracts accumulated
 * A*B partial products from the 4x2 tile of C, then solves against the
 * 4x4 triangular block of A (accessed at negative offsets — the panel
 * pointer sits past the block), writing the solution to both b and c.
 * NOTE(review): offsets a+0/a+4,5/a+8,9,10/a+12..15 trace out one
 * triangle of a packed 4x4 block — presumably lower-triangular per the
 * "_ln" suffix; confirm against the packing routine. */
static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) {
  v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
  v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13;
  v2f64 src_a14, src_a15;

  /* load the 4x2 tile of C (two columns, ldc apart) */
  LD_DP2(c, 2, src_c0, src_c1);
  LD_DP2(c + ldc, 2, src_c2, src_c3);

  if (bk > 0) {
    BLASLONG i;
    FLOAT *aa = a, *bb = b;
    /* note: this inner src_a0 shadows the outer declaration */
    v2f64 src_a0, src_a1, src_b, src_b0;

    /* GEMM-style update: C -= A * B over bk iterations */
    for (i = bk; i--;) {
      LD_DP2(aa, 2, src_a0, src_a1);

      src_b0 = LD_DP(bb);
      /* broadcast low element of B, then high element */
      src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
      src_c0 -= src_a0 * src_b;
      src_c1 -= src_a1 * src_b;

      src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
      src_c2 -= src_a0 * src_b;
      src_c3 -= src_a1 * src_b;

      aa += 4;
      bb += 2;
    }
  }

  /* step back to the triangular block of A and the output slot in b */
  a -= 16;
  b -= 8;

  /* transpose the 4x2 tile so each res_c holds one row across both columns */
  ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
  ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);

  /* splat the triangular coefficients of A (diagonal entries are assumed
     pre-inverted, hence multiplies rather than divides) */
  src_a14 = LD_DP(a + 14);
  src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
  src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0);
  src_a12 = LD_DP(a + 12);
  src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1);
  src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0);
  src_a9 = LD_DP(a + 9);
  src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
  src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
  src_a8 = __msa_cast_to_vector_double(*(a + 8));
  src_a0 = __msa_cast_to_vector_double(*(a + 0));
  src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
  src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
  src_a4 = LD_DP(a + 4);
  src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
  src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0);

  /* back-substitution from row 3 down to row 0 */
  res_c3 *= src_a15;
  res_c2 -= res_c3 * src_a14;
  res_c2 *= src_a10;
  res_c1 -= res_c3 * src_a13;
  res_c1 -= res_c2 * src_a9;
  res_c1 *= src_a5;
  res_c0 -= res_c3 * src_a12;
  res_c0 -= res_c2 * src_a8;
  res_c0 -= res_c1 * src_a4;
  res_c0 *= src_a0;

  /* write the solved rows back into the packed b panel */
  ST_DP(res_c3, b + 6);
  ST_DP(res_c2, b + 4);
  ST_DP(res_c1, b + 2);
  ST_DP(res_c0, b + 0);

  /* transpose back to column layout and store into C */
  ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
  ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);

  ST_DP2(src_c0, src_c1, c, 2);
  ST_DP2(src_c2, src_c3, c + ldc, 2);
}
/* 8x2 double-precision solve kernel (LN variant): subtracts accumulated
 * A*B partial products from the 8x2 tile of C (with one-iteration software
 * pipelining of the A/B loads), then back-substitutes against the 8x8
 * triangular block of A, which sits at negative offsets behind the current
 * panel pointers. Results go to both the packed b panel and C.
 * NOTE(review): the negative-offset access pattern (a-64 .. a-2) assumes
 * the caller has advanced a/b past this block — verify against the
 * surrounding trsm driver loop. */
static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) {
  v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
  v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
  v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
  v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
  v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
  v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
  v2f64 src_a61, src_a62, src_a63;

  /* load the 8x2 tile of C (two columns, ldc apart) */
  LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
  LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);

  if (bk > 0) {
    BLASLONG i;
    FLOAT *pba = a, *pbb = b;
    v2f64 src_b, src_b0, src_b1;

    /* software pipeline: preload the first A column and B pair */
    LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
    src_b0 = LD_DP(pbb);

    for (i = bk - 1; i--;) {
      pba += 8;
      pbb += 2;

      /* load the next iteration's data while consuming the current one */
      LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
      src_b1 = LD_DP(pbb);

      /* C -= A * b, column 0 (low B element) then column 1 (high) */
      src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
      src_c0 -= src_a0 * src_b;
      src_c1 -= src_a1 * src_b;
      src_c2 -= src_a2 * src_b;
      src_c3 -= src_a3 * src_b;

      src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
      src_c4 -= src_a0 * src_b;
      src_c5 -= src_a1 * src_b;
      src_c6 -= src_a2 * src_b;
      src_c7 -= src_a3 * src_b;

      /* rotate the pipeline registers */
      src_a0 = src_a8;
      src_a1 = src_a9;
      src_a2 = src_a16;
      src_a3 = src_a17;
      src_b0 = src_b1;
    }

    /* drain the last pipelined iteration */
    src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
    src_c0 -= src_a0 * src_b;
    src_c1 -= src_a1 * src_b;
    src_c2 -= src_a2 * src_b;
    src_c3 -= src_a3 * src_b;

    src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
    src_c4 -= src_a0 * src_b;
    src_c5 -= src_a1 * src_b;
    src_c6 -= src_a2 * src_b;
    src_c7 -= src_a3 * src_b;
  }

  /* transpose the 8x2 tile so each res_c holds one row across both columns */
  ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
  ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
  ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
  ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);

  /* row 7 of the triangular block (a-8 .. a-1), then eliminate it from
     rows 0-6 */
  src_a56 = LD_DP(a - 8);
  src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
  src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
  src_a58 = LD_DP(a - 6);
  src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
  src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);
  src_a60 = LD_DP(a - 4);
  src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
  src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
  src_a62 = LD_DP(a - 2);
  src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
  src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);

  res_c7 *= src_a63;
  res_c6 -= res_c7 * src_a62;
  res_c5 -= res_c7 * src_a61;
  res_c4 -= res_c7 * src_a60;
  res_c3 -= res_c7 * src_a59;
  res_c2 -= res_c7 * src_a58;
  res_c1 -= res_c7 * src_a57;
  res_c0 -= res_c7 * src_a56;

  /* rows 6 and 5 coefficients */
  src_a48 = LD_DP(a - 16);
  src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
  src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
  src_a50 = LD_DP(a - 14);
  src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
  src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
  src_a52 = LD_DP(a - 12);
  src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
  src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
  src_a54 = __msa_cast_to_vector_double(*(a - 10));
  src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);

  src_a40 = LD_DP(a - 24);
  src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
  src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);
  src_a42 = LD_DP(a - 22);
  src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
  src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
  src_a44 = LD_DP(a - 20);
  src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
  src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);

  /* back-substitute rows 6 and 5 */
  res_c6 *= src_a54;
  res_c5 -= res_c6 * src_a53;
  res_c4 -= res_c6 * src_a52;
  res_c3 -= res_c6 * src_a51;
  res_c2 -= res_c6 * src_a50;
  res_c1 -= res_c6 * src_a49;
  res_c0 -= res_c6 * src_a48;

  res_c5 *= src_a45;
  res_c4 -= res_c5 * src_a44;
  res_c3 -= res_c5 * src_a43;
  res_c2 -= res_c5 * src_a42;
  res_c1 -= res_c5 * src_a41;
  res_c0 -= res_c5 * src_a40;

  /* store solved rows 7..5 into the packed b panel */
  ST_DP(res_c7, b - 2);
  ST_DP(res_c6, b - 4);
  ST_DP(res_c5, b - 6);

  /* row 4 coefficients and elimination */
  src_a32 = LD_DP(a - 32);
  src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
  src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
  src_a34 = LD_DP(a - 30);
  src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
  src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
  src_a36 = __msa_cast_to_vector_double(*(a - 28));
  src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);

  res_c4 *= src_a36;
  res_c3 -= res_c4 * src_a35;
  res_c2 -= res_c4 * src_a34;
  res_c1 -= res_c4 * src_a33;
  res_c0 -= res_c4 * src_a32;

  /* rows 3..0 coefficients */
  src_a24 = LD_DP(a - 40);
  src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
  src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
  src_a26 = LD_DP(a - 38);
  src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
  src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
  src_a16 = LD_DP(a - 48);
  src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
  src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
  src_a18 = __msa_cast_to_vector_double(*(a - 46));
  src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
  src_a0 = __msa_cast_to_vector_double(*(a - 64));
  src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
  src_a8 = LD_DP(a - 56);
  src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
  src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);

  /* finish the back-substitution for rows 3..0 */
  res_c3 *= src_a27;
  res_c2 -= res_c3 * src_a26;
  res_c1 -= res_c3 * src_a25;
  res_c0 -= res_c3 * src_a24;

  res_c2 *= src_a18;
  res_c1 -= res_c2 * src_a17;
  res_c0 -= res_c2 * src_a16;

  res_c1 *= src_a9;
  res_c0 -= res_c1 * src_a8;

  res_c0 *= src_a0;

  /* store solved rows 4..0 into the packed b panel */
  ST_DP(res_c4, b - 8);
  ST_DP(res_c3, b - 10);
  ST_DP(res_c2, b - 12);
  ST_DP(res_c1, b - 14);
  ST_DP(res_c0, b - 16);

  /* transpose back to column layout and store into C */
  ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
  ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
  ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
  ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);

  ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
  ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
}
/* Solve an 8x4 block for the lower-triangular "LN" variant of TRSM, MSA-vectorized.
 *
 * a   : pointer just past an 8x8 triangular diagonal block of A, stored as 64
 *       FLOATs; the function rewinds with `a -= 64` and indexes a[0..63]
 *       row-major (row i at a + 8*i). NOTE(review): the diagonal elements
 *       (a[0], a[9*i]...) are used as multiplicands (`res *= src_aNN`), so they
 *       are presumably pre-inverted reciprocals — confirm against the packing code.
 * b   : pointer just past the corresponding 8x4 packed B block (32 FLOATs);
 *       rewound with `b -= 32`. Solved results are written back here.
 * c   : 8x4 tile of C with leading dimension ldc; both read (initial values)
 *       and written (solved values).
 * ldc : leading dimension (column stride) of C.
 * bk  : number of rank-1 GEMM updates to subtract from C before the solve
 *       (the trailing-matrix contribution); may be 0.
 *
 * Structure:
 *   1. Load the 8x4 C tile (2 doubles per v2f64, 4 vectors per row line).
 *   2. If bk > 0: software-pipelined loop subtracting A*B panels from C.
 *      Each iteration consumes 8 FLOATs of a (4 vectors) and 4 FLOATs of b;
 *      ilvr_d/ilvl_d broadcast the low/high half of each b vector.
 *   3. Back-substitution from row 7 down to row 0. res_c0..7 hold columns
 *      0/1 of the transposed tile, res_c8..15 columns 2/3 (via ILVRL_D2_DP).
 *      Scalar diagonal entries are splatted from memory with
 *      __msa_cast_to_vector_double + splati_d; off-diagonal row entries are
 *      loaded pairwise with LD_DP and split into two splatted vectors.
 *      After each pair of rows is finalized it is stored to b (packed form)
 *      and, re-interleaved, to the four C row pointers.
 *
 * The statement order interleaves loads, multiplies and stores deliberately
 * (manual scheduling for the MSA pipeline); do not reorder. */
static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
    v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
    v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
    v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
    v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
    v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17;
    v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33;
    v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43;
    v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52;
    v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60;
    v2f64 src_a61, src_a62, src_a63;
    /* Row pointers for the four columns of the C tile. */
    FLOAT *c_nxt1line = c + ldc;
    FLOAT *c_nxt2line = c + 2 * ldc;
    FLOAT *c_nxt3line = c + 3 * ldc;

    /* Load the full 8x4 C tile: src_c0..3 = column 0 (8 doubles), etc. */
    LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
    LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
    LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
    LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);

    if (bk > 0)
    {
        BLASLONG i;
        FLOAT *pba = a, *pbb = b;
        v2f64 src_b, src_b0, src_b1, src_b2, src_b3;

        /* Prime the software pipeline with the first A panel / B quad. */
        LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
        LD_DP2(pbb, 2, src_b0, src_b1);

        for (i = (bk - 1); i--;)
        {
            pba += 8;
            pbb += 4;

            /* Prefetch next iteration's operands while using the current ones. */
            LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
            LD_DP2(pbb, 2, src_b2, src_b3);

            /* Broadcast b[0] (low lane of src_b0) and subtract its rank-1 update. */
            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
            src_c0 -= src_a0 * src_b;
            src_c1 -= src_a1 * src_b;
            src_c2 -= src_a2 * src_b;
            src_c3 -= src_a3 * src_b;

            /* b[1] (high lane of src_b0) updates column 1. */
            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
            src_c4 -= src_a0 * src_b;
            src_c5 -= src_a1 * src_b;
            src_c6 -= src_a2 * src_b;
            src_c7 -= src_a3 * src_b;

            /* b[2] updates column 2. */
            src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
            src_c8 -= src_a0 * src_b;
            src_c9 -= src_a1 * src_b;
            src_c10 -= src_a2 * src_b;
            src_c11 -= src_a3 * src_b;

            /* b[3] updates column 3. */
            src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
            src_c12 -= src_a0 * src_b;
            src_c13 -= src_a1 * src_b;
            src_c14 -= src_a2 * src_b;
            src_c15 -= src_a3 * src_b;

            /* Rotate pipeline registers for the next iteration. */
            src_a0 = src_a8;
            src_a1 = src_a9;
            src_a2 = src_a16;
            src_a3 = src_a17;
            src_b0 = src_b2;
            src_b1 = src_b3;
        }

        /* Epilogue: last iteration, no further prefetch needed. */
        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
        src_c0 -= src_a0 * src_b;
        src_c1 -= src_a1 * src_b;
        src_c2 -= src_a2 * src_b;
        src_c3 -= src_a3 * src_b;

        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
        src_c4 -= src_a0 * src_b;
        src_c5 -= src_a1 * src_b;
        src_c6 -= src_a2 * src_b;
        src_c7 -= src_a3 * src_b;

        src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
        src_c8 -= src_a0 * src_b;
        src_c9 -= src_a1 * src_b;
        src_c10 -= src_a2 * src_b;
        src_c11 -= src_a3 * src_b;

        src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
        src_c12 -= src_a0 * src_b;
        src_c13 -= src_a1 * src_b;
        src_c14 -= src_a2 * src_b;
        src_c15 -= src_a3 * src_b;
    }

    /* Rewind to the start of the 8x8 A diagonal block and the 8x4 B block. */
    a -= 64;
    b -= 32;

    /* Transpose the tile: res_c{2i,2i+1} hold rows (2i,2i+1) across columns
     * 0/1; res_c{8+2i,8+2i+1} the same rows across columns 2/3. */
    ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
    ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
    ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
    ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
    ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
    ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
    ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
    ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);

    /* Splat the A entries needed for rows 7..4. src_aNN names the linear
     * index into the 8x8 block (row = NN / 8, col = NN % 8); diagonal entries
     * (a[63], a[54], a[45], a[36]) come via scalar load + splat. */
    src_a54 = __msa_cast_to_vector_double(*(a + 54));
    src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
    src_a62 = LD_DP(a + 62);
    src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
    src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
    src_a60 = LD_DP(a + 60);
    src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1);
    src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0);
    src_a52 = LD_DP(a + 52);
    src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
    src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
    src_a44 = LD_DP(a + 44);
    src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
    src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
    src_a36 = __msa_cast_to_vector_double(*(a + 36));
    src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);

    /* Back-substitution, rows 7 and 6 (columns 0/1 then 2/3).
     * NOTE(review): the diagonal multiply (not divide) implies pre-inverted
     * diagonal — see header comment. */
    res_c7 *= src_a63;
    res_c6 -= res_c7 * src_a62;
    res_c6 *= src_a54;
    res_c15 *= src_a63;
    res_c14 -= res_c15 * src_a62;
    res_c14 *= src_a54;

    /* Store solved rows 6/7 back to packed B and, re-interleaved, to C. */
    ST_DP(res_c7, b + 28);
    ST_DP(res_c6, b + 24);
    ST_DP(res_c15, b + 30);
    ST_DP(res_c14, b + 26);
    ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
    ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
    ST_DP(src_c3, c + 6);
    ST_DP(src_c7, c_nxt1line + 6);
    ST_DP(src_c11, c_nxt2line + 6);
    ST_DP(src_c15, c_nxt3line + 6);

    /* Rows 5 and 4: eliminate contributions of rows 7/6 (and 5 for row 4). */
    res_c5 -= res_c7 * src_a61;
    res_c5 -= res_c6 * src_a53;
    res_c5 *= src_a45;
    res_c4 -= res_c7 * src_a60;
    res_c4 -= res_c6 * src_a52;
    res_c4 -= res_c5 * src_a44;
    res_c4 *= src_a36;
    res_c13 -= res_c15 * src_a61;
    res_c13 -= res_c14 * src_a53;
    res_c13 *= src_a45;
    res_c12 -= res_c15 * src_a60;
    res_c12 -= res_c14 * src_a52;
    res_c12 -= res_c13 * src_a44;
    res_c12 *= src_a36;

    /* A entries of row 7 needed for rows 3..0. */
    src_a56 = LD_DP(a + 56);
    src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
    src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0);
    src_a58 = LD_DP(a + 58);
    src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1);
    src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0);

    /* Store solved rows 4/5. */
    ST_DP(res_c4, b + 16);
    ST_DP(res_c5, b + 20);
    ST_DP(res_c12, b + 18);
    ST_DP(res_c13, b + 22);
    ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
    ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
    ST_DP(src_c2, c + 4);
    ST_DP(src_c6, c_nxt1line + 4);
    ST_DP(src_c10, c_nxt2line + 4);
    ST_DP(src_c14, c_nxt3line + 4);

    /* A entries from rows 6..2 used by rows 3 and 2 (incl. diagonals a[27], a[18]). */
    src_a50 = LD_DP(a + 50);
    src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1);
    src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0);
    src_a42 = LD_DP(a + 42);
    src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1);
    src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0);
    src_a34 = LD_DP(a + 34);
    src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
    src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
    src_a26 = LD_DP(a + 26);
    src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
    src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
    src_a18 = __msa_cast_to_vector_double(*(a + 18));
    src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);

    /* Rows 3 and 2: eliminate rows 7..4 (and 3 for row 2), then scale. */
    res_c3 -= res_c7 * src_a59;
    res_c2 -= res_c7 * src_a58;
    res_c1 -= res_c7 * src_a57;
    res_c0 -= res_c7 * src_a56;
    res_c11 -= res_c15 * src_a59;
    res_c10 -= res_c15 * src_a58;
    res_c9 -= res_c15 * src_a57;
    res_c8 -= res_c15 * src_a56;
    res_c3 -= res_c6 * src_a51;
    res_c3 -= res_c5 * src_a43;
    res_c3 -= res_c4 * src_a35;
    res_c3 *= src_a27;
    res_c2 -= res_c6 * src_a50;
    res_c2 -= res_c5 * src_a42;
    res_c2 -= res_c4 * src_a34;
    res_c2 -= res_c3 * src_a26;
    res_c2 *= src_a18;
    res_c11 -= res_c14 * src_a51;
    res_c11 -= res_c13 * src_a43;
    res_c11 -= res_c12 * src_a35;
    res_c11 *= src_a27;
    res_c10 -= res_c14 * src_a50;
    res_c10 -= res_c13 * src_a42;
    res_c10 -= res_c12 * src_a34;
    res_c10 -= res_c11 * src_a26;
    res_c10 *= src_a18;

    /* Row-6 and row-5 entries used by rows 1 and 0. */
    src_a48 = LD_DP(a + 48);
    src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1);
    src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0);
    src_a40 = LD_DP(a + 40);
    src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
    src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0);

    /* Store solved rows 2/3 to packed B. */
    ST_DP(res_c2, b + 8);
    ST_DP(res_c3, b + 12);
    ST_DP(res_c10, b + 10);
    ST_DP(res_c11, b + 14);

    src_a32 = LD_DP(a + 32);
    src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1);
    src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0);
    src_a24 = LD_DP(a + 24);
    src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
    src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);

    /* Store solved rows 2/3 back to C. */
    ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
    ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
    ST_DP(src_c1, c + 2);
    ST_DP(src_c5, c_nxt1line + 2);
    ST_DP(src_c9, c_nxt2line + 2);
    ST_DP(src_c13, c_nxt3line + 2);

    /* Rows 1 and 0: eliminate rows 6..3. */
    res_c1 -= res_c6 * src_a49;
    res_c1 -= res_c5 * src_a41;
    res_c1 -= res_c4 * src_a33;
    res_c1 -= res_c3 * src_a25;
    res_c0 -= res_c6 * src_a48;
    res_c0 -= res_c5 * src_a40;
    res_c0 -= res_c4 * src_a32;
    res_c0 -= res_c3 * src_a24;
    res_c9 -= res_c14 * src_a49;
    res_c9 -= res_c13 * src_a41;
    res_c9 -= res_c12 * src_a33;
    res_c9 -= res_c11 * src_a25;
    res_c8 -= res_c14 * src_a48;
    res_c8 -= res_c13 * src_a40;
    res_c8 -= res_c12 * src_a32;
    res_c8 -= res_c11 * src_a24;

    /* Rows 2..0 entries, including diagonals a[18] (already loaded), a[9], a[0]. */
    src_a16 = LD_DP(a + 16);
    src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
    src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
    src_a8 = LD_DP(a + 8);
    src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
    src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
    src_a0 = __msa_cast_to_vector_double(*(a + 0));
    src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);

    /* Finish rows 1 and 0. */
    res_c1 -= res_c2 * src_a17;
    res_c1 *= src_a9;
    res_c9 -= res_c10 * src_a17;
    res_c9 *= src_a9;
    res_c0 -= res_c2 * src_a16;
    res_c0 -= res_c1 * src_a8;
    res_c0 *= src_a0;
    res_c8 -= res_c10 * src_a16;
    res_c8 -= res_c9 * src_a8;
    res_c8 *= src_a0;

    /* Store solved rows 0/1 to packed B and to C. */
    ST_DP(res_c0, b + 0);
    ST_DP(res_c8, b + 2);
    ST_DP(res_c1, b + 4);
    ST_DP(res_c9, b + 6);
    ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
    ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);
    ST_DP(src_c0, c);
    ST_DP(src_c4, c_nxt1line);
    ST_DP(src_c8, c_nxt2line);
    ST_DP(src_c12, c_nxt3line);
}
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); XORI_B2_128_SB(src8776, src10998); out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, filt1, filt2, filt3); out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, filt1, filt2, filt3); SRARI_H2_SH(out10, out32, FILTER_BITS); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); out = __msa_aver_u_b(out, dst0); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); src2110 = src6554; src4332 = src8776; src6554 = src10998; src6 = src10; } } static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter,