/* 8x8 DC intra prediction: fills the 8x8 block at dst with the rounded
 * average of the 8 top neighbours (src_top) and 8 left neighbours (src_left).
 *
 * src_top    - pointer to the 8 reconstructed pixels above the block
 * src_left   - pointer to the 8 reconstructed pixels left of the block
 *              (assumed gathered contiguously by the caller - TODO confirm)
 * dst        - destination block, dst_stride bytes per row
 */
static void intra_predict_dc_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t dst_stride)
{
    uint64_t val0, val1;
    v16i8 store;
    v16u8 src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;

    /* Pack the 8 top bytes and 8 left bytes into one 16-byte vector. */
    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    /* Pairwise horizontal-add tree: 16 bytes -> 8 halfwords -> 4 words ->
     * 2 doublewords, then fold the two 64-bit partial sums together. */
    sum_h = __msa_hadd_u_h(src, src);
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    /* Divide the 16-pixel sum by 16 with rounding (shift-right-arithmetic
     * rounded immediate by 4). */
    sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
    /* Broadcast the DC byte across a register and store 8 rows of it. */
    store = __msa_splati_b((v16i8)sum_w, 0);
    val0 = __msa_copy_u_d((v2i64)store, 0);
    SD4(val0, val0, val0, val0, dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(val0, val0, val0, val0, dst, dst_stride);
}
/* 8x8 vertical intra prediction: copies the 8 top-neighbour pixels at src
 * into every one of the 8 rows of the dst block (dst_stride bytes apart). */
static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst, int32_t dst_stride)
{
    const uint64_t row = LD(src);
    int32_t cnt;

    /* Two batches of four 64-bit row stores cover all 8 rows. */
    for (cnt = 2; cnt--;) {
        SD4(row, row, row, row, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { uint64_t out; const v16i8 store = __msa_ldi_b(128); out = __msa_copy_u_d((v2i64)store, 0); SD4(out, out, out, out, dst, dst_stride); dst += (4 * dst_stride); SD4(out, out, out, out, dst, dst_stride); }
/* Copies an 8-pixel-wide column of `height` rows from src to dst.
 * Rows are moved as 64-bit scalars extracted from vector loads.
 * Unrolled by 12, 8 or (presumably) 4 rows depending on height divisibility.
 *
 * NOTE(review): this chunk is TRUNCATED - it ends inside the
 * `height % 4 == 0` branch; the remainder of the function is not visible
 * here. Code below is left byte-identical.
 */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        /* 12 rows per iteration: 8 vector loads + 4 vector loads. */
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            /* Only the low 8 bytes of each 16-byte load are meaningful. */
            out0 = __msa_copy_u_d((v2i64)src0, 0);
            out1 = __msa_copy_u_d((v2i64)src1, 0);
            out2 = __msa_copy_u_d((v2i64)src2, 0);
            out3 = __msa_copy_u_d((v2i64)src3, 0);
            out4 = __msa_copy_u_d((v2i64)src4, 0);
            out5 = __msa_copy_u_d((v2i64)src5, 0);
            out6 = __msa_copy_u_d((v2i64)src6, 0);
            out7 = __msa_copy_u_d((v2i64)src7, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64)src0, 0);
            out1 = __msa_copy_u_d((v2i64)src1, 0);
            out2 = __msa_copy_u_d((v2i64)src2, 0);
            out3 = __msa_copy_u_d((v2i64)src3, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        /* 8 rows per iteration. */
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            out0 = __msa_copy_u_d((v2i64)src0, 0);
            out1 = __msa_copy_u_d((v2i64)src1, 0);
            out2 = __msa_copy_u_d((v2i64)src2, 0);
            out3 = __msa_copy_u_d((v2i64)src3, 0);
            out4 = __msa_copy_u_d((v2i64)src4, 0);
            out5 = __msa_copy_u_d((v2i64)src5, 0);
            out6 = __msa_copy_u_d((v2i64)src6, 0);
            out7 = __msa_copy_u_d((v2i64)src7, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 4) {
    /* NOTE(review): body of this branch (and the rest of the function)
     * is cut off at the chunk boundary. */
/* VP8 macroblock-edge horizontal loop filter for the two 8-pixel-wide
 * chroma planes, processed together: the U rows occupy the low halves of
 * the working vectors and the V rows the high halves, so one 16-lane
 * filter pass covers both planes.
 *
 * src_u/src_v - pointers to the row just below the edge in each plane
 * pitch       - bytes between successive rows
 * b_limit_in/limit_in/thresh_in - scalar filter limits, broadcast to
 *               every lane
 */
static void mbloop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, const uint8_t limit_in, const uint8_t thresh_in)
{
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    /* Broadcast the scalar limits across all 16 byte lanes. */
    b_limit = (v16u8)__msa_fill_b(b_limit_in);
    limit = (v16u8)__msa_fill_b(limit_in);
    thresh = (v16u8)__msa_fill_b(thresh_in);
    /* Load 4 rows above and 4 rows below the edge for each plane. */
    temp_src = src_u - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
    /* Interleave: low 8 lanes = U pixels, high 8 lanes = V pixels. */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    /* Derive the filter mask and high-edge-variance flag, then apply the
     * 6-tap macroblock filter in place on p2..q2. */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
    /* Lane-0 doublewords hold the filtered U rows; write them back. */
    p2_d = __msa_copy_u_d((v2i64)p2, 0);
    p1_d = __msa_copy_u_d((v2i64)p1, 0);
    p0_d = __msa_copy_u_d((v2i64)p0, 0);
    q0_d = __msa_copy_u_d((v2i64)q0, 0);
    q1_d = __msa_copy_u_d((v2i64)q1, 0);
    q2_d = __msa_copy_u_d((v2i64)q2, 0);
    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);
    /* Lane-1 doublewords hold the filtered V rows. */
    p2_d = __msa_copy_u_d((v2i64)p2, 1);
    p1_d = __msa_copy_u_d((v2i64)p1, 1);
    p0_d = __msa_copy_u_d((v2i64)p0, 1);
    q0_d = __msa_copy_u_d((v2i64)q0, 1);
    q1_d = __msa_copy_u_d((v2i64)q1, 1);
    q2_d = __msa_copy_u_d((v2i64)q2, 1);
    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
}
/* Public VP8 vertical-edge (horizontal filtering direction) loop filter
 * for the 8-wide U and V chroma planes, filtered together in one pass:
 * U rows in the low vector lanes, V rows in the high lanes.
 *
 * src_u/src_v - pointers to the first row below the edge in each plane
 * pitch       - bytes between rows
 * b_limit_in/limit_in/thresh_in - scalar limits broadcast to all lanes
 */
void ff_vp8_v_loop_filter8uv_msa(uint8_t *src_u, uint8_t *src_v, ptrdiff_t pitch, int b_limit_in, int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    b_limit = (v16u8) __msa_fill_b(b_limit_in);
    limit = (v16u8) __msa_fill_b(limit_in);
    thresh = (v16u8) __msa_fill_b(thresh_in);

    /* Load 4 rows above and 4 below the edge for each chroma plane. */
    temp_src = src_u - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    temp_src = src_v - (pitch << 2);
    LD_UB8(temp_src, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);

    /* right 8 elements of p3 are u pixels and left 8 elements are v pixels */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);

    /* Build filter/hev masks, then run the VP8 macroblock filter which
     * updates p2..q2 in place. */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    /* Store filtered U rows (doubleword lane 0). */
    p2_d = __msa_copy_u_d((v2i64) p2, 0);
    p1_d = __msa_copy_u_d((v2i64) p1, 0);
    p0_d = __msa_copy_u_d((v2i64) p0, 0);
    q0_d = __msa_copy_u_d((v2i64) q0, 0);
    q1_d = __msa_copy_u_d((v2i64) q1, 0);
    q2_d = __msa_copy_u_d((v2i64) q2, 0);
    src_u -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_u, pitch);
    src_u += 4 * pitch;
    SD(q1_d, src_u);
    src_u += pitch;
    SD(q2_d, src_u);

    /* Store filtered V rows (doubleword lane 1). */
    p2_d = __msa_copy_u_d((v2i64) p2, 1);
    p1_d = __msa_copy_u_d((v2i64) p1, 1);
    p0_d = __msa_copy_u_d((v2i64) p0, 1);
    q0_d = __msa_copy_u_d((v2i64) q0, 1);
    q1_d = __msa_copy_u_d((v2i64) q1, 1);
    q2_d = __msa_copy_u_d((v2i64) q2, 1);
    src_v -= (pitch * 3);
    SD4(p2_d, p1_d, p0_d, q0_d, src_v, pitch);
    src_v += 4 * pitch;
    SD(q1_d, src_v);
    src_v += pitch;
    SD(q2_d, src_v);
}
/* VP9 horizontal-edge 4-tap loop filter on an 8-pixel-wide segment.
 * Filters the two rows above (p1, p0) and two rows below (q0, q1) the
 * edge at src, writing the four modified rows back.
 *
 * b_limit_ptr/limit_ptr/thresh_ptr - pointers to scalar filter limits
 * count - unused here (kept for a common function signature)
 */
void vp9_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, const uint8_t *thresh_ptr, int32_t count)
{
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

    (void)count;

    /* load vector elements: 4 rows above and 4 rows below the edge */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    /* Compute filter mask / high-edge-variance flag, then the 4-wide
     * filter producing the four output rows. */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat);
    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

    /* Only the low 8 bytes of each output vector are valid rows. */
    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}
/* 8x8 horizontal intra prediction: row i of dst is filled with the single
 * left-neighbour pixel src[i] (the 8 left neighbours are contiguous). */
static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, int32_t dst_stride)
{
    uint64_t row[8];
    int32_t i;

    /* Multiplying a byte by 0x0101010101010101 replicates it into all
     * eight bytes of a 64-bit row. */
    for (i = 0; i < 8; i++) {
        row[i] = src[i] * 0x0101010101010101ull;
    }

    SD4(row[0], row[1], row[2], row[3], dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(row[4], row[5], row[6], row[7], dst, dst_stride);
}
/* 8x8 horizontal intra prediction, strided-source variant: row i of dst
 * is filled with the left-neighbour pixel src[i * src_stride]. */
static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
{
    uint64_t row[8];
    int32_t i;

    /* 0x0101010101010101 * byte broadcasts that byte across 8 lanes. */
    for (i = 0; i < 8; i++) {
        row[i] = src[i * src_stride] * 0x0101010101010101ull;
    }

    SD4(row[0], row[1], row[2], row[3], dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(row[4], row[5], row[6], row[7], dst, dst_stride);
}
/* VP8 inner-edge horizontal loop filter (4-tap) for the 8-wide U and V
 * chroma planes, processed together: U rows in the low vector lanes,
 * V rows in the high lanes. Modifies two rows on each side of the edge.
 *
 * src_u/src_v - pointers to the first row below the edge in each plane
 * pitch       - bytes between rows
 */
static void loop_filter_horizontal_edge_uv_msa(uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, const uint8_t limit_in, const uint8_t thresh_in)
{
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    thresh = (v16u8)__msa_fill_b(thresh_in);
    limit = (v16u8)__msa_fill_b(limit_in);
    b_limit = (v16u8)__msa_fill_b(b_limit_in);

    /* Load 4 rows above and 4 below the edge; afterwards src_u/src_v are
     * left pointing one row past q0 so the stores below can walk upward
     * with a negative pitch. */
    src_u = src_u - (pitch << 2);
    LD_UB8(src_u, pitch, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
    src_u += (5 * pitch);
    src_v = src_v - (pitch << 2);
    LD_UB8(src_v, pitch, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
    src_v += (5 * pitch);

    /* right 8 element of p3 are u pixel and left 8 element of p3 are v pixel */
    ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);

    /* Build the masks, then apply the standard VP8 4-wide filter which
     * updates p1, p0, q0, q1 in place. */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    /* Store U rows (lane 0): rows written q1, q0, p0, p1 while stepping
     * upward by -pitch from the post-load src_u position. */
    p1_d = __msa_copy_u_d((v2i64)p1, 0);
    p0_d = __msa_copy_u_d((v2i64)p0, 0);
    q0_d = __msa_copy_u_d((v2i64)q0, 0);
    q1_d = __msa_copy_u_d((v2i64)q1, 0);
    SD4(q1_d, q0_d, p0_d, p1_d, src_u, (-pitch));

    /* Store V rows (lane 1) the same way. */
    p1_d = __msa_copy_u_d((v2i64)p1, 1);
    p0_d = __msa_copy_u_d((v2i64)p0, 1);
    q0_d = __msa_copy_u_d((v2i64)q0, 1);
    q1_d = __msa_copy_u_d((v2i64)q1, 1);
    SD4(q1_d, q0_d, p0_d, p1_d, src_v, (-pitch));
}
/* Forward 4x4 transform of the residual (src - ref), producing 16
 * int16 coefficients at out (row stride 8 bytes = 4 coefficients).
 * Both passes of the butterfly use the constant pair (5352, 2217) with
 * per-pass rounding offsets and shifts; the final correction adds 1 to
 * a second-pass term wherever the corresponding t1 element is non-zero.
 *
 * src/ref - 4x4 blocks with row stride BPS
 * out     - 16 coefficients, written as four 64-bit stores
 */
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out)
{
    uint64_t out0, out1, out2, out3;
    uint32_t in0, in1, in2, in3;
    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 t0, t1, t2, t3;
    v16u8 srcl0, srcl1, src0, src1;
    /* Shuffle patterns that gather columns / reorder lanes between the
     * horizontal and vertical passes. */
    const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
    const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
    const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
    const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
    const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
    const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

    /* Gather the 4x4 src and ref blocks into one vector register each. */
    LW4(src, BPS, in0, in1, in2, in3);
    INSERT_W4_UB(in0, in1, in2, in3, src0);
    LW4(ref, BPS, in0, in1, in2, in3);
    INSERT_W4_UB(in0, in1, in2, in3, src1);
    /* Interleave and horizontally subtract: t0/t1 hold src - ref. */
    ILVRL_B2_UB(src0, src1, srcl0, srcl1);
    HSUB_UB2_SH(srcl0, srcl1, t0, t1);

    /* --- first (horizontal) pass --- */
    VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
    ADDSUB2(t2, t3, t0, t1);
    t0 = SRLI_H(t0, 3);
    VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
    tmp0 = __msa_hadd_s_w(t3, t3);
    tmp2 = __msa_hsub_s_w(t3, t3);
    /* Rounding offsets 1812 / 937 before the >>9 of the odd outputs. */
    FILL_W2_SW(1812, 937, tmp1, tmp3);
    DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
    SRAI_W2_SW(tmp1, tmp3, 9);
    PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);

    /* --- second (vertical) pass --- */
    VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
    ADDSUB2(t2, t3, t0, t1);
    VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
    tmp0 = __msa_hadd_s_w(t3, t3);
    tmp2 = __msa_hsub_s_w(t3, t3);
    /* Even outputs: (+7) >> 4 rounding. */
    ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
    SRAI_W2_SW(tmp0, tmp2, 4);
    /* Odd outputs: offsets 12000 / 51000 before the >>16. */
    FILL_W2_SW(12000, 51000, tmp1, tmp3);
    DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
    SRAI_W2_SW(tmp1, tmp3, 16);

    /* Build a 0/1 correction vector: 1 where the low half of t1 is
     * non-zero (ceqi gives -1 on zero, nor inverts, and-mask with 1). */
    UNPCK_R_SH_SW(t1, tmp4);
    tmp5 = __msa_ceqi_w(tmp4, 0);
    tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
    tmp5 = __msa_fill_w(1);
    tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
    tmp1 += tmp5;

    /* Pack to int16 rows and store the 4x4 coefficient block. */
    PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
    out0 = __msa_copy_s_d((v2i64)t0, 0);
    out1 = __msa_copy_s_d((v2i64)t0, 1);
    out2 = __msa_copy_s_d((v2i64)t1, 0);
    out3 = __msa_copy_s_d((v2i64)t1, 1);
    SD4(out0, out1, out2, out3, out, 8);
}
/* 8x8 DC intra prediction from a single neighbour row: averages the 8
 * bytes at src (top-only or left-only neighbours, hence "tl" - which one
 * depends on the caller) with rounding, and fills the 8x8 block at dst
 * with that value. */
static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, int32_t dst_stride)
{
    uint64_t val0;
    v16i8 store;
    v16u8 data = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;

    /* Load the 8 neighbour bytes into the low doubleword. */
    val0 = LD(src);
    data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
    /* Pairwise-add tree to a single 8-pixel sum. */
    sum_h = __msa_hadd_u_h(data, data);
    sum_w = __msa_hadd_u_w(sum_h, sum_h);
    sum_d = __msa_hadd_u_d(sum_w, sum_w);
    /* Divide by 8 with rounding (arithmetic shift-right rounded by 3). */
    sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
    /* Broadcast the DC byte and store all 8 rows. */
    store = __msa_splati_b((v16i8)sum_w, 0);
    val0 = __msa_copy_u_d((v2i64)store, 0);
    SD4(val0, val0, val0, val0, dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(val0, val0, val0, val0, dst, dst_stride);
}
/* VP9 horizontal-edge 8-tap loop filter on an 8-pixel-wide segment.
 * Runs the 4-wide filter on p1..q1 first; if the "flat" test indicates a
 * smooth region, it instead applies the stronger 8-tap filter to p2..q2
 * per pixel via bit-select on the flat mask.
 *
 * b_limit_ptr/limit_ptr/thresh_ptr - pointers to scalar filter limits
 */
void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, const uint8_t *thresh_ptr)
{
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };

    /* load vector elements: 4 rows above and 4 rows below the edge */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    /* Masks, flatness classification, and the baseline 4-wide filter. */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

    /* Only the low 8 lanes carry pixels; zero the high half of flat so
     * the all-zero test below looks at the valid lanes only. */
    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
        /* No flat pixels: store just the 4-wide filter results. */
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
    } else {
        /* Widen all 8 rows to 16-bit and run the 8-tap filter. */
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

        /* store pixel values: per-lane select between the 4-wide result
         * (or the original row for p2/q2) and the 8-tap result, keyed on
         * the flat mask. */
        p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        /* Six modified rows: p2..q0 in one batch, then q1, q2. */
        src -= 3 * pitch;
        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
        src += (4 * pitch);
        SD(q1_d, src);
        src += pitch;
        SD(q2_d, src);
    }
}