void vp9_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit0_ptr, const uint8_t *limit0_ptr, const uint8_t *thresh0_ptr, const uint8_t *b_limit1_ptr, const uint8_t *limit1_ptr, const uint8_t *thresh1_ptr) { v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; /* load vector elements */ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); limit0 = (v16u8)__msa_fill_b(*limit0_ptr); limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, mask, flat); VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); }
static void loop_filter_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit0_ptr, const uint8_t *limit0_ptr, const uint8_t *thresh0_ptr, const uint8_t *b_limit1_ptr, const uint8_t *limit1_ptr, const uint8_t *thresh1_ptr) { v16u8 mask, hev, flat; v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); limit0 = (v16u8)__msa_fill_b(*limit0_ptr); limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, mask, flat); VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); }
static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride) {
  uint32_t cnt;
  v16u8 row0, row1, row2, row3;

  /* Horizontal intra prediction: each 16-byte output row is the first
     pixel of the corresponding source row replicated across all lanes.
     Handle four rows per iteration. */
  for (cnt = 0; cnt < 4; cnt++) {
    row0 = (v16u8)__msa_fill_b(src[0]);
    src += src_stride;
    row1 = (v16u8)__msa_fill_b(src[0]);
    src += src_stride;
    row2 = (v16u8)__msa_fill_b(src[0]);
    src += src_stride;
    row3 = (v16u8)__msa_fill_b(src[0]);
    src += src_stride;

    ST_UB4(row0, row1, row2, row3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst, const uint8_t* left) { if (left != NULL) { int j; for (j = 0; j < 16; j += 4) { const v16u8 L0 = (v16u8)__msa_fill_b(left[0]); const v16u8 L1 = (v16u8)__msa_fill_b(left[1]); const v16u8 L2 = (v16u8)__msa_fill_b(left[2]); const v16u8 L3 = (v16u8)__msa_fill_b(left[3]); ST_UB4(L0, L1, L2, L3, dst, BPS); dst += 4 * BPS; left += 4; } } else { const v16u8 out = (v16u8)__msa_fill_b(0x81); STORE16x16(out, dst); } }
void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
                                      int32_t e, int32_t i, int32_t h)
{
    v16u8 mask, hev, flat;
    v16u8 thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* e = edge limit, i = interior limit, h = high-edge-variance thresh */
    b_limit = (v16u8) __msa_fill_b(e);
    limit   = (v16u8) __msa_fill_b(i);
    thresh  = (v16u8) __msa_fill_b(h);

    /* four pixels on each side of the horizontal edge */
    LD_UB8(src - 4 * pitch, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* inner 4-tap filter, results written back into p1/p0/q0/q1 */
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    /* only the two rows each side of the edge are modified */
    ST_UB4(p1, p0, q0, q1, src - 2 * pitch, pitch);
}
static void mbloop_filter_horizontal_edge_y_msa(uint8_t *src, int32_t pitch,
                                                const uint8_t b_limit_in,
                                                const uint8_t limit_in,
                                                const uint8_t thresh_in) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 mask, hev, flat, thresh, limit, b_limit;

  /* Broadcast the scalar clamp values across all 16 lanes. */
  thresh = (v16u8)__msa_fill_b(thresh_in);
  limit = (v16u8)__msa_fill_b(limit_in);
  b_limit = (v16u8)__msa_fill_b(b_limit_in);

  /* Four luma rows above and four below the macroblock edge. */
  LD_UB8(src - 4 * pitch, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* Macroblock filter modifies six rows: p2..q2. */
  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

  ST_UB4(p2, p1, p0, q0, src - 3 * pitch, pitch);
  ST_UB2(q1, q2, src + pitch, pitch);
}
void ff_vp8_v_loop_filter16_msa(uint8_t *src, ptrdiff_t pitch, int b_limit_in,
                                int limit_in, int thresh_in)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 mask, hev, flat, thresh, limit, b_limit;
    uint8_t *ptr = src - 4 * pitch;

    thresh  = (v16u8) __msa_fill_b(thresh_in);
    limit   = (v16u8) __msa_fill_b(limit_in);
    b_limit = (v16u8) __msa_fill_b(b_limit_in);

    /* read p3..p0 and q0..q3 around the horizontal edge */
    LD_UB8(ptr, pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    /* macroblock filter touches six rows: p2..q2 */
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    /* write back the six filtered rows */
    ptr = src - 3 * pitch;
    ST_UB4(p2, p1, p0, q0, ptr, pitch);
    ptr += 4 * pitch;
    ST_UB2(q1, q2, ptr, pitch);
}
/* Convert one row of planar YCbCr samples to packed ABGR output.
 * Processes 16 pixels per SIMD pass, then an 8-pixel SIMD pass if needed,
 * and finishes any remaining (< 8) pixels with scalar arithmetic.
 * Output byte order per pixel is A, B, G, R (see the scalar tail). */
void yuv_abgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr,
                           JSAMPROW p_rgb, JDIMENSION out_width)
{
  int y, cb, cr;
  unsigned int col, num_cols_mul_16 = out_width >> 4;  /* full 16-px groups */
  unsigned int remaining_wd = out_width & 0xF;         /* leftover pixels   */
  v16i8 alpha = __msa_ldi_b(0xFF);       /* constant opaque alpha lane */
  v16i8 const_128 = __msa_ldi_b(128);    /* chroma bias to remove      */
  v16u8 out0, out1, out2, out3, input_y = {0};
  v16i8 input_cb, input_cr, out_rgb0, out_rgb1, out_ab0, out_ab1;
  v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1;
  v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0};
  v16i8 out_r0, out_g0, out_b0;

  /* --- main loop: 16 pixels per iteration --- */
  for (col = num_cols_mul_16; col--;) {
    input_y = LD_UB(p_in_y);
    input_cb = LD_SB(p_in_cb);
    input_cr = LD_SB(p_in_cr);
    p_in_y += 16;
    p_in_cb += 16;
    p_in_cr += 16;
    /* center chroma around zero before applying the fixed-point matrix */
    input_cb -= const_128;
    input_cr -= const_128;
    /* widen bytes to 16-bit halves (low/high 8 samples) */
    UNPCK_UB_SH(input_y, y_h0, y_h1);
    UNPCK_SB_SH(input_cb, cb_h0, cb_h1);
    UNPCK_SB_SH(input_cr, cr_h0, cr_h1);
    /* G uses both chroma components at 16-bit precision */
    CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0);
    /* R needs Cr widened to 32-bit words */
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    UNPCK_SH_SW(cr_h1, cr_w2, cr_w3);
    CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0);
    /* B needs Cb widened to 32-bit words */
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cb_h1, cb_w2, cb_w3);
    CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0);
    /* interleave the four channel vectors into 16 packed 4-byte pixels */
    ILVRL_B2_SB(out_r0, out_g0, out_rgb0, out_rgb1);
    ILVRL_B2_SB(out_b0, alpha, out_ab0, out_ab1);
    ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1);
    ILVRL_H2_UB(out_rgb1, out_ab1, out2, out3);
    ST_UB4(out0, out1, out2, out3, p_rgb, 16);
    p_rgb += 16 * 4;
  }

  /* --- 8-pixel SIMD tail --- */
  if (remaining_wd >= 8) {
    uint64_t in_y, in_cb, in_cr;
    v16i8 input_cbcr = {0};

    in_y = LD(p_in_y);
    in_cb = LD(p_in_cb);
    in_cr = LD(p_in_cr);
    p_in_y += 8;
    p_in_cb += 8;
    p_in_cr += 8;
    /* pack 8 Cb samples into the low half and 8 Cr into the high half */
    input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr);
    input_cbcr -= const_128;
    y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y);
    UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0);
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0);
    CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0);
    CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0);
    out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_r0, (v16i8) out_g0);
    out_ab0 = (v16i8) __msa_ilvr_b((v16i8) out_b0, alpha);
    ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1);
    ST_UB2(out0, out1, p_rgb, 16);
    p_rgb += 16 * 2;
    remaining_wd -= 8;
  }

  /* --- scalar tail for the final < 8 pixels --- */
  for (col = 0; col < remaining_wd; col++) {
    y = (int) (p_in_y[col]);
    cb = (int) (p_in_cb[col]) - 128;
    cr = (int) (p_in_cr[col]) - 128;
    /* ABGR byte order: alpha first, then B, G, R */
    p_rgb[0] = 0xFF;
    p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16));
    p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb -
                                                  FIX_0_71414 * cr), 16));
    p_rgb[3] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16));
    p_rgb += 4;
  }
}
/* Horizontal 8-tap loop filter applied to two adjacent 8-column blocks at
 * once: block 0's thresholds occupy the low 64 bits of each control vector,
 * block 1's the high 64 bits. Columns whose "flat" test fails get the 4-tap
 * filter result; flat columns get the wider filter8 result, selected
 * per-byte with bit-select at the end. */
void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *b_limit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
  /* 16-bit working copies for the filter8 path (right/left halves) */
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements: 4 rows each side of the edge, 16 columns */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* pair block0 (low 8 lanes) with block1 (high 8 lanes) per control */
  thresh = (v16u8)__msa_fill_b(*thresh0);
  tmp = (v16u8)__msa_fill_b(*thresh1);
  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);

  b_limit = (v16u8)__msa_fill_b(*b_limit0);
  tmp = (v16u8)__msa_fill_b(*b_limit1);
  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);

  limit = (v16u8)__msa_fill_b(*limit0);
  tmp = (v16u8)__msa_fill_b(*limit1);
  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                     q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no flat columns anywhere: 4-tap results are final */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
  } else {
    /* widen all 8 rows to 16 bits and run the wide filter on each half */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
               q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
               p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
               q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per-byte select filter8 result where flat,
       else keep the 4-tap result (or the original p2/q2, which the
       4-tap filter never touches) */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    src -= 3 * pitch;
    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1_out, q2_out, src, pitch);
    src += (2 * pitch);
  }
}
/* Add a 16x16 block of residual coefficients to the destination pixels,
 * clip to [0, 255], and store. Software-pipelined: each iteration adds the
 * data loaded by the PREVIOUS iteration, so the next loads are issued
 * before the current results are clipped/stored. */
static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst,
                                  int32_t stride) {
  uint8_t loop_cnt;
  uint8_t *temp_dst = dst;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

  /* Pre-load for next iteration */
  LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
  temp_dst += (4 * stride);
  /* even in-regs hold the left 8 coeffs of each row, odd the right 8 */
  LD_SH4(coeffs, 16, in0, in2, in4, in6);
  LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
  coeffs += 64;

  /* 3 pipelined iterations + epilogue below = 4 groups of 4 rows */
  for (loop_cnt = 3; loop_cnt--;) {
    /* widen the previously loaded dst rows to 16 bits (right/left 8) */
    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
    UNPCK_UB_SH(dst7, dst_r3, dst_l3);

    /* add residuals for the rows just unpacked */
    dst_r0 += in0;
    dst_l0 += in1;
    dst_r1 += in2;
    dst_l1 += in3;
    dst_r2 += in4;
    dst_l2 += in5;
    dst_r3 += in6;
    dst_l3 += in7;

    /* Pre-load for next iteration (must happen before the stores below
       overwrite nothing — loads read 4 rows ahead of the store window) */
    LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
    temp_dst += (4 * stride);
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
    coeffs += 64;

    /* clip to pixel range and repack to bytes */
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
    dst += (4 * stride);
  }

  /* epilogue: process the final pre-loaded group of 4 rows */
  UNPCK_UB_SH(dst4, dst_r0, dst_l0);
  UNPCK_UB_SH(dst5, dst_r1, dst_l1);
  UNPCK_UB_SH(dst6, dst_r2, dst_l2);
  UNPCK_UB_SH(dst7, dst_r3, dst_l3);
  dst_r0 += in0;
  dst_l0 += in1;
  dst_r1 += in2;
  dst_l1 += in3;
  dst_r2 += in4;
  dst_l2 += in5;
  dst_r3 += in6;
  dst_l3 += in7;
  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
  CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, dst_r3,
              dst0, dst1, dst2, dst3);
  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
}