/* Blend an 8x8 block: dst = (src * src_weight + dst * dst_weight) >> MFQE_PRECISION,
 * with rounding, where dst_weight = (1 << MFQE_PRECISION) - src_weight so the two
 * weights always sum to the full precision scale.  Processes 4 rows per loop
 * iteration (two 2-row groups), 2 iterations total. */
static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  uint64_t src0_d, src1_d, dst0_d, dst1_d;
  v16i8 src0 = { 0 };
  v16i8 src1 = { 0 };
  v16i8 dst0 = { 0 };
  v16i8 dst1 = { 0 };
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

  /* Broadcast the scalar weights across all 8 halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  for (row = 2; row--;) {
    /* Gather rows 0-1 of src and dst as two 64-bit loads each, packed into
     * one 16-byte vector per stream. */
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src0);
    INSERT_D2_SB(dst0_d, dst1_d, dst0);

    /* Gather rows 2-3; dst_ptr itself is not advanced yet because the
     * blended rows 0-1 still have to be stored through it below. */
    LD2(src_ptr, src_stride, src0_d, src1_d);
    src_ptr += (2 * src_stride);
    LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
    INSERT_D2_SB(src0_d, src1_d, src1);
    INSERT_D2_SB(dst0_d, dst1_d, dst1);

    /* Widen bytes to halfwords, apply the weighted sum, round-shift back. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    /* Pack halfwords back to bytes and store rows 0-1. */
    dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst0, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);

    /* Same weighted blend for rows 2-3. */
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
    ST8x2_UB(dst1, dst_ptr, dst_stride);
    dst_ptr += (2 * dst_stride);
  }
}
/* Blend a 16x16 block: dst = (src * src_weight + dst * dst_weight) >> MFQE_PRECISION,
 * rounded, with dst_weight = (1 << MFQE_PRECISION) - src_weight.  The loop body is
 * unrolled 4x (one full 16-pixel row per unrolled section), 4 iterations total. */
static void filter_by_weight16x16_msa(const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, int32_t src_weight) {
  int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
  int32_t row;
  v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

  /* Broadcast the scalar weights across all halfword lanes. */
  src_wt = __msa_fill_h(src_weight);
  dst_wt = __msa_fill_h(dst_weight);

  for (row = 4; row--;) {
    /* Load 4 source rows and 4 destination rows (16 bytes each). */
    LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);

    /* Row 0: widen to halfwords, weighted sum, rounding shift, pack+store. */
    UNPCK_UB_SH(src0, src_r, src_l);
    UNPCK_UB_SH(dst0, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 1. */
    UNPCK_UB_SH(src1, src_r, src_l);
    UNPCK_UB_SH(dst1, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 2. */
    UNPCK_UB_SH(src2, src_r, src_l);
    UNPCK_UB_SH(dst2, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;

    /* Row 3. */
    UNPCK_UB_SH(src3, src_r, src_l);
    UNPCK_UB_SH(dst3, dst_r, dst_l);
    res_h_r = (src_r * src_wt);
    res_h_r += (dst_r * dst_wt);
    res_h_l = (src_l * src_wt);
    res_h_l += (dst_l * dst_wt);
    SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
    PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
    dst_ptr += dst_stride;
  }
}
/* Add an 8x8 block of 16-bit residual coefficients to the 8x8 prediction in
 * dst, clipping the result to [0, 255].  Handles the block as two 4-row
 * halves; temp_dst tracks the load position while dst tracks the store
 * position. */
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) {
  uint8_t *temp_dst = dst;
  uint64_t dst0, dst1, dst2, dst3;
  v2i64 dst_vec0 = { 0 };
  v2i64 dst_vec1 = { 0 };
  v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v16u8 zeros = { 0 };

  /* All 64 coefficients (8 rows of 8 int16) up front. */
  LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);

  /* First 4 prediction rows, two 8-byte rows per vector. */
  LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
  temp_dst += (4 * stride);
  INSERT_D2_SD(dst0, dst1, dst_vec0);
  INSERT_D2_SD(dst2, dst3, dst_vec1);
  /* Zero-extend prediction bytes to halfwords (interleave with zeros). */
  ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
  ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
  /* prediction + residual, clip to pixel range, pack, store rows 0-3. */
  ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3, dst_r0, dst_l0, dst_r1, dst_l1);
  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
  PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
  ST8x4_UB(dst_r0, dst_r1, dst, stride);
  dst += (4 * stride);

  /* Same for rows 4-7 (UNPCK_UB_SH performs the same zero-extension). */
  LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
  INSERT_D2_SD(dst0, dst1, dst_vec0);
  INSERT_D2_SD(dst2, dst3, dst_vec1);
  UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
  UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
  ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7, dst_r0, dst_l0, dst_r1, dst_l1);
  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
  PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
  ST8x4_UB(dst_r0, dst_r1, dst, stride);
}
/* DC-only 4x4 inverse transform + reconstruction: derive the DC value from
 * p_src[0] with rounding ((dc + 32) >> 6), add it to every pixel of the 4x4
 * prediction block, clip to [0, 255] and store.  Clears p_src[0] afterwards
 * (the coefficient is consumed). */
static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src, int32_t i_dst_stride )
{
    int16_t i_dc;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    v16u8 pred = { 0 };
    v16i8 out;
    v8i16 input_dc, pred_r, pred_l;

    /* Rounded descale of the DC coefficient, broadcast to all lanes. */
    i_dc = ( p_src[0] + 32 ) >> 6;
    input_dc = __msa_fill_h( i_dc );
    p_src[ 0 ] = 0;

    /* Load the 4x4 prediction as four 32-bit words into one vector. */
    LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
    INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
    /* Zero-extend bytes to halfwords so the DC add cannot wrap. */
    UNPCK_UB_SH( pred, pred_r, pred_l );
    pred_r += input_dc;
    pred_l += input_dc;
    CLIP_SH2_0_255( pred_r, pred_l );
    /* Pack back to bytes and store the 4 rows. */
    out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
    ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
}
/* Convert one row of planar YCbCr to packed ABGR (memory order A,B,G,R per
 * pixel, 4 bytes each).  Pixels are processed 16 at a time with MSA vectors,
 * then 8 at a time, then one at a time with the scalar fixed-point formulas.
 * Cb/Cr are centered by subtracting 128 before the R/G/B calculations. */
void yuv_abgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr, JSAMPROW p_rgb, JDIMENSION out_width)
{
  int y, cb, cr;
  unsigned int col, num_cols_mul_16 = out_width >> 4;
  unsigned int remaining_wd = out_width & 0xF;
  v16i8 alpha = __msa_ldi_b(0xFF);      /* opaque alpha byte for every pixel */
  v16i8 const_128 = __msa_ldi_b(128);   /* chroma bias */
  v16u8 out0, out1, out2, out3, input_y = {0};
  v16i8 input_cb, input_cr, out_rgb0, out_rgb1, out_ab0, out_ab1;
  v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1;
  v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0};
  v16i8 out_r0, out_g0, out_b0;

  /* Main loop: 16 pixels per iteration. */
  for (col = num_cols_mul_16; col--;) {
    input_y = LD_UB(p_in_y);
    input_cb = LD_SB(p_in_cb);
    input_cr = LD_SB(p_in_cr);
    p_in_y += 16;
    p_in_cb += 16;
    p_in_cr += 16;
    input_cb -= const_128;
    input_cr -= const_128;
    /* Widen: Y zero-extended, centered Cb/Cr sign-extended, to halfwords. */
    UNPCK_UB_SH(input_y, y_h0, y_h1);
    UNPCK_SB_SH(input_cb, cb_h0, cb_h1);
    UNPCK_SB_SH(input_cr, cr_h0, cr_h1);
    /* G from Y/Cb/Cr; R needs Cr widened to words; B needs Cb as words. */
    CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0);
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    UNPCK_SH_SW(cr_h1, cr_w2, cr_w3);
    CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0);
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cb_h1, cb_w2, cb_w3);
    CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0);
    /* Interleave (g,r) and (a,b) byte pairs, then halfword-interleave the
     * pairs to produce the A,B,G,R byte order in memory. */
    ILVRL_B2_SB(out_r0, out_g0, out_rgb0, out_rgb1);
    ILVRL_B2_SB(out_b0, alpha, out_ab0, out_ab1);
    ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1);
    ILVRL_H2_UB(out_rgb1, out_ab1, out2, out3);
    ST_UB4(out0, out1, out2, out3, p_rgb, 16);
    p_rgb += 16 * 4;
  }

  /* Tail: 8 pixels, with Cb and Cr sharing one vector (low/high halves). */
  if (remaining_wd >= 8) {
    uint64_t in_y, in_cb, in_cr;
    v16i8 input_cbcr = {0};
    in_y = LD(p_in_y);
    in_cb = LD(p_in_cb);
    in_cr = LD(p_in_cr);
    p_in_y += 8;
    p_in_cb += 8;
    p_in_cr += 8;
    input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr);
    input_cbcr -= const_128;
    y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y);
    UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0);
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0);
    CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0);
    CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0);
    out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_r0, (v16i8) out_g0);
    out_ab0 = (v16i8) __msa_ilvr_b((v16i8) out_b0, alpha);
    ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1);
    ST_UB2(out0, out1, p_rgb, 16);
    p_rgb += 16 * 2;
    remaining_wd -= 8;
  }

  /* Scalar tail: remaining 0-7 pixels via the fixed-point reference path. */
  for (col = 0; col < remaining_wd; col++) {
    y = (int) (p_in_y[col]);
    cb = (int) (p_in_cb[col]) - 128;
    cr = (int) (p_in_cr[col]) - 128;
    p_rgb[0] = 0xFF;
    p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16));
    p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb - FIX_0_71414 * cr), 16));
    p_rgb[3] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16));
    p_rgb += 4;
  }
}
/* Convert one row of planar YCbCr to packed BGR (memory order B,G,R per
 * pixel, 3 bytes each).  16 pixels per vector iteration (48 output bytes,
 * assembled with byte-shuffle masks), then an 8-pixel tail, then scalar
 * pixels.  Cb/Cr are centered by subtracting 128 before conversion. */
void yuv_bgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr, JSAMPROW p_rgb, JDIMENSION out_width)
{
  int32_t y, cb, cr;
  uint32_t col, num_cols_mul_16 = out_width >> 4;
  uint32_t remaining_wd = out_width & 0xF;
  /* Shuffle masks merging the interleaved (g,b) pairs with the R vector
   * into the 3-byte-per-pixel BGR stream (indices >= 16 select from the
   * second shuffle operand). */
  v16u8 mask_rgb0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
  v16u8 mask_rgb1 = {11, 21, 12, 13, 22, 14, 15, 23, 0, 1, 24, 2, 3, 25, 4, 5};
  v16u8 mask_rgb2 = {26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31};
  v16u8 tmp0, tmp1, out0, out1, out2, input_y = {0};
  v16i8 input_cb, input_cr, out_rgb0, out_rgb1, const_128 = __msa_ldi_b(128);
  v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1;
  v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0};
  v16i8 out_r0, out_g0, out_b0;

  /* Main loop: 16 pixels -> 48 bytes per iteration. */
  for (col = num_cols_mul_16; col--;) {
    input_y = LD_UB(p_in_y);
    input_cb = LD_SB(p_in_cb);
    input_cr = LD_SB(p_in_cr);
    p_in_y += 16;
    p_in_cb += 16;
    p_in_cr += 16;
    input_cb -= const_128;
    input_cr -= const_128;
    /* Widen: Y zero-extended, centered chroma sign-extended. */
    UNPCK_UB_SH(input_y, y_h0, y_h1);
    UNPCK_SB_SH(input_cb, cb_h0, cb_h1);
    UNPCK_SB_SH(input_cr, cr_h0, cr_h1);
    CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0);
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    UNPCK_SH_SW(cr_h1, cr_w2, cr_w3);
    CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0);
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cb_h1, cb_w2, cb_w3);
    CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0);
    /* Interleave (g,b) pairs, then shuffle R in to build 3 output vectors;
     * the middle vector is stitched from the tails of two shuffles. */
    ILVRL_B2_SB(out_g0, out_b0, out_rgb0, out_rgb1);
    VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1, out0, tmp0);
    VSHF_B2_UB(out_rgb1, out_r0, out_rgb1, out_r0, mask_rgb1, mask_rgb2, tmp1, out2);
    out1 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) tmp1, 8);
    out1 = (v16u8) __msa_pckev_d((v2i64) out1, (v2i64) tmp0);
    ST_UB(out0, p_rgb);
    p_rgb += 16;
    ST_UB(out1, p_rgb);
    p_rgb += 16;
    ST_UB(out2, p_rgb);
    p_rgb += 16;
  }

  /* Tail: 8 pixels -> 24 bytes, Cb/Cr share one vector (low/high halves). */
  if (remaining_wd >= 8) {
    uint64_t in_y, in_cb, in_cr;
    v16i8 input_cbcr = {0};
    in_y = LD(p_in_y);
    in_cb = LD(p_in_cb);
    in_cr = LD(p_in_cr);
    p_in_y += 8;
    p_in_cb += 8;
    p_in_cr += 8;
    input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr);
    input_cbcr -= const_128;
    y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y);
    UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0);
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0);
    CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0);
    CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0);
    out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_g0, (v16i8) out_b0);
    VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1, out0, out1);
    /* 24 output bytes: one full store plus one 8-byte store. */
    ST_UB(out0, p_rgb);
    p_rgb += 16;
    ST8x1_UB(out1, p_rgb);
    p_rgb += 8;
    remaining_wd -= 8;
  }

  /* Scalar tail: remaining 0-7 pixels via the fixed-point reference path. */
  for (col = 0; col < remaining_wd; col++) {
    y = (int) (p_in_y[col]);
    cb = (int) (p_in_cb[col]) - 128;
    cr = (int) (p_in_cr[col]) - 128;
    /* Range-limiting is essential due to noise introduced by DCT losses. */
    p_rgb[0] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16));
    p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb - FIX_0_71414 * cr), 16));
    p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16));
    p_rgb += 3;
  }
}
static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { uint32_t row; v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b; v16u8 frame_l, frame_h; v16i8 zero = { 0 }; v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; v8i16 diff0, diff1, cnt0, cnt1; v4i32 const3, const16, filter_wt, strength; v4i32 mod0_w, mod1_w, mod2_w, mod3_w; v4i32 diff0_r, diff0_l, diff1_r, diff1_l; v4i32 frame2_0, frame2_1, frame2_2, frame2_3; v4i32 acc0, acc1, acc2, acc3; filter_wt = __msa_fill_w(filter_wt_in); strength = __msa_fill_w(strength_in); const3 = __msa_ldi_w(3); const16 = __msa_ldi_w(16); for (row = 8; row--;) { frame1_0_b = LD_SB(frame1_ptr); frame2_0_b = LD_SB(frame2_ptr); frame1_ptr += stride; frame2_ptr += 16; frame1_1_b = LD_SB(frame1_ptr); frame2_1_b = LD_SB(frame2_ptr); LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h) ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, 
mod1_h, cnt, 8); cnt += 16; ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); 
acc += 16; frame1_ptr += stride; frame2_ptr += 16; } }
/* Temporal-filter accumulation for an 8x8 block.  Same algorithm as the
 * 16-wide variant: per pixel, modifier = round((3 * (frame1 - frame2)^2)
 * >> strength); if modifier >= 16 the weight is 0, else weight =
 * (16 - modifier) * filter_wt; then cnt += weight and
 * acc += weight * frame2 pixel.  Four 8-pixel rows (two per vector) are
 * handled per loop iteration, two iterations total. */
static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
  uint32_t row;
  uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
  v16i8 frame1 = { 0 };
  v16i8 frame2 = { 0 };
  v16i8 frame3 = { 0 };
  v16i8 frame4 = { 0 };
  v16u8 frame_l, frame_h;
  v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
  v8i16 diff0, diff1, cnt0, cnt1;
  v4i32 const3, const16;
  v4i32 filter_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
  v4i32 acc0, acc1, acc2, acc3;

  /* Broadcast scalar parameters across all word lanes. */
  filter_wt = __msa_fill_w(filter_wt_in);
  strength = __msa_fill_w(strength_in);
  const3 = __msa_ldi_w(3);
  const16 = __msa_ldi_w(16);

  for (row = 2; row--;) {
    /* Load 4 rows of 8 bytes from each frame; frame1 uses `stride`,
     * frame2 is contiguous.  Pack two rows per 16-byte vector. */
    LD2(frame1_ptr, stride, f0, f1);
    frame1_ptr += (2 * stride);
    LD2(frame2_ptr, 8, f2, f3);
    frame2_ptr += 16;
    LD2(frame1_ptr, stride, f4, f5);
    frame1_ptr += (2 * stride);
    LD2(frame2_ptr, 8, f6, f7);
    frame2_ptr += 16;
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);
    INSERT_D2_SB(f0, f1, frame1);
    INSERT_D2_SB(f2, f3, frame2);
    INSERT_D2_SB(f4, f5, frame3);
    INSERT_D2_SB(f6, f7, frame4);

    /* ---- Rows 0-1: interleave frames, horizontal subtract -> diff. ---- */
    ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    /* modifier = round((3 * diff^2) >> strength). */
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    /* Zero out lanes with modifier >= 16; else (16 - modifier). */
    diff0_r = (mod0_w < const16);
    diff0_l = (mod1_w < const16);
    diff1_r = (mod2_w < const16);
    diff1_l = (mod3_w < const16);
    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
    /* cnt += weight. */
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;
    /* acc += weight * frame2 pixel. */
    UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
    acc += 16;

    /* ---- Rows 2-3: identical pipeline on frame3/frame4. ---- */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);
    ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    diff0_r = (mod0_w < const16);
    diff0_l = (mod1_w < const16);
    diff1_r = (mod2_w < const16);
    diff1_l = (mod3_w < const16);
    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;
    UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
    acc += 16;
  }
}
/* Add a 32x32 block of 16-bit residual coefficients to the 32x32 prediction
 * in dst, clipping to [0, 255].  Software-pipelined: each step loads the
 * NEXT two prediction rows and coefficients (via temp_dst/coeffs) while
 * clipping/packing/storing the CURRENT two rows (via dst).  Prologue loads
 * + 14 loop iterations + 2 unrolled epilogue steps cover all 16 row-pairs. */
static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) {
  uint8_t loop_cnt;
  uint8_t *temp_dst = dst;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

  /* Prologue: prime the pipeline with the first two rows of prediction
   * (two 16-byte vectors per row) and their coefficients. */
  /* Pre-load for next iteration */
  LD_UB2(temp_dst, 16, dst4, dst5);
  temp_dst += stride;
  LD_UB2(temp_dst, 16, dst6, dst7);
  temp_dst += stride;
  LD_SH4(coeffs, 16, in0, in2, in4, in6);
  LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
  coeffs += 64;

  for (loop_cnt = 14; loop_cnt--;) {
    /* Widen current prediction rows to halfwords and add residuals. */
    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
    UNPCK_UB_SH(dst7, dst_r3, dst_l3);
    dst_r0 += in0;
    dst_l0 += in1;
    dst_r1 += in2;
    dst_l1 += in3;
    dst_r2 += in4;
    dst_l2 += in5;
    dst_r3 += in6;
    dst_l3 += in7;
    /* Pre-load for next iteration */
    LD_UB2(temp_dst, 16, dst4, dst5);
    temp_dst += stride;
    LD_UB2(temp_dst, 16, dst6, dst7);
    temp_dst += stride;
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
    coeffs += 64;
    /* Clip, pack to bytes and store the two current rows. */
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, dst_r3, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst1, dst, 16);
    dst += stride;
    ST_UB2(dst2, dst3, dst, 16);
    dst += stride;
  }

  /* Epilogue step 1: process rows loaded in the last loop iteration while
   * pre-loading the final pair. */
  UNPCK_UB_SH(dst4, dst_r0, dst_l0);
  UNPCK_UB_SH(dst5, dst_r1, dst_l1);
  UNPCK_UB_SH(dst6, dst_r2, dst_l2);
  UNPCK_UB_SH(dst7, dst_r3, dst_l3);
  dst_r0 += in0;
  dst_l0 += in1;
  dst_r1 += in2;
  dst_l1 += in3;
  dst_r2 += in4;
  dst_l2 += in5;
  dst_r3 += in6;
  dst_l3 += in7;
  /* Pre-load for next iteration */
  LD_UB2(temp_dst, 16, dst4, dst5);
  temp_dst += stride;
  LD_UB2(temp_dst, 16, dst6, dst7);
  temp_dst += stride;
  LD_SH4(coeffs, 16, in0, in2, in4, in6);
  LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
  CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, dst_r3, dst0, dst1, dst2, dst3);
  ST_UB2(dst0, dst1, dst, 16);
  dst += stride;
  ST_UB2(dst2, dst3, dst, 16);
  dst += stride;

  /* Epilogue step 2: drain the pipeline — last two rows, no further loads. */
  UNPCK_UB_SH(dst4, dst_r0, dst_l0);
  UNPCK_UB_SH(dst5, dst_r1, dst_l1);
  UNPCK_UB_SH(dst6, dst_r2, dst_l2);
  UNPCK_UB_SH(dst7, dst_r3, dst_l3);
  dst_r0 += in0;
  dst_l0 += in1;
  dst_r1 += in2;
  dst_l1 += in3;
  dst_r2 += in4;
  dst_l2 += in5;
  dst_r3 += in6;
  dst_l3 += in7;
  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
  CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3, dst_r3, dst0, dst1, dst2, dst3);
  ST_UB2(dst0, dst1, dst, 16);
  dst += stride;
  ST_UB2(dst2, dst3, dst, 16);
}