/*
 * Dequantize a 4x4 block of 16-bit coefficients in place.
 *
 * p_dct          16 coefficients; overwritten with the scaled result.
 * pi_dequant_mf  six tables of 16 32-bit scale factors; row i_qp % 6 is used.
 * i_qp           quantizer. i_qp / 6 - 4 is the shift: when it is >= 0 the
 *                16-bit products are shifted left by it, otherwise the
 *                products are formed in 32 bits, a rounding term of
 *                1 << (-shift - 1) is added and they are shifted right.
 */
static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                                 int32_t i_qp )
{
    const int32_t shift = i_qp / 6 - 4;
    int32_t *p_mf = pi_dequant_mf[i_qp % 6];
    v8i16 coef_lo, coef_hi;
    v4i32 mf0, mf1, mf2, mf3;

    LD_SH2( p_dct, 8, coef_lo, coef_hi );
    LD_SW2( p_mf, 4, mf0, mf1 );
    LD_SW2( p_mf + 8, 4, mf2, mf3 );

    if( shift < 0 )
    {
        /* Scale down: work in 32-bit lanes so the rounding add cannot be
         * truncated before the right shift. */
        const int32_t rshift = -shift;
        v4i32 w0, w1, w2, w3;
        v4i32 round_vec = __msa_fill_w( 1 << ( rshift - 1 ) );
        v4i32 rshift_vec = __msa_fill_w( rshift );

        UNPCK_SH_SW( coef_lo, w0, w1 );
        UNPCK_SH_SW( coef_hi, w2, w3 );

        w0 = w0 * mf0 + round_vec;
        w1 = w1 * mf1 + round_vec;
        w2 = w2 * mf2 + round_vec;
        w3 = w3 * mf3 + round_vec;

        SRA_4V( w0, w1, w2, w3, rshift_vec );
        PCKEV_H2_SH( w1, w0, w3, w2, coef_lo, coef_hi );
    }
    else
    {
        /* Scale up: pack the factors to 16 bits and stay in 16-bit lanes. */
        v8i16 mf_h0, mf_h1;
        v8i16 lshift_vec = __msa_fill_h( shift );

        PCKEV_H2_SH( mf1, mf0, mf3, mf2, mf_h0, mf_h1 );

        coef_lo = ( coef_lo * mf_h0 ) << lshift_vec;
        coef_hi = ( coef_hi * mf_h1 ) << lshift_vec;
    }

    ST_SH2( coef_lo, coef_hi, p_dct, 8 );
}
/*
 * Dequantize a 4x4 block of DC coefficients in place using one scale factor.
 *
 * p_dct          16 coefficients; overwritten with the scaled result.
 * pi_dequant_mf  six tables of scale factors; only element [i_qp % 6][0]
 *                is read.
 * i_qp           quantizer. i_qp / 6 - 6 is the shift: when it is >= 0 the
 *                factor itself is shifted left before the 16-bit multiply,
 *                otherwise the products are formed in 32 bits, rounded with
 *                1 << (-shift - 1) and shifted right.
 */
static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
                                    int32_t pi_dequant_mf[6][16],
                                    int32_t i_qp )
{
    const int32_t shift = i_qp / 6 - 6;
    int32_t dc_mf = pi_dequant_mf[i_qp % 6][0];
    v8i16 coef_lo, coef_hi;

    LD_SH2( p_dct, 8, coef_lo, coef_hi );

    if( shift < 0 )
    {
        const int32_t rshift = -shift;
        v4i32 w0, w1, w2, w3;
        v4i32 round_vec = __msa_fill_w( 1 << ( rshift - 1 ) );
        v4i32 rshift_vec = __msa_fill_w( rshift );
        v4i32 mf_vec = __msa_fill_w( dc_mf );

        UNPCK_SH_SW( coef_lo, w0, w1 );
        UNPCK_SH_SW( coef_hi, w2, w3 );

        w0 = w0 * mf_vec + round_vec;
        w1 = w1 * mf_vec + round_vec;
        w2 = w2 * mf_vec + round_vec;
        w3 = w3 * mf_vec + round_vec;

        SRA_4V( w0, w1, w2, w3, rshift_vec );
        PCKEV_H2_SH( w1, w0, w3, w2, coef_lo, coef_hi );
    }
    else
    {
        v8i16 mf_h;

        /* Fold the left shift into the scalar factor before broadcasting. */
        dc_mf <<= shift;
        mf_h = __msa_fill_h( dc_mf );

        coef_lo *= mf_h;
        coef_hi *= mf_h;
    }

    ST_SH2( coef_lo, coef_hi, p_dct, 8 );
}
static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1, dct0_mask, dct1_mask; v8i16 zero = { 0 }; v8i16 dct_h0, dct_h1; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3; v4i32 mf_vec, bias_vec; LD_SH2( p_dct, 8, dct0, dct1 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); bias_vec = __msa_fill_w( i_bias ); mf_vec = __msa_fill_w( i_mf ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec ); dct_w0 *= mf_vec; dct_w1 *= mf_vec; dct_w2 *= mf_vec; dct_w3 *= mf_vec; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); dct0 = zero - dct_h0; dct1 = zero - dct_h1; dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); ST_SH2( dct0, dct1, p_dct, 8 ); return !!non_zero; }
// Forward transform of the 4x4 difference between 'src' and 'ref'.
// Four 4-byte rows are read from each input at a row pitch of BPS, and the
// 16 resulting 16-bit coefficients are written contiguously to 'out'
// (four 8-byte stores, 8 bytes apart).
// The same butterfly/rotate sequence is run twice (once per direction); the
// two passes differ only in their rounding constants and final shifts.
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  v16u8 srcl0, srcl1, src0, src1;
  // Halfword shuffle patterns used to regroup lanes before/after each
  // add/sub stage.
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  // Coefficient pairs for the two dot products of the rotation stage.
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

  // Gather the 4x4 block of each input into one vector.
  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);
  // Interleave src/ref bytes and reduce each pair to a signed 16-bit value
  // (presumably the per-pixel difference; HSUB_UB2_SH is defined elsewhere).
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);

  // ---- first pass ----
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  // NOTE(review): SRLI_H is defined outside this view; confirm that the
  // shift direction it performs gives the intended scaling of the sum terms.
  t0 = SRLI_H(t0, 3);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  // Pairwise add / subtract of adjacent halfwords, widened to 32 bits.
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  // Seed the accumulators with the rounding constants, add the dot
  // products with cnst1 (into tmp1) and cnst0 (into tmp3), then shift by 9.
  FILL_W2_SW(1812, 937, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);

  // ---- second pass ----
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  // (x + 7) >> 4 for the add/sub outputs.
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
  SRAI_W2_SW(tmp0, tmp2, 4);
  // Rotation outputs: rounding constants 12000 / 51000, shift by 16.
  FILL_W2_SW(12000, 51000, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);
  // Add 1 to each tmp1 lane whose corresponding widened t1 lane is non-zero:
  // ceqi gives all-ones where the lane == 0, nor inverts that mask, and the
  // AND with a vector of ones turns it into 0/1.
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);

  // Write the four 64-bit halves of the two result vectors to 'out'.
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  SD4(out0, out1, out2, out3, out, 8);
}
static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { uint32_t row; v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b; v16u8 frame_l, frame_h; v16i8 zero = { 0 }; v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; v8i16 diff0, diff1, cnt0, cnt1; v4i32 const3, const16, filter_wt, strength; v4i32 mod0_w, mod1_w, mod2_w, mod3_w; v4i32 diff0_r, diff0_l, diff1_r, diff1_l; v4i32 frame2_0, frame2_1, frame2_2, frame2_3; v4i32 acc0, acc1, acc2, acc3; filter_wt = __msa_fill_w(filter_wt_in); strength = __msa_fill_w(strength_in); const3 = __msa_ldi_w(3); const16 = __msa_ldi_w(16); for (row = 8; row--;) { frame1_0_b = LD_SB(frame1_ptr); frame2_0_b = LD_SB(frame2_ptr); frame1_ptr += stride; frame2_ptr += 16; frame1_1_b = LD_SB(frame1_ptr); frame2_1_b = LD_SB(frame2_ptr); LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h) ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, 
mod1_h, cnt, 8); cnt += 16; ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); 
acc += 16; frame1_ptr += stride; frame2_ptr += 16; } }
static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { uint32_t row; uint64_t f0, f1, f2, f3, f4, f5, f6, f7; v16i8 frame1 = { 0 }; v16i8 frame2 = { 0 }; v16i8 frame3 = { 0 }; v16i8 frame4 = { 0 }; v16u8 frame_l, frame_h; v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; v8i16 diff0, diff1, cnt0, cnt1; v4i32 const3, const16; v4i32 filter_wt, strength; v4i32 mod0_w, mod1_w, mod2_w, mod3_w; v4i32 diff0_r, diff0_l, diff1_r, diff1_l; v4i32 frame2_0, frame2_1, frame2_2, frame2_3; v4i32 acc0, acc1, acc2, acc3; filter_wt = __msa_fill_w(filter_wt_in); strength = __msa_fill_w(strength_in); const3 = __msa_ldi_w(3); const16 = __msa_ldi_w(16); for (row = 2; row--;) { LD2(frame1_ptr, stride, f0, f1); frame1_ptr += (2 * stride); LD2(frame2_ptr, 8, f2, f3); frame2_ptr += 16; LD2(frame1_ptr, stride, f4, f5); frame1_ptr += (2 * stride); LD2(frame2_ptr, 8, f6, f7); frame2_ptr += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); INSERT_D2_SB(f0, f1, frame1); INSERT_D2_SB(f2, f3, frame2); INSERT_D2_SB(f4, f5, frame3); INSERT_D2_SB(f6, f7, frame4); ILVRL_B2_UB(frame1, frame2, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, 
mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame3, frame4, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, 
mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; } }
static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch, uint8_t round) { uint8_t i; const int16_t *filter_ptr0 = >32x32_cnst0[0]; const int16_t *filter_ptr1 = >32x32_cnst1[0]; const int16_t *filter_ptr2 = >32x32_cnst2[0]; const int16_t *filter_ptr3 = >8x8_cnst[0]; int16_t *src0 = (coeffs + buf_pitch); int16_t *src1 = (coeffs + 2 * buf_pitch); int16_t *src2 = (coeffs + 4 * buf_pitch); int16_t *src3 = (coeffs); int32_t cnst0, cnst1; int32_t tmp_buf[8 * 32 + 15]; int32_t *tmp_buf_ptr = tmp_buf + 15; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r; v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l; v8i16 filt0, filter0, filter1, filter2, filter3; v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l; /* Align pointer to 64 byte boundary */ tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63); /* process coeff 4, 12, 20, 28 */ LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3); ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r); ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l); LD_SH2(src3, 16 * buf_pitch, in4, in6); LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in5, in7); ILVR_H2_SH(in6, in4, in7, in5, src2_r, src3_r); ILVL_H2_SH(in6, in4, in7, in5, src2_l, src3_l); /* loop for all columns of constants */ for (i = 0; i < 2; i++) { /* processing single column of constants */ cnst0 = LW(filter_ptr2); cnst1 = LW(filter_ptr2 + 2); filter0 = (v8i16) __msa_fill_w(cnst0); filter1 = (v8i16) __msa_fill_w(cnst1); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l); ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + 2 * i * 8), 4); /* processing single column of constants */ cnst0 = LW(filter_ptr2 + 4); cnst1 = LW(filter_ptr2 + 6); filter0 = (v8i16) __msa_fill_w(cnst0); filter1 = (v8i16) __msa_fill_w(cnst1); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH2_SW(src1_r, 
src1_l, filter1, filter1, sum0_r, sum0_l); ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + (2 * i + 1) * 8), 4); filter_ptr2 += 8; } /* process coeff 0, 8, 16, 24 */ /* loop for all columns of constants */ for (i = 0; i < 2; i++) { /* processing first column of filter constants */ cnst0 = LW(filter_ptr3); cnst1 = LW(filter_ptr3 + 2); filter0 = (v8i16) __msa_fill_w(cnst0); filter1 = (v8i16) __msa_fill_w(cnst1); DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter0, filter0, filter1, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l); sum1_r = sum0_r - tmp1_r; sum1_l = sum0_l - tmp1_l; sum0_r = sum0_r + tmp1_r; sum0_l = sum0_l + tmp1_l; HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i)); HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i)); filter_ptr3 += 8; } /* process coeff 2 6 10 14 18 22 26 30 */ LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_r, src1_r, src2_r, src3_r); ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_l, src1_l, src2_l, src3_l); /* loop for all columns of constants */ for (i = 0; i < 8; i++) { /* processing single column of constants */ filt0 = LD_SH(filter_ptr1); SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2, filter2, sum0_r, sum0_l, sum0_r, sum0_l); DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l); LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l); tmp1_r = tmp0_r; tmp1_l = tmp0_l; tmp0_r += sum0_r; tmp0_l += sum0_l; ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4); tmp1_r -= sum0_r; tmp1_l -= sum0_l; ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4); filter_ptr1 += 8; } /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */ LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); src0 += 16 * buf_pitch; ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_r, src1_r, src2_r, 
src3_r); ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src0_l, src1_l, src2_l, src3_l); LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7); ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src4_r, src5_r, src6_r, src7_r); ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, src4_l, src5_l, src6_l, src7_l); /* loop for all columns of filter constants */ for (i = 0; i < 16; i++) { /* processing single column of constants */ filt0 = LD_SH(filter_ptr0); SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2, filter2, sum0_r, sum0_l, sum0_r, sum0_l); DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l); tmp1_r = sum0_r; tmp1_l = sum0_l; filt0 = LD_SH(filter_ptr0 + 8); SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l); DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2, filter2, sum0_r, sum0_l, sum0_r, sum0_l); DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l); sum0_r += tmp1_r; sum0_l += tmp1_l; LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l); tmp1_r = tmp0_r; tmp1_l = tmp0_l; tmp0_r += sum0_r; tmp0_l += sum0_l; sum1_r = __msa_fill_w(round); SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r); SAT_SW2_SW(tmp0_r, tmp0_l, 15); in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r); ST_SH(in0, (coeffs + i * buf_pitch)); tmp1_r -= sum0_r; tmp1_l -= sum0_l; SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r); SAT_SW2_SW(tmp1_r, tmp1_l, 15); in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r); ST_SH(in0, (coeffs + (31 - i) * buf_pitch)); filter_ptr0 += 16; } }
/*
 * Dequantize an 8x8 block of 16-bit coefficients in place.
 *
 * p_dct          64 coefficients; overwritten with the scaled result.
 * pi_dequant_mf  six tables of 64 32-bit scale factors; row i_qp % 6 is used.
 * i_qp           quantizer. i_qp / 6 - 6 is the shift: when it is >= 0 the
 *                16-bit products are shifted left by it, otherwise the
 *                products are formed in 32 bits, a rounding term of
 *                1 << (-shift - 1) is added and they are shifted right.
 */
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                                 int32_t i_qp )
{
    const int32_t shift = i_qp / 6 - 6;
    int32_t *p_mf = pi_dequant_mf[i_qp % 6];
    v8i16 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 mf0, mf1, mf2, mf3, mf4, mf5, mf6, mf7;
    v4i32 mf8, mf9, mf10, mf11, mf12, mf13, mf14, mf15;

    LD_SH8( p_dct, 8, c0, c1, c2, c3, c4, c5, c6, c7 );

    LD_SW2( p_mf, 4, mf0, mf1 );
    LD_SW2( p_mf + 8, 4, mf2, mf3 );
    LD_SW2( p_mf + 16, 4, mf4, mf5 );
    LD_SW2( p_mf + 24, 4, mf6, mf7 );
    LD_SW2( p_mf + 32, 4, mf8, mf9 );
    LD_SW2( p_mf + 40, 4, mf10, mf11 );
    LD_SW2( p_mf + 48, 4, mf12, mf13 );
    LD_SW2( p_mf + 56, 4, mf14, mf15 );

    if( shift < 0 )
    {
        /* Scale down: multiply and round in 32-bit lanes, then shift right
         * and pack back to 16 bits. */
        const int32_t rshift = -shift;
        v4i32 w0, w1, w2, w3, w4, w5, w6, w7;
        v4i32 w8, w9, w10, w11, w12, w13, w14, w15;
        v4i32 round_vec = __msa_fill_w( 1 << ( rshift - 1 ) );
        v4i32 rshift_vec = __msa_fill_w( rshift );

        UNPCK_SH_SW( c0, w0, w1 );
        UNPCK_SH_SW( c1, w2, w3 );
        UNPCK_SH_SW( c2, w4, w5 );
        UNPCK_SH_SW( c3, w6, w7 );
        UNPCK_SH_SW( c4, w8, w9 );
        UNPCK_SH_SW( c5, w10, w11 );
        UNPCK_SH_SW( c6, w12, w13 );
        UNPCK_SH_SW( c7, w14, w15 );

        w0 = w0 * mf0 + round_vec;
        w1 = w1 * mf1 + round_vec;
        w2 = w2 * mf2 + round_vec;
        w3 = w3 * mf3 + round_vec;
        w4 = w4 * mf4 + round_vec;
        w5 = w5 * mf5 + round_vec;
        w6 = w6 * mf6 + round_vec;
        w7 = w7 * mf7 + round_vec;
        w8 = w8 * mf8 + round_vec;
        w9 = w9 * mf9 + round_vec;
        w10 = w10 * mf10 + round_vec;
        w11 = w11 * mf11 + round_vec;
        w12 = w12 * mf12 + round_vec;
        w13 = w13 * mf13 + round_vec;
        w14 = w14 * mf14 + round_vec;
        w15 = w15 * mf15 + round_vec;

        SRA_4V( w0, w1, w2, w3, rshift_vec );
        SRA_4V( w4, w5, w6, w7, rshift_vec );
        SRA_4V( w8, w9, w10, w11, rshift_vec );
        SRA_4V( w12, w13, w14, w15, rshift_vec );

        PCKEV_H4_SH( w1, w0, w3, w2, w5, w4, w7, w6, c0, c1, c2, c3 );
        PCKEV_H4_SH( w9, w8, w11, w10, w13, w12, w15, w14, c4, c5, c6, c7 );
    }
    else
    {
        /* Scale up: pack the factors to 16 bits and stay in 16-bit lanes. */
        v8i16 mfh0, mfh1, mfh2, mfh3, mfh4, mfh5, mfh6, mfh7;
        v8i16 lshift_vec = __msa_fill_h( shift );

        PCKEV_H4_SH( mf1, mf0, mf3, mf2, mf5, mf4, mf7, mf6,
                     mfh0, mfh1, mfh2, mfh3 );
        PCKEV_H4_SH( mf9, mf8, mf11, mf10, mf13, mf12, mf15, mf14,
                     mfh4, mfh5, mfh6, mfh7 );

        c0 = c0 * mfh0;
        c1 = c1 * mfh1;
        c2 = c2 * mfh2;
        c3 = c3 * mfh3;
        c4 = c4 * mfh4;
        c5 = c5 * mfh5;
        c6 = c6 * mfh6;
        c7 = c7 * mfh7;

        SLLI_4V( c0, c1, c2, c3, lshift_vec );
        SLLI_4V( c4, c5, c6, c7, lshift_vec );
    }

    ST_SH8( c0, c1, c2, c3, c4, c5, c6, c7, p_dct, 8 );
}