/*
 * Dequantize one 4x4 block (16 int16_t coefficients) in place.
 *
 * p_dct         - coefficients; loaded as two 8-lane vectors and written back.
 * pi_dequant_mf - multiplier tables; row (i_qp % 6) supplies the sixteen
 *                 32-bit multipliers used here.
 * i_qp          - quantizer; (i_qp / 6 - 4) is the shift amount.  When it is
 *                 non-negative the 16-bit products are shifted left by it;
 *                 otherwise the products are formed in 32-bit lanes, a
 *                 rounding term of 1 << (-shift - 1) is added and the result
 *                 is shifted right by -shift (SRA_4V).
 */
static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                                 int32_t i_qp )
{
    const int32_t tbl_idx = i_qp % 6;
    const int32_t shift_amt = i_qp / 6 - 4;
    v8i16 coef_a, coef_b;
    v4i32 mult_a, mult_b, mult_c, mult_d;

    LD_SH2( p_dct, 8, coef_a, coef_b );
    LD_SW2( pi_dequant_mf[tbl_idx], 4, mult_a, mult_b );
    LD_SW2( pi_dequant_mf[tbl_idx] + 8, 4, mult_c, mult_d );

    if( shift_amt < 0 )
    {
        /* Down-shift path: work on 32-bit lanes and round before shifting. */
        const int32_t rnd = 1 << ( -shift_amt - 1 );
        v4i32 wide_a, wide_b, wide_c, wide_d;
        v4i32 rnd_vec, shr_vec;

        rnd_vec = __msa_fill_w( rnd );
        shr_vec = __msa_fill_w( -shift_amt );

        UNPCK_SH_SW( coef_a, wide_a, wide_b );
        UNPCK_SH_SW( coef_b, wide_c, wide_d );

        wide_a = wide_a * mult_a + rnd_vec;
        wide_b = wide_b * mult_b + rnd_vec;
        wide_c = wide_c * mult_c + rnd_vec;
        wide_d = wide_d * mult_d + rnd_vec;

        SRA_4V( wide_a, wide_b, wide_c, wide_d, shr_vec );

        /* Narrow the four 32-bit vectors back to two 16-bit vectors. */
        PCKEV_H2_SH( wide_b, wide_a, wide_d, wide_c, coef_a, coef_b );
    }
    else
    {
        /* Up-shift path: everything stays in 16-bit lanes. */
        v8i16 mult_h_a, mult_h_b, shl_vec;

        shl_vec = __msa_fill_h( shift_amt );

        PCKEV_H2_SH( mult_b, mult_a, mult_d, mult_c, mult_h_a, mult_h_b );

        coef_a = ( coef_a * mult_h_a ) << shl_vec;
        coef_b = ( coef_b * mult_h_b ) << shl_vec;
    }

    ST_SH2( coef_a, coef_b, p_dct, 8 );
}
void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0, in1, in2, in3; v8i16 temp0, temp1, tmp0, tmp1; v8i16 const0, const1, const2; v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; v8i16 zero = { 0 }; v4i32 vec0_w, vec1_w, vec2_w, vec3_w; LD_SH4(input, pitch / 2, in0, in1, in2, in3); TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); SLLI_4V(temp0, temp1, in1, in3, 3); in0 = temp0 + temp1; in2 = temp0 - temp1; SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); temp0 = __msa_splati_h(coeff, 3); vec1_w = (v4i32)__msa_ilvev_h(zero, temp0); coeff = __msa_ilvl_h(zero, coeff); vec3_w = __msa_splati_w((v4i32)coeff, 0); ILVRL_H2_SH(in3, in1, tmp1, tmp0); vec0_w = vec1_w; vec2_w = vec3_w; DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, vec1_w, vec2_w, vec3_w); SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12); PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); in0 = temp0 + temp1 + 7; in2 = temp0 - temp1 + 7; in0 >>= 4; in2 >>= 4; SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w); vec3_w += vec1_w; vec1_w = __msa_splati_w((v4i32)coeff, 1); const0 = RET_1_IF_NZERO_H(in3); ILVRL_H2_SH(in3, in1, tmp1, tmp0); vec0_w = vec1_w; vec2_w = vec3_w; DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w, vec1_w, vec2_w, vec3_w); SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16); PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3); in1 += const0; PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1); ST_SH2(temp0, temp1, output, 8); PCKOD_D2_SH(in1, in0, in3, in2, in0, in2); ST_SH2(in0, in2, output + 16, 8); }
// Forward 4x4 transform of the residual between two 4x4 byte blocks.
//
//   src, ref - source and reference pixels; one 32-bit word (4 bytes) is read
//              from each of four rows spaced BPS bytes apart.
//   out      - receives the 16 int16_t coefficients as four 64-bit stores.
//
// The two passes share the same shuffle / add-sub / dot-product structure.
// The first uses rounding terms 1812 and 937 with a shift of 9; the second
// uses 12000 and 51000 with a shift of 16, plus (x + 7) >> 4 on the sum and
// difference terms.
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  // src0/src1 are the vectors INSERT_W4_UB inserts into, so they start from
  // a defined value instead of being passed in while still indeterminate.
  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

  // Gather the two 4x4 blocks, one 32-bit word per row.
  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);

  // Pair up src/ref bytes and reduce each pair to a 16-bit difference.
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);

  // First pass.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  // NOTE(review): SRLI_H is defined outside this view; confirm that its
  // shift direction matches the scaling intended for the sum terms here.
  t0 = SRLI_H(t0, 3);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  FILL_W2_SW(1812, 937, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);

  // Second pass.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
  SRAI_W2_SW(tmp0, tmp2, 4);
  FILL_W2_SW(12000, 51000, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);

  // Add 1 to each tmp1 lane whose matching widened t1 lane is non-zero:
  // ceqi gives all-ones where the lane is zero, nor inverts that mask, and
  // the AND with a vector of ones turns it into a 0/1 increment.
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;

  // Pack to 16 bits and write the result as four doublewords.
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  SD4(out0, out1, out2, out3, out, 8);
}
/*
 * 4x4 Walsh-Hadamard style transform built from two butterfly passes.
 *
 * input  - source samples; four rows are loaded with a row step of
 *          pitch / 2 elements.
 * output - receives 16 int16_t results (two 8-lane stores).
 * pitch  - row pitch of input; halved before being used as the load stride.
 *
 * The arithmetic is carried out in 32-bit lanes.  After the first pass the
 * first row is incremented where the first butterfly term is non-zero; after
 * the second pass every lane gets +1 if negative, then +3, then >> 3.
 */
void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 src_h0, src_h1, src_h2, src_h3;
  v4i32 wide0, wide1, wide2, wide3;
  v4i32 bfly0, bfly1, bfly2, bfly3;

  LD_SH4(input, pitch / 2, src_h0, src_h1, src_h2, src_h3);
  TRANSPOSE4x4_SH_SH(src_h0, src_h1, src_h2, src_h3, src_h0, src_h1, src_h2,
                     src_h3);

  /* Widen the four rows to 32-bit lanes. */
  UNPCK_R_SH_SW(src_h0, wide0);
  UNPCK_R_SH_SW(src_h1, wide1);
  UNPCK_R_SH_SW(src_h2, wide2);
  UNPCK_R_SH_SW(src_h3, wide3);

  /* First pass: butterfly, scale by 4, butterfly again. */
  BUTTERFLY_4(wide0, wide1, wide3, wide2, bfly0, bfly3, bfly2, bfly1);
  SLLI_4V(bfly0, bfly1, bfly2, bfly3, 2);
  BUTTERFLY_4(bfly0, bfly1, bfly2, bfly3, wide0, wide1, wide2, wide3);
  wide0 += RET_1_IF_NZERO_W(bfly0);
  TRANSPOSE4x4_SW_SW(wide0, wide1, wide2, wide3, wide0, wide1, wide2, wide3);

  /* Second pass: same butterflies without the scaling step. */
  BUTTERFLY_4(wide0, wide1, wide3, wide2, bfly0, bfly3, bfly2, bfly1);
  BUTTERFLY_4(bfly0, bfly1, bfly2, bfly3, wide0, wide1, wide2, wide3);

  /* Rounding: +1 for negative lanes, +3 for all lanes, then >> 3. */
  wide0 += RET_1_IF_NEG_W(wide0);
  wide1 += RET_1_IF_NEG_W(wide1);
  wide2 += RET_1_IF_NEG_W(wide2);
  wide3 += RET_1_IF_NEG_W(wide3);
  ADD4(wide0, 3, wide1, 3, wide2, 3, wide3, 3, wide0, wide1, wide2, wide3);
  SRA_4V(wide0, wide1, wide2, wide3, 3);

  PCKEV_H2_SH(wide1, wide0, wide3, wide2, src_h0, src_h1);
  ST_SH2(src_h0, src_h1, output, 8);
}
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0, in1, in2, in3; v8i16 temp0, temp1; v8i16 const0, const1; v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; v4i32 out0, out1, out2, out3; v8i16 zero = { 0 }; LD_SH4(input, pitch / 2, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); SLLI_4V(temp0, temp1, in1, in3, 3); in0 = temp0 + temp1; in2 = temp0 - temp1; SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); temp0 = __msa_ilvr_h(in3, in1); in1 = __msa_splati_h(coeff, 3); out0 = (v4i32)__msa_ilvev_h(zero, in1); coeff = __msa_ilvl_h(zero, coeff); out1 = __msa_splati_w((v4i32)coeff, 0); DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1); out0 >>= 12; out1 >>= 12; PCKEV_H2_SH(out0, out0, out1, out1, in1, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); in0 = temp0 + temp1 + 7; in2 = temp0 - temp1 + 7; in0 >>= 4; in2 >>= 4; ILVR_H2_SW(zero, in0, zero, in2, out0, out2); temp1 = RET_1_IF_NZERO_H(in3); ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0); SPLATI_W2_SW(coeff, 2, out3, out1); out3 += out1; out1 = __msa_splati_w((v4i32)coeff, 1); DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3); out1 >>= 16; out3 >>= 16; out1 += (v4i32)temp1; PCKEV_H2_SH(out1, out0, out3, out2, in0, in2); ST_SH2(in0, in2, output, 8); }
static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1; v8i16 zero = { 0 }; v8i16 dct0_mask, dct1_mask; v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3; v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; v4i32 bias0, bias1, bias2, bias3; LD_SH2( p_dct, 8, dct0, dct1 ); LD_SH2( p_bias, 8, bias_h0, bias_h1 ); LD_SH2( p_mf, 8, mf_h0, mf_h1 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 ); ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 ); ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 ); ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); dct0 = zero - dct_h0; dct1 = zero - dct_h1; dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); ST_SH2( dct0, dct1, p_dct, 8 ); return !!non_zero; }
/*
 * Dequantize the 16 DC coefficients of a 4x4 DC block in place.
 *
 * p_dct         - coefficients, read and overwritten.
 * pi_dequant_mf - multiplier tables; only element [i_qp % 6][0] is used and
 *                 it is applied to every coefficient.
 * i_qp          - quantizer; (i_qp / 6 - 6) is the shift amount.  When it is
 *                 non-negative the multiplier is pre-shifted left and the
 *                 multiply stays in 16-bit lanes; otherwise the products are
 *                 formed in 32-bit lanes, a rounding term of
 *                 1 << (-shift - 1) is added and the result is shifted right
 *                 by -shift (SRA_4V).
 */
static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
                                    int32_t pi_dequant_mf[6][16],
                                    int32_t i_qp )
{
    const int32_t shift_amt = i_qp / 6 - 6;
    const int32_t dc_mult = pi_dequant_mf[i_qp % 6][0];
    v8i16 coef_a, coef_b;

    LD_SH2( p_dct, 8, coef_a, coef_b );

    if( shift_amt < 0 )
    {
        const int32_t rnd = 1 << ( -shift_amt - 1 );
        v4i32 mult_vec, shr_vec, rnd_vec;
        v4i32 wide_a, wide_b, wide_c, wide_d;

        rnd_vec = __msa_fill_w( rnd );
        shr_vec = __msa_fill_w( -shift_amt );
        mult_vec = __msa_fill_w( dc_mult );

        UNPCK_SH_SW( coef_a, wide_a, wide_b );
        UNPCK_SH_SW( coef_b, wide_c, wide_d );

        wide_a = wide_a * mult_vec + rnd_vec;
        wide_b = wide_b * mult_vec + rnd_vec;
        wide_c = wide_c * mult_vec + rnd_vec;
        wide_d = wide_d * mult_vec + rnd_vec;

        SRA_4V( wide_a, wide_b, wide_c, wide_d, shr_vec );
        PCKEV_H2_SH( wide_b, wide_a, wide_d, wide_c, coef_a, coef_b );
    }
    else
    {
        /* Fold the left shift into the scalar multiplier before splatting. */
        v8i16 mult_h = __msa_fill_h( dc_mult << shift_amt );

        coef_a *= mult_h;
        coef_b *= mult_h;
    }

    ST_SH2( coef_a, coef_b, p_dct, 8 );
}
static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1, dct0_mask, dct1_mask; v8i16 zero = { 0 }; v8i16 dct_h0, dct_h1; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3; v4i32 mf_vec, bias_vec; LD_SH2( p_dct, 8, dct0, dct1 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); bias_vec = __msa_fill_w( i_bias ); mf_vec = __msa_fill_w( i_mf ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec ); dct_w0 *= mf_vec; dct_w1 *= mf_vec; dct_w2 *= mf_vec; dct_w3 *= mf_vec; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); dct0 = zero - dct_h0; dct1 = zero - dct_h1; dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); ST_SH2( dct0, dct1, p_dct, 8 ); return !!non_zero; }
/*
 * In-place 4x4 inverse transform for luma blocks.
 *
 * coeffs - 16 int16_t coefficients, read and overwritten.
 *
 * HEVC_IDCT_LUMA4x4_COL is applied twice, first with a final argument of 7
 * and then, after a 4x4 word transpose, with 12.  The 32-bit results are
 * packed back to 16 bits and re-interleaved before the store.
 */
static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
{
    v8i16 blk_lo, blk_hi, pack_a, pack_b;
    v4i32 col_r0, col_l0, col_r1, col_l1;
    v4i32 out_a, out_b, out_c, out_d;

    LD_SH2(coeffs, 8, blk_lo, blk_hi);

    /* Widen the coefficients to 32-bit lanes. */
    UNPCK_SH_SW(blk_lo, col_r0, col_l0);
    UNPCK_SH_SW(blk_hi, col_r1, col_l1);

    /* First application. */
    HEVC_IDCT_LUMA4x4_COL(col_r0, col_l0, col_r1, col_l1,
                          out_a, out_b, out_c, out_d, 7);
    TRANSPOSE4x4_SW_SW(out_a, out_b, out_c, out_d,
                       col_r0, col_l0, col_r1, col_l1);

    /* Second application on the transposed data. */
    HEVC_IDCT_LUMA4x4_COL(col_r0, col_l0, col_r1, col_l1,
                          out_a, out_b, out_c, out_d, 12);

    /* Pack and transpose */
    PCKEV_H2_SH(out_c, out_a, out_d, out_b, pack_a, pack_b);
    ILVRL_H2_SW(pack_b, pack_a, out_a, out_b);
    ILVRL_W2_SH(out_b, out_a, pack_a, pack_b);

    ST_SH2(pack_a, pack_b, coeffs, 8);
}
/*
 * In-place 4x4 inverse transform.
 *
 * coeffs - 16 int16_t coefficients, read and overwritten.
 *
 * The coefficients are interleaved with zero halfwords (not sign-extended)
 * before the first HEVC_IDCT4x4_COL call.  The macro is applied with a final
 * argument of 7, the result is transposed, and it is applied again with 12.
 * The 32-bit results are then packed to 16 bits and re-interleaved.
 */
static void hevc_idct_4x4_msa(int16_t *coeffs)
{
    v8i16 zero_h = { 0 };
    v8i16 blk_lo, blk_hi;
    v4i32 col_r0, col_l0, col_r1, col_l1;
    v4i32 out_a, out_b, out_c, out_d;

    LD_SH2(coeffs, 8, blk_lo, blk_hi);
    ILVRL_H2_SW(zero_h, blk_lo, col_r0, col_l0);
    ILVRL_H2_SW(zero_h, blk_hi, col_r1, col_l1);

    /* First application. */
    HEVC_IDCT4x4_COL(col_r0, col_l0, col_r1, col_l1,
                     out_a, out_b, out_c, out_d, 7);
    TRANSPOSE4x4_SW_SW(out_a, out_b, out_c, out_d,
                       col_r0, col_l0, col_r1, col_l1);

    /* Second application on the transposed data. */
    HEVC_IDCT4x4_COL(col_r0, col_l0, col_r1, col_l1,
                     out_a, out_b, out_c, out_d, 12);

    /* Pack and transpose */
    PCKEV_H2_SH(out_c, out_a, out_d, out_b, blk_lo, blk_hi);
    ILVRL_H2_SW(blk_hi, blk_lo, out_a, out_b);
    ILVRL_W2_SH(out_b, out_a, blk_lo, blk_hi);

    ST_SH2(blk_lo, blk_hi, coeffs, 8);
}
static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { uint32_t row; v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b; v16u8 frame_l, frame_h; v16i8 zero = { 0 }; v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; v8i16 diff0, diff1, cnt0, cnt1; v4i32 const3, const16, filter_wt, strength; v4i32 mod0_w, mod1_w, mod2_w, mod3_w; v4i32 diff0_r, diff0_l, diff1_r, diff1_l; v4i32 frame2_0, frame2_1, frame2_2, frame2_3; v4i32 acc0, acc1, acc2, acc3; filter_wt = __msa_fill_w(filter_wt_in); strength = __msa_fill_w(strength_in); const3 = __msa_ldi_w(3); const16 = __msa_ldi_w(16); for (row = 8; row--;) { frame1_0_b = LD_SB(frame1_ptr); frame2_0_b = LD_SB(frame2_ptr); frame1_ptr += stride; frame2_ptr += 16; frame1_1_b = LD_SB(frame1_ptr); frame2_1_b = LD_SB(frame2_ptr); LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h) ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, 
mod1_h, cnt, 8); cnt += 16; ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); 
acc += 16; frame1_ptr += stride; frame2_ptr += 16; } }
/*
 * Temporal filter accumulation for an 8-pixel-wide block.
 *
 * frame1_ptr   - first frame; rows are `stride` bytes apart.
 * stride       - row stride of frame1_ptr in bytes.
 * frame2_ptr   - second frame; rows are 8 bytes apart.
 * strength_in  - shift amount applied to the scaled squared difference.
 * filter_wt_in - weight multiplied into every modifier.
 * acc          - 32-bit accumulators; 16 are updated per row pair.
 * cnt          - 16-bit counters; 16 are updated per row pair.
 *
 * The loop runs twice.  Each iteration loads four 8-byte rows per frame and
 * packs them two rows to a vector, so 16 pixels are processed per step.
 * Per pixel:
 *   mod  = SRAR(diff * diff * 3, strength)
 *   mod  = (mod < 16) ? 16 - mod : 0
 *   mod *= filter_wt
 *   cnt += mod                      (packed to 16 bits)
 *   acc += mod * frame2 pixel
 */
static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
  uint32_t row;
  uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
  /* Zero-initialised because INSERT_D2_SB inserts into the existing value. */
  v16i8 frame1 = { 0 };
  v16i8 frame2 = { 0 };
  v16i8 frame3 = { 0 };
  v16i8 frame4 = { 0 };
  v16u8 frame_l, frame_h;
  v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
  v8i16 diff0, diff1, cnt0, cnt1;
  v4i32 const3, const16;
  v4i32 filter_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
  v4i32 acc0, acc1, acc2, acc3;
  filter_wt = __msa_fill_w(filter_wt_in);
  strength = __msa_fill_w(strength_in);
  const3 = __msa_ldi_w(3);
  const16 = __msa_ldi_w(16);
  for (row = 2; row--;) {
    /* Load four rows from each frame as 64-bit scalars. */
    LD2(frame1_ptr, stride, f0, f1);
    frame1_ptr += (2 * stride);
    LD2(frame2_ptr, 8, f2, f3);
    frame2_ptr += 16;
    LD2(frame1_ptr, stride, f4, f5);
    frame1_ptr += (2 * stride);
    LD2(frame2_ptr, 8, f6, f7);
    frame2_ptr += 16;
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);
    /* Pack two rows into each vector: frame1/frame3 hold the first frame,
     * frame2/frame4 the second. */
    INSERT_D2_SB(f0, f1, frame1);
    INSERT_D2_SB(f2, f3, frame2);
    INSERT_D2_SB(f4, f5, frame3);
    INSERT_D2_SB(f6, f7, frame4);
    /* ---- first row pair ---- */
    /* Per-pixel difference; its sign does not matter, it is squared next. */
    ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    /* Lane masks for mod < 16, then mod = 16 - mod with other lanes zeroed. */
    diff0_r = (mod0_w < const16);
    diff0_l = (mod1_w < const16);
    diff1_r = (mod2_w < const16);
    diff1_l = (mod3_w < const16);
    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filter_wt,
         mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
    /* Update the counters with the 16-bit packed modifiers. */
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;
    /* Update the accumulators with modifier * frame2 pixel. */
    UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
    acc += 16;
    /* ---- second row pair (same steps on frame3 / frame4) ---- */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);
    ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
    HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
    diff0_r = (mod0_w < const16);
    diff0_l = (mod1_w < const16);
    diff1_r = (mod2_w < const16);
    diff1_l = (mod3_w < const16);
    SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w);
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;
    MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;
    UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
    UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
    UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
    MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w);
    ST_SW2(mod0_w, mod1_w, acc, 4);
    ST_SW2(mod2_w, mod3_w, acc + 8, 4);
    acc += 16;
  }
}
/*
 * Even half of a 32-point forward DCT row pass.
 *
 * input      - 16 vectors of 8 int16_t: eight at input and eight at
 *              input + 64, each spaced 8 elements apart.
 * interm_ptr - scratch buffer of 16 vectors (128 int16_t).  The stage-2
 *              butterfly results are stored here and reloaded later.
 * out        - receives 16 vectors, one at every multiple of 8 elements
 *              from out + 0 to out + 120 (not written in ascending order).
 *
 * The first four output vectors (out + 0 .. out + 24) are computed in 32-bit
 * lanes (DOTP_CONST_PAIR_W / FDCT32_POSTPROC_NEG_W); the remaining twelve
 * are computed in 16-bit lanes (DOTP_CONST_PAIR / FDCT_POSTPROC_2V_NEG_H).
 */
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15);
  /* Keep both butterfly halves; they are reloaded further down. */
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
  /* Stage 3 */
  /* Widen to 32-bit lanes for the first four outputs. */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  /* The _r registers are reused here to hold the sums of the _l halves. */
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r);
  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;
  /* out + 0 and out + 8 */
  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);
  /* out + 16 and out + 24 */
  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, vec4_r,
                    tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);
  /* Reload the first butterfly half; the rest runs in 16-bit lanes. */
  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  /* out + 32 and out + 56 */
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);
  /* out + 40 and out + 48 */
  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);
  /* Reload the second butterfly half. */
  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  /* out + 64 and out + 120 */
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);
  /* out + 72 and out + 112 */
  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);
  /* out + 80 and out + 104 */
  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);
  /* out + 96 and out + 88; the DOTP outputs are in4, in5 here, the reverse
   * of the order used above. */
  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4,
        out + 96);
  ST_SH(in5, out + 88);
}
/*
 * Quantize one 8x8 block (64 int16_t coefficients) in place.
 *
 * p_dct  - coefficients, read and overwritten.
 * p_mf   - 64 per-coefficient multipliers (uint16_t).
 * p_bias - 64 per-coefficient biases (uint16_t).
 *
 * The block is processed as two halves of 32 coefficients.  For each
 * coefficient: (|coef| + bias) * mf >> 16, with the result negated where the
 * input coefficient was <= 0.  Bias and multiplier are widened by
 * interleaving with zero, so they are treated as unsigned.
 *
 * Returns 1 when the horizontal sums over the packed magnitudes of the two
 * halves add up to a non-zero value, 0 otherwise.
 */
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) {
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct2, dct3;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask;
    v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3;
    v8i16 bias_h0, bias_h1, bias_h2, bias_h3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7;
    v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7;
    /* ---- first half: coefficients 0..31 ---- */
    LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 );
    /* All-ones in lanes whose coefficient is <= 0. */
    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );
    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 );
    /* add_a adds absolute values, giving |coef| + bias. */
    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );
    dct_w0 *= mf_vec0;
    dct_w1 *=
        mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;
    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6, dct_h0, dct_h1, dct_h2, dct_h3 );
    /* Negated magnitudes, selected below for the lanes flagged in the masks. */
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 );
    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask );
    non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 );
    /* ---- second half: coefficients 32..63 (same steps) ---- */
    LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 );
    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );
    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 );
    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3,
                            bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );
    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;
    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
    PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 );
    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask );
    /* Accumulate on top of the first half's sum. */
    non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 );
    return !!non_zero;
}