static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1; v8i16 zero = { 0 }; v8i16 dct0_mask, dct1_mask; v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3; v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; v4i32 bias0, bias1, bias2, bias3; LD_SH2( p_dct, 8, dct0, dct1 ); LD_SH2( p_bias, 8, bias_h0, bias_h1 ); LD_SH2( p_mf, 8, mf_h0, mf_h1 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 ); ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 ); ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 ); ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); dct0 = zero - dct_h0; dct1 = zero - dct_h1; dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); ST_SH2( dct0, dct1, p_dct, 8 ); return !!non_zero; }
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0, in1, in2, in3; v8i16 temp0, temp1; v8i16 const0, const1; v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; v4i32 out0, out1, out2, out3; v8i16 zero = { 0 }; LD_SH4(input, pitch / 2, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); SLLI_4V(temp0, temp1, in1, in3, 3); in0 = temp0 + temp1; in2 = temp0 - temp1; SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); temp0 = __msa_ilvr_h(in3, in1); in1 = __msa_splati_h(coeff, 3); out0 = (v4i32)__msa_ilvev_h(zero, in1); coeff = __msa_ilvl_h(zero, coeff); out1 = __msa_splati_w((v4i32)coeff, 0); DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1); out0 >>= 12; out1 >>= 12; PCKEV_H2_SH(out0, out0, out1, out1, in1, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); in0 = temp0 + temp1 + 7; in2 = temp0 - temp1 + 7; in0 >>= 4; in2 >>= 4; ILVR_H2_SW(zero, in0, zero, in2, out0, out2); temp1 = RET_1_IF_NZERO_H(in3); ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0); SPLATI_W2_SW(coeff, 2, out3, out1); out3 += out1; out1 = __msa_splati_w((v4i32)coeff, 1); DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3); out1 >>= 16; out3 >>= 16; out1 += (v4i32)temp1; PCKEV_H2_SH(out1, out0, out3, out2, in0, in2); ST_SH2(in0, in2, output, 8); }