static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, uint8_t* dst) { v8i16 input0, input1; v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 res0, res1, res2, res3; v16i8 dest0, dest1, dest2, dest3; const v16i8 zero = { 0 }; LD_SH2(in, 8, input0, input1); UNPCK_SH_SW(input0, in0, in1); UNPCK_SH_SW(input1, in2, in3); IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); LD_SB4(ref, BPS, dest0, dest1, dest2, dest3); ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1, res2, res3); ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2, res3); ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); CLIP_SW4_0_255(res0, res1, res2, res3); PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); }
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1, dct2, dct3; v8i16 zero = { 0 }; v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask; v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3; v8i16 bias_h0, bias_h1, bias_h2, bias_h3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7; v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7; LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); dct2_mask = __msa_clei_s_h( dct2, 0 ); dct3_mask = __msa_clei_s_h( dct3, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 ); ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 ); LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; dct_w4 *= mf_vec4; dct_w5 *= mf_vec5; dct_w6 *= mf_vec6; dct_w7 *= mf_vec7; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6, dct_h0, dct_h1, dct_h2, dct_h3 ); SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 ); dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 ); LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); dct2_mask = __msa_clei_s_h( dct2, 0 ); dct3_mask = __msa_clei_s_h( dct3, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 ); ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 ); LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; dct_w4 *= mf_vec4; dct_w5 *= mf_vec5; dct_w6 *= mf_vec6; dct_w7 *= mf_vec7; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 ); SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 ); dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 ); return !!non_zero; }