/*
 * Step 1 of the row pass of the 16-point forward ADST (per the function name).
 *
 * input   - sixteen vectors of eight int16_t; vector k is read from
 *           input + k * 8.
 * const0  - table of 32-bit constants, consumed four int32_t at a time at the
 *           offsets used below (0 .. 4 * 18).
 * int_buf - intermediate output; this step fills the 8-element slots
 *           0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 and 15.
 *
 * NOTE(review): MADD_BF / BUTTERFLY_4 / ST_SH* are project macros defined
 * elsewhere; the slot list above assumes ST_SHn(v..., dst, stride) stores
 * consecutive vectors `stride` elements apart -- confirm against the macros.
 */
static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
                                   int16_t *int_buf) {
  v8i16 row0, row1, row2, row3, row4, row5, row6, row7;
  v8i16 row8, row9, row10, row11, row12, row13, row14, row15;
  v8i16 pa0, pa1, pa2, pa3; /* MADD outputs for rows 15/0/7/8   */
  v8i16 pb0, pb1, pb2, pb3; /* MADD outputs for rows 11/4/3/12  */
  v8i16 pc0, pc1, pc2, pc3; /* second-level MADD of pa/pb odds  */
  v8i16 pd0, pd1, pd2, pd3; /* MADD outputs for rows 9/6/1/14   */
  v8i16 pe0, pe1, pe2, pe3; /* MADD outputs for rows 13/2/5/10  */
  v8i16 bf0, bf1, bf2, bf3; /* butterfly results                */
  v4i32 cf0, cf1, cf2, cf3; /* current constant vectors         */

  /* stage 1: rows 15, 0, 7, 8 */
  LD_SW2(const0, 4, cf0, cf1);
  LD_SW2(const0 + 4 * 2, 4, cf2, cf3);
  row15 = LD_SH(input + 15 * 8);
  row0 = LD_SH(input);
  row7 = LD_SH(input + 7 * 8);
  row8 = LD_SH(input + 8 * 8);
  MADD_BF(row15, row0, row7, row8, cf0, cf1, cf2, cf3, pa0, pa1, pa2, pa3);

  /* stage 1: rows 11, 4, 3, 12 */
  LD_SW2(const0 + 4 * 4, 4, cf0, cf1);
  LD_SW2(const0 + 4 * 6, 4, cf2, cf3);
  row11 = LD_SH(input + 11 * 8);
  row4 = LD_SH(input + 4 * 8);
  row3 = LD_SH(input + 3 * 8);
  row12 = LD_SH(input + 12 * 8);
  MADD_BF(row11, row4, row3, row12, cf0, cf1, cf2, cf3, pb0, pb1, pb2, pb3);

  /* stage 2: butterfly of the even MADD outputs */
  BUTTERFLY_4(pa0, pa2, pb2, pb0, bf0, bf2, bf3, bf1);
  ST_SH2(bf0, bf1, int_buf, 4 * 8);
  ST_SH2(bf2, bf3, int_buf + 8, 4 * 8);

  /* stage 2: MADD of the odd outputs; the first constant is deliberately
   * passed again as the fourth one (only three vectors are loaded). */
  LD_SW2(const0 + 4 * 8, 4, cf0, cf1);
  cf2 = LD_SW(const0 + 4 * 10);
  MADD_BF(pa1, pa3, pb1, pb3, cf0, cf1, cf2, cf0, pc0, pc1, pc2, pc3);
  ST_SH2(pc0, pc3, int_buf + 8 * 8, 4 * 8);
  ST_SH2(pc1, pc2, int_buf + 9 * 8, 4 * 8);

  /* rows 9, 6, 1, 14: only the odd outputs are stored here, the even ones
   * feed the final butterfly below. */
  row9 = LD_SH(input + 9 * 8);
  row6 = LD_SH(input + 6 * 8);
  row1 = LD_SH(input + 8);
  row14 = LD_SH(input + 14 * 8);
  LD_SW2(const0 + 4 * 11, 4, cf0, cf1);
  LD_SW2(const0 + 4 * 13, 4, cf2, cf3);
  MADD_BF(row9, row6, row1, row14, cf0, cf1, cf2, cf3, pd0, pd1, pd2, pd3);
  ST_SH2(pd1, pd3, int_buf + 3 * 8, 4 * 8);

  /* rows 13, 2, 5, 10 */
  row13 = LD_SH(input + 13 * 8);
  row2 = LD_SH(input + 2 * 8);
  row5 = LD_SH(input + 5 * 8);
  row10 = LD_SH(input + 10 * 8);
  LD_SW2(const0 + 4 * 15, 4, cf0, cf1);
  LD_SW2(const0 + 4 * 17, 4, cf2, cf3);
  MADD_BF(row13, row2, row5, row10, cf0, cf1, cf2, cf3, pe0, pe1, pe2, pe3);
  ST_SH2(pe1, pe3, int_buf + 11 * 8, 4 * 8);

  /* final butterfly of the remaining even outputs */
  BUTTERFLY_4(pe0, pe2, pd2, pd0, bf0, bf1, bf2, bf3);
  ST_SH4(bf0, bf1, bf2, bf3, int_buf + 2 * 8, 4 * 8);
}
/*
 * Loads 32 rows (eight int16_t each) of an 8x32 column block, pre-scales them
 * and applies the first butterfly stage, writing 32 vectors to temp_buff.
 *
 * input      - source rows; row r is read from input + r * src_stride.
 * src_stride - distance between source rows, in int16_t elements.
 * temp_buff  - destination; output vector r is written at temp_buff + r * 8.
 *
 * The rows are handled as four mirrored groups of eight:
 *   rows  0..3  with 28..31,  rows  4..7  with 24..27   (first pass)
 *   rows  8..11 with 20..23,  rows 12..15 with 16..19   (second pass)
 *
 * NOTE(review): SLLI_4V(..., 2) is taken to be an in-place shift-left by two
 * and BUTTERFLY_8 a sum/difference butterfly -- both are project macros
 * defined outside this block.
 */
static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
                                              int32_t src_stride,
                                              int16_t *temp_buff) {
  v8i16 lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3;
  v8i16 lo4, lo5, lo6, lo7, hi4, hi5, hi6, hi7;
  v8i16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
  int32_t base;

  /* base == 0 handles the 1st and 2nd set, base == 8 the 3rd and 4th set */
  for (base = 0; base <= 8; base += 8) {
    const int32_t outer_lo = base;      /* 0, then 8   */
    const int32_t outer_hi = 28 - base; /* 28, then 20 */
    const int32_t inner_lo = base + 4;  /* 4, then 12  */
    const int32_t inner_hi = 24 - base; /* 24, then 16 */

    LD_SH4(input + (outer_lo * src_stride), src_stride, lo0, lo1, lo2, lo3);
    LD_SH4(input + (outer_hi * src_stride), src_stride, hi0, hi1, hi2, hi3);
    LD_SH4(input + (inner_lo * src_stride), src_stride, lo4, lo5, lo6, lo7);
    LD_SH4(input + (inner_hi * src_stride), src_stride, hi4, hi5, hi6, hi7);

    SLLI_4V(lo0, lo1, lo2, lo3, 2);
    SLLI_4V(hi0, hi1, hi2, hi3, 2);
    SLLI_4V(lo4, lo5, lo6, lo7, 2);
    SLLI_4V(hi4, hi5, hi6, hi7, 2);

    /* The last four outputs intentionally reuse the hi* inputs in place. */
    BUTTERFLY_8(lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3, sum0, sum1, sum2,
                sum3, hi0, hi1, hi2, hi3);
    BUTTERFLY_8(lo4, lo5, lo6, lo7, hi4, hi5, hi6, hi7, sum4, sum5, sum6,
                sum7, hi4, hi5, hi6, hi7);

    ST_SH4(sum0, sum1, sum2, sum3, temp_buff + (outer_lo * 8), 8);
    ST_SH4(hi0, hi1, hi2, hi3, temp_buff + (outer_hi * 8), 8);
    ST_SH4(sum4, sum5, sum6, sum7, temp_buff + (inner_lo * 8), 8);
    ST_SH4(hi4, hi5, hi6, hi7, temp_buff + (inner_hi * 8), 8);
  }
}
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1, dct2, dct3; v8i16 zero = { 0 }; v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask; v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3; v8i16 bias_h0, bias_h1, bias_h2, bias_h3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7; v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7; LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); dct2_mask = __msa_clei_s_h( dct2, 0 ); dct3_mask = __msa_clei_s_h( dct3, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 ); ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 ); LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); dct_w0 *= mf_vec0; dct_w1 *= 
mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; dct_w4 *= mf_vec4; dct_w5 *= mf_vec5; dct_w6 *= mf_vec6; dct_w7 *= mf_vec7; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6, dct_h0, dct_h1, dct_h2, dct_h3 ); SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 ); dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 ); LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); dct2_mask = __msa_clei_s_h( dct2, 0 ); dct3_mask = __msa_clei_s_h( dct3, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 ); ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 ); LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, 
bias3 ); dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; dct_w4 *= mf_vec4; dct_w5 *= mf_vec5; dct_w6 *= mf_vec6; dct_w7 *= mf_vec7; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 ); SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 ); dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 ); return !!non_zero; }