Example #1
0
static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask;
    v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 bias0, bias1, bias2, bias3;

    LD_SH2( p_dct, 8, dct0, dct1 );
    LD_SH2( p_bias, 8, bias_h0, bias_h1 );
    LD_SH2( p_mf, 8, mf_h0, mf_h1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 );
    ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 );
    ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 );
    ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 );

    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0,
                                   ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1,
                                   ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}
Example #2
0
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1;
  v8i16 const0, const1;
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v4i32 out0, out1, out2, out3;
  v8i16 zero = { 0 };

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
  temp0 = __msa_ilvr_h(in3, in1);
  in1 = __msa_splati_h(coeff, 3);
  out0 = (v4i32)__msa_ilvev_h(zero, in1);
  coeff = __msa_ilvl_h(zero, coeff);
  out1 = __msa_splati_w((v4i32)coeff, 0);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
  out0 >>= 12;
  out1 >>= 12;
  PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
  temp1 = RET_1_IF_NZERO_H(in3);
  ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
  SPLATI_W2_SW(coeff, 2, out3, out1);
  out3 += out1;
  out1 = __msa_splati_w((v4i32)coeff, 1);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
  out1 >>= 16;
  out3 >>= 16;
  out1 += (v4i32)temp1;
  PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}