Exemplos de ST_SH2 em C++ (Cpp)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: quant-c.c Projeto: xing2fan/x264

static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 4;
    v8i16 dct0, dct1;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;

    LD_SH2( p_dct, 8, dct0, dct1 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );

    if( q_bits >= 0 )
    {
        v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_mf_h0, dequant_mf_h1 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct0 <<= q_bits_vec;
        dct1 <<= q_bits_vec;
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}

Exemplo n.º 2

0

Exibir arquivo

Arquivo: dct_msa.c Projeto: Corax26/libvpx

void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1, tmp0, tmp1;
  v8i16 const0, const1, const2;
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v8i16 zero = { 0 };
  v4i32 vec0_w, vec1_w, vec2_w, vec3_w;

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
  temp0 = __msa_splati_h(coeff, 3);
  vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);
  coeff = __msa_ilvl_h(zero, coeff);
  vec3_w = __msa_splati_w((v4i32)coeff, 0);
  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  vec0_w = vec1_w;
  vec2_w = vec3_w;
  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
               vec1_w, vec2_w, vec3_w);
  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);
  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
  TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);
  vec3_w += vec1_w;
  vec1_w = __msa_splati_w((v4i32)coeff, 1);
  const0 = RET_1_IF_NZERO_H(in3);
  ILVRL_H2_SH(in3, in1, tmp1, tmp0);
  vec0_w = vec1_w;
  vec2_w = vec3_w;
  DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2, vec0_w,
               vec1_w, vec2_w, vec3_w);
  SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);
  PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);
  in1 += const0;
  PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);
  ST_SH2(temp0, temp1, output, 8);

  PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output + 16, 8);
}

Exemplo n.º 3

0

Exibir arquivo

Arquivo: quant-c.c Projeto: xing2fan/x264

static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
                                    int32_t pi_dequant_mf[6][16],
                                    int32_t i_qp )
{
    const int32_t q_bits = i_qp / 6 - 6;
    int32_t i_dmf = pi_dequant_mf[i_qp % 6][0];
    v8i16 dct0, dct1, dequant_mf_h;

    LD_SH2( p_dct, 8, dct0, dct1 );

    if( q_bits >= 0 )
    {
        i_dmf <<= q_bits;

        dequant_mf_h = __msa_fill_h( i_dmf );
        dct0 = dct0 * dequant_mf_h;
        dct1 = dct1 * dequant_mf_h;

        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add;
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        dequant_m_f = __msa_fill_w( i_dmf );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f;
        dct_signed_w1 *= dequant_m_f;
        dct_signed_w2 *= dequant_m_f;
        dct_signed_w3 *= dequant_m_f;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}

Exemplo n.º 4

0

Exibir arquivo

Arquivo: fwd_txfm_msa.c Projeto: MIPS/external-libvpx

void vpx_fdct4x4_msa(const int16_t *input, int16_t *output,
                     int32_t src_stride) {
  v8i16 in0, in1, in2, in3;

  LD_SH4(input, src_stride, in0, in1, in2, in3);

  /* fdct4 pre-process */
  {
    v8i16 vec, mask;
    v16i8 zero = { 0 };
    v16i8 one = __msa_ldi_b(1);

    mask = (v8i16)__msa_sldi_b(zero, one, 15);
    SLLI_4V(in0, in1, in2, in3, 4);
    vec = __msa_ceqi_h(in0, 0);
    vec = vec ^ 255;
    vec = mask & vec;
    in0 += vec;
  }

  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  SRA_4V(in0, in1, in2, in3, 2);
  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}

Exemplo n.º 5

0

Exibir arquivo

Arquivo: enc_msa.c Projeto: PixpieCo/WebP

static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      v8i16 coeff0, coeff1;
      const v8i16 zero = { 0 };
      const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
      LD_SH2(&out[0], 8, coeff0, coeff1);
      coeff0 = __msa_add_a_h(coeff0, zero);
      coeff1 = __msa_add_a_h(coeff1, zero);
      SRAI_H2_SH(coeff0, coeff1, 3);
      coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
      coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
      ST_SH2(coeff0, coeff1, &out[0], 8);
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}

Exemplo n.º 6

0

Exibir arquivo

Arquivo: dct_msa.c Projeto: Corax26/libvpx

void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0_h, in1_h, in2_h, in3_h;
  v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;

  LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);
  TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);

  UNPCK_R_SH_SW(in0_h, in0_w);
  UNPCK_R_SH_SW(in1_h, in1_w);
  UNPCK_R_SH_SW(in2_h, in2_w);
  UNPCK_R_SH_SW(in3_h, in3_w);
  BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
  SLLI_4V(temp0, temp1, temp2, temp3, 2);
  BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
  temp0 = RET_1_IF_NZERO_W(temp0);
  in0_w += temp0;
  TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);

  BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);
  BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);
  in0_w += RET_1_IF_NEG_W(in0_w);
  in1_w += RET_1_IF_NEG_W(in1_w);
  in2_w += RET_1_IF_NEG_W(in2_w);
  in3_w += RET_1_IF_NEG_W(in3_w);
  ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);
  SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);
  PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);
  ST_SH2(in0_h, in1_h, output, 8);
}

Exemplo n.º 7

0

Exibir arquivo

Arquivo: dct-c.c Projeto: 0x0B501E7E/x264

static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
                              int32_t i_src_stride )
{
    v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
    v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
    v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
    v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 hor_res0, hor_res3, hor_res2, hor_res1 );
    TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
                        hor_res0, hor_res1, hor_res2, hor_res3 );
    BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
    SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
    PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
                 ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
                 ver_res0, ver_res1, ver_res2, ver_res3 );
    PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
    ST_SH2( ver_res0, ver_res2, p_dst, 8 );
}

Exemplo n.º 8

0

Exibir arquivo

Arquivo: dct-c.c Projeto: 0x0B501E7E/x264

static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
                                           int16_t pi_level[16] )
{
    v8i16 src0, src1;
    v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
    v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };

    LD_SH2( pi_dct, 8, src0, src1 );
    VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 );
    ST_SH2( mask0, mask1, pi_level, 8 );
}

Exemplo n.º 9

0

Exibir arquivo

Arquivo: hevc_idct_msa.c Projeto: DeHackEd/FFmpeg

static void hevc_idct_dc_4x4_msa(int16_t *coeffs)
{
    int32_t val;
    v8i16 dst;

    val = (coeffs[0] + 1) >> 1;
    val = (val + 32) >> 6;
    dst = __msa_fill_h(val);

    ST_SH2(dst, dst, coeffs, 8);
}

Exemplo n.º 10

0

Exibir arquivo

Arquivo: quant-c.c Projeto: xing2fan/x264

static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask;
    v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 bias0, bias1, bias2, bias3;

    LD_SH2( p_dct, 8, dct0, dct1 );
    LD_SH2( p_bias, 8, bias_h0, bias_h1 );
    LD_SH2( p_mf, 8, mf_h0, mf_h1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 );
    ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 );
    ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 );
    ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 );

    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0,
                                   ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1,
                                   ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}

Exemplo n.º 11

0

Exibir arquivo

Arquivo: vp9_fdct4x4_msa.c Projeto: krunalsoni01/src

void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride,
                    int32_t tx_type) {
  v8i16 in0, in1, in2, in3;

  LD_SH4(input, stride, in0, in1, in2, in3);

  /* fdct4 pre-process */
  {
    v8i16 temp, mask;
    v16i8 zero = { 0 };
    v16i8 one = __msa_ldi_b(1);

    mask = (v8i16)__msa_sldi_b(zero, one, 15);
    SLLI_4V(in0, in1, in2, in3, 4);
    temp = __msa_ceqi_h(in0, 0);
    temp = (v8i16)__msa_xori_b((v16u8)temp, 255);
    temp = mask & temp;
    in0 += temp;
  }

  switch (tx_type) {
    case DCT_DCT:
      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
      break;
    case ADST_DCT:
      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
      break;
    case DCT_ADST:
      VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
      break;
    case ADST_ADST:
      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
      TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
      VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
      break;
    default:
      assert(0);
      break;
  }

  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
  ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
  SRA_4V(in0, in1, in2, in3, 2);
  PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}

Exemplo n.º 12

0

Exibir arquivo

Arquivo: fdct16x16_msa.c Projeto: nwgat/Mirror-aom

static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
                                   int16_t *int_buf) {
  v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
  v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
  v4i32 k0, k1, k2, k3;

  /* load input data */
  r0 = LD_SH(input);
  r7 = LD_SH(input + 7 * 8);
  r8 = LD_SH(input + 8 * 8);
  r15 = LD_SH(input + 15 * 8);

  /* stage 1 */
  LD_SW2(const0, 4, k0, k1);
  LD_SW2(const0 + 4 * 2, 4, k2, k3);
  MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);

  r3 = LD_SH(input + 3 * 8);
  r4 = LD_SH(input + 4 * 8);
  r11 = LD_SH(input + 11 * 8);
  r12 = LD_SH(input + 12 * 8);

  LD_SW2(const0 + 4 * 4, 4, k0, k1);
  LD_SW2(const0 + 4 * 6, 4, k2, k3);
  MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);

  /* stage 2 */
  BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
  ST_SH2(tp0, tp1, int_buf, 4 * 8);
  ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);

  LD_SW2(const0 + 4 * 8, 4, k0, k1);
  k2 = LD_SW(const0 + 4 * 10);
  MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
  ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
  ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);

  r1 = LD_SH(input + 8);
  r6 = LD_SH(input + 6 * 8);
  r9 = LD_SH(input + 9 * 8);
  r14 = LD_SH(input + 14 * 8);

  LD_SW2(const0 + 4 * 11, 4, k0, k1);
  LD_SW2(const0 + 4 * 13, 4, k2, k3);
  MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
  ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);

  r2 = LD_SH(input + 2 * 8);
  r5 = LD_SH(input + 5 * 8);
  r10 = LD_SH(input + 10 * 8);
  r13 = LD_SH(input + 13 * 8);

  LD_SW2(const0 + 4 * 15, 4, k0, k1);
  LD_SW2(const0 + 4 * 17, 4, k2, k3);
  MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
  ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
  BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
  ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
}

Exemplo n.º 13

0

Exibir arquivo

Arquivo: quant-c.c Projeto: xing2fan/x264

static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf,
                                     int32_t i_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct0_mask, dct1_mask;
    v8i16 zero = { 0 };
    v8i16 dct_h0, dct_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec, bias_vec;

    LD_SH2( p_dct, 8, dct0, dct1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

    bias_vec = __msa_fill_w( i_bias );
    mf_vec = __msa_fill_w( i_mf );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec );

    dct_w0 *= mf_vec;
    dct_w1 *= mf_vec;
    dct_w2 *= mf_vec;
    dct_w3 *= mf_vec;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;
    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );

    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}

Exemplo n.º 14

0

Exibir arquivo

Arquivo: dct-c.c Projeto: 0x0B501E7E/x264

static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                    int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v8i16 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v8i16 zeros = { 0 };

    LD4x4_SH( p_src, src0, src1, src2, src3 );
    AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 );
    TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 );
    SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 );
    ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride );
    ST_SH2( zeros, zeros, p_src, 8 );
}

Exemplo n.º 15

0

Exibir arquivo

Arquivo: dct_msa.c Projeto: Corax26/libvpx

void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1;
  v8i16 const0, const1;
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v4i32 out0, out1, out2, out3;
  v8i16 zero = { 0 };

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
  temp0 = __msa_ilvr_h(in3, in1);
  in1 = __msa_splati_h(coeff, 3);
  out0 = (v4i32)__msa_ilvev_h(zero, in1);
  coeff = __msa_ilvl_h(zero, coeff);
  out1 = __msa_splati_w((v4i32)coeff, 0);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
  out0 >>= 12;
  out1 >>= 12;
  PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
  temp1 = RET_1_IF_NZERO_H(in3);
  ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
  SPLATI_W2_SW(coeff, 2, out3, out1);
  out3 += out1;
  out1 = __msa_splati_w((v4i32)coeff, 1);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
  out1 >>= 16;
  out3 >>= 16;
  out1 += (v4i32)temp1;
  PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}

Exemplo n.º 16

0

Exibir arquivo

Arquivo: hevc_idct_msa.c Projeto: DeHackEd/FFmpeg

static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
{
    v8i16 in0, in1, dst0, dst1;
    v4i32 in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3;

    LD_SH2(coeffs, 8, in0, in1);
    UNPCK_SH_SW(in0, in_r0, in_l0);
    UNPCK_SH_SW(in1, in_r1, in_l1);
    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
                          7);
    TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, in_r0, in_l0, in_r1, in_l1);
    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
                          12);

    /* Pack and transpose */
    PCKEV_H2_SH(res2, res0, res3, res1, dst0, dst1);
    ILVRL_H2_SW(dst1, dst0, res0, res1);
    ILVRL_W2_SH(res1, res0, dst0, dst1);

    ST_SH2(dst0, dst1, coeffs, 8);
}

Exemplo n.º 17

0

Exibir arquivo

Arquivo: hevc_idct_msa.c Projeto: DeHackEd/FFmpeg

static void hevc_idct_4x4_msa(int16_t *coeffs)
{
    v8i16 in0, in1;
    v4i32 in_r0, in_l0, in_r1, in_l1;
    v4i32 sum0, sum1, sum2, sum3;
    v8i16 zeros = { 0 };

    LD_SH2(coeffs, 8, in0, in1);
    ILVRL_H2_SW(zeros, in0, in_r0, in_l0);
    ILVRL_H2_SW(zeros, in1, in_r1, in_l1);

    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
    TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);

    /* Pack and transpose */
    PCKEV_H2_SH(sum2, sum0, sum3, sum1, in0, in1);
    ILVRL_H2_SW(in1, in0, sum0, sum1);
    ILVRL_W2_SH(sum1, sum0, in0, in1);

    ST_SH2(in0, in1, coeffs, 8);
}

Exemplo n.º 18

0

Exibir arquivo

Arquivo: enc_msa.c Projeto: garrettmoon/libwebp

static void FTransformWHT(const int16_t* in, int16_t* out) {
  v8i16 in0 = { 0 };
  v8i16 in1 = { 0 };
  v8i16 tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1;
  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };

  in0 = __msa_insert_h(in0, 0, in[  0]);
  in0 = __msa_insert_h(in0, 1, in[ 64]);
  in0 = __msa_insert_h(in0, 2, in[128]);
  in0 = __msa_insert_h(in0, 3, in[192]);
  in0 = __msa_insert_h(in0, 4, in[ 16]);
  in0 = __msa_insert_h(in0, 5, in[ 80]);
  in0 = __msa_insert_h(in0, 6, in[144]);
  in0 = __msa_insert_h(in0, 7, in[208]);
  in1 = __msa_insert_h(in1, 0, in[ 48]);
  in1 = __msa_insert_h(in1, 1, in[112]);
  in1 = __msa_insert_h(in1, 2, in[176]);
  in1 = __msa_insert_h(in1, 3, in[240]);
  in1 = __msa_insert_h(in1, 4, in[ 32]);
  in1 = __msa_insert_h(in1, 5, in[ 96]);
  in1 = __msa_insert_h(in1, 6, in[160]);
  in1 = __msa_insert_h(in1, 7, in[224]);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, out0, out1);
  SRAI_H2_SH(out0, out1, 1);
  ST_SH2(out0, out1, out, 8);
}

Exemplo n.º 19

0

Exibir arquivo

Arquivo: temporal_filter_msa.c Projeto: ALEJANDROJ19/VTW-server

static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr,
                                            uint32_t stride,
                                            uint8_t *frame2_ptr,
                                            int32_t strength_in,
                                            int32_t filter_wt_in,
                                            uint32_t *acc, uint16_t *cnt)
{
    uint32_t row;
    uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
    v16i8 frame1 = { 0 };
    v16i8 frame2 = { 0 };
    v16i8 frame3 = { 0 };
    v16i8 frame4 = { 0 };
    v16u8 frame_l, frame_h;
    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
    v8i16 diff0, diff1, cnt0, cnt1;
    v4i32 const3, const16;
    v4i32 filter_wt, strength;
    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
    v4i32 acc0, acc1, acc2, acc3;

    filter_wt = __msa_fill_w(filter_wt_in);
    strength = __msa_fill_w(strength_in);
    const3 = __msa_ldi_w(3);
    const16 = __msa_ldi_w(16);

    for (row = 2; row--;)
    {
        LD2(frame1_ptr, stride, f0, f1);
        frame1_ptr += (2 * stride);
        LD2(frame2_ptr, 8, f2, f3);
        frame2_ptr += 16;
        LD2(frame1_ptr, stride, f4, f5);
        frame1_ptr += (2 * stride);
        LD2(frame2_ptr, 8, f6, f7);
        frame2_ptr += 16;

        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        INSERT_D2_SB(f0, f1, frame1);
        INSERT_D2_SB(f2, f3, frame2);
        INSERT_D2_SB(f4, f5, frame3);
        INSERT_D2_SB(f6, f7, frame4);
        ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;

        UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;

        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;

        UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
    }
}

Exemplo n.º 20

0

Exibir arquivo

Arquivo: temporal_filter_msa.c Projeto: ALEJANDROJ19/VTW-server

static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr,
                                             uint32_t stride,
                                             uint8_t *frame2_ptr,
                                             int32_t strength_in,
                                             int32_t filter_wt_in,
                                             uint32_t *acc, uint16_t *cnt)
{
    uint32_t row;
    v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
    v16u8 frame_l, frame_h;
    v16i8 zero = { 0 };
    v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
    v8i16 diff0, diff1, cnt0, cnt1;
    v4i32 const3, const16, filter_wt, strength;
    v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
    v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
    v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
    v4i32 acc0, acc1, acc2, acc3;

    filter_wt = __msa_fill_w(filter_wt_in);
    strength = __msa_fill_w(strength_in);
    const3 = __msa_ldi_w(3);
    const16 = __msa_ldi_w(16);

    for (row = 8; row--;)
    {
        frame1_0_b = LD_SB(frame1_ptr);
        frame2_0_b = LD_SB(frame2_ptr);
        frame1_ptr += stride;
        frame2_ptr += 16;
        frame1_1_b = LD_SB(frame1_ptr);
        frame2_1_b = LD_SB(frame2_ptr);
        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h)
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;
        ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
        LD_SW2(acc, 4, acc0, acc1);
        LD_SW2(acc + 8, 4, acc2, acc3);
        LD_SH2(cnt, 8, cnt0, cnt1);
        ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
        HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
        UNPCK_SH_SW(diff0, diff0_r, diff0_l);
        UNPCK_SH_SW(diff1, diff1_r, diff1_l);
        MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l,
             diff1_l, mod0_w, mod1_w, mod2_w, mod3_w);
        MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
        diff0_r = (mod0_w < const16);
        diff0_l = (mod1_w < const16);
        diff1_r = (mod2_w < const16);
        diff1_l = (mod3_w < const16);
        SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
             mod0_w, mod1_w, mod2_w, mod3_w);
        mod0_w = diff0_r & mod0_w;
        mod1_w = diff0_l & mod1_w;
        mod2_w = diff1_r & mod2_w;
        mod3_w = diff1_l & mod3_w;
        MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
             filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
        PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
        ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
        ST_SH2(mod0_h, mod1_h, cnt, 8);
        cnt += 16;

        UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
        UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
        UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
        MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w,
             frame2_3, mod0_w, mod1_w, mod2_w, mod3_w);
        ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3,
             mod0_w, mod1_w, mod2_w, mod3_w);
        ST_SW2(mod0_w, mod1_w, acc, 4);
        ST_SW2(mod2_w, mod3_w, acc + 8, 4);
        acc += 16;
        frame1_ptr += stride;
        frame2_ptr += 16;
    }
}

Exemplo n.º 21

0

Exibir arquivo

Arquivo: fwd_dct32x32_msa.c Projeto: ALEJANDROJ19/VTW-server

static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
                                    int16_t *out) {
  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
  v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;

  /* fdct32 even */
  /* stage 2 */
  LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);

  BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
               in8, in9, in10, in11, in12, in13, in14, in15,
               vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
               in8, in9, in10, in11, in12, in13, in14, in15);
  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
  ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);

  /* Stage 3 */
  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
  ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
       tmp0_w, tmp1_w, tmp2_w, tmp3_w);
  BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
  ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
       vec0_r, vec1_r, vec2_r, vec3_r);

  tmp3_w = vec0_r + vec3_r;
  vec0_r = vec0_r - vec3_r;
  vec3_r = vec1_r + vec2_r;
  vec1_r = vec1_r - vec2_r;

  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
                    cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out, 8);

  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
                    cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
  FDCT32_POSTPROC_NEG_W(vec4_r);
  FDCT32_POSTPROC_NEG_W(tmp3_w);
  FDCT32_POSTPROC_NEG_W(vec6_r);
  FDCT32_POSTPROC_NEG_W(vec3_r);
  PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
  ST_SH2(vec5, vec4, out + 16, 8);

  LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
  SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
  ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 32);
  ST_SH(in5, out + 56);

  SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 40);
  ST_SH(in5, out + 48);

  LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
  ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
  ADD2(in0, in1, in2, in3, vec0, vec7);
  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 64);
  ST_SH(in5, out + 120);

  SUB2(in0, in1, in2, in3, in0, in2);
  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 72);
  ST_SH(in5, out + 112);

  SUB2(in9, vec2, in14, vec5, vec2, vec5);
  DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
  SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 80);
  ST_SH(in5, out + 104);

  ADD2(in3, in2, in0, in1, vec3, vec4);
  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
  FDCT_POSTPROC_2V_NEG_H(in4, in5);
  ST_SH(in4, out + 96);
  ST_SH(in5, out + 88);
}