Ejemplo n.º 1
0
// Forward 4x4 transform of (src - ref) using MIPS MSA vector operations.
//   src, ref: inputs; four 32-bit words are loaded from each with LW4, using
//             BPS as the step between loads.
//   out:      receives the result as four 64-bit stores issued by SD4 with a
//             step of 8 (presumably bytes, i.e. 16 contiguous int16_t values;
//             confirm against the SD4 definition).
// The helper macros (LW4, INSERT_W4_UB, ILVRL_B2_UB, HSUB_UB2_SH, VSHF_H2_SH,
// ADDSUB2, DPADD_SH2_SW, ...) are defined outside this function; remarks about
// them below describe their apparent intent only.
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  // src0/src1 are zero-initialized so that INSERT_W4_UB, which is handed the
  // destination vector itself, never operates on an indeterminate value.
  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
  // Halfword shuffle patterns consumed by VSHF_H2_SH.
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  // Multiplier pairs for the dot-product accumulation steps.
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

  // Gather four words of 'src' and four words of 'ref' into one vector each.
  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);
  // Interleave src/ref bytes, then subtract horizontally to obtain 16-bit
  // differences (presumably src - ref per pixel; confirm operand order).
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);

  // First pass: butterfly, then the sum/difference terms and the
  // 2217/5352 dot products with accumulators seeded with 1812 / 937, >> 9.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  // NOTE(review): SRLI_H is defined elsewhere; confirm that the shift it
  // performs matches the scaling intended for these terms.
  t0 = SRLI_H(t0, 3);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  FILL_W2_SW(1812, 937, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);

  // Second pass: same structure; sum/difference terms get +7 then >> 4, the
  // dot products use accumulators seeded with 12000 / 51000 and >> 16.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
  SRAI_W2_SW(tmp0, tmp2, 4);
  FILL_W2_SW(12000, 51000, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);

  // Add 1 to each lane of tmp1 whose corresponding widened t1 lane is
  // non-zero: build an "== 0" mask, invert it, and AND it with a vector of 1s.
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;

  // Pack the 32-bit results back to halfwords and store them as four
  // 64-bit chunks.
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  SD4(out0, out1, out2, out3, out, 8);
}
Ejemplo n.º 2
0
uint32_t vp10_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
  uint32_t sum_out;
  uint32_t src0, src1, src2, src3;
  v16u8 vec = { 0 };
  v8u16 sum0;
  v4u32 sum1;
  v2u64 sum2;

  LW4(src, src_stride, src0, src1, src2, src3);
  INSERT_W4_UB(src0, src1, src2, src3, vec);

  sum0 = __msa_hadd_u_h(vec, vec);
  sum1 = __msa_hadd_u_w(sum0, sum0);
  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
  sum1 = __msa_hadd_u_w(sum0, sum0);
  sum2 = __msa_hadd_u_d(sum1, sum1);
  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
  sum_out = __msa_copy_u_w((v4i32)sum1, 0);

  return sum_out;
}
Ejemplo n.º 3
0
static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
                                       int32_t i_dst_stride )
{
    int16_t i_dc;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    v16u8 pred = { 0 };
    v16i8 out;
    v8i16 input_dc, pred_r, pred_l;

    i_dc = ( p_src[0] + 32 ) >> 6;
    input_dc = __msa_fill_h( i_dc );
    p_src[ 0 ] = 0;

    LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
    INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
    UNPCK_UB_SH( pred, pred_r, pred_l );

    pred_r += input_dc;
    pred_l += input_dc;

    CLIP_SH2_0_255( pred_r, pred_l );
    out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
    ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
}