コード例 #1
0
/*
 * Error measure between one block of coefficients and its dequantized
 * counterpart, computed with MIPS MSA vector operations.
 *
 * coeff_ptr    - start of the source coefficients.
 * dq_coeff_ptr - start of the dequantized coefficients.
 *
 * Two passes are made; each pass loads one v8i16 from each pointer and then
 * steps both pointers forward by 8 int16_t elements, so 16 elements per
 * input are consumed in total.
 *
 * Returns the two 64-bit accumulators, reduced across their lanes and
 * narrowed to int32_t.
 *
 * NOTE(review): LD_SH, ILVRL_H2_SH, HSUB_UH2_SW and DPADD_SD2_SD are defined
 * outside this view. Judging by their names they load, interleave,
 * pairwise-subtract and multiply-accumulate, which would make the result a
 * sum of squared differences -- confirm against the macro header.
 */
int32_t vp8_block_error_msa(int16_t *coeff_ptr, int16_t *dq_coeff_ptr)
{
    int16_t *src = coeff_ptr;
    int16_t *deq = dq_coeff_ptr;
    int32_t total;
    uint32_t pass;
    v8i16 src_vec, deq_vec, mix_lo, mix_hi;
    v4i32 delta_lo, delta_hi;
    v2i64 acc_lo = { 0 };
    v2i64 acc_hi = { 0 };

    for (pass = 0; pass < 2; ++pass)
    {
        src_vec = LD_SH(src);
        deq_vec = LD_SH(deq);
        src += 8;
        deq += 8;

        ILVRL_H2_SH(src_vec, deq_vec, mix_lo, mix_hi);
        HSUB_UH2_SW(mix_lo, mix_hi, delta_lo, delta_hi);
        DPADD_SD2_SD(delta_lo, delta_hi, acc_lo, acc_hi);
    }

    /* Fold the upper 64-bit lane of each accumulator onto lane 0. */
    acc_lo += __msa_splati_d(acc_lo, 1);
    acc_hi += __msa_splati_d(acc_hi, 1);

    total = __msa_copy_s_d(acc_lo, 0);
    total += __msa_copy_s_d(acc_hi, 0);

    return total;
}
コード例 #2
0
ファイル: enc_msa.c プロジェクト: garrettmoon/libwebp
// MSA implementation of the forward transform of a 4x4 residual block.
//
// src: source pixels; four 32-bit words are read, one per row, rows BPS apart.
// ref: reference pixels, read with the same layout as 'src'.
// out: receives the result as four 64-bit stores issued through SD4 with a
//      stride argument of 8.
//
// NOTE(review): the helper macros (LW4, INSERT_W4_UB, ILVRL_B2_UB,
// HSUB_UB2_SH, VSHF_H2_SH, ADDSUB2, DPADD_SH2_SW, ...) and BPS are defined
// outside this view. The stage descriptions below follow their names and the
// constants used, which appear to be those of the scalar VP8 forward
// transform -- confirm against the macro header and the C reference.
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  // src0/src1 are zero-initialized because INSERT_W4_UB receives them as its
  // destination. Assuming it inserts the words lane by lane into the existing
  // vector, an uninitialized destination would be read before being fully
  // written.
  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

  // Gather the four rows of 'src' and of 'ref' into one vector each.
  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);
  // Interleave source and reference bytes and turn them into 16-bit
  // differences (presumably src - ref; depends on HSUB_UB2_SH).
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);
  // First pass: shuffle, butterfly add/sub, then the rounded multiply stage
  // using rounding terms 1812 / 937 and a right shift by 9.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  t0 = SRLI_H(t0, 3);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  FILL_W2_SW(1812, 937, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  // Second pass: same shape, with +7 then >> 4 on the add/sub terms and
  // rounding terms 12000 / 51000 then >> 16 on the multiply terms.
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
  SRAI_W2_SW(tmp0, tmp2, 4);
  FILL_W2_SW(12000, 51000, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);
  // Add 1 to every lane of tmp1 whose matching lane in tmp4 is non-zero:
  // ceqi gives all-ones where tmp4 == 0, nor inverts that, and the AND with
  // a vector of 1s leaves 1 exactly in the non-zero lanes.
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;
  // Pack the 32-bit results back to 16 bits and store both vectors as four
  // 64-bit halves.
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  SD4(out0, out1, out2, out3, out, 8);
}
コード例 #3
0
/*
 * Accumulated coefficient error over mb->block[0..15] and the matching
 * mb->e_mbd.block[0..15], computed with MIPS MSA vector operations.
 *
 * mb - macroblock whose block[i].coeff and e_mbd.block[i].dqcoeff are read.
 * dc - when equal to 1, the first 32-bit lane of the first difference
 *      vector of every block is cleared before it is accumulated
 *      (presumably to leave the DC coefficient out -- confirm).
 *
 * Blocks are handled two per iteration; each block contributes two v8i16
 * loads per input (16 elements). Per-block sums are reduced from 64-bit
 * lanes and added into an int32_t total, which is returned.
 *
 * NOTE(review): LD_SH, ILVRL_H2_SH, HSUB_UH2_SW, DOTP_SW2_SD and
 * DPADD_SD2_SD are defined outside this view; by name they yield a sum of
 * squared differences -- confirm against the macro header.
 */
int32_t vp8_mbblock_error_msa(MACROBLOCK *mb, int32_t dc)
{
    v16u8 all_zero = { 0 };
    v16u8 lane_mask = (v16u8)__msa_ldi_b(255);
    int32_t total = 0;
    uint32_t pair;

    if (dc == 1)
    {
        /* Clear word 0 of the mask so that lane is dropped below. */
        lane_mask = (v16u8)__msa_insve_w((v4i32)lane_mask, 0,
                                         (v4i32)all_zero);
    }

    for (pair = 0; pair < 8; ++pair)
    {
        const uint32_t even_idx = 2 * pair;
        const uint32_t odd_idx = even_idx + 1;
        int16_t *src;
        int16_t *deq;
        v8i16 src_e0, src_e1, src_o0, src_o1;
        v8i16 deq_e0, deq_e1, deq_o0, deq_o1;
        v8i16 mix_lo, mix_hi;
        v4i32 delta_lo, delta_hi;
        v2i64 sum_lo, sum_hi;

        /* Load both halves of the even-indexed block. */
        src = mb->block[even_idx].coeff;
        deq = mb->e_mbd.block[even_idx].dqcoeff;
        src_e0 = LD_SH(src);
        deq_e0 = LD_SH(deq);
        src += 8;
        deq += 8;
        src_e1 = LD_SH(src);
        deq_e1 = LD_SH(deq);

        /* Load both halves of the odd-indexed block. */
        src = mb->block[odd_idx].coeff;
        deq = mb->e_mbd.block[odd_idx].dqcoeff;
        src_o0 = LD_SH(src);
        deq_o0 = LD_SH(deq);
        src += 8;
        deq += 8;
        src_o1 = LD_SH(src);
        deq_o1 = LD_SH(deq);

        /* Even block: first half (masked), then second half accumulated. */
        ILVRL_H2_SH(src_e0, deq_e0, mix_lo, mix_hi);
        HSUB_UH2_SW(mix_lo, mix_hi, delta_lo, delta_hi);
        delta_lo = (v4i32)__msa_bmnz_v(all_zero, (v16u8)delta_lo, lane_mask);
        DOTP_SW2_SD(delta_lo, delta_hi, delta_lo, delta_hi, sum_lo, sum_hi);
        ILVRL_H2_SH(src_e1, deq_e1, mix_lo, mix_hi);
        HSUB_UH2_SW(mix_lo, mix_hi, delta_lo, delta_hi);
        DPADD_SD2_SD(delta_lo, delta_hi, sum_lo, sum_hi);
        sum_lo += __msa_splati_d(sum_lo, 1);
        sum_hi += __msa_splati_d(sum_hi, 1);
        total += __msa_copy_s_d(sum_lo, 0);
        total += __msa_copy_s_d(sum_hi, 0);

        /* Odd block: identical sequence. */
        ILVRL_H2_SH(src_o0, deq_o0, mix_lo, mix_hi);
        HSUB_UH2_SW(mix_lo, mix_hi, delta_lo, delta_hi);
        delta_lo = (v4i32)__msa_bmnz_v(all_zero, (v16u8)delta_lo, lane_mask);
        DOTP_SW2_SD(delta_lo, delta_hi, delta_lo, delta_hi, sum_lo, sum_hi);
        ILVRL_H2_SH(src_o1, deq_o1, mix_lo, mix_hi);
        HSUB_UH2_SW(mix_lo, mix_hi, delta_lo, delta_hi);
        DPADD_SD2_SD(delta_lo, delta_hi, sum_lo, sum_hi);
        sum_lo += __msa_splati_d(sum_lo, 1);
        sum_hi += __msa_splati_d(sum_hi, 1);
        total += __msa_copy_s_d(sum_lo, 0);
        total += __msa_copy_s_d(sum_hi, 0);
    }

    return total;
}
コード例 #4
0
/*
 * Accumulated coefficient error over mb->block[16..23] and the matching
 * mb->e_mbd.block[16..23], computed with MIPS MSA vector operations.
 *
 * mb - macroblock whose block[i].coeff and e_mbd.block[i].dqcoeff are read.
 *
 * Blocks are handled two per iteration (four iterations); each block
 * contributes two v8i16 loads per input (16 elements). Per-block sums are
 * reduced from 64-bit lanes and added into an int32_t total, which is
 * returned.
 *
 * NOTE(review): indices 16..23 are presumably the chroma (U/V) blocks, as
 * the function name suggests -- confirm. LD_SH, ILVRL_H2_SH, HSUB_UH2_SW,
 * DOTP_SW2_SD, DPADD_SD2_SD and ADD2 are defined outside this view; by name
 * they yield a sum of squared differences -- confirm against the macro
 * header.
 */
int32_t vp8_mbuverror_msa(MACROBLOCK *mb)
{
    int32_t total = 0;
    uint32_t pair;

    for (pair = 0; pair < 4; ++pair)
    {
        const uint32_t first_idx = 16 + 2 * pair;
        const uint32_t second_idx = first_idx + 1;
        int16_t *src;
        int16_t *deq;
        v8i16 src_a0, src_a1, src_b0, src_b1;
        v8i16 deq_a0, deq_a1, deq_b0, deq_b1;
        v8i16 mix_lo, mix_hi;
        v4i32 delta_lo, delta_hi;
        v2i64 sum_lo, sum_hi, hi_lane_lo, hi_lane_hi;

        /* Load both halves of the first block of the pair. */
        src = mb->block[first_idx].coeff;
        deq = mb->e_mbd.block[first_idx].dqcoeff;
        src_a0 = LD_SH(src);
        deq_a0 = LD_SH(deq);
        src += 8;
        deq += 8;
        src_a1 = LD_SH(src);
        deq_a1 = LD_SH(deq);

        /* Load both halves of the second block of the pair. */
        src = mb->block[second_idx].coeff;
        deq = mb->e_mbd.block[second_idx].dqcoeff;
        src_b0 = LD_SH(src);
        deq_b0 = LD_SH(deq);
        src += 8;
        deq += 8;
        src_b1 = LD_SH(src);
        deq_b1 = LD_SH(deq);

        /* First block: start the sums, accumulate the second half, reduce. */
        ILVRL_H2_SH(src_a0, deq_a0, mix_lo, mix_hi);
        HSUB_UH2_SW(mix_lo, mix_hi, delta_lo, delta_hi);
        DOTP_SW2_SD(delta_lo, delta_hi, delta_lo, delta_hi, sum_lo, sum_hi);
        ILVRL_H2_SH(src_a1, deq_a1, mix_lo, mix_hi);
        HSUB_UH2_SW(mix_lo, mix_hi, delta_lo, delta_hi);
        DPADD_SD2_SD(delta_lo, delta_hi, sum_lo, sum_hi);
        hi_lane_lo = __msa_splati_d(sum_lo, 1);
        hi_lane_hi = __msa_splati_d(sum_hi, 1);
        ADD2(sum_lo, hi_lane_lo, sum_hi, hi_lane_hi, sum_lo, sum_hi);
        total += __msa_copy_s_d(sum_lo, 0);
        total += __msa_copy_s_d(sum_hi, 0);

        /* Second block: identical sequence. */
        ILVRL_H2_SH(src_b0, deq_b0, mix_lo, mix_hi);
        HSUB_UH2_SW(mix_lo, mix_hi, delta_lo, delta_hi);
        DOTP_SW2_SD(delta_lo, delta_hi, delta_lo, delta_hi, sum_lo, sum_hi);
        ILVRL_H2_SH(src_b1, deq_b1, mix_lo, mix_hi);
        HSUB_UH2_SW(mix_lo, mix_hi, delta_lo, delta_hi);
        DPADD_SD2_SD(delta_lo, delta_hi, sum_lo, sum_hi);
        hi_lane_lo = __msa_splati_d(sum_lo, 1);
        hi_lane_hi = __msa_splati_d(sum_hi, 1);
        ADD2(sum_lo, hi_lane_lo, sum_hi, hi_lane_hi, sum_lo, sum_hi);
        total += __msa_copy_s_d(sum_lo, 0);
        total += __msa_copy_s_d(sum_hi, 0);
    }

    return total;
}