예제 #1
0
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  v16u8 srcl0, srcl1, src0, src1;
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  t0 = SRLI_H(t0, 3);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  FILL_W2_SW(1812, 937, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
  SRAI_W2_SW(tmp0, tmp2, 4);
  FILL_W2_SW(12000, 51000, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  SD4(out0, out1, out2, out3, out, 8);
}
예제 #2
0
파일: dct_msa.c 프로젝트: Corax26/libvpx
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) {
  v8i16 in0, in1, in2, in3;
  v8i16 temp0, temp1;
  v8i16 const0, const1;
  v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };
  v4i32 out0, out1, out2, out3;
  v8i16 zero = { 0 };

  LD_SH4(input, pitch / 2, in0, in1, in2, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  SLLI_4V(temp0, temp1, in1, in3, 3);
  in0 = temp0 + temp1;
  in2 = temp0 - temp1;
  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
  temp0 = __msa_ilvr_h(in3, in1);
  in1 = __msa_splati_h(coeff, 3);
  out0 = (v4i32)__msa_ilvev_h(zero, in1);
  coeff = __msa_ilvl_h(zero, coeff);
  out1 = __msa_splati_w((v4i32)coeff, 0);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);
  out0 >>= 12;
  out1 >>= 12;
  PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);
  TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

  BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);
  in0 = temp0 + temp1 + 7;
  in2 = temp0 - temp1 + 7;
  in0 >>= 4;
  in2 >>= 4;
  ILVR_H2_SW(zero, in0, zero, in2, out0, out2);
  temp1 = RET_1_IF_NZERO_H(in3);
  ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);
  SPLATI_W2_SW(coeff, 2, out3, out1);
  out3 += out1;
  out1 = __msa_splati_w((v4i32)coeff, 1);
  DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);
  out1 >>= 16;
  out3 >>= 16;
  out1 += (v4i32)temp1;
  PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);
  ST_SH2(in0, in2, output, 8);
}
예제 #3
0
static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch,
                                      uint8_t round)
{
    uint8_t i;
    const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
    const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
    const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
    const int16_t *filter_ptr3 = &gt8x8_cnst[0];
    int16_t *src0 = (coeffs + buf_pitch);
    int16_t *src1 = (coeffs + 2 * buf_pitch);
    int16_t *src2 = (coeffs + 4 * buf_pitch);
    int16_t *src3 = (coeffs);
    int32_t cnst0, cnst1;
    int32_t tmp_buf[8 * 32 + 15];
    int32_t *tmp_buf_ptr = tmp_buf + 15;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
    v8i16 filt0, filter0, filter1, filter2, filter3;
    v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;

    /* Align pointer to 64 byte boundary */
    tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);

    /* process coeff 4, 12, 20, 28 */
    LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3);
    ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r);
    ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l);

    LD_SH2(src3, 16 * buf_pitch, in4, in6);
    LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in5, in7);
    ILVR_H2_SH(in6, in4, in7, in5, src2_r, src3_r);
    ILVL_H2_SH(in6, in4, in7, in5, src2_l, src3_l);

    /* loop for all columns of constants */
    for (i = 0; i < 2; i++) {
        /* processing single column of constants */
        cnst0 = LW(filter_ptr2);
        cnst1 = LW(filter_ptr2 + 2);

        filter0 = (v8i16) __msa_fill_w(cnst0);
        filter1 = (v8i16) __msa_fill_w(cnst1);

        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + 2 * i * 8), 4);

        /* processing single column of constants */
        cnst0 = LW(filter_ptr2 + 4);
        cnst1 = LW(filter_ptr2 + 6);

        filter0 = (v8i16) __msa_fill_w(cnst0);
        filter1 = (v8i16) __msa_fill_w(cnst1);

        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + (2 * i + 1) * 8), 4);

        filter_ptr2 += 8;
    }

    /* process coeff 0, 8, 16, 24 */
    /* loop for all columns of constants */
    for (i = 0; i < 2; i++) {
        /* processing first column of filter constants */
        cnst0 = LW(filter_ptr3);
        cnst1 = LW(filter_ptr3 + 2);

        filter0 = (v8i16) __msa_fill_w(cnst0);
        filter1 = (v8i16) __msa_fill_w(cnst1);

        DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter0, filter0, filter1,
                    filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);

        sum1_r = sum0_r - tmp1_r;
        sum1_l = sum0_l - tmp1_l;
        sum0_r = sum0_r + tmp1_r;
        sum0_l = sum0_l + tmp1_l;

        HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i));
        HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i));

        filter_ptr3 += 8;
    }

    /* process coeff 2 6 10 14 18 22 26 30 */
    LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_r, src1_r, src2_r, src3_r);
    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_l, src1_l, src2_l, src3_l);

    /* loop for all columns of constants */
    for (i = 0; i < 8; i++) {
        /* processing single column of constants */
        filt0 = LD_SH(filter_ptr1);
        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);

        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
        tmp1_r = tmp0_r;
        tmp1_l = tmp0_l;
        tmp0_r += sum0_r;
        tmp0_l += sum0_l;
        ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4);
        tmp1_r -= sum0_r;
        tmp1_l -= sum0_l;
        ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4);

        filter_ptr1 += 8;
    }

    /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
    src0 += 16 * buf_pitch;
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_r, src1_r, src2_r, src3_r);
    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_l, src1_l, src2_l, src3_l);

    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src4_r, src5_r, src6_r, src7_r);
    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src4_l, src5_l, src6_l, src7_l);

    /* loop for all columns of filter constants */
    for (i = 0; i < 16; i++) {
        /* processing single column of constants */
        filt0 = LD_SH(filter_ptr0);
        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);

        tmp1_r = sum0_r;
        tmp1_l = sum0_l;

        filt0 = LD_SH(filter_ptr0 + 8);
        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
        DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2,
                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
        DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l);

        sum0_r += tmp1_r;
        sum0_l += tmp1_l;

        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
        tmp1_r = tmp0_r;
        tmp1_l = tmp0_l;
        tmp0_r += sum0_r;
        tmp0_l += sum0_l;
        sum1_r = __msa_fill_w(round);
        SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r);
        SAT_SW2_SW(tmp0_r, tmp0_l, 15);
        in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r);
        ST_SH(in0, (coeffs + i * buf_pitch));
        tmp1_r -= sum0_r;
        tmp1_l -= sum0_l;
        SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r);
        SAT_SW2_SW(tmp1_r, tmp1_l, 15);
        in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r);
        ST_SH(in0, (coeffs + (31 - i) * buf_pitch));

        filter_ptr0 += 16;
    }
}