C++ (Cpp) vnegq_s32 Beispiele

Beispiel #1

0

Datei anzeigen

Datei: transform_neon.c Projekt: MekliCZ/positron

static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
                                              int16_t* inim,
                                              int32_t* outre,
                                              int32_t* outim) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int32_t* outre1 = outre;
  int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int32_t* outim1 = outim;
  int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int32x4_t xr = vmull_s16(tmpr, inre_0);
    int32x4_t xi = vmull_s16(tmpr, inim_0);
    int32x4_t yr = vmull_s16(tmpr, inim_1);
    int32x4_t yi = vmull_s16(tmpi, inim_1);
    xr = vmlal_s16(xr, tmpi, inim_0);
    xi = vmlsl_s16(xi, tmpi, inre_0);
    yr = vmlal_s16(yr, tmpi, inre_1);
    yi = vmlsl_s16(yi, tmpr, inre_1);
    yr = vnegq_s32(yr);

    xr = vshrq_n_s32(xr, 5);
    xi = vshrq_n_s32(xi, 5);
    yr = vshrq_n_s32(yr, 5);
    yi = vshrq_n_s32(yi, 5);

    int32x4_t outr0 = vsubq_s32(xr, yi);
    int32x4_t outr1 = vaddq_s32(xr, yi);
    int32x4_t outi0 = vaddq_s32(xi, yr);
    int32x4_t outi1 = vsubq_s32(yr, xi);

    // Find the absolute maximum in the vectors.
    int32x4_t tmp0 = vabsq_s32(outr0);
    int32x4_t tmp1 = vabsq_s32(outr1);
    int32x4_t tmp2 = vabsq_s32(outi0);
    int32x4_t tmp3 = vabsq_s32(outi1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));

    // Store the vectors.
    outr1 = vrev64q_s32(outr1);
    outi1 = vrev64q_s32(outi1);
    int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));

    vst1q_s32(outre1, outr0);
    outre1 += 4;
    vst1q_s32(outim1, outi0);
    outim1 += 4;
    vst1q_s32(outre2, outr_1);
    outre2 -= 4;
    vst1q_s32(outim2, outi_1);
    outim2 -= 4;
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}

Beispiel #2

0

Datei anzeigen

Datei: vp9_highbd_iht8x8_add_neon.c Projekt: wolfviking0/webcl-webkit

static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1,
                               int32x4_t *const io2, int32x4_t *const io3,
                               int32x4_t *const io4, int32x4_t *const io5,
                               int32x4_t *const io6, int32x4_t *const io7) {
  const int32x4_t c0 =
      create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
  const int32x4_t c1 =
      create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
  const int32x4_t c2 =
      create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
  int32x4_t x[8], t[4];
  int64x2_t s[8][2];

  x[0] = *io7;
  x[1] = *io0;
  x[2] = *io5;
  x[3] = *io2;
  x[4] = *io3;
  x[5] = *io4;
  x[6] = *io1;
  x[7] = *io6;

  // stage 1
  iadst_butterfly_lane_0_1_bd12_neon(x[0], x[1], vget_low_s32(c0), s[0], s[1]);
  iadst_butterfly_lane_0_1_bd12_neon(x[2], x[3], vget_high_s32(c0), s[2], s[3]);
  iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_low_s32(c1), s[4], s[5]);
  iadst_butterfly_lane_0_1_bd12_neon(x[6], x[7], vget_high_s32(c1), s[6], s[7]);

  x[0] = add_dct_const_round_shift_low_8_bd12(s[0], s[4]);
  x[1] = add_dct_const_round_shift_low_8_bd12(s[1], s[5]);
  x[2] = add_dct_const_round_shift_low_8_bd12(s[2], s[6]);
  x[3] = add_dct_const_round_shift_low_8_bd12(s[3], s[7]);
  x[4] = sub_dct_const_round_shift_low_8_bd12(s[0], s[4]);
  x[5] = sub_dct_const_round_shift_low_8_bd12(s[1], s[5]);
  x[6] = sub_dct_const_round_shift_low_8_bd12(s[2], s[6]);
  x[7] = sub_dct_const_round_shift_low_8_bd12(s[3], s[7]);

  // stage 2
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_high_s32(c2), s[4], s[5]);
  iadst_butterfly_lane_1_0_bd12_neon(x[7], x[6], vget_high_s32(c2), s[7], s[6]);

  x[0] = vaddq_s32(t[0], t[2]);
  x[1] = vaddq_s32(t[1], t[3]);
  x[2] = vsubq_s32(t[0], t[2]);
  x[3] = vsubq_s32(t[1], t[3]);
  x[4] = add_dct_const_round_shift_low_8_bd12(s[4], s[6]);
  x[5] = add_dct_const_round_shift_low_8_bd12(s[5], s[7]);
  x[6] = sub_dct_const_round_shift_low_8_bd12(s[4], s[6]);
  x[7] = sub_dct_const_round_shift_low_8_bd12(s[5], s[7]);

  // stage 3
  iadst_half_butterfly_bd12_neon(x + 2, vget_low_s32(c2));
  iadst_half_butterfly_bd12_neon(x + 6, vget_low_s32(c2));

  *io0 = x[0];
  *io1 = vnegq_s32(x[4]);
  *io2 = x[6];
  *io3 = vnegq_s32(x[2]);
  *io4 = x[3];
  *io5 = vnegq_s32(x[7]);
  *io6 = x[5];
  *io7 = vnegq_s32(x[1]);
}

Beispiel #3

0

Datei anzeigen

Datei: transform_neon.c Projekt: MekliCZ/positron

static inline void PostShiftAndSeparateNeon(int16_t* inre,
                                            int16_t* inim,
                                            int16_t* outre,
                                            int16_t* outim,
                                            int32_t sh) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int16_t* outre1 = outre;
  int16_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int16_t* outim1 = outim;
  int16_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 -4];
  // By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
  // ">> 14" and then ">> 9" as in the C code.
  int32x4_t shift = vdupq_n_s32(-sh - 23);

  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int16x4_t xr = vqadd_s16(inre_0, inre_1);
    int16x4_t xi = vqsub_s16(inim_0, inim_1);
    int16x4_t yr = vqadd_s16(inim_0, inim_1);
    int16x4_t yi = vqsub_s16(inre_1, inre_0);

    int32x4_t outr0 = vmull_s16(tmpr, xr);
    int32x4_t outi0 = vmull_s16(tmpi, xr);
    int32x4_t outr1 = vmull_s16(tmpi, yr);
    int32x4_t outi1 = vmull_s16(tmpi, yi);
    outr0 = vmlsl_s16(outr0, tmpi, xi);
    outi0 = vmlal_s16(outi0, tmpr, xi);
    outr1 = vmlal_s16(outr1, tmpr, yi);
    outi1 = vmlsl_s16(outi1, tmpr, yr);

    outr0 = vshlq_s32(outr0, shift);
    outi0 = vshlq_s32(outi0, shift);
    outr1 = vshlq_s32(outr1, shift);
    outi1 = vshlq_s32(outi1, shift);
    outr1 = vnegq_s32(outr1);

    int16x4_t outre_0  = vmovn_s32(outr0);
    int16x4_t outim_0  = vmovn_s32(outi0);
    int16x4_t outre_1  = vmovn_s32(outr1);
    int16x4_t outim_1  = vmovn_s32(outi1);
    outre_1 = vrev64_s16(outre_1);
    outim_1 = vrev64_s16(outim_1);

    vst1_s16(outre1, outre_0);
    outre1 += 4;
    vst1_s16(outim1, outim_0);
    outim1 += 4;
    vst1_s16(outre2, outre_1);
    outre2 -= 4;
    vst1_s16(outim2, outim_1);
    outim2 -= 4;
  }
}