C++ (Cpp) vreinterpretq_u16_s16 Examples

Example #1

0

Show file

File: yuv_neon.c Project: 1vanK/Urho3D

static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;

  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);       // diff_y
    const int16x8_t F = vaddq_s16(C, D);       // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}

Example #2

0

Show file

File: aecm_core_neon.c Project: dreaflove/webrtc-lib

void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore* aecm,
                                         const uint16_t* far_spectrum,
                                         int32_t* echo_est) {
  assert((uintptr_t)echo_est % 32 == 0);
  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
  assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);

  // This is C code of following optimized code.
  // During startup we store the channel every block.
  //  memcpy(aecm->channelStored,
  //         aecm->channelAdapt16,
  //         sizeof(int16_t) * PART_LEN1);
  // Recalculate echo estimate
  //  for (i = 0; i < PART_LEN; i += 4) {
  //    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                        far_spectrum[i]);
  //    echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
  //                                            far_spectrum[i + 1]);
  //    echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
  //                                            far_spectrum[i + 2]);
  //    echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
  //                                            far_spectrum[i + 3]);
  //  }
  //  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                     far_spectrum[i]);
  const uint16_t* far_spectrum_p = far_spectrum;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int16_t* start_stored_p = aecm->channelStored;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  int32_t* echo_est_p = echo_est;

  uint16x8_t far_spectrum_v;
  int16x8_t adapt_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;

  while (start_stored_p < end_stored_p) {
    far_spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);

    vst1q_s16(start_stored_p, adapt_v);

    echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v),
                               vget_low_u16(vreinterpretq_u16_s16(adapt_v)));
    echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v),
                                vget_high_u16(vreinterpretq_u16_s16(adapt_v)));

    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    far_spectrum_p += 8;
    start_adapt_p += 8;
    start_stored_p += 8;
    echo_est_p += 8;
  }
  aecm->channelStored[PART_LEN] = aecm->channelAdapt16[PART_LEN];
  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
}

Example #3

0

Show file

File: idct_dequant_0_2x_neon.c Project: Acidburn0zzz/libvpx

void idct_dequant_0_2x_neon(
        int16_t *q,
        int16_t dq,
        unsigned char *dst,
        int stride) {
    unsigned char *dst0;
    int i, a0, a1;
    int16x8x2_t q2Add;
    int32x2_t d2s32, d4s32;
    uint8x8_t d2u8, d4u8;
    uint16x8_t q1u16, q2u16;

    a0 = ((q[0] * dq) + 4) >> 3;
    a1 = ((q[16] * dq) + 4) >> 3;
    q[0] = q[16] = 0;
    q2Add.val[0] = vdupq_n_s16((int16_t)a0);
    q2Add.val[1] = vdupq_n_s16((int16_t)a1);

    for (i = 0; i < 2; i++, dst += 4) {
        dst0 = dst;
        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
        dst0 += stride;
        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
        dst0 += stride;
        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
        dst0 += stride;
        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);

        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                         vreinterpret_u8_s32(d2s32));
        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                         vreinterpret_u8_s32(d4s32));

        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
        d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));

        d2s32 = vreinterpret_s32_u8(d2u8);
        d4s32 = vreinterpret_s32_u8(d4u8);

        dst0 = dst;
        vst1_lane_s32((int32_t *)dst0, d2s32, 0);
        dst0 += stride;
        vst1_lane_s32((int32_t *)dst0, d2s32, 1);
        dst0 += stride;
        vst1_lane_s32((int32_t *)dst0, d4s32, 0);
        dst0 += stride;
        vst1_lane_s32((int32_t *)dst0, d4s32, 1);
    }
    return;
}

Example #4

0

Show file

File: mpegvideo.c Project: TaoheGit/hmi_sdl_android

static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs,
                                               int16_t *block)
{
    int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16;
    int16x8_t q14s16, q15s16, qzs16;
    int16x4_t d0s16, d2s16, d3s16, dzs16;
    uint16x8_t q1u16, q9u16;
    uint16x4_t d1u16;

    dzs16 = vdup_n_s16(0);
    qzs16 = vdupq_n_s16(0);

    q15s16 = vdupq_n_s16(qscale << 1);
    q14s16 = vdupq_n_s16(qadd);
    q13s16 = vnegq_s16(q14s16);

    if (nCoeffs > 4) {
        for (; nCoeffs > 8; nCoeffs -= 16, block += 16) {
            q0s16 = vld1q_s16(block);
            q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16));
            q8s16 = vld1q_s16(block + 8);
            q1u16 = vceqq_s16(q0s16, qzs16);
            q2s16 = vmulq_s16(q0s16, q15s16);
            q11s16 = vreinterpretq_s16_u16(vcltq_s16(q8s16, qzs16));
            q10s16 = vmulq_s16(q8s16, q15s16);
            q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16);
            q11s16 = vbslq_s16(vreinterpretq_u16_s16(q11s16), q13s16, q14s16);
            q2s16 = vaddq_s16(q2s16, q3s16);
            q9u16 = vceqq_s16(q8s16, qzs16);
            q10s16 = vaddq_s16(q10s16, q11s16);
            q0s16 = vbslq_s16(q1u16, q0s16, q2s16);
            q8s16 = vbslq_s16(q9u16, q8s16, q10s16);
            vst1q_s16(block, q0s16);
            vst1q_s16(block + 8, q8s16);
        }
    }
    if (nCoeffs <= 0)
        return;

    d0s16 = vld1_s16(block);
    d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16));
    d1u16 = vceq_s16(d0s16, dzs16);
    d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16));
    d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16),
                     vget_high_s16(q13s16), vget_high_s16(q14s16));
    d2s16 = vadd_s16(d2s16, d3s16);
    d0s16 = vbsl_s16(d1u16, d0s16, d2s16);
    vst1_s16(block, d0s16);
}

Example #5

0

Show file

File: idct4x4_1_add_neon.c Project: 93i/godot

void vpx_idct4x4_1_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
    uint8x8_t d6u8;
    uint32x2_t d2u32 = vdup_n_u32(0);
    uint16x8_t q8u16;
    int16x8_t q0s16;
    uint8_t *d1, *d2;
    int16_t i, a1, cospi_16_64 = 11585;
    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
    out = dct_const_round_shift(out * cospi_16_64);
    a1 = ROUND_POWER_OF_TWO(out, 4);

    q0s16 = vdupq_n_s16(a1);

    // dc_only_idct_add
    d1 = d2 = dest;
    for (i = 0; i < 2; i++) {
        d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
        d1 += dest_stride;
        d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
        d1 += dest_stride;

        q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
                         vreinterpret_u8_u32(d2u32));
        d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));

        vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
        d2 += dest_stride;
        vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
        d2 += dest_stride;
    }
    return;
}

Example #6

0

Show file

File: vreinterpretQu16_s16.c Project: 5432935/crossbridge

void test_vreinterpretQu16_s16 (void)
{
  uint16x8_t out_uint16x8_t;
  int16x8_t arg0_int16x8_t;

  out_uint16x8_t = vreinterpretq_u16_s16 (arg0_int16x8_t);
}

Example #7

0

Show file

File: yuv_neon.c Project: 1vanK/Urho3D

static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out) {
  int i;
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  const int16x8_t zero = vdupq_n_s16(0);
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t a0 = vld1q_s16(A + i + 0);
    const int16x8_t a1 = vld1q_s16(A + i + 1);
    const int16x8_t b0 = vld1q_s16(B + i + 0);
    const int16x8_t b1 = vld1q_s16(B + i + 1);
    const int16x8_t a0b1 = vaddq_s16(a0, b1);
    const int16x8_t a1b0 = vaddq_s16(a1, b0);
    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
    const int16x8_t d0 = vaddq_s16(c1, a0);
    const int16x8_t d1 = vaddq_s16(c0, a1);
    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
    const int16x8x2_t f = vzipq_s16(e0, e1);
    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
  }
  for (; i < len; ++i) {
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
  }
}

Example #8

0

Show file

File: qdrawhelper_neon.cpp Project: Nacto1/qt-everywhere-opensource-src-4.6.2

QT_BEGIN_NAMESPACE

static inline int16x8_t qvdiv_255_s16(int16x8_t x, int16x8_t half)
{
    // result = (x + (x >> 8) + 0x80) >> 8

    const int16x8_t temp = vshrq_n_s16(x, 8); // x >> 8
    const int16x8_t sum_part = vaddq_s16(x, half); // x + 0x80
    const int16x8_t sum = vaddq_s16(temp, sum_part);

    return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(sum), 8));
}

Example #9

0

Show file

File: highbd_idct32x32_add_neon.c Project: webmproject/libvpx

static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
                                                     const int stride,
                                                     const int16x8_t res,
                                                     const int16x8_t max) {
  const uint16x8_t a0 = vld1q_u16(*dest);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const uint16x8_t a2 = vld1q_u16(*dest + 16);
  const uint16x8_t a3 = vld1q_u16(*dest + 24);
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
  const int16x8_t c0 = vminq_s16(b0, max);
  const int16x8_t c1 = vminq_s16(b1, max);
  const int16x8_t c2 = vminq_s16(b2, max);
  const int16x8_t c3 = vminq_s16(b3, max);
  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
  vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
  vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
  *dest += stride;
}

Example #10

0

Show file

File: idct4x4_1_add_neon.c Project: MIPS/external-libvpx

static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
                                        const int16x8_t res,
                                        uint32x2_t *const d) {
  uint16x8_t a;
  uint8x8_t b;
  *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0);
  *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1);
  a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d));
  b = vqmovun_s16(vreinterpretq_s16_u16(a));
  vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0);
  *dest += stride;
  vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1);
  *dest += stride;
}

Example #11

0

Show file

File: vp9_idct8x8_1_add_neon.c Project: Distrotech/libvpx

void vp9_idct8x8_1_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
    uint8x8_t d2u8, d3u8, d30u8, d31u8;
    uint64x1_t d2u64, d3u64, d4u64, d5u64;
    uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
    int16x8_t q0s16;
    uint8_t *d1, *d2;
    int16_t i, a1, cospi_16_64 = 11585;
    int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
    out = dct_const_round_shift(out * cospi_16_64);
    a1 = ROUND_POWER_OF_TWO(out, 5);

    q0s16 = vdupq_n_s16(a1);
    q0u16 = vreinterpretq_u16_s16(q0s16);

    d1 = d2 = dest;
    for (i = 0; i < 2; i++) {
        d2u64 = vld1_u64((const uint64_t *)d1);
        d1 += dest_stride;
        d3u64 = vld1_u64((const uint64_t *)d1);
        d1 += dest_stride;
        d4u64 = vld1_u64((const uint64_t *)d1);
        d1 += dest_stride;
        d5u64 = vld1_u64((const uint64_t *)d1);
        d1 += dest_stride;

        q9u16  = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
        q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
        q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
        q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));

        d2u8  = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
        d3u8  = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
        d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
        d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));

        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
        d2 += dest_stride;
        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
        d2 += dest_stride;
        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
        d2 += dest_stride;
        vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
        d2 += dest_stride;
    }
    return;
}

Example #12

0

Show file

File: idct8x8_add_neon.c Project: 93i/godot

void vpx_idct8x8_12_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
    uint8_t *d1, *d2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8;
    int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
    int16x4_t d26s16, d27s16, d28s16, d29s16;
    uint64x1_t d0u64, d1u64, d2u64, d3u64;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16;
    int32x4_t q9s32, q10s32, q11s32, q12s32;

    q8s16 = vld1q_s16(input);
    q9s16 = vld1q_s16(input + 8);
    q10s16 = vld1q_s16(input + 16);
    q11s16 = vld1q_s16(input + 24);
    q12s16 = vld1q_s16(input + 32);
    q13s16 = vld1q_s16(input + 40);
    q14s16 = vld1q_s16(input + 48);
    q15s16 = vld1q_s16(input + 56);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    // First transform rows
    // stage 1
    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
    q1s16 = vdupq_n_s16(cospi_4_64 * 2);

    q4s16 = vqrdmulhq_s16(q9s16, q0s16);

    q0s16 = vdupq_n_s16(-cospi_20_64 * 2);

    q7s16 = vqrdmulhq_s16(q9s16, q1s16);

    q1s16 = vdupq_n_s16(cospi_12_64 * 2);

    q5s16 = vqrdmulhq_s16(q11s16, q0s16);

    q0s16 = vdupq_n_s16(cospi_16_64 * 2);

    q6s16 = vqrdmulhq_s16(q11s16, q1s16);

    // stage 2 & stage 3 - even half
    q1s16 = vdupq_n_s16(cospi_24_64 * 2);

    q9s16 = vqrdmulhq_s16(q8s16, q0s16);

    q0s16 = vdupq_n_s16(cospi_8_64 * 2);

    q13s16 = vqrdmulhq_s16(q10s16, q1s16);

    q15s16 = vqrdmulhq_s16(q10s16, q0s16);

    // stage 3 -odd half
    q0s16 = vaddq_s16(q9s16, q15s16);
    q1s16 = vaddq_s16(q9s16, q13s16);
    q2s16 = vsubq_s16(q9s16, q13s16);
    q3s16 = vsubq_s16(q9s16, q15s16);

    // stage 2 - odd half
    q13s16 = vsubq_s16(q4s16, q5s16);
    q4s16 = vaddq_s16(q4s16, q5s16);
    q14s16 = vsubq_s16(q7s16, q6s16);
    q7s16 = vaddq_s16(q7s16, q6s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);

    d16s16 = vdup_n_s16(cospi_16_64);
    q9s32 = vmull_s16(d28s16, d16s16);
    q10s32 = vmull_s16(d29s16, d16s16);
    q11s32 = vmull_s16(d28s16, d16s16);
    q12s32 = vmull_s16(d29s16, d16s16);

    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

    d10s16 = vqrshrn_n_s32(q9s32, 14);
    d11s16 = vqrshrn_n_s32(q10s32, 14);
    d12s16 = vqrshrn_n_s32(q11s32, 14);
    d13s16 = vqrshrn_n_s32(q12s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    // stage 4
    q8s16 = vaddq_s16(q0s16, q7s16);
    q9s16 = vaddq_s16(q1s16, q6s16);
    q10s16 = vaddq_s16(q2s16, q5s16);
    q11s16 = vaddq_s16(q3s16, q4s16);
    q12s16 = vsubq_s16(q3s16, q4s16);
    q13s16 = vsubq_s16(q2s16, q5s16);
    q14s16 = vsubq_s16(q1s16, q6s16);
    q15s16 = vsubq_s16(q0s16, q7s16);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

    q8s16 = vrshrq_n_s16(q8s16, 5);
    q9s16 = vrshrq_n_s16(q9s16, 5);
    q10s16 = vrshrq_n_s16(q10s16, 5);
    q11s16 = vrshrq_n_s16(q11s16, 5);
    q12s16 = vrshrq_n_s16(q12s16, 5);
    q13s16 = vrshrq_n_s16(q13s16, 5);
    q14s16 = vrshrq_n_s16(q14s16, 5);
    q15s16 = vrshrq_n_s16(q15s16, 5);

    d1 = d2 = dest;

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;

    q8s16 = q12s16;
    q9s16 = q13s16;
    q10s16 = q14s16;
    q11s16 = q15s16;

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;
    return;
}

Example #13

0

Show file

File: idct8x8_add_neon.c Project: 93i/godot

void vpx_idct8x8_64_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
    uint8_t *d1, *d2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8;
    uint64x1_t d0u64, d1u64, d2u64, d3u64;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16;

    q8s16 = vld1q_s16(input);
    q9s16 = vld1q_s16(input + 8);
    q10s16 = vld1q_s16(input + 16);
    q11s16 = vld1q_s16(input + 24);
    q12s16 = vld1q_s16(input + 32);
    q13s16 = vld1q_s16(input + 40);
    q14s16 = vld1q_s16(input + 48);
    q15s16 = vld1q_s16(input + 56);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

    q8s16 = vrshrq_n_s16(q8s16, 5);
    q9s16 = vrshrq_n_s16(q9s16, 5);
    q10s16 = vrshrq_n_s16(q10s16, 5);
    q11s16 = vrshrq_n_s16(q11s16, 5);
    q12s16 = vrshrq_n_s16(q12s16, 5);
    q13s16 = vrshrq_n_s16(q13s16, 5);
    q14s16 = vrshrq_n_s16(q14s16, 5);
    q15s16 = vrshrq_n_s16(q15s16, 5);

    d1 = d2 = dest;

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;

    q8s16 = q12s16;
    q9s16 = q13s16;
    q10s16 = q14s16;
    q11s16 = q15s16;

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;
    return;
}

Example #14

0

Show file

File: NE10_resize.neon.c Project: HarveyLiuFly/Ne10

void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width)
{
    const int *S0 = src[0], *S1 = src[1];

    int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567;
    int32x4_t qT_0123, qT_4567;
    int16x4_t dT_0123, dT_4567;
    uint16x8_t qT_01234567;
    uint8x8_t dT_01234567, dDst_01234567;

    int32x2_t dBeta;
    dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0);
    dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1);

    int32x4_t qDelta, qMin, qMax;
    qDelta = vdupq_n_s32 (DELTA);
    qMin = vdupq_n_s32 (0);
    qMax = vdupq_n_s32 (255);

    int x = 0;
    for (; x <= width - 8; x += 8)
    {
        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        vst1_u8 (&dst[x], dT_01234567);
    }

    if (x < width)
    {
        uint8x8_t dMask;
        dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)]));
        dDst_01234567 = vld1_u8 (&dst[x]);

        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567);
        vst1_u8 (&dst[x], dMask);
    }
}

Example #15

0

Show file

File: shortidct4x4llm_neon.c Project: Corax26/libvpx

void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
                               int pred_stride, unsigned char *dst_ptr,
                               int dst_stride) {
  int i;
  uint32x2_t d6u32 = vdup_n_u32(0);
  uint8x8_t d1u8;
  int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
  uint16x8_t q1u16;
  int16x8_t q1s16, q2s16, q3s16, q4s16;
  int32x2x2_t v2tmp0, v2tmp1;
  int16x4x2_t v2tmp2, v2tmp3;

  d2 = vld1_s16(input);
  d3 = vld1_s16(input + 4);
  d4 = vld1_s16(input + 8);
  d5 = vld1_s16(input + 12);

  // 1st for loop
  q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
  q2s16 = vcombine_s16(d3, d5);

  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

  q3s16 = vshrq_n_s16(q3s16, 1);
  q4s16 = vshrq_n_s16(q4s16, 1);

  q3s16 = vqaddq_s16(q3s16, q2s16);
  q4s16 = vqaddq_s16(q4s16, q2s16);

  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1

  d2 = vqadd_s16(d12, d11);
  d3 = vqadd_s16(d13, d10);
  d4 = vqsub_s16(d13, d10);
  d5 = vqsub_s16(d12, d11);

  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                    vreinterpret_s16_s32(v2tmp1.val[0]));
  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                    vreinterpret_s16_s32(v2tmp1.val[1]));

  // 2nd for loop
  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
  q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);

  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

  q3s16 = vshrq_n_s16(q3s16, 1);
  q4s16 = vshrq_n_s16(q4s16, 1);

  q3s16 = vqaddq_s16(q3s16, q2s16);
  q4s16 = vqaddq_s16(q4s16, q2s16);

  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1

  d2 = vqadd_s16(d12, d11);
  d3 = vqadd_s16(d13, d10);
  d4 = vqsub_s16(d13, d10);
  d5 = vqsub_s16(d12, d11);

  d2 = vrshr_n_s16(d2, 3);
  d3 = vrshr_n_s16(d3, 3);
  d4 = vrshr_n_s16(d4, 3);
  d5 = vrshr_n_s16(d5, 3);

  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                    vreinterpret_s16_s32(v2tmp1.val[0]));
  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                    vreinterpret_s16_s32(v2tmp1.val[1]));

  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
  q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);

  // dc_only_idct_add
  for (i = 0; i < 2; i++, q1s16 = q2s16) {
    d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
    pred_ptr += pred_stride;
    d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
    pred_ptr += pred_stride;

    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));

    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
    dst_ptr += dst_stride;
    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
    dst_ptr += dst_stride;
  }
  return;
}

Example #16

0

Show file

File: idct4x4_add_neon.c Project: Wafflespeanut/gecko-dev

void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t d26u8, d27u8;
  uint32x2_t d26u32, d27u32;
  uint16x8_t q8u16, q9u16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
  int16x8_t q8s16, q9s16, q13s16, q14s16;
  int32x4_t q1s32, q13s32, q14s32, q15s32;
  int16x4x2_t d0x2s16, d1x2s16;
  int32x4x2_t q0x2s32;
  uint8_t *d;

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);

  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
  d21s16 = vdup_n_s16((int16_t)cospi_16_64);

  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  d22s16 = vdup_n_s16((int16_t)cospi_24_64);

  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);

  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
  d19s16 = vget_low_s16(q9s16);

  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  // do the transform on columns
  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);

  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  d = dest;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
  d += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));

  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  d = dest;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
  return;
}

Example #17

0

Show file

File: i40e_rxtx_vec_neon.c Project: cleveritcz/f-stack

 /*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	/* mask to shuffle from desc. to mbuf */
	uint8x16_t shuf_msk = {
		0xFF, 0xFF,   /* pkt_type set as unknown */
		0xFF, 0xFF,   /* pkt_type set as unknown */
		14, 15,       /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
		14, 15,       /* octet 15~14, 16 bits data_len */
		2, 3,         /* octet 2~3, low 16 bits vlan_macip */
		4, 5, 6, 7    /* octet 4~7, 32bits rss */
		};

	uint8x16_t eop_check = {
		0x02, 0x00, 0x02, 0x00,
		0x02, 0x00, 0x02, 0x00,
		0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00
		};

	uint16x8_t crc_adjust = {
		0, 0,         /* ignore pkt_type field */
		rxq->crc_len, /* sub crc on pkt_len */
		0,            /* ignore high-16bits of pkt_len */
		rxq->crc_len, /* sub crc on data_len */
		0, 0, 0       /* ignore non-length fields */
		};

	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch_non_temporal(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint16x8x2_t sterr_tmp1, sterr_tmp2;
		uint64x2_t mbp1, mbp2;
		uint16x8_t staterr;
		uint16x8_t tmp;
		uint64_t stat;

		int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

		/* B.1 load 1 mbuf point */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
		rte_rmb();

		/* B.2 copy 2 mbuf point into rx_pkts  */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* B.1 load 1 mbuf point */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
		/* B.1 load 2 mbuf point */
		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
		descs[0] =  vld1q_u64((uint64_t *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
					    len_shl);
		descs[3] = vreinterpretq_u64_u32(len3);
		uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
					    len_shl);
		descs[2] = vreinterpretq_u64_u32(len2);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
				       vreinterpretq_u16_u64(descs[3]));
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
				       vreinterpretq_u16_u64(descs[2]));

		/* C.2 get 4 pkts staterr value  */
		staterr = vzipq_u16(sterr_tmp1.val[1],
				    sterr_tmp2.val[1]).val[0];

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
					    len_shl);
		descs[1] = vreinterpretq_u64_u32(len1);
		uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
					    len_shl);
		descs[0] = vreinterpretq_u64_u32(len0);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

		/* D.3 copy final 3,4 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
				 pkt_mb4);
		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
				 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);

		/* C* extract and record EOP bit */
		if (split_packet) {
			uint8x16_t eop_shuf_mask = {
					0x00, 0x02, 0x04, 0x06,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF};
			uint8x16_t eop_bits;

			/* and with mask to extract bits, flipping 1-0 */
			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
			eop_bits = vandq_u8(eop_bits, eop_check);
			/* the staterr values are not in order, as the count
			 * count of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			vst1q_lane_u32((uint32_t *)split_packet,
				       vreinterpretq_u32_u8(eop_bits), 0);
			split_packet += RTE_I40E_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
		staterr = vreinterpretq_u16_s16(
				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
					    I40E_UINT16_BIT - 1));
		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

		rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);

		/* D.3 copy final 1,2 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb2);
		vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
			 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc avaialbe number of desc */
		if (unlikely(stat == 0)) {
			nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
		} else {
			nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
			break;
		}
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}

Example #18

0

Show file

File: idct_dequant_full_2x_neon.c Project: 93i/godot

void idct_dequant_full_2x_neon(
        int16_t *q,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    unsigned char *dst0, *dst1;
    int32x2_t d28, d29, d30, d31;
    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
    int16x8_t qEmpty = vdupq_n_s16(0);
    int32x4x2_t q2tmp0, q2tmp1;
    int16x8x2_t q2tmp2, q2tmp3;
    int16x4_t dLow0, dLow1, dHigh0, dHigh1;

    d28 = d29 = d30 = d31 = vdup_n_s32(0);

    // load dq
    q0 = vld1q_s16(dq);
    dq += 8;
    q1 = vld1q_s16(dq);

    // load q
    q2 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q3 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q4 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q5 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);

    // load src from dst
    dst0 = dst;
    dst1 = dst + 4;
    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
    dst0 += stride;
    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
    dst1 += stride;
    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
    dst0 += stride;
    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
    dst1 += stride;

    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
    dst0 += stride;
    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
    dst1 += stride;
    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);

    q2 = vmulq_s16(q2, q0);
    q3 = vmulq_s16(q3, q1);
    q4 = vmulq_s16(q4, q0);
    q5 = vmulq_s16(q5, q1);

    // vswp
    dLow0 = vget_low_s16(q2);
    dHigh0 = vget_high_s16(q2);
    dLow1 = vget_low_s16(q4);
    dHigh1 = vget_high_s16(q4);
    q2 = vcombine_s16(dLow0, dLow1);
    q4 = vcombine_s16(dHigh0, dHigh1);

    dLow0 = vget_low_s16(q3);
    dHigh0 = vget_high_s16(q3);
    dLow1 = vget_low_s16(q5);
    dHigh1 = vget_high_s16(q5);
    q3 = vcombine_s16(dLow0, dLow1);
    q5 = vcombine_s16(dHigh0, dHigh1);

    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);

    q10 = vqaddq_s16(q2, q3);
    q11 = vqsubq_s16(q2, q3);

    q8 = vshrq_n_s16(q8, 1);
    q9 = vshrq_n_s16(q9, 1);

    q4 = vqaddq_s16(q4, q8);
    q5 = vqaddq_s16(q5, q9);

    q2 = vqsubq_s16(q6, q5);
    q3 = vqaddq_s16(q7, q4);

    q4 = vqaddq_s16(q10, q3);
    q5 = vqaddq_s16(q11, q2);
    q6 = vqsubq_s16(q11, q2);
    q7 = vqsubq_s16(q10, q3);

    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                       vreinterpretq_s16_s32(q2tmp1.val[0]));
    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                       vreinterpretq_s16_s32(q2tmp1.val[1]));

    // loop 2
    q8  = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
    q9  = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);

    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);

    q10 = vshrq_n_s16(q10, 1);
    q11 = vshrq_n_s16(q11, 1);

    q10 = vqaddq_s16(q2tmp2.val[1], q10);
    q11 = vqaddq_s16(q2tmp3.val[1], q11);

    q8 = vqsubq_s16(q8, q11);
    q9 = vqaddq_s16(q9, q10);

    q4 = vqaddq_s16(q2, q9);
    q5 = vqaddq_s16(q3, q8);
    q6 = vqsubq_s16(q3, q8);
    q7 = vqsubq_s16(q2, q9);

    q4 = vrshrq_n_s16(q4, 3);
    q5 = vrshrq_n_s16(q5, 3);
    q6 = vrshrq_n_s16(q6, 3);
    q7 = vrshrq_n_s16(q7, 3);

    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                       vreinterpretq_s16_s32(q2tmp1.val[0]));
    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                       vreinterpretq_s16_s32(q2tmp1.val[1]));

    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
                                          vreinterpret_u8_s32(d28)));
    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
                                          vreinterpret_u8_s32(d29)));
    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
                                          vreinterpret_u8_s32(d30)));
    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
                                          vreinterpret_u8_s32(d31)));

    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));

    dst0 = dst;
    dst1 = dst + 4;
    vst1_lane_s32((int32_t *)dst0, d28, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d28, 1);
    dst1 += stride;
    vst1_lane_s32((int32_t *)dst0, d29, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d29, 1);
    dst1 += stride;

    vst1_lane_s32((int32_t *)dst0, d30, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d30, 1);
    dst1 += stride;
    vst1_lane_s32((int32_t *)dst0, d31, 0);
    vst1_lane_s32((int32_t *)dst1, d31, 1);
    return;
}

Example #19

0

Show file

File: iht8x8_add_neon.c Project: luke-chang/gecko-1

void av1_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
                            int dest_stride, const TxfmParam *txfm_param) {
  int i;
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint64x1_t d0u64, d1u64, d2u64, d3u64;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 8 * 2);
  q11s16 = vld1q_s16(input + 8 * 3);
  q12s16 = vld1q_s16(input + 8 * 4);
  q13s16 = vld1q_s16(input + 8 * 5);
  q14s16 = vld1q_s16(input + 8 * 6);
  q15s16 = vld1q_s16(input + 8 * 7);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);

  const TX_TYPE tx_type = txfm_param->tx_type;
  switch (tx_type) {
    case DCT_DCT:  // idct_idct is not supported. Fall back to C
      av1_iht8x8_64_add_c(input, dest, dest_stride, txfm_param);
      return;
      break;
    case ADST_DCT:  // iadst_idct
      // generate IDCT constants
      // GENERATE_IDCT_CONSTANTS

      // first transform rows
      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                 &q15s16);

      // transpose the matrix
      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                   &q15s16);

      // generate IADST constants
      // GENERATE_IADST_CONSTANTS

      // then transform columns
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                  &q15s16);
      break;
    case DCT_ADST:  // idct_iadst
      // generate IADST constants
      // GENERATE_IADST_CONSTANTS

      // first transform rows
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                  &q15s16);

      // transpose the matrix
      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                   &q15s16);

      // generate IDCT constants
      // GENERATE_IDCT_CONSTANTS

      // then transform columns
      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                 &q15s16);
      break;
    case ADST_ADST:  // iadst_iadst
      // generate IADST constants
      // GENERATE_IADST_CONSTANTS

      // first transform rows
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                  &q15s16);

      // transpose the matrix
      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                   &q15s16);

      // then transform columns
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                  &q15s16);
      break;
    default:  // iadst_idct
      assert(0);
      break;
  }

  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);

  for (d1 = d2 = dest, i = 0; i < 2; i++) {
    if (i != 0) {
      q8s16 = q12s16;
      q9s16 = q13s16;
      q10s16 = q14s16;
      q11s16 = q15s16;
    }

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
    q10u16 =
        vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
    q11u16 =
        vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;
  }
  return;
}

Example #20

0

Show file

File: iht4x4_add_neon.c Project: jmvalin/aom

void vp10_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
                             int dest_stride, int tx_type) {
  uint8x8_t d26u8, d27u8;
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
  uint32x2_t d26u32, d27u32;
  int16x8_t q3s16, q8s16, q9s16;
  uint16x8_t q8u16, q9u16;

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);

  TRANSPOSE4X4(&q8s16, &q9s16);

  switch (tx_type) {
    case 0:  // idct_idct is not supported. Fall back to C
      vp10_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
      return;
      break;
    case 1:  // iadst_idct
      // generate constants
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    case 2:  // idct_iadst
      // generate constantsyy
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
      break;
    case 3:  // iadst_iadst
      // generate constants
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    default:  // iadst_idct
      assert(0);
      break;
  }

  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
  dest += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
  dest += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
  dest += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));

  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
  return;
}