C++ (Cpp) vget_low_s16 Examples

Example #1

0

Show file

File: iht4x4_add_neon.c Project: jmvalin/aom

static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
                              int16x4_t *d2s16, int16x8_t *q8s16,
                              int16x8_t *q9s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
  int16x4_t d26s16, d27s16, d28s16, d29s16;
  int32x4_t q10s32, q13s32, q14s32, q15s32;
  int16x8_t q13s16, q14s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, *d2s16);
  q10s32 = vmull_s16(d17s16, *d0s16);
  q13s32 = vmull_s16(d23s16, *d1s16);
  q14s32 = vmull_s16(d24s16, *d1s16);
  q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
  q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q10s32, 14);

  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);
  *q8s16 = vaddq_s16(q13s16, q14s16);
  *q9s16 = vsubq_s16(q13s16, q14s16);
  *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16));  // vswp
  return;
}

Example #2

0

Show file

File: intrin_neon.hpp Project: 165-goethals/opencv

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
    int32x4x2_t cd = vtrnq_s32(c, d);
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}

Example #3

0

Show file

File: vp8_subpixelvariance_neon.c Project: Andrel322/gecko-dev

static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, unsigned int *sse, int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);
  int32x4_t v_sse_lo = vdupq_n_s32(0);
  int32x4_t v_sse_hi = vdupq_n_s32(0);

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; j += 8) {
      const uint8x8_t v_a = vld1_u8(&a[j]);
      const uint8x8_t v_b = vld1_u8(&b[j]);
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      v_sse_lo = vmlal_s16(v_sse_lo,
                           vget_low_s16(sv_diff),
                           vget_low_s16(sv_diff));
      v_sse_hi = vmlal_s16(v_sse_hi,
                           vget_high_s16(sv_diff),
                           vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = horizontal_add_s16x8(v_sum);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}

Example #4

0

Show file

File: variance_neon.c Project: lianhaidong/libvpx

unsigned int vpx_get4x4sse_cs_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride) {
    int16x4_t d22s16, d24s16, d26s16, d28s16;
    int64x1_t d0s64;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int32x4_t q7s32, q8s32, q9s32, q10s32;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int64x2_t q1s64;

    d0u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d4u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d1u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d5u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d6u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d3u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d7u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;

    q11u16 = vsubl_u8(d0u8, d4u8);
    q12u16 = vsubl_u8(d1u8, d5u8);
    q13u16 = vsubl_u8(d2u8, d6u8);
    q14u16 = vsubl_u8(d3u8, d7u8);

    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));

    q7s32 = vmull_s16(d22s16, d22s16);
    q8s32 = vmull_s16(d24s16, d24s16);
    q9s32 = vmull_s16(d26s16, d26s16);
    q10s32 = vmull_s16(d28s16, d28s16);

    q7s32 = vaddq_s32(q7s32, q8s32);
    q9s32 = vaddq_s32(q9s32, q10s32);
    q9s32 = vaddq_s32(q7s32, q9s32);

    q1s64 = vpaddlq_s32(q9s32);
    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}

Example #5

0

Show file

File: cross_correlation_neon.c Project: golden1232004/webrtc_new

static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
                                           const int16_t* vector1,
                                           const int16_t* vector2,
                                           size_t length,
                                           int scaling) {
  size_t i = 0;
  size_t len1 = length >> 3;
  size_t len2 = length & 7;
  int64x2_t sum0 = vdupq_n_s64(0);
  int64x2_t sum1 = vdupq_n_s64(0);

  for (i = len1; i > 0; i -= 1) {
    int16x8_t seq1_16x8 = vld1q_s16(vector1);
    int16x8_t seq2_16x8 = vld1q_s16(vector2);
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
#else
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
                               vget_high_s16(seq2_16x8));
#endif
    sum0 = vpadalq_s32(sum0, tmp0);
    sum1 = vpadalq_s32(sum1, tmp1);
    vector1 += 8;
    vector2 += 8;
  }

  // Calculate the rest of the samples.
  int64_t sum_res = 0;
  for (i = len2; i > 0; i -= 1) {
    sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
    vector1++;
    vector2++;
  }

  sum0 = vaddq_s64(sum0, sum1);
#if defined(WEBRTC_ARCH_ARM64)
  int64_t sum2 = vaddvq_s64(sum0);
  *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
#else
  int64x1_t shift = vdup_n_s64(-scaling);
  int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
  sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
  sum2 = vshl_s64(sum2, shift);
  vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
#endif
}

Example #6

0

Show file

File: vget_lows16.c Project: 5432935/crossbridge

void test_vget_lows16 (void)
{
  int16x4_t out_int16x4_t;
  int16x8_t arg0_int16x8_t;

  out_int16x4_t = vget_low_s16 (arg0_int16x8_t);
}

Example #7

0

Show file

File: neon.c Project: yaojingguo/gcc-intrinsics-samplecode

static inline void char_to_float_vectors(const unsigned char * sourcep,
			   float32x4_t *mp0, float32x4_t * mp1)
{
 uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1}   */
 int16x8_t widerpixels; /*  rawpixels promoted to shorts per component */
 int16x4_t high16, low16;
 int32x4_t high32, low32;
 const  int16x8_t uvbias = {0, 128, 0, 128, 0, 128, 0, 128};
 
 rawpixels = vld1_u8(sourcep);
 widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));

 /* subtract uvbias from widerpixels  */
 widerpixels = vsubq_s16(widerpixels, uvbias);

 /* now take widerpixels apart into (low16, high16) and   */
 /* then expand those into (low32, high32)    */
 low16 = vget_low_s16(widerpixels);
 high16 = vget_high_s16(widerpixels);
 high32 = vmovl_s16(high16);
 low32  = vmovl_s16(low16);

 /* now convert low32 and high32 into floats and store them in   */
 /*  *mp0,  *mp1 */

 *mp0 = vcvtq_f32_s32(low32);
 *mp1 = vcvtq_f32_s32(high32);
  
}

Example #8

0

Show file

File: iht4x4_add_neon.c Project: jmvalin/aom

static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
  int32x4_t q8s32, q9s32;
  int16x4x2_t d0x2s16, d1x2s16;
  int32x4x2_t q0x2s32;

  d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
  d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));

  q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
  q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
  q0x2s32 = vtrnq_s32(q8s32, q9s32);

  *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
  *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
  return;
}

Example #9

0

Show file

File: error_neon.c Project: Wafflespeanut/gecko-dev

int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
                                int block_size) {
  int64x2_t error = vdupq_n_s64(0);

  assert(block_size >= 8);
  assert((block_size % 8) == 0);

  do {
    const int16x8_t c = vld1q_s16(coeff);
    const int16x8_t d = vld1q_s16(dqcoeff);
    const int16x8_t diff = vsubq_s16(c, d);
    const int16x4_t diff_lo = vget_low_s16(diff);
    const int16x4_t diff_hi = vget_high_s16(diff);
    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
    // accumulating them in 64-bits.
    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
    error = vaddq_s64(error, err2);
    coeff += 8;
    dqcoeff += 8;
    block_size -= 8;
  } while (block_size != 0);

  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
}

Example #10

0

Show file

File: vget_lows16.c Project: Akheon23/chromecast-mirrored-source.toolchain

void test_vget_lows16 (void)
{
  register int16x4_t out_int16x4_t asm ("d18");
  int16x8_t arg0_int16x8_t;

  out_int16x4_t = vget_low_s16 (arg0_int16x8_t);
}

Example #11

0

Show file

File: iht4x4_add_neon.c Project: jmvalin/aom

static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d6s16 = vget_low_s16(*q3s16);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  q10s32 = vmull_s16(*d3s16, d16s16);
  q11s32 = vmull_s16(*d4s16, d16s16);
  q12s32 = vmull_s16(d6s16, d17s16);
  q13s32 = vmull_s16(*d5s16, d18s16);
  q14s32 = vmull_s16(*d3s16, d18s16);
  q15s32 = vmovl_s16(d16s16);
  q15s32 = vaddw_s16(q15s32, d19s16);
  q8s32 = vmull_s16(*d4s16, d19s16);
  q15s32 = vsubw_s16(q15s32, d18s16);
  q9s32 = vmull_s16(*d5s16, d19s16);

  q10s32 = vaddq_s32(q10s32, q13s32);
  q10s32 = vaddq_s32(q10s32, q8s32);
  q11s32 = vsubq_s32(q11s32, q14s32);
  q8s32 = vdupq_n_s32(sinpi_3_9);
  q11s32 = vsubq_s32(q11s32, q9s32);
  q15s32 = vmulq_s32(q15s32, q8s32);

  q13s32 = vaddq_s32(q10s32, q12s32);
  q10s32 = vaddq_s32(q10s32, q11s32);
  q14s32 = vaddq_s32(q11s32, q12s32);
  q10s32 = vsubq_s32(q10s32, q12s32);

  d16s16 = vqrshrn_n_s32(q13s32, 14);
  d17s16 = vqrshrn_n_s32(q14s32, 14);
  d18s16 = vqrshrn_n_s32(q15s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);

  *q8s16 = vcombine_s16(d16s16, d17s16);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  return;
}

Example #12

0

Show file

File: qdrawhelper_neon.cpp Project: Nacto1/qt-everywhere-opensource-src-4.6.2

static inline int16x8_t qvsource_over_s16(int16x8_t src16, int16x8_t dst16, int16x8_t half, int16x8_t full)
{
    const int16x4_t alpha16_high = vdup_lane_s16(vget_high_s16(src16), 3);
    const int16x4_t alpha16_low = vdup_lane_s16(vget_low_s16(src16), 3);

    const int16x8_t alpha16 = vsubq_s16(full, vcombine_s16(alpha16_low, alpha16_high));

    return vaddq_s16(src16, qvbyte_mul_s16(dst16, alpha16, half));
}

Example #13

0

Show file

File: vpx_scaled_convolve8_neon.c Project: webmproject/libvpx

static INLINE void scaledconvolve_vert_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];

    if (y_q4 & SUBPEL_MASK) {
      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
      const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
      const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
      uint8x8_t s[8], d;
      int16x4_t t[8], tt;

      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                  &s[6], &s[7]);
      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));

      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
                       filter3, filter4);
      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }

    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}

Example #14

0

Show file

File: avg_neon.c Project: negge/aom

// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
int aom_satd_neon(const int16_t *coeff, int length) {
  const int16x4_t zero = vdup_n_s16(0);
  int32x4_t accum = vdupq_n_s32(0);

  do {
    const int16x8_t src0 = vld1q_s16(coeff);
    const int16x8_t src8 = vld1q_s16(coeff + 8);
    accum = vabal_s16(accum, vget_low_s16(src0), zero);
    accum = vabal_s16(accum, vget_high_s16(src0), zero);
    accum = vabal_s16(accum, vget_low_s16(src8), zero);
    accum = vabal_s16(accum, vget_high_s16(src8), zero);
    length -= 16;
    coeff += 16;
  } while (length != 0);

  {
    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
                                  vreinterpret_s32_s64(vget_high_s64(s0)));
    const int satd = vget_lane_s32(s1, 0);
    return satd;
  }
}

Example #15

0

Show file

File: aom_convolve8_neon.c Project: kevleyski/FFmpeg

static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
                                       int16x4_t dsrc2, int16x4_t dsrc3,
                                       int16x4_t dsrc4, int16x4_t dsrc5,
                                       int16x4_t dsrc6, int16x4_t dsrc7,
                                       int16x8_t q0s16) {
  int32x4_t qdst;
  int16x4_t d0s16, d1s16;

  d0s16 = vget_low_s16(q0s16);
  d1s16 = vget_high_s16(q0s16);

  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
  return qdst;
}

Example #16

0

Show file

void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  int32x4_t c0 = vld1q_s32(input);
  int32x4_t c1 = vld1q_s32(input + 4);
  int32x4_t c2 = vld1q_s32(input + 8);
  int32x4_t c3 = vld1q_s32(input + 12);
  int16x8_t a0, a1;

  if (bd == 8) {
    const int16x4_t cospis = vld1_s16(kCospi);

    // Rows
    a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1));
    a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3));
    idct4x4_16_kernel_bd8(cospis, &a0, &a1);

    // Columns
    a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));
    idct4x4_16_kernel_bd8(cospis, &a0, &a1);
    a0 = vrshrq_n_s16(a0, 4);
    a1 = vrshrq_n_s16(a1, 4);
  } else {
    const int32x4_t cospis = vld1q_s32(kCospi32);

    if (bd == 10) {
      idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
      idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
    } else {
      idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
      idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
    }
    a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4));
    a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4));
  }

  highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max);
  highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max);
}

Example #17

0

Show file

File: vp9_avg_neon.c Project: d4rkuzs/ffmpeg-codecs-libs

// ref, src = [0, 510] - max diff = 16-bits
// bwl = {2, 3, 4}, width = {16, 32, 64}
int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
  int width = 4 << bwl;
  int32x4_t sse = vdupq_n_s32(0);
  int16x8_t total = vdupq_n_s16(0);

  assert(width >= 8);
  assert((width % 8) == 0);

  do {
    const int16x8_t r = vld1q_s16(ref);
    const int16x8_t s = vld1q_s16(src);
    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
    const int16x4_t diff_lo = vget_low_s16(diff);
    const int16x4_t diff_hi = vget_high_s16(diff);
    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
    sse = vmlal_s16(sse, diff_hi, diff_hi);
    total = vaddq_s16(total, diff);  // dynamic range 16 bits.

    ref += 8;
    src += 8;
    width -= 8;
  } while (width != 0);

  {
    // Note: 'total''s pairwise addition could be implemented similarly to
    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
    // with the summation of 'sse' performed better on a Cortex-A15.
    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    const int32x2_t t2 = vpadd_s32(t1, t1);
    const int t = vget_lane_s32(t2, 0);
    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
                                  vreinterpret_s32_s64(vget_high_s64(s0)));
    const int s = vget_lane_s32(s1, 0);
    const int shift_factor = bwl + 2;
    return s - ((t * t) >> shift_factor);
  }
}

Example #18

0

Show file

File: aecm_core_neon.c Project: dreaflove/webrtc-lib

void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore* aecm) {
  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
  assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
  assert((uintptr_t)(aecm->channelAdapt32) % 32 == 0);

  // The C code of following optimized code.
  // for (i = 0; i < PART_LEN1; i++) {
  //   aecm->channelAdapt16[i] = aecm->channelStored[i];
  //   aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
  //              (int32_t)aecm->channelStored[i], 16);
  // }

  int16_t* start_stored_p = aecm->channelStored;
  int16_t* start_adapt16_p = aecm->channelAdapt16;
  int32_t* start_adapt32_p = aecm->channelAdapt32;
  const int16_t* end_stored_p = start_stored_p + PART_LEN;

  int16x8_t stored_v;
  int32x4_t adapt32_v_low, adapt32_v_high;

  while (start_stored_p < end_stored_p) {
    stored_v = vld1q_s16(start_stored_p);
    vst1q_s16(start_adapt16_p, stored_v);

    adapt32_v_low = vshll_n_s16(vget_low_s16(stored_v), 16);
    adapt32_v_high = vshll_n_s16(vget_high_s16(stored_v), 16);

    vst1q_s32(start_adapt32_p, adapt32_v_low);
    vst1q_s32(start_adapt32_p + 4, adapt32_v_high);

    start_stored_p += 8;
    start_adapt16_p += 8;
    start_adapt32_p += 8;
  }
  aecm->channelAdapt16[PART_LEN] = aecm->channelStored[PART_LEN];
  aecm->channelAdapt32[PART_LEN] = (int32_t)aecm->channelStored[PART_LEN] << 16;
}

Example #19

0

Show file

File: aecm_core_neon.c Project: dreaflove/webrtc-lib

void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm,
                                       const uint16_t* far_spectrum,
                                       int32_t* echo_est,
                                       uint32_t* far_energy,
                                       uint32_t* echo_energy_adapt,
                                       uint32_t* echo_energy_stored) {
  int16_t* start_stored_p = aecm->channelStored;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int32_t* echo_est_p = echo_est;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  const uint16_t* far_spectrum_p = far_spectrum;
  int16x8_t store_v, adapt_v;
  uint16x8_t spectrum_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;
  uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;

  far_energy_v = vdupq_n_u32(0);
  echo_adapt_v = vdupq_n_u32(0);
  echo_stored_v = vdupq_n_u32(0);

  // Get energy for the delayed far end signal and estimated
  // echo using both stored and adapted channels.
  // The C code:
  //  for (i = 0; i < PART_LEN1; i++) {
  //      echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                         far_spectrum[i]);
  //      (*far_energy) += (uint32_t)(far_spectrum[i]);
  //      *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
  //      (*echo_energy_stored) += (uint32_t)echo_est[i];
  //  }
  while (start_stored_p < end_stored_p) {
    spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);
    store_v = vld1q_s16(start_stored_p);

    far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
    far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));

    echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
                               vget_low_u16(spectrum_v));
    echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
                                vget_high_u16(spectrum_v));
    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
    echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);

    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_low_s16(adapt_v)),
                             vget_low_u16(spectrum_v));
    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_high_s16(adapt_v)),
                             vget_high_u16(spectrum_v));

    start_stored_p += 8;
    start_adapt_p += 8;
    far_spectrum_p += 8;
    echo_est_p += 8;
  }

  AddLanes(far_energy, far_energy_v);
  AddLanes(echo_energy_stored, echo_stored_v);
  AddLanes(echo_energy_adapt, echo_adapt_v);

  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
  *far_energy += (uint32_t)far_spectrum[PART_LEN];
  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
}

Example #20

0

Show file

File: idct8x8_add_neon.c Project: 93i/godot

static INLINE void IDCT8x8_1D(
        int16x8_t *q8s16,
        int16x8_t *q9s16,
        int16x8_t *q10s16,
        int16x8_t *q11s16,
        int16x8_t *q12s16,
        int16x8_t *q13s16,
        int16x8_t *q14s16,
        int16x8_t *q15s16) {
    int16x4_t d0s16, d1s16, d2s16, d3s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;

    d0s16 = vdup_n_s16(cospi_28_64);
    d1s16 = vdup_n_s16(cospi_4_64);
    d2s16 = vdup_n_s16(cospi_12_64);
    d3s16 = vdup_n_s16(cospi_20_64);

    d16s16 = vget_low_s16(*q8s16);
    d17s16 = vget_high_s16(*q8s16);
    d18s16 = vget_low_s16(*q9s16);
    d19s16 = vget_high_s16(*q9s16);
    d20s16 = vget_low_s16(*q10s16);
    d21s16 = vget_high_s16(*q10s16);
    d22s16 = vget_low_s16(*q11s16);
    d23s16 = vget_high_s16(*q11s16);
    d24s16 = vget_low_s16(*q12s16);
    d25s16 = vget_high_s16(*q12s16);
    d26s16 = vget_low_s16(*q13s16);
    d27s16 = vget_high_s16(*q13s16);
    d28s16 = vget_low_s16(*q14s16);
    d29s16 = vget_high_s16(*q14s16);
    d30s16 = vget_low_s16(*q15s16);
    d31s16 = vget_high_s16(*q15s16);

    q2s32 = vmull_s16(d18s16, d0s16);
    q3s32 = vmull_s16(d19s16, d0s16);
    q5s32 = vmull_s16(d26s16, d2s16);
    q6s32 = vmull_s16(d27s16, d2s16);

    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);

    d8s16 = vqrshrn_n_s32(q2s32, 14);
    d9s16 = vqrshrn_n_s32(q3s32, 14);
    d10s16 = vqrshrn_n_s32(q5s32, 14);
    d11s16 = vqrshrn_n_s32(q6s32, 14);
    q4s16 = vcombine_s16(d8s16, d9s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    q2s32 = vmull_s16(d18s16, d1s16);
    q3s32 = vmull_s16(d19s16, d1s16);
    q9s32 = vmull_s16(d26s16, d3s16);
    q13s32 = vmull_s16(d27s16, d3s16);

    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);

    d14s16 = vqrshrn_n_s32(q2s32, 14);
    d15s16 = vqrshrn_n_s32(q3s32, 14);
    d12s16 = vqrshrn_n_s32(q9s32, 14);
    d13s16 = vqrshrn_n_s32(q13s32, 14);
    q6s16 = vcombine_s16(d12s16, d13s16);
    q7s16 = vcombine_s16(d14s16, d15s16);

    d0s16 = vdup_n_s16(cospi_16_64);

    q2s32 = vmull_s16(d16s16, d0s16);
    q3s32 = vmull_s16(d17s16, d0s16);
    q13s32 = vmull_s16(d16s16, d0s16);
    q15s32 = vmull_s16(d17s16, d0s16);

    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);

    d0s16 = vdup_n_s16(cospi_24_64);
    d1s16 = vdup_n_s16(cospi_8_64);

    d18s16 = vqrshrn_n_s32(q2s32, 14);
    d19s16 = vqrshrn_n_s32(q3s32, 14);
    d22s16 = vqrshrn_n_s32(q13s32, 14);
    d23s16 = vqrshrn_n_s32(q15s32, 14);
    *q9s16 = vcombine_s16(d18s16, d19s16);
    *q11s16 = vcombine_s16(d22s16, d23s16);

    q2s32 = vmull_s16(d20s16, d0s16);
    q3s32 = vmull_s16(d21s16, d0s16);
    q8s32 = vmull_s16(d20s16, d1s16);
    q12s32 = vmull_s16(d21s16, d1s16);

    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);

    d26s16 = vqrshrn_n_s32(q2s32, 14);
    d27s16 = vqrshrn_n_s32(q3s32, 14);
    d30s16 = vqrshrn_n_s32(q8s32, 14);
    d31s16 = vqrshrn_n_s32(q12s32, 14);
    *q13s16 = vcombine_s16(d26s16, d27s16);
    *q15s16 = vcombine_s16(d30s16, d31s16);

    q0s16 = vaddq_s16(*q9s16, *q15s16);
    q1s16 = vaddq_s16(*q11s16, *q13s16);
    q2s16 = vsubq_s16(*q11s16, *q13s16);
    q3s16 = vsubq_s16(*q9s16, *q15s16);

    *q13s16 = vsubq_s16(q4s16, q5s16);
    q4s16 = vaddq_s16(q4s16, q5s16);
    *q14s16 = vsubq_s16(q7s16, q6s16);
    q7s16 = vaddq_s16(q7s16, q6s16);
    d26s16 = vget_low_s16(*q13s16);
    d27s16 = vget_high_s16(*q13s16);
    d28s16 = vget_low_s16(*q14s16);
    d29s16 = vget_high_s16(*q14s16);

    d16s16 = vdup_n_s16(cospi_16_64);

    q9s32 = vmull_s16(d28s16, d16s16);
    q10s32 = vmull_s16(d29s16, d16s16);
    q11s32 = vmull_s16(d28s16, d16s16);
    q12s32 = vmull_s16(d29s16, d16s16);

    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

    d10s16 = vqrshrn_n_s32(q9s32, 14);
    d11s16 = vqrshrn_n_s32(q10s32, 14);
    d12s16 = vqrshrn_n_s32(q11s32, 14);
    d13s16 = vqrshrn_n_s32(q12s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    *q8s16 = vaddq_s16(q0s16, q7s16);
    *q9s16 = vaddq_s16(q1s16, q6s16);
    *q10s16 = vaddq_s16(q2s16, q5s16);
    *q11s16 = vaddq_s16(q3s16, q4s16);
    *q12s16 = vsubq_s16(q3s16, q4s16);
    *q13s16 = vsubq_s16(q2s16, q5s16);
    *q14s16 = vsubq_s16(q1s16, q6s16);
    *q15s16 = vsubq_s16(q0s16, q7s16);
    return;
}

Example #21

0

Show file

File: idct8x8_add_neon.c Project: 93i/godot

void vpx_idct8x8_12_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
    uint8_t *d1, *d2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8;
    int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
    int16x4_t d26s16, d27s16, d28s16, d29s16;
    uint64x1_t d0u64, d1u64, d2u64, d3u64;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16;
    int32x4_t q9s32, q10s32, q11s32, q12s32;

    q8s16 = vld1q_s16(input);
    q9s16 = vld1q_s16(input + 8);
    q10s16 = vld1q_s16(input + 16);
    q11s16 = vld1q_s16(input + 24);
    q12s16 = vld1q_s16(input + 32);
    q13s16 = vld1q_s16(input + 40);
    q14s16 = vld1q_s16(input + 48);
    q15s16 = vld1q_s16(input + 56);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    // First transform rows
    // stage 1
    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
    q1s16 = vdupq_n_s16(cospi_4_64 * 2);

    q4s16 = vqrdmulhq_s16(q9s16, q0s16);

    q0s16 = vdupq_n_s16(-cospi_20_64 * 2);

    q7s16 = vqrdmulhq_s16(q9s16, q1s16);

    q1s16 = vdupq_n_s16(cospi_12_64 * 2);

    q5s16 = vqrdmulhq_s16(q11s16, q0s16);

    q0s16 = vdupq_n_s16(cospi_16_64 * 2);

    q6s16 = vqrdmulhq_s16(q11s16, q1s16);

    // stage 2 & stage 3 - even half
    q1s16 = vdupq_n_s16(cospi_24_64 * 2);

    q9s16 = vqrdmulhq_s16(q8s16, q0s16);

    q0s16 = vdupq_n_s16(cospi_8_64 * 2);

    q13s16 = vqrdmulhq_s16(q10s16, q1s16);

    q15s16 = vqrdmulhq_s16(q10s16, q0s16);

    // stage 3 -odd half
    q0s16 = vaddq_s16(q9s16, q15s16);
    q1s16 = vaddq_s16(q9s16, q13s16);
    q2s16 = vsubq_s16(q9s16, q13s16);
    q3s16 = vsubq_s16(q9s16, q15s16);

    // stage 2 - odd half
    q13s16 = vsubq_s16(q4s16, q5s16);
    q4s16 = vaddq_s16(q4s16, q5s16);
    q14s16 = vsubq_s16(q7s16, q6s16);
    q7s16 = vaddq_s16(q7s16, q6s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);

    d16s16 = vdup_n_s16(cospi_16_64);
    q9s32 = vmull_s16(d28s16, d16s16);
    q10s32 = vmull_s16(d29s16, d16s16);
    q11s32 = vmull_s16(d28s16, d16s16);
    q12s32 = vmull_s16(d29s16, d16s16);

    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

    d10s16 = vqrshrn_n_s32(q9s32, 14);
    d11s16 = vqrshrn_n_s32(q10s32, 14);
    d12s16 = vqrshrn_n_s32(q11s32, 14);
    d13s16 = vqrshrn_n_s32(q12s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    // stage 4
    q8s16 = vaddq_s16(q0s16, q7s16);
    q9s16 = vaddq_s16(q1s16, q6s16);
    q10s16 = vaddq_s16(q2s16, q5s16);
    q11s16 = vaddq_s16(q3s16, q4s16);
    q12s16 = vsubq_s16(q3s16, q4s16);
    q13s16 = vsubq_s16(q2s16, q5s16);
    q14s16 = vsubq_s16(q1s16, q6s16);
    q15s16 = vsubq_s16(q0s16, q7s16);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

    q8s16 = vrshrq_n_s16(q8s16, 5);
    q9s16 = vrshrq_n_s16(q9s16, 5);
    q10s16 = vrshrq_n_s16(q10s16, 5);
    q11s16 = vrshrq_n_s16(q11s16, 5);
    q12s16 = vrshrq_n_s16(q12s16, 5);
    q13s16 = vrshrq_n_s16(q13s16, 5);
    q14s16 = vrshrq_n_s16(q14s16, 5);
    q15s16 = vrshrq_n_s16(q15s16, 5);

    d1 = d2 = dest;

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;

    q8s16 = q12s16;
    q9s16 = q13s16;
    q10s16 = q14s16;
    q11s16 = q15s16;

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;
    return;
}

Example #22

0

Show file

File: idct8x8_add_neon.c Project: 93i/godot

static INLINE void TRANSPOSE8X8(
        int16x8_t *q8s16,
        int16x8_t *q9s16,
        int16x8_t *q10s16,
        int16x8_t *q11s16,
        int16x8_t *q12s16,
        int16x8_t *q13s16,
        int16x8_t *q14s16,
        int16x8_t *q15s16) {
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
    int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

    d16s16 = vget_low_s16(*q8s16);
    d17s16 = vget_high_s16(*q8s16);
    d18s16 = vget_low_s16(*q9s16);
    d19s16 = vget_high_s16(*q9s16);
    d20s16 = vget_low_s16(*q10s16);
    d21s16 = vget_high_s16(*q10s16);
    d22s16 = vget_low_s16(*q11s16);
    d23s16 = vget_high_s16(*q11s16);
    d24s16 = vget_low_s16(*q12s16);
    d25s16 = vget_high_s16(*q12s16);
    d26s16 = vget_low_s16(*q13s16);
    d27s16 = vget_high_s16(*q13s16);
    d28s16 = vget_low_s16(*q14s16);
    d29s16 = vget_high_s16(*q14s16);
    d30s16 = vget_low_s16(*q15s16);
    d31s16 = vget_high_s16(*q15s16);

    *q8s16  = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
    *q9s16  = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
    *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
    *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
    *q12s16 = vcombine_s16(d17s16, d25s16);
    *q13s16 = vcombine_s16(d19s16, d27s16);
    *q14s16 = vcombine_s16(d21s16, d29s16);
    *q15s16 = vcombine_s16(d23s16, d31s16);

    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
                        vreinterpretq_s32_s16(*q10s16));
    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
                        vreinterpretq_s32_s16(*q11s16));
    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
                        vreinterpretq_s32_s16(*q14s16));
    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
                        vreinterpretq_s32_s16(*q15s16));

    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

    *q8s16  = q0x2s16.val[0];
    *q9s16  = q0x2s16.val[1];
    *q10s16 = q1x2s16.val[0];
    *q11s16 = q1x2s16.val[1];
    *q12s16 = q2x2s16.val[0];
    *q13s16 = q2x2s16.val[1];
    *q14s16 = q3x2s16.val[0];
    *q15s16 = q3x2s16.val[1];
    return;
}

Example #23

0

Show file

File: NE10_resize.neon.c Project: HarveyLiuFly/Ne10

void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count,
        const int* xofs, const short* alpha,
        int swidth, int dwidth, int cn, int xmin, int xmax)
{
    int dx, k;
    int dx0 = 0;

    int16x4x2_t alpha_vec;

    uint8x8_t dS0_vec, dS1_vec;
    int16x8_t qS0_vec, qS1_vec;
    int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567;

    int32x4_t qT0_vec, qT1_vec;

    int16x4_t dCoeff;
    dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE);

    for (k = 0; k <= count - 2; k++)
    {
        const unsigned char *S0 = src[k], *S1 = src[k + 1];
        int *D0 = dst[k], *D1 = dst[k + 1];

        for (dx = dx0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);
            dS1_4567 = vget_high_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);
            qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, dCoeff);
            qT1_vec = vmull_s16 (dS1_0123, dCoeff);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }
    }

    for (; k < count; k++)
    {
        const unsigned char *S = src[k];
        int *D = dst[k];
        for (dx = 0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);

            vst1q_s32 (&D[dx], qT0_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            dS0_0123 = vget_low_s16 (qS0_vec);
            qT0_vec = vmull_s16 (dS0_0123, dCoeff);

            vst1q_s32 (&D[dx], qT0_vec);
        }
    }
}

Example #24

0

Show file

File: shortidct4x4llm_neon.c Project: Corax26/libvpx

void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
                               int pred_stride, unsigned char *dst_ptr,
                               int dst_stride) {
  int i;
  uint32x2_t d6u32 = vdup_n_u32(0);
  uint8x8_t d1u8;
  int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
  uint16x8_t q1u16;
  int16x8_t q1s16, q2s16, q3s16, q4s16;
  int32x2x2_t v2tmp0, v2tmp1;
  int16x4x2_t v2tmp2, v2tmp3;

  d2 = vld1_s16(input);
  d3 = vld1_s16(input + 4);
  d4 = vld1_s16(input + 8);
  d5 = vld1_s16(input + 12);

  // 1st for loop
  q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
  q2s16 = vcombine_s16(d3, d5);

  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

  q3s16 = vshrq_n_s16(q3s16, 1);
  q4s16 = vshrq_n_s16(q4s16, 1);

  q3s16 = vqaddq_s16(q3s16, q2s16);
  q4s16 = vqaddq_s16(q4s16, q2s16);

  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1

  d2 = vqadd_s16(d12, d11);
  d3 = vqadd_s16(d13, d10);
  d4 = vqsub_s16(d13, d10);
  d5 = vqsub_s16(d12, d11);

  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                    vreinterpret_s16_s32(v2tmp1.val[0]));
  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                    vreinterpret_s16_s32(v2tmp1.val[1]));

  // 2nd for loop
  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
  q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);

  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

  q3s16 = vshrq_n_s16(q3s16, 1);
  q4s16 = vshrq_n_s16(q4s16, 1);

  q3s16 = vqaddq_s16(q3s16, q2s16);
  q4s16 = vqaddq_s16(q4s16, q2s16);

  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1

  d2 = vqadd_s16(d12, d11);
  d3 = vqadd_s16(d13, d10);
  d4 = vqsub_s16(d13, d10);
  d5 = vqsub_s16(d12, d11);

  d2 = vrshr_n_s16(d2, 3);
  d3 = vrshr_n_s16(d3, 3);
  d4 = vrshr_n_s16(d4, 3);
  d5 = vrshr_n_s16(d5, 3);

  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                    vreinterpret_s16_s32(v2tmp1.val[0]));
  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                    vreinterpret_s16_s32(v2tmp1.val[1]));

  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
  q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);

  // dc_only_idct_add
  for (i = 0; i < 2; i++, q1s16 = q2s16) {
    d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
    pred_ptr += pred_stride;
    d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
    pred_ptr += pred_stride;

    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));

    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
    dst_ptr += dst_stride;
    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
    dst_ptr += dst_stride;
  }
  return;
}

Example #25

0

Show file

File: vp9_highbd_iht8x8_add_neon.c Project: wolfviking0/webcl-webkit

void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
                                   int stride, int tx_type, int bd) {
  int32x4_t a[16];
  int16x8_t c[8];

  a[0] = vld1q_s32(input);
  a[1] = vld1q_s32(input + 4);
  a[2] = vld1q_s32(input + 8);
  a[3] = vld1q_s32(input + 12);
  a[4] = vld1q_s32(input + 16);
  a[5] = vld1q_s32(input + 20);
  a[6] = vld1q_s32(input + 24);
  a[7] = vld1q_s32(input + 28);
  a[8] = vld1q_s32(input + 32);
  a[9] = vld1q_s32(input + 36);
  a[10] = vld1q_s32(input + 40);
  a[11] = vld1q_s32(input + 44);
  a[12] = vld1q_s32(input + 48);
  a[13] = vld1q_s32(input + 52);
  a[14] = vld1q_s32(input + 56);
  a[15] = vld1q_s32(input + 60);

  if (bd == 8) {
    c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1]));
    c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3]));
    c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5]));
    c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7]));
    c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9]));
    c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11]));
    c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13]));
    c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15]));

    switch (tx_type) {
      case DCT_DCT: {
        const int16x8_t cospis = vld1q_s16(kCospi);
        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28

        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        break;
      }

      case ADST_DCT: {
        const int16x8_t cospis = vld1q_s16(kCospi);
        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28

        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        break;
      }

      case DCT_ADST: {
        const int16x8_t cospis = vld1q_s16(kCospi);
        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28

        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        break;
      }

      default: {
        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        break;
      }
    }

    c[0] = vrshrq_n_s16(c[0], 5);
    c[1] = vrshrq_n_s16(c[1], 5);
    c[2] = vrshrq_n_s16(c[2], 5);
    c[3] = vrshrq_n_s16(c[3], 5);
    c[4] = vrshrq_n_s16(c[4], 5);
    c[5] = vrshrq_n_s16(c[5], 5);
    c[6] = vrshrq_n_s16(c[6], 5);
    c[7] = vrshrq_n_s16(c[7], 5);
  } else {
    switch (tx_type) {
      case DCT_DCT: {
        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
        const int32x4_t cospis1 =
            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

        if (bd == 10) {
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        } else {
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        }
        break;
      }

      case ADST_DCT: {
        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
        const int32x4_t cospis1 =
            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

        if (bd == 10) {
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                            &a[11]);
          iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                            &a[15]);
          iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        } else {
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                            &a[11]);
          iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                            &a[15]);
          iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        }
        break;
      }

      case DCT_ADST: {
        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
        const int32x4_t cospis1 =
            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

        if (bd == 10) {
          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                            &a[7]);
          iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
                            &a[14], &a[15]);
          iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        } else {
          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                            &a[7]);
          iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
                            &a[14], &a[15]);
          iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        }
        break;
      }

      default: {
        assert(tx_type == ADST_ADST);
        if (bd == 10) {
          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                            &a[7]);
          iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
                            &a[14], &a[15]);
          iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                            &a[11]);
          iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                            &a[15]);
          iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        } else {
          transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                            &a[7]);
          iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
          transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13],
                            &a[14], &a[15]);
          iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
          transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                            &a[11]);
          iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
          transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                            &a[15]);
          iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        }
        break;
      }
    }

    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
    c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
    c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
    c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
    c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
    c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
    c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
  }
  highbd_add8x8(c, dest, stride, bd);
}

Example #26

0

Show file

File: aarch64-neon-vget-hilo.c Project: Chxnew/minix

int16x4_t test_vget_low_s16(int16x8_t a) {
  // CHECK-LABEL: test_vget_low_s16:
  return vget_low_s16(a);
  // CHECK-NEXT: ret
}

Example #27

0

Show file

File: idct4x4_add_neon.c Project: Wafflespeanut/gecko-dev

void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t d26u8, d27u8;
  uint32x2_t d26u32, d27u32;
  uint16x8_t q8u16, q9u16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
  int16x8_t q8s16, q9s16, q13s16, q14s16;
  int32x4_t q1s32, q13s32, q14s32, q15s32;
  int16x4x2_t d0x2s16, d1x2s16;
  int32x4x2_t q0x2s32;
  uint8_t *d;

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);

  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
  d21s16 = vdup_n_s16((int16_t)cospi_16_64);

  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  d22s16 = vdup_n_s16((int16_t)cospi_24_64);

  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);

  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
  d19s16 = vget_low_s16(q9s16);

  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  // do the transform on columns
  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);

  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  d = dest;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
  d += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));

  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  d = dest;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
  return;
}

Example #28

0

Show file

File: vp9_quantize_neon.c Project: bear101/libvpx

void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                          int skip_block, const int16_t *zbin_ptr,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  if (!skip_block) {
    // Quantization pass: All coefficients with index >= zero_flag are
    // skippable. Note: zero_flag can be zero.
    int i;
    const int16x8_t v_zero = vdupq_n_s16(0);
    const int16x8_t v_one = vdupq_n_s16(1);
    int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
    int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
    int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
    int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
    // adjust for dc
    v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
    v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
    v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
    // process dc and the first seven ac coeffs
    {
      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
      const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
      const int32x4_t v_tmp_lo =
          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
      const int32x4_t v_tmp_hi =
          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
      const int16x8_t v_tmp2 =
          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
      store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
      store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
      v_round = vmovq_n_s16(round_ptr[1]);
      v_quant = vmovq_n_s16(quant_ptr[1]);
      v_dequant = vmovq_n_s16(dequant_ptr[1]);
    }
    // now process the rest of the ac coeffs
    for (i = 8; i < count; i += 8) {
      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
      const int32x4_t v_tmp_lo =
          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
      const int32x4_t v_tmp_hi =
          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
      const int16x8_t v_tmp2 =
          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
      store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
      store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
    }
    {
      const int16x4_t v_eobmax_3210 = vmax_s16(
          vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
      const int64x1_t v_eobmax_xx32 =
          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
      const int16x4_t v_eobmax_tmp =
          vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
      const int64x1_t v_eobmax_xxx3 =
          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
      const int16x4_t v_eobmax_final =
          vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));

      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
    }
  } else {
    memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
    memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
    *eob_ptr = 0;
  }
}

Example #29

0

Show file

File: vpx_scaled_convolve8_neon.c Project: webmproject/libvpx

static INLINE void scaledconvolve_horiz_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;

  src -= SUBPEL_TAPS / 2 - 1;

  y = h;
  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
          uint8x8_t s[8], d;
          int16x8_t ss[4];
          int16x4_t t[8], tt;

          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);

          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
          t[0] = vget_low_s16(ss[0]);
          t[1] = vget_low_s16(ss[1]);
          t[2] = vget_low_s16(ss[2]);
          t[3] = vget_low_s16(ss[3]);
          t[4] = vget_high_s16(ss[0]);
          t[5] = vget_high_s16(ss[1]);
          t[6] = vget_high_s16(ss[2]);
          t[7] = vget_high_s16(ss[3]);

          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
                           filters, filter3, filter4);
          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filters values back to dst
      {
        const uint8x8x4_t d4 = vld4_u8(temp);
        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
                      vreinterpret_u32_u8(d4.val[0]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
                      vreinterpret_u32_u8(d4.val[1]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
                      vreinterpret_u32_u8(d4.val[2]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
                      vreinterpret_u32_u8(d4.val[3]), 0);
      }
      x += 4;
    } while (x < w);

    src += src_stride * 4;
    dst += dst_stride * 4;
    y -= 4;
  } while (y > 0);
}

Example #30

0

Show file

File: b9193874-neon.c Project: 007gzs/android-platform-ndk

void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y, const uint8_t *top_u, const uint8_t *top_v, const uint8_t *cur_u, const uint8_t *cur_v, uint8_t *top_dst, uint8_t *bottom_dst, int len)
{
    int block;
    uint8_t uv_buf[2 * 32 + 15];
    uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);
    const int uv_len = (len + 1) >> 1;
    const int num_blocks = (uv_len - 1) >> 3;
    const int leftover = uv_len - num_blocks * 8;
    const int last_pos = 1 + 16 * num_blocks;
    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;
    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;
    const int16x4_t cf16 = vld1_s16(coef);
    const int32x2_t cf32 = vmov_n_s32(76283);
    const uint8x8_t u16 = vmov_n_u8(16);
    const uint8x8_t u128 = vmov_n_u8(128);
    for (block = 0; block < num_blocks; ++block) {
        {
            uint8x8_t a = vld1_u8(top_u);
            uint8x8_t b = vld1_u8(top_u + 1);
            uint8x8_t c = vld1_u8(cur_u);
            uint8x8_t d = vld1_u8(cur_u + 1);
            uint16x8_t al = vshll_n_u8(a, 1);
            uint16x8_t bl = vshll_n_u8(b, 1);
            uint16x8_t cl = vshll_n_u8(c, 1);
            uint16x8_t dl = vshll_n_u8(d, 1);
            uint8x8_t diag1, diag2;
            uint16x8_t sl;
            sl = vaddl_u8(a, b);
            sl = vaddw_u8(sl, c);
            sl = vaddw_u8(sl, d);
            al = vaddq_u16(sl, al);
            bl = vaddq_u16(sl, bl);
            al = vaddq_u16(al, dl);
            bl = vaddq_u16(bl, cl);
            diag2 = vshrn_n_u16(al, 3);
            diag1 = vshrn_n_u16(bl, 3);
            a = vrhadd_u8(a, diag1);
            b = vrhadd_u8(b, diag2);
            c = vrhadd_u8(c, diag2);
            d = vrhadd_u8(d, diag1);
            {
                const uint8x8x2_t a_b = {{ a, b }};
                const uint8x8x2_t c_d = {{ c, d }};
                vst2_u8(r_uv, a_b);
                vst2_u8(r_uv + 32, c_d);
            }
        }
        {
            uint8x8_t a = vld1_u8(top_v);
            uint8x8_t b = vld1_u8(top_v + 1);
            uint8x8_t c = vld1_u8(cur_v);
            uint8x8_t d = vld1_u8(cur_v + 1);
            uint16x8_t al = vshll_n_u8(a, 1);
            uint16x8_t bl = vshll_n_u8(b, 1);
            uint16x8_t cl = vshll_n_u8(c, 1);
            uint16x8_t dl = vshll_n_u8(d, 1);
            uint8x8_t diag1, diag2;
            uint16x8_t sl;
            sl = vaddl_u8(a, b);
            sl = vaddw_u8(sl, c);
            sl = vaddw_u8(sl, d);
            al = vaddq_u16(sl, al);
            bl = vaddq_u16(sl, bl);
            al = vaddq_u16(al, dl);
            bl = vaddq_u16(bl, cl);
            diag2 = vshrn_n_u16(al, 3);
            diag1 = vshrn_n_u16(bl, 3);
            a = vrhadd_u8(a, diag1);
            b = vrhadd_u8(b, diag2);
            c = vrhadd_u8(c, diag2);
            d = vrhadd_u8(d, diag1);
            {
                const uint8x8x2_t a_b = {{ a, b }};
                const uint8x8x2_t c_d = {{ c, d }};
                vst2_u8(r_uv + 16, a_b);
                vst2_u8(r_uv + 16 + 32, c_d);
            }
        }
        {
            if (top_y) {
                {
                    int i;
                    for (i = 0; i < 16; i += 8) {
                        int off = ((16 * block + 1) + i) * 4;
                        uint8x8_t y = vld1_u8(top_y + (16 * block + 1) + i);
                        uint8x8_t u = vld1_u8((r_uv) + i);
                        uint8x8_t v = vld1_u8((r_uv) + i + 16);
                        int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
                        int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
                        int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
                        int16x8_t ud = vshlq_n_s16(uu, 1);
                        int16x8_t vd = vshlq_n_s16(vv, 1);
                        int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
                        int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
                        int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
                        int32x4_t vl = vmovl_s16(vget_low_s16(vv));
                        int32x4_t vh = vmovl_s16(vget_high_s16(vv));
                        int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
                        int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
                        int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
                        int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
                        int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
                        int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
                        int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
                        int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
                        int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
                        int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
                        int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
                        int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
                        int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
                        int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
                        rl = vmulq_lane_s32(rl, cf32, 0);
                        rh = vmulq_lane_s32(rh, cf32, 0);
                        gl = vmulq_lane_s32(gl, cf32, 0);
                        gh = vmulq_lane_s32(gh, cf32, 0);
                        bl = vmulq_lane_s32(bl, cf32, 0);
                        bh = vmulq_lane_s32(bh, cf32, 0);
                        y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
                        u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
                        v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
                        do {
                            const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
                            vst4_u8(top_dst + off, r_g_b_v255); 
                        } while (0);
                    }
                }
            }
            if (bottom_y) {
                {
                    int i;
                    for (i = 0; i < 16; i += 8) {
                        int off = ((16 * block + 1) + i) * 4;
                        uint8x8_t y = vld1_u8(bottom_y + (16 * block + 1) + i);
                        uint8x8_t u = vld1_u8(((r_uv) + 32) + i);
                        uint8x8_t v = vld1_u8(((r_uv) + 32) + i + 16);
                        int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
                        int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
                        int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
                        int16x8_t ud = vshlq_n_s16(uu, 1);
                        int16x8_t vd = vshlq_n_s16(vv, 1);
                        int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
                        int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
                        int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
                        int32x4_t vl = vmovl_s16(vget_low_s16(vv));
                        int32x4_t vh = vmovl_s16(vget_high_s16(vv));
                        int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
                        int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
                        int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
                        int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
                        int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
                        int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
                        int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
                        int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
                        int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
                        int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
                        int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
                        int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
                        int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
                        int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
                        rl = vmulq_lane_s32(rl, cf32, 0);
                        rh = vmulq_lane_s32(rh, cf32, 0);
                        gl = vmulq_lane_s32(gl, cf32, 0);
                        gh = vmulq_lane_s32(gh, cf32, 0);
                        bl = vmulq_lane_s32(bl, cf32, 0);
                        bh = vmulq_lane_s32(bh, cf32, 0);
                        y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
                        u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
                        v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
                        do {
                            const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
                            vst4_u8(bottom_dst + off, r_g_b_v255);
                        } while (0);
                    }
                }
            }
        }
    }
}