int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N, //must be a multiple of 8
                    uint8_t output_shift)
{

  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
  int32_t result;

  x128 = (__m128i*) x;
  y128 = (__m128i*) y;

  mmcumul_re = _mm_setzero_si128();
  mmcumul_im = _mm_setzero_si128();

  for (n=0; n<(N>>2); n++) {

    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
    //    print_shorts("x",&x128[0]);
    //    print_shorts("y",&y128[0]);

    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
    mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
    //    print_ints("re",&mmtmp1);
    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
    //    print_ints("re",&mmcumul_re);


    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
    mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
    //    print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
    //    print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
    //        print_shorts("y",&mmtmp2);

    mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
    //        print_ints("im",&mmtmp3);
    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
    //    print_ints("im",&mmcumul_im);

    x128++;
    y128++;
  }

  // this gives Re Re Im Im
  mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
  //  print_ints("cumul1",&mmcumul);

  // this gives Re Im Re Im
  mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);

  //  print_ints("cumul2",&mmcumul);


  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);
  // extract the lower half
  mmtmp7 = _mm_movepi64_pi64(mmcumul);
  //  print_ints("mmtmp7",&mmtmp7);
  // pack the result
  mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7);
  //  print_shorts("mmtmp7",&mmtmp7);
  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

  return(result);

#elif defined(__arm__)
  int16x4_t *x_128=(int16x4_t*)x;
  int16x4_t *y_128=(int16x4_t*)y;
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int32x4_t re_cumul,im_cumul;
  int32x2_t re_cumul2,im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift); 
  int32x2x2_t result2;
  int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0); 

  for (n=0; n<(N>>2); n++) {

    tmp_re  = vmull_s16(*x_128++, *y_128++);
    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])] 
    tmp_re1 = vmull_s16(*x_128++, *y_128++);
    //tmp_re1 = [Re(x[2])Re(y[2]) Im(x[2])Im(y[2]) Re(x[3])Re(y[3]) Im(x[3])Im(y[3])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re = [Re(x[0])Re(y[0])+Im(x[0])Im(y[0]) Re(x[1])Re(y[1])+Im(x[1])Im(y[1]) Re(x[2])Re(y[2])+Im(x[2])Im(y[2]) Re(x[3])Re(y[3])+Im(x[3])Im(y[3])]

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im = [-Im(x[0])Re(y[0]) Re(x[0])Im(y[0]) -Im(x[1])Re(y[1]) Re(x[1])Im(y[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im1 = [-Im(x[2])Re(y[2]) Re(x[2])Im(y[2]) -Im(x[3])Re(y[3]) Re(x[3])Im(y[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im = [-Im(x[0])Re(y[0])+Re(x[0])Im(y[0]) -Im(x[1])Re(y[1])+Re(x[1])Im(y[1]) -Im(x[2])Re(y[2])+Re(x[2])Im(y[2]) -Im(x[3])Re(y[3])+Re(x[3])Im(y[3])]

    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
  }
  
  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
  result2   = vzip_s32(re_cumul2,im_cumul2);
  return(vget_lane_s32(result2.val[0],0));
#endif
}
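// A minimal usage sketch for dot_product above (this driver is illustrative and
// not part of the original source). x holds interleaved [Re Im] int16 samples;
// the returned int32 packs the saturated accumulation, real part in the low
// 16 bits and imaginary part in the high 16 bits.
#include <stdint.h>
#include <stdio.h>

int32_t dot_product(int16_t *x, int16_t *y, uint32_t N, uint8_t output_shift);

int main(void) {
  // 16 complex samples, all equal to 10 + 0j, 16-byte aligned for the 128-bit loads.
  int16_t x[32] __attribute__((aligned(16)));
  for (int i = 0; i < 32; i += 2) { x[i] = 10; x[i + 1] = 0; }
  // Correlate the first 8 complex samples with themselves, no downshift;
  // on the SSE path this prints re=800 im=0.
  int32_t r = dot_product(x, x, 8, 0);
  printf("re=%d im=%d\n", (int16_t)(r & 0xffff), (int16_t)(r >> 16));
  return 0;
}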
Example #2
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
    int32x4x2_t cd = vtrnq_s32(c, d);
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
Example #3
static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
                              int16x4_t *d2s16, int16x8_t *q8s16,
                              int16x8_t *q9s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
  int16x4_t d26s16, d27s16, d28s16, d29s16;
  int32x4_t q10s32, q13s32, q14s32, q15s32;
  int16x8_t q13s16, q14s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, *d2s16);
  q10s32 = vmull_s16(d17s16, *d0s16);
  q13s32 = vmull_s16(d23s16, *d1s16);
  q14s32 = vmull_s16(d24s16, *d1s16);
  q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
  q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q10s32, 14);

  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);
  *q8s16 = vaddq_s16(q13s16, q14s16);
  *q9s16 = vsubq_s16(q13s16, q14s16);
  *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16));  // vswp
  return;
}
Example #4
unsigned int vpx_get4x4sse_cs_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride) {
    int16x4_t d22s16, d24s16, d26s16, d28s16;
    int64x1_t d0s64;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int32x4_t q7s32, q8s32, q9s32, q10s32;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int64x2_t q1s64;

    d0u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d4u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d1u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d5u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d6u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d3u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d7u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;

    q11u16 = vsubl_u8(d0u8, d4u8);
    q12u16 = vsubl_u8(d1u8, d5u8);
    q13u16 = vsubl_u8(d2u8, d6u8);
    q14u16 = vsubl_u8(d3u8, d7u8);

    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));

    q7s32 = vmull_s16(d22s16, d22s16);
    q8s32 = vmull_s16(d24s16, d24s16);
    q9s32 = vmull_s16(d26s16, d26s16);
    q10s32 = vmull_s16(d28s16, d28s16);

    q7s32 = vaddq_s32(q7s32, q8s32);
    q9s32 = vaddq_s32(q9s32, q10s32);
    q9s32 = vaddq_s32(q7s32, q9s32);

    q1s64 = vpaddlq_s32(q9s32);
    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
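// A plain-C reference for vpx_get4x4sse_cs_neon above (a comparison sketch, not
// taken from libvpx): sum of squared differences over the left 4x4 block, which
// matches the vget_low_s16 halves used by the NEON version.
static unsigned int get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride,
                                   const unsigned char *ref_ptr, int recon_stride) {
  unsigned int sse = 0;
  for (int r = 0; r < 4; r++) {
    for (int c = 0; c < 4; c++) {
      const int diff = src_ptr[c] - ref_ptr[c];
      sse += (unsigned int)(diff * diff);
    }
    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }
  return sse;
}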
Example #5
static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
                                           const int16_t* vector1,
                                           const int16_t* vector2,
                                           size_t length,
                                           int scaling) {
  size_t i = 0;
  size_t len1 = length >> 3;
  size_t len2 = length & 7;
  int64x2_t sum0 = vdupq_n_s64(0);
  int64x2_t sum1 = vdupq_n_s64(0);

  for (i = len1; i > 0; i -= 1) {
    int16x8_t seq1_16x8 = vld1q_s16(vector1);
    int16x8_t seq2_16x8 = vld1q_s16(vector2);
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
#else
    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
                               vget_low_s16(seq2_16x8));
    int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
                               vget_high_s16(seq2_16x8));
#endif
    sum0 = vpadalq_s32(sum0, tmp0);
    sum1 = vpadalq_s32(sum1, tmp1);
    vector1 += 8;
    vector2 += 8;
  }

  // Calculate the rest of the samples.
  int64_t sum_res = 0;
  for (i = len2; i > 0; i -= 1) {
    sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
    vector1++;
    vector2++;
  }

  sum0 = vaddq_s64(sum0, sum1);
#if defined(WEBRTC_ARCH_ARM64)
  int64_t sum2 = vaddvq_s64(sum0);
  *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
#else
  int64x1_t shift = vdup_n_s64(-scaling);
  int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
  sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
  sum2 = vshl_s64(sum2, shift);
  vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
#endif
}
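// Scalar reference for DotProductWithScaleNeon above (a sketch, not taken from
// WebRTC): accumulate the products in 64 bits and apply the right shift after
// the final reduction, matching the vpadalq_s32 accumulation above.
#include <stddef.h>
#include <stdint.h>

static void DotProductWithScaleC(int32_t* cross_correlation,
                                 const int16_t* vector1,
                                 const int16_t* vector2,
                                 size_t length,
                                 int scaling) {
  int64_t sum = 0;
  for (size_t i = 0; i < length; i++) {
    sum += (int64_t)vector1[i] * vector2[i];
  }
  *cross_correlation = (int32_t)(sum >> scaling);
}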
Example #6
int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
                                int block_size) {
  int64x2_t error = vdupq_n_s64(0);

  assert(block_size >= 8);
  assert((block_size % 8) == 0);

  do {
    const int16x8_t c = vld1q_s16(coeff);
    const int16x8_t d = vld1q_s16(dqcoeff);
    const int16x8_t diff = vsubq_s16(c, d);
    const int16x4_t diff_lo = vget_low_s16(diff);
    const int16x4_t diff_hi = vget_high_s16(diff);
    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
    // accumulating them in 64-bits.
    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
    error = vaddq_s64(error, err2);
    coeff += 8;
    dqcoeff += 8;
    block_size -= 8;
  } while (block_size != 0);

  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
}
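// Scalar reference for av1_block_error_fp_neon above (illustrative sketch only):
// the error is the 64-bit sum of squared differences between quantized and
// dequantized coefficients, with block_size a multiple of 8.
#include <stdint.h>

static int64_t block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
                                int block_size) {
  int64_t error = 0;
  for (int i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    error += (int64_t)diff * diff;
  }
  return error;
}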
Example #7
//compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(int16_t *dl_ch,
                     LTE_DL_FRAME_PARMS *frame_parms)
{

    int16_t rb;
#if defined(__x86_64__) || defined(__i386__)
    __m128i avg128F;
    __m128i *dl_ch128;
#elif defined(__arm__)
    int32x4_t avg128F;
    int16x4_t *dl_ch128;
#endif
    int avg;

    //clear average level
#if defined(__x86_64__) || defined(__i386__)
    avg128F = _mm_setzero_si128();
    dl_ch128=(__m128i *)dl_ch;

    for (rb=0; rb<frame_parms->N_RB_DL; rb++) {

        avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[0],dl_ch128[0]));
        avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[1],dl_ch128[1]));
        avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[2],dl_ch128[2]));

        dl_ch128+=3;

    }
#elif defined(__arm__)
    avg128F = vdupq_n_s32(0);
    dl_ch128=(int16x4_t *)dl_ch;

    for (rb=0; rb<frame_parms->N_RB_DL; rb++) {

        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[0],dl_ch128[0]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[1],dl_ch128[1]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[2],dl_ch128[2]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[3],dl_ch128[3]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[4],dl_ch128[4]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[5],dl_ch128[5]));
        dl_ch128+=6;


    }


#endif
    DevAssert( frame_parms->N_RB_DL );
    avg = (((int*)&avg128F)[0] +
           ((int*)&avg128F)[1] +
           ((int*)&avg128F)[2] +
           ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12);


#if defined(__x86_64__) || defined(__i386__)
    _mm_empty();
    _m_empty();
#endif
    return(avg);
}
Example #8
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d6s16 = vget_low_s16(*q3s16);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  q10s32 = vmull_s16(*d3s16, d16s16);
  q11s32 = vmull_s16(*d4s16, d16s16);
  q12s32 = vmull_s16(d6s16, d17s16);
  q13s32 = vmull_s16(*d5s16, d18s16);
  q14s32 = vmull_s16(*d3s16, d18s16);
  q15s32 = vmovl_s16(d16s16);
  q15s32 = vaddw_s16(q15s32, d19s16);
  q8s32 = vmull_s16(*d4s16, d19s16);
  q15s32 = vsubw_s16(q15s32, d18s16);
  q9s32 = vmull_s16(*d5s16, d19s16);

  q10s32 = vaddq_s32(q10s32, q13s32);
  q10s32 = vaddq_s32(q10s32, q8s32);
  q11s32 = vsubq_s32(q11s32, q14s32);
  q8s32 = vdupq_n_s32(sinpi_3_9);
  q11s32 = vsubq_s32(q11s32, q9s32);
  q15s32 = vmulq_s32(q15s32, q8s32);

  q13s32 = vaddq_s32(q10s32, q12s32);
  q10s32 = vaddq_s32(q10s32, q11s32);
  q14s32 = vaddq_s32(q11s32, q12s32);
  q10s32 = vsubq_s32(q10s32, q12s32);

  d16s16 = vqrshrn_n_s32(q13s32, 14);
  d17s16 = vqrshrn_n_s32(q14s32, 14);
  d18s16 = vqrshrn_n_s32(q15s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);

  *q8s16 = vcombine_s16(d16s16, d17s16);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  return;
}
Example #9
void vpx_idct8x8_12_add_neon(
        int16_t *input,
        uint8_t *dest,
        int dest_stride) {
    uint8_t *d1, *d2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8;
    int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
    int16x4_t d26s16, d27s16, d28s16, d29s16;
    uint64x1_t d0u64, d1u64, d2u64, d3u64;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16;
    int32x4_t q9s32, q10s32, q11s32, q12s32;

    q8s16 = vld1q_s16(input);
    q9s16 = vld1q_s16(input + 8);
    q10s16 = vld1q_s16(input + 16);
    q11s16 = vld1q_s16(input + 24);
    q12s16 = vld1q_s16(input + 32);
    q13s16 = vld1q_s16(input + 40);
    q14s16 = vld1q_s16(input + 48);
    q15s16 = vld1q_s16(input + 56);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    // First transform rows
    // stage 1
    q0s16 = vdupq_n_s16(cospi_28_64 * 2);
    q1s16 = vdupq_n_s16(cospi_4_64 * 2);

    q4s16 = vqrdmulhq_s16(q9s16, q0s16);

    q0s16 = vdupq_n_s16(-cospi_20_64 * 2);

    q7s16 = vqrdmulhq_s16(q9s16, q1s16);

    q1s16 = vdupq_n_s16(cospi_12_64 * 2);

    q5s16 = vqrdmulhq_s16(q11s16, q0s16);

    q0s16 = vdupq_n_s16(cospi_16_64 * 2);

    q6s16 = vqrdmulhq_s16(q11s16, q1s16);

    // stage 2 & stage 3 - even half
    q1s16 = vdupq_n_s16(cospi_24_64 * 2);

    q9s16 = vqrdmulhq_s16(q8s16, q0s16);

    q0s16 = vdupq_n_s16(cospi_8_64 * 2);

    q13s16 = vqrdmulhq_s16(q10s16, q1s16);

    q15s16 = vqrdmulhq_s16(q10s16, q0s16);

    // stage 3 -odd half
    q0s16 = vaddq_s16(q9s16, q15s16);
    q1s16 = vaddq_s16(q9s16, q13s16);
    q2s16 = vsubq_s16(q9s16, q13s16);
    q3s16 = vsubq_s16(q9s16, q15s16);

    // stage 2 - odd half
    q13s16 = vsubq_s16(q4s16, q5s16);
    q4s16 = vaddq_s16(q4s16, q5s16);
    q14s16 = vsubq_s16(q7s16, q6s16);
    q7s16 = vaddq_s16(q7s16, q6s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);

    d16s16 = vdup_n_s16(cospi_16_64);
    q9s32 = vmull_s16(d28s16, d16s16);
    q10s32 = vmull_s16(d29s16, d16s16);
    q11s32 = vmull_s16(d28s16, d16s16);
    q12s32 = vmull_s16(d29s16, d16s16);

    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

    d10s16 = vqrshrn_n_s32(q9s32, 14);
    d11s16 = vqrshrn_n_s32(q10s32, 14);
    d12s16 = vqrshrn_n_s32(q11s32, 14);
    d13s16 = vqrshrn_n_s32(q12s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    // stage 4
    q8s16 = vaddq_s16(q0s16, q7s16);
    q9s16 = vaddq_s16(q1s16, q6s16);
    q10s16 = vaddq_s16(q2s16, q5s16);
    q11s16 = vaddq_s16(q3s16, q4s16);
    q12s16 = vsubq_s16(q3s16, q4s16);
    q13s16 = vsubq_s16(q2s16, q5s16);
    q14s16 = vsubq_s16(q1s16, q6s16);
    q15s16 = vsubq_s16(q0s16, q7s16);

    TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

    IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

    q8s16 = vrshrq_n_s16(q8s16, 5);
    q9s16 = vrshrq_n_s16(q9s16, 5);
    q10s16 = vrshrq_n_s16(q10s16, 5);
    q11s16 = vrshrq_n_s16(q11s16, 5);
    q12s16 = vrshrq_n_s16(q12s16, 5);
    q13s16 = vrshrq_n_s16(q13s16, 5);
    q14s16 = vrshrq_n_s16(q14s16, 5);
    q15s16 = vrshrq_n_s16(q15s16, 5);

    d1 = d2 = dest;

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;

    q8s16 = q12s16;
    q9s16 = q13s16;
    q10s16 = q14s16;
    q11s16 = q15s16;

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;
    return;
}
Example #10
// Update the noise estimation information.
static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) {
  const int16_t kExp2Const = 11819; // Q13
  int16_t* ptr_noiseEstLogQuantile = NULL;
  int16_t* ptr_noiseEstQuantile = NULL;
  int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
  int32x4_t twentyOne32x4 = vdupq_n_s32(21);
  int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
  int32x4_t constB32x4 = vdupq_n_s32(0x200000);

  int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
                                        inst->magnLen);

  // Guarantee a Q-domain as high as possible and still fit in int16
  inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
                                                                 tmp16,
                                                                 21);

  int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);

  for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
       ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
       ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
       ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {

    // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i];
    int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
    int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);

    // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
    int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
    v32x4A = vorrq_s32(v32x4A, constB32x4);

    // tmp16 = (int16_t)(tmp32no2 >> 21);
    v32x4B = vshrq_n_s32(v32x4B, 21);

    // tmp16 -= 21;// shift 21 to get result in Q0
    v32x4B = vsubq_s32(v32x4B, twentyOne32x4);

    // tmp16 += (int16_t) inst->qNoise;
    // shift to get result in Q(qNoise)
    v32x4B = vaddq_s32(v32x4B, qNoise32x4);

    // if (tmp16 < 0) {
    //   tmp32no1 >>= -tmp16;
    // } else {
    //   tmp32no1 <<= tmp16;
    // }
    v32x4B = vshlq_s32(v32x4A, v32x4B);

    // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
    v16x4 = vqmovn_s32(v32x4B);

    //inst->noiseEstQuantile[i] = tmp16;
    vst1_s16(ptr_noiseEstQuantile, v16x4);
  }

  // Last iteration:

  // inst->quantile[i]=exp(inst->lquantile[offset+i]);
  // in Q21
  int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile;
  int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac

  tmp16 = (int16_t)(tmp32no2 >> 21);
  tmp16 -= 21;// shift 21 to get result in Q0
  tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
  if (tmp16 < 0) {
    tmp32no1 >>= -tmp16;
  } else {
    tmp32no1 <<= tmp16;
  }

  // Saturate and store the last quantile value, mirroring the vectorized loop.
  inst->noiseEstQuantile[inst->magnLen - 1] = WebRtcSpl_SatW32ToW16(tmp32no1);
}
Example #11
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                          int skip_block, const int16_t *zbin_ptr,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  if (!skip_block) {
    // Quantization pass: All coefficients with index >= zero_flag are
    // skippable. Note: zero_flag can be zero.
    int i;
    const int16x8_t v_zero = vdupq_n_s16(0);
    const int16x8_t v_one = vdupq_n_s16(1);
    int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
    int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
    int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
    int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
    // adjust for dc
    v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
    v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
    v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
    // process dc and the first seven ac coeffs
    {
      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
      const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
      const int32x4_t v_tmp_lo =
          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
      const int32x4_t v_tmp_hi =
          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
      const int16x8_t v_tmp2 =
          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
      store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
      store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
      v_round = vmovq_n_s16(round_ptr[1]);
      v_quant = vmovq_n_s16(quant_ptr[1]);
      v_dequant = vmovq_n_s16(dequant_ptr[1]);
    }
    // now process the rest of the ac coeffs
    for (i = 8; i < count; i += 8) {
      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
      const int32x4_t v_tmp_lo =
          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
      const int32x4_t v_tmp_hi =
          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
      const int16x8_t v_tmp2 =
          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
      store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
      store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
    }
    {
      const int16x4_t v_eobmax_3210 = vmax_s16(
          vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
      const int64x1_t v_eobmax_xx32 =
          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
      const int16x4_t v_eobmax_tmp =
          vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
      const int64x1_t v_eobmax_xxx3 =
          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
      const int16x4_t v_eobmax_final =
          vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));

      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
    }
  } else {
    memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
    memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
    *eob_ptr = 0;
  }
}
Example #12
inline   int32x4_t vmull(const int16x4_t   & v0, const int16x4_t   & v1) { return vmull_s16(v0, v1); }
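// Small illustration of the widening multiply wrapped above (an assumed
// stand-alone helper, not OpenCV code): each int16 lane is squared into a
// 32-bit lane, so even 32767 * 32767 cannot overflow.
#include <arm_neon.h>

static inline int32x4_t square_widen_s16(int16x4_t v) {
  return vmull_s16(v, v);  // per-lane v[i] * v[i], widened to int32
}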
Example #13
static void PCorr2Q32(const int16_t *in, int32_t *logcorQ8)
{
  int16_t scaling,n,k;
  int32_t ysum32,csum32, lys, lcs;
  int32_t oneQ8;


  const int16_t *x, *inptr;

  oneQ8 = WEBRTC_SPL_LSHIFT_W32((int32_t)1, 8);  // 1.00 in Q8

  x = in + PITCH_MAX_LAG/2 + 2;
  scaling = WebRtcSpl_GetScalingSquare ((int16_t *) in, PITCH_CORR_LEN2, PITCH_CORR_LEN2);
  ysum32 = 1;
  csum32 = 0;
  x = in + PITCH_MAX_LAG/2 + 2;
  for (n = 0; n < PITCH_CORR_LEN2; n++) {
    ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[n],(int16_t) in[n], scaling);  // Q0
    csum32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t) x[n],(int16_t) in[n], scaling); // Q0
  }

  logcorQ8 += PITCH_LAG_SPAN2 - 1;

  lys=Log2Q8((uint32_t) ysum32); // Q8
  lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum);

  if (csum32>0) {

    lcs=Log2Q8((uint32_t) csum32);   // 2log(csum) in Q8

    if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2 in Q8
      *logcorQ8 = lcs - lys;  // log2(csum/sqrt(ysum))
    } else {
      *logcorQ8 = oneQ8;  // 1.00
    }

  } else {
    *logcorQ8 = 0;
  }


  for (k = 1; k < PITCH_LAG_SPAN2; k++) {
    inptr = &in[k];
    ysum32 -= WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[k-1],(int16_t) in[k-1], scaling);
    ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[PITCH_CORR_LEN2 + k - 1],(int16_t) in[PITCH_CORR_LEN2 + k - 1], scaling);

#ifdef WEBRTC_ARCH_ARM_NEON
    {
      int32_t vbuff[4];
      int32x4_t int_32x4_sum = vmovq_n_s32(0);
      // Can't shift a Neon register to the right by a non-constant shift value.
      int32x4_t int_32x4_scale = vdupq_n_s32(-scaling);
      // Assert a condition used in loop unrolling at compile-time.
      COMPILE_ASSERT(PITCH_CORR_LEN2 %4 == 0);

      for (n = 0; n < PITCH_CORR_LEN2; n += 4) {
        int16x4_t int_16x4_x = vld1_s16(&x[n]);
        int16x4_t int_16x4_in = vld1_s16(&inptr[n]);
        int32x4_t int_32x4 = vmull_s16(int_16x4_x, int_16x4_in);
        int_32x4 = vshlq_s32(int_32x4, int_32x4_scale);
        int_32x4_sum = vaddq_s32(int_32x4_sum, int_32x4);
      }

    // Use vector store to avoid long stall from data transferring
      // from vector to general register.
      vst1q_s32(vbuff, int_32x4_sum);
      csum32 = vbuff[0] + vbuff[1];
      csum32 += vbuff[2];
      csum32 += vbuff[3];
    }
#else
    csum32 = 0;
    if(scaling == 0) {
      for (n = 0; n < PITCH_CORR_LEN2; n++) {
        csum32 += x[n] * inptr[n];
      }
    } else {
      for (n = 0; n < PITCH_CORR_LEN2; n++) {
        csum32 += (x[n] * inptr[n]) >> scaling;
      }
    }
#endif

    logcorQ8--;

    lys=Log2Q8((uint32_t)ysum32); // Q8
    lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum);

    if (csum32>0) {

      lcs=Log2Q8((uint32_t) csum32);   // 2log(csum) in Q8

      if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2
        *logcorQ8 = lcs - lys;  // log2(csum/sqrt(ysum))
      } else {
        *logcorQ8 = oneQ8;  // 1.00
      }

    } else {
      *logcorQ8 = 0;
    }
  }
}
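// Tiny illustration of the negative-shift trick used in the NEON block above
// (an assumed helper, not WebRTC code): NEON has no variable right-shift
// instruction, so shifting left by a negative run-time amount is used instead.
#include <arm_neon.h>

static inline int32x4_t shift_right_runtime_s32(int32x4_t v, int shift) {
  return vshlq_s32(v, vdupq_n_s32(-shift));  // element-wise v[i] >> shift
}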
Example #14
static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9,
                                               int16_t* inre2Q9,
                                               int32_t* outreQ16,
                                               int32_t* outimQ16) {
  int k;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921.
  // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code.
  int32_t fact  = 16921 << 5;
  int32x4_t factq = vdupq_n_s32(fact);
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    int16x8_t inre1 = vld1q_s16(inre1Q9);
    int16x8_t inre2 = vld1q_s16(inre2Q9);
    kCosTab += 8;
    kSinTab += 8;
    inre1Q9 += 8;
    inre2Q9 += 8;

    // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
    int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1));
    int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2));
    tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2));
    tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp2 = vmull_high_s16(tmpr, inre1);
    int32x4_t tmp3 = vmull_high_s16(tmpr, inre2);
    tmp2 = vmlal_high_s16(tmp2, tmpi, inre2);
    tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1);
#else
    int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1));
    int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2));
    tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2));
    tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1));
#endif

    int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq);
    int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq);
    int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq);
    int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq);
    vst1q_s32(outreQ16, outr_0);
    outreQ16 += 4;
    vst1q_s32(outreQ16, outr_1);
    outreQ16 += 4;
    vst1q_s32(outimQ16, outi_0);
    outimQ16 += 4;
    vst1q_s32(outimQ16, outi_1);
    outimQ16 += 4;

    // Find the absolute maximum in the vectors.
    tmp0 = vabsq_s32(outr_0);
    tmp1 = vabsq_s32(outr_1);
    tmp2 = vabsq_s32(outi_0);
    tmp3 = vabsq_s32(outi_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
Example #15
void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t d26u8, d27u8;
  uint32x2_t d26u32, d27u32;
  uint16x8_t q8u16, q9u16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
  int16x8_t q8s16, q9s16, q13s16, q14s16;
  int32x4_t q1s32, q13s32, q14s32, q15s32;
  int16x4x2_t d0x2s16, d1x2s16;
  int32x4x2_t q0x2s32;
  uint8_t *d;

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);

  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
  d21s16 = vdup_n_s16((int16_t)cospi_16_64);

  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  d22s16 = vdup_n_s16((int16_t)cospi_24_64);

  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);

  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
  d19s16 = vget_low_s16(q9s16);

  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  // do the transform on columns
  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);

  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  d = dest;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
  d += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));

  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  d = dest;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
  return;
}
Example #16
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    c.val = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    d.val = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
}
Example #17
int rotate_cpx_vector(int16_t *x,
                      int16_t *alpha,
                      int16_t *y,
                      uint32_t N,
                      uint16_t output_shift)
{
  // Rotate (complex multiply) each of the N complex elements of x by the complex scalar alpha
  // x        - input 1    in the format  |Re0  Im0 |,......,|Re(N-1) Im(N-1)|
  //            We assume x with a dynamic of 15 bit maximum
  //
  // alpha    - input 2    in the format  |Re0 Im0|
  //            We assume alpha with a dynamic of 15 bit maximum
  //
  // y        - output     in the format  |Re0  Im0|,......,|Re(N-1) Im(N-1)|
  //
  // N        - the size of the vectors (this function does N cpx mpy). WARNING: N>=4;
  //
  // output_shift - divide the result by 2^output_shift (right shift) to avoid
  //            saturation when packing back to 16 bits

  uint32_t i;                 // loop counter


  simd_q15_t *y_128,alpha_128;
  int32_t *xd=(int32_t *)x; 

#if defined(__x86_64__) || defined(__i386__)
  __m128i shift = _mm_cvtsi32_si128(output_shift);
  register simd_q15_t m0,m1,m2,m3;

  ((int16_t *)&alpha_128)[0] = alpha[0];
  ((int16_t *)&alpha_128)[1] = -alpha[1];
  ((int16_t *)&alpha_128)[2] = alpha[1];
  ((int16_t *)&alpha_128)[3] = alpha[0];
  ((int16_t *)&alpha_128)[4] = alpha[0];
  ((int16_t *)&alpha_128)[5] = -alpha[1];
  ((int16_t *)&alpha_128)[6] = alpha[1];
  ((int16_t *)&alpha_128)[7] = alpha[0];
#elif defined(__arm__)
  int32x4_t shift;
  int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32;
  int16_t reflip[8]  __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
  int32x4x2_t xtmp;

  ((int16_t *)&alpha_128)[0] = alpha[0];
  ((int16_t *)&alpha_128)[1] = alpha[1];
  ((int16_t *)&alpha_128)[2] = alpha[0];
  ((int16_t *)&alpha_128)[3] = alpha[1];
  ((int16_t *)&alpha_128)[4] = alpha[0];
  ((int16_t *)&alpha_128)[5] = alpha[1];
  ((int16_t *)&alpha_128)[6] = alpha[0];
  ((int16_t *)&alpha_128)[7] = alpha[1];
  int16x8_t bflip = vrev32q_s16(alpha_128);
  int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip);
  shift = vdupq_n_s32(-output_shift);
#endif
  y_128 = (simd_q15_t *) y;


  for(i=0; i<N>>2; i++) {
#if defined(__x86_64__) || defined(__i386__)
    m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]);
    m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]);
    m2 = _mm_madd_epi16(m0,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
    m3 = _mm_madd_epi16(m1,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
    m2 = _mm_sra_epi32(m2,shift);        // shift right by shift in order to  compensate for the input amplitude
    m3 = _mm_sra_epi32(m3,shift);        // shift right by shift in order to  compensate for the input amplitude

    y_128[0] = _mm_packs_epi32(m2,m3);        // pack in 16bit integers with saturation [re im re im re im re im]
#elif defined(__arm__)

  ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]);
  ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]);
  ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]);
  ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]);
  re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]),
                                vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])),
                   shift);
  im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]),
                                vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])),
                   shift);

  xtmp = vzipq_s32(re32,im32);
  
  y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));

#endif

    xd+=4;
    y_128+=1;
  }


#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif

  return(0);
}
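// Minimal usage sketch for rotate_cpx_vector above (illustrative driver, not
// part of the original source): rotate 4 complex Q15 samples by alpha ~ j and
// shift the 32-bit products back down by 15 bits.
#include <stdint.h>

int rotate_cpx_vector(int16_t *x, int16_t *alpha, int16_t *y,
                      uint32_t N, uint16_t output_shift);

void rotate_example(void) {
  int16_t x[8]     __attribute__((aligned(16))) = {1000,0, 0,1000, -1000,0, 0,-1000};
  int16_t y[8]     __attribute__((aligned(16)));
  int16_t alpha[2] = {0, 32767};  // roughly e^{j*pi/2} in Q15
  rotate_cpx_vector(x, alpha, y, 4, 15);  // expect y[k] ~= j * x[k]
}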
Example #18
void silk_warped_autocorrelation_FIX_neon(
          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
    const opus_int                  length,                                 /* I    Length of input                                                             */
    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
)
{
    if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) {
        silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order );
    } else {
        opus_int       n, i, lsh;
        opus_int64     corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */
        opus_int64     corr_QC_orderT;
        int64x2_t      lsh_s64x2;
        const opus_int orderT = ( order + 3 ) & ~3;
        opus_int64     *corr_QCT;
        opus_int32     *input_QS;
        VARDECL( opus_int32, input_QST );
        VARDECL( opus_int32, state );
        SAVE_STACK;

        /* Order must be even */
        silk_assert( ( order & 1 ) == 0 );
        silk_assert( 2 * QS - QC >= 0 );

        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 );

        input_QS = input_QST;
        /* input_QS has zero paddings in the beginning and end. */
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;

        /* Loop over samples */
        for( n = 0; n < length - 7; n += 8, input_QS += 8 ) {
            const int16x8_t t0_s16x4 = vld1q_s16( input + n );
            vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) );
            vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) );
        }
        for( ; n < length; n++, input_QS++ ) {
            input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
        }
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;

        /* The following loop runs ( length + order ) times, with ( order ) extra epilogues.                  */
        /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues.       */
        /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */

        /* Keep the C code here to help understand the intrinsics optimization. */
        /*
        {
            opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
            opus_int32 *state_QST[ 3 ];
            state_QST[ 0 ] = state_QS[ 0 ];
            state_QST[ 1 ] = state_QS[ 1 ];
            for( n = 0; n < length + order; n++, input_QS++ ) {
                state_QST[ 0 ][ orderT ] = input_QS[ orderT ];
                for( i = 0; i < orderT; i++ ) {
                    corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC );
                    state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 );
                }
                state_QST[ 2 ] = state_QST[ 0 ];
                state_QST[ 0 ] = state_QST[ 1 ];
                state_QST[ 1 ] = state_QST[ 2 ];
            }
        }
        */

        {
            const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 );
            const opus_int32 *in = input_QS + orderT;
            opus_int o = orderT;
            int32x4_t state_QS_s32x4[ 3 ][ 2 ];

            ALLOC( state, length + orderT, opus_int32 );
            state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );

            /* Calculate 8 taps of all inputs in each loop. */
            do {
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] =
                state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 );
                n = 0;
                do {
                    calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] );
                    calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] );
                    state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n );
                    vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 );
                    state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ];
                } while( ++n < ( length + order ) );
                in = state;
                o -= 8;
            } while( o > 4 );

            if( o ) {
                /* Calculate the last 4 taps of all inputs. */
                opus_int32 *stateT = state;
                silk_assert( o == 4 );
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 );
                n = length + order;
                do {
                    calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] );
                    state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT );
                    vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    input_QS++;
                    stateT++;
                } while( --n );
            }
        }

        {
            const opus_int16 *inputT = input;
            int32x4_t t_s32x4;
            int64x1_t t_s64x1;
            int64x2_t t_s64x2 = vdupq_n_s64( 0 );
            for( n = 0; n <= length - 8; n += 8 ) {
                int16x8_t input_s16x8 = vld1q_s16( inputT );
                t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) );
                t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) );
                inputT += 8;
            }
            t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) );
            corr_QC_orderT = vget_lane_s64( t_s64x1, 0 );
            for( ; n < length; n++ ) {
                corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] );
            }
            corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC );
            corr_QC[ orderT ] = corr_QC_orderT;
        }

        corr_QCT = corr_QC + orderT - order;
        lsh = silk_CLZ64( corr_QC_orderT ) - 35;
        lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
        *scale = -( QC + lsh );
        silk_assert( *scale >= -30 && *scale <= 12 );
        lsh_s64x2 = vdupq_n_s64( lsh );
        for( i = 0; i <= order - 3; i += 4 ) {
            int32x4_t corr_s32x4;
            int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
            corr_QC0_s64x2 = vld1q_s64( corr_QCT + i );
            corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 );
            corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 );
            corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 );
            corr_s32x4     = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) );
            corr_s32x4     = vrev64q_s32( corr_s32x4 );
            vst1q_s32( corr + order - i - 3, corr_s32x4 );
        }
        if( lsh >= 0 ) {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) );
            }
        } else {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) );
            }
        }
        silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    {
        opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ];
        opus_int   scale_c;
        silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order );
        silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) );
        silk_assert( scale_c == *scale );
    }
#endif
}
Example #19
static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre,
                                                       int16_t* inim,
                                                       int32_t* outre1,
                                                       int32_t* outre2,
                                                       int32_t sh) {
  int k;
  int16_t* p_inre = inre;
  int16_t* p_inim = inim;
  int32_t* p_outre1 = outre1;
  int32_t* p_outre2 = outre2;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  int32x4_t shift = vdupq_n_s32(-sh - 16);
  // Divide through by the normalizing constant:
  // scale all values with 1/240, i.e. with 273 in Q16.
  // 273/65536 ~= 0.0041656
  // 1/240 ~= 0.0041666
  int16x8_t scale = vdupq_n_s16(273);
  // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727.
  int factQ19 = 31727 << 16;
  int32x4_t fact = vdupq_n_s32(factQ19);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t inre16x8 = vld1q_s16(p_inre);
    int16x8_t inim16x8 = vld1q_s16(p_inim);
    p_inre += 8;
    p_inim += 8;
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    kCosTab += 8;
    kSinTab += 8;
    // By vshl and vmull, we effectively did "<< (-sh - 16)",
    // instead of "<< (-sh)" and ">> 16" as in the C code.
    int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale));
    int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale);
    int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale);
#else
    int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8),
                                   vget_high_s16(scale));
    int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8),
                                   vget_high_s16(scale));
#endif

    outre1_0 = vshlq_s32(outre1_0, shift);
    outre1_1 = vshlq_s32(outre1_1, shift);
    outre2_0 = vshlq_s32(outre2_0, shift);
    outre2_1 = vshlq_s32(outre2_1, shift);

    // Demodulate and separate.
    int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr));
    int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmpr_1 = vmovl_high_s16(tmpr);
    int32x4_t tmpi_1 = vmovl_high_s16(tmpi);
#else
    int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr));
    int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi));
#endif

    int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0));
    int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0));
    int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1));
    int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1));
    xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0));
    xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0));
    xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1));
    xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1));

#if defined(WEBRTC_ARCH_ARM64)
    int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0);
    int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0);
    int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1);
    int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1);
    xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0);
    xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0);
    xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1);
    xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1);
#else
    int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0));
    int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0));
    int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1));
    int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1));
    xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0));
    xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0));
    xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1));
    xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1));
#endif

    outre1_0 = vcombine_s32(vshrn_n_s64(xr0, 10), vshrn_n_s64(xr1, 10));
    outre2_0 = vcombine_s32(vshrn_n_s64(xi0, 10), vshrn_n_s64(xi1, 10));
    outre1_1 = vcombine_s32(vshrn_n_s64(xr2, 10), vshrn_n_s64(xr3, 10));
    outre2_1 = vcombine_s32(vshrn_n_s64(xi2, 10), vshrn_n_s64(xi3, 10));
    outre1_0 = vqdmulhq_s32(outre1_0, fact);
    outre2_0 = vqdmulhq_s32(outre2_0, fact);
    outre1_1 = vqdmulhq_s32(outre1_1, fact);
    outre2_1 = vqdmulhq_s32(outre2_1, fact);

    vst1q_s32(p_outre1, outre1_0);
    p_outre1 += 4;
    vst1q_s32(p_outre1, outre1_1);
    p_outre1 += 4;
    vst1q_s32(p_outre2, outre2_0);
    p_outre2 += 4;
    vst1q_s32(p_outre2, outre2_1);
    p_outre2 += 4;
  }
}
Example #20
static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
                                              int16_t* inim,
                                              int32_t* outre,
                                              int32_t* outim) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int32_t* outre1 = outre;
  int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int32_t* outim1 = outim;
  int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int32x4_t xr = vmull_s16(tmpr, inre_0);
    int32x4_t xi = vmull_s16(tmpr, inim_0);
    int32x4_t yr = vmull_s16(tmpr, inim_1);
    int32x4_t yi = vmull_s16(tmpi, inim_1);
    xr = vmlal_s16(xr, tmpi, inim_0);
    xi = vmlsl_s16(xi, tmpi, inre_0);
    yr = vmlal_s16(yr, tmpi, inre_1);
    yi = vmlsl_s16(yi, tmpr, inre_1);
    yr = vnegq_s32(yr);

    xr = vshrq_n_s32(xr, 5);
    xi = vshrq_n_s32(xi, 5);
    yr = vshrq_n_s32(yr, 5);
    yi = vshrq_n_s32(yi, 5);

    int32x4_t outr0 = vsubq_s32(xr, yi);
    int32x4_t outr1 = vaddq_s32(xr, yi);
    int32x4_t outi0 = vaddq_s32(xi, yr);
    int32x4_t outi1 = vsubq_s32(yr, xi);

    // Find the absolute maximum in the vectors.
    int32x4_t tmp0 = vabsq_s32(outr0);
    int32x4_t tmp1 = vabsq_s32(outr1);
    int32x4_t tmp2 = vabsq_s32(outi0);
    int32x4_t tmp3 = vabsq_s32(outi1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));

    // Store the vectors.
    outr1 = vrev64q_s32(outr1);
    outi1 = vrev64q_s32(outi1);
    int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));

    vst1q_s32(outre1, outr0);
    outre1 += 4;
    vst1q_s32(outim1, outi0);
    outim1 += 4;
    vst1q_s32(outre2, outr_1);
    outre2 -= 4;
    vst1q_s32(outim2, outi_1);
    outim2 -= 4;
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
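
A note on the maximum tracking above: vabsq_s32 leaves 0x80000000 unchanged, so the magnitudes are compared as unsigned values to keep that case ranked highest. A scalar sketch of the same trick (illustrative names, not from the original source):

#include <stdint.h>

static uint32_t track_max_model(uint32_t current_max, int32_t value) {
  /* vabsq_s32(INT32_MIN) stays 0x80000000; reinterpreted as uint32_t it still
   * compares as the largest possible magnitude, which vmaxq_u32 relies on. */
  uint32_t mag = (value == INT32_MIN) ? 0x80000000u
                                      : (uint32_t)(value < 0 ? -value : value);
  return mag > current_max ? mag : current_max;
}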
Example #21
void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count,
        const int* xofs, const short* alpha,
        int swidth, int dwidth, int cn, int xmin, int xmax)
{
    int dx, k;
    int dx0 = 0;

    int16x4x2_t alpha_vec;

    uint8x8_t dS0_vec, dS1_vec;
    int16x8_t qS0_vec, qS1_vec;
    int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567;

    int32x4_t qT0_vec, qT1_vec;

    int16x4_t dCoeff;
    dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE);

    for (k = 0; k <= count - 2; k++)
    {
        const unsigned char *S0 = src[k], *S1 = src[k + 1];
        int *D0 = dst[k], *D1 = dst[k + 1];

        for (dx = dx0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);
            dS1_4567 = vget_high_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);
            qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, dCoeff);
            qT1_vec = vmull_s16 (dS1_0123, dCoeff);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }
    }

    for (; k < count; k++)
    {
        const unsigned char *S = src[k];
        int *D = dst[k];
        for (dx = 0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);

            vst1q_s32 (&D[dx], qT0_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            dS0_0123 = vget_low_s16 (qS0_vec);
            qT0_vec = vmull_s16 (dS0_0123, dCoeff);

            vst1q_s32 (&D[dx], qT0_vec);
        }
    }
}
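
For orientation, each iteration of the NEON inner loop above evaluates a two-tap horizontal linear filter for the four channels of one destination pixel. A scalar sketch of the same computation, assuming (as the vector path does) that xofs is laid out per element so the cn channels of a pixel sit at consecutive source offsets:

/* Sketch only: scalar equivalent of the main dx loop for cn == 4. */
static void hresize_4ch_scalar_model(const unsigned char *S, int *D,
                                     const int *xofs, const short *alpha,
                                     int xmax) {
  int dx;
  for (dx = 0; dx < xmax; dx++) {
    int sx = xofs[dx];
    /* Two taps per element: the current pixel and the next one (4 channels apart). */
    D[dx] = S[sx] * alpha[dx * 2] + S[sx + 4] * alpha[dx * 2 + 1];
  }
}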
Example #22
static inline void PostShiftAndSeparateNeon(int16_t* inre,
                                            int16_t* inim,
                                            int16_t* outre,
                                            int16_t* outim,
                                            int32_t sh) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int16_t* outre1 = outre;
  int16_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int16_t* outim1 = outim;
  int16_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  // By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
  // ">> 14" and then ">> 9" as in the C code.
  int32x4_t shift = vdupq_n_s32(-sh - 23);

  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int16x4_t xr = vqadd_s16(inre_0, inre_1);
    int16x4_t xi = vqsub_s16(inim_0, inim_1);
    int16x4_t yr = vqadd_s16(inim_0, inim_1);
    int16x4_t yi = vqsub_s16(inre_1, inre_0);

    int32x4_t outr0 = vmull_s16(tmpr, xr);
    int32x4_t outi0 = vmull_s16(tmpi, xr);
    int32x4_t outr1 = vmull_s16(tmpi, yr);
    int32x4_t outi1 = vmull_s16(tmpi, yi);
    outr0 = vmlsl_s16(outr0, tmpi, xi);
    outi0 = vmlal_s16(outi0, tmpr, xi);
    outr1 = vmlal_s16(outr1, tmpr, yi);
    outi1 = vmlsl_s16(outi1, tmpr, yr);

    outr0 = vshlq_s32(outr0, shift);
    outi0 = vshlq_s32(outi0, shift);
    outr1 = vshlq_s32(outr1, shift);
    outi1 = vshlq_s32(outi1, shift);
    outr1 = vnegq_s32(outr1);

    int16x4_t outre_0  = vmovn_s32(outr0);
    int16x4_t outim_0  = vmovn_s32(outi0);
    int16x4_t outre_1  = vmovn_s32(outr1);
    int16x4_t outim_1  = vmovn_s32(outi1);
    outre_1 = vrev64_s16(outre_1);
    outim_1 = vrev64_s16(outim_1);

    vst1_s16(outre1, outre_0);
    outre1 += 4;
    vst1_s16(outim1, outim_0);
    outim1 += 4;
    vst1_s16(outre2, outre_1);
    outre2 -= 4;
    vst1_s16(outim2, outim_1);
    outim2 -= 4;
  }
}
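
Per the comment at the top of the loop, the single vshlq_s32 by (-sh - 23) folds the C code's separate shifts into one right shift by sh + 23 bits. A scalar sketch of one front-half output pair, assuming sh + 23 >= 0 and an arithmetic right shift; tmpr/tmpi are the table values as prepared above, xr/xi the symmetric combinations:

#include <stdint.h>

/* Sketch only: the final cast truncates to 16 bits like vmovn_s32. */
static inline void post_shift_pair_model(int16_t tmpr, int16_t tmpi,
                                         int16_t xr, int16_t xi, int32_t sh,
                                         int16_t *re, int16_t *im) {
  *re = (int16_t)(((int32_t)tmpr * xr - (int32_t)tmpi * xi) >> (sh + 23));
  *im = (int16_t)(((int32_t)tmpi * xr + (int32_t)tmpr * xi) >> (sh + 23));
}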
Example #23
static INLINE void IDCT8x8_1D(
        int16x8_t *q8s16,
        int16x8_t *q9s16,
        int16x8_t *q10s16,
        int16x8_t *q11s16,
        int16x8_t *q12s16,
        int16x8_t *q13s16,
        int16x8_t *q14s16,
        int16x8_t *q15s16) {
    int16x4_t d0s16, d1s16, d2s16, d3s16;
    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
    int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
    int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
    int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
    int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
    int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;

    d0s16 = vdup_n_s16(cospi_28_64);
    d1s16 = vdup_n_s16(cospi_4_64);
    d2s16 = vdup_n_s16(cospi_12_64);
    d3s16 = vdup_n_s16(cospi_20_64);

    d16s16 = vget_low_s16(*q8s16);
    d17s16 = vget_high_s16(*q8s16);
    d18s16 = vget_low_s16(*q9s16);
    d19s16 = vget_high_s16(*q9s16);
    d20s16 = vget_low_s16(*q10s16);
    d21s16 = vget_high_s16(*q10s16);
    d22s16 = vget_low_s16(*q11s16);
    d23s16 = vget_high_s16(*q11s16);
    d24s16 = vget_low_s16(*q12s16);
    d25s16 = vget_high_s16(*q12s16);
    d26s16 = vget_low_s16(*q13s16);
    d27s16 = vget_high_s16(*q13s16);
    d28s16 = vget_low_s16(*q14s16);
    d29s16 = vget_high_s16(*q14s16);
    d30s16 = vget_low_s16(*q15s16);
    d31s16 = vget_high_s16(*q15s16);

    q2s32 = vmull_s16(d18s16, d0s16);
    q3s32 = vmull_s16(d19s16, d0s16);
    q5s32 = vmull_s16(d26s16, d2s16);
    q6s32 = vmull_s16(d27s16, d2s16);

    q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
    q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
    q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
    q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);

    d8s16 = vqrshrn_n_s32(q2s32, 14);
    d9s16 = vqrshrn_n_s32(q3s32, 14);
    d10s16 = vqrshrn_n_s32(q5s32, 14);
    d11s16 = vqrshrn_n_s32(q6s32, 14);
    q4s16 = vcombine_s16(d8s16, d9s16);
    q5s16 = vcombine_s16(d10s16, d11s16);

    q2s32 = vmull_s16(d18s16, d1s16);
    q3s32 = vmull_s16(d19s16, d1s16);
    q9s32 = vmull_s16(d26s16, d3s16);
    q13s32 = vmull_s16(d27s16, d3s16);

    q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
    q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
    q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
    q13s32 = vmlal_s16(q13s32, d23s16, d2s16);

    d14s16 = vqrshrn_n_s32(q2s32, 14);
    d15s16 = vqrshrn_n_s32(q3s32, 14);
    d12s16 = vqrshrn_n_s32(q9s32, 14);
    d13s16 = vqrshrn_n_s32(q13s32, 14);
    q6s16 = vcombine_s16(d12s16, d13s16);
    q7s16 = vcombine_s16(d14s16, d15s16);

    d0s16 = vdup_n_s16(cospi_16_64);

    q2s32 = vmull_s16(d16s16, d0s16);
    q3s32 = vmull_s16(d17s16, d0s16);
    q13s32 = vmull_s16(d16s16, d0s16);
    q15s32 = vmull_s16(d17s16, d0s16);

    q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
    q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
    q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
    q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);

    d0s16 = vdup_n_s16(cospi_24_64);
    d1s16 = vdup_n_s16(cospi_8_64);

    d18s16 = vqrshrn_n_s32(q2s32, 14);
    d19s16 = vqrshrn_n_s32(q3s32, 14);
    d22s16 = vqrshrn_n_s32(q13s32, 14);
    d23s16 = vqrshrn_n_s32(q15s32, 14);
    *q9s16 = vcombine_s16(d18s16, d19s16);
    *q11s16 = vcombine_s16(d22s16, d23s16);

    q2s32 = vmull_s16(d20s16, d0s16);
    q3s32 = vmull_s16(d21s16, d0s16);
    q8s32 = vmull_s16(d20s16, d1s16);
    q12s32 = vmull_s16(d21s16, d1s16);

    q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
    q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
    q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
    q12s32 = vmlal_s16(q12s32, d29s16, d0s16);

    d26s16 = vqrshrn_n_s32(q2s32, 14);
    d27s16 = vqrshrn_n_s32(q3s32, 14);
    d30s16 = vqrshrn_n_s32(q8s32, 14);
    d31s16 = vqrshrn_n_s32(q12s32, 14);
    *q13s16 = vcombine_s16(d26s16, d27s16);
    *q15s16 = vcombine_s16(d30s16, d31s16);

    q0s16 = vaddq_s16(*q9s16, *q15s16);
    q1s16 = vaddq_s16(*q11s16, *q13s16);
    q2s16 = vsubq_s16(*q11s16, *q13s16);
    q3s16 = vsubq_s16(*q9s16, *q15s16);

    *q13s16 = vsubq_s16(q4s16, q5s16);
    q4s16 = vaddq_s16(q4s16, q5s16);
    *q14s16 = vsubq_s16(q7s16, q6s16);
    q7s16 = vaddq_s16(q7s16, q6s16);
    d26s16 = vget_low_s16(*q13s16);
    d27s16 = vget_high_s16(*q13s16);
    d28s16 = vget_low_s16(*q14s16);
    d29s16 = vget_high_s16(*q14s16);

    d16s16 = vdup_n_s16(cospi_16_64);

    q9s32 = vmull_s16(d28s16, d16s16);
    q10s32 = vmull_s16(d29s16, d16s16);
    q11s32 = vmull_s16(d28s16, d16s16);
    q12s32 = vmull_s16(d29s16, d16s16);

    q9s32 = vmlsl_s16(q9s32,  d26s16, d16s16);
    q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
    q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
    q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

    d10s16 = vqrshrn_n_s32(q9s32, 14);
    d11s16 = vqrshrn_n_s32(q10s32, 14);
    d12s16 = vqrshrn_n_s32(q11s32, 14);
    d13s16 = vqrshrn_n_s32(q12s32, 14);
    q5s16 = vcombine_s16(d10s16, d11s16);
    q6s16 = vcombine_s16(d12s16, d13s16);

    *q8s16 = vaddq_s16(q0s16, q7s16);
    *q9s16 = vaddq_s16(q1s16, q6s16);
    *q10s16 = vaddq_s16(q2s16, q5s16);
    *q11s16 = vaddq_s16(q3s16, q4s16);
    *q12s16 = vsubq_s16(q3s16, q4s16);
    *q13s16 = vsubq_s16(q2s16, q5s16);
    *q14s16 = vsubq_s16(q1s16, q6s16);
    *q15s16 = vsubq_s16(q0s16, q7s16);
    return;
}
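
The recurring pattern above (vmull/vmlal/vmlsl followed by vqrshrn_n_s32(..., 14)) is the fixed-point IDCT butterfly: the cospi_* constants hold cos(k*pi/64) in Q14, and the narrow applies a rounding shift by 14 with saturation back to 16 bits. A scalar sketch of one such step, assumed to mirror the reference C form dct_const_round_shift(a * c0 - b * c1):

#include <stdint.h>

/* Sketch only: saturating rounding shift performed by vqrshrn_n_s32(x, 14). */
static inline int16_t round_shift_14_model(int32_t x) {
  int32_t r = (x + (1 << 13)) >> 14;
  if (r > INT16_MAX) return INT16_MAX;
  if (r < INT16_MIN) return INT16_MIN;
  return (int16_t)r;
}

/* One butterfly term, e.g. a*cospi_28_64 - b*cospi_4_64 rounded back to Q0. */
static inline int16_t butterfly_model(int16_t a, int16_t b,
                                      int16_t c0, int16_t c1) {
  return round_shift_14_model((int32_t)a * c0 - (int32_t)b * c1);
}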
Example #24
int mult_cpx_conj_vector(int16_t *x1,
                         int16_t *x2,
                         int16_t *y,
                         uint32_t N,
                         int output_shift,
                         int madd)
{
  // Multiply element-wise the complex conjugate of x1 with x2.
  // x1       - input 1    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x1 has a dynamic range of at most 15 bits.
  //
  // x2       - input 2    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x2 has a dynamic range of at most 14 bits.
  //
  // y        - output     in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
  //
  // N        - the size of the vectors (this function performs N complex multiplies).
  //            WARNING: N >= 4; the loop processes 4 complex samples per iteration.
  //
  // output_shift  - right shift applied to the products to generate the output
  //
  // madd     - if non-zero, accumulate the result into y instead of overwriting it

  uint32_t i;                 // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
#if defined(__x86_64__) || defined(__i386__)
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;

#elif defined(__arm__)
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int16x4x2_t tmpy;
  int32x4_t shift = vdupq_n_s32(-output_shift);
#endif

  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];


  // we compute 4 cpx multiply for each loop
  for(i=0; i<(N>>2); i++) {
#if defined(__x86_64__) || defined(__i386__)
    tmp_re = _mm_madd_epi16(*x1_128,*x2_128);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]);
    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im);
    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im);
    if (madd==0)
      *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
    else
      *y_128 += _mm_packs_epi32(tmpy0,tmpy1);

#elif defined(__arm__)

    tmp_re  = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]);
    //tmp_re = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])]
    tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]);
    //tmp_re1 = [Re(x1[2])Re(x2[2]) Im(x1[2])Im(x2[2]) Re(x1[3])Re(x2[3]) Im(x1[3])Im(x2[3])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re = [Re(x1[0])Re(x2[0])+Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1])+Im(x1[1])Im(x2[1]) Re(x1[2])Re(x2[2])+Im(x1[2])Im(x2[2]) Re(x1[3])Re(x2[3])+Im(x1[3])Im(x2[3])]

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[0]);
    //tmp_im = [Re(x1[0])Im(x2[0]) -Im(x1[0])Re(x2[0]) Re(x1[1])Im(x2[1]) -Im(x1[1])Re(x2[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[1]);
    //tmp_im1 = [Re(x1[2])Im(x2[2]) -Im(x1[2])Re(x2[2]) Re(x1[3])Im(x2[3]) -Im(x1[3])Re(x2[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im = [Re(x1[0])Im(x2[0])-Im(x1[0])Re(x2[0]) Re(x1[1])Im(x2[1])-Im(x1[1])Re(x2[1]) Re(x1[2])Im(x2[2])-Im(x1[2])Re(x2[2]) Re(x1[3])Im(x2[3])-Im(x1[3])Re(x2[3])]

    tmp_re = vqshlq_s32(tmp_re,shift);
    tmp_im = vqshlq_s32(tmp_im,shift);
    tmpy   = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im));
    if (madd==0)
      *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]);
    else
      *y_128 += vcombine_s16(tmpy.val[0],tmpy.val[1]);
#endif
    x1_128++;
    x2_128++;
    y_128++;
  }


#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif

  return(0);
}
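
A usage sketch for the function above (illustrative values only): the buffers hold 2*N interleaved int16_t Re/Im samples and, given the 128-bit loads, should be 16-byte aligned, with N a multiple of 4.

#include <stdint.h>

#define N_SAMPLES 8  /* example size; keep it a multiple of 4 */

static int16_t x1_buf[2 * N_SAMPLES] __attribute__((aligned(16)));
static int16_t x2_buf[2 * N_SAMPLES] __attribute__((aligned(16)));
static int16_t y_buf [2 * N_SAMPLES] __attribute__((aligned(16)));

static void mult_cpx_conj_vector_example(void) {
  /* y = (conj(x1) * x2) >> 4, overwriting y (madd == 0). */
  mult_cpx_conj_vector(x1_buf, x2_buf, y_buf, N_SAMPLES, 4, 0);
  /* Accumulate a second product into y (madd != 0). */
  mult_cpx_conj_vector(x1_buf, x2_buf, y_buf, N_SAMPLES, 4, 1);
}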