int32_t dot_product(int16_t *x, int16_t *y, uint32_t N, //must be a multiple of 8 uint8_t output_shift) { uint32_t n; #if defined(__x86_64__) || defined(__i386__) __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im; __m64 mmtmp7; __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1); int32_t result; x128 = (__m128i*) x; y128 = (__m128i*) y; mmcumul_re = _mm_setzero_si128(); mmcumul_im = _mm_setzero_si128(); for (n=0; n<(N>>2); n++) { //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128); // print_shorts("x",&x128[0]); // print_shorts("y",&y128[0]); // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y) mmtmp1 = _mm_madd_epi16(x128[0],y128[0]); // print_ints("re",&mmtmp1); // mmtmp1 contains real part of 4 consecutive outputs (32-bit) // shift and accumulate results mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift); mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1); // print_ints("re",&mmcumul_re); // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x) mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1)); // print_shorts("y",&mmtmp2); mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1)); // print_shorts("y",&mmtmp2); mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i); // print_shorts("y",&mmtmp2); mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2); // print_ints("im",&mmtmp3); // mmtmp3 contains imag part of 4 consecutive outputs (32-bit) // shift and accumulate results mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift); mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3); // print_ints("im",&mmcumul_im); x128++; y128++; } // this gives Re Re Im Im mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im); // print_ints("cumul1",&mmcumul); // this gives Re Im Re Im mmcumul = _mm_hadd_epi32(mmcumul,mmcumul); // print_ints("cumul2",&mmcumul); //mmcumul = _mm_srai_epi32(mmcumul,output_shift); // extract the lower half mmtmp7 = _mm_movepi64_pi64(mmcumul); // print_ints("mmtmp7",&mmtmp7); // pack the result mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7); // print_shorts("mmtmp7",&mmtmp7); // convert back to integer result = _mm_cvtsi64_si32(mmtmp7); _mm_empty(); _m_empty(); return(result); #elif defined(__arm__) int16x4_t *x_128=(int16x4_t*)x; int16x4_t *y_128=(int16x4_t*)y; int32x4_t tmp_re,tmp_im; int32x4_t tmp_re1,tmp_im1; int32x4_t re_cumul,im_cumul; int32x2_t re_cumul2,im_cumul2; int32x4_t shift = vdupq_n_s32(-output_shift); int32x2x2_t result2; int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ; re_cumul = vdupq_n_s32(0); im_cumul = vdupq_n_s32(0); for (n=0; n<(N>>2); n++) { tmp_re = vmull_s16(*x_128++, *y_128++); //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])] tmp_re1 = vmull_s16(*x_128++, *y_128++); //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])] tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)), vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1))); //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] tmp_im = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++); //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++); //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)), vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1))); //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift)); im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift)); } re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul)); im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul)); re_cumul2 = vpadd_s32(re_cumul2,re_cumul2); im_cumul2 = vpadd_s32(im_cumul2,im_cumul2); result2 = vzip_s32(re_cumul2,im_cumul2); return(vget_lane_s32(result2.val[0],0)); #endif }
static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, int32x4_t *const a0, int32x4_t *const a1, int32x4_t *const a2, int32x4_t *const a3) { int32x4_t b0, b1, b2, b3; transpose_s32_4x4(a0, a1, a2, a3); b0 = vaddq_s32(*a0, *a2); b1 = vsubq_s32(*a0, *a2); b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); *a0 = vaddq_s32(b0, b3); *a1 = vaddq_s32(b1, b2); *a2 = vsubq_s32(b1, b2); *a3 = vsubq_s32(b0, b3); }
static OPUS_INLINE void calc_corr( const opus_int32 *const input_QS, opus_int64 *const corr_QC, const opus_int offset, const int32x4_t state_QS_s32x4 ) { int64x2_t corr_QC_s64x2[ 2 ], t_s64x2[ 2 ]; const int32x4_t input_QS_s32x4 = vld1q_s32( input_QS + offset ); corr_QC_s64x2[ 0 ] = vld1q_s64( corr_QC + offset + 0 ); corr_QC_s64x2[ 1 ] = vld1q_s64( corr_QC + offset + 2 ); t_s64x2[ 0 ] = vmull_s32( vget_low_s32( state_QS_s32x4 ), vget_low_s32( input_QS_s32x4 ) ); t_s64x2[ 1 ] = vmull_s32( vget_high_s32( state_QS_s32x4 ), vget_high_s32( input_QS_s32x4 ) ); corr_QC_s64x2[ 0 ] = vsraq_n_s64( corr_QC_s64x2[ 0 ], t_s64x2[ 0 ], 2 * QS - QC ); corr_QC_s64x2[ 1 ] = vsraq_n_s64( corr_QC_s64x2[ 1 ], t_s64x2[ 1 ], 2 * QS - QC ); vst1q_s64( corr_QC + offset + 0, corr_QC_s64x2[ 0 ] ); vst1q_s64( corr_QC + offset + 2, corr_QC_s64x2[ 1 ] ); }
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *istd) { const int16_t *data = (const int16_t *)dataf; const int16_t *weights = (const int16_t *)weightsf; weightsf += n * len / 2; // sizeof(float) / sizeof(int16_t) for (int i = 0; i < n; i += 4) { int32x4_t accum0 = { 0, 0, 0, 0 }; int32x4_t accum1 = accum0; int32x4_t accum2 = accum0; int32x4_t accum3 = accum0; for (int j = 0; j < len; j += 8) { int16x4x2_t d0 = vld2_s16(data + j); int16x4x2_t w0 = vld2_s16(weights); int16x4x2_t w1 = vld2_s16(weights + 8); int16x4x2_t w2 = vld2_s16(weights + 16); int16x4x2_t w3 = vld2_s16(weights + 24); accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]); accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]); accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]); accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]); accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]); accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]); accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]); accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]); weights += 32; } int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0)); int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1)); int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3)); sum0 = vpadd_s32(sum0, sum1); sum1 = vpadd_s32(sum2, sum3); int32x4_t sum = vcombine_s32(sum0, sum1); float32x4_t val = vcvtq_f32_s32(sum); val = vmulq_f32(val, vld1q_f32(weightsf + i*2)); val = vmulq_n_f32(val, istd[0]); val = vaddq_f32(val, vld1q_f32(weightsf + i*2 + 4)); vst1q_f32(vals + i, val); } }
static INLINE void iadst_butterfly_lane_1_0_bd12_neon(const int32x4_t in0, const int32x4_t in1, const int32x2_t c, int64x2_t *const s0, int64x2_t *const s1) { const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 1); const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 0); const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 1); const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 0); s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 0); s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 1); s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 0); s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 1); }
int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size) { int64x2_t error = vdupq_n_s64(0); assert(block_size >= 8); assert((block_size % 8) == 0); do { const int16x8_t c = vld1q_s16(coeff); const int16x8_t d = vld1q_s16(dqcoeff); const int16x8_t diff = vsubq_s16(c, d); const int16x4_t diff_lo = vget_low_s16(diff); const int16x4_t diff_hi = vget_high_s16(diff); // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before // accumulating them in 64-bits. const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); error = vaddq_s64(error, err2); coeff += 8; dqcoeff += 8; block_size -= 8; } while (block_size != 0); return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); }
void test_vget_highs32 (void) { int32x2_t out_int32x2_t; int32x4_t arg0_int32x4_t; out_int32x2_t = vget_high_s32 (arg0_int32x4_t); }
static INLINE void iadst_half_butterfly_bd12_neon(int32x4_t *const x, const int32x2_t c) { const int32x4_t sum = vaddq_s32(x[0], x[1]); const int32x4_t sub = vsubq_s32(x[0], x[1]); const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0); const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0); const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0); const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0); const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS); const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS); const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS); const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS); x[0] = vcombine_s32(out0_lo, out0_hi); x[1] = vcombine_s32(out1_lo, out1_hi); }
static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 ) { int32x4_t t_s32x4, out32_Q14_s32x4; *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 ); /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ) */ *S_s32x4 = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0; */ *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 ); /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */ out32_Q14_s32x4 = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} */ t_s32x4 = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 ); /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 ) */ *S_s32x4 = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 ); /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND(); S{2,3} = silk_RSHIFT_ROUND(); */ t_s32x4 = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 ); /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 ) */ *S_s32x4 = vaddq_s32( *S_s32x4, t_s32x4 ); /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 ); */ t_s32x4 = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 ); /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} ) */ *S_s32x4 = vaddq_s32( *S_s32x4, t_s32x4 ); /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} ); */ }
// ref, src = [0, 510] - max diff = 16-bits // bwl = {2, 3, 4}, width = {16, 32, 64} int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { int width = 4 << bwl; int32x4_t sse = vdupq_n_s32(0); int16x8_t total = vdupq_n_s16(0); assert(width >= 8); assert((width % 8) == 0); do { const int16x8_t r = vld1q_s16(ref); const int16x8_t s = vld1q_s16(src); const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits. const int16x4_t diff_lo = vget_low_s16(diff); const int16x4_t diff_hi = vget_high_s16(diff); sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits. sse = vmlal_s16(sse, diff_hi, diff_hi); total = vaddq_s16(total, diff); // dynamic range 16 bits. ref += 8; src += 8; width -= 8; } while (width != 0); { // Note: 'total''s pairwise addition could be implemented similarly to // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired // with the summation of 'sse' performed better on a Cortex-A15. const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); const int32x2_t t2 = vpadd_s32(t1, t1); const int t = vget_lane_s32(t2, 0); const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'. const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), vreinterpret_s32_s64(vget_high_s64(s0))); const int s = vget_lane_s32(s1, 0); const int shift_factor = bwl + 2; return s - ((t * t) >> shift_factor); } }
inline int32x2_t vget_high(const int32x4_t & v) { return vget_high_s32(v); }
static inline int32_t TransformAndFindMaxNeon(int16_t* inre, int16_t* inim, int32_t* outre, int32_t* outim) { int k; int16_t* inre1 = inre; int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4]; int16_t* inim1 = inim; int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4]; int32_t* outre1 = outre; int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4]; int32_t* outim1 = outim; int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4]; const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0]; const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4]; uint32x4_t max_r = vdupq_n_u32(0); uint32x4_t max_i = vdupq_n_u32(0); // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code. for (k = 0; k < FRAMESAMPLES/4; k += 4) { int16x4_t tmpi = vld1_s16(kSinTab1); kSinTab1 += 4; int16x4_t tmpr = vld1_s16(kSinTab2); kSinTab2 -= 4; int16x4_t inre_0 = vld1_s16(inre1); inre1 += 4; int16x4_t inre_1 = vld1_s16(inre2); inre2 -= 4; int16x4_t inim_0 = vld1_s16(inim1); inim1 += 4; int16x4_t inim_1 = vld1_s16(inim2); inim2 -= 4; tmpr = vneg_s16(tmpr); inre_1 = vrev64_s16(inre_1); inim_1 = vrev64_s16(inim_1); tmpr = vrev64_s16(tmpr); int32x4_t xr = vmull_s16(tmpr, inre_0); int32x4_t xi = vmull_s16(tmpr, inim_0); int32x4_t yr = vmull_s16(tmpr, inim_1); int32x4_t yi = vmull_s16(tmpi, inim_1); xr = vmlal_s16(xr, tmpi, inim_0); xi = vmlsl_s16(xi, tmpi, inre_0); yr = vmlal_s16(yr, tmpi, inre_1); yi = vmlsl_s16(yi, tmpr, inre_1); yr = vnegq_s32(yr); xr = vshrq_n_s32(xr, 5); xi = vshrq_n_s32(xi, 5); yr = vshrq_n_s32(yr, 5); yi = vshrq_n_s32(yi, 5); int32x4_t outr0 = vsubq_s32(xr, yi); int32x4_t outr1 = vaddq_s32(xr, yi); int32x4_t outi0 = vaddq_s32(xi, yr); int32x4_t outi1 = vsubq_s32(yr, xi); // Find the absolute maximum in the vectors. int32x4_t tmp0 = vabsq_s32(outr0); int32x4_t tmp1 = vabsq_s32(outr1); int32x4_t tmp2 = vabsq_s32(outi0); int32x4_t tmp3 = vabsq_s32(outi1); // vabs doesn't change the value of 0x80000000. // Use u32 so we don't lose the value 0x80000000. max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2)); max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3)); // Store the vectors. outr1 = vrev64q_s32(outr1); outi1 = vrev64q_s32(outi1); int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1)); int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1)); vst1q_s32(outre1, outr0); outre1 += 4; vst1q_s32(outim1, outi0); outim1 += 4; vst1q_s32(outre2, outr_1); outre2 -= 4; vst1q_s32(outim2, outi_1); outim2 -= 4; } max_r = vmaxq_u32(max_r, max_i); #if defined(WEBRTC_ARCH_ARM64) uint32_t maximum = vmaxvq_u32(max_r); #else uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r)); max32x2_r = vpmax_u32(max32x2_r, max32x2_r); uint32_t maximum = vget_lane_u32(max32x2_r, 0); #endif return (int32_t)maximum; }
static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { const int32x4_t c0 = create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); const int32x4_t c1 = create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); const int32x4_t c2 = create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); int32x4_t x[8], t[4]; int64x2_t s[8][2]; x[0] = *io7; x[1] = *io0; x[2] = *io5; x[3] = *io2; x[4] = *io3; x[5] = *io4; x[6] = *io1; x[7] = *io6; // stage 1 iadst_butterfly_lane_0_1_bd12_neon(x[0], x[1], vget_low_s32(c0), s[0], s[1]); iadst_butterfly_lane_0_1_bd12_neon(x[2], x[3], vget_high_s32(c0), s[2], s[3]); iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_low_s32(c1), s[4], s[5]); iadst_butterfly_lane_0_1_bd12_neon(x[6], x[7], vget_high_s32(c1), s[6], s[7]); x[0] = add_dct_const_round_shift_low_8_bd12(s[0], s[4]); x[1] = add_dct_const_round_shift_low_8_bd12(s[1], s[5]); x[2] = add_dct_const_round_shift_low_8_bd12(s[2], s[6]); x[3] = add_dct_const_round_shift_low_8_bd12(s[3], s[7]); x[4] = sub_dct_const_round_shift_low_8_bd12(s[0], s[4]); x[5] = sub_dct_const_round_shift_low_8_bd12(s[1], s[5]); x[6] = sub_dct_const_round_shift_low_8_bd12(s[2], s[6]); x[7] = sub_dct_const_round_shift_low_8_bd12(s[3], s[7]); // stage 2 t[0] = x[0]; t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_high_s32(c2), s[4], s[5]); iadst_butterfly_lane_1_0_bd12_neon(x[7], x[6], vget_high_s32(c2), s[7], s[6]); x[0] = vaddq_s32(t[0], t[2]); x[1] = vaddq_s32(t[1], t[3]); x[2] = vsubq_s32(t[0], t[2]); x[3] = vsubq_s32(t[1], t[3]); x[4] = add_dct_const_round_shift_low_8_bd12(s[4], s[6]); x[5] = add_dct_const_round_shift_low_8_bd12(s[5], s[7]); x[6] = sub_dct_const_round_shift_low_8_bd12(s[4], s[6]); x[7] = sub_dct_const_round_shift_low_8_bd12(s[5], s[7]); // stage 3 iadst_half_butterfly_bd12_neon(x + 2, vget_low_s32(c2)); iadst_half_butterfly_bd12_neon(x + 6, vget_low_s32(c2)); *io0 = x[0]; *io1 = vnegq_s32(x[4]); *io2 = x[6]; *io3 = vnegq_s32(x[2]); *io4 = x[3]; *io5 = vnegq_s32(x[7]); *io6 = x[5]; *io7 = vnegq_s32(x[1]); }
int32x2_t test_vget_high_s32(int32x4_t a) { // CHECK-COMMON-LABEL: test_vget_high_s32: return vget_high_s32(a); // CHECK-AARCH64: dup d0, {{v[0-9]+}}.d[1] // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8 }
int32x2_t test_vget_high_s32(int32x4_t a) { // CHECK-LABEL: test_vget_high_s32: return vget_high_s32(a); // CHECK: dup d0, {{v[0-9]+}}.d[1] }
static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre, int16_t* inim, int32_t* outre1, int32_t* outre2, int32_t sh) { int k; int16_t* p_inre = inre; int16_t* p_inim = inim; int32_t* p_outre1 = outre1; int32_t* p_outre2 = outre2; const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0]; const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0]; int32x4_t shift = vdupq_n_s32(-sh - 16); // Divide through by the normalizing constant: // scale all values with 1/240, i.e. with 273 in Q16. // 273/65536 ~= 0.0041656 // 1/240 ~= 0.0041666 int16x8_t scale = vdupq_n_s16(273); // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727. int factQ19 = 31727 << 16; int32x4_t fact = vdupq_n_s32(factQ19); for (k = 0; k < FRAMESAMPLES/2; k += 8) { int16x8_t inre16x8 = vld1q_s16(p_inre); int16x8_t inim16x8 = vld1q_s16(p_inim); p_inre += 8; p_inim += 8; int16x8_t tmpr = vld1q_s16(kCosTab); int16x8_t tmpi = vld1q_s16(kSinTab); kCosTab += 8; kSinTab += 8; // By vshl and vmull, we effectively did "<< (-sh - 16)", // instead of "<< (-sh)" and ">> 16" as in the C code. int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale)); int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale)); #if defined(WEBRTC_ARCH_ARM64) int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale); int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale); #else int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8), vget_high_s16(scale)); int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8), vget_high_s16(scale)); #endif outre1_0 = vshlq_s32(outre1_0, shift); outre1_1 = vshlq_s32(outre1_1, shift); outre2_0 = vshlq_s32(outre2_0, shift); outre2_1 = vshlq_s32(outre2_1, shift); // Demodulate and separate. int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr)); int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi)); #if defined(WEBRTC_ARCH_ARM64) int32x4_t tmpr_1 = vmovl_high_s16(tmpr); int32x4_t tmpi_1 = vmovl_high_s16(tmpi); #else int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr)); int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi)); #endif int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0)); int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0)); int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1)); int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1)); xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0)); xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0)); xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1)); xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1)); #if defined(WEBRTC_ARCH_ARM64) int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0); int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0); int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1); int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1); xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0); xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0); xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1); xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1); #else int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0)); int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0)); int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1)); int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1)); xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0)); xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0)); xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1)); xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1)); #endif outre1_0 = vcombine_s32(vshrn_n_s64(xr0, 10), vshrn_n_s64(xr1, 10)); outre2_0 = vcombine_s32(vshrn_n_s64(xi0, 10), vshrn_n_s64(xi1, 10)); outre1_1 = vcombine_s32(vshrn_n_s64(xr2, 10), vshrn_n_s64(xr3, 10)); outre2_1 = vcombine_s32(vshrn_n_s64(xi2, 10), vshrn_n_s64(xi3, 10)); outre1_0 = vqdmulhq_s32(outre1_0, fact); outre2_0 = vqdmulhq_s32(outre2_0, fact); outre1_1 = vqdmulhq_s32(outre1_1, fact); outre2_1 = vqdmulhq_s32(outre2_1, fact); vst1q_s32(p_outre1, outre1_0); p_outre1 += 4; vst1q_s32(p_outre1, outre1_1); p_outre1 += 4; vst1q_s32(p_outre2, outre2_0); p_outre2 += 4; vst1q_s32(p_outre2, outre2_1); p_outre2 += 4; } }
void silk_biquad_alt_stride2_neon( const opus_int16 *in, /* I input signal */ const opus_int32 *B_Q28, /* I MA coefficients [3] */ const opus_int32 *A_Q28, /* I AR coefficients [2] */ opus_int32 *S, /* I/O State vector [4] */ opus_int16 *out, /* O output signal */ const opus_int32 len /* I signal length (must be even) */ ) { /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */ opus_int k = 0; const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 ); const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 ); int16x4_t in_s16x4 = vdup_n_s16( 0 ); int16x4_t out_s16x4; int32x2_t A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2; int32x4_t A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4; int32x2x2_t t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2; #ifdef OPUS_CHECK_ASM opus_int32 S_c[ 4 ]; VARDECL( opus_int16, out_c ); SAVE_STACK; ALLOC( out_c, 2 * len, opus_int16 ); silk_memcpy( &S_c, S, sizeof( S_c ) ); silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len ); #endif /* Negate A_Q28 values and split in two parts */ A_Q28_s32x2 = vld1_s32( A_Q28 ); A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 ); A_L_s32x2 = vshl_n_s32( A_Q28_s32x2, 18 ); /* ( -A_Q28[] & 0x00003FFF ) << 18 */ A_L_s32x2 = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15 */ A_U_s32x2 = vshr_n_s32( A_Q28_s32x2, 14 ); /* silk_RSHIFT( -A_Q28[], 14 ) */ A_U_s32x2 = vshl_n_s32( A_U_s32x2, 16 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */ A_U_s32x2 = vshr_n_s32( A_U_s32x2, 1 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 15 */ B_Q28_s32x2 = vld1_s32( B_Q28 ); t_s32x2 = vld1_s32( B_Q28 + 1 ); t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 ); t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 ); t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 ); A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_L_Q28 */ A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_U_Q28 */ B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] ); /* B_Q28[ {1,1,2,2} ] */ S_s32x4 = vld1q_s32( S ); /* S0 = S[ 0 ]; S3 = S[ 3 ]; */ S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */ S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] ); for( ; k < len - 1; k += 2 ) { int32x4_t in_s32x4[ 2 ], t_s32x4; int32x2_t out32_Q14_s32x2[ 2 ]; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */ in_s16x4 = vld1_s16( &in[ 2 * k ] ); /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */ in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 ); /* in{0,1,2,3} << 15 */ t_s32x4 = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */ in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */ in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] ); silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] ); /* Scale back to Q0 and saturate */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3} */ out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 ); /* out32_Q14_{0,1,2,3} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */ vst1_s16( &out[ 2 * k ], out_s16x4 ); /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */ } /* Process leftover. */ if( k < len ) { int32x4_t in_s32x4; int32x2_t out32_Q14_s32x2; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s32x4 = vshll_n_s16( in_s16x4, 15 ); /* in{0,1} << 15 */ t_s32x2 = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */ in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 ); /* Scale back to Q0 and saturate */ out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 ); /* out32_Q14_{0,1} + (1<<14) - 1 */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */ vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 ); /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */ vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 ); /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */ } vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 ); /* S[ 0 ] = S0; */ vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 ); /* S[ 1 ] = S2; */ vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 ); /* S[ 2 ] = S1; */ vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 ); /* S[ 3 ] = S3; */ #ifdef OPUS_CHECK_ASM silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) ); silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) ); RESTORE_STACK; #endif }
static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, int32x4_t *const a0, int32x4_t *const a1, int32x4_t *const a2, int32x4_t *const a3) { int32x4_t b0, b1, b2, b3; int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; transpose_s32_4x4(a0, a1, a2, a3); b0 = vaddq_s32(*a0, *a2); b1 = vsubq_s32(*a0, *a2); c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); c4 = vsubq_s64(c4, c8); c5 = vsubq_s64(c5, c9); c6 = vaddq_s64(c6, c10); c7 = vaddq_s64(c7, c11); b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), vrshrn_n_s64(c1, DCT_CONST_BITS)); b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), vrshrn_n_s64(c3, DCT_CONST_BITS)); b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), vrshrn_n_s64(c5, DCT_CONST_BITS)); b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), vrshrn_n_s64(c7, DCT_CONST_BITS)); *a0 = vaddq_s32(b0, b3); *a1 = vaddq_s32(b1, b2); *a2 = vsubq_s32(b1, b2); *a3 = vsubq_s32(b0, b3); }
void computeNetwork0new_neon(const float *dataf, const float *weightsf, uint8_t *d) { const int16_t *data = (const int16_t *)dataf; const int16_t *weights = (const int16_t *)weightsf; int32x4_t accum0 = { 0, 0, 0, 0 }; int32x4_t accum1 = accum0; int32x4_t accum2 = accum0; int32x4_t accum3 = accum0; for (int i = 0; i < 128/2; i += 8) { int16x4x2_t d0 = vld2_s16(data + i); int16x4x2_t w0 = vld2_s16(weights + i * 4); int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8); int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16); int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24); accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]); accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]); accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]); accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]); accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]); accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]); accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]); accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]); } int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0)); int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1)); int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3)); sum0 = vpadd_s32(sum0, sum1); sum1 = vpadd_s32(sum2, sum3); int32x4_t sum = vcombine_s32(sum0, sum1); float32x4_t m0 = vcvtq_f32_s32(sum); m0 = vmulq_f32(m0, vld1q_f32(weightsf + 512/4)); m0 = vaddq_f32(m0, vld1q_f32(weightsf + 528/4)); float32x4_t m1, m2, m3, m4; m1 = m0; m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f)); m0 = vaddq_f32(m0, ones_f); m0 = vmulq_f32(reciprocal(m0), m1); m1 = vdupq_lane_f32(vget_low_f32(m0), 0); m2 = vdupq_lane_f32(vget_low_f32(m0), 1); m3 = vdupq_lane_f32(vget_high_f32(m0), 0); m4 = vdupq_lane_f32(vget_high_f32(m0), 1); m1 = vmulq_f32(m1, vld1q_f32(weightsf + 544/4)); m2 = vmulq_f32(m2, vld1q_f32(weightsf + 560/4)); m3 = vmulq_f32(m3, vld1q_f32(weightsf + 576/4)); m4 = vmulq_f32(m4, vld1q_f32(weightsf + 592/4)); m1 = vaddq_f32(m1, m2); m3 = vaddq_f32(m3, m4); m1 = vaddq_f32(m1, m3); m1 = vaddq_f32(m1, vld1q_f32(weightsf + 608/4)); uint32x4_t gte = vcgeq_f32(m1, zeroes_f); uint16x4_t gte_u16 = vmovn_u32(gte); uint8x8_t gte_u8 = vmovn_u16(vcombine_u16(gte_u16, vget_low_u16(vreinterpretq_u16_u32(sign_bits_f)))); gte_u8 = vshr_n_u8(gte_u8, 7); vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(gte_u8), 0); }
// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 { // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3> // CHECK: ret <2 x i32> [[SHUFFLE_I]] int32x2_t test_vget_high_s32(int32x4_t a) { return vget_high_s32(a); }
void mdrc5b_apply_limiter(MDRC5B_LOCAL_STRUCT_T *HeapPtr) { unsigned int LaIdx; unsigned int NumMainCh; unsigned int Samples; unsigned int ch, k, n; MMlong *Ptr; MMlong *Ptr2; MMlong *MemOutPtr; MMshort PeakdB; MMlong PeakMax; int RmsMeasure; MMshort LimiterAtCoef; MMshort LimiterReCoef; MMshort LimiterGainMant[MDRC5B_BLOCK_SIZE + 1]; MMshort LimiterGainExp; MMshort LimiterTargetGaindB; unsigned int LimiterHoldRem; unsigned int LimiterHtSamp; MMshort Exp, TargetGain; MMshort MaxShiftBits; unsigned int lookahead_len = (unsigned int) HeapPtr->LimiterLALen; unsigned int cpt1, cpt2; uint32x2x2_t Temp_u32x2x2; uint32x2_t Ldbits_u32x2, Ldbits2_u32x2; uint32x2_t bsl_u32x2; int32x2_t LimGainMant_32x2; int64x2_t TempX_64x2, MemOut_64x2; int64x2_t Tmp_64x2; int64x2_t LimiterGainExp_64x2, sample_64x2; int64x1_t TempX_64x1, sample_64x1; int32_t *LimiterGainMant_ptr; int32x2_t Tmp_32x2, Ldbits_32x2, n_32x2; int32x2_t TempX_low_32x2, TempX_high_32x2; int32x2x2_t Tmp_32x2x2; int64x1_t Peak_64x1, PeakMax_64x1, Tmp_64x1, diffX_64x1; int64x1_t Peak_scale_pow_64x1, Peak_scale_64x1, Zero_s64x1; int64x1_t MaxShiftBits_neg_64x1, MaxShiftBits_hd_64x1; int64x2_t diffX_64x2; uint64x1_t bsl_u64x1; int32x2_t LimiterPeakCoef_32x2, diffX_low_32x2, diffX_high_32x2; int32x2_t TargetGain_32x2; uint32x2x2_t Peak_u32x2x2; uint32x2_t Peak_exp_u32x2, Peak_exp2_u32x2, Peak_mant_u32x2; int32x2_t x_32x2, xn_32x2, PeakdB_32x2, Peak_exp_32x2; int32x2_t LimiterTargetGaindB_32x2, Exp_32x2, LimiterCoef_32x2; int32x4_t Tmp_32x4; START_PMU_MEASURE(PMU_MEASURE_MRDC5B_APPLY_LIMITER) START_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_COMPUTE_MAX_SHIFT_LEFT) Samples = (unsigned int) HeapPtr->BlockSize; NumMainCh = (unsigned int) HeapPtr->NumMainCh; TempX_64x2 = vdupq_n_s64(0); for(ch = 0; ch < NumMainCh; ch++) { Ptr = HeapPtr->MainInBuf[ch]; // compute the number of bits needs to be shifted to avoid overflow for(k = (Samples >> 1); k > 0; k--) { sample_64x2 = vld1q_s64(Ptr); Ptr +=2; sample_64x2 = veorq_s64(sample_64x2, vshrq_n_s64(sample_64x2, 63)); TempX_64x2 = vorrq_s64(TempX_64x2, sample_64x2); } if(Samples & 1) { sample_64x1 = vld1_s64(Ptr); sample_64x1 = veor_s64(sample_64x1, vshr_n_s64(sample_64x1, 63)); TempX_64x2 = vorrq_s64(TempX_64x2, vcombine_s64(sample_64x1, sample_64x1)); } } TempX_64x1 = vorr_s64(vget_low_s64(TempX_64x2), vget_high_s64(TempX_64x2)); Temp_u32x2x2 = vuzp_u32(vreinterpret_u32_s64(TempX_64x1), vreinterpret_u32_s64(TempX_64x1)); bsl_u32x2 = vceq_u32(Temp_u32x2x2.val[1], vdup_n_u32(0)); // MSB == 0 ? // use clz instead of cls because we are sure that input value is positive // and because cls(LSB) could be wrong (if MSB is equal to 0 and bit 31 of LSL is 1) // thus clz result will be 1 more than cls result (that's why you may see (Ldbits - 1) // instead of Ldbits below) Ldbits_u32x2 = vadd_u32(vclz_u32(Temp_u32x2x2.val[0]), vdup_n_u32(32)); // clz(LSB)+32 Ldbits2_u32x2 = vclz_u32(Temp_u32x2x2.val[1]); // clz(MSB) Ldbits_u32x2 = vbsl_u32(bsl_u32x2, Ldbits_u32x2, Ldbits2_u32x2); // MSB == 0 ? clz(LSB)+32 : clz(MSB) bsl_u32x2 = vceq_u32(Ldbits_u32x2, vdup_n_u32(64)); // Ldbits == 64 ? (i.e. TempX == 0 ?) // the aim of MaxShiftBits is that sample will be shifted so that it occupies // 24 significant bits for 24 bits samples or 32 significant bits for 32 bits samples // but we are in 64 bits architecture on CA9/NEON // so we must right shift of ((64 - 24) - (Ldbits - 1)) bits for 24 bits samples // or of ((64 - 32) - (Ldbits - 1)) bits for 32 bits samples // and we add 1 because it was done this way on MMDSP (I don't know why !) #ifdef SAMPLES_24_BITS // MaxShiftBits = ((64 - 24) - (Ldbits - 1)) + 1 // = 42 - Ldbits Ldbits_32x2 = vsub_s32(vdup_n_s32(42), vreinterpret_s32_u32(Ldbits_u32x2)); #else // SAMPLES_24_BITS // MaxShiftBits = ((64 - 32) - (Ldbits - 1)) + 1 // = 34 - Ldbits Ldbits_32x2 = vsub_s32(vdup_n_s32(34), vreinterpret_s32_u32(Ldbits_u32x2)); #endif // SAMPLES_24_BITS Ldbits_32x2 = vmax_s32(vdup_n_s32(1), Ldbits_32x2); Ldbits_32x2 = vbsl_s32(bsl_u32x2, vdup_n_s32(1), Ldbits_32x2); // if(TempX == 0) Ldbits = 1 MaxShiftBits = vget_lane_s32(Ldbits_32x2, 0); STOP_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_COMPUTE_MAX_SHIFT_LEFT) #ifdef DEBUG_LIMITER_OUTPUT if((debug_cpt_samples >= DEBUG_CPT_MIN) && (debug_cpt_samples <= DEBUG_CPT_MAX)) { char string[100]; debug_write_string("MRDC5B_LIMITER_COMPUTE_MAX_SHIFT_LEFT\n"); sprintf(string, "MaxShiftBits=%d\n", MaxShiftBits); debug_write_string(string); } #endif // DEBUG_LIMITER_OUTPUT START_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_INSERT_NEW_SUBBAND) // insert the new subband samples into the lookahead buffers RmsMeasure = HeapPtr->Limiter.RmsMeasure; LaIdx = (unsigned int) HeapPtr->LimiterLaIdx; if(LaIdx + Samples >= lookahead_len) { cpt1 = lookahead_len - LaIdx; cpt2 = Samples - cpt1; // update index HeapPtr->LimiterLaIdx = (int) cpt2; } else { cpt1 = Samples; cpt2 = 0; // update index HeapPtr->LimiterLaIdx = (int) (LaIdx + Samples); } LimiterPeakCoef_32x2 = vdup_n_s32(HeapPtr->LimiterPeakAtCoef); // LimiterPeakAtCoef, LimiterPeakAtCoef LimiterPeakCoef_32x2 = vset_lane_s32(HeapPtr->LimiterPeakReCoef, LimiterPeakCoef_32x2, 1); // LimiterPeakAtCoef, LimiterPeakReCoef Peak_scale_64x1 = vdup_n_s64(HeapPtr->PrevShiftBits - MaxShiftBits); Peak_scale_pow_64x1 = vshl_n_s64(Peak_scale_64x1, 1); MaxShiftBits_neg_64x1 = vdup_n_s64(-MaxShiftBits); #ifdef SAMPLES_24_BITS MaxShiftBits_hd_64x1 = vdup_n_s64(24 - MaxShiftBits); #else // SAMPLES_24_BITS MaxShiftBits_hd_64x1 = vdup_n_s64(32 - MaxShiftBits); #endif // SAMPLES_24_BITS PeakMax_64x1 = vdup_n_s64(0); for(ch = 0; ch < NumMainCh; ch++) { Ptr = HeapPtr->MainInBuf[ch]; Ptr2 = HeapPtr->LimiterLABuf[ch] + LaIdx; // go to the first valid sample Peak_64x1 = vdup_n_s64(HeapPtr->LimiterPeak[ch]); if(RmsMeasure) { // compensate Peak according to the previous shift bits Peak_64x1 = vqrshl_s64(Peak_64x1, Peak_scale_pow_64x1); // neg value => shift right rounding // rms measure for(k = cpt1; k > 0; k--) { Tmp_64x1 = vld1_s64(Ptr); Ptr++; vst1_s64(Ptr2, Tmp_64x1); Ptr2++; Tmp_64x1 = vqrshl_s64(Tmp_64x1, MaxShiftBits_neg_64x1); Tmp_64x2 = vcombine_s64(Tmp_64x1, Tmp_64x1); Tmp_32x2x2 = vuzp_s32(vget_low_s32(vreinterpretq_s32_s64(Tmp_64x2)), vget_high_s32(vreinterpretq_s32_s64(Tmp_64x2))); Tmp_32x2 = Tmp_32x2x2.val[0]; // LSB of Tmp_64x2 (MSB is dummy) TempX_64x2 = vqdmull_s32(Tmp_32x2, Tmp_32x2); TempX_64x1 = vget_low_s64(TempX_64x2); diffX_64x1 = vqsub_s64(Peak_64x1, TempX_64x1); bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63)); // sign(diffX) diffX_64x2 = vcombine_s64(diffX_64x1, diffX_64x1); diffX_low_32x2 = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32); // wextract_l(diffX), wextract_l(diffX) diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32); // wround_L(diffX), wround_L(diffX) Tmp_64x2 = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2)); // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef) Tmp_64x2 = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2); // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef)) Tmp_64x2 = vqaddq_s64(TempX_64x2, Tmp_64x2); Peak_64x1 = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2)); Tmp_64x1 = vqsub_s64(Peak_64x1, PeakMax_64x1); bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63)); // sign(Peak_64x1 - PeakMax_64x1) PeakMax_64x1 = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1); } Ptr2 = HeapPtr->LimiterLABuf[ch]; for(k = cpt2; k > 0; k--) { Tmp_64x1 = vld1_s64(Ptr); Ptr++; vst1_s64(Ptr2, Tmp_64x1); Ptr2++; Tmp_64x1 = vqrshl_s64(Tmp_64x1, MaxShiftBits_neg_64x1); Tmp_64x2 = vcombine_s64(Tmp_64x1, Tmp_64x1); Tmp_32x2x2 = vuzp_s32(vget_low_s32(vreinterpretq_s32_s64(Tmp_64x2)), vget_high_s32(vreinterpretq_s32_s64(Tmp_64x2))); Tmp_32x2 = Tmp_32x2x2.val[0]; // LSB of Tmp_64x2 (MSB is dummy) TempX_64x2 = vqdmull_s32(Tmp_32x2, Tmp_32x2); TempX_64x1 = vget_low_s64(TempX_64x2); diffX_64x1 = vqsub_s64(Peak_64x1, TempX_64x1); bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63)); // sign(diffX) diffX_64x2 = vcombine_s64(diffX_64x1, diffX_64x1); diffX_low_32x2 = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32); // wextract_l(diffX), wextract_l(diffX) diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32); // wround_L(diffX), wround_L(diffX) Tmp_64x2 = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2)); // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef) Tmp_64x2 = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2); // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef)) Tmp_64x2 = vqaddq_s64(TempX_64x2, Tmp_64x2); Peak_64x1 = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2)); Tmp_64x1 = vqsub_s64(Peak_64x1, PeakMax_64x1); bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63)); // sign(Peak_64x1 - PeakMax_64x1) PeakMax_64x1 = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1); } } else { // compensate Peak according to the previous shift bits Peak_64x1 = vqrshl_s64(Peak_64x1, Peak_scale_64x1); // amplitude measure Zero_s64x1 = vdup_n_s64(0); for(k = cpt1; k > 0; k--) { Tmp_64x1 = vld1_s64(Ptr); Ptr++; vst1_s64(Ptr2, Tmp_64x1); Ptr2++; bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63)); // sign(Tmp_64x1) TempX_64x1 = vqsub_s64(Zero_s64x1, Tmp_64x1); // -Tmp_64x1 TempX_64x1 = vbsl_s64(bsl_u64x1, TempX_64x1, Tmp_64x1); TempX_64x1 = vqrshl_s64(TempX_64x1, MaxShiftBits_hd_64x1); TempX_64x2 = vcombine_s64(TempX_64x1, TempX_64x1); diffX_64x1 = vqsub_s64(Peak_64x1, TempX_64x1); bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63)); // sign(diffX) diffX_64x2 = vcombine_s64(diffX_64x1, diffX_64x1); diffX_low_32x2 = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32); // wextract_l(diffX), wextract_l(diffX) diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32); // wround_L(diffX), wround_L(diffX) Tmp_64x2 = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2)); // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef) Tmp_64x2 = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2); // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef)) Tmp_64x2 = vqaddq_s64(TempX_64x2, Tmp_64x2); Peak_64x1 = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2)); Tmp_64x1 = vqsub_s64(Peak_64x1, PeakMax_64x1); bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63)); // sign(Peak_64x1 - PeakMax_64x1) PeakMax_64x1 = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1); } Ptr2 = HeapPtr->LimiterLABuf[ch]; for(k = cpt2; k > 0; k--) { Tmp_64x1 = vld1_s64(Ptr); Ptr++; vst1_s64(Ptr2, Tmp_64x1); Ptr2++; bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63)); // sign(Tmp_64x1) TempX_64x1 = vqsub_s64(Zero_s64x1, Tmp_64x1); // -Tmp_64x1 TempX_64x1 = vbsl_s64(bsl_u64x1, TempX_64x1, Tmp_64x1); TempX_64x1 = vqrshl_s64(TempX_64x1, MaxShiftBits_hd_64x1); TempX_64x2 = vcombine_s64(TempX_64x1, TempX_64x1); diffX_64x1 = vqsub_s64(Peak_64x1, TempX_64x1); bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(diffX_64x1, 63)); // sign(diffX) diffX_64x2 = vcombine_s64(diffX_64x1, diffX_64x1); diffX_low_32x2 = vshrn_n_s64(vshlq_n_s64(diffX_64x2, 32), 32); // wextract_l(diffX), wextract_l(diffX) diffX_high_32x2 = vrshrn_n_s64(diffX_64x2, 32); // wround_L(diffX), wround_L(diffX) Tmp_64x2 = vmovl_s32(vqrdmulh_s32(LimiterPeakCoef_32x2, diffX_low_32x2)); // (MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), (MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef) Tmp_64x2 = vqdmlal_s32(Tmp_64x2, LimiterPeakCoef_32x2, diffX_high_32x2); // wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakAtCoef), wL_fmul(wround_L(diffX), LimiterPeakAtCoef)), wL_addsat((MMlong) wfmulr(wextract_l(diffX), LimiterPeakReCoef), wL_fmul(wround_L(diffX), LimiterPeakReCoef)) Tmp_64x2 = vqaddq_s64(TempX_64x2, Tmp_64x2); Peak_64x1 = vbsl_s64(bsl_u64x1, vget_low_s64(Tmp_64x2), vget_high_s64(Tmp_64x2)); Tmp_64x1 = vqsub_s64(Peak_64x1, PeakMax_64x1); bsl_u64x1 = vreinterpret_u64_s64(vshr_n_s64(Tmp_64x1, 63)); // sign(Peak_64x1 - PeakMax_64x1) PeakMax_64x1 = vbsl_s64(bsl_u64x1, PeakMax_64x1, Peak_64x1); } } HeapPtr->LimiterPeak[ch] = vget_lane_s64(Peak_64x1, 0); // save history } // for(ch = 0...) PeakMax = vget_lane_s64(PeakMax_64x1, 0); HeapPtr->PrevShiftBits = MaxShiftBits; STOP_PMU_MEASURE(PMU_MEASURE_MRDC5B_LIMITER_INSERT_NEW_SUBBAND) if(PeakMax < MDRC5B_ALMOST_ZERO_THRESH) { PeakdB = (MDRC5B_POWER_DB_MINUS_INF << 16); // 8.16, [-128.0, 127.0] dB } else { Peak_u32x2x2 = vuzp_u32(vreinterpret_u32_s64(PeakMax_64x1), vreinterpret_u32_s64(PeakMax_64x1)); bsl_u32x2 = vceq_u32(Peak_u32x2x2.val[1], vdup_n_u32(0)); Peak_exp_u32x2 = vadd_u32(vclz_u32(Peak_u32x2x2.val[0]), vdup_n_u32(32)); Peak_exp2_u32x2 = vclz_u32(Peak_u32x2x2.val[1]); Peak_exp_u32x2 = vbsl_u32(bsl_u32x2, Peak_exp_u32x2, Peak_exp2_u32x2); Peak_mant_u32x2 = vrshrn_n_u64(vshlq_u64(vreinterpretq_u64_s64(vcombine_s64(PeakMax_64x1, PeakMax_64x1)), vreinterpretq_s64_u64(vmovl_u32(Peak_exp_u32x2))), 32); // if(Peak_mant >= sqrt(0.5)) // { // Peak_exp--; // Peak_mant >>= 1; // } bsl_u32x2 = vcge_u32(Peak_mant_u32x2, vdup_n_u32(0xB504F334)); Peak_exp_u32x2 = vbsl_u32(bsl_u32x2, vsub_u32(Peak_exp_u32x2, vdup_n_u32(1)), Peak_exp_u32x2); Peak_mant_u32x2 = vbsl_u32(bsl_u32x2, vrshr_n_u32(Peak_mant_u32x2, 1), Peak_mant_u32x2); Peak_exp_32x2 = vreinterpret_s32_u32(Peak_exp_u32x2); #ifdef SAMPLES_24_BITS // correction of 16 bits if input samples are 24 bits Peak_exp_32x2 = vsub_s32(Peak_exp_32x2, vdup_n_s32(16)); #endif // SAMPLES_24_BITS // at this point : sqrt(0.5)/2 <= Peak_mant < sqrt(0.5) // // ln(1+x) = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 + x^9/9 - x^10/10 ... accuracy OK if |x| < 0.5 // sqrt(0.5)/2 <= Peak_mant < sqrt(0.5) => sqrt(0.5)-1 <= 2*Peak_mant-1 < 2*sqrt(0.5)-1 // => ln(Peak_mant) = ln(1+x)-ln(2) with x=2*Peak_mant-1, i.e. |x| < 0.414214... // x=2*PeakMax_mant-1 in Q31 // => sqrt(0.5)-1 <= x < 2*sqrt(0.5)-1 x_32x2 = vreinterpret_s32_u32(vsub_u32(Peak_mant_u32x2, vdup_n_u32(0x80000000))); PeakdB_32x2 = x_32x2; // PeakdB = x xn_32x2 = vqrdmulh_s32(x_32x2, x_32x2); // xn = x^2 PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vrshr_n_s32(xn_32x2, 1)); // PeakdB = x - x^2/2 xn_32x2 = vqrdmulh_s32(xn_32x2, x_32x2); // xn = x^3 PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x2AAAAAAB))); // PeakdB = x - x^2/2 + x^3/3 xn_32x2 = vqrdmulh_s32(xn_32x2, x_32x2); // xn = x^4 PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vrshr_n_s32(xn_32x2, 2)); // PeakdB = x - x^2/2 + x^3/3 - x^4/4 xn_32x2 = vqrdmulh_s32(xn_32x2, x_32x2); // xn = x^5 PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x1999999A))); // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 xn_32x2 = vqrdmulh_s32(xn_32x2, x_32x2); // xn = x^6 PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x15555555))); // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 xn_32x2 = vqrdmulh_s32(xn_32x2, x_32x2); // xn = x^7 PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x12492492))); // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 xn_32x2 = vqrdmulh_s32(xn_32x2, x_32x2); // xn = x^8 PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vrshr_n_s32(xn_32x2, 3)); // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 xn_32x2 = vqrdmulh_s32(xn_32x2, x_32x2); // xn = x^9 PeakdB_32x2 = vqadd_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x0E38E38E))); // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 + x^9/9 xn_32x2 = vqrdmulh_s32(xn_32x2, x_32x2); // xn = x^10 PeakdB_32x2 = vqsub_s32(PeakdB_32x2, vqrdmulh_s32(xn_32x2, vdup_n_s32(0x0CCCCCCD))); // PeakdB = x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 + x^7/7 - x^8/8 + x^9/9 - x^10/10 // at this point : PeakMaxdB contains ln(1+x) in Q31 if(RmsMeasure) { // dB(power) = 10*log10(power) // PeakMaxdB = 10*log10(PeakMax)+20*log10(2)*(HEADROOM+MaxShiftBits) // = 10*ln(PeakMax)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits) // = 10/ln(10)*ln(PeakMax_mant*2^(-PeakMax_exp))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits) // = 10/ln(10)*(ln(PeakMax_mant)-PeakMax_exp*ln(2))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits) // = 10/ln(10)*ln(PeakMax_mant)-PeakMax_exp*10*ln(2)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits) // = 10/ln(10)*ln(PeakMax_mant)+10*ln(2)/ln(10)*(2*(HEADROOM+MaxShiftBits)-PeakMax_exp) // // => RmsdB = 10/ln(10)*ln(1+x)+10*ln(2)/ln(10)*(2*(HEADROOM+MaxShiftBits)-PeakMax_exp) // => RmsdB (Q16) = 0x457CB*ln(1+x)+0x302A3*(2*(HEADROOM+MaxShiftBits)-PeakMax_exp) // fractional mutiply 0x457CB*ln(1+x) in Q16 PeakdB_32x2 = vqrdmulh_s32(PeakdB_32x2, vdup_n_s32(0x457CB)); // PeakdB_exp = 2*(HEADROOM+MaxShiftBits)-PeakdB_exp Peak_exp_32x2 = vsub_s32(vdup_n_s32(2 * (HEADROOM + MaxShiftBits)), Peak_exp_32x2); // PeakMaxdB final value (integer mac 0x302A3*PeakdB_exp) PeakdB_32x2 = vmla_s32(PeakdB_32x2, Peak_exp_32x2, vdup_n_s32(0x302A3)); } else { // dB(power) = 20*log10(abs) // PeakMaxdB = 20*log10(PeakMax)+20*log10(2)*(HEADROOM+MaxShiftBits) // = 20*ln(PeakMax)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits) // = 20/ln(10)*ln(PeakMax_mant*2^(-PeakMax_exp))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits) // = 20/ln(10)*(ln(PeakMax_mant)-PeakMax_exp*ln(2))+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits) // = 20/ln(10)*ln(PeakMax_mant)-PeakMax_exp*20*ln(2)/ln(10)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits) // = 20/ln(10)*ln(PeakMax_mant)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits-PeakMax_exp) // // => RmsdB = 20/ln(10)*ln(1+x)+20*ln(2)/ln(10)*(HEADROOM+MaxShiftBits-PeakMax_exp) // => RmsdB (Q16) = 0x8AF96*ln(1+x)+0x60546*(HEADROOM+MaxShiftBits-PeakMax_exp) // fractional mutiply 0x8AF96*ln(1+x) in Q16 PeakdB_32x2 = vqrdmulh_s32(PeakdB_32x2, vdup_n_s32(0x8AF96)); // PeakdB_exp = HEADROOM+MaxShiftBits-PeakdB_exp Peak_exp_32x2 = vsub_s32(vdup_n_s32(HEADROOM + MaxShiftBits), Peak_exp_32x2); // PeakMaxdB final value (integer mac 0x60546*PeakdB_exp) PeakdB_32x2 = vmla_s32(PeakdB_32x2, Peak_exp_32x2, vdup_n_s32(0x60546)); } PeakdB = vget_lane_s32(PeakdB_32x2, 0); } #ifdef DEBUG_LIMITER_OUTPUT if((debug_cpt_samples >= DEBUG_CPT_MIN) && (debug_cpt_samples <= DEBUG_CPT_MAX)) { char string[100]; debug_write_string("MRDC5B_LIMITER_PEAKMAX_PEAKDB\n"); sprintf(string, "PeakMax=0x%012llX, HEADROOM+MaxShiftBits=%d => PeakdB=0x%06X\n", #ifdef SAMPLES_24_BITS PeakMax & 0xFFFFFFFFFFFFLL, #else // SAMPLES_24_BITS (PeakMax >> 16) & 0xFFFFFFFFFFFFLL, #endif // SAMPLES_24_BITS HEADROOM + MaxShiftBits, PeakdB & 0xFFFFFF); debug_write_string(string); }
void silk_warped_autocorrelation_FIX_neon( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *input, /* I Input data to correlate */ const opus_int warping_Q16, /* I Warping coefficient */ const opus_int length, /* I Length of input */ const opus_int order /* I Correlation order (even) */ ) { if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) { silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order ); } else { opus_int n, i, lsh; opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */ opus_int64 corr_QC_orderT; int64x2_t lsh_s64x2; const opus_int orderT = ( order + 3 ) & ~3; opus_int64 *corr_QCT; opus_int32 *input_QS; VARDECL( opus_int32, input_QST ); VARDECL( opus_int32, state ); SAVE_STACK; /* Order must be even */ silk_assert( ( order & 1 ) == 0 ); silk_assert( 2 * QS - QC >= 0 ); ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 ); input_QS = input_QST; /* input_QS has zero paddings in the beginning and end. */ vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; /* Loop over samples */ for( n = 0; n < length - 7; n += 8, input_QS += 8 ) { const int16x8_t t0_s16x4 = vld1q_s16( input + n ); vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) ); vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) ); } for( ; n < length; n++, input_QS++ ) { input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS ); } vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT; /* The following loop runs ( length + order ) times, with ( order ) extra epilogues. */ /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues. */ /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */ /* Keep the C code here to help understand the intrinsics optimization. */ /* { opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; opus_int32 *state_QST[ 3 ]; state_QST[ 0 ] = state_QS[ 0 ]; state_QST[ 1 ] = state_QS[ 1 ]; for( n = 0; n < length + order; n++, input_QS++ ) { state_QST[ 0 ][ orderT ] = input_QS[ orderT ]; for( i = 0; i < orderT; i++ ) { corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC ); state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 ); } state_QST[ 2 ] = state_QST[ 0 ]; state_QST[ 0 ] = state_QST[ 1 ]; state_QST[ 1 ] = state_QST[ 2 ]; } } */ { const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 ); const opus_int32 *in = input_QS + orderT; opus_int o = orderT; int32x4_t state_QS_s32x4[ 3 ][ 2 ]; ALLOC( state, length + orderT, opus_int32 ); state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 ); /* Calculate 8 taps of all inputs in each loop. */ do { state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] = state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 ); n = 0; do { calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] ); calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] ); state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n ); vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 ); state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ]; } while( ++n < ( length + order ) ); in = state; o -= 8; } while( o > 4 ); if( o ) { /* Calculate the last 4 taps of all inputs. */ opus_int32 *stateT = state; silk_assert( o == 4 ); state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 ); n = length + order; do { calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] ); state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT ); vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; input_QS++; stateT++; } while( --n ); } } { const opus_int16 *inputT = input; int32x4_t t_s32x4; int64x1_t t_s64x1; int64x2_t t_s64x2 = vdupq_n_s64( 0 ); for( n = 0; n <= length - 8; n += 8 ) { int16x8_t input_s16x8 = vld1q_s16( inputT ); t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) ); t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) ); inputT += 8; } t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) ); corr_QC_orderT = vget_lane_s64( t_s64x1, 0 ); for( ; n < length; n++ ) { corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] ); } corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC ); corr_QC[ orderT ] = corr_QC_orderT; } corr_QCT = corr_QC + orderT - order; lsh = silk_CLZ64( corr_QC_orderT ) - 35; lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); *scale = -( QC + lsh ); silk_assert( *scale >= -30 && *scale <= 12 ); lsh_s64x2 = vdupq_n_s64( lsh ); for( i = 0; i <= order - 3; i += 4 ) { int32x4_t corr_s32x4; int64x2_t corr_QC0_s64x2, corr_QC1_s64x2; corr_QC0_s64x2 = vld1q_s64( corr_QCT + i ); corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 ); corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 ); corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 ); corr_s32x4 = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) ); corr_s32x4 = vrev64q_s32( corr_s32x4 ); vst1q_s32( corr + order - i - 3, corr_s32x4 ); } if( lsh >= 0 ) { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) ); } } else { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) ); } } silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/ RESTORE_STACK; } #ifdef OPUS_CHECK_ASM { opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ]; opus_int scale_c; silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order ); silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) ); silk_assert( scale_c == *scale ); } #endif }
void computeNetwork0_i16_neon(const float *inputf, const float *weightsf, uint8_t *d) { const int16_t *input = (const int16_t *)inputf; const int16_t *weights = (const int16_t *)weightsf; int32x4_t accum0 = { 0, 0, 0, 0 }; int32x4_t accum1 = accum0; int32x4_t accum2 = accum0; int32x4_t accum3 = accum0; for (int i = 0; i < 96/2; i += 8) { int16x4x2_t d0 = vld2_s16(input + i); int16x4x2_t w0 = vld2_s16(weights + i * 4); int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8); int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16); int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24); accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]); accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]); accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]); accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]); accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]); accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]); accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]); accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]); } int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0)); int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1)); int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3)); sum0 = vpadd_s32(sum0, sum1); sum1 = vpadd_s32(sum2, sum3); int32x4_t sum = vcombine_s32(sum0, sum1); float32x4_t m0 = vcvtq_f32_s32(sum); m0 = vmulq_f32(m0, vld1q_f32(weightsf + 384/4)); m0 = vaddq_f32(m0, vld1q_f32(weightsf + 400/4)); float32x4_t m1, m2, m3, m4, m5, m6, m7; m1 = m0; m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l)); m0 = vaddq_f32(m0, ones_f); m0 = vmulq_f32(reciprocal(m0), m1); m1 = vdupq_lane_f32(vget_low_f32(m0), 0); m2 = vdupq_lane_f32(vget_low_f32(m0), 1); m3 = vdupq_lane_f32(vget_high_f32(m0), 0); m4 = vdupq_lane_f32(vget_high_f32(m0), 1); m1 = vmulq_f32(m1, vld1q_f32(weightsf + 416/4)); m2 = vmulq_f32(m2, vld1q_f32(weightsf + (416+16)/4)); m3 = vmulq_f32(m3, vld1q_f32(weightsf + (416+32)/4)); m4 = vmulq_f32(m4, vld1q_f32(weightsf + (416+48)/4)); m1 = vaddq_f32(m1, m2); m3 = vaddq_f32(m3, m4); m1 = vaddq_f32(m1, m3); m1 = vaddq_f32(m1, vld1q_f32(weightsf + (416+64)/4)); m7 = m1; m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f)); m1 = vaddq_f32(m1, ones_f); m7 = vmulq_f32(reciprocal(m1), m7); m3 = m0; m0 = vdupq_lane_f32(vget_low_f32(m0), 0); m1 = vdupq_lane_f32(vget_low_f32(m3), 1); m2 = vdupq_lane_f32(vget_high_f32(m3), 0); m3 = vdupq_lane_f32(vget_high_f32(m3), 1); m0 = vmulq_f32(m0, vld1q_f32(weightsf + 496/4)); m1 = vmulq_f32(m1, vld1q_f32(weightsf + (496+16)/4)); m2 = vmulq_f32(m2, vld1q_f32(weightsf + (496+32)/4)); m3 = vmulq_f32(m3, vld1q_f32(weightsf + (496+48)/4)); m4 = vdupq_lane_f32(vget_low_f32(m7), 0); m5 = vdupq_lane_f32(vget_low_f32(m7), 1); m6 = vdupq_lane_f32(vget_high_f32(m7), 0); m7 = vdupq_lane_f32(vget_high_f32(m7), 1); m4 = vmulq_f32(m4, vld1q_f32(weightsf + (496+64)/4)); m5 = vmulq_f32(m5, vld1q_f32(weightsf + (496+80)/4)); m6 = vmulq_f32(m6, vld1q_f32(weightsf + (496+96)/4)); m7 = vmulq_f32(m7, vld1q_f32(weightsf + (496+112)/4)); m0 = vaddq_f32(m0, m1); m2 = vaddq_f32(m2, m3); m4 = vaddq_f32(m4, m5); m6 = vaddq_f32(m6, m7); m0 = vaddq_f32(m0, m2); m4 = vaddq_f32(m4, m6); m0 = vaddq_f32(m0, m4); m0 = vaddq_f32(m0, vld1q_f32(weightsf + (496+128)/4)); float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0)); d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0)); }
int mult_cpx_conj_vector(int16_t *x1, int16_t *x2, int16_t *y, uint32_t N, int output_shift, int madd) { // Multiply elementwise the complex conjugate of x1 with x2. // x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| // We assume x1 with a dinamic of 15 bit maximum // // x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| // We assume x2 with a dinamic of 14 bit maximum /// // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| // // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; // // output_shift - shift to be applied to generate output // // madd - add the output to y uint32_t i; // loop counter simd_q15_t *x1_128; simd_q15_t *x2_128; simd_q15_t *y_128; #if defined(__x86_64__) || defined(__i386__) simd_q15_t tmp_re,tmp_im; simd_q15_t tmpy0,tmpy1; #elif defined(__arm__) int32x4_t tmp_re,tmp_im; int32x4_t tmp_re1,tmp_im1; int16x4x2_t tmpy; int32x4_t shift = vdupq_n_s32(-output_shift); #endif x1_128 = (simd_q15_t *)&x1[0]; x2_128 = (simd_q15_t *)&x2[0]; y_128 = (simd_q15_t *)&y[0]; // we compute 4 cpx multiply for each loop for(i=0; i<(N>>2); i++) { #if defined(__x86_64__) || defined(__i386__) tmp_re = _mm_madd_epi16(*x1_128,*x2_128); tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1)); tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1)); tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]); tmp_im = _mm_madd_epi16(tmp_im,*x2_128); tmp_re = _mm_srai_epi32(tmp_re,output_shift); tmp_im = _mm_srai_epi32(tmp_im,output_shift); tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im); tmpy1 = _mm_unpackhi_epi32(tmp_re,tmp_im); if (madd==0) *y_128 = _mm_packs_epi32(tmpy0,tmpy1); else *y_128 += _mm_packs_epi32(tmpy0,tmpy1); #elif defined(__arm__) tmp_re = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]); //tmp_re = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])] tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]); //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])] tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)), vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1))); //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] tmp_im = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[0]); //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[1]); //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)), vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1))); //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] tmp_re = vqshlq_s32(tmp_re,shift); tmp_im = vqshlq_s32(tmp_im,shift); tmpy = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im)); if (madd==0) *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]); else *y_128 += vcombine_s16(tmpy.val[0],tmpy.val[1]); #endif x1_128++; x2_128++; y_128++; } _mm_empty(); _m_empty(); return(0); }