int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N, //must be a multiple of 8
                    uint8_t output_shift)
{

  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
  int32_t result;

  x128 = (__m128i*) x;
  y128 = (__m128i*) y;

  mmcumul_re = _mm_setzero_si128();
  mmcumul_im = _mm_setzero_si128();

  for (n=0; n<(N>>2); n++) {

    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
    // print_shorts("x",&x128[0]);
    // print_shorts("y",&y128[0]);

    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
    mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
    // print_ints("re",&mmtmp1);
    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
    // print_ints("re",&mmcumul_re);

    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
    mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
    // print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
    // print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
    // print_shorts("y",&mmtmp2);

    mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
    // print_ints("im",&mmtmp3);
    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
    // print_ints("im",&mmcumul_im);

    x128++;
    y128++;
  }

  // this gives Re Re Im Im
  mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
  // print_ints("cumul1",&mmcumul);

  // this gives Re Im Re Im
  mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);
  // print_ints("cumul2",&mmcumul);

  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);
  // extract the lower half
  mmtmp7 = _mm_movepi64_pi64(mmcumul);
  // print_ints("mmtmp7",&mmtmp7);
  // pack the result
  mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7);
  // print_shorts("mmtmp7",&mmtmp7);
  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

  return(result);

#elif defined(__arm__)
  int16x4_t *x_128=(int16x4_t*)x;
  int16x4_t *y_128=(int16x4_t*)y;
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int32x4_t re_cumul,im_cumul;
  int32x2_t re_cumul2,im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift);
  int32x2x2_t result2;
  int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0);

  for (n=0; n<(N>>2); n++) {

    tmp_re  = vmull_s16(*x_128++, *y_128++);
    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
    tmp_re1 = vmull_s16(*x_128++, *y_128++);
    //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])]

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
  }

  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
  result2   = vzip_s32(re_cumul2,im_cumul2);
  return(vget_lane_s32(result2.val[0],0));
#endif
}
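/* Illustrative scalar reference for dot_product() above (an addition for clarity, not
 * part of the original source): z = sum_n conj(x[n]) * y[n] over N interleaved Q15
 * complex samples, with each 32-bit partial product shifted right by output_shift
 * before accumulation, and the real/imaginary totals saturated to 16 bits and packed
 * into the low/high halves of the 32-bit return value, mirroring the SSE path. */
#include <stdint.h>

static int16_t sat16(int32_t v)
{
  return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
}

int32_t dot_product_ref(const int16_t *x, const int16_t *y, uint32_t N, uint8_t output_shift)
{
  int32_t re = 0, im = 0;
  for (uint32_t n = 0; n < N; n++) {
    int32_t xr = x[2 * n], xi = x[2 * n + 1];
    int32_t yr = y[2 * n], yi = y[2 * n + 1];
    re += (xr * yr + xi * yi) >> output_shift;   /* Re(conj(x[n]) * y[n]) */
    im += (xr * yi - xi * yr) >> output_shift;   /* Im(conj(x[n]) * y[n]) */
  }
  return (int32_t)((uint16_t)sat16(re) | ((uint32_t)(uint16_t)sat16(im) << 16));
}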
static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4,
    int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 )
{
    int32x4_t t_s32x4, out32_Q14_s32x4;

    *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 );              /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} )                                      */
    *S_s32x4         = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0;                                                    */
    *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 );                          /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */
    out32_Q14_s32x4  = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 );         /* out32_Q14_{0,1,0,1}                                                             */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 )                            */
    *S_s32x4         = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 );                      /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND(); S{2,3} = silk_RSHIFT_ROUND();            */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 )                            */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 );          */
    t_s32x4          = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 );                      /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} )                                  */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} );                        */
}
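/* Sketch of the fixed-point identity the kernel above relies on (added here for
 * illustration; smulwb_ref is a name introduced in this sketch, not an Opus symbol):
 * silk_SMULWB( a, b ) is approximately ( a * (int16_t)b ) >> 16, and the caller
 * pre-shifts one operand left by 15 bits (the split A coefficients, or the input
 * samples for the B taps) so that vqdmulhq_s32(), which returns ( 2 * a * b ) >> 32,
 * yields the same value for the operand ranges used here. */
#include <stdint.h>

static inline int32_t smulwb_ref(int32_t a32, int32_t b32)
{
    /* Multiply a32 by the bottom 16 bits of b32 (treated as signed) and keep the
     * upper 32 bits of the product. */
    return (int32_t)(((int64_t)a32 * (int16_t)b32) >> 16);
}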
inline int v_signmask(const v_uint32x4& a)
{
    int32x2_t m0 = vcreate_s32(CV_BIG_UINT(0x0000000100000000));
    uint32x4_t v0 = vshlq_u32(vshrq_n_u32(a.val, 31), vcombine_s32(m0, m0));
    uint64x2_t v1 = vpaddlq_u32(v0);
    return (int)vgetq_lane_u64(v1, 0) + ((int)vgetq_lane_u64(v1, 1) << 2);
}
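/* Scalar equivalent of v_signmask() above (added for illustration; a is passed here as
 * a plain array rather than the v_uint32x4 wrapper): the right shift collects each
 * lane's sign bit, the per-lane variable shift weights lanes 1 and 3 by 2, and the
 * 64-bit pairwise add plus the final << 2 reassemble bits 0..3 of the mask. */
inline int v_signmask_ref(const unsigned int a[4])
{
    int mask = 0;
    for (int i = 0; i < 4; i++)
        mask |= (int)(a[i] >> 31) << i;
    return mask;
}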
static INLINE void iadst_half_butterfly_bd12_neon(int32x4_t *const x,
                                                  const int32x2_t c) {
  const int32x4_t sum = vaddq_s32(x[0], x[1]);
  const int32x4_t sub = vsubq_s32(x[0], x[1]);
  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0);
  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0);
  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0);
  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0);
  const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS);
  const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS);
  const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS);
  const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS);

  x[0] = vcombine_s32(out0_lo, out0_hi);
  x[1] = vcombine_s32(out1_lo, out1_hi);
}
static INLINE int32x4_t sub_dct_const_round_shift_low_8_bd12(
    const int64x2_t *const in0, const int64x2_t *const in1) {
  const int64x2_t sub_lo = vsubq_s64(in0[0], in1[0]);
  const int64x2_t sub_hi = vsubq_s64(in0[1], in1[1]);
  const int32x2_t out_lo = vrshrn_n_s64(sub_lo, DCT_CONST_BITS);
  const int32x2_t out_hi = vrshrn_n_s64(sub_hi, DCT_CONST_BITS);
  return vcombine_s32(out_lo, out_hi);
}
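/* Scalar sketch of the vrshrn_n_s64() step used by the two helpers above (added for
 * illustration; in libvpx DCT_CONST_BITS is 14): add half of the divisor, shift right
 * arithmetically, then truncate the 64-bit result to 32 bits. */
#include <stdint.h>

static int32_t round_shift_narrow_s64(int64_t v, int bits)
{
  return (int32_t)((v + (INT64_C(1) << (bits - 1))) >> bits);
}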
int32x4_t bar (int64_t x)
{
  int32x2_t i = vcreate_s32 (x);
  int32x2_t zeroes = vcreate_s32 (0l);
  int32x4_t ret = vcombine_s32 (i, zeroes);
  return ret;
}
int32x4_t foo (int32x2_t *x)
{
  int32x2_t i = *x;
  int32x2_t zeroes = vcreate_s32 (0l);
  int32x4_t ret = vcombine_s32 (i, zeroes);
  return ret;
}
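/* Minimal usage sketch for bar() and foo() above (added for illustration):
 * vcreate_s32() reinterprets the 64-bit scalar as two 32-bit lanes (low lane first),
 * and the vcombine_s32() with a zero vector leaves lanes 2 and 3 cleared. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  int64_t packed = ((int64_t)7 << 32) | (uint32_t)-3;  /* lane 0 = -3, lane 1 = 7 */
  int32x4_t v = bar (packed);
  printf ("%d %d %d %d\n",
          vgetq_lane_s32 (v, 0), vgetq_lane_s32 (v, 1),
          vgetq_lane_s32 (v, 2), vgetq_lane_s32 (v, 3));  /* prints: -3 7 0 0 */
  return 0;
}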
void test_vcombines32 (void)
{
  int32x4_t out_int32x4_t;
  int32x2_t arg0_int32x2_t;
  int32x2_t arg1_int32x2_t;

  out_int32x4_t = vcombine_s32 (arg0_int32x2_t, arg1_int32x2_t);
}
static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
                                          int32x4_t *const a0,
                                          int32x4_t *const a1,
                                          int32x4_t *const a2,
                                          int32x4_t *const a3) {
  int32x4_t b0, b1, b2, b3;
  int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11;

  transpose_s32_4x4(a0, a1, a2, a3);
  b0 = vaddq_s32(*a0, *a2);
  b1 = vsubq_s32(*a0, *a2);
  c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
  c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
  c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
  c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
  c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1);
  c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1);
  c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1);
  c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1);
  c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1);
  c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1);
  c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1);
  c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1);
  c4 = vsubq_s64(c4, c8);
  c5 = vsubq_s64(c5, c9);
  c6 = vaddq_s64(c6, c10);
  c7 = vaddq_s64(c7, c11);
  b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
                    vrshrn_n_s64(c1, DCT_CONST_BITS));
  b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
                    vrshrn_n_s64(c3, DCT_CONST_BITS));
  b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
                    vrshrn_n_s64(c5, DCT_CONST_BITS));
  b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
                    vrshrn_n_s64(c7, DCT_CONST_BITS));
  *a0 = vaddq_s32(b0, b3);
  *a1 = vaddq_s32(b1, b2);
  *a2 = vsubq_s32(b1, b2);
  *a3 = vsubq_s32(b0, b3);
}
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *istd) { const int16_t *data = (const int16_t *)dataf; const int16_t *weights = (const int16_t *)weightsf; weightsf += n * len / 2; // sizeof(float) / sizeof(int16_t) for (int i = 0; i < n; i += 4) { int32x4_t accum0 = { 0, 0, 0, 0 }; int32x4_t accum1 = accum0; int32x4_t accum2 = accum0; int32x4_t accum3 = accum0; for (int j = 0; j < len; j += 8) { int16x4x2_t d0 = vld2_s16(data + j); int16x4x2_t w0 = vld2_s16(weights); int16x4x2_t w1 = vld2_s16(weights + 8); int16x4x2_t w2 = vld2_s16(weights + 16); int16x4x2_t w3 = vld2_s16(weights + 24); accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]); accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]); accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]); accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]); accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]); accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]); accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]); accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]); weights += 32; } int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0)); int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1)); int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3)); sum0 = vpadd_s32(sum0, sum1); sum1 = vpadd_s32(sum2, sum3); int32x4_t sum = vcombine_s32(sum0, sum1); float32x4_t val = vcvtq_f32_s32(sum); val = vmulq_f32(val, vld1q_f32(weightsf + i*2)); val = vmulq_n_f32(val, istd[0]); val = vaddq_f32(val, vld1q_f32(weightsf + i*2 + 4)); vst1q_f32(vals + i, val); } }
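/* Standalone sketch of the horizontal-reduction idiom used at the end of
 * dotProd_i16_neon() above (an illustrative helper; the name hsum4_s32 is introduced
 * here): lane i of the result is the full sum of the four lanes of accumulator i,
 * built from pairwise adds followed by a vcombine_s32(). */
#include <arm_neon.h>

static inline int32x4_t hsum4_s32(int32x4_t a0, int32x4_t a1, int32x4_t a2, int32x4_t a3)
{
    int32x2_t s0 = vpadd_s32(vget_low_s32(a0), vget_high_s32(a0));
    int32x2_t s1 = vpadd_s32(vget_low_s32(a1), vget_high_s32(a1));
    int32x2_t s2 = vpadd_s32(vget_low_s32(a2), vget_high_s32(a2));
    int32x2_t s3 = vpadd_s32(vget_low_s32(a3), vget_high_s32(a3));
    s0 = vpadd_s32(s0, s1);        /* { sum(a0), sum(a1) } */
    s1 = vpadd_s32(s2, s3);        /* { sum(a2), sum(a3) } */
    return vcombine_s32(s0, s1);   /* { sum(a0), sum(a1), sum(a2), sum(a3) } */
}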
int main (void)
{
  int64_t arg1;
  int32_t arg2;
  int32x4_t arg3;

  int64_t actual;
  int64_t expected;

  arg1 = -9223182289494545592LL;
  arg2 = 32768;
  arg3 = vcombine_s32 (vcreate_s32 (0xffff7fff8000ffffULL),
                       vcreate_s32 (0x80000000ffff0000ULL));

  actual = vqdmlals_laneq_s32 (arg1, arg2, arg3, 3);
  expected = -9223323026982900920LL;

  if (expected != actual)
    abort ();

  return 0;
}
int main (void)
{
  int64_t arg1;
  int32_t arg2;
  int32x4_t arg3;

  int64_t actual;
  int64_t expected;

  arg1 = 140733193453567LL;
  arg2 = 25544;
  arg3 = vcombine_s32 (vcreate_s32 (0x417b8000ffff8397LL),
                       vcreate_s32 (0x7fffffff58488000LL));

  actual = vqdmlsls_laneq_s32 (arg1, arg2, arg3, 3);
  expected = 31022548895631LL;

  if (expected != actual)
    abort ();

  return 0;
}
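/* Scalar sketch of the saturating doubling multiply-accumulate intrinsics exercised by
 * the two tests above (added for illustration; saturation of the doubling product and
 * of the final add/subtract is omitted because the test operands do not overflow):
 * lane `lane` of the vector is selected, its product with `a` is doubled, then added
 * to or subtracted from the 64-bit accumulator. */
#include <stdint.h>

static int64_t qdmlal_lane_ref(int64_t acc, int32_t a, const int32_t v[4], int lane)
{
  return acc + 2 * (int64_t)a * (int64_t)v[lane];   /* vqdmlals_laneq_s32 */
}

static int64_t qdmlsl_lane_ref(int64_t acc, int32_t a, const int32_t v[4], int lane)
{
  return acc - 2 * (int64_t)a * (int64_t)v[lane];   /* vqdmlsls_laneq_s32 */
}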
int rotate_cpx_vector(int16_t *x, int16_t *alpha, int16_t *y, uint32_t N, uint16_t output_shift) { // Multiply elementwise two complex vectors of N elements // x - input 1 in the format |Re0 Im0 |,......,|Re(N-1) Im(N-1)| // We assume x1 with a dynamic of 15 bit maximum // // alpha - input 2 in the format |Re0 Im0| // We assume x2 with a dynamic of 15 bit maximum // // y - output in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| // // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; // // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) // WARNING: log2_amp>0 can cause overflow!! uint32_t i; // loop counter simd_q15_t *y_128,alpha_128; int32_t *xd=(int32_t *)x; #if defined(__x86_64__) || defined(__i386__) __m128i shift = _mm_cvtsi32_si128(output_shift); register simd_q15_t m0,m1,m2,m3; ((int16_t *)&alpha_128)[0] = alpha[0]; ((int16_t *)&alpha_128)[1] = -alpha[1]; ((int16_t *)&alpha_128)[2] = alpha[1]; ((int16_t *)&alpha_128)[3] = alpha[0]; ((int16_t *)&alpha_128)[4] = alpha[0]; ((int16_t *)&alpha_128)[5] = -alpha[1]; ((int16_t *)&alpha_128)[6] = alpha[1]; ((int16_t *)&alpha_128)[7] = alpha[0]; #elif defined(__arm__) int32x4_t shift; int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32; int16_t reflip[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; int32x4x2_t xtmp; ((int16_t *)&alpha_128)[0] = alpha[0]; ((int16_t *)&alpha_128)[1] = alpha[1]; ((int16_t *)&alpha_128)[2] = alpha[0]; ((int16_t *)&alpha_128)[3] = alpha[1]; ((int16_t *)&alpha_128)[4] = alpha[0]; ((int16_t *)&alpha_128)[5] = alpha[1]; ((int16_t *)&alpha_128)[6] = alpha[0]; ((int16_t *)&alpha_128)[7] = alpha[1]; int16x8_t bflip = vrev32q_s16(alpha_128); int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip); shift = vdupq_n_s32(-output_shift); #endif y_128 = (simd_q15_t *) y; for(i=0; i<N>>2; i++) { #if defined(__x86_64__) || defined(__i386__) m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]); m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]); m2 = _mm_madd_epi16(m0,alpha_128); //complex multiply. result is 32bit [Re Im Re Im] m3 = _mm_madd_epi16(m1,alpha_128); //complex multiply. result is 32bit [Re Im Re Im] m2 = _mm_sra_epi32(m2,shift); // shift right by shift in order to compensate for the input amplitude m3 = _mm_sra_epi32(m3,shift); // shift right by shift in order to compensate for the input amplitude y_128[0] = _mm_packs_epi32(m2,m3); // pack in 16bit integers with saturation [re im re im re im re im] #elif defined(__arm__) ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]); ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]); ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]); ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]); re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])), shift); im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])), shift); xtmp = vzipq_s32(re32,im32); y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1])); #endif xd+=4; y_128+=1; } _mm_empty(); _m_empty(); return(0); }
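/* Scalar reference sketch for rotate_cpx_vector() above (added for clarity; the clamp
 * mirrors the saturating 16-bit pack of the SSE path, while the NEON path narrows by
 * truncation): every complex sample of x is multiplied by the single complex scalar
 * alpha, shifted right by output_shift, and stored interleaved in y. */
#include <stdint.h>

static int16_t clamp_q15(int32_t v)
{
  return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
}

void rotate_cpx_vector_ref(const int16_t *x, const int16_t *alpha, int16_t *y,
                           uint32_t N, uint16_t output_shift)
{
  const int32_t ar = alpha[0], ai = alpha[1];
  for (uint32_t i = 0; i < N; i++) {
    int32_t xr = x[2 * i], xi = x[2 * i + 1];
    y[2 * i]     = clamp_q15((xr * ar - xi * ai) >> output_shift); /* Re(alpha * x[i]) */
    y[2 * i + 1] = clamp_q15((xr * ai + xi * ar) >> output_shift); /* Im(alpha * x[i]) */
  }
}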
inline int32x4_t vcombine(const int32x2_t & v0, const int32x2_t & v1) { return vcombine_s32(v0, v1); }
void silk_biquad_alt_stride2_neon( const opus_int16 *in, /* I input signal */ const opus_int32 *B_Q28, /* I MA coefficients [3] */ const opus_int32 *A_Q28, /* I AR coefficients [2] */ opus_int32 *S, /* I/O State vector [4] */ opus_int16 *out, /* O output signal */ const opus_int32 len /* I signal length (must be even) */ ) { /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */ opus_int k = 0; const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 ); const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 ); int16x4_t in_s16x4 = vdup_n_s16( 0 ); int16x4_t out_s16x4; int32x2_t A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2; int32x4_t A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4; int32x2x2_t t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2; #ifdef OPUS_CHECK_ASM opus_int32 S_c[ 4 ]; VARDECL( opus_int16, out_c ); SAVE_STACK; ALLOC( out_c, 2 * len, opus_int16 ); silk_memcpy( &S_c, S, sizeof( S_c ) ); silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len ); #endif /* Negate A_Q28 values and split in two parts */ A_Q28_s32x2 = vld1_s32( A_Q28 ); A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 ); A_L_s32x2 = vshl_n_s32( A_Q28_s32x2, 18 ); /* ( -A_Q28[] & 0x00003FFF ) << 18 */ A_L_s32x2 = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15 */ A_U_s32x2 = vshr_n_s32( A_Q28_s32x2, 14 ); /* silk_RSHIFT( -A_Q28[], 14 ) */ A_U_s32x2 = vshl_n_s32( A_U_s32x2, 16 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */ A_U_s32x2 = vshr_n_s32( A_U_s32x2, 1 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 15 */ B_Q28_s32x2 = vld1_s32( B_Q28 ); t_s32x2 = vld1_s32( B_Q28 + 1 ); t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 ); t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 ); t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 ); A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_L_Q28 */ A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_U_Q28 */ B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] ); /* B_Q28[ {1,1,2,2} ] */ S_s32x4 = vld1q_s32( S ); /* S0 = S[ 0 ]; S3 = S[ 3 ]; */ S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */ S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] ); for( ; k < len - 1; k += 2 ) { int32x4_t in_s32x4[ 2 ], t_s32x4; int32x2_t out32_Q14_s32x2[ 2 ]; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */ in_s16x4 = vld1_s16( &in[ 2 * k ] ); /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */ in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 ); /* in{0,1,2,3} << 15 */ t_s32x4 = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */ in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */ in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] ); silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] ); /* Scale back to Q0 and saturate */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3} */ out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 ); /* out32_Q14_{0,1,2,3} + (1<<14) - 1 
*/ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */ vst1_s16( &out[ 2 * k ], out_s16x4 ); /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */ } /* Process leftover. */ if( k < len ) { int32x4_t in_s32x4; int32x2_t out32_Q14_s32x2; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s32x4 = vshll_n_s16( in_s16x4, 15 ); /* in{0,1} << 15 */ t_s32x2 = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */ in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 ); /* Scale back to Q0 and saturate */ out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 ); /* out32_Q14_{0,1} + (1<<14) - 1 */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */ vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 ); /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */ vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 ); /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */ } vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 ); /* S[ 0 ] = S0; */ vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 ); /* S[ 1 ] = S2; */ vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 ); /* S[ 2 ] = S1; */ vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 ); /* S[ 3 ] = S3; */ #ifdef OPUS_CHECK_ASM silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) ); silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) ); RESTORE_STACK; #endif }
void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, int const_alpha) { const uint *src = (const uint *) srcPixels; uint *dst = (uint *) destPixels; int16x8_t half = vdupq_n_s16(0x80); int16x8_t full = vdupq_n_s16(0xff); if (const_alpha == 256) { for (int y = 0; y < h; ++y) { int x = 0; for (; x < w-3; x += 4) { int32x4_t src32 = vld1q_s32((int32_t *)&src[x]); if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) { // all opaque vst1q_s32((int32_t *)&dst[x], src32); } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) { int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]); const uint8x16_t src8 = vreinterpretq_u8_s32(src32); const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32); const uint8x8_t src8_low = vget_low_u8(src8); const uint8x8_t dst8_low = vget_low_u8(dst8); const uint8x8_t src8_high = vget_high_u8(src8); const uint8x8_t dst8_high = vget_high_u8(dst8); const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low)); const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low)); const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high)); const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high)); const int16x8_t result16_low = qvsource_over_s16(src16_low, dst16_low, half, full); const int16x8_t result16_high = qvsource_over_s16(src16_high, dst16_high, half, full); const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low)); const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high)); vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high)); } } for (; x<w; ++x) { uint s = src[x]; if (s >= 0xff000000) dst[x] = s; else if (s != 0) dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); } dst = (quint32 *)(((uchar *) dst) + dbpl); src = (const quint32 *)(((const uchar *) src) + sbpl); } } else if (const_alpha != 0) { const_alpha = (const_alpha * 255) >> 8; int16x8_t const_alpha16 = vdupq_n_s16(const_alpha); for (int y = 0; y < h; ++y) { int x = 0; for (; x < w-3; x += 4) { if (src[x] | src[x+1] | src[x+2] | src[x+3]) { int32x4_t src32 = vld1q_s32((int32_t *)&src[x]); int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]); const uint8x16_t src8 = vreinterpretq_u8_s32(src32); const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32); const uint8x8_t src8_low = vget_low_u8(src8); const uint8x8_t dst8_low = vget_low_u8(dst8); const uint8x8_t src8_high = vget_high_u8(src8); const uint8x8_t dst8_high = vget_high_u8(dst8); const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low)); const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low)); const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high)); const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high)); const int16x8_t srcalpha16_low = qvbyte_mul_s16(src16_low, const_alpha16, half); const int16x8_t srcalpha16_high = qvbyte_mul_s16(src16_high, const_alpha16, half); const int16x8_t result16_low = qvsource_over_s16(srcalpha16_low, dst16_low, half, full); const int16x8_t result16_high = qvsource_over_s16(srcalpha16_high, dst16_high, half, full); const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low)); const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high)); vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high)); } } for (; x<w; ++x) { uint s = src[x]; if (s != 0) { s = BYTE_MUL(s, const_alpha); dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); } } dst = (quint32 *)(((uchar *) dst) + dbpl); src = (const quint32 
*)(((const uchar *) src) + sbpl); } }
static inline int32_t TransformAndFindMaxNeon(int16_t* inre, int16_t* inim, int32_t* outre, int32_t* outim) { int k; int16_t* inre1 = inre; int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4]; int16_t* inim1 = inim; int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4]; int32_t* outre1 = outre; int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4]; int32_t* outim1 = outim; int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4]; const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0]; const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4]; uint32x4_t max_r = vdupq_n_u32(0); uint32x4_t max_i = vdupq_n_u32(0); // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code. for (k = 0; k < FRAMESAMPLES/4; k += 4) { int16x4_t tmpi = vld1_s16(kSinTab1); kSinTab1 += 4; int16x4_t tmpr = vld1_s16(kSinTab2); kSinTab2 -= 4; int16x4_t inre_0 = vld1_s16(inre1); inre1 += 4; int16x4_t inre_1 = vld1_s16(inre2); inre2 -= 4; int16x4_t inim_0 = vld1_s16(inim1); inim1 += 4; int16x4_t inim_1 = vld1_s16(inim2); inim2 -= 4; tmpr = vneg_s16(tmpr); inre_1 = vrev64_s16(inre_1); inim_1 = vrev64_s16(inim_1); tmpr = vrev64_s16(tmpr); int32x4_t xr = vmull_s16(tmpr, inre_0); int32x4_t xi = vmull_s16(tmpr, inim_0); int32x4_t yr = vmull_s16(tmpr, inim_1); int32x4_t yi = vmull_s16(tmpi, inim_1); xr = vmlal_s16(xr, tmpi, inim_0); xi = vmlsl_s16(xi, tmpi, inre_0); yr = vmlal_s16(yr, tmpi, inre_1); yi = vmlsl_s16(yi, tmpr, inre_1); yr = vnegq_s32(yr); xr = vshrq_n_s32(xr, 5); xi = vshrq_n_s32(xi, 5); yr = vshrq_n_s32(yr, 5); yi = vshrq_n_s32(yi, 5); int32x4_t outr0 = vsubq_s32(xr, yi); int32x4_t outr1 = vaddq_s32(xr, yi); int32x4_t outi0 = vaddq_s32(xi, yr); int32x4_t outi1 = vsubq_s32(yr, xi); // Find the absolute maximum in the vectors. int32x4_t tmp0 = vabsq_s32(outr0); int32x4_t tmp1 = vabsq_s32(outr1); int32x4_t tmp2 = vabsq_s32(outi0); int32x4_t tmp3 = vabsq_s32(outi1); // vabs doesn't change the value of 0x80000000. // Use u32 so we don't lose the value 0x80000000. max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2)); max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3)); // Store the vectors. outr1 = vrev64q_s32(outr1); outi1 = vrev64q_s32(outi1); int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1)); int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1)); vst1q_s32(outre1, outr0); outre1 += 4; vst1q_s32(outim1, outi0); outim1 += 4; vst1q_s32(outre2, outr_1); outre2 -= 4; vst1q_s32(outim2, outi_1); outim2 -= 4; } max_r = vmaxq_u32(max_r, max_i); #if defined(WEBRTC_ARCH_ARM64) uint32_t maximum = vmaxvq_u32(max_r); #else uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r)); max32x2_r = vpmax_u32(max32x2_r, max32x2_r); uint32_t maximum = vget_lane_u32(max32x2_r, 0); #endif return (int32_t)maximum; }
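/* Standalone sketch of the horizontal-maximum idiom used at the end of
 * TransformAndFindMaxNeon() above (an illustrative helper; horizontal_max_u32 is a
 * name introduced here, and __aarch64__ is used in place of WEBRTC_ARCH_ARM64):
 * AArch64 has a single across-vector instruction, while ARMv7 folds the halves and
 * then takes a pairwise maximum. */
#include <arm_neon.h>
#include <stdint.h>

static inline uint32_t horizontal_max_u32(uint32x4_t v)
{
#if defined(__aarch64__)
  return vmaxvq_u32(v);
#else
  uint32x2_t m = vmax_u32(vget_low_u32(v), vget_high_u32(v));
  m = vpmax_u32(m, m);
  return vget_lane_u32(m, 0);
#endif
}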
// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %low, <2 x i32> %high) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vcombine_s32(int32x2_t low, int32x2_t high) {
  return vcombine_s32(low, high);
}
static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre, int16_t* inim, int32_t* outre1, int32_t* outre2, int32_t sh) { int k; int16_t* p_inre = inre; int16_t* p_inim = inim; int32_t* p_outre1 = outre1; int32_t* p_outre2 = outre2; const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0]; const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0]; int32x4_t shift = vdupq_n_s32(-sh - 16); // Divide through by the normalizing constant: // scale all values with 1/240, i.e. with 273 in Q16. // 273/65536 ~= 0.0041656 // 1/240 ~= 0.0041666 int16x8_t scale = vdupq_n_s16(273); // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727. int factQ19 = 31727 << 16; int32x4_t fact = vdupq_n_s32(factQ19); for (k = 0; k < FRAMESAMPLES/2; k += 8) { int16x8_t inre16x8 = vld1q_s16(p_inre); int16x8_t inim16x8 = vld1q_s16(p_inim); p_inre += 8; p_inim += 8; int16x8_t tmpr = vld1q_s16(kCosTab); int16x8_t tmpi = vld1q_s16(kSinTab); kCosTab += 8; kSinTab += 8; // By vshl and vmull, we effectively did "<< (-sh - 16)", // instead of "<< (-sh)" and ">> 16" as in the C code. int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale)); int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale)); #if defined(WEBRTC_ARCH_ARM64) int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale); int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale); #else int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8), vget_high_s16(scale)); int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8), vget_high_s16(scale)); #endif outre1_0 = vshlq_s32(outre1_0, shift); outre1_1 = vshlq_s32(outre1_1, shift); outre2_0 = vshlq_s32(outre2_0, shift); outre2_1 = vshlq_s32(outre2_1, shift); // Demodulate and separate. int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr)); int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi)); #if defined(WEBRTC_ARCH_ARM64) int32x4_t tmpr_1 = vmovl_high_s16(tmpr); int32x4_t tmpi_1 = vmovl_high_s16(tmpi); #else int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr)); int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi)); #endif int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0)); int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0)); int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1)); int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1)); xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0)); xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0)); xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1)); xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1)); #if defined(WEBRTC_ARCH_ARM64) int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0); int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0); int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1); int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1); xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0); xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0); xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1); xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1); #else int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0)); int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0)); int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1)); int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1)); xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0)); xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0)); xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1)); xi3 = 
vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1)); #endif outre1_0 = vcombine_s32(vshrn_n_s64(xr0, 10), vshrn_n_s64(xr1, 10)); outre2_0 = vcombine_s32(vshrn_n_s64(xi0, 10), vshrn_n_s64(xi1, 10)); outre1_1 = vcombine_s32(vshrn_n_s64(xr2, 10), vshrn_n_s64(xr3, 10)); outre2_1 = vcombine_s32(vshrn_n_s64(xi2, 10), vshrn_n_s64(xi3, 10)); outre1_0 = vqdmulhq_s32(outre1_0, fact); outre2_0 = vqdmulhq_s32(outre2_0, fact); outre1_1 = vqdmulhq_s32(outre1_1, fact); outre2_1 = vqdmulhq_s32(outre2_1, fact); vst1q_s32(p_outre1, outre1_0); p_outre1 += 4; vst1q_s32(p_outre1, outre1_1); p_outre1 += 4; vst1q_s32(p_outre2, outre2_0); p_outre2 += 4; vst1q_s32(p_outre2, outre2_1); p_outre2 += 4; } }
void silk_warped_autocorrelation_FIX_neon( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *input, /* I Input data to correlate */ const opus_int warping_Q16, /* I Warping coefficient */ const opus_int length, /* I Length of input */ const opus_int order /* I Correlation order (even) */ ) { if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) { silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order ); } else { opus_int n, i, lsh; opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */ opus_int64 corr_QC_orderT; int64x2_t lsh_s64x2; const opus_int orderT = ( order + 3 ) & ~3; opus_int64 *corr_QCT; opus_int32 *input_QS; VARDECL( opus_int32, input_QST ); VARDECL( opus_int32, state ); SAVE_STACK; /* Order must be even */ silk_assert( ( order & 1 ) == 0 ); silk_assert( 2 * QS - QC >= 0 ); ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 ); input_QS = input_QST; /* input_QS has zero paddings in the beginning and end. */ vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; /* Loop over samples */ for( n = 0; n < length - 7; n += 8, input_QS += 8 ) { const int16x8_t t0_s16x4 = vld1q_s16( input + n ); vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) ); vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) ); } for( ; n < length; n++, input_QS++ ) { input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS ); } vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT; /* The following loop runs ( length + order ) times, with ( order ) extra epilogues. */ /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues. */ /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */ /* Keep the C code here to help understand the intrinsics optimization. */ /* { opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; opus_int32 *state_QST[ 3 ]; state_QST[ 0 ] = state_QS[ 0 ]; state_QST[ 1 ] = state_QS[ 1 ]; for( n = 0; n < length + order; n++, input_QS++ ) { state_QST[ 0 ][ orderT ] = input_QS[ orderT ]; for( i = 0; i < orderT; i++ ) { corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC ); state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 ); } state_QST[ 2 ] = state_QST[ 0 ]; state_QST[ 0 ] = state_QST[ 1 ]; state_QST[ 1 ] = state_QST[ 2 ]; } } */ { const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 ); const opus_int32 *in = input_QS + orderT; opus_int o = orderT; int32x4_t state_QS_s32x4[ 3 ][ 2 ]; ALLOC( state, length + orderT, opus_int32 ); state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 ); /* Calculate 8 taps of all inputs in each loop. 
*/ do { state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] = state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 ); n = 0; do { calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] ); calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] ); state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n ); vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 ); state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ]; } while( ++n < ( length + order ) ); in = state; o -= 8; } while( o > 4 ); if( o ) { /* Calculate the last 4 taps of all inputs. */ opus_int32 *stateT = state; silk_assert( o == 4 ); state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 ); n = length + order; do { calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] ); state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT ); vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; input_QS++; stateT++; } while( --n ); } } { const opus_int16 *inputT = input; int32x4_t t_s32x4; int64x1_t t_s64x1; int64x2_t t_s64x2 = vdupq_n_s64( 0 ); for( n = 0; n <= length - 8; n += 8 ) { int16x8_t input_s16x8 = vld1q_s16( inputT ); t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) ); t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) ); inputT += 8; } t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) ); corr_QC_orderT = vget_lane_s64( t_s64x1, 0 ); for( ; n < length; n++ ) { corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] ); } corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC ); corr_QC[ orderT ] = corr_QC_orderT; } corr_QCT = corr_QC + orderT - order; lsh = silk_CLZ64( corr_QC_orderT ) - 35; lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); *scale = -( QC + lsh ); silk_assert( *scale >= -30 && *scale <= 12 ); lsh_s64x2 = vdupq_n_s64( lsh ); for( i = 0; i <= order - 3; i += 4 ) { int32x4_t corr_s32x4; int64x2_t corr_QC0_s64x2, corr_QC1_s64x2; corr_QC0_s64x2 = vld1q_s64( corr_QCT + i ); corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 ); corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 ); corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 ); corr_s32x4 = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) ); corr_s32x4 = vrev64q_s32( corr_s32x4 ); vst1q_s32( corr + order - i - 3, corr_s32x4 ); } if( lsh >= 0 ) { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) ); } } else { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) ); } } silk_assert( corr_QCT[ order ] >= 0 ); /* If 
breaking, decrease QC*/ RESTORE_STACK; } #ifdef OPUS_CHECK_ASM { opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ]; opus_int scale_c; silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order ); silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) ); silk_assert( scale_c == *scale ); } #endif }
void computeNetwork0new_neon(const float *dataf, const float *weightsf, uint8_t *d) { const int16_t *data = (const int16_t *)dataf; const int16_t *weights = (const int16_t *)weightsf; int32x4_t accum0 = { 0, 0, 0, 0 }; int32x4_t accum1 = accum0; int32x4_t accum2 = accum0; int32x4_t accum3 = accum0; for (int i = 0; i < 128/2; i += 8) { int16x4x2_t d0 = vld2_s16(data + i); int16x4x2_t w0 = vld2_s16(weights + i * 4); int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8); int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16); int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24); accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]); accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]); accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]); accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]); accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]); accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]); accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]); accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]); } int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0)); int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1)); int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3)); sum0 = vpadd_s32(sum0, sum1); sum1 = vpadd_s32(sum2, sum3); int32x4_t sum = vcombine_s32(sum0, sum1); float32x4_t m0 = vcvtq_f32_s32(sum); m0 = vmulq_f32(m0, vld1q_f32(weightsf + 512/4)); m0 = vaddq_f32(m0, vld1q_f32(weightsf + 528/4)); float32x4_t m1, m2, m3, m4; m1 = m0; m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f)); m0 = vaddq_f32(m0, ones_f); m0 = vmulq_f32(reciprocal(m0), m1); m1 = vdupq_lane_f32(vget_low_f32(m0), 0); m2 = vdupq_lane_f32(vget_low_f32(m0), 1); m3 = vdupq_lane_f32(vget_high_f32(m0), 0); m4 = vdupq_lane_f32(vget_high_f32(m0), 1); m1 = vmulq_f32(m1, vld1q_f32(weightsf + 544/4)); m2 = vmulq_f32(m2, vld1q_f32(weightsf + 560/4)); m3 = vmulq_f32(m3, vld1q_f32(weightsf + 576/4)); m4 = vmulq_f32(m4, vld1q_f32(weightsf + 592/4)); m1 = vaddq_f32(m1, m2); m3 = vaddq_f32(m3, m4); m1 = vaddq_f32(m1, m3); m1 = vaddq_f32(m1, vld1q_f32(weightsf + 608/4)); uint32x4_t gte = vcgeq_f32(m1, zeroes_f); uint16x4_t gte_u16 = vmovn_u32(gte); uint8x8_t gte_u8 = vmovn_u16(vcombine_u16(gte_u16, vget_low_u16(vreinterpretq_u16_u32(sign_bits_f)))); gte_u8 = vshr_n_u8(gte_u8, 7); vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(gte_u8), 0); }
void computeNetwork0_i16_neon(const float *inputf, const float *weightsf, uint8_t *d) { const int16_t *input = (const int16_t *)inputf; const int16_t *weights = (const int16_t *)weightsf; int32x4_t accum0 = { 0, 0, 0, 0 }; int32x4_t accum1 = accum0; int32x4_t accum2 = accum0; int32x4_t accum3 = accum0; for (int i = 0; i < 96/2; i += 8) { int16x4x2_t d0 = vld2_s16(input + i); int16x4x2_t w0 = vld2_s16(weights + i * 4); int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8); int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16); int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24); accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]); accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]); accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]); accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]); accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]); accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]); accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]); accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]); } int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0)); int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1)); int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3)); sum0 = vpadd_s32(sum0, sum1); sum1 = vpadd_s32(sum2, sum3); int32x4_t sum = vcombine_s32(sum0, sum1); float32x4_t m0 = vcvtq_f32_s32(sum); m0 = vmulq_f32(m0, vld1q_f32(weightsf + 384/4)); m0 = vaddq_f32(m0, vld1q_f32(weightsf + 400/4)); float32x4_t m1, m2, m3, m4, m5, m6, m7; m1 = m0; m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l)); m0 = vaddq_f32(m0, ones_f); m0 = vmulq_f32(reciprocal(m0), m1); m1 = vdupq_lane_f32(vget_low_f32(m0), 0); m2 = vdupq_lane_f32(vget_low_f32(m0), 1); m3 = vdupq_lane_f32(vget_high_f32(m0), 0); m4 = vdupq_lane_f32(vget_high_f32(m0), 1); m1 = vmulq_f32(m1, vld1q_f32(weightsf + 416/4)); m2 = vmulq_f32(m2, vld1q_f32(weightsf + (416+16)/4)); m3 = vmulq_f32(m3, vld1q_f32(weightsf + (416+32)/4)); m4 = vmulq_f32(m4, vld1q_f32(weightsf + (416+48)/4)); m1 = vaddq_f32(m1, m2); m3 = vaddq_f32(m3, m4); m1 = vaddq_f32(m1, m3); m1 = vaddq_f32(m1, vld1q_f32(weightsf + (416+64)/4)); m7 = m1; m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f)); m1 = vaddq_f32(m1, ones_f); m7 = vmulq_f32(reciprocal(m1), m7); m3 = m0; m0 = vdupq_lane_f32(vget_low_f32(m0), 0); m1 = vdupq_lane_f32(vget_low_f32(m3), 1); m2 = vdupq_lane_f32(vget_high_f32(m3), 0); m3 = vdupq_lane_f32(vget_high_f32(m3), 1); m0 = vmulq_f32(m0, vld1q_f32(weightsf + 496/4)); m1 = vmulq_f32(m1, vld1q_f32(weightsf + (496+16)/4)); m2 = vmulq_f32(m2, vld1q_f32(weightsf + (496+32)/4)); m3 = vmulq_f32(m3, vld1q_f32(weightsf + (496+48)/4)); m4 = vdupq_lane_f32(vget_low_f32(m7), 0); m5 = vdupq_lane_f32(vget_low_f32(m7), 1); m6 = vdupq_lane_f32(vget_high_f32(m7), 0); m7 = vdupq_lane_f32(vget_high_f32(m7), 1); m4 = vmulq_f32(m4, vld1q_f32(weightsf + (496+64)/4)); m5 = vmulq_f32(m5, vld1q_f32(weightsf + (496+80)/4)); m6 = vmulq_f32(m6, vld1q_f32(weightsf + (496+96)/4)); m7 = vmulq_f32(m7, vld1q_f32(weightsf + (496+112)/4)); m0 = vaddq_f32(m0, m1); m2 = vaddq_f32(m2, m3); m4 = vaddq_f32(m4, m5); m6 = vaddq_f32(m6, m7); m0 = vaddq_f32(m0, m2); m4 = vaddq_f32(m4, m6); m0 = vaddq_f32(m0, m4); m0 = vaddq_f32(m0, vld1q_f32(weightsf + (496+128)/4)); float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0)); d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0)); }
int mult_cpx_conj_vector(int16_t *x1, int16_t *x2, int16_t *y, uint32_t N, int output_shift, int madd) { // Multiply elementwise the complex conjugate of x1 with x2. // x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| // We assume x1 with a dinamic of 15 bit maximum // // x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| // We assume x2 with a dinamic of 14 bit maximum /// // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| // // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; // // output_shift - shift to be applied to generate output // // madd - add the output to y uint32_t i; // loop counter simd_q15_t *x1_128; simd_q15_t *x2_128; simd_q15_t *y_128; #if defined(__x86_64__) || defined(__i386__) simd_q15_t tmp_re,tmp_im; simd_q15_t tmpy0,tmpy1; #elif defined(__arm__) int32x4_t tmp_re,tmp_im; int32x4_t tmp_re1,tmp_im1; int16x4x2_t tmpy; int32x4_t shift = vdupq_n_s32(-output_shift); #endif x1_128 = (simd_q15_t *)&x1[0]; x2_128 = (simd_q15_t *)&x2[0]; y_128 = (simd_q15_t *)&y[0]; // we compute 4 cpx multiply for each loop for(i=0; i<(N>>2); i++) { #if defined(__x86_64__) || defined(__i386__) tmp_re = _mm_madd_epi16(*x1_128,*x2_128); tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1)); tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1)); tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]); tmp_im = _mm_madd_epi16(tmp_im,*x2_128); tmp_re = _mm_srai_epi32(tmp_re,output_shift); tmp_im = _mm_srai_epi32(tmp_im,output_shift); tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im); tmpy1 = _mm_unpackhi_epi32(tmp_re,tmp_im); if (madd==0) *y_128 = _mm_packs_epi32(tmpy0,tmpy1); else *y_128 += _mm_packs_epi32(tmpy0,tmpy1); #elif defined(__arm__) tmp_re = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]); //tmp_re = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])] tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]); //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])] tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)), vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1))); //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] tmp_im = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[0]); //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[1]); //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)), vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1))); //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] tmp_re = vqshlq_s32(tmp_re,shift); tmp_im = vqshlq_s32(tmp_im,shift); tmpy = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im)); if (madd==0) *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]); else *y_128 += vcombine_s16(tmpy.val[0],tmpy.val[1]); #endif x1_128++; x2_128++; y_128++; } _mm_empty(); _m_empty(); return(0); }
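/* Scalar reference sketch for mult_cpx_conj_vector() above (added for clarity; the
 * 16-bit narrowing is shown as a plain cast, glossing over the saturating pack of the
 * SSE path versus the truncating vmovn_s32 of the NEON path): y[i] is set to, or
 * accumulated with, (conj(x1[i]) * x2[i]) >> output_shift in interleaved Q15 layout. */
#include <stdint.h>

int mult_cpx_conj_vector_ref(const int16_t *x1, const int16_t *x2, int16_t *y,
                             uint32_t N, int output_shift, int madd)
{
  for (uint32_t i = 0; i < N; i++) {
    int32_t re = ((int32_t)x1[2 * i] * x2[2 * i] + (int32_t)x1[2 * i + 1] * x2[2 * i + 1]) >> output_shift;
    int32_t im = ((int32_t)x1[2 * i] * x2[2 * i + 1] - (int32_t)x1[2 * i + 1] * x2[2 * i]) >> output_shift;
    if (madd == 0) {
      y[2 * i]     = (int16_t)re;
      y[2 * i + 1] = (int16_t)im;
    } else {
      y[2 * i]     += (int16_t)re;
      y[2 * i + 1] += (int16_t)im;
    }
  }
  return 0;
}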