int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N, // must be a multiple of 8
                    uint8_t output_shift)
{
  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
  int32_t result;

  x128 = (__m128i*)x;
  y128 = (__m128i*)y;

  mmcumul_re = _mm_setzero_si128();
  mmcumul_im = _mm_setzero_si128();

  for (n=0; n<(N>>2); n++) {
    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
    //print_shorts("x",&x128[0]);
    //print_shorts("y",&y128[0]);

    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
    mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
    //print_ints("re",&mmtmp1);
    // mmtmp1 contains the real part of 4 consecutive outputs (32-bit)

    // shift and accumulate the results
    mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
    //print_ints("re",&mmcumul_re);

    // this computes Im(z) = Re(x)*Im(y) - Im(x)*Re(y)
    mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
    mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
    mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
    //print_ints("im",&mmtmp3);
    // mmtmp3 contains the imaginary part of 4 consecutive outputs (32-bit)

    // shift and accumulate the results
    mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
    //print_ints("im",&mmcumul_im);

    x128++;
    y128++;
  }

  // this gives Re Re Im Im
  mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
  //print_ints("cumul1",&mmcumul);

  // this gives Re Im Re Im
  mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);
  //print_ints("cumul2",&mmcumul);

  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);

  // extract the lower half
  mmtmp7 = _mm_movepi64_pi64(mmcumul);
  //print_ints("mmtmp7",&mmtmp7);

  // pack the result
  mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7);
  //print_shorts("mmtmp7",&mmtmp7);

  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

  return(result);

#elif defined(__arm__)
  int16x4_t *x_128 = (int16x4_t*)x;
  int16x4_t *y_128 = (int16x4_t*)y;
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int32x4_t re_cumul,im_cumul;
  int32x2_t re_cumul2,im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift);
  int32x2x2_t result2;
  int16_t conjug[4] __attribute__((aligned(16))) = {-1,1,-1,1};

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0);

  for (n=0; n<(N>>2); n++) {
    // each iteration consumes 4 complex samples (two int16x4 vectors); the
    // same x/y vectors must feed both the real and the imaginary part, so
    // load them once instead of advancing the pointers twice
    int16x4_t x0 = x_128[0], x1 = x_128[1];
    int16x4_t y0 = y_128[0], y1 = y_128[1];

    tmp_re  = vmull_s16(x0, y0);
    //tmp_re  = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
    tmp_re1 = vmull_s16(x1, y1);
    //tmp_re1 = [Re(x[2])Re(y[2]) Im(x[2])Im(y[2]) Re(x[3])Re(y[3]) Im(x[3])Im(y[3])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re  = [Re(x[i])Re(y[i])+Im(x[i])Im(y[i]) for i = 0..3]

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(x0,*(int16x4_t*)conjug)), y0);
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(x1,*(int16x4_t*)conjug)), y1);
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im  = [Im(z[0]) Im(z[1]) Im(z[2]) Im(z[3])]

    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));

    x_128 += 2;
    y_128 += 2;
  }

  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
  result2   = vzip_s32(re_cumul2,im_cumul2);

  return(vget_lane_s32(result2.val[0],0));
#endif
}
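/*
 * Illustrative usage sketch for dot_product (not part of the original file;
 * the vector contents, length and output_shift below are arbitrary choices).
 * The packed return value carries the real part in the low 16 bits and the
 * imaginary part in the high 16 bits.
 */
#ifdef DOT_PRODUCT_EXAMPLE
#include <stdio.h>
int main(void)
{
  // 8 complex Q15 samples per vector, 16-byte aligned for the SIMD loads
  int16_t x[16] __attribute__((aligned(16))) = {100,0, 100,0, 100,0, 100,0,
                                                100,0, 100,0, 100,0, 100,0};
  int16_t y[16] __attribute__((aligned(16))) = {0,100, 0,100, 0,100, 0,100,
                                                0,100, 0,100, 0,100, 0,100};

  int32_t z = dot_product(x, y, 8, 3); // shift by 3 for overflow headroom

  // expected: re 0, im 10000  (8 * (100*100 >> 3))
  printf("re %d, im %d\n", (int16_t)(z & 0xffff), (int16_t)(z >> 16));
  return 0;
}
#endif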
// compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(s16 *dl_ch,
                     LTE_DL_FRAME_PARMS *frame_parms)
{
  s16 rb;
  __m128i *dl_ch128;
  int avg;

  // clear average level
  avg128F = _mm_xor_si128(avg128F,avg128F);
  dl_ch128 = (__m128i *)dl_ch;

  for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[0],dl_ch128[0]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[1],dl_ch128[1]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[2],dl_ch128[2]));
    dl_ch128 += 3;
  }

  avg = (((int*)&avg128F)[0] +
         ((int*)&avg128F)[1] +
         ((int*)&avg128F)[2] +
         ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12);

  _mm_empty();
  _m_empty();

  return(avg);
}
void multadd_complex_vector_real_scalar(int16_t *x,
                                        int16_t alpha,
                                        int16_t *y,
                                        uint8_t zero_flag,
                                        uint32_t N)
{
  simd_q15_t alpha_128,*x_128=(simd_q15_t *)x,*y_128=(simd_q15_t*)y;
  int n;

  alpha_128 = set1_int16(alpha);

  if (zero_flag == 1)
    for (n=0; n<N>>2; n++) {
      y_128[n] = mulhi_int16(x_128[n],alpha_128);
    }
  else
    for (n=0; n<N>>2; n++) {
      y_128[n] = adds_int16(y_128[n],mulhi_int16(x_128[n],alpha_128));
    }

  _mm_empty();
  _m_empty();
}
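/*
 * Usage sketch (illustrative, not part of the original file): scale a complex
 * vector by a real Q15 gain. With the mulhi semantics above the effective gain
 * is alpha/2^16, so alpha = 16384 multiplies the input by 1/4; zero_flag = 1
 * overwrites y instead of accumulating into it. The buffer length (N = 8
 * complex samples, filling whole SIMD vectors) is an assumption of this example.
 */
#ifdef MULTADD_REAL_SCALAR_EXAMPLE
static void quarter_amplitude(int16_t *iq, int16_t *out)
{
  // 8 complex samples = 16 int16, processed as N>>2 SIMD vectors
  multadd_complex_vector_real_scalar(iq, 16384, out, 1, 8);
}
#endif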
int ulsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms,
                   int **rxdataF_comp,
                   short *ulsch_llr,
                   unsigned char symbol,
                   unsigned short nb_rb)
{
  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  int i;

  if (symbol == 0)
    llr128U = (__m128i*)ulsch_llr;

  if (!llr128U) {
    msg("ulsch_qpsk_llr: llr is null, symbol %d, llr128=%p\n",symbol,llr128U);
    return(-1);
  }

  // printf("qpsk llr for symbol %d (pos %d), llr offset %d\n",symbol,(symbol*frame_parms->N_RB_DL*12),llr128-(__m128i*)ulsch_llr);

  for (i=0; i<(nb_rb*3); i++) {
    *llr128U = *rxF;
    rxF++;
    llr128U++;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
int complex_conjugate(int16_t *x1,
                      int16_t *y,
                      uint32_t N)
{
  uint32_t i; // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *y_128;
  int16_t x2[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
  simd_q15_t *x2_128 = (simd_q15_t*)&x2[0];

  x1_128 = (simd_q15_t *)&x1[0];
  y_128  = (simd_q15_t *)&y[0];

  // conjugate 4 SIMD vectors (16 complex samples) per iteration by
  // multiplying with the {1,-1,...} sign pattern
  for (i=0; i<(N>>3); i++) {
    y_128[0] = mullo_int16(x1_128[0],*x2_128);
    y_128[1] = mullo_int16(x1_128[1],*x2_128);
    y_128[2] = mullo_int16(x1_128[2],*x2_128);
    y_128[3] = mullo_int16(x1_128[3],*x2_128);

    x1_128 += 4;
    y_128  += 4;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
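/*
 * Scalar sketch of what the SIMD loop above computes: the "conjugate" is a
 * per-element multiply by {1,-1}, i.e. the imaginary parts are negated
 * (n_cplx counts complex samples; the SIMD version requires a count that
 * fills whole 128-bit vectors).
 */
static void complex_conjugate_ref(const int16_t *x, int16_t *y, uint32_t n_cplx)
{
  uint32_t i;

  for (i = 0; i < n_cplx; i++) {
    y[2*i]   = x[2*i];               // real part unchanged
    y[2*i+1] = (int16_t)-x[2*i+1];   // imaginary part negated
  }
}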
// compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(int16_t *dl_ch,
                     LTE_DL_FRAME_PARMS *frame_parms)
{
  int16_t rb;
#if defined(__x86_64__) || defined(__i386__)
  __m128i *dl_ch128;
#elif defined(__arm__)
  int16x4_t *dl_ch128;
#endif
  int avg;

  // clear average level
#if defined(__x86_64__) || defined(__i386__)
  avg128F = _mm_setzero_si128();
  dl_ch128 = (__m128i *)dl_ch;

  for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[0],dl_ch128[0]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[1],dl_ch128[1]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[2],dl_ch128[2]));
    dl_ch128 += 3;
  }

#elif defined(__arm__)
  avg128F = vdupq_n_s32(0);
  dl_ch128 = (int16x4_t *)dl_ch;

  for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[0],dl_ch128[0]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[1],dl_ch128[1]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[2],dl_ch128[2]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[3],dl_ch128[3]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[4],dl_ch128[4]));
    avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[5],dl_ch128[5]));
    dl_ch128 += 6;
  }

#endif
  DevAssert( frame_parms->N_RB_DL );
  avg = (((int*)&avg128F)[0] +
         ((int*)&avg128F)[1] +
         ((int*)&avg128F)[2] +
         ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12);

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif

  return(avg);
}
int signal_energy_nodc(int *input, unsigned int length)
{
  int i;
  int temp,temp2;
  register __m64 mm0,mm1,mm2,mm3;
  __m64 *in = (__m64 *)input;

#ifdef MAIN
  short *printb;
#endif

  mm0 = _m_pxor(mm0,mm0);
  mm3 = _m_pxor(mm3,mm3);

  for (i=0; i<length>>1; i++) {
    mm1 = in[i];
    mm2 = mm1;
    mm1 = _m_pmaddwd(mm1,mm1);  // |x|^2 = re^2 + im^2 for two complex samples
    mm1 = _m_psradi(mm1,shift);
    mm0 = _m_paddd(mm0,mm1);
    // printf("%d %d\n",((int *)&in[i])[0],((int *)&in[i])[1]);
    // printb = (short *)&mm2;
    // printf("mm2 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]);
  }

  /*
  #ifdef MAIN
  printb = (short *)&mm3;
  printf("%d %d %d %d\n",printb[0],printb[1],printb[2],printb[3]);
  #endif
  */

  mm1 = mm0;
  mm0 = _m_psrlqi(mm0,32);
  mm0 = _m_paddd(mm0,mm1);
  temp = _m_to_int(mm0);
  temp /= length;
  temp <<= shift; // this is the average of x^2

#ifdef MAIN
  printf("E x^2 = %d\n",temp);
#endif

  _mm_empty();
  _m_empty();

  return((temp>0)?temp:1);
}
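/*
 * Scalar sketch of signal_energy_nodc: the mean of |x|^2 over 'length' complex
 * samples, reproducing the per-block right shift and the final left shift of
 * the SIMD version (the global 'shift' is assumed to be declared elsewhere in
 * this file, as above; the 64-bit accumulator avoids the SIMD wrap-around).
 */
static int signal_energy_nodc_ref(const int *input, unsigned int length)
{
  const short *s = (const short *)input;
  long long acc = 0;
  unsigned int i;

  for (i = 0; i < length; i++)
    acc += ((int)s[2*i]*s[2*i] + (int)s[2*i+1]*s[2*i+1]) >> shift;

  int e = (int)(acc / length) << shift; // average of |x|^2

  return (e > 0) ? e : 1;
}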
void ulsch_extract_rbs_single(int **rxdataF,
                              int **rxdataF_ext,
                              unsigned int first_rb,
                              unsigned int nb_rb,
                              unsigned char l,
                              unsigned char Ns,
                              LTE_DL_FRAME_PARMS *frame_parms)
{
  unsigned short nb_rb1,nb_rb2;
  unsigned char aarx;
  int *rxF,*rxF_ext;

  //unsigned char symbol = l+Ns*frame_parms->symbols_per_tti/2;
  unsigned char symbol = l+((7-frame_parms->Ncp)*(Ns&1)); // symbol within sub-frame

  for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {

    nb_rb1 = cmin(cmax((int)(frame_parms->N_RB_UL) - (int)(2*first_rb),(int)0),(int)(2*nb_rb)); // 2 times no. RBs before the DC
    nb_rb2 = 2*nb_rb - nb_rb1;                                                                  // 2 times no. RBs after the DC

#ifdef DEBUG_ULSCH
    msg("ulsch_extract_rbs_single: 2*nb_rb1 = %d, 2*nb_rb2 = %d\n",nb_rb1,nb_rb2);
#endif

    rxF_ext = &rxdataF_ext[aarx][(symbol*frame_parms->N_RB_UL*12)*2];

    if (nb_rb1) {
      rxF = &rxdataF[aarx][(first_rb*12 + frame_parms->first_carrier_offset + symbol*frame_parms->ofdm_symbol_size)*2];
      memcpy(rxF_ext, rxF, nb_rb1*12*sizeof(int));
      rxF_ext += nb_rb1*12;

      if (nb_rb2) {
#ifdef OFDMA_ULSCH
        rxF = &rxdataF[aarx][(1 + symbol*frame_parms->ofdm_symbol_size)*2];
#else
        rxF = &rxdataF[aarx][(symbol*frame_parms->ofdm_symbol_size)*2];
#endif
        memcpy(rxF_ext, rxF, nb_rb2*12*sizeof(int));
        rxF_ext += nb_rb2*12;
      }
    } else { // there is only data in the second half
#ifdef OFDMA_ULSCH
      rxF = &rxdataF[aarx][(1 + 6*(2*first_rb - frame_parms->N_RB_UL) + symbol*frame_parms->ofdm_symbol_size)*2];
#else
      rxF = &rxdataF[aarx][(6*(2*first_rb - frame_parms->N_RB_UL) + symbol*frame_parms->ofdm_symbol_size)*2];
#endif
      memcpy(rxF_ext, rxF, nb_rb2*12*sizeof(int));
      rxF_ext += nb_rb2*12;
    }
  }

  _mm_empty();
  _m_empty();
}
void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                     int **rxdataF_comp,
                     short *ulsch_llr,
                     int **ul_ch_mag,
                     int **ul_ch_magb,
                     unsigned char symbol,
                     unsigned short nb_rb)
{
  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag,*ch_magb;
  int j=0,i;
  // unsigned char symbol_mod;

  if (symbol == 0)
    llrU = ulsch_llr;

  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag  = (__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
  ch_magb = (__m128i*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)];

  for (i=0; i<(nb_rb*3); i++) {
    mmtmpU1 = _mm_abs_epi16(rxF[i]);
    mmtmpU1 = _mm_subs_epi16(mmtmpU1,ch_mag[i]);
    mmtmpU2 = _mm_abs_epi16(mmtmpU1);
    mmtmpU2 = _mm_subs_epi16(mmtmpU2,ch_magb[i]);

    for (j=0; j<8; j++) {
      llrU[0] = ((short *)&rxF[i])[j];
      llrU[1] = ((short *)&mmtmpU1)[j];
      llrU[2] = ((short *)&mmtmpU2)[j];
      llrU += 3;
    }
  }

  _mm_empty();
  _m_empty();
}
void ulsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                     int **rxdataF_comp,
                     short *ulsch_llr,
                     int **ul_ch_mag,
                     unsigned char symbol,
                     unsigned short nb_rb)
{
  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag;
  int i;
  // unsigned char symbol_mod;

  // printf("ulsch_rx.c: ulsch_16qam_llr: symbol %d\n",symbol);

  if (symbol == 0)
    llr128U = (__m128i*)&ulsch_llr[0];

  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag = (__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];

  for (i=0; i<(nb_rb*3); i++) {
    mmtmpU0 = _mm_abs_epi16(rxF[i]);
    // print_shorts("tmp0",&tmp0);
    mmtmpU0 = _mm_subs_epi16(mmtmpU0,ch_mag[i]);

    llr128U[0] = _mm_unpacklo_epi16(rxF[i],mmtmpU0);
    llr128U[1] = _mm_unpackhi_epi16(rxF[i],mmtmpU0);
    llr128U += 2;

    // print_bytes("rxF[i]",&rxF[i]);
    // print_bytes("rxF[i+1]",&rxF[i+1]);
  }

  _mm_empty();
  _m_empty();
}
void multadd_real_vector_complex_scalar(int16_t *x,
                                        int16_t *alpha,
                                        int16_t *y,
                                        uint32_t N)
{
  uint32_t i;

  // do 8 multiplications at a time
  simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x,*y_128=(simd_q15_t*)y;
  int j;

  // printf("alpha = %d,%d\n",alpha[0],alpha[1]);
  alpha_r_128 = set1_int16(alpha[0]);
  alpha_i_128 = set1_int16(alpha[1]);

  j = 0;

  for (i=0; i<N>>3; i++) {
    yr = mulhi_s1_int16(alpha_r_128,x_128[i]);
    yi = mulhi_s1_int16(alpha_i_128,x_128[i]);
#if defined(__x86_64__) || defined(__i386__)
    y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpacklo_epi16(yr,yi));
    j++;
    y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpackhi_epi16(yr,yi));
    j++;
#elif defined(__arm__)
    int16x8x2_t yint;
    yint = vzipq_s16(yr,yi);
    y_128[j] = adds_int16(y_128[j],yint.val[0]);
    j++;
    y_128[j] = adds_int16(y_128[j],yint.val[1]);
    j++;
#endif
  }

  _mm_empty();
  _m_empty();
}
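/*
 * Scalar sketch of the loop above, assuming mulhi_s1_int16(a,b) is the Q15
 * multiply (a*b)>>15 (an assumption about that macro, hedged here). The SIMD
 * adds saturate, which this plain-C sketch omits; N counts real input samples.
 */
static void multadd_real_vector_complex_scalar_ref(const int16_t *x,
                                                   const int16_t *alpha,
                                                   int16_t *y,
                                                   uint32_t N)
{
  uint32_t i;

  for (i = 0; i < N; i++) {
    y[2*i]   += (int16_t)(((int32_t)x[i]*alpha[0]) >> 15); // real part
    y[2*i+1] += (int16_t)(((int32_t)x[i]*alpha[1]) >> 15); // imaginary part
  }
}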
void ulsch_channel_level(int **drs_ch_estimates_ext,
                         LTE_DL_FRAME_PARMS *frame_parms,
                         int *avg,
                         unsigned short nb_rb)
{
  short rb;
  unsigned char aarx;
  __m128i *ul_ch128;

  for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
    // clear average level
    avg128U = _mm_xor_si128(avg128U,avg128U);
    ul_ch128 = (__m128i *)drs_ch_estimates_ext[aarx];

    for (rb=0; rb<nb_rb; rb++) {
      avg128U = _mm_add_epi32(avg128U,_mm_madd_epi16(ul_ch128[0],ul_ch128[0]));
      avg128U = _mm_add_epi32(avg128U,_mm_madd_epi16(ul_ch128[1],ul_ch128[1]));
      avg128U = _mm_add_epi32(avg128U,_mm_madd_epi16(ul_ch128[2],ul_ch128[2]));
      ul_ch128 += 3;

      if (rb==0) {
        // print_shorts("ul_ch128",&ul_ch128[0]);
        // print_shorts("ul_ch128",&ul_ch128[1]);
        // print_shorts("ul_ch128",&ul_ch128[2]);
      }
    }

    avg[aarx] = (((int*)&avg128U)[0] +
                 ((int*)&avg128U)[1] +
                 ((int*)&avg128U)[2] +
                 ((int*)&avg128U)[3])/(nb_rb*12);
    // printf("Channel level : %d\n",avg[aarx]);
  }

  _mm_empty();
  _m_empty();
}
void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
                         int **rxdataF_comp,
                         int **ul_ch_mag,
                         int **ul_ch_magb,
                         unsigned char symbol,
                         unsigned short nb_rb)
{
  __m128i *rxdataF_comp128_0,*ul_ch_mag128_0,*ul_ch_mag128_0b;
  __m128i *rxdataF_comp128_1,*ul_ch_mag128_1,*ul_ch_mag128_1b;
  int i;

  if (frame_parms->nb_antennas_rx>1) {
    rxdataF_comp128_0 = (__m128i *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128_1 = (__m128i *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_0    = (__m128i *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_1    = (__m128i *)&ul_ch_mag[1][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_0b   = (__m128i *)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_1b   = (__m128i *)&ul_ch_magb[1][symbol*frame_parms->N_RB_DL*12];

    // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation)
    for (i=0; i<nb_rb*3; i++) {
      rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1));
      ul_ch_mag128_0[i]    = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0[i],1),_mm_srai_epi16(ul_ch_mag128_1[i],1));
      ul_ch_mag128_0b[i]   = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0b[i],1),_mm_srai_epi16(ul_ch_mag128_1b[i],1));
    }

    // remove any bias (DC component after IDFT)
    ((u32*)rxdataF_comp128_0)[0] = 0;
  }

  _mm_empty();
  _m_empty();
}
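/*
 * The combining rule used above, per 16-bit element (sketch): each branch is
 * halved before the saturating add, so the sum of two full-scale inputs
 * cannot wrap.
 */
static inline int16_t mrc_combine_ref(int16_t a0, int16_t a1)
{
  // matches _mm_adds_epi16(_mm_srai_epi16(a0,1),_mm_srai_epi16(a1,1))
  return (int16_t)((a0 >> 1) + (a1 >> 1));
}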
void ulsch_alamouti(LTE_DL_FRAME_PARMS *frame_parms, // For Distributed Alamouti Receiver Combining
                    int **rxdataF_comp,
                    int **rxdataF_comp_0,
                    int **rxdataF_comp_1,
                    int **ul_ch_mag,
                    int **ul_ch_magb,
                    int **ul_ch_mag_0,
                    int **ul_ch_magb_0,
                    int **ul_ch_mag_1,
                    int **ul_ch_magb_1,
                    unsigned char symbol,
                    unsigned short nb_rb)
{
  short *rxF,*rxF0,*rxF1;
  __m128i *ch_mag,*ch_magb,*ch_mag0,*ch_mag1,*ch_mag0b,*ch_mag1b;
  unsigned char rb,re,aarx;
  int jj = (symbol*frame_parms->N_RB_DL*12);

  for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {

    rxF  = (short*)&rxdataF_comp[aarx][jj];
    rxF0 = (short*)&rxdataF_comp_0[aarx][jj]; // contains (y)*(h0*)
    rxF1 = (short*)&rxdataF_comp_1[aarx][jj]; // contains (y*)*(h1)

    ch_mag   = (__m128i *)&ul_ch_mag[aarx][jj];
    ch_mag0  = (__m128i *)&ul_ch_mag_0[aarx][jj];
    ch_mag1  = (__m128i *)&ul_ch_mag_1[aarx][jj];
    ch_magb  = (__m128i *)&ul_ch_magb[aarx][jj];
    ch_mag0b = (__m128i *)&ul_ch_magb_0[aarx][jj];
    ch_mag1b = (__m128i *)&ul_ch_magb_1[aarx][jj];

    for (rb=0; rb<nb_rb; rb++) {

      for (re=0; re<12; re+=2) {
        // Alamouti RX combining
        rxF[0] = rxF0[0] + rxF1[2]; // re((y0)*(h0*)) + re((y1*)*(h1)) = re(x0)
        rxF[1] = rxF0[1] + rxF1[3]; // im((y0)*(h0*)) + im((y1*)*(h1)) = im(x0)
        rxF[2] = rxF0[2] - rxF1[0]; // re((y1)*(h0*)) - re((y0*)*(h1)) = re(x1)
        rxF[3] = rxF0[3] - rxF1[1]; // im((y1)*(h0*)) - im((y0*)*(h1)) = im(x1)

        rxF  += 4;
        rxF0 += 4;
        rxF1 += 4;
      }

      // compute levels for the 16QAM or 64QAM llr unit
      ch_mag[0]  = _mm_adds_epi16(ch_mag0[0],ch_mag1[0]);
      ch_mag[1]  = _mm_adds_epi16(ch_mag0[1],ch_mag1[1]);
      ch_mag[2]  = _mm_adds_epi16(ch_mag0[2],ch_mag1[2]);
      ch_magb[0] = _mm_adds_epi16(ch_mag0b[0],ch_mag1b[0]);
      ch_magb[1] = _mm_adds_epi16(ch_mag0b[1],ch_mag1b[1]);
      ch_magb[2] = _mm_adds_epi16(ch_mag0b[2],ch_mag1b[2]);

      ch_mag   += 3;
      ch_mag0  += 3;
      ch_mag1  += 3;
      ch_magb  += 3;
      ch_mag0b += 3;
      ch_mag1b += 3;
    }
  }

  _mm_empty();
  _m_empty();
}
void ulsch_channel_compensation_alamouti(int **rxdataF_ext, // For Distributed Alamouti Combining
                                         int **ul_ch_estimates_ext_0,
                                         int **ul_ch_estimates_ext_1,
                                         int **ul_ch_mag_0,
                                         int **ul_ch_magb_0,
                                         int **ul_ch_mag_1,
                                         int **ul_ch_magb_1,
                                         int **rxdataF_comp_0,
                                         int **rxdataF_comp_1,
                                         LTE_DL_FRAME_PARMS *frame_parms,
                                         unsigned char symbol,
                                         unsigned char Qm,
                                         unsigned short nb_rb,
                                         unsigned char output_shift_0,
                                         unsigned char output_shift_1)
{
  unsigned short rb;
  __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1;
  __m128i *rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1;
  unsigned char aarx; //,symbol_mod;

  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

#ifndef __SSE3__
  zeroU = _mm_xor_si128(zeroU,zeroU);
#endif

  // printf("comp: symbol %d\n",symbol);

  if (Qm == 4) {
    QAM_amp128U_0 = _mm_set1_epi16(QAM16_n1);
    QAM_amp128U_1 = _mm_set1_epi16(QAM16_n1);
  } else if (Qm == 6) {
    QAM_amp128U_0  = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU_0 = _mm_set1_epi16(QAM64_n2);
    QAM_amp128U_1  = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU_1 = _mm_set1_epi16(QAM64_n2);
  }

  for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {

    ul_ch128_0        = (__m128i *)&ul_ch_estimates_ext_0[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_0    = (__m128i *)&ul_ch_mag_0[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b_0   = (__m128i *)&ul_ch_magb_0[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch128_1        = (__m128i *)&ul_ch_estimates_ext_1[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_1    = (__m128i *)&ul_ch_mag_1[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b_1   = (__m128i *)&ul_ch_magb_1[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF128        = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128_0 = (__m128i *)&rxdataF_comp_0[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128_1 = (__m128i *)&rxdataF_comp_1[aarx][symbol*frame_parms->N_RB_DL*12];

    for (rb=0; rb<nb_rb; rb++) {

      // printf("comp: symbol %d rb %d\n",symbol,rb);

      if (Qm>2) {
        // get channel amplitude if not QPSK

        // amplitudes for the antenna-0 channel estimates
        mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],ul_ch128_0[0]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
        mmtmpU1 = _mm_madd_epi16(ul_ch128_0[1],ul_ch128_0[1]);
        mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
        mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);

        ul_ch_mag128_0[0]  = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
        ul_ch_mag128b_0[0] = ul_ch_mag128_0[0];
        ul_ch_mag128_0[0]  = _mm_mulhi_epi16(ul_ch_mag128_0[0],QAM_amp128U_0);
        ul_ch_mag128_0[0]  = _mm_slli_epi16(ul_ch_mag128_0[0],2); // shift by 2 to compensate the scaling of the channel estimate

        ul_ch_mag128_0[1]  = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
        ul_ch_mag128b_0[1] = ul_ch_mag128_0[1];
        ul_ch_mag128_0[1]  = _mm_mulhi_epi16(ul_ch_mag128_0[1],QAM_amp128U_0);
        ul_ch_mag128_0[1]  = _mm_slli_epi16(ul_ch_mag128_0[1],2);

        mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],ul_ch128_0[2]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
        mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);

        ul_ch_mag128_0[2]  = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
        ul_ch_mag128b_0[2] = ul_ch_mag128_0[2];
        ul_ch_mag128_0[2]  = _mm_mulhi_epi16(ul_ch_mag128_0[2],QAM_amp128U_0);
        ul_ch_mag128_0[2]  = _mm_slli_epi16(ul_ch_mag128_0[2],2);

        ul_ch_mag128b_0[0] = _mm_mulhi_epi16(ul_ch_mag128b_0[0],QAM_amp128bU_0);
        ul_ch_mag128b_0[0] = _mm_slli_epi16(ul_ch_mag128b_0[0],2);
        ul_ch_mag128b_0[1] = _mm_mulhi_epi16(ul_ch_mag128b_0[1],QAM_amp128bU_0);
        ul_ch_mag128b_0[1] = _mm_slli_epi16(ul_ch_mag128b_0[1],2);
        ul_ch_mag128b_0[2] = _mm_mulhi_epi16(ul_ch_mag128b_0[2],QAM_amp128bU_0);
        ul_ch_mag128b_0[2] = _mm_slli_epi16(ul_ch_mag128b_0[2],2);

        // amplitudes for the antenna-1 channel estimates
        mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],ul_ch128_1[0]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
        mmtmpU1 = _mm_madd_epi16(ul_ch128_1[1],ul_ch128_1[1]);
        mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
        mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);

        ul_ch_mag128_1[0]  = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
        ul_ch_mag128b_1[0] = ul_ch_mag128_1[0];
        ul_ch_mag128_1[0]  = _mm_mulhi_epi16(ul_ch_mag128_1[0],QAM_amp128U_1);
        ul_ch_mag128_1[0]  = _mm_slli_epi16(ul_ch_mag128_1[0],2);

        ul_ch_mag128_1[1]  = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
        ul_ch_mag128b_1[1] = ul_ch_mag128_1[1];
        ul_ch_mag128_1[1]  = _mm_mulhi_epi16(ul_ch_mag128_1[1],QAM_amp128U_1);
        ul_ch_mag128_1[1]  = _mm_slli_epi16(ul_ch_mag128_1[1],2);

        mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],ul_ch128_1[2]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
        mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);

        ul_ch_mag128_1[2]  = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
        ul_ch_mag128b_1[2] = ul_ch_mag128_1[2];
        ul_ch_mag128_1[2]  = _mm_mulhi_epi16(ul_ch_mag128_1[2],QAM_amp128U_1); // was QAM_amp128U_0, a copy-paste slip
        ul_ch_mag128_1[2]  = _mm_slli_epi16(ul_ch_mag128_1[2],2);

        ul_ch_mag128b_1[0] = _mm_mulhi_epi16(ul_ch_mag128b_1[0],QAM_amp128bU_1);
        ul_ch_mag128b_1[0] = _mm_slli_epi16(ul_ch_mag128b_1[0],2);
        ul_ch_mag128b_1[1] = _mm_mulhi_epi16(ul_ch_mag128b_1[1],QAM_amp128bU_1);
        ul_ch_mag128b_1[1] = _mm_slli_epi16(ul_ch_mag128b_1[1],2);
        ul_ch_mag128b_1[2] = _mm_mulhi_epi16(ul_ch_mag128b_1[2],QAM_amp128bU_1);
        ul_ch_mag128b_1[2] = _mm_slli_epi16(ul_ch_mag128b_1[2],2);
      }

      /************************ computing (y)*(h0*) ************************/
      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],rxdataF128[0]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      rxdataF_comp128_0[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);

      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128_0[1],rxdataF128[1]);
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      rxdataF_comp128_0[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);

      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],rxdataF128[2]);
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      rxdataF_comp128_0[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);

      /************************ computing (y*)*(h1) ************************/
      // multiply by conjugated signal
      mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],rxdataF128[0]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      rxdataF_comp128_1[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);

      // multiply by conjugated signal
      mmtmpU0 = _mm_madd_epi16(ul_ch128_1[1],rxdataF128[1]);
      mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[1]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      rxdataF_comp128_1[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);

      // multiply by conjugated signal
      mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],rxdataF128[2]);
      mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[2]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      rxdataF_comp128_1[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);

      ul_ch128_0        += 3;
      ul_ch_mag128_0    += 3;
      ul_ch_mag128b_0   += 3;
      ul_ch128_1        += 3;
      ul_ch_mag128_1    += 3;
      ul_ch_mag128b_1   += 3;
      rxdataF128        += 3;
      rxdataF_comp128_0 += 3;
      rxdataF_comp128_1 += 3;
    }
  }

  _mm_empty();
  _m_empty();
}
void ulsch_channel_compensation(int **rxdataF_ext,
                                int **ul_ch_estimates_ext,
                                int **ul_ch_mag,
                                int **ul_ch_magb,
                                int **rxdataF_comp,
                                LTE_DL_FRAME_PARMS *frame_parms,
                                unsigned char symbol,
                                unsigned char Qm,
                                unsigned short nb_rb,
                                unsigned char output_shift)
{
  unsigned short rb;
  __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128;
  unsigned char aarx; //,symbol_mod;

  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

#ifndef __SSE3__
  zeroU = _mm_xor_si128(zeroU,zeroU);
#endif

  // printf("comp: symbol %d\n",symbol);

  if (Qm == 4)
    QAM_amp128U = _mm_set1_epi16(QAM16_n1);
  else if (Qm == 6) {
    QAM_amp128U  = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU = _mm_set1_epi16(QAM64_n2);
  }

  for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {

    ul_ch128        = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128    = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b   = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF128      = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12];

    for (rb=0; rb<nb_rb; rb++) {

      // printf("comp: symbol %d rb %d\n",symbol,rb);

#ifdef OFDMA_ULSCH
      if (Qm>2) {
        // get channel amplitude if not QPSK

        mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
        mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]);
        mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
        mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);

        ul_ch_mag128[0]  = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
        ul_ch_mag128b[0] = ul_ch_mag128[0];
        ul_ch_mag128[0]  = _mm_mulhi_epi16(ul_ch_mag128[0],QAM_amp128U);
        ul_ch_mag128[0]  = _mm_slli_epi16(ul_ch_mag128[0],2); // shift by 2 to compensate the scaling of the channel estimate

        ul_ch_mag128[1]  = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
        ul_ch_mag128b[1] = ul_ch_mag128[1];
        ul_ch_mag128[1]  = _mm_mulhi_epi16(ul_ch_mag128[1],QAM_amp128U);
        ul_ch_mag128[1]  = _mm_slli_epi16(ul_ch_mag128[1],2);

        mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]);
        mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
        mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);

        ul_ch_mag128[2]  = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
        ul_ch_mag128b[2] = ul_ch_mag128[2];
        ul_ch_mag128[2]  = _mm_mulhi_epi16(ul_ch_mag128[2],QAM_amp128U);
        ul_ch_mag128[2]  = _mm_slli_epi16(ul_ch_mag128[2],2);

        ul_ch_mag128b[0] = _mm_mulhi_epi16(ul_ch_mag128b[0],QAM_amp128bU);
        ul_ch_mag128b[0] = _mm_slli_epi16(ul_ch_mag128b[0],2);
        ul_ch_mag128b[1] = _mm_mulhi_epi16(ul_ch_mag128b[1],QAM_amp128bU);
        ul_ch_mag128b[1] = _mm_slli_epi16(ul_ch_mag128b[1],2);
        ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU);
        ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2);
      }

#else
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1);
      mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift-1);
      mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);

      ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
      ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);

      mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1);
      mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);
      ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);

      // printf("comp: symbol %d rb %d => %d,%d,%d\n",symbol,rb,*((short*)&ul_ch_mag128[0]),*((short*)&ul_ch_mag128[1]),*((short*)&ul_ch_mag128[2]));
#endif

      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      rxdataF_comp128[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);

      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[1],rxdataF128[1]);
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      rxdataF_comp128[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);

      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[2],rxdataF128[2]);
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]);
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      rxdataF_comp128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);

      ul_ch128        += 3;
      ul_ch_mag128    += 3;
      ul_ch_mag128b   += 3;
      rxdataF128      += 3;
      rxdataF_comp128 += 3;
    }
  }

  _mm_empty();
  _m_empty();
}
int mult_cpx_vector(int16_t *x1, // Q15
                    int16_t *x2, // Q13
                    int16_t *y,
                    uint32_t N,
                    int output_shift)
{
  // Multiply elementwise x1 with x2.
  // x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //      We assume x1 with a dynamic of 15 bit maximum
  //
  // x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //      We assume x2 with a dynamic of 14 bit maximum
  //
  // y  - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //
  // N  - the size of the vectors (this function does N complex multiplies). WARNING: N must be a multiple of 4
  //
  // output_shift - shift to be applied to generate output

  uint32_t i; // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;

  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];

  // we compute 4 cpx multiplies for each loop
  for (i=0; i<(N>>2); i++) {
    tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]);    // Q15
    tmp_re = _mm_madd_epi16(tmp_re,*x2_128);                    // Q28
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1)); // Q15
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));  // Q15
    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);                    // Q28
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);               // Q(28-shift)
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);               // Q(28-shift)
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im);                 // Q(28-shift)
    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im);                 // Q(28-shift)
    *y_128 = _mm_packs_epi32(tmpy0,tmpy1);                      // saturate to Q15

    x1_128++;
    x2_128++;
    y_128++;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
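/*
 * Scalar reference for one element of mult_cpx_vector (sketch): a plain
 * complex multiply followed by the output shift and saturation to Q15. This
 * assumes conjug2[] is the {1,-1,...} sign pattern its use above implies.
 */
static inline void cpx_mult_ref(const int16_t *a, const int16_t *b,
                                int16_t *y, int output_shift)
{
  int32_t re = ((int32_t)a[0]*b[0] - (int32_t)a[1]*b[1]) >> output_shift;
  int32_t im = ((int32_t)a[0]*b[1] + (int32_t)a[1]*b[0]) >> output_shift;

  // saturate to int16, as _mm_packs_epi32 does
  y[0] = (int16_t)(re > 32767 ? 32767 : (re < -32768 ? -32768 : re));
  y[1] = (int16_t)(im > 32767 ? 32767 : (im < -32768 ? -32768 : im));
}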
int multadd_cpx_vector(int16_t *x1,
                       int16_t *x2,
                       int16_t *y,
                       uint8_t zero_flag,
                       uint32_t N,
                       int output_shift)
{
  // Multiply elementwise x1 with x2 and accumulate the result into y.
  // x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //      We assume x1 with a dynamic of 15 bit maximum
  //
  // x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //      We assume x2 with a dynamic of 14 bit maximum
  //
  // y  - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //
  // zero_flag - if set to 1, overwrite y with the product instead of accumulating into it
  //
  // N  - the size of the vectors (this function does N complex multiplies). WARNING: N must be a multiple of 4
  //
  // output_shift - shift to be applied to generate output

  uint32_t i; // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
#if defined(__x86_64__) || defined(__i386__)
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;
#elif defined(__arm__)
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int16x4x2_t tmpy;
  int32x4_t shift = vdupq_n_s32(-output_shift);
#endif

  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];

  // we compute 4 cpx multiplies for each loop
  for (i=0; i<(N>>2); i++) {
#if defined(__x86_64__) || defined(__i386__)
    tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]);
    tmp_re = _mm_madd_epi16(tmp_re,*x2_128);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im);
    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im);

    if (zero_flag == 1)
      *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
    else
      *y_128 = _mm_adds_epi16(*y_128,_mm_packs_epi32(tmpy0,tmpy1));

#elif defined(__arm__)
    msg("multadd_cpx_vector not implemented for __arm__");
#endif
    x1_128++;
    x2_128++;
    y_128++;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
int mult_cpx_conj_vector(int16_t *x1,
                         int16_t *x2,
                         int16_t *y,
                         uint32_t N,
                         int output_shift,
                         int madd)
{
  // Multiply elementwise the complex conjugate of x1 with x2.
  // x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //      We assume x1 with a dynamic of 15 bit maximum
  //
  // x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //      We assume x2 with a dynamic of 14 bit maximum
  //
  // y  - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //
  // N  - the size of the vectors (this function does N complex multiplies). WARNING: N must be a multiple of 4
  //
  // output_shift - shift to be applied to generate output
  //
  // madd - if non-zero, add the product to y instead of overwriting it

  uint32_t i; // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
#if defined(__x86_64__) || defined(__i386__)
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;
#elif defined(__arm__)
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int16x4x2_t tmpy;
  int32x4_t shift = vdupq_n_s32(-output_shift);
#endif

  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];

  // we compute 4 cpx multiplies for each loop
  for (i=0; i<(N>>2); i++) {
#if defined(__x86_64__) || defined(__i386__)
    tmp_re = _mm_madd_epi16(*x1_128,*x2_128);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]);
    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im);
    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im);

    if (madd==0)
      *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
    else
      *y_128 += _mm_packs_epi32(tmpy0,tmpy1);

#elif defined(__arm__)
    tmp_re  = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]);
    //tmp_re  = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])]
    tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]);
    //tmp_re1 = [Re(x1[2])Re(x2[2]) Im(x1[2])Im(x2[2]) Re(x1[3])Re(x2[3]) Im(x1[3])Im(x2[3])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re  = [Re(x1[i])Re(x2[i])+Im(x1[i])Im(x2[i]) for i = 0..3], the real part of conj(x1)*x2

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)),
                        ((simdshort_q15_t*)x1_128)[0]);
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)),
                        ((simdshort_q15_t*)x1_128)[1]);
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im  = [Re(x1[i])Im(x2[i])-Im(x1[i])Re(x2[i]) for i = 0..3], the imaginary part of conj(x1)*x2

    tmp_re = vqshlq_s32(tmp_re,shift);
    tmp_im = vqshlq_s32(tmp_im,shift);
    tmpy   = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im));

    if (madd==0)
      *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]);
    else
      *y_128 += vcombine_s16(tmpy.val[0],tmpy.val[1]);
#endif
    x1_128++;
    x2_128++;
    y_128++;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
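/*
 * Scalar reference for one element of mult_cpx_conj_vector (sketch):
 * y = (conj(x1)*x2) >> output_shift, optionally accumulated. The narrowing
 * to int16 saturates only in the SIMD version.
 */
static inline void cpx_conj_mult_ref(const int16_t *a, const int16_t *b,
                                     int16_t *y, int output_shift, int madd)
{
  int32_t re = ((int32_t)a[0]*b[0] + (int32_t)a[1]*b[1]) >> output_shift;
  int32_t im = ((int32_t)a[0]*b[1] - (int32_t)a[1]*b[0]) >> output_shift;

  if (madd == 0) {
    y[0] = (int16_t)re;
    y[1] = (int16_t)im;
  } else {
    y[0] += (int16_t)re;
    y[1] += (int16_t)im;
  }
}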
int rotate_cpx_vector(int16_t *x,
                      int16_t *alpha,
                      int16_t *y,
                      uint32_t N,
                      uint16_t output_shift)
{
  // Rotate a complex vector by a single complex coefficient (elementwise multiply by alpha)
  // x - input 1 in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)|
  //     We assume x with a dynamic of 15 bit maximum
  //
  // alpha - input 2 in the format |Re0 Im0|
  //     We assume alpha with a dynamic of 15 bit maximum
  //
  // y - output in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)|
  //
  // N - the size of the vectors (this function does N complex multiplies). WARNING: N must be a multiple of 4
  //
  // output_shift - right shift applied to the products to compensate the amplitude of alpha
  //     WARNING: a shift that is too small can cause overflow!!

  uint32_t i; // loop counter

  simd_q15_t *y_128,alpha_128;
  int32_t *xd = (int32_t *)x;

#if defined(__x86_64__) || defined(__i386__)
  __m128i shift = _mm_cvtsi32_si128(output_shift);
  register simd_q15_t m0,m1,m2,m3;

  ((int16_t *)&alpha_128)[0] =  alpha[0];
  ((int16_t *)&alpha_128)[1] = -alpha[1];
  ((int16_t *)&alpha_128)[2] =  alpha[1];
  ((int16_t *)&alpha_128)[3] =  alpha[0];
  ((int16_t *)&alpha_128)[4] =  alpha[0];
  ((int16_t *)&alpha_128)[5] = -alpha[1];
  ((int16_t *)&alpha_128)[6] =  alpha[1];
  ((int16_t *)&alpha_128)[7] =  alpha[0];
#elif defined(__arm__)
  int32x4_t shift;
  int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32;
  int16_t reflip[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
  int32x4x2_t xtmp;

  ((int16_t *)&alpha_128)[0] = alpha[0];
  ((int16_t *)&alpha_128)[1] = alpha[1];
  ((int16_t *)&alpha_128)[2] = alpha[0];
  ((int16_t *)&alpha_128)[3] = alpha[1];
  ((int16_t *)&alpha_128)[4] = alpha[0];
  ((int16_t *)&alpha_128)[5] = alpha[1];
  ((int16_t *)&alpha_128)[6] = alpha[0];
  ((int16_t *)&alpha_128)[7] = alpha[1];

  int16x8_t bflip = vrev32q_s16(alpha_128);
  int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip);
  shift = vdupq_n_s32(-output_shift);
#endif

  y_128 = (simd_q15_t *)y;

  for (i=0; i<N>>2; i++) {
#if defined(__x86_64__) || defined(__i386__)
    m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]);
    m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]);
    m2 = _mm_madd_epi16(m0,alpha_128); // complex multiply; result is 32-bit [Re Im Re Im]
    m3 = _mm_madd_epi16(m1,alpha_128); // complex multiply; result is 32-bit [Re Im Re Im]
    m2 = _mm_sra_epi32(m2,shift);      // shift right by shift in order to compensate for the input amplitude
    m3 = _mm_sra_epi32(m3,shift);
    y_128[0] = _mm_packs_epi32(m2,m3); // pack into 16-bit integers with saturation [re im re im re im re im]
#elif defined(__arm__)
    ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]);
    ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]);
    ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]);
    ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]);
    re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]),
                                  vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])),
                     shift);
    im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]),
                                  vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])),
                     shift);
    xtmp = vzipq_s32(re32,im32);
    y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));
#endif
    xd    += 4;
    y_128 += 1;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
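/*
 * Usage sketch (illustrative values, not part of the original file): rotate
 * 4 complex Q15 samples by roughly +90 degrees. alpha is (cos, sin) in Q15,
 * so output_shift = 15 removes the Q15 scaling of the rotation coefficient.
 */
#ifdef ROTATE_CPX_EXAMPLE
static void rotate_example(void)
{
  int16_t x[8] __attribute__((aligned(16))) = {1000,0, 0,1000, -1000,0, 0,-1000};
  int16_t y[8] __attribute__((aligned(16)));
  int16_t alpha[2] = {0, 32767}; // ~ e^{j*pi/2} in Q15

  rotate_cpx_vector(x, alpha, y, 4, 15);
  // y is now approximately {0,1000, -1000,0, 0,-1000, 1000,0}
}
#endif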
int mult_cpx_vector_h(short *x1,
                      short *x2,
                      short *y,
                      unsigned int N,
                      unsigned short output_shift,
                      short sign)
{
  // Multiply elementwise the complex vector x1 with the complex conjugate of the complex vector x2 of N elements and add the result to the vector y.
  // x1 - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
  //      We assume x1 with a dynamic of 15 bit maximum
  //
  // x2 - input 2 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
  //      We assume x2 with a dynamic of 14 bit maximum
  //
  // y  - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
  //
  // N  - the size of the vectors (this function does N complex multiplies). WARNING: N must be a multiple of 8
  //
  // output_shift - right shift applied to the products to compensate for the input amplitude. WARNING: a shift that is too small can cause overflow!!
  //
  // sign - +1..add, -1..subtract

  unsigned int i; // loop counter
  register __m128i m0,m1,m2;
  short *temps;
  int *tempd;
  __m128i *x1_128;
  __m128i *x2_128;
  __m128i *y_128;
  __m128i mask;
  __m128i temp;

  shift  = _mm_cvtsi32_si128(output_shift);
  x1_128 = (__m128i *)&x1[0];
  x2_128 = (__m128i *)&x2[0];
  y_128  = (__m128i *)&y[0];

  if (sign == -1)
    mask = (__m128i) _mm_set_epi16(-1,1,-1,-1,-1,1,-1,-1);
  else
    mask = (__m128i) _mm_set_epi16(1,-1,1,1,1,-1,1,1);

  // we compute 2*4 cpx multiplies for each loop
  for (i=0; i<(N>>3); i++) {
    // unroll 1
    m1 = x1_128[0];
    m2 = x2_128[0];
    // bring x2 into conjugate form
    // the first two instructions might be replaced with a single one in SSE3
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2,mask);
    m0 = _mm_madd_epi16(m1,m2);     // compute x1[0]*conj(x2[0])
    m0 = _mm_sra_epi32(m0,shift);   // shift right to compensate for the input amplitude
    m0 = _mm_packs_epi32(m0,m0);    // pack with saturation [re im re im]
    m0 = _mm_unpacklo_epi32(m0,m0); // duplicate into [re im re im]
    y_128[0] = _mm_add_epi16(m0,y_128[0]);

    // unroll 2
    m1 = x1_128[1];
    m2 = x2_128[1];
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2,mask);
    m0 = _mm_madd_epi16(m1,m2);
    m0 = _mm_sra_epi32(m0,shift);
    m0 = _mm_packs_epi32(m0,m0);
    m0 = _mm_unpacklo_epi32(m0,m0);
    y_128[1] = _mm_add_epi16(m0,y_128[1]);

    // unroll 3
    m1 = x1_128[2];
    m2 = x2_128[2];
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2,mask);
    m0 = _mm_madd_epi16(m1,m2);
    m0 = _mm_sra_epi32(m0,shift);
    m0 = _mm_packs_epi32(m0,m0);
    m0 = _mm_unpacklo_epi32(m0,m0);
    y_128[2] = _mm_add_epi16(m0,y_128[2]);

    // unroll 4
    m1 = x1_128[3];
    m2 = x2_128[3];
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2,mask);
    m0 = _mm_madd_epi16(m1,m2);
    m0 = _mm_sra_epi32(m0,shift);
    m0 = _mm_packs_epi32(m0,m0);
    m0 = _mm_unpacklo_epi32(m0,m0);
    y_128[3] = _mm_add_epi16(m0,y_128[3]);

    x1_128 += 4;
    x2_128 += 4;
    y_128  += 4;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
int signal_energy(int *input, unsigned int length)
{
  int i;
  int temp,temp2;
  register __m64 mm0,mm1,mm2,mm3;
  __m64 *in = (__m64 *)input;

#ifdef MAIN
  short *printb;
#endif

  mm0 = _m_pxor(mm0,mm0);
  mm3 = _m_pxor(mm3,mm3);

  for (i=0; i<length>>1; i++) {
    mm1 = in[i];
    mm2 = mm1;
    mm1 = _m_pmaddwd(mm1,mm1);   // |x|^2 = re^2 + im^2 for two complex samples
    mm1 = _m_psradi(mm1,shift);  // arithmetic right shift of each 32-bit word by 'shift'
    mm0 = _m_paddd(mm0,mm1);     // accumulate as two packed 32-bit sums
    // printb = (short *)&mm2;
    // printf("mm2 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]);

    mm2 = _m_psrawi(mm2,shift_DC);
    mm3 = _m_paddw(mm3,mm2);     // accumulate the DC component as four packed 16-bit sums
    // printb = (short *)&mm3;
    // printf("mm3 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]);
  }

  /*
  #ifdef MAIN
  printb = (short *)&mm3;
  printf("%d %d %d %d\n",printb[0],printb[1],printb[2],printb[3]);
  #endif
  */

  mm1 = mm0;
  mm0 = _m_psrlqi(mm0,32);
  mm0 = _m_paddd(mm0,mm1);
  temp = _m_to_int(mm0);
  temp /= length;
  temp <<= shift; // this is the average of x^2

  // now remove the DC component
  mm2 = _m_psrlqi(mm3,32);
  mm2 = _m_paddw(mm2,mm3);
  mm2 = _m_pmaddwd(mm2,mm2);
  temp2 = _m_to_int(mm2);
  temp2 /= (length*length);
  temp2 <<= (2*shift_DC);

#ifdef MAIN
  printf("E x^2 = %d\n",temp);
#endif

  temp -= temp2;

#ifdef MAIN
  printf("(E x)^2 = %d\n",temp2);
#endif

  _mm_empty();
  _m_empty();

  return((temp>0)?temp:1);
}
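/*
 * Scalar sketch of signal_energy: average power minus the squared DC
 * component, E|x|^2 - |E x|^2, mirroring the fixed-point shift discipline of
 * the SIMD version (the globals 'shift' and 'shift_DC' are assumed to be
 * declared elsewhere in this file, as above; the 64-bit accumulators avoid
 * the 16/32-bit wrap-around of the MMX registers).
 */
static int signal_energy_ref(const int *input, unsigned int length)
{
  const short *s = (const short *)input;
  long long e2 = 0, er = 0, ei = 0;
  unsigned int i;

  for (i = 0; i < length; i++) {
    e2 += ((int)s[2*i]*s[2*i] + (int)s[2*i+1]*s[2*i+1]) >> shift; // E|x|^2 term
    er += s[2*i]   >> shift_DC;                                   // DC, real
    ei += s[2*i+1] >> shift_DC;                                   // DC, imag
  }

  int temp  = (int)(e2 / length) << shift;
  int temp2 = (int)((er*er + ei*ei) / ((long long)length*length)) << (2*shift_DC);

  temp -= temp2; // E|x|^2 - |E x|^2
  return (temp > 0) ? temp : 1;
}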
int mult_cpx_vector_h_add32(short *x1,
                            short *x2,
                            short *y,
                            unsigned int N,
                            short sign)
{
  // Multiply elementwise the complex vector x1 with the complex conjugate of the complex vector x2 of N elements and add the result to the vector y.
  // x1 - input 1 in 16-bit format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
  //      We assume x1 with a dynamic of 15 bit maximum
  //
  // x2 - input 2 in 16-bit format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
  //      We assume x2 with a dynamic of 14 bit maximum
  //
  // y  - output in 32-bit format |Re0 Im0|,......,|Re(N-1) Im(N-1)|
  //
  // N  - the size of the vectors (this function does N complex multiplies). WARNING: N must be a multiple of 8
  //
  // sign - +1..add, -1..subtract

  unsigned int i; // loop counter
  register __m128i m0,m1,m2;
  short *temps;
  int *tempd;
  __m128i *x1_128;
  __m128i *x2_128;
  __m128i *y_128;
  __m128i mask;
  __m128i temp;

  x1_128 = (__m128i *)&x1[0];
  x2_128 = (__m128i *)&x2[0];
  y_128  = (__m128i *)&y[0];

  if (sign == -1)
    mask = (__m128i) _mm_set_epi16(-1,1,-1,-1,-1,1,-1,-1);
  else
    mask = (__m128i) _mm_set_epi16(1,-1,1,1,1,-1,1,1);

  // we compute 2*4 cpx multiplies for each loop
  for (i=0; i<(N>>3); i++) {
    // unroll 1
    m1 = x1_128[0];
    m2 = x2_128[0];
    // bring x2 into conjugate form
    // the first two instructions might be replaced with a single one in SSE3
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2,mask);
    m0 = _mm_madd_epi16(m1,m2); // compute x1[0]*conj(x2[0]); result is 32-bit
    y_128[0] = _mm_add_epi32(m0,y_128[0]);

    // unroll 2
    m1 = x1_128[1];
    m2 = x2_128[1];
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2,mask);
    m0 = _mm_madd_epi16(m1,m2);
    y_128[1] = _mm_add_epi32(m0,y_128[1]);

    // unroll 3
    m1 = x1_128[2];
    m2 = x2_128[2];
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2,mask);
    m0 = _mm_madd_epi16(m1,m2);
    y_128[2] = _mm_add_epi32(m0,y_128[2]);

    // unroll 4
    m1 = x1_128[3];
    m2 = x2_128[3];
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2,mask);
    m0 = _mm_madd_epi16(m1,m2);
    y_128[3] = _mm_add_epi32(m0,y_128[3]);

    x1_128 += 4;
    x2_128 += 4;
    y_128  += 4;
  }

  _mm_empty();
  _m_empty();

  return(0);
}