Example #1
int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N, //must be a multiple of 8
                    uint8_t output_shift)
{

  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
  int32_t result;

  x128 = (__m128i*) x;
  y128 = (__m128i*) y;

  mmcumul_re = _mm_setzero_si128();
  mmcumul_im = _mm_setzero_si128();

  for (n=0; n<(N>>2); n++) {

    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
    //    print_shorts("x",&x128[0]);
    //    print_shorts("y",&y128[0]);

    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
    mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
    //    print_ints("re",&mmtmp1);
    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
    //    print_ints("re",&mmcumul_re);


    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
    mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
    //    print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
    //    print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
    //        print_shorts("y",&mmtmp2);

    mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
    //        print_ints("im",&mmtmp3);
    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
    //    print_ints("im",&mmcumul_im);

    x128++;
    y128++;
  }

  // this gives Re Re Im Im
  mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
  //  print_ints("cumul1",&mmcumul);

  // this gives Re Im Re Im
  mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);

  //  print_ints("cumul2",&mmcumul);


  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);
  // extract the lower half
  mmtmp7 = _mm_movepi64_pi64(mmcumul);
  //  print_ints("mmtmp7",&mmtmp7);
  // pack the result
  mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7);
  //  print_shorts("mmtmp7",&mmtmp7);
  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

  return(result);

#elif defined(__arm__)
  int16x4_t *x_128=(int16x4_t*)x;
  int16x4_t *y_128=(int16x4_t*)y;
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int32x4_t re_cumul,im_cumul;
  int32x2_t re_cumul2,im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift); 
  int32x2x2_t result2;
  int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0); 

  for (n=0; n<(N>>2); n++) {

    tmp_re  = vmull_s16(*x_128++, *y_128++);
    //tmp_re  = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
    tmp_re1 = vmull_s16(*x_128++, *y_128++);
    //tmp_re1 = [Re(x[2])Re(y[2]) Im(x[2])Im(y[2]) Re(x[3])Re(y[3]) Im(x[3])Im(y[3])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re  = [Re(x[k])Re(y[k])+Im(x[k])Im(y[k])] for k=0..3, i.e. the 4 real parts (32-bit)

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im  = [Im(x[0])Re(y[0]) -Re(x[0])Im(y[0]) Im(x[1])Re(y[1]) -Re(x[1])Im(y[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im1 = [Im(x[2])Re(y[2]) -Re(x[2])Im(y[2]) Im(x[3])Re(y[3]) -Re(x[3])Im(y[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im  = [Im(x[k])Re(y[k])-Re(x[k])Im(y[k])] for k=0..3, i.e. the 4 imaginary parts (32-bit)

    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
  }
  
  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
  result2   = vzip_s32(re_cumul2,im_cumul2);
  return(vget_lane_s32(result2.val[0],0));
#endif
}
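A minimal usage sketch of dot_product (an addition, not from the original source; it assumes the x86 path, 16-byte-aligned Q15 buffers, and an output_shift large enough that the packed result does not saturate):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  // 8 complex samples: x[k] = 0.125 + 0j, y[k] = 0 + 0.125j (both Q15)
  int16_t x[16] __attribute__((aligned(16)));
  int16_t y[16] __attribute__((aligned(16)));
  int k;

  for (k = 0; k < 8; k++) {
    x[2*k] = 4096;  x[2*k+1] = 0;
    y[2*k] = 0;     y[2*k+1] = 4096;
  }

  // sum over k of conj(x[k])*y[k] = 8 * (0.125*0.125)j = 0 + 0.125j
  int32_t r = dot_product(x, y, 8, 15);
  printf("re=%d im=%d\n", (int16_t)(r & 0xffff), (int16_t)(r >> 16)); // re=0 im=4096
  return 0;
}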
Example #2
//compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(int16_t *dl_ch,
                     LTE_DL_FRAME_PARMS *frame_parms) {

  int16_t rb;
  __m128i *dl_ch128;
  __m128i avg128F;   // accumulator for the summed |h|^2 terms
  int avg;

      //clear average level
  avg128F = _mm_xor_si128(avg128F,avg128F);
  dl_ch128=(__m128i *)dl_ch;

  for (rb=0;rb<frame_parms->N_RB_DL;rb++) {
    
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[0],dl_ch128[0]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[1],dl_ch128[1]));
    avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[2],dl_ch128[2]));
    
    dl_ch128+=3;	
    
  }

  avg = (((int*)&avg128F)[0] + 
	 ((int*)&avg128F)[1] + 
	 ((int*)&avg128F)[2] + 
	 ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12);
  


  _mm_empty();
  _m_empty();
  
  return(avg);
}
Example #3
void multadd_complex_vector_real_scalar(int16_t *x,
                                        int16_t alpha,
                                        int16_t *y,
                                        uint8_t zero_flag,
                                        uint32_t N)
{

  simd_q15_t alpha_128,*x_128=(simd_q15_t *)x,*y_128=(simd_q15_t*)y;
  int n;

  alpha_128 = set1_int16(alpha);

  if (zero_flag == 1)
    for (n=0; n<N>>2; n++) {
      y_128[n] = mulhi_int16(x_128[n],alpha_128);
    }

  else
    for (n=0; n<N>>2; n++) {
      y_128[n] = adds_int16(y_128[n],mulhi_int16(x_128[n],alpha_128));
    }

  _mm_empty();
  _m_empty();

}
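A hypothetical call pattern (a sketch; simd_q15_t, set1_int16, mulhi_int16 and adds_int16 are assumed to come from the project's SIMD abstraction header, with mulhi_int16 a high-half Q15-style multiply):

void example_multadd_real_scalar(void)
{
  int16_t x[64] __attribute__((aligned(16)));   // 32 complex samples
  int16_t y[64] __attribute__((aligned(16)));
  int k;

  for (k = 0; k < 64; k++) x[k] = (int16_t)(k << 6);  // arbitrary test data

  // zero_flag=1: overwrite y with x scaled by alpha (Q15 alpha = 8192)
  multadd_complex_vector_real_scalar(x, 8192, y, 1, 32);
  // zero_flag=0: accumulate a second scaled copy into y
  multadd_complex_vector_real_scalar(x, 8192, y, 0, 32);
}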
Example #4
int ulsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms,
		   int **rxdataF_comp,
		   short *ulsch_llr,
		   unsigned char symbol,
		   unsigned short nb_rb) {

  __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  int i;
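  // NOTE: llr128U is a file-scope pointer in the original source; it is set
  // when symbol==0 and must persist across calls so that successive symbols
  // append their LLRs contiguously.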

  if (symbol == 0)
    llr128U = (__m128i*)ulsch_llr;
 
  if (!llr128U) {
    msg("ulsch_qpsk_llr: llr is null, symbol %d, llr128=%p\n",symbol, llr128U);
    return(-1);
  }
  //  printf("qpsk llr for symbol %d (pos %d), llr offset %d\n",symbol,(symbol*frame_parms->N_RB_DL*12),llr128-(__m128i*)ulsch_llr);

  for (i=0;i<(nb_rb*3);i++) {
    *llr128U = *rxF;
    rxF++;
    llr128U++;
  }

  _mm_empty();
  _m_empty();

  return(0);

}
Example #5
int complex_conjugate(int16_t *x1,
                      int16_t *y,
                      uint32_t N)

{
  uint32_t i;                 // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *y_128;
  int16_t x2[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; 
  simd_q15_t *x2_128 = (simd_q15_t*)&x2[0];
  x1_128 = (simd_q15_t *)&x1[0];
  y_128 = (simd_q15_t *)&y[0];


  // each loop iteration processes 4 SIMD vectors (16 complex samples)
  for(i=0; i<(N>>3); i++) {
    y_128[0] = mullo_int16(x1_128[0],*x2_128);
    y_128[1] = mullo_int16(x1_128[1],*x2_128);
    y_128[2] = mullo_int16(x1_128[2],*x2_128);
    y_128[3] = mullo_int16(x1_128[3],*x2_128);


    x1_128+=4;
    y_128 +=4;
  }


  _mm_empty();
  _m_empty();

  return(0);
}
Example #6
//compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(int16_t *dl_ch,
                     LTE_DL_FRAME_PARMS *frame_parms)
{

    int16_t rb;
#if defined(__x86_64__) || defined(__i386__)
    __m128i *dl_ch128;
    __m128i avg128F;      // |h|^2 accumulator
#elif defined(__arm__)
    int16x4_t *dl_ch128;
    int32x4_t avg128F;    // |h|^2 accumulator
#endif
    int avg;

    //clear average level
#if defined(__x86_64__) || defined(__i386__)
    avg128F = _mm_setzero_si128();
    dl_ch128=(__m128i *)dl_ch;

    for (rb=0; rb<frame_parms->N_RB_DL; rb++) {

        avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[0],dl_ch128[0]));
        avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[1],dl_ch128[1]));
        avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[2],dl_ch128[2]));

        dl_ch128+=3;

    }
#elif defined(__arm__)
    avg128F = vdupq_n_s32(0);
    dl_ch128=(int16x4_t *)dl_ch;

    for (rb=0; rb<frame_parms->N_RB_DL; rb++) {

        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[0],dl_ch128[0]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[1],dl_ch128[1]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[2],dl_ch128[2]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[3],dl_ch128[3]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[4],dl_ch128[4]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[5],dl_ch128[5]));
        dl_ch128+=6;


    }


#endif
    DevAssert( frame_parms->N_RB_DL );
    avg = (((int*)&avg128F)[0] +
           ((int*)&avg128F)[1] +
           ((int*)&avg128F)[2] +
           ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12);


#if defined(__x86_64__) || defined(__i386__)
    _mm_empty();
    _m_empty();
#endif
    return(avg);
}
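A hypothetical driver for the function above (a sketch; LTE_DL_FRAME_PARMS comes from the project headers, and only the N_RB_DL field is exercised here):

void example_dl_channel_level(void)
{
  int16_t ch[25*12*2] __attribute__((aligned(16)));  // 25 RBs of Q15 channel estimates
  LTE_DL_FRAME_PARMS fp;
  int k, level;

  fp.N_RB_DL = 25;
  for (k = 0; k < 25*12*2; k++) ch[k] = 256;  // flat channel: 256+256j on every RE
  level = dl_channel_level(ch, &fp);          // per-RE energy = 2*256*256 = 131072
  (void)level;
}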
Example #7
int signal_energy_nodc(int *input,unsigned int length) {

  int i;
  int temp,temp2;
  register __m64 mm0,mm1,mm2,mm3;
  __m64 *in = (__m64 *)input;
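  // NOTE: 'shift' is a file-scope scaling constant in the original source; it
  // pre-scales the squared samples so the 32-bit accumulator cannot overflow.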

#ifdef MAIN
  short *printb;
#endif

  mm0 = _m_pxor(mm0,mm0);
  mm3 = _m_pxor(mm3,mm3);

  for (i=0;i<length>>1;i++) {
    
    mm1 = in[i]; 
    mm2 = mm1;
    mm1 = _m_pmaddwd(mm1,mm1);  // Re^2 + Im^2 for two complex samples
    mm1 = _m_psradi(mm1,shift); // arithmetic right shift of each 32-bit lane
    mm0 = _m_paddd(mm0,mm1);    // accumulate both 32-bit lanes
    //    temp2 = mm0;
    //    printf("%d %d\n",((int *)&in[i])[0],((int *)&in[i])[1]);


    //    printb = (short *)&mm2;
    //    printf("mm2 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]);


  }

  /*
#ifdef MAIN
  printb = (short *)&mm3;
  printf("%d %d %d %d\n",printb[0],printb[1],printb[2],printb[3]);
#endif
  */
  mm1 = mm0;

  mm0 = _m_psrlqi(mm0,32);

  mm0 = _m_paddd(mm0,mm1);

  temp = _m_to_int(mm0);

  temp/=length;
  temp<<=shift;   // this is the average of x^2

#ifdef MAIN
  printf("E x^2 = %d\n",temp);  
#endif
  _mm_empty();
  _m_empty();



  return((temp>0)?temp:1);
}
Example #8
void ulsch_extract_rbs_single(int **rxdataF,
			      int **rxdataF_ext,
			      unsigned int first_rb,
			      unsigned int nb_rb,
			      unsigned char l,
			      unsigned char Ns,
			      LTE_DL_FRAME_PARMS *frame_parms) {


  unsigned short nb_rb1,nb_rb2;
  unsigned char aarx;
  int *rxF,*rxF_ext;
  
  //unsigned char symbol = l+Ns*frame_parms->symbols_per_tti/2;
  unsigned char symbol = l+((7-frame_parms->Ncp)*(Ns&1)); // symbol index within the subframe

  for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) {
    
    nb_rb1 = cmin(cmax((int)(frame_parms->N_RB_UL) - (int)(2*first_rb),(int)0),(int)(2*nb_rb));    // 2 times no. RBs before the DC
    nb_rb2 = 2*nb_rb - nb_rb1;                                   // 2 times no. RBs after the DC
#ifdef DEBUG_ULSCH
    msg("ulsch_extract_rbs_single: 2*nb_rb1 = %d, 2*nb_rb2 = %d\n",nb_rb1,nb_rb2);
#endif

    rxF_ext   = &rxdataF_ext[aarx][(symbol*frame_parms->N_RB_UL*12)*2];
    
    if (nb_rb1) {
      rxF = &rxdataF[aarx][(first_rb*12 + frame_parms->first_carrier_offset + symbol*frame_parms->ofdm_symbol_size)*2];
      memcpy(rxF_ext, rxF, nb_rb1*12*sizeof(int));
      rxF_ext += nb_rb1*12;
    
      if (nb_rb2)  {
#ifdef OFDMA_ULSCH
	rxF = &rxdataF[aarx][(1 + symbol*frame_parms->ofdm_symbol_size)*2];
#else
	rxF = &rxdataF[aarx][(symbol*frame_parms->ofdm_symbol_size)*2];
#endif
	memcpy(rxF_ext, rxF, nb_rb2*12*sizeof(int));
	rxF_ext += nb_rb2*12;
      } 
    }
    else { //there is only data in the second half
#ifdef OFDMA_ULSCH
      rxF = &rxdataF[aarx][(1 + 6*(2*first_rb - frame_parms->N_RB_UL) + symbol*frame_parms->ofdm_symbol_size)*2];
#else
      rxF = &rxdataF[aarx][(6*(2*first_rb - frame_parms->N_RB_UL) + symbol*frame_parms->ofdm_symbol_size)*2];
#endif
      memcpy(rxF_ext, rxF, nb_rb2*12*sizeof(int));
      rxF_ext += nb_rb2*12;
    }
  }

  _mm_empty();
  _m_empty();

}
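A worked evaluation of the DC-split arithmetic above (hypothetical numbers; cmax/cmin restate the project's max/min helpers):

#define cmax(a,b) ((a) > (b) ? (a) : (b))
#define cmin(a,b) ((a) < (b) ? (a) : (b))

void example_dc_split(void)
{
  int N_RB_UL = 25, first_rb = 10, nb_rb = 5;
  int nb_rb1 = cmin(cmax(N_RB_UL - 2*first_rb, 0), 2*nb_rb); // = 5, "2 x RBs" before the DC
  int nb_rb2 = 2*nb_rb - nb_rb1;                             // = 5, "2 x RBs" after the DC
  (void)nb_rb1; (void)nb_rb2;
  // an allocation entirely above the DC gives nb_rb1 = 0, so only the else-branch copy runs
}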
Example #9
void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
		     int **rxdataF_comp,
		     short *ulsch_llr,
		     int **ul_ch_mag,
		     int **ul_ch_magb,
		     unsigned char symbol,
		     unsigned short nb_rb) {

  __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag,*ch_magb;
  int j=0,i;
  //  unsigned char symbol_mod;
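  // NOTE: llrU, mmtmpU1 and mmtmpU2 are file-scope variables in the original
  // source; llrU persists across calls so successive symbols append their LLRs.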


  if (symbol == 0)
    llrU = ulsch_llr;

  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
  ch_magb =(__m128i*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)];


  for (i=0;i<(nb_rb*3);i++) {


    mmtmpU1 = _mm_abs_epi16(rxF[i]);
    mmtmpU1  = _mm_subs_epi16(mmtmpU1,ch_mag[i]);
    mmtmpU2 = _mm_abs_epi16(mmtmpU1);
    mmtmpU2 = _mm_subs_epi16(mmtmpU2,ch_magb[i]);

    for (j=0;j<8;j++) {
      llrU[0] = ((short *)&rxF[i])[j];
      llrU[1] = ((short *)&mmtmpU1)[j];
      llrU[2] = ((short *)&mmtmpU2)[j];
      llrU+=3;
    }

  }

  _mm_empty();
  _m_empty();

}
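A scalar restatement of the same 64-QAM LLR approximation (a sketch, not the project's code); the SIMD version uses saturating subtracts, which the plain subtraction below glosses over:

void qam64_llr_scalar(int16_t y, int16_t m, int16_t mb, int16_t llr[3])
{
  int16_t a1 = (y < 0) ? (int16_t)-y : y;                 // |y|
  llr[0] = y;                                             // bit 0: sign of y
  llr[1] = (int16_t)(a1 - m);                             // bit 1: |y| - m
  int16_t a2 = (llr[1] < 0) ? (int16_t)-llr[1] : llr[1];  // ||y| - m|
  llr[2] = (int16_t)(a2 - mb);                            // bit 2: ||y| - m| - mb
}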
Example #10
void ulsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
		     int **rxdataF_comp,
		     short *ulsch_llr,
		     int **ul_ch_mag,
		     unsigned char symbol,
		     unsigned short nb_rb) {

  __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag;
  int i;
  //  unsigned char symbol_mod;
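  // NOTE: llr128U and mmtmpU0 are file-scope variables in the original source;
  // llr128U persists across calls so successive symbols append their LLRs.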

  //  printf("ulsch_rx.c: ulsch_16qam_llr: symbol %d\n",symbol);

  if (symbol == 0)
    llr128U = (__m128i*)&ulsch_llr[0];

  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];


  for (i=0;i<(nb_rb*3);i++) {


    mmtmpU0 = _mm_abs_epi16(rxF[i]);
    //    print_shorts("tmp0",&tmp0);

    mmtmpU0 = _mm_subs_epi16(mmtmpU0,ch_mag[i]);


    llr128U[0] = _mm_unpacklo_epi16(rxF[i],mmtmpU0);
    llr128U[1] = _mm_unpackhi_epi16(rxF[i],mmtmpU0);
    llr128U+=2;

    //    print_bytes("rxF[i]",&rxF[i]);
    //    print_bytes("rxF[i+1]",&rxF[i+1]);
  }

  _mm_empty();
  _m_empty();

}
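The scalar equivalent of this 16-QAM LLR approximation (a sketch): per real dimension, bit 0 is the matched-filter output itself and bit 1 is its magnitude minus the channel amplitude; the unpack instructions above merely interleave the two LLR streams:

void qam16_llr_scalar(int16_t y, int16_t m, int16_t llr[2])
{
  int16_t a = (y < 0) ? (int16_t)-y : y;  // |y|
  llr[0] = y;                             // bit 0: sign of y
  llr[1] = (int16_t)(a - m);              // bit 1: |y| - m
}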
Example #11
void multadd_real_vector_complex_scalar(int16_t *x,
                                        int16_t *alpha,
                                        int16_t *y,
                                        uint32_t N)
{

  uint32_t i;

  // do 8 multiplications at a time
  simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x,*y_128=(simd_q15_t*)y;
  int j;
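  // NOTE: simd_q15_t, set1_int16, mulhi_s1_int16 and adds_int16 come from the
  // project's SIMD abstraction header (x86/NEON); mulhi_s1_int16 is assumed to
  // be a high-half Q15-style multiply.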

  //  printf("alpha = %d,%d\n",alpha[0],alpha[1]);
  alpha_r_128 = set1_int16(alpha[0]);
  alpha_i_128 = set1_int16(alpha[1]);

  j=0;

  for (i=0; i<N>>3; i++) {

    yr     = mulhi_s1_int16(alpha_r_128,x_128[i]);
    yi     = mulhi_s1_int16(alpha_i_128,x_128[i]);
#if defined(__x86_64__) || defined(__i386__)
    y_128[j]   = _mm_adds_epi16(y_128[j],_mm_unpacklo_epi16(yr,yi));
    j++;
    y_128[j]   = _mm_adds_epi16(y_128[j],_mm_unpackhi_epi16(yr,yi));
    j++;
#elif defined(__arm__)
    int16x8x2_t yint;
    yint = vzipq_s16(yr,yi);
    y_128[j]   = adds_int16(y_128[j],yint.val[0]);
    j++;
    y_128[j]   = adds_int16(y_128[j],yint.val[1]);
 
    j++;
#endif
  }

  _mm_empty();
  _m_empty();

}
Example #12
void ulsch_channel_level(int **drs_ch_estimates_ext,
			 LTE_DL_FRAME_PARMS *frame_parms,
			 int *avg,
			 unsigned short nb_rb){

  short rb;
  unsigned char aarx;
  __m128i *ul_ch128;
  __m128i avg128U;   // |h|^2 accumulator
  

  for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) {
    //clear average level
    avg128U = _mm_xor_si128(avg128U,avg128U);
    ul_ch128=(__m128i *)drs_ch_estimates_ext[aarx];

    for (rb=0;rb<nb_rb;rb++) {
      
      avg128U = _mm_add_epi32(avg128U,_mm_madd_epi16(ul_ch128[0],ul_ch128[0]));
      avg128U = _mm_add_epi32(avg128U,_mm_madd_epi16(ul_ch128[1],ul_ch128[1]));
      avg128U = _mm_add_epi32(avg128U,_mm_madd_epi16(ul_ch128[2],ul_ch128[2]));
      
      ul_ch128+=3;	
            
      if (rb==0) {
	//	print_shorts("ul_ch128",&ul_ch128[0]);
	//	print_shorts("ul_ch128",&ul_ch128[1]);
	//	print_shorts("ul_ch128",&ul_ch128[2]);
      }
      
    }
    
    avg[aarx] = (((int*)&avg128U)[0] + 
		 ((int*)&avg128U)[1] + 
		 ((int*)&avg128U)[2] + 
		 ((int*)&avg128U)[3])/(nb_rb*12);
    
    //    printf("Channel level : %d\n",avg[aarx]);
  }
  _mm_empty();
  _m_empty();

}
Example #13
void ulsch_detection_mrc(LTE_DL_FRAME_PARMS *frame_parms,
			 int **rxdataF_comp,
			 int **ul_ch_mag,
			 int **ul_ch_magb,
			 unsigned char symbol,
			 unsigned short nb_rb) {



  __m128i *rxdataF_comp128_0,*ul_ch_mag128_0,*ul_ch_mag128_0b;
  __m128i *rxdataF_comp128_1,*ul_ch_mag128_1,*ul_ch_mag128_1b;

  int i;

  if (frame_parms->nb_antennas_rx>1) {
    rxdataF_comp128_0   = (__m128i *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12];  
    rxdataF_comp128_1   = (__m128i *)&rxdataF_comp[1][symbol*frame_parms->N_RB_DL*12];  
    ul_ch_mag128_0      = (__m128i *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12];  
    ul_ch_mag128_1      = (__m128i *)&ul_ch_mag[1][symbol*frame_parms->N_RB_DL*12];  
    ul_ch_mag128_0b     = (__m128i *)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12];  
    ul_ch_mag128_1b     = (__m128i *)&ul_ch_magb[1][symbol*frame_parms->N_RB_DL*12];  

    // MRC on each re of rb, both on MF output and magnitude (for 16QAM/64QAM llr computation)
    for (i=0;i<nb_rb*3;i++) {
      rxdataF_comp128_0[i] = _mm_adds_epi16(_mm_srai_epi16(rxdataF_comp128_0[i],1),_mm_srai_epi16(rxdataF_comp128_1[i],1));
      ul_ch_mag128_0[i]    = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0[i],1),_mm_srai_epi16(ul_ch_mag128_1[i],1));
      ul_ch_mag128_0b[i]    = _mm_adds_epi16(_mm_srai_epi16(ul_ch_mag128_0b[i],1),_mm_srai_epi16(ul_ch_mag128_1b[i],1));
    }
    // remove any bias (DC component after IDFT)
    ((u32*)rxdataF_comp128_0)[0]=0;
  }

  _mm_empty();
  _m_empty();

}
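A scalar view of the combining step (a sketch): each 16-bit lane is pre-halved before the saturating add, trading one bit of precision for overflow safety:

int16_t mrc2(int16_t a0, int16_t a1)
{
  return (int16_t)((a0 >> 1) + (a1 >> 1));  // average of the two antenna branches
}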
Example #14
void ulsch_alamouti(LTE_DL_FRAME_PARMS *frame_parms,// For Distributed Alamouti Receiver Combining
		    int **rxdataF_comp,
		    int **rxdataF_comp_0,
		    int **rxdataF_comp_1,
		    int **ul_ch_mag,
		    int **ul_ch_magb,
		    int **ul_ch_mag_0,
		    int **ul_ch_magb_0,
		    int **ul_ch_mag_1,
		    int **ul_ch_magb_1,
		    unsigned char symbol,
		    unsigned short nb_rb)   {

  short *rxF,*rxF0,*rxF1;
  __m128i *ch_mag,*ch_magb,*ch_mag0,*ch_mag1,*ch_mag0b,*ch_mag1b;
  unsigned char rb,re,aarx;
  int jj=(symbol*frame_parms->N_RB_DL*12);


  for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) {

    rxF      = (short*)&rxdataF_comp[aarx][jj];
    rxF0     = (short*)&rxdataF_comp_0[aarx][jj];   // Contains (y)*(h0*)
    rxF1     = (short*)&rxdataF_comp_1[aarx][jj];   // Contains (y*)*(h1)
    ch_mag   = (__m128i *)&ul_ch_mag[aarx][jj];
    ch_mag0 = (__m128i *)&ul_ch_mag_0[aarx][jj];
    ch_mag1 = (__m128i *)&ul_ch_mag_1[aarx][jj];
    ch_magb = (__m128i *)&ul_ch_magb[aarx][jj];
    ch_mag0b = (__m128i *)&ul_ch_magb_0[aarx][jj];
    ch_mag1b = (__m128i *)&ul_ch_magb_1[aarx][jj];
    for (rb=0;rb<nb_rb;rb++) {

      for (re=0;re<12;re+=2) {

	// Alamouti RX combining
      
	rxF[0] = rxF0[0] + rxF1[2];                   // re((y0)*(h0*))+ re((y1*)*(h1)) = re(x0)
	rxF[1] = rxF0[1] + rxF1[3];                   // im((y0)*(h0*))+ im((y1*)*(h1)) = im(x0)

	rxF[2] = rxF0[2] - rxF1[0];                   // re((y1)*(h0*))- re((y0*)*(h1)) = re(x1)
	rxF[3] = rxF0[3] - rxF1[1];                   // im((y1)*(h0*))- im((y0*)*(h1)) = im(x1)
 
	rxF+=4;
	rxF0+=4;
	rxF1+=4;
      }
 
      // compute levels for 16QAM or 64 QAM llr unit
      ch_mag[0] = _mm_adds_epi16(ch_mag0[0],ch_mag1[0]);
      ch_mag[1] = _mm_adds_epi16(ch_mag0[1],ch_mag1[1]);
      ch_mag[2] = _mm_adds_epi16(ch_mag0[2],ch_mag1[2]);
      ch_magb[0] = _mm_adds_epi16(ch_mag0b[0],ch_mag1b[0]);
      ch_magb[1] = _mm_adds_epi16(ch_mag0b[1],ch_mag1b[1]);
      ch_magb[2] = _mm_adds_epi16(ch_mag0b[2],ch_mag1b[2]);

      ch_mag+=3;
      ch_mag0+=3;
      ch_mag1+=3;
      ch_magb+=3;
      ch_mag0b+=3;
      ch_mag1b+=3;
    }
  }

  _mm_empty();
  _m_empty();
  
}
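A scalar restatement of the per-RE-pair combining above (a sketch; cpx16 is a hypothetical helper type). rxF0 holds y*conj(h0) and rxF1 holds conj(y)*h1, and resource elements are processed two at a time:

typedef struct { int16_t re, im; } cpx16;

void alamouti_combine_pair(const cpx16 r0[2], const cpx16 r1[2], cpx16 x[2])
{
  x[0].re = (int16_t)(r0[0].re + r1[1].re);  // re(x0) = re(y0*conj(h0)) + re(conj(y1)*h1)
  x[0].im = (int16_t)(r0[0].im + r1[1].im);  // im(x0)
  x[1].re = (int16_t)(r0[1].re - r1[0].re);  // re(x1) = re(y1*conj(h0)) - re(conj(y0)*h1)
  x[1].im = (int16_t)(r0[1].im - r1[0].im);  // im(x1)
}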
Example #15
void ulsch_channel_compensation_alamouti(int **rxdataF_ext,                 // For Distributed Alamouti Combining
					 int **ul_ch_estimates_ext_0,
					 int **ul_ch_estimates_ext_1,
					 int **ul_ch_mag_0,
					 int **ul_ch_magb_0,
					 int **ul_ch_mag_1,
					 int **ul_ch_magb_1,
					 int **rxdataF_comp_0,
					 int **rxdataF_comp_1,
					 LTE_DL_FRAME_PARMS *frame_parms,
					 unsigned char symbol,
					 unsigned char Qm,
					 unsigned short nb_rb,
					 unsigned char output_shift_0,
					 unsigned char output_shift_1) {
  
  unsigned short rb;
  __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1,*rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1;
  unsigned char aarx;//,symbol_mod;
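  // NOTE: mmtmpU0..mmtmpU3, QAM_amp128U_0/1, QAM_amp128bU_0/1, zeroU and the
  // 'conjugate' sign table are file-scope variables in the original source
  // ('conjugate' is assumed to be {-1,1,-1,1,-1,1,-1,1}).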


  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

#ifndef __SSE3__
  zeroU = _mm_xor_si128(zeroU,zeroU);
#endif

  //    printf("comp: symbol %d\n",symbol);

  
  if (Qm == 4) {  
    QAM_amp128U_0 = _mm_set1_epi16(QAM16_n1);
    QAM_amp128U_1 = _mm_set1_epi16(QAM16_n1);
  }
  else if (Qm == 6) {
    QAM_amp128U_0  = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU_0 = _mm_set1_epi16(QAM64_n2);

    QAM_amp128U_1  = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU_1 = _mm_set1_epi16(QAM64_n2);
  }
  for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) {
    
    ul_ch128_0          = (__m128i *)&ul_ch_estimates_ext_0[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_0      = (__m128i *)&ul_ch_mag_0[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b_0     = (__m128i *)&ul_ch_magb_0[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch128_1          = (__m128i *)&ul_ch_estimates_ext_1[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_1      = (__m128i *)&ul_ch_mag_1[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b_1     = (__m128i *)&ul_ch_magb_1[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF128        = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128_0   = (__m128i *)&rxdataF_comp_0[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128_1   = (__m128i *)&rxdataF_comp_1[aarx][symbol*frame_parms->N_RB_DL*12];


    for (rb=0;rb<nb_rb;rb++) {
      //      printf("comp: symbol %d rb %d\n",symbol,rb);
      if (Qm>2) {  
	// get channel amplitude if not QPSK

	mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],ul_ch128_0[0]);
	
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
	
	mmtmpU1 = _mm_madd_epi16(ul_ch128_0[1],ul_ch128_0[1]);
	mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
	mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);
	
	ul_ch_mag128_0[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b_0[0] = ul_ch_mag128_0[0];
	ul_ch_mag128_0[0] = _mm_mulhi_epi16(ul_ch_mag128_0[0],QAM_amp128U_0);
	ul_ch_mag128_0[0] = _mm_slli_epi16(ul_ch_mag128_0[0],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128_0[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b_0[1] = ul_ch_mag128_0[1];
	ul_ch_mag128_0[1] = _mm_mulhi_epi16(ul_ch_mag128_0[1],QAM_amp128U_0);
	ul_ch_mag128_0[1] = _mm_slli_epi16(ul_ch_mag128_0[1],2); // shift left by 2 to compensate for the channel-estimate scaling

	mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],ul_ch128_0[2]);
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
	mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);

	ul_ch_mag128_0[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
	ul_ch_mag128b_0[2] = ul_ch_mag128_0[2];

	ul_ch_mag128_0[2] = _mm_mulhi_epi16(ul_ch_mag128_0[2],QAM_amp128U_0);
	ul_ch_mag128_0[2] = _mm_slli_epi16(ul_ch_mag128_0[2],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128b_0[0] = _mm_mulhi_epi16(ul_ch_mag128b_0[0],QAM_amp128bU_0);
	ul_ch_mag128b_0[0] = _mm_slli_epi16(ul_ch_mag128b_0[0],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128b_0[1] = _mm_mulhi_epi16(ul_ch_mag128b_0[1],QAM_amp128bU_0);
	ul_ch_mag128b_0[1] = _mm_slli_epi16(ul_ch_mag128b_0[1],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128b_0[2] = _mm_mulhi_epi16(ul_ch_mag128b_0[2],QAM_amp128bU_0);
	ul_ch_mag128b_0[2] = _mm_slli_epi16(ul_ch_mag128b_0[2],2); // shift left by 2 to compensate for the channel-estimate scaling
	

	

	mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],ul_ch128_1[0]);
	
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
	
	mmtmpU1 = _mm_madd_epi16(ul_ch128_1[1],ul_ch128_1[1]);
	mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
	mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);
	
	ul_ch_mag128_1[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b_1[0] = ul_ch_mag128_1[0];
	ul_ch_mag128_1[0] = _mm_mulhi_epi16(ul_ch_mag128_1[0],QAM_amp128U_1);
	ul_ch_mag128_1[0] = _mm_slli_epi16(ul_ch_mag128_1[0],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128_1[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b_1[1] = ul_ch_mag128_1[1];
	ul_ch_mag128_1[1] = _mm_mulhi_epi16(ul_ch_mag128_1[1],QAM_amp128U_1);
	ul_ch_mag128_1[1] = _mm_slli_epi16(ul_ch_mag128_1[1],2); // shift left by 2 to compensate for the channel-estimate scaling

	mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],ul_ch128_1[2]);
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
	mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);

	ul_ch_mag128_1[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
	ul_ch_mag128b_1[2] = ul_ch_mag128_1[2];

	ul_ch_mag128_1[2] = _mm_mulhi_epi16(ul_ch_mag128_1[2],QAM_amp128U_1);
	ul_ch_mag128_1[2] = _mm_slli_epi16(ul_ch_mag128_1[2],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128b_1[0] = _mm_mulhi_epi16(ul_ch_mag128b_1[0],QAM_amp128bU_1);
	ul_ch_mag128b_1[0] = _mm_slli_epi16(ul_ch_mag128b_1[0],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128b_1[1] = _mm_mulhi_epi16(ul_ch_mag128b_1[1],QAM_amp128bU_1);
	ul_ch_mag128b_1[1] = _mm_slli_epi16(ul_ch_mag128b_1[1],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128b_1[2] = _mm_mulhi_epi16(ul_ch_mag128b_1[2],QAM_amp128bU_1);
	ul_ch_mag128b_1[2] = _mm_slli_epi16(ul_ch_mag128b_1[2],2); // shift left by 2 to compensate for the channel-estimate scaling
      }
      

      /************************For Computing (y)*(h0*)********************************************/

      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],rxdataF128[0]);
      //	print_ints("re",&mmtmpU0);
      
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      //	print_ints("im",&mmtmpU1);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
      //	print_ints("re(shift)",&mmtmpU0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
      //	print_ints("im(shift)",&mmtmpU1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      //       	print_ints("c0",&mmtmpU2);
      //	print_ints("c1",&mmtmpU3);
      rxdataF_comp128_0[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[0]);
      //      	print_shorts("ch:",ul_ch128_0[0]);
      //      	print_shorts("pack:",rxdataF_comp128_0[0]);
      
      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128_0[1],rxdataF128[1]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128_0[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[1]);
      //      	print_shorts("ch:",ul_ch128_0[1]);
      //      	print_shorts("pack:",rxdataF_comp128_0[1]);	
      //       multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],rxdataF128[2]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128_0[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[2]);
      //      	print_shorts("ch:",ul_ch128_0[2]);
      //        print_shorts("pack:",rxdataF_comp128_0[2]);
      



      /*************************For Computing (y*)*(h1)************************************/
      // multiply by conjugated signal
      mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],rxdataF128[0]);
      //	print_ints("re",&mmtmpU0);
      
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      //	print_ints("im",&mmtmpU1);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
      //	print_ints("re(shift)",&mmtmpU0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
      //	print_ints("im(shift)",&mmtmpU1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      //       	print_ints("c0",&mmtmpU2);
      //	print_ints("c1",&mmtmpU3);
      rxdataF_comp128_1[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[0]);
      //      	print_shorts("ch_conjugate:",ul_ch128_1[0]);
      //      	print_shorts("pack:",rxdataF_comp128_1[0]);


      // multiply by conjugated signal
      mmtmpU0 = _mm_madd_epi16(ul_ch128_1[1],rxdataF128[1]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[1]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128_1[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[1]);
      //      	print_shorts("ch_conjugate:",ul_ch128_1[1]);
      //      	print_shorts("pack:",rxdataF_comp128_1[1]);


      //       multiply by conjugated signal
      mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],rxdataF128[2]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[2]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128_1[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[2]);
      //      	print_shorts("ch_conjugate:",ul_ch128_0[2]);
      //        print_shorts("pack:",rxdataF_comp128_1[2]);



      ul_ch128_0+=3;
      ul_ch_mag128_0+=3;
      ul_ch_mag128b_0+=3;
      ul_ch128_1+=3;
      ul_ch_mag128_1+=3;
      ul_ch_mag128b_1+=3;
      rxdataF128+=3;
      rxdataF_comp128_0+=3;
      rxdataF_comp128_1+=3;
      
    }
  }


  _mm_empty();
  _m_empty();

}     
Example #16
void ulsch_channel_compensation(int **rxdataF_ext,
				int **ul_ch_estimates_ext,
				int **ul_ch_mag,
				int **ul_ch_magb,
				int **rxdataF_comp,
				LTE_DL_FRAME_PARMS *frame_parms,
				unsigned char symbol,
				unsigned char Qm,
				unsigned short nb_rb,
				unsigned char output_shift) {
  
  unsigned short rb;
  __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128;
  unsigned char aarx;//,symbol_mod;
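  // NOTE: mmtmpU0..mmtmpU3, QAM_amp128U, QAM_amp128bU, zeroU and the
  // 'conjugate' sign table are file-scope variables in the original source
  // ('conjugate' is assumed to be {-1,1,-1,1,-1,1,-1,1}).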


  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

#ifndef __SSE3__
  zeroU = _mm_xor_si128(zeroU,zeroU);
#endif

  //    printf("comp: symbol %d\n",symbol);

  
  if (Qm == 4)
    QAM_amp128U = _mm_set1_epi16(QAM16_n1);
  else if (Qm == 6) {
    QAM_amp128U  = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU = _mm_set1_epi16(QAM64_n2);
  }
  for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) {
    
    ul_ch128          = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128      = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b     = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF128        = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128   = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12];


    for (rb=0;rb<nb_rb;rb++) {
      //      printf("comp: symbol %d rb %d\n",symbol,rb);
#ifdef OFDMA_ULSCH
      if (Qm>2) {  
	// get channel amplitude if not QPSK

	mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
	
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
	
	mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]);
	mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
	mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);
	
	ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b[0] = ul_ch_mag128[0];
	ul_ch_mag128[0] = _mm_mulhi_epi16(ul_ch_mag128[0],QAM_amp128U);
	ul_ch_mag128[0] = _mm_slli_epi16(ul_ch_mag128[0],2); // shift left by 2 to compensate for the channel-estimate scaling
	ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b[1] = ul_ch_mag128[1];
	ul_ch_mag128[1] = _mm_mulhi_epi16(ul_ch_mag128[1],QAM_amp128U);
	ul_ch_mag128[1] = _mm_slli_epi16(ul_ch_mag128[1],2); // shift left by 2 to compensate for the channel-estimate scaling

	mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]);
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
	mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);

	ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
	ul_ch_mag128b[2] = ul_ch_mag128[2];

	ul_ch_mag128[2] = _mm_mulhi_epi16(ul_ch_mag128[2],QAM_amp128U);
	ul_ch_mag128[2] = _mm_slli_epi16(ul_ch_mag128[2],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128b[0] = _mm_mulhi_epi16(ul_ch_mag128b[0],QAM_amp128bU);
	ul_ch_mag128b[0] = _mm_slli_epi16(ul_ch_mag128b[0],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128b[1] = _mm_mulhi_epi16(ul_ch_mag128b[1],QAM_amp128bU);
	ul_ch_mag128b[1] = _mm_slli_epi16(ul_ch_mag128b[1],2); // shift left by 2 to compensate for the channel-estimate scaling

	ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU);
	ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2); // shift left by 2 to compensate for the channel-estimate scaling
	
      }
#else

	mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
	
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1);
	
	mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]);
	mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift-1);
	mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);
	
	ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
	
	mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]);
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1);
	mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);
	ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);

	//	printf("comp: symbol %d rb %d => %d,%d,%d\n",symbol,rb,*((short*)&ul_ch_mag128[0]),*((short*)&ul_ch_mag128[1]),*((short*)&ul_ch_mag128[2]));	
#endif          
      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]);
      //	print_ints("re",&mmtmpU0);
      
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      //	print_ints("im",&mmtmpU1);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      //	print_ints("re(shift)",&mmtmpU0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      //	print_ints("im(shift)",&mmtmpU1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      //       	print_ints("c0",&mmtmpU2);
      //	print_ints("c1",&mmtmpU3);
      rxdataF_comp128[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[0]);
      //      	print_shorts("ch:",ul_ch128[0]);
      //      	print_shorts("pack:",rxdataF_comp128[0]);
      
      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[1],rxdataF128[1]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[1]);
      //      	print_shorts("ch:",ul_ch128[1]);
      //      	print_shorts("pack:",rxdataF_comp128[1]);	
      //       multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[2],rxdataF128[2]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[2]);
      //      	print_shorts("ch:",ul_ch128[2]);
      //        print_shorts("pack:",rxdataF_comp128[2]);
      
      ul_ch128+=3;
      ul_ch_mag128+=3;
      ul_ch_mag128b+=3;
      rxdataF128+=3;
      rxdataF_comp128+=3;
      
    }
  }


  _mm_empty();
  _m_empty();

}     
Example #17
int mult_cpx_vector(int16_t *x1, //Q15
                    int16_t *x2,//Q13
                    int16_t *y,
                    uint32_t N,
                    int output_shift)
{
  // Multiply elementwise x1 with x2.
  // x1            - input 1 in the format |Re0 Im0 Re1 Im1|,...,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //                 We assume x1 has a dynamic range of 15 bits maximum
  //
  // x2            - input 2 in the format |Re0 Im0 Re1 Im1|,...,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //                 We assume x2 has a dynamic range of 14 bits maximum
  //
  // y             - output in the format |Re0 Im0 Re1 Im1|,...,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //
  // N             - size of the vectors (this function does N complex multiplies). WARNING: N >= 4
  //
  // output_shift  - shift applied to generate the output
  uint32_t i;                 // loop counter
  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;
  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];
  //print_shorts("x1_128:",&x1_128[0]);
 // print_shorts("x2_128:",&x2_128[0]);
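  // NOTE: 'conjug2' is a file-scope sign table in the original source, assumed
  // to be {1,-1,1,-1,1,-1,1,-1}: it negates the imaginary lanes so the madd
  // below produces Re(x1)Re(x2) - Im(x1)Im(x2) = Re(x1*x2).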

  // right-shift by 13 for a Q13 operand (Q28 product), by 15 for Q15 (Q30 product)
  // we compute 4 complex multiplies per loop iteration
  for(i=0; i<(N>>2); i++) {
    tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]);// Q15
    //print_shorts("tmp_re1:",&tmp_re[i]);
    tmp_re = _mm_madd_epi16(tmp_re,*x2_128); //Q28
    //print_ints("tmp_re2:",&tmp_re[i]);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1)); //Q15
    //print_shorts("tmp_im1:",&tmp_im[i]);
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1)); //Q15
    //print_shorts("tmp_im2:",&tmp_im[i]);
    tmp_im = _mm_madd_epi16(tmp_im, *x2_128); //Q28
    //print_ints("tmp_im3:",&tmp_im[i]);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);//Q(28-shift)
    //print_ints("tmp_re shifted:",&tmp_re[i]);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift); //Q(28-shift)
    //print_ints("tmp_im shifted:",&tmp_im[i]);
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im); //Q(28-shift)
    //print_ints("unpack lo :",&tmpy0[i]);
    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im); //Q(28-shift)
    //print_ints("mrc rho0:",&tmpy1[i]);
    *y_128 = _mm_packs_epi32(tmpy0,tmpy1); //must be Q15
    //print_shorts("*y_128:",&y_128[i]);
    x1_128++;
    x2_128++;
    y_128++;
  }
  _mm_empty();
  _m_empty();
  return(0);
}
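A worked Q-format call (a sketch, not from the original source): x1 = 0.5 in Q15 and x2 = 0.5 in Q13 give Q28 madd products, and output_shift = 13 returns them to Q15:

void example_mult_cpx_vector(void)
{
  int16_t a[8] __attribute__((aligned(16))) = {16384,0, 16384,0, 16384,0, 16384,0}; // 0.5 in Q15
  int16_t b[8] __attribute__((aligned(16))) = {4096,0,  4096,0,  4096,0,  4096,0};  // 0.5 in Q13
  int16_t p[8] __attribute__((aligned(16)));

  mult_cpx_vector(a, b, p, 4, 13);  // p[2k] = 8192 (0.25 in Q15), p[2k+1] = 0
}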
Example #18
int multadd_cpx_vector(int16_t *x1,
                    int16_t *x2,
                    int16_t *y,
                    uint8_t zero_flag,
                    uint32_t N,
                    int output_shift)
{
  // Multiply elementwise x1 with x2, optionally accumulating into y.
  // x1            - input 1 in the format |Re0 Im0 Re1 Im1|,...,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //                 We assume x1 has a dynamic range of 15 bits maximum
  //
  // x2            - input 2 in the format |Re0 Im0 Re1 Im1|,...,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //                 We assume x2 has a dynamic range of 14 bits maximum
  //
  // y             - output in the format |Re0 Im0 Re1 Im1|,...,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //
  // zero_flag     - if 1, overwrite y with the product (no accumulation); if 0, accumulate into y
  //
  // N             - size of the vectors (this function does N complex multiplies). WARNING: N >= 4
  //
  // output_shift  - shift applied to generate the output
  uint32_t i;                 // loop counter
  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
#if defined(__x86_64__) || defined(__i386__)
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;
#elif defined(__arm__)
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int16x4x2_t tmpy;
  int32x4_t shift = vdupq_n_s32(-output_shift);
#endif
  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];
  // we compute 4 cpx multiply for each loop
  for(i=0; i<(N>>2); i++) {
#if defined(__x86_64__) || defined(__i386__)
    tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]);
    tmp_re = _mm_madd_epi16(tmp_re,*x2_128);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im);
    //print_ints("unpack lo:",&tmpy0[i]);
    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im);
    //print_ints("unpack hi:",&tmpy1[i]);
    if (zero_flag == 1)
      *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
    else
      *y_128 = _mm_adds_epi16(*y_128,_mm_packs_epi32(tmpy0,tmpy1));
    //print_shorts("*y_128:",&y_128[i]);
#elif defined(__arm__)
    msg("mult_cpx_vector not implemented for __arm__");
#endif
    x1_128++;
    x2_128++;
    y_128++;
  }
  _mm_empty();
  _m_empty();
  return(0);
}
Example #19
int mult_cpx_conj_vector(int16_t *x1,
                         int16_t *x2,
                         int16_t *y,
                         uint32_t N,
                         int output_shift,
			 int madd)
{
  // Multiply elementwise the complex conjugate of x1 with x2.
  // x1            - input 1 in the format |Re0 Im0 Re1 Im1|,...,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //                 We assume x1 has a dynamic range of 15 bits maximum
  //
  // x2            - input 2 in the format |Re0 Im0 Re1 Im1|,...,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //                 We assume x2 has a dynamic range of 14 bits maximum
  //
  // y             - output in the format |Re0 Im0 Re1 Im1|,...,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //
  // N             - size of the vectors (this function does N complex multiplies). WARNING: N >= 4
  //
  // output_shift  - shift applied to generate the output
  //
  // madd          - if nonzero, accumulate the result into y instead of overwriting it

  uint32_t i;                 // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
#if defined(__x86_64__) || defined(__i386__)
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;

#elif defined(__arm__)
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int16x4x2_t tmpy;
  int32x4_t shift = vdupq_n_s32(-output_shift);
#endif

  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];
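  // NOTE: 'conjug' is a file-scope sign table in the original source, assumed
  // to be {-1,1,-1,1,-1,1,-1,1}; with the swap/sign/madd sequence below it
  // yields Im(conj(x1)*x2) = Re(x1)Im(x2) - Im(x1)Re(x2).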


  // we compute 4 cpx multiply for each loop
  for(i=0; i<(N>>2); i++) {
#if defined(__x86_64__) || defined(__i386__)
    tmp_re = _mm_madd_epi16(*x1_128,*x2_128);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]);
    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im);
    tmpy1  = _mm_unpackhi_epi32(tmp_re,tmp_im);
    if (madd==0)
      *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
    else
      *y_128 += _mm_packs_epi32(tmpy0,tmpy1);

#elif defined(__arm__)

    tmp_re  = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]);
    //tmp_re  = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])]
    tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]);
    //tmp_re1 = [Re(x1[2])Re(x2[2]) Im(x1[2])Im(x2[2]) Re(x1[3])Re(x2[3]) Im(x1[3])Im(x2[3])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re  = [Re(conj(x1[k])*x2[k])] for k=0..3

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[0]);
    //tmp_im  = [Im(x2[0])Re(x1[0]) -Re(x2[0])Im(x1[0]) Im(x2[1])Re(x1[1]) -Re(x2[1])Im(x1[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[1]);
    //tmp_im1 = [Im(x2[2])Re(x1[2]) -Re(x2[2])Im(x1[2]) Im(x2[3])Re(x1[3]) -Re(x2[3])Im(x1[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im  = [Im(conj(x1[k])*x2[k])] for k=0..3

    tmp_re = vqshlq_s32(tmp_re,shift);
    tmp_im = vqshlq_s32(tmp_im,shift);
    tmpy   = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im));
    if (madd==0)
      *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]);
    else
      *y_128 += vcombine_s16(tmpy.val[0],tmpy.val[1]);
#endif
    x1_128++;
    x2_128++;
    y_128++;
  }


  _mm_empty();
  _m_empty();

  return(0);
}
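A worked call (a sketch): with x1 = 0.5j and x2 = 0.5, both Q15, the Q30 products shifted by 15 give conj(x1)*x2 = -0.25j in Q15:

void example_mult_cpx_conj_vector(void)
{
  int16_t a[8] __attribute__((aligned(16))) = {0,16384, 0,16384, 0,16384, 0,16384}; // 0.5j in Q15
  int16_t b[8] __attribute__((aligned(16))) = {16384,0, 16384,0, 16384,0, 16384,0}; // 0.5  in Q15
  int16_t p[8] __attribute__((aligned(16)));

  mult_cpx_conj_vector(a, b, p, 4, 15, 0);  // p[2k] = 0, p[2k+1] = -8192 (-0.25 in Q15)
}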
Example #20
int rotate_cpx_vector(int16_t *x,
                      int16_t *alpha,
                      int16_t *y,
                      uint32_t N,
                      uint16_t output_shift)
{
  // Rotate (multiply) elementwise a complex vector of N elements by a complex scalar
  // x            - input in the format |Re0 Im0|,...,|Re(N-1) Im(N-1)|
  //                We assume x has a dynamic range of 15 bits maximum
  //
  // alpha        - complex scalar in the format |Re0 Im0|
  //                We assume alpha has a dynamic range of 15 bits maximum
  //
  // y            - output in the format |Re0 Im0|,...,|Re(N-1) Im(N-1)|
  //
  // N            - size of the vectors (this function does N complex multiplies). WARNING: N >= 4
  //
  // output_shift - right shift applied to the 32-bit products before packing back to 16 bits
  //                WARNING: too small a shift can cause overflow on the pack!

  uint32_t i;                 // loop counter


  simd_q15_t *y_128,alpha_128;
  int32_t *xd=(int32_t *)x; 

#if defined(__x86_64__) || defined(__i386__)
  __m128i shift = _mm_cvtsi32_si128(output_shift);
  register simd_q15_t m0,m1,m2,m3;

  ((int16_t *)&alpha_128)[0] = alpha[0];
  ((int16_t *)&alpha_128)[1] = -alpha[1];
  ((int16_t *)&alpha_128)[2] = alpha[1];
  ((int16_t *)&alpha_128)[3] = alpha[0];
  ((int16_t *)&alpha_128)[4] = alpha[0];
  ((int16_t *)&alpha_128)[5] = -alpha[1];
  ((int16_t *)&alpha_128)[6] = alpha[1];
  ((int16_t *)&alpha_128)[7] = alpha[0];
#elif defined(__arm__)
  int32x4_t shift;
  int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32;
  int16_t reflip[8]  __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
  int32x4x2_t xtmp;

  ((int16_t *)&alpha_128)[0] = alpha[0];
  ((int16_t *)&alpha_128)[1] = alpha[1];
  ((int16_t *)&alpha_128)[2] = alpha[0];
  ((int16_t *)&alpha_128)[3] = alpha[1];
  ((int16_t *)&alpha_128)[4] = alpha[0];
  ((int16_t *)&alpha_128)[5] = alpha[1];
  ((int16_t *)&alpha_128)[6] = alpha[0];
  ((int16_t *)&alpha_128)[7] = alpha[1];
  int16x8_t bflip = vrev32q_s16(alpha_128);
  int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip);
  shift = vdupq_n_s32(-output_shift);
#endif
  y_128 = (simd_q15_t *) y;


  for(i=0; i<N>>2; i++) {
#if defined(__x86_64__) || defined(__i386__)
    m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]);
    m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]);
    m2 = _mm_madd_epi16(m0,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
    m3 = _mm_madd_epi16(m1,alpha_128); //complex multiply. result is 32bit [Re Im Re Im]
    m2 = _mm_sra_epi32(m2,shift);        // shift right by shift in order to  compensate for the input amplitude
    m3 = _mm_sra_epi32(m3,shift);        // shift right by shift in order to  compensate for the input amplitude

    y_128[0] = _mm_packs_epi32(m2,m3);        // pack in 16bit integers with saturation [re im re im re im re im]
#elif defined(__arm__)

  ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]);
  ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]);
  ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]);
  ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]);
  re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]),
                                vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])),
                   shift);
  im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]),
                                vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])),
                   shift);

  xtmp = vzipq_s32(re32,im32);
  
  y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1]));

#endif

    xd+=4;
    y_128+=1;
  }


  _mm_empty();
  _m_empty();

  return(0);
}
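A worked call (a sketch): rotating 0.5+0j by alpha of roughly (1+j)/sqrt(2) in Q15, with output_shift = 15:

void example_rotate_cpx_vector(void)
{
  int16_t xin[8]  __attribute__((aligned(16))) = {16384,0, 16384,0, 16384,0, 16384,0}; // 0.5+0j
  int16_t alpha[2] = {23170, 23170};  // ~0.7071+0.7071j in Q15
  int16_t yout[8] __attribute__((aligned(16)));

  rotate_cpx_vector(xin, alpha, yout, 4, 15);  // every output sample ~ 11585 + 11585j
}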
Example #21
int mult_cpx_vector_h(short *x1, 
		      short *x2, 
		      short *y, 
		      unsigned int N, 
		      unsigned short output_shift,
		      short sign)
{
  // Multiply elementwise the complex vector x1 with the complex conjugate of the complex vector x2 (N elements) and add the result to the vector y.
  // x1           - input 1 in the format |Re0 Im0 Re0 Im0|,...,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| (each sample repeated twice)
  //                We assume x1 has a dynamic range of 15 bits maximum
  //
  // x2           - input 2 in the format |Re0 Im0 Re0 Im0|,...,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
  //                We assume x2 has a dynamic range of 14 bits maximum
  //
  // y            - output in the format |Re0 Im0 Re0 Im0|,...,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
  //
  // N            - size of the vectors (this function does N complex multiplies). WARNING: N >= 4
  //
  // output_shift - right shift applied to the 32-bit products before packing back to 16 bits
  //                WARNING: too small a shift can cause overflow!
  // sign         - +1 to add, -1 to subtract

  unsigned int i;                 // loop counter

  register __m128i m0,m1,m2;

  short *temps;
  int *tempd;

  __m128i *x1_128; 
  __m128i *x2_128; 
  __m128i *y_128; 
  __m128i mask;

  __m128i temp;
  __m128i shift;   // shift amount as a 128-bit operand for _mm_sra_epi32

  shift = _mm_cvtsi32_si128(output_shift);
  x1_128 = (__m128i *)&x1[0];
  x2_128 = (__m128i *)&x2[0];
  y_128 = (__m128i *)&y[0];

  if (sign == -1)
    mask = (__m128i) _mm_set_epi16 (-1,1,-1,-1,-1,1,-1,-1);
  else
    mask = (__m128i) _mm_set_epi16 (1,-1,1,1,1,-1,1,1);

  // we compute 2*4 cpx multiply for each loop
  for(i=0;i<(N>>3);i++)
  {
    
    //    printf("i=%d\n",i);

    // unroll 1
    //    temps = (short *)x1_128;
    //    printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
    m1 = x1_128[0];
    m2 = x2_128[0];

    //    temps = (short *)&x2_128[0];
    //    printf("x2 : %x,%x,%x,%x,%x,%x,%x,%x\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);

    // bring x2 in conjugate form
    // the first two instructions might be replaced with a single one in SSE3
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2, mask);

    //    temp = m2;
    //    temps = (short *)&temp;
    //    printf("x2 conj : %x,%x,%x,%x,%x,%x,%x,%x\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);

    m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]

    //    temp = m0;

    //    tempd = &temp;
    //    printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);

    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude

    //    temp = m0;

    //    tempd = (int *)&temp;
    //  printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);

    m0 = _mm_packs_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
    m0 = _mm_unpacklo_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]

    y_128[0] = _mm_add_epi16(m0,y_128[0]);

    //    temps = (short *)&y_128[0];
    //    printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);

    // unroll 2
    m1 = x1_128[1];
    m2 = x2_128[1];

    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2, mask);

    m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]

    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude

    m0 = _mm_packs_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
    m0 = _mm_unpacklo_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]

    y_128[1] = _mm_add_epi16(m0,y_128[1]);

    // unroll 3
    m1 = x1_128[2];
    m2 = x2_128[2];

    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2, mask);

    m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]

    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude

    m0 = _mm_packs_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
    m0 = _mm_unpacklo_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]

    y_128[2] = _mm_add_epi16(m0,y_128[2]);


    // unroll 4
    m1 = x1_128[3];
    m2 = x2_128[3];

    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2, mask);

    m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]

    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude

    m0 = _mm_packs_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
    m0 = _mm_unpacklo_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]

    y_128[3] = _mm_add_epi16(m0,y_128[3]);

    x1_128+=4;
    x2_128+=4;
    y_128 +=4;
    //    printf("x1_128 = %p, x2_128 =%p,  y_128=%p\n",x1_128,x2_128,y_128);

  }


  _mm_empty();
  _m_empty();

  return(0);
}
Example #22
int signal_energy(int *input,unsigned int length) {

  int i;
  int temp,temp2;
  register __m64 mm0,mm1,mm2,mm3;
  __m64 *in = (__m64 *)input;
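  // NOTE: 'shift' and 'shift_DC' are file-scope scaling constants in the
  // original source: 'shift' pre-scales the squared samples and 'shift_DC'
  // pre-scales the DC accumulation so the accumulators cannot overflow.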

#ifdef MAIN
  short *printb;
#endif

  mm0 = _m_pxor(mm0,mm0);
  mm3 = _m_pxor(mm3,mm3);

  for (i=0;i<length>>1;i++) {
    
    mm1 = in[i]; 
    mm2 = mm1;
    mm1 = _m_pmaddwd(mm1,mm1);  // Re^2 + Im^2 for two complex samples
    mm1 = _m_psradi(mm1,shift); // arithmetic right shift of each 32-bit lane by 'shift'
    mm0 = _m_paddd(mm0,mm1);    // accumulate both 32-bit lanes
    //    temp2 = mm0;
    //    printf("%d %d\n",((int *)&temp2)[0],((int *)&temp2)[1]);


    //    printb = (short *)&mm2;
    //    printf("mm2 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]);

    mm2 = _m_psrawi(mm2,shift_DC);
    mm3 = _m_paddw(mm3,mm2);    // accumulate the four 16-bit lanes (DC estimate)

    //    printb = (short *)&mm3;
    //    printf("mm3 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]);

  }

  /*
#ifdef MAIN
  printb = (short *)&mm3;
  printf("%d %d %d %d\n",printb[0],printb[1],printb[2],printb[3]);
#endif
  */
  mm1 = mm0;

  mm0 = _m_psrlqi(mm0,32);

  mm0 = _m_paddd(mm0,mm1);

  temp = _m_to_int(mm0);

  temp/=length;
  temp<<=shift;   // this is the average of x^2

  // now remove the DC component
  

  mm2 = _m_psrlqi(mm3,32);
  mm2 = _m_paddw(mm2,mm3);

  mm2 = _m_pmaddwd(mm2,mm2);

  temp2 = _m_to_int(mm2);

  temp2/=(length*length);

  temp2<<=(2*shift_DC);
#ifdef MAIN
  printf("E x^2 = %d\n",temp);  
#endif
  temp -= temp2;
#ifdef MAIN
  printf("(E x)^2=%d\n",temp2);
#endif
  _mm_empty();
  _m_empty();



  return((temp>0)?temp:1);
}
Example #23
int mult_cpx_vector_h_add32(short *x1, 
		      short *x2, 
		      short *y, 
		      unsigned int N, 
		      short sign)
{
  // Multiply elementwise the complex vector x1 with the complex conjugate of the complex vector x2 (N elements) and add the result to the vector y.
  // x1       - input 1 in 16-bit format |Re0 Im0 Re0 Im0|,...,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)| (each sample repeated twice)
  //            We assume x1 has a dynamic range of 15 bits maximum
  //
  // x2       - input 2 in 16-bit format |Re0 Im0 Re0 Im0|,...,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
  //            We assume x2 has a dynamic range of 14 bits maximum
  //
  // y        - output in 32-bit format |Re0 Im0|,...,|Re(N-1) Im(N-1)|
  //
  // N        - size of the vectors (this function does N complex multiplies). WARNING: N >= 4
  //
  // sign     - +1 to add, -1 to subtract

  unsigned int i;                 // loop counter

  register __m128i m0,m1,m2;

  short *temps;
  int *tempd;

  __m128i *x1_128; 
  __m128i *x2_128; 
  __m128i *y_128; 
  __m128i mask;

  __m128i temp;

  x1_128 = (__m128i *)&x1[0];
  x2_128 = (__m128i *)&x2[0];
  y_128 = (__m128i *)&y[0];

  if (sign == -1)
    mask = (__m128i) _mm_set_epi16 (-1,1,-1,-1,-1,1,-1,-1);
  else
    mask = (__m128i) _mm_set_epi16 (1,-1,1,1,1,-1,1,1);

  // we compute 2*4 cpx multiply for each loop
  for(i=0;i<(N>>3);i++)
  {
    
    m1 = x1_128[0];
    m2 = x2_128[0];

    // bring x2 in conjugate form
    // the first two instructions might be replaced with a single one in SSE3
    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2, mask);

    m0 = _mm_madd_epi16(m1,m2);         // 1- compute x1[0]*x2[0], result is 32bit 

    y_128[0] = _mm_add_epi32(m0,y_128[0]);

    // unroll 2
    m1 = x1_128[1];
    m2 = x2_128[1];

    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2, mask);

    m0 = _mm_madd_epi16(m1,m2); 

    y_128[1] = _mm_add_epi32(m0,y_128[1]);

    // unroll 3
    m1 = x1_128[2];
    m2 = x2_128[2];

    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2, mask);

    m0 = _mm_madd_epi16(m1,m2); 

    y_128[2] = _mm_add_epi32(m0,y_128[2]);


    // unroll 4
    m1 = x1_128[3];
    m2 = x2_128[3];

    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
    m2 = _mm_mullo_epi16(m2, mask);

    m0 = _mm_madd_epi16(m1,m2); 

    y_128[3] = _mm_add_epi32(m0,y_128[3]);

    x1_128+=4;
    x2_128+=4;
    y_128 +=4;

  }


  _mm_empty();
  _m_empty();

  return(0);
}