int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N,          // must be a multiple of 8
                    uint8_t output_shift)
{
  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128, *y128, mmtmp1, mmtmp2, mmtmp3, mmcumul, mmcumul_re, mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
  int32_t result;

  x128 = (__m128i*) x;
  y128 = (__m128i*) y;

  mmcumul_re = _mm_setzero_si128();
  mmcumul_im = _mm_setzero_si128();

  for (n=0; n<(N>>2); n++) {
    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
    // print_shorts("x",&x128[0]);
    // print_shorts("y",&y128[0]);

    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
    mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
    // print_ints("re",&mmtmp1);
    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
    // print_ints("re",&mmcumul_re);

    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
    mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
    // print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
    // print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
    // print_shorts("y",&mmtmp2);
    mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
    // print_ints("im",&mmtmp3);
    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
    // print_ints("im",&mmcumul_im);

    x128++;
    y128++;
  }

  // this gives Re Re Im Im
  mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
  // print_ints("cumul1",&mmcumul);

  // this gives Re Im Re Im
  mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);
  // print_ints("cumul2",&mmcumul);

  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);
  // extract the lower half
  mmtmp7 = _mm_movepi64_pi64(mmcumul);
  // print_ints("mmtmp7",&mmtmp7);
  // pack the result
  mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7);
  // print_shorts("mmtmp7",&mmtmp7);
  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

  return(result);

#elif defined(__arm__)
  int16x4_t *x_128 = (int16x4_t*)x;
  int16x4_t *y_128 = (int16x4_t*)y;
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int32x4_t re_cumul,im_cumul;
  int32x2_t re_cumul2,im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift);
  int32x2x2_t result2;
  int16_t conjug[4] __attribute__((aligned(16))) = {-1,1,-1,1};

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0);

  for (n=0; n<(N>>2); n++) {
    tmp_re  = vmull_s16(*x_128++, *y_128++);
    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
    tmp_re1 = vmull_s16(*x_128++, *y_128++);
    //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])]

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
  }

  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
  result2   = vzip_s32(re_cumul2,im_cumul2);
  return(vget_lane_s32(result2.val[0],0));
#endif
}
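For reference, a minimal scalar sketch of what the two SIMD paths above are intended to compute: the accumulated product conj(x[n])*y[n], with an arithmetic right shift applied per sample before accumulation, and the real/imaginary accumulators packed into the low/high halves of the 32-bit return value. This assumes N counts complex samples stored as interleaved Re/Im int16 pairs; dot_product_ref is a name introduced here, and the int16 saturation performed by the final pack in the SSE path is only noted in a comment.

#include <stdint.h>

static int32_t dot_product_ref(const int16_t *x, const int16_t *y,
                               uint32_t N, uint8_t output_shift)
{
  int32_t acc_re = 0, acc_im = 0;
  uint32_t n;
  for (n = 0; n < N; n++) {
    // z[n] = conj(x[n]) * y[n]
    int32_t re = (int32_t)x[2*n] * y[2*n]   + (int32_t)x[2*n+1] * y[2*n+1];
    int32_t im = (int32_t)x[2*n] * y[2*n+1] - (int32_t)x[2*n+1] * y[2*n];
    acc_re += re >> output_shift;   // shift per sample, as _mm_srai_epi32 does per lane
    acc_im += im >> output_shift;
  }
  // low 16 bits: real accumulator, high 16 bits: imaginary accumulator
  // (the SSE path additionally saturates each half to int16 when packing)
  return (int32_t)(((uint32_t)acc_im << 16) | ((uint32_t)acc_re & 0xffffu));
}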
static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
                                        __m128i alpha_sign, __m128i dc_q0) {
  __m128i ac_q3 = _mm_loadu_si128(input);
  __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
  __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
  scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
  return _mm_add_epi16(scaled_luma_q0, dc_q0);
}
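A scalar sketch of what one 16-bit lane of predict_unclipped() appears to compute: dc + alpha * ac, with ac in Q3, |alpha| in Q12, and _mm_mulhrs_epi16 modelled as round((a*b) / 2^15). The two _mm_sign_epi16 calls combine the sign of alpha with the sign of ac and apply it to the unsigned product; predict_unclipped_ref and its argument names are introduced here.

#include <stdint.h>
#include <stdlib.h>

static int16_t predict_unclipped_ref(int16_t ac_q3, int16_t abs_alpha_q12,
                                     int alpha_sign /* +1 or -1 */, int16_t dc_q0)
{
  // _mm_mulhrs_epi16: (a*b + 2^14) >> 15, giving a Q0 magnitude here (3 + 12 - 15 = 0)
  int32_t scaled = ((int32_t)abs(ac_q3) * abs_alpha_q12 + (1 << 14)) >> 15;
  int sign = (ac_q3 < 0 ? -1 : 1) * alpha_sign;   // sign(alpha * ac)
  return (int16_t)(dc_q0 + sign * scaled);
}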
int main(int, char**) {
  volatile __m128i a = _mm_set1_epi32(42);
  _mm_abs_epi8(a);
  volatile __m128i result = _mm_sign_epi16(a, _mm_set1_epi32(64));
  (void)result;
  return 0;
}
/* Test the 128-bit form */
static void
ssse3_test_psignw128 (int *i1, int *i2, int *r)
{
  /* Assumes incoming pointers are 16-byte aligned */
  __m128i t1 = *(__m128i *) i1;
  __m128i t2 = *(__m128i *) i2;
  *(__m128i *) r = _mm_sign_epi16 (t1, t2);
}
static INLINE void hor_transform_row_avx2(__m128i* row){

  __m128i mask_pos = _mm_set1_epi16(1);
  __m128i mask_neg = _mm_set1_epi16(-1);
  __m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg);
  __m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg);
  temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg);
  temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1, 0, 3, 2));
  temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1, 0, 3, 2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);
}
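A scalar sketch of the butterfly that each of the three blocks above appears to implement, assuming KVZ_PERMUTE(a,b,c,d) selects 32-bit dwords a,b,c,d in order: within each group of 2*d lanes, the first d lanes become a+b and the next d lanes a-b, realized in the vector code with a shuffle, _mm_sign_epi16 and an add. hadamard_stage_ref is a name introduced here; the full row transform would apply it for d = 4, 2, 1.

#include <stdint.h>

static void hadamard_stage_ref(int16_t row[8], int d)
{
  for (int base = 0; base < 8; base += 2 * d) {
    for (int i = 0; i < d; i++) {
      int16_t a = row[base + i];
      int16_t b = row[base + d + i];
      row[base + i]     = (int16_t)(a + b);   // low half of the group: a + b
      row[base + d + i] = (int16_t)(a - b);   // high half of the group: a - b
    }
  }
}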
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], const uint16_t* const sharpen, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]); const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]); // coeff = abs(in) __m128i coeff0 = _mm_abs_epi16(in0); __m128i coeff8 = _mm_abs_epi16(in8); // coeff = abs(in) + sharpen if (sharpen != NULL) { const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]); const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]); coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); } // out = (coeff * iQ + B) >> QFIX { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // out = (coeff * iQ + B) const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]); const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]); const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]); const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]); out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = QUANTDIV(coeff, iQ, B, QFIX) out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); // if (coeff > 2047) coeff = 2047 out0 = _mm_min_epi16(out0, max_coeff_2047); out8 = _mm_min_epi16(out8, max_coeff_2047); } // put sign back out0 = _mm_sign_epi16(out0, in0); out8 = _mm_sign_epi16(out8, in8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); // zigzag the output before storing it. The re-ordering is: // 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15 // There's only two misplaced entries ([8] and [7]) that are crossing the // reg's boundaries. // We use pshufb instead of pshuflo/pshufhi. 
{ const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6); const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1); const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo); const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7 const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7); const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1); const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi); const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8 const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8); const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7); _mm_storeu_si128((__m128i*)&out[0], out_z0); _mm_storeu_si128((__m128i*)&out[8], out_z8); packed_out = _mm_packs_epi16(out_z0, out_z8); } // detect if all 'out' values are zeroes or not return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); }
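A scalar sketch of what the quantization path above does for a single coefficient (the zigzag reordering and the block-level "all zero" test are omitted): quantize with a rounding bias, clamp to MAX_LEVEL, restore the sign from the original coefficient (the role of _mm_sign_epi16, which also zeroes lanes whose input was zero), and write the dequantized value back into 'in'. The _ref names and QFIX_REF/MAX_LEVEL_REF constants are introduced here to keep the sketch self-contained.

#include <stdint.h>
#include <stdlib.h>

#define QFIX_REF      17
#define MAX_LEVEL_REF 2047

static int quantize_coeff_ref(int16_t *in, int16_t *out,
                              uint16_t iq, uint32_t bias, uint16_t q,
                              uint16_t sharpen)
{
  uint32_t coeff = (uint32_t)abs(*in) + sharpen;          // coeff = abs(in) + sharpen
  int level = (int)((coeff * iq + bias) >> QFIX_REF);      // QUANTDIV(coeff, iQ, B, QFIX)
  if (level > MAX_LEVEL_REF) level = MAX_LEVEL_REF;        // clamp to 2047
  if (*in == 0) level = 0;                                 // _mm_sign_epi16 zeroes these lanes
  else if (*in < 0) level = -level;                        // put sign back
  *out = (int16_t)level;
  *in  = (int16_t)(level * q);                             // in = out * Q (dequantized)
  return level != 0;      // per-coefficient non-zero flag; the SSE code ORs this over the block
}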
void ulsch_channel_compensation_alamouti(int **rxdataF_ext, // For Distributed Alamouti Combining int **ul_ch_estimates_ext_0, int **ul_ch_estimates_ext_1, int **ul_ch_mag_0, int **ul_ch_magb_0, int **ul_ch_mag_1, int **ul_ch_magb_1, int **rxdataF_comp_0, int **rxdataF_comp_1, LTE_DL_FRAME_PARMS *frame_parms, unsigned char symbol, unsigned char Qm, unsigned short nb_rb, unsigned char output_shift_0, unsigned char output_shift_1) { unsigned short rb; __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1,*rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1; unsigned char aarx;//,symbol_mod; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; #ifndef __SSE3__ zeroU = _mm_xor_si128(zeroU,zeroU); #endif // printf("comp: symbol %d\n",symbol); if (Qm == 4) { QAM_amp128U_0 = _mm_set1_epi16(QAM16_n1); QAM_amp128U_1 = _mm_set1_epi16(QAM16_n1); } else if (Qm == 6) { QAM_amp128U_0 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_0 = _mm_set1_epi16(QAM64_n2); QAM_amp128U_1 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_1 = _mm_set1_epi16(QAM64_n2); } for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) { ul_ch128_0 = (__m128i *)&ul_ch_estimates_ext_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_0 = (__m128i *)&ul_ch_mag_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_0 = (__m128i *)&ul_ch_magb_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch128_1 = (__m128i *)&ul_ch_estimates_ext_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_1 = (__m128i *)&ul_ch_mag_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_1 = (__m128i *)&ul_ch_magb_1[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_0 = (__m128i *)&rxdataF_comp_0[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp_1[aarx][symbol*frame_parms->N_RB_DL*12]; for (rb=0;rb<nb_rb;rb++) { // printf("comp: symbol %d rb %d\n",symbol,rb); if (Qm>2) { // get channel amplitude if not QPSK mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],ul_ch128_0[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_madd_epi16(ul_ch128_0[1],ul_ch128_0[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_0[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[0] = ul_ch_mag128_0[0]; ul_ch_mag128_0[0] = _mm_mulhi_epi16(ul_ch_mag128_0[0],QAM_amp128U_0); ul_ch_mag128_0[0] = _mm_slli_epi16(ul_ch_mag128_0[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128_0[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[1] = ul_ch_mag128_0[1]; ul_ch_mag128_0[1] = _mm_mulhi_epi16(ul_ch_mag128_0[1],QAM_amp128U_0); ul_ch_mag128_0[1] = _mm_slli_epi16(ul_ch_mag128_0[1],2); // 2 to scale compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],ul_ch128_0[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_0[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_0[2] = ul_ch_mag128_0[2]; ul_ch_mag128_0[2] = _mm_mulhi_epi16(ul_ch_mag128_0[2],QAM_amp128U_0); ul_ch_mag128_0[2] = _mm_slli_epi16(ul_ch_mag128_0[2],2); // 2 to scale compensate the scale channel estimat ul_ch_mag128b_0[0] = _mm_mulhi_epi16(ul_ch_mag128b_0[0],QAM_amp128bU_0); ul_ch_mag128b_0[0] = _mm_slli_epi16(ul_ch_mag128b_0[0],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_0[1] = _mm_mulhi_epi16(ul_ch_mag128b_0[1],QAM_amp128bU_0); ul_ch_mag128b_0[1] = 
_mm_slli_epi16(ul_ch_mag128b_0[1],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_0[2] = _mm_mulhi_epi16(ul_ch_mag128b_0[2],QAM_amp128bU_0); ul_ch_mag128b_0[2] = _mm_slli_epi16(ul_ch_mag128b_0[2],2); // 2 to scale compensate the scale channel estima mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],ul_ch128_1[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_madd_epi16(ul_ch128_1[1],ul_ch128_1[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_1[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[0] = ul_ch_mag128_1[0]; ul_ch_mag128_1[0] = _mm_mulhi_epi16(ul_ch_mag128_1[0],QAM_amp128U_1); ul_ch_mag128_1[0] = _mm_slli_epi16(ul_ch_mag128_1[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128_1[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[1] = ul_ch_mag128_1[1]; ul_ch_mag128_1[1] = _mm_mulhi_epi16(ul_ch_mag128_1[1],QAM_amp128U_1); ul_ch_mag128_1[1] = _mm_slli_epi16(ul_ch_mag128_1[1],2); // 2 to scale compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],ul_ch128_1[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_1[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_1[2] = ul_ch_mag128_1[2]; ul_ch_mag128_1[2] = _mm_mulhi_epi16(ul_ch_mag128_1[2],QAM_amp128U_0); ul_ch_mag128_1[2] = _mm_slli_epi16(ul_ch_mag128_1[2],2); // 2 to scale compensate the scale channel estimat ul_ch_mag128b_1[0] = _mm_mulhi_epi16(ul_ch_mag128b_1[0],QAM_amp128bU_1); ul_ch_mag128b_1[0] = _mm_slli_epi16(ul_ch_mag128b_1[0],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_1[1] = _mm_mulhi_epi16(ul_ch_mag128b_1[1],QAM_amp128bU_1); ul_ch_mag128b_1[1] = _mm_slli_epi16(ul_ch_mag128b_1[1],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_1[2] = _mm_mulhi_epi16(ul_ch_mag128b_1[2],QAM_amp128bU_1); ul_ch_mag128b_1[2] = _mm_slli_epi16(ul_ch_mag128b_1[2],2); // 2 to scale compensate the scale channel estima } /************************For Computing (y)*(h0*)********************************************/ // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_0[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch:",ul_ch128_0[0]); // print_shorts("pack:",rxdataF_comp128_0[0]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]); // mmtmpU1 
contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch:",ul_ch128_0[1]); // print_shorts("pack:",rxdataF_comp128_0[1]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_0[2]); /*************************For Computing (y*)*(h1)************************************/ // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_1[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch_conjugate:",ul_ch128_1[0]); // print_shorts("pack:",rxdataF_comp128_1[0]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch_conjugate:",ul_ch128_1[1]); // print_shorts("pack:",rxdataF_comp128_1[1]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = 
_mm_madd_epi16(mmtmpU1,ul_ch128_1[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch_conjugate:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_1[2]); ul_ch128_0+=3; ul_ch_mag128_0+=3; ul_ch_mag128b_0+=3; ul_ch128_1+=3; ul_ch_mag128_1+=3; ul_ch_mag128b_1+=3; rxdataF128+=3; rxdataF_comp128_0+=3; rxdataF_comp128_1+=3; } } _mm_empty(); _m_empty(); }
void ulsch_channel_compensation(int **rxdataF_ext, int **ul_ch_estimates_ext, int **ul_ch_mag, int **ul_ch_magb, int **rxdataF_comp, LTE_DL_FRAME_PARMS *frame_parms, unsigned char symbol, unsigned char Qm, unsigned short nb_rb, unsigned char output_shift) { unsigned short rb; __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128; unsigned char aarx;//,symbol_mod; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; #ifndef __SSE3__ zeroU = _mm_xor_si128(zeroU,zeroU); #endif // printf("comp: symbol %d\n",symbol); if (Qm == 4) QAM_amp128U = _mm_set1_epi16(QAM16_n1); else if (Qm == 6) { QAM_amp128U = _mm_set1_epi16(QAM64_n1); QAM_amp128bU = _mm_set1_epi16(QAM64_n2); } for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) { ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128 = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; for (rb=0;rb<nb_rb;rb++) { // printf("comp: symbol %d rb %d\n",symbol,rb); #ifdef OFDMA_ULSCH if (Qm>2) { // get channel amplitude if not QPSK mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b[0] = ul_ch_mag128[0]; ul_ch_mag128[0] = _mm_mulhi_epi16(ul_ch_mag128[0],QAM_amp128U); ul_ch_mag128[0] = _mm_slli_epi16(ul_ch_mag128[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b[1] = ul_ch_mag128[1]; ul_ch_mag128[1] = _mm_mulhi_epi16(ul_ch_mag128[1],QAM_amp128U); ul_ch_mag128[1] = _mm_slli_epi16(ul_ch_mag128[1],2); // 2 to compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b[2] = ul_ch_mag128[2]; ul_ch_mag128[2] = _mm_mulhi_epi16(ul_ch_mag128[2],QAM_amp128U); ul_ch_mag128[2] = _mm_slli_epi16(ul_ch_mag128[2],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[0] = _mm_mulhi_epi16(ul_ch_mag128b[0],QAM_amp128bU); ul_ch_mag128b[0] = _mm_slli_epi16(ul_ch_mag128b[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[1] = _mm_mulhi_epi16(ul_ch_mag128b[1],QAM_amp128bU); ul_ch_mag128b[1] = _mm_slli_epi16(ul_ch_mag128b[1],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU); ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2);// 2 to compensate the scale channel estimate } #else mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1); mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift-1); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); // printf("comp: 
symbol %d rb %d => %d,%d,%d\n",symbol,rb,*((short*)&ul_ch_mag128[0]),*((short*)&ul_ch_mag128[1]),*((short*)&ul_ch_mag128[2])); #endif // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch:",ul_ch128[0]); // print_shorts("pack:",rxdataF_comp128[0]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch:",ul_ch128[1]); // print_shorts("pack:",rxdataF_comp128[1]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch:",ul_ch128[2]); // print_shorts("pack:",rxdataF_comp128[2]); ul_ch128+=3; ul_ch_mag128+=3; ul_ch_mag128b+=3; rxdataF128+=3; rxdataF_comp128+=3; } } _mm_empty(); _m_empty(); }
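A scalar sketch of the inner "multiply by conjugated channel" step used in the two channel-compensation routines above, for one complex resource element: y = (conj(h) * r) >> output_shift, with each component saturated to int16 as _mm_packs_epi32 does. The vector code builds Im(conj(h)*r) by swapping the Re/Im halves of h and flipping one sign with _mm_sign_epi16 (assuming 'conjugate' is the alternating {-1,1,...} pattern) before _mm_madd_epi16. ch_compensate_ref is a name introduced here.

#include <stdint.h>

static void ch_compensate_ref(int16_t h_re, int16_t h_im,
                              int16_t r_re, int16_t r_im,
                              int output_shift,
                              int16_t *y_re, int16_t *y_im)
{
  int32_t re = ((int32_t)h_re * r_re + (int32_t)h_im * r_im) >> output_shift; // Re(conj(h)*r)
  int32_t im = ((int32_t)h_re * r_im - (int32_t)h_im * r_re) >> output_shift; // Im(conj(h)*r)
  *y_re = (int16_t)(re > 32767 ? 32767 : (re < -32768 ? -32768 : re));
  *y_im = (int16_t)(im > 32767 ? 32767 : (im < -32768 ? -32768 : im));
}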
void lte_idft(LTE_DL_FRAME_PARMS *frame_parms,int *z, unsigned short Msc_PUSCH) { int *idft_in0=(int*)idft_in128[0],*idft_out0=(int*)idft_out128[0]; int *idft_in1=(int*)idft_in128[1],*idft_out1=(int*)idft_out128[1]; int *idft_in2=(int*)idft_in128[2],*idft_out2=(int*)idft_out128[2]; int *z0,*z1,*z2,*z3,*z4,*z5,*z6,*z7,*z8,*z9,*z10,*z11; int i,ip; // printf("Doing lte_idft for Msc_PUSCH %d\n",Msc_PUSCH); if (frame_parms->Ncp == 0) { // Normal prefix z0 = z; z1 = z0+(frame_parms->N_RB_DL*12); z2 = z1+(frame_parms->N_RB_DL*12); //pilot z3 = z2+(2*frame_parms->N_RB_DL*12); z4 = z3+(frame_parms->N_RB_DL*12); z5 = z4+(frame_parms->N_RB_DL*12); z6 = z5+(frame_parms->N_RB_DL*12); z7 = z6+(frame_parms->N_RB_DL*12); z8 = z7+(frame_parms->N_RB_DL*12); //pilot z9 = z8+(2*frame_parms->N_RB_DL*12); z10 = z9+(frame_parms->N_RB_DL*12); // srs z11 = z10+(frame_parms->N_RB_DL*12); } else { // extended prefix z0 = z; z1 = z0+(frame_parms->N_RB_DL*12); //pilot z2 = z1+(2*frame_parms->N_RB_DL*12); z3 = z2+(frame_parms->N_RB_DL*12); z4 = z3+(frame_parms->N_RB_DL*12); z5 = z4+(frame_parms->N_RB_DL*12); z6 = z5+(frame_parms->N_RB_DL*12); //pilot z7 = z6+(2*frame_parms->N_RB_DL*12); z8 = z7+(frame_parms->N_RB_DL*12); // srs z9 = z8+(frame_parms->N_RB_DL*12); } // conjugate input for (i=0;i<(Msc_PUSCH>>2);i++) { *&(((__m128i*)z0)[i])=_mm_sign_epi16(*&(((__m128i*)z0)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z1)[i])=_mm_sign_epi16(*&(((__m128i*)z1)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z2)[i])=_mm_sign_epi16(*&(((__m128i*)z2)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z3)[i])=_mm_sign_epi16(*&(((__m128i*)z3)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z4)[i])=_mm_sign_epi16(*&(((__m128i*)z4)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z5)[i])=_mm_sign_epi16(*&(((__m128i*)z5)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z6)[i])=_mm_sign_epi16(*&(((__m128i*)z6)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z7)[i])=_mm_sign_epi16(*&(((__m128i*)z7)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z8)[i])=_mm_sign_epi16(*&(((__m128i*)z8)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z9)[i])=_mm_sign_epi16(*&(((__m128i*)z9)[i]),*(__m128i*)&conjugate2[0]); if (frame_parms->Ncp==0) { *&(((__m128i*)z10)[i])=_mm_sign_epi16(*&(((__m128i*)z10)[i]),*(__m128i*)&conjugate2[0]); *&(((__m128i*)z11)[i])=_mm_sign_epi16(*&(((__m128i*)z11)[i]),*(__m128i*)&conjugate2[0]); } } for (i=0,ip=0;i<Msc_PUSCH;i++,ip+=4) { idft_in0[ip+0] = z0[i]; idft_in0[ip+1] = z1[i]; idft_in0[ip+2] = z2[i]; idft_in0[ip+3] = z3[i]; idft_in1[ip+0] = z4[i]; idft_in1[ip+1] = z5[i]; idft_in1[ip+2] = z6[i]; idft_in1[ip+3] = z7[i]; idft_in2[ip+0] = z8[i]; idft_in2[ip+1] = z9[i]; if (frame_parms->Ncp==0) { idft_in2[ip+2] = z10[i]; idft_in2[ip+3] = z11[i]; } } switch (Msc_PUSCH) { case 12: // dft12f(dft_in0,dft_out0,1); // dft12f(idft_in1,idft_out1,1); // dft12f(idft_in2,idft_out2,1); break; case 24: dft24(idft_in0,idft_out0,1); dft24(idft_in1,idft_out1,1); dft24(idft_in2,idft_out2,1); break; case 36: dft36(idft_in0,idft_out0,1); dft36(idft_in1,idft_out1,1); dft36(idft_in2,idft_out2,1); break; case 48: dft48(idft_in0,idft_out0,1); dft48(idft_in1,idft_out1,1); dft48(idft_in2,idft_out2,1); break; case 60: dft60(idft_in0,idft_out0,1); dft60(idft_in1,idft_out1,1); dft60(idft_in2,idft_out2,1); break; case 72: dft72(idft_in0,idft_out0,1); dft72(idft_in1,idft_out1,1); dft72(idft_in2,idft_out2,1); break; case 96: dft96(idft_in0,idft_out0,1); dft96(idft_in1,idft_out1,1); dft96(idft_in2,idft_out2,1); break; case 108: dft108(idft_in0,idft_out0,1); 
dft108(idft_in1,idft_out1,1); dft108(idft_in2,idft_out2,1); break; case 120: dft120(idft_in0,idft_out0,1); dft120(idft_in1,idft_out1,1); dft120(idft_in2,idft_out2,1); break; case 144: dft144(idft_in0,idft_out0,1); dft144(idft_in1,idft_out1,1); dft144(idft_in2,idft_out2,1); break; case 180: dft180(idft_in0,idft_out0,1); dft180(idft_in1,idft_out1,1); dft180(idft_in2,idft_out2,1); break; case 192: dft192(idft_in0,idft_out0,1); dft192(idft_in1,idft_out1,1); dft192(idft_in2,idft_out2,1); break; case 240: dft240(idft_in0,idft_out0,1); dft240(idft_in1,idft_out1,1); dft240(idft_in2,idft_out2,1); break; case 288: dft288(idft_in0,idft_out0,1); dft288(idft_in1,idft_out1,1); dft288(idft_in2,idft_out2,1); break; case 300: dft300(idft_in0,idft_out0,1); dft300(idft_in1,idft_out1,1); dft300(idft_in2,idft_out2,1); break; } for (i=0,ip=0;i<Msc_PUSCH;i++,ip+=4) { z0[i] = idft_out0[ip]; /* printf("out0 (%d,%d),(%d,%d),(%d,%d),(%d,%d)\n", ((short*)&idft_out0[ip])[0],((short*)&idft_out0[ip])[1], ((short*)&idft_out0[ip+1])[0],((short*)&idft_out0[ip+1])[1], ((short*)&idft_out0[ip+2])[0],((short*)&idft_out0[ip+2])[1], ((short*)&idft_out0[ip+3])[0],((short*)&idft_out0[ip+3])[1]); */ z1[i] = idft_out0[ip+1]; z2[i] = idft_out0[ip+2]; z3[i] = idft_out0[ip+3]; z4[i] = idft_out1[ip+0]; z5[i] = idft_out1[ip+1]; z6[i] = idft_out1[ip+2]; z7[i] = idft_out1[ip+3]; z8[i] = idft_out2[ip]; z9[i] = idft_out2[ip+1]; if (frame_parms->Ncp==0) { z10[i] = idft_out2[ip+2]; z11[i] = idft_out2[ip+3]; } } // conjugate output for (i=0;i<(Msc_PUSCH>>2);i++) { ((__m128i*)z0)[i]=_mm_sign_epi16(((__m128i*)z0)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z1)[i]=_mm_sign_epi16(((__m128i*)z1)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z2)[i]=_mm_sign_epi16(((__m128i*)z2)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z3)[i]=_mm_sign_epi16(((__m128i*)z3)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z4)[i]=_mm_sign_epi16(((__m128i*)z4)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z5)[i]=_mm_sign_epi16(((__m128i*)z5)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z6)[i]=_mm_sign_epi16(((__m128i*)z6)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z7)[i]=_mm_sign_epi16(((__m128i*)z7)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z8)[i]=_mm_sign_epi16(((__m128i*)z8)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z9)[i]=_mm_sign_epi16(((__m128i*)z9)[i],*(__m128i*)&conjugate2[0]); if (frame_parms->Ncp==0) { ((__m128i*)z10)[i]=_mm_sign_epi16(((__m128i*)z10)[i],*(__m128i*)&conjugate2[0]); ((__m128i*)z11)[i]=_mm_sign_epi16(((__m128i*)z11)[i],*(__m128i*)&conjugate2[0]); } } }
__m128i test_mm_sign_epi16(__m128i a, __m128i b) {
  // CHECK-LABEL: test_mm_sign_epi16
  // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
  return _mm_sign_epi16(a, b);
}
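For reference, the per-lane semantics of _mm_sign_epi16 (PSIGNW) that all the snippets in this collection rely on, written out as a scalar helper:

#include <stdint.h>

// Result keeps the magnitude of a and takes the sign of b; lanes where b is
// zero become zero. Negating INT16_MIN wraps back to INT16_MIN, as PSIGNW does.
static int16_t sign_epi16_lane(int16_t a, int16_t b)
{
  if (b == 0) return 0;
  return (b < 0) ? (int16_t)(-a) : a;
}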
/* ------------------------------------------------------------------------- */ pstatus_t ssse3_sign_16s( const INT16 *pSrc, INT16 *pDst, INT32 len) { const INT16 *sptr = (const INT16 *) pSrc; INT16 *dptr = (INT16 *) pDst; size_t count; if (len < 16) { return general_sign_16s(pSrc, pDst, len); } /* Check for 16-byte alignment (eventually). */ if ((ULONG_PTR) pDst & 0x01) { return general_sign_16s(pSrc, pDst, len); } /* Seek 16-byte alignment. */ while ((ULONG_PTR) dptr & 0x0f) { INT16 src = *sptr++; *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0); if (--len == 0) return PRIMITIVES_SUCCESS; } /* Do 32-short chunks using 8 XMM registers. */ count = len >> 5; /* / 32 */ len -= count << 5; /* * 32 */ if ((ULONG_PTR) sptr & 0x0f) { /* Unaligned */ while (count--) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; xmm0 = _mm_set1_epi16(0x0001U); xmm1 = _mm_set1_epi16(0x0001U); xmm2 = _mm_set1_epi16(0x0001U); xmm3 = _mm_set1_epi16(0x0001U); xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; xmm0 = _mm_sign_epi16(xmm0, xmm4); xmm1 = _mm_sign_epi16(xmm1, xmm5); xmm2 = _mm_sign_epi16(xmm2, xmm6); xmm3 = _mm_sign_epi16(xmm3, xmm7); _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; } } else { /* Aligned */ while (count--) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; xmm0 = _mm_set1_epi16(0x0001U); xmm1 = _mm_set1_epi16(0x0001U); xmm2 = _mm_set1_epi16(0x0001U); xmm3 = _mm_set1_epi16(0x0001U); xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8; xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8; xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8; xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8; xmm0 = _mm_sign_epi16(xmm0, xmm4); xmm1 = _mm_sign_epi16(xmm1, xmm5); xmm2 = _mm_sign_epi16(xmm2, xmm6); xmm3 = _mm_sign_epi16(xmm3, xmm7); _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; } } /* Do 8-short chunks using two XMM registers. */ count = len >> 3; len -= count << 3; while (count--) { __m128i xmm0 = _mm_set1_epi16(0x0001U); __m128i xmm1 = LOAD_SI128(sptr); sptr += 8; xmm0 = _mm_sign_epi16(xmm0, xmm1); _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; } /* Do leftovers. */ while (len--) { INT16 src = *sptr++; *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0); } return PRIMITIVES_SUCCESS; }
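The core trick in the routine above, isolated: applying PSIGNW to a vector of ones yields the three-valued sign of each input lane (-1, 0, +1). A minimal sketch assuming 'in' points to 8 int16 values and both pointers may be unaligned; sign8_sketch is a name introduced here, not part of the FreeRDP primitives API.

#include <stdint.h>
#include <tmmintrin.h>

static void sign8_sketch(const int16_t *in, int16_t *out)
{
  __m128i ones = _mm_set1_epi16(1);
  __m128i v = _mm_loadu_si128((const __m128i *)in);
  // each output lane is +1, 0 or -1 according to the sign of the input lane
  _mm_storeu_si128((__m128i *)out, _mm_sign_epi16(ones, v));
}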
int mult_cpx_conj_vector(int16_t *x1,
                         int16_t *x2,
                         int16_t *y,
                         uint32_t N,
                         int output_shift,
                         int madd)
{
  // Multiply elementwise the complex conjugate of x1 with x2.
  // x1       - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x1 with a dynamic of 15 bit maximum
  //
  // x2       - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x2 with a dynamic of 14 bit maximum
  //
  // y        - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //
  // N        - the size of the vectors (this function does N complex multiplies). WARNING: N>=4
  //
  // output_shift - shift to be applied to generate output
  //
  // madd     - add the output to y

  uint32_t i;                 // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
#if defined(__x86_64__) || defined(__i386__)
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;
#elif defined(__arm__)
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int16x4x2_t tmpy;
  int32x4_t shift = vdupq_n_s32(-output_shift);
#endif

  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];

  // we compute 4 cpx multiply for each loop
  for(i=0; i<(N>>2); i++) {
#if defined(__x86_64__) || defined(__i386__)
    tmp_re = _mm_madd_epi16(*x1_128,*x2_128);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]);
    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
    tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im);
    tmpy1 = _mm_unpackhi_epi32(tmp_re,tmp_im);

    if (madd==0)
      *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
    else
      *y_128 += _mm_packs_epi32(tmpy0,tmpy1);

#elif defined(__arm__)
    tmp_re  = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]);
    //tmp_re = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])]
    tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]);
    //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])]

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)),
                        ((simdshort_q15_t*)x1_128)[0]);
    //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)),
                        ((simdshort_q15_t*)x1_128)[1]);
    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

    tmp_re = vqshlq_s32(tmp_re,shift);
    tmp_im = vqshlq_s32(tmp_im,shift);
    tmpy = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im));
    if (madd==0)
      *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]);
    else
      *y_128 += vcombine_s16(tmpy.val[0],tmpy.val[1]);
#endif
    x1_128++;
    x2_128++;
    y_128++;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
int multadd_cpx_vector(int16_t *x1,
                       int16_t *x2,
                       int16_t *y,
                       uint8_t zero_flag,
                       uint32_t N,
                       int output_shift)
{
  // Multiply elementwise x1 with x2 and optionally accumulate into y
  // (no conjugation is applied; see mult_cpx_conj_vector above for the conjugate variant).
  // x1       - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x1 with a dynamic of 15 bit maximum
  //
  // x2       - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x2 with a dynamic of 14 bit maximum
  //
  // y        - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //
  // zero_flag - set output (y) to zero prior to accumulation, i.e. overwrite y with the
  //             product instead of adding to it
  //
  // N        - the size of the vectors (this function does N complex multiplies). WARNING: N>=4
  //
  // output_shift - shift to be applied to generate output

  uint32_t i;                 // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
#if defined(__x86_64__) || defined(__i386__)
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;
#elif defined(__arm__)
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int16x4x2_t tmpy;
  int32x4_t shift = vdupq_n_s32(-output_shift);
#endif

  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];

  // we compute 4 cpx multiply for each loop
  for(i=0; i<(N>>2); i++) {
#if defined(__x86_64__) || defined(__i386__)
    tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]);
    tmp_re = _mm_madd_epi16(tmp_re,*x2_128);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
    tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im);
    //print_ints("unpack lo:",&tmpy0[i]);
    tmpy1 = _mm_unpackhi_epi32(tmp_re,tmp_im);
    //print_ints("unpack hi:",&tmpy1[i]);
    if (zero_flag == 1)
      *y_128 = _mm_packs_epi32(tmpy0,tmpy1);
    else
      *y_128 = _mm_adds_epi16(*y_128,_mm_packs_epi32(tmpy0,tmpy1));
    //print_shorts("*y_128:",&y_128[i]);
#elif defined(__arm__)
    msg("mult_cpx_vector not implemented for __arm__");
#endif
    x1_128++;
    x2_128++;
    y_128++;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
int mult_cpx_vector(int16_t *x1,  // Q15
                    int16_t *x2,  // Q13
                    int16_t *y,
                    uint32_t N,
                    int output_shift)
{
  // Multiply elementwise x1 with x2.
  // x1       - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x1 with a dynamic of 15 bit maximum
  //
  // x2       - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x2 with a dynamic of 14 bit maximum
  //
  // y        - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
  //
  // N        - the size of the vectors (this function does N complex multiplies). WARNING: N>=4
  //
  // output_shift - shift to be applied to generate output

  uint32_t i;                 // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *x2_128;
  simd_q15_t *y_128;
  simd_q15_t tmp_re,tmp_im;
  simd_q15_t tmpy0,tmpy1;

  x1_128 = (simd_q15_t *)&x1[0];
  x2_128 = (simd_q15_t *)&x2[0];
  y_128  = (simd_q15_t *)&y[0];

  //print_shorts("x1_128:",&x1_128[0]);
  //print_shorts("x2_128:",&x2_128[0]);

  // we compute 4 cpx multiply for each loop
  for(i=0; i<(N>>2); i++) {
    tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]);       // Q15
    //print_shorts("tmp_re1:",&tmp_re[i]);
    tmp_re = _mm_madd_epi16(tmp_re,*x2_128);                       // Q28
    //print_ints("tmp_re2:",&tmp_re[i]);
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));    // Q15
    //print_shorts("tmp_im1:",&tmp_im[i]);
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));     // Q15
    //print_shorts("tmp_im2:",&tmp_im[i]);
    tmp_im = _mm_madd_epi16(tmp_im, *x2_128);                      // Q28
    //print_ints("tmp_im3:",&tmp_im[i]);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);                  // Q(28-shift)
    //print_ints("tmp_re shifted:",&tmp_re[i]);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);                  // Q(28-shift)
    //print_ints("tmp_im shifted:",&tmp_im[i]);
    tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im);                     // Q(28-shift)
    //print_ints("unpack lo :",&tmpy0[i]);
    tmpy1 = _mm_unpackhi_epi32(tmp_re,tmp_im);                     // Q(28-shift)
    //print_ints("mrc rho0:",&tmpy1[i]);
    *y_128 = _mm_packs_epi32(tmpy0,tmpy1);                         // must be Q15
    //print_shorts("*y_128:",&y_128[i]);

    x1_128++;
    x2_128++;
    y_128++;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
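A scalar sketch of one element of mult_cpx_vector(): the full complex product of x1[i] (Q15) and x2[i] (Q13), an arithmetic right shift, then saturation to int16 as _mm_packs_epi32 does. This assumes conjug2 is the alternating {1,-1,1,-1,...} pattern that turns _mm_madd_epi16 into Re(x1)Re(x2) - Im(x1)Im(x2); mult_cpx_ref is a name introduced here.

#include <stdint.h>

static void mult_cpx_ref(const int16_t x1[2], const int16_t x2[2],
                         int16_t y[2], int output_shift)
{
  int32_t re = ((int32_t)x1[0] * x2[0] - (int32_t)x1[1] * x2[1]) >> output_shift; // Re(x1*x2)
  int32_t im = ((int32_t)x1[1] * x2[0] + (int32_t)x1[0] * x2[1]) >> output_shift; // Im(x1*x2)
  y[0] = (int16_t)(re > 32767 ? 32767 : (re < -32768 ? -32768 : re));
  y[1] = (int16_t)(im > 32767 ? 32767 : (im < -32768 ? -32768 : im));
}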