template<class T> inline void dequantise_sse4_2_16_8_3(QuantisationMatrix *qmatrix, int32_t *idata, void *_odata, int ostride) { T *odata = (T *)_odata; const int slice_width = 16; const int slice_height = 8; const int Y = 0; const int X = 0; const int N = 0; T * const optr = &odata[Y*slice_height*ostride + X*slice_width]; const int32_t * iptr = &idata[N*slice_height*slice_width]; { __m128i D0; { D0 = _mm_load_si128((__m128i *)&iptr[ 0]); // [ 0 1 2 3 ] (Q) __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[0][0]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][1])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[0][0]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][1])); __m128i X = _mm_abs_epi32(D0); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D0 = _mm_sign_epi32(X, D0); D0 = _mm_shuffle_epi32(D0, 0xD8); } const __m128i D1 = LOAD_QUANTISED(&iptr[8], qmatrix, 2, 1); const __m128i D2 = LOAD_QUANTISED(&iptr[32], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[36], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 8], B2, B3); } { __m128i D0; { D0 = _mm_load_si128((__m128i *)&iptr[ 4]); __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[1][2]), _mm_load_si128((__m128i *)&qmatrix->qfactor[1][3])); __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[1][2]), _mm_load_si128((__m128i *)&qmatrix->qoffset[1][3])); __m128i X = _mm_abs_epi32(D0); X = _mm_mullo_epi32(X, QF); X = _mm_add_epi32(X, QO); X = _mm_srai_epi32(X, 2); D0 = _mm_sign_epi32(X, D0); D0 = _mm_shuffle_epi32(D0, 0xD8); } const __m128i D1 = LOAD_QUANTISED(&iptr[12], qmatrix, 2, 1); const __m128i D2 = LOAD_QUANTISED(&iptr[48], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[52], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 8], B2, B3); } { const __m128i D0 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 2); const __m128i D1 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 3); const __m128i D2 = LOAD_QUANTISED(&iptr[40], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[44], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 8], B2, B3); } { const __m128i D0 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 2); const __m128i D1 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 3); const __m128i D2 = LOAD_QUANTISED(&iptr[56], qmatrix, 3, 1); const __m128i D3 = LOAD_QUANTISED(&iptr[60], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D1); 
const __m128i A1 = _mm_unpackhi_epi32(D0, D1); const __m128i B0 = _mm_unpacklo_epi32(A0, D2); const __m128i B1 = _mm_unpackhi_epi32(A0, D2); const __m128i B2 = _mm_unpacklo_epi32(A1, D3); const __m128i B3 = _mm_unpackhi_epi32(A1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 0], B0, B1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 8], B2, B3); } for (int y = 0; y < 4; y++) { const __m128i D0 = LOAD_QUANTISED(&iptr[ 64 + y*8], qmatrix, 3, 2); const __m128i D1 = LOAD_QUANTISED(&iptr[ 68 + y*8], qmatrix, 3, 2); const __m128i D2 = LOAD_QUANTISED(&iptr[ 96 + y*8], qmatrix, 3, 3); const __m128i D3 = LOAD_QUANTISED(&iptr[100 + y*8], qmatrix, 3, 3); const __m128i A0 = _mm_unpacklo_epi32(D0, D2); const __m128i A1 = _mm_unpackhi_epi32(D0, D2); const __m128i A2 = _mm_unpacklo_epi32(D1, D3); const __m128i A3 = _mm_unpackhi_epi32(D1, D3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride + 0], A0, A1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride + 8], A2, A3); } }
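/* The hand-inlined block at the top of this function spells out what the
 * LOAD_QUANTISED macro (not shown here) presumably does for the remaining
 * subbands: dequantise four 32-bit coefficients as
 * sign(x) * ((|x| * qfactor + qoffset) >> 2).  A minimal sketch of that step
 * as a standalone helper, under the assumption that a single qfactor/qoffset
 * vector is used per call; the inlined variant above additionally shuffles
 * because it mixes coefficients from two subbands. */
#include <stdint.h>
#include <smmintrin.h>  /* SSE4.1: _mm_mullo_epi32; SSSE3: _mm_abs/_mm_sign */

static inline __m128i dequantise4_sketch(const int32_t *coeffs,
                                         const int32_t *qfactor,
                                         const int32_t *qoffset)
{
    const __m128i D  = _mm_load_si128((const __m128i *)coeffs);
    const __m128i QF = _mm_load_si128((const __m128i *)qfactor);
    const __m128i QO = _mm_load_si128((const __m128i *)qoffset);
    __m128i X = _mm_abs_epi32(D);       /* |x|                   */
    X = _mm_mullo_epi32(X, QF);         /* |x| * qfactor         */
    X = _mm_add_epi32(X, QO);           /* ... + qoffset         */
    X = _mm_srai_epi32(X, 2);           /* ... >> 2              */
    return _mm_sign_epi32(X, D);        /* restore the sign of x */
}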
void ulsch_channel_compensation_alamouti(int **rxdataF_ext, // For Distributed Alamouti Combining int **ul_ch_estimates_ext_0, int **ul_ch_estimates_ext_1, int **ul_ch_mag_0, int **ul_ch_magb_0, int **ul_ch_mag_1, int **ul_ch_magb_1, int **rxdataF_comp_0, int **rxdataF_comp_1, LTE_DL_FRAME_PARMS *frame_parms, unsigned char symbol, unsigned char Qm, unsigned short nb_rb, unsigned char output_shift_0, unsigned char output_shift_1) { unsigned short rb; __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1,*rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1; unsigned char aarx;//,symbol_mod; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; #ifndef __SSE3__ zeroU = _mm_xor_si128(zeroU,zeroU); #endif // printf("comp: symbol %d\n",symbol); if (Qm == 4) { QAM_amp128U_0 = _mm_set1_epi16(QAM16_n1); QAM_amp128U_1 = _mm_set1_epi16(QAM16_n1); } else if (Qm == 6) { QAM_amp128U_0 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_0 = _mm_set1_epi16(QAM64_n2); QAM_amp128U_1 = _mm_set1_epi16(QAM64_n1); QAM_amp128bU_1 = _mm_set1_epi16(QAM64_n2); } for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) { ul_ch128_0 = (__m128i *)&ul_ch_estimates_ext_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_0 = (__m128i *)&ul_ch_mag_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_0 = (__m128i *)&ul_ch_magb_0[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch128_1 = (__m128i *)&ul_ch_estimates_ext_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128_1 = (__m128i *)&ul_ch_mag_1[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b_1 = (__m128i *)&ul_ch_magb_1[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_0 = (__m128i *)&rxdataF_comp_0[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128_1 = (__m128i *)&rxdataF_comp_1[aarx][symbol*frame_parms->N_RB_DL*12]; for (rb=0;rb<nb_rb;rb++) { // printf("comp: symbol %d rb %d\n",symbol,rb); if (Qm>2) { // get channel amplitude if not QPSK mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],ul_ch128_0[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_madd_epi16(ul_ch128_0[1],ul_ch128_0[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_0[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[0] = ul_ch_mag128_0[0]; ul_ch_mag128_0[0] = _mm_mulhi_epi16(ul_ch_mag128_0[0],QAM_amp128U_0); ul_ch_mag128_0[0] = _mm_slli_epi16(ul_ch_mag128_0[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128_0[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_0[1] = ul_ch_mag128_0[1]; ul_ch_mag128_0[1] = _mm_mulhi_epi16(ul_ch_mag128_0[1],QAM_amp128U_0); ul_ch_mag128_0[1] = _mm_slli_epi16(ul_ch_mag128_0[1],2); // 2 to scale compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],ul_ch128_0[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_0[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_0[2] = ul_ch_mag128_0[2]; ul_ch_mag128_0[2] = _mm_mulhi_epi16(ul_ch_mag128_0[2],QAM_amp128U_0); ul_ch_mag128_0[2] = _mm_slli_epi16(ul_ch_mag128_0[2],2); // 2 to scale compensate the scale channel estimat ul_ch_mag128b_0[0] = _mm_mulhi_epi16(ul_ch_mag128b_0[0],QAM_amp128bU_0); ul_ch_mag128b_0[0] = _mm_slli_epi16(ul_ch_mag128b_0[0],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_0[1] = _mm_mulhi_epi16(ul_ch_mag128b_0[1],QAM_amp128bU_0); ul_ch_mag128b_0[1] = 
_mm_slli_epi16(ul_ch_mag128b_0[1],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_0[2] = _mm_mulhi_epi16(ul_ch_mag128b_0[2],QAM_amp128bU_0); ul_ch_mag128b_0[2] = _mm_slli_epi16(ul_ch_mag128b_0[2],2); // 2 to scale compensate the scale channel estima mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],ul_ch128_1[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_madd_epi16(ul_ch128_1[1],ul_ch128_1[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128_1[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[0] = ul_ch_mag128_1[0]; ul_ch_mag128_1[0] = _mm_mulhi_epi16(ul_ch_mag128_1[0],QAM_amp128U_1); ul_ch_mag128_1[0] = _mm_slli_epi16(ul_ch_mag128_1[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128_1[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b_1[1] = ul_ch_mag128_1[1]; ul_ch_mag128_1[1] = _mm_mulhi_epi16(ul_ch_mag128_1[1],QAM_amp128U_1); ul_ch_mag128_1[1] = _mm_slli_epi16(ul_ch_mag128_1[1],2); // 2 to scale compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],ul_ch128_1[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128_1[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b_1[2] = ul_ch_mag128_1[2]; ul_ch_mag128_1[2] = _mm_mulhi_epi16(ul_ch_mag128_1[2],QAM_amp128U_0); ul_ch_mag128_1[2] = _mm_slli_epi16(ul_ch_mag128_1[2],2); // 2 to scale compensate the scale channel estimat ul_ch_mag128b_1[0] = _mm_mulhi_epi16(ul_ch_mag128b_1[0],QAM_amp128bU_1); ul_ch_mag128b_1[0] = _mm_slli_epi16(ul_ch_mag128b_1[0],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_1[1] = _mm_mulhi_epi16(ul_ch_mag128b_1[1],QAM_amp128bU_1); ul_ch_mag128b_1[1] = _mm_slli_epi16(ul_ch_mag128b_1[1],2); // 2 to scale compensate the scale channel estima ul_ch_mag128b_1[2] = _mm_mulhi_epi16(ul_ch_mag128b_1[2],QAM_amp128bU_1); ul_ch_mag128b_1[2] = _mm_slli_epi16(ul_ch_mag128b_1[2],2); // 2 to scale compensate the scale channel estima } /************************For Computing (y)*(h0*)********************************************/ // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_0[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch:",ul_ch128_0[0]); // print_shorts("pack:",rxdataF_comp128_0[0]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]); // mmtmpU1 
contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch:",ul_ch128_0[1]); // print_shorts("pack:",rxdataF_comp128_0[1]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_0[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_0[2]); /*************************For Computing (y*)*(h1)************************************/ // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128_1[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch_conjugate:",ul_ch128_1[0]); // print_shorts("pack:",rxdataF_comp128_1[0]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch_conjugate:",ul_ch128_1[1]); // print_shorts("pack:",rxdataF_comp128_1[1]); // multiply by conjugated signal mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = 
_mm_madd_epi16(mmtmpU1,ul_ch128_1[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128_1[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch_conjugate:",ul_ch128_0[2]); // print_shorts("pack:",rxdataF_comp128_1[2]); ul_ch128_0+=3; ul_ch_mag128_0+=3; ul_ch_mag128b_0+=3; ul_ch128_1+=3; ul_ch_mag128_1+=3; ul_ch_mag128b_1+=3; rxdataF128+=3; rxdataF_comp128_0+=3; rxdataF_comp128_1+=3; } } _mm_empty(); _m_empty(); }
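/* The per-RB loop above repeats one pattern many times: a fixed-point complex
 * multiply in which one operand is conjugated, using _mm_madd_epi16 for the
 * real parts and a swap/sign-flip/madd for the imaginary parts.  A compact
 * sketch of that pattern for four int16 I/Q pairs per vector -- the helper
 * name and the conj_flip constant are illustrative, not OAI code. */
#include <tmmintrin.h>  /* SSSE3: _mm_sign_epi16 */

static inline __m128i mult_by_conj_sketch(__m128i h, __m128i y, int output_shift)
{
    /* Real part per pair: re(h)*re(y) + im(h)*im(y) = Re(conj(h)*y) */
    __m128i re = _mm_madd_epi16(h, y);
    /* Imag part: swap re/im of h, negate the im lane, then madd:
     * -im(h)*re(y) + re(h)*im(y) = Im(conj(h)*y) */
    const __m128i conj_flip = _mm_set_epi16(1, -1, 1, -1, 1, -1, 1, -1);
    __m128i hs = _mm_shufflelo_epi16(h, _MM_SHUFFLE(2, 3, 0, 1));
    hs = _mm_shufflehi_epi16(hs, _MM_SHUFFLE(2, 3, 0, 1));
    hs = _mm_sign_epi16(hs, conj_flip);
    __m128i im = _mm_madd_epi16(hs, y);
    /* Scale back to 16 bits and re-interleave re/im */
    re = _mm_srai_epi32(re, output_shift);
    im = _mm_srai_epi32(im, output_shift);
    return _mm_packs_epi32(_mm_unpacklo_epi32(re, im),
                           _mm_unpackhi_epi32(re, im));
}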
mlib_status
__mlib_VectorDotProd_U8_Sat(
	mlib_d64 *z,
	const mlib_u8 *x,
	const mlib_u8 *y,
	mlib_s32 n)
{
	if (n <= 0)
		return (MLIB_FAILURE);

	mlib_s32 i, ax, ay, nstep, n1, n2, n3, sum = 0;
	const mlib_u8 *px = x, *py = y;
	__m128i zero, xbuf, ybuf, zbuf32, zbuf64, buf1, buf2, buf3, buf4;
	zero = _mm_setzero_si128();
	zbuf64 = zero;

	ax = (mlib_addr)x & 15;
	ay = (mlib_addr)y & 15;
	nstep = 16 / sizeof (mlib_u8);
	n1 = ((16 - ax) & 15) / sizeof (mlib_u8);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	if (n2 > 0) {
		for (i = 0; i < n1; i++) {
			sum += (mlib_s32)(*px++) * (*py++);
		}

		mlib_s32 nblock = n2 >> 12;
		mlib_s32 tail = n2 & 4095;
		mlib_s32 k;

		if (ax == ay) {
			for (k = 0; k < nblock; k++) {
				zbuf32 = zero;
				for (i = 0; i < 4096; i++) {
					VECTOR_DOTPROD_U8(load);
				}
				buf1 = _mm_unpacklo_epi32(zbuf32, zero);
				buf2 = _mm_unpackhi_epi32(zbuf32, zero);
				zbuf64 = _mm_add_epi64(zbuf64, buf1);
				zbuf64 = _mm_add_epi64(zbuf64, buf2);
			}
			zbuf32 = zero;
			for (i = 0; i < tail; i++) {
				VECTOR_DOTPROD_U8(load);
			}
		} else {
			for (k = 0; k < nblock; k++) {
				zbuf32 = zero;
				for (i = 0; i < 4096; i++) {
					VECTOR_DOTPROD_U8(loadu);
				}
				buf1 = _mm_unpacklo_epi32(zbuf32, zero);
				buf2 = _mm_unpackhi_epi32(zbuf32, zero);
				zbuf64 = _mm_add_epi64(zbuf64, buf1);
				zbuf64 = _mm_add_epi64(zbuf64, buf2);
			}
			zbuf32 = zero;
			for (i = 0; i < tail; i++) {
				VECTOR_DOTPROD_U8(loadu);
			}
		}

		buf1 = _mm_unpacklo_epi32(zbuf32, zero);
		buf2 = _mm_unpackhi_epi32(zbuf32, zero);
		zbuf64 = _mm_add_epi64(zbuf64, buf1);
		zbuf64 = _mm_add_epi64(zbuf64, buf2);

		for (i = 0; i < n3; i++) {
			sum += (mlib_s32)(*px++) * (*py++);
		}

		mlib_d64 dsum = sum;
		long long pz[2];
		_mm_storeu_si128((__m128i *)pz, zbuf64);
		dsum += pz[0];
		dsum += pz[1];
		*z = dsum;
	} else {
		for (i = 0; i < n; i++) {
			sum += (mlib_s32)(*px++) * (*py++);
		}
		*z = (mlib_d64)sum;
	}

	return (MLIB_SUCCESS);
}
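/* For reference, the value the vector path has to reproduce is the plain dot
 * product; the 4096-iteration blocks above presumably exist so the 32-bit
 * lanes of zbuf32 are folded into the 64-bit accumulator zbuf64 before they
 * can overflow.  A scalar sketch of the same computation (illustrative only,
 * not mediaLib code; plain stdint types instead of mlib_* types). */
#include <stdint.h>

static double dot_u8_ref(const uint8_t *x, const uint8_t *y, int32_t n)
{
    double s = 0.0;  /* the exact integer sum of u8*u8 products fits easily */
    for (int32_t i = 0; i < n; i++)
        s += (double)x[i] * (double)y[i];
    return s;
}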
template<int pixelFormat> void imageFromPixels(vl::Image & image, char unsigned const * rgb, int rowStride) { vl::ImageShape const & shape = image.getShape() ; int blockSizeX ; int blockSizeY ; int pixelStride ; int imagePlaneStride = (int)shape.width * (int)shape.height ; __m128i shuffleRgb ; __m128i const shuffleL = _mm_set_epi8(0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, 0) ; __m128i const mask = _mm_set_epi32(0xff, 0xff, 0xff, 0xff) ; switch (pixelFormat) { case pixelFormatL: pixelStride = 1 ; blockSizeX = 16 ; blockSizeY = 4 ; break ; case pixelFormatBGR: case pixelFormatRGB: pixelStride = 3 ; blockSizeX = 4 ; blockSizeY = 4 ; assert(shape.depth == 3) ; break ; case pixelFormatRGBA: case pixelFormatBGRA: case pixelFormatBGRAasL: pixelStride = 4 ; blockSizeX = 4 ; blockSizeY = 4 ; assert(shape.depth == 3) ; break ; default: assert(false) ; } switch (pixelFormat) { case pixelFormatL: break ; case pixelFormatRGB: shuffleRgb = _mm_set_epi8(0xff, 11, 10, 9, 0xff, 8, 7, 6, 0xff, 5, 4, 3, 0xff, 2, 1, 0) ; break ; case pixelFormatRGBA: shuffleRgb = _mm_set_epi8(0xff, 14, 13, 12, 0xff, 10, 9, 8, 0xff, 6, 5, 4, 0xff, 2, 1, 0) ; break ; case pixelFormatBGR: shuffleRgb = _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 4, 0xff, 0, 1, 2) ; break ; case pixelFormatBGRA: shuffleRgb = _mm_set_epi8(0xff, 12, 13, 14, 0xff, 8, 9, 10, 0xff, 4, 5, 6, 0xff, 0, 1, 2) ; break ; case pixelFormatBGRAasL: shuffleRgb = _mm_set_epi8(0xff, 0xff, 0xff, 12, 0xff, 0xff, 0xff, 8, 0xff, 0xff, 0xff, 4, 0xff, 0xff, 0xff, 0) ; break ; } // we pull out these values as otherwise the compiler // will assume that the reference &image can be aliased // and recompute silly multiplications in the inner loop float * const __restrict imageMemory = image.getMemory() ; int const imageHeight = (int)shape.height ; int const imageWidth = (int)shape.width ; for (int x = 0 ; x < imageWidth ; x += blockSizeX) { int y = 0 ; float * __restrict imageMemoryX = imageMemory + x * imageHeight ; int bsx = (std::min)(imageWidth - x, blockSizeX) ; if (bsx < blockSizeX) goto boundary ; for ( ; y < imageHeight - blockSizeY + 1 ; y += blockSizeY) { char unsigned const * __restrict pixel = rgb + y * rowStride + x * pixelStride ; float * __restrict r = imageMemoryX + y ; __m128i p0, p1, p2, p3, T0, T1, T2, T3 ; /* convert a blockSizeX x blockSizeY block in the input image */ switch (pixelFormat) { case pixelFormatRGB : case pixelFormatRGBA : case pixelFormatBGR : case pixelFormatBGRA : case pixelFormatBGRAasL : // load 4x4 RGB pixels p0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; p1 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; p2 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; p3 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; // transpose pixels as 32-bit integers (see also below) T0 = _mm_unpacklo_epi32(p0, p1); T1 = _mm_unpacklo_epi32(p2, p3); T2 = _mm_unpackhi_epi32(p0, p1); T3 = _mm_unpackhi_epi32(p2, p3); p0 = _mm_unpacklo_epi64(T0, T1); p1 = _mm_unpackhi_epi64(T0, T1); p2 = _mm_unpacklo_epi64(T2, T3); p3 = _mm_unpackhi_epi64(T2, T3); // store r _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) 
; if (pixelFormat == pixelFormatBGRAasL) break ; // store g r += (imageWidth - 3) * imageHeight ; p0 = _mm_srli_epi32 (p0, 8) ; p1 = _mm_srli_epi32 (p1, 8) ; p2 = _mm_srli_epi32 (p2, 8) ; p3 = _mm_srli_epi32 (p3, 8) ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ; // store b r += (imageWidth - 3) * imageHeight ; p0 = _mm_srli_epi32 (p0, 8) ; p1 = _mm_srli_epi32 (p1, 8) ; p2 = _mm_srli_epi32 (p2, 8) ; p3 = _mm_srli_epi32 (p3, 8) ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ; break ; case pixelFormatL: // load 4x16 L pixels p0 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; p1 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; p2 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; p3 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; /* Pixels are collected in little-endian order: the first pixel is at the `right' (least significant byte of p0: p[0] = a, p[1] = b, ... p0: [ ... | ... | ... | d c b a ] p1: [ ... | ... | ... | h g f e ] p2: [ ... | ... | ... | l k j i ] p3: [ ... | ... | ... | p o n m ] The goal is to transpose four 4x4 subblocks in the 4 x 16 pixel array. The first step interlaves individual pixels in p0 and p1: T0: [ ... | ... | h d g c | f b e a ] T1: [ ... | ... | p l o k | n j m i ] T2: [ ... | ... | ... | ... ] T3: [ ... | ... | ... | ... ] The second step interleaves groups of two pixels: p0: [pl hd | ok gc | nj fb | mi ea] (pixels in the rightmost 4x4 subblock) p1: ... p2: ... p3: ... The third step interlevaes groups of four pixels: T0: [ ... | njfb | ... | miea ] T1: ... T2: ... T3: ... The last step interleaves groups of eight pixels: p0: [ ... | ... | ... | miea ] p1: [ ... | ... | ... | njfb ] p2: [ ... | ... | ... | okgc ] p3: [ ... | ... | ... 
| dklp ] */ T0 = _mm_unpacklo_epi8(p0, p1); T1 = _mm_unpacklo_epi8(p2, p3); T2 = _mm_unpackhi_epi8(p0, p1); T3 = _mm_unpackhi_epi8(p2, p3); p0 = _mm_unpacklo_epi16(T0, T1); p1 = _mm_unpackhi_epi16(T0, T1); p2 = _mm_unpacklo_epi16(T2, T3); p3 = _mm_unpackhi_epi16(T2, T3); T0 = _mm_unpacklo_epi32(p0, p1); T1 = _mm_unpacklo_epi32(p2, p3); T2 = _mm_unpackhi_epi32(p0, p1); T3 = _mm_unpackhi_epi32(p2, p3); p0 = _mm_unpacklo_epi64(T0, T1); p1 = _mm_unpackhi_epi64(T0, T1); p2 = _mm_unpacklo_epi64(T2, T3); p3 = _mm_unpackhi_epi64(T2, T3); // store four 4x4 subblock for (int i = 0 ; i < 4 ; ++i) { _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p0, shuffleL))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p1, shuffleL))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p2, shuffleL))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p3, shuffleL))) ; r += imageHeight ; p0 = _mm_srli_si128 (p0, 4) ; p1 = _mm_srli_si128 (p1, 4) ; p2 = _mm_srli_si128 (p2, 4) ; p3 = _mm_srli_si128 (p3, 4) ; } break ; } } /* next y */ boundary: /* special case if there is not a full 4x4 block to process */ for ( ; y < imageHeight ; y += blockSizeY) { int bsy = (std::min)(imageHeight - y, blockSizeY) ; float * __restrict r ; float * rend ; for (int dx = 0 ; dx < bsx ; ++dx) { char unsigned const * __restrict pixel = rgb + y * rowStride + (x + dx) * pixelStride ; r = imageMemoryX + y + dx * imageHeight ; rend = r + bsy ; while (r != rend) { switch (pixelFormat) { case pixelFormatRGBA: case pixelFormatRGB: r[0 * imagePlaneStride] = (float) pixel[0] ; r[1 * imagePlaneStride] = (float) pixel[1] ; r[2 * imagePlaneStride] = (float) pixel[2] ; break ; case pixelFormatBGR: case pixelFormatBGRA: r[2 * imagePlaneStride] = (float) pixel[0] ; r[1 * imagePlaneStride] = (float) pixel[1] ; r[0 * imagePlaneStride] = (float) pixel[2] ; break; case pixelFormatBGRAasL: case pixelFormatL: r[0] = (float) pixel[0] ; break ; } r += 1 ; pixel += rowStride ; } } } } }
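/* Both branches above rely on the classic unpack-based 4x4 transpose of
 * 32-bit lanes (here, pixels) to turn four interleaved rows into four planar
 * columns.  Isolated as a standalone sketch of just that building block. */
#include <emmintrin.h>

static inline void transpose4x4_epi32(__m128i *p0, __m128i *p1,
                                      __m128i *p2, __m128i *p3)
{
    const __m128i t0 = _mm_unpacklo_epi32(*p0, *p1); /* a0 b0 a1 b1 */
    const __m128i t1 = _mm_unpacklo_epi32(*p2, *p3); /* c0 d0 c1 d1 */
    const __m128i t2 = _mm_unpackhi_epi32(*p0, *p1); /* a2 b2 a3 b3 */
    const __m128i t3 = _mm_unpackhi_epi32(*p2, *p3); /* c2 d2 c3 d3 */
    *p0 = _mm_unpacklo_epi64(t0, t1);                /* a0 b0 c0 d0 */
    *p1 = _mm_unpackhi_epi64(t0, t1);                /* a1 b1 c1 d1 */
    *p2 = _mm_unpacklo_epi64(t2, t3);                /* a2 b2 c2 d2 */
    *p3 = _mm_unpackhi_epi64(t2, t3);                /* a3 b3 c3 d3 */
}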
static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 // // To be able to use signed 16-bit integers, we use the following trick to // have constants within range: // - Associated constants are obtained by subtracting the 16-bit fixed point // version of one: // k = K - (1 << 16) => K = k + (1 << 16) // K1 = 85267 => k1 = 20091 // K2 = 35468 => k2 = -30068 // - The multiplication of a variable by a constant become the sum of the // variable and the multiplication of that variable by the associated // constant: // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x const __m128i k1 = _mm_set1_epi16(20091); const __m128i k2 = _mm_set1_epi16(-30068); __m128i T0, T1, T2, T3; // Load and concatenate the transform coefficients (we'll do two transforms // in parallel). In the case of only one transform, the second half of the // vectors will just contain random value we'll never use nor store. __m128i in0, in1, in2, in3; { in0 = _mm_loadl_epi64((__m128i*)&in[0]); in1 = _mm_loadl_epi64((__m128i*)&in[4]); in2 = _mm_loadl_epi64((__m128i*)&in[8]); in3 = _mm_loadl_epi64((__m128i*)&in[12]); // a00 a10 a20 a30 x x x x // a01 a11 a21 a31 x x x x // a02 a12 a22 a32 x x x x // a03 a13 a23 a33 x x x x if (do_two) { const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]); const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]); const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]); const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]); in0 = _mm_unpacklo_epi64(in0, inB0); in1 = _mm_unpacklo_epi64(in1, inB1); in2 = _mm_unpacklo_epi64(in2, inB2); in3 = _mm_unpacklo_epi64(in3, inB3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } } // Vertical pass and subsequent transpose. { // First pass, c and d calculations are longer because of the "trick" // multiplications. const __m128i a = _mm_add_epi16(in0, in2); const __m128i b = _mm_sub_epi16(in0, in2); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 const __m128i c1 = _mm_mulhi_epi16(in1, k2); const __m128i c2 = _mm_mulhi_epi16(in3, k1); const __m128i c3 = _mm_sub_epi16(in1, in3); const __m128i c4 = _mm_sub_epi16(c1, c2); const __m128i c = _mm_add_epi16(c3, c4); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 const __m128i d1 = _mm_mulhi_epi16(in1, k1); const __m128i d2 = _mm_mulhi_epi16(in3, k2); const __m128i d3 = _mm_add_epi16(in1, in3); const __m128i d4 = _mm_add_epi16(d1, d2); const __m128i d = _mm_add_epi16(d3, d4); // Second pass. const __m128i tmp0 = _mm_add_epi16(a, d); const __m128i tmp1 = _mm_add_epi16(b, c); const __m128i tmp2 = _mm_sub_epi16(b, c); const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. 
// a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // First pass, c and d calculations are longer because of the "trick" // multiplications. const __m128i four = _mm_set1_epi16(4); const __m128i dc = _mm_add_epi16(T0, four); const __m128i a = _mm_add_epi16(dc, T2); const __m128i b = _mm_sub_epi16(dc, T2); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 const __m128i c1 = _mm_mulhi_epi16(T1, k2); const __m128i c2 = _mm_mulhi_epi16(T3, k1); const __m128i c3 = _mm_sub_epi16(T1, T3); const __m128i c4 = _mm_sub_epi16(c1, c2); const __m128i c = _mm_add_epi16(c3, c4); // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 const __m128i d1 = _mm_mulhi_epi16(T1, k1); const __m128i d2 = _mm_mulhi_epi16(T3, k2); const __m128i d3 = _mm_add_epi16(T1, T3); const __m128i d4 = _mm_add_epi16(d1, d2); const __m128i d = _mm_add_epi16(d3, d4); // Second pass. const __m128i tmp0 = _mm_add_epi16(a, d); const __m128i tmp1 = _mm_add_epi16(b, c); const __m128i tmp2 = _mm_sub_epi16(b, c); const __m128i tmp3 = _mm_sub_epi16(a, d); const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. 
// a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Add inverse transform to 'dst' and store. { const __m128i zero = _mm_setzero_si128(); // Load the reference(s). __m128i dst0, dst1, dst2, dst3; if (do_two) { // Load eight bytes/pixels per line. dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); } else { // Load four bytes/pixels per line. dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]); dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]); dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]); dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]); } // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); dst1 = _mm_unpacklo_epi8(dst1, zero); dst2 = _mm_unpacklo_epi8(dst2, zero); dst3 = _mm_unpacklo_epi8(dst3, zero); // Add the inverse transform(s). dst0 = _mm_add_epi16(dst0, T0); dst1 = _mm_add_epi16(dst1, T1); dst2 = _mm_add_epi16(dst2, T2); dst3 = _mm_add_epi16(dst3, T3); // Unsigned saturate to 8b. dst0 = _mm_packus_epi16(dst0, dst0); dst1 = _mm_packus_epi16(dst1, dst1); dst2 = _mm_packus_epi16(dst2, dst2); dst3 = _mm_packus_epi16(dst3, dst3); // Store the results. if (do_two) { // Store eight bytes/pixels per line. _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0); _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1); _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2); _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3); } else { // Store four bytes/pixels per line. *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0); *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1); *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2); *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3); } } }
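/* The comment at the top of TransformSSE2 describes the constant trick;
 * concretely, because K1 = 20091 + (1 << 16), the product (x * K1) >> 16 can
 * be formed from a 16-bit high multiply plus the input itself.  A minimal
 * sketch of MUL(x, K1) under that identity (helper name is illustrative). */
#include <emmintrin.h>

static inline __m128i mul_by_K1(__m128i x)
{
    const __m128i k1 = _mm_set1_epi16(20091);
    /* (x * K1) >> 16 == ((x * k1) >> 16) + x, per 16-bit lane */
    return _mm_add_epi16(_mm_mulhi_epi16(x, k1), x);
}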
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k937 = _mm_set1_epi32(937); const __m128i k1812 = _mm_set1_epi32(1812); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, 2217, 5352, 2217, 5352); const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, -5352, 2217, -5352, 2217); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. -> 00 01 02 03 00 00 00 00 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Unpack and shuffle // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); // 00 01 10 11 02 03 12 13 // 20 21 30 31 22 23 32 33 const __m128i shuf01_p = _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1)); const __m128i shuf23_p = _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1)); // 00 01 10 11 03 02 13 12 // 20 21 30 31 23 22 33 32 const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); // 00 01 10 11 20 21 30 31 // 03 02 13 12 23 22 33 32 const __m128i a01 = _mm_add_epi16(s01, s32); const __m128i a32 = _mm_sub_epi16(s01, s32); // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ] const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... 
] const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9); const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9); const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1... const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3 const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); v01 = _mm_unpacklo_epi32(s_lo, s_hi); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. } // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. // -> f1 = f1 + 1 - (a3 == 0) const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); _mm_storeu_si128((__m128i*)&out[0], d0_g1); _mm_storeu_si128((__m128i*)&out[8], d2_f3); } }
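/* The last correction above is easy to miss: the rounding constant
 * k12000_plus_one already adds 1, and _mm_cmpeq_epi16(a32, zero) is -1
 * exactly where a3 == 0, so the add implements f1 + (a3 != 0) without a
 * branch.  Isolated as a sketch (helper name is illustrative). */
#include <emmintrin.h>

static inline __m128i add_nonzero_flag(__m128i f1_plus_one, __m128i a3)
{
    /* cmpeq yields 0xFFFF (-1) where a3 == 0, cancelling the baked-in +1 */
    return _mm_add_epi16(f1_plus_one,
                         _mm_cmpeq_epi16(a3, _mm_setzero_si128()));
}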
void transform8_otherrgb_avx(ThreadInfo* t) { RS_IMAGE16 *input = t->input; GdkPixbuf *output = t->output; RS_MATRIX3 *matrix = t->matrix; gint x,y; gint width; float mat_ps[4*4*3] __attribute__ ((aligned (16))); for (x = 0; x < 4; x++ ) { mat_ps[x] = matrix->coeff[0][0]; mat_ps[x+4] = matrix->coeff[0][1]; mat_ps[x+8] = matrix->coeff[0][2]; mat_ps[12+x] = matrix->coeff[1][0]; mat_ps[12+x+4] = matrix->coeff[1][1]; mat_ps[12+x+8] = matrix->coeff[1][2]; mat_ps[24+x] = matrix->coeff[2][0]; mat_ps[24+x+4] = matrix->coeff[2][1]; mat_ps[24+x+8] = matrix->coeff[2][2]; } int start_x = t->start_x; /* Always have aligned input and output adress */ if (start_x & 3) start_x = ((start_x) / 4) * 4; int complete_w = t->end_x - start_x; /* If width is not multiple of 4, check if we can extend it a bit */ if (complete_w & 3) { if ((t->end_x+4) < input->w) complete_w = ((complete_w+3) / 4 * 4); } __m128 gamma = _mm_set1_ps(t->output_gamma); for(y=t->start_y ; y<t->end_y ; y++) { gushort *i = GET_PIXEL(input, start_x, y); guchar *o = GET_PIXBUF_PIXEL(output, start_x, y); gboolean aligned_write = !((guintptr)(o)&0xf); width = complete_w >> 2; while(width--) { /* Load and convert to float */ __m128i zero = _mm_setzero_si128(); __m128i in = _mm_load_si128((__m128i*)i); // Load two pixels __m128i in2 = _mm_load_si128((__m128i*)i+1); // Load two pixels _mm_prefetch(i + 64, _MM_HINT_NTA); __m128i p1 =_mm_unpacklo_epi16(in, zero); __m128i p2 =_mm_unpackhi_epi16(in, zero); __m128i p3 =_mm_unpacklo_epi16(in2, zero); __m128i p4 =_mm_unpackhi_epi16(in2, zero); __m128 p1f = _mm_cvtepi32_ps(p1); __m128 p2f = _mm_cvtepi32_ps(p2); __m128 p3f = _mm_cvtepi32_ps(p3); __m128 p4f = _mm_cvtepi32_ps(p4); /* Convert to planar */ __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f); __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f); __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f); __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f); __m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2); __m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0); __m128 b = _mm_movelh_ps(b1b0, b3b2); /* Apply matrix to convert to sRGB */ __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b); __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b); __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b); /* Normalize to 0->1 and clamp */ __m128 normalize = _mm_load_ps(_normalize); __m128 max_val = _mm_load_ps(_ones_ps); __m128 min_val = _mm_setzero_ps(); r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2))); g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2))); b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2))); /* Apply Gamma */ __m128 upscale = _mm_load_ps(_8bit); r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma)); g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma)); b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma)); /* Convert to 8 bit unsigned and interleave*/ __m128i r_i = _mm_cvtps_epi32(r); __m128i g_i = _mm_cvtps_epi32(g); __m128i b_i = _mm_cvtps_epi32(b); r_i = _mm_packs_epi32(r_i, r_i); g_i = _mm_packs_epi32(g_i, g_i); b_i = _mm_packs_epi32(b_i, b_i); /* Set alpha value to 255 and store */ __m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask); __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i); __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i); p1 = _mm_unpacklo_epi32(rg_i, bb_i); p2 = _mm_unpackhi_epi32(rg_i, bb_i); p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2)); if (aligned_write) _mm_store_si128((__m128i*)o, p1); else _mm_storeu_si128((__m128i*)o, p1); i += 16; o += 16; } /* Process remaining pixels */ width = complete_w & 3; while(width--) { __m128i zero = 
_mm_setzero_si128(); __m128i in = _mm_loadl_epi64((__m128i*)i); // Load one pixel __m128i p1 = _mm_unpacklo_epi16(in, zero); __m128 p1f = _mm_cvtepi32_ps(p1); /* Splat r,g,b */ __m128 r = _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(0,0,0,0)); __m128 g = _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(1,1,1,1)); __m128 b = _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(2,2,2,2)); __m128 r2 = sse_matrix3_mul(mat_ps, r, g, b); __m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b); __m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b); r = _mm_unpacklo_ps(r2, g2); // GG RR GG RR r = _mm_movelh_ps(r, b2); // BB BB GG RR __m128 normalize = _mm_load_ps(_normalize); __m128 max_val = _mm_load_ps(_ones_ps); __m128 min_val = _mm_setzero_ps(); r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r))); __m128 upscale = _mm_load_ps(_8bit); r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma)); /* Convert to 8 bit unsigned */ zero = _mm_setzero_si128(); __m128i r_i = _mm_cvtps_epi32(r); /* To 16 bit signed */ r_i = _mm_packs_epi32(r_i, zero); /* To 8 bit unsigned - set alpha channel */ __m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask); r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero)); *(int*)o = _mm_cvtsi128_si32(r_i); i+=4; o+=4; } } }
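/* sse_matrix3_mul() is not shown here; given how mat_ps is filled (each
 * coefficient replicated four times, matrix rows at float offsets 0, 12 and
 * 24), it presumably evaluates one output channel for four pixels as three
 * multiply-adds.  A sketch under that assumption (name is illustrative). */
#include <xmmintrin.h>

static inline __m128 sse_matrix3_mul_sketch(const float *m,
                                            __m128 r, __m128 g, __m128 b)
{
    __m128 acc = _mm_mul_ps(r, _mm_load_ps(&m[0]));            /* c0 * r  */
    acc = _mm_add_ps(acc, _mm_mul_ps(g, _mm_load_ps(&m[4])));  /* + c1 * g */
    acc = _mm_add_ps(acc, _mm_mul_ps(b, _mm_load_ps(&m[8])));  /* + c2 * b */
    return acc;
}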
/* * Notice: * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST * numbers of DD bits */ static inline uint16_t _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union i40e_rx_desc *rxdp; struct i40e_rx_entry *sw_ring; uint16_t nb_pkts_recd; int pos; uint64_t var; __m128i shuf_msk; uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; __m128i crc_adjust = _mm_set_epi16( 0, 0, 0, /* ignore non-length fields */ -rxq->crc_len, /* sub crc on data_len */ 0, /* ignore high-16bits of pkt_len */ -rxq->crc_len, /* sub crc on pkt_len */ 0, 0 /* ignore pkt_type field */ ); /* * compile-time check the above crc_adjust layout is correct. * NOTE: the first field (lowest address) is given last in set_epi16 * call above. */ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); __m128i dd_check, eop_check; /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */ nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST); /* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP); /* Just the act of getting into the function from the application is * going to cost about 7 cycles */ rxdp = rxq->rx_ring + rxq->rx_tail; rte_prefetch0(rxdp); /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) i40e_rxq_rearm(rxq); /* Before we start moving massive data around, check to see if * there is actually a packet available */ if (!(rxdp->wb.qword1.status_error_len & rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT))) return 0; /* 4 packets DD mask */ dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL); /* 4 packets EOP mask */ eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL); /* mask to shuffle from desc. to mbuf */ shuf_msk = _mm_set_epi8( 7, 6, 5, 4, /* octet 4~7, 32bits rss */ 3, 2, /* octet 2~3, low 16 bits vlan_macip */ 15, 14, /* octet 15~14, 16 bits data_len */ 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ 15, 14, /* octet 15~14, low 16 bits pkt_len */ 0xFF, 0xFF, /* pkt_type set as unknown */ 0xFF, 0xFF /*pkt_type set as unknown */ ); /* * Compile-time verify the shuffle mask * NOTE: some field positions already verified above, but duplicated * here for completeness in case of future modifications. */ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); /* Cache is empty -> need to scan the buffer rings, but first move * the next 'n' mbufs into the cache */ sw_ring = &rxq->sw_ring[rxq->rx_tail]; /* A. load 4 packet in one loop * [A*. mask out 4 unused dirty field in desc] * B. copy 4 mbuf point from swring to rx_pkts * C. calc the number of DD bits among the 4 packets * [C*. extract the end-of-packet bit, if requested] * D. fill info. 
from desc to mbuf */ for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts; pos += RTE_I40E_DESCS_PER_LOOP, rxdp += RTE_I40E_DESCS_PER_LOOP) { __m128i descs[RTE_I40E_DESCS_PER_LOOP]; __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; __m128i zero, staterr, sterr_tmp1, sterr_tmp2; /* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */ __m128i mbp1; #if defined(RTE_ARCH_X86_64) __m128i mbp2; #endif /* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */ mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]); /* Read desc statuses backwards to avoid race condition */ /* A.1 load 4 pkts desc */ descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3)); rte_compiler_barrier(); /* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1); #if defined(RTE_ARCH_X86_64) /* B.1 load 2 64 bit mbuf points */ mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]); #endif descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2)); rte_compiler_barrier(); /* B.1 load 2 mbuf point */ descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1)); rte_compiler_barrier(); descs[0] = _mm_loadu_si128((__m128i *)(rxdp)); #if defined(RTE_ARCH_X86_64) /* B.2 copy 2 mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2); #endif if (split_packet) { rte_mbuf_prefetch_part2(rx_pkts[pos]); rte_mbuf_prefetch_part2(rx_pkts[pos + 1]); rte_mbuf_prefetch_part2(rx_pkts[pos + 2]); rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); } /* avoid compiler reorder optimization */ rte_compiler_barrier(); /* pkt 3,4 shift the pktlen field to be 16-bit aligned*/ const __m128i len3 = _mm_slli_epi32(descs[3], PKTLEN_SHIFT); const __m128i len2 = _mm_slli_epi32(descs[2], PKTLEN_SHIFT); /* merge the now-aligned packet length fields back in */ descs[3] = _mm_blend_epi16(descs[3], len3, 0x80); descs[2] = _mm_blend_epi16(descs[2], len2, 0x80); /* D.1 pkt 3,4 convert format from desc to pktmbuf */ pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk); pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk); /* C.1 4=>2 filter staterr info only */ sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]); /* C.1 4=>2 filter staterr info only */ sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]); desc_to_olflags_v(rxq, descs, &rx_pkts[pos]); /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */ pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust); pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust); /* pkt 1,2 shift the pktlen field to be 16-bit aligned*/ const __m128i len1 = _mm_slli_epi32(descs[1], PKTLEN_SHIFT); const __m128i len0 = _mm_slli_epi32(descs[0], PKTLEN_SHIFT); /* merge the now-aligned packet length fields back in */ descs[1] = _mm_blend_epi16(descs[1], len1, 0x80); descs[0] = _mm_blend_epi16(descs[0], len0, 0x80); /* D.1 pkt 1,2 convert format from desc to pktmbuf */ pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk); pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk); /* C.2 get 4 pkts staterr value */ zero = _mm_xor_si128(dd_check, dd_check); staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2); /* D.3 copy final 3,4 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1, pkt_mb4); _mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1, pkt_mb3); /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */ pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust); pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust); /* C* extract and record EOP bit */ if (split_packet) { __m128i eop_shuf_mask = _mm_set_epi8( 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x04, 0x0C, 0x00, 0x08 ); /* and with mask to extract bits, flipping 1-0 */ 
__m128i eop_bits = _mm_andnot_si128(staterr, eop_check); /* the staterr values are not in order, as the count of dd bits doesn't care. However, for end of packet tracking, we do care, so shuffle. This also compresses the 32-bit values to 8-bit */ eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask); /* store the resulting 32-bit value */ *(int *)split_packet = _mm_cvtsi128_si32(eop_bits); split_packet += RTE_I40E_DESCS_PER_LOOP; } /* C.3 calc available number of desc */ staterr = _mm_and_si128(staterr, dd_check); staterr = _mm_packs_epi32(staterr, zero); /* D.3 copy final 1,2 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1, pkt_mb2); _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1, pkt_mb1); desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl); /* C.4 calc available number of desc */ var = __builtin_popcountll(_mm_cvtsi128_si64(staterr)); nb_pkts_recd += var; if (likely(var != RTE_I40E_DESCS_PER_LOOP)) break; } /* Update our internal tail pointer */ rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd); rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1)); rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd); return nb_pkts_recd; }
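/* The DD-bit accounting at the bottom of the loop works because, after
 * masking with dd_check, every 32-bit lane of staterr is either 0 or 1;
 * packing to 16 bits moves the four flags into the low 64 bits, and a
 * popcount gives the number of completed descriptors.  Isolated as a sketch
 * (helper name is illustrative). */
#include <emmintrin.h>

static inline unsigned count_dd_bits(__m128i staterr, __m128i dd_check)
{
    __m128i dd = _mm_and_si128(staterr, dd_check);      /* lanes -> 0 or 1 */
    dd = _mm_packs_epi32(dd, _mm_setzero_si128());      /* 4 flags in low 64b */
    return (unsigned)__builtin_popcountll(
        (unsigned long long)_mm_cvtsi128_si64(dd));
}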
void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // Load input __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); __m128i in2 = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride)); __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride)); __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride)); __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride)); // Pre-condition input (shift by two) in0 = _mm_slli_epi16(in0, 2); in1 = _mm_slli_epi16(in1, 2); in2 = _mm_slli_epi16(in2, 2); in3 = _mm_slli_epi16(in3, 2); in4 = _mm_slli_epi16(in4, 2); in5 = _mm_slli_epi16(in5, 2); in6 = _mm_slli_epi16(in6, 2); in7 = _mm_slli_epi16(in7, 2); // We do two passes, first the columns, then the rows. The results of the // first pass are transposed so that the same column code can be reused. The // results of the second pass are also transposed so that the rows (processed // as columns) are put back in row positions. for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. 
__m128i res0, res1, res2, res3, res4, res5, res6, res7; // Add/substract const __m128i q0 = _mm_add_epi16(in0, in7); const __m128i q1 = _mm_add_epi16(in1, in6); const __m128i q2 = _mm_add_epi16(in2, in5); const __m128i q3 = _mm_add_epi16(in3, in4); const __m128i q4 = _mm_sub_epi16(in3, in4); const __m128i q5 = _mm_sub_epi16(in2, in5); const __m128i q6 = _mm_sub_epi16(in1, in6); const __m128i q7 = _mm_sub_epi16(in0, in7); // Work on first four results { // Add/substract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us into 32bits const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res0 = _mm_packs_epi32(w0, w1); res4 = _mm_packs_epi32(w2, w3); res2 = _mm_packs_epi32(w4, w5); res6 = _mm_packs_epi32(w6, w7); } // Work on next four results { // Interleave to do the multiply by constants which gets us into 32bits const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); // Add/substract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = 
_mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); const __m128i x3 = _mm_add_epi16(q7, r1); // Interleave to do the multiply by constants which gets us into 32bits const __m128i t0 = _mm_unpacklo_epi16(x0, x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res1 = _mm_packs_epi32(w0, w1); res7 = _mm_packs_epi32(w2, w3); res5 = _mm_packs_epi32(w4, w5); res3 = _mm_packs_epi32(w6, w7); } // Transpose the 8x8. 
{ // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 } } // Post-condition output and store it { // Post-condition (division by two) // division of two 16 bits signed numbers using shifts // n / 2 = (n - (n >> 15)) >> 1 const __m128i sign_in0 = _mm_srai_epi16(in0, 15); const __m128i sign_in1 = _mm_srai_epi16(in1, 15); const __m128i sign_in2 = _mm_srai_epi16(in2, 15); const __m128i sign_in3 = _mm_srai_epi16(in3, 15); const __m128i sign_in4 = _mm_srai_epi16(in4, 15); const __m128i sign_in5 = _mm_srai_epi16(in5, 15); const __m128i sign_in6 = _mm_srai_epi16(in6, 15); const __m128i sign_in7 = _mm_srai_epi16(in7, 15); in0 = _mm_sub_epi16(in0, sign_in0); in1 = _mm_sub_epi16(in1, sign_in1); in2 = _mm_sub_epi16(in2, sign_in2); in3 = _mm_sub_epi16(in3, sign_in3); in4 = _mm_sub_epi16(in4, sign_in4); in5 = _mm_sub_epi16(in5, sign_in5); in6 = _mm_sub_epi16(in6, sign_in6); in7 = _mm_sub_epi16(in7, sign_in7); in0 = _mm_srai_epi16(in0, 1); in1 = _mm_srai_epi16(in1, 1); in2 = _mm_srai_epi16(in2, 1); in3 = _mm_srai_epi16(in3, 1); in4 = _mm_srai_epi16(in4, 1); in5 = _mm_srai_epi16(in5, 1); in6 = _mm_srai_epi16(in6, 1); in7 = _mm_srai_epi16(in7, 1); // store results _mm_storeu_si128((__m128i *)(output + 0 * 8), in0); _mm_storeu_si128((__m128i *)(output + 1 * 8), in1); _mm_storeu_si128((__m128i *)(output + 2 * 8), in2); _mm_storeu_si128((__m128i *)(output + 3 * 8), in3); _mm_storeu_si128((__m128i *)(output + 4 * 8), in4); _mm_storeu_si128((__m128i *)(output + 5 * 8), in5); _mm_storeu_si128((__m128i *)(output + 6 * 8), in6); 
_mm_storeu_si128((__m128i *)(output + 7 * 8), in7); } }
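// A scalar cross-check for the post-conditioning trick used above: for a
// signed 16-bit value n, (n - (n >> 15)) >> 1 equals n / 2 truncated toward
// zero, because n >> 15 is -1 for negative n (adding back the rounding bias)
// and 0 otherwise. This is a standalone sketch, not part of the kernel, and
// it assumes arithmetic right shifts of negative values, which the vector
// code also relies on via _mm_srai_epi16.
#include <assert.h>
#include <stdint.h>

static int16_t div2_toward_zero(int16_t n) {
  return (int16_t)((n - (n >> 15)) >> 1);
}

static void check_div2_postcondition(void) {
  for (int32_t v = -32768; v <= 32767; ++v)
    assert(div2_toward_zero((int16_t)v) == (int16_t)(v / 2));
}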
/* ******************************************************************************** * * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients * of a 16x16 intra prediction macroblock, and then performs scaling. * prediction buffer * * @par Description: * The DC coefficients pass through a 2-stage inverse hadamard transform. * This inverse transformed content is scaled to based on Qp value. * * @param[in] pi2_src * input 4x4 block of DC coefficients * * @param[out] pi2_out * output 4x4 block * * @param[in] pu2_iscal_mat * pointer to scaling list * * @param[in] pu2_weigh_mat * pointer to weight matrix * * @param[in] u4_qp_div_6 * Floor (qp/6) * * @param[in] pi4_tmp * temporary buffer of size 1*16 * * @returns none * * @remarks none * ******************************************************************************* */ void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6, WORD32* pi4_tmp) { int val = 0xFFFF; __m128i src_r0_r1, src_r2_r3, sign_reg, zero_8x16b = _mm_setzero_si128(); __m128i src_r0, src_r1, src_r2, src_r3; __m128i temp0, temp1, temp2, temp3; __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6))); __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); __m128i mask = _mm_set1_epi32(val); UNUSED (pi4_tmp); mult_val = _mm_and_si128(mult_val, mask); src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3); src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); /* Perform Inverse transform */ /*-------------------------------------------------------------*/ /* IDCT [ Horizontal transformation ] */ /*-------------------------------------------------------------*/ // Matrix transpose /* * a0 a1 a2 a3 * b0 b1 b2 b3 * c0 c1 c2 c3 * d0 d1 d2 d3 */ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 temp0 = _mm_add_epi32(src_r0, src_r3); temp1 = _mm_add_epi32(src_r1, src_r2); temp2 = _mm_sub_epi32(src_r1, src_r2); temp3 = _mm_sub_epi32(src_r0, src_r3); src_r0 = _mm_add_epi32(temp0, temp1); src_r1 = _mm_add_epi32(temp2, temp3); src_r2 = _mm_sub_epi32(temp0, temp1); src_r3 = _mm_sub_epi32(temp3, temp2); /*-------------------------------------------------------------*/ /* IDCT [ Vertical transformation ] */ /*-------------------------------------------------------------*/ // Matrix transpose /* * a0 b0 c0 d0 * a1 b1 c1 d1 * a2 b2 c2 d2 * a3 b3 c3 d3 */ temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 src_r0 = _mm_unpacklo_epi64(temp0, 
temp2); //a0 a1 a2 a3 src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3 src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 temp0 = _mm_add_epi32(src_r0, src_r3); temp1 = _mm_add_epi32(src_r1, src_r2); temp2 = _mm_sub_epi32(src_r1, src_r2); temp3 = _mm_sub_epi32(src_r0, src_r3); src_r0 = _mm_add_epi32(temp0, temp1); src_r1 = _mm_add_epi32(temp2, temp3); src_r2 = _mm_sub_epi32(temp0, temp1); src_r3 = _mm_sub_epi32(temp3, temp2); src_r0 = _mm_and_si128(src_r0, mask); src_r1 = _mm_and_si128(src_r1, mask); src_r2 = _mm_and_si128(src_r2, mask); src_r3 = _mm_and_si128(src_r3, mask); src_r0 = _mm_madd_epi16(src_r0, mult_val); src_r1 = _mm_madd_epi16(src_r1, mult_val); src_r2 = _mm_madd_epi16(src_r2, mult_val); src_r3 = _mm_madd_epi16(src_r3, mult_val); //Scaling if(u4_qp_div_6 >= 6) { src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6); src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6); src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6); src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6); } else { temp0 = _mm_add_epi32(src_r0, add_rshift); temp1 = _mm_add_epi32(src_r1, add_rshift); temp2 = _mm_add_epi32(src_r2, add_rshift); temp3 = _mm_add_epi32(src_r3, add_rshift); src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6); src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6); src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6); src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6); } src_r0_r1 = _mm_packs_epi32(src_r0, src_r1); src_r2_r3 = _mm_packs_epi32(src_r2, src_r3); _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1); _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3); }
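/* Minimal scalar sketch of the Qp-dependent scaling branch above; the helper
 * name and parameters are illustrative, not part of the library. Each
 * inverse-transformed DC value is multiplied by the low 16 bits of
 * iscal[0] * weigh[0] (mirroring the 16-bit masking applied before
 * _mm_madd_epi16) and then either shifted left by qp_div_6 - 6 or rounded
 * and shifted right by 6 - qp_div_6, exactly as in the two branches of the
 * vector code. */
#include <stdint.h>

static int32_t scale_dc_scalar(int32_t dc, uint16_t iscal0, uint16_t weigh0,
                               uint32_t qp_div_6) {
  const int32_t mult = (int32_t)(((uint32_t)iscal0 * weigh0) & 0xFFFF);
  const int32_t v = dc * mult;
  if (qp_div_6 >= 6)
    return v << (qp_div_6 - 6);
  return (v + (1 << (5 - qp_div_6))) >> (6 - qp_div_6);
}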
static inline void desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4], struct rte_mbuf **rx_pkts) { const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer); __m128i rearm0, rearm1, rearm2, rearm3; __m128i vlan0, vlan1, rss, l3_l4e; /* mask everything except RSS, flow director and VLAN flags * bit2 is for VLAN tag, bit11 for flow director indication * bit13:12 for RSS indication. */ const __m128i rss_vlan_msk = _mm_set_epi32( 0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804); const __m128i cksum_mask = _mm_set_epi32( PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD); /* map rss and vlan type to rss hash and vlan flag */ const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, 0, 0, 0, 0); const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0, 0, 0, PKT_RX_FDIR, 0); const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, /* shift right 1 bit to make sure it not exceed 255 */ (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1, (PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1, (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1, PKT_RX_IP_CKSUM_BAD >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1); vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]); vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]); vlan0 = _mm_unpacklo_epi64(vlan0, vlan1); vlan1 = _mm_and_si128(vlan0, rss_vlan_msk); vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1); rss = _mm_srli_epi32(vlan1, 11); rss = _mm_shuffle_epi8(rss_flags, rss); l3_l4e = _mm_srli_epi32(vlan1, 22); l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e); /* then we shift left 1 bit */ l3_l4e = _mm_slli_epi32(l3_l4e, 1); /* we need to mask out the reduntant bits */ l3_l4e = _mm_and_si128(l3_l4e, cksum_mask); vlan0 = _mm_or_si128(vlan0, rss); vlan0 = _mm_or_si128(vlan0, l3_l4e); /* * At this point, we have the 4 sets of flags in the low 16-bits * of each 32-bit value in vlan0. * We want to extract these, and merge them with the mbuf init data * so we can do a single 16-byte write to the mbuf to set the flags * and all the other initialization fields. Extracting the * appropriate flags means that we have to do a shift and blend for * each mbuf before we do the write. 
*/ rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x10); rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x10); rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x10); rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x10); /* write the rearm data and the olflags in one write */ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != offsetof(struct rte_mbuf, rearm_data) + 8); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); _mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0); _mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1); _mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2); _mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3); }
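/* Standalone illustration (not driver code) of the PSHUFB-as-lookup-table
 * idiom used above: a 16-entry byte table is indexed by the low nibble of
 * every byte of the source register, so after shifting the status word the
 * field of interest selects one flag byte per lane. The table and status
 * values below are arbitrary stand-ins. */
#include <assert.h>
#include <stdint.h>
#include <tmmintrin.h> /* SSSE3: _mm_shuffle_epi8 */

static void demo_pshufb_nibble_lut(void) {
  const __m128i table = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                      8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i status = _mm_set_epi32(3, 2, 1, 0); /* low nibbles 0..3 */
  const __m128i flags = _mm_shuffle_epi8(table, status);
  uint32_t out[4];
  _mm_storeu_si128((__m128i *)out, flags);
  assert(out[0] == 0 && out[1] == 1 && out[2] == 2 && out[3] == 3);
}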
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ static void unshuffle16(uint8_t* dest, uint8_t* orig, size_t size) { size_t i, j, k; size_t neblock, numof16belem; __m128i xmm1[16], xmm2[16]; neblock = size / 16; numof16belem = neblock / 16; for (i = 0, k = 0; i < numof16belem; i++, k += 16) { /* Load the first 128 bytes in 16 XMM registrers */ for (j = 0; j < 16; j++) { xmm1[j] = ((__m128i *)orig)[j*numof16belem+i]; } /* Shuffle bytes */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]); /* Compute the hi 32 bytes */ xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]); } /* Shuffle 2-byte words */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]); /* Compute the hi 32 bytes */ xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]); } /* Shuffle 4-byte dwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]); /* Compute the hi 32 bytes */ xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]); } /* Shuffle 8-byte qwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]); /* Compute the hi 32 bytes */ xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]); } /* Store the result vectors in proper order */ ((__m128i *)dest)[k+0] = xmm1[0]; ((__m128i *)dest)[k+1] = xmm1[8]; ((__m128i *)dest)[k+2] = xmm1[4]; ((__m128i *)dest)[k+3] = xmm1[12]; ((__m128i *)dest)[k+4] = xmm1[2]; ((__m128i *)dest)[k+5] = xmm1[10]; ((__m128i *)dest)[k+6] = xmm1[6]; ((__m128i *)dest)[k+7] = xmm1[14]; ((__m128i *)dest)[k+8] = xmm1[1]; ((__m128i *)dest)[k+9] = xmm1[9]; ((__m128i *)dest)[k+10] = xmm1[5]; ((__m128i *)dest)[k+11] = xmm1[13]; ((__m128i *)dest)[k+12] = xmm1[3]; ((__m128i *)dest)[k+13] = xmm1[11]; ((__m128i *)dest)[k+14] = xmm1[7]; ((__m128i *)dest)[k+15] = xmm1[15]; } }
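/* Scalar reference for the unpack ladder above, assuming the blosc-style
 * shuffled layout in which byte j of element i lives at orig[j*neblock + i]
 * with neblock = size / 16. Handy for validating the vector path on small
 * buffers; it is not meant to be fast. */
#include <stddef.h>
#include <stdint.h>

static void unshuffle16_scalar(uint8_t* dest, const uint8_t* orig, size_t size) {
  const size_t neblock = size / 16;        /* number of 16-byte elements */
  for (size_t i = 0; i < neblock; i++)     /* element index */
    for (size_t j = 0; j < 16; j++)        /* byte index within the element */
      dest[i * 16 + j] = orig[j * neblock + i];
}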
int main() { //Transpose vec4 mat[4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; __m128i xmm0 = _mm_unpacklo_epi32(_mm_castps_si128(mat[0]), _mm_castps_si128(mat[1])); __m128i xmm1 = _mm_unpackhi_epi32(_mm_castps_si128(mat[0]), _mm_castps_si128(mat[1])); __m128i xmm2 = _mm_unpacklo_epi32(_mm_castps_si128(mat[2]), _mm_castps_si128(mat[3])); __m128i xmm3 = _mm_unpackhi_epi32(_mm_castps_si128(mat[2]), _mm_castps_si128(mat[3])); vec4 trans[4]; trans[0] = _mm_castsi128_ps(_mm_unpacklo_epi64(xmm0, xmm2)); trans[1] = _mm_castsi128_ps(_mm_unpackhi_epi64(xmm0, xmm2)); trans[2] = _mm_castsi128_ps(_mm_unpacklo_epi64(xmm1, xmm3)); trans[3] = _mm_castsi128_ps(_mm_unpackhi_epi64(xmm1, xmm3)); vec4 trans2[4]; ml::transpose(trans2, mat); FILE* file = fopen("..\\..\\AppData\\VT.swf", "rb"); fseek(file, 0, SEEK_END); size_t size = ftell(file); fseek(file, 0, SEEK_SET); unsigned char* fileData = (unsigned char*)malloc(size); fread(fileData, 1, size, file); fclose(file); MemReader data = {(const char*)fileData, (const char*)fileData+size, (const char*)fileData}; //Read SWF header const u32 signatureAndVersion = data.read<u32>(); const u32 actualSize = data.read<u32>(); u32 signature = signatureAndVersion&0x00FFFFFF; u8 version = signatureAndVersion>>24; bool isCompressed = signature=='\0SWC'; bool isUncompressed = signature=='\0SWF'; //if !isCompressed && !isUncompressed return error; MemReader data2 = {0, 0, 0}; char* uncompressed = 0; if (isCompressed) { uncompressed = (char*)malloc(actualSize-8); data2.cur = data2.start = uncompressed; data2.end = uncompressed+actualSize-8; uLongf uncompressedSize = actualSize-8; uncompress((Bytef*)uncompressed, &uncompressedSize, data.as<Bytef>(), size-8); } else if (isUncompressed) { data2.cur = data2.start = data.as<char>(); data2.end = data2.start+actualSize-8; } u8 bits = data2.read<u8>(); u8 numBits = bits>>3; u32 rectSizeMinusOne = (numBits*4+5)>>3; data2.move(rectSizeMinusOne); const u16 frameRate = data2.read<u16>(); const u16 frameCount = data2.read<u16>(); std::set<u32> tagsUsed; size_t tagCount = 0; while (data2.cur!=data2.end) { u16 tagHeader = data2.read<u16>(); u32 tagLength = tagHeader&0x3F; u32 tagType = tagHeader>>6; tagsUsed.insert(tagType); if (tagLength==0x3F) tagLength = data2.read<u32>(); data2.move(tagLength); parseTag(tagType); ++tagCount; } if (uncompressed) free(uncompressed); printf("\nProcessed %zu tags\n\n", tagCount); printf(" Tags used \n"); printf("-------------------------\n"); std::set<u32>::iterator it = tagsUsed.begin(), end = tagsUsed.end(); for (; it!=end; ++it) { parseTag(*it); } free(fileData); return 0; }
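// Minimal sketch of the tag-record rule used in the loop above: the 16-bit
// record header packs the tag type in the top 10 bits and the length in the
// low 6 bits, and a length of 0x3F means a 32-bit length field follows.
// ByteReader and the read_* helpers below are illustrative stand-ins, not
// the original MemReader API.
#include <stdint.h>

typedef struct { const uint8_t* cur; const uint8_t* end; } ByteReader;

static uint32_t read_u16le(ByteReader* r) {
  const uint32_t v = (uint32_t)r->cur[0] | ((uint32_t)r->cur[1] << 8);
  r->cur += 2;
  return v;
}

static uint32_t read_u32le(ByteReader* r) {
  const uint32_t v = (uint32_t)r->cur[0] | ((uint32_t)r->cur[1] << 8) |
                     ((uint32_t)r->cur[2] << 16) | ((uint32_t)r->cur[3] << 24);
  r->cur += 4;
  return v;
}

// Returns the tag type and advances past the tag body.
static uint32_t read_tag(ByteReader* r) {
  const uint32_t header = read_u16le(r);
  const uint32_t type = header >> 6;
  uint32_t length = header & 0x3F;
  if (length == 0x3F)   // long-form record header
    length = read_u32le(r);
  r->cur += length;     // skip the tag body
  return type;
}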
template<class T> inline void dequantise_sse4_2_8_8_2(QuantisationMatrix *qmatrix, int32_t *idata, void *_odata, int ostride) { T *odata = (T *)_odata; const int slice_width = 8; const int slice_height = 8; const int Y = 0; const int X = 0; const int N = 0; T * const optr = &odata[Y*slice_height*ostride + X*slice_width]; const int32_t * iptr = &idata[N*slice_height*slice_width]; const __m128i D0 = LOAD_QUANTISED(&iptr[ 0], qmatrix, 0, 0); // [ 0 1 2 3 ] const __m128i D4 = LOAD_QUANTISED(&iptr[ 4], qmatrix, 1, 1); // [ 4 5 6 7 ] const __m128i D8 = LOAD_QUANTISED(&iptr[ 8], qmatrix, 1, 2); // [ 8 9 10 11 ] const __m128i D12 = LOAD_QUANTISED(&iptr[12], qmatrix, 1, 3); // [ 12 13 14 15 ] const __m128i D16 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 1); // [ 16 17 18 19 ] const __m128i D20 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 1); // [ 20 21 22 23 ] const __m128i D24 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 1); // [ 24 25 26 27 ] const __m128i D28 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 1); // [ 28 29 30 31 ] const __m128i X0 = _mm_unpacklo_epi32(D0, D4); // [ 0 4 1 5 ] const __m128i Y0 = _mm_unpacklo_epi32(X0, D16); // [ 0 16 4 17 ] const __m128i Y1 = _mm_unpackhi_epi32(X0, D16); // [ 1 18 5 19 ] STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 0], Y0, Y1); const __m128i X1 = _mm_unpackhi_epi32(D0, D4); // [ 2 6 3 7 ] const __m128i Y2 = _mm_unpacklo_epi32(X1, D24); // [ 2 24 6 25 ] const __m128i Y3 = _mm_unpackhi_epi32(X1, D24); // [ 3 26 7 27 ] STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 0], Y2, Y3); const __m128i X2 = _mm_unpacklo_epi32(D8, D12); // [ 8 12 9 13 ] const __m128i Y4 = _mm_unpacklo_epi32(X2, D20); // [ 8 20 12 21 ] const __m128i Y5 = _mm_unpackhi_epi32(X2, D20); // [ 9 22 13 23 ] STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 0], Y4, Y5); const __m128i X3 = _mm_unpackhi_epi32(D8, D12); // [ 10 14 11 15 ] const __m128i Y6 = _mm_unpacklo_epi32(X3, D28); // [ 10 28 14 29 ] const __m128i Y7 = _mm_unpackhi_epi32(X3, D28); // [ 11 30 15 31 ] STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 0], Y6, Y7); const __m128i D32 = LOAD_QUANTISED(&iptr[32], qmatrix, 2, 2); // [ 32 33 34 35 ] const __m128i D36 = LOAD_QUANTISED(&iptr[36], qmatrix, 2, 2); // [ 36 37 38 39 ] const __m128i D40 = LOAD_QUANTISED(&iptr[40], qmatrix, 2, 2); // [ 40 41 42 43 ] const __m128i D44 = LOAD_QUANTISED(&iptr[44], qmatrix, 2, 2); // [ 44 45 46 47 ] const __m128i D48 = LOAD_QUANTISED(&iptr[48], qmatrix, 2, 3); // [ 48 49 50 51 ] const __m128i D52 = LOAD_QUANTISED(&iptr[52], qmatrix, 2, 3); // [ 52 53 54 55 ] const __m128i D56 = LOAD_QUANTISED(&iptr[56], qmatrix, 2, 3); // [ 56 57 58 59 ] const __m128i D60 = LOAD_QUANTISED(&iptr[60], qmatrix, 2, 3); // [ 60 61 62 63 ] const __m128i Z0 = _mm_unpacklo_epi32(D32, D48); // [ 32 48 33 49 ] const __m128i Z1 = _mm_unpackhi_epi32(D32, D48); // [ 34 50 35 51 ] STORE_SAMPLE_PAIR<T>((__m128i *)&optr[1*ostride + 0], Z0, Z1); const __m128i Z2 = _mm_unpacklo_epi32(D36, D52); // [ 36 52 37 53 ] const __m128i Z3 = _mm_unpackhi_epi32(D36, D52); // [ 38 54 39 55 ] STORE_SAMPLE_PAIR<T>((__m128i *)&optr[3*ostride + 0], Z2, Z3); const __m128i Z4 = _mm_unpacklo_epi32(D40, D56); // [ 40 56 41 57 ] const __m128i Z5 = _mm_unpackhi_epi32(D40, D56); // [ 42 58 43 59 ] STORE_SAMPLE_PAIR<T>((__m128i *)&optr[5*ostride + 0], Z4, Z5); const __m128i Z6 = _mm_unpacklo_epi32(D44, D60); // [ 44 60 45 61 ] const __m128i Z7 = _mm_unpackhi_epi32(D44, D60); // [ 46 62 47 63 ] STORE_SAMPLE_PAIR<T>((__m128i *)&optr[7*ostride + 0], Z6, Z7); }
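// Small standalone check of the 32-bit interleave pattern annotated in the
// brackets above: unpacklo/unpackhi on [0 1 2 3] and [4 5 6 7] produce
// [0 4 1 5] and [2 6 3 7], which is how the subband coefficients are woven
// into output rows before STORE_SAMPLE_PAIR. Values are arbitrary stand-ins.
#include <assert.h>
#include <stdint.h>
#include <emmintrin.h>

static void demo_unpack_epi32_order(void) {
  const __m128i a = _mm_setr_epi32(0, 1, 2, 3);
  const __m128i b = _mm_setr_epi32(4, 5, 6, 7);
  int32_t lo[4], hi[4];
  _mm_storeu_si128((__m128i *)lo, _mm_unpacklo_epi32(a, b));
  _mm_storeu_si128((__m128i *)hi, _mm_unpackhi_epi32(a, b));
  assert(lo[0] == 0 && lo[1] == 4 && lo[2] == 1 && lo[3] == 5);
  assert(hi[0] == 2 && hi[1] == 6 && hi[2] == 3 && hi[3] == 7);
}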
OD_SIMD_INLINE od_m256i od_mm256_unpackhi_epi32(od_m256i a, od_m256i b) { od_m256i r; r.lo = _mm_unpackhi_epi32(a.lo, b.lo); r.hi = _mm_unpackhi_epi32(a.hi, b.hi); return r; }
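/* Quick self-check of the split-register emulation above, assuming the
 * od_m256i layout is {lo = bits 0..127, hi = bits 128..255}: applying the
 * 128-bit unpack to each half matches the native AVX2 instruction, which
 * also operates per 128-bit lane. Build with AVX2 enabled (e.g. -mavx2);
 * the check is purely illustrative and not part of the library. */
#include <assert.h>
#include <string.h>
#include <immintrin.h>

static void check_split_unpackhi_epi32(void) {
  const __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  const __m256i b = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
  const __m256i native = _mm256_unpackhi_epi32(a, b);
  const __m128i lo = _mm_unpackhi_epi32(_mm256_castsi256_si128(a),
                                        _mm256_castsi256_si128(b));
  const __m128i hi = _mm_unpackhi_epi32(_mm256_extracti128_si256(a, 1),
                                        _mm256_extracti128_si256(b, 1));
  const __m256i emulated =
      _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
  assert(memcmp(&native, &emulated, sizeof(native)) == 0);
}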
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); const __m128i kOne = _mm_set1_epi16(1); __m128i in0, in1, in2, in3; // Load inputs. { in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); // x = x << 4 in0 = _mm_slli_epi16(in0, 4); in1 = _mm_slli_epi16(in1, 4); in2 = _mm_slli_epi16(in2, 4); in3 = _mm_slli_epi16(in3, 4); // if (i == 0 && input[0]) input[0] += 1; { // The mask will only contain wether the first value is zero, all // other comparison will fail as something shifted by 4 (above << 4) // can never be equal to one. To increment in the non-zero case, we // add the mask and one for the first element: // - if zero, mask = -1, v = v - 1 + 1 = v // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); in0 = _mm_add_epi16(in0, mask); in0 = _mm_add_epi16(in0, k__nonzero_bias_b); } } // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // Transform 1/2: Add/substract const __m128i r0 = _mm_add_epi16(in0, in3); const __m128i r1 = _mm_add_epi16(in1, in2); const __m128i r2 = _mm_sub_epi16(in1, in2); const __m128i r3 = _mm_sub_epi16(in0, in3); // Transform 1/2: Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); // Combine and transpose const __m128i res0 = _mm_packs_epi32(w0, w2); const __m128i res1 = _mm_packs_epi32(w4, w6); // 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 if (0 == pass) { // Extract values in the high part for second pass as transform code // only uses the first four values. in1 = _mm_unpackhi_epi64(in0, in0); in3 = _mm_unpackhi_epi64(in2, in2); } else { // Post-condition output and store it (v + 1) >> 2, taking advantage // of the fact 1/3 are stored just after 0/2. __m128i out01 = _mm_add_epi16(in0, kOne); __m128i out23 = _mm_add_epi16(in2, kOne); out01 = _mm_srai_epi16(out01, 2); out23 = _mm_srai_epi16(out23, 2); _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); } } }
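// Standalone illustration of the interleave + pmaddwd idiom used for every
// butterfly above: after _mm_unpacklo_epi16(r0, r1), each 32-bit lane of
// _mm_madd_epi16 computes r0[i]*c0 + r1[i]*c1, i.e. one rotated output per
// lane. The constants 3 and -2 are arbitrary stand-ins for a cospi pair.
#include <assert.h>
#include <stdint.h>
#include <emmintrin.h>

static void demo_madd_butterfly(void) {
  const __m128i r0 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
  const __m128i r1 = _mm_setr_epi16(5, 6, 7, 8, 0, 0, 0, 0);
  const __m128i k = _mm_setr_epi16(3, -2, 3, -2, 3, -2, 3, -2);
  const __m128i t = _mm_unpacklo_epi16(r0, r1); // r0[0] r1[0] r0[1] r1[1] ...
  const __m128i u = _mm_madd_epi16(t, k);       // r0[i]*3 + r1[i]*(-2)
  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, u);
  assert(out[0] == -7 && out[1] == -6 && out[2] == -5 && out[3] == -4);
}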
/* * vPMD receive routine, now only accept (nb_pkts == RTE_IXGBE_VPMD_RX_BURST) * in one loop * * Notice: * - nb_pkts < RTE_IXGBE_VPMD_RX_BURST, just return no packet * - nb_pkts > RTE_IXGBE_VPMD_RX_BURST, only scan RTE_IXGBE_VPMD_RX_BURST * numbers of DD bit * - don't support ol_flags for rss and csum err */ static inline uint16_t _recv_raw_pkts_vec(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union ixgbe_adv_rx_desc *rxdp; struct igb_rx_entry *sw_ring; uint16_t nb_pkts_recd; int pos; uint64_t var; __m128i shuf_msk; __m128i crc_adjust = _mm_set_epi16( 0, 0, 0, 0, /* ignore non-length fields */ 0, /* ignore high-16bits of pkt_len */ -rxq->crc_len, /* sub crc on pkt_len */ -rxq->crc_len, /* sub crc on data_len */ 0 /* ignore pkt_type field */ ); __m128i dd_check, eop_check; if (unlikely(nb_pkts < RTE_IXGBE_VPMD_RX_BURST)) return 0; /* Just the act of getting into the function from the application is * going to cost about 7 cycles */ rxdp = rxq->rx_ring + rxq->rx_tail; _mm_prefetch((const void *)rxdp, _MM_HINT_T0); /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH) ixgbe_rxq_rearm(rxq); /* Before we start moving massive data around, check to see if * there is actually a packet available */ if (!(rxdp->wb.upper.status_error & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD))) return 0; /* 4 packets DD mask */ dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL); /* 4 packets EOP mask */ eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL); /* mask to shuffle from desc. to mbuf */ shuf_msk = _mm_set_epi8( 7, 6, 5, 4, /* octet 4~7, 32bits rss */ 0xFF, 0xFF, /* skip high 16 bits vlan_macip, zero out */ 15, 14, /* octet 14~15, low 16 bits vlan_macip */ 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ 13, 12, /* octet 12~13, low 16 bits pkt_len */ 13, 12, /* octet 12~13, 16 bits data_len */ 0xFF, 0xFF /* skip pkt_type field */ ); /* Cache is empty -> need to scan the buffer rings, but first move * the next 'n' mbufs into the cache */ sw_ring = &rxq->sw_ring[rxq->rx_tail]; /* * A. load 4 packet in one loop * B. copy 4 mbuf point from swring to rx_pkts * C. calc the number of DD bits among the 4 packets * [C*. extract the end-of-packet bit, if requested] * D. fill info. from desc to mbuf */ for (pos = 0, nb_pkts_recd = 0; pos < RTE_IXGBE_VPMD_RX_BURST; pos += RTE_IXGBE_DESCS_PER_LOOP, rxdp += RTE_IXGBE_DESCS_PER_LOOP) { __m128i descs[RTE_IXGBE_DESCS_PER_LOOP]; __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; __m128i zero, staterr, sterr_tmp1, sterr_tmp2; __m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. 
*/ if (split_packet) { rte_prefetch0(&rx_pkts[pos]->cacheline1); rte_prefetch0(&rx_pkts[pos + 1]->cacheline1); rte_prefetch0(&rx_pkts[pos + 2]->cacheline1); rte_prefetch0(&rx_pkts[pos + 3]->cacheline1); } /* B.1 load 1 mbuf point */ mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]); /* Read desc statuses backwards to avoid race condition */ /* A.1 load 4 pkts desc */ descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3)); /* B.2 copy 2 mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1); /* B.1 load 1 mbuf point */ mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]); descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2)); /* B.1 load 2 mbuf point */ descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1)); descs[0] = _mm_loadu_si128((__m128i *)(rxdp)); /* B.2 copy 2 mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2); /* avoid compiler reorder optimization */ rte_compiler_barrier(); /* D.1 pkt 3,4 convert format from desc to pktmbuf */ pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk); pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk); /* C.1 4=>2 filter staterr info only */ sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]); /* C.1 4=>2 filter staterr info only */ sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]); /* set ol_flags with packet type and vlan tag */ desc_to_olflags_v(descs, &rx_pkts[pos]); /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */ pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust); pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust); /* D.1 pkt 1,2 convert format from desc to pktmbuf */ pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk); pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk); /* C.2 get 4 pkts staterr value */ zero = _mm_xor_si128(dd_check, dd_check); staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2); /* D.3 copy final 3,4 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1, pkt_mb4); _mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1, pkt_mb3); /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */ pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust); pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust); /* C* extract and record EOP bit */ if (split_packet) { __m128i eop_shuf_mask = _mm_set_epi8( 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x04, 0x0C, 0x00, 0x08 ); /* and with mask to extract bits, flipping 1-0 */ __m128i eop_bits = _mm_andnot_si128(staterr, eop_check); /* the staterr values are not in order, as the count * count of dd bits doesn't care. However, for end of * packet tracking, we do care, so shuffle. 
This also * compresses the 32-bit values to 8-bit */ eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask); /* store the resulting 32-bit value */ *(int *)split_packet = _mm_cvtsi128_si32(eop_bits); split_packet += RTE_IXGBE_DESCS_PER_LOOP; /* zero-out next pointers */ rx_pkts[pos]->next = NULL; rx_pkts[pos + 1]->next = NULL; rx_pkts[pos + 2]->next = NULL; rx_pkts[pos + 3]->next = NULL; } /* C.3 calc available number of desc */ staterr = _mm_and_si128(staterr, dd_check); staterr = _mm_packs_epi32(staterr, zero); /* D.3 copy final 1,2 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1, pkt_mb2); _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1, pkt_mb1); /* C.4 calc available number of desc */ var = __builtin_popcountll(_mm_cvtsi128_si64(staterr)); nb_pkts_recd += var; if (likely(var != RTE_IXGBE_DESCS_PER_LOOP)) break; } /* Update our internal tail pointer */ rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd); rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1)); rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd); return nb_pkts_recd; }
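/* Standalone sketch of the DD-bit accounting at the end of the loop above:
 * after masking with dd_check each 32-bit lane is 0 or 1, packing to 16-bit
 * moves the four flags into the low 64 bits, and a popcount of that word is
 * the number of completed descriptors. Values are arbitrary stand-ins; the
 * cvtsi128_si64 path assumes a 64-bit build, as the driver code does. */
#include <emmintrin.h>

static int demo_count_dd_bits(void) {
  /* Pretend descriptors 0 and 1 have their DD bit set, 2 and 3 do not. */
  __m128i staterr = _mm_set_epi32(0, 0, 1, 1);
  const __m128i zero = _mm_setzero_si128();
  staterr = _mm_packs_epi32(staterr, zero); /* 16-bit lanes: 1 1 0 0 0 0 0 0 */
  return __builtin_popcountll((unsigned long long)_mm_cvtsi128_si64(staterr)); /* 2 */
}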
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). const int stride = pitch >> 1; int pass; // We need an intermediate buffer between passes. int16_t intermediate[256]; int16_t *in = input; int16_t *out = intermediate; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // We process eight columns (transposed rows in second pass) at a time. int column_start; for (column_start = 0; column_start < 16; column_start += 8) { __m128i in00, in01, in02, in03, in04, in05, in06, in07; __m128i in08, in09, in10, in11, in12, in13, in14, in15; __m128i input0, input1, input2, input3, input4, input5, input6, input7; __m128i step1_0, step1_1, step1_2, step1_3; __m128i step1_4, step1_5, step1_6, step1_7; __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; __m128i step3_0, step3_1, step3_2, step3_3; __m128i step3_4, step3_5, step3_6, step3_7; __m128i res00, res01, res02, res03, res04, res05, res06, res07; __m128i res08, res09, res10, res11, res12, res13, res14, res15; // Load and pre-condition input. 
if (0 == pass) { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride)); in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride)); // x = x << 2 in00 = _mm_slli_epi16(in00, 2); in01 = _mm_slli_epi16(in01, 2); in02 = _mm_slli_epi16(in02, 2); in03 = _mm_slli_epi16(in03, 2); in04 = _mm_slli_epi16(in04, 2); in05 = _mm_slli_epi16(in05, 2); in06 = _mm_slli_epi16(in06, 2); in07 = _mm_slli_epi16(in07, 2); in08 = _mm_slli_epi16(in08, 2); in09 = _mm_slli_epi16(in09, 2); in10 = _mm_slli_epi16(in10, 2); in11 = _mm_slli_epi16(in11, 2); in12 = _mm_slli_epi16(in12, 2); in13 = _mm_slli_epi16(in13, 2); in14 = _mm_slli_epi16(in14, 2); in15 = _mm_slli_epi16(in15, 2); } else { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16)); in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16)); // x = (x + 1) >> 2 in00 = _mm_add_epi16(in00, kOne); in01 = _mm_add_epi16(in01, kOne); in02 = _mm_add_epi16(in02, kOne); in03 = _mm_add_epi16(in03, kOne); in04 = _mm_add_epi16(in04, kOne); in05 = _mm_add_epi16(in05, kOne); in06 = _mm_add_epi16(in06, kOne); in07 = _mm_add_epi16(in07, kOne); in08 = _mm_add_epi16(in08, kOne); in09 = _mm_add_epi16(in09, kOne); in10 = _mm_add_epi16(in10, kOne); in11 = _mm_add_epi16(in11, kOne); in12 = _mm_add_epi16(in12, kOne); in13 = _mm_add_epi16(in13, kOne); in14 = _mm_add_epi16(in14, kOne); in15 = _mm_add_epi16(in15, kOne); in00 = _mm_srai_epi16(in00, 2); in01 = _mm_srai_epi16(in01, 2); in02 = _mm_srai_epi16(in02, 2); in03 = _mm_srai_epi16(in03, 2); in04 = _mm_srai_epi16(in04, 2); in05 = _mm_srai_epi16(in05, 2); in06 = _mm_srai_epi16(in06, 2); in07 = _mm_srai_epi16(in07, 2); in08 = _mm_srai_epi16(in08, 2); in09 = _mm_srai_epi16(in09, 2); in10 = _mm_srai_epi16(in10, 2); in11 = _mm_srai_epi16(in11, 2); in12 = _mm_srai_epi16(in12, 2); in13 = _mm_srai_epi16(in13, 2); in14 = _mm_srai_epi16(in14, 2); in15 = _mm_srai_epi16(in15, 2); } in += 8; // Calculate input for the first 8 results. 
{ input0 = _mm_add_epi16(in00, in15); input1 = _mm_add_epi16(in01, in14); input2 = _mm_add_epi16(in02, in13); input3 = _mm_add_epi16(in03, in12); input4 = _mm_add_epi16(in04, in11); input5 = _mm_add_epi16(in05, in10); input6 = _mm_add_epi16(in06, in09); input7 = _mm_add_epi16(in07, in08); } // Calculate input for the next 8 results. { step1_0 = _mm_sub_epi16(in07, in08); step1_1 = _mm_sub_epi16(in06, in09); step1_2 = _mm_sub_epi16(in05, in10); step1_3 = _mm_sub_epi16(in04, in11); step1_4 = _mm_sub_epi16(in03, in12); step1_5 = _mm_sub_epi16(in02, in13); step1_6 = _mm_sub_epi16(in01, in14); step1_7 = _mm_sub_epi16(in00, in15); } // Work on the first eight values; fdct8_1d(input, even_results); { // Add/substract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); const __m128i q3 = _mm_add_epi16(input3, input4); const __m128i q4 = _mm_sub_epi16(input3, input4); const __m128i q5 = _mm_sub_epi16(input2, input5); const __m128i q6 = _mm_sub_epi16(input1, input6); const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { // Add/substract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us // into 32 bits. const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res00 = _mm_packs_epi32(w0, w1); res08 = _mm_packs_epi32(w2, w3); res04 = _mm_packs_epi32(w4, w5); res12 = _mm_packs_epi32(w6, w7); } // Work on next four results { // Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); // Add/substract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); const __m128i x3 = _mm_add_epi16(q7, r1); // Interleave to do the multiply by constants which gets us // into 32 bits. const __m128i t0 = _mm_unpacklo_epi16(x0, x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res02 = _mm_packs_epi32(w0, w1); res14 = _mm_packs_epi32(w2, w3); res10 = _mm_packs_epi32(w4, w5); res06 = _mm_packs_epi32(w6, w7); } } // Work on the next eight values; step1 -> odd_results { // step 2 { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = 
_mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_2 = _mm_packs_epi32(w0, w1); step2_3 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_5 = _mm_packs_epi32(w0, w1); step2_4 = _mm_packs_epi32(w2, w3); } // step 3 { step3_0 = _mm_add_epi16(step1_0, step2_3); step3_1 = _mm_add_epi16(step1_1, step2_2); step3_2 = _mm_sub_epi16(step1_1, step2_2); step3_3 = _mm_sub_epi16(step1_0, step2_3); step3_4 = _mm_sub_epi16(step1_7, step2_4); step3_5 = _mm_sub_epi16(step1_6, step2_5); step3_6 = _mm_add_epi16(step1_6, step2_5); step3_7 = _mm_add_epi16(step1_7, step2_4); } // step 4 { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_1 = _mm_packs_epi32(w0, w1); step2_2 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, 
k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_6 = _mm_packs_epi32(w0, w1); step2_5 = _mm_packs_epi32(w2, w3); } // step 5 { step1_0 = _mm_add_epi16(step3_0, step2_1); step1_1 = _mm_sub_epi16(step3_0, step2_1); step1_2 = _mm_sub_epi16(step3_3, step2_2); step1_3 = _mm_add_epi16(step3_3, step2_2); step1_4 = _mm_add_epi16(step3_4, step2_5); step1_5 = _mm_sub_epi16(step3_4, step2_5); step1_6 = _mm_sub_epi16(step3_7, step2_6); step1_7 = _mm_add_epi16(step3_7, step2_6); } // step 6 { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res01 = _mm_packs_epi32(w0, w1); res09 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res05 = _mm_packs_epi32(w0, w1); res13 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = 
_mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res11 = _mm_packs_epi32(w0, w1); res03 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res15 = _mm_packs_epi32(w0, w1); res07 = _mm_packs_epi32(w2, w3); } } // Transpose the results, do it as two 8x8 transposes. { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 
27 37 47 57 67 77 _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); } { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 // Store results _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); _mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); } out += 8*16; } // Setup in/out for next pass. in = intermediate; out = output; } }
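/*
 * Illustrative sketch (added for clarity, not from the original source): the two
 * 8x8 transposes above use the standard unpack cascade -- interleave row pairs at
 * 16-bit, then 32-bit, then 64-bit granularity -- to turn eight row registers into
 * eight column registers; the lane comments above track the same progression.
 * Below is a minimal standalone version of that idiom on one 8x8 block of int16,
 * with a one-element check; the function name and test values are mine.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static void transpose_8x8_epi16(int16_t in[8][8], int16_t out[8][8]) {
  __m128i r[8], t[8], u[8];
  for (int i = 0; i < 8; i++)
    r[i] = _mm_loadu_si128((const __m128i *)in[i]);
  /* 16-bit interleave of row pairs: t[0] = 00 10 01 11 02 12 03 13, ... */
  for (int i = 0; i < 4; i++) {
    t[i]     = _mm_unpacklo_epi16(r[2 * i], r[2 * i + 1]);
    t[i + 4] = _mm_unpackhi_epi16(r[2 * i], r[2 * i + 1]);
  }
  /* 32-bit interleave: u[0] = 00 10 20 30 01 11 21 31, ... */
  u[0] = _mm_unpacklo_epi32(t[0], t[1]); u[1] = _mm_unpackhi_epi32(t[0], t[1]);
  u[2] = _mm_unpacklo_epi32(t[2], t[3]); u[3] = _mm_unpackhi_epi32(t[2], t[3]);
  u[4] = _mm_unpacklo_epi32(t[4], t[5]); u[5] = _mm_unpackhi_epi32(t[4], t[5]);
  u[6] = _mm_unpacklo_epi32(t[6], t[7]); u[7] = _mm_unpackhi_epi32(t[6], t[7]);
  /* 64-bit interleave yields the columns in order. */
  _mm_storeu_si128((__m128i *)out[0], _mm_unpacklo_epi64(u[0], u[2]));
  _mm_storeu_si128((__m128i *)out[1], _mm_unpackhi_epi64(u[0], u[2]));
  _mm_storeu_si128((__m128i *)out[2], _mm_unpacklo_epi64(u[1], u[3]));
  _mm_storeu_si128((__m128i *)out[3], _mm_unpackhi_epi64(u[1], u[3]));
  _mm_storeu_si128((__m128i *)out[4], _mm_unpacklo_epi64(u[4], u[6]));
  _mm_storeu_si128((__m128i *)out[5], _mm_unpackhi_epi64(u[4], u[6]));
  _mm_storeu_si128((__m128i *)out[6], _mm_unpacklo_epi64(u[5], u[7]));
  _mm_storeu_si128((__m128i *)out[7], _mm_unpackhi_epi64(u[5], u[7]));
}

int main(void) {
  int16_t a[8][8], b[8][8];
  for (int r = 0; r < 8; r++)
    for (int c = 0; c < 8; c++)
      a[r][c] = (int16_t)(r * 10 + c);
  transpose_8x8_epi16(a, b);
  printf("b[3][5] = %d (expect 53, i.e. a[5][3])\n", b[3][5]);
  return 0;
}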
// Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); // Load, combine and transpose inputs. { const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 // Transpose the two 4x4, discarding the filling zeroes. const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 // Convert to 16b. tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. 
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and difference of weighted sums. { // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so // we can use _mm_load_si128 instead of _mm_loadu_si128. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); { // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative) const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15); const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15); const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15); const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15); // b = abs(b) = (b ^ sign) - sign A_b0 = _mm_xor_si128(A_b0, sign_A_b0); A_b2 = _mm_xor_si128(A_b2, sign_A_b2); B_b0 = _mm_xor_si128(B_b0, sign_B_b0); B_b2 = _mm_xor_si128(B_b2, sign_B_b2); A_b0 = _mm_sub_epi16(A_b0, sign_A_b0); A_b2 = _mm_sub_epi16(A_b2, sign_A_b2); B_b0 = _mm_sub_epi16(B_b0, sign_B_b0); B_b2 = _mm_sub_epi16(B_b2, sign_B_b2); } // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b0 = _mm_sub_epi32(A_b0, B_b0); _mm_storeu_si128((__m128i*)&sum[0], A_b0); } return sum[0] + sum[1] + sum[2] + sum[3]; }
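/*
 * Illustrative sketch (added, not from the original source): the block above folds
 * out the sign of each 16-bit lane with the SSE2-only identity
 * abs(x) = (x ^ (x >> 15)) - (x >> 15) (arithmetic shift), presumably to stay
 * within the SSE2 baseline; SSSE3 offers _mm_abs_epi16 as a one-instruction
 * alternative.  Minimal demonstration with a scalar printout; note that, like the
 * scalar identity, INT16_MIN maps to itself.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i abs_epi16_sse2(__m128i x) {
  const __m128i sign = _mm_srai_epi16(x, 15);          /* 0x0000 or 0xffff  */
  return _mm_sub_epi16(_mm_xor_si128(x, sign), sign);  /* (x ^ sign) - sign */
}

int main(void) {
  const int16_t v[8] = { 0, 1, -1, 123, -123, 32767, -32767, -32768 };
  int16_t r[8];
  _mm_storeu_si128((__m128i *)r,
                   abs_epi16_sse2(_mm_loadu_si128((const __m128i *)v)));
  for (int i = 0; i < 8; i++)
    printf("%6d -> %6d\n", v[i], r[i]);
  return 0;
}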
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k7500 = _mm_set1_epi32(7500); const __m128i k14500 = _mm_set1_epi32(14500); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Transpose. // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // a02 a12 a22 a32 a03 a13 a23 a33 // a00 a10 a20 a30 a01 a11 a21 a31 // a03 a13 a23 a33 a02 a12 a22 a32 } // First pass and subsequent transpose. { // Same operations are done on the (0,3) and (1,2) pairs. // b0 = (a0 + a3) << 3 // b1 = (a1 + a2) << 3 // b3 = (a0 - a3) << 3 // b2 = (a1 - a2) << 3 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i b01 = _mm_slli_epi16(a01, 3); const __m128i b32 = _mm_slli_epi16(a32, 3); const __m128i b11 = _mm_unpackhi_epi64(b01, b01); const __m128i b22 = _mm_unpackhi_epi64(b32, b32); // e0 = b0 + b1 // e2 = b0 - b1 const __m128i e0 = _mm_add_epi16(b01, b11); const __m128i e2 = _mm_sub_epi16(b01, b11); const __m128i e02 = _mm_unpacklo_epi64(e0, e2); // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 const __m128i b23 = _mm_unpacklo_epi16(b22, b32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k14500); const __m128i d3 = _mm_add_epi32(c3, k7500); const __m128i e1 = _mm_srai_epi32(d1, 12); const __m128i e3 = _mm_srai_epi32(d3, 12); const __m128i e13 = _mm_packs_epi32(e1, e3); // Transpose. 
// 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 02 12 22 32 03 13 23 33 // 00 10 20 30 01 11 21 31 // 03 13 23 33 02 12 22 32 } // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i b0 = _mm_add_epi16(a01, a11); const __m128i b2 = _mm_sub_epi16(a01, a11); const __m128i c0 = _mm_add_epi16(b0, seven); const __m128i c2 = _mm_add_epi16(b2, seven); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); _mm_storel_epi64((__m128i*)&out[ 0], d0); _mm_storel_epi64((__m128i*)&out[ 4], g1); _mm_storel_epi64((__m128i*)&out[ 8], d2); _mm_storel_epi64((__m128i*)&out[12], f3); } }
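/*
 * Illustrative sketch (added, not from the original source): the passes above
 * evaluate pairs like (b3*5352 + b2*2217 + 14500) >> 12 by feeding interleaved
 * 16-bit values and constants to _mm_madd_epi16, adding a rounding constant, and
 * arithmetic-shifting.  The scalar model below reproduces one such output next to
 * the intrinsic version so the data flow is explicit; the input pair is
 * hypothetical, the 5352/2217/14500/12 values come from the first pass above, and
 * the scalar form assumes an arithmetic >> on negative intermediates, matching
 * _mm_srai_epi32.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* One _mm_madd_epi16 lane: two adjacent 16-bit products summed into 32 bits,
 * then rounded and shifted. */
static int32_t madd_round_shift(int16_t a, int16_t ka, int16_t b, int16_t kb,
                                int32_t round, int bits) {
  return ((int32_t)a * ka + (int32_t)b * kb + round) >> bits;
}

int main(void) {
  const int16_t a3 = 40, a2 = -17;  /* hypothetical residual pair */
  const int32_t e1 = madd_round_shift(a3, 5352, a2, 2217, 14500, 12);

  /* Same computation with the intrinsics: pack (a3, a2) against (5352, 2217). */
  const __m128i v = _mm_setr_epi16(a3, a2, 0, 0, 0, 0, 0, 0);
  const __m128i k = _mm_setr_epi16(5352, 2217, 0, 0, 0, 0, 0, 0);
  __m128i s = _mm_madd_epi16(v, k);
  s = _mm_add_epi32(s, _mm_set1_epi32(14500));
  s = _mm_srai_epi32(s, 12);
  printf("scalar %d, simd %d\n", e1, _mm_cvtsi128_si32(s));
  return 0;
}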
static inline uint16_t fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union fm10k_rx_desc *rxdp; struct rte_mbuf **mbufp; uint16_t nb_pkts_recd; int pos; struct fm10k_rx_queue *rxq = rx_queue; uint64_t var; __m128i shuf_msk; __m128i dd_check, eop_check; uint16_t next_dd; next_dd = rxq->next_dd; /* Just the act of getting into the function from the application is * going to cost about 7 cycles */ rxdp = rxq->hw_ring + next_dd; _mm_prefetch((const void *)rxdp, _MM_HINT_T0); /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ if (rxq->rxrearm_nb > RTE_FM10K_RXQ_REARM_THRESH) fm10k_rxq_rearm(rxq); /* Before we start moving massive data around, check to see if * there is actually a packet available */ if (!(rxdp->d.staterr & FM10K_RXD_STATUS_DD)) return 0; /* Vecotr RX will process 4 packets at a time, strip the unaligned * tails in case it's not multiple of 4. */ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_FM10K_DESCS_PER_LOOP); /* 4 packets DD mask */ dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL); /* 4 packets EOP mask */ eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL); /* mask to shuffle from desc. to mbuf */ shuf_msk = _mm_set_epi8( 7, 6, 5, 4, /* octet 4~7, 32bits rss */ 15, 14, /* octet 14~15, low 16 bits vlan_macip */ 13, 12, /* octet 12~13, 16 bits data_len */ 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ 13, 12, /* octet 12~13, low 16 bits pkt_len */ 0xFF, 0xFF, /* skip high 16 bits pkt_type */ 0xFF, 0xFF /* Skip pkt_type field in shuffle operation */ ); /* Cache is empty -> need to scan the buffer rings, but first move * the next 'n' mbufs into the cache */ mbufp = &rxq->sw_ring[next_dd]; /* A. load 4 packet in one loop * [A*. mask out 4 unused dirty field in desc] * B. copy 4 mbuf point from swring to rx_pkts * C. calc the number of DD bits among the 4 packets * [C*. extract the end-of-packet bit, if requested] * D. fill info. from desc to mbuf */ for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts; pos += RTE_FM10K_DESCS_PER_LOOP, rxdp += RTE_FM10K_DESCS_PER_LOOP) { __m128i descs0[RTE_FM10K_DESCS_PER_LOOP]; __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; __m128i zero, staterr, sterr_tmp1, sterr_tmp2; __m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. 
*/ /* B.1 load 1 mbuf point */ mbp1 = _mm_loadu_si128((__m128i *)&mbufp[pos]); /* Read desc statuses backwards to avoid race condition */ /* A.1 load 4 pkts desc */ descs0[3] = _mm_loadu_si128((__m128i *)(rxdp + 3)); /* B.2 copy 2 mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1); /* B.1 load 1 mbuf point */ mbp2 = _mm_loadu_si128((__m128i *)&mbufp[pos+2]); descs0[2] = _mm_loadu_si128((__m128i *)(rxdp + 2)); /* B.1 load 2 mbuf point */ descs0[1] = _mm_loadu_si128((__m128i *)(rxdp + 1)); descs0[0] = _mm_loadu_si128((__m128i *)(rxdp)); /* B.2 copy 2 mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2); /* avoid compiler reorder optimization */ rte_compiler_barrier(); if (split_packet) { rte_mbuf_prefetch_part2(rx_pkts[pos]); rte_mbuf_prefetch_part2(rx_pkts[pos + 1]); rte_mbuf_prefetch_part2(rx_pkts[pos + 2]); rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); } /* D.1 pkt 3,4 convert format from desc to pktmbuf */ pkt_mb4 = _mm_shuffle_epi8(descs0[3], shuf_msk); pkt_mb3 = _mm_shuffle_epi8(descs0[2], shuf_msk); /* C.1 4=>2 filter staterr info only */ sterr_tmp2 = _mm_unpackhi_epi32(descs0[3], descs0[2]); /* C.1 4=>2 filter staterr info only */ sterr_tmp1 = _mm_unpackhi_epi32(descs0[1], descs0[0]); /* set ol_flags with vlan packet type */ fm10k_desc_to_olflags_v(descs0, &rx_pkts[pos]); /* D.1 pkt 1,2 convert format from desc to pktmbuf */ pkt_mb2 = _mm_shuffle_epi8(descs0[1], shuf_msk); pkt_mb1 = _mm_shuffle_epi8(descs0[0], shuf_msk); /* C.2 get 4 pkts staterr value */ zero = _mm_xor_si128(dd_check, dd_check); staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2); /* D.3 copy final 3,4 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1, pkt_mb4); _mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1, pkt_mb3); /* C* extract and record EOP bit */ if (split_packet) { __m128i eop_shuf_mask = _mm_set_epi8( 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x04, 0x0C, 0x00, 0x08 ); /* and with mask to extract bits, flipping 1-0 */ __m128i eop_bits = _mm_andnot_si128(staterr, eop_check); /* the staterr values are not in order, as the count * count of dd bits doesn't care. However, for end of * packet tracking, we do care, so shuffle. This also * compresses the 32-bit values to 8-bit */ eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask); /* store the resulting 32-bit value */ *(int *)split_packet = _mm_cvtsi128_si32(eop_bits); split_packet += RTE_FM10K_DESCS_PER_LOOP; /* zero-out next pointers */ rx_pkts[pos]->next = NULL; rx_pkts[pos + 1]->next = NULL; rx_pkts[pos + 2]->next = NULL; rx_pkts[pos + 3]->next = NULL; } /* C.3 calc available number of desc */ staterr = _mm_and_si128(staterr, dd_check); staterr = _mm_packs_epi32(staterr, zero); /* D.3 copy final 1,2 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1, pkt_mb2); _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1, pkt_mb1); fm10k_desc_to_pktype_v(descs0, &rx_pkts[pos]); /* C.4 calc avaialbe number of desc */ var = __builtin_popcountll(_mm_cvtsi128_si64(staterr)); nb_pkts_recd += var; if (likely(var != RTE_FM10K_DESCS_PER_LOOP)) break; } /* Update our internal tail pointer */ rxq->next_dd = (uint16_t)(rxq->next_dd + nb_pkts_recd); rxq->next_dd = (uint16_t)(rxq->next_dd & (rxq->nb_desc - 1)); rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd); return nb_pkts_recd; }
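/*
 * Illustrative sketch (added, not from the original source): the burst loop above
 * popcounts the masked DD bits of each group of four descriptors, adds the count
 * to nb_pkts_recd, and stops as soon as it is below RTE_FM10K_DESCS_PER_LOOP.
 * The scalar model below shows that decision, under the usual assumption that the
 * NIC completes descriptors in ring order (which is also why the code reads them
 * backwards), so a popcount of the DD bits equals the number of leading completed
 * slots.  Names and the status values are mine.
 */
#include <stdio.h>

#define DESCS_PER_LOOP 4 /* mirrors RTE_FM10K_DESCS_PER_LOOP above */

static unsigned count_done(const unsigned dd[DESCS_PER_LOOP]) {
  unsigned n = 0;
  /* With in-order completion a cleared DD bit means nothing after it is ready
   * either, so counting leading set bits matches the popcount. */
  for (unsigned i = 0; i < DESCS_PER_LOOP && dd[i]; i++)
    n++;
  return n;
}

int main(void) {
  const unsigned dd[DESCS_PER_LOOP] = { 1, 1, 0, 0 }; /* hypothetical statuses */
  const unsigned n = count_done(dd);
  printf("received %u packet(s)%s\n", n,
         n != DESCS_PER_LOOP ? ", stop the burst" : "");
  return 0;
}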
static void tranpose8x8(__m128i *input,int i_indx, __m128i *Transposed,int t_indx) { __m128i a; __m128i b; __m128i c; __m128i d; __m128i e; __m128i f; __m128i g; __m128i h; __m128i temp1; __m128i temp2; __m128i temp3; __m128i temp4; __m128i temp5; __m128i temp6; __m128i temp7; __m128i temp8; __m128i temp9; __m128i temp10; __m128i temp11; __m128i temp12; __m128i temp13; __m128i temp14; __m128i temp15; __m128i temp16; __m128i T0; __m128i T1; __m128i T2; __m128i T3; __m128i T4; __m128i T5; __m128i T6; __m128i T7; a = _mm_load_si128(&input[i_indx]); b = _mm_load_si128(&input[i_indx+4 ]); c = _mm_load_si128(&input[i_indx+8 ]); d = _mm_load_si128(&input[i_indx+12]); e = _mm_load_si128(&input[i_indx+16]); f = _mm_load_si128(&input[i_indx+20]); g = _mm_load_si128(&input[i_indx+24]); h = _mm_load_si128(&input[i_indx+28]); temp1 = _mm_unpacklo_epi16(a, b); //a03b03 temp2 = _mm_unpacklo_epi16(c, d); temp3 = _mm_unpacklo_epi16(e, f); temp4 = _mm_unpacklo_epi16(g, h); temp5 = _mm_unpackhi_epi16(a, b); temp6 = _mm_unpackhi_epi16(c, d); temp7 = _mm_unpackhi_epi16(e, f); temp8 = _mm_unpackhi_epi16(g, h); temp9 = _mm_unpacklo_epi32(temp1, temp2); //a01b01c01d01 temp10 = _mm_unpackhi_epi32(temp1, temp2); temp11 = _mm_unpacklo_epi32(temp3, temp4); temp12 = _mm_unpackhi_epi32(temp3, temp4); temp13 = _mm_unpacklo_epi32(temp5, temp6); temp14 = _mm_unpackhi_epi32(temp5, temp6); temp15 = _mm_unpacklo_epi32(temp7, temp8); temp16 = _mm_unpackhi_epi32(temp7, temp8); T0 = _mm_unpacklo_epi64(temp9, temp11); //a0b0c0d0e0f0g0h0 T1 = _mm_unpackhi_epi64(temp9, temp11); T2 = _mm_unpacklo_epi64(temp10, temp12); T3 = _mm_unpackhi_epi64(temp10, temp12); T4 = _mm_unpacklo_epi64(temp13, temp15); T5 = _mm_unpackhi_epi64(temp13, temp15); T6 = _mm_unpacklo_epi64(temp14, temp16); T7 = _mm_unpackhi_epi64(temp14, temp16); _mm_store_si128(&Transposed[t_indx], T0); //store transposed 8X8 matrix _mm_store_si128(&Transposed[t_indx+1], T1); _mm_store_si128(&Transposed[t_indx+2], T2); _mm_store_si128(&Transposed[t_indx+3], T3); _mm_store_si128(&Transposed[t_indx+4], T4); _mm_store_si128(&Transposed[t_indx+5], T5); _mm_store_si128(&Transposed[t_indx+6], T6); _mm_store_si128(&Transposed[t_indx+7], T7); }
mlib_status mlib_VideoColorJFIFYCC2RGB444_S16_naligned( mlib_s16 *rgb, const mlib_s16 *y, const mlib_s16 *cb, const mlib_s16 *cr, mlib_s32 n) { /* 0 & 1.402*16384 */ const __m128i x_c1 = _mm_setr_epi16(0, 22970, 0, 22970, 0, 22970, 0, 22970); /* -0.34414*16384 & -0.71414*16384 */ const __m128i x_c2 = _mm_setr_epi16(-5638, -11700, -5638, -11700, -5638, -11700, -5638, -11700); /* 1.772*16384 & 0 */ const __m128i x_c3 = _mm_setr_epi16(29032, 0, 29032, 0, 29032, 0, 29032, 0); const __m128i x_coff = _mm_set1_epi16(2048); const __m128i x_cps1 = _mm_set1_epi32(0x8000); const __m128i x_cps2 = _mm_set1_epi16(0x8000); const __m128i x_zero = _mm_setzero_si128(); const __m128i x_mask1 = _mm_setr_epi32(0xffffffff, 0xffff, 0, 0); const __m128i x_mask2 = _mm_setr_epi32(0, 0xffff0000, 0xffffffff, 0); /* __m128i variables */ __m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_y1, x_y2; __m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2, x_t1, x_t2; __m128i x_rgbl, x_rgbh, x_rgl, x_rgh, x_bbl, x_bbh; __m128i x_cbcr1, x_cbcr2; /* pointers */ __m128i *px_y, *px_cb, *px_cr; mlib_s16 *prgb; /* other var */ mlib_d64 fr, fg, fb, fy, fcb, fcr; mlib_s32 i; px_y = (__m128i *)y; px_cb = (__m128i *)cb; px_cr = (__m128i *)cr; prgb = rgb; i = 0; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (; i <= n - 16; i += 8) { x_y = _mm_loadu_si128(px_y); x_y1 = _mm_unpacklo_epi16(x_y, x_zero); x_y1 = _mm_slli_epi32(x_y1, 4); x_y2 = _mm_unpackhi_epi16(x_y, x_zero); x_y2 = _mm_slli_epi32(x_y2, 4); px_y++; x_cb = _mm_loadu_si128(px_cb); x_cb = _mm_sub_epi16(x_cb, x_coff); px_cb++; x_cr = _mm_loadu_si128(px_cr); x_cr = _mm_sub_epi16(x_cr, x_coff); px_cr++; x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr); x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr); /* calc r/g/b */ x_t1 = _mm_madd_epi16(x_cbcr1, x_c1); x_t1 = _mm_srai_epi32(x_t1, 10); x_r1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c2); x_t1 = _mm_srai_epi32(x_t1, 10); x_g1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c3); x_t1 = _mm_srai_epi32(x_t1, 10); x_b1 = _mm_add_epi32(x_t1, x_y1); x_t2 = _mm_madd_epi16(x_cbcr2, x_c1); x_t2 = _mm_srai_epi32(x_t2, 10); x_r2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c2); x_t2 = _mm_srai_epi32(x_t2, 10); x_g2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c3); x_t2 = _mm_srai_epi32(x_t2, 10); x_b2 = _mm_add_epi32(x_t2, x_y2); /* signed pack & shift */ x_r1 = _mm_sub_epi32(x_r1, x_cps1); x_r2 = _mm_sub_epi32(x_r2, x_cps1); x_r = _mm_packs_epi32(x_r1, x_r2); x_r = _mm_add_epi16(x_r, x_cps2); x_r = _mm_srli_epi16(x_r, 4); x_g1 = _mm_sub_epi32(x_g1, x_cps1); x_g2 = _mm_sub_epi32(x_g2, x_cps1); x_g = _mm_packs_epi32(x_g1, x_g2); x_g = _mm_add_epi16(x_g, x_cps2); x_g = _mm_srli_epi16(x_g, 4); x_b1 = _mm_sub_epi32(x_b1, x_cps1); x_b2 = _mm_sub_epi32(x_b2, x_cps1); x_b = _mm_packs_epi32(x_b1, x_b2); x_b = _mm_add_epi16(x_b, x_cps2); x_b = _mm_srli_epi16(x_b, 4); /* create rgb sequences */ x_rgl = _mm_unpacklo_epi16(x_r, x_g); x_rgh = _mm_unpackhi_epi16(x_r, x_g); x_bbl = _mm_unpacklo_epi16(x_b, x_b); x_bbh = _mm_unpackhi_epi16(x_b, x_b); /* save */ x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbh); x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh); PACK_RGB1(x_rgbh); } if (i <= (n - 8)) { x_y = _mm_loadu_si128(px_y); x_y1 = _mm_unpacklo_epi16(x_y, x_zero); x_y1 = _mm_slli_epi32(x_y1, 4); x_y2 = _mm_unpackhi_epi16(x_y, x_zero); x_y2 = _mm_slli_epi32(x_y2, 4); px_y++; 
x_cb = _mm_loadu_si128(px_cb); x_cb = _mm_sub_epi16(x_cb, x_coff); px_cb++; x_cr = _mm_loadu_si128(px_cr); x_cr = _mm_sub_epi16(x_cr, x_coff); px_cr++; x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr); x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr); /* calc r/g/b */ x_t1 = _mm_madd_epi16(x_cbcr1, x_c1); x_t1 = _mm_srai_epi32(x_t1, 10); x_r1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c2); x_t1 = _mm_srai_epi32(x_t1, 10); x_g1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c3); x_t1 = _mm_srai_epi32(x_t1, 10); x_b1 = _mm_add_epi32(x_t1, x_y1); x_t2 = _mm_madd_epi16(x_cbcr2, x_c1); x_t2 = _mm_srai_epi32(x_t2, 10); x_r2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c2); x_t2 = _mm_srai_epi32(x_t2, 10); x_g2 = _mm_add_epi32(x_t2, x_y2); x_t2 = _mm_madd_epi16(x_cbcr2, x_c3); x_t2 = _mm_srai_epi32(x_t2, 10); x_b2 = _mm_add_epi32(x_t2, x_y2); /* signed pack & shift */ x_r1 = _mm_sub_epi32(x_r1, x_cps1); x_r2 = _mm_sub_epi32(x_r2, x_cps1); x_r = _mm_packs_epi32(x_r1, x_r2); x_r = _mm_add_epi16(x_r, x_cps2); x_r = _mm_srli_epi16(x_r, 4); x_g1 = _mm_sub_epi32(x_g1, x_cps1); x_g2 = _mm_sub_epi32(x_g2, x_cps1); x_g = _mm_packs_epi32(x_g1, x_g2); x_g = _mm_add_epi16(x_g, x_cps2); x_g = _mm_srli_epi16(x_g, 4); x_b1 = _mm_sub_epi32(x_b1, x_cps1); x_b2 = _mm_sub_epi32(x_b2, x_cps1); x_b = _mm_packs_epi32(x_b1, x_b2); x_b = _mm_add_epi16(x_b, x_cps2); x_b = _mm_srli_epi16(x_b, 4); /* create rgb sequences */ x_rgl = _mm_unpacklo_epi16(x_r, x_g); x_rgh = _mm_unpackhi_epi16(x_r, x_g); x_bbl = _mm_unpacklo_epi16(x_b, x_b); x_bbh = _mm_unpackhi_epi16(x_b, x_b); /* save */ x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbh); x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh); PACK_RGB2(x_rgbh); i += 8; } if (i <= (n - 4)) { x_y = _mm_loadl_epi64(px_y); x_y1 = _mm_unpacklo_epi16(x_y, x_zero); x_y1 = _mm_slli_epi32(x_y1, 4); px_y = (__m128i *)(((__m64 *)px_y) + 1); x_cb = _mm_loadl_epi64(px_cb); x_cb = _mm_sub_epi16(x_cb, x_coff); px_cb = (__m128i *)(((__m64 *)px_cb) + 1); x_cr = _mm_loadl_epi64(px_cr); x_cr = _mm_sub_epi16(x_cr, x_coff); px_cr = (__m128i *)(((__m64 *)px_cr) + 1); x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr); /* calc r/g/b */ x_t1 = _mm_madd_epi16(x_cbcr1, x_c1); x_t1 = _mm_srai_epi32(x_t1, 10); x_r1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c2); x_t1 = _mm_srai_epi32(x_t1, 10); x_g1 = _mm_add_epi32(x_t1, x_y1); x_t1 = _mm_madd_epi16(x_cbcr1, x_c3); x_t1 = _mm_srai_epi32(x_t1, 10); x_b1 = _mm_add_epi32(x_t1, x_y1); /* signed pack & shift */ x_r1 = _mm_sub_epi32(x_r1, x_cps1); x_r = _mm_packs_epi32(x_r1, x_zero); x_r = _mm_add_epi16(x_r, x_cps2); x_r = _mm_srli_epi16(x_r, 4); x_g1 = _mm_sub_epi32(x_g1, x_cps1); x_g = _mm_packs_epi32(x_g1, x_zero); x_g = _mm_add_epi16(x_g, x_cps2); x_g = _mm_srli_epi16(x_g, 4); x_b1 = _mm_sub_epi32(x_b1, x_cps1); x_b = _mm_packs_epi32(x_b1, x_zero); x_b = _mm_add_epi16(x_b, x_cps2); x_b = _mm_srli_epi16(x_b, 4); /* create rgb sequences */ x_rgl = _mm_unpacklo_epi16(x_r, x_g); x_bbl = _mm_unpacklo_epi16(x_b, x_b); /* save */ x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl); PACK_RGB1(x_rgbl); x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl); PACK_RGB2(x_rgbh); i += 4; } /* pure C implementation */ for (; i < n; i++) { fy = y[i] * SCALE - SAT; fcb = (mlib_d64)((cb[i] - 2048) << 20); fcr = (mlib_d64)((cr[i] - 2048) << 20); fr = fy + 1.40200f * fcr; fg = fy - 0.34414f * fcb - 0.71414f * fcr; fb = fy + 1.77200f * 
fcb; rgb[3 * i] = CLAMP_U12(fr); rgb[3 * i + 1] = CLAMP_U12(fg); rgb[3 * i + 2] = CLAMP_U12(fb); } return (MLIB_SUCCESS); }
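/*
 * Illustrative sketch (added, not from the original source): the vector loops
 * above evaluate the JFIF YCbCr->RGB equations in fixed point with the Q14
 * constants defined there (22970 ~ 1.402*16384, -5638 ~ -0.34414*16384,
 * -11700 ~ -0.71414*16384, 29032 ~ 1.772*16384) and chroma re-centred on 2048.
 * The scalar model below reproduces the per-pixel arithmetic up to the point where
 * the SIMD path saturates (packs) and shifts right by 4; it assumes an arithmetic
 * right shift for negative intermediates, matching _mm_srai_epi32.
 */
#include <stdint.h>
#include <stdio.h>

static void ycc_to_rgb_model(int16_t y, int16_t cb, int16_t cr,
                             int32_t *r, int32_t *g, int32_t *b) {
  const int32_t y4  = (int32_t)y << 4;   /* matches the <<4 applied to x_y1/x_y2 */
  const int32_t cbd = cb - 2048;
  const int32_t crd = cr - 2048;
  *r = y4 + ((22970 * crd) >> 10);
  *g = y4 + ((-5638 * cbd - 11700 * crd) >> 10);
  *b = y4 + ((29032 * cbd) >> 10);       /* vector path then clamps and >>4 */
}

int main(void) {
  int32_t r, g, b;
  ycc_to_rgb_model(1000, 2048, 2048, &r, &g, &b); /* neutral chroma: r = g = b */
  printf("r=%d g=%d b=%d (each >>4 -> %d)\n", r, g, b, r >> 4);
  return 0;
}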
static void trans_g_aiT16(__m128i *input, __m128i *Transposed) { __m128i a; __m128i b; __m128i c; __m128i d; __m128i e; __m128i f; __m128i g; __m128i h; __m128i temp1; __m128i temp2; __m128i temp3; __m128i temp4; __m128i temp5; __m128i temp6; __m128i temp7; __m128i temp8; __m128i temp9; __m128i temp10; __m128i temp11; __m128i temp12; __m128i temp13; __m128i temp14; __m128i temp15; __m128i temp16; __m128i T0; __m128i T1; __m128i T2; __m128i T3; __m128i T4; __m128i T5; __m128i T6; __m128i T7; a = _mm_load_si128(&input[2]); b = _mm_load_si128(&input[6]); c = _mm_load_si128(&input[10]); d = _mm_load_si128(&input[14]); e = _mm_load_si128(&input[18]); f = _mm_load_si128(&input[22]); g = _mm_load_si128(&input[26]); h = _mm_load_si128(&input[30]); //store 128 bits of integer data into the memory address given _mm_store_si128(&Transposed[0], a); //store transposed 8X8 matrix _mm_store_si128(&Transposed[1], b); _mm_store_si128(&Transposed[2], c); _mm_store_si128(&Transposed[3], d); _mm_store_si128(&Transposed[4], e); _mm_store_si128(&Transposed[5], f); _mm_store_si128(&Transposed[6], g); _mm_store_si128(&Transposed[7], h); //load matrix input[0][0],[2][0]... a = _mm_load_si128(&input[0]); b = _mm_load_si128(&input[4]); c = _mm_load_si128(&input[8]); d = _mm_load_si128(&input[12]); e = _mm_load_si128(&input[16]); f = _mm_load_si128(&input[20]); g = _mm_load_si128(&input[24]); h = _mm_load_si128(&input[28]); temp1 = _mm_unpacklo_epi16(a, b); temp2 = _mm_unpacklo_epi16(c, d); temp3 = _mm_unpacklo_epi16(e, f); temp4 = _mm_unpacklo_epi16(g, h); temp5 = _mm_unpackhi_epi16(a, b); temp6 = _mm_unpackhi_epi16(c, d); temp7 = _mm_unpackhi_epi16(e, f); temp8 = _mm_unpackhi_epi16(g, h); temp9 = _mm_unpacklo_epi32(temp1, temp2); temp10 = _mm_unpackhi_epi32(temp1, temp2); temp11 = _mm_unpacklo_epi32(temp3, temp4); temp12 = _mm_unpackhi_epi32(temp3, temp4); temp13 = _mm_unpacklo_epi32(temp5, temp6); temp14 = _mm_unpackhi_epi32(temp5, temp6); temp15 = _mm_unpacklo_epi32(temp7, temp8); temp16 = _mm_unpackhi_epi32(temp7, temp8); T0 = _mm_unpacklo_epi64(temp9, temp11); T1 = _mm_unpackhi_epi64(temp9, temp11); T2 = _mm_unpacklo_epi64(temp10, temp12); T3 = _mm_unpackhi_epi64(temp10, temp12); _mm_store_si128(&Transposed[8], T0); //store transposed 8X8 matrix _mm_store_si128(&Transposed[9], T1); _mm_store_si128(&Transposed[10], T2); _mm_store_si128(&Transposed[11], T3); }
void ulsch_channel_compensation(int **rxdataF_ext, int **ul_ch_estimates_ext, int **ul_ch_mag, int **ul_ch_magb, int **rxdataF_comp, LTE_DL_FRAME_PARMS *frame_parms, unsigned char symbol, unsigned char Qm, unsigned short nb_rb, unsigned char output_shift) { unsigned short rb; __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128; unsigned char aarx;//,symbol_mod; // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol; #ifndef __SSE3__ zeroU = _mm_xor_si128(zeroU,zeroU); #endif // printf("comp: symbol %d\n",symbol); if (Qm == 4) QAM_amp128U = _mm_set1_epi16(QAM16_n1); else if (Qm == 6) { QAM_amp128U = _mm_set1_epi16(QAM64_n1); QAM_amp128bU = _mm_set1_epi16(QAM64_n2); } for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) { ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128 = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128b = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp128 = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12]; for (rb=0;rb<nb_rb;rb++) { // printf("comp: symbol %d rb %d\n",symbol,rb); #ifdef OFDMA_ULSCH if (Qm>2) { // get channel amplitude if not QPSK mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b[0] = ul_ch_mag128[0]; ul_ch_mag128[0] = _mm_mulhi_epi16(ul_ch_mag128[0],QAM_amp128U); ul_ch_mag128[0] = _mm_slli_epi16(ul_ch_mag128[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128b[1] = ul_ch_mag128[1]; ul_ch_mag128[1] = _mm_mulhi_epi16(ul_ch_mag128[1],QAM_amp128U); ul_ch_mag128[1] = _mm_slli_epi16(ul_ch_mag128[1],2); // 2 to compensate the scale channel estimate mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); ul_ch_mag128b[2] = ul_ch_mag128[2]; ul_ch_mag128[2] = _mm_mulhi_epi16(ul_ch_mag128[2],QAM_amp128U); ul_ch_mag128[2] = _mm_slli_epi16(ul_ch_mag128[2],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[0] = _mm_mulhi_epi16(ul_ch_mag128b[0],QAM_amp128bU); ul_ch_mag128b[0] = _mm_slli_epi16(ul_ch_mag128b[0],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[1] = _mm_mulhi_epi16(ul_ch_mag128b[1],QAM_amp128bU); ul_ch_mag128b[1] = _mm_slli_epi16(ul_ch_mag128b[1],2); // 2 to compensate the scale channel estimate ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU); ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2);// 2 to compensate the scale channel estimate } #else mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1); mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift-1); mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1); ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0); ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0); mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]); mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1); mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0); ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1); // printf("comp: 
symbol %d rb %d => %d,%d,%d\n",symbol,rb,*((short*)&ul_ch_mag128[0]),*((short*)&ul_ch_mag128[1]),*((short*)&ul_ch_mag128[2])); #endif // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]); // print_ints("re",&mmtmpU0); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[0],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]); // print_ints("im",&mmtmpU1); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); // print_ints("re(shift)",&mmtmpU0); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); // print_ints("im(shift)",&mmtmpU1); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); // print_ints("c0",&mmtmpU2); // print_ints("c1",&mmtmpU3); rxdataF_comp128[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[0]); // print_shorts("ch:",ul_ch128[0]); // print_shorts("pack:",rxdataF_comp128[0]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[1],rxdataF128[1]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[1],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[1]); // print_shorts("ch:",ul_ch128[1]); // print_shorts("pack:",rxdataF_comp128[1]); // multiply by conjugated channel mmtmpU0 = _mm_madd_epi16(ul_ch128[2],rxdataF128[2]); // mmtmpU0 contains real part of 4 consecutive outputs (32-bit) mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[2],_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1)); mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate); mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]); // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit) mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift); mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift); mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1); mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1); rxdataF_comp128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3); // print_shorts("rx:",rxdataF128[2]); // print_shorts("ch:",ul_ch128[2]); // print_shorts("pack:",rxdataF_comp128[2]); ul_ch128+=3; ul_ch_mag128+=3; ul_ch_mag128b+=3; rxdataF128+=3; rxdataF_comp128+=3; } } _mm_empty(); _m_empty(); }
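/*
 * Illustrative sketch (added, not from the original source): each "multiply by
 * conjugated channel" block above computes, for four complex int16 samples laid
 * out as (re, im) pairs, re = ch_re*rx_re + ch_im*rx_im with one madd, and
 * im = ch_re*rx_im - ch_im*rx_re by swapping each pair, flipping one sign, and
 * madd-ing again, before shifting and re-packing.  The standalone version below
 * assumes the conjugate[] table (not shown in this excerpt) is the alternating
 * {-1, 1, ...} mask, which is what the identity requires; the test sample is mine.
 */
#include <tmmintrin.h>   /* SSSE3: _mm_sign_epi16, as used above */
#include <stdint.h>
#include <stdio.h>

static __m128i mul_by_conj_ch(__m128i ch, __m128i rx, int output_shift) {
  const __m128i conj = _mm_setr_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
  /* real part: ch_re*rx_re + ch_im*rx_im for each of the 4 samples */
  __m128i re = _mm_madd_epi16(ch, rx);
  /* imaginary part: swap (re, im) -> (im, re), negate the im copy, madd */
  __m128i sw = _mm_shufflelo_epi16(ch, _MM_SHUFFLE(2, 3, 0, 1));
  sw = _mm_shufflehi_epi16(sw, _MM_SHUFFLE(2, 3, 0, 1));
  sw = _mm_sign_epi16(sw, conj);
  __m128i im = _mm_madd_epi16(sw, rx);
  re = _mm_srai_epi32(re, output_shift);
  im = _mm_srai_epi32(im, output_shift);
  /* re-interleave to (re, im) pairs and saturate back to int16 */
  const __m128i lo = _mm_unpacklo_epi32(re, im);
  const __m128i hi = _mm_unpackhi_epi32(re, im);
  return _mm_packs_epi32(lo, hi);
}

int main(void) {
  /* one sample: ch = 3 + 4j, rx = 1 + 2j  ->  conj(ch)*rx = 11 + 2j */
  const __m128i ch = _mm_setr_epi16(3, 4, 0, 0, 0, 0, 0, 0);
  const __m128i rx = _mm_setr_epi16(1, 2, 0, 0, 0, 0, 0, 0);
  int16_t out[8];
  _mm_storeu_si128((__m128i *)out, mul_by_conj_ch(ch, rx, 0));
  printf("re=%d im=%d\n", out[0], out[1]);
  return 0;
}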
/// CURRENTLY SAME CODE AS SCALAR !! /// REPLACE HERE WITH SSE intrinsics static void partialButterflyInverse16_simd(short *src, short *dst, int shift) { int add = 1<<(shift-1); //we cast the original 16X16 matrix to an SIMD vector type __m128i *g_aiT16_vec = (__m128i *)g_aiT16; //We cast the input source (which is basically random numbers(see the main function for details)) to an SIMD vector type //We also cast the output to an SIMD vector type __m128i *in_vec = (__m128i *) src; __m128i *out_vec = (__m128i *) dst; //we declare an 8X8 array and cast it to an SIMD vector type short gt[8][8] __attribute__ ((aligned (16))); __m128i *gt_vec = (__m128i *)gt; //we declare an 16X16 array and cast it to an SIMD vector type short random[16][16] __attribute__ ((aligned (16))); __m128i *random_vec = (__m128i *)random; trans_g_aiT16(g_aiT16_vec,gt_vec); tranpose8x8(in_vec,2, random_vec,0); tranpose8x8(in_vec,3, random_vec,8); tranpose8x8(in_vec,0, random_vec,16); tranpose8x8(in_vec,1, random_vec,24); for (int j=0; j<16; j++) { /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ __m128i I0 = _mm_load_si128 (&random_vec[j]); __m128i II0 = _mm_load_si128 (&random_vec[j+16]); // for (int k=0; k<8; k++) //here we are loading up the transposed values in the initial matrix //multiplying it with the input numbers to produce intermediate 32-bit integers // we then sum up adjacent pairs of 32-bit integers and store them in the destination register __m128i I1 = _mm_load_si128 (>_vec[0]); __m128i I2 = _mm_madd_epi16 (I1, I0); __m128i I3 = _mm_load_si128 (>_vec[1]); __m128i I4 = _mm_madd_epi16 (I3, I0); __m128i I5 = _mm_load_si128 (>_vec[2]); __m128i I6 = _mm_madd_epi16 (I5, I0); __m128i I7 = _mm_load_si128 (>_vec[3]); __m128i I8 = _mm_madd_epi16 (I7, I0); __m128i I9 = _mm_load_si128 (>_vec[4]); __m128i I10 = _mm_madd_epi16 (I9, I0); __m128i I11 = _mm_load_si128 (>_vec[5]); __m128i I12 = _mm_madd_epi16 (I11, I0); __m128i I13 = _mm_load_si128 (>_vec[6]); __m128i I14 = _mm_madd_epi16 (I13, I0); __m128i I15 = _mm_load_si128 (>_vec[7]); __m128i I16 = _mm_madd_epi16 (I15, I0); //horizontally add the partial results obtained from thee previous step __m128i A1 =_mm_hadd_epi32 (I2, I4); __m128i A2 =_mm_hadd_epi32 (I6, I8); __m128i R1 =_mm_hadd_epi32 (A1, A2); __m128i A3 =_mm_hadd_epi32 (I10, I12); __m128i A4 =_mm_hadd_epi32 (I14, I16); __m128i R2 =_mm_hadd_epi32 (A3, A4); // O[k] = T[0]+T[1]+T[2]+T[3]; // for (int k=0; k<4; k++) // { //load the original matrix values, multiply it with the random values //store the low bits to I2 and the hi bits to I3 I1 = _mm_load_si128 (>_vec[8]); I2 = _mm_mullo_epi16 (I1, II0); I3 = _mm_mulhi_epi16 (I1, II0); __m128i lowI23 = _mm_unpacklo_epi16(I2,I3); __m128i hiI23 = _mm_unpackhi_epi16(I2,I3); __m128i temp1 = _mm_add_epi32(lowI23,hiI23); __m128i temp5 = _mm_hsub_epi32 (lowI23, hiI23); I4 = _mm_load_si128 (>_vec[9]); I5 = _mm_mullo_epi16 (I4, II0); I6 = _mm_mulhi_epi16 (I4, II0); __m128i lowI56 = _mm_unpacklo_epi16(I5,I6); __m128i hiI56 = _mm_unpackhi_epi16(I5,I6); __m128i temp2 = _mm_add_epi32(lowI56,hiI56); __m128i temp6 = _mm_hsub_epi32 (lowI56, hiI56); I7 = _mm_load_si128 (>_vec[10]); I8 = _mm_mullo_epi16 (I7, II0); I9 = _mm_mulhi_epi16 (I7, II0); __m128i lowI89 = _mm_unpacklo_epi16(I8,I9); __m128i hiI89 = _mm_unpackhi_epi16(I8,I9); __m128i temp3 = _mm_add_epi32(lowI89,hiI89); __m128i temp7 = _mm_hsub_epi32 (lowI89, hiI89); I10 = _mm_load_si128 (>_vec[11]); I11 = _mm_mullo_epi16 (I10, II0); I12 = _mm_mulhi_epi16 (I10, II0); __m128i lowI1112 = 
_mm_unpacklo_epi16(I11,I12); __m128i hiI1112 = _mm_unpackhi_epi16(I11,I12); __m128i temp4 = _mm_add_epi32(lowI1112,hiI1112); __m128i temp8 = _mm_hsub_epi32 (lowI1112, hiI1112); __m128i A5 =_mm_hadd_epi32 (temp1, temp2); __m128i A6 =_mm_hadd_epi32 (temp3, temp4); __m128i R3 =_mm_hadd_epi32 (A5, A6); __m128i A7 =_mm_hadd_epi32 (temp8, temp7); __m128i A8 =_mm_hadd_epi32 (temp6, temp5); __m128i R4 =_mm_hadd_epi32 (A7, A8); /////////////////////////// __m128i add_reg = _mm_set1_epi32(add); __m128i sum_vec0 = _mm_add_epi32(R3,R1); sum_vec0 = _mm_add_epi32(sum_vec0,add_reg); sum_vec0 = _mm_srai_epi32(sum_vec0, shift); // shift right __m128i sum_vec1 = _mm_add_epi32(R4,R2); sum_vec1 = _mm_add_epi32(sum_vec1,add_reg); sum_vec1 = _mm_srai_epi32(sum_vec1, shift); // shift right __m128i finalres0 = _mm_packs_epi32(sum_vec0, sum_vec1); // shrink packed 32bit to packed 16 bit and saturate _mm_store_si128 (&out_vec[2*j], finalres0); __m128i sum_vec2 = _mm_sub_epi32(R4, R2); sum_vec2 = _mm_add_epi32(sum_vec2,add_reg); sum_vec2 = _mm_srai_epi32(sum_vec2, shift); // shift right __m128i sum_vec3 = _mm_sub_epi32(R3, R1); sum_vec3 = _mm_add_epi32(sum_vec3,add_reg); sum_vec3 = _mm_srai_epi32(sum_vec3, shift); // shift right I5 = _mm_unpackhi_epi32(sum_vec2, sum_vec3); I6 = _mm_unpacklo_epi32(sum_vec2, sum_vec3); I7 = _mm_unpackhi_epi32(I5, I6); I8 = _mm_unpacklo_epi32(I5, I6); I9 = _mm_unpacklo_epi32(I7, I8); I10 = _mm_unpackhi_epi32(I7, I8); sum_vec3 = _mm_packs_epi32(I9, I10); // shrink packed 32bit to packed 16 bit and saturate _mm_store_si128 (&out_vec[2*j+1], sum_vec3); } }
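/*
 * Illustrative sketch (added, not from the original source): every output register
 * above goes through the same epilogue -- add the rounding constant
 * add = 1 << (shift - 1), arithmetic-shift right by shift, then narrow with
 * _mm_packs_epi32 so the 32-bit sums saturate to int16.  Isolated below with
 * hypothetical sums; the last two lanes overflow int16 and get clamped.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i round_shift_pack(__m128i sum_lo, __m128i sum_hi, int shift) {
  const __m128i round = _mm_set1_epi32(1 << (shift - 1));
  sum_lo = _mm_srai_epi32(_mm_add_epi32(sum_lo, round), shift);
  sum_hi = _mm_srai_epi32(_mm_add_epi32(sum_hi, round), shift);
  return _mm_packs_epi32(sum_lo, sum_hi);  /* saturates to [-32768, 32767] */
}

int main(void) {
  const __m128i lo = _mm_setr_epi32(100, -100, 4095, -4096);
  const __m128i hi = _mm_setr_epi32(0, 7, 1 << 23, -(1 << 23));
  int16_t out[8];
  _mm_storeu_si128((__m128i *)out, round_shift_pack(lo, hi, 7));
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]);   /* last two lanes print 32767 and -32768 */
  printf("\n");
  return 0;
}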
unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]) { FLAC__uint64 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4; unsigned i, order; __m128i total_err0, total_err1, total_err3; { FLAC__int32 itmp; __m128i last_error, zero = _mm_setzero_si128(); last_error = _mm_cvtsi32_si128(data[-1]); // 0 0 0 le0 itmp = data[-2]; last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0)); last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 0 le0 le1 itmp -= data[-3]; last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0)); last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 le0 le1 le2 itmp -= data[-3] - data[-4]; last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0)); last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // le0 le1 le2 le3 total_err0 = total_err1 = total_err3 = _mm_setzero_si128(); for(i = 0; i < data_len; i++) { __m128i err0, err1, tmp; err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0 err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0 #if 1 /* OPT_SSE */ err1 = _mm_sub_epi32(err1, last_error); last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2 err1 = _mm_sub_epi32(err1, last_error); last_error = _mm_srli_si128(last_error, 4); // 0 0 le0 le1 err1 = _mm_sub_epi32(err1, last_error); last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4 #else last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1 last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4 #endif tmp = _mm_slli_si128(err0, 12); // e0 0 0 0 last_error = _mm_srli_si128(err1, 4); // 0 e1 e2 e3 last_error = _mm_or_si128(last_error, tmp); // e0 e1 e2 e3 tmp = _mm_srai_epi32(err0, 31); err0 = _mm_xor_si128(err0, tmp); err0 = _mm_sub_epi32(err0, tmp); tmp = _mm_srai_epi32(err1, 31); err1 = _mm_xor_si128(err1, tmp); err1 = _mm_sub_epi32(err1, tmp); total_err0 = _mm_add_epi64(total_err0, err0); // 0 te0 err0 = _mm_unpacklo_epi32(err1, zero); // 0 |e3| 0 |e4| err1 = _mm_unpackhi_epi32(err1, zero); // 0 |e1| 0 |e2| total_err3 = _mm_add_epi64(total_err3, err0); // te3 te4 total_err1 = _mm_add_epi64(total_err1, err1); // te1 te2 } } m128i_to_i64(total_error_0, total_err0); m128i_to_i64(total_error_4, total_err3); m128i_to_i64(total_error_2, total_err1); total_err3 = _mm_srli_si128(total_err3, 8); // 0 te3 total_err1 = _mm_srli_si128(total_err1, 8); // 0 te1 m128i_to_i64(total_error_3, total_err3); m128i_to_i64(total_error_1, total_err1); /* prefer higher order */ if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4)) order = 0; else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4)) order = 1; else if(total_error_2 < flac_min(total_error_3, total_error_4)) order = 2; else if(total_error_3 < total_error_4) order = 3; else order = 4; /* Estimate the expected number of bits per residual signal sample. 
*/ /* 'total_error*' is linearly related to the variance of the residual */ /* signal, so we use it directly to compute E(|x|) */ FLAC__ASSERT(data_len > 0 || total_error_0 == 0); FLAC__ASSERT(data_len > 0 || total_error_1 == 0); FLAC__ASSERT(data_len > 0 || total_error_2 == 0); FLAC__ASSERT(data_len > 0 || total_error_3 == 0); FLAC__ASSERT(data_len > 0 || total_error_4 == 0); residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0); return order; }
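/*
 * Illustrative sketch (added, not from the original source): the vector loop above
 * accumulates, per sample, the absolute values of the order-0..4 errors, where the
 * order-k error is the k-th finite difference of the signal, built by repeatedly
 * subtracting the previous sample's errors held in last_error.  The scalar model
 * below does the same thing; like the routine above it needs four samples of
 * history before the block start, and the test data is mine.  The totals are what
 * the order decision and the log2-based residual_bits_per_sample estimate are
 * derived from.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void fixed_errors_model(const int32_t *x, size_t n, uint64_t total[5]) {
  int64_t le[4] = { /* previous sample's order 0..3 errors, from x[-1..-4] */
    x[-1],
    x[-1] - x[-2],
    (x[-1] - x[-2]) - (x[-2] - x[-3]),
    ((x[-1] - x[-2]) - (x[-2] - x[-3])) - ((x[-2] - x[-3]) - (x[-3] - x[-4]))
  };
  for (size_t i = 0; i < n; i++) {
    int64_t e = x[i];
    total[0] += (uint64_t)(e < 0 ? -e : e);
    for (int k = 0; k < 4; k++) {
      const int64_t ek = e - le[k];
      le[k] = e;               /* becomes "previous order-k error" next time */
      e = ek;
      total[k + 1] += (uint64_t)(e < 0 ? -e : e);
    }
  }
}

int main(void) {
  int32_t buf[12] = { 0, 1, -1, 2, 5, 3, -4, 8, 1, 0, -7, 2 }; /* hypothetical */
  uint64_t total[5] = { 0 };
  fixed_errors_model(buf + 4, 8, total);  /* buf[0..3] is the required history */
  for (int k = 0; k < 5; k++)
    printf("order %d: sum|e| = %llu\n", k, (unsigned long long)total[k]);
  return 0;
}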
0, 0, PKT_RX_FDIR, 0); const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, /* shift right 1 bit to make sure it does not exceed 255 */ (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1, (PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1, (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1, PKT_RX_IP_CKSUM_BAD >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1); vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]); vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]); vlan0 = _mm_unpacklo_epi64(vlan0, vlan1); vlan1 = _mm_and_si128(vlan0, rss_vlan_msk); vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1); rss = _mm_srli_epi32(vlan1, 11); rss = _mm_shuffle_epi8(rss_flags, rss); l3_l4e = _mm_srli_epi32(vlan1, 22); l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e); /* then we shift left 1 bit */ l3_l4e = _mm_slli_epi32(l3_l4e, 1); /* we need to mask out the redundant bits */ l3_l4e = _mm_and_si128(l3_l4e, cksum_mask);
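/*
 * Illustrative sketch (added, not from the original source): the flag tables above
 * (vlan_flags, rss_flags, l3_l4e_flags) are used with _mm_shuffle_epi8 as 16-entry
 * byte lookup tables -- each byte of the index register selects table[idx & 0x0f],
 * or 0 when the index byte has its top bit set.  Minimal demonstration with a
 * hypothetical table mapping a 4-bit status code to a flag byte.
 */
#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const __m128i table = _mm_setr_epi8(
      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
      0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
  /* four 32-bit "status" words; only the low nibble of each byte matters */
  const __m128i idx = _mm_setr_epi32(0x03, 0x0a, 0x0f, 0x01);
  const __m128i flags = _mm_shuffle_epi8(table, idx);
  uint8_t out[16];
  _mm_storeu_si128((__m128i *)out, flags);
  /* byte 0 of each 32-bit lane holds the looked-up flag byte */
  printf("0x%02x 0x%02x 0x%02x 0x%02x\n", out[0], out[4], out[8], out[12]);
  return 0;
}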
/* Transpose bytes within elements for 64 bit elements. */ int64_t bshuf_trans_byte_elem_SSE_64(void* in, void* out, const size_t size) { size_t ii; char* in_b = (char*) in; char* out_b = (char*) out; __m128i a0, b0, c0, d0, e0, f0, g0, h0; __m128i a1, b1, c1, d1, e1, f1, g1, h1; for (ii=0; ii + 15 < size; ii += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]); b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]); c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]); d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]); e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]); f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]); g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]); h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); e1 = _mm_unpacklo_epi8(e0, f0); f1 = _mm_unpackhi_epi8(e0, f0); g1 = _mm_unpacklo_epi8(g0, h0); h1 = _mm_unpackhi_epi8(g0, h0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); c0 = _mm_unpacklo_epi8(c1, d1); d0 = _mm_unpackhi_epi8(c1, d1); e0 = _mm_unpacklo_epi8(e1, f1); f0 = _mm_unpackhi_epi8(e1, f1); g0 = _mm_unpacklo_epi8(g1, h1); h0 = _mm_unpackhi_epi8(g1, h1); a1 = _mm_unpacklo_epi32(a0, c0); b1 = _mm_unpackhi_epi32(a0, c0); c1 = _mm_unpacklo_epi32(b0, d0); d1 = _mm_unpackhi_epi32(b0, d0); e1 = _mm_unpacklo_epi32(e0, g0); f1 = _mm_unpackhi_epi32(e0, g0); g1 = _mm_unpacklo_epi32(f0, h0); h1 = _mm_unpackhi_epi32(f0, h0); a0 = _mm_unpacklo_epi64(a1, e1); b0 = _mm_unpackhi_epi64(a1, e1); c0 = _mm_unpacklo_epi64(b1, f1); d0 = _mm_unpackhi_epi64(b1, f1); e0 = _mm_unpacklo_epi64(c1, g1); f0 = _mm_unpackhi_epi64(c1, g1); g0 = _mm_unpacklo_epi64(d1, h1); h0 = _mm_unpackhi_epi64(d1, h1); _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0); _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0); _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0); _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0); } return bshuf_trans_byte_elem_remainder(in, out, size, 8, size - size % 16); }
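/*
 * Illustrative sketch (added, not from the original source): a scalar reference
 * for the permutation above, assuming the layout implied by the
 * out_b[k*size + ii] stores -- for 8-byte elements, byte j of element i ends up in
 * plane j at position i.  Comparing the SSE routine against this on a buffer whose
 * element count is a multiple of 16 is a quick consistency check.
 */
#include <stddef.h>
#include <stdint.h>

static void trans_byte_elem_64_scalar(const void *in, void *out, size_t size) {
  const uint8_t *in_b = (const uint8_t *)in;
  uint8_t *out_b = (uint8_t *)out;
  for (size_t i = 0; i < size; i++)      /* i: element index          */
    for (size_t j = 0; j < 8; j++)       /* j: byte within the element */
      out_b[j * size + i] = in_b[i * 8 + j];
}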
template<class T> inline void dequantise_sse4_2_32_8_3(QuantisationMatrix *qmatrix, int32_t *idata, void *_odata, int ostride) { T *odata = (T *)_odata; const int slice_width = 32; const int slice_height = 8; const int Y = 0; const int X = 0; const int N = 0; T * const optr = &odata[Y*slice_height*ostride + X*slice_width]; const int32_t * iptr = &idata[N*slice_height*slice_width]; const __m128i D0 = LOAD_QUANTISED(&iptr[ 0], qmatrix, 0, 0); const __m128i D4 = LOAD_QUANTISED(&iptr[ 4], qmatrix, 1, 1); const __m128i D16 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 1); const __m128i D20 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 1); const __m128i D64 = LOAD_QUANTISED(&iptr[64], qmatrix, 3, 1); const __m128i D68 = LOAD_QUANTISED(&iptr[68], qmatrix, 3, 1); const __m128i D72 = LOAD_QUANTISED(&iptr[72], qmatrix, 3, 1); const __m128i D76 = LOAD_QUANTISED(&iptr[76], qmatrix, 3, 1); const __m128i A0 = _mm_unpacklo_epi32(D0, D4); // ( 00 11 00 11 ) const __m128i A1 = _mm_unpackhi_epi32(D0, D4); // ( 00 11 00 11 ) const __m128i B0 = _mm_unpacklo_epi32(A0, D16); // ( 00 21 11 21 ) const __m128i B1 = _mm_unpackhi_epi32(A0, D16); // ( 00 21 11 21 ) const __m128i B2 = _mm_unpacklo_epi32(A1, D20); // ( 00 21 11 21 ) const __m128i B3 = _mm_unpackhi_epi32(A1, D20); // ( 00 21 11 21 ) const __m128i C0 = _mm_unpacklo_epi32(B0, D64); // ( 00 31 21 31 ) const __m128i C1 = _mm_unpackhi_epi32(B0, D64); // ( 11 31 21 31 ) const __m128i C2 = _mm_unpacklo_epi32(B1, D68); // ( 00 31 21 31 ) const __m128i C3 = _mm_unpackhi_epi32(B1, D68); // ( 11 31 21 31 ) const __m128i C4 = _mm_unpacklo_epi32(B2, D72); // ( 00 31 21 31 ) const __m128i C5 = _mm_unpackhi_epi32(B2, D72); // ( 11 31 21 31 ) const __m128i C6 = _mm_unpacklo_epi32(B3, D76); // ( 00 31 21 31 ) const __m128i C7 = _mm_unpackhi_epi32(B3, D76); // ( 11 31 21 31 ) STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 0], C0, C1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 8], C2, C3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 16], C4, C5); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 24], C6, C7); const __m128i D8 = LOAD_QUANTISED(&iptr[ 8], qmatrix, 1, 2); const __m128i D12 = LOAD_QUANTISED(&iptr[12], qmatrix, 1, 3); const __m128i D24 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 1); const __m128i D28 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 1); const __m128i D96 = LOAD_QUANTISED(&iptr[96], qmatrix, 3, 1); const __m128i D100 = LOAD_QUANTISED(&iptr[100], qmatrix, 3, 1); const __m128i D104 = LOAD_QUANTISED(&iptr[104], qmatrix, 3, 1); const __m128i D108 = LOAD_QUANTISED(&iptr[108], qmatrix, 3, 1); const __m128i A2 = _mm_unpacklo_epi32(D8, D12); // ( 12 13 12 13 ) const __m128i A3 = _mm_unpackhi_epi32(D8, D12); // ( 12 13 12 13 ) const __m128i B4 = _mm_unpacklo_epi32(A2, D24); // ( 12 21 13 21 ) const __m128i B5 = _mm_unpackhi_epi32(A2, D24); // ( 12 21 13 21 ) const __m128i B6 = _mm_unpacklo_epi32(A3, D28); // ( 12 21 13 21 ) const __m128i B7 = _mm_unpackhi_epi32(A3, D28); // ( 12 21 13 21 ) const __m128i C8 = _mm_unpacklo_epi32(B4, D96); // ( 12 31 21 31 ) const __m128i C9 = _mm_unpackhi_epi32(B4, D96); // ( 13 31 21 31 ) const __m128i C10 = _mm_unpacklo_epi32(B5, D100); // ( 12 31 21 31 ) const __m128i C11 = _mm_unpackhi_epi32(B5, D100); // ( 13 31 21 31 ) const __m128i C12 = _mm_unpacklo_epi32(B6, D104); // ( 12 31 21 31 ) const __m128i C13 = _mm_unpackhi_epi32(B6, D104); // ( 13 31 21 31 ) const __m128i C14 = _mm_unpacklo_epi32(B7, D108); // ( 12 31 21 31 ) const __m128i C15 = _mm_unpackhi_epi32(B7, D108); // ( 13 31 21 31 ) STORE_SAMPLE_PAIR<T>((__m128i 
*)&optr[4*ostride + 0], C8, C9); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 8], C10, C11); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 16], C12, C13); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 24], C14, C15); const __m128i D32 = LOAD_QUANTISED(&iptr[ 32], qmatrix, 2, 2); const __m128i D36 = LOAD_QUANTISED(&iptr[ 36], qmatrix, 2, 2); const __m128i D48 = LOAD_QUANTISED(&iptr[ 48], qmatrix, 2, 3); const __m128i D52 = LOAD_QUANTISED(&iptr[ 52], qmatrix, 2, 3); const __m128i D80 = LOAD_QUANTISED(&iptr[ 80], qmatrix, 3, 1); const __m128i D84 = LOAD_QUANTISED(&iptr[ 84], qmatrix, 3, 1); const __m128i D88 = LOAD_QUANTISED(&iptr[ 88], qmatrix, 3, 1); const __m128i D92 = LOAD_QUANTISED(&iptr[ 92], qmatrix, 3, 1); const __m128i A4 = _mm_unpacklo_epi32(D32, D48); // ( 22 23 22 23 ) const __m128i A5 = _mm_unpackhi_epi32(D32, D48); // ( 22 23 22 23 ) const __m128i A6 = _mm_unpacklo_epi32(D36, D52); // ( 22 23 22 23 ) const __m128i A7 = _mm_unpackhi_epi32(D36, D52); // ( 22 23 22 23 ) const __m128i B8 = _mm_unpacklo_epi32(A4, D80); // ( 22 31 23 31 ) const __m128i B9 = _mm_unpackhi_epi32(A4, D80); // ( 22 31 23 31 ) const __m128i B10 = _mm_unpacklo_epi32(A5, D84); // ( 22 31 23 31 ) const __m128i B11 = _mm_unpackhi_epi32(A5, D84); // ( 22 31 23 31 ) const __m128i B12 = _mm_unpacklo_epi32(A6, D88); // ( 22 31 23 31 ) const __m128i B13 = _mm_unpackhi_epi32(A6, D88); // ( 22 31 23 31 ) const __m128i B14 = _mm_unpacklo_epi32(A7, D92); // ( 22 31 23 31 ) const __m128i B15 = _mm_unpackhi_epi32(A7, D92); // ( 22 31 23 31 ) STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 0], B8, B9); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 8], B10, B11); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 16], B12, B13); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 24], B14, B15); const __m128i D40 = LOAD_QUANTISED(&iptr[ 40], qmatrix, 2, 2); const __m128i D44 = LOAD_QUANTISED(&iptr[ 44], qmatrix, 2, 2); const __m128i D56 = LOAD_QUANTISED(&iptr[ 56], qmatrix, 2, 3); const __m128i D60 = LOAD_QUANTISED(&iptr[ 60], qmatrix, 2, 3); const __m128i D112 = LOAD_QUANTISED(&iptr[112], qmatrix, 3, 1); const __m128i D116 = LOAD_QUANTISED(&iptr[116], qmatrix, 3, 1); const __m128i D120 = LOAD_QUANTISED(&iptr[120], qmatrix, 3, 1); const __m128i D124 = LOAD_QUANTISED(&iptr[124], qmatrix, 3, 1); const __m128i A8 = _mm_unpacklo_epi32(D40, D56); // ( 22 23 22 23 ) const __m128i A9 = _mm_unpackhi_epi32(D40, D56); // ( 22 23 22 23 ) const __m128i A10 = _mm_unpacklo_epi32(D44, D60); // ( 22 23 22 23 ) const __m128i A11 = _mm_unpackhi_epi32(D44, D60); // ( 22 23 22 23 ) const __m128i B16 = _mm_unpacklo_epi32(A8, D112); // ( 22 31 23 31 ) const __m128i B17 = _mm_unpackhi_epi32(A8, D112); // ( 22 31 23 31 ) const __m128i B18 = _mm_unpacklo_epi32(A9, D116); // ( 22 31 23 31 ) const __m128i B19 = _mm_unpackhi_epi32(A9, D116); // ( 22 31 23 31 ) const __m128i B20 = _mm_unpacklo_epi32(A10, D120); // ( 22 31 23 31 ) const __m128i B21 = _mm_unpackhi_epi32(A10, D120); // ( 22 31 23 31 ) const __m128i B22 = _mm_unpacklo_epi32(A11, D124); // ( 22 31 23 31 ) const __m128i B23 = _mm_unpackhi_epi32(A11, D124); // ( 22 31 23 31 ) STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 0], B16, B17); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 8], B18, B19); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 16], B20, B21); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 24], B22, B23); for (int j = 0; j < 4; j++) { const __m128i X0 = LOAD_QUANTISED(&iptr[128 + j*16 + 0], qmatrix, 3, 2); const __m128i X1 = LOAD_QUANTISED(&iptr[128 
+ j*16 + 4], qmatrix, 3, 2); const __m128i X2 = LOAD_QUANTISED(&iptr[128 + j*16 + 8], qmatrix, 3, 2); const __m128i X3 = LOAD_QUANTISED(&iptr[128 + j*16 + 12], qmatrix, 3, 2); const __m128i Y0 = LOAD_QUANTISED(&iptr[192 + j*16 + 0], qmatrix, 3, 3); const __m128i Y1 = LOAD_QUANTISED(&iptr[192 + j*16 + 4], qmatrix, 3, 3); const __m128i Y2 = LOAD_QUANTISED(&iptr[192 + j*16 + 8], qmatrix, 3, 3); const __m128i Y3 = LOAD_QUANTISED(&iptr[192 + j*16 + 12], qmatrix, 3, 3); const __m128i Z0 = _mm_unpacklo_epi32(X0, Y0); const __m128i Z1 = _mm_unpackhi_epi32(X0, Y0); const __m128i Z2 = _mm_unpacklo_epi32(X1, Y1); const __m128i Z3 = _mm_unpackhi_epi32(X1, Y1); const __m128i Z4 = _mm_unpacklo_epi32(X2, Y2); const __m128i Z5 = _mm_unpackhi_epi32(X2, Y2); const __m128i Z6 = _mm_unpacklo_epi32(X3, Y3); const __m128i Z7 = _mm_unpackhi_epi32(X3, Y3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*j + 1)*ostride + 0], Z0, Z1); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*j + 1)*ostride + 8], Z2, Z3); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*j + 1)*ostride + 16], Z4, Z5); STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*j + 1)*ostride + 24], Z6, Z7); } }
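/*
 * Illustrative sketch (added, not from the original source): the tail loop above
 * interleaves one register of level-(3,2) coefficients with one register of
 * level-(3,3) coefficients using unpacklo/unpackhi_epi32, so each stored row
 * alternates the two subbands sample by sample.  The real code then hands each
 * pair to STORE_SAMPLE_PAIR<T> (not shown here); this sketch just stores raw
 * int32 values to make the interleave visible.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const __m128i x = _mm_setr_epi32(10, 11, 12, 13);   /* hypothetical subband A */
  const __m128i y = _mm_setr_epi32(90, 91, 92, 93);   /* hypothetical subband B */
  const __m128i lo = _mm_unpacklo_epi32(x, y);        /* 10 90 11 91 */
  const __m128i hi = _mm_unpackhi_epi32(x, y);        /* 12 92 13 93 */
  int32_t row[8];
  _mm_storeu_si128((__m128i *)&row[0], lo);
  _mm_storeu_si128((__m128i *)&row[4], hi);
  for (int i = 0; i < 8; i++)
    printf("%d ", row[i]);                            /* 10 90 11 91 12 92 13 93 */
  printf("\n");
  return 0;
}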