Example 1
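// Added explanatory comment (hedged reading of the code below, not from the
// original source): this specialisation dequantises one 16x8 slice of
// 3-level wavelet coefficients. Each coefficient c is reconstructed as
// sign(c) * ((|c| * qfactor + qoffset) >> 2), four values at a time with SSE
// intrinsics, and the subbands are interleaved back into raster order by the
// unpack steps before STORE_SAMPLE_PAIR writes them out.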
template<class T> inline void dequantise_sse4_2_16_8_3(QuantisationMatrix *qmatrix,
                                                        int32_t *idata,
                                                        void *_odata,
                                                        int ostride) {
  T *odata = (T *)_odata;
  const int slice_width  = 16;
  const int slice_height = 8;

  const int Y = 0;
  const int X = 0;
  const int N = 0;

  T * const optr = &odata[Y*slice_height*ostride + X*slice_width];
  const int32_t * iptr = &idata[N*slice_height*slice_width];

  {
    __m128i D0;
    {
      D0 = _mm_load_si128((__m128i *)&iptr[ 0]); // [  0  1  2  3 ] (Q)
      __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[0][0]),
                                      _mm_load_si128((__m128i *)&qmatrix->qfactor[1][1]));
      __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[0][0]),
                                      _mm_load_si128((__m128i *)&qmatrix->qoffset[1][1]));

      __m128i X  = _mm_abs_epi32(D0);
      X = _mm_mullo_epi32(X, QF);
      X = _mm_add_epi32(X, QO);
      X = _mm_srai_epi32(X, 2);
      D0 = _mm_sign_epi32(X, D0);

      D0 = _mm_shuffle_epi32(D0, 0xD8);
    }
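    // Added note (hedged reading, not from the original source): the block
    // above is the inline equivalent of LOAD_QUANTISED, except that these
    // four coefficients span two subbands, so qfactor/qoffset are assembled
    // from entries (0,0) and (1,1) of the quantisation matrix before the
    // usual abs/mul/add/shift/sign sequence and the final reorder shuffle.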

    const __m128i D1 = LOAD_QUANTISED(&iptr[8], qmatrix, 2, 1);

    const __m128i D2 = LOAD_QUANTISED(&iptr[32], qmatrix, 3, 1);
    const __m128i D3 = LOAD_QUANTISED(&iptr[36], qmatrix, 3, 1);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D1);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D1);

    const __m128i B0  = _mm_unpacklo_epi32(A0, D2);
    const __m128i B1  = _mm_unpackhi_epi32(A0, D2);
    const __m128i B2  = _mm_unpacklo_epi32(A1, D3);
    const __m128i B3  = _mm_unpackhi_epi32(A1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride +  0], B0, B1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride +  8], B2, B3);
  }

  {
    __m128i D0;
    {
      D0 = _mm_load_si128((__m128i *)&iptr[ 4]);
      __m128i QF = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qfactor[1][2]),
                                      _mm_load_si128((__m128i *)&qmatrix->qfactor[1][3]));
      __m128i QO = _mm_unpacklo_epi64(_mm_load_si128((__m128i *)&qmatrix->qoffset[1][2]),
                                      _mm_load_si128((__m128i *)&qmatrix->qoffset[1][3]));

      __m128i X  = _mm_abs_epi32(D0);
      X = _mm_mullo_epi32(X, QF);
      X = _mm_add_epi32(X, QO);
      X = _mm_srai_epi32(X, 2);
      D0 = _mm_sign_epi32(X, D0);

      D0 = _mm_shuffle_epi32(D0, 0xD8);
    }

    const __m128i D1 = LOAD_QUANTISED(&iptr[12], qmatrix, 2, 1);

    const __m128i D2 = LOAD_QUANTISED(&iptr[48], qmatrix, 3, 1);
    const __m128i D3 = LOAD_QUANTISED(&iptr[52], qmatrix, 3, 1);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D1);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D1);

    const __m128i B0  = _mm_unpacklo_epi32(A0, D2);
    const __m128i B1  = _mm_unpackhi_epi32(A0, D2);
    const __m128i B2  = _mm_unpacklo_epi32(A1, D3);
    const __m128i B3  = _mm_unpackhi_epi32(A1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride +  0], B0, B1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride +  8], B2, B3);
  }

  {
    const __m128i D0 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 2);

    const __m128i D1 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 3);

    const __m128i D2 = LOAD_QUANTISED(&iptr[40], qmatrix, 3, 1);
    const __m128i D3 = LOAD_QUANTISED(&iptr[44], qmatrix, 3, 1);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D1);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D1);

    const __m128i B0  = _mm_unpacklo_epi32(A0, D2);
    const __m128i B1  = _mm_unpackhi_epi32(A0, D2);
    const __m128i B2  = _mm_unpacklo_epi32(A1, D3);
    const __m128i B3  = _mm_unpackhi_epi32(A1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride +  0], B0, B1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride +  8], B2, B3);
  }

  {
    const __m128i D0 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 2);

    const __m128i D1 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 3);

    const __m128i D2 = LOAD_QUANTISED(&iptr[56], qmatrix, 3, 1);
    const __m128i D3 = LOAD_QUANTISED(&iptr[60], qmatrix, 3, 1);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D1);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D1);

    const __m128i B0  = _mm_unpacklo_epi32(A0, D2);
    const __m128i B1  = _mm_unpackhi_epi32(A0, D2);
    const __m128i B2  = _mm_unpacklo_epi32(A1, D3);
    const __m128i B3  = _mm_unpackhi_epi32(A1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride +  0], B0, B1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride +  8], B2, B3);
  }

  for (int y = 0; y < 4; y++) {
    const __m128i D0 = LOAD_QUANTISED(&iptr[ 64 + y*8], qmatrix, 3, 2);
    const __m128i D1 = LOAD_QUANTISED(&iptr[ 68 + y*8], qmatrix, 3, 2);

    const __m128i D2 = LOAD_QUANTISED(&iptr[ 96 + y*8], qmatrix, 3, 3);
    const __m128i D3 = LOAD_QUANTISED(&iptr[100 + y*8], qmatrix, 3, 3);

    const __m128i A0  = _mm_unpacklo_epi32(D0, D2);
    const __m128i A1  = _mm_unpackhi_epi32(D0, D2);
    const __m128i A2  = _mm_unpacklo_epi32(D1, D3);
    const __m128i A3  = _mm_unpackhi_epi32(D1, D3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride +  0], A0, A1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*y + 1)*ostride +  8], A2, A3);
  }
}
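// Added usage sketch (assumption, not from the original source): a caller
// might instantiate the template for 16-bit output samples roughly as
//
//   int32_t coeffs[16 * 8];              // quantised slice, subband-ordered
//   int16_t picture[8 * OSTRIDE];        // destination plane, OSTRIDE >= 16
//   dequantise_sse4_2_16_8_3<int16_t>(qmatrix, coeffs, picture, OSTRIDE);
//
// where qmatrix points to the slice's QuantisationMatrix and OSTRIDE is the
// output line stride in samples.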
Example 2
void ulsch_channel_compensation_alamouti(int **rxdataF_ext,                 // For Distributed Alamouti Combining
					 int **ul_ch_estimates_ext_0,
					 int **ul_ch_estimates_ext_1,
					 int **ul_ch_mag_0,
					 int **ul_ch_magb_0,
					 int **ul_ch_mag_1,
					 int **ul_ch_magb_1,
					 int **rxdataF_comp_0,
					 int **rxdataF_comp_1,
					 LTE_DL_FRAME_PARMS *frame_parms,
					 unsigned char symbol,
					 unsigned char Qm,
					 unsigned short nb_rb,
					 unsigned char output_shift_0,
					 unsigned char output_shift_1) {
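  /* Added explanatory comment (hedged reading of the code, not from the
   * original source): for every RX antenna this routine produces
   * rxdataF_comp_0 = conj(h0) * y and rxdataF_comp_1 = h1 * conj(y), the two
   * terms later combined for distributed Alamouti decoding, and, when Qm > 2,
   * the |h|^2-based amplitude vectors (ul_ch_mag and ul_ch_magb) needed by
   * the 16QAM/64QAM demodulator. */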
  
  unsigned short rb;
  __m128i *ul_ch128_0,*ul_ch128_1,*ul_ch_mag128_0,*ul_ch_mag128_1,*ul_ch_mag128b_0,*ul_ch_mag128b_1,*rxdataF128,*rxdataF_comp128_0,*rxdataF_comp128_1;
  unsigned char aarx;//,symbol_mod;


  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

#ifndef __SSE3__
  zeroU = _mm_xor_si128(zeroU,zeroU);
#endif

  //    printf("comp: symbol %d\n",symbol);

  
  if (Qm == 4) {  
    QAM_amp128U_0 = _mm_set1_epi16(QAM16_n1);
    QAM_amp128U_1 = _mm_set1_epi16(QAM16_n1);
  }
  else if (Qm == 6) {
    QAM_amp128U_0  = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU_0 = _mm_set1_epi16(QAM64_n2);

    QAM_amp128U_1  = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU_1 = _mm_set1_epi16(QAM64_n2);
  }
  for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) {
    
    ul_ch128_0          = (__m128i *)&ul_ch_estimates_ext_0[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_0      = (__m128i *)&ul_ch_mag_0[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b_0     = (__m128i *)&ul_ch_magb_0[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch128_1          = (__m128i *)&ul_ch_estimates_ext_1[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128_1      = (__m128i *)&ul_ch_mag_1[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b_1     = (__m128i *)&ul_ch_magb_1[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF128        = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128_0   = (__m128i *)&rxdataF_comp_0[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128_1   = (__m128i *)&rxdataF_comp_1[aarx][symbol*frame_parms->N_RB_DL*12];


    for (rb=0;rb<nb_rb;rb++) {
      //      printf("comp: symbol %d rb %d\n",symbol,rb);
      if (Qm>2) {  
	// get channel amplitude if not QPSK

	mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],ul_ch128_0[0]);
	
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
	
	mmtmpU1 = _mm_madd_epi16(ul_ch128_0[1],ul_ch128_0[1]);
	mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
	mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);
	
	ul_ch_mag128_0[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b_0[0] = ul_ch_mag128_0[0];
	ul_ch_mag128_0[0] = _mm_mulhi_epi16(ul_ch_mag128_0[0],QAM_amp128U_0);
	ul_ch_mag128_0[0] = _mm_slli_epi16(ul_ch_mag128_0[0],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	ul_ch_mag128_0[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b_0[1] = ul_ch_mag128_0[1];
	ul_ch_mag128_0[1] = _mm_mulhi_epi16(ul_ch_mag128_0[1],QAM_amp128U_0);
	ul_ch_mag128_0[1] = _mm_slli_epi16(ul_ch_mag128_0[1],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],ul_ch128_0[2]);
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
	mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);
	
	ul_ch_mag128_0[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
	ul_ch_mag128b_0[2] = ul_ch_mag128_0[2];
	
	ul_ch_mag128_0[2] = _mm_mulhi_epi16(ul_ch_mag128_0[2],QAM_amp128U_0);
	ul_ch_mag128_0[2] = _mm_slli_epi16(ul_ch_mag128_0[2],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	
	ul_ch_mag128b_0[0] = _mm_mulhi_epi16(ul_ch_mag128b_0[0],QAM_amp128bU_0);
	ul_ch_mag128b_0[0] = _mm_slli_epi16(ul_ch_mag128b_0[0],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	
	ul_ch_mag128b_0[1] = _mm_mulhi_epi16(ul_ch_mag128b_0[1],QAM_amp128bU_0);
	ul_ch_mag128b_0[1] = _mm_slli_epi16(ul_ch_mag128b_0[1],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	ul_ch_mag128b_0[2] = _mm_mulhi_epi16(ul_ch_mag128b_0[2],QAM_amp128bU_0);
	ul_ch_mag128b_0[2] = _mm_slli_epi16(ul_ch_mag128b_0[2],2); // shift by 2 to compensate for the scaling of the channel estimate
	

	

	mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],ul_ch128_1[0]);
	
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
	
	mmtmpU1 = _mm_madd_epi16(ul_ch128_1[1],ul_ch128_1[1]);
	mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
	mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);
	
	ul_ch_mag128_1[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b_1[0] = ul_ch_mag128_1[0];
	ul_ch_mag128_1[0] = _mm_mulhi_epi16(ul_ch_mag128_1[0],QAM_amp128U_1);
	ul_ch_mag128_1[0] = _mm_slli_epi16(ul_ch_mag128_1[0],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	ul_ch_mag128_1[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b_1[1] = ul_ch_mag128_1[1];
	ul_ch_mag128_1[1] = _mm_mulhi_epi16(ul_ch_mag128_1[1],QAM_amp128U_1);
	ul_ch_mag128_1[1] = _mm_slli_epi16(ul_ch_mag128_1[1],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],ul_ch128_1[2]);
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
	mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);
	
	ul_ch_mag128_1[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
	ul_ch_mag128b_1[2] = ul_ch_mag128_1[2];
	
	ul_ch_mag128_1[2] = _mm_mulhi_epi16(ul_ch_mag128_1[2],QAM_amp128U_1);
	ul_ch_mag128_1[2] = _mm_slli_epi16(ul_ch_mag128_1[2],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	
	ul_ch_mag128b_1[0] = _mm_mulhi_epi16(ul_ch_mag128b_1[0],QAM_amp128bU_1);
	ul_ch_mag128b_1[0] = _mm_slli_epi16(ul_ch_mag128b_1[0],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	
	ul_ch_mag128b_1[1] = _mm_mulhi_epi16(ul_ch_mag128b_1[1],QAM_amp128bU_1);
	ul_ch_mag128b_1[1] = _mm_slli_epi16(ul_ch_mag128b_1[1],2); // shift by 2 to compensate for the scaling of the channel estimate
	
	ul_ch_mag128b_1[2] = _mm_mulhi_epi16(ul_ch_mag128b_1[2],QAM_amp128bU_1);
	ul_ch_mag128b_1[2] = _mm_slli_epi16(ul_ch_mag128b_1[2],2); // shift by 2 to compensate for the scaling of the channel estimate
      }
      

      /************************For Computing (y)*(h0*)********************************************/

      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128_0[0],rxdataF128[0]);
      //	print_ints("re",&mmtmpU0);
      
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      //	print_ints("im",&mmtmpU1);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
      //	print_ints("re(shift)",&mmtmpU0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
      //	print_ints("im(shift)",&mmtmpU1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      //       	print_ints("c0",&mmtmpU2);
      //	print_ints("c1",&mmtmpU3);
      rxdataF_comp128_0[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[0]);
      //      	print_shorts("ch:",ul_ch128_0[0]);
      //      	print_shorts("pack:",rxdataF_comp128_0[0]);
      
      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128_0[1],rxdataF128[1]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128_0[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[1]);
      //      	print_shorts("ch:",ul_ch128_0[1]);
      //      	print_shorts("pack:",rxdataF_comp128_0[1]);	
      //       multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128_0[2],rxdataF128[2]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128_0[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_0);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128_0[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[2]);
      //      	print_shorts("ch:",ul_ch128_0[2]);
      //        print_shorts("pack:",rxdataF_comp128_0[2]);
      



      /*************************For Computing (y*)*(h1)************************************/
      // multiply by conjugated signal
      mmtmpU0 = _mm_madd_epi16(ul_ch128_1[0],rxdataF128[0]);
      //	print_ints("re",&mmtmpU0);
      
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      //	print_ints("im",&mmtmpU1);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
      //	print_ints("re(shift)",&mmtmpU0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
      //	print_ints("im(shift)",&mmtmpU1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      //       	print_ints("c0",&mmtmpU2);
      //	print_ints("c1",&mmtmpU3);
      rxdataF_comp128_1[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[0]);
      //      	print_shorts("ch_conjugate:",ul_ch128_1[0]);
      //      	print_shorts("pack:",rxdataF_comp128_1[0]);


      // multiply by conjugated signal
      mmtmpU0 = _mm_madd_epi16(ul_ch128_1[1],rxdataF128[1]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[1]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128_1[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[1]);
      //      	print_shorts("ch_conjugate:",ul_ch128_1[1]);
      //      	print_shorts("pack:",rxdataF_comp128_1[1]);


      //       multiply by conjugated signal
      mmtmpU0 = _mm_madd_epi16(ul_ch128_1[2],rxdataF128[2]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(rxdataF128[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,ul_ch128_1[2]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift_1);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift_1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128_1[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[2]);
      //      	print_shorts("ch_conjugate:",ul_ch128_0[2]);
      //        print_shorts("pack:",rxdataF_comp128_1[2]);



      ul_ch128_0+=3;
      ul_ch_mag128_0+=3;
      ul_ch_mag128b_0+=3;
      ul_ch128_1+=3;
      ul_ch_mag128_1+=3;
      ul_ch_mag128b_1+=3;
      rxdataF128+=3;
      rxdataF_comp128_0+=3;
      rxdataF_comp128_1+=3;
      
    }
  }


  _mm_empty();
  _m_empty();

}     
Example 3
mlib_status
__mlib_VectorDotProd_U8_Sat(
	mlib_d64 *z,
	const mlib_u8 *x,
	const mlib_u8 *y,
	mlib_s32 n)
{
	if (n <= 0)
		return (MLIB_FAILURE);

	mlib_s32 i, ax, ay, nstep, n1, n2, n3, sum = 0;
	const mlib_u8 *px = x, *py = y;
	__m128i zero, xbuf, ybuf, zbuf32, zbuf64, buf1, buf2, buf3, buf4;
	zero = _mm_setzero_si128();
	zbuf64 = zero;

	ax = (mlib_addr)x & 15;
	ay = (mlib_addr)y & 15;
	nstep = 16 / sizeof (mlib_u8);
	n1 = ((16 - ax) & 15) / sizeof (mlib_u8);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;
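	/* Added note (assumption, not from the original source): the
	 * VECTOR_DOTPROD_U8 macro, defined elsewhere, is assumed to accumulate
	 * the 8-bit products into the 32-bit lanes of zbuf32; blocks are capped
	 * at 4096 iterations so those lanes cannot overflow before being
	 * widened and folded into the 64-bit accumulator zbuf64. */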

	if (n2 > 0) {
		for (i = 0; i < n1; i++) {
			sum += (mlib_s32)(*px++) * (*py++);
		}

		mlib_s32 nblock = n2 >> 12;
		mlib_s32 tail = n2 & 4095;
		mlib_s32 k;

		if (ax == ay) {
			for (k = 0; k < nblock; k++) {
				zbuf32 = zero;
				for (i = 0; i < 4096; i++) {
					VECTOR_DOTPROD_U8(load);
				}
				buf1 = _mm_unpacklo_epi32(zbuf32, zero);
				buf2 = _mm_unpackhi_epi32(zbuf32, zero);
				zbuf64 = _mm_add_epi64(zbuf64, buf1);
				zbuf64 = _mm_add_epi64(zbuf64, buf2);
			}
			zbuf32 = zero;
			for (i = 0; i < tail; i++) {
				VECTOR_DOTPROD_U8(load);
			}
		} else {
			for (k = 0; k < nblock; k++) {
				zbuf32 = zero;
				for (i = 0; i < 4096; i++) {
					VECTOR_DOTPROD_U8(loadu);
				}
				buf1 = _mm_unpacklo_epi32(zbuf32, zero);
				buf2 = _mm_unpackhi_epi32(zbuf32, zero);
				zbuf64 = _mm_add_epi64(zbuf64, buf1);
				zbuf64 = _mm_add_epi64(zbuf64, buf2);
			}
			zbuf32 = zero;
			for (i = 0; i < tail; i++) {
				VECTOR_DOTPROD_U8(loadu);
			}
		}
		buf1 = _mm_unpacklo_epi32(zbuf32, zero);
		buf2 = _mm_unpackhi_epi32(zbuf32, zero);
		zbuf64 = _mm_add_epi64(zbuf64, buf1);
		zbuf64 = _mm_add_epi64(zbuf64, buf2);

		for (i = 0; i < n3; i++) {
			sum += (mlib_s32)(*px++) * (*py++);
		}

		mlib_d64 dsum = sum;
		long long pz[2];
		_mm_storeu_si128((__m128i *)pz, zbuf64);
		dsum += pz[0];
		dsum += pz[1];
		*z = dsum;
	} else {
		for (i = 0; i < n; i++) {
			sum += (mlib_s32)(*px++) * (*py++);
		}

		*z = (mlib_d64)sum;
	}

	return (MLIB_SUCCESS);
}
Example 4
  template<int pixelFormat> void
  imageFromPixels(vl::Image & image, char unsigned const * rgb, int rowStride)
  {
    vl::ImageShape const & shape = image.getShape() ;
    int blockSizeX ;
    int blockSizeY ;
    int pixelStride ;
    int imagePlaneStride = (int)shape.width * (int)shape.height ;
    __m128i shuffleRgb ;
    __m128i const shuffleL = _mm_set_epi8(0xff, 0xff, 0xff,  3,
                                          0xff, 0xff, 0xff,  2,
                                          0xff, 0xff, 0xff,  1,
                                          0xff, 0xff, 0xff,  0) ;
    __m128i const mask = _mm_set_epi32(0xff, 0xff, 0xff, 0xff) ;

    switch (pixelFormat) {
      case pixelFormatL:
        pixelStride = 1 ;
        blockSizeX = 16 ;
        blockSizeY = 4 ;
        break ;
      case pixelFormatBGR:
      case pixelFormatRGB:
        pixelStride = 3 ;
        blockSizeX = 4 ;
        blockSizeY = 4 ;
        assert(shape.depth == 3) ;
        break ;
      case pixelFormatRGBA:
      case pixelFormatBGRA:
      case pixelFormatBGRAasL:
        pixelStride = 4 ;
        blockSizeX = 4 ;
        blockSizeY = 4 ;
        assert(shape.depth == 3) ;
        break ;
      default:
        assert(false) ;
    }

    switch (pixelFormat) {
      case pixelFormatL:
        break ;

      case pixelFormatRGB:
        shuffleRgb = _mm_set_epi8(0xff, 11, 10,  9,
                                  0xff,  8,  7,  6,
                                  0xff,  5,  4,  3,
                                  0xff,  2,  1,  0) ;
        break ;

      case pixelFormatRGBA:
        shuffleRgb = _mm_set_epi8(0xff, 14, 13, 12,
                                  0xff, 10,  9,  8,
                                  0xff,  6,  5,  4,
                                  0xff,  2,  1,  0) ;
        break ;

      case pixelFormatBGR:
        shuffleRgb = _mm_set_epi8(0xff,  9, 10, 11,
                                  0xff,  6,  7,  8,
                                  0xff,  3,  4,  5,
                                  0xff,  0,  1,  2) ;
        break ;

      case pixelFormatBGRA:
        shuffleRgb = _mm_set_epi8(0xff, 12, 13, 14,
                                  0xff,  8,  9, 10,
                                  0xff,  4,  5,  6,
                                  0xff,  0,  1,  2) ;
        break ;

      case pixelFormatBGRAasL:
        shuffleRgb = _mm_set_epi8(0xff, 0xff, 0xff, 12,
                                  0xff, 0xff, 0xff, 8,
                                  0xff, 0xff, 0xff, 4,
                                  0xff, 0xff, 0xff, 0) ;
        break ;
    }

    // we pull out these values as otherwise the compiler
    // will assume that the reference &image can be aliased
    // and recompute silly multiplications in the inner loop
    float *  const __restrict imageMemory = image.getMemory() ;
    int const imageHeight = (int)shape.height ;
    int const imageWidth = (int)shape.width ;

    for (int x = 0 ; x < imageWidth ; x += blockSizeX) {
      int y = 0 ;
      float * __restrict imageMemoryX = imageMemory + x * imageHeight ;
      int bsx = (std::min)(imageWidth - x, blockSizeX) ;
      if (bsx < blockSizeX) goto boundary ;

      for ( ; y < imageHeight - blockSizeY + 1 ; y += blockSizeY) {
        char unsigned const * __restrict pixel = rgb + y * rowStride + x * pixelStride ;
        float * __restrict r = imageMemoryX + y ;
        __m128i p0, p1, p2, p3, T0, T1, T2, T3 ;

        /* convert a blockSizeX x blockSizeY block in the input image */
        switch (pixelFormat) {
          case pixelFormatRGB :
          case pixelFormatRGBA :
          case pixelFormatBGR :
          case pixelFormatBGRA :
          case pixelFormatBGRAasL :
            // load 4x4 RGB pixels
            p0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ;
            p1 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ;
            p2 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ;
            p3 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ;

            // transpose pixels as 32-bit integers (see also below)
            T0 = _mm_unpacklo_epi32(p0, p1);
            T1 = _mm_unpacklo_epi32(p2, p3);
            T2 = _mm_unpackhi_epi32(p0, p1);
            T3 = _mm_unpackhi_epi32(p2, p3);
            p0 = _mm_unpacklo_epi64(T0, T1);
            p1 = _mm_unpackhi_epi64(T0, T1);
            p2 = _mm_unpacklo_epi64(T2, T3);
            p3 = _mm_unpackhi_epi64(T2, T3);

            // store r
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ;

            if (pixelFormat == pixelFormatBGRAasL) break ;

            // store g
            r += (imageWidth - 3) * imageHeight ;
            p0 = _mm_srli_epi32 (p0, 8) ;
            p1 = _mm_srli_epi32 (p1, 8) ;
            p2 = _mm_srli_epi32 (p2, 8) ;
            p3 = _mm_srli_epi32 (p3, 8) ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ;

            // store b
            r += (imageWidth - 3) * imageHeight ;
            p0 = _mm_srli_epi32 (p0, 8) ;
            p1 = _mm_srli_epi32 (p1, 8) ;
            p2 = _mm_srli_epi32 (p2, 8) ;
            p3 = _mm_srli_epi32 (p3, 8) ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ;
            break ;

          case pixelFormatL:
            // load 4x16 L pixels
            p0 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ;
            p1 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ;
            p2 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ;
            p3 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ;

            /*
             Pixels are collected in little-endian order: the first pixel
             is at the `right' (least significant byte) of p0:

             p[0] = a, p[1] = b, ...

             p0: [ ... | ... | ... | d c b a ]
             p1: [ ... | ... | ... | h g f e ]
             p2: [ ... | ... | ... | l k j i ]
             p3: [ ... | ... | ... | p o n m ]

             The goal is to transpose four 4x4 subblocks in the
             4 x 16 pixel array. The first step interleaves individual
             pixels in p0 and p1:

             T0: [ ... | ... | h d g c | f b e a ]
             T1: [ ... | ... | p l o k | n j m i ]
             T2: [ ... | ... | ... | ... ]
             T3: [ ... | ... | ... | ... ]

             The second step interleaves groups of two pixels:

             p0: [pl hd | ok gc | nj fb | mi ea] (pixels in the rightmost 4x4 subblock)
             p1: ...
             p2: ...
             p3: ...

             The third step interleaves groups of four pixels:

             T0: [ ... | njfb | ... | miea ]
             T1: ...
             T2: ...
             T3: ...

             The last step interleaves groups of eight pixels:

             p0: [ ... | ... | ... | miea ]
             p1: [ ... | ... | ... | njfb ]
             p2: [ ... | ... | ... | okgc ]
             p3: [ ... | ... | ... | plhd ]

             */

            T0 = _mm_unpacklo_epi8(p0, p1);
            T1 = _mm_unpacklo_epi8(p2, p3);
            T2 = _mm_unpackhi_epi8(p0, p1);
            T3 = _mm_unpackhi_epi8(p2, p3);
            p0 = _mm_unpacklo_epi16(T0, T1);
            p1 = _mm_unpackhi_epi16(T0, T1);
            p2 = _mm_unpacklo_epi16(T2, T3);
            p3 = _mm_unpackhi_epi16(T2, T3);
            T0 = _mm_unpacklo_epi32(p0, p1);
            T1 = _mm_unpacklo_epi32(p2, p3);
            T2 = _mm_unpackhi_epi32(p0, p1);
            T3 = _mm_unpackhi_epi32(p2, p3);
            p0 = _mm_unpacklo_epi64(T0, T1);
            p1 = _mm_unpackhi_epi64(T0, T1);
            p2 = _mm_unpacklo_epi64(T2, T3);
            p3 = _mm_unpackhi_epi64(T2, T3);

            // store the four 4x4 subblocks
            for (int i = 0 ; i < 4 ; ++i) {
              _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p0, shuffleL))) ; r += imageHeight ;
              _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p1, shuffleL))) ; r += imageHeight ;
              _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p2, shuffleL))) ; r += imageHeight ;
              _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p3, shuffleL))) ; r += imageHeight ;
              p0 = _mm_srli_si128 (p0, 4) ;
              p1 = _mm_srli_si128 (p1, 4) ;
              p2 = _mm_srli_si128 (p2, 4) ;
              p3 = _mm_srli_si128 (p3, 4) ;
            }
            break ;
        }
      } /* next y */

    boundary:
      /* special case if there is not a full 4x4 block to process */
      for ( ; y < imageHeight ; y += blockSizeY) {
        int bsy = (std::min)(imageHeight - y, blockSizeY) ;
        float * __restrict r ;
        float * rend ;
        for (int dx = 0 ; dx < bsx ; ++dx) {
          char unsigned const * __restrict pixel = rgb + y * rowStride + (x + dx) * pixelStride ;
          r = imageMemoryX + y + dx * imageHeight ;
          rend = r + bsy ;
          while (r != rend) {
            switch (pixelFormat) {
              case pixelFormatRGBA:
              case pixelFormatRGB:
                r[0 * imagePlaneStride] = (float) pixel[0] ;
                r[1 * imagePlaneStride] = (float) pixel[1] ;
                r[2 * imagePlaneStride] = (float) pixel[2] ;
                break ;
              case pixelFormatBGR:
              case pixelFormatBGRA:
                r[2 * imagePlaneStride] = (float) pixel[0] ;
                r[1 * imagePlaneStride] = (float) pixel[1] ;
                r[0 * imagePlaneStride] = (float) pixel[2] ;
                break;
              case pixelFormatBGRAasL:
              case pixelFormatL:
                r[0] = (float) pixel[0] ;
                break ;
            }
            r += 1 ;
            pixel += rowStride ;
          }
        }
      }
    }
  }
Example 5
static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant becomes the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
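  //
  // Added worked example (not in the original comment): with x = 100,
  //      (x * K1) >> 16 = (100 * 85627) >> 16 = 130
  //      ((x * k1) >> 16) + x = ((100 * 20091) >> 16) + 100 = 30 + 100 = 130
  // so both forms agree while k1 = 20091 still fits in a signed 16-bit word.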
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two transforms
  // in parallel). In the case of only one transform, the second half of the
  // vectors will just contain random values we'll never use or store.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a =  _mm_add_epi16(dc, T2);
    const __m128i b =  _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Add inverse transform to 'dst' and store.
  {
    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
    } else {
      // Load four bytes/pixels per line.
      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
    dst1 = _mm_unpacklo_epi8(dst1, zero);
    dst2 = _mm_unpacklo_epi8(dst2, zero);
    dst3 = _mm_unpacklo_epi8(dst3, zero);
    // Add the inverse transform(s).
    dst0 = _mm_add_epi16(dst0, T0);
    dst1 = _mm_add_epi16(dst1, T1);
    dst2 = _mm_add_epi16(dst2, T2);
    dst3 = _mm_add_epi16(dst3, T3);
    // Unsigned saturate to 8b.
    dst0 = _mm_packus_epi16(dst0, dst0);
    dst1 = _mm_packus_epi16(dst1, dst1);
    dst2 = _mm_packus_epi16(dst2, dst2);
    dst3 = _mm_packus_epi16(dst3, dst3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
    } else {
      // Store four bytes/pixels per line.
      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
    }
  }
}
Example 6
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k937 = _mm_set1_epi32(937);
  const __m128i k1812 = _mm_set1_epi32(1812);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
                                            2217, 5352, 2217, 5352);
  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
                                            -5352, 2217, -5352, 2217);
  __m128i v01, v32;
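  // Added explanatory comment (hedged reading of the code, not from the
  // original source): this computes the forward 4x4 transform of the residual
  // src - ref. The first block below fuses the row pass with a transpose into
  // (v01, v32); the second block runs the column pass, applies the rounding
  // constants and stores the 16 coefficients to out[0..15].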


  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference. -> 00 01 02 03 00 00 00 00
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);


    // Unpack and shuffle
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
    const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
    const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
    // 00 01 10 11 02 03 12 13
    // 20 21 30 31 22 23 32 33
    const __m128i shuf01_p =
        _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i shuf23_p =
        _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
    // 00 01 10 11 03 02 13 12
    // 20 21 30 31 23 22 33 32
    const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
    const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
    // 00 01 10 11 20 21 30 31
    // 03 02 13 12 23 22 33 32
    const __m128i a01 = _mm_add_epi16(s01, s32);
    const __m128i a32 = _mm_sub_epi16(s01, s32);
    // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
    // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

    const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
    const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
    const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
    const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
    const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
    const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
    const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
    const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
    const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
    const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
    const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
    const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
    const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
    v01 = _mm_unpacklo_epi32(s_lo, s_hi);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
    const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
    const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
    // -> f1 = f1 + 1 - (a3 == 0)
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
    const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
    _mm_storeu_si128((__m128i*)&out[0], d0_g1);
    _mm_storeu_si128((__m128i*)&out[8], d2_f3);
  }
}
Example 7
void
transform8_otherrgb_avx(ThreadInfo* t)
{
	RS_IMAGE16 *input = t->input;
	GdkPixbuf *output = t->output;
	RS_MATRIX3 *matrix = t->matrix;
	gint x,y;
	gint width;

	float mat_ps[4*4*3] __attribute__ ((aligned (16)));
	for (x = 0; x < 4; x++ ) {
		mat_ps[x] = matrix->coeff[0][0];
		mat_ps[x+4] = matrix->coeff[0][1];
		mat_ps[x+8] = matrix->coeff[0][2];
		mat_ps[12+x] = matrix->coeff[1][0];
		mat_ps[12+x+4] = matrix->coeff[1][1];
		mat_ps[12+x+8] = matrix->coeff[1][2];
		mat_ps[24+x] = matrix->coeff[2][0];
		mat_ps[24+x+4] = matrix->coeff[2][1];
		mat_ps[24+x+8] = matrix->coeff[2][2];
	}
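	/* Added note (assumption, not from the original source): each 3x3
	 * matrix coefficient is broadcast to four consecutive floats so that
	 * sse_matrix3_mul, defined elsewhere, can apply one matrix row to four
	 * pixels at a time on the planar r/g/b vectors built below. */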
	
	int start_x = t->start_x;
	/* Always keep the input and output addresses aligned */
	if (start_x & 3)
		start_x = ((start_x) / 4) * 4;
	
	int complete_w = t->end_x - start_x;
	/* If width is not multiple of 4, check if we can extend it a bit */
	if (complete_w & 3)
	{
		if ((t->end_x+4) < input->w)
			complete_w = ((complete_w+3) / 4 * 4);
	}
	__m128 gamma = _mm_set1_ps(t->output_gamma);

	for(y=t->start_y ; y<t->end_y ; y++)
	{
		gushort *i = GET_PIXEL(input, start_x, y);
		guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
		gboolean aligned_write = !((guintptr)(o)&0xf);

		width = complete_w >> 2;

		while(width--)
		{
			/* Load and convert to float */
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_load_si128((__m128i*)i); // Load two pixels
			__m128i in2 = _mm_load_si128((__m128i*)i+1); // Load two pixels
			_mm_prefetch(i + 64, _MM_HINT_NTA);
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128i p2 =_mm_unpackhi_epi16(in, zero);
			__m128i p3 =_mm_unpacklo_epi16(in2, zero);
			__m128i p4 =_mm_unpackhi_epi16(in2, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);
			__m128 p2f  = _mm_cvtepi32_ps(p2);
			__m128 p3f  = _mm_cvtepi32_ps(p3);
			__m128 p4f  = _mm_cvtepi32_ps(p4);
			
			/* Convert to planar */
			__m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
			__m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
			__m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
			__m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
			__m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
			__m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
			__m128 b = _mm_movelh_ps(b1b0, b3b2);

			/* Apply matrix to convert to sRGB */
			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			/* Normalize to 0->1 and clamp */
			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
			g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2)));
			b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2)));

			/* Apply Gamma */
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
			b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));

			/* Convert to 8 bit unsigned  and interleave*/
			__m128i r_i = _mm_cvtps_epi32(r);
			__m128i g_i = _mm_cvtps_epi32(g);
			__m128i b_i = _mm_cvtps_epi32(b);
			
			r_i = _mm_packs_epi32(r_i, r_i);
			g_i = _mm_packs_epi32(g_i, g_i);
			b_i = _mm_packs_epi32(b_i, b_i);

			/* Set alpha value to 255 and store */
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			__m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
			__m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
			p1 = _mm_unpacklo_epi32(rg_i, bb_i);
			p2 = _mm_unpackhi_epi32(rg_i, bb_i);
	
			p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));

			if (aligned_write)
				_mm_store_si128((__m128i*)o, p1);
			else
				_mm_storeu_si128((__m128i*)o, p1);

			i += 16;
			o += 16;
		}
		/* Process remaining pixels */
		width = complete_w & 3;
		while(width--)
		{
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_loadl_epi64((__m128i*)i); // Load one pixel
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);

			/* Splat r,g,b */
			__m128 r =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(0,0,0,0));
			__m128 g =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(1,1,1,1));
			__m128 b =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(2,2,2,2));

			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			r = _mm_unpacklo_ps(r2, g2);	// GG RR GG RR
			r = _mm_movelh_ps(r, b2);		// BB BB GG RR

			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r)));
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			
			/* Convert to 8 bit unsigned */
			zero = _mm_setzero_si128();
			__m128i r_i = _mm_cvtps_epi32(r);
			/* To 16 bit signed */
			r_i = _mm_packs_epi32(r_i, zero);
			/* To 8 bit unsigned - set alpha channel*/
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero));
			*(int*)o = _mm_cvtsi128_si32(r_i);
			i+=4;
			o+=4;
		}
	}
}
Example 8
 /*
 * Notice:
 * - if nb_pkts < RTE_I40E_DESCS_PER_LOOP, no packets are returned
 * - if nb_pkts > RTE_I40E_VPMD_RX_BURST, only the DD bits of the first
 *   RTE_I40E_VPMD_RX_BURST descriptors are scanned
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0,    /* ignore non-length fields */
				-rxq->crc_len, /* sub crc on data_len */
				0,          /* ignore high-16bits of pkt_len */
				-rxq->crc_len, /* sub crc on pkt_len */
				0, 0            /* ignore pkt_type field */
			);
	/*
	 * compile-time check the above crc_adjust layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi16
	 * call above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	__m128i dd_check, eop_check;

	/* nb_pkts shall be less than or equal to RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
		3, 2,        /* octet 2~3, low 16 bits vlan_macip */
		15, 14,      /* octet 15~14, 16 bits data_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		15, 14,      /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,  /* pkt_type set as unknown */
		0xFF, 0xFF  /*pkt_type set as unknown */
		);
	/*
	 * Compile-time verify the shuffle mask
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
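	/* Added note (hedged reading of the code, not from the original source):
	 * with this layout a single shuffle of each descriptor through shuf_msk
	 * plus one 16-byte store fills the pkt_len, data_len, vlan_tci and RSS
	 * hash words of rx_descriptor_fields1; pkt_type is zeroed here and set
	 * later by desc_to_ptype_v.
	 */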

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packets' descriptors in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		__m128i descs[RTE_I40E_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
		__m128i mbp1;
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf pointers */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf pointers */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);
#endif

		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		rte_compiler_barrier();
		/* A.1 load desc[1] */
		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		rte_compiler_barrier();
		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
#endif

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned */
		const __m128i len3 = _mm_slli_epi32(descs[3], PKTLEN_SHIFT);
		const __m128i len2 = _mm_slli_epi32(descs[2], PKTLEN_SHIFT);

		/* merge the now-aligned packet length fields back in */
		descs[3] = _mm_blend_epi16(descs[3], len3, 0x80);
		descs[2] = _mm_blend_epi16(descs[2], len2, 0x80);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned */
		const __m128i len1 = _mm_slli_epi32(descs[1], PKTLEN_SHIFT);
		const __m128i len0 = _mm_slli_epi32(descs[0], PKTLEN_SHIFT);

		/* merge the now-aligned packet length fields back in */
		descs[1] = _mm_blend_epi16(descs[1], len1, 0x80);
		descs[0] = _mm_blend_epi16(descs[0], len0, 0x80);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.2 get 4 pkts staterr value  */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				 pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_I40E_DESCS_PER_LOOP;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				 pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_I40E_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
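
The loop above terminates as soon as a descriptor shows up without its DD ("descriptor done") bit set: steps C.3/C.4 mask the packed status words against dd_check and popcount the result, nb_pkts_recd is advanced by that count, and the loop stops when the count is less than RTE_I40E_DESCS_PER_LOOP. A scalar sketch of that termination logic, assuming DD bits are set by hardware in ring order (the helper name and the bit position are illustrative, not the driver's):

#include <stdint.h>

/* Count how many of the 4 descriptors in one unrolled iteration are done. */
static inline unsigned count_done_descs(const uint32_t status[4])
{
	const uint32_t dd_bit = 1u << 0;  /* assumed DD position for this sketch */
	unsigned done = 0;

	for (unsigned i = 0; i < 4; i++) {
		if (!(status[i] & dd_bit))
			break;            /* first incomplete descriptor ends the burst */
		done++;
	}
	/* The vector code obtains the same number from __builtin_popcountll()
	 * applied to the packed, masked status bits. */
	return done;
}
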
void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
  const int stride = pitch >> 1;
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0  = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
  __m128i in1  = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
  __m128i in2  = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
  __m128i in3  = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
  __m128i in4  = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
  __m128i in5  = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
  __m128i in6  = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
  __m128i in7  = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    //    division of two 16 bits signed numbers using shifts
    //    n / 2 = (n - (n >> 15)) >> 1
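    //    Worked example (assuming C arithmetic-shift semantics for negatives):
    //    n = -5: n >> 15 = -1, so (-5 - (-1)) >> 1 = -4 >> 1 = -2, i.e. -5 / 2
    //    n =  5: n >> 15 =  0, so ( 5 -   0 ) >> 1 =  5 >> 1 =  2, i.e.  5 / 2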
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
    // store results
    _mm_storeu_si128((__m128i *)(output + 0 * 8), in0);
    _mm_storeu_si128((__m128i *)(output + 1 * 8), in1);
    _mm_storeu_si128((__m128i *)(output + 2 * 8), in2);
    _mm_storeu_si128((__m128i *)(output + 3 * 8), in3);
    _mm_storeu_si128((__m128i *)(output + 4 * 8), in4);
    _mm_storeu_si128((__m128i *)(output + 5 * 8), in5);
    _mm_storeu_si128((__m128i *)(output + 6 * 8), in6);
    _mm_storeu_si128((__m128i *)(output + 7 * 8), in7);
  }
}
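
The recurring building block in the transform above is the interleave-then-_mm_madd_epi16 pattern: two rows of 16-bit values are interleaved so that each 32-bit lane holds an (a, b) pair, a single madd against a packed (c1, c2) constant produces a*c1 + b*c2 at 32-bit precision, and the result is rounded and shifted back down. A self-contained sketch of that pattern; the pair-constant helper and the 14-bit rounding amount are assumptions mirroring the usual libvpx convention, not values taken from this file:

#include <emmintrin.h>   /* SSE2 */
#include <stdint.h>

#define SKETCH_CONST_BITS 14                        /* assumed cospi scaling */
#define SKETCH_ROUNDING   (1 << (SKETCH_CONST_BITS - 1))

/* Replicate the 16-bit pair (lo, hi) into every 32-bit lane, analogous to
 * the pair_set_epi16() helper used for the constants above. */
static inline __m128i sketch_pair_epi16(int16_t lo, int16_t hi) {
  return _mm_set1_epi32((int32_t)((uint16_t)lo | ((uint32_t)(uint16_t)hi << 16)));
}

/* For the low four lanes, compute round((a[i]*c1 + b[i]*c2) / 2^14). */
static inline __m128i sketch_butterfly_lo(__m128i a, __m128i b,
                                          int16_t c1, int16_t c2) {
  const __m128i k = sketch_pair_epi16(c1, c2);
  const __m128i t = _mm_unpacklo_epi16(a, b);            // a0 b0 a1 b1 ...
  const __m128i u = _mm_madd_epi16(t, k);                // a*c1 + b*c2
  const __m128i v = _mm_add_epi32(u, _mm_set1_epi32(SKETCH_ROUNDING));
  return _mm_srai_epi32(v, SKETCH_CONST_BITS);           // back to input scale
}
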
/*
 ********************************************************************************
 *
 * @brief This function performs a 4x4 inverse Hadamard transform on the 4x4 DC coefficients
 * of a 16x16 intra prediction macroblock, and then performs scaling.
 *
 * @par Description:
 *  The DC coefficients pass through a 2-stage inverse Hadamard transform.
 *  The inverse-transformed content is then scaled based on the Qp value.
 *
 * @param[in] pi2_src
 *  input 4x4 block of DC coefficients
 *
 * @param[out] pi2_out
 *  output 4x4 block
 *
 * @param[in] pu2_iscal_mat
 *  pointer to scaling list
 *
 * @param[in] pu2_weigh_mat
 *  pointer to weight matrix
 *
 * @param[in] u4_qp_div_6
 *  Floor (qp/6)
 *
 * @param[in] pi4_tmp
 * temporary buffer of size 1*16
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src,
                                       WORD16* pi2_out,
                                       const UWORD16 *pu2_iscal_mat,
                                       const UWORD16 *pu2_weigh_mat,
                                       UWORD32 u4_qp_div_6,
                                       WORD32* pi4_tmp)
{
    int val = 0xFFFF;
    __m128i src_r0_r1, src_r2_r3, sign_reg, zero_8x16b = _mm_setzero_si128();
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6)));
    __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);

    __m128i mask = _mm_set1_epi32(val);
    UNUSED (pi4_tmp);

    mult_val = _mm_and_si128(mult_val, mask);

    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
    src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg);
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg);
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg);

    /* Perform Inverse transform */
    /*-------------------------------------------------------------*/
    /* IDCT [ Horizontal transformation ]                          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 b0 a1 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //c0 d0 c1 d1
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //a2 b2 a3 b3
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 d2 c3 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                    //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                    //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                    //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                    //a3 b3 c3 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    /*-------------------------------------------------------------*/
    /* IDCT [ Vertical transformation ]                          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 a1 b0 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //a2 a3 b2 b3
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //c0 c1 d0 d1
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 c3 d2 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                   //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                   //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                   //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                   //d0 d1 d2 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    src_r0 = _mm_and_si128(src_r0, mask);
    src_r1 = _mm_and_si128(src_r1, mask);
    src_r2 = _mm_and_si128(src_r2, mask);
    src_r3 = _mm_and_si128(src_r3, mask);

    src_r0 = _mm_madd_epi16(src_r0, mult_val);
    src_r1 = _mm_madd_epi16(src_r1, mult_val);
    src_r2 = _mm_madd_epi16(src_r2, mult_val);
    src_r3 = _mm_madd_epi16(src_r3, mult_val);

    //Scaling
    if(u4_qp_div_6 >= 6)
    {
        src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6);
        src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6);
        src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6);
        src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6);
    }
    else
    {
        temp0 = _mm_add_epi32(src_r0, add_rshift);
        temp1 = _mm_add_epi32(src_r1, add_rshift);
        temp2 = _mm_add_epi32(src_r2, add_rshift);
        temp3 = _mm_add_epi32(src_r3, add_rshift);
        src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6);
        src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6);
        src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6);
        src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6);
    }
    src_r0_r1 = _mm_packs_epi32(src_r0, src_r1);
    src_r2_r3 = _mm_packs_epi32(src_r2, src_r3);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3);
}
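
Stripped of the SIMD transposes, the function above applies the same 4-point butterfly first across rows and then across columns, followed by the Qp-dependent scaling branch. A plain scalar sketch of those two pieces (function and variable names are illustrative, not part of the library):

#include <stdint.h>

/* One 4-point inverse Hadamard butterfly, matching the add/sub pattern above. */
static void ihadamard4(const int32_t in[4], int32_t out[4])
{
    const int32_t t0 = in[0] + in[3];
    const int32_t t1 = in[1] + in[2];
    const int32_t t2 = in[1] - in[2];
    const int32_t t3 = in[0] - in[3];

    out[0] = t0 + t1;
    out[1] = t2 + t3;
    out[2] = t0 - t1;
    out[3] = t3 - t2;
}

/* Qp-dependent scaling of one coefficient, mirroring the branch above;
 * 'scale' stands for pu2_iscal_mat[0] * pu2_weigh_mat[0]. */
static int32_t scale_dc(int32_t coeff, int32_t scale, uint32_t u4_qp_div_6)
{
    const int64_t v = (int64_t)coeff * scale;

    if (u4_qp_div_6 >= 6)
        return (int32_t)(v << (u4_qp_div_6 - 6));
    return (int32_t)((v + (1 << (5 - u4_qp_div_6))) >> (6 - u4_qp_div_6));
}
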
Example No. 11
static inline void
desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
	struct rte_mbuf **rx_pkts)
{
	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	__m128i vlan0, vlan1, rss, l3_l4e;

	/* mask everything except RSS, flow director and VLAN flags
	 * bit2 is for VLAN tag, bit11 for flow director indication
	 * bit13:12 for RSS indication.
	 */
	const __m128i rss_vlan_msk = _mm_set_epi32(
			0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804);

	const __m128i cksum_mask = _mm_set_epi32(
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD);

	/* map rss and vlan type to rss hash and vlan flag */
	const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
			0, 0, 0, 0);

	const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
			0, 0, PKT_RX_FDIR, 0);

	const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			/* shift right 1 bit to make sure it does not exceed 255 */
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
			 PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
			PKT_RX_IP_CKSUM_BAD >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);

	vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]);
	vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]);
	vlan0 = _mm_unpacklo_epi64(vlan0, vlan1);

	vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
	vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);

	rss = _mm_srli_epi32(vlan1, 11);
	rss = _mm_shuffle_epi8(rss_flags, rss);

	l3_l4e = _mm_srli_epi32(vlan1, 22);
	l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e);
	/* then we shift left 1 bit */
	l3_l4e = _mm_slli_epi32(l3_l4e, 1);
	/* we need to mask out the redundant bits */
	l3_l4e = _mm_and_si128(l3_l4e, cksum_mask);

	vlan0 = _mm_or_si128(vlan0, rss);
	vlan0 = _mm_or_si128(vlan0, l3_l4e);

	/*
	 * At this point, we have the 4 sets of flags in the low 16-bits
	 * of each 32-bit value in vlan0.
	 * We want to extract these, and merge them with the mbuf init data
	 * so we can do a single 16-byte write to the mbuf to set the flags
	 * and all the other initialization fields. Extracting the
	 * appropriate flags means that we have to do a shift and blend for
	 * each mbuf before we do the write.
	 */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x10);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x10);
	rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x10);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x10);

	/* write the rearm data and the olflags in one write */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
			offsetof(struct rte_mbuf, rearm_data) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
			RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
	_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
}
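
The three _mm_set_epi8() constants above (vlan_flags, rss_flags, l3_l4e_flags) are used with _mm_shuffle_epi8 as 16-entry byte lookup tables: after masking and shifting, the relevant descriptor status bits become a small index in each 32-bit lane, and one shuffle picks the corresponding flag byte for all four packets at once. A minimal, standalone demonstration of that table-lookup idiom (the table and index values are arbitrary, not the driver's flag encodings):

#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 16-entry byte table; entry i simply holds the value i here. */
	const __m128i table = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
					    8, 9, 10, 11, 12, 13, 14, 15);
	/* One per-packet index (0..15) in the low byte of each 32-bit lane. */
	const __m128i idx = _mm_set_epi32(3, 9, 0, 12);
	/* Every byte of idx selects a byte of table; the upper bytes of each
	 * lane are zero and therefore select table[0]. */
	const __m128i flags = _mm_shuffle_epi8(table, idx);

	uint32_t out[4];
	_mm_storeu_si128((__m128i *)out, flags);
	printf("%u %u %u %u\n", (unsigned)out[0], (unsigned)out[1],
	       (unsigned)out[2], (unsigned)out[3]);   /* prints: 12 0 9 3 */
	return 0;
}
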
Example No. 12
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16(uint8_t* dest, uint8_t* orig, size_t size)
{
  size_t i, j, k;
  size_t neblock, numof16belem;
  __m128i xmm1[16], xmm2[16];

  neblock = size / 16;
  numof16belem = neblock / 16;
  for (i = 0, k = 0; i < numof16belem; i++, k += 16) {
    /* Load 256 bytes of the shuffled input into 16 XMM registers */
    for (j = 0; j < 16; j++) {
      xmm1[j] = ((__m128i *)orig)[j*numof16belem+i];
    }
    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
    }
    /* Store the result vectors in proper order */
    ((__m128i *)dest)[k+0] = xmm1[0];
    ((__m128i *)dest)[k+1] = xmm1[8];
    ((__m128i *)dest)[k+2] = xmm1[4];
    ((__m128i *)dest)[k+3] = xmm1[12];
    ((__m128i *)dest)[k+4] = xmm1[2];
    ((__m128i *)dest)[k+5] = xmm1[10];
    ((__m128i *)dest)[k+6] = xmm1[6];
    ((__m128i *)dest)[k+7] = xmm1[14];
    ((__m128i *)dest)[k+8] = xmm1[1];
    ((__m128i *)dest)[k+9] = xmm1[9];
    ((__m128i *)dest)[k+10] = xmm1[5];
    ((__m128i *)dest)[k+11] = xmm1[13];
    ((__m128i *)dest)[k+12] = xmm1[3];
    ((__m128i *)dest)[k+13] = xmm1[11];
    ((__m128i *)dest)[k+14] = xmm1[7];
    ((__m128i *)dest)[k+15] = xmm1[15];
  }
}
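
Stripped of the SIMD transposition, unshuffle16() simply undoes the byte-wise shuffle for 16-byte elements: the shuffled buffer stores byte j of every element contiguously, and unshuffling gathers each element's 16 bytes back together. A scalar reference for the region the vector loop covers (illustrative, not part of the library):

#include <stddef.h>
#include <stdint.h>

/* Scalar equivalent: byte j of output element i comes from position
 * j*neblock + i of the shuffled input, where neblock = size / 16. */
static void
unshuffle16_scalar(uint8_t* dest, const uint8_t* orig, size_t size)
{
  size_t neblock = size / 16;
  size_t i, j;

  for (i = 0; i < neblock; i++) {
    for (j = 0; j < 16; j++) {
      dest[i*16 + j] = orig[j*neblock + i];
    }
  }
}
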
Example No. 13
int main()
{
	//Transpose
	vec4 mat[4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}};

	__m128i xmm0 = _mm_unpacklo_epi32(_mm_castps_si128(mat[0]), _mm_castps_si128(mat[1]));
	__m128i xmm1 = _mm_unpackhi_epi32(_mm_castps_si128(mat[0]), _mm_castps_si128(mat[1]));
	__m128i xmm2 = _mm_unpacklo_epi32(_mm_castps_si128(mat[2]), _mm_castps_si128(mat[3]));
	__m128i xmm3 = _mm_unpackhi_epi32(_mm_castps_si128(mat[2]), _mm_castps_si128(mat[3]));

	vec4 trans[4];

	trans[0] = _mm_castsi128_ps(_mm_unpacklo_epi64(xmm0, xmm2));
	trans[1] = _mm_castsi128_ps(_mm_unpackhi_epi64(xmm0, xmm2));
	trans[2] = _mm_castsi128_ps(_mm_unpacklo_epi64(xmm1, xmm3));
	trans[3] = _mm_castsi128_ps(_mm_unpackhi_epi64(xmm1, xmm3));

	vec4 trans2[4];

	ml::transpose(trans2, mat);

	FILE* file = fopen("..\\..\\AppData\\VT.swf", "rb");
	fseek(file, 0, SEEK_END);
	size_t size = ftell(file);
	fseek(file, 0, SEEK_SET);
	unsigned char* fileData = (unsigned char*)malloc(size);
	fread(fileData, 1, size, file);
	fclose(file);

	MemReader data = {(const char*)fileData, (const char*)fileData+size, (const char*)fileData};
	
	//Read SWF header
	const u32 signatureAndVersion	= data.read<u32>();
	const u32 actualSize			= data.read<u32>();

	u32 signature = signatureAndVersion&0x00FFFFFF;
	u8	version = signatureAndVersion>>24;

	bool isCompressed   = signature=='\0SWC';
	bool isUncompressed = signature=='\0SWF';

	//if !isCompressed && !isUncompressed return error;

	MemReader data2 = {0, 0, 0};

	char* uncompressed = 0;
	if (isCompressed)
	{
		 uncompressed = (char*)malloc(actualSize-8);
		 data2.cur = data2.start = uncompressed;
		 data2.end = uncompressed+actualSize-8;
		 uLongf uncompressedSize = actualSize-8;
		 uncompress((Bytef*)uncompressed, &uncompressedSize, data.as<Bytef>(), size-8);
	}
	else if (isUncompressed)
	{
		data2.cur = data2.start = data.as<char>();
		data2.end = data2.start+actualSize-8;
	}

	
	u8 bits = data2.read<u8>();
	u8 numBits = bits>>3;

	u32 rectSizeMinusOne = (numBits*4+5)>>3;
	data2.move(rectSizeMinusOne);
	
	const u16 frameRate	 = data2.read<u16>();
	const u16 frameCount = data2.read<u16>();

	std::set<u32>	tagsUsed;
	size_t tagCount = 0;

	while (data2.cur!=data2.end)
	{
		u16 tagHeader = data2.read<u16>();
		u32 tagLength = tagHeader&0x3F;
		u32 tagType = tagHeader>>6;
		tagsUsed.insert(tagType);

		if (tagLength==0x3F)
			tagLength = data2.read<u32>();
		data2.move(tagLength);

		parseTag(tagType);

		++tagCount;
	}

	if (uncompressed) free(uncompressed);

	printf("\nProcessed %d tags\n\n", tagCount);

	printf("        Tags used        \n");
	printf("-------------------------\n");

	std::set<u32>::iterator	it  = tagsUsed.begin(),
							end = tagsUsed.end();

	for (; it!=end; ++it)
	{
		parseTag(*it);
	}

	free(fileData);
}
Example No. 14
template<class T> inline void dequantise_sse4_2_8_8_2(QuantisationMatrix *qmatrix,
                                                      int32_t *idata,
                                                      void *_odata,
                                                      int ostride) {
  T *odata = (T *)_odata;
  const int slice_width  = 8;
  const int slice_height = 8;
  const int Y = 0;
  const int X = 0;
  const int N = 0;
  T * const optr = &odata[Y*slice_height*ostride + X*slice_width];
  const int32_t * iptr = &idata[N*slice_height*slice_width];

  const __m128i D0  = LOAD_QUANTISED(&iptr[ 0], qmatrix, 0, 0); // [  0  1  2  3 ]
  const __m128i D4  = LOAD_QUANTISED(&iptr[ 4], qmatrix, 1, 1); // [  4  5  6  7 ]
  const __m128i D8  = LOAD_QUANTISED(&iptr[ 8], qmatrix, 1, 2); // [  8  9 10 11 ]
  const __m128i D12 = LOAD_QUANTISED(&iptr[12], qmatrix, 1, 3); // [ 12 13 14 15 ]
  const __m128i D16 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 1); // [ 16 17 18 19 ]
  const __m128i D20 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 1); // [ 20 21 22 23 ]
  const __m128i D24 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 1); // [ 24 25 26 27 ]
  const __m128i D28 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 1); // [ 28 29 30 31 ]

  const __m128i X0 = _mm_unpacklo_epi32(D0,  D4); // [  0  4  1  5 ]
  const __m128i Y0 = _mm_unpacklo_epi32(X0, D16); // [  0 16  4 17 ]
  const __m128i Y1 = _mm_unpackhi_epi32(X0, D16); // [  1 18  5 19 ]
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 0], Y0, Y1);

  const __m128i X1 = _mm_unpackhi_epi32(D0,  D4); // [  2  6  3  7 ]
  const __m128i Y2 = _mm_unpacklo_epi32(X1, D24); // [  2 24  6 25 ]
  const __m128i Y3 = _mm_unpackhi_epi32(X1, D24); // [  3 26  7 27 ]
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 0], Y2, Y3);


  const __m128i X2 = _mm_unpacklo_epi32(D8, D12);  // [  8 12  9 13 ]
  const __m128i Y4 = _mm_unpacklo_epi32(X2, D20);  // [  8 20 12 21 ]
  const __m128i Y5 = _mm_unpackhi_epi32(X2, D20);  // [  9 22 13 23 ]
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 0], Y4, Y5);

  const __m128i X3 = _mm_unpackhi_epi32(D8, D12);  // [ 10 14 11 15 ]
  const __m128i Y6 = _mm_unpacklo_epi32(X3, D28);  // [ 10 28 14 29 ]
  const __m128i Y7 = _mm_unpackhi_epi32(X3, D28);  // [ 11 30 15 31 ]
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 0], Y6, Y7);



  const __m128i D32 = LOAD_QUANTISED(&iptr[32], qmatrix, 2, 2); // [ 32 33 34 35 ]
  const __m128i D36 = LOAD_QUANTISED(&iptr[36], qmatrix, 2, 2); // [ 36 37 38 39 ]
  const __m128i D40 = LOAD_QUANTISED(&iptr[40], qmatrix, 2, 2); // [ 40 41 42 43 ]
  const __m128i D44 = LOAD_QUANTISED(&iptr[44], qmatrix, 2, 2); // [ 44 45 46 47 ]
  const __m128i D48 = LOAD_QUANTISED(&iptr[48], qmatrix, 2, 3); // [ 48 49 50 51 ]
  const __m128i D52 = LOAD_QUANTISED(&iptr[52], qmatrix, 2, 3); // [ 52 53 54 55 ]
  const __m128i D56 = LOAD_QUANTISED(&iptr[56], qmatrix, 2, 3); // [ 56 57 58 59 ]
  const __m128i D60 = LOAD_QUANTISED(&iptr[60], qmatrix, 2, 3); // [ 60 61 62 63 ]

  const __m128i Z0 = _mm_unpacklo_epi32(D32, D48); // [ 32 48 33 49 ]
  const __m128i Z1 = _mm_unpackhi_epi32(D32, D48); // [ 34 50 35 51 ]
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[1*ostride + 0], Z0, Z1);

  const __m128i Z2 = _mm_unpacklo_epi32(D36, D52); // [ 36 52 37 53 ]
  const __m128i Z3 = _mm_unpackhi_epi32(D36, D52); // [ 38 54 39 55 ]
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[3*ostride + 0], Z2, Z3);

  const __m128i Z4 = _mm_unpacklo_epi32(D40, D56); // [ 40 56 41 57 ]
  const __m128i Z5 = _mm_unpackhi_epi32(D40, D56); // [ 42 58 43 59 ]
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[5*ostride + 0], Z4, Z5);

  const __m128i Z6 = _mm_unpacklo_epi32(D44, D60); // [ 44 60 45 61 ]
  const __m128i Z7 = _mm_unpackhi_epi32(D44, D60); // [ 46 62 47 63 ]
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[7*ostride + 0], Z6, Z7);
}
Example No. 15
OD_SIMD_INLINE od_m256i od_mm256_unpackhi_epi32(od_m256i a, od_m256i b) {
  od_m256i r;
  r.lo = _mm_unpackhi_epi32(a.lo, b.lo);
  r.hi = _mm_unpackhi_epi32(a.hi, b.hi);
  return r;
}
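
The wrapper above just applies _mm_unpackhi_epi32 to the two halves of an emulated 256-bit register. For reference, the underlying intrinsic interleaves the upper two 32-bit lanes of its operands, which is the basic move behind every transpose in these listings; a tiny standalone check:

#include <emmintrin.h>   /* SSE2 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const __m128i a = _mm_setr_epi32(0, 1, 2, 3);   /* lanes a0 a1 a2 a3 */
  const __m128i b = _mm_setr_epi32(4, 5, 6, 7);   /* lanes b0 b1 b2 b3 */
  const __m128i hi = _mm_unpackhi_epi32(a, b);    /* -> a2 b2 a3 b3    */

  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, hi);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 2 6 3 7 */
  return 0;
}
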
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  const int stride = pitch >> 1;
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in0, in1, in2, in3;
  // Load inputs.
  {
    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
    // x = x << 4
    in0 = _mm_slli_epi16(in0, 4);
    in1 = _mm_slli_epi16(in1, 4);
    in2 = _mm_slli_epi16(in2, 4);
    in3 = _mm_slli_epi16(in3, 4);
    // if (i == 0 && input[0]) input[0] += 1;
    {
      // The mask will only contain whether the first value is zero; all
      // other comparisons will fail as something shifted by 4 (above << 4)
      // can never be equal to one. To increment in the non-zero case, we
      // add the mask and one for the first element:
      //   - if zero, mask = -1, v = v - 1 + 1 = v
      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
      in0 = _mm_add_epi16(in0, mask);
      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
    }
  }
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in3);
    const __m128i r1 = _mm_add_epi16(in1, in2);
    const __m128i r2 = _mm_sub_epi16(in1, in2);
    const __m128i r3 = _mm_sub_epi16(in0, in3);
    // Transform 1/2: Interleave to do the multiply by constants which gets us
    //                into 32 bits.
    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    // Combine and transpose
    const __m128i res0 = _mm_packs_epi32(w0, w2);
    const __m128i res1 = _mm_packs_epi32(w4, w6);
    // 00 01 02 03 20 21 22 23
    // 10 11 12 13 30 31 32 33
    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
    if (0 == pass) {
      // Extract values in the high part for second pass as transform code
      // only uses the first four values.
      in1 = _mm_unpackhi_epi64(in0, in0);
      in3 = _mm_unpackhi_epi64(in2, in2);
    } else {
      // Post-condition output and store it (v + 1) >> 2, taking advantage
      // of the fact 1/3 are stored just after 0/2.
      __m128i out01 = _mm_add_epi16(in0, kOne);
      __m128i out23 = _mm_add_epi16(in2, kOne);
      out01 = _mm_srai_epi16(out01, 2);
      out23 = _mm_srai_epi16(out23, 2);
      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
    }
  }
}
Example No. 17
/*
 * vPMD receive routine, now only accepts (nb_pkts == RTE_IXGBE_VPMD_RX_BURST)
 * in one loop
 *
 * Notice:
 * - nb_pkts < RTE_IXGBE_VPMD_RX_BURST, just return no packet
 * - nb_pkts > RTE_IXGBE_VPMD_RX_BURST, only scan RTE_IXGBE_VPMD_RX_BURST
 *   numbers of DD bit
 * - don't support ol_flags for rss and csum err
 */
static inline uint16_t
_recv_raw_pkts_vec(struct igb_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union ixgbe_adv_rx_desc *rxdp;
	struct igb_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0, 0, /* ignore non-length fields */
				0,          /* ignore high-16bits of pkt_len */
				-rxq->crc_len, /* sub crc on pkt_len */
				-rxq->crc_len, /* sub crc on data_len */
				0            /* ignore pkt_type field */
			);
	__m128i dd_check, eop_check;

	if (unlikely(nb_pkts < RTE_IXGBE_VPMD_RX_BURST))
		return 0;

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	_mm_prefetch((const void *)rxdp, _MM_HINT_T0);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act */
	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
		ixgbe_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available */
	if (!(rxdp->wb.upper.status_error &
				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
		return 0;

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
		0xFF, 0xFF,  /* skip high 16 bits vlan_macip, zero out */
		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		13, 12,      /* octet 12~13, low 16 bits pkt_len */
		13, 12,      /* octet 12~13, 16 bits data_len */
		0xFF, 0xFF   /* skip pkt_type field */
		);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/*
	 * A. load 4 packets' descriptors in one loop
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < RTE_IXGBE_VPMD_RX_BURST;
			pos += RTE_IXGBE_DESCS_PER_LOOP,
			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
		__m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		__m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */

		if (split_packet) {
			rte_prefetch0(&rx_pkts[pos]->cacheline1);
			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
		}

		/* B.1 load 2 mbuf pointers */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

		/* B.1 load 2 mbuf pointers */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);

		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		/* A.1 load pkt 1,0 desc */
		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		/* set ol_flags with packet type and vlan tag */
		desc_to_olflags_v(descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.2 get 4 pkts staterr value  */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_IXGBE_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
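
The split_packet branch above relies on two idioms worth isolating: _mm_andnot_si128 computes (~staterr) & eop_check, so a lane ends up non-zero exactly where the EOP bit was clear, and a byte shuffle then compacts one byte from each 32-bit lane into the low dword for a single 32-bit store. A standalone sketch of that compaction; lane order is kept trivial here (byte i <- lane i), whereas the driver's eop_shuf_mask also re-orders lanes because its staterr lanes are permuted:

#include <tmmintrin.h>   /* SSSE3 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Per-packet status dwords: packets 1 and 3 have bit 1 (the "EOP"
	 * bit in this sketch) set, packets 0 and 2 do not. */
	const __m128i status  = _mm_setr_epi32(0x0, 0x2, 0x0, 0x2);
	const __m128i eop_chk = _mm_set1_epi32(0x2);

	/* (~status) & eop_chk: non-zero exactly where the bit was clear. */
	const __m128i eop_bits = _mm_andnot_si128(status, eop_chk);

	/* Gather the low byte of each 32-bit lane into bytes 0..3. */
	const __m128i compact = _mm_setr_epi8(0, 4, 8, 12,
					      -1, -1, -1, -1,
					      -1, -1, -1, -1,
					      -1, -1, -1, -1);
	const __m128i packed = _mm_shuffle_epi8(eop_bits, compact);

	printf("0x%08x\n", (unsigned)_mm_cvtsi128_si32(packed)); /* 0x00020002 */
	return 0;
}
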
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  const int stride = pitch >> 1;
  int pass;
  // We need an intermediate buffer between passes.
  int16_t intermediate[256];
  int16_t *in = input;
  int16_t *out = intermediate;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 16; column_start += 8) {
      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
      __m128i step1_0, step1_1, step1_2, step1_3;
      __m128i step1_4, step1_5, step1_6, step1_7;
      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
      __m128i step3_0, step3_1, step3_2, step3_3;
      __m128i step3_4, step3_5, step3_6, step3_7;
      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
      // Load and pre-condition input.
      if (0 == pass) {
        in00  = _mm_loadu_si128((const __m128i *)(in +  0 * stride));
        in01  = _mm_loadu_si128((const __m128i *)(in +  1 * stride));
        in02  = _mm_loadu_si128((const __m128i *)(in +  2 * stride));
        in03  = _mm_loadu_si128((const __m128i *)(in +  3 * stride));
        in04  = _mm_loadu_si128((const __m128i *)(in +  4 * stride));
        in05  = _mm_loadu_si128((const __m128i *)(in +  5 * stride));
        in06  = _mm_loadu_si128((const __m128i *)(in +  6 * stride));
        in07  = _mm_loadu_si128((const __m128i *)(in +  7 * stride));
        in08  = _mm_loadu_si128((const __m128i *)(in +  8 * stride));
        in09  = _mm_loadu_si128((const __m128i *)(in +  9 * stride));
        in10  = _mm_loadu_si128((const __m128i *)(in + 10 * stride));
        in11  = _mm_loadu_si128((const __m128i *)(in + 11 * stride));
        in12  = _mm_loadu_si128((const __m128i *)(in + 12 * stride));
        in13  = _mm_loadu_si128((const __m128i *)(in + 13 * stride));
        in14  = _mm_loadu_si128((const __m128i *)(in + 14 * stride));
        in15  = _mm_loadu_si128((const __m128i *)(in + 15 * stride));
        // x = x << 2
        in00 = _mm_slli_epi16(in00, 2);
        in01 = _mm_slli_epi16(in01, 2);
        in02 = _mm_slli_epi16(in02, 2);
        in03 = _mm_slli_epi16(in03, 2);
        in04 = _mm_slli_epi16(in04, 2);
        in05 = _mm_slli_epi16(in05, 2);
        in06 = _mm_slli_epi16(in06, 2);
        in07 = _mm_slli_epi16(in07, 2);
        in08 = _mm_slli_epi16(in08, 2);
        in09 = _mm_slli_epi16(in09, 2);
        in10 = _mm_slli_epi16(in10, 2);
        in11 = _mm_slli_epi16(in11, 2);
        in12 = _mm_slli_epi16(in12, 2);
        in13 = _mm_slli_epi16(in13, 2);
        in14 = _mm_slli_epi16(in14, 2);
        in15 = _mm_slli_epi16(in15, 2);
      } else {
        in00  = _mm_loadu_si128((const __m128i *)(in +  0 * 16));
        in01  = _mm_loadu_si128((const __m128i *)(in +  1 * 16));
        in02  = _mm_loadu_si128((const __m128i *)(in +  2 * 16));
        in03  = _mm_loadu_si128((const __m128i *)(in +  3 * 16));
        in04  = _mm_loadu_si128((const __m128i *)(in +  4 * 16));
        in05  = _mm_loadu_si128((const __m128i *)(in +  5 * 16));
        in06  = _mm_loadu_si128((const __m128i *)(in +  6 * 16));
        in07  = _mm_loadu_si128((const __m128i *)(in +  7 * 16));
        in08  = _mm_loadu_si128((const __m128i *)(in +  8 * 16));
        in09  = _mm_loadu_si128((const __m128i *)(in +  9 * 16));
        in10  = _mm_loadu_si128((const __m128i *)(in + 10 * 16));
        in11  = _mm_loadu_si128((const __m128i *)(in + 11 * 16));
        in12  = _mm_loadu_si128((const __m128i *)(in + 12 * 16));
        in13  = _mm_loadu_si128((const __m128i *)(in + 13 * 16));
        in14  = _mm_loadu_si128((const __m128i *)(in + 14 * 16));
        in15  = _mm_loadu_si128((const __m128i *)(in + 15 * 16));
        // x = (x + 1) >> 2
        in00 = _mm_add_epi16(in00, kOne);
        in01 = _mm_add_epi16(in01, kOne);
        in02 = _mm_add_epi16(in02, kOne);
        in03 = _mm_add_epi16(in03, kOne);
        in04 = _mm_add_epi16(in04, kOne);
        in05 = _mm_add_epi16(in05, kOne);
        in06 = _mm_add_epi16(in06, kOne);
        in07 = _mm_add_epi16(in07, kOne);
        in08 = _mm_add_epi16(in08, kOne);
        in09 = _mm_add_epi16(in09, kOne);
        in10 = _mm_add_epi16(in10, kOne);
        in11 = _mm_add_epi16(in11, kOne);
        in12 = _mm_add_epi16(in12, kOne);
        in13 = _mm_add_epi16(in13, kOne);
        in14 = _mm_add_epi16(in14, kOne);
        in15 = _mm_add_epi16(in15, kOne);
        in00 = _mm_srai_epi16(in00, 2);
        in01 = _mm_srai_epi16(in01, 2);
        in02 = _mm_srai_epi16(in02, 2);
        in03 = _mm_srai_epi16(in03, 2);
        in04 = _mm_srai_epi16(in04, 2);
        in05 = _mm_srai_epi16(in05, 2);
        in06 = _mm_srai_epi16(in06, 2);
        in07 = _mm_srai_epi16(in07, 2);
        in08 = _mm_srai_epi16(in08, 2);
        in09 = _mm_srai_epi16(in09, 2);
        in10 = _mm_srai_epi16(in10, 2);
        in11 = _mm_srai_epi16(in11, 2);
        in12 = _mm_srai_epi16(in12, 2);
        in13 = _mm_srai_epi16(in13, 2);
        in14 = _mm_srai_epi16(in14, 2);
        in15 = _mm_srai_epi16(in15, 2);
      }
      in += 8;
      // Calculate input for the first 8 results.
      {
        input0 = _mm_add_epi16(in00, in15);
        input1 = _mm_add_epi16(in01, in14);
        input2 = _mm_add_epi16(in02, in13);
        input3 = _mm_add_epi16(in03, in12);
        input4 = _mm_add_epi16(in04, in11);
        input5 = _mm_add_epi16(in05, in10);
        input6 = _mm_add_epi16(in06, in09);
        input7 = _mm_add_epi16(in07, in08);
      }
      // Calculate input for the next 8 results.
      {
        step1_0 = _mm_sub_epi16(in07, in08);
        step1_1 = _mm_sub_epi16(in06, in09);
        step1_2 = _mm_sub_epi16(in05, in10);
        step1_3 = _mm_sub_epi16(in04, in11);
        step1_4 = _mm_sub_epi16(in03, in12);
        step1_5 = _mm_sub_epi16(in02, in13);
        step1_6 = _mm_sub_epi16(in01, in14);
        step1_7 = _mm_sub_epi16(in00, in15);
      }
      // Work on the first eight values; fdct8_1d(input, even_results);
      {
        // Add/subtract
        const __m128i q0 = _mm_add_epi16(input0, input7);
        const __m128i q1 = _mm_add_epi16(input1, input6);
        const __m128i q2 = _mm_add_epi16(input2, input5);
        const __m128i q3 = _mm_add_epi16(input3, input4);
        const __m128i q4 = _mm_sub_epi16(input3, input4);
        const __m128i q5 = _mm_sub_epi16(input2, input5);
        const __m128i q6 = _mm_sub_epi16(input1, input6);
        const __m128i q7 = _mm_sub_epi16(input0, input7);
        // Work on first four results
        {
          // Add/subtract
          const __m128i r0 = _mm_add_epi16(q0, q3);
          const __m128i r1 = _mm_add_epi16(q1, q2);
          const __m128i r2 = _mm_sub_epi16(q1, q2);
          const __m128i r3 = _mm_sub_epi16(q0, q3);
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
          // Combine
          res00 = _mm_packs_epi32(w0, w1);
          res08 = _mm_packs_epi32(w2, w3);
          res04 = _mm_packs_epi32(w4, w5);
          res12 = _mm_packs_epi32(w6, w7);
        }
        // Work on next four results
        {
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
          // dct_const_round_shift
          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
          // Combine
          const __m128i r0 = _mm_packs_epi32(s0, s1);
          const __m128i r1 = _mm_packs_epi32(s2, s3);
          // Add/subtract
          const __m128i x0 = _mm_add_epi16(q4, r0);
          const __m128i x1 = _mm_sub_epi16(q4, r0);
          const __m128i x2 = _mm_sub_epi16(q7, r1);
          const __m128i x3 = _mm_add_epi16(q7, r1);
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
          // Combine
          res02 = _mm_packs_epi32(w0, w1);
          res14 = _mm_packs_epi32(w2, w3);
          res10 = _mm_packs_epi32(w4, w5);
          res06 = _mm_packs_epi32(w6, w7);
        }
      }
      // Work on the next eight values; step1 -> odd_results
      {
        // step 2
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_2 = _mm_packs_epi32(w0, w1);
          step2_3 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_5 = _mm_packs_epi32(w0, w1);
          step2_4 = _mm_packs_epi32(w2, w3);
        }
        // step 3
        {
          step3_0 = _mm_add_epi16(step1_0, step2_3);
          step3_1 = _mm_add_epi16(step1_1, step2_2);
          step3_2 = _mm_sub_epi16(step1_1, step2_2);
          step3_3 = _mm_sub_epi16(step1_0, step2_3);
          step3_4 = _mm_sub_epi16(step1_7, step2_4);
          step3_5 = _mm_sub_epi16(step1_6, step2_5);
          step3_6 = _mm_add_epi16(step1_6, step2_5);
          step3_7 = _mm_add_epi16(step1_7, step2_4);
        }
        // step 4
        {
          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_1 = _mm_packs_epi32(w0, w1);
          step2_2 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_6 = _mm_packs_epi32(w0, w1);
          step2_5 = _mm_packs_epi32(w2, w3);
        }
        // step 5
        {
          step1_0 = _mm_add_epi16(step3_0, step2_1);
          step1_1 = _mm_sub_epi16(step3_0, step2_1);
          step1_2 = _mm_sub_epi16(step3_3, step2_2);
          step1_3 = _mm_add_epi16(step3_3, step2_2);
          step1_4 = _mm_add_epi16(step3_4, step2_5);
          step1_5 = _mm_sub_epi16(step3_4, step2_5);
          step1_6 = _mm_sub_epi16(step3_7, step2_6);
          step1_7 = _mm_add_epi16(step3_7, step2_6);
        }
        // step 6
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res01 = _mm_packs_epi32(w0, w1);
          res09 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res05 = _mm_packs_epi32(w0, w1);
          res13 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res11 = _mm_packs_epi32(w0, w1);
          res03 = _mm_packs_epi32(w2, w3);
        }
        {
          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res15 = _mm_packs_epi32(w0, w1);
          res07 = _mm_packs_epi32(w2, w3);
        }
      }
      // Transpose the results, do it as two 8x8 transposes.
      {
        // 00 01 02 03 04 05 06 07
        // 10 11 12 13 14 15 16 17
        // 20 21 22 23 24 25 26 27
        // 30 31 32 33 34 35 36 37
        // 40 41 42 43 44 45 46 47
        // 50 51 52 53 54 55 56 57
        // 60 61 62 63 64 65 66 67
        // 70 71 72 73 74 75 76 77
        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
        // 00 10 01 11 02 12 03 13
        // 20 30 21 31 22 32 23 33
        // 04 14 05 15 06 16 07 17
        // 24 34 25 35 26 36 27 37
        // 40 50 41 51 42 52 43 53
        // 60 70 61 71 62 72 63 73
        // 44 54 45 55 46 56 47 57
        // 64 74 65 75 66 76 67 77
        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
        // 00 10 20 30 01 11 21 31
        // 40 50 60 70 41 51 61 71
        // 02 12 22 32 03 13 23 33
        // 42 52 62 72 43 53 63 73
        // 04 14 24 34 05 15 25 35
        // 44 54 64 74 45 55 65 75
        // 06 16 26 36 07 17 27 37
        // 46 56 66 76 47 57 67 77
        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
        // 00 10 20 30 40 50 60 70
        // 01 11 21 31 41 51 61 71
        // 02 12 22 32 42 52 62 72
        // 03 13 23 33 43 53 63 73
        // 04 14 24 34 44 54 64 74
        // 05 15 25 35 45 55 65 75
        // 06 16 26 36 46 56 66 76
        // 07 17 27 37 47 57 67 77
        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
      }
      {
        // 00 01 02 03 04 05 06 07
        // 10 11 12 13 14 15 16 17
        // 20 21 22 23 24 25 26 27
        // 30 31 32 33 34 35 36 37
        // 40 41 42 43 44 45 46 47
        // 50 51 52 53 54 55 56 57
        // 60 61 62 63 64 65 66 67
        // 70 71 72 73 74 75 76 77
        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
        // 00 10 01 11 02 12 03 13
        // 20 30 21 31 22 32 23 33
        // 04 14 05 15 06 16 07 17
        // 24 34 25 35 26 36 27 37
        // 40 50 41 51 42 52 43 53
        // 60 70 61 71 62 72 63 73
        // 44 54 45 55 46 56 47 57
        // 64 74 65 75 66 76 67 77
        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
        // 00 10 20 30 01 11 21 31
        // 40 50 60 70 41 51 61 71
        // 02 12 22 32 03 13 23 33
        // 42 52 62 72 43 53 63 73
        // 04 14 24 34 05 15 25 35
        // 44 54 64 74 45 55 65 75
        // 06 16 26 36 07 17 27 37
        // 46 56 66 76 47 57 67 77
        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
        // 00 10 20 30 40 50 60 70
        // 01 11 21 31 41 51 61 71
        // 02 12 22 32 42 52 62 72
        // 03 13 23 33 43 53 63 73
        // 04 14 24 34 44 54 64 74
        // 05 15 25 35 45 55 65 75
        // 06 16 26 36 46 56 66 76
        // 07 17 27 37 47 57 67 77
        // Store results
        _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
        _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
        _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
        _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
        _mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
        _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
        _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
        _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
      }
      out += 8*16;
    }
    // Setup in/out for next pass.
    in = intermediate;
    out = output;
  }
}
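Every "dct_const_round_shift" pair above (add k__DCT_CONST_ROUNDING, then arithmetic shift right by DCT_CONST_BITS) is the vector form of the same scalar rounding helper, applied four 32-bit lanes at a time. A minimal sketch, assuming the usual libvpx value DCT_CONST_BITS == 14 (so k__DCT_CONST_ROUNDING holds 1 << 13 in every lane):

/* Round-to-nearest right shift applied after each _mm_madd_epi16 stage. */
static int dct_const_round_shift(int input) {
  return (input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}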
Example n. 19
0
// Hadamard transform
// Returns the difference between the weighted sums of the absolute values of
// the transformed coefficients of inA and inB.
static int TTransform(const uint8_t* inA, const uint8_t* inB,
                      const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();

  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
    // a00 b00 a01 b01 a02 b02 a03 b03   0 0 0 0 0 0 0 0
    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0

    // Transpose the two 4x4, discarding the filling zeroes.
    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

    // Convert to 16b.
    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Vertical pass and difference of weighted sums.
  {
    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);

      // b = abs(b) = (b ^ sign) - sign
      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
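For reference, a scalar sketch of what each half of the pair computes; the SSE2 code above runs the inA and inB blocks in parallel and returns sumA - sumB. The sketch assumes BPS is the row stride used by the loads above and that w is stored row-major; the function name is invented for illustration and the result is not guaranteed bit-exact against the vector path.

#include <stdint.h>
#include <stdlib.h>  /* abs */

static int THadamardWeightedSum(const uint8_t* in, const uint16_t* w) {
  int tmp[16], sum = 0, i;
  /* Horizontal butterflies, one input row per iteration. */
  for (i = 0; i < 4; ++i, in += BPS) {
    const int a0 = in[0] + in[2], a1 = in[1] + in[3];
    const int a2 = in[1] - in[3], a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  /* Vertical butterflies plus weighted sum of absolute values, one column per iteration. */
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[8 + i], a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i], a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[ 0 + i] * abs(a0 + a1);
    sum += w[ 4 + i] * abs(a3 + a2);
    sum += w[ 8 + i] * abs(a3 - a2);
    sum += w[12 + i] * abs(a0 - a1);
  }
  return sum;  /* TTransform above is roughly sum(inA, w) - sum(inB, w) */
}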
Example n. 20
0
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k7500 = _mm_set1_epi32(7500);
  const __m128i k14500 = _mm_set1_epi32(14500);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);

  __m128i v01, v32;

  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference.
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

    // Transpose.
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // a02 a12 a22 a32   a03 a13 a23 a33
    // a00 a10 a20 a30   a01 a11 a21 a31
    // a03 a13 a23 a33   a02 a12 a22 a32
  }

  // First pass and subsequent transpose.
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // b0 = (a0 + a3) << 3
    // b1 = (a1 + a2) << 3
    // b3 = (a0 - a3) << 3
    // b2 = (a1 - a2) << 3
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i b01 = _mm_slli_epi16(a01, 3);
    const __m128i b32 = _mm_slli_epi16(a32, 3);
    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);

    // e0 = b0 + b1
    // e2 = b0 - b1
    const __m128i e0 = _mm_add_epi16(b01, b11);
    const __m128i e2 = _mm_sub_epi16(b01, b11);
    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);

    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k14500);
    const __m128i d3 = _mm_add_epi32(c3, k7500);
    const __m128i e1 = _mm_srai_epi32(d1, 12);
    const __m128i e3 = _mm_srai_epi32(d3, 12);
    const __m128i e13 = _mm_packs_epi32(e1, e3);

    // Transpose.
    // 00 01 02 03  20 21 22 23
    // 10 11 12 13  30 31 32 33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // 02 12 22 32   03 13 23 33
    // 00 10 20 30   01 11 21 31
    // 03 13 23 33   02 12 22 32
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i b0 = _mm_add_epi16(a01, a11);
    const __m128i b2 = _mm_sub_epi16(a01, a11);
    const __m128i c0 = _mm_add_epi16(b0, seven);
    const __m128i c2 = _mm_add_epi16(b2, seven);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    _mm_storel_epi64((__m128i*)&out[ 0], d0);
    _mm_storel_epi64((__m128i*)&out[ 4], g1);
    _mm_storel_epi64((__m128i*)&out[ 8], d2);
    _mm_storel_epi64((__m128i*)&out[12], f3);
  }
}
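Putting the formulas from the comments together, a scalar sketch of the same two-pass forward transform; BPS is again assumed to be the row stride, the intermediate names are mine, and this is a reconstruction from the comments rather than a verified bit-exact equivalent of the SSE2 path.

#include <stdint.h>

static void FTransformScalarSketch(const uint8_t* src, const uint8_t* ref,
                                   int16_t* out) {
  int tmp[16];
  int i;
  /* First pass: per row of the src-ref difference block. */
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int b0 = (d0 + d3) << 3;
    const int b1 = (d1 + d2) << 3;
    const int b2 = (d1 - d2) << 3;
    const int b3 = (d0 - d3) << 3;
    tmp[0 + i * 4] = b0 + b1;                                /* e0 */
    tmp[1 + i * 4] = (b3 * 5352 + b2 * 2217 + 14500) >> 12;  /* e1 */
    tmp[2 + i * 4] = b0 - b1;                                /* e2 */
    tmp[3 + i * 4] = (b3 * 2217 - b2 * 5352 +  7500) >> 12;  /* e3 */
  }
  /* Second pass: per column of the intermediate block. */
  for (i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[ 8 + i];
    const int a2 = tmp[4 + i] - tmp[ 8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    out[ 0 + i] = (int16_t)((a0 + a1 + 7) >> 4);                                   /* d0 */
    out[ 4 + i] = (int16_t)(((a3 * 5352 + a2 * 2217 + 12000) >> 16) + (a3 != 0));  /* g1 */
    out[ 8 + i] = (int16_t)((a0 - a1 + 7) >> 4);                                   /* d2 */
    out[12 + i] = (int16_t)((a3 * 2217 - a2 * 5352 + 51000) >> 16);                /* f3 */
  }
}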
Example n. 21
0
static inline uint16_t
fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union fm10k_rx_desc *rxdp;
	struct rte_mbuf **mbufp;
	uint16_t nb_pkts_recd;
	int pos;
	struct fm10k_rx_queue *rxq = rx_queue;
	uint64_t var;
	__m128i shuf_msk;
	__m128i dd_check, eop_check;
	uint16_t next_dd;

	next_dd = rxq->next_dd;

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->hw_ring + next_dd;

	_mm_prefetch((const void *)rxdp, _MM_HINT_T0);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_FM10K_RXQ_REARM_THRESH)
		fm10k_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->d.staterr & FM10K_RXD_STATUS_DD))
		return 0;

	/* Vector RX will process 4 packets at a time, strip the unaligned
	 * tails in case it's not a multiple of 4.
	 */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_FM10K_DESCS_PER_LOOP);

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
		13, 12,      /* octet 12~13, 16 bits data_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		13, 12,      /* octet 12~13, low 16 bits pkt_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_type */
		0xFF, 0xFF   /* Skip pkt_type field in shuffle operation */
		);
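	/* Note: in _mm_shuffle_epi8 any control byte with its most significant bit
	 * set (the 0xFF entries above) writes zero to that destination byte, which
	 * is how the pkt_type field and the high half of pkt_len are cleared by the
	 * same shuffle that rearranges the descriptor words.
	 */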

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	mbufp = &rxq->sw_ring[next_dd];

	/* A. load 4 packets' descriptors in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from the sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_FM10K_DESCS_PER_LOOP,
			rxdp += RTE_FM10K_DESCS_PER_LOOP) {
		__m128i descs0[RTE_FM10K_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		__m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */

		/* B.1 load 1 mbuf point */
		mbp1 = _mm_loadu_si128((__m128i *)&mbufp[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs0[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

		/* B.1 load 1 mbuf point */
		mbp2 = _mm_loadu_si128((__m128i *)&mbufp[pos+2]);

		descs0[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		/* A.1 load 2 pkts desc */
		descs0[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		descs0[0] = _mm_loadu_si128((__m128i *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs0[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs0[2], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs0[3], descs0[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs0[1], descs0[0]);

		/* set ol_flags with vlan packet type */
		fm10k_desc_to_olflags_v(descs0, &rx_pkts[pos]);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs0[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs0[0], shuf_msk);

		/* C.2 get 4 pkts staterr value  */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				pkt_mb3);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_FM10K_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		fm10k_desc_to_pktype_v(descs0, &rx_pkts[pos]);

		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_FM10K_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->next_dd = (uint16_t)(rxq->next_dd + nb_pkts_recd);
	rxq->next_dd = (uint16_t)(rxq->next_dd & (rxq->nb_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
Example n. 22
0
static void tranpose8x8(__m128i *input, int i_indx, __m128i *Transposed, int t_indx)
{
__m128i a, b, c, d, e, f, g, h;
__m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
__m128i temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
__m128i T0, T1, T2, T3, T4, T5, T6, T7;


a = _mm_load_si128(&input[i_indx]);
b = _mm_load_si128(&input[i_indx+4 ]);
c = _mm_load_si128(&input[i_indx+8 ]);
d = _mm_load_si128(&input[i_indx+12]);
e = _mm_load_si128(&input[i_indx+16]);
f = _mm_load_si128(&input[i_indx+20]);
g = _mm_load_si128(&input[i_indx+24]);
h = _mm_load_si128(&input[i_indx+28]);
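/* Note: consecutive rows are loaded 4 __m128i (32 int16 values) apart, so this
 * apparently transposes one 8x8 tile of a wider row-major int16 buffer, with
 * i_indx selecting the source tile and t_indx the (contiguous) destination. */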


 temp1 = _mm_unpacklo_epi16(a, b); //a03b03
 temp2 = _mm_unpacklo_epi16(c, d);
 temp3 = _mm_unpacklo_epi16(e, f);
 temp4 = _mm_unpacklo_epi16(g, h);
 temp5 = _mm_unpackhi_epi16(a, b);
 temp6 = _mm_unpackhi_epi16(c, d);
 temp7 = _mm_unpackhi_epi16(e, f);
 temp8 = _mm_unpackhi_epi16(g, h);

 temp9 = _mm_unpacklo_epi32(temp1, temp2); //a01b01c01d01
 temp10 = _mm_unpackhi_epi32(temp1, temp2);
 temp11 = _mm_unpacklo_epi32(temp3, temp4);
 temp12 = _mm_unpackhi_epi32(temp3, temp4);
 temp13 = _mm_unpacklo_epi32(temp5, temp6);
 temp14 = _mm_unpackhi_epi32(temp5, temp6);
 temp15 = _mm_unpacklo_epi32(temp7, temp8);
 temp16 = _mm_unpackhi_epi32(temp7, temp8);
 
 T0 = _mm_unpacklo_epi64(temp9, temp11);  //a0b0c0d0e0f0g0h0
 T1 = _mm_unpackhi_epi64(temp9, temp11);
 T2 = _mm_unpacklo_epi64(temp10, temp12);
 T3 = _mm_unpackhi_epi64(temp10, temp12);
 T4 = _mm_unpacklo_epi64(temp13, temp15);
 T5 = _mm_unpackhi_epi64(temp13, temp15);
 T6 = _mm_unpacklo_epi64(temp14, temp16);
 T7 = _mm_unpackhi_epi64(temp14, temp16);


_mm_store_si128(&Transposed[t_indx], T0);   //store transposed 8X8 matrix
_mm_store_si128(&Transposed[t_indx+1], T1);
_mm_store_si128(&Transposed[t_indx+2], T2);
_mm_store_si128(&Transposed[t_indx+3], T3);
_mm_store_si128(&Transposed[t_indx+4], T4);
_mm_store_si128(&Transposed[t_indx+5], T5);
_mm_store_si128(&Transposed[t_indx+6], T6);
_mm_store_si128(&Transposed[t_indx+7], T7);

}
mlib_status
mlib_VideoColorJFIFYCC2RGB444_S16_naligned(
	mlib_s16 *rgb,
	const mlib_s16 *y,
	const mlib_s16 *cb,
	const mlib_s16 *cr,
	mlib_s32 n)
{
	/* 0 & 1.402*16384 */
	const __m128i x_c1 = _mm_setr_epi16(0, 22970, 0, 22970,
		0, 22970, 0, 22970);

	/* -0.34414*16384 & -0.71414*16384 */
	const __m128i x_c2 = _mm_setr_epi16(-5638, -11700, -5638, -11700,
		-5638, -11700, -5638, -11700);

	/* 1.772*16384 & 0 */
	const __m128i x_c3 = _mm_setr_epi16(29032, 0, 29032, 0,
		29032, 0, 29032, 0);

	const __m128i x_coff = _mm_set1_epi16(2048);
	const __m128i x_cps1 = _mm_set1_epi32(0x8000);
	const __m128i x_cps2 = _mm_set1_epi16(0x8000);
	const __m128i x_zero = _mm_setzero_si128();
	const __m128i x_mask1 = _mm_setr_epi32(0xffffffff, 0xffff, 0, 0);
	const __m128i x_mask2 = _mm_setr_epi32(0, 0xffff0000, 0xffffffff, 0);

	/* __m128i variables */
	__m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_y1, x_y2;
	__m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2, x_t1, x_t2;
	__m128i x_rgbl, x_rgbh, x_rgl, x_rgh, x_bbl, x_bbh;
	__m128i x_cbcr1, x_cbcr2;

	/* pointers */
	__m128i *px_y, *px_cb, *px_cr;
	mlib_s16 *prgb;

	/* other var */
	mlib_d64 fr, fg, fb, fy, fcb, fcr;
	mlib_s32 i;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	prgb = rgb;
	i = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; i <= n - 16; i += 8)	{
		x_y = _mm_loadu_si128(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		x_y2 = _mm_unpackhi_epi16(x_y, x_zero);
		x_y2 = _mm_slli_epi32(x_y2, 4);
		px_y++;
		x_cb = _mm_loadu_si128(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb++;
		x_cr = _mm_loadu_si128(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr++;
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);
		x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		x_t2 = _mm_madd_epi16(x_cbcr2, x_c1);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_r2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c2);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_g2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c3);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_b2 = _mm_add_epi32(x_t2, x_y2);

		/* signed pack & shift */
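		/* Bias trick: subtracting 0x8000 before the signed pack and adding it back
		 * (mod 2^16) afterwards makes _mm_packs_epi32 act as an unsigned saturating
		 * pack to [0, 65535]; the logical right shift by 4 then gives the final
		 * 12-bit samples. */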
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r2 = _mm_sub_epi32(x_r2, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_r2);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);

		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g2 = _mm_sub_epi32(x_g2, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_g2);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);

		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b2 = _mm_sub_epi32(x_b2, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_b2);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_rgh = _mm_unpackhi_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);
		x_bbh = _mm_unpackhi_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbh);

		x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbh);
	}

	if (i <= (n - 8)) {
		x_y = _mm_loadu_si128(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		x_y2 = _mm_unpackhi_epi16(x_y, x_zero);
		x_y2 = _mm_slli_epi32(x_y2, 4);
		px_y++;
		x_cb = _mm_loadu_si128(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb++;
		x_cr = _mm_loadu_si128(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr++;
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);
		x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		x_t2 = _mm_madd_epi16(x_cbcr2, x_c1);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_r2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c2);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_g2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c3);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_b2 = _mm_add_epi32(x_t2, x_y2);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r2 = _mm_sub_epi32(x_r2, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_r2);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);

		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g2 = _mm_sub_epi32(x_g2, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_g2);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);

		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b2 = _mm_sub_epi32(x_b2, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_b2);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_rgh = _mm_unpackhi_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);
		x_bbh = _mm_unpackhi_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbh);

		x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh);
		PACK_RGB2(x_rgbh);

		i += 8;
	}

	if (i <= (n - 4)) {
		x_y = _mm_loadl_epi64(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		px_y = (__m128i *)(((__m64 *)px_y) + 1);
		x_cb = _mm_loadl_epi64(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb = (__m128i *)(((__m64 *)px_cb) + 1);
		x_cr = _mm_loadl_epi64(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr = (__m128i *)(((__m64 *)px_cr) + 1);
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_zero);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);

		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_zero);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);

		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_zero);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB2(x_rgbh);

		i += 4;
	}

	/* pure C implementation */
	for (; i < n; i++) {
		fy = y[i] * SCALE - SAT;
		fcb = (mlib_d64)((cb[i] - 2048) << 20);
		fcr = (mlib_d64)((cr[i] - 2048) << 20);
		fr = fy + 1.40200f * fcr;
		fg = fy - 0.34414f * fcb - 0.71414f * fcr;
		fb = fy + 1.77200f * fcb;
		rgb[3 * i] = CLAMP_U12(fr);
		rgb[3 * i + 1] = CLAMP_U12(fg);
		rgb[3 * i + 2] = CLAMP_U12(fb);
	}

	return (MLIB_SUCCESS);
}
Example n. 24
0
static void trans_g_aiT16(__m128i *input, __m128i *Transposed)
{

__m128i a, b, c, d, e, f, g, h;
__m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
__m128i temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
__m128i T0, T1, T2, T3, T4, T5, T6, T7;

a = _mm_load_si128(&input[2]);
b = _mm_load_si128(&input[6]);
c = _mm_load_si128(&input[10]);
d = _mm_load_si128(&input[14]);
e = _mm_load_si128(&input[18]);
f = _mm_load_si128(&input[22]);
g = _mm_load_si128(&input[26]);
h = _mm_load_si128(&input[30]);


//store 128 bits of integer data into the memory address given
_mm_store_si128(&Transposed[0], a);   //store these rows as-is (not transposed)
_mm_store_si128(&Transposed[1], b);
_mm_store_si128(&Transposed[2], c);
_mm_store_si128(&Transposed[3], d);
_mm_store_si128(&Transposed[4], e);
_mm_store_si128(&Transposed[5], f);
_mm_store_si128(&Transposed[6], g);
_mm_store_si128(&Transposed[7], h);


//load matrix input[0][0],[2][0]...

 a = _mm_load_si128(&input[0]);
 b = _mm_load_si128(&input[4]);
 c = _mm_load_si128(&input[8]);
 d = _mm_load_si128(&input[12]);
 e = _mm_load_si128(&input[16]);
 f = _mm_load_si128(&input[20]);
 g = _mm_load_si128(&input[24]);
 h = _mm_load_si128(&input[28]);

 temp1 = _mm_unpacklo_epi16(a, b);
 temp2 = _mm_unpacklo_epi16(c, d);
 temp3 = _mm_unpacklo_epi16(e, f);
 temp4 = _mm_unpacklo_epi16(g, h);
 temp5 = _mm_unpackhi_epi16(a, b);
 temp6 = _mm_unpackhi_epi16(c, d);
 temp7 = _mm_unpackhi_epi16(e, f);
 temp8 = _mm_unpackhi_epi16(g, h);

 temp9 = _mm_unpacklo_epi32(temp1, temp2);
 temp10 = _mm_unpackhi_epi32(temp1, temp2);
 temp11 = _mm_unpacklo_epi32(temp3, temp4);
 temp12 = _mm_unpackhi_epi32(temp3, temp4);
 temp13 = _mm_unpacklo_epi32(temp5, temp6);
 temp14 = _mm_unpackhi_epi32(temp5, temp6);
 temp15 = _mm_unpacklo_epi32(temp7, temp8);
 temp16 = _mm_unpackhi_epi32(temp7, temp8);

 T0 = _mm_unpacklo_epi64(temp9, temp11);
 T1 = _mm_unpackhi_epi64(temp9, temp11);
 T2 = _mm_unpacklo_epi64(temp10, temp12);
 T3 = _mm_unpackhi_epi64(temp10, temp12);
  
_mm_store_si128(&Transposed[8], T0);   //store transposed 8X8 matrix
_mm_store_si128(&Transposed[9], T1);
_mm_store_si128(&Transposed[10], T2);
_mm_store_si128(&Transposed[11], T3);

}
Example n. 25
0
void ulsch_channel_compensation(int **rxdataF_ext,
				int **ul_ch_estimates_ext,
				int **ul_ch_mag,
				int **ul_ch_magb,
				int **rxdataF_comp,
				LTE_DL_FRAME_PARMS *frame_parms,
				unsigned char symbol,
				unsigned char Qm,
				unsigned short nb_rb,
				unsigned char output_shift) {
  
  unsigned short rb;
  __m128i *ul_ch128,*ul_ch_mag128,*ul_ch_mag128b,*rxdataF128,*rxdataF_comp128;
  unsigned char aarx;//,symbol_mod;


  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

#ifndef __SSE3__
  zeroU = _mm_xor_si128(zeroU,zeroU);
#endif

  //    printf("comp: symbol %d\n",symbol);

  
  if (Qm == 4)
    QAM_amp128U = _mm_set1_epi16(QAM16_n1);
  else if (Qm == 6) {
    QAM_amp128U  = _mm_set1_epi16(QAM64_n1);
    QAM_amp128bU = _mm_set1_epi16(QAM64_n2);
  }
  for (aarx=0;aarx<frame_parms->nb_antennas_rx;aarx++) {
    
    ul_ch128          = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128      = (__m128i *)&ul_ch_mag[aarx][symbol*frame_parms->N_RB_DL*12];
    ul_ch_mag128b     = (__m128i *)&ul_ch_magb[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF128        = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
    rxdataF_comp128   = (__m128i *)&rxdataF_comp[aarx][symbol*frame_parms->N_RB_DL*12];


    for (rb=0;rb<nb_rb;rb++) {
      //      printf("comp: symbol %d rb %d\n",symbol,rb);
#ifdef OFDMA_ULSCH
      if (Qm>2) {  
	// get channel amplitude if not QPSK

	mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
	
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
	
	mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]);
	mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
	mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);
	
	ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b[0] = ul_ch_mag128[0];
	ul_ch_mag128[0] = _mm_mulhi_epi16(ul_ch_mag128[0],QAM_amp128U);
	ul_ch_mag128[0] = _mm_slli_epi16(ul_ch_mag128[0],2);  // 2 to compensate the scale channel estimate
	ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128b[1] = ul_ch_mag128[1];
	ul_ch_mag128[1] = _mm_mulhi_epi16(ul_ch_mag128[1],QAM_amp128U);
	ul_ch_mag128[1] = _mm_slli_epi16(ul_ch_mag128[1],2);  // 2 to compensate the scale channel estimate
	
	mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]);
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
	mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);
	
	ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);
	ul_ch_mag128b[2] = ul_ch_mag128[2];
	
	ul_ch_mag128[2] = _mm_mulhi_epi16(ul_ch_mag128[2],QAM_amp128U);
	ul_ch_mag128[2] = _mm_slli_epi16(ul_ch_mag128[2],2); // 2 to compensate the scale channel estimate	  
	
	
	ul_ch_mag128b[0] = _mm_mulhi_epi16(ul_ch_mag128b[0],QAM_amp128bU);
        ul_ch_mag128b[0] = _mm_slli_epi16(ul_ch_mag128b[0],2); // shift left by 2 to compensate for the channel estimate scaling
	
	
	ul_ch_mag128b[1] = _mm_mulhi_epi16(ul_ch_mag128b[1],QAM_amp128bU);
        ul_ch_mag128b[1] = _mm_slli_epi16(ul_ch_mag128b[1],2); // shift left by 2 to compensate for the channel estimate scaling
	
	ul_ch_mag128b[2] = _mm_mulhi_epi16(ul_ch_mag128b[2],QAM_amp128bU);
        ul_ch_mag128b[2] = _mm_slli_epi16(ul_ch_mag128b[2],2); // shift left by 2 to compensate for the channel estimate scaling
	
      }
#else

	mmtmpU0 = _mm_madd_epi16(ul_ch128[0],ul_ch128[0]);
	
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1);
	
	mmtmpU1 = _mm_madd_epi16(ul_ch128[1],ul_ch128[1]);
	mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift-1);
	mmtmpU0 = _mm_packs_epi32(mmtmpU0,mmtmpU1);
	
	ul_ch_mag128[0] = _mm_unpacklo_epi16(mmtmpU0,mmtmpU0);
	ul_ch_mag128[1] = _mm_unpackhi_epi16(mmtmpU0,mmtmpU0);
	
	mmtmpU0 = _mm_madd_epi16(ul_ch128[2],ul_ch128[2]);
	mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift-1);
	mmtmpU1 = _mm_packs_epi32(mmtmpU0,mmtmpU0);
	ul_ch_mag128[2] = _mm_unpacklo_epi16(mmtmpU1,mmtmpU1);

	//	printf("comp: symbol %d rb %d => %d,%d,%d\n",symbol,rb,*((short*)&ul_ch_mag128[0]),*((short*)&ul_ch_mag128[1]),*((short*)&ul_ch_mag128[2]));	
#endif          
      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[0],rxdataF128[0]);
      //	print_ints("re",&mmtmpU0);
      
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[0],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)&conjugate[0]);
      //	print_ints("im",&mmtmpU1);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[0]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      //	print_ints("re(shift)",&mmtmpU0);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      //	print_ints("im(shift)",&mmtmpU1);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      //       	print_ints("c0",&mmtmpU2);
      //	print_ints("c1",&mmtmpU3);
      rxdataF_comp128[0] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[0]);
      //      	print_shorts("ch:",ul_ch128[0]);
      //      	print_shorts("pack:",rxdataF_comp128[0]);
      
      // multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[1],rxdataF128[1]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[1],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[1]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128[1] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[1]);
      //      	print_shorts("ch:",ul_ch128[1]);
      //      	print_shorts("pack:",rxdataF_comp128[1]);	
      //       multiply by conjugated channel
      mmtmpU0 = _mm_madd_epi16(ul_ch128[2],rxdataF128[2]);
      // mmtmpU0 contains real part of 4 consecutive outputs (32-bit)
      mmtmpU1 = _mm_shufflelo_epi16(ul_ch128[2],_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_shufflehi_epi16(mmtmpU1,_MM_SHUFFLE(2,3,0,1));
      mmtmpU1 = _mm_sign_epi16(mmtmpU1,*(__m128i*)conjugate);
      mmtmpU1 = _mm_madd_epi16(mmtmpU1,rxdataF128[2]);
      // mmtmpU1 contains imag part of 4 consecutive outputs (32-bit)
      mmtmpU0 = _mm_srai_epi32(mmtmpU0,output_shift);
      mmtmpU1 = _mm_srai_epi32(mmtmpU1,output_shift);
      mmtmpU2 = _mm_unpacklo_epi32(mmtmpU0,mmtmpU1);
      mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);
      
      rxdataF_comp128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
      //      	print_shorts("rx:",rxdataF128[2]);
      //      	print_shorts("ch:",ul_ch128[2]);
      //        print_shorts("pack:",rxdataF_comp128[2]);
      
      ul_ch128+=3;
      ul_ch_mag128+=3;
      ul_ch_mag128b+=3;
      rxdataF128+=3;
      rxdataF_comp128+=3;
      
    }
  }


  _mm_empty();
  _m_empty();

}     
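
The inner loop above is, per 128-bit vector, a fixed-point multiply by the conjugated channel: _mm_madd_epi16 against the channel yields the real part, and the same madd against a re/im-swapped, sign-flipped copy of the channel yields the imaginary part. A minimal sketch of that single step for four complex int16 samples follows; the helper name and the shift handling are assumptions, not code from the function above.

#include <tmmintrin.h>  /* SSE2 + SSSE3 (_mm_sign_epi16) */

/* Hypothetical helper: y = conj(ch) * rx for four complex int16 samples
 * stored as interleaved (re,im) pairs, rescaled by an arithmetic shift. */
static inline __m128i cplx_conj_mul_epi16(__m128i ch, __m128i rx, int shift)
{
  /* elements 0, 2, 4, 6 negative: flips the copied imaginary parts below */
  const __m128i conj = _mm_set_epi16(1, -1, 1, -1, 1, -1, 1, -1);

  /* real part: ch_re*rx_re + ch_im*rx_im (madd sums adjacent 16-bit products) */
  __m128i re = _mm_madd_epi16(ch, rx);

  /* imaginary part: swap re/im within each pair, negate the im copies,
   * then madd gives ch_re*rx_im - ch_im*rx_re */
  __m128i sw = _mm_shufflelo_epi16(ch, _MM_SHUFFLE(2, 3, 0, 1));
  sw = _mm_shufflehi_epi16(sw, _MM_SHUFFLE(2, 3, 0, 1));
  sw = _mm_sign_epi16(sw, conj);
  __m128i im = _mm_madd_epi16(sw, rx);

  re = _mm_srai_epi32(re, shift);
  im = _mm_srai_epi32(im, shift);

  /* re-interleave to (re,im) pairs and saturate back to int16 */
  return _mm_packs_epi32(_mm_unpacklo_epi32(re, im),
                         _mm_unpackhi_epi32(re, im));
}
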
Example no. 26
/// SSE intrinsics version of partialButterflyInverse16
static void partialButterflyInverse16_simd(short *src, short *dst, int shift)
{

  int add = 1<<(shift-1);

// we cast the original 16x16 matrix to a SIMD vector type
    __m128i *g_aiT16_vec  = (__m128i *)g_aiT16; 


// We cast the input source (filled with random test data; see the main function for details) to a SIMD vector type
// We also cast the output to a SIMD vector type
  __m128i *in_vec = (__m128i *) src;   
  __m128i *out_vec = (__m128i *) dst;

// we declare an 8x8 array and cast it to a SIMD vector type
  short gt[8][8] __attribute__ ((aligned (16)));
  __m128i *gt_vec = (__m128i *)gt;

// we declare a 16x16 array and cast it to a SIMD vector type
  short random[16][16] __attribute__ ((aligned (16)));
  __m128i *random_vec = (__m128i *)random;  
  

  trans_g_aiT16(g_aiT16_vec,gt_vec);

  tranpose8x8(in_vec,2, random_vec,0);
  tranpose8x8(in_vec,3, random_vec,8);
  tranpose8x8(in_vec,0, random_vec,16);
  tranpose8x8(in_vec,1, random_vec,24);

  for (int j=0; j<16; j++)
  {
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
      
    __m128i I0 = _mm_load_si128 (&random_vec[j]); 
    __m128i II0 = _mm_load_si128 (&random_vec[j+16]); 

  // for (int k=0; k<8; k++)
          // here we load the transposed values of the initial matrix,
          // multiply them with the input numbers to produce intermediate 32-bit integers,
          // then sum up adjacent pairs of 32-bit integers and store them in the destination register
        __m128i I1 = _mm_load_si128 (&gt_vec[0]);   
        __m128i I2 = _mm_madd_epi16 (I1, I0);
         
        __m128i I3 = _mm_load_si128 (&gt_vec[1]);   
        __m128i I4 = _mm_madd_epi16 (I3, I0);
   
        __m128i I5 = _mm_load_si128 (&gt_vec[2]);   
        __m128i I6 = _mm_madd_epi16 (I5, I0);

        __m128i I7 = _mm_load_si128 (&gt_vec[3]);   
        __m128i I8 = _mm_madd_epi16 (I7, I0);

        __m128i I9 = _mm_load_si128 (&gt_vec[4]);   
        __m128i I10 = _mm_madd_epi16 (I9, I0);

        __m128i I11 = _mm_load_si128 (&gt_vec[5]);   
        __m128i I12 = _mm_madd_epi16 (I11, I0);

        __m128i I13 = _mm_load_si128 (&gt_vec[6]);   
        __m128i I14 = _mm_madd_epi16 (I13, I0);

        __m128i I15 = _mm_load_si128 (&gt_vec[7]);   
        __m128i I16 = _mm_madd_epi16 (I15, I0);

        // horizontally add the partial results obtained from the previous step
       __m128i A1 =_mm_hadd_epi32 (I2, I4);
       __m128i A2 =_mm_hadd_epi32 (I6, I8);
       __m128i R1 =_mm_hadd_epi32 (A1, A2);

       __m128i A3 =_mm_hadd_epi32 (I10, I12);
       __m128i A4 =_mm_hadd_epi32 (I14, I16);
       __m128i R2 =_mm_hadd_epi32 (A3, A4);
 
   
      //  O[k] = T[0]+T[1]+T[2]+T[3];    
            
  //  for (int k=0; k<4; k++)
 //   {
       // load the original matrix values and multiply them with the random values;
       // store the low halves of the products in I2 and the high halves in I3
       I1 = _mm_load_si128 (&gt_vec[8]);       
       I2 = _mm_mullo_epi16 (I1, II0);
       I3 = _mm_mulhi_epi16 (I1, II0);

      __m128i lowI23 = _mm_unpacklo_epi16(I2,I3);
      __m128i hiI23 = _mm_unpackhi_epi16(I2,I3);    
      __m128i temp1 = _mm_add_epi32(lowI23,hiI23);
      __m128i temp5 = _mm_hsub_epi32 (lowI23, hiI23);

       I4 = _mm_load_si128 (&gt_vec[9]);       
       I5 = _mm_mullo_epi16 (I4, II0);
       I6 = _mm_mulhi_epi16 (I4, II0);
      __m128i lowI56 = _mm_unpacklo_epi16(I5,I6);
      __m128i hiI56 = _mm_unpackhi_epi16(I5,I6);    
      __m128i temp2 = _mm_add_epi32(lowI56,hiI56);  
      __m128i temp6 = _mm_hsub_epi32 (lowI56, hiI56);   
             
       I7 = _mm_load_si128 (&gt_vec[10]);      
       I8 = _mm_mullo_epi16 (I7, II0);
       I9 = _mm_mulhi_epi16 (I7, II0);
      __m128i lowI89 = _mm_unpacklo_epi16(I8,I9);
      __m128i hiI89 = _mm_unpackhi_epi16(I8,I9);    
      __m128i temp3 = _mm_add_epi32(lowI89,hiI89);  
      __m128i temp7 = _mm_hsub_epi32 (lowI89, hiI89);    

       I10 = _mm_load_si128 (&gt_vec[11]);       
       I11 = _mm_mullo_epi16 (I10, II0);
       I12 = _mm_mulhi_epi16 (I10, II0);
      __m128i lowI1112 = _mm_unpacklo_epi16(I11,I12);
      __m128i hiI1112 = _mm_unpackhi_epi16(I11,I12);    
      __m128i temp4 = _mm_add_epi32(lowI1112,hiI1112);  
      __m128i temp8 = _mm_hsub_epi32 (lowI1112, hiI1112);   
 
       __m128i A5 =_mm_hadd_epi32 (temp1, temp2);
       __m128i A6 =_mm_hadd_epi32 (temp3, temp4);
       __m128i R3 =_mm_hadd_epi32 (A5, A6);

       __m128i A7 =_mm_hadd_epi32 (temp8, temp7);
       __m128i A8 =_mm_hadd_epi32 (temp6, temp5);
       __m128i R4 =_mm_hadd_epi32 (A7, A8);

///////////////////////////
         __m128i add_reg = _mm_set1_epi32(add);

         __m128i sum_vec0 = _mm_add_epi32(R3,R1);        
         sum_vec0 = _mm_add_epi32(sum_vec0,add_reg);
         sum_vec0 = _mm_srai_epi32(sum_vec0, shift); // shift right
	 
         
         __m128i sum_vec1 = _mm_add_epi32(R4,R2);
         sum_vec1 = _mm_add_epi32(sum_vec1,add_reg);
         sum_vec1 = _mm_srai_epi32(sum_vec1, shift); // shift right

	 __m128i finalres0 = _mm_packs_epi32(sum_vec0, sum_vec1); // shrink packed 32bit to packed 16 bit and saturate
         _mm_store_si128 (&out_vec[2*j], finalres0);
         
        __m128i  sum_vec2 = _mm_sub_epi32(R4, R2);
         sum_vec2 = _mm_add_epi32(sum_vec2,add_reg);
         sum_vec2 = _mm_srai_epi32(sum_vec2, shift); // shift right  	 

         __m128i sum_vec3 = _mm_sub_epi32(R3, R1);
         sum_vec3 = _mm_add_epi32(sum_vec3,add_reg);
         sum_vec3 = _mm_srai_epi32(sum_vec3, shift); // shift right

         I5 = _mm_unpackhi_epi32(sum_vec2, sum_vec3);
         I6 = _mm_unpacklo_epi32(sum_vec2, sum_vec3);
         I7 = _mm_unpackhi_epi32(I5, I6);
         I8 = _mm_unpacklo_epi32(I5, I6);
         I9 = _mm_unpacklo_epi32(I7, I8);
         I10 = _mm_unpackhi_epi32(I7, I8);
         
	 sum_vec3 = _mm_packs_epi32(I9, I10); // shrink packed 32bit to packed 16 bit and saturate
         _mm_store_si128 (&out_vec[2*j+1], sum_vec3);
  }
}
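
Each output row above is reduced to four 32-bit sums with _mm_madd_epi16 followed by a small tree of _mm_hadd_epi32 calls. In isolation, the same reduction computes a dot product of eight int16 values; a sketch with an assumed helper name:

#include <tmmintrin.h>  /* SSSE3: _mm_hadd_epi32 */

/* Hypothetical helper: dot product of two 8-element int16 vectors. */
static inline int dot8_epi16(__m128i a, __m128i b)
{
  __m128i p = _mm_madd_epi16(a, b);  /* four sums of adjacent 16-bit products */
  p = _mm_hadd_epi32(p, p);          /* two pairwise sums (duplicated) */
  p = _mm_hadd_epi32(p, p);          /* full horizontal sum in every lane */
  return _mm_cvtsi128_si32(p);       /* extract lane 0 */
}
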
Example no. 27
unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
{
	FLAC__uint64 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
	unsigned i, order;

	__m128i total_err0, total_err1, total_err3;

	{
		FLAC__int32 itmp;
		__m128i last_error, zero = _mm_setzero_si128();
		
		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
		itmp = data[-2];
		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
		itmp -= data[-3];
		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
		itmp -= data[-3] - data[-4];
		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3

		total_err0 = total_err1 = total_err3 = _mm_setzero_si128();
		for(i = 0; i < data_len; i++) {
			__m128i err0, err1, tmp;
			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
#if 1 /* OPT_SSE */
			err1 = _mm_sub_epi32(err1, last_error);
			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
			err1 = _mm_sub_epi32(err1, last_error);
			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
			err1 = _mm_sub_epi32(err1, last_error);
			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
#else
			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
#endif
			tmp = _mm_slli_si128(err0, 12);									// e0   0   0   0
			last_error = _mm_srli_si128(err1, 4);							//  0  e1  e2  e3
			last_error = _mm_or_si128(last_error, tmp);						// e0  e1  e2  e3

			tmp = _mm_srai_epi32(err0, 31);
			err0 = _mm_xor_si128(err0, tmp);
			err0 = _mm_sub_epi32(err0, tmp);
			tmp = _mm_srai_epi32(err1, 31);
			err1 = _mm_xor_si128(err1, tmp);
			err1 = _mm_sub_epi32(err1, tmp);

			total_err0 = _mm_add_epi64(total_err0, err0);					//        0       te0
			err0 = _mm_unpacklo_epi32(err1, zero);							//   0  |e3|   0  |e4|
			err1 = _mm_unpackhi_epi32(err1, zero);							//   0  |e1|   0  |e2|
			total_err3 = _mm_add_epi64(total_err3, err0);					//       te3      te4
			total_err1 = _mm_add_epi64(total_err1, err1);					//       te1      te2
		}
	}
	
	m128i_to_i64(total_error_0, total_err0);
	m128i_to_i64(total_error_4, total_err3);
	m128i_to_i64(total_error_2, total_err1);
	total_err3 = _mm_srli_si128(total_err3,	8);							//         0      te3
	total_err1 = _mm_srli_si128(total_err1, 8);							//         0      te1
	m128i_to_i64(total_error_3, total_err3);
	m128i_to_i64(total_error_1, total_err1);

	/* prefer higher order */
	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
		order = 0;
	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
		order = 1;
	else if(total_error_2 < flac_min(total_error_3, total_error_4))
		order = 2;
	else if(total_error_3 < total_error_4)
		order = 3;
	else
		order = 4;

	/* Estimate the expected number of bits per residual signal sample. */
	/* 'total_error*' is linearly related to the variance of the residual */
	/* signal, so we use it directly to compute E(|x|) */
	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);

	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);

	return order;
}
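
Note the shift/xor/subtract sequence applied to err0 and err1: because this routine only assumes SSE2, it computes packed 32-bit absolute values without SSSE3's _mm_abs_epi32. The trick in isolation (assumed helper name; like the original sequence, it leaves INT32_MIN unchanged):

#include <emmintrin.h>  /* SSE2 */

/* Hypothetical helper: |x| for four packed int32, SSE2 only. */
static inline __m128i abs_epi32_sse2(__m128i x)
{
  __m128i m = _mm_srai_epi32(x, 31);  /* 0x00000000 if x >= 0, 0xFFFFFFFF if x < 0 */
  x = _mm_xor_si128(x, m);            /* ~x where x was negative */
  return _mm_sub_epi32(x, m);         /* (~x) + 1 = -x where x was negative */
}
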
Example no. 28
			0, 0, PKT_RX_FDIR, 0);

	const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			/* shift right 1 bit to make sure it does not exceed 255 */
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
			 PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
			PKT_RX_IP_CKSUM_BAD >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);

	vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]);
	vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]);
	vlan0 = _mm_unpacklo_epi64(vlan0, vlan1);

	vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
	vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);

	rss = _mm_srli_epi32(vlan1, 11);
	rss = _mm_shuffle_epi8(rss_flags, rss);

	l3_l4e = _mm_srli_epi32(vlan1, 22);
	l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e);
	/* then we shift left 1 bit */
	l3_l4e = _mm_slli_epi32(l3_l4e, 1);
	/* we need to mask out the redundant bits */
	l3_l4e = _mm_and_si128(l3_l4e, cksum_mask);
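
The flag translation above relies on _mm_shuffle_epi8 acting as a 16-entry, byte-wide lookup table: the shifted status bits become per-byte indices into a constant vector. A self-contained sketch of that idiom with illustrative table values (names and contents are mine, not DPDK's):

#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */

/* Hypothetical example: map the low nibble of every byte through a
 * 16-entry in-register lookup table. */
static inline __m128i nibble_lookup(__m128i idx)
{
  /* illustrative table: entry i holds i * 0x11 */
  const __m128i table = _mm_setr_epi8(
      0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
      (char)0x88, (char)0x99, (char)0xaa, (char)0xbb,
      (char)0xcc, (char)0xdd, (char)0xee, (char)0xff);

  idx = _mm_and_si128(idx, _mm_set1_epi8(0x0f));  /* keep only the low nibble */
  return _mm_shuffle_epi8(table, idx);            /* per-byte table lookup */
}
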
Example no. 29
/* Transpose bytes within elements for 64 bit elements. */
int64_t bshuf_trans_byte_elem_SSE_64(void* in, void* out, const size_t size) {

    size_t ii;
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
    __m128i a1, b1, c1, d1, e1, f1, g1, h1;

    for (ii=0; ii + 15 < size; ii += 16) {
        a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]);
        b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]);
        c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]);
        d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]);
        e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]);
        f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]);
        g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]);
        h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]);

        a1 = _mm_unpacklo_epi8(a0, b0);
        b1 = _mm_unpackhi_epi8(a0, b0);
        c1 = _mm_unpacklo_epi8(c0, d0);
        d1 = _mm_unpackhi_epi8(c0, d0);
        e1 = _mm_unpacklo_epi8(e0, f0);
        f1 = _mm_unpackhi_epi8(e0, f0);
        g1 = _mm_unpacklo_epi8(g0, h0);
        h1 = _mm_unpackhi_epi8(g0, h0);

        a0 = _mm_unpacklo_epi8(a1, b1);
        b0 = _mm_unpackhi_epi8(a1, b1);
        c0 = _mm_unpacklo_epi8(c1, d1);
        d0 = _mm_unpackhi_epi8(c1, d1);
        e0 = _mm_unpacklo_epi8(e1, f1);
        f0 = _mm_unpackhi_epi8(e1, f1);
        g0 = _mm_unpacklo_epi8(g1, h1);
        h0 = _mm_unpackhi_epi8(g1, h1);

        a1 = _mm_unpacklo_epi32(a0, c0);
        b1 = _mm_unpackhi_epi32(a0, c0);
        c1 = _mm_unpacklo_epi32(b0, d0);
        d1 = _mm_unpackhi_epi32(b0, d0);
        e1 = _mm_unpacklo_epi32(e0, g0);
        f1 = _mm_unpackhi_epi32(e0, g0);
        g1 = _mm_unpacklo_epi32(f0, h0);
        h1 = _mm_unpackhi_epi32(f0, h0);

        a0 = _mm_unpacklo_epi64(a1, e1);
        b0 = _mm_unpackhi_epi64(a1, e1);
        c0 = _mm_unpacklo_epi64(b1, f1);
        d0 = _mm_unpackhi_epi64(b1, f1);
        e0 = _mm_unpacklo_epi64(c1, g1);
        f0 = _mm_unpackhi_epi64(c1, g1);
        g0 = _mm_unpacklo_epi64(d1, h1);
        h0 = _mm_unpackhi_epi64(d1, h1);

        _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
        _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
        _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
        _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
        _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0);
        _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0);
        _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0);
        _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0);
    }
    return bshuf_trans_byte_elem_remainder(in, out, size, 8,
            size - size % 16);
}
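
The net effect of this byte transpose, as the function name suggests, is that byte k of element i ends up at out[k*size + i]. A naive scalar equivalent (hypothetical name), handy for validating the SSE path on small buffers:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar reference for the 64-bit (8-byte) element case. */
static void trans_byte_elem_scalar_64(const void *in, void *out, size_t size)
{
  const uint8_t *in_b = (const uint8_t *)in;
  uint8_t *out_b = (uint8_t *)out;

  for (size_t i = 0; i < size; i++)    /* element index */
    for (size_t k = 0; k < 8; k++)     /* byte within the element */
      out_b[k * size + i] = in_b[8 * i + k];
}
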
Example no. 30
template<class T> inline void dequantise_sse4_2_32_8_3(QuantisationMatrix *qmatrix,
                                                       int32_t *idata,
                                                       void *_odata,
                                                       int ostride) {
  T *odata = (T *)_odata;
  const int slice_width  = 32;
  const int slice_height = 8;

  const int Y = 0;
  const int X = 0;
  const int N = 0;

  T * const optr = &odata[Y*slice_height*ostride + X*slice_width];
  const int32_t * iptr = &idata[N*slice_height*slice_width];

  const __m128i D0  = LOAD_QUANTISED(&iptr[ 0], qmatrix, 0, 0);
  const __m128i D4  = LOAD_QUANTISED(&iptr[ 4], qmatrix, 1, 1);

  const __m128i D16 = LOAD_QUANTISED(&iptr[16], qmatrix, 2, 1);
  const __m128i D20 = LOAD_QUANTISED(&iptr[20], qmatrix, 2, 1);

  const __m128i D64 = LOAD_QUANTISED(&iptr[64], qmatrix, 3, 1);
  const __m128i D68 = LOAD_QUANTISED(&iptr[68], qmatrix, 3, 1);
  const __m128i D72 = LOAD_QUANTISED(&iptr[72], qmatrix, 3, 1);
  const __m128i D76 = LOAD_QUANTISED(&iptr[76], qmatrix, 3, 1);

  const __m128i A0  = _mm_unpacklo_epi32(D0, D4);   // ( 00 11 00 11 )
  const __m128i A1  = _mm_unpackhi_epi32(D0, D4);   // ( 00 11 00 11 )

  const __m128i B0  = _mm_unpacklo_epi32(A0, D16);  // ( 00 21 11 21 )
  const __m128i B1  = _mm_unpackhi_epi32(A0, D16);  // ( 00 21 11 21 )
  const __m128i B2  = _mm_unpacklo_epi32(A1, D20);  // ( 00 21 11 21 )
  const __m128i B3  = _mm_unpackhi_epi32(A1, D20);  // ( 00 21 11 21 )

  const __m128i C0  = _mm_unpacklo_epi32(B0, D64);  // ( 00 31 21 31 )
  const __m128i C1  = _mm_unpackhi_epi32(B0, D64);  // ( 11 31 21 31 )
  const __m128i C2  = _mm_unpacklo_epi32(B1, D68);  // ( 00 31 21 31 )
  const __m128i C3  = _mm_unpackhi_epi32(B1, D68);  // ( 11 31 21 31 )
  const __m128i C4  = _mm_unpacklo_epi32(B2, D72);  // ( 00 31 21 31 )
  const __m128i C5  = _mm_unpackhi_epi32(B2, D72);  // ( 11 31 21 31 )
  const __m128i C6  = _mm_unpacklo_epi32(B3, D76);  // ( 00 31 21 31 )
  const __m128i C7  = _mm_unpackhi_epi32(B3, D76);  // ( 11 31 21 31 )

  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 0],  C0, C1);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 8],  C2, C3);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 16], C4, C5);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[0*ostride + 24], C6, C7);



  const __m128i D8  = LOAD_QUANTISED(&iptr[ 8], qmatrix, 1, 2);
  const __m128i D12 = LOAD_QUANTISED(&iptr[12], qmatrix, 1, 3);

  const __m128i D24 = LOAD_QUANTISED(&iptr[24], qmatrix, 2, 1);
  const __m128i D28 = LOAD_QUANTISED(&iptr[28], qmatrix, 2, 1);

  const __m128i D96  = LOAD_QUANTISED(&iptr[96], qmatrix, 3, 1);
  const __m128i D100 = LOAD_QUANTISED(&iptr[100], qmatrix, 3, 1);
  const __m128i D104 = LOAD_QUANTISED(&iptr[104], qmatrix, 3, 1);
  const __m128i D108 = LOAD_QUANTISED(&iptr[108], qmatrix, 3, 1);

  const __m128i A2  = _mm_unpacklo_epi32(D8, D12);  // ( 12 13 12 13 )
  const __m128i A3  = _mm_unpackhi_epi32(D8, D12);  // ( 12 13 12 13 )

  const __m128i B4  = _mm_unpacklo_epi32(A2,  D24); // ( 12 21 13 21 )
  const __m128i B5  = _mm_unpackhi_epi32(A2,  D24); // ( 12 21 13 21 )
  const __m128i B6  = _mm_unpacklo_epi32(A3,  D28); // ( 12 21 13 21 )
  const __m128i B7  = _mm_unpackhi_epi32(A3,  D28); // ( 12 21 13 21 )

  const __m128i C8  = _mm_unpacklo_epi32(B4,  D96); // ( 12 31 21 31 )
  const __m128i C9  = _mm_unpackhi_epi32(B4,  D96); // ( 13 31 21 31 )
  const __m128i C10 = _mm_unpacklo_epi32(B5,  D100); // ( 12 31 21 31 )
  const __m128i C11 = _mm_unpackhi_epi32(B5,  D100); // ( 13 31 21 31 )
  const __m128i C12 = _mm_unpacklo_epi32(B6,  D104); // ( 12 31 21 31 )
  const __m128i C13 = _mm_unpackhi_epi32(B6,  D104); // ( 13 31 21 31 )
  const __m128i C14 = _mm_unpacklo_epi32(B7,  D108); // ( 12 31 21 31 )
  const __m128i C15 = _mm_unpackhi_epi32(B7,  D108); // ( 13 31 21 31 )

  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 0],  C8,  C9);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 8],  C10, C11);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 16], C12, C13);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[4*ostride + 24], C14, C15);



  const __m128i D32  = LOAD_QUANTISED(&iptr[ 32], qmatrix, 2, 2);
  const __m128i D36  = LOAD_QUANTISED(&iptr[ 36], qmatrix, 2, 2);

  const __m128i D48  = LOAD_QUANTISED(&iptr[ 48], qmatrix, 2, 3);
  const __m128i D52  = LOAD_QUANTISED(&iptr[ 52], qmatrix, 2, 3);

  const __m128i D80 = LOAD_QUANTISED(&iptr[ 80], qmatrix, 3, 1);
  const __m128i D84 = LOAD_QUANTISED(&iptr[ 84], qmatrix, 3, 1);
  const __m128i D88 = LOAD_QUANTISED(&iptr[ 88], qmatrix, 3, 1);
  const __m128i D92 = LOAD_QUANTISED(&iptr[ 92], qmatrix, 3, 1);

  const __m128i A4  = _mm_unpacklo_epi32(D32, D48);  // ( 22 23 22 23 )
  const __m128i A5  = _mm_unpackhi_epi32(D32, D48);  // ( 22 23 22 23 )
  const __m128i A6  = _mm_unpacklo_epi32(D36, D52);  // ( 22 23 22 23 )
  const __m128i A7  = _mm_unpackhi_epi32(D36, D52);  // ( 22 23 22 23 )

  const __m128i B8  = _mm_unpacklo_epi32(A4,  D80);  // ( 22 31 23 31 )
  const __m128i B9  = _mm_unpackhi_epi32(A4,  D80);  // ( 22 31 23 31 )
  const __m128i B10 = _mm_unpacklo_epi32(A5,  D84); // ( 22 31 23 31 )
  const __m128i B11 = _mm_unpackhi_epi32(A5,  D84); // ( 22 31 23 31 )
  const __m128i B12 = _mm_unpacklo_epi32(A6,  D88); // ( 22 31 23 31 )
  const __m128i B13 = _mm_unpackhi_epi32(A6,  D88); // ( 22 31 23 31 )
  const __m128i B14 = _mm_unpacklo_epi32(A7,  D92); // ( 22 31 23 31 )
  const __m128i B15 = _mm_unpackhi_epi32(A7,  D92); // ( 22 31 23 31 )

  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 0],  B8,  B9);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 8],  B10, B11);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 16], B12, B13);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[2*ostride + 24], B14, B15);



  const __m128i D40  = LOAD_QUANTISED(&iptr[ 40], qmatrix, 2, 2);
  const __m128i D44  = LOAD_QUANTISED(&iptr[ 44], qmatrix, 2, 2);

  const __m128i D56  = LOAD_QUANTISED(&iptr[ 56], qmatrix, 2, 3);
  const __m128i D60  = LOAD_QUANTISED(&iptr[ 60], qmatrix, 2, 3);

  const __m128i D112 = LOAD_QUANTISED(&iptr[112], qmatrix, 3, 1);
  const __m128i D116 = LOAD_QUANTISED(&iptr[116], qmatrix, 3, 1);
  const __m128i D120 = LOAD_QUANTISED(&iptr[120], qmatrix, 3, 1);
  const __m128i D124 = LOAD_QUANTISED(&iptr[124], qmatrix, 3, 1);

  const __m128i A8  = _mm_unpacklo_epi32(D40, D56);  // ( 22 23 22 23 )
  const __m128i A9  = _mm_unpackhi_epi32(D40, D56);  // ( 22 23 22 23 )
  const __m128i A10 = _mm_unpacklo_epi32(D44, D60);  // ( 22 23 22 23 )
  const __m128i A11 = _mm_unpackhi_epi32(D44, D60);  // ( 22 23 22 23 )

  const __m128i B16 = _mm_unpacklo_epi32(A8,  D112); // ( 22 31 23 31 )
  const __m128i B17 = _mm_unpackhi_epi32(A8,  D112); // ( 22 31 23 31 )
  const __m128i B18 = _mm_unpacklo_epi32(A9,  D116); // ( 22 31 23 31 )
  const __m128i B19 = _mm_unpackhi_epi32(A9,  D116); // ( 22 31 23 31 )
  const __m128i B20 = _mm_unpacklo_epi32(A10, D120); // ( 22 31 23 31 )
  const __m128i B21 = _mm_unpackhi_epi32(A10, D120); // ( 22 31 23 31 )
  const __m128i B22 = _mm_unpacklo_epi32(A11, D124); // ( 22 31 23 31 )
  const __m128i B23 = _mm_unpackhi_epi32(A11, D124); // ( 22 31 23 31 )

  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 0],  B16, B17);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 8],  B18, B19);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 16], B20, B21);
  STORE_SAMPLE_PAIR<T>((__m128i *)&optr[6*ostride + 24], B22, B23);


  for (int j = 0; j < 4; j++) {
    const __m128i X0 = LOAD_QUANTISED(&iptr[128 + j*16 +  0], qmatrix, 3, 2);
    const __m128i X1 = LOAD_QUANTISED(&iptr[128 + j*16 +  4], qmatrix, 3, 2);
    const __m128i X2 = LOAD_QUANTISED(&iptr[128 + j*16 +  8], qmatrix, 3, 2);
    const __m128i X3 = LOAD_QUANTISED(&iptr[128 + j*16 + 12], qmatrix, 3, 2);

    const __m128i Y0 = LOAD_QUANTISED(&iptr[192 + j*16 +  0], qmatrix, 3, 3);
    const __m128i Y1 = LOAD_QUANTISED(&iptr[192 + j*16 +  4], qmatrix, 3, 3);
    const __m128i Y2 = LOAD_QUANTISED(&iptr[192 + j*16 +  8], qmatrix, 3, 3);
    const __m128i Y3 = LOAD_QUANTISED(&iptr[192 + j*16 + 12], qmatrix, 3, 3);

    const __m128i Z0 = _mm_unpacklo_epi32(X0, Y0);
    const __m128i Z1 = _mm_unpackhi_epi32(X0, Y0);
    const __m128i Z2 = _mm_unpacklo_epi32(X1, Y1);
    const __m128i Z3 = _mm_unpackhi_epi32(X1, Y1);
    const __m128i Z4 = _mm_unpacklo_epi32(X2, Y2);
    const __m128i Z5 = _mm_unpackhi_epi32(X2, Y2);
    const __m128i Z6 = _mm_unpacklo_epi32(X3, Y3);
    const __m128i Z7 = _mm_unpackhi_epi32(X3, Y3);

    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*j + 1)*ostride + 0],  Z0, Z1);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*j + 1)*ostride + 8],  Z2, Z3);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*j + 1)*ostride + 16], Z4, Z5);
    STORE_SAMPLE_PAIR<T>((__m128i *)&optr[(2*j + 1)*ostride + 24], Z6, Z7);
  }
}