예제 #1
0
파일: main.cpp 프로젝트: CCJY/coliru
/**
 * Tries to account for the still-unclassified bytes of a 16-byte block as
 * 2-byte or 3-byte UTF-8 sequences.
 *
 * @param data        block of input bytes under test
 * @param dataLength  number of valid bytes in `data`; handed by name to
 *                    checkIncompleteBytes() (defined elsewhere), which
 *                    presumably trims it when a sequence is cut off at the
 *                    end of the block -- TODO confirm against that helper
 * @param mask        bit i set means byte i is already accepted; lead and
 *                    continuation positions recognised here are OR-ed in
 * @return true as soon as `mask` equals 0xFFFFFFFF, false otherwise
 */
bool recogUnicodeRange(const __m128i data, int& dataLength, unsigned int mask) {
    const unsigned int kAllAccepted = 0xFFFFFFFF;
    const __m128i range_80_BE = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\xBE', '\x80');
    const __m128i range_80_BF = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\xBF', '\x80');
    const __m128i range_C2_DF = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\xDF', '\xC2');
    const __m128i range_lead3 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\xEF', '\xEF', '\xEE', '\xEE', '\xEC', '\xE1');

    // Positions holding 0x80..0xBE; computed once, before dataLength can be
    // touched by checkIncompleteBytes(), and reused by both stages.
    unsigned int cont_80_BE = _mm_cvtsi128_si32(_mm_cmpestrm(range_80_BE, 2, data, dataLength, _SIDD_CMP_RANGES));

    // ---- stage 1: 2-byte sequences, lead byte 0xC2..0xDF ----
    unsigned int lead2 = _mm_cvtsi128_si32(_mm_cmpestrm(range_C2_DF, 2, data, dataLength, _SIDD_CMP_RANGES));
    if( lead2 != 0 ) {
        checkIncompleteBytes(lead2, mask, dataLength, 1);
        if( lead2 == 0 ) {
            // every lead byte was removed by the helper
            if( dataLength <= 0 ) return false;
            if( mask == kAllAccepted ) return true;
        } else {
            // the byte right after each lead must be a continuation byte
            const unsigned int trail = lead2 << 1;
            if( (trail & cont_80_BE) != trail ) {
                // retry with the wider 0x80..0xBF range
                const unsigned int cont_80_BF = _mm_cvtsi128_si32(_mm_cmpestrm(range_80_BF, 2, data, dataLength, _SIDD_CMP_RANGES));
                if( (trail & cont_80_BF) != trail ) return false;
            }
            mask |= lead2 | trail;
            if( mask == kAllAccepted ) return true;
        }
    }

    // ---- stage 2: 3-byte sequences, lead byte 0xE1..0xEC, 0xEE or 0xEF ----
    unsigned int lead3 = _mm_cvtsi128_si32(_mm_cmpestrm(range_lead3, 6, data, dataLength, _SIDD_CMP_RANGES));
    if( lead3 == 0 ) return false;

    checkIncompleteBytes(lead3, mask, dataLength, 2);
    if( lead3 == 0 ) {
        return dataLength > 0 && mask == kAllAccepted;
    }

    const unsigned int second = lead3 << 1;
    const unsigned int third  = lead3 << 2;
    // second byte is only ever checked against 0x80..0xBE
    if( (second & cont_80_BE) != second ) return false;
    if( (third & cont_80_BE) != third ) {
        // third byte may additionally be 0xBF
        const unsigned int cont_80_BF = _mm_cvtsi128_si32(_mm_cmpestrm(range_80_BF, 2, data, dataLength, _SIDD_CMP_RANGES));
        if( (third & cont_80_BF) != third ) return false;
    }
    mask |= lead3 | second | third;
    return mask == kAllAccepted;
}
예제 #2
0
/*
 * AES AddRoundKey: XORs the 16-byte state with the round key held in the
 * four words w[0..3].
 *
 * state  4x4 byte matrix, updated in place.  It is read and written as one
 *        16-byte vector, which assumes BYTE is a single byte so the four
 *        rows are contiguous (the original 16-byte store already relied on
 *        this).
 * w      four key words; byte `row` of the key for column `col` is taken
 *        from w[col], most significant byte first.
 */
void AddRoundKey_sse(BYTE state[][4], const WORD w[])
{
	BYTE subkey[16];
	int row, col;

	/* Lay the key out like the state: subkey[4*row + col]. */
	for (row = 0; row < 4; row++) {
		const int shift = 24 - 8 * row;

		for (col = 0; col < 4; col++)
			subkey[4 * row + col] = (BYTE)(w[col] >> shift);
	}

	const __m128i subkeySse = _mm_loadu_si128((const __m128i *)subkey);
	const __m128i stateSse = _mm_loadu_si128((const __m128i *)state);

	/* The explicit cast is required: state decays to BYTE (*)[4], which is
	 * not implicitly convertible to __m128i *. */
	_mm_storeu_si128((__m128i *)state, _mm_xor_si128(stateSse, subkeySse));
}
예제 #3
0
/*The input to initialization is the 128-bit key; 128-bit IV;*/
void aegis128_initialization(const unsigned char *key, const unsigned char *iv, __m128i *state)
{
       int i;

        __m128i  tmp;
        __m128i  keytmp = _mm_load_si128((__m128i*)key);
        __m128i  ivtmp  = _mm_load_si128((__m128i*)iv);

        state[0] =  ivtmp;
        state[1] = _mm_set_epi8(0xdd,0x28,0xb5,0x73,0x42,0x31,0x11,0x20,0xf1,0x2f,0xc2,0x6d,0x55,0x18,0x3d,0xdb);
        state[2] = _mm_set_epi8(0x62,0x79,0xe9,0x90,0x59,0x37,0x22,0x15,0x0d,0x08,0x05,0x03,0x02,0x01,0x1, 0x0);
        state[3] = _mm_xor_si128(keytmp, _mm_set_epi8(0x62,0x79,0xe9,0x90,0x59,0x37,0x22,0x15,0x0d,0x08,0x05,0x03,0x02,0x01,0x1,0x0));
        state[4] = _mm_xor_si128(keytmp, _mm_set_epi8(0xdd,0x28,0xb5,0x73,0x42,0x31,0x11,0x20,0xf1,0x2f,0xc2,0x6d,0x55,0x18,0x3d,0xdb));
        state[0] = _mm_xor_si128(state[0], keytmp);

        keytmp   = _mm_xor_si128(keytmp, ivtmp); 
        for (i = 0; i < 10; i++)  {
             //state update function
             tmp = state[4];
             state[4] = _mm_aesenc_si128(state[3], state[4]);
             state[3] = _mm_aesenc_si128(state[2], state[3]);
             state[2] = _mm_aesenc_si128(state[1], state[2]);
             state[1] = _mm_aesenc_si128(state[0], state[1]);
             state[0] = _mm_aesenc_si128(tmp, state[0]);

             //xor msg with state[0]
             keytmp   = _mm_xor_si128(keytmp, ivtmp);
             state[0] = _mm_xor_si128(state[0], keytmp);
        }
}
예제 #4
0
// fl48 V1
// Multiplies the SIZE x SIZE matrix `mat` by `vec`, two columns per SSE step,
// and repoints `vec` at a freshly allocated array holding the product.
// NOTE(review): the array `vec` pointed to on entry is not freed here;
// confirm who owns it.
// NOTE(review): presumably fl48 is a 6-byte value holding the upper 48 bits
// of a double; if so, each 16-byte load below also covers 4 bytes beyond the
// two elements actually used -- confirm against fl48's definition.
void matrix_vector_mul_SSE_f48(fl48** mat, fl48* &vec)
{
  // Moves source bytes 0..5 into bytes 2..7 of the low 64-bit lane and source
  // bytes 6..11 into bytes 2..7 of the high lane; index 255 zeroes the two
  // lowest bytes of each lane.
  const __m128i widen = _mm_set_epi8(11, 10, 9, 8, 7, 6, 255, 255,
                                     5, 4, 3, 2, 1, 0, 255, 255);
  // Swaps the two 64-bit halves of a vector.
  const __m128i swap_halves = _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0,
                                           15, 14, 13, 12, 11, 10, 9, 8);

  fl48* product = new fl48[SIZE];

  for (unsigned row = 0; row < SIZE; ++row) {
    __m128d acc = _mm_set1_pd(0.0); // per-lane partial sums for this row

    for (unsigned col = 0; col < SIZE; col += 2) { // two columns per step
      const __m128i mat_raw = _mm_loadu_si128((__m128i*) &mat[row][col]);
      const __m128i mat_wide = _mm_shuffle_epi8(mat_raw, widen);
      const __m128i vec_raw = _mm_loadu_si128((__m128i*) &vec[col]);
      const __m128i vec_wide = _mm_shuffle_epi8(vec_raw, widen);

      acc = _mm_add_pd(_mm_mul_pd((__m128d)mat_wide, (__m128d)vec_wide), acc);
    }

    // Horizontal add: add the half-swapped copy so lane 0 holds both sums.
    const __m128i acc_swapped = _mm_shuffle_epi8((__m128i)acc, swap_halves);
    acc = _mm_add_pd(acc, (__m128d)acc_swapped);

    double row_total = 0;
    _mm_store_sd(&row_total, acc);
    product[row] = fl48(row_total);
  }

  vec = product;
}
예제 #5
0
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
    const uint16_t* const rgbx,
    __m128i* const r, __m128i* const g, __m128i* const b) {
  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
  // aarrggbb as 16-bit.
  const __m128i shuff0 =
      _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
  const __m128i shuff1 =
      _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
  const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
  const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
  const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
  const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
  // R0R1G0G1
  // B0B1****
  // R2R3G2G3
  // B2B3****
  // (OR is used to free port 5 for the unpack)
  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
  const __m128i B1 = _mm_or_si128(A0, A1);
  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
  const __m128i B3 = _mm_or_si128(A2, A3);
  // Gather the channels.
  *r = _mm_unpacklo_epi64(B0, B2);
  *g = _mm_unpackhi_epi64(B0, B2);
  *b = _mm_unpackhi_epi64(B1, B3);
}
예제 #6
0
/*
 * One-shot Groestl update + finalization on SIZE512-vector (TF1024) blocks.
 *
 * ctx         hash state; ctx->buffer may already hold ctx->rem_ptr vectors
 *             from an earlier partial update (midstate precalc).
 * output      receives ctx->hashlen bytes of digest, written as __m128i.
 * input       message data, read as an array of __m128i.
 * databitlen  message length in bits; only whole 128-bit vectors are
 *             consumed (databitlen / 128), any remainder bits are ignored.
 *
 * Always returns SUCCESS_GR.
 */
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
                                const void* input, DataLength_gr databitlen )
{
   const int len = (int)databitlen / 128;         // input length in vectors
   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
   const int hash_offset = SIZE512 - hashlen_m128i;
   int rem = ctx->rem_ptr;
   int blocks = len / SIZE512;
   __m128i* in = (__m128i*)input;
   int i;

   // --- update ---

   // digest any full blocks, process directly from input
   for ( i = 0; i < blocks; i++ )
      TF1024( ctx->chaining, &in[ i * SIZE512 ] );
   ctx->buf_ptr = blocks * SIZE512;

   // copy any remaining data to buffer, it may already contain data
   // from a previous update for a midstate precalc
   for ( i = 0; i < len % SIZE512; i++ )
       ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
   i += rem;    // use i as rem_ptr in final

   //--- final ---

   blocks++;      // adjust for final block

   // The padding needs the 0x80 marker plus the block counter in the last
   // bytes of the block.  When exactly one vector slot is free both must go
   // into that slot.  (The test used to be `i == len - 1`, which sent this
   // case to the else branch and made it store the counter one vector past
   // the last slot of the block.)
   if ( i == SIZE512 - 1 )
   {
       // only 128 bits left in buffer, all padding at once
       ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
                                           0,         0 ,0,0, 0,0,0,0x80 );
   }
   else
   {
       // add first padding
       ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
                                      0,0,0,0, 0,0,0,0x80 );
       // add zero padding
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = _mm_setzero_si128();

       // add length padding, second last byte is zero unless blocks > 255
       ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
                                           0,         0 ,0,0, 0,0,0,0 );
   }

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );
   OF1024( ctx->chaining );

   // store hash result in output: the last hashlen_m128i chaining vectors
   for ( i = 0; i < hashlen_m128i; i++ )
      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];

   return SUCCESS_GR;
}
예제 #7
0
파일: blake2s.c 프로젝트: altoplano/PHC
/*
 * BLAKE2s compression function: mixes one message block into S->h.
 *
 * S      hash state; S->h[0..7] is read and overwritten, and 16 bytes
 *        starting at S->t[0] are read.
 * block  the BLAKE2S_BLOCKBYTES-byte message block to absorb.
 *
 * Always returns 0.
 *
 * NOTE(review): ROUND, LOADU and STOREU are macros defined elsewhere.  ROUND
 * presumably refers to the locals declared below (row1..row4, buf1..buf4,
 * t0..t2, r8/r16, m0..m15) by name, since most of them are otherwise unused
 * in this function -- do not rename them without checking the macro.
 */
static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
  __m128i row1, row2, row3, row4;
  __m128i buf1, buf2, buf3, buf4;
#if defined(HAVE_SSE41)
  __m128i t0, t1;
#if !defined(HAVE_XOP)
  __m128i t2;
#endif
#endif
  /* copies of the incoming chaining value, needed for the feed-forward XOR */
  __m128i ff0, ff1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
  /* byte-shuffle controls: within each 32-bit lane, r8 rotates right by
     8 bits and r16 rotates right by 16 bits */
  const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
  const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
#endif
#if defined(HAVE_SSE41)
  /* message block as four 16-byte vectors */
  const __m128i m0 = LOADU( block +  00 );
  const __m128i m1 = LOADU( block +  16 );
  const __m128i m2 = LOADU( block +  32 );
  const __m128i m3 = LOADU( block +  48 );
#else
  /* message block as sixteen 32-bit words.
     NOTE(review): reads the uint8_t buffer through a uint32_t pointer, which
     assumes suitable alignment and a little-endian host -- confirm. */
  const uint32_t  m0 = ( ( uint32_t * )block )[ 0];
  const uint32_t  m1 = ( ( uint32_t * )block )[ 1];
  const uint32_t  m2 = ( ( uint32_t * )block )[ 2];
  const uint32_t  m3 = ( ( uint32_t * )block )[ 3];
  const uint32_t  m4 = ( ( uint32_t * )block )[ 4];
  const uint32_t  m5 = ( ( uint32_t * )block )[ 5];
  const uint32_t  m6 = ( ( uint32_t * )block )[ 6];
  const uint32_t  m7 = ( ( uint32_t * )block )[ 7];
  const uint32_t  m8 = ( ( uint32_t * )block )[ 8];
  const uint32_t  m9 = ( ( uint32_t * )block )[ 9];
  const uint32_t m10 = ( ( uint32_t * )block )[10];
  const uint32_t m11 = ( ( uint32_t * )block )[11];
  const uint32_t m12 = ( ( uint32_t * )block )[12];
  const uint32_t m13 = ( ( uint32_t * )block )[13];
  const uint32_t m14 = ( ( uint32_t * )block )[14];
  const uint32_t m15 = ( ( uint32_t * )block )[15];
#endif
  /* rows 1-2: current chaining value h[0..7] (also saved in ff0/ff1) */
  row1 = ff0 = LOADU( &S->h[0] );
  row2 = ff1 = LOADU( &S->h[4] );
  /* row 3: four fixed constants; row 4: four more constants XOR-ed with the
     16 bytes of state starting at S->t[0] */
  row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
  row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOADU( &S->t[0] ) );
  /* ten rounds, each selected by its round index */
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  /* feed-forward: h[0..3] ^= row1 ^ row3, h[4..7] ^= row2 ^ row4 */
  STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
  STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
  return 0;
}
예제 #8
0
파일: freak.cpp 프로젝트: 112000/opencv
// Builds the binary descriptor from the sampled intensities in `pointsValue`
// and writes it as FREAK_NB_PAIRS/128 vectors through the __m128i pointer
// that `ptr` refers to.  For every pair in descriptionPairs one bit is set
// when pointsValue[pair.i] >= pointsValue[pair.j] (unsigned compare).
// On return the output pointer has been advanced once per written vector and
// then moved back by 8 vectors.
// NOTE(review): the fixed rewind of 8 presumably lets the caller fill
// descriptor rows back-to-front -- confirm against the caller.
void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const
{
    __m128i** out = (__m128i**) ptr;

    // note that comparisons order is modified in each block (but the first
    // 128 comparisons remain globally the same, so the 128/384-bit segmented
    // matching strategy is unaffected)
    int pair = 0;                        // first pair of the current group of 16
    int blocksLeft = FREAK_NB_PAIRS/128;
    while( blocksLeft-- )
    {
        __m128i bits = _mm_setzero_si128();
        for( int m = 128/16; m--; pair += 16 )
        {
            // Gather both operands; pair + 0 lands in the highest byte and
            // pair + 15 in the lowest byte of each vector.
            uchar lhs[16];
            uchar rhs[16];
            for( int k = 0; k < 16; ++k )
            {
                lhs[15 - k] = pointsValue[descriptionPairs[pair + k].i];
                rhs[15 - k] = pointsValue[descriptionPairs[pair + k].j];
            }
            const __m128i first  = _mm_loadu_si128((const __m128i*) lhs);
            const __m128i second = _mm_loadu_si128((const __m128i*) rhs);

            // emulated unsigned "not less than": min(a, b) == b  <=>  a >= b
            __m128i notLess = _mm_cmpeq_epi8(_mm_min_epu8(first, second), second);

            // keep one bit per byte for this group (selected by m) and merge
            // it into the 128-bit result
            notLess = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), notLess);
            bits = _mm_or_si128(bits, notLess);
        }
        **out = bits;
        ++(*out);
    }
    (*out) -= 8;
}
/*
 * Derives the ol_flags of four received packets from their four descriptors.
 *
 * descs       the four descriptors, one per packet
 * vlan_flags  flag byte to report when the VLAN-present status bit is set
 * rx_pkts     rx_pkts[0..3]->ol_flags are overwritten
 */
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
	struct rte_mbuf **rx_pkts)
{
	int pkt;
	__m128i lo01, lo23, hi01, hi23;
	__m128i rss, vlan;
	union {
		uint16_t e[4];
		uint64_t dword;
	} flags;

	/* keeps only the low nibble (rss type) of the four low 16-bit lanes */
	const __m128i rss_type_mask = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x000F, 0x000F, 0x000F, 0x000F);

	/* byte lookup table indexed by rss type, giving the rss hash flag */
	const __m128i rss_flag_lut = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
			0, 0, 0, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

	/* keeps only the vlan-present bit of the four low 16-bit lanes */
	const __m128i vlan_present_mask = _mm_set_epi16(
			0x0000, 0x0000,
			0x0000, 0x0000,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);

	/* byte lookup table: index 8 (vlan present, 0x8) gives vlan_flags */
	const __m128i vlan_flag_lut = _mm_set_epi8(
		0, 0, 0, 0,
		0, 0, 0, vlan_flags,
		0, 0, 0, 0,
		0, 0, 0, 0);

	/* interleave the 16-bit words of descriptor pairs, low and high halves */
	lo01 = _mm_unpacklo_epi16(descs[0], descs[1]);
	lo23 = _mm_unpacklo_epi16(descs[2], descs[3]);
	hi01 = _mm_unpackhi_epi16(descs[0], descs[1]);
	hi23 = _mm_unpackhi_epi16(descs[2], descs[3]);

	/* low four lanes = 16-bit word 0 of descriptors 0..3 -> rss flag */
	rss = _mm_unpacklo_epi32(lo01, lo23);
	rss = _mm_and_si128(rss, rss_type_mask);
	rss = _mm_shuffle_epi8(rss_flag_lut, rss);

	/* low four lanes = 16-bit word 4 of descriptors 0..3 -> vlan flag */
	vlan = _mm_unpacklo_epi32(hi01, hi23);
	vlan = _mm_and_si128(vlan, vlan_present_mask);
	vlan = _mm_shuffle_epi8(vlan_flag_lut, vlan);

	/* combine both flag sets and pull the four 16-bit results out */
	flags.dword = _mm_cvtsi128_si64(_mm_or_si128(rss, vlan));

	for (pkt = 0; pkt < 4; pkt++)
		rx_pkts[pkt]->ol_flags = flags.e[pkt];
}
// Exercise skeleton for a SIMD BGR -> gray (BT.709) conversion.
// As written, the vector loop only de-interleaves the red channel of 16
// pixels and stores it; the "???" placeholders mark the parts still to be
// implemented.  The trailing call to ConvertColor_BGR2GRAY_BT709_fpt() then
// runs on the same src/dst, so it determines the final contents of `dst`.
void ConvertColor_BGR2GRAY_BT709_simd(const cv::Mat& src, cv::Mat& dst)
{
    CV_Assert(CV_8UC3 == src.type());
    const cv::Size size = src.size();
    dst.create(size, CV_8UC1);

#ifdef HAVE_SSE
    // Shuffle controls that pick one channel out of three consecutive 16-byte
    // chunks of interleaved B,G,R bytes (-1 produces a zero byte).  The blue
    // and green tables are kept as hints for completing the exercise.
    // __m128i ssse3_blue_indices_0  = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12,  9,  6,  3,  0);
    // __m128i ssse3_blue_indices_1  = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11,  8,  5,  2, -1, -1, -1, -1, -1, -1);
    // __m128i ssse3_blue_indices_2  = _mm_set_epi8(13, 10,  7,  4,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    // __m128i ssse3_green_indices_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10,  7,  4,  1);
    // __m128i ssse3_green_indices_1 = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12,  9,  6,  3,  0, -1, -1, -1, -1, -1);
    // __m128i ssse3_green_indices_2 = _mm_set_epi8(14, 11,  8,  5,  2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m128i red_from_chunk0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11,  8,  5,  2);
    const __m128i red_from_chunk1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10,  7,  4,  1, -1, -1, -1, -1, -1);
    const __m128i red_from_chunk2 = _mm_set_epi8(15, 12,  9,  6,  3,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
#endif

    for (int row = 0; row < size.height; row++)
    {
        const uchar *in = src.ptr<uchar>(row);
        uchar *out = dst.ptr<uchar>(row);

        int col = 0;

#ifdef HAVE_SSE
        // Vector part: 16 pixels (48 input bytes) per iteration.
        for (; col <= size.width - 16; col += 16)
        {
            const uchar *px = in + col*3;
            const __m128i c0 = _mm_loadu_si128((const __m128i*)(px + 16*0));
            const __m128i c1 = _mm_loadu_si128((const __m128i*)(px + 16*1));
            const __m128i c2 = _mm_loadu_si128((const __m128i*)(px + 16*2));

            // Each chunk contributes a disjoint set of output bytes, so the
            // three shuffled vectors are simply OR-ed together.
            const __m128i r0 = _mm_shuffle_epi8(c0, red_from_chunk0);
            const __m128i r1 = _mm_shuffle_epi8(c1, red_from_chunk1);
            const __m128i r2 = _mm_shuffle_epi8(c2, red_from_chunk2);
            __m128i red = _mm_or_si128(_mm_or_si128(r0, r1), r2);

            /* ??? */

            _mm_storeu_si128((__m128i*)(out + col), red);
        }
#endif

        // Process leftover pixels
        for (; col < size.width; col++)
        {
            /* ??? */
        }
    }

    // ! Remove this before writing your optimizations !
    ConvertColor_BGR2GRAY_BT709_fpt(src, dst);
}
예제 #11
0
파일: main.cpp 프로젝트: CCJY/coliru
/**
 * Scans XML character data starting at the cursor `yycur` using the SSE4.2
 * string instructions and reports each accepted run to `saxProcessor`.
 *
 * Returns immediately when the first byte is '<', '&' or ']'.  Otherwise the
 * input is consumed in chunks of up to 16 bytes until a delimiter (LF, CR,
 * '&', '<', ']') or a byte sequence rejected by the ASCII / Unicode checks is
 * found.  Line ends are announced through saxProcessor->newLine(): for a CR
 * followed by LF the CR is skipped, and a lone CR is rewritten to LF in the
 * buffer.
 *
 * yycur, yylim, YYSWITCHBUFFER and STTNISTRLENLIMIT are defined elsewhere.
 *
 * NOTE(review): each chunk is read with a full 16-byte load even when fewer
 * than 16 bytes remain -- confirm the buffer is padded accordingly.
 */
void scanCharDataContentwithSTTNI(SAX2Processor* saxProcessor) {
    unsigned int length = yylim - yycur;
    unsigned char* data = (unsigned char*)yycur;
    if( *data == '<' || *data == '&' || *data == ']') return;
    unsigned int dataLen = 0;
    // One-byte (ASCII) rule as five inclusive ranges:
    // 00-00, 20-25, 27-3B, 3D-5C, 5E-7F.
    const __m128i asciiCharData = _mm_set_epi8(0,0,0,0,0,0,0x7F,0x5E,0x5C,0x3D, 0x3B,0x27,0x25,0x20,0,0);
    // Bytes that end a character-data run: LF, CR, '&', '<', ']'.
    const __m128i nonCharData = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0x5D,0x3C,0x26,0x0D,0x0A);

    do {
        // Special new line processing for 0x0A (LF) and 0x0D (CR).  Both
        // comparisons used to test '\0', which made the CR branch unreachable
        // and stored NUL instead of LF.
        if( *data == '\x0A' ) {
            saxProcessor->newLine((char*)data);
            data++; length--;
        } else if(*data == '\x0D') {
            saxProcessor->newLine((char*)data);
            if( *(data+1) == '\x0A' ) {
                // CR LF: skip both; bumping yycur keeps the CR out of the
                // text reported below
                data += 2; length -= 2; yycur++;
            } else {
                // lone CR: normalize it to LF in place
                *data = '\x0A'; data++; length--;
            }
        }

        while( length > 0 ) {
            if( length >= 16 ) dataLen = 16;
            else dataLen = length;
            const __m128i mData = _mm_loadu_si128((__m128i*)data);
            // locate the first delimiter; everything before it is a
            // character-data candidate
            int index = _mm_cmpestri(nonCharData, 5, mData, dataLen, _SIDD_CMP_EQUAL_ANY);
            if( index == 0 ) break;                  // chunk starts with a delimiter
            if( index > dataLen ) index = dataLen;   // no match: take the whole chunk
            bool shouldBreak = index < dataLen ? true : false;
            // check the one byte encoding rule (ASCII); with negative polarity
            // a set bit marks a byte outside every ASCII range
            unsigned int mask = _mm_cvtsi128_si32(_mm_cmpestrm(asciiCharData, 10,
                mData, index, _SIDD_CMP_RANGES|_SIDD_MASKED_NEGATIVE_POLARITY));
            // if not all hit ASCII, continue to check other Unicode rules
            if( mask == 0 || recogUnicodeRange(mData, index, ~mask)) {
                data += index;
                length -= index;
                if( shouldBreak ) break;
            } else {
                break;
            }
        }

        unsigned int passLen = (char*)data - yycur;
        if( passLen == 0 ) break;
        // report Character Data to user
        saxProcessor->reportCharDataContent(yycur, passLen);
        yycur += passLen;
        YYSWITCHBUFFER;
        // keep going only while enough input remains and we stopped at a
        // line end
    } while( length >= STTNISTRLENLIMIT && (*data == '\x0A' || *data == '\x0D') );
}
예제 #12
0
/*
 * One associated-data authentication step with a fixed block: the block is
 * all zero except for the byte 0x80 at byte index 8.
 *
 * *stateS is encrypted in place by aes_enc_128() (defined elsewhere), then
 * XOR-ed with the upper half of *stateR shifted down by 8 bytes, the constant
 * 1 and the fixed block; finally *stateR is XOR-ed with the new *stateS.
 */
static inline void jambu_aut_ad_full(__m128i *key, __m128i *stateS, __m128i *stateR) 
{
	const __m128i pad_block = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
	const __m128i one = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1);
	__m128i s;

	aes_enc_128(stateS, key);

	s = _mm_xor_si128(*stateS, _mm_srli_si128(*stateR, 8));
	s = _mm_xor_si128(s, one);
	s = _mm_xor_si128(s, pad_block);

	*stateS = s;
	*stateR = _mm_xor_si128(s, *stateR);
}
예제 #13
0
/*
 * Compute branch metrics (gamma).
 *
 * For every index k < long_cb (processed eight at a time) this stores
 *   branch[2k]   = (input[k] (+ app[k]) - parity[k]) >> 1
 *   branch[2k+1] = (input[k] (+ app[k]) + parity[k]) >> 1
 * into h->branch; app may be NULL, in which case it is left out.  The three
 * entries long_cb .. long_cb+2 are then computed in scalar code from input
 * and parity only, using division by two.
 *
 * input, app, parity and h->branch are accessed with aligned 16-byte
 * loads/stores.
 */
void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb) 
{
  const __m128i *in_v  = (const __m128i*) input;
  const __m128i *app_v = (const __m128i*) app;
  const __m128i *par_v = (const __m128i*) parity;
  __m128i *out_v       = (__m128i*) h->branch;

  /* Spread 16-bit words 0..3 (lo) or 4..7 (hi) of a vector over the even or
     the odd word positions of the result; 0xff zeroes the remaining bytes. */
  const __m128i even_lo = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
  const __m128i even_hi = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
  const __m128i odd_lo  = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
  const __m128i odd_hi  = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);

  const uint32_t nof_vectors = long_cb/8;
  for (uint32_t k = 0; k < nof_vectors; k++) {
    __m128i sys = _mm_load_si128(&in_v[k]);
    const __m128i par = _mm_load_si128(&par_v[k]);

    if (app_v) {
      sys = _mm_add_epi16(_mm_load_si128(&app_v[k]), sys);
    }

    /* halved sum and difference (arithmetic shift) */
    const __m128i half_sum  = _mm_srai_epi16(_mm_add_epi16(sys, par), 1);
    const __m128i half_diff = _mm_srai_epi16(_mm_sub_epi16(sys, par), 1);

    /* interleave: differences on even words, sums on odd words */
    const __m128i first  = _mm_or_si128(_mm_shuffle_epi8(half_diff, even_lo),
                                        _mm_shuffle_epi8(half_sum, odd_lo));
    const __m128i second = _mm_or_si128(_mm_shuffle_epi8(half_diff, even_hi),
                                        _mm_shuffle_epi8(half_sum, odd_hi));

    _mm_store_si128(&out_v[2*k], first);
    _mm_store_si128(&out_v[2*k + 1], second);
  }

  /* three extra entries after the code block, scalar */
  for (uint32_t k = long_cb; k < long_cb + 3; k++) {
    h->branch[2*k]   = (input[k] - parity[k])/2;
    h->branch[2*k+1] = (input[k] + parity[k])/2;
  }
}
예제 #14
0
파일: demod_soft.c 프로젝트: srsLTE/srsLTE
/*
 * 16-QAM soft demodulation to 8-bit LLRs, four output bytes per symbol:
 *   llr[4i+0] = -re, llr[4i+1] = -im,
 *   llr[4i+2] = |re| - off, llr[4i+3] = |im| - off
 * where re/im are the symbol components scaled by SCALE_BYTE_CONV_QAM16 and
 * off = 2*SCALE_BYTE_CONV_QAM16/sqrt(10).
 *
 * Groups of eight symbols are handled with SSE using aligned loads and
 * stores on `symbols` and `llr`; the remaining symbols use scalar code.
 */
void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
  const float *samples = (const float*) symbols;
  __m128i *out = (__m128i*) llr;

  const __m128i threshold = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM16/sqrt(10));
  /* negative scale: the converted integers already carry the "-re/-im" sign */
  const __m128 neg_scale = _mm_set1_ps(-SCALE_BYTE_CONV_QAM16);

  /* byte pairs (re,im) go to bytes 0-1 of each 32-bit group for the signed
     values and to bytes 2-3 for the magnitude values; *_lo takes source
     bytes 0..7, *_hi source bytes 8..15; 0xff zeroes a byte */
  const __m128i sign_lo = _mm_set_epi8(0xff,0xff,7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0);
  const __m128i mag_lo  = _mm_set_epi8(7,6,0xff,0xff,5,4,0xff,0xff,3,2,0xff,0xff,1,0,0xff,0xff);
  const __m128i sign_hi = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8);
  const __m128i mag_hi  = _mm_set_epi8(15,14,0xff,0xff,13,12,0xff,0xff,11,10,0xff,0xff,9,8,0xff,0xff);

  const int nof_groups = nsymbols/8;
  for (int g = 0; g < nof_groups; g++) {
    const float *p = samples + 16*g;   /* 8 symbols = 16 floats */

    /* scale and convert to 32-bit integers */
    const __m128i q0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(p +  0), neg_scale));
    const __m128i q1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(p +  4), neg_scale));
    const __m128i q2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(p +  8), neg_scale));
    const __m128i q3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(p + 12), neg_scale));

    /* saturating narrow 32 -> 16 -> 8 bits: sixteen signed bytes */
    const __m128i neg_bytes = _mm_packs_epi16(_mm_packs_epi32(q0, q1),
                                              _mm_packs_epi32(q2, q3));

    const __m128i mag_bytes = _mm_sub_epi8(_mm_abs_epi8(neg_bytes), threshold);

    _mm_store_si128(&out[2*g],
                    _mm_or_si128(_mm_shuffle_epi8(neg_bytes, sign_lo),
                                 _mm_shuffle_epi8(mag_bytes, mag_lo)));
    _mm_store_si128(&out[2*g + 1],
                    _mm_or_si128(_mm_shuffle_epi8(neg_bytes, sign_hi),
                                 _mm_shuffle_epi8(mag_bytes, mag_hi)));
  }

  /* Demodulate the symbols that did not fill a whole group of eight */
  for (int i = 8*nof_groups; i < nsymbols; i++) {
    short yre = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[i]));
    short yim = (int8_t) (SCALE_BYTE_CONV_QAM16*cimagf(symbols[i]));

    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
  }
}
예제 #15
0
/*
 * One associated-data authentication step.
 *
 * adblock  8 bytes of associated data; they are placed in bytes 8..15 of the
 *          block vector, bytes 0..7 of which are zero.
 *
 * *stateS is encrypted in place by aes_enc_128() (defined elsewhere), then
 * XOR-ed with the upper half of *stateR shifted down by 8 bytes, the constant
 * 1 and the block vector; finally *stateR is XOR-ed with the new *stateS.
 */
static inline void jambu_aut_ad_step(__m128i *key, const unsigned char *adblock,
									   __m128i *stateS, __m128i *stateR) 
{
	/* load adblock[0..7] into the low half, then move it to the high half */
	const __m128i ad_hi = _mm_slli_si128(_mm_loadl_epi64((const __m128i *)adblock), 8);
	const __m128i one = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1);
	__m128i s;

	aes_enc_128(stateS, key);

	s = _mm_xor_si128(*stateS, _mm_srli_si128(*stateR, 8));
	s = _mm_xor_si128(s, one);
	s = _mm_xor_si128(s, ad_hi);

	*stateS = s;
	*stateR = _mm_xor_si128(s, *stateR);
}
예제 #16
0
/*
 * Base64 encoder helper: turns the low 12 input bytes (four 3-byte groups)
 * into sixteen 6-bit indices, one per output byte.  Each 32-bit chunk is
 * processed independently; the bit diagrams show one chunk, most significant
 * byte first.
 */
static inline __m128i
enc_reshuffle (__m128i in)
{
	/* per chunk: reverse the three data bytes and clear the top byte */
	const __m128i byte_order = _mm_set_epi8(
		-1, 9, 10, 11,
		-1, 6,  7,  8,
		-1, 3,  4,  5,
		-1, 0,  1,  2);
	const __m128i mask_bd = _mm_set1_epi32(0x003F003F);
	const __m128i mask_ac = _mm_set1_epi32(0x3F003F00);

	/* before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
	 * packed: [00000000|aaaaaabb|bbbbcccc|ccdddddd] */
	const __m128i packed = _mm_shuffle_epi8(in, byte_order);

	/* Upper 16 bits come from the copy shifted left by 4, lower 16 bits from
	 * the unshifted value:
	 * merged: [0000aaaa|aabbbbbb|bbbbcccc|ccdddddd] */
	const __m128i merged = _mm_blend_epi16(_mm_slli_epi32(packed, 4), packed, 0x55);

	/* bd:     [00000000|00bbbbbb|00000000|00dddddd] */
	const __m128i bd = _mm_and_si128(merged, mask_bd);

	/* ac:     [00aaaaaa|00000000|00cccccc|00000000] */
	const __m128i ac = _mm_and_si128(_mm_slli_epi32(merged, 2), mask_ac);

	/* ac | bd = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
	 * result  = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] after the byte swap */
	return _mm_bswap_epi32(_mm_or_si128(ac, bd));
}
예제 #17
0
/*
 * Demo driver for the printf extension hooks: registers a custom argument
 * type for XMM vectors plus six length modifiers, installs handlers for the
 * 'f' and 'x' conversions, and then prints scalar and vector values through
 * the ordinary printf().
 */
int
main (void)
{
  /* custom argument type; the returned id is kept in the global pa_xmm */
  pa_xmm = register_printf_type (xmm_va);

  /* one modifier per vector layout; each returned id is kept in a global */
  mod_v4f = register_printf_modifier (L"v4f");
  mod_v2d = register_printf_modifier (L"v2d");
  mod_v16i = register_printf_modifier (L"v16i");
  mod_v8i = register_printf_modifier (L"v8i");
  mod_v4i = register_printf_modifier (L"v4i");
  mod_v2i = register_printf_modifier (L"v2i");

  /* route %f and %x through the vector-aware handlers */
  register_printf_specifier ('f', xmm_printf_f, xmm_ais);
  register_printf_specifier ('x', xmm_printf_x, xmm_ais);

  const __m128 vec_ps = _mm_set_ps (1.0, 2.0, 3.0, 4.0);
  const __m128d vec_pd = _mm_set_pd (1.0, 2.0);
  const __m128i vec_int = _mm_set_epi8 (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);

  /* plain scalars still go through the replaced 'f' handler */
  printf ("%f\n", 1.0f);
  printf ("%f\n", 2.0);
  /* vectors of floats and doubles */
  printf ("%v4ff\n", vec_ps);
  printf ("%v2df\n", vec_pd);
  /* a plain int, then the same integer vector with each integer modifier */
  printf ("%x\n", 1);
  printf ("%v16ix\n", vec_int);
  printf ("%v8ix\n", vec_int);
  printf ("%v4ix\n", vec_int);
  printf ("%v2ix\n", vec_int);

  return 0;
}
예제 #18
0
// Multiplies the SIZE x SIZE matrix `mat` (an array of row pointers) by `vec`
// and repoints `vec` at a freshly allocated array holding the product.
//
// Two columns are processed per SSE step.  Unaligned loads are used because
// nothing here guarantees 16-byte alignment of the rows or of `vec`, and an
// odd SIZE is finished with one scalar multiply-add instead of reading one
// element past the end of the row and of `vec`.
//
// TODO: take the dimension as a parameter instead of the global SIZE.
// NOTE(review): the array `vec` pointed to on entry is not freed here;
// confirm who owns it.
void matrix_vector_mul_SSE_double(double** mat, double* &vec)
{
  double* result = new double[SIZE];
  for(unsigned i=0;i<SIZE;i++) { // row
    __m128d running_sum = _mm_setzero_pd(); // two partial sums, initially 0
    unsigned j = 0;
    for(;j+1<SIZE;j+=2) { // col - two at a time
      const __m128d mat_pair = _mm_loadu_pd(&mat[i][j]);
      const __m128d vec_pair = _mm_loadu_pd(&vec[j]);
      running_sum = _mm_add_pd(_mm_mul_pd(mat_pair,vec_pair),running_sum);
    }

    // horizontal add: lane 0 + lane 1
    const __m128d high_lane = _mm_unpackhi_pd(running_sum, running_sum);
    double row_total = _mm_cvtsd_f64(_mm_add_sd(running_sum, high_lane));

    // leftover column when SIZE is odd
    if(j<SIZE) {
      row_total += mat[i][j] * vec[j];
    }

    result[i] = row_total;
  }
  vec = result;
}
template<int shift, int active_bits> void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata,
                                                                       const int istride,
                                                                       const char *odata,
                                                                       const int ostride,
                                                                       const int iwidth,
                                                                       const int iheight,
                                                                       const int ooffset_x,
                                                                       const int ooffset_y,
                                                                       const int owidth,
                                                                       const int oheight) {
  int16_t *idata = (int16_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi16(1);
  const __m128i OFFSET = _mm_set1_epi16(1 << (active_bits - 1));
  const __m128i SHUF = _mm_set_epi8(15,14, 11,10, 7,6, 3,2,
                                    13,12,   9,8, 5,4, 1,0);
  const __m128i CLIP = _mm_set1_epi16((1 << active_bits) - 1);
  const __m128i ZERO = _mm_set1_epi16(0);

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y+=skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 16) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D8 = _mm_load_si128((__m128i *)&idata[y*istride + x + 8]);

      D0 = _mm_shuffle_epi8(D0, SHUF);
      D8 = _mm_shuffle_epi8(D8, SHUF);

      __m128i E0 = _mm_unpacklo_epi64(D0, D8);
      __m128i O1 = _mm_unpackhi_epi64(D0, D8);

      __m128i X0 = _mm_sub_epi16(E0, _mm_srai_epi16(_mm_add_epi16(O1, ONE), 1));
      __m128i X1 = _mm_add_epi16(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi16(X0, X1);
      __m128i Z8 = _mm_unpackhi_epi16(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi16(Z0, ONE);
        Z8 = _mm_add_epi16(Z8, ONE);
        Z0 = _mm_srai_epi16(Z0, shift);
        Z8 = _mm_srai_epi16(Z8, shift);
      }

      Z0 = _mm_add_epi16(Z0, OFFSET);
      Z8 = _mm_add_epi16(Z8, OFFSET);

      Z0 = _mm_min_epi16(Z0, CLIP);
      Z8 = _mm_min_epi16(Z8, CLIP);

      Z0 = _mm_max_epi16(Z0, ZERO);
      Z8 = _mm_max_epi16(Z8, ZERO);

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 0 - ooffset_x)], Z0);
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 8 - ooffset_x)], Z8);
    }
  }
}
예제 #20
0
__m128i reverse_ssse3(const __m128i v) {

    // reverse all bytes at once
    const __m128i indices = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    return _mm_shuffle_epi8(indices, v);
}
예제 #21
0
/*
 * Computes the 8-byte authentication tag from the current state and compares
 * it with the tag stored after the ciphertext.
 *
 * key    - key passed to aes_enc_128().
 * mlen   - ciphertext length in bytes; the received tag is read from
 *          c[mlen] .. c[mlen + 7].
 * c      - ciphertext followed by the 8-byte tag.
 * stateS - state block; updated in place.
 * stateR - state block; updated in place.
 *
 * Returns 0 when all 8 tag bytes match, -1 otherwise. The comparison ORs
 * together the byte differences instead of returning at the first mismatch.
 */
static inline int jambu_tag_verification(__m128i *key, unsigned long long mlen, const unsigned char *c, __m128i *stateS, __m128i *stateR)
{
	unsigned char t[16];
	int check = 0;
	int i;
	const __m128i c3 = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3);
	__m128i tmpT;

	aes_enc_128(stateS, key);
	*stateS = _mm_xor_si128(*stateS, _mm_srli_si128(*stateR, 8));
	*stateS = _mm_xor_si128(*stateS, c3);
	*stateR = _mm_xor_si128(*stateR, *stateS);

	aes_enc_128(stateS, key);
	tmpT = _mm_srli_si128(_mm_xor_si128(*stateS, *stateR), 8);
	tmpT = _mm_xor_si128(tmpT, *stateS);

	/* t is a plain byte array with no 16-byte alignment guarantee, so the
	 * aligned store (_mm_store_si128) could fault; use the unaligned one. */
	_mm_storeu_si128((__m128i*)t, tmpT);

	/* in this program, the mac length is assumed to be a multiple of bytes */
	for (i = 0; i < 8; i++) {
		check |= (c[mlen+i] ^ t[i]);
	}

	return (0 == check) ? 0 : -1;
}
예제 #22
0
파일: Sponge.c 프로젝트: altoplano/PHC
/*
 * XORs a 16-byte salt into state[0], XORs the constant 0x80 into the lowest
 * byte of state[1] and 0x01 into the highest byte of state[3], then runs
 * blake2bLyraSSE() on the state.
 *
 * state - array of 128-bit words; indices 0, 1 and 3 are modified here
 *         before the whole state is passed to blake2bLyraSSE().
 * salt  - 16 readable bytes; salt[0] lands in the lowest byte of state[0].
 */
void absorbPaddedSaltSSE(__m128i *state, const unsigned char *salt) {
    /* Same byte layout as _mm_set_epi8(salt[15], ..., salt[0]). */
    const __m128i saltBlock = _mm_loadu_si128((const __m128i *) salt);
    const __m128i lowMark = _mm_set_epi64x(0, 0x80);
    const __m128i highMark = _mm_set_epi64x(0x0100000000000000ULL, 0);

    state[0] = _mm_xor_si128(state[0], saltBlock);
    state[1] = _mm_xor_si128(state[1], lowMark);
    state[3] = _mm_xor_si128(state[3], highMark);

    blake2bLyraSSE(state);
}
예제 #23
0
// Fills the 4 x 16 table ft4 with quantized distances.
//
// For table row sq_i (0..3) the source floats start at
// dists + (sq_i + 4) * NCENT; entry h_cent_i packs the 16 consecutive floats
// starting at offset h_cent_i * 16 from there. Each float is passed through
// Q127(value, qmin, qmax) and stored as one byte, float k going to byte k of
// the 128-bit entry.
FORCE_INLINE
static inline void ft4_small_table(float qmax, const float* dists, __m128i (&ft4)[4][16], const float qmin) {
	for(int sq_i = 0; sq_i < 4; ++sq_i) {
		const float* const sq_dists = dists + (sq_i + 4) * NCENT;
		for(int h_cent_i = 0; h_cent_i < 16; ++h_cent_i) {
			const float* const group = sq_dists + h_cent_i * 16;

			// Quantize into a byte buffer, then load it in one go; byte k of
			// the register ends up holding the quantized group[k].
			char quantized[16];
			for(int lane = 0; lane < 16; ++lane) {
				quantized[lane] = Q127(group[lane], qmin, qmax);
			}
			ft4[sq_i][h_cent_i] = _mm_loadu_si128((const __m128i*)quantized);
		}
	}
}
예제 #24
0
// Returns true if any of the `length` bytes starting at `psz` is '%'.
bool CPathUtils::ContainsEscapedChars(const char * psz, size_t length)
{
    // Most strings are only tens of bytes long, so a little setup overhead
    // is affordable in exchange for scanning the bulk 16 bytes at a time.

    const char* const end = psz + length;
    if (sse2supported)
    {
        const __m128i percent = _mm_set1_epi8 ('%');

        while (static_cast<size_t>(end - psz) >= sizeof (percent))
        {
            // compare the next 16 source bytes against '%' in one step

            const __m128i chunk = _mm_loadu_si128 ((const __m128i*)psz);
            const __m128i hits = _mm_cmpeq_epi8 (chunk, percent);

            // any set bit means at least one byte matched

            if (_mm_movemask_epi8 (hits) != 0)
                return true;

            psz += sizeof (percent);
        }
    }

    // check the remaining bytes (all of them when SSE2 is unavailable)

    while (psz < end)
    {
        if (*psz == '%')
            return true;
        ++psz;
    }

    return false;
}
예제 #25
0
파일: main.cpp 프로젝트: minh0722/HPC2015
//using https://software.intel.com/sites/landingpage/IntrinsicsGuide/
//and further explanations here - http://stackoverflow.com/questions/12778620/mm-shuffle-epi8-and-an-example
void reverse(char* bytes, int numChunks)
{
	//64 bytes as an array of 4 16-byte parts
	__m128i chunkParts[CHUNK_PARTS_COUNT];

	//set the mask so that all the bytes in each 16-byte part in the 64-byte chunk is reversed
	__m128i shuffleControlMask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);


	int numBytes = numChunks * CHUNK_SIZE;
	for(int i = 0; i < numBytes; i += CHUNK_SIZE)
	{
		//load current chunk
		for(int j = 0; j < CHUNK_PARTS_COUNT; ++j)
			chunkParts[j] = _mm_loadu_si128((__m128i *)&bytes[i + CHUNK_PART_SIZE*j]);

		//shuffle bytes in each part of the chunk so that they're reversed and
		//then store/write each chunk part back in the original array ordering them correctly
		for(int j = 0; j < CHUNK_PARTS_COUNT; ++j)
			_mm_storeu_si128((__m128i *)&bytes[i + (CHUNK_PARTS_COUNT - 1 - j)*CHUNK_PART_SIZE],
							 _mm_shuffle_epi8(chunkParts[j] ,shuffleControlMask));
	}


}
예제 #26
0
/* Derive the mbuf packet_type of four received packets from their
 * descriptors using byte-shuffle table lookups.
 *
 * descs   - four descriptors; only the lowest 16-bit word of each is used.
 * rx_pkts - four mbufs; rx_pkts[k]->packet_type is set from descs[k].
 *
 * @note: When this function is changed, make corresponding change to
 * fm10k_dev_supported_ptypes_get().
 */
static inline void
fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i l3l4type0, l3l4type1, l3type, l4type;
	/* Lets the low 64 bits of the result be read as four 16-bit lanes. */
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* L3 pkt type mask: bits 4 to 6, in the four low 16-bit lanes only */
	const __m128i l3type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0070, 0x0070, 0x0070, 0x0070);

	/* L4 pkt type mask: bits 7 to 9, in the four low 16-bit lanes only */
	const __m128i l4type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0380, 0x0380, 0x0380, 0x0380);

	/* Lookup table from the shifted L3 type field to mbuf format.
	 * _mm_set_epi8 lists bytes from index 15 down to index 0, so index 1
	 * is IPV4, 2 is IPV4_EXT, 3 is IPV6, 4 is IPV6_EXT, all others 0.
	 */
	const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, RTE_PTYPE_L3_IPV6_EXT,
			RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV4_EXT,
			RTE_PTYPE_L3_IPV4, 0);

	/* Lookup table from the shifted L4 type field to mbuf format. Each
	 * RTE_PTYPE_* value is stored shifted right by 8 bits so that it fits
	 * into a single table byte; the shift is undone after the lookup.
	 * Index 1 is TCP, 2 UDP, 3 GRE, 4 VXLAN, 5 NVGRE, 6 GENEVE.
	 */
	const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			RTE_PTYPE_TUNNEL_GENEVE >> 8,
			RTE_PTYPE_TUNNEL_NVGRE >> 8,
			RTE_PTYPE_TUNNEL_VXLAN >> 8,
			RTE_PTYPE_TUNNEL_GRE >> 8,
			RTE_PTYPE_L4_UDP >> 8,
			RTE_PTYPE_L4_TCP >> 8,
			0);

	/* Gather the lowest 16-bit word of descs[0..3] into lanes 0..3. */
	l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1);

	/* Isolate the L3 and L4 type fields of each packet. */
	l3type = _mm_and_si128(l3l4type0, l3type_msk);
	l4type = _mm_and_si128(l3l4type0, l4type_msk);

	/* Move each field down so it can serve as a table index.
	 * NOTE(review): L3TYPE_SHIFT / L4TYPE_SHIFT are defined elsewhere;
	 * presumably 4 and 7 to match the masks above - confirm.
	 */
	l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT);
	l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT);

	l3type = _mm_shuffle_epi8(l3type_flags, l3type);
	/* l4type_flags entries were stored shifted right by 8 bits; after the
	 * lookup they are shifted left by 8 to restore the real value.
	 */
	l4type = _mm_shuffle_epi8(l4type_flags, l4type);

	l4type = _mm_slli_epi16(l4type, 8);
	l3l4type0 = _mm_or_si128(l3type, l4type);
	vol.dword = _mm_cvtsi128_si64(l3l4type0);

	rx_pkts[0]->packet_type = vol.e[0];
	rx_pkts[1]->packet_type = vol.e[1];
	rx_pkts[2]->packet_type = vol.e[2];
	rx_pkts[3]->packet_type = vol.e[3];
}
/* Runs test() on two fixed signed-byte vectors and aborts unless every lane
 * of the result equals the per-lane sum of the inputs (computed in char
 * arithmetic). */
static void
TEST (void)
{
  union128i_b result, lhs, rhs;
  char expected[16];
  int lane;

  lhs.x = _mm_set_epi8 (1,2,3,4,10,20,30,90,-80,-40,-100,-15,98, 25, 98,7);
  rhs.x = _mm_set_epi8 (88, 44, 33, 22, 11, 98, 76, -100, -34, -78, -39, 6, 3, 4, 5, 119);
  result.x = test (lhs.x, rhs.x);

  /* Scalar reference, one lane at a time. */
  for (lane = 0; lane < 16; ++lane)
    {
      expected[lane] = lhs.a[lane] + rhs.a[lane];
    }

  if (check_union128i_b (result, expected))
    {
      abort ();
    }
}
/* Runs test() on two fixed unsigned-byte vectors and aborts unless every
 * lane of the result equals the per-lane XOR of the inputs. */
static void
TEST (void)
{
  union128i_ub result, lhs, rhs;
  unsigned char expected[16] = {0};
  int lane;

  lhs.x = _mm_set_epi8 (1,2,3,4,10,20,30,90,80,40,100,15,98, 25, 98,7);
  rhs.x = _mm_set_epi8 (88, 44, 33, 22, 11, 98, 76, 100, 34, 78, 39, 6, 3, 4, 5, 119);
  result.x = test (lhs.x, rhs.x);

  /* Scalar reference, one lane at a time. */
  for (lane = 0; lane < 16; ++lane)
    {
      expected[lane] = lhs.a[lane] ^ rhs.a[lane];
    }

  if (check_union128i_ub (result, expected))
    {
      abort ();
    }
}
예제 #29
0
파일: i40e_rxtx_vec.c 프로젝트: RIFTIO/dpdk
/* Derive the ol_flags of four received packets from their descriptors using
 * byte-shuffle table lookups.
 *
 * descs   - four descriptors; 16-bit word 4 of each one is examined.
 * rx_pkts - four mbufs; rx_pkts[k]->ol_flags is overwritten from descs[k].
 */
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	/* Lets the low 64 bits of the result be read as four 16-bit lanes. */
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;
	__m128i hi01, hi23, status, vlan, rss;
	int pkt;

	/* Keep only bit 2 (vlan tag) and bits 13:12 (rss) of the four low
	 * 16-bit lanes; everything else is cleared.
	 */
	const __m128i rss_vlan_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x3004, 0x3004, 0x3004, 0x3004);

	/* Table indexed by the masked low byte: index 4 (bit 2 set) yields
	 * the vlan flag, every other index yields 0.
	 */
	const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, PKT_RX_VLAN_PKT,
			0, 0, 0, 0);

	/* Table indexed by bits 13:12 once shifted down: index 1 yields the
	 * rss hash flag, index 3 the fdir flag, indexes 0 and 2 yield 0.
	 */
	const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			PKT_RX_FDIR, 0, PKT_RX_RSS_HASH, 0);

	/* Gather 16-bit word 4 of descs[0..3] into lanes 0..3 and mask it. */
	hi01 = _mm_unpackhi_epi16(descs[0], descs[1]);
	hi23 = _mm_unpackhi_epi16(descs[2], descs[3]);
	status = _mm_and_si128(_mm_unpacklo_epi32(hi01, hi23), rss_vlan_msk);

	/* Translate both fields through their tables and merge the results. */
	vlan = _mm_shuffle_epi8(vlan_flags, status);
	rss = _mm_shuffle_epi8(rss_flags, _mm_srli_epi16(status, 12));
	vol.dword = _mm_cvtsi128_si64(_mm_or_si128(vlan, rss));

	for (pkt = 0; pkt < 4; pkt++)
		rx_pkts[pkt]->ol_flags = vol.e[pkt];
}
예제 #30
0
/* Builds a 128-bit byte vector whose byte k (k = 0 is the lowest byte) holds
 * argument i<k>, i.e. the arguments are given in ascending byte order. */
static const __m128i MakeShuffler(uint8_t i0, uint8_t i1, uint8_t i2,
                                  uint8_t i3, uint8_t i4, uint8_t i5,
                                  uint8_t i6, uint8_t i7, uint8_t i8,
                                  uint8_t i9, uint8_t i10, uint8_t i11,
                                  uint8_t i12, uint8_t i13, uint8_t i14,
                                  uint8_t i15) {
  /* Lay the bytes out in memory order and load them unaligned. */
  const uint8_t lanes[16] = {i0, i1, i2,  i3,  i4,  i5,  i6,  i7,
                             i8, i9, i10, i11, i12, i13, i14, i15};
  return _mm_loadu_si128((const __m128i *)lanes);
}