bool recogUnicodeRange(const __m128i data, int& dataLength, unsigned int mask) { //first check whether in the 2 bytes encoding range const __m128i Unicode_80_BE = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\xBE','\x80'); unsigned int mask_80_BE = _mm_cvtsi128_si32(_mm_cmpestrm(Unicode_80_BE, 2, data, dataLength, _SIDD_CMP_RANGES)); const __m128i Unicode_C2_DF = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\xDF', '\xC2'); unsigned int mask_C2_DF = _mm_cvtsi128_si32(_mm_cmpestrm(Unicode_C2_DF, 2, data, dataLength, _SIDD_CMP_RANGES)); if( mask_C2_DF > 0 ) { checkIncompleteBytes(mask_C2_DF, mask, dataLength, 1); if( mask_C2_DF > 0 ) { unsigned int mask_C2_DF_2 = mask_C2_DF << 1; if( (mask_C2_DF_2 & mask_80_BE) != mask_C2_DF_2 ) { const __m128i Unicode_80_BF = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\xBF', '\x80'); unsigned int mask_80_BF = _mm_cvtsi128_si32(_mm_cmpestrm(Unicode_80_BF, 2, data, dataLength, _SIDD_CMP_RANGES)); if( (mask_C2_DF_2 & mask_80_BF) != mask_C2_DF_2 ) { return false; } } mask |= mask_C2_DF; mask |= mask_C2_DF_2; if( mask == 0xFFFFFFFF ) { return true; } } else { if( dataLength <= 0 ) return false; if( mask == 0xFFFFFFFF ) return true; } } //then check whether in the 3 bytes encoding range const __m128i Unicode_E1_EC_EE_EF = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\xEF', '\xEF', '\xEE', '\xEE', '\xEC', '\xE1'); unsigned int mask_E1_EC_EE_EF = _mm_cvtsi128_si32(_mm_cmpestrm(Unicode_E1_EC_EE_EF, 6, data, dataLength, _SIDD_CMP_RANGES)); if( mask_E1_EC_EE_EF > 0 ) { checkIncompleteBytes(mask_E1_EC_EE_EF, mask, dataLength, 2); if( mask_E1_EC_EE_EF > 0 ) { unsigned int mask_E1_EC_EE_EF_2 = mask_E1_EC_EE_EF << 1; unsigned int mask_E1_EC_EE_EF_3 = mask_E1_EC_EE_EF << 2; if( (mask_E1_EC_EE_EF_2 & mask_80_BE) == mask_E1_EC_EE_EF_2 ) { if( (mask_E1_EC_EE_EF_3 & mask_80_BE) != mask_E1_EC_EE_EF_3 ) { const __m128i Unicode_80_BF = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\xBF', '\x80'); unsigned int 
mask_80_BF = _mm_cvtsi128_si32(_mm_cmpestrm(Unicode_80_BF, 2, data, dataLength, _SIDD_CMP_RANGES)); if( (mask_E1_EC_EE_EF_3 & mask_80_BF) != mask_E1_EC_EE_EF_3 ) { return false; } } mask |= mask_E1_EC_EE_EF; mask |= mask_E1_EC_EE_EF_2; mask |= mask_E1_EC_EE_EF_3; if( mask == 0xFFFFFFFF ) { return true; } } else { return false; } } else { if( dataLength <= 0 ) return false; if( mask == 0xFFFFFFFF ) return true; } } return false; }
/*
 * XORs one round key into the 4x4 byte state, 16 bytes at a time.
 *
 * state - 4x4 byte matrix, updated in place (state[r][c] is byte 4*r + c of
 *         the 16-byte block that is loaded and stored)
 * w     - four key words; byte r of w[c], counted from the most significant
 *         byte, is XORed into state[r][c]
 */
void AddRoundKey_sse(BYTE state[][4], const WORD w[])
{
    BYTE subkey[16];
    int row, col;

    /* Row r of the key matrix takes bits (31-8r)..(24-8r) of each word. */
    for (row = 0; row < 4; row++) {
        const int shift = 24 - 8 * row;
        for (col = 0; col < 4; col++)
            subkey[4 * row + col] = (BYTE)(w[col] >> shift);
    }

    const __m128i subkeySse = _mm_loadu_si128((const __m128i *)subkey);
    const __m128i stateSse = _mm_loadu_si128((const __m128i *)state);

    /* The cast is required: state decays to BYTE (*)[4], not __m128i *. */
    _mm_storeu_si128((__m128i *)state, _mm_xor_si128(stateSse, subkeySse));
}
/*
 * Initializes the five-word state from a 128-bit key and a 128-bit IV.
 *
 * key   - 16 key bytes (no alignment requirement)
 * iv    - 16 IV bytes (no alignment requirement)
 * state - receives the five 128-bit state words state[0..4]
 *
 * After loading the initial words, ten state-update rounds are run; the word
 * absorbed into state[0] alternates between key and key^iv.
 */
void aegis128_initialization(const unsigned char *key, const unsigned char *iv, __m128i *state)
{
    /* The two 128-bit constants used to fill the initial state. */
    const __m128i const_a = _mm_set_epi8(0x62,0x79,0xe9,0x90,0x59,0x37,0x22,0x15,0x0d,0x08,0x05,0x03,0x02,0x01,0x1,0x0);
    const __m128i const_b = _mm_set_epi8(0xdd,0x28,0xb5,0x73,0x42,0x31,0x11,0x20,0xf1,0x2f,0xc2,0x6d,0x55,0x18,0x3d,0xdb);

    int i;
    __m128i tmp;

    /* key/iv are plain byte buffers: use unaligned loads so that a caller
     * passing a pointer that is not 16-byte aligned does not fault. */
    __m128i keytmp = _mm_loadu_si128((const __m128i *)key);
    const __m128i ivtmp = _mm_loadu_si128((const __m128i *)iv);

    state[0] = _mm_xor_si128(ivtmp, keytmp);
    state[1] = const_b;
    state[2] = const_a;
    state[3] = _mm_xor_si128(keytmp, const_a);
    state[4] = _mm_xor_si128(keytmp, const_b);

    /* keytmp toggles between key^iv and key; it is flipped again before each
     * use below, so the first round absorbs the plain key. */
    keytmp = _mm_xor_si128(keytmp, ivtmp);

    for (i = 0; i < 10; i++) {
        /* state update function */
        tmp = state[4];
        state[4] = _mm_aesenc_si128(state[3], state[4]);
        state[3] = _mm_aesenc_si128(state[2], state[3]);
        state[2] = _mm_aesenc_si128(state[1], state[2]);
        state[1] = _mm_aesenc_si128(state[0], state[1]);
        state[0] = _mm_aesenc_si128(tmp, state[0]);

        /* xor the current message word into state[0] */
        keytmp = _mm_xor_si128(keytmp, ivtmp);
        state[0] = _mm_xor_si128(state[0], keytmp);
    }
}
// fl48 V1 void matrix_vector_mul_SSE_f48(fl48** mat, fl48* &vec) { fl48* result = new fl48[SIZE]; // should be SIZE of result! __m128i mask = _mm_set_epi8(11, 10, 9, 8, 7, 6, 255, 255, 5, 4, 3, 2, 1, 0, 255, 255); __m128i shuffling_mask = _mm_set_epi8(7 ,6 ,5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); for(unsigned i=0;i<SIZE;i++) { // row __m128d running_sum = _mm_set1_pd(0.0); // running sum initially 0 for(unsigned j=0;j<SIZE;j+=2) { // col - requires skipping on 2 at a time // multiply each // add to running sum __m128i mat_vect = _mm_loadu_si128((__m128i*) &mat[i][j]); // hoping that addresses are as expected - seems like this is the way it's stored // ^^ needs explanation and backup for REPORT - ROW major storing order in C/C++ such as python, pascal and others mat_vect = _mm_shuffle_epi8(mat_vect, mask); __m128i vec_elem = _mm_loadu_si128((__m128i*) &vec[j]); vec_elem = _mm_shuffle_epi8(vec_elem, mask); __m128d mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum = _mm_add_pd(mult,running_sum); } // shuffle & add (to make hadd) // store back to vec[i] __m128i sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum, shuffling_mask); running_sum = _mm_add_pd(running_sum,(__m128d)sum_shuffled); double temp=0; _mm_store_sd(&temp, running_sum); result[i]=fl48(temp); } vec = result; }
// Convert 16 packed ARGB 16b-values to r[], g[], b[] static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41( const uint16_t* const rgbx, __m128i* const r, __m128i* const g, __m128i* const b) { const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ... const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ... // aarrggbb as 16-bit. const __m128i shuff0 = _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); const __m128i shuff1 = _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0); const __m128i A0 = _mm_shuffle_epi8(in0, shuff0); const __m128i A1 = _mm_shuffle_epi8(in1, shuff1); const __m128i A2 = _mm_shuffle_epi8(in2, shuff0); const __m128i A3 = _mm_shuffle_epi8(in3, shuff1); // R0R1G0G1 // B0B1**** // R2R3G2G3 // B2B3**** // (OR is used to free port 5 for the unpack) const __m128i B0 = _mm_unpacklo_epi32(A0, A1); const __m128i B1 = _mm_or_si128(A0, A1); const __m128i B2 = _mm_unpacklo_epi32(A2, A3); const __m128i B3 = _mm_or_si128(A2, A3); // Gather the channels. *r = _mm_unpacklo_epi64(B0, B2); *g = _mm_unpackhi_epi64(B0, B2); *b = _mm_unpackhi_epi64(B1, B3); }
/*
 * Combined update + final for Groestl (TF1024 / OF1024 variant).
 *
 * ctx        - hash state; ctx->buffer may already hold ctx->rem_ptr 128-bit
 *              words from an earlier update (midstate precalc)
 * output     - receives ctx->hashlen bytes of digest
 * input      - message data, read as an array of __m128i
 * databitlen - input length in bits; only whole 128-bit words are consumed
 *
 * Always returns SUCCESS_GR.
 *
 * NOTE(review): a completely full buffer (rem_ptr + len % SIZE512 == SIZE512)
 * is not handled here; callers presumably never produce it - confirm.
 */
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
                                        const void* input, DataLength_gr databitlen )
{
   const int len = (int)databitlen / 128;          // input length in 128-bit words
   const int hashlen_m128i = ctx->hashlen / 16;    // bytes to __m128i
   const int hash_offset = SIZE512 - hashlen_m128i;
   int rem = ctx->rem_ptr;
   int blocks = len / SIZE512;
   __m128i* in = (__m128i*)input;
   int i;

   // --- update ---

   // digest any full blocks, process directly from input
   for ( i = 0; i < blocks; i++ )
      TF1024( ctx->chaining, &in[ i * SIZE512 ] );
   ctx->buf_ptr = blocks * SIZE512;

   // copy any remaining data to buffer, it may already contain data
   // from a previous update for a midstate precalc
   for ( i = 0; i < len % SIZE512; i++ )
      ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
   i += rem;    // use i as rem_ptr in final

   // --- final ---

   blocks++;    // adjust for final block

   // The single-word padding form applies when exactly one 128-bit slot of
   // the buffer is still free, i.e. i == SIZE512 - 1. (Comparing against
   // len - 1 sent that case to the else branch, which then wrote the length
   // word to buffer[SIZE512], one element past the end.)
   if ( i == SIZE512 - 1 )
   {
      // only 128 bits left in buffer, all padding at once:
      // 0x80 marker in the first byte, block count in the last two bytes
      ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
                                     0,0,0,0, 0,0,0,0x80 );
   }
   else
   {
      // add first padding
      ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
                                     0,0,0,0, 0,0,0,0x80 );
      // add zero padding
      for ( i += 1; i < SIZE512 - 1; i++ )
         ctx->buffer[i] = _mm_setzero_si128();

      // add length padding, second last byte is zero unless blocks > 255
      ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
                                     0,0,0,0, 0,0,0,0 );
   }

   // digest final padding block and do output transform
   TF1024( ctx->chaining, ctx->buffer );
   OF1024( ctx->chaining );

   // store hash result in output
   for ( i = 0; i < hashlen_m128i; i++ )
      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];

   return SUCCESS_GR;
}
/*
 * Runs ten ROUND() steps over one message block and folds the result back
 * into the chaining value S->h. Always returns 0.
 *
 * S     - hash state; S->h[0..7] is read and rewritten, and 16 bytes starting
 *         at &S->t[0] are loaded and XORed into the fourth row
 * block - one message block of BLAKE2S_BLOCKBYTES bytes
 *
 * NOTE(review): ROUND, LOADU and STOREU are macros defined elsewhere. ROUND
 * presumably refers to the locals below (row1..row4, buf1..buf4, t0..t2,
 * m0.., r8, r16) by name, so they must not be renamed - confirm against the
 * macro definitions.
 */
static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
  __m128i row1, row2, row3, row4;
  __m128i buf1, buf2, buf3, buf4;
#if defined(HAVE_SSE41)
  __m128i t0, t1;
#if !defined(HAVE_XOP)
  __m128i t2;
#endif
#endif
  __m128i ff0, ff1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
  /* Byte-shuffle tables: used as a pshufb control, r8 rotates every 32-bit
   * lane right by 8 bits and r16 by 16 bits. */
  const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
  const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
#endif
#if defined(HAVE_SSE41)
  /* SSE4.1 build: keep the message as four 128-bit vectors. */
  const __m128i m0 = LOADU( block + 00 );
  const __m128i m1 = LOADU( block + 16 );
  const __m128i m2 = LOADU( block + 32 );
  const __m128i m3 = LOADU( block + 48 );
#else
  /* Otherwise: sixteen individual 32-bit message words, read in host byte
   * order straight through a uint32_t pointer (this assumes block is
   * suitably aligned for uint32_t access). */
  const uint32_t m0 = ( ( uint32_t * )block )[ 0];
  const uint32_t m1 = ( ( uint32_t * )block )[ 1];
  const uint32_t m2 = ( ( uint32_t * )block )[ 2];
  const uint32_t m3 = ( ( uint32_t * )block )[ 3];
  const uint32_t m4 = ( ( uint32_t * )block )[ 4];
  const uint32_t m5 = ( ( uint32_t * )block )[ 5];
  const uint32_t m6 = ( ( uint32_t * )block )[ 6];
  const uint32_t m7 = ( ( uint32_t * )block )[ 7];
  const uint32_t m8 = ( ( uint32_t * )block )[ 8];
  const uint32_t m9 = ( ( uint32_t * )block )[ 9];
  const uint32_t m10 = ( ( uint32_t * )block )[10];
  const uint32_t m11 = ( ( uint32_t * )block )[11];
  const uint32_t m12 = ( ( uint32_t * )block )[12];
  const uint32_t m13 = ( ( uint32_t * )block )[13];
  const uint32_t m14 = ( ( uint32_t * )block )[14];
  const uint32_t m15 = ( ( uint32_t * )block )[15];
#endif
  /* Rows 1-2: current chaining value (kept in ff0/ff1 for the feed-forward).
   * Row 3: fixed constants. Row 4: fixed constants XOR 16 bytes of state
   * starting at S->t. */
  row1 = ff0 = LOADU( &S->h[0] );
  row2 = ff1 = LOADU( &S->h[4] );
  row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
  row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOADU( &S->t[0] ) );
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  /* Feed-forward: h := h ^ (row1 ^ row3) and h+4 := (h+4) ^ (row2 ^ row4). */
  STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
  STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
  return 0;
}
// Builds the binary descriptor from the smoothed point intensities.
//
//   pointsValue - intensity per pattern point, indexed by descriptionPairs[].i/.j
//   ptr         - address of the output cursor, treated as __m128i*. One
//                 128-bit word is written per block of 128 pairs, then the
//                 cursor is moved back by 8 words (net effect depends on
//                 FREAK_NB_PAIRS; presumably this steps to the previous
//                 descriptor row for a caller that iterates in reverse -
//                 confirm against the caller).
//
// Bit layout note: the comparison order is modified within each 128-pair
// block, but the first 128 comparisons stay in the first block, so the
// segmented (128 / 384 bit) matching strategy is not affected.
void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const
{
    __m128i** cursor = (__m128i**) ptr;

    int pairBase = 0;
    for( int blk = 0; blk < FREAK_NB_PAIRS/128; ++blk )
    {
        __m128i packed = _mm_setzero_si128();

        // Eight passes of 16 pairs; pass m contributes bit (7 - m) of every byte.
        for( int m = 128/16 - 1; m >= 0; --m, pairBase += 16 )
        {
            // Pair k of this pass goes to byte (15 - k) of the vector.
            uchar lhs[16];
            uchar rhs[16];
            for( int k = 0; k < 16; ++k )
            {
                const int slot = 15 - k;
                lhs[slot] = pointsValue[descriptionPairs[pairBase + k].i];
                rhs[slot] = pointsValue[descriptionPairs[pairBase + k].j];
            }
            const __m128i operand1 = _mm_loadu_si128((const __m128i*) lhs);
            const __m128i operand2 = _mm_loadu_si128((const __m128i*) rhs);

            // Emulated "not less than" for 8-bit UNSIGNED integers:
            // min(a, b) == b  <=>  a >= b.
            const __m128i notLess = _mm_cmpeq_epi8(_mm_min_epu8(operand1, operand2), operand2);

            // Keep a single bit per byte and merge it into the 128-bit word.
            const __m128i bitSelect = _mm_set1_epi16(short(0x8080 >> m));
            packed = _mm_or_si128(packed, _mm_and_si128(bitSelect, notLess));
        }

        **cursor = packed;
        ++(*cursor);
    }
    (*cursor) -= 8;
}
static inline void desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags, struct rte_mbuf **rx_pkts) { __m128i ptype0, ptype1, vtag0, vtag1; union { uint16_t e[4]; uint64_t dword; } vol; /* mask everything except rss type */ const __m128i rsstype_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x000F, 0x000F, 0x000F, 0x000F); /* map rss type to rss hash flag */ const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0, 0, 0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0); /* mask everything except vlan present bit */ const __m128i vlan_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP); /* map vlan present (0x8) to ol_flags */ const __m128i vlan_map = _mm_set_epi8( 0, 0, 0, 0, 0, 0, 0, vlan_flags, 0, 0, 0, 0, 0, 0, 0, 0); ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]); ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]); vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]); vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]); ptype0 = _mm_unpacklo_epi32(ptype0, ptype1); ptype0 = _mm_and_si128(ptype0, rsstype_msk); ptype0 = _mm_shuffle_epi8(rss_flags, ptype0); vtag1 = _mm_unpacklo_epi32(vtag0, vtag1); vtag1 = _mm_and_si128(vtag1, vlan_msk); vtag1 = _mm_shuffle_epi8(vlan_map, vtag1); vtag1 = _mm_or_si128(ptype0, vtag1); vol.dword = _mm_cvtsi128_si64(vtag1); rx_pkts[0]->ol_flags = vol.e[0]; rx_pkts[1]->ol_flags = vol.e[1]; rx_pkts[2]->ol_flags = vol.e[2]; rx_pkts[3]->ol_flags = vol.e[3]; }
void ConvertColor_BGR2GRAY_BT709_simd(const cv::Mat& src, cv::Mat& dst) { CV_Assert(CV_8UC3 == src.type()); cv::Size sz = src.size(); dst.create(sz, CV_8UC1); #ifdef HAVE_SSE // __m128i ssse3_blue_indices_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0); // __m128i ssse3_blue_indices_1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1); // __m128i ssse3_blue_indices_2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); // __m128i ssse3_green_indices_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1); // __m128i ssse3_green_indices_1 = _mm_set_epi8(-1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1); // __m128i ssse3_green_indices_2 = _mm_set_epi8(14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i ssse3_red_indices_0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2); __m128i ssse3_red_indices_1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1); __m128i ssse3_red_indices_2 = _mm_set_epi8(15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); #endif for (int y = 0; y < sz.height; y++) { const uchar *psrc = src.ptr<uchar>(y); uchar *pdst = dst.ptr<uchar>(y); int x = 0; #ifdef HAVE_SSE // Here is 16 times unrolled loop for vector processing for (; x <= sz.width - 16; x += 16) { __m128i chunk0 = _mm_loadu_si128((const __m128i*)(psrc + x*3 + 16*0)); __m128i chunk1 = _mm_loadu_si128((const __m128i*)(psrc + x*3 + 16*1)); __m128i chunk2 = _mm_loadu_si128((const __m128i*)(psrc + x*3 + 16*2)); __m128i red = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, ssse3_red_indices_0), _mm_shuffle_epi8(chunk1, ssse3_red_indices_1)), _mm_shuffle_epi8(chunk2, ssse3_red_indices_2)); /* ??? */ _mm_storeu_si128((__m128i*)(pdst + x), red); } #endif // Process leftover pixels for (; x < sz.width; x++) { /* ??? */ } } // ! Remove this before writing your optimizations ! 
ConvertColor_BGR2GRAY_BT709_fpt(src, dst); }
// Scans character data starting at yycur with the SSE4.2 string instructions
// and reports each run to saxProcessor->reportCharDataContent().
//
// Scanning stops at the first byte from the nonCharData set
// (0x0A, 0x0D, '&', '<', ']') or at the first byte that neither the ASCII
// range check nor recogUnicodeRange() accepts. Line ends are reported through
// saxProcessor->newLine(): a CR LF pair is collapsed by skipping the CR, and
// a lone CR is overwritten with LF in the buffer.
//
// Uses the scanner globals yycur / yylim and the YYSWITCHBUFFER macro, all
// defined elsewhere.
//
// NOTE(review): every iteration loads a full 16 bytes at `data`, even when
// fewer than 16 remain; this presumably relies on padding after yylim -
// confirm against the buffer allocation.
void scanCharDataContentwithSTTNI(SAX2Processor* saxProcessor)
{
    unsigned int length = yylim - yycur;
    unsigned char* data = (unsigned char*)yycur;

    if( *data == '<' || *data == '&' || *data == ']' )
        return;

    unsigned int dataLen = 0;

    // one-byte (ASCII) ranges accepted as character data, as lo/hi pairs:
    // 0x00-0x00, 0x20-0x25, 0x27-0x3B, 0x3D-0x5C, 0x5E-0x7F
    const __m128i asciiCharData = _mm_set_epi8(0,0,0,0,0,0,0x7F,0x5E,0x5C,0x3D,
                                               0x3B,0x27,0x25,0x20,0,0);
    // bytes that terminate a character-data run
    const __m128i nonCharData = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0x5D,0x3C,0x26,0x0D,0x0A);

    do {
        // special new line processing for 0x0A, 0x0D
        // (both tests used to compare against '\0', which made the second
        // branch unreachable and never matched a real line end)
        if( *data == '\x0A' ) {
            saxProcessor->newLine((char*)data);
            data++;
            length--;
        } else if( *data == '\x0D' ) {
            saxProcessor->newLine((char*)data);
            if( *(data+1) == '\x0A' ) {
                // CR LF: drop the CR from the reported content
                data += 2;
                length -= 2;
                yycur++;
            } else {
                // lone CR: normalise to LF in place
                *data = '\x0A';
                data++;
                length--;
            }
        }

        while( length > 0 ) {
            dataLen = ( length >= 16 ) ? 16 : length;

            const __m128i mData = _mm_loadu_si128((__m128i*)data);

            // locate the Character Data part with the nonCharData characters
            int index = _mm_cmpestri(nonCharData, 5, mData, dataLen, _SIDD_CMP_EQUAL_ANY);
            if( index == 0 )
                break;
            if( (unsigned int)index > dataLen )
                index = dataLen;
            const bool shouldBreak = (unsigned int)index < dataLen;

            // check the one byte encoding rule (ASCII); set bits mark bytes
            // outside every accepted range
            unsigned int mask = _mm_cvtsi128_si32(_mm_cmpestrm(asciiCharData, 10, mData, index,
                                    _SIDD_CMP_RANGES|_SIDD_MASKED_NEGATIVE_POLARITY));

            // if not all hit ASCII, continue to check other Unicode rules
            if( mask != 0 && !recogUnicodeRange(mData, index, ~mask) )
                break;

            data += index;
            length -= index;
            if( shouldBreak )
                break;
        }

        unsigned int passLen = (char*)data - yycur;
        if( passLen == 0 )
            break;

        // report Character Data to user
        saxProcessor->reportCharDataContent(yycur, passLen);
        yycur += passLen;
        YYSWITCHBUFFER;
    } while( length >= STTNISTRLENLIMIT && (*data == '\x0A' || *data == '\x0D') );
}
/*
 * One associated-data step with a fixed padding block.
 *
 * stateS is run through aes_enc_128() with key, then XORed with the upper
 * 64 bits of stateR (moved down to the low half), with the constant 1 in the
 * lowest byte, and with a block that carries a single 0x80 at byte 8.
 * stateR is finally XORed with the new stateS.
 */
static inline void jambu_aut_ad_full(__m128i *key, __m128i *stateS, __m128i *stateR)
{
    /* 0x80 in the lowest byte of the upper 64 bits, everything else zero */
    const __m128i pad_block = _mm_set_epi64x(0x80, 0);
    /* constant 1 in the lowest byte */
    const __m128i one = _mm_cvtsi32_si128(1);
    __m128i s;

    aes_enc_128(stateS, key);

    s = _mm_xor_si128(*stateS, _mm_srli_si128(*stateR, 8));
    s = _mm_xor_si128(s, one);
    s = _mm_xor_si128(s, pad_block);

    *stateS = s;
    *stateR = _mm_xor_si128(*stateS, *stateR);
}
/*
 * Compute branch metrics (gamma).
 *
 * For every index k below 8 * (long_cb / 8), with x = input[k] (+ app[k] when
 * app is not NULL) and p = parity[k]:
 *     h->branch[2k]   = (x - p) >> 1
 *     h->branch[2k+1] = (x + p) >> 1
 * The three entries at indices long_cb .. long_cb + 2 are computed in scalar
 * code from input and parity only, using integer division by 2.
 *
 * input, parity, app (if given) and h->branch are accessed with aligned
 * 128-bit loads/stores.
 */
void map_gen_gamma(map_gen_t * h, int16_t *input, int16_t *app, int16_t *parity, uint32_t long_cb)
{
  const __m128i *in_vec  = (const __m128i*) input;
  const __m128i *app_vec = (const __m128i*) app;
  const __m128i *par_vec = (const __m128i*) parity;
  __m128i *out_vec = (__m128i*) h->branch;

  for (int i = 0; i < long_cb/8; i++) {
    __m128i x = _mm_load_si128(in_vec++);
    const __m128i p = _mm_load_si128(par_vec++);

    if (app_vec) {
      x = _mm_add_epi16(_mm_load_si128(app_vec++), x);
    }

    const __m128i sum_half  = _mm_srai_epi16(_mm_add_epi16(x, p), 1);
    const __m128i diff_half = _mm_srai_epi16(_mm_sub_epi16(x, p), 1);

    /* Interleave 16-bit lanes: diff0, sum0, diff1, sum1, ... */
    _mm_store_si128(out_vec++, _mm_unpacklo_epi16(diff_half, sum_half));
    _mm_store_si128(out_vec++, _mm_unpackhi_epi16(diff_half, sum_half));
  }

  /* Three trailing entries */
  for (int i = long_cb; i < long_cb+3; i++) {
    h->branch[2*i]   = (input[i] - parity[i])/2;
    h->branch[2*i+1] = (input[i] + parity[i])/2;
  }
}
/*
 * 16-QAM soft demodulation to 8-bit LLRs, four LLRs per symbol.
 *
 * Per group of eight symbols the real/imaginary parts are scaled by
 * -SCALE_BYTE_CONV_QAM16, converted and saturated to int8, and written as
 *   [y0.re y0.im |y0.re|-off |y0.im|-off  y1.re y1.im ...]
 * where off = 2*SCALE_BYTE_CONV_QAM16/sqrt(10). Remaining symbols
 * (nsymbols % 8) are handled by the scalar loop at the end.
 *
 * symbols and llr are accessed with aligned loads/stores in the vector loop.
 */
void demod_16qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
  const float *src = (const float*) symbols;
  __m128i *dst = (__m128i*) llr;

  const __m128i offset = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM16/sqrt(10));
  const __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM16);

  for (int blk = 0; blk < nsymbols/8; blk++) {
    /* 16 floats (8 complex symbols) -> 4 x 4 int32 */
    __m128i quad[4];
    for (int k = 0; k < 4; k++) {
      quad[k] = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), scale_v));
      src += 4;
    }

    /* saturating narrow: int32 -> int16 -> int8 */
    const __m128i negated = _mm_packs_epi16(_mm_packs_epi32(quad[0], quad[1]),
                                            _mm_packs_epi32(quad[2], quad[3]));
    const __m128i magnitude = _mm_sub_epi8(_mm_abs_epi8(negated), offset);

    /* interleave per symbol: (re, im) pair followed by its (|re|, |im|) pair */
    _mm_store_si128(dst, _mm_unpacklo_epi16(negated, magnitude)); dst++;
    _mm_store_si128(dst, _mm_unpackhi_epi16(negated, magnitude)); dst++;
  }

  // Demodulate last symbols
  for (int i = 8*(nsymbols/8); i < nsymbols; i++) {
    short yre = (int8_t) (SCALE_BYTE_CONV_QAM16*crealf(symbols[i]));
    short yim = (int8_t) (SCALE_BYTE_CONV_QAM16*cimagf(symbols[i]));

    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_BYTE_CONV_QAM16/sqrt(10);
  }
}
/*
 * One associated-data step for a full 8-byte block.
 *
 * stateS is run through aes_enc_128() with key, then XORed with the upper
 * 64 bits of stateR (moved down to the low half), with the constant 1 in the
 * lowest byte, and with adblock[0..7] placed in the upper 64 bits.
 * stateR is finally XORed with the new stateS.
 */
static inline void jambu_aut_ad_step(__m128i *key, const unsigned char *adblock, __m128i *stateS, __m128i *stateR)
{
    /* adblock[0..7] -> bytes 8..15, lower 64 bits zero */
    const __m128i ad_hi = _mm_slli_si128(_mm_loadl_epi64((const __m128i *)adblock), 8);
    /* constant 1 in the lowest byte */
    const __m128i one = _mm_cvtsi32_si128(1);
    __m128i s;

    aes_enc_128(stateS, key);

    s = _mm_xor_si128(*stateS, _mm_srli_si128(*stateR, 8));
    s = _mm_xor_si128(s, one);
    s = _mm_xor_si128(s, ad_hi);

    *stateS = s;
    *stateR = _mm_xor_si128(*stateS, *stateR);
}
static inline __m128i enc_reshuffle (__m128i in) { // Slice into 32-bit chunks and operate on all chunks in parallel. // All processing is done within the 32-bit chunk. First, shuffle: // before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb] // after: [00000000|aaaaaabb|bbbbcccc|ccdddddd] in = _mm_shuffle_epi8(in, _mm_set_epi8( -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2)); // merged = [0000aaaa|aabbbbbb|bbbbcccc|ccdddddd] const __m128i merged = _mm_blend_epi16(_mm_slli_epi32(in, 4), in, 0x55); // bd = [00000000|00bbbbbb|00000000|00dddddd] const __m128i bd = _mm_and_si128(merged, _mm_set1_epi32(0x003F003F)); // ac = [00aaaaaa|00000000|00cccccc|00000000] const __m128i ac = _mm_and_si128(_mm_slli_epi32(merged, 2), _mm_set1_epi32(0x3F003F00)); // indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd] const __m128i indices = _mm_or_si128(ac, bd); // return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] return _mm_bswap_epi32(indices); }
int main (void) { pa_xmm = register_printf_type (xmm_va); mod_v4f = register_printf_modifier (L"v4f"); mod_v2d = register_printf_modifier (L"v2d"); mod_v16i = register_printf_modifier (L"v16i"); mod_v8i = register_printf_modifier (L"v8i"); mod_v4i = register_printf_modifier (L"v4i"); mod_v2i = register_printf_modifier (L"v2i"); register_printf_specifier ('f', xmm_printf_f, xmm_ais); register_printf_specifier ('x', xmm_printf_x, xmm_ais); __m128 f = _mm_set_ps (1.0, 2.0, 3.0, 4.0); __m128d d = _mm_set_pd (1.0, 2.0); __m128i i = _mm_set_epi8 (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); printf ("%f\n", 1.0f); printf ("%f\n", 2.0); printf ("%v4ff\n", f); printf ("%v2df\n", d); printf ("%x\n", 1); printf ("%v16ix\n", i); printf ("%v8ix\n", i); printf ("%v4ix\n", i); printf ("%v2ix\n", i); return 0; }
// TODO: function to take SIZE // TODO: requires implementation // matrix is square? // same questions as above void matrix_vector_mul_SSE_double(double** mat, double* &vec) { double* result = new double[SIZE]; // should be SIZE of result! for(unsigned i=0;i<SIZE;i++) { // row __m128d running_sum = _mm_set1_pd(0.0); // running sum initially 0 for(unsigned j=0;j<SIZE;j+=2) { // col - requires skipping on 2 at a time // multiply each // add to running sum __m128d mat_vect = _mm_load_pd(&mat[i][j]); // hoping that addresses are as expected - seems like this is the way it's stored // ^^ needs explanation and backup for REPORT - ROW major storing order in C/C++ such as python, pascal and others __m128d vec_elem = _mm_load_pd(&vec[j]); __m128d mult = _mm_mul_pd(mat_vect,vec_elem); running_sum = _mm_add_pd(mult,running_sum); } // shuffle & add (to make hadd) // store back to vec[i] __m128i mask = _mm_set_epi8(7 ,6 ,5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); __m128i sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum, mask); running_sum = _mm_add_pd(running_sum,(__m128d)sum_shuffled); // convert running_sum back to f48s and store in memory _mm_store_sd(&result[i], running_sum); } vec = result; }
// Final horizontal inverse Haar stage for int16 data, 16 samples per step.
//
// For each input pair (e, o) = (idata[2k], idata[2k+1]) of a row:
//     x0 = e - ((o + 1) >> 1)
//     x1 = o + x0
// Each result is then, when shift != 0, rounded with +1 and shifted right by
// `shift`, offset by 1 << (active_bits - 1) and clamped to
// [0, (1 << active_bits) - 1] before being stored as int16.
//
//   _idata / istride      - input samples and row stride (in samples)
//   odata / ostride       - output bytes and row stride (in samples); written
//                           through a cast even though declared const char *
//   ooffset_x / ooffset_y - top-left corner of the region, in input coordinates
//   owidth / oheight      - region size; x advances in steps of 16
//   iwidth / iheight      - unused
//
// Both input and output are accessed with aligned 128-bit loads/stores.
template<int shift, int active_bits>
void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata,
                                                const int istride,
                                                const char *odata,
                                                const int ostride,
                                                const int iwidth,
                                                const int iheight,
                                                const int ooffset_x,
                                                const int ooffset_y,
                                                const int owidth,
                                                const int oheight) {
  (void)iwidth;
  (void)iheight;

  int16_t *samples = (int16_t *)_idata;
  const int step_y = 1;

  const __m128i kOne    = _mm_set1_epi16(1);
  const __m128i kOffset = _mm_set1_epi16(1 << (active_bits - 1));
  const __m128i kMax    = _mm_set1_epi16((1 << active_bits) - 1);
  const __m128i kZero   = _mm_set1_epi16(0);
  // even-indexed words to the low 64 bits, odd-indexed words to the high 64
  const __m128i kSplit  = _mm_set_epi8(15,14, 11,10, 7,6, 3,2,
                                       13,12, 9,8, 5,4, 1,0);

  const int y_end = ooffset_y + oheight;
  const int x_end = ooffset_x + owidth;

  for (int y = ooffset_y; y < y_end; y += step_y) {
    for (int x = ooffset_x; x < x_end; x += 16) {
      const __m128i lo = _mm_shuffle_epi8(
          _mm_load_si128((__m128i *)&samples[y*istride + x + 0]), kSplit);
      const __m128i hi = _mm_shuffle_epi8(
          _mm_load_si128((__m128i *)&samples[y*istride + x + 8]), kSplit);

      const __m128i even = _mm_unpacklo_epi64(lo, hi);
      const __m128i odd  = _mm_unpackhi_epi64(lo, hi);

      const __m128i x0 = _mm_sub_epi16(even, _mm_srai_epi16(_mm_add_epi16(odd, kOne), 1));
      const __m128i x1 = _mm_add_epi16(odd, x0);

      __m128i out0 = _mm_unpacklo_epi16(x0, x1);
      __m128i out8 = _mm_unpackhi_epi16(x0, x1);

      if (shift != 0) {
        out0 = _mm_srai_epi16(_mm_add_epi16(out0, kOne), shift);
        out8 = _mm_srai_epi16(_mm_add_epi16(out8, kOne), shift);
      }

      out0 = _mm_add_epi16(out0, kOffset);
      out8 = _mm_add_epi16(out8, kOffset);

      // clamp to [0, kMax]
      out0 = _mm_max_epi16(_mm_min_epi16(out0, kMax), kZero);
      out8 = _mm_max_epi16(_mm_min_epi16(out8, kMax), kZero);

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 0 - ooffset_x)], out0);
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 8 - ooffset_x)], out8);
    }
  }
}
/*
 * Returns v with its 16 bytes in reverse order (byte 0 <-> byte 15, ...).
 */
__m128i reverse_ssse3(const __m128i v)
{
    /* Control vector: destination byte i takes source byte 15 - i. */
    const __m128i indices = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    /* _mm_shuffle_epi8(data, control): the data to permute comes first.
     * With the operands the other way round the index table itself would be
     * permuted by the data bytes. */
    return _mm_shuffle_epi8(v, indices);
}
/*
 * Recomputes the 8-byte tag from the current state and compares it with the
 * 8 bytes stored at c[mlen .. mlen+7].
 *
 * key            - passed through to aes_enc_128()
 * mlen           - offset of the tag inside c
 * c              - ciphertext followed by the tag
 * stateS, stateR - running state; both are modified
 *
 * Returns 0 when all 8 tag bytes match, -1 otherwise. The comparison
 * accumulates the XOR of every byte pair and does not exit early.
 */
static inline int jambu_tag_verification(__m128i *key, unsigned long long mlen, const unsigned char *c, __m128i *stateS, __m128i *stateR)
{
    unsigned char t[16];
    int check = 0;
    int i;
    const __m128i c3 = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3);
    __m128i tmpT;

    aes_enc_128(stateS, key);
    *stateS = _mm_xor_si128(*stateS, _mm_srli_si128(*stateR, 8));
    *stateS = _mm_xor_si128(*stateS, c3);
    *stateR = _mm_xor_si128(*stateR, *stateS);

    aes_enc_128(stateS, key);
    tmpT = _mm_srli_si128(_mm_xor_si128(*stateS, *stateR), 8);
    tmpT = _mm_xor_si128(tmpT, *stateS);

    /* t is a plain byte array on the stack with no 16-byte alignment
     * guarantee, so it must be written with an unaligned store. */
    _mm_storeu_si128((__m128i *)t, tmpT);

    /* in this program, the mac length is assumed to be multiple of bytes */
    for (i = 0; i < 8; i++)
        check |= (c[mlen + i] ^ t[i]);

    return (0 == check) ? 0 : -1;
}
/*
 * XORs a 16-byte salt plus two fixed padding bytes into the state and then
 * runs blake2bLyraSSE() on it.
 *
 * state - state words; state[0], state[1] and state[3] are modified here
 *         before the permutation call
 * salt  - 16 bytes; salt[0] goes to the lowest byte of state[0]
 */
void absorbPaddedSaltSSE(__m128i *state, const unsigned char *salt)
{
    /* salt[0..15], salt[0] in the lowest byte */
    const __m128i salt_block = _mm_loadu_si128((const __m128i *)salt);
    /* 0x80 in the lowest byte of state[1] */
    const __m128i pad_first = _mm_set_epi64x(0, 0x80);
    /* 0x01 in the highest byte of state[3] */
    const __m128i pad_last = _mm_set_epi64x(0x0100000000000000ULL, 0);

    state[0] = _mm_xor_si128(state[0], salt_block);
    state[1] = _mm_xor_si128(state[1], pad_first);
    state[3] = _mm_xor_si128(state[3], pad_last);

    blake2bLyraSSE(state);
}
// Fills the 4 x 16 table ft4 with quantized distances.
//
// For sub-quantizer sq_i (0..3) the source row starts at
// dists + (sq_i + 4) * NCENT; entry h_cent_i packs the 16 consecutive floats
// at offset h_cent_i * 16 of that row, each passed through
// Q127(value, qmin, qmax), with element 0 in the lowest byte.
FORCE_INLINE
static inline void ft4_small_table(float qmax, const float* dists,
                                   __m128i (&ft4)[4][16], const float qmin) {
	for (int sq_i = 0; sq_i < 4; ++sq_i) {
		const float* const row = dists + (sq_i + 4) * NCENT;

		for (int h_cent_i = 0; h_cent_i < 16; ++h_cent_i) {
			const float* group = row + h_cent_i * 16;

			// quantize the 16 values of this group, lowest index first
			char quantized[16];
			for (int k = 0; k < 16; ++k) {
				quantized[k] = Q127(group[k], qmin, qmax);
			}

			ft4[sq_i][h_cent_i] = _mm_loadu_si128((const __m128i*) quantized);
		}
	}
}
bool CPathUtils::ContainsEscapedChars(const char * psz, size_t length) { // most of our strings will be tens of bytes long // -> affort some minor overhead to handle the main part very fast const char* end = psz + length; if (sse2supported) { __m128i mask = _mm_set_epi8 ( '%', '%', '%', '%', '%', '%', '%', '%' , '%', '%', '%', '%', '%', '%', '%', '%'); for (; psz + sizeof (mask) <= end; psz += sizeof (mask)) { // fetch the next 16 bytes from the source __m128i chunk = _mm_loadu_si128 ((const __m128i*)psz); // check for non-ASCII int flags = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, mask)); if (flags != 0) return true; }; } // return odd bytes at the end of the string for (; psz < end; ++psz) if (*psz == '%') return true; return false; }
//using https://software.intel.com/sites/landingpage/IntrinsicsGuide/ //and further explanations here - http://stackoverflow.com/questions/12778620/mm-shuffle-epi8-and-an-example void reverse(char* bytes, int numChunks) { //64 bytes as an array of 4 16-byte parts __m128i chunkParts[CHUNK_PARTS_COUNT]; //set the mask so that all the bytes in each 16-byte part in the 64-byte chunk is reversed __m128i shuffleControlMask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); int numBytes = numChunks * CHUNK_SIZE; for(int i = 0; i < numBytes; i += CHUNK_SIZE) { //load current chunk for(int j = 0; j < CHUNK_PARTS_COUNT; ++j) chunkParts[j] = _mm_loadu_si128((__m128i *)&bytes[i + CHUNK_PART_SIZE*j]); //shuffle bytes in each part of the chunk so that they're reversed and //then store/write each chunk part back in the original array ordering them correctly for(int j = 0; j < CHUNK_PARTS_COUNT; ++j) _mm_storeu_si128((__m128i *)&bytes[i + (CHUNK_PARTS_COUNT - 1 - j)*CHUNK_PART_SIZE], _mm_shuffle_epi8(chunkParts[j] ,shuffleControlMask)); } }
/* @note: When this function is changed, make corresponding change to * fm10k_dev_supported_ptypes_get(). */ static inline void fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts) { __m128i l3l4type0, l3l4type1, l3type, l4type; union { uint16_t e[4]; uint64_t dword; } vol; /* L3 pkt type mask Bit4 to Bit6 */ const __m128i l3type_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x0070, 0x0070, 0x0070, 0x0070); /* L4 pkt type mask Bit7 to Bit9 */ const __m128i l4type_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x0380, 0x0380, 0x0380, 0x0380); /* convert RRC l3 type to mbuf format */ const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, RTE_PTYPE_L3_IPV6_EXT, RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV4_EXT, RTE_PTYPE_L3_IPV4, 0); /* Convert RRC l4 type to mbuf format l4type_flags shift-left 8 bits * to fill into8 bits length. */ const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, RTE_PTYPE_TUNNEL_GENEVE >> 8, RTE_PTYPE_TUNNEL_NVGRE >> 8, RTE_PTYPE_TUNNEL_VXLAN >> 8, RTE_PTYPE_TUNNEL_GRE >> 8, RTE_PTYPE_L4_UDP >> 8, RTE_PTYPE_L4_TCP >> 8, 0); l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]); l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]); l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1); l3type = _mm_and_si128(l3l4type0, l3type_msk); l4type = _mm_and_si128(l3l4type0, l4type_msk); l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT); l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT); l3type = _mm_shuffle_epi8(l3type_flags, l3type); /* l4type_flags shift-left for 8 bits, need shift-right back */ l4type = _mm_shuffle_epi8(l4type_flags, l4type); l4type = _mm_slli_epi16(l4type, 8); l3l4type0 = _mm_or_si128(l3type, l4type); vol.dword = _mm_cvtsi128_si64(l3l4type0); rx_pkts[0]->packet_type = vol.e[0]; rx_pkts[1]->packet_type = vol.e[1]; rx_pkts[2]->packet_type = vol.e[2]; rx_pkts[3]->packet_type = vol.e[3]; }
/*
 * Runs test() on two fixed vectors of signed bytes and aborts unless the
 * result equals the per-lane sum of the inputs (each sum stored into a
 * char, as reported by check_union128i_b).
 */
static void
TEST (void)
{
  union128i_b result, lhs, rhs;
  char expected[16];
  int lane;

  lhs.x = _mm_set_epi8 (1, 2, 3, 4, 10, 20, 30, 90,
			-80, -40, -100, -15, 98, 25, 98, 7);
  rhs.x = _mm_set_epi8 (88, 44, 33, 22, 11, 98, 76, -100,
			-34, -78, -39, 6, 3, 4, 5, 119);

  result.x = test (lhs.x, rhs.x);

  /* Reference values computed one byte lane at a time.  */
  for (lane = 0; lane < 16; lane++)
    expected[lane] = lhs.a[lane] + rhs.a[lane];

  if (check_union128i_b (result, expected))
    abort ();
}
/*
 * Runs test() on two fixed vectors of unsigned bytes and aborts unless the
 * result equals the per-lane XOR of the inputs (as reported by
 * check_union128i_ub).
 */
static void
TEST (void)
{
  union128i_ub result, lhs, rhs;
  unsigned char expected[16] = {0};
  int lane;

  lhs.x = _mm_set_epi8 (1, 2, 3, 4, 10, 20, 30, 90,
			80, 40, 100, 15, 98, 25, 98, 7);
  rhs.x = _mm_set_epi8 (88, 44, 33, 22, 11, 98, 76, 100,
			34, 78, 39, 6, 3, 4, 5, 119);

  result.x = test (lhs.x, rhs.x);

  /* Reference values computed one byte lane at a time.  */
  for (lane = 0; lane < 16; lane++)
    expected[lane] = lhs.a[lane] ^ rhs.a[lane];

  if (check_union128i_ub (result, expected))
    abort ();
}
/*
 * Derives the mbuf ol_flags of four packets from their descriptors.
 *
 * 16-bit word 4 of each of the four descriptors is gathered into lanes 0..3
 * of one vector and masked down to bit 2 (vlan) and bits 13:12 (rss). Both
 * fields are then translated into mbuf flags with byte-shuffle lookups and
 * the two results are OR-ed together.
 *
 * descs:   the four descriptors, descs[i] belonging to rx_pkts[i].
 * rx_pkts: four mbufs whose ol_flags field is overwritten.
 */
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	union {
		uint16_t e[4];
		uint64_t dword;
	} flags;
	__m128i words01, words23, status, vlan_bits, rss_bits;
	int i;

	/* keep only bit 2 (vlan tag) and bits 13:12 (rss) of lanes 0..3 */
	const __m128i rss_vlan_mask = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x3004, 0x3004, 0x3004, 0x3004);

	/* vlan lookup: index 4 (bit 2 set) yields PKT_RX_VLAN_PKT */
	const __m128i vlan_table = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, PKT_RX_VLAN_PKT,
			0, 0, 0, 0);

	/* rss lookup, indexed by the 2-bit value taken from bits 13:12 */
	const __m128i rss_table = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			PKT_RX_FDIR, 0, PKT_RX_RSS_HASH, 0);

	/* gather word 4 of descs[0..3] into 16-bit lanes 0..3 and mask it */
	words01 = _mm_unpackhi_epi16(descs[0], descs[1]);
	words23 = _mm_unpackhi_epi16(descs[2], descs[3]);
	status = _mm_and_si128(_mm_unpacklo_epi32(words01, words23),
			rss_vlan_mask);

	/*
	 * The masked word itself serves as the vlan index: its low byte is
	 * 0 or 4, and the high byte has a zero low nibble, so it selects
	 * table entry 0.
	 */
	vlan_bits = _mm_shuffle_epi8(vlan_table, status);

	/* move bits 13:12 down to bits 1:0 to index the rss table */
	rss_bits = _mm_shuffle_epi8(rss_table, _mm_srli_epi16(status, 12));

	/* the low 64 bits now hold one 16-bit flag word per packet */
	flags.dword = _mm_cvtsi128_si64(_mm_or_si128(vlan_bits, rss_bits));

	for (i = 0; i < 4; i++)
		rx_pkts[i]->ol_flags = flags.e[i];
}
/*
 * Packs sixteen byte values into a 128-bit vector in memory order:
 * i0 becomes the lowest byte lane and i15 the highest.
 */
static const __m128i MakeShuffler(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3,
                                  uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7,
                                  uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11,
                                  uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15)
{
    /* Lay the bytes out lowest lane first and load them in one go. */
    const uint8_t lanes[16] = {
        i0, i1, i2,  i3,  i4,  i5,  i6,  i7,
        i8, i9, i10, i11, i12, i13, i14, i15
    };

    return _mm_loadu_si128((const __m128i *)lanes);
}