static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
    __m128i ptype0, ptype1, vtag0, vtag1;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

    ptype1 = _mm_unpacklo_epi32(ptype0, ptype1);
    vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);

    ptype1 = _mm_slli_epi16(ptype1, PTYPE_SHIFT);
    vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);

    ptype1 = _mm_or_si128(ptype1, vtag1);
    vol.dword = _mm_cvtsi128_si64(ptype1) & OLFLAGS_MASK_V;

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}
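A recurring idiom in this collection: four 16-bit per-packet results are packed into the low 64 bits of an XMM register, moved to a general-purpose register with a single _mm_cvtsi128_si64(), and scattered through a union. A minimal self-contained sketch of just that step (the helper name and layout are illustrative, not taken from the driver above):

#include <stdint.h>
#include <emmintrin.h>

/* Hedged sketch: scatter the four low 16-bit lanes of v with one 64-bit move. */
static inline void scatter4_u16(__m128i v, uint16_t out[4])
{
    union {
        uint16_t e[4];
        uint64_t dword;
    } tmp;

    /* one MOVQ instead of four 16-bit extracts */
    tmp.dword = (uint64_t)_mm_cvtsi128_si64(v);
    out[0] = tmp.e[0];
    out[1] = tmp.e[1];
    out[2] = tmp.e[2];
    out[3] = tmp.e[3];
}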
int _normoptimized32(const unsigned char* a, const unsigned char* b)
{
#ifdef USE_PENTIUM4
    return _normoptimized(a,b,32);
#else
    unsigned long int _dis0a, _dis0b, _dis1a, _dis1b;
    long int _cnt0a, _cnt0b, _cnt1a, _cnt1b;
    // first
    __m128i a0 = _mm_loadu_si128((const __m128i*)(a));
    __m128i a1 = _mm_loadu_si128((const __m128i*)(a+16));
    __m128i b0 = _mm_loadu_si128((const __m128i*)(b));
    __m128i b1 = _mm_loadu_si128((const __m128i*)(b+16));
    b0 = _mm_xor_si128(a0, b0);
    b1 = _mm_xor_si128(a1, b1);
    a0 = _mm_srli_si128(b0,8);
    a1 = _mm_srli_si128(b1,8);
    _dis0a = _mm_cvtsi128_si64(b0);
    _dis0b = _mm_cvtsi128_si64(a0);
    _dis1a = _mm_cvtsi128_si64(b1);
    _dis1b = _mm_cvtsi128_si64(a1);
    _cnt0a = _mm_popcnt_u64(_dis0a);
    _cnt0b = _mm_popcnt_u64(_dis0b);
    _cnt1a = _mm_popcnt_u64(_dis1a);
    _cnt1b = _mm_popcnt_u64(_dis1b);
    return _cnt0a + _cnt0b + _cnt1a + _cnt1b;
#endif
}
/* @note: When this function is changed, make corresponding change to
 * fm10k_dev_supported_ptypes_get().
 */
static inline void
fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
    __m128i l3l4type0, l3l4type1, l3type, l4type;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* L3 pkt type mask  Bit4 to Bit6 */
    const __m128i l3type_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x0070, 0x0070, 0x0070, 0x0070);

    /* L4 pkt type mask  Bit7 to Bit9 */
    const __m128i l4type_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x0380, 0x0380, 0x0380, 0x0380);

    /* convert RRC l3 type to mbuf format */
    const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, RTE_PTYPE_L3_IPV6_EXT,
            RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV4_EXT,
            RTE_PTYPE_L3_IPV4, 0);

    /* Convert RRC l4 type to mbuf format. The l4type values are wider
     * than 8 bits, so the table entries are pre-shifted right by 8 bits
     * to fit, and the looked-up values are shifted back left after the
     * shuffle.
     */
    const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
            RTE_PTYPE_TUNNEL_GENEVE >> 8,
            RTE_PTYPE_TUNNEL_NVGRE >> 8,
            RTE_PTYPE_TUNNEL_VXLAN >> 8,
            RTE_PTYPE_TUNNEL_GRE >> 8,
            RTE_PTYPE_L4_UDP >> 8,
            RTE_PTYPE_L4_TCP >> 8,
            0);

    l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1);

    l3type = _mm_and_si128(l3l4type0, l3type_msk);
    l4type = _mm_and_si128(l3l4type0, l4type_msk);

    l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT);
    l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT);

    l3type = _mm_shuffle_epi8(l3type_flags, l3type);
    /* l4type_flags entries were pre-shifted right by 8 bits; shift the
     * looked-up values back left */
    l4type = _mm_shuffle_epi8(l4type_flags, l4type);
    l4type = _mm_slli_epi16(l4type, 8);
    l3l4type0 = _mm_or_si128(l3type, l4type);

    vol.dword = _mm_cvtsi128_si64(l3l4type0);

    rx_pkts[0]->packet_type = vol.e[0];
    rx_pkts[1]->packet_type = vol.e[1];
    rx_pkts[2]->packet_type = vol.e[2];
    rx_pkts[3]->packet_type = vol.e[3];
}
long long test_mm_cvtsi128_si64(__m128i A) {
  // DAG-LABEL: test_mm_cvtsi128_si64
  // DAG: extractelement <2 x i64> %{{.*}}, i32 0
  //
  // ASM-LABEL: test_mm_cvtsi128_si64
  // ASM: movd
  return _mm_cvtsi128_si64(A);
}
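_mm_cvtsi128_si64() only exists on 64-bit x86 targets; the aom/av1 and FLAC snippets below therefore guard it with ARCH_X86_64 / FLAC__CPU_IA32 and fall back to a 64-bit store. A hedged sketch of such a guard (the helper name is illustrative):

#include <stdint.h>
#include <emmintrin.h>

static inline uint64_t low64(__m128i v)
{
#if defined(__x86_64__) || defined(_M_X64)
    return (uint64_t)_mm_cvtsi128_si64(v);  /* single MOVQ on x86-64 */
#else
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v);   /* 32-bit targets: store low qword */
    return tmp;
#endif
}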
int _normoptimized(const unsigned char* a, const unsigned char* b, const int n)
{
#ifdef USE_PENTIUM4
    int _ddt = 0;
    unsigned int _dis, _dis2, _dis3, _dis4;
    long int _cnt, _cnt2, _cnt3, _cnt4;
    for (int _i = 0; _i < n; _i+=16){
        // xor 128 bits
        __m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i));
        __m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i));
        b0 = _mm_xor_si128(a0, b0);
        __m128i d = _mm_srli_si128(b0, 4);
        __m128i e = _mm_srli_si128(d,4);
        __m128i f = _mm_srli_si128(e,4);
        _dis = _mm_cvtsi128_si32(b0);
        _dis2 = _mm_cvtsi128_si32(d);
        _dis3 = _mm_cvtsi128_si32(e);
        _dis4 = _mm_cvtsi128_si32(f);
        // now count
        _cnt = _mm_popcnt_u32(_dis);
        _cnt2 = _mm_popcnt_u32(_dis2);
        _cnt3 = _mm_popcnt_u32(_dis3);
        _cnt4 = _mm_popcnt_u32(_dis4);
        _ddt += _cnt + _cnt2 + _cnt3 + _cnt4;
    }
    return _ddt;
#else
    int _ddt = 0;
    unsigned long int _dis, _dis2;
    long int _cnt, _cnt2;
    for (int _i = 0; _i < n; _i+=16){
        // xor 128 bits
        __m128i a0 = _mm_loadu_si128((const __m128i*)(a + _i));
        __m128i b0 = _mm_loadu_si128((const __m128i*)(b + _i));
        b0 = _mm_xor_si128(a0, b0);
        a0 = _mm_srli_si128(b0,8);
        _dis = _mm_cvtsi128_si64(b0);
        _dis2 = _mm_cvtsi128_si64(a0);
        _cnt = _mm_popcnt_u64(_dis);
        _cnt2 = _mm_popcnt_u64(_dis2);
        _ddt += _cnt + _cnt2;
        // other commands don't give any advantage
    }
    return _ddt;
#endif
}
/**
 * Processes two doubles at a time
 */
int _mandelbrot_2( double const * const c_re_arg,
        double const * const c_im_arg, int max_iter )
{
    __m128d z_re = _mm_load_pd(c_re_arg);
    __m128d z_im = _mm_load_pd(c_im_arg);
    __m128d y_re;
    __m128d y_im;
    __m128d c_re = z_re;
    __m128d c_im = z_im;
    __m128i count = _mm_set1_epi64x(0);
    __m128d md;
    __m128d mt;
    __m128i mi = _mm_set1_epi16(0xffff);
    __m128d two = _mm_set1_pd(2.0);
    __m128i one = _mm_set1_epi64x(1);

    for (int i = 0; i < max_iter; i += 1) {
        // y_re = re(z)^2, y_im = im(z)^2
        y_re = _mm_mul_pd(z_re, z_re);
        y_im = _mm_mul_pd(z_im, z_im);

        // y = z * z
        y_re = _mm_sub_pd(y_re, y_im);
        y_im = _mm_mul_pd(z_re, z_im);
        y_im = _mm_add_pd(y_im, y_im);

        // z = z * z + c
        z_re = _mm_add_pd(y_re, c_re);
        z_im = _mm_add_pd(y_im, c_im);

        // escape test: per-lane mask of the points still iterating
        // md = _mm_add_pd(z_re, z_im);
        // md = _mm_cmplt_pd(md, four);
        md = _mm_cmplt_pd(z_re, two);
        mt = _mm_cmplt_pd(z_im, two);
        md = _mm_and_pd(md, mt);
        mi = _mm_and_si128(mi, _mm_castpd_si128(md));
        // PRINT_M128I(mi);
        if ( !_mm_movemask_pd(md) ) {
            break;
        }

        // count iterations for the lanes that are still alive
        count = _mm_add_epi64( count, _mm_and_si128( mi, one) );
    }

    int val;
    // sum the two per-lane counters and move the scalar out
    count = _mm_add_epi64( _mm_srli_si128(count, 8), count );
    val = (int)_mm_cvtsi128_si64( count );
    return val;
}
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i v_acc0_q = _mm_setzero_si128();
  __m128i v_acc1_q = _mm_setzero_si128();

  const int16_t *const end = src + n;

  assert(n % 64 == 0);

  while (src < end) {
    const __m128i v_val_0_w = xx_load_128(src);
    const __m128i v_val_1_w = xx_load_128(src + 8);
    const __m128i v_val_2_w = xx_load_128(src + 16);
    const __m128i v_val_3_w = xx_load_128(src + 24);
    const __m128i v_val_4_w = xx_load_128(src + 32);
    const __m128i v_val_5_w = xx_load_128(src + 40);
    const __m128i v_val_6_w = xx_load_128(src + 48);
    const __m128i v_val_7_w = xx_load_128(src + 56);

    const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
    const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
    const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
    const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
    const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
    const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
    const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
    const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

    const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
    const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
    const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
    const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

    const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
    const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

    const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);

    v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
    v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));

    src += 64;
  }

  v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  {
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v_acc0_q);
    return tmp;
  }
#endif
}
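The reduction above ends with a pattern several snippets here share: shift the high quadword of the accumulator down over the low one, add, and read the scalar with _mm_cvtsi128_si64(). Isolated as a minimal hedged sketch (helper name illustrative; assumes an x86-64 target):

#include <stdint.h>
#include <emmintrin.h>

static inline uint64_t hsum_epi64(__m128i v)
{
  /* v = [hi, lo]; after the shift-and-add the low lane holds hi + lo */
  v = _mm_add_epi64(v, _mm_srli_si128(v, 8));
  return (uint64_t)_mm_cvtsi128_si64(v);
}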
int _normoptimized16(const unsigned char* a, const unsigned char* b)
{
#ifdef USE_PENTIUM4
    return _normoptimized(a,b,16);
#else
    unsigned long int _dis, _dis2;
    long int _cnt, _cnt2;
    __m128i a0 = _mm_loadu_si128((const __m128i*)(a));
    __m128i b0 = _mm_loadu_si128((const __m128i*)(b));
    __m128i c = _mm_xor_si128(a0, b0);
    __m128i d = _mm_srli_si128(c,8);
    _dis = _mm_cvtsi128_si64(c);
    _dis2 = _mm_cvtsi128_si64(d);
    _cnt = _mm_popcnt_u64(_dis);
    _cnt2 = _mm_popcnt_u64(_dis2);
    int _ddt = _cnt + _cnt2;
    // other commands don't give any advantage
    return _ddt;
#endif
}
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
    struct rte_mbuf **rx_pkts)
{
    __m128i ptype0, ptype1, vtag0, vtag1;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* mask everything except rss type */
    const __m128i rsstype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x000F, 0x000F, 0x000F, 0x000F);

    /* map rss type to rss hash flag */
    const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
            0, 0, 0, PKT_RX_RSS_HASH,
            PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
            PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

    /* mask everything except vlan present bit */
    const __m128i vlan_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
            IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);

    /* map vlan present (0x8) to ol_flags */
    const __m128i vlan_map = _mm_set_epi8(
            0, 0, 0, 0,
            0, 0, 0, vlan_flags,
            0, 0, 0, 0,
            0, 0, 0, 0);

    ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

    ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
    ptype0 = _mm_and_si128(ptype0, rsstype_msk);
    ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

    vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
    vtag1 = _mm_and_si128(vtag1, vlan_msk);
    vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);

    vtag1 = _mm_or_si128(ptype0, vtag1);
    vol.dword = _mm_cvtsi128_si64(vtag1);

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
    __m128i ptype0, ptype1, vtag0, vtag1;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* pkt type + vlan olflags mask */
    const __m128i pkttype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
            PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);

    /* mask everything except rss type */
    const __m128i rsstype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x000F, 0x000F, 0x000F, 0x000F);

    /* map rss type to rss hash flag */
    const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
            0, 0, 0, PKT_RX_RSS_HASH,
            PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
            PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

    ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

    ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
    ptype0 = _mm_and_si128(ptype0, rsstype_msk);
    ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

    vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
    vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);
    vtag1 = _mm_and_si128(vtag1, pkttype_msk);

    vtag1 = _mm_or_si128(ptype0, vtag1);
    vol.dword = _mm_cvtsi128_si64(vtag1);

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
    __m128i vlan0, vlan1, rss;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* mask everything except rss and vlan flags
     * bit 2 is for vlan tag, bits 13:12 for rss
     */
    const __m128i rss_vlan_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x3004, 0x3004, 0x3004, 0x3004);

    /* map rss and vlan type to rss hash and vlan flag */
    const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, PKT_RX_VLAN_PKT,
            0, 0, 0, 0);

    const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            PKT_RX_FDIR, 0, PKT_RX_RSS_HASH, 0);

    vlan0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vlan1 = _mm_unpackhi_epi16(descs[2], descs[3]);
    vlan0 = _mm_unpacklo_epi32(vlan0, vlan1);

    vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
    vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);

    rss = _mm_srli_epi16(vlan1, 12);
    rss = _mm_shuffle_epi8(rss_flags, rss);

    vlan0 = _mm_or_si128(vlan0, rss);
    vol.dword = _mm_cvtsi128_si64(vlan0);

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}
int _normoptimized64(const unsigned char* a, const unsigned char* b)
{
#ifdef USE_PENTIUM4
    return _normoptimized(a,b,64);
#else
    unsigned long int _dis0a, _dis0b, _dis1a, _dis1b,
        _dis2a, _dis2b, _dis3a, _dis3b;
    long int _cnt0a, _cnt0b, _cnt1a, _cnt1b,
        _cnt2a, _cnt2b, _cnt3a, _cnt3b;
    // first
    __m128i a0 = _mm_loadu_si128((const __m128i*)(a));
    __m128i a1 = _mm_loadu_si128((const __m128i*)(a+16));
    __m128i a2 = _mm_loadu_si128((const __m128i*)(a+32));
    __m128i a3 = _mm_loadu_si128((const __m128i*)(a+48));
    __m128i b0 = _mm_loadu_si128((const __m128i*)(b));
    __m128i b1 = _mm_loadu_si128((const __m128i*)(b+16));
    __m128i b2 = _mm_loadu_si128((const __m128i*)(b+32));
    __m128i b3 = _mm_loadu_si128((const __m128i*)(b+48));
    b0 = _mm_xor_si128(a0, b0);
    b1 = _mm_xor_si128(a1, b1);
    b2 = _mm_xor_si128(a2, b2);
    b3 = _mm_xor_si128(a3, b3);
    a0 = _mm_srli_si128(b0,8);
    a1 = _mm_srli_si128(b1,8);
    a2 = _mm_srli_si128(b2,8);
    a3 = _mm_srli_si128(b3,8);
    _dis0a = _mm_cvtsi128_si64(b0);
    _dis0b = _mm_cvtsi128_si64(a0);
    _dis1a = _mm_cvtsi128_si64(b1);
    _dis1b = _mm_cvtsi128_si64(a1);
    _dis2a = _mm_cvtsi128_si64(b2);
    _dis2b = _mm_cvtsi128_si64(a2);
    _dis3a = _mm_cvtsi128_si64(b3);
    _dis3b = _mm_cvtsi128_si64(a3);
    _cnt0a = _mm_popcnt_u64(_dis0a);
    _cnt0b = _mm_popcnt_u64(_dis0b);
    _cnt1a = _mm_popcnt_u64(_dis1a);
    _cnt1b = _mm_popcnt_u64(_dis1b);
    _cnt2a = _mm_popcnt_u64(_dis2a);
    _cnt2b = _mm_popcnt_u64(_dis2b);
    _cnt3a = _mm_popcnt_u64(_dis3a);
    _cnt3b = _mm_popcnt_u64(_dis3b);
    return _cnt0a + _cnt0b + _cnt1a + _cnt1b
        + _cnt2a + _cnt2b + _cnt3a + _cnt3b;
#endif
}
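_normoptimized(), _normoptimized16/32/64() above are all unrollings of the same SSE2 Hamming-distance kernel: XOR the byte strings, split each 128-bit result into two quadwords via _mm_srli_si128() and _mm_cvtsi128_si64(), and POPCNT each half. A compact hedged sketch of that kernel (assumes an x86-64 target with POPCNT, and n being a multiple of 16; the function name is illustrative):

#include <stdint.h>
#include <emmintrin.h>
#include <nmmintrin.h>  /* _mm_popcnt_u64 */

/* Hamming distance over n bytes, n a multiple of 16 (hedged sketch). */
static int hamming_sse(const unsigned char *a, const unsigned char *b, int n)
{
    int d = 0;
    for (int i = 0; i < n; i += 16) {
        __m128i x = _mm_xor_si128(
            _mm_loadu_si128((const __m128i *)(a + i)),
            _mm_loadu_si128((const __m128i *)(b + i)));
        /* popcount the low and high quadwords separately */
        d += (int)_mm_popcnt_u64((uint64_t)_mm_cvtsi128_si64(x));
        d += (int)_mm_popcnt_u64(
            (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(x, 8)));
    }
    return d;
}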
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
    struct rte_mbuf **rx_pkts)
{
    __m128i ptype0, ptype1, vtag0, vtag1, csum;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* mask everything except rss type */
    const __m128i rsstype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x000F, 0x000F, 0x000F, 0x000F);

    /* mask the lower byte of ol_flags */
    const __m128i ol_flags_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x00FF, 0x00FF, 0x00FF, 0x00FF);

    /* map rss type to rss hash flag */
    const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
            0, 0, 0, PKT_RX_RSS_HASH,
            PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
            PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

    /* mask everything except vlan present and l4/ip csum error */
    const __m128i vlan_csum_msk = _mm_set_epi16(
            (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16,
            (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16,
            (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16,
            (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16,
            IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
            IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);

    /* map vlan present (0x8), IPE (0x2), L4E (0x1) to ol_flags */
    const __m128i vlan_csum_map_lo = _mm_set_epi8(
            0, 0, 0, 0,
            vlan_flags | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
            vlan_flags | PKT_RX_IP_CKSUM_BAD,
            vlan_flags | PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD,
            vlan_flags | PKT_RX_IP_CKSUM_GOOD,
            0, 0, 0, 0,
            PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
            PKT_RX_IP_CKSUM_BAD,
            PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD,
            PKT_RX_IP_CKSUM_GOOD);

    const __m128i vlan_csum_map_hi = _mm_set_epi8(
            0, 0, 0, 0,
            0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,
            PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t),
            0, 0, 0, 0,
            0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0,
            PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t));

    ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

    ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
    ptype0 = _mm_and_si128(ptype0, rsstype_msk);
    ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

    vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
    vtag1 = _mm_and_si128(vtag1, vlan_csum_msk);

    /* csum bits are in the most significant position; to use shuffle we
     * need to shift them, changing the mask from 0xc000 to 0x0003.
     */
    csum = _mm_srli_epi16(vtag1, 14);

    /* now or the most significant 64 bits containing the checksum
     * flags with the vlan present flags.
     */
    csum = _mm_srli_si128(csum, 8);
    vtag1 = _mm_or_si128(csum, vtag1);

    /* convert VP, IPE, L4E to ol_flags */
    vtag0 = _mm_shuffle_epi8(vlan_csum_map_hi, vtag1);
    vtag0 = _mm_slli_epi16(vtag0, sizeof(uint8_t));

    vtag1 = _mm_shuffle_epi8(vlan_csum_map_lo, vtag1);
    vtag1 = _mm_and_si128(vtag1, ol_flags_msk);
    vtag1 = _mm_or_si128(vtag0, vtag1);

    vtag1 = _mm_or_si128(ptype0, vtag1);
    vol.dword = _mm_cvtsi128_si64(vtag1);

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}
/**
 * See av1_wedge_sign_from_residuals_c
 */
int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
                                       int N, int64_t limit) {
  int64_t acc;

  __m128i v_sign_d;
  __m128i v_acc0_d = _mm_setzero_si128();
  __m128i v_acc1_d = _mm_setzero_si128();
  __m128i v_acc_q;

  // Input size limited to 8192 by the use of 32 bit accumulators and m
  // being between [0, 64]. Overflow might happen at larger sizes,
  // though it is practically impossible on real video input.
  assert(N < 8192);
  assert(N % 64 == 0);

  do {
    const __m128i v_m01_b = xx_load_128(m);
    const __m128i v_m23_b = xx_load_128(m + 16);
    const __m128i v_m45_b = xx_load_128(m + 32);
    const __m128i v_m67_b = xx_load_128(m + 48);

    const __m128i v_d0_w = xx_load_128(ds);
    const __m128i v_d1_w = xx_load_128(ds + 8);
    const __m128i v_d2_w = xx_load_128(ds + 16);
    const __m128i v_d3_w = xx_load_128(ds + 24);
    const __m128i v_d4_w = xx_load_128(ds + 32);
    const __m128i v_d5_w = xx_load_128(ds + 40);
    const __m128i v_d6_w = xx_load_128(ds + 48);
    const __m128i v_d7_w = xx_load_128(ds + 56);

    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
    const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());

    const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
    const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
    const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
    const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
    const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
    const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
    const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
    const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);

    const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
    const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
    const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
    const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);

    const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
    const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);

    v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
    v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);

    ds += 64;
    m += 64;

    N -= 64;
  } while (N);

  v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
  v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc0_d, v_sign_d));

  v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
  v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc1_d, v_sign_d));

  v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);

  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if ARCH_X86_64
  acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
  xx_storel_64(&acc, v_acc_q);
#endif

  return acc > limit;
}
/**
 * See av1_wedge_sse_from_residuals_c
 */
uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
                                           const uint8_t *m, int N) {
  int n = -N;
  int n8 = n + 8;

  uint64_t csse;

  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
  const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);

  __m128i v_acc0_q = _mm_setzero_si128();

  assert(N % 64 == 0);

  r1 += N;
  d += N;
  m += N;

  do {
    const __m128i v_r0_w = xx_load_128(r1 + n);
    const __m128i v_r1_w = xx_load_128(r1 + n8);
    const __m128i v_d0_w = xx_load_128(d + n);
    const __m128i v_d1_w = xx_load_128(d + n8);
    const __m128i v_m01_b = xx_load_128(m + n);

    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());

    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);

    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);

    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);

    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);

    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
                                           _mm_srli_epi64(v_sq0_d, 32));
    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
                                           _mm_srli_epi64(v_sq1_d, 32));

    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);

    n8 += 16;
    n += 16;
  } while (n);

  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if ARCH_X86_64
  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  xx_storel_64(&csse, v_acc0_q);
#endif

  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
void precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[],
        FLAC__uint64 abs_residual_partition_sums[],
        unsigned residual_samples, unsigned predictor_order,
        unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
{
    const unsigned default_partition_samples =
        (residual_samples + predictor_order) >> max_partition_order;
    unsigned partitions = 1u << max_partition_order;

    FLAC__ASSERT(default_partition_samples > predictor_order);

    /* first do max_partition_order */
    {
        unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);
        unsigned e1, e3;
        __m128i mm_res, mm_sum;

        if(bps <= 16) {
            FLAC__uint32 abs_residual_partition_sum;

            for(partition = residual_sample = 0; partition < partitions; partition++) {
                end += default_partition_samples;
                abs_residual_partition_sum = 0;
                mm_sum = _mm_setzero_si128();

                e1 = (residual_sample + 3) & ~3;
                e3 = end & ~3;
                if(e1 > end)
                    e1 = end; /* try flac -l 1 -b 16 and you'll be here */

                /* assumption: residual[] is properly aligned so (residual + e1)
                 * is properly aligned too and _mm_loadu_si128() is fast */
                for( ; residual_sample < e1; residual_sample++)
                    abs_residual_partition_sum += abs(residual[residual_sample]);
                    /* abs(INT_MIN) is undefined, but if the residual is
                     * INT_MIN we have bigger problems */

                for( ; residual_sample < e3; residual_sample+=4) {
                    mm_res = _mm_loadu_si128((const __m128i*)(residual+residual_sample));
                    mm_res = _mm_abs_epi32(mm_res);
                    mm_sum = _mm_add_epi32(mm_sum, mm_res);
                }
                mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
                mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
                abs_residual_partition_sum += _mm_cvtsi128_si32(mm_sum);

                for( ; residual_sample < end; residual_sample++)
                    abs_residual_partition_sum += abs(residual[residual_sample]);

                abs_residual_partition_sums[partition] = abs_residual_partition_sum;
            }
        }
        else { /* have to pessimistically use 64 bits for accumulator */
            FLAC__uint64 abs_residual_partition_sum;

            for(partition = residual_sample = 0; partition < partitions; partition++) {
                end += default_partition_samples;
                abs_residual_partition_sum = 0;
                mm_sum = _mm_setzero_si128();

                e1 = (residual_sample + 1) & ~1;
                e3 = end & ~1;
                FLAC__ASSERT(e1 <= end);

                for( ; residual_sample < e1; residual_sample++)
                    abs_residual_partition_sum += abs(residual[residual_sample]);

                for( ; residual_sample < e3; residual_sample+=2) {
                    mm_res = _mm_loadl_epi64((const __m128i*)(residual+residual_sample)); /* 0  0  r1  r0 */
                    mm_res = _mm_abs_epi32(mm_res);                                       /* 0  0 |r1| |r0| */
                    mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0));             /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */
                    mm_sum = _mm_add_epi64(mm_sum, mm_res);
                }
                mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
#ifdef FLAC__CPU_IA32
#ifdef _MSC_VER
                abs_residual_partition_sum += mm_sum.m128i_u64[0];
#else
                {
                    FLAC__uint64 tmp[2];
                    _mm_storel_epi64((__m128i *)tmp, mm_sum);
                    abs_residual_partition_sum += tmp[0];
                }
#endif
#else
                abs_residual_partition_sum += _mm_cvtsi128_si64(mm_sum);
#endif

                for( ; residual_sample < end; residual_sample++)
                    abs_residual_partition_sum += abs(residual[residual_sample]);

                abs_residual_partition_sums[partition] = abs_residual_partition_sum;
            }
        }
    }

    /* now merge partitions for lower orders */
    {
        unsigned from_partition = 0, to_partition = partitions;
        int partition_order;

        for(partition_order = (int)max_partition_order - 1;
                partition_order >= (int)min_partition_order; partition_order--) {
            unsigned i;
            partitions >>= 1;
            for(i = 0; i < partitions; i++) {
                abs_residual_partition_sums[to_partition++] =
                    abs_residual_partition_sums[from_partition  ] +
                    abs_residual_partition_sums[from_partition+1];
                from_partition += 2;
            }
        }
    }
}
/*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
        uint16_t nb_pkts, uint8_t *split_packet)
{
    volatile union i40e_rx_desc *rxdp;
    struct i40e_rx_entry *sw_ring;
    uint16_t nb_pkts_recd;
    int pos;
    uint64_t var;
    __m128i shuf_msk;
    uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

    __m128i crc_adjust = _mm_set_epi16(
                0, 0, 0,       /* ignore non-length fields */
                -rxq->crc_len, /* sub crc on data_len */
                0,             /* ignore high-16bits of pkt_len */
                -rxq->crc_len, /* sub crc on pkt_len */
                0, 0           /* ignore pkt_type field */
            );
    /*
     * compile-time check the above crc_adjust layout is correct.
     * NOTE: the first field (lowest address) is given last in set_epi16
     * call above.
     */
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
    __m128i dd_check, eop_check;

    /* nb_pkts shall be no more than RTE_I40E_MAX_RX_BURST */
    nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

    /* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
    nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

    /* Just the act of getting into the function from the application is
     * going to cost about 7 cycles
     */
    rxdp = rxq->rx_ring + rxq->rx_tail;

    rte_prefetch0(rxdp);

    /* See if we need to rearm the RX queue - gives the prefetch a bit
     * of time to act
     */
    if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
        i40e_rxq_rearm(rxq);

    /* Before we start moving massive data around, check to see if
     * there is actually a packet available
     */
    if (!(rxdp->wb.qword1.status_error_len &
            rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
        return 0;

    /* 4 packets DD mask */
    dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

    /* 4 packets EOP mask */
    eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

    /* mask to shuffle from desc. to mbuf */
    shuf_msk = _mm_set_epi8(
        7, 6, 5, 4,  /* octet 4~7, 32bits rss */
        3, 2,        /* octet 2~3, low 16 bits vlan_macip */
        15, 14,      /* octet 15~14, 16 bits data_len */
        0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
        15, 14,      /* octet 15~14, low 16 bits pkt_len */
        0xFF, 0xFF,  /* pkt_type set as unknown */
        0xFF, 0xFF   /* pkt_type set as unknown */
        );
    /*
     * Compile-time verify the shuffle mask
     * NOTE: some field positions already verified above, but duplicated
     * here for completeness in case of future modifications.
     */
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

    /* Cache is empty -> need to scan the buffer rings, but first move
     * the next 'n' mbufs into the cache
     */
    sw_ring = &rxq->sw_ring[rxq->rx_tail];

    /* A. load 4 packet in one loop
     * [A*. mask out 4 unused dirty field in desc]
     * B. copy 4 mbuf point from swring to rx_pkts
     * C. calc the number of DD bits among the 4 packets
     * [C*. extract the end-of-packet bit, if requested]
     * D. fill info. from desc to mbuf
     */
    for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
            pos += RTE_I40E_DESCS_PER_LOOP,
            rxdp += RTE_I40E_DESCS_PER_LOOP) {
        __m128i descs[RTE_I40E_DESCS_PER_LOOP];
        __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
        __m128i zero, staterr, sterr_tmp1, sterr_tmp2;
        /* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
        __m128i mbp1;
#if defined(RTE_ARCH_X86_64)
        __m128i mbp2;
#endif

        /* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */
        mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
        /* Read desc statuses backwards to avoid race condition */
        /* A.1 load 4 pkts desc */
        descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
        rte_compiler_barrier();

        /* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */
        _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
        /* B.1 load 2 64 bit mbuf points */
        mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);
#endif

        descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
        rte_compiler_barrier();
        /* B.1 load 2 mbuf point */
        descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
        rte_compiler_barrier();
        descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

#if defined(RTE_ARCH_X86_64)
        /* B.2 copy 2 mbuf point into rx_pkts */
        _mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
#endif

        if (split_packet) {
            rte_mbuf_prefetch_part2(rx_pkts[pos]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
        }

        /* avoid compiler reorder optimization */
        rte_compiler_barrier();

        /* pkt 3,4 shift the pktlen field to be 16-bit aligned */
        const __m128i len3 = _mm_slli_epi32(descs[3], PKTLEN_SHIFT);
        const __m128i len2 = _mm_slli_epi32(descs[2], PKTLEN_SHIFT);

        /* merge the now-aligned packet length fields back in */
        descs[3] = _mm_blend_epi16(descs[3], len3, 0x80);
        descs[2] = _mm_blend_epi16(descs[2], len2, 0x80);

        /* D.1 pkt 3,4 convert format from desc to pktmbuf */
        pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
        pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

        /* C.1 4=>2 filter staterr info only */
        sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
        /* C.1 4=>2 filter staterr info only */
        sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

        desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

        /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
        pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
        pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

        /* pkt 1,2 shift the pktlen field to be 16-bit aligned */
        const __m128i len1 = _mm_slli_epi32(descs[1], PKTLEN_SHIFT);
        const __m128i len0 = _mm_slli_epi32(descs[0], PKTLEN_SHIFT);

        /* merge the now-aligned packet length fields back in */
        descs[1] = _mm_blend_epi16(descs[1], len1, 0x80);
        descs[0] = _mm_blend_epi16(descs[0], len0, 0x80);

        /* D.1 pkt 1,2 convert format from desc to pktmbuf */
        pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
        pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

        /* C.2 get 4 pkts staterr value */
        zero = _mm_xor_si128(dd_check, dd_check);
        staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

        /* D.3 copy final 3,4 data to rx_pkts */
        _mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
                pkt_mb4);
        _mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
                pkt_mb3);

        /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
        pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
        pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

        /* C* extract and record EOP bit */
        if (split_packet) {
            __m128i eop_shuf_mask = _mm_set_epi8(
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0x04, 0x0C, 0x00, 0x08
                    );

            /* and with mask to extract bits, flipping 1-0 */
            __m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
            /* the staterr values are not in order, as the count
             * of dd bits doesn't care. However, for end of
             * packet tracking, we do care, so shuffle. This also
             * compresses the 32-bit values to 8-bit
             */
            eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
            /* store the resulting 32-bit value */
            *(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
            split_packet += RTE_I40E_DESCS_PER_LOOP;
        }

        /* C.3 calc available number of desc */
        staterr = _mm_and_si128(staterr, dd_check);
        staterr = _mm_packs_epi32(staterr, zero);

        /* D.3 copy final 1,2 data to rx_pkts */
        _mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
                pkt_mb2);
        _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
                pkt_mb1);
        desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);

        /* C.4 calc available number of desc */
        var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
        nb_pkts_recd += var;
        if (likely(var != RTE_I40E_DESCS_PER_LOOP))
            break;
    }

    /* Update our internal tail pointer */
    rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
    rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
    rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

    return nb_pkts_recd;
}
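The burst loop above derives its exit condition by packing the four per-descriptor DD status words down to 16-bit lanes, moving the low 64 bits to a scalar register, and popcounting them. Isolated as a hedged sketch (GCC/Clang builtin; the helper name is illustrative):

#include <stdint.h>
#include <emmintrin.h>

/* Count completed descriptors from 4 packed 16-bit status lanes
 * (each lane is 0x0001 if its DD bit was set). Hedged sketch. */
static inline unsigned count_dd(__m128i staterr_packed)
{
    uint64_t lanes = (uint64_t)_mm_cvtsi128_si64(staterr_packed);
    return (unsigned)__builtin_popcountll(lanes);
}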
static uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
                                                int width, int height) {
  int r, c;

  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i v_acc_q = _mm_setzero_si128();

  for (r = 0; r < height; r += 8) {
    __m128i v_acc_d = _mm_setzero_si128();

    for (c = 0; c < width; c += 8) {
      const int16_t *b = src + c;

      const __m128i v_val_0_w =
          _mm_load_si128((const __m128i *)(b + 0 * stride));
      const __m128i v_val_1_w =
          _mm_load_si128((const __m128i *)(b + 1 * stride));
      const __m128i v_val_2_w =
          _mm_load_si128((const __m128i *)(b + 2 * stride));
      const __m128i v_val_3_w =
          _mm_load_si128((const __m128i *)(b + 3 * stride));
      const __m128i v_val_4_w =
          _mm_load_si128((const __m128i *)(b + 4 * stride));
      const __m128i v_val_5_w =
          _mm_load_si128((const __m128i *)(b + 5 * stride));
      const __m128i v_val_6_w =
          _mm_load_si128((const __m128i *)(b + 6 * stride));
      const __m128i v_val_7_w =
          _mm_load_si128((const __m128i *)(b + 7 * stride));

      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
    }

    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));

    src += 8 * stride;
  }

  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
  {
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
    return tmp;
  }
#endif
}
void TestRootBoard::generateCaptures() {
    QTextStream xout(stderr);
    cpu_set_t mask;
    CPU_ZERO( &mask );
    CPU_SET( 1, &mask );
    if ( sched_setaffinity( 0, sizeof(mask), &mask ) == -1 )
        qDebug() << "Could not set CPU Affinity" << endl;
    static const unsigned testCases = 200;
    static const int iter = 10000;
    typedef QVector<uint64_t> Sample;
    QVector<Sample> times(testCases, Sample(iter));
    QVector<Sample> movetimes(testCases, Sample(iter));
    QVector<Sample> captimes(testCases, Sample(iter));
    QVector<Sample> b02flood(testCases, Sample(iter));
    QVector<Sample> b02point(testCases, Sample(iter));
    QVector<Sample> b02double(testCases, Sample(iter));
    Move moveList[256];
    uint64_t sum=0;
    uint64_t movesum=0;
    uint64_t nmoves=0;
    uint64_t ncap =0;
    uint64_t a, d, tsc;
    Key blah;
    Colors color[testCases];
    double cpufreq = 3900.0;

    for (unsigned int i = testCases; i;) {
        --i;
        b->setup(testPositions[i]);
        color[i] = b->color;
        if (i) {
            b->boards[i] = b->boards[0];
        }
        movetimes[i].reserve(iter*2);
        times[i].reserve(iter*2);
        captimes[i].reserve(iter*2);
    }

    unsigned op = 1;
    const unsigned int iter2 = 10000000;
    __v2di res = _mm_set1_epi64x(0);
    uint64_t time=0;
#ifdef NDEBUG
    for (unsigned int i = 0; i < iter2; ++i) {
        Board& bb = b->boards[i & 0xf].wb;
        tsc = readtsc();
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build02Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        time += readtsc() - tsc;
        // op = fold(res) & 0x3f;
    }
    std::cout << "build02(pos): " << time/iter2 << " clocks" << std::endl;

    time=0;
    for (unsigned int i = 0; i < iter2; ++i) {
        Board& bb = b->boards[i & 0xf].wb;
        tsc = readtsc();
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        res = bb.build13Attack(op); op = _mm_cvtsi128_si64(res) & 0x3f;
        time += readtsc() - tsc;
    }
    std::cout << "build13(pos): " << time/iter2 << " clocks" << std::endl;

//    time=0;
//    for (unsigned int i = 0; i < iter2; ++i) {
//        BoardBase& bb = b->boards[i & 0xf].wb;
//        tsc = readtsc();
//        res = bb.build02Attack(res);
//        time += readtsc() - tsc;
//    }
//    std::cout << "build02(vector): " << time/iter2 << " clocks" << std::endl;

    time=0;
    for (unsigned int i = 0; i < iter2; ++i) {
        Board& bb = b->boards[i & 0xf].wb;
        tsc = readtsc();
        res = b->boards[0].wb.build13Attack(res);
        res = b->boards[1].wb.build13Attack(res);
        res = b->boards[2].wb.build13Attack(res);
        res = b->boards[3].wb.build13Attack(res);
        res = b->boards[4].wb.build13Attack(res);
        res = b->boards[5].wb.build13Attack(res);
        res = b->boards[6].wb.build13Attack(res);
        res = b->boards[7].wb.build13Attack(res);
        time += readtsc() - tsc;
    }
    std::cout << "build13(vector): " << time/iter2 << " clocks" << std::endl;

    for (int j = 0; j < iter; ++j) {
        nmoves = 0;
        ncap=0;
        for (unsigned int i = 0; i < testCases; ++i) {
//            b->setup(testPositions[i]);
            uint64_t overhead;
/*            asm volatile("cpuid\n rdtsc" : "=a" (a), "=d" (d) :: "%rbx", "%rcx");
            tsc = (a + (d << 32));
            asm volatile("cpuid\n rdtsc" : "=a" (a), "=d" (d) :: "%rbx", "%rcx");
            overhead = (a + (d << 32)) - tsc; */
            overhead = 20;
            if (color[i] == White)
                b->boards[i].wb.buildAttacks();
            else
                b->boards[i].bb.buildAttacks();
            tsc = readtsc();
            Move* good = moveList+192;
            Move* bad = good;
            if (color[i] == White)
                b->boards[i].wb.generateCaptureMoves<AllMoves>(good, bad);
            else
                b->boards[i].bb.generateCaptureMoves<AllMoves>(good, bad);
            ncap += bad - good;
            captimes[i][j] = readtsc() - tsc - overhead;
            tsc = readtsc();
            if (color[i] == White)
                b->boards[i].wb.generateNonCap(good, bad);
            else
                b->boards[i].bb.generateNonCap(good, bad);
            nmoves += bad - good;
            times[i][j] = readtsc() - tsc - overhead;
            for (Move* k=good; k<bad; ++k) {
//                std::cout << k->string() << std::endl;
                tsc = readtsc();
                if (color[i] == White) {
                    __v8hi est = b->boards[i].b->eval.estimate(wb, *k);
                    ColoredBoard<Black> bb(b->boards[i].wb, *k, est);
                    blah += bb.getZobrist();
                } else {
                    __v8hi est = b->boards[i].b->eval.estimate(bb, *k);
                    ColoredBoard<White> bb(b->boards[i].bb, *k, est);
                    blah += bb.getZobrist();
                }
                movetimes[i][j] += readtsc() - tsc - overhead;
            }
//            std::string empty;
//            std::cin >> empty;
        }
    }

    for (QVector<Sample>::Iterator i = times.begin(); i != times.end(); ++i) {
        qSort(*i);
        sum += (*i)[iter / 2];
    }
    uint64_t capsum=0;
    for (QVector<Sample>::Iterator i = captimes.begin(); i != captimes.end(); ++i) {
        qSort(*i);
        capsum += (*i)[iter / 2];
    }
    for (QVector<Sample>::Iterator i = movetimes.begin(); i != movetimes.end(); ++i) {
        qSort(*i);
        movesum += (*i)[iter / 2];
    }
    xout << endl << nmoves << " Moves, " << sum/nmoves << " Clocks, "
         << cpufreq*nmoves/sum << " generated Mmoves/s, "
         << cpufreq*nmoves/movesum << " executed Mmoves/s" << endl;
    xout << ncap << " Captures, " << capsum/ncap << " Clocks, "
         << cpufreq*ncap/capsum << " generated Mmoves/s, "
         /*<< cpufreq*ncap/movesum << " executed Mmoves/s" */ << endl;
    xout << blah + fold(res) + op64 << endl;
#endif
}
static inline uint16_t
fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
        uint16_t nb_pkts, uint8_t *split_packet)
{
    volatile union fm10k_rx_desc *rxdp;
    struct rte_mbuf **mbufp;
    uint16_t nb_pkts_recd;
    int pos;
    struct fm10k_rx_queue *rxq = rx_queue;
    uint64_t var;
    __m128i shuf_msk;
    __m128i dd_check, eop_check;
    uint16_t next_dd;

    next_dd = rxq->next_dd;

    /* Just the act of getting into the function from the application is
     * going to cost about 7 cycles
     */
    rxdp = rxq->hw_ring + next_dd;

    rte_prefetch0(rxdp);

    /* See if we need to rearm the RX queue - gives the prefetch a bit
     * of time to act
     */
    if (rxq->rxrearm_nb > RTE_FM10K_RXQ_REARM_THRESH)
        fm10k_rxq_rearm(rxq);

    /* Before we start moving massive data around, check to see if
     * there is actually a packet available
     */
    if (!(rxdp->d.staterr & FM10K_RXD_STATUS_DD))
        return 0;

    /* Vector RX will process 4 packets at a time, strip the unaligned
     * tails in case it's not a multiple of 4.
     */
    nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_FM10K_DESCS_PER_LOOP);

    /* 4 packets DD mask */
    dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

    /* 4 packets EOP mask */
    eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

    /* mask to shuffle from desc. to mbuf */
    shuf_msk = _mm_set_epi8(
        7, 6, 5, 4,  /* octet 4~7, 32bits rss */
        15, 14,      /* octet 14~15, low 16 bits vlan_macip */
        13, 12,      /* octet 12~13, 16 bits data_len */
        0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
        13, 12,      /* octet 12~13, low 16 bits pkt_len */
        0xFF, 0xFF,  /* skip high 16 bits pkt_type */
        0xFF, 0xFF   /* skip pkt_type field in shuffle operation */
        );
    /*
     * Compile-time verify the shuffle mask
     * NOTE: some field positions already verified above, but duplicated
     * here for completeness in case of future modifications.
     */
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
    RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
            offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

    /* Cache is empty -> need to scan the buffer rings, but first move
     * the next 'n' mbufs into the cache
     */
    mbufp = &rxq->sw_ring[next_dd];

    /* A. load 4 packet in one loop
     * [A*. mask out 4 unused dirty field in desc]
     * B. copy 4 mbuf point from swring to rx_pkts
     * C. calc the number of DD bits among the 4 packets
     * [C*. extract the end-of-packet bit, if requested]
     * D. fill info. from desc to mbuf
     */
    for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
            pos += RTE_FM10K_DESCS_PER_LOOP,
            rxdp += RTE_FM10K_DESCS_PER_LOOP) {
        __m128i descs0[RTE_FM10K_DESCS_PER_LOOP];
        __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
        __m128i zero, staterr, sterr_tmp1, sterr_tmp2;
        /* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
        __m128i mbp1;
#if defined(RTE_ARCH_X86_64)
        __m128i mbp2;
#endif

        /* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */
        mbp1 = _mm_loadu_si128((__m128i *)&mbufp[pos]);

        /* Read desc statuses backwards to avoid race condition */
        /* A.1 load 4 pkts desc */
        descs0[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
        rte_compiler_barrier();

        /* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */
        _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
        /* B.1 load 2 64 bit mbuf points */
        mbp2 = _mm_loadu_si128((__m128i *)&mbufp[pos+2]);
#endif

        descs0[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
        rte_compiler_barrier();
        /* B.1 load 2 mbuf point */
        descs0[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
        rte_compiler_barrier();
        descs0[0] = _mm_loadu_si128((__m128i *)(rxdp));

#if defined(RTE_ARCH_X86_64)
        /* B.2 copy 2 mbuf point into rx_pkts */
        _mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
#endif

        /* avoid compiler reorder optimization */
        rte_compiler_barrier();

        if (split_packet) {
            rte_mbuf_prefetch_part2(rx_pkts[pos]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
        }

        /* D.1 pkt 3,4 convert format from desc to pktmbuf */
        pkt_mb4 = _mm_shuffle_epi8(descs0[3], shuf_msk);
        pkt_mb3 = _mm_shuffle_epi8(descs0[2], shuf_msk);

        /* C.1 4=>2 filter staterr info only */
        sterr_tmp2 = _mm_unpackhi_epi32(descs0[3], descs0[2]);
        /* C.1 4=>2 filter staterr info only */
        sterr_tmp1 = _mm_unpackhi_epi32(descs0[1], descs0[0]);

        /* set ol_flags with vlan packet type */
        fm10k_desc_to_olflags_v(descs0, &rx_pkts[pos]);

        /* D.1 pkt 1,2 convert format from desc to pktmbuf */
        pkt_mb2 = _mm_shuffle_epi8(descs0[1], shuf_msk);
        pkt_mb1 = _mm_shuffle_epi8(descs0[0], shuf_msk);

        /* C.2 get 4 pkts staterr value */
        zero = _mm_xor_si128(dd_check, dd_check);
        staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

        /* D.3 copy final 3,4 data to rx_pkts */
        _mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
                pkt_mb4);
        _mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
                pkt_mb3);

        /* C* extract and record EOP bit */
        if (split_packet) {
            __m128i eop_shuf_mask = _mm_set_epi8(
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0x04, 0x0C, 0x00, 0x08
                    );

            /* and with mask to extract bits, flipping 1-0 */
            __m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
            /* the staterr values are not in order, as the count
             * of dd bits doesn't care. However, for end of
             * packet tracking, we do care, so shuffle. This also
             * compresses the 32-bit values to 8-bit
             */
            eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
            /* store the resulting 32-bit value */
            *(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
            split_packet += RTE_FM10K_DESCS_PER_LOOP;

            /* zero-out next pointers */
            rx_pkts[pos]->next = NULL;
            rx_pkts[pos + 1]->next = NULL;
            rx_pkts[pos + 2]->next = NULL;
            rx_pkts[pos + 3]->next = NULL;
        }

        /* C.3 calc available number of desc */
        staterr = _mm_and_si128(staterr, dd_check);
        staterr = _mm_packs_epi32(staterr, zero);

        /* D.3 copy final 1,2 data to rx_pkts */
        _mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
                pkt_mb2);
        _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
                pkt_mb1);

        fm10k_desc_to_pktype_v(descs0, &rx_pkts[pos]);

        /* C.4 calc available number of desc */
        var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
        nb_pkts_recd += var;
        if (likely(var != RTE_FM10K_DESCS_PER_LOOP))
            break;
    }

    /* Update our internal tail pointer */
    rxq->next_dd = (uint16_t)(rxq->next_dd + nb_pkts_recd);
    rxq->next_dd = (uint16_t)(rxq->next_dd & (rxq->nb_desc - 1));
    rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

    return nb_pkts_recd;
}
void thread_ibs_num(size_t i, size_t n)
{
    const size_t npack = nBlock >> 3;
    const size_t npack2 = npack * 2;

    C_UInt8 *Base = Geno1b.Get();
    IdMatTri I = Array_Thread_MatIdx[i];
    C_Int64 N = Array_Thread_MatCnt[i];
    TS_KINGHomo *p = ptrKING + I.Offset();

    for (; N > 0; N--, ++I, p++)
    {
        C_UInt8 *p1 = Base + I.Row() * npack2;
        C_UInt8 *p2 = Base + I.Column() * npack2;
        double *pAF = AF_1_AF.Get();
        double *pAF2 = AF_1_AF_2.Get();
        ssize_t m = npack;

#if defined(COREARRAY_SIMD_SSE2)
        {
            POPCNT_SSE2_HEAD
            __m128i ibs0_sum, sumsq_sum;
            ibs0_sum = sumsq_sum = _mm_setzero_si128();
            __m128d sq_sum, sq_sum2;
            sq_sum = sq_sum2 = _mm_setzero_pd();

            for (; m > 0; m-=16)
            {
                __m128i g1_1 = _mm_load_si128((__m128i*)p1);
                __m128i g1_2 = _mm_load_si128((__m128i*)(p1 + npack));
                __m128i g2_1 = _mm_load_si128((__m128i*)p2);
                __m128i g2_2 = _mm_load_si128((__m128i*)(p2 + npack));
                p1 += 16; p2 += 16;

                __m128i mask = (g1_1 | ~g1_2) & (g2_1 | ~g2_2);
                __m128i ibs0 = (~((g1_1 ^ ~g2_1) | (g1_2 ^ ~g2_2))) & mask;
                __m128i het  = ((g1_1 ^ g1_2) ^ (g2_1 ^ g2_2)) & mask;

                POPCNT_SSE2_RUN(ibs0)
                ibs0_sum = _mm_add_epi32(ibs0_sum, ibs0);

                POPCNT_SSE2_RUN(het)
                sumsq_sum = _mm_add_epi32(_mm_add_epi32(sumsq_sum, het),
                    _mm_slli_epi32(ibs0, 2));

                C_UInt64 m1 = _mm_cvtsi128_si64(mask);
                C_UInt64 m2 = _mm_cvtsi128_si64(
                    _mm_shuffle_epi32(mask, _MM_SHUFFLE(1,0,3,2)));

                for (size_t k=32; k > 0; k--)
                {
                    switch (m1 & 0x03)
                    {
                    case 3:
                        sq_sum  = _mm_add_pd(sq_sum,  _mm_load_pd(pAF));
                        sq_sum2 = _mm_add_pd(sq_sum2, _mm_load_pd(pAF2));
                        break;
                    case 1:
                        sq_sum  = _mm_add_pd(sq_sum,  _mm_set_pd(0, pAF[0]));
                        sq_sum2 = _mm_add_pd(sq_sum2, _mm_set_pd(0, pAF2[0]));
                        break;
                    case 2:
                        sq_sum  = _mm_add_pd(sq_sum,  _mm_set_pd(pAF[1], 0));
                        sq_sum2 = _mm_add_pd(sq_sum2, _mm_set_pd(pAF2[1], 0));
                        break;
                    }
                    pAF += 2; pAF2 += 2;
                    m1 >>= 2;
                }
                for (size_t k=32; k > 0; k--)
                {
                    switch (m2 & 0x03)
                    {
                    case 3:
                        sq_sum  = _mm_add_pd(sq_sum,  _mm_load_pd(pAF));
                        sq_sum2 = _mm_add_pd(sq_sum2, _mm_load_pd(pAF2));
                        break;
                    case 1:
                        sq_sum  = _mm_add_pd(sq_sum,  _mm_set_pd(0, pAF[0]));
                        sq_sum2 = _mm_add_pd(sq_sum2, _mm_set_pd(0, pAF2[0]));
                        break;
                    case 2:
                        sq_sum  = _mm_add_pd(sq_sum,  _mm_set_pd(pAF[1], 0));
                        sq_sum2 = _mm_add_pd(sq_sum2, _mm_set_pd(pAF2[1], 0));
                        break;
                    }
                    pAF += 2; pAF2 += 2;
                    m2 >>= 2;
                }
            }

            p->IBS0 += vec_sum_i32(ibs0_sum);
            p->SumSq += vec_sum_i32(sumsq_sum);
            p->SumAFreq += vec_sum_f64(sq_sum);
            p->SumAFreq2 += vec_sum_f64(sq_sum2);
        }
#else
        for (; m > 0; m-=8)
        {
            C_UInt64 g1_1 = *((C_UInt64*)p1);
            C_UInt64 g1_2 = *((C_UInt64*)(p1 + npack));
            C_UInt64 g2_1 = *((C_UInt64*)p2);
            C_UInt64 g2_2 = *((C_UInt64*)(p2 + npack));
            p1 += 8; p2 += 8;

            C_UInt64 mask = (g1_1 | ~g1_2) & (g2_1 | ~g2_2);
            C_UInt64 ibs0 = (~((g1_1 ^ ~g2_1) | (g1_2 ^ ~g2_2))) & mask;
            C_UInt64 het  = ((g1_1 ^ g1_2) ^ (g2_1 ^ g2_2)) & mask;

            p->IBS0 += POPCNT_U64(ibs0);
            p->SumSq += POPCNT_U64(het) + POPCNT_U64(ibs0)*4;

            double sum=0, sum2=0;
            for (size_t k=64; k > 0; k--)
            {
                if (mask & 0x01)
                    { sum += (*pAF); sum2 += (*pAF2); }
                pAF ++; pAF2 ++;
                mask >>= 1;
            }
            p->SumAFreq += sum;
            p->SumAFreq2 += sum2;
        }
#endif
    }
}
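thread_ibs_num() reads both 64-bit halves of the SIMD mask by taking the low quadword directly and then swapping quadwords with _mm_shuffle_epi32() before a second _mm_cvtsi128_si64(); this is an SSE2-only alternative to the _mm_srli_si128() shift used elsewhere in this collection (and to SSE4.1 _mm_extract_epi64). As a hedged standalone sketch (helper name illustrative):

#include <stdint.h>
#include <emmintrin.h>

/* Extract both quadwords of an XMM register using only SSE2 (hedged sketch). */
static inline void both_qwords(__m128i v, uint64_t *lo, uint64_t *hi)
{
    *lo = (uint64_t)_mm_cvtsi128_si64(v);
    /* swap the two quadwords, then read the (new) low one */
    *hi = (uint64_t)_mm_cvtsi128_si64(
        _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)));
}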
long long
test (__m128i b)
{
  return _mm_cvtsi128_si64 (b);
}
/*
 * vPMD raw receive routine, only accept (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
 *
 * Notice:
 * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
 *   numbers of DD bit
 * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
 * - don't support ol_flags for rss and csum err
 */
static inline uint16_t
_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
        uint16_t nb_pkts, uint8_t *split_packet)
{
    volatile union ixgbe_adv_rx_desc *rxdp;
    struct ixgbe_rx_entry *sw_ring;
    uint16_t nb_pkts_recd;
    int pos;
    uint64_t var;
    __m128i shuf_msk;
    __m128i crc_adjust = _mm_set_epi16(
                0, 0, 0,       /* ignore non-length fields */
                -rxq->crc_len, /* sub crc on data_len */
                0,             /* ignore high-16bits of pkt_len */
                -rxq->crc_len, /* sub crc on pkt_len */
                0, 0           /* ignore pkt_type field */
            );
    __m128i dd_check, eop_check;

    /* nb_pkts shall be no more than RTE_IXGBE_MAX_RX_BURST */
    nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);

    /* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
    nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);

    /* Just the act of getting into the function from the application is
     * going to cost about 7 cycles
     */
    rxdp = rxq->rx_ring + rxq->rx_tail;

    _mm_prefetch((const void *)rxdp, _MM_HINT_T0);

    /* See if we need to rearm the RX queue - gives the prefetch a bit
     * of time to act
     */
    if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
        ixgbe_rxq_rearm(rxq);

    /* Before we start moving massive data around, check to see if
     * there is actually a packet available
     */
    if (!(rxdp->wb.upper.status_error &
                rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
        return 0;

    /* 4 packets DD mask */
    dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

    /* 4 packets EOP mask */
    eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

    /* mask to shuffle from desc. to mbuf */
    shuf_msk = _mm_set_epi8(
        7, 6, 5, 4,  /* octet 4~7, 32bits rss */
        15, 14,      /* octet 14~15, low 16 bits vlan_macip */
        13, 12,      /* octet 12~13, 16 bits data_len */
        0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
        13, 12,      /* octet 12~13, low 16 bits pkt_len */
        0xFF, 0xFF,  /* skip 32 bit pkt_type */
        0xFF, 0xFF
        );

    /* Cache is empty -> need to scan the buffer rings, but first move
     * the next 'n' mbufs into the cache
     */
    sw_ring = &rxq->sw_ring[rxq->rx_tail];

    /* A. load 4 packet in one loop
     * [A*. mask out 4 unused dirty field in desc]
     * B. copy 4 mbuf point from swring to rx_pkts
     * C. calc the number of DD bits among the 4 packets
     * [C*. extract the end-of-packet bit, if requested]
     * D. fill info. from desc to mbuf
     */
    for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
            pos += RTE_IXGBE_DESCS_PER_LOOP,
            rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
        __m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
        __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
        __m128i zero, staterr, sterr_tmp1, sterr_tmp2;
        __m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */

        /* B.1 load 1 mbuf point */
        mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);

        /* Read desc statuses backwards to avoid race condition */
        /* A.1 load 4 pkts desc */
        descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));

        /* B.2 copy 2 mbuf point into rx_pkts */
        _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

        /* B.1 load 1 mbuf point */
        mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);

        descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
        /* B.1 load 2 mbuf point */
        descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
        descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

        /* B.2 copy 2 mbuf point into rx_pkts */
        _mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);

        if (split_packet) {
            rte_mbuf_prefetch_part2(rx_pkts[pos]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
        }

        /* avoid compiler reorder optimization */
        rte_compiler_barrier();

        /* D.1 pkt 3,4 convert format from desc to pktmbuf */
        pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
        pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

        /* D.1 pkt 1,2 convert format from desc to pktmbuf */
        pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
        pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

        /* C.1 4=>2 filter staterr info only */
        sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
        /* C.1 4=>2 filter staterr info only */
        sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

        /* set ol_flags with vlan packet type */
        desc_to_olflags_v(descs, &rx_pkts[pos]);

        /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
        pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
        pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

        /* C.2 get 4 pkts staterr value */
        zero = _mm_xor_si128(dd_check, dd_check);
        staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

        /* D.3 copy final 3,4 data to rx_pkts */
        _mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
                pkt_mb4);
        _mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
                pkt_mb3);

        /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
        pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
        pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

        /* C* extract and record EOP bit */
        if (split_packet) {
            __m128i eop_shuf_mask = _mm_set_epi8(
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0x04, 0x0C, 0x00, 0x08
                    );

            /* and with mask to extract bits, flipping 1-0 */
            __m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
            /* the staterr values are not in order, as the count
             * of dd bits doesn't care. However, for end of
             * packet tracking, we do care, so shuffle. This also
             * compresses the 32-bit values to 8-bit
             */
            eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
            /* store the resulting 32-bit value */
            *(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
            split_packet += RTE_IXGBE_DESCS_PER_LOOP;

            /* zero-out next pointers */
            rx_pkts[pos]->next = NULL;
            rx_pkts[pos + 1]->next = NULL;
            rx_pkts[pos + 2]->next = NULL;
            rx_pkts[pos + 3]->next = NULL;
        }

        /* C.3 calc available number of desc */
        staterr = _mm_and_si128(staterr, dd_check);
        staterr = _mm_packs_epi32(staterr, zero);

        /* D.3 copy final 1,2 data to rx_pkts */
        _mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
                pkt_mb2);
        _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
                pkt_mb1);

        /* C.4 calc available number of desc */
        var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
        nb_pkts_recd += var;
        if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
            break;
    }

    /* Update our internal tail pointer */
    rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
    rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
    rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

    return nb_pkts_recd;
}
static inline void
fm10k_desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
    __m128i ptype0, ptype1, vtag0, vtag1, eflag0, eflag1, cksumflag;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    const __m128i pkttype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
            PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);

    /* mask everything except rss type */
    const __m128i rsstype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x000F, 0x000F, 0x000F, 0x000F);

    /* mask for HBO and RXE flags */
    const __m128i rxe_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x0001, 0x0001, 0x0001, 0x0001);

    const __m128i l3l4cksum_flag = _mm_set_epi8(0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD,
            PKT_RX_IP_CKSUM_BAD, PKT_RX_L4_CKSUM_BAD, 0);

    const __m128i rxe_flag = _mm_set_epi8(0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, PKT_RX_RECIP_ERR, 0);

    /* map rss type to rss hash flag */
    const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
            0, 0, 0, PKT_RX_RSS_HASH,
            PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
            PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

    /* Calculate RSS_hash and Vlan fields */
    ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

    ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
    ptype0 = _mm_and_si128(ptype0, rsstype_msk);
    ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

    vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
    eflag0 = vtag1;
    cksumflag = vtag1;
    vtag1 = _mm_srli_epi16(vtag1, VP_SHIFT);
    vtag1 = _mm_and_si128(vtag1, pkttype_msk);

    vtag1 = _mm_or_si128(ptype0, vtag1);

    /* Process err flags, simply set RECIP_ERR bit if HBO/IXE is set */
    eflag1 = _mm_srli_epi16(eflag0, RXEFLAG_SHIFT);
    eflag0 = _mm_srli_epi16(eflag0, HBOFLAG_SHIFT);
    eflag0 = _mm_or_si128(eflag0, eflag1);
    eflag0 = _mm_and_si128(eflag0, rxe_msk);
    eflag0 = _mm_shuffle_epi8(rxe_flag, eflag0);

    vtag1 = _mm_or_si128(eflag0, vtag1);

    /* Process L4/L3 checksum error flags */
    cksumflag = _mm_srli_epi16(cksumflag, L3L4EFLAG_SHIFT);
    cksumflag = _mm_shuffle_epi8(l3l4cksum_flag, cksumflag);
    vtag1 = _mm_or_si128(cksumflag, vtag1);

    vol.dword = _mm_cvtsi128_si64(vtag1);

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}