static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
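/* For reference: a minimal scalar sketch (an assumed equivalent, not taken
 * from the library) of what the SSE2 path above approximates. Each color
 * channel is scaled by alpha/255 with rounding; the vector code computes
 * this in 16-bit fixed point via the 0x0101 multiplier. */
#include <stdint.h>
static void MultARGBRow_scalar_sketch(uint32_t* ptr, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t argb = ptr[x];
    const uint32_t a = argb >> 24;
    if (a != 0xff) {
      const uint32_t r = (((argb >> 16) & 0xff) * a + 127) / 255;
      const uint32_t g = (((argb >>  8) & 0xff) * a + 127) / 255;
      const uint32_t b = (((argb >>  0) & 0xff) * a + 127) / 255;
      ptr[x] = (a << 24) | (r << 16) | (g << 8) | b;
    }
  }
}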
static void
TEST (void)
{
  union128i_w s1, s2;
  union128i_b u;
  char e[16];
  int i;

  s1.x = _mm_set_epi16 (2134, -128, 1234, 6354, 1002, 3004, 4050, 9999);
  s2.x = _mm_set_epi16 (41124, 234, 2344, 2354, 607, 1, 2, -8009);
  u.x = test (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    {
      if (s1.a[i] > 127)
	e[i] = 127;
      else if (s1.a[i] < -128)
	e[i] = -128;
      else
	e[i] = s1.a[i];
    }

  for (i = 0; i < 8; i++)
    {
      if (s2.a[i] > 127)
	e[i + 8] = 127;
      else if (s2.a[i] < -128)
	e[i + 8] = -128;
      else
	e[i + 8] = s2.a[i];
    }

  if (check_union128i_b (u, e))
    abort ();
}
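/* A standalone sketch of the intrinsic this test appears to exercise
 * (_mm_packs_epi16, judging by the saturation bounds computed above): two
 * vectors of signed 16-bit lanes are packed into one vector of signed 8-bit
 * lanes, each lane saturated to [-128, 127]. */
#include <emmintrin.h>
static __m128i pack_words_to_bytes_sketch(__m128i lo, __m128i hi)
{
  /* lanes of 'lo' become bytes 0..7 of the result, 'hi' becomes bytes 8..15 */
  return _mm_packs_epi16 (lo, hi);
}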
/* @note: When this function is changed, make corresponding change to
 * fm10k_dev_supported_ptypes_get().
 */
static inline void
fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
    __m128i l3l4type0, l3l4type1, l3type, l4type;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* L3 pkt type mask, Bit4 to Bit6 */
    const __m128i l3type_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x0070, 0x0070, 0x0070, 0x0070);

    /* L4 pkt type mask, Bit7 to Bit9 */
    const __m128i l4type_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x0380, 0x0380, 0x0380, 0x0380);

    /* convert RRC l3 type to mbuf format */
    const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, RTE_PTYPE_L3_IPV6_EXT,
            RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV4_EXT,
            RTE_PTYPE_L3_IPV4, 0);

    /* Convert RRC l4 type to mbuf format. The l4 ptype constants are
     * shifted right by 8 bits so that each fits into an 8-bit table entry.
     */
    const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
            RTE_PTYPE_TUNNEL_GENEVE >> 8,
            RTE_PTYPE_TUNNEL_NVGRE >> 8,
            RTE_PTYPE_TUNNEL_VXLAN >> 8,
            RTE_PTYPE_TUNNEL_GRE >> 8,
            RTE_PTYPE_L4_UDP >> 8,
            RTE_PTYPE_L4_TCP >> 8,
            0);

    l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1);

    l3type = _mm_and_si128(l3l4type0, l3type_msk);
    l4type = _mm_and_si128(l3l4type0, l4type_msk);

    l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT);
    l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT);

    l3type = _mm_shuffle_epi8(l3type_flags, l3type);
    /* table entries were stored >> 8, so shift the looked-up values back */
    l4type = _mm_shuffle_epi8(l4type_flags, l4type);
    l4type = _mm_slli_epi16(l4type, 8);
    l3l4type0 = _mm_or_si128(l3type, l4type);

    vol.dword = _mm_cvtsi128_si64(l3l4type0);

    rx_pkts[0]->packet_type = vol.e[0];
    rx_pkts[1]->packet_type = vol.e[1];
    rx_pkts[2]->packet_type = vol.e[2];
    rx_pkts[3]->packet_type = vol.e[3];
}
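/* The ptype conversion above uses _mm_shuffle_epi8 as a 16-entry lookup
 * table, a recurring idiom in these drivers: the masked and shifted type
 * field becomes a byte index into a constant vector. A minimal sketch of the
 * idiom (illustrative names, not from the driver): */
#include <tmmintrin.h>
static __m128i lut16_lookup_sketch(__m128i table, __m128i indices)
{
    /* each byte of 'indices' in 0..15 selects that byte of 'table';
     * an index byte with its high bit set produces zero */
    return _mm_shuffle_epi8(table, indices);
}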
static void TransformColor(const VP8LMultipliers* const m, uint32_t* argb_data,
                           int num_pixels) {
  const __m128i mults_rb = _mm_set_epi16(
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
  const __m128i mults_b2 = _mm_set_epi16(
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);      // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_slli_epi16(in, 8);           // r 0   b   0
    const __m128i F = _mm_mulhi_epi16(E, mults_b2);    // x db2 0   0
    const __m128i G = _mm_srli_epi32(F, 16);           // 0 0   x db2
    const __m128i H = _mm_add_epi8(G, D);              // x dr  x  db
    const __m128i I = _mm_and_si128(H, mask_rb);       // 0 dr  0  db
    const __m128i out = _mm_sub_epi8(in, I);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
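/* The CST_5b constants implement WebP lossless color-transform deltas. Per
 * the format specification, a delta is a signed 8-bit product shifted right
 * by 5; the mulhi-based vector code above computes the same quantity four
 * pixels at a time. Scalar sketch of the delta, mirroring the spec: */
static int color_transform_delta_sketch(signed char color_pred,
                                        signed char color) {
  return ((int)color_pred * (int)color) >> 5;
}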
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
        struct rte_mbuf **rx_pkts)
{
    __m128i ptype0, ptype1, vtag0, vtag1;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* mask everything except rss type */
    const __m128i rsstype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x000F, 0x000F, 0x000F, 0x000F);

    /* map rss type to rss hash flag */
    const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
            0, 0, 0, PKT_RX_RSS_HASH,
            PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
            PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

    /* mask everything except vlan present bit */
    const __m128i vlan_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
            IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);

    /* map vlan present (0x8) to ol_flags */
    const __m128i vlan_map = _mm_set_epi8(
        0, 0, 0, 0,
        0, 0, 0, vlan_flags,
        0, 0, 0, 0,
        0, 0, 0, 0);

    ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

    ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
    ptype0 = _mm_and_si128(ptype0, rsstype_msk);
    ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

    vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
    vtag1 = _mm_and_si128(vtag1, vlan_msk);
    vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);

    vtag1 = _mm_or_si128(ptype0, vtag1);
    vol.dword = _mm_cvtsi128_si64(vtag1);

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}
static void
rfx_dwt_2d_encode_block_horiz_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
{
    int y;
    int n;
    int first;
    __m128i src_2n;
    __m128i src_2n_1;
    __m128i src_2n_2;
    __m128i h_n;
    __m128i h_n_m;
    __m128i l_n;

    for (y = 0; y < subband_width; y++)
    {
        for (n = 0; n < subband_width; n += 8)
        {
            /* The following three set operations consume more than half of
               the total DWT processing time! */
            src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8],
                                   src[6], src[4], src[2], src[0]);
            src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9],
                                     src[7], src[5], src[3], src[1]);
            src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
                                     src[14], src[12], src[10],
                                     src[8], src[6], src[4], src[2]);

            /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
            h_n = _mm_add_epi16(src_2n, src_2n_2);
            h_n = _mm_srai_epi16(h_n, 1);
            h_n = _mm_sub_epi16(src_2n_1, h_n);
            h_n = _mm_srai_epi16(h_n, 1);
            _mm_store_si128((__m128i*) h, h_n);

            h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
            if (n == 0)
            {
                /* mirror the boundary: use h[0] in place of h[-1] */
                first = _mm_extract_epi16(h_n_m, 1);
                h_n_m = _mm_insert_epi16(h_n_m, first, 0);
            }

            /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
            l_n = _mm_add_epi16(h_n_m, h_n);
            l_n = _mm_srai_epi16(l_n, 1);
            l_n = _mm_add_epi16(l_n, src_2n);
            _mm_store_si128((__m128i*) l, l_n);

            src += 16;
            l += 8;
            h += 8;
        }
    }
}
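/* Scalar reference for one row (a sketch reconstructed from the comments
 * above, not from the original file; int16_t stands in for FreeRDP's INT16):
 * the lifting steps the SSE2 loop vectorizes, with the same mirrored
 * boundary handling at both ends. */
#include <stdint.h>
static void dwt_encode_horiz_scalar_sketch(const int16_t* src, int16_t* l,
                                           int16_t* h, int subband_width)
{
    int n;
    for (n = 0; n < subband_width; n++)
    {
        const int x2n = src[2 * n];
        const int x2n1 = src[2 * n + 1];
        const int x2n2 = (n == subband_width - 1) ? src[2 * n] : src[2 * n + 2];
        h[n] = (int16_t)((x2n1 - ((x2n + x2n2) >> 1)) >> 1);
    }
    for (n = 0; n < subband_width; n++)
    {
        const int h_prev = (n == 0) ? h[0] : h[n - 1];
        l[n] = (int16_t)(src[2 * n] + ((h_prev + h[n]) >> 1));
    }
}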
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
                                       int tile_width, int tile_height,
                                       int green_to_blue, int red_to_blue,
                                       int histo[]) {
  const __m128i mults_r = _mm_set_epi16(
      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
  const __m128i mults_g = _mm_set_epi16(
      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask_b = _mm_set1_epi32(0x0000ff);  // blue mask
  int y;
  for (y = 0; y < tile_height; ++y) {
    const uint32_t* const src = argb + y * stride;
    int i, x;
    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
      uint16_t values[SPAN];
      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
      const __m128i A0 = _mm_slli_epi16(in0, 8);        // r 0  | b 0
      const __m128i A1 = _mm_slli_epi16(in1, 8);
      const __m128i B0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
      const __m128i B1 = _mm_and_si128(in1, mask_g);
      const __m128i C0 = _mm_mulhi_epi16(A0, mults_r);  // x db | 0 0
      const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
      const __m128i D0 = _mm_mulhi_epi16(B0, mults_g);  // 0 0  | x db
      const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
      const __m128i E0 = _mm_sub_epi8(in0, D0);         // x x  | x b'
      const __m128i E1 = _mm_sub_epi8(in1, D1);
      const __m128i F0 = _mm_srli_epi32(C0, 16);        // 0 0  | x db
      const __m128i F1 = _mm_srli_epi32(C1, 16);
      const __m128i G0 = _mm_sub_epi8(E0, F0);          // 0 0  | x b'
      const __m128i G1 = _mm_sub_epi8(E1, F1);
      const __m128i H0 = _mm_and_si128(G0, mask_b);     // 0 0  | 0 b
      const __m128i H1 = _mm_and_si128(G1, mask_b);
      const __m128i I = _mm_packs_epi32(H0, H1);        // 0 b' | 0 b'
      _mm_storeu_si128((__m128i*)values, I);
      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
    }
  }
  {
    const int left_over = tile_width & (SPAN - 1);
    if (left_over > 0) {
      VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
                                       left_over, tile_height,
                                       green_to_blue, red_to_blue, histo);
    }
  }
}
void unpack_rgb5a1_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0, t1, t2;
    Uint32 i;

    for (i = 0; i < (size / 8); i++) {
        t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
        t0 = _mm_unpacklo_epi16(t0, t0);

        t1 = _mm_unpacklo_epi16(t0, t0);
        t1 = _mm_and_si128(t1, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00,
                                             0x8000, 0x001F, 0x03E0, 0x7C00));
        t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002,
                                               0x0001, 0x0800, 0x0040, 0x0002));
        t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260,
                                               0x0200, 0x0260, 0x0260, 0x0260));
        t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5,
                                               0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));

        t2 = _mm_unpackhi_epi16(t0, t0);
        t2 = _mm_and_si128(t2, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00,
                                             0x8000, 0x001F, 0x03E0, 0x7C00));
        t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002,
                                               0x0001, 0x0800, 0x0040, 0x0002));
        t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260,
                                               0x0200, 0x0260, 0x0260, 0x0260));
        t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5,
                                               0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));

        t1 = _mm_packus_epi16(t1, t2);
        _mm_stream_si128((__m128i*)&dest[i * 16], t1);
    }
}
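/* Scalar sketch of the channel widening the multiply chain above performs
 * (assumed intent, not from the original source): a 5-bit channel value v
 * expands to 8 bits as (v << 3) | (v >> 2), which closely approximates
 * v * 255 / 31, and the 1-bit alpha expands to 0x00 or 0xFF. */
static unsigned expand5_sketch(unsigned v) { return (v << 3) | (v >> 2); }
static unsigned expand1_sketch(unsigned v) { return v ? 0xFFu : 0x00u; }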
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
    __m128i ptype0, ptype1, vtag0, vtag1;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* pkt type + vlan olflags mask */
    const __m128i pkttype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
            PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);

    /* mask everything except rss type */
    const __m128i rsstype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x000F, 0x000F, 0x000F, 0x000F);

    /* map rss type to rss hash flag */
    const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
            0, 0, 0, PKT_RX_RSS_HASH,
            PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
            PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

    ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

    ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
    ptype0 = _mm_and_si128(ptype0, rsstype_msk);
    ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

    vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
    vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);
    vtag1 = _mm_and_si128(vtag1, pkttype_msk);

    vtag1 = _mm_or_si128(ptype0, vtag1);
    vol.dword = _mm_cvtsi128_si64(vtag1);

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}
static void
TEST (void)
{
  union128i_w u, s1, s2;
  short e[8];
  int i;

  s1.x = _mm_set_epi16 (10, 20, 30, 90, -80, -40, -100, -15);
  s2.x = _mm_set_epi16 (11, 98, 76, -100, -34, -78, -39, 14);
  u.x = test (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    e[i] = s1.a[i] + s2.a[i];

  if (check_union128i_w (u, e))
    abort ();
}
static void
TEST (void)
{
  union128i_w u, s1, s2;
  short e[8];
  int i;

  s1.x = _mm_set_epi16 (1, 2, 3, 4, 5, 6, 7, 8);
  s2.x = _mm_set_epi16 (8, 7, 6, 5, 4, 3, 2, 1);
  u.x = test (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    e[i] = s1.a[i] > s2.a[i] ? s1.a[i] : s2.a[i];

  if (check_union128i_w (u, e))
    abort ();
}
static void
TEST (void)
{
  union128i_uw u, s1, s2;
  unsigned short e[8];
  int i;

  s1.x = _mm_set_epi16 (10, 20, 30, 90, 80, 40, 100, 15);
  s2.x = _mm_set_epi16 (11, 98, 76, 100, 34, 78, 39, 14);
  u.x = test (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    e[i] = (s1.a[i] + s2.a[i] + 1) >> 1;

  if (check_union128i_uw (u, e))
    abort ();
}
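/* The expected-value loop implies the wrapped intrinsic is _mm_avg_epu16:
 * a rounding unsigned average, (a + b + 1) >> 1 in each 16-bit lane.
 * Minimal direct use (sketch): */
#include <emmintrin.h>
static __m128i avg_words_sketch(__m128i a, __m128i b)
{
  return _mm_avg_epu16 (a, b);
}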
// SIMD.Int16x8 operation wrappers that cover intrinsics for x86/x64 systems
SIMDValue SIMDInt16x8Operation::OpInt16x8(int16 values[])
{
    X86SIMDValue x86Result;
    // Sets the 8 signed 16-bit integer values; note the reversed argument
    // order: _mm_set_epi16 takes the highest lane (x7) first.
    x86Result.m128i_value = _mm_set_epi16(values[7], values[6], values[5], values[4],
                                          values[3], values[2], values[1], values[0]);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
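/* The reversed order is easy to get wrong, so here is a small standalone
 * illustration (not from the engine): _mm_set_epi16 lists the highest lane
 * first, so the two vectors below hold identical contents. */
#include <emmintrin.h>
static void set_epi16_order_demo(void)
{
    short v[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };  /* v[0] is lane 0 */
    const __m128i a = _mm_set_epi16(v[7], v[6], v[5], v[4],
                                    v[3], v[2], v[1], v[0]);
    const __m128i b = _mm_loadu_si128((const __m128i*)v);  /* memory order */
    (void)a;
    (void)b;
}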
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap, gint len,
    const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, tl1, tl2, th1, th2;
  __m128i f[2];
  const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride),
    (gint16 *) ((gint8 *) a + 2 * astride),
    (gint16 *) ((gint8 *) a + 3 * astride)
  };

  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);

    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
    tb = _mm_load_si128 ((__m128i *) (c[3] + i));

    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);

    tl1 = _mm_add_epi32 (tl1, tl2);
    th1 = _mm_add_epi32 (th1, th2);

    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
    th1 = _mm_srai_epi32 (th1, PRECISION_S16);

    tl1 = _mm_packs_epi32 (tl1, th1);
    _mm_store_si128 ((__m128i *) (o + i), tl1);
  }
}
static void
TEST (void)
{
  union128i_w u, s1, s2;
  short e[8];
  int i, tmp;

  s1.x = _mm_set_epi16 (10, 2067, -3033, 90, 80, 40, -1000, 15);
  s2.x = _mm_set_epi16 (11, 9834, 7444, -10222, 34, -7833, 39, 14);
  u.x = test (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    {
      tmp = s1.a[i] * s2.a[i];
      e[i] = tmp;  /* truncates to the low 16 bits of the product */
    }

  if (check_union128i_w (u, e))
    abort ();
}
static void
TEST (void)
{
  union128i_uw u, s1, s2;
  unsigned short e[8];
  int i, tmp;

  s1.x = _mm_set_epi16 (10, 2067, 3033, 90, 80, 40, 1000, 15);
  s2.x = _mm_set_epi16 (11, 9834, 7444, 10222, 34, 7833, 39, 14);
  u.x = test (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    {
      tmp = s1.a[i] * s2.a[i];
      e[i] = (tmp & 0xffff0000) >> 16;
    }

  if (check_union128i_uw (u, e))
    abort ();
}
/* -----------------------------------
 *        weighted_merge_planar
 * -----------------------------------
 */
void weighted_merge_planar_sse2(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch,
                                int width, int height, int weight, int invweight) {
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i zero = _mm_setzero_si128();
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight,
                               weight, invweight, weight, invweight);

  int wMod16 = (width / 16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(p1 + x)); //y7y6 y5y4 y3y2 y1y0
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(p2 + x)); //Y7Y6 Y5Y4 Y3Y2 Y1Y0

      __m128i p0123 = _mm_unpacklo_epi8(px1, px2); //Y3y3 Y2y2 Y1y1 Y0y0
      __m128i p4567 = _mm_unpackhi_epi8(px1, px2); //Y7y7 Y6y6 Y5y5 Y4y4

      __m128i p01 = _mm_unpacklo_epi8(p0123, zero); //00Y1 00y1 00Y0 00y0
      __m128i p23 = _mm_unpackhi_epi8(p0123, zero); //00Y3 00y3 00Y2 00y2
      __m128i p45 = _mm_unpacklo_epi8(p4567, zero); //00Y5 00y5 00Y4 00y4
      __m128i p67 = _mm_unpackhi_epi8(p4567, zero); //00Y7 00y7 00Y6 00y6

      p01 = _mm_madd_epi16(p01, mask);
      p23 = _mm_madd_epi16(p23, mask);
      p45 = _mm_madd_epi16(p45, mask);
      p67 = _mm_madd_epi16(p67, mask);

      p01 = _mm_add_epi32(p01, round_mask);
      p23 = _mm_add_epi32(p23, round_mask);
      p45 = _mm_add_epi32(p45, round_mask);
      p67 = _mm_add_epi32(p67, round_mask);

      p01 = _mm_srli_epi32(p01, 15);
      p23 = _mm_srli_epi32(p23, 15);
      p45 = _mm_srli_epi32(p45, 15);
      p67 = _mm_srli_epi32(p67, 15);

      p0123 = _mm_packs_epi32(p01, p23);
      p4567 = _mm_packs_epi32(p45, p67);

      __m128i result = _mm_packus_epi16(p0123, p4567);
      _mm_store_si128(reinterpret_cast<__m128i*>(p1 + x), result);
    }

    for (int x = wMod16; x < width; x++) {
      p1[x] = (p1[x] * invweight + p2[x] * weight + 16384) >> 15;
    }

    p1 += p1_pitch;
    p2 += p2_pitch;
  }
}
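/* The kernel above leans on _mm_madd_epi16: after the unpacks, each 32-bit
 * lane holds an adjacent (p1, p2) byte pair widened to 16 bits, and madd
 * against the (invweight, weight) pattern produces p1*invweight + p2*weight
 * per lane in one instruction, matching the scalar leftover loop. Minimal
 * sketch of that pairing (illustrative, not from the plugin): */
#include <emmintrin.h>
static __m128i weighted_pairs_sketch(__m128i interleaved_p1_p2,
                                     short invweight, short weight)
{
  const __m128i w = _mm_set_epi16(weight, invweight, weight, invweight,
                                  weight, invweight, weight, invweight);
  return _mm_madd_epi16(interleaved_p1_p2, w);  /* four 32-bit weighted sums */
}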
static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
                               int w, int h, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const int kSpan = 2;
  const int w2 = w & ~(kSpan - 1);
  while (h-- > 0) {
    uint32_t* const rgbx = (uint32_t*)rgba;
    int i;
    if (!alpha_first) {
      const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
      const __m128i kMult =
          _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);
      for (i = 0; i < w2; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
      }
    } else {
      const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
      const __m128i kMult =
          _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);
      for (i = 0; i < w2; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
      }
    }
    // Finish with left-overs.
    for (; i < w; ++i) {
      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
      const uint32_t a = alpha[4 * i];
      if (a != 0xff) {
        const uint32_t mult = MULTIPLIER(a);
        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
      }
    }
    rgba += stride;
  }
}
void unpack_rgba4_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0, t1, t2;
    Uint32 i;

    for (i = 0; i < (size / 8); i++) {
        t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
        t0 = _mm_unpacklo_epi16(t0, t0);

        // converts 4 bit values to 8 bit values (multiply with 17)
        t1 = _mm_unpacklo_epi16(t0, t0);
        t1 = _mm_and_si128(t1, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00,
                                             0xF000, 0x000F, 0x00F0, 0x0F00));
        t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010,
                                               0x0001, 0x1000, 0x0100, 0x0010));
        t1 = _mm_mulhi_epu16(t1, _mm_set1_epi16(0x0110));

        t2 = _mm_unpackhi_epi16(t0, t0);
        t2 = _mm_and_si128(t2, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00,
                                             0xF000, 0x000F, 0x00F0, 0x0F00));
        t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010,
                                               0x0001, 0x1000, 0x0100, 0x0010));
        t2 = _mm_mulhi_epu16(t2, _mm_set1_epi16(0x0110));

        t1 = _mm_packus_epi16(t1, t2);
        _mm_stream_si128((__m128i*)&dest[i * 16], t1);
    }
}
static void TransformColorInverse(const VP8LMultipliers* const m,
                                  const uint32_t* const src, int num_pixels,
                                  uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
  const __m128i mults_rb = _mm_set_epi16(
      CST(green_to_red_), CST(green_to_blue_),
      CST(green_to_red_), CST(green_to_blue_),
      CST(green_to_red_), CST(green_to_blue_),
      CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = _mm_set_epi16(
      CST(red_to_blue_), 0, CST(red_to_blue_), 0,
      CST(red_to_blue_), 0, CST(red_to_blue_), 0);
#undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);      // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);             // x r'  x   b'
    const __m128i F = _mm_slli_epi16(E, 8);            // r' 0   b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);    // x db2  0  0
    const __m128i H = _mm_srli_epi32(G, 8);            // 0  x db2  0
    const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);            // 0 r'  0 b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}
/* -----------------------------------
 *      weighted_merge_luma_yuy2
 * -----------------------------------
 */
static void weighted_merge_luma_yuy2_sse2(BYTE *src, const BYTE *luma, int pitch,
                                          int luma_pitch, int width, int height,
                                          int weight, int invweight) {
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight,
                               weight, invweight, weight, invweight);
  __m128i luma_mask = _mm_set1_epi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m128i chroma_mask = _mm_set1_epi16(0xFF00);
#pragma warning(pop)

  int wMod16 = (width / 16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src + x));  //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(luma + x)); //v1 y3 u1 y2 v0 y1 u0 y0

      __m128i src_lo = _mm_unpacklo_epi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m128i src_hi = _mm_unpackhi_epi16(px1, px2);

      src_lo = _mm_and_si128(src_lo, luma_mask); //00 v0 00 V0 00 u0 00 U0
      src_hi = _mm_and_si128(src_hi, luma_mask);

      src_lo = _mm_madd_epi16(src_lo, mask);
      src_hi = _mm_madd_epi16(src_hi, mask);

      src_lo = _mm_add_epi32(src_lo, round_mask);
      src_hi = _mm_add_epi32(src_hi, round_mask);

      src_lo = _mm_srli_epi32(src_lo, 15);
      src_hi = _mm_srli_epi32(src_hi, 15);

      __m128i result_luma = _mm_packs_epi32(src_lo, src_hi);

      __m128i result_chroma = _mm_and_si128(px1, chroma_mask);
      __m128i result = _mm_or_si128(result_chroma, result_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src + x), result);
    }

    for (int x = wMod16; x < width; x += 2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
}
static void
avx2_test (void)
{
  union128i_w s;
  union256i_q res;
  long long int res_ref[4];

  s.x = _mm_set_epi16 (1, 2, 3, 4, -200, 50, 6, 8);

  res.x = _mm256_cvtepi16_epi64 (s.x);

  compute_movsxwq (s.a, res_ref);

  if (check_union256i_q (res, res_ref))
    abort ();
}
#include <stdio.h>
#include <emmintrin.h>

/* helper assumed by the original demo; a minimal definition so it compiles */
static void printarr(const char* a, const char* name)
{
    printf("%s:", name);
    for (int i = 0; i < 16; i++)
        printf(" %02x", (unsigned char)a[i]);
    printf("\n");
}

int main(void)
{
    __m128i v1, v2;
    char a1[16], a2[16];

    // packing data
    v1 = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    v2 = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);

    // store data to memory
    _mm_storeu_si128((__m128i*)a1, v1);
    _mm_storeu_si128((__m128i*)a2, v2);

    printarr(a1, "a1");
    printarr(a2, "a2");
    return 0;
}
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
    __m128i vlan0, vlan1, rss;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* mask everything except rss and vlan flags
     * bit 2 is for vlan tag, bits 13:12 for rss
     */
    const __m128i rss_vlan_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x3004, 0x3004, 0x3004, 0x3004);

    /* map rss and vlan type to rss hash and vlan flag */
    const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, PKT_RX_VLAN_PKT,
            0, 0, 0, 0);

    const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            PKT_RX_FDIR, 0, PKT_RX_RSS_HASH, 0);

    vlan0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vlan1 = _mm_unpackhi_epi16(descs[2], descs[3]);
    vlan0 = _mm_unpacklo_epi32(vlan0, vlan1);

    vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
    vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);

    rss = _mm_srli_epi16(vlan1, 12);
    rss = _mm_shuffle_epi8(rss_flags, rss);

    vlan0 = _mm_or_si128(vlan0, rss);
    vol.dword = _mm_cvtsi128_si64(vlan0);

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}
/* -----------------------------------
 *      weighted_merge_chroma_yuy2
 * -----------------------------------
 */
static void weighted_merge_chroma_yuy2_sse2(BYTE *src, const BYTE *chroma, int pitch,
                                            int chroma_pitch, int width, int height,
                                            int weight, int invweight) {
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight,
                               weight, invweight, weight, invweight);
  __m128i luma_mask = _mm_set1_epi16(0x00FF);

  int wMod16 = (width / 16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src + x));
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(chroma + x));

      __m128i src_lo = _mm_unpacklo_epi16(px1, px2);
      __m128i src_hi = _mm_unpackhi_epi16(px1, px2);

      src_lo = _mm_srli_epi16(src_lo, 8);
      src_hi = _mm_srli_epi16(src_hi, 8);

      src_lo = _mm_madd_epi16(src_lo, mask);
      src_hi = _mm_madd_epi16(src_hi, mask);

      src_lo = _mm_add_epi32(src_lo, round_mask);
      src_hi = _mm_add_epi32(src_hi, round_mask);

      src_lo = _mm_srli_epi32(src_lo, 15);
      src_hi = _mm_srli_epi32(src_hi, 15);

      __m128i result_chroma = _mm_packs_epi32(src_lo, src_hi);
      result_chroma = _mm_slli_epi16(result_chroma, 8);

      __m128i result_luma = _mm_and_si128(px1, luma_mask);
      __m128i result = _mm_or_si128(result_chroma, result_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src + x), result);
    }

    for (int x = wMod16; x < width; x += 2) {
      src[x + 1] = (chroma[x + 1] * weight + src[x + 1] * invweight + 16384) >> 15;
    }

    src += pitch;
    chroma += chroma_pitch;
  }
}
static void
test (unsigned short *v)
{
  __m128i x;

  x = _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, v[0]);
  check (x, v, 0);
  x = _mm_set_epi16 (0, 0, 0, 0, 0, 0, v[1], 0);
  check (x, v, 1);
  x = _mm_set_epi16 (0, 0, 0, 0, 0, v[2], 0, 0);
  check (x, v, 2);
  x = _mm_set_epi16 (0, 0, 0, 0, v[3], 0, 0, 0);
  check (x, v, 3);
  x = _mm_set_epi16 (0, 0, 0, v[4], 0, 0, 0, 0);
  check (x, v, 4);
  x = _mm_set_epi16 (0, 0, v[5], 0, 0, 0, 0, 0);
  check (x, v, 5);
  x = _mm_set_epi16 (0, v[6], 0, 0, 0, 0, 0, 0);
  check (x, v, 6);
  x = _mm_set_epi16 (v[7], 0, 0, 0, 0, 0, 0, 0);
  check (x, v, 7);
}
static void
TEST (void)
{
  union128i_w u, s;
  union128i_q c;
  short e[8] = { 0 };
  int i;

  s.x = _mm_set_epi16 (1, -2, 3, 4, 5, 6, -0x7000, 0x9000);
  c.x = _mm_set_epi64x (12, 13);
  u.x = test (s.x, c.x);

  if (c.a[0] < 16)
    for (i = 0; i < 8; i++)
      e[i] = s.a[i] >> c.a[0];

  if (check_union128i_w (u, e))
    abort ();
}
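/* The c.a[0] < 16 guard reflects the semantics of the intrinsic this test
 * appears to wrap (_mm_sra_epi16): the shift count comes from the low 64 bits
 * of the second operand, and a count of 16 or more floods every lane with its
 * sign bit, which a plain C >> cannot express portably. Scalar sketch of the
 * per-lane rule: */
static short sra16_ref_sketch(short x, unsigned long long count)
{
  if (count < 16)
    return (short)(x >> count);
  return (short)(x < 0 ? -1 : 0);
}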
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
                                      int tile_width, int tile_height,
                                      int green_to_red, int histo[]) {
  const __m128i mults_g = _mm_set_epi16(
      0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
      0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask = _mm_set1_epi32(0xff);
  int y;
  for (y = 0; y < tile_height; ++y) {
    const uint32_t* const src = argb + y * stride;
    int i, x;
    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
      uint16_t values[SPAN];
      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
      const __m128i A0 = _mm_and_si128(in0, mask_g);    // 0 0 | g 0
      const __m128i A1 = _mm_and_si128(in1, mask_g);
      const __m128i B0 = _mm_srli_epi32(in0, 16);       // 0 0 | x r
      const __m128i B1 = _mm_srli_epi32(in1, 16);
      const __m128i C0 = _mm_mulhi_epi16(A0, mults_g);  // 0 0 | x dr
      const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
      const __m128i E0 = _mm_sub_epi8(B0, C0);          // x x | x r'
      const __m128i E1 = _mm_sub_epi8(B1, C1);
      const __m128i F0 = _mm_and_si128(E0, mask);       // 0 0 | 0 r'
      const __m128i F1 = _mm_and_si128(E1, mask);
      const __m128i I = _mm_packs_epi32(F0, F1);
      _mm_storeu_si128((__m128i*)values, I);
      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
    }
  }
  {
    const int left_over = tile_width & (SPAN - 1);
    if (left_over > 0) {
      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
                                      left_over, tile_height, green_to_red,
                                      histo);
    }
  }
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
  static const int kC2 = 35468;
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  const __m128i m0 = _mm_adds_epi16(B, d4);
  const __m128i m1 = _mm_adds_epi16(B, c4);
  const __m128i m2 = _mm_subs_epi16(B, c4);
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
  dst2 = _mm_unpacklo_epi8(dst2, zero);
  dst3 = _mm_unpacklo_epi8(dst3, zero);
  // Add the inverse transform.
  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
  // Unsigned saturate to 8b.
  dst0 = _mm_packus_epi16(dst0, dst0);
  dst1 = _mm_packus_epi16(dst1, dst1);
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
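/* TransformAC3 relies on a fixed-point helper defined elsewhere in libwebp;
 * a sketch of the usual definition, for context: kC1 encodes
 * sqrt(2)*cos(pi/8) and kC2 encodes sqrt(2)*sin(pi/8) in 16.16 fixed point,
 * and MUL applies one of them by keeping the high 16 bits of the product. */
#define MUL(a, b) (((a) * (b)) >> 16)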
/*
 * vPMD raw receive routine; only accepts nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP.
 *
 * Notice:
 * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP: just return no packet
 * - nb_pkts > RTE_IXGBE_MAX_RX_BURST: only scan RTE_IXGBE_MAX_RX_BURST
 *   numbers of DD bits
 * - floor align nb_pkts to a multiple of RTE_IXGBE_DESCS_PER_LOOP
 * - doesn't support ol_flags for rss and csum err
 */
static inline uint16_t
_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
        uint16_t nb_pkts, uint8_t *split_packet)
{
    volatile union ixgbe_adv_rx_desc *rxdp;
    struct ixgbe_rx_entry *sw_ring;
    uint16_t nb_pkts_recd;
    int pos;
    uint64_t var;
    __m128i shuf_msk;
    __m128i crc_adjust = _mm_set_epi16(
                0, 0, 0,       /* ignore non-length fields */
                -rxq->crc_len, /* sub crc on data_len */
                0,             /* ignore high-16bits of pkt_len */
                -rxq->crc_len, /* sub crc on pkt_len */
                0, 0           /* ignore pkt_type field */
            );
    __m128i dd_check, eop_check;

    /* nb_pkts shall be less than or equal to RTE_IXGBE_MAX_RX_BURST */
    nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);

    /* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
    nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);

    /* Just the act of getting into the function from the application is
     * going to cost about 7 cycles
     */
    rxdp = rxq->rx_ring + rxq->rx_tail;

    _mm_prefetch((const void *)rxdp, _MM_HINT_T0);

    /* See if we need to rearm the RX queue - gives the prefetch a bit
     * of time to act
     */
    if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
        ixgbe_rxq_rearm(rxq);

    /* Before we start moving massive data around, check to see if
     * there is actually a packet available
     */
    if (!(rxdp->wb.upper.status_error &
                rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
        return 0;

    /* 4 packets DD mask */
    dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

    /* 4 packets EOP mask */
    eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

    /* mask to shuffle from desc. to mbuf */
    shuf_msk = _mm_set_epi8(
        7, 6, 5, 4,  /* octet 4~7, 32bits rss */
        15, 14,      /* octet 14~15, low 16 bits vlan_macip */
        13, 12,      /* octet 12~13, 16 bits data_len */
        0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
        13, 12,      /* octet 12~13, low 16 bits pkt_len */
        0xFF, 0xFF,  /* skip 32 bit pkt_type */
        0xFF, 0xFF
        );

    /* Cache is empty -> need to scan the buffer rings, but first move
     * the next 'n' mbufs into the cache
     */
    sw_ring = &rxq->sw_ring[rxq->rx_tail];

    /* A. load 4 packet in one loop
     * [A*. mask out 4 unused dirty field in desc]
     * B. copy 4 mbuf point from swring to rx_pkts
     * C. calc the number of DD bits among the 4 packets
     * [C*. extract the end-of-packet bit, if requested]
     * D. fill info. from desc to mbuf
     */
    for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
            pos += RTE_IXGBE_DESCS_PER_LOOP,
            rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
        __m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
        __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
        __m128i zero, staterr, sterr_tmp1, sterr_tmp2;
        __m128i mbp1, mbp2; /* two mbuf pointers in one XMM reg. */

        /* B.1 load 1 mbuf point */
        mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);

        /* Read desc statuses backwards to avoid race condition */
        /* A.1 load 4 pkts desc */
        descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));

        /* B.2 copy 2 mbuf point into rx_pkts */
        _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

        /* B.1 load 1 mbuf point */
        mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);

        descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
        /* B.1 load 2 mbuf point */
        descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
        descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

        /* B.2 copy 2 mbuf point into rx_pkts */
        _mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);

        if (split_packet) {
            rte_mbuf_prefetch_part2(rx_pkts[pos]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
        }

        /* avoid compiler reorder optimization */
        rte_compiler_barrier();

        /* D.1 pkt 3,4 convert format from desc to pktmbuf */
        pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
        pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

        /* D.1 pkt 1,2 convert format from desc to pktmbuf */
        pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
        pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

        /* C.1 4=>2 filter staterr info only */
        sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);

        /* C.1 4=>2 filter staterr info only */
        sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

        /* set ol_flags with vlan packet type */
        desc_to_olflags_v(descs, &rx_pkts[pos]);

        /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
        pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
        pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

        /* C.2 get 4 pkts staterr value */
        zero = _mm_xor_si128(dd_check, dd_check);
        staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

        /* D.3 copy final 3,4 data to rx_pkts */
        _mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
                pkt_mb4);
        _mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
                pkt_mb3);

        /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
        pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
        pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

        /* C* extract and record EOP bit */
        if (split_packet) {
            __m128i eop_shuf_mask = _mm_set_epi8(
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0xFF, 0xFF, 0xFF, 0xFF,
                    0x04, 0x0C, 0x00, 0x08
                    );

            /* and with mask to extract bits, flipping 1-0 */
            __m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
            /* the staterr values are not in order, as the count
             * of dd bits doesn't care. However, for end of
             * packet tracking, we do care, so shuffle. This also
             * compresses the 32-bit values to 8-bit
             */
            eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
            /* store the resulting 32-bit value */
            *(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
            split_packet += RTE_IXGBE_DESCS_PER_LOOP;

            /* zero-out next pointers */
            rx_pkts[pos]->next = NULL;
            rx_pkts[pos + 1]->next = NULL;
            rx_pkts[pos + 2]->next = NULL;
            rx_pkts[pos + 3]->next = NULL;
        }

        /* C.3 calc available number of desc */
        staterr = _mm_and_si128(staterr, dd_check);
        staterr = _mm_packs_epi32(staterr, zero);

        /* D.3 copy final 1,2 data to rx_pkts */
        _mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
                pkt_mb2);
        _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
                pkt_mb1);

        /* C.4 calc available number of desc */
        var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
        nb_pkts_recd += var;
        if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
            break;
    }

    /* Update our internal tail pointer */
    rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
    rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
    rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

    return nb_pkts_recd;
}
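/* The two burst-size adjustments at the top of _recv_raw_pkts_vec reduce any
 * request to a multiple of the 4-descriptor loop. The same arithmetic without
 * the DPDK macros (a sketch, for illustration only): */
static unsigned clamp_burst_sketch(unsigned nb_pkts, unsigned max_burst,
        unsigned descs_per_loop)
{
    if (nb_pkts > max_burst)
        nb_pkts = max_burst;                       /* RTE_MIN */
    return nb_pkts - (nb_pkts % descs_per_loop);   /* RTE_ALIGN_FLOOR */
}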