/*
 * vPMD raw receive routine, only accepts (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
 *
 * Notice:
 * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
 *   numbers of DD bits
 * - floor align nb_pkts to a multiple of RTE_IXGBE_DESCS_PER_LOOP
 * - does not support ol_flags for rss and csum err
 */
static inline uint16_t
_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union ixgbe_adv_rx_desc *rxdp;
	struct ixgbe_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0,       /* ignore non-length fields */
				-rxq->crc_len, /* sub crc on data_len */
				0,             /* ignore high-16bits of pkt_len */
				-rxq->crc_len, /* sub crc on pkt_len */
				0, 0           /* ignore pkt_type field */
			);
	__m128i dd_check, eop_check;

	/* nb_pkts has to be less than or equal to RTE_IXGBE_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;
	_mm_prefetch((const void *)rxdp, _MM_HINT_T0);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
		ixgbe_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.upper.status_error &
				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
		return 0;

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
		13, 12,      /* octet 12~13, 16 bits data_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		13, 12,      /* octet 12~13, low 16 bits pkt_len */
		0xFF, 0xFF,  /* skip 32 bit pkt_type */
		0xFF, 0xFF
		);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_IXGBE_DESCS_PER_LOOP,
			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
		__m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		/* 2 mbuf pointers in one XMM reg. */
		__m128i mbp1, mbp2;

		/* B.1 load 2 mbuf pointers */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

		/* B.1 load 2 more mbuf pointers */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);

		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		/* set ol_flags with vlan packet type */
		desc_to_olflags_v(descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* C.2 get 4 pkts staterr value */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
				pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_IXGBE_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
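/*
 * Editor's illustrative sketch (not part of the driver): a scalar model of
 * the C.3/C.4 step above. After AND-ing staterr with dd_check and packing
 * the four 32-bit lanes down to 16 bits, each 16-bit lane of the low 64
 * bits is either 0x0001 (DD set) or 0x0000, so a single popcount yields
 * the number of completed descriptors. The dd_flags[] array below is
 * hypothetical and stands in for the four extracted DD bits.
 */
static inline unsigned int
count_dd_bits_scalar(const unsigned int dd_flags[4])
{
	unsigned int i, nb = 0;

	/* equivalent of popcount over the packed 16-bit lanes */
	for (i = 0; i < 4; i++)
		nb += dd_flags[i] & 1;

	return nb;
}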
static inline uint16_t
fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union fm10k_rx_desc *rxdp;
	struct rte_mbuf **mbufp;
	uint16_t nb_pkts_recd;
	int pos;
	struct fm10k_rx_queue *rxq = rx_queue;
	uint64_t var;
	__m128i shuf_msk;
	__m128i dd_check, eop_check;
	uint16_t next_dd;

	next_dd = rxq->next_dd;

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->hw_ring + next_dd;
	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_FM10K_RXQ_REARM_THRESH)
		fm10k_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->d.staterr & FM10K_RXD_STATUS_DD))
		return 0;

	/* Vector RX will process 4 packets at a time; strip the unaligned
	 * tail in case nb_pkts is not a multiple of 4.
	 */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_FM10K_DESCS_PER_LOOP);

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
		13, 12,      /* octet 12~13, 16 bits data_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		13, 12,      /* octet 12~13, low 16 bits pkt_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_type */
		0xFF, 0xFF   /* skip pkt_type field in shuffle operation */
		);

	/* Compile-time verify the shuffle mask.
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	mbufp = &rxq->sw_ring[next_dd];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_FM10K_DESCS_PER_LOOP,
			rxdp += RTE_FM10K_DESCS_PER_LOOP) {
		__m128i descs0[RTE_FM10K_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
		__m128i mbp1;
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf pointers */
		mbp1 = _mm_loadu_si128((__m128i *)&mbufp[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs0[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf pointers */
		mbp2 = _mm_loadu_si128((__m128i *)&mbufp[pos + 2]);
#endif

		descs0[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		rte_compiler_barrier();
		descs0[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		rte_compiler_barrier();
		descs0[0] = _mm_loadu_si128((__m128i *)(rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
#endif

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs0[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs0[2], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs0[3], descs0[2]);
		sterr_tmp1 = _mm_unpackhi_epi32(descs0[1], descs0[0]);

		/* set ol_flags with vlan packet type */
		fm10k_desc_to_olflags_v(descs0, &rx_pkts[pos]);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs0[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs0[0], shuf_msk);

		/* C.2 get 4 pkts staterr value */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
				pkt_mb3);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_FM10K_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		fm10k_desc_to_pktype_v(descs0, &rx_pkts[pos]);

		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_FM10K_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->next_dd = (uint16_t)(rxq->next_dd + nb_pkts_recd);
	rxq->next_dd = (uint16_t)(rxq->next_dd & (rxq->nb_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
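/*
 * Editor's illustrative sketch (not part of the driver): raw routines like
 * the one above are normally exposed through a thin burst wrapper. For the
 * single-mbuf (non-scattered) path the wrapper passes a NULL split_packet
 * pointer, so the EOP-tracking branches are skipped entirely. The wrapper
 * name below is hypothetical.
 */
static uint16_t
fm10k_recv_pkts_vec_sketch(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	/* NULL split_packet: every packet is assumed to fit in one mbuf */
	return fm10k_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}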
/*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_MAX_RX_BURST, only scan RTE_I40E_MAX_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	/* mask to shuffle from desc. to mbuf */
	uint8x16_t shuf_msk = {
		0xFF, 0xFF,  /* pkt_type set as unknown */
		0xFF, 0xFF,  /* pkt_type set as unknown */
		14, 15,      /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		14, 15,      /* octet 15~14, 16 bits data_len */
		2, 3,        /* octet 2~3, low 16 bits vlan_macip */
		4, 5, 6, 7   /* octet 4~7, 32bits rss */
		};

	uint8x16_t eop_check = {
		0x02, 0x00, 0x02, 0x00,
		0x02, 0x00, 0x02, 0x00,
		0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00
		};

	uint16x8_t crc_adjust = {
		0, 0,          /* ignore pkt_type field */
		rxq->crc_len,  /* sub crc on pkt_len */
		0,             /* ignore high-16bits of pkt_len */
		rxq->crc_len,  /* sub crc on data_len */
		0, 0, 0        /* ignore non-length fields */
		};

	/* nb_pkts has to be less than or equal to RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;
	rte_prefetch_non_temporal(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint16x8x2_t sterr_tmp1, sterr_tmp2;
		uint64x2_t mbp1, mbp2;
		uint16x8_t staterr;
		uint16x8_t tmp;
		uint64_t stat;

		int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

		/* B.1 load 2 mbuf pointers */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = vld1q_u64((uint64_t *)(rxdp + 3));
		rte_rmb();

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* B.1 load 2 more mbuf pointers */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		descs[2] = vld1q_u64((uint64_t *)(rxdp + 2));
		descs[1] = vld1q_u64((uint64_t *)(rxdp + 1));
		descs[0] = vld1q_u64((uint64_t *)(rxdp));

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned */
		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
					    len_shl);
		descs[3] = vreinterpretq_u64_u32(len3);
		uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
					    len_shl);
		descs[2] = vreinterpretq_u64_u32(len2);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
				       vreinterpretq_u16_u64(descs[3]));
		sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
				       vreinterpretq_u16_u64(descs[2]));

		/* C.2 get 4 pkts staterr value */
		staterr = vzipq_u16(sterr_tmp1.val[1],
				    sterr_tmp2.val[1]).val[0];

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned */
		uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
					    len_shl);
		descs[1] = vreinterpretq_u64_u32(len1);
		uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
					    len_shl);
		descs[0] = vreinterpretq_u64_u32(len0);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

		/* D.3 copy final 3,4 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
			 pkt_mb4);
		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
			 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);

		/* C* extract and record EOP bit */
		if (split_packet) {
			uint8x16_t eop_shuf_mask = {
					0x00, 0x02, 0x04, 0x06,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF};
			uint8x16_t eop_bits;

			/* and with mask to extract bits, flipping 1-0 */
			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
			eop_bits = vandq_u8(eop_bits, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 16-bit values to 8-bit
			 */
			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			vst1q_lane_u32((uint32_t *)split_packet,
				       vreinterpretq_u32_u8(eop_bits), 0);
			split_packet += RTE_I40E_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
		staterr = vreinterpretq_u16_s16(
				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
					    I40E_UINT16_BIT - 1));
		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

		rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);

		/* D.3 copy final 1,2 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb2);
		vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
			 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);

		/* C.4 calc available number of desc */
		if (unlikely(stat == 0)) {
			nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
		} else {
			nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
			break;
		}
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
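/*
 * Editor's illustrative sketch (not part of the driver): a scalar model of
 * the NEON DD-count at the end of the loop body above. After the left/
 * arithmetic-right shift pair, each 16-bit lane of staterr is all-ones when
 * its DD bit was set; inverting the low 64 bits and counting trailing zeros
 * then yields 16 bits per fully received descriptor. dd_lanes here is a
 * hypothetical value laid out like those low 64 bits.
 */
static inline unsigned int
count_contiguous_dd_scalar(uint64_t dd_lanes)
{
	uint64_t stat = ~dd_lanes;

	/* all 4 lanes set: 4 packets received in this iteration */
	if (stat == 0)
		return 4;

	/* otherwise trailing-zero count / 16 = number of leading DD lanes
	 * (16 = I40E_UINT16_BIT, the width of one staterr lane)
	 */
	return (unsigned int)(__builtin_ctzll(stat) / 16);
}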
/*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_MAX_RX_BURST, only scan RTE_I40E_MAX_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0,       /* ignore non-length fields */
				-rxq->crc_len, /* sub crc on data_len */
				0,             /* ignore high-16bits of pkt_len */
				-rxq->crc_len, /* sub crc on pkt_len */
				0, 0           /* ignore pkt_type field */
			);
	/*
	 * compile-time check the above crc_adjust layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi16
	 * call above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	__m128i dd_check, eop_check;

	/* nb_pkts has to be less than or equal to RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;
	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
		3, 2,        /* octet 2~3, low 16 bits vlan_macip */
		15, 14,      /* octet 15~14, 16 bits data_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		15, 14,      /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,  /* pkt_type set as unknown */
		0xFF, 0xFF   /* pkt_type set as unknown */
		);
	/*
	 * Compile-time verify the shuffle mask.
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		__m128i descs[RTE_I40E_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
		__m128i mbp1;
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf pointers */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf pointers */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
#endif

		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		rte_compiler_barrier();
		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		rte_compiler_barrier();
		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf pointers into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
#endif

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned */
		const __m128i len3 = _mm_slli_epi32(descs[3], PKTLEN_SHIFT);
		const __m128i len2 = _mm_slli_epi32(descs[2], PKTLEN_SHIFT);

		/* merge the now-aligned packet length fields back in */
		descs[3] = _mm_blend_epi16(descs[3], len3, 0x80);
		descs[2] = _mm_blend_epi16(descs[2], len2, 0x80);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned */
		const __m128i len1 = _mm_slli_epi32(descs[1], PKTLEN_SHIFT);
		const __m128i len0 = _mm_slli_epi32(descs[0], PKTLEN_SHIFT);

		/* merge the now-aligned packet length fields back in */
		descs[1] = _mm_blend_epi16(descs[1], len1, 0x80);
		descs[0] = _mm_blend_epi16(descs[0], len0, 0x80);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.2 get 4 pkts staterr value */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
				pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_I40E_DESCS_PER_LOOP;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);

		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_I40E_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
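/*
 * Editor's illustrative sketch (not part of the driver): a scalar model of
 * the PKTLEN_SHIFT/blend step above, assuming the driver's PKTLEN_SHIFT
 * constant. The i40e writeback descriptor keeps the packet length in the
 * upper bits of qword1, not on a 16-bit boundary, so the vector code shifts
 * each 32-bit lane left and blends only the top word (mask 0x80 = word 7)
 * back. Modeled on qword1 alone:
 */
static inline uint64_t
align_pktlen_scalar(uint64_t qword1)
{
	/* shift the upper 32-bit lane so the length field lands on the
	 * descriptor's top 16-bit word ...
	 */
	uint32_t hi = (uint32_t)(qword1 >> 32) << PKTLEN_SHIFT;

	/* ... then, like _mm_blend_epi16 with mask 0x80, merge only that
	 * word back and keep everything else from the original qword
	 */
	return (qword1 & 0x0000FFFFFFFFFFFFULL) |
			((uint64_t)(hi & 0xFFFF0000u) << 32);
}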