void
rte_ivshmem_metadata_dump(FILE *f, const char *name)
{
	unsigned i = 0;
	struct ivshmem_config *config;
	struct rte_ivshmem_metadata_entry *entry;
#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
	uint64_t addr;
	uint64_t end, hugepage_sz;
	struct memseg_cache_entry e;
#endif

	if (name == NULL)
		return;

	/* return error if we try to use an unknown config file */
	config = get_config_by_name(name);
	if (config == NULL) {
		RTE_LOG(ERR, EAL, "Cannot find IVSHMEM config %s!\n", name);
		return;
	}

	rte_spinlock_lock(&config->sl);

	entry = &config->metadata->entry[0];

	while (entry->mz.addr != NULL && i < RTE_DIM(config->metadata->entry)) {

		fprintf(f, "Entry %u: name:<%-20s>, phys:0x%-15lx, len:0x%-15lx, "
			"virt:%-15p, off:0x%-15lx\n",
			i,
			entry->mz.name,
			entry->mz.phys_addr,
			entry->mz.len,
			entry->mz.addr,
			entry->offset);
		i++;

#ifdef RTE_LIBRTE_IVSHMEM_DEBUG
		fprintf(f, "\tHugepage files:\n");

		hugepage_sz = entry->mz.hugepage_sz;
		addr = RTE_ALIGN_FLOOR(entry->mz.addr_64, hugepage_sz);
		end = addr + RTE_ALIGN_CEIL(entry->mz.len +
				(entry->mz.addr_64 - addr), hugepage_sz);

		for (; addr < end; addr += hugepage_sz) {
			memset(&e, 0, sizeof(e));

			get_hugefile_by_virt_addr(addr, &e);

			fprintf(f, "\t0x%"PRIx64 "-0x%" PRIx64
				" offset: 0x%" PRIx64 " %s\n",
				addr, addr + hugepage_sz, e.offset, e.filepath);
		}
#endif
		entry++;
	}

	rte_spinlock_unlock(&config->sl);
}
/**
 * Register mempool as a memory region.
 *
 * @param pd
 *   Pointer to protection domain.
 * @param mp
 *   Pointer to memory pool.
 *
 * @return
 *   Memory region pointer, NULL in case of error.
 */
struct ibv_mr *
mlx5_mp2mr(struct ibv_pd *pd, const struct rte_mempool *mp)
{
	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
	uintptr_t start = mp->elt_va_start;
	uintptr_t end = mp->elt_va_end;
	unsigned int i;

	DEBUG("mempool %p area start=%p end=%p size=%zu",
	      (const void *)mp, (void *)start, (void *)end,
	      (size_t)(end - start));
	/* Round start and end to page boundary if found in memory segments. */
	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
		uintptr_t addr = (uintptr_t)ms[i].addr;
		size_t len = ms[i].len;
		unsigned int align = ms[i].hugepage_sz;

		if ((start > addr) && (start < addr + len))
			start = RTE_ALIGN_FLOOR(start, align);
		if ((end > addr) && (end < addr + len))
			end = RTE_ALIGN_CEIL(end, align);
	}
	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
	      (const void *)mp, (void *)start, (void *)end,
	      (size_t)(end - start));
	return ibv_reg_mr(pd,
			  (void *)start,
			  end - start,
			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
}
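/*
 * Illustrative sketch only: how an address range is widened to hugepage
 * boundaries before registration, as mlx5_mp2mr() does above. The macros
 * below are simplified stand-ins for the rte_common.h versions and the
 * addresses are made up; this is not part of the driver.
 */
#include <assert.h>
#include <stdint.h>

#define EX_ALIGN_FLOOR(v, a) ((v) & ~((uintptr_t)(a) - 1))  /* power-of-two 'a' */
#define EX_ALIGN_CEIL(v, a)  EX_ALIGN_FLOOR((v) + (a) - 1, (a))

static void
round_range_example(void)
{
	const uintptr_t hugepage_sz = 2UL * 1024 * 1024;  /* assume 2 MB pages */
	uintptr_t start = 0x7f0000201000;                 /* hypothetical VA */
	uintptr_t end   = 0x7f00005ff000;

	start = EX_ALIGN_FLOOR(start, hugepage_sz);       /* 0x7f0000200000 */
	end   = EX_ALIGN_CEIL(end, hugepage_sz);          /* 0x7f0000600000 */

	/* the widened [start, end) now covers whole hugepages */
	assert(start % hugepage_sz == 0 && end % hugepage_sz == 0);
}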
/* create memory configuration in shared/mmap memory. Take out
 * a write lock on the memsegs, so we can auto-detect primary/secondary.
 * This means we never close the file while running (auto-close on exit).
 * We also don't lock the whole file, so that in future we can use read-locks
 * on other parts, e.g. memzones, to detect if there are running secondary
 * processes. */
static void
rte_eal_config_create(void)
{
	void *rte_mem_cfg_addr;
	int retval;

	const char *pathname = eal_runtime_config_path();

	if (internal_config.no_shconf)
		return;

	/* map the config before hugepage address so that we don't waste a page */
	if (internal_config.base_virtaddr != 0)
		rte_mem_cfg_addr = (void *)
			RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
			sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE));
	else
		rte_mem_cfg_addr = NULL;

	if (mem_cfg_fd < 0) {
		mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
		if (mem_cfg_fd < 0)
			rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
	}

	retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
	if (retval < 0) {
		close(mem_cfg_fd);
		rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
	}

	retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
	if (retval < 0) {
		close(mem_cfg_fd);
		rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
				"process running?\n", pathname);
	}

	rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
				PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);

	if (rte_mem_cfg_addr == MAP_FAILED) {
		rte_panic("Cannot mmap memory for rte_config\n");
	}
	memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
	rte_config.mem_config = (struct rte_mem_config *) rte_mem_cfg_addr;

	/* store address of the config in the config itself so that secondary
	 * processes could later map the config into this exact location */
	rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
}
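/*
 * Illustrative sketch only: detecting a second "primary" process via a
 * non-blocking fcntl() write lock, the same mechanism rte_eal_config_create()
 * relies on above. The path and the lock range here are hypothetical, not the
 * EAL's actual values (the EAL locks only part of its config file).
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
take_primary_lock(const char *path)
{
	struct flock wr_lock = {
		.l_type = F_WRLCK,	/* exclusive (write) lock */
		.l_whence = SEEK_SET,
		.l_start = 0,		/* hypothetical range: whole file */
		.l_len = 0,		/* 0 = to end of file */
	};
	int fd = open(path, O_RDWR | O_CREAT, 0660);

	if (fd < 0)
		return -1;
	/* F_SETLK fails immediately if another process already holds the lock */
	if (fcntl(fd, F_SETLK, &wr_lock) < 0) {
		fprintf(stderr, "another primary process seems to be running\n");
		close(fd);
		return -1;
	}
	return fd;	/* keep fd open for the process lifetime */
}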
/*
 * calculate the starting point of where data of the requested size
 * and alignment would fit in the current element. If the data doesn't
 * fit, return NULL.
 */
static void *
elem_start_pt(struct malloc_elem *elem, size_t size, unsigned align,
		size_t bound)
{
	const size_t bmask = ~(bound - 1);
	uintptr_t end_pt = (uintptr_t)elem +
			elem->size - MALLOC_ELEM_TRAILER_LEN;
	uintptr_t new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
	uintptr_t new_elem_start;

	/* check boundary */
	if ((new_data_start & bmask) != ((end_pt - 1) & bmask)) {
		end_pt = RTE_ALIGN_FLOOR(end_pt, bound);
		new_data_start = RTE_ALIGN_FLOOR((end_pt - size), align);
		if (((end_pt - 1) & bmask) != (new_data_start & bmask))
			return NULL;
	}

	new_elem_start = new_data_start - MALLOC_ELEM_HEADER_LEN;

	/* if the new start point is before the existing start, it won't fit */
	return (new_elem_start < (uintptr_t)elem) ? NULL : (void *)new_elem_start;
}
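/*
 * Illustrative sketch only: the boundary check in elem_start_pt() worked
 * through with concrete numbers. A plain macro stands in for
 * RTE_ALIGN_FLOOR(); the addresses and sizes are invented for the example.
 */
#include <assert.h>
#include <stdint.h>

#define EX_ALIGN_FLOOR(v, a) ((uintptr_t)(v) & ~((uintptr_t)(a) - 1))

static void
boundary_check_example(void)
{
	const size_t bound = 0x1000;		/* data must not cross a 4 KB boundary */
	const size_t bmask = ~(bound - 1);
	uintptr_t end_pt = 0x10000FF0;		/* hypothetical element end */
	size_t size = 0x40, align = 64;
	uintptr_t new_data_start = EX_ALIGN_FLOOR(end_pt - size, align); /* 0x10000F80 */

	/* 0x10000F80 and 0x10000FEF share the same 4 KB page: no crossing */
	assert((new_data_start & bmask) == ((end_pt - 1) & bmask));
}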
/* FAIL() and MAX_NUM are provided by the surrounding test harness. */
static int
test_align(void)
{
#define FAIL_ALIGN(x, i, p)\
	{printf(x "() test failed: %u %u\n", i, p);\
	return -1;}
#define ERROR_FLOOR(res, i, pow) \
	(res % pow) ||			/* check if not aligned */ \
	((res / pow) != (i / pow))	/* check if correct alignment */
#define ERROR_CEIL(res, i, pow) \
	(res % pow) ||			/* check if not aligned */ \
		((i % pow) == 0 ?	/* check if ceiling is invoked */ \
		res / pow != i / pow :	/* if aligned */ \
		res / pow != (i / pow) + 1)	/* if not aligned, hence +1 */

	uint32_t i, p, val;

	for (i = 1, p = 1; i <= MAX_NUM; i ++) {
		if (rte_align32pow2(i) != p)
			FAIL_ALIGN("rte_align32pow2", i, p);
		if (i == p)
			p <<= 1;
	}

	for (p = 2; p <= MAX_NUM; p <<= 1) {

		if (!rte_is_power_of_2(p))
			FAIL("rte_is_power_of_2");

		for (i = 1; i <= MAX_NUM; i++) {
			/* align floor */
			if (RTE_ALIGN_FLOOR((uintptr_t)i, p) % p)
				FAIL_ALIGN("RTE_ALIGN_FLOOR", i, p);

			val = RTE_PTR_ALIGN_FLOOR((uintptr_t) i, p);
			if (ERROR_FLOOR(val, i, p))
				FAIL_ALIGN("RTE_PTR_ALIGN_FLOOR", i, p);

			val = RTE_ALIGN_FLOOR(i, p);
			if (ERROR_FLOOR(val, i, p))
				FAIL_ALIGN("RTE_ALIGN_FLOOR", i, p);

			/* align ceiling */
			val = RTE_PTR_ALIGN((uintptr_t) i, p);
			if (ERROR_CEIL(val, i, p))
				FAIL_ALIGN("RTE_PTR_ALIGN", i, p);

			val = RTE_ALIGN(i, p);
			if (ERROR_CEIL(val, i, p))
				FAIL_ALIGN("RTE_ALIGN", i, p);

			val = RTE_ALIGN_CEIL(i, p);
			if (ERROR_CEIL(val, i, p))
				FAIL_ALIGN("RTE_ALIGN_CEIL", i, p);

			val = RTE_PTR_ALIGN_CEIL((uintptr_t)i, p);
			if (ERROR_CEIL(val, i, p))
				FAIL_ALIGN("RTE_PTR_ALIGN_CEIL", i, p);

			/* by this point we know that val is aligned to p */
			if (!rte_is_aligned((void*)(uintptr_t) val, p))
				FAIL("rte_is_aligned");
		}
	}
	return 0;
}
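/*
 * Illustrative sketch only: simplified stand-ins showing the semantics the
 * test above exercises. These are not the rte_common.h definitions, just the
 * usual power-of-two floor/ceil idiom, with a couple of sanity checks.
 */
#include <assert.h>
#include <stdint.h>

#define EX_ALIGN_FLOOR(v, a) ((v) & ~((uint32_t)(a) - 1))	/* round down */
#define EX_ALIGN_CEIL(v, a)  EX_ALIGN_FLOOR((v) + (a) - 1, a)	/* round up */

static void
align_semantics_example(void)
{
	assert(EX_ALIGN_FLOOR(1000u, 64) == 960u);
	assert(EX_ALIGN_CEIL(1000u, 64) == 1024u);
	/* already-aligned values are left untouched by both */
	assert(EX_ALIGN_FLOOR(1024u, 64) == 1024u);
	assert(EX_ALIGN_CEIL(1024u, 64) == 1024u);
}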
/*
 * vPMD raw receive routine, only accepts nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP.
 *
 * Notice:
 * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
 *   numbers of DD bit
 * - floor align nb_pkts to a multiple of RTE_IXGBE_DESCS_PER_LOOP
 * - don't support ol_flags for rss and csum err
 */
static inline uint16_t
_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union ixgbe_adv_rx_desc *rxdp;
	struct ixgbe_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0,	/* ignore non-length fields */
				-rxq->crc_len,	/* sub crc on data_len */
				0,		/* ignore high-16bits of pkt_len */
				-rxq->crc_len,	/* sub crc on pkt_len */
				0, 0		/* ignore pkt_type field */
			);
	__m128i dd_check, eop_check;

	/* nb_pkts shall be less than or equal to RTE_IXGBE_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	_mm_prefetch((const void *)rxdp, _MM_HINT_T0);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
		ixgbe_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.upper.status_error &
				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
		return 0;

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,	/* octet 4~7, 32bits rss */
		15, 14,		/* octet 14~15, low 16 bits vlan_macip */
		13, 12,		/* octet 12~13, 16 bits data_len */
		0xFF, 0xFF,	/* skip high 16 bits pkt_len, zero out */
		13, 12,		/* octet 12~13, low 16 bits pkt_len */
		0xFF, 0xFF,	/* skip 32 bit pkt_type */
		0xFF, 0xFF
		);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_IXGBE_DESCS_PER_LOOP,
			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
		__m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		__m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */

		/* B.1 load 1 mbuf point */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));

		/* B.2 copy 2 mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

		/* B.1 load 1 mbuf point */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);

		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		/* B.1 load 2 mbuf point */
		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		/* set ol_flags with vlan packet type */
		desc_to_olflags_v(descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* C.2 get 4 pkts staterr value */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_IXGBE_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
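/*
 * Illustrative sketch only: a scalar model of steps C.3/C.4 above. The DD bit
 * of each of the four descriptors is packed into a small mask and popcount
 * gives how many completed; a count short of the loop width means the ring
 * ran dry, so the outer loop stops. The bit layout here is simplified, not
 * the ixgbe descriptor format.
 */
#include <stdint.h>

#define EX_DESCS_PER_LOOP 4

static unsigned
count_done_descs(const uint32_t staterr[EX_DESCS_PER_LOOP])
{
	uint64_t dd_bits = 0;
	unsigned i;

	for (i = 0; i < EX_DESCS_PER_LOOP; i++)
		dd_bits |= (uint64_t)(staterr[i] & 0x1) << i;	/* DD assumed in bit 0 */

	return __builtin_popcountll(dd_bits);
	/* caller: if the result != EX_DESCS_PER_LOOP, stop scanning the ring */
}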
/*
 * This is a complex function. What it does is the following:
 * 1. Goes through metadata and gets list of hugepages involved
 * 2. Sorts the hugepages by size (1G first)
 * 3. Goes through metadata again and writes correct offsets
 * 4. Goes through pages and finds out their filenames, offsets etc.
 */
static int
build_config(struct rte_ivshmem_metadata * metadata)
{
	struct rte_ivshmem_metadata_entry * e_local;
	struct memseg_cache_entry * ms_local;
	struct rte_memseg pages[IVSHMEM_MAX_PAGES];
	struct rte_ivshmem_metadata_entry *entry;
	struct memseg_cache_entry * c_entry, * prev_entry;
	struct ivshmem_config * config;
	unsigned i, j, mz_iter, ms_iter;
	uint64_t biggest_len;
	int biggest_idx;

	/* return error if we try to use an unknown config file */
	config = get_config_by_name(metadata->name);
	if (config == NULL) {
		RTE_LOG(ERR, EAL, "Cannot find IVSHMEM config %s!\n", metadata->name);
		goto fail_e;
	}

	memset(pages, 0, sizeof(pages));

	e_local = malloc(sizeof(config->metadata->entry));
	if (e_local == NULL)
		goto fail_e;

	ms_local = malloc(sizeof(config->memseg_cache));
	if (ms_local == NULL)
		goto fail_ms;

	/* make local copies before doing anything */
	memcpy(e_local, config->metadata->entry, sizeof(config->metadata->entry));
	memcpy(ms_local, config->memseg_cache, sizeof(config->memseg_cache));

	qsort(e_local, RTE_DIM(config->metadata->entry),
			sizeof(struct rte_ivshmem_metadata_entry),
			entry_compare);

	/* first pass - collect all huge pages */
	for (mz_iter = 0; mz_iter < RTE_DIM(config->metadata->entry); mz_iter++) {

		entry = &e_local[mz_iter];

		uint64_t start_addr = RTE_ALIGN_FLOOR(entry->mz.addr_64,
				entry->mz.hugepage_sz);
		uint64_t offset = entry->mz.addr_64 - start_addr;
		uint64_t len = RTE_ALIGN_CEIL(entry->mz.len + offset,
				entry->mz.hugepage_sz);

		if (entry->mz.addr_64 == 0 || start_addr == 0 || len == 0)
			continue;

		int start_page;

		/* find first unused page - mz are phys_addr sorted so we don't have to
		 * look out for holes */
		for (i = 0; i < RTE_DIM(pages); i++) {
			/* skip if we already have this page */
			if (pages[i].addr_64 == start_addr) {
				start_addr += entry->mz.hugepage_sz;
				len -= entry->mz.hugepage_sz;
				continue;
			}
			/* we found a new page */
			else if (pages[i].addr_64 == 0) {
				start_page = i;
				break;
			}
		}
		if (i == RTE_DIM(pages)) {
			RTE_LOG(ERR, EAL, "Cannot find unused page!\n");
			goto fail;
		}

		/* populate however many pages the memzone has */
		for (i = start_page; i < RTE_DIM(pages) && len != 0; i++) {

			pages[i].addr_64 = start_addr;
			pages[i].len = entry->mz.hugepage_sz;
			start_addr += entry->mz.hugepage_sz;
			len -= entry->mz.hugepage_sz;
		}
		/* if there's still length left */
		if (len != 0) {
			RTE_LOG(ERR, EAL, "Not enough space for pages!\n");
			goto fail;
		}
	}

	/* second pass - sort pages by size */
	for (i = 0; i < RTE_DIM(pages); i++) {

		if (pages[i].addr == NULL)
			break;

		biggest_len = 0;
		biggest_idx = -1;

		/*
		 * browse all entries starting at 'i', and find the
		 * entry with the biggest length
		 */
		for (j = i; j < RTE_DIM(pages); j++) {

			if (pages[j].addr == NULL)
				break;

			if (biggest_len == 0 ||
				pages[j].len > biggest_len) {
				biggest_len = pages[j].len;
				biggest_idx = j;
			}
		}

		/* should not happen */
		if (biggest_idx == -1) {
			RTE_LOG(ERR, EAL, "Error sorting by size!\n");
			goto fail;
		}

		if (i != (unsigned) biggest_idx) {
			struct rte_memseg tmp;

			memcpy(&tmp, &pages[biggest_idx], sizeof(struct rte_memseg));

			/* we don't want to break contiguousness, so instead of just
			 * swapping segments, we move all the preceding segments to the
			 * right and then put the old segment @ biggest_idx in place of
			 * segment @ i (counting down so the unsigned index cannot
			 * underflow) */
			for (j = biggest_idx; j > i; j--) {
				memcpy(&pages[j], &pages[j-1],
						sizeof(struct rte_memseg));
				memset(&pages[j-1], 0,
						sizeof(struct rte_memseg));
			}

			/* put old biggest segment to its new place */
			memcpy(&pages[i], &tmp, sizeof(struct rte_memseg));
		}
	}

	/* third pass - write correct offsets */
	for (mz_iter = 0; mz_iter < RTE_DIM(config->metadata->entry); mz_iter++) {

		uint64_t offset = 0;

		entry = &e_local[mz_iter];

		if (entry->mz.addr_64 == 0)
			break;

		/* find page for current memzone */
		for (i = 0; i < RTE_DIM(pages); i++) {
			/* we found our page */
			if (entry->mz.addr_64 >= pages[i].addr_64 &&
					entry->mz.addr_64 < pages[i].addr_64 +
					pages[i].len) {
				entry->offset = (entry->mz.addr_64 - pages[i].addr_64)
						+ offset;
				break;
			}
			offset += pages[i].len;
		}
		if (i == RTE_DIM(pages)) {
			RTE_LOG(ERR, EAL, "Page not found!\n");
			goto fail;
		}
	}

	ms_iter = 0;
	prev_entry = NULL;

	/* fourth pass - create proper memseg cache */
	for (i = 0; i < RTE_DIM(pages) &&
			ms_iter <= RTE_DIM(config->memseg_cache); i++) {
		if (pages[i].addr_64 == 0)
			break;

		if (ms_iter == RTE_DIM(pages)) {
			RTE_LOG(ERR, EAL, "The universe has collapsed!\n");
			goto fail;
		}

		c_entry = &ms_local[ms_iter];
		c_entry->len = pages[i].len;

		if (get_hugefile_by_virt_addr(pages[i].addr_64, c_entry) < 0)
			goto fail;

		/* if previous entry has the same filename and is contiguous,
		 * clear current entry and increase previous entry's length
		 */
		if (prev_entry != NULL &&
				strncmp(c_entry->filepath, prev_entry->filepath,
				sizeof(c_entry->filepath)) == 0 &&
				prev_entry->offset + prev_entry->len ==
				c_entry->offset) {
			prev_entry->len += pages[i].len;
			memset(c_entry, 0, sizeof(struct memseg_cache_entry));
		}
		else {
			prev_entry = c_entry;
			ms_iter++;
		}
	}

	/* update current configuration with new valid data */
	memcpy(config->metadata->entry, e_local,
			sizeof(config->metadata->entry));
	memcpy(config->memseg_cache, ms_local,
			sizeof(config->memseg_cache));

	free(ms_local);
	free(e_local);

	return 0;

fail:
	free(ms_local);
fail_ms:
	free(e_local);
fail_e:
	return -1;
}
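/*
 * Illustrative sketch only: the "move preceding segments right, then drop the
 * largest one into slot i" step from the second pass above, on a plain array
 * so the contiguity-preserving rotation is easy to see. The struct is a
 * simplified stand-in, not rte_memseg.
 */
struct ex_seg {
	unsigned long addr;
	unsigned long len;
};

static void
rotate_largest_into_place(struct ex_seg *pages, unsigned i, unsigned biggest_idx)
{
	struct ex_seg tmp = pages[biggest_idx];
	unsigned j;

	/* shift pages[i .. biggest_idx-1] one slot to the right */
	for (j = biggest_idx; j > i; j--)
		pages[j] = pages[j - 1];

	/* the former largest segment now sits at position i */
	pages[i] = tmp;
}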
/*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	/* mask to shuffle from desc. to mbuf */
	uint8x16_t shuf_msk = {
		0xFF, 0xFF,   /* pkt_type set as unknown */
		0xFF, 0xFF,   /* pkt_type set as unknown */
		14, 15,       /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
		14, 15,       /* octet 15~14, 16 bits data_len */
		2, 3,         /* octet 2~3, low 16 bits vlan_macip */
		4, 5, 6, 7    /* octet 4~7, 32bits rss */
		};

	uint8x16_t eop_check = {
		0x02, 0x00, 0x02, 0x00,
		0x02, 0x00, 0x02, 0x00,
		0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00
		};

	uint16x8_t crc_adjust = {
		0, 0,         /* ignore pkt_type field */
		rxq->crc_len, /* sub crc on pkt_len */
		0,            /* ignore high-16bits of pkt_len */
		rxq->crc_len, /* sub crc on data_len */
		0, 0, 0       /* ignore non-length fields */
		};

	/* nb_pkts shall be less than or equal to RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch_non_temporal(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint16x8x2_t sterr_tmp1, sterr_tmp2;
		uint64x2_t mbp1, mbp2;
		uint16x8_t staterr;
		uint16x8_t tmp;
		uint64_t stat;

		int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

		/* B.1 load 1 mbuf point */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
		rte_rmb();

		/* B.2 copy 2 mbuf point into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* B.1 load 1 mbuf point */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
		/* B.1 load 2 mbuf point */
		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
		descs[0] =  vld1q_u64((uint64_t *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
					    len_shl);
		descs[3] = vreinterpretq_u64_u32(len3);
		uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
					    len_shl);
		descs[2] = vreinterpretq_u64_u32(len2);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
				       vreinterpretq_u16_u64(descs[3]));
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
				       vreinterpretq_u16_u64(descs[2]));

		/* C.2 get 4 pkts staterr value */
		staterr = vzipq_u16(sterr_tmp1.val[1],
				    sterr_tmp2.val[1]).val[0];

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
					    len_shl);
		descs[1] = vreinterpretq_u64_u32(len1);
		uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
					    len_shl);
		descs[0] = vreinterpretq_u64_u32(len0);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

		/* D.3 copy final 3,4 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
			 pkt_mb4);
		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
			 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);

		/* C* extract and record EOP bit */
		if (split_packet) {
			uint8x16_t eop_shuf_mask = {
					0x00, 0x02, 0x04, 0x06,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF};
			uint8x16_t eop_bits;

			/* and with mask to extract bits, flipping 1-0 */
			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
			eop_bits = vandq_u8(eop_bits, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			vst1q_lane_u32((uint32_t *)split_packet,
				       vreinterpretq_u32_u8(eop_bits), 0);
			split_packet += RTE_I40E_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
		staterr = vreinterpretq_u16_s16(
				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
					    I40E_UINT16_BIT - 1));
		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

		rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);

		/* D.3 copy final 1,2 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb2);
		vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
			 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		if (unlikely(stat == 0)) {
			nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
		} else {
			nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
			break;
		}
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
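/*
 * Illustrative sketch only: the NEON tail of the loop above widens each DD
 * bit to a full 16-bit lane, so after inverting, counting trailing zero bits
 * and dividing by 16 yields how many leading descriptors completed. A scalar
 * model with an invented lane mask follows; this is not the driver code.
 */
#include <stdint.h>

#define EX_UINT16_BIT 16

static unsigned
leading_done_from_lanes(uint64_t staterr_lanes)
{
	/* e.g. lanes 0 and 1 done: staterr_lanes == 0x00000000ffffffff */
	uint64_t stat = ~staterr_lanes;

	if (stat == 0)
		return 64 / EX_UINT16_BIT;		/* all four lanes done */
	return __builtin_ctzll(stat) / EX_UINT16_BIT;	/* count leading done lanes */
}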
static inline uint16_t
fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union fm10k_rx_desc *rxdp;
	struct rte_mbuf **mbufp;
	uint16_t nb_pkts_recd;
	int pos;
	struct fm10k_rx_queue *rxq = rx_queue;
	uint64_t var;
	__m128i shuf_msk;
	__m128i dd_check, eop_check;
	uint16_t next_dd;

	next_dd = rxq->next_dd;

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->hw_ring + next_dd;

	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_FM10K_RXQ_REARM_THRESH)
		fm10k_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->d.staterr & FM10K_RXD_STATUS_DD))
		return 0;

	/* Vector RX will process 4 packets at a time, strip the unaligned
	 * tails in case it's not a multiple of 4.
	 */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_FM10K_DESCS_PER_LOOP);

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,	/* octet 4~7, 32bits rss */
		15, 14,		/* octet 14~15, low 16 bits vlan_macip */
		13, 12,		/* octet 12~13, 16 bits data_len */
		0xFF, 0xFF,	/* skip high 16 bits pkt_len, zero out */
		13, 12,		/* octet 12~13, low 16 bits pkt_len */
		0xFF, 0xFF,	/* skip high 16 bits pkt_type */
		0xFF, 0xFF	/* Skip pkt_type field in shuffle operation */
		);
	/*
	 * Compile-time verify the shuffle mask
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	mbufp = &rxq->sw_ring[next_dd];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_FM10K_DESCS_PER_LOOP,
			rxdp += RTE_FM10K_DESCS_PER_LOOP) {
		__m128i descs0[RTE_FM10K_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		__m128i mbp1;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */
		mbp1 = _mm_loadu_si128((__m128i *)&mbufp[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs0[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf points */
		mbp2 = _mm_loadu_si128((__m128i *)&mbufp[pos+2]);
#endif

		descs0[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		rte_compiler_barrier();
		/* B.1 load 2 mbuf point */
		descs0[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		rte_compiler_barrier();
		descs0[0] = _mm_loadu_si128((__m128i *)(rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
#endif

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs0[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs0[2], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs0[3], descs0[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs0[1], descs0[0]);

		/* set ol_flags with vlan packet type */
		fm10k_desc_to_olflags_v(descs0, &rx_pkts[pos]);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs0[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs0[0], shuf_msk);

		/* C.2 get 4 pkts staterr value */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				pkt_mb3);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_FM10K_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		fm10k_desc_to_pktype_v(descs0, &rx_pkts[pos]);

		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_FM10K_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->next_dd = (uint16_t)(rxq->next_dd + nb_pkts_recd);
	rxq->next_dd = (uint16_t)(rxq->next_dd & (rxq->nb_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
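/*
 * Illustrative sketch only: the next_dd/rx_tail update at the end of the
 * routines above relies on the descriptor count being a power of two, so a
 * mask can replace the modulo when the index wraps. Constants are invented.
 */
#include <stdint.h>

static uint16_t
advance_ring_index(uint16_t idx, uint16_t nb_recd, uint16_t nb_desc)
{
	/* nb_desc must be a power of two for the mask to equal idx % nb_desc */
	return (uint16_t)((idx + nb_recd) & (nb_desc - 1));
}

/* e.g. advance_ring_index(510, 4, 512) == 2 */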
/*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0,	/* ignore non-length fields */
				-rxq->crc_len,	/* sub crc on data_len */
				0,		/* ignore high-16bits of pkt_len */
				-rxq->crc_len,	/* sub crc on pkt_len */
				0, 0		/* ignore pkt_type field */
			);
	/*
	 * compile-time check the above crc_adjust layout is correct.
	 * NOTE: the first field (lowest address) is given last in set_epi16
	 * call above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	__m128i dd_check, eop_check;

	/* nb_pkts shall be less than or equal to RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch0(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,	/* octet 4~7, 32bits rss */
		3, 2,		/* octet 2~3, low 16 bits vlan_macip */
		15, 14,		/* octet 15~14, 16 bits data_len */
		0xFF, 0xFF,	/* skip high 16 bits pkt_len, zero out */
		15, 14,		/* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,	/* pkt_type set as unknown */
		0xFF, 0xFF	/* pkt_type set as unknown */
		);
	/*
	 * Compile-time verify the shuffle mask
	 * NOTE: some field positions already verified above, but duplicated
	 * here for completeness in case of future modifications.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		__m128i descs[RTE_I40E_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
		__m128i mbp1;
#if defined(RTE_ARCH_X86_64)
		__m128i mbp2;
#endif

		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
		rte_compiler_barrier();

		/* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
		/* B.1 load 2 64 bit mbuf points */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);
#endif

		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		rte_compiler_barrier();
		/* B.1 load 2 mbuf point */
		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		rte_compiler_barrier();
		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

#if defined(RTE_ARCH_X86_64)
		/* B.2 copy 2 mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
#endif

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
		const __m128i len3 = _mm_slli_epi32(descs[3], PKTLEN_SHIFT);
		const __m128i len2 = _mm_slli_epi32(descs[2], PKTLEN_SHIFT);

		/* merge the now-aligned packet length fields back in */
		descs[3] = _mm_blend_epi16(descs[3], len3, 0x80);
		descs[2] = _mm_blend_epi16(descs[2], len2, 0x80);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
		const __m128i len1 = _mm_slli_epi32(descs[1], PKTLEN_SHIFT);
		const __m128i len0 = _mm_slli_epi32(descs[0], PKTLEN_SHIFT);

		/* merge the now-aligned packet length fields back in */
		descs[1] = _mm_blend_epi16(descs[1], len1, 0x80);
		descs[0] = _mm_blend_epi16(descs[0], len0, 0x80);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.2 get 4 pkts staterr value */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_I40E_DESCS_PER_LOOP;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_I40E_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
/* virtio vPMD receive routine, only accepts nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP.
 *
 * This routine is for non-mergeable RX, one desc for each guest buffer.
 * This routine is based on the RX ring layout optimization. Each entry in the
 * avail ring points to the desc with the same index in the desc ring and this
 * will never be changed in the driver.
 *
 * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
 */
uint16_t
virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
	uint16_t nb_pkts)
{
	struct virtnet_rx *rxvq = rx_queue;
	struct virtqueue *vq = rxvq->vq;
	uint16_t nb_used;
	uint16_t desc_idx;
	struct vring_used_elem *rused;
	struct rte_mbuf **sw_ring;
	struct rte_mbuf **sw_ring_end;
	uint16_t nb_pkts_received;

	uint8x16_t shuf_msk1 = {
		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
		4, 5, 0xFF, 0xFF,       /* pkt len */
		4, 5,                   /* dat len */
		0xFF, 0xFF,             /* vlan tci */
		0xFF, 0xFF, 0xFF, 0xFF
	};

	uint8x16_t shuf_msk2 = {
		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
		12, 13, 0xFF, 0xFF,     /* pkt len */
		12, 13,                 /* dat len */
		0xFF, 0xFF,             /* vlan tci */
		0xFF, 0xFF, 0xFF, 0xFF
	};

	/* Subtract the header length.
	 * In which case do we need the header length in used->len ?
	 */
	uint16x8_t len_adjust = {
		0, 0,
		(uint16_t)vq->hw->vtnet_hdr_size, 0,
		(uint16_t)vq->hw->vtnet_hdr_size,
		0,
		0, 0
	};

	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
		return 0;

	nb_used = VIRTQUEUE_NUSED(vq);

	rte_rmb();

	if (unlikely(nb_used == 0))
		return 0;

	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
	nb_used = RTE_MIN(nb_used, nb_pkts);

	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
	rused = &vq->vq_ring.used->ring[desc_idx];
	sw_ring  = &vq->sw_ring[desc_idx];
	sw_ring_end = &vq->sw_ring[vq->vq_nentries];

	rte_prefetch_non_temporal(rused);

	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
		virtio_rxq_rearm_vec(rxvq);
		if (unlikely(virtqueue_kick_prepare(vq)))
			virtqueue_notify(vq);
	}

	for (nb_pkts_received = 0;
		nb_pkts_received < nb_used;) {
		uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
		uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
		uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];

		mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0));
		desc[0] = vld1q_u64((uint64_t *)(rused + 0));
		vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]);

		mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2));
		desc[1] = vld1q_u64((uint64_t *)(rused + 2));
		vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]);

		mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4));
		desc[2] = vld1q_u64((uint64_t *)(rused + 4));
		vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]);

		mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6));
		desc[3] = vld1q_u64((uint64_t *)(rused + 6));
		vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]);

		pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[0]), shuf_msk2));
		pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[0]), shuf_msk1));
		pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
		pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
		vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1,
			pkt_mb[1]);
		vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1,
			pkt_mb[0]);

		pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[1]), shuf_msk2));
		pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[1]), shuf_msk1));
		pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
		pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
		vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1,
			pkt_mb[3]);
		vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1,
			pkt_mb[2]);

		pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[2]), shuf_msk2));
		pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[2]), shuf_msk1));
		pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[5]), len_adjust));
		pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[4]), len_adjust));
		vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1,
			pkt_mb[5]);
		vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1,
			pkt_mb[4]);

		pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[3]), shuf_msk2));
		pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[3]), shuf_msk1));
		pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[7]), len_adjust));
		pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[6]), len_adjust));
		vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1,
			pkt_mb[7]);
		vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1,
			pkt_mb[6]);

		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
			if (sw_ring + nb_used <= sw_ring_end)
				nb_pkts_received += nb_used;
			else
				nb_pkts_received += sw_ring_end - sw_ring;
			break;
		} else {
			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
				sw_ring_end)) {
				nb_pkts_received += sw_ring_end - sw_ring;
				break;
			} else {
				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;

				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
				rused   += RTE_VIRTIO_DESC_PER_LOOP;
				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
			}
		}
	}

	vq->vq_used_cons_idx += nb_pkts_received;
	vq->vq_free_cnt += nb_pkts_received;
	rxvq->stats.packets += nb_pkts_received;
	return nb_pkts_received;
}
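/*
 * Illustrative sketch only: the wrap handling at the bottom of
 * virtio_recv_pkts_vec() above, which never lets a fixed-size burst read past
 * sw_ring_end and instead clamps the count to the entries left before the end
 * of the ring. Names echo the function but this is not driver code.
 */
#include <stddef.h>

static unsigned
clamp_burst_to_ring_end(size_t ring_pos, size_t ring_size, unsigned burst)
{
	size_t left = ring_size - ring_pos;	/* entries before sw_ring_end */

	return (unsigned)(left < burst ? left : burst);
}

/* e.g. with 3 entries left in the ring and an 8-descriptor burst, only 3 are
 * consumed and the caller stops for this call.
 */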