static int dpdk_knidev_writer_tx(void *port, struct rte_mbuf *pkt) { struct dpdk_knidev_writer *p = (struct dpdk_knidev_writer *) port; struct rte_mbuf *pkt_copy; /* * KNI kernel module uses a trick to speed up packet processing. It takes * a physical address of a memory pool, converts it to the kernel virtual * address with phys_to_virt() and saves the address. * * Then in kni_net_rx_normal() instead of using phys_to_virt() per each * packet, KNI just calculates the difference between the previously * converted physical address of the given mempool and the packets * physical address. * * It works well for the mbufs from the same mempool. It also works fine * with any mempool allocated from the same physically contiguous memory * segment. * * As soon as we get a mempool allocated from another memory segment, the * difference calculations fail and thus we might have a crash. * * So we make sure the packet is from the RSS mempool. If not, we make * a copy to the RSS mempool. */ if (unlikely(pkt->pool != vr_dpdk.rss_mempool || /* Check indirect mbuf's data is within the RSS mempool. */ rte_pktmbuf_mtod(pkt, uintptr_t) < vr_dpdk.rss_mempool->elt_va_start || rte_pktmbuf_mtod(pkt, uintptr_t) > vr_dpdk.rss_mempool->elt_va_end )) { pkt_copy = vr_dpdk_pktmbuf_copy(pkt, vr_dpdk.rss_mempool); /* The original mbuf is no longer needed. */ vr_dpdk_pfree(pkt, VP_DROP_CLONED_ORIGINAL); if (unlikely(pkt_copy == NULL)) { DPDK_KNIDEV_WRITER_STATS_PKTS_DROP_ADD(p, 1); return -1; } pkt = pkt_copy; } p->tx_buf[p->tx_buf_count++] = pkt; DPDK_KNIDEV_WRITER_STATS_PKTS_IN_ADD(p, 1); if (p->tx_buf_count >= p->tx_burst_sz) send_burst(p); return 0; }
/** * Iterator function for rte_mempool_walk() to register existing mempools and * fill the MP to MR cache of a TX queue. * * @param[in] mp * Memory Pool to register. * @param *arg * Pointer to TX queue structure. */ void txq_mp2mr_iter(const struct rte_mempool *mp, void *arg) { struct txq *txq = arg; struct txq_mp2mr_mbuf_check_data data = { .mp = mp, .ret = -1, }; /* Discard empty mempools. */ if (mp->size == 0) return; /* Register mempool only if the first element looks like a mbuf. */ rte_mempool_obj_iter((void *)mp->elt_va_start, 1, mp->header_size + mp->elt_size + mp->trailer_size, 1, mp->elt_pa, mp->pg_num, mp->pg_shift, txq_mp2mr_mbuf_check, &data); if (data.ret) return; txq_mp2mr(txq, mp); } /** * Insert VLAN using mbuf headroom space. * * @param buf * Buffer for VLAN insertion. * * @return * 0 on success, errno value on failure. */ static inline int insert_vlan_sw(struct rte_mbuf *buf) { uintptr_t addr; uint32_t vlan; uint16_t head_room_len = rte_pktmbuf_headroom(buf); if (head_room_len < 4) return EINVAL; addr = rte_pktmbuf_mtod(buf, uintptr_t); vlan = htonl(0x81000000 | buf->vlan_tci); memmove((void *)(addr - 4), (void *)addr, 12); memcpy((void *)(addr + 8), &vlan, sizeof(vlan)); SET_DATA_OFF(buf, head_room_len - 4); DATA_LEN(buf) += 4; return 0; }
tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt, struct rte_mbuf *buf, unsigned int elts_head, struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N]) { unsigned int sent_size = 0; unsigned int j; int linearize = 0; /* When there are too many segments, extra segments are * linearized in the last SGE. */ if (unlikely(segs > RTE_DIM(*sges))) { segs = (RTE_DIM(*sges) - 1); linearize = 1; } /* Update element. */ elt->buf = buf; /* Register segments as SGEs. */ for (j = 0; (j != segs); ++j) { struct ibv_sge *sge = &(*sges)[j]; uint32_t lkey; /* Retrieve Memory Region key for this memory pool. */ lkey = txq_mp2mr(txq, txq_mb2mp(buf)); if (unlikely(lkey == (uint32_t)-1)) { /* MR does not exist. */ DEBUG("%p: unable to get MP <-> MR association", (void *)txq); /* Clean up TX element. */ elt->buf = NULL; goto stop; } /* Update SGE. */ sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); if (txq->priv->vf) rte_prefetch0((volatile void *) (uintptr_t)sge->addr); sge->length = DATA_LEN(buf); sge->lkey = lkey; sent_size += sge->length; buf = NEXT(buf); } /* If buf is not NULL here and is not going to be linearized, * nb_segs is not valid. */ assert(j == segs); assert((buf == NULL) || (linearize)); /* Linearize extra segments. */ if (linearize) { struct ibv_sge *sge = &(*sges)[segs]; linear_t *linear = &(*txq->elts_linear)[elts_head]; unsigned int size = linearize_mbuf(linear, buf); assert(segs == (RTE_DIM(*sges) - 1)); if (size == 0) { /* Invalid packet. */ DEBUG("%p: packet too large to be linearized.", (void *)txq); /* Clean up TX element. */ elt->buf = NULL; goto stop; } /* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */ if (RTE_DIM(*sges) == 1) { do { struct rte_mbuf *next = NEXT(buf); rte_pktmbuf_free_seg(buf); buf = next; } while (buf != NULL); elt->buf = NULL; } /* Update SGE. */ sge->addr = (uintptr_t)&(*linear)[0]; sge->length = size; sge->lkey = txq->mr_linear->lkey; sent_size += size; /* Include last segment. */ segs++; } return (struct tx_burst_sg_ret){ .length = sent_size, .num = segs, }; stop: return (struct tx_burst_sg_ret){ .length = -1, .num = -1, }; } #endif /* MLX5_PMD_SGE_WR_N > 1 */ /** * DPDK callback for TX. * * @param dpdk_txq * Generic pointer to TX queue structure. * @param[in] pkts * Packets to transmit. * @param pkts_n * Number of packets in array. * * @return * Number of packets successfully transmitted (<= pkts_n). */ uint16_t mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct txq *txq = (struct txq *)dpdk_txq; unsigned int elts_head = txq->elts_head; const unsigned int elts_n = txq->elts_n; unsigned int elts_comp_cd = txq->elts_comp_cd; unsigned int elts_comp = 0; unsigned int i; unsigned int max; int err; struct rte_mbuf *buf = pkts[0]; assert(elts_comp_cd != 0); /* Prefetch first packet cacheline. */ rte_prefetch0(buf); txq_complete(txq); max = (elts_n - (elts_head - txq->elts_tail)); if (max > elts_n) max -= elts_n; assert(max >= 1); assert(max <= elts_n); /* Always leave one free entry in the ring. */ --max; if (max == 0) return 0; if (max > pkts_n) max = pkts_n; for (i = 0; (i != max); ++i) { struct rte_mbuf *buf_next = pkts[i + 1]; unsigned int elts_head_next = (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); struct txq_elt *elt = &(*txq->elts)[elts_head]; unsigned int segs = NB_SEGS(buf); #ifdef MLX5_PMD_SOFT_COUNTERS unsigned int sent_size = 0; #endif uint32_t send_flags = 0; #ifdef HAVE_VERBS_VLAN_INSERTION int insert_vlan = 0; #endif /* HAVE_VERBS_VLAN_INSERTION */ if (i + 1 < max) rte_prefetch0(buf_next); /* Request TX completion. */ if (unlikely(--elts_comp_cd == 0)) { elts_comp_cd = txq->elts_comp_cd_init; ++elts_comp; send_flags |= IBV_EXP_QP_BURST_SIGNALED; } /* Should we enable HW CKSUM offload */ if (buf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { send_flags |= IBV_EXP_QP_BURST_IP_CSUM; /* HW does not support checksum offloads at arbitrary * offsets but automatically recognizes the packet * type. For inner L3/L4 checksums, only VXLAN (UDP) * tunnels are currently supported. */ if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type)) send_flags |= IBV_EXP_QP_BURST_TUNNEL; } if (buf->ol_flags & PKT_TX_VLAN_PKT) { #ifdef HAVE_VERBS_VLAN_INSERTION if (!txq->priv->mps) insert_vlan = 1; else #endif /* HAVE_VERBS_VLAN_INSERTION */ { err = insert_vlan_sw(buf); if (unlikely(err)) goto stop; } } if (likely(segs == 1)) { uintptr_t addr; uint32_t length; uint32_t lkey; uintptr_t buf_next_addr; /* Retrieve buffer information. */ addr = rte_pktmbuf_mtod(buf, uintptr_t); length = DATA_LEN(buf); /* Update element. */ elt->buf = buf; if (txq->priv->vf) rte_prefetch0((volatile void *) (uintptr_t)addr); /* Prefetch next buffer data. */ if (i + 1 < max) { buf_next_addr = rte_pktmbuf_mtod(buf_next, uintptr_t); rte_prefetch0((volatile void *) (uintptr_t)buf_next_addr); } /* Put packet into send queue. */ #if MLX5_PMD_MAX_INLINE > 0 if (length <= txq->max_inline) { #ifdef HAVE_VERBS_VLAN_INSERTION if (insert_vlan) err = txq->send_pending_inline_vlan (txq->qp, (void *)addr, length, send_flags, &buf->vlan_tci); else #endif /* HAVE_VERBS_VLAN_INSERTION */ err = txq->send_pending_inline (txq->qp, (void *)addr, length, send_flags); } else #endif { /* Retrieve Memory Region key for this * memory pool. */ lkey = txq_mp2mr(txq, txq_mb2mp(buf)); if (unlikely(lkey == (uint32_t)-1)) { /* MR does not exist. */ DEBUG("%p: unable to get MP <-> MR" " association", (void *)txq); /* Clean up TX element. */ elt->buf = NULL; goto stop; } #ifdef HAVE_VERBS_VLAN_INSERTION if (insert_vlan) err = txq->send_pending_vlan (txq->qp, addr, length, lkey, send_flags, &buf->vlan_tci); else #endif /* HAVE_VERBS_VLAN_INSERTION */ err = txq->send_pending (txq->qp, addr, length, lkey, send_flags); } if (unlikely(err)) goto stop; #ifdef MLX5_PMD_SOFT_COUNTERS sent_size += length; #endif } else { #if MLX5_PMD_SGE_WR_N > 1 struct ibv_sge sges[MLX5_PMD_SGE_WR_N]; struct tx_burst_sg_ret ret; ret = tx_burst_sg(txq, segs, elt, buf, elts_head, &sges); if (ret.length == (unsigned int)-1) goto stop; /* Put SG list into send queue. */ #ifdef HAVE_VERBS_VLAN_INSERTION if (insert_vlan) err = txq->send_pending_sg_list_vlan (txq->qp, sges, ret.num, send_flags, &buf->vlan_tci); else #endif /* HAVE_VERBS_VLAN_INSERTION */ err = txq->send_pending_sg_list (txq->qp, sges, ret.num, send_flags); if (unlikely(err)) goto stop; #ifdef MLX5_PMD_SOFT_COUNTERS sent_size += ret.length; #endif #else /* MLX5_PMD_SGE_WR_N > 1 */ DEBUG("%p: TX scattered buffers support not" " compiled in", (void *)txq); goto stop; #endif /* MLX5_PMD_SGE_WR_N > 1 */ } elts_head = elts_head_next; buf = buf_next; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent bytes counter. */ txq->stats.obytes += sent_size; #endif } stop: /* Take a shortcut if nothing must be sent. */ if (unlikely(i == 0)) return 0; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment sent packets counter. */ txq->stats.opackets += i; #endif /* Ring QP doorbell. */ err = txq->send_flush(txq->qp); if (unlikely(err)) { /* A nonzero value is not supposed to be returned. * Nothing can be done about it. */ DEBUG("%p: send_flush() failed with error %d", (void *)txq, err); } txq->elts_head = elts_head; txq->elts_comp += elts_comp; txq->elts_comp_cd = elts_comp_cd; return i; } /** * Translate RX completion flags to packet type. * * @param flags * RX completion flags returned by poll_length_flags(). * * @note: fix mlx5_dev_supported_ptypes_get() if any change here. * * @return * Packet type for struct rte_mbuf. */ static inline uint32_t rxq_cq_to_pkt_type(uint32_t flags) { uint32_t pkt_type; if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) pkt_type = TRANSPOSE(flags, IBV_EXP_CQ_RX_OUTER_IPV4_PACKET, RTE_PTYPE_L3_IPV4) | TRANSPOSE(flags, IBV_EXP_CQ_RX_OUTER_IPV6_PACKET, RTE_PTYPE_L3_IPV6) | TRANSPOSE(flags, IBV_EXP_CQ_RX_IPV4_PACKET, RTE_PTYPE_INNER_L3_IPV4) | TRANSPOSE(flags, IBV_EXP_CQ_RX_IPV6_PACKET, RTE_PTYPE_INNER_L3_IPV6); else pkt_type = TRANSPOSE(flags, IBV_EXP_CQ_RX_IPV4_PACKET, RTE_PTYPE_L3_IPV4) | TRANSPOSE(flags, IBV_EXP_CQ_RX_IPV6_PACKET, RTE_PTYPE_L3_IPV6); return pkt_type; }