/**
 * Manage TX completions.
 *
 * When sending a burst, mlx5_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX5_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to TX queue structure.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
txq_complete(struct txq *txq)
{
	unsigned int elts_comp = txq->elts_comp;
	unsigned int elts_tail = txq->elts_tail;
	unsigned int elts_free = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	int wcs_n;

	if (unlikely(elts_comp == 0))
		return 0;
#ifdef DEBUG_SEND
	DEBUG("%p: processing %u work requests completions",
	      (void *)txq, elts_comp);
#endif
	wcs_n = txq->poll_cnt(txq->cq, elts_comp);
	if (unlikely(wcs_n == 0))
		return 0;
	if (unlikely(wcs_n < 0)) {
		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
		      (void *)txq, wcs_n);
		return -1;
	}
	elts_comp -= wcs_n;
	assert(elts_comp <= txq->elts_comp);
	/*
	 * Assume WC status is successful as nothing can be done about it
	 * anyway.
	 */
	elts_tail += wcs_n * txq->elts_comp_cd_init;
	if (elts_tail >= elts_n)
		elts_tail -= elts_n;
	while (elts_free != elts_tail) {
		struct txq_elt *elt = &(*txq->elts)[elts_free];
		unsigned int elts_free_next =
			(((elts_free + 1) == elts_n) ? 0 : elts_free + 1);
		struct rte_mbuf *tmp = elt->buf;
		struct txq_elt *elt_next = &(*txq->elts)[elts_free_next];

#ifndef NDEBUG
		/* Poisoning. */
		memset(elt, 0x66, sizeof(*elt));
#endif
		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
		/* Faster than rte_pktmbuf_free(). */
		do {
			struct rte_mbuf *next = NEXT(tmp);

			rte_pktmbuf_free_seg(tmp);
			tmp = next;
		} while (tmp != NULL);
		elts_free = elts_free_next;
	}
	txq->elts_tail = elts_tail;
	txq->elts_comp = elts_comp;
	return 0;
}
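/*
 * Illustrative sketch (not part of the driver): how txq_complete() turns a
 * batch of polled completions into a ring-tail advance.  Each completion
 * stands for elts_comp_cd_init posted WRs, so the tail jumps by that many
 * slots per completion and wraps on elts_n.  This assumes the total advance
 * never exceeds one full ring, which the completion countdown guarantees.
 * The helper name and the MLX5_PMD_DOC_SKETCH guard are hypothetical.
 */
#ifdef MLX5_PMD_DOC_SKETCH
static inline unsigned int
sketch_tail_advance(unsigned int tail, unsigned int wcs_n,
		    unsigned int cd_init, unsigned int elts_n)
{
	tail += wcs_n * cd_init;
	if (tail >= elts_n)
		tail -= elts_n;
	return tail;
}
#endif /* MLX5_PMD_DOC_SKETCH */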
tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
	    struct rte_mbuf *buf, unsigned int elts_head,
	    struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
{
	unsigned int sent_size = 0;
	unsigned int j;
	int linearize = 0;

	/* When there are too many segments, extra segments are
	 * linearized in the last SGE. */
	if (unlikely(segs > RTE_DIM(*sges))) {
		segs = (RTE_DIM(*sges) - 1);
		linearize = 1;
	}
	/* Update element. */
	elt->buf = buf;
	/* Register segments as SGEs. */
	for (j = 0; (j != segs); ++j) {
		struct ibv_sge *sge = &(*sges)[j];
		uint32_t lkey;

		/* Retrieve Memory Region key for this memory pool. */
		lkey = txq_mp2mr(txq, buf->pool);
		if (unlikely(lkey == (uint32_t)-1)) {
			/* MR does not exist. */
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* Update SGE. */
		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
		if (txq->priv->vf)
			rte_prefetch0((volatile void *)
				      (uintptr_t)sge->addr);
		sge->length = DATA_LEN(buf);
		sge->lkey = lkey;
		sent_size += sge->length;
		buf = NEXT(buf);
	}
	/* If buf is not NULL here and is not going to be linearized,
	 * nb_segs is not valid. */
	assert(j == segs);
	assert((buf == NULL) || (linearize));
	/* Linearize extra segments. */
	if (linearize) {
		struct ibv_sge *sge = &(*sges)[segs];
		linear_t *linear = &(*txq->elts_linear)[elts_head];
		unsigned int size = linearize_mbuf(linear, buf);

		assert(segs == (RTE_DIM(*sges) - 1));
		if (size == 0) {
			/* Invalid packet. */
			DEBUG("%p: packet too large to be linearized.",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
		if (RTE_DIM(*sges) == 1) {
			do {
				struct rte_mbuf *next = NEXT(buf);

				rte_pktmbuf_free_seg(buf);
				buf = next;
			} while (buf != NULL);
			elt->buf = NULL;
		}
		/* Update SGE. */
		sge->addr = (uintptr_t)&(*linear)[0];
		sge->length = size;
		sge->lkey = txq->mr_linear->lkey;
		sent_size += size;
	}
	return (struct tx_burst_sg_ret){
		.length = sent_size,
		.num = segs,
	};
stop:
	return (struct tx_burst_sg_ret){
		.length = -1,
		.num = -1,
	};
}

#endif /* MLX5_PMD_SGE_WR_N > 1 */
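/*
 * Illustrative sketch (not part of the driver): the SGE budget used by
 * tx_burst_sg() above.  A packet with more segments than MLX5_PMD_SGE_WR_N
 * keeps the first MLX5_PMD_SGE_WR_N - 1 segments as individual SGEs and
 * copies the remaining ones into a single linearization buffer posted as
 * the last SGE.  The helper name and the MLX5_PMD_DOC_SKETCH guard are
 * hypothetical; the return value is nonzero when linearization is needed.
 */
#ifdef MLX5_PMD_DOC_SKETCH
static inline int
sketch_sg_plan(unsigned int segs, unsigned int max_sges,
	       unsigned int *mbuf_sges)
{
	if (segs <= max_sges) {
		/* Every segment gets its own SGE; no copy needed. */
		*mbuf_sges = segs;
		return 0;
	}
	/* Last SGE is reserved for the linearized tail of the chain. */
	*mbuf_sges = max_sges - 1;
	return 1;
}
#endif /* MLX5_PMD_DOC_SKETCH */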
/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	unsigned int elts_head = txq->elts_head;
	const unsigned int elts_tail = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	unsigned int elts_comp_cd = txq->elts_comp_cd;
	unsigned int elts_comp = 0;
	unsigned int i;
	unsigned int max;
	int err;

	assert(elts_comp_cd != 0);
	txq_complete(txq);
	max = (elts_n - (elts_head - elts_tail));
	if (max > elts_n)
		max -= elts_n;
	assert(max >= 1);
	assert(max <= elts_n);
	/* Always leave one free entry in the ring. */
	--max;
	if (max == 0)
		return 0;
	if (max > pkts_n)
		max = pkts_n;
	for (i = 0; (i != max); ++i) {
		struct rte_mbuf *buf = pkts[i];
		unsigned int elts_head_next =
			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
		struct txq_elt *elt_next = &(*txq->elts)[elts_head_next];
		struct txq_elt *elt = &(*txq->elts)[elts_head];
		unsigned int segs = NB_SEGS(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		unsigned int sent_size = 0;
#endif
		uint32_t send_flags = 0;

		/* Clean up old buffer. */
		if (likely(elt->buf != NULL)) {
			struct rte_mbuf *tmp = elt->buf;

			/* Faster than rte_pktmbuf_free(). */
			do {
				struct rte_mbuf *next = NEXT(tmp);

				rte_pktmbuf_free_seg(tmp);
				tmp = next;
			} while (tmp != NULL);
		}
		/* Request TX completion. */
		if (unlikely(--elts_comp_cd == 0)) {
			elts_comp_cd = txq->elts_comp_cd_init;
			++elts_comp;
			send_flags |= IBV_EXP_QP_BURST_SIGNALED;
		}
		/* Should we enable HW CKSUM offload */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
			/* HW does not support checksum offloads at arbitrary
			 * offsets but automatically recognizes the packet
			 * type. For inner L3/L4 checksums, only VXLAN (UDP)
			 * tunnels are currently supported. */
			if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
				send_flags |= IBV_EXP_QP_BURST_TUNNEL;
		}
		if (likely(segs == 1)) {
			uintptr_t addr;
			uint32_t length;
			uint32_t lkey;

			/* Retrieve buffer information. */
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			length = DATA_LEN(buf);
			/* Retrieve Memory Region key for this memory pool. */
			lkey = txq_mp2mr(txq, buf->pool);
			if (unlikely(lkey == (uint32_t)-1)) {
				/* MR does not exist. */
				DEBUG("%p: unable to get MP <-> MR"
				      " association", (void *)txq);
				/* Clean up TX element. */
				elt->buf = NULL;
				goto stop;
			}
			/* Update element. */
			elt->buf = buf;
			if (txq->priv->vf)
				rte_prefetch0((volatile void *)
					      (uintptr_t)addr);
			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
			/* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
			if (length <= txq->max_inline)
				err = txq->if_qp->send_pending_inline
					(txq->qp,
					 (void *)addr,
					 length,
					 send_flags);
			else
#endif
				err = txq->if_qp->send_pending
					(txq->qp,
					 addr,
					 length,
					 lkey,
					 send_flags);
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += length;
#endif
		} else {
#if MLX5_PMD_SGE_WR_N > 1
			struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
			struct tx_burst_sg_ret ret;

			ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
					  &sges);
			if (ret.length == (unsigned int)-1)
				goto stop;
			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
			/* Put SG list into send queue. */
			err = txq->if_qp->send_pending_sg_list
				(txq->qp,
				 sges,
				 ret.num,
				 send_flags);
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += ret.length;
#endif
#else /* MLX5_PMD_SGE_WR_N > 1 */
			DEBUG("%p: TX scattered buffers support not"
			      " compiled in", (void *)txq);
			goto stop;
#endif /* MLX5_PMD_SGE_WR_N > 1 */
		}
		elts_head = elts_head_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += sent_size;
#endif
	}
stop:
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	err = txq->if_qp->send_flush(txq->qp);
	if (unlikely(err)) {
		/* A nonzero value is not supposed to be returned.
		 * Nothing can be done about it. */
		DEBUG("%p: send_flush() failed with error %d",
		      (void *)txq, err);
	}
	txq->elts_head = elts_head;
	txq->elts_comp += elts_comp;
	txq->elts_comp_cd = elts_comp_cd;
	return i;
}
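/*
 * Illustrative sketch (not part of the driver): the free-slot computation at
 * the top of mlx5_tx_burst().  elts_head and elts_tail are plain indices in
 * [0, elts_n); when head < tail the unsigned subtraction makes the result
 * overshoot by elts_n, which the comparison removes.  One slot is always
 * kept free so a full ring can be told apart from an empty one.  The helper
 * name and the MLX5_PMD_DOC_SKETCH guard are hypothetical.
 */
#ifdef MLX5_PMD_DOC_SKETCH
static inline unsigned int
sketch_free_slots(unsigned int head, unsigned int tail, unsigned int elts_n)
{
	unsigned int max = elts_n - (head - tail);

	if (max > elts_n)
		max -= elts_n;
	/* Always leave one free entry in the ring. */
	return max - 1;
}
#endif /* MLX5_PMD_DOC_SKETCH */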
/**
 * Translate RX completion flags to packet type.
 *
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(uint32_t flags)
{
	uint32_t pkt_type;

	if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6);
	else
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6);
	return pkt_type;
}
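/*
 * Illustrative sketch (not part of the driver): for the single-bit masks
 * used in rxq_cq_to_pkt_type() above, each TRANSPOSE() term amounts to
 * "if `from` is set in `flags`, contribute `to` to the result".  The helper
 * below is a hypothetical, readability-first equivalent of that mapping;
 * the MLX5_PMD_DOC_SKETCH guard is likewise hypothetical.
 */
#ifdef MLX5_PMD_DOC_SKETCH
static inline uint32_t
sketch_transpose(uint32_t flags, uint32_t from, uint32_t to)
{
	return (flags & from) ? to : 0;
}
#endif /* MLX5_PMD_DOC_SKETCH */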