ssize_t dst_entry_tcp::fast_send(const struct iovec* p_iov, const ssize_t sz_iov, bool b_blocked /*= true*/, bool is_rexmit /*= false*/, bool dont_inline /*= false*/)
{
	tx_packet_template_t* p_pkt;
	mem_buf_desc_t *p_mem_buf_desc;
	size_t total_packet_len = 0;
	// The header is aligned for fast copy, but we need to keep this diff in order to get the real header pointer easily
	size_t hdr_alignment_diff = m_header.m_aligned_l2_l3_len - m_header.m_total_hdr_len;

	tcp_iovec* p_tcp_iov = NULL;
	bool no_copy = true;
	if (likely(sz_iov == 1 && !is_rexmit)) {
		p_tcp_iov = (tcp_iovec*)p_iov;
		if (unlikely(!m_p_ring->is_active_member(p_tcp_iov->p_desc->p_desc_owner, m_id))) {
			no_copy = false;
			dst_tcp_logdbg("p_desc=%p wrong desc_owner=%p, this ring=%p. did migration occur?", p_tcp_iov->p_desc, p_tcp_iov->p_desc->p_desc_owner, m_p_ring);
			//todo can we handle this in migration (by going over all buffers lwip holds) instead of on every send?
		}
	} else {
		no_copy = false;
	}

	if (unlikely(is_rexmit))
		m_p_ring->inc_ring_stats(m_id);

	if (likely(no_copy)) {
		p_pkt = (tx_packet_template_t*)((uint8_t*)p_tcp_iov[0].iovec.iov_base - m_header.m_aligned_l2_l3_len);
		total_packet_len = p_tcp_iov[0].iovec.iov_len + m_header.m_total_hdr_len;
		m_header.copy_l2_ip_hdr(p_pkt);
		// We've copied to the aligned address, and now we must update p_pkt to point to the real L2 header
		//p_pkt = (tx_packet_template_t*)((uint8_t*)p_pkt + hdr_alignment_diff);
		p_pkt->hdr.m_ip_hdr.tot_len = (htons)(p_tcp_iov[0].iovec.iov_len + m_header.m_ip_header_len);

		m_sge[0].addr = (uintptr_t)((uint8_t*)p_pkt + hdr_alignment_diff);
		m_sge[0].length = total_packet_len;

		/* for DEBUG */
		if ((uint8_t*)m_sge[0].addr < p_tcp_iov[0].p_desc->p_buffer || (uint8_t*)p_pkt < p_tcp_iov[0].p_desc->p_buffer) {
			dst_tcp_logerr("p_buffer - addr=%d, m_total_hdr_len=%zd, p_buffer=%p, type=%d, len=%d, tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n",
					(int)(p_tcp_iov[0].p_desc->p_buffer - (uint8_t*)m_sge[0].addr), m_header.m_total_hdr_len,
					p_tcp_iov[0].p_desc->p_buffer, p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.type,
					p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.len, p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.tot_len,
					p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff);
		}

		if (!dont_inline && (total_packet_len < m_max_inline)) { // inline send
			m_p_send_wqe = &m_inline_send_wqe;
		} else {
			m_p_send_wqe = &m_not_inline_send_wqe;
		}

		m_p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc;

#ifdef VMA_NO_HW_CSUM
		p_pkt->hdr.m_ip_hdr.check = 0; // use 0 at csum calculation time
		p_pkt->hdr.m_ip_hdr.check = compute_ip_checksum((unsigned short*)&p_pkt->hdr.m_ip_hdr, p_pkt->hdr.m_ip_hdr.ihl * 2);
		struct tcphdr* p_tcphdr = (struct tcphdr*)(((uint8_t*)(&(p_pkt->hdr.m_ip_hdr)) + sizeof(p_pkt->hdr.m_ip_hdr)));
		p_tcphdr->check = 0;
		p_tcphdr->check = compute_tcp_checksum(&p_pkt->hdr.m_ip_hdr, (const uint16_t *)p_tcphdr);
		dst_tcp_logfine("using SW checksum calculation: p_pkt->hdr.m_ip_hdr.check=%d, p_tcphdr->check=%d", (int)p_pkt->hdr.m_ip_hdr.check, (int)p_tcphdr->check);
#endif
		m_p_ring->send_lwip_buffer(m_id, m_p_send_wqe, b_blocked);
	} else { // We don't support inline in this case, since we believe it is very rare
		p_mem_buf_desc = get_buffer(b_blocked);
		if (p_mem_buf_desc == NULL) {
			return -1;
		}

		m_header.copy_l2_ip_hdr((tx_packet_template_t*)p_mem_buf_desc->p_buffer);

		// This is not the real packet length yet; we subtract the alignment diff at the end of the copy
		total_packet_len = m_header.m_aligned_l2_l3_len;

		for (int i = 0; i < sz_iov; ++i) {
			memcpy(p_mem_buf_desc->p_buffer + total_packet_len, p_iov[i].iov_base, p_iov[i].iov_len);
			total_packet_len += p_iov[i].iov_len;
		}

		m_sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer + hdr_alignment_diff);
		m_sge[0].length = total_packet_len - hdr_alignment_diff;
		// LKey will be updated in ring->send()
		// m_sge[0].lkey = p_mem_buf_desc->lkey;

		/* for DEBUG */
		if ((uint8_t*)m_sge[0].addr < p_mem_buf_desc->p_buffer) {
			dst_tcp_logerr("p_buffer - addr=%d, m_total_hdr_len=%zd, p_buffer=%p, type=%d, len=%d, tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n",
					(int)(p_mem_buf_desc->p_buffer - (uint8_t*)m_sge[0].addr), m_header.m_total_hdr_len,
					p_mem_buf_desc->p_buffer, p_mem_buf_desc->lwip_pbuf.pbuf.type,
					p_mem_buf_desc->lwip_pbuf.pbuf.len, p_mem_buf_desc->lwip_pbuf.pbuf.tot_len,
					p_mem_buf_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff);
		}

		p_pkt = (tx_packet_template_t*)((uint8_t*)p_mem_buf_desc->p_buffer);
		p_pkt->hdr.m_ip_hdr.tot_len = (htons)(m_sge[0].length - m_header.m_transport_header_len);

#ifdef VMA_NO_HW_CSUM
		p_pkt->hdr.m_ip_hdr.check = 0; // use 0 at csum calculation time
		p_pkt->hdr.m_ip_hdr.check = compute_ip_checksum((unsigned short*)&p_pkt->hdr.m_ip_hdr, p_pkt->hdr.m_ip_hdr.ihl * 2);
		struct tcphdr* p_tcphdr = (struct tcphdr*)(((uint8_t*)(&(p_pkt->hdr.m_ip_hdr)) + sizeof(p_pkt->hdr.m_ip_hdr)));
		p_tcphdr->check = 0;
		p_tcphdr->check = compute_tcp_checksum(&p_pkt->hdr.m_ip_hdr, (const uint16_t *)p_tcphdr);
		dst_tcp_logfine("using SW checksum calculation: p_pkt->hdr.m_ip_hdr.check=%d, p_tcphdr->check=%d", (int)p_pkt->hdr.m_ip_hdr.check, (int)p_tcphdr->check);
#endif
		m_p_send_wqe = &m_not_inline_send_wqe;
		m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc;

		m_p_ring->send_ring_buffer(m_id, m_p_send_wqe, b_blocked);
	}

#ifndef __COVERITY__
	struct tcphdr* p_tcp_h = (struct tcphdr*)(((uint8_t*)(&(p_pkt->hdr.m_ip_hdr)) + sizeof(p_pkt->hdr.m_ip_hdr)));
	NOT_IN_USE(p_tcp_h); /* to suppress warning in case of VMA_OPTIMIZE_LOG */
	dst_tcp_logfunc("Tx TCP segment info: src_port=%d, dst_port=%d, flags='%s%s%s%s%s%s' seq=%u, ack=%u, win=%u, payload_sz=%u",
			ntohs(p_tcp_h->source), ntohs(p_tcp_h->dest),
			p_tcp_h->urg?"U":"", p_tcp_h->ack?"A":"", p_tcp_h->psh?"P":"", p_tcp_h->rst?"R":"", p_tcp_h->syn?"S":"", p_tcp_h->fin?"F":"",
			ntohl(p_tcp_h->seq), ntohl(p_tcp_h->ack_seq), ntohs(p_tcp_h->window),
			total_packet_len - p_tcp_h->doff*4 - 34);
#endif

	if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) {
		m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, m_n_sysvar_tx_bufs_batch_tcp);
	}

	return 0;
}
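/*
 * Minimal standalone sketch of the aligned-header arithmetic used in the
 * no-copy path above (hypothetical names and types, not VMA's): the L2/L3
 * header template is staged m_aligned_l2_l3_len bytes in front of the
 * payload so the copy is aligned, and the SGE address is then advanced by
 * hdr_alignment_diff so the wire frame starts at the real, unpadded header.
 */
#include <cstddef>
#include <cstdint>

struct hdr_layout_sketch {
	size_t aligned_l2_l3_len; // header length rounded up for an aligned copy
	size_t total_hdr_len;     // real on-the-wire header length
};

static const uint8_t* wire_frame_start_sketch(const uint8_t* payload, const hdr_layout_sketch& h)
{
	const uint8_t* aligned_hdr = payload - h.aligned_l2_l3_len;        // p_pkt above: aligned staging address
	size_t hdr_alignment_diff = h.aligned_l2_l3_len - h.total_hdr_len; // padding bytes to skip on the wire
	return aligned_hdr + hdr_alignment_diff;                           // becomes m_sge[0].addr above
}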
ssize_t dst_entry_tcp::fast_send(const iovec* p_iov, const ssize_t sz_iov, bool is_dummy, bool b_blocked /*= true*/, bool is_rexmit /*= false*/)
{
	int ret = 0;
	tx_packet_template_t* p_pkt;
	mem_buf_desc_t *p_mem_buf_desc;
	size_t total_packet_len = 0;
	// The header is aligned for fast copy, but we need to keep this diff in order to get the real header pointer easily
	size_t hdr_alignment_diff = m_header.m_aligned_l2_l3_len - m_header.m_total_hdr_len;

	tcp_iovec* p_tcp_iov = NULL;
	bool no_copy = true;
	if (likely(sz_iov == 1 && !is_rexmit)) {
		p_tcp_iov = (tcp_iovec*)p_iov;
		if (unlikely(!m_p_ring->is_active_member(p_tcp_iov->p_desc->p_desc_owner, m_id))) {
			no_copy = false;
			dst_tcp_logdbg("p_desc=%p wrong desc_owner=%p, this ring=%p. did migration occur?", p_tcp_iov->p_desc, p_tcp_iov->p_desc->p_desc_owner, m_p_ring);
			//todo can we handle this in migration (by going over all buffers lwip holds) instead of on every send?
		}
	} else {
		no_copy = false;
	}

	if (likely(no_copy)) {
		p_pkt = (tx_packet_template_t*)((uint8_t*)p_tcp_iov[0].iovec.iov_base - m_header.m_aligned_l2_l3_len);
		total_packet_len = p_tcp_iov[0].iovec.iov_len + m_header.m_total_hdr_len;
		m_header.copy_l2_ip_hdr(p_pkt);
		// We've copied to the aligned address, and now we must update p_pkt to point to the real L2 header
		//p_pkt = (tx_packet_template_t*)((uint8_t*)p_pkt + hdr_alignment_diff);
		p_pkt->hdr.m_ip_hdr.tot_len = (htons)(p_tcp_iov[0].iovec.iov_len + m_header.m_ip_header_len);

		m_sge[0].addr = (uintptr_t)((uint8_t*)p_pkt + hdr_alignment_diff);
		m_sge[0].length = total_packet_len;

		if (total_packet_len < m_max_inline) { // inline send
			m_p_send_wqe = &m_inline_send_wqe;
		} else {
			m_p_send_wqe = &m_not_inline_send_wqe;
		}

		m_p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc;

#ifdef VMA_NO_HW_CSUM
		p_pkt->hdr.m_ip_hdr.check = 0; // use 0 at csum calculation time
		p_pkt->hdr.m_ip_hdr.check = compute_ip_checksum((unsigned short*)&p_pkt->hdr.m_ip_hdr, p_pkt->hdr.m_ip_hdr.ihl * 2);
		struct tcphdr* p_tcphdr = (struct tcphdr*)(((uint8_t*)(&(p_pkt->hdr.m_ip_hdr)) + sizeof(p_pkt->hdr.m_ip_hdr)));
		p_tcphdr->check = 0;
		p_tcphdr->check = compute_tcp_checksum(&p_pkt->hdr.m_ip_hdr, (const uint16_t *)p_tcphdr);
		dst_tcp_logfine("using SW checksum calculation: p_pkt->hdr.m_ip_hdr.check=%d, p_tcphdr->check=%d", (int)p_pkt->hdr.m_ip_hdr.check, (int)p_tcphdr->check);
#endif
		send_lwip_buffer(m_id, m_p_send_wqe, b_blocked, is_dummy);

		/* for DEBUG */
		if ((uint8_t*)m_sge[0].addr < p_tcp_iov[0].p_desc->p_buffer || (uint8_t*)p_pkt < p_tcp_iov[0].p_desc->p_buffer) {
			dst_tcp_logerr("p_buffer - addr=%d, m_total_hdr_len=%zd, p_buffer=%p, type=%d, len=%d, tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n",
					(int)(p_tcp_iov[0].p_desc->p_buffer - (uint8_t*)m_sge[0].addr), m_header.m_total_hdr_len,
					p_tcp_iov[0].p_desc->p_buffer, p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.type,
					p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.len, p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.tot_len,
					p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff);
		}
	} else { // We don't support inline in this case, since we believe it is very rare
		p_mem_buf_desc = get_buffer(b_blocked);
		if (p_mem_buf_desc == NULL) {
			ret = -1;
			goto out;
		}

		m_header.copy_l2_ip_hdr((tx_packet_template_t*)p_mem_buf_desc->p_buffer);

		// This is not the real packet length yet; we subtract the alignment diff at the end of the copy
		total_packet_len = m_header.m_aligned_l2_l3_len;

		for (int i = 0; i < sz_iov; ++i) {
			memcpy(p_mem_buf_desc->p_buffer + total_packet_len, p_iov[i].iov_base, p_iov[i].iov_len);
			total_packet_len += p_iov[i].iov_len;
		}

		m_sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer + hdr_alignment_diff);
		m_sge[0].length = total_packet_len - hdr_alignment_diff;
		// LKey will be updated in ring->send()
		// m_sge[0].lkey = p_mem_buf_desc->lkey;

		p_pkt = (tx_packet_template_t*)((uint8_t*)p_mem_buf_desc->p_buffer);
		p_pkt->hdr.m_ip_hdr.tot_len = (htons)(m_sge[0].length - m_header.m_transport_header_len);

#ifdef VMA_NO_HW_CSUM
		p_pkt->hdr.m_ip_hdr.check = 0; // use 0 at csum calculation time
		p_pkt->hdr.m_ip_hdr.check = compute_ip_checksum((unsigned short*)&p_pkt->hdr.m_ip_hdr, p_pkt->hdr.m_ip_hdr.ihl * 2);
		struct tcphdr* p_tcphdr = (struct tcphdr*)(((uint8_t*)(&(p_pkt->hdr.m_ip_hdr)) + sizeof(p_pkt->hdr.m_ip_hdr)));
		p_tcphdr->check = 0;
		p_tcphdr->check = compute_tcp_checksum(&p_pkt->hdr.m_ip_hdr, (const uint16_t *)p_tcphdr);
		dst_tcp_logfine("using SW checksum calculation: p_pkt->hdr.m_ip_hdr.check=%d, p_tcphdr->check=%d", (int)p_pkt->hdr.m_ip_hdr.check, (int)p_tcphdr->check);
#endif
		m_p_send_wqe = &m_not_inline_send_wqe;
		m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc;

		vma_wr_tx_packet_attr attr = (vma_wr_tx_packet_attr)((VMA_TX_PACKET_BLOCK*b_blocked) | (VMA_TX_PACKET_DUMMY*is_dummy) | VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM);
		send_ring_buffer(m_id, m_p_send_wqe, attr);

		/* for DEBUG */
		if ((uint8_t*)m_sge[0].addr < p_mem_buf_desc->p_buffer) {
			dst_tcp_logerr("p_buffer - addr=%d, m_total_hdr_len=%zd, p_buffer=%p, type=%d, len=%d, tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n",
					(int)(p_mem_buf_desc->p_buffer - (uint8_t*)m_sge[0].addr), m_header.m_total_hdr_len,
					p_mem_buf_desc->p_buffer, p_mem_buf_desc->lwip_pbuf.pbuf.type,
					p_mem_buf_desc->lwip_pbuf.pbuf.len, p_mem_buf_desc->lwip_pbuf.pbuf.tot_len,
					p_mem_buf_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff);
		}
	}

	if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) {
		m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, m_n_sysvar_tx_bufs_batch_tcp);
	}

out:
	if (unlikely(is_rexmit)) {
		m_p_ring->inc_tx_retransmissions(m_id);
	}

	return ret;
}
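/*
 * Under VMA_NO_HW_CSUM both fast_send() variants recompute the IPv4 and TCP
 * checksums in software. For reference, a minimal sketch of the generic
 * RFC 1071 one's-complement sum over the IPv4 header (an assumption about
 * what such a helper computes, not VMA's compute_ip_checksum implementation);
 * the caller is expected to zero the header's check field first, as done above.
 */
#include <cstddef>
#include <cstdint>

static uint16_t ipv4_header_checksum_sketch(const uint16_t* hdr, size_t words /* = ihl * 2 */)
{
	uint32_t sum = 0;
	for (size_t i = 0; i < words; ++i)
		sum += hdr[i];                      // accumulate the header as 16-bit words
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16); // fold carries back into the low 16 bits
	return (uint16_t)~sum;                      // one's complement of the folded sum
}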