void nmb_put(struct nmb *nmb,
             MSN msn,
             msgtype_t type,
             struct msg *key,
             struct msg *val,
             struct txnid_pair *xidpair)
{
    char *base;
    uint32_t size = (NMB_ENTRY_SIZE + key->size);

    if (type != MSG_DELETE)
        size += val->size;

    ness_rwlock_write_lock(&nmb->rwlock);
    base = mempool_alloc_aligned(nmb->mpool, size);
    ness_rwlock_write_unlock(&nmb->rwlock);

    _nmb_entry_pack(base, msn, type, key, val, xidpair);

    /* pma is thread-safe */
    pma_insert(nmb->pma, (void*)base, _nmb_entry_key_compare, (void*)nmb);
    atomic_fetch_and_inc(&nmb->count);
}
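/*
 * Illustrative sketch (not part of the tree above): nmb_put() holds the write
 * lock only around the memory-pool allocation; packing and the insert into the
 * already thread-safe PMA happen outside the lock, and the entry count is kept
 * with a lock-free atomic increment. The names below (arena_t, arena_alloc,
 * arena_put) are hypothetical stand-ins, assuming a simple bump allocator; the
 * caller is expected to initialize lock (pthread_rwlock_init), buf and cap.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>
#include <string.h>

typedef struct arena {
    pthread_rwlock_t lock;
    char *buf;
    size_t used;
    size_t cap;
    atomic_uint count;
} arena_t;

/* Serialize only the pointer bump; callers fill the returned region unlocked. */
static char *arena_alloc(arena_t *a, size_t size)
{
    char *base = NULL;

    pthread_rwlock_wrlock(&a->lock);
    if (a->used + size <= a->cap) {
        base = a->buf + a->used;
        a->used += size;
    }
    pthread_rwlock_unlock(&a->lock);
    return base;
}

static int arena_put(arena_t *a, const void *payload, size_t size)
{
    char *base = arena_alloc(a, size);

    if (!base)
        return -1;
    memcpy(base, payload, size);     /* pack outside the lock */
    /* a concurrent-index insert (e.g. the PMA above) would go here */
    atomic_fetch_add(&a->count, 1);  /* lock-free element counter */
    return 0;
}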
cq_mgr::cq_mgr(ring* p_ring, ib_ctx_handler* p_ib_ctx_handler, int cq_size,
               struct ibv_comp_channel* p_comp_event_channel, bool is_rx) :
    m_p_ring(p_ring),
    m_p_ib_ctx_handler(p_ib_ctx_handler),
    m_b_is_rx(is_rx),
    m_comp_event_channel(p_comp_event_channel),
    m_p_next_rx_desc_poll(NULL)
{
    cq_logfunc("");

    m_n_wce_counter = 0;
    m_b_was_drained = false;
    m_b_notification_armed = false;
    m_n_out_of_free_bufs_warning = 0;
    m_n_cq_poll_sn = 0;
    m_cq_id = atomic_fetch_and_inc(&m_n_cq_id_counter); // cq id is nonzero

    m_transport_type = m_p_ring->get_transport_type();

    m_p_ibv_cq = ibv_create_cq(m_p_ib_ctx_handler->get_ibv_context(), cq_size,
                               (void*)this, m_comp_event_channel, 0);
    BULLSEYE_EXCLUDE_BLOCK_START
    if (!m_p_ibv_cq) {
        cq_logpanic("ibv_create_cq failed (errno=%d %m)", errno);
    }
    BULLSEYE_EXCLUDE_BLOCK_END

    // use local copy of stats by default (on rx cq get shared memory stats)
    m_p_cq_stat = &m_cq_stat_static;
    memset(m_p_cq_stat, 0, sizeof(*m_p_cq_stat));
    /*
    m_p_cq_stat->n_rx_sw_queue_len = 0;
    m_p_cq_stat->n_rx_pkt_drop = 0;
    m_p_cq_stat->n_rx_drained_at_once_max = 0;
    m_p_cq_stat->n_buffer_pool_len = 0;
    m_p_cq_stat->buffer_miss_rate = 0.0;
    //*/
    m_buffer_miss_count = 0;
    m_buffer_total_count = 0;
    m_buffer_prev_id = 0;

    m_sz_transport_header = 0;
    switch (m_transport_type) {
    case VMA_TRANSPORT_IB:
        m_sz_transport_header = GRH_HDR_LEN;
        break;
    case VMA_TRANSPORT_ETH:
        m_sz_transport_header = ETH_HDR_LEN;
        break;
    BULLSEYE_EXCLUDE_BLOCK_START
    default:
        cq_logpanic("Unknown transport type: %d", m_transport_type);
        break;
    BULLSEYE_EXCLUDE_BLOCK_END
    }

    if (m_b_is_rx)
        vma_stats_instance_create_cq_block(m_p_cq_stat);

    cq_logdbg("Created CQ as %s with fd[%d] and of size %d elements (ibv_cq_hndl=%p)",
              (m_b_is_rx ? "Rx" : "Tx"), get_channel_fd(), cq_size, m_p_ibv_cq);
}
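/*
 * Illustrative sketch (not VMA code): the bare libibverbs sequence that the
 * constructor above wraps -- open a device context, create a completion event
 * channel, then create a CQ bound to that channel. Error handling is reduced
 * to early returns, and picking the first device in the list is an assumption
 * made only for this example.
 */
#include <infiniband/verbs.h>
#include <stdio.h>

int create_cq_example(int cq_size)
{
    int num_devices = 0;
    struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
    if (!dev_list || num_devices == 0)
        return -1;

    struct ibv_context *ctx = ibv_open_device(dev_list[0]);
    ibv_free_device_list(dev_list);
    if (!ctx)
        return -1;

    /* Channel that ibv_get_cq_event() blocks on when the CQ is armed. */
    struct ibv_comp_channel *channel = ibv_create_comp_channel(ctx);
    if (!channel) {
        ibv_close_device(ctx);
        return -1;
    }

    /* The cq_context (3rd arg) is returned with every CQ event; VMA passes `this`. */
    struct ibv_cq *cq = ibv_create_cq(ctx, cq_size, NULL, channel, 0);
    if (!cq) {
        fprintf(stderr, "ibv_create_cq failed\n");
        ibv_destroy_comp_channel(channel);
        ibv_close_device(ctx);
        return -1;
    }

    /* ... post work requests, poll or arm the CQ ... */

    ibv_destroy_cq(cq);
    ibv_destroy_comp_channel(channel);
    ibv_close_device(ctx);
    return 0;
}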
ssize_t dst_entry_udp::pass_buff_to_neigh(const iovec *p_iov, size_t &sz_iov, uint16_t packet_id)
{
    m_header_neigh.init();
    m_header_neigh.configure_udp_header(m_dst_port, m_src_port);

    packet_id = (safe_mce_sys().thread_mode > THREAD_MODE_SINGLE) ?
            atomic_fetch_and_inc(&m_a_tx_ip_id) : m_n_tx_ip_id++;
    packet_id = htons(packet_id);

    return dst_entry::pass_buff_to_neigh(p_iov, sz_iov, packet_id);
}
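/*
 * Illustrative sketch (not VMA code): the IP-ID selection pattern used above,
 * isolated. In multi-threaded mode the datagram ID comes from an atomic
 * fetch-and-increment so concurrent senders never hand out the same ID; in
 * single-threaded mode a plain counter avoids the atomic's cost. The
 * multi_threaded flag and both counters are hypothetical stand-ins for
 * safe_mce_sys().thread_mode, m_a_tx_ip_id and m_n_tx_ip_id.
 */
#include <arpa/inet.h>   /* htons */
#include <stdatomic.h>
#include <stdint.h>

static atomic_uint g_atomic_ip_id;   /* shared across sending threads */
static uint16_t g_plain_ip_id;       /* single-threaded fast path */

static uint16_t next_ip_id(int multi_threaded)
{
    uint16_t id = multi_threaded
            ? (uint16_t)atomic_fetch_add(&g_atomic_ip_id, 1)
            : g_plain_ip_id++;
    return htons(id);    /* the IP header stores the ID in network byte order */
}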
ssize_t dst_entry_udp::fast_send(const iovec* p_iov, const ssize_t sz_iov, bool b_blocked /*=true*/, bool is_rexmit /*=false*/, bool dont_inline /*=false*/)
{
    NOT_IN_USE(is_rexmit);

    tx_packet_template_t *p_pkt;
    mem_buf_desc_t* p_mem_buf_desc = NULL, *tmp;
    uint16_t packet_id = 0;
    bool b_need_sw_csum;

    // Calc user data payload size
    ssize_t sz_data_payload = 0;
    for (ssize_t i = 0; i < sz_iov; i++)
        sz_data_payload += p_iov[i].iov_len;

    if (unlikely(sz_data_payload > 65536)) {
        dst_udp_logfunc("sz_data_payload=%d, to_port=%d, local_port=%d, b_blocked=%s",
                sz_data_payload, ntohs(m_dst_port), ntohs(m_src_port), b_blocked ? "true" : "false");
        dst_udp_logfunc("sz_data_payload=%d exceeds max of 64KB", sz_data_payload);
        errno = EMSGSIZE;
        return -1;
    }

    // Calc udp payload size
    size_t sz_udp_payload = sz_data_payload + sizeof(struct udphdr);

    if (!dont_inline && (sz_iov == 1 && (sz_data_payload + m_header.m_total_hdr_len) < m_max_inline)) {
        m_p_send_wqe = &m_inline_send_wqe;

        // m_sge[0].addr already points to the header,
        // so we just need to update the payload addr + len
        m_sge[1].length = p_iov[0].iov_len;
        m_sge[1].addr = (uintptr_t)p_iov[0].iov_base;

        m_header.m_header.hdr.m_udp_hdr.len = htons((uint16_t)sz_udp_payload);
        m_header.m_header.hdr.m_ip_hdr.tot_len = htons(m_header.m_ip_header_len + sz_udp_payload);

#ifdef VMA_NO_HW_CSUM
        dst_udp_logfunc("using SW checksum calculation");
        m_header.m_header.hdr.m_ip_hdr.check = 0; // use 0 at csum calculation time
        m_header.m_header.hdr.m_ip_hdr.check = csum((unsigned short*)&m_header.m_header.hdr.m_ip_hdr,
                m_header.m_header.hdr.m_ip_hdr.ihl * 2);
#endif

        // Get a batch of tx buf descriptors and data buffers
        if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) {
            m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, safe_mce_sys().tx_bufs_batch_udp);
        }

        p_mem_buf_desc = m_p_tx_mem_buf_desc_list;
        if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) {
            if (b_blocked) {
                dst_udp_logdbg("Error when blocking for next tx buffer (errno=%d %m)", errno);
            } else {
                dst_udp_logfunc("Packet dropped. NonBlocked call but not enough tx buffers. Returning OK");
                if (!safe_mce_sys().tx_nonblocked_eagains)
                    return sz_data_payload;
            }
            errno = EAGAIN;
            return -1;
        } else {
            m_p_tx_mem_buf_desc_list = m_p_tx_mem_buf_desc_list->p_next_desc;
            set_tx_buff_list_pending(false);
        }
        p_mem_buf_desc->p_next_desc = NULL;

        m_inline_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc;
        m_p_ring->send_ring_buffer(m_id, m_p_send_wqe, b_blocked);

        if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) {
            m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, safe_mce_sys().tx_bufs_batch_udp);
        }
    } else {
        // Find number of ip fragments (-> packets, buffers, buffer descs...)
        int n_num_frags = 1;
        b_need_sw_csum = false;
        m_p_send_wqe = &m_not_inline_send_wqe;

        // Usually max inline < MTU!
        if (sz_udp_payload > m_max_ip_payload_size) {
            b_need_sw_csum = true;
            n_num_frags = (sz_udp_payload + m_max_ip_payload_size - 1) / m_max_ip_payload_size;
            packet_id = (safe_mce_sys().thread_mode > THREAD_MODE_SINGLE) ?
                    atomic_fetch_and_inc(&m_a_tx_ip_id) : m_n_tx_ip_id++;
            packet_id = htons(packet_id);
        }

#ifdef VMA_NO_HW_CSUM
        b_need_sw_csum = true;
#endif

        dst_udp_logfunc("udp info: payload_sz=%d, frags=%d, src_port=%d, dst_port=%d, blocked=%s, ",
                sz_data_payload, n_num_frags, ntohs(m_header.m_header.hdr.m_udp_hdr.source),
                ntohs(m_dst_port), b_blocked ? "true" : "false");

        // Get all needed tx buf descriptors and data buffers
        p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, b_blocked, n_num_frags);
        if (unlikely(p_mem_buf_desc == NULL)) {
            if (b_blocked) {
                dst_udp_logdbg("Error when blocking for next tx buffer (errno=%d %m)", errno);
            } else {
                dst_udp_logfunc("Packet dropped. NonBlocked call but not enough tx buffers. Returning OK");
                if (!safe_mce_sys().tx_nonblocked_eagains)
                    return sz_data_payload;
            }
            errno = EAGAIN;
            return -1;
        }

        // Int for counting offset inside the ip datagram payload
        uint32_t n_ip_frag_offset = 0;
        size_t sz_user_data_offset = 0;

        while (n_num_frags--) {
            // Calc this ip datagram fragment size (include any udp header)
            size_t sz_ip_frag = min(m_max_ip_payload_size, (sz_udp_payload - n_ip_frag_offset));
            size_t sz_user_data_to_copy = sz_ip_frag;
            size_t hdr_len = m_header.m_transport_header_len + m_header.m_ip_header_len; // Add count of L2 (ipoib or mac) header length

            if (safe_mce_sys().tx_prefetch_bytes) {
                prefetch_range(p_mem_buf_desc->p_buffer + m_header.m_transport_header_tx_offset,
                        min(sz_ip_frag, (size_t)safe_mce_sys().tx_prefetch_bytes));
            }

            p_pkt = (tx_packet_template_t*)p_mem_buf_desc->p_buffer;

            uint16_t frag_off = 0;
            if (n_num_frags) {
                frag_off |= MORE_FRAGMENTS_FLAG;
            }

            if (n_ip_frag_offset == 0) {
                m_header.copy_l2_ip_udp_hdr(p_pkt);
                // Add count of udp header length
                hdr_len += sizeof(udphdr);

                // Copy less from user data
                sz_user_data_to_copy -= sizeof(udphdr);

                // Only the first fragment carries the udp header
                p_pkt->hdr.m_udp_hdr.len = htons((uint16_t)sz_udp_payload);
            } else {
                m_header.copy_l2_ip_hdr(p_pkt);
                frag_off |= FRAGMENT_OFFSET & (n_ip_frag_offset / 8);
            }

            p_pkt->hdr.m_ip_hdr.frag_off = htons(frag_off);

            // Update ip header specific values
            p_pkt->hdr.m_ip_hdr.id = packet_id;
            p_pkt->hdr.m_ip_hdr.tot_len = htons(m_header.m_ip_header_len + sz_ip_frag);

            // Calc payload start point (after the udp header if present, else just after ip header)
            uint8_t* p_payload = p_mem_buf_desc->p_buffer + m_header.m_transport_header_tx_offset + hdr_len;

            // Copy user data to our tx buffers
            int ret = memcpy_fromiovec(p_payload, p_iov, sz_iov, sz_user_data_offset, sz_user_data_to_copy);
            BULLSEYE_EXCLUDE_BLOCK_START
            if (ret != (int)sz_user_data_to_copy) {
                dst_udp_logerr("memcpy_fromiovec error (sz_user_data_to_copy=%d, ret=%d)", sz_user_data_to_copy, ret);
                m_p_ring->mem_buf_tx_release(p_mem_buf_desc, true);
                errno = EINVAL;
                return -1;
            }
            BULLSEYE_EXCLUDE_BLOCK_END

            if (b_need_sw_csum) {
                dst_udp_logfunc("ip fragmentation detected, using SW checksum calculation");
                p_pkt->hdr.m_ip_hdr.check = 0; // use 0 at csum calculation time
                p_pkt->hdr.m_ip_hdr.check = csum((unsigned short*)&p_pkt->hdr.m_ip_hdr, p_pkt->hdr.m_ip_hdr.ihl * 2);
                m_p_send_wqe_handler->disable_hw_csum(m_not_inline_send_wqe);
            } else {
                dst_udp_logfunc("using HW checksum calculation");
                m_p_send_wqe_handler->enable_hw_csum(m_not_inline_send_wqe);
            }

            m_sge[1].addr = (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)m_header.m_transport_header_tx_offset);
            m_sge[1].length = sz_user_data_to_copy + hdr_len;
            m_not_inline_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc;

            dst_udp_logfunc("%s packet_sz=%d, payload_sz=%d, ip_offset=%d id=%d", m_header.to_str().c_str(),
                    m_sge[1].length - m_header.m_transport_header_len, sz_user_data_to_copy,
                    n_ip_frag_offset, ntohs(packet_id));

            tmp = p_mem_buf_desc->p_next_desc;
            p_mem_buf_desc->p_next_desc = NULL;

            // We don't check the return value of post send; once it reaches the HW we consider our job done
            m_p_ring->send_ring_buffer(m_id, m_p_send_wqe, b_blocked);

            p_mem_buf_desc = tmp;

            // Update ip frag offset position
            n_ip_frag_offset += sz_ip_frag;

            // Update user data start offset copy location
            sz_user_data_offset += sz_user_data_to_copy;

        } // while(n_num_frags)
    }

    // If all went well :) then return the user data count transmitted
    return sz_data_payload;
}
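/*
 * Illustrative sketch (not VMA code): the fragmentation arithmetic used in
 * fast_send(), isolated. Fragment count is a ceiling division of the UDP
 * payload (user data + 8-byte UDP header) by the max IP payload per packet;
 * the IPv4 frag_off field carries the offset in 8-byte units, with the MF
 * ("more fragments") bit set on every fragment except the last. IP_MF and
 * IP_OFFMASK from <netinet/ip.h> stand in for MORE_FRAGMENTS_FLAG and
 * FRAGMENT_OFFSET above; the printf layout is just for the example.
 */
#include <netinet/ip.h>   /* IP_MF, IP_OFFMASK */
#include <netinet/udp.h>  /* struct udphdr */
#include <stddef.h>
#include <stdio.h>

static void show_fragments(size_t data_len, size_t max_ip_payload)
{
    size_t udp_payload = data_len + sizeof(struct udphdr);
    /* Ceiling division, same as (x + y - 1) / y in fast_send() */
    size_t num_frags = (udp_payload + max_ip_payload - 1) / max_ip_payload;
    size_t offset = 0;

    for (size_t i = 0; i < num_frags; i++) {
        size_t frag_len = udp_payload - offset;
        if (frag_len > max_ip_payload)
            frag_len = max_ip_payload;

        unsigned short frag_off = (unsigned short)(IP_OFFMASK & (offset / 8));
        if (i + 1 < num_frags)
            frag_off |= IP_MF;    /* more fragments follow */

        printf("frag %zu: ip_payload=%zu bytes, frag_off=0x%04x%s\n",
               i, frag_len, frag_off,
               (i == 0) ? " (carries the UDP header)" : "");
        offset += frag_len;
    }
}

int main(void)
{
    /* e.g. 8000 bytes of user data over a 1500-byte MTU => 1480-byte max IP payload */
    show_fragments(8000, 1480);
    return 0;
}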