static netdev_tx_t ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct ibmveth_adapter *adapter = netdev_priv(netdev); unsigned int desc_flags; union ibmveth_buf_desc descs[6]; int last, i; int force_bounce = 0; dma_addr_t dma_addr; unsigned long mss = 0; /* * veth handles a maximum of 6 segments including the header, so * we have to linearize the skb if there are more than this. */ if (skb_shinfo(skb)->nr_frags > 5 && __skb_linearize(skb)) { netdev->stats.tx_dropped++; goto out; } /* veth can't checksum offload UDP */ if (skb->ip_summed == CHECKSUM_PARTIAL && ((skb->protocol == htons(ETH_P_IP) && ip_hdr(skb)->protocol != IPPROTO_TCP) || (skb->protocol == htons(ETH_P_IPV6) && ipv6_hdr(skb)->nexthdr != IPPROTO_TCP)) && skb_checksum_help(skb)) { netdev_err(netdev, "tx: failed to checksum packet\n"); netdev->stats.tx_dropped++; goto out; } desc_flags = IBMVETH_BUF_VALID; if (skb_is_gso(skb) && adapter->fw_large_send_support) desc_flags |= IBMVETH_BUF_LRG_SND; if (skb->ip_summed == CHECKSUM_PARTIAL) { unsigned char *buf = skb_transport_header(skb) + skb->csum_offset; desc_flags |= (IBMVETH_BUF_NO_CSUM | IBMVETH_BUF_CSUM_GOOD); /* Need to zero out the checksum */ buf[0] = 0; buf[1] = 0; } retry_bounce: memset(descs, 0, sizeof(descs)); /* * If a linear packet is below the rx threshold then * copy it into the static bounce buffer. This avoids the * cost of a TCE insert and remove. */ if (force_bounce || (!skb_is_nonlinear(skb) && (skb->len < tx_copybreak))) { skb_copy_from_linear_data(skb, adapter->bounce_buffer, skb->len); descs[0].fields.flags_len = desc_flags | skb->len; descs[0].fields.address = adapter->bounce_buffer_dma; if (ibmveth_send(adapter, descs, 0)) { adapter->tx_send_failed++; netdev->stats.tx_dropped++; } else { netdev->stats.tx_packets++; netdev->stats.tx_bytes += skb->len; } goto out; } /* Map the header */ dma_addr = dma_map_single(&adapter->vdev->dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE); if (dma_mapping_error(&adapter->vdev->dev, dma_addr)) goto map_failed; descs[0].fields.flags_len = desc_flags | skb_headlen(skb); descs[0].fields.address = dma_addr; /* Map the frags */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; dma_addr = skb_frag_dma_map(&adapter->vdev->dev, frag, 0, skb_frag_size(frag), DMA_TO_DEVICE); if (dma_mapping_error(&adapter->vdev->dev, dma_addr)) goto map_failed_frags; descs[i+1].fields.flags_len = desc_flags | skb_frag_size(frag); descs[i+1].fields.address = dma_addr; } if (skb_is_gso(skb)) { if (adapter->fw_large_send_support) { mss = (unsigned long)skb_shinfo(skb)->gso_size; adapter->tx_large_packets++; } else if (!skb_is_gso_v6(skb)) { /* Put -1 in the IP checksum to tell phyp it * is a largesend packet. Put the mss in * the TCP checksum. */ ip_hdr(skb)->check = 0xffff; tcp_hdr(skb)->check = cpu_to_be16(skb_shinfo(skb)->gso_size); adapter->tx_large_packets++; } } if (ibmveth_send(adapter, descs, mss)) { adapter->tx_send_failed++; netdev->stats.tx_dropped++; } else { netdev->stats.tx_packets++; netdev->stats.tx_bytes += skb->len; } dma_unmap_single(&adapter->vdev->dev, descs[0].fields.address, descs[0].fields.flags_len & IBMVETH_BUF_LEN_MASK, DMA_TO_DEVICE); for (i = 1; i < skb_shinfo(skb)->nr_frags + 1; i++) dma_unmap_page(&adapter->vdev->dev, descs[i].fields.address, descs[i].fields.flags_len & IBMVETH_BUF_LEN_MASK, DMA_TO_DEVICE); out: dev_consume_skb_any(skb); return NETDEV_TX_OK; map_failed_frags: last = i+1; for (i = 0; i < last; i++) dma_unmap_page(&adapter->vdev->dev, descs[i].fields.address, descs[i].fields.flags_len & IBMVETH_BUF_LEN_MASK, DMA_TO_DEVICE); map_failed: if (!firmware_has_feature(FW_FEATURE_CMO)) netdev_err(netdev, "tx: unable to map xmit buffer\n"); adapter->tx_map_failed++; if (skb_linearize(skb)) { netdev->stats.tx_dropped++; goto out; } force_bounce = 1; goto retry_bounce; }
static int netdev_send(struct vport *vport, struct sk_buff *skb) { struct netdev_vport *netdev_vport = netdev_vport_priv(vport); int mtu = netdev_vport->dev->mtu; int len; if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { if (net_ratelimit()) pr_warn("%s: dropped over-mtu packet: %d > %d\n", ovs_dp_name(vport->dp), packet_length(skb), mtu); goto error; } if (unlikely(skb_warn_if_lro(skb))) goto error; skb->dev = netdev_vport->dev; forward_ip_summed(skb, true); if (vlan_tx_tag_present(skb) && !dev_supports_vlan_tx(skb->dev)) { int features; features = netif_skb_features(skb); if (!vlan_tso) features &= ~(NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO | NETIF_F_FSO); if (netif_needs_gso(skb, features)) { struct sk_buff *nskb; nskb = skb_gso_segment(skb, features); if (!nskb) { if (unlikely(skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) { kfree_skb(skb); return 0; } skb_shinfo(skb)->gso_type &= ~SKB_GSO_DODGY; goto tag; } if (IS_ERR(nskb)) { kfree_skb(skb); return 0; } consume_skb(skb); skb = nskb; len = 0; do { nskb = skb->next; skb->next = NULL; skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); if (likely(skb)) { len += skb->len; vlan_set_tci(skb, 0); dev_queue_xmit(skb); } skb = nskb; } while (skb); return len; } tag: skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); if (unlikely(!skb)) return 0; vlan_set_tci(skb, 0); } len = skb->len; dev_queue_xmit(skb); return len; error: kfree_skb(skb); ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); return 0; }
int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); u32 mtu; if (net->ipv6.devconf_all->forwarding == 0) goto error; if (skb_warn_if_lro(skb)) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); goto drop; } if (skb->pkt_type != PACKET_HOST) goto drop; skb_forward_csum(skb); /* * We DO NOT make any processing on * RA packets, pushing them to user level AS IS * without ane WARRANTY that application will be able * to interpret them. The reason is that we * cannot make anything clever here. * * We are not end-node, so that if packet contains * AH/ESP, we cannot make anything. * Defragmentation also would be mistake, RA packets * cannot be fragmented, because there is no warranty * that different fragments will go along one path. --ANK */ if (opt->ra) { u8 *ptr = skb_network_header(skb) + opt->ra; if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) return 0; } /* * check and decrement ttl */ if (hdr->hop_limit <= 1) { /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; } /* XXX: idev->cnf.proxy_ndp? */ if ((net->ipv6.devconf_all->proxy_ndp == 1 && pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) || net->ipv6.devconf_all->proxy_ndp >= 2) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) return ip6_input(skb); else if (proxied < 0) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); goto drop; } dst = skb_dst(skb); /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. */ if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct rt6_info *rt; /* * incoming and outgoing devices are the same * send a redirect. */ rt = (struct rt6_info *) dst; if (rt->rt6i_flags & RTF_GATEWAY) target = &rt->rt6i_gateway; else target = &hdr->daddr; if (!rt->rt6i_peer) rt6_bind_peer(rt, 1); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) ndisc_send_redirect(skb, target); } else { int addrtype = ipv6_addr_type(&hdr->saddr); /* This check is security critical. */ if (addrtype == IPV6_ADDR_ANY || addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) goto error; if (addrtype & IPV6_ADDR_LINKLOCAL) { icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOT_NEIGHBOUR, 0); goto error; } } mtu = dst_mtu(dst); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->len > mtu && !skb_is_gso(skb)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS); IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } if (skb_cow(skb, dst->dev->hard_header_len)) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); goto drop; } hdr = ipv6_hdr(skb); /* Mangling hops number delayed to point after skb COW */ hdr->hop_limit--; IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); error: IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); drop: kfree_skb(skb); return -EINVAL; }
static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) { struct mlx5_wq_cyc *wq = &sq->wq; u16 pi = sq->pc & wq->sz_m1; struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; struct mlx5_wqe_eth_seg *eseg = &wqe->eth; struct mlx5_wqe_data_seg *dseg; u8 opcode = MLX5_OPCODE_SEND; dma_addr_t dma_addr = 0; u16 headlen; u16 ds_cnt; u16 ihs; int i; memset(wqe, 0, sizeof(*wqe)); if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM; else sq->stats.csum_offload_none++; if (skb_is_gso(skb)) { u32 payload_len; eseg->mss = cpu_to_be16(skb_shinfo(skb)->gso_size); opcode = MLX5_OPCODE_LSO; ihs = skb_transport_offset(skb) + tcp_hdrlen(skb); payload_len = skb->len - ihs; MLX5E_TX_SKB_CB(skb)->num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs; sq->stats.tso_packets++; sq->stats.tso_bytes += payload_len; } else { ihs = mlx5e_get_inline_hdr_size(sq, skb); MLX5E_TX_SKB_CB(skb)->num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); } skb_copy_from_linear_data(skb, eseg->inline_hdr_start, ihs); skb_pull_inline(skb, ihs); eseg->inline_hdr_sz = cpu_to_be16(ihs); ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr_start), MLX5_SEND_WQE_DS); dseg = (struct mlx5_wqe_data_seg *)cseg + ds_cnt; MLX5E_TX_SKB_CB(skb)->num_dma = 0; headlen = skb_headlen(skb); if (headlen) { dma_addr = dma_map_single(sq->pdev, skb->data, headlen, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(sq->pdev, dma_addr))) goto dma_unmap_wqe_err; dseg->addr = cpu_to_be64(dma_addr); dseg->lkey = sq->mkey_be; dseg->byte_count = cpu_to_be32(headlen); mlx5e_dma_push(sq, dma_addr, headlen); MLX5E_TX_SKB_CB(skb)->num_dma++; dseg++; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i]; int fsz = skb_frag_size(frag); dma_addr = skb_frag_dma_map(sq->pdev, frag, 0, fsz, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(sq->pdev, dma_addr))) goto dma_unmap_wqe_err; dseg->addr = cpu_to_be64(dma_addr); dseg->lkey = sq->mkey_be; dseg->byte_count = cpu_to_be32(fsz); mlx5e_dma_push(sq, dma_addr, fsz); MLX5E_TX_SKB_CB(skb)->num_dma++; dseg++; } ds_cnt += MLX5E_TX_SKB_CB(skb)->num_dma; cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode); cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); sq->skb[pi] = skb; MLX5E_TX_SKB_CB(skb)->num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->pc += MLX5E_TX_SKB_CB(skb)->num_wqebbs; netdev_tx_sent_queue(sq->txq, MLX5E_TX_SKB_CB(skb)->num_bytes); if (unlikely(!mlx5e_sq_has_room_for(sq, MLX5E_SQ_STOP_ROOM))) { netif_tx_stop_queue(sq->txq); sq->stats.stopped++; } if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) { cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; mlx5e_tx_notify_hw(sq, wqe); } /* fill sq edge with nops to avoid wqe wrap around */ while ((sq->pc & wq->sz_m1) > sq->edge) mlx5e_send_nop(sq, false); sq->stats.packets++; return NETDEV_TX_OK; dma_unmap_wqe_err: sq->stats.dropped++; mlx5e_dma_unmap_wqe_err(sq, skb); dev_kfree_skb_any(skb); return NETDEV_TX_OK; }
static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct xenvif *vif = netdev_priv(dev); struct xenvif_queue *queue = NULL; unsigned int num_queues = vif->num_queues; u16 index; int min_slots_needed; BUG_ON(skb->dev != dev); /* Drop the packet if queues are not set up */ if (num_queues < 1) goto drop; /* Obtain the queue to be used to transmit this packet */ index = skb_get_queue_mapping(skb); if (index >= num_queues) { pr_warn_ratelimited("Invalid queue %hu for packet on interface %s\n.", index, vif->dev->name); index %= num_queues; } queue = &vif->queues[index]; /* Drop the packet if queue is not ready */ if (queue->task == NULL || queue->dealloc_task == NULL || !xenvif_schedulable(vif)) goto drop; /* At best we'll need one slot for the header and one for each * frag. */ min_slots_needed = 1 + skb_shinfo(skb)->nr_frags; /* If the skb is GSO then we'll also need an extra slot for the * metadata. */ if (skb_is_gso(skb)) min_slots_needed++; /* If the skb can't possibly fit in the remaining slots * then turn off the queue to give the ring a chance to * drain. */ if (!xenvif_rx_ring_slots_available(queue, min_slots_needed)) { queue->wake_queue.function = xenvif_wake_queue_callback; queue->wake_queue.data = (unsigned long)queue; xenvif_stop_queue(queue); mod_timer(&queue->wake_queue, jiffies + rx_drain_timeout_jiffies); } skb_queue_tail(&queue->rx_queue, skb); xenvif_kick_thread(queue); return NETDEV_TX_OK; drop: vif->dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; }
/** * stmmac_xmit: * @skb : the socket buffer * @dev : device pointer * Description : Tx entry point of the driver. */ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); unsigned int txsize = priv->dma_tx_size; unsigned int entry; int i, csum_insertion = 0; int nfrags = skb_shinfo(skb)->nr_frags; struct dma_desc *desc, *first; if (unlikely(stmmac_tx_avail(priv) < nfrags + 1)) { if (!netif_queue_stopped(dev)) { netif_stop_queue(dev); /* This is a hard error, log it. */ pr_err("%s: BUG! Tx Ring full when queue awake\n", __func__); } return NETDEV_TX_BUSY; } entry = priv->cur_tx % txsize; #ifdef STMMAC_XMIT_DEBUG if ((skb->len > ETH_FRAME_LEN) || nfrags) pr_info("stmmac xmit:\n" "\tskb addr %p - len: %d - nopaged_len: %d\n" "\tn_frags: %d - ip_summed: %d - %s gso\n", skb, skb->len, skb_headlen(skb), nfrags, skb->ip_summed, !skb_is_gso(skb) ? "isn't" : "is"); #endif csum_insertion = (skb->ip_summed == CHECKSUM_PARTIAL); desc = priv->dma_tx + entry; first = desc; #ifdef STMMAC_XMIT_DEBUG if ((nfrags > 0) || (skb->len > ETH_FRAME_LEN)) pr_debug("stmmac xmit: skb len: %d, nopaged_len: %d,\n" "\t\tn_frags: %d, ip_summed: %d\n", skb->len, skb_headlen(skb), nfrags, skb->ip_summed); #endif priv->tx_skbuff[entry] = skb; if (unlikely(skb->len >= BUF_SIZE_4KiB)) { entry = stmmac_handle_jumbo_frames(skb, dev, csum_insertion); desc = priv->dma_tx + entry; } else { unsigned int nopaged_len = skb_headlen(skb); desc->des2 = dma_map_single(priv->device, skb->data, nopaged_len, DMA_TO_DEVICE); priv->hw->desc->prepare_tx_desc(desc, 1, nopaged_len, csum_insertion); } for (i = 0; i < nfrags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; int len = frag->size; entry = (++priv->cur_tx) % txsize; desc = priv->dma_tx + entry; TX_DBG("\t[entry %d] segment len: %d\n", entry, len); desc->des2 = dma_map_page(priv->device, frag->page, frag->page_offset, len, DMA_TO_DEVICE); priv->tx_skbuff[entry] = NULL; priv->hw->desc->prepare_tx_desc(desc, 0, len, csum_insertion); wmb(); priv->hw->desc->set_tx_owner(desc); } /* Interrupt on completition only for the latest segment */ priv->hw->desc->close_tx_desc(desc); #ifdef CONFIG_STMMAC_TIMER /* Clean IC while using timer */ if (likely(priv->tm->enable)) priv->hw->desc->clear_tx_ic(desc); #endif wmb(); /* To avoid raise condition */ priv->hw->desc->set_tx_owner(first); priv->cur_tx++; #ifdef STMMAC_XMIT_DEBUG if (netif_msg_pktdata(priv)) { pr_info("stmmac xmit: current=%d, dirty=%d, entry=%d, " "first=%p, nfrags=%d\n", (priv->cur_tx % txsize), (priv->dirty_tx % txsize), entry, first, nfrags); display_ring(priv->dma_tx, txsize); pr_info(">>> frame to be transmitted: "); print_pkt(skb->data, skb->len); } #endif if (unlikely(stmmac_tx_avail(priv) <= (MAX_SKB_FRAGS + 1))) { TX_DBG("%s: stop transmitted packets\n", __func__); netif_stop_queue(dev); } dev->stats.tx_bytes += skb->len; skb_tx_timestamp(skb); priv->hw->dma->enable_dma_transmission(priv->ioaddr); return NETDEV_TX_OK; }
/* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, struct ipv6_txoptions *opt, int tclass) { struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; u32 mtu; if (opt) { unsigned int head_room; /* First: exthdrs may take lots of space (~8K for now) MAX_HEADER is not enough. */ head_room = opt->opt_nflen + opt->opt_flen; seg_len += head_room; head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); if (skb_headroom(skb) < head_room) { struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); if (!skb2) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -ENOBUFS; } consume_skb(skb); skb = skb2; /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically, * it is safe to call in our context (socket lock not held) */ skb_set_owner_w(skb, (struct sock *)sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); /* * Fill in the IPv6 header */ if (np) hlimit = np->hop_limit; if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, np->autoflowlabel, fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; hdr->hop_limit = hlimit; hdr->saddr = fl6->saddr; hdr->daddr = *first_hop; skb->protocol = htons(ETH_P_IPV6); skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); /* hooks should never assume socket lock is held. * we promote our socket to non const */ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, (struct sock *)sk, skb, NULL, dst->dev, dst_output); } skb->dev = dst->dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const */ ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; }
int ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rt6_info *rt; /* Route to the other host */ int mtu; int local; EnterFunction(10); /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; p = skb_header_pointer(skb, sizeof(struct ipv6hdr), sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, 0, (IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR)))) goto tx_error_icmp; local = __ip_vs_is_local_route6(rt); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); goto tx_error_put; } } #endif /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); goto tx_error_put; } /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu && !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); skb->dev = net->loopback_dev; } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); goto tx_error_put; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) goto tx_error_put; if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) goto tx_error; ipv6_hdr(skb)->daddr = cp->daddr.in6; if (!local || !skb->dev) { /* drop the old route when skb is not shared */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } else { /* destined to loopback, do we need to change route? */ dst_release(&rt->dst); } IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); LeaveFunction(10); return NF_STOLEN; tx_error_icmp: dst_link_failure(skb); tx_error: LeaveFunction(10); kfree_skb(skb); return NF_STOLEN; tx_error_put: dst_release(&rt->dst); goto tx_error; }
/* * IP Tunneling transmitter * * This function encapsulates the packet in a new IP packet, its * destination will be set to cp->daddr. Most code of this function * is taken from ipip.c. * * It is used in VS/TUN cluster. The load balancer selects a real * server from a cluster based on a scheduling algorithm, * encapsulates the request packet and forwards it to the selected * server. For example, all real servers are configured with * "ifconfig tunl0 <Virtual IP Address> up". When the server receives * the encapsulated packet, it will decapsulate the packet, processe * the request and return the response packets directly to the client * without passing the load balancer. This can greatly increase the * scalability of virtual server. * * Used for ANY protocol */ int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); struct rtable *rt; /* Route to the other host */ __be32 saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = ip_hdr(skb); u8 tos = old_iph->tos; __be16 df; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; int ret; EnterFunction(10); if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, RT_TOS(tos), IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_CONNECT, &saddr))) goto tx_error_icmp; if (rt->rt_flags & RTCF_LOCAL) { ip_rt_put(rt); IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); } tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto tx_error_put; } if (rt_is_output_route(skb_rtable(skb))) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); /* Copy DF, reset fragment offset and MF */ df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0; if (df && mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; } /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); kfree_skb(skb); IP_VS_ERR_RL("%s(): no memory\n", __func__); return NF_STOLEN; } consume_skb(skb); skb = new_skb; old_iph = ip_hdr(skb); } skb->transport_header = skb->network_header; /* fix old IP header checksum */ ip_send_check(old_iph); skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); /* drop old route */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. */ iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; iph->protocol = IPPROTO_IPIP; iph->tos = tos; iph->daddr = cp->daddr.ip; iph->saddr = saddr; iph->ttl = old_iph->ttl; ip_select_ident(iph, &rt->dst, NULL); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; ret = IP_VS_XMIT_TUNNEL(skb, cp); if (ret == NF_ACCEPT) ip_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; tx_error_icmp: dst_link_failure(skb); tx_error: kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; tx_error_put: ip_rt_put(rt); goto tx_error; }
int ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum) { struct rt6_info *rt; /* Route to the other host */ int mtu; int rc; int local; int rt_mode; EnterFunction(10); /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be forwarded directly here, because there is no need to translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) rc = cp->packet_xmit(skb, cp, pp); else rc = NF_ACCEPT; /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); goto out; } /* * mangle and send the packet here (only for VS/NAT) */ /* LOCALNODE from FORWARD hook is not supported */ rt_mode = (hooknum != NF_INET_FORWARD) ? IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, 0, rt_mode))) goto tx_error_icmp; local = __ip_vs_is_local_route6(rt); /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI6\n", __func__, &cp->daddr.in6); goto tx_error_put; } } #endif /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); goto tx_error_put; } /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu && !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); skb->dev = net->loopback_dev; } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) goto tx_error_put; if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; ip_vs_nat_icmp_v6(skb, pp, cp, 0); if (!local || !skb->dev) { /* drop the old route when skb is not shared */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } else { /* destined to loopback, do we need to change route? */ dst_release(&rt->dst); } /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); rc = NF_STOLEN; goto out; tx_error_icmp: dst_link_failure(skb); tx_error: dev_kfree_skb(skb); rc = NF_STOLEN; out: LeaveFunction(10); return rc; tx_error_put: dst_release(&rt->dst); goto tx_error; }
/* * NAT transmitter (only for outside-to-inside nat forwarding) * Not used for related ICMP */ int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rtable *rt; /* Route to the other host */ int mtu; struct iphdr *iph = ip_hdr(skb); int local; EnterFunction(10); /* check if it is a connection of no-client-port */ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { __be16 _pt, *p; p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); if (p == NULL) goto tx_error; ip_vs_conn_fill_cport(cp, *p); IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, RT_TOS(iph->tos), IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR, NULL))) goto tx_error_icmp; local = rt->rt_flags & RTCF_LOCAL; /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to local address"); goto tx_error_put; } } #endif /* From world but DNAT to loopback address? */ if (local && ipv4_is_loopback(cp->daddr.ip) && rt_is_input_route(skb_rtable(skb))) { IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to loopback address"); goto tx_error_put; } /* MTU checking */ mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && !skb_is_gso(skb)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); goto tx_error_put; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, sizeof(struct iphdr))) goto tx_error_put; if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) goto tx_error_put; ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); if (!local) { /* drop old route */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } else { ip_rt_put(rt); /* * Some IPv4 replies get local address from routes, * not from iph, so while we DNAT after routing * we need this second input/output route. */ if (!__ip_vs_reroute_locally(skb)) goto tx_error; } IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); LeaveFunction(10); return NF_STOLEN; tx_error_icmp: dst_link_failure(skb); tx_error: kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; tx_error_put: ip_rt_put(rt); goto tx_error; }
/* * ICMP packet transmitter * called by the ip_vs_in_icmp */ int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset, unsigned int hooknum) { struct rtable *rt; /* Route to the other host */ int mtu; int rc; int local; int rt_mode; EnterFunction(10); /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be forwarded directly here, because there is no need to translate address/port back */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { if (cp->packet_xmit) rc = cp->packet_xmit(skb, cp, pp); else rc = NF_ACCEPT; /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); goto out; } /* * mangle and send the packet here (only for VS/NAT) */ /* LOCALNODE from FORWARD hook is not supported */ rt_mode = (hooknum != NF_INET_FORWARD) ? IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, RT_TOS(ip_hdr(skb)->tos), rt_mode, NULL))) goto tx_error_icmp; local = rt->rt_flags & RTCF_LOCAL; /* * Avoid duplicate tuple in reply direction for NAT traffic * to local address when connection is sync-ed */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (cp->flags & IP_VS_CONN_F_SYNC && local) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI4\n", __func__, &cp->daddr.ip); goto tx_error_put; } } #endif /* From world but DNAT to loopback address? */ if (local && ipv4_is_loopback(cp->daddr.ip) && rt_is_input_route(skb_rtable(skb))) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI4\n", __func__, &cp->daddr.ip); goto tx_error_put; } /* MTU checking */ mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) && !skb_is_gso(skb)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; } /* copy-on-write the packet before mangling it */ if (!skb_make_writable(skb, offset)) goto tx_error_put; if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; ip_vs_nat_icmp(skb, pp, cp, 0); if (!local) { /* drop the old route when skb is not shared */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } else { ip_rt_put(rt); /* * Some IPv4 replies get local address from routes, * not from iph, so while we DNAT after routing * we need this second input/output route. */ if (!__ip_vs_reroute_locally(skb)) goto tx_error; } /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); rc = NF_STOLEN; goto out; tx_error_icmp: dst_link_failure(skb); tx_error: dev_kfree_skb(skb); rc = NF_STOLEN; out: LeaveFunction(10); return rc; tx_error_put: ip_rt_put(rt); goto tx_error; }
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, int dontfrag) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct inet_cork *cork; struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu; int exthdrlen; int dst_exthdrlen; int hh_len; int copy; int err; int offset = 0; __u8 tx_flags = 0; if (flags&MSG_PROBE) return 0; cork = &inet->cork.base; if (skb_queue_empty(&sk->sk_write_queue)) { /* * setup for corking */ if (opt) { if (WARN_ON(np->cork.opt)) return -EINVAL; np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation); if (unlikely(np->cork.opt == NULL)) return -ENOBUFS; np->cork.opt->tot_len = opt->tot_len; np->cork.opt->opt_flen = opt->opt_flen; np->cork.opt->opt_nflen = opt->opt_nflen; np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation); if (opt->dst0opt && !np->cork.opt->dst0opt) return -ENOBUFS; np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation); if (opt->dst1opt && !np->cork.opt->dst1opt) return -ENOBUFS; np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation); if (opt->hopopt && !np->cork.opt->hopopt) return -ENOBUFS; np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation); if (opt->srcrt && !np->cork.opt->srcrt) return -ENOBUFS; /* need source address above miyazawa*/ } dst_hold(&rt->dst); cork->dst = &rt->dst; inet->cork.fl.u.ip6 = *fl6; np->cork.hop_limit = hlimit; np->cork.tclass = tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? rt->dst.dev->mtu : dst_mtu(&rt->dst); else mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? rt->dst.dev->mtu : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } cork->fragsize = mtu; if (dst_allfrag(rt->dst.path)) cork->flags |= IPCORK_ALLFRAG; cork->length = 0; exthdrlen = (opt ? opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; } else { rt = (struct rt6_info *)cork->dst; fl6 = &inet->cork.fl.u.ip6; opt = np->cork.opt; transhdrlen = 0; exthdrlen = 0; dst_exthdrlen = 0; mtu = cork->fragsize; } hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen); return -EMSGSIZE; } } /* For UDP, check if TX timestamp is enabled */ if (sk->sk_type == SOCK_DGRAM) sock_tx_timestamp(sk, &tx_flags); /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. * Otherwise, we need to reserve fragment header and * fragment alignment (= 8-15 octects, in total). * * Note that we may need to "move" the data from the tail of * of the buffer to the new fragment when we split * the message. * * FIXME: It may be fragmented into multiple chunks * at once if non-fragmentable extension headers * are too large. * --yoshfuji */ if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_RAW)) { ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen); return -EMSGSIZE; } skb = skb_peek_tail(&sk->sk_write_queue); cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags, rt); if (err) goto error; return 0; } if (!skb) goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; if (copy < length) copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; alloc_new_skb: /* There's no room in the current skb */ if (skb) fraggap = skb->len - maxfraglen; else fraggap = 0; /* update mtu and maxfraglen if necessary */ if (skb == NULL || skb_prev == NULL) ip6_append_data_mtu(&mtu, &maxfraglen, fragheaderlen, skb, rt, np->pmtudisc == IPV6_PMTUDISC_PROBE); skb_prev = skb; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else alloclen = datalen + fragheaderlen; alloclen += dst_exthdrlen; if (datalen != length + fraggap) { /* * this is not the last fragment, the trailer * space is regarded as data space. */ datalen += rt->dst.trailer_len; } alloclen += rt->dst.trailer_len; fraglen = datalen + fragheaderlen; /* * We just reserve space for fragment header. * Note: this may be overallocation if the message * (without MSG_MORE) fits into the MTU. */ alloclen += sizeof(struct frag_hdr); if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (atomic_read(&sk->sk_wmem_alloc) <= 2 * sk->sk_sndbuf) skb = sock_wmalloc(sk, alloclen + hh_len, 1, sk->sk_allocation); if (unlikely(skb == NULL)) err = -ENOBUFS; else { /* Only the initial fragment * is time stamped. */ tx_flags = 0; } } if (skb == NULL) goto error; /* * Fill in the control structures */ skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; /* reserve for fragmentation and ipsec header */ skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); if (sk->sk_type == SOCK_DGRAM) skb_shinfo(skb)->tx_flags = tx_flags; /* * Find where to start putting bytes */ data = skb_put(skb, fraglen); skb_set_network_header(skb, exthdrlen); data += fragheaderlen; skb->transport_header = (skb->network_header + fragheaderlen); if (fraggap) { skb->csum = skb_copy_and_csum_bits( skb_prev, maxfraglen, data + transhdrlen, fraggap, 0); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap; if (copy < 0) { err = -EINVAL; kfree_skb(skb); goto error; } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } offset += copy; length -= datalen - fraggap; transhdrlen = 0; exthdrlen = 0; dst_exthdrlen = 0; /* * Put the packet on the pending queue */ __skb_queue_tail(&sk->sk_write_queue, skb); continue; } if (copy > length) copy = length; if (!(rt->dst.dev->features&NETIF_F_SG)) { unsigned int off; off = skb->len; if (getfrag(from, skb_put(skb, copy), offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } } else { int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); err = -ENOMEM; if (!sk_page_frag_refill(sk, pfrag)) goto error; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { err = -EMSGSIZE; if (i == MAX_SKB_FRAGS) goto error; __skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0); skb_shinfo(skb)->nr_frags = ++i; get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (getfrag(from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len += copy; skb->data_len += copy; skb->truesize += copy; atomic_add(copy, &sk->sk_wmem_alloc); } offset += copy; length -= copy; } return 0; error_efault: err = -EFAULT; error: cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); return err; }
/** * ixgbe_ipsec_tx - setup Tx flags for ipsec offload * @tx_ring: outgoing context * @first: current data packet * @itd: ipsec Tx data for later use in building context descriptor **/ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring, struct ixgbe_tx_buffer *first, struct ixgbe_ipsec_tx_data *itd) { struct ixgbe_adapter *adapter = netdev_priv(tx_ring->netdev); struct ixgbe_ipsec *ipsec = adapter->ipsec; struct xfrm_state *xs; struct tx_sa *tsa; if (unlikely(!first->skb->sp->len)) { netdev_err(tx_ring->netdev, "%s: no xfrm state len = %d\n", __func__, first->skb->sp->len); return 0; } xs = xfrm_input_state(first->skb); if (unlikely(!xs)) { netdev_err(tx_ring->netdev, "%s: no xfrm_input_state() xs = %p\n", __func__, xs); return 0; } itd->sa_idx = xs->xso.offload_handle - IXGBE_IPSEC_BASE_TX_INDEX; if (unlikely(itd->sa_idx > IXGBE_IPSEC_MAX_SA_COUNT)) { netdev_err(tx_ring->netdev, "%s: bad sa_idx=%d handle=%lu\n", __func__, itd->sa_idx, xs->xso.offload_handle); return 0; } tsa = &ipsec->tx_tbl[itd->sa_idx]; if (unlikely(!tsa->used)) { netdev_err(tx_ring->netdev, "%s: unused sa_idx=%d\n", __func__, itd->sa_idx); return 0; } first->tx_flags |= IXGBE_TX_FLAGS_IPSEC | IXGBE_TX_FLAGS_CC; if (xs->id.proto == IPPROTO_ESP) { itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_TYPE_ESP | IXGBE_ADVTXD_TUCMD_L4T_TCP; if (first->protocol == htons(ETH_P_IP)) itd->flags |= IXGBE_ADVTXD_TUCMD_IPV4; /* The actual trailer length is authlen (16 bytes) plus * 2 bytes for the proto and the padlen values, plus * padlen bytes of padding. This ends up not the same * as the static value found in xs->props.trailer_len (21). * * ... but if we're doing GSO, don't bother as the stack * doesn't add a trailer for those. */ if (!skb_is_gso(first->skb)) { /* The "correct" way to get the auth length would be * to use * authlen = crypto_aead_authsize(xs->data); * but since we know we only have one size to worry * about * we can let the compiler use the constant * and save us a few CPU cycles. */ const int authlen = IXGBE_IPSEC_AUTH_BITS / 8; struct sk_buff *skb = first->skb; u8 padlen; int ret; ret = skb_copy_bits(skb, skb->len - (authlen + 2), &padlen, 1); if (unlikely(ret)) return 0; itd->trailer_len = authlen + 2 + padlen; } } if (tsa->encrypt) itd->flags |= IXGBE_ADVTXD_TUCMD_IPSEC_ENCRYPT_EN; return 1; }
int ip_forward(struct sk_buff *skb) { struct iphdr *iph; /* Our header */ struct rtable *rt; /* Route we use */ struct ip_options * opt = &(IPCB(skb)->opt); if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) goto drop; if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) return NET_RX_SUCCESS; if (skb->pkt_type != PACKET_HOST) goto drop; skb_forward_csum(skb); /* * According to the RFC, we must first decrease the TTL field. If * that reaches zero, we must reply an ICMP control message telling * that the packet's lifetime expired. */ if (ip_hdr(skb)->ttl <= 1) goto too_many_hops; if (!xfrm4_route_forward(skb)) goto drop; rt = (struct rtable*)skb->dst; if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(dst_mtu(&rt->u.dst))); goto drop; } /* We are about to mangle packet. Copy it! */ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) goto drop; iph = ip_hdr(skb); /* Decrease ttl after skb cow done */ ip_decrease_ttl(iph); /* * We now generate an ICMP HOST REDIRECT giving the route * we calculated. */ if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp) ip_rt_send_redirect(skb); skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev, ip_forward_finish); sr_failed: /* * Strict routing permits no gatewaying */ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); goto drop; too_many_hops: /* Tell the sender its packet died... */ IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); return NET_RX_DROP; }
int ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rt6_info *rt; /* Route to the other host */ struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ struct ipv6hdr *old_iph = ipv6_hdr(skb); struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; int ret; EnterFunction(10); if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, &saddr, 1, (IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL)))) goto tx_error_icmp; if (__ip_vs_is_local_route6(rt)) { dst_release(&rt->dst); IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); } tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); goto tx_error_put; } if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) && !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); skb->dev = net->loopback_dev; } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; } /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { dst_release(&rt->dst); kfree_skb(skb); IP_VS_ERR_RL("%s(): no memory\n", __func__); return NF_STOLEN; } consume_skb(skb); skb = new_skb; old_iph = ipv6_hdr(skb); } skb->transport_header = skb->network_header; skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); /* drop old route */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); /* * Push down and install the IPIP header. */ iph = ipv6_hdr(skb); iph->version = 6; iph->nexthdr = IPPROTO_IPV6; iph->payload_len = old_iph->payload_len; be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); iph->priority = old_iph->priority; memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); iph->daddr = cp->daddr.in6; iph->saddr = saddr; iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; ret = IP_VS_XMIT_TUNNEL(skb, cp); if (ret == NF_ACCEPT) ip6_local_out(skb); else if (ret == NF_DROP) kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; tx_error_icmp: dst_link_failure(skb); tx_error: kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; tx_error_put: dst_release(&rt->dst); goto tx_error; }
static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net) { struct net_device_context *net_device_ctx = netdev_priv(net); struct hv_netvsc_packet *packet = NULL; int ret; unsigned int num_data_pgs; struct rndis_message *rndis_msg; struct rndis_packet *rndis_pkt; u32 rndis_msg_size; bool isvlan; bool linear = false; struct rndis_per_packet_info *ppi; struct ndis_tcp_ip_checksum_info *csum_info; struct ndis_tcp_lso_info *lso_info; int hdr_offset; u32 net_trans_info; u32 hash; u32 skb_length; struct hv_page_buffer page_buf[MAX_PAGE_BUFFER_COUNT]; struct hv_page_buffer *pb = page_buf; struct netvsc_stats *tx_stats = this_cpu_ptr(net_device_ctx->tx_stats); /* We will atmost need two pages to describe the rndis * header. We can only transmit MAX_PAGE_BUFFER_COUNT number * of pages in a single packet. If skb is scattered around * more pages we try linearizing it. */ check_size: skb_length = skb->len; num_data_pgs = netvsc_get_slots(skb) + 2; if (num_data_pgs > MAX_PAGE_BUFFER_COUNT && linear) { net_alert_ratelimited("packet too big: %u pages (%u bytes)\n", num_data_pgs, skb->len); ret = -EFAULT; goto drop; } else if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) { if (skb_linearize(skb)) { net_alert_ratelimited("failed to linearize skb\n"); ret = -ENOMEM; goto drop; } linear = true; goto check_size; } /* * Place the rndis header in the skb head room and * the skb->cb will be used for hv_netvsc_packet * structure. */ ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE); if (ret) { netdev_err(net, "unable to alloc hv_netvsc_packet\n"); ret = -ENOMEM; goto drop; } /* Use the skb control buffer for building up the packet */ BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) > FIELD_SIZEOF(struct sk_buff, cb)); packet = (struct hv_netvsc_packet *)skb->cb; packet->q_idx = skb_get_queue_mapping(skb); packet->total_data_buflen = skb->len; rndis_msg = (struct rndis_message *)skb->head; memset(rndis_msg, 0, RNDIS_AND_PPI_SIZE); isvlan = skb->vlan_tci & VLAN_TAG_PRESENT; /* Add the rndis header */ rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET; rndis_msg->msg_len = packet->total_data_buflen; rndis_pkt = &rndis_msg->msg.pkt; rndis_pkt->data_offset = sizeof(struct rndis_packet); rndis_pkt->data_len = packet->total_data_buflen; rndis_pkt->per_pkt_info_offset = sizeof(struct rndis_packet); rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet); hash = skb_get_hash_raw(skb); if (hash != 0 && net->real_num_tx_queues > 1) { rndis_msg_size += NDIS_HASH_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE, NBL_HASH_VALUE); *(u32 *)((void *)ppi + ppi->ppi_offset) = hash; } if (isvlan) { struct ndis_pkt_8021q_info *vlan; rndis_msg_size += NDIS_VLAN_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE, IEEE_8021Q_INFO); vlan = (struct ndis_pkt_8021q_info *)((void *)ppi + ppi->ppi_offset); vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK; vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } net_trans_info = get_net_transport_info(skb, &hdr_offset); if (net_trans_info == TRANSPORT_INFO_NOT_IP) goto do_send; /* * Setup the sendside checksum offload only if this is not a * GSO packet. */ if (skb_is_gso(skb)) goto do_lso; if ((skb->ip_summed == CHECKSUM_NONE) || (skb->ip_summed == CHECKSUM_UNNECESSARY)) goto do_send; rndis_msg_size += NDIS_CSUM_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE, TCPIP_CHKSUM_PKTINFO); csum_info = (struct ndis_tcp_ip_checksum_info *)((void *)ppi + ppi->ppi_offset); if (net_trans_info & (INFO_IPV4 << 16)) csum_info->transmit.is_ipv4 = 1; else csum_info->transmit.is_ipv6 = 1; if (net_trans_info & INFO_TCP) { csum_info->transmit.tcp_checksum = 1; csum_info->transmit.tcp_header_offset = hdr_offset; } else if (net_trans_info & INFO_UDP) { /* UDP checksum offload is not supported on ws2008r2. * Furthermore, on ws2012 and ws2012r2, there are some * issues with udp checksum offload from Linux guests. * (these are host issues). * For now compute the checksum here. */ struct udphdr *uh; u16 udp_len; ret = skb_cow_head(skb, 0); if (ret) goto drop; uh = udp_hdr(skb); udp_len = ntohs(uh->len); uh->check = 0; uh->check = csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, udp_len, IPPROTO_UDP, csum_partial(uh, udp_len, 0)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; csum_info->transmit.udp_checksum = 0; } goto do_send; do_lso: rndis_msg_size += NDIS_LSO_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE, TCP_LARGESEND_PKTINFO); lso_info = (struct ndis_tcp_lso_info *)((void *)ppi + ppi->ppi_offset); lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; if (net_trans_info & (INFO_IPV4 << 16)) { lso_info->lso_v2_transmit.ip_version = NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4; ip_hdr(skb)->tot_len = 0; ip_hdr(skb)->check = 0; tcp_hdr(skb)->check = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0); } else { lso_info->lso_v2_transmit.ip_version = NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6; ipv6_hdr(skb)->payload_len = 0; tcp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0); } lso_info->lso_v2_transmit.tcp_header_offset = hdr_offset; lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size; do_send: /* Start filling in the page buffers with the rndis hdr */ rndis_msg->msg_len += rndis_msg_size; packet->total_data_buflen = rndis_msg->msg_len; packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size, skb, packet, &pb); /* timestamp packet in software */ skb_tx_timestamp(skb); ret = netvsc_send(net_device_ctx->device_ctx, packet, rndis_msg, &pb, skb); drop: if (ret == 0) { u64_stats_update_begin(&tx_stats->syncp); tx_stats->packets++; tx_stats->bytes += skb_length; u64_stats_update_end(&tx_stats->syncp); } else { if (ret != -EAGAIN) { dev_kfree_skb_any(skb); net->stats.tx_dropped++; } } return (ret == -EAGAIN) ? NETDEV_TX_BUSY : NETDEV_TX_OK; }
static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net) { struct net_device_context *net_device_ctx = netdev_priv(net); struct hv_netvsc_packet *packet; int ret; unsigned int num_data_pgs; struct rndis_message *rndis_msg; struct rndis_packet *rndis_pkt; u32 rndis_msg_size; bool isvlan; struct rndis_per_packet_info *ppi; struct ndis_tcp_ip_checksum_info *csum_info; struct ndis_tcp_lso_info *lso_info; int hdr_offset; u32 net_trans_info; /* We will atmost need two pages to describe the rndis * header. We can only transmit MAX_PAGE_BUFFER_COUNT number * of pages in a single packet. */ num_data_pgs = netvsc_get_slots(skb) + 2; if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) { netdev_err(net, "Packet too big: %u\n", skb->len); dev_kfree_skb(skb); net->stats.tx_dropped++; return NETDEV_TX_OK; } /* Allocate a netvsc packet based on # of frags. */ packet = kzalloc(sizeof(struct hv_netvsc_packet) + (num_data_pgs * sizeof(struct hv_page_buffer)) + sizeof(struct rndis_message) + NDIS_VLAN_PPI_SIZE + NDIS_CSUM_PPI_SIZE + NDIS_LSO_PPI_SIZE, GFP_ATOMIC); if (!packet) { /* out of memory, drop packet */ netdev_err(net, "unable to allocate hv_netvsc_packet\n"); dev_kfree_skb(skb); net->stats.tx_dropped++; return NETDEV_TX_OK; } packet->vlan_tci = skb->vlan_tci; packet->is_data_pkt = true; packet->total_data_buflen = skb->len; packet->rndis_msg = (struct rndis_message *)((unsigned long)packet + sizeof(struct hv_netvsc_packet) + (num_data_pgs * sizeof(struct hv_page_buffer))); /* Set the completion routine */ packet->completion.send.send_completion = netvsc_xmit_completion; packet->completion.send.send_completion_ctx = packet; packet->completion.send.send_completion_tid = (unsigned long)skb; isvlan = packet->vlan_tci & VLAN_TAG_PRESENT; /* Add the rndis header */ rndis_msg = packet->rndis_msg; rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET; rndis_msg->msg_len = packet->total_data_buflen; rndis_pkt = &rndis_msg->msg.pkt; rndis_pkt->data_offset = sizeof(struct rndis_packet); rndis_pkt->data_len = packet->total_data_buflen; rndis_pkt->per_pkt_info_offset = sizeof(struct rndis_packet); rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet); if (isvlan) { struct ndis_pkt_8021q_info *vlan; rndis_msg_size += NDIS_VLAN_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE, IEEE_8021Q_INFO); vlan = (struct ndis_pkt_8021q_info *)((void *)ppi + ppi->ppi_offset); vlan->vlanid = packet->vlan_tci & VLAN_VID_MASK; vlan->pri = (packet->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } net_trans_info = get_net_transport_info(skb, &hdr_offset); if (net_trans_info == TRANSPORT_INFO_NOT_IP) goto do_send; /* * Setup the sendside checksum offload only if this is not a * GSO packet. */ if (skb_is_gso(skb)) goto do_lso; if ((skb->ip_summed == CHECKSUM_NONE) || (skb->ip_summed == CHECKSUM_UNNECESSARY)) goto do_send; rndis_msg_size += NDIS_CSUM_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE, TCPIP_CHKSUM_PKTINFO); csum_info = (struct ndis_tcp_ip_checksum_info *)((void *)ppi + ppi->ppi_offset); if (net_trans_info & (INFO_IPV4 << 16)) csum_info->transmit.is_ipv4 = 1; else csum_info->transmit.is_ipv6 = 1; if (net_trans_info & INFO_TCP) { csum_info->transmit.tcp_checksum = 1; csum_info->transmit.tcp_header_offset = hdr_offset; } else if (net_trans_info & INFO_UDP) { /* UDP checksum offload is not supported on ws2008r2. * Furthermore, on ws2012 and ws2012r2, there are some * issues with udp checksum offload from Linux guests. * (these are host issues). * For now compute the checksum here. */ struct udphdr *uh; u16 udp_len; ret = skb_cow_head(skb, 0); if (ret) goto drop; uh = udp_hdr(skb); udp_len = ntohs(uh->len); uh->check = 0; uh->check = csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, udp_len, IPPROTO_UDP, csum_partial(uh, udp_len, 0)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; csum_info->transmit.udp_checksum = 0; } goto do_send; do_lso: rndis_msg_size += NDIS_LSO_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE, TCP_LARGESEND_PKTINFO); lso_info = (struct ndis_tcp_lso_info *)((void *)ppi + ppi->ppi_offset); lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; if (net_trans_info & (INFO_IPV4 << 16)) { lso_info->lso_v2_transmit.ip_version = NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4; ip_hdr(skb)->tot_len = 0; ip_hdr(skb)->check = 0; tcp_hdr(skb)->check = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0); } else { lso_info->lso_v2_transmit.ip_version = NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6; ipv6_hdr(skb)->payload_len = 0; tcp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0); } lso_info->lso_v2_transmit.tcp_header_offset = hdr_offset; lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size; do_send: /* Start filling in the page buffers with the rndis hdr */ rndis_msg->msg_len += rndis_msg_size; packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size, skb, &packet->page_buf[0]); ret = netvsc_send(net_device_ctx->device_ctx, packet); drop: if (ret == 0) { net->stats.tx_bytes += skb->len; net->stats.tx_packets++; } else { kfree(packet); if (ret != -EAGAIN) { dev_kfree_skb_any(skb); net->stats.tx_dropped++; } } return (ret == -EAGAIN) ? NETDEV_TX_BUSY : NETDEV_TX_OK; }
static int __ip6_append_data(struct sock *sk, struct flowi6 *fl6, struct sk_buff_head *queue, struct inet_cork *cork, struct inet6_cork *v6_cork, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, unsigned int flags, int dontfrag) { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu; int exthdrlen = 0; int dst_exthdrlen = 0; int hh_len; int copy; int err; int offset = 0; __u8 tx_flags = 0; u32 tskey = 0; struct rt6_info *rt = (struct rt6_info *)cork->dst; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; skb = skb_peek_tail(queue); if (!skb) { exthdrlen = opt ? opt->opt_flen : 0; dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; } mtu = cork->fragsize; orig_mtu = mtu; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + (dst_allfrag(&rt->dst) ? sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; if (cork->length + length > mtu - headersize && dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_RAW)) { ipv6_local_rxpmtu(sk, fl6, mtu - headersize + sizeof(struct ipv6hdr)); goto emsgsize; } if (ip6_sk_ignore_df(sk)) maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; else maxnonfragsize = mtu; if (cork->length + length > maxnonfragsize - headersize) { emsgsize: ipv6_local_error(sk, EMSGSIZE, fl6, mtu - headersize + sizeof(struct ipv6hdr)); return -EMSGSIZE; } /* CHECKSUM_PARTIAL only with no extension headers and when * we are not going to fragment */ if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && headersize == sizeof(struct ipv6hdr) && length < mtu - headersize && !(flags & MSG_MORE) && rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { sock_tx_timestamp(sk, &tx_flags); if (tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = sk->sk_tskey++; } /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. * Otherwise, we need to reserve fragment header and * fragment alignment (= 8-15 octects, in total). * * Note that we may need to "move" the data from the tail of * of the buffer to the new fragment when we split * the message. * * FIXME: It may be fragmented into multiple chunks * at once if non-fragmentable extension headers * are too large. * --yoshfuji */ cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, mtu, flags, fl6); if (err) goto error; return 0; } if (!skb) goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; if (copy < length) copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; alloc_new_skb: /* There's no room in the current skb */ if (skb) fraggap = skb->len - maxfraglen; else fraggap = 0; /* update mtu and maxfraglen if necessary */ if (!skb || !skb_prev) ip6_append_data_mtu(&mtu, &maxfraglen, fragheaderlen, skb, rt, orig_mtu); skb_prev = skb; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else alloclen = datalen + fragheaderlen; alloclen += dst_exthdrlen; if (datalen != length + fraggap) { /* * this is not the last fragment, the trailer * space is regarded as data space. */ datalen += rt->dst.trailer_len; } alloclen += rt->dst.trailer_len; fraglen = datalen + fragheaderlen; /* * We just reserve space for fragment header. * Note: this may be overallocation if the message * (without MSG_MORE) fits into the MTU. */ alloclen += sizeof(struct frag_hdr); if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (atomic_read(&sk->sk_wmem_alloc) <= 2 * sk->sk_sndbuf) skb = sock_wmalloc(sk, alloclen + hh_len, 1, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } if (!skb) goto error; /* * Fill in the control structures */ skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = csummode; skb->csum = 0; /* reserve for fragmentation and ipsec header */ skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); /* Only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = tx_flags; tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; /* * Find where to start putting bytes */ data = skb_put(skb, fraglen); skb_set_network_header(skb, exthdrlen); data += fragheaderlen; skb->transport_header = (skb->network_header + fragheaderlen); if (fraggap) { skb->csum = skb_copy_and_csum_bits( skb_prev, maxfraglen, data + transhdrlen, fraggap, 0); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap; if (copy < 0) { err = -EINVAL; kfree_skb(skb); goto error; } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } offset += copy; length -= datalen - fraggap; transhdrlen = 0; exthdrlen = 0; dst_exthdrlen = 0; /* * Put the packet on the pending queue */ __skb_queue_tail(queue, skb); continue; } if (copy > length) copy = length; if (!(rt->dst.dev->features&NETIF_F_SG)) { unsigned int off; off = skb->len; if (getfrag(from, skb_put(skb, copy), offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } } else { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; if (!sk_page_frag_refill(sk, pfrag)) goto error; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { err = -EMSGSIZE; if (i == MAX_SKB_FRAGS) goto error; __skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0); skb_shinfo(skb)->nr_frags = ++i; get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (getfrag(from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len += copy; skb->data_len += copy; skb->truesize += copy; atomic_add(copy, &sk->sk_wmem_alloc); } offset += copy; length -= copy; } return 0; error_efault: err = -EFAULT; error: cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); return err; }
/* Get route to destination or remote server */ static int __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, __be32 daddr, int rt_mode, __be32 *ret_saddr) { struct net *net = dev_net(skb_dst(skb)->dev); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_dest_dst *dest_dst; struct rtable *rt; /* Route to the other host */ struct rtable *ort; /* Original route */ struct iphdr *iph; __be16 df; int mtu; int local, noref = 1; if (dest) { dest_dst = __ip_vs_dst_check(dest); if (likely(dest_dst)) rt = (struct rtable *) dest_dst->dst_cache; else { dest_dst = ip_vs_dest_dst_alloc(); spin_lock_bh(&dest->dst_lock); if (!dest_dst) { __ip_vs_dst_set(dest, NULL, NULL, 0); spin_unlock_bh(&dest->dst_lock); goto err_unreach; } rt = do_output_route4(net, dest->addr.ip, rt_mode, &dest_dst->dst_saddr.ip); if (!rt) { __ip_vs_dst_set(dest, NULL, NULL, 0); spin_unlock_bh(&dest->dst_lock); ip_vs_dest_dst_free(dest_dst); goto err_unreach; } __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0); spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", &dest->addr.ip, &dest_dst->dst_saddr.ip, atomic_read(&rt->dst.__refcnt)); } daddr = dest->addr.ip; if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.ip; } else { __be32 saddr = htonl(INADDR_ANY); noref = 0; /* For such unconfigured boxes avoid many route lookups * for performance reasons because we do not remember saddr */ rt_mode &= ~IP_VS_RT_MODE_CONNECT; rt = do_output_route4(net, daddr, rt_mode, &saddr); if (!rt) goto err_unreach; if (ret_saddr) *ret_saddr = saddr; } local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & rt_mode)) { IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", (rt->rt_flags & RTCF_LOCAL) ? "local":"non-local", &daddr); goto err_put; } iph = ip_hdr(skb); if (likely(!local)) { if (unlikely(ipv4_is_loopback(iph->saddr))) { IP_VS_DBG_RL("Stopping traffic from loopback address " "%pI4 to non-local address, dest: %pI4\n", &iph->saddr, &daddr); goto err_put; } } else { ort = skb_rtable(skb); if (!(rt_mode & IP_VS_RT_MODE_RDR) && !(ort->rt_flags & RTCF_LOCAL)) { IP_VS_DBG_RL("Redirect from non-local address %pI4 to " "local requires NAT method, dest: %pI4\n", &iph->daddr, &daddr); goto err_put; } /* skb to local stack, preserve old route */ if (!noref) ip_rt_put(rt); return local; } if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { mtu = dst_mtu(&rt->dst); df = iph->frag_off & htons(IP_DF); } else { struct sock *sk = skb->sk; mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; } ort = skb_rtable(skb); if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT) ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu); /* MTU check allowed? */ df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0; } /* MTU checking */ if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr); goto err_put; } skb_dst_drop(skb); if (noref) { if (!local) skb_dst_set_noref_force(skb, &rt->dst); else skb_dst_set(skb, dst_clone(&rt->dst)); } else skb_dst_set(skb, &rt->dst); return local; err_put: if (!noref) ip_rt_put(rt); return -1; err_unreach: dst_link_failure(skb); return -1; }
static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge; unsigned int mtu_reserved; mtu_reserved = nf_bridge_mtu_reduction(skb); if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) { nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); } nf_bridge = nf_bridge_info_get(skb); /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) && skb->protocol == htons(ETH_P_IP)) { struct brnf_frag_data *data; if (br_validate_ipv4(net, skb)) goto drop; IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; nf_bridge_update_protocol(skb); data = this_cpu_ptr(&brnf_frag_data_storage); data->vlan_tci = skb->vlan_tci; data->vlan_proto = skb->vlan_proto; data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); } if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); struct brnf_frag_data *data; if (br_validate_ipv6(net, skb)) goto drop; IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; nf_bridge_update_protocol(skb); data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); if (v6ops) return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); kfree_skb(skb); return -EMSGSIZE; } nf_bridge_info_free(skb); return br_dev_queue_push_xmit(net, sk, skb); drop: kfree_skb(skb); return 0; }
static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; bool need_csum, ufo; if (!skb->encapsulation) goto out; if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr))) goto out; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; /* setup inner skb. */ skb->encapsulation = 0; SKB_GSO_CB(skb)->encap_level = 0; __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = skb->inner_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); skb->encap_hdr_csum = need_csum; ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); features &= skb->dev->hw_enc_features; /* The only checksum offload we care about from here on out is the * outer one so strip the existing checksum feature flags based * on the fact that we will be computing our checksum in software. */ if (ufo) { features &= ~NETIF_F_CSUM_MASK; if (!need_csum) features |= NETIF_F_HW_CSUM; } /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); goto out; } outer_hlen = skb_tnl_header_len(skb); gre_offset = outer_hlen - tnl_hlen; skb = segs; do { struct gre_base_hdr *greh; __sum16 *pcsum; /* Set up inner headers if we are offloading inner checksum */ if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } skb->mac_len = mac_len; skb->protocol = protocol; __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, gre_offset); if (!need_csum) continue; greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); if (skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that * the partial checksum is based on actual size * whereas headers should be based on MSS size. */ partial_adj = skb->len + skb_headroom(skb) - SKB_GSO_CB(skb)->data_offset - skb_shinfo(skb)->gso_size; *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj)); } else { *pcsum = 0; } *(pcsum + 1) = 0; *pcsum = gso_make_checksum(skb, 0); } while ((skb = skb->next)); out: return segs; }
static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) { struct mlx5_wq_cyc *wq = &sq->wq; u16 pi = sq->pc & wq->sz_m1; struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); struct mlx5e_tx_wqe_info *wi = &sq->wqe_info[pi]; struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; struct mlx5_wqe_eth_seg *eseg = &wqe->eth; struct mlx5_wqe_data_seg *dseg; unsigned char *skb_data = skb->data; unsigned int skb_len = skb->len; u8 opcode = MLX5_OPCODE_SEND; dma_addr_t dma_addr = 0; unsigned int num_bytes; bool bf = false; u16 headlen; u16 ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; u16 ihs; int i; memset(wqe, 0, sizeof(*wqe)); if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM; if (skb->encapsulation) { eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM | MLX5_ETH_WQE_L4_INNER_CSUM; sq->stats.csum_partial_inner++; } else { eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM; sq->stats.csum_partial++; } } else sq->stats.csum_none++; if (sq->cc != sq->prev_cc) { sq->prev_cc = sq->cc; sq->bf_budget = (sq->cc == sq->pc) ? MLX5E_SQ_BF_BUDGET : 0; } if (skb_is_gso(skb)) { eseg->mss = cpu_to_be16(skb_shinfo(skb)->gso_size); opcode = MLX5_OPCODE_LSO; if (skb->encapsulation) { ihs = skb_inner_transport_header(skb) - skb->data + inner_tcp_hdrlen(skb); sq->stats.tso_inner_packets++; sq->stats.tso_inner_bytes += skb->len - ihs; } else { ihs = skb_transport_offset(skb) + tcp_hdrlen(skb); sq->stats.tso_packets++; sq->stats.tso_bytes += skb->len - ihs; } num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs; } else { bf = sq->bf_budget && !skb->xmit_more && !skb_shinfo(skb)->nr_frags; ihs = mlx5e_get_inline_hdr_size(sq, skb, bf); num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); } wi->num_bytes = num_bytes; if (skb_vlan_tag_present(skb)) { mlx5e_insert_vlan(eseg->inline_hdr_start, skb, ihs, &skb_data, &skb_len); ihs += VLAN_HLEN; } else { memcpy(eseg->inline_hdr_start, skb_data, ihs); mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs); } eseg->inline_hdr_sz = cpu_to_be16(ihs); ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr_start), MLX5_SEND_WQE_DS); dseg = (struct mlx5_wqe_data_seg *)cseg + ds_cnt; wi->num_dma = 0; headlen = skb_len - skb->data_len; if (headlen) { dma_addr = dma_map_single(sq->pdev, skb_data, headlen, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(sq->pdev, dma_addr))) goto dma_unmap_wqe_err; dseg->addr = cpu_to_be64(dma_addr); dseg->lkey = sq->mkey_be; dseg->byte_count = cpu_to_be32(headlen); mlx5e_dma_push(sq, dma_addr, headlen, MLX5E_DMA_MAP_SINGLE); wi->num_dma++; dseg++; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i]; int fsz = skb_frag_size(frag); dma_addr = skb_frag_dma_map(sq->pdev, frag, 0, fsz, DMA_TO_DEVICE); if (unlikely(dma_mapping_error(sq->pdev, dma_addr))) goto dma_unmap_wqe_err; dseg->addr = cpu_to_be64(dma_addr); dseg->lkey = sq->mkey_be; dseg->byte_count = cpu_to_be32(fsz); mlx5e_dma_push(sq, dma_addr, fsz, MLX5E_DMA_MAP_PAGE); wi->num_dma++; dseg++; } ds_cnt += wi->num_dma; cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode); cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); sq->skb[pi] = skb; wi->num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->pc += wi->num_wqebbs; if (unlikely(MLX5E_TX_HW_STAMP(sq->channel->priv, skb))) skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; netdev_tx_sent_queue(sq->txq, wi->num_bytes); if (unlikely(!mlx5e_sq_has_room_for(sq, MLX5E_SQ_STOP_ROOM))) { netif_tx_stop_queue(sq->txq); sq->stats.queue_stopped++; } if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) { int bf_sz = 0; if (bf && sq->uar_bf_map) bf_sz = wi->num_wqebbs << 3; cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; mlx5e_tx_notify_hw(sq, &wqe->ctrl, bf_sz); } sq->bf_budget = bf ? sq->bf_budget - 1 : 0; /* fill sq edge with nops to avoid wqe wrap around */ while ((sq->pc & wq->sz_m1) > sq->edge) mlx5e_send_nop(sq, false); sq->stats.packets++; sq->stats.bytes += num_bytes; return NETDEV_TX_OK; dma_unmap_wqe_err: sq->stats.queue_dropped++; mlx5e_dma_unmap_wqe_err(sq, wi->num_dma); dev_kfree_skb_any(skb); return NETDEV_TX_OK; }
static bool is_gre_gso(struct sk_buff *skb) { return skb_is_gso(skb); }
static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net) { struct net_device_context *net_device_ctx = netdev_priv(net); struct hv_netvsc_packet *packet = NULL; int ret; unsigned int num_data_pgs; struct rndis_message *rndis_msg; struct rndis_packet *rndis_pkt; u32 rndis_msg_size; struct rndis_per_packet_info *ppi; u32 hash; u32 skb_length; struct hv_page_buffer page_buf[MAX_PAGE_BUFFER_COUNT]; struct hv_page_buffer *pb = page_buf; /* We will atmost need two pages to describe the rndis * header. We can only transmit MAX_PAGE_BUFFER_COUNT number * of pages in a single packet. If skb is scattered around * more pages we try linearizing it. */ skb_length = skb->len; num_data_pgs = netvsc_get_slots(skb) + 2; if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) { ++net_device_ctx->eth_stats.tx_scattered; if (skb_linearize(skb)) goto no_memory; num_data_pgs = netvsc_get_slots(skb) + 2; if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) { ++net_device_ctx->eth_stats.tx_too_big; goto drop; } } /* * Place the rndis header in the skb head room and * the skb->cb will be used for hv_netvsc_packet * structure. */ ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE); if (ret) goto no_memory; /* Use the skb control buffer for building up the packet */ BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) > FIELD_SIZEOF(struct sk_buff, cb)); packet = (struct hv_netvsc_packet *)skb->cb; /* TODO: This will likely evaluate to false, since RH7 and * below kernels will set next pointer to NULL before calling * into here. Should find another way to set this flag. */ packet->xmit_more = (skb->next != NULL); packet->q_idx = skb_get_queue_mapping(skb); packet->total_data_buflen = skb->len; rndis_msg = (struct rndis_message *)skb->head; memset(rndis_msg, 0, RNDIS_AND_PPI_SIZE); packet->send_completion_ctx = packet; /* Add the rndis header */ rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET; rndis_msg->msg_len = packet->total_data_buflen; rndis_pkt = &rndis_msg->msg.pkt; rndis_pkt->data_offset = sizeof(struct rndis_packet); rndis_pkt->data_len = packet->total_data_buflen; rndis_pkt->per_pkt_info_offset = sizeof(struct rndis_packet); rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet); #ifdef NOTYET // Divergence from upstream commit: // 307f099520b66504cf6c5638f3f404c48b9fb45b hash = skb_get_hash_raw(skb); #endif hash = skb_get_hash(skb); if (hash != 0 && net->real_num_tx_queues > 1) { rndis_msg_size += NDIS_HASH_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE, NBL_HASH_VALUE); *(u32 *)((void *)ppi + ppi->ppi_offset) = hash; } if (skb_vlan_tag_present(skb)) { struct ndis_pkt_8021q_info *vlan; rndis_msg_size += NDIS_VLAN_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE, IEEE_8021Q_INFO); vlan = (struct ndis_pkt_8021q_info *)((void *)ppi + ppi->ppi_offset); vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK; vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } if (skb_is_gso(skb)) { struct ndis_tcp_lso_info *lso_info; rndis_msg_size += NDIS_LSO_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE, TCP_LARGESEND_PKTINFO); lso_info = (struct ndis_tcp_lso_info *)((void *)ppi + ppi->ppi_offset); lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; if (skb->protocol == htons(ETH_P_IP)) { lso_info->lso_v2_transmit.ip_version = NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4; ip_hdr(skb)->tot_len = 0; ip_hdr(skb)->check = 0; tcp_hdr(skb)->check = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0); } else { lso_info->lso_v2_transmit.ip_version = NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6; ipv6_hdr(skb)->payload_len = 0; tcp_hdr(skb)->check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0); } lso_info->lso_v2_transmit.tcp_header_offset = skb_transport_offset(skb); lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { if (net_checksum_info(skb) & net_device_ctx->tx_checksum_mask) { struct ndis_tcp_ip_checksum_info *csum_info; rndis_msg_size += NDIS_CSUM_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE, TCPIP_CHKSUM_PKTINFO); csum_info = (struct ndis_tcp_ip_checksum_info *)((void *)ppi + ppi->ppi_offset); csum_info->transmit.tcp_header_offset = skb_transport_offset(skb); if (skb->protocol == htons(ETH_P_IP)) { csum_info->transmit.is_ipv4 = 1; if (ip_hdr(skb)->protocol == IPPROTO_TCP) csum_info->transmit.tcp_checksum = 1; else csum_info->transmit.udp_checksum = 1; } else { csum_info->transmit.is_ipv6 = 1; if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP) csum_info->transmit.tcp_checksum = 1; else csum_info->transmit.udp_checksum = 1; } } else { /* Can't do offload of this type of checksum */ if (skb_checksum_help(skb)) goto drop; } } /* Start filling in the page buffers with the rndis hdr */ rndis_msg->msg_len += rndis_msg_size; packet->total_data_buflen = rndis_msg->msg_len; packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size, skb, packet, &pb); /* timestamp packet in software */ skb_tx_timestamp(skb); ret = netvsc_send(net_device_ctx->device_ctx, packet, rndis_msg, &pb, skb); if (likely(ret == 0)) { struct netvsc_stats *tx_stats = this_cpu_ptr(net_device_ctx->tx_stats); u64_stats_update_begin(&tx_stats->syncp); tx_stats->packets++; tx_stats->bytes += skb_length; u64_stats_update_end(&tx_stats->syncp); return NETDEV_TX_OK; } if (ret == -EAGAIN) { ++net_device_ctx->eth_stats.tx_busy; return NETDEV_TX_BUSY; } if (ret == -ENOSPC) ++net_device_ctx->eth_stats.tx_no_space; drop: dev_kfree_skb_any(skb); net->stats.tx_dropped++; return NETDEV_TX_OK; no_memory: ++net_device_ctx->eth_stats.tx_no_memory; goto drop; }
static int start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); int num, err; struct scatterlist sg[1+MAX_SKB_FRAGS]; struct virtio_net_hdr *hdr; const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; sg_init_table(sg, 1+MAX_SKB_FRAGS); pr_debug("%s: xmit %p " MAC_FMT "\n", dev->name, skb, dest[0], dest[1], dest[2], dest[3], dest[4], dest[5]); /* Encode metadata header at front. */ hdr = skb_vnet_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = skb->csum_start - skb_headroom(skb); hdr->csum_offset = skb->csum_offset; } else { hdr->flags = 0; hdr->csum_offset = hdr->csum_start = 0; } if (skb_is_gso(skb)) { hdr->hdr_len = skb_transport_header(skb) - skb->data; hdr->gso_size = skb_shinfo(skb)->gso_size; if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; else BUG(); if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else { hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; hdr->gso_size = hdr->hdr_len = 0; } vnet_hdr_to_sg(sg, skb); num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; __skb_queue_head(&vi->send, skb); again: /* Free up any pending old buffers before queueing new ones. */ free_old_xmit_skbs(vi); err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb); if (err) { pr_debug("%s: virtio not prepared to send\n", dev->name); netif_stop_queue(dev); /* Activate callback for using skbs: if this returns false it * means some were used in the meantime. */ if (unlikely(!vi->svq->vq_ops->enable_cb(vi->svq))) { vi->svq->vq_ops->disable_cb(vi->svq); netif_start_queue(dev); goto again; } __skb_unlink(skb, &vi->send); return NETDEV_TX_BUSY; } vi->svq->vq_ops->kick(vi->svq); return 0; }
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, struct ipv6_txoptions *opt, int tclass) { struct net *net = sock_net(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; u32 mtu; if (opt) { unsigned int head_room; /* First: exthdrs may take lots of space (~8K for now) MAX_HEADER is not enough. */ head_room = opt->opt_nflen + opt->opt_flen; seg_len += head_room; head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); if (skb_headroom(skb) < head_room) { struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); if (skb2 == NULL) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -ENOBUFS; } kfree_skb(skb); skb = skb2; skb_set_owner_w(skb, sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); /* * Fill in the IPv6 header */ if (np) hlimit = np->hop_limit; if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel; hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; hdr->hop_limit = hlimit; hdr->saddr = fl6->saddr; hdr->daddr = *first_hop; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, dst_output); } net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n"); skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; }
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct ipv6_txoptions *opt, int ipfragok) { struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl->fl6_dst; struct dst_entry *dst = skb->dst; struct ipv6hdr *hdr; u8 proto = fl->proto; int seg_len = skb->len; int hlimit, tclass; u32 mtu; if (opt) { int head_room; /* First: exthdrs may take lots of space (~8K for now) MAX_HEADER is not enough. */ head_room = opt->opt_nflen + opt->opt_flen; seg_len += head_room; head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); if (skb_headroom(skb) < head_room) { struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); if (skb2 == NULL) { IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -ENOBUFS; } kfree_skb(skb); skb = skb2; if (sk) skb_set_owner_w(skb, sk); } if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); } hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr)); /* * Fill in the IPv6 header */ hlimit = -1; if (np) hlimit = np->hop_limit; if (hlimit < 0) hlimit = dst_metric(dst, RTAX_HOPLIMIT); if (hlimit < 0) hlimit = ipv6_get_hoplimit(dst->dev); tclass = -1; if (np) tclass = np->tclass; if (tclass < 0) tclass = 0; *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel; hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; hdr->hop_limit = hlimit; ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); ipv6_addr_copy(&hdr->daddr, first_hop); skb->priority = sk->sk_priority; mtu = dst_mtu(dst); if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) { IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_OUTREQUESTS); return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); } if (net_ratelimit()) printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; }
struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; unsigned long flags; struct xfrm_state *x; struct sk_buff *skb2; struct softnet_data *sd; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); if (!xo) return skb; if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); x = skb->sp->xvec[skb->sp->len - 1]; if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) return skb; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); err = !skb_queue_empty(&sd->xfrm_backlog); local_irq_restore(flags); if (err) { *again = true; return skb; } if (skb_is_gso(skb)) { struct net_device *dev = skb->dev; if (unlikely(!x->xso.offload_handle || (x->xso.dev != dev))) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ esp_features = esp_features & ~(NETIF_F_HW_ESP | NETIF_F_GSO_ESP); segs = skb_gso_segment(skb, esp_features); if (IS_ERR(segs)) { kfree_skb(skb); atomic_long_inc(&dev->tx_dropped); return NULL; } else { consume_skb(skb); skb = segs; } } } if (!skb->next) { x->outer_mode->xmit(x, skb); xo->flags |= XFRM_DEV_RESUME; err = x->type_offload->xmit(x, skb, esp_features); if (err) { if (err == -EINPROGRESS) return NULL; XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); kfree_skb(skb); return NULL; } skb_push(skb, skb->data - skb_mac_header(skb)); return skb; } skb2 = skb; do { struct sk_buff *nskb = skb2->next; skb2->next = NULL; xo = xfrm_offload(skb2); xo->flags |= XFRM_DEV_RESUME; x->outer_mode->xmit(x, skb2); err = x->type_offload->xmit(x, skb2, esp_features); if (!err) { skb2->next = nskb; } else if (err != -EINPROGRESS) { XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); skb2->next = nskb; kfree_skb_list(skb2); return NULL; } else { if (skb == skb2) skb = nskb; if (!skb) return NULL; goto skip_push; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); skip_push: skb2 = nskb; } while (skb2); return skb; }
static int xenvif_tx_submit(struct xenvif *vif) { struct gnttab_copy *gop = vif->tx_copy_ops; struct sk_buff *skb; int work_done = 0; while ((skb = __skb_dequeue(&vif->tx_queue)) != NULL) { struct xen_netif_tx_request *txp; u16 pending_idx; unsigned data_len; pending_idx = *((u16 *)skb->data); txp = &vif->pending_tx_info[pending_idx].req; /* Check the remap error code. */ if (unlikely(xenvif_tx_check_gop(vif, skb, &gop))) { netdev_dbg(vif->dev, "netback grant failed.\n"); skb_shinfo(skb)->nr_frags = 0; kfree_skb(skb); continue; } data_len = skb->len; memcpy(skb->data, (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset), data_len); if (data_len < txp->size) { /* Append the packet payload as a fragment. */ txp->offset += data_len; txp->size -= data_len; } else { /* Schedule a response immediately. */ xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY); } if (txp->flags & XEN_NETTXF_csum_blank) skb->ip_summed = CHECKSUM_PARTIAL; else if (txp->flags & XEN_NETTXF_data_validated) skb->ip_summed = CHECKSUM_UNNECESSARY; xenvif_fill_frags(vif, skb); if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) { int target = min_t(int, skb->len, PKT_PROT_LEN); __pskb_pull_tail(skb, target - skb_headlen(skb)); } skb->dev = vif->dev; skb->protocol = eth_type_trans(skb, skb->dev); skb_reset_network_header(skb); if (checksum_setup(vif, skb)) { netdev_dbg(vif->dev, "Can't setup checksum in net_tx_action\n"); kfree_skb(skb); continue; } skb_probe_transport_header(skb, 0); /* If the packet is GSO then we will have just set up the * transport header offset in checksum_setup so it's now * straightforward to calculate gso_segs. */ if (skb_is_gso(skb)) { int mss = skb_shinfo(skb)->gso_size; int hdrlen = skb_transport_header(skb) - skb_mac_header(skb) + tcp_hdrlen(skb); skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len - hdrlen, mss); } vif->dev->stats.rx_bytes += skb->len; vif->dev->stats.rx_packets++; work_done++; netif_receive_skb(skb); }