/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) { struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; unsigned int hh_len; if (addr_type == RTN_UNSPEC) addr_type = inet_addr_type(net, saddr); if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) flags |= FLOWI_FLAG_ANYSRC; else saddr = 0; /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. */ fl4.daddr = iph->daddr; fl4.saddr = saddr; fl4.flowi4_tos = RT_TOS(iph->tos); fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; fl4.flowi4_mark = skb->mark; fl4.flowi4_flags = flags; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return -1; /* Drop old route. */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); if (skb_dst(skb)->error) return -1; #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); skb_dst_set(skb, NULL); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); if (IS_ERR(dst)) return -1; skb_dst_set(skb, dst); } #endif /* Change in oif may mean change in hh_len. */ hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) return -1; return 0; }
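A minimal sketch of the calling pattern this helper serves: a netfilter hook mangles the IP header, which invalidates the skb's cached route, and then asks ip_route_me_harder() to redo the lookup. The function and variable names below are illustrative, not taken from the sources in this collection.

/* Hypothetical mangle-style target: rewrite daddr, then re-route.
 * Sketch only; assumes the flowi4-era API used above. */
static unsigned int example_rewrite_and_reroute(struct sk_buff *skb,
                                                __be32 new_daddr)
{
    struct iphdr *iph;

    if (!skb_make_writable(skb, sizeof(struct iphdr)))
        return NF_DROP;

    iph = ip_hdr(skb);
    iph->daddr = new_daddr;     /* this invalidates the cached dst */
    ip_send_check(iph);         /* recompute the IP header checksum */

    /* Redo the routing decision against the mangled header. */
    if (ip_route_me_harder(skb, RTN_UNSPEC) < 0)
        return NF_DROP;

    return NF_ACCEPT;
}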
/* * Direct Routing transmitter * Used for ANY protocol */ int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); int mtu; EnterFunction(10); if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, RT_TOS(iph->tos), 1|2))) goto tx_error_icmp; if (rt->rt_flags & RTCF_LOCAL) { ip_rt_put(rt); IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); } /* MTU checking */ mtu = dst_mtu(&rt->dst); if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; } /* * Call ip_send_check because we are not sure it is called * after ip_defrag. Is copy-on-write needed? */ if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { ip_rt_put(rt); return NF_STOLEN; } ip_send_check(ip_hdr(skb)); /* drop old route */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; tx_error_icmp: dst_link_failure(skb); tx_error: kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; }
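The bare `1|2` passed as rt_mode above is the same flag pair that later snippets spell out as IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL; the __ip_vs_get_out_rt_v6 comment further down documents the bits (&1 = allow local dest, &2 = allow non-local dest, &4 = allow redirect). A sketch of the presumed definitions:

/* rt_mode bits for __ip_vs_get_out_rt(), per the comment in the
 * IPv6 variant below (values as used by the named-flag callers). */
#define IP_VS_RT_MODE_LOCAL     1   /* local dest is acceptable */
#define IP_VS_RT_MODE_NON_LOCAL 2   /* non-local dest is acceptable */
#define IP_VS_RT_MODE_RDR       4   /* allow redirect from remote daddr to local */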
/* Try to route the packet according to the routing keys specified in
 * route_info. Keys are:
 * - ifindex:
 *      0 if no oif preferred,
 *      otherwise set to the index of the desired oif
 * - route_info->gw:
 *      0 if no gateway specified,
 *      otherwise set to the next host to which the pkt must be routed
 * If success, skb->dev is the output device to which the packet must
 * be sent and skb->dst is not NULL
 *
 * RETURN: -1 if an error occurred
 *          1 if the packet was successfully routed to the
 *            destination desired
 *          0 if the kernel routing table could not route the packet
 *            according to the keys specified
 */
static int route(struct sk_buff *skb,
                 unsigned int ifindex,
                 const struct ipt_route_target_info *route_info)
{
    int err;
    struct rtable *rt;
    struct iphdr *iph = ip_hdr(skb);
    struct flowi fl = {
        .oif = ifindex,
        .nl_u = {
            .ip4_u = {
                .daddr = iph->daddr,
                .saddr = 0,
                .tos = RT_TOS(iph->tos),
                .scope = RT_SCOPE_UNIVERSE,
            }
        }
    };

    /* The destination address may be overloaded by the target */
    if (route_info->gw)
        fl.fl4_dst = route_info->gw;

    /* Trying to route the packet using the standard routing table. */
    if ((err = ip_route_output_key(&init_net, &rt, &fl))) {
        if (net_ratelimit())
            pr_debug("ipt_ROUTE: couldn't route pkt (err: %i)", err);
        return -1;
    }

    /* Drop old route. */
    skb_dst_drop(skb);

    /* Success if no oif specified or if the oif corresponds to the
     * one desired */
    if (!ifindex || rt->dst.dev->ifindex == ifindex) {
        skb_dst_set(skb, &rt->dst);
        skb->dev = skb_dst(skb)->dev;
        skb->protocol = htons(ETH_P_IP);
        return 1;
    }

    /* The interface selected by the routing table is not the one
     * specified by the user. This may happen because the dst address
     * is one of our own addresses.
     */
    if (net_ratelimit())
        pr_debug("ipt_ROUTE: failed to route as desired gw, oif=%i (got oif=%i)\n",
                 ifindex, rt->dst.dev->ifindex);

    return 0;
}
static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; struct ovs_key_ipv4_tunnel *tun_key; struct rtable *rt; struct flowi4 fl; __be16 src_port; __be16 df; int err; if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; fl.saddr = tun_key->ipv4_src; fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(net, &fl); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto error; } df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; skb->ignore_df = 1; src_port = udp_flow_src_port(net, skb, 0, 0, true); err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, fl.saddr, tun_key->ipv4_dst, tun_key->ipv4_tos, tun_key->ipv4_ttl, df, src_port, dst_port, htonl(be64_to_cpu(tun_key->tun_id) << 8), false); if (err < 0) ip_rt_put(rt); return err; error: kfree_skb(skb); return err; }
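The vni argument is built with htonl(be64_to_cpu(tun_key->tun_id) << 8): VXLAN carries a 24-bit network identifier in the upper three bytes of a 32-bit big-endian word, and the shift by 8 puts it there. A worked example, assuming a hypothetical VNI of 0x123456:

/* tun_id (host view after be64_to_cpu) : 0x0000000000123456
 * << 8                                 : 0x0000000012345600
 * htonl(...) of the low 32 bits        : bytes 12 34 56 00 on the wire,
 * i.e. the 24-bit VNI followed by the reserved zero byte. */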
static inline struct rtable *route_mirror(struct sk_buff *skb, int local) { struct iphdr *iph = skb->nh.iph; struct dst_entry *odst; struct rt_key key = {}; struct rtable *rt; if (local) { key.dst = iph->saddr; key.src = iph->daddr; key.tos = RT_TOS(iph->tos); if (ip_route_output_key(&rt, &key) != 0) return NULL; } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ key.dst = iph->daddr; if (ip_route_output_key(&rt, &key) != 0) return NULL; odst = skb->dst; if (ip_route_input(skb, iph->saddr, iph->daddr, RT_TOS(iph->tos), rt->u.dst.dev) != 0) { dst_release(&rt->u.dst); return NULL; } dst_release(&rt->u.dst); rt = (struct rtable *)skb->dst; skb->dst = odst; } if (rt->u.dst.error) { dst_release(&rt->u.dst); rt = NULL; } return rt; }
static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { struct iphdr *ip4h = ip_hdr(skb); int ret = NET_XMIT_DROP; struct flowi4 fl4 = { /* needed to match OIF rule */ .flowi4_oif = vrf_dev->ifindex, .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_tos = RT_TOS(ip4h->tos), .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF, .daddr = ip4h->daddr, }; if (vrf_send_v4_prep(skb, &fl4, vrf_dev)) goto err; if (!ip4h->saddr) { ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, RT_SCOPE_LINK); } ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; out: return ret; err: vrf_tx_error(vrf_dev, skb); goto out; } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) { /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); switch (skb->protocol) { case htons(ETH_P_IP): return vrf_process_v4_outbound(skb, dev); case htons(ETH_P_IPV6): return vrf_process_v6_outbound(skb, dev); default: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } }
static int ip_masq_user_maddr(struct ip_masq_user *ums) { struct device *dev; struct rtable *rt; int ret = -EINVAL; u32 rt_daddr, rt_saddr; u32 tos; /* * Did specify masq address. */ if (ums->maddr) return 0; /* * Select address to use for routing query */ rt_daddr = ums->rt_daddr? ums->rt_daddr : ums->daddr; rt_saddr = ums->rt_saddr? ums->rt_saddr : ums->saddr; /* * No address for routing, cannot continue */ if (rt_daddr == 0) { IP_MASQ_DEBUG(1-debug, "cannot setup maddr with daddr=%lX, rt_addr=%lX\n", ntohl(ums->daddr), ntohl(ums->rt_daddr)); return -EINVAL; } /* * Find out rt device */ rt_saddr = 0; tos = RT_TOS(ums->ip_tos) | RTO_CONN; if ((ret=ip_route_output(&rt, rt_daddr, rt_saddr, tos, 0 /* dev */))) { IP_MASQ_DEBUG(0-debug, "could not setup maddr for routing daddr=%lX, saddr=%lX\n", ntohl(rt_daddr), ntohl(rt_saddr)); return ret; } dev = rt->u.dst.dev; ums->maddr = ip_masq_select_addr(dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); IP_MASQ_DEBUG(1-debug, "did setup maddr=%lX\n", ntohl(ums->maddr)); ip_rt_put(rt); return 0; }
int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
                               struct net *net,
                               const struct ovs_tunnel_info *tun_info,
                               u8 ipproto, u32 skb_mark,
                               __be16 tp_src, __be16 tp_dst)
{
    const struct ovs_key_ipv4_tunnel *tun_key;
    struct rtable *rt;
    struct flowi4 fl;

    if (unlikely(!tun_info))
        return -EINVAL;

    tun_key = &tun_info->tunnel;

    /* Route lookup to get source IP address.
     * This may need to be changed if the corresponding lookup
     * in the vport ops changes.
     */
    memset(&fl, 0, sizeof(fl));
    fl.daddr = tun_key->ipv4_dst;
    fl.saddr = tun_key->ipv4_src;
    fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos);
    fl.flowi4_mark = skb_mark;
    fl.flowi4_proto = ipproto;

    rt = ip_route_output_key(net, &fl);
    if (IS_ERR(rt))
        return PTR_ERR(rt);
    ip_rt_put(rt);

    /* Generate egress_tun_info based on tun_info,
     * saddr, tp_src and tp_dst
     */
    __ovs_flow_tun_info_init(egress_tun_info,
                             fl.saddr, tun_key->ipv4_dst,
                             tun_key->ipv4_tos,
                             tun_key->ipv4_ttl,
                             tp_src, tp_dst,
                             tun_key->tun_id,
                             tun_key->tun_flags,
                             tun_info->options,
                             tun_info->options_len);

    return 0;
}
static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->sport; struct rtable *rt; struct flowi fl; __be16 src_port; int port_min; int port_max; __be16 df; int err; if (unlikely(!OVS_CB(skb)->tun_key)) { err = -EINVAL; goto error; } /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.fl4_dst = OVS_CB(skb)->tun_key->ipv4_dst; fl.fl4_src = OVS_CB(skb)->tun_key->ipv4_src; fl.fl4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); fl.mark = skb->mark; fl.proto = IPPROTO_UDP; err = ip_route_output_key(net, &rt, &fl); if (err) goto error; df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; skb->local_df = 1; inet_get_local_port_range(&port_min, &port_max); src_port = vxlan_src_port(port_min, port_max, skb); err = vxlan_xmit_skb(net, vxlan_port->vs, rt, skb, fl.fl4_src, OVS_CB(skb)->tun_key->ipv4_dst, OVS_CB(skb)->tun_key->ipv4_tos, OVS_CB(skb)->tun_key->ipv4_ttl, df, src_port, dst_port, htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8)); if (err < 0) ip_rt_put(rt); error: return err; }
static struct rtable *gre_get_rt(struct sk_buff *skb, struct net_device *dev, struct flowi4 *fl, const struct ip_tunnel_key *key) { struct net *net = dev_net(dev); memset(fl, 0, sizeof(*fl)); fl->daddr = key->u.ipv4.dst; fl->saddr = key->u.ipv4.src; fl->flowi4_tos = RT_TOS(key->tos); fl->flowi4_mark = skb->mark; fl->flowi4_proto = IPPROTO_GRE; return ip_route_output_key(net, fl); }
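gre_get_rt() returns either a valid route or an ERR_PTR, so a caller has to test with IS_ERR() and drop the reference with ip_rt_put() when done. A minimal, hypothetical caller (names are illustrative, transmit logic elided):

/* Sketch of consuming gre_get_rt(); sketch only, not from the sources. */
static int example_gre_route_and_send(struct sk_buff *skb,
                                      struct net_device *dev,
                                      const struct ip_tunnel_key *key)
{
    struct flowi4 fl;
    struct rtable *rt;

    rt = gre_get_rt(skb, dev, &fl, key);
    if (IS_ERR(rt))
        return PTR_ERR(rt);     /* no route to key->u.ipv4.dst */

    /* ... encapsulate and transmit via rt->dst.dev here ... */

    ip_rt_put(rt);              /* release the route reference */
    return 0;
}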
static int ip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *iph; int hlen = LL_MAX_HEADER; int mtu = ETH_DATA_LEN; int t_hlen = tunnel->hlen + sizeof(struct iphdr); iph = &tunnel->parms.iph; /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { struct flowi4 fl4; struct rtable *rt; rt = ip_route_output_tunnel(dev_net(dev), &fl4, tunnel->parms.iph.protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, RT_TOS(iph->tos), tunnel->parms.link); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len + tdev->needed_headroom; mtu = tdev->mtu; } dev->iflink = tunnel->parms.link; dev->needed_headroom = t_hlen + hlen; mtu -= (dev->hard_header_len + t_hlen); if (mtu < 68) mtu = 68; return mtu; }
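To make the MTU arithmetic concrete with assumed numbers: for a bare IPIP-style tunnel (tunnel->hlen == 0, so t_hlen == 20) whose guessed output device is plain Ethernet (tdev->mtu == 1500, with hard_header_len on the tunnel netdev itself typically 0), the result is mtu = 1500 - (0 + 20) = 1480. The clamp to 68 only matters for pathologically small underlying MTUs; 68 is the minimum link MTU IPv4 requires every link to support.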
static struct rtable *ovs_tunnel_route_lookup(struct net *net, const struct ip_tunnel_key *key, u32 mark, struct flowi4 *fl, u8 protocol) { struct rtable *rt; memset(fl, 0, sizeof(*fl)); fl->daddr = key->u.ipv4.dst; fl->saddr = key->u.ipv4.src; fl->flowi4_tos = RT_TOS(key->tos); fl->flowi4_mark = mark; fl->flowi4_proto = protocol; rt = ip_route_output_key(net, fl); return rt; }
static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); struct ovs_key_ipv4_tunnel *tun_key; struct flowi4 fl; struct rtable *rt; int min_headroom; int tunnel_hlen; __be16 df; int err; if (unlikely(!OVS_CB(skb)->egress_tun_key)) { err = -EINVAL; goto error; } tun_key = OVS_CB(skb)->egress_tun_key; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; fl.saddr = tun_key->ipv4_src; fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_GRE; rt = ip_route_output_key(net, &fl); if (IS_ERR(rt)) return PTR_ERR(rt); tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr) + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { int head_delta = SKB_DATA_ALIGN(min_headroom - skb_headroom(skb) + 16); err = pskb_expand_head(skb, max_t(int, head_delta, 0), 0, GFP_ATOMIC); if (unlikely(err)) goto err_free_rt; }
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel_info *tun_info; struct net *net = dev_net(dev); const struct ip_tunnel_key *key; struct flowi4 fl; struct rtable *rt; int min_headroom; int tunnel_hlen; __be16 df, flags; int err; tun_info = skb_tunnel_info(skb, AF_INET); if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX)) goto err_free_skb; key = &tun_info->key; memset(&fl, 0, sizeof(fl)); fl.daddr = key->ipv4_dst; fl.saddr = key->ipv4_src; fl.flowi4_tos = RT_TOS(key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_GRE; rt = ip_route_output_key(net, &fl); if (IS_ERR(rt)) goto err_free_skb; tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr); if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { int head_delta = SKB_DATA_ALIGN(min_headroom - skb_headroom(skb) + 16); err = pskb_expand_head(skb, max_t(int, head_delta, 0), 0, GFP_ATOMIC); if (unlikely(err)) goto err_free_rt; }
static int route_mirror(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; struct rtable *rt; /* Backwards */ if (ip_route_output(&rt, iph->saddr, iph->daddr, RT_TOS(iph->tos) | RTO_CONN, 0)) { return 0; } /* check if the interface we are leaving by is the same as the one we arrived on */ if (skb->dev == rt->u.dst.dev) { /* Drop old route. */ dst_release(skb->dst); skb->dst = &rt->u.dst; return 1; } return 0; }
/* With a chainsaw... */ static int route_me_harder(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; struct rtable *rt; struct rt_key key = { dst:iph->daddr, src:iph->saddr, oif:skb->sk ? skb->sk->bound_dev_if : 0, tos:RT_TOS(iph->tos) | RTO_CONN, #ifdef CONFIG_IP_ROUTE_FWMARK fwmark:skb->nfmark #endif }; if (ip_route_output_key(&rt, &key) != 0) return -EINVAL; /* Drop old route. */ dst_release(skb->dst); skb->dst = &rt->u.dst; return 0; }
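For comparison with the 2.4-era struct rt_key above, the same lookup under the modern API (as in the ip_route_me_harder() snippet at the top of this collection) goes through struct flowi4, with the fwmark key replaced by flowi4_mark. A rough sketch under those assumptions; the function name is illustrative:

/* Modern flowi4 equivalent of the rt_key lookup above (sketch only;
 * the 2.4-specific RTO_CONN bit is simply dropped). */
static int example_route_me_harder_modern(struct sk_buff *skb)
{
    struct iphdr *iph = ip_hdr(skb);
    struct flowi4 fl4 = {
        .daddr       = iph->daddr,
        .saddr       = iph->saddr,
        .flowi4_oif  = skb->sk ? skb->sk->sk_bound_dev_if : 0,
        .flowi4_tos  = RT_TOS(iph->tos),
        .flowi4_mark = skb->mark,   /* replaces the fwmark key */
    };
    struct rtable *rt = ip_route_output_key(dev_net(skb->dev), &fl4);

    if (IS_ERR(rt))
        return PTR_ERR(rt);

    /* Drop old route. */
    skb_dst_drop(skb);
    skb_dst_set(skb, &rt->dst);
    return 0;
}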
static struct rtable *route_packet_ipv4(struct sk_buff *skb) { struct iphdr *ip_header = ip_hdr(skb); struct flowi fl; struct rtable *table; memset(&fl, 0, sizeof(fl)); fl.u.ip4.daddr = ip_header->daddr; fl.flowi_tos = RT_TOS(ip_header->tos); fl.flowi_proto = skb->protocol; table = ip_route_output_key(&init_net, &fl.u.ip4); if (!table) { log_err(ERR_ROUTE_FAILED, "ip_route_output_key() returned NULL. Cannot route packet."); return NULL; } if (IS_ERR(table)) { log_err(ERR_ROUTE_FAILED, "ip_route_output_key() returned %p. Cannot route packet.", table); return NULL; } return table; }
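Note that in kernels where ip_route_output_key() returns a struct rtable pointer directly, as here, failure is reported via ERR_PTR() rather than NULL, so the NULL branch above is defensive rather than reachable; the IS_ERR() check alone should suffice.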
/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly
 *   increase the scalability of the virtual server.
 *
 *   Used for ANY protocol
 */
int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                      struct ip_vs_protocol *pp)
{
    struct rtable *rt;              /* Route to the other host */
    struct net_device *tdev;        /* Device to other host */
    struct iphdr *old_iph = ip_hdr(skb);
    u8 tos = old_iph->tos;
    __be16 df = old_iph->frag_off;
    struct iphdr *iph;              /* Our new IP header */
    unsigned int max_headroom;      /* The extra header space needed */
    int mtu;
    int ret;

    EnterFunction(10);

    if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
                                  RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
                                               IP_VS_RT_MODE_NON_LOCAL)))
        goto tx_error_icmp;
    if (rt->rt_flags & RTCF_LOCAL) {
        ip_rt_put(rt);
        IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
    }

    tdev = rt->dst.dev;

    mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
    if (mtu < 68) {
        IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
        goto tx_error_put;
    }
    if (skb_dst(skb))
        skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

    df |= (old_iph->frag_off & htons(IP_DF));

    if ((old_iph->frag_off & htons(IP_DF) &&
         mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
        IP_VS_DBG_RL("%s(): frag needed\n", __func__);
        goto tx_error_put;
    }

    /*
     * Okay, now see if we can stuff it in the buffer as-is.
     */
    max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

    if (skb_headroom(skb) < max_headroom ||
        skb_cloned(skb) || skb_shared(skb)) {
        struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
        if (!new_skb) {
            ip_rt_put(rt);
            kfree_skb(skb);
            IP_VS_ERR_RL("%s(): no memory\n", __func__);
            return NF_STOLEN;
        }
        kfree_skb(skb);
        skb = new_skb;
        old_iph = ip_hdr(skb);
    }

    skb->transport_header = skb->network_header;

    /* fix old IP header checksum */
    ip_send_check(old_iph);

    skb_push(skb, sizeof(struct iphdr));
    skb_reset_network_header(skb);
    memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

    /* drop old route */
    skb_dst_drop(skb);
    skb_dst_set(skb, &rt->dst);

    /*
     * Push down and install the IPIP header.
     */
    iph = ip_hdr(skb);
    iph->version  = 4;
    iph->ihl      = sizeof(struct iphdr) >> 2;
    iph->frag_off = df;
    iph->protocol = IPPROTO_IPIP;
    iph->tos      = tos;
    iph->daddr    = rt->rt_dst;
    iph->saddr    = rt->rt_src;
    iph->ttl      = old_iph->ttl;
    ip_select_ident(iph, &rt->dst, NULL);

    /* Another hack: avoid icmp_send in ip_fragment */
    skb->local_df = 1;

    ret = IP_VS_XMIT_TUNNEL(skb, cp);
    if (ret == NF_ACCEPT)
        ip_local_out(skb);
    else if (ret == NF_DROP)
        kfree_skb(skb);

    LeaveFunction(10);

    return NF_STOLEN;

  tx_error_icmp:
    dst_link_failure(skb);
  tx_error:
    kfree_skb(skb);
    LeaveFunction(10);
    return NF_STOLEN;
  tx_error_put:
    ip_rt_put(rt);
    goto tx_error;
}
/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                   struct ip_vs_protocol *pp)
{
    struct rtable *rt;          /* Route to the other host */
    int mtu;
    struct iphdr *iph = ip_hdr(skb);
    int local;

    EnterFunction(10);

    /* check if it is a connection of no-client-port */
    if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
        __be16 _pt, *p;
        p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
        if (p == NULL)
            goto tx_error;
        ip_vs_conn_fill_cport(cp, *p);
        IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
    }

    if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
                                  RT_TOS(iph->tos),
                                  IP_VS_RT_MODE_LOCAL |
                                      IP_VS_RT_MODE_NON_LOCAL |
                                      IP_VS_RT_MODE_RDR)))
        goto tx_error_icmp;
    local = rt->rt_flags & RTCF_LOCAL;
    /*
     * Avoid duplicate tuple in reply direction for NAT traffic
     * to local address when connection is sync-ed
     */
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
    if (cp->flags & IP_VS_CONN_F_SYNC && local) {
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

        if (ct && !nf_ct_is_untracked(ct)) {
            IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
                             "ip_vs_nat_xmit(): "
                             "stopping DNAT to local address");
            goto tx_error_put;
        }
    }
#endif

    /* From world but DNAT to loopback address? */
    if (local && ipv4_is_loopback(rt->rt_dst) &&
        rt_is_input_route(skb_rtable(skb))) {
        IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
                         "stopping DNAT to loopback address");
        goto tx_error_put;
    }

    /* MTU checking */
    mtu = dst_mtu(&rt->dst);
    if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
        !skb_is_gso(skb)) {
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
        IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
                         "ip_vs_nat_xmit(): frag needed for");
        goto tx_error_put;
    }

    /* copy-on-write the packet before mangling it */
    if (!skb_make_writable(skb, sizeof(struct iphdr)))
        goto tx_error_put;

    if (skb_cow(skb, rt->dst.dev->hard_header_len))
        goto tx_error_put;

    /* mangle the packet */
    if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
        goto tx_error_put;
    ip_hdr(skb)->daddr = cp->daddr.ip;
    ip_send_check(ip_hdr(skb));

    if (!local) {
        /* drop old route */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);
    } else {
        ip_rt_put(rt);
        /*
         * Some IPv4 replies get local address from routes,
         * not from iph, so while we DNAT after routing
         * we need this second input/output route.
         */
        if (!__ip_vs_reroute_locally(skb))
            goto tx_error;
    }

    IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");

    /* FIXME: when the application helper enlarges the packet and the
       length is larger than the MTU of the outgoing device, there will
       still be an MTU problem. */

    /* Another hack: avoid icmp_send in ip_fragment */
    skb->local_df = 1;

    IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);

    LeaveFunction(10);
    return NF_STOLEN;

  tx_error_icmp:
    dst_link_failure(skb);
  tx_error:
    kfree_skb(skb);
    LeaveFunction(10);
    return NF_STOLEN;
  tx_error_put:
    ip_rt_put(rt);
    goto tx_error;
}
/* Reroute packet to local IPv4 stack after DNAT */ static int __ip_vs_reroute_locally(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; struct net *net = dev_net(dev); struct iphdr *iph = ip_hdr(skb); if (rt_is_input_route(rt)) { unsigned long orefdst = skb->_skb_refdst; if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) return 0; refdst_drop(orefdst); } else { struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_tos = RT_TOS(iph->tos), .flowi4_mark = skb->mark, }; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return 0; if (!(rt->rt_flags & RTCF_LOCAL)) { ip_rt_put(rt); return 0; } /* Drop old route. */ skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } return 1; } #ifdef CONFIG_IP_VS_IPV6 static inline int __ip_vs_is_local_route6(struct rt6_info *rt) { return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK; } static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, struct in6_addr *ret_saddr, int do_xfrm) { struct dst_entry *dst; struct flowi6 fl6 = { .daddr = *daddr, }; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) goto out_err; if (!ret_saddr) return dst; if (ipv6_addr_any(&fl6.saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6.daddr, 0, &fl6.saddr) < 0) goto out_err; if (do_xfrm) { dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) { dst = NULL; goto out_err; } } ipv6_addr_copy(ret_saddr, &fl6.saddr); return dst; out_err: dst_release(dst); IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); return NULL; } /* * Get route to destination or remote server * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest, * &4=Allow redirect from remote daddr to local */ static struct rt6_info * __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, struct in6_addr *daddr, struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) { struct net *net = dev_net(skb_dst(skb)->dev); struct rt6_info *rt; /* Route to the other host */ struct rt6_info *ort; /* Original route */ struct dst_entry *dst; int local; if (dest) { spin_lock(&dest->dst_lock); rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); if (!rt) { u32 cookie; dst = __ip_vs_route_output_v6(net, &dest->addr.in6, &dest->dst_saddr, do_xfrm); if (!dst) { spin_unlock(&dest->dst_lock); return NULL; } rt = (struct rt6_info *) dst; cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", &dest->addr.in6, &dest->dst_saddr, atomic_read(&rt->dst.__refcnt)); } if (ret_saddr) ipv6_addr_copy(ret_saddr, &dest->dst_saddr); spin_unlock(&dest->dst_lock); } else { dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); if (!dst) return NULL; rt = (struct rt6_info *) dst; } local = __ip_vs_is_local_route6(rt); if (!((local ? 1 : 2) & rt_mode)) { IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n", local ? 
"local":"non-local", daddr); dst_release(&rt->dst); return NULL; } if (local && !(rt_mode & 4) && !((ort = (struct rt6_info *) skb_dst(skb)) && __ip_vs_is_local_route6(ort))) { IP_VS_DBG_RL("Redirect from non-local address %pI6 to local " "requires NAT method, dest: %pI6\n", &ipv6_hdr(skb)->daddr, daddr); dst_release(&rt->dst); return NULL; } if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LOOPBACK)) { IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 " "to non-local address, dest: %pI6\n", &ipv6_hdr(skb)->saddr, daddr); dst_release(&rt->dst); return NULL; } return rt; } #endif /* * Release dest->dst_cache before a dest is removed */ void ip_vs_dst_reset(struct ip_vs_dest *dest) { struct dst_entry *old_dst; old_dst = dest->dst_cache; dest->dst_cache = NULL; dst_release(old_dst); } #define IP_VS_XMIT_TUNNEL(skb, cp) \ ({ \ int __ret = NF_ACCEPT; \ \ (skb)->ipvs_property = 1; \ if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ __ret = ip_vs_confirm_conntrack(skb, cp); \ if (__ret == NF_ACCEPT) { \ nf_reset(skb); \ skb_forward_csum(skb); \ } \ __ret; \ }) #define IP_VS_XMIT_NAT(pf, skb, cp, local) \ do { \ (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ ip_vs_notrack(skb); \ else \ ip_vs_update_conntrack(skb, cp, 1); \ if (local) \ return NF_ACCEPT; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ } while (0) #define IP_VS_XMIT(pf, skb, cp, local) \ do { \ (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ ip_vs_notrack(skb); \ if (local) \ return NF_ACCEPT; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ } while (0) /* * NULL transmitter (do nothing except return NF_ACCEPT) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { /* we do not touch skb and do not need pskb ptr */ IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); }
/*
 *      ICMP packet transmitter
 *      called by the ip_vs_in_icmp
 */
int ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                    struct ip_vs_protocol *pp, int offset)
{
    struct rtable *rt;      /* Route to the other host */
    int mtu;
    int rc;
    int local;

    EnterFunction(10);

    /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
       forwarded directly here, because there is no need to
       translate address/port back */
    if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
        if (cp->packet_xmit)
            rc = cp->packet_xmit(skb, cp, pp);
        else
            rc = NF_ACCEPT;
        /* do not touch skb anymore */
        atomic_inc_unchecked(&cp->in_pkts);
        goto out;
    }

    /*
     * mangle and send the packet here (only for VS/NAT)
     */

    if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
                                  RT_TOS(ip_hdr(skb)->tos),
                                  IP_VS_RT_MODE_LOCAL |
                                      IP_VS_RT_MODE_NON_LOCAL |
                                      IP_VS_RT_MODE_RDR)))
        goto tx_error_icmp;
    local = rt->rt_flags & RTCF_LOCAL;

    /*
     * Avoid duplicate tuple in reply direction for NAT traffic
     * to local address when connection is sync-ed
     */
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
    if (cp->flags & IP_VS_CONN_F_SYNC && local) {
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

        if (ct && !nf_ct_is_untracked(ct)) {
            IP_VS_DBG(10, "%s(): "
                      "stopping DNAT to local address %pI4\n",
                      __func__, &cp->daddr.ip);
            goto tx_error_put;
        }
    }
#endif

    /* From world but DNAT to loopback address? */
    if (local && ipv4_is_loopback(rt->rt_dst) &&
        rt_is_input_route(skb_rtable(skb))) {
        IP_VS_DBG(1, "%s(): "
                  "stopping DNAT to loopback %pI4\n",
                  __func__, &cp->daddr.ip);
        goto tx_error_put;
    }

    /* MTU checking */
    mtu = dst_mtu(&rt->dst);
    if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
        !skb_is_gso(skb)) {
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
        IP_VS_DBG_RL("%s(): frag needed\n", __func__);
        goto tx_error_put;
    }

    /* copy-on-write the packet before mangling it */
    if (!skb_make_writable(skb, offset))
        goto tx_error_put;

    if (skb_cow(skb, rt->dst.dev->hard_header_len))
        goto tx_error_put;

    ip_vs_nat_icmp(skb, pp, cp, 0);

    if (!local) {
        /* drop the old route when skb is not shared */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);
    } else {
        ip_rt_put(rt);
        /*
         * Some IPv4 replies get local address from routes,
         * not from iph, so while we DNAT after routing
         * we need this second input/output route.
         */
        if (!__ip_vs_reroute_locally(skb))
            goto tx_error;
    }

    /* Another hack: avoid icmp_send in ip_fragment */
    skb->local_df = 1;

    IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);

    rc = NF_STOLEN;
    goto out;

  tx_error_icmp:
    dst_link_failure(skb);
  tx_error:
    dev_kfree_skb(skb);
    rc = NF_STOLEN;
  out:
    LeaveFunction(10);
    return rc;
  tx_error_put:
    ip_rt_put(rt);
    goto tx_error;
}
int ip_do_nat(struct sk_buff *skb) { struct rtable *rt = (struct rtable*)skb->dst; struct iphdr *iph = skb->nh.iph; u32 odaddr = iph->daddr; u32 osaddr = iph->saddr; u16 check; IPCB(skb)->flags |= IPSKB_TRANSLATED; /* Rewrite IP header */ iph->daddr = rt->rt_dst_map; iph->saddr = rt->rt_src_map; iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); /* If it is the first fragment, rewrite protocol headers */ if (!(iph->frag_off & htons(IP_OFFSET))) { u16 *cksum; switch(iph->protocol) { case IPPROTO_TCP: cksum = (u16*)&((struct tcphdr*)(((char*)iph) + (iph->ihl<<2)))->check; if ((u8*)(cksum+1) > skb->tail) goto truncated; check = *cksum; if (skb->ip_summed != CHECKSUM_HW) check = ~check; check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, check); check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); if (skb->ip_summed == CHECKSUM_HW) check = ~check; *cksum = check; break; case IPPROTO_UDP: cksum = (u16*)&((struct udphdr*)(((char*)iph) + (iph->ihl<<2)))->check; if ((u8*)(cksum+1) > skb->tail) goto truncated; if ((check = *cksum) != 0) { check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check); check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); *cksum = check ? : 0xFFFF; } break; case IPPROTO_ICMP: { struct icmphdr *icmph = (struct icmphdr*)((char*)iph + (iph->ihl<<2)); struct iphdr *ciph; u32 idaddr, isaddr; int updated; if ((icmph->type != ICMP_DEST_UNREACH) && (icmph->type != ICMP_TIME_EXCEEDED) && (icmph->type != ICMP_PARAMETERPROB)) break; ciph = (struct iphdr *) (icmph + 1); if ((u8*)(ciph+1) > skb->tail) goto truncated; isaddr = ciph->saddr; idaddr = ciph->daddr; updated = 0; if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr) { ciph->saddr = iph->daddr; updated = 1; } if (rt->rt_flags&RTCF_SNAT) { if (ciph->daddr != osaddr) { struct fib_result res; struct rt_key key; unsigned flags = 0; key.src = ciph->daddr; key.dst = ciph->saddr; key.iif = skb->dev->ifindex; key.oif = 0; #ifdef CONFIG_IP_ROUTE_TOS key.tos = RT_TOS(ciph->tos); #endif #ifdef CONFIG_IP_ROUTE_FWMARK key.fwmark = 0; #endif /* Use fib_lookup() until we get our own * hash table of NATed hosts -- Rani */ if (fib_lookup(&key, &res) == 0) { if (res.r) { ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags); if (ciph->daddr != idaddr) updated = 1; } fib_res_put(&res); } } else { ciph->daddr = iph->saddr; updated = 1; } } if (updated) { cksum = &icmph->checksum; /* Using tcpudp primitive. Why not? */ check = csum_tcpudp_magic(ciph->saddr, ciph->daddr, 0, 0, ~(*cksum)); *cksum = csum_tcpudp_magic(~isaddr, ~idaddr, 0, 0, ~check); } break; } default: break; }
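The paired csum_tcpudp_magic() calls above perform an incremental one's-complement update of a transport checksum after the addresses change: the first call folds the new source/destination pair into the unfolded complement of the old checksum, and the second cancels the old pair by adding its bitwise complements (adding ~x is, in one's-complement arithmetic, subtracting x). A condensed sketch of the idiom, with a hypothetical helper name:

/* Incremental checksum fixup after rewriting addresses (sketch).
 * old holds the checksum computed over (osaddr, odaddr); the result
 * is valid for (nsaddr, ndaddr) without touching the payload sum. */
static inline __sum16 example_nat_fix_csum(__sum16 old,
                                           __be32 osaddr, __be32 odaddr,
                                           __be32 nsaddr, __be32 ndaddr)
{
    __sum16 check;

    /* fold the new address pair into the running sum */
    check = csum_tcpudp_magic(nsaddr, ndaddr, 0, 0,
                              (__force __wsum)~old);
    /* cancel the old address pair: ~x acts as -x in one's complement */
    check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0,
                              (__force __wsum)~check);
    return check;
}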
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; __be16 df; struct rtable *rt; /* Route to the other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst; int err; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ if (skb_dst(skb) == NULL) { dev->stats.tx_fifo_errors++; goto tx_error; } if (skb->protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { const struct in6_addr *addr6; struct neighbour *neigh; bool do_tx_error_icmp; int addr_type; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (neigh == NULL) goto tx_error; addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } if ((addr_type & IPV6_ADDR_COMPATv4) == 0) do_tx_error_icmp = true; else { do_tx_error_icmp = false; dst = addr6->s6_addr32[3]; } neigh_release(neigh); if (do_tx_error_icmp) goto tx_error_icmp; } #endif else goto tx_error; } tos = tnl_params->tos; if (tos & 0x1) { tos &= ~0x1; if (skb->protocol == htons(ETH_P_IP)) tos = inner_iph->tos; else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } rt = ip_route_output_tunnel(tunnel->net, &fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); if (IS_ERR(rt)) { dev->stats.tx_carrier_errors++; goto tx_error; } if (rt->dst.dev == dev) { ip_rt_put(rt); dev->stats.collisions++; goto tx_error; } if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) { ip_rt_put(rt); goto tx_error; } if (tunnel->net != dev_net(dev)) skb_scrub_packet(skb); if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = tnl_params->ttl; if (ttl == 0) { if (skb->protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; #endif else ttl = ip4_dst_hoplimit(&rt->dst); } df = tnl_params->frag_off; if (skb->protocol == htons(ETH_P_IP)) df |= (inner_iph->frag_off&htons(IP_DF)); max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len; if (max_headroom > dev->needed_headroom) dev->needed_headroom = max_headroom; if (skb_cow_head(skb, dev->needed_headroom)) { dev->stats.tx_dropped++; dev_kfree_skb(skb); return; } err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return; #if IS_ENABLED(CONFIG_IPV6) tx_error_icmp: dst_link_failure(skb); #endif tx_error: dev->stats.tx_errors++; dev_kfree_skb(skb); }
static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev) { struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; struct net_device_stats *stats = &tunnel->stat; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; u16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct device *tdev; /* Device to other host */ struct iphdr *old_iph = skb->nh.iph; struct iphdr *iph; /* Our new IP header */ int max_headroom; /* The extra header space needed */ u32 dst = tiph->daddr; int mtu; if (tunnel->recursion++) { tunnel->stat.collisions++; goto tx_error; } if (skb->protocol != __constant_htons(ETH_P_IP)) goto tx_error; if (tos&1) tos = old_iph->tos; if (!dst) { /* NBMA tunnel */ if ((rt = (struct rtable*)skb->dst) == NULL) { tunnel->stat.tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway) == 0) goto tx_error_icmp; } if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { tunnel->stat.tx_carrier_errors++; goto tx_error_icmp; } tdev = rt->u.dst.dev; if (tdev == dev) { ip_rt_put(rt); tunnel->stat.collisions++; goto tx_error; } mtu = rt->u.dst.pmtu - sizeof(struct iphdr); if (mtu < 68) { tunnel->stat.collisions++; ip_rt_put(rt); goto tx_error; } if (skb->dst && mtu < skb->dst->pmtu) skb->dst->pmtu = mtu; df |= (old_iph->frag_off&__constant_htons(IP_DF)); if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); goto tx_error; } if (tunnel->err_count > 0) { if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } skb->h.raw = skb->nh.raw; /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); stats->tx_dropped++; dev_kfree_skb(skb); tunnel->recursion--; return 0; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; } skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst_release(skb->dst); skb->dst = &rt->u.dst; /* * Push down and install the IPIP header. */ iph = skb->nh.iph; iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; iph->protocol = IPPROTO_IPIP; iph->tos = tos; iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; if ((iph->ttl = tiph->ttl) == 0) iph->ttl = old_iph->ttl; iph->tot_len = htons(skb->len); iph->id = htons(ip_id_count++); ip_send_check(iph); stats->tx_bytes += skb->len; stats->tx_packets++; ip_send(skb); tunnel->recursion--; return 0; tx_error_icmp: dst_link_failure(skb); tx_error: stats->tx_errors++; dev_kfree_skb(skb); tunnel->recursion--; return 0; }
/* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. * * There are two cases to consider: * 1. The packet was DNAT'ed to a device in the same bridge * port group as it was received on. We can still bridge * the packet. * 2. The packet was DNAT'ed to a different device, either * a non-bridged device or another bridge port group. * The packet will need to be routed. * * The correct way of distinguishing between these two cases is to * call ip_route_input() and to look at skb->dst->dev, which is * changed to the destination device if ip_route_input() succeeds. * * Let's first consider the case that ip_route_input() succeeds: * * If the output device equals the logical bridge device the packet * came in on, we can consider this bridging. The corresponding MAC * address will be obtained in br_nf_pre_routing_finish_bridge. * Otherwise, the packet is considered to be routed and we just * change the destination MAC address so that the packet will * later be passed up to the IP stack to be routed. For a redirected * packet, ip_route_input() will give back the localhost as output device, * which differs from the bridge device. * * Let's now consider the case that ip_route_input() fails: * * This can be because the destination address is martian, in which case * the packet will be dropped. * If IP forwarding is disabled, ip_route_input() will fail, while * ip_route_output_key() can return success. The source * address for ip_route_output_key() is set to zero, so ip_route_output_key() * thinks we're handling a locally generated packet and won't care * if IP forwarding is enabled. If the output device equals the logical bridge * device, we proceed as if ip_route_input() succeeded. If it differs from the * logical bridge port or if ip_route_output_key() fails we drop the packet. */ static int br_nf_pre_routing_finish(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct iphdr *iph = ip_hdr(skb); struct nf_bridge_info *nf_bridge = skb->nf_bridge; struct rtable *rt; int err; if (nf_bridge->mask & BRNF_PKT_TYPE) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->mask ^= BRNF_PKT_TYPE; } nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; if (dnat_took_place(skb)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* If err equals -EHOSTUNREACH the error is due to a * martian destination or due to the fact that * forwarding is disabled. For most martian packets, * ip_route_output_key() will fail. It won't fail for 2 types of * martian destinations: loopback destinations and destination * 0.0.0.0. In both cases the packet will be dropped because the * destination is the loopback device and not the bridge. */ if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) goto free_skb; rt = ip_route_output(dev_net(dev), iph->daddr, 0, RT_TOS(iph->tos), 0); if (!IS_ERR(rt)) { /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. 
*/ if (rt->dst.dev == dev) { skb_dst_set(skb, &rt->dst); goto bridged_dnat; } ip_rt_put(rt); } free_skb: kfree_skb(skb); return 0; } else { if (skb_dst(skb)->dev == dev) { bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge, 1); return 0; } memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, ETH_ALEN); skb->pkt_type = PACKET_HOST; } } else { rt = bridge_parent_rtable(nf_bridge->physindev); if (!rt) { kfree_skb(skb); return 0; } skb_dst_set_noref(skb, &rt->dst); } skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, br_handle_frame_finish, 1); return 0; }
void ipip_err(struct sk_buff *skb, unsigned char *dp, int len) { #ifndef I_WISH_WORLD_WERE_PERFECT /* It is not :-( All the routers (except for Linux) return only 8 bytes of packet payload. It means, that precise relaying of ICMP in the real Internet is absolutely infeasible. */ struct iphdr *iph = (struct iphdr*)dp; int type = skb->h.icmph->type; int code = skb->h.icmph->code; struct ip_tunnel *t; if (len < sizeof(struct iphdr)) return; switch (type) { default: case ICMP_PARAMETERPROB: return; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ return; case ICMP_FRAG_NEEDED: /* Soft state for pmtu is maintained by IP core. */ return; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return; break; } t = ipip_tunnel_lookup(iph->daddr, iph->saddr); if (t == NULL || t->parms.iph.daddr == 0) return; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) return; if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) t->err_count++; else t->err_count = 1; t->err_time = jiffies; return; #else struct iphdr *iph = (struct iphdr*)dp; int hlen = iph->ihl<<2; struct iphdr *eiph; int type = skb->h.icmph->type; int code = skb->h.icmph->code; int rel_type = 0; int rel_code = 0; int rel_info = 0; struct sk_buff *skb2; struct rtable *rt; if (len < hlen + sizeof(struct iphdr)) return; eiph = (struct iphdr*)(dp + hlen); switch (type) { default: return; case ICMP_PARAMETERPROB: if (skb->h.icmph->un.gateway < hlen) return; /* So... This guy found something strange INSIDE encapsulated packet. Well, he is fool, but what can we do ? */ rel_type = ICMP_PARAMETERPROB; rel_info = skb->h.icmph->un.gateway - hlen; break; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ return; case ICMP_FRAG_NEEDED: /* And it is the only really necessary thing :-) */ rel_info = ntohs(skb->h.icmph->un.frag.mtu); if (rel_info < hlen+68) return; rel_info -= hlen; /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ if (rel_info > ntohs(eiph->tot_len)) return; break; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, I believe, it is just ether pollution. 
--ANK */ rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return; break; } /* Prepare fake skb to feed it to icmp_send */ skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 == NULL) return; dst_release(skb2->dst); skb2->dst = NULL; skb_pull(skb2, skb->data - (u8*)eiph); skb2->nh.raw = skb2->data; /* Try to guess incoming interface */ if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { kfree_skb(skb2); return; } skb2->dev = rt->u.dst.dev; /* route "incoming" packet */ if (rt->rt_flags&RTCF_LOCAL) { ip_rt_put(rt); rt = NULL; if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || rt->u.dst.dev->type != ARPHRD_IPGRE) { ip_rt_put(rt); kfree_skb(skb2); return; } } else { ip_rt_put(rt); if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || skb2->dst->dev->type != ARPHRD_IPGRE) { kfree_skb(skb2); return; } } /* change mtu on this route */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { if (rel_info > skb2->dst->pmtu) { kfree_skb(skb2); return; } skb2->dst->pmtu = rel_info; rel_info = htonl(rel_info); } else if (type == ICMP_TIME_EXCEEDED) { struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; if (t->parms.iph.ttl) { rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; } } icmp_send(skb2, rel_type, rel_code, rel_info); kfree_skb(skb2); return; #endif }
/*
 * Create syn packet and send it to rs.
 * ATTENTION: we also store the syn skb in cp if syn retransmission
 * is turned on.
 */
static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th,
                                 struct ip_vs_conn *cp, struct sk_buff *skb,
                                 struct ip_vs_protocol *pp,
                                 struct ip_vs_synproxy_opt *opt)
{
    struct sk_buff *syn_skb;
    int tcp_hdr_size;
    __u8 tcp_flags = TCPCB_FLAG_SYN;
    unsigned int tcphoff;
    struct tcphdr *new_th;

    if (!cp->packet_xmit) {
        IP_VS_ERR_RL("warning: packet_xmit is null");
        return 0;
    }

    syn_skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
    if (unlikely(syn_skb == NULL)) {
        IP_VS_ERR_RL("alloc skb failed when sending rs syn packet\n");
        return 0;
    }

    /* Reserve space for headers */
    skb_reserve(syn_skb, MAX_TCP_HEADER);

    tcp_hdr_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
                    (opt->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
                    (opt->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                    /* SACK_PERM is in the place of NOP NOP of TS */
                    ((opt->sack_ok &&
                      !opt->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));

    new_th = (struct tcphdr *)skb_push(syn_skb, tcp_hdr_size);

    /* Compose tcp header */
    skb_reset_transport_header(syn_skb);
    syn_skb->csum = 0;

    /* Set tcp hdr */
    new_th->source = th->source;
    new_th->dest = th->dest;
    new_th->seq = htonl(ntohl(th->seq) - 1);
    new_th->ack_seq = 0;
    /* Word 6 of the TCP header: data offset (in 32-bit words) in the
     * top 4 bits, followed by the flag bits. */
    *(((__u16 *) new_th) + 6) =
        htons(((tcp_hdr_size >> 2) << 12) | tcp_flags);
    /* FIX_ME: what window should we use */
    new_th->window = htons(5000);
    new_th->check = 0;
    new_th->urg_ptr = 0;
    new_th->urg = 0;
    new_th->ece = 0;
    new_th->cwr = 0;

    syn_proxy_syn_build_options((__be32 *) (new_th + 1), opt);

    /*
     * Set ip hdr
     * Attention: set source and dest addr to the ack skb's.
     * We rely on the packet_xmit func to do the NAT.
     */
#ifdef CONFIG_IP_VS_IPV6
    if (af == AF_INET6) {
        struct ipv6hdr *ack_iph = ipv6_hdr(skb);
        struct ipv6hdr *iph =
            (struct ipv6hdr *)skb_push(syn_skb, sizeof(struct ipv6hdr));

        tcphoff = sizeof(struct ipv6hdr);
        skb_reset_network_header(syn_skb);
        memcpy(&iph->saddr, &ack_iph->saddr, sizeof(struct in6_addr));
        memcpy(&iph->daddr, &ack_iph->daddr, sizeof(struct in6_addr));

        iph->version = 6;
        iph->nexthdr = NEXTHDR_TCP;
        iph->payload_len = htons(tcp_hdr_size);
        iph->hop_limit = IPV6_DEFAULT_HOPLIMIT;

        new_th->check = 0;
        syn_skb->csum =
            skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
        new_th->check =
            csum_ipv6_magic(&iph->saddr, &iph->daddr,
                            syn_skb->len - tcphoff, IPPROTO_TCP,
                            syn_skb->csum);
    } else
#endif
    {
        struct iphdr *ack_iph = ip_hdr(skb);
        u32 rtos = RT_TOS(ack_iph->tos);
        struct iphdr *iph =
            (struct iphdr *)skb_push(syn_skb, sizeof(struct iphdr));

        tcphoff = sizeof(struct iphdr);
        skb_reset_network_header(syn_skb);
        /* First 16 bits of the IP header: version 4, ihl 5, then tos */
        *((__u16 *) iph) = htons((4 << 12) | (5 << 8) | (rtos & 0xff));
        iph->tot_len = htons(syn_skb->len);
        iph->frag_off = htons(IP_DF);
        /* FIX_ME: what ttl should we use */
        iph->ttl = IPDEFTTL;
        iph->protocol = IPPROTO_TCP;
        iph->saddr = ack_iph->saddr;
        iph->daddr = ack_iph->daddr;

        ip_send_check(iph);

        new_th->check = 0;
        syn_skb->csum =
            skb_checksum(syn_skb, tcphoff, syn_skb->len - tcphoff, 0);
        new_th->check =
            csum_tcpudp_magic(iph->saddr, iph->daddr,
                              syn_skb->len - tcphoff, IPPROTO_TCP,
                              syn_skb->csum);
    }

    /* Save syn_skb if syn retransmission is on */
    if (sysctl_ip_vs_synproxy_syn_retry > 0) {
        cp->syn_skb = skb_copy(syn_skb, GFP_ATOMIC);
        atomic_set(&cp->syn_retry_max, sysctl_ip_vs_synproxy_syn_retry);
    }

    /* Save info for fast_response_xmit */
    if (sysctl_ip_vs_fast_xmit && skb->dev &&
        likely(skb->dev->type == ARPHRD_ETHER) &&
        skb_mac_header_was_set(skb)) {
        struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb);

        if (likely(cp->indev == NULL)) {
            cp->indev = skb->dev;
            dev_hold(cp->indev);
        }

        if (unlikely(cp->indev != skb->dev)) {
            dev_put(cp->indev);
            cp->indev = skb->dev;
            dev_hold(cp->indev);
        }

        memcpy(cp->src_hwaddr, eth->h_source, ETH_ALEN);
        memcpy(cp->dst_hwaddr, eth->h_dest, ETH_ALEN);
        IP_VS_INC_ESTATS(ip_vs_esmib, FAST_XMIT_SYNPROXY_SAVE);
        IP_VS_DBG_RL("syn_proxy_send_rs_syn netdevice:%s\n",
                     netdev_name(skb->dev));
    }

    /* count in the syn packet */
    ip_vs_in_stats(cp, skb);

    /* If xmit failed, syn_skb will be freed correctly. */
    cp->packet_xmit(syn_skb, cp, pp);

    return 1;
}
static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = (struct sock *) chan->private; struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt=&po->proto.pptp; struct pptp_gre_header *hdr; unsigned int header_len=sizeof(*hdr); int len=skb?skb->len:0; int err=0; int window; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ struct iphdr *iph; /* Our new IP header */ int max_headroom; /* The extra header space needed */ INC_TX_PACKETS; spin_lock_bh(&opt->xmit_lock); window=WRAPPED(opt->ack_recv,opt->seq_sent)?(__u32)0xffffffff-opt->seq_sent+opt->ack_recv:opt->seq_sent-opt->ack_recv; if (!skb){ if (opt->ack_sent == opt->seq_recv) goto exit; }else if (window>opt->window){ __set_bit(PPTP_FLAG_PAUSE,(unsigned long*)&opt->flags); #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) mod_timer(&opt->ack_timeout_timer,opt->stat->rtt/100*HZ/10000); #else schedule_delayed_work(&opt->ack_timeout_work,opt->stat->rtt/100*HZ/10000); #endif goto exit; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) { struct rt_key key = { .dst=opt->dst_addr.sin_addr.s_addr, .src=opt->src_addr.sin_addr.s_addr, .tos=RT_TOS(0), }; if ((err=ip_route_output_key(&rt, &key))) { goto tx_error; } } #else { struct flowi fl = { .oif = 0, .nl_u = { .ip4_u = { .daddr = opt->dst_addr.sin_addr.s_addr, .saddr = opt->src_addr.sin_addr.s_addr, .tos = RT_TOS(0) } }, .proto = IPPROTO_GRE }; if ((err=ip_route_output_key(&rt, &fl))) { goto tx_error; } } #endif tdev = rt->u.dst.dev; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) max_headroom = ((tdev->hard_header_len+15)&~15) + sizeof(*iph)+sizeof(*hdr)+2; #else max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(*iph)+sizeof(*hdr)+2; #endif if (!skb){ skb=dev_alloc_skb(max_headroom); if (!skb) { ip_rt_put(rt); goto tx_error; } skb_reserve(skb,max_headroom-skb_headroom(skb)); }else if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); goto tx_error; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); kfree_skb(skb); skb = new_skb; } if (skb->len){ int islcp; unsigned char *data=skb->data; islcp=((data[0] << 8) + data[1])== PPP_LCP && 1 <= data[2] && data[2] <= 7; /* compress protocol field */ if ((opt->ppp_flags & SC_COMP_PROT) && data[0]==0 && !islcp) skb_pull(skb,1); /* * Put in the address/control bytes if necessary */ if ((opt->ppp_flags & SC_COMP_AC) == 0 || islcp) { data=skb_push(skb,2); data[0]=0xff; data[1]=0x03; } } len=skb->len; if (len==0) header_len-=sizeof(hdr->seq); if (opt->ack_sent == opt->seq_recv) header_len-=sizeof(hdr->ack); // Push down and install GRE header skb_push(skb,header_len); hdr=(struct pptp_gre_header *)(skb->data); hdr->flags = PPTP_GRE_FLAG_K; hdr->ver = PPTP_GRE_VER; hdr->protocol = htons(PPTP_GRE_PROTO); hdr->call_id = htons(opt->dst_addr.call_id); if (!len){ hdr->payload_len = 0; hdr->ver |= PPTP_GRE_FLAG_A; /* ack is in odd place because S == 0 */ hdr->seq = htonl(opt->seq_recv); opt->ack_sent = opt->seq_recv; opt->stat->tx_acks++; }else { hdr->flags |= PPTP_GRE_FLAG_S; hdr->seq = htonl(opt->seq_sent++); if (log_level>=3 && opt->seq_sent<=log_packets) printk(KERN_INFO"PPTP[%i]: send packet: seq=%i",opt->src_addr.call_id,opt->seq_sent); if (opt->ack_sent != opt->seq_recv) { /* send ack with this message */ hdr->ver |= PPTP_GRE_FLAG_A; hdr->ack = htonl(opt->seq_recv); opt->ack_sent = opt->seq_recv; if (log_level>=3 && opt->seq_sent<=log_packets) printk(" 
ack=%i",opt->seq_recv); } hdr->payload_len = htons(len); if (log_level>=3 && opt->seq_sent<=log_packets) printk("\n"); } /* * Push down and install the IP header. */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) skb->transport_header = skb->network_header; skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); #else skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, sizeof(*iph)); #endif memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) iph = ip_hdr(skb); #else iph = skb->nh.iph; #endif iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; iph->frag_off = 0;//df; iph->protocol = IPPROTO_GRE; iph->tos = 0; iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) iph->ttl = sysctl_ip_default_ttl; #else iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); #endif iph->tot_len = htons(skb->len); dst_release(skb->dst); skb->dst = &rt->u.dst; nf_reset(skb); skb->ip_summed = CHECKSUM_NONE; ip_select_ident(iph, &rt->u.dst, NULL); ip_send_check(iph); err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); wake_up(&opt->wait); if (err == NET_XMIT_SUCCESS || err == NET_XMIT_CN) { opt->stat->tx_sent++; if (!opt->stat->pt_seq){ opt->stat->pt_seq = opt->seq_sent; do_gettimeofday(&opt->stat->pt_time); } }else{ INC_TX_ERRORS; opt->stat->tx_failed++; } spin_unlock_bh(&opt->xmit_lock); return 1; tx_error: INC_TX_ERRORS; opt->stat->tx_failed++; if (!len) kfree_skb(skb); spin_unlock_bh(&opt->xmit_lock); return 1; exit: spin_unlock_bh(&opt->xmit_lock); return 0; }
/* This requires some explaining. If DNAT has taken place, * we will need to fix up the destination Ethernet address. * This is also true when SNAT takes place (for the reply direction). * * There are two cases to consider: * 1. The packet was DNAT'ed to a device in the same bridge * port group as it was received on. We can still bridge * the packet. * 2. The packet was DNAT'ed to a different device, either * a non-bridged device or another bridge port group. * The packet will need to be routed. * * The correct way of distinguishing between these two cases is to * call ip_route_input() and to look at skb->dst->dev, which is * changed to the destination device if ip_route_input() succeeds. * * Let's first consider the case that ip_route_input() succeeds: * * If the output device equals the logical bridge device the packet * came in on, we can consider this bridging. The corresponding MAC * address will be obtained in br_nf_pre_routing_finish_bridge. * Otherwise, the packet is considered to be routed and we just * change the destination MAC address so that the packet will * later be passed up to the IP stack to be routed. For a redirected * packet, ip_route_input() will give back the localhost as output device, * which differs from the bridge device. * * Let's now consider the case that ip_route_input() fails: * * This can be because the destination address is martian, in which case * the packet will be dropped. * If IP forwarding is disabled, ip_route_input() will fail, while * ip_route_output_key() can return success. The source * address for ip_route_output_key() is set to zero, so ip_route_output_key() * thinks we're handling a locally generated packet and won't care * if IP forwarding is enabled. If the output device equals the logical bridge * device, we proceed as if ip_route_input() succeeded. If it differs from the * logical bridge port or if ip_route_output_key() fails we drop the packet. */ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct iphdr *iph = ip_hdr(skb); struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; int err; nf_bridge->frag_max_size = IPCB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* If err equals -EHOSTUNREACH the error is due to a * martian destination or due to the fact that * forwarding is disabled. For most martian packets, * ip_route_output_key() will fail. It won't fail for 2 types of * martian destinations: loopback destinations and destination * 0.0.0.0. In both cases the packet will be dropped because the * destination is the loopback device and not the bridge. */ if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) goto free_skb; rt = ip_route_output(net, iph->daddr, 0, RT_TOS(iph->tos), 0); if (!IS_ERR(rt)) { /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. 
*/ if (rt->dst.dev == dev) { skb_dst_set(skb, &rt->dst); goto bridged_dnat; } ip_rt_put(rt); } free_skb: kfree_skb(skb); return 0; } else { if (skb_dst(skb)->dev == dev) { bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); skb->pkt_type = PACKET_HOST; } } else { rt = bridge_parent_rtable(nf_bridge->physindev); if (!rt) { kfree_skb(skb); return 0; } skb_dst_set_noref(skb, &rt->dst); } skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_handle_frame_finish); return 0; }
static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; struct net_device_stats *stats = &tunnel->stat; struct iphdr *tiph = &tunnel->parms.iph; struct ipv6hdr *iph6 = skb->nh.ipv6h; u8 tos = tunnel->parms.iph.tos; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ struct iphdr *iph; /* Our new IP header */ int max_headroom; /* The extra header space needed */ u32 dst = tiph->daddr; int mtu; struct in6_addr *addr6; int addr_type; if (tunnel->recursion++) { tunnel->stat.collisions++; goto tx_error; } if (skb->protocol != htons(ETH_P_IPV6)) goto tx_error; if (!dst) dst = try_6to4(&iph6->daddr); if (!dst) { struct neighbour *neigh = NULL; if (skb->dst) neigh = skb->dst->neighbour; if (neigh == NULL) { if (net_ratelimit()) printk(KERN_DEBUG "sit: nexthop == NULL\n"); goto tx_error; } addr6 = (struct in6_addr*)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { addr6 = &skb->nh.ipv6h->daddr; addr_type = ipv6_addr_type(addr6); } if (addr_type & IPV6_ADDR_COMPATv4) dst = addr6->s6_addr32[3]; else #ifdef CONFIG_IPV6_6TO4_NEXTHOP if (!(dst = try_6to4(addr6))) #endif goto tx_error_icmp; } if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { tunnel->stat.tx_carrier_errors++; goto tx_error_icmp; } if (rt->rt_type != RTN_UNICAST) { tunnel->stat.tx_carrier_errors++; goto tx_error_icmp; } tdev = rt->u.dst.dev; if (tdev == dev) { ip_rt_put(rt); tunnel->stat.collisions++; goto tx_error; } if (tiph->frag_off) mtu = rt->u.dst.pmtu - sizeof(struct iphdr); else mtu = skb->dst ? skb->dst->pmtu : dev->mtu; if (mtu < 68) { tunnel->stat.collisions++; ip_rt_put(rt); goto tx_error; } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->dst && mtu < skb->dst->pmtu) { struct rt6_info *rt6 = (struct rt6_info*)skb->dst; if (mtu < rt6->u.dst.pmtu) { if (tunnel->parms.iph.daddr || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; rt6->u.dst.pmtu = mtu; } } } if (skb->len > mtu) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); ip_rt_put(rt); goto tx_error; } if (tunnel->err_count > 0) { if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); stats->tx_dropped++; dev_kfree_skb(skb); tunnel->recursion--; return 0; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; iph6 = skb->nh.ipv6h; } skb->h.raw = skb->nh.raw; skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst_release(skb->dst); skb->dst = &rt->u.dst; /* * Push down and install the IPIP header. 
*/ iph = skb->nh.iph; iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; if (mtu > IPV6_MIN_MTU) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->protocol = IPPROTO_IPV6; iph->tos = INET_ECN_encapsulate(tos, ip6_get_dsfield(iph6)); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; if ((iph->ttl = tiph->ttl) == 0) iph->ttl = iph6->hop_limit; nf_reset(skb); IPTUNNEL_XMIT(); tunnel->recursion--; return 0; tx_error_icmp: dst_link_failure(skb); tx_error: stats->tx_errors++; dev_kfree_skb(skb); tunnel->recursion--; return 0; }