static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); }
static void udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, struct inpcbinfo *pcbinfo) { struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; /* * Redirects don't need to be handled up here. */ if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. */ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } } else in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd], udp_notify); }
void udp_input(struct mbuf *m, int off) { int iphlen = off; struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; uint16_t len, ip_len; struct inpcbinfo *pcbinfo; struct ip save_ip; struct sockaddr_in udp_in; struct m_tag *fwd_tag; int cscov_partial; uint8_t pr; ifp = m->m_pkthdr.rcvif; UDPSTAT_INC(udps_ipackets); /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return; } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); pr = ip->ip_p; cscov_partial = (pr == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); ip_len = ntohs(ip->ip_len) - iphlen; if (pr == IPPROTO_UDPLITE && len == 0) { /* Zero means checksum over the complete packet. */ len = ip_len; cscov_partial = 0; } if (ip_len != len) { if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (pr == IPPROTO_UDP) m_adj(m, len - ip_len); } /* * Save a copy of the IP header in case we want restore it for * sending an ICMP error message in response. 
*/ if (!V_udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { u_short uh_sum; if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + pr)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = (pr == IPPROTO_UDP) ? uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); return; } } else UDPSTAT_INC(udps_nosum); pcbinfo = get_inpcbinfo(pr); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct inpcbhead *pcblist; struct ip_moptions *imo; INP_INFO_RLOCK(pcbinfo); pcblist = get_pcblist(pr); last = NULL; LIST_FOREACH(inp, pcblist, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; INP_RLOCK(inp); /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. 
[RFC3678] */ imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct sockaddr_in group; int blocked; if (imo == NULL) { INP_RUNLOCK(inp); continue; } bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(imo, ifp, (struct sockaddr *)&group, (struct sockaddr *)&udp_in); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IPSTAT_INC(ips_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); continue; } } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, M_COPYALL); udp_append(last, ip, n, iphlen, &udp_in); INP_RUNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); INP_INFO_RUNLOCK(pcbinfo); goto badunlocked; } udp_append(last, ip, m, iphlen, &udp_in); INP_RUNLOCK(last); INP_INFO_RUNLOCK(pcbinfo); return; } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? 
*/ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; *ip = save_ip; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); return; } /* * Check the minimum TTL for socket. */ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); m_freem(m); return; } if (cscov_partial) { struct udpcb *up; up = intoudpcb(inp); if (up->u_rxcslen > len) { INP_RUNLOCK(inp); m_freem(m); return; } } UDP_PROBE(receive, NULL, inp, ip, inp, uh); udp_append(inp, ip, m, iphlen, &udp_in); INP_RUNLOCK(inp); return; badunlocked: m_freem(m); }
/* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin6. * Eventually, flow labels will have to be dealt with here, as well. * * If don't have a local address for this socket yet, * then pick one. * * I believe this has to be called at splnet(). */ int in6_pcbconnect(struct inpcb *inp, struct mbuf *nam) { struct in6_addr *in6a = NULL; struct sockaddr_in6 *sin6 = mtod(nam, struct sockaddr_in6 *); struct ifnet *ifp = NULL; /* outgoing interface */ int error = 0; struct sockaddr_in6 tmp; (void)&in6a; /* XXX fool gcc */ if (nam->m_len != sizeof(*sin6)) return (EINVAL); if (sin6->sin6_family != AF_INET6) return (EAFNOSUPPORT); if (sin6->sin6_port == 0) return (EADDRNOTAVAIL); /* reject IPv4 mapped address, we have no support for it */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) return EADDRNOTAVAIL; /* sanity check for mapped address case */ if (IN6_IS_ADDR_V4MAPPED(&inp->inp_laddr6)) return EINVAL; /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; /* KAME hack: embed scopeid */ if (in6_embedscope(&sin6->sin6_addr, sin6, inp, &ifp) != 0) return EINVAL; /* this must be cleared for ifa_ifwithaddr() */ sin6->sin6_scope_id = 0; /* Source address selection. */ /* * XXX: in6_selectsrc might replace the bound local address * with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ in6a = in6_selectsrc(sin6, inp->inp_outputopts6, inp->inp_moptions6, &inp->inp_route6, &inp->inp_laddr6, &error); if (in6a == 0) { if (error == 0) error = EADDRNOTAVAIL; return (error); } if (inp->inp_route6.ro_rt) ifp = inp->inp_route6.ro_rt->rt_ifp; inp->inp_ipv6.ip6_hlim = (u_int8_t)in6_selecthlim(inp, ifp); if (in_pcblookup(inp->inp_table, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) ? 
in6a : &inp->inp_laddr6, inp->inp_lport, INPLOOKUP_IPV6)) { return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) { if (inp->inp_lport == 0) (void)in6_pcbbind(inp, NULL, curproc); inp->inp_laddr6 = *in6a; } inp->inp_faddr6 = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; inp->inp_flowinfo &= ~IPV6_FLOWLABEL_MASK; if (ip6_auto_flowlabel) inp->inp_flowinfo |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash(inp); return (0); }
int in_pcbbind(struct inpcb *inp, struct mbuf *nam) { register struct socket *so = inp->inp_socket; unsigned short *lastport; struct sockaddr_in *sin; u_short lport = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; if (in_ifaddr == 0) return (EADDRNOTAVAIL); if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || (so->so_options & SO_ACCEPTCONN) == 0)) wild = 1; if (nam) { sin = mtod(nam, struct sockaddr_in *); if (nam->m_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif lport = sin->sin_port; if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... 
*/ if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } if (lport) { struct inpcb *t; /* GROSS */ if (ntohs(lport) < IPPORT_RESERVED && (error = suser(p->p_ucred, &p->p_acflag))) return (EACCES); if (so->so_uid) { t = in_pcblookup(inp->inp_pcbinfo, zeroin_addr, 0, sin->sin_addr, lport, INPLOOKUP_WILDCARD); if (t && (so->so_uid != t->inp_socket->so_uid)) return (EADDRINUSE); } t = in_pcblookup(inp->inp_pcbinfo, zeroin_addr, 0, sin->sin_addr, lport, wild); if (t && (reuseport & t->inp_socket->so_options) == 0) return (EADDRINUSE); } inp->inp_laddr = sin->sin_addr; } if (lport == 0) { unsigned short first, last; int count; inp->inp_flags |= INP_ANONPORT; if (inp->inp_flags & INP_HIGHPORT) { first = ipport_hifirstauto; /* sysctl */ last = ipport_hilastauto; lastport = &inp->inp_pcbinfo->lasthi; } else if (inp->inp_flags & INP_LOWPORT) { if ((error = suser(p->p_ucred, &p->p_acflag))) return (EACCES); first = ipport_lowfirstauto; /* 1023 */ last = ipport_lowlastauto; /* 600 */ lastport = &inp->inp_pcbinfo->lastlow; } else { first = ipport_firstauto; /* sysctl */ last = ipport_lastauto; lastport = &inp->inp_pcbinfo->lastport; } /* * Simple check to ensure all ports are not used up causing * a deadlock here. * * We split the two cases (up and down) so that the direction * is not being tested on each round of the loop. */ if (first > last) { /* * counting down */ count = first - last; do { if (count-- <= 0) /* completely used? */ return (EADDRNOTAVAIL); --*lastport; if (*lastport > first || *lastport < last) *lastport = first; lport = htons(*lastport); } while (in_pcblookup(inp->inp_pcbinfo, zeroin_addr, 0, inp->inp_laddr, lport, wild)); } else { /* * counting up */ count = last - first; do { if (count-- <= 0) /* completely used? 
*/ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); } while (in_pcblookup(inp->inp_pcbinfo, zeroin_addr, 0, inp->inp_laddr, lport, wild)); } } inp->inp_lport = lport; in_pcbrehash(inp); return (0); }
int in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct proc *p) { struct socket *so = inp->inp_socket; struct inpcbtable *table = inp->inp_table; u_int16_t first, last; u_int16_t *lastport = &inp->inp_table->inpt_lastport; u_int16_t lport = 0; int count; int wild = INPLOOKUP_IPV6; int error; /* XXX we no longer support IPv4 mapped address, so no tweaks here */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 && ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || (so->so_options & SO_ACCEPTCONN) == 0)) wild |= INPLOOKUP_WILDCARD; if (inp->inp_flags & INP_HIGHPORT) { first = ipport_hifirstauto; /* sysctl */ last = ipport_hilastauto; } else if (inp->inp_flags & INP_LOWPORT) { if ((error = suser(p, 0))) return (EACCES); first = IPPORT_RESERVED-1; /* 1023 */ last = 600; /* not IPPORT_RESERVED/2 */ } else { first = ipport_firstauto; /* sysctl */ last = ipport_lastauto; } /* * Simple check to ensure all ports are not used up causing * a deadlock here. * * We split the two cases (up and down) so that the direction * is not being tested on each round of the loop. */ if (first > last) { /* * counting down */ count = first - last; if (count) *lastport = first - arc4random_uniform(count); do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); --*lastport; if (*lastport > first || *lastport < last) *lastport = first; lport = htons(*lastport); } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) || in_pcblookup(table, &zeroin6_addr, 0, &inp->inp_laddr6, lport, wild)); } else { /* * counting up */ count = last - first; if (count) *lastport = first + arc4random_uniform(count); do { if (count-- < 0) /* completely used? 
*/ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); } while (in_baddynamic(*lastport, so->so_proto->pr_protocol) || in_pcblookup(table, &zeroin6_addr, 0, &inp->inp_laddr6, lport, wild)); } inp->inp_lport = lport; in_pcbrehash(inp); #if 0 inp->inp_flowinfo = 0; /* XXX */ #endif return 0; }
/* * Bind an address (or at least a port) to an PF_INET6 socket. */ int in6_pcbbind(struct inpcb *inp, struct mbuf *nam, struct proc *p) { struct socket *so = inp->inp_socket; struct inpcbtable *head = inp->inp_table; struct sockaddr_in6 *sin6; u_short lport = 0; int wild = INPLOOKUP_IPV6, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * REMINDER: Once up to speed, flow label processing should go here, * too. (Same with in6_pcbconnect.) */ if (in6_ifaddr == 0) return EADDRNOTAVAIL; if (inp->inp_lport != 0 || !IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6)) return EINVAL; /* If already bound, EINVAL! */ if ((so->so_options & (SO_REUSEADDR | SO_REUSEPORT)) == 0 && ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 || (so->so_options & SO_ACCEPTCONN) == 0)) wild |= INPLOOKUP_WILDCARD; /* * If I did get a sockaddr passed in... */ if (nam) { sin6 = mtod(nam, struct sockaddr_in6 *); if (nam->m_len != sizeof (*sin6)) return EINVAL; /* * Unlike v4, I have no qualms about EAFNOSUPPORT if the * wretched family is not filled in! */ if (sin6->sin6_family != AF_INET6) return EAFNOSUPPORT; /* KAME hack: embed scopeid */ if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL) != 0) return EINVAL; /* this must be cleared for ifa_ifwithaddr() */ sin6->sin6_scope_id = 0; lport = sin6->sin6_port; /* reject IPv4 mapped address, we have no support for it */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) return EADDRNOTAVAIL; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR | SO_REUSEPORT; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct ifaddr *ia = NULL; sin6->sin6_port = 0; /* * Yechhhh, because of upcoming * call to ifa_ifwithaddr(), which * does bcmp's over the PORTS as * well. 
(What about flow?) */ sin6->sin6_flowinfo = 0; if (!(so->so_options & SO_BINDANY) && ((ia = ifa_ifwithaddr((struct sockaddr *)sin6)) == NULL)) return EADDRNOTAVAIL; /* * bind to an anycast address might accidentally * cause sending a packet with an anycast source * address, so we forbid it. * * We should allow to bind to a deprecated address, * since the application dare to use it. * But, can we assume that they are careful enough * to check if the address is deprecated or not? * Maybe, as a safeguard, we should have a setsockopt * flag to control the bind(2) behavior against * deprecated addresses (default: forbid bind(2)). */ if (ia && ((struct in6_ifaddr *)ia)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) return (EADDRNOTAVAIL); } if (lport) { struct inpcb *t; /* * Question: Do we wish to continue the Berkeley * tradition of ports < IPPORT_RESERVED be only for * root? * Answer: For now yes, but IMHO, it should be REMOVED! * OUCH: One other thing, is there no better way of * finding a process for a socket instead of using * curproc? (Marked with BSD's {in,}famous XXX ? */ if (ntohs(lport) < IPPORT_RESERVED && (error = suser(p, 0))) return error; t = in_pcblookup(head, (struct in_addr *)&zeroin6_addr, 0, (struct in_addr *)&sin6->sin6_addr, lport, wild); if (t && (reuseport & t->inp_socket->so_options) == 0) return EADDRINUSE; } inp->inp_laddr6 = sin6->sin6_addr; } if (lport == 0) { error = in6_pcbsetport(&inp->inp_laddr6, inp, p); if (error != 0) return error; } else { inp->inp_lport = lport; in_pcbrehash(inp); } return 0; }
int in_pcbbind(struct inpcb *inp, struct mbuf *name) { struct sockaddr_in *sin; struct socket *so; struct inpcb *t; short *lastport; unsigned short lport; int wild, reuseport, i; /* Setup locals */ so = inp->inp_socket; lastport = &inp->inp_pcbinfo->lastport; reuseport = (so->so_options & SO_REUSEPORT); lport = 0; wild = 0; i = in_pcbhash(inp); if (in_ifaddr == NULL) return EADDRNOTAVAIL; /* If local port or any inet address */ if ( (inp->inp_lport != 0) || (inp->inp_laddr.s_addr != INADDR_ANY) ) return EINVAL; if ( ((so->so_options & (SO_REUSEADDR | SO_REUSEPORT)) == 0) && ( ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) || ((so->so_options & SO_ACCEPTCONN) == 0) ) ) wild = INPLOOKUP_WILDCARD; /* If name non-null */ if (name != NULL) { /* Get name */ sin = mtod(name, struct sockaddr_in *); if ( name->m_len != sizeof(struct sockaddr_in) ) return EINVAL; #ifdef NET_DIAGNOSE /* Check family */ if (sin->sin_family != AF_INET) return EAFNOSUPPORT; #endif /* NET_DIAGNOSE */ /* Get port */ lport = sin->sin_port; /* If multicast address */ if ( IN_MULTICAST( ntohl(sin->sin_addr.s_addr) ) ) { if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR | SO_REUSEPORT; } /* End if multicast address */ /* Else if not any address */ else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; if (ifa_ifwithaddr((struct sockaddr *) sin) == NULL) return EADDRNOTAVAIL; } /* End else if not any address */ /* If local port */ if (lport) { /* Lookup address */ t = in_pcblookup(inp->inp_pcbinfo, zeroin_addr, 0, sin->sin_addr, lport, wild); if ( (t != NULL) && ((reuseport & t->inp_socket->so_options) == 0) ) return EADDRINUSE; } /* End if local port */ /* Store local address */ inp->inp_laddr = sin->sin_addr; } /* End if name non-null */
/* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ void tcp_input(usn_mbuf_t *m, int iphlen) { struct tcpiphdr *ti; struct inpcb *inp; u_char *optp = NULL; int optlen; int len, tlen, off; struct tcpcb *tp = 0; int tiflags; struct usn_socket *so = 0; int todrop, acked, ourfinisacked; int needoutput = 0; short ostate; struct usn_in_addr laddr; int dropsocket = 0; int iss = 0; u_long tiwin, ts_val, ts_ecr; int ts_present = 0; (void)needoutput; g_tcpstat.tcps_rcvtotal++; // Get IP and TCP header together in first mbuf. // Note: IP leaves IP header in first mbuf. ti = mtod(m, struct tcpiphdr *); if (iphlen > sizeof (usn_ip_t)) ip_stripoptions(m, (usn_mbuf_t *)0); if (m->mlen < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { g_tcpstat.tcps_rcvshort++; return; } ti = mtod(m, struct tcpiphdr *); } #ifdef DUMP_PAYLOAD dump_chain(m,"tcp"); #endif /* * Checksum extended TCP header and data. */ tlen = ntohs(((usn_ip_t *)ti)->ip_len); len = sizeof (usn_ip_t) + tlen; ti->ti_next = ti->ti_prev = 0; ti->ti_x1 = 0; ti->ti_len = (u_short)tlen; HTONS(ti->ti_len); ti->ti_sum = in_cksum(m, len); if (ti->ti_sum) { g_tcpstat.tcps_rcvbadsum++; goto drop; } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = ti->ti_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { g_tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; ti->ti_len = tlen; if (off > sizeof (struct tcphdr)) { if (m->mlen < sizeof(usn_ip_t) + off) { if ((m = m_pullup(m, sizeof (usn_ip_t) + off)) == 0) { g_tcpstat.tcps_rcvshort++; return; } ti = mtod(m, struct tcpiphdr *); } optlen = off - sizeof (struct tcphdr); optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); // Do quick retrieval of timestamp options ("options // prediction?"). 
If timestamp is the only option and it's // formatted as recommended in RFC 1323 appendix A, we // quickly get the values now and not bother calling // tcp_dooptions(), etc. if ((optlen == TCPOLEN_TSTAMP_APPA || (optlen > TCPOLEN_TSTAMP_APPA && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && *(u_int *)optp == htonl(TCPOPT_TSTAMP_HDR) && (ti->ti_flags & TH_SYN) == 0) { ts_present = 1; ts_val = ntohl(*(u_long *)(optp + 4)); ts_ecr = ntohl(*(u_long *)(optp + 8)); optp = NULL; // we've parsed the options } } tiflags = ti->ti_flags; // Convert TCP protocol specific fields to host format. NTOHL(ti->ti_seq); NTOHL(ti->ti_ack); NTOHS(ti->ti_win); NTOHS(ti->ti_urp); // Locate pcb for segment. findpcb: inp = g_tcp_last_inpcb; if (inp->inp_lport != ti->ti_dport || inp->inp_fport != ti->ti_sport || inp->inp_faddr.s_addr != ti->ti_src.s_addr || inp->inp_laddr.s_addr != ti->ti_dst.s_addr) { inp = in_pcblookup(&g_tcb, ti->ti_src, ti->ti_sport, ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD); if (inp) g_tcp_last_inpcb = inp; ++g_tcpstat.tcps_pcbcachemiss; } // If the state is CLOSED (i.e., TCB does not exist) then // all data in the incoming segment is discarded. // If the TCB exists but is in CLOSED state, it is embryonic, // but should either do a listen or a connect soon. if (inp == 0) goto dropwithreset; tp = intotcpcb(inp); DEBUG("found inp cb, laddr=%x, lport=%d, faddr=%x," " fport=%d, tp_state=%d, tp_flags=%d", inp->inp_laddr.s_addr, inp->inp_lport, inp->inp_faddr.s_addr, inp->inp_fport, tp->t_state, tp->t_flags); if (tp == 0) goto dropwithreset; if (tp->t_state == TCPS_CLOSED) goto drop; // Unscale the window into a 32-bit value. 
if ((tiflags & TH_SYN) == 0) tiwin = ti->ti_win << tp->snd_scale; else tiwin = ti->ti_win; so = inp->inp_socket; DEBUG("socket info, options=%x", so->so_options); if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { if (so->so_options & SO_DEBUG) { ostate = tp->t_state; g_tcp_saveti = *ti; } if (so->so_options & SO_ACCEPTCONN) { if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { // Note: dropwithreset makes sure we don't // send a reset in response to a RST. if (tiflags & TH_ACK) { g_tcpstat.tcps_badsyn++; goto dropwithreset; } DEBUG("SYN is expected, tiflags=%d", tiflags); goto drop; } so = sonewconn(so, 0); if (so == 0) { DEBUG("failed to create new connection, tiflags=%d", tiflags); goto drop; } // Mark socket as temporary until we're // committed to keeping it. The code at // ``drop'' and ``dropwithreset'' check the // flag dropsocket to see if the temporary // socket created here should be discarded. // We mark the socket as discardable until // we're committed to it below in TCPS_LISTEN. dropsocket++; inp = (struct inpcb *)so->so_pcb; inp->inp_laddr = ti->ti_dst; inp->inp_lport = ti->ti_dport; // BSD >= 4.3 inp->inp_options = ip_srcroute(); tp = intotcpcb(inp); tp->t_state = TCPS_LISTEN; // Compute proper scaling value from buffer space while (tp->request_r_scale < TCP_MAX_WINSHIFT && TCP_MAXWIN << tp->request_r_scale < so->so_rcv->sb_hiwat) tp->request_r_scale++; } } // Segment received on connection. // Reset idle time and keep-alive timer. tp->t_idle = 0; tp->t_timer[TCPT_KEEP] = g_tcp_keepidle; // Process options if not in LISTEN state, // else do it below (after getting remote address). if (optp && tp->t_state != TCPS_LISTEN) tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); // Header prediction: check for the two common cases // of a uni-directional data xfer. If the packet has // no control flags, is in-sequence, the window didn't // change and we're not retransmitting, it's a // candidate. 
If the length is zero and the ack moved // forward, we're the sender side of the xfer. Just // free the data acked & wake any higher level process // that was blocked waiting for space. If the length // is non-zero and the ack didn't move, we're the // receiver side. If we're getting packets in-order // (the reassembly queue is empty), add the data to // the socket buffer and note that we need a delayed ack. if (tp->t_state == TCPS_ESTABLISHED && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && ti->ti_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { // If last ACK falls within this segment's sequence numbers, // record the timestamp. if ( ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ){ tp->ts_recent_age = g_tcp_now; tp->ts_recent = ts_val; } if (ti->ti_len == 0) { if (SEQ_GT(ti->ti_ack, tp->snd_una) && SEQ_LEQ(ti->ti_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd) { // this is a pure ack for outstanding data. ++g_tcpstat.tcps_predack; if (ts_present) tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1); else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp, tp->t_rtt); acked = ti->ti_ack - tp->snd_una; g_tcpstat.tcps_rcvackpack++; g_tcpstat.tcps_rcvackbyte += acked; TRACE("drop so_snd buffer, drop_bytes=%d, len=%d", acked, so->so_snd.sb_cc); sbdrop(so->so_snd, acked); tp->snd_una = ti->ti_ack; usn_free_cmbuf(m); // If all outstanding data are acked, stop // retransmit timer, otherwise restart timer // using current (possibly backed-off) value. // If process is waiting for space, // wakeup/selwakeup/signal. If data // are ready to send, let tcp_output // decide between more output or persist. 
if (tp->snd_una == tp->snd_max) tp->t_timer[TCPT_REXMT] = 0; else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); //if (so->so_snd->sb_flags & SB_NOTIFY) { // usnet_tcpin_wwakeup(so, USN_TCP_IN, usn_tcpev_sbnotify, 0); // sowwakeup(so); //} // send buffer is available for app thread. usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0); if (so->so_snd->sb_cc) tcp_output(tp); return; } } else if (ti->ti_ack == tp->snd_una && tp->seg_next == (struct tcpiphdr *)tp && ti->ti_len <= sbspace(so->so_rcv)) { // this is a pure, in-sequence data packet // with nothing on the reassembly queue and // we have enough buffer space to take it. ++g_tcpstat.tcps_preddat; tp->rcv_nxt += ti->ti_len; g_tcpstat.tcps_rcvpack++; g_tcpstat.tcps_rcvbyte += ti->ti_len; // Drop TCP, IP headers and TCP options then add data // to socket buffer. m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); TRACE("add data to rcv buf"); sbappend(so->so_rcv, m); sorwakeup(so); // new data is available for app threads. usnet_tcpin_rwakeup(so, USN_TCP_IN, USN_TCPEV_READ, m); if (so->so_options & SO_DEBUG) { TRACE("tcp trace, so_options=%d", so->so_options); tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); } tp->t_flags |= TF_DELACK; return; } } // Drop TCP, IP headers and TCP options. m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); // Calculate amount of space in receive window, // and then do TCP input processing. // Receive window is amount of space in rcv queue, // but not less than advertised window. { int win; win = sbspace(so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { // If the state is LISTEN then ignore segment if it contains an RST. 
// If the segment contains an ACK then it is bad and send a RST. // If it does not contain a SYN then it is not interesting; drop it. // Don't bother responding if the destination was a broadcast. // Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial // tp->iss, and send a segment: // <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> // Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. // Fill in remote peer address fields if not previously specified. // Enter SYN_RECEIVED state, and process any other fields of this // segment in this state. case TCPS_LISTEN: { usn_mbuf_t *am; struct usn_sockaddr_in *sin; if (tiflags & TH_RST) goto drop; if (tiflags & TH_ACK) goto dropwithreset; if ((tiflags & TH_SYN) == 0) goto drop; // RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN // in_broadcast() should never return true on a received // packet with M_BCAST not set. //if (m->m_flags & (M_BCAST|M_MCAST) || // IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) // goto drop; am = usn_get_mbuf(0, BUF_MSIZE, 0); // XXX: the size! 
if (am == NULL) goto drop; am->mlen = sizeof (struct usn_sockaddr_in); sin = mtod(am, struct usn_sockaddr_in *); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ti->ti_src; sin->sin_port = ti->ti_sport; bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == USN_INADDR_ANY) inp->inp_laddr = ti->ti_dst; if (in_pcbconnect(inp, am)) { inp->inp_laddr = laddr; usn_free_mbuf(am); goto drop; } usn_free_mbuf(am); tp->t_template = tcp_template(tp); if (tp->t_template == 0) { tp = tcp_drop(tp, ENOBUFS); dropsocket = 0; // socket is already gone goto drop; } if (optp) tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); if (iss) tp->iss = iss; else tp->iss = g_tcp_iss; g_tcp_iss += TCP_ISSINCR/4; tp->irs = ti->ti_seq; tcp_sendseqinit(tp); tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_SYN_RECEIVED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0); tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; dropsocket = 0; // committed to socket g_tcpstat.tcps_accepts++; goto trimthenstep6; } // If the state is SYN_SENT: // if seg contains an ACK, but not for our SYN, drop the input. // if seg contains a RST, then drop the connection. // if seg does not contain SYN, then drop it. 
// Otherwise this is an acceptable SYN segment // initialize tp->rcv_nxt and tp->irs // if seg contains ack then advance tp->snd_una // if SYN has been acked change to ESTABLISHED else SYN_RCVD state // arrange for segment to be acked (eventually) // continue processing rest of data/controls, beginning with URG case TCPS_SYN_SENT: if ((tiflags & TH_ACK) && (SEQ_LEQ(ti->ti_ack, tp->iss) || SEQ_GT(ti->ti_ack, tp->snd_max))) goto dropwithreset; if (tiflags & TH_RST) { if (tiflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((tiflags & TH_SYN) == 0) goto drop; if (tiflags & TH_ACK) { tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; tp->t_timer[TCPT_REXMT] = 0; } tp->irs = ti->ti_seq; tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; TRACE("ack now, tp flags=%d", tp->t_flags); // XXX: remove second test. if (tiflags & TH_ACK /*&& SEQ_GT(tp->snd_una, tp->iss)*/) { g_tcpstat.tcps_connects++; soisconnected(so); TRACE("change tcp state to TCPS_ESTABLISHED," " state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_ESTABLISHED; // Do window scaling on this connection? if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0); // if we didn't have to retransmit the SYN, // use its rtt as our initial srtt & rtt var. if (tp->t_rtt) tcp_xmit_timer(tp, tp->t_rtt); } else { TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_SYN_RECEIVED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0); } trimthenstep6: // Advance ti->ti_seq to correspond to first data byte. // If data, trim to stay within window, // dropping FIN if necessary. 
ti->ti_seq++; if (ti->ti_len > tp->rcv_wnd) { todrop = ti->ti_len - tp->rcv_wnd; m_adj(m, -todrop); ti->ti_len = tp->rcv_wnd; tiflags &= ~TH_FIN; g_tcpstat.tcps_rcvpackafterwin++; g_tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = ti->ti_seq - 1; tp->rcv_up = ti->ti_seq; goto step6; } // States other than LISTEN or SYN_SENT. // First check timestamp, if present. // Then check that at least some bytes of segment are within // receive window. If segment begins before rcv_nxt, // drop leading data (and SYN); if nothing left, just ack. // // RFC 1323 PAWS: If we have a timestamp reply on this segment // and it's less than ts_recent, drop it. if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && TSTMP_LT(ts_val, tp->ts_recent)) { // Check to see if ts_recent is over 24 days old. if ((int)(g_tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { // Invalidate ts_recent. If this segment updates // ts_recent, the age will be reset later and ts_recent // will get a valid value. If it does not, setting // ts_recent to zero will at least satisfy the // requirement that zero be placed in the timestamp // echo reply when ts_recent isn't valid. The // age isn't reset until we get a valid ts_recent // because we don't want out-of-order segments to be // dropped when ts_recent is old. tp->ts_recent = 0; } else { g_tcpstat.tcps_rcvduppack++; g_tcpstat.tcps_rcvdupbyte += ti->ti_len; g_tcpstat.tcps_pawsdrop++; goto dropafterack; } } todrop = tp->rcv_nxt - ti->ti_seq; if (todrop > 0) { if (tiflags & TH_SYN) { tiflags &= ~TH_SYN; ti->ti_seq++; if (ti->ti_urp > 1) ti->ti_urp--; else tiflags &= ~TH_URG; todrop--; } if ( todrop >= ti->ti_len || ( todrop == ti->ti_len && (tiflags & TH_FIN ) == 0 ) ) { // Any valid FIN must be to the left of the window. // At this point the FIN must be a duplicate or // out of sequence; drop it. tiflags &= ~TH_FIN; // Send an ACK to resynchronize and drop any data // But keep on processing for RST or ACK. 
tp->t_flags |= TF_ACKNOW; TRACE("send ack now to resync, tp_flags=%d", tp->t_flags); todrop = ti->ti_len; g_tcpstat.tcps_rcvdupbyte += ti->ti_len; g_tcpstat.tcps_rcvduppack++; } else { g_tcpstat.tcps_rcvpartduppack++; g_tcpstat.tcps_rcvpartdupbyte += ti->ti_len; } m_adj(m, todrop); ti->ti_seq += todrop; ti->ti_len -= todrop; if (ti->ti_urp > todrop) ti->ti_urp -= todrop; else { tiflags &= ~TH_URG; ti->ti_urp = 0; } } // If new data are received on a connection after the // user processes are gone, then RST the other end. if ((so->so_state & USN_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { tp = tcp_close(tp); g_tcpstat.tcps_rcvafterclose++; goto dropwithreset; } // If segment ends after window, drop trailing data // (and PUSH and FIN); if nothing left, just ACK. todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { g_tcpstat.tcps_rcvpackafterwin++; if (todrop >= ti->ti_len) { g_tcpstat.tcps_rcvbyteafterwin += ti->ti_len; // If a new connection request is received // while in TIME_WAIT, drop the old connection // and start over if the sequence numbers // are above the previous ones. if (tiflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { iss = tp->snd_nxt + TCP_ISSINCR; tp = tcp_close(tp); goto findpcb; } // If window is closed can only take segments at // window edge, and have to drop data and PUSH from // incoming segments. Continue processing, but // remember to ack. Otherwise, drop segment // and ack. if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; g_tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else g_tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); ti->ti_len -= todrop; tiflags &= ~(TH_PUSH|TH_FIN); } // check valid timestamp. Replace code above. 
if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ) { tp->ts_recent_age = g_tcp_now; tp->ts_recent = ts_val; } // If the RST bit is set examine the state: // SYN_RECEIVED STATE: // If passive open, return to LISTEN state. // If active open, inform user that connection was refused. // ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: // Inform user that connection was reset, and close tcb. // CLOSING, LAST_ACK, TIME_WAIT STATES // Close the tcb. if (tiflags&TH_RST) switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: DEBUG("change tcp state to TCPS_CLOSED, state=%d", tp->t_state); tp->t_state = TCPS_CLOSED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSED, 0); g_tcpstat.tcps_drops++; tp = tcp_close(tp); goto drop; case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: tp = tcp_close(tp); goto drop; } // If a SYN is in the window, then this is an // error and we send an RST and drop the connection. if (tiflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); goto dropwithreset; } // If the ACK bit is off we drop the segment and return. if ((tiflags & TH_ACK) == 0) goto drop; // Ack processing. switch (tp->t_state) { // In SYN_RECEIVED state if the ack ACKs our SYN then enter // ESTABLISHED state and continue processing, otherwise // send an RST. case TCPS_SYN_RECEIVED: if (SEQ_GT(tp->snd_una, ti->ti_ack) || SEQ_GT(ti->ti_ack, tp->snd_max)) goto dropwithreset; g_tcpstat.tcps_connects++; DEBUG("change tcp state to TCPS_ESTABLISHED, state=%d", tp->t_state); tp->t_state = TCPS_ESTABLISHED; soisconnected(so); // Do window scaling? 
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0); tp->snd_wl1 = ti->ti_seq - 1; // fall into ... // In ESTABLISHED state: drop duplicate ACKs; ACK out of range // ACKs. If the ack is in the range // tp->snd_una < ti->ti_ack <= tp->snd_max // then advance tp->snd_una to ti->ti_ack and drop // data from the retransmission queue. If this ACK reflects // more up to date window information we update our window information. case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { g_tcpstat.tcps_rcvdupack++; // If we have outstanding data (other than // a window probe), this is a completely // duplicate ack (ie, window info didn't // change), the ack is the biggest we've // seen and we've seen exactly our rexmt // threshhold of them, assume a packet // has been dropped and retransmit it. // Kludge snd_nxt & the congestion // window so we send only this one // packet. // // We know we're losing at the current // window size so do congestion avoidance // (set ssthresh to half the current window // and pull our congestion window back to // the new ssthresh). // // Dup acks mean that packets have left the // network (they're now cached at the receiver) // so bump cwnd by the amount in the receiver // to keep a constant cwnd packets in the // network. 
if (tp->t_timer[TCPT_REXMT] == 0 || ti->ti_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == g_tcprexmtthresh) { // congestion avoidance tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_timer[TCPT_REXMT] = 0; tp->t_rtt = 0; tp->snd_nxt = ti->ti_ack; tp->snd_cwnd = tp->t_maxseg; tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > g_tcprexmtthresh) { tp->snd_cwnd += tp->t_maxseg; tcp_output(tp); goto drop; } } else tp->t_dupacks = 0; break; } // If the congestion window was inflated to account // for the other side's cached packets, retract it. if (tp->t_dupacks > g_tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; if (SEQ_GT(ti->ti_ack, tp->snd_max)) { g_tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } acked = ti->ti_ack - tp->snd_una; g_tcpstat.tcps_rcvackpack++; g_tcpstat.tcps_rcvackbyte += acked; // If we have a timestamp reply, update smoothed // round trip time. If no timestamp is present but // transmit timer is running and timed sequence // number was acked, update smoothed round trip time. // Since we now have an rtt measurement, cancel the // timer backoff (cf., Phil Karn's retransmit alg.). // Recompute the initial retransmit timer. if (ts_present) tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1); else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp,tp->t_rtt); // If all outstanding data is acked, stop retransmit // timer and remember to restart (more output or persist). // If there is more data to be acked, restart retransmit // timer, using current (possibly backed-off) value. 
if (ti->ti_ack == tp->snd_max) { tp->t_timer[TCPT_REXMT] = 0; DEBUG("change needoutput to 1"); needoutput = 1; tp->t_flags |= TF_NEEDOUTPUT; } else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; // When new data is acked, open the congestion window. // If the window gives us less than ssthresh packets // in flight, open exponentially (maxseg per packet). // Otherwise open linearly: maxseg per window // (maxseg * (maxseg / cwnd) per packet). { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); } if (acked > so->so_snd->sb_cc) { tp->snd_wnd -= so->so_snd->sb_cc; DEBUG("drop all so_snd buffer, drop_bytes=%d, acked=%d", so->so_snd->sb_cc, acked); sbdrop(so->so_snd, (int)so->so_snd->sb_cc); ourfinisacked = 1; } else { DEBUG("drop so_snd buffer, drop_bytes=%d, len=%d", acked, so->so_snd->sb_cc); sbdrop(so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } //if (so->so_snd->sb_flags & SB_NOTIFY) { sowwakeup(so); usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0); //} tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { // In FIN_WAIT_1 STATE in addition to the processing // for the ESTABLISHED state if our FIN is now acknowledged // then enter FIN_WAIT_2. case TCPS_FIN_WAIT_1: if (ourfinisacked) { // If we can't receive any more // data, then closing user can proceed. // Starting the timer is contrary to the // specification, but if we don't get a FIN // we'll hang forever. 
if (so->so_state & USN_CANTRCVMORE) { soisdisconnected(so); tp->t_timer[TCPT_2MSL] = g_tcp_maxidle; } DEBUG("change tcp state to TCPS_FIN_WAIT_2, state=%d", tp->t_state); tp->t_state = TCPS_FIN_WAIT_2; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_FIN_WAIT2, 0); } break; // In CLOSING STATE in addition to the processing for // the ESTABLISHED state if the ACK acknowledges our FIN // then enter the TIME-WAIT state, otherwise ignore // the segment. case TCPS_CLOSING: if (ourfinisacked) { DEBUG("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_TIME_WAIT; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0); tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; soisdisconnected(so); } break; // In LAST_ACK, we may still be waiting for data to drain // and/or to be acked, as well as for the ack of our FIN. // If our FIN is now acknowledged, delete the TCB, // enter the closed state and return. case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; // In TIME_WAIT state the only thing that should arrive // is a retransmission of the remote FIN. Acknowledge // it and restart the finack timer. case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; goto dropafterack; } } step6: // Update window information. // Don't look at window if no ACK: TAC's send garbage on first SYN. if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, ti->ti_seq) || (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) )) )) { // keep track of pure window updates if (ti->ti_len == 0 && tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) g_tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = ti->ti_seq; tp->snd_wl2 = ti->ti_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; DEBUG("change needoutput to 1"); tp->t_flags |= TF_NEEDOUTPUT; needoutput = 1; } // Process segments with URG. 
if ((tiflags & TH_URG) && ti->ti_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { // This is a kludge, but if we receive and accept // random urgent pointers, we'll crash in // soreceive. It's hard to imagine someone // actually wanting to send this much urgent data. if (ti->ti_urp + so->so_rcv->sb_cc > g_sb_max) { ti->ti_urp = 0; // XXX tiflags &= ~TH_URG; // XXX goto dodata; // XXX } // If this segment advances the known urgent pointer, // then mark the data stream. This should not happen // in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since // a FIN has been received from the remote side. // In these states we ignore the URG. // // According to RFC961 (Assigned Protocols), // the urgent pointer points to the last octet // of urgent data. We continue, however, // to consider it to indicate the first octet // of data past the urgent section as the original // spec states (in one of two places). if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { tp->rcv_up = ti->ti_seq + ti->ti_urp; so->so_oobmark = so->so_rcv->sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_state |= USN_RCVATMARK; sohasoutofband(so); // send async event to app threads. usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPEV_OUTOFBOUND, 0); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } // Remove out of band data so doesn't get presented to user. // This can happen independent of advancing the URG pointer, // but if two URG's are pending at once, some out-of-band // data may creep in... ick. if (ti->ti_urp <= ti->ti_len #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) tcp_pulloutofband(so, ti, m); } else // If no out of band data is expected, // pull receive urgent pointer along // with the receive window. 
if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; dodata: // XXX #ifdef DUMP_PAYLOAD DEBUG("Handle data"); dump_chain(m,"tcp"); #endif // Process the segment text, merging it into the TCP sequencing queue, // and arranging for acknowledgment of receipt if necessary. // This process logically involves adjusting tp->rcv_wnd as data // is presented to the user (this happens in tcp_usrreq.c, // case PRU_RCVD). If a FIN has already been received on this // connection then we just ignore the text. if ((ti->ti_len || (tiflags&TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { TCP_REASS(tp, ti, m, so, tiflags); // Note the amount of data that peer has sent into // our window, in order to estimate the sender's // buffer size. len = so->so_rcv->sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { usn_free_cmbuf(m); tiflags &= ~TH_FIN; } // If FIN is received ACK the FIN and let the user know // that the connection is closing. if (tiflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); tp->t_flags |= TF_ACKNOW; TRACE("ack FIN now, tp flags=%d", tp->t_flags); tp->rcv_nxt++; } switch (tp->t_state) { // In SYN_RECEIVED and ESTABLISHED STATES // enter the CLOSE_WAIT state. case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: TRACE("change tcp state to TCPS_CLOSE_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_CLOSE_WAIT; soewakeup(so, 0); usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSE_WAIT, 0); break; // If still in FIN_WAIT_1 STATE FIN has not been acked so // enter the CLOSING state. case TCPS_FIN_WAIT_1: TRACE("change tcp state to TCPS_CLOSING, state=%d", tp->t_state); tp->t_state = TCPS_CLOSING; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSING, 0); break; // In FIN_WAIT_2 state enter the TIME_WAIT state, // starting the time-wait timer, turning off the other // standard timers. 
case TCPS_FIN_WAIT_2: TRACE("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; soisdisconnected(so); usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0); break; // In TIME_WAIT state restart the 2 MSL time_wait timer. case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; break; } } if (so->so_options & SO_DEBUG) { TRACE("tcp trace, so_options=%d", so->so_options); tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); } // Return any desired output. //if (needoutput || (tp->t_flags & TF_ACKNOW)){ if (tp->t_flags & TF_NEEDOUTPUT || (tp->t_flags & TF_ACKNOW)){ TRACE("ack now or need to ouput, tp->t_flags=%d", tp->t_flags); tcp_output(tp); } return; dropafterack: TRACE("dropafterack"); // Generate an ACK dropping incoming segment if it occupies // sequence space, where the ACK reflects our state. if (tiflags & TH_RST) goto drop; usn_free_cmbuf(m); tp->t_flags |= TF_ACKNOW; TRACE("ack now, tp flags=%d", tp->t_flags); tcp_output(tp); return; dropwithreset: TRACE("dropwithreset"); // Generate a RST, dropping incoming segment. // Make ACK acceptable to originator of segment. // Don't bother to respond if destination was broadcast/multicast. #define USN_MULTICAST(i) (((u_int)(i) & 0xf0000000) == 0xe0000000) if ((tiflags & TH_RST) || m->flags & (BUF_BCAST|BUF_MCAST) || USN_MULTICAST(ntohl(ti->ti_dst.s_addr))) goto drop; if (tiflags & TH_ACK) tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); else { if (tiflags & TH_SYN) ti->ti_len++; tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, TH_RST|TH_ACK); } // destroy temporarily created socket if (dropsocket) soabort(so); return; drop: TRACE("drop"); // Drop space held by incoming segment and return. 
if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { TRACE("tcp trace: drop a socket"); tcp_trace(TA_DROP, ostate, tp, &g_tcp_saveti, 0); } usn_free_cmbuf(m); // destroy temporarily created socket if (dropsocket) soabort(so); return; }
/* so_icmpdu() - sockets-layer handler for an ICMP Destination Unreachable.
 *
 * Pulls the addresses and ports of the offending datagram out of the
 * ICMP message, locates the TCP connection or UDP socket that sent it
 * and flags that socket with ECONNREFUSED. A matching TCP connection
 * that is past the LISTEN state is additionally closed with tcp_close().
 *
 * PARAM1: PACKET p - the received packet; it is released with pk_free()
 *                    on every path before this routine returns.
 * PARAM2: struct destun * pdp - the destination unreachable message,
 *                    including the header (dip) and leading data bytes
 *                    (ddata) of the datagram that triggered it.
 *
 * RETURNS: nothing.
 */
void
so_icmpdu(PACKET p, struct destun * pdp)
{
   ip_addr  lhost;    /* IP address of originator (our iface) */
   ip_addr  fhost;    /* IP address we sent to */
   unshort  fport;    /* TCP/UDP port we sent to */
   unshort  lport;    /* TCP/UDP port we sent from */
   struct inpcb * inp;
   struct socket * so;
   struct tcpcb * tp;

   /* extract information about packet which generated DU */
   fhost = htonl(pdp->dip.ip_dest);
   lhost = htonl(pdp->dip.ip_src);
   lport = htons(*(unshort*)(&pdp->ddata[0]));
   fport = htons(*(unshort*)(&pdp->ddata[2]));

#ifndef IP_PMTU
   /* if it's a datagram-too-big message, ignore it -- As the
    * build isn't using PMTU Discovery this packet is most
    * probably a Denial of Service Attack.
    */
   if (pdp->dcode == DSTFRAG)
   {
      goto done;
   }
#endif   /* IP_PMTU */

   /* if it's a TCP connection, clean it up */
   if (pdp->dip.ip_prot == TCPTP)
   {
      /* find associated data structs and socket */
      inp = in_pcblookup(&tcb, fhost, fport, lhost, lport, INPLOOKUP_WILDCARD);
      if (inp == 0)
         goto done;
      so = inp->inp_socket;
      if (so == 0)
         goto done;
      tp = intotcpcb(inp);
      if (tp)
      {
         /* closed and listening endpoints are left untouched */
         if (tp->t_state <= TCPS_LISTEN)
         {
            goto done;
         }
#ifdef ICMP_TCP_DOS
         {
            struct ip * pip;
            struct tcpiphdr * ti;

            pip = ip_head(p);  /* find IP header */
            ti = (struct tcpiphdr *)p->nb_prot;

            /* the sequence number must lie within the data we have
             * sent but not yet had acknowledged
             */
            if (!((tp->snd_una <= ti->ti_seq) && (ti->ti_seq <= tp->snd_nxt)))
               goto done;

            /* If we get an ICMP Type 3 (Destination Unreachable) - Code 2
             * (Protocol Unreachable) message during the life of a TCP
             * connection, then it's most probably a Denial of Service
             * Attack. The only other interpretation would be that the
             * support for the transport protocol has been removed from
             * the host sending the error message during the life of the
             * corresponding connection. As in common practice this is
             * highly unlikely in most cases, we will treat this message
             * as a DOS attack.
             */
            if (pdp->dcode == DSTPROT)
            {
               if ((tp->t_state >= TCPS_ESTABLISHED) &&
                   (tp->t_state <= TCPS_TIME_WAIT))
                  goto done;
            }

            /* Note some ICMP error messages generated by intermediate
             * routers include more than the recommended 64 bits of the
             * IP Data. If the TCP ACK number happens to be present then
             * use it in detecting a Denial of Service attack.
             *
             * This way we can ensure that the TCP Acknowledgement number
             * should correspond to data that have already been
             * acknowledged. This way we can further reduce the
             * possibility of considering a spoofed ICMP packet by a
             * factor of 2.
             */
            if (pip->ip_len >= 32)
            {
               if (!(ti->ti_seq <= tp->rcv_nxt))
                  goto done;
            }
         }
#endif
      }

      /* Set the error for the socket owner BEFORE closing the connection.
       * NOTE(review): tcp_close() is not visible here; in BSD-derived
       * stacks it detaches the PCB and may free the socket once no
       * descriptor references it, so "so" must not be touched after the
       * call. Setting the error first also lets anyone woken by the
       * close see why the connection went away.
       */
      so->so_error = ECONNREFUSED;
      if (tp)
         tcp_close(tp);
   }
#ifdef UDP_SOCKETS   /* this sockets layer supports UDP too */
   else if (pdp->dip.ip_prot == UDP_PROT)
   {
      UDPCONN tmp;

      /* search udp table (which keeps hosts in net endian) */
      for (tmp = firstudp; tmp; tmp = tmp->u_next)
      {
         if ((tmp->u_fport == fport || tmp->u_fport == 0) &&
             (tmp->u_fhost == htonl(fhost)) &&
             (tmp->u_lport == lport))
         {
            break;   /* found our UDP table entry */
         }
      }
      if (!tmp)
         goto done;
      so = (struct socket *)tmp->u_data;
      /* May be non-socket (lightweight) UDP connection. */
      if (so->so_type != SOCK_DGRAM)
         goto done;
      so->so_error = ECONNREFUSED;  /* set error for socket owner */
      /* do a select() notify on socket here */
      sorwakeup(so);
      sowwakeup(so);
   }
#endif   /* UDP_SOCKETS */
   else
      goto done;

#ifdef IP_PMTU
   /* if this is a datagram-too-big message, update the Path MTU cache */
   if (pdp->dcode == DSTFRAG)
      pmtucache_set(pdp->dip.ip_dest, htons(pdp->dno2));
#endif   /* IP_PMTU */

done:
   LOCK_NET_RESOURCE(FREEQ_RESID);
   pk_free(p); /* done with original packet */
   UNLOCK_NET_RESOURCE(FREEQ_RESID);
   return;
}