Example 1
/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, unsigned int opt)
{
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(tp->t_inpcb);

	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - 40;

	if (G_TCPOPT_TSTAMP(opt)) {
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
	}

	if (G_TCPOPT_SACK(opt))
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	else
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	}
}
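/*
 * A sketch of how accessor macros like G_TCPOPT_MSS() above are usually
 * built: each one shifts and masks a bit-field out of the 32-bit options
 * word reported by the hardware.  The field positions and widths below
 * are illustrative assumptions, not the actual cxgbe layout.
 */
#define S_EXAMPLE_MSS		8
#define M_EXAMPLE_MSS		0xfU
#define G_EXAMPLE_MSS(x)	(((x) >> S_EXAMPLE_MSS) & M_EXAMPLE_MSS)

#define S_EXAMPLE_SND_WSCALE	4
#define M_EXAMPLE_SND_WSCALE	0xfU
#define G_EXAMPLE_SND_WSCALE(x) \
	(((x) >> S_EXAMPLE_SND_WSCALE) & M_EXAMPLE_SND_WSCALE)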
/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
tcp_disconnect(struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;

	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	INP_LOCK_ASSERT(inp);

	/*
	 * Neither tcp_close() nor tcp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (tp->t_state < TCPS_ESTABLISHED) {
		tp = tcp_close(tp);
		KASSERT(tp != NULL,
		    ("tcp_disconnect: tcp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		tp = tcp_drop(tp, 0);
		KASSERT(tp != NULL,
		    ("tcp_disconnect: tcp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		sbflush(&so->so_rcv);
		tcp_usrclosed(tp);
		if (!(inp->inp_vflag & INP_DROPPED))
			tcp_output(tp);
	}
}
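/*
 * A minimal userspace sketch of the SO_LINGER branch above: with l_onoff
 * set and l_linger == 0, close() takes the tcp_drop() path, so the
 * connection is aborted with a RST instead of the orderly FIN handshake
 * and no TIME_WAIT state is left behind.
 */
#include <sys/socket.h>
#include <unistd.h>

static void
abortive_close(int fd)
{
	struct linger l = { .l_onoff = 1, .l_linger = 0 };

	(void)setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
	(void)close(fd);	/* peer sees a RST */
}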
Example 3
/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 * If route ro is present and has ro_rt initialized, route lookup would be
 * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
 * then result of route lookup is stored in ro->ro_rt.
 *
 * In the IP forwarding case, the packet will arrive with options already
 * inserted, so must have a NULL opt pointer.
 */
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct inpcb *inp)
{
	struct rm_priotracker in_ifa_tracker;
	struct ip *ip;
	struct ifnet *ifp = NULL;	/* keep compiler happy */
	struct mbuf *m0;
	int hlen = sizeof (struct ip);
	int mtu;
	int error = 0;
	struct sockaddr_in *dst;
	const struct sockaddr_in *gw;
	struct in_ifaddr *ia;
	int isbroadcast;
	uint16_t ip_len, ip_off;
	struct route iproute;
	struct rtentry *rte;	/* cache for ro->ro_rt */
	uint32_t fibnum;
	int have_ia_ref;
#ifdef IPSEC
	int no_route_but_check_spd = 0;
#endif
	M_ASSERTPKTHDR(m);

	if (inp != NULL) {
		INP_LOCK_ASSERT(inp);
		M_SETFIB(m, inp->inp_inc.inc_fibnum);
		if ((flags & IP_NODEFAULTFLOWID) == 0) {
			m->m_pkthdr.flowid = inp->inp_flowid;
			M_HASHTYPE_SET(m, inp->inp_flowtype);
		}
	}

	if (ro == NULL) {
		ro = &iproute;
		bzero(ro, sizeof (*ro));
	}

#ifdef FLOWTABLE
	if (ro->ro_rt == NULL)
		(void)flowtable_lookup(AF_INET, m, ro);
#endif

	if (opt) {
		int len = 0;
		m = ip_insertoptions(m, opt, &len);
		if (len != 0)
			hlen = len; /* ip->ip_hl is updated above */
	}
	ip = mtod(m, struct ip *);
	ip_len = ntohs(ip->ip_len);
	ip_off = ntohs(ip->ip_off);

	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
		ip->ip_v = IPVERSION;
		ip->ip_hl = hlen >> 2;
		ip_fillid(ip);
		IPSTAT_INC(ips_localout);
	} else {
/*
 * Export TCP internal state information via a struct tcp_info, based on the
 * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
 * (TCP state machine, etc).  We export all information using FreeBSD-native
 * constants -- for example, the numeric values for tcpi_state will differ
 * from Linux.
 */
static void
tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
{

	INP_LOCK_ASSERT(tp->t_inpcb);
	bzero(ti, sizeof(*ti));

	ti->tcpi_state = tp->t_state;
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}

	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;

	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;

	/*
	 * FreeBSD-specific extension fields for tcp_info.
	 */
	ti->tcpi_rcv_space = tp->rcv_wnd;
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_bwnd = tp->snd_bwnd;
}
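/*
 * tcp_fill_info() is what backs the TCP_INFO socket option; a minimal
 * userspace consumer looks like this.  Per the comment above, tcpi_state
 * holds a FreeBSD TCPS_* value, not the Linux numbering.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

static void
print_tcp_info(int fd)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
		printf("state %u cwnd %u rtt %uus\n", ti.tcpi_state,
		    ti.tcpi_snd_cwnd, ti.tcpi_rtt);
}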
Example 5
static int
udp6_append(struct inpcb *inp, struct mbuf *n, int off,
    struct sockaddr_in6 *fromsa)
{
	struct socket *so;
	struct mbuf *opts;
	struct udpcb *up;

	INP_LOCK_ASSERT(inp);

	/*
	 * Engage the tunneling protocol.
	 */
	up = intoudpcb(inp);
	if (up->u_tun_func != NULL) {
		in_pcbref(inp);
		INP_RUNLOCK(inp);
		(*up->u_tun_func)(n, off, inp, (struct sockaddr *)fromsa,
		    up->u_tun_ctx);
		INP_RLOCK(inp);
		return (in_pcbrele_rlocked(inp));
	}
#ifdef IPSEC
	/* Check AH/ESP integrity. */
	if (ipsec6_in_reject(n, inp)) {
		m_freem(n);
		return (0);
	}
#endif /* IPSEC */
#ifdef MAC
	if (mac_inpcb_check_deliver(inp, n) != 0) {
		m_freem(n);
		return (0);
	}
#endif
	opts = NULL;
	if (inp->inp_flags & INP_CONTROLOPTS ||
	    inp->inp_socket->so_options & SO_TIMESTAMP)
		ip6_savecontrol(inp, n, &opts);
	m_adj(n, off + sizeof(struct udphdr));

	so = inp->inp_socket;
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)fromsa, n,
	    opts) == 0) {
		SOCKBUF_UNLOCK(&so->so_rcv);
		m_freem(n);
		if (opts)
			m_freem(opts);
		UDPSTAT_INC(udps_fullsock);
	} else
		sorwakeup_locked(so);
	return (0);
}
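/*
 * The tunnel call above drops the inpcb lock around the callback and
 * holds a reference to keep the pcb alive meanwhile; the final
 * in_pcbrele_rlocked() reports whether the last reference was dropped.
 * A self-contained sketch of that pattern with a hypothetical
 * refcounted object:
 */
#include <pthread.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t	lock;
	int		refs;		/* protected by lock */
};

static int
obj_rele_locked(struct obj *o)		/* in_pcbrele_rlocked() analogue */
{
	if (--o->refs == 0) {
		pthread_mutex_unlock(&o->lock);
		pthread_mutex_destroy(&o->lock);
		free(o);
		return (1);		/* object is gone */
	}
	pthread_mutex_unlock(&o->lock);
	return (0);
}

static int
call_unlocked(struct obj *o, void (*cb)(struct obj *))
{
	/* Caller holds o->lock, mirroring INP_RLOCK in udp6_append(). */
	o->refs++;			/* in_pcbref() analogue */
	pthread_mutex_unlock(&o->lock);	/* callback may sleep */
	cb(o);
	pthread_mutex_lock(&o->lock);
	return (obj_rele_locked(o));
}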
Example 6
static struct secpolicy *
ipsec_getpcbpolicy(struct inpcb *inp, u_int dir)
{
	struct secpolicy *sp;
	int flags, downgrade;

	if (inp == NULL || inp->inp_sp == NULL)
		return (NULL);

	INP_LOCK_ASSERT(inp);

	flags = inp->inp_sp->flags;
	if (dir == IPSEC_DIR_OUTBOUND) {
		sp = inp->inp_sp->sp_out;
		flags &= INP_OUTBOUND_POLICY;
	} else {
		sp = inp->inp_sp->sp_in;
		flags &= INP_INBOUND_POLICY;
	}
	/*
	 * Check flags. If we have PCB SP, just return it.
	 * Otherwise we need to check that cached SP entry isn't stale.
	 */
	if (flags == 0) {
		if (sp == NULL)
			return (NULL);
		if (inp->inp_sp->genid != key_getspgen()) {
			/* Invalidate the cache. */
			downgrade = 0;
			if (!INP_WLOCKED(inp)) {
				if ((downgrade = INP_TRY_UPGRADE(inp)) == 0)
					return (NULL);
			}
			ipsec_invalidate_cache(inp, IPSEC_DIR_OUTBOUND);
			ipsec_invalidate_cache(inp, IPSEC_DIR_INBOUND);
			if (downgrade != 0)
				INP_DOWNGRADE(inp);
			return (NULL);
		}
		KEYDBG(IPSEC_STAMP,
		    printf("%s: PCB(%p): cache hit SP(%p)\n",
		    __func__, inp, sp));
		/* Return referenced cached policy */
	}
	key_addref(sp);
	return (sp);
}
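/*
 * The invalidation above hinges on a read-to-write lock upgrade.  A
 * minimal sketch of that pattern with the rwlock(9) primitives that the
 * INP_* lock macros wrap (the cache structure is hypothetical):
 */
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/errno.h>

struct cache {
	void	*entry;
};

static int
invalidate_cache(struct rwlock *lk, struct cache *c)
{
	int upgraded = 0;

	rw_assert(lk, RA_LOCKED);	/* read or write lock held */
	if (!rw_wowned(lk)) {
		/* The upgrade fails if another thread holds a read lock. */
		if ((upgraded = rw_try_upgrade(lk)) == 0)
			return (EBUSY);	/* caller falls back to full lookup */
	}
	c->entry = NULL;		/* mutate only under the write lock */
	if (upgraded)
		rw_downgrade(lk);	/* restore the caller's read lock */
	return (0);
}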
/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
tcp_usrclosed(struct tcpcb *tp)
{

	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	INP_LOCK_ASSERT(tp->t_inpcb);

	//printf("tcp_usrclosed: called\n");
	switch (tp->t_state) {
	case TCPS_CLOSED:
	case TCPS_LISTEN:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);
		/*
		 * tcp_close() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(tp != NULL,
		    ("tcp_usrclosed: tcp_close() returned NULL"));
		break;

	case TCPS_SYN_SENT:
	case TCPS_SYN_RECEIVED:
		tp->t_flags |= TF_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (tp->t_state == TCPS_FIN_WAIT_2) {
			int timeout;

			timeout = (tcp_fast_finwait2_recycle) ? 
			    tcp_finwait2_timeout : tcp_maxidle;
			tcp_timer_activate(tp, TT_2MSL, timeout);
		}
	}
}
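/*
 * The FIN_WAIT_2 timeout chosen above is tunable.  A userspace sketch
 * that reads the two knobs involved (sysctl names as used by FreeBSD):
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static void
show_finwait2_knobs(void)
{
	int recycle = 0, timeout = 0;
	size_t len;

	len = sizeof(recycle);
	(void)sysctlbyname("net.inet.tcp.fast_finwait2_recycle", &recycle,
	    &len, NULL, 0);
	len = sizeof(timeout);
	(void)sysctlbyname("net.inet.tcp.finwait2_timeout", &timeout,
	    &len, NULL, 0);
	printf("fast_finwait2_recycle=%d finwait2_timeout=%d\n", recycle,
	    timeout);
}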
Example 8
/*
 * Notify a udp user of an asynchronous error; just wake up so that they can
 * collect error status.
 */
struct inpcb *
udp_notify(struct inpcb *inp, int errno)
{

	/*
	 * While udp_ctlinput() always calls udp_notify() with a read lock
	 * when invoking it directly, in_pcbnotifyall() currently uses write
	 * locks due to sharing code with TCP.  For now, accept either a read
	 * or a write lock, but a read lock is sufficient.
	 */
	INP_LOCK_ASSERT(inp);

	inp->inp_socket->so_error = errno;
	sorwakeup(inp->inp_socket);
	sowwakeup(inp->inp_socket);
	return (inp);
}
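/*
 * The wakeups above unblock any sleeping reader/writer; the application
 * then collects the pending error with SO_ERROR (which also clears it).
 * A minimal userspace sketch:
 */
#include <sys/socket.h>

static int
collect_socket_error(int fd)
{
	int err = 0;
	socklen_t len = sizeof(err);

	(void)getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
	return (err);	/* e.g. ECONNREFUSED after an ICMP port unreach */
}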
Example 9
/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, unsigned int opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct adapter *sc = td_adapter(toep->td);
	int n;

	INP_LOCK_ASSERT(inp);

	if (inp->inp_inc.inc_flags & INC_ISIPV6)
		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		n = sizeof(struct ip) + sizeof(struct tcphdr);
	tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n;

	CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid,
	    G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]);

	if (G_TCPOPT_TSTAMP(opt)) {
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getsbintime();
		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
	}

	if (G_TCPOPT_SACK(opt))
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	else
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	}
}
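/*
 * Unlike Example 1, which hard-codes "- 40", this version derives the
 * same constant for the IPv4 case from the header sizes.  A quick
 * standalone check (C11):
 */
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

_Static_assert(sizeof(struct ip) + sizeof(struct tcphdr) == 40,
    "fixed IPv4 + TCP headers are 20 + 20 bytes");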
Example 10
/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * ip_len and ip_off are in host format.
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 * In the IP forwarding case, the packet will arrive with options already
 * inserted, so must have a NULL opt pointer.
 */
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct inpcb *inp)
{
	struct ip *ip = NULL;
	struct ifnet *ifp = NULL;	/* keep compiler happy */
	struct mbuf *m0;
	int hlen = sizeof (struct ip);
	int mtu;
	int n;	/* scratchpad */
	int error = 0;
	int nortfree = 0;
	struct sockaddr_in *dst;
	struct in_ifaddr *ia = NULL;
	int isbroadcast, sw_csum;
	struct route iproute;
	struct rtentry *rte;	/* cache for ro->ro_rt */
	struct in_addr odst;
#ifdef IPFIREWALL_FORWARD
	struct m_tag *fwd_tag = NULL;
#endif
#ifdef IPSEC
	int no_route_but_check_spd = 0;
#endif
#ifdef PROMISCUOUS_INET
	struct ifl2info *l2i_tag = NULL;
	int ispromisc = 0;
#endif
	M_ASSERTPKTHDR(m);

	if (inp != NULL) {
		INP_LOCK_ASSERT(inp);
		M_SETFIB(m, inp->inp_inc.inc_fibnum);
		if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
			m->m_pkthdr.flowid = inp->inp_flowid;
			m->m_flags |= M_FLOWID;
		}
	}

#ifdef PROMISCUOUS_INET
	l2i_tag = (struct ifl2info *)m_tag_locate(m,
						  MTAG_PROMISCINET,
						  MTAG_PROMISCINET_L2INFO,
						  NULL);

	if ((inp && (inp->inp_flags2 & INP_PROMISC)) || l2i_tag) {
		unsigned int fib;

		if (l2i_tag) {
			/*
			 * This is a packet that has been turned around
			 * after reception, such as a TCP SYN packet being
			 * recycled as a RST, so fib comes from the mbuf,
			 * not the (probably nonexistent) connection
			 * context.
			 */
			fib = M_GETFIB(m);
		} else {
			fib = inp->inp_fibnum;

			if (0 != if_promiscinet_add_tag(m, inp->inp_l2info)) {
				goto bad;
			}
		}

		ifp = ifnet_byfib_ref(fib);
		if (NULL == ifp) {
			IPSTAT_INC(ips_noroute);
			error = EHOSTUNREACH;
			goto bad;
		}
		
		isbroadcast = 0;
		ispromisc = 1;
	}
#endif /* PROMISCUOUS_INET */

	if (ro == NULL) {
		ro = &iproute;
		bzero(ro, sizeof (*ro));

#ifdef FLOWTABLE
		{
			struct flentry *fle;
			
			/*
			 * The flow table returns route entries valid for up to 30
			 * seconds; we rely on the remainder of ip_output() taking no
			 * longer than that long for the stability of ro_rt.  The
			 * flow ID assignment must have happened before this point.
			 */
			if ((fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET)) != NULL) {
				flow_to_route(fle, ro);
				nortfree = 1;
			}
		}
#endif
	}

	if (opt) {
		int len = 0;
		m = ip_insertoptions(m, opt, &len);
		if (len != 0)
			hlen = len; /* ip->ip_hl is updated above */
	}
	ip = mtod(m, struct ip *);

	/*
	 * Fill in IP header.  If we are not allowing fragmentation,
	 * then the ip_id field is meaningless, but we don't set it
	 * to zero.  Doing so causes various problems when devices along
	 * the path (routers, load balancers, firewalls, etc.) illegally
	 * disable DF on our packet.  Note that a 16-bit counter
	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
	 * for Counting NATted Hosts", Proc. IMW'02, available at
	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
	 */
	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
		ip->ip_v = IPVERSION;
		ip->ip_hl = hlen >> 2;
		ip->ip_id = ip_newid();
		IPSTAT_INC(ips_localout);
	} else {
Example 11
/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 * If route ro is present and has ro_rt initialized, route lookup would be
 * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
 * then result of route lookup is stored in ro->ro_rt.
 *
 * In the IP forwarding case, the packet will arrive with options already
 * inserted, so must have a NULL opt pointer.
 */
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct inpcb *inp)
{
	struct ip *ip;
	struct ifnet *ifp = NULL;	/* keep compiler happy */
	struct mbuf *m0;
	int hlen = sizeof (struct ip);
	int mtu;
	int n;	/* scratchpad */
	int error = 0;
	struct sockaddr_in *dst;
	struct in_ifaddr *ia;
	int isbroadcast;
	uint16_t ip_len, ip_off, sw_csum;
	struct route iproute;
	struct rtentry *rte;	/* cache for ro->ro_rt */
	struct in_addr odst;
#ifdef IPFIREWALL_FORWARD
	struct m_tag *fwd_tag = NULL;
#endif
#ifdef IPSEC
	int no_route_but_check_spd = 0;
#endif
	M_ASSERTPKTHDR(m);

	if (inp != NULL) {
		INP_LOCK_ASSERT(inp);
		M_SETFIB(m, inp->inp_inc.inc_fibnum);
		if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
			m->m_pkthdr.flowid = inp->inp_flowid;
			m->m_flags |= M_FLOWID;
		}
	}

	if (ro == NULL) {
		ro = &iproute;
		bzero(ro, sizeof (*ro));
	}

#ifdef FLOWTABLE
	if (ro->ro_rt == NULL) {
		struct flentry *fle;
			
		/*
		 * The flow table returns route entries valid for up to 30
		 * seconds; we rely on the remainder of ip_output() taking no
		 * longer than that long for the stability of ro_rt. The
		 * flow ID assignment must have happened before this point.
		 */
		fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
		if (fle != NULL)
			flow_to_route(fle, ro);
	}
#endif

	if (opt) {
		int len = 0;
		m = ip_insertoptions(m, opt, &len);
		if (len != 0)
			hlen = len; /* ip->ip_hl is updated above */
	}
	ip = mtod(m, struct ip *);
	ip_len = ntohs(ip->ip_len);
	ip_off = ntohs(ip->ip_off);

	/*
	 * Fill in IP header.  If we are not allowing fragmentation,
	 * then the ip_id field is meaningless, but we don't set it
	 * to zero.  Doing so causes various problems when devices along
	 * the path (routers, load balancers, firewalls, etc.) illegally
	 * disable DF on our packet.  Note that a 16-bit counter
	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
	 * for Counting NATted Hosts", Proc. IMW'02, available at
	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
	 */
	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
		ip->ip_v = IPVERSION;
		ip->ip_hl = hlen >> 2;
		ip->ip_id = ip_newid();
		IPSTAT_INC(ips_localout);
	} else {
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
	struct in6_addr *addr6;
	int error;

	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	INP_LOCK_ASSERT(inp);

	if (inp->inp_lport == 0) {
		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
		if (error)
			return error;
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 * in6_pcbladdr() also handles scope zone IDs.
	 */
	error = in6_pcbladdr(inp, nam, &addr6);
	if (error)
		return error;
	oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
				  &sin6->sin6_addr, sin6->sin6_port,
				  IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
				  ? addr6
				  : &inp->in6p_laddr,
				  inp->inp_lport,  0, NULL);
	if (oinp)
		return EADDRINUSE;
	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
		inp->in6p_laddr = *addr6;
	inp->in6p_faddr = sin6->sin6_addr;
	inp->inp_fport = sin6->sin6_port;
	/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
	inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
	if (inp->in6p_flags & IN6P_AUTOFLOWLABEL)
		inp->in6p_flowinfo |=
		    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
	in_pcbrehash(inp);

	/* Compute window scaling to request.  */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
		tp->request_r_scale++;

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
	tp->iss = tcp_new_isn(tp);
	tp->t_bw_rtseq = tp->iss;
	tcp_sendseqinit(tp);

	return 0;
}
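/*
 * A worked example of the request_r_scale loop above: TCP_MAXWIN is
 * 65535, so for a 1 MB receive buffer the loop stops at 5, the smallest
 * shift with (65535 << s) >= 1048576.  A standalone sketch:
 */
#define EX_TCP_MAXWIN		65535UL
#define EX_TCP_MAX_WINSHIFT	14

static int
wscale_for(unsigned long sb_hiwat)
{
	int s = 0;

	while (s < EX_TCP_MAX_WINSHIFT && (EX_TCP_MAXWIN << s) < sb_hiwat)
		s++;
	return (s);	/* wscale_for(1024UL * 1024) == 5 */
}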
/*
 * Common subroutine to open a TCP connection to remote host specified
 * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
 * port number if needed.  Call in_pcbconnect_setup to do the routing and
 * to choose a local host address (interface).  If there is an existing
 * incarnation of the same connection in TIME-WAIT state and if the remote
 * host was sending CC options and if the connection duration was < MSL, then
 * truncate the previous TIME-WAIT state and proceed.
 * Initialize connection parameters and enter SYN-SENT state.
 */
static int
tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
	struct inpcb *inp = tp->t_inpcb, *oinp;
	struct socket *so = inp->inp_socket;
	struct in_addr laddr;
	u_short lport;
	int error;

	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	INP_LOCK_ASSERT(inp);

	if (inp->inp_lport == 0) {
#ifdef MAXHE_TODO
		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
#else
		error = in_pcbbind(inp, (struct sockaddr *)0, NULL);
#endif // MAXHE_TODO
		//printf("tcp_connect: error=%d\n", error);
		if (error)
			return error;
	}

	/*
	 * Cannot simply call in_pcbconnect, because there might be an
	 * earlier incarnation of this same connection still in
	 * TIME_WAIT state, creating an ADDRINUSE error.
	 */
	laddr = inp->inp_laddr;
	lport = inp->inp_lport;
#ifdef MAXHE_TODO
	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
#else
	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, NULL);
#endif // MAXHE_TODO
		//printf("tcp_connect: (2) error=%d\n", error);
	if (error && oinp == NULL)
		return error;
	if (oinp)
		return EADDRINUSE;
	inp->inp_laddr = laddr;
	in_pcbrehash(inp);

	/*
	 * Compute window scaling to request:
	 * Scale to fit into sweet spot.  See tcp_syncache.c.
	 * XXX: This should move to tcp_output().
	 */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
		tp->request_r_scale++;

	soisconnecting(so);
	tcpstat.tcps_connattempt++;
	tp->t_state = TCPS_SYN_SENT;
	tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
	tp->iss = tcp_new_isn(tp);
	tp->t_bw_rtseq = tp->iss;
	tcp_sendseqinit(tp);

		//printf("tcp_connect: (9) error=%d\n", error);
	return 0;
}
Example 14
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	long len, recwin, sendwin;
	int off, flags, error;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip = NULL;
	struct ipovly *ipov = NULL;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned ipoptlen, optlen, hdrlen;
	int idle, sendalot;
	int i, sack_rxmit;
	int sack_bytes_rxmt;
	struct sackhole *p;
#if 0
	int maxburst = TCP_MAXBURST;
#endif
	struct rmxp_tao tao;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;

	bzero(&tao, sizeof(tao));
	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
#ifdef TCP_ECN
	int needect;
#endif

	INP_LOCK_ASSERT(tp->t_inpcb);

	bzero(&tao, sizeof(tao));	/* tao is read below even without INET6 */

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
		/*
		 * We have been idle for "a while" and no acks are
		 * expected to clock out any data we send --
		 * slow start to get ack "clock" running again.
		 *
		 * Set the slow-start flight size depending on whether
		 * this is a local network or not.
		 */
		int ss = ss_fltsz;
#ifdef INET6
		if (isipv6) {
			if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
				ss = ss_fltsz_local;
		} else
#endif
		if (in_localaddr(tp->t_inpcb->inp_faddr))
			ss = ss_fltsz_local;
		tp->snd_cwnd = tp->t_maxseg * ss;
	}
	tp->t_flags &= ~TF_LASTIDLE;
	if (idle) {
		if (tp->t_flags & TF_MORETOCOME) {
			tp->t_flags |= TF_LASTIDLE;
			idle = 0;
		}
	}
again:
	/*
	 * If we've recently taken a timeout, snd_max will be greater than
	 * snd_nxt.  There may be SACK information that allows us to avoid
	 * resending already delivered data.  Adjust snd_nxt accordingly.
	 */
	if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
		tcp_sack_adjust(tp);
	sendalot = 0;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Send any SACK-generated retransmissions.  If we're explicitly trying
	 * to send out new data (when sendalot is 1), bypass this function.
	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
	 * we're replacing a (future) new transmission with a retransmission
	 * now, and we previously incremented snd_cwnd in tcp_input().
	 */
	/*
	 * Reset the SACK retransmit state before consulting the scoreboard.
	 */
	sack_rxmit = 0;
	sack_bytes_rxmt = 0;
	len = 0;
	p = NULL;
	if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
		long cwin;
		
		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
		if (cwin < 0)
			cwin = 0;
		/* Do not retransmit SACK segments beyond snd_recover */
		if (SEQ_GT(p->end, tp->snd_recover)) {
			/*
			 * (At least) part of sack hole extends beyond
			 * snd_recover. Check to see if we can rexmit data
			 * for this hole.
			 */
			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
				/*
				 * Can't rexmit any more data for this hole.
				 * That data will be rexmitted in the next
				 * sack recovery episode, when snd_recover
				 * moves past p->rxmit.
				 */
				p = NULL;
				goto after_sack_rexmit;
			} else
				/* Can rexmit part of the current hole */
				len = ((long)ulmin(cwin,
						   tp->snd_recover - p->rxmit));
		} else
			len = ((long)ulmin(cwin, p->end - p->rxmit));
		off = p->rxmit - tp->snd_una;
		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
		    __func__, off));
		if (len > 0) {
			sack_rxmit = 1;
			sendalot = 1;
			tcpstat.tcps_sack_rexmits++;
			tcpstat.tcps_sack_rexmit_bytes +=
			    min(len, tp->t_maxseg);
		}
	}
after_sack_rexmit:
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	SOCKBUF_LOCK(&so->so_snd);
	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_force) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.sb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			callout_stop(tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 *
	 * If sack_rxmit is true we are retransmitting from the scoreboard
	 * in which case len is already set.
	 */
	if (sack_rxmit == 0) {
		if (sack_bytes_rxmt == 0)
			len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
		else {
			long cwin;

			/*
			 * We are inside of a SACK recovery episode and are
			 * sending new data, having retransmitted all the
			 * data possible in the scoreboard.
			 */
			len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) 
			       - off);
			/*
			 * Don't remove this (len > 0) check !
			 * We explicitly check for len > 0 here (although it 
			 * isn't really necessary), to work around a gcc 
			 * optimization issue - to force gcc to compute
			 * len above. Without this check, the computation
			 * of len is bungled by the optimizer.
			 */
			if (len > 0) {
				cwin = tp->snd_cwnd - 
					(tp->snd_nxt - tp->sack_newdata) -
					sack_bytes_rxmt;
				if (cwin < 0)
					cwin = 0;
				len = lmin(len, cwin);
			}
		}
	}

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data and if we don't
	 * know that foreign host supports TAO, suppress sending segment.
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (tcp_do_rfc1644)
			tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao);
		if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
		     tao.tao_ccsent == 0)
			goto just_return;
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments
	 * in cases when no CC option will be sent.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if ((flags & TH_SYN) &&
	    ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
	     ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * If FIN has been sent but not acked,
		 * but we haven't been called to retransmit,
		 * len will be < 0.  Otherwise, window shrank
		 * after we sent into it.  If window shrank to 0,
		 * cancel pending retransmit, pull snd_nxt back
		 * to (closed) window, and set the persist timer
		 * if it isn't already going.  If the window didn't
		 * close completely, just wait for an ACK.
		 */
		len = 0;
		if (sendwin == 0) {
			callout_stop(tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!callout_active(tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	/*
	 * len will be >= 0 after this point.  Truncate to the maximum
	 * segment length and ensure that FIN is removed if the length
	 * no longer contains the last data byte.
	 */
	if (len > tp->t_maxseg) {
		len = tp->t_maxseg;
		sendalot = 1;
	}
	if (sack_rxmit) {
		if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
			flags &= ~TH_FIN;
	} else {
		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
			flags &= ~TH_FIN;
	}

	recwin = sbspace(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (the receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len == tp->t_maxseg)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.sb_cc &&
		    (tp->t_flags & TF_NOPUSH) == 0) {
			goto send;
		}
		if (tp->t_force)			/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
		if (sack_rxmit)
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 * Skip this if the connection is in T/TCP half-open state.
	 */
	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);

		if (adv >= (long) (2 * tp->t_maxseg))
			goto send;
		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if (flags & TH_FIN &&
	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
		goto send;
	/*
	 * In SACK, it is possible for tcp_output to fail to send a segment
	 * after the retransmission timer has been turned off.  Make sure
	 * that the retransmission timer is set.
	 */
	if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
	    !callout_active(tp->tt_rexmt) &&
	    !callout_active(tp->tt_persist)) {
		callout_reset(tp->tt_rexmt, tp->t_rxtcur,
			      tcp_timer_rexmt, tp);
		goto just_return;
	} 
	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to ensure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * callout_active(tp->tt_persist)
	 *	is true when we are in persist state.
	 * tp->t_force
	 *	is set when we are called to send a persist packet.
	 * callout_active(tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can,
	 * otherwise force out a byte.
	 */
	if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
	    !callout_active(tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
just_return:
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);

send:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
#ifdef INET6
	if (isipv6)
		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	else
#endif
	hdrlen = sizeof (struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if ((tp->t_flags & TF_NOOPT) == 0) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc));
			(void)memcpy(opt + 2, &mss, sizeof(mss));
			optlen = TCPOLEN_MAXSEG;

			/*
			 * If this is the first SYN of connection (not a SYN
			 * ACK), include SACK_PERMIT_HDR option.  If this is a
			 * SYN ACK, include SACK_PERMIT_HDR option if peer has
			 * already done so. This is only for active connect,
			 * since the syncache takes care of the passive connect.
			 */
			if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
			    (tp->t_flags & TF_SACK_PERMIT))) {
				*((u_int32_t *) (opt + optlen)) =
					htonl(TCPOPT_SACK_PERMIT_HDR);
				optlen += 4;
			}
			if ((tp->t_flags & TF_REQ_SCALE) &&
			    ((flags & TH_ACK) == 0 ||
			    (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (flags & TH_RST) == 0 &&
	    ((flags & TH_ACK) == 0 ||
	     (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}
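	/*
	 * The 12 bytes (TCPOLEN_TSTAMP_APPA) appended above follow the
	 * RFC 1323 appendix A layout: NOP, NOP, TCPOPT_TIMESTAMP, length
	 * 10, then the 4-byte TSval and TSecr; the two leading NOPs keep
	 * the timestamp fields 32-bit aligned.
	 */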

	/*
	 * Send SACKs if necessary.  This should be the last option processed.
	 * Only as many SACKs are sent as are permitted by the maximum options
	 * size.  No more than three SACKs are sent.
	 */
	if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
	    tp->rcv_numsacks) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);
		u_int32_t *olp = lp++;
		int count = 0;  /* actual number of SACKs inserted */
		int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;

		tcpstat.tcps_sack_send_blocks++;
		maxsack = min(maxsack, TCP_MAX_SACK);
		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
			struct sackblk sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			*lp++ = htonl(sack.start);
			*lp++ = htonl(sack.end);
			count++;
		}
		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
		optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
	}
	/*
	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
	 * options are allowed (!TF_NOOPT) and it's not a RST.
	 */
	if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
	     (flags & TH_RST) == 0) {
		switch (flags & (TH_SYN|TH_ACK)) {
		/*
		 * This is a normal ACK, send CC if we received CC before
		 * from our peer.
		 */
		case TH_ACK:
			if (!(tp->t_flags & TF_RCVD_CC))
				break;
			/*FALLTHROUGH*/

		/*
		 * We can only get here in T/TCP's SYN_SENT* state, when
		 * we're a sending a non-SYN segment without waiting for
		 * the ACK of our SYN.  A check above assures that we only
		 * do this if our peer understands T/TCP.
		 */
		case 0:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);

			optlen += 4;
			break;

		/*
		 * This is our initial SYN, check whether we have to use
		 * CC or CC.new.
		 */
		case TH_SYN:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
						TCPOPT_CCNEW : TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
			optlen += 4;
			break;

		/*
		 * This is a SYN,ACK; send CC and CC.echo if we received
		 * CC from our peer.
		 */
		case (TH_SYN|TH_ACK):
			if (tp->t_flags & TF_RCVD_CC) {
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CC;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_send);
				optlen += 4;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CCECHO;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_recv);
				optlen += 4;
			}
			break;
		}
	}

#ifdef TCP_SIGNATURE
#ifdef INET6
	if (!isipv6)
#endif
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;

		/* Initialize TCP-MD5 option (RFC2385) */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;

		/* Terminate options list and maintain 32-bit alignment. */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */

	hdrlen += optlen;

#ifdef INET6
	if (isipv6)
		ipoptlen = ip6_optlen(tp->t_inpcb);
	else
#endif
	if (tp->t_inpcb->inp_options)
		ipoptlen = tp->t_inpcb->inp_options->m_len -
				offsetof(struct ipoption, ipopt_list);
	else
Example 15
static int
udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
    struct mbuf *control, struct thread *td)
{
	struct udpiphdr *ui;
	int len = m->m_pkthdr.len;
	struct in_addr faddr, laddr;
	struct cmsghdr *cm;
	struct inpcbinfo *pcbinfo;
	struct sockaddr_in *sin, src;
	int cscov_partial = 0;
	int error = 0;
	int ipflags;
	u_short fport, lport;
	int unlock_udbinfo;
	u_char tos;
	uint8_t pr;
	uint16_t cscov = 0;

	/*
	 * udp_output() may need to temporarily bind or connect the current
	 * inpcb.  As such, we don't know up front whether we will need the
	 * pcbinfo lock or not.  Do any work to decide what is needed up
	 * front before acquiring any locks.
	 */
	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
		if (control)
			m_freem(control);
		m_freem(m);
		return (EMSGSIZE);
	}

	src.sin_family = 0;
	INP_RLOCK(inp);
	tos = inp->inp_ip_tos;
	if (control != NULL) {
		/*
		 * XXX: Currently, we assume all the optional information is
		 * stored in a single mbuf.
		 */
		if (control->m_next) {
			INP_RUNLOCK(inp);
			m_freem(control);
			m_freem(m);
			return (EINVAL);
		}
		for (; control->m_len > 0;
		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
			cm = mtod(control, struct cmsghdr *);
			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
			    || cm->cmsg_len > control->m_len) {
				error = EINVAL;
				break;
			}
			if (cm->cmsg_level != IPPROTO_IP)
				continue;

			switch (cm->cmsg_type) {
			case IP_SENDSRCADDR:
				if (cm->cmsg_len !=
				    CMSG_LEN(sizeof(struct in_addr))) {
					error = EINVAL;
					break;
				}
				bzero(&src, sizeof(src));
				src.sin_family = AF_INET;
				src.sin_len = sizeof(src);
				src.sin_port = inp->inp_lport;
				src.sin_addr =
				    *(struct in_addr *)CMSG_DATA(cm);
				break;

			case IP_TOS:
				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
					error = EINVAL;
					break;
				}
				tos = *(u_char *)CMSG_DATA(cm);
				break;

			default:
				error = ENOPROTOOPT;
				break;
			}
			if (error)
				break;
		}
		m_freem(control);
	}
	if (error) {
		INP_RUNLOCK(inp);
		m_freem(m);
		return (error);
	}

	/*
	 * Depending on whether or not the application has bound or connected
	 * the socket, we may have to do varying levels of work.  The optimal
	 * case is for a connected UDP socket, as a global lock isn't
	 * required at all.
	 *
	 * In order to decide which we need, we require stability of the
	 * inpcb binding, which we ensure by acquiring a read lock on the
	 * inpcb.  This doesn't strictly follow the lock order, so we play
	 * the trylock and retry game; note that we may end up with more
	 * conservative locks than required the second time around, so later
	 * assertions have to accept that.  Further analysis of the number of
	 * misses under contention is required.
	 *
	 * XXXRW: Check that hash locking update here is correct.
	 */
	pr = inp->inp_socket->so_proto->pr_protocol;
	pcbinfo = get_inpcbinfo(pr);
	sin = (struct sockaddr_in *)addr;
	if (sin != NULL &&
	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
		INP_RUNLOCK(inp);
		INP_WLOCK(inp);
		INP_HASH_WLOCK(pcbinfo);
		unlock_udbinfo = UH_WLOCKED;
	} else if ((sin != NULL && (
	    (sin->sin_addr.s_addr == INADDR_ANY) ||
	    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
	    (inp->inp_laddr.s_addr == INADDR_ANY) ||
	    (inp->inp_lport == 0))) ||
	    (src.sin_family == AF_INET)) {
		INP_HASH_RLOCK(pcbinfo);
		unlock_udbinfo = UH_RLOCKED;
	} else
		unlock_udbinfo = UH_UNLOCKED;

	/*
	 * If the IP_SENDSRCADDR control message was specified, override the
	 * source address for this datagram.  Its use is invalidated if the
	 * address thus specified is incomplete or clobbers other inpcbs.
	 */
	laddr = inp->inp_laddr;
	lport = inp->inp_lport;
	if (src.sin_family == AF_INET) {
		INP_HASH_LOCK_ASSERT(pcbinfo);
		if ((lport == 0) ||
		    (laddr.s_addr == INADDR_ANY &&
		     src.sin_addr.s_addr == INADDR_ANY)) {
			error = EINVAL;
			goto release;
		}
		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
		    &laddr.s_addr, &lport, td->td_ucred);
		if (error)
			goto release;
	}

	/*
	 * If a UDP socket has been connected, then a local address/port will
	 * have been selected and bound.
	 *
	 * If a UDP socket has not been connected to, then an explicit
	 * destination address must be used, in which case a local
	 * address/port may not have been selected and bound.
	 */
	if (sin != NULL) {
		INP_LOCK_ASSERT(inp);
		if (inp->inp_faddr.s_addr != INADDR_ANY) {
			error = EISCONN;
			goto release;
		}

		/*
		 * Jail may rewrite the destination address, so let it do
		 * that before we use it.
		 */
		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
		if (error)
			goto release;

		/*
		 * If a local address or port hasn't yet been selected, or if
		 * the destination address needs to be rewritten due to using
		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
		 * to do the heavy lifting.  Once a port is selected, we
		 * commit the binding back to the socket; we also commit the
		 * binding of the address if in jail.
		 *
		 * If we already have a valid binding and we're not
		 * requesting a destination address rewrite, use a fast path.
		 */
		if (inp->inp_laddr.s_addr == INADDR_ANY ||
		    inp->inp_lport == 0 ||
		    sin->sin_addr.s_addr == INADDR_ANY ||
		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
			INP_HASH_LOCK_ASSERT(pcbinfo);
			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
			    &lport, &faddr.s_addr, &fport, NULL,
			    td->td_ucred);
			if (error)
				goto release;

			/*
			 * XXXRW: Why not commit the port if the address is
			 * !INADDR_ANY?
			 */
			/* Commit the local port if newly assigned. */
			if (inp->inp_laddr.s_addr == INADDR_ANY &&
			    inp->inp_lport == 0) {
				INP_WLOCK_ASSERT(inp);
				INP_HASH_WLOCK_ASSERT(pcbinfo);
				/*
				 * Remember addr if jailed, to prevent
				 * rebinding.
				 */
				if (prison_flag(td->td_ucred, PR_IP4))
					inp->inp_laddr = laddr;
				inp->inp_lport = lport;
				if (in_pcbinshash(inp) != 0) {
					inp->inp_lport = 0;
					error = EAGAIN;
					goto release;
				}
				inp->inp_flags |= INP_ANONPORT;
			}
		} else {
			faddr = sin->sin_addr;
			fport = sin->sin_port;
		}
	} else {
		INP_LOCK_ASSERT(inp);
		faddr = inp->inp_faddr;
		fport = inp->inp_fport;
		if (faddr.s_addr == INADDR_ANY) {
			error = ENOTCONN;
			goto release;
		}
	}

	/*
	 * Calculate data length and get a mbuf for UDP, IP, and possible
	 * link-layer headers.  Immediately slide the data pointer forward again
	 * since we won't use that space at this layer.
	 */
	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
	if (m == NULL) {
		error = ENOBUFS;
		goto release;
	}
	m->m_data += max_linkhdr;
	m->m_len -= max_linkhdr;
	m->m_pkthdr.len -= max_linkhdr;

	/*
	 * Fill in mbuf with extended UDP header and addresses and length put
	 * into network format.
	 */
	ui = mtod(m, struct udpiphdr *);
	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
	ui->ui_pr = pr;
	ui->ui_src = laddr;
	ui->ui_dst = faddr;
	ui->ui_sport = lport;
	ui->ui_dport = fport;
	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
	if (pr == IPPROTO_UDPLITE) {
		struct udpcb *up;
		uint16_t plen;

		up = intoudpcb(inp);
		cscov = up->u_txcslen;
		plen = (u_short)len + sizeof(struct udphdr);
		if (cscov >= plen)
			cscov = 0;
		ui->ui_len = htons(plen);
		ui->ui_ulen = htons(cscov);
		/*
		 * For UDP-Lite, checksum coverage length of zero means
		 * the entire UDPLite packet is covered by the checksum.
		 */
		cscov_partial = (cscov == 0) ? 0 : 1;
	} else
		ui->ui_v = IPVERSION << 4;

	/*
	 * Set the Don't Fragment bit in the IP header.
	 */
	if (inp->inp_flags & INP_DONTFRAG) {
		struct ip *ip;

		ip = (struct ip *)&ui->ui_i;
		ip->ip_off |= htons(IP_DF);
	}

	ipflags = 0;
	if (inp->inp_socket->so_options & SO_DONTROUTE)
		ipflags |= IP_ROUTETOIF;
	if (inp->inp_socket->so_options & SO_BROADCAST)
		ipflags |= IP_ALLOWBROADCAST;
	if (inp->inp_flags & INP_ONESBCAST)
		ipflags |= IP_SENDONES;

#ifdef MAC
	mac_inpcb_create_mbuf(inp, m);
#endif

	/*
	 * Set up checksum and output datagram.
	 */
	ui->ui_sum = 0;
	if (cscov_partial) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
			ui->ui_sum = 0xffff;
	} else if (V_udp_cksum || !cscov_partial) {
		if (inp->inp_flags & INP_ONESBCAST)
			faddr.s_addr = INADDR_BROADCAST;
		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
		    htons((u_short)len + sizeof(struct udphdr) + pr));
		m->m_pkthdr.csum_flags = CSUM_UDP;
		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
	}
	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
	((struct ip *)ui)->ip_tos = tos;		/* XXX */
	UDPSTAT_INC(udps_opackets);

	if (unlock_udbinfo == UH_WLOCKED)
		INP_HASH_WUNLOCK(pcbinfo);
	else if (unlock_udbinfo == UH_RLOCKED)
		INP_HASH_RUNLOCK(pcbinfo);
	UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
	error = ip_output(m, inp->inp_options, NULL, ipflags,
	    inp->inp_moptions, inp);
	if (unlock_udbinfo == UH_WLOCKED)
		INP_WUNLOCK(inp);
	else
		INP_RUNLOCK(inp);
	return (error);

release:
	if (unlock_udbinfo == UH_WLOCKED) {
		INP_HASH_WUNLOCK(pcbinfo);
		INP_WUNLOCK(inp);
	} else if (unlock_udbinfo == UH_RLOCKED) {
		INP_HASH_RUNLOCK(pcbinfo);
		INP_RUNLOCK(inp);
	} else
		INP_RUNLOCK(inp);
	m_freem(m);
	return (error);
}
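/*
 * A userspace sketch of the UDP-Lite partial-coverage path above: the
 * sender coverage (u_txcslen) is set with the UDPLITE_SEND_CSCOV socket
 * option (names as in netinet/udplite.h), where a coverage of zero means
 * the checksum covers the whole packet.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udplite.h>

static int
udplite_cover_header_only(int fd)
{
	int cscov = 8;	/* cover only the 8-byte UDP-Lite header */

	return (setsockopt(fd, IPPROTO_UDPLITE, UDPLITE_SEND_CSCOV,
	    &cscov, sizeof(cscov)));
}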
Example 16
/*
 * Subroutine of udp_input(), which appends the provided mbuf chain to the
 * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
 * contains the source address.  If the socket ends up being an IPv6 socket,
 * udp_append() will convert to a sockaddr_in6 before passing the address
 * into the socket code.
 */
static void
udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
    struct sockaddr_in *udp_in)
{
	struct sockaddr *append_sa;
	struct socket *so;
	struct mbuf *opts = NULL;
#ifdef INET6
	struct sockaddr_in6 udp_in6;
#endif
	struct udpcb *up;

	INP_LOCK_ASSERT(inp);

	/*
	 * Engage the tunneling protocol.
	 */
	up = intoudpcb(inp);
	if (up->u_tun_func != NULL) {
		(*up->u_tun_func)(n, off, inp);
		return;
	}

	if (n == NULL)
		return;

	off += sizeof(struct udphdr);

#ifdef IPSEC
	/* Check AH/ESP integrity. */
	if (ipsec4_in_reject(n, inp)) {
		m_freem(n);
		IPSECSTAT_INC(ips_in_polvio);
		return;
	}
#ifdef IPSEC_NAT_T
	up = intoudpcb(inp);
	KASSERT(up != NULL, ("%s: udpcb NULL", __func__));
	if (up->u_flags & UF_ESPINUDP_ALL) {	/* IPSec UDP encaps. */
		n = udp4_espdecap(inp, n, off);
		if (n == NULL)				/* Consumed. */
			return;
	}
#endif /* IPSEC_NAT_T */
#endif /* IPSEC */
#ifdef MAC
	if (mac_inpcb_check_deliver(inp, n) != 0) {
		m_freem(n);
		return;
	}
#endif /* MAC */
	if (inp->inp_flags & INP_CONTROLOPTS ||
	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6)
			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
		else
#endif /* INET6 */
			ip_savecontrol(inp, &opts, ip, n);
	}
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		bzero(&udp_in6, sizeof(udp_in6));
		udp_in6.sin6_len = sizeof(udp_in6);
		udp_in6.sin6_family = AF_INET6;
		in6_sin_2_v4mapsin6(udp_in, &udp_in6);
		append_sa = (struct sockaddr *)&udp_in6;
	} else
#endif /* INET6 */
		append_sa = (struct sockaddr *)udp_in;
	m_adj(n, off);

	so = inp->inp_socket;
	SOCKBUF_LOCK(&so->so_rcv);
	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
		SOCKBUF_UNLOCK(&so->so_rcv);
		m_freem(n);
		if (opts)
			m_freem(opts);
		UDPSTAT_INC(udps_fullsock);
	} else
		sorwakeup_locked(so);
}
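/*
 * What in6_sin_2_v4mapsin6() above produces, in userspace terms: the
 * IPv4 source a.b.c.d is delivered to an IPv6 socket as the v4-mapped
 * address ::ffff:a.b.c.d.  A minimal sketch of the conversion:
 */
#include <string.h>
#include <netinet/in.h>

static void
v4_to_mapped(const struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
{
	memset(sin6, 0, sizeof(*sin6));
	sin6->sin6_family = AF_INET6;
	sin6->sin6_len = sizeof(*sin6);		/* BSD-style sa_len field */
	sin6->sin6_port = sin->sin_port;
	sin6->sin6_addr.s6_addr[10] = 0xff;	/* ::ffff:0:0/96 prefix */
	sin6->sin6_addr.s6_addr[11] = 0xff;
	memcpy(&sin6->sin6_addr.s6_addr[12], &sin->sin_addr, 4);
}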
/*
 * tcp_detach is called when the socket layer loses its final reference
 * to the socket, be it a file descriptor reference, a reference from TCP,
 * etc.  At this point, there is only one case in which we will keep around
 * inpcb state: time wait.
 *
 * This function can probably be re-absorbed back into tcp_usr_detach() now
 * that there is a single detach path.
 */
static void
tcp_detach(struct socket *so, struct inpcb *inp)
{
	struct tcpcb *tp;
#ifdef INET6
	int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
#endif

	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	INP_LOCK_ASSERT(inp);

	KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
	KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));

	tp = intotcpcb(inp);

	if (inp->inp_vflag & INP_TIMEWAIT) {
		/*
		 * There are two cases to handle: one in which the time wait
		 * state is being discarded (INP_DROPPED), and one in which
		 * this connection will remain in timewait.  In the former,
		 * it is time to discard all state (except tcptw, which has
		 * already been discarded by the timewait close code, which
		 * should be further up the call stack somewhere).  In the
		 * latter case, we detach from the socket, but leave the pcb
		 * present until timewait ends.
		 *
		 * XXXRW: Would it be cleaner to free the tcptw here?
		 */
		if (inp->inp_vflag & INP_DROPPED) {
			KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
			    "INP_DROPPED && tp != NULL"));
#ifdef INET6
			if (isipv6) {
				in6_pcbdetach(inp);
				in6_pcbfree(inp);
			} else {
#endif
				in_pcbdetach(inp);
				in_pcbfree(inp);
#ifdef INET6
			}
#endif
		} else {
#ifdef INET6
			if (isipv6)
				in6_pcbdetach(inp);
			else
#endif
				in_pcbdetach(inp);
			INP_UNLOCK(inp);
		}
	} else {
		/*
		 * If the connection is not in timewait, we consider two
		 * conditions: one in which no further processing is
		 * necessary (dropped || embryonic), and one in which TCP is
		 * not yet done, but no longer requires the socket, so the
		 * pcb will persist for the time being.
		 *
		 * XXXRW: Does the second case still occur?
		 */
		if (inp->inp_vflag & INP_DROPPED ||
		    tp->t_state < TCPS_SYN_SENT) {
			tcp_discardcb(tp);
#ifdef INET6
			if (isipv6) {
				in6_pcbdetach(inp);
				in6_pcbfree(inp);
			} else {
#endif
				in_pcbdetach(inp);
				in_pcbfree(inp);
#ifdef INET6
			}
#endif
		} else {
#ifdef INET6
			if (isipv6)
				in6_pcbdetach(inp);
			else
#endif
				in_pcbdetach(inp);
		}
	}
}
Example 18
/*
 * IP output.  The packet in mbuf chain m contains a skeletal IP
 * header (with len, off, ttl, proto, tos, src, dst).
 * The mbuf chain containing the packet will be freed.
 * The mbuf opt, if present, will not be freed.
 * If route ro is present and has ro_rt initialized, route lookup would be
 * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
 * then result of route lookup is stored in ro->ro_rt.
 *
 * In the IP forwarding case, the packet will arrive with options already
 * inserted, so must have a NULL opt pointer.
 */
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct inpcb *inp)
{
	struct ip *ip;
	struct ifnet *ifp = NULL;	/* keep compiler happy */
	struct mbuf *m0;
	int hlen = sizeof (struct ip);
	int mtu;
	int n;	/* scratchpad */
	int error = 0;
	struct sockaddr_in *dst;
	const struct sockaddr_in *gw;
	struct in_ifaddr *ia;
	int isbroadcast;
	uint16_t ip_len, ip_off;
	struct route iproute;
	struct rtentry *rte;	/* cache for ro->ro_rt */
	struct in_addr odst;
	struct m_tag *fwd_tag = NULL;
	int have_ia_ref;
#ifdef IPSEC
	int no_route_but_check_spd = 0;
#endif
	M_ASSERTPKTHDR(m);

	if (inp != NULL) {
		INP_LOCK_ASSERT(inp);
		M_SETFIB(m, inp->inp_inc.inc_fibnum);
		if (inp->inp_flowtype != M_HASHTYPE_NONE) {
			m->m_pkthdr.flowid = inp->inp_flowid;
			M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
		}
	}

	if (ro == NULL) {
		ro = &iproute;
		bzero(ro, sizeof (*ro));
	}

#ifdef FLOWTABLE
	if (ro->ro_rt == NULL)
		(void)flowtable_lookup(AF_INET, m, ro);
#endif

	if (opt) {
		int len = 0;
		m = ip_insertoptions(m, opt, &len);
		if (len != 0)
			hlen = len; /* ip->ip_hl is updated above */
	}
	ip = mtod(m, struct ip *);
	ip_len = ntohs(ip->ip_len);
	ip_off = ntohs(ip->ip_off);

	/*
	 * Fill in IP header.  If we are not allowing fragmentation,
	 * then the ip_id field is meaningless, but we don't set it
	 * to zero.  Doing so causes various problems when devices along
	 * the path (routers, load balancers, firewalls, etc.) illegally
	 * disable DF on our packet.  Note that a 16-bit counter
	 * will wrap around in less than 10 seconds at 100 Mbit/s on a
	 * medium with MTU 1500.  See Steven M. Bellovin, "A Technique
	 * for Counting NATted Hosts", Proc. IMW'02, available at
	 * <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
	 */
	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
		ip->ip_v = IPVERSION;
		ip->ip_hl = hlen >> 2;
		ip->ip_id = ip_newid();
		IPSTAT_INC(ips_localout);
	} else {
Example 19
static void
ipsec_cachepolicy(struct inpcb *inp, struct secpolicy *sp, u_int dir)
{
	uint32_t genid;
	int downgrade;

	INP_LOCK_ASSERT(inp);

	if (dir == IPSEC_DIR_OUTBOUND) {
		/* Do we have configured PCB policy? */
		if (inp->inp_sp->flags & INP_OUTBOUND_POLICY)
			return;
		/* Another thread has already set cached policy */
		if (inp->inp_sp->sp_out != NULL)
			return;
		/*
		 * Do not cache OUTBOUND policy if PCB isn't connected,
		 * i.e. foreign address is INADDR_ANY/UNSPECIFIED.
		 */
#ifdef INET
		if ((inp->inp_vflag & INP_IPV4) != 0 &&
		    inp->inp_faddr.s_addr == INADDR_ANY)
			return;
#endif
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0 &&
		    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			return;
#endif
	} else {
		/* Do we have configured PCB policy? */
		if (inp->inp_sp->flags & INP_INBOUND_POLICY)
			return;
		/* Another thread has already set cached policy */
		if (inp->inp_sp->sp_in != NULL)
			return;
		/*
		 * Do not cache INBOUND policy for listen socket,
		 * that is bound to INADDR_ANY/UNSPECIFIED address.
		 */
#ifdef INET
		if ((inp->inp_vflag & INP_IPV4) != 0 &&
		    inp->inp_faddr.s_addr == INADDR_ANY)
			return;
#endif
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0 &&
		    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			return;
#endif
	}
	downgrade = 0;
	if (!INP_WLOCKED(inp)) {
		if ((downgrade = INP_TRY_UPGRADE(inp)) == 0)
			return;
	}
	if (dir == IPSEC_DIR_OUTBOUND)
		inp->inp_sp->sp_out = sp;
	else
		inp->inp_sp->sp_in = sp;
	/*
	 * SP is already referenced by the lookup code.
	 * We take extra reference here to avoid race in the
	 * ipsec_getpcbpolicy() function - SP will not be freed in the
	 * time between we take SP pointer from the cache and key_addref()
	 * call.
	 */
	key_addref(sp);
	genid = key_getspgen();
	if (genid != inp->inp_sp->genid) {
		ipsec_invalidate_cache(inp, dir);
		inp->inp_sp->genid = genid;
	}
	KEYDBG(IPSEC_STAMP,
	    printf("%s: PCB(%p): cached %s SP(%p)\n",
	    __func__, inp, dir == IPSEC_DIR_OUTBOUND ? "OUTBOUND":
	    "INBOUND", sp));
	if (downgrade != 0)
		INP_DOWNGRADE(inp);
}
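/*
 * The genid check above is generation-count invalidation: the SPD bumps
 * a global generation on every change, each cache records the generation
 * it was filled at, and a mismatch on lookup marks the entry stale.  A
 * standalone sketch with hypothetical names:
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t spd_gen;	/* bumped on every SPD change */

struct sp_cache {
	void		*sp;	/* cached, referenced policy */
	uint32_t	genid;	/* spd_gen when the entry was filled */
};

static void *
sp_cache_get(struct sp_cache *c)
{
	if (c->sp == NULL || c->genid != spd_gen)
		return (NULL);	/* miss or stale: caller does a full lookup */
	return (c->sp);
}

static void
sp_cache_fill(struct sp_cache *c, void *sp)
{
	c->sp = sp;
	c->genid = spd_gen;
}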