/*
 * tcp_timer_persist_handler() - handle expiry of the TCP persist timer.
 *
 * Caller should be in critical section.
 *
 * Forces a window-probe segment out when the peer has advertised a zero
 * window, and drops the connection if probing has gone unanswered for
 * too long.  Returns the (possibly replaced) tcpcb pointer; may return
 * NULL if tcp_drop() released the connection.
 */
static struct tcpcb *
tcp_timer_persist_handler(struct tcpcb *tp)
{
#ifdef TCPDEBUG
   int ostate;    /* t_state snapshot before we act, for tcp_trace() below */
#endif

#ifdef TCPDEBUG
   ostate = tp->t_state;
#endif
   /*
    * Persistence timer into zero window.
    * Force a byte to be output, if possible.
    */
   tcpstat.tcps_persisttimeo++;

   /*
    * Hack: if the peer is dead/unreachable, we do not
    * time out if the window is closed.  After a full
    * backoff, drop the connection if the idle time
    * (no responses to probes) reaches the maximum
    * backoff that we would use if retransmitting.
    */
   if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
       ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle ||
        (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff))
   {
      tcpstat.tcps_persistdrop++;
      tp = tcp_drop(tp, ETIMEDOUT);    /* NOTE: tp may be NULL after this */
      goto out;
   }

   /* Re-arm the persist timer, then force a probe out even into a
    * zero window: TF_FORCE makes tcp_output() send at least one byte.
    */
   tcp_setpersist(tp);
   tp->t_flags |= TF_FORCE;
   tcp_output(tp);
   tp->t_flags &= ~TF_FORCE;

out:
#ifdef TCPDEBUG
   /* tp checked for NULL: the drop path above may have freed it. */
   if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
      tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
   return tp;
}
int tcp_output(struct tcpcb * tp) { struct socket * so = tp->t_inpcb->inp_socket; int len; long win; int off, flags, error; struct mbuf * m; struct tcpiphdr * ti; unsigned optlen = 0; int idle, sendalot; struct mbuf * sendm; /* mbuf which contains data to send */ struct mbuf * tcp_mbuf; /* mbuf containing TCP header */ int bufoff; /* offset of data in sendm->m_data */ #ifdef TCP_SACK int sack_resend; int sack_hole = 0; /* next sack hole to fill */ if(tp->t_flags & TF_SACKREPLY) { /* we are resending based on a received SACK header */ sack_resend = TRUE; tp->t_flags &= ~TF_SACKREPLY; /* clear flag */ } else sack_resend = FALSE; #endif /* TCP_SACK */ /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->snd_max == tp->snd_una); again: sendalot = 0; off = (int)(tp->snd_nxt - tp->snd_una); win = (long)tp->snd_wnd; /* set basic send window */ if (win > (long)tp->snd_cwnd) /* see if we need congestion control */ { win = (int)(tp->snd_cwnd & ~(ALIGN_TYPE-1)); /* keep data aligned */ } /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. 
*/ if (tp->t_force) { if (win == 0) win = 1; else { tp->t_timer[TCPT_PERSIST] = 0; tp->t_rxtshift = 0; } } #ifdef TCP_SACK /* See if we need to adjust the offset for a sack resend */ if(sack_resend) { off = (int)(tp->sack_hole_start[sack_hole] - tp->snd_una); /* if this hole's already been acked then punt and move to next hole */ if(off < 0) { /* clear out the acked hole */ tp->sack_hole_start[sack_hole] = tp->sack_hole_end[sack_hole] = 0; /* see if we're done with SACK hole list (2 tests) */ if(++sack_hole >= SACK_BLOCKS) return 0; if(tp->sack_hole_start[sack_hole] == tp->sack_hole_end[sack_hole]) return 0; goto again; } tp->snd_nxt = tp->sack_hole_start[sack_hole]; len = (int)(tp->sack_hole_end[sack_hole] - tp->sack_hole_start[sack_hole]); len = (int)MIN(len, (int)win); } else #endif /* TCP_SACK */ { /* set length of packets which are not sack resends */ len = (int)MIN(so->so_snd.sb_cc, (unsigned)win) - off; } flags = tcp_outflags[tp->t_state]; /* See if we need to build TCP options field. This test should be fast. */ #if (defined(TCP_TIMESTAMP) | defined(TCP_SACK)) if((flags & TH_SYN) || /* !!!??? (so->so_options & SO_TIMESTAMP) || */ (tp->t_flags & TF_SACKNOW) ) { optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so); } #else /* If other options not defined this build then don't bother to call bld_options() except * on SYN packets */ if(flags & TH_SYN) { optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so); } #endif if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be -1. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit and pull snd_nxt * back to (closed) window. We will enter persist * state below. If the window didn't close completely, * just wait for an ACK. 
*/ len = 0; if (win == 0) { tp->t_timer[TCPT_REXMT] = 0; tp->snd_nxt = tp->snd_una; } } if (len > (int)tp->t_maxseg) { len = tp->t_maxseg; sendalot = 1; } #ifdef IP_V4 #ifdef IP_PMTU { int pmtu = tp->t_inpcb->inp_pmtu - 40; if (len > pmtu) { len = pmtu - 40; sendalot = 1; } } #endif /* IP_PMTU */ /* We don't need a pmtu test for IPv6. V6 code limits t_maxseg to * the Path MTU, so the test above the v4 ifdef above covers us. */ #endif /* IP_V4 */ if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; win = (long)(sbspace(&so->so_rcv)); /* * If our state indicates that FIN should be sent * and we have not yet done so, or we're retransmitting the FIN, * then we need to send. */ if ((flags & TH_FIN) && (so->so_snd.sb_cc == 0) && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) { goto send; } /* * Send if we owe peer an ACK. */ if (tp->t_flags & TF_ACKNOW) goto send; if (flags & (TH_SYN|TH_RST)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * Sender silly window avoidance. If connection is idle * and can send all data, a maximum segment, * at least a maximum default-size segment do it, * or are forced, do it; otherwise don't bother. * If peer's buffer is tiny, then send * when window is at least half open. * If retransmitting (possibly after persist timer forced us * to send into a small window), then must resend. */ if (len) { if (len == (int)tp->t_maxseg) goto send; if ((idle || tp->t_flags & TF_NODELAY) && len + off >= (int)so->so_snd.sb_cc) { goto send; } if (tp->t_force) goto send; if (len >= (int)(tp->max_sndwnd / 2)) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments or at least 35% of the maximum possible * window, then want to send a window update to peer. 
*/ if (win > 0) { int adv = (int)win - (int)(tp->rcv_adv - tp->rcv_nxt); if (so->so_rcv.sb_cc == 0 && adv >= (int)(tp->t_maxseg * 2)) goto send; if (100 * (u_int)adv / so->so_rcv.sb_hiwat >= 35) goto send; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tp->t_timer[TCPT_PERSIST] * is set when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. * tp->t_timer[TCPT_REXMT] * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ return (0); send: ENTER_CRIT_SECTION(tp); /* Limit send length to the current buffer so as to * avoid doing the "mbuf shuffle" in m_copy(). 
*/ bufoff = off; sendm = so->so_snd.sb_mb; if (len) { /* find mbuf containing data to send (at "off") */ while (sendm) /* loop through socket send list */ { bufoff -= sendm->m_len; if (bufoff < 0) /* if off is in this buffer, break */ break; sendm = sendm->m_next; } if (!sendm) { dtrap(); /* shouldn't happen */ } bufoff += sendm->m_len; /* index to next data to send in msend */ /* if socket has multiple unsent mbufs, set flag for send to loop */ if ((sendm->m_next) && (len > (int)sendm->m_len)) { flags &= ~TH_FIN; /* don't FIN on segment prior to last */ sendalot = 1; /* set to send more segments */ } if((flags & TH_FIN) && (so->so_snd.sb_cc > (unsigned)len)) { /* This can happen on slow links (PPP) which retry the last * segment - the one with the FIN bit attached to data. */ flags &= ~TH_FIN; /* don't FIN on segment prior to last */ } /* only send the rest of msend */ len = min(len, (int)sendm->m_len); /* if we're not sending starting at sendm->m_data (in which * case bufoff != 0), then we will copy the data; else we would * write IP/TCP headers over sent but un-ack'ed data in sendm. * Similarly, if sendm->m_data is not aligned with respect to * sendm->m_base and ALIGN_TYPE, we will copy the data to * ensure that it (and the then-prepended IP/TCP headers) will * be aligned according to ALIGN_TYPE. */ if ((bufoff != 0) || /* data not front aligned in send mbuf? */ (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) != 0)) { len = min(len, (int)(sendm->m_len - bufoff)); /* limit len again */ /* One more test - if this data is not aligned with the front * of the m_data buffer then we can't use it in place, else we * might write the IP/TCP header over data that has not yet * been acked. In this case we must make sure our send * fits into a little buffer and send what we can. */ if ((len > (int)(lilbufsiz - HDRSLEN)) && /* length is bigger the small buffer? 
*/ (bigfreeq.q_len < 2)) /* and we are low on big buffers */ { len = lilbufsiz - HDRSLEN; } } } /* if send data is sufficiently aligned in packet, prepend TCP/IP header * in the space provided. */ if (len && (bufoff == 0) && (sendm->pkt->inuse == 1) && (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) == 0) && (optlen == 0)) { /* get an empty mbuf to "clone" the data */ m = m_getnbuf(MT_TXDATA, 0); if (!m) { EXIT_CRIT_SECTION(tp); return (ENOBUFS); } m->pkt = sendm->pkt; /* copy packet location in new mbuf */ m->pkt->inuse++; /* bump packet's use count */ m->m_base = sendm->m_base; /* clone mbuf members */ m->m_memsz = sendm->m_memsz; m->m_len = len + TCPIPHDRSZ; /* adjust clone for header */ m->m_data = sendm->m_data - TCPIPHDRSZ; } else /* either no data or data is not front aligned in mbuf */ { /* Grab a header mbuf, attaching a copy of data to be * transmitted, and initialize the header from * the template for sends on this connection. */ m = m_getwithdata (MT_HEADER, IFNETHDR_SIZE + TCPIPHDRSZ); if (m ==(struct mbuf *)NULL) { EXIT_CRIT_SECTION(tp); return ENOBUFS; } m->m_len = TCPIPHDRSZ; m->m_data += IFNETHDR_SIZE;/* Move this to sizeof tcpip hdr leave*/ /* 14 bytes for ethernet header */ if (len) /* attach any data to send */ { m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); if (m->m_next == 0) { m_freem(m); EXIT_CRIT_SECTION(tp); return ENOBUFS; } } } EXIT_CRIT_SECTION(tp); if (len) { if (tp->t_force && len == 1) tcpstat.tcps_sndprobe++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tcpstat.tcps_sndrexmitpack++; tcpstat.tcps_sndrexmitbyte += len; #ifdef TCP_SACK if(sack_resend) tcpstat.tcps_sackresend++; #endif } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; } } else if (tp->t_flags & TF_ACKNOW) { tcpstat.tcps_sndacks++; } else if (flags & (TH_SYN|TH_FIN|TH_RST)) tcpstat.tcps_sndctrl++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) tcpstat.tcps_sndurg++; else tcpstat.tcps_sndwinup++; ti = (struct tcpiphdr *)(m->m_data+sizeof(struct 
ip)-sizeof(struct ipovly)); if ((char *)ti < m->pkt->nb_buff) { panic("tcp_out- packet ptr underflow\n"); } tcp_mbuf = m; /* flag TCP header mbuf */ #ifdef IP_V6 /* Dual mode code */ if(so->so_domain == AF_INET6) { m = mbuf_prepend(m, sizeof(struct ipv6)); if(m == NULL) { /* this can happen when we run out of mbufs or pkt buffers * That is, mfreeq is empty or (lilfreeq, bigfreeq) are empty. * One solution is to find out which one is getting full and * then increase them. */ dtrap(); /* This is really rare... */ m_freem(tcp_mbuf); /* Free TCP/data chain */ return ENOBUFS; } /* strip overlay from front of TCP header */ tcp_mbuf->m_data += sizeof(struct ipovly); tcp_mbuf->m_len -= sizeof(struct ipovly); } #endif /* end IP_V6 */ if (tp->t_template == 0) panic("tcp_output"); MEMCPY((char*)ti, (char*)tp->t_template, sizeof(struct tcpiphdr)); /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) { tp->snd_nxt--; } ti->ti_seq = htonl(tp->snd_nxt); ti->ti_ack = htonl(tp->rcv_nxt); /* * If we're sending a SYN, check the IP address of the interface * that we will (likely) use to send the IP datagram -- if it's * changed from what is in the template (as it might if this is * a retransmission, and the original SYN caused PPP to start * bringing the interface up, and PPP has got a new IP address * via IPCP), update the template and the inpcb with the new * address. 
*/ if (flags & TH_SYN) { struct inpcb * inp; inp = (struct inpcb *)so->so_pcb; switch(so->so_domain) { #ifdef IP_V4 case AF_INET: { ip_addr src; #ifdef INCLUDE_PPP if(((flags & TH_ACK) == 0) && /* SYN only, not SYN/ACK */ (inp->ifp) && /* Make sure we have iface */ (inp->ifp->mib.ifType == PPP)) /* only PPP type */ { dtrap(); /* remove after confirmed to work in PPP */ src = ip_mymach(ti->ti_dst.s_addr); if (src != ti->ti_src.s_addr) { ti->ti_src.s_addr = src; tp->t_template->ti_src.s_addr = src; tp->t_inpcb->inp_laddr.s_addr = src; } } #endif /* INCLUDE_PPP */ /* If this is a SYN (not a SYN/ACK) then set the pmtu */ if((flags & TH_ACK) == 0) { #ifdef IP_PMTU inp->inp_pmtu = pmtucache_get(inp->inp_faddr.s_addr); #else /* not compiled for pathmtu, guess based on iface */ { NET ifp; /* find iface for route. Pass "src" as nexthop return */ ifp = iproute(ti->ti_dst.s_addr, &src); if(ifp) inp->inp_pmtu = ifp->n_mtu - (ifp->n_lnh + 40); else inp->inp_pmtu = 580; /* Ugh. */ } #endif /* IP_PMTU */ } break; } #endif /* IP_V4 */ #ifdef IP_V6 case AF_INET6: { struct ip6_inaddr * local; local = ip6_myaddr(&tp->t_inpcb->ip6_faddr, inp->ifp); /* If we got a local address & it's not the one in the pcb, then * we assume it changed at the iface and fix it in the pcb. Unlike * v4, we don't have an IP header yet, not do we have a template * to worry about. 
*/ if((local) && (!IP6EQ(&local->addr, &tp->t_inpcb->ip6_laddr))) { IP6CPY(&tp->t_inpcb->ip6_laddr, &local->addr); } /* If this is a SYN (not a SYN/ACK) then set the pmtu */ if((flags & TH_ACK) == 0) { inp->inp_pmtu = ip6_pmtulookup(&inp->ip6_laddr, inp->ifp); } break; } #endif /* IP_V6 */ default: dtrap(); /* bad domain setting */ } } /* fill in options if any are set */ if (optlen) { struct mbuf * mopt; mopt = m_getwithdata(MT_TXDATA, MAXOPTLEN); if (mopt == NULL) { m_freem(m); return (ENOBUFS); } /* insert options mbuf after after tmp_mbuf */ mopt->m_next = tcp_mbuf->m_next; tcp_mbuf->m_next = mopt; /* extend options to aligned address */ while(optlen & 0x03) tcp_optionbuf[optlen++] = TCPOPT_EOL; MEMCPY(mtod(mopt, char *), tcp_optionbuf, optlen); mopt->m_len = optlen; /* use portable macro to set tcp data offset bits */ SET_TH_OFF(ti->ti_t, ((sizeof (struct tcphdr) + optlen) >> 2)); } ti->ti_flags = (u_char)flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) win = 0; if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(tp->rcv_adv - tp->rcv_nxt); /* do check for Iniche buffer limits -JB- */ if (bigfreeq.q_len == 0) /* If queue length is 0, set window to 0 */ { win = 0; } else if(win > (((long)bigfreeq.q_len - 1) * (long)bigbufsiz)) { win = ((long)bigfreeq.q_len - 1) * bigbufsiz; } #ifdef TCP_WIN_SCALE if(tp->t_flags & TF_WINSCALE) { ti->ti_win = htons((u_short)(win >> tp->rcv_wind_scale)); /* apply scale */ }
/* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct inpcb * const inp = tp->t_inpcb; struct socket *so = inp->inp_socket; long len, recvwin, sendwin; int nsacked = 0; int off, flags, error; #ifdef TCP_SIGNATURE int sigoff = 0; #endif struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned int ipoptlen, optlen, hdrlen; int idle; boolean_t sendalot; struct ip6_hdr *ip6 = NULL; #ifdef INET6 const boolean_t isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #else const boolean_t isipv6 = FALSE; #endif KKASSERT(so->so_port == &curthread->td_msgport); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ /* * If we have been idle for a while, the send congestion window * could be no longer representative of the current state of the link. * So unless we are expecting more acks to come in, slow-start from * scratch to re-determine the send congestion window. */ if (tp->snd_max == tp->snd_una && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { if (tcp_do_rfc3390) { int initial_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); tp->snd_cwnd = min(tp->snd_cwnd, initial_cwnd); } else { tp->snd_cwnd = tp->t_maxseg; } tp->snd_wacked = 0; } /* * Calculate whether the transmit stream was previously idle * and adjust TF_LASTIDLE for the next time. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (tp->t_flags & TF_MORETOCOME)) tp->t_flags |= TF_LASTIDLE; else tp->t_flags &= ~TF_LASTIDLE; if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt); again: /* Make use of SACK information when slow-starting after a RTO. 
*/ if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) { tcp_seq old_snd_nxt = tp->snd_nxt; tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt); nsacked += tp->snd_nxt - old_snd_nxt; } sendalot = FALSE; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCE) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.ssb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_callout_stop(tp, tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. 
*/ len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off; /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data, suppress sending * segment (sending the segment would be an option if we still * did TAO and the remote host supported it). */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT) return 0; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if (flags & TH_SYN) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (sendwin == 0) { tcp_callout_stop(tp, tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_callout_active(tp, tp->tt_persist)) tcp_setpersist(tp); } } KASSERT(len >= 0, ("%s: len < 0", __func__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. 
send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. */ if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat && so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) && so->so_snd.ssb_cc < tcp_autosndbuf_max && sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) { u_long newsize; newsize = ulmin(so->so_snd.ssb_hiwat + tcp_autosndbuf_inc, tcp_autosndbuf_max); if (!ssb_reserve(&so->so_snd, newsize, so, NULL)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); if (newsize >= (TCP_MAXWIN << tp->snd_scale)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); } } /* * Truncate to the maximum segment length and ensure that FIN is * removed if the length no longer contains the last data byte. */ if (len > tp->t_maxseg) { len = tp->t_maxseg; sendalot = TRUE; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc)) flags &= ~TH_FIN; recvwin = ssb_space(&so->so_rcv); /* * Sender silly window avoidance. 
We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limiting the window size) * - we need to retransmit */ if (len) { if (len == tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.ssb_cc && !(tp->t_flags & TF_NOPUSH)) { goto send; } if (tp->t_flags & TF_FORCE) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. */ if (recvwin > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); long hiwat; /* * This ack case typically occurs when the user has drained * the TCP socket buffer sufficiently to warrent an ack * containing a 'pure window update'... that is, an ack that * ONLY updates the tcp window. * * It is unclear why we would need to do a pure window update * past 2 segments if we are going to do one at 1/2 the high * water mark anyway, especially since under normal conditions * the user program will drain the socket buffer quickly. 
* The 2-segment pure window update will often add a large * number of extra, unnecessary acks to the stream. * * avoid_pure_win_update now defaults to 1. */ if (avoid_pure_win_update == 0 || (tp->t_flags & TF_RXRESIZED)) { if (adv >= (long) (2 * tp->t_maxseg)) { goto send; } } hiwat = (long)(TCP_MAXWIN << tp->rcv_scale); if (hiwat > (long)so->so_rcv.ssb_hiwat) hiwat = (long)so->so_rcv.ssb_hiwat; if (adv >= hiwat / 2) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN))) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una)) goto send; /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_callout_active(tp, tp->tt_persist) * is true when we are in persist state. * The TF_FORCE flag in tp->t_flags * is set when we are called to send a persist packet. * tcp_callout_active(tp, tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. 
*/ if (so->so_snd.ssb_cc > 0 && !tcp_callout_active(tp, tp->tt_rexmt) && !tcp_callout_active(tp, tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ return (0); send: /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else hdrlen = sizeof(struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if (!(tp->t_flags & TF_NOOPT)) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(tp)); memcpy(opt + 2, &mss, sizeof mss); optlen = TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } if ((tcp_do_sack && !(flags & TH_ACK)) || tp->t_flags & TF_SACK_PERMITTED) { uint32_t *lp = (uint32_t *)(opt + optlen); *lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED); optlen += TCPOLEN_SACK_PERMITTED_ALIGNED; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. */ if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP && !(flags & TH_RST) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* Set receive buffer autosizing timestamp. 
*/ if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) tp->rfbuf_ts = ticks; /* * If this is a SACK connection and we have a block to report, * fill in the SACK blocks in the TCP options. */ if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) == TF_SACK_PERMITTED && (!LIST_EMPTY(&tp->t_segq) || tp->reportblk.rblk_start != tp->reportblk.rblk_end)) tcp_sack_fill_report(tp, opt, &optlen); #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { int i; u_char *bp; /* * Initialize TCP-MD5 option (RFC2385) */ bp = (u_char *)opt + optlen; *bp++ = TCPOPT_SIGNATURE; *bp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; for (i = 0; i < TCP_SIGLEN; i++) *bp++ = 0; optlen += TCPOLEN_SIGNATURE; /* * Terminate options list and maintain 32-bit alignment. */ *bp++ = TCPOPT_NOP; *bp++ = TCPOPT_EOL; optlen += 2; } #endif /* TCP_SIGNATURE */ KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options")); hdrlen += optlen; if (isipv6) { ipoptlen = ip6_optlen(inp); } else { if (inp->inp_options) { ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); } else {
/*
 * tcp_timer_persist() - persist-timer callout handler.
 *
 * Runs from the callout subsystem with the tcpcb passed as an opaque
 * argument.  Validates that the callout is still the active, expected
 * one, then either drops a dead/over-idle connection or forces out a
 * window probe via the stack's tcp_output method.
 *
 * Locking: takes the inpcb write lock for the duration.  On the two
 * drop paths, tcp_inpinfo_lock_del() is presumed to release the locks
 * (note those paths jump to `out:` PAST the INP_WUNLOCK below) -- the
 * normal path unlocks explicitly before falling into `out:`.
 */
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;	/* pre-probe t_state, for tcp_trace() */

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/*
	 * Bail if the callout was rescheduled or stopped while we were
	 * waiting for the lock: only a still-active, non-pending callout
	 * represents a genuine expiry.
	 */
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	/* Connection already torn down underneath us; nothing to probe. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		if (tcp_inpinfo_lock_add(inp)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		tp = tcp_drop(tp, ETIMEDOUT);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		if (tcp_inpinfo_lock_add(inp)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		tp = tcp_drop(tp, ETIMEDOUT);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * Re-arm the persist timer and force a probe segment out via the
	 * function-block's output method; TF_FORCEDATA makes it transmit
	 * even into a zero window.
	 */
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}
void tcp_timer_persist(void *arg) { struct tcpcb *tp = arg; uint32_t rto; #ifdef TCP_DEBUG struct socket *so = NULL; short ostate; #endif mutex_enter(softnet_lock); if ((tp->t_flags & TF_DEAD) != 0) { mutex_exit(softnet_lock); return; } if (!callout_expired(&tp->t_timer[TCPT_PERSIST])) { mutex_exit(softnet_lock); return; } KERNEL_LOCK(1, NULL); #ifdef TCP_DEBUG #ifdef INET if (tp->t_inpcb) so = tp->t_inpcb->inp_socket; #endif #ifdef INET6 if (tp->t_in6pcb) so = tp->t_in6pcb->in6p_socket; #endif ostate = tp->t_state; #endif /* TCP_DEBUG */ /* * Persistance timer into zero window. * Force a byte to be output, if possible. */ /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ rto = TCP_REXMTVAL(tp); if (rto < tp->t_rttmin) rto = tp->t_rttmin; if (tp->t_rxtshift == TCP_MAXRXTSHIFT && ((tcp_now - tp->t_rcvtime) >= tcp_maxpersistidle || (tcp_now - tp->t_rcvtime) >= rto * tcp_totbackoff)) { TCP_STATINC(TCP_STAT_PERSISTDROPS); tp = tcp_drop(tp, ETIMEDOUT); goto out; } TCP_STATINC(TCP_STAT_PERSISTTIMEO); tcp_setpersist(tp); tp->t_force = 1; (void) tcp_output(tp); tp->t_force = 0; out: #ifdef TCP_DEBUG if (tp && so->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, PRU_SLOWTIMO | (TCPT_PERSIST << 8)); #endif KERNEL_UNLOCK_ONE(NULL); mutex_exit(softnet_lock); }
/*
 * tcp_timer_persist() - persist-timer callout handler (tcbinfo-lock era).
 *
 * Fires when the persist timer expires against a zero peer window:
 * forces a window probe out via tcp_output(), or drops the connection
 * once probing has gone unanswered past the maximum backoff.  Runs under
 * the global tcbinfo write lock and the inpcb write lock; enters the
 * connection's vnet and restores it on every exit path.
 */
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	/* Remember the pre-handler state for the trace at the bottom. */
	ostate = tp->t_state;
#endif
	INP_INFO_WLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	/*
	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
	 * tear-down mean we need it as a work-around for races between
	 * timers and tcp_discardcb().
	 *
	 * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
	 */
	if (inp == NULL) {
		/* Lost the race with tcp_discardcb(); just count it. */
		tcp_timer_race++;
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	INP_WLOCK(inp);
	/*
	 * Bail out if the connection was dropped, or if the callout was
	 * rescheduled (pending) or stopped (inactive) since dispatch.
	 */
	if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_persist)
	    || !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		/* tcp_drop() may return NULL -- see the tp checks below. */
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	tcp_setpersist(tp);	/* re-arm the persist timer */
	tp->t_flags |= TF_FORCEDATA;	/* force a window probe out */
	(void) tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

out:
#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
/* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct inpcb * const inp = tp->t_inpcb; struct socket *so = inp->inp_socket; long len, recvwin, sendwin; int nsacked = 0; int off, flags, error = 0; #ifdef TCP_SIGNATURE int sigoff = 0; #endif struct mbuf *m; struct ip *ip; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned int ipoptlen, optlen, hdrlen; int idle; boolean_t sendalot; struct ip6_hdr *ip6; #ifdef INET6 const boolean_t isipv6 = INP_ISIPV6(inp); #else const boolean_t isipv6 = FALSE; #endif boolean_t can_tso = FALSE, use_tso; boolean_t report_sack, idle_cwv = FALSE; u_int segsz, tso_hlen, tso_lenmax = 0; int segcnt = 0; boolean_t need_sched = FALSE; KKASSERT(so->so_port == &curthread->td_msgport); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ /* * If we have been idle for a while, the send congestion window * could be no longer representative of the current state of the * link; need to validate congestion window. However, we should * not perform congestion window validation here, since we could * be asked to send pure ACK. */ if (tp->snd_max == tp->snd_una && (ticks - tp->snd_last) >= tp->t_rxtcur && tcp_idle_restart) idle_cwv = TRUE; /* * Calculate whether the transmit stream was previously idle * and adjust TF_LASTIDLE for the next time. 
*/ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (tp->t_flags & TF_MORETOCOME)) tp->t_flags |= TF_LASTIDLE; else tp->t_flags &= ~TF_LASTIDLE; if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt); /* * Find out whether TSO could be used or not * * For TSO capable devices, the following assumptions apply to * the processing of TCP flags: * - If FIN is set on the large TCP segment, the device must set * FIN on the last segment that it creates from the large TCP * segment. * - If PUSH is set on the large TCP segment, the device must set * PUSH on the last segment that it creates from the large TCP * segment. */ #if !defined(IPSEC) && !defined(FAST_IPSEC) if (tcp_do_tso #ifdef TCP_SIGNATURE && (tp->t_flags & TF_SIGNATURE) == 0 #endif ) { if (!isipv6) { struct rtentry *rt = inp->inp_route.ro_rt; if (rt != NULL && (rt->rt_flags & RTF_UP) && (rt->rt_ifp->if_hwassist & CSUM_TSO)) { can_tso = TRUE; tso_lenmax = rt->rt_ifp->if_tsolen; } } } #endif /* !IPSEC && !FAST_IPSEC */ again: m = NULL; ip = NULL; th = NULL; ip6 = NULL; if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) == TF_SACK_PERMITTED && (!TAILQ_EMPTY(&tp->t_segq) || tp->reportblk.rblk_start != tp->reportblk.rblk_end)) report_sack = TRUE; else report_sack = FALSE; /* Make use of SACK information when slow-starting after a RTO. */ if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) { tcp_seq old_snd_nxt = tp->snd_nxt; tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt); nsacked += tp->snd_nxt - old_snd_nxt; } sendalot = FALSE; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. 
*/ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCE) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.ssb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_callout_stop(tp, tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * A negative length can also occur when we are in the * TCPS_SYN_RECEIVED state due to a simultanious connect where * our SYN has not been acked yet. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. */ len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off; /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data, suppress sending * segment (sending the segment would be an option if we still * did TAO and the remote host supported it). 
*/ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT) { tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW); return 0; } } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if (flags & TH_SYN) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * A negative len can occur if our FIN has been sent but not * acked, or if we are in a simultanious connect in the * TCPS_SYN_RECEIVED state with our SYN sent but not yet * acked. * * If our window has contracted to 0 in the FIN case * (which can only occur if we have NOT been called to * retransmit as per code a few paragraphs up) then we * want to shift the retransmit timer over to the * persist timer. * * However, if we are in the TCPS_SYN_RECEIVED state * (the SYN case) we will be in a simultanious connect and * the window may be zero degeneratively. In this case we * do not want to shift to the persist timer after the SYN * or the SYN+ACK transmission. */ len = 0; if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) { tcp_callout_stop(tp, tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_callout_active(tp, tp->tt_persist)) tcp_setpersist(tp); } } KASSERT(len >= 0, ("%s: len < 0", __func__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. 
hiwat has not significantly exceeded bwnd (inflight) * (bwnd is a maximal value if inflight is disabled). * 3. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 4. hiwat has not hit maximal automatic size; * 5. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * The criteria for shrinking the buffer is based solely on * the inflight code (snd_bwnd). If inflight is disabled, * the buffer will not be shrinked. Note that snd_bwnd already * has a fudge factor. Our test adds a little hysteresis. 
*/ if (tcp_do_autosndbuf && (so->so_snd.ssb_flags & SSB_AUTOSIZE)) { const int asbinc = tcp_autosndbuf_inc; const int hiwat = so->so_snd.ssb_hiwat; const int lowat = so->so_snd.ssb_lowat; u_long newsize; if ((tp->snd_wnd / 4 * 5) >= hiwat && so->so_snd.ssb_cc >= (hiwat / 8 * 7) && hiwat < tp->snd_bwnd + hiwat / 10 && hiwat + asbinc < tcp_autosndbuf_max && hiwat < (TCP_MAXWIN << tp->snd_scale) && sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) { newsize = ulmin(hiwat + asbinc, tcp_autosndbuf_max); if (!ssb_reserve(&so->so_snd, newsize, so, NULL)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); #if 0 if (newsize >= (TCP_MAXWIN << tp->snd_scale)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); #endif } else if ((long)tp->snd_bwnd < (long)(hiwat * 3 / 4 - lowat - asbinc) && hiwat > tp->t_maxseg * 2 + asbinc && hiwat + asbinc >= tcp_autosndbuf_min && tcp_do_autosndbuf == 1) { newsize = ulmax(hiwat - asbinc, tp->t_maxseg * 2); ssb_reserve(&so->so_snd, newsize, so, NULL); } } /* * Don't use TSO, if: * - Congestion window needs validation * - There are SACK blocks to report * - RST or SYN flags is set * - URG will be set * * XXX * Checking for SYN|RST looks overkill, just to be safe than sorry */ use_tso = can_tso; if (report_sack || idle_cwv || (flags & (TH_RST | TH_SYN))) use_tso = FALSE; if (use_tso) { tcp_seq ugr_nxt = tp->snd_nxt; if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && tp->snd_nxt == tp->snd_max) --ugr_nxt; if (SEQ_GT(tp->snd_up, ugr_nxt)) use_tso = FALSE; } if (use_tso) { /* * Find out segment size and header length for TSO */ error = tcp_tso_getsize(tp, &segsz, &tso_hlen); if (error) use_tso = FALSE; } if (!use_tso) { segsz = tp->t_maxseg; tso_hlen = 0; /* not used */ } /* * Truncate to the maximum segment length if not TSO, and ensure that * FIN is removed if the length no longer contains the last data byte. 
*/ if (len > segsz) { if (!use_tso) { len = segsz; ++segcnt; } else { int nsegs; if (__predict_false(tso_lenmax < segsz)) tso_lenmax = segsz << 1; /* * Truncate TSO transfers to (IP_MAXPACKET - iphlen - * thoff), and make sure that we send equal size * transfers down the stack (rather than big-small- * big-small-...). */ len = min(len, tso_lenmax); nsegs = min(len, (IP_MAXPACKET - tso_hlen)) / segsz; KKASSERT(nsegs > 0); len = nsegs * segsz; if (len <= segsz) { use_tso = FALSE; ++segcnt; } else { segcnt += nsegs; } } sendalot = TRUE; } else { use_tso = FALSE; if (len > 0) ++segcnt; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc)) flags &= ~TH_FIN; recvwin = ssb_space(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limiting the window size) * - we need to retransmit */ if (len) { if (len >= segsz) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.ssb_cc && !(tp->t_flags & TF_NOPUSH)) { goto send; } if (tp->t_flags & TF_FORCE) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (tp->t_flags & TF_XMITNOW) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). 
If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. */ if (recvwin > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); long hiwat; /* * This ack case typically occurs when the user has drained * the TCP socket buffer sufficiently to warrent an ack * containing a 'pure window update'... that is, an ack that * ONLY updates the tcp window. * * It is unclear why we would need to do a pure window update * past 2 segments if we are going to do one at 1/2 the high * water mark anyway, especially since under normal conditions * the user program will drain the socket buffer quickly. * The 2-segment pure window update will often add a large * number of extra, unnecessary acks to the stream. * * avoid_pure_win_update now defaults to 1. */ if (avoid_pure_win_update == 0 || (tp->t_flags & TF_RXRESIZED)) { if (adv >= (long) (2 * segsz)) { goto send; } } hiwat = (long)(TCP_MAXWIN << tp->rcv_scale); if (hiwat > (long)so->so_rcv.ssb_hiwat) hiwat = (long)so->so_rcv.ssb_hiwat; if (adv >= hiwat / 2) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN))) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if ((flags & TH_FIN) && (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una)) goto send; /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. 
The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_callout_active(tp, tp->tt_persist) * is true when we are in persist state. * The TF_FORCE flag in tp->t_flags * is set when we are called to send a persist packet. * tcp_callout_active(tp, tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, otherwise force out * a byte. * * Don't try to set the persist state if we are in TCPS_SYN_RECEIVED * with data pending. This situation can occur during a * simultanious connect. */ if (so->so_snd.ssb_cc > 0 && tp->t_state != TCPS_SYN_RECEIVED && !tcp_callout_active(tp, tp->tt_rexmt) && !tcp_callout_active(tp, tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ tp->t_flags &= ~TF_XMITNOW; return (0); send: if (need_sched && len > 0) { tcp_output_sched(tp); return 0; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. 
* max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else hdrlen = sizeof(struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if (!(tp->t_flags & TF_NOOPT)) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(tp)); memcpy(opt + 2, &mss, sizeof mss); optlen = TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } if ((tcp_do_sack && !(flags & TH_ACK)) || tp->t_flags & TF_SACK_PERMITTED) { uint32_t *lp = (uint32_t *)(opt + optlen); *lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED); optlen += TCPOLEN_SACK_PERMITTED_ALIGNED; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. */ if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP && !(flags & TH_RST) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) tp->rfbuf_ts = ticks; /* * If this is a SACK connection and we have a block to report, * fill in the SACK blocks in the TCP options. 
*/ if (report_sack) tcp_sack_fill_report(tp, opt, &optlen); #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { int i; u_char *bp; /* * Initialize TCP-MD5 option (RFC2385) */ bp = (u_char *)opt + optlen; *bp++ = TCPOPT_SIGNATURE; *bp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; for (i = 0; i < TCP_SIGLEN; i++) *bp++ = 0; optlen += TCPOLEN_SIGNATURE; /* * Terminate options list and maintain 32-bit alignment. */ *bp++ = TCPOPT_NOP; *bp++ = TCPOPT_EOL; optlen += 2; } #endif /* TCP_SIGNATURE */ KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options")); hdrlen += optlen; if (isipv6) { ipoptlen = ip6_optlen(inp); } else { if (inp->inp_options) { ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); } else {
//-------------------------------------------------------------------------// // TCP timer processing. //-------------------------------------------------------------------------// static struct tcpcb * tcp_timers( Node *node, struct tcpcb *tp, int timer, UInt32 tcp_now, struct tcpstat *tcp_stat) { int rexmt; TransportDataTcp *tcpLayer = (TransportDataTcp *) node->transportData.tcp; switch (timer) { // // 2 MSL timeout in shutdown went off. If we're closed but // still waiting for peer to close and connection has been idle // too long, or if 2MSL time is up from TIME_WAIT, delete connection // control block. Otherwise, check again in a bit. // case TCPT_2MSL: if (tp->t_state != TCPS_TIME_WAIT && tp->t_idle <= TCPTV_MAXIDLE) tp->t_timer[TCPT_2MSL] = TCPTV_KEEPINTVL; else { // printf("TCP: Connection closed by timer\n"); tp = tcp_close(node, tp, tcp_stat); } break; // // Retransmission timer went off. Message has not // been acked within retransmit interval. Back off // to a longer retransmit interval and retransmit one segment. // case TCPT_REXMT: if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; //if (tcp_stat) //tcp_stat->tcps_timeoutdrop++; printf("TCP: Retransmission timer went off\n"); tp = tcp_drop(node, tp, tcp_now, tcp_stat); break; } //if (tcp_stat) //tcp_stat->tcps_rexmttimeo++; rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; // // If we backed off this far, // our srtt estimate is probably bogus. Clobber it // so we'll take the next rtt measurement as our srtt; // move the current srtt into rttvar to keep the current // retransmit times until then. 
// if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { tp->t_rttvar += (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; if (TCP_VARIANT_IS_SACK(tp) && tp->isSackFastRextOn) { TransportTcpSackRextTimeoutInit(tp); TransportTcpTrace(node, 0, 0, "Faxt: timeout"); } // Force a segment to be sent. tp->t_flags |= TF_ACKNOW; // If timing a segment in this window, stop the timer. // The retransmitted segment shouldn't be timed. tp->t_rtt = 0; // // Close the congestion window down to one segment // (we'll open it by one segment for each ack we get). // Since we probably have a window's worth of unacked // data accumulated, this "slow start" keeps us from // dumping all that data as back-to-back packets (which // might overwhelm an intermediate gateway). // // There are two phases to the opening: Initially we // open by one mss on each ack. This makes the window // size increase exponentially with time. If the // window is larger than the path can handle, this // exponential growth results in dropped packet(s) // almost immediately. To get more time between // drops but still "push" the network to take advantage // of improving conditions, we switch from exponential // to linear window opening at some threshhold size. // For a threshhold, we use half the current window // size, truncated to a multiple of the mss. // // (the minimum cwnd that will give us exponential // growth is 2 mss. We don't allow the threshhold // to go below this.) // { unsigned int win; win = MIN(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_cwnd = tp->t_maxseg; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_partialacks = -1; tp->t_dupacks = 0; } tp->t_ecnFlags |= TF_CWND_REDUCED; TransportTcpTrace(node, 0, 0, "Rext: timeout"); // // To eliminates the problem of multiple Fast Retransmits we uses this // new variable "send_high", whose initial value is the initial send // sequence number. 
After each retransmit timeout, the highest sequence // numbers transmitted so far is recorded in the variable "send_high". // if (TCP_VARIANT_IS_NEWRENO(tp)) { tp->send_high = tp->snd_max; } tcp_output(node, tp, tcp_now, tcp_stat); break; // // Persistance timer into zero window. // Force a byte to be output, if possible. // case TCPT_PERSIST: //if (tcp_stat) //tcp_stat->tcps_persisttimeo++; // // Hack: if the peer is dead/unreachable, we do not // time out if the window is closed. After a full // backoff, drop the connection if the idle time // (no responses to probes) reaches the maximum // backoff that we would use if retransmitting. // if (tp->t_rxtshift == TCP_MAXRXTSHIFT) { UInt32 maxidle = TCP_REXMTVAL(tp); if (maxidle < tp->t_rttmin) maxidle = tp->t_rttmin; maxidle *= tcp_totbackoff; if (tp->t_idle >= TCPTV_KEEP_IDLE || tp->t_idle >= maxidle) { //if (tcp_stat) //tcp_stat->tcps_persistdrop++; printf("TCP: Idle timer went off\n"); tp = tcp_drop(node, tp, tcp_now, tcp_stat); break; } } tcp_setpersist(tp); tp->t_force = 1; tcp_output(node, tp, tcp_now, tcp_stat); tp->t_force = 0; break; // // Keep-alive timer went off; send something // or drop connection if idle for too long. // case TCPT_KEEP: //if (tcp_stat) //tcp_stat->tcps_keeptimeo++; if (tp->t_state < TCPS_ESTABLISHED) printf("TCP: Keep-alive timer went off before established\n"); goto dropit; if (tcpLayer->tcpUseKeepAliveProbes && tp->t_state <= TCPS_CLOSING) { // // If the connection has been idle for more than the sum of // TCPTV_KEEP_IDLE (set to 2 hours) and TCPTV_MAXIDLE // (set to the total time taken to send all the probes), // it's time to drop the connection. // if (tp->t_idle >= TCPTV_KEEP_IDLE + TCPTV_MAXIDLE) printf("TCP: Keep-alive timer went off\n"); goto dropit; // // Send a packet designed to force a response // if the peer is up and reachable: // either an ACK if the connection is still alive, // or an RST if the peer has closed the connection // due to timeout or reboot. 
// Using sequence number tp->snd_una-1 // causes the transmitted zero-length segment // to lie outside the receive window; // by the protocol spec, this requires the // correspondent TCP to respond. // //if (tcp_stat) //tcp_stat->tcps_keepprobe++; tcp_respond(node, tp, tp->t_template, 0, tp->rcv_nxt, tp->snd_una - 1, 0, tcp_stat); tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL; } else { // // If the tcpUseKeepAliveProbes is FALSE // or the connection state is greater than TCPS_CLOSING, // reset the keepalive timer to TCPTV_KEEP_IDLE. // tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE; } break; dropit: //if (tcp_stat) { // // Note that this counter counts connection drops due to // failure in connection establishment and the keepalive // timer timeouts // //tcp_stat->tcps_keepdrops++; //} // printf("TCP: Unknown timer went off\n"); tp = tcp_drop(node, tp, tcp_now, tcp_stat); break; }
/* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error; #ifdef TCP_SIGNATURE int sigoff = 0; #endif struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; int idle, sendalot; int i, sack_rxmit; int sack_bytes_rxmt; struct sackhole *p; #if 0 int maxburst = TCP_MAXBURST; #endif struct rmxp_tao tao; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; bzero(&tao, sizeof(tao)); isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif #ifdef TCP_ECN int needect; #endif INP_LOCK_ASSERT(tp->t_inpcb); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. * * Set the slow-start flight size depending on whether * this is a local network or not. */ int ss = ss_fltsz; #ifdef INET6 if (isipv6) { if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) ss = ss_fltsz_local; } else #endif if (in_localaddr(tp->t_inpcb->inp_faddr)) ss = ss_fltsz_local; tp->snd_cwnd = tp->t_maxseg * ss; } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. 
*/ if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if (tp->sack_enable && IN_FASTRECOVERY(tp) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; tcpstat.tcps_sack_rexmits++; tcpstat.tcps_sack_rexmit_bytes += min(len, tp->t_maxseg); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. 
*/ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_force) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; sendwin = 1; } else { callout_stop(tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); else { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! 
* We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = lmin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (tcp_do_rfc1644) tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao); if (len > 0 && tp->t_state == TCPS_SYN_SENT && tao.tao_ccsent == 0) goto just_return; } /* * Be careful not to send data and/or FIN on SYN segments * in cases when no CC option will be sent. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (sendwin == 0) { callout_stop(tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!callout_active(tp->tt_persist)) tcp_setpersist(tp); } } /* * len will be >= 0 after this point. Truncate to the maximum * segment length and ensure that FIN is removed if the length * no longer contains the last data byte. 
*/ if (len > tp->t_maxseg) { len = tp->t_maxseg; sendalot = 1; } if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len == tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_force) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. * Skip this if the connection is in T/TCP half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. 
*/ long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); if (adv >= (long) (2 * tp->t_maxseg)) goto send; if (2 * adv >= (long) so->so_rcv.sb_hiwat) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) && !callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) { callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * callout_active(tp->tt_persist) * is true when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. * callout_active(tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. 
*/ if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && !callout_active(tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if ((tp->t_flags & TF_NOOPT) == 0) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc)); (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; /* * If this is the first SYN of connection (not a SYN * ACK), include SACK_PERMIT_HDR option. If this is a * SYN ACK, include SACK_PERMIT_HDR option if peer has * already done so. This is only for active connect, * since the syncache takes care of the passive connect. */ if (tp->sack_enable && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_SACK_PERMIT))) { *((u_int32_t *) (opt + optlen)) = htonl(TCPOPT_SACK_PERMIT_HDR); optlen += 4; } if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. 
*/ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (flags & TH_RST) == 0 && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* * Send SACKs if necessary. This should be the last option processed. * Only as many SACKs are sent as are permitted by the maximum options * size. No more than three SACKs are sent. */ if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && tp->rcv_numsacks) { u_int32_t *lp = (u_int32_t *)(opt + optlen); u_int32_t *olp = lp++; int count = 0; /* actual number of SACKs inserted */ int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK; tcpstat.tcps_sack_send_blocks++; maxsack = min(maxsack, TCP_MAX_SACK); for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) { struct sackblk sack = tp->sackblks[i]; if (sack.start == 0 && sack.end == 0) continue; *lp++ = htonl(sack.start); *lp++ = htonl(sack.end); count++; } *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ } /* * Send `CC-family' options if our side wants to use them (TF_REQ_CC), * options are allowed (!TF_NOOPT) and it's not a RST. */ if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && (flags & TH_RST) == 0) { switch (flags & (TH_SYN|TH_ACK)) { /* * This is a normal ACK, send CC if we received CC before * from our peer. */ case TH_ACK: if (!(tp->t_flags & TF_RCVD_CC)) break; /*FALLTHROUGH*/ /* * We can only get here in T/TCP's SYN_SENT* state, when * we're a sending a non-SYN segment without waiting for * the ACK of our SYN. A check above assures that we only * do this if our peer understands T/TCP. 
*/ case 0: opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; break; /* * This is our initial SYN, check whether we have to use * CC or CC.new. */ case TH_SYN: opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? TCPOPT_CCNEW : TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; break; /* * This is a SYN,ACK; send CC and CC.echo if we received * CC from our peer. */ case (TH_SYN|TH_ACK): if (tp->t_flags & TF_RCVD_CC) { opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CC; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); optlen += 4; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_NOP; opt[optlen++] = TCPOPT_CCECHO; opt[optlen++] = TCPOLEN_CC; *(u_int32_t *)&opt[optlen] = htonl(tp->cc_recv); optlen += 4; } break; } } #ifdef TCP_SIGNATURE #ifdef INET6 if (!isipv6) #endif if (tp->t_flags & TF_SIGNATURE) { int i; u_char *bp; /* Initialize TCP-MD5 option (RFC2385) */ bp = (u_char *)opt + optlen; *bp++ = TCPOPT_SIGNATURE; *bp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; for (i = 0; i < TCP_SIGLEN; i++) *bp++ = 0; optlen += TCPOLEN_SIGNATURE; /* Terminate options list and maintain 32-bit alignment. */ *bp++ = TCPOPT_NOP; *bp++ = TCPOPT_EOL; optlen += 2; } #endif /* TCP_SIGNATURE */ hdrlen += optlen; #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else
/*
 * Tcp output routine: figure out what should be sent and send it.
 *
 * Decides whether a segment should be transmitted (data ready, window
 * update owed, ACK/SYN/RST/FIN pending, persist probe forced) and, if so,
 * builds the segment from the send buffer plus TCP options and hands it
 * to the network layer.  Jumps back to `again' (via sendalot) when more
 * than one segment's worth of data is pending.
 *
 * Returns 0 on "nothing to send"; error paths (mbuf exhaustion) bail to
 * `out' with a nonzero error.  NOTE(review): this chunk is truncated at
 * the trailing `else' -- the remainder of the function is not visible here.
 */
int
tcp_output(PNATState pData, register struct tcpcb *tp)
{
    register struct socket *so = tp->t_socket;   /* owning socket: send/recv buffers */
    register long len, win;                      /* bytes to send; usable send window */
    int off, flags, error;                       /* offset into snd buf; TH_* flags */
    register struct mbuf *m = NULL;              /* outgoing segment */
    register struct tcpiphdr *ti;
    u_char opt[MAX_TCPOPTLEN];                   /* assembled TCP options */
    unsigned optlen, hdrlen;
    int idle, sendalot;
    int size = 0;                                /* mbuf cluster size to allocate */

    LogFlowFunc(("ENTER: tcp_output: tp = %R[tcpcb793]\n", tp));

    /*
     * Determine length of data that should be transmitted,
     * and flags that will be used.
     * If there is some data or critical controls (SYN, RST)
     * to send, then transmit; otherwise, investigate further.
     */
    idle = (tp->snd_max == tp->snd_una);   /* nothing in flight */
    if (idle && tp->t_idle >= tp->t_rxtcur)
        /*
         * We have been idle for "a while" and no acks are
         * expected to clock out any data we send --
         * slow start to get ack "clock" running again.
         */
        tp->snd_cwnd = tp->t_maxseg;

again:
    sendalot = 0;
    off = tp->snd_nxt - tp->snd_una;        /* offset of first unsent byte */
    win = min(tp->snd_wnd, tp->snd_cwnd);   /* send window limited by congestion window */

    flags = tcp_outflags[tp->t_state];      /* base TH_* flags for this TCP state */

    Log2((" --- tcp_output flags = 0x%x\n", flags));

    /*
     * If in persist timeout with window of 0, send 1 byte.
     * Otherwise, if window is small but nonzero
     * and timer expired, we will send what we can
     * and go to transmit state.
     */
    if (tp->t_force)
    {
        if (win == 0)
        {
            /*
             * If we still have some data to send, then
             * clear the FIN bit.  Usually this would
             * happen below when it realizes that we
             * aren't sending all the data.  However,
             * if we have exactly 1 byte of unsent data,
             * then it won't clear the FIN bit below,
             * and if we are in persist state, we wind
             * up sending the packet without recording
             * that we sent the FIN bit.
             *
             * We can't just blindly clear the FIN bit,
             * because if we don't have any more data
             * to send then the probe will be the FIN
             * itself.
             */
            if (off < SBUF_LEN(&so->so_snd))
                flags &= ~TH_FIN;
            win = 1;   /* zero-window probe: force exactly one byte out */
        }
        else
        {
            /* Window reopened: leave persist state, reset backoff. */
            tp->t_timer[TCPT_PERSIST] = 0;
            tp->t_rxtshift = 0;
        }
    }

    len = min(SBUF_LEN(&so->so_snd), win) - off;

    if (len < 0)
    {
        /*
         * If FIN has been sent but not acked,
         * but we haven't been called to retransmit,
         * len will be -1.  Otherwise, window shrank
         * after we sent into it.  If window shrank to 0,
         * cancel pending retransmit and pull snd_nxt
         * back to (closed) window.  We will enter persist
         * state below.  If the window didn't close completely,
         * just wait for an ACK.
         */
        len = 0;
        if (win == 0)
        {
            tp->t_timer[TCPT_REXMT] = 0;
            tp->snd_nxt = tp->snd_una;
        }
    }

    /* Clamp to one segment; remember that more remains to be sent. */
    if (len > tp->t_maxseg)
    {
        len = tp->t_maxseg;
        sendalot = 1;
    }

    /* Not sending up to the last byte buffered => this segment carries no FIN. */
    if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + SBUF_LEN(&so->so_snd)))
        flags &= ~TH_FIN;

    win = sbspace(&so->so_rcv);   /* reuse `win' as our advertisable receive space */

    /*
     * Sender silly window avoidance.  If connection is idle
     * and can send all data, a maximum segment,
     * at least a maximum default-size segment do it,
     * or are forced, do it; otherwise don't bother.
     * If peer's buffer is tiny, then send
     * when window is at least half open.
     * If retransmitting (possibly after persist timer forced us
     * to send into a small window), then must resend.
     */
    if (len)
    {
        if (len == tp->t_maxseg)
            goto send;
        /*
         * NOTE(review): the leading `1 ||' short-circuits the idle/NODELAY
         * test, i.e. Nagle is effectively disabled here -- presumably
         * deliberate for this NAT proxy, but worth confirming.
         */
        if ((1 || idle || tp->t_flags & TF_NODELAY) &&
            len + off >= SBUF_LEN(&so->so_snd))
            goto send;
        if (tp->t_force)                     /* persist probe must go out */
            goto send;
        if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
            goto send;
        if (SEQ_LT(tp->snd_nxt, tp->snd_max))   /* retransmission */
            goto send;
    }

    /*
     * Compare available window to amount of window
     * known to peer (as advertised window less
     * next expected input).  If the difference is at least two
     * max size segments, or at least 50% of the maximum possible
     * window, then want to send a window update to peer.
     */
    if (win > 0)
    {
        /*
         * "adv" is the amount we can increase the window,
         * taking into account that we are limited by
         * TCP_MAXWIN << tp->rcv_scale.
         */
        long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale);
        if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
            adv -= tp->rcv_adv - tp->rcv_nxt;

        if (adv >= (long) (2 * tp->t_maxseg))
            goto send;
        if (2 * adv >= (long) SBUF_SIZE(&so->so_rcv))
            goto send;
    }

    /*
     * Send if we owe peer an ACK.
     */
    if (tp->t_flags & TF_ACKNOW)
        goto send;
    if (flags & (TH_SYN|TH_RST))
        goto send;
    if (SEQ_GT(tp->snd_up, tp->snd_una))   /* urgent data pending */
        goto send;
    /*
     * If our state indicates that FIN should be sent
     * and we have not yet done so, or we're retransmitting the FIN,
     * then we need to send.
     */
    if (   flags & TH_FIN
        && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
        goto send;

    /*
     * TCP window updates are not reliable, rather a polling protocol
     * using ``persist'' packets is used to insure receipt of window
     * updates.  The three ``states'' for the output side are:
     *      idle                    not doing retransmits or persists
     *      persisting              to move a small or zero window
     *      (re)transmitting        and thereby not persisting
     *
     * tp->t_timer[TCPT_PERSIST]
     *      is set when we are in persist state.
     * tp->t_force
     *      is set when we are called to send a persist packet.
     * tp->t_timer[TCPT_REXMT]
     *      is set when we are retransmitting
     * The output side is idle when both timers are zero.
     *
     * If send window is too small, there is data to transmit, and no
     * retransmit or persist is pending, then go to persist state.
     * If nothing happens soon, send when timer expires:
     * if window is nonzero, transmit what we can,
     * otherwise force out a byte.
     */
    if (   SBUF_LEN(&so->so_snd)
        && tp->t_timer[TCPT_REXMT] == 0
        && tp->t_timer[TCPT_PERSIST] == 0)
    {
        tp->t_rxtshift = 0;
        tcp_setpersist(tp);
    }

    /*
     * No reason to send a segment, just return.
     */
    tcpstat.tcps_didnuttin++;

    LogFlowFuncLeave();
    return (0);

send:
    LogFlowFunc(("send\n"));
    /*
     * Before ESTABLISHED, force sending of initial options
     * unless TCP set not to do any options.
     * NOTE: we assume that the IP/TCP header plus TCP options
     * always fit in a single mbuf, leaving room for a maximum
     * link header, i.e.
     *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
     */
    optlen = 0;
    hdrlen = sizeof (struct tcpiphdr);
    if (flags & TH_SYN)
    {
        tp->snd_nxt = tp->iss;   /* SYN carries the initial send sequence */
        if ((tp->t_flags & TF_NOOPT) == 0)
        {
            u_int16_t mss;

            /* MSS option: kind(2), len(4), 16-bit MSS in network order. */
            opt[0] = TCPOPT_MAXSEG;
            opt[1] = 4;
            mss = RT_H2N_U16((u_int16_t) tcp_mss(pData, tp, 0));
            memcpy((caddr_t)(opt + 2), (caddr_t)&mss, sizeof(mss));
            optlen = 4;

#if 0
            /* Window-scale option -- disabled in this build. */
            if (   (tp->t_flags & TF_REQ_SCALE)
                && (   (flags & TH_ACK) == 0
                    || (tp->t_flags & TF_RCVD_SCALE)))
            {
                *((u_int32_t *) (opt + optlen)) = RT_H2N_U32(  TCPOPT_NOP << 24
                                                             | TCPOPT_WINDOW << 16
                                                             | TCPOLEN_WINDOW << 8
                                                             | tp->request_r_scale);
                optlen += 4;
            }
#endif
        }
    }

    /*
     * Send a timestamp and echo-reply if this is a SYN and our side
     * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
     * and our peer have sent timestamps in our SYN's.
     */
#if 0
    /* RFC 1323 timestamp option -- disabled in this build. */
    if (   (tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP
        && (flags & TH_RST) == 0
        && (   (flags & (TH_SYN|TH_ACK)) == TH_SYN
            || (tp->t_flags & TF_RCVD_TSTMP)))
    {
        u_int32_t *lp = (u_int32_t *)(opt + optlen);

        /* Form timestamp option as shown in appendix A of RFC 1323. */
        *lp++ = RT_H2N_U32_C(TCPOPT_TSTAMP_HDR);
        *lp++ = RT_H2N_U32(tcp_now);
        *lp = RT_H2N_U32(tp->ts_recent);
        optlen += TCPOLEN_TSTAMP_APPA;
    }
#endif
    hdrlen += optlen;

    /*
     * Adjust data length if insertion of options will
     * bump the packet length beyond the t_maxseg length.
     */
    if (len > tp->t_maxseg - optlen)
    {
        len = tp->t_maxseg - optlen;
        sendalot = 1;
    }

    /*
     * Grab a header mbuf, attaching a copy of data to
     * be transmitted, and initialize the header from
     * the template for sends on this connection.
     */
    if (len)
    {
        /* Statistics: classify as probe, retransmit, or normal send. */
        if (tp->t_force && len == 1)
            tcpstat.tcps_sndprobe++;
        else if (SEQ_LT(tp->snd_nxt, tp->snd_max))
        {
            tcpstat.tcps_sndrexmitpack++;
            tcpstat.tcps_sndrexmitbyte += len;
        }
        else
        {
            tcpstat.tcps_sndpack++;
            tcpstat.tcps_sndbyte += len;
        }

        /*
         * Pick the smallest cluster size that fits payload + headers.
         * NOTE(review): the first branch tests against MSIZE but assigns
         * MCLBYTES, making the first two branches redundant with the
         * default -- looks like MSIZE was intended; confirm upstream.
         */
        size = MCLBYTES;
        if ((len + hdrlen + ETH_HLEN) < MSIZE)
            size = MCLBYTES;
        else if ((len + hdrlen + ETH_HLEN) < MCLBYTES)
            size = MCLBYTES;
        else if((len + hdrlen + ETH_HLEN) < MJUM9BYTES)
            size = MJUM9BYTES;
        else if ((len + hdrlen + ETH_HLEN) < MJUM16BYTES)
            size = MJUM16BYTES;
        else
            AssertMsgFailed(("Unsupported size"));
        m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
        if (m == NULL)
        {
            /* error = ENOBUFS; */
            error = 1;
            goto out;
        }
        m->m_data += if_maxlinkhdr;             /* leave room for the link header */
        m->m_pkthdr.header = mtod(m, void *);
        m->m_len = hdrlen;

        /*
         * This will always succeed, since we make sure our mbufs
         * are big enough to hold one MSS packet + header + ... etc.
         */
#if 0
        if (len <= MHLEN - hdrlen - max_linkhdr)
        {
#endif
            /* Copy payload out of the send buffer, after the headers. */
            sbcopy(&so->so_snd, off, (int) len, mtod(m, caddr_t) + hdrlen);
            m->m_len += len;
#if 0
        }
        else
        {
            m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
            if (m->m_next == 0)
                len = 0;
        }
#endif
        /*
         * If we're sending everything we've got, set PUSH.
         * (This will keep happy those implementations which only
         * give data to the user when a buffer fills or
         * a PUSH comes in.)
         */
        if (off + len == SBUF_LEN(&so->so_snd))
            flags |= TH_PUSH;
    }
    else
    {
/*
 * TCP timer processing.
 *
 * Dispatches on `timer' (TCPT_2MSL, TCPT_REXMT, TCPT_PERSIST, TCPT_KEEP)
 * and performs the corresponding timeout action on the control block.
 * May drop or close the connection, in which case the returned pointer
 * is whatever tcp_drop()/tcp_close() returned (may differ from `tp');
 * callers must use the return value, not the original pointer.
 * NOTE(review): the visible chunk ends with the switch -- the function's
 * return statement lies beyond this view.
 */
struct tcpcb*
tcp_timers(struct tcpcb *tp, int timer)
{
    int rexmt;   /* recomputed retransmit interval (ticks) */

    switch (timer) {
    /*
     * 2 MSL timeout in shutdown went off.  If we're closed but
     * still waiting for peer to close and connection has been idle
     * too long, or if 2MSL time is up from TIME_WAIT, delete connection
     * control block.  Otherwise, check again in a bit.
     * If TIME_WAIT is not set, this is FIN_WAIT_2 timer.
     */
    case TCPT_2MSL:
        if (tp->t_state != TCPS_TIME_WAIT &&
            tp->t_idle <= g_tcp_maxidle)
            tp->t_timer[TCPT_2MSL] = g_tcp_keepintvl;   /* re-arm and re-check */
        else
            tp = tcp_close(tp);
        break;

    /*
     * Retransmission timer went off.  Message has not
     * been acked within retransmit interval.  Back off
     * to a longer retransmit interval and retransmit one segment.
     */
    case TCPT_REXMT:
        if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
            /* Exhausted all retries: give up and drop the connection. */
            tp->t_rxtshift = TCP_MAXRXTSHIFT;
            g_tcpstat.tcps_timeoutdrop++;
            tp = tcp_drop(tp, tp->t_softerror ?
                tp->t_softerror : ETIMEDOUT);
            break;
        }
        g_tcpstat.tcps_rexmttimeo++;
        /* Exponential backoff, clamped to [t_rttmin, TCPTV_REXMTMAX]. */
        rexmt = TCP_REXMTVAL(tp) * g_tcp_backoff[tp->t_rxtshift];
        TCPT_RANGESET(tp->t_rxtcur, rexmt,
            tp->t_rttmin, TCPTV_REXMTMAX);
        tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
        /*
         * If losing, let the lower level know and try for
         * a better route.  Also, if we backed off this far,
         * our srtt estimate is probably bogus.  Clobber it
         * so we'll take the next rtt measurement as our srtt;
         * move the current srtt into rttvar to keep the current
         * retransmit times until then.
         */
        if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
            in_losing(tp->t_inpcb);
            tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
            tp->t_srtt = 0;
        }
        tp->snd_nxt = tp->snd_una;   /* resend from the oldest unacked byte */
        /*
         * If timing a segment in this window, stop the timer.
         */
        tp->t_rtt = 0;
        /*
         * Close the congestion window down to one segment
         * (we'll open it by one segment for each ack we get).
         * Since we probably have a window's worth of unacked
         * data accumulated, this "slow start" keeps us from
         * dumping all that data as back-to-back packets (which
         * might overwhelm an intermediate gateway).
         *
         * There are two phases to the opening: Initially we
         * open by one mss on each ack.  This makes the window
         * size increase exponentially with time.  If the
         * window is larger than the path can handle, this
         * exponential growth results in dropped packet(s)
         * almost immediately.  To get more time between
         * drops but still "push" the network to take advantage
         * of improving conditions, we switch from exponential
         * to linear window opening at some threshhold size.
         * For a threshhold, we use half the current window
         * size, truncated to a multiple of the mss.
         *
         * (the minimum cwnd that will give us exponential
         * growth is 2 mss.  We don't allow the threshhold
         * to go below this.)
         */
        {
            u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
            if (win < 2)
                win = 2;
            tp->snd_cwnd = tp->t_maxseg;        /* restart slow start */
            tp->snd_ssthresh = win * tp->t_maxseg;
            tp->t_dupacks = 0;
        }
        (void) tcp_output(tp);
        break;

    /*
     * Persistance timer into zero window.
     * Force a byte to be output, if possible.
     */
    case TCPT_PERSIST:
        g_tcpstat.tcps_persisttimeo++;
        /*
         * Hack: if the peer is dead/unreachable, we do not
         * time out if the window is closed.  After a full
         * backoff, drop the connection if the idle time
         * (no responses to probes) reaches the maximum
         * backoff that we would use if retransmitting.
         */
        if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
            (tp->t_idle >= g_tcp_maxpersistidle ||
            tp->t_idle >= TCP_REXMTVAL(tp) * g_tcp_totbackoff)) {
            g_tcpstat.tcps_persistdrop++;
            tp = tcp_drop(tp, ETIMEDOUT);
            break;
        }
        tcp_setpersist(tp);
        tp->t_force = 1;        /* make tcp_output emit a 1-byte window probe */
        (void) tcp_output(tp);
        tp->t_force = 0;
        break;

    /*
     * Keep-alive timer went off; send something
     * or drop connection if idle for too long.
     */
    case TCPT_KEEP:
        g_tcpstat.tcps_keeptimeo++;
        if (tp->t_state < TCPS_ESTABLISHED) // connection-establishment timer.
            goto dropit;
        if (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE &&
            tp->t_state <= TCPS_CLOSE_WAIT) { // keepalive timer.
            /* Probed for too long with no response: drop the connection. */
            if (tp->t_idle >= g_tcp_keepidle + g_tcp_maxidle)
                goto dropit;
            /*
             * Send a packet designed to force a response
             * if the peer is up and reachable:
             * either an ACK if the connection is still alive,
             * or an RST if the peer has closed the connection
             * due to timeout or reboot.
             * Using sequence number tp->snd_una-1
             * causes the transmitted zero-length segment
             * to lie outside the receive window;
             * by the protocol spec, this requires the
             * correspondent TCP to respond.
             */
            g_tcpstat.tcps_keepprobe++;
            tcp_respond(tp, tp->t_template, (usn_mbuf_t *)NULL,
                tp->rcv_nxt, tp->snd_una - 1, 0);
            tp->t_timer[TCPT_KEEP] = g_tcp_keepintvl;
        } else
            tp->t_timer[TCPT_KEEP] = g_tcp_keepidle;
        break;
    dropit:
        g_tcpstat.tcps_keepdrops++;
        tp = tcp_drop(tp, ETIMEDOUT);
        break;
    }