Example #1
0
/* Caller should be in critical section */
static struct tcpcb *
tcp_timer_persist_handler(struct tcpcb *tp)
{
#ifdef TCPDEBUG
	int ostate;
#endif

#ifdef TCPDEBUG
	ostate = tp->t_state;
#endif
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	tcpstat.tcps_persisttimeo++;
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle ||
	     (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		tcpstat.tcps_persistdrop++;
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCE;
	tcp_output(tp);
	tp->t_flags &= ~TF_FORCE;

out:
#ifdef TCPDEBUG
	if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	return tp;
}
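For context on the tcp_setpersist() call above: a minimal standalone sketch (not taken from this stack) of the exponential backoff such a routine typically applies before the next zero-window probe. The helper name, backoff table, and clamp constants below are illustrative assumptions only.

#include <stdint.h>

/* Illustrative only: BSD-style persist backoff, clamped to a fixed
 * probe-interval range.  Names and constants are assumptions.
 */
#define PERSIST_MIN_MS   5000u     /* shortest probe interval */
#define PERSIST_MAX_MS  60000u     /* longest probe interval  */

static const uint32_t persist_backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64 };

uint32_t next_persist_timeout(uint32_t srtt_ms, uint32_t rttvar_ms,
                              unsigned rxtshift)
{
    /* Rough RTO estimate: smoothed RTT plus four times the variance. */
    uint32_t rto = srtt_ms + 4 * rttvar_ms;
    unsigned idx = rxtshift < 8 ? rxtshift : 8;
    uint32_t t = rto * persist_backoff[idx];

    if (t < PERSIST_MIN_MS) t = PERSIST_MIN_MS;
    if (t > PERSIST_MAX_MS) t = PERSIST_MAX_MS;
    return t;   /* caller re-arms the persist timer with this value */
}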
Example #2
0
int
tcp_output(struct tcpcb * tp)
{
   struct socket *   so =  tp->t_inpcb->inp_socket;
   int   len;
   long  win;
   int   off,  flags,   error;
   struct mbuf *  m;
   struct tcpiphdr * ti;
   unsigned optlen = 0;
   int   idle, sendalot;
   struct mbuf *  sendm;   /* mbuf which contains data to send */
   struct mbuf * tcp_mbuf; /* mbuf containing TCP header */
   int   bufoff;           /* offset of data in sendm->m_data */

#ifdef TCP_SACK
   int   sack_resend;
   int   sack_hole = 0;    /* next sack hole to fill */

   if(tp->t_flags & TF_SACKREPLY)
   {
      /* we are resending based on a received SACK header */
      sack_resend = TRUE;
      tp->t_flags &= ~TF_SACKREPLY;    /* clear flag */
   }
   else
      sack_resend = FALSE;
#endif /* TCP_SACK */
   
   /*
    * Determine length of data that should be transmitted,
    * and flags that will be used.
    * If there is some data or critical controls (SYN, RST)
    * to send, then transmit; otherwise, investigate further.
    */
   idle = (tp->snd_max == tp->snd_una);

again:
   sendalot = 0;
   off = (int)(tp->snd_nxt - tp->snd_una);
   win = (long)tp->snd_wnd;   /* set basic send window */
   if (win > (long)tp->snd_cwnd) /* see if we need congestion control */
   {
      win = (int)(tp->snd_cwnd & ~(ALIGN_TYPE-1)); /* keep data aligned */
   }

   /*
    * If in persist timeout with window of 0, send 1 byte.
    * Otherwise, if window is small but nonzero
    * and timer expired, we will send what we can
    * and go to transmit state.
    */
   if (tp->t_force) 
   {
      if (win == 0)
         win = 1;
      else 
      {
         tp->t_timer[TCPT_PERSIST] = 0;
         tp->t_rxtshift = 0;
      }
   }

#ifdef TCP_SACK
   /* See if we need to adjust the offset for a sack resend */
   if(sack_resend)
   {
      off = (int)(tp->sack_hole_start[sack_hole] - tp->snd_una);
      /* if this hole's already been acked then punt and move to next hole */
      if(off < 0)
      {
         /* clear out the acked hole */
         tp->sack_hole_start[sack_hole] = tp->sack_hole_end[sack_hole] = 0;
         /* see if we're done with SACK hole list (2 tests) */
         if(++sack_hole >= SACK_BLOCKS)
            return 0;
         if(tp->sack_hole_start[sack_hole] == tp->sack_hole_end[sack_hole])
            return 0;
         goto again;
      }
      tp->snd_nxt = tp->sack_hole_start[sack_hole];
      len = (int)(tp->sack_hole_end[sack_hole] - tp->sack_hole_start[sack_hole]);
      len = (int)MIN(len, (int)win);
   }
   else
#endif /* TCP_SACK */
   {
      /* set length of packets which are not sack resends */
      len = (int)MIN(so->so_snd.sb_cc, (unsigned)win) - off;
   }

   flags = tcp_outflags[tp->t_state];


   /* See if we need to build TCP options field. This test should be fast. */

#if (defined(TCP_TIMESTAMP) || defined(TCP_SACK))
   if((flags & TH_SYN) ||
/*   !!!???   (so->so_options & SO_TIMESTAMP) ||  */
	  (tp->t_flags & TF_SACKNOW)
	 )
   {
      optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so);
   }
#else
   /* If the other options are not compiled into this build, don't bother
    * to call bld_options() except on SYN packets.
    */
   if(flags & TH_SYN)
   {
      optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so);
   }
#endif

   if (len < 0)
   {
      /*
       * If FIN has been sent but not acked,
       * but we haven't been called to retransmit,
       * len will be -1.  Otherwise, window shrank
       * after we sent into it.  If window shrank to 0,
       * cancel pending retransmit and pull snd_nxt
       * back to (closed) window.  We will enter persist
       * state below.  If the window didn't close completely,
       * just wait for an ACK.
       */
      len = 0;
      if (win == 0) 
      {
         tp->t_timer[TCPT_REXMT] = 0;
         tp->snd_nxt = tp->snd_una;
      }
   }

   if (len > (int)tp->t_maxseg)
   {
      len = tp->t_maxseg;
      sendalot = 1;
   }

#ifdef IP_V4
#ifdef IP_PMTU
   {
      int pmtu = tp->t_inpcb->inp_pmtu - 40;

      if (len > pmtu)
      {
         len = pmtu;    /* pmtu above already excludes the 40-byte IP + TCP headers */
         sendalot = 1;
      }
   }
#endif /* IP_PMTU */
   /* We don't need a PMTU test for IPv6. The v6 code limits t_maxseg to
    * the Path MTU, so the t_maxseg test above this v4 ifdef covers us.
    */
#endif /* IP_V4 */

   if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
      flags &= ~TH_FIN;
   win = (long)(sbspace(&so->so_rcv));

   /*
    * If our state indicates that FIN should be sent
    * and we have not yet done so, or we're retransmitting the FIN,
    * then we need to send.
    */
   if ((flags & TH_FIN) &&
       (so->so_snd.sb_cc == 0) &&
       ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
   {
      goto send;
   }
   /*
    * Send if we owe peer an ACK.
    */
   if (tp->t_flags & TF_ACKNOW)
      goto send;
   if (flags & (TH_SYN|TH_RST))
      goto send;
   if (SEQ_GT(tp->snd_up, tp->snd_una))
      goto send;

   /*
    * Sender silly window avoidance.  Transmit if the connection is
    * idle and we can send all queued data, if we have a full-sized
    * segment, or if we are being forced (persist probe); otherwise
    * don't bother.
    * If the peer's buffer is tiny, then send
    * when the window is at least half open.
    * If retransmitting (possibly after the persist timer forced us
    * to send into a small window), then we must resend.
    */
   if (len)
   {
      if (len == (int)tp->t_maxseg)
         goto send;
      if ((idle || tp->t_flags & TF_NODELAY) &&
          len + off >= (int)so->so_snd.sb_cc)
      {
         goto send;
      }
      if (tp->t_force)
         goto send;
      if (len >= (int)(tp->max_sndwnd / 2))
         goto send;
      if (SEQ_LT(tp->snd_nxt, tp->snd_max))
         goto send;
   }

   /*
    * Compare available window to amount of window
    * known to peer (as advertised window less
    * next expected input).  If the difference is at least two
    * max size segments, or at least 35% of the receive buffer's
    * high-water mark, then send a window update to the peer.
    */
   if (win > 0)
   {
      int   adv   =  (int)win -  (int)(tp->rcv_adv -  tp->rcv_nxt);

      if (so->so_rcv.sb_cc == 0 && adv >= (int)(tp->t_maxseg * 2))
         goto send;
      if (100 * (u_int)adv / so->so_rcv.sb_hiwat >= 35)
         goto send;
   }

   /*
    * TCP window updates are not reliable, rather a polling protocol
    * using ``persist'' packets is used to insure receipt of window
    * updates.  The three ``states'' for the output side are:
    *   idle         not doing retransmits or persists
    *   persisting      to move a small or zero window
    *   (re)transmitting   and thereby not persisting
    *
    * tp->t_timer[TCPT_PERSIST]
    *   is set when we are in persist state.
    * tp->t_force
    *   is set when we are called to send a persist packet.
    * tp->t_timer[TCPT_REXMT]
    *   is set when we are retransmitting
    * The output side is idle when both timers are zero.
    *
    * If send window is too small, there is data to transmit, and no
    * retransmit or persist is pending, then go to persist state.
    * If nothing happens soon, send when timer expires:
    * if window is nonzero, transmit what we can,
    * otherwise force out a byte.
    */
   if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
       tp->t_timer[TCPT_PERSIST] == 0) 
   {
      tp->t_rxtshift = 0;
      tcp_setpersist(tp);
   }

   /*
    * No reason to send a segment, just return.
    */
   return (0);

send:
   ENTER_CRIT_SECTION(tp);

   /* Limit send length to the current buffer so as to
    * avoid doing the "mbuf shuffle" in m_copy().
    */
   bufoff = off;
   sendm = so->so_snd.sb_mb;
   if (len)
   {
      /* find mbuf containing data to send (at "off") */
      while (sendm)  /* loop through socket send list */
      {
         bufoff -= sendm->m_len;
         if (bufoff < 0)   /* if off is in this buffer, break */
            break;
         sendm = sendm->m_next;
      }
      if (!sendm) { dtrap();  /* shouldn't happen */ }
      bufoff += sendm->m_len; /* index to next data to send in sendm */

      /* if socket has multiple unsent mbufs, set flag for send to loop */
      if ((sendm->m_next) && (len > (int)sendm->m_len))
      {
         flags &= ~TH_FIN; /* don't FIN on segment prior to last */
         sendalot = 1;     /* set to send more segments */
      }
      if((flags & TH_FIN) && (so->so_snd.sb_cc > (unsigned)len))
      {
         /* This can happen on slow links (PPP) which retry the last 
          * segment - the one with the FIN bit attached to data.
          */
         flags &= ~TH_FIN; /* don't FIN on segment prior to last */
      }

      /* only send the rest of sendm */
      len = min(len, (int)sendm->m_len);

      /* if we're not sending starting at sendm->m_data (in which 
       * case bufoff != 0), then we will copy the data; else we would 
       * write IP/TCP headers over sent but un-ack'ed data in sendm. 
       * Similarly, if sendm->m_data is not aligned with respect to 
       * sendm->m_base and ALIGN_TYPE, we will copy the data to 
       * ensure that it (and the then-prepended IP/TCP headers) will 
       * be aligned according to ALIGN_TYPE. 
       */
      if ((bufoff != 0) ||       /* data not front aligned in send mbuf? */
          (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) != 0))
      {
         len = min(len, (int)(sendm->m_len - bufoff));   /* limit len again */

         /* One more test - if this data is not aligned with the front
          * of the m_data buffer then we can't use it in place, else we
          * might write the IP/TCP header over data that has not yet
          * been acked. In this case we must make sure our send
          * fits into a little buffer and send what we can.
          */
         if ((len > (int)(lilbufsiz - HDRSLEN)) && /* length is bigger than the small buffer? */
             (bigfreeq.q_len < 2))      /* and we are low on big buffers */
         {
            len = lilbufsiz - HDRSLEN;
         }
      }
   }

   /* if send data is sufficiently aligned in packet, prepend TCP/IP header
    * in the space provided. 
    */
   if (len && (bufoff == 0) && 
       (sendm->pkt->inuse == 1) &&
       (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) == 0) && 
       (optlen == 0))
   {
      /* get an empty mbuf to "clone" the data */
      m = m_getnbuf(MT_TXDATA, 0);
      if (!m)
      {
         EXIT_CRIT_SECTION(tp);
         return (ENOBUFS);
      }
      m->pkt = sendm->pkt; /* copy packet location in new mbuf */
      m->pkt->inuse++;     /* bump packet's use count */
      m->m_base = sendm->m_base; /* clone mbuf members */
      m->m_memsz = sendm->m_memsz;
      m->m_len = len + TCPIPHDRSZ;  /* adjust clone for header */
      m->m_data = sendm->m_data - TCPIPHDRSZ;
   }
   else  /* either no data or data is not front aligned in mbuf */
   {
      /* Grab a header mbuf, attaching a copy of data to be 
       * transmitted, and initialize the header from 
       * the template for sends on this connection.
       */
      m = m_getwithdata (MT_HEADER, IFNETHDR_SIZE + TCPIPHDRSZ);
      if (m ==(struct mbuf *)NULL)
      {
         EXIT_CRIT_SECTION(tp);
         return ENOBUFS;
      }

      m->m_len = TCPIPHDRSZ;
      m->m_data += IFNETHDR_SIZE;   /* leave room for the link-layer header (e.g. 14 bytes for Ethernet) */

      if (len) /* attach any data to send */
      {
         m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
         if (m->m_next == 0)
         {
            m_freem(m);
            EXIT_CRIT_SECTION(tp);
            return ENOBUFS;
         }
      }
   }
   EXIT_CRIT_SECTION(tp);

   if (len) 
   {
      if (tp->t_force && len == 1)
         tcpstat.tcps_sndprobe++;
      else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 
      {
         tcpstat.tcps_sndrexmitpack++;
         tcpstat.tcps_sndrexmitbyte += len;
#ifdef TCP_SACK
      if(sack_resend)
         tcpstat.tcps_sackresend++;
#endif
      } 
      else 
      {
         tcpstat.tcps_sndpack++;
         tcpstat.tcps_sndbyte += len;
      }
   }
   else if (tp->t_flags & TF_ACKNOW)
   {
      tcpstat.tcps_sndacks++;
   }
   else if (flags & (TH_SYN|TH_FIN|TH_RST))
      tcpstat.tcps_sndctrl++;
   else if (SEQ_GT(tp->snd_up, tp->snd_una))
      tcpstat.tcps_sndurg++;
   else
      tcpstat.tcps_sndwinup++;

   ti = (struct tcpiphdr *)(m->m_data+sizeof(struct ip)-sizeof(struct ipovly));
   if ((char *)ti < m->pkt->nb_buff)
   {
      panic("tcp_out- packet ptr underflow\n");
   }
   tcp_mbuf = m;        /* flag TCP header mbuf */

#ifdef IP_V6  /* Dual mode code */
   if(so->so_domain == AF_INET6)
   {
      m = mbuf_prepend(m, sizeof(struct ipv6));
      if(m == NULL)
      {
         /* This can happen when we run out of mbufs or packet buffers,
          * i.e. mfreeq is empty or (lilfreeq, bigfreeq) are empty.
          * One solution is to find out which pool is being exhausted
          * and then increase its size.
          */
         dtrap();             /* This is really rare... */
         m_freem(tcp_mbuf);   /* Free TCP/data chain */
         return ENOBUFS;
      }

      /* strip overlay from front of TCP header */
      tcp_mbuf->m_data += sizeof(struct ipovly);
      tcp_mbuf->m_len -= sizeof(struct ipovly);
   }
#endif   /* end IP_V6 */

   if (tp->t_template == 0)
      panic("tcp_output");

   MEMCPY((char*)ti, (char*)tp->t_template, sizeof(struct tcpiphdr));

   /*
    * Fill in fields, remembering maximum advertised
    * window for use in delaying messages about window sizes.
    * If resending a FIN, be sure not to use a new sequence number.
    */
   if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 
       tp->snd_nxt == tp->snd_max)
   {
      tp->snd_nxt--;
   }

   ti->ti_seq = htonl(tp->snd_nxt);
   ti->ti_ack = htonl(tp->rcv_nxt);

   /*
    * If we're sending a SYN, check the IP address of the interface
    * that we will (likely) use to send the IP datagram -- if it's
    * changed from what is in the template (as it might if this is
    * a retransmission, and the original SYN caused PPP to start
    * bringing the interface up, and PPP has got a new IP address
    * via IPCP), update the template and the inpcb with the new 
    * address.
    */
   if (flags & TH_SYN)
   {
      struct inpcb * inp;
      inp = (struct inpcb *)so->so_pcb;

      switch(so->so_domain)
      {
#ifdef IP_V4
      case AF_INET:
      {
         ip_addr src;

#ifdef INCLUDE_PPP

         if(((flags & TH_ACK) == 0) && /* SYN only, not SYN/ACK */
            (inp->ifp) &&              /* Make sure we have iface */
            (inp->ifp->mib.ifType == PPP))   /* only PPP type */
         {
            dtrap(); /* remove after confirmed to work in PPP */ 
            src = ip_mymach(ti->ti_dst.s_addr);

            if (src != ti->ti_src.s_addr)
            {
               ti->ti_src.s_addr = src;
               tp->t_template->ti_src.s_addr = src;
               tp->t_inpcb->inp_laddr.s_addr = src;
            }
         }
#endif   /* INCLUDE_PPP */

         /* If this is a SYN (not a SYN/ACK) then set the pmtu */
         if((flags & TH_ACK) == 0)
         {
#ifdef IP_PMTU
            inp->inp_pmtu = pmtucache_get(inp->inp_faddr.s_addr);
#else    /* not compiled for pathmtu, guess based on iface */
            {
               NET ifp;
               /* find iface for route. Pass "src" as nexthop return */
               ifp = iproute(ti->ti_dst.s_addr, &src);
               if(ifp)
                  inp->inp_pmtu = ifp->n_mtu - (ifp->n_lnh + 40);
               else
                  inp->inp_pmtu = 580;  /* Ugh. */
            }
#endif   /* IP_PMTU */
         }
         break;
      }
#endif   /* IP_V4 */

#ifdef IP_V6
      case AF_INET6:
      {
         struct ip6_inaddr * local;
         
         local = ip6_myaddr(&tp->t_inpcb->ip6_faddr, inp->ifp);

         /* If we got a local address & it's not the one in the pcb, then
          * we assume it changed at the iface and fix it in the pcb. Unlike 
          * v4, we don't have an IP header yet, nor do we have a template 
          * to worry about.
          */
         if((local) && 
            (!IP6EQ(&local->addr, &tp->t_inpcb->ip6_laddr)))
         {
            IP6CPY(&tp->t_inpcb->ip6_laddr, &local->addr);
         }
         /* If this is a SYN (not a SYN/ACK) then set the pmtu */
         if((flags & TH_ACK) == 0)
         {
            inp->inp_pmtu = ip6_pmtulookup(&inp->ip6_laddr, inp->ifp);
         }
         break;
      }
#endif   /* IP_V6 */
      default:
         dtrap();    /* bad domain setting */
      }
   }

   /* fill in options if any are set */
   if (optlen)
   {
      struct mbuf * mopt;

      mopt = m_getwithdata(MT_TXDATA, MAXOPTLEN);
      if (mopt == NULL) 
      {
         m_freem(m);
         return (ENOBUFS);
      }

      /* insert options mbuf after tcp_mbuf */
      mopt->m_next = tcp_mbuf->m_next;
      tcp_mbuf->m_next = mopt;

      /* extend options to aligned address */
      while(optlen & 0x03)
         tcp_optionbuf[optlen++] = TCPOPT_EOL;

      MEMCPY(mtod(mopt, char *), tcp_optionbuf, optlen);
      mopt->m_len = optlen;
      /* use portable macro to set tcp data offset bits */
      SET_TH_OFF(ti->ti_t, ((sizeof (struct tcphdr) + optlen) >> 2));
   }

   ti->ti_flags = (u_char)flags;
   /*
    * Calculate receive window. Don't shrink window,
    * but avoid silly window syndrome.
    */
   if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
      win = 0;
   if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
      win = (long)(tp->rcv_adv - tp->rcv_nxt);

   /* do check for Iniche buffer limits -JB- */
   if (bigfreeq.q_len == 0)   /* If queue length is 0, set window to 0 */
   {
      win = 0;
   }
   else if(win > (((long)bigfreeq.q_len - 1) * (long)bigbufsiz))
   {
      win = ((long)bigfreeq.q_len - 1) * bigbufsiz;
   }

#ifdef TCP_WIN_SCALE
   if(tp->t_flags & TF_WINSCALE)
   {
      ti->ti_win = htons((u_short)(win >> tp->rcv_wind_scale)); /* apply scale */
   }
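The last fragment above applies the RFC 1323 window scale before stuffing the advertised window into the 16-bit header field. A minimal sketch of both directions of that transformation, using hypothetical helper names, is:

#include <stdint.h>
#include <arpa/inet.h>   /* htons()/ntohs() */

/* Sender side, mirroring htons((u_short)(win >> rcv_wind_scale)) above. */
uint16_t encode_adv_window(uint32_t win, unsigned scale)
{
    return htons((uint16_t)(win >> scale));
}

/* Receiver side: the peer shifts the 16-bit field back up by the scale
 * it learned from the window-scale option on the SYN.
 */
uint32_t decode_adv_window(uint16_t hdr_win, unsigned scale)
{
    return (uint32_t)ntohs(hdr_win) << scale;
}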
Example #3
0
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct inpcb * const inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	long len, recvwin, sendwin;
	int nsacked = 0;
	int off, flags, error;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip = NULL;
	struct ipovly *ipov = NULL;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned int ipoptlen, optlen, hdrlen;
	int idle;
	boolean_t sendalot;
	struct ip6_hdr *ip6 = NULL;
#ifdef INET6
	const boolean_t isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#else
	const boolean_t isipv6 = FALSE;
#endif

	KKASSERT(so->so_port == &curthread->td_msgport);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */

	/*
	 * If we have been idle for a while, the send congestion window
	 * could be no longer representative of the current state of the link.
	 * So unless we are expecting more acks to come in, slow-start from
	 * scratch to re-determine the send congestion window.
	 */
	if (tp->snd_max == tp->snd_una &&
	    (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
		if (tcp_do_rfc3390) {
			int initial_cwnd =
			    min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));

			tp->snd_cwnd = min(tp->snd_cwnd, initial_cwnd);
		} else {
			tp->snd_cwnd = tp->t_maxseg;
		}
		tp->snd_wacked = 0;
	}

	/*
	 * Calculate whether the transmit stream was previously idle 
	 * and adjust TF_LASTIDLE for the next time.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (tp->t_flags & TF_MORETOCOME))
		tp->t_flags |= TF_LASTIDLE;
	else
		tp->t_flags &= ~TF_LASTIDLE;

	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp))
		nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt);

again:
	/* Make use of SACK information when slow-starting after a RTO. */
	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp)) {
		tcp_seq old_snd_nxt = tp->snd_nxt;

		tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt);
		nsacked += tp->snd_nxt - old_snd_nxt;
	}

	sendalot = FALSE;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_flags & TF_FORCE) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.ssb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			tcp_callout_stop(tp, tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 */
	len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off;

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data, suppress sending
	 * segment (sending the segment would be an option if we still
	 * did TAO and the remote host supported it).
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT)
			return 0;
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if (flags & TH_SYN) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * If FIN has been sent but not acked,
		 * but we haven't been called to retransmit,
		 * len will be < 0.  Otherwise, window shrank
		 * after we sent into it.  If window shrank to 0,
		 * cancel pending retransmit, pull snd_nxt back
		 * to (closed) window, and set the persist timer
		 * if it isn't already going.  If the window didn't
		 * close completely, just wait for an ACK.
		 */
		len = 0;
		if (sendwin == 0) {
			tcp_callout_stop(tp, tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!tcp_callout_active(tp, tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	KASSERT(len >= 0, ("%s: len < 0", __func__));
	/*
	 * Automatic sizing of send socket buffer.  Often the send buffer
	 * size is not optimally adjusted to the actual network conditions
	 * at hand (delay bandwidth product).  Setting the buffer size too
	 * small limits throughput on links with high bandwidth and high
	 * delay (e.g. trans-continental/oceanic links).  Setting the
	 * buffer size too big consumes too much real kernel memory,
	 * especially with many connections on busy servers.
	 *
	 * The criteria to step up the send buffer one notch are:
	 *  1. receive window of remote host is larger than send buffer
	 *     (with a fudge factor of 5/4th);
	 *  2. send buffer is filled to 7/8th with data (so we actually
	 *     have data to make use of it);
	 *  3. send buffer fill has not hit maximal automatic size;
	 *  4. our send window (slow start and congestion controlled) is
	 *     larger than sent but unacknowledged data in send buffer.
	 *
	 * The remote host receive window scaling factor may limit the
	 * growing of the send buffer before it reaches its allowed
	 * maximum.
	 *
	 * It scales directly with slow start or congestion window
	 * and does at most one step per received ACK.  This fast
	 * scaling has the drawback of growing the send buffer beyond
	 * what is strictly necessary to make full use of a given
	 * delay*bandwidth product.  However testing has shown this not
	 * to be much of a problem.  At worst we are trading wasting
	 * of available bandwidth (the non-use of it) for wasting some
	 * socket buffer memory.
	 *
	 * TODO: Shrink send buffer during idle periods together
	 * with congestion window.  Requires another timer.  Has to
	 * wait for upcoming tcp timer rewrite.
	 */
	if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
		    so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
		    so->so_snd.ssb_cc < tcp_autosndbuf_max &&
		    sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
			u_long newsize;

			newsize = ulmin(so->so_snd.ssb_hiwat +
					 tcp_autosndbuf_inc,
					tcp_autosndbuf_max);
			if (!ssb_reserve(&so->so_snd, newsize, so, NULL))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
			if (newsize >= (TCP_MAXWIN << tp->snd_scale))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
		}
	}

	/*
	 * Truncate to the maximum segment length and ensure that FIN is
	 * removed if the length no longer contains the last data byte.
	 */
	if (len > tp->t_maxseg) {
		len = tp->t_maxseg;
		sendalot = TRUE;
	}
	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc))
		flags &= ~TH_FIN;

	recvwin = ssb_space(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len == tp->t_maxseg)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.ssb_cc &&
		    !(tp->t_flags & TF_NOPUSH)) {
			goto send;
		}
		if (tp->t_flags & TF_FORCE)		/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 */
	if (recvwin > 0) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);
		long hiwat;

		/*
		 * This ack case typically occurs when the user has drained
		 * the TCP socket buffer sufficiently to warrant an ack
		 * containing a 'pure window update'... that is, an ack that
		 * ONLY updates the tcp window.
		 *
		 * It is unclear why we would need to do a pure window update
		 * past 2 segments if we are going to do one at 1/2 the high
		 * water mark anyway, especially since under normal conditions
		 * the user program will drain the socket buffer quickly.
		 * The 2-segment pure window update will often add a large
		 * number of extra, unnecessary acks to the stream.
		 *
		 * avoid_pure_win_update now defaults to 1.
		 */
		if (avoid_pure_win_update == 0 ||
		    (tp->t_flags & TF_RXRESIZED)) {
			if (adv >= (long) (2 * tp->t_maxseg)) {
				goto send;
			}
		}
		hiwat = (long)(TCP_MAXWIN << tp->rcv_scale);
		if (hiwat > (long)so->so_rcv.ssb_hiwat)
			hiwat = (long)so->so_rcv.ssb_hiwat;
		if (adv >= hiwat / 2)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN)))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if (flags & TH_FIN &&
	    (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
		goto send;

	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to insure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tcp_callout_active(tp, tp->tt_persist)
	 *	is true when we are in persist state.
	 * The TF_FORCE flag in tp->t_flags
	 *	is set when we are called to send a persist packet.
	 * tcp_callout_active(tp, tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can,
	 * otherwise force out a byte.
	 */
	if (so->so_snd.ssb_cc > 0 &&
	    !tcp_callout_active(tp, tp->tt_rexmt) &&
	    !tcp_callout_active(tp, tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
	return (0);

send:
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
	if (isipv6)
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		hdrlen = sizeof(struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if (!(tp->t_flags & TF_NOOPT)) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp));
			memcpy(opt + 2, &mss, sizeof mss);
			optlen = TCPOLEN_MAXSEG;

			if ((tp->t_flags & TF_REQ_SCALE) &&
			    (!(flags & TH_ACK) ||
			     (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}

			if ((tcp_do_sack && !(flags & TH_ACK)) ||
			    tp->t_flags & TF_SACK_PERMITTED) {
				uint32_t *lp = (uint32_t *)(opt + optlen);

				*lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED);
				optlen += TCPOLEN_SACK_PERMITTED_ALIGNED;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
	    !(flags & TH_RST) &&
	    (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/* Set receive buffer autosizing timestamp. */
	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
		tp->rfbuf_ts = ticks;

	/*
	 * If this is a SACK connection and we have a block to report,
	 * fill in the SACK blocks in the TCP options.
	 */
	if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) ==
		TF_SACK_PERMITTED &&
	    (!LIST_EMPTY(&tp->t_segq) ||
	     tp->reportblk.rblk_start != tp->reportblk.rblk_end))
		tcp_sack_fill_report(tp, opt, &optlen);

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;
		/*
		 * Initialize TCP-MD5 option (RFC2385)
		 */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;
		/*
		 * Terminate options list and maintain 32-bit alignment.
		 */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */
	KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options"));
	hdrlen += optlen;

	if (isipv6) {
		ipoptlen = ip6_optlen(inp);
	} else {
		if (inp->inp_options) {
			ipoptlen = inp->inp_options->m_len -
			    offsetof(struct ipoption, ipopt_list);
		} else {
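The idle-restart branch near the top of this routine re-initializes snd_cwnd using the RFC 3390 formula. A minimal standalone sketch of that computation, with a hypothetical helper name, is:

/* min(4*MSS, max(2*MSS, 4380)) per RFC 3390; the name is illustrative. */
static unsigned rfc3390_initial_cwnd(unsigned maxseg)
{
    unsigned four_seg  = 4 * maxseg;
    unsigned floor_val = (2 * maxseg > 4380) ? 2 * maxseg : 4380;

    return (four_seg < floor_val) ? four_seg : floor_val;
}
/* e.g. maxseg = 1460:  min(5840, max(2920, 4380)) = 4380 bytes (3 segments) */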
Example #4
0
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		if (tcp_inpinfo_lock_add(inp)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		tp = tcp_drop(tp, ETIMEDOUT);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		if (tcp_inpinfo_lock_add(inp)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		tp = tcp_drop(tp, ETIMEDOUT);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}
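The drop test above fires only once the full retransmit backoff has been exhausted. The same decision, expressed as a standalone predicate with hypothetical names, looks like this:

#include <stdbool.h>
#include <stdint.h>

bool persist_should_drop(uint32_t idle_ticks, unsigned rxtshift,
                         unsigned max_rxtshift, uint32_t maxpersistidle,
                         uint32_t rexmtval, uint32_t totbackoff)
{
    if (rxtshift != max_rxtshift)      /* backoff not yet exhausted */
        return false;
    /* Drop if the probe-idle time reaches either the fixed ceiling or
     * the total time a full retransmit series would have taken.
     */
    return idle_ticks >= maxpersistidle ||
           idle_ticks >= rexmtval * totbackoff;
}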
Example #5
0
void
tcp_timer_persist(void *arg)
{
	struct tcpcb *tp = arg;
	uint32_t rto;
#ifdef TCP_DEBUG
	struct socket *so = NULL;
	short ostate;
#endif

	mutex_enter(softnet_lock);
	if ((tp->t_flags & TF_DEAD) != 0) {
		mutex_exit(softnet_lock);
		return;
	}
	if (!callout_expired(&tp->t_timer[TCPT_PERSIST])) {
		mutex_exit(softnet_lock);
		return;
	}

	KERNEL_LOCK(1, NULL);
#ifdef TCP_DEBUG
#ifdef INET
	if (tp->t_inpcb)
		so = tp->t_inpcb->inp_socket;
#endif
#ifdef INET6
	if (tp->t_in6pcb)
		so = tp->t_in6pcb->in6p_socket;
#endif

	ostate = tp->t_state;
#endif /* TCP_DEBUG */

	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */

	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	rto = TCP_REXMTVAL(tp);
	if (rto < tp->t_rttmin)
		rto = tp->t_rttmin;
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    ((tcp_now - tp->t_rcvtime) >= tcp_maxpersistidle ||
	    (tcp_now - tp->t_rcvtime) >= rto * tcp_totbackoff)) {
		TCP_STATINC(TCP_STAT_PERSISTDROPS);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	TCP_STATINC(TCP_STAT_PERSISTTIMEO);
	tcp_setpersist(tp);
	tp->t_force = 1;
	(void) tcp_output(tp);
	tp->t_force = 0;

 out:
#ifdef TCP_DEBUG
	if (tp && so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL,
		    PRU_SLOWTIMO | (TCPT_PERSIST << 8));
#endif
	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}
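The early returns at the top of this handler follow a common timer-callback guard pattern: take the stack lock, bail out if the connection is being torn down or the callout did not really expire, and only then do the work. A minimal sketch of that pattern under pthreads, with hypothetical type and field names, is:

#include <pthread.h>
#include <stdbool.h>

struct conn {
    pthread_mutex_t lock;
    bool dead;            /* connection is being torn down */
    bool timer_expired;   /* set by the timer facility     */
};

void persist_timer_cb(struct conn *c)
{
    pthread_mutex_lock(&c->lock);
    if (c->dead || !c->timer_expired) {   /* stale or cancelled callout */
        pthread_mutex_unlock(&c->lock);
        return;
    }
    c->timer_expired = false;
    /* ... force out a zero-window probe here ... */
    pthread_mutex_unlock(&c->lock);
}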
Example #6
0
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_WLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	/*
	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
	 * tear-down mean we need it as a work-around for races between
	 * timers and tcp_discardcb().
	 *
	 * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
	 */
	if (inp == NULL) {
		tcp_timer_race++;
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	INP_WLOCK(inp);
	if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_persist)
	    || !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

out:
#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
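tcp_totbackoff in the drop test above is conventionally the sum of the per-retransmit backoff multipliers, so the comparison asks whether the peer has been silent for as long as a complete retransmit series would have taken. A minimal sketch, with table values that are an assumption rather than this kernel's, is:

/* Assumed backoff table; the traditional 4.4BSD table sums to 511. */
static const int backoff_tbl[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

static int total_backoff(void)
{
    unsigned i;
    int sum = 0;

    for (i = 0; i < sizeof(backoff_tbl) / sizeof(backoff_tbl[0]); i++)
        sum += backoff_tbl[i];
    return sum;   /* 511 with the table above */
}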
Example #7
0
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct inpcb * const inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	long len, recvwin, sendwin;
	int nsacked = 0;
	int off, flags, error = 0;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned int ipoptlen, optlen, hdrlen;
	int idle;
	boolean_t sendalot;
	struct ip6_hdr *ip6;
#ifdef INET6
	const boolean_t isipv6 = INP_ISIPV6(inp);
#else
	const boolean_t isipv6 = FALSE;
#endif
	boolean_t can_tso = FALSE, use_tso;
	boolean_t report_sack, idle_cwv = FALSE;
	u_int segsz, tso_hlen, tso_lenmax = 0;
	int segcnt = 0;
	boolean_t need_sched = FALSE;

	KKASSERT(so->so_port == &curthread->td_msgport);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */

	/*
	 * If we have been idle for a while, the send congestion window
	 * could be no longer representative of the current state of the
	 * link; need to validate congestion window.  However, we should
	 * not perform congestion window validation here, since we could
	 * be asked to send pure ACK.
	 */
	if (tp->snd_max == tp->snd_una &&
	    (ticks - tp->snd_last) >= tp->t_rxtcur && tcp_idle_restart)
		idle_cwv = TRUE;

	/*
	 * Calculate whether the transmit stream was previously idle 
	 * and adjust TF_LASTIDLE for the next time.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (tp->t_flags & TF_MORETOCOME))
		tp->t_flags |= TF_LASTIDLE;
	else
		tp->t_flags &= ~TF_LASTIDLE;

	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp))
		nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt);

	/*
	 * Find out whether TSO could be used or not
	 *
	 * For TSO capable devices, the following assumptions apply to
	 * the processing of TCP flags:
	 * - If FIN is set on the large TCP segment, the device must set
	 *   FIN on the last segment that it creates from the large TCP
	 *   segment.
	 * - If PUSH is set on the large TCP segment, the device must set
	 *   PUSH on the last segment that it creates from the large TCP
	 *   segment.
	 */
#if !defined(IPSEC) && !defined(FAST_IPSEC)
	if (tcp_do_tso
#ifdef TCP_SIGNATURE
	    && (tp->t_flags & TF_SIGNATURE) == 0
#endif
	) {
		if (!isipv6) {
			struct rtentry *rt = inp->inp_route.ro_rt;

			if (rt != NULL && (rt->rt_flags & RTF_UP) &&
			    (rt->rt_ifp->if_hwassist & CSUM_TSO)) {
				can_tso = TRUE;
				tso_lenmax = rt->rt_ifp->if_tsolen;
			}
		}
	}
#endif	/* !IPSEC && !FAST_IPSEC */

again:
	m = NULL;
	ip = NULL;
	th = NULL;
	ip6 = NULL;

	if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) ==
		TF_SACK_PERMITTED &&
	    (!TAILQ_EMPTY(&tp->t_segq) ||
	     tp->reportblk.rblk_start != tp->reportblk.rblk_end))
		report_sack = TRUE;
	else
		report_sack = FALSE;

	/* Make use of SACK information when slow-starting after a RTO. */
	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp)) {
		tcp_seq old_snd_nxt = tp->snd_nxt;

		tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt);
		nsacked += tp->snd_nxt - old_snd_nxt;
	}

	sendalot = FALSE;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_flags & TF_FORCE) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.ssb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			tcp_callout_stop(tp, tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * A negative length can also occur when we are in the
	 * TCPS_SYN_RECEIVED state due to a simultaneous connect where
	 * our SYN has not been acked yet.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 */
	len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off;

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data, suppress sending
	 * segment (sending the segment would be an option if we still
	 * did TAO and the remote host supported it).
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
			tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW);
			return 0;
		}
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if (flags & TH_SYN) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * A negative len can occur if our FIN has been sent but not
		 * acked, or if we are in a simultaneous connect in the
		 * TCPS_SYN_RECEIVED state with our SYN sent but not yet
		 * acked.
		 *
		 * If our window has contracted to 0 in the FIN case
		 * (which can only occur if we have NOT been called to
		 * retransmit as per code a few paragraphs up) then we
		 * want to shift the retransmit timer over to the
		 * persist timer.
		 *
		 * However, if we are in the TCPS_SYN_RECEIVED state
		 * (the SYN case) we will be in a simultaneous connect and
		 * the window may be zero degeneratively.  In this case we
		 * do not want to shift to the persist timer after the SYN
		 * or the SYN+ACK transmission.
		 */
		len = 0;
		if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) {
			tcp_callout_stop(tp, tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!tcp_callout_active(tp, tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	KASSERT(len >= 0, ("%s: len < 0", __func__));
	/*
	 * Automatic sizing of send socket buffer.  Often the send buffer
	 * size is not optimally adjusted to the actual network conditions
	 * at hand (delay bandwidth product).  Setting the buffer size too
	 * small limits throughput on links with high bandwidth and high
	 * delay (e.g. trans-continental/oceanic links).  Setting the
	 * buffer size too big consumes too much real kernel memory,
	 * especially with many connections on busy servers.
	 *
	 * The criteria to step up the send buffer one notch are:
	 *  1. receive window of remote host is larger than send buffer
	 *     (with a fudge factor of 5/4th);
	 *  2. hiwat has not significantly exceeded bwnd (inflight)
	 *     (bwnd is a maximal value if inflight is disabled).
	 *  3. send buffer is filled to 7/8th with data (so we actually
	 *     have data to make use of it);
	 *  4. hiwat has not hit maximal automatic size;
	 *  5. our send window (slow start and congestion controlled) is
	 *     larger than sent but unacknowledged data in send buffer.
	 *
	 * The remote host receive window scaling factor may limit the
	 * growing of the send buffer before it reaches its allowed
	 * maximum.
	 *
	 * It scales directly with slow start or congestion window
	 * and does at most one step per received ACK.  This fast
	 * scaling has the drawback of growing the send buffer beyond
	 * what is strictly necessary to make full use of a given
	 * delay*bandwidth product.  However testing has shown this not
	 * to be much of a problem.  At worst we are trading wasting
	 * of available bandwidth (the non-use of it) for wasting some
	 * socket buffer memory.
	 *
	 * The criteria for shrinking the buffer is based solely on
	 * the inflight code (snd_bwnd).  If inflight is disabled,
	 * the buffer will not be shrunk.  Note that snd_bwnd already
	 * has a fudge factor.  Our test adds a little hysteresis.
	 */
	if (tcp_do_autosndbuf && (so->so_snd.ssb_flags & SSB_AUTOSIZE)) {
		const int asbinc = tcp_autosndbuf_inc;
		const int hiwat = so->so_snd.ssb_hiwat;
		const int lowat = so->so_snd.ssb_lowat;
		u_long newsize;

		if ((tp->snd_wnd / 4 * 5) >= hiwat &&
		    so->so_snd.ssb_cc >= (hiwat / 8 * 7) &&
		    hiwat < tp->snd_bwnd + hiwat / 10 &&
		    hiwat + asbinc < tcp_autosndbuf_max &&
		    hiwat < (TCP_MAXWIN << tp->snd_scale) &&
		    sendwin >= (so->so_snd.ssb_cc -
				(tp->snd_nxt - tp->snd_una))) {
			newsize = ulmin(hiwat + asbinc, tcp_autosndbuf_max);
			if (!ssb_reserve(&so->so_snd, newsize, so, NULL))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
#if 0
			if (newsize >= (TCP_MAXWIN << tp->snd_scale))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
#endif
		} else if ((long)tp->snd_bwnd <
			   (long)(hiwat * 3 / 4 - lowat - asbinc) &&
			   hiwat > tp->t_maxseg * 2 + asbinc &&
			   hiwat + asbinc >= tcp_autosndbuf_min &&
			   tcp_do_autosndbuf == 1) {
			newsize = ulmax(hiwat - asbinc, tp->t_maxseg * 2);
			ssb_reserve(&so->so_snd, newsize, so, NULL);
		}
	}

	/*
	 * Don't use TSO, if:
	 * - Congestion window needs validation
	 * - There are SACK blocks to report
	 * - RST or SYN flags is set
	 * - URG will be set
	 *
	 * XXX
	 * Checking for SYN|RST looks like overkill, but better safe than sorry
	 */
	use_tso = can_tso;
	if (report_sack || idle_cwv || (flags & (TH_RST | TH_SYN)))
		use_tso = FALSE;
	if (use_tso) {
		tcp_seq ugr_nxt = tp->snd_nxt;

		if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
		    tp->snd_nxt == tp->snd_max)
			--ugr_nxt;

		if (SEQ_GT(tp->snd_up, ugr_nxt))
			use_tso = FALSE;
	}

	if (use_tso) {
		/*
		 * Find out segment size and header length for TSO
		 */
		error = tcp_tso_getsize(tp, &segsz, &tso_hlen);
		if (error)
			use_tso = FALSE;
	}
	if (!use_tso) {
		segsz = tp->t_maxseg;
		tso_hlen = 0; /* not used */
	}

	/*
	 * Truncate to the maximum segment length if not TSO, and ensure that
	 * FIN is removed if the length no longer contains the last data byte.
	 */
	if (len > segsz) {
		if (!use_tso) {
			len = segsz;
			++segcnt;
		} else {
			int nsegs;

			if (__predict_false(tso_lenmax < segsz))
				tso_lenmax = segsz << 1;

			/*
			 * Truncate TSO transfers to (IP_MAXPACKET - iphlen -
			 * thoff), and make sure that we send equal size
			 * transfers down the stack (rather than big-small-
			 * big-small-...).
			 */
			len = min(len, tso_lenmax);
			nsegs = min(len, (IP_MAXPACKET - tso_hlen)) / segsz;
			KKASSERT(nsegs > 0);

			len = nsegs * segsz;

			if (len <= segsz) {
				use_tso = FALSE;
				++segcnt;
			} else {
				segcnt += nsegs;
			}
		}
		sendalot = TRUE;
	} else {
		use_tso = FALSE;
		if (len > 0)
			++segcnt;
	}
	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc))
		flags &= ~TH_FIN;

	recvwin = ssb_space(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len >= segsz)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.ssb_cc &&
		    !(tp->t_flags & TF_NOPUSH)) {
			goto send;
		}
		if (tp->t_flags & TF_FORCE)		/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
		if (tp->t_flags & TF_XMITNOW)
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 */
	if (recvwin > 0) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);
		long hiwat;

		/*
		 * This ack case typically occurs when the user has drained
		 * the TCP socket buffer sufficiently to warrant an ack
		 * containing a 'pure window update'... that is, an ack that
		 * ONLY updates the tcp window.
		 *
		 * It is unclear why we would need to do a pure window update
		 * past 2 segments if we are going to do one at 1/2 the high
		 * water mark anyway, especially since under normal conditions
		 * the user program will drain the socket buffer quickly.
		 * The 2-segment pure window update will often add a large
		 * number of extra, unnecessary acks to the stream.
		 *
		 * avoid_pure_win_update now defaults to 1.
		 */
		if (avoid_pure_win_update == 0 ||
		    (tp->t_flags & TF_RXRESIZED)) {
			if (adv >= (long) (2 * segsz)) {
				goto send;
			}
		}
		hiwat = (long)(TCP_MAXWIN << tp->rcv_scale);
		if (hiwat > (long)so->so_rcv.ssb_hiwat)
			hiwat = (long)so->so_rcv.ssb_hiwat;
		if (adv >= hiwat / 2)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN)))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if ((flags & TH_FIN) &&
	    (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
		goto send;

	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to insure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tcp_callout_active(tp, tp->tt_persist)
	 *	is true when we are in persist state.
	 * The TF_FORCE flag in tp->t_flags
	 *	is set when we are called to send a persist packet.
	 * tcp_callout_active(tp, tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 *
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can, otherwise force out
	 * a byte.
	 *
	 * Don't try to set the persist state if we are in TCPS_SYN_RECEIVED
	 * with data pending.  This situation can occur during a
	 * simultaneous connect.
	 */
	if (so->so_snd.ssb_cc > 0 &&
	    tp->t_state != TCPS_SYN_RECEIVED &&
	    !tcp_callout_active(tp, tp->tt_rexmt) &&
	    !tcp_callout_active(tp, tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
	tp->t_flags &= ~TF_XMITNOW;
	return (0);

send:
	if (need_sched && len > 0) {
		tcp_output_sched(tp);
		return 0;
	}

	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
	if (isipv6)
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		hdrlen = sizeof(struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if (!(tp->t_flags & TF_NOOPT)) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp));
			memcpy(opt + 2, &mss, sizeof mss);
			optlen = TCPOLEN_MAXSEG;

			if ((tp->t_flags & TF_REQ_SCALE) &&
			    (!(flags & TH_ACK) ||
			     (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}

			if ((tcp_do_sack && !(flags & TH_ACK)) ||
			    tp->t_flags & TF_SACK_PERMITTED) {
				uint32_t *lp = (uint32_t *)(opt + optlen);

				*lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED);
				optlen += TCPOLEN_SACK_PERMITTED_ALIGNED;
			}
		}
	}
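	/*
	 * Byte-level sketch of the options built above for an
	 * active-open SYN, assuming the usual BSD option constants, an
	 * advertised MSS of 1460 and request_r_scale = 3:
	 *
	 *	02 04 05 b4	MSS, kind 2, length 4, value 1460
	 *	01 03 03 03	NOP + window scale, shift count 3
	 *	01 01 04 02	NOP, NOP + SACK permitted, length 2
	 *
	 * giving optlen = 12 with every option 32-bit aligned.
	 */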

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
	    !(flags & TH_RST) &&
	    (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}
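	/*
	 * In the usual BSD headers TCPOPT_TSTAMP_HDR is the RFC 1323
	 * appendix A prefix NOP, NOP, kind 8, length 10 (0x0101080a);
	 * together with the 4-byte TSval and TSecr written above, this
	 * is why optlen grows by TCPOLEN_TSTAMP_APPA (12) bytes.
	 */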

	/* Set receive buffer autosizing timestamp. */
	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
		tp->rfbuf_ts = ticks;

	/*
	 * If this is a SACK connection and we have a block to report,
	 * fill in the SACK blocks in the TCP options.
	 */
	if (report_sack)
		tcp_sack_fill_report(tp, opt, &optlen);

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;
		/*
		 * Initialize TCP-MD5 option (RFC2385)
		 */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;
		/*
		 * Terminate options list and maintain 32-bit alignment.
		 */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */
	KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options"));
	hdrlen += optlen;

	if (isipv6) {
		ipoptlen = ip6_optlen(inp);
	} else {
		if (inp->inp_options) {
			ipoptlen = inp->inp_options->m_len -
			    offsetof(struct ipoption, ipopt_list);
		} else {
//-------------------------------------------------------------------------//
// TCP timer processing.
//-------------------------------------------------------------------------//
static
struct tcpcb * tcp_timers(
    Node *node,
    struct tcpcb *tp,
    int timer,
    UInt32 tcp_now,
    struct tcpstat *tcp_stat)
{
    int rexmt;
    TransportDataTcp *tcpLayer = (TransportDataTcp *)
                                 node->transportData.tcp;

    switch (timer) {

    //
    // 2 MSL timeout in shutdown went off.  If we're closed but
    // still waiting for peer to close and connection has been idle
    // too long, or if 2MSL time is up from TIME_WAIT, delete connection
    // control block.  Otherwise, check again in a bit.
    //
    case TCPT_2MSL:
        if (tp->t_state != TCPS_TIME_WAIT &&
            tp->t_idle <= TCPTV_MAXIDLE)
            tp->t_timer[TCPT_2MSL] = TCPTV_KEEPINTVL;
        else {
        	// printf("TCP: Connection closed by timer\n");
            tp = tcp_close(node, tp, tcp_stat);
        }
        break;

    //
    // Retransmission timer went off.  Message has not
    // been acked within retransmit interval.  Back off
    // to a longer retransmit interval and retransmit one segment.
    //
    case TCPT_REXMT:
        if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
            tp->t_rxtshift = TCP_MAXRXTSHIFT;
            //if (tcp_stat)
                //tcp_stat->tcps_timeoutdrop++;
            printf("TCP: Retransmission timer went off\n");
            tp = tcp_drop(node, tp, tcp_now, tcp_stat);
            break;
        }
        //if (tcp_stat)
            //tcp_stat->tcps_rexmttimeo++;
        rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];

        TCPT_RANGESET(tp->t_rxtcur, rexmt,
                      tp->t_rttmin, TCPTV_REXMTMAX);
        tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
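        //
        // Illustration, assuming the classic BSD backoff table that this
        // port appears to follow: tcp_backoff[] is typically
        // { 1, 2, 4, 8, 16, 32, 64, 64, ... }, so with a base RTO of one
        // second the retransmit interval grows 1s, 2s, 4s, ..., clamped
        // between t_rttmin and TCPTV_REXMTMAX by TCPT_RANGESET above.
        //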

        //
        // If we backed off this far,
        // our srtt estimate is probably bogus.  Clobber it
        // so we'll take the next rtt measurement as our srtt;
        // move the current srtt into rttvar to keep the current
        // retransmit times until then.
        //
        if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {

            tp->t_rttvar +=
                (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));

            tp->t_srtt = 0;
        }
        tp->snd_nxt = tp->snd_una;

        if (TCP_VARIANT_IS_SACK(tp) && tp->isSackFastRextOn) {
            TransportTcpSackRextTimeoutInit(tp);
            TransportTcpTrace(node, 0, 0, "Faxt: timeout");
        }

        // Force a segment to be sent.
        tp->t_flags |= TF_ACKNOW;

        // If timing a segment in this window, stop the timer.
        // The retransmitted segment shouldn't be timed.
        tp->t_rtt = 0;

        //
        // Close the congestion window down to one segment
        // (we'll open it by one segment for each ack we get).
        // Since we probably have a window's worth of unacked
        // data accumulated, this "slow start" keeps us from
        // dumping all that data as back-to-back packets (which
        // might overwhelm an intermediate gateway).
        //
        // There are two phases to the opening: Initially we
        // open by one mss on each ack.  This makes the window
        // size increase exponentially with time.  If the
        // window is larger than the path can handle, this
        // exponential growth results in dropped packet(s)
        // almost immediately.  To get more time between
        // drops but still "push" the network to take advantage
        // of improving conditions, we switch from exponential
        // to linear window opening at some threshold size.
        // For a threshold, we use half the current window
        // size, truncated to a multiple of the mss.
        //
        // (the minimum cwnd that will give us exponential
        // growth is 2 mss.  We don't allow the threshold
        // to go below this.)
        //
        {
            unsigned int win;
            win = MIN(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
            if (win < 2)
                win = 2;
            tp->snd_cwnd = tp->t_maxseg;

            tp->snd_ssthresh = win * tp->t_maxseg;
            tp->t_partialacks = -1;
            tp->t_dupacks = 0;
        }
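        //
        // Worked example (illustrative numbers): with snd_wnd = 65535,
        // snd_cwnd = 23360 and t_maxseg = 1460, win = 23360 / 2 / 1460 = 8,
        // so snd_ssthresh becomes 8 * 1460 = 11680 bytes while snd_cwnd
        // drops back to a single 1460-byte segment.
        //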
        tp->t_ecnFlags |= TF_CWND_REDUCED;
        TransportTcpTrace(node, 0, 0, "Rext: timeout");

        //
        // To eliminate the problem of multiple Fast Retransmits, we use the
        // variable "send_high", whose initial value is the initial send
        // sequence number. After each retransmit timeout, the highest
        // sequence number transmitted so far is recorded in "send_high".
        //
        if (TCP_VARIANT_IS_NEWRENO(tp)) {
            tp->send_high = tp->snd_max;
        }

        tcp_output(node, tp, tcp_now, tcp_stat);
        break;

    //
    // Persistence timer into zero window.
    // Force a byte to be output, if possible.
    //
    case TCPT_PERSIST:
        //if (tcp_stat)
            //tcp_stat->tcps_persisttimeo++;
        //
        // Hack: if the peer is dead/unreachable, we do not
        // time out if the window is closed.  After a full
        // backoff, drop the connection if the idle time
        // (no responses to probes) reaches the maximum
        // backoff that we would use if retransmitting.
        //
        if (tp->t_rxtshift == TCP_MAXRXTSHIFT) {
            UInt32 maxidle = TCP_REXMTVAL(tp);
            if (maxidle < tp->t_rttmin)
                maxidle = tp->t_rttmin;
            maxidle *= tcp_totbackoff;
            if (tp->t_idle >= TCPTV_KEEP_IDLE ||
                tp->t_idle >= maxidle) {
                //if (tcp_stat)
                    //tcp_stat->tcps_persistdrop++;
            	printf("TCP: Idle timer went off\n");
                tp = tcp_drop(node, tp, tcp_now, tcp_stat);
                break;
            }
        }
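        //
        // Rough illustration (assumed values): if TCP_REXMTVAL(tp)
        // resolves to one second and tcp_totbackoff is the sum of the
        // backoff table (511 in BSD-derived stacks), maxidle is about
        // 511 seconds, so a fully backed-off connection whose probes go
        // unanswered for that long (or for TCPTV_KEEP_IDLE) is dropped.
        //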
        tcp_setpersist(tp);
        tp->t_force = 1;
        tcp_output(node, tp, tcp_now, tcp_stat);
        tp->t_force = 0;
        break;

    //
    // Keep-alive timer went off; send something
    // or drop connection if idle for too long.
    //
    case TCPT_KEEP:
        //if (tcp_stat)
            //tcp_stat->tcps_keeptimeo++;
        if (tp->t_state < TCPS_ESTABLISHED) {
            printf("TCP: Keep-alive timer went off before established\n");
            goto dropit;
        }
        if (tcpLayer->tcpUseKeepAliveProbes && tp->t_state <= TCPS_CLOSING) {

            //
            // If the connection has been idle for more than the sum of
            // TCPTV_KEEP_IDLE (set to 2 hours) and TCPTV_MAXIDLE
            // (set to the total time taken to send all the probes),
            // it's time to drop the connection.
            //
            if (tp->t_idle >= TCPTV_KEEP_IDLE + TCPTV_MAXIDLE) {
                printf("TCP: Keep-alive timer went off\n");
                goto dropit;
            }

            //
            // Send a packet designed to force a response
            // if the peer is up and reachable:
            // either an ACK if the connection is still alive,
            // or an RST if the peer has closed the connection
            // due to timeout or reboot.
            // Using sequence number tp->snd_una-1
            // causes the transmitted zero-length segment
            // to lie outside the receive window;
            // by the protocol spec, this requires the
            // correspondent TCP to respond.
            //
            //if (tcp_stat)
                //tcp_stat->tcps_keepprobe++;
            tcp_respond(node, tp, tp->t_template,
                        0, tp->rcv_nxt, tp->snd_una - 1,
                        0, tcp_stat);

            tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL;
        } else {
            //
            // If the tcpUseKeepAliveProbes is FALSE
            // or the connection state is greater than TCPS_CLOSING,
            // reset the keepalive timer to TCPTV_KEEP_IDLE.
            //
            tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE;
        }
        break;
    dropit:
        //if (tcp_stat) {
            //
            // Note that this counter counts connection drops due to
            // failure in connection establishment and the keepalive
            // timer timeouts
            //
            //tcp_stat->tcps_keepdrops++;
        //}
	    // printf("TCP: Unknown timer went off\n");
        tp = tcp_drop(node, tp, tcp_now, tcp_stat);
        break;
    }
Beispiel #9
0
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	long len, recwin, sendwin;
	int off, flags, error;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip = NULL;
	struct ipovly *ipov = NULL;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned ipoptlen, optlen, hdrlen;
	int idle, sendalot;
	int i, sack_rxmit;
	int sack_bytes_rxmt;
	struct sackhole *p;
#if 0
	int maxburst = TCP_MAXBURST;
#endif
	struct rmxp_tao tao;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;

	bzero(&tao, sizeof(tao));
	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
#ifdef TCP_ECN
	int needect;
#endif

	INP_LOCK_ASSERT(tp->t_inpcb);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
		/*
		 * We have been idle for "a while" and no acks are
		 * expected to clock out any data we send --
		 * slow start to get ack "clock" running again.
		 *
		 * Set the slow-start flight size depending on whether
		 * this is a local network or not.
		 */
		int ss = ss_fltsz;
#ifdef INET6
		if (isipv6) {
			if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
				ss = ss_fltsz_local;
		} else
#endif
		if (in_localaddr(tp->t_inpcb->inp_faddr))
			ss = ss_fltsz_local;
		tp->snd_cwnd = tp->t_maxseg * ss;
	}
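	/*
	 * Example with commonly used defaults (assumed here to be
	 * ss_fltsz = 1 and ss_fltsz_local = 4): after being idle longer
	 * than t_rxtcur, a connection with t_maxseg = 1460 restarts with
	 * snd_cwnd = 1460 bytes toward a remote peer, or 5840 bytes
	 * toward a host on a local subnet.
	 */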
	tp->t_flags &= ~TF_LASTIDLE;
	if (idle) {
		if (tp->t_flags & TF_MORETOCOME) {
			tp->t_flags |= TF_LASTIDLE;
			idle = 0;
		}
	}
again:
	/*
	 * If we've recently taken a timeout, snd_max will be greater than
	 * snd_nxt.  There may be SACK information that allows us to avoid
	 * resending already delivered data.  Adjust snd_nxt accordingly.
	 */
	if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
		tcp_sack_adjust(tp);
	sendalot = 0;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Send any SACK-generated retransmissions.  If we're explicitly trying
	 * to send out new data (when sendalot is 1), bypass this function.
	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
	 * we're replacing a (future) new transmission with a retransmission
	 * now, and we previously incremented snd_cwnd in tcp_input().
	 */
	/*
	 * Still in SACK recovery, reset the rxmit flag to zero.
	 */
	sack_rxmit = 0;
	sack_bytes_rxmt = 0;
	len = 0;
	p = NULL;
	if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
		long cwin;
		
		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
		if (cwin < 0)
			cwin = 0;
		/* Do not retransmit SACK segments beyond snd_recover */
		if (SEQ_GT(p->end, tp->snd_recover)) {
			/*
			 * (At least) part of sack hole extends beyond
			 * snd_recover. Check to see if we can rexmit data
			 * for this hole.
			 */
			if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
				/*
				 * Can't rexmit any more data for this hole.
				 * That data will be rexmitted in the next
				 * sack recovery episode, when snd_recover
				 * moves past p->rxmit.
				 */
				p = NULL;
				goto after_sack_rexmit;
			} else
				/* Can rexmit part of the current hole */
				len = ((long)ulmin(cwin,
						   tp->snd_recover - p->rxmit));
		} else
			len = ((long)ulmin(cwin, p->end - p->rxmit));
		off = p->rxmit - tp->snd_una;
		KASSERT(off >= 0,("%s: sack block to the left of una : %d",
		    __func__, off));
		if (len > 0) {
			sack_rxmit = 1;
			sendalot = 1;
			tcpstat.tcps_sack_rexmits++;
			tcpstat.tcps_sack_rexmit_bytes +=
			    min(len, tp->t_maxseg);
		}
	}
after_sack_rexmit:
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	SOCKBUF_LOCK(&so->so_snd);
	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_force) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.sb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			callout_stop(tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 *
	 * If sack_rxmit is true we are retransmitting from the scoreboard
	 * in which case len is already set.
	 */
	if (sack_rxmit == 0) {
		if (sack_bytes_rxmt == 0)
			len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
		else {
			long cwin;

			/*
			 * We are inside of a SACK recovery episode and are
			 * sending new data, having retransmitted all the
			 * data possible in the scoreboard.
			 */
			len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) 
			       - off);
			/*
			 * Don't remove this (len > 0) check !
			 * We explicitly check for len > 0 here (although it 
			 * isn't really necessary), to work around a gcc 
			 * optimization issue - to force gcc to compute
			 * len above. Without this check, the computation
			 * of len is bungled by the optimizer.
			 */
			if (len > 0) {
				cwin = tp->snd_cwnd - 
					(tp->snd_nxt - tp->sack_newdata) -
					sack_bytes_rxmt;
				if (cwin < 0)
					cwin = 0;
				len = lmin(len, cwin);
			}
		}
	}
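	/*
	 * Illustrative numbers for the new-data branch above: with
	 * snd_cwnd = 14600, 5840 bytes already sent past sack_newdata
	 * (snd_nxt - sack_newdata) and sack_bytes_rxmt = 2920,
	 * cwin = 14600 - 5840 - 2920 = 5840, so at most 5840 bytes of
	 * new data may be sent in this pass.
	 */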

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data and if we don't
	 * know that foreign host supports TAO, suppress sending segment.
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (tcp_do_rfc1644)
			tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao);
		if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
		     tao.tao_ccsent == 0)
			goto just_return;
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments
	 * in cases when no CC option will be sent.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if ((flags & TH_SYN) &&
	    ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
	     ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * If FIN has been sent but not acked,
		 * but we haven't been called to retransmit,
		 * len will be < 0.  Otherwise, window shrank
		 * after we sent into it.  If window shrank to 0,
		 * cancel pending retransmit, pull snd_nxt back
		 * to (closed) window, and set the persist timer
		 * if it isn't already going.  If the window didn't
		 * close completely, just wait for an ACK.
		 */
		len = 0;
		if (sendwin == 0) {
			callout_stop(tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!callout_active(tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	/*
	 * len will be >= 0 after this point.  Truncate to the maximum
	 * segment length and ensure that FIN is removed if the length
	 * no longer contains the last data byte.
	 */
	if (len > tp->t_maxseg) {
		len = tp->t_maxseg;
		sendalot = 1;
	}
	if (sack_rxmit) {
		if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
			flags &= ~TH_FIN;
	} else {
		if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
			flags &= ~TH_FIN;
	}

	recwin = sbspace(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (the receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len == tp->t_maxseg)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.sb_cc &&
		    (tp->t_flags & TF_NOPUSH) == 0) {
			goto send;
		}
		if (tp->t_force)			/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
		if (sack_rxmit)
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 * Skip this if the connection is in T/TCP half-open state.
	 */
	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);

		if (adv >= (long) (2 * tp->t_maxseg))
			goto send;
		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if (flags & TH_FIN &&
	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
		goto send;
	/*
	 * In SACK, it is possible for tcp_output to fail to send a segment
	 * after the retransmission timer has been turned off.  Make sure
	 * that the retransmission timer is set.
	 */
	if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
	    !callout_active(tp->tt_rexmt) &&
	    !callout_active(tp->tt_persist)) {
		callout_reset(tp->tt_rexmt, tp->t_rxtcur,
			      tcp_timer_rexmt, tp);
		goto just_return;
	} 
	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to insure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * callout_active(tp->tt_persist)
	 *	is true when we are in persist state.
	 * tp->t_force
	 *	is set when we are called to send a persist packet.
	 * callout_active(tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can,
	 * otherwise force out a byte.
	 */
	if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
	    !callout_active(tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
just_return:
	SOCKBUF_UNLOCK(&so->so_snd);
	return (0);

send:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
#ifdef INET6
	if (isipv6)
		hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	else
#endif
	hdrlen = sizeof (struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if ((tp->t_flags & TF_NOOPT) == 0) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc));
			(void)memcpy(opt + 2, &mss, sizeof(mss));
			optlen = TCPOLEN_MAXSEG;

			/*
			 * If this is the first SYN of connection (not a SYN
			 * ACK), include SACK_PERMIT_HDR option.  If this is a
			 * SYN ACK, include SACK_PERMIT_HDR option if peer has
			 * already done so. This is only for active connect,
			 * since the syncache takes care of the passive connect.
			 */
			if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
			    (tp->t_flags & TF_SACK_PERMIT))) {
				*((u_int32_t *) (opt + optlen)) =
					htonl(TCPOPT_SACK_PERMIT_HDR);
				optlen += 4;
			}
			if ((tp->t_flags & TF_REQ_SCALE) &&
			    ((flags & TH_ACK) == 0 ||
			    (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
	    (flags & TH_RST) == 0 &&
	    ((flags & TH_ACK) == 0 ||
	     (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/*
	 * Send SACKs if necessary.  This should be the last option processed.
	 * Only as many SACKs are sent as are permitted by the maximum options
	 * size.  No more than three SACKs are sent.
	 */
	if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
	    tp->rcv_numsacks) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);
		u_int32_t *olp = lp++;
		int count = 0;  /* actual number of SACKs inserted */
		int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;

		tcpstat.tcps_sack_send_blocks++;
		maxsack = min(maxsack, TCP_MAX_SACK);
		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
			struct sackblk sack = tp->sackblks[i];
			if (sack.start == 0 && sack.end == 0)
				continue;
			*lp++ = htonl(sack.start);
			*lp++ = htonl(sack.end);
			count++;
		}
		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
		optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
	}
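	/*
	 * Size sketch using the standard option constants: each SACK
	 * block is TCPOLEN_SACK = 8 bytes and TCPOPT_SACK_HDR supplies
	 * the leading NOP, NOP plus the kind byte, so reporting two
	 * blocks writes 4 + 2 * 8 = 20 option bytes with the length
	 * byte set to 2 + 16 = 18.
	 */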
	/*
	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
	 * options are allowed (!TF_NOOPT) and it's not a RST.
	 */
	if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
	     (flags & TH_RST) == 0) {
		switch (flags & (TH_SYN|TH_ACK)) {
		/*
		 * This is a normal ACK, send CC if we received CC before
		 * from our peer.
		 */
		case TH_ACK:
			if (!(tp->t_flags & TF_RCVD_CC))
				break;
			/*FALLTHROUGH*/

		/*
		 * We can only get here in T/TCP's SYN_SENT* state, when
		 * we're sending a non-SYN segment without waiting for
		 * the ACK of our SYN.  A check above assures that we only
		 * do this if our peer understands T/TCP.
		 */
		case 0:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);

			optlen += 4;
			break;

		/*
		 * This is our initial SYN, check whether we have to use
		 * CC or CC.new.
		 */
		case TH_SYN:
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = TCPOPT_NOP;
			opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
						TCPOPT_CCNEW : TCPOPT_CC;
			opt[optlen++] = TCPOLEN_CC;
			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
			optlen += 4;
			break;

		/*
		 * This is a SYN,ACK; send CC and CC.echo if we received
		 * CC from our peer.
		 */
		case (TH_SYN|TH_ACK):
			if (tp->t_flags & TF_RCVD_CC) {
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CC;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_send);
				optlen += 4;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_NOP;
				opt[optlen++] = TCPOPT_CCECHO;
				opt[optlen++] = TCPOLEN_CC;
				*(u_int32_t *)&opt[optlen] =
					htonl(tp->cc_recv);
				optlen += 4;
			}
			break;
		}
	}

#ifdef TCP_SIGNATURE
#ifdef INET6
	if (!isipv6)
#endif
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;

		/* Initialize TCP-MD5 option (RFC2385) */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;

		/* Terminate options list and maintain 32-bit alignment. */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */

	hdrlen += optlen;

#ifdef INET6
	if (isipv6)
		ipoptlen = ip6_optlen(tp->t_inpcb);
	else
#endif
	if (tp->t_inpcb->inp_options)
		ipoptlen = tp->t_inpcb->inp_options->m_len -
				offsetof(struct ipoption, ipopt_list);
	else
Beispiel #10
0
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(PNATState pData, register struct tcpcb *tp)
{
    register struct socket *so = tp->t_socket;
    register long len, win;
    int off, flags, error;
    register struct mbuf *m = NULL;
    register struct tcpiphdr *ti;
    u_char opt[MAX_TCPOPTLEN];
    unsigned optlen, hdrlen;
    int idle, sendalot;
    int size = 0;

    LogFlowFunc(("ENTER: tcp_output: tp = %R[tcpcb793]\n", tp));

    /*
     * Determine length of data that should be transmitted,
     * and flags that will be used.
     * If there is some data or critical controls (SYN, RST)
     * to send, then transmit; otherwise, investigate further.
     */
    idle = (tp->snd_max == tp->snd_una);
    if (idle && tp->t_idle >= tp->t_rxtcur)
        /*
         * We have been idle for "a while" and no acks are
         * expected to clock out any data we send --
         * slow start to get ack "clock" running again.
         */
        tp->snd_cwnd = tp->t_maxseg;

again:
    sendalot = 0;
    off = tp->snd_nxt - tp->snd_una;
    win = min(tp->snd_wnd, tp->snd_cwnd);

    flags = tcp_outflags[tp->t_state];

    Log2((" --- tcp_output flags = 0x%x\n", flags));

    /*
     * If in persist timeout with window of 0, send 1 byte.
     * Otherwise, if window is small but nonzero
     * and timer expired, we will send what we can
     * and go to transmit state.
     */
    if (tp->t_force)
    {
        if (win == 0)
        {
            /*
             * If we still have some data to send, then
             * clear the FIN bit.  Usually this would
             * happen below when it realizes that we
             * aren't sending all the data.  However,
             * if we have exactly 1 byte of unsent data,
             * then it won't clear the FIN bit below,
             * and if we are in persist state, we wind
             * up sending the packet without recording
             * that we sent the FIN bit.
             *
             * We can't just blindly clear the FIN bit,
             * because if we don't have any more data
             * to send then the probe will be the FIN
             * itself.
             */
            if (off < SBUF_LEN(&so->so_snd))
                flags &= ~TH_FIN;
            win = 1;
        }
        else
        {
            tp->t_timer[TCPT_PERSIST] = 0;
            tp->t_rxtshift = 0;
        }
    }

    len = min(SBUF_LEN(&so->so_snd), win) - off;
    if (len < 0)
    {
        /*
         * If FIN has been sent but not acked,
         * but we haven't been called to retransmit,
         * len will be -1.  Otherwise, window shrank
         * after we sent into it.  If window shrank to 0,
         * cancel pending retransmit and pull snd_nxt
         * back to (closed) window.  We will enter persist
         * state below.  If the window didn't close completely,
         * just wait for an ACK.
         */
        len = 0;
        if (win == 0)
        {
            tp->t_timer[TCPT_REXMT] = 0;
            tp->snd_nxt = tp->snd_una;
        }
    }
    if (len > tp->t_maxseg)
    {
        len = tp->t_maxseg;
        sendalot = 1;
    }
    if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + SBUF_LEN(&so->so_snd)))
        flags &= ~TH_FIN;

    win = sbspace(&so->so_rcv);

    /*
     * Sender silly window avoidance.  If the connection is idle
     * and we can send all the data, or a full-size segment, or at
     * least a maximum default-size segment, or we are forced,
     * then send; otherwise don't bother.
     * If peer's buffer is tiny, then send
     * when window is at least half open.
     * If retransmitting (possibly after persist timer forced us
     * to send into a small window), then must resend.
     */
    if (len)
    {
        if (len == tp->t_maxseg)
            goto send;
        if ((1 || idle || tp->t_flags & TF_NODELAY) &&
                len + off >= SBUF_LEN(&so->so_snd))
            goto send;
        if (tp->t_force)
            goto send;
        if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
            goto send;
        if (SEQ_LT(tp->snd_nxt, tp->snd_max))
            goto send;
    }

    /*
     * Compare available window to amount of window
     * known to peer (as advertised window less
     * next expected input).  If the difference is at least two
     * max size segments, or at least 50% of the maximum possible
     * window, then want to send a window update to peer.
     */
    if (win > 0)
    {
        /*
         * "adv" is the amount we can increase the window,
         * taking into account that we are limited by
         * TCP_MAXWIN << tp->rcv_scale.
         */
        long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale);
        if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
            adv -= tp->rcv_adv - tp->rcv_nxt;

        if (adv >= (long) (2 * tp->t_maxseg))
            goto send;
        if (2 * adv >= (long) SBUF_SIZE(&so->so_rcv))
            goto send;
    }

    /*
     * Send if we owe peer an ACK.
     */
    if (tp->t_flags & TF_ACKNOW)
        goto send;
    if (flags & (TH_SYN|TH_RST))
        goto send;
    if (SEQ_GT(tp->snd_up, tp->snd_una))
        goto send;
    /*
     * If our state indicates that FIN should be sent
     * and we have not yet done so, or we're retransmitting the FIN,
     * then we need to send.
     */
    if (   flags & TH_FIN
        && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
        goto send;

    /*
     * TCP window updates are not reliable, rather a polling protocol
     * using ``persist'' packets is used to insure receipt of window
     * updates.  The three ``states'' for the output side are:
     *      idle                    not doing retransmits or persists
     *      persisting              to move a small or zero window
     *      (re)transmitting        and thereby not persisting
     *
     * tp->t_timer[TCPT_PERSIST]
     *      is set when we are in persist state.
     * tp->t_force
     *      is set when we are called to send a persist packet.
     * tp->t_timer[TCPT_REXMT]
     *      is set when we are retransmitting
     * The output side is idle when both timers are zero.
     *
     * If send window is too small, there is data to transmit, and no
     * retransmit or persist is pending, then go to persist state.
     * If nothing happens soon, send when timer expires:
     * if window is nonzero, transmit what we can,
     * otherwise force out a byte.
     */
    if (   SBUF_LEN(&so->so_snd)
        && tp->t_timer[TCPT_REXMT] == 0
        && tp->t_timer[TCPT_PERSIST] == 0)
    {
        tp->t_rxtshift = 0;
        tcp_setpersist(tp);
    }

    /*
     * No reason to send a segment, just return.
     */
    tcpstat.tcps_didnuttin++;

    LogFlowFuncLeave();
    return (0);

send:
    LogFlowFunc(("send\n"));
    /*
     * Before ESTABLISHED, force sending of initial options
     * unless TCP set not to do any options.
     * NOTE: we assume that the IP/TCP header plus TCP options
     * always fit in a single mbuf, leaving room for a maximum
     * link header, i.e.
     *      max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
     */
    optlen = 0;
    hdrlen = sizeof (struct tcpiphdr);
    if (flags & TH_SYN)
    {
        tp->snd_nxt = tp->iss;
        if ((tp->t_flags & TF_NOOPT) == 0)
        {
            u_int16_t mss;

            opt[0] = TCPOPT_MAXSEG;
            opt[1] = 4;
            mss = RT_H2N_U16((u_int16_t) tcp_mss(pData, tp, 0));
            memcpy((caddr_t)(opt + 2), (caddr_t)&mss, sizeof(mss));
            optlen = 4;

#if 0
            if (   (tp->t_flags & TF_REQ_SCALE)
                && (   (flags & TH_ACK) == 0
                    || (tp->t_flags & TF_RCVD_SCALE)))
            {
                *((u_int32_t *) (opt + optlen)) = RT_H2N_U32(  TCPOPT_NOP << 24
                                                             | TCPOPT_WINDOW << 16
                                                             | TCPOLEN_WINDOW << 8
                                                             | tp->request_r_scale);
                optlen += 4;
            }
#endif
        }
    }

    /*
     * Send a timestamp and echo-reply if this is a SYN and our side
     * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
     * and our peer have sent timestamps in our SYN's.
     */
#if 0
    if (   (tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP
        && (flags & TH_RST) == 0
        && (   (flags & (TH_SYN|TH_ACK)) == TH_SYN
            || (tp->t_flags & TF_RCVD_TSTMP)))
    {
        u_int32_t *lp = (u_int32_t *)(opt + optlen);

        /* Form timestamp option as shown in appendix A of RFC 1323. */
        *lp++ = RT_H2N_U32_C(TCPOPT_TSTAMP_HDR);
        *lp++ = RT_H2N_U32(tcp_now);
        *lp   = RT_H2N_U32(tp->ts_recent);
        optlen += TCPOLEN_TSTAMP_APPA;
    }
#endif
    hdrlen += optlen;

    /*
     * Adjust data length if insertion of options will
     * bump the packet length beyond the t_maxseg length.
     */
    if (len > tp->t_maxseg - optlen)
    {
        len = tp->t_maxseg - optlen;
        sendalot = 1;
    }
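    /*
     * For example (illustrative values): with t_maxseg = 1460 and
     * optlen = 4, len is capped at 1456 so that payload plus options
     * still fit within one maximum segment.
     */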

    /*
     * Grab a header mbuf, attaching a copy of data to
     * be transmitted, and initialize the header from
     * the template for sends on this connection.
     */
    if (len)
    {
        if (tp->t_force && len == 1)
            tcpstat.tcps_sndprobe++;
        else if (SEQ_LT(tp->snd_nxt, tp->snd_max))
        {
            tcpstat.tcps_sndrexmitpack++;
            tcpstat.tcps_sndrexmitbyte += len;
        }
        else
        {
            tcpstat.tcps_sndpack++;
            tcpstat.tcps_sndbyte += len;
        }

        size = MCLBYTES;
        if ((len + hdrlen + ETH_HLEN) < MSIZE)
            size = MCLBYTES;
        else if ((len + hdrlen + ETH_HLEN) < MCLBYTES)
            size = MCLBYTES;
        else if((len + hdrlen + ETH_HLEN) < MJUM9BYTES)
            size = MJUM9BYTES;
        else if ((len + hdrlen + ETH_HLEN) < MJUM16BYTES)
            size = MJUM16BYTES;
        else
            AssertMsgFailed(("Unsupported size"));
        m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
        if (m == NULL)
        {
/*          error = ENOBUFS; */
            error = 1;
            goto out;
        }
        m->m_data += if_maxlinkhdr;
        m->m_pkthdr.header = mtod(m, void *);
        m->m_len = hdrlen;

        /*
         * This will always succeed, since we make sure our mbufs
         * are big enough to hold one MSS packet + header + ... etc.
         */
#if 0
        if (len <= MHLEN - hdrlen - max_linkhdr)
        {
#endif
            sbcopy(&so->so_snd, off, (int) len, mtod(m, caddr_t) + hdrlen);
            m->m_len += len;
#if 0
        }
        else
        {
            m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
            if (m->m_next == 0)
                len = 0;
        }
#endif
        /*
         * If we're sending everything we've got, set PUSH.
         * (This will keep happy those implementations which only
         * give data to the user when a buffer fills or
         * a PUSH comes in.)
         */
        if (off + len == SBUF_LEN(&so->so_snd))
            flags |= TH_PUSH;
    }
    else
    {
Beispiel #11
0
/*
 * TCP timer processing.
 */
struct tcpcb*
tcp_timers(struct tcpcb *tp, int timer)
{
	int rexmt;

	switch (timer) {

	/*
	 * 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for peer to close and connection has been idle
	 * too long, or if 2MSL time is up from TIME_WAIT, delete connection
	 * control block.  Otherwise, check again in a bit.
	 * If TIME_WAIT is not set, this is the FIN_WAIT_2 timer.
	 */
	case TCPT_2MSL:
		if (tp->t_state != TCPS_TIME_WAIT &&
		    tp->t_idle <= g_tcp_maxidle)
			tp->t_timer[TCPT_2MSL] = g_tcp_keepintvl;
		else
			tp = tcp_close(tp);
		break;

	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	case TCPT_REXMT:
		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
			tp->t_rxtshift = TCP_MAXRXTSHIFT;
			g_tcpstat.tcps_timeoutdrop++;
			tp = tcp_drop(tp, tp->t_softerror ?
			    tp->t_softerror : ETIMEDOUT);
			break;
		}
		g_tcpstat.tcps_rexmttimeo++;
		rexmt = TCP_REXMTVAL(tp) * g_tcp_backoff[tp->t_rxtshift];
		TCPT_RANGESET(tp->t_rxtcur, rexmt,
		    tp->t_rttmin, TCPTV_REXMTMAX);
		tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
		/*
		 * If losing, let the lower level know and try for
		 * a better route.  Also, if we backed off this far,
		 * our srtt estimate is probably bogus.  Clobber it
		 * so we'll take the next rtt measurement as our srtt;
		 * move the current srtt into rttvar to keep the current
		 * retransmit times until then.
		 */
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
			in_losing(tp->t_inpcb);
			tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
			tp->t_srtt = 0;
		}
		tp->snd_nxt = tp->snd_una;
		/*
		 * If timing a segment in this window, stop the timer.
		 */
		tp->t_rtt = 0;
		/*
		 * Close the congestion window down to one segment
		 * (we'll open it by one segment for each ack we get).
		 * Since we probably have a window's worth of unacked
		 * data accumulated, this "slow start" keeps us from
		 * dumping all that data as back-to-back packets (which
		 * might overwhelm an intermediate gateway).
		 *
		 * There are two phases to the opening: Initially we
		 * open by one mss on each ack.  This makes the window
		 * size increase exponentially with time.  If the
		 * window is larger than the path can handle, this
		 * exponential growth results in dropped packet(s)
		 * almost immediately.  To get more time between 
		 * drops but still "push" the network to take advantage
		 * of improving conditions, we switch from exponential
		 * to linear window opening at some threshold size.
		 * For a threshold, we use half the current window
		 * size, truncated to a multiple of the mss.
		 *
		 * (the minimum cwnd that will give us exponential
		 * growth is 2 mss.  We don't allow the threshold
		 * to go below this.)
		 */
		{
		u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
		if (win < 2)
			win = 2;
		tp->snd_cwnd = tp->t_maxseg;
		tp->snd_ssthresh = win * tp->t_maxseg;
		tp->t_dupacks = 0;
		}
		(void) tcp_output(tp);
		break;

	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	case TCPT_PERSIST:
		g_tcpstat.tcps_persisttimeo++;
		/*
		 * Hack: if the peer is dead/unreachable, we do not
		 * time out if the window is closed.  After a full
		 * backoff, drop the connection if the idle time
		 * (no responses to probes) reaches the maximum
		 * backoff that we would use if retransmitting.
		 */
		if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
		    (tp->t_idle >= g_tcp_maxpersistidle ||
		    tp->t_idle >= TCP_REXMTVAL(tp) * g_tcp_totbackoff)) {
			g_tcpstat.tcps_persistdrop++;
			tp = tcp_drop(tp, ETIMEDOUT);
			break;
		}
		tcp_setpersist(tp);
		tp->t_force = 1;
		(void) tcp_output(tp);
		tp->t_force = 0;
		break;

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	case TCPT_KEEP:
		g_tcpstat.tcps_keeptimeo++;
		if (tp->t_state < TCPS_ESTABLISHED) // connection-establishment timer.
			goto dropit;
		if (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE &&
		    tp->t_state <= TCPS_CLOSE_WAIT) { // keepalive timer.
			if (tp->t_idle >= g_tcp_keepidle + g_tcp_maxidle)
				goto dropit;
			/*
			 * Send a packet designed to force a response
			 * if the peer is up and reachable:
			 * either an ACK if the connection is still alive,
			 * or an RST if the peer has closed the connection
			 * due to timeout or reboot.
			 * Using sequence number tp->snd_una-1
			 * causes the transmitted zero-length segment
			 * to lie outside the receive window;
			 * by the protocol spec, this requires the
			 * correspondent TCP to respond.
			 */
			g_tcpstat.tcps_keepprobe++;
			tcp_respond(tp, tp->t_template, (usn_mbuf_t *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			tp->t_timer[TCPT_KEEP] = g_tcp_keepintvl;
		} else
			tp->t_timer[TCPT_KEEP] = g_tcp_keepidle;
		break;
	dropit:
		g_tcpstat.tcps_keepdrops++;
		tp = tcp_drop(tp, ETIMEDOUT);
		break;
	}