Exemplo n.º 1
0
/* tcp_rcv() - receive-side glue between the IP layer and tcp_input().
 *
 * pkt - received packet; on entry pkt->nb_prot points at the IP header
 *       (not the TCP header).
 *
 * The TCP checksum is verified first. If it matches, the packet is
 * wrapped in an mbuf whose data pointer also starts at the IP header,
 * the IP header's ip_len field is overwritten with the TCP length
 * (IP total length minus IP header length, local byte order), and the
 * mbuf is handed to tcp_input().
 *
 * Returns 0 after tcp_input() has been called, ENP_BAD_HEADER on a
 * checksum mismatch, or ENP_RESOURCE if no mbuf could be obtained. On
 * either error the packet is freed here.
 */
int
tcp_rcv(PACKET pkt)
{
   struct ip *       iph;     /* IP header, berkeley version */
   struct tcphdr *   th;      /* TCP header located via ip_data() */
   struct mbuf *     mb;      /* mbuf wrapper handed to tcp_input() */
   unshort           tcplen;  /* IP total length, later TCP length */
   int               err;     /* return code used by the drop path */

   iph = (struct ip *)pkt->nb_prot;
   tcplen = ntohs(iph->ip_len);     /* total length in local endian */

   /* Check the checksum while ip_len is still untouched; it is only
    * rewritten further down, after the packet has been accepted.
    */
   th = (struct tcphdr *)ip_data(iph);
   if (tcp_cksum(iph) != th->th_sum)
   {
      TCP_MIB_INC(tcpInErrs);       /* MIB statistics */
      tcpstat.tcps_rcvbadsum++;     /* BSD statistics */
      err = ENP_BAD_HEADER;
      goto drop;
   }

   mb = m_getnbuf(MT_RXDATA, 0);
   if (!mb)
   {
      err = ENP_RESOURCE;
      goto drop;
   }

   IN_PROFILER(PF_TCP, PF_ENTRY);   /* start timing TCP processing */

   /* The header length is the low nibble of ip_ver_ihl scaled by 4;
    * remove it so ip_len holds the TCP length the TCP code expects.
    */
   tcplen = (unshort)(tcplen - ((unshort)(iph->ip_ver_ihl & 0x0f) << 2));
   iph->ip_len = tcplen;

   /* describe the packet buffer in the mbuf; data starts at IP header */
   mb->pkt = pkt;
   mb->m_base = pkt->nb_buff;
   mb->m_memsz = pkt->nb_blen;
   mb->m_data = pkt->nb_prot;
   mb->m_len = pkt->nb_plen;

   tcp_input(mb, pkt->net);

   IN_PROFILER(PF_TCP, PF_EXIT);    /* stop timing TCP processing */
   return 0;

drop:
   /* common error exit: give the packet back under the free-queue lock */
   LOCK_NET_RESOURCE(FREEQ_RESID);
   pk_free(pkt);
   UNLOCK_NET_RESOURCE(FREEQ_RESID);
   return err;
}
Exemplo n.º 2
0
int
tcp_output(struct tcpcb * tp)
{
   struct socket *   so =  tp->t_inpcb->inp_socket;
   int   len;
   long  win;
   int   off,  flags,   error;
   struct mbuf *  m;
   struct tcpiphdr * ti;
   unsigned optlen = 0;
   int   idle, sendalot;
   struct mbuf *  sendm;   /* mbuf which contains data to send */
   struct mbuf * tcp_mbuf; /* mbuf containing TCP header */
   int   bufoff;           /* offset of data in sendm->m_data */

#ifdef TCP_SACK
   int   sack_resend;
   int   sack_hole = 0;    /* next sack hole to fill */

   if(tp->t_flags & TF_SACKREPLY)
   {
      /* we are resending based on a received SACK header */
      sack_resend = TRUE;
      tp->t_flags &= ~TF_SACKREPLY;    /* clear flag */
   }
   else
      sack_resend = FALSE;
#endif /* TCP_SACK */
   
   /*
    * Determine length of data that should be transmitted,
    * and flags that will be used.
    * If there is some data or critical controls (SYN, RST)
    * to send, then transmit; otherwise, investigate further.
    */
   idle = (tp->snd_max == tp->snd_una);

again:
   sendalot = 0;
   off = (int)(tp->snd_nxt - tp->snd_una);
   win = (long)tp->snd_wnd;   /* set basic send window */
   if (win > (long)tp->snd_cwnd) /* see if we need congestion control */
   {
      win = (int)(tp->snd_cwnd & ~(ALIGN_TYPE-1)); /* keep data aligned */
   }

   /*
    * If in persist timeout with window of 0, send 1 byte.
    * Otherwise, if window is small but nonzero
    * and timer expired, we will send what we can
    * and go to transmit state.
    */
   if (tp->t_force) 
   {
      if (win == 0)
         win = 1;
      else 
      {
         tp->t_timer[TCPT_PERSIST] = 0;
         tp->t_rxtshift = 0;
      }
   }

#ifdef TCP_SACK
   /* See if we need to adjust the offset for a sack resend */
   if(sack_resend)
   {
      off = (int)(tp->sack_hole_start[sack_hole] - tp->snd_una);
      /* if this hole's already been acked then punt and move to next hole */
      if(off < 0)
      {
         /* clear out the acked hole */
         tp->sack_hole_start[sack_hole] = tp->sack_hole_end[sack_hole] = 0;
         /* see if we're done with SACK hole list (2 tests) */
         if(++sack_hole >= SACK_BLOCKS)
            return 0;
         if(tp->sack_hole_start[sack_hole] == tp->sack_hole_end[sack_hole])
            return 0;
         goto again;
      }
      tp->snd_nxt = tp->sack_hole_start[sack_hole];
      len = (int)(tp->sack_hole_end[sack_hole] - tp->sack_hole_start[sack_hole]);
      len = (int)MIN(len, (int)win);
   }
   else
#endif /* TCP_SACK */
   {
      /* set length of packets which are not sack resends */
      len = (int)MIN(so->so_snd.sb_cc, (unsigned)win) - off;
   }

   flags = tcp_outflags[tp->t_state];


   /* See if we need to build TCP options field. This test should be fast. */

#if (defined(TCP_TIMESTAMP) | defined(TCP_SACK))	   
   if((flags & TH_SYN) ||
/*   !!!???   (so->so_options & SO_TIMESTAMP) ||  */
	  (tp->t_flags & TF_SACKNOW)
	 )
   {
      optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so);
   }
#else
   /* If other options not defined this build then don't bother to call bld_options() except 
    * on SYN packets
    */
   if(flags & TH_SYN)
   {
      optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so);
   }
#endif

   if (len < 0)
   {
      /*
       * If FIN has been sent but not acked,
       * but we haven't been called to retransmit,
       * len will be -1.  Otherwise, window shrank
       * after we sent into it.  If window shrank to 0,
       * cancel pending retransmit and pull snd_nxt
       * back to (closed) window.  We will enter persist
       * state below.  If the window didn't close completely,
       * just wait for an ACK.
       */
      len = 0;
      if (win == 0) 
      {
         tp->t_timer[TCPT_REXMT] = 0;
         tp->snd_nxt = tp->snd_una;
      }
   }

   if (len > (int)tp->t_maxseg)
   {
      len = tp->t_maxseg;
      sendalot = 1;
   }

#ifdef IP_V4
#ifdef IP_PMTU
   {
      int pmtu = tp->t_inpcb->inp_pmtu - 40;

      if (len > pmtu)
      {
         len = pmtu - 40;
         sendalot = 1;
      }
   }
#endif /* IP_PMTU */
   /* We don't need a pmtu test for IPv6. V6 code limits t_maxseg to
    * the Path MTU, so the test above the v4 ifdef above covers us.
    */
#endif /* IP_V4 */

   if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
      flags &= ~TH_FIN;
   win = (long)(sbspace(&so->so_rcv));

   /*
    * If our state indicates that FIN should be sent
    * and we have not yet done so, or we're retransmitting the FIN,
    * then we need to send.
    */
   if ((flags & TH_FIN) &&
       (so->so_snd.sb_cc == 0) &&
       ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
   {
      goto send;
   }
   /*
    * Send if we owe peer an ACK.
    */
   if (tp->t_flags & TF_ACKNOW)
      goto send;
   if (flags & (TH_SYN|TH_RST))
      goto send;
   if (SEQ_GT(tp->snd_up, tp->snd_una))
      goto send;

   /*
    * Sender silly window avoidance.  If connection is idle
    * and can send all data, a maximum segment,
    * at least a maximum default-size segment do it,
    * or are forced, do it; otherwise don't bother.
    * If peer's buffer is tiny, then send
    * when window is at least half open.
    * If retransmitting (possibly after persist timer forced us
    * to send into a small window), then must resend.
    */
   if (len)
   {
      if (len == (int)tp->t_maxseg)
         goto send;
      if ((idle || tp->t_flags & TF_NODELAY) &&
          len + off >= (int)so->so_snd.sb_cc)
      {
         goto send;
      }
      if (tp->t_force)
         goto send;
      if (len >= (int)(tp->max_sndwnd / 2))
         goto send;
      if (SEQ_LT(tp->snd_nxt, tp->snd_max))
         goto send;
   }

   /*
    * Compare available window to amount of window
    * known to peer (as advertised window less
    * next expected input).  If the difference is at least two
    * max size segments or at least 35% of the maximum possible
    * window, then want to send a window update to peer.
    */
   if (win > 0)
   {
      int   adv   =  (int)win -  (int)(tp->rcv_adv -  tp->rcv_nxt);

      if (so->so_rcv.sb_cc == 0 && adv >= (int)(tp->t_maxseg * 2))
         goto send;
      if (100 * (u_int)adv / so->so_rcv.sb_hiwat >= 35)
         goto send;
   }

   /*
    * TCP window updates are not reliable, rather a polling protocol
    * using ``persist'' packets is used to insure receipt of window
    * updates.  The three ``states'' for the output side are:
    *   idle         not doing retransmits or persists
    *   persisting      to move a small or zero window
    *   (re)transmitting   and thereby not persisting
    *
    * tp->t_timer[TCPT_PERSIST]
    *   is set when we are in persist state.
    * tp->t_force
    *   is set when we are called to send a persist packet.
    * tp->t_timer[TCPT_REXMT]
    *   is set when we are retransmitting
    * The output side is idle when both timers are zero.
    *
    * If send window is too small, there is data to transmit, and no
    * retransmit or persist is pending, then go to persist state.
    * If nothing happens soon, send when timer expires:
    * if window is nonzero, transmit what we can,
    * otherwise force out a byte.
    */
   if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
       tp->t_timer[TCPT_PERSIST] == 0) 
   {
      tp->t_rxtshift = 0;
      tcp_setpersist(tp);
   }

   /*
    * No reason to send a segment, just return.
    */
   return (0);

send:
   ENTER_CRIT_SECTION(tp);

   /* Limit send length to the current buffer so as to
    * avoid doing the "mbuf shuffle" in m_copy().
    */
   bufoff = off;
   sendm = so->so_snd.sb_mb;
   if (len)
   {
      /* find mbuf containing data to send (at "off") */
      while (sendm)  /* loop through socket send list */
      {
         bufoff -= sendm->m_len;
         if (bufoff < 0)   /* if off is in this buffer, break */
            break;
         sendm = sendm->m_next;
      }
      if (!sendm) { dtrap();  /* shouldn't happen */ }
      bufoff += sendm->m_len; /* index to next data to send in msend */

      /* if socket has multiple unsent mbufs, set flag for send to loop */
      if ((sendm->m_next) && (len > (int)sendm->m_len))
      {
         flags &= ~TH_FIN; /* don't FIN on segment prior to last */
         sendalot = 1;     /* set to send more segments */
      }
      if((flags & TH_FIN) && (so->so_snd.sb_cc > (unsigned)len))
      {
         /* This can happen on slow links (PPP) which retry the last 
          * segment - the one with the FIN bit attached to data.
          */
         flags &= ~TH_FIN; /* don't FIN on segment prior to last */
      }

      /* only send the rest of msend */
      len = min(len, (int)sendm->m_len);

      /* if we're not sending starting at sendm->m_data (in which 
       * case bufoff != 0), then we will copy the data; else we would 
       * write IP/TCP headers over sent but un-ack'ed data in sendm. 
       * Similarly, if sendm->m_data is not aligned with respect to 
       * sendm->m_base and ALIGN_TYPE, we will copy the data to 
       * ensure that it (and the then-prepended IP/TCP headers) will 
       * be aligned according to ALIGN_TYPE. 
       */
      if ((bufoff != 0) ||       /* data not front aligned in send mbuf? */
          (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) != 0))
      {
         len = min(len, (int)(sendm->m_len - bufoff));   /* limit len again */

         /* One more test - if this data is not aligned with the front
          * of the m_data buffer then we can't use it in place, else we
          * might write the IP/TCP header over data that has not yet
          * been acked. In this case we must make sure our send
          * fits into a little buffer and send what we can.
          */
         if ((len > (int)(lilbufsiz - HDRSLEN)) && /* length is bigger the small buffer? */
             (bigfreeq.q_len < 2))      /* and we are low on big buffers */
         {
            len = lilbufsiz - HDRSLEN;
         }
      }
   }

   /* if send data is sufficiently aligned in packet, prepend TCP/IP header
    * in the space provided. 
    */
   if (len && (bufoff == 0) && 
       (sendm->pkt->inuse == 1) &&
       (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) == 0) && 
       (optlen == 0))
   {
      /* get an empty mbuf to "clone" the data */
      m = m_getnbuf(MT_TXDATA, 0);
      if (!m)
      {
         EXIT_CRIT_SECTION(tp);
         return (ENOBUFS);
      }
      m->pkt = sendm->pkt; /* copy packet location in new mbuf */
      m->pkt->inuse++;     /* bump packet's use count */
      m->m_base = sendm->m_base; /* clone mbuf members */
      m->m_memsz = sendm->m_memsz;
      m->m_len = len + TCPIPHDRSZ;  /* adjust clone for header */
      m->m_data = sendm->m_data - TCPIPHDRSZ;
   }
   else  /* either no data or data is not front aligned in mbuf */
   {
      /* Grab a header mbuf, attaching a copy of data to be 
       * transmitted, and initialize the header from 
       * the template for sends on this connection.
       */
      m = m_getwithdata (MT_HEADER, IFNETHDR_SIZE + TCPIPHDRSZ);
      if (m ==(struct mbuf *)NULL)
      {
         EXIT_CRIT_SECTION(tp);
         return ENOBUFS;
      }

      m->m_len = TCPIPHDRSZ;
      m->m_data += IFNETHDR_SIZE;/* Move this to sizeof tcpip hdr leave*/
      /* 14 bytes for ethernet header      */

      if (len) /* attach any data to send */
      {
         m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
         if (m->m_next == 0)
         {
            m_freem(m);
            EXIT_CRIT_SECTION(tp);
            return ENOBUFS;
         }
      }
   }
   EXIT_CRIT_SECTION(tp);

   if (len) 
   {
      if (tp->t_force && len == 1)
         tcpstat.tcps_sndprobe++;
      else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 
      {
         tcpstat.tcps_sndrexmitpack++;
         tcpstat.tcps_sndrexmitbyte += len;
#ifdef TCP_SACK
      if(sack_resend)
         tcpstat.tcps_sackresend++;
#endif
      } 
      else 
      {
         tcpstat.tcps_sndpack++;
         tcpstat.tcps_sndbyte += len;
      }
   }
   else if (tp->t_flags & TF_ACKNOW)
   {
      tcpstat.tcps_sndacks++;
   }
   else if (flags & (TH_SYN|TH_FIN|TH_RST))
      tcpstat.tcps_sndctrl++;
   else if (SEQ_GT(tp->snd_up, tp->snd_una))
      tcpstat.tcps_sndurg++;
   else
      tcpstat.tcps_sndwinup++;

   ti = (struct tcpiphdr *)(m->m_data+sizeof(struct ip)-sizeof(struct ipovly));
   if ((char *)ti < m->pkt->nb_buff)
   {
      panic("tcp_out- packet ptr underflow\n");
   }
   tcp_mbuf = m;        /* flag TCP header mbuf */

#ifdef IP_V6  /* Dual mode code */
   if(so->so_domain == AF_INET6)
   {
      m = mbuf_prepend(m, sizeof(struct ipv6));
      if(m == NULL)
      {
         /* this can happen when we run out of mbufs or pkt buffers
          * That is, mfreeq is empty or (lilfreeq, bigfreeq) are empty.
          * One solution is to find out which one is getting full and
          * then increase them.
          */
         dtrap();             /* This is really rare... */
         m_freem(tcp_mbuf);   /* Free TCP/data chain */
         return ENOBUFS;
      }

      /* strip overlay from front of TCP header */
      tcp_mbuf->m_data += sizeof(struct ipovly);
      tcp_mbuf->m_len -= sizeof(struct ipovly);
   }
#endif   /* end IP_V6 */

   if (tp->t_template == 0)
      panic("tcp_output");

   MEMCPY((char*)ti, (char*)tp->t_template, sizeof(struct tcpiphdr));

   /*
    * Fill in fields, remembering maximum advertised
    * window for use in delaying messages about window sizes.
    * If resending a FIN, be sure not to use a new sequence number.
    */
   if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 
       tp->snd_nxt == tp->snd_max)
   {
      tp->snd_nxt--;
   }

   ti->ti_seq = htonl(tp->snd_nxt);
   ti->ti_ack = htonl(tp->rcv_nxt);

   /*
    * If we're sending a SYN, check the IP address of the interface
    * that we will (likely) use to send the IP datagram -- if it's
    * changed from what is in the template (as it might if this is
    * a retransmission, and the original SYN caused PPP to start
    * bringing the interface up, and PPP has got a new IP address
    * via IPCP), update the template and the inpcb with the new 
    * address.
    */
   if (flags & TH_SYN)
   {
      struct inpcb * inp;
      inp = (struct inpcb *)so->so_pcb;

      switch(so->so_domain)
      {
#ifdef IP_V4
      case AF_INET:
      {
         ip_addr src;

#ifdef INCLUDE_PPP

         if(((flags & TH_ACK) == 0) && /* SYN only, not SYN/ACK */
            (inp->ifp) &&              /* Make sure we have iface */
            (inp->ifp->mib.ifType == PPP))   /* only PPP type */
         {
            dtrap(); /* remove after confirmed to work in PPP */ 
            src = ip_mymach(ti->ti_dst.s_addr);

         if (src != ti->ti_src.s_addr)
         {
            ti->ti_src.s_addr = src;
            tp->t_template->ti_src.s_addr = src;
            tp->t_inpcb->inp_laddr.s_addr = src;
         }
         }
#endif   /* INCLUDE_PPP */

         /* If this is a SYN (not a SYN/ACK) then set the pmtu */
         if((flags & TH_ACK) == 0)
         {
#ifdef IP_PMTU
            inp->inp_pmtu = pmtucache_get(inp->inp_faddr.s_addr);
#else    /* not compiled for pathmtu, guess based on iface */
            {
               NET ifp;
               /* find iface for route. Pass "src" as nexthop return */
               ifp = iproute(ti->ti_dst.s_addr, &src);
               if(ifp)
                  inp->inp_pmtu = ifp->n_mtu - (ifp->n_lnh + 40);
               else
                  inp->inp_pmtu = 580;  /* Ugh. */
            }
#endif   /* IP_PMTU */
         }
         break;
      }
#endif   /* IP_V4 */

#ifdef IP_V6
      case AF_INET6:
      {
         struct ip6_inaddr * local;
         
         local = ip6_myaddr(&tp->t_inpcb->ip6_faddr, inp->ifp);

         /* If we got a local address & it's not the one in the pcb, then
          * we assume it changed at the iface and fix it in the pcb. Unlike 
          * v4, we don't have an IP header yet, not do we have a template 
          * to worry about.
          */
         if((local) && 
            (!IP6EQ(&local->addr, &tp->t_inpcb->ip6_laddr)))
         {
            IP6CPY(&tp->t_inpcb->ip6_laddr, &local->addr);
         }
         /* If this is a SYN (not a SYN/ACK) then set the pmtu */
         if((flags & TH_ACK) == 0)
         {
            inp->inp_pmtu = ip6_pmtulookup(&inp->ip6_laddr, inp->ifp);
         }
         break;
      }
#endif   /* IP_V6 */
      default:
         dtrap();    /* bad domain setting */
      }
   }

   /* fill in options if any are set */
   if (optlen)
   {
      struct mbuf * mopt;

      mopt = m_getwithdata(MT_TXDATA, MAXOPTLEN);
      if (mopt == NULL) 
      {
         m_freem(m);
         return (ENOBUFS);
      }

      /* insert options mbuf after after tmp_mbuf */
      mopt->m_next = tcp_mbuf->m_next;
      tcp_mbuf->m_next = mopt;

      /* extend options to aligned address */
      while(optlen & 0x03)
         tcp_optionbuf[optlen++] = TCPOPT_EOL;

      MEMCPY(mtod(mopt, char *), tcp_optionbuf, optlen);
      mopt->m_len = optlen;
      /* use portable macro to set tcp data offset bits */
      SET_TH_OFF(ti->ti_t, ((sizeof (struct tcphdr) + optlen) >> 2));
   }

   ti->ti_flags = (u_char)flags;
   /*
    * Calculate receive window. Don't shrink window,
    * but avoid silly window syndrome.
    */
   if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
      win = 0;
   if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
      win = (long)(tp->rcv_adv - tp->rcv_nxt);

   /* do check for Iniche buffer limits -JB- */
   if (bigfreeq.q_len == 0)   /* If queue length is 0, set window to 0 */
   {
      win = 0;
   }
   else if(win > (((long)bigfreeq.q_len - 1) * (long)bigbufsiz))
   {
      win = ((long)bigfreeq.q_len - 1) * bigbufsiz;
   }

#ifdef TCP_WIN_SCALE
   if(tp->t_flags & TF_WINSCALE)
   {
      ti->ti_win = htons((u_short)(win >> tp->rcv_wind_scale)); /* apply scale */
   }
Exemplo n.º 3
0
/* rawip_soinput() - deliver a received raw IP packet to a socket.
 *
 * pkt    - received packet; nb_prot/nb_plen describe the data starting
 *          at the IP header, and fhost supplies the source address
 *          reported to the socket.
 * so_ptr - the receiving socket, passed as a void pointer.
 *
 * The packet is wrapped in an mbuf, the IP header is skipped unless
 * the socket has SO_HDRINCL set, and the mbuf is appended together
 * with the source address to so->so_rcv. Anyone waiting on the receive
 * buffer is then woken. The whole operation runs with NET_RESID held.
 *
 * Returns 0 on success, or ENOBUFS if the receive buffer would reach
 * its high-water mark, no mbuf is available, or sbappendaddr() fails.
 * On ENOBUFS the packet buffer is not freed here; that is left to the
 * caller.
 */
int
rawip_soinput(PACKET pkt, void * so_ptr)
{
   struct mbuf *  m_in;    /* packet/data mbuf */
   struct socket *   so =  (struct  socket *)so_ptr;
   /* Zero the whole address up front: only sin_addr, sin_port and
    * sin_family are assigned below, so without this any remaining
    * members or padding would be passed to sbappendaddr() holding
    * indeterminate stack contents.
    */
   struct sockaddr_in   sin = {0};
   int   err = ENOBUFS;    /* every failure path reports ENOBUFS */

   LOCK_NET_RESOURCE(NET_RESID);

   /* make sure we're not flooding input buffers */
   if ((so->so_rcv.sb_cc + pkt->nb_plen) >= so->so_rcv.sb_hiwat)
      goto done;

   /* alloc mbuf for received data */
   m_in = m_getnbuf(MT_RXDATA, 0);
   if (!m_in)
      goto done;

   /* set data mbuf to point to start of IP header */
   m_in->pkt = pkt;
   m_in->m_base = pkt->nb_buff;
   m_in->m_memsz = pkt->nb_blen;
   m_in->m_data = pkt->nb_prot;
   m_in->m_len = pkt->nb_plen;

   /* if this socket doesn't have SO_HDRINCL set, adjust the
    * mbuf to skip past the IP header (header length is the low
    * nibble of ip_ver_ihl scaled by 4)
    */
   if (!(so->so_options & SO_HDRINCL))
   {
      unsigned int ihl =
         (((struct ip *)(pkt->nb_prot))->ip_ver_ihl & 0x0f) << 2;
      m_in->m_data += ihl;
      m_in->m_len -= ihl;
   }

   /* fill in net address info for pass to socket append()ers */
   sin.sin_addr.s_addr = pkt->fhost;
   sin.sin_port = 0;
   sin.sin_family = AF_INET;

   /* attempt to append address information to mbuf */
   if (!sbappendaddr(&so->so_rcv, (struct sockaddr *)&sin, m_in))
   {
      /* set the pkt field in the mbuf to NULL so m_free() below won't
       * free the packet buffer, because that is left to the
       * underlying stack
       */
      m_in->pkt = NULL;
      /* free only the mbuf itself */
      m_free(m_in);
      /* fall out with ENOBUFS so caller can free the packet buffer */
      goto done;
   }

   tcp_wakeup(&so->so_rcv);   /* wake anyone waiting for this */

   sorwakeup(so);    /* wake up selects too */

   err = 0;

done:
   /* single exit: the lock taken above is released on every path */
   UNLOCK_NET_RESOURCE(NET_RESID);
   return err;
}