Example #1
int tcp_close(struct tcp_pcb * __tp)
{
	int ret;

	if (__tp == NULL) {
		DCC_LOG(LOG_WARNING, "NULL pointer");
		return -1;
	}

	tcpip_net_lock();

#ifdef ENABLE_SANITY
	if ((pcb_find((struct pcb *)__tp, &__tcp__.active) < 0) && 
		(pcb_find((struct pcb *)__tp, &__tcp__.listen) < 0) &&
		(pcb_find((struct pcb *)__tp, &__tcp__.closed) < 0)) {
		DBG(DBG_ERROR, "<%05x> pcb_find()", (int)__tp);
		tcpip_net_unlock();
		return -1;
	}
#endif

	switch(__tp->t_state) {
		case TCPS_LISTEN: {
			ret = tcp_pcb_free(__tp);
			tcpip_net_unlock();
			return ret;
		}

		case TCPS_TIME_WAIT:
		case TCPS_CLOSED:  
		case TCPS_SYN_SENT:
			DCC_LOG2(LOG_TRACE, "<%05x> [%s]", (int)__tp, 
					 __tcp_state[__tp->t_state]);
			if (__tp->t_cond >= 0) {
				__os_cond_free(__tp->t_cond);
				__tp->t_cond = -1;
			}
			ret = tcp_pcb_free(__tp);
			tcpip_net_unlock();
			return ret;

		/* active close */
		case TCPS_SYN_RCVD:
		case TCPS_ESTABLISHED:
			/* Close the receive window */
			/*
			 * XXX: if we close the receive window we may get stuck
			 * in the FIN_WAIT_2 state...
			 */
//			__tp->rcv_wnd = 0;
			__tp->t_state = TCPS_FIN_WAIT_1;
			DCC_LOG1(LOG_TRACE, "<%05x> [FIN_WAIT_1]", (int)__tp);
			break;

		/* passive close */
		case TCPS_CLOSE_WAIT:
			__tp->t_state = TCPS_LAST_ACK;
			DCC_LOG1(LOG_TRACE, "<%05x> [LAST_ACK]", (int)__tp);
			/* discard the data 
			 *  TODO: check whether both buffers must be 
			 * released or not. Probably they were released already.
			 */
			/* discards unsent data */
			__tp->snd_off -= __tp->snd_q.len;
			__tp->snd_max -= __tp->snd_q.len;
			mbuf_queue_free(&__tp->snd_q);
			mbuf_queue_free(&__tp->rcv_q);
			/*  notify the upper layer that we are closed */
			break;

		default: {
			DCC_LOG2(LOG_ERROR, "<%05x> state=[%s]", (int)__tp, 
				__tcp_state[__tp->t_state]);
			tcpip_net_unlock();
			return -1;
		}

	}

	/* ACK now */
	__tp->t_flags |= TF_ACKNOW;
	/* schedule output */
	tcp_output_sched(__tp);

	tcpip_net_unlock();
	return 0;
}
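
A caller-side sketch of the close path above (hypothetical application code: tcp_accept() and the echo handling are assumptions, not taken from this listing). Note that the PCB must not be referenced after tcp_close() returns: the LISTEN/TIME_WAIT/CLOSED/SYN_SENT paths free it immediately, and the active/passive close paths reclaim it once the FIN handshake completes.

/* Hypothetical usage sketch, assuming this stack's tcp_accept()
   and the tcp_recv()/tcp_send() calls shown in the examples below;
   error handling is reduced to a minimum. */
static void serve_one(struct tcp_pcb * listener)
{
	struct tcp_pcb * tp;
	char buf[128];
	int n;

	if ((tp = tcp_accept(listener)) == NULL)
		return;

	while ((n = tcp_recv(tp, buf, sizeof(buf))) > 0)
		tcp_send(tp, buf, n, 0);

	/* do not touch tp after this point: depending on t_state it is
	   either freed here or reclaimed after the FIN exchange */
	tcp_close(tp);
}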
Example #2
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct inpcb * const inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	long len, recvwin, sendwin;
	int nsacked = 0;
	int off, flags, error = 0;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned int ipoptlen, optlen, hdrlen;
	int idle;
	boolean_t sendalot;
	struct ip6_hdr *ip6;
#ifdef INET6
	const boolean_t isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#else
	const boolean_t isipv6 = FALSE;
#endif
	boolean_t can_tso = FALSE, use_tso;
	boolean_t report_sack, idle_cwv = FALSE;
	u_int segsz, tso_hlen, tso_lenmax = 0;
	int segcnt = 0;
	boolean_t need_sched = FALSE;

	KKASSERT(so->so_port == &curthread->td_msgport);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */

	/*
	 * If we have been idle for a while, the send congestion window
	 * could be no longer representative of the current state of the
	 * link; need to validate congestion window.  However, we should
	 * not perform congestion window validation here, since we could
	 * be asked to send pure ACK.
	 */
	if (tp->snd_max == tp->snd_una &&
	    (ticks - tp->snd_last) >= tp->t_rxtcur && tcp_idle_restart)
		idle_cwv = TRUE;

	/*
	 * Calculate whether the transmit stream was previously idle 
	 * and adjust TF_LASTIDLE for the next time.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (tp->t_flags & TF_MORETOCOME))
		tp->t_flags |= TF_LASTIDLE;
	else
		tp->t_flags &= ~TF_LASTIDLE;

	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp))
		nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt);

	/*
	 * Find out whether TSO could be used or not
	 *
	 * For TSO capable devices, the following assumptions apply to
	 * the processing of TCP flags:
	 * - If FIN is set on the large TCP segment, the device must set
	 *   FIN on the last segment that it creates from the large TCP
	 *   segment.
	 * - If PUSH is set on the large TCP segment, the device must set
	 *   PUSH on the last segment that it creates from the large TCP
	 *   segment.
	 */
#if !defined(IPSEC) && !defined(FAST_IPSEC)
	if (tcp_do_tso
#ifdef TCP_SIGNATURE
	    && (tp->t_flags & TF_SIGNATURE) == 0
#endif
	) {
		if (!isipv6) {
			struct rtentry *rt = inp->inp_route.ro_rt;

			if (rt != NULL && (rt->rt_flags & RTF_UP) &&
			    (rt->rt_ifp->if_hwassist & CSUM_TSO)) {
				can_tso = TRUE;
				tso_lenmax = rt->rt_ifp->if_tsolen;
			}
		}
	}
#endif	/* !IPSEC && !FAST_IPSEC */

again:
	m = NULL;
	ip = NULL;
	th = NULL;
	ip6 = NULL;

	if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) ==
		TF_SACK_PERMITTED &&
	    (!TAILQ_EMPTY(&tp->t_segq) ||
	     tp->reportblk.rblk_start != tp->reportblk.rblk_end))
		report_sack = TRUE;
	else
		report_sack = FALSE;

	/* Make use of SACK information when slow-starting after a RTO. */
	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp)) {
		tcp_seq old_snd_nxt = tp->snd_nxt;

		tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt);
		nsacked += tp->snd_nxt - old_snd_nxt;
	}

	sendalot = FALSE;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_flags & TF_FORCE) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.ssb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			tcp_callout_stop(tp, tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * A negative length can also occur when we are in the
	 * TCPS_SYN_RECEIVED state due to a simultaneous connect where
	 * our SYN has not been acked yet.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 */
	len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off;

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data, suppress sending
	 * segment (sending the segment would be an option if we still
	 * did TAO and the remote host supported it).
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
			tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW);
			return 0;
		}
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if (flags & TH_SYN) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * A negative len can occur if our FIN has been sent but not
		 * acked, or if we are in a simultaneous connect in the
		 * TCPS_SYN_RECEIVED state with our SYN sent but not yet
		 * acked.
		 *
		 * If our window has contracted to 0 in the FIN case
		 * (which can only occur if we have NOT been called to
		 * retransmit as per code a few paragraphs up) then we
		 * want to shift the retransmit timer over to the
		 * persist timer.
		 *
		 * However, if we are in the TCPS_SYN_RECEIVED state
		 * (the SYN case) we will be in a simultaneous connect and
		 * the window may be zero degeneratively.  In this case we
		 * do not want to shift to the persist timer after the SYN
		 * or the SYN+ACK transmission.
		 */
		len = 0;
		if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) {
			tcp_callout_stop(tp, tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!tcp_callout_active(tp, tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	KASSERT(len >= 0, ("%s: len < 0", __func__));
	/*
	 * Automatic sizing of send socket buffer.  Often the send buffer
	 * size is not optimally adjusted to the actual network conditions
	 * at hand (delay bandwidth product).  Setting the buffer size too
	 * small limits throughput on links with high bandwidth and high
	 * delay (eg. trans-continental/oceanic links).  Setting the
	 * buffer size too big consumes too much real kernel memory,
	 * especially with many connections on busy servers.
	 *
	 * The criteria to step up the send buffer one notch are:
	 *  1. receive window of remote host is larger than send buffer
	 *     (with a fudge factor of 5/4th);
	 *  2. send buffer is filled to 7/8th with data (so we actually
	 *     have data to make use of it);
	 *  3. send buffer fill has not hit maximal automatic size;
	 *  4. our send window (slow start and congestion controlled) is
	 *     larger than sent but unacknowledged data in send buffer.
	 *
	 * The remote host receive window scaling factor may limit the
	 * growing of the send buffer before it reaches its allowed
	 * maximum.
	 *
	 * It scales directly with slow start or congestion window
	 * and does at most one step per received ACK.  This fast
	 * scaling has the drawback of growing the send buffer beyond
	 * what is strictly necessary to make full use of a given
	 * delay*bandwidth product.  However, testing has shown this not
	 * to be much of a problem.  At worst we are trading wasting
	 * of available bandwidth (the non-use of it) for wasting some
	 * socket buffer memory.
	 *
	 * TODO: Shrink send buffer during idle periods together
	 * with congestion window.  Requires another timer.  Has to
	 * wait for upcoming tcp timer rewrite.
	 */
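	/*
	 * Illustrative numbers (not from this source): with
	 * ssb_hiwat = 32768 and snd_wnd = 49152, criterion 1 holds
	 * since 49152 / 4 * 5 = 61440 >= 32768, and criterion 2 then
	 * requires ssb_cc >= 32768 / 8 * 7 = 28672 before the buffer
	 * is grown by tcp_autosndbuf_inc (capped at tcp_autosndbuf_max).
	 */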
	if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
		    so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
		    so->so_snd.ssb_cc < tcp_autosndbuf_max &&
		    sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
			u_long newsize;

			newsize = ulmin(so->so_snd.ssb_hiwat +
					 tcp_autosndbuf_inc,
					tcp_autosndbuf_max);
			if (!ssb_reserve(&so->so_snd, newsize, so, NULL))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
			if (newsize >= (TCP_MAXWIN << tp->snd_scale))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
		}
	}

	/*
	 * Don't use TSO, if:
	 * - Congestion window needs validation
	 * - There are SACK blocks to report
	 * - RST or SYN flag is set
	 * - URG will be set
	 *
	 * XXX
	 * Checking for SYN|RST looks like overkill, but better safe than sorry
	 */
	use_tso = can_tso;
	if (report_sack || idle_cwv || (flags & (TH_RST | TH_SYN)))
		use_tso = FALSE;
	if (use_tso) {
		tcp_seq ugr_nxt = tp->snd_nxt;

		if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
		    tp->snd_nxt == tp->snd_max)
			--ugr_nxt;

		if (SEQ_GT(tp->snd_up, ugr_nxt))
			use_tso = FALSE;
	}

	if (use_tso) {
		/*
		 * Find out segment size and header length for TSO
		 */
		error = tcp_tso_getsize(tp, &segsz, &tso_hlen);
		if (error)
			use_tso = FALSE;
	}
	if (!use_tso) {
		segsz = tp->t_maxseg;
		tso_hlen = 0; /* not used */
	}

	/*
	 * Truncate to the maximum segment length if not TSO, and ensure that
	 * FIN is removed if the length no longer contains the last data byte.
	 */
	if (len > segsz) {
		if (!use_tso) {
			len = segsz;
			++segcnt;
		} else {
			int nsegs;

			if (__predict_false(tso_lenmax < segsz))
				tso_lenmax = segsz << 1;

			/*
			 * Truncate TSO transfers to (IP_MAXPACKET - iphlen -
			 * thoff), and make sure that we send equal size
			 * transfers down the stack (rather than big-small-
			 * big-small-...).
			 */
			len = min(len, tso_lenmax);
			nsegs = min(len, (IP_MAXPACKET - tso_hlen)) / segsz;
			KKASSERT(nsegs > 0);

			len = nsegs * segsz;

			if (len <= segsz) {
				use_tso = FALSE;
				++segcnt;
			} else {
				segcnt += nsegs;
			}
		}
		sendalot = TRUE;
	} else {
		use_tso = FALSE;
		if (len > 0)
			++segcnt;
	}
	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc))
		flags &= ~TH_FIN;

	recvwin = ssb_space(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
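	/*
	 * Illustrative numbers (not from this source): with segsz = 1460,
	 * a 1460-byte len is sent at once as a full segment, while a
	 * 700-byte len waits unless it is the tail of the write on an
	 * idle or NODELAY connection, a timeout forced it, it reaches
	 * half of max_sndwnd, or a retransmit is pending.
	 */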
	if (len) {
		if (len >= segsz)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.ssb_cc &&
		    !(tp->t_flags & TF_NOPUSH)) {
			goto send;
		}
		if (tp->t_flags & TF_FORCE)		/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
		if (tp->t_flags & TF_XMITNOW)
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then we want to send a window update to the peer.
	 */
	if (recvwin > 0) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);
		long hiwat;

		/*
		 * This ack case typically occurs when the user has drained
		 * the TCP socket buffer sufficiently to warrant an ack
		 * containing a 'pure window update'... that is, an ack that
		 * ONLY updates the tcp window.
		 *
		 * It is unclear why we would need to do a pure window update
		 * past 2 segments if we are going to do one at 1/2 the high
		 * water mark anyway, especially since under normal conditions
		 * the user program will drain the socket buffer quickly.
		 * The 2-segment pure window update will often add a large
		 * number of extra, unnecessary acks to the stream.
		 *
		 * avoid_pure_win_update now defaults to 1.
		 */
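		/*
		 * Illustrative numbers (not from this source): with
		 * segsz = 1460 and ssb_hiwat = 65535, the 2-segment rule
		 * fires once adv >= 2 * 1460 = 2920, while the default
		 * path waits until adv >= 65535 / 2 = 32767 before
		 * sending the pure window update.
		 */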
		if (avoid_pure_win_update == 0 ||
		    (tp->t_flags & TF_RXRESIZED)) {
			if (adv >= (long) (2 * segsz)) {
				goto send;
			}
		}
		hiwat = (long)(TCP_MAXWIN << tp->rcv_scale);
		if (hiwat > (long)so->so_rcv.ssb_hiwat)
			hiwat = (long)so->so_rcv.ssb_hiwat;
		if (adv >= hiwat / 2)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN)))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if ((flags & TH_FIN) &&
	    (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
		goto send;

	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to ensure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tcp_callout_active(tp, tp->tt_persist)
	 *	is true when we are in persist state.
	 * The TF_FORCE flag in tp->t_flags
	 *	is set when we are called to send a persist packet.
	 * tcp_callout_active(tp, tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 *
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can, otherwise force out
	 * a byte.
	 *
	 * Don't try to set the persist state if we are in TCPS_SYN_RECEIVED
	 * with data pending.  This situation can occur during a
	 * simultaneous connect.
	 */
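	/*
	 * Illustrative case (not from this source): 512 bytes queued,
	 * the peer's window shrunk to zero, and neither timer active:
	 * the persist timer is armed here so that a window probe will
	 * eventually be forced out.
	 */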
	if (so->so_snd.ssb_cc > 0 &&
	    tp->t_state != TCPS_SYN_RECEIVED &&
	    !tcp_callout_active(tp, tp->tt_rexmt) &&
	    !tcp_callout_active(tp, tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
	tp->t_flags &= ~TF_XMITNOW;
	return (0);

send:
	if (need_sched && len > 0) {
		tcp_output_sched(tp);
		return 0;
	}

	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
	if (isipv6)
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		hdrlen = sizeof(struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if (!(tp->t_flags & TF_NOOPT)) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp));
			memcpy(opt + 2, &mss, sizeof mss);
			optlen = TCPOLEN_MAXSEG;

			if ((tp->t_flags & TF_REQ_SCALE) &&
			    (!(flags & TH_ACK) ||
			     (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}

			if ((tcp_do_sack && !(flags & TH_ACK)) ||
			    tp->t_flags & TF_SACK_PERMITTED) {
				uint32_t *lp = (uint32_t *)(opt + optlen);

				*lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED);
				optlen += TCPOLEN_SACK_PERMITTED_ALIGNED;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
	    !(flags & TH_RST) &&
	    (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/* Set receive buffer autosizing timestamp. */
	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
		tp->rfbuf_ts = ticks;

	/*
	 * If this is a SACK connection and we have a block to report,
	 * fill in the SACK blocks in the TCP options.
	 */
	if (report_sack)
		tcp_sack_fill_report(tp, opt, &optlen);

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;
		/*
		 * Initialize TCP-MD5 option (RFC2385)
		 */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;
		/*
		 * Terminate options list and maintain 32-bit alignment.
		 */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */
	KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options"));
	hdrlen += optlen;

	if (isipv6) {
		ipoptlen = ip6_optlen(inp);
	} else {
		if (inp->inp_options) {
			ipoptlen = inp->inp_options->m_len -
			    offsetof(struct ipoption, ipopt_list);
		} else {
Example #3
int tcp_send(struct tcp_pcb * __tp, const void * __buf, 
	int __len, int __flags)
{
	uint8_t * src;
	int rem;
	int n;
	int m;

	if (__tp == NULL) {
		DCC_LOG(LOG_WARNING, "NULL pointer");
		return -1;
	}

#ifdef ENABLE_SANITY
	if (__buf == NULL) {
		DCC_LOG1(LOG_WARNING, "<%04x> NULL pointer:", (int)__tp);
		return -1;
	}

	if (__len < 0) {
		DCC_LOG2(LOG_WARNING, "<%04x> invalid length: %d", (int)__tp, __len);
		return -1;
	}
#endif

	tcpip_net_lock();

#ifdef ENABLE_SANITY
	if (pcb_find((struct pcb *)__tp, &__tcp__.active) < 0) {
		DCC_LOG(LOG_ERROR, "<%04x> pcb_find()", (int)__tp);
		tcpip_net_unlock();
		return -1;
	}
#endif

	DCC_LOG3(LOG_INFO, "<%05x> buf=%05x len=%d", (int)__tp, (int)__buf, __len);

	src = (uint8_t *)__buf;
	rem = __len;

again:
	if (__tp->t_state != TCPS_ESTABLISHED)  {
/*
	if ((__tp->t_state != TCPS_ESTABLISHED) &&
		(__tp->t_state != TCPS_CLOSE_WAIT)) {
*/
		DCC_LOG2(LOG_WARNING, "<%05x> [%s]", (int)__tp, 
				 __tcp_state[__tp->t_state]);

		if (__tp->t_state == TCPS_SYN_RCVD) {
			DCC_LOG1(LOG_TRACE, "<%05x> wait", (int)__tp);
			__os_cond_wait(__tp->t_cond, net_mutex);
			DCC_LOG2(LOG_TRACE, "<%05x> again [%s]",
					 (int)__tp, __tcp_state[__tp->t_state]);
			goto again;
		} 
		

		DCC_LOG(LOG_TRACE, "done.");

		tcpip_net_unlock();
		return -1;
	}

	while (rem) {
		/* buffer limit ... */
		m = tcp_maxsnd - __tp->snd_q.len;
		if (m <= 0) {
			DCC_LOG1(LOG_INFO, "<%05x> queue limit", (int)__tp);
			__tp->t_flags |= TF_ACKNOW;
			
			DCC_LOG(LOG_INFO, "output request.");
			tcp_output_sched(__tp);
		
			DCC_LOG(LOG_INFO, "waiting for buffer space.");
			__os_cond_wait(__tp->t_cond, net_mutex);

			goto again;
		}

		m = MIN(m, rem);

		if ((n = mbuf_queue_add(&__tp->snd_q, src, m)) == 0) {
			DCC_LOG(LOG_TRACE, "mbuf_wait...");
			mbuf_wait(net_mutex);
			goto again;
		}
		rem -= n;
		src += n;
	}

#if 0
	/* FIXME: Set retransmit timer if not currently set,
	   and not doing an ack or a keepalive probe.
	   Initial value for retransmit is smoothed
	   round-trip time + 2 * round-trip time variance.
	   Initialize counter which is used for backoff
	   of retransmit time. */
	if ((__tp->t_rxmt_tmr == 0) && (__tp->snd_una != 0)) {
		__tp->t_rxmt_tmr = tcp_rxmtintvl[0];
		__tp->t_rxmt_cnt = 0;
		/* tp->t_flags &= ~TF_IDLE; */
	}
#endif

	if (__len > 0) {
		/* If the TCP_SEND_NOWAIT flag is set, or at least one maximum
		   segment size is pending for send, then send now */
		if ((__flags & TCP_SEND_NOWAIT) || 
			((__tp->snd_q.len - (int)__tp->snd_q.offs) >= __tp->t_maxseg)) {
			DCC_LOG(LOG_INFO, "output request.");
			tcp_output_sched(__tp);
//			if (tcp_output(__tp) < 0) {
				/* if the reason to fail was an arp failure
				   try query an address pending for resolution ... */
//				arp_query_pending();
//			}
		} else  {
			__tp->t_flags |= TF_DELACK;
		}
	}

	DCC_LOG(LOG_INFO, "done.");

	tcpip_net_unlock();

	return __len;
}
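
tcp_send() blocks while the send queue is above tcp_maxsnd and returns __len only after everything has been queued; without TCP_SEND_NOWAIT, sub-MSS writes merely set TF_DELACK. A minimal caller-side sketch, assuming the flag semantics shown above:

/* Sketch: queue a short, latency-sensitive reply and schedule
   output immediately instead of waiting for a full segment or
   the delayed-ACK path (assumes TCP_SEND_NOWAIT as above). */
static int send_reply_now(struct tcp_pcb * tp, const void * msg, int len)
{
	return tcp_send(tp, msg, len, TCP_SEND_NOWAIT);
}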
Example #4
int tcp_recv(struct tcp_pcb * __tp, void * __buf, int __len)
{
    int n;

    if (__tp == NULL) {
        DCC_LOG(LOG_WARNING, "NULL pointer");
        return -1;
    }

    if (__len == 0) {
        /* invalid argument */
        DCC_LOG(LOG_WARNING, "invalid argument");
        return -1;
    }

    tcpip_net_lock();

#ifdef ENABLE_SANITY_CHECK
    if (pcb_find((struct pcb *)__tp, &__tcp__.active) < 0) {
        DCC_LOG1(LOG_ERROR, "<%05x> pcb_find()", (int)__tp);
        tcpip_net_unlock();
        return -1;
    }
#endif

    for (;;) {
        if (__tp->t_state == TCPS_CLOSED) {
            DCC_LOG(LOG_WARNING, "closed!");
            tcpip_net_unlock();
            return -1;
        }

        if ((__tp->t_state == TCPS_TIME_WAIT) ||
                (__tp->t_state == TCPS_CLOSING) ||
                (__tp->t_state == TCPS_LAST_ACK)) {
            tcpip_net_unlock();
            return 0;
        }

        if (__tp->rcv_q.len)
            break;

        if (__tp->t_state == TCPS_CLOSE_WAIT) {
            tcpip_net_unlock();
            return 0;
        }

        DCC_LOG2(LOG_MSG, "<%05x> wait [%d]", (int)__tp, __tp->t_cond);

        thinkos_cond_wait(__tp->t_cond, net_mutex);
    }

    n = mbuf_queue_remove(&__tp->rcv_q, __buf, __len);

    DCC_LOG1(LOG_INFO, "len=%d", n);

    /* Half close: don't reopen the receiver window. I'm not sure
       whether this breaks a rule or not, but it may prevent resources
       being consumed by an about-to-die connection! */
    if ((__tp->t_state == TCPS_FIN_WAIT_1) ||
            (__tp->t_state == TCPS_FIN_WAIT_2)) {
        DCC_LOG1(LOG_TRACE, "<%05x> FIN_WAIT", (int)__tp);
        tcpip_net_unlock();
        return n;
    }

    /* XXX: revisit this ... */
//	if ((__tp->rcv_q.len == 0) || (__tp->t_flags & TF_DELACK)) {
    if (__tp->rcv_q.len == 0) {
        if (__tp->t_flags & TF_DELACK) {
            __tp->t_flags |= TF_ACKNOW;
        }

        DCC_LOG(LOG_INFO, "empty queue, call tcp_out.");

        tcp_output_sched(__tp);
    }

    tcpip_net_unlock();

    return n;
}
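
The return conventions above (> 0 bytes of data, 0 when the peer has closed or the connection is draining, -1 on error or a closed PCB) give the usual read loop; a minimal sketch assuming this tcp_recv():

/* Sketch of a receive loop over the conventions above: > 0 is
   data, 0 means the peer closed its side, < 0 is an error or a
   dead connection. */
static int drain(struct tcp_pcb * tp)
{
	char buf[256];
	int total = 0;
	int n;

	while ((n = tcp_recv(tp, buf, sizeof(buf))) > 0)
		total += n;	/* consume buf[0..n-1] here */

	return (n < 0) ? -1 : total;
}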
Example #5
int tcp_input(struct ifnet * __if, struct iphdr * iph, 
			   struct tcphdr * th, int len)
{
	struct tcp_listen_pcb * mux;
	struct tcp_pcb * tp;
#if (ENABLE_NET_TCP_CHECKSUM)
	unsigned int sum;
#endif
	int ti_len;
	int acked = 0;
	int ourfinisacked = 0;
	int needoutput = 0;
	unsigned int optlen;
	int tiflags;
	int todrop;
	uint32_t snd_una;
	uint32_t snd_nxt;
	uint32_t snd_max;
	uint32_t ti_seq;
	uint32_t ti_ack;
	int rcv_wnd;
	int tiwin;
	int hdrlen;
	uint8_t * data;
	int ret;

#if (ENABLE_TCPDUMP)
	tcp_dump(iph, th, TCPDUMP_RX);
#endif

	/* get TCP options, if any */
	optlen = ((th->th_off << 2) - sizeof(struct tcphdr));
	hdrlen = sizeof(struct tcphdr) + optlen;

	data = (uint8_t *)&th->th_opt[optlen];
	ti_len = len - hdrlen;
	
#if (ENABLE_NET_TCP_CHECKSUM)
	/* initialize checksum */
	sum = htons(len) + (IPPROTO_TCP << 8);
	sum = in_chksum(sum, &iph->saddr,  8);
	sum = in_chksum(sum, th,  hdrlen);

	if (ti_len) {
		sum = in_chksum(sum, data, ti_len);
	}

	if (sum != 0x0000ffff) {
		DCC_LOG3(LOG_WARNING, "checksum error: 0x%04x hdrlen=%d, len=%d", 
				 sum, hdrlen, len);
		TCP_PROTO_STAT_ADD(rx_err, 1);
		goto drop;
	}
#endif

	tiflags = th->th_flags;
	/* convert TCP protocol specific fields to host format */
	tiwin = ntohs(th->th_win);
	ti_seq = ntohl(th->th_seq);
	ti_ack = ntohl(th->th_ack);

	TCP_PROTO_STAT_ADD(rx_ok, 1);

	/* Search the active list first */
	if ((tp = tcp_active_lookup(iph->saddr, th->th_sport, 
								iph->daddr, th->th_dport)) == NULL) {
		/* lookup into listening pcb list */
		if ((mux = tcp_listen_lookup(iph->saddr, th->th_sport, 
									 iph->daddr, th->th_dport)) == NULL) {
			DCC_LOG(LOG_WARNING, "invalid peer ???");
			goto dropwithreset;
		}

		if ((tiflags & TH_ACK)) {
			DCC_LOG(LOG_WARNING, "listen ACK ?");
			goto dropwithreset;
		}

		if (ti_len != 0) {
			DCC_LOG(LOG_WARNING, "ti_len != 0");
			goto dropwithreset;
		}

		/* Completion of Passive Open
		   Ref.: TCP/IP Illustrated Volume 2, pg. 942 */
		if (!(tiflags & TH_SYN)) {
			DCC_LOG(LOG_WARNING, "listen !SYN ?");
			goto drop;
		}
	
		/* In the LISTEN state, we check for incoming SYN segments,
		   creates a new PCB, and responds with a SYN|ACK. */
		if ((tiflags & TH_RST)) {
			DCC_LOG(LOG_WARNING, "listen RST?");
			goto drop;
		}

		if ((tp = tcp_passive_open(mux, iph, th, optlen)) == NULL) {
			DCC_LOG(LOG_WARNING, "tcp_passive_open()");
			goto dropwithreset;
		}

		/* schedule output */
		tcp_output_sched(tp);

		/* packet handled */
		return 0;
	}

	DCC_LOG1(LOG_MSG, "<%05x> active", (int)tp);

	snd_una = tp->snd_seq;
	snd_nxt = tp->snd_seq + tp->snd_off;
	snd_max = tp->snd_seq + tp->snd_max;

 	/* Remove acknowledged bytes from the send buffer */
	/* Wakeup processes waiting on send buffer */

	/* Segment received on a connection.
	   Reset the idle detection timer 
	   Ref.: TCP/IP Illustrated Volume 2, pg. 932  */
	tp->t_conn_tmr = tcp_idle_det_tmo;
	if (tp->t_flags & TF_IDLE) {
		/* exits from the idle state */
		tp->t_flags &= ~TF_IDLE;
		DCC_LOG1(LOG_INFO, "<%05x> IDLE exit", (int)tp);		
	}

#if 0
	/* Process options, we don't need to check if the socket is 
	   in the LISTEN state, because only active (non LISTENING) sockets
	   will actually fall into this code. 
	   XXX: options after connection established ??? 
	 */
	if (optlen)
		tcp_parse_options(tp, th, th->th_opt, optlen);
#endif

	/* Ref.: TCP/IP Illustrated Volume 2, pg. 934  */
#if (TCP_ENABLE_HEADER_PREDICTION)
	if ((tp->t_state == TCPS_ESTABLISHED) &&
		(tiflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK &&
		(ti_seq == tp->rcv_nxt) && 
		(tiwin) && 
		(tiwin == tp->snd_wnd) && 
		(snd_nxt == snd_max)) {

		if (ti_len == 0) {

			if (SEQ_GT(ti_ack, snd_una) &&
				SEQ_LEQ(ti_ack, snd_max)) {
				acked = ti_ack - snd_una;
			
				DCC_LOG(LOG_INFO, "header prediction, ACK ...");

				mbuf_queue_trim(&tp->snd_q, acked);
				snd_una = ti_ack;

				tp->snd_seq = snd_una;
				tp->snd_off = snd_nxt - tp->snd_seq;
				tp->snd_max = snd_max - tp->snd_seq;

				if (snd_una == snd_max) {
					tp->t_rxmt_tmr = 0;
					tp->t_rxmt_cnt = 0;
					DCC_LOG(LOG_INFO, "acked all data, rxmt tmr stopped");
				} else {
					if (tp->t_rxmt_tmr == 0) {
						DCC_LOG(LOG_INFO, 
								"not all data acked restart rxmt tmr");
						tp->t_rxmt_tmr = tcp_rxmtintvl[tp->t_rxmt_cnt / 2];
					}
				}

				thinkos_cond_broadcast(tp->t_cond);

				if (tp->snd_q.len) {
					/* schedule output */
					tcp_output_sched(tp);
				}

				return 0;
			}
		} else {
			if ((ti_ack == snd_una) && 
				ti_len <= (tcp_maxrcv - tp->rcv_q.len)) {
				int len;

				DCC_LOG1(LOG_INFO, "header prediction, data (%d)", ti_len);

				/* append data */
				len = mbuf_queue_add(&tp->rcv_q, data, ti_len);
				tp->rcv_nxt += len;
				thinkos_cond_broadcast(tp->t_cond);

				if (len != ti_len) {
					DCC_LOG1(LOG_WARNING, "<%05x> no more mbufs", (int)tp);
					tp->t_flags |= TF_ACKNOW;
					/* schedule output */
					tcp_output_sched(tp);
				} else {
					tp->t_flags |= TF_DELACK;
				}

				return 0;
			 }
		}
	}

#endif /* TCP_ENABLE_HEADER_PREDICTION */

	/* Slow path input processing
	   Ref.: TCP/IP Illustrated Volume 2, pg. 941  */

	/* TODO: Drop TCP, IP headers and TCP options. 
		Well, only if these structures were dynamically allocated... */
	
	if (ti_len == 0) {
		DCC_LOG(LOG_INFO, "slow path ACK");
	} else {
		DCC_LOG1(LOG_INFO, "slow path (%d)", ti_len);
	}

	/* Calculate the amount of space in receive window,
	   and then do TCP input processing.
	   Receive window is amount of space in rcv queue,
	   but not less than the advertised window.
	   Ref.: TCP/IP Illustrated Volume 2, pg. 941  */
	{
		int win;
		
		/* space left in the input queue */
		win = tcp_maxrcv - tp->rcv_q.len;
		
		if (win <= 0) {
			win = 0;
			DCC_LOG(LOG_INFO, "receive buffer full!");
		}


//		rcv_wnd = MAX(win, tp->rcv_adv_wnd);
		rcv_wnd = win;

		DCC_LOG3(LOG_INFO, "adv_wnd=%d rcv_wnd=%d win=%d", 
				tp->rcv_adv_wnd, rcv_wnd, win);
	} 

	if (tp->t_state == TCPS_SYN_SENT) {
		/* response to an active open. 
		   Ref.: TCP/IP Illustrated Volume 2, pg. 947  */

		/* Common processing for receipt of SYN. 
		   Ref.: TCP/IP Illustrated Volume 2, pg. 950 */
		if ((tiflags & TH_RST)) {
			goto close;
		}

		if (!(tiflags & TH_SYN)) {
			DCC_LOG(LOG_WARNING, "SYN_SENT SYN ?");
			/* TODO: reset */
			goto close_and_reset;
		}

		if (!(tiflags & TH_ACK)) {
			DCC_LOG(LOG_WARNING, "SYN_SENT ACK ?");
			/* TODO: reset */
			goto close_and_reset;
		}

		if (ti_len != 0) {
			DCC_LOG(LOG_WARNING, "ti_len != 0");
			/* TODO: reset */
			goto close_and_reset;
		}

		/* update the send sequence */
		tp->snd_seq++;
		if (tp->snd_seq != ti_ack) {
			DCC_LOG3(LOG_WARNING, "<%05x> tp->snd_seq(%d) != ti_ack(%d)",
					 (int)tp, tp->snd_seq, ti_ack);
			/* TODO: reset */
			goto close_and_reset;
		}
		tp->snd_off--;
		tp->snd_max--;
//		tp->snd_off = 0;
//		tp->snd_max = 0;

		if (optlen)
			tcp_parse_options(tp, th, th->th_opt, optlen);

		/* Advance tp->ti_seq to correspond to first data byte. */
		ti_seq++;
		if (ti_len > rcv_wnd) {
			DCC_LOG3(LOG_WARNING, "<%05x> ti_len(%d) > rcv_wnd(%d)", 
				(int)tp, ti_len, rcv_wnd);
		/* TODO: if data, trim to stay within window. */
			ti_len = rcv_wnd;
		}

		/* update the sequence number */
		tp->rcv_nxt = ti_seq;

		/* update the window size */
		tp->snd_wnd = ntohs(th->th_win);

		tp->t_state = TCPS_ESTABLISHED;
		DCC_LOG1(LOG_INFO, "<%05x> [ESTABLISHED]", (int)tp);
		/* TODO: initialization of receive urgent pointer
		tcp->rcv_up = ti_seq; */
		/* XXX: */ 
		tp->t_flags |= TF_ACKNOW;
		thinkos_cond_broadcast(tp->t_cond);

		goto step6;

close_and_reset:
		tp->t_state = TCPS_CLOSED;
		pcb_move((struct pcb *)tp, &__tcp__.active, &__tcp__.closed);
		DCC_LOG1(LOG_INFO, "<%05x> [CLOSED]", (int)tp);

		/* XXX: discard the data */
		mbuf_queue_free(&tp->snd_q);
		mbuf_queue_free(&tp->rcv_q);

		/* notify the upper layer */
		thinkos_cond_broadcast(tp->t_cond);

		goto dropwithreset;	
	}

/* States other than LISTEN or SYN_SENT 
   First check timestamp, if present.
   Then check that at least some bytes of segment are within
   receive window.  If segment begins before rcv_nxt,
   drop leading data (and SYN); if nothing left, just ack. */
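/* Illustrative numbers (not from this source): rcv_nxt = 1000,
   ti_seq = 900, ti_len = 300 gives todrop = 100; the 100 duplicate
   bytes are skipped, and the segment is processed as seq 1000 with
   200 bytes remaining. */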

	/* Trim Segment so Data is Within Window
	   Ref.: TCP/IP Illustrated Volume 2, pg. 954 */
	todrop = tp->rcv_nxt - ti_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			DCC_LOG(LOG_INFO, "SYN");
			tiflags &= ~TH_SYN;
			ti_seq++;
			todrop--;
		}
		if ((todrop > ti_len) || 
		   ((todrop == ti_len) && ((tiflags & TH_FIN) == 0))) {
			tiflags &= ~TH_FIN;
			tp->t_flags |= TF_ACKNOW;
			todrop = ti_len;		
		}

		DCC_LOG4(LOG_WARNING, "<%05x> drop: len=%d drop=%d rem=%d!", 
			(int)tp, ti_len, todrop, ti_len - todrop);

		/* adjust the data pointer */
		data += todrop;

		ti_seq += todrop;
		ti_len -= todrop;

		/* TODO: adjust the urgent pointer */
	} 

	/* FIXME: only reset the connection if there is no 
		application left to handle the incoming data (half-close) */
	if ((tp->t_state > TCPS_FIN_WAIT_1) && (ti_len)) { 
		DCC_LOG1(LOG_INFO, "<%05x> segment received after FIN", (int)tp);
		/* TODO: stat */
		goto dropwithreset;	
	}

	/* If segment ends after window, drop trailing data
	   and (PUSH and FIN); if nothing left, just ACK.
	   Ref.: TCP/IP Illustrated Volume 2, pg. 958 */
	todrop = (ti_seq + ti_len) - (tp->rcv_nxt + rcv_wnd);

	DCC_LOG4(LOG_INFO, "ti_seq=%u ti_len=%d rcv_nxt=%u rcv_wnd=%d", 
			ti_seq,  ti_len, tp->rcv_nxt, rcv_wnd);
	/* */

	if (todrop > 0) {
//		TCP_LOG(tp, "tcp_input: trailing data drop");
		if (todrop >= ti_len) {

	   		/* 
			 * If a new connection request is received 
			 * while in TIME_WAIT, drop the old connection ...
			 * Ref.: TCP/IP Illustrated Volume 2, pg. 958 
			if ((tiflags & TH_SYN) && (tp->t_state == TCPS_TIMEWAIT) &&
			   (SEQ_GT(ti_seq, tp->rcv_nxt))) {
				__tcp__.iss += tcp_issincr;
				tcp_rst(tp);
				goto findpcb;
			} */

			if ((rcv_wnd == 0) && (ti_seq == tp->rcv_nxt)) {
				tp->t_flags |= TF_ACKNOW;
			} else
				goto dropafterack;
		}

		DCC_LOG2(LOG_WARNING, "<%05x> data drop: %d!", (int)tp, todrop);
		ti_len -= todrop;
		tiflags &= ~(TH_PSH | TH_FIN);
	}

	/* If the RST bit is set examine the state: ...
	   Ref.: TCP/IP Illustrated Volume 2, pg. 964 */
	if ((tiflags & TH_RST)) {
		DCC_LOG1(LOG_WARNING, "<%05x> RST received", (int)tp);
		switch(tp->t_state) {
		case TCPS_SYN_RCVD:
//			tp->errno = ECONNREFUSED;
			goto close;
		case TCPS_ESTABLISHED:
		case TCPS_CLOSE_WAIT:
//			tp->errno = ECONNRESET;
close:
			/* discard the data */
			mbuf_queue_free(&tp->snd_q);
			mbuf_queue_free(&tp->rcv_q);

			tp->t_state = TCPS_CLOSED;
			pcb_move((struct pcb *)tp, &__tcp__.active, &__tcp__.closed);
			DCC_LOG1(LOG_INFO, "<%05x> [CLOSED]", (int)tp);

			/* notify the upper layer */
			thinkos_cond_broadcast(tp->t_cond);
			/* PCBs in the close state should be cleared by the application */
			goto drop;

		case TCPS_FIN_WAIT_1:
		case TCPS_FIN_WAIT_2:
		case TCPS_CLOSING:
		case TCPS_LAST_ACK:
		case TCPS_TIME_WAIT:
			/* Our side was already closed */
			tcp_pcb_free(tp);
			goto drop;
		}
	}

	/* If a SYN is in the window, then this is an 
	   error and we send an RST and drop the connection.
	   Ref.: TCP/IP Illustrated Volume 2, pg. 965 */
	if ((tiflags & TH_SYN)) {
		DCC_LOG1(LOG_WARNING, "<%05x> the SYN bit is set inside the window", 
			(int)tp);
		goto dropwithreset;
	}

	/* If the ACK bit is off we drop the segment and return. */
	if ((!(tiflags & TH_ACK))) {
		DCC_LOG1(LOG_WARNING, "<%05x> the ACK bit is off", (int)tp);
		goto drop;
	}
	
/*
 * ACK processing.
 * Ref.: TCP/IP Illustrated Volume 2, pg. 969 
 *
 */

	DCC_LOG4(LOG_INFO, "ack=%u una=%u nxt=%u max=%u", 
			 ti_ack, snd_una, snd_nxt, snd_max);

	switch(tp->t_state) {
	case TCPS_SYN_RCVD:
		if (SEQ_GT(snd_una, ti_ack) || 
			SEQ_GT(ti_ack, snd_max)) {
			DCC_LOG1(LOG_WARNING, 
					 "<%05x> ti_ack < snd_una || snd_max < ti_ack", 
					 (int)tp);
			goto dropwithreset;
		}
		tp->t_state = TCPS_ESTABLISHED;
		tp->snd_off--;
		tp->snd_max--;
		DCC_LOG1(LOG_INFO, "<%05x> SYN ackd [ESTABLISHED]", (int)tp);
		/* notify the upper layer*/
//		thinkos_cond_signal(tp->t_cond);

		/* TODO: tcp reassembly
		tcp_reass(tp); */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
		/* TODO: tcp reassembly
		   tcp_reass(tp); */
		if (SEQ_LEQ(ti_ack, snd_una)) {
			/* TODO: check for completely duplicated ACKs.
			   Ref.: TCP/IP Illustrated Volume 2, pg. 971 */
			if ((ti_len == 0) && (tiwin == tp->snd_wnd)) {
				if ((tp->t_rxmt_tmr == 0) || ti_ack != snd_una) {
//					dupacks = 0;
				} else {
					DCC_LOG2(LOG_INFO, "duplicated ACK. ti_ack=%u snd_una=%u", 
							 ti_ack, snd_una);
				}
			} else {
//				dupacks = 0;
			}
			break;
		}

		/* Check out of range ACK */
		/*  Ref.: TCP/IP Illustrated Volume 2, pg. 974 */
		if (SEQ_GT(ti_ack, snd_max)) {
			/* TODO:
			   tcpstat.tcps_rcvacktoomuch++;
			 */
			DCC_LOG3(LOG_WARNING, "(%04x) out of range ACK. "
				"th_ack=%u > snd_max=%u !", 
				(int)tp, ti_ack, snd_max);
			goto dropafterack;	
		}

		acked = ti_ack - snd_una;

		/* TODO:
		   tcpstat.tcps_rcvackpack++;
		   tcpstat.tcps_rcvackbyte += acked;		
		 */

		DCC_LOG1(LOG_INFO, "acked=%d", acked);

		/* If all outstanding data is acked, stop retransmit timer else
		   restarts it ....
		   Ref.: TCP/IP Illustrated Volume 2, pg. 976 */
		if (ti_ack == snd_max) {
			tp->t_rxmt_tmr = 0;
			tp->t_rxmt_cnt = 0;
			needoutput = 1;
			DCC_LOG(LOG_INFO, "acked all data, rxmt tmr stopped");
		} else {
			/* TODO: persistent timer */
//			if (tp->t_persist_tmr == 0) {
				DCC_LOG(LOG_INFO, "not all data acked restart rxmt tmr");
				tp->t_rxmt_tmr = tcp_rxmtintvl[tp->t_rxmt_cnt / 2];
//			}
		}

		/* TODO:
		   tcpstat.tcps_rcvackpack++;
		   tcpstat.tcps_rcvackbyte += acked;		
		 */

		/* TODO: remove acknowledged data from send buffer 
		   Ref.: TCP/IP Illustrated Volume 2, pg. 978 */
		/* FIXME: send buffer bytes count */
		if (acked > tp->snd_q.len) {
			mbuf_queue_trim(&tp->snd_q, tp->snd_q.len);
			ourfinisacked = 1;
		} else {
			/* TODO: estimate the send window */
			mbuf_queue_trim(&tp->snd_q, acked);
			ourfinisacked = 0;
		}

		/* awaken a thread waiting on the send buffer ... */
		thinkos_cond_broadcast(tp->t_cond);

		snd_una = ti_ack;

		if (SEQ_LT(snd_nxt, snd_una)) {
			snd_nxt = snd_una;
		}

		tp->snd_seq = snd_una;
		tp->snd_off = snd_nxt - tp->snd_seq;
		tp->snd_max = snd_max - tp->snd_seq;

		DCC_LOG4(LOG_INFO, "<%05x> snd_seq=%u snd_max=%u snd_q.len=%d", 
			(int)tp, tp->snd_seq, snd_max, tp->snd_q.len); 

		switch(tp->t_state) {
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/* FIXME: If we can't receive any more data..
				   Ref.: TCP/IP Illustrated Volume 2, pg. 979 */
				tp->t_conn_tmr = 4 * tcp_msl;
				tp->t_state = TCPS_FIN_WAIT_2;
				DCC_LOG1(LOG_INFO, "<%05x> [FIN_WAIT_2]", (int)tp);
			}
			break;
		case TCPS_CLOSING:
			if (ourfinisacked) {
				mbuf_queue_free(&tp->snd_q);
				mbuf_queue_free(&tp->rcv_q);
				tp->t_state = TCPS_TIME_WAIT;
				DCC_LOG1(LOG_INFO, "<%05x> [TIME_WAIT]", (int)tp);
				tp->t_rxmt_tmr = 0;
				tp->t_conn_tmr = 2 * tcp_msl;
				DCC_LOG1(LOG_INFO, "stop rxmt tmr, start 2MSL tmr: %d",
						 tp->t_conn_tmr);
			}
			break;
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tcp_pcb_free(tp);
				goto drop;
			}
			break;

		case TCPS_TIME_WAIT:
			/* restart the finack timer 
			   Ref.: TCP/IP Illustrated Volume 2, pg. 981 */
			tp->t_conn_tmr = 2 * tcp_msl;
			goto dropafterack;
		}
		break;
	}

	DCC_LOG4(LOG_INFO, "<%05x> recvd=%d acked=%d rcv_q.len=%d", (int)tp, 
		ti_len, acked, tp->rcv_q.len);
step6:
	/* Update window information 
	   Ref.: TCP/IP Illustrated Volume 2, pg. 982 */
	DCC_LOG(LOG_MSG, "step6");
	
//	if ((tiflags & TH_ACK) && (tiwin > tp->snd_wnd)) {
	if ((tiflags & TH_ACK) && (tiwin != tp->snd_wnd)) {
		/* Keep track of pure window updates */
		/* TODO: TCP Statistics */
		/* TODO: Update window information */
		DCC_LOG1(LOG_INFO, "window update, win=%d", tiwin);
		tp->snd_wnd = tiwin;
		needoutput = 1;
	}

	/* TODO: Urgent mode processing */
	/* Process the segment text, 
	   merging it into the TCP sequencing queue,
dodata:
	   ...
	   Ref.: TCP/IP Illustrated Volume 2, pg. 988 */
	if ((ti_len || (tiflags & TH_FIN)) && 
		TCPS_HAVERCVDFIN(tp->t_state) == 0) {

		if ((ti_seq == tp->rcv_nxt) && (tp->t_state == TCPS_ESTABLISHED)) {

			/* append data */
			int n;

			tp->t_flags |= TF_DELACK;

			n = mbuf_queue_add(&tp->rcv_q, data, ti_len);
			if (n != ti_len) {
				DCC_LOG2(LOG_WARNING, "no more mbufs, %d != %d", n, ti_len);
			}
			ti_len = n;

			tp->rcv_nxt += ti_len;
			/* TODO: statistics */

			tiflags &= TH_FIN;

//			if (tp->rcv_q.len == ti_len) {
//				DCC_LOG3(LOG_INFO, "<%05x> rcvd %d, signaling %d ...", 
//					(int)tp, ti_len, tp->t_cond);
			/* 
			 * notify the upper layer of the data arrival...
			 */
			thinkos_cond_signal(tp->t_cond);
//			} else {
//				DCC_LOG2(LOG_INFO, "<%05x> rcvd %d", (int)tp, ti_len);
//			}

		} else {
			/* TODO: half-close */
			/* TODO: reassembly */
//			m = mlink_free(m);
			if (tp->t_state == TCPS_ESTABLISHED) {
//				DCC_LOG(LOG_WARNING, "out of order, drop!");
				DCC_LOG(LOG_WARNING, "out of order, drop");
				TCP_PROTO_STAT_ADD(rx_drop, 1);
			}
			tp->t_flags |= TF_ACKNOW;
		}
	} else {
		DCC_LOG(LOG_INFO, "no data and no FIN to process");
		tiflags &= ~TH_FIN;
	}

	/* FIN Processing */
	if (tiflags & TH_FIN) {
		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
			tp->t_flags |= TF_ACKNOW;
			tp->rcv_nxt++;
		}
		switch(tp->t_state) {
		case TCPS_SYN_RCVD:
		case TCPS_ESTABLISHED:
			tp->t_state = TCPS_CLOSE_WAIT;
			DCC_LOG1(LOG_INFO, "<%05x> [CLOSE_WAIT]", (int)tp);
			/* notify the application that our peer 
			   has closed its side. Sockets: marks 
			   the socket as write-only */
			if (tp->rcv_q.len == 0) {
				thinkos_cond_broadcast(tp->t_cond);
			}
			break;
		case TCPS_FIN_WAIT_1:
			tp->t_state = TCPS_CLOSING;
			DCC_LOG1(LOG_INFO, "<%05x> [CLOSING]", (int)tp);
			break;
		case TCPS_FIN_WAIT_2:
			mbuf_queue_free(&tp->rcv_q);
			mbuf_queue_free(&tp->snd_q);
			tp->t_state = TCPS_TIME_WAIT;
			DCC_LOG1(LOG_INFO, "<%05x> [TIME_WAIT]", (int)tp);
			tp->t_rxmt_tmr = 0;
			tp->t_conn_tmr = 2 * tcp_msl;
			DCC_LOG1(LOG_INFO, "stop rxmt tmr, start 2MSL tmr: %d",
					 tp->t_conn_tmr);
			break;
		case TCPS_TIME_WAIT:
			/* restart the counter */
			tp->t_conn_tmr = 2 * tcp_msl;
			break;
		}
	}

	/* Final Processing */
	if (needoutput || (tp->t_flags & TF_ACKNOW)) {
		if (needoutput) {
			DCC_LOG(LOG_INFO, "needoutput, call tcp_out.");
		}
		if (tp->t_flags & TF_ACKNOW) {
			DCC_LOG(LOG_INFO, "ACKNOW set, call tcp_out.");
		}
		/* schedule output */
		tcp_output_sched(tp);
	}
	return 0;

dropafterack:
	DCC_LOG1(LOG_INFO, "<%05x> drop and ACK", (int)tp);

	if (tiflags & TH_RST)
		goto drop;

	tp->t_flags |= TF_ACKNOW;
	/* schedule output */
	tcp_output_sched(tp);
	return 0;

dropwithreset:
	DCC_LOG1(LOG_TRACE, "<%05x> drop and RST", (int)tp);

	ret = 0;
	/* TODO: check for a broadcast/multicast */
	if (!(tiflags & TH_RST)) {
		if (tiflags & TH_ACK) {
			ret = tcp_respond(iph, th, 0, ti_ack, TH_RST);
		} else if (tiflags & TH_SYN) {
			ti_len++;
			ret = tcp_respond(iph, th, ti_seq + ti_len, 0, TH_ACK | TH_RST);
		}
	}
	TCP_PROTO_STAT_ADD(rx_drop, 1);
	return ret;

drop:
	DCC_LOG(LOG_TRACE, "drop");
	TCP_PROTO_STAT_ADD(rx_drop, 1);

	return 0;
}
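
The checksum verification at the top of tcp_input() seeds the sum with htons(len) + (IPPROTO_TCP << 8) and folds in the pseudo-header source/destination addresses before summing the header and payload. A standalone sketch of the same ones'-complement arithmetic (illustrative only, not this stack's in_chksum(); addresses are taken in host byte order here):

#include <stdint.h>
#include <stddef.h>

/* Ones'-complement accumulation over big-endian 16-bit words
   (illustrative reimplementation, not in_chksum()). */
static uint32_t csum_add(uint32_t sum, const void * data, size_t len)
{
	const uint8_t * p = (const uint8_t *)data;

	while (len > 1) {
		sum += (p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)                  /* odd trailing byte, zero padded */
		sum += p[0] << 8;
	while (sum >> 16)         /* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}

/* A valid segment, checksum field included, sums to 0xffff over
   pseudo-header + TCP header + payload, which is what the
   (sum != 0x0000ffff) test above relies on. */
static int tcp_csum_ok(uint32_t saddr, uint32_t daddr,
                       const uint8_t * seg, size_t len)
{
	uint8_t ph[12];           /* IPv4 pseudo-header */
	uint32_t sum = 0;

	ph[0] = saddr >> 24; ph[1] = saddr >> 16;
	ph[2] = saddr >> 8;  ph[3] = saddr;
	ph[4] = daddr >> 24; ph[5] = daddr >> 16;
	ph[6] = daddr >> 8;  ph[7] = daddr;
	ph[8] = 0;           ph[9] = 6;          /* IPPROTO_TCP */
	ph[10] = len >> 8;   ph[11] = len;       /* TCP segment length */

	sum = csum_add(sum, ph, sizeof(ph));
	sum = csum_add(sum, seg, len);
	return sum == 0xffff;
}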