/*
 * This function is called from the receive side with the sequence number of
 * the last packet received.
 */
void AJ_SerialTx_ReceivedSeq(uint8_t seq)
{
    /*
     * If we think we have already acked this sequence number we don't adjust
     * the ack count.
     */
    if (!SEQ_GT(currentTxAck, seq)) {
        currentTxAck = (seq + 1) & 0x7;
    }

#ifdef ALWAYS_ACK
    AJ_SerialTX_EnqueueCtrl(NULL, 0, AJ_SERIAL_ACK);
#else
    ++pendingAcks;

    /*
     * If there are no packets to send we are allowed to accumulate a
     * backlog of pending ACKs up to a maximum equal to the window size.
     * In any case we are required to send an ack within a timeout
     * period so if this is the first pending ack we need to prime a timer.
     */
    if (pendingAcks == 1) {
        AJ_InitTimer(&ackTime);
        AJ_TimeAddOffset(&ackTime, AJ_SerialLinkParams.txAckTimeout);
        return;
    }

    /*
     * If we have hit our pending ACK limit send an explicit ACK packet immediately.
     */
    if (pendingAcks == AJ_SerialLinkParams.windowSize) {
        AJ_SerialTX_EnqueueCtrl(NULL, 0, AJ_SERIAL_ACK);
    }
#endif
}
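/*
 * A minimal sketch of the wraparound-safe comparison the examples on this
 * page rely on. The classic TCP form compares 32-bit sequence numbers via
 * a signed difference; a 3-bit serial-link analogue is shown alongside it.
 * The exact macro definitions in each project may differ; these are
 * illustrative assumptions only.
 */
#include <stdint.h>

/* a is "after" b iff the signed difference is positive; valid while the
 * two values are within half the sequence space of each other. */
#define TCP_SEQ_GT(a, b)  ((int32_t)((a) - (b)) > 0)

/* 3-bit analogue for the serial layer above: a lead of 1..3, i.e. less
 * than half of the mod-8 space, counts as "greater". */
#define SER_SEQ_GT(a, b)  ((((a) - (b)) & 0x7u) != 0 && (((a) - (b)) & 0x7u) < 4)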
Example #2
static void
dctcp_ack_received(struct cc_var *ccv, uint16_t type)
{
	struct dctcp *dctcp_data;
	int bytes_acked = 0;

	dctcp_data = ccv->cc_data;

	if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
		/*
		 * DCTCP doesn't treat receipt of ECN marked packet as a
		 * congestion event. Thus, DCTCP always executes the ACK
		 * processing out of congestion recovery.
		 */
		if (IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			EXIT_CONGRECOVERY(CCV(ccv, t_flags));
			newreno_cc_algo.ack_received(ccv, type);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		} else
			newreno_cc_algo.ack_received(ccv, type);

		if (type == CC_DUPACK)
			bytes_acked = CCV(ccv, t_maxseg);

		if (type == CC_ACK)
			bytes_acked = ccv->bytes_this_ack;

		/* Update total bytes. */
		dctcp_data->bytes_total += bytes_acked;

		/* Update total marked bytes. */
		if (dctcp_data->ece_curr) {
			if (!dctcp_data->ece_prev
			    && bytes_acked > CCV(ccv, t_maxseg)) {
				dctcp_data->bytes_ecn +=
				    (bytes_acked - CCV(ccv, t_maxseg));
			} else
				dctcp_data->bytes_ecn += bytes_acked;
			dctcp_data->ece_prev = 1;
		} else {
			if (dctcp_data->ece_prev
			    && bytes_acked > CCV(ccv, t_maxseg))
				dctcp_data->bytes_ecn += CCV(ccv, t_maxseg);
			dctcp_data->ece_prev = 0;
		}
		dctcp_data->ece_curr = 0;

		/*
		 * Update the fraction of marked bytes at the end of
		 * current window size.
		 */
		if ((IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
		    SEQ_GEQ(ccv->curack, CCV(ccv, snd_recover))) ||
		    (!IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
		    SEQ_GT(ccv->curack, dctcp_data->save_sndnxt)))
			dctcp_update_alpha(ccv);
	} else
		newreno_cc_algo.ack_received(ccv, type);
}
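/*
 * The bytes_ecn / bytes_total counters maintained above feed DCTCP's
 * alpha estimate once per window (dctcp_update_alpha() itself is not
 * shown in this excerpt). A minimal sketch of that update, assuming the
 * usual fixed-point formulation alpha <- (1 - g)*alpha + g*F with an
 * example gain of g = 1/16:
 */
#include <stdint.h>

#define DCTCP_SHIFT	10	/* alpha and F scaled by 2^10 (assumed) */
#define DCTCP_G		4	/* g = 2^-4 = 1/16 (assumed) */

static uint32_t
dctcp_alpha_sketch(uint32_t alpha, uint64_t bytes_ecn, uint64_t bytes_total)
{
	uint32_t frac = 0;	/* F, the marked-byte fraction, scaled */

	if (bytes_total > 0)
		frac = (uint32_t)((bytes_ecn << DCTCP_SHIFT) / bytes_total);
	/* alpha - g*alpha + g*F, all in fixed point */
	return (alpha - (alpha >> DCTCP_G) + (frac >> DCTCP_G));
}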
int StreamIterator(Flow *f, TcpStream *stream, int close, void *cbdata, uint8_t iflags)
{
    SCLogDebug("called with %p, %d, %p, %02x", f, close, cbdata, iflags);
    int logged = 0;

    /* optimization: don't iterate list if we've logged all,
     * so check the last segment's flags */
    if (stream->seg_list_tail != NULL &&
        (!(stream->seg_list_tail->flags & SEGMENTTCP_FLAG_LOGAPI_PROCESSED)))
    {
        TcpSegment *seg = stream->seg_list;
        while (seg) {
            uint8_t flags = iflags;

            if (seg->flags & SEGMENTTCP_FLAG_LOGAPI_PROCESSED) {
                seg = seg->next;
                continue;
            }

            if (SEQ_GT(seg->seq + seg->payload_len, stream->last_ack)) {
                SCLogDebug("seg not (fully) acked yet");
                break;
            }

            if (seg->seq == stream->isn + 1)
                flags |= OUTPUT_STREAMING_FLAG_OPEN;
            /* if we need to close and we're at the last segment in the list
             * we add the 'close' flag so the logger can close up. */
            if (close && seg->next == NULL)
                flags |= OUTPUT_STREAMING_FLAG_CLOSE;

            Streamer(cbdata, f, seg->payload, (uint32_t)seg->payload_len, 0, flags);

            seg->flags |= SEGMENTTCP_FLAG_LOGAPI_PROCESSED;

            seg = seg->next;

            logged = 1;
        }
    }

    /* if we need to close we need to invoke the Streamer for sure. If we
     * logged no segments, we call the Streamer with NULL data so it can
     * close up. */
    if (logged == 0 && close) {
        Streamer(cbdata, f, NULL, 0, 0, OUTPUT_STREAMING_FLAG_CLOSE);
    }

    return 0;
}
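/*
 * The SEQ_GT test in the loop above encodes "only log segments the peer
 * has fully acked": a segment covering [seq, seq + payload_len) becomes
 * eligible once last_ack has reached or passed its end. A hedged
 * one-liner form of that predicate, assuming the usual 32-bit macros:
 */
#define SEG_FULLY_ACKED(seg, last_ack) \
    (!(SEQ_GT((seg)->seq + (seg)->payload_len, (last_ack))))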
Example #4
static void tcp_state_update(struct tcpup_info *upp, int state)
{
	fprintf(stderr, "%x/%-4d  %s\t -> %s\n",
			upp->t_conv, _tot_tcp, tcpstates[upp->t_state], tcpstates[state]);

	if (SEQ_GT(upp->snd_nxt, upp->snd_max)) {
		/* update snd max to nxt */
		upp->snd_max = upp->snd_nxt;
	}

	upp->t_state = state;
	upp->x_state = state;
	return;
}
Example #5
/**
 *  \brief Replace (part of) the payload portion of a packet by the data
 *         in a TCP segment
 *
 *  \param p Packet
 *  \param seg TCP segment
 *
 *  \todo What about reassembled fragments?
 *  \todo What about unwrapped tunnel packets?
 */
void StreamTcpInlineSegmentReplacePacket(Packet *p, TcpSegment *seg) {
    SCEnter();

    uint32_t pseq = TCP_GET_SEQ(p);
    uint32_t tseq = seg->seq;

    /* check if segment is within the packet */
    if (tseq + seg->payload_len < pseq) {
        SCReturn;
    } else if (pseq + p->payload_len < tseq) {
        SCReturn;
    } else {
        /** \todo review logic */
        uint32_t pend = pseq + p->payload_len;
        uint32_t tend = tseq + seg->payload_len;
        SCLogDebug("pend %u, tend %u", pend, tend);

        //SCLogDebug("packet");
        //PrintRawDataFp(stdout,p->payload,p->payload_len);
        //SCLogDebug("seg");
        //PrintRawDataFp(stdout,seg->payload,seg->payload_len);

        /* get the minimal seg*_end */
        uint32_t end = (SEQ_GT(pend, tend)) ? tend : pend;
        /* and the max seq */
        uint32_t seq = (SEQ_LT(pseq, tseq)) ? tseq : pseq;

        SCLogDebug("seq %u, end %u", seq, end);

        uint16_t poff = seq - pseq;
        uint16_t toff = seq - tseq;
        SCLogDebug("poff %u, toff %u", poff, toff);

        uint32_t range = end - seq;
        SCLogDebug("range %u", range);
        BUG_ON(range > 65536);

        if (range) {
            /* update the packets payload. As payload is a ptr to either
             * p->pkt or p->ext_pkt that is updated as well */
            memcpy(p->payload+poff, seg->payload+toff, range);

            /* flag as modified so we can reinject / replace after
             * recalculating the checksum */
            p->flags |= PKT_STREAM_MODIFIED;
        }
    }
}
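/*
 * A worked example of the offset arithmetic above (illustrative values):
 *   pseq = 1000, p->payload_len = 100  ->  pend = 1100
 *   tseq = 1040, seg->payload_len = 20 ->  tend = 1060
 * Overlap: seq = max(pseq, tseq) = 1040, end = min(pend, tend) = 1060,
 * so poff = 40, toff = 0, range = 20: twenty bytes of seg->payload
 * replace p->payload[40..59].
 */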
Example #6
/*
 * To remove a SACK block.
 *
 * Parameters:
 *	sack_blk_t *head: pointer to the array of SACK blks.
 *	tcp_seq end: remove all SACK blks with seq num less than end.
 *	int32_t *num: (referenced) total num of SACK blks in the array.
 */
void
tcp_sack_remove(sack_blk_t *head, tcp_seq end, int32_t *num)
{
	sack_blk_t tmp[MAX_SACK_BLK];
	int32_t i, j, old_num, new_num;

	if (*num == 0)
		return;

	old_num = *num;
	new_num = old_num;
	j = 0;
	/* Walk thru the whole list and copy the new list to tmp[]. */
	for (i = 0; i < old_num; i++) {
		if (SEQ_GT(end, head[i].begin)) {
			/*
			 * Check to see if the old SACK blk needs to be
			 * removed or updated.  If the old blk is just
			 * partially covered, update begin and continue.
			 * If the old blk is completely covered, remove it
			 * and continue to check.
			 */
			if (SEQ_GEQ(end, head[i].end)) {
				new_num--;
				continue;
			} else {
				tmp[j].begin = end;
				tmp[j].end = head[i].end;
			}
		} else {
			tmp[j].begin = head[i].begin;
			tmp[j].end = head[i].end;
		}
		j++;
	}
	/* Copy tmp[] back to the original list. */
	for (i = 0; i < new_num; i++) {
		head[i].begin = tmp[i].begin;
		head[i].end = tmp[i].end;
	}
	*num = new_num;
}
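/*
 * A typical call site, sketched with assumed connection-block field
 * names: prune the receiver's SACK list whenever the cumulative sequence
 * number advances, since blocks below rcv_nxt are now covered by the
 * ordinary ACK.
 *
 *	tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rcv_nxt,
 *	    &tcp->tcp_num_sack_blk);
 */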
/**
 * This function is called by the receive layer when a data packet or an explicit ACK
 * has been received. The ACK value is one greater (modulo 8) than the seq number of the
 * last packet successfully received.
 */
void AJ_SerialTx_ReceivedAck(uint8_t ack)
{
    TxPkt volatile* ackedPkt = NULL;

    if (txSent == NULL) {
        return;
    }

    /*
     * Remove acknowledged packets from sent queue.
     */
    while ((txSent != NULL) && SEQ_GT(ack, txSent->seq)) {
        ackedPkt = txSent;
        txSent = txSent->next;
        //AJ_AlwaysPrintf("Releasing seq=%d (acked by %d)\n", ackedPkt->seq, ack);

        AJ_ASSERT(ackedPkt->type == AJ_SERIAL_DATA);
        /*
         * Return pkt to ACL free list.
         */
        ackedPkt->next = txFreeList;
        txFreeList = ackedPkt;

        /*
         * If all packets have been ack'd, halt the resend timer and return.
         */
        if (txSent == NULL) {
            AJ_InitTimer(&resendTime);
            AJ_TimeAddOffset(&resendTime, AJ_TIMER_FOREVER);
            resendPrimed = FALSE;
            return;
        }
    }
    /*
     * Reset the resend timer if one or more packets were ack'd.
     */
    if (ackedPkt != NULL) {
        AJ_InitTimer(&resendTime);
        AJ_TimeAddOffset(&resendTime, AJ_SerialLinkParams.txResendTimeout);
        resendPrimed = TRUE;
    }
}
Example #8
/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
static void
newreno_post_recovery(struct cc_var *ccv)
{
	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * Fast recovery will conclude after returning from this
		 * function. Window inflation should have left us with
		 * approximately snd_ssthresh outstanding data. But in case we
		 * would be inclined to send a burst, better to do it via the
		 * slow start mechanism.
		 *
		 * XXXLAS: Find a way to do this without needing curack
		 */
		if (SEQ_GT(ccv->curack + CCV(ccv, snd_ssthresh),
		    CCV(ccv, snd_max)))
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) -
			ccv->curack + CCV(ccv, t_maxseg);
		else
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
	}
}
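/*
 * An illustrative trace of the deflation above: with snd_max = 100000,
 * curack = 90000, t_maxseg = 1000 and snd_ssthresh = 20000, the test
 * curack + ssthresh > snd_max holds (110000 > 100000), so
 * snd_cwnd = 100000 - 90000 + 1000 = 11000, i.e. the data still in
 * flight plus one segment, which resumes transmission without a burst.
 */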
Example #9
/*
 * USTAT PDU / SOS_READY Processor
 *
 * Arguments:
 *	sop	pointer to sscop connection block
 *	m	pointer to PDU buffer (without trailer)
 *	trlr	pointer to PDU trailer
 *
 * Returns:
 *	none
 *
 */
void
sscop_ustat_ready(struct sscop *sop, KBuffer *m, caddr_t trlr)
{
	struct ustat_pdu	*up = (struct ustat_pdu *)trlr;
	struct pdu_hdr	*php;
	sscop_seq	seq1, seq2;

	up->ustat_nmr = ntohl(up->ustat_nmr);
	up->ustat_nr = ntohl(up->ustat_nr);

	/*
	 * Validate peer's current receive data sequence number
	 */
	if (SEQ_GT(sop->so_ack, up->ustat_nr, sop->so_ack) ||
	    SEQ_GEQ(up->ustat_nr, sop->so_send, sop->so_ack)) {
		/*
		 * Bad data sequence number
		 */
		goto goterr;
	}

	/*
	 * Free acknowledged PDUs
	 */
	for (seq1 = sop->so_ack, SEQ_SET(seq2, up->ustat_nr);
			SEQ_LT(seq1, seq2, sop->so_ack);
			SEQ_INCR(seq1, 1)) {
		sscop_pack_free(sop, seq1);
	}

	/*
	 * Update transmit state variables
	 */
	sop->so_ack = seq2;
	SEQ_SET(sop->so_sendmax, up->ustat_nmr);

	/*
	 * Get USTAT list elements
	 */
	SEQ_SET(seq1, ntohl(up->ustat_le1));
	SEQ_SET(seq2, ntohl(up->ustat_le2));

	/*
	 * Validate elements
	 */
	if (SEQ_GT(sop->so_ack, seq1, sop->so_ack) ||
	    SEQ_GEQ(seq1, seq2, sop->so_ack) ||
	    SEQ_GEQ(seq2, sop->so_send, sop->so_ack)) {
		/*
		 * Bad element sequence number
		 */
		goto goterr;
	}

	/*
	 * Process each missing sequence number in this gap
	 */
	while (SEQ_LT(seq1, seq2, sop->so_ack)) {
		/*
		 * Find corresponding SD PDU on pending ack queue
		 */
		php = sscop_pack_locate(sop, seq1);
		if (php == NULL) {
			goto goterr;
		}

		/*
		 * Retransmit this SD PDU if it's not
		 * already scheduled for retransmission.
		 */
		if ((php->ph_rexmit_lk == NULL) &&
		    (sop->so_rexmit_tl != php)) {
			/*
			 * Put PDU on retransmit queue and schedule
			 * transmit servicing
			 */
			sscop_rexmit_insert(sop, php);
			sop->so_flags |= SOF_XMITSRVC;
		}

		/*
		 * Bump to next sequence number
		 */
		SEQ_INCR(seq1, 1);
	}

	/*
	 * Report retransmitted PDUs
	 */
	sscop_maa_error(sop, 'V');

	/*
	 * Free PDU buffer chain
	 */
	KB_FREEALL(m);

	/*
	 * See if transmit queues need servicing
	 */
	if (sop->so_flags & SOF_XMITSRVC)
		sscop_service_xmit(sop);

	return;

goterr:
	/*
	 * Protocol/parameter error encountered
	 */
	sscop_maa_error(sop, 'T');

	/*
	 * Free PDU buffer chain
	 */
	KB_FREEALL(m);

	if (sop->so_vers == SSCOP_VERS_QSAAL)
		/*
		 * Reestablish a new connection
		 */
		qsaal1_reestablish(sop);
	else
		/*
		 * Initiate error recovery
		 */
		q2110_error_recovery(sop);

	return;
}
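/*
 * Unlike the two-argument TCP macros elsewhere on this page, the SSCOP
 * comparisons above take a third "base" argument: SSCOP sequence numbers
 * live in a 24-bit modular space and are ordered by their distance from
 * the window's lower edge. A plausible sketch of such macros (the real
 * HARP definitions may differ):
 */
#define SSCOP_SEQ_MASK	0x00ffffff	/* 2^24 sequence space */

#define SEQ_LT(a, b, base) \
	((((a) - (base)) & SSCOP_SEQ_MASK) < (((b) - (base)) & SSCOP_SEQ_MASK))
#define SEQ_GT(a, b, base)	SEQ_LT((b), (a), (base))
#define SEQ_GEQ(a, b, base)	(!SEQ_LT((a), (b), (base)))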
Example #10
/*
 * STAT PDU / SOS_READY Processor
 *
 * Arguments:
 *	sop	pointer to sscop connection block
 *	m	pointer to PDU buffer (without trailer)
 *	trlr	pointer to PDU trailer
 *
 * Returns:
 *	none
 *
 */
void
sscop_stat_ready(struct sscop *sop, KBuffer *m, caddr_t trlr)
{
	struct stat_pdu	*sp = (struct stat_pdu *)trlr;
	struct pdu_hdr	*php;
	KBuffer		*m0 = m;
	sscop_seq	seq1, seq2, opa;
	int		cnt = 0;

	sp->stat_nps = ntohl(sp->stat_nps);
	sp->stat_nmr = ntohl(sp->stat_nmr);
	sp->stat_nr = ntohl(sp->stat_nr);

	/*
	 * Validate peer's received poll sequence number
	 */
	if (SEQ_GT(sop->so_pollack, sp->stat_nps, sop->so_pollack) ||
	    SEQ_GT(sp->stat_nps, sop->so_pollsend, sop->so_pollack)) {
		/*
		 * Bad poll sequence number
		 */
		sscop_maa_error(sop, 'R');
		goto goterr;
	}

	/*
	 * Validate peer's current receive data sequence number
	 */
	if (SEQ_GT(sop->so_ack, sp->stat_nr, sop->so_ack) ||
	    SEQ_GT(sp->stat_nr, sop->so_send, sop->so_ack)) {
		/*
		 * Bad data sequence number
		 */
		sscop_maa_error(sop, 'S');
		goto goterr;
	}

	/*
	 * Free acknowledged PDUs
	 */
	for (seq1 = sop->so_ack, SEQ_SET(seq2, sp->stat_nr);
			SEQ_LT(seq1, seq2, sop->so_ack);
			SEQ_INCR(seq1, 1)) {
		sscop_pack_free(sop, seq1);
	}

	/*
	 * Update transmit state variables
	 */
	opa = sop->so_pollack;
	sop->so_ack = seq2;
	SEQ_SET(sop->so_pollack, sp->stat_nps);
	SEQ_SET(sop->so_sendmax, sp->stat_nmr);

	/*
	 * Get first element in STAT list
	 */
	while (m && (KB_LEN(m) == 0))
		m = KB_NEXT(m);
	if (m == NULL)
		goto done;
	m = sscop_stat_getelem(m, &seq1);

	/*
	 * Make sure there's a second element too
	 */
	if (m == NULL)
		goto done;

	/*
	 * Validate first element (start of missing pdus)
	 */
	if (SEQ_GT(sop->so_ack, seq1, sop->so_ack) ||
	    SEQ_GEQ(seq1, sop->so_send, sop->so_ack)) {
		/*
		 * Bad element sequence number
		 */
		sscop_maa_error(sop, 'S');
		goto goterr;
	}

	/*
	 * Loop thru all STAT elements in list
	 */
	while (m) {
		/*
		 * Get next even element (start of received pdus)
		 */
		m = sscop_stat_getelem(m, &seq2);

		/*
		 * Validate sequence number
		 */
		if (SEQ_GEQ(seq1, seq2, sop->so_ack) ||
		    SEQ_GT(seq2, sop->so_send, sop->so_ack)) {
			/*
			 * Bad element sequence number
			 */
			sscop_maa_error(sop, 'S');
			goto goterr;
		}

		/*
		 * Process each missing sequence number in this gap
		 */
		while (SEQ_LT(seq1, seq2, sop->so_ack)) {
			/*
			 * Find corresponding SD PDU on pending ack queue
			 */
			php = sscop_pack_locate(sop, seq1);
			if (php == NULL) {
				sscop_maa_error(sop, 'S');
				goto goterr;
			}

			/*
			 * Retransmit this SD PDU only if it was last sent
			 * during an earlier poll sequence and it's not
			 * already scheduled for retransmission.
			 */
			if (SEQ_LT(php->ph_nps, sp->stat_nps, opa) &&
			    (php->ph_rexmit_lk == NULL) &&
			    (sop->so_rexmit_tl != php)) {
				/*
				 * Put PDU on retransmit queue and schedule
				 * transmit servicing
				 */
				sscop_rexmit_insert(sop, php);
				sop->so_flags |= SOF_XMITSRVC;
				cnt++;
			}

			/*
			 * Bump to next sequence number
			 */
			SEQ_INCR(seq1, 1);
		}

		/*
		 * Now process series of acknowledged PDUs
		 *
		 * Get next odd element (start of missing pdus),
		 * but make sure there is one and that it's valid
		 */
		if (m == NULL)
			goto done;
		m = sscop_stat_getelem(m, &seq2);
		if (SEQ_GEQ(seq1, seq2, sop->so_ack) ||
		    SEQ_GT(seq2, sop->so_send, sop->so_ack)) {
			/*
			 * Bad element sequence number
			 */
			sscop_maa_error(sop, 'S');
			goto goterr;
		}

		/*
		 * Process each acked sequence number
		 */
		while (SEQ_LT(seq1, seq2, sop->so_ack)) {
			/*
			 * Can we clear transmit buffers ??
			 */
			if ((sop->so_flags & SOF_NOCLRBUF) == 0) {
				/*
				 * Yes, free acked buffers
				 */
				sscop_pack_free(sop, seq1);
			}

			/*
			 * Bump to next sequence number
			 */
			SEQ_INCR(seq1, 1);
		}
	}

done:
	/*
	 * Free PDU buffer chain
	 */
	KB_FREEALL(m0);

	/*
	 * Report retransmitted PDUs
	 */
	if (cnt)
		sscop_maa_error(sop, 'V');

	/*
	 * Record transmit window closed transitions
	 */
	if (SEQ_LT(sop->so_send, sop->so_sendmax, sop->so_ack)) {
		if (sop->so_flags & SOF_NOCREDIT) {
			sop->so_flags &= ~SOF_NOCREDIT;
			sscop_maa_error(sop, 'X');
		}
	} else {
		if ((sop->so_flags & SOF_NOCREDIT) == 0) {
			sop->so_flags |= SOF_NOCREDIT;
			sscop_maa_error(sop, 'W');
		}
	}

	if (sop->so_vers == SSCOP_VERS_QSAAL)
		/*
		 * Restart lost poll/stat timer
		 */
		sop->so_timer[SSCOP_T_NORESP] = sop->so_parm.sp_timeresp;
	else {
		/*
		 * Determine new polling phase
		 */
		if ((sop->so_timer[SSCOP_T_POLL] != 0) &&
		    ((sop->so_flags & SOF_KEEPALIVE) == 0)) {
			/*
			 * Remain in active phase - reset NO-RESPONSE timer
			 */
			sop->so_timer[SSCOP_T_NORESP] =
					sop->so_parm.sp_timeresp;

		} else if (sop->so_timer[SSCOP_T_IDLE] == 0) {
			/*
			 * Go from transient to idle phase
			 */
			sop->so_timer[SSCOP_T_POLL] = 0;
			sop->so_flags &= ~SOF_KEEPALIVE;
			sop->so_timer[SSCOP_T_NORESP] = 0;
			sop->so_timer[SSCOP_T_IDLE] = sop->so_parm.sp_timeidle;
		}
	}

	/*
	 * See if transmit queues need servicing
	 */
	if (sop->so_flags & SOF_XMITSRVC)
		sscop_service_xmit(sop);

	return;

goterr:
	/*
	 * Protocol/parameter error encountered
	 */

	/*
	 * Free PDU buffer chain
	 */
	KB_FREEALL(m0);

	if (sop->so_vers == SSCOP_VERS_QSAAL)
		/*
		 * Reestablish a new connection
		 */
		qsaal1_reestablish(sop);
	else
		/*
		 * Initiate error recovery
		 */
		q2110_error_recovery(sop);

	return;
}
Example #11
/*
 * When a new ack with SACK is received, check if it indicates packet
 * reordering. If there is packet reordering, the socket is marked and
 * the late time offset by which the packet was reordered with
 * respect to its closest neighboring packets is computed.
 */
static void
tcp_sack_detect_reordering(struct tcpcb *tp, struct sackhole *s,
    tcp_seq sacked_seq, tcp_seq snd_fack)
{
	int32_t rext = 0, reordered = 0;

	/*
	 * If the SACK hole is past snd_fack, this is from new SACK
	 * information, so we can ignore it.
	 */
	if (SEQ_GT(s->end, snd_fack))
		return;
	/*
	 * If there has been a retransmit timeout, then the timestamp on 
	 * the SACK segment will be newer. This might lead to a
	 * false-positive. Avoid re-ordering detection in this case.
	 */
	if (tp->t_rxtshift > 0)
		return;

	/*
	 * Detect reordering from SACK information by checking
	 * if recently sacked data was never retransmitted from this hole.
	 */
	if (SEQ_LT(s->rxmit, sacked_seq)) {
		reordered = 1;
		tcpstat.tcps_avoid_rxmt++;
	}

	if (reordered) {
		if (tcp_detect_reordering == 1 &&
		    !(tp->t_flagsext & TF_PKTS_REORDERED)) {
			tp->t_flagsext |= TF_PKTS_REORDERED;
			tcpstat.tcps_detect_reordering++;
		}

		tcpstat.tcps_reordered_pkts++;
		tp->t_reordered_pkts++;

		/*
		 * If reordering is seen on a connection with ECN enabled,
		 * increment the heuristic
		 */
		if (TCP_ECN_ENABLED(tp)) {
			INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_reorder);
			tcpstat.tcps_ecn_fallback_reorder++;
			tcp_heuristic_ecn_aggressive(tp);
		}

		VERIFY(SEQ_GEQ(snd_fack, s->rxmit));

		if (s->rxmit_start > 0) {
			rext = timer_diff(tcp_now, 0, s->rxmit_start, 0);
			if (rext < 0)
				return;

			/*
			 * We take the maximum reorder window to schedule
			 * DELAYFR timer as that will take care of jitter
			 * on the network path.
			 *
			 * Computing average and standard deviation seems
			 * to cause unnecessary retransmissions when there
			 * is high jitter.
			 *
			 * We set a maximum of SRTT/2 and a minimum of
			 * 10 ms on the reorder window.
			 */
			tp->t_reorderwin = max(tp->t_reorderwin, rext);
			tp->t_reorderwin = min(tp->t_reorderwin,
			    (tp->t_srtt >> (TCP_RTT_SHIFT - 1)));
			tp->t_reorderwin = max(tp->t_reorderwin, 10);
		}
	}
}
Example #12
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct inpcb * const inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	long len, recvwin, sendwin;
	int nsacked = 0;
	int off, flags, error;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip = NULL;
	struct ipovly *ipov = NULL;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned int ipoptlen, optlen, hdrlen;
	int idle;
	boolean_t sendalot;
	struct ip6_hdr *ip6 = NULL;
#ifdef INET6
	const boolean_t isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#else
	const boolean_t isipv6 = FALSE;
#endif

	KKASSERT(so->so_port == &curthread->td_msgport);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */

	/*
	 * If we have been idle for a while, the send congestion window
	 * could be no longer representative of the current state of the link.
	 * So unless we are expecting more acks to come in, slow-start from
	 * scratch to re-determine the send congestion window.
	 */
	if (tp->snd_max == tp->snd_una &&
	    (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
		if (tcp_do_rfc3390) {
			int initial_cwnd =
			    min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));

			tp->snd_cwnd = min(tp->snd_cwnd, initial_cwnd);
		} else {
			tp->snd_cwnd = tp->t_maxseg;
		}
		tp->snd_wacked = 0;
	}

	/*
	 * Calculate whether the transmit stream was previously idle 
	 * and adjust TF_LASTIDLE for the next time.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (tp->t_flags & TF_MORETOCOME))
		tp->t_flags |= TF_LASTIDLE;
	else
		tp->t_flags &= ~TF_LASTIDLE;

	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp))
		nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt);

again:
	/* Make use of SACK information when slow-starting after a RTO. */
	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp)) {
		tcp_seq old_snd_nxt = tp->snd_nxt;

		tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt);
		nsacked += tp->snd_nxt - old_snd_nxt;
	}

	sendalot = FALSE;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_flags & TF_FORCE) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.ssb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			tcp_callout_stop(tp, tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 */
	len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off;

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data, suppress sending
	 * segment (sending the segment would be an option if we still
	 * did TAO and the remote host supported it).
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT)
			return 0;
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if (flags & TH_SYN) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * If FIN has been sent but not acked,
		 * but we haven't been called to retransmit,
		 * len will be < 0.  Otherwise, window shrank
		 * after we sent into it.  If window shrank to 0,
		 * cancel pending retransmit, pull snd_nxt back
		 * to (closed) window, and set the persist timer
		 * if it isn't already going.  If the window didn't
		 * close completely, just wait for an ACK.
		 */
		len = 0;
		if (sendwin == 0) {
			tcp_callout_stop(tp, tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!tcp_callout_active(tp, tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	KASSERT(len >= 0, ("%s: len < 0", __func__));
	/*
	 * Automatic sizing of send socket buffer.  Often the send buffer
	 * size is not optimally adjusted to the actual network conditions
	 * at hand (delay bandwidth product).  Setting the buffer size too
	 * small limits throughput on links with high bandwidth and high
	 * delay (eg. trans-continental/oceanic links).  Setting the
	 * buffer size too big consumes too much real kernel memory,
	 * especially with many connections on busy servers.
	 *
	 * The criteria to step up the send buffer one notch are:
	 *  1. receive window of remote host is larger than send buffer
	 *     (with a fudge factor of 5/4th);
	 *  2. send buffer is filled to 7/8th with data (so we actually
	 *     have data to make use of it);
	 *  3. send buffer fill has not hit maximal automatic size;
	 *  4. our send window (slow start and congestion controlled) is
	 *     larger than sent but unacknowledged data in send buffer.
	 *
	 * The remote host receive window scaling factor may limit the
	 * growing of the send buffer before it reaches its allowed
	 * maximum.
	 *
	 * It scales directly with slow start or congestion window
	 * and does at most one step per received ACK.  This fast
	 * scaling has the drawback of growing the send buffer beyond
	 * what is strictly necessary to make full use of a given
	 * delay*bandwidth product.  However testing has shown this not
	 * to be much of a problem.  At worst we are trading wasting
	 * of available bandwidth (the non-use of it) for wasting some
	 * socket buffer memory.
	 *
	 * TODO: Shrink send buffer during idle periods together
	 * with congestion window.  Requires another timer.  Has to
	 * wait for upcoming tcp timer rewrite.
	 */
	if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) {
		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat &&
		    so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) &&
		    so->so_snd.ssb_cc < tcp_autosndbuf_max &&
		    sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) {
			u_long newsize;

			newsize = ulmin(so->so_snd.ssb_hiwat +
					 tcp_autosndbuf_inc,
					tcp_autosndbuf_max);
			if (!ssb_reserve(&so->so_snd, newsize, so, NULL))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
			if (newsize >= (TCP_MAXWIN << tp->snd_scale))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
		}
	}

	/*
	 * Truncate to the maximum segment length and ensure that FIN is
	 * removed if the length no longer contains the last data byte.
	 */
	if (len > tp->t_maxseg) {
		len = tp->t_maxseg;
		sendalot = TRUE;
	}
	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc))
		flags &= ~TH_FIN;

	recvwin = ssb_space(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len == tp->t_maxseg)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.ssb_cc &&
		    !(tp->t_flags & TF_NOPUSH)) {
			goto send;
		}
		if (tp->t_flags & TF_FORCE)		/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 */
	if (recvwin > 0) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);
		long hiwat;

		/*
		 * This ack case typically occurs when the user has drained
		 * the TCP socket buffer sufficiently to warrant an ack
		 * containing a 'pure window update'... that is, an ack that
		 * ONLY updates the tcp window.
		 *
		 * It is unclear why we would need to do a pure window update
		 * past 2 segments if we are going to do one at 1/2 the high
		 * water mark anyway, especially since under normal conditions
		 * the user program will drain the socket buffer quickly.
		 * The 2-segment pure window update will often add a large
		 * number of extra, unnecessary acks to the stream.
		 *
		 * avoid_pure_win_update now defaults to 1.
		 */
		if (avoid_pure_win_update == 0 ||
		    (tp->t_flags & TF_RXRESIZED)) {
			if (adv >= (long) (2 * tp->t_maxseg)) {
				goto send;
			}
		}
		hiwat = (long)(TCP_MAXWIN << tp->rcv_scale);
		if (hiwat > (long)so->so_rcv.ssb_hiwat)
			hiwat = (long)so->so_rcv.ssb_hiwat;
		if (adv >= hiwat / 2)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN)))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if (flags & TH_FIN &&
	    (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
		goto send;

	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to ensure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tcp_callout_active(tp, tp->tt_persist)
	 *	is true when we are in persist state.
	 * The TF_FORCE flag in tp->t_flags
	 *	is set when we are called to send a persist packet.
	 * tcp_callout_active(tp, tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can,
	 * otherwise force out a byte.
	 */
	if (so->so_snd.ssb_cc > 0 &&
	    !tcp_callout_active(tp, tp->tt_rexmt) &&
	    !tcp_callout_active(tp, tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
	return (0);

send:
	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
	if (isipv6)
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		hdrlen = sizeof(struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if (!(tp->t_flags & TF_NOOPT)) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp));
			memcpy(opt + 2, &mss, sizeof mss);
			optlen = TCPOLEN_MAXSEG;

			if ((tp->t_flags & TF_REQ_SCALE) &&
			    (!(flags & TH_ACK) ||
			     (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}

			if ((tcp_do_sack && !(flags & TH_ACK)) ||
			    tp->t_flags & TF_SACK_PERMITTED) {
				uint32_t *lp = (uint32_t *)(opt + optlen);

				*lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED);
				optlen += TCPOLEN_SACK_PERMITTED_ALIGNED;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
	    !(flags & TH_RST) &&
	    (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}

	/* Set receive buffer autosizing timestamp. */
	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
		tp->rfbuf_ts = ticks;

	/*
	 * If this is a SACK connection and we have a block to report,
	 * fill in the SACK blocks in the TCP options.
	 */
	if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) ==
		TF_SACK_PERMITTED &&
	    (!LIST_EMPTY(&tp->t_segq) ||
	     tp->reportblk.rblk_start != tp->reportblk.rblk_end))
		tcp_sack_fill_report(tp, opt, &optlen);

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;
		/*
		 * Initialize TCP-MD5 option (RFC2385)
		 */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;
		/*
		 * Terminate options list and maintain 32-bit alignment.
		 */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */
	KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options"));
	hdrlen += optlen;

	if (isipv6) {
		ipoptlen = ip6_optlen(inp);
	} else {
		if (inp->inp_options) {
			ipoptlen = inp->inp_options->m_len -
			    offsetof(struct ipoption, ipopt_list);
		} else {
Example #13
static int tcpup_state_receive(struct tcpup_info *upp, struct tcpiphdr *tcp, size_t dlen)
{
	int xflags = 0;
	int snd_una = htonl(tcp->th_ack);

	if (tcp->th_flags & TH_RST) {
		upp->t_state = TCPS_CLOSED;
		return 0;
	}

	if ((tcp->th_flags & TH_ACK) && SEQ_GT(snd_una, upp->snd_una)) {
		/* update snd una from peer */
		upp->snd_una = snd_una;
	}

	switch (upp->t_state) {
		case TCPS_SYN_SENT:
			xflags = TH_SYN| TH_ACK;
			if ((tcp->th_flags & xflags) == TH_SYN) {
				assert((tcp->th_flags & TH_FIN) != TH_FIN);
				tcp_state_preload(upp, TCPS_SYN_RECEIVED, htonl(tcp->th_seq) + 1);
				return 0;
			}

			if ((tcp->th_flags & xflags) == xflags
					&& SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
				assert((tcp->th_flags & TH_FIN) != TH_FIN);
				tcp_state_preload(upp, TCPS_ESTABLISHED, htonl(tcp->th_seq));
				return 0;
			}
			break;

		case TCPS_SYN_RECEIVED:
			if ((tcp->th_flags & TH_ACK) == TH_ACK
					&& SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
				assert((tcp->th_flags & TH_FIN) != TH_FIN);
				tcp_state_preload(upp, TCPS_ESTABLISHED, htonl(tcp->th_seq));
				return 0;
			}
			break;

		case TCPS_ESTABLISHED:
			if ((tcp->th_flags & TH_FIN) == TH_FIN) {
				tcp_state_preload(upp, TCPS_CLOSE_WAIT, htonl(tcp->th_seq) + 1);
				return 0;
			}
			break;

		case TCPS_FIN_WAIT_1:
			xflags = TH_FIN| TH_ACK;
			if ((tcp->th_flags & xflags) == xflags
					&& SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
				tcp_state_preload(upp, TCPS_TIME_WAIT, htonl(tcp->th_seq) + dlen + 1);
				return 0;
			}

			if ((tcp->th_flags & TH_FIN) == TH_FIN) {
				tcp_state_preload(upp, TCPS_CLOSING, htonl(tcp->th_seq) + dlen + 1);
				return 0;
			}

			if ((tcp->th_flags & TH_ACK) == TH_ACK
					&& SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
				tcp_state_preload(upp, TCPS_FIN_WAIT_2, htonl(tcp->th_seq) + dlen);
				return 0;
			}
			break;

		case TCPS_FIN_WAIT_2:
			if ((tcp->th_flags & TH_FIN) == TH_FIN) {
				tcp_state_preload(upp, TCPS_TIME_WAIT, htonl(tcp->th_seq) + dlen + 1);
				return 0;
			}
			break;

		case TCPS_CLOSING:
			if ((tcp->th_flags & TH_ACK) == TH_ACK
					&& SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
				tcp_state_preload(upp, TCPS_TIME_WAIT, htonl(tcp->th_seq) + dlen);
				return 0;
			}
			break;

		case TCPS_LAST_ACK:
			if ((tcp->th_flags & TH_ACK) == TH_ACK
					&& SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
				tcp_state_preload(upp, TCPS_CLOSED, htonl(tcp->th_seq) + dlen);
				return 0;
			}
			break;

		case TCPS_TIME_WAIT:
			fprintf(stderr, "before TIME_WAIT -> TIME_WAIT\n");
			break;
	}

	return 0;
}
Example #14
/*
 * This function is called upon receipt of new valid data (while not in
 * header prediction mode), and it updates the ordered list of sacks.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
    /*
     * First reported block MUST be the most recent one.  Subsequent
     * blocks SHOULD be in the order in which they arrived at the
     * receiver.  These two conditions make the implementation fully
     * compliant with RFC 2018.
     */
    struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
    int num_head, num_saved, i;

//ScenSim-Port//    INP_WLOCK_ASSERT(tp->t_inpcb);

    /* Check arguments. */
//ScenSim-Port//    KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));

    /* SACK block for the received segment. */
    head_blk.start = rcv_start;
    head_blk.end = rcv_end;

    /*
     * Merge updated SACK blocks into head_blk, and save unchanged SACK
     * blocks into saved_blks[].  num_saved will have the number of the
     * saved SACK blocks.
     */
    num_saved = 0;
    for (i = 0; i < tp->rcv_numsacks; i++) {
        tcp_seq start = tp->sackblks[i].start;
        tcp_seq end = tp->sackblks[i].end;
        if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
            /*
             * Discard this SACK block.
             */
        } else if (SEQ_LEQ(head_blk.start, end) &&
               SEQ_GEQ(head_blk.end, start)) {
            /*
             * Merge this SACK block into head_blk.  This SACK
             * block itself will be discarded.
             */
            if (SEQ_GT(head_blk.start, start))
                head_blk.start = start;
            if (SEQ_LT(head_blk.end, end))
                head_blk.end = end;
        } else {
            /*
             * Save this SACK block.
             */
            saved_blks[num_saved].start = start;
            saved_blks[num_saved].end = end;
            num_saved++;
        }
    }

    /*
     * Update SACK list in tp->sackblks[].
     */
    num_head = 0;
    if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
        /*
         * The received data segment is an out-of-order segment.  Put
         * head_blk at the top of SACK list.
         */
        tp->sackblks[0] = head_blk;
        num_head = 1;
        /*
         * If the number of saved SACK blocks exceeds its limit,
         * discard the last SACK block.
         */
        if (num_saved >= MAX_SACK_BLKS)
            num_saved--;
    }
    if (num_saved > 0) {
        /*
         * Copy the saved SACK blocks back.
         */
        bcopy(saved_blks, &tp->sackblks[num_head],
              sizeof(struct sackblk) * num_saved);
    }

    /* Save the number of SACK blocks. */
    tp->rcv_numsacks = num_head + num_saved;
}
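/*
 * A short trace of the merge logic above (illustrative values):
 * rcv_nxt = 100 with existing blocks [200,300) and [400,500); a new
 * out-of-order segment arrives as [300,400).  head_blk starts as
 * [300,400), merges with [200,300) into [200,400), then with [400,500)
 * into [200,500).  Since its start (200) is beyond rcv_nxt, the merged
 * block becomes sackblks[0] and rcv_numsacks = 1: three blocks coalesce
 * into one.
 */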
Example #15
/*
 * Process cumulative ACK and the TCP SACK option to update the scoreboard.
 * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
 * the sequence space).
 */
void
tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
{
    struct sackhole *cur, *temp;
    struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
    int i, j, num_sack_blks;

//ScenSim-Port//    INP_WLOCK_ASSERT(tp->t_inpcb);

    num_sack_blks = 0;
    /*
     * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
     * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
     */
    if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
        sack_blocks[num_sack_blks].start = tp->snd_una;
        sack_blocks[num_sack_blks++].end = th_ack;
    }
    /*
     * Append received valid SACK blocks to sack_blocks[], but only if we
     * received new blocks from the other side.
     */
    if (to->to_flags & TOF_SACK) {
        for (i = 0; i < to->to_nsacks; i++) {
            bcopy((to->to_sacks + i * TCPOLEN_SACK),
                &sack, sizeof(sack));
            sack.start = ntohl(sack.start);
            sack.end = ntohl(sack.end);
            if (SEQ_GT(sack.end, sack.start) &&
                SEQ_GT(sack.start, tp->snd_una) &&
                SEQ_GT(sack.start, th_ack) &&
                SEQ_LT(sack.start, tp->snd_max) &&
                SEQ_GT(sack.end, tp->snd_una) &&
                SEQ_LEQ(sack.end, tp->snd_max))
                sack_blocks[num_sack_blks++] = sack;
        }
    }
    /*
     * Return if SND.UNA is not advanced and no valid SACK block is
     * received.
     */
    if (num_sack_blks == 0)
        return;

    /*
     * Sort the SACK blocks so we can update the scoreboard with just one
     * pass. The overhead of sorting up to 4+1 elements is less than
     * making up to 4+1 passes over the scoreboard.
     */
    for (i = 0; i < num_sack_blks; i++) {
        for (j = i + 1; j < num_sack_blks; j++) {
            if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
                sack = sack_blocks[i];
                sack_blocks[i] = sack_blocks[j];
                sack_blocks[j] = sack;
            }
        }
    }
    if (TAILQ_EMPTY(&tp->snd_holes))
        /*
         * Empty scoreboard. Need to initialize snd_fack (it may be
         * uninitialized or have a bogus value). Scoreboard holes
         * (from the sack blocks received) are created later below
         * (in the logic that adds holes to the tail of the
         * scoreboard).
         */
        tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
    /*
     * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
     * SACK holes (snd_holes) are traversed from their tails with just
     * one pass in order to reduce the number of compares especially when
     * the bandwidth-delay product is large.
     *
     * Note: Typically, in the first RTT of SACK recovery, the highest
     * three or four SACK blocks with the same ack number are received.
     * In the second RTT, if retransmitted data segments are not lost,
     * the highest three or four SACK blocks with ack number advancing
     * are received.
     */
    sblkp = &sack_blocks[num_sack_blks - 1];    /* Last SACK block */
    tp->sackhint.last_sack_ack = sblkp->end;
    if (SEQ_LT(tp->snd_fack, sblkp->start)) {
        /*
         * The highest SACK block is beyond fack.  Append new SACK
         * hole at the tail.  If the second or later highest SACK
         * blocks are also beyond the current fack, they will be
         * inserted by way of hole splitting in the while-loop below.
         */
        temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
        if (temp != NULL) {
            tp->snd_fack = sblkp->end;
            /* Go to the previous sack block. */
            sblkp--;
        } else {
            /*
             * We failed to add a new hole based on the current
             * sack block.  Skip over all the sack blocks that
             * fall completely to the right of snd_fack and
             * proceed to trim the scoreboard based on the
             * remaining sack blocks.  This also trims the
             * scoreboard for th_ack (which is sack_blocks[0]).
             */
            while (sblkp >= sack_blocks &&
                   SEQ_LT(tp->snd_fack, sblkp->start))
                sblkp--;
            if (sblkp >= sack_blocks &&
                SEQ_LT(tp->snd_fack, sblkp->end))
                tp->snd_fack = sblkp->end;
        }
    } else if (SEQ_LT(tp->snd_fack, sblkp->end))
        /* fack is advanced. */
        tp->snd_fack = sblkp->end;
    /* We must have at least one SACK hole in scoreboard. */
//ScenSim-Port//    KASSERT(!TAILQ_EMPTY(&tp->snd_holes),
//ScenSim-Port//        ("SACK scoreboard must not be empty"));
    cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */
    /*
     * Since the incoming sack blocks are sorted, we can process them
     * making one sweep of the scoreboard.
     */
    while (sblkp >= sack_blocks  && cur != NULL) {
        if (SEQ_GEQ(sblkp->start, cur->end)) {
            /*
             * SACKs data beyond the current hole.  Go to the
             * previous sack block.
             */
            sblkp--;
            continue;
        }
        if (SEQ_LEQ(sblkp->end, cur->start)) {
            /*
             * SACKs data before the current hole.  Go to the
             * previous hole.
             */
            cur = TAILQ_PREV(cur, sackhole_head, scblink);
            continue;
        }
        tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
//ScenSim-Port//        KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
//ScenSim-Port//            ("sackhint bytes rtx >= 0"));
        if (SEQ_LEQ(sblkp->start, cur->start)) {
            /* Data acks at least the beginning of hole. */
            if (SEQ_GEQ(sblkp->end, cur->end)) {
                /* Acks entire hole, so delete hole. */
                temp = cur;
                cur = TAILQ_PREV(cur, sackhole_head, scblink);
                tcp_sackhole_remove(tp, temp);
                /*
                 * The sack block may ack all or part of the
                 * next hole too, so continue onto the next
                 * hole.
                 */
                continue;
            } else {
                /* Move start of hole forward. */
                cur->start = sblkp->end;
                cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
            }
        } else {
            /* Data acks at least the end of hole. */
            if (SEQ_GEQ(sblkp->end, cur->end)) {
                /* Move end of hole backward. */
                cur->end = sblkp->start;
                cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
            } else {
                /*
                 * ACKs some data in middle of a hole; need
                 * to split current hole
                 */
                temp = tcp_sackhole_insert(tp, sblkp->end,
                    cur->end, cur);
                if (temp != NULL) {
                    if (SEQ_GT(cur->rxmit, temp->rxmit)) {
                        temp->rxmit = cur->rxmit;
                        tp->sackhint.sack_bytes_rexmit
                            += (temp->rxmit
                            - temp->start);
                    }
                    cur->end = sblkp->start;
                    cur->rxmit = SEQ_MIN(cur->rxmit,
                        cur->end);
                }
            }
        }
        tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
        /*
         * Testing sblkp->start against cur->start tells us whether
         * we're done with the sack block or the sack hole.
         * Accordingly, we advance one or the other.
         */
        if (SEQ_LEQ(sblkp->start, cur->start))
            cur = TAILQ_PREV(cur, sackhole_head, scblink);
        else
            sblkp--;
    }
}
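/*
 * An illustrative hole split from the middle branch above: with hole
 * cur = [1000,4000) and incoming SACK block [2000,3000), neither
 * endpoint test matches, so tcp_sackhole_insert() creates a new hole
 * [3000,4000) after cur and cur is trimmed to [1000,2000).  One hole
 * becomes two, bracketing the newly SACKed bytes.
 */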
Example #16
int
tcp_reass(struct tcpcb *tp,struct tcpiphdr *ti, usn_mbuf_t *m)
{
	struct tcpiphdr *q;
	struct usn_socket *so = tp->t_inpcb->inp_socket;
	int flags;

	// Call with ti==0 after become established to
	// force pre-ESTABLISHED data up to user socket.
	if (ti == 0)
		goto present;

	// Find a segment which begins after this one does.
	for (q = tp->seg_next; q != (struct tcpiphdr *)tp;
	    q = (struct tcpiphdr *)q->ti_next)
		if (SEQ_GT(q->ti_seq, ti->ti_seq))
			break;

	// If there is a preceding segment, it may provide some of
	// our data already.  If so, drop the data from the incoming
	// segment.  If it provides all of our data, drop us.
	if ((struct tcpiphdr *)q->ti_prev != (struct tcpiphdr *)tp) {
		int i;
		q = (struct tcpiphdr *)q->ti_prev;
		// conversion to int (in i) handles seq wraparound
		i = q->ti_seq + q->ti_len - ti->ti_seq;
		if (i > 0) {
			if (i >= ti->ti_len) {
				g_tcpstat.tcps_rcvduppack++;
				g_tcpstat.tcps_rcvdupbyte += ti->ti_len;
				usn_free_mbuf(m);
				return (0);
			}
			m_adj(m, i);
			ti->ti_len -= i;
			ti->ti_seq += i;
		}
		q = (struct tcpiphdr *)(q->ti_next);
	}
	g_tcpstat.tcps_rcvoopack++;
	g_tcpstat.tcps_rcvoobyte += ti->ti_len;
	REASS_MBUF(ti) = m; // XXX: wrong assumption dst and src port for mbuf pointer

	// While we overlap succeeding segments trim them or,
	// if they are completely covered, dequeue them.

	while (q != (struct tcpiphdr *)tp) {
		int i = (ti->ti_seq + ti->ti_len) - q->ti_seq;
		if (i <= 0)
			break;
		if (i < q->ti_len) {
			q->ti_seq += i;
			q->ti_len -= i;
			m_adj(REASS_MBUF(q), i);
			break;
		}
		q = (struct tcpiphdr *)q->ti_next;
		m = REASS_MBUF((struct tcpiphdr *)q->ti_prev);
      // FIXME
		//remque(q->ti_prev);
		usn_free_mbuf(m);
	}

	// FIXME:  Stick new segment in its place.
	//insque(ti, q->ti_prev);

present:
	// Present data to user, advancing rcv_nxt through
	// completed sequence space.
	if (TCPS_HAVERCVDSYN(tp->t_state) == 0)
		return (0);
	ti = tp->seg_next;
	if (ti == (struct tcpiphdr *)tp || ti->ti_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len)
		return (0);
	do {
		tp->rcv_nxt += ti->ti_len;
		flags = ti->ti_flags & TH_FIN;
      // FIXME
		//remque(ti);
		m = REASS_MBUF(ti);
		ti = (struct tcpiphdr *)ti->ti_next;
		if (so->so_state & USN_CANTRCVMORE) {
			usn_free_mbuf(m);
      } else {
			sbappend(so->so_rcv, m);
      }
	} while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt);

	sorwakeup(so);
   usnet_tcpin_rwakeup(so, USN_TCP_IN, USN_TCPEV_READ, m);
	return (flags);
}
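/*
 * An illustrative trim from the "preceding segment" branch above: a
 * queued segment q covers [100,150) (ti_seq = 100, ti_len = 50) and a
 * new segment arrives with ti_seq = 130, ti_len = 40.  Then
 * i = 100 + 50 - 130 = 20, so m_adj(m, 20) drops the 20 duplicate bytes
 * and the segment becomes ti_seq = 150, ti_len = 20, exactly abutting
 * the queued data.
 */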
Example #17
/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(usn_mbuf_t *m, int iphlen)
{
	struct tcpiphdr *ti;
	struct inpcb *inp;
	u_char *optp = NULL;
	int optlen;
	int len, tlen, off;
	struct tcpcb *tp = 0;
	int tiflags;
	struct usn_socket *so = 0;
	int todrop, acked, ourfinisacked;
   int needoutput = 0;
	short ostate;
	struct usn_in_addr laddr;
	int dropsocket = 0;
	int iss = 0;
	u_long tiwin, ts_val, ts_ecr;
	int ts_present = 0;

   (void)needoutput;
	g_tcpstat.tcps_rcvtotal++;
 
	// Get IP and TCP header together in first mbuf.
	// Note: IP leaves IP header in first mbuf.
	ti = mtod(m, struct tcpiphdr *);
	if (iphlen > sizeof (usn_ip_t))
		ip_stripoptions(m, (usn_mbuf_t *)0);
	if (m->mlen < sizeof (struct tcpiphdr)) {
		if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) {
			g_tcpstat.tcps_rcvshort++;
			return;
		}
		ti = mtod(m, struct tcpiphdr *);
	}

#ifdef DUMP_PAYLOAD
   dump_chain(m,"tcp");
#endif

   /*
	 * Checksum extended TCP header and data.
    */
	tlen = ntohs(((usn_ip_t *)ti)->ip_len);
	len = sizeof (usn_ip_t) + tlen;
	ti->ti_next = ti->ti_prev = 0;
	ti->ti_x1 = 0;
	ti->ti_len = (u_short)tlen;
	HTONS(ti->ti_len);
   ti->ti_sum = in_cksum(m, len);
	if (ti->ti_sum) {
		g_tcpstat.tcps_rcvbadsum++;
		goto drop;
	}
   /*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length. XXX
    */
	off = ti->ti_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		g_tcpstat.tcps_rcvbadoff++;
		goto drop;
	}
	tlen -= off;
	ti->ti_len = tlen;
	if (off > sizeof (struct tcphdr)) {
		if (m->mlen < sizeof(usn_ip_t) + off) {
			if ((m = m_pullup(m, sizeof (usn_ip_t) + off)) == 0) {
				g_tcpstat.tcps_rcvshort++;
				return;
			}
			ti = mtod(m, struct tcpiphdr *);
		}
		optlen = off - sizeof (struct tcphdr);
		optp = mtod(m, u_char *) + sizeof (struct tcpiphdr);

      //	Do quick retrieval of timestamp options ("options
      // prediction?"). If timestamp is the only option and it's
      // formatted as recommended in RFC 1323 appendix A, we
      // quickly get the values now and not bother calling
      // tcp_dooptions(), etc.
		if ((optlen == TCPOLEN_TSTAMP_APPA ||
		     (optlen > TCPOLEN_TSTAMP_APPA &&
			optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
		     *(u_int *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
		     (ti->ti_flags & TH_SYN) == 0) {
			ts_present = 1;
			ts_val = ntohl(*(u_long *)(optp + 4));
			ts_ecr = ntohl(*(u_long *)(optp + 8));
			optp = NULL;	// we've parsed the options
		}
	}
	tiflags = ti->ti_flags;

	// Convert TCP protocol specific fields to host format.
	NTOHL(ti->ti_seq);
	NTOHL(ti->ti_ack);
	NTOHS(ti->ti_win);
	NTOHS(ti->ti_urp);

	// Locate pcb for segment.
findpcb:
	inp = g_tcp_last_inpcb;
	if (inp->inp_lport != ti->ti_dport ||
	    inp->inp_fport != ti->ti_sport ||
	    inp->inp_faddr.s_addr != ti->ti_src.s_addr ||
	    inp->inp_laddr.s_addr != ti->ti_dst.s_addr) {
		inp = in_pcblookup(&g_tcb, ti->ti_src, ti->ti_sport,
		    ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD);
		if (inp)
			g_tcp_last_inpcb = inp;
		++g_tcpstat.tcps_pcbcachemiss;
	}

	// If the state is CLOSED (i.e., TCB does not exist) then
	// all data in the incoming segment is discarded.
	// If the TCB exists but is in CLOSED state, it is embryonic,
	// but should either do a listen or a connect soon.
	if (inp == 0)
		goto dropwithreset;

	tp = intotcpcb(inp);

   DEBUG("found inp cb, laddr=%x, lport=%d, faddr=%x,"
         " fport=%d, tp_state=%d, tp_flags=%d",
         inp->inp_laddr.s_addr,
         inp->inp_lport,
         inp->inp_faddr.s_addr,
         inp->inp_fport, tp->t_state, tp->t_flags);

	if (tp == 0)
		goto dropwithreset;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;
	
	// Unscale the window into a 32-bit value. 
	if ((tiflags & TH_SYN) == 0)
		tiwin = ti->ti_win << tp->snd_scale;
	else
		tiwin = ti->ti_win;

	so = inp->inp_socket;
   DEBUG("socket info, options=%x", so->so_options);

	if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
		if (so->so_options & SO_DEBUG) {
			ostate = tp->t_state;
			g_tcp_saveti = *ti;
		}
		if (so->so_options & SO_ACCEPTCONN) {
			if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
				// Note: dropwithreset makes sure we don't
				// send a reset in response to a RST.
				if (tiflags & TH_ACK) {
					g_tcpstat.tcps_badsyn++;
					goto dropwithreset;
				}
            DEBUG("SYN is expected, tiflags=%d", tiflags);
				goto drop;
			}
			so = sonewconn(so, 0);
			if (so == 0) {
            DEBUG("failed to create new connection, tiflags=%d", tiflags);
				goto drop;
         }

			// Mark socket as temporary until we're
			// committed to keeping it.  The code at
			// ``drop'' and ``dropwithreset'' check the
			// flag dropsocket to see if the temporary
			// socket created here should be discarded.
			// We mark the socket as discardable until
			// we're committed to it below in TCPS_LISTEN.
			dropsocket++;
			inp = (struct inpcb *)so->so_pcb;
			inp->inp_laddr = ti->ti_dst;
			inp->inp_lport = ti->ti_dport;

         // BSD >= 4.3
			inp->inp_options = ip_srcroute();

			tp = intotcpcb(inp);
			tp->t_state = TCPS_LISTEN;

			// Compute proper scaling value from buffer space
			while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
			   TCP_MAXWIN << tp->request_r_scale < so->so_rcv->sb_hiwat)
				tp->request_r_scale++;
		}
	}

	// Segment received on connection.
	// Reset idle time and keep-alive timer.
	tp->t_idle = 0;
	tp->t_timer[TCPT_KEEP] = g_tcp_keepidle;

	// Process options if not in LISTEN state,
	// else do it below (after getting remote address).
	if (optp && tp->t_state != TCPS_LISTEN)
		tcp_dooptions(tp, optp, optlen, ti,
			&ts_present, &ts_val, &ts_ecr);

	// Header prediction: check for the two common cases
	// of a uni-directional data xfer.  If the packet has
	// no control flags, is in-sequence, the window didn't
	// change and we're not retransmitting, it's a
	// candidate.  If the length is zero and the ack moved
	// forward, we're the sender side of the xfer.  Just
	// free the data acked & wake any higher level process
	// that was blocked waiting for space.  If the length
	// is non-zero and the ack didn't move, we're the
	// receiver side.  If we're getting packets in-order
	// (the reassembly queue is empty), add the data to
	// the socket buffer and note that we need a delayed ack.
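	// Illustration (hypothetical numbers): with rcv_nxt=1000 and
	// snd_wnd=8760, an in-order segment seq=1000, len=0 whose ack
	// advances snd_una (with snd_cwnd >= snd_wnd) takes the pure-ACK
	// path below; the same segment with len=536 and an unchanged ack
	// takes the pure-data path instead.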
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) &&
	    ti->ti_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {
		// If last ACK falls within this segment's sequence numbers,
		// record the timestamp.
      if ( ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) &&
            SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ){
			tp->ts_recent_age = g_tcp_now;
			tp->ts_recent = ts_val;
		}

		if (ti->ti_len == 0) {
			if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
			    SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd) {
				// this is a pure ack for outstanding data.
				++g_tcpstat.tcps_predack;
				if (ts_present)
					tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1);
				else if (tp->t_rtt &&
					    SEQ_GT(ti->ti_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, tp->t_rtt);

				acked = ti->ti_ack - tp->snd_una;
				g_tcpstat.tcps_rcvackpack++;
				g_tcpstat.tcps_rcvackbyte += acked;
            TRACE("drop so_snd buffer, drop_bytes=%d, len=%d", 
                  acked, so->so_snd.sb_cc);

				sbdrop(so->so_snd, acked);
				tp->snd_una = ti->ti_ack;
				usn_free_cmbuf(m);

				// If all outstanding data are acked, stop
				// retransmit timer, otherwise restart timer
				// using current (possibly backed-off) value.
				// If process is waiting for space,
				// wakeup/selwakeup/signal.  If data
				// are ready to send, let tcp_output
				// decide between more output or persist.
				if (tp->snd_una == tp->snd_max)
					tp->t_timer[TCPT_REXMT] = 0;
				else if (tp->t_timer[TCPT_PERSIST] == 0)
					tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

	         if (so->so_options & SO_DEBUG)
             	tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0);

				//if (so->so_snd->sb_flags & SB_NOTIFY) {
            //   usnet_tcpin_wwakeup(so, USN_TCP_IN, usn_tcpev_sbnotify, 0);
				//	sowwakeup(so);
            //}

            // send buffer is available for app thread. 
            usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0);

				if (so->so_snd->sb_cc)
					tcp_output(tp);
				return;
			}
		} else if (ti->ti_ack == tp->snd_una &&
		    tp->seg_next == (struct tcpiphdr *)tp &&
		    ti->ti_len <= sbspace(so->so_rcv)) {

			// this is a pure, in-sequence data packet
			// with nothing on the reassembly queue and
			// we have enough buffer space to take it.
			++g_tcpstat.tcps_preddat;
			tp->rcv_nxt += ti->ti_len;
			g_tcpstat.tcps_rcvpack++;
			g_tcpstat.tcps_rcvbyte += ti->ti_len;

			// Drop TCP, IP headers and TCP options then add data
			// to socket buffer.
			m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
			m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);

         TRACE("add data to rcv buf");
			sbappend(so->so_rcv, m);
			sorwakeup(so);

         // new data is available for app threads.
         usnet_tcpin_rwakeup(so, USN_TCP_IN, USN_TCPEV_READ, m);

	      if (so->so_options & SO_DEBUG) {
            TRACE("tcp trace, so_options=%d", so->so_options);
          	tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0);
         }

			tp->t_flags |= TF_DELACK;
			return;
		}
	}

	// Drop TCP, IP headers and TCP options.
	m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
	m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);

	// Calculate amount of space in receive window,
	// and then do TCP input processing.
	// Receive window is amount of space in rcv queue,
	// but not less than advertised window.
   {
	   int win;
	   win = sbspace(so->so_rcv);
	   if (win < 0)
	      win = 0;
  	   tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}
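	// Worked example (hypothetical values): with only 4096 bytes free
	// in the receive buffer but rcv_adv-rcv_nxt=8192 already
	// advertised, rcv_wnd is held at 8192 so the window never appears
	// to shrink.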

	switch (tp->t_state) {
	// If the state is LISTEN then ignore segment if it contains an RST.
	// If the segment contains an ACK then it is bad and send a RST.
	// If it does not contain a SYN then it is not interesting; drop it.
	// Don't bother responding if the destination was a broadcast.
	// Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
	// tp->iss, and send a segment:
	//     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
	// Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
	// Fill in remote peer address fields if not previously specified.
	// Enter SYN_RECEIVED state, and process any other fields of this
	// segment in this state.
	case TCPS_LISTEN: {
		usn_mbuf_t *am;
		struct usn_sockaddr_in *sin;

		if (tiflags & TH_RST)
			goto drop;
		if (tiflags & TH_ACK)
			goto dropwithreset;
		if ((tiflags & TH_SYN) == 0)
			goto drop;

		// RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
		// in_broadcast() should never return true on a received
		// packet with M_BCAST not set.

		//if (m->m_flags & (M_BCAST|M_MCAST) ||
		//    IN_MULTICAST(ntohl(ti->ti_dst.s_addr)))
		//	goto drop;

		am = usn_get_mbuf(0, BUF_MSIZE, 0);	// XXX: the size!
		if (am == NULL)
			goto drop;
		am->mlen = sizeof (struct usn_sockaddr_in);
		sin = mtod(am, struct usn_sockaddr_in *);
		sin->sin_family = AF_INET;
		sin->sin_len = sizeof(*sin);
		sin->sin_addr = ti->ti_src;
		sin->sin_port = ti->ti_sport;
		bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));

		laddr = inp->inp_laddr;
		if (inp->inp_laddr.s_addr == USN_INADDR_ANY)
			inp->inp_laddr = ti->ti_dst;

		if (in_pcbconnect(inp, am)) {
			inp->inp_laddr = laddr;
			usn_free_mbuf(am);
			goto drop;
		}
		usn_free_mbuf(am);
		tp->t_template = tcp_template(tp);
		if (tp->t_template == 0) {
			tp = tcp_drop(tp, ENOBUFS);
			dropsocket = 0;		// socket is already gone
			goto drop;
		}
		if (optp)
			tcp_dooptions(tp, optp, optlen, ti,
				&ts_present, &ts_val, &ts_ecr);
		if (iss)
			tp->iss = iss;
		else
			tp->iss = g_tcp_iss;
		g_tcp_iss += TCP_ISSINCR/4;
		tp->irs = ti->ti_seq;
		tcp_sendseqinit(tp);
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
      TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d",
            tp->t_state, tp->t_flags);
		tp->t_state = TCPS_SYN_RECEIVED;

      // tcp event
      usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0);

		tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
		dropsocket = 0;		// committed to socket
		g_tcpstat.tcps_accepts++;
		goto trimthenstep6;
	}


	// If the state is SYN_SENT:
	//	if seg contains an ACK, but not for our SYN, drop the input.
	//	if seg contains a RST, then drop the connection.
	//	if seg does not contain SYN, then drop it.
	// Otherwise this is an acceptable SYN segment
	//	initialize tp->rcv_nxt and tp->irs
	//	if seg contains ack then advance tp->snd_una
	//	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	//	arrange for segment to be acked (eventually)
	//	continue processing rest of data/controls, beginning with URG
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(ti->ti_ack, tp->iss) ||
		     SEQ_GT(ti->ti_ack, tp->snd_max)))
			goto dropwithreset;
		if (tiflags & TH_RST) {
			if (tiflags & TH_ACK)
				tp = tcp_drop(tp, ECONNREFUSED);
			goto drop;
		}
		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = ti->ti_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		   tp->t_timer[TCPT_REXMT] = 0; 
		}
		
		tp->irs = ti->ti_seq;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
      TRACE("ack now, tp flags=%d", tp->t_flags);

      // XXX: remove second test.
		if (tiflags & TH_ACK /*&& SEQ_GT(tp->snd_una, tp->iss)*/) {
			g_tcpstat.tcps_connects++;
			soisconnected(so);
         TRACE("change tcp state to TCPS_ESTABLISHED,"
               " state=%d, tp_flags=%d", tp->t_state, tp->t_flags);
			tp->t_state = TCPS_ESTABLISHED;

			// Do window scaling on this connection?
			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
				tp->snd_scale = tp->requested_s_scale;
				tp->rcv_scale = tp->request_r_scale;
			}
			tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0);

			// if we didn't have to retransmit the SYN,
			// use its rtt as our initial srtt & rtt var.
			if (tp->t_rtt)
				tcp_xmit_timer(tp, tp->t_rtt);
		} else {
         TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d", 
               tp->t_state, tp->t_flags);
			tp->t_state = TCPS_SYN_RECEIVED;
         // tcp event
         usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0);
      }

trimthenstep6:

		// Advance ti->ti_seq to correspond to first data byte.
		// If data, trim to stay within window,
		// dropping FIN if necessary.
		ti->ti_seq++;
		if (ti->ti_len > tp->rcv_wnd) {
			todrop = ti->ti_len - tp->rcv_wnd;
			m_adj(m, -todrop);
			ti->ti_len = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
			g_tcpstat.tcps_rcvpackafterwin++;
			g_tcpstat.tcps_rcvbyteafterwin += todrop;
		}
		tp->snd_wl1 = ti->ti_seq - 1;
		tp->rcv_up = ti->ti_seq;
		goto step6;
	}

	// States other than LISTEN or SYN_SENT.
	// First check timestamp, if present.
	// Then check that at least some bytes of segment are within 
	// receive window.  If segment begins before rcv_nxt,
	// drop leading data (and SYN); if nothing left, just ack.
	// 
	// RFC 1323 PAWS: If we have a timestamp reply on this segment
	// and it's less than ts_recent, drop it.
	if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
	    TSTMP_LT(ts_val, tp->ts_recent)) {
		// Check to see if ts_recent is over 24 days old.
		if ((int)(g_tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
			// Invalidate ts_recent.  If this segment updates
			// ts_recent, the age will be reset later and ts_recent
			// will get a valid value.  If it does not, setting
			// ts_recent to zero will at least satisfy the
			// requirement that zero be placed in the timestamp
			// echo reply when ts_recent isn't valid.  The
			// age isn't reset until we get a valid ts_recent
			// because we don't want out-of-order segments to be
			// dropped when ts_recent is old.
			tp->ts_recent = 0;
		} else {
			g_tcpstat.tcps_rcvduppack++;
			g_tcpstat.tcps_rcvdupbyte += ti->ti_len;
			g_tcpstat.tcps_pawsdrop++;
			goto dropafterack;
		}
	}
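	// Worked example (hypothetical values): with ts_recent=5000, a
	// non-RST segment carrying ts_val=4000 is dropped after acking --
	// unless ts_recent is more than TCP_PAWS_IDLE (24 days) old, in
	// which case ts_recent is invalidated instead of dropping.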

	todrop = tp->rcv_nxt - ti->ti_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			ti->ti_seq++;
			if (ti->ti_urp > 1) 
				ti->ti_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
      if ( todrop >= ti->ti_len || 
           ( todrop == ti->ti_len && (tiflags & TH_FIN ) == 0 ) ) {
         // Any valid FIN must be to the left of the window.
         // At this point the FIN must be a duplicate or
         // out of sequence; drop it.
         tiflags &= ~TH_FIN;
         // Send an ACK to resynchronize and drop any data,
         // but keep on processing for RST or ACK.
         tp->t_flags |= TF_ACKNOW;
         TRACE("send ack now to resync, tp_flags=%d", tp->t_flags);
         todrop = ti->ti_len;
         g_tcpstat.tcps_rcvdupbyte += ti->ti_len;
         g_tcpstat.tcps_rcvduppack++;
      } else {
         g_tcpstat.tcps_rcvpartduppack++;
         g_tcpstat.tcps_rcvpartdupbyte += ti->ti_len;
      }

		m_adj(m, todrop);
		ti->ti_seq += todrop;
		ti->ti_len -= todrop;
		if (ti->ti_urp > todrop)
			ti->ti_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			ti->ti_urp = 0;
		}
	}
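	// Worked example (hypothetical values): rcv_nxt=2000, seg seq=1500,
	// len=1000 gives todrop=500, so the first 500 duplicate bytes are
	// trimmed and the segment continues as seq=2000, len=500 with the
	// urgent pointer adjusted to match.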

	// If new data are received on a connection after the
	// user processes are gone, then RST the other end.
	if ((so->so_state & USN_NOFDREF) && 
	    tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
		tp = tcp_close(tp);
		g_tcpstat.tcps_rcvafterclose++;
		goto dropwithreset;
	}


	// If segment ends after window, drop trailing data
	// (and PUSH and FIN); if nothing left, just ACK.
	todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		g_tcpstat.tcps_rcvpackafterwin++;
		if (todrop >= ti->ti_len) {
			g_tcpstat.tcps_rcvbyteafterwin += ti->ti_len;

			// If a new connection request is received
			// while in TIME_WAIT, drop the old connection
			// and start over if the sequence numbers
			// are above the previous ones.
			if (tiflags & TH_SYN &&
			    tp->t_state == TCPS_TIME_WAIT &&
			    SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
				iss = tp->snd_nxt + TCP_ISSINCR;
				tp = tcp_close(tp);
				goto findpcb;
			}

			// If window is closed can only take segments at
			// window edge, and have to drop data and PUSH from
			// incoming segments.  Continue processing, but
			// remember to ack.  Otherwise, drop segment
			// and ack.
			if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
				g_tcpstat.tcps_rcvwinprobe++;
			} else
				goto dropafterack;
		} else
			g_tcpstat.tcps_rcvbyteafterwin += todrop;
		m_adj(m, -todrop);
		ti->ti_len -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

   // Check for a valid timestamp and record it (this supersedes the
   // header-prediction check above).
   if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) &&
         SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ) {
		tp->ts_recent_age = g_tcp_now;
		tp->ts_recent = ts_val;
   }

	// If the RST bit is set examine the state:
	//    SYN_RECEIVED STATE:
	//	If passive open, return to LISTEN state.
	//	If active open, inform user that connection was refused.
	//    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
	//	Inform user that connection was reset, and close tcb.
	//    CLOSING, LAST_ACK, TIME_WAIT STATES
	//	Close the tcb.
	if (tiflags&TH_RST) switch (tp->t_state) {

	case TCPS_SYN_RECEIVED:
		so->so_error = ECONNREFUSED;
		goto close;

	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
		so->so_error = ECONNRESET;
close:
      DEBUG("change tcp state to TCPS_CLOSED, state=%d", tp->t_state);
		tp->t_state = TCPS_CLOSED;
      // tcp event
      usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSED, 0);
		g_tcpstat.tcps_drops++;
		tp = tcp_close(tp);
		goto drop;

	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
		tp = tcp_close(tp);
		goto drop;
	}

	// If a SYN is in the window, then this is an
	// error and we send an RST and drop the connection.
	if (tiflags & TH_SYN) {
		tp = tcp_drop(tp, ECONNRESET);
		goto dropwithreset;
	}

	// If the ACK bit is off we drop the segment and return.
	if ((tiflags & TH_ACK) == 0)
		goto drop;

	// Ack processing.
	switch (tp->t_state) {

	// In SYN_RECEIVED state if the ack ACKs our SYN then enter
	// ESTABLISHED state and continue processing, otherwise
	// send an RST.
	case TCPS_SYN_RECEIVED:
		if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
		    SEQ_GT(ti->ti_ack, tp->snd_max))
			goto dropwithreset;
		g_tcpstat.tcps_connects++;

      DEBUG("change tcp state to TCPS_ESTABLISHED, state=%d", tp->t_state);
		tp->t_state = TCPS_ESTABLISHED;
		soisconnected(so);

		// Do window scaling?
		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
			tp->snd_scale = tp->requested_s_scale;
			tp->rcv_scale = tp->request_r_scale;
		}
		tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0);
		tp->snd_wl1 = ti->ti_seq - 1;
		// fall into ...

	// In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	// ACKs.  If the ack is in the range
	//	tp->snd_una < ti->ti_ack <= tp->snd_max
	// then advance tp->snd_una to ti->ti_ack and drop
	// data from the retransmission queue.  If this ACK reflects
	// more up to date window information we update our window information.
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:

		if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
			if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
				g_tcpstat.tcps_rcvdupack++;
				// If we have outstanding data (other than
				// a window probe), this is a completely
				// duplicate ack (i.e., window info didn't
				// change), the ack is the biggest we've
				// seen and we've seen exactly our rexmt
				// threshold of them, assume a packet
				// has been dropped and retransmit it.
				// Kludge snd_nxt & the congestion
				// window so we send only this one
				// packet.
				//
				// We know we're losing at the current
				// window size so do congestion avoidance
				// (set ssthresh to half the current window
				// and pull our congestion window back to
				// the new ssthresh).
				//
				// Dup acks mean that packets have left the
				// network (they're now cached at the receiver) 
				// so bump cwnd by the amount in the receiver
				// to keep a constant cwnd packets in the
				// network.
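				//
				// Worked example (hypothetical values): with
				// snd_wnd=snd_cwnd=16384 and t_maxseg=1460,
				// the 3rd dup ack sets ssthresh to
				// 16384/2/1460 = 5 segments (7300 bytes),
				// retransmits one segment with cwnd=1460,
				// then re-inflates cwnd to 7300+3*1460=11680.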
				if (tp->t_timer[TCPT_REXMT] == 0 ||
				    ti->ti_ack != tp->snd_una)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == g_tcprexmtthresh) {
               // congestion avoidance
					tcp_seq onxt = tp->snd_nxt;
					u_int win =
					    min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;

					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->t_timer[TCPT_REXMT] = 0;
					tp->t_rtt = 0;
					tp->snd_nxt = ti->ti_ack;
					tp->snd_cwnd = tp->t_maxseg;
					tcp_output(tp);
					tp->snd_cwnd = tp->snd_ssthresh +
					       tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > g_tcprexmtthresh) {
					tp->snd_cwnd += tp->t_maxseg;
					tcp_output(tp);
					goto drop;
				}
			} else
				tp->t_dupacks = 0;
			break;
		}

		// If the congestion window was inflated to account
		// for the other side's cached packets, retract it.
		if (tp->t_dupacks > g_tcprexmtthresh &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
		if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
			g_tcpstat.tcps_rcvacktoomuch++;
			goto dropafterack;
		}
		acked = ti->ti_ack - tp->snd_una;
		g_tcpstat.tcps_rcvackpack++;
		g_tcpstat.tcps_rcvackbyte += acked;

		// If we have a timestamp reply, update smoothed
		// round trip time.  If no timestamp is present but
		// transmit timer is running and timed sequence
		// number was acked, update smoothed round trip time.
		// Since we now have an rtt measurement, cancel the
		// timer backoff (cf., Phil Karn's retransmit alg.).
		// Recompute the initial retransmit timer.
		if (ts_present)
			tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1);
		else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
			tcp_xmit_timer(tp,tp->t_rtt);

		// If all outstanding data is acked, stop retransmit
		// timer and remember to restart (more output or persist).
		// If there is more data to be acked, restart retransmit
		// timer, using current (possibly backed-off) value.
		if (ti->ti_ack == tp->snd_max) {
			tp->t_timer[TCPT_REXMT] = 0;
         DEBUG("change needoutput to 1");
			needoutput = 1;
         tp->t_flags |= TF_NEEDOUTPUT;
		} else if (tp->t_timer[TCPT_PERSIST] == 0)
			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

		// When new data is acked, open the congestion window.
		// If the window gives us less than ssthresh packets
		// in flight, open exponentially (maxseg per packet).
		// Otherwise open linearly: maxseg per window
		// (maxseg * (maxseg / cwnd) per packet).
		{
		   u_int cw = tp->snd_cwnd;
	   	u_int incr = tp->t_maxseg;

	   	if (cw > tp->snd_ssthresh)
	   		incr = incr * incr / cw;
   		tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
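		// Worked example (hypothetical values): with t_maxseg=1460,
		// snd_cwnd=11680 and snd_ssthresh=8760 we are above
		// ssthresh, so incr = 1460*1460/11680 = 182 bytes -- about
		// one maxseg per window rather than one per ack.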

		if (acked > so->so_snd->sb_cc) {
			tp->snd_wnd -= so->so_snd->sb_cc;
         DEBUG("drop all so_snd buffer, drop_bytes=%d, acked=%d", 
               so->so_snd->sb_cc, acked);
			sbdrop(so->so_snd, (int)so->so_snd->sb_cc);
			ourfinisacked = 1;
		} else {
         DEBUG("drop so_snd buffer, drop_bytes=%d, len=%d", acked, so->so_snd->sb_cc);
			sbdrop(so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		//if (so->so_snd->sb_flags & SB_NOTIFY) {
			sowwakeup(so);
         usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0);
      //}

		tp->snd_una = ti->ti_ack;
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		// In FIN_WAIT_1 STATE in addition to the processing
		// for the ESTABLISHED state if our FIN is now acknowledged
		// then enter FIN_WAIT_2.
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				// If we can't receive any more
				// data, then closing user can proceed.
				// Starting the timer is contrary to the
				// specification, but if we don't get a FIN
				// we'll hang forever.
				if (so->so_state & USN_CANTRCVMORE) {
					soisdisconnected(so);
					tp->t_timer[TCPT_2MSL] = g_tcp_maxidle;
				}
            DEBUG("change tcp state to TCPS_FIN_WAIT_2, state=%d", tp->t_state);
				tp->t_state = TCPS_FIN_WAIT_2;
            usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_FIN_WAIT2, 0);
			}
			break;

		// In CLOSING STATE in addition to the processing for
		// the ESTABLISHED state if the ACK acknowledges our FIN
		// then enter the TIME-WAIT state, otherwise ignore
		// the segment.
		case TCPS_CLOSING:
			if (ourfinisacked) {
            DEBUG("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state);
				tp->t_state = TCPS_TIME_WAIT;
            usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0);
				tcp_canceltimers(tp);
				tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
				soisdisconnected(so);
			}
			break;
		
		// In LAST_ACK, we may still be waiting for data to drain
		// and/or to be acked, as well as for the ack of our FIN.
		// If our FIN is now acknowledged, delete the TCB,
		// enter the closed state and return.
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
				tp = tcp_close(tp);
				goto drop;
			}
			break;


		// In TIME_WAIT state the only thing that should arrive
		// is a retransmission of the remote FIN.  Acknowledge
		// it and restart the finack timer.
		case TCPS_TIME_WAIT:
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			goto dropafterack;
		}
	}

step6:

	// Update window information.
	// Don't look at window if no ACK: TAC's send garbage on first SYN.
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, ti->ti_seq) || 
        (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) ||
	     (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) ))  )) {
		// keep track of pure window updates
		if (ti->ti_len == 0 &&
		    tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd)
			g_tcpstat.tcps_rcvwinupd++;
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = ti->ti_seq;
		tp->snd_wl2 = ti->ti_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
      DEBUG("change needoutput to 1");
      tp->t_flags |= TF_NEEDOUTPUT;
		needoutput = 1;
	}

	
	// Process segments with URG.
	if ((tiflags & TH_URG) && ti->ti_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {

		// This is a kludge, but if we receive and accept
		// random urgent pointers, we'll crash in
		// soreceive.  It's hard to imagine someone
		// actually wanting to send this much urgent data.
		if (ti->ti_urp + so->so_rcv->sb_cc > g_sb_max) {
			ti->ti_urp = 0;			// XXX
			tiflags &= ~TH_URG;		// XXX
			goto dodata;			// XXX
		}

		// If this segment advances the known urgent pointer,
		// then mark the data stream.  This should not happen
		// in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		// a FIN has been received from the remote side. 
		// In these states we ignore the URG.
		//
		// According to RFC961 (Assigned Protocols),
		// the urgent pointer points to the last octet
		// of urgent data.  We continue, however,
		// to consider it to indicate the first octet
		// of data past the urgent section as the original 
		// spec states (in one of two places).
		if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
			tp->rcv_up = ti->ti_seq + ti->ti_urp;
			so->so_oobmark = so->so_rcv->sb_cc +
			    (tp->rcv_up - tp->rcv_nxt) - 1;
			if (so->so_oobmark == 0)
				so->so_state |= USN_RCVATMARK;
			sohasoutofband(so);
         // send async event to app threads.
         usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPEV_OUTOFBOUND, 0);
			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
		}

		// Remove out-of-band data so it doesn't get presented to the user.
		// This can happen independent of advancing the URG pointer,
		// but if two URG's are pending at once, some out-of-band
		// data may creep in... ick.
		if (ti->ti_urp <= ti->ti_len
#ifdef SO_OOBINLINE
		     && (so->so_options & SO_OOBINLINE) == 0
#endif
		     )
			tcp_pulloutofband(so, ti, m);
	} else
		// If no out of band data is expected,
		// pull receive urgent pointer along
		// with the receive window.
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:							// XXX
#ifdef DUMP_PAYLOAD
   DEBUG("Handle data");
   dump_chain(m,"tcp");
#endif

	// Process the segment text, merging it into the TCP sequencing queue,
	// and arranging for acknowledgment of receipt if necessary.
	// This process logically involves adjusting tp->rcv_wnd as data
	// is presented to the user (this happens in tcp_usrreq.c,
	// case PRU_RCVD).  If a FIN has already been received on this
	// connection then we just ignore the text.
	if ((ti->ti_len || (tiflags&TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		TCP_REASS(tp, ti, m, so, tiflags);
		// Note the amount of data that peer has sent into
		// our window, in order to estimate the sender's
		// buffer size.
		len = so->so_rcv->sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
	} else {
		usn_free_cmbuf(m);
		tiflags &= ~TH_FIN;
	}

	// If FIN is received ACK the FIN and let the user know
	// that the connection is closing.
	if (tiflags & TH_FIN) {
		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
			socantrcvmore(so);
			tp->t_flags |= TF_ACKNOW;
         TRACE("ack FIN now, tp flags=%d", tp->t_flags);
			tp->rcv_nxt++;
		}
		switch (tp->t_state) {

		// In SYN_RECEIVED and ESTABLISHED STATES
		// enter the CLOSE_WAIT state.
		case TCPS_SYN_RECEIVED:
		case TCPS_ESTABLISHED:
         TRACE("change tcp state to TCPS_CLOSE_WAIT, state=%d", tp->t_state);
			tp->t_state = TCPS_CLOSE_WAIT;
         soewakeup(so, 0);
         usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSE_WAIT, 0);
			break;

		// If still in FIN_WAIT_1 STATE FIN has not been acked so
		// enter the CLOSING state.
		case TCPS_FIN_WAIT_1:
         TRACE("change tcp state to TCPS_CLOSING, state=%d", tp->t_state);
			tp->t_state = TCPS_CLOSING;
         usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSING, 0);
			break;

		// In FIN_WAIT_2 state enter the TIME_WAIT state,
		// starting the time-wait timer, turning off the other 
		// standard timers.
		case TCPS_FIN_WAIT_2:
         TRACE("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state);
			tp->t_state = TCPS_TIME_WAIT;
			tcp_canceltimers(tp);
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			soisdisconnected(so);
         usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0);
			break;

		// In TIME_WAIT state restart the 2 MSL time_wait timer.
		case TCPS_TIME_WAIT:
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			break;
		}
	}
	if (so->so_options & SO_DEBUG) {
      TRACE("tcp trace, so_options=%d", so->so_options);
		tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0);
   }

	// Return any desired output.
	//if (needoutput || (tp->t_flags & TF_ACKNOW)){
	if (tp->t_flags & TF_NEEDOUTPUT || (tp->t_flags & TF_ACKNOW)){
      TRACE("ack now or need to ouput, tp->t_flags=%d", tp->t_flags);
		tcp_output(tp);
   }
	return;

dropafterack:
   TRACE("dropafterack");
	// Generate an ACK dropping incoming segment if it occupies
	// sequence space, where the ACK reflects our state.
	if (tiflags & TH_RST)
		goto drop;
	usn_free_cmbuf(m);
	tp->t_flags |= TF_ACKNOW;
   TRACE("ack now, tp flags=%d", tp->t_flags);
	tcp_output(tp);
	return;

dropwithreset:
   TRACE("dropwithreset");
	// Generate a RST, dropping incoming segment.
	// Make ACK acceptable to originator of segment.
	// Don't bother to respond if destination was broadcast/multicast.
#define USN_MULTICAST(i) (((u_int)(i) & 0xf0000000) == 0xe0000000)
	if ((tiflags & TH_RST) || m->flags & (BUF_BCAST|BUF_MCAST) ||
	    USN_MULTICAST(ntohl(ti->ti_dst.s_addr)))
		goto drop;
   
	if (tiflags & TH_ACK)
		tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
	else {
		if (tiflags & TH_SYN)
			ti->ti_len++;
		tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
		    TH_RST|TH_ACK);
	}
	// destroy temporarily created socket
	if (dropsocket)
		soabort(so);
	return;

drop:
   TRACE("drop");
	// Drop space held by incoming segment and return.
	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
      TRACE("tcp trace: drop a socket");
		tcp_trace(TA_DROP, ostate, tp, &g_tcp_saveti, 0);
   }
	usn_free_cmbuf(m);
	// destroy temporarily created socket
	if (dropsocket)
		soabort(so);
	return;
}
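
/*
 * A minimal standalone sketch (not part of the stack above) of why the
 * SEQ_LT/SEQ_GT comparisons used throughout tcp_input() are done with
 * signed modular arithmetic: plain '<' misorders sequence numbers once
 * they wrap around 2^32.  The macro bodies below assume the classic
 * BSD definitions.
 */
#include <stdint.h>
#include <stdio.h>

#define SEQ_LT(a,b)	((int32_t)((a)-(b)) < 0)
#define SEQ_GT(a,b)	((int32_t)((a)-(b)) > 0)

int main(void)
{
	uint32_t snd_una = 0xfffffff0u;	/* just below the wrap point */
	uint32_t ti_ack  = 0x00000010u;	/* 32 bytes later, after the wrap */

	/* A plain comparison gets the order wrong after wraparound... */
	printf("plain:  ack newer? %d\n", ti_ack > snd_una);		/* 0 */
	/* ...while the modular macro still sees the ack as newer. */
	printf("SEQ_GT: ack newer? %d\n", SEQ_GT(ti_ack, snd_una));	/* 1 */
	return 0;
}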
Beispiel #18
0
/*
 * Insert a new blk into the array of SACK blks on the receiver.
 *
 * Parameters:
 *	sack_blk_t *head: pointer to the array of SACK blks.
 *	tcp_seq begin: starting seq num of the new blk.
 *	tcp_seq end: ending seq num of the new blk.
 *	int32_t *num: (referenced) total num of SACK blks on the list.
 */
void
tcp_sack_insert(sack_blk_t *head, tcp_seq begin, tcp_seq end, int32_t *num)
{
	int32_t	i, j, old_num, new_num;
	sack_blk_t tmp[MAX_SACK_BLK - 1];

	/* The array is empty, just add the new one. */
	if (*num == 0) {
		head[0].begin = begin;
		head[0].end = end;
		*num = 1;
		return;
	}

	/*
	 * Check for overlap.  There are five cases.
	 *
	 * 1. there is no overlap with any other SACK blks.
	 * 2. new SACK blk is completely contained in another blk.
	 * 3. tail part of new SACK blk overlaps with another blk.
	 * 4. head part of new SACK blk overlaps with another blk.
	 * 5. new SACK blk completely contains another blk.
	 *
	 * Use tmp to hold old SACK blks.  After the loop, copy them back
	 * to head.
	 */
	old_num = *num;
	if (old_num > MAX_SACK_BLK - 1) {
		old_num = MAX_SACK_BLK - 1;
	}
	new_num = old_num;
	j = 0;
	for (i = 0; i < old_num; i++) {
		if (SEQ_LT(end, head[i].begin) || SEQ_GT(begin, head[i].end)) {
			/* Case 1: continue to check. */
			tmp[j].begin = head[i].begin;
			tmp[j].end = head[i].end;
			j++;
			continue;
		} else if (SEQ_GEQ(begin, head[i].begin) &&
		    SEQ_LEQ(end, head[i].end)) {
			/* Case 2: re-insert the old blk to the head. */
			begin = head[i].begin;
			end = head[i].end;
		} else if (SEQ_LEQ(end, head[i].end) &&
		    SEQ_GEQ(end, head[i].begin)) {
			/*
			 * Case 3: Extend the new blk, remove the old one
			 * and continue to check.
			 */
			end = head[i].end;
		} else if (SEQ_GEQ(begin, head[i].begin) &&
		    SEQ_LEQ(begin, head[i].end)) {
			/* Case 4 */
			begin = head[i].begin;
		}
		/*
		 * Common code for all cases except the first one, which
		 * copies the original SACK blk into the tmp storage.  Other
		 * cases remove the original SACK blk by not copying into
		 * tmp storage.
		 */
		new_num--;
	}

	head[0].begin = begin;
	head[0].end = end;
	for (i = 0; i < new_num; i++) {
		head[i+1].begin = tmp[i].begin;
		head[i+1].end = tmp[i].end;
	}
	*num = new_num + 1;
}
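
/*
 * A small usage sketch for tcp_sack_insert() above.  The sack_blk_t
 * layout and MAX_SACK_BLK value here are hypothetical stand-ins for the
 * real headers, just enough to exercise one merge case: because new
 * blks always land at head[0], inserting [100,200) and then the
 * overlapping [150,300) should leave a single merged blk [100,300).
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;
typedef struct { tcp_seq begin, end; } sack_blk_t;	/* assumed layout */
#define MAX_SACK_BLK 4					/* assumed value */

void tcp_sack_insert(sack_blk_t *, tcp_seq, tcp_seq, int32_t *);

int main(void)
{
	sack_blk_t head[MAX_SACK_BLK];
	int32_t num = 0;

	tcp_sack_insert(head, 100, 200, &num);	/* first blk */
	tcp_sack_insert(head, 150, 300, &num);	/* overlaps the first */
	/* Expect: num=1 begin=100 end=300 */
	printf("num=%d begin=%u end=%u\n", (int)num,
	    (unsigned)head[0].begin, (unsigned)head[0].end);
	return 0;
}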
Beispiel #19
0
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct mbuf *mq, *mp;
	int flags, wakeup;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/*
	 * XXX: tcp_reass() is rather inefficient with its data structures
	 * and should be rewritten (see NetBSD for optimizations).
	 */

	/*
	 * Call with th==NULL after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == NULL)
		goto present;

	M_ASSERTPKTHDR(m);
	KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__,
	    *tlenp, m->m_pkthdr.len));

	/*
	 * Limit the number of segments that can be queued to reduce the
	 * potential for mbuf exhaustion. For best performance, we want to be
	 * able to queue a full window's worth of segments. The size of the
	 * socket receive buffer determines our advertised window and grows
	 * automatically when socket buffer autotuning is enabled. Use it as the
	 * basis for our queue limit.
	 * Always let through the missing segment that caused this queue.
	 * NB: Access to the socket buffer is left intentionally unlocked as we
	 * can tolerate stale information here.
	 */
	if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
	    tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) {
		char *s;

		TCPSTAT_INC(tcps_rcvreassfull);
		*tlenp = 0;
		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
		    NULL))) {
			log(LOG_DEBUG, "%s; %s: queue limit reached, "
			    "segment dropped\n", s, __func__);
			free(s, M_TCPLOG);
		}
		m_freem(m);
		return (0);
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	mp = NULL;
	for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) {
		if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq))
			break;
		mp = mq;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (mp != NULL) {
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq;
		if (i > 0) {
			if (i >= *tlenp) {
				TCPSTAT_INC(tcps_rcvduppack);
				TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
				m_freem(m);
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;	/* ??? */
			}
			m_adj(m, i);
			*tlenp -= i;
			th->th_seq += i;
		}
	}
	tp->t_rcvoopack++;
	TCPSTAT_INC(tcps_rcvoopack);
	TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (mq) {
		struct mbuf *nq;
		int i;

		i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq;
		if (i <= 0)
			break;
		if (i < mq->m_pkthdr.len) {
			M_TCPHDR(mq)->th_seq += i;
			m_adj(mq, i);
			tp->t_segqlen -= i;
			break;
		}

		nq = mq->m_nextpkt;
		tp->t_segqlen -= mq->m_pkthdr.len;
		m_freem(mq);
		if (mp)
			mp->m_nextpkt = nq;
		else
			tp->t_segq = nq;
		mq = nq;
	}

	/*
	 * Insert the new segment queue entry into place.  Try to collapse
	 * mbuf chains if segments are adjacent.
	 */
	if (mp) {
		if (M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len == th->th_seq)
			m_catpkt(mp, m);
		else {
			m->m_nextpkt = mp->m_nextpkt;
			mp->m_nextpkt = m;
			m->m_pkthdr.pkt_tcphdr = th;
		}
	} else {
		mq = tp->t_segq;
		tp->t_segq = m;
		if (mq && th->th_seq + *tlenp == M_TCPHDR(mq)->th_seq) {
			m->m_nextpkt = mq->m_nextpkt;
			mq->m_nextpkt = NULL;
			m_catpkt(m, mq);
		} else
			m->m_nextpkt = mq;
		m->m_pkthdr.pkt_tcphdr = th;
	}
	tp->t_segqlen += *tlenp;

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);

	flags = 0;
	wakeup = 0;
	SOCKBUF_LOCK(&so->so_rcv);
	while ((mq = tp->t_segq) != NULL &&
	    M_TCPHDR(mq)->th_seq == tp->rcv_nxt) {
		tp->t_segq = mq->m_nextpkt;

		tp->rcv_nxt += mq->m_pkthdr.len;
		tp->t_segqlen -= mq->m_pkthdr.len;
		flags = M_TCPHDR(mq)->th_flags & TH_FIN;

		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			m_freem(mq);
		else {
			mq->m_nextpkt = NULL;
			sbappendstream_locked(&so->so_rcv, mq, 0);
			wakeup = 1;
		}
	}
	ND6_HINT(tp);
	if (wakeup)
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
	return (flags);
}
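
/*
 * A minimal sketch (hypothetical values) of the overlap arithmetic
 * tcp_reass() relies on: the number of bytes by which a queued segment
 * already covers an incoming one is computed as a plain int, so
 * sequence-number wraparound falls out of two's-complement subtraction.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t q_seq  = 0xffffff00u;	/* queued segment, pre-wrap */
	uint32_t q_len  = 512;		/* it crosses the wrap point */
	uint32_t th_seq = 0x00000080u;	/* incoming segment, post-wrap */

	/* i > 0 means the queue already holds the first i bytes of us. */
	int i = (int)(q_seq + q_len - th_seq);
	printf("overlap i = %d bytes\n", i);	/* prints 128 */
	return 0;
}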
Beispiel #20
0
static int
tcp_reass(register struct tcpcb *tp, register struct tcpiphdr *ti,
          struct mbuf *m)
{
	register struct tcpiphdr *q;
	struct socket *so = tp->t_socket;
	int flags;

	/*
	 * Call with ti==NULL after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
        if (ti == NULL)
		goto present;

	/*
	 * Find a segment which begins after this one does.
	 */
	for (q = tcpfrag_list_first(tp); !tcpfrag_list_end(q, tp);
            q = tcpiphdr_next(q))
		if (SEQ_GT(q->ti_seq, ti->ti_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (!tcpfrag_list_end(tcpiphdr_prev(q), tp)) {
		register int i;
		q = tcpiphdr_prev(q);
		/* conversion to int (in i) handles seq wraparound */
		i = q->ti_seq + q->ti_len - ti->ti_seq;
		if (i > 0) {
			if (i >= ti->ti_len) {
				m_free(m);
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;   /* ??? */
			}
			m_adj(m, i);
			ti->ti_len -= i;
			ti->ti_seq += i;
		}
		q = tcpiphdr_next(q);
	}
	ti->ti_mbuf = m;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (!tcpfrag_list_end(q, tp)) {
		register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq;
		if (i <= 0)
			break;
		if (i < q->ti_len) {
			q->ti_seq += i;
			q->ti_len -= i;
			m_adj(q->ti_mbuf, i);
			break;
		}
		q = tcpiphdr_next(q);
		m = tcpiphdr_prev(q)->ti_mbuf;
		remque(tcpiphdr2qlink(tcpiphdr_prev(q)));
		m_free(m);
	}

	/*
	 * Stick new segment in its place.
	 */
	insque(tcpiphdr2qlink(ti), tcpiphdr2qlink(tcpiphdr_prev(q)));

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);
	ti = tcpfrag_list_first(tp);
	if (tcpfrag_list_end(ti, tp) || ti->ti_seq != tp->rcv_nxt)
		return (0);
	if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len)
		return (0);
	do {
		tp->rcv_nxt += ti->ti_len;
		flags = ti->ti_flags & TH_FIN;
		remque(tcpiphdr2qlink(ti));
		m = ti->ti_mbuf;
		ti = tcpiphdr_next(ti);
		if (so->so_state & SS_FCANTSENDMORE)
			m_free(m);
		else {
			if (so->so_emu) {
				if (tcp_emu(so,m)) sbappend(so, m);
			} else
				sbappend(so, m);
		}
	} while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt);
	return (flags);
}
Beispiel #21
0
static int tcpup_state_send(struct tcpup_info *upp, struct tcpiphdr *tcp, size_t dlen)
{
	int xflags = 0;

	if (tcp->th_flags & TH_RST) {
		upp->t_state = TCPS_CLOSED;
		return 0;
	}

	if (upp->x_state != upp->t_state
			&& (tcp->th_flags & TH_ACK)
			&& SEQ_GEQ(htonl(tcp->th_ack), upp->rcv_una)) {
		tcp_state_update(upp, upp->x_state);
		upp->t_state = upp->x_state;
	}

	switch (upp->t_state) {
		case TCPS_CLOSED:
			xflags = TH_SYN| TH_ACK;
			if ((tcp->th_flags & xflags) == TH_SYN) {
				upp->snd_nxt = htonl(tcp->th_seq) + 1;
				upp->snd_max = htonl(tcp->th_seq) + 1;
				upp->snd_una = htonl(tcp->th_seq) + 1;
				tcp_state_update(upp, TCPS_SYN_SENT);
				return 0;
			}
			break;

		case TCPS_SYN_RECEIVED:
			assert((tcp->th_flags & TH_FIN) != TH_FIN);
			xflags = TH_SYN| TH_ACK;
			if ((tcp->th_flags & xflags) == TH_ACK
					&& SEQ_GT(htonl(tcp->th_seq), upp->snd_nxt)) {
				tcp_state_update(upp, upp->x_state);
				return 0;
			}
			break;

		case TCPS_ESTABLISHED:
			if ((tcp->th_flags & TH_FIN) == TH_FIN) {
				upp->snd_nxt = htonl(tcp->th_seq) + dlen + 1;
				tcp_state_update(upp, TCPS_FIN_WAIT_1);
				return 0;
			}
			break;

		case TCPS_CLOSE_WAIT:
			if ((tcp->th_flags & TH_FIN) == TH_FIN) {
				upp->snd_nxt = htonl(tcp->th_seq) + dlen + 1;
				tcp_state_update(upp, TCPS_LAST_ACK);
				return 0;
			}
			break;

		case TCPS_FIN_WAIT_1:
			xflags = TH_FIN| TH_ACK;
			if ((tcp->th_flags & xflags) == TH_ACK) {
				tcp_state_update(upp, upp->x_state);
				return 0;
			}
			break;
	}

	if (dlen > 0) {
		upp->snd_nxt = htonl(tcp->th_seq) + dlen;
		if (SEQ_GT(upp->snd_nxt, upp->snd_max)) {
			/* update snd max to nxt */
			upp->snd_max = upp->snd_nxt;
		}
	}

	return 0;
}
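
/*
 * A tiny sketch (hypothetical values) of the snd_nxt/snd_max
 * bookkeeping at the end of tcpup_state_send(): snd_nxt follows the
 * last byte sent, while snd_max only ratchets forward, so a
 * retransmission moves snd_nxt back without disturbing snd_max.
 */
#include <stdint.h>
#include <stdio.h>

#define SEQ_GT(a,b)	((int32_t)((a)-(b)) > 0)

int main(void)
{
	uint32_t snd_nxt = 1000, snd_max = 1000;

	snd_nxt = 1000 + 500;			/* new data */
	if (SEQ_GT(snd_nxt, snd_max))
		snd_max = snd_nxt;

	snd_nxt = 1100 + 200;			/* retransmission */
	if (SEQ_GT(snd_nxt, snd_max))
		snd_max = snd_nxt;

	printf("snd_nxt=%u snd_max=%u\n",	/* 1300 1500 */
	    (unsigned)snd_nxt, (unsigned)snd_max);
	return 0;
}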
Beispiel #22
0
enum CT_UPDATE_RES
OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_,
                           const TCPHdr *tcp,
                           PNET_BUFFER_LIST nbl,
                           BOOLEAN reply,
                           UINT64 now)
{
    struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_);
    /* The peer that sent 'pkt' */
    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
    /* The peer that should receive 'pkt' */
    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
    uint8_t sws = 0, dws = 0;
    UINT16 tcp_flags = ntohs(tcp->flags);
    uint16_t win = ntohs(tcp->window);
    uint32_t ack, end, seq, orig_seq;
    uint32_t p_len = OvsGetTcpPayloadLength(nbl);
    int ackskew;

    if (OvsCtInvalidTcpFlags(tcp_flags)) {
        return CT_UPDATE_INVALID;
    }

    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN)
            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
            && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
        src->state = dst->state = CT_DPIF_TCPS_CLOSED;
        return CT_UPDATE_NEW;
    }

    if (src->wscale & CT_WSCALE_FLAG
        && dst->wscale & CT_WSCALE_FLAG
        && !(tcp_flags & TCP_SYN)) {

        sws = src->wscale & CT_WSCALE_MASK;
        dws = dst->wscale & CT_WSCALE_MASK;

    } else if (src->wscale & CT_WSCALE_UNKNOWN
        && dst->wscale & CT_WSCALE_UNKNOWN
        && !(tcp_flags & TCP_SYN)) {

        sws = TCP_MAX_WSCALE;
        dws = TCP_MAX_WSCALE;
    }

    /*
     * Sequence tracking algorithm from Guido van Rooij's paper:
     *   http://www.madison-gurkha.com/publications/tcp_filtering/
     *      tcp_filtering.ps
     */

    orig_seq = seq = ntohl(tcp->seq);
    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
        /* First packet from this end. Set its state */

        ack = ntohl(tcp->ack_seq);

        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
            if (dst->wscale & CT_WSCALE_FLAG) {
                src->wscale = OvsTcpGetWscale(tcp);
                if (src->wscale & CT_WSCALE_FLAG) {
                    /* Remove scale factor from initial window */
                    sws = src->wscale & CT_WSCALE_MASK;
                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
                    dws = dst->wscale & CT_WSCALE_MASK;
                } else {
                    /* fixup other window */
                    dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
                    /* in case of a retrans SYN|ACK */
                    dst->wscale = 0;
                }
            }
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }

        src->seqlo = seq;
        src->state = CT_DPIF_TCPS_SYN_SENT;
        /*
         * May need to slide the window (seqhi may have been set by
         * the crappy stack check or if we picked up the connection
         * after establishment)
         */
        if (src->seqhi == 1 ||
                SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
            src->seqhi = end + MAX(1, dst->max_win << dws);
        }
        if (win > src->max_win) {
            src->max_win = win;
        }

    } else {
        ack = ntohl(tcp->ack_seq);
        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }
    }

    if ((tcp_flags & TCP_ACK) == 0) {
        /* Let it pass through the ack skew check */
        ack = dst->seqlo;
    } else if ((ack == 0
                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
               /* broken tcp stacks do not set ack */) {
        /* Many stacks (ours included) will set the ACK number in an
         * FIN|ACK if the SYN times out -- no sequence to ACK. */
        ack = dst->seqlo;
    }

    if (seq == end) {
        /* Ease sequencing restrictions on no data packets */
        seq = src->seqlo;
        end = seq;
    }

    ackskew = dst->seqlo - ack;
#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
    if (SEQ_GEQ(src->seqhi, end)
        /* Last octet inside other's window space */
        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
        /* Retrans: not more than one window back */
        && (ackskew >= -MAXACKWINDOW)
        /* Acking not more than one reassembled fragment backwards */
        && (ackskew <= (MAXACKWINDOW << sws))
        /* Acking not more than one window forward */
        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
            || (orig_seq == src->seqlo + 1)
            || (orig_seq + 1 == src->seqlo))) {
        /* Require an exact/+1 sequence match on resets when possible */

        /* update max window */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* synchronize sequencing */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* slide the window of what the other end can send */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /* update states */
        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
                src->state = CT_DPIF_TCPS_SYN_SENT;
        }
        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
                src->state = CT_DPIF_TCPS_CLOSING;
        }
        if (tcp_flags & TCP_ACK) {
            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
                dst->state = CT_DPIF_TCPS_ESTABLISHED;
            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
            }
        }
        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }

        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
            OvsConntrackUpdateExpiration(conn, now, 30 * CT_INTERVAL_SEC);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
            OvsConntrackUpdateExpiration(conn, now, 45 * CT_INTERVAL_SEC);
        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
            OvsConntrackUpdateExpiration(conn, now, 30 * CT_INTERVAL_SEC);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
            OvsConntrackUpdateExpiration(conn, now, 15 * 60 * CT_INTERVAL_SEC);
        } else {
            OvsConntrackUpdateExpiration(conn, now, 24 * 60 * 60 * CT_INTERVAL_SEC);
        }
    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
               /* Within a window forward of the originating packet */
               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
               /* Within a window backward of the originating packet */

        /*
         * This currently handles three situations:
         *  1) Stupid stacks will shotgun SYNs before their peer
         *     replies.
         *  2) When PF catches an already established stream (the
         *     firewall rebooted, the state table was flushed, routes
         *     changed...)
         *  3) Packets get funky immediately after the connection
         *     closes (this should catch Solaris spurious ACK|FINs
         *     that web servers like to spew after a close)
         *
         * This must be a little more careful than the above code
         * since packet floods will also be caught here. We don't
         * update the TTL here to mitigate the damage of a packet
         * flood and so the same code can handle awkward establishment
         * and a loosened connection close.
         * In the establishment case, a correct peer response will
         * validate the connection, go through the normal state code
         * and keep updating the state TTL.
         */

        /* update max window */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* synchronize sequencing */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* slide the window of what the other end can send */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /*
         * Cannot set dst->seqhi here since this could be a shotgunned
         * SYN and not an already established connection.
         */

        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
            src->state = CT_DPIF_TCPS_CLOSING;
        }

        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }
    } else {
        return CT_UPDATE_INVALID;
    }

    return CT_UPDATE_VALID;
}
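
/*
 * A worked sketch (hypothetical numbers, sws simplified to 0) of the
 * van Rooij acceptability test used above: the segment's last octet
 * must sit inside the peer's advertised space and the ack skew must
 * stay within MAXACKWINDOW in both directions.
 */
#include <stdint.h>
#include <stdio.h>

#define SEQ_GEQ(a,b)	((int32_t)((a)-(b)) >= 0)
#define MAXACKWINDOW	(0xffff + 1500)

int main(void)
{
	uint32_t src_seqhi = 200000, src_seqlo = 100000, dst_seqlo = 50000;
	uint32_t seq = 100000, ack = 49000, p_len = 1460;
	uint32_t dst_max_win = 8192;
	int dws = 2;

	uint32_t end = seq + p_len;
	int ackskew = (int)(dst_seqlo - ack);

	int ok = SEQ_GEQ(src_seqhi, end)
	    && SEQ_GEQ(seq, src_seqlo - (dst_max_win << dws))
	    && ackskew >= -MAXACKWINDOW
	    && ackskew <= MAXACKWINDOW;
	printf("acceptable=%d ackskew=%d\n", ok, ackskew);	/* 1 1000 */
	return 0;
}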
Beispiel #23
0
/*
 * Returns 1 if the TIME_WAIT state was killed and we should start over,
 * looking for a pcb in the listen state.  Returns 0 otherwise.
 */
int
tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
    struct mbuf *m, int tlen)
{
    struct tcptw *tw;
    int thflags;
    tcp_seq seq;

    /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
//ScenSim-Port//    INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
//ScenSim-Port//    INP_WLOCK_ASSERT(inp);

    /*
     * XXXRW: Time wait state for inpcb has been recycled, but inpcb is
     * still present.  This is undesirable, but temporarily necessary
     * until we work out how to handle inpcbs whose timewait state has
     * been removed.
     */
    tw = intotw(inp);
    if (tw == NULL)
        goto drop;

    thflags = th->th_flags;

    /*
     * NOTE: for FIN_WAIT_2 (to be added later),
     * must validate sequence number before accepting RST
     */

    /*
     * If the segment contains RST:
     *  Drop the segment - see Stevens, vol. 2, p. 964 and
     *      RFC 1337.
     */
    if (thflags & TH_RST)
        goto drop;

//ScenSim-Port//#if 0
//ScenSim-Port///* PAWS not needed at the moment */
//ScenSim-Port//    /*
//ScenSim-Port//     * RFC 1323 PAWS: If we have a timestamp reply on this segment
//ScenSim-Port//     * and it's less than ts_recent, drop it.
//ScenSim-Port//     */
//ScenSim-Port//    if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
//ScenSim-Port//        TSTMP_LT(to.to_tsval, tp->ts_recent)) {
//ScenSim-Port//        if ((thflags & TH_ACK) == 0)
//ScenSim-Port//            goto drop;
//ScenSim-Port//        goto ack;
//ScenSim-Port//    }
//ScenSim-Port//    /*
//ScenSim-Port//     * ts_recent is never updated because we never accept new segments.
//ScenSim-Port//     */
//ScenSim-Port//#endif

    /*
     * If a new connection request is received
     * while in TIME_WAIT, drop the old connection
     * and start over if the sequence numbers
     * are above the previous ones.
     */
    if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
        tcp_twclose(tw, 0);
        return (1);
    }

    /*
     * Drop the segment if it does not contain an ACK.
     */
    if ((thflags & TH_ACK) == 0)
        goto drop;

    /*
     * Reset the 2MSL timer if this is a duplicate FIN.
     */
    if (thflags & TH_FIN) {
        seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
        if (seq + 1 == tw->rcv_nxt)
            tcp_tw_2msl_reset(tw, 1);
    }

    /*
     * Acknowledge the segment if it has data or is not a duplicate ACK.
     */
    if (thflags != TH_ACK || tlen != 0 ||
        th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt)
        tcp_twrespond(tw, TH_ACK);
drop:
//ScenSim-Port//    INP_WUNLOCK(inp);
    m_freem(m);
    return (0);
}
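
/*
 * A small sketch (hypothetical values) of the duplicate-FIN test in
 * tcp_twcheck(): the FIN occupies one sequence number after the data
 * (and after a SYN, if any), so a retransmitted FIN whose seq+len+1
 * equals rcv_nxt restarts the 2MSL timer.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t rcv_nxt = 5001;	/* we already ACKed the peer's FIN */
	uint32_t th_seq  = 4001;	/* retransmitted segment */
	int tlen = 999, syn = 0;

	uint32_t seq = th_seq + (uint32_t)tlen + (syn ? 1 : 0);
	if (seq + 1 == rcv_nxt)
		printf("duplicate FIN: restart the 2MSL timer\n");
	return 0;
}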
Beispiel #24
0
/* Receive packet */
void
l2tp_ctrl_input(l2tpd *_this, int listener_index, struct sockaddr *peer,
    struct sockaddr *sock, void *nat_t_ctx, u_char *pkt, int pktlen)
{
	int i, len, offsiz, reqlen, is_ctrl;
	uint16_t mestype;
	struct l2tp_avp *avp, *avp0;
	l2tp_ctrl *ctrl;
	l2tp_call *call;
	char buf[L2TP_AVP_MAXSIZ], errmsg[256];
	time_t curr_time;
	u_char *pkt0;
	struct l2tp_header hdr;
	char hbuf[NI_MAXHOST + NI_MAXSERV + 16];

	ctrl = NULL;
	curr_time = get_monosec();
	pkt0 = pkt;

	L2TP_CTRL_ASSERT(peer->sa_family == sock->sa_family);
	L2TP_CTRL_ASSERT(peer->sa_family == AF_INET ||
	    peer->sa_family == AF_INET6);
    /*
     * Parse L2TP Header
     */
	memset(&hdr, 0, sizeof(hdr));
	if (pktlen < 2) {
		snprintf(errmsg, sizeof(errmsg), "a short packet.  "
		    "length=%d", pktlen);
		goto bad_packet;
	}
	memcpy(&hdr, pkt, 2);
	pkt += 2;
	if (hdr.ver != L2TP_HEADER_VERSION_RFC2661) {
		/* XXX: only RFC2661 is supported */
		snprintf(errmsg, sizeof(errmsg),
		    "Unsupported version at header = %d", hdr.ver);
		goto bad_packet;
	}
	is_ctrl = (hdr.t != 0)? 1 : 0;

	/* calc required length */
	reqlen = 6;		/* for Flags, Tunnel-Id, Session-Id field */
	if (hdr.l) reqlen += 2;	/* for Length field (opt) */
	if (hdr.s) reqlen += 4;	/* for Ns, Nr field (opt) */
	if (hdr.o) reqlen += 2;	/* for Offset Size field (opt) */
	if (reqlen > pktlen) {
		snprintf(errmsg, sizeof(errmsg),
		    "a short packet. length=%d", pktlen);
		goto bad_packet;
	}

	if (hdr.l != 0) {
		GETSHORT(hdr.length, pkt);
		if (hdr.length > pktlen) {
			snprintf(errmsg, sizeof(errmsg),
			    "Actual packet size is smaller than the length "
			    "field %d < %d", pktlen, hdr.length);
			goto bad_packet;
		}
		pktlen = hdr.length;	/* remove trailing trash */
	}
	GETSHORT(hdr.tunnel_id, pkt);
	GETSHORT(hdr.session_id, pkt);
	if (hdr.s != 0) {
		GETSHORT(hdr.ns, pkt);
		GETSHORT(hdr.nr, pkt);
	}
	if (hdr.o != 0) {
		GETSHORT(offsiz, pkt);
		if (pktlen < offsiz) {
			snprintf(errmsg, sizeof(errmsg),
			    "offset field is bigger than remaining packet "
			    "length %d > %d", offsiz, pktlen);
			goto bad_packet;
		}
		pkt += offsiz;
	}
	L2TP_CTRL_ASSERT(pkt - pkt0 == reqlen);
	pktlen -= (pkt - pkt0);	/* cut down the length of header */

	ctrl = NULL;
	memset(buf, 0, sizeof(buf));
	mestype = 0;
	avp = NULL;

	if (is_ctrl) {
		avp0 = (struct l2tp_avp *)buf;
		avp = avp_find_message_type_avp(avp0, pkt, pktlen);
		if (avp != NULL)
			mestype = avp->attr_value[0] << 8 | avp->attr_value[1];
	}
	ctrl = l2tpd_get_ctrl(_this, hdr.tunnel_id);

	if (ctrl == NULL) {
		/* new control */
		if (!is_ctrl) {
			snprintf(errmsg, sizeof(errmsg),
			    "bad data message: tunnelId=%d is not "
			    "found.", hdr.tunnel_id);
			goto bad_packet;
		}
		if (mestype != L2TP_AVP_MESSAGE_TYPE_SCCRQ) {
			snprintf(errmsg, sizeof(errmsg),
			    "bad control message: tunnelId=%d is not "
			    "found.  mestype=%s", hdr.tunnel_id,
			    avp_mes_type_string(mestype));
			goto bad_packet;
		}

		if ((ctrl = l2tp_ctrl_create()) == NULL) {
			l2tp_ctrl_log(ctrl, LOG_ERR,
			    "l2tp_ctrl_create() failed: %m");
			goto fail;
		}

		if (l2tp_ctrl_init(ctrl, _this, peer, sock, nat_t_ctx) != 0) {
			l2tp_ctrl_log(ctrl, LOG_ERR,
			    "l2tp_ctrl_start() failed: %m");
			goto fail;
		}

		ctrl->listener_index = listener_index;
		l2tp_ctrl_reload(ctrl);
	} else {
		/*
		 * treat it as an error if the src address and port do not
		 * match (because it is potentially a DoS attack).
		 */
		int notmatch = 0;

		if (ctrl->peer.ss_family != peer->sa_family)
			notmatch = 1;
		else if (peer->sa_family == AF_INET) {
			if (SIN(peer)->sin_addr.s_addr != 
			    SIN(&ctrl->peer)->sin_addr.s_addr ||
			    SIN(peer)->sin_port != SIN(&ctrl->peer)->sin_port)
				notmatch = 1;
		} else if (peer->sa_family == AF_INET6) {
			if (!IN6_ARE_ADDR_EQUAL(&(SIN6(peer)->sin6_addr),
				    &(SIN6(&ctrl->peer)->sin6_addr)) ||
			    SIN6(peer)->sin6_port !=
				    SIN6(&ctrl->peer)->sin6_port)
				notmatch = 1;
 		}
		if (notmatch) {
			snprintf(errmsg, sizeof(errmsg),
			    "tunnelId=%u is already assigned for %s",
			    hdr.tunnel_id, addrport_tostring(
				(struct sockaddr *)&ctrl->peer,
				ctrl->peer.ss_len, hbuf, sizeof(hbuf)));
			goto bad_packet;
		}
	}
	ctrl->last_rcv = curr_time;
	call = NULL;
	if (hdr.session_id != 0) {
		/* search l2tp_call by Session ID */
		/* linear search is enough for this purpose */
		len = slist_length(&ctrl->call_list);
		for (i = 0; i < len; i++) {
			call = slist_get(&ctrl->call_list, i);
			if (call->session_id == hdr.session_id)
				break;
			call = NULL;
		}
	}
	if (!is_ctrl) {
		int delayed = 0;

		/* L2TP data */
		if (ctrl->state != L2TP_CTRL_STATE_ESTABLISHED) {
			l2tp_ctrl_log(ctrl, LOG_WARNING,
			    "Received Data packet in '%s'",
			    l2tp_ctrl_state_string(ctrl));
			goto fail;
		}
		if (call == NULL) {
			l2tp_ctrl_log(ctrl, LOG_WARNING,
			    "Received a data packet but it has no call.  "
			    "session_id=%u",  hdr.session_id);
			goto fail;
		}
		L2TP_CTRL_DBG((ctrl, DEBUG_LEVEL_2,
		    "call=%u RECV   ns=%u nr=%u snd_nxt=%u rcv_nxt=%u len=%d",
		    call->id, hdr.ns, hdr.nr, call->snd_nxt, call->rcv_nxt,
		    pktlen));
		if (call->state != L2TP_CALL_STATE_ESTABLISHED){
			l2tp_ctrl_log(ctrl, LOG_WARNING,
			    "Received a data packet but call is not "
			    "established");
			goto fail;
		}

		if (hdr.s != 0) {
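			/*
			 * Data-channel sequencing: an Ns at or beyond rcv_nxt
			 * advances the window; an Ns up to
			 * L2TP_CALL_DELAY_LIMIT behind it is still passed up
			 * to PPP flagged as delayed; anything older than that
			 * is dropped below.
			 */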
			if (SEQ_LT(hdr.ns, call->rcv_nxt)) {
				if (SEQ_LT(hdr.ns,
				    call->rcv_nxt - L2TP_CALL_DELAY_LIMIT)) {
					/* sequence number is older than the delay tolerance */
					/* XXX: need to log? */
					L2TP_CTRL_DBG((ctrl, LOG_DEBUG,
					    "received an out-of-sequence "
					    "data packet: %u < %u.",
					    hdr.ns, call->rcv_nxt));
					return;
				}
				delayed = 1;
			} else {
				call->rcv_nxt = hdr.ns + 1;
			}
		}

		l2tp_call_ppp_input(call, pkt, pktlen, delayed);

		return;
	}
	if (hdr.s != 0) {
		L2TP_CTRL_DBG((ctrl, DEBUG_LEVEL_2,
		    "RECV %s ns=%u nr=%u snd_nxt=%u snd_una=%u rcv_nxt=%u "
		    "len=%d", (is_ctrl)? "C" : "", hdr.ns, hdr.nr,
		    ctrl->snd_nxt, ctrl->snd_una, ctrl->rcv_nxt, pktlen));

		if (pktlen <= 0)
			l2tp_ctrl_log(ctrl, LOG_INFO, "RecvZLB");

		if (SEQ_GT(hdr.nr, ctrl->snd_una)) {
			if (hdr.nr == ctrl->snd_nxt ||
			    SEQ_LT(hdr.nr, ctrl->snd_nxt))
				ctrl->snd_una = hdr.nr;
			else {
				l2tp_ctrl_log(ctrl, LOG_INFO,
				    "Received message has bad Nr field: "
				    "%u < %u.", hdr.ns, ctrl->snd_nxt);
				/* XXX Drop with ZLB? */
				goto fail;
			}
		}
		if (l2tp_ctrl_txwin_size(ctrl) <= 0) {
			/* no waiting ack */
			if (ctrl->hello_wait_ack != 0) {
				/*
				 * Reset the Hello state, as an ack for the
				 * Hello has been received.
				 */
				ctrl->hello_wait_ack = 0;
				ctrl->hello_io_time = curr_time;
			}
			switch (ctrl->state) {
			case L2TP_CTRL_STATE_CLEANUP_WAIT:
				l2tp_ctrl_stop(ctrl, 0);
				return;
			}
		}
		if (hdr.ns != ctrl->rcv_nxt) {
			/* the message is out of sequence */
			if (l2tp_ctrl_resend_una_packets(ctrl) <= 0) {
				/* nothing was resent, so send a ZLB instead */
				l2tp_ctrl_send_ZLB(ctrl);
			}
#ifdef	L2TP_CTRL_DEBUG
			if (pktlen != 0) {	/* not ZLB */
				L2TP_CTRL_DBG((ctrl, LOG_DEBUG,
				    "receive out of sequence %u must be %u.  "
				    "mestype=%s", hdr.ns, ctrl->rcv_nxt,
				    avp_mes_type_string(mestype)));
			}
#endif
			return;
		}
		if (pktlen <= 0)
			return;		/* ZLB */

		if (l2tp_ctrl_txwin_is_full(ctrl)) {
			L2TP_CTRL_DBG((ctrl, LOG_DEBUG,
			    "Received message cannot be handled. "
			    "Transmission window is full."));
			l2tp_ctrl_send_ZLB(ctrl);
			return;
		}

		ctrl->rcv_nxt++;
		if (avp == NULL) {
			l2tpd_log(_this, LOG_WARNING,
			    "bad control message: no message-type AVP.");
			goto fail;
		}
	}

	/*
	 * state machine (RFC 2661 pp. 56-57)
	 */
	switch (ctrl->state) {
	case L2TP_CTRL_STATE_IDLE:
		switch (mestype) {
		case L2TP_AVP_MESSAGE_TYPE_SCCRQ:
			if (l2tp_ctrl_recv_SCCRQ(ctrl, pkt, pktlen, _this,
			    peer) == 0) {
				/* acceptable */
				l2tp_ctrl_send_SCCRP(ctrl);
				ctrl->state = L2TP_CTRL_STATE_WAIT_CTL_CONN;
				return;
			}
			/*
			 * if unacceptable, it has already been handled
			 * by l2tp_ctrl_recv_SCCRQ
			 */
			return;
		case L2TP_AVP_MESSAGE_TYPE_SCCRP:
			/*
			 * The RFC specifies sending StopCCN in this state.
			 * However, as this implementation only supports
			 * passive open, this packet should never be received.
			 */
			/* FALLTHROUGH */
		case L2TP_AVP_MESSAGE_TYPE_SCCCN:
		default:
			break;
		}
		goto fsm_fail;

	case L2TP_CTRL_STATE_WAIT_CTL_CONN:
	    /* Wait-Ctl-Conn */
		switch (mestype) {
		case L2TP_AVP_MESSAGE_TYPE_SCCCN:
			l2tp_ctrl_log(ctrl, LOG_INFO, "RecvSCCCN");
			if (l2tp_ctrl_send_ZLB(ctrl) == 0) {
				ctrl->state = L2TP_CTRL_STATE_ESTABLISHED;
			}
			return;
		case L2TP_AVP_MESSAGE_TYPE_StopCCN:
			goto receive_stop_ccn;
		case L2TP_AVP_MESSAGE_TYPE_SCCRQ:
		case L2TP_AVP_MESSAGE_TYPE_SCCRP:
		default:
			break;
		}
		break;	/* fsm_fail */
	case L2TP_CTRL_STATE_ESTABLISHED:
	    /* Established */
		switch (mestype) {
		case L2TP_AVP_MESSAGE_TYPE_SCCCN:
		case L2TP_AVP_MESSAGE_TYPE_SCCRQ:
		case L2TP_AVP_MESSAGE_TYPE_SCCRP:
			break;
receive_stop_ccn:
		case L2TP_AVP_MESSAGE_TYPE_StopCCN:
			if (l2tp_ctrl_recv_StopCCN(ctrl, pkt, pktlen) == 0) {
				if (l2tp_ctrl_resend_una_packets(ctrl) <= 0)
					l2tp_ctrl_send_ZLB(ctrl);
				l2tp_ctrl_stop(ctrl, 0);
				return;
			}
			l2tp_ctrl_log(ctrl, LOG_ERR, "Received bad StopCCN");
			l2tp_ctrl_send_ZLB(ctrl);
			l2tp_ctrl_stop(ctrl, 0);
			return;

		case L2TP_AVP_MESSAGE_TYPE_HELLO:
			if (l2tp_ctrl_resend_una_packets(ctrl) <= 0)
				l2tp_ctrl_send_ZLB(ctrl);
			return;
		case L2TP_AVP_MESSAGE_TYPE_CDN:
		case L2TP_AVP_MESSAGE_TYPE_ICRP:
		case L2TP_AVP_MESSAGE_TYPE_ICCN:
			if (call == NULL) {
				l2tp_ctrl_log(ctrl, LOG_INFO,
				    "Unknown call message: %s",
				    avp_mes_type_string(mestype));
				goto fail;
			}
			/* FALLTHROUGH */
		case L2TP_AVP_MESSAGE_TYPE_ICRQ:
			l2tp_call_recv_packet(ctrl, call, mestype, pkt,
			    pktlen);
			return;
		default:
			break;
		}
		break;	/* fsm_fail */
	case L2TP_CTRL_STATE_CLEANUP_WAIT:
		if (mestype == L2TP_AVP_MESSAGE_TYPE_StopCCN) {
			/*
			 * We left ESTABLISHED state, but the peer sent StopCCN.
			 */
			goto receive_stop_ccn;
		}
		break;	/* fsm_fail */
	}

fsm_fail:
	/* state machine error */
	l2tp_ctrl_log(ctrl, LOG_WARNING, "Received %s in '%s' state",
	    avp_mes_type_string(mestype), l2tp_ctrl_state_string(ctrl));
	l2tp_ctrl_stop(ctrl, L2TP_STOP_CCN_RCODE_FSM_ERROR);

	return;
fail:
	if (ctrl != NULL && mestype != 0) {
		l2tp_ctrl_log(ctrl, LOG_WARNING, "Received %s in '%s' state",
		    avp_mes_type_string(mestype), l2tp_ctrl_state_string(ctrl));
		l2tp_ctrl_stop(ctrl, L2TP_STOP_CCN_RCODE_GENERAL_ERROR);
	}
	return;

bad_packet:
	l2tpd_log(_this, LOG_INFO, "Received from=%s: %s",
	    addrport_tostring(peer, peer->sa_len, hbuf, sizeof(hbuf)), errmsg);

	return;
}
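
/*
 * Reference sketch (not from the original source): plausible definitions of
 * the helpers the function above leans on.  GETSHORT() reads a 16-bit
 * big-endian field and advances the cursor; the SEQ macros compare Ns/Nr
 * values with modulo-2^16 "serial number" arithmetic so wraparound is
 * handled.  The real tree may define them differently.
 */
#include <stdint.h>

#define GETSHORT(v, cp) do {						\
	(v) = (uint16_t)(((cp)[0] << 8) | (cp)[1]);			\
	(cp) += 2;							\
} while (0)

#define SEQ_LT(a, b)	((int16_t)((uint16_t)(a) - (uint16_t)(b)) < 0)
#define SEQ_GT(a, b)	((int16_t)((uint16_t)(a) - (uint16_t)(b)) > 0)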
Beispiel #25
int
tcp_output(struct tcpcb * tp)
{
   struct socket *   so =  tp->t_inpcb->inp_socket;
   int   len;
   long  win;
   int   off,  flags,   error;
   struct mbuf *  m;
   struct tcpiphdr * ti;
   unsigned optlen = 0;
   int   idle, sendalot;
   struct mbuf *  sendm;   /* mbuf which contains data to send */
   struct mbuf * tcp_mbuf; /* mbuf containing TCP header */
   int   bufoff;           /* offset of data in sendm->m_data */

#ifdef TCP_SACK
   int   sack_resend;
   int   sack_hole = 0;    /* next sack hole to fill */

   if(tp->t_flags & TF_SACKREPLY)
   {
      /* we are resending based on a received SACK header */
      sack_resend = TRUE;
      tp->t_flags &= ~TF_SACKREPLY;    /* clear flag */
   }
   else
      sack_resend = FALSE;
#endif /* TCP_SACK */
   
   /*
    * Determine length of data that should be transmitted,
    * and flags that will be used.
    * If there is some data or critical controls (SYN, RST)
    * to send, then transmit; otherwise, investigate further.
    */
   idle = (tp->snd_max == tp->snd_una);

again:
   sendalot = 0;
   off = (int)(tp->snd_nxt - tp->snd_una);
   win = (long)tp->snd_wnd;   /* set basic send window */
   if (win > (long)tp->snd_cwnd) /* see if we need congestion control */
   {
      win = (int)(tp->snd_cwnd & ~(ALIGN_TYPE-1)); /* keep data aligned */
   }

   /*
    * If in persist timeout with window of 0, send 1 byte.
    * Otherwise, if window is small but nonzero
    * and timer expired, we will send what we can
    * and go to transmit state.
    */
   if (tp->t_force) 
   {
      if (win == 0)
         win = 1;
      else 
      {
         tp->t_timer[TCPT_PERSIST] = 0;
         tp->t_rxtshift = 0;
      }
   }

#ifdef TCP_SACK
   /* See if we need to adjust the offset for a sack resend */
   if(sack_resend)
   {
      off = (int)(tp->sack_hole_start[sack_hole] - tp->snd_una);
      /* if this hole's already been acked then punt and move to next hole */
      if(off < 0)
      {
         /* clear out the acked hole */
         tp->sack_hole_start[sack_hole] = tp->sack_hole_end[sack_hole] = 0;
         /* see if we're done with SACK hole list (2 tests) */
         if(++sack_hole >= SACK_BLOCKS)
            return 0;
         if(tp->sack_hole_start[sack_hole] == tp->sack_hole_end[sack_hole])
            return 0;
         goto again;
      }
      tp->snd_nxt = tp->sack_hole_start[sack_hole];
      len = (int)(tp->sack_hole_end[sack_hole] - tp->sack_hole_start[sack_hole]);
      len = (int)MIN(len, (int)win);
   }
   else
#endif /* TCP_SACK */
   {
      /* set length of packets which are not sack resends */
      len = (int)MIN(so->so_snd.sb_cc, (unsigned)win) - off;
   }
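
   /* "len" is now the fresh data we may send on this pass: the part of
    * the socket buffer that fits in the usable window, minus the "off"
    * bytes already in flight.  A negative result is normalized below.
    */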

   flags = tcp_outflags[tp->t_state];


   /* See if we need to build TCP options field. This test should be fast. */

#if (defined(TCP_TIMESTAMP) | defined(TCP_SACK))	   
   if((flags & TH_SYN) ||
/*   !!!???   (so->so_options & SO_TIMESTAMP) ||  */
	  (tp->t_flags & TF_SACKNOW)
	 )
   {
      optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so);
   }
#else
   /* If no other options are compiled into this build, don't bother to
    * call bld_options() except on SYN packets
    */
   if(flags & TH_SYN)
   {
      optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so);
   }
#endif

   if (len < 0)
   {
      /*
       * If FIN has been sent but not acked,
       * but we haven't been called to retransmit,
       * len will be -1.  Otherwise, window shrank
       * after we sent into it.  If window shrank to 0,
       * cancel pending retransmit and pull snd_nxt
       * back to (closed) window.  We will enter persist
       * state below.  If the window didn't close completely,
       * just wait for an ACK.
       */
      len = 0;
      if (win == 0) 
      {
         tp->t_timer[TCPT_REXMT] = 0;
         tp->snd_nxt = tp->snd_una;
      }
   }

   if (len > (int)tp->t_maxseg)
   {
      len = tp->t_maxseg;
      sendalot = 1;
   }

#ifdef IP_V4
#ifdef IP_PMTU
   {
      int pmtu = tp->t_inpcb->inp_pmtu - 40;

      if (len > pmtu)
      {
         len = pmtu;    /* pmtu above already excludes the 40-byte IP/TCP headers */
         sendalot = 1;
      }
   }
#endif /* IP_PMTU */
   /* We don't need a pmtu test for IPv6. V6 code limits t_maxseg to
    * the Path MTU, so the t_maxseg test above the v4 ifdef covers us.
    */
#endif /* IP_V4 */

   if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
      flags &= ~TH_FIN;
   win = (long)(sbspace(&so->so_rcv));

   /*
    * If our state indicates that FIN should be sent
    * and we have not yet done so, or we're retransmitting the FIN,
    * then we need to send.
    */
   if ((flags & TH_FIN) &&
       (so->so_snd.sb_cc == 0) &&
       ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
   {
      goto send;
   }
   /*
    * Send if we owe peer an ACK.
    */
   if (tp->t_flags & TF_ACKNOW)
      goto send;
   if (flags & (TH_SYN|TH_RST))
      goto send;
   if (SEQ_GT(tp->snd_up, tp->snd_una))
      goto send;

   /*
    * Sender silly window avoidance.  Send if the connection is idle
    * and we can send all queued data, or if we have a full maximum
    * segment, or if we are forced (persist); otherwise don't bother.
    * If peer's buffer is tiny, then send
    * when window is at least half open.
    * If retransmitting (possibly after persist timer forced us
    * to send into a small window), then must resend.
    */
   if (len)
   {
      if (len == (int)tp->t_maxseg)
         goto send;
      if ((idle || tp->t_flags & TF_NODELAY) &&
          len + off >= (int)so->so_snd.sb_cc)
      {
         goto send;
      }
      if (tp->t_force)
         goto send;
      if (len >= (int)(tp->max_sndwnd / 2))
         goto send;
      if (SEQ_LT(tp->snd_nxt, tp->snd_max))
         goto send;
   }

   /*
    * Compare available window to amount of window
    * known to peer (as advertised window less
    * next expected input).  If the difference is at least two
    * max size segments or at least 35% of the maximum possible
    * window, then want to send a window update to peer.
    */
   if (win > 0)
   {
      int   adv   =  (int)win -  (int)(tp->rcv_adv -  tp->rcv_nxt);

      if (so->so_rcv.sb_cc == 0 && adv >= (int)(tp->t_maxseg * 2))
         goto send;
      if (100 * (u_int)adv / so->so_rcv.sb_hiwat >= 35)
         goto send;
   }
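
   /* For example, with t_maxseg == 1460 and a drained receive buffer, a
    * pure window update goes out once the window can open by at least
    * 2920 bytes, or in any case once it can open by 35% of sb_hiwat.
    */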

   /*
    * TCP window updates are not reliable, rather a polling protocol
    * using ``persist'' packets is used to insure receipt of window
    * updates.  The three ``states'' for the output side are:
    *   idle         not doing retransmits or persists
    *   persisting      to move a small or zero window
    *   (re)transmitting   and thereby not persisting
    *
    * tp->t_timer[TCPT_PERSIST]
    *   is set when we are in persist state.
    * tp->t_force
    *   is set when we are called to send a persist packet.
    * tp->t_timer[TCPT_REXMT]
    *   is set when we are retransmitting
    * The output side is idle when both timers are zero.
    *
    * If send window is too small, there is data to transmit, and no
    * retransmit or persist is pending, then go to persist state.
    * If nothing happens soon, send when timer expires:
    * if window is nonzero, transmit what we can,
    * otherwise force out a byte.
    */
   if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
       tp->t_timer[TCPT_PERSIST] == 0) 
   {
      tp->t_rxtshift = 0;
      tcp_setpersist(tp);
   }

   /*
    * No reason to send a segment, just return.
    */
   return (0);

send:
   ENTER_CRIT_SECTION(tp);

   /* Limit send length to the current buffer so as to
    * avoid doing the "mbuf shuffle" in m_copy().
    */
   bufoff = off;
   sendm = so->so_snd.sb_mb;
   if (len)
   {
      /* find mbuf containing data to send (at "off") */
      while (sendm)  /* loop through socket send list */
      {
         bufoff -= sendm->m_len;
         if (bufoff < 0)   /* if off is in this buffer, break */
            break;
         sendm = sendm->m_next;
      }
      if (!sendm) { dtrap();  /* shouldn't happen */ }
      bufoff += sendm->m_len; /* index to next data to send in sendm */

      /* if socket has multiple unsent mbufs, set flag for send to loop */
      if ((sendm->m_next) && (len > (int)sendm->m_len))
      {
         flags &= ~TH_FIN; /* don't FIN on segment prior to last */
         sendalot = 1;     /* set to send more segments */
      }
      if((flags & TH_FIN) && (so->so_snd.sb_cc > (unsigned)len))
      {
         /* This can happen on slow links (PPP) which retry the last 
          * segment - the one with the FIN bit attached to data.
          */
         flags &= ~TH_FIN; /* don't FIN on segment prior to last */
      }

      /* only send the rest of sendm */
      len = min(len, (int)sendm->m_len);

      /* if we're not sending starting at sendm->m_data (in which 
       * case bufoff != 0), then we will copy the data; else we would 
       * write IP/TCP headers over sent but un-ack'ed data in sendm. 
       * Similarly, if sendm->m_data is not aligned with respect to 
       * sendm->m_base and ALIGN_TYPE, we will copy the data to 
       * ensure that it (and the then-prepended IP/TCP headers) will 
       * be aligned according to ALIGN_TYPE. 
       */
      if ((bufoff != 0) ||       /* data not front aligned in send mbuf? */
          (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) != 0))
      {
         len = min(len, (int)(sendm->m_len - bufoff));   /* limit len again */

         /* One more test - if this data is not aligned with the front
          * of the m_data buffer then we can't use it in place, else we
          * might write the IP/TCP header over data that has not yet
          * been acked. In this case we must make sure our send
          * fits into a little buffer and send what we can.
          */
         if ((len > (int)(lilbufsiz - HDRSLEN)) && /* length bigger than the small buffer? */
             (bigfreeq.q_len < 2))      /* and we are low on big buffers */
         {
            len = lilbufsiz - HDRSLEN;
         }
      }
   }

   /* if send data is sufficiently aligned in packet, prepend TCP/IP header
    * in the space provided. 
    */
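   /* (The "clone" below avoids copying the payload: the new mbuf shares
    * the data buffer and bumps the packet's reference count, and the
    * TCP/IP header is written into the headroom in front of the data.)
    */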
   if (len && (bufoff == 0) && 
       (sendm->pkt->inuse == 1) &&
       (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) == 0) && 
       (optlen == 0))
   {
      /* get an empty mbuf to "clone" the data */
      m = m_getnbuf(MT_TXDATA, 0);
      if (!m)
      {
         EXIT_CRIT_SECTION(tp);
         return (ENOBUFS);
      }
      m->pkt = sendm->pkt; /* copy packet location in new mbuf */
      m->pkt->inuse++;     /* bump packet's use count */
      m->m_base = sendm->m_base; /* clone mbuf members */
      m->m_memsz = sendm->m_memsz;
      m->m_len = len + TCPIPHDRSZ;  /* adjust clone for header */
      m->m_data = sendm->m_data - TCPIPHDRSZ;
   }
   else  /* either no data or data is not front aligned in mbuf */
   {
      /* Grab a header mbuf, attaching a copy of data to be 
       * transmitted, and initialize the header from 
       * the template for sends on this connection.
       */
      m = m_getwithdata (MT_HEADER, IFNETHDR_SIZE + TCPIPHDRSZ);
      if (m ==(struct mbuf *)NULL)
      {
         EXIT_CRIT_SECTION(tp);
         return ENOBUFS;
      }

      m->m_len = TCPIPHDRSZ;
      m->m_data += IFNETHDR_SIZE;   /* leave room in front for the link */
                                    /* header, e.g. 14 bytes for ethernet */

      if (len) /* attach any data to send */
      {
         m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
         if (m->m_next == 0)
         {
            m_freem(m);
            EXIT_CRIT_SECTION(tp);
            return ENOBUFS;
         }
      }
   }
   EXIT_CRIT_SECTION(tp);

   if (len) 
   {
      if (tp->t_force && len == 1)
         tcpstat.tcps_sndprobe++;
      else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 
      {
         tcpstat.tcps_sndrexmitpack++;
         tcpstat.tcps_sndrexmitbyte += len;
#ifdef TCP_SACK
      if(sack_resend)
         tcpstat.tcps_sackresend++;
#endif
      } 
      else 
      {
         tcpstat.tcps_sndpack++;
         tcpstat.tcps_sndbyte += len;
      }
   }
   else if (tp->t_flags & TF_ACKNOW)
   {
      tcpstat.tcps_sndacks++;
   }
   else if (flags & (TH_SYN|TH_FIN|TH_RST))
      tcpstat.tcps_sndctrl++;
   else if (SEQ_GT(tp->snd_up, tp->snd_una))
      tcpstat.tcps_sndurg++;
   else
      tcpstat.tcps_sndwinup++;

   ti = (struct tcpiphdr *)(m->m_data+sizeof(struct ip)-sizeof(struct ipovly));
   if ((char *)ti < m->pkt->nb_buff)
   {
      panic("tcp_out- packet ptr underflow\n");
   }
   tcp_mbuf = m;        /* flag TCP header mbuf */

#ifdef IP_V6  /* Dual mode code */
   if(so->so_domain == AF_INET6)
   {
      m = mbuf_prepend(m, sizeof(struct ipv6));
      if(m == NULL)
      {
         /* this can happen when we run out of mbufs or pkt buffers
          * That is, mfreeq is empty or (lilfreeq, bigfreeq) are empty.
          * One solution is to find out which one is getting full and
          * then increase them.
          */
         dtrap();             /* This is really rare... */
         m_freem(tcp_mbuf);   /* Free TCP/data chain */
         return ENOBUFS;
      }

      /* strip overlay from front of TCP header */
      tcp_mbuf->m_data += sizeof(struct ipovly);
      tcp_mbuf->m_len -= sizeof(struct ipovly);
   }
#endif   /* end IP_V6 */

   if (tp->t_template == 0)
      panic("tcp_output");

   MEMCPY((char*)ti, (char*)tp->t_template, sizeof(struct tcpiphdr));

   /*
    * Fill in fields, remembering maximum advertised
    * window for use in delaying messages about window sizes.
    * If resending a FIN, be sure not to use a new sequence number.
    */
   if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 
       tp->snd_nxt == tp->snd_max)
   {
      tp->snd_nxt--;
   }

   ti->ti_seq = htonl(tp->snd_nxt);
   ti->ti_ack = htonl(tp->rcv_nxt);

   /*
    * If we're sending a SYN, check the IP address of the interface
    * that we will (likely) use to send the IP datagram -- if it's
    * changed from what is in the template (as it might if this is
    * a retransmission, and the original SYN caused PPP to start
    * bringing the interface up, and PPP has got a new IP address
    * via IPCP), update the template and the inpcb with the new 
    * address.
    */
   if (flags & TH_SYN)
   {
      struct inpcb * inp;
      inp = (struct inpcb *)so->so_pcb;

      switch(so->so_domain)
      {
#ifdef IP_V4
      case AF_INET:
      {
         ip_addr src;

#ifdef INCLUDE_PPP

         if(((flags & TH_ACK) == 0) && /* SYN only, not SYN/ACK */
            (inp->ifp) &&              /* Make sure we have iface */
            (inp->ifp->mib.ifType == PPP))   /* only PPP type */
         {
            dtrap(); /* remove after confirmed to work in PPP */ 
            src = ip_mymach(ti->ti_dst.s_addr);

            if (src != ti->ti_src.s_addr)
            {
               ti->ti_src.s_addr = src;
               tp->t_template->ti_src.s_addr = src;
               tp->t_inpcb->inp_laddr.s_addr = src;
            }
         }
#endif   /* INCLUDE_PPP */

         /* If this is a SYN (not a SYN/ACK) then set the pmtu */
         if((flags & TH_ACK) == 0)
         {
#ifdef IP_PMTU
            inp->inp_pmtu = pmtucache_get(inp->inp_faddr.s_addr);
#else    /* not compiled for pathmtu, guess based on iface */
            {
               NET ifp;
               /* find iface for route. Pass "src" as nexthop return */
               ifp = iproute(ti->ti_dst.s_addr, &src);
               if(ifp)
                  inp->inp_pmtu = ifp->n_mtu - (ifp->n_lnh + 40);
               else
                  inp->inp_pmtu = 580;  /* Ugh. */
            }
#endif   /* IP_PMTU */
         }
         break;
      }
#endif   /* IP_V4 */

#ifdef IP_V6
      case AF_INET6:
      {
         struct ip6_inaddr * local;
         
         local = ip6_myaddr(&tp->t_inpcb->ip6_faddr, inp->ifp);

         /* If we got a local address & it's not the one in the pcb, then
          * we assume it changed at the iface and fix it in the pcb. Unlike 
          * v4, we don't have an IP header yet, nor do we have a template 
          * to worry about.
          */
         if((local) && 
            (!IP6EQ(&local->addr, &tp->t_inpcb->ip6_laddr)))
         {
            IP6CPY(&tp->t_inpcb->ip6_laddr, &local->addr);
         }
         /* If this is a SYN (not a SYN/ACK) then set the pmtu */
         if((flags & TH_ACK) == 0)
         {
            inp->inp_pmtu = ip6_pmtulookup(&inp->ip6_laddr, inp->ifp);
         }
         break;
      }
#endif   /* IP_V6 */
      default:
         dtrap();    /* bad domain setting */
      }
   }

   /* fill in options if any are set */
   if (optlen)
   {
      struct mbuf * mopt;

      mopt = m_getwithdata(MT_TXDATA, MAXOPTLEN);
      if (mopt == NULL) 
      {
         m_freem(m);
         return (ENOBUFS);
      }

      /* insert options mbuf after tcp_mbuf */
      mopt->m_next = tcp_mbuf->m_next;
      tcp_mbuf->m_next = mopt;

      /* extend options to aligned address */
      while(optlen & 0x03)
         tcp_optionbuf[optlen++] = TCPOPT_EOL;

      MEMCPY(mtod(mopt, char *), tcp_optionbuf, optlen);
      mopt->m_len = optlen;
      /* use portable macro to set tcp data offset bits */
      SET_TH_OFF(ti->ti_t, ((sizeof (struct tcphdr) + optlen) >> 2));
   }

   ti->ti_flags = (u_char)flags;
   /*
    * Calculate receive window. Don't shrink window,
    * but avoid silly window syndrome.
    */
   if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
      win = 0;
   if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
      win = (long)(tp->rcv_adv - tp->rcv_nxt);

   /* do check for Iniche buffer limits -JB- */
   if (bigfreeq.q_len == 0)   /* If queue length is 0, set window to 0 */
   {
      win = 0;
   }
   else if(win > (((long)bigfreeq.q_len - 1) * (long)bigbufsiz))
   {
      win = ((long)bigfreeq.q_len - 1) * bigbufsiz;
   }

#ifdef TCP_WIN_SCALE
   if(tp->t_flags & TF_WINSCALE)
   {
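      /* e.g. a 256 KB receive window with rcv_wind_scale == 3 is
       * advertised as 262144 >> 3 == 32768
       */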
      ti->ti_win = htons((u_short)(win >> tp->rcv_wind_scale)); /* apply scale */
   }
Beispiel #26
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
	struct inpcb * const inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
	long len, recvwin, sendwin;
	int nsacked = 0;
	int off, flags, error = 0;
#ifdef TCP_SIGNATURE
	int sigoff = 0;
#endif
	struct mbuf *m;
	struct ip *ip;
	struct tcphdr *th;
	u_char opt[TCP_MAXOLEN];
	unsigned int ipoptlen, optlen, hdrlen;
	int idle;
	boolean_t sendalot;
	struct ip6_hdr *ip6;
#ifdef INET6
	const boolean_t isipv6 = INP_ISIPV6(inp);
#else
	const boolean_t isipv6 = FALSE;
#endif
	boolean_t can_tso = FALSE, use_tso;
	boolean_t report_sack, idle_cwv = FALSE;
	u_int segsz, tso_hlen, tso_lenmax = 0;
	int segcnt = 0;
	boolean_t need_sched = FALSE;

	KKASSERT(so->so_port == &curthread->td_msgport);

	/*
	 * Determine length of data that should be transmitted,
	 * and flags that will be used.
	 * If there is some data or critical controls (SYN, RST)
	 * to send, then transmit; otherwise, investigate further.
	 */

	/*
	 * If we have been idle for a while, the send congestion window
	 * could be no longer representative of the current state of the
	 * link; need to validate congestion window.  However, we should
	 * not perform congestion window validation here, since we could
	 * be asked to send pure ACK.
	 */
	if (tp->snd_max == tp->snd_una &&
	    (ticks - tp->snd_last) >= tp->t_rxtcur && tcp_idle_restart)
		idle_cwv = TRUE;

	/*
	 * Calculate whether the transmit stream was previously idle 
	 * and adjust TF_LASTIDLE for the next time.
	 */
	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
	if (idle && (tp->t_flags & TF_MORETOCOME))
		tp->t_flags |= TF_LASTIDLE;
	else
		tp->t_flags &= ~TF_LASTIDLE;

	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp))
		nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt);

	/*
	 * Find out whether TSO could be used or not
	 *
	 * For TSO capable devices, the following assumptions apply to
	 * the processing of TCP flags:
	 * - If FIN is set on the large TCP segment, the device must set
	 *   FIN on the last segment that it creates from the large TCP
	 *   segment.
	 * - If PUSH is set on the large TCP segment, the device must set
	 *   PUSH on the last segment that it creates from the large TCP
	 *   segment.
	 */
#if !defined(IPSEC) && !defined(FAST_IPSEC)
	if (tcp_do_tso
#ifdef TCP_SIGNATURE
	    && (tp->t_flags & TF_SIGNATURE) == 0
#endif
	) {
		if (!isipv6) {
			struct rtentry *rt = inp->inp_route.ro_rt;

			if (rt != NULL && (rt->rt_flags & RTF_UP) &&
			    (rt->rt_ifp->if_hwassist & CSUM_TSO)) {
				can_tso = TRUE;
				tso_lenmax = rt->rt_ifp->if_tsolen;
			}
		}
	}
#endif	/* !IPSEC && !FAST_IPSEC */

again:
	m = NULL;
	ip = NULL;
	th = NULL;
	ip6 = NULL;

	if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) ==
		TF_SACK_PERMITTED &&
	    (!TAILQ_EMPTY(&tp->t_segq) ||
	     tp->reportblk.rblk_start != tp->reportblk.rblk_end))
		report_sack = TRUE;
	else
		report_sack = FALSE;

	/* Make use of SACK information when slow-starting after an RTO. */
	if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max &&
	    !IN_FASTRECOVERY(tp)) {
		tcp_seq old_snd_nxt = tp->snd_nxt;

		tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt);
		nsacked += tp->snd_nxt - old_snd_nxt;
	}

	sendalot = FALSE;
	off = tp->snd_nxt - tp->snd_una;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked);
	sendwin = min(sendwin, tp->snd_bwnd);

	flags = tcp_outflags[tp->t_state];
	/*
	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
	 * state flags.
	 */
	if (tp->t_flags & TF_NEEDFIN)
		flags |= TH_FIN;
	if (tp->t_flags & TF_NEEDSYN)
		flags |= TH_SYN;

	/*
	 * If in persist timeout with window of 0, send 1 byte.
	 * Otherwise, if window is small but nonzero
	 * and timer expired, we will send what we can
	 * and go to transmit state.
	 */
	if (tp->t_flags & TF_FORCE) {
		if (sendwin == 0) {
			/*
			 * If we still have some data to send, then
			 * clear the FIN bit.  Usually this would
			 * happen below when it realizes that we
			 * aren't sending all the data.  However,
			 * if we have exactly 1 byte of unsent data,
			 * then it won't clear the FIN bit below,
			 * and if we are in persist state, we wind
			 * up sending the packet without recording
			 * that we sent the FIN bit.
			 *
			 * We can't just blindly clear the FIN bit,
			 * because if we don't have any more data
			 * to send then the probe will be the FIN
			 * itself.
			 */
			if (off < so->so_snd.ssb_cc)
				flags &= ~TH_FIN;
			sendwin = 1;
		} else {
			tcp_callout_stop(tp, tp->tt_persist);
			tp->t_rxtshift = 0;
		}
	}

	/*
	 * If snd_nxt == snd_max and we have transmitted a FIN, the
	 * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in
	 * a negative length.  This can also occur when TCP opens up
	 * its congestion window while receiving additional duplicate
	 * acks after fast-retransmit because TCP will reset snd_nxt
	 * to snd_max after the fast-retransmit.
	 *
	 * A negative length can also occur when we are in the
	 * TCPS_SYN_RECEIVED state due to a simultaneous connect where
	 * our SYN has not been acked yet.
	 *
	 * In the normal retransmit-FIN-only case, however, snd_nxt will
	 * be set to snd_una, the offset will be 0, and the length may
	 * wind up 0.
	 */
	len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off;

	/*
	 * Lop off SYN bit if it has already been sent.  However, if this
	 * is SYN-SENT state and if segment contains data, suppress sending
	 * segment (sending the segment would be an option if we still
	 * did TAO and the remote host supported it).
	 */
	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
		flags &= ~TH_SYN;
		off--, len++;
		if (len > 0 && tp->t_state == TCPS_SYN_SENT) {
			tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW);
			return 0;
		}
	}

	/*
	 * Be careful not to send data and/or FIN on SYN segments.
	 * This measure is needed to prevent interoperability problems
	 * with not fully conformant TCP implementations.
	 */
	if (flags & TH_SYN) {
		len = 0;
		flags &= ~TH_FIN;
	}

	if (len < 0) {
		/*
		 * A negative len can occur if our FIN has been sent but not
		 * acked, or if we are in a simultaneous connect in the
		 * TCPS_SYN_RECEIVED state with our SYN sent but not yet
		 * acked.
		 *
		 * If our window has contracted to 0 in the FIN case
		 * (which can only occur if we have NOT been called to
		 * retransmit as per code a few paragraphs up) then we
		 * want to shift the retransmit timer over to the
		 * persist timer.
		 *
		 * However, if we are in the TCPS_SYN_RECEIVED state
		 * (the SYN case) we will be in a simultaneous connect and
		 * the window may be zero degeneratively.  In this case we
		 * do not want to shift to the persist timer after the SYN
		 * or the SYN+ACK transmission.
		 */
		len = 0;
		if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) {
			tcp_callout_stop(tp, tp->tt_rexmt);
			tp->t_rxtshift = 0;
			tp->snd_nxt = tp->snd_una;
			if (!tcp_callout_active(tp, tp->tt_persist))
				tcp_setpersist(tp);
		}
	}

	KASSERT(len >= 0, ("%s: len < 0", __func__));
	/*
	 * Automatic sizing of send socket buffer.  Often the send buffer
	 * size is not optimally adjusted to the actual network conditions
	 * at hand (delay bandwidth product).  Setting the buffer size too
	 * small limits throughput on links with high bandwidth and high
	 * delay (eg. trans-continental/oceanic links).  Setting the
	 * buffer size too big consumes too much real kernel memory,
	 * especially with many connections on busy servers.
	 *
	 * The criteria to step up the send buffer one notch are:
	 *  1. receive window of remote host is larger than send buffer
	 *     (with a fudge factor of 5/4th);
	 *  2. hiwat has not significantly exceeded bwnd (inflight)
	 *     (bwnd is a maximal value if inflight is disabled).
	 *  3. send buffer is filled to 7/8th with data (so we actually
	 *     have data to make use of it);
	 *  4. hiwat has not hit maximal automatic size;
	 *  5. our send window (slow start and congestion controlled) is
	 *     larger than sent but unacknowledged data in send buffer.
	 *
	 * The remote host receive window scaling factor may limit the
	 * growing of the send buffer before it reaches its allowed
	 * maximum.
	 *
	 * It scales directly with slow start or congestion window
	 * and does at most one step per received ACK.  This fast
	 * scaling has the drawback of growing the send buffer beyond
	 * what is strictly necessary to make full use of a given
	 * delay*bandwidth product.  However, testing has shown this not
	 * to be much of a problem.  At worst we are trading unused
	 * available bandwidth for some wasted socket buffer memory.
	 *
	 * The criteria for shrinking the buffer is based solely on
	 * the inflight code (snd_bwnd).  If inflight is disabled,
	 * the buffer will not be shrunk.  Note that snd_bwnd already
	 * has a fudge factor.  Our test adds a little hysteresis.
	 */
	if (tcp_do_autosndbuf && (so->so_snd.ssb_flags & SSB_AUTOSIZE)) {
		const int asbinc = tcp_autosndbuf_inc;
		const int hiwat = so->so_snd.ssb_hiwat;
		const int lowat = so->so_snd.ssb_lowat;
		u_long newsize;

		if ((tp->snd_wnd / 4 * 5) >= hiwat &&
		    so->so_snd.ssb_cc >= (hiwat / 8 * 7) &&
		    hiwat < tp->snd_bwnd + hiwat / 10 &&
		    hiwat + asbinc < tcp_autosndbuf_max &&
		    hiwat < (TCP_MAXWIN << tp->snd_scale) &&
		    sendwin >= (so->so_snd.ssb_cc -
				(tp->snd_nxt - tp->snd_una))) {
			newsize = ulmin(hiwat + asbinc, tcp_autosndbuf_max);
			if (!ssb_reserve(&so->so_snd, newsize, so, NULL))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
#if 0
			if (newsize >= (TCP_MAXWIN << tp->snd_scale))
				atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE);
#endif
		} else if ((long)tp->snd_bwnd <
			   (long)(hiwat * 3 / 4 - lowat - asbinc) &&
			   hiwat > tp->t_maxseg * 2 + asbinc &&
			   hiwat + asbinc >= tcp_autosndbuf_min &&
			   tcp_do_autosndbuf == 1) {
			newsize = ulmax(hiwat - asbinc, tp->t_maxseg * 2);
			ssb_reserve(&so->so_snd, newsize, so, NULL);
		}
	}
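
	/*
	 * Worked example: with ssb_hiwat == 65536 the grow test fires only
	 * while the peer's advertised window is at least ~4/5 of that
	 * (snd_wnd/4*5 >= hiwat) and the buffer is at least 7/8 full;
	 * hiwat then steps up by tcp_autosndbuf_inc, at most once per
	 * received ACK.
	 */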

	/*
	 * Don't use TSO, if:
	 * - Congestion window needs validation
	 * - There are SACK blocks to report
	 * - RST or SYN flags is set
	 * - URG will be set
	 *
	 * XXX
	 * Checking for SYN|RST looks like overkill, but better safe than sorry
	 */
	use_tso = can_tso;
	if (report_sack || idle_cwv || (flags & (TH_RST | TH_SYN)))
		use_tso = FALSE;
	if (use_tso) {
		tcp_seq ugr_nxt = tp->snd_nxt;

		if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
		    tp->snd_nxt == tp->snd_max)
			--ugr_nxt;

		if (SEQ_GT(tp->snd_up, ugr_nxt))
			use_tso = FALSE;
	}

	if (use_tso) {
		/*
		 * Find out segment size and header length for TSO
		 */
		error = tcp_tso_getsize(tp, &segsz, &tso_hlen);
		if (error)
			use_tso = FALSE;
	}
	if (!use_tso) {
		segsz = tp->t_maxseg;
		tso_hlen = 0; /* not used */
	}

	/*
	 * Truncate to the maximum segment length if not TSO, and ensure that
	 * FIN is removed if the length no longer contains the last data byte.
	 */
	if (len > segsz) {
		if (!use_tso) {
			len = segsz;
			++segcnt;
		} else {
			int nsegs;

			if (__predict_false(tso_lenmax < segsz))
				tso_lenmax = segsz << 1;

			/*
			 * Truncate TSO transfers to (IP_MAXPACKET - iphlen -
			 * thoff), and make sure that we send equal size
			 * transfers down the stack (rather than big-small-
			 * big-small-...).
			 */
			len = min(len, tso_lenmax);
			nsegs = min(len, (IP_MAXPACKET - tso_hlen)) / segsz;
			KKASSERT(nsegs > 0);

			len = nsegs * segsz;

			if (len <= segsz) {
				use_tso = FALSE;
				++segcnt;
			} else {
				segcnt += nsegs;
			}
		}
		sendalot = TRUE;
	} else {
		use_tso = FALSE;
		if (len > 0)
			++segcnt;
	}
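	/*
	 * TSO sizing example: with segsz == 1448 and tso_hlen == 40,
	 * nsegs == min(len, 65535 - 40) / 1448 == 45 for a large len, so
	 * at most 45 * 1448 == 65160 payload bytes are handed down per
	 * pass, always as a whole number of segments.
	 */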
	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc))
		flags &= ~TH_FIN;

	recvwin = ssb_space(&so->so_rcv);

	/*
	 * Sender silly window avoidance.   We transmit under the following
	 * conditions when len is non-zero:
	 *
	 *	- We have a full segment
	 *	- This is the last buffer in a write()/send() and we are
	 *	  either idle or running NODELAY
	 *	- we've timed out (e.g. persist timer)
	 *	- we have more than 1/2 the maximum send window's worth of
	 *	  data (receiver may be limiting the window size)
	 *	- we need to retransmit
	 */
	if (len) {
		if (len >= segsz)
			goto send;
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause
		 * us to flush a buffer queued with moretocome.  XXX
		 *
		 * note: the len + off check is almost certainly unnecessary.
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    len + off >= so->so_snd.ssb_cc &&
		    !(tp->t_flags & TF_NOPUSH)) {
			goto send;
		}
		if (tp->t_flags & TF_FORCE)		/* typ. timeout case */
			goto send;
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
			goto send;
		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
			goto send;
		if (tp->t_flags & TF_XMITNOW)
			goto send;
	}

	/*
	 * Compare available window to amount of window
	 * known to peer (as advertised window less
	 * next expected input).  If the difference is at least two
	 * max size segments, or at least 50% of the maximum possible
	 * window, then want to send a window update to peer.
	 */
	if (recvwin > 0) {
		/*
		 * "adv" is the amount we can increase the window,
		 * taking into account that we are limited by
		 * TCP_MAXWIN << tp->rcv_scale.
		 */
		long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) -
			(tp->rcv_adv - tp->rcv_nxt);
		long hiwat;

		/*
		 * This ack case typically occurs when the user has drained
		 * the TCP socket buffer sufficiently to warrant an ack
		 * containing a 'pure window update'... that is, an ack that
		 * ONLY updates the tcp window.
		 *
		 * It is unclear why we would need to do a pure window update
		 * past 2 segments if we are going to do one at 1/2 the high
		 * water mark anyway, especially since under normal conditions
		 * the user program will drain the socket buffer quickly.
		 * The 2-segment pure window update will often add a large
		 * number of extra, unnecessary acks to the stream.
		 *
		 * avoid_pure_win_update now defaults to 1.
		 */
		if (avoid_pure_win_update == 0 ||
		    (tp->t_flags & TF_RXRESIZED)) {
			if (adv >= (long) (2 * segsz)) {
				goto send;
			}
		}
		hiwat = (long)(TCP_MAXWIN << tp->rcv_scale);
		if (hiwat > (long)so->so_rcv.ssb_hiwat)
			hiwat = (long)so->so_rcv.ssb_hiwat;
		if (adv >= hiwat / 2)
			goto send;
	}

	/*
	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
	 * is also a catch-all for the retransmit timer timeout case.
	 */
	if (tp->t_flags & TF_ACKNOW)
		goto send;
	if ((flags & TH_RST) ||
	    ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN)))
		goto send;
	if (SEQ_GT(tp->snd_up, tp->snd_una))
		goto send;
	/*
	 * If our state indicates that FIN should be sent
	 * and we have not yet done so, then we need to send.
	 */
	if ((flags & TH_FIN) &&
	    (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una))
		goto send;

	/*
	 * TCP window updates are not reliable, rather a polling protocol
	 * using ``persist'' packets is used to insure receipt of window
	 * updates.  The three ``states'' for the output side are:
	 *	idle			not doing retransmits or persists
	 *	persisting		to move a small or zero window
	 *	(re)transmitting	and thereby not persisting
	 *
	 * tcp_callout_active(tp, tp->tt_persist)
	 *	is true when we are in persist state.
	 * The TF_FORCE flag in tp->t_flags
	 *	is set when we are called to send a persist packet.
	 * tcp_callout_active(tp, tp->tt_rexmt)
	 *	is set when we are retransmitting
	 * The output side is idle when both timers are zero.
	 *
	 * If send window is too small, there is data to transmit, and no
	 * retransmit or persist is pending, then go to persist state.
	 *
	 * If nothing happens soon, send when timer expires:
	 * if window is nonzero, transmit what we can, otherwise force out
	 * a byte.
	 *
	 * Don't try to set the persist state if we are in TCPS_SYN_RECEIVED
	 * with data pending.  This situation can occur during a
	 * simultaneous connect.
	 */
	if (so->so_snd.ssb_cc > 0 &&
	    tp->t_state != TCPS_SYN_RECEIVED &&
	    !tcp_callout_active(tp, tp->tt_rexmt) &&
	    !tcp_callout_active(tp, tp->tt_persist)) {
		tp->t_rxtshift = 0;
		tcp_setpersist(tp);
	}

	/*
	 * No reason to send a segment, just return.
	 */
	tp->t_flags &= ~TF_XMITNOW;
	return (0);

send:
	if (need_sched && len > 0) {
		tcp_output_sched(tp);
		return 0;
	}

	/*
	 * Before ESTABLISHED, force sending of initial options
	 * unless TCP set not to do any options.
	 * NOTE: we assume that the IP/TCP header plus TCP options
	 * always fit in a single mbuf, leaving room for a maximum
	 * link header, i.e.
	 *	max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES
	 */
	optlen = 0;
	if (isipv6)
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		hdrlen = sizeof(struct tcpiphdr);
	if (flags & TH_SYN) {
		tp->snd_nxt = tp->iss;
		if (!(tp->t_flags & TF_NOOPT)) {
			u_short mss;

			opt[0] = TCPOPT_MAXSEG;
			opt[1] = TCPOLEN_MAXSEG;
			mss = htons((u_short) tcp_mssopt(tp));
			memcpy(opt + 2, &mss, sizeof mss);
			optlen = TCPOLEN_MAXSEG;

			if ((tp->t_flags & TF_REQ_SCALE) &&
			    (!(flags & TH_ACK) ||
			     (tp->t_flags & TF_RCVD_SCALE))) {
				*((u_int32_t *)(opt + optlen)) = htonl(
					TCPOPT_NOP << 24 |
					TCPOPT_WINDOW << 16 |
					TCPOLEN_WINDOW << 8 |
					tp->request_r_scale);
				optlen += 4;
			}

			if ((tcp_do_sack && !(flags & TH_ACK)) ||
			    tp->t_flags & TF_SACK_PERMITTED) {
				uint32_t *lp = (uint32_t *)(opt + optlen);

				*lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED);
				optlen += TCPOLEN_SACK_PERMITTED_ALIGNED;
			}
		}
	}

	/*
	 * Send a timestamp and echo-reply if this is a SYN and our side
	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
	 * and our peer have sent timestamps in our SYN's.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
	    !(flags & TH_RST) &&
	    (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) {
		u_int32_t *lp = (u_int32_t *)(opt + optlen);

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tp->ts_recent);
		optlen += TCPOLEN_TSTAMP_APPA;
	}
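
	/*
	 * Layout per RFC 1323 appendix A: <NOP, NOP, kind 8, len 10,
	 * value, echo reply>, so TCPOPT_TSTAMP_HDR is the 32-bit constant
	 * 0x0101080a and the whole option takes TCPOLEN_TSTAMP_APPA == 12
	 * bytes, keeping the options field 32-bit aligned.
	 */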

	/* Set receive buffer autosizing timestamp. */
	if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE))
		tp->rfbuf_ts = ticks;

	/*
	 * If this is a SACK connection and we have a block to report,
	 * fill in the SACK blocks in the TCP options.
	 */
	if (report_sack)
		tcp_sack_fill_report(tp, opt, &optlen);

#ifdef TCP_SIGNATURE
	if (tp->t_flags & TF_SIGNATURE) {
		int i;
		u_char *bp;
		/*
		 * Initialize TCP-MD5 option (RFC2385)
		 */
		bp = (u_char *)opt + optlen;
		*bp++ = TCPOPT_SIGNATURE;
		*bp++ = TCPOLEN_SIGNATURE;
		sigoff = optlen + 2;
		for (i = 0; i < TCP_SIGLEN; i++)
			*bp++ = 0;
		optlen += TCPOLEN_SIGNATURE;
		/*
		 * Terminate options list and maintain 32-bit alignment.
		 */
		*bp++ = TCPOPT_NOP;
		*bp++ = TCPOPT_EOL;
		optlen += 2;
	}
#endif /* TCP_SIGNATURE */
	KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options"));
	hdrlen += optlen;

	if (isipv6) {
		ipoptlen = ip6_optlen(inp);
	} else {
		if (inp->inp_options) {
			ipoptlen = inp->inp_options->m_len -
			    offsetof(struct ipoption, ipopt_list);
		} else {
Beispiel #27
/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 */
void
tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
{
  	struct ip save_ip, *ip;
	register struct tcpiphdr *ti;
	caddr_t optp = NULL;
	int optlen = 0;
	int len, tlen, off;
        register struct tcpcb *tp = NULL;
	register int tiflags;
        struct socket *so = NULL;
	int todrop, acked, ourfinisacked, needoutput = 0;
	int iss = 0;
	u_long tiwin;
	int ret;
    struct ex_list *ex_ptr;
    Slirp *slirp;

	DEBUG_CALL("tcp_input");
	DEBUG_ARGS((dfd, " m = %8lx  iphlen = %2d  inso = %lx\n",
		    (long )m, iphlen, (long )inso ));

	/*
	 * If called with m == 0, then we're continuing the connect
	 */
	if (m == NULL) {
		so = inso;
		slirp = so->slirp;

		/* Re-set a few variables */
		tp = sototcpcb(so);
		m = so->so_m;
                so->so_m = NULL;
		ti = so->so_ti;
		tiwin = ti->ti_win;
		tiflags = ti->ti_flags;

		goto cont_conn;
	}
	slirp = m->slirp;

	/*
	 * Get IP and TCP header together in first mbuf.
	 * Note: IP leaves IP header in first mbuf.
	 */
	ti = mtod(m, struct tcpiphdr *);
	if (iphlen > sizeof(struct ip )) {
	  ip_stripoptions(m, (struct mbuf *)0);
	  iphlen=sizeof(struct ip );
	}
	/* XXX Check if too short */


	/*
	 * Save a copy of the IP header in case we want to restore it
	 * for sending an ICMP error message in response.
	 */
	ip=mtod(m, struct ip *);
	save_ip = *ip;
	save_ip.ip_len+= iphlen;

	/*
	 * Checksum extended TCP header and data.
	 */
	tlen = ((struct ip *)ti)->ip_len;
        tcpiphdr2qlink(ti)->next = tcpiphdr2qlink(ti)->prev = NULL;
        memset(&ti->ti_i.ih_mbuf, 0 , sizeof(struct mbuf_ptr));
	ti->ti_x1 = 0;
	ti->ti_len = htons((uint16_t)tlen);
	len = sizeof(struct ip ) + tlen;
	if(cksum(m, len)) {
	  goto drop;
	}

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = ti->ti_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
	  goto drop;
	}
	tlen -= off;
	ti->ti_len = tlen;
	if (off > sizeof (struct tcphdr)) {
	  optlen = off - sizeof (struct tcphdr);
	  optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
	}
	tiflags = ti->ti_flags;

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	NTOHL(ti->ti_seq);
	NTOHL(ti->ti_ack);
	NTOHS(ti->ti_win);
	NTOHS(ti->ti_urp);

	/*
	 * Drop TCP, IP headers and TCP options.
	 */
	m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
	m->m_len  -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);

	/*
	 * Locate pcb for segment.
	 */
findso:
	so = slirp->tcp_last_so;
	if (so->so_fport != ti->ti_dport ||
	    so->so_lport != ti->ti_sport ||
	    so->so_laddr.s_addr != ti->ti_src.s_addr ||
	    so->so_faddr.s_addr != ti->ti_dst.s_addr) {
		so = solookup(&slirp->tcb, ti->ti_src, ti->ti_sport,
			       ti->ti_dst, ti->ti_dport);
		if (so)
			slirp->tcp_last_so = so;
	}

	/*
	 * If the state is CLOSED (i.e., TCB does not exist) then
	 * all data in the incoming segment is discarded.
	 * If the TCB exists but is in CLOSED state, it is embryonic,
	 * but should either do a listen or a connect soon.
	 *
	 * state == CLOSED means we've done socreate() but haven't
	 * attached it to a protocol yet...
	 *
	 * XXX If a TCB does not exist, and the TH_SYN flag is
	 * the only flag set, then create a session, mark it
	 * as if it was LISTENING, and continue...
	 */
        if (so == NULL) {
          if (slirp->restricted) {
            /* Any hostfwds will have an existing socket, so we only get here
             * for non-hostfwd connections. These should be dropped, unless it
             * happens to be a guestfwd.
             */
            for (ex_ptr = slirp->exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) {
                if (ex_ptr->ex_fport == ti->ti_dport &&
                    ti->ti_dst.s_addr == ex_ptr->ex_addr.s_addr) {
                    break;
                }
            }
            if (!ex_ptr) {
                goto dropwithreset;
            }
          }

	  if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
	    goto dropwithreset;

	  if ((so = socreate(slirp)) == NULL)
	    goto dropwithreset;
	  if (tcp_attach(so) < 0) {
	    free(so); /* Not sofree (if it failed, it's not insqued) */
	    goto dropwithreset;
	  }

	  sbreserve(&so->so_snd, TCP_SNDSPACE);
	  sbreserve(&so->so_rcv, TCP_RCVSPACE);

	  so->so_laddr = ti->ti_src;
	  so->so_lport = ti->ti_sport;
	  so->so_faddr = ti->ti_dst;
	  so->so_fport = ti->ti_dport;

	  if ((so->so_iptos = tcp_tos(so)) == 0)
	    so->so_iptos = ((struct ip *)ti)->ip_tos;

	  tp = sototcpcb(so);
	  tp->t_state = TCPS_LISTEN;
	}

        /*
         * If this is a still-connecting socket, this is probably
         * a retransmit of the SYN.  Whether it's a retransmit SYN
	 * or something else, we nuke it.
         */
        if (so->so_state & SS_ISFCONNECTING)
                goto drop;

	tp = sototcpcb(so);

	/* XXX Should never fail */
        if (tp == NULL)
		goto dropwithreset;
	if (tp->t_state == TCPS_CLOSED)
		goto drop;

	tiwin = ti->ti_win;

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 */
	tp->t_idle = 0;
	if (SO_OPTIONS)
	   tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL;
	else
	   tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE;

	/*
	 * Process options if not in LISTEN state,
	 * else do it below (after getting remote address).
	 */
	if (optp && tp->t_state != TCPS_LISTEN)
		tcp_dooptions(tp, (u_char *)optp, optlen, ti);

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 *
	 * XXX Some of these tests are not needed
	 * eg: the tiwin == tp->snd_wnd prevents many more
	 * predictions.. with no *real* advantage..
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    ti->ti_seq == tp->rcv_nxt &&
	    tiwin && tiwin == tp->snd_wnd &&
	    tp->snd_nxt == tp->snd_max) {
		if (ti->ti_len == 0) {
			if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
			    SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
			    tp->snd_cwnd >= tp->snd_wnd) {
				/*
				 * this is a pure ack for outstanding data.
				 */
				if (tp->t_rtt &&
				    SEQ_GT(ti->ti_ack, tp->t_rtseq))
					tcp_xmit_timer(tp, tp->t_rtt);
				acked = ti->ti_ack - tp->snd_una;
				sbdrop(&so->so_snd, acked);
				tp->snd_una = ti->ti_ack;
				m_free(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				if (tp->snd_una == tp->snd_max)
					tp->t_timer[TCPT_REXMT] = 0;
				else if (tp->t_timer[TCPT_PERSIST] == 0)
					tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

				/*
				 * This is called because sowwakeup might have
				 * put data into so_snd.  Since we don't do sowwakeup,
				 * we don't need this.. XXX???
				 */
				if (so->so_snd.sb_cc)
					(void) tcp_output(tp);

				return;
			}
		} else if (ti->ti_ack == tp->snd_una &&
		    tcpfrag_list_empty(tp) &&
		    ti->ti_len <= sbspace(&so->so_rcv)) {
			/*
			 * this is a pure, in-sequence data packet
			 * with nothing on the reassembly queue and
			 * we have enough buffer space to take it.
			 */
			tp->rcv_nxt += ti->ti_len;
			/*
			 * Add data to socket buffer.
			 */
			if (so->so_emu) {
				if (tcp_emu(so,m)) sbappend(so, m);
			} else
				sbappend(so, m);

			/*
			 * If this is a short packet, then ACK now - with Nagle
			 *	congestion avoidance the sender won't send more
			 *	until it gets an ACK.
			 *
			 * It is better to not delay acks at all to maximize
			 * TCP throughput.  See RFC 2581.
			 */
			tp->t_flags |= TF_ACKNOW;
			tcp_output(tp);
			return;
		}
	} /* header prediction */
	/*
	 * Calculate amount of space in receive window,
	 * and then do TCP input processing.
	 * Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	{ int win;
          win = sbspace(&so->so_rcv);
	  if (win < 0)
	    win = 0;
	  tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
	}

	switch (tp->t_state) {

	/*
	 * If the state is LISTEN then ignore segment if it contains an RST.
	 * If the segment contains an ACK then it is bad and send a RST.
	 * If it does not contain a SYN then it is not interesting; drop it.
	 * Don't bother responding if the destination was a broadcast.
	 * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
	 * tp->iss, and send a segment:
	 *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
	 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
	 * Fill in remote peer address fields if not previously specified.
	 * Enter SYN_RECEIVED state, and process any other fields of this
	 * segment in this state.
	 */
	case TCPS_LISTEN: {

	  if (tiflags & TH_RST)
	    goto drop;
	  if (tiflags & TH_ACK)
	    goto dropwithreset;
	  if ((tiflags & TH_SYN) == 0)
	    goto drop;

	  /*
	   * This has way too many gotos...
	   * But a bit of spaghetti code never hurt anybody :)
	   */

	  /*
	   * If this is destined for the control address, then flag to
	   * tcp_ctl once connected, otherwise connect
	   */
	  if ((so->so_faddr.s_addr & slirp->vnetwork_mask.s_addr) ==
	      slirp->vnetwork_addr.s_addr) {
	    if (so->so_faddr.s_addr != slirp->vhost_addr.s_addr &&
		so->so_faddr.s_addr != slirp->vnameserver_addr.s_addr) {
		/* May be an add exec */
		for (ex_ptr = slirp->exec_list; ex_ptr;
		     ex_ptr = ex_ptr->ex_next) {
		  if(ex_ptr->ex_fport == so->so_fport &&
		     so->so_faddr.s_addr == ex_ptr->ex_addr.s_addr) {
		    so->so_state |= SS_CTL;
		    break;
		  }
		}
		if (so->so_state & SS_CTL) {
		    goto cont_input;
		}
	    }
	    /* CTL_ALIAS: Do nothing, tcp_fconnect will be called on it */
	  }

	  if (so->so_emu & EMU_NOCONNECT) {
	    so->so_emu &= ~EMU_NOCONNECT;
	    goto cont_input;
	  }

          if ((tcp_fconnect(so) == -1) &&
#if defined(_WIN32)
              socket_error() != WSAEWOULDBLOCK
#else
              (errno != EINPROGRESS) && (errno != EWOULDBLOCK)
#endif
          ) {
	    u_char code=ICMP_UNREACH_NET;
	    DEBUG_MISC((dfd, " tcp fconnect errno = %d-%s\n",
			errno,strerror(errno)));
	    if(errno == ECONNREFUSED) {
	      /* ACK the SYN, send RST to refuse the connection */
	      tcp_respond(tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
			  TH_RST|TH_ACK);
	    } else {
	      if(errno == EHOSTUNREACH) code=ICMP_UNREACH_HOST;
	      HTONL(ti->ti_seq);             /* restore tcp header */
	      HTONL(ti->ti_ack);
	      HTONS(ti->ti_win);
	      HTONS(ti->ti_urp);
	      m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
	      m->m_len  += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
	      *ip=save_ip;
	      icmp_error(m, ICMP_UNREACH,code, 0,strerror(errno));
	    }
            tcp_close(tp);
	    m_free(m);
	  } else {
	    /*
	     * Haven't connected yet, save the current mbuf
	     * and ti, and return
	     * XXX Some OS's don't tell us whether the connect()
	     * succeeded or not.  So we must time it out.
	     */
	    so->so_m = m;
	    so->so_ti = ti;
	    tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
	    tp->t_state = TCPS_SYN_RECEIVED;
	    tcp_template(tp);
	  }
	  return;

	cont_conn:
	  /* m==NULL
	   * Check if the connect succeeded
	   */
	  if (so->so_state & SS_NOFDREF) {
	    tp = tcp_close(tp);
	    goto dropwithreset;
	  }
	cont_input:
	  tcp_template(tp);

	  if (optp)
	    tcp_dooptions(tp, (u_char *)optp, optlen, ti);

	  if (iss)
	    tp->iss = iss;
	  else
	    tp->iss = slirp->tcp_iss;
	  slirp->tcp_iss += TCP_ISSINCR/2;
	  tp->irs = ti->ti_seq;
	  tcp_sendseqinit(tp);
	  tcp_rcvseqinit(tp);
	  tp->t_flags |= TF_ACKNOW;
	  tp->t_state = TCPS_SYN_RECEIVED;
	  tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
	  goto trimthenstep6;
	} /* case TCPS_LISTEN */

	/*
	 * If the state is SYN_SENT:
	 *	if seg contains an ACK, but not for our SYN, drop the input.
	 *	if seg contains a RST, then drop the connection.
	 *	if seg does not contain SYN, then drop it.
	 * Otherwise this is an acceptable SYN segment
	 *	initialize tp->rcv_nxt and tp->irs
	 *	if seg contains ack then advance tp->snd_una
	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
	 *	arrange for segment to be acked (eventually)
	 *	continue processing rest of data/controls, beginning with URG
	 */
	case TCPS_SYN_SENT:
		if ((tiflags & TH_ACK) &&
		    (SEQ_LEQ(ti->ti_ack, tp->iss) ||
		     SEQ_GT(ti->ti_ack, tp->snd_max)))
			goto dropwithreset;

		if (tiflags & TH_RST) {
                        if (tiflags & TH_ACK) {
                                tcp_drop(tp, 0); /* XXX Check t_softerror! */
                        }
			goto drop;
		}

		if ((tiflags & TH_SYN) == 0)
			goto drop;
		if (tiflags & TH_ACK) {
			tp->snd_una = ti->ti_ack;
			if (SEQ_LT(tp->snd_nxt, tp->snd_una))
				tp->snd_nxt = tp->snd_una;
		}

		tp->t_timer[TCPT_REXMT] = 0;
		tp->irs = ti->ti_seq;
		tcp_rcvseqinit(tp);
		tp->t_flags |= TF_ACKNOW;
		if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
			soisfconnected(so);
			tp->t_state = TCPS_ESTABLISHED;

			(void) tcp_reass(tp, (struct tcpiphdr *)0,
				(struct mbuf *)0);
			/*
			 * if we didn't have to retransmit the SYN,
			 * use its rtt as our initial srtt & rtt var.
			 */
			if (tp->t_rtt)
				tcp_xmit_timer(tp, tp->t_rtt);
		} else
			tp->t_state = TCPS_SYN_RECEIVED;

trimthenstep6:
		/*
		 * Advance ti->ti_seq to correspond to first data byte.
		 * If data, trim to stay within window,
		 * dropping FIN if necessary.
		 */
		ti->ti_seq++;
		if (ti->ti_len > tp->rcv_wnd) {
			todrop = ti->ti_len - tp->rcv_wnd;
			m_adj(m, -todrop);
			ti->ti_len = tp->rcv_wnd;
			tiflags &= ~TH_FIN;
		}
		tp->snd_wl1 = ti->ti_seq - 1;
		tp->rcv_up = ti->ti_seq;
		goto step6;
	} /* switch tp->t_state */
	/*
	 * States other than LISTEN or SYN_SENT.
	 * Check that at least some bytes of segment are within
	 * receive window.  If segment begins before rcv_nxt,
	 * drop leading data (and SYN); if nothing left, just ack.
	 */
	todrop = tp->rcv_nxt - ti->ti_seq;
	if (todrop > 0) {
		if (tiflags & TH_SYN) {
			tiflags &= ~TH_SYN;
			ti->ti_seq++;
			if (ti->ti_urp > 1)
				ti->ti_urp--;
			else
				tiflags &= ~TH_URG;
			todrop--;
		}
		/*
		 * Following if statement from Stevens, vol. 2, p. 960.
		 */
		if (todrop > ti->ti_len
		    || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the window.
			 * At this point the FIN must be a duplicate or out
			 * of sequence; drop it.
			 */
			tiflags &= ~TH_FIN;

			/*
			 * Send an ACK to resynchronize and drop any data.
			 * But keep on processing for RST or ACK.
			 */
			tp->t_flags |= TF_ACKNOW;
			todrop = ti->ti_len;
		}
		m_adj(m, todrop);
		ti->ti_seq += todrop;
		ti->ti_len -= todrop;
		if (ti->ti_urp > todrop)
			ti->ti_urp -= todrop;
		else {
			tiflags &= ~TH_URG;
			ti->ti_urp = 0;
		}
	}
	/*
	 * If new data are received on a connection after the
	 * user processes are gone, then RST the other end.
	 */
	if ((so->so_state & SS_NOFDREF) &&
	    tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
		tp = tcp_close(tp);
		goto dropwithreset;
	}

	/*
	 * If segment ends after window, drop trailing data
	 * (and PUSH and FIN); if nothing left, just ACK.
	 */
	todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
	if (todrop > 0) {
		if (todrop >= ti->ti_len) {
			/*
			 * If a new connection request is received
			 * while in TIME_WAIT, drop the old connection
			 * and start over if the sequence numbers
			 * are above the previous ones.
			 */
			if (tiflags & TH_SYN &&
			    tp->t_state == TCPS_TIME_WAIT &&
			    SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
				iss = tp->rcv_nxt + TCP_ISSINCR;
				tp = tcp_close(tp);
				goto findso;
			}
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments.  Continue processing, but
			 * remember to ack.  Otherwise, drop segment
			 * and ack.
			 */
			if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
				tp->t_flags |= TF_ACKNOW;
			} else {
				goto dropafterack;
			}
		}
		m_adj(m, -todrop);
		ti->ti_len -= todrop;
		tiflags &= ~(TH_PUSH|TH_FIN);
	}

	/*
	 * If the RST bit is set examine the state:
	 *    SYN_RECEIVED STATE:
	 *	If passive open, return to LISTEN state.
	 *	If active open, inform user that connection was refused.
	 *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
	 *	Inform user that connection was reset, and close tcb.
	 *    CLOSING, LAST_ACK, TIME_WAIT STATES
	 *	Close the tcb.
	 */
	if (tiflags&TH_RST) switch (tp->t_state) {

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_CLOSED;
                tcp_close(tp);
		goto drop;

	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:
                tcp_close(tp);
		goto drop;
	}

	/*
	 * If a SYN is in the window, then this is an
	 * error and we send an RST and drop the connection.
	 */
	if (tiflags & TH_SYN) {
		tp = tcp_drop(tp,0);
		goto dropwithreset;
	}

	/*
	 * If the ACK bit is off we drop the segment and return.
	 */
	if ((tiflags & TH_ACK) == 0) goto drop;

	/*
	 * Ack processing.
	 */
	switch (tp->t_state) {
	/*
	 * In SYN_RECEIVED state if the ack ACKs our SYN then enter
	 * ESTABLISHED state and continue processing, otherwise
	 * send an RST.  una<=ack<=max
	 */
	case TCPS_SYN_RECEIVED:

		if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
		    SEQ_GT(ti->ti_ack, tp->snd_max))
			goto dropwithreset;
		tp->t_state = TCPS_ESTABLISHED;
		/*
		 * The sent SYN is ack'ed with our sequence number + 1.
		 * The first data byte already in the buffer would get
		 * lost if no correction were made.  This is only needed
		 * for SS_CTL, since the buffer is empty otherwise.
		 * Either tp->snd_una++; or the assignment below:
		 */
		tp->snd_una=ti->ti_ack;
		if (so->so_state & SS_CTL) {
		  /* So tcp_ctl reports the right state */
		  ret = tcp_ctl(so);
		  if (ret == 1) {
		    soisfconnected(so);
		    so->so_state &= ~SS_CTL;   /* success XXX */
		  } else if (ret == 2) {
		    so->so_state &= SS_PERSISTENT_MASK;
		    so->so_state |= SS_NOFDREF; /* CTL_CMD */
		  } else {
		    needoutput = 1;
		    tp->t_state = TCPS_FIN_WAIT_1;
		  }
		} else {
		  soisfconnected(so);
		}

		(void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0);
		tp->snd_wl1 = ti->ti_seq - 1;
		/* Avoid ack processing; snd_una==ti_ack  =>  dup ack */
		goto synrx_to_est;
		/* fall into ... */

	/*
	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
	 * ACKs.  If the ack is in the range
	 *	tp->snd_una < ti->ti_ack <= tp->snd_max
	 * then advance tp->snd_una to ti->ti_ack and drop
	 * data from the retransmission queue.  If this ACK reflects more
	 * up-to-date window information, we update our window information.
	 */
	case TCPS_ESTABLISHED:
	case TCPS_FIN_WAIT_1:
	case TCPS_FIN_WAIT_2:
	case TCPS_CLOSE_WAIT:
	case TCPS_CLOSING:
	case TCPS_LAST_ACK:
	case TCPS_TIME_WAIT:

		if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
			if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
			  DEBUG_MISC((dfd, " dup ack  m = %lx  so = %lx\n",
				      (long )m, (long )so));
				/*
				 * If we have outstanding data (other than
				 * a window probe), this is a completely
				 * duplicate ack (ie, window info didn't
				 * change), the ack is the biggest we've
				 * seen and we've seen exactly our rexmt
				 * threshold of them, assume a packet
				 * has been dropped and retransmit it.
				 * Kludge snd_nxt & the congestion
				 * window so we send only this one
				 * packet.
				 *
				 * We know we're losing at the current
				 * window size so do congestion avoidance
				 * (set ssthresh to half the current window
				 * and pull our congestion window back to
				 * the new ssthresh).
				 *
				 * Dup acks mean that packets have left the
				 * network (they're now cached at the receiver)
				 * so bump cwnd by the amount in the receiver
				 * to keep a constant cwnd packets in the
				 * network.
				 */
				if (tp->t_timer[TCPT_REXMT] == 0 ||
				    ti->ti_ack != tp->snd_una)
					tp->t_dupacks = 0;
				else if (++tp->t_dupacks == TCPREXMTTHRESH) {
					tcp_seq onxt = tp->snd_nxt;
					u_int win =
					    min(tp->snd_wnd, tp->snd_cwnd) / 2 /
						tp->t_maxseg;

					if (win < 2)
						win = 2;
					tp->snd_ssthresh = win * tp->t_maxseg;
					tp->t_timer[TCPT_REXMT] = 0;
					tp->t_rtt = 0;
					tp->snd_nxt = ti->ti_ack;
					tp->snd_cwnd = tp->t_maxseg;
					(void) tcp_output(tp);
					tp->snd_cwnd = tp->snd_ssthresh +
					       tp->t_maxseg * tp->t_dupacks;
					if (SEQ_GT(onxt, tp->snd_nxt))
						tp->snd_nxt = onxt;
					goto drop;
				} else if (tp->t_dupacks > TCPREXMTTHRESH) {
					tp->snd_cwnd += tp->t_maxseg;
					(void) tcp_output(tp);
					goto drop;
				}
			} else
				tp->t_dupacks = 0;
			break;
		}
	synrx_to_est:
		/*
		 * If the congestion window was inflated to account
		 * for the other side's cached packets, retract it.
		 */
		if (tp->t_dupacks > TCPREXMTTHRESH &&
		    tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_dupacks = 0;
		if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
			goto dropafterack;
		}
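		/* Number of bytes newly acknowledged by this ACK. */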
		acked = ti->ti_ack - tp->snd_una;

		/*
		 * If transmit timer is running and timed sequence
		 * number was acked, update smoothed round trip time.
		 * Since we now have an rtt measurement, cancel the
		 * timer backoff (cf., Phil Karn's retransmit alg.).
		 * Recompute the initial retransmit timer.
		 */
		if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
			tcp_xmit_timer(tp,tp->t_rtt);

		/*
		 * If all outstanding data is acked, stop retransmit
		 * timer and remember to restart (more output or persist).
		 * If there is more data to be acked, restart retransmit
		 * timer, using current (possibly backed-off) value.
		 */
		if (ti->ti_ack == tp->snd_max) {
			tp->t_timer[TCPT_REXMT] = 0;
			needoutput = 1;
		} else if (tp->t_timer[TCPT_PERSIST] == 0)
			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
		/*
		 * When new data is acked, open the congestion window.
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (maxseg per packet).
		 * Otherwise open linearly: maxseg per window
		 * (maxseg^2 / cwnd per packet).
		 */
		{
		  register u_int cw = tp->snd_cwnd;
		  register u_int incr = tp->t_maxseg;

		  if (cw > tp->snd_ssthresh)
		    incr = incr * incr / cw;
		  tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
		}
		if (acked > so->so_snd.sb_cc) {
			tp->snd_wnd -= so->so_snd.sb_cc;
			sbdrop(&so->so_snd, (int )so->so_snd.sb_cc);
			ourfinisacked = 1;
		} else {
			sbdrop(&so->so_snd, acked);
			tp->snd_wnd -= acked;
			ourfinisacked = 0;
		}
		tp->snd_una = ti->ti_ack;
		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
			tp->snd_nxt = tp->snd_una;

		switch (tp->t_state) {

		/*
		 * In FIN_WAIT_1 STATE in addition to the processing
		 * for the ESTABLISHED state if our FIN is now acknowledged
		 * then enter FIN_WAIT_2.
		 */
		case TCPS_FIN_WAIT_1:
			if (ourfinisacked) {
				/*
				 * If we can't receive any more
				 * data, then closing user can proceed.
				 * Starting the timer is contrary to the
				 * specification, but if we don't get a FIN
				 * we'll hang forever.
				 */
				if (so->so_state & SS_FCANTRCVMORE) {
					tp->t_timer[TCPT_2MSL] = TCP_MAXIDLE;
				}
				tp->t_state = TCPS_FIN_WAIT_2;
			}
			break;

	 	/*
		 * In CLOSING STATE in addition to the processing for
		 * the ESTABLISHED state if the ACK acknowledges our FIN
		 * then enter the TIME-WAIT state, otherwise ignore
		 * the segment.
		 */
		case TCPS_CLOSING:
			if (ourfinisacked) {
				tp->t_state = TCPS_TIME_WAIT;
				tcp_canceltimers(tp);
				tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			}
			break;

		/*
		 * In LAST_ACK, we may still be waiting for data to drain
		 * and/or to be acked, as well as for the ack of our FIN.
		 * If our FIN is now acknowledged, delete the TCB,
		 * enter the closed state and return.
		 */
		case TCPS_LAST_ACK:
			if (ourfinisacked) {
                                tcp_close(tp);
				goto drop;
			}
			break;

		/*
		 * In TIME_WAIT state the only thing that should arrive
		 * is a retransmission of the remote FIN.  Acknowledge
		 * it and restart the finack timer.
		 */
		case TCPS_TIME_WAIT:
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			goto dropafterack;
		}
	} /* switch(tp->t_state) */

step6:
	/*
	 * Update window information.
	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
	 */
	if ((tiflags & TH_ACK) &&
	    (SEQ_LT(tp->snd_wl1, ti->ti_seq) ||
	    (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) ||
	    (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) {
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = ti->ti_seq;
		tp->snd_wl2 = ti->ti_ack;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
		needoutput = 1;
	}

	/*
	 * Process segments with URG.
	 */
	if ((tiflags & TH_URG) && ti->ti_urp &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		/*
		 * This is a kludge, but if we receive and accept
		 * random urgent pointers, we'll crash in
		 * soreceive.  It's hard to imagine someone
		 * actually wanting to send this much urgent data.
		 */
		if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen) {
			ti->ti_urp = 0;
			tiflags &= ~TH_URG;
			goto dodata;
		}
		/*
		 * If this segment advances the known urgent pointer,
		 * then mark the data stream.  This should not happen
		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
		 * a FIN has been received from the remote side.
		 * In these states we ignore the URG.
		 *
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section as the original
		 * spec states (in one of two places).
		 */
		if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
			tp->rcv_up = ti->ti_seq + ti->ti_urp;
			so->so_urgc =  so->so_rcv.sb_cc +
				(tp->rcv_up - tp->rcv_nxt); /* -1; */
			tp->rcv_up = ti->ti_seq + ti->ti_urp;

		}
	} else
		/*
		 * If no out of band data is expected,
		 * pull receive urgent pointer along
		 * with the receive window.
		 */
		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
			tp->rcv_up = tp->rcv_nxt;
dodata:

	/*
	 * If this is a small packet, then ACK now - with Nagle
	 *      congestion avoidance the sender won't send more
	 *      until it gets an ACK.
	 */
	if (ti->ti_len && (unsigned)ti->ti_len <= 5 &&
	    ((struct tcpiphdr_2 *)ti)->first_char == (char)27) {
		tp->t_flags |= TF_ACKNOW;
	}

	/*
	 * Process the segment text, merging it into the TCP sequencing queue,
	 * and arranging for acknowledgment of receipt if necessary.
	 * This process logically involves adjusting tp->rcv_wnd as data
	 * is presented to the user (this happens in tcp_usrreq.c,
	 * case PRU_RCVD).  If a FIN has already been received on this
	 * connection then we just ignore the text.
	 */
	if ((ti->ti_len || (tiflags&TH_FIN)) &&
	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
		TCP_REASS(tp, ti, m, so, tiflags);
	} else {
		m_free(m);
		tiflags &= ~TH_FIN;
	}

	/*
	 * If FIN is received ACK the FIN and let the user know
	 * that the connection is closing.
	 */
	if (tiflags & TH_FIN) {
		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
			/*
			 * If we receive a FIN we can't send any more data,
			 * so mark the socket SS_FDRAIN.
			 * Shut down the socket if there is no rx data in
			 * the buffer.
			 * soread() is called on completion of shutdown()
			 * and will go to TCPS_LAST_ACK, using tcp_output()
			 * to send the FIN.
			 */
			sofwdrain(so);

			tp->t_flags |= TF_ACKNOW;
			tp->rcv_nxt++;
		}
		switch (tp->t_state) {

	 	/*
		 * In SYN_RECEIVED and ESTABLISHED STATES
		 * enter the CLOSE_WAIT state.
		 */
		case TCPS_SYN_RECEIVED:
		case TCPS_ESTABLISHED:
		  if(so->so_emu == EMU_CTL)        /* no shutdown on socket */
		    tp->t_state = TCPS_LAST_ACK;
		  else
		    tp->t_state = TCPS_CLOSE_WAIT;
		  break;

	 	/*
		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
		 * enter the CLOSING state.
		 */
		case TCPS_FIN_WAIT_1:
			tp->t_state = TCPS_CLOSING;
			break;

	 	/*
		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
		 * starting the time-wait timer, turning off the other
		 * standard timers.
		 */
		case TCPS_FIN_WAIT_2:
			tp->t_state = TCPS_TIME_WAIT;
			tcp_canceltimers(tp);
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			break;

		/*
		 * In TIME_WAIT state restart the 2 MSL time_wait timer.
		 */
		case TCPS_TIME_WAIT:
			tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
			break;
		}
	}

	/*
	 * Return any desired output.
	 */
	if (needoutput || (tp->t_flags & TF_ACKNOW)) {
		(void) tcp_output(tp);
	}
	return;

dropafterack:
	/*
	 * Generate an ACK dropping incoming segment if it occupies
	 * sequence space, where the ACK reflects our state.
	 */
	if (tiflags & TH_RST)
		goto drop;
	m_free(m);
	tp->t_flags |= TF_ACKNOW;
	(void) tcp_output(tp);
	return;

dropwithreset:
	/* reuses m if m!=NULL, m_free() unnecessary */
	if (tiflags & TH_ACK)
		tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
	else {
		if (tiflags & TH_SYN) ti->ti_len++;
		tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
		    TH_RST|TH_ACK);
	}

	return;

drop:
	/*
	 * Drop space held by incoming segment and return.
	 */
	m_free(m);
}
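
/*
 * Editor's note: every listing on this page leans on the BSD-style
 * SEQ_LT/SEQ_LEQ/SEQ_GT/SEQ_GEQ macros, which compare 32-bit sequence
 * numbers modulo 2^32 so that values that wrapped past zero still
 * compare as "later".  The sketch below is illustrative only and not
 * part of any of the original files; the macro bodies follow the
 * classic BSD definitions.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

/* The subtraction is interpreted as signed 32-bit, which makes the
 * comparison correct across a single wrap of the sequence space. */
#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

int main(void)
{
	tcp_seq near_wrap = 0xfffffff0u;	/* just below 2^32 */
	tcp_seq wrapped = near_wrap + 0x40;	/* wraps around to 0x30 */

	/* A plain unsigned compare gets the wrapped case wrong... */
	printf("unsigned: %d\n", wrapped > near_wrap);		/* 0 */
	/* ...while the modular compare sees the wrapped value as later. */
	printf("SEQ_GT:   %d\n", SEQ_GT(wrapped, near_wrap));	/* 1 */
	return 0;
}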
Example #28
static int
tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash, 
			int *flow_id )
{
	struct lro_flow *flow;
	tcp_seq seqnum;
	unsigned int off = 0;
	int payload_len = 0;

	*hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, 
		tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1));

	*flow_id = lro_flow_map[*hash];
	if (*flow_id == TCP_LRO_FLOW_NOTFOUND) {
		return TCP_LRO_NAN;
	}

	seqnum = tcp_hdr->th_seq;
	off = tcp_hdr->th_off << 2;
	payload_len = ip_hdr->ip_len - off;

	flow = &lro_flow_list[*flow_id];

	if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) &&
			(flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) &&
			(flow->lr_fport == tcp_hdr->th_sport) &&
			(flow->lr_lport == tcp_hdr->th_dport)) {
		if (flow->lr_tcphdr == NULL) {
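			/*
			 * The flow slot exists but holds no packet yet:
			 * start coalescing only if this segment begins
			 * exactly at the expected sequence edge (lr_seq).
			 */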
			if (ntohl(seqnum) == flow->lr_seq) {
				return TCP_LRO_COALESCE;
			}
			if (lrodebug >= 4) {
				printf("%s: seqnum = %x, lr_seq = %x\n",
					__func__, ntohl(seqnum), flow->lr_seq);
			}
			lro_seq_mismatch++;
			if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) {
				lro_seq_outoforder++;
				/*
				 * Out-of-order packets signal loss and
				 * recovery, and LRO doesn't let flows
				 * recover quickly, so eject the flow.
				 */
				flow->lr_flags |= LRO_EJECT_REQ;
			}
			return TCP_LRO_NAN;
		}

		if (flow->lr_flags & LRO_EJECT_REQ) {
			if (lrodebug)
				printf("%s: eject. \n", __func__);
			return TCP_LRO_EJECT_FLOW;
		}
		if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) { 
			if (lrodebug) {
				printf("%s: th_ack = %x flow_ack = %x \n", 
					__func__, tcp_hdr->th_ack, 
					flow->lr_tcphdr->th_ack);
			}
			return TCP_LRO_EJECT_FLOW;
		}

		if (ntohl(seqnum) ==
		    (ntohl(flow->lr_tcphdr->th_seq) + flow->lr_len)) {
			return TCP_LRO_COALESCE;
		} else {
			/* LRO does not handle loss recovery well, eject */
			flow->lr_flags |= LRO_EJECT_REQ;
			return TCP_LRO_EJECT_FLOW;
		}
	}
	if (lrodebug) printf("tcp_lro_matching_tuple: collision \n");
	return TCP_LRO_COLLISION;
}
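
/*
 * Editor's sketch (not part of the original file): the two decisive
 * tests above reduce to simple sequence arithmetic -- "does the new
 * segment start exactly where the held data ends?" decides coalescing,
 * and anything past that edge signals a hole.  All values below are
 * made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

int main(void)
{
	uint32_t lr_seq = 1000;		/* seq of the packet held by LRO */
	uint32_t lr_len = 2896;		/* payload bytes accumulated so far */
	uint32_t seg_seq;

	seg_seq = 3896;			/* == lr_seq + lr_len: in order */
	printf("coalesce: %d\n", seg_seq == lr_seq + lr_len);		/* 1 */

	seg_seq = 6792;			/* beyond the edge: a hole */
	printf("eject:    %d\n", SEQ_GT(seg_seq, lr_seq + lr_len));	/* 1 */
	return 0;
}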
Example #29
/*
 * Ertt_packet_measurements uses a small amount of state kept for each packet
 * sent to match incoming acknowledgements. This enables more accurate and
 * secure round trip time measurements. The resulting measurement is used by
 * congestion control algorithms that require a more accurate round trip time.
 * Ertt_packet_measurements is called via the helper hook in tcp_input.c.
 */
static int
ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
    void *ctx_data, void *hdata, struct osd *hosd)
{
	struct ertt *e_t;
	struct tcpcb *tp;
	struct tcphdr *th;
	struct tcpopt *to;
	struct tcp_hhook_data *thdp;
	struct txseginfo *txsi;
	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
	uint32_t measurenext, rts;
	tcp_seq ack;

	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));

	e_t = (struct ertt *)hdata;
	thdp = ctx_data;
	tp = thdp->tp;
	th = thdp->th;
	to = thdp->to;
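	/* Nonzero when the incoming segment reported newly SACKed data. */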
	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
	acked = th->th_ack - tp->snd_una;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* Packet has provided new acknowledgements. */
	if (acked > 0 || new_sacked_bytes) {
		if (acked == 0 && new_sacked_bytes) {
			/* Use last sacked data. */
			ack = tp->sackhint.last_sack_ack;
		} else
			ack = th->th_ack;

		txsi = TAILQ_FIRST(&e_t->txsegi_q);
		while (txsi != NULL) {
			rts = 0;

			/* Acknowledgement is acking more than this txsi. */
			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
				if (txsi->flags & TXSI_RTT_MEASURE_START ||
				    measurenext) {
					marked_packet_rtt(txsi, e_t, tp,
					    &measurenext, &measurenext_len,
					    &rtt_bytes_adjust, MULTI_ACK);
				}
				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
				uma_zfree(txseginfo_zone, txsi);
				txsi = TAILQ_FIRST(&e_t->txsegi_q);
				continue;
			}

			/*
			 * Guess if delayed acks are being used by the receiver.
			 *
			 * XXXDH: A simple heuristic that could be improved
			 */
			if (!new_sacked_bytes) {
				if (acked > tp->t_maxseg) {
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
					multiack = 1;
				} else if (acked > txsi->len) {
					multiack = 1;
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
				} else if (acked == tp->t_maxseg ||
					   acked == txsi->len) {
					e_t->dlyack_rx -=
					    (e_t->dlyack_rx > 0) ? 1 : 0;
				}
				/* Otherwise leave dlyack_rx the way it was. */
			}

			/*
			 * Time stamps are only to help match the txsi with the
			 * received acknowledgements.
			 */
			if (e_t->timestamp_errors < MAX_TS_ERR &&
			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
				/*
				 * Note: All packets sent with the offload will
				 * have the same time stamp. If we are sending
				 * on a fast interface and the t_maxseg is much
				 * smaller than one tick, this will be fine. The
				 * time stamp would be the same whether we were
				 * using tso or not. However, if the interface
				 * is slow, this will cause problems with the
				 * calculations. If the interface is slow,
				 * there is no reason to be using TSO, and it
				 * should be turned off.
				 */
				/*
				 * If there are too many time stamp errors,
				 * time stamps won't be trusted.
				 */
				rts = to->to_tsecr;
				/* Before this packet. */
				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
					/* When delayed acking is used, the
					 * reflected time stamp is of the first
					 * packet and thus may be before
					 * txsi->tx_ts.
					 */
					break;
				if (TSTMP_GT(rts, txsi->tx_ts)) {
					/*
					 * If reflected time stamp is later than
					 * tx_tsi, then this txsi is old.
					 */
					if (txsi->flags & TXSI_RTT_MEASURE_START
					    || measurenext) {
						marked_packet_rtt(txsi, e_t, tp,
						    &measurenext, &measurenext_len,
						    &rtt_bytes_adjust, OLD_TXSI);
					}
					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
					    txsegi_lnk);
					uma_zfree(txseginfo_zone, txsi);
					txsi = TAILQ_FIRST(&e_t->txsegi_q);
					continue;
				}
				if (rts == txsi->tx_ts &&
				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
					/*
					 * Segment received before sent!
					 * Something is wrong with the received
					 * timestamps so increment errors. If
					 * this keeps up we will ignore
					 * timestamps.
					 */
					e_t->timestamp_errors++;
				}
			}
			/*
			 * Acknowledging a sequence number before this txsi.
			 * If it is an old txsi that may have had the same seq
			 * numbers, it should have been removed if time stamps
			 * are being used.
			 */
			if (SEQ_LEQ(ack, txsi->seq))
				break; /* Before first packet in txsi. */

			/*
			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
			 * past this point.
			 *
			 * If delayed acks are being used, an acknowledgement
			 * for a single segment will have been delayed by the
			 * receiver and will yield an inaccurate measurement. In
			 * this case, we only make the measurement if more than
			 * one segment is being acknowledged or sack is
			 * currently being used.
			 */
			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
				/* Make an accurate new measurement. */
				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;

				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
					e_t->minrtt = e_t->rtt;

				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
					e_t->maxrtt = e_t->rtt;
			}

			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
				marked_packet_rtt(txsi, e_t, tp,
				    &measurenext, &measurenext_len,
				    &rtt_bytes_adjust, CORRECT_ACK);

			if (txsi->flags & TXSI_TSO) {
				if (txsi->len > acked) {
					txsi->len -= acked;
					/*
					 * This presumes the ack is for the
					 * first bytes in txsi; that may not be
					 * true, but it shouldn't cause
					 * problems for the timing.
					 *
					 * We remeasure RTT even though we only
					 * have a single txsi. The rationale
					 * behind this is that it is better to
					 * have a slightly inaccurate
					 * measurement than no additional
					 * measurement for the rest of the bulk
					 * transfer. Since TSO is only used on
					 * high-speed interface cards, the
					 * packets should be transmitted at
					 * line rate back to back with little
					 * difference in transmission times
					 * (in ticks).
					 */
					txsi->seq += acked;
					/*
					 * Reset txsi measure flag so we don't
					 * use it for another RTT measurement.
					 */
					txsi->flags &= ~TXSI_RTT_MEASURE_START;
					/*
					 * There is still more data to be acked
					 * from tso bulk transmission, so we
					 * won't remove it from the TAILQ yet.
					 */
					break;
				}
				txsi->len = 0;
			}

			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
			uma_zfree(txseginfo_zone, txsi);
			break;
		}

		if (measurenext) {
			/*
			 * We need to do a RTT measurement. It won't be the best
			 * if we do it here.
			 */
			marked_packet_rtt(txsi, e_t, tp,
			    &measurenext, &measurenext_len,
			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
		}
	}

	return (0);
}
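
/*
 * Editor's sketch, not part of h_ertt.c: the core measurement above is
 * "ticks now minus ticks when the matched segment was sent, plus one",
 * followed by min/max tracking.  The tick values below are invented
 * purely to exercise the arithmetic.
 */
#include <stdint.h>
#include <stdio.h>

struct ertt_demo {
	uint32_t rtt, minrtt, maxrtt;
};

static void
demo_measure(struct ertt_demo *e, uint32_t now_ticks, uint32_t tx_ts)
{
	e->rtt = now_ticks - tx_ts + 1;	/* +1 keeps rtt from being 0 */
	if (e->rtt < e->minrtt || e->minrtt == 0)
		e->minrtt = e->rtt;
	if (e->rtt > e->maxrtt || e->maxrtt == 0)
		e->maxrtt = e->rtt;
}

int main(void)
{
	struct ertt_demo e = { 0, 0, 0 };

	demo_measure(&e, 1042, 1000);	/* 43-tick sample */
	demo_measure(&e, 1100, 1085);	/* 16-tick sample */
	printf("rtt=%u min=%u max=%u\n", e.rtt, e.minrtt, e.maxrtt);
	return 0;	/* prints rtt=16 min=16 max=43 */
}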
Example #30
/*
 * This function is called upon receipt of new valid data (while not in header
 * prediction mode), and it updates the ordered list of sacks.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
	/*
	 * First reported block MUST be the most recent one.  Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver.  These two conditions make the implementation fully
	 * compliant with RFC 2018.
	 */
	struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
	int num_head, num_saved, i;

	/* SACK block for the received segment. */
	head_blk.start = rcv_start;
	head_blk.end = rcv_end;

	/*
	 * Merge updated SACK blocks into head_blk, and
	 * save unchanged SACK blocks into saved_blks[].
	 * num_saved will have the number of the saved SACK blocks.
	 */
	num_saved = 0;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		tcp_seq start = tp->sackblks[i].start;
		tcp_seq end = tp->sackblks[i].end;
		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
			/*
			 * Discard this SACK block.
			 */
		} else if (SEQ_LEQ(head_blk.start, end) &&
			   SEQ_GEQ(head_blk.end, start)) {
			/*
			 * Merge this SACK block into head_blk.
			 * This SACK block itself will be discarded.
			 */
			if (SEQ_GT(head_blk.start, start))
				head_blk.start = start;
			if (SEQ_LT(head_blk.end, end))
				head_blk.end = end;
		} else {
			/*
			 * Save this SACK block.
			 */
			saved_blks[num_saved].start = start;
			saved_blks[num_saved].end = end;
			num_saved++;
		}
	}

	/*
	 * Update SACK list in tp->sackblks[].
	 */
	num_head = 0;
	if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
		/*
		 * The received data segment is an out-of-order segment.
		 * Put head_blk at the top of SACK list.
		 */
		tp->sackblks[0] = head_blk;
		num_head = 1;
		/*
		 * If the number of saved SACK blocks exceeds its limit,
		 * discard the last SACK block.
		 */
		if (num_saved >= MAX_SACK_BLKS)
			num_saved--;
	}
	if (num_saved > 0) {
		/*
		 * Copy the saved SACK blocks back.
		 */
		bcopy(saved_blks, &tp->sackblks[num_head],
		      sizeof(struct sackblk) * num_saved);
	}

	/* Save the number of SACK blocks. */
	tp->rcv_numsacks = num_head + num_saved;

	/* If we are requesting SACK recovery, reset the stretch-ack state
	 * so that the connection will generate more ACKs after recovery
	 * and the sender's cwnd will open.
	 */
	if ((tp->t_flags & TF_STRETCHACK) != 0 && tp->rcv_numsacks > 0)
		tcp_reset_stretch_ack(tp);

#if TRAFFIC_MGT
	if (tp->acc_iaj > 0 && tp->rcv_numsacks > 0) 
		reset_acc_iaj(tp);
#endif /* TRAFFIC_MGT */
}
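
/*
 * Editor's sketch (not from the original file): the merge test above,
 * SEQ_LEQ(head.start, end) && SEQ_GEQ(head.end, start), is the usual
 * "two ranges touch or overlap" predicate over modular sequence space.
 * The block values below are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)

struct blk {
	uint32_t start, end;
};

/* Widen head to cover b when the two blocks touch or overlap;
 * return 1 if they were merged, 0 if they are disjoint. */
static int
merge(struct blk *head, const struct blk *b)
{
	if (SEQ_LEQ(head->start, b->end) && SEQ_GEQ(head->end, b->start)) {
		if (SEQ_GT(head->start, b->start))
			head->start = b->start;
		if (SEQ_LT(head->end, b->end))
			head->end = b->end;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct blk head = { 500, 700 };	/* block for the new segment */
	struct blk old = { 650, 900 };	/* previously reported block */

	if (merge(&head, &old))
		printf("merged: [%u, %u)\n", head.start, head.end);
	return 0;	/* prints merged: [500, 900) */
}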