예제 #1
0
/*
 * ertt_packet_measurement_hook() uses a small amount of state kept on each
 * packet sent (the struct txseginfo entries queued on e_t->txsegi_q) to match
 * incoming acknowledgements. This enables more accurate and secure round trip
 * time measurements. The resulting measurement is used for congestion control
 * algorithms which require a more accurate time.
 * It is called via the helper hook in tcp_input.c.
 *
 * Arguments as used by this function:
 *   ctx_data	struct tcp_hhook_data for the incoming segment; supplies the
 *		tcpcb (tp), the TCP header (th) and the parsed TCP options
 *		(to). Must not be NULL.
 *   hdata	The struct ertt state that is read and updated. Must not be
 *		NULL.
 *   hhook_type, hhook_id, udata, hosd
 *		Not referenced in this function.
 *
 * The inpcb write lock of tp is asserted (INP_WLOCK_ASSERT).
 *
 * Side effects: may update e_t->rtt, e_t->minrtt, e_t->maxrtt,
 * e_t->dlyack_rx and e_t->timestamp_errors, and removes and frees txseginfo
 * entries from e_t->txsegi_q once they have been matched or found to be stale.
 * Whatever marked_packet_rtt() updates is not visible from this function.
 *
 * Always returns 0.
 */
static int
ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
    void *ctx_data, void *hdata, struct osd *hosd)
{
	struct ertt *e_t;
	struct tcpcb *tp;
	struct tcphdr *th;
	struct tcpopt *to;
	struct tcp_hhook_data *thdp;
	struct txseginfo *txsi;
	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
	uint32_t measurenext, rts;
	tcp_seq ack;

	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));

	e_t = (struct ertt *)hdata;
	thdp = ctx_data;
	tp = thdp->tp;
	th = thdp->th;
	to = thdp->to;
	/* Treated as a boolean: non-zero when a last SACKed ack is recorded. */
	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
	/* Amount by which this segment's ack is ahead of snd_una. */
	acked = th->th_ack - tp->snd_una;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* Packet has provided new acknowledgements. */
	if (acked > 0 || new_sacked_bytes) {
		if (acked == 0 && new_sacked_bytes) {
			/* Use last sacked data. */
			ack = tp->sackhint.last_sack_ack;
		} else
			ack = th->th_ack;

		/*
		 * Walk the queue from the head. Entries that the ack has
		 * passed, or that the echoed timestamp shows to be old, are
		 * freed and the walk restarts from the new head ("continue");
		 * every other path leaves the loop ("break").
		 */
		txsi = TAILQ_FIRST(&e_t->txsegi_q);
		while (txsi != NULL) {
			rts = 0;

			/* Acknowledgement is acking more than this txsi. */
			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
				if (txsi->flags & TXSI_RTT_MEASURE_START ||
				    measurenext) {
					marked_packet_rtt(txsi, e_t, tp,
					    &measurenext, &measurenext_len,
					    &rtt_bytes_adjust, MULTI_ACK);
				}
				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
				uma_zfree(txseginfo_zone, txsi);
				txsi = TAILQ_FIRST(&e_t->txsegi_q);
				continue;
			}

			/*
			 * Guess if delayed acks are being used by the receiver.
			 *
			 * XXXDH: A simple heuristic that could be improved
			 *
			 * The first two branches have the same effect: bump
			 * dlyack_rx (capped at DLYACK_SMOOTH) and flag a
			 * multi-segment ack. The third branch decrements
			 * dlyack_rx, never below zero.
			 */
			if (!new_sacked_bytes) {
				if (acked > tp->t_maxseg) {
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
					multiack = 1;
				} else if (acked > txsi->len) {
					multiack = 1;
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
				} else if (acked == tp->t_maxseg ||
					   acked == txsi->len) {
					e_t->dlyack_rx -=
					    (e_t->dlyack_rx > 0) ? 1 : 0;
				}
				/* Otherwise leave dlyack_rx the way it was. */
			}

			/*
			 * Time stamps are only to help match the txsi with the
			 * received acknowledgements.
			 */
			if (e_t->timestamp_errors < MAX_TS_ERR &&
			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
				/*
				 * Note: All packets sent with the offload will
				 * have the same time stamp. If we are sending
				 * on a fast interface and the t_maxseg is much
				 * smaller than one tick, this will be fine. The
				 * time stamp would be the same whether we were
				 * using tso or not. However, if the interface
				 * is slow, this will cause problems with the
				 * calculations. If the interface is slow, there
				 * is no reason to be using tso, and it should
				 * be turned off.
				 */
				/*
				 * If there are too many time stamp errors, time
				 * stamps won't be trusted
				 */
				rts = to->to_tsecr;
				/* Before this packet. */
				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
					/* When delayed acking is used, the
					 * reflected time stamp is of the first
					 * packet and thus may be before
					 * txsi->tx_ts.
					 */
					break;
				if (TSTMP_GT(rts, txsi->tx_ts)) {
					/*
					 * If reflected time stamp is later than
					 * tx_ts, then this txsi is old.
					 */
					if (txsi->flags & TXSI_RTT_MEASURE_START
					    || measurenext) {
						marked_packet_rtt(txsi, e_t, tp,
						    &measurenext, &measurenext_len,
						    &rtt_bytes_adjust, OLD_TXSI);
					}
					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
					    txsegi_lnk);
					uma_zfree(txseginfo_zone, txsi);
					txsi = TAILQ_FIRST(&e_t->txsegi_q);
					continue;
				}
				if (rts == txsi->tx_ts &&
				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
					/*
					 * Segment received before sent!
					 * Something is wrong with the received
					 * timestamps so increment errors. If
					 * this keeps up we will ignore
					 * timestamps.
					 */
					e_t->timestamp_errors++;
				}
			}
			/*
			 * Acknowledging a sequence number before this txsi.
			 * If it is an old txsi that may have had the same seq
			 * numbers, it should have been removed if time stamps
			 * are being used.
			 */
			if (SEQ_LEQ(ack, txsi->seq))
				break; /* Before first packet in txsi. */

			/*
			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
			 * past this point.
			 *
			 * If delayed acks are being used, an acknowledgement
			 * for a single segment will have been delayed by the
			 * receiver and will yield an inaccurate measurement. In
			 * this case, we only make the measurement if more than
			 * one segment is being acknowledged or sack is
			 * currently being used.
			 */
			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
				/* Make an accurate new measurement. */
				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;

				/* A stored value of 0 means "not yet set". */
				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
					e_t->minrtt = e_t->rtt;

				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
					e_t->maxrtt = e_t->rtt;
			}

			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
				marked_packet_rtt(txsi, e_t, tp,
				    &measurenext, &measurenext_len,
				    &rtt_bytes_adjust, CORRECT_ACK);

			if (txsi->flags & TXSI_TSO) {
				if (txsi->len > acked) {
					txsi->len -= acked;
					/*
					 * This presumes ack for first bytes in
					 * txsi, this may not be true but it
					 * shouldn't cause problems for the
					 * timing.
					 *
					 * We remeasure RTT even though we only
					 * have a single txsi. The rationale
					 * behind this is that it is better to
					 * have a slightly inaccurate
					 * measurement than no additional
					 * measurement for the rest of the bulk
					 * transfer. Since TSO is only used on
					 * high speed interface cards, so the
					 * packets should be transmitted at line
					 * rate back to back with little
					 * difference in transmission times (in
					 * ticks).
					 */
					txsi->seq += acked;
					/*
					 * Reset txsi measure flag so we don't
					 * use it for another RTT measurement.
					 */
					txsi->flags &= ~TXSI_RTT_MEASURE_START;
					/*
					 * There is still more data to be acked
					 * from tso bulk transmission, so we
					 * won't remove it from the TAILQ yet.
					 */
					break;
				}
				txsi->len = 0;
			}

			/* This txsi has been matched; drop it from the queue. */
			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
			uma_zfree(txseginfo_zone, txsi);
			break;
		}

		if (measurenext) {
			/*
			 * We need to do a RTT measurement. It won't be the best
			 * if we do it here.
			 *
			 * NOTE(review): at this point txsi may be NULL (queue
			 * exhausted) or may point at an entry that was just
			 * freed above. Confirm that marked_packet_rtt() does
			 * not dereference txsi for FORCED_MEASUREMENT.
			 */
			marked_packet_rtt(txsi, e_t, tp,
			    &measurenext, &measurenext_len,
			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
		}
	}

	return (0);
}
예제 #2
0
파일: tcp_lro.c 프로젝트: CptFrazz/xnu
static void
tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr, 
			int payload_len, int drop_hdrlen, struct tcpopt *topt, 
			u_int32_t* tsval, u_int32_t* tsecr, int thflags)
{
	struct lro_flow *flow = NULL;
	struct mbuf *last;
	struct ip *ip = NULL;

	flow =  &lro_flow_list[flow_id];
	if (flow->lr_mhead) {
		if (lrodebug) 
			printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq,
				payload_len);
		m_adj(lro_mb, drop_hdrlen);

		last = flow->lr_mtail;
		while (last->m_next != NULL) {
			last = last->m_next;
		}
		last->m_next = lro_mb;

		flow->lr_mtail = lro_mb;

		ip = mtod(flow->lr_mhead, struct ip *);
		ip->ip_len += lro_mb->m_pkthdr.len;
		flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len;

		if (flow->lr_len == 0) {
			panic_plain("%s: Inconsistent LRO flow state", __func__);
		}
		flow->lr_len += payload_len;
		flow->lr_seq += payload_len;
		/* 
		 * This bit is re-OR'd each time a packet is added to the 
		 * large coalesced packet.
		 */
		flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT;
		flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */
		if (flow->lr_mhead->m_pkthdr.lro_pktlen < 
				lro_mb->m_pkthdr.lro_pktlen) {
			/* 
			 * For TCP Inter Arrival Jitter calculation, return max  
			 * size encountered while coalescing a stream of pkts.
			 */
			flow->lr_mhead->m_pkthdr.lro_pktlen = 
						lro_mb->m_pkthdr.lro_pktlen;
		}
        	/* Update the timestamp value */
		if (topt->to_flags & TOF_TS) {
			if ((flow->lr_tsval) && 
				(TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) {
				*(flow->lr_tsval) = htonl(topt->to_tsval);
			}
			if ((flow->lr_tsecr) &&
				(topt->to_tsecr != 0) &&
				(TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) {
				if (lrodebug >= 2) {
					printf("%s: instantaneous RTT = %d \n", __func__, 
						topt->to_tsecr - ntohl(*(flow->lr_tsecr)));
				}
				*(flow->lr_tsecr) = htonl(topt->to_tsecr);
			}
		}
		/* Coalesce the flags */
		if (thflags) {
			flow->lr_tcphdr->th_flags |= thflags;
		}
		/* Update receive window */
		flow->lr_tcphdr->th_win = tcphdr->th_win;
	} else {
		if (lro_mb) {