/* * Ertt_packet_measurements uses a small amount of state kept on each packet * sent to match incoming acknowledgements. This enables more accurate and * secure round trip time measurements. The resulting measurement is used for * congestion control algorithms which require a more accurate time. * Ertt_packet_measurements is called via the helper hook in tcp_input.c */ static int ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata, void *ctx_data, void *hdata, struct osd *hosd) { struct ertt *e_t; struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; struct tcp_hhook_data *thdp; struct txseginfo *txsi; int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust; uint32_t measurenext, rts; tcp_seq ack; KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__)); KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__)); e_t = (struct ertt *)hdata; thdp = ctx_data; tp = thdp->tp; th = thdp->th; to = thdp->to; new_sacked_bytes = (tp->sackhint.last_sack_ack != 0); measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0; acked = th->th_ack - tp->snd_una; INP_WLOCK_ASSERT(tp->t_inpcb); /* Packet has provided new acknowledgements. */ if (acked > 0 || new_sacked_bytes) { if (acked == 0 && new_sacked_bytes) { /* Use last sacked data. */ ack = tp->sackhint.last_sack_ack; } else ack = th->th_ack; txsi = TAILQ_FIRST(&e_t->txsegi_q); while (txsi != NULL) { rts = 0; /* Acknowledgement is acking more than this txsi. */ if (SEQ_GT(ack, txsi->seq + txsi->len)) { if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) { marked_packet_rtt(txsi, e_t, tp, &measurenext, &measurenext_len, &rtt_bytes_adjust, MULTI_ACK); } TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); uma_zfree(txseginfo_zone, txsi); txsi = TAILQ_FIRST(&e_t->txsegi_q); continue; } /* * Guess if delayed acks are being used by the receiver. * * XXXDH: A simple heuristic that could be improved */ if (!new_sacked_bytes) { if (acked > tp->t_maxseg) { e_t->dlyack_rx += (e_t->dlyack_rx < DLYACK_SMOOTH) ? 1 : 0; multiack = 1; } else if (acked > txsi->len) { multiack = 1; e_t->dlyack_rx += (e_t->dlyack_rx < DLYACK_SMOOTH) ? 1 : 0; } else if (acked == tp->t_maxseg || acked == txsi->len) { e_t->dlyack_rx -= (e_t->dlyack_rx > 0) ? 1 : 0; } /* Otherwise leave dlyack_rx the way it was. */ } /* * Time stamps are only to help match the txsi with the * received acknowledgements. */ if (e_t->timestamp_errors < MAX_TS_ERR && (to->to_flags & TOF_TS) != 0 && to->to_tsecr) { /* * Note: All packets sent with the offload will * have the same time stamp. If we are sending * on a fast interface and the t_maxseg is much * smaller than one tick, this will be fine. The * time stamp would be the same whether we were * using tso or not. However, if the interface * is slow, this will cause problems with the * calculations. If the interface is slow, there * is not reason to be using tso, and it should * be turned off. */ /* * If there are too many time stamp errors, time * stamps won't be trusted */ rts = to->to_tsecr; /* Before this packet. */ if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts)) /* When delayed acking is used, the * reflected time stamp is of the first * packet and thus may be before * txsi->tx_ts. */ break; if (TSTMP_GT(rts, txsi->tx_ts)) { /* * If reflected time stamp is later than * tx_tsi, then this txsi is old. */ if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) { marked_packet_rtt(txsi, e_t, tp, &measurenext, &measurenext_len, &rtt_bytes_adjust, OLD_TXSI); } TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); uma_zfree(txseginfo_zone, txsi); txsi = TAILQ_FIRST(&e_t->txsegi_q); continue; } if (rts == txsi->tx_ts && TSTMP_LT(to->to_tsval, txsi->rx_ts)) { /* * Segment received before sent! * Something is wrong with the received * timestamps so increment errors. If * this keeps up we will ignore * timestamps. */ e_t->timestamp_errors++; } } /* * Acknowledging a sequence number before this txsi. * If it is an old txsi that may have had the same seq * numbers, it should have been removed if time stamps * are being used. */ if (SEQ_LEQ(ack, txsi->seq)) break; /* Before first packet in txsi. */ /* * Only ack > txsi->seq and ack <= txsi->seq+txsi->len * past this point. * * If delayed acks are being used, an acknowledgement * for a single segment will have been delayed by the * receiver and will yield an inaccurate measurement. In * this case, we only make the measurement if more than * one segment is being acknowledged or sack is * currently being used. */ if (!e_t->dlyack_rx || multiack || new_sacked_bytes) { /* Make an accurate new measurement. */ e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1; if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0) e_t->minrtt = e_t->rtt; if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0) e_t->maxrtt = e_t->rtt; } if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) marked_packet_rtt(txsi, e_t, tp, &measurenext, &measurenext_len, &rtt_bytes_adjust, CORRECT_ACK); if (txsi->flags & TXSI_TSO) { if (txsi->len > acked) { txsi->len -= acked; /* * This presumes ack for first bytes in * txsi, this may not be true but it * shouldn't cause problems for the * timing. * * We remeasure RTT even though we only * have a single txsi. The rationale * behind this is that it is better to * have a slightly inaccurate * measurement than no additional * measurement for the rest of the bulk * transfer. Since TSO is only used on * high speed interface cards, so the * packets should be transmitted at line * rate back to back with little * difference in transmission times (in * ticks). */ txsi->seq += acked; /* * Reset txsi measure flag so we don't * use it for another RTT measurement. */ txsi->flags &= ~TXSI_RTT_MEASURE_START; /* * There is still more data to be acked * from tso bulk transmission, so we * won't remove it from the TAILQ yet. */ break; } txsi->len = 0; } TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); uma_zfree(txseginfo_zone, txsi); break; } if (measurenext) { /* * We need to do a RTT measurement. It won't be the best * if we do it here. */ marked_packet_rtt(txsi, e_t, tp, &measurenext, &measurenext_len, &rtt_bytes_adjust, FORCED_MEASUREMENT); } } return (0); }
static void tcp_lro_coalesce(int flow_id, struct mbuf *lro_mb, struct tcphdr *tcphdr, int payload_len, int drop_hdrlen, struct tcpopt *topt, u_int32_t* tsval, u_int32_t* tsecr, int thflags) { struct lro_flow *flow = NULL; struct mbuf *last; struct ip *ip = NULL; flow = &lro_flow_list[flow_id]; if (flow->lr_mhead) { if (lrodebug) printf("%s: lr_mhead %x %d \n", __func__, flow->lr_seq, payload_len); m_adj(lro_mb, drop_hdrlen); last = flow->lr_mtail; while (last->m_next != NULL) { last = last->m_next; } last->m_next = lro_mb; flow->lr_mtail = lro_mb; ip = mtod(flow->lr_mhead, struct ip *); ip->ip_len += lro_mb->m_pkthdr.len; flow->lr_mhead->m_pkthdr.len += lro_mb->m_pkthdr.len; if (flow->lr_len == 0) { panic_plain("%s: Inconsistent LRO flow state", __func__); } flow->lr_len += payload_len; flow->lr_seq += payload_len; /* * This bit is re-OR'd each time a packet is added to the * large coalesced packet. */ flow->lr_mhead->m_pkthdr.aux_flags |= MAUXF_SW_LRO_PKT; flow->lr_mhead->m_pkthdr.lro_npkts++; /* for tcpstat.tcps_rcvpack */ if (flow->lr_mhead->m_pkthdr.lro_pktlen < lro_mb->m_pkthdr.lro_pktlen) { /* * For TCP Inter Arrival Jitter calculation, return max * size encountered while coalescing a stream of pkts. */ flow->lr_mhead->m_pkthdr.lro_pktlen = lro_mb->m_pkthdr.lro_pktlen; } /* Update the timestamp value */ if (topt->to_flags & TOF_TS) { if ((flow->lr_tsval) && (TSTMP_GT(topt->to_tsval, ntohl(*(flow->lr_tsval))))) { *(flow->lr_tsval) = htonl(topt->to_tsval); } if ((flow->lr_tsecr) && (topt->to_tsecr != 0) && (TSTMP_GT(topt->to_tsecr, ntohl(*(flow->lr_tsecr))))) { if (lrodebug >= 2) { printf("%s: instantaneous RTT = %d \n", __func__, topt->to_tsecr - ntohl(*(flow->lr_tsecr))); } *(flow->lr_tsecr) = htonl(topt->to_tsecr); } } /* Coalesce the flags */ if (thflags) { flow->lr_tcphdr->th_flags |= thflags; } /* Update receive window */ flow->lr_tcphdr->th_win = tcphdr->th_win; } else { if (lro_mb) {