/*
 * This function measures the RTT of a particular segment/ack pair, or the
 * next closest if this will yield an inaccurate result due to delayed acking
 * or other issues.
 */
static void inline
marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
    uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
    int mflag)
{

	/*
	 * If we can't measure this one properly due to delayed acking, adjust
	 * byte counters and flag to measure next txsi. Note that since the
	 * marked packet's transmitted bytes are measured, we need to subtract
	 * the transmitted bytes. Then pretend the next txsi was marked.
	 */
	if (mflag & (MULTI_ACK|OLD_TXSI)) {
		*pmeasurenext = txsi->tx_ts;
		*pmeasurenext_len = txsi->len;
		*prtt_bytes_adjust += *pmeasurenext_len;
	} else {
		if (mflag & FORCED_MEASUREMENT) {
			e_t->markedpkt_rtt = tcp_ts_getticks() -
			    *pmeasurenext + 1;
			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
			    *pmeasurenext_len - *prtt_bytes_adjust;
		} else {
			e_t->markedpkt_rtt = tcp_ts_getticks() -
			    txsi->tx_ts + 1;
			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
			    *prtt_bytes_adjust;
		}
		e_t->marked_snd_cwnd = tp->snd_cwnd;

		/*
		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
		 * add_tx_segment_info that a new measurement should be
		 * started.
		 */
		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
		/*
		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
		 * algorithm that a new marked RTT measurement has been made
		 * and is available for use.
		 */
		e_t->flags |= ERTT_NEW_MEASUREMENT;

		if (tp->t_flags & TF_TSO) {
			/* Temporarily disable TSO to aid a new measurement. */
			tp->t_flags &= ~TF_TSO;
			/* Keep track that we've disabled it. */
			e_t->flags |= ERTT_TSO_DISABLED;
		}
	}
}
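/*
 * For reference, a minimal sketch of the ack-matching flag bits passed to
 * marked_packet_rtt() via mflag. These mirror the definitions in FreeBSD's
 * sys/netinet/khelp/h_ertt.c; treat the exact values as illustrative.
 */
#define	MULTI_ACK		0x01	/* ACK covers more than one txsi. */
#define	OLD_TXSI		0x02	/* txsi is stale (reflected ts is newer). */
#define	CORRECT_ACK		0x04	/* ACK matches the expected txsi. */
#define	FORCED_MEASUREMENT	0x08	/* Flush out a deferred measurement. */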
/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, unsigned int opt)
{
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);

	INP_LOCK_ASSERT(tp->t_inpcb);

	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - 40;

	if (G_TCPOPT_TSTAMP(opt)) {
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
	}

	if (G_TCPOPT_SACK(opt))
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	else
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	}
}
/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
 */
static void
assign_rxopt(struct tcpcb *tp, unsigned int opt)
{
	struct toepcb *toep = tp->t_toe;
	struct inpcb *inp = tp->t_inpcb;
	struct adapter *sc = td_adapter(toep->td);
	int n;

	INP_LOCK_ASSERT(inp);

	if (inp->inp_inc.inc_flags & INC_ISIPV6)
		n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	else
		n = sizeof(struct ip) + sizeof(struct tcphdr);
	tp->t_maxseg = tp->t_maxopd = sc->params.mtus[G_TCPOPT_MSS(opt)] - n;

	CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid,
	    G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]);

	if (G_TCPOPT_TSTAMP(opt)) {
		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
		tp->ts_recent = 0;		/* hmmm */
		tp->ts_recent_age = tcp_ts_getticks();
		tp->t_maxseg -= TCPOLEN_TSTAMP_APPA;
	}

	if (G_TCPOPT_SACK(opt))
		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
	else
		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */

	if (G_TCPOPT_WSCALE_OK(opt))
		tp->t_flags |= TF_RCVD_SCALE;

	/* Doing window scaling? */
	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
		tp->rcv_scale = tp->request_r_scale;
		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
	}
}
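/*
 * Illustrative only: the hard-coded "- 40" in the earlier revision of
 * assign_rxopt() is the fixed IPv4 + TCP header overhead, which the later
 * revision computes explicitly so that IPv6 connections are sized correctly
 * too. A hypothetical compile-time check (CTASSERT from <sys/systm.h>)
 * makes the equivalence explicit.
 */
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

CTASSERT(sizeof(struct ip) + sizeof(struct tcphdr) == 40);	/* IPv4 case */
CTASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) == 60);	/* IPv6 case */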
/*
 * Add information about a transmitted segment to a list.
 * This is called via the helper hook in tcp_output.c
 */
static int
ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
    void *ctx_data, void *hdata, struct osd *hosd)
{
	struct ertt *e_t;
	struct tcpcb *tp;
	struct tcphdr *th;
	struct tcpopt *to;
	struct tcp_hhook_data *thdp;
	struct txseginfo *txsi;
	uint32_t len;
	int tso;

	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));

	e_t = (struct ertt *)hdata;
	thdp = ctx_data;
	tp = thdp->tp;
	th = thdp->th;
	to = thdp->to;
	len = thdp->len;
	tso = thdp->tso;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (len > 0) {
		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
		if (txsi != NULL) {
			/* Construct txsi setting the necessary flags. */
			txsi->flags = 0; /* Needs to be initialised. */
			txsi->seq = ntohl(th->th_seq);
			txsi->len = len;
			if (tso)
				txsi->flags |= TXSI_TSO;
			else if (e_t->flags & ERTT_TSO_DISABLED) {
				tp->t_flags |= TF_TSO;
				e_t->flags &= ~ERTT_TSO_DISABLED;
			}

			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
				e_t->bytes_tx_in_rtt += len;
			} else {
				txsi->flags |= TXSI_RTT_MEASURE_START;
				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
				e_t->bytes_tx_in_rtt = len;
			}

			if (((tp->t_flags & TF_NOOPT) == 0) &&
			    (to->to_flags & TOF_TS)) {
				txsi->tx_ts = ntohl(to->to_tsval) -
				    tp->ts_offset;
				txsi->rx_ts = ntohl(to->to_tsecr);
			} else {
				txsi->tx_ts = tcp_ts_getticks();
				txsi->rx_ts = 0; /* No received time stamp. */
			}
			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
		}
	}

	return (0);
}
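/*
 * A hedged sketch of how the two hook functions above are attached to the
 * TCP input and output paths. This follows the khelp(9)/hhook(9) KPI as
 * used by FreeBSD's ERTT module; the exact field values are from memory
 * and should be treated as illustrative rather than definitive.
 */
static struct hookinfo ertt_hooks[] = {
	{
		.hook_type = HHOOK_TYPE_TCP,
		.hook_id = HHOOK_TCP_EST_IN,	/* established input path */
		.hook_udata = NULL,
		.hook_func = &ertt_packet_measurement_hook
	},
	{
		.hook_type = HHOOK_TYPE_TCP,
		.hook_id = HHOOK_TCP_EST_OUT,	/* established output path */
		.hook_udata = NULL,
		.hook_func = &ertt_add_tx_segment_info_hook
	}
};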
/*
 * Ertt_packet_measurements uses a small amount of state kept on each packet
 * sent to match incoming acknowledgements. This enables more accurate and
 * secure round trip time measurements. The resulting measurement is used for
 * congestion control algorithms which require a more accurate time.
 * Ertt_packet_measurements is called via the helper hook in tcp_input.c
 */
static int
ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
    void *ctx_data, void *hdata, struct osd *hosd)
{
	struct ertt *e_t;
	struct tcpcb *tp;
	struct tcphdr *th;
	struct tcpopt *to;
	struct tcp_hhook_data *thdp;
	struct txseginfo *txsi;
	int acked, measurenext_len, multiack, new_sacked_bytes,
	    rtt_bytes_adjust;
	uint32_t measurenext, rts;
	tcp_seq ack;

	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));

	e_t = (struct ertt *)hdata;
	thdp = ctx_data;
	tp = thdp->tp;
	th = thdp->th;
	to = thdp->to;
	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
	acked = th->th_ack - tp->snd_una;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* Packet has provided new acknowledgements. */
	if (acked > 0 || new_sacked_bytes) {
		if (acked == 0 && new_sacked_bytes) {
			/* Use last sacked data. */
			ack = tp->sackhint.last_sack_ack;
		} else
			ack = th->th_ack;

		txsi = TAILQ_FIRST(&e_t->txsegi_q);
		while (txsi != NULL) {
			rts = 0;

			/* Acknowledgement is acking more than this txsi. */
			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
				if (txsi->flags & TXSI_RTT_MEASURE_START ||
				    measurenext) {
					marked_packet_rtt(txsi, e_t, tp,
					    &measurenext, &measurenext_len,
					    &rtt_bytes_adjust, MULTI_ACK);
				}
				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
				uma_zfree(txseginfo_zone, txsi);
				txsi = TAILQ_FIRST(&e_t->txsegi_q);
				continue;
			}

			/*
			 * Guess if delayed acks are being used by the
			 * receiver.
			 *
			 * XXXDH: A simple heuristic that could be improved
			 */
			if (!new_sacked_bytes) {
				if (acked > tp->t_maxseg) {
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
					multiack = 1;
				} else if (acked > txsi->len) {
					multiack = 1;
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
				} else if (acked == tp->t_maxseg ||
				    acked == txsi->len) {
					e_t->dlyack_rx -=
					    (e_t->dlyack_rx > 0) ? 1 : 0;
				}
				/* Otherwise leave dlyack_rx the way it was. */
			}

			/*
			 * Time stamps are only to help match the txsi with
			 * the received acknowledgements.
			 */
			if (e_t->timestamp_errors < MAX_TS_ERR &&
			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
				/*
				 * Note: All packets sent with the offload will
				 * have the same time stamp. If we are sending
				 * on a fast interface and the t_maxseg is much
				 * smaller than one tick, this will be fine.
				 * The time stamp would be the same whether we
				 * were using TSO or not. However, if the
				 * interface is slow, this will cause problems
				 * with the calculations. If the interface is
				 * slow, there is no reason to be using TSO,
				 * and it should be turned off.
				 */
				/*
				 * If there are too many time stamp errors,
				 * time stamps won't be trusted.
				 */
				rts = to->to_tsecr;
				/* Before this packet. */
				if (!e_t->dlyack_rx &&
				    TSTMP_LT(rts, txsi->tx_ts))
					/*
					 * When delayed acking is used, the
					 * reflected time stamp is of the first
					 * packet and thus may be before
					 * txsi->tx_ts.
					 */
					break;
				if (TSTMP_GT(rts, txsi->tx_ts)) {
					/*
					 * If the reflected time stamp is later
					 * than tx_ts, then this txsi is old.
					 */
					if (txsi->flags &
					    TXSI_RTT_MEASURE_START ||
					    measurenext) {
						marked_packet_rtt(txsi, e_t,
						    tp, &measurenext,
						    &measurenext_len,
						    &rtt_bytes_adjust,
						    OLD_TXSI);
					}
					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
					    txsegi_lnk);
					uma_zfree(txseginfo_zone, txsi);
					txsi = TAILQ_FIRST(&e_t->txsegi_q);
					continue;
				}

				if (rts == txsi->tx_ts &&
				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
					/*
					 * Segment received before sent!
					 * Something is wrong with the received
					 * timestamps so increment errors. If
					 * this keeps up we will ignore
					 * timestamps.
					 */
					e_t->timestamp_errors++;
				}
			}

			/*
			 * Acknowledging a sequence number before this txsi.
			 * If it is an old txsi that may have had the same seq
			 * numbers, it should have been removed if time stamps
			 * are being used.
			 */
			if (SEQ_LEQ(ack, txsi->seq))
				break; /* Before first packet in txsi. */

			/*
			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
			 * past this point.
			 *
			 * If delayed acks are being used, an acknowledgement
			 * for a single segment will have been delayed by the
			 * receiver and will yield an inaccurate measurement.
			 * In this case, we only make the measurement if more
			 * than one segment is being acknowledged or sack is
			 * currently being used.
			 */
			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
				/* Make an accurate new measurement. */
				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;

				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
					e_t->minrtt = e_t->rtt;

				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
					e_t->maxrtt = e_t->rtt;
			}

			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
				marked_packet_rtt(txsi, e_t, tp,
				    &measurenext, &measurenext_len,
				    &rtt_bytes_adjust, CORRECT_ACK);

			if (txsi->flags & TXSI_TSO) {
				if (txsi->len > acked) {
					txsi->len -= acked;
					/*
					 * This presumes an ack for the first
					 * bytes in txsi; this may not be true
					 * but it shouldn't cause problems for
					 * the timing.
					 *
					 * We remeasure RTT even though we only
					 * have a single txsi. The rationale
					 * behind this is that it is better to
					 * have a slightly inaccurate
					 * measurement than no additional
					 * measurement for the rest of the bulk
					 * transfer. Since TSO is only used on
					 * high speed interface cards, the
					 * packets should be transmitted at
					 * line rate back to back with little
					 * difference in transmission times (in
					 * ticks).
					 */
					txsi->seq += acked;
					/*
					 * Reset txsi measure flag so we don't
					 * use it for another RTT measurement.
					 */
					txsi->flags &= ~TXSI_RTT_MEASURE_START;
					/*
					 * There is still more data to be acked
					 * from tso bulk transmission, so we
					 * won't remove it from the TAILQ yet.
					 */
					break;
				}
				txsi->len = 0;
			}

			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
			uma_zfree(txseginfo_zone, txsi);
			break;
		}

		if (measurenext) {
			/*
			 * We need to do an RTT measurement. It won't be the
			 * best if we do it here.
			 */
			marked_packet_rtt(txsi, e_t, tp, &measurenext,
			    &measurenext_len, &rtt_bytes_adjust,
			    FORCED_MEASUREMENT);
		}
	}

	return (0);
}
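/*
 * A hedged sketch of how a congestion control module might consume the
 * measurement published by the hook above, modelled on FreeBSD's
 * cc_vegas/cc_chd consumers. The ertt_id variable (obtained once from
 * khelp_get_id("ertt") at module init) and the cwnd policy in the body
 * are illustrative assumptions, not the actual algorithm.
 */
static int ertt_id;	/* assumed: set via khelp_get_id("ertt") at init */

static void
example_ack_received(struct cc_var *ccv, uint16_t ack_type)
{
	struct ertt *e_t;

	e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);

	if (e_t->flags & ERTT_NEW_MEASUREMENT) {
		/*
		 * markedpkt_rtt is in ticks; comparing it against minrtt is
		 * one (hypothetical) way to detect queueing delay.
		 */
		if (e_t->markedpkt_rtt > 2 * e_t->minrtt) {
			/* e.g. back cwnd off in response to delay. */
		}
		/* Consume the sample so it is only acted on once. */
		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
	}
}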
static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry
	 * and now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;
	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;

		SOCKBUF_LOCK(sb);
		sbdrop_locked(sb, plen);
		sowwakeup_locked(so);
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	/* XXX */
	if ((toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= MIN_OFLD_TX_CREDITS) ||
	    toep->tx_credits == toep->txsd_total *
	    howmany((sizeof(struct fw_ofld_tx_data_wr) + 1), 16)) {
		toep->flags &= ~TPF_TX_SUSPENDED;
		t4_push_frames(sc, toep);
	}

	INP_WUNLOCK(inp);

	return (0);
}
static int
do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
	struct toepcb *toep = lookup_tid(sc, tid);
	struct inpcb *inp;
	struct tcpcb *tp;
	struct socket *so;
	uint8_t credits = cpl->credits;
	struct ofld_tx_sdesc *txsd;
	int plen;
#ifdef INVARIANTS
	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

	/*
	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry
	 * and now this comes back carrying the credits for the flowc.
	 */
	if (__predict_false(toep->flags & TPF_SYNQE)) {
		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
		    ("%s: credits for a synq entry %p", __func__, toep));
		return (0);
	}

	inp = toep->inp;
	KASSERT(opcode == CPL_FW4_ACK,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));

	INP_WLOCK(inp);

	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
		INP_WUNLOCK(inp);
		return (0);
	}

	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));

	tp = intotcpcb(inp);

	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
		tcp_seq snd_una = be32toh(cpl->snd_una);

#ifdef INVARIANTS
		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
			log(LOG_ERR,
			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
			    __func__, snd_una, toep->tid, tp->snd_una);
		}
#endif

		if (tp->snd_una != snd_una) {
			tp->snd_una = snd_una;
			tp->ts_recent_age = tcp_ts_getticks();
		}
	}

	so = inp->inp_socket;
	txsd = &toep->txsd[toep->txsd_cidx];
	plen = 0;
	while (credits) {
		KASSERT(credits >= txsd->tx_credits,
		    ("%s: too many (or partial) credits", __func__));
		credits -= txsd->tx_credits;
		toep->tx_credits += txsd->tx_credits;
		plen += txsd->plen;
		txsd++;
		toep->txsd_avail++;
		KASSERT(toep->txsd_avail <= toep->txsd_total,
		    ("%s: txsd avail > total", __func__));
		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
			txsd = &toep->txsd[0];
			toep->txsd_cidx = 0;
		}
	}

	if (toep->tx_credits == toep->tx_total) {
		toep->tx_nocompl = 0;
		toep->plen_nocompl = 0;
	}

	if (toep->flags & TPF_TX_SUSPENDED &&
	    toep->tx_credits >= toep->tx_total / 4) {
		toep->flags &= ~TPF_TX_SUSPENDED;
		if (toep->ulp_mode == ULP_MODE_ISCSI)
			t4_push_pdus(sc, toep, plen);
		else
			t4_push_frames(sc, toep, plen);
	} else if (plen > 0) {
		struct sockbuf *sb = &so->so_snd;
		int sbu;

		SOCKBUF_LOCK(sb);
		sbu = sbused(sb);
		if (toep->ulp_mode == ULP_MODE_ISCSI) {
			if (__predict_false(sbu > 0)) {
				/*
				 * The data transmitted before the tid's ULP
				 * mode changed to ISCSI is still in so_snd.
				 * Incoming credits should account for so_snd
				 * first.
				 */
				sbdrop_locked(sb, min(sbu, plen));
				plen -= min(sbu, plen);
			}
			sowwakeup_locked(so);	/* unlocks so_snd */
			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
		} else {
			sbdrop_locked(sb, plen);
			sowwakeup_locked(so);	/* unlocks so_snd */
		}
		SOCKBUF_UNLOCK_ASSERT(sb);
	}

	INP_WUNLOCK(inp);

	return (0);
}
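/*
 * A stand-alone model of the credit-return loop in do_fw4_ack() above: the
 * firmware ACK carries a credit count, and we retire whole software
 * descriptors (never partial ones, as the KASSERT in the driver enforces)
 * from a circular array, summing the payload bytes they carried. All names
 * here (struct sdesc, retire_credits, etc.) are hypothetical; only the
 * consumption pattern matches the driver code.
 */
struct sdesc {
	int credits;	/* tx credits this work request consumed */
	int plen;	/* payload bytes it carried */
};

/* Retire descriptors covered by 'credits'; return payload bytes freed. */
static int
retire_credits(struct sdesc *ring, int ring_size, int *cidx, int credits)
{
	int plen = 0;

	while (credits > 0) {
		struct sdesc *sd = &ring[*cidx];

		/* The hardware never acknowledges part of a descriptor. */
		credits -= sd->credits;
		plen += sd->plen;
		if (++(*cidx) == ring_size)
			*cidx = 0;	/* wrap the consumer index */
	}
	return (plen);
}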