/* * This function is called from the receive side with the sequence number of * the last packet received. */ void AJ_SerialTx_ReceivedSeq(uint8_t seq) { /* * If we think we have already acked this sequence number we don't adjust * the ack count. */ if (!SEQ_GT(currentTxAck, seq)) { currentTxAck = (seq + 1) & 0x7; } #ifdef ALWAYS_ACK AJ_SerialTX_EnqueueCtrl(NULL, 0, AJ_SERIAL_ACK); #else ++pendingAcks; /* * If there are no packets to send we are allowed to accumulate a * backlog of pending ACKs up to a maximum equal to the window size. * In any case we are required to send an ack within a timeout * period so if this is the first pending ack we need to prime a timer. */ if (pendingAcks == 1) { AJ_InitTimer(&ackTime); AJ_TimeAddOffset(&ackTime, AJ_SerialLinkParams.txAckTimeout); return; } /* * If we have hit our pending ACK limit send a explicit ACK packet immediately. */ if (pendingAcks == AJ_SerialLinkParams.windowSize) { AJ_SerialTX_EnqueueCtrl(NULL, 0, AJ_SERIAL_ACK); } #endif }
static void dctcp_ack_received(struct cc_var *ccv, uint16_t type) { struct dctcp *dctcp_data; int bytes_acked = 0; dctcp_data = ccv->cc_data; if (CCV(ccv, t_flags) & TF_ECN_PERMIT) { /* * DCTCP doesn't treat receipt of ECN marked packet as a * congestion event. Thus, DCTCP always executes the ACK * processing out of congestion recovery. */ if (IN_CONGRECOVERY(CCV(ccv, t_flags))) { EXIT_CONGRECOVERY(CCV(ccv, t_flags)); newreno_cc_algo.ack_received(ccv, type); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } else newreno_cc_algo.ack_received(ccv, type); if (type == CC_DUPACK) bytes_acked = CCV(ccv, t_maxseg); if (type == CC_ACK) bytes_acked = ccv->bytes_this_ack; /* Update total bytes. */ dctcp_data->bytes_total += bytes_acked; /* Update total marked bytes. */ if (dctcp_data->ece_curr) { if (!dctcp_data->ece_prev && bytes_acked > CCV(ccv, t_maxseg)) { dctcp_data->bytes_ecn += (bytes_acked - CCV(ccv, t_maxseg)); } else dctcp_data->bytes_ecn += bytes_acked; dctcp_data->ece_prev = 1; } else { if (dctcp_data->ece_prev && bytes_acked > CCV(ccv, t_maxseg)) dctcp_data->bytes_ecn += CCV(ccv, t_maxseg); dctcp_data->ece_prev = 0; } dctcp_data->ece_curr = 0; /* * Update the fraction of marked bytes at the end of * current window size. */ if ((IN_FASTRECOVERY(CCV(ccv, t_flags)) && SEQ_GEQ(ccv->curack, CCV(ccv, snd_recover))) || (!IN_FASTRECOVERY(CCV(ccv, t_flags)) && SEQ_GT(ccv->curack, dctcp_data->save_sndnxt))) dctcp_update_alpha(ccv); } else newreno_cc_algo.ack_received(ccv, type); }
/*
 * Walk the stream's segment list and feed every fully-acked, not yet
 * logged segment to the Streamer callback.  Always returns 0.
 */
int StreamIterator(Flow *f, TcpStream *stream, int close, void *cbdata, uint8_t iflags)
{
    SCLogDebug("called with %p, %d, %p, %02x", f, close, cbdata, iflags);
    int did_log = 0;

    /*
     * Optimization: if the last segment is already marked processed, every
     * earlier one is too, so the whole walk can be skipped.
     */
    if (stream->seg_list_tail != NULL &&
        !(stream->seg_list_tail->flags & SEGMENTTCP_FLAG_LOGAPI_PROCESSED)) {
        TcpSegment *cur;
        for (cur = stream->seg_list; cur != NULL; cur = cur->next) {
            if (cur->flags & SEGMENTTCP_FLAG_LOGAPI_PROCESSED)
                continue;

            /* Stop at the first segment not (fully) covered by last_ack. */
            if (SEQ_GT(cur->seq + cur->payload_len, stream->last_ack)) {
                SCLogDebug("seg not (fully) acked yet");
                break;
            }

            uint8_t flags = iflags;
            /* First data segment of the stream opens the logger. */
            if (cur->seq == stream->isn + 1)
                flags |= OUTPUT_STREAMING_FLAG_OPEN;
            /* Last segment of a closing stream lets the logger close up. */
            if (close && cur->next == NULL)
                flags |= OUTPUT_STREAMING_FLAG_CLOSE;

            Streamer(cbdata, f, cur->payload, (uint32_t)cur->payload_len, 0, flags);
            cur->flags |= SEGMENTTCP_FLAG_LOGAPI_PROCESSED;
            did_log = 1;
        }
    }

    /*
     * A close must always reach the Streamer; if no segment carried the
     * close flag, invoke it once with no data.
     */
    if (did_log == 0 && close) {
        Streamer(cbdata, f, NULL, 0, 0, OUTPUT_STREAMING_FLAG_CLOSE);
    }
    return 0;
}
/*
 * Record a TCP state transition for a tunneled connection, tracing the
 * old and new states to stderr and keeping snd_max monotonic.
 */
static void tcp_state_update(struct tcpup_info *upp, int state)
{
	/* Trace before the state fields are overwritten. */
	fprintf(stderr, "%x/%-4d %s\t -> %s\n", upp->t_conv, _tot_tcp,
	    tcpstates[upp->t_state], tcpstates[state]);

	/* snd_max tracks the highest sequence ever sent. */
	if (SEQ_GT(upp->snd_nxt, upp->snd_max))
		upp->snd_max = upp->snd_nxt;

	upp->t_state = state;
	upp->x_state = state;
}
/**
 * \brief Replace (part of) the payload portion of a packet by the data
 *        in a TCP segment
 *
 * \param p Packet
 * \param seg TCP segment
 *
 * \todo What about reassembled fragments?
 * \todo What about unwrapped tunnel packets?
 */
void StreamTcpInlineSegmentReplacePacket(Packet *p, TcpSegment *seg)
{
    SCEnter();

    uint32_t pkt_seq = TCP_GET_SEQ(p);
    uint32_t seg_seq = seg->seq;

    /* Guard: segment ends before the packet starts — no overlap. */
    if (seg_seq + seg->payload_len < pkt_seq) {
        SCReturn;
    }
    /* Guard: packet ends before the segment starts — no overlap. */
    if (pkt_seq + p->payload_len < seg_seq) {
        SCReturn;
    }

    /** \todo review logic */
    uint32_t pkt_end = pkt_seq + p->payload_len;
    uint32_t seg_end = seg_seq + seg->payload_len;
    SCLogDebug("pend %u, tend %u", pkt_end, seg_end);

    /* Overlap runs from the higher start sequence to the lower end. */
    uint32_t stop = (SEQ_GT(pkt_end, seg_end)) ? seg_end : pkt_end;
    uint32_t start = (SEQ_LT(pkt_seq, seg_seq)) ? seg_seq : pkt_seq;
    SCLogDebug("seq %u, end %u", start, stop);

    uint16_t pkt_off = start - pkt_seq;
    uint16_t seg_off = start - seg_seq;
    SCLogDebug("poff %u, toff %u", pkt_off, seg_off);

    uint32_t range = stop - start;
    SCLogDebug("range %u", range);
    BUG_ON(range > 65536);

    if (range) {
        /* p->payload aliases p->pkt / p->ext_pkt, so both get updated. */
        memcpy(p->payload + pkt_off, seg->payload + seg_off, range);

        /* Mark modified so the checksum is recalculated before reinject. */
        p->flags |= PKT_STREAM_MODIFIED;
    }
}
/*
 * Remove from the SACK list every block fully acknowledged by `end`,
 * trimming any block it only partially covers.
 *
 * Parameters:
 *	sack_blk_t *head: pointer to the array of SACK blks.
 *	tcp_seq end: remove coverage of all seq nums less than end.
 *	int32_t *num: (referenced) total num of SACK blks in the array.
 */
void
tcp_sack_remove(sack_blk_t *head, tcp_seq end, int32_t *num)
{
	int32_t rd, wr, cnt;

	if (*num == 0)
		return;

	/*
	 * Compact the list in place: `wr` trails `rd`, so surviving
	 * blocks are only ever copied toward lower indices.
	 */
	cnt = *num;
	wr = 0;
	for (rd = 0; rd < cnt; rd++) {
		sack_blk_t blk = head[rd];

		if (SEQ_GT(end, blk.begin)) {
			if (SEQ_GEQ(end, blk.end)) {
				/* Fully covered by end: drop this block. */
				continue;
			}
			/* Partially covered: keep only the tail. */
			blk.begin = end;
		}
		head[wr] = blk;
		wr++;
	}
	*num = wr;
}
/** * This function is called by the receive layer when a data packet or an explicit ACK * has been received. The ACK value is one greater (modulo 8) than the seq number of the * last packet successfully received. */ void AJ_SerialTx_ReceivedAck(uint8_t ack) { TxPkt volatile* ackedPkt = NULL; if (txSent == NULL) { return; } /* * Remove acknowledged packets from sent queue. */ while ((txSent != NULL) && SEQ_GT(ack, txSent->seq)) { ackedPkt = txSent; txSent = txSent->next; //AJ_AlwaysPrintf("Releasing seq=%d (acked by %d)\n", ackedPkt->seq, ack); AJ_ASSERT(ackedPkt->type == AJ_SERIAL_DATA); /* * Return pkt to ACL free list. */ ackedPkt->next = txFreeList; txFreeList = ackedPkt; /* * If all packet have been ack'd, halt the resend timer and return. */ if (txSent == NULL) { AJ_InitTimer(&resendTime); AJ_TimeAddOffset(&resendTime, AJ_TIMER_FOREVER); resendPrimed = FALSE; return; } } /* * Reset the resend timer if one or more packets were ack'd. */ if (ackedPkt != NULL) { AJ_InitTimer(&resendTime); AJ_TimeAddOffset(&resendTime, AJ_SerialLinkParams.txResendTimeout); resendPrimed = TRUE; } }
/* * Perform any necessary tasks before we exit congestion recovery. */ static void newreno_post_recovery(struct cc_var *ccv) { if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * Fast recovery will conclude after returning from this * function. Window inflation should have left us with * approximately snd_ssthresh outstanding data. But in case we * would be inclined to send a burst, better to do it via the * slow start mechanism. * * XXXLAS: Find a way to do this without needing curack */ if (SEQ_GT(ccv->curack + CCV(ccv, snd_ssthresh), CCV(ccv, snd_max))) CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) - ccv->curack + CCV(ccv, t_maxseg); else CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); } }
/*
 * USTAT PDU / SOS_READY Processor
 *
 * Processes a Solicited STATus PDU while the connection is in the
 * data-transfer ready state: acknowledges PDUs up to N(R), updates the
 * transmit window from N(MR), and schedules retransmission of the SD
 * PDUs named by the single missing-gap element pair (LE1, LE2).
 *
 * Arguments:
 *	sop	pointer to sscop connection block
 *	m	pointer to PDU buffer (without trailer)
 *	trlr	pointer to PDU trailer
 *
 * Returns:
 *	none
 *
 */
void
sscop_ustat_ready(struct sscop *sop, KBuffer *m, caddr_t trlr)
{
	struct ustat_pdu *up = (struct ustat_pdu *)trlr;
	struct pdu_hdr *php;
	sscop_seq seq1, seq2;

	/* Trailer fields arrive in network byte order. */
	up->ustat_nmr = ntohl(up->ustat_nmr);
	up->ustat_nr = ntohl(up->ustat_nr);

	/*
	 * Validate peer's current receive data sequence number:
	 * N(R) must lie within [so_ack, so_send) (modular comparison
	 * relative to so_ack).
	 */
	if (SEQ_GT(sop->so_ack, up->ustat_nr, sop->so_ack) ||
	    SEQ_GEQ(up->ustat_nr, sop->so_send, sop->so_ack)) {
		/*
		 * Bad data sequence number
		 */
		goto goterr;
	}

	/*
	 * Free acknowledged PDUs
	 */
	for (seq1 = sop->so_ack, SEQ_SET(seq2, up->ustat_nr);
	    SEQ_LT(seq1, seq2, sop->so_ack);
	    SEQ_INCR(seq1, 1)) {
		sscop_pack_free(sop, seq1);
	}

	/*
	 * Update transmit state variables
	 */
	sop->so_ack = seq2;
	SEQ_SET(sop->so_sendmax, up->ustat_nmr);

	/*
	 * Get USTAT list elements (the gap of missing SD PDUs)
	 */
	SEQ_SET(seq1, ntohl(up->ustat_le1));
	SEQ_SET(seq2, ntohl(up->ustat_le2));

	/*
	 * Validate elements: so_ack <= LE1 < LE2 < so_send
	 */
	if (SEQ_GT(sop->so_ack, seq1, sop->so_ack) ||
	    SEQ_GEQ(seq1, seq2, sop->so_ack) ||
	    SEQ_GEQ(seq2, sop->so_send, sop->so_ack)) {
		/*
		 * Bad element sequence number
		 */
		goto goterr;
	}

	/*
	 * Process each missing sequence number in this gap
	 */
	while (SEQ_LT(seq1, seq2, sop->so_ack)) {
		/*
		 * Find corresponding SD PDU on pending ack queue
		 */
		php = sscop_pack_locate(sop, seq1);
		if (php == NULL) {
			goto goterr;
		}

		/*
		 * Retransmit this SD PDU if it's not
		 * already scheduled for retranmission.
		 */
		if ((php->ph_rexmit_lk == NULL) &&
		    (sop->so_rexmit_tl != php)) {
			/*
			 * Put PDU on retransmit queue and schedule
			 * transmit servicing
			 */
			sscop_rexmit_insert(sop, php);
			sop->so_flags |= SOF_XMITSRVC;
		}

		/*
		 * Bump to next sequence number
		 */
		SEQ_INCR(seq1, 1);
	}

	/*
	 * Report retransmitted PDUs
	 */
	sscop_maa_error(sop, 'V');

	/*
	 * Free PDU buffer chain
	 */
	KB_FREEALL(m);

	/*
	 * See if transmit queues need servicing
	 */
	if (sop->so_flags & SOF_XMITSRVC)
		sscop_service_xmit(sop);

	return;

goterr:
	/*
	 * Protocol/parameter error encountered
	 */
	sscop_maa_error(sop, 'T');

	/*
	 * Free PDU buffer chain
	 */
	KB_FREEALL(m);

	if (sop->so_vers == SSCOP_VERS_QSAAL)
		/*
		 * Reestablish a new connection
		 */
		qsaal1_reestablish(sop);
	else
		/*
		 * Initiate error recovery
		 */
		q2110_error_recovery(sop);

	return;
}
/*
 * STAT PDU / SOS_READY Processor
 *
 * Processes a solicited STATus PDU in the data-transfer ready state:
 * validates the echoed poll sequence N(PS) and receive sequence N(R),
 * frees acknowledged SD PDUs, then walks the alternating list of
 * (missing, received) sequence-number elements — scheduling
 * retransmissions for gaps and freeing buffers for acked ranges.
 * Finally it updates the credit/window state and the poll timers.
 *
 * Arguments:
 *	sop	pointer to sscop connection block
 *	m	pointer to PDU buffer (without trailer)
 *	trlr	pointer to PDU trailer
 *
 * Returns:
 *	none
 *
 */
void
sscop_stat_ready(struct sscop *sop, KBuffer *m, caddr_t trlr)
{
	struct stat_pdu *sp = (struct stat_pdu *)trlr;
	struct pdu_hdr *php;
	KBuffer *m0 = m;	/* head of chain, kept for the final free */
	sscop_seq seq1, seq2, opa;
	int cnt = 0;		/* number of PDUs queued for retransmit */

	/* Trailer fields arrive in network byte order. */
	sp->stat_nps = ntohl(sp->stat_nps);
	sp->stat_nmr = ntohl(sp->stat_nmr);
	sp->stat_nr = ntohl(sp->stat_nr);

	/*
	 * Validate peer's received poll sequence number:
	 * must lie within [so_pollack, so_pollsend].
	 */
	if (SEQ_GT(sop->so_pollack, sp->stat_nps, sop->so_pollack) ||
	    SEQ_GT(sp->stat_nps, sop->so_pollsend, sop->so_pollack)) {
		/*
		 * Bad poll sequence number
		 */
		sscop_maa_error(sop, 'R');
		goto goterr;
	}

	/*
	 * Validate peer's current receive data sequence number:
	 * must lie within [so_ack, so_send].
	 */
	if (SEQ_GT(sop->so_ack, sp->stat_nr, sop->so_ack) ||
	    SEQ_GT(sp->stat_nr, sop->so_send, sop->so_ack)) {
		/*
		 * Bad data sequence number
		 */
		sscop_maa_error(sop, 'S');
		goto goterr;
	}

	/*
	 * Free acknowledged PDUs
	 */
	for (seq1 = sop->so_ack, SEQ_SET(seq2, sp->stat_nr);
	    SEQ_LT(seq1, seq2, sop->so_ack);
	    SEQ_INCR(seq1, 1)) {
		sscop_pack_free(sop, seq1);
	}

	/*
	 * Update transmit state variables
	 * (opa keeps the pre-update poll ack for the ph_nps checks below)
	 */
	opa = sop->so_pollack;
	sop->so_ack = seq2;
	SEQ_SET(sop->so_pollack, sp->stat_nps);
	SEQ_SET(sop->so_sendmax, sp->stat_nmr);

	/*
	 * Get first element in STAT list, skipping empty buffers
	 */
	while (m && (KB_LEN(m) == 0))
		m = KB_NEXT(m);
	if (m == NULL)
		goto done;
	m = sscop_stat_getelem(m, &seq1);

	/*
	 * Make sure there's a second element too
	 */
	if (m == NULL)
		goto done;

	/*
	 * Validate first element (start of missing pdus)
	 */
	if (SEQ_GT(sop->so_ack, seq1, sop->so_ack) ||
	    SEQ_GEQ(seq1, sop->so_send, sop->so_ack)) {
		/*
		 * Bad element sequence number
		 */
		sscop_maa_error(sop, 'S');
		goto goterr;
	}

	/*
	 * Loop thru all STAT elements in list; elements alternate between
	 * the start of received ranges (even) and missing ranges (odd).
	 */
	while (m) {
		/*
		 * Get next even element (start of received pdus)
		 */
		m = sscop_stat_getelem(m, &seq2);

		/*
		 * Validate seqence number
		 */
		if (SEQ_GEQ(seq1, seq2, sop->so_ack) ||
		    SEQ_GT(seq2, sop->so_send, sop->so_ack)) {
			/*
			 * Bad element sequence number
			 */
			sscop_maa_error(sop, 'S');
			goto goterr;
		}

		/*
		 * Process each missing sequence number in this gap
		 */
		while (SEQ_LT(seq1, seq2, sop->so_ack)) {
			/*
			 * Find corresponding SD PDU on pending ack queue
			 */
			php = sscop_pack_locate(sop, seq1);
			if (php == NULL) {
				sscop_maa_error(sop, 'S');
				goto goterr;
			}

			/*
			 * Retransmit this SD PDU only if it was last sent
			 * during an earlier poll sequence and it's not
			 * already scheduled for retranmission.
			 */
			if (SEQ_LT(php->ph_nps, sp->stat_nps, opa) &&
			    (php->ph_rexmit_lk == NULL) &&
			    (sop->so_rexmit_tl != php)) {
				/*
				 * Put PDU on retransmit queue and schedule
				 * transmit servicing
				 */
				sscop_rexmit_insert(sop, php);
				sop->so_flags |= SOF_XMITSRVC;
				cnt++;
			}

			/*
			 * Bump to next sequence number
			 */
			SEQ_INCR(seq1, 1);
		}

		/*
		 * Now process series of acknowledged PDUs
		 *
		 * Get next odd element (start of missing pdus),
		 * but make sure there is one and that it's valid
		 */
		if (m == NULL)
			goto done;
		m = sscop_stat_getelem(m, &seq2);
		if (SEQ_GEQ(seq1, seq2, sop->so_ack) ||
		    SEQ_GT(seq2, sop->so_send, sop->so_ack)) {
			/*
			 * Bad element sequence number
			 */
			sscop_maa_error(sop, 'S');
			goto goterr;
		}

		/*
		 * Process each acked sequence number
		 */
		while (SEQ_LT(seq1, seq2, sop->so_ack)) {
			/*
			 * Can we clear transmit buffers ??
			 */
			if ((sop->so_flags & SOF_NOCLRBUF) == 0) {
				/*
				 * Yes, free acked buffers
				 */
				sscop_pack_free(sop, seq1);
			}

			/*
			 * Bump to next sequence number
			 */
			SEQ_INCR(seq1, 1);
		}
	}

done:
	/*
	 * Free PDU buffer chain
	 */
	KB_FREEALL(m0);

	/*
	 * Report retransmitted PDUs
	 */
	if (cnt)
		sscop_maa_error(sop, 'V');

	/*
	 * Record transmit window closed transitions
	 */
	if (SEQ_LT(sop->so_send, sop->so_sendmax, sop->so_ack)) {
		if (sop->so_flags & SOF_NOCREDIT) {
			/* Window just reopened. */
			sop->so_flags &= ~SOF_NOCREDIT;
			sscop_maa_error(sop, 'X');
		}
	} else {
		if ((sop->so_flags & SOF_NOCREDIT) == 0) {
			/* Window just closed. */
			sop->so_flags |= SOF_NOCREDIT;
			sscop_maa_error(sop, 'W');
		}
	}

	if (sop->so_vers == SSCOP_VERS_QSAAL)
		/*
		 * Restart lost poll/stat timer
		 */
		sop->so_timer[SSCOP_T_NORESP] = sop->so_parm.sp_timeresp;
	else {
		/*
		 * Determine new polling phase
		 */
		if ((sop->so_timer[SSCOP_T_POLL] != 0) &&
		    ((sop->so_flags & SOF_KEEPALIVE) == 0)) {
			/*
			 * Remain in active phase - reset NO-RESPONSE timer
			 */
			sop->so_timer[SSCOP_T_NORESP] =
			    sop->so_parm.sp_timeresp;
		} else if (sop->so_timer[SSCOP_T_IDLE] == 0) {
			/*
			 * Go from transient to idle phase
			 */
			sop->so_timer[SSCOP_T_POLL] = 0;
			sop->so_flags &= ~SOF_KEEPALIVE;
			sop->so_timer[SSCOP_T_NORESP] = 0;
			sop->so_timer[SSCOP_T_IDLE] =
			    sop->so_parm.sp_timeidle;
		}
	}

	/*
	 * See if transmit queues need servicing
	 */
	if (sop->so_flags & SOF_XMITSRVC)
		sscop_service_xmit(sop);

	return;

goterr:
	/*
	 * Protocol/parameter error encountered
	 */

	/*
	 * Free PDU buffer chain
	 */
	KB_FREEALL(m0);

	if (sop->so_vers == SSCOP_VERS_QSAAL)
		/*
		 * Reestablish a new connection
		 */
		qsaal1_reestablish(sop);
	else
		/*
		 * Initiate error recovery
		 */
		q2110_error_recovery(sop);

	return;
}
/*
 * When a new ack with SACK is received, check if it indicates packet
 * reordering.  If so, mark the socket and compute the late time offset
 * by which the packet was reordered with respect to its closest
 * neighboring packets.
 */
static void
tcp_sack_detect_reordering(struct tcpcb *tp, struct sackhole *s,
    tcp_seq sacked_seq, tcp_seq snd_fack)
{
	int32_t latency;

	/*
	 * A hole past snd_fack comes from new SACK information and
	 * carries no reordering signal — ignore it.
	 */
	if (SEQ_GT(s->end, snd_fack))
		return;

	/*
	 * After a retransmit timeout the timestamp on the SACK segment
	 * will be newer, which could produce a false positive; skip
	 * detection in that case.
	 */
	if (tp->t_rxtshift > 0)
		return;

	/*
	 * Reordering is indicated only when recently sacked data was
	 * never retransmitted from this hole.
	 */
	if (!SEQ_LT(s->rxmit, sacked_seq))
		return;

	tcpstat.tcps_avoid_rxmt++;

	if (tcp_detect_reordering == 1 &&
	    !(tp->t_flagsext & TF_PKTS_REORDERED)) {
		tp->t_flagsext |= TF_PKTS_REORDERED;
		tcpstat.tcps_detect_reordering++;
	}
	tcpstat.tcps_reordered_pkts++;
	tp->t_reordered_pkts++;

	/*
	 * Reordering on an ECN-enabled connection feeds the ECN
	 * fallback heuristic.
	 */
	if (TCP_ECN_ENABLED(tp)) {
		INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_reorder);
		tcpstat.tcps_ecn_fallback_reorder++;
		tcp_heuristic_ecn_aggressive(tp);
	}

	VERIFY(SEQ_GEQ(snd_fack, s->rxmit));

	if (s->rxmit_start > 0) {
		latency = timer_diff(tcp_now, 0, s->rxmit_start, 0);
		if (latency < 0)
			return;

		/*
		 * Schedule the DELAYFR timer with the maximum observed
		 * reorder window (absorbs path jitter; averaging caused
		 * spurious retransmits), capped at SRTT/2 and floored
		 * at 10 ms.
		 */
		tp->t_reorderwin = max(tp->t_reorderwin, latency);
		tp->t_reorderwin = min(tp->t_reorderwin,
		    (tp->t_srtt >> (TCP_RTT_SHIFT - 1)));
		tp->t_reorderwin = max(tp->t_reorderwin, 10);
	}
}
/* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct inpcb * const inp = tp->t_inpcb; struct socket *so = inp->inp_socket; long len, recvwin, sendwin; int nsacked = 0; int off, flags, error; #ifdef TCP_SIGNATURE int sigoff = 0; #endif struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned int ipoptlen, optlen, hdrlen; int idle; boolean_t sendalot; struct ip6_hdr *ip6 = NULL; #ifdef INET6 const boolean_t isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #else const boolean_t isipv6 = FALSE; #endif KKASSERT(so->so_port == &curthread->td_msgport); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ /* * If we have been idle for a while, the send congestion window * could be no longer representative of the current state of the link. * So unless we are expecting more acks to come in, slow-start from * scratch to re-determine the send congestion window. */ if (tp->snd_max == tp->snd_una && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { if (tcp_do_rfc3390) { int initial_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); tp->snd_cwnd = min(tp->snd_cwnd, initial_cwnd); } else { tp->snd_cwnd = tp->t_maxseg; } tp->snd_wacked = 0; } /* * Calculate whether the transmit stream was previously idle * and adjust TF_LASTIDLE for the next time. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (tp->t_flags & TF_MORETOCOME)) tp->t_flags |= TF_LASTIDLE; else tp->t_flags &= ~TF_LASTIDLE; if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt); again: /* Make use of SACK information when slow-starting after a RTO. 
*/ if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) { tcp_seq old_snd_nxt = tp->snd_nxt; tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt); nsacked += tp->snd_nxt - old_snd_nxt; } sendalot = FALSE; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCE) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.ssb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_callout_stop(tp, tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. 
*/ len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off; /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data, suppress sending * segment (sending the segment would be an option if we still * did TAO and the remote host supported it). */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT) return 0; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if (flags & TH_SYN) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (sendwin == 0) { tcp_callout_stop(tp, tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_callout_active(tp, tp->tt_persist)) tcp_setpersist(tp); } } KASSERT(len >= 0, ("%s: len < 0", __func__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. 
send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. */ if (tcp_do_autosndbuf && so->so_snd.ssb_flags & SSB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.ssb_hiwat && so->so_snd.ssb_cc >= (so->so_snd.ssb_hiwat / 8 * 7) && so->so_snd.ssb_cc < tcp_autosndbuf_max && sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) { u_long newsize; newsize = ulmin(so->so_snd.ssb_hiwat + tcp_autosndbuf_inc, tcp_autosndbuf_max); if (!ssb_reserve(&so->so_snd, newsize, so, NULL)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); if (newsize >= (TCP_MAXWIN << tp->snd_scale)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); } } /* * Truncate to the maximum segment length and ensure that FIN is * removed if the length no longer contains the last data byte. */ if (len > tp->t_maxseg) { len = tp->t_maxseg; sendalot = TRUE; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc)) flags &= ~TH_FIN; recvwin = ssb_space(&so->so_rcv); /* * Sender silly window avoidance. 
We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limiting the window size) * - we need to retransmit */ if (len) { if (len == tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.ssb_cc && !(tp->t_flags & TF_NOPUSH)) { goto send; } if (tp->t_flags & TF_FORCE) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. */ if (recvwin > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); long hiwat; /* * This ack case typically occurs when the user has drained * the TCP socket buffer sufficiently to warrent an ack * containing a 'pure window update'... that is, an ack that * ONLY updates the tcp window. * * It is unclear why we would need to do a pure window update * past 2 segments if we are going to do one at 1/2 the high * water mark anyway, especially since under normal conditions * the user program will drain the socket buffer quickly. 
* The 2-segment pure window update will often add a large * number of extra, unnecessary acks to the stream. * * avoid_pure_win_update now defaults to 1. */ if (avoid_pure_win_update == 0 || (tp->t_flags & TF_RXRESIZED)) { if (adv >= (long) (2 * tp->t_maxseg)) { goto send; } } hiwat = (long)(TCP_MAXWIN << tp->rcv_scale); if (hiwat > (long)so->so_rcv.ssb_hiwat) hiwat = (long)so->so_rcv.ssb_hiwat; if (adv >= hiwat / 2) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN))) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una)) goto send; /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_callout_active(tp, tp->tt_persist) * is true when we are in persist state. * The TF_FORCE flag in tp->t_flags * is set when we are called to send a persist packet. * tcp_callout_active(tp, tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. 
*/ if (so->so_snd.ssb_cc > 0 && !tcp_callout_active(tp, tp->tt_rexmt) && !tcp_callout_active(tp, tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ return (0); send: /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else hdrlen = sizeof(struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if (!(tp->t_flags & TF_NOOPT)) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(tp)); memcpy(opt + 2, &mss, sizeof mss); optlen = TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } if ((tcp_do_sack && !(flags & TH_ACK)) || tp->t_flags & TF_SACK_PERMITTED) { uint32_t *lp = (uint32_t *)(opt + optlen); *lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED); optlen += TCPOLEN_SACK_PERMITTED_ALIGNED; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. */ if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP && !(flags & TH_RST) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* Set receive buffer autosizing timestamp. 
*/ if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) tp->rfbuf_ts = ticks; /* * If this is a SACK connection and we have a block to report, * fill in the SACK blocks in the TCP options. */ if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) == TF_SACK_PERMITTED && (!LIST_EMPTY(&tp->t_segq) || tp->reportblk.rblk_start != tp->reportblk.rblk_end)) tcp_sack_fill_report(tp, opt, &optlen); #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { int i; u_char *bp; /* * Initialize TCP-MD5 option (RFC2385) */ bp = (u_char *)opt + optlen; *bp++ = TCPOPT_SIGNATURE; *bp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; for (i = 0; i < TCP_SIGLEN; i++) *bp++ = 0; optlen += TCPOLEN_SIGNATURE; /* * Terminate options list and maintain 32-bit alignment. */ *bp++ = TCPOPT_NOP; *bp++ = TCPOPT_EOL; optlen += 2; } #endif /* TCP_SIGNATURE */ KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options")); hdrlen += optlen; if (isipv6) { ipoptlen = ip6_optlen(inp); } else { if (inp->inp_options) { ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); } else {
static int tcpup_state_receive(struct tcpup_info *upp, struct tcpiphdr *tcp, size_t dlen) { int xflags = 0; int snd_una = htonl(tcp->th_ack); if (tcp->th_flags & TH_RST) { upp->t_state = TCPS_CLOSED; return 0; } if ((tcp->th_flags & TH_ACK) && SEQ_GT(snd_una, upp->snd_una)) { /* update snd una from peer */ upp->snd_una = snd_una; } switch (upp->t_state) { case TCPS_SYN_SENT: xflags = TH_SYN| TH_ACK; if ((tcp->th_flags & xflags) == TH_SYN) { assert((tcp->th_flags & TH_FIN) != TH_FIN); tcp_state_preload(upp, TCPS_SYN_RECEIVED, htonl(tcp->th_seq) + 1); return 0; } if ((tcp->th_flags & xflags) == xflags && SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) { assert((tcp->th_flags & TH_FIN) != TH_FIN); tcp_state_preload(upp, TCPS_ESTABLISHED, htonl(tcp->th_seq)); return 0; } break; case TCPS_SYN_RECEIVED: if ((tcp->th_flags & TH_ACK) == TH_ACK && SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) { assert((tcp->th_flags & TH_FIN) != TH_FIN); tcp_state_preload(upp, TCPS_ESTABLISHED, htonl(tcp->th_seq)); return 0; } break; case TCPS_ESTABLISHED: if ((tcp->th_flags & TH_FIN) == TH_FIN) { tcp_state_preload(upp, TCPS_CLOSE_WAIT, htonl(tcp->th_seq) + 1); return 0; } break; case TCPS_FIN_WAIT_1: xflags = TH_FIN| TH_ACK; if ((tcp->th_flags & xflags) == xflags && SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) { tcp_state_preload(upp, TCPS_TIME_WAIT, htonl(tcp->th_seq) + dlen + 1); return 0; } if ((tcp->th_flags & TH_FIN) == TH_FIN) { tcp_state_preload(upp, TCPS_CLOSING, htonl(tcp->th_seq) + dlen + 1); return 0; } if ((tcp->th_flags & TH_ACK) == TH_ACK && SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) { tcp_state_preload(upp, TCPS_FIN_WAIT_2, htonl(tcp->th_seq) + dlen); return 0; } break; case TCPS_FIN_WAIT_2: if ((tcp->th_flags & TH_FIN) == TH_FIN) { tcp_state_preload(upp, TCPS_TIME_WAIT, htonl(tcp->th_seq) + dlen + 1); return 0; } break; case TCPS_CLOSING: if ((tcp->th_flags & TH_ACK) == TH_ACK && SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) { tcp_state_preload(upp, TCPS_TIME_WAIT, htonl(tcp->th_seq) + 
dlen); return 0; } break; case TCPS_LAST_ACK: if ((tcp->th_flags & TH_ACK) == TH_ACK && SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) { tcp_state_preload(upp, TCPS_CLOSED, htonl(tcp->th_seq) + dlen); return 0; } break; case TCPS_TIME_WAIT: fprintf(stderr, "before TIME_WAIT -> TIME_WAIT\n"); break; } return 0; }
/*
 * Refresh the receiver-side SACK list after new valid data arrived at
 * [rcv_start, rcv_end) while not in header prediction mode.
 *
 * Per RFC 2018 the first reported block MUST describe the most recently
 * received segment; previously reported blocks follow in the order they
 * arrived.  That is exactly the layout this routine rebuilds.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
	struct sackblk merged;			/* block for the new segment, grown by merging */
	struct sackblk kept[MAX_SACK_BLKS];	/* old blocks carried over unchanged */
	int nkept = 0;
	int nhead = 0;
	int i;

	/* Seed the head block with the segment just received. */
	merged.start = rcv_start;
	merged.end = rcv_end;

	/*
	 * Single pass over the existing list: discard stale blocks, fold
	 * anything touching `merged' into it, and keep the rest as-is.
	 */
	for (i = 0; i < tp->rcv_numsacks; i++) {
		tcp_seq s = tp->sackblks[i].start;
		tcp_seq e = tp->sackblks[i].end;

		if (SEQ_GEQ(s, e) || SEQ_LEQ(s, tp->rcv_nxt)) {
			/* Empty/invalid block, or already below rcv_nxt. */
			continue;
		}
		if (SEQ_GT(s, merged.end) || SEQ_LT(e, merged.start)) {
			/* Disjoint from the new block: carry it over. */
			kept[nkept].start = s;
			kept[nkept].end = e;
			nkept++;
			continue;
		}
		/* Overlaps or abuts the new block: absorb it. */
		if (SEQ_LT(s, merged.start))
			merged.start = s;
		if (SEQ_GT(e, merged.end))
			merged.end = e;
	}

	/*
	 * Only an out-of-order segment (strictly beyond rcv_nxt) is
	 * reported; it goes in slot 0 ahead of the carried-over blocks.
	 */
	if (SEQ_GT(merged.start, tp->rcv_nxt)) {
		tp->sackblks[0] = merged;
		nhead = 1;
		/* Make room: drop the last carried-over block on overflow. */
		if (nkept >= MAX_SACK_BLKS)
			nkept--;
	}

	/* Write the surviving blocks back after the (optional) head. */
	for (i = 0; i < nkept; i++)
		tp->sackblks[nhead + i] = kept[i];

	tp->rcv_numsacks = nhead + nkept;
}
/*
 * Process cumulative ACK and the TCP SACK option to update the scoreboard.
 * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
 * the sequence space).
 *
 * tp:     connection control block whose scoreboard is updated.
 * to:     parsed TCP options of the received segment (SACK blocks, if any).
 * th_ack: cumulative ACK carried by the segment.
 *
 * Side effects: may insert/remove/trim holes in tp->snd_holes, advance
 * tp->snd_fack, and adjust tp->sackhint.sack_bytes_rexmit and
 * tp->sackhint.last_sack_ack.
 */
void
tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
{
	struct sackhole *cur, *temp;
	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
	int i, j, num_sack_blks;

//ScenSim-Port//	INP_WLOCK_ASSERT(tp->t_inpcb);

	num_sack_blks = 0;
	/*
	 * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
	 * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
	 */
	if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
		sack_blocks[num_sack_blks].start = tp->snd_una;
		sack_blocks[num_sack_blks++].end = th_ack;
	}
	/*
	 * Append received valid SACK blocks to sack_blocks[], but only if we
	 * received new blocks from the other side.  Each candidate must be
	 * non-empty, strictly above both SND.UNA and SEG.ACK, and entirely
	 * within (SND.UNA, SND.MAX].
	 */
	if (to->to_flags & TOF_SACK) {
		for (i = 0; i < to->to_nsacks; i++) {
			/* Option bytes may be unaligned; copy before use. */
			bcopy((to->to_sacks + i * TCPOLEN_SACK),
			    &sack, sizeof(sack));
			sack.start = ntohl(sack.start);
			sack.end = ntohl(sack.end);
			if (SEQ_GT(sack.end, sack.start) &&
			    SEQ_GT(sack.start, tp->snd_una) &&
			    SEQ_GT(sack.start, th_ack) &&
			    SEQ_LT(sack.start, tp->snd_max) &&
			    SEQ_GT(sack.end, tp->snd_una) &&
			    SEQ_LEQ(sack.end, tp->snd_max))
				sack_blocks[num_sack_blks++] = sack;
		}
	}
	/*
	 * Return if SND.UNA is not advanced and no valid SACK block is
	 * received.
	 */
	if (num_sack_blks == 0)
		return;
	/*
	 * Sort the SACK blocks so we can update the scoreboard with just one
	 * pass. The overhead of sorting upto 4+1 elements is less than
	 * making upto 4+1 passes over the scoreboard.  (Simple exchange
	 * sort by ascending end sequence; n <= 5 so O(n^2) is fine.)
	 */
	for (i = 0; i < num_sack_blks; i++) {
		for (j = i + 1; j < num_sack_blks; j++) {
			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
				sack = sack_blocks[i];
				sack_blocks[i] = sack_blocks[j];
				sack_blocks[j] = sack;
			}
		}
	}
	if (TAILQ_EMPTY(&tp->snd_holes))
		/*
		 * Empty scoreboard. Need to initialize snd_fack (it may be
		 * uninitialized or have a bogus value). Scoreboard holes
		 * (from the sack blocks received) are created later below
		 * (in the logic that adds holes to the tail of the
		 * scoreboard).
		 */
		tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
	/*
	 * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
	 * SACK holes (snd_holes) are traversed from their tails with just
	 * one pass in order to reduce the number of compares especially when
	 * the bandwidth-delay product is large.
	 *
	 * Note: Typically, in the first RTT of SACK recovery, the highest
	 * three or four SACK blocks with the same ack number are received.
	 * In the second RTT, if retransmitted data segments are not lost,
	 * the highest three or four SACK blocks with ack number advancing
	 * are received.
	 */
	sblkp = &sack_blocks[num_sack_blks - 1];	/* Last SACK block */
	tp->sackhint.last_sack_ack = sblkp->end;
	if (SEQ_LT(tp->snd_fack, sblkp->start)) {
		/*
		 * The highest SACK block is beyond fack. Append new SACK
		 * hole at the tail. If the second or later highest SACK
		 * blocks are also beyond the current fack, they will be
		 * inserted by way of hole splitting in the while-loop below.
		 */
		temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
		if (temp != NULL) {
			tp->snd_fack = sblkp->end;
			/* Go to the previous sack block. */
			sblkp--;
		} else {
			/*
			 * We failed to add a new hole based on the current
			 * sack block. Skip over all the sack blocks that
			 * fall completely to the right of snd_fack and
			 * proceed to trim the scoreboard based on the
			 * remaining sack blocks. This also trims the
			 * scoreboard for th_ack (which is sack_blocks[0]).
			 */
			while (sblkp >= sack_blocks &&
			    SEQ_LT(tp->snd_fack, sblkp->start))
				sblkp--;
			if (sblkp >= sack_blocks &&
			    SEQ_LT(tp->snd_fack, sblkp->end))
				tp->snd_fack = sblkp->end;
		}
	} else if (SEQ_LT(tp->snd_fack, sblkp->end))
		/* fack is advanced. */
		tp->snd_fack = sblkp->end;
	/* We must have at least one SACK hole in scoreboard. */
//ScenSim-Port//	KASSERT(!TAILQ_EMPTY(&tp->snd_holes),
//ScenSim-Port//	    ("SACK scoreboard must not be empty"));
	cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */
	/*
	 * Since the incoming sack blocks are sorted, we can process them
	 * making one sweep of the scoreboard.
	 */
	while (sblkp >= sack_blocks && cur != NULL) {
		if (SEQ_GEQ(sblkp->start, cur->end)) {
			/*
			 * SACKs data beyond the current hole. Go to the
			 * previous sack block.
			 */
			sblkp--;
			continue;
		}
		if (SEQ_LEQ(sblkp->end, cur->start)) {
			/*
			 * SACKs data before the current hole. Go to the
			 * previous hole.
			 */
			cur = TAILQ_PREV(cur, sackhole_head, scblink);
			continue;
		}
		/*
		 * The block overlaps this hole.  Temporarily back the
		 * hole's retransmit contribution out of the hint; it is
		 * re-added (with the hole's new bounds) after trimming.
		 */
		tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
//ScenSim-Port//	KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
//ScenSim-Port//	    ("sackhint bytes rtx >= 0"));
		if (SEQ_LEQ(sblkp->start, cur->start)) {
			/* Data acks at least the beginning of hole. */
			if (SEQ_GEQ(sblkp->end, cur->end)) {
				/* Acks entire hole, so delete hole. */
				temp = cur;
				cur = TAILQ_PREV(cur, sackhole_head, scblink);
				tcp_sackhole_remove(tp, temp);
				/*
				 * The sack block may ack all or part of the
				 * next hole too, so continue onto the next
				 * hole.  (Skips the hint re-add below: the
				 * deleted hole contributes nothing.)
				 */
				continue;
			} else {
				/* Move start of hole forward. */
				cur->start = sblkp->end;
				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
			}
		} else {
			/* Data acks at least the end of hole. */
			if (SEQ_GEQ(sblkp->end, cur->end)) {
				/* Move end of hole backward. */
				cur->end = sblkp->start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
			} else {
				/*
				 * ACKs some data in middle of a hole; need
				 * to split current hole
				 */
				temp = tcp_sackhole_insert(tp, sblkp->end,
				    cur->end, cur);
				if (temp != NULL) {
					if (SEQ_GT(cur->rxmit, temp->rxmit)) {
						temp->rxmit = cur->rxmit;
						tp->sackhint.sack_bytes_rexmit
						    += (temp->rxmit
						    - temp->start);
					}
					cur->end = sblkp->start;
					cur->rxmit = SEQ_MIN(cur->rxmit,
					    cur->end);
				}
			}
		}
		/* Restore the (possibly shrunken) hole's hint contribution. */
		tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
		/*
		 * Testing sblkp->start against cur->start tells us whether
		 * we're done with the sack block or the sack hole.
		 * Accordingly, we advance one or the other.
		 */
		if (SEQ_LEQ(sblkp->start, cur->start))
			cur = TAILQ_PREV(cur, sackhole_head, scblink);
		else
			sblkp--;
	}
}
/*
 * Insert a received segment into the per-connection reassembly queue,
 * trimming overlap with queued segments, then deliver any in-order data
 * (advancing tp->rcv_nxt) to the socket's receive buffer.
 *
 * tp: connection control block (tp->seg_next heads the reassembly list).
 * ti: TCP/IP header of the segment, or 0 to flush pre-ESTABLISHED data.
 * m:  mbuf chain holding the segment's payload; ownership passes here
 *     (freed on duplicate/overlap or when the socket can't receive).
 *
 * Returns TH_FIN if a queued FIN was consumed while advancing rcv_nxt,
 * otherwise 0.
 *
 * NOTE(review): the remque()/insque() calls that would actually unlink
 * and link queue entries are commented out (FIXME below), so the queue
 * is never modified here -- presumably handled elsewhere or still
 * unfinished; confirm before relying on out-of-order reassembly.
 */
int
tcp_reass(struct tcpcb *tp,struct tcpiphdr *ti, usn_mbuf_t *m)
{
   struct tcpiphdr *q;
   struct usn_socket *so = tp->t_inpcb->inp_socket;
   int flags;

   // Call with ti==0 after become established to
   // force pre-ESTABLISHED data up to user socket.
   if (ti == 0)
      goto present;

   // Find a segment which begins after this one does.
   // The queue is circular: (struct tcpiphdr *)tp is the sentinel.
   for (q = tp->seg_next; q != (struct tcpiphdr *)tp;
       q = (struct tcpiphdr *)q->ti_next)
      if (SEQ_GT(q->ti_seq, ti->ti_seq))
         break;

   // If there is a preceding segment, it may provide some of
   // our data already. If so, drop the data from the incoming
   // segment. If it provides all of our data, drop us.
   if ((struct tcpiphdr *)q->ti_prev != (struct tcpiphdr *)tp) {
      int i;
      q = (struct tcpiphdr *)q->ti_prev;
      // conversion to int (in i) handles seq wraparound
      i = q->ti_seq + q->ti_len - ti->ti_seq;
      if (i > 0) {
         if (i >= ti->ti_len) {
            // Fully duplicated by the preceding segment: count and drop.
            g_tcpstat.tcps_rcvduppack++;
            g_tcpstat.tcps_rcvdupbyte += ti->ti_len;
            usn_free_mbuf(m);
            return (0);
         }
         // Partial overlap: trim the leading i bytes we already have.
         m_adj(m, i);
         ti->ti_len -= i;
         ti->ti_seq += i;
      }
      q = (struct tcpiphdr *)(q->ti_next);
   }
   g_tcpstat.tcps_rcvoopack++;
   g_tcpstat.tcps_rcvoobyte += ti->ti_len;
   // Stash the mbuf pointer in the header for later delivery.
   REASS_MBUF(ti) = m; // XXX: wrong assumption dst and src port for mbuf pointer

   // While we overlap succeeding segments trim them or,
   // if they are completely covered, dequeue them.
   while (q != (struct tcpiphdr *)tp) {
      int i = (ti->ti_seq + ti->ti_len) - q->ti_seq;
      if (i <= 0)
         break;
      if (i < q->ti_len) {
         // Partial overlap: trim the successor's leading bytes.
         q->ti_seq += i;
         q->ti_len -= i;
         m_adj(REASS_MBUF(q), i);
         break;
      }
      // Successor fully covered: free its data.
      q = (struct tcpiphdr *)q->ti_next;
      m = REASS_MBUF((struct tcpiphdr *)q->ti_prev);
      // FIXME
      //remque(q->ti_prev);
      usn_free_mbuf(m);
   }

   // FIXME: Stick new segment in its place.
   //insque(ti, q->ti_prev);

present:
   // Present data to user, advancing rcv_nxt through
   // completed sequence space.
   if (TCPS_HAVERCVDSYN(tp->t_state) == 0)
      return (0);
   ti = tp->seg_next;
   // Nothing deliverable unless the head segment is exactly rcv_nxt.
   if (ti == (struct tcpiphdr *)tp || ti->ti_seq != tp->rcv_nxt)
      return (0);
   // Hold data back until the connection is fully established.
   if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len)
      return (0);
   do {
      tp->rcv_nxt += ti->ti_len;
      flags = ti->ti_flags & TH_FIN;
      // FIXME
      //remque(ti);
      m = REASS_MBUF(ti);
      ti = (struct tcpiphdr *)ti->ti_next;
      if (so->so_state & USN_CANTRCVMORE) {
         // Receiver closed: discard instead of appending.
         usn_free_mbuf(m);
      } else {
         sbappend(so->so_rcv, m);
      }
   } while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt);
   // Wake any reader blocked on the socket and signal the event layer.
   sorwakeup(so);
   usnet_tcpin_rwakeup(so, USN_TCP_IN, USN_TCPEV_READ, m);
   return (flags);
}
/* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ void tcp_input(usn_mbuf_t *m, int iphlen) { struct tcpiphdr *ti; struct inpcb *inp; u_char *optp = NULL; int optlen; int len, tlen, off; struct tcpcb *tp = 0; int tiflags; struct usn_socket *so = 0; int todrop, acked, ourfinisacked; int needoutput = 0; short ostate; struct usn_in_addr laddr; int dropsocket = 0; int iss = 0; u_long tiwin, ts_val, ts_ecr; int ts_present = 0; (void)needoutput; g_tcpstat.tcps_rcvtotal++; // Get IP and TCP header together in first mbuf. // Note: IP leaves IP header in first mbuf. ti = mtod(m, struct tcpiphdr *); if (iphlen > sizeof (usn_ip_t)) ip_stripoptions(m, (usn_mbuf_t *)0); if (m->mlen < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { g_tcpstat.tcps_rcvshort++; return; } ti = mtod(m, struct tcpiphdr *); } #ifdef DUMP_PAYLOAD dump_chain(m,"tcp"); #endif /* * Checksum extended TCP header and data. */ tlen = ntohs(((usn_ip_t *)ti)->ip_len); len = sizeof (usn_ip_t) + tlen; ti->ti_next = ti->ti_prev = 0; ti->ti_x1 = 0; ti->ti_len = (u_short)tlen; HTONS(ti->ti_len); ti->ti_sum = in_cksum(m, len); if (ti->ti_sum) { g_tcpstat.tcps_rcvbadsum++; goto drop; } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = ti->ti_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { g_tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; ti->ti_len = tlen; if (off > sizeof (struct tcphdr)) { if (m->mlen < sizeof(usn_ip_t) + off) { if ((m = m_pullup(m, sizeof (usn_ip_t) + off)) == 0) { g_tcpstat.tcps_rcvshort++; return; } ti = mtod(m, struct tcpiphdr *); } optlen = off - sizeof (struct tcphdr); optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); // Do quick retrieval of timestamp options ("options // prediction?"). 
If timestamp is the only option and it's // formatted as recommended in RFC 1323 appendix A, we // quickly get the values now and not bother calling // tcp_dooptions(), etc. if ((optlen == TCPOLEN_TSTAMP_APPA || (optlen > TCPOLEN_TSTAMP_APPA && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && *(u_int *)optp == htonl(TCPOPT_TSTAMP_HDR) && (ti->ti_flags & TH_SYN) == 0) { ts_present = 1; ts_val = ntohl(*(u_long *)(optp + 4)); ts_ecr = ntohl(*(u_long *)(optp + 8)); optp = NULL; // we've parsed the options } } tiflags = ti->ti_flags; // Convert TCP protocol specific fields to host format. NTOHL(ti->ti_seq); NTOHL(ti->ti_ack); NTOHS(ti->ti_win); NTOHS(ti->ti_urp); // Locate pcb for segment. findpcb: inp = g_tcp_last_inpcb; if (inp->inp_lport != ti->ti_dport || inp->inp_fport != ti->ti_sport || inp->inp_faddr.s_addr != ti->ti_src.s_addr || inp->inp_laddr.s_addr != ti->ti_dst.s_addr) { inp = in_pcblookup(&g_tcb, ti->ti_src, ti->ti_sport, ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD); if (inp) g_tcp_last_inpcb = inp; ++g_tcpstat.tcps_pcbcachemiss; } // If the state is CLOSED (i.e., TCB does not exist) then // all data in the incoming segment is discarded. // If the TCB exists but is in CLOSED state, it is embryonic, // but should either do a listen or a connect soon. if (inp == 0) goto dropwithreset; tp = intotcpcb(inp); DEBUG("found inp cb, laddr=%x, lport=%d, faddr=%x," " fport=%d, tp_state=%d, tp_flags=%d", inp->inp_laddr.s_addr, inp->inp_lport, inp->inp_faddr.s_addr, inp->inp_fport, tp->t_state, tp->t_flags); if (tp == 0) goto dropwithreset; if (tp->t_state == TCPS_CLOSED) goto drop; // Unscale the window into a 32-bit value. 
if ((tiflags & TH_SYN) == 0) tiwin = ti->ti_win << tp->snd_scale; else tiwin = ti->ti_win; so = inp->inp_socket; DEBUG("socket info, options=%x", so->so_options); if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { if (so->so_options & SO_DEBUG) { ostate = tp->t_state; g_tcp_saveti = *ti; } if (so->so_options & SO_ACCEPTCONN) { if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { // Note: dropwithreset makes sure we don't // send a reset in response to a RST. if (tiflags & TH_ACK) { g_tcpstat.tcps_badsyn++; goto dropwithreset; } DEBUG("SYN is expected, tiflags=%d", tiflags); goto drop; } so = sonewconn(so, 0); if (so == 0) { DEBUG("failed to create new connection, tiflags=%d", tiflags); goto drop; } // Mark socket as temporary until we're // committed to keeping it. The code at // ``drop'' and ``dropwithreset'' check the // flag dropsocket to see if the temporary // socket created here should be discarded. // We mark the socket as discardable until // we're committed to it below in TCPS_LISTEN. dropsocket++; inp = (struct inpcb *)so->so_pcb; inp->inp_laddr = ti->ti_dst; inp->inp_lport = ti->ti_dport; // BSD >= 4.3 inp->inp_options = ip_srcroute(); tp = intotcpcb(inp); tp->t_state = TCPS_LISTEN; // Compute proper scaling value from buffer space while (tp->request_r_scale < TCP_MAX_WINSHIFT && TCP_MAXWIN << tp->request_r_scale < so->so_rcv->sb_hiwat) tp->request_r_scale++; } } // Segment received on connection. // Reset idle time and keep-alive timer. tp->t_idle = 0; tp->t_timer[TCPT_KEEP] = g_tcp_keepidle; // Process options if not in LISTEN state, // else do it below (after getting remote address). if (optp && tp->t_state != TCPS_LISTEN) tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); // Header prediction: check for the two common cases // of a uni-directional data xfer. If the packet has // no control flags, is in-sequence, the window didn't // change and we're not retransmitting, it's a // candidate. 
If the length is zero and the ack moved // forward, we're the sender side of the xfer. Just // free the data acked & wake any higher level process // that was blocked waiting for space. If the length // is non-zero and the ack didn't move, we're the // receiver side. If we're getting packets in-order // (the reassembly queue is empty), add the data to // the socket buffer and note that we need a delayed ack. if (tp->t_state == TCPS_ESTABLISHED && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && ti->ti_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { // If last ACK falls within this segment's sequence numbers, // record the timestamp. if ( ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ){ tp->ts_recent_age = g_tcp_now; tp->ts_recent = ts_val; } if (ti->ti_len == 0) { if (SEQ_GT(ti->ti_ack, tp->snd_una) && SEQ_LEQ(ti->ti_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd) { // this is a pure ack for outstanding data. ++g_tcpstat.tcps_predack; if (ts_present) tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1); else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp, tp->t_rtt); acked = ti->ti_ack - tp->snd_una; g_tcpstat.tcps_rcvackpack++; g_tcpstat.tcps_rcvackbyte += acked; TRACE("drop so_snd buffer, drop_bytes=%d, len=%d", acked, so->so_snd.sb_cc); sbdrop(so->so_snd, acked); tp->snd_una = ti->ti_ack; usn_free_cmbuf(m); // If all outstanding data are acked, stop // retransmit timer, otherwise restart timer // using current (possibly backed-off) value. // If process is waiting for space, // wakeup/selwakeup/signal. If data // are ready to send, let tcp_output // decide between more output or persist. 
if (tp->snd_una == tp->snd_max) tp->t_timer[TCPT_REXMT] = 0; else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); //if (so->so_snd->sb_flags & SB_NOTIFY) { // usnet_tcpin_wwakeup(so, USN_TCP_IN, usn_tcpev_sbnotify, 0); // sowwakeup(so); //} // send buffer is available for app thread. usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0); if (so->so_snd->sb_cc) tcp_output(tp); return; } } else if (ti->ti_ack == tp->snd_una && tp->seg_next == (struct tcpiphdr *)tp && ti->ti_len <= sbspace(so->so_rcv)) { // this is a pure, in-sequence data packet // with nothing on the reassembly queue and // we have enough buffer space to take it. ++g_tcpstat.tcps_preddat; tp->rcv_nxt += ti->ti_len; g_tcpstat.tcps_rcvpack++; g_tcpstat.tcps_rcvbyte += ti->ti_len; // Drop TCP, IP headers and TCP options then add data // to socket buffer. m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); TRACE("add data to rcv buf"); sbappend(so->so_rcv, m); sorwakeup(so); // new data is available for app threads. usnet_tcpin_rwakeup(so, USN_TCP_IN, USN_TCPEV_READ, m); if (so->so_options & SO_DEBUG) { TRACE("tcp trace, so_options=%d", so->so_options); tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); } tp->t_flags |= TF_DELACK; return; } } // Drop TCP, IP headers and TCP options. m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); // Calculate amount of space in receive window, // and then do TCP input processing. // Receive window is amount of space in rcv queue, // but not less than advertised window. { int win; win = sbspace(so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { // If the state is LISTEN then ignore segment if it contains an RST. 
// If the segment contains an ACK then it is bad and send a RST. // If it does not contain a SYN then it is not interesting; drop it. // Don't bother responding if the destination was a broadcast. // Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial // tp->iss, and send a segment: // <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> // Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. // Fill in remote peer address fields if not previously specified. // Enter SYN_RECEIVED state, and process any other fields of this // segment in this state. case TCPS_LISTEN: { usn_mbuf_t *am; struct usn_sockaddr_in *sin; if (tiflags & TH_RST) goto drop; if (tiflags & TH_ACK) goto dropwithreset; if ((tiflags & TH_SYN) == 0) goto drop; // RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN // in_broadcast() should never return true on a received // packet with M_BCAST not set. //if (m->m_flags & (M_BCAST|M_MCAST) || // IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) // goto drop; am = usn_get_mbuf(0, BUF_MSIZE, 0); // XXX: the size! 
if (am == NULL) goto drop; am->mlen = sizeof (struct usn_sockaddr_in); sin = mtod(am, struct usn_sockaddr_in *); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ti->ti_src; sin->sin_port = ti->ti_sport; bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == USN_INADDR_ANY) inp->inp_laddr = ti->ti_dst; if (in_pcbconnect(inp, am)) { inp->inp_laddr = laddr; usn_free_mbuf(am); goto drop; } usn_free_mbuf(am); tp->t_template = tcp_template(tp); if (tp->t_template == 0) { tp = tcp_drop(tp, ENOBUFS); dropsocket = 0; // socket is already gone goto drop; } if (optp) tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); if (iss) tp->iss = iss; else tp->iss = g_tcp_iss; g_tcp_iss += TCP_ISSINCR/4; tp->irs = ti->ti_seq; tcp_sendseqinit(tp); tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_SYN_RECEIVED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0); tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; dropsocket = 0; // committed to socket g_tcpstat.tcps_accepts++; goto trimthenstep6; } // If the state is SYN_SENT: // if seg contains an ACK, but not for our SYN, drop the input. // if seg contains a RST, then drop the connection. // if seg does not contain SYN, then drop it. 
// Otherwise this is an acceptable SYN segment // initialize tp->rcv_nxt and tp->irs // if seg contains ack then advance tp->snd_una // if SYN has been acked change to ESTABLISHED else SYN_RCVD state // arrange for segment to be acked (eventually) // continue processing rest of data/controls, beginning with URG case TCPS_SYN_SENT: if ((tiflags & TH_ACK) && (SEQ_LEQ(ti->ti_ack, tp->iss) || SEQ_GT(ti->ti_ack, tp->snd_max))) goto dropwithreset; if (tiflags & TH_RST) { if (tiflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((tiflags & TH_SYN) == 0) goto drop; if (tiflags & TH_ACK) { tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; tp->t_timer[TCPT_REXMT] = 0; } tp->irs = ti->ti_seq; tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; TRACE("ack now, tp flags=%d", tp->t_flags); // XXX: remove second test. if (tiflags & TH_ACK /*&& SEQ_GT(tp->snd_una, tp->iss)*/) { g_tcpstat.tcps_connects++; soisconnected(so); TRACE("change tcp state to TCPS_ESTABLISHED," " state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_ESTABLISHED; // Do window scaling on this connection? if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0); // if we didn't have to retransmit the SYN, // use its rtt as our initial srtt & rtt var. if (tp->t_rtt) tcp_xmit_timer(tp, tp->t_rtt); } else { TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_SYN_RECEIVED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0); } trimthenstep6: // Advance ti->ti_seq to correspond to first data byte. // If data, trim to stay within window, // dropping FIN if necessary. 
ti->ti_seq++; if (ti->ti_len > tp->rcv_wnd) { todrop = ti->ti_len - tp->rcv_wnd; m_adj(m, -todrop); ti->ti_len = tp->rcv_wnd; tiflags &= ~TH_FIN; g_tcpstat.tcps_rcvpackafterwin++; g_tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = ti->ti_seq - 1; tp->rcv_up = ti->ti_seq; goto step6; } // States other than LISTEN or SYN_SENT. // First check timestamp, if present. // Then check that at least some bytes of segment are within // receive window. If segment begins before rcv_nxt, // drop leading data (and SYN); if nothing left, just ack. // // RFC 1323 PAWS: If we have a timestamp reply on this segment // and it's less than ts_recent, drop it. if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && TSTMP_LT(ts_val, tp->ts_recent)) { // Check to see if ts_recent is over 24 days old. if ((int)(g_tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { // Invalidate ts_recent. If this segment updates // ts_recent, the age will be reset later and ts_recent // will get a valid value. If it does not, setting // ts_recent to zero will at least satisfy the // requirement that zero be placed in the timestamp // echo reply when ts_recent isn't valid. The // age isn't reset until we get a valid ts_recent // because we don't want out-of-order segments to be // dropped when ts_recent is old. tp->ts_recent = 0; } else { g_tcpstat.tcps_rcvduppack++; g_tcpstat.tcps_rcvdupbyte += ti->ti_len; g_tcpstat.tcps_pawsdrop++; goto dropafterack; } } todrop = tp->rcv_nxt - ti->ti_seq; if (todrop > 0) { if (tiflags & TH_SYN) { tiflags &= ~TH_SYN; ti->ti_seq++; if (ti->ti_urp > 1) ti->ti_urp--; else tiflags &= ~TH_URG; todrop--; } if ( todrop >= ti->ti_len || ( todrop == ti->ti_len && (tiflags & TH_FIN ) == 0 ) ) { // Any valid FIN must be to the left of the window. // At this point the FIN must be a duplicate or // out of sequence; drop it. tiflags &= ~TH_FIN; // Send an ACK to resynchronize and drop any data // But keep on processing for RST or ACK. 
tp->t_flags |= TF_ACKNOW; TRACE("send ack now to resync, tp_flags=%d", tp->t_flags); todrop = ti->ti_len; g_tcpstat.tcps_rcvdupbyte += ti->ti_len; g_tcpstat.tcps_rcvduppack++; } else { g_tcpstat.tcps_rcvpartduppack++; g_tcpstat.tcps_rcvpartdupbyte += ti->ti_len; } m_adj(m, todrop); ti->ti_seq += todrop; ti->ti_len -= todrop; if (ti->ti_urp > todrop) ti->ti_urp -= todrop; else { tiflags &= ~TH_URG; ti->ti_urp = 0; } } // If new data are received on a connection after the // user processes are gone, then RST the other end. if ((so->so_state & USN_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { tp = tcp_close(tp); g_tcpstat.tcps_rcvafterclose++; goto dropwithreset; } // If segment ends after window, drop trailing data // (and PUSH and FIN); if nothing left, just ACK. todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { g_tcpstat.tcps_rcvpackafterwin++; if (todrop >= ti->ti_len) { g_tcpstat.tcps_rcvbyteafterwin += ti->ti_len; // If a new connection request is received // while in TIME_WAIT, drop the old connection // and start over if the sequence numbers // are above the previous ones. if (tiflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { iss = tp->snd_nxt + TCP_ISSINCR; tp = tcp_close(tp); goto findpcb; } // If window is closed can only take segments at // window edge, and have to drop data and PUSH from // incoming segments. Continue processing, but // remember to ack. Otherwise, drop segment // and ack. if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; g_tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else g_tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); ti->ti_len -= todrop; tiflags &= ~(TH_PUSH|TH_FIN); } // check valid timestamp. Replace code above. 
if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ) { tp->ts_recent_age = g_tcp_now; tp->ts_recent = ts_val; } // If the RST bit is set examine the state: // SYN_RECEIVED STATE: // If passive open, return to LISTEN state. // If active open, inform user that connection was refused. // ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: // Inform user that connection was reset, and close tcb. // CLOSING, LAST_ACK, TIME_WAIT STATES // Close the tcb. if (tiflags&TH_RST) switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: DEBUG("change tcp state to TCPS_CLOSED, state=%d", tp->t_state); tp->t_state = TCPS_CLOSED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSED, 0); g_tcpstat.tcps_drops++; tp = tcp_close(tp); goto drop; case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: tp = tcp_close(tp); goto drop; } // If a SYN is in the window, then this is an // error and we send an RST and drop the connection. if (tiflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); goto dropwithreset; } // If the ACK bit is off we drop the segment and return. if ((tiflags & TH_ACK) == 0) goto drop; // Ack processing. switch (tp->t_state) { // In SYN_RECEIVED state if the ack ACKs our SYN then enter // ESTABLISHED state and continue processing, otherwise // send an RST. case TCPS_SYN_RECEIVED: if (SEQ_GT(tp->snd_una, ti->ti_ack) || SEQ_GT(ti->ti_ack, tp->snd_max)) goto dropwithreset; g_tcpstat.tcps_connects++; DEBUG("change tcp state to TCPS_ESTABLISHED, state=%d", tp->t_state); tp->t_state = TCPS_ESTABLISHED; soisconnected(so); // Do window scaling? 
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0); tp->snd_wl1 = ti->ti_seq - 1; // fall into ... // In ESTABLISHED state: drop duplicate ACKs; ACK out of range // ACKs. If the ack is in the range // tp->snd_una < ti->ti_ack <= tp->snd_max // then advance tp->snd_una to ti->ti_ack and drop // data from the retransmission queue. If this ACK reflects // more up to date window information we update our window information. case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { g_tcpstat.tcps_rcvdupack++; // If we have outstanding data (other than // a window probe), this is a completely // duplicate ack (ie, window info didn't // change), the ack is the biggest we've // seen and we've seen exactly our rexmt // threshhold of them, assume a packet // has been dropped and retransmit it. // Kludge snd_nxt & the congestion // window so we send only this one // packet. // // We know we're losing at the current // window size so do congestion avoidance // (set ssthresh to half the current window // and pull our congestion window back to // the new ssthresh). // // Dup acks mean that packets have left the // network (they're now cached at the receiver) // so bump cwnd by the amount in the receiver // to keep a constant cwnd packets in the // network. 
if (tp->t_timer[TCPT_REXMT] == 0 || ti->ti_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == g_tcprexmtthresh) { // congestion avoidance tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_timer[TCPT_REXMT] = 0; tp->t_rtt = 0; tp->snd_nxt = ti->ti_ack; tp->snd_cwnd = tp->t_maxseg; tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > g_tcprexmtthresh) { tp->snd_cwnd += tp->t_maxseg; tcp_output(tp); goto drop; } } else tp->t_dupacks = 0; break; } // If the congestion window was inflated to account // for the other side's cached packets, retract it. if (tp->t_dupacks > g_tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; if (SEQ_GT(ti->ti_ack, tp->snd_max)) { g_tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } acked = ti->ti_ack - tp->snd_una; g_tcpstat.tcps_rcvackpack++; g_tcpstat.tcps_rcvackbyte += acked; // If we have a timestamp reply, update smoothed // round trip time. If no timestamp is present but // transmit timer is running and timed sequence // number was acked, update smoothed round trip time. // Since we now have an rtt measurement, cancel the // timer backoff (cf., Phil Karn's retransmit alg.). // Recompute the initial retransmit timer. if (ts_present) tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1); else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp,tp->t_rtt); // If all outstanding data is acked, stop retransmit // timer and remember to restart (more output or persist). // If there is more data to be acked, restart retransmit // timer, using current (possibly backed-off) value. 
if (ti->ti_ack == tp->snd_max) { tp->t_timer[TCPT_REXMT] = 0; DEBUG("change needoutput to 1"); needoutput = 1; tp->t_flags |= TF_NEEDOUTPUT; } else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; // When new data is acked, open the congestion window. // If the window gives us less than ssthresh packets // in flight, open exponentially (maxseg per packet). // Otherwise open linearly: maxseg per window // (maxseg * (maxseg / cwnd) per packet). { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); } if (acked > so->so_snd->sb_cc) { tp->snd_wnd -= so->so_snd->sb_cc; DEBUG("drop all so_snd buffer, drop_bytes=%d, acked=%d", so->so_snd->sb_cc, acked); sbdrop(so->so_snd, (int)so->so_snd->sb_cc); ourfinisacked = 1; } else { DEBUG("drop so_snd buffer, drop_bytes=%d, len=%d", acked, so->so_snd->sb_cc); sbdrop(so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } //if (so->so_snd->sb_flags & SB_NOTIFY) { sowwakeup(so); usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0); //} tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { // In FIN_WAIT_1 STATE in addition to the processing // for the ESTABLISHED state if our FIN is now acknowledged // then enter FIN_WAIT_2. case TCPS_FIN_WAIT_1: if (ourfinisacked) { // If we can't receive any more // data, then closing user can proceed. // Starting the timer is contrary to the // specification, but if we don't get a FIN // we'll hang forever. 
if (so->so_state & USN_CANTRCVMORE) { soisdisconnected(so); tp->t_timer[TCPT_2MSL] = g_tcp_maxidle; } DEBUG("change tcp state to TCPS_FIN_WAIT_2, state=%d", tp->t_state); tp->t_state = TCPS_FIN_WAIT_2; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_FIN_WAIT2, 0); } break; // In CLOSING STATE in addition to the processing for // the ESTABLISHED state if the ACK acknowledges our FIN // then enter the TIME-WAIT state, otherwise ignore // the segment. case TCPS_CLOSING: if (ourfinisacked) { DEBUG("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_TIME_WAIT; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0); tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; soisdisconnected(so); } break; // In LAST_ACK, we may still be waiting for data to drain // and/or to be acked, as well as for the ack of our FIN. // If our FIN is now acknowledged, delete the TCB, // enter the closed state and return. case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; // In TIME_WAIT state the only thing that should arrive // is a retransmission of the remote FIN. Acknowledge // it and restart the finack timer. case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; goto dropafterack; } } step6: // Update window information. // Don't look at window if no ACK: TAC's send garbage on first SYN. if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, ti->ti_seq) || (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) )) )) { // keep track of pure window updates if (ti->ti_len == 0 && tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) g_tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = ti->ti_seq; tp->snd_wl2 = ti->ti_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; DEBUG("change needoutput to 1"); tp->t_flags |= TF_NEEDOUTPUT; needoutput = 1; } // Process segments with URG. 
if ((tiflags & TH_URG) && ti->ti_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { // This is a kludge, but if we receive and accept // random urgent pointers, we'll crash in // soreceive. It's hard to imagine someone // actually wanting to send this much urgent data. if (ti->ti_urp + so->so_rcv->sb_cc > g_sb_max) { ti->ti_urp = 0; // XXX tiflags &= ~TH_URG; // XXX goto dodata; // XXX } // If this segment advances the known urgent pointer, // then mark the data stream. This should not happen // in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since // a FIN has been received from the remote side. // In these states we ignore the URG. // // According to RFC961 (Assigned Protocols), // the urgent pointer points to the last octet // of urgent data. We continue, however, // to consider it to indicate the first octet // of data past the urgent section as the original // spec states (in one of two places). if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { tp->rcv_up = ti->ti_seq + ti->ti_urp; so->so_oobmark = so->so_rcv->sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_state |= USN_RCVATMARK; sohasoutofband(so); // send async event to app threads. usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPEV_OUTOFBOUND, 0); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } // Remove out of band data so doesn't get presented to user. // This can happen independent of advancing the URG pointer, // but if two URG's are pending at once, some out-of-band // data may creep in... ick. if (ti->ti_urp <= ti->ti_len #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) tcp_pulloutofband(so, ti, m); } else // If no out of band data is expected, // pull receive urgent pointer along // with the receive window. 
if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; dodata: // XXX #ifdef DUMP_PAYLOAD DEBUG("Handle data"); dump_chain(m,"tcp"); #endif // Process the segment text, merging it into the TCP sequencing queue, // and arranging for acknowledgment of receipt if necessary. // This process logically involves adjusting tp->rcv_wnd as data // is presented to the user (this happens in tcp_usrreq.c, // case PRU_RCVD). If a FIN has already been received on this // connection then we just ignore the text. if ((ti->ti_len || (tiflags&TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { TCP_REASS(tp, ti, m, so, tiflags); // Note the amount of data that peer has sent into // our window, in order to estimate the sender's // buffer size. len = so->so_rcv->sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { usn_free_cmbuf(m); tiflags &= ~TH_FIN; } // If FIN is received ACK the FIN and let the user know // that the connection is closing. if (tiflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); tp->t_flags |= TF_ACKNOW; TRACE("ack FIN now, tp flags=%d", tp->t_flags); tp->rcv_nxt++; } switch (tp->t_state) { // In SYN_RECEIVED and ESTABLISHED STATES // enter the CLOSE_WAIT state. case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: TRACE("change tcp state to TCPS_CLOSE_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_CLOSE_WAIT; soewakeup(so, 0); usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSE_WAIT, 0); break; // If still in FIN_WAIT_1 STATE FIN has not been acked so // enter the CLOSING state. case TCPS_FIN_WAIT_1: TRACE("change tcp state to TCPS_CLOSING, state=%d", tp->t_state); tp->t_state = TCPS_CLOSING; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSING, 0); break; // In FIN_WAIT_2 state enter the TIME_WAIT state, // starting the time-wait timer, turning off the other // standard timers. 
case TCPS_FIN_WAIT_2: TRACE("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; soisdisconnected(so); usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0); break; // In TIME_WAIT state restart the 2 MSL time_wait timer. case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; break; } } if (so->so_options & SO_DEBUG) { TRACE("tcp trace, so_options=%d", so->so_options); tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); } // Return any desired output. //if (needoutput || (tp->t_flags & TF_ACKNOW)){ if (tp->t_flags & TF_NEEDOUTPUT || (tp->t_flags & TF_ACKNOW)){ TRACE("ack now or need to ouput, tp->t_flags=%d", tp->t_flags); tcp_output(tp); } return; dropafterack: TRACE("dropafterack"); // Generate an ACK dropping incoming segment if it occupies // sequence space, where the ACK reflects our state. if (tiflags & TH_RST) goto drop; usn_free_cmbuf(m); tp->t_flags |= TF_ACKNOW; TRACE("ack now, tp flags=%d", tp->t_flags); tcp_output(tp); return; dropwithreset: TRACE("dropwithreset"); // Generate a RST, dropping incoming segment. // Make ACK acceptable to originator of segment. // Don't bother to respond if destination was broadcast/multicast. #define USN_MULTICAST(i) (((u_int)(i) & 0xf0000000) == 0xe0000000) if ((tiflags & TH_RST) || m->flags & (BUF_BCAST|BUF_MCAST) || USN_MULTICAST(ntohl(ti->ti_dst.s_addr))) goto drop; if (tiflags & TH_ACK) tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); else { if (tiflags & TH_SYN) ti->ti_len++; tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, TH_RST|TH_ACK); } // destroy temporarily created socket if (dropsocket) soabort(so); return; drop: TRACE("drop"); // Drop space held by incoming segment and return. 
if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { TRACE("tcp trace: drop a socket"); tcp_trace(TA_DROP, ostate, tp, &g_tcp_saveti, 0); } usn_free_cmbuf(m); // destroy temporarily created socket if (dropsocket) soabort(so); return; }
/*
 * Insert a new SACK block (begin, end) into the receiver's SACK list,
 * merging it with any existing blocks it touches.
 *
 * Parameters:
 *	sack_blk_t *head:	array of SACK blocks, most recent first.
 *	tcp_seq begin:		starting sequence number of the new block.
 *	tcp_seq end:		ending sequence number of the new block.
 *	int32_t *num:		(referenced) number of blocks in the array.
 *
 * The merged block always ends up at head[0]; surviving old blocks are
 * shifted down behind it and *num is updated accordingly.
 */
void
tcp_sack_insert(sack_blk_t *head, tcp_seq begin, tcp_seq end, int32_t *num)
{
	int32_t	idx, kept, limit, remaining;
	sack_blk_t saved[MAX_SACK_BLK - 1];

	/* Empty list: the new block simply becomes the only entry. */
	if (*num == 0) {
		head[0].begin = begin;
		head[0].end = end;
		*num = 1;
		return;
	}

	/*
	 * Scan the existing blocks.  A block disjoint from the new one is
	 * preserved in saved[]; an overlapping block is absorbed into
	 * (begin, end) and dropped.  At most MAX_SACK_BLK - 1 old blocks
	 * are considered so that the final list fits in the array.
	 */
	limit = *num;
	if (limit > MAX_SACK_BLK - 1) {
		limit = MAX_SACK_BLK - 1;
	}
	remaining = limit;
	kept = 0;
	for (idx = 0; idx < limit; idx++) {
		if (SEQ_LT(end, head[idx].begin) ||
		    SEQ_GT(begin, head[idx].end)) {
			/* No overlap: keep the old block unchanged. */
			saved[kept].begin = head[idx].begin;
			saved[kept].end = head[idx].end;
			kept++;
			continue;
		}

		/*
		 * Overlap: grow (begin, end) to the union of the two
		 * blocks, then drop the old one.  When the old block fully
		 * contains the new one, both edges come from it; when the
		 * new block fully contains the old one, neither edge moves.
		 */
		if (SEQ_GEQ(begin, head[idx].begin) &&
		    SEQ_LEQ(end, head[idx].end)) {
			/* Old block contains the new block entirely. */
			begin = head[idx].begin;
			end = head[idx].end;
		} else if (SEQ_LEQ(end, head[idx].end) &&
		    SEQ_GEQ(end, head[idx].begin)) {
			/* Tail of the new block overlaps: extend the end. */
			end = head[idx].end;
		} else if (SEQ_GEQ(begin, head[idx].begin) &&
		    SEQ_LEQ(begin, head[idx].end)) {
			/* Head of the new block overlaps: extend the start. */
			begin = head[idx].begin;
		}
		remaining--;
	}

	/* Merged block first, then the preserved blocks in order. */
	head[0].begin = begin;
	head[0].end = end;
	for (idx = 0; idx < remaining; idx++) {
		head[idx + 1].begin = saved[idx].begin;
		head[idx + 1].end = saved[idx].end;
	}
	*num = remaining + 1;
}
/*
 * Reassemble out-of-order TCP segments for 'tp'.
 *
 * If th != NULL, insert segment (th, m, *tlenp) into the per-connection
 * reassembly queue tp->t_segq, trimming any bytes already covered by the
 * preceding queued segment and trimming/dropping any succeeding segments
 * the new one overlaps.  If th == NULL, only run the "present" phase,
 * delivering in-sequence queued data to the socket (used right after the
 * connection becomes established).
 *
 * Returns the TH_FIN flag of the last segment handed to the socket, or 0.
 * On the queue-limit drop path *tlenp is zeroed so the caller does not
 * account the bytes.  The mbuf 'm' is consumed on all paths.
 */
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	struct mbuf *mq, *mp;
	int flags, wakeup;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/*
	 * XXX: tcp_reass() is rather inefficient with its data structures
	 * and should be rewritten (see NetBSD for optimizations).
	 */

	/*
	 * Call with th==NULL after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (th == NULL)
		goto present;

	M_ASSERTPKTHDR(m);
	/* The caller's length must match the mbuf chain's packet length. */
	KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__,
	    *tlenp, m->m_pkthdr.len));

	/*
	 * Limit the number of segments that can be queued to reduce the
	 * potential for mbuf exhaustion. For best performance, we want to be
	 * able to queue a full window's worth of segments. The size of the
	 * socket receive buffer determines our advertised window and grows
	 * automatically when socket buffer autotuning is enabled. Use it as the
	 * basis for our queue limit.
	 * Always let the missing segment through which caused this queue.
	 * NB: Access to the socket buffer is left intentionally unlocked as we
	 * can tolerate stale information here.
	 */
	if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
	    tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) {
		char *s;

		TCPSTAT_INC(tcps_rcvreassfull);
		*tlenp = 0;
		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
		    NULL))) {
			log(LOG_DEBUG, "%s; %s: queue limit reached, "
			    "segment dropped\n", s, __func__);
			free(s, M_TCPLOG);
		}
		m_freem(m);
		return (0);
	}

	/*
	 * Find a segment which begins after this one does.
	 * 'mp' trails 'mq' so it ends up pointing at the predecessor.
	 */
	mp = NULL;
	for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) {
		if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq))
			break;
		mp = mq;
	}

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (mp != NULL) {
		int i;

		/* conversion to int (in i) handles seq wraparound */
		i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq;
		if (i > 0) {
			if (i >= *tlenp) {
				/* Fully duplicated segment: count and free. */
				TCPSTAT_INC(tcps_rcvduppack);
				TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
				m_freem(m);
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;	/* ??? */
			}
			/* Trim the overlapping prefix off the new segment. */
			m_adj(m, i);
			*tlenp -= i;
			th->th_seq += i;
		}
	}
	tp->t_rcvoopack++;
	TCPSTAT_INC(tcps_rcvoopack);
	TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (mq) {
		struct mbuf *nq;
		int i;

		i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq;
		if (i <= 0)
			break;
		if (i < mq->m_pkthdr.len) {
			/* Partial overlap: trim the queued segment's head. */
			M_TCPHDR(mq)->th_seq += i;
			m_adj(mq, i);
			tp->t_segqlen -= i;
			break;
		}

		/* Fully covered: unlink and free the queued segment. */
		nq = mq->m_nextpkt;
		tp->t_segqlen -= mq->m_pkthdr.len;
		m_freem(mq);
		if (mp)
			mp->m_nextpkt = nq;
		else
			tp->t_segq = nq;
		mq = nq;
	}

	/*
	 * Insert the new segment queue entry into place. Try to collapse
	 * mbuf chains if segments are adjacent.
	 */
	if (mp) {
		if (M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len == th->th_seq)
			/* Contiguous with predecessor: merge chains. */
			m_catpkt(mp, m);
		else {
			m->m_nextpkt = mp->m_nextpkt;
			mp->m_nextpkt = m;
			m->m_pkthdr.pkt_tcphdr = th;
		}
	} else {
		/* New head of queue; possibly merge with old head. */
		mq = tp->t_segq;
		tp->t_segq = m;
		if (mq && th->th_seq + *tlenp == M_TCPHDR(mq)->th_seq) {
			m->m_nextpkt = mq->m_nextpkt;
			mq->m_nextpkt = NULL;
			m_catpkt(m, mq);
		} else
			m->m_nextpkt = mq;
		m->m_pkthdr.pkt_tcphdr = th;
	}
	tp->t_segqlen += *tlenp;

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);

	flags = 0;
	wakeup = 0;
	SOCKBUF_LOCK(&so->so_rcv);
	while ((mq = tp->t_segq) != NULL &&
	    M_TCPHDR(mq)->th_seq == tp->rcv_nxt) {
		tp->t_segq = mq->m_nextpkt;
		tp->rcv_nxt += mq->m_pkthdr.len;
		tp->t_segqlen -= mq->m_pkthdr.len;
		flags = M_TCPHDR(mq)->th_flags & TH_FIN;
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
			/* Receiver shut down: discard instead of queueing. */
			m_freem(mq);
		else {
			mq->m_nextpkt = NULL;
			sbappendstream_locked(&so->so_rcv, mq, 0);
			wakeup = 1;
		}
	}
	ND6_HINT(tp);
	/* sorwakeup_locked() drops the sockbuf lock for us. */
	if (wakeup)
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
	return (flags);
}
/*
 * Reassemble out-of-order TCP segments (slirp-style intrusive queue).
 *
 * If ti != NULL, splice segment (ti, m) into tp's fragment list, trimming
 * overlap with the preceding queued segment and trimming/removing any
 * succeeding segments the new one covers.  If ti == NULL, only run the
 * "present" phase, delivering in-sequence queued data to the socket.
 *
 * Returns the TH_FIN flag of the last segment appended to the socket,
 * or 0 if nothing was delivered.
 */
static int
tcp_reass(register struct tcpcb *tp, register struct tcpiphdr *ti,
    struct mbuf *m)
{
	register struct tcpiphdr *q;
	struct socket *so = tp->t_socket;
	int flags;

	/*
	 * Call with ti==NULL after become established to
	 * force pre-ESTABLISHED data up to user socket.
	 */
	if (ti == NULL)
		goto present;

	/*
	 * Find a segment which begins after this one does.
	 */
	for (q = tcpfrag_list_first(tp); !tcpfrag_list_end(q, tp);
	    q = tcpiphdr_next(q))
		if (SEQ_GT(q->ti_seq, ti->ti_seq))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us.
	 */
	if (!tcpfrag_list_end(tcpiphdr_prev(q), tp)) {
		register int i;
		q = tcpiphdr_prev(q);
		/* conversion to int (in i) handles seq wraparound */
		i = q->ti_seq + q->ti_len - ti->ti_seq;
		if (i > 0) {
			if (i >= ti->ti_len) {
				/* Entirely duplicated: free and present. */
				m_free(m);
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes.
				 */
				goto present;	/* ??? */
			}
			/* Trim the overlapping prefix off the new segment. */
			m_adj(m, i);
			ti->ti_len -= i;
			ti->ti_seq += i;
		}
		/* Step back to the first segment after ours. */
		q = tcpiphdr_next(q);
	}
	/* Attach the (possibly trimmed) mbuf to its header. */
	ti->ti_mbuf = m;

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	while (!tcpfrag_list_end(q, tp)) {
		register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq;
		if (i <= 0)
			break;
		if (i < q->ti_len) {
			/* Partial overlap: trim the queued segment's head. */
			q->ti_seq += i;
			q->ti_len -= i;
			m_adj(q->ti_mbuf, i);
			break;
		}
		/* Fully covered: unlink the previous entry and free it. */
		q = tcpiphdr_next(q);
		m = tcpiphdr_prev(q)->ti_mbuf;
		remque(tcpiphdr2qlink(tcpiphdr_prev(q)));
		m_free(m);
	}

	/*
	 * Stick new segment in its place.
	 */
	insque(tcpiphdr2qlink(ti), tcpiphdr2qlink(tcpiphdr_prev(q)));

present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);
	ti = tcpfrag_list_first(tp);
	/* Nothing deliverable unless the head is exactly in sequence. */
	if (tcpfrag_list_end(ti, tp) || ti->ti_seq != tp->rcv_nxt)
		return (0);
	/* Don't deliver data before the handshake fully completes. */
	if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len)
		return (0);
	do {
		tp->rcv_nxt += ti->ti_len;
		flags = ti->ti_flags & TH_FIN;
		remque(tcpiphdr2qlink(ti));
		m = ti->ti_mbuf;
		ti = tcpiphdr_next(ti);
		if (so->so_state & SS_FCANTSENDMORE)
			m_free(m);
		else {
			/* so_emu: protocol emulation hook may consume m. */
			if (so->so_emu) {
				if (tcp_emu(so, m))
					sbappend(so, m);
			} else
				sbappend(so, m);
		}
	} while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt);
	return (flags);
}
static int tcpup_state_send(struct tcpup_info *upp, struct tcpiphdr *tcp, size_t dlen) { int xflags = 0; if (tcp->th_flags & TH_RST) { upp->t_state = TCPS_CLOSED; return 0; } if (upp->x_state != upp->t_state && (tcp->th_flags & TH_ACK) && SEQ_GEQ(htonl(tcp->th_ack), upp->rcv_una)) { tcp_state_update(upp, upp->x_state); upp->t_state = upp->x_state; } switch (upp->t_state) { case TCPS_CLOSED: xflags = TH_SYN| TH_ACK; if ((tcp->th_flags & xflags) == TH_SYN) { upp->snd_nxt = htonl(tcp->th_seq) + 1; upp->snd_max = htonl(tcp->th_seq) + 1; upp->snd_una = htonl(tcp->th_seq) + 1; tcp_state_update(upp, TCPS_SYN_SENT); return 0; } break; case TCPS_SYN_RECEIVED: assert((tcp->th_flags & TH_FIN) != TH_FIN); xflags = TH_SYN| TH_ACK; if ((tcp->th_flags & xflags) == TH_ACK && SEQ_GT(htonl(tcp->th_seq), upp->snd_nxt)) { tcp_state_update(upp, upp->x_state); return 0; } break; case TCPS_ESTABLISHED: if ((tcp->th_flags & TH_FIN) == TH_FIN) { upp->snd_nxt = htonl(tcp->th_seq) + dlen + 1; tcp_state_update(upp, TCPS_FIN_WAIT_1); return 0; } break; case TCPS_CLOSE_WAIT: if ((tcp->th_flags & TH_FIN) == TH_FIN) { upp->snd_nxt = htonl(tcp->th_seq) + dlen + 1; tcp_state_update(upp, TCPS_LAST_ACK); return 0; } break; case TCPS_FIN_WAIT_1: xflags = TH_FIN| TH_ACK; if ((tcp->th_flags & xflags) == TH_ACK) { tcp_state_update(upp, upp->x_state); return 0; } break; } if (dlen > 0) { upp->snd_nxt = htonl(tcp->th_seq) + dlen; if (SEQ_GT(upp->snd_nxt, upp->snd_max)) { /* update snd max to nxt */ upp->snd_max = upp->snd_nxt; } } return 0; }
/*
 * Update the conntrack TCP entry 'conn_' for a packet seen in direction
 * 'reply', using the sequence-window tracking algorithm from Guido van
 * Rooij's paper (see citation below).  'tcp' is the packet's TCP header,
 * 'nbl' carries the packet data (used only for payload length), and 'now'
 * is the current time used to refresh the entry's expiration.
 *
 * Returns CT_UPDATE_VALID if the packet fits the tracked windows,
 * CT_UPDATE_NEW if a fresh SYN should restart tracking on a dying
 * connection, and CT_UPDATE_INVALID otherwise.
 */
enum CT_UPDATE_RES
OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_,
                           const TCPHdr *tcp,
                           PNET_BUFFER_LIST nbl,
                           BOOLEAN reply,
                           UINT64 now)
{
    struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_);
    /* The peer that sent 'pkt' */
    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
    /* The peer that should receive 'pkt' */
    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
    uint8_t sws = 0, dws = 0;
    UINT16 tcp_flags = ntohs(tcp->flags);
    uint16_t win = ntohs(tcp->window);
    uint32_t ack, end, seq, orig_seq;
    uint32_t p_len = OvsGetTcpPayloadLength(nbl);
    int ackskew;

    /* Reject flag combinations that are never legitimate. */
    if (OvsCtInvalidTcpFlags(tcp_flags)) {
        return CT_UPDATE_INVALID;
    }

    /*
     * A new SYN (without ACK) while both sides are at/past FIN_WAIT_2
     * means the old connection is done: reset tracking and let the caller
     * create a fresh entry.
     */
    if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN)
        && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
        && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
        src->state = dst->state = CT_DPIF_TCPS_CLOSED;
        return CT_UPDATE_NEW;
    }

    /*
     * Choose the window-scale shifts.  On non-SYN packets use the
     * negotiated scales; if both sides' scales are unknown, assume the
     * maximum so the checks below stay permissive.
     */
    if (src->wscale & CT_WSCALE_FLAG
        && dst->wscale & CT_WSCALE_FLAG
        && !(tcp_flags & TCP_SYN)) {

        sws = src->wscale & CT_WSCALE_MASK;
        dws = dst->wscale & CT_WSCALE_MASK;

    } else if (src->wscale & CT_WSCALE_UNKNOWN
        && dst->wscale & CT_WSCALE_UNKNOWN
        && !(tcp_flags & TCP_SYN)) {

        sws = TCP_MAX_WSCALE;
        dws = TCP_MAX_WSCALE;
    }

    /*
     * Sequence tracking algorithm from Guido van Rooij's paper:
     * http://www.madison-gurkha.com/publications/tcp_filtering/
     * tcp_filtering.ps
     */

    orig_seq = seq = ntohl(tcp->seq);
    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
        /* First packet from this end. Set its state */

        ack = ntohl(tcp->ack_seq);

        /* 'end' is the last sequence number this packet occupies. */
        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
            if (dst->wscale & CT_WSCALE_FLAG) {
                src->wscale = OvsTcpGetWscale(tcp);
                if (src->wscale & CT_WSCALE_FLAG) {
                    /* Remove scale factor from initial window */
                    sws = src->wscale & CT_WSCALE_MASK;
                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
                    dws = dst->wscale & CT_WSCALE_MASK;
                } else {
                    /* fixup other window */
                    dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
                    /* in case of a retrans SYN|ACK */
                    dst->wscale = 0;
                }
            }
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }

        src->seqlo = seq;
        src->state = CT_DPIF_TCPS_SYN_SENT;

        /*
         * May need to slide the window (seqhi may have been set by
         * the crappy stack check or if we picked up the connection
         * after establishment)
         */
        if (src->seqhi == 1
                || SEQ_GEQ(end + MAX(1, dst->max_win << dws),
                    src->seqhi)) {
            src->seqhi = end + MAX(1, dst->max_win << dws);
        }
        if (win > src->max_win) {
            src->max_win = win;
        }
    } else {
        ack = ntohl(tcp->ack_seq);
        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }
    }

    if ((tcp_flags & TCP_ACK) == 0) {
        /* Let it pass through the ack skew check */
        ack = dst->seqlo;
    } else if ((ack == 0
                && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST))
               /* broken tcp stacks do not set ack */) {
        /* Many stacks (ours included) will set the ACK number in an
         * FIN|ACK if the SYN times out -- no sequence to ACK.
         */
        ack = dst->seqlo;
    }

    if (seq == end) {
        /* Ease sequencing restrictions on no data packets */
        seq = src->seqlo;
        end = seq;
    }

    ackskew = dst->seqlo - ack;
#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
    if (SEQ_GEQ(src->seqhi, end)
        /* Last octet inside other's window space */
        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
        /* Retrans: not more than one window back */
        && (ackskew >= -MAXACKWINDOW)
        /* Acking not more than one reassembled fragment backwards */
        && (ackskew <= (MAXACKWINDOW << sws))
        /* Acking not more than one window forward */
        && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo
            || (orig_seq == src->seqlo + 1)
            || (orig_seq + 1 == src->seqlo))) {
        /* Require an exact/+1 sequence match on resets when possible */

        /* update max window */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* synchronize sequencing */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* slide the window of what the other end can send */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /* update states */
        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
            src->state = CT_DPIF_TCPS_SYN_SENT;
        }
        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
            src->state = CT_DPIF_TCPS_CLOSING;
        }
        if (tcp_flags & TCP_ACK) {
            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
                dst->state = CT_DPIF_TCPS_ESTABLISHED;
            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
            }
        }
        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }

        /* Refresh the entry's expiration based on how far along we are. */
        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
            OvsConntrackUpdateExpiration(conn, now,
                                         30 * CT_INTERVAL_SEC);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
            OvsConntrackUpdateExpiration(conn, now,
                                         45 * CT_INTERVAL_SEC);
        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
            OvsConntrackUpdateExpiration(conn, now,
                                         30 * CT_INTERVAL_SEC);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
            OvsConntrackUpdateExpiration(conn, now,
                                         15 * 60 * CT_INTERVAL_SEC);
        } else {
            OvsConntrackUpdateExpiration(conn, now,
                                         24 * 60 * 60 * CT_INTERVAL_SEC);
        }
    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
               /* Within a window forward of the originating packet */
               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
               /* Within a window backward of the originating packet */

        /*
         * This currently handles three situations:
         *  1) Stupid stacks will shotgun SYNs before their peer
         *     replies.
         *  2) When PF catches an already established stream (the
         *     firewall rebooted, the state table was flushed, routes
         *     changed...)
         *  3) Packets get funky immediately after the connection
         *     closes (this should catch Solaris spurious ACK|FINs
         *     that web servers like to spew after a close)
         *
         * This must be a little more careful than the above code
         * since packet floods will also be caught here. We don't
         * update the TTL here to mitigate the damage of a packet
         * flood and so the same code can handle awkward establishment
         * and a loosened connection close.
         * In the establishment case, a correct peer response will
         * validate the connection, go through the normal state code
         * and keep updating the state TTL.
         */

        /* update max window */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* synchronize sequencing */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* slide the window of what the other end can send */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /*
         * Cannot set dst->seqhi here since this could be a shotgunned
         * SYN and not an already established connection.
         */
        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
            src->state = CT_DPIF_TCPS_CLOSING;
        }

        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }
    } else {
        return CT_UPDATE_INVALID;
    }

    return CT_UPDATE_VALID;
}
/* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ int tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, struct mbuf *m, int tlen) { struct tcptw *tw; int thflags; tcp_seq seq; /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */ //ScenSim-Port// INP_INFO_WLOCK_ASSERT(&V_tcbinfo); //ScenSim-Port// INP_WLOCK_ASSERT(inp); /* * XXXRW: Time wait state for inpcb has been recycled, but inpcb is * still present. This is undesirable, but temporarily necessary * until we work out how to handle inpcb's who's timewait state has * been removed. */ tw = intotw(inp); if (tw == NULL) goto drop; thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; //ScenSim-Port//#if 0 //ScenSim-Port///* PAWS not needed at the moment */ //ScenSim-Port// /* //ScenSim-Port// * RFC 1323 PAWS: If we have a timestamp reply on this segment //ScenSim-Port// * and it's less than ts_recent, drop it. //ScenSim-Port// */ //ScenSim-Port// if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && //ScenSim-Port// TSTMP_LT(to.to_tsval, tp->ts_recent)) { //ScenSim-Port// if ((thflags & TH_ACK) == 0) //ScenSim-Port// goto drop; //ScenSim-Port// goto ack; //ScenSim-Port// } //ScenSim-Port// /* //ScenSim-Port// * ts_recent is never updated because we never accept new segments. //ScenSim-Port// */ //ScenSim-Port//#endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } /* * Drop the segment if it does not contain an ACK. 
*/ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_tw_2msl_reset(tw, 1); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); drop: //ScenSim-Port// INP_WUNLOCK(inp); m_freem(m); return (0); }
/*
 * Receive and dispatch one L2TP packet (control or data).
 *
 * Parses the RFC 2661 header, locates (or, for an SCCRQ, creates) the
 * l2tp_ctrl for the tunnel, verifies the peer address, handles the
 * Ns/Nr reliable-delivery window for control messages, and finally runs
 * the tunnel state machine (RFC 2661 pp. 56-57).  Data packets are
 * forwarded to the PPP layer via l2tp_call_ppp_input().
 *
 * _this           the l2tpd instance that received the packet
 * listener_index  index of the listening socket the packet arrived on
 * peer/sock       peer and local socket addresses (same address family)
 * nat_t_ctx       opaque NAT-Traversal context, passed to l2tp_ctrl_init()
 * pkt/pktlen      the raw datagram and its length
 */
void
l2tp_ctrl_input(l2tpd *_this, int listener_index, struct sockaddr *peer,
    struct sockaddr *sock, void *nat_t_ctx, u_char *pkt, int pktlen)
{
	int i, len, offsiz, reqlen, is_ctrl;
	uint16_t mestype;
	struct l2tp_avp *avp, *avp0;
	l2tp_ctrl *ctrl;
	l2tp_call *call;
	char buf[L2TP_AVP_MAXSIZ], errmsg[256];
	time_t curr_time;
	u_char *pkt0;
	struct l2tp_header hdr;
	char hbuf[NI_MAXHOST + NI_MAXSERV + 16];

	ctrl = NULL;
	curr_time = get_monosec();
	pkt0 = pkt;

	L2TP_CTRL_ASSERT(peer->sa_family == sock->sa_family);
	/* BUGFIX: the assert below was missing its terminating ';' */
	L2TP_CTRL_ASSERT(peer->sa_family == AF_INET ||
	    peer->sa_family == AF_INET6);

	/*
	 * Parse L2TP Header
	 */
	memset(&hdr, 0, sizeof(hdr));
	if (pktlen < 2) {
		snprintf(errmsg, sizeof(errmsg), "a short packet. "
		    "length=%d", pktlen);
		goto bad_packet;
	}
	memcpy(&hdr, pkt, 2);
	pkt += 2;
	if (hdr.ver != L2TP_HEADER_VERSION_RFC2661) {
		/* XXX: only RFC2661 is supported */
		snprintf(errmsg, sizeof(errmsg),
		    "Unsupported version at header = %d", hdr.ver);
		goto bad_packet;
	}
	is_ctrl = (hdr.t != 0)? 1 : 0;

	/* calc required length */
	reqlen = 6;		/* for Flags, Tunnel-Id, Session-Id field */
	if (hdr.l)
		reqlen += 2;	/* for Length field (opt) */
	if (hdr.s)
		reqlen += 4;	/* for Ns, Nr field (opt) */
	if (hdr.o)
		reqlen += 2;	/* for Offset Size field (opt) */
	if (reqlen > pktlen) {
		snprintf(errmsg, sizeof(errmsg), "a short packet. "
		    "length=%d", pktlen);
		goto bad_packet;
	}
	if (hdr.l != 0) {
		GETSHORT(hdr.length, pkt);
		if (hdr.length > pktlen) {
			snprintf(errmsg, sizeof(errmsg),
			    "Actual packet size is smaller than the length "
			    "field %d < %d", pktlen, hdr.length);
			goto bad_packet;
		}
		pktlen = hdr.length;	/* remove trailing trash */
	}
	GETSHORT(hdr.tunnel_id, pkt);
	GETSHORT(hdr.session_id, pkt);
	if (hdr.s != 0) {
		GETSHORT(hdr.ns, pkt);
		GETSHORT(hdr.nr, pkt);
	}
	if (hdr.o != 0) {
		GETSHORT(offsiz, pkt);
		if (pktlen < offsiz) {
			snprintf(errmsg, sizeof(errmsg),
			    "offset field is bigger than remaining packet "
			    "length %d > %d", offsiz, pktlen);
			goto bad_packet;
		}
		pkt += offsiz;
	}
	L2TP_CTRL_ASSERT(pkt - pkt0 == reqlen);
	pktlen -= (pkt - pkt0);	/* cut down the length of header */

	ctrl = NULL;
	memset(buf, 0, sizeof(buf));
	mestype = 0;
	avp = NULL;

	if (is_ctrl) {
		avp0 = (struct l2tp_avp *)buf;
		avp = avp_find_message_type_avp(avp0, pkt, pktlen);
		if (avp != NULL)
			mestype = avp->attr_value[0] << 8 | avp->attr_value[1];
	}
	ctrl = l2tpd_get_ctrl(_this, hdr.tunnel_id);

	if (ctrl == NULL) {
		/* new control: only an SCCRQ may create a tunnel */
		if (!is_ctrl) {
			snprintf(errmsg, sizeof(errmsg),
			    "bad data message: tunnelId=%d is not "
			    "found.", hdr.tunnel_id);
			goto bad_packet;
		}
		if (mestype != L2TP_AVP_MESSAGE_TYPE_SCCRQ) {
			snprintf(errmsg, sizeof(errmsg),
			    "bad control message: tunnelId=%d is not "
			    "found. mestype=%s", hdr.tunnel_id,
			    avp_mes_type_string(mestype));
			goto bad_packet;
		}
		if ((ctrl = l2tp_ctrl_create()) == NULL) {
			l2tp_ctrl_log(ctrl, LOG_ERR,
			    "l2tp_ctrl_create() failed: %m");
			goto fail;
		}
		if (l2tp_ctrl_init(ctrl, _this, peer, sock, nat_t_ctx) != 0) {
			l2tp_ctrl_log(ctrl, LOG_ERR,
			    "l2tp_ctrl_start() failed: %m");
			goto fail;
		}
		ctrl->listener_index = listener_index;
		l2tp_ctrl_reload(ctrl);
	} else {
		/*
		 * Treat it as an error if the source address and port do
		 * not match the tunnel's peer (potential DoS attack).
		 */
		int notmatch = 0;

		if (ctrl->peer.ss_family != peer->sa_family)
			notmatch = 1;
		else if (peer->sa_family == AF_INET) {
			if (SIN(peer)->sin_addr.s_addr !=
			    SIN(&ctrl->peer)->sin_addr.s_addr ||
			    SIN(peer)->sin_port != SIN(&ctrl->peer)->sin_port)
				notmatch = 1;
		} else if (peer->sa_family == AF_INET6) {
			if (!IN6_ARE_ADDR_EQUAL(&(SIN6(peer)->sin6_addr),
			    &(SIN6(&ctrl->peer)->sin6_addr)) ||
			    SIN6(peer)->sin6_port !=
			    SIN6(&ctrl->peer)->sin6_port)
				notmatch = 1;
		}
		if (notmatch) {
			snprintf(errmsg, sizeof(errmsg),
			    "tunnelId=%u is already assigned for %s",
			    hdr.tunnel_id, addrport_tostring(
			    (struct sockaddr *)&ctrl->peer,
			    ctrl->peer.ss_len, hbuf, sizeof(hbuf)));
			goto bad_packet;
		}
	}
	ctrl->last_rcv = curr_time;
	call = NULL;
	if (hdr.session_id != 0) {
		/* search l2tp_call by Session ID */
		/* linear search is enough for this purpose */
		len = slist_length(&ctrl->call_list);
		for (i = 0; i < len; i++) {
			call = slist_get(&ctrl->call_list, i);
			if (call->session_id == hdr.session_id)
				break;
			call = NULL;
		}
	}
	if (!is_ctrl) {
		int delayed = 0;

		/* L2TP data: hand the payload to the PPP layer */
		if (ctrl->state != L2TP_CTRL_STATE_ESTABLISHED) {
			l2tp_ctrl_log(ctrl, LOG_WARNING,
			    "Received Data packet in '%s'",
			    l2tp_ctrl_state_string(ctrl));
			goto fail;
		}
		if (call == NULL) {
			l2tp_ctrl_log(ctrl, LOG_WARNING,
			    "Received a data packet but it has no call. "
			    "session_id=%u", hdr.session_id);
			goto fail;
		}
		L2TP_CTRL_DBG((ctrl, DEBUG_LEVEL_2,
		    "call=%u RECV ns=%u nr=%u snd_nxt=%u rcv_nxt=%u len=%d",
		    call->id, hdr.ns, hdr.nr, call->snd_nxt, call->rcv_nxt,
		    pktlen));
		if (call->state != L2TP_CALL_STATE_ESTABLISHED) {
			l2tp_ctrl_log(ctrl, LOG_WARNING,
			    "Received a data packet but call is not "
			    "established");
			goto fail;
		}
		if (hdr.s != 0) {
			if (SEQ_LT(hdr.ns, call->rcv_nxt)) {
				if (SEQ_LT(hdr.ns,
				    call->rcv_nxt - L2TP_CALL_DELAY_LIMIT)) {
					/* sequence number seems to be delayed */
					/* XXX: need to log? */
					L2TP_CTRL_DBG((ctrl, LOG_DEBUG,
					    "receive a out of sequence "
					    "data packet: %u < %u.",
					    hdr.ns, call->rcv_nxt));
					return;
				}
				delayed = 1;
			} else {
				call->rcv_nxt = hdr.ns + 1;
			}
		}
		l2tp_call_ppp_input(call, pkt, pktlen, delayed);
		return;
	}
	if (hdr.s != 0) {
		/* reliable-delivery handling for control messages */
		L2TP_CTRL_DBG((ctrl, DEBUG_LEVEL_2,
		    "RECV %s ns=%u nr=%u snd_nxt=%u snd_una=%u rcv_nxt=%u "
		    "len=%d", (is_ctrl)? "C" : "", hdr.ns, hdr.nr,
		    ctrl->snd_nxt, ctrl->snd_una, ctrl->rcv_nxt, pktlen));
		if (pktlen <= 0)
			l2tp_ctrl_log(ctrl, LOG_INFO, "RecvZLB");

		if (SEQ_GT(hdr.nr, ctrl->snd_una)) {
			if (hdr.nr == ctrl->snd_nxt ||
			    SEQ_LT(hdr.nr, ctrl->snd_nxt))
				ctrl->snd_una = hdr.nr;
			else {
				/*
				 * BUGFIX: log the offending Nr value
				 * (hdr.nr); the original logged hdr.ns.
				 */
				l2tp_ctrl_log(ctrl, LOG_INFO,
				    "Received message has bad Nr field: "
				    "%u < %u.", hdr.nr, ctrl->snd_nxt);
				/* XXX Drop with ZLB? */
				goto fail;
			}
		}
		if (l2tp_ctrl_txwin_size(ctrl) <= 0) {
			/* no waiting ack */
			if (ctrl->hello_wait_ack != 0) {
				/*
				 * Reset Hello state, as an ack for the Hello
				 * is received.
				 */
				ctrl->hello_wait_ack = 0;
				ctrl->hello_io_time = curr_time;
			}
			switch (ctrl->state) {
			case L2TP_CTRL_STATE_CLEANUP_WAIT:
				l2tp_ctrl_stop(ctrl, 0);
				return;
			}
		}
		if (hdr.ns != ctrl->rcv_nxt) {
			/* There are still unacknowledged packets in flight. */
			if (l2tp_ctrl_resend_una_packets(ctrl) <= 0) {
				/* resend or sent ZLB */
				l2tp_ctrl_send_ZLB(ctrl);
			}
#ifdef L2TP_CTRL_DEBUG
			if (pktlen != 0) {	/* not ZLB */
				L2TP_CTRL_DBG((ctrl, LOG_DEBUG,
				    "receive out of sequence %u must be %u. "
				    "mestype=%s", hdr.ns, ctrl->rcv_nxt,
				    avp_mes_type_string(mestype)));
			}
#endif
			return;
		}
		if (pktlen <= 0)
			return;	/* ZLB */

		if (l2tp_ctrl_txwin_is_full(ctrl)) {
			L2TP_CTRL_DBG((ctrl, LOG_DEBUG,
			    "Received message cannot be handled. "
			    "Transmission window is full."));
			l2tp_ctrl_send_ZLB(ctrl);
			return;
		}
		ctrl->rcv_nxt++;
		if (avp == NULL) {
			l2tpd_log(_this, LOG_WARNING,
			    "bad control message: no message-type AVP.");
			goto fail;
		}
	}

	/*
	 * state machine (RFC2661 pp. 56-57)
	 */
	switch (ctrl->state) {
	case L2TP_CTRL_STATE_IDLE:
		switch (mestype) {
		case L2TP_AVP_MESSAGE_TYPE_SCCRQ:
			if (l2tp_ctrl_recv_SCCRQ(ctrl, pkt, pktlen, _this,
			    peer) == 0) {
				/* acceptable */
				l2tp_ctrl_send_SCCRP(ctrl);
				ctrl->state = L2TP_CTRL_STATE_WAIT_CTL_CONN;
				return;
			}
			/*
			 * in case un-acceptable, it was already processed
			 * at l2tcp_ctrl_recv_SCCRQ
			 */
			return;
		case L2TP_AVP_MESSAGE_TYPE_SCCRP:
			/*
			 * RFC 2661 specifies sending a StopCCN in this
			 * state.  However, as this implementation only
			 * supports passive open, this packet should never
			 * be received.
			 */
			/* FALLTHROUGH */
		case L2TP_AVP_MESSAGE_TYPE_SCCCN:
		default:
			break;
		}
		goto fsm_fail;
	case L2TP_CTRL_STATE_WAIT_CTL_CONN:
		/* Wait-Ctl-Conn */
		switch (mestype) {
		case L2TP_AVP_MESSAGE_TYPE_SCCCN:
			l2tp_ctrl_log(ctrl, LOG_INFO, "RecvSCCN");
			if (l2tp_ctrl_send_ZLB(ctrl) == 0) {
				ctrl->state = L2TP_CTRL_STATE_ESTABLISHED;
			}
			return;
		case L2TP_AVP_MESSAGE_TYPE_StopCCN:
			goto receive_stop_ccn;
		case L2TP_AVP_MESSAGE_TYPE_SCCRQ:
		case L2TP_AVP_MESSAGE_TYPE_SCCRP:
		default:
			break;
		}
		break;	/* fsm_fail */
	case L2TP_CTRL_STATE_ESTABLISHED:
		/* Established */
		switch (mestype) {
		case L2TP_AVP_MESSAGE_TYPE_SCCCN:
		case L2TP_AVP_MESSAGE_TYPE_SCCRQ:
		case L2TP_AVP_MESSAGE_TYPE_SCCRP:
			break;
receive_stop_ccn:
		case L2TP_AVP_MESSAGE_TYPE_StopCCN:
			if (l2tp_ctrl_recv_StopCCN(ctrl, pkt, pktlen) == 0) {
				if (l2tp_ctrl_resend_una_packets(ctrl) <= 0)
					l2tp_ctrl_send_ZLB(ctrl);
				l2tp_ctrl_stop(ctrl, 0);
				return;
			}
			l2tp_ctrl_log(ctrl, LOG_ERR, "Received bad StopCCN");
			l2tp_ctrl_send_ZLB(ctrl);
			l2tp_ctrl_stop(ctrl, 0);
			return;
		case L2TP_AVP_MESSAGE_TYPE_HELLO:
			if (l2tp_ctrl_resend_una_packets(ctrl) <= 0)
				l2tp_ctrl_send_ZLB(ctrl);
			return;
		case L2TP_AVP_MESSAGE_TYPE_CDN:
		case L2TP_AVP_MESSAGE_TYPE_ICRP:
		case L2TP_AVP_MESSAGE_TYPE_ICCN:
			if (call == NULL) {
				l2tp_ctrl_log(ctrl, LOG_INFO,
				    "Unknown call message: %s",
				    avp_mes_type_string(mestype));
				goto fail;
			}
			/* FALLTHROUGH */
		case L2TP_AVP_MESSAGE_TYPE_ICRQ:
			l2tp_call_recv_packet(ctrl, call, mestype, pkt,
			    pktlen);
			return;
		default:
			break;
		}
		break;	/* fsm_fail */
	case L2TP_CTRL_STATE_CLEANUP_WAIT:
		if (mestype == L2TP_AVP_MESSAGE_TYPE_StopCCN) {
			/*
			 * We left ESTABLISHED state, but the peer sent
			 * StopCCN.
			 */
			goto receive_stop_ccn;
		}
		break;	/* fsm_fail */
	}

fsm_fail:
	/* state machine error */
	l2tp_ctrl_log(ctrl, LOG_WARNING, "Received %s in '%s' state",
	    avp_mes_type_string(mestype), l2tp_ctrl_state_string(ctrl));
	l2tp_ctrl_stop(ctrl, L2TP_STOP_CCN_RCODE_FSM_ERROR);

	return;
fail:
	if (ctrl != NULL && mestype != 0) {
		l2tp_ctrl_log(ctrl, LOG_WARNING, "Received %s in '%s' state",
		    avp_mes_type_string(mestype),
		    l2tp_ctrl_state_string(ctrl));
		l2tp_ctrl_stop(ctrl, L2TP_STOP_CCN_RCODE_GENERAL_ERROR);
	}
	return;

bad_packet:
	l2tpd_log(_this, LOG_INFO, "Received from=%s: %s",
	    addrport_tostring(peer, peer->sa_len, hbuf, sizeof(hbuf)),
	    errmsg);

	return;
}
int tcp_output(struct tcpcb * tp) { struct socket * so = tp->t_inpcb->inp_socket; int len; long win; int off, flags, error; struct mbuf * m; struct tcpiphdr * ti; unsigned optlen = 0; int idle, sendalot; struct mbuf * sendm; /* mbuf which contains data to send */ struct mbuf * tcp_mbuf; /* mbuf containing TCP header */ int bufoff; /* offset of data in sendm->m_data */ #ifdef TCP_SACK int sack_resend; int sack_hole = 0; /* next sack hole to fill */ if(tp->t_flags & TF_SACKREPLY) { /* we are resending based on a received SACK header */ sack_resend = TRUE; tp->t_flags &= ~TF_SACKREPLY; /* clear flag */ } else sack_resend = FALSE; #endif /* TCP_SACK */ /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->snd_max == tp->snd_una); again: sendalot = 0; off = (int)(tp->snd_nxt - tp->snd_una); win = (long)tp->snd_wnd; /* set basic send window */ if (win > (long)tp->snd_cwnd) /* see if we need congestion control */ { win = (int)(tp->snd_cwnd & ~(ALIGN_TYPE-1)); /* keep data aligned */ } /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. 
*/ if (tp->t_force) { if (win == 0) win = 1; else { tp->t_timer[TCPT_PERSIST] = 0; tp->t_rxtshift = 0; } } #ifdef TCP_SACK /* See if we need to adjust the offset for a sack resend */ if(sack_resend) { off = (int)(tp->sack_hole_start[sack_hole] - tp->snd_una); /* if this hole's already been acked then punt and move to next hole */ if(off < 0) { /* clear out the acked hole */ tp->sack_hole_start[sack_hole] = tp->sack_hole_end[sack_hole] = 0; /* see if we're done with SACK hole list (2 tests) */ if(++sack_hole >= SACK_BLOCKS) return 0; if(tp->sack_hole_start[sack_hole] == tp->sack_hole_end[sack_hole]) return 0; goto again; } tp->snd_nxt = tp->sack_hole_start[sack_hole]; len = (int)(tp->sack_hole_end[sack_hole] - tp->sack_hole_start[sack_hole]); len = (int)MIN(len, (int)win); } else #endif /* TCP_SACK */ { /* set length of packets which are not sack resends */ len = (int)MIN(so->so_snd.sb_cc, (unsigned)win) - off; } flags = tcp_outflags[tp->t_state]; /* See if we need to build TCP options field. This test should be fast. */ #if (defined(TCP_TIMESTAMP) | defined(TCP_SACK)) if((flags & TH_SYN) || /* !!!??? (so->so_options & SO_TIMESTAMP) || */ (tp->t_flags & TF_SACKNOW) ) { optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so); } #else /* If other options not defined this build then don't bother to call bld_options() except * on SYN packets */ if(flags & TH_SYN) { optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so); } #endif if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be -1. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit and pull snd_nxt * back to (closed) window. We will enter persist * state below. If the window didn't close completely, * just wait for an ACK. 
*/ len = 0; if (win == 0) { tp->t_timer[TCPT_REXMT] = 0; tp->snd_nxt = tp->snd_una; } } if (len > (int)tp->t_maxseg) { len = tp->t_maxseg; sendalot = 1; } #ifdef IP_V4 #ifdef IP_PMTU { int pmtu = tp->t_inpcb->inp_pmtu - 40; if (len > pmtu) { len = pmtu - 40; sendalot = 1; } } #endif /* IP_PMTU */ /* We don't need a pmtu test for IPv6. V6 code limits t_maxseg to * the Path MTU, so the test above the v4 ifdef above covers us. */ #endif /* IP_V4 */ if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; win = (long)(sbspace(&so->so_rcv)); /* * If our state indicates that FIN should be sent * and we have not yet done so, or we're retransmitting the FIN, * then we need to send. */ if ((flags & TH_FIN) && (so->so_snd.sb_cc == 0) && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) { goto send; } /* * Send if we owe peer an ACK. */ if (tp->t_flags & TF_ACKNOW) goto send; if (flags & (TH_SYN|TH_RST)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * Sender silly window avoidance. If connection is idle * and can send all data, a maximum segment, * at least a maximum default-size segment do it, * or are forced, do it; otherwise don't bother. * If peer's buffer is tiny, then send * when window is at least half open. * If retransmitting (possibly after persist timer forced us * to send into a small window), then must resend. */ if (len) { if (len == (int)tp->t_maxseg) goto send; if ((idle || tp->t_flags & TF_NODELAY) && len + off >= (int)so->so_snd.sb_cc) { goto send; } if (tp->t_force) goto send; if (len >= (int)(tp->max_sndwnd / 2)) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments or at least 35% of the maximum possible * window, then want to send a window update to peer. 
*/ if (win > 0) { int adv = (int)win - (int)(tp->rcv_adv - tp->rcv_nxt); if (so->so_rcv.sb_cc == 0 && adv >= (int)(tp->t_maxseg * 2)) goto send; if (100 * (u_int)adv / so->so_rcv.sb_hiwat >= 35) goto send; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tp->t_timer[TCPT_PERSIST] * is set when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. * tp->t_timer[TCPT_REXMT] * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ return (0); send: ENTER_CRIT_SECTION(tp); /* Limit send length to the current buffer so as to * avoid doing the "mbuf shuffle" in m_copy(). 
*/ bufoff = off; sendm = so->so_snd.sb_mb; if (len) { /* find mbuf containing data to send (at "off") */ while (sendm) /* loop through socket send list */ { bufoff -= sendm->m_len; if (bufoff < 0) /* if off is in this buffer, break */ break; sendm = sendm->m_next; } if (!sendm) { dtrap(); /* shouldn't happen */ } bufoff += sendm->m_len; /* index to next data to send in msend */ /* if socket has multiple unsent mbufs, set flag for send to loop */ if ((sendm->m_next) && (len > (int)sendm->m_len)) { flags &= ~TH_FIN; /* don't FIN on segment prior to last */ sendalot = 1; /* set to send more segments */ } if((flags & TH_FIN) && (so->so_snd.sb_cc > (unsigned)len)) { /* This can happen on slow links (PPP) which retry the last * segment - the one with the FIN bit attached to data. */ flags &= ~TH_FIN; /* don't FIN on segment prior to last */ } /* only send the rest of msend */ len = min(len, (int)sendm->m_len); /* if we're not sending starting at sendm->m_data (in which * case bufoff != 0), then we will copy the data; else we would * write IP/TCP headers over sent but un-ack'ed data in sendm. * Similarly, if sendm->m_data is not aligned with respect to * sendm->m_base and ALIGN_TYPE, we will copy the data to * ensure that it (and the then-prepended IP/TCP headers) will * be aligned according to ALIGN_TYPE. */ if ((bufoff != 0) || /* data not front aligned in send mbuf? */ (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) != 0)) { len = min(len, (int)(sendm->m_len - bufoff)); /* limit len again */ /* One more test - if this data is not aligned with the front * of the m_data buffer then we can't use it in place, else we * might write the IP/TCP header over data that has not yet * been acked. In this case we must make sure our send * fits into a little buffer and send what we can. */ if ((len > (int)(lilbufsiz - HDRSLEN)) && /* length is bigger the small buffer? 
*/ (bigfreeq.q_len < 2)) /* and we are low on big buffers */ { len = lilbufsiz - HDRSLEN; } } } /* if send data is sufficiently aligned in packet, prepend TCP/IP header * in the space provided. */ if (len && (bufoff == 0) && (sendm->pkt->inuse == 1) && (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) == 0) && (optlen == 0)) { /* get an empty mbuf to "clone" the data */ m = m_getnbuf(MT_TXDATA, 0); if (!m) { EXIT_CRIT_SECTION(tp); return (ENOBUFS); } m->pkt = sendm->pkt; /* copy packet location in new mbuf */ m->pkt->inuse++; /* bump packet's use count */ m->m_base = sendm->m_base; /* clone mbuf members */ m->m_memsz = sendm->m_memsz; m->m_len = len + TCPIPHDRSZ; /* adjust clone for header */ m->m_data = sendm->m_data - TCPIPHDRSZ; } else /* either no data or data is not front aligned in mbuf */ { /* Grab a header mbuf, attaching a copy of data to be * transmitted, and initialize the header from * the template for sends on this connection. */ m = m_getwithdata (MT_HEADER, IFNETHDR_SIZE + TCPIPHDRSZ); if (m ==(struct mbuf *)NULL) { EXIT_CRIT_SECTION(tp); return ENOBUFS; } m->m_len = TCPIPHDRSZ; m->m_data += IFNETHDR_SIZE;/* Move this to sizeof tcpip hdr leave*/ /* 14 bytes for ethernet header */ if (len) /* attach any data to send */ { m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); if (m->m_next == 0) { m_freem(m); EXIT_CRIT_SECTION(tp); return ENOBUFS; } } } EXIT_CRIT_SECTION(tp); if (len) { if (tp->t_force && len == 1) tcpstat.tcps_sndprobe++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tcpstat.tcps_sndrexmitpack++; tcpstat.tcps_sndrexmitbyte += len; #ifdef TCP_SACK if(sack_resend) tcpstat.tcps_sackresend++; #endif } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; } } else if (tp->t_flags & TF_ACKNOW) { tcpstat.tcps_sndacks++; } else if (flags & (TH_SYN|TH_FIN|TH_RST)) tcpstat.tcps_sndctrl++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) tcpstat.tcps_sndurg++; else tcpstat.tcps_sndwinup++; ti = (struct tcpiphdr *)(m->m_data+sizeof(struct 
ip)-sizeof(struct ipovly)); if ((char *)ti < m->pkt->nb_buff) { panic("tcp_out- packet ptr underflow\n"); } tcp_mbuf = m; /* flag TCP header mbuf */ #ifdef IP_V6 /* Dual mode code */ if(so->so_domain == AF_INET6) { m = mbuf_prepend(m, sizeof(struct ipv6)); if(m == NULL) { /* this can happen when we run out of mbufs or pkt buffers * That is, mfreeq is empty or (lilfreeq, bigfreeq) are empty. * One solution is to find out which one is getting full and * then increase them. */ dtrap(); /* This is really rare... */ m_freem(tcp_mbuf); /* Free TCP/data chain */ return ENOBUFS; } /* strip overlay from front of TCP header */ tcp_mbuf->m_data += sizeof(struct ipovly); tcp_mbuf->m_len -= sizeof(struct ipovly); } #endif /* end IP_V6 */ if (tp->t_template == 0) panic("tcp_output"); MEMCPY((char*)ti, (char*)tp->t_template, sizeof(struct tcpiphdr)); /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) { tp->snd_nxt--; } ti->ti_seq = htonl(tp->snd_nxt); ti->ti_ack = htonl(tp->rcv_nxt); /* * If we're sending a SYN, check the IP address of the interface * that we will (likely) use to send the IP datagram -- if it's * changed from what is in the template (as it might if this is * a retransmission, and the original SYN caused PPP to start * bringing the interface up, and PPP has got a new IP address * via IPCP), update the template and the inpcb with the new * address. 
*/ if (flags & TH_SYN) { struct inpcb * inp; inp = (struct inpcb *)so->so_pcb; switch(so->so_domain) { #ifdef IP_V4 case AF_INET: { ip_addr src; #ifdef INCLUDE_PPP if(((flags & TH_ACK) == 0) && /* SYN only, not SYN/ACK */ (inp->ifp) && /* Make sure we have iface */ (inp->ifp->mib.ifType == PPP)) /* only PPP type */ { dtrap(); /* remove after confirmed to work in PPP */ src = ip_mymach(ti->ti_dst.s_addr); if (src != ti->ti_src.s_addr) { ti->ti_src.s_addr = src; tp->t_template->ti_src.s_addr = src; tp->t_inpcb->inp_laddr.s_addr = src; } } #endif /* INCLUDE_PPP */ /* If this is a SYN (not a SYN/ACK) then set the pmtu */ if((flags & TH_ACK) == 0) { #ifdef IP_PMTU inp->inp_pmtu = pmtucache_get(inp->inp_faddr.s_addr); #else /* not compiled for pathmtu, guess based on iface */ { NET ifp; /* find iface for route. Pass "src" as nexthop return */ ifp = iproute(ti->ti_dst.s_addr, &src); if(ifp) inp->inp_pmtu = ifp->n_mtu - (ifp->n_lnh + 40); else inp->inp_pmtu = 580; /* Ugh. */ } #endif /* IP_PMTU */ } break; } #endif /* IP_V4 */ #ifdef IP_V6 case AF_INET6: { struct ip6_inaddr * local; local = ip6_myaddr(&tp->t_inpcb->ip6_faddr, inp->ifp); /* If we got a local address & it's not the one in the pcb, then * we assume it changed at the iface and fix it in the pcb. Unlike * v4, we don't have an IP header yet, not do we have a template * to worry about. 
*/ if((local) && (!IP6EQ(&local->addr, &tp->t_inpcb->ip6_laddr))) { IP6CPY(&tp->t_inpcb->ip6_laddr, &local->addr); } /* If this is a SYN (not a SYN/ACK) then set the pmtu */ if((flags & TH_ACK) == 0) { inp->inp_pmtu = ip6_pmtulookup(&inp->ip6_laddr, inp->ifp); } break; } #endif /* IP_V6 */ default: dtrap(); /* bad domain setting */ } } /* fill in options if any are set */ if (optlen) { struct mbuf * mopt; mopt = m_getwithdata(MT_TXDATA, MAXOPTLEN); if (mopt == NULL) { m_freem(m); return (ENOBUFS); } /* insert options mbuf after after tmp_mbuf */ mopt->m_next = tcp_mbuf->m_next; tcp_mbuf->m_next = mopt; /* extend options to aligned address */ while(optlen & 0x03) tcp_optionbuf[optlen++] = TCPOPT_EOL; MEMCPY(mtod(mopt, char *), tcp_optionbuf, optlen); mopt->m_len = optlen; /* use portable macro to set tcp data offset bits */ SET_TH_OFF(ti->ti_t, ((sizeof (struct tcphdr) + optlen) >> 2)); } ti->ti_flags = (u_char)flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) win = 0; if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(tp->rcv_adv - tp->rcv_nxt); /* do check for Iniche buffer limits -JB- */ if (bigfreeq.q_len == 0) /* If queue length is 0, set window to 0 */ { win = 0; } else if(win > (((long)bigfreeq.q_len - 1) * (long)bigbufsiz)) { win = ((long)bigfreeq.q_len - 1) * bigbufsiz; } #ifdef TCP_WIN_SCALE if(tp->t_flags & TF_WINSCALE) { ti->ti_win = htons((u_short)(win >> tp->rcv_wind_scale)); /* apply scale */ }
/* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct inpcb * const inp = tp->t_inpcb; struct socket *so = inp->inp_socket; long len, recvwin, sendwin; int nsacked = 0; int off, flags, error = 0; #ifdef TCP_SIGNATURE int sigoff = 0; #endif struct mbuf *m; struct ip *ip; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned int ipoptlen, optlen, hdrlen; int idle; boolean_t sendalot; struct ip6_hdr *ip6; #ifdef INET6 const boolean_t isipv6 = INP_ISIPV6(inp); #else const boolean_t isipv6 = FALSE; #endif boolean_t can_tso = FALSE, use_tso; boolean_t report_sack, idle_cwv = FALSE; u_int segsz, tso_hlen, tso_lenmax = 0; int segcnt = 0; boolean_t need_sched = FALSE; KKASSERT(so->so_port == &curthread->td_msgport); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ /* * If we have been idle for a while, the send congestion window * could be no longer representative of the current state of the * link; need to validate congestion window. However, we should * not perform congestion window validation here, since we could * be asked to send pure ACK. */ if (tp->snd_max == tp->snd_una && (ticks - tp->snd_last) >= tp->t_rxtcur && tcp_idle_restart) idle_cwv = TRUE; /* * Calculate whether the transmit stream was previously idle * and adjust TF_LASTIDLE for the next time. 
*/ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (tp->t_flags & TF_MORETOCOME)) tp->t_flags |= TF_LASTIDLE; else tp->t_flags &= ~TF_LASTIDLE; if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) nsacked = tcp_sack_bytes_below(&tp->scb, tp->snd_nxt); /* * Find out whether TSO could be used or not * * For TSO capable devices, the following assumptions apply to * the processing of TCP flags: * - If FIN is set on the large TCP segment, the device must set * FIN on the last segment that it creates from the large TCP * segment. * - If PUSH is set on the large TCP segment, the device must set * PUSH on the last segment that it creates from the large TCP * segment. */ #if !defined(IPSEC) && !defined(FAST_IPSEC) if (tcp_do_tso #ifdef TCP_SIGNATURE && (tp->t_flags & TF_SIGNATURE) == 0 #endif ) { if (!isipv6) { struct rtentry *rt = inp->inp_route.ro_rt; if (rt != NULL && (rt->rt_flags & RTF_UP) && (rt->rt_ifp->if_hwassist & CSUM_TSO)) { can_tso = TRUE; tso_lenmax = rt->rt_ifp->if_tsolen; } } } #endif /* !IPSEC && !FAST_IPSEC */ again: m = NULL; ip = NULL; th = NULL; ip6 = NULL; if ((tp->t_flags & (TF_SACK_PERMITTED | TF_NOOPT)) == TF_SACK_PERMITTED && (!TAILQ_EMPTY(&tp->t_segq) || tp->reportblk.rblk_start != tp->reportblk.rblk_end)) report_sack = TRUE; else report_sack = FALSE; /* Make use of SACK information when slow-starting after a RTO. */ if (TCP_DO_SACK(tp) && tp->snd_nxt != tp->snd_max && !IN_FASTRECOVERY(tp)) { tcp_seq old_snd_nxt = tp->snd_nxt; tcp_sack_skip_sacked(&tp->scb, &tp->snd_nxt); nsacked += tp->snd_nxt - old_snd_nxt; } sendalot = FALSE; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd + nsacked); sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. 
*/ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCE) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.ssb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_callout_stop(tp, tp->tt_persist); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.ssb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * A negative length can also occur when we are in the * TCPS_SYN_RECEIVED state due to a simultanious connect where * our SYN has not been acked yet. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. */ len = (long)ulmin(so->so_snd.ssb_cc, sendwin) - off; /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data, suppress sending * segment (sending the segment would be an option if we still * did TAO and the remote host supported it). 
*/ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; if (len > 0 && tp->t_state == TCPS_SYN_SENT) { tp->t_flags &= ~(TF_ACKNOW | TF_XMITNOW); return 0; } } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if (flags & TH_SYN) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * A negative len can occur if our FIN has been sent but not * acked, or if we are in a simultanious connect in the * TCPS_SYN_RECEIVED state with our SYN sent but not yet * acked. * * If our window has contracted to 0 in the FIN case * (which can only occur if we have NOT been called to * retransmit as per code a few paragraphs up) then we * want to shift the retransmit timer over to the * persist timer. * * However, if we are in the TCPS_SYN_RECEIVED state * (the SYN case) we will be in a simultanious connect and * the window may be zero degeneratively. In this case we * do not want to shift to the persist timer after the SYN * or the SYN+ACK transmission. */ len = 0; if (sendwin == 0 && tp->t_state != TCPS_SYN_RECEIVED) { tcp_callout_stop(tp, tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_callout_active(tp, tp->tt_persist)) tcp_setpersist(tp); } } KASSERT(len >= 0, ("%s: len < 0", __func__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. 
hiwat has not significantly exceeded bwnd (inflight) * (bwnd is a maximal value if inflight is disabled). * 3. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 4. hiwat has not hit maximal automatic size; * 5. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * The criteria for shrinking the buffer is based solely on * the inflight code (snd_bwnd). If inflight is disabled, * the buffer will not be shrinked. Note that snd_bwnd already * has a fudge factor. Our test adds a little hysteresis. 
*/ if (tcp_do_autosndbuf && (so->so_snd.ssb_flags & SSB_AUTOSIZE)) { const int asbinc = tcp_autosndbuf_inc; const int hiwat = so->so_snd.ssb_hiwat; const int lowat = so->so_snd.ssb_lowat; u_long newsize; if ((tp->snd_wnd / 4 * 5) >= hiwat && so->so_snd.ssb_cc >= (hiwat / 8 * 7) && hiwat < tp->snd_bwnd + hiwat / 10 && hiwat + asbinc < tcp_autosndbuf_max && hiwat < (TCP_MAXWIN << tp->snd_scale) && sendwin >= (so->so_snd.ssb_cc - (tp->snd_nxt - tp->snd_una))) { newsize = ulmin(hiwat + asbinc, tcp_autosndbuf_max); if (!ssb_reserve(&so->so_snd, newsize, so, NULL)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); #if 0 if (newsize >= (TCP_MAXWIN << tp->snd_scale)) atomic_clear_int(&so->so_snd.ssb_flags, SSB_AUTOSIZE); #endif } else if ((long)tp->snd_bwnd < (long)(hiwat * 3 / 4 - lowat - asbinc) && hiwat > tp->t_maxseg * 2 + asbinc && hiwat + asbinc >= tcp_autosndbuf_min && tcp_do_autosndbuf == 1) { newsize = ulmax(hiwat - asbinc, tp->t_maxseg * 2); ssb_reserve(&so->so_snd, newsize, so, NULL); } } /* * Don't use TSO, if: * - Congestion window needs validation * - There are SACK blocks to report * - RST or SYN flags is set * - URG will be set * * XXX * Checking for SYN|RST looks overkill, just to be safe than sorry */ use_tso = can_tso; if (report_sack || idle_cwv || (flags & (TH_RST | TH_SYN))) use_tso = FALSE; if (use_tso) { tcp_seq ugr_nxt = tp->snd_nxt; if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && tp->snd_nxt == tp->snd_max) --ugr_nxt; if (SEQ_GT(tp->snd_up, ugr_nxt)) use_tso = FALSE; } if (use_tso) { /* * Find out segment size and header length for TSO */ error = tcp_tso_getsize(tp, &segsz, &tso_hlen); if (error) use_tso = FALSE; } if (!use_tso) { segsz = tp->t_maxseg; tso_hlen = 0; /* not used */ } /* * Truncate to the maximum segment length if not TSO, and ensure that * FIN is removed if the length no longer contains the last data byte. 
*/ if (len > segsz) { if (!use_tso) { len = segsz; ++segcnt; } else { int nsegs; if (__predict_false(tso_lenmax < segsz)) tso_lenmax = segsz << 1; /* * Truncate TSO transfers to (IP_MAXPACKET - iphlen - * thoff), and make sure that we send equal size * transfers down the stack (rather than big-small- * big-small-...). */ len = min(len, tso_lenmax); nsegs = min(len, (IP_MAXPACKET - tso_hlen)) / segsz; KKASSERT(nsegs > 0); len = nsegs * segsz; if (len <= segsz) { use_tso = FALSE; ++segcnt; } else { segcnt += nsegs; } } sendalot = TRUE; } else { use_tso = FALSE; if (len > 0) ++segcnt; } if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.ssb_cc)) flags &= ~TH_FIN; recvwin = ssb_space(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limiting the window size) * - we need to retransmit */ if (len) { if (len >= segsz) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.ssb_cc && !(tp->t_flags & TF_NOPUSH)) { goto send; } if (tp->t_flags & TF_FORCE) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (tp->t_flags & TF_XMITNOW) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). 
If the difference is at least two * max size segments, or at least 50% of the maximum possible * window, then want to send a window update to peer. */ if (recvwin > 0) { /* * "adv" is the amount we can increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv = min(recvwin, (long)TCP_MAXWIN << tp->rcv_scale) - (tp->rcv_adv - tp->rcv_nxt); long hiwat; /* * This ack case typically occurs when the user has drained * the TCP socket buffer sufficiently to warrent an ack * containing a 'pure window update'... that is, an ack that * ONLY updates the tcp window. * * It is unclear why we would need to do a pure window update * past 2 segments if we are going to do one at 1/2 the high * water mark anyway, especially since under normal conditions * the user program will drain the socket buffer quickly. * The 2-segment pure window update will often add a large * number of extra, unnecessary acks to the stream. * * avoid_pure_win_update now defaults to 1. */ if (avoid_pure_win_update == 0 || (tp->t_flags & TF_RXRESIZED)) { if (adv >= (long) (2 * segsz)) { goto send; } } hiwat = (long)(TCP_MAXWIN << tp->rcv_scale); if (hiwat > (long)so->so_rcv.ssb_hiwat) hiwat = (long)so->so_rcv.ssb_hiwat; if (adv >= hiwat / 2) goto send; } /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && !(tp->t_flags & TF_NEEDSYN))) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if ((flags & TH_FIN) && (!(tp->t_flags & TF_SENTFIN) || tp->snd_nxt == tp->snd_una)) goto send; /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. 
The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_callout_active(tp, tp->tt_persist) * is true when we are in persist state. * The TF_FORCE flag in tp->t_flags * is set when we are called to send a persist packet. * tcp_callout_active(tp, tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, otherwise force out * a byte. * * Don't try to set the persist state if we are in TCPS_SYN_RECEIVED * with data pending. This situation can occur during a * simultanious connect. */ if (so->so_snd.ssb_cc > 0 && tp->t_state != TCPS_SYN_RECEIVED && !tcp_callout_active(tp, tp->tt_rexmt) && !tcp_callout_active(tp, tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ tp->t_flags &= ~TF_XMITNOW; return (0); send: if (need_sched && len > 0) { tcp_output_sched(tp); return 0; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. 
* max_linkhdr + sizeof(struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else hdrlen = sizeof(struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; if (!(tp->t_flags & TF_NOOPT)) { u_short mss; opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; mss = htons((u_short) tcp_mssopt(tp)); memcpy(opt + 2, &mss, sizeof mss); optlen = TCPOLEN_MAXSEG; if ((tp->t_flags & TF_REQ_SCALE) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_SCALE))) { *((u_int32_t *)(opt + optlen)) = htonl( TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 | tp->request_r_scale); optlen += 4; } if ((tcp_do_sack && !(flags & TH_ACK)) || tp->t_flags & TF_SACK_PERMITTED) { uint32_t *lp = (uint32_t *)(opt + optlen); *lp = htonl(TCPOPT_SACK_PERMITTED_ALIGNED); optlen += TCPOLEN_SACK_PERMITTED_ALIGNED; } } } /* * Send a timestamp and echo-reply if this is a SYN and our side * wants to use timestamps (TF_REQ_TSTMP is set) or both our side * and our peer have sent timestamps in our SYN's. */ if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP && !(flags & TH_RST) && (!(flags & TH_ACK) || (tp->t_flags & TF_RCVD_TSTMP))) { u_int32_t *lp = (u_int32_t *)(opt + optlen); /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); *lp++ = htonl(ticks); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.ssb_flags & SSB_AUTOSIZE)) tp->rfbuf_ts = ticks; /* * If this is a SACK connection and we have a block to report, * fill in the SACK blocks in the TCP options. 
*/ if (report_sack) tcp_sack_fill_report(tp, opt, &optlen); #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { int i; u_char *bp; /* * Initialize TCP-MD5 option (RFC2385) */ bp = (u_char *)opt + optlen; *bp++ = TCPOPT_SIGNATURE; *bp++ = TCPOLEN_SIGNATURE; sigoff = optlen + 2; for (i = 0; i < TCP_SIGLEN; i++) *bp++ = 0; optlen += TCPOLEN_SIGNATURE; /* * Terminate options list and maintain 32-bit alignment. */ *bp++ = TCPOPT_NOP; *bp++ = TCPOPT_EOL; optlen += 2; } #endif /* TCP_SIGNATURE */ KASSERT(optlen <= TCP_MAXOLEN, ("too many TCP options")); hdrlen += optlen; if (isipv6) { ipoptlen = ip6_optlen(inp); } else { if (inp->inp_options) { ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); } else {
/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 *
 * Entry point for every TCP segment delivered by the slirp IP layer.
 *
 * Parameters:
 *   m      - mbuf holding the IP+TCP segment.  May be NULL: that is the
 *            re-entry path used to continue a deferred connect(), in which
 *            case the saved mbuf/header are recovered from 'inso'.
 *   iphlen - length of the IP header in front of the TCP header; IP
 *            options are stripped below if it exceeds the base header.
 *   inso   - socket to continue on; only used for the m == NULL case.
 *
 * The mbuf is consumed on every path (freed, appended to a socket buffer,
 * queued for reassembly, or reused to build a RST/ICMP response); the
 * caller must not touch it after this returns.
 */
void
tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
{
    struct ip save_ip, *ip;
    register struct tcpiphdr *ti;
    caddr_t optp = NULL;      /* start of TCP options in the segment, if any */
    int optlen = 0;
    int len, tlen, off;
    register struct tcpcb *tp = NULL;
    register int tiflags;
    struct socket *so = NULL;
    int todrop, acked, ourfinisacked, needoutput = 0;
    int iss = 0;              /* nonzero forces the ISS after a TIME_WAIT restart */
    u_long tiwin;
    int ret;
    struct ex_list *ex_ptr;
    Slirp *slirp;

    DEBUG_CALL("tcp_input");
    DEBUG_ARGS((dfd, " m = %8lx iphlen = %2d inso = %lx\n",
                (long )m, iphlen, (long )inso ));

    /*
     * If called with m == 0, then we're continuing the connect
     */
    if (m == NULL) {
        so = inso;
        slirp = so->slirp;

        /* Re-set a few variables from the state saved when the
         * SYN was first seen and the host-side connect() started. */
        tp = sototcpcb(so);
        m = so->so_m;
        so->so_m = NULL;
        ti = so->so_ti;
        tiwin = ti->ti_win;
        tiflags = ti->ti_flags;

        goto cont_conn;
    }
    slirp = m->slirp;

    /*
     * Get IP and TCP header together in first mbuf.
     * Note: IP leaves IP header in first mbuf.
     */
    ti = mtod(m, struct tcpiphdr *);
    if (iphlen > sizeof(struct ip )) {
        /* Discard IP options so the overlay tcpiphdr layout below is valid. */
        ip_stripoptions(m, (struct mbuf *)0);
        iphlen=sizeof(struct ip );
    }
    /* XXX Check if too short */

    /*
     * Save a copy of the IP header in case we want restore it
     * for sending an ICMP error message in response.
     */
    ip=mtod(m, struct ip *);
    save_ip = *ip;
    save_ip.ip_len+= iphlen;

    /*
     * Checksum extended TCP header and data.
     * The IP header fields are overlaid with the TCP pseudo-header
     * (x1/len cleared and set) before running the checksum.
     */
    tlen = ((struct ip *)ti)->ip_len;
    tcpiphdr2qlink(ti)->next = tcpiphdr2qlink(ti)->prev = NULL;
    memset(&ti->ti_i.ih_mbuf, 0 , sizeof(struct mbuf_ptr));
    ti->ti_x1 = 0;
    ti->ti_len = htons((uint16_t)tlen);
    len = sizeof(struct ip ) + tlen;
    if(cksum(m, len)) {
        goto drop;
    }

    /*
     * Check that TCP offset makes sense,
     * pull out TCP options and adjust length.		XXX
     */
    off = ti->ti_off << 2;
    if (off < sizeof (struct tcphdr) || off > tlen) {
        goto drop;
    }
    tlen -= off;
    ti->ti_len = tlen;
    if (off > sizeof (struct tcphdr)) {
        optlen = off - sizeof (struct tcphdr);
        optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
    }
    tiflags = ti->ti_flags;

    /*
     * Convert TCP protocol specific fields to host format.
     */
    NTOHL(ti->ti_seq);
    NTOHL(ti->ti_ack);
    NTOHS(ti->ti_win);
    NTOHS(ti->ti_urp);

    /*
     * Drop TCP, IP headers and TCP options.
     * m_data then points at the payload; the reverse adjustment is done
     * on the ICMP error path below before the mbuf is reused.
     */
    m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
    m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);

    /*
     * Locate pcb for segment.
     * One-entry cache (tcp_last_so) first, full list lookup second.
     */
findso:
    so = slirp->tcp_last_so;
    if (so->so_fport != ti->ti_dport ||
        so->so_lport != ti->ti_sport ||
        so->so_laddr.s_addr != ti->ti_src.s_addr ||
        so->so_faddr.s_addr != ti->ti_dst.s_addr) {
        so = solookup(&slirp->tcb, ti->ti_src, ti->ti_sport,
                      ti->ti_dst, ti->ti_dport);
        if (so)
            slirp->tcp_last_so = so;
    }

    /*
     * If the state is CLOSED (i.e., TCB does not exist) then
     * all data in the incoming segment is discarded.
     * If the TCB exists but is in CLOSED state, it is embryonic,
     * but should either do a listen or a connect soon.
     *
     * state == CLOSED means we've done socreate() but haven't
     * attached it to a protocol yet...
     *
     * XXX If a TCB does not exist, and the TH_SYN flag is
     * the only flag set, then create a session, mark it
     * as if it was LISTENING, and continue...
     */
    if (so == NULL) {
        if (slirp->restricted) {
            /* Any hostfwds will have an existing socket, so we only get here
             * for non-hostfwd connections. These should be dropped, unless it
             * happens to be a guestfwd.
             */
            for (ex_ptr = slirp->exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) {
                if (ex_ptr->ex_fport == ti->ti_dport &&
                    ti->ti_dst.s_addr == ex_ptr->ex_addr.s_addr) {
                    break;
                }
            }
            if (!ex_ptr) {
                goto dropwithreset;
            }
        }

        /* Only a bare SYN may create a new embryonic connection. */
        if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
            goto dropwithreset;

        if ((so = socreate(slirp)) == NULL)
            goto dropwithreset;
        if (tcp_attach(so) < 0) {
            free(so); /* Not sofree (if it failed, it's not insqued) */
            goto dropwithreset;
        }

        sbreserve(&so->so_snd, TCP_SNDSPACE);
        sbreserve(&so->so_rcv, TCP_RCVSPACE);

        so->so_laddr = ti->ti_src;
        so->so_lport = ti->ti_sport;
        so->so_faddr = ti->ti_dst;
        so->so_fport = ti->ti_dport;

        /* Fall back to the segment's own TOS when no per-service TOS applies. */
        if ((so->so_iptos = tcp_tos(so)) == 0)
            so->so_iptos = ((struct ip *)ti)->ip_tos;

        tp = sototcpcb(so);
        tp->t_state = TCPS_LISTEN;
    }

    /*
     * If this is a still-connecting socket, this probably
     * a retransmit of the SYN. Whether it's a retransmit SYN
     * or something else, we nuke it.
     */
    if (so->so_state & SS_ISFCONNECTING)
        goto drop;

    tp = sototcpcb(so);

    /* XXX Should never fail */
    if (tp == NULL)
        goto dropwithreset;
    if (tp->t_state == TCPS_CLOSED)
        goto drop;

    tiwin = ti->ti_win;

    /*
     * Segment received on connection.
     * Reset idle time and keep-alive timer.
     * NOTE(review): SO_OPTIONS looks like it should test the socket's
     * option flags (e.g. so->so_options & SO_KEEPALIVE) rather than the
     * bare macro - confirm against the macro's definition.
     */
    tp->t_idle = 0;
    if (SO_OPTIONS)
        tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL;
    else
        tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE;

    /*
     * Process options if not in LISTEN state,
     * else do it below (after getting remote address).
     */
    if (optp && tp->t_state != TCPS_LISTEN)
        tcp_dooptions(tp, (u_char *)optp, optlen, ti);

    /*
     * Header prediction: check for the two common cases
     * of a uni-directional data xfer. If the packet has
     * no control flags, is in-sequence, the window didn't
     * change and we're not retransmitting, it's a
     * candidate. If the length is zero and the ack moved
     * forward, we're the sender side of the xfer. Just
     * free the data acked & wake any higher level process
     * that was blocked waiting for space. If the length
     * is non-zero and the ack didn't move, we're the
     * receiver side. If we're getting packets in-order
     * (the reassembly queue is empty), add the data to
     * the socket buffer and note that we need a delayed ack.
     *
     * XXX Some of these tests are not needed
     * eg: the tiwin == tp->snd_wnd prevents many more
     * predictions.. with no *real* advantage..
     */
    if (tp->t_state == TCPS_ESTABLISHED &&
        (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
        ti->ti_seq == tp->rcv_nxt &&
        tiwin && tiwin == tp->snd_wnd &&
        tp->snd_nxt == tp->snd_max) {
        if (ti->ti_len == 0) {
            if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
                SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
                tp->snd_cwnd >= tp->snd_wnd) {
                /*
                 * this is a pure ack for outstanding data.
                 */
                if (tp->t_rtt &&
                    SEQ_GT(ti->ti_ack, tp->t_rtseq))
                    tcp_xmit_timer(tp, tp->t_rtt);
                acked = ti->ti_ack - tp->snd_una;
                sbdrop(&so->so_snd, acked);
                tp->snd_una = ti->ti_ack;
                m_free(m);

                /*
                 * If all outstanding data are acked, stop
                 * retransmit timer, otherwise restart timer
                 * using current (possibly backed-off) value.
                 * If process is waiting for space,
                 * wakeup/selwakeup/signal. If data
                 * are ready to send, let tcp_output
                 * decide between more output or persist.
                 */
                if (tp->snd_una == tp->snd_max)
                    tp->t_timer[TCPT_REXMT] = 0;
                else if (tp->t_timer[TCPT_PERSIST] == 0)
                    tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

                /*
                 * This is called because sowwakeup might have
                 * put data into so_snd. Since we don't so sowwakeup,
                 * we don't need this.. XXX???
                 */
                if (so->so_snd.sb_cc)
                    (void) tcp_output(tp);

                return;
            }
        } else if (ti->ti_ack == tp->snd_una &&
            tcpfrag_list_empty(tp) &&
            ti->ti_len <= sbspace(&so->so_rcv)) {
            /*
             * this is a pure, in-sequence data packet
             * with nothing on the reassembly queue and
             * we have enough buffer space to take it.
             */
            tp->rcv_nxt += ti->ti_len;
            /*
             * Add data to socket buffer.
             * The emulation layer may consume/rewrite the data first.
             */
            if (so->so_emu) {
                if (tcp_emu(so,m)) sbappend(so, m);
            } else
                sbappend(so, m);

            /*
             * If this is a short packet, then ACK now - with Nagle
             * congestion avoidance sender won't send more until
             * he gets an ACK.
             *
             * It is better to not delay acks at all to maximize
             * TCP throughput. See RFC 2581.
             */
            tp->t_flags |= TF_ACKNOW;
            tcp_output(tp);
            return;
        }
    } /* header prediction */

    /*
     * Calculate amount of space in receive window,
     * and then do TCP input processing.
     * Receive window is amount of space in rcv queue,
     * but not less than advertised window.
     */
    {
        int win;
        win = sbspace(&so->so_rcv);
        if (win < 0)
            win = 0;
        tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
    }

    switch (tp->t_state) {

    /*
     * If the state is LISTEN then ignore segment if it contains an RST.
     * If the segment contains an ACK then it is bad and send a RST.
     * If it does not contain a SYN then it is not interesting; drop it.
     * Don't bother responding if the destination was a broadcast.
     * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
     * tp->iss, and send a segment:
     * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
     * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
     * Fill in remote peer address fields if not previously specified.
     * Enter SYN_RECEIVED state, and process any other fields of this
     * segment in this state.
     */
    case TCPS_LISTEN: {

        if (tiflags & TH_RST)
            goto drop;
        if (tiflags & TH_ACK)
            goto dropwithreset;
        if ((tiflags & TH_SYN) == 0)
            goto drop;

        /*
         * This has way too many gotos...
         * But a bit of spaghetti code never hurt anybody :)
         */

        /*
         * If this is destined for the control address, then flag to
         * tcp_ctl once connected, otherwise connect
         */
        if ((so->so_faddr.s_addr & slirp->vnetwork_mask.s_addr) ==
            slirp->vnetwork_addr.s_addr) {
            if (so->so_faddr.s_addr != slirp->vhost_addr.s_addr &&
                so->so_faddr.s_addr != slirp->vnameserver_addr.s_addr) {
                /* May be an add exec */
                for (ex_ptr = slirp->exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) {
                    if(ex_ptr->ex_fport == so->so_fport &&
                       so->so_faddr.s_addr == ex_ptr->ex_addr.s_addr) {
                        so->so_state |= SS_CTL;
                        break;
                    }
                }
                if (so->so_state & SS_CTL) {
                    goto cont_input;
                }
            }
            /* CTL_ALIAS: Do nothing, tcp_fconnect will be called on it */
        }

        if (so->so_emu & EMU_NOCONNECT) {
            so->so_emu &= ~EMU_NOCONNECT;
            goto cont_input;
        }

        /*
         * Start the non-blocking connect() toward the real destination.
         * EINPROGRESS/EWOULDBLOCK are the normal "in flight" results;
         * anything else is a hard failure reported back to the guest.
         */
        if ((tcp_fconnect(so) == -1) &&
#if defined(_WIN32)
            socket_error() != WSAEWOULDBLOCK
#else
            (errno != EINPROGRESS) && (errno != EWOULDBLOCK)
#endif
        ) {
            u_char code=ICMP_UNREACH_NET;
            DEBUG_MISC((dfd, " tcp fconnect errno = %d-%s\n",
                        errno,strerror(errno)));
            if(errno == ECONNREFUSED) {
                /* ACK the SYN, send RST to refuse the connection */
                tcp_respond(tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
                            TH_RST|TH_ACK);
            } else {
                if(errno == EHOSTUNREACH) code=ICMP_UNREACH_HOST;
                HTONL(ti->ti_seq); /* restore tcp header */
                HTONL(ti->ti_ack);
                HTONS(ti->ti_win);
                HTONS(ti->ti_urp);
                m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
                m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
                *ip=save_ip;
                icmp_error(m, ICMP_UNREACH,code, 0,strerror(errno));
            }
            tcp_close(tp);
            m_free(m);
        } else {
            /*
             * Haven't connected yet, save the current mbuf
             * and ti, and return
             * XXX Some OS's don't tell us whether the connect()
             * succeeded or not. So we must time it out.
             */
            so->so_m = m;
            so->so_ti = ti;
            tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
            tp->t_state = TCPS_SYN_RECEIVED;
            tcp_template(tp);
        }
        return;

    cont_conn:
        /* m==NULL
         * Check if the connect succeeded
         * (re-entered from the m == NULL fast path at the top).
         */
        if (so->so_state & SS_NOFDREF) {
            tp = tcp_close(tp);
            goto dropwithreset;
        }
    cont_input:
        tcp_template(tp);

        if (optp)
            tcp_dooptions(tp, (u_char *)optp, optlen, ti);

        /* iss is nonzero only when restarting out of TIME_WAIT. */
        if (iss)
            tp->iss = iss;
        else
            tp->iss = slirp->tcp_iss;
        slirp->tcp_iss += TCP_ISSINCR/2;
        tp->irs = ti->ti_seq;
        tcp_sendseqinit(tp);
        tcp_rcvseqinit(tp);
        tp->t_flags |= TF_ACKNOW;
        tp->t_state = TCPS_SYN_RECEIVED;
        tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
        goto trimthenstep6;
    } /* case TCPS_LISTEN */

    /*
     * If the state is SYN_SENT:
     * if seg contains an ACK, but not for our SYN, drop the input.
     * if seg contains a RST, then drop the connection.
     * if seg does not contain SYN, then drop it.
     * Otherwise this is an acceptable SYN segment
     * initialize tp->rcv_nxt and tp->irs
     * if seg contains ack then advance tp->snd_una
     * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
     * arrange for segment to be acked (eventually)
     * continue processing rest of data/controls, beginning with URG
     */
    case TCPS_SYN_SENT:
        if ((tiflags & TH_ACK) &&
            (SEQ_LEQ(ti->ti_ack, tp->iss) ||
             SEQ_GT(ti->ti_ack, tp->snd_max)))
            goto dropwithreset;

        if (tiflags & TH_RST) {
            if (tiflags & TH_ACK) {
                tcp_drop(tp, 0); /* XXX Check t_softerror! */
            }
            goto drop;
        }

        if ((tiflags & TH_SYN) == 0)
            goto drop;
        if (tiflags & TH_ACK) {
            tp->snd_una = ti->ti_ack;
            if (SEQ_LT(tp->snd_nxt, tp->snd_una))
                tp->snd_nxt = tp->snd_una;
        }

        tp->t_timer[TCPT_REXMT] = 0;
        tp->irs = ti->ti_seq;
        tcp_rcvseqinit(tp);
        tp->t_flags |= TF_ACKNOW;
        if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
            soisfconnected(so);
            tp->t_state = TCPS_ESTABLISHED;

            /* Flush any data queued on the reassembly list. */
            (void) tcp_reass(tp, (struct tcpiphdr *)0,
                             (struct mbuf *)0);
            /*
             * if we didn't have to retransmit the SYN,
             * use its rtt as our initial srtt & rtt var.
             */
            if (tp->t_rtt)
                tcp_xmit_timer(tp, tp->t_rtt);
        } else
            tp->t_state = TCPS_SYN_RECEIVED;

trimthenstep6:
        /*
         * Advance ti->ti_seq to correspond to first data byte.
         * If data, trim to stay within window,
         * dropping FIN if necessary.
         */
        ti->ti_seq++;
        if (ti->ti_len > tp->rcv_wnd) {
            todrop = ti->ti_len - tp->rcv_wnd;
            m_adj(m, -todrop);
            ti->ti_len = tp->rcv_wnd;
            tiflags &= ~TH_FIN;
        }
        tp->snd_wl1 = ti->ti_seq - 1;
        tp->rcv_up = ti->ti_seq;
        goto step6;
    } /* switch tp->t_state */

    /*
     * States other than LISTEN or SYN_SENT.
     * Check that at least some bytes of segment are within
     * receive window. If segment begins before rcv_nxt,
     * drop leading data (and SYN); if nothing left, just ack.
     */
    todrop = tp->rcv_nxt - ti->ti_seq;
    if (todrop > 0) {
        if (tiflags & TH_SYN) {
            /* SYN occupies one sequence number; account for it. */
            tiflags &= ~TH_SYN;
            ti->ti_seq++;
            if (ti->ti_urp > 1)
                ti->ti_urp--;
            else
                tiflags &= ~TH_URG;
            todrop--;
        }
        /*
         * Following if statement from Stevens, vol. 2, p. 960.
         */
        if (todrop > ti->ti_len
            || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) {
            /*
             * Any valid FIN must be to the left of the window.
             * At this point the FIN must be a duplicate or out
             * of sequence; drop it.
             */
            tiflags &= ~TH_FIN;
            /*
             * Send an ACK to resynchronize and drop any data.
             * But keep on processing for RST or ACK.
             */
            tp->t_flags |= TF_ACKNOW;
            todrop = ti->ti_len;
        }
        m_adj(m, todrop);
        ti->ti_seq += todrop;
        ti->ti_len -= todrop;
        if (ti->ti_urp > todrop)
            ti->ti_urp -= todrop;
        else {
            tiflags &= ~TH_URG;
            ti->ti_urp = 0;
        }
    }
    /*
     * If new data are received on a connection after the
     * user processes are gone, then RST the other end.
     */
    if ((so->so_state & SS_NOFDREF) &&
        tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
        tp = tcp_close(tp);
        goto dropwithreset;
    }

    /*
     * If segment ends after window, drop trailing data
     * (and PUSH and FIN); if nothing left, just ACK.
     */
    todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
    if (todrop > 0) {
        if (todrop >= ti->ti_len) {
            /*
             * If a new connection request is received
             * while in TIME_WAIT, drop the old connection
             * and start over if the sequence numbers
             * are above the previous ones.
             */
            if (tiflags & TH_SYN &&
                tp->t_state == TCPS_TIME_WAIT &&
                SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
                iss = tp->rcv_nxt + TCP_ISSINCR;
                tp = tcp_close(tp);
                goto findso;
            }
            /*
             * If window is closed can only take segments at
             * window edge, and have to drop data and PUSH from
             * incoming segments. Continue processing, but
             * remember to ack. Otherwise, drop segment
             * and ack.
             */
            if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
                tp->t_flags |= TF_ACKNOW;
            } else {
                goto dropafterack;
            }
        }
        m_adj(m, -todrop);
        ti->ti_len -= todrop;
        tiflags &= ~(TH_PUSH|TH_FIN);
    }

    /*
     * If the RST bit is set examine the state:
     * SYN_RECEIVED STATE:
     * If passive open, return to LISTEN state.
     * If active open, inform user that connection was refused.
     * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
     * Inform user that connection was reset, and close tcb.
     * CLOSING, LAST_ACK, TIME_WAIT STATES
     * Close the tcb.
     */
    if (tiflags&TH_RST) switch (tp->t_state) {

    case TCPS_SYN_RECEIVED:
    case TCPS_ESTABLISHED:
    case TCPS_FIN_WAIT_1:
    case TCPS_FIN_WAIT_2:
    case TCPS_CLOSE_WAIT:
        tp->t_state = TCPS_CLOSED;
        tcp_close(tp);
        goto drop;

    case TCPS_CLOSING:
    case TCPS_LAST_ACK:
    case TCPS_TIME_WAIT:
        tcp_close(tp);
        goto drop;
    }

    /*
     * If a SYN is in the window, then this is an
     * error and we send an RST and drop the connection.
     */
    if (tiflags & TH_SYN) {
        tp = tcp_drop(tp,0);
        goto dropwithreset;
    }

    /*
     * If the ACK bit is off we drop the segment and return.
     */
    if ((tiflags & TH_ACK) == 0) goto drop;

    /*
     * Ack processing.
     */
    switch (tp->t_state) {
    /*
     * In SYN_RECEIVED state if the ack ACKs our SYN then enter
     * ESTABLISHED state and continue processing, otherwise
     * send an RST. una<=ack<=max
     */
    case TCPS_SYN_RECEIVED:
        if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
            SEQ_GT(ti->ti_ack, tp->snd_max))
            goto dropwithreset;
        tp->t_state = TCPS_ESTABLISHED;
        /*
         * The sent SYN is ack'ed with our sequence number +1
         * The first data byte already in the buffer will get
         * lost if no correction is made. This is only needed for
         * SS_CTL since the buffer is empty otherwise.
         * tp->snd_una++; or:
         */
        tp->snd_una=ti->ti_ack;
        if (so->so_state & SS_CTL) {
            /* So tcp_ctl reports the right state */
            ret = tcp_ctl(so);
            if (ret == 1) {
                soisfconnected(so);
                so->so_state &= ~SS_CTL; /* success XXX */
            } else if (ret == 2) {
                so->so_state &= SS_PERSISTENT_MASK;
                so->so_state |= SS_NOFDREF; /* CTL_CMD */
            } else {
                needoutput = 1;
                tp->t_state = TCPS_FIN_WAIT_1;
            }
        } else {
            soisfconnected(so);
        }

        (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0);
        tp->snd_wl1 = ti->ti_seq - 1;
        /* Avoid ack processing; snd_una==ti_ack => dup ack */
        goto synrx_to_est;
        /* fall into ... */

    /*
     * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
     * ACKs. If the ack is in the range
     * tp->snd_una < ti->ti_ack <= tp->snd_max
     * then advance tp->snd_una to ti->ti_ack and drop
     * data from the retransmission queue. If this ACK reflects
     * more up to date window information we update our window information.
     */
    case TCPS_ESTABLISHED:
    case TCPS_FIN_WAIT_1:
    case TCPS_FIN_WAIT_2:
    case TCPS_CLOSE_WAIT:
    case TCPS_CLOSING:
    case TCPS_LAST_ACK:
    case TCPS_TIME_WAIT:
        if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
            if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
                DEBUG_MISC((dfd, " dup ack m = %lx so = %lx\n",
                            (long )m, (long )so));
                /*
                 * If we have outstanding data (other than
                 * a window probe), this is a completely
                 * duplicate ack (ie, window info didn't
                 * change), the ack is the biggest we've
                 * seen and we've seen exactly our rexmt
                 * threshold of them, assume a packet
                 * has been dropped and retransmit it.
                 * Kludge snd_nxt & the congestion
                 * window so we send only this one
                 * packet.
                 *
                 * We know we're losing at the current
                 * window size so do congestion avoidance
                 * (set ssthresh to half the current window
                 * and pull our congestion window back to
                 * the new ssthresh).
                 *
                 * Dup acks mean that packets have left the
                 * network (they're now cached at the receiver)
                 * so bump cwnd by the amount in the receiver
                 * to keep a constant cwnd packets in the
                 * network.
                 */
                if (tp->t_timer[TCPT_REXMT] == 0 ||
                    ti->ti_ack != tp->snd_una)
                    tp->t_dupacks = 0;
                else if (++tp->t_dupacks == TCPREXMTTHRESH) {
                    /* Fast retransmit (classic Reno behavior). */
                    tcp_seq onxt = tp->snd_nxt;
                    u_int win =
                        min(tp->snd_wnd, tp->snd_cwnd) / 2 /
                            tp->t_maxseg;

                    if (win < 2)
                        win = 2;
                    tp->snd_ssthresh = win * tp->t_maxseg;
                    tp->t_timer[TCPT_REXMT] = 0;
                    tp->t_rtt = 0;
                    tp->snd_nxt = ti->ti_ack;
                    tp->snd_cwnd = tp->t_maxseg;
                    (void) tcp_output(tp);
                    tp->snd_cwnd = tp->snd_ssthresh +
                        tp->t_maxseg * tp->t_dupacks;
                    if (SEQ_GT(onxt, tp->snd_nxt))
                        tp->snd_nxt = onxt;
                    goto drop;
                } else if (tp->t_dupacks > TCPREXMTTHRESH) {
                    /* Fast recovery: inflate cwnd per extra dup ack. */
                    tp->snd_cwnd += tp->t_maxseg;
                    (void) tcp_output(tp);
                    goto drop;
                }
            } else
                tp->t_dupacks = 0;
            break;
        }
    synrx_to_est:
        /*
         * If the congestion window was inflated to account
         * for the other side's cached packets, retract it.
         */
        if (tp->t_dupacks > TCPREXMTTHRESH &&
            tp->snd_cwnd > tp->snd_ssthresh)
            tp->snd_cwnd = tp->snd_ssthresh;
        tp->t_dupacks = 0;
        if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
            goto dropafterack;
        }
        acked = ti->ti_ack - tp->snd_una;

        /*
         * If transmit timer is running and timed sequence
         * number was acked, update smoothed round trip time.
         * Since we now have an rtt measurement, cancel the
         * timer backoff (cf., Phil Karn's retransmit alg.).
         * Recompute the initial retransmit timer.
         */
        if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
            tcp_xmit_timer(tp,tp->t_rtt);

        /*
         * If all outstanding data is acked, stop retransmit
         * timer and remember to restart (more output or persist).
         * If there is more data to be acked, restart retransmit
         * timer, using current (possibly backed-off) value.
         */
        if (ti->ti_ack == tp->snd_max) {
            tp->t_timer[TCPT_REXMT] = 0;
            needoutput = 1;
        } else if (tp->t_timer[TCPT_PERSIST] == 0)
            tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
        /*
         * When new data is acked, open the congestion window.
         * If the window gives us less than ssthresh packets
         * in flight, open exponentially (maxseg per packet).
         * Otherwise open linearly: maxseg per window
         * (maxseg^2 / cwnd per packet).
         */
        {
            register u_int cw = tp->snd_cwnd;
            register u_int incr = tp->t_maxseg;

            if (cw > tp->snd_ssthresh)
                incr = incr * incr / cw;
            tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
        }
        if (acked > so->so_snd.sb_cc) {
            /* The ACK covers our FIN as well as all buffered data. */
            tp->snd_wnd -= so->so_snd.sb_cc;
            sbdrop(&so->so_snd, (int )so->so_snd.sb_cc);
            ourfinisacked = 1;
        } else {
            sbdrop(&so->so_snd, acked);
            tp->snd_wnd -= acked;
            ourfinisacked = 0;
        }
        tp->snd_una = ti->ti_ack;
        if (SEQ_LT(tp->snd_nxt, tp->snd_una))
            tp->snd_nxt = tp->snd_una;

        switch (tp->t_state) {

        /*
         * In FIN_WAIT_1 STATE in addition to the processing
         * for the ESTABLISHED state if our FIN is now acknowledged
         * then enter FIN_WAIT_2.
         */
        case TCPS_FIN_WAIT_1:
            if (ourfinisacked) {
                /*
                 * If we can't receive any more
                 * data, then closing user can proceed.
                 * Starting the timer is contrary to the
                 * specification, but if we don't get a FIN
                 * we'll hang forever.
                 */
                if (so->so_state & SS_FCANTRCVMORE) {
                    tp->t_timer[TCPT_2MSL] = TCP_MAXIDLE;
                }
                tp->t_state = TCPS_FIN_WAIT_2;
            }
            break;

        /*
         * In CLOSING STATE in addition to the processing for
         * the ESTABLISHED state if the ACK acknowledges our FIN
         * then enter the TIME-WAIT state, otherwise ignore
         * the segment.
         */
        case TCPS_CLOSING:
            if (ourfinisacked) {
                tp->t_state = TCPS_TIME_WAIT;
                tcp_canceltimers(tp);
                tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
            }
            break;

        /*
         * In LAST_ACK, we may still be waiting for data to drain
         * and/or to be acked, as well as for the ack of our FIN.
         * If our FIN is now acknowledged, delete the TCB,
         * enter the closed state and return.
         */
        case TCPS_LAST_ACK:
            if (ourfinisacked) {
                tcp_close(tp);
                goto drop;
            }
            break;

        /*
         * In TIME_WAIT state the only thing that should arrive
         * is a retransmission of the remote FIN. Acknowledge
         * it and restart the finack timer.
         */
        case TCPS_TIME_WAIT:
            tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
            goto dropafterack;
        }
    } /* switch(tp->t_state) */

step6:
    /*
     * Update window information.
     * Don't look at window if no ACK: TAC's send garbage on first SYN.
     */
    if ((tiflags & TH_ACK) &&
        (SEQ_LT(tp->snd_wl1, ti->ti_seq) ||
         (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) ||
          (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) {
        tp->snd_wnd = tiwin;
        tp->snd_wl1 = ti->ti_seq;
        tp->snd_wl2 = ti->ti_ack;
        if (tp->snd_wnd > tp->max_sndwnd)
            tp->max_sndwnd = tp->snd_wnd;
        needoutput = 1;
    }

    /*
     * Process segments with URG.
     */
    if ((tiflags & TH_URG) && ti->ti_urp &&
        TCPS_HAVERCVDFIN(tp->t_state) == 0) {
        /*
         * This is a kludge, but if we receive and accept
         * random urgent pointers, we'll crash in
         * soreceive. It's hard to imagine someone
         * actually wanting to send this much urgent data.
         */
        if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen) {
            ti->ti_urp = 0;
            tiflags &= ~TH_URG;
            goto dodata;
        }
        /*
         * If this segment advances the known urgent pointer,
         * then mark the data stream. This should not happen
         * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
         * a FIN has been received from the remote side.
         * In these states we ignore the URG.
         *
         * According to RFC961 (Assigned Protocols),
         * the urgent pointer points to the last octet
         * of urgent data. We continue, however,
         * to consider it to indicate the first octet
         * of data past the urgent section as the original
         * spec states (in one of two places).
         */
        if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
            tp->rcv_up = ti->ti_seq + ti->ti_urp;
            so->so_urgc = so->so_rcv.sb_cc +
                (tp->rcv_up - tp->rcv_nxt); /* -1; */
            tp->rcv_up = ti->ti_seq + ti->ti_urp;
        }
    } else
        /*
         * If no out of band data is expected,
         * pull receive urgent pointer along
         * with the receive window.
         */
        if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
            tp->rcv_up = tp->rcv_nxt;
dodata:

    /*
     * If this is a small packet, then ACK now - with Nagle
     * congestion avoidance sender won't send more until
     * he gets an ACK.
     *
     * NOTE(review): first_char == 27 tests whether the first payload
     * byte is ESC (0x1b) - presumably a heuristic for interactive
     * (telnet-style) traffic; confirm against the tcpiphdr_2 layout.
     */
    if (ti->ti_len && (unsigned)ti->ti_len <= 5 &&
        ((struct tcpiphdr_2 *)ti)->first_char == (char)27) {
        tp->t_flags |= TF_ACKNOW;
    }

    /*
     * Process the segment text, merging it into the TCP sequencing queue,
     * and arranging for acknowledgment of receipt if necessary.
     * This process logically involves adjusting tp->rcv_wnd as data
     * is presented to the user (this happens in tcp_usrreq.c,
     * case PRU_RCVD). If a FIN has already been received on this
     * connection then we just ignore the text.
     */
    if ((ti->ti_len || (tiflags&TH_FIN)) &&
        TCPS_HAVERCVDFIN(tp->t_state) == 0) {
        TCP_REASS(tp, ti, m, so, tiflags);
    } else {
        m_free(m);
        tiflags &= ~TH_FIN;
    }

    /*
     * If FIN is received ACK the FIN and let the user know
     * that the connection is closing.
     */
    if (tiflags & TH_FIN) {
        if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
            /*
             * If we receive a FIN we can't send more data,
             * set it SS_FDRAIN
             * Shutdown the socket if there is no rx data in the
             * buffer.
             * soread() is called on completion of shutdown() and
             * will got to TCPS_LAST_ACK, and use tcp_output()
             * to send the FIN.
             */
            sofwdrain(so);
            tp->t_flags |= TF_ACKNOW;
            tp->rcv_nxt++;
        }
        switch (tp->t_state) {

        /*
         * In SYN_RECEIVED and ESTABLISHED STATES
         * enter the CLOSE_WAIT state.
         */
        case TCPS_SYN_RECEIVED:
        case TCPS_ESTABLISHED:
            if(so->so_emu == EMU_CTL) /* no shutdown on socket */
                tp->t_state = TCPS_LAST_ACK;
            else
                tp->t_state = TCPS_CLOSE_WAIT;
            break;

        /*
         * If still in FIN_WAIT_1 STATE FIN has not been acked so
         * enter the CLOSING state.
         */
        case TCPS_FIN_WAIT_1:
            tp->t_state = TCPS_CLOSING;
            break;

        /*
         * In FIN_WAIT_2 state enter the TIME_WAIT state,
         * starting the time-wait timer, turning off the other
         * standard timers.
         */
        case TCPS_FIN_WAIT_2:
            tp->t_state = TCPS_TIME_WAIT;
            tcp_canceltimers(tp);
            tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
            break;

        /*
         * In TIME_WAIT state restart the 2 MSL time_wait timer.
         */
        case TCPS_TIME_WAIT:
            tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
            break;
        }
    }

    /*
     * Return any desired output.
     */
    if (needoutput || (tp->t_flags & TF_ACKNOW)) {
        (void) tcp_output(tp);
    }
    return;

dropafterack:
    /*
     * Generate an ACK dropping incoming segment if it occupies
     * sequence space, where the ACK reflects our state.
     */
    if (tiflags & TH_RST)
        goto drop;
    m_free(m);
    tp->t_flags |= TF_ACKNOW;
    (void) tcp_output(tp);
    return;

dropwithreset:
    /* reuses m if m!=NULL, m_free() unnecessary */
    if (tiflags & TH_ACK)
        tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
    else {
        if (tiflags & TH_SYN) ti->ti_len++;
        tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
                    TH_RST|TH_ACK);
    }

    return;

drop:
    /*
     * Drop space held by incoming segment and return.
     */
    m_free(m);
}
static int tcp_lro_matching_tuple(struct ip* ip_hdr, struct tcphdr *tcp_hdr, int *hash, int *flow_id ) { struct lro_flow *flow; tcp_seq seqnum; unsigned int off = 0; int payload_len = 0; *hash = LRO_HASH(ip_hdr->ip_src.s_addr, ip_hdr->ip_dst.s_addr, tcp_hdr->th_sport, tcp_hdr->th_dport, (TCP_LRO_FLOW_MAP - 1)); *flow_id = lro_flow_map[*hash]; if (*flow_id == TCP_LRO_FLOW_NOTFOUND) { return TCP_LRO_NAN; } seqnum = tcp_hdr->th_seq; off = tcp_hdr->th_off << 2; payload_len = ip_hdr->ip_len - off; flow = &lro_flow_list[*flow_id]; if ((flow->lr_faddr.s_addr == ip_hdr->ip_src.s_addr) && (flow->lr_laddr.s_addr == ip_hdr->ip_dst.s_addr) && (flow->lr_fport == tcp_hdr->th_sport) && (flow->lr_lport == tcp_hdr->th_dport)) { if (flow->lr_tcphdr == NULL) { if (ntohl(seqnum) == flow->lr_seq) { return TCP_LRO_COALESCE; } if (lrodebug >= 4) { printf("%s: seqnum = %x, lr_seq = %x\n", __func__, ntohl(seqnum), flow->lr_seq); } lro_seq_mismatch++; if (SEQ_GT(ntohl(seqnum), flow->lr_seq)) { lro_seq_outoforder++; /* * Whenever we receive out of order packets it * signals loss and recovery and LRO doesn't * let flows recover quickly. So eject. */ flow->lr_flags |= LRO_EJECT_REQ; } return TCP_LRO_NAN; } if (flow->lr_flags & LRO_EJECT_REQ) { if (lrodebug) printf("%s: eject. \n", __func__); return TCP_LRO_EJECT_FLOW; } if (SEQ_GT(tcp_hdr->th_ack, flow->lr_tcphdr->th_ack)) { if (lrodebug) { printf("%s: th_ack = %x flow_ack = %x \n", __func__, tcp_hdr->th_ack, flow->lr_tcphdr->th_ack); } return TCP_LRO_EJECT_FLOW; } if (ntohl(seqnum) == (ntohl(lro_flow_list[*flow_id].lr_tcphdr->th_seq) + lro_flow_list[*flow_id].lr_len)) { return TCP_LRO_COALESCE; } else { /* LRO does not handle loss recovery well, eject */ flow->lr_flags |= LRO_EJECT_REQ; return TCP_LRO_EJECT_FLOW; } } if (lrodebug) printf("tcp_lro_matching_tuple: collision \n"); return TCP_LRO_COLLISION; }
/*
 * Ertt_packet_measurements uses a small amount of state kept on each packet
 * sent to match incoming acknowledgements. This enables more accurate and
 * secure round trip time measurements. The resulting measurement is used for
 * congestion control algorithms which require a more accurate time.
 * Ertt_packet_measurements is called via the helper hook in tcp_input.c
 *
 * ctx_data carries a struct tcp_hhook_data describing the segment just
 * received (tcpcb, tcphdr, parsed options); hdata is this connection's
 * struct ertt state. The queue e_t->txsegi_q is walked oldest-first,
 * matching records against the cumulative ACK (or the most recently
 * SACKed sequence number) and updating e_t->rtt/minrtt/maxrtt; consumed
 * records are freed. Always returns 0 per the helper-hook convention.
 */
static int
ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
    void *ctx_data, void *hdata, struct osd *hosd)
{
	struct ertt *e_t;
	struct tcpcb *tp;
	struct tcphdr *th;
	struct tcpopt *to;
	struct tcp_hhook_data *thdp;
	struct txseginfo *txsi;
	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
	uint32_t measurenext, rts;
	tcp_seq ack;

	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));

	e_t = (struct ertt *)hdata;
	thdp = ctx_data;
	tp = thdp->tp;
	th = thdp->th;
	to = thdp->to;
	/* Non-zero last_sack_ack means this segment SACKed new data. */
	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
	/* Bytes newly acknowledged cumulatively by this segment. */
	acked = th->th_ack - tp->snd_una;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* Packet has provided new acknowledgements. */
	if (acked > 0 || new_sacked_bytes) {
		if (acked == 0 && new_sacked_bytes) {
			/* Use last sacked data. */
			ack = tp->sackhint.last_sack_ack;
		} else
			ack = th->th_ack;

		txsi = TAILQ_FIRST(&e_t->txsegi_q);
		while (txsi != NULL) {
			rts = 0;

			/* Acknowledgement is acking more than this txsi. */
			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
				if (txsi->flags & TXSI_RTT_MEASURE_START ||
				    measurenext) {
					marked_packet_rtt(txsi, e_t, tp,
					    &measurenext, &measurenext_len,
					    &rtt_bytes_adjust, MULTI_ACK);
				}
				/* Fully covered record: free and advance. */
				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
				uma_zfree(txseginfo_zone, txsi);
				txsi = TAILQ_FIRST(&e_t->txsegi_q);
				continue;
			}

			/*
			 * Guess if delayed acks are being used by the receiver.
			 *
			 * XXXDH: A simple heuristic that could be improved
			 */
			if (!new_sacked_bytes) {
				if (acked > tp->t_maxseg) {
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
					multiack = 1;
				} else if (acked > txsi->len) {
					multiack = 1;
					e_t->dlyack_rx +=
					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
					    1 : 0;
				} else if (acked == tp->t_maxseg ||
				    acked == txsi->len) {
					e_t->dlyack_rx -=
					    (e_t->dlyack_rx > 0) ? 1 : 0;
				}
				/* Otherwise leave dlyack_rx the way it was. */
			}

			/*
			 * Time stamps are only to help match the txsi with the
			 * received acknowledgements.
			 */
			if (e_t->timestamp_errors < MAX_TS_ERR &&
			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
				/*
				 * Note: All packets sent with the offload will
				 * have the same time stamp. If we are sending
				 * on a fast interface and the t_maxseg is much
				 * smaller than one tick, this will be fine. The
				 * time stamp would be the same whether we were
				 * using tso or not. However, if the interface
				 * is slow, this will cause problems with the
				 * calculations. If the interface is slow, there
				 * is not reason to be using tso, and it should
				 * be turned off.
				 */
				/*
				 * If there are too many time stamp errors, time
				 * stamps won't be trusted
				 */
				rts = to->to_tsecr;
				/* Before this packet. */
				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
					/* When delayed acking is used, the
					 * reflected time stamp is of the first
					 * packet and thus may be before
					 * txsi->tx_ts. */
					break;
				if (TSTMP_GT(rts, txsi->tx_ts)) {
					/*
					 * If reflected time stamp is later than
					 * tx_tsi, then this txsi is old.
					 */
					if (txsi->flags & TXSI_RTT_MEASURE_START ||
					    measurenext) {
						marked_packet_rtt(txsi, e_t, tp,
						    &measurenext, &measurenext_len,
						    &rtt_bytes_adjust, OLD_TXSI);
					}
					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
					    txsegi_lnk);
					uma_zfree(txseginfo_zone, txsi);
					txsi = TAILQ_FIRST(&e_t->txsegi_q);
					continue;
				}
				if (rts == txsi->tx_ts &&
				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
					/*
					 * Segment received before sent!
					 * Something is wrong with the received
					 * timestamps so increment errors. If
					 * this keeps up we will ignore
					 * timestamps.
					 */
					e_t->timestamp_errors++;
				}
			}

			/*
			 * Acknowledging a sequence number before this txsi.
			 * If it is an old txsi that may have had the same seq
			 * numbers, it should have been removed if time stamps
			 * are being used.
			 */
			if (SEQ_LEQ(ack, txsi->seq))
				break; /* Before first packet in txsi. */

			/*
			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
			 * past this point.
			 *
			 * If delayed acks are being used, an acknowledgement
			 * for a single segment will have been delayed by the
			 * receiver and will yield an inaccurate measurement. In
			 * this case, we only make the measurement if more than
			 * one segment is being acknowledged or sack is
			 * currently being used.
			 */
			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
				/* Make an accurate new measurement. */
				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;

				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
					e_t->minrtt = e_t->rtt;

				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
					e_t->maxrtt = e_t->rtt;
			}

			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
				marked_packet_rtt(txsi, e_t, tp,
				    &measurenext, &measurenext_len,
				    &rtt_bytes_adjust, CORRECT_ACK);

			if (txsi->flags & TXSI_TSO) {
				/* Partially acked TSO record: shrink in place. */
				if (txsi->len > acked) {
					txsi->len -= acked;
					/*
					 * This presumes ack for first bytes in
					 * txsi, this may not be true but it
					 * shouldn't cause problems for the
					 * timing.
					 *
					 * We remeasure RTT even though we only
					 * have a single txsi. The rationale
					 * behind this is that it is better to
					 * have a slightly inaccurate
					 * measurement than no additional
					 * measurement for the rest of the bulk
					 * transfer. Since TSO is only used on
					 * high speed interface cards, so the
					 * packets should be transmitted at line
					 * rate back to back with little
					 * difference in transmission times (in
					 * ticks).
					 */
					txsi->seq += acked;
					/*
					 * Reset txsi measure flag so we don't
					 * use it for another RTT measurement.
					 */
					txsi->flags &= ~TXSI_RTT_MEASURE_START;
					/*
					 * There is still more data to be acked
					 * from tso bulk transmission, so we
					 * won't remove it from the TAILQ yet.
					 */
					break;
				}
				txsi->len = 0;
			}

			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
			uma_zfree(txseginfo_zone, txsi);
			break;
		}

		if (measurenext) {
			/*
			 * We need to do a RTT measurement. It won't be the best
			 * if we do it here.
			 */
			marked_packet_rtt(txsi, e_t, tp,
			    &measurenext, &measurenext_len,
			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
		}
	}
	return (0);
}
/*
 * Called upon receipt of new valid data (while not in header prediction
 * mode) to rebuild the ordered list of SACK blocks reported to the peer.
 *
 * Per RFC 2018, the first reported block MUST be the most recent one and
 * the remaining blocks SHOULD stay in the order in which they arrived at
 * the receiver; that ordering is maintained here.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
	struct sackblk merged, kept[MAX_SACK_BLKS];
	int head_cnt, kept_cnt, idx;

	/* SACK block covering the segment that just arrived. */
	merged.start = rcv_start;
	merged.end = rcv_end;

	/*
	 * Walk the existing blocks: drop stale or malformed ones, fold any
	 * block that overlaps/abuts the new segment into 'merged', and stash
	 * the untouched rest into kept[] preserving their arrival order.
	 */
	kept_cnt = 0;
	for (idx = 0; idx < tp->rcv_numsacks; idx++) {
		tcp_seq blk_start = tp->sackblks[idx].start;
		tcp_seq blk_end = tp->sackblks[idx].end;

		if (SEQ_GEQ(blk_start, blk_end) ||
		    SEQ_LEQ(blk_start, tp->rcv_nxt)) {
			/* Empty/invalid, or already covered by rcv_nxt. */
			continue;
		}
		if (SEQ_LEQ(merged.start, blk_end) &&
		    SEQ_GEQ(merged.end, blk_start)) {
			/* Touches the new block: absorb it into merged. */
			if (SEQ_GT(merged.start, blk_start))
				merged.start = blk_start;
			if (SEQ_LT(merged.end, blk_end))
				merged.end = blk_end;
			continue;
		}
		/* Disjoint from the new block: keep it unchanged. */
		kept[kept_cnt].start = blk_start;
		kept[kept_cnt].end = blk_end;
		kept_cnt++;
	}

	/*
	 * The merged block goes to the front only if the received segment
	 * was out of order, i.e. it left a hole below it.
	 */
	head_cnt = 0;
	if (SEQ_GT(merged.start, tp->rcv_nxt)) {
		tp->sackblks[0] = merged;
		head_cnt = 1;
		/* List would overflow: drop the last (oldest) kept block. */
		if (kept_cnt >= MAX_SACK_BLKS)
			kept_cnt--;
	}
	if (kept_cnt > 0) {
		/* Copy the preserved blocks back after the head block. */
		bcopy(kept, &tp->sackblks[head_cnt],
		    sizeof(struct sackblk) * kept_cnt);
	}

	/* Record the resulting number of SACK blocks. */
	tp->rcv_numsacks = head_cnt + kept_cnt;

	/*
	 * If we are requesting SACK recovery, reset the stretch-ack state
	 * so that connection will generate more acks after recovery and
	 * sender's cwnd will open.
	 */
	if ((tp->t_flags & TF_STRETCHACK) != 0 && tp->rcv_numsacks > 0)
		tcp_reset_stretch_ack(tp);

#if TRAFFIC_MGT
	if (tp->acc_iaj > 0 && tp->rcv_numsacks > 0)
		reset_acc_iaj(tp);
#endif /* TRAFFIC_MGT */
}