/*
 * After a timeout, the SACK list may be rebuilt.  This SACK information
 * should be used to avoid retransmitting SACKed data.  This function
 * traverses the SACK list to see if snd_nxt should be moved forward.
 */
void
tcp_sack_adjust(struct tcpcb *tp)
{
    struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);

//ScenSim-Port//    INP_WLOCK_ASSERT(tp->t_inpcb);
    if (cur == NULL)
        return; /* No holes */
    if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack))
        return; /* We're already beyond any SACKed blocks */
    /*-
     * Two cases for which we want to advance snd_nxt:
     * i) snd_nxt lies between end of one hole and beginning of another
     * ii) snd_nxt lies between end of last hole and snd_fack
     */
    while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
        if (SEQ_LT(tp->snd_nxt, cur->end))
            return;
        if (SEQ_GEQ(tp->snd_nxt, p->start))
            cur = p;
        else {
            tp->snd_nxt = p->start;
            return;
        }
    }
    if (SEQ_LT(tp->snd_nxt, cur->end))
        return;
    tp->snd_nxt = tp->snd_fack;
}
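/*
 * For reference: the two-argument SEQ_* comparisons used by the TCP
 * routines in this file are the standard modular 32-bit sequence-space
 * macros in the style of BSD <netinet/tcp_seq.h>.  A minimal sketch;
 * the signed subtraction makes the tests wrap-safe as long as the two
 * values are within 2^31 of each other.
 */
#define SEQ_LT(a, b)    ((int)((a) - (b)) < 0)
#define SEQ_LEQ(a, b)   ((int)((a) - (b)) <= 0)
#define SEQ_GT(a, b)    ((int)((a) - (b)) > 0)
#define SEQ_GEQ(a, b)   ((int)((a) - (b)) >= 0)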
static void
dctcp_ack_received(struct cc_var *ccv, uint16_t type)
{
    struct dctcp *dctcp_data;
    int bytes_acked = 0;

    dctcp_data = ccv->cc_data;

    if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
        /*
         * DCTCP doesn't treat receipt of ECN marked packet as a
         * congestion event. Thus, DCTCP always executes the ACK
         * processing out of congestion recovery.
         */
        if (IN_CONGRECOVERY(CCV(ccv, t_flags))) {
            EXIT_CONGRECOVERY(CCV(ccv, t_flags));
            newreno_cc_algo.ack_received(ccv, type);
            ENTER_CONGRECOVERY(CCV(ccv, t_flags));
        } else
            newreno_cc_algo.ack_received(ccv, type);

        if (type == CC_DUPACK)
            bytes_acked = CCV(ccv, t_maxseg);

        if (type == CC_ACK)
            bytes_acked = ccv->bytes_this_ack;

        /* Update total bytes. */
        dctcp_data->bytes_total += bytes_acked;

        /* Update total marked bytes. */
        if (dctcp_data->ece_curr) {
            if (!dctcp_data->ece_prev
                && bytes_acked > CCV(ccv, t_maxseg)) {
                dctcp_data->bytes_ecn +=
                    (bytes_acked - CCV(ccv, t_maxseg));
            } else
                dctcp_data->bytes_ecn += bytes_acked;
            dctcp_data->ece_prev = 1;
        } else {
            if (dctcp_data->ece_prev
                && bytes_acked > CCV(ccv, t_maxseg))
                dctcp_data->bytes_ecn += CCV(ccv, t_maxseg);
            dctcp_data->ece_prev = 0;
        }
        dctcp_data->ece_curr = 0;

        /*
         * Update the fraction of marked bytes at the end of
         * current window size.
         */
        if ((IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
            SEQ_GEQ(ccv->curack, CCV(ccv, snd_recover))) ||
            (!IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
            SEQ_GT(ccv->curack, dctcp_data->save_sndnxt)))
            dctcp_update_alpha(ccv);
    } else
        newreno_cc_algo.ack_received(ccv, type);
}
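/*
 * dctcp_update_alpha() is not shown in this chunk.  A minimal sketch of
 * the end-of-window update it performs, per the DCTCP algorithm:
 * alpha = (1 - g) * alpha + g * F, with F the fraction of ECN-marked
 * bytes in the last window.  The fixed-point scale of 1024, the gain
 * g = 1/16 (shift of 4), and the 'alpha' field are assumptions here.
 */
#define DCTCP_MAX_ALPHA  1024   /* alpha fixed-point scale (assumed) */
#define DCTCP_SHIFT_G    4      /* g = 1/16 (assumed) */

static void
dctcp_update_alpha_sketch(struct dctcp *dctcp_data)
{
    uint32_t alpha = dctcp_data->alpha;     /* 'alpha' field assumed */
    uint32_t F = 0;

    /* F, scaled by 1024: fraction of marked bytes in the last window. */
    if (dctcp_data->bytes_total > 0)
        F = (dctcp_data->bytes_ecn << 10) / dctcp_data->bytes_total;

    /* alpha = (1 - g) * alpha + g * F, in fixed point, clamped. */
    alpha = alpha - (alpha >> DCTCP_SHIFT_G) + (F >> DCTCP_SHIFT_G);
    if (alpha > DCTCP_MAX_ALPHA)
        alpha = DCTCP_MAX_ALPHA;
    dctcp_data->alpha = alpha;

    /* Start a new observation window. */
    dctcp_data->bytes_ecn = 0;
    dctcp_data->bytes_total = 0;
}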
/*
 * To remove a SACK block.
 *
 * Parameters:
 *	sack_blk_t *head: pointer to the array of SACK blks.
 *	tcp_seq end: remove all SACK blks with seq num less than end.
 *	int32_t *num: (referenced) total num of SACK blks in the array.
 */
void
tcp_sack_remove(sack_blk_t *head, tcp_seq end, int32_t *num)
{
    sack_blk_t tmp[MAX_SACK_BLK];
    int32_t i, j, old_num, new_num;

    if (*num == 0)
        return;

    old_num = *num;
    new_num = old_num;
    j = 0;
    /* Walk thru the whole list and copy the new list to tmp[]. */
    for (i = 0; i < old_num; i++) {
        if (SEQ_GT(end, head[i].begin)) {
            /*
             * Check to see if the old SACK blk needs to be
             * removed or updated.  If the old blk is just
             * partially covered, update begin and continue.
             * If the old blk is completely covered, remove it
             * and continue to check.
             */
            if (SEQ_GEQ(end, head[i].end)) {
                new_num--;
                continue;
            } else {
                tmp[j].begin = end;
                tmp[j].end = head[i].end;
            }
        } else {
            tmp[j].begin = head[i].begin;
            tmp[j].end = head[i].end;
        }
        j++;
    }
    /* Copy tmp[] back to the original list. */
    for (i = 0; i < new_num; i++) {
        head[i].begin = tmp[i].begin;
        head[i].end = tmp[i].end;
    }
    *num = new_num;
}
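/*
 * Hedged usage sketch: a receiver calls tcp_sack_remove() when its
 * cumulative point advances, dropping SACK blks that the new cumulative
 * ACK already covers.  The tcp_t field names below follow the illumos
 * layout and are assumptions here.
 */
if (tcp->tcp_num_sack_blk > 0) {
    tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt,
        &tcp->tcp_num_sack_blk);
}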
/**
 *  \brief Update stream with SACK records from a TCP packet.
 *
 *  \param stream The stream to update.
 *  \param p packet to get the SACK records from
 *
 *  \retval -1 error
 *  \retval 0 ok
 */
int StreamTcpSackUpdatePacket(TcpStream *stream, Packet *p)
{
    int records = TCP_GET_SACK_CNT(p);
    int record = 0;

    TCPOptSackRecord *sack_rec = (TCPOptSackRecord *)(TCP_GET_SACK_PTR(p));

    for (record = 0; record < records; record++) {
        SCLogDebug("%p last_ack %u, left edge %u, right edge %u", sack_rec,
            stream->last_ack, ntohl(sack_rec->le), ntohl(sack_rec->re));

        if (SEQ_LEQ(ntohl(sack_rec->re), stream->last_ack)) {
            SCLogDebug("record before last_ack");
            goto next;
        }

        /** \todo need a metric to check for a right edge limit */
/*
        if (SEQ_GT(ntohl(sack_rec->re), stream->next_seq)) {
            SCLogDebug("record beyond next_seq %u", stream->next_seq);
            goto next;
        }
*/

        if (SEQ_GEQ(ntohl(sack_rec->le), ntohl(sack_rec->re))) {
            SCLogDebug("invalid record: le >= re");
            goto next;
        }

        if (StreamTcpSackInsertRange(stream, ntohl(sack_rec->le),
                    ntohl(sack_rec->re)) == -1)
        {
            SCReturnInt(-1);
        }

    next:
        sack_rec++;
    }
#ifdef DEBUG
    StreamTcpSackPrintList(stream);
#endif
    SCReturnInt(0);
}
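/*
 * Sketch of the record layout implied by the cast and the ntohl() calls
 * above: each on-wire SACK record is a pair of 32-bit sequence edges in
 * network byte order (the packing attribute is an assumption).
 */
typedef struct TCPOptSackRecord_ {
    uint32_t le;    /* left edge, network order */
    uint32_t re;    /* right edge, network order */
} __attribute__((__packed__)) TCPOptSackRecord;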
/*
 * USTAT PDU / SOS_READY Processor
 *
 * Arguments:
 *    sop    pointer to sscop connection block
 *    m      pointer to PDU buffer (without trailer)
 *    trlr   pointer to PDU trailer
 *
 * Returns:
 *    none
 *
 */
void
sscop_ustat_ready(struct sscop *sop, KBuffer *m, caddr_t trlr)
{
    struct ustat_pdu *up = (struct ustat_pdu *)trlr;
    struct pdu_hdr *php;
    sscop_seq seq1, seq2;

    up->ustat_nmr = ntohl(up->ustat_nmr);
    up->ustat_nr = ntohl(up->ustat_nr);

    /*
     * Validate peer's current receive data sequence number
     */
    if (SEQ_GT(sop->so_ack, up->ustat_nr, sop->so_ack) ||
        SEQ_GEQ(up->ustat_nr, sop->so_send, sop->so_ack)) {
        /*
         * Bad data sequence number
         */
        goto goterr;
    }

    /*
     * Free acknowledged PDUs
     */
    for (seq1 = sop->so_ack, SEQ_SET(seq2, up->ustat_nr);
         SEQ_LT(seq1, seq2, sop->so_ack);
         SEQ_INCR(seq1, 1)) {
        sscop_pack_free(sop, seq1);
    }

    /*
     * Update transmit state variables
     */
    sop->so_ack = seq2;
    SEQ_SET(sop->so_sendmax, up->ustat_nmr);

    /*
     * Get USTAT list elements
     */
    SEQ_SET(seq1, ntohl(up->ustat_le1));
    SEQ_SET(seq2, ntohl(up->ustat_le2));

    /*
     * Validate elements
     */
    if (SEQ_GT(sop->so_ack, seq1, sop->so_ack) ||
        SEQ_GEQ(seq1, seq2, sop->so_ack) ||
        SEQ_GEQ(seq2, sop->so_send, sop->so_ack)) {
        /*
         * Bad element sequence number
         */
        goto goterr;
    }

    /*
     * Process each missing sequence number in this gap
     */
    while (SEQ_LT(seq1, seq2, sop->so_ack)) {
        /*
         * Find corresponding SD PDU on pending ack queue
         */
        php = sscop_pack_locate(sop, seq1);
        if (php == NULL) {
            goto goterr;
        }

        /*
         * Retransmit this SD PDU if it's not
         * already scheduled for retransmission.
         */
        if ((php->ph_rexmit_lk == NULL) &&
            (sop->so_rexmit_tl != php)) {
            /*
             * Put PDU on retransmit queue and schedule
             * transmit servicing
             */
            sscop_rexmit_insert(sop, php);
            sop->so_flags |= SOF_XMITSRVC;
        }

        /*
         * Bump to next sequence number
         */
        SEQ_INCR(seq1, 1);
    }

    /*
     * Report retransmitted PDUs
     */
    sscop_maa_error(sop, 'V');

    /*
     * Free PDU buffer chain
     */
    KB_FREEALL(m);

    /*
     * See if transmit queues need servicing
     */
    if (sop->so_flags & SOF_XMITSRVC)
        sscop_service_xmit(sop);

    return;

goterr:
    /*
     * Protocol/parameter error encountered
     */
    sscop_maa_error(sop, 'T');

    /*
     * Free PDU buffer chain
     */
    KB_FREEALL(m);

    if (sop->so_vers == SSCOP_VERS_QSAAL)
        /*
         * Reestablish a new connection
         */
        qsaal1_reestablish(sop);
    else
        /*
         * Initiate error recovery
         */
        q2110_error_recovery(sop);

    return;
}
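/*
 * A sketch of the three-argument comparison macros used by the SSCOP
 * routines here.  This is an assumption modeled on the HARP ATM stack:
 * Q.2110 sequence numbers are 24-bit, so values are first rebased
 * against a window base (the third argument) and compared modulo 2^24.
 */
#define SSCOP_SEQ_MOD       0x01000000              /* 2^24 (assumed) */
#define SEQ_REBASE(a, base) (((a) - (base)) & (SSCOP_SEQ_MOD - 1))
#define SEQ_LT(a, b, base)  (SEQ_REBASE(a, base) < SEQ_REBASE(b, base))
#define SEQ_GT(a, b, base)  (SEQ_REBASE(a, base) > SEQ_REBASE(b, base))
#define SEQ_GEQ(a, b, base) (SEQ_REBASE(a, base) >= SEQ_REBASE(b, base))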
/*
 * STAT PDU / SOS_READY Processor
 *
 * Arguments:
 *    sop    pointer to sscop connection block
 *    m      pointer to PDU buffer (without trailer)
 *    trlr   pointer to PDU trailer
 *
 * Returns:
 *    none
 *
 */
void
sscop_stat_ready(struct sscop *sop, KBuffer *m, caddr_t trlr)
{
    struct stat_pdu *sp = (struct stat_pdu *)trlr;
    struct pdu_hdr *php;
    KBuffer *m0 = m;
    sscop_seq seq1, seq2, opa;
    int cnt = 0;

    sp->stat_nps = ntohl(sp->stat_nps);
    sp->stat_nmr = ntohl(sp->stat_nmr);
    sp->stat_nr = ntohl(sp->stat_nr);

    /*
     * Validate peer's received poll sequence number
     */
    if (SEQ_GT(sop->so_pollack, sp->stat_nps, sop->so_pollack) ||
        SEQ_GT(sp->stat_nps, sop->so_pollsend, sop->so_pollack)) {
        /*
         * Bad poll sequence number
         */
        sscop_maa_error(sop, 'R');
        goto goterr;
    }

    /*
     * Validate peer's current receive data sequence number
     */
    if (SEQ_GT(sop->so_ack, sp->stat_nr, sop->so_ack) ||
        SEQ_GT(sp->stat_nr, sop->so_send, sop->so_ack)) {
        /*
         * Bad data sequence number
         */
        sscop_maa_error(sop, 'S');
        goto goterr;
    }

    /*
     * Free acknowledged PDUs
     */
    for (seq1 = sop->so_ack, SEQ_SET(seq2, sp->stat_nr);
         SEQ_LT(seq1, seq2, sop->so_ack);
         SEQ_INCR(seq1, 1)) {
        sscop_pack_free(sop, seq1);
    }

    /*
     * Update transmit state variables
     */
    opa = sop->so_pollack;
    sop->so_ack = seq2;
    SEQ_SET(sop->so_pollack, sp->stat_nps);
    SEQ_SET(sop->so_sendmax, sp->stat_nmr);

    /*
     * Get first element in STAT list
     */
    while (m && (KB_LEN(m) == 0))
        m = KB_NEXT(m);
    if (m == NULL)
        goto done;
    m = sscop_stat_getelem(m, &seq1);

    /*
     * Make sure there's a second element too
     */
    if (m == NULL)
        goto done;

    /*
     * Validate first element (start of missing pdus)
     */
    if (SEQ_GT(sop->so_ack, seq1, sop->so_ack) ||
        SEQ_GEQ(seq1, sop->so_send, sop->so_ack)) {
        /*
         * Bad element sequence number
         */
        sscop_maa_error(sop, 'S');
        goto goterr;
    }

    /*
     * Loop thru all STAT elements in list
     */
    while (m) {
        /*
         * Get next even element (start of received pdus)
         */
        m = sscop_stat_getelem(m, &seq2);

        /*
         * Validate sequence number
         */
        if (SEQ_GEQ(seq1, seq2, sop->so_ack) ||
            SEQ_GT(seq2, sop->so_send, sop->so_ack)) {
            /*
             * Bad element sequence number
             */
            sscop_maa_error(sop, 'S');
            goto goterr;
        }

        /*
         * Process each missing sequence number in this gap
         */
        while (SEQ_LT(seq1, seq2, sop->so_ack)) {
            /*
             * Find corresponding SD PDU on pending ack queue
             */
            php = sscop_pack_locate(sop, seq1);
            if (php == NULL) {
                sscop_maa_error(sop, 'S');
                goto goterr;
            }

            /*
             * Retransmit this SD PDU only if it was last sent
             * during an earlier poll sequence and it's not
             * already scheduled for retransmission.
             */
            if (SEQ_LT(php->ph_nps, sp->stat_nps, opa) &&
                (php->ph_rexmit_lk == NULL) &&
                (sop->so_rexmit_tl != php)) {
                /*
                 * Put PDU on retransmit queue and schedule
                 * transmit servicing
                 */
                sscop_rexmit_insert(sop, php);
                sop->so_flags |= SOF_XMITSRVC;
                cnt++;
            }

            /*
             * Bump to next sequence number
             */
            SEQ_INCR(seq1, 1);
        }

        /*
         * Now process series of acknowledged PDUs
         *
         * Get next odd element (start of missing pdus),
         * but make sure there is one and that it's valid
         */
        if (m == NULL)
            goto done;
        m = sscop_stat_getelem(m, &seq2);
        if (SEQ_GEQ(seq1, seq2, sop->so_ack) ||
            SEQ_GT(seq2, sop->so_send, sop->so_ack)) {
            /*
             * Bad element sequence number
             */
            sscop_maa_error(sop, 'S');
            goto goterr;
        }

        /*
         * Process each acked sequence number
         */
        while (SEQ_LT(seq1, seq2, sop->so_ack)) {
            /*
             * Can we clear transmit buffers ??
             */
            if ((sop->so_flags & SOF_NOCLRBUF) == 0) {
                /*
                 * Yes, free acked buffers
                 */
                sscop_pack_free(sop, seq1);
            }

            /*
             * Bump to next sequence number
             */
            SEQ_INCR(seq1, 1);
        }
    }

done:
    /*
     * Free PDU buffer chain
     */
    KB_FREEALL(m0);

    /*
     * Report retransmitted PDUs
     */
    if (cnt)
        sscop_maa_error(sop, 'V');

    /*
     * Record transmit window closed transitions
     */
    if (SEQ_LT(sop->so_send, sop->so_sendmax, sop->so_ack)) {
        if (sop->so_flags & SOF_NOCREDIT) {
            sop->so_flags &= ~SOF_NOCREDIT;
            sscop_maa_error(sop, 'X');
        }
    } else {
        if ((sop->so_flags & SOF_NOCREDIT) == 0) {
            sop->so_flags |= SOF_NOCREDIT;
            sscop_maa_error(sop, 'W');
        }
    }

    if (sop->so_vers == SSCOP_VERS_QSAAL)
        /*
         * Restart lost poll/stat timer
         */
        sop->so_timer[SSCOP_T_NORESP] = sop->so_parm.sp_timeresp;
    else {
        /*
         * Determine new polling phase
         */
        if ((sop->so_timer[SSCOP_T_POLL] != 0) &&
            ((sop->so_flags & SOF_KEEPALIVE) == 0)) {
            /*
             * Remain in active phase - reset NO-RESPONSE timer
             */
            sop->so_timer[SSCOP_T_NORESP] = sop->so_parm.sp_timeresp;
        } else if (sop->so_timer[SSCOP_T_IDLE] == 0) {
            /*
             * Go from transient to idle phase
             */
            sop->so_timer[SSCOP_T_POLL] = 0;
            sop->so_flags &= ~SOF_KEEPALIVE;
            sop->so_timer[SSCOP_T_NORESP] = 0;
            sop->so_timer[SSCOP_T_IDLE] = sop->so_parm.sp_timeidle;
        }
    }

    /*
     * See if transmit queues need servicing
     */
    if (sop->so_flags & SOF_XMITSRVC)
        sscop_service_xmit(sop);

    return;

goterr:
    /*
     * Protocol/parameter error encountered
     */

    /*
     * Free PDU buffer chain
     */
    KB_FREEALL(m0);

    if (sop->so_vers == SSCOP_VERS_QSAAL)
        /*
         * Reestablish a new connection
         */
        qsaal1_reestablish(sop);
    else
        /*
         * Initiate error recovery
         */
        q2110_error_recovery(sop);

    return;
}
/*
 * When a new ack with SACK is received, check if it indicates packet
 * reordering.  If there is packet reordering, the socket is marked and
 * the late time offset by which the packet was reordered with
 * respect to its closest neighboring packets is computed.
 */
static void
tcp_sack_detect_reordering(struct tcpcb *tp, struct sackhole *s,
    tcp_seq sacked_seq, tcp_seq snd_fack)
{
    int32_t rext = 0, reordered = 0;

    /*
     * If the SACK hole is past snd_fack, this is from new SACK
     * information, so we can ignore it.
     */
    if (SEQ_GT(s->end, snd_fack))
        return;
    /*
     * If there has been a retransmit timeout, then the timestamp on
     * the SACK segment will be newer. This might lead to a
     * false-positive. Avoid re-ordering detection in this case.
     */
    if (tp->t_rxtshift > 0)
        return;

    /*
     * Detect reordering from SACK information by checking
     * if recently sacked data was never retransmitted from this hole.
     */
    if (SEQ_LT(s->rxmit, sacked_seq)) {
        reordered = 1;
        tcpstat.tcps_avoid_rxmt++;
    }

    if (reordered) {
        if (tcp_detect_reordering == 1 &&
            !(tp->t_flagsext & TF_PKTS_REORDERED)) {
            tp->t_flagsext |= TF_PKTS_REORDERED;
            tcpstat.tcps_detect_reordering++;
        }

        tcpstat.tcps_reordered_pkts++;
        tp->t_reordered_pkts++;

        /*
         * If reordering is seen on a connection with ECN enabled,
         * increment the heuristic
         */
        if (TCP_ECN_ENABLED(tp)) {
            INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_reorder);
            tcpstat.tcps_ecn_fallback_reorder++;
            tcp_heuristic_ecn_aggressive(tp);
        }

        VERIFY(SEQ_GEQ(snd_fack, s->rxmit));

        if (s->rxmit_start > 0) {
            rext = timer_diff(tcp_now, 0, s->rxmit_start, 0);
            if (rext < 0)
                return;

            /*
             * We take the maximum reorder window to schedule
             * DELAYFR timer as that will take care of jitter
             * on the network path.
             *
             * Computing average and standard deviation seems
             * to cause unnecessary retransmissions when there
             * is high jitter.
             *
             * We set a maximum of SRTT/2 and a minimum of
             * 10 ms on the reorder window.
             */
            tp->t_reorderwin = max(tp->t_reorderwin, rext);
            tp->t_reorderwin = min(tp->t_reorderwin,
                (tp->t_srtt >> (TCP_RTT_SHIFT - 1)));
            tp->t_reorderwin = max(tp->t_reorderwin, 10);
        }
    }
}
/*
 * This function is called upon receipt of new valid data (while not in header
 * prediction mode), and it updates the ordered list of sacks.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
    /*
     * First reported block MUST be the most recent one.  Subsequent
     * blocks SHOULD be in the order in which they arrived at the
     * receiver.  These two conditions make the implementation fully
     * compliant with RFC 2018.
     */
    struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
    int num_head, num_saved, i;

    /* SACK block for the received segment. */
    head_blk.start = rcv_start;
    head_blk.end = rcv_end;

    /*
     * Merge updated SACK blocks into head_blk, and
     * save unchanged SACK blocks into saved_blks[].
     * num_saved will have the number of the saved SACK blocks.
     */
    num_saved = 0;
    for (i = 0; i < tp->rcv_numsacks; i++) {
        tcp_seq start = tp->sackblks[i].start;
        tcp_seq end = tp->sackblks[i].end;
        if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
            /*
             * Discard this SACK block.
             */
        } else if (SEQ_LEQ(head_blk.start, end) &&
            SEQ_GEQ(head_blk.end, start)) {
            /*
             * Merge this SACK block into head_blk.
             * This SACK block itself will be discarded.
             */
            if (SEQ_GT(head_blk.start, start))
                head_blk.start = start;
            if (SEQ_LT(head_blk.end, end))
                head_blk.end = end;
        } else {
            /*
             * Save this SACK block.
             */
            saved_blks[num_saved].start = start;
            saved_blks[num_saved].end = end;
            num_saved++;
        }
    }

    /*
     * Update SACK list in tp->sackblks[].
     */
    num_head = 0;
    if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
        /*
         * The received data segment is an out-of-order segment.
         * Put head_blk at the top of SACK list.
         */
        tp->sackblks[0] = head_blk;
        num_head = 1;
        /*
         * If the number of saved SACK blocks exceeds its limit,
         * discard the last SACK block.
         */
        if (num_saved >= MAX_SACK_BLKS)
            num_saved--;
    }
    if (num_saved > 0) {
        /*
         * Copy the saved SACK blocks back.
         */
        bcopy(saved_blks, &tp->sackblks[num_head],
            sizeof(struct sackblk) * num_saved);
    }

    /* Save the number of SACK blocks. */
    tp->rcv_numsacks = num_head + num_saved;

    /* If we are requesting SACK recovery, reset the stretch-ack state
     * so that the connection will generate more acks after recovery and
     * the sender's cwnd will open.
     */
    if ((tp->t_flags & TF_STRETCHACK) != 0 && tp->rcv_numsacks > 0)
        tcp_reset_stretch_ack(tp);

#if TRAFFIC_MGT
    if (tp->acc_iaj > 0 && tp->rcv_numsacks > 0)
        reset_acc_iaj(tp);
#endif /* TRAFFIC_MGT */
}
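/*
 * Worked example for the merge above (values hypothetical): with
 * rcv_nxt = 100 and an existing list { [200,300), [400,500) }, receipt
 * of segment [300,400) calls tcp_update_sack_list(tp, 300, 400).
 * head_blk starts as [300,400), absorbs [200,300) on its left edge and
 * then [400,500) on its right edge, leaving the single block [200,500)
 * at the head of tp->sackblks[] with tp->rcv_numsacks == 1.
 */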
/*
 * Tcp output routine: figure out what should be sent and send it.
 */
int
tcp_output(struct tcpcb *tp)
{
    struct socket *so = tp->t_inpcb->inp_socket;
    long len, recwin, sendwin;
    int off, flags, error;
#ifdef TCP_SIGNATURE
    int sigoff = 0;
#endif
    struct mbuf *m;
    struct ip *ip = NULL;
    struct ipovly *ipov = NULL;
    struct tcphdr *th;
    u_char opt[TCP_MAXOLEN];
    unsigned ipoptlen, optlen, hdrlen;
    int idle, sendalot;
    int i, sack_rxmit;
    int sack_bytes_rxmt;
    struct sackhole *p;
#if 0
    int maxburst = TCP_MAXBURST;
#endif
    struct rmxp_tao tao;
#ifdef INET6
    struct ip6_hdr *ip6 = NULL;
    int isipv6;

    bzero(&tao, sizeof(tao));
    isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
#ifdef TCP_ECN
    int needect;
#endif

    INP_LOCK_ASSERT(tp->t_inpcb);

    /*
     * Determine length of data that should be transmitted,
     * and flags that will be used.
     * If there is some data or critical controls (SYN, RST)
     * to send, then transmit; otherwise, investigate further.
     */
    idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
    if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
        /*
         * We have been idle for "a while" and no acks are
         * expected to clock out any data we send --
         * slow start to get ack "clock" running again.
         *
         * Set the slow-start flight size depending on whether
         * this is a local network or not.
         */
        int ss = ss_fltsz;
#ifdef INET6
        if (isipv6) {
            if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
                ss = ss_fltsz_local;
        } else
#endif
        if (in_localaddr(tp->t_inpcb->inp_faddr))
            ss = ss_fltsz_local;
        tp->snd_cwnd = tp->t_maxseg * ss;
    }
    tp->t_flags &= ~TF_LASTIDLE;
    if (idle) {
        if (tp->t_flags & TF_MORETOCOME) {
            tp->t_flags |= TF_LASTIDLE;
            idle = 0;
        }
    }

again:
    /*
     * If we've recently taken a timeout, snd_max will be greater than
     * snd_nxt.  There may be SACK information that allows us to avoid
     * resending already delivered data.  Adjust snd_nxt accordingly.
     */
    if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
        tcp_sack_adjust(tp);
    sendalot = 0;
    off = tp->snd_nxt - tp->snd_una;
    sendwin = min(tp->snd_wnd, tp->snd_cwnd);
    sendwin = min(sendwin, tp->snd_bwnd);

    flags = tcp_outflags[tp->t_state];
    /*
     * Send any SACK-generated retransmissions.  If we're explicitly trying
     * to send out new data (when sendalot is 1), bypass this function.
     * If we retransmit in fast recovery mode, decrement snd_cwnd, since
     * we're replacing a (future) new transmission with a retransmission
     * now, and we previously incremented snd_cwnd in tcp_input().
     */
    /*
     * Still in sack recovery, reset rxmit flag to zero.
     */
    sack_rxmit = 0;
    sack_bytes_rxmt = 0;
    len = 0;
    p = NULL;
    if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
        (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
        long cwin;

        cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
        if (cwin < 0)
            cwin = 0;
        /* Do not retransmit SACK segments beyond snd_recover */
        if (SEQ_GT(p->end, tp->snd_recover)) {
            /*
             * (At least) part of sack hole extends beyond
             * snd_recover. Check to see if we can rexmit data
             * for this hole.
             */
            if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
                /*
                 * Can't rexmit any more data for this hole.
                 * That data will be rexmitted in the next
                 * sack recovery episode, when snd_recover
                 * moves past p->rxmit.
                 */
                p = NULL;
                goto after_sack_rexmit;
            } else
                /* Can rexmit part of the current hole */
                len = ((long)ulmin(cwin,
                    tp->snd_recover - p->rxmit));
        } else
            len = ((long)ulmin(cwin, p->end - p->rxmit));
        off = p->rxmit - tp->snd_una;
        KASSERT(off >= 0, ("%s: sack block to the left of una : %d",
            __func__, off));
        if (len > 0) {
            sack_rxmit = 1;
            sendalot = 1;
            tcpstat.tcps_sack_rexmits++;
            tcpstat.tcps_sack_rexmit_bytes +=
                min(len, tp->t_maxseg);
        }
    }
after_sack_rexmit:
    /*
     * Get standard flags, and add SYN or FIN if requested by 'hidden'
     * state flags.
     */
    if (tp->t_flags & TF_NEEDFIN)
        flags |= TH_FIN;
    if (tp->t_flags & TF_NEEDSYN)
        flags |= TH_SYN;

    SOCKBUF_LOCK(&so->so_snd);
    /*
     * If in persist timeout with window of 0, send 1 byte.
     * Otherwise, if window is small but nonzero
     * and timer expired, we will send what we can
     * and go to transmit state.
     */
    if (tp->t_force) {
        if (sendwin == 0) {
            /*
             * If we still have some data to send, then
             * clear the FIN bit.  Usually this would
             * happen below when it realizes that we
             * aren't sending all the data.  However,
             * if we have exactly 1 byte of unsent data,
             * then it won't clear the FIN bit below,
             * and if we are in persist state, we wind
             * up sending the packet without recording
             * that we sent the FIN bit.
             *
             * We can't just blindly clear the FIN bit,
             * because if we don't have any more data
             * to send then the probe will be the FIN
             * itself.
             */
            if (off < so->so_snd.sb_cc)
                flags &= ~TH_FIN;
            sendwin = 1;
        } else {
            callout_stop(tp->tt_persist);
            tp->t_rxtshift = 0;
        }
    }

    /*
     * If snd_nxt == snd_max and we have transmitted a FIN, the
     * offset will be > 0 even if so_snd.sb_cc is 0, resulting in
     * a negative length.  This can also occur when TCP opens up
     * its congestion window while receiving additional duplicate
     * acks after fast-retransmit because TCP will reset snd_nxt
     * to snd_max after the fast-retransmit.
     *
     * In the normal retransmit-FIN-only case, however, snd_nxt will
     * be set to snd_una, the offset will be 0, and the length may
     * wind up 0.
     *
     * If sack_rxmit is true we are retransmitting from the scoreboard
     * in which case len is already set.
     */
    if (sack_rxmit == 0) {
        if (sack_bytes_rxmt == 0)
            len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
        else {
            long cwin;

            /*
             * We are inside of a SACK recovery episode and are
             * sending new data, having retransmitted all the
             * data possible in the scoreboard.
             */
            len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd)
                - off);
            /*
             * Don't remove this (len > 0) check !
             * We explicitly check for len > 0 here (although it
             * isn't really necessary), to work around a gcc
             * optimization issue - to force gcc to compute
             * len above. Without this check, the computation
             * of len is bungled by the optimizer.
             */
            if (len > 0) {
                cwin = tp->snd_cwnd -
                    (tp->snd_nxt - tp->sack_newdata) -
                    sack_bytes_rxmt;
                if (cwin < 0)
                    cwin = 0;
                len = lmin(len, cwin);
            }
        }
    }

    /*
     * Lop off SYN bit if it has already been sent.  However, if this
     * is SYN-SENT state and if segment contains data and if we don't
     * know that foreign host supports TAO, suppress sending segment.
     */
    if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
        flags &= ~TH_SYN;
        off--, len++;
        if (tcp_do_rfc1644)
            tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao);
        if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
            tao.tao_ccsent == 0)
            goto just_return;
    }

    /*
     * Be careful not to send data and/or FIN on SYN segments
     * in cases when no CC option will be sent.
     * This measure is needed to prevent interoperability problems
     * with not fully conformant TCP implementations.
     */
    if ((flags & TH_SYN) &&
        ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
         ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
        len = 0;
        flags &= ~TH_FIN;
    }

    if (len < 0) {
        /*
         * If FIN has been sent but not acked,
         * but we haven't been called to retransmit,
         * len will be < 0.  Otherwise, window shrank
         * after we sent into it.  If window shrank to 0,
         * cancel pending retransmit, pull snd_nxt back
         * to (closed) window, and set the persist timer
         * if it isn't already going.  If the window didn't
         * close completely, just wait for an ACK.
         */
        len = 0;
        if (sendwin == 0) {
            callout_stop(tp->tt_rexmt);
            tp->t_rxtshift = 0;
            tp->snd_nxt = tp->snd_una;
            if (!callout_active(tp->tt_persist))
                tcp_setpersist(tp);
        }
    }

    /*
     * len will be >= 0 after this point.  Truncate to the maximum
     * segment length and ensure that FIN is removed if the length
     * no longer contains the last data byte.
     */
    if (len > tp->t_maxseg) {
        len = tp->t_maxseg;
        sendalot = 1;
    }
    if (sack_rxmit) {
        if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
            flags &= ~TH_FIN;
    } else {
        if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
            flags &= ~TH_FIN;
    }

    recwin = sbspace(&so->so_rcv);

    /*
     * Sender silly window avoidance.  We transmit under the following
     * conditions when len is non-zero:
     *
     *  - We have a full segment
     *  - This is the last buffer in a write()/send() and we are
     *    either idle or running NODELAY
     *  - we've timed out (e.g. persist timer)
     *  - we have more than 1/2 the maximum send window's worth of
     *    data (receiver may be limited by the window size)
     *  - we need to retransmit
     */
    if (len) {
        if (len == tp->t_maxseg)
            goto send;
        /*
         * NOTE! on localhost connections an 'ack' from the remote
         * end may occur synchronously with the output and cause
         * us to flush a buffer queued with moretocome.  XXX
         *
         * note: the len + off check is almost certainly unnecessary.
         */
        if (!(tp->t_flags & TF_MORETOCOME) &&    /* normal case */
            (idle || (tp->t_flags & TF_NODELAY)) &&
            len + off >= so->so_snd.sb_cc &&
            (tp->t_flags & TF_NOPUSH) == 0) {
            goto send;
        }
        if (tp->t_force)            /* typ. timeout case */
            goto send;
        if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
            goto send;
        if (SEQ_LT(tp->snd_nxt, tp->snd_max))    /* retransmit case */
            goto send;
        if (sack_rxmit)
            goto send;
    }

    /*
     * Compare available window to amount of window
     * known to peer (as advertised window less
     * next expected input).  If the difference is at least two
     * max size segments, or at least 50% of the maximum possible
     * window, then want to send a window update to peer.
     * Skip this if the connection is in T/TCP half-open state.
     */
    if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN)) {
        /*
         * "adv" is the amount we can increase the window,
         * taking into account that we are limited by
         * TCP_MAXWIN << tp->rcv_scale.
         */
        long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
            (tp->rcv_adv - tp->rcv_nxt);

        if (adv >= (long) (2 * tp->t_maxseg))
            goto send;
        if (2 * adv >= (long) so->so_rcv.sb_hiwat)
            goto send;
    }

    /*
     * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
     * is also a catch-all for the retransmit timer timeout case.
     */
    if (tp->t_flags & TF_ACKNOW)
        goto send;
    if ((flags & TH_RST) ||
        ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
        goto send;
    if (SEQ_GT(tp->snd_up, tp->snd_una))
        goto send;
    /*
     * If our state indicates that FIN should be sent
     * and we have not yet done so, then we need to send.
     */
    if (flags & TH_FIN &&
        ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
        goto send;

    /*
     * In SACK, it is possible for tcp_output to fail to send a segment
     * after the retransmission timer has been turned off.  Make sure
     * that the retransmission timer is set.
     */
    if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
        !callout_active(tp->tt_rexmt) &&
        !callout_active(tp->tt_persist)) {
        callout_reset(tp->tt_rexmt, tp->t_rxtcur,
            tcp_timer_rexmt, tp);
        goto just_return;
    }
    /*
     * TCP window updates are not reliable, rather a polling protocol
     * using ``persist'' packets is used to ensure receipt of window
     * updates.  The three ``states'' for the output side are:
     *	idle			not doing retransmits or persists
     *	persisting		to move a small or zero window
     *	(re)transmitting	and thereby not persisting
     *
     * callout_active(tp->tt_persist)
     *	is true when we are in persist state.
     * tp->t_force
     *	is set when we are called to send a persist packet.
     * callout_active(tp->tt_rexmt)
     *	is set when we are retransmitting
     * The output side is idle when both timers are zero.
     *
     * If send window is too small, there is data to transmit, and no
     * retransmit or persist is pending, then go to persist state.
     * If nothing happens soon, send when timer expires:
     * if window is nonzero, transmit what we can,
     * otherwise force out a byte.
     */
    if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
        !callout_active(tp->tt_persist)) {
        tp->t_rxtshift = 0;
        tcp_setpersist(tp);
    }

    /*
     * No reason to send a segment, just return.
     */
just_return:
    SOCKBUF_UNLOCK(&so->so_snd);
    return (0);

send:
    SOCKBUF_LOCK_ASSERT(&so->so_snd);
    /*
     * Before ESTABLISHED, force sending of initial options
     * unless TCP set not to do any options.
     * NOTE: we assume that the IP/TCP header plus TCP options
     * always fit in a single mbuf, leaving room for a maximum
     * link header, i.e.
     *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
     */
    optlen = 0;
#ifdef INET6
    if (isipv6)
        hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
    else
#endif
    hdrlen = sizeof (struct tcpiphdr);
    if (flags & TH_SYN) {
        tp->snd_nxt = tp->iss;
        if ((tp->t_flags & TF_NOOPT) == 0) {
            u_short mss;

            opt[0] = TCPOPT_MAXSEG;
            opt[1] = TCPOLEN_MAXSEG;
            mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc));
            (void)memcpy(opt + 2, &mss, sizeof(mss));
            optlen = TCPOLEN_MAXSEG;

            /*
             * If this is the first SYN of connection (not a SYN
             * ACK), include SACK_PERMIT_HDR option.  If this is a
             * SYN ACK, include SACK_PERMIT_HDR option if peer has
             * already done so. This is only for active connect,
             * since the syncache takes care of the passive connect.
             */
            if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
                (tp->t_flags & TF_SACK_PERMIT))) {
                *((u_int32_t *) (opt + optlen)) =
                    htonl(TCPOPT_SACK_PERMIT_HDR);
                optlen += 4;
            }

            if ((tp->t_flags & TF_REQ_SCALE) &&
                ((flags & TH_ACK) == 0 ||
                (tp->t_flags & TF_RCVD_SCALE))) {
                *((u_int32_t *)(opt + optlen)) = htonl(
                    TCPOPT_NOP << 24 |
                    TCPOPT_WINDOW << 16 |
                    TCPOLEN_WINDOW << 8 |
                    tp->request_r_scale);
                optlen += 4;
            }
        }
    }

    /*
     * Send a timestamp and echo-reply if this is a SYN and our side
     * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
     * and our peer have sent timestamps in our SYN's.
     */
    if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
        (flags & TH_RST) == 0 &&
        ((flags & TH_ACK) == 0 ||
         (tp->t_flags & TF_RCVD_TSTMP))) {
        u_int32_t *lp = (u_int32_t *)(opt + optlen);

        /* Form timestamp option as shown in appendix A of RFC 1323.
         */
        *lp++ = htonl(TCPOPT_TSTAMP_HDR);
        *lp++ = htonl(ticks);
        *lp   = htonl(tp->ts_recent);
        optlen += TCPOLEN_TSTAMP_APPA;
    }

    /*
     * Send SACKs if necessary.  This should be the last option processed.
     * Only as many SACKs are sent as are permitted by the maximum options
     * size.  No more than three SACKs are sent.
     */
    if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
        (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
        tp->rcv_numsacks) {
        u_int32_t *lp = (u_int32_t *)(opt + optlen);
        u_int32_t *olp = lp++;
        int count = 0;  /* actual number of SACKs inserted */
        int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;

        tcpstat.tcps_sack_send_blocks++;
        maxsack = min(maxsack, TCP_MAX_SACK);
        for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
            struct sackblk sack = tp->sackblks[i];
            if (sack.start == 0 && sack.end == 0)
                continue;
            *lp++ = htonl(sack.start);
            *lp++ = htonl(sack.end);
            count++;
        }
        *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
        optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
    }
    /*
     * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
     * options are allowed (!TF_NOOPT) and it's not a RST.
     */
    if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
        (flags & TH_RST) == 0) {
        switch (flags & (TH_SYN|TH_ACK)) {
        /*
         * This is a normal ACK, send CC if we received CC before
         * from our peer.
         */
        case TH_ACK:
            if (!(tp->t_flags & TF_RCVD_CC))
                break;
            /*FALLTHROUGH*/

        /*
         * We can only get here in T/TCP's SYN_SENT* state, when
         * we're sending a non-SYN segment without waiting for
         * the ACK of our SYN.  A check above assures that we only
         * do this if our peer understands T/TCP.
         */
        case 0:
            opt[optlen++] = TCPOPT_NOP;
            opt[optlen++] = TCPOPT_NOP;
            opt[optlen++] = TCPOPT_CC;
            opt[optlen++] = TCPOLEN_CC;
            *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
            optlen += 4;
            break;

        /*
         * This is our initial SYN, check whether we have to use
         * CC or CC.new.
         */
        case TH_SYN:
            opt[optlen++] = TCPOPT_NOP;
            opt[optlen++] = TCPOPT_NOP;
            opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
                TCPOPT_CCNEW : TCPOPT_CC;
            opt[optlen++] = TCPOLEN_CC;
            *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
            optlen += 4;
            break;

        /*
         * This is a SYN,ACK; send CC and CC.echo if we received
         * CC from our peer.
         */
        case (TH_SYN|TH_ACK):
            if (tp->t_flags & TF_RCVD_CC) {
                opt[optlen++] = TCPOPT_NOP;
                opt[optlen++] = TCPOPT_NOP;
                opt[optlen++] = TCPOPT_CC;
                opt[optlen++] = TCPOLEN_CC;
                *(u_int32_t *)&opt[optlen] =
                    htonl(tp->cc_send);
                optlen += 4;

                opt[optlen++] = TCPOPT_NOP;
                opt[optlen++] = TCPOPT_NOP;
                opt[optlen++] = TCPOPT_CCECHO;
                opt[optlen++] = TCPOLEN_CC;
                *(u_int32_t *)&opt[optlen] =
                    htonl(tp->cc_recv);
                optlen += 4;
            }
            break;
        }
    }

#ifdef TCP_SIGNATURE
#ifdef INET6
    if (!isipv6)
#endif
    if (tp->t_flags & TF_SIGNATURE) {
        int i;
        u_char *bp;

        /* Initialize TCP-MD5 option (RFC2385) */
        bp = (u_char *)opt + optlen;
        *bp++ = TCPOPT_SIGNATURE;
        *bp++ = TCPOLEN_SIGNATURE;
        sigoff = optlen + 2;
        for (i = 0; i < TCP_SIGLEN; i++)
            *bp++ = 0;
        optlen += TCPOLEN_SIGNATURE;

        /* Terminate options list and maintain 32-bit alignment. */
        *bp++ = TCPOPT_NOP;
        *bp++ = TCPOPT_EOL;
        optlen += 2;
    }
#endif /* TCP_SIGNATURE */

    hdrlen += optlen;

#ifdef INET6
    if (isipv6)
        ipoptlen = ip6_optlen(tp->t_inpcb);
    else
#endif
    if (tp->t_inpcb->inp_options)
        ipoptlen = tp->t_inpcb->inp_options->m_len -
            offsetof(struct ipoption, ipopt_list);
    else
static int
tcpup_state_receive(struct tcpup_info *upp, struct tcpiphdr *tcp, size_t dlen)
{
    int xflags = 0;
    int snd_una = htonl(tcp->th_ack);

    if (tcp->th_flags & TH_RST) {
        upp->t_state = TCPS_CLOSED;
        return 0;
    }

    if ((tcp->th_flags & TH_ACK) &&
        SEQ_GT(snd_una, upp->snd_una)) {
        /* update snd una from peer */
        upp->snd_una = snd_una;
    }

    switch (upp->t_state) {
        case TCPS_SYN_SENT:
            xflags = TH_SYN | TH_ACK;
            if ((tcp->th_flags & xflags) == TH_SYN) {
                assert((tcp->th_flags & TH_FIN) != TH_FIN);
                tcp_state_preload(upp, TCPS_SYN_RECEIVED,
                    htonl(tcp->th_seq) + 1);
                return 0;
            }
            if ((tcp->th_flags & xflags) == xflags &&
                SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
                assert((tcp->th_flags & TH_FIN) != TH_FIN);
                tcp_state_preload(upp, TCPS_ESTABLISHED,
                    htonl(tcp->th_seq));
                return 0;
            }
            break;

        case TCPS_SYN_RECEIVED:
            if ((tcp->th_flags & TH_ACK) == TH_ACK &&
                SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
                assert((tcp->th_flags & TH_FIN) != TH_FIN);
                tcp_state_preload(upp, TCPS_ESTABLISHED,
                    htonl(tcp->th_seq));
                return 0;
            }
            break;

        case TCPS_ESTABLISHED:
            if ((tcp->th_flags & TH_FIN) == TH_FIN) {
                tcp_state_preload(upp, TCPS_CLOSE_WAIT,
                    htonl(tcp->th_seq) + 1);
                return 0;
            }
            break;

        case TCPS_FIN_WAIT_1:
            xflags = TH_FIN | TH_ACK;
            if ((tcp->th_flags & xflags) == xflags &&
                SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
                tcp_state_preload(upp, TCPS_TIME_WAIT,
                    htonl(tcp->th_seq) + dlen + 1);
                return 0;
            }
            if ((tcp->th_flags & TH_FIN) == TH_FIN) {
                tcp_state_preload(upp, TCPS_CLOSING,
                    htonl(tcp->th_seq) + dlen + 1);
                return 0;
            }
            if ((tcp->th_flags & TH_ACK) == TH_ACK &&
                SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
                tcp_state_preload(upp, TCPS_FIN_WAIT_2,
                    htonl(tcp->th_seq) + dlen);
                return 0;
            }
            break;

        case TCPS_FIN_WAIT_2:
            if ((tcp->th_flags & TH_FIN) == TH_FIN) {
                tcp_state_preload(upp, TCPS_TIME_WAIT,
                    htonl(tcp->th_seq) + dlen + 1);
                return 0;
            }
            break;

        case TCPS_CLOSING:
            if ((tcp->th_flags & TH_ACK) == TH_ACK &&
                SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
                tcp_state_preload(upp, TCPS_TIME_WAIT,
                    htonl(tcp->th_seq) + dlen);
                return 0;
            }
            break;

        case TCPS_LAST_ACK:
            if ((tcp->th_flags & TH_ACK) == TH_ACK &&
                SEQ_GEQ(htonl(tcp->th_ack), upp->snd_nxt)) {
                tcp_state_preload(upp, TCPS_CLOSED,
                    htonl(tcp->th_seq) + dlen);
                return 0;
            }
            break;

        case TCPS_TIME_WAIT:
            fprintf(stderr, "before TIME_WAIT -> TIME_WAIT\n");
            break;
    }

    return 0;
}
/*
 * To insert a new blk to the array of SACK blks in receiver.
 *
 * Parameters:
 *	sack_blk_t *head: pointer to the array of SACK blks.
 *	tcp_seq begin: starting seq num of the new blk.
 *	tcp_seq end: ending seq num of the new blk.
 *	int32_t *num: (referenced) total num of SACK blks on the list.
 */
void
tcp_sack_insert(sack_blk_t *head, tcp_seq begin, tcp_seq end, int32_t *num)
{
    int32_t i, j, old_num, new_num;
    sack_blk_t tmp[MAX_SACK_BLK - 1];

    /* The array is empty, just add the new one. */
    if (*num == 0) {
        head[0].begin = begin;
        head[0].end = end;
        *num = 1;
        return;
    }

    /*
     * Check for overlap.  There are five cases.
     *
     * 1. there is no overlap with any other SACK blks.
     * 2. new SACK blk is completely contained in another blk.
     * 3. tail part of new SACK blk overlaps with another blk.
     * 4. head part of new SACK blk overlaps with another blk.
     * 5. new SACK blk completely contains another blk.
     *
     * Use tmp to hold old SACK blks.  After the loop, copy them back
     * to head.
     */
    old_num = *num;
    if (old_num > MAX_SACK_BLK - 1) {
        old_num = MAX_SACK_BLK - 1;
    }
    new_num = old_num;
    j = 0;
    for (i = 0; i < old_num; i++) {
        if (SEQ_LT(end, head[i].begin) || SEQ_GT(begin, head[i].end)) {
            /* Case 1: continue to check. */
            tmp[j].begin = head[i].begin;
            tmp[j].end = head[i].end;
            j++;
            continue;
        } else if (SEQ_GEQ(begin, head[i].begin) &&
            SEQ_LEQ(end, head[i].end)) {
            /* Case 2: re-insert the old blk to the head. */
            begin = head[i].begin;
            end = head[i].end;
        } else if (SEQ_LEQ(end, head[i].end) &&
            SEQ_GEQ(end, head[i].begin)) {
            /*
             * Case 3: Extend the new blk, remove the old one
             * and continue to check.
             */
            end = head[i].end;
        } else if (SEQ_GEQ(begin, head[i].begin) &&
            SEQ_LEQ(begin, head[i].end)) {
            /* Case 4 */
            begin = head[i].begin;
        }

        /*
         * Common code for all cases except the first one, which
         * copies the original SACK blk into the tmp storage.  Other
         * cases remove the original SACK blk by not copying into
         * tmp storage.
         */
        new_num--;
    }

    head[0].begin = begin;
    head[0].end = end;
    for (i = 0; i < new_num; i++) {
        head[i+1].begin = tmp[i].begin;
        head[i+1].end = tmp[i].end;
    }
    *num = new_num + 1;
}
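/*
 * Worked example (values hypothetical): with an existing list
 * { [100,200) } and num == 1, the call
 *
 *     tcp_sack_insert(head, 150, 250, &num);
 *
 * matches case 4 (head part of the new blk overlaps the old one):
 * begin is pulled back to 100, the old blk is dropped from tmp[], and
 * the list becomes { [100,250) } with num still 1, the merged blk
 * sitting at the head per RFC 2018's most-recent-first rule.
 */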
/*
 * SD PDU / SOS_READY Processor
 *
 * Arguments:
 *    sop    pointer to sscop connection block
 *    m      pointer to PDU buffer (without trailer)
 *    trlr   pointer to PDU trailer
 *
 * Returns:
 *    none
 *
 */
static void
sscop_sd_ready(struct sscop *sop, KBuffer *m, caddr_t trlr)
{
    struct sd_pdu *sp = (struct sd_pdu *)trlr;
    struct pdu_hdr *php;
    KBuffer *n;
    sscop_seq ns;
    int err, space;

    /*
     * Get PDU sequence number
     */
    SEQ_SET(ns, ntohl(sp->sd_ns));

    /*
     * Ensure that the sequence number fits within the window
     */
    if (SEQ_GEQ(ns, sop->so_rcvmax, sop->so_rcvnext)) {
        /*
         * It doesn't, drop received data
         */
        KB_FREEALL(m);

        /*
         * If next highest PDU hasn't reached window end yet,
         * then send a USTAT to inform transmitter of this gap
         */
        if (SEQ_LT(sop->so_rcvhigh, sop->so_rcvmax, sop->so_rcvnext)) {
            sscop_send_ustat(sop, sop->so_rcvmax);
            sop->so_rcvhigh = sop->so_rcvmax;
        }
        return;
    }

    /*
     * If this is the next in-sequence PDU, hand it to user
     */
    if (ns == sop->so_rcvnext) {
        STACK_CALL(SSCOP_DATA_IND, sop->so_upper, sop->so_toku,
            sop->so_connvc, (int)m, ns, err);
        if (err) {
            KB_FREEALL(m);
            return;
        }

        /*
         * Bump next expected sequence number
         */
        SEQ_INCR(sop->so_rcvnext, 1);

        /*
         * Slide receive window down
         */
        SEQ_INCR(sop->so_rcvmax, 1);

        /*
         * Is this the highest sequence PDU we've received??
         */
        if (ns == sop->so_rcvhigh) {
            /*
             * Yes, bump the limit and exit
             */
            sop->so_rcvhigh = sop->so_rcvnext;
            return;
        }

        /*
         * This is a retransmitted PDU, so see if we have
         * more in-sequence PDUs already queued up
         */
        while ((php = sop->so_recv_hd) &&
               (php->ph_ns == sop->so_rcvnext)) {
            /*
             * Yup we do, so remove next PDU from queue and
             * pass it up to the user as well
             */
            sop->so_recv_hd = php->ph_recv_lk;
            if (sop->so_recv_hd == NULL)
                sop->so_recv_tl = NULL;
            STACK_CALL(SSCOP_DATA_IND, sop->so_upper, sop->so_toku,
                sop->so_connvc, (int)php->ph_buf, php->ph_ns, err);
            if (err) {
                /*
                 * Should never happen, but...
                 */
                KB_FREEALL(php->ph_buf);
                sscop_abort(sop, "stack memory\n");
                return;
            }

            /*
             * Bump next expected sequence number
             */
            SEQ_INCR(sop->so_rcvnext, 1);

            /*
             * Slide receive window down
             */
            SEQ_INCR(sop->so_rcvmax, 1);
        }

        /*
         * Finished with data delivery...
         */
        return;
    }

    /*
     * We're gonna have to queue this PDU, so find space
     * for the PDU header
     */
    KB_HEADROOM(m, space);

    /*
     * If there's not enough room in the received buffer,
     * allocate & link a new buffer for the header
     */
    if (space < sizeof(struct pdu_hdr)) {

        KB_ALLOC(n, sizeof(struct pdu_hdr), KB_F_NOWAIT, KB_T_HEADER);
        if (n == NULL) {
            KB_FREEALL(m);
            return;
        }
        KB_HEADSET(n, sizeof(struct pdu_hdr));
        KB_LEN(n) = 0;
        KB_LINKHEAD(n, m);
        m = n;
    }

    /*
     * Build PDU header
     *
     * We can at least assume/require that the start of
     * the user data is aligned.  Also note that we don't
     * include this header in the buffer len/offset fields.
     */
    KB_DATASTART(m, php, struct pdu_hdr *);
    php--;
    php->ph_ns = ns;
    php->ph_buf = m;

    /*
     * Insert PDU into the receive queue
     */
    if (sscop_recv_insert(sop, php)) {
        /*
         * Oops, a duplicate sequence number PDU is already on
         * the queue, something's wrong here.
         */
        sscop_maa_error(sop, 'Q');

        /*
         * Free buffers
         */
        KB_FREEALL(m);

        /*
         * Go into recovery mode
         */
        q2110_error_recovery(sop);

        return;
    }

    /*
     * Are we at the high-water mark??
     */
    if (ns == sop->so_rcvhigh) {
        /*
         * Yes, just bump the mark
         */
        SEQ_INCR(sop->so_rcvhigh, 1);
        return;
    }

    /*
     * Are we beyond the high-water mark??
     */
    if (SEQ_GT(ns, sop->so_rcvhigh, sop->so_rcvnext)) {
        /*
         * Yes, then there's a missing PDU, so inform the transmitter
         */
        sscop_send_ustat(sop, ns);

        /*
         * Update high-water mark
         */
        sop->so_rcvhigh = SEQ_ADD(ns, 1);
    }

    return;
}
/**
 *  \brief insert a SACK range
 *
 *  \param le left edge in host order
 *  \param re right edge in host order
 *
 *  \retval 0 all is good
 *  \retval -1 error
 */
static int StreamTcpSackInsertRange(TcpStream *stream, uint32_t le, uint32_t re)
{
    SCLogDebug("le %u, re %u", le, re);
#ifdef DEBUG
    StreamTcpSackPrintList(stream);
#endif

    /* if to the left of last_ack then ignore */
    if (SEQ_LT(re, stream->last_ack)) {
        SCLogDebug("too far left. discarding");
        goto end;
    }
    /* if to the right of the tcp window then ignore */
    if (SEQ_GT(le, (stream->last_ack + stream->window))) {
        SCLogDebug("too far right. discarding");
        goto end;
    }

    if (stream->sack_head != NULL) {
        StreamTcpSackRecord *rec;

        for (rec = stream->sack_head; rec != NULL; rec = rec->next) {
            SCLogDebug("rec %p, le %u, re %u", rec, rec->le, rec->re);

            if (SEQ_LT(le, rec->le)) {
                SCLogDebug("SEQ_LT(le, rec->le)");
                if (SEQ_LT(re, rec->le)) {
                    SCLogDebug("SEQ_LT(re, rec->le)");
                    // entirely before, prepend
                    StreamTcpSackRecord *stsr = StreamTcpSackRecordAlloc();
                    if (unlikely(stsr == NULL)) {
                        SCReturnInt(-1);
                    }
                    stsr->le = le;
                    stsr->re = re;
                    stsr->next = stream->sack_head;
                    stream->sack_head = stsr;
                    goto end;
                } else if (SEQ_EQ(re, rec->le)) {
                    SCLogDebug("SEQ_EQ(re, rec->le)");
                    // starts before, ends on rec->le, expand
                    rec->le = le;
                } else if (SEQ_GT(re, rec->le)) {
                    SCLogDebug("SEQ_GT(re, rec->le)");
                    // starts before, ends beyond rec->le
                    if (SEQ_LEQ(re, rec->re)) {
                        SCLogDebug("SEQ_LEQ(re, rec->re)");
                        // ends before rec->re, expand
                        rec->le = le;
                    } else { // implied if (re > rec->re)
                        SCLogDebug("implied if (re > rec->re), le set to %u", rec->re);
                        le = rec->re;
                        continue;
                    }
                }
            } else if (SEQ_EQ(le, rec->le)) {
                SCLogDebug("SEQ_EQ(le, rec->le)");
                if (SEQ_LEQ(re, rec->re)) {
                    SCLogDebug("SEQ_LEQ(re, rec->re)");
                    // new record fully overlapped
                    SCReturnInt(0);
                } else { // implied re > rec->re
                    SCLogDebug("implied re > rec->re");
                    if (rec->next != NULL) {
                        if (SEQ_LEQ(re, rec->next->le)) {
                            rec->re = re;
                            goto end;
                        } else {
                            rec->re = rec->next->le;
                            le = rec->next->le;
                            SCLogDebug("le is now %u", le);
                            continue;
                        }
                    } else {
                        rec->re = re;
                        goto end;
                    }
                }
            } else { // implied (le > rec->le)
                SCLogDebug("implied (le > rec->le)");
                if (SEQ_LT(le, rec->re)) {
                    SCLogDebug("SEQ_LT(le, rec->re))");
                    // new record fully overlapped
                    if (SEQ_GT(re, rec->re)) {
                        SCLogDebug("SEQ_GT(re, rec->re)");
                        if (rec->next != NULL) {
                            if (SEQ_LEQ(re, rec->next->le)) {
                                rec->re = re;
                                goto end;
                            } else {
                                rec->re = rec->next->le;
                                le = rec->next->le;
                                continue;
                            }
                        } else {
                            rec->re = re;
                            goto end;
                        }
                    }

                    SCLogDebug("new range fully overlapped");
                    SCReturnInt(0);
                } else if (SEQ_EQ(le, rec->re)) {
                    SCLogDebug("here");
                    // new record fully overlapped
                    //int r = StreamTcpSackInsertRange(stream, rec->re+1, re);
                    //SCReturnInt(r);
                    le = rec->re;
                    continue;
                } else { /* implied le > rec->re */
                    SCLogDebug("implied le > rec->re");
                    if (rec->next == NULL) {
                        SCLogDebug("rec->next == NULL");
                        StreamTcpSackRecord *stsr = StreamTcpSackRecordAlloc();
                        if (unlikely(stsr == NULL)) {
                            SCReturnInt(-1);
                        }
                        stsr->le = le;
                        stsr->re = re;
                        stsr->next = NULL;
                        stream->sack_tail->next = stsr;
                        stream->sack_tail = stsr;
                        goto end;
                    } else {
                        SCLogDebug("implied rec->next != NULL");
                        if (SEQ_LT(le, rec->next->le) && SEQ_LT(re, rec->next->le)) {
                            SCLogDebug("SEQ_LT(le, rec->next->le) && SEQ_LT(re, rec->next->le)");
                            StreamTcpSackRecord *stsr = StreamTcpSackRecordAlloc();
                            if (unlikely(stsr == NULL)) {
                                SCReturnInt(-1);
                            }
                            stsr->le = le;
                            stsr->re = re;
                            stsr->next = rec->next;
                            rec->next = stsr;
                        } else if (SEQ_LT(le, rec->next->le) && SEQ_GEQ(re, rec->next->le)) {
SCLogDebug("SEQ_LT(le, rec->next->le) && SEQ_GEQ(re, rec->next->le)"); StreamTcpSackRecord *stsr = StreamTcpSackRecordAlloc(); if (unlikely(stsr == NULL)) { SCReturnInt(-1); } stsr->le = le; stsr->re = rec->next->le; stsr->next = rec->next; rec->next = stsr; le = rec->next->le; } } } } } } else { SCLogDebug("implied empty list"); StreamTcpSackRecord *stsr = StreamTcpSackRecordAlloc(); if (unlikely(stsr == NULL)) { SCReturnInt(-1); } stsr->le = le; stsr->re = re; stsr->next = NULL; stream->sack_head = stsr; stream->sack_tail = stsr; } StreamTcpSackPruneList(stream); end: SCReturnInt(0); }
static INLINE void ApplyFlowDepth(HTTPINSPECT_CONF *ServerConf, Packet *p,
        HttpSessionData *sd, int resp_header_size, int expected, uint32_t seq_num)
{
    if(!ServerConf->server_flow_depth)
    {
        SetDetectLimit(p, p->dsize);
    }
    else if(ServerConf->server_flow_depth == -1)
    {
        SetDetectLimit(p, resp_header_size);
    }
    else
    {
        if(sd != NULL)
        {
            if(sd->resp_state.is_max_seq)
            {
                if(SEQ_GEQ((sd->resp_state.max_seq), seq_num))
                {
                    if(p->dsize > (sd->resp_state.max_seq - seq_num))
                    {
                        SetDetectLimit(p, (sd->resp_state.max_seq - seq_num));
                        return;
                    }
                    else
                    {
                        SetDetectLimit(p, p->dsize);
                        return;
                    }
                }
                else
                {
                    SetDetectLimit(p, resp_header_size);
                    return;
                }
            }
            else
            {
                if(expected)
                {
                    if(p->dsize > ServerConf->server_flow_depth)
                    {
                        SetDetectLimit(p, ServerConf->server_flow_depth);
                        return;
                    }
                    else
                    {
                        SetDetectLimit(p, p->dsize);
                        return;
                    }
                }
                else
                {
                    SetDetectLimit(p, 0);
                    return;
                }
            }
        }
        else
        {
            SetDetectLimit(p, p->dsize);
        }
    }
}
void
tcp_timer_rexmt(void *arg)
{
    struct tcpcb *tp = arg;
    uint32_t rto;
#ifdef TCP_DEBUG
    struct socket *so = NULL;
    short ostate;
#endif

    mutex_enter(softnet_lock);
    if ((tp->t_flags & TF_DEAD) != 0) {
        mutex_exit(softnet_lock);
        return;
    }
    if (!callout_expired(&tp->t_timer[TCPT_REXMT])) {
        mutex_exit(softnet_lock);
        return;
    }

    KERNEL_LOCK(1, NULL);
    if ((tp->t_flags & TF_PMTUD_PEND) && tp->t_inpcb &&
        SEQ_GEQ(tp->t_pmtud_th_seq, tp->snd_una) &&
        SEQ_LT(tp->t_pmtud_th_seq, (int)(tp->snd_una + tp->t_ourmss))) {
        extern struct sockaddr_in icmpsrc;
        struct icmp icmp;

        tp->t_flags &= ~TF_PMTUD_PEND;

        /* XXX create fake icmp message with relevant entries */
        icmp.icmp_nextmtu = tp->t_pmtud_nextmtu;
        icmp.icmp_ip.ip_len = tp->t_pmtud_ip_len;
        icmp.icmp_ip.ip_hl = tp->t_pmtud_ip_hl;
        icmpsrc.sin_addr = tp->t_inpcb->inp_faddr;
        icmp_mtudisc(&icmp, icmpsrc.sin_addr);

        /*
         * Notify all connections to the same peer about
         * new mss and trigger retransmit.
         */
        in_pcbnotifyall(&tcbtable, icmpsrc.sin_addr, EMSGSIZE,
            tcp_mtudisc);
        KERNEL_UNLOCK_ONE(NULL);
        mutex_exit(softnet_lock);
        return;
    }
#ifdef TCP_DEBUG
#ifdef INET
    if (tp->t_inpcb)
        so = tp->t_inpcb->inp_socket;
#endif
#ifdef INET6
    if (tp->t_in6pcb)
        so = tp->t_in6pcb->in6p_socket;
#endif
    ostate = tp->t_state;
#endif /* TCP_DEBUG */

    /*
     * Clear the SACK scoreboard, reset FACK estimate.
     */
    tcp_free_sackholes(tp);
    tp->snd_fack = tp->snd_una;

    /*
     * Retransmission timer went off.  Message has not
     * been acked within retransmit interval.  Back off
     * to a longer retransmit interval and retransmit one segment.
     */
    if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
        tp->t_rxtshift = TCP_MAXRXTSHIFT;
        TCP_STATINC(TCP_STAT_TIMEOUTDROP);
        tp = tcp_drop(tp, tp->t_softerror ?
            tp->t_softerror : ETIMEDOUT);
        goto out;
    }
    TCP_STATINC(TCP_STAT_REXMTTIMEO);
    rto = TCP_REXMTVAL(tp);
    if (rto < tp->t_rttmin)
        rto = tp->t_rttmin;
    TCPT_RANGESET(tp->t_rxtcur, rto * tcp_backoff[tp->t_rxtshift],
        tp->t_rttmin, TCPTV_REXMTMAX);
    TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);

    /*
     * If we are losing and we are trying path MTU discovery,
     * try turning it off.  This will avoid black holes in
     * the network which suppress or fail to send "packet
     * too big" ICMP messages.  We should ideally do
     * lots more sophisticated searching to find the right
     * value here...
     */
    if (tp->t_mtudisc && tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) {
        TCP_STATINC(TCP_STAT_PMTUBLACKHOLE);

#ifdef INET
        /* try turning PMTUD off */
        if (tp->t_inpcb)
            tp->t_mtudisc = 0;
#endif
#ifdef INET6
        /* try using IPv6 minimum MTU */
        if (tp->t_in6pcb)
            tp->t_mtudisc = 0;
#endif

        /* XXX: more sophisticated Black hole recovery code? */
    }

    /*
     * If losing, let the lower level know and try for
     * a better route.  Also, if we backed off this far,
     * our srtt estimate is probably bogus.  Clobber it
     * so we'll take the next rtt measurement as our srtt;
     * move the current srtt into rttvar to keep the current
     * retransmit times until then.
     */
    if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET
        if (tp->t_inpcb)
            in_losing(tp->t_inpcb);
#endif
#ifdef INET6
        if (tp->t_in6pcb)
            in6_losing(tp->t_in6pcb);
#endif
        /*
         * This operation is not described in RFC2988.  The
         * point is to keep srtt+4*rttvar constant, so we
         * should shift right 2 bits to divide by 4, and then
         * shift right one bit because the storage
         * representation of rttvar is 1/16s vs 1/32s for
         * srtt.
         */
        tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
        tp->t_srtt = 0;
    }
    tp->snd_nxt = tp->snd_una;
    tp->snd_high = tp->snd_max;
    /*
     * If timing a segment in this window, stop the timer.
     */
    tp->t_rtttime = 0;

    /*
     * Remember if we are retransmitting a SYN, because if
     * we do, the initial congestion window must be set to 1 segment.
     */
    if (tp->t_state == TCPS_SYN_SENT)
        tp->t_flags |= TF_SYN_REXMT;

    /*
     * Adjust congestion control parameters.
     */
    tp->t_congctl->slow_retransmit(tp);
    (void) tcp_output(tp);

out:
#ifdef TCP_DEBUG
    if (tp && so->so_options & SO_DEBUG)
        tcp_trace(TA_USER, ostate, tp, NULL,
            PRU_SLOWTIMO | (TCPT_REXMT << 8));
#endif
    KERNEL_UNLOCK_ONE(NULL);
    mutex_exit(softnet_lock);
}
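/*
 * For reference, the backoff pieces used above, as they appear in the
 * BSD lineage (NetBSD values; treat the exact table as an assumption):
 * the RTO is scaled by tcp_backoff[t_rxtshift] and clamped into
 * [t_rttmin, TCPTV_REXMTMAX] by TCPT_RANGESET.
 */
const int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

#define TCPT_RANGESET(tv, value, tvmin, tvmax) do {	\
    (tv) = (value);					\
    if ((tv) < (tvmin))					\
        (tv) = (tvmin);					\
    else if ((tv) > (tvmax))				\
        (tv) = (tvmax);					\
} while (/*CONSTCOND*/ 0)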
/*
 * This function is called upon receipt of new valid data (while not in
 * header prediction mode), and it updates the ordered list of sacks.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
    /*
     * First reported block MUST be the most recent one.  Subsequent
     * blocks SHOULD be in the order in which they arrived at the
     * receiver.  These two conditions make the implementation fully
     * compliant with RFC 2018.
     */
    struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
    int num_head, num_saved, i;

//ScenSim-Port//    INP_WLOCK_ASSERT(tp->t_inpcb);

    /* Check arguments. */
//ScenSim-Port//    KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));

    /* SACK block for the received segment. */
    head_blk.start = rcv_start;
    head_blk.end = rcv_end;

    /*
     * Merge updated SACK blocks into head_blk, and save unchanged SACK
     * blocks into saved_blks[].  num_saved will have the number of the
     * saved SACK blocks.
     */
    num_saved = 0;
    for (i = 0; i < tp->rcv_numsacks; i++) {
        tcp_seq start = tp->sackblks[i].start;
        tcp_seq end = tp->sackblks[i].end;
        if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
            /*
             * Discard this SACK block.
             */
        } else if (SEQ_LEQ(head_blk.start, end) &&
            SEQ_GEQ(head_blk.end, start)) {
            /*
             * Merge this SACK block into head_blk.  This SACK
             * block itself will be discarded.
             */
            if (SEQ_GT(head_blk.start, start))
                head_blk.start = start;
            if (SEQ_LT(head_blk.end, end))
                head_blk.end = end;
        } else {
            /*
             * Save this SACK block.
             */
            saved_blks[num_saved].start = start;
            saved_blks[num_saved].end = end;
            num_saved++;
        }
    }

    /*
     * Update SACK list in tp->sackblks[].
     */
    num_head = 0;
    if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
        /*
         * The received data segment is an out-of-order segment.  Put
         * head_blk at the top of SACK list.
         */
        tp->sackblks[0] = head_blk;
        num_head = 1;
        /*
         * If the number of saved SACK blocks exceeds its limit,
         * discard the last SACK block.
         */
        if (num_saved >= MAX_SACK_BLKS)
            num_saved--;
    }
    if (num_saved > 0) {
        /*
         * Copy the saved SACK blocks back.
         */
        bcopy(saved_blks, &tp->sackblks[num_head],
            sizeof(struct sackblk) * num_saved);
    }

    /* Save the number of SACK blocks. */
    tp->rcv_numsacks = num_head + num_saved;
}
static int
tcpup_state_send(struct tcpup_info *upp, struct tcpiphdr *tcp, size_t dlen)
{
    int xflags = 0;

    if (tcp->th_flags & TH_RST) {
        upp->t_state = TCPS_CLOSED;
        return 0;
    }

    if (upp->x_state != upp->t_state &&
        (tcp->th_flags & TH_ACK) &&
        SEQ_GEQ(htonl(tcp->th_ack), upp->rcv_una)) {
        tcp_state_update(upp, upp->x_state);
        upp->t_state = upp->x_state;
    }

    switch (upp->t_state) {
        case TCPS_CLOSED:
            xflags = TH_SYN | TH_ACK;
            if ((tcp->th_flags & xflags) == TH_SYN) {
                upp->snd_nxt = htonl(tcp->th_seq) + 1;
                upp->snd_max = htonl(tcp->th_seq) + 1;
                upp->snd_una = htonl(tcp->th_seq) + 1;
                tcp_state_update(upp, TCPS_SYN_SENT);
                return 0;
            }
            break;

        case TCPS_SYN_RECEIVED:
            assert((tcp->th_flags & TH_FIN) != TH_FIN);
            xflags = TH_SYN | TH_ACK;
            if ((tcp->th_flags & xflags) == TH_ACK &&
                SEQ_GT(htonl(tcp->th_seq), upp->snd_nxt)) {
                tcp_state_update(upp, upp->x_state);
                return 0;
            }
            break;

        case TCPS_ESTABLISHED:
            if ((tcp->th_flags & TH_FIN) == TH_FIN) {
                upp->snd_nxt = htonl(tcp->th_seq) + dlen + 1;
                tcp_state_update(upp, TCPS_FIN_WAIT_1);
                return 0;
            }
            break;

        case TCPS_CLOSE_WAIT:
            if ((tcp->th_flags & TH_FIN) == TH_FIN) {
                upp->snd_nxt = htonl(tcp->th_seq) + dlen + 1;
                tcp_state_update(upp, TCPS_LAST_ACK);
                return 0;
            }
            break;

        case TCPS_FIN_WAIT_1:
            xflags = TH_FIN | TH_ACK;
            if ((tcp->th_flags & xflags) == TH_ACK) {
                tcp_state_update(upp, upp->x_state);
                return 0;
            }
            break;
    }

    if (dlen > 0) {
        upp->snd_nxt = htonl(tcp->th_seq) + dlen;
        if (SEQ_GT(upp->snd_nxt, upp->snd_max)) {
            /* update snd max to nxt */
            upp->snd_max = upp->snd_nxt;
        }
    }

    return 0;
}
/*
 * Process cumulative ACK and the TCP SACK option to update the scoreboard.
 * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
 * the sequence space).
 */
void
tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
{
    struct sackhole *cur, *temp;
    struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
    int i, j, num_sack_blks;

//ScenSim-Port//    INP_WLOCK_ASSERT(tp->t_inpcb);

    num_sack_blks = 0;
    /*
     * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
     * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
     */
    if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
        sack_blocks[num_sack_blks].start = tp->snd_una;
        sack_blocks[num_sack_blks++].end = th_ack;
    }
    /*
     * Append received valid SACK blocks to sack_blocks[], but only if we
     * received new blocks from the other side.
     */
    if (to->to_flags & TOF_SACK) {
        for (i = 0; i < to->to_nsacks; i++) {
            bcopy((to->to_sacks + i * TCPOLEN_SACK),
                &sack, sizeof(sack));
            sack.start = ntohl(sack.start);
            sack.end = ntohl(sack.end);
            if (SEQ_GT(sack.end, sack.start) &&
                SEQ_GT(sack.start, tp->snd_una) &&
                SEQ_GT(sack.start, th_ack) &&
                SEQ_LT(sack.start, tp->snd_max) &&
                SEQ_GT(sack.end, tp->snd_una) &&
                SEQ_LEQ(sack.end, tp->snd_max))
                sack_blocks[num_sack_blks++] = sack;
        }
    }
    /*
     * Return if SND.UNA is not advanced and no valid SACK block is
     * received.
     */
    if (num_sack_blks == 0)
        return;

    /*
     * Sort the SACK blocks so we can update the scoreboard with just one
     * pass.  The overhead of sorting up to 4+1 elements is less than
     * making up to 4+1 passes over the scoreboard.
     */
    for (i = 0; i < num_sack_blks; i++) {
        for (j = i + 1; j < num_sack_blks; j++) {
            if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
                sack = sack_blocks[i];
                sack_blocks[i] = sack_blocks[j];
                sack_blocks[j] = sack;
            }
        }
    }
    if (TAILQ_EMPTY(&tp->snd_holes))
        /*
         * Empty scoreboard.  Need to initialize snd_fack (it may be
         * uninitialized or have a bogus value).  Scoreboard holes
         * (from the sack blocks received) are created later below
         * (in the logic that adds holes to the tail of the
         * scoreboard).
         */
        tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);

    /*
     * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
     * SACK holes (snd_holes) are traversed from their tails with just
     * one pass in order to reduce the number of compares especially when
     * the bandwidth-delay product is large.
     *
     * Note: Typically, in the first RTT of SACK recovery, the highest
     * three or four SACK blocks with the same ack number are received.
     * In the second RTT, if retransmitted data segments are not lost,
     * the highest three or four SACK blocks with ack number advancing
     * are received.
     */
    sblkp = &sack_blocks[num_sack_blks - 1];    /* Last SACK block */
    tp->sackhint.last_sack_ack = sblkp->end;
    if (SEQ_LT(tp->snd_fack, sblkp->start)) {
        /*
         * The highest SACK block is beyond fack.  Append new SACK
         * hole at the tail.  If the second or later highest SACK
         * blocks are also beyond the current fack, they will be
         * inserted by way of hole splitting in the while-loop below.
         */
        temp = tcp_sackhole_insert(tp, tp->snd_fack, sblkp->start, NULL);
        if (temp != NULL) {
            tp->snd_fack = sblkp->end;
            /* Go to the previous sack block. */
            sblkp--;
        } else {
            /*
             * We failed to add a new hole based on the current
             * sack block.  Skip over all the sack blocks that
             * fall completely to the right of snd_fack and
             * proceed to trim the scoreboard based on the
             * remaining sack blocks.  This also trims the
             * scoreboard for th_ack (which is sack_blocks[0]).
*/ while (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->start)) sblkp--; if (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->end)) tp->snd_fack = sblkp->end; } } else if (SEQ_LT(tp->snd_fack, sblkp->end)) /* fack is advanced. */ tp->snd_fack = sblkp->end; /* We must have at least one SACK hole in scoreboard. */ //ScenSim-Port// KASSERT(!TAILQ_EMPTY(&tp->snd_holes), //ScenSim-Port// ("SACK scoreboard must not be empty")); cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */ /* * Since the incoming sack blocks are sorted, we can process them * making one sweep of the scoreboard. */ while (sblkp >= sack_blocks && cur != NULL) { if (SEQ_GEQ(sblkp->start, cur->end)) { /* * SACKs data beyond the current hole. Go to the * previous sack block. */ sblkp--; continue; } if (SEQ_LEQ(sblkp->end, cur->start)) { /* * SACKs data before the current hole. Go to the * previous hole. */ cur = TAILQ_PREV(cur, sackhole_head, scblink); continue; } tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); //ScenSim-Port// KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, //ScenSim-Port// ("sackhint bytes rtx >= 0")); if (SEQ_LEQ(sblkp->start, cur->start)) { /* Data acks at least the beginning of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Acks entire hole, so delete hole. */ temp = cur; cur = TAILQ_PREV(cur, sackhole_head, scblink); tcp_sackhole_remove(tp, temp); /* * The sack block may ack all or part of the * next hole too, so continue onto the next * hole. */ continue; } else { /* Move start of hole forward. */ cur->start = sblkp->end; cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); } } else { /* Data acks at least the end of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Move end of hole backward. */ cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } else { /* * ACKs some data in middle of a hole; need * to split current hole */ temp = tcp_sackhole_insert(tp, sblkp->end, cur->end, cur); if (temp != NULL) { if (SEQ_GT(cur->rxmit, temp->rxmit)) { temp->rxmit = cur->rxmit; tp->sackhint.sack_bytes_rexmit += (temp->rxmit - temp->start); } cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } } } tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); /* * Testing sblkp->start against cur->start tells us whether * we're done with the sack block or the sack hole. * Accordingly, we advance one or the other. */ if (SEQ_LEQ(sblkp->start, cur->start)) cur = TAILQ_PREV(cur, sackhole_head, scblink); else sblkp--; } }
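/*
 * To see the sorting step of tcp_sack_doack() in isolation: incoming blocks
 * are ordered by their end sequence number with a simple exchange sort, which
 * is cheaper than extra scoreboard passes for at most TCP_MAX_SACK + 1
 * (4 + 1) entries. The sketch below is self-contained and hypothetical --
 * struct blk, BLK_SEQ_GT and sort_blocks() stand in for struct sackblk,
 * SEQ_GT and the inline loop -- but the comparison and swap mirror the code
 * above.
 */
#include <stdio.h>
#include <stdint.h>

struct blk { uint32_t start, end; };                /* stand-in for struct sackblk */
#define BLK_SEQ_GT(a, b) ((int32_t)((a) - (b)) > 0) /* SEQ_GT equivalent */

static void sort_blocks(struct blk *b, int n)
{
    for (int i = 0; i < n; i++)
        for (int j = i + 1; j < n; j++)
            if (BLK_SEQ_GT(b[i].end, b[j].end)) {
                struct blk t = b[i];
                b[i] = b[j];
                b[j] = t;
            }
}

int main(void)
{
    struct blk b[3] = { { 900, 1000 }, { 100, 200 }, { 500, 600 } };

    sort_blocks(b, 3);
    for (int i = 0; i < 3; i++)     /* prints the blocks ascending by end */
        printf("[%u, %u)\n", (unsigned)b[i].start, (unsigned)b[i].end);
    return 0;
}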
enum CT_UPDATE_RES
OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_,
                           const TCPHdr *tcp,
                           PNET_BUFFER_LIST nbl,
                           BOOLEAN reply,
                           UINT64 now)
{
    struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_);
    /* The peer that sent 'pkt' */
    struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
    /* The peer that should receive 'pkt' */
    struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
    uint8_t sws = 0, dws = 0;
    UINT16 tcp_flags = ntohs(tcp->flags);
    uint16_t win = ntohs(tcp->window);
    uint32_t ack, end, seq, orig_seq;
    uint32_t p_len = OvsGetTcpPayloadLength(nbl);
    int ackskew;

    if (OvsCtInvalidTcpFlags(tcp_flags)) {
        return CT_UPDATE_INVALID;
    }

    if (((tcp_flags & (TCP_SYN | TCP_ACK)) == TCP_SYN)
        && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
        && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
        src->state = dst->state = CT_DPIF_TCPS_CLOSED;
        return CT_UPDATE_NEW;
    }

    if (src->wscale & CT_WSCALE_FLAG
        && dst->wscale & CT_WSCALE_FLAG
        && !(tcp_flags & TCP_SYN)) {
        sws = src->wscale & CT_WSCALE_MASK;
        dws = dst->wscale & CT_WSCALE_MASK;
    } else if (src->wscale & CT_WSCALE_UNKNOWN
               && dst->wscale & CT_WSCALE_UNKNOWN
               && !(tcp_flags & TCP_SYN)) {
        sws = TCP_MAX_WSCALE;
        dws = TCP_MAX_WSCALE;
    }

    /*
     * Sequence tracking algorithm from Guido van Rooij's paper:
     * http://www.madison-gurkha.com/publications/tcp_filtering/
     * tcp_filtering.ps
     */
    orig_seq = seq = ntohl(tcp->seq);
    if (src->state < CT_DPIF_TCPS_SYN_SENT) {
        /* First packet from this end. Set its state. */
        ack = ntohl(tcp->ack_seq);
        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
            if (dst->wscale & CT_WSCALE_FLAG) {
                src->wscale = OvsTcpGetWscale(tcp);
                if (src->wscale & CT_WSCALE_FLAG) {
                    /* Remove scale factor from initial window. */
                    sws = src->wscale & CT_WSCALE_MASK;
                    win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
                    dws = dst->wscale & CT_WSCALE_MASK;
                } else {
                    /* Fix up the other window. */
                    dst->max_win <<= dst->wscale & CT_WSCALE_MASK;
                    /* In case of a retransmitted SYN|ACK. */
                    dst->wscale = 0;
                }
            }
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }

        src->seqlo = seq;
        src->state = CT_DPIF_TCPS_SYN_SENT;
        /*
         * May need to slide the window (seqhi may have been set by
         * the crappy stack check or if we picked up the connection
         * after establishment).
         */
        if (src->seqhi == 1
            || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) {
            src->seqhi = end + MAX(1, dst->max_win << dws);
        }
        if (win > src->max_win) {
            src->max_win = win;
        }
    } else {
        ack = ntohl(tcp->ack_seq);
        end = seq + p_len;
        if (tcp_flags & TCP_SYN) {
            end++;
        }
        if (tcp_flags & TCP_FIN) {
            end++;
        }
    }

    if ((tcp_flags & TCP_ACK) == 0) {
        /* Let it pass through the ack skew check. */
        ack = dst->seqlo;
    } else if ((ack == 0
                && (tcp_flags & (TCP_ACK | TCP_RST)) == (TCP_ACK | TCP_RST))
               /* broken tcp stacks do not set ack */) {
        /* Many stacks (ours included) will set the ACK number in a
         * FIN|ACK if the SYN times out -- no sequence to ACK. */
        ack = dst->seqlo;
    }

    if (seq == end) {
        /* Ease sequencing restrictions on no-data packets. */
        seq = src->seqlo;
        end = seq;
    }

    ackskew = dst->seqlo - ack;
#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
    if (SEQ_GEQ(src->seqhi, end)
        /* Last octet inside other's window space */
        && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
        /* Retrans: not more than one window back */
        && (ackskew >= -MAXACKWINDOW)
        /* Acking not more than one reassembled fragment backwards */
        && (ackskew <= (MAXACKWINDOW << sws))
        /* Acking not more than one window forward */
        && ((tcp_flags & TCP_RST) == 0
            || orig_seq == src->seqlo
            || (orig_seq == src->seqlo + 1)
            || (orig_seq + 1 == src->seqlo))) {
        /* Require an exact/+1 sequence match on resets when possible. */

        /* Update max window. */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* Synchronize sequencing. */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* Slide the window of what the other end can send. */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /* Update states. */
        if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) {
            src->state = CT_DPIF_TCPS_SYN_SENT;
        }
        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
            src->state = CT_DPIF_TCPS_CLOSING;
        }
        if (tcp_flags & TCP_ACK) {
            if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
                dst->state = CT_DPIF_TCPS_ESTABLISHED;
            } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
                dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
            }
        }
        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }

        /* Update the expiration timer according to connection state. */
        if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
            && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
            OvsConntrackUpdateExpiration(conn, now, 30 * CT_INTERVAL_SEC);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   && dst->state >= CT_DPIF_TCPS_CLOSING) {
            OvsConntrackUpdateExpiration(conn, now, 45 * CT_INTERVAL_SEC);
        } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
                   || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
            OvsConntrackUpdateExpiration(conn, now, 30 * CT_INTERVAL_SEC);
        } else if (src->state >= CT_DPIF_TCPS_CLOSING
                   || dst->state >= CT_DPIF_TCPS_CLOSING) {
            OvsConntrackUpdateExpiration(conn, now, 15 * 60 * CT_INTERVAL_SEC);
        } else {
            OvsConntrackUpdateExpiration(conn, now,
                                         24 * 60 * 60 * CT_INTERVAL_SEC);
        }
    } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
                || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
                || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
               && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
               /* Within a window forward of the originating packet */
               && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
               /* Within a window backward of the originating packet */
        /*
         * This currently handles three situations:
         * 1) Stupid stacks will shotgun SYNs before their peer
         *    replies.
         * 2) When PF catches an already established stream (the
         *    firewall rebooted, the state table was flushed, routes
         *    changed...)
         * 3) Packets get funky immediately after the connection
         *    closes (this should catch Solaris spurious ACK|FINs
         *    that web servers like to spew after a close).
         *
         * This must be a little more careful than the above code
         * since packet floods will also be caught here. We don't
         * update the TTL here to mitigate the damage of a packet
         * flood and so the same code can handle awkward establishment
         * and a loosened connection close.
         * In the establishment case, a correct peer response will
         * validate the connection, go through the normal state code
         * and keep updating the state TTL.
         */

        /* Update max window. */
        if (src->max_win < win) {
            src->max_win = win;
        }
        /* Synchronize sequencing. */
        if (SEQ_GT(end, src->seqlo)) {
            src->seqlo = end;
        }
        /* Slide the window of what the other end can send. */
        if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
            dst->seqhi = ack + MAX((win << sws), 1);
        }

        /*
         * Cannot set dst->seqhi here since this could be a shotgunned
         * SYN and not an already established connection.
         */
        if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) {
            src->state = CT_DPIF_TCPS_CLOSING;
        }
        if (tcp_flags & TCP_RST) {
            src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
        }
    } else {
        return CT_UPDATE_INVALID;
    }

    return CT_UPDATE_VALID;
}
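/*
 * The window-scale arithmetic above can be sanity-checked in isolation: once
 * both SYNs carried a scale option, every advertised window is shifted left
 * by the negotiated scale, while on the initial SYN (where scaling is not
 * yet in effect) the raw window is divided back down, rounding up. The check
 * below is a hypothetical sketch; LOCAL_DIV_ROUND_UP mimics the DIV_ROUND_UP
 * used by the code above.
 */
#include <assert.h>
#include <stdint.h>

#define LOCAL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    uint16_t raw_win = 0xfff0;  /* window field as carried in the header */
    uint8_t sws = 7;            /* negotiated window-scale factor */

    /* With scaling in force, the effective window is win << sws. */
    uint32_t effective = (uint32_t)raw_win << sws;
    assert(effective == 0x7ff800);              /* roughly 8 MiB */

    /* On the SYN itself the scale does not apply, so the stored maximum
     * window is the raw value divided by 2^sws, rounded up. */
    uint32_t unscaled = LOCAL_DIV_ROUND_UP((uint32_t)raw_win, 1u << sws);
    assert(unscaled == 0x200);                  /* 65520 / 128, rounded up */
    return 0;
}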
/* Callback function executed when something is received on fd. */
int receive_callback(int file, void *arg)
{
    char buf[sizeof(struct rudp_packet)];
    struct sockaddr_in sender;
    socklen_t sender_length = sizeof(struct sockaddr_in);

    if (recvfrom(file, &buf, sizeof(struct rudp_packet), 0,
                 (struct sockaddr *)&sender, &sender_length) < 0) {
        perror("receive_callback: recvfrom");
        return -1;
    }

    struct rudp_packet *received_packet = malloc(sizeof(struct rudp_packet));
    if (received_packet == NULL) {
        fprintf(stderr, "receive_callback: Error allocating packet\n");
        return -1;
    }
    memcpy(received_packet, &buf, sizeof(struct rudp_packet));
    struct rudp_hdr rudpheader = received_packet->header;

    /* Map the packet type to a printable name for the trace below. */
    char type[5];
    short t = rudpheader.type;
    if (t == RUDP_DATA)
        strcpy(type, "DATA");
    else if (t == RUDP_ACK)
        strcpy(type, "ACK");
    else if (t == RUDP_SYN)
        strcpy(type, "SYN");
    else if (t == RUDP_FIN)
        strcpy(type, "FIN");
    else
        strcpy(type, "BAD");
    printf("Received %s packet from %s:%d seq number=%u on socket=%d\n",
           type, inet_ntoa(sender.sin_addr), ntohs(sender.sin_port),
           rudpheader.seqno, file);

    /* Locate the correct socket in the socket list. */
    if (socket_list_head == NULL) {
        fprintf(stderr, "Error: attempt to receive on invalid socket. "
                "No sockets in the list\n");
        free(received_packet);
        return -1;
    }
    struct rudp_socket_list *curr_socket = socket_list_head;
    while (curr_socket != NULL && (int)curr_socket->rsock != file) {
        curr_socket = curr_socket->next;
    }
    if (curr_socket == NULL) {
        /* No socket in the list matches this fd. */
        fprintf(stderr, "Error: attempt to receive on invalid socket. "
                "Socket %d not in the list\n", file);
        free(received_packet);
        return -1;
    }

    /* We found the correct socket; now see if a session already exists for
     * this peer. */
    if (curr_socket->sessions_list_head == NULL) {
        /* The list is empty, so check whether the sender has initiated the
         * protocol properly (by sending a SYN); a non-SYN is ignored. */
        if (rudpheader.type == RUDP_SYN) {
            /* SYN received: create a new session at the head of the list
             * and respond with an ACK. */
            u_int32_t seqno = rudpheader.seqno + 1;
            create_receiver_session(curr_socket, seqno, &sender);
            struct rudp_packet *p = create_rudp_packet(RUDP_ACK, seqno, 0, NULL);
            send_packet(true, (rudp_socket_t)file, p, &sender);
            free(p);
        }
    } else {
        /* Some sessions exist to be checked. */
        bool_t session_found = false;
        struct session *curr_session = curr_socket->sessions_list_head;
        struct session *last_session = NULL;
        while (curr_session != NULL) {
            if (curr_session->next == NULL) {
                last_session = curr_session;
            }
            if (compare_sockaddr(&curr_session->address, &sender) == 1) {
                /* Found an existing session. */
                session_found = true;
                break;
            }
            curr_session = curr_session->next;
        }

        if (session_found == false) {
            /* No session was found for this peer; only a SYN may create
             * one, anything else is ignored. */
            if (rudpheader.type == RUDP_SYN) {
                /* SYN received: send an ACK and create a new session. */
                u_int32_t seqno = rudpheader.seqno + 1;
                create_receiver_session(curr_socket, seqno, &sender);
                struct rudp_packet *p = create_rudp_packet(RUDP_ACK, seqno, 0, NULL);
                send_packet(true, (rudp_socket_t)file, p, &sender);
                free(p);
            }
        } else {
            /* We found a matching session. */
            if (rudpheader.type == RUDP_SYN) {
                if (curr_session->receiver == NULL
                    || curr_session->receiver->status == OPENING) {
                    /* Create a new receiver session and ACK the SYN. */
                    struct receiver_session *new_receiver_session =
                        malloc(sizeof(struct receiver_session));
                    if (new_receiver_session == NULL) {
                        fprintf(stderr, "receive_callback: Error allocating "
                                "receiver session\n");
                        free(received_packet);
                        return -1;
                    }
                    /* Free any half-open receiver session so a retransmitted
                     * SYN does not leak it (free(NULL) is a no-op). */
                    free(curr_session->receiver);
                    new_receiver_session->expected_seqno = rudpheader.seqno + 1;
                    new_receiver_session->status = OPENING;
                    new_receiver_session->session_finished = false;
                    curr_session->receiver = new_receiver_session;

                    u_int32_t seqno = curr_session->receiver->expected_seqno;
                    struct rudp_packet *p = create_rudp_packet(RUDP_ACK, seqno, 0, NULL);
                    send_packet(true, (rudp_socket_t)file, p, &sender);
                    free(p);
                }
                /* Otherwise an active receiver session already exists and
                 * the duplicate SYN is ignored. */
            }

            if (rudpheader.type == RUDP_ACK) {
                u_int32_t ack_sqn = received_packet->header.seqno;
                if (curr_session->sender->status == SYN_SENT) {
                    /* This is an ACK for a SYN. */
                    u_int32_t syn_sqn = curr_session->sender->seqno;
                    if ((ack_sqn - 1) == syn_sqn) {
                        /* Delete the retransmission timeout. */
                        event_timeout_delete(timeout_callback,
                                curr_session->sender->syn_timeout_arg);
                        struct timeoutargs *to = (struct timeoutargs *)
                                curr_session->sender->syn_timeout_arg;
                        free(to->packet);
                        free(to->recipient);
                        free(to);
                        curr_session->sender->status = OPEN;

                        /* Drain the data queue into the sliding window. */
                        while (curr_session->sender->data_queue != NULL) {
                            /* Stop if the window is already full. */
                            if (curr_session->sender->sliding_window[RUDP_WINDOW - 1] != NULL) {
                                break;
                            }
                            /* Find the first unused window slot. */
                            int index = -1;
                            int i;
                            for (i = RUDP_WINDOW - 1; i >= 0; i--) {
                                if (curr_session->sender->sliding_window[i] == NULL) {
                                    index = i;
                                }
                            }
                            /* Send the packet, add it to the window and
                             * remove it from the queue. */
                            u_int32_t seqno = ++syn_sqn;
                            int len = curr_session->sender->data_queue->len;
                            char *payload = curr_session->sender->data_queue->item;
                            struct rudp_packet *datap =
                                create_rudp_packet(RUDP_DATA, seqno, len, payload);
                            curr_session->sender->seqno += 1;
                            curr_session->sender->sliding_window[index] = datap;
                            curr_session->sender->retransmission_attempts[index] = 0;
                            struct data *temp = curr_session->sender->data_queue;
                            curr_session->sender->data_queue =
                                curr_session->sender->data_queue->next;
                            free(temp->item);
                            free(temp);
                            send_packet(false, (rudp_socket_t)file, datap, &sender);
                        }
                    }
                } else if (curr_session->sender->status == OPEN) {
                    /* This is an ACK for DATA. */
                    if (curr_session->sender->sliding_window[0] != NULL
                        && curr_session->sender->sliding_window[0]->header.seqno
                           == (rudpheader.seqno - 1)) {
                        /* Correct ACK received: remove the first window item
                         * and shift the rest left. */
                        event_timeout_delete(timeout_callback,
                                curr_session->sender->data_timeout_arg[0]);
                        struct timeoutargs *args = (struct timeoutargs *)
                                curr_session->sender->data_timeout_arg[0];
                        free(args->packet);
                        free(args->recipient);
                        free(args);
                        free(curr_session->sender->sliding_window[0]);

                        int i;
                        if (RUDP_WINDOW == 1) {
                            curr_session->sender->sliding_window[0] = NULL;
                            curr_session->sender->retransmission_attempts[0] = 0;
                            curr_session->sender->data_timeout_arg[0] = NULL;
                        } else {
                            for (i = 0; i < RUDP_WINDOW - 1; i++) {
                                curr_session->sender->sliding_window[i] =
                                    curr_session->sender->sliding_window[i + 1];
                                curr_session->sender->retransmission_attempts[i] =
                                    curr_session->sender->retransmission_attempts[i + 1];
                                curr_session->sender->data_timeout_arg[i] =
                                    curr_session->sender->data_timeout_arg[i + 1];
                                if (i == RUDP_WINDOW - 2) {
                                    curr_session->sender->sliding_window[i + 1] = NULL;
                                    curr_session->sender->retransmission_attempts[i + 1] = 0;
                                    curr_session->sender->data_timeout_arg[i + 1] = NULL;
                                }
                            }
                        }

                        /* Refill the window from the data queue. */
                        while (curr_session->sender->data_queue != NULL) {
                            if (curr_session->sender->sliding_window[RUDP_WINDOW - 1] != NULL) {
                                break;
                            }
                            /* Find the first unused window slot. */
                            int index = -1;
                            for (i = RUDP_WINDOW - 1; i >= 0; i--) {
                                if (curr_session->sender->sliding_window[i] == NULL) {
                                    index = i;
                                }
                            }
                            /* Send the packet, add it to the window and
                             * remove it from the queue. */
                            curr_session->sender->seqno = curr_session->sender->seqno + 1;
                            u_int32_t seqno = curr_session->sender->seqno;
                            int len = curr_session->sender->data_queue->len;
                            char *payload = curr_session->sender->data_queue->item;
                            struct rudp_packet *datap =
                                create_rudp_packet(RUDP_DATA, seqno, len, payload);
                            curr_session->sender->sliding_window[index] = datap;
                            curr_session->sender->retransmission_attempts[index] = 0;
                            struct data *temp = curr_session->sender->data_queue;
                            curr_session->sender->data_queue =
                                curr_session->sender->data_queue->next;
                            free(temp->item);
                            free(temp);
                            send_packet(false, (rudp_socket_t)file, datap, &sender);
                        }

                        if (curr_socket->close_requested) {
                            /* The application asked to close: send a FIN on
                             * every session whose queue and window have
                             * drained. */
                            struct session *head_sessions = curr_socket->sessions_list_head;
                            while (head_sessions != NULL) {
                                if (head_sessions->sender->session_finished == false
                                    && head_sessions->sender->data_queue == NULL
                                    && head_sessions->sender->sliding_window[0] == NULL
                                    && head_sessions->sender->status == OPEN) {
                                    head_sessions->sender->seqno += 1;
                                    struct rudp_packet *p = create_rudp_packet(RUDP_FIN,
                                            head_sessions->sender->seqno, 0, NULL);
                                    send_packet(false, (rudp_socket_t)file, p,
                                            &head_sessions->address);
                                    free(p);
                                    head_sessions->sender->status = FIN_SENT;
                                }
                                head_sessions = head_sessions->next;
                            }
                        }
                    }
                } else if (curr_session->sender->status == FIN_SENT) {
                    /* Handle an ACK for a FIN; an ACK with the wrong
                     * sequence number is ignored. */
                    if ((curr_session->sender->seqno + 1) == received_packet->header.seqno) {
                        event_timeout_delete(timeout_callback,
                                curr_session->sender->fin_timeout_arg);
                        struct timeoutargs *to = curr_session->sender->fin_timeout_arg;
                        free(to->packet);
                        free(to->recipient);
                        free(to);
                        curr_session->sender->session_finished = true;

                        if (curr_socket->close_requested) {
                            /* See if we can close the socket: every sender
                             * and receiver session must have finished. */
                            bool_t all_done = true;
                            struct session *head_sessions = curr_socket->sessions_list_head;
                            while (head_sessions != NULL) {
                                if (head_sessions->sender->session_finished == false) {
                                    all_done = false;
                                } else if (head_sessions->receiver != NULL
                                           && head_sessions->receiver->session_finished == false) {
                                    all_done = false;
                                }
                                head_sessions = head_sessions->next;
                            }
                            if (all_done) {
                                /* Free the sessions only once the whole
                                 * socket is going away; freeing them while
                                 * some session is still active would leave
                                 * the list dangling. */
                                head_sessions = curr_socket->sessions_list_head;
                                while (head_sessions != NULL) {
                                    struct session *temp = head_sessions;
                                    head_sessions = head_sessions->next;
                                    free(temp->sender);
                                    if (temp->receiver) {
                                        free(temp->receiver);
                                    }
                                    free(temp);
                                }
                                curr_socket->sessions_list_head = NULL;
                                if (curr_socket->handler != NULL) {
                                    curr_socket->handler((rudp_socket_t)file,
                                            RUDP_EVENT_CLOSED, &sender);
                                    event_fd_delete(receive_callback, (rudp_socket_t)file);
                                    close(file);
                                    free(curr_socket);
                                }
                                free(received_packet);
                                return 0;
                            }
                        }
                    }
                }
            } else if (rudpheader.type == RUDP_DATA) {
                /* Handle a DATA packet. If the receiver is OPENING and the
                 * sequence number matches, it transitions to OPEN. */
                if (curr_session->receiver->status == OPENING
                    && rudpheader.seqno == curr_session->receiver->expected_seqno) {
                    curr_session->receiver->status = OPEN;
                }
                if (rudpheader.seqno == curr_session->receiver->expected_seqno) {
                    /* Sequence numbers match: ACK the data. */
                    u_int32_t seqno = rudpheader.seqno + 1;
                    curr_session->receiver->expected_seqno = seqno;
                    struct rudp_packet *p = create_rudp_packet(RUDP_ACK, seqno, 0, NULL);
                    send_packet(true, (rudp_socket_t)file, p, &sender);
                    free(p);
                    /* Pass the data up to the application. */
                    if (curr_socket->recv_handler != NULL) {
                        curr_socket->recv_handler((rudp_socket_t)file, &sender,
                                (void *)&received_packet->payload,
                                received_packet->payload_length);
                    }
                } else if (SEQ_GEQ(rudpheader.seqno,
                                   (curr_session->receiver->expected_seqno - RUDP_WINDOW))
                           && SEQ_LT(rudpheader.seqno,
                                     curr_session->receiver->expected_seqno)) {
                    /* Data we already received was retransmitted, so a
                     * previous ACK was lost: re-ACK it. */
                    u_int32_t seqno = rudpheader.seqno + 1;
                    struct rudp_packet *p = create_rudp_packet(RUDP_ACK, seqno, 0, NULL);
                    send_packet(true, (rudp_socket_t)file, p, &sender);
                    free(p);
                }
            } else if (rudpheader.type == RUDP_FIN) {
                if (curr_session->receiver->status == OPEN) {
                    if (rudpheader.seqno == curr_session->receiver->expected_seqno) {
                        /* The FIN is in sequence: ACK it. */
                        u_int32_t seqno = curr_session->receiver->expected_seqno + 1;
                        struct rudp_packet *p = create_rudp_packet(RUDP_ACK, seqno, 0, NULL);
                        send_packet(true, (rudp_socket_t)file, p, &sender);
                        free(p);
                        curr_session->receiver->session_finished = true;

                        if (curr_socket->close_requested) {
                            /* Can we close the socket now? Same two-pass
                             * check as in the FIN-ACK path above. */
                            bool_t all_done = true;
                            struct session *head_sessions = curr_socket->sessions_list_head;
                            while (head_sessions != NULL) {
                                if (head_sessions->sender->session_finished == false) {
                                    all_done = false;
                                } else if (head_sessions->receiver != NULL
                                           && head_sessions->receiver->session_finished == false) {
                                    all_done = false;
                                }
                                head_sessions = head_sessions->next;
                            }
                            if (all_done) {
                                head_sessions = curr_socket->sessions_list_head;
                                while (head_sessions != NULL) {
                                    struct session *temp = head_sessions;
                                    head_sessions = head_sessions->next;
                                    free(temp->sender);
                                    if (temp->receiver) {
                                        free(temp->receiver);
                                    }
                                    free(temp);
                                }
                                curr_socket->sessions_list_head = NULL;
                                if (curr_socket->handler != NULL) {
                                    curr_socket->handler((rudp_socket_t)file,
                                            RUDP_EVENT_CLOSED, &sender);
                                    event_fd_delete(receive_callback, (rudp_socket_t)file);
                                    close(file);
                                    free(curr_socket);
                                }
                                free(received_packet);
                                return 0;
                            }
                        }
                    }
                }
                /* A FIN with an incorrect sequence number, or one arriving
                 * before the session is OPEN, is ignored. */
            }
        }
    }

    free(received_packet);
    return 0;
}
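/*
 * receive_callback() scans the sliding window with the same descending loop
 * in two places to locate the lowest free slot. A hypothetical helper that
 * could replace both copies is sketched below; find_free_window_slot() is
 * not part of the original API and assumes the same fixed-size array of
 * packet pointers.
 */
static int find_free_window_slot(struct rudp_packet *window[], int window_size)
{
    int index = -1;
    int i;

    /* Walk from the high end so that 'index' ends up at the lowest free
     * slot, mirroring the inline loops above. */
    for (i = window_size - 1; i >= 0; i--) {
        if (window[i] == NULL) {
            index = i;
        }
    }
    return index;   /* -1 means the window is full */
}

/*
 * In the two queue-draining loops above, the inline scan would then become
 * index = find_free_window_slot(curr_session->sender->sliding_window,
 * RUDP_WINDOW), followed by a check for -1 before using the slot.
 */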