/*
 * TCP input routine, follows pages 65-76 of the
 * protocol specification dated September, 1981 very closely.
 *
 * Entry points / contract (as visible in this function):
 *   m      - mbuf holding the IP+TCP segment; ownership passes to this
 *            routine (it is freed, consumed by sbappend/TCP_REASS, or
 *            reused by tcp_respond/icmp_error on every exit path).
 *   iphlen - length of the IP header in front of the TCP header; any IP
 *            options are stripped so the code below can assume a bare
 *            struct ip.
 *   inso   - only meaningful for the m == NULL call-back case: when a
 *            non-blocking connect() started from TCPS_LISTEN completes,
 *            the caller re-enters with m == NULL and inso set to the
 *            connecting socket, and we resume at cont_conn using the
 *            segment saved in so->so_m / so->so_ti.
 */
void
tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
{
    struct ip save_ip, *ip;
    register struct tcpiphdr *ti;
    caddr_t optp = NULL;
    int optlen = 0;
    int len, tlen, off;
    register struct tcpcb *tp = NULL;
    register int tiflags;
    struct socket *so = NULL;
    int todrop, acked, ourfinisacked, needoutput = 0;
    int iss = 0;
    u_long tiwin;
    int ret;
    struct ex_list *ex_ptr;
    Slirp *slirp;

    DEBUG_CALL("tcp_input");
    DEBUG_ARGS((dfd, " m = %8lx iphlen = %2d inso = %lx\n",
                (long )m, iphlen, (long )inso ));

    /*
     * If called with m == 0, then we're continuing the connect
     */
    if (m == NULL) {
        so = inso;
        slirp = so->slirp;

        /* Re-set a few variables from the segment saved at connect time */
        tp = sototcpcb(so);
        m = so->so_m;
        so->so_m = NULL;
        ti = so->so_ti;
        tiwin = ti->ti_win;
        tiflags = ti->ti_flags;

        goto cont_conn;
    }
    slirp = m->slirp;

    /*
     * Get IP and TCP header together in first mbuf.
     * Note: IP leaves IP header in first mbuf.
     */
    ti = mtod(m, struct tcpiphdr *);
    if (iphlen > sizeof(struct ip )) {
        ip_stripoptions(m, (struct mbuf *)0);
        iphlen = sizeof(struct ip );
    }
    /* XXX Check if too short */

    /*
     * Save a copy of the IP header in case we want restore it
     * for sending an ICMP error message in response.
     */
    ip = mtod(m, struct ip *);
    save_ip = *ip;
    save_ip.ip_len += iphlen;

    /*
     * Checksum extended TCP header and data.
     * The qlink/mbuf-pointer fields overlay the IP header and must be
     * zeroed so they act as the checksum pseudo-header.
     */
    tlen = ((struct ip *)ti)->ip_len;
    tcpiphdr2qlink(ti)->next = tcpiphdr2qlink(ti)->prev = NULL;
    memset(&ti->ti_i.ih_mbuf, 0 , sizeof(struct mbuf_ptr));
    ti->ti_x1 = 0;
    ti->ti_len = htons((uint16_t)tlen);
    len = sizeof(struct ip ) + tlen;
    if (cksum(m, len)) {
        goto drop;
    }

    /*
     * Check that TCP offset makes sense,
     * pull out TCP options and adjust length.       XXX
     */
    off = ti->ti_off << 2;
    if (off < sizeof (struct tcphdr) || off > tlen) {
        goto drop;
    }
    tlen -= off;
    ti->ti_len = tlen;
    if (off > sizeof (struct tcphdr)) {
        optlen = off - sizeof (struct tcphdr);
        optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr);
    }
    tiflags = ti->ti_flags;

    /*
     * Convert TCP protocol specific fields to host format.
     */
    NTOHL(ti->ti_seq);
    NTOHL(ti->ti_ack);
    NTOHS(ti->ti_win);
    NTOHS(ti->ti_urp);

    /*
     * Drop TCP, IP headers and TCP options so m_data points at payload.
     */
    m->m_data += sizeof(struct tcpiphdr) + off - sizeof(struct tcphdr);
    m->m_len  -= sizeof(struct tcpiphdr) + off - sizeof(struct tcphdr);

    /*
     * Locate pcb for segment: try the one-entry cache first, then the
     * full lookup.
     */
findso:
    so = slirp->tcp_last_so;
    if (so->so_fport != ti->ti_dport ||
        so->so_lport != ti->ti_sport ||
        so->so_laddr.s_addr != ti->ti_src.s_addr ||
        so->so_faddr.s_addr != ti->ti_dst.s_addr) {
        so = solookup(&slirp->tcb, ti->ti_src, ti->ti_sport,
                      ti->ti_dst, ti->ti_dport);
        if (so)
            slirp->tcp_last_so = so;
    }

    /*
     * If the state is CLOSED (i.e., TCB does not exist) then
     * all data in the incoming segment is discarded.
     * If the TCB exists but is in CLOSED state, it is embryonic,
     * but should either do a listen or a connect soon.
     *
     * state == CLOSED means we've done socreate() but haven't
     * attached it to a protocol yet...
     *
     * XXX If a TCB does not exist, and the TH_SYN flag is
     * the only flag set, then create a session, mark it
     * as if it was LISTENING, and continue...
     */
    if (so == NULL) {
        if (slirp->restricted) {
            /* Any hostfwds will have an existing socket, so we only get here
             * for non-hostfwd connections. These should be dropped, unless it
             * happens to be a guestfwd.
             */
            for (ex_ptr = slirp->exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) {
                if (ex_ptr->ex_fport == ti->ti_dport &&
                    ti->ti_dst.s_addr == ex_ptr->ex_addr.s_addr) {
                    break;
                }
            }
            if (!ex_ptr) {
                goto dropwithreset;
            }
        }

        /* Only a bare SYN may create a new session */
        if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN)
            goto dropwithreset;

        if ((so = socreate(slirp)) == NULL)
            goto dropwithreset;
        if (tcp_attach(so) < 0) {
            free(so); /* Not sofree (if it failed, it's not insqued) */
            goto dropwithreset;
        }

        sbreserve(&so->so_snd, TCP_SNDSPACE);
        sbreserve(&so->so_rcv, TCP_RCVSPACE);

        so->so_laddr = ti->ti_src;
        so->so_lport = ti->ti_sport;
        so->so_faddr = ti->ti_dst;
        so->so_fport = ti->ti_dport;

        /* Inherit TOS from the segment if tcp_tos() has no opinion */
        if ((so->so_iptos = tcp_tos(so)) == 0)
            so->so_iptos = ((struct ip *)ti)->ip_tos;

        tp = sototcpcb(so);
        tp->t_state = TCPS_LISTEN;
    }

    /*
     * If this is a still-connecting socket, this probably
     * a retransmit of the SYN. Whether it's a retransmit SYN
     * or something else, we nuke it.
     */
    if (so->so_state & SS_ISFCONNECTING)
        goto drop;

    tp = sototcpcb(so);

    /* XXX Should never fail */
    if (tp == NULL)
        goto dropwithreset;
    if (tp->t_state == TCPS_CLOSED)
        goto drop;

    tiwin = ti->ti_win;

    /*
     * Segment received on connection.
     * Reset idle time and keep-alive timer.
     *
     * NOTE(review): the condition below tests the SO_OPTIONS constant
     * itself, not this socket's option bits (e.g. so->so_options &
     * SO_OPTIONS), so one branch is unconditionally taken at compile
     * time — confirm against the original source this was derived from.
     */
    tp->t_idle = 0;
    if (SO_OPTIONS)
        tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL;
    else
        tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE;

    /*
     * Process options if not in LISTEN state,
     * else do it below (after getting remote address).
     */
    if (optp && tp->t_state != TCPS_LISTEN)
        tcp_dooptions(tp, (u_char *)optp, optlen, ti);

    /*
     * Header prediction: check for the two common cases
     * of a uni-directional data xfer. If the packet has
     * no control flags, is in-sequence, the window didn't
     * change and we're not retransmitting, it's a
     * candidate. If the length is zero and the ack moved
     * forward, we're the sender side of the xfer. Just
     * free the data acked & wake any higher level process
     * that was blocked waiting for space. If the length
     * is non-zero and the ack didn't move, we're the
     * receiver side. If we're getting packets in-order
     * (the reassembly queue is empty), add the data to
     * the socket buffer and note that we need a delayed ack.
     *
     * XXX Some of these tests are not needed
     * eg: the tiwin == tp->snd_wnd prevents many more
     * predictions.. with no *real* advantage..
     */
    if (tp->t_state == TCPS_ESTABLISHED &&
        (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
        ti->ti_seq == tp->rcv_nxt &&
        tiwin && tiwin == tp->snd_wnd &&
        tp->snd_nxt == tp->snd_max) {
        if (ti->ti_len == 0) {
            if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
                SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
                tp->snd_cwnd >= tp->snd_wnd) {
                /*
                 * this is a pure ack for outstanding data.
                 */
                if (tp->t_rtt &&
                    SEQ_GT(ti->ti_ack, tp->t_rtseq))
                    tcp_xmit_timer(tp, tp->t_rtt);
                acked = ti->ti_ack - tp->snd_una;
                sbdrop(&so->so_snd, acked);
                tp->snd_una = ti->ti_ack;
                m_free(m);

                /*
                 * If all outstanding data are acked, stop
                 * retransmit timer, otherwise restart timer
                 * using current (possibly backed-off) value.
                 * If process is waiting for space,
                 * wakeup/selwakeup/signal. If data
                 * are ready to send, let tcp_output
                 * decide between more output or persist.
                 */
                if (tp->snd_una == tp->snd_max)
                    tp->t_timer[TCPT_REXMT] = 0;
                else if (tp->t_timer[TCPT_PERSIST] == 0)
                    tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;

                /*
                 * This is called because sowwakeup might have
                 * put data into so_snd. Since we don't so sowwakeup,
                 * we don't need this.. XXX???
                 */
                if (so->so_snd.sb_cc)
                    (void) tcp_output(tp);

                return;
            }
        } else if (ti->ti_ack == tp->snd_una &&
                   tcpfrag_list_empty(tp) &&
                   ti->ti_len <= sbspace(&so->so_rcv)) {
            /*
             * this is a pure, in-sequence data packet
             * with nothing on the reassembly queue and
             * we have enough buffer space to take it.
             */
            tp->rcv_nxt += ti->ti_len;
            /*
             * Add data to socket buffer.  tcp_emu() may consume
             * or rewrite the data; only append when it says so.
             */
            if (so->so_emu) {
                if (tcp_emu(so,m)) sbappend(so, m);
            } else
                sbappend(so, m);

            /*
             * If this is a short packet, then ACK now - with Nagel
             * congestion avoidance sender won't send more until
             * he gets an ACK.
             *
             * It is better to not delay acks at all to maximize
             * TCP throughput. See RFC 2581.
             */
            tp->t_flags |= TF_ACKNOW;
            tcp_output(tp);
            return;
        }
    } /* header prediction */

    /*
     * Calculate amount of space in receive window,
     * and then do TCP input processing.
     * Receive window is amount of space in rcv queue,
     * but not less than advertised window.
     */
    {
        int win;
        win = sbspace(&so->so_rcv);
        if (win < 0)
            win = 0;
        tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt));
    }

    switch (tp->t_state) {

    /*
     * If the state is LISTEN then ignore segment if it contains an RST.
     * If the segment contains an ACK then it is bad and send a RST.
     * If it does not contain a SYN then it is not interesting; drop it.
     * Don't bother responding if the destination was a broadcast.
     * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial
     * tp->iss, and send a segment:
     *     <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
     * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
     * Fill in remote peer address fields if not previously specified.
     * Enter SYN_RECEIVED state, and process any other fields of this
     * segment in this state.
     */
    case TCPS_LISTEN: {

        if (tiflags & TH_RST)
            goto drop;
        if (tiflags & TH_ACK)
            goto dropwithreset;
        if ((tiflags & TH_SYN) == 0)
            goto drop;

        /*
         * This has way too many gotos...
         * But a bit of spaghetti code never hurt anybody :)
         */

        /*
         * If this is destined for the control address, then flag to
         * tcp_ctl once connected, otherwise connect
         */
        if ((so->so_faddr.s_addr & slirp->vnetwork_mask.s_addr) ==
            slirp->vnetwork_addr.s_addr) {
            if (so->so_faddr.s_addr != slirp->vhost_addr.s_addr &&
                so->so_faddr.s_addr != slirp->vnameserver_addr.s_addr) {
                /* May be an add exec */
                for (ex_ptr = slirp->exec_list; ex_ptr;
                     ex_ptr = ex_ptr->ex_next) {
                    if (ex_ptr->ex_fport == so->so_fport &&
                        so->so_faddr.s_addr == ex_ptr->ex_addr.s_addr) {
                        so->so_state |= SS_CTL;
                        break;
                    }
                }
                if (so->so_state & SS_CTL) {
                    goto cont_input;
                }
            }
            /* CTL_ALIAS: Do nothing, tcp_fconnect will be called on it */
        }

        if (so->so_emu & EMU_NOCONNECT) {
            so->so_emu &= ~EMU_NOCONNECT;
            goto cont_input;
        }

        /*
         * Start the host-side connect; EINPROGRESS/EWOULDBLOCK (or
         * WSAEWOULDBLOCK on Windows) means "in flight", anything else
         * is a hard failure answered with RST or ICMP unreachable.
         */
        if ((tcp_fconnect(so) == -1) &&
#if defined(_WIN32)
            socket_error() != WSAEWOULDBLOCK
#else
            (errno != EINPROGRESS) && (errno != EWOULDBLOCK)
#endif
        ) {
            u_char code = ICMP_UNREACH_NET;
            DEBUG_MISC((dfd, " tcp fconnect errno = %d-%s\n",
                        errno,strerror(errno)));
            if (errno == ECONNREFUSED) {
                /* ACK the SYN, send RST to refuse the connection */
                tcp_respond(tp, ti, m, ti->ti_seq+1, (tcp_seq)0,
                            TH_RST|TH_ACK);
            } else {
                if (errno == EHOSTUNREACH) code = ICMP_UNREACH_HOST;
                HTONL(ti->ti_seq);             /* restore tcp header */
                HTONL(ti->ti_ack);
                HTONS(ti->ti_win);
                HTONS(ti->ti_urp);
                /* Put headers back in front of the payload for icmp_error */
                m->m_data -= sizeof(struct tcpiphdr) + off - sizeof(struct tcphdr);
                m->m_len  += sizeof(struct tcpiphdr) + off - sizeof(struct tcphdr);
                *ip = save_ip;
                icmp_error(m, ICMP_UNREACH,code, 0,strerror(errno));
            }
            tcp_close(tp);
            m_free(m);
        } else {
            /*
             * Haven't connected yet, save the current mbuf
             * and ti, and return
             * XXX Some OS's don't tell us whether the connect()
             * succeeded or not. So we must time it out.
             */
            so->so_m = m;
            so->so_ti = ti;
            tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
            tp->t_state = TCPS_SYN_RECEIVED;
            tcp_template(tp);
        }
        return;

    cont_conn:
        /* m==NULL
         * Check if the connect succeeded
         */
        if (so->so_state & SS_NOFDREF) {
            tp = tcp_close(tp);
            goto dropwithreset;
        }
    cont_input:
        tcp_template(tp);

        if (optp)
            tcp_dooptions(tp, (u_char *)optp, optlen, ti);

        if (iss)
            tp->iss = iss;
        else
            tp->iss = slirp->tcp_iss;
        slirp->tcp_iss += TCP_ISSINCR/2;
        tp->irs = ti->ti_seq;
        tcp_sendseqinit(tp);
        tcp_rcvseqinit(tp);
        tp->t_flags |= TF_ACKNOW;
        tp->t_state = TCPS_SYN_RECEIVED;
        tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
        goto trimthenstep6;
    } /* case TCPS_LISTEN */

    /*
     * If the state is SYN_SENT:
     *     if seg contains an ACK, but not for our SYN, drop the input.
     *     if seg contains a RST, then drop the connection.
     *     if seg does not contain SYN, then drop it.
     * Otherwise this is an acceptable SYN segment
     *     initialize tp->rcv_nxt and tp->irs
     *     if seg contains ack then advance tp->snd_una
     *     if SYN has been acked change to ESTABLISHED else SYN_RCVD state
     *     arrange for segment to be acked (eventually)
     *     continue processing rest of data/controls, beginning with URG
     */
    case TCPS_SYN_SENT:
        if ((tiflags & TH_ACK) &&
            (SEQ_LEQ(ti->ti_ack, tp->iss) ||
             SEQ_GT(ti->ti_ack, tp->snd_max)))
            goto dropwithreset;

        if (tiflags & TH_RST) {
            if (tiflags & TH_ACK) {
                tcp_drop(tp, 0); /* XXX Check t_softerror! */
            }
            goto drop;
        }

        if ((tiflags & TH_SYN) == 0)
            goto drop;
        if (tiflags & TH_ACK) {
            tp->snd_una = ti->ti_ack;
            if (SEQ_LT(tp->snd_nxt, tp->snd_una))
                tp->snd_nxt = tp->snd_una;
        }

        tp->t_timer[TCPT_REXMT] = 0;
        tp->irs = ti->ti_seq;
        tcp_rcvseqinit(tp);
        tp->t_flags |= TF_ACKNOW;
        if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
            soisfconnected(so);
            tp->t_state = TCPS_ESTABLISHED;

            /* Flush any queued out-of-order segments now deliverable */
            (void) tcp_reass(tp, (struct tcpiphdr *)0,
                             (struct mbuf *)0);
            /*
             * if we didn't have to retransmit the SYN,
             * use its rtt as our initial srtt & rtt var.
             */
            if (tp->t_rtt)
                tcp_xmit_timer(tp, tp->t_rtt);
        } else
            tp->t_state = TCPS_SYN_RECEIVED;

trimthenstep6:
        /*
         * Advance ti->ti_seq to correspond to first data byte.
         * If data, trim to stay within window,
         * dropping FIN if necessary.
         */
        ti->ti_seq++;
        if (ti->ti_len > tp->rcv_wnd) {
            todrop = ti->ti_len - tp->rcv_wnd;
            m_adj(m, -todrop);
            ti->ti_len = tp->rcv_wnd;
            tiflags &= ~TH_FIN;
        }
        tp->snd_wl1 = ti->ti_seq - 1;
        tp->rcv_up = ti->ti_seq;
        goto step6;
    } /* switch tp->t_state */

    /*
     * States other than LISTEN or SYN_SENT.
     * Check that at least some bytes of segment are within
     * receive window. If segment begins before rcv_nxt,
     * drop leading data (and SYN); if nothing left, just ack.
     */
    todrop = tp->rcv_nxt - ti->ti_seq;
    if (todrop > 0) {
        if (tiflags & TH_SYN) {
            tiflags &= ~TH_SYN;
            ti->ti_seq++;
            if (ti->ti_urp > 1)
                ti->ti_urp--;
            else
                tiflags &= ~TH_URG;
            todrop--;
        }
        /*
         * Following if statement from Stevens, vol. 2, p. 960.
         */
        if (todrop > ti->ti_len
            || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) {
            /*
             * Any valid FIN must be to the left of the window.
             * At this point the FIN must be a duplicate or out
             * of sequence; drop it.
             */
            tiflags &= ~TH_FIN;
            /*
             * Send an ACK to resynchronize and drop any data.
             * But keep on processing for RST or ACK.
             */
            tp->t_flags |= TF_ACKNOW;
            todrop = ti->ti_len;
        }
        m_adj(m, todrop);
        ti->ti_seq += todrop;
        ti->ti_len -= todrop;
        if (ti->ti_urp > todrop)
            ti->ti_urp -= todrop;
        else {
            tiflags &= ~TH_URG;
            ti->ti_urp = 0;
        }
    }

    /*
     * If new data are received on a connection after the
     * user processes are gone, then RST the other end.
     */
    if ((so->so_state & SS_NOFDREF) &&
        tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) {
        tp = tcp_close(tp);
        goto dropwithreset;
    }

    /*
     * If segment ends after window, drop trailing data
     * (and PUSH and FIN); if nothing left, just ACK.
     */
    todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd);
    if (todrop > 0) {
        if (todrop >= ti->ti_len) {
            /*
             * If a new connection request is received
             * while in TIME_WAIT, drop the old connection
             * and start over if the sequence numbers
             * are above the previous ones.
             */
            if (tiflags & TH_SYN &&
                tp->t_state == TCPS_TIME_WAIT &&
                SEQ_GT(ti->ti_seq, tp->rcv_nxt)) {
                iss = tp->rcv_nxt + TCP_ISSINCR;
                tp = tcp_close(tp);
                goto findso;
            }
            /*
             * If window is closed can only take segments at
             * window edge, and have to drop data and PUSH from
             * incoming segments. Continue processing, but
             * remember to ack. Otherwise, drop segment
             * and ack.
             */
            if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) {
                tp->t_flags |= TF_ACKNOW;
            } else {
                goto dropafterack;
            }
        }
        m_adj(m, -todrop);
        ti->ti_len -= todrop;
        tiflags &= ~(TH_PUSH|TH_FIN);
    }

    /*
     * If the RST bit is set examine the state:
     *     SYN_RECEIVED STATE:
     *         If passive open, return to LISTEN state.
     *         If active open, inform user that connection was refused.
     *     ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
     *         Inform user that connection was reset, and close tcb.
     *     CLOSING, LAST_ACK, TIME_WAIT STATES
     *         Close the tcb.
     */
    if (tiflags&TH_RST) switch (tp->t_state) {

    case TCPS_SYN_RECEIVED:
    case TCPS_ESTABLISHED:
    case TCPS_FIN_WAIT_1:
    case TCPS_FIN_WAIT_2:
    case TCPS_CLOSE_WAIT:
        tp->t_state = TCPS_CLOSED;
        tcp_close(tp);
        goto drop;

    case TCPS_CLOSING:
    case TCPS_LAST_ACK:
    case TCPS_TIME_WAIT:
        tcp_close(tp);
        goto drop;
    }

    /*
     * If a SYN is in the window, then this is an
     * error and we send an RST and drop the connection.
     */
    if (tiflags & TH_SYN) {
        tp = tcp_drop(tp,0);
        goto dropwithreset;
    }

    /*
     * If the ACK bit is off we drop the segment and return.
     */
    if ((tiflags & TH_ACK) == 0) goto drop;

    /*
     * Ack processing.
     */
    switch (tp->t_state) {
    /*
     * In SYN_RECEIVED state if the ack ACKs our SYN then enter
     * ESTABLISHED state and continue processing, otherwise
     * send an RST.  una<=ack<=max
     */
    case TCPS_SYN_RECEIVED:

        if (SEQ_GT(tp->snd_una, ti->ti_ack) ||
            SEQ_GT(ti->ti_ack, tp->snd_max))
            goto dropwithreset;
        tp->t_state = TCPS_ESTABLISHED;
        /*
         * The sent SYN is ack'ed with our sequence number +1
         * The first data byte already in the buffer will get
         * lost if no correction is made. This is only needed for
         * SS_CTL since the buffer is empty otherwise.
         * tp->snd_una++; or:
         */
        tp->snd_una = ti->ti_ack;
        if (so->so_state & SS_CTL) {
            /* So tcp_ctl reports the right state */
            ret = tcp_ctl(so);
            if (ret == 1) {
                soisfconnected(so);
                so->so_state &= ~SS_CTL;   /* success XXX */
            } else if (ret == 2) {
                so->so_state &= SS_PERSISTENT_MASK;
                so->so_state |= SS_NOFDREF; /* CTL_CMD */
            } else {
                needoutput = 1;
                tp->t_state = TCPS_FIN_WAIT_1;
            }
        } else {
            soisfconnected(so);
        }

        (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0);
        tp->snd_wl1 = ti->ti_seq - 1;
        /* Avoid ack processing; snd_una==ti_ack  =>  dup ack */
        goto synrx_to_est;
        /* fall into ... */

    /*
     * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
     * ACKs. If the ack is in the range
     *     tp->snd_una < ti->ti_ack <= tp->snd_max
     * then advance tp->snd_una to ti->ti_ack and drop
     * data from the retransmission queue. If this ACK reflects
     * more up to date window information we update our window information.
     */
    case TCPS_ESTABLISHED:
    case TCPS_FIN_WAIT_1:
    case TCPS_FIN_WAIT_2:
    case TCPS_CLOSE_WAIT:
    case TCPS_CLOSING:
    case TCPS_LAST_ACK:
    case TCPS_TIME_WAIT:

        if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
            if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
                DEBUG_MISC((dfd, " dup ack m = %lx so = %lx\n",
                            (long )m, (long )so));
                /*
                 * If we have outstanding data (other than
                 * a window probe), this is a completely
                 * duplicate ack (ie, window info didn't
                 * change), the ack is the biggest we've
                 * seen and we've seen exactly our rexmt
                 * threshold of them, assume a packet
                 * has been dropped and retransmit it.
                 * Kludge snd_nxt & the congestion
                 * window so we send only this one
                 * packet.
                 *
                 * We know we're losing at the current
                 * window size so do congestion avoidance
                 * (set ssthresh to half the current window
                 * and pull our congestion window back to
                 * the new ssthresh).
                 *
                 * Dup acks mean that packets have left the
                 * network (they're now cached at the receiver)
                 * so bump cwnd by the amount in the receiver
                 * to keep a constant cwnd packets in the
                 * network.
                 */
                if (tp->t_timer[TCPT_REXMT] == 0 ||
                    ti->ti_ack != tp->snd_una)
                    tp->t_dupacks = 0;
                else if (++tp->t_dupacks == TCPREXMTTHRESH) {
                    /* Fast retransmit: resend the lost segment only */
                    tcp_seq onxt = tp->snd_nxt;
                    u_int win =
                        min(tp->snd_wnd, tp->snd_cwnd) / 2 /
                            tp->t_maxseg;

                    if (win < 2)
                        win = 2;
                    tp->snd_ssthresh = win * tp->t_maxseg;
                    tp->t_timer[TCPT_REXMT] = 0;
                    tp->t_rtt = 0;
                    tp->snd_nxt = ti->ti_ack;
                    tp->snd_cwnd = tp->t_maxseg;
                    (void) tcp_output(tp);
                    tp->snd_cwnd = tp->snd_ssthresh +
                        tp->t_maxseg * tp->t_dupacks;
                    if (SEQ_GT(onxt, tp->snd_nxt))
                        tp->snd_nxt = onxt;
                    goto drop;
                } else if (tp->t_dupacks > TCPREXMTTHRESH) {
                    /* Fast recovery: inflate cwnd per extra dup ack */
                    tp->snd_cwnd += tp->t_maxseg;
                    (void) tcp_output(tp);
                    goto drop;
                }
            } else
                tp->t_dupacks = 0;
            break;
        }
    synrx_to_est:
        /*
         * If the congestion window was inflated to account
         * for the other side's cached packets, retract it.
         */
        if (tp->t_dupacks > TCPREXMTTHRESH &&
            tp->snd_cwnd > tp->snd_ssthresh)
            tp->snd_cwnd = tp->snd_ssthresh;
        tp->t_dupacks = 0;
        if (SEQ_GT(ti->ti_ack, tp->snd_max)) {
            goto dropafterack;
        }
        acked = ti->ti_ack - tp->snd_una;

        /*
         * If transmit timer is running and timed sequence
         * number was acked, update smoothed round trip time.
         * Since we now have an rtt measurement, cancel the
         * timer backoff (cf., Phil Karn's retransmit alg.).
         * Recompute the initial retransmit timer.
         */
        if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq))
            tcp_xmit_timer(tp,tp->t_rtt);

        /*
         * If all outstanding data is acked, stop retransmit
         * timer and remember to restart (more output or persist).
         * If there is more data to be acked, restart retransmit
         * timer, using current (possibly backed-off) value.
         */
        if (ti->ti_ack == tp->snd_max) {
            tp->t_timer[TCPT_REXMT] = 0;
            needoutput = 1;
        } else if (tp->t_timer[TCPT_PERSIST] == 0)
            tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
        /*
         * When new data is acked, open the congestion window.
         * If the window gives us less than ssthresh packets
         * in flight, open exponentially (maxseg per packet).
         * Otherwise open linearly: maxseg per window
         * (maxseg^2 / cwnd per packet).
         */
        {
            register u_int cw = tp->snd_cwnd;
            register u_int incr = tp->t_maxseg;

            if (cw > tp->snd_ssthresh)
                incr = incr * incr / cw;
            tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale);
        }
        if (acked > so->so_snd.sb_cc) {
            /* ACK covers our FIN as well as all buffered data */
            tp->snd_wnd -= so->so_snd.sb_cc;
            sbdrop(&so->so_snd, (int )so->so_snd.sb_cc);
            ourfinisacked = 1;
        } else {
            sbdrop(&so->so_snd, acked);
            tp->snd_wnd -= acked;
            ourfinisacked = 0;
        }
        tp->snd_una = ti->ti_ack;
        if (SEQ_LT(tp->snd_nxt, tp->snd_una))
            tp->snd_nxt = tp->snd_una;

        switch (tp->t_state) {

        /*
         * In FIN_WAIT_1 STATE in addition to the processing
         * for the ESTABLISHED state if our FIN is now acknowledged
         * then enter FIN_WAIT_2.
         */
        case TCPS_FIN_WAIT_1:
            if (ourfinisacked) {
                /*
                 * If we can't receive any more
                 * data, then closing user can proceed.
                 * Starting the timer is contrary to the
                 * specification, but if we don't get a FIN
                 * we'll hang forever.
                 */
                if (so->so_state & SS_FCANTRCVMORE) {
                    tp->t_timer[TCPT_2MSL] = TCP_MAXIDLE;
                }
                tp->t_state = TCPS_FIN_WAIT_2;
            }
            break;

        /*
         * In CLOSING STATE in addition to the processing for
         * the ESTABLISHED state if the ACK acknowledges our FIN
         * then enter the TIME-WAIT state, otherwise ignore
         * the segment.
         */
        case TCPS_CLOSING:
            if (ourfinisacked) {
                tp->t_state = TCPS_TIME_WAIT;
                tcp_canceltimers(tp);
                tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
            }
            break;

        /*
         * In LAST_ACK, we may still be waiting for data to drain
         * and/or to be acked, as well as for the ack of our FIN.
         * If our FIN is now acknowledged, delete the TCB,
         * enter the closed state and return.
         */
        case TCPS_LAST_ACK:
            if (ourfinisacked) {
                tcp_close(tp);
                goto drop;
            }
            break;

        /*
         * In TIME_WAIT state the only thing that should arrive
         * is a retransmission of the remote FIN. Acknowledge
         * it and restart the finack timer.
         */
        case TCPS_TIME_WAIT:
            tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
            goto dropafterack;
        }
    } /* switch(tp->t_state) */

step6:
    /*
     * Update window information.
     * Don't look at window if no ACK: TAC's send garbage on first SYN.
     */
    if ((tiflags & TH_ACK) &&
        (SEQ_LT(tp->snd_wl1, ti->ti_seq) ||
        (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) ||
        (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) {
        tp->snd_wnd = tiwin;
        tp->snd_wl1 = ti->ti_seq;
        tp->snd_wl2 = ti->ti_ack;
        if (tp->snd_wnd > tp->max_sndwnd)
            tp->max_sndwnd = tp->snd_wnd;
        needoutput = 1;
    }

    /*
     * Process segments with URG.
     */
    if ((tiflags & TH_URG) && ti->ti_urp &&
        TCPS_HAVERCVDFIN(tp->t_state) == 0) {
        /*
         * This is a kludge, but if we receive and accept
         * random urgent pointers, we'll crash in
         * soreceive. It's hard to imagine someone
         * actually wanting to send this much urgent data.
         */
        if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen) {
            ti->ti_urp = 0;
            tiflags &= ~TH_URG;
            goto dodata;
        }
        /*
         * If this segment advances the known urgent pointer,
         * then mark the data stream. This should not happen
         * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
         * a FIN has been received from the remote side.
         * In these states we ignore the URG.
         *
         * According to RFC961 (Assigned Protocols),
         * the urgent pointer points to the last octet
         * of urgent data. We continue, however,
         * to consider it to indicate the first octet
         * of data past the urgent section as the original
         * spec states (in one of two places).
         *
         * NOTE(review): tp->rcv_up is assigned the same value twice
         * below; the duplicate looks redundant — confirm against the
         * upstream source before removing.
         */
        if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) {
            tp->rcv_up = ti->ti_seq + ti->ti_urp;
            so->so_urgc = so->so_rcv.sb_cc +
                (tp->rcv_up - tp->rcv_nxt); /* -1; */
            tp->rcv_up = ti->ti_seq + ti->ti_urp;
        }
    } else
        /*
         * If no out of band data is expected,
         * pull receive urgent pointer along
         * with the receive window.
         */
        if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
            tp->rcv_up = tp->rcv_nxt;
dodata:

    /*
     * If this is a small packet, then ACK now - with Nagel
     * congestion avoidance sender won't send more until
     * he gets an ACK.
     * (The first_char == 27 test matches an ESC byte, i.e. likely
     * interactive terminal traffic.)
     */
    if (ti->ti_len && (unsigned)ti->ti_len <= 5 &&
        ((struct tcpiphdr_2 *)ti)->first_char == (char)27) {
        tp->t_flags |= TF_ACKNOW;
    }

    /*
     * Process the segment text, merging it into the TCP sequencing queue,
     * and arranging for acknowledgment of receipt if necessary.
     * This process logically involves adjusting tp->rcv_wnd as data
     * is presented to the user (this happens in tcp_usrreq.c,
     * case PRU_RCVD). If a FIN has already been received on this
     * connection then we just ignore the text.
     */
    if ((ti->ti_len || (tiflags&TH_FIN)) &&
        TCPS_HAVERCVDFIN(tp->t_state) == 0) {
        TCP_REASS(tp, ti, m, so, tiflags);
    } else {
        m_free(m);
        tiflags &= ~TH_FIN;
    }

    /*
     * If FIN is received ACK the FIN and let the user know
     * that the connection is closing.
     */
    if (tiflags & TH_FIN) {
        if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
            /*
             * If we receive a FIN we can't send more data,
             * set it SS_FDRAIN
             * Shutdown the socket if there is no rx data in the
             * buffer.
             * soread() is called on completion of shutdown() and
             * will got to TCPS_LAST_ACK, and use tcp_output()
             * to send the FIN.
             */
            sofwdrain(so);
            tp->t_flags |= TF_ACKNOW;
            tp->rcv_nxt++;
        }
        switch (tp->t_state) {

        /*
         * In SYN_RECEIVED and ESTABLISHED STATES
         * enter the CLOSE_WAIT state.
         */
        case TCPS_SYN_RECEIVED:
        case TCPS_ESTABLISHED:
            if (so->so_emu == EMU_CTL)        /* no shutdown on socket */
                tp->t_state = TCPS_LAST_ACK;
            else
                tp->t_state = TCPS_CLOSE_WAIT;
            break;

        /*
         * If still in FIN_WAIT_1 STATE FIN has not been acked so
         * enter the CLOSING state.
         */
        case TCPS_FIN_WAIT_1:
            tp->t_state = TCPS_CLOSING;
            break;

        /*
         * In FIN_WAIT_2 state enter the TIME_WAIT state,
         * starting the time-wait timer, turning off the other
         * standard timers.
         */
        case TCPS_FIN_WAIT_2:
            tp->t_state = TCPS_TIME_WAIT;
            tcp_canceltimers(tp);
            tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
            break;

        /*
         * In TIME_WAIT state restart the 2 MSL time_wait timer.
         */
        case TCPS_TIME_WAIT:
            tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL;
            break;
        }
    }

    /*
     * Return any desired output.
     */
    if (needoutput || (tp->t_flags & TF_ACKNOW)) {
        (void) tcp_output(tp);
    }
    return;

dropafterack:
    /*
     * Generate an ACK dropping incoming segment if it occupies
     * sequence space, where the ACK reflects our state.
     */
    if (tiflags & TH_RST)
        goto drop;
    m_free(m);
    tp->t_flags |= TF_ACKNOW;
    (void) tcp_output(tp);
    return;

dropwithreset:
    /* reuses m if m!=NULL, m_free() unnecessary */
    if (tiflags & TH_ACK)
        tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST);
    else {
        if (tiflags & TH_SYN) ti->ti_len++;
        tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0,
                    TH_RST|TH_ACK);
    }
    return;

drop:
    /*
     * Drop space held by incoming segment and return.
     */
    m_free(m);
}
/*
 * Walk the ip_fw rule chain and decide the fate of one IP packet.
 *
 * Parameters:
 *   pip     - in/out pointer to the IP header; updated if m_pullup()
 *             reallocates the mbuf.
 *   hlen    - IP header length in bytes.
 *   oif     - outgoing interface for outbound packets, NULL for inbound.
 *   ignport - divert/tee rules whose port equals this are skipped.
 *   m       - in/out mbuf pointer; on drop (and on tcp_respond/icmp_error
 *             reject paths, which consume the mbuf) *m is set to NULL.
 *
 * Returns:
 *   0                    - packet accepted (*m left intact), or packet
 *                          dropped/rejected (*m == NULL);
 *   a divert port number - packet matched a divert rule.
 */
static int
ip_fw_chk(struct ip **pip, int hlen,
        struct ifnet *oif, int ignport, struct mbuf **m)
{
    struct ip_fw_chain *chain;
    struct ip_fw *rule = NULL;
    struct ip *ip = *pip;
    struct ifnet *const rif = (*m)->m_pkthdr.rcvif;
    u_short offset = (ip->ip_off & IP_OFFMASK);
    u_short src_port, dst_port;

    /*
     * Go down the chain, looking for enlightment
     */
    for (chain = ip_fw_chain.lh_first; chain; chain = chain->chain.le_next) {
        register struct ip_fw *const f = chain->rule;

        /* Check direction inbound */
        if (!oif && !(f->fw_flg & IP_FW_F_IN))
            continue;

        /* Check direction outbound */
        if (oif && !(f->fw_flg & IP_FW_F_OUT))
            continue;

        /* Fragments: fragment-only rules skip unfragmented packets */
        if ((f->fw_flg & IP_FW_F_FRAG) && !(ip->ip_off & IP_OFFMASK))
            continue;

        /* If src-addr doesn't match, not this rule (XOR implements the
         * optional "not" inversion flag). */
        if (((f->fw_flg & IP_FW_F_INVSRC) != 0) ^
            ((ip->ip_src.s_addr & f->fw_smsk.s_addr) != f->fw_src.s_addr))
            continue;

        /* If dest-addr doesn't match, not this rule. */
        if (((f->fw_flg & IP_FW_F_INVDST) != 0) ^
            ((ip->ip_dst.s_addr & f->fw_dmsk.s_addr) != f->fw_dst.s_addr))
            continue;

        /* Interface check */
        if ((f->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) {
            struct ifnet *const iface = oif ? oif : rif;

            /* Backwards compatibility hack for "via" */
            if (!iface || !iface_match(iface,
                &f->fw_in_if, f->fw_flg & IP_FW_F_OIFNAME))
                continue;
        } else {
            /* Check receive interface */
            if ((f->fw_flg & IP_FW_F_IIFACE)
                && (!rif || !iface_match(rif,
                    &f->fw_in_if, f->fw_flg & IP_FW_F_IIFNAME)))
                continue;
            /* Check outgoing interface */
            if ((f->fw_flg & IP_FW_F_OIFACE)
                && (!oif || !iface_match(oif,
                    &f->fw_out_if, f->fw_flg & IP_FW_F_OIFNAME)))
                continue;
        }

        /* Check IP options */
        if (f->fw_ipopt != f->fw_ipnopt && !ipopts_match(ip, f))
            continue;

        /* Check protocol; if wildcard, match */
        if (f->fw_prot == IPPROTO_IP)
            goto got_match;

        /* If different, don't match */
        if (ip->ip_p != f->fw_prot)
            continue;

/*
 * Ensure at least `len` contiguous bytes, refreshing ip/offset since
 * m_pullup() may move the data.  Jumps to bogusfrag on failure.
 */
#define PULLUP_TO(len)  do { \
                            if ((*m)->m_len < (len) \
                                && (*m = m_pullup(*m, (len))) == 0) { \
                                    goto bogusfrag; \
                            } \
                            *pip = ip = mtod(*m, struct ip *); \
                            offset = (ip->ip_off & IP_OFFMASK); \
                        } while (0)

        /* Protocol specific checks */
        switch (ip->ip_p) {

        case IPPROTO_TCP:
            {
            struct tcphdr *tcp;

            if (offset == 1)        /* cf. RFC 1858 */
                goto bogusfrag;
            if (offset != 0) {
                /*
                 * TCP flags and ports aren't available in this
                 * packet -- if this rule specified either one,
                 * we consider the rule a non-match.
                 */
                if (f->fw_nports != 0 ||
                    f->fw_tcpf != f->fw_tcpnf)
                    continue;
                break;
            }
            /* 14 bytes of TCP header: enough for ports and the flags byte */
            PULLUP_TO(hlen + 14);
            tcp = (struct tcphdr *) ((u_long *)ip + ip->ip_hl);
            if (f->fw_tcpf != f->fw_tcpnf && !tcpflg_match(tcp, f))
                continue;
            src_port = ntohs(tcp->th_sport);
            dst_port = ntohs(tcp->th_dport);
            goto check_ports;
            }

        case IPPROTO_UDP:
            {
            struct udphdr *udp;

            if (offset != 0) {
                /*
                 * Port specification is unavailable -- if this
                 * rule specifies a port, we consider the rule
                 * a non-match.
                 */
                if (f->fw_nports != 0)
                    continue;
                break;
            }
            /* 4 bytes of UDP header: just the two port fields */
            PULLUP_TO(hlen + 4);
            udp = (struct udphdr *) ((u_long *)ip + ip->ip_hl);
            src_port = ntohs(udp->uh_sport);
            dst_port = ntohs(udp->uh_dport);
check_ports:
            if (!port_match(&f->fw_pts[0],
                IP_FW_GETNSRCP(f), src_port,
                f->fw_flg & IP_FW_F_SRNG))
                continue;
            if (!port_match(&f->fw_pts[IP_FW_GETNSRCP(f)],
                IP_FW_GETNDSTP(f), dst_port,
                f->fw_flg & IP_FW_F_DRNG))
                continue;
            break;
            }

        case IPPROTO_ICMP:
            {
            struct icmp *icmp;

            if (offset != 0)        /* Type isn't valid */
                break;
            /* 2 bytes past the IP header: the ICMP type/code */
            PULLUP_TO(hlen + 2);
            icmp = (struct icmp *) ((u_long *)ip + ip->ip_hl);
            if (!icmptype_match(icmp, f))
                continue;
            break;
            }
#undef PULLUP_TO

bogusfrag:
            /* Malformed/suspicious packet: log and drop unconditionally */
            if (fw_verbose)
                ipfw_report(NULL, ip, rif, oif);
            goto dropit;
        }

got_match:
        /* Ignore divert/tee rule if socket port is "ignport" */
        switch (f->fw_flg & IP_FW_F_COMMAND) {
        case IP_FW_F_DIVERT:
        case IP_FW_F_TEE:
            if (f->fw_divert_port == ignport)
                continue;       /* ignore this rule */
            break;
        }

        /* Update statistics */
        f->fw_pcnt += 1;
        f->fw_bcnt += ip->ip_len;
        f->timestamp = rtems_bsdnet_seconds_since_boot();

        /* Log to console if desired */
        if ((f->fw_flg & IP_FW_F_PRN) && fw_verbose)
            ipfw_report(f, ip, rif, oif);

        /* Take appropriate action */
        switch (f->fw_flg & IP_FW_F_COMMAND) {
        case IP_FW_F_ACCEPT:
            return(0);
        case IP_FW_F_COUNT:
            continue;
        case IP_FW_F_DIVERT:
            return(f->fw_divert_port);
        case IP_FW_F_TEE:
            /*
             * XXX someday tee packet here, but beware that you
             * can't use m_copym() or m_copypacket() because
             * the divert input routine modifies the mbuf
             * (and these routines only increment reference
             * counts in the case of mbuf clusters), so need
             * to write custom routine.
             */
            continue;
        case IP_FW_F_SKIPTO:
#ifdef DIAGNOSTIC
            while (chain->chain.le_next
                && chain->chain.le_next->rule->fw_number
                    < f->fw_skipto_rule)
#else
            while (chain->chain.le_next->rule->fw_number
                < f->fw_skipto_rule)
#endif
                chain = chain->chain.le_next;
            continue;
        }

        /* Deny/reject this packet using this rule */
        rule = f;
        break;
    }

#ifdef DIAGNOSTIC
    /* Rule 65535 should always be there and should always match */
    if (!chain)
        panic("ip_fw: chain");
#endif

    /*
     * At this point, we're going to drop the packet.
     * Send a reject notice if all of the following are true:
     *
     * - The packet matched a reject rule
     * - The packet is not an ICMP packet
     * - The packet is not a multicast or broadcast packet
     *
     * NOTE(review): if the chain is ever exhausted without a deny/reject
     * match, `rule` stays NULL and the dereference below is undefined;
     * the code relies on the always-matching default rule 65535 (only
     * asserted under DIAGNOSTIC) — confirm that invariant holds.
     */
    if ((rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_REJECT
        && ip->ip_p != IPPROTO_ICMP
        && !((*m)->m_flags & (M_BCAST|M_MCAST))
        && !IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
        switch (rule->fw_reject_code) {
        case IP_FW_REJECT_RST:
          {
            /* Synthesize a TCP RST in place over the offending packet */
            struct tcphdr *const tcp =
                (struct tcphdr *) ((u_long *)ip + ip->ip_hl);
            struct tcpiphdr ti, *const tip = (struct tcpiphdr *) ip;

            if (offset != 0 || (tcp->th_flags & TH_RST))
                break;
            ti.ti_i = *((struct ipovly *) ip);
            ti.ti_t = *tcp;
            bcopy(&ti, ip, sizeof(ti));
            NTOHL(tip->ti_seq);
            NTOHL(tip->ti_ack);
            tip->ti_len = ip->ip_len - hlen - (tip->ti_off << 2);
            if (tcp->th_flags & TH_ACK) {
                tcp_respond(NULL, tip, *m,
                    (tcp_seq)0, ntohl(tcp->th_ack), TH_RST);
            } else {
                if (tcp->th_flags & TH_SYN)
                    tip->ti_len++;
                tcp_respond(NULL, tip, *m, tip->ti_seq
                    + tip->ti_len, (tcp_seq)0, TH_RST|TH_ACK);
            }
            /* tcp_respond consumed the mbuf */
            *m = NULL;
            break;
          }
        default:        /* Send an ICMP unreachable using code */
            icmp_error(*m, ICMP_UNREACH,
                rule->fw_reject_code, 0L, 0);
            *m = NULL;
            break;
        }
    }

dropit:
    /*
     * Finally, drop the packet.
     */
    if (*m) {
        m_freem(*m);
        *m = NULL;
    }
    return(0);
}
/*
 * TCP keep-alive timer callout (FreeBSD callout(9) style).
 *
 * Fires when the keep-alive timer expires: either re-arms itself (if the
 * connection has been recently active), sends a keep-alive probe, or drops
 * the connection after too long an idle period.
 *
 * xtp: the tcpcb for this connection (opaque callout argument).
 *
 * Locking: takes the inpcb write lock; all early-return paths must undo
 * INP_WLOCK and CURVNET_SET in order.
 */
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/*
	 * Standard callout race check: bail if the callout was rescheduled
	 * or stopped after this handler was dispatched.
	 */
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	/* Connection already dropped underneath us; nothing to do. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet. If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			/* Not idle long enough yet; re-arm for the remainder. */
			callout_reset(&tp->t_timers->tt_keep,
			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
			INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			return;
		}
	}
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Idle past keep-idle plus the full probe window: give up. */
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		/* Probing phase: next probe after the keep interval. */
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp);
	} else
		/* Keep-alives not enabled: just re-arm for a full idle period. */
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	/* Idle too long (or never established): drop the connection. */
	TCPSTAT_INC(tcps_keepdrops);
	if (tcp_inpinfo_lock_add(inp)) {
		/* Failed to upgrade locking; helper releases what it must. */
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tp = tcp_drop(tp, ETIMEDOUT);
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	tcp_inpinfo_lock_del(inp, tp);
out:
	CURVNET_RESTORE();
}
/*
 * IPv6 firewall rule check.
 *
 * Walks the global ip6_fw_chain rule list and matches the packet in *m
 * (IPv6 header *pip6) against each rule: direction, masked src/dst address,
 * interface, extension-header options, fragment status, protocol, and
 * protocol-specific fields (TCP flags/ports, UDP ports, ICMPv6 type).
 *
 * pip6: in/out pointer to the IPv6 header; updated if m_pullup relocates
 *       the mbuf data.
 * oif:  outgoing interface (NULL for inbound packets).
 * m:    in/out mbuf pointer; set to NULL when this function consumes
 *       (frees or hands off) the packet.
 *
 * Returns 0 to accept or drop (caller checks *m to distinguish),
 * or a divert port number for divert/tee rules.
 */
static int
ip6_fw_chk(struct ip6_hdr **pip6,
	struct ifnet *oif, struct mbuf **m)
{
	struct ip6_fw_chain *chain;
	struct ip6_fw *rule = NULL;
	struct ip6_hdr *ip6 = *pip6;
	struct ifnet *const rif = (*m)->m_pkthdr.rcvif;
	u_short offset = 0;	/* fragment offset; 0 = first/only fragment */
	int off = sizeof(struct ip6_hdr), nxt = ip6->ip6_nxt;
	u_short src_port, dst_port;
#ifdef IP6FW_DIVERT_RESTART
	u_int16_t skipto = 0;
#else
	u_int16_t ignport = 0;
#endif

	/*
	 * Go down the chain, looking for enlightment
	 * #ifdef IP6FW_DIVERT_RESTART
	 * If we've been asked to start at a given rule immediatly, do so.
	 * #endif
	 */
	chain = LIST_FIRST(&ip6_fw_chain);
#ifdef IP6FW_DIVERT_RESTART
	if (skipto) {
		if (skipto >= 65535)
			goto dropit;
		while (chain && (chain->rule->fw_number <= skipto)) {
			chain = LIST_NEXT(chain, chain);
		}
		if (! chain) goto dropit;
	}
#endif /* IP6FW_DIVERT_RESTART */
	for (; chain; chain = LIST_NEXT(chain, chain)) {
		struct ip6_fw *const f = chain->rule;

		if (oif) {
			/* Check direction outbound */
			if (!(f->fw_flg & IPV6_FW_F_OUT))
				continue;
		} else {
			/* Check direction inbound */
			if (!(f->fw_flg & IPV6_FW_F_IN))
				continue;
		}

/* True when (x & y) == z for all four 32-bit words of the address. */
#define IN6_ARE_ADDR_MASKEQUAL(x,y,z) (\
	(((x)->s6_addr32[0] & (y)->s6_addr32[0]) == (z)->s6_addr32[0]) && \
	(((x)->s6_addr32[1] & (y)->s6_addr32[1]) == (z)->s6_addr32[1]) && \
	(((x)->s6_addr32[2] & (y)->s6_addr32[2]) == (z)->s6_addr32[2]) && \
	(((x)->s6_addr32[3] & (y)->s6_addr32[3]) == (z)->s6_addr32[3]))

		/* If src-addr doesn't match, not this rule.
		 * (XOR with the INV flag implements "invert match".) */
		if (((f->fw_flg & IPV6_FW_F_INVSRC) != 0) ^
			(!IN6_ARE_ADDR_MASKEQUAL(&ip6->ip6_src,&f->fw_smsk,&f->fw_src)))
			continue;

		/* If dest-addr doesn't match, not this rule. */
		if (((f->fw_flg & IPV6_FW_F_INVDST) != 0) ^
			(!IN6_ARE_ADDR_MASKEQUAL(&ip6->ip6_dst,&f->fw_dmsk,&f->fw_dst)))
			continue;

#undef IN6_ARE_ADDR_MASKEQUAL
		/* Interface check */
		if ((f->fw_flg & IF6_FW_F_VIAHACK) == IF6_FW_F_VIAHACK) {
			struct ifnet *const iface = oif ? oif : rif;

			/* Backwards compatibility hack for "via" */
			if (!iface || !iface_match(iface, &f->fw_in_if,
			    f->fw_flg & IPV6_FW_F_OIFNAME))
				continue;
		} else {
			/* Check receive interface */
			if ((f->fw_flg & IPV6_FW_F_IIFACE) &&
			    (!rif || !iface_match(rif, &f->fw_in_if,
			      f->fw_flg & IPV6_FW_F_IIFNAME)))
				continue;
			/* Check outgoing interface */
			if ((f->fw_flg & IPV6_FW_F_OIFACE) &&
			    (!oif || !iface_match(oif, &f->fw_out_if,
			      f->fw_flg & IPV6_FW_F_OIFNAME)))
				continue;
		}

		/* Check IP options; also advances off/nxt past ext headers
		 * and sets offset for fragments. */
		if (!ip6opts_match(&ip6, f, m, &off, &nxt, &offset))
			continue;

		/* Fragments */
		if ((f->fw_flg & IPV6_FW_F_FRAG) && !offset)
			continue;

		/* Check protocol; if wildcard, match */
		if (f->fw_prot == IPPROTO_IPV6)
			goto got_match;

		/* If different, don't match */
		if (nxt != f->fw_prot)
			continue;

/* Ensure len bytes are contiguous; refresh header pointers after pullup. */
#define PULLUP_TO(len) do { \
		if ((*m)->m_len < (len) \
		    && (*m = m_pullup(*m, (len))) == 0) { \
			goto dropit; \
		} \
		*pip6 = ip6 = mtod(*m, struct ip6_hdr *); \
	} while (/*CONSTCOND*/ 0)

		/* Protocol specific checks */
		switch (nxt) {
		case IPPROTO_TCP:
		    {
			struct tcphdr *tcp6;

			if (offset == 1) {	/* cf. RFC 1858 */
				PULLUP_TO(off + 4);	/* XXX ? */
				goto bogusfrag;
			}
			if (offset != 0) {
				/*
				 * TCP flags and ports aren't available in this
				 * packet -- if this rule specified either one,
				 * we consider the rule a non-match.
				 */
				if (f->fw_nports != 0 ||
				    f->fw_tcpf != f->fw_tcpnf)
					continue;
				break;
			}
			PULLUP_TO(off + 14);
			tcp6 = (struct tcphdr *) ((caddr_t)ip6 + off);
			if (((f->fw_tcpf != f->fw_tcpnf) ||
			    (f->fw_ipflg & IPV6_FW_IF_TCPEST)) &&
			    !tcp6flg_match(tcp6, f))
				continue;
			src_port = ntohs(tcp6->th_sport);
			dst_port = ntohs(tcp6->th_dport);
			goto check_ports;
		    }

		case IPPROTO_UDP:
		    {
			struct udphdr *udp;

			if (offset != 0) {
				/*
				 * Port specification is unavailable -- if this
				 * rule specifies a port, we consider the rule
				 * a non-match.
				 */
				if (f->fw_nports != 0)
					continue;
				break;
			}
			PULLUP_TO(off + 4);
			udp = (struct udphdr *) ((caddr_t)ip6 + off);
			src_port = ntohs(udp->uh_sport);
			dst_port = ntohs(udp->uh_dport);
check_ports:
			/* Shared port matching for TCP and UDP. */
			if (!port_match6(&f->fw_pts[0],
			    IPV6_FW_GETNSRCP(f), src_port,
			    f->fw_flg & IPV6_FW_F_SRNG))
				continue;
			if (!port_match6(&f->fw_pts[IPV6_FW_GETNSRCP(f)],
			    IPV6_FW_GETNDSTP(f), dst_port,
			    f->fw_flg & IPV6_FW_F_DRNG))
				continue;
			break;
		    }

		case IPPROTO_ICMPV6:
		    {
			struct icmp6_hdr *icmp;

			if (offset != 0)	/* Type isn't valid */
				break;
			PULLUP_TO(off + 2);
			icmp = (struct icmp6_hdr *) ((caddr_t)ip6 + off);
			if (!icmp6type_match(icmp, f))
				continue;
			break;
		    }
#undef PULLUP_TO
bogusfrag:
			/* Malformed fragment (RFC 1858 tiny-fragment case). */
			if (fw6_verbose)
				ip6fw_report(NULL, ip6, rif, oif, off, nxt);
			goto dropit;
		}

got_match:
#ifndef IP6FW_DIVERT_RESTART
		/* Ignore divert/tee rule if socket port is "ignport" */
		switch (f->fw_flg & IPV6_FW_F_COMMAND) {
		case IPV6_FW_F_DIVERT:
		case IPV6_FW_F_TEE:
			if (f->fw_divert_port == ignport)
				continue;	/* ignore this rule */
			break;
		}
#endif /* IP6FW_DIVERT_RESTART */

		/* Update statistics */
		f->fw_pcnt += 1;
		f->fw_bcnt += ntohs(ip6->ip6_plen);
#ifdef __FreeBSD__
		f->timestamp = time_second;
#else
		f->timestamp = time.tv_sec;
#endif

		/* Log to console if desired */
		if ((f->fw_flg & IPV6_FW_F_PRN) && fw6_verbose)
			ip6fw_report(f, ip6, rif, oif, off, nxt);

		/* Take appropriate action */
		switch (f->fw_flg & IPV6_FW_F_COMMAND) {
		case IPV6_FW_F_ACCEPT:
			return (0);
		case IPV6_FW_F_COUNT:
			continue;
		case IPV6_FW_F_DIVERT:
			return (f->fw_divert_port);
		case IPV6_FW_F_TEE:
			/*
			 * XXX someday tee packet here, but beware that you
			 * can't use m_copym() or m_copypacket() because
			 * the divert input routine modifies the mbuf
			 * (and these routines only increment reference
			 * counts in the case of mbuf clusters), so need
			 * to write custom routine.
			 */
			continue;
		case IPV6_FW_F_SKIPTO:
#ifdef DIAGNOSTIC
			while (chain->chain.le_next
			    && chain->chain.le_next->rule->fw_number
				< f->fw_skipto_rule)
#else
			while (chain->chain.le_next->rule->fw_number
			    < f->fw_skipto_rule)
#endif
				chain = chain->chain.le_next;
			continue;
		}

		/* Deny/reject this packet using this rule */
		rule = f;
		break;
	}

#ifdef DIAGNOSTIC
	/* Rule 65535 should always be there and should always match */
	if (!chain)
		panic("ip6_fw: chain");
#endif

	/*
	 * At this point, we're going to drop the packet.
	 * Send a reject notice if all of the following are true:
	 *
	 * - The packet matched a reject rule
	 * - The packet is not an ICMP packet, or is an ICMP query packet
	 * - The packet is not a multicast or broadcast packet
	 */
	if ((rule->fw_flg & IPV6_FW_F_COMMAND) == IPV6_FW_F_REJECT
	    && (nxt != IPPROTO_ICMPV6 || is_icmp6_query(ip6, off))
	    && !((*m)->m_flags & (M_BCAST|M_MCAST))
	    && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
		switch (rule->fw_reject_code) {
		case IPV6_FW_REJECT_RST:
#if 1	/* not tested */
		  {
			struct tcphdr *const tcp =
				(struct tcphdr *) ((caddr_t)ip6 + off);
			struct {
				struct ip6_hdr ip6;
				struct tcphdr th;
			} ti;
			tcp_seq ack, seq;
			int flags;

			/* Never RST a fragment or an incoming RST. */
			if (offset != 0 || (tcp->th_flags & TH_RST))
				break;

			ti.ip6 = *ip6;
			ti.th = *tcp;
			ti.th.th_seq = ntohl(ti.th.th_seq);
			ti.th.th_ack = ntohl(ti.th.th_ack);
			ti.ip6.ip6_nxt = IPPROTO_TCP;
			if (ti.th.th_flags & TH_ACK) {
				/* Peer sent an ACK: RST with its ack as seq. */
				ack = 0;
				seq = ti.th.th_ack;
				flags = TH_RST;
			} else {
				/* No ACK: ack everything the segment covered. */
				ack = ti.th.th_seq;
				if (((*m)->m_flags & M_PKTHDR) != 0) {
					ack += (*m)->m_pkthdr.len - off
						- (ti.th.th_off << 2);
				} else if (ip6->ip6_plen) {
					ack += ntohs(ip6->ip6_plen) + sizeof(*ip6)
						- off - (ti.th.th_off << 2);
				} else {
					/* Can't compute the length: just drop. */
					m_freem(*m);
					*m = 0;
					break;
				}
				seq = 0;
				flags = TH_RST|TH_ACK;
			}
			bcopy(&ti, ip6, sizeof(ti));
#ifdef __NetBSD__
			tcp_respond(NULL, NULL, *m, (struct tcphdr *)(ip6 + 1),
				ack, seq, flags);
#elif defined(__FreeBSD__)
			tcp_respond(NULL, ip6, (struct tcphdr *)(ip6 + 1),
				*m, ack, seq, flags);
#else
			m_freem(*m);
#endif
			*m = NULL;
			break;
		  }
#endif
		default:	/* Send an ICMP unreachable using code */
			if (oif)
				(*m)->m_pkthdr.rcvif = oif;
			icmp6_error(*m, ICMP6_DST_UNREACH,
			    rule->fw_reject_code, 0);
			*m = NULL;
			break;
		}
	}

dropit:
	/*
	 * Finally, drop the packet.
	 */
	if (*m) {
		m_freem(*m);
		*m = NULL;
	}
	return (0);
}
/*
 * TCP keep-alive timer callout (older FreeBSD variant holding the global
 * tcbinfo write lock for the whole handler).
 *
 * Sends a keep-alive probe or drops an over-idle connection; otherwise
 * re-arms itself via callout_reset_on() pinned to the inpcb's CPU.
 *
 * xtp: the tcpcb for this connection (opaque callout argument).
 */
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_WLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	/*
	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
	 * tear-down mean we need it as a work-around for races between
	 * timers and tcp_discardcb().
	 *
	 * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
	 */
	if (inp == NULL) {
		tcp_timer_race++;
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	INP_WLOCK(inp);
	/* Bail if connection was dropped or the callout was re-armed/stopped. */
	if ((inp->inp_flags & INP_DROPPED) ||
	    callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_WUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Idle past keep-idle plus the full probe window: give up. */
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp, INP_CPU(inp));
	} else
		callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp, INP_CPU(inp));

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	INP_WUNLOCK(inp);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	/* tcp_drop() may free the tcpcb; unlock only if it survived. */
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	INP_INFO_WUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
/*
 * TCP keep-alive timer (NetBSD variant).
 *
 * Runs under softnet_lock; sends a keep-alive probe (with a 4.2BSD
 * compatibility mode that makes the probe carry one byte of window data),
 * re-arms the keep timer, or drops the connection when idle too long.
 *
 * arg: the tcpcb for this connection.
 */
void
tcp_timer_keep(void *arg)
{
	struct tcpcb *tp = arg;
	struct socket *so = NULL;	/* Quell compiler warning */
#ifdef TCP_DEBUG
	short ostate;
#endif

	mutex_enter(softnet_lock);
	/* Connection is being torn down: do nothing. */
	if ((tp->t_flags & TF_DEAD) != 0) {
		mutex_exit(softnet_lock);
		return;
	}
	/* Timer was re-armed or cancelled after dispatch: do nothing. */
	if (!callout_expired(&tp->t_timer[TCPT_KEEP])) {
		mutex_exit(softnet_lock);
		return;
	}

	KERNEL_LOCK(1, NULL);

#ifdef TCP_DEBUG
	ostate = tp->t_state;
#endif /* TCP_DEBUG */

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCP_STATINC(TCP_STAT_KEEPTIMEO);
	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		goto dropit;

	/* Find the socket through whichever pcb family this tcpcb uses. */
#ifdef INET
	if (tp->t_inpcb)
		so = tp->t_inpcb->inp_socket;
#endif
#ifdef INET6
	if (tp->t_in6pcb)
		so = tp->t_in6pcb->in6p_socket;
#endif
	KASSERT(so != NULL);
	if (so->so_options & SO_KEEPALIVE &&
	    tp->t_state <= TCPS_CLOSE_WAIT) {
		/* Idle past keep-idle plus the full probe window: give up. */
		if ((tp->t_maxidle > 0) &&
		    ((tcp_now - tp->t_rcvtime) >=
		     tp->t_keepidle + tp->t_maxidle))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCP_STATINC(TCP_STAT_KEEPPROBE);
		if (tcp_compat_42) {
			/*
			 * The keepalive packet must have nonzero
			 * length to get a 4.2 host to respond.
			 */
			(void)tcp_respond(tp, tp->t_template, NULL, NULL,
			    tp->rcv_nxt - 1, tp->snd_una - 1, 0);
		} else {
			(void)tcp_respond(tp, tp->t_template, NULL, NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
		}
		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepintvl);
	} else
		TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);

#ifdef TCP_DEBUG
	if (tp && so->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL,
		    PRU_SLOWTIMO | (TCPT_KEEP << 8));
#endif
	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
	return;

 dropit:
	TCP_STATINC(TCP_STAT_KEEPDROPS);
	(void) tcp_drop(tp, ETIMEDOUT);
	KERNEL_UNLOCK_ONE(NULL);
	mutex_exit(softnet_lock);
}
/*
 * TCP timer dispatch (TOPPERS TINET).
 *
 * Handles expiry of one of the per-connection timers selected by tix:
 * retransmission (TCP_TIM_REXMT), persist (TCP_TIM_PERSIST), keep-alive
 * (TCP_TIM_KEEP) or 2MSL (TCP_TIM_2MSL) for the communication endpoint cep.
 *
 * Returns the (possibly freed/replaced) endpoint pointer from
 * tcp_drop()/tcp_close().
 */
static T_TCP_CEP *
tcp_timers (T_TCP_CEP *cep, int_t tix)
{
	uint16_t win;

	switch (tix) {

	/*
	 * Retransmission timer.
	 */
	case TCP_TIM_REXMT:

		/*
		 * When the maximum retransmission count
		 * (TCP_MAX_REXMT_SHIFT, default 12) is reached,
		 * drop the connection.
		 */
		if (++ cep->rxtshift > TCP_MAX_REXMT_SHIFT) {
			cep->rxtshift = TCP_MAX_REXMT_SHIFT;
			cep->net_error = EV_REXMTMO;
			cep = tcp_drop(cep, E_CLS);
			break;
		}

		/*
		 * Compute the retransmission timeout
		 * (exponential backoff, clamped to [TCP_TVAL_MIN,
		 * TCP_TVAL_MAX_REXMT]).
		 */
		cep->rxtcur = tcp_range_set((T_TCP_TIME)(tcp_rexmt_val(cep) * tcp_back_off[cep->rxtshift]),
		                            (T_TCP_TIME)TCP_TVAL_MIN, (T_TCP_TIME)TCP_TVAL_MAX_REXMT);
		cep->timer[TCP_TIM_REXMT] = cep->rxtcur;

		/*
		 * srtt:   smoothed round-trip time
		 * rttvar: smoothed RTT variance
		 *
		 * Once the retransmission count passes 1/4 of the
		 * maximum, fold srtt into rttvar and zero srtt so
		 * the estimate is rebuilt from scratch.
		 */
		if (cep->rxtshift > TCP_MAX_REXMT_SHIFT / 4) {
			cep->rttvar += (cep->srtt >> TCP_SRTT_SHIFT);
			cep->srtt = 0;
		}

		/*
		 * snd_nxt: next SEQ to send (here: last SEQ sent)
		 * snd_una: lowest unacknowledged / highest acked SEQ
		 *
		 * Roll snd_nxt back to snd_una so the lost data is
		 * retransmitted.
		 */
		cep->snd_nxt = cep->snd_una;

		cep->flags |= TCP_CEP_FLG_ACK_NOW;

		/*
		 * rtt: abort the in-progress round-trip measurement.
		 */
		cep->rtt = 0;

		/*
		 * Send-window setup:
		 *
		 * snd_wnd:  peer's advertised receive window
		 * snd_cwnd: congestion window
		 * maxseg:   peer's maximum segment size
		 *
		 * win = min(snd_wnd, snd_cwnd) / 2 / maxseg,
		 * but at least 2 segments.
		 */
		if (cep->snd_wnd < cep->snd_cwnd)
			win = cep->snd_wnd / 2 / cep->maxseg;
		else
			win = cep->snd_cwnd / 2 / cep->maxseg;
		if (win < 2)
			win = 2;

		/*
		 * Restart congestion control: congestion window back
		 * to one segment (slow start), slow-start threshold
		 * to win segments.
		 */
		cep->snd_cwnd = cep->maxseg;
		cep->snd_ssthresh = win * cep->maxseg;
		cep->dupacks = 0;

		/* Post output. */
		cep->flags |= TCP_CEP_FLG_POST_OUTPUT;
		sig_sem(SEM_TCP_POST_OUTPUT);
		break;

	/*
	 * Persist timer.
	 */
	case TCP_TIM_PERSIST:

		/*
		 * If past the maximum retransmission count
		 * (TCP_MAX_REXMT_SHIFT, default 12) and the idle time
		 * reaches the keep-alive idle default (TCP_TVAL_KEEP_IDLE,
		 * default 2*60*60 s) or the total backed-off
		 * retransmission timeout, drop the connection.
		 */
		if (cep->rxtshift > TCP_MAX_REXMT_SHIFT &&
		    (cep->idle >= TCP_TVAL_KEEP_IDLE ||
		     cep->idle >= tcp_rexmt_val(cep) * TCP_TOTAL_BACK_OFF)) {
			cep->net_error = EV_REXMTMO;
			cep = tcp_drop(cep, E_CLS);
			break;
		}

		/* Re-arm the persist timer and post output (window probe). */
		tcp_set_persist_timer(cep);
		cep->flags |= TCP_CEP_FLG_FORCE | TCP_CEP_FLG_FORCE_CLEAR | TCP_CEP_FLG_POST_OUTPUT;
		sig_sem(SEM_TCP_POST_OUTPUT);
		break;

	/*
	 * Keep-alive timer.
	 */
	case TCP_TIM_KEEP:

		/*
		 * If the connection has not been established before the
		 * timeout, abort the connection attempt.
		 */
		if (cep->fsm_state < TCP_FSM_ESTABLISHED) {
			cep->net_error = EV_REXMTMO;
			cep = tcp_drop(cep, E_CLS);
			break;
		}

#ifdef TCP_CFG_ALWAYS_KEEP

		else if (cep->fsm_state < TCP_FSM_CLOSING) {
			/*
			 * Drop if idle past keep-idle plus the full probe
			 * window; otherwise send a keep-alive probe and
			 * re-arm with the probe interval.
			 */
			if (cep->idle >= TCP_TVAL_KEEP_IDLE + TCP_TVAL_KEEP_COUNT * TCP_TVAL_KEEP_INTERVAL) {
				cep->net_error = EV_REXMTMO;
				cep = tcp_drop(cep, E_CLS);
				break;
			}
			else
				tcp_respond(NULL, cep, cep->rcv_nxt, cep->snd_una - 1,
				            cep->rbufsz - cep->rwbuf_count, 0);
			cep->timer[TCP_TIM_KEEP] = TCP_TVAL_KEEP_INTERVAL;
		}
		else
			cep->timer[TCP_TIM_KEEP] = TCP_TVAL_KEEP_IDLE;

#else	/* of #ifdef TCP_CFG_ALWAYS_KEEP */

		cep->timer[TCP_TIM_KEEP] = TCP_TVAL_KEEP_IDLE;

#endif	/* of #ifdef TCP_CFG_ALWAYS_KEEP */
		break;

	/*
	 * 2MSL timer.
	 */
	case TCP_TIM_2MSL:
		if (cep->fsm_state != TCP_FSM_TIME_WAIT &&
		    cep->idle <= TCP_TVAL_KEEP_COUNT * TCP_TVAL_KEEP_INTERVAL)
			cep->timer[TCP_TIM_2MSL] = TCP_TVAL_KEEP_INTERVAL;
		else
			cep = tcp_close(cep);
		break;
	}
	/* NOTE(review): the function's return statement and closing brace are
	 * not present in this file at this point — the text appears truncated
	 * here; verify against the original TINET source. */
/*
 * TCP keep-alive timer callout (FreeBSD variant using the tcbinfo read
 * lock and TT_KEEP/TT_KEEP_RST timer flags).
 *
 * Sends a keep-alive probe or drops an over-idle connection; otherwise
 * re-arms itself. A failed callout_reset() clears TT_KEEP_RST.
 *
 * xtp: the tcpcb for this connection (opaque callout argument).
 */
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Bail if the callout was rescheduled or stopped after dispatch. */
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	/* Connection already dropped underneath us; nothing to do. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
	    ("%s: tp %p keep callout should be running", __func__, tp));
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Idle past keep-idle plus the full probe window: give up. */
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		/* Probing phase: next probe after the keep interval. */
		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}
	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	/* tcp_drop() may free the tcpcb; unlock only if it survived. */
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
/*
 * Keep-alive timer body (DragonFly-style: locking is the caller's job).
 *
 * Caller should be in critical section.
 *
 * Sends a keep-alive probe or drops the connection when idle too long,
 * then re-arms the keep callout.
 *
 * tp: the connection's tcpcb.
 * Returns tp, or NULL if tcp_drop() freed the control block.
 */
static struct tcpcb *
tcp_timer_keep_handler(struct tcpcb *tp)
{
	struct tcptemp *t_template;
#ifdef TCPDEBUG
	int ostate = tp->t_state;
#endif

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	tcpstat.tcps_keeptimeo++;
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	/* Keep-alives enabled globally, per-tcpcb, or per-socket. */
	if ((always_keepalive || (tp->t_flags & TF_KEEPALIVE) ||
	     (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE)) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Idle past keep-idle plus the full probe window: give up. */
		if ((ticks - tp->t_rcvtime) >= tp->t_keepidle + tp->t_maxidle)
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		tcpstat.tcps_keepprobe++;
		t_template = tcp_maketemplate(tp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			tcp_freetemplate(t_template);
		}
		/* Probing phase: next probe after the keep interval. */
		tcp_callout_reset(tp, tp->tt_keep, tp->t_keepintvl,
		    tcp_timer_keep);
	} else {
		tcp_callout_reset(tp, tp->tt_keep, tp->t_keepidle,
		    tcp_timer_keep);
	}

#ifdef TCPDEBUG
	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	return tp;

dropit:
	tcpstat.tcps_keepdrops++;
	tp = tcp_drop(tp, ETIMEDOUT);
#ifdef TCPDEBUG
	if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	return tp;
}
/* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ void tcp_input(usn_mbuf_t *m, int iphlen) { struct tcpiphdr *ti; struct inpcb *inp; u_char *optp = NULL; int optlen; int len, tlen, off; struct tcpcb *tp = 0; int tiflags; struct usn_socket *so = 0; int todrop, acked, ourfinisacked; int needoutput = 0; short ostate; struct usn_in_addr laddr; int dropsocket = 0; int iss = 0; u_long tiwin, ts_val, ts_ecr; int ts_present = 0; (void)needoutput; g_tcpstat.tcps_rcvtotal++; // Get IP and TCP header together in first mbuf. // Note: IP leaves IP header in first mbuf. ti = mtod(m, struct tcpiphdr *); if (iphlen > sizeof (usn_ip_t)) ip_stripoptions(m, (usn_mbuf_t *)0); if (m->mlen < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { g_tcpstat.tcps_rcvshort++; return; } ti = mtod(m, struct tcpiphdr *); } #ifdef DUMP_PAYLOAD dump_chain(m,"tcp"); #endif /* * Checksum extended TCP header and data. */ tlen = ntohs(((usn_ip_t *)ti)->ip_len); len = sizeof (usn_ip_t) + tlen; ti->ti_next = ti->ti_prev = 0; ti->ti_x1 = 0; ti->ti_len = (u_short)tlen; HTONS(ti->ti_len); ti->ti_sum = in_cksum(m, len); if (ti->ti_sum) { g_tcpstat.tcps_rcvbadsum++; goto drop; } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = ti->ti_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { g_tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; ti->ti_len = tlen; if (off > sizeof (struct tcphdr)) { if (m->mlen < sizeof(usn_ip_t) + off) { if ((m = m_pullup(m, sizeof (usn_ip_t) + off)) == 0) { g_tcpstat.tcps_rcvshort++; return; } ti = mtod(m, struct tcpiphdr *); } optlen = off - sizeof (struct tcphdr); optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); // Do quick retrieval of timestamp options ("options // prediction?"). 
If timestamp is the only option and it's // formatted as recommended in RFC 1323 appendix A, we // quickly get the values now and not bother calling // tcp_dooptions(), etc. if ((optlen == TCPOLEN_TSTAMP_APPA || (optlen > TCPOLEN_TSTAMP_APPA && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && *(u_int *)optp == htonl(TCPOPT_TSTAMP_HDR) && (ti->ti_flags & TH_SYN) == 0) { ts_present = 1; ts_val = ntohl(*(u_long *)(optp + 4)); ts_ecr = ntohl(*(u_long *)(optp + 8)); optp = NULL; // we've parsed the options } } tiflags = ti->ti_flags; // Convert TCP protocol specific fields to host format. NTOHL(ti->ti_seq); NTOHL(ti->ti_ack); NTOHS(ti->ti_win); NTOHS(ti->ti_urp); // Locate pcb for segment. findpcb: inp = g_tcp_last_inpcb; if (inp->inp_lport != ti->ti_dport || inp->inp_fport != ti->ti_sport || inp->inp_faddr.s_addr != ti->ti_src.s_addr || inp->inp_laddr.s_addr != ti->ti_dst.s_addr) { inp = in_pcblookup(&g_tcb, ti->ti_src, ti->ti_sport, ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD); if (inp) g_tcp_last_inpcb = inp; ++g_tcpstat.tcps_pcbcachemiss; } // If the state is CLOSED (i.e., TCB does not exist) then // all data in the incoming segment is discarded. // If the TCB exists but is in CLOSED state, it is embryonic, // but should either do a listen or a connect soon. if (inp == 0) goto dropwithreset; tp = intotcpcb(inp); DEBUG("found inp cb, laddr=%x, lport=%d, faddr=%x," " fport=%d, tp_state=%d, tp_flags=%d", inp->inp_laddr.s_addr, inp->inp_lport, inp->inp_faddr.s_addr, inp->inp_fport, tp->t_state, tp->t_flags); if (tp == 0) goto dropwithreset; if (tp->t_state == TCPS_CLOSED) goto drop; // Unscale the window into a 32-bit value. 
if ((tiflags & TH_SYN) == 0) tiwin = ti->ti_win << tp->snd_scale; else tiwin = ti->ti_win; so = inp->inp_socket; DEBUG("socket info, options=%x", so->so_options); if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { if (so->so_options & SO_DEBUG) { ostate = tp->t_state; g_tcp_saveti = *ti; } if (so->so_options & SO_ACCEPTCONN) { if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { // Note: dropwithreset makes sure we don't // send a reset in response to a RST. if (tiflags & TH_ACK) { g_tcpstat.tcps_badsyn++; goto dropwithreset; } DEBUG("SYN is expected, tiflags=%d", tiflags); goto drop; } so = sonewconn(so, 0); if (so == 0) { DEBUG("failed to create new connection, tiflags=%d", tiflags); goto drop; } // Mark socket as temporary until we're // committed to keeping it. The code at // ``drop'' and ``dropwithreset'' check the // flag dropsocket to see if the temporary // socket created here should be discarded. // We mark the socket as discardable until // we're committed to it below in TCPS_LISTEN. dropsocket++; inp = (struct inpcb *)so->so_pcb; inp->inp_laddr = ti->ti_dst; inp->inp_lport = ti->ti_dport; // BSD >= 4.3 inp->inp_options = ip_srcroute(); tp = intotcpcb(inp); tp->t_state = TCPS_LISTEN; // Compute proper scaling value from buffer space while (tp->request_r_scale < TCP_MAX_WINSHIFT && TCP_MAXWIN << tp->request_r_scale < so->so_rcv->sb_hiwat) tp->request_r_scale++; } } // Segment received on connection. // Reset idle time and keep-alive timer. tp->t_idle = 0; tp->t_timer[TCPT_KEEP] = g_tcp_keepidle; // Process options if not in LISTEN state, // else do it below (after getting remote address). if (optp && tp->t_state != TCPS_LISTEN) tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); // Header prediction: check for the two common cases // of a uni-directional data xfer. If the packet has // no control flags, is in-sequence, the window didn't // change and we're not retransmitting, it's a // candidate. 
If the length is zero and the ack moved // forward, we're the sender side of the xfer. Just // free the data acked & wake any higher level process // that was blocked waiting for space. If the length // is non-zero and the ack didn't move, we're the // receiver side. If we're getting packets in-order // (the reassembly queue is empty), add the data to // the socket buffer and note that we need a delayed ack. if (tp->t_state == TCPS_ESTABLISHED && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && ti->ti_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { // If last ACK falls within this segment's sequence numbers, // record the timestamp. if ( ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ){ tp->ts_recent_age = g_tcp_now; tp->ts_recent = ts_val; } if (ti->ti_len == 0) { if (SEQ_GT(ti->ti_ack, tp->snd_una) && SEQ_LEQ(ti->ti_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd) { // this is a pure ack for outstanding data. ++g_tcpstat.tcps_predack; if (ts_present) tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1); else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp, tp->t_rtt); acked = ti->ti_ack - tp->snd_una; g_tcpstat.tcps_rcvackpack++; g_tcpstat.tcps_rcvackbyte += acked; TRACE("drop so_snd buffer, drop_bytes=%d, len=%d", acked, so->so_snd.sb_cc); sbdrop(so->so_snd, acked); tp->snd_una = ti->ti_ack; usn_free_cmbuf(m); // If all outstanding data are acked, stop // retransmit timer, otherwise restart timer // using current (possibly backed-off) value. // If process is waiting for space, // wakeup/selwakeup/signal. If data // are ready to send, let tcp_output // decide between more output or persist. 
if (tp->snd_una == tp->snd_max) tp->t_timer[TCPT_REXMT] = 0; else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); //if (so->so_snd->sb_flags & SB_NOTIFY) { // usnet_tcpin_wwakeup(so, USN_TCP_IN, usn_tcpev_sbnotify, 0); // sowwakeup(so); //} // send buffer is available for app thread. usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0); if (so->so_snd->sb_cc) tcp_output(tp); return; } } else if (ti->ti_ack == tp->snd_una && tp->seg_next == (struct tcpiphdr *)tp && ti->ti_len <= sbspace(so->so_rcv)) { // this is a pure, in-sequence data packet // with nothing on the reassembly queue and // we have enough buffer space to take it. ++g_tcpstat.tcps_preddat; tp->rcv_nxt += ti->ti_len; g_tcpstat.tcps_rcvpack++; g_tcpstat.tcps_rcvbyte += ti->ti_len; // Drop TCP, IP headers and TCP options then add data // to socket buffer. m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); TRACE("add data to rcv buf"); sbappend(so->so_rcv, m); sorwakeup(so); // new data is available for app threads. usnet_tcpin_rwakeup(so, USN_TCP_IN, USN_TCPEV_READ, m); if (so->so_options & SO_DEBUG) { TRACE("tcp trace, so_options=%d", so->so_options); tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); } tp->t_flags |= TF_DELACK; return; } } // Drop TCP, IP headers and TCP options. m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); // Calculate amount of space in receive window, // and then do TCP input processing. // Receive window is amount of space in rcv queue, // but not less than advertised window. { int win; win = sbspace(so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { // If the state is LISTEN then ignore segment if it contains an RST. 
// If the segment contains an ACK then it is bad and send a RST. // If it does not contain a SYN then it is not interesting; drop it. // Don't bother responding if the destination was a broadcast. // Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial // tp->iss, and send a segment: // <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> // Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. // Fill in remote peer address fields if not previously specified. // Enter SYN_RECEIVED state, and process any other fields of this // segment in this state. case TCPS_LISTEN: { usn_mbuf_t *am; struct usn_sockaddr_in *sin; if (tiflags & TH_RST) goto drop; if (tiflags & TH_ACK) goto dropwithreset; if ((tiflags & TH_SYN) == 0) goto drop; // RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN // in_broadcast() should never return true on a received // packet with M_BCAST not set. //if (m->m_flags & (M_BCAST|M_MCAST) || // IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) // goto drop; am = usn_get_mbuf(0, BUF_MSIZE, 0); // XXX: the size! 
if (am == NULL) goto drop; am->mlen = sizeof (struct usn_sockaddr_in); sin = mtod(am, struct usn_sockaddr_in *); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ti->ti_src; sin->sin_port = ti->ti_sport; bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == USN_INADDR_ANY) inp->inp_laddr = ti->ti_dst; if (in_pcbconnect(inp, am)) { inp->inp_laddr = laddr; usn_free_mbuf(am); goto drop; } usn_free_mbuf(am); tp->t_template = tcp_template(tp); if (tp->t_template == 0) { tp = tcp_drop(tp, ENOBUFS); dropsocket = 0; // socket is already gone goto drop; } if (optp) tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); if (iss) tp->iss = iss; else tp->iss = g_tcp_iss; g_tcp_iss += TCP_ISSINCR/4; tp->irs = ti->ti_seq; tcp_sendseqinit(tp); tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_SYN_RECEIVED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0); tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; dropsocket = 0; // committed to socket g_tcpstat.tcps_accepts++; goto trimthenstep6; } // If the state is SYN_SENT: // if seg contains an ACK, but not for our SYN, drop the input. // if seg contains a RST, then drop the connection. // if seg does not contain SYN, then drop it. 
// Otherwise this is an acceptable SYN segment // initialize tp->rcv_nxt and tp->irs // if seg contains ack then advance tp->snd_una // if SYN has been acked change to ESTABLISHED else SYN_RCVD state // arrange for segment to be acked (eventually) // continue processing rest of data/controls, beginning with URG case TCPS_SYN_SENT: if ((tiflags & TH_ACK) && (SEQ_LEQ(ti->ti_ack, tp->iss) || SEQ_GT(ti->ti_ack, tp->snd_max))) goto dropwithreset; if (tiflags & TH_RST) { if (tiflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((tiflags & TH_SYN) == 0) goto drop; if (tiflags & TH_ACK) { tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; tp->t_timer[TCPT_REXMT] = 0; } tp->irs = ti->ti_seq; tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; TRACE("ack now, tp flags=%d", tp->t_flags); // XXX: remove second test. if (tiflags & TH_ACK /*&& SEQ_GT(tp->snd_una, tp->iss)*/) { g_tcpstat.tcps_connects++; soisconnected(so); TRACE("change tcp state to TCPS_ESTABLISHED," " state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_ESTABLISHED; // Do window scaling on this connection? if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0); // if we didn't have to retransmit the SYN, // use its rtt as our initial srtt & rtt var. if (tp->t_rtt) tcp_xmit_timer(tp, tp->t_rtt); } else { TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_SYN_RECEIVED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0); } trimthenstep6: // Advance ti->ti_seq to correspond to first data byte. // If data, trim to stay within window, // dropping FIN if necessary. 
ti->ti_seq++; if (ti->ti_len > tp->rcv_wnd) { todrop = ti->ti_len - tp->rcv_wnd; m_adj(m, -todrop); ti->ti_len = tp->rcv_wnd; tiflags &= ~TH_FIN; g_tcpstat.tcps_rcvpackafterwin++; g_tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = ti->ti_seq - 1; tp->rcv_up = ti->ti_seq; goto step6; } // States other than LISTEN or SYN_SENT. // First check timestamp, if present. // Then check that at least some bytes of segment are within // receive window. If segment begins before rcv_nxt, // drop leading data (and SYN); if nothing left, just ack. // // RFC 1323 PAWS: If we have a timestamp reply on this segment // and it's less than ts_recent, drop it. if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && TSTMP_LT(ts_val, tp->ts_recent)) { // Check to see if ts_recent is over 24 days old. if ((int)(g_tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { // Invalidate ts_recent. If this segment updates // ts_recent, the age will be reset later and ts_recent // will get a valid value. If it does not, setting // ts_recent to zero will at least satisfy the // requirement that zero be placed in the timestamp // echo reply when ts_recent isn't valid. The // age isn't reset until we get a valid ts_recent // because we don't want out-of-order segments to be // dropped when ts_recent is old. tp->ts_recent = 0; } else { g_tcpstat.tcps_rcvduppack++; g_tcpstat.tcps_rcvdupbyte += ti->ti_len; g_tcpstat.tcps_pawsdrop++; goto dropafterack; } } todrop = tp->rcv_nxt - ti->ti_seq; if (todrop > 0) { if (tiflags & TH_SYN) { tiflags &= ~TH_SYN; ti->ti_seq++; if (ti->ti_urp > 1) ti->ti_urp--; else tiflags &= ~TH_URG; todrop--; } if ( todrop >= ti->ti_len || ( todrop == ti->ti_len && (tiflags & TH_FIN ) == 0 ) ) { // Any valid FIN must be to the left of the window. // At this point the FIN must be a duplicate or // out of sequence; drop it. tiflags &= ~TH_FIN; // Send an ACK to resynchronize and drop any data // But keep on processing for RST or ACK. 
tp->t_flags |= TF_ACKNOW; TRACE("send ack now to resync, tp_flags=%d", tp->t_flags); todrop = ti->ti_len; g_tcpstat.tcps_rcvdupbyte += ti->ti_len; g_tcpstat.tcps_rcvduppack++; } else { g_tcpstat.tcps_rcvpartduppack++; g_tcpstat.tcps_rcvpartdupbyte += ti->ti_len; } m_adj(m, todrop); ti->ti_seq += todrop; ti->ti_len -= todrop; if (ti->ti_urp > todrop) ti->ti_urp -= todrop; else { tiflags &= ~TH_URG; ti->ti_urp = 0; } } // If new data are received on a connection after the // user processes are gone, then RST the other end. if ((so->so_state & USN_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { tp = tcp_close(tp); g_tcpstat.tcps_rcvafterclose++; goto dropwithreset; } // If segment ends after window, drop trailing data // (and PUSH and FIN); if nothing left, just ACK. todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { g_tcpstat.tcps_rcvpackafterwin++; if (todrop >= ti->ti_len) { g_tcpstat.tcps_rcvbyteafterwin += ti->ti_len; // If a new connection request is received // while in TIME_WAIT, drop the old connection // and start over if the sequence numbers // are above the previous ones. if (tiflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { iss = tp->snd_nxt + TCP_ISSINCR; tp = tcp_close(tp); goto findpcb; } // If window is closed can only take segments at // window edge, and have to drop data and PUSH from // incoming segments. Continue processing, but // remember to ack. Otherwise, drop segment // and ack. if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; g_tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else g_tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); ti->ti_len -= todrop; tiflags &= ~(TH_PUSH|TH_FIN); } // check valid timestamp. Replace code above. 
if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ) { tp->ts_recent_age = g_tcp_now; tp->ts_recent = ts_val; } // If the RST bit is set examine the state: // SYN_RECEIVED STATE: // If passive open, return to LISTEN state. // If active open, inform user that connection was refused. // ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: // Inform user that connection was reset, and close tcb. // CLOSING, LAST_ACK, TIME_WAIT STATES // Close the tcb. if (tiflags&TH_RST) switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: DEBUG("change tcp state to TCPS_CLOSED, state=%d", tp->t_state); tp->t_state = TCPS_CLOSED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSED, 0); g_tcpstat.tcps_drops++; tp = tcp_close(tp); goto drop; case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: tp = tcp_close(tp); goto drop; } // If a SYN is in the window, then this is an // error and we send an RST and drop the connection. if (tiflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); goto dropwithreset; } // If the ACK bit is off we drop the segment and return. if ((tiflags & TH_ACK) == 0) goto drop; // Ack processing. switch (tp->t_state) { // In SYN_RECEIVED state if the ack ACKs our SYN then enter // ESTABLISHED state and continue processing, otherwise // send an RST. case TCPS_SYN_RECEIVED: if (SEQ_GT(tp->snd_una, ti->ti_ack) || SEQ_GT(ti->ti_ack, tp->snd_max)) goto dropwithreset; g_tcpstat.tcps_connects++; DEBUG("change tcp state to TCPS_ESTABLISHED, state=%d", tp->t_state); tp->t_state = TCPS_ESTABLISHED; soisconnected(so); // Do window scaling? 
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0); tp->snd_wl1 = ti->ti_seq - 1; // fall into ... // In ESTABLISHED state: drop duplicate ACKs; ACK out of range // ACKs. If the ack is in the range // tp->snd_una < ti->ti_ack <= tp->snd_max // then advance tp->snd_una to ti->ti_ack and drop // data from the retransmission queue. If this ACK reflects // more up to date window information we update our window information. case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { g_tcpstat.tcps_rcvdupack++; // If we have outstanding data (other than // a window probe), this is a completely // duplicate ack (ie, window info didn't // change), the ack is the biggest we've // seen and we've seen exactly our rexmt // threshhold of them, assume a packet // has been dropped and retransmit it. // Kludge snd_nxt & the congestion // window so we send only this one // packet. // // We know we're losing at the current // window size so do congestion avoidance // (set ssthresh to half the current window // and pull our congestion window back to // the new ssthresh). // // Dup acks mean that packets have left the // network (they're now cached at the receiver) // so bump cwnd by the amount in the receiver // to keep a constant cwnd packets in the // network. 
if (tp->t_timer[TCPT_REXMT] == 0 || ti->ti_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == g_tcprexmtthresh) { // congestion avoidance tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_timer[TCPT_REXMT] = 0; tp->t_rtt = 0; tp->snd_nxt = ti->ti_ack; tp->snd_cwnd = tp->t_maxseg; tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > g_tcprexmtthresh) { tp->snd_cwnd += tp->t_maxseg; tcp_output(tp); goto drop; } } else tp->t_dupacks = 0; break; } // If the congestion window was inflated to account // for the other side's cached packets, retract it. if (tp->t_dupacks > g_tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; if (SEQ_GT(ti->ti_ack, tp->snd_max)) { g_tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } acked = ti->ti_ack - tp->snd_una; g_tcpstat.tcps_rcvackpack++; g_tcpstat.tcps_rcvackbyte += acked; // If we have a timestamp reply, update smoothed // round trip time. If no timestamp is present but // transmit timer is running and timed sequence // number was acked, update smoothed round trip time. // Since we now have an rtt measurement, cancel the // timer backoff (cf., Phil Karn's retransmit alg.). // Recompute the initial retransmit timer. if (ts_present) tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1); else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp,tp->t_rtt); // If all outstanding data is acked, stop retransmit // timer and remember to restart (more output or persist). // If there is more data to be acked, restart retransmit // timer, using current (possibly backed-off) value. 
if (ti->ti_ack == tp->snd_max) { tp->t_timer[TCPT_REXMT] = 0; DEBUG("change needoutput to 1"); needoutput = 1; tp->t_flags |= TF_NEEDOUTPUT; } else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; // When new data is acked, open the congestion window. // If the window gives us less than ssthresh packets // in flight, open exponentially (maxseg per packet). // Otherwise open linearly: maxseg per window // (maxseg * (maxseg / cwnd) per packet). { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); } if (acked > so->so_snd->sb_cc) { tp->snd_wnd -= so->so_snd->sb_cc; DEBUG("drop all so_snd buffer, drop_bytes=%d, acked=%d", so->so_snd->sb_cc, acked); sbdrop(so->so_snd, (int)so->so_snd->sb_cc); ourfinisacked = 1; } else { DEBUG("drop so_snd buffer, drop_bytes=%d, len=%d", acked, so->so_snd->sb_cc); sbdrop(so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } //if (so->so_snd->sb_flags & SB_NOTIFY) { sowwakeup(so); usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0); //} tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { // In FIN_WAIT_1 STATE in addition to the processing // for the ESTABLISHED state if our FIN is now acknowledged // then enter FIN_WAIT_2. case TCPS_FIN_WAIT_1: if (ourfinisacked) { // If we can't receive any more // data, then closing user can proceed. // Starting the timer is contrary to the // specification, but if we don't get a FIN // we'll hang forever. 
if (so->so_state & USN_CANTRCVMORE) { soisdisconnected(so); tp->t_timer[TCPT_2MSL] = g_tcp_maxidle; } DEBUG("change tcp state to TCPS_FIN_WAIT_2, state=%d", tp->t_state); tp->t_state = TCPS_FIN_WAIT_2; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_FIN_WAIT2, 0); } break; // In CLOSING STATE in addition to the processing for // the ESTABLISHED state if the ACK acknowledges our FIN // then enter the TIME-WAIT state, otherwise ignore // the segment. case TCPS_CLOSING: if (ourfinisacked) { DEBUG("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_TIME_WAIT; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0); tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; soisdisconnected(so); } break; // In LAST_ACK, we may still be waiting for data to drain // and/or to be acked, as well as for the ack of our FIN. // If our FIN is now acknowledged, delete the TCB, // enter the closed state and return. case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; // In TIME_WAIT state the only thing that should arrive // is a retransmission of the remote FIN. Acknowledge // it and restart the finack timer. case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; goto dropafterack; } } step6: // Update window information. // Don't look at window if no ACK: TAC's send garbage on first SYN. if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, ti->ti_seq) || (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) )) )) { // keep track of pure window updates if (ti->ti_len == 0 && tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) g_tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = ti->ti_seq; tp->snd_wl2 = ti->ti_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; DEBUG("change needoutput to 1"); tp->t_flags |= TF_NEEDOUTPUT; needoutput = 1; } // Process segments with URG. 
if ((tiflags & TH_URG) && ti->ti_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { // This is a kludge, but if we receive and accept // random urgent pointers, we'll crash in // soreceive. It's hard to imagine someone // actually wanting to send this much urgent data. if (ti->ti_urp + so->so_rcv->sb_cc > g_sb_max) { ti->ti_urp = 0; // XXX tiflags &= ~TH_URG; // XXX goto dodata; // XXX } // If this segment advances the known urgent pointer, // then mark the data stream. This should not happen // in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since // a FIN has been received from the remote side. // In these states we ignore the URG. // // According to RFC961 (Assigned Protocols), // the urgent pointer points to the last octet // of urgent data. We continue, however, // to consider it to indicate the first octet // of data past the urgent section as the original // spec states (in one of two places). if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { tp->rcv_up = ti->ti_seq + ti->ti_urp; so->so_oobmark = so->so_rcv->sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_state |= USN_RCVATMARK; sohasoutofband(so); // send async event to app threads. usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPEV_OUTOFBOUND, 0); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } // Remove out of band data so doesn't get presented to user. // This can happen independent of advancing the URG pointer, // but if two URG's are pending at once, some out-of-band // data may creep in... ick. if (ti->ti_urp <= ti->ti_len #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) tcp_pulloutofband(so, ti, m); } else // If no out of band data is expected, // pull receive urgent pointer along // with the receive window. 
if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; dodata: // XXX #ifdef DUMP_PAYLOAD DEBUG("Handle data"); dump_chain(m,"tcp"); #endif // Process the segment text, merging it into the TCP sequencing queue, // and arranging for acknowledgment of receipt if necessary. // This process logically involves adjusting tp->rcv_wnd as data // is presented to the user (this happens in tcp_usrreq.c, // case PRU_RCVD). If a FIN has already been received on this // connection then we just ignore the text. if ((ti->ti_len || (tiflags&TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { TCP_REASS(tp, ti, m, so, tiflags); // Note the amount of data that peer has sent into // our window, in order to estimate the sender's // buffer size. len = so->so_rcv->sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { usn_free_cmbuf(m); tiflags &= ~TH_FIN; } // If FIN is received ACK the FIN and let the user know // that the connection is closing. if (tiflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); tp->t_flags |= TF_ACKNOW; TRACE("ack FIN now, tp flags=%d", tp->t_flags); tp->rcv_nxt++; } switch (tp->t_state) { // In SYN_RECEIVED and ESTABLISHED STATES // enter the CLOSE_WAIT state. case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: TRACE("change tcp state to TCPS_CLOSE_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_CLOSE_WAIT; soewakeup(so, 0); usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSE_WAIT, 0); break; // If still in FIN_WAIT_1 STATE FIN has not been acked so // enter the CLOSING state. case TCPS_FIN_WAIT_1: TRACE("change tcp state to TCPS_CLOSING, state=%d", tp->t_state); tp->t_state = TCPS_CLOSING; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSING, 0); break; // In FIN_WAIT_2 state enter the TIME_WAIT state, // starting the time-wait timer, turning off the other // standard timers. 
case TCPS_FIN_WAIT_2: TRACE("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; soisdisconnected(so); usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0); break; // In TIME_WAIT state restart the 2 MSL time_wait timer. case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; break; } } if (so->so_options & SO_DEBUG) { TRACE("tcp trace, so_options=%d", so->so_options); tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); } // Return any desired output. //if (needoutput || (tp->t_flags & TF_ACKNOW)){ if (tp->t_flags & TF_NEEDOUTPUT || (tp->t_flags & TF_ACKNOW)){ TRACE("ack now or need to ouput, tp->t_flags=%d", tp->t_flags); tcp_output(tp); } return; dropafterack: TRACE("dropafterack"); // Generate an ACK dropping incoming segment if it occupies // sequence space, where the ACK reflects our state. if (tiflags & TH_RST) goto drop; usn_free_cmbuf(m); tp->t_flags |= TF_ACKNOW; TRACE("ack now, tp flags=%d", tp->t_flags); tcp_output(tp); return; dropwithreset: TRACE("dropwithreset"); // Generate a RST, dropping incoming segment. // Make ACK acceptable to originator of segment. // Don't bother to respond if destination was broadcast/multicast. #define USN_MULTICAST(i) (((u_int)(i) & 0xf0000000) == 0xe0000000) if ((tiflags & TH_RST) || m->flags & (BUF_BCAST|BUF_MCAST) || USN_MULTICAST(ntohl(ti->ti_dst.s_addr))) goto drop; if (tiflags & TH_ACK) tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); else { if (tiflags & TH_SYN) ti->ti_len++; tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, TH_RST|TH_ACK); } // destroy temporarily created socket if (dropsocket) soabort(so); return; drop: TRACE("drop"); // Drop space held by incoming segment and return. 
if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { TRACE("tcp trace: drop a socket"); tcp_trace(TA_DROP, ostate, tp, &g_tcp_saveti, 0); } usn_free_cmbuf(m); // destroy temporarily created socket if (dropsocket) soabort(so); return; }
/*
 * Minimal TCP input path for flowman.  This host does not run a full TCP
 * state machine: it validates the incoming segment (combined header length
 * and checksum), answers any SYN with a RST via tcp_respond(), and logs and
 * ignores every other segment.
 *
 * recs/nr_recs: scatter-gather list holding the received frame; recs[0].base
 *               points at the Ethernet header.
 * host_st:      per-host IP state; TCP statistics counters are updated here.
 * ifs:          interface state, passed through to tcp_respond().
 * m_flags:      mbuf-style flags; M_BCAST/M_MCAST suppress RST replies.
 * iphlen:       size of the IP header including any options.
 */
void tcp_input(IO_Rec *recs, int nr_recs, iphost_st *host_st, intf_st *ifs,
               uint32_t m_flags, uint8_t iphlen)
{
    /* iphlen: size of ip header incl. possible options */
    register struct tcpiphdr *ti;
    uint16_t len, tlen;  /* tlen: ip length without IP header start of code */
    struct tcphdr *tcphdr;
    uint16_t sum;
    uint8_t tcphlen;     /* tcp header length, incl. options */

    TRC(printf("tcp input with ip header length %d\n", iphlen));
    host_st->tcpstat.tcps_rcvtotal++;

    /* Overlay the TCP/IP overlay header and the TCP header on the frame. */
    ti = ( struct tcpiphdr *)(recs[0].base + sizeof(struct ether_header));
    tcphdr = (struct tcphdr *)(recs[0].base + sizeof(struct ether_header) +
                               iphlen);
    tcphlen = tcphdr->th_off << 2;
    TRC(printf("tcp header length %d\n", tcphlen));

    if (iphlen > sizeof (struct ip))
        ip_stripoptions(recs); /* XXX AND: iphlen now wrong? */

    /* Segment too short to hold IP + TCP headers: count it and drop. */
    if ((iphlen + tcphlen) < sizeof (struct tcpiphdr)) {
        host_st->tcpstat.tcps_rcvshort++;
        goto drop;
    }

    /*
     * Checksum extended TCP header and data.
     */
    tlen = ((struct ip *)ti)->ip_len;  /* NOTE(review): reads ip_len without a
                                        * byte swap — presumably the IP layer
                                        * already converted it to host order;
                                        * confirm against the IP input path. */
    TRC(printf("tcp option length %d\n", tcphlen- sizeof(struct tcphdr)));
    len = sizeof (struct ip) + tlen;
    TRC(printf("tlen %d, len %d\n", tlen, len));

    /* Build the checksum pseudo-header in place: the fields overlaying the
     * IP header must be zeroed/rewritten before summing. */
    ti->ti_next = ti->ti_prev = 0;
    ti->ti_x1 = 0;
    ti->ti_len = htons((uint16_t)tlen);

    /* Single-rec frames can be summed directly; otherwise walk the
     * scatter-gather list.
     * NOTE(review): recs[1] is read unconditionally — assumes nr_recs >= 2
     * or that a terminating rec with len == 0 is always present; confirm. */
    if (recs[1].len == 0)
        sum = in_cksum((uint16_t *)ti, len);
    else {
        sum = cksum_morerecs((uint16_t *)ti,
                             recs[0].len - sizeof(struct ether_header),
                             recs, nr_recs);
        TRC(printf("tcp_input: checksum more recs %d\n", sum));
    }
    if (sum) {
        printf("flowman: tcp_input: checksum error (%x)\n", sum);
        host_st->tcpstat.tcps_rcvbadsum++;
        goto drop;
    }
    /* NOTE(review): this expands to a dangling `else` only when TRC() keeps
     * its argument; with TRC() compiled out it disappears entirely. */
    TRC(else printf("tcp_input: checksum OK\n"));
    TRC(show_all(ti));

    /*
     * Convert TCP protocol specific fields to host format.
     */
    ti->ti_seq = ntoh32(ti->ti_seq);
    ti->ti_ack = ntoh32(ti->ti_ack);
    ti->ti_win = ntoh16(ti->ti_win);
    ti->ti_urp = ntoh16(ti->ti_urp);

    /* No TCP listeners on this host: refuse connection attempts with RST. */
    if (tcphdr->th_flags & TH_SYN) {
        TRC(printf("tcp syn message\n"));
        goto dropwithreset;
    }

    /* AND: shouldn't really get here */
    /* Log the flag bits of the unexpected non-SYN segment and ignore it. */
#define F(f, c) (tcphdr->th_flags & f)? c : '-'
    printf("flowman: tcp_input: no SYN set (%c%c%c%c%c%c), ignoring\n",
           F(TH_FIN, 'f'),
           F(TH_SYN, 's'),
           F(TH_RST, 'r'),
           F(TH_PUSH, 'p'),
           F(TH_ACK, 'a'),
           F(TH_URG, 'u'));
#undef F
    return;

dropwithreset:
    /*
     * Generate a RST, dropping incoming segment.
     * Make ACK acceptable to originator of segment.
     * Don't bother to respond if destination was broadcast/multicast.
     */
    if ((tcphdr->th_flags & TH_RST) || m_flags & (M_BCAST|M_MCAST) ||
        IN_MULTICAST(ntoh32(ti->ti_dst.s_addr)))
        goto drop;

    if (tcphdr->th_flags & TH_ACK) {
        /* Segment carried an ACK: RST with SEQ = its ACK, no ACK of ours. */
        tcp_respond(ti, recs, nr_recs, host_st, ifs, (tcp_seq)0, ti->ti_ack,
                    TH_RST);
    } else {
        TRC(printf("tcp_input: else ti_len %d, iphlen %d \n",
                   ntoh16(ti->ti_len), iphlen));
        /* Thiemo setting ti_len to host order and reducing length field*/
        ti->ti_len = (ntoh16(ti->ti_len) - iphlen -
                      (tcphlen - sizeof(struct tcphdr)));
        /* A SYN consumes one sequence number, so ACK one byte past it. */
        if (tcphdr->th_flags & TH_SYN)
            ti->ti_len++;
        TRC(printf("ti-ti_len network: %d\n", ti->ti_len));
        tcp_respond(ti, recs, nr_recs, host_st, ifs, ti->ti_seq + ti->ti_len,
                    (tcp_seq)0, TH_RST|TH_ACK);
    }
    return;

drop:
    printf("flowman: tcp_input: ditched packet\n");
}
int tcp_input(struct ifnet * __if, struct iphdr * iph, struct tcphdr * th, int len) { struct tcp_listen_pcb * mux; struct tcp_pcb * tp; #if (ENABLE_NET_TCP_CHECKSUM) unsigned int sum; #endif int ti_len; int acked = 0; int ourfinisacked = 0; int needoutput = 0; unsigned int optlen; int tiflags; int todrop; uint32_t snd_una; uint32_t snd_nxt; uint32_t snd_max; uint32_t ti_seq; uint32_t ti_ack; int rcv_wnd; int tiwin; int hdrlen; uint8_t * data; int ret; #if (ENABLE_TCPDUMP) tcp_dump(iph, th, TCPDUMP_RX); #endif /* get TCP options, if any */ optlen = ((th->th_off << 2) - sizeof(struct tcphdr)); hdrlen = sizeof(struct tcphdr) + optlen; data = (uint8_t *)&th->th_opt[optlen]; ti_len = len - hdrlen; #if (ENABLE_NET_TCP_CHECKSUM) /* initialize checksum */ sum = htons(len) + (IPPROTO_TCP << 8); sum = in_chksum(sum, &iph->saddr, 8); sum = in_chksum(sum, th, hdrlen); if (ti_len) { sum = in_chksum(sum, data, ti_len); } if (sum != 0x0000ffff) { DCC_LOG3(LOG_WARNING, "checksum error: 0x%04x hdrlen=%d, len=%d", sum, hdrlen, len); TCP_PROTO_STAT_ADD(rx_err, 1); goto drop; } #endif tiflags = th->th_flags; /* convert TCP protocol specific fields to host format */ tiwin = ntohs(th->th_win); ti_seq = ntohl(th->th_seq); ti_ack = ntohl(th->th_ack); TCP_PROTO_STAT_ADD(rx_ok, 1); /* Serch in active list first */ if ((tp = tcp_active_lookup(iph->saddr, th->th_sport, iph->daddr, th->th_dport)) == NULL) { /* lookup into listening pcb list */ if ((mux = tcp_listen_lookup(iph->saddr, th->th_sport, iph->daddr, th->th_dport)) == NULL) { DCC_LOG(LOG_WARNING, "invalid peer ???"); goto dropwithreset; } if ((tiflags & TH_ACK)) { DCC_LOG(LOG_WARNING, "listen ACK ?"); goto dropwithreset; } if (ti_len != 0) { DCC_LOG(LOG_WARNING, "ti_len != 0"); goto dropwithreset; } /* Completion of Passive Open Ref.: TCP/IP Illustrated Volume 2, pg. 
942 */ if (!(tiflags & TH_SYN)) { DCC_LOG(LOG_WARNING, "listen !SYN ?"); goto drop; } /* In the LISTEN state, we check for incoming SYN segments, creates a new PCB, and responds with a SYN|ACK. */ if ((tiflags & TH_RST)) { DCC_LOG(LOG_WARNING, "listen RST?"); goto drop; } if ((tp = tcp_passive_open(mux, iph, th, optlen)) == NULL) { DCC_LOG(LOG_WARNING, "tcp_passive_open()"); goto dropwithreset; } /* schedule output */ tcp_output_sched(tp); /* packet handled */ return 0; } DCC_LOG1(LOG_MSG, "<%05x> active", (int)tp); snd_una = tp->snd_seq; snd_nxt = tp->snd_seq + tp->snd_off; snd_max = tp->snd_seq + tp->snd_max; /* Remove acknowledged bytes from the send buffer */ /* Wakeup processes waiting on send buffer */ /* Segment received on a connection. Reset the idle detection timer Ref.: TCP/IP Illustrated Volume 2, pg. 932 */ tp->t_conn_tmr = tcp_idle_det_tmo; if (tp->t_flags & TF_IDLE) { /* exits from the idle state */ tp->t_flags &= ~TF_IDLE; DCC_LOG1(LOG_INFO, "<%05x> IDLE exit", (int)tp); } #if 0 /* Process options, we don't need to check if the socket is in the LISTEN state, because only active (non LISTENING) sockets will actually fall into this code. XXX: options after connection stablished ??? */ if (optlen) tcp_parse_options(tp, th, th->th_opt, optlen); #endif /* Ref.: TCP/IP Illustrated Volume 2, pg. 
934 */ #if (TCP_ENABLE_HEADER_PREDICTION) if ((tp->t_state == TCPS_ESTABLISHED) && (tiflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK && (ti_seq == tp->rcv_nxt) && (tiwin) && (tiwin == tp->snd_wnd) && (snd_nxt == snd_max)) { if (ti_len == 0) { if (SEQ_GT(ti_ack, snd_una) && SEQ_LEQ(ti_ack, snd_max)) { acked = ti_ack - snd_una; DCC_LOG(LOG_INFO, "header prediction, ACK ..."); mbuf_queue_trim(&tp->snd_q, acked); snd_una = ti_ack; tp->snd_seq = snd_una; tp->snd_off = snd_nxt - tp->snd_seq; tp->snd_max = snd_max - tp->snd_seq; if (snd_una == snd_max) { tp->t_rxmt_tmr = 0; tp->t_rxmt_cnt = 0; DCC_LOG(LOG_INFO, "acked all data, rxmt tmr stopped"); } else { if (tp->t_rxmt_tmr == 0) { DCC_LOG(LOG_INFO, "not all data acked restart rxmt tmr"); tp->t_rxmt_tmr = tcp_rxmtintvl[tp->t_rxmt_cnt / 2]; } } thinkos_cond_broadcast(tp->t_cond); if (tp->snd_q.len) { /* schedule output */ tcp_output_sched(tp); } return 0; } } else { if ((ti_ack == snd_una) && ti_len <= (tcp_maxrcv - tp->rcv_q.len)) { int len; DCC_LOG1(LOG_INFO, "header prediction, data (%d)", ti_len); /* append data */ len = mbuf_queue_add(&tp->rcv_q, data, ti_len); tp->rcv_nxt += len; thinkos_cond_broadcast(tp->t_cond); if (len != ti_len) { DCC_LOG1(LOG_WARNING, "<%05x> no more mbufs", (int)tp); tp->t_flags |= TF_ACKNOW; /* schedule output */ tcp_output_sched(tp); } else { tp->t_flags |= TF_DELACK; } return 0; } } } #endif /* TCP_ENABLE_HEADER_PREDICTION */ /* Slow path input processing Ref.: TCP/IP Illustrated Volume 2, pg. 941 */ /* TODO: Drop TCP, IP headers and TCP options. Well, only if these structures were dynamic allocated... */ if (ti_len == 0) { DCC_LOG(LOG_INFO, "slow path ACK"); } else { DCC_LOG1(LOG_INFO, "slow path (%d)", ti_len); } /* Calculate the amount of space in receive window, and then do TCP input processing. Receive window is amount of space in rcv queue, but not less than advertise window. Ref.: TCP/IP Illustrated Volume 2, pg. 
941 */ { int win; /* space left in the input queue */ win = tcp_maxrcv - tp->rcv_q.len; if (win <= 0) { win = 0; DCC_LOG(LOG_INFO, "receive buffer full!"); } // rcv_wnd = MAX(win, tp->rcv_adv_wnd); rcv_wnd = win; DCC_LOG3(LOG_INFO, "adv_wnd=%d rcv_wnd=%d win=%d", tp->rcv_adv_wnd, rcv_wnd, win); } if (tp->t_state == TCPS_SYN_SENT) { /* response to an active open. Ref.: TCP/IP Illustrated Volume 2, pg. 947 */ /* Common proccessing for receipt of SYN. Ref.: TCP/IP Illustrated Volume 2, pg. 950 */ if ((tiflags & TH_RST)) { goto close; } if (!(tiflags & TH_SYN)) { DCC_LOG(LOG_WARNING, "SYN_SENT SYN ?"); /* TODO: reset */ goto close_and_reset; } if (!(tiflags & TH_ACK)) { DCC_LOG(LOG_WARNING, "SYN_SENT ACK ?"); /* TODO: reset */ goto close_and_reset; } if (ti_len != 0) { DCC_LOG(LOG_WARNING, "ti_len != 0"); /* TODO: reset */ goto close_and_reset; } /* update the send sequence */ tp->snd_seq++; if (tp->snd_seq != ti_ack) { DCC_LOG3(LOG_WARNING, "<%05x> tp->snd_seq(%d) != ti_ack(%d)", (int)tp, tp->snd_seq, ti_ack); /* TODO: reset */ goto close_and_reset; } tp->snd_off--; tp->snd_max--; // tp->snd_off = 0; // tp->snd_max = 0; if (optlen) tcp_parse_options(tp, th, th->th_opt, optlen); /* Advance tp->ti_seq to correspond to first data byte. */ ti_seq++; if (ti_len > rcv_wnd) { DCC_LOG3(LOG_WARNING, "<%05x> ti_len(%d) > rcv_wnd(%d)", (int)tp, ti_len, rcv_wnd); /* TODO: if data, trim to stay within window. 
*/ ti_len = rcv_wnd; } /* update the sequence number */ tp->rcv_nxt = ti_seq; /* update the window size */ tp->snd_wnd = ntohs(th->th_win); tp->t_state = TCPS_ESTABLISHED; DCC_LOG1(LOG_INFO, "<%05x> [ESTABLISHED]", (int)tp); /* TODO: initialization of receive urgent pointer tcp->rcv_up = ti_seq; */ /* XXX: */ tp->t_flags |= TF_ACKNOW; thinkos_cond_broadcast(tp->t_cond); goto step6; close_and_reset: tp->t_state = TCPS_CLOSED; pcb_move((struct pcb *)tp, &__tcp__.active, &__tcp__.closed); DCC_LOG1(LOG_INFO, "<%05x> [CLOSED]", (int)tp); /* XXX: discard the data */ mbuf_queue_free(&tp->snd_q); mbuf_queue_free(&tp->rcv_q); /* notify the upper layer */ thinkos_cond_broadcast(tp->t_cond); goto dropwithreset; } /* States other than LISTEN or SYN_SENT First check timestamp, if present. Then check that at least some bytes of segment are within receive window. If segment begins before rcv_nxt, drop leading data (and SYN); if nothing left, just ti_ack. */ /* Trim Segment so Data is Within Window Ref.: TCP/IP Illustrated Volume 2, pg. 954 */ todrop = tp->rcv_nxt - ti_seq; if (todrop > 0) { if (tiflags & TH_SYN) { DCC_LOG(LOG_INFO, "SYN"); tiflags &= ~TH_SYN; ti_seq++; todrop--; } if ((todrop > ti_len) || ((todrop == ti_len) && ((tiflags & TH_FIN) == 0))) { tiflags &= ~TH_FIN; tp->t_flags |= TF_ACKNOW; todrop = ti_len; } DCC_LOG4(LOG_WARNING, "<%05x> drop: len=%d drop=%d rem=%d!", (int)tp, ti_len, todrop, ti_len - todrop); /* adjust the data pointer */ data += todrop; ti_seq += todrop; ti_len -= todrop; /* TODO: adjust the urgent pointer */ } /* FIXME: only reset the connection if there are no more application to handle the incomming data, half-close */ if ((tp->t_state > TCPS_FIN_WAIT_1) && (ti_len)) { DCC_LOG1(LOG_INFO, "<%05x> segment received after FIN", (int)tp); /* TODO: stat */ goto dropwithreset; } /* If segment ends after window, drop trailing data and (PUSH and FIN); if nothing left, just ACK. Ref.: TCP/IP Illustrated Volume 2, pg. 
958 */ todrop = (ti_seq + ti_len) - (tp->rcv_nxt + rcv_wnd); DCC_LOG4(LOG_INFO, "ti_seq=%u ti_len=%d rcv_nxt=%u rcv_wnd=%d", ti_seq, ti_len, tp->rcv_nxt, rcv_wnd); /* */ if (todrop > 0) { // TCP_LOG(tp, "tcp_input: trailing data drop"); if (todrop >= ti_len) { /* * If a new connection request is received * while in TIME_WAIT, drop the old connection ... * Ref.: TCP/IP Illustrated Volume 2, pg. 958 if ((tiflags & TH_SYN) && (tp->t_state == TCPS_TIMEWAIT) && (SEQ_GT(ti_seq, tp->rcv_nxt))) { __tcp__.iss += tcp_issincr; tcp_rst(tp); goto findpcb; } */ if ((rcv_wnd == 0) && (ti_seq == tp->rcv_nxt)) { tp->t_flags |= TF_ACKNOW; } else goto dropafterack; } DCC_LOG2(LOG_WARNING, "<%05x> data drop: %d!", (int)tp, todrop); ti_len -= todrop; tiflags &= ~(TH_PSH | TH_FIN); } /* If the RST bit is set eximine the state: ... Ref.: TCP/IP Illustrated Volume 2, pg. 964 */ if ((tiflags & TH_RST)) { DCC_LOG1(LOG_WARNING, "<%05x> RST received", (int)tp); switch(tp->t_state) { case TCPS_SYN_RCVD: // tp->errno = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_CLOSE_WAIT: // tp->errno = ECONNRESET; close: /* discard the data */ mbuf_queue_free(&tp->snd_q); mbuf_queue_free(&tp->rcv_q); tp->t_state = TCPS_CLOSED; pcb_move((struct pcb *)tp, &__tcp__.active, &__tcp__.closed); DCC_LOG1(LOG_INFO, "<%05x> [CLOSED]", (int)tp); /* notify the upper layer */ thinkos_cond_broadcast(tp->t_cond); /* PCBs in the close state should be cleared by the application */ goto drop; case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: /* Our side was already closed */ tcp_pcb_free(tp); goto drop; } } /* If a SYN is in the window, then this is an error and we send an RST and drop the connection. Ref.: TCP/IP Illustrated Volume 2, pg. 965 */ if ((tiflags & TH_SYN)) { DCC_LOG1(LOG_WARNING, "<%05x> the SYN bit is set inside the window", (int)tp); goto dropwithreset; } /* If the ACK bit is off we drop the segment and return. 
*/ if ((!(tiflags & TH_ACK))) { DCC_LOG1(LOG_WARNING, "<%05x> the ACK bit is off", (int)tp); goto drop; } /* * ACK processing. * Ref.: TCP/IP Illustrated Volume 2, pg. 969 * */ DCC_LOG4(LOG_INFO, "ack=%u una=%u nxt=%u max=%u", ti_ack, snd_una, snd_nxt, snd_max); switch(tp->t_state) { case TCPS_SYN_RCVD: if (SEQ_GT(snd_una, ti_ack) || SEQ_GT(ti_ack, snd_max)) { DCC_LOG1(LOG_WARNING, "<%05x> ti_ack < snd_una || snd_max < ti_ack", (int)tp); goto dropwithreset; } tp->t_state = TCPS_ESTABLISHED; tp->snd_off--; tp->snd_max--; DCC_LOG1(LOG_INFO, "<%05x> SYN ackd [ESTABLISHED]", (int)tp); /* notify the upper layer*/ // thinkos_cond_signal(tp->t_cond); /* TODO: tcp reassembly tcp_reass(tp); */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: /* TODO: tcp reassembly tcp_reass(tp); */ if (SEQ_LEQ(ti_ack, snd_una)) { /* TODO: check for completly duplicated ACKs. Ref.: TCP/IP Illustrated Volume 2, pg. 971 */ if ((ti_len == 0) && (tiwin == tp->snd_wnd)) { if ((tp->t_rxmt_tmr == 0) || ti_ack != snd_una) { // dupacks = 0; } else { DCC_LOG2(LOG_INFO, "duplicated ACK. ti_ack=%u snd_una=%u", ti_ack, snd_una); } } else { // dupacks = 0; } break; } /* Check out of range ACK */ /* Ref.: TCP/IP Illustrated Volume 2, pg. 974 */ if (SEQ_GT(ti_ack, snd_max)) { /* TODO: tcpstat.tcps_rcvacktoomuch++; */ DCC_LOG3(LOG_WARNING, "(%04x) out of range ACK. " "th_ack=%u > snd_max=%u !", (int)tp, ti_ack, snd_max); goto dropafterack; } acked = ti_ack - snd_una; /* TODO: tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; */ DCC_LOG1(LOG_INFO, "acked=%d", acked); /* If all outstanding data is acked, stop retransmit timer else restarts it .... Ref.: TCP/IP Illustrated Volume 2, pg. 
976 */ if (ti_ack == snd_max) { tp->t_rxmt_tmr = 0; tp->t_rxmt_cnt = 0; needoutput = 1; DCC_LOG(LOG_INFO, "acked all data, rxmt tmr stopped"); } else { /* TODO: peristent timer */ // if (tp->t_persist_tmr == 0) { DCC_LOG(LOG_INFO, "not all data acked restart rxmt tmr"); tp->t_rxmt_tmr = tcp_rxmtintvl[tp->t_rxmt_cnt / 2]; // } } /* TODO: tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; */ /* TODO: remove acknowledged data from send buffer Ref.: TCP/IP Illustrated Volume 2, pg. 978 */ /* FIXME: send buffer bytes count */ if (acked > tp->snd_q.len) { mbuf_queue_trim(&tp->snd_q, tp->snd_q.len); ourfinisacked = 1; } else { /* TODO: estimate the send window */ mbuf_queue_trim(&tp->snd_q, acked); ourfinisacked = 0; } /* awaken a thread waiting on the send buffer ... */ thinkos_cond_broadcast(tp->t_cond); snd_una = ti_ack; if (SEQ_LT(snd_nxt, snd_una)) { snd_nxt = snd_una; } tp->snd_seq = snd_una; tp->snd_off = snd_nxt - tp->snd_seq; tp->snd_max = snd_max - tp->snd_seq; DCC_LOG4(LOG_INFO, "<%05x> snd_seq=%u snd_max=%u snd_q.len=%d", (int)tp, tp->snd_seq, snd_max, tp->snd_q.len); switch(tp->t_state) { case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* FIXME: If we can't receive any more data.. Ref.: TCP/IP Illustrated Volume 2, pg. 979 */ tp->t_conn_tmr = 4 * tcp_msl; tp->t_state = TCPS_FIN_WAIT_2; DCC_LOG1(LOG_INFO, "<%05x> [FIN_WAIT_2]", (int)tp); } break; case TCPS_CLOSING: if (ourfinisacked) { mbuf_queue_free(&tp->snd_q); mbuf_queue_free(&tp->rcv_q); tp->t_state = TCPS_TIME_WAIT; DCC_LOG1(LOG_INFO, "<%05x> [TIME_WAIT]", (int)tp); tp->t_rxmt_tmr = 0; tp->t_conn_tmr = 2 * tcp_msl; DCC_LOG1(LOG_INFO, "stop rxmt tmr, start 2MSL tmr: %d", tp->t_conn_tmr); } break; case TCPS_LAST_ACK: if (ourfinisacked) { tcp_pcb_free(tp); goto drop; } break; case TCPS_TIME_WAIT: /* restart the finack timer Ref.: TCP/IP Illustrated Volume 2, pg. 
981 */ tp->t_conn_tmr = 2 * tcp_msl; goto dropafterack; } break; } DCC_LOG4(LOG_INFO, "<%05x> recvd=%d acked=%d rcv_q.len=%d", (int)tp, ti_len, acked, tp->rcv_q.len); step6: /* Update window information Ref.: TCP/IP Illustrated Volume 2, pg. 982 */ DCC_LOG(LOG_MSG, "setp6"); // if ((tiflags & TH_ACK) && (tiwin > tp->snd_wnd)) { if ((tiflags & TH_ACK) && (tiwin != tp->snd_wnd)) { /* Keep track of pure window updates */ /* TODO: TCP Statistics */ /* TODO: Update window information */ DCC_LOG1(LOG_INFO, "window update, win=%d", tiwin); tp->snd_wnd = tiwin; needoutput = 1; } /* TODO: Urgent mode processing */ /* Process the segment text, merging it into the TCP sequencing queue, dodata: ... Ref.: TCP/IP Illustrated Volume 2, pg. 988 */ if ((ti_len || (tiflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { if ((ti_seq == tp->rcv_nxt) && (tp->t_state == TCPS_ESTABLISHED)) { /* append data */ int n; tp->t_flags |= TF_DELACK; n = mbuf_queue_add(&tp->rcv_q, data, ti_len); if (n != ti_len) { DCC_LOG2(LOG_WARNING, "no more mbufs, %d != %d", n, ti_len); } ti_len = n; tp->rcv_nxt += ti_len; /* TODO: statistics */ tiflags &= TH_FIN; // if (tp->rcv_q.len == ti_len) { // DCC_LOG3(LOG_INFO, "<%05x> rcvd %d, signaling %d ...", // (int)tp, ti_len, tp->t_cond); /* * notify the upper layer of the data arrival... 
*/ thinkos_cond_signal(tp->t_cond); // } else { // DCC_LOG2(LOG_INFO, "<%05x> rcvd %d", (int)tp, ti_len); // } } else { /* TODO: half-close */ /* TODO: reassembly */ // m = mlink_free(m); if (tp->t_state == TCPS_ESTABLISHED) { // DCC_LOG(LOG_WARNING, "out of order, drop!"); DCC_LOG(LOG_WARNING, "out of order, drop"); TCP_PROTO_STAT_ADD(rx_drop, 1); } tp->t_flags |= TF_ACKNOW; } } else { DCC_LOG(LOG_INFO, "!!!!!!!!!"); tiflags &= ~TH_FIN; } /* FIN Processing */ if (tiflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch(tp->t_state) { case TCPS_SYN_RCVD: case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; DCC_LOG1(LOG_INFO, "<%05x> [CLOSE_WAIT]", (int)tp); /* notify the application that our peer has closed its side. Sockets: marks the socket as write-only */ if (tp->rcv_q.len == 0) { thinkos_cond_broadcast(tp->t_cond); } break; case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; DCC_LOG1(LOG_INFO, "<%05x> [CLOSING]", (int)tp); break; case TCPS_FIN_WAIT_2: mbuf_queue_free(&tp->rcv_q); mbuf_queue_free(&tp->snd_q); tp->t_state = TCPS_TIME_WAIT; DCC_LOG1(LOG_INFO, "<%05x> [TIME_WAIT]", (int)tp); tp->t_rxmt_tmr = 0; tp->t_conn_tmr = 2 * tcp_msl; DCC_LOG1(LOG_INFO, "stop rxmt tmr, start 2MSL tmr: %d", tp->t_conn_tmr); break; case TCPS_TIME_WAIT: /* restart the counter */ tp->t_conn_tmr = 2 * tcp_msl; break; } } /* Final Processing */ if (needoutput || (tp->t_flags & TF_ACKNOW)) { if (needoutput) { DCC_LOG(LOG_INFO, "needoutput, call tcp_out."); } if (tp->t_flags & TF_ACKNOW) { DCC_LOG(LOG_INFO, "ACKNOW set, call tcp_out."); } /* schedule output */ tcp_output_sched(tp); } return 0; dropafterack: DCC_LOG1(LOG_INFO, "<%05x> drop and ACK", (int)tp); if (tiflags & TH_RST) goto drop; tp->t_flags |= TF_ACKNOW; /* schedule output */ tcp_output_sched(tp); return 0; dropwithreset: DCC_LOG1(LOG_TRACE, "<%05x> drop and RST", (int)tp); ret = 0; /* TODO: check for a broadcast/multicast */ if (!(tiflags & TH_RST)) { if (tiflags & 
TH_ACK) { ret = tcp_respond(iph, th, 0, ti_ack, TH_RST); } else if (tiflags & TH_SYN) { ti_len++; ret = tcp_respond(iph, th, ti_seq + ti_len, 0, TH_ACK | TH_RST); } } TCP_PROTO_STAT_ADD(rx_drop, 1); return ret; drop: DCC_LOG(LOG_TRACE, "drop"); TCP_PROTO_STAT_ADD(rx_drop, 1); return 0; }
//-------------------------------------------------------------------------// // TCP timer processing. //-------------------------------------------------------------------------// static struct tcpcb * tcp_timers( Node *node, struct tcpcb *tp, int timer, UInt32 tcp_now, struct tcpstat *tcp_stat) { int rexmt; TransportDataTcp *tcpLayer = (TransportDataTcp *) node->transportData.tcp; switch (timer) { // // 2 MSL timeout in shutdown went off. If we're closed but // still waiting for peer to close and connection has been idle // too long, or if 2MSL time is up from TIME_WAIT, delete connection // control block. Otherwise, check again in a bit. // case TCPT_2MSL: if (tp->t_state != TCPS_TIME_WAIT && tp->t_idle <= TCPTV_MAXIDLE) tp->t_timer[TCPT_2MSL] = TCPTV_KEEPINTVL; else { // printf("TCP: Connection closed by timer\n"); tp = tcp_close(node, tp, tcp_stat); } break; // // Retransmission timer went off. Message has not // been acked within retransmit interval. Back off // to a longer retransmit interval and retransmit one segment. // case TCPT_REXMT: if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; //if (tcp_stat) //tcp_stat->tcps_timeoutdrop++; printf("TCP: Retransmission timer went off\n"); tp = tcp_drop(node, tp, tcp_now, tcp_stat); break; } //if (tcp_stat) //tcp_stat->tcps_rexmttimeo++; rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; // // If we backed off this far, // our srtt estimate is probably bogus. Clobber it // so we'll take the next rtt measurement as our srtt; // move the current srtt into rttvar to keep the current // retransmit times until then. 
// if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { tp->t_rttvar += (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; if (TCP_VARIANT_IS_SACK(tp) && tp->isSackFastRextOn) { TransportTcpSackRextTimeoutInit(tp); TransportTcpTrace(node, 0, 0, "Faxt: timeout"); } // Force a segment to be sent. tp->t_flags |= TF_ACKNOW; // If timing a segment in this window, stop the timer. // The retransmitted segment shouldn't be timed. tp->t_rtt = 0; // // Close the congestion window down to one segment // (we'll open it by one segment for each ack we get). // Since we probably have a window's worth of unacked // data accumulated, this "slow start" keeps us from // dumping all that data as back-to-back packets (which // might overwhelm an intermediate gateway). // // There are two phases to the opening: Initially we // open by one mss on each ack. This makes the window // size increase exponentially with time. If the // window is larger than the path can handle, this // exponential growth results in dropped packet(s) // almost immediately. To get more time between // drops but still "push" the network to take advantage // of improving conditions, we switch from exponential // to linear window opening at some threshhold size. // For a threshhold, we use half the current window // size, truncated to a multiple of the mss. // // (the minimum cwnd that will give us exponential // growth is 2 mss. We don't allow the threshhold // to go below this.) // { unsigned int win; win = MIN(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_cwnd = tp->t_maxseg; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_partialacks = -1; tp->t_dupacks = 0; } tp->t_ecnFlags |= TF_CWND_REDUCED; TransportTcpTrace(node, 0, 0, "Rext: timeout"); // // To eliminates the problem of multiple Fast Retransmits we uses this // new variable "send_high", whose initial value is the initial send // sequence number. 
After each retransmit timeout, the highest sequence // numbers transmitted so far is recorded in the variable "send_high". // if (TCP_VARIANT_IS_NEWRENO(tp)) { tp->send_high = tp->snd_max; } tcp_output(node, tp, tcp_now, tcp_stat); break; // // Persistance timer into zero window. // Force a byte to be output, if possible. // case TCPT_PERSIST: //if (tcp_stat) //tcp_stat->tcps_persisttimeo++; // // Hack: if the peer is dead/unreachable, we do not // time out if the window is closed. After a full // backoff, drop the connection if the idle time // (no responses to probes) reaches the maximum // backoff that we would use if retransmitting. // if (tp->t_rxtshift == TCP_MAXRXTSHIFT) { UInt32 maxidle = TCP_REXMTVAL(tp); if (maxidle < tp->t_rttmin) maxidle = tp->t_rttmin; maxidle *= tcp_totbackoff; if (tp->t_idle >= TCPTV_KEEP_IDLE || tp->t_idle >= maxidle) { //if (tcp_stat) //tcp_stat->tcps_persistdrop++; printf("TCP: Idle timer went off\n"); tp = tcp_drop(node, tp, tcp_now, tcp_stat); break; } } tcp_setpersist(tp); tp->t_force = 1; tcp_output(node, tp, tcp_now, tcp_stat); tp->t_force = 0; break; // // Keep-alive timer went off; send something // or drop connection if idle for too long. // case TCPT_KEEP: //if (tcp_stat) //tcp_stat->tcps_keeptimeo++; if (tp->t_state < TCPS_ESTABLISHED) printf("TCP: Keep-alive timer went off before established\n"); goto dropit; if (tcpLayer->tcpUseKeepAliveProbes && tp->t_state <= TCPS_CLOSING) { // // If the connection has been idle for more than the sum of // TCPTV_KEEP_IDLE (set to 2 hours) and TCPTV_MAXIDLE // (set to the total time taken to send all the probes), // it's time to drop the connection. // if (tp->t_idle >= TCPTV_KEEP_IDLE + TCPTV_MAXIDLE) printf("TCP: Keep-alive timer went off\n"); goto dropit; // // Send a packet designed to force a response // if the peer is up and reachable: // either an ACK if the connection is still alive, // or an RST if the peer has closed the connection // due to timeout or reboot. 
// Using sequence number tp->snd_una-1 // causes the transmitted zero-length segment // to lie outside the receive window; // by the protocol spec, this requires the // correspondent TCP to respond. // //if (tcp_stat) //tcp_stat->tcps_keepprobe++; tcp_respond(node, tp, tp->t_template, 0, tp->rcv_nxt, tp->snd_una - 1, 0, tcp_stat); tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL; } else { // // If the tcpUseKeepAliveProbes is FALSE // or the connection state is greater than TCPS_CLOSING, // reset the keepalive timer to TCPTV_KEEP_IDLE. // tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE; } break; dropit: //if (tcp_stat) { // // Note that this counter counts connection drops due to // failure in connection establishment and the keepalive // timer timeouts // //tcp_stat->tcps_keepdrops++; //} // printf("TCP: Unknown timer went off\n"); tp = tcp_drop(node, tp, tcp_now, tcp_stat); break; }
/* * TCP timer processing. */ struct tcpcb* tcp_timers(struct tcpcb *tp, int timer) { int rexmt; switch (timer) { /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long, or if 2MSL time is up from TIME_WAIT, delete connection * control block. Otherwise, check again in a bit. * If TIME_WAIT is not set, this is FIN_WAIT_2 timer. */ case TCPT_2MSL: if (tp->t_state != TCPS_TIME_WAIT && tp->t_idle <= g_tcp_maxidle) tp->t_timer[TCPT_2MSL] = g_tcp_keepintvl; else tp = tcp_close(tp); break; /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ case TCPT_REXMT: if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; g_tcpstat.tcps_timeoutdrop++; tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); break; } g_tcpstat.tcps_rexmttimeo++; rexmt = TCP_REXMTVAL(tp) * g_tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; /* * If losing, let the lower level know and try for * a better route. Also, if we backed off this far, * our srtt estimate is probably bogus. Clobber it * so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current * retransmit times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; /* * If timing a segment in this window, stop the timer. */ tp->t_rtt = 0; /* * Close the congestion window down to one segment * (we'll open it by one segment for each ack we get). * Since we probably have a window's worth of unacked * data accumulated, this "slow start" keeps us from * dumping all that data as back-to-back packets (which * might overwhelm an intermediate gateway). 
* * There are two phases to the opening: Initially we * open by one mss on each ack. This makes the window * size increase exponentially with time. If the * window is larger than the path can handle, this * exponential growth results in dropped packet(s) * almost immediately. To get more time between * drops but still "push" the network to take advantage * of improving conditions, we switch from exponential * to linear window opening at some threshhold size. * For a threshhold, we use half the current window * size, truncated to a multiple of the mss. * * (the minimum cwnd that will give us exponential * growth is 2 mss. We don't allow the threshhold * to go below this.) */ { u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_cwnd = tp->t_maxseg; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_dupacks = 0; } (void) tcp_output(tp); break; /* * Persistance timer into zero window. * Force a byte to be output, if possible. */ case TCPT_PERSIST: g_tcpstat.tcps_persisttimeo++; /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (tp->t_idle >= g_tcp_maxpersistidle || tp->t_idle >= TCP_REXMTVAL(tp) * g_tcp_totbackoff)) { g_tcpstat.tcps_persistdrop++; tp = tcp_drop(tp, ETIMEDOUT); break; } tcp_setpersist(tp); tp->t_force = 1; (void) tcp_output(tp); tp->t_force = 0; break; /* * Keep-alive timer went off; send something * or drop connection if idle for too long. */ case TCPT_KEEP: g_tcpstat.tcps_keeptimeo++; if (tp->t_state < TCPS_ESTABLISHED) // connection-establishment timer. goto dropit; if (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE && tp->t_state <= TCPS_CLOSE_WAIT) { // keepalive timer. 
if (tp->t_idle >= g_tcp_keepidle + g_tcp_maxidle) goto dropit; /* * Send a packet designed to force a response * if the peer is up and reachable: * either an ACK if the connection is still alive, * or an RST if the peer has closed the connection * due to timeout or reboot. * Using sequence number tp->snd_una-1 * causes the transmitted zero-length segment * to lie outside the receive window; * by the protocol spec, this requires the * correspondent TCP to respond. */ g_tcpstat.tcps_keepprobe++; tcp_respond(tp, tp->t_template, (usn_mbuf_t *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); tp->t_timer[TCPT_KEEP] = g_tcp_keepintvl; } else tp->t_timer[TCPT_KEEP] = g_tcp_keepidle; break; dropit: g_tcpstat.tcps_keepdrops++; tp = tcp_drop(tp, ETIMEDOUT); break; }
/*
 * ip_fw_chk - firewall rule-chain check for one packet.
 *
 * Walks the ordered ipfw rule chain and decides the fate of one packet.
 *
 * Parameters:
 *   pip     - in/out: pointer to the caller's IP-header pointer for a
 *             normal IP packet, or NULL for a bridged / non-IP frame (in
 *             that case *m starts with an Ethernet header).  May be
 *             updated when PULLUP_TO reallocates the mbuf.
 *   hlen    - IP header length in bytes (recomputed here for bridged
 *             packets).
 *   oif     - outgoing interface; NULL when checking an inbound packet.
 *   cookie  - in/out divert cookie: on entry, the rule number to resume
 *             from (IPFW_DIVERT_RESTART) or the divert port to ignore;
 *             written when a divert rule matches.
 *   m       - in/out mbuf chain; may be replaced by m_pullup(), and is
 *             freed and set to NULL when the packet is dropped/rejected.
 *   flow_id - in/out cached matching rule for this flow (set on match).
 *
 * Returns 0 to accept the packet (also 0 after a drop - check *m == NULL),
 * a divert port for divert rules (IPDIVERT), or a dummynet pipe number
 * tagged with 0x10000 (DUMMYNET).
 */
static int
ip_fw_chk(struct ip **pip, int hlen, struct ifnet *oif, u_int16_t *cookie,
    struct mbuf **m, struct ip_fw_chain **flow_id)
{
    struct ip_fw_chain *chain;
    struct ip_fw *rule = NULL;      /* rule that finally denies the packet */
    struct ip *ip = NULL;
    struct ifnet *const rif = (*m)->m_pkthdr.rcvif; /* receive interface */
    u_short offset;                 /* IP fragment offset (0 = first frag) */
    u_short src_port, dst_port;
#ifdef IPFW_DIVERT_RESTART
    u_int16_t skipto = *cookie;         /* rule number to resume after */
#else
    u_int16_t ignport = ntohs(*cookie); /* divert port to avoid re-diverting */
#endif

    if (pip) { /* normal ip packet */
        ip = *pip;
        offset = (ip->ip_off & IP_OFFMASK);
    } else { /* bridged or non-ip packet */
        struct ether_header *eh = mtod(*m, struct ether_header *);

        switch (ntohs(eh->ether_type)) {
        case ETHERTYPE_IP:
            /* Validate that a plausible IP header follows the Ethernet
             * header before treating this as an IP packet; anything
             * that fails is classified as non-IP. */
            if ((*m)->m_len < sizeof(struct ether_header) +
                sizeof(struct ip))
                goto non_ip;
            ip = (struct ip *)(eh + 1);
            if (ip->ip_v != IPVERSION)
                goto non_ip;
            hlen = ip->ip_hl << 2;
            if (hlen < sizeof(struct ip)) /* minimum header length */
                goto non_ip;
            /* NOTE(review): 14 is presumably the Ethernet header size;
             * the second "+ 14" looks like required payload slack —
             * confirm against the bridging code. */
            if ((*m)->m_len < 14 + hlen + 14) {
                printf("-- m_len %d, need more...\n", (*m)->m_len);
                goto non_ip;
            }
            offset = (ip->ip_off & IP_OFFMASK);
            break;
        default:
        non_ip:
            ip = NULL;
            break;
        }
    }

    if (*flow_id) {
        /* Packet was already classified on a previous pass. */
        if (fw_one_pass)
            return 0; /* accept if passed first test */
        /*
         * pkt has already been tagged. Look for the next rule
         * to restart processing
         */
        if ((chain = (*flow_id)->rule->next_rule_ptr) == NULL)
            chain = (*flow_id)->rule->next_rule_ptr =
                lookup_next_rule(*flow_id);
        if (!chain)
            goto dropit;
    } else {
        chain = LIST_FIRST(&ip_fw_chain);
#ifdef IPFW_DIVERT_RESTART
        if (skipto) {
            /*
             * If we've been asked to start at a given rule
             * immediately, do so.
             */
            if (skipto >= 65535)
                goto dropit;
            while (chain && (chain->rule->fw_number <= skipto)) {
                chain = LIST_NEXT(chain, chain);
            }
            if (!chain)
                goto dropit;
        }
#endif /* IPFW_DIVERT_RESTART */
    }
    *cookie = 0;

    /* Main rule-matching loop; "continue" means "next rule". */
    for (; chain; chain = LIST_NEXT(chain, chain)) {
        register struct ip_fw *f;
again:
        f = chain->rule;

        if (oif) {
            /* Check direction outbound */
            if (!(f->fw_flg & IP_FW_F_OUT))
                continue;
        } else {
            /* Check direction inbound */
            if (!(f->fw_flg & IP_FW_F_IN))
                continue;
        }

        if (ip == NULL) {
            /*
             * do relevant checks for non-ip packets:
             * after this, only goto got_match or continue
             */
            struct ether_header *eh = mtod(*m, struct ether_header *);
            int i, h, l; /* NOTE(review): unused in this branch */
#if 0
            printf("-- ip_fw: rule %d(%d) for %6D <- %6D type 0x%04x\n",
                f->fw_number, IP_FW_GETNSRCP(f),
                eh->ether_dhost, ".",
                eh->ether_shost, ".",
                ntohs(eh->ether_type));
#endif
            /*
             * make default rule always match or we have a panic
             */
            if (f->fw_number == 65535)
                goto got_match;
            /*
             * temporary hack:
             * udp from 0.0.0.0 means this rule applies.
             * 1 src port is match ether type
             * 2 src ports (interval) is match ether type
             * 3 src ports is match ether address
             */
            if (f->fw_src.s_addr != 0 || f->fw_prot != IPPROTO_UDP)
                continue;
            switch (IP_FW_GETNSRCP(f)) {
            case 1: /* match one type */
                if ( /* ( (f->fw_flg & IP_FW_F_INVSRC) != 0) ^ */
                    (f->fw_pts[0] == ntohs(eh->ether_type))) {
                    printf("match!\n");
                    goto got_match;
                }
            default:
                break;
            }
            continue;
        }

        /* Fragments: rule applies to fragments only if F_FRAG set. */
        if ((f->fw_flg & IP_FW_F_FRAG) && !(ip->ip_off & IP_OFFMASK))
            continue;

        /* If src-addr doesn't match, not this rule.
         * (XOR with INVSRC implements the "not" modifier.) */
        if (((f->fw_flg & IP_FW_F_INVSRC) != 0) ^
            ((ip->ip_src.s_addr & f->fw_smsk.s_addr) != f->fw_src.s_addr))
            continue;

        /* If dest-addr doesn't match, not this rule. */
        if (((f->fw_flg & IP_FW_F_INVDST) != 0) ^
            ((ip->ip_dst.s_addr & f->fw_dmsk.s_addr) != f->fw_dst.s_addr))
            continue;

        /* Interface check */
        if ((f->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) {
            struct ifnet *const iface = oif ? oif : rif;

            /* Backwards compatibility hack for "via" */
            if (!iface || !iface_match(iface, &f->fw_in_if,
                f->fw_flg & IP_FW_F_OIFNAME))
                continue;
        } else {
            /* Check receive interface */
            if ((f->fw_flg & IP_FW_F_IIFACE) &&
                (!rif || !iface_match(rif, &f->fw_in_if,
                    f->fw_flg & IP_FW_F_IIFNAME)))
                continue;
            /* Check outgoing interface */
            if ((f->fw_flg & IP_FW_F_OIFACE) &&
                (!oif || !iface_match(oif, &f->fw_out_if,
                    f->fw_flg & IP_FW_F_OIFNAME)))
                continue;
        }

        /* Check IP options */
        if (f->fw_ipopt != f->fw_ipnopt && !ipopts_match(ip, f))
            continue;

        /* Check protocol; if wildcard, match */
        if (f->fw_prot == IPPROTO_IP)
            goto got_match;
        /* If different, don't match */
        if (ip->ip_p != f->fw_prot)
            continue;

/*
 * Ensure at least 'len' bytes are contiguous in the first mbuf; on
 * m_pullup failure the packet is treated as a bogus fragment.  Note
 * this may move the data, so ip / *pip / offset are refreshed.
 */
#define PULLUP_TO(len) do {                                     \
            if ((*m)->m_len < (len)) {                          \
                if ((*m = m_pullup(*m, (len))) == 0)            \
                    goto bogusfrag;                             \
                *pip = ip = mtod(*m, struct ip *);              \
                offset = (ip->ip_off & IP_OFFMASK);             \
            }                                                   \
        } while (0)

        /* Protocol specific checks */
        switch (ip->ip_p) {
        case IPPROTO_TCP: {
            struct tcphdr *tcp;

            if (offset == 1) /* cf. RFC 1858 */
                goto bogusfrag;
            if (offset != 0) {
                /*
                 * TCP flags and ports aren't available in this
                 * packet -- if this rule specified either one,
                 * we consider the rule a non-match.
                 */
                if (f->fw_nports != 0 ||
                    f->fw_tcpf != f->fw_tcpnf)
                    continue;
                break;
            }
            PULLUP_TO(hlen + 14);
            tcp = (struct tcphdr *) ((u_long *)ip + ip->ip_hl);
            if (f->fw_tcpf != f->fw_tcpnf && !tcpflg_match(tcp, f))
                continue;
            src_port = ntohs(tcp->th_sport);
            dst_port = ntohs(tcp->th_dport);
            /* shares the port comparison with the UDP case below */
            goto check_ports;
        }

        case IPPROTO_UDP: {
            struct udphdr *udp;

            if (offset != 0) {
                /*
                 * Port specification is unavailable -- if this
                 * rule specifies a port, we consider the rule
                 * a non-match.
                 */
                if (f->fw_nports != 0)
                    continue;
                break;
            }
            PULLUP_TO(hlen + 4);
            udp = (struct udphdr *) ((u_long *)ip + ip->ip_hl);
            src_port = ntohs(udp->uh_sport);
            dst_port = ntohs(udp->uh_dport);
check_ports:
            /* source ports are fw_pts[0 .. NSRCP-1], dest ports follow */
            if (!port_match(&f->fw_pts[0], IP_FW_GETNSRCP(f),
                src_port, f->fw_flg & IP_FW_F_SRNG))
                continue;
            if (!port_match(&f->fw_pts[IP_FW_GETNSRCP(f)],
                IP_FW_GETNDSTP(f), dst_port,
                f->fw_flg & IP_FW_F_DRNG))
                continue;
            break;
        }

        case IPPROTO_ICMP: {
            struct icmp *icmp;

            if (offset != 0) /* Type isn't valid */
                break;
            PULLUP_TO(hlen + 2);
            icmp = (struct icmp *) ((u_long *)ip + ip->ip_hl);
            if (!icmptype_match(icmp, f))
                continue;
            break;
        }
#undef PULLUP_TO
bogusfrag:
            /* Malformed/suspicious fragment: log (if enabled) and drop. */
            if (fw_verbose)
                ipfw_report(NULL, ip, rif, oif);
            goto dropit;
        }

got_match:
        *flow_id = chain; /* XXX set flow id */
#ifndef IPFW_DIVERT_RESTART
        /* Ignore divert/tee rule if socket port is "ignport" */
        switch (f->fw_flg & IP_FW_F_COMMAND) {
        case IP_FW_F_DIVERT:
        case IP_FW_F_TEE:
            if (f->fw_divert_port == ignport)
                continue; /* ignore this rule */
            break;
        }
#endif /* IPFW_DIVERT_RESTART */

        /* Update statistics */
        f->fw_pcnt += 1;
        /*
         * note -- bridged-ip packets still have some fields
         * in network order, including ip_len
         */
        if (ip) {
            if (pip)
                f->fw_bcnt += ip->ip_len;
            else
                f->fw_bcnt += ntohs(ip->ip_len);
        }
        f->timestamp = time.tv_sec;

        /* Log to console if desired */
        if ((f->fw_flg & IP_FW_F_PRN) && fw_verbose)
            ipfw_report(f, ip, rif, oif);

        /* Take appropriate action */
        switch (f->fw_flg & IP_FW_F_COMMAND) {
        case IP_FW_F_ACCEPT:
            return (0);
        case IP_FW_F_COUNT:
            continue; /* count-only rule: keep searching */
#ifdef IPDIVERT
        case IP_FW_F_DIVERT:
#ifdef IPFW_DIVERT_RESTART
            *cookie = f->fw_number;
#else
            *cookie = htons(f->fw_divert_port);
#endif /* IPFW_DIVERT_RESTART */
            return (f->fw_divert_port);
#endif
        case IP_FW_F_TEE:
            /*
             * XXX someday tee packet here, but beware that you
             * can't use m_copym() or m_copypacket() because
             * the divert input routine modifies the mbuf
             * (and these routines only increment reference
             * counts in the case of mbuf clusters), so need
             * to write custom routine.
             */
            continue;
        case IP_FW_F_SKIPTO: /* XXX check */
            if (f->next_rule_ptr)
                chain = f->next_rule_ptr;
            else
                chain = lookup_next_rule(chain);
            if (!chain)
                goto dropit;
            goto again;
#ifdef DUMMYNET
        case IP_FW_F_PIPE:
            /* pipe number is tagged with 0x10000 to distinguish it */
            return (f->fw_pipe_nr | 0x10000);
#endif
        }

        /* Deny/reject this packet using this rule */
        rule = f;
        break;
    }

#ifdef DIAGNOSTIC
    /* Rule 65535 should always be there and should always match */
    if (!chain)
        panic("ip_fw: chain");
#endif

    /*
     * At this point, we're going to drop the packet.
     * Send a reject notice if all of the following are true:
     *
     * - The packet matched a reject rule
     * - The packet is not an ICMP packet, or is an ICMP query packet
     * - The packet is not a multicast or broadcast packet
     */
    if ((rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_REJECT &&
        ip &&
        (ip->ip_p != IPPROTO_ICMP || is_icmp_query(ip)) &&
        !((*m)->m_flags & (M_BCAST | M_MCAST)) &&
        !IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
        switch (rule->fw_reject_code) {
        case IP_FW_REJECT_RST: {
            /* Build a TCP RST in place over the dropped segment. */
            struct tcphdr *const tcp =
                (struct tcphdr *) ((u_long *)ip + ip->ip_hl);
            struct tcpiphdr ti, *const tip = (struct tcpiphdr *) ip;

            /* Never RST a fragment or an incoming RST. */
            if (offset != 0 || (tcp->th_flags & TH_RST))
                break;
            ti.ti_i = *((struct ipovly *) ip);
            ti.ti_t = *tcp;
            bcopy(&ti, ip, sizeof(ti));
            NTOHL(tip->ti_seq);
            NTOHL(tip->ti_ack);
            tip->ti_len = ip->ip_len - hlen - (tip->ti_off << 2);
            if (tcp->th_flags & TH_ACK) {
                tcp_respond(NULL, tip, *m,
                    (tcp_seq)0, ntohl(tcp->th_ack), TH_RST);
            } else {
                if (tcp->th_flags & TH_SYN)
                    tip->ti_len++;
                tcp_respond(NULL, tip, *m,
                    tip->ti_seq + tip->ti_len, (tcp_seq)0,
                    TH_RST | TH_ACK);
            }
            /* tcp_respond consumed the mbuf */
            *m = NULL;
            break;
        }
        default: /* Send an ICMP unreachable using code */
            icmp_error(*m, ICMP_UNREACH, rule->fw_reject_code, 0L, 0);
            /* icmp_error consumed the mbuf */
            *m = NULL;
            break;
        }
    }

dropit:
    /*
     * Finally, drop the packet.
     */
    if (*m) {
        m_freem(*m);
        *m = NULL;
    }
    return (0);
}