/* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == 0, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection tp->t_template. If flags are given * then we send a message back to the TCP which originated the * segment ti, and discard the mbuf containing it and any other * attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. */ void tcp_respond(struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags, unsigned short af) { register int tlen; int win = 0; DEBUG_CALL("tcp_respond"); DEBUG_ARG("tp = %p", tp); DEBUG_ARG("ti = %p", ti); DEBUG_ARG("m = %p", m); DEBUG_ARG("ack = %u", ack); DEBUG_ARG("seq = %u", seq); DEBUG_ARG("flags = %x", flags); if (tp) win = sbspace(&tp->t_socket->so_rcv); if (m == NULL) { if (!tp || (m = m_get(tp->t_socket->slirp)) == NULL) return; tlen = 0; m->m_data += IF_MAXLINKHDR; *mtod(m, struct tcpiphdr *) = *ti; ti = mtod(m, struct tcpiphdr *); memset(&ti->ti, 0, sizeof(ti->ti)); flags = TH_ACK; } else {
int sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *mlast, *n; int space = 0; if (control == NULL) panic("sbappendcontrol"); for (m = control; ; m = m->m_next) { space += m->m_len; if (m->m_next == NULL) break; } n = m; /* save pointer to last control buffer */ for (m = m0; m; m = m->m_next) space += m->m_len; if (space > sbspace(sb)) return (0); n->m_next = m0; /* concatenate data to control */ SBLASTRECORDCHK(sb, "sbappendcontrol 1"); for (m = control; m->m_next != NULL; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb, "sbappendcontrol"); SBLASTRECORDCHK(sb, "sbappendcontrol 2"); return (1); }
/* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == 0, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection tp->t_template. If flags are given * then we send a message back to the TCP which originated the * segment ti, and discard the mbuf containing it and any other * attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. */ void tcp_respond(struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { int tlen; int win = 0; DEBUG_CALL("tcp_respond"); DEBUG_ARG("tp = %lx", (long)tp); DEBUG_ARG("ti = %lx", (long)ti); DEBUG_ARG("m = %lx", (long)m); DEBUG_ARG("ack = %u", ack); DEBUG_ARG("seq = %u", seq); DEBUG_ARG("flags = %x", flags); if (tp) win = sbspace(&tp->t_socket->so_rcv); if (m == 0) { if ((m = m_get()) == NULL) return; #ifdef TCP_COMPAT_42 tlen = 1; #else tlen = 0; #endif m->m_data += if_maxlinkhdr; *mtod(m, struct tcpiphdr *) = *ti; ti = mtod(m, struct tcpiphdr *); flags = TH_ACK; } else {
/* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == 0, then we make a copy * of the tcpiphdr at ti and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection tp->t_template. If flags are given * then we send a message back to the TCP which originated the * segment ti, and discard the mbuf containing it and any other * attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. */ void tcp_respond(PNATState pData, struct tcpcb *tp, struct tcpiphdr *ti, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { register int tlen; int win = 0; LogFlowFunc(("ENTER: tp = %R[tcpcb793], ti = %lx, m = %lx, ack = %u, seq = %u, flags = %x\n", tp, (long)ti, (long)m, ack, seq, flags)); if (tp) win = sbspace(&tp->t_socket->so_rcv); if (m == 0) { if ((m = m_gethdr(pData, M_DONTWAIT, MT_HEADER)) == NULL) return; #ifdef TCP_COMPAT_42 tlen = 1; #else tlen = 0; #endif m->m_data += if_maxlinkhdr; m->m_pkthdr.header = mtod(m, void *); *mtod(m, struct tcpiphdr *) = *ti; ti = mtod(m, struct tcpiphdr *); flags = TH_ACK; } else {
static int rfcomm_attach(struct socket *so, int proto) { int error; KASSERT(so->so_pcb == NULL); if (so->so_lock == NULL) { mutex_obj_hold(bt_lock); so->so_lock = bt_lock; solock(so); } KASSERT(solocked(so)); /* * Since we have nothing to add, we attach the DLC * structure directly to our PCB pointer. */ error = soreserve(so, rfcomm_sendspace, rfcomm_recvspace); if (error) return error; error = rfcomm_attach_pcb((struct rfcomm_dlc **)&so->so_pcb, &rfcomm_proto, so); if (error) return error; error = rfcomm_rcvd_pcb(so->so_pcb, sbspace(&so->so_rcv)); if (error) { rfcomm_detach_pcb((struct rfcomm_dlc **)&so->so_pcb); return error; } return 0; }
int sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *n, *mlast; int space; SOCKBUF_LOCK_ASSERT(sb); if (control == 0) panic("sbappendcontrol_locked"); space = m_length(control, &n) + m_length(m0, NULL); if (space > sbspace(sb)) return (0); n->m_next = m0; /* concatenate data to control */ SBLASTRECORDCHK(sb); for (m = control; m->m_next; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); }
/*
 * Store in *count the space currently free in the receive buffer of the
 * fifo's read-side socket.  The socket is locked around the query.
 * Always returns 0.
 *
 * The caller is expected to hold an iocount on the vnode.
 */
int
fifo_freespace(struct vnode *vp, long *count)
{
	struct socket *reader = vp->v_fifoinfo->fi_readsock;

	socket_lock(reader, 1);
	*count = sbspace(&reader->so_rcv);
	socket_unlock(reader, 1);

	return 0;
}
/* * natmintr: interrupt * * Note: we expect a socket pointer in rcvif rather than an interface * pointer. We can get the interface pointer from the so's PCB if we really * need it. */ void natmintr(struct mbuf *m) { struct socket *so; struct natmpcb *npcb; #ifdef DIAGNOSTIC M_ASSERTPKTHDR(m); #endif NATM_LOCK(); npcb = (struct natmpcb *)m->m_pkthdr.rcvif; /* XXX: overloaded */ so = npcb->npcb_socket; npcb->npcb_inq--; if (npcb->npcb_flags & NPCB_DRAIN) { if (npcb->npcb_inq == 0) free(npcb, M_PCB); /* done! */ NATM_UNLOCK(); m_freem(m); return; } if (npcb->npcb_flags & NPCB_FREE) { NATM_UNLOCK(); m_freem(m); /* drop */ return; } #ifdef NEED_TO_RESTORE_IFP m->m_pkthdr.rcvif = npcb->npcb_ifp; #else #ifdef DIAGNOSTIC m->m_pkthdr.rcvif = NULL; /* null it out to be safe */ #endif #endif if (sbspace(&so->so_rcv) > m->m_pkthdr.len) { #ifdef NATM_STAT natm_sookcnt++; natm_sookbytes += m->m_pkthdr.len; #endif sbappendrecord(&so->so_rcv, m); sorwakeup(so); NATM_UNLOCK(); } else { #ifdef NATM_STAT natm_sodropcnt++; natm_sodropbytes += m->m_pkthdr.len; #endif NATM_UNLOCK(); m_freem(m); } }
/*
 * Tell the RFCOMM DLC attached to the socket how much receive buffer space
 * is now available.  The socket must be locked.  The flags and l arguments
 * are not used.
 *
 * Returns EINVAL when the socket has no PCB, otherwise whatever
 * rfcomm_rcvd_pcb() returns.
 */
static int
rfcomm_rcvd(struct socket *so, int flags, struct lwp *l)
{
	struct rfcomm_dlc *dlc;

	dlc = so->so_pcb;
	KASSERT(solocked(so));

	if (dlc != NULL)
		return rfcomm_rcvd_pcb(dlc, sbspace(&so->so_rcv));

	return EINVAL;
}
/*
 * Write-side knote filter for a fifo.
 *
 * kn->kn_hook is treated as the socket.  kn_data is set to the free space
 * in the send buffer.  If the socket can no longer send, EV_EOF is set and
 * the filter fires; otherwise EV_EOF is cleared and the filter fires when
 * the free space reaches the send low-water mark.  hint is not used.
 */
int
filt_fifowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_hook;
	int ready;

	kn->kn_data = sbspace(&so->so_snd);

	if ((so->so_state & SS_CANTSENDMORE) != 0) {
		kn->kn_flags |= EV_EOF;
		ready = 1;
	} else {
		kn->kn_flags &= ~EV_EOF;
		ready = (kn->kn_data >= so->so_snd.sb_lowat);
	}

	return (ready);
}
/* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ int sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *n, *nlast; int space = asa->sa_len; if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr"); if (m0) space += m0->m_pkthdr.len; for (n = control; n; n = n->m_next) { space += n->m_len; if (n->m_next == NULL) /* keep pointer to last control buf */ break; } if (space > sbspace(sb)) return (0); if (asa->sa_len > MLEN) return (0); MGET(m, M_DONTWAIT, MT_SONAME); if (m == NULL) return (0); m->m_len = asa->sa_len; memcpy(mtod(m, caddr_t), asa, asa->sa_len); if (n) n->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; SBLASTRECORDCHK(sb, "sbappendaddr 1"); for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb, "sbappendaddr"); SBLASTRECORDCHK(sb, "sbappendaddr 2"); return (1); }
/* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *n, *nlast; int space = asa->sa_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &n); if (space > sbspace(sb)) return (0); #if MSIZE <= 256 if (asa->sa_len > MLEN) return (0); #endif MGET(m, M_DONTWAIT, MT_SONAME); if (m == 0) return (0); m->m_len = asa->sa_len; bcopy(asa, mtod(m, caddr_t), asa->sa_len); if (n){ CHECK_ADD_LINKCNT(n, m0, NULL, "sbappendaddr_locked"); n->m_next = m0; /* concatenate data to control */ }else control = m0; CHECK_ADD_LINKCNT(m, control, NULL, "sbappendaddr_locked"); m->m_next = control; for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); }
/*
 * Deliver an incoming SCO packet to the socket passed as arg.
 *
 * SCO data is time sensitive, so a full receive buffer is handled by
 * discarding queued records until the new packet fits; the packet is then
 * appended as a record and the reader is woken.
 *
 * NOTE(review): the loop only ends once the packet fits.  It presumably
 * relies on a packet never being larger than an empty receive buffer --
 * confirm against the reserved buffer size.
 */
static void
sco_input(void *arg, struct mbuf *m)
{
	struct socket *so = arg;

	/* Make room by dropping queued records until the packet fits. */
	while (sbspace(&so->so_rcv) < m->m_pkthdr.len) {
		sbdroprecord(&so->so_rcv);
	}

	DPRINTFN(10, "received %d bytes\n", m->m_pkthdr.len);

	sbappendrecord(&so->so_rcv, m);
	sorwakeup(so);
}
key_receive(struct socket *so, struct mbuf **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) #endif { struct rawcb *rp = sotorawcb(so); struct keycb *kp = (struct keycb *)rp; int error; #ifndef __FreeBSD__ error = (*kp->kp_receive)(so, paddr, uio, mp0, controlp, flagsp); #else error = soreceive(so, paddr, uio, mp0, controlp, flagsp); #endif if (kp->kp_queue && sbspace(&rp->rcb_socket->so_rcv) > kp->kp_queue->m_pkthdr.len) sorwakeup(so); return error; }
/* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; int space = asa->sa_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &ctrl_last); if (space > sbspace(sb)) return (0); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); }
/*
 * Write-side knote filter for a fifo.
 *
 * kn->kn_hook is treated as the socket.  Unless hint is NOTE_SUBMIT the
 * socket is locked for the duration of the check.  kn_data is set to the
 * free space in the send buffer.  If the socket can no longer send, EV_EOF
 * is set and the filter fires; otherwise EV_EOF is cleared and the filter
 * fires when the free space reaches the send low-water mark.
 */
static int
filt_fifowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_hook;
	const int need_lock = (hint != NOTE_SUBMIT);
	int ready;

	if (need_lock)
		solock(so);

	kn->kn_data = sbspace(&so->so_snd);
	if ((so->so_state & SS_CANTSENDMORE) == 0) {
		kn->kn_flags &= ~EV_EOF;
		ready = (kn->kn_data >= so->so_snd.sb_lowat);
	} else {
		kn->kn_flags |= EV_EOF;
		ready = 1;
	}

	if (need_lock)
		sounlock(so);

	return ready;
}
/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * Parameters:
 *   so      - socket to send on; data is appended to so->so_snd.
 *   flags   - PRUS_* bits; this routine tests PRUS_OOB, PRUS_EOF and
 *             PRUS_MORETOCOME.
 *   m       - data mbuf chain; appended to the send buffer or freed.
 *   nam     - optional destination; when non-NULL and the connection state
 *             is below TCPS_SYN_SENT an implied connect is attempted.
 *   control - optional control chain; always freed here, and rejected if
 *             its first mbuf has a non-zero m_len.
 *   td      - passed through to tcp_connect()/tcp6_connect().
 *
 * Returns 0 or an error: ECONNRESET when the inpcb is flagged INP_TIMEWAIT
 * or INP_DROPPED, EINVAL for non-empty control data, ENOBUFS when an OOB
 * send finds sbspace() below -512, or whatever the connect or
 * tcp_output_send() call returned.
 */
static int
tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	int headlocked = 0;	/* non-zero while V_tcbinfo is write-locked here */
#ifdef INET6
	int isipv6;
#endif
	TCPDEBUG0;

	/*
	 * We require the pcbinfo lock in two cases:
	 *
	 * (1) An implied connect is taking place, which can result in
	 *     binding IPs and ports and hence modification of the pcb hash
	 *     chains.
	 *
	 * (2) PRUS_EOF is set, resulting in explicit close on the send.
	 */
	if ((nam != NULL) || (flags & PRUS_EOF)) {
		INP_INFO_WLOCK(&V_tcbinfo);
		headlocked = 1;
	}
	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
	INP_WLOCK(inp);
	/* Connection already gone: we own both chains, so free them. */
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
#ifdef INET6
	isipv6 = nam && nam->sa_family == AF_INET6;
#endif /* INET6 */
	tp = intotcpcb(inp);
	TCPDEBUG1();
	if (control) {
		/* TCP doesn't do control messages (rights, creds, etc) */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		/* Normal data: queue it first, then connect/close/output. */
		sbappendstream(&so->so_snd, m);
		if (nam && tp->t_state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected,
			 * initialize window to default value, and
			 * initialize maxseg/maxopd using peer's cached
			 * MSS.
			 */
			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
#ifdef INET6
			if (isipv6)
				error = tcp6_connect(tp, nam, td);
			else
#endif /* INET6 */
			error = tcp_connect(tp, nam, td);
			if (error)
				goto out;
			tp->snd_wnd = TTCP_CLIENT_SND_WND;
			tcp_mss(tp, -1);
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
			socantsendmore(so);
			tcp_usrclosed(tp);
		}
		/* The pcbinfo lock is released before output is attempted. */
		if (headlocked) {
			INP_INFO_WUNLOCK(&V_tcbinfo);
			headlocked = 0;
		}
		if (!(inp->inp_flags & INP_DROPPED)) {
			/* TF_MORETOCOME is set only around this output call. */
			if (flags & PRUS_MORETOCOME)
				tp->t_flags |= TF_MORETOCOME;
			error = tcp_output_send(tp);
			if (flags & PRUS_MORETOCOME)
				tp->t_flags &= ~TF_MORETOCOME;
		}
	} else {
		/*
		 * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
		 */
		SOCKBUF_LOCK(&so->so_snd);
		/* Refuse OOB data once the buffer is over-committed by > 512. */
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappendstream_locked(&so->so_snd, m);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && tp->t_state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected,
			 * initialize window to default value, and
			 * initialize maxseg/maxopd using peer's cached
			 * MSS.
			 */
			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
#ifdef INET6
			if (isipv6)
				error = tcp6_connect(tp, nam, td);
			else
#endif /* INET6 */
			error = tcp_connect(tp, nam, td);
			if (error)
				goto out;
			tp->snd_wnd = TTCP_CLIENT_SND_WND;
			tcp_mss(tp, -1);
			INP_INFO_WUNLOCK(&V_tcbinfo);
			headlocked = 0;
		} else if (nam) {
			/* Lock was taken for nam but no connect was needed. */
			INP_INFO_WUNLOCK(&V_tcbinfo);
			headlocked = 0;
		}
		/* Urgent pointer: one past everything now queued to send. */
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		/* TF_FORCEDATA is set only around this output call. */
		tp->t_flags |= TF_FORCEDATA;
		error = tcp_output_send(tp);
		tp->t_flags &= ~TF_FORCEDATA;
	}
out:
	TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
	    ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
	INP_WUNLOCK(inp);
	/* Error paths (and OOB with only PRUS_EOF) may still hold the lock. */
	if (headlocked)
		INP_INFO_WUNLOCK(&V_tcbinfo);
	return (error);
}
/* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ void tcp_input(struct mbuf *m, int iphlen, struct socket *inso) { struct ip save_ip, *ip; register struct tcpiphdr *ti; caddr_t optp = NULL; int optlen = 0; int len, tlen, off; register struct tcpcb *tp = NULL; register int tiflags; struct socket *so = NULL; int todrop, acked, ourfinisacked, needoutput = 0; int iss = 0; u_long tiwin; int ret; struct ex_list *ex_ptr; Slirp *slirp; DEBUG_CALL("tcp_input"); DEBUG_ARGS((dfd, " m = %8lx iphlen = %2d inso = %lx\n", (long )m, iphlen, (long )inso )); /* * If called with m == 0, then we're continuing the connect */ if (m == NULL) { so = inso; slirp = so->slirp; /* Re-set a few variables */ tp = sototcpcb(so); m = so->so_m; so->so_m = NULL; ti = so->so_ti; tiwin = ti->ti_win; tiflags = ti->ti_flags; goto cont_conn; } slirp = m->slirp; /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ ti = mtod(m, struct tcpiphdr *); if (iphlen > sizeof(struct ip )) { ip_stripoptions(m, (struct mbuf *)0); iphlen=sizeof(struct ip ); } /* XXX Check if too short */ /* * Save a copy of the IP header in case we want restore it * for sending an ICMP error message in response. */ ip=mtod(m, struct ip *); save_ip = *ip; save_ip.ip_len+= iphlen; /* * Checksum extended TCP header and data. */ tlen = ((struct ip *)ti)->ip_len; tcpiphdr2qlink(ti)->next = tcpiphdr2qlink(ti)->prev = NULL; memset(&ti->ti_i.ih_mbuf, 0 , sizeof(struct mbuf_ptr)); ti->ti_x1 = 0; ti->ti_len = htons((uint16_t)tlen); len = sizeof(struct ip ) + tlen; if(cksum(m, len)) { goto drop; } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. 
XXX */ off = ti->ti_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { goto drop; } tlen -= off; ti->ti_len = tlen; if (off > sizeof (struct tcphdr)) { optlen = off - sizeof (struct tcphdr); optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr); } tiflags = ti->ti_flags; /* * Convert TCP protocol specific fields to host format. */ NTOHL(ti->ti_seq); NTOHL(ti->ti_ack); NTOHS(ti->ti_win); NTOHS(ti->ti_urp); /* * Drop TCP, IP headers and TCP options. */ m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); /* * Locate pcb for segment. */ findso: so = slirp->tcp_last_so; if (so->so_fport != ti->ti_dport || so->so_lport != ti->ti_sport || so->so_laddr.s_addr != ti->ti_src.s_addr || so->so_faddr.s_addr != ti->ti_dst.s_addr) { so = solookup(&slirp->tcb, ti->ti_src, ti->ti_sport, ti->ti_dst, ti->ti_dport); if (so) slirp->tcp_last_so = so; } /* * If the state is CLOSED (i.e., TCB does not exist) then * all data in the incoming segment is discarded. * If the TCB exists but is in CLOSED state, it is embryonic, * but should either do a listen or a connect soon. * * state == CLOSED means we've done socreate() but haven't * attached it to a protocol yet... * * XXX If a TCB does not exist, and the TH_SYN flag is * the only flag set, then create a session, mark it * as if it was LISTENING, and continue... */ if (so == NULL) { if (slirp->restricted) { /* Any hostfwds will have an existing socket, so we only get here * for non-hostfwd connections. These should be dropped, unless it * happens to be a guestfwd. 
*/ for (ex_ptr = slirp->exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) { if (ex_ptr->ex_fport == ti->ti_dport && ti->ti_dst.s_addr == ex_ptr->ex_addr.s_addr) { break; } } if (!ex_ptr) { goto dropwithreset; } } if ((tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) != TH_SYN) goto dropwithreset; if ((so = socreate(slirp)) == NULL) goto dropwithreset; if (tcp_attach(so) < 0) { free(so); /* Not sofree (if it failed, it's not insqued) */ goto dropwithreset; } sbreserve(&so->so_snd, TCP_SNDSPACE); sbreserve(&so->so_rcv, TCP_RCVSPACE); so->so_laddr = ti->ti_src; so->so_lport = ti->ti_sport; so->so_faddr = ti->ti_dst; so->so_fport = ti->ti_dport; if ((so->so_iptos = tcp_tos(so)) == 0) so->so_iptos = ((struct ip *)ti)->ip_tos; tp = sototcpcb(so); tp->t_state = TCPS_LISTEN; } /* * If this is a still-connecting socket, this probably * a retransmit of the SYN. Whether it's a retransmit SYN * or something else, we nuke it. */ if (so->so_state & SS_ISFCONNECTING) goto drop; tp = sototcpcb(so); /* XXX Should never fail */ if (tp == NULL) goto dropwithreset; if (tp->t_state == TCPS_CLOSED) goto drop; tiwin = ti->ti_win; /* * Segment received on connection. * Reset idle time and keep-alive timer. */ tp->t_idle = 0; if (SO_OPTIONS) tp->t_timer[TCPT_KEEP] = TCPTV_KEEPINTVL; else tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_IDLE; /* * Process options if not in LISTEN state, * else do it below (after getting remote address). */ if (optp && tp->t_state != TCPS_LISTEN) tcp_dooptions(tp, (u_char *)optp, optlen, ti); /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. 
If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * * XXX Some of these tests are not needed * eg: the tiwin == tp->snd_wnd prevents many more * predictions.. with no *real* advantage.. */ if (tp->t_state == TCPS_ESTABLISHED && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && ti->ti_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { if (ti->ti_len == 0) { if (SEQ_GT(ti->ti_ack, tp->snd_una) && SEQ_LEQ(ti->ti_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd) { /* * this is a pure ack for outstanding data. */ if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp, tp->t_rtt); acked = ti->ti_ack - tp->snd_una; sbdrop(&so->so_snd, acked); tp->snd_una = ti->ti_ack; m_free(m); /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ if (tp->snd_una == tp->snd_max) tp->t_timer[TCPT_REXMT] = 0; else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; /* * This is called because sowwakeup might have * put data into so_snd. Since we don't so sowwakeup, * we don't need this.. XXX??? */ if (so->so_snd.sb_cc) (void) tcp_output(tp); return; } } else if (ti->ti_ack == tp->snd_una && tcpfrag_list_empty(tp) && ti->ti_len <= sbspace(&so->so_rcv)) { /* * this is a pure, in-sequence data packet * with nothing on the reassembly queue and * we have enough buffer space to take it. */ tp->rcv_nxt += ti->ti_len; /* * Add data to socket buffer. */ if (so->so_emu) { if (tcp_emu(so,m)) sbappend(so, m); } else sbappend(so, m); /* * If this is a short packet, then ACK now - with Nagel * congestion avoidance sender won't send more until * he gets an ACK. 
* * It is better to not delay acks at all to maximize * TCP throughput. See RFC 2581. */ tp->t_flags |= TF_ACKNOW; tcp_output(tp); return; } } /* header prediction */ /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ { int win; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { /* * If the state is LISTEN then ignore segment if it contains an RST. * If the segment contains an ACK then it is bad and send a RST. * If it does not contain a SYN then it is not interesting; drop it. * Don't bother responding if the destination was a broadcast. * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial * tp->iss, and send a segment: * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. * Fill in remote peer address fields if not previously specified. * Enter SYN_RECEIVED state, and process any other fields of this * segment in this state. */ case TCPS_LISTEN: { if (tiflags & TH_RST) goto drop; if (tiflags & TH_ACK) goto dropwithreset; if ((tiflags & TH_SYN) == 0) goto drop; /* * This has way too many gotos... 
* But a bit of spaghetti code never hurt anybody :) */ /* * If this is destined for the control address, then flag to * tcp_ctl once connected, otherwise connect */ if ((so->so_faddr.s_addr & slirp->vnetwork_mask.s_addr) == slirp->vnetwork_addr.s_addr) { if (so->so_faddr.s_addr != slirp->vhost_addr.s_addr && so->so_faddr.s_addr != slirp->vnameserver_addr.s_addr) { /* May be an add exec */ for (ex_ptr = slirp->exec_list; ex_ptr; ex_ptr = ex_ptr->ex_next) { if(ex_ptr->ex_fport == so->so_fport && so->so_faddr.s_addr == ex_ptr->ex_addr.s_addr) { so->so_state |= SS_CTL; break; } } if (so->so_state & SS_CTL) { goto cont_input; } } /* CTL_ALIAS: Do nothing, tcp_fconnect will be called on it */ } if (so->so_emu & EMU_NOCONNECT) { so->so_emu &= ~EMU_NOCONNECT; goto cont_input; } if ((tcp_fconnect(so) == -1) && #if defined(_WIN32) socket_error() != WSAEWOULDBLOCK #else (errno != EINPROGRESS) && (errno != EWOULDBLOCK) #endif ) { u_char code=ICMP_UNREACH_NET; DEBUG_MISC((dfd, " tcp fconnect errno = %d-%s\n", errno,strerror(errno))); if(errno == ECONNREFUSED) { /* ACK the SYN, send RST to refuse the connection */ tcp_respond(tp, ti, m, ti->ti_seq+1, (tcp_seq)0, TH_RST|TH_ACK); } else { if(errno == EHOSTUNREACH) code=ICMP_UNREACH_HOST; HTONL(ti->ti_seq); /* restore tcp header */ HTONL(ti->ti_ack); HTONS(ti->ti_win); HTONS(ti->ti_urp); m->m_data -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->m_len += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); *ip=save_ip; icmp_error(m, ICMP_UNREACH,code, 0,strerror(errno)); } tcp_close(tp); m_free(m); } else { /* * Haven't connected yet, save the current mbuf * and ti, and return * XXX Some OS's don't tell us whether the connect() * succeeded or not. So we must time it out. 
*/ so->so_m = m; so->so_ti = ti; tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; tp->t_state = TCPS_SYN_RECEIVED; tcp_template(tp); } return; cont_conn: /* m==NULL * Check if the connect succeeded */ if (so->so_state & SS_NOFDREF) { tp = tcp_close(tp); goto dropwithreset; } cont_input: tcp_template(tp); if (optp) tcp_dooptions(tp, (u_char *)optp, optlen, ti); if (iss) tp->iss = iss; else tp->iss = slirp->tcp_iss; slirp->tcp_iss += TCP_ISSINCR/2; tp->irs = ti->ti_seq; tcp_sendseqinit(tp); tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; tp->t_state = TCPS_SYN_RECEIVED; tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; goto trimthenstep6; } /* case TCPS_LISTEN */ /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((tiflags & TH_ACK) && (SEQ_LEQ(ti->ti_ack, tp->iss) || SEQ_GT(ti->ti_ack, tp->snd_max))) goto dropwithreset; if (tiflags & TH_RST) { if (tiflags & TH_ACK) { tcp_drop(tp, 0); /* XXX Check t_softerror! */ } goto drop; } if ((tiflags & TH_SYN) == 0) goto drop; if (tiflags & TH_ACK) { tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; } tp->t_timer[TCPT_REXMT] = 0; tp->irs = ti->ti_seq; tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { soisfconnected(so); tp->t_state = TCPS_ESTABLISHED; (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0); /* * if we didn't have to retransmit the SYN, * use its rtt as our initial srtt & rtt var. 
*/ if (tp->t_rtt) tcp_xmit_timer(tp, tp->t_rtt); } else tp->t_state = TCPS_SYN_RECEIVED; trimthenstep6: /* * Advance ti->ti_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ ti->ti_seq++; if (ti->ti_len > tp->rcv_wnd) { todrop = ti->ti_len - tp->rcv_wnd; m_adj(m, -todrop); ti->ti_len = tp->rcv_wnd; tiflags &= ~TH_FIN; } tp->snd_wl1 = ti->ti_seq - 1; tp->rcv_up = ti->ti_seq; goto step6; } /* switch tp->t_state */ /* * States other than LISTEN or SYN_SENT. * Check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ todrop = tp->rcv_nxt - ti->ti_seq; if (todrop > 0) { if (tiflags & TH_SYN) { tiflags &= ~TH_SYN; ti->ti_seq++; if (ti->ti_urp > 1) ti->ti_urp--; else tiflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > ti->ti_len || (todrop == ti->ti_len && (tiflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ tiflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = ti->ti_len; } m_adj(m, todrop); ti->ti_seq += todrop; ti->ti_len -= todrop; if (ti->ti_urp > todrop) ti->ti_urp -= todrop; else { tiflags &= ~TH_URG; ti->ti_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { tp = tcp_close(tp); goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. 
*/ todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { if (todrop >= ti->ti_len) { /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if (tiflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { iss = tp->rcv_nxt + TCP_ISSINCR; tp = tcp_close(tp); goto findso; } /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; } else { goto dropafterack; } } m_adj(m, -todrop); ti->ti_len -= todrop; tiflags &= ~(TH_PUSH|TH_FIN); } /* * If the RST bit is set examine the state: * SYN_RECEIVED STATE: * If passive open, return to LISTEN state. * If active open, inform user that connection was refused. * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: * Inform user that connection was reset, and close tcb. * CLOSING, LAST_ACK, TIME_WAIT STATES * Close the tcb. */ if (tiflags&TH_RST) switch (tp->t_state) { case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: tp->t_state = TCPS_CLOSED; tcp_close(tp); goto drop; case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: tcp_close(tp); goto drop; } /* * If a SYN is in the window, then this is an * error and we send an RST and drop the connection. */ if (tiflags & TH_SYN) { tp = tcp_drop(tp,0); goto dropwithreset; } /* * If the ACK bit is off we drop the segment and return. */ if ((tiflags & TH_ACK) == 0) goto drop; /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state if the ack ACKs our SYN then enter * ESTABLISHED state and continue processing, otherwise * send an RST. 
una<=ack<=max */ case TCPS_SYN_RECEIVED: if (SEQ_GT(tp->snd_una, ti->ti_ack) || SEQ_GT(ti->ti_ack, tp->snd_max)) goto dropwithreset; tp->t_state = TCPS_ESTABLISHED; /* * The sent SYN is ack'ed with our sequence number +1 * The first data byte already in the buffer will get * lost if no correction is made. This is only needed for * SS_CTL since the buffer is empty otherwise. * tp->snd_una++; or: */ tp->snd_una=ti->ti_ack; if (so->so_state & SS_CTL) { /* So tcp_ctl reports the right state */ ret = tcp_ctl(so); if (ret == 1) { soisfconnected(so); so->so_state &= ~SS_CTL; /* success XXX */ } else if (ret == 2) { so->so_state &= SS_PERSISTENT_MASK; so->so_state |= SS_NOFDREF; /* CTL_CMD */ } else { needoutput = 1; tp->t_state = TCPS_FIN_WAIT_1; } } else { soisfconnected(so); } (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0); tp->snd_wl1 = ti->ti_seq - 1; /* Avoid ack processing; snd_una==ti_ack => dup ack */ goto synrx_to_est; /* fall into ... */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < ti->ti_ack <= tp->snd_max * then advance tp->snd_una to ti->ti_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { DEBUG_MISC((dfd, " dup ack m = %lx so = %lx\n", (long )m, (long )so)); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change), the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. 
* * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. */ if (tp->t_timer[TCPT_REXMT] == 0 || ti->ti_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == TCPREXMTTHRESH) { tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_timer[TCPT_REXMT] = 0; tp->t_rtt = 0; tp->snd_nxt = ti->ti_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > TCPREXMTTHRESH) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } } else tp->t_dupacks = 0; break; } synrx_to_est: /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (tp->t_dupacks > TCPREXMTTHRESH && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; if (SEQ_GT(ti->ti_ack, tp->snd_max)) { goto dropafterack; } acked = ti->ti_ack - tp->snd_una; /* * If transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. */ if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp,tp->t_rtt); /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. 
*/ if (ti->ti_ack == tp->snd_max) { tp->t_timer[TCPT_REXMT] = 0; needoutput = 1; } else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; /* * When new data is acked, open the congestion window. * If the window gives us less than ssthresh packets * in flight, open exponentially (maxseg per packet). * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); } if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; sbdrop(&so->so_snd, (int )so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. */ if (so->so_state & SS_FCANTRCVMORE) { tp->t_timer[TCPT_2MSL] = TCP_MAXIDLE; } tp->t_state = TCPS_FIN_WAIT_2; } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. 
*/ case TCPS_LAST_ACK: if (ourfinisacked) { tcp_close(tp); goto drop; } break; /* * In TIME_WAIT state the only thing that should arrive * is a retransmission of the remote FIN. Acknowledge * it and restart the finack timer. */ case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; goto dropafterack; } } /* switch(tp->t_state) */ step6: /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, ti->ti_seq) || (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))))) { tp->snd_wnd = tiwin; tp->snd_wl1 = ti->ti_seq; tp->snd_wl2 = ti->ti_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((tiflags & TH_URG) && ti->ti_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ if (ti->ti_urp + so->so_rcv.sb_cc > so->so_rcv.sb_datalen) { ti->ti_urp = 0; tiflags &= ~TH_URG; goto dodata; } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). 
*/ if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { tp->rcv_up = ti->ti_seq + ti->ti_urp; so->so_urgc = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt); /* -1; */ tp->rcv_up = ti->ti_seq + ti->ti_urp; } } else /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; dodata: /* * If this is a small packet, then ACK now - with Nagel * congestion avoidance sender won't send more until * he gets an ACK. */ if (ti->ti_len && (unsigned)ti->ti_len <= 5 && ((struct tcpiphdr_2 *)ti)->first_char == (char)27) { tp->t_flags |= TF_ACKNOW; } /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((ti->ti_len || (tiflags&TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { TCP_REASS(tp, ti, m, so, tiflags); } else { m_free(m); tiflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (tiflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * If we receive a FIN we can't send more data, * set it SS_FDRAIN * Shutdown the socket if there is no rx data in the * buffer. * soread() is called on completion of shutdown() and * will got to TCPS_LAST_ACK, and use tcp_output() * to send the FIN. */ sofwdrain(so); tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: if(so->so_emu == EMU_CTL) /* no shutdown on socket */ tp->t_state = TCPS_LAST_ACK; else tp->t_state = TCPS_CLOSE_WAIT; break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. 
*/ case TCPS_FIN_WAIT_1: tp->t_state = TCPS_CLOSING; break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; break; /* * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; break; } } /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) { (void) tcp_output(tp); } return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. */ if (tiflags & TH_RST) goto drop; m_free(m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); return; dropwithreset: /* reuses m if m!=NULL, m_free() unnecessary */ if (tiflags & TH_ACK) tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); else { if (tiflags & TH_SYN) ti->ti_len++; tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: /* * Drop space held by incoming segment and return. */ m_free(m); }
int tcp_output(struct tcpcb * tp) { struct socket * so = tp->t_inpcb->inp_socket; int len; long win; int off, flags, error; struct mbuf * m; struct tcpiphdr * ti; unsigned optlen = 0; int idle, sendalot; struct mbuf * sendm; /* mbuf which contains data to send */ struct mbuf * tcp_mbuf; /* mbuf containing TCP header */ int bufoff; /* offset of data in sendm->m_data */ #ifdef TCP_SACK int sack_resend; int sack_hole = 0; /* next sack hole to fill */ if(tp->t_flags & TF_SACKREPLY) { /* we are resending based on a received SACK header */ sack_resend = TRUE; tp->t_flags &= ~TF_SACKREPLY; /* clear flag */ } else sack_resend = FALSE; #endif /* TCP_SACK */ /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->snd_max == tp->snd_una); again: sendalot = 0; off = (int)(tp->snd_nxt - tp->snd_una); win = (long)tp->snd_wnd; /* set basic send window */ if (win > (long)tp->snd_cwnd) /* see if we need congestion control */ { win = (int)(tp->snd_cwnd & ~(ALIGN_TYPE-1)); /* keep data aligned */ } /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. 
*/ if (tp->t_force) { if (win == 0) win = 1; else { tp->t_timer[TCPT_PERSIST] = 0; tp->t_rxtshift = 0; } } #ifdef TCP_SACK /* See if we need to adjust the offset for a sack resend */ if(sack_resend) { off = (int)(tp->sack_hole_start[sack_hole] - tp->snd_una); /* if this hole's already been acked then punt and move to next hole */ if(off < 0) { /* clear out the acked hole */ tp->sack_hole_start[sack_hole] = tp->sack_hole_end[sack_hole] = 0; /* see if we're done with SACK hole list (2 tests) */ if(++sack_hole >= SACK_BLOCKS) return 0; if(tp->sack_hole_start[sack_hole] == tp->sack_hole_end[sack_hole]) return 0; goto again; } tp->snd_nxt = tp->sack_hole_start[sack_hole]; len = (int)(tp->sack_hole_end[sack_hole] - tp->sack_hole_start[sack_hole]); len = (int)MIN(len, (int)win); } else #endif /* TCP_SACK */ { /* set length of packets which are not sack resends */ len = (int)MIN(so->so_snd.sb_cc, (unsigned)win) - off; } flags = tcp_outflags[tp->t_state]; /* See if we need to build TCP options field. This test should be fast. */ #if (defined(TCP_TIMESTAMP) | defined(TCP_SACK)) if((flags & TH_SYN) || /* !!!??? (so->so_options & SO_TIMESTAMP) || */ (tp->t_flags & TF_SACKNOW) ) { optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so); } #else /* If other options not defined this build then don't bother to call bld_options() except * on SYN packets */ if(flags & TH_SYN) { optlen = bld_options(tp, &tcp_optionbuf[optlen], flags, so); } #endif if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be -1. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit and pull snd_nxt * back to (closed) window. We will enter persist * state below. If the window didn't close completely, * just wait for an ACK. 
*/ len = 0; if (win == 0) { tp->t_timer[TCPT_REXMT] = 0; tp->snd_nxt = tp->snd_una; } } if (len > (int)tp->t_maxseg) { len = tp->t_maxseg; sendalot = 1; } #ifdef IP_V4 #ifdef IP_PMTU { int pmtu = tp->t_inpcb->inp_pmtu - 40; if (len > pmtu) { len = pmtu - 40; sendalot = 1; } } #endif /* IP_PMTU */ /* We don't need a pmtu test for IPv6. V6 code limits t_maxseg to * the Path MTU, so the test above the v4 ifdef above covers us. */ #endif /* IP_V4 */ if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; win = (long)(sbspace(&so->so_rcv)); /* * If our state indicates that FIN should be sent * and we have not yet done so, or we're retransmitting the FIN, * then we need to send. */ if ((flags & TH_FIN) && (so->so_snd.sb_cc == 0) && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) { goto send; } /* * Send if we owe peer an ACK. */ if (tp->t_flags & TF_ACKNOW) goto send; if (flags & (TH_SYN|TH_RST)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * Sender silly window avoidance. If connection is idle * and can send all data, a maximum segment, * at least a maximum default-size segment do it, * or are forced, do it; otherwise don't bother. * If peer's buffer is tiny, then send * when window is at least half open. * If retransmitting (possibly after persist timer forced us * to send into a small window), then must resend. */ if (len) { if (len == (int)tp->t_maxseg) goto send; if ((idle || tp->t_flags & TF_NODELAY) && len + off >= (int)so->so_snd.sb_cc) { goto send; } if (tp->t_force) goto send; if (len >= (int)(tp->max_sndwnd / 2)) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) goto send; } /* * Compare available window to amount of window * known to peer (as advertised window less * next expected input). If the difference is at least two * max size segments or at least 35% of the maximum possible * window, then want to send a window update to peer. 
*/ if (win > 0) { int adv = (int)win - (int)(tp->rcv_adv - tp->rcv_nxt); if (so->so_rcv.sb_cc == 0 && adv >= (int)(tp->t_maxseg * 2)) goto send; if (100 * (u_int)adv / so->so_rcv.sb_hiwat >= 35) goto send; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tp->t_timer[TCPT_PERSIST] * is set when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. * tp->t_timer[TCPT_REXMT] * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ return (0); send: ENTER_CRIT_SECTION(tp); /* Limit send length to the current buffer so as to * avoid doing the "mbuf shuffle" in m_copy(). 
*/ bufoff = off; sendm = so->so_snd.sb_mb; if (len) { /* find mbuf containing data to send (at "off") */ while (sendm) /* loop through socket send list */ { bufoff -= sendm->m_len; if (bufoff < 0) /* if off is in this buffer, break */ break; sendm = sendm->m_next; } if (!sendm) { dtrap(); /* shouldn't happen */ } bufoff += sendm->m_len; /* index to next data to send in msend */ /* if socket has multiple unsent mbufs, set flag for send to loop */ if ((sendm->m_next) && (len > (int)sendm->m_len)) { flags &= ~TH_FIN; /* don't FIN on segment prior to last */ sendalot = 1; /* set to send more segments */ } if((flags & TH_FIN) && (so->so_snd.sb_cc > (unsigned)len)) { /* This can happen on slow links (PPP) which retry the last * segment - the one with the FIN bit attached to data. */ flags &= ~TH_FIN; /* don't FIN on segment prior to last */ } /* only send the rest of msend */ len = min(len, (int)sendm->m_len); /* if we're not sending starting at sendm->m_data (in which * case bufoff != 0), then we will copy the data; else we would * write IP/TCP headers over sent but un-ack'ed data in sendm. * Similarly, if sendm->m_data is not aligned with respect to * sendm->m_base and ALIGN_TYPE, we will copy the data to * ensure that it (and the then-prepended IP/TCP headers) will * be aligned according to ALIGN_TYPE. */ if ((bufoff != 0) || /* data not front aligned in send mbuf? */ (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) != 0)) { len = min(len, (int)(sendm->m_len - bufoff)); /* limit len again */ /* One more test - if this data is not aligned with the front * of the m_data buffer then we can't use it in place, else we * might write the IP/TCP header over data that has not yet * been acked. In this case we must make sure our send * fits into a little buffer and send what we can. */ if ((len > (int)(lilbufsiz - HDRSLEN)) && /* length is bigger the small buffer? 
*/ (bigfreeq.q_len < 2)) /* and we are low on big buffers */ { len = lilbufsiz - HDRSLEN; } } } /* if send data is sufficiently aligned in packet, prepend TCP/IP header * in the space provided. */ if (len && (bufoff == 0) && (sendm->pkt->inuse == 1) && (((sendm->m_data - sendm->m_base) & (ALIGN_TYPE - 1)) == 0) && (optlen == 0)) { /* get an empty mbuf to "clone" the data */ m = m_getnbuf(MT_TXDATA, 0); if (!m) { EXIT_CRIT_SECTION(tp); return (ENOBUFS); } m->pkt = sendm->pkt; /* copy packet location in new mbuf */ m->pkt->inuse++; /* bump packet's use count */ m->m_base = sendm->m_base; /* clone mbuf members */ m->m_memsz = sendm->m_memsz; m->m_len = len + TCPIPHDRSZ; /* adjust clone for header */ m->m_data = sendm->m_data - TCPIPHDRSZ; } else /* either no data or data is not front aligned in mbuf */ { /* Grab a header mbuf, attaching a copy of data to be * transmitted, and initialize the header from * the template for sends on this connection. */ m = m_getwithdata (MT_HEADER, IFNETHDR_SIZE + TCPIPHDRSZ); if (m ==(struct mbuf *)NULL) { EXIT_CRIT_SECTION(tp); return ENOBUFS; } m->m_len = TCPIPHDRSZ; m->m_data += IFNETHDR_SIZE;/* Move this to sizeof tcpip hdr leave*/ /* 14 bytes for ethernet header */ if (len) /* attach any data to send */ { m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); if (m->m_next == 0) { m_freem(m); EXIT_CRIT_SECTION(tp); return ENOBUFS; } } } EXIT_CRIT_SECTION(tp); if (len) { if (tp->t_force && len == 1) tcpstat.tcps_sndprobe++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tcpstat.tcps_sndrexmitpack++; tcpstat.tcps_sndrexmitbyte += len; #ifdef TCP_SACK if(sack_resend) tcpstat.tcps_sackresend++; #endif } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; } } else if (tp->t_flags & TF_ACKNOW) { tcpstat.tcps_sndacks++; } else if (flags & (TH_SYN|TH_FIN|TH_RST)) tcpstat.tcps_sndctrl++; else if (SEQ_GT(tp->snd_up, tp->snd_una)) tcpstat.tcps_sndurg++; else tcpstat.tcps_sndwinup++; ti = (struct tcpiphdr *)(m->m_data+sizeof(struct 
ip)-sizeof(struct ipovly)); if ((char *)ti < m->pkt->nb_buff) { panic("tcp_out- packet ptr underflow\n"); } tcp_mbuf = m; /* flag TCP header mbuf */ #ifdef IP_V6 /* Dual mode code */ if(so->so_domain == AF_INET6) { m = mbuf_prepend(m, sizeof(struct ipv6)); if(m == NULL) { /* this can happen when we run out of mbufs or pkt buffers * That is, mfreeq is empty or (lilfreeq, bigfreeq) are empty. * One solution is to find out which one is getting full and * then increase them. */ dtrap(); /* This is really rare... */ m_freem(tcp_mbuf); /* Free TCP/data chain */ return ENOBUFS; } /* strip overlay from front of TCP header */ tcp_mbuf->m_data += sizeof(struct ipovly); tcp_mbuf->m_len -= sizeof(struct ipovly); } #endif /* end IP_V6 */ if (tp->t_template == 0) panic("tcp_output"); MEMCPY((char*)ti, (char*)tp->t_template, sizeof(struct tcpiphdr)); /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) { tp->snd_nxt--; } ti->ti_seq = htonl(tp->snd_nxt); ti->ti_ack = htonl(tp->rcv_nxt); /* * If we're sending a SYN, check the IP address of the interface * that we will (likely) use to send the IP datagram -- if it's * changed from what is in the template (as it might if this is * a retransmission, and the original SYN caused PPP to start * bringing the interface up, and PPP has got a new IP address * via IPCP), update the template and the inpcb with the new * address. 
*/ if (flags & TH_SYN) { struct inpcb * inp; inp = (struct inpcb *)so->so_pcb; switch(so->so_domain) { #ifdef IP_V4 case AF_INET: { ip_addr src; #ifdef INCLUDE_PPP if(((flags & TH_ACK) == 0) && /* SYN only, not SYN/ACK */ (inp->ifp) && /* Make sure we have iface */ (inp->ifp->mib.ifType == PPP)) /* only PPP type */ { dtrap(); /* remove after confirmed to work in PPP */ src = ip_mymach(ti->ti_dst.s_addr); if (src != ti->ti_src.s_addr) { ti->ti_src.s_addr = src; tp->t_template->ti_src.s_addr = src; tp->t_inpcb->inp_laddr.s_addr = src; } } #endif /* INCLUDE_PPP */ /* If this is a SYN (not a SYN/ACK) then set the pmtu */ if((flags & TH_ACK) == 0) { #ifdef IP_PMTU inp->inp_pmtu = pmtucache_get(inp->inp_faddr.s_addr); #else /* not compiled for pathmtu, guess based on iface */ { NET ifp; /* find iface for route. Pass "src" as nexthop return */ ifp = iproute(ti->ti_dst.s_addr, &src); if(ifp) inp->inp_pmtu = ifp->n_mtu - (ifp->n_lnh + 40); else inp->inp_pmtu = 580; /* Ugh. */ } #endif /* IP_PMTU */ } break; } #endif /* IP_V4 */ #ifdef IP_V6 case AF_INET6: { struct ip6_inaddr * local; local = ip6_myaddr(&tp->t_inpcb->ip6_faddr, inp->ifp); /* If we got a local address & it's not the one in the pcb, then * we assume it changed at the iface and fix it in the pcb. Unlike * v4, we don't have an IP header yet, not do we have a template * to worry about. 
*/ if((local) && (!IP6EQ(&local->addr, &tp->t_inpcb->ip6_laddr))) { IP6CPY(&tp->t_inpcb->ip6_laddr, &local->addr); } /* If this is a SYN (not a SYN/ACK) then set the pmtu */ if((flags & TH_ACK) == 0) { inp->inp_pmtu = ip6_pmtulookup(&inp->ip6_laddr, inp->ifp); } break; } #endif /* IP_V6 */ default: dtrap(); /* bad domain setting */ } } /* fill in options if any are set */ if (optlen) { struct mbuf * mopt; mopt = m_getwithdata(MT_TXDATA, MAXOPTLEN); if (mopt == NULL) { m_freem(m); return (ENOBUFS); } /* insert options mbuf after after tmp_mbuf */ mopt->m_next = tcp_mbuf->m_next; tcp_mbuf->m_next = mopt; /* extend options to aligned address */ while(optlen & 0x03) tcp_optionbuf[optlen++] = TCPOPT_EOL; MEMCPY(mtod(mopt, char *), tcp_optionbuf, optlen); mopt->m_len = optlen; /* use portable macro to set tcp data offset bits */ SET_TH_OFF(ti->ti_t, ((sizeof (struct tcphdr) + optlen) >> 2)); } ti->ti_flags = (u_char)flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) win = 0; if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(tp->rcv_adv - tp->rcv_nxt); /* do check for Iniche buffer limits -JB- */ if (bigfreeq.q_len == 0) /* If queue length is 0, set window to 0 */ { win = 0; } else if(win > (((long)bigfreeq.q_len - 1) * (long)bigbufsiz)) { win = ((long)bigfreeq.q_len - 1) * bigbufsiz; } #ifdef TCP_WIN_SCALE if(tp->t_flags & TF_WINSCALE) { ti->ti_win = htons((u_short)(win >> tp->rcv_wind_scale)); /* apply scale */ }
void natmintr() { int s; struct mbuf *m; struct socket *so; struct natmpcb *npcb; next: s = splnet(); IF_DEQUEUE(&natmintrq, m); splx(s); if (m == NULL) return; #ifdef DIAGNOSTIC if ((m->m_flags & M_PKTHDR) == 0) panic("natmintr no HDR"); #endif npcb = (struct natmpcb *) m->m_pkthdr.rcvif; /* XXX: overloaded */ so = npcb->npcb_socket; s = splnet(); /* could have atm devs @ different levels */ npcb->npcb_inq--; splx(s); if (npcb->npcb_flags & NPCB_DRAIN) { m_freem(m); if (npcb->npcb_inq == 0) free(npcb, M_PCB); /* done! */ goto next; } if (npcb->npcb_flags & NPCB_FREE) { m_freem(m); /* drop */ goto next; } #ifdef NEED_TO_RESTORE_IFP m->m_pkthdr.rcvif = npcb->npcb_ifp; #else #ifdef DIAGNOSTIC m->m_pkthdr.rcvif = NULL; /* null it out to be safe */ #endif #endif if (sbspace(&so->so_rcv) > m->m_pkthdr.len || ((npcb->npcb_flags & NPCB_RAW) != 0 && so->so_rcv.sb_cc < NPCB_RAWCC) ) { #ifdef NATM_STAT natm_sookcnt++; natm_sookbytes += m->m_pkthdr.len; #endif sbappendrecord(&so->so_rcv, m); sorwakeup(so); } else { #ifdef NATM_STAT natm_sodropcnt++; natm_sodropbytes += m->m_pkthdr.len; #endif m_freem(m); } goto next; }
/* * Append address and data, and optionally, control (ancillary) data * to the receive queue of a socket. If present, * m0 must include a packet header with total length. * Returns 0 if no space in sockbuf or insufficient mbufs. */ int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *m, *n, *nlast; int space, len; KASSERT(solocked(sb->sb_so)); space = asa->sa_len; if (m0 != NULL) { if ((m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr"); space += m0->m_pkthdr.len; #ifdef MBUFTRACE m_claimm(m0, sb->sb_mowner); #endif } for (n = control; n; n = n->m_next) { space += n->m_len; MCLAIM(n, sb->sb_mowner); if (n->m_next == 0) /* keep pointer to last control buf */ break; } if (space > sbspace(sb)) return (0); MGET(m, M_DONTWAIT, MT_SONAME); if (m == 0) return (0); MCLAIM(m, sb->sb_mowner); /* * XXX avoid 'comparison always true' warning which isn't easily * avoided. */ len = asa->sa_len; if (len > MLEN) { MEXTMALLOC(m, asa->sa_len, M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return (0); } } m->m_len = asa->sa_len; memcpy(mtod(m, void *), asa, asa->sa_len); if (n) n->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; SBLASTRECORDCHK(sb, "sbappendaddr 1"); for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb, "sbappendaddr"); SBLASTRECORDCHK(sb, "sbappendaddr 2"); return (1); }
int sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, int sbprio) { int space; struct mbuf *m, *n, *n0, *nlast; int error; KASSERT(solocked(sb->sb_so)); /* * XXX sbprio reserved for encoding priority of this* request: * SB_PRIO_NONE --> honour normal sb limits * SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space, * take whole chain. Intended for large requests * that should be delivered atomically (all, or none). * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow * over normal socket limits, for messages indicating * buffer overflow in earlier normal/lower-priority messages * SB_PRIO_BESTEFFORT --> ignore limits entirely. * Intended for kernel-generated messages only. * Up to generator to avoid total mbuf resource exhaustion. */ (void)sbprio; if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddrchain"); space = sbspace(sb); #ifdef notyet /* * Enforce SB_PRIO_* limits as described above. */ #endif n0 = NULL; nlast = NULL; for (m = m0; m; m = m->m_nextpkt) { struct mbuf *np; #ifdef MBUFTRACE m_claimm(m, sb->sb_mowner); #endif /* Prepend sockaddr to this record (m) of input chain m0 */ n = m_prepend_sockaddr(sb, m, asa); if (n == NULL) { error = ENOBUFS; goto bad; } /* Append record (asa+m) to end of new chain n0 */ if (n0 == NULL) { n0 = n; } else { nlast->m_nextpkt = n; } /* Keep track of last record on new chain */ nlast = n; for (np = n; np; np = np->m_next) sballoc(sb, np); } SBLASTRECORDCHK(sb, "sbappendaddrchain 1"); /* Drop the entire chain of (asa+m) records onto the socket */ SBLINKRECORDCHAIN(sb, n0, nlast); SBLASTRECORDCHK(sb, "sbappendaddrchain 2"); for (m = nlast; m->m_next; m = m->m_next) ; sb->sb_mbtail = m; SBLASTMBUFCHK(sb, "sbappendaddrchain"); return (1); bad: /* * On error, free the prepended addreseses. For consistency * with sbappendaddr(), leave it to our caller to free * the input record chain passed to us as m0. 
*/ while ((n = n0) != NULL) { struct mbuf *np; /* Undo the sballoc() of this record */ for (np = n; np; np = np->m_next) sbfree(sb, np); n0 = n->m_nextpkt; /* iterate at next prepended address */ MFREE(n, np); /* free prepended address (not data) */ } return 0; }
int sosend(struct socket *so, struct mbuf *nam, /* sockaddr, if UDP socket, NULL if TCP */ char *data, /* data to send */ int *data_length, /* IN/OUT length of (remaining) data */ int flags) { struct mbuf *head = (struct mbuf *)NULL; struct mbuf *m; int space; int resid; int len; int error = 0; int dontroute; int first = 1; resid = *data_length; /* * In theory resid should be unsigned. * However, space must be signed, as it might be less than 0 * if we over-committed, and we must use a signed comparison * of space and resid. On the other hand, a negative resid * causes us to loop sending 0-length segments to the protocol. */ if (resid < 0) return (EINVAL); INET_TRACE (INETM_IO, ("INET:sosend: so %lx resid %d sb_hiwat %d so_state %x\n", so, resid, so->so_snd.sb_hiwat, so->so_state)); if (sosendallatonce(so) && (resid > (int)so->so_snd.sb_hiwat)) return (EMSGSIZE); dontroute = (flags & MSG_DONTROUTE) && ((so->so_options & SO_DONTROUTE) == 0) && (so->so_proto->pr_flags & PR_ATOMIC); #define snderr(errno) { error = errno; goto release; } restart: sblock(&so->so_snd); do { if (so->so_error) { error = so->so_error; so->so_error = 0; /* ??? 
*/ goto release; } if (so->so_state & SS_CANTSENDMORE) snderr(EPIPE); if ((so->so_state & SS_ISCONNECTED) == 0) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) snderr(ENOTCONN); if (nam == 0) snderr(EDESTADDRREQ); } if (flags & MSG_OOB) space = 1024; else { space = (int)sbspace(&so->so_snd); if ((sosendallatonce(so) && (space < resid)) || ((resid >= CLBYTES) && (space < CLBYTES) && (so->so_snd.sb_cc >= CLBYTES) && ((so->so_state & SS_NBIO) == 0) && ((flags & MSG_DONTWAIT) == 0))) { if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { if (first) error = EWOULDBLOCK; goto release; } sbunlock(&so->so_snd); sbwait(&so->so_snd); goto restart; } } if ( space <= 0 ) { /* no space in socket send buffer - see if we can wait */ if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { if (first) /* report first error */ error = EWOULDBLOCK; goto release; } /* If blocking socket, let someone else run */ sbunlock(&so->so_snd); sbwait(&so->so_snd); goto restart; } while (space > 0) { len = resid; if ( so->so_type == SOCK_STREAM ) { m = m_getwithdata(MT_TXDATA, len); if (!m) snderr(ENOBUFS); MEMCPY(m->m_data, data, len); so->so_snd.sb_flags |= SB_MBCOMP; /* allow compression */ } else { m = m_get (M_WAIT, MT_TXDATA); m->m_data = data; } INET_TRACE (INETM_IO, ("sosend:got %d bytes so %lx mlen %d, off %d mtod %x\n", len, so, m->m_len, m->m_off, mtod (m, caddr_t))); *data_length -= len; resid -= len; data += len; m->m_len = len; if (head == (struct mbuf *)NULL) head = m; if (error) goto release; if (*data_length <= 0) break; } if (dontroute) so->so_options |= SO_DONTROUTE; so->so_req = (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND; error = (*so->so_proto->pr_usrreq)(so, head, nam); if (dontroute) so->so_options &= ~SO_DONTROUTE; head = (struct mbuf *)NULL; first = 0; } while ((resid != 0) && (error == 0)); release: sbunlock(&so->so_snd); if (head) m_freem(head); return error; }
/* * User Request. * up is socket * m is either * optional mbuf chain containing message * ioctl command (PRU_CONTROL) * nam is either * optional mbuf chain containing an address * ioctl data (PRU_CONTROL) * optionally protocol number (PRU_ATTACH) * message flags (PRU_RCVD) * ctl is either * optional mbuf chain containing socket options * optional interface pointer (PRU_CONTROL, PRU_PURGEIF) * l is pointer to process requesting action (if any) * * we are responsible for disposing of m and ctl if * they are mbuf chains */ int rfcomm_usrreq(struct socket *up, int req, struct mbuf *m, struct mbuf *nam, struct mbuf *ctl, struct lwp *l) { struct rfcomm_dlc *pcb = up->so_pcb; struct sockaddr_bt *sa; struct mbuf *m0; int err = 0; DPRINTFN(2, "%s\n", prurequests[req]); switch (req) { case PRU_CONTROL: return EPASSTHROUGH; case PRU_PURGEIF: return EOPNOTSUPP; case PRU_ATTACH: if (up->so_lock == NULL) { mutex_obj_hold(bt_lock); up->so_lock = bt_lock; solock(up); } KASSERT(solocked(up)); if (pcb != NULL) return EINVAL; /* * Since we have nothing to add, we attach the DLC * structure directly to our PCB pointer. 
*/ err = soreserve(up, rfcomm_sendspace, rfcomm_recvspace); if (err) return err; err = rfcomm_attach((struct rfcomm_dlc **)&up->so_pcb, &rfcomm_proto, up); if (err) return err; err = rfcomm_rcvd(up->so_pcb, sbspace(&up->so_rcv)); if (err) { rfcomm_detach((struct rfcomm_dlc **)&up->so_pcb); return err; } return 0; } if (pcb == NULL) { err = EINVAL; goto release; } switch(req) { case PRU_DISCONNECT: soisdisconnecting(up); return rfcomm_disconnect(pcb, up->so_linger); case PRU_ABORT: rfcomm_disconnect(pcb, 0); soisdisconnected(up); /* fall through to */ case PRU_DETACH: return rfcomm_detach((struct rfcomm_dlc **)&up->so_pcb); case PRU_BIND: KASSERT(nam != NULL); sa = mtod(nam, struct sockaddr_bt *); if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; return rfcomm_bind(pcb, sa); case PRU_CONNECT: KASSERT(nam != NULL); sa = mtod(nam, struct sockaddr_bt *); if (sa->bt_len != sizeof(struct sockaddr_bt)) return EINVAL; if (sa->bt_family != AF_BLUETOOTH) return EAFNOSUPPORT; soisconnecting(up); return rfcomm_connect(pcb, sa); case PRU_PEERADDR: KASSERT(nam != NULL); sa = mtod(nam, struct sockaddr_bt *); nam->m_len = sizeof(struct sockaddr_bt); return rfcomm_peeraddr(pcb, sa); case PRU_SOCKADDR: KASSERT(nam != NULL); sa = mtod(nam, struct sockaddr_bt *); nam->m_len = sizeof(struct sockaddr_bt); return rfcomm_sockaddr(pcb, sa); case PRU_SHUTDOWN: socantsendmore(up); break; case PRU_SEND: KASSERT(m != NULL); if (ctl) /* no use for that */ m_freem(ctl); m0 = m_copypacket(m, M_DONTWAIT); if (m0 == NULL) return ENOMEM; sbappendstream(&up->so_snd, m); return rfcomm_send(pcb, m0); case PRU_SENSE: return 0; /* (no release) */ case PRU_RCVD: return rfcomm_rcvd(pcb, sbspace(&up->so_rcv)); case PRU_RCVOOB: return EOPNOTSUPP; /* (no release) */ case PRU_LISTEN: return rfcomm_listen(pcb); case PRU_ACCEPT: KASSERT(nam != NULL); sa = mtod(nam, struct sockaddr_bt *); nam->m_len = sizeof(struct sockaddr_bt); return 
rfcomm_peeraddr(pcb, sa); case PRU_CONNECT2: case PRU_SENDOOB: case PRU_FASTTIMO: case PRU_SLOWTIMO: case PRU_PROTORCV: case PRU_PROTOSEND: err = EOPNOTSUPP; break; default: UNKNOWN(req); err = EOPNOTSUPP; break; } release: if (m) m_freem(m); if (ctl) m_freem(ctl); return err; }
/*
 * ioctl handler for a socket-backed file.
 *
 * Requests that can be answered from the socket itself (non-blocking and
 * async mode, queued byte counts, free send space, signal ownership and the
 * receive "at mark" state) are handled inline.  Every other request is
 * dispatched on its ioctl group: 'i' goes to ifioctl(), 'r' goes to
 * rtioctl_fib(), and anything else goes to the protocol's pru_control
 * routine.
 *
 *   fp          - file whose f_data is the socket
 *   cmd         - ioctl request code
 *   data        - request argument; read or written as an int by the
 *                 requests handled inline
 *   active_cred - not used by this function
 *   td          - calling thread, forwarded to ifioctl()/pru_control
 *
 * Returns 0, or the error produced by the routine the request was
 * forwarded to.
 */
static int
soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
    struct thread *td)
{
	struct socket *sock = fp->f_data;
	int rc = 0;

	switch (cmd) {
	case FIONBIO:
		SOCK_LOCK(sock);
		if (*(int *)data != 0)
			sock->so_state |= SS_NBIO;
		else
			sock->so_state &= ~SS_NBIO;
		SOCK_UNLOCK(sock);
		break;

	case FIOASYNC: {
		/*
		 * The socket lock and each socket-buffer lock are taken
		 * one after another, never nested, so this code does not
		 * rely on any of them being the same mutex.
		 */
		const int enable = (*(int *)data != 0);

		SOCK_LOCK(sock);
		if (enable)
			sock->so_state |= SS_ASYNC;
		else
			sock->so_state &= ~SS_ASYNC;
		SOCK_UNLOCK(sock);

		SOCKBUF_LOCK(&sock->so_rcv);
		if (enable)
			sock->so_rcv.sb_flags |= SB_ASYNC;
		else
			sock->so_rcv.sb_flags &= ~SB_ASYNC;
		SOCKBUF_UNLOCK(&sock->so_rcv);

		SOCKBUF_LOCK(&sock->so_snd);
		if (enable)
			sock->so_snd.sb_flags |= SB_ASYNC;
		else
			sock->so_snd.sb_flags &= ~SB_ASYNC;
		SOCKBUF_UNLOCK(&sock->so_snd);
		break;
	}

	case FIONREAD:
		/* Read without holding the socket-buffer lock. */
		*(int *)data = sbavail(&sock->so_rcv);
		break;

	case FIONWRITE:
		/* Read without holding the socket-buffer lock. */
		*(int *)data = sbavail(&sock->so_snd);
		break;

	case FIONSPACE:
		/*
		 * Read without holding the socket-buffer lock.  Report the
		 * free space only while both the byte and the mbuf limits
		 * are still respected; otherwise report zero.
		 */
		if (sock->so_snd.sb_hiwat >= sbused(&sock->so_snd) &&
		    sock->so_snd.sb_mbmax >= sock->so_snd.sb_mbcnt)
			*(int *)data = sbspace(&sock->so_snd);
		else
			*(int *)data = 0;
		break;

	case FIOSETOWN:
		rc = fsetown(*(int *)data, &sock->so_sigio);
		break;

	case FIOGETOWN:
		*(int *)data = fgetown(&sock->so_sigio);
		break;

	case SIOCSPGRP:
		/* Same as FIOSETOWN, but with the sign of the id flipped. */
		rc = fsetown(-(*(int *)data), &sock->so_sigio);
		break;

	case SIOCGPGRP:
		/* Same as FIOGETOWN, but with the sign of the id flipped. */
		*(int *)data = -fgetown(&sock->so_sigio);
		break;

	case SIOCATMARK:
		/* Read without holding the socket-buffer lock. */
		*(int *)data = (sock->so_rcv.sb_state & SBS_RCVATMARK) != 0;
		break;

	default:
		/*
		 * Interface, routing and protocol specific requests.  The
		 * interface and routing ones do not really need a socket
		 * and arguably deserve their own entry point.
		 */
		switch (IOCGROUP(cmd)) {
		case 'i':
			rc = ifioctl(sock, cmd, data, td);
			break;
		case 'r': {
			CURVNET_SET(sock->so_vnet);
			rc = rtioctl_fib(cmd, data, sock->so_fibnum);
			CURVNET_RESTORE();
			break;
		}
		default: {
			CURVNET_SET(sock->so_vnet);
			rc = ((*sock->so_proto->pr_usrreqs->pru_control)
			    (sock, cmd, data, 0, td));
			CURVNET_RESTORE();
			break;
		}
		}
		break;
	}

	return (rc);
}
/*
 * Process one DDP report for the connection behind `toep`.
 *
 * The report word selects which of the two DDP buffers it refers to.  Unless
 * the connection has already been dropped or is in TIME_WAIT, the function
 * advances tp->rcv_nxt, obtains an mbuf from get_ddp_mbuf() for the adjusted
 * length, optionally grows an auto-sizing receive buffer, updates the rx
 * credit accounting and appends the mbuf to the socket's receive buffer.
 * In every case the buffer's "active" flag is cleared and the reader is
 * woken up.
 *
 *   toep       - offload state of the connection
 *   ddp_report - report word, big-endian
 *   rcv_nxt    - sequence number supplied with the report, big-endian
 *   len        - byte count supplied with the report; adjusted below by the
 *                distance between rcv_nxt and tp->rcv_nxt
 *
 * Always returns 0.
 */
static int
handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
{
	const uint32_t rpt = be32toh(ddp_report);
	struct inpcb *inp = toep->inp;
	struct socket *so;
	struct sockbuf *rcvbuf;
	struct tcpcb *tp;
	struct mbuf *data;
	unsigned int active_bit;

	/* Which of the two DDP buffers does this report describe? */
	if (rpt & F_DDP_BUF_IDX)
		active_bit = DDP_BUF1_ACTIVE;
	else
		active_bit = DDP_BUF0_ACTIVE;

	/* Reports that leave the buffer valid are not supported. */
	if (__predict_false((rpt & F_DDP_INV) == 0))
		CXGBE_UNIMPLEMENTED("DDP buffer still valid");

	INP_WLOCK(inp);
	so = inp_inpcbtosocket(inp);
	rcvbuf = &so->so_rcv;

	if (__predict_false((inp->inp_flags &
	    (INP_DROPPED | INP_TIMEWAIT)) != 0)) {
		/*
		 * XXX: needs more thought.  The tcpcb is probably gone, but
		 * the socket should still exist because soreceive always
		 * waits for DDP completion.  Skip all TCP bookkeeping, wake
		 * the reader and let it clean up.
		 */
		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
		SOCKBUF_LOCK(rcvbuf);
		goto wakeup;
	}

	tp = intotcpcb(inp);

	/* Fold the gap between the reported and the local rcv_nxt into len. */
	len += be32toh(rcv_nxt) - tp->rcv_nxt;
	tp->rcv_nxt += len;
	tp->t_rcvtime = ticks;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
	tp->rcv_wnd -= len;
#endif
	data = get_ddp_mbuf(len);

	SOCKBUF_LOCK(rcvbuf);

	/* A completely filled buffer earns the top score. */
	if (rpt & F_DDP_BUF_COMPLETE)
		toep->ddp_score = DDP_HIGH_SCORE;
	else
		discourage_ddp(toep);

	/* Receive buffer autosizing. */
	if ((rcvbuf->sb_flags & SB_AUTOSIZE) &&
	    V_tcp_do_autorcvbuf &&
	    rcvbuf->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(rcvbuf) / 8 * 7)) {
		const unsigned int prev_hiwat = rcvbuf->sb_hiwat;
		const unsigned int grown = min(prev_hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (sbreserve_locked(rcvbuf, grown, so, NULL)) {
			/* The extra room becomes rx credit. */
			toep->rx_credits += grown - prev_hiwat;
		} else {
			/* Could not grow: stop autosizing this buffer. */
			rcvbuf->sb_flags &= ~SB_AUTOSIZE;
		}
	}

	/* Whatever has left the buffer since the last visit is credit too. */
	KASSERT(toep->sb_cc >= sbused(rcvbuf),
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, rcvbuf, sbused(rcvbuf), toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sbused(rcvbuf);
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
#endif
	sbappendstream_locked(rcvbuf, data, 0);
	toep->sb_cc = sbused(rcvbuf);

wakeup:
	/* Reached with the inpcb write lock and the sockbuf lock held. */
	KASSERT(toep->ddp_flags & active_bit,
	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
	    __func__, toep, toep->ddp_flags, rpt));
	toep->ddp_flags &= ~active_bit;

	/* The assertion that follows expects the wakeup to drop the lock. */
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(rcvbuf);

	INP_WUNLOCK(inp);
	return (0);
}
/*
 * Send a single header-only TCP segment (plus one data byte for keepalives).
 *
 * Two modes, selected by `flags`:
 *
 *   flags == 0  Keepalive.  A new mbuf is allocated, the header at `ti`
 *               is copied into it, and the segment goes out with TH_ACK
 *               and one byte of data.  `tp` is required in this mode; it
 *               supplies the address family.
 *
 *   flags != 0  Reply (e.g. a reset).  `ti_mbuf` is recycled: any trailing
 *               mbufs are freed, the ports (and, for IPv4, the addresses)
 *               in `ti` are swapped, and the segment is sent with the
 *               given flags.  `ti_mbuf` is required in this mode.
 *
 *   tp       - connection, may be NULL for replies; used for the advertised
 *              window and the socket's IP options
 *   ti       - TCP/IP header to copy (keepalive) or rewrite in place (reply)
 *   ack, seq - acknowledgment and sequence numbers, host byte order
 *   flags    - TCP flags, or 0 for a keepalive
 *   ti_mbuf  - mbuf holding `ti`; consumed in reply mode
 *
 * Sending is best effort: the results of ip_output()/tcp6_send() are
 * deliberately ignored.  If a required argument is missing the function
 * returns without sending anything.
 */
void
tcp_respond(struct tcpcb *tp, struct tcpiphdr *ti, tcp_seq ack, tcp_seq seq,
   int flags, struct mbuf *ti_mbuf)
{
   int tlen;                     /* tcp data len - 0 or 1 */
   int domain;                   /* AF_INET or AF_INET6 */
   int win = 0;                  /* window to use in sent packet */
   struct mbuf *m;               /* mbuf to send */
   struct tcpiphdr *tmp_thdr;    /* scratch */

   if (tp)
      win = (int)sbspace(&tp->t_inpcb->inp_socket->so_rcv);

   /* Figure out if we can recycle the passed buffer or if we need a
    * new one. Construct the easy parts of the TCP and IP headers.
    */
   if (flags == 0)   /* sending keepalive from timer */
   {
      /* The address family is taken from tp below, so a keepalive
       * cannot be built without it. Check before allocating so that
       * nothing has to be released on this path.
       */
      if (tp == NULL)
         return;

      /* no flags == need a new buffer */
      m = m_getwithdata (MT_HEADER, HDRSLEN);
      if (m == NULL)
         return;
      tlen = 1;   /* Keepalives have one byte of data */
      m->m_len = TCPIPHDRSZ + tlen;

      /*
       * Copy template contents into the mbuf and set ti to point
       * to the header structure in the mbuf.
       */
      tmp_thdr = (struct tcpiphdr *)((char *)m->m_data +
         sizeof(struct ip) - sizeof(struct ipovly));
      if ((char *)tmp_thdr < m->pkt->nb_buff)
      {
         panic("tcp_respond- packet ptr underflow\n");
      }
      MEMCPY(tmp_thdr, ti, sizeof(struct tcpiphdr));
      ti = tmp_thdr;
      flags = TH_ACK;
      domain = tp->t_inpcb->inp_socket->so_domain;
   }
   else   /* Flag was passed (e.g. reset); recycle passed mbuf */
   {
      m = ti_mbuf;   /*dtom(ti);*/
      if (m == NULL)   /* nothing to recycle, nothing to send */
         return;

      if (m->pkt->type == IPTP)   /* IPv4 packet */
         domain = AF_INET;
      else
         domain = AF_INET6;

      M_FREEM(m->m_next);
      m->m_next = 0;
      tlen = 0;   /* NO data */
      m->m_len = TCPIPHDRSZ;
      xchg(ti->ti_dport, ti->ti_sport, u_short);
      if (m->pkt->type == IPTP)
         xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_long);

      if (flags & TH_RST)   /* count resets in MIB */
         TCP_MIB_INC(tcpOutRsts);   /* keep MIB stats */
   }

   /* finish constructing the TCP header */
   ti->ti_seq = htonl(seq);
   ti->ti_ack = htonl(ack);
   ti->ti_t.th_doff = 0x50;   /* NetPort: init data offset bits */
   ti->ti_flags = (u_char)flags;
   ti->ti_win = htons((u_short)win);
   ti->ti_urp = 0;
   ti->ti_t.th_sum = 0;

   /* Finish constructing IP header and send, based on IP type in use */
   switch(domain)
   {
#ifdef IP_V4
   case AF_INET:
   {
      struct ip *pip;

      pip = (struct ip *)((char *)ti + sizeof(struct ipovly) -
         sizeof(struct ip));

      m->pkt->nb_tlen = m->pkt->nb_plen = pip->ip_len =
         (unshort)(TCPIPHDRSZ + tlen);

      /* If our system's max. MAC header size is greater than the size
       * of the MAC header in the received packet then we need to
       * adjust the IP header offset to allow for this. Since the packets
       * are only headers they should always fit.
       */
      if (pip >= (struct ip *)(m->pkt->nb_buff + MaxLnh))
      {
         /* headers will fit, just set pointer */
         m->m_data = m->pkt->nb_prot = (char *)pip;
      }
      else   /* MAC may not fit, adjust pointer and move headers back */
      {
         m->m_data = m->pkt->nb_prot = m->pkt->nb_buff + MaxLnh;   /* new ptr */
         MEMMOVE(m->m_data, pip, TCPIPHDRSZ);   /* move back tcp/ip headers */
      }

#ifdef DOS_SYN
      if (!tp)
      {
         /* In the case of a SYN DOS attack, many RST|ACK replies
          * have no tp structure and need to be freed.
          */
         M_FREEM(m);
      }
      else
#endif
      {
         struct ip_socopts *sopts;

         if (tp && tp->t_inpcb && tp->t_inpcb->inp_socket)
         {
            sopts = tp->t_inpcb->inp_socket->so_optsPack;
         }
         else
            sopts = (struct ip_socopts *)NULL;

         /* best effort send; the result is intentionally ignored */
         (void)ip_output(m, sopts);
      }
      break;
   }
#endif   /* IP_V4 */

#ifdef IP_V6
   case AF_INET6:
   {
      struct ipv6 *pip6;
      struct mbuf *ip_m;   /* IP header's mbuf */

      /* Get mbuf space for the IP header. mbuf m should contain the
       * TCP header somewhere, so set m_data to that and try to prepend
       * an IPv6 header.
       */
      m->m_data = (char *)&ti->ti_t;   /* TCP header */
      m->m_len = sizeof(struct tcphdr);
      ip_m = mbuf_prepend(m, sizeof(struct ipv6));
      if (!ip_m)
      {
         m_free(m);
         return;
      }

      pip6 = (struct ipv6 *)ip_m->m_data;

      /* We have to find the IPv6 addresses. If a packet was passed
       * then get them from that, otherwise get them from the passed tp.
       * We should always have one or the other.
       */
      if (ti_mbuf)
      {
         ip6_addr tmp;
         struct ipv6 *inpip = ti_mbuf->pkt->ip6_hdr;

         /* pip6 and inpip may be the same, so swap the IP addresses
          * through a tmp variable.
          */
         IP6CPY(&tmp, &inpip->ip_src);
         IP6CPY(&pip6->ip_src, &inpip->ip_dest);
         IP6CPY(&pip6->ip_dest, &tmp);
      }
      else if (tp)
      {
         struct inpcb *inp = tp->t_inpcb;

         IP6CPY(&pip6->ip_src, &inp->ip6_laddr);
         IP6CPY(&pip6->ip_dest, &inp->ip6_faddr);
      }
      else
      {
         /* NOTE(review): this arm leaves without sending or releasing
          * the mbuf. It needs both ti_mbuf and tp to be NULL, which the
          * checks at the top of the function already rule out.
          */
         dtrap();
         break;
      }

      /* best effort send */
      /* send down to glue layer to IPv6 */
      /* and don't forget the so_optsPack */
#ifdef DOS_SYN
      if (!tp)
      {
         /* In the case of a SYN DOS attack, many RST|ACK replies
          * have no tp structure and need to be freed.
          */
         M_FREEM(m);
      }
      else
#endif   /* DOS_SYN */
      {
         struct ip_socopts *sopts;

         if (tp && tp->t_inpcb && tp->t_inpcb->inp_socket)
            sopts = tp->t_inpcb->inp_socket->so_optsPack;
         else
            sopts = (struct ip_socopts *)NULL;

         /* the result is intentionally ignored */
         (void)tcp6_send(tp, ip_m, &ti->ti_t,
            sizeof(struct ipv6) + sizeof(struct tcphdr) + tlen, sopts);
      }
      break;
   }
#endif   /* IP_V6 */

   default:
      /* No handler is compiled in for this address family. The mbuf
       * belongs to this function at this point, so release it instead
       * of leaking it.
       */
      dtrap();
      M_FREEM(m);
      break;
   }
   return;
}
/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * Parameters:
 *   so      - socket being written to; data is queued on so->so_snd.
 *   flags   - PRUS_* bits.  PRUS_OOB selects the urgent-data path,
 *             PRUS_EOF closes the send side after queueing, and
 *             PRUS_MORETOCOME sets TF_MORETOCOME around the tcp_output()
 *             call.
 *   m       - data to send; appended to so->so_snd or freed here.
 *   nam     - optional peer address.  When non-NULL and the connection
 *             state is below TCPS_SYN_SENT, an implied connect is done
 *             first.
 *   control - optional control mbuf.  Always freed here; a non-empty one
 *             fails the call with EINVAL.
 *   p       - process, passed through to tcp_connect()/tcp6_connect().
 *
 * Returns 0 or an errno value: ECONNRESET when the socket has no inpcb,
 * EINVAL for non-empty control data, ENOBUFS when the urgent path finds the
 * send buffer too far over its limit, otherwise whatever the connect or
 * tcp_output() call returned.
 *
 * NOTE(review): the `out` label targeted by the gotos below is not defined
 * in this body; it presumably comes from the COMMON_END() macro, which
 * presumably also consumes `s` and `error` - confirm against the macro
 * definition.
 */
static int
tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
	     struct sockaddr *nam, struct mbuf *control, struct proc *p)
{
	int s = splnet();
	int error = 0;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
#ifdef INET6
	int isipv6;
#endif
	TCPDEBUG0;

	if (inp == NULL) {
		/*
		 * OOPS! we lost a race, the TCP session got reset after
		 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a
		 * network interrupt in the non-splnet() section of sosend().
		 */
		/* Both chains are ours to release on this early exit. */
		if (m)
			m_freem(m);
		if (control)
			m_freem(control);
		error = ECONNRESET;	/* XXX EPIPE? */
		tp = NULL;
		TCPDEBUG1();
		goto out;
	}
#ifdef INET6
	/* Only an explicitly supplied AF_INET6 address selects the v6 connect. */
	isipv6 = nam && nam->sa_family == AF_INET6;
#endif /* INET6 */
	tp = intotcpcb(inp);
	TCPDEBUG1();
	if (control) {
		/* TCP doesn't do control messages (rights, creds, etc) */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if(!(flags & PRUS_OOB)) {
		/*
		 * Ordinary data.  The chain is queued before the implied
		 * connect, so a failed connect leaves it on so_snd and does
		 * not free it.
		 */
		sbappend(&so->so_snd, m);
		if (nam && tp->t_state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected,
			 * initialize window to default value, and
			 * initialize maxseg/maxopd using peer's cached
			 * MSS.
			 */
			/*
			 * With INET6 defined, the `else` below binds to the
			 * tcp_connect() statement after the #endif.
			 */
#ifdef INET6
			if (isipv6)
				error = tcp6_connect(tp, nam, p);
			else
#endif /* INET6 */
			error = tcp_connect(tp, nam, p);
			if (error)
				goto out;
			tp->snd_wnd = TTCP_CLIENT_SND_WND;
			tcp_mss(tp, -1);
		}

		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			/* The result replaces tp and is NULL-checked below. */
			tp = tcp_usrclosed(tp);
		}
		if (tp != NULL) {
			/* TF_MORETOCOME is set only for this one call. */
			if (flags & PRUS_MORETOCOME)
				tp->t_flags |= TF_MORETOCOME;
			error = tcp_output(tp);
			if (flags & PRUS_MORETOCOME)
				tp->t_flags &= ~TF_MORETOCOME;
		}
	} else {
		/*
		 * Urgent data.  It is refused only when the send buffer is
		 * already more than 512 bytes over its limit; the chain is
		 * freed in that case.
		 */
		if (sbspace(&so->so_snd) < -512) {
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		sbappend(&so->so_snd, m);
		if (nam && tp->t_state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected,
			 * initialize window to default value, and
			 * initialize maxseg/maxopd using peer's cached
			 * MSS.
			 */
#ifdef INET6
			if (isipv6)
				error = tcp6_connect(tp, nam, p);
			else
#endif /* INET6 */
			error = tcp_connect(tp, nam, p);
			if (error)
				goto out;
			tp->snd_wnd = TTCP_CLIENT_SND_WND;
			tcp_mss(tp, -1);
		}
		/* The urgent pointer covers everything queued so far. */
		tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
		/* t_force is raised only for the duration of this call. */
		tp->t_force = 1;
		error = tcp_output(tp);
		tp->t_force = 0;
	}
	/* The request code handed to COMMON_END reflects the path taken. */
	COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB :
		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
}
/* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. */ void tcp_input(usn_mbuf_t *m, int iphlen) { struct tcpiphdr *ti; struct inpcb *inp; u_char *optp = NULL; int optlen; int len, tlen, off; struct tcpcb *tp = 0; int tiflags; struct usn_socket *so = 0; int todrop, acked, ourfinisacked; int needoutput = 0; short ostate; struct usn_in_addr laddr; int dropsocket = 0; int iss = 0; u_long tiwin, ts_val, ts_ecr; int ts_present = 0; (void)needoutput; g_tcpstat.tcps_rcvtotal++; // Get IP and TCP header together in first mbuf. // Note: IP leaves IP header in first mbuf. ti = mtod(m, struct tcpiphdr *); if (iphlen > sizeof (usn_ip_t)) ip_stripoptions(m, (usn_mbuf_t *)0); if (m->mlen < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { g_tcpstat.tcps_rcvshort++; return; } ti = mtod(m, struct tcpiphdr *); } #ifdef DUMP_PAYLOAD dump_chain(m,"tcp"); #endif /* * Checksum extended TCP header and data. */ tlen = ntohs(((usn_ip_t *)ti)->ip_len); len = sizeof (usn_ip_t) + tlen; ti->ti_next = ti->ti_prev = 0; ti->ti_x1 = 0; ti->ti_len = (u_short)tlen; HTONS(ti->ti_len); ti->ti_sum = in_cksum(m, len); if (ti->ti_sum) { g_tcpstat.tcps_rcvbadsum++; goto drop; } /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = ti->ti_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { g_tcpstat.tcps_rcvbadoff++; goto drop; } tlen -= off; ti->ti_len = tlen; if (off > sizeof (struct tcphdr)) { if (m->mlen < sizeof(usn_ip_t) + off) { if ((m = m_pullup(m, sizeof (usn_ip_t) + off)) == 0) { g_tcpstat.tcps_rcvshort++; return; } ti = mtod(m, struct tcpiphdr *); } optlen = off - sizeof (struct tcphdr); optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); // Do quick retrieval of timestamp options ("options // prediction?"). 
If timestamp is the only option and it's // formatted as recommended in RFC 1323 appendix A, we // quickly get the values now and not bother calling // tcp_dooptions(), etc. if ((optlen == TCPOLEN_TSTAMP_APPA || (optlen > TCPOLEN_TSTAMP_APPA && optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && *(u_int *)optp == htonl(TCPOPT_TSTAMP_HDR) && (ti->ti_flags & TH_SYN) == 0) { ts_present = 1; ts_val = ntohl(*(u_long *)(optp + 4)); ts_ecr = ntohl(*(u_long *)(optp + 8)); optp = NULL; // we've parsed the options } } tiflags = ti->ti_flags; // Convert TCP protocol specific fields to host format. NTOHL(ti->ti_seq); NTOHL(ti->ti_ack); NTOHS(ti->ti_win); NTOHS(ti->ti_urp); // Locate pcb for segment. findpcb: inp = g_tcp_last_inpcb; if (inp->inp_lport != ti->ti_dport || inp->inp_fport != ti->ti_sport || inp->inp_faddr.s_addr != ti->ti_src.s_addr || inp->inp_laddr.s_addr != ti->ti_dst.s_addr) { inp = in_pcblookup(&g_tcb, ti->ti_src, ti->ti_sport, ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD); if (inp) g_tcp_last_inpcb = inp; ++g_tcpstat.tcps_pcbcachemiss; } // If the state is CLOSED (i.e., TCB does not exist) then // all data in the incoming segment is discarded. // If the TCB exists but is in CLOSED state, it is embryonic, // but should either do a listen or a connect soon. if (inp == 0) goto dropwithreset; tp = intotcpcb(inp); DEBUG("found inp cb, laddr=%x, lport=%d, faddr=%x," " fport=%d, tp_state=%d, tp_flags=%d", inp->inp_laddr.s_addr, inp->inp_lport, inp->inp_faddr.s_addr, inp->inp_fport, tp->t_state, tp->t_flags); if (tp == 0) goto dropwithreset; if (tp->t_state == TCPS_CLOSED) goto drop; // Unscale the window into a 32-bit value. 
if ((tiflags & TH_SYN) == 0) tiwin = ti->ti_win << tp->snd_scale; else tiwin = ti->ti_win; so = inp->inp_socket; DEBUG("socket info, options=%x", so->so_options); if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { if (so->so_options & SO_DEBUG) { ostate = tp->t_state; g_tcp_saveti = *ti; } if (so->so_options & SO_ACCEPTCONN) { if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { // Note: dropwithreset makes sure we don't // send a reset in response to a RST. if (tiflags & TH_ACK) { g_tcpstat.tcps_badsyn++; goto dropwithreset; } DEBUG("SYN is expected, tiflags=%d", tiflags); goto drop; } so = sonewconn(so, 0); if (so == 0) { DEBUG("failed to create new connection, tiflags=%d", tiflags); goto drop; } // Mark socket as temporary until we're // committed to keeping it. The code at // ``drop'' and ``dropwithreset'' check the // flag dropsocket to see if the temporary // socket created here should be discarded. // We mark the socket as discardable until // we're committed to it below in TCPS_LISTEN. dropsocket++; inp = (struct inpcb *)so->so_pcb; inp->inp_laddr = ti->ti_dst; inp->inp_lport = ti->ti_dport; // BSD >= 4.3 inp->inp_options = ip_srcroute(); tp = intotcpcb(inp); tp->t_state = TCPS_LISTEN; // Compute proper scaling value from buffer space while (tp->request_r_scale < TCP_MAX_WINSHIFT && TCP_MAXWIN << tp->request_r_scale < so->so_rcv->sb_hiwat) tp->request_r_scale++; } } // Segment received on connection. // Reset idle time and keep-alive timer. tp->t_idle = 0; tp->t_timer[TCPT_KEEP] = g_tcp_keepidle; // Process options if not in LISTEN state, // else do it below (after getting remote address). if (optp && tp->t_state != TCPS_LISTEN) tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); // Header prediction: check for the two common cases // of a uni-directional data xfer. If the packet has // no control flags, is in-sequence, the window didn't // change and we're not retransmitting, it's a // candidate. 
If the length is zero and the ack moved // forward, we're the sender side of the xfer. Just // free the data acked & wake any higher level process // that was blocked waiting for space. If the length // is non-zero and the ack didn't move, we're the // receiver side. If we're getting packets in-order // (the reassembly queue is empty), add the data to // the socket buffer and note that we need a delayed ack. if (tp->t_state == TCPS_ESTABLISHED && (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && ti->ti_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && tp->snd_nxt == tp->snd_max) { // If last ACK falls within this segment's sequence numbers, // record the timestamp. if ( ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ){ tp->ts_recent_age = g_tcp_now; tp->ts_recent = ts_val; } if (ti->ti_len == 0) { if (SEQ_GT(ti->ti_ack, tp->snd_una) && SEQ_LEQ(ti->ti_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd) { // this is a pure ack for outstanding data. ++g_tcpstat.tcps_predack; if (ts_present) tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1); else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp, tp->t_rtt); acked = ti->ti_ack - tp->snd_una; g_tcpstat.tcps_rcvackpack++; g_tcpstat.tcps_rcvackbyte += acked; TRACE("drop so_snd buffer, drop_bytes=%d, len=%d", acked, so->so_snd.sb_cc); sbdrop(so->so_snd, acked); tp->snd_una = ti->ti_ack; usn_free_cmbuf(m); // If all outstanding data are acked, stop // retransmit timer, otherwise restart timer // using current (possibly backed-off) value. // If process is waiting for space, // wakeup/selwakeup/signal. If data // are ready to send, let tcp_output // decide between more output or persist. 
if (tp->snd_una == tp->snd_max) tp->t_timer[TCPT_REXMT] = 0; else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); //if (so->so_snd->sb_flags & SB_NOTIFY) { // usnet_tcpin_wwakeup(so, USN_TCP_IN, usn_tcpev_sbnotify, 0); // sowwakeup(so); //} // send buffer is available for app thread. usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0); if (so->so_snd->sb_cc) tcp_output(tp); return; } } else if (ti->ti_ack == tp->snd_una && tp->seg_next == (struct tcpiphdr *)tp && ti->ti_len <= sbspace(so->so_rcv)) { // this is a pure, in-sequence data packet // with nothing on the reassembly queue and // we have enough buffer space to take it. ++g_tcpstat.tcps_preddat; tp->rcv_nxt += ti->ti_len; g_tcpstat.tcps_rcvpack++; g_tcpstat.tcps_rcvbyte += ti->ti_len; // Drop TCP, IP headers and TCP options then add data // to socket buffer. m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); TRACE("add data to rcv buf"); sbappend(so->so_rcv, m); sorwakeup(so); // new data is available for app threads. usnet_tcpin_rwakeup(so, USN_TCP_IN, USN_TCPEV_READ, m); if (so->so_options & SO_DEBUG) { TRACE("tcp trace, so_options=%d", so->so_options); tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); } tp->t_flags |= TF_DELACK; return; } } // Drop TCP, IP headers and TCP options. m->head += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); m->mlen -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); // Calculate amount of space in receive window, // and then do TCP input processing. // Receive window is amount of space in rcv queue, // but not less than advertised window. { int win; win = sbspace(so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } switch (tp->t_state) { // If the state is LISTEN then ignore segment if it contains an RST. 
// If the segment contains an ACK then it is bad and send a RST. // If it does not contain a SYN then it is not interesting; drop it. // Don't bother responding if the destination was a broadcast. // Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial // tp->iss, and send a segment: // <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> // Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. // Fill in remote peer address fields if not previously specified. // Enter SYN_RECEIVED state, and process any other fields of this // segment in this state. case TCPS_LISTEN: { usn_mbuf_t *am; struct usn_sockaddr_in *sin; if (tiflags & TH_RST) goto drop; if (tiflags & TH_ACK) goto dropwithreset; if ((tiflags & TH_SYN) == 0) goto drop; // RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN // in_broadcast() should never return true on a received // packet with M_BCAST not set. //if (m->m_flags & (M_BCAST|M_MCAST) || // IN_MULTICAST(ntohl(ti->ti_dst.s_addr))) // goto drop; am = usn_get_mbuf(0, BUF_MSIZE, 0); // XXX: the size! 
if (am == NULL) goto drop; am->mlen = sizeof (struct usn_sockaddr_in); sin = mtod(am, struct usn_sockaddr_in *); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ti->ti_src; sin->sin_port = ti->ti_sport; bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == USN_INADDR_ANY) inp->inp_laddr = ti->ti_dst; if (in_pcbconnect(inp, am)) { inp->inp_laddr = laddr; usn_free_mbuf(am); goto drop; } usn_free_mbuf(am); tp->t_template = tcp_template(tp); if (tp->t_template == 0) { tp = tcp_drop(tp, ENOBUFS); dropsocket = 0; // socket is already gone goto drop; } if (optp) tcp_dooptions(tp, optp, optlen, ti, &ts_present, &ts_val, &ts_ecr); if (iss) tp->iss = iss; else tp->iss = g_tcp_iss; g_tcp_iss += TCP_ISSINCR/4; tp->irs = ti->ti_seq; tcp_sendseqinit(tp); tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_SYN_RECEIVED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0); tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; dropsocket = 0; // committed to socket g_tcpstat.tcps_accepts++; goto trimthenstep6; } // If the state is SYN_SENT: // if seg contains an ACK, but not for our SYN, drop the input. // if seg contains a RST, then drop the connection. // if seg does not contain SYN, then drop it. 
// Otherwise this is an acceptable SYN segment // initialize tp->rcv_nxt and tp->irs // if seg contains ack then advance tp->snd_una // if SYN has been acked change to ESTABLISHED else SYN_RCVD state // arrange for segment to be acked (eventually) // continue processing rest of data/controls, beginning with URG case TCPS_SYN_SENT: if ((tiflags & TH_ACK) && (SEQ_LEQ(ti->ti_ack, tp->iss) || SEQ_GT(ti->ti_ack, tp->snd_max))) goto dropwithreset; if (tiflags & TH_RST) { if (tiflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; } if ((tiflags & TH_SYN) == 0) goto drop; if (tiflags & TH_ACK) { tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; tp->t_timer[TCPT_REXMT] = 0; } tp->irs = ti->ti_seq; tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; TRACE("ack now, tp flags=%d", tp->t_flags); // XXX: remove second test. if (tiflags & TH_ACK /*&& SEQ_GT(tp->snd_una, tp->iss)*/) { g_tcpstat.tcps_connects++; soisconnected(so); TRACE("change tcp state to TCPS_ESTABLISHED," " state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_ESTABLISHED; // Do window scaling on this connection? if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0); // if we didn't have to retransmit the SYN, // use its rtt as our initial srtt & rtt var. if (tp->t_rtt) tcp_xmit_timer(tp, tp->t_rtt); } else { TRACE("change tcp state to TCPS_SYN_RECEIVED, state=%d, tp_flags=%d", tp->t_state, tp->t_flags); tp->t_state = TCPS_SYN_RECEIVED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_SYN_RECEIVED, 0); } trimthenstep6: // Advance ti->ti_seq to correspond to first data byte. // If data, trim to stay within window, // dropping FIN if necessary. 
ti->ti_seq++; if (ti->ti_len > tp->rcv_wnd) { todrop = ti->ti_len - tp->rcv_wnd; m_adj(m, -todrop); ti->ti_len = tp->rcv_wnd; tiflags &= ~TH_FIN; g_tcpstat.tcps_rcvpackafterwin++; g_tcpstat.tcps_rcvbyteafterwin += todrop; } tp->snd_wl1 = ti->ti_seq - 1; tp->rcv_up = ti->ti_seq; goto step6; } // States other than LISTEN or SYN_SENT. // First check timestamp, if present. // Then check that at least some bytes of segment are within // receive window. If segment begins before rcv_nxt, // drop leading data (and SYN); if nothing left, just ack. // // RFC 1323 PAWS: If we have a timestamp reply on this segment // and it's less than ts_recent, drop it. if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && TSTMP_LT(ts_val, tp->ts_recent)) { // Check to see if ts_recent is over 24 days old. if ((int)(g_tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { // Invalidate ts_recent. If this segment updates // ts_recent, the age will be reset later and ts_recent // will get a valid value. If it does not, setting // ts_recent to zero will at least satisfy the // requirement that zero be placed in the timestamp // echo reply when ts_recent isn't valid. The // age isn't reset until we get a valid ts_recent // because we don't want out-of-order segments to be // dropped when ts_recent is old. tp->ts_recent = 0; } else { g_tcpstat.tcps_rcvduppack++; g_tcpstat.tcps_rcvdupbyte += ti->ti_len; g_tcpstat.tcps_pawsdrop++; goto dropafterack; } } todrop = tp->rcv_nxt - ti->ti_seq; if (todrop > 0) { if (tiflags & TH_SYN) { tiflags &= ~TH_SYN; ti->ti_seq++; if (ti->ti_urp > 1) ti->ti_urp--; else tiflags &= ~TH_URG; todrop--; } if ( todrop >= ti->ti_len || ( todrop == ti->ti_len && (tiflags & TH_FIN ) == 0 ) ) { // Any valid FIN must be to the left of the window. // At this point the FIN must be a duplicate or // out of sequence; drop it. tiflags &= ~TH_FIN; // Send an ACK to resynchronize and drop any data // But keep on processing for RST or ACK. 
tp->t_flags |= TF_ACKNOW; TRACE("send ack now to resync, tp_flags=%d", tp->t_flags); todrop = ti->ti_len; g_tcpstat.tcps_rcvdupbyte += ti->ti_len; g_tcpstat.tcps_rcvduppack++; } else { g_tcpstat.tcps_rcvpartduppack++; g_tcpstat.tcps_rcvpartdupbyte += ti->ti_len; } m_adj(m, todrop); ti->ti_seq += todrop; ti->ti_len -= todrop; if (ti->ti_urp > todrop) ti->ti_urp -= todrop; else { tiflags &= ~TH_URG; ti->ti_urp = 0; } } // If new data are received on a connection after the // user processes are gone, then RST the other end. if ((so->so_state & USN_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { tp = tcp_close(tp); g_tcpstat.tcps_rcvafterclose++; goto dropwithreset; } // If segment ends after window, drop trailing data // (and PUSH and FIN); if nothing left, just ACK. todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); if (todrop > 0) { g_tcpstat.tcps_rcvpackafterwin++; if (todrop >= ti->ti_len) { g_tcpstat.tcps_rcvbyteafterwin += ti->ti_len; // If a new connection request is received // while in TIME_WAIT, drop the old connection // and start over if the sequence numbers // are above the previous ones. if (tiflags & TH_SYN && tp->t_state == TCPS_TIME_WAIT && SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { iss = tp->snd_nxt + TCP_ISSINCR; tp = tcp_close(tp); goto findpcb; } // If window is closed can only take segments at // window edge, and have to drop data and PUSH from // incoming segments. Continue processing, but // remember to ack. Otherwise, drop segment // and ack. if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; g_tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; } else g_tcpstat.tcps_rcvbyteafterwin += todrop; m_adj(m, -todrop); ti->ti_len -= todrop; tiflags &= ~(TH_PUSH|TH_FIN); } // check valid timestamp. Replace code above. 
if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) ) { tp->ts_recent_age = g_tcp_now; tp->ts_recent = ts_val; } // If the RST bit is set examine the state: // SYN_RECEIVED STATE: // If passive open, return to LISTEN state. // If active open, inform user that connection was refused. // ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: // Inform user that connection was reset, and close tcb. // CLOSING, LAST_ACK, TIME_WAIT STATES // Close the tcb. if (tiflags&TH_RST) switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: DEBUG("change tcp state to TCPS_CLOSED, state=%d", tp->t_state); tp->t_state = TCPS_CLOSED; // tcp event usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSED, 0); g_tcpstat.tcps_drops++; tp = tcp_close(tp); goto drop; case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: tp = tcp_close(tp); goto drop; } // If a SYN is in the window, then this is an // error and we send an RST and drop the connection. if (tiflags & TH_SYN) { tp = tcp_drop(tp, ECONNRESET); goto dropwithreset; } // If the ACK bit is off we drop the segment and return. if ((tiflags & TH_ACK) == 0) goto drop; // Ack processing. switch (tp->t_state) { // In SYN_RECEIVED state if the ack ACKs our SYN then enter // ESTABLISHED state and continue processing, otherwise // send an RST. case TCPS_SYN_RECEIVED: if (SEQ_GT(tp->snd_una, ti->ti_ack) || SEQ_GT(ti->ti_ack, tp->snd_max)) goto dropwithreset; g_tcpstat.tcps_connects++; DEBUG("change tcp state to TCPS_ESTABLISHED, state=%d", tp->t_state); tp->t_state = TCPS_ESTABLISHED; soisconnected(so); // Do window scaling? 
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; } tcp_reass(tp, (struct tcpiphdr *)0, (usn_mbuf_t *)0); tp->snd_wl1 = ti->ti_seq - 1; // fall into ... // In ESTABLISHED state: drop duplicate ACKs; ACK out of range // ACKs. If the ack is in the range // tp->snd_una < ti->ti_ack <= tp->snd_max // then advance tp->snd_una to ti->ti_ack and drop // data from the retransmission queue. If this ACK reflects // more up to date window information we update our window information. case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { g_tcpstat.tcps_rcvdupack++; // If we have outstanding data (other than // a window probe), this is a completely // duplicate ack (ie, window info didn't // change), the ack is the biggest we've // seen and we've seen exactly our rexmt // threshhold of them, assume a packet // has been dropped and retransmit it. // Kludge snd_nxt & the congestion // window so we send only this one // packet. // // We know we're losing at the current // window size so do congestion avoidance // (set ssthresh to half the current window // and pull our congestion window back to // the new ssthresh). // // Dup acks mean that packets have left the // network (they're now cached at the receiver) // so bump cwnd by the amount in the receiver // to keep a constant cwnd packets in the // network. 
if (tp->t_timer[TCPT_REXMT] == 0 || ti->ti_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == g_tcprexmtthresh) { // congestion avoidance tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_timer[TCPT_REXMT] = 0; tp->t_rtt = 0; tp->snd_nxt = ti->ti_ack; tp->snd_cwnd = tp->t_maxseg; tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > g_tcprexmtthresh) { tp->snd_cwnd += tp->t_maxseg; tcp_output(tp); goto drop; } } else tp->t_dupacks = 0; break; } // If the congestion window was inflated to account // for the other side's cached packets, retract it. if (tp->t_dupacks > g_tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; if (SEQ_GT(ti->ti_ack, tp->snd_max)) { g_tcpstat.tcps_rcvacktoomuch++; goto dropafterack; } acked = ti->ti_ack - tp->snd_una; g_tcpstat.tcps_rcvackpack++; g_tcpstat.tcps_rcvackbyte += acked; // If we have a timestamp reply, update smoothed // round trip time. If no timestamp is present but // transmit timer is running and timed sequence // number was acked, update smoothed round trip time. // Since we now have an rtt measurement, cancel the // timer backoff (cf., Phil Karn's retransmit alg.). // Recompute the initial retransmit timer. if (ts_present) tcp_xmit_timer(tp, g_tcp_now-ts_ecr+1); else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) tcp_xmit_timer(tp,tp->t_rtt); // If all outstanding data is acked, stop retransmit // timer and remember to restart (more output or persist). // If there is more data to be acked, restart retransmit // timer, using current (possibly backed-off) value. 
if (ti->ti_ack == tp->snd_max) { tp->t_timer[TCPT_REXMT] = 0; DEBUG("change needoutput to 1"); needoutput = 1; tp->t_flags |= TF_NEEDOUTPUT; } else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; // When new data is acked, open the congestion window. // If the window gives us less than ssthresh packets // in flight, open exponentially (maxseg per packet). // Otherwise open linearly: maxseg per window // (maxseg * (maxseg / cwnd) per packet). { u_int cw = tp->snd_cwnd; u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) incr = incr * incr / cw; tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); } if (acked > so->so_snd->sb_cc) { tp->snd_wnd -= so->so_snd->sb_cc; DEBUG("drop all so_snd buffer, drop_bytes=%d, acked=%d", so->so_snd->sb_cc, acked); sbdrop(so->so_snd, (int)so->so_snd->sb_cc); ourfinisacked = 1; } else { DEBUG("drop so_snd buffer, drop_bytes=%d, len=%d", acked, so->so_snd->sb_cc); sbdrop(so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } //if (so->so_snd->sb_flags & SB_NOTIFY) { sowwakeup(so); usnet_tcpin_wwakeup(so, USN_TCP_IN, USN_TCPEV_WRITE, 0); //} tp->snd_una = ti->ti_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { // In FIN_WAIT_1 STATE in addition to the processing // for the ESTABLISHED state if our FIN is now acknowledged // then enter FIN_WAIT_2. case TCPS_FIN_WAIT_1: if (ourfinisacked) { // If we can't receive any more // data, then closing user can proceed. // Starting the timer is contrary to the // specification, but if we don't get a FIN // we'll hang forever. 
if (so->so_state & USN_CANTRCVMORE) { soisdisconnected(so); tp->t_timer[TCPT_2MSL] = g_tcp_maxidle; } DEBUG("change tcp state to TCPS_FIN_WAIT_2, state=%d", tp->t_state); tp->t_state = TCPS_FIN_WAIT_2; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_FIN_WAIT2, 0); } break; // In CLOSING STATE in addition to the processing for // the ESTABLISHED state if the ACK acknowledges our FIN // then enter the TIME-WAIT state, otherwise ignore // the segment. case TCPS_CLOSING: if (ourfinisacked) { DEBUG("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_TIME_WAIT; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0); tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; soisdisconnected(so); } break; // In LAST_ACK, we may still be waiting for data to drain // and/or to be acked, as well as for the ack of our FIN. // If our FIN is now acknowledged, delete the TCB, // enter the closed state and return. case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; // In TIME_WAIT state the only thing that should arrive // is a retransmission of the remote FIN. Acknowledge // it and restart the finack timer. case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; goto dropafterack; } } step6: // Update window information. // Don't look at window if no ACK: TAC's send garbage on first SYN. if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, ti->ti_seq) || (tp->snd_wl1 == ti->ti_seq && (SEQ_LT(tp->snd_wl2, ti->ti_ack) || (tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) )) )) { // keep track of pure window updates if (ti->ti_len == 0 && tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) g_tcpstat.tcps_rcvwinupd++; tp->snd_wnd = tiwin; tp->snd_wl1 = ti->ti_seq; tp->snd_wl2 = ti->ti_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; DEBUG("change needoutput to 1"); tp->t_flags |= TF_NEEDOUTPUT; needoutput = 1; } // Process segments with URG. 
if ((tiflags & TH_URG) && ti->ti_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { // This is a kludge, but if we receive and accept // random urgent pointers, we'll crash in // soreceive. It's hard to imagine someone // actually wanting to send this much urgent data. if (ti->ti_urp + so->so_rcv->sb_cc > g_sb_max) { ti->ti_urp = 0; // XXX tiflags &= ~TH_URG; // XXX goto dodata; // XXX } // If this segment advances the known urgent pointer, // then mark the data stream. This should not happen // in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since // a FIN has been received from the remote side. // In these states we ignore the URG. // // According to RFC961 (Assigned Protocols), // the urgent pointer points to the last octet // of urgent data. We continue, however, // to consider it to indicate the first octet // of data past the urgent section as the original // spec states (in one of two places). if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { tp->rcv_up = ti->ti_seq + ti->ti_urp; so->so_oobmark = so->so_rcv->sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_state |= USN_RCVATMARK; sohasoutofband(so); // send async event to app threads. usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPEV_OUTOFBOUND, 0); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } // Remove out of band data so doesn't get presented to user. // This can happen independent of advancing the URG pointer, // but if two URG's are pending at once, some out-of-band // data may creep in... ick. if (ti->ti_urp <= ti->ti_len #ifdef SO_OOBINLINE && (so->so_options & SO_OOBINLINE) == 0 #endif ) tcp_pulloutofband(so, ti, m); } else // If no out of band data is expected, // pull receive urgent pointer along // with the receive window. 
if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; dodata: // XXX #ifdef DUMP_PAYLOAD DEBUG("Handle data"); dump_chain(m,"tcp"); #endif // Process the segment text, merging it into the TCP sequencing queue, // and arranging for acknowledgment of receipt if necessary. // This process logically involves adjusting tp->rcv_wnd as data // is presented to the user (this happens in tcp_usrreq.c, // case PRU_RCVD). If a FIN has already been received on this // connection then we just ignore the text. if ((ti->ti_len || (tiflags&TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { TCP_REASS(tp, ti, m, so, tiflags); // Note the amount of data that peer has sent into // our window, in order to estimate the sender's // buffer size. len = so->so_rcv->sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); } else { usn_free_cmbuf(m); tiflags &= ~TH_FIN; } // If FIN is received ACK the FIN and let the user know // that the connection is closing. if (tiflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); tp->t_flags |= TF_ACKNOW; TRACE("ack FIN now, tp flags=%d", tp->t_flags); tp->rcv_nxt++; } switch (tp->t_state) { // In SYN_RECEIVED and ESTABLISHED STATES // enter the CLOSE_WAIT state. case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: TRACE("change tcp state to TCPS_CLOSE_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_CLOSE_WAIT; soewakeup(so, 0); usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSE_WAIT, 0); break; // If still in FIN_WAIT_1 STATE FIN has not been acked so // enter the CLOSING state. case TCPS_FIN_WAIT_1: TRACE("change tcp state to TCPS_CLOSING, state=%d", tp->t_state); tp->t_state = TCPS_CLOSING; usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_CLOSING, 0); break; // In FIN_WAIT_2 state enter the TIME_WAIT state, // starting the time-wait timer, turning off the other // standard timers. 
case TCPS_FIN_WAIT_2: TRACE("change tcp state to TCPS_TIME_WAIT, state=%d", tp->t_state); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; soisdisconnected(so); usnet_tcpin_ewakeup(so, USN_TCP_IN, USN_TCPST_TIME_WAIT, 0); break; // In TIME_WAIT state restart the 2 MSL time_wait timer. case TCPS_TIME_WAIT: tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; break; } } if (so->so_options & SO_DEBUG) { TRACE("tcp trace, so_options=%d", so->so_options); tcp_trace(TA_INPUT, ostate, tp, &g_tcp_saveti, 0); } // Return any desired output. //if (needoutput || (tp->t_flags & TF_ACKNOW)){ if (tp->t_flags & TF_NEEDOUTPUT || (tp->t_flags & TF_ACKNOW)){ TRACE("ack now or need to ouput, tp->t_flags=%d", tp->t_flags); tcp_output(tp); } return; dropafterack: TRACE("dropafterack"); // Generate an ACK dropping incoming segment if it occupies // sequence space, where the ACK reflects our state. if (tiflags & TH_RST) goto drop; usn_free_cmbuf(m); tp->t_flags |= TF_ACKNOW; TRACE("ack now, tp flags=%d", tp->t_flags); tcp_output(tp); return; dropwithreset: TRACE("dropwithreset"); // Generate a RST, dropping incoming segment. // Make ACK acceptable to originator of segment. // Don't bother to respond if destination was broadcast/multicast. #define USN_MULTICAST(i) (((u_int)(i) & 0xf0000000) == 0xe0000000) if ((tiflags & TH_RST) || m->flags & (BUF_BCAST|BUF_MCAST) || USN_MULTICAST(ntohl(ti->ti_dst.s_addr))) goto drop; if (tiflags & TH_ACK) tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); else { if (tiflags & TH_SYN) ti->ti_len++; tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, TH_RST|TH_ACK); } // destroy temporarily created socket if (dropsocket) soabort(so); return; drop: TRACE("drop"); // Drop space held by incoming segment and return. 
if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { TRACE("tcp trace: drop a socket"); tcp_trace(TA_DROP, ostate, tp, &g_tcp_saveti, 0); } usn_free_cmbuf(m); // destroy temporarily created socket if (dropsocket) soabort(so); return; }
/* * send message to the socket. */ static int key_sendup0(struct rawcb *rp, struct mbuf *m, int promisc, int canwait) { struct keycb *kp = (struct keycb *)rp; struct mbuf *n; int error = 0; if (canwait) { if (kp->kp_queue) { for (n = kp->kp_queue; n && n->m_nextpkt; n = n->m_nextpkt) ; n->m_nextpkt = m; m = kp->kp_queue; kp->kp_queue = NULL; } else m->m_nextpkt = NULL; /* just for safety */ } else m->m_nextpkt = NULL; for (; m && error == 0; m = n) { n = m->m_nextpkt; if (promisc) { struct sadb_msg *pmsg; M_PREPEND(m, sizeof(struct sadb_msg), M_NOWAIT); if (m && m->m_len < sizeof(struct sadb_msg)) m = m_pullup(m, sizeof(struct sadb_msg)); if (!m) { pfkeystat.in_nomem++; error = ENOBUFS; goto recovery; } m->m_pkthdr.len += sizeof(*pmsg); pmsg = mtod(m, struct sadb_msg *); bzero(pmsg, sizeof(*pmsg)); pmsg->sadb_msg_version = PF_KEY_V2; pmsg->sadb_msg_type = SADB_X_PROMISC; pmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); /* pid and seq? */ pfkeystat.in_msgtype[pmsg->sadb_msg_type]++; } if (canwait && sbspace(&rp->rcb_socket->so_rcv) < m->m_pkthdr.len) { error = EAGAIN; goto recovery; } m->m_nextpkt = NULL; if (!sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&key_src, m, NULL)) { pfkeystat.in_nomem++; error = ENOBUFS; goto recovery; } else { sorwakeup(rp->rcb_socket); error = 0; } } return (error); recovery: if (kp->kp_queue) { /* * insert m to the head of queue, as normally mbuf on the queue * is less important than others. */ if (m) { m->m_nextpkt = kp->kp_queue; kp->kp_queue = m; } } else { /* recover the queue */ if (!m) { /* first ENOBUFS case */ kp->kp_queue = n; } else { kp->kp_queue = m; m->m_nextpkt = n; } } return (error); }