static int udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error = 0; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_send: inp == NULL")); INP_WLOCK(inp); if (addr) { if (addr->sa_len != sizeof(struct sockaddr_in6)) { error = EINVAL; goto bad; } if (addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; goto bad; } } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { int hasv4addr; struct sockaddr_in6 *sin6 = NULL; if (addr == NULL) hasv4addr = (inp->inp_vflag & INP_IPV4); else { sin6 = (struct sockaddr_in6 *)addr; hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 1 : 0; } if (hasv4addr) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; /* * XXXRW: We release UDP-layer locks before calling * udp_send() in order to avoid recursion. However, * this does mean there is a short window where inp's * fields are unstable. Could this lead to a * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ INP_WUNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock(addr); pru = inetsw[ip_protox[nxt]].pr_usrreqs; /* addr will just be freed in sendit(). */ return ((*pru->pru_send)(so, flags, m, addr, control, td)); } } #endif #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif INP_HASH_WLOCK(pcbinfo); error = udp6_output(inp, m, addr, control, td); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); bad: INP_WUNLOCK(inp); m_freem(m); return (error); }
/* * active open (soconnect). * * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (This has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. */ int t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, struct sockaddr *nam) { struct adapter *sc = tod->tod_softc; struct toepcb *toep = NULL; struct wrqe *wr = NULL; struct ifnet *rt_ifp = rt->rt_ifp; struct vi_info *vi; int mtu_idx, rscale, qid_atid, rc, isipv6, txqid, rxqid; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); int reason; struct offload_settings settings; uint16_t vid = 0xfff, pcp = 0; INP_WLOCK_ASSERT(inp); KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, ("%s: dest addr %p has family %u", __func__, nam, nam->sa_family)); if (rt_ifp->if_type == IFT_ETHER) vi = rt_ifp->if_softc; else if (rt_ifp->if_type == IFT_L2VLAN) { struct ifnet *ifp = VLAN_TRUNKDEV(rt_ifp); vi = ifp->if_softc; VLAN_TAG(rt_ifp, &vid); VLAN_PCP(rt_ifp, &pcp); } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */ else DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); rw_rlock(&sc->policy_lock); settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, EVL_MAKETAG(vid, pcp, 0), inp); rw_runlock(&sc->policy_lock); if (!settings.offload) DONT_OFFLOAD_ACTIVE_OPEN(EPERM); if (settings.txq >= 0 && settings.txq < vi->nofldtxq) txqid = settings.txq; else txqid = arc4random() % vi->nofldtxq; txqid += vi->first_ofld_txq; if (settings.rxq >= 0 && settings.rxq < vi->nofldrxq) rxqid = settings.rxq; else rxqid = arc4random() % vi->nofldrxq; rxqid += vi->first_ofld_rxq; toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT | M_ZERO); if (toep == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->tid = alloc_atid(sc, toep); if (toep->tid < 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->l2te = t4_l2t_get(vi->pi, rt_ifp, rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam); if (toep->l2te == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); isipv6 = nam->sa_family == AF_INET6; wr = alloc_wrqe(act_open_cpl_size(sc, isipv6), toep->ctrlq); if (wr == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->vnet = so->so_vnet; set_ulp_mode(toep, select_ulp_mode(so, sc, &settings)); SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); SOCKBUF_UNLOCK(&so->so_rcv); /* * The kernel sets request_r_scale based on sb_max whereas we need to * take hardware's MAX_RCV_WND into account too. This is normally a * no-op as MAX_RCV_WND is much larger than the default sb_max. */ if (tp->t_flags & TF_REQ_SCALE) rscale = tp->request_r_scale = select_rcv_wscale(); else rscale = 0; mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, &settings); qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) | V_TID_TID(toep->tid) | V_TID_COOKIE(CPL_COOKIE_TOM); if (isipv6) { struct cpl_act_open_req6 *cpl = wrtod(wr); struct cpl_t5_act_open_req6 *cpl5 = (void *)cpl; struct cpl_t6_act_open_req6 *cpl6 = (void *)cpl; if ((inp->inp_vflag & INP_IPV6) == 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); toep->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL); if (toep->ce == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOENT); switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid)); cpl->local_port = inp->inp_lport; cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; cpl->peer_port = inp->inp_fport; cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, toep->rx_credits, toep->ulp_mode, &settings); cpl->opt2 = calc_opt2a(so, toep, &settings); } else { struct cpl_act_open_req *cpl = wrtod(wr); struct cpl_t5_act_open_req *cpl5 = (void *)cpl; struct cpl_t6_act_open_req *cpl6 = (void *)cpl; switch (chip_id(sc)) { case CHELSIO_T4: INIT_TP_WR(cpl, 0); cpl->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T5: INIT_TP_WR(cpl5, 0); cpl5->iss = htobe32(tp->iss); cpl5->params = select_ntuple(vi, toep->l2te); break; case CHELSIO_T6: default: INIT_TP_WR(cpl6, 0); cpl6->iss = htobe32(tp->iss); cpl6->params = select_ntuple(vi, toep->l2te); break; } OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, toep->rx_credits, toep->ulp_mode, &settings); cpl->opt2 = calc_opt2a(so, toep, &settings); } CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__, toep->tid, tcpstates[tp->t_state], toep, inp); offload_socket(so, toep); rc = t4_l2t_send(sc, wr, toep->l2te); if (rc == 0) { toep->flags |= TPF_CPL_PENDING; return (0); } undo_offload_socket(so); reason = __LINE__; failed: CTR3(KTR_CXGBE, "%s: not offloading (%d), rc %d", __func__, reason, rc); if (wr) free_wrqe(wr); if (toep) { if (toep->tid >= 0) free_atid(sc, toep->tid); if (toep->l2te) t4_l2t_release(toep->l2te); if (toep->ce) t4_release_lip(sc, toep->ce); free_toepcb(toep); } return (rc); }
static int udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in6 *sin6; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); sin6 = (struct sockaddr_in6 *)nam; KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); /* * XXXRW: Need to clarify locking of v4/v6 flags. */ INP_WLOCK(inp); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto out; } in6_sin6_2_sin(&sin, sin6); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = prison_remote_ip4(td->td_ucred, &sin.sin_addr); if (error != 0) goto out; INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); goto out; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = EISCONN; goto out; } inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr); if (error != 0) goto out; INP_HASH_WLOCK(pcbinfo); error = in6_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); out: INP_WUNLOCK(inp); return (error); }
/* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees. */ static int tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; int headlocked = 0; #ifdef INET6 int isipv6; #endif TCPDEBUG0; //printf("tcp_usr_send: called\n"); /* * We require the pcbinfo lock in two cases: * * (1) An implied connect is taking place, which can result in * binding IPs and ports and hence modification of the pcb hash * chains. * * (2) PRUS_EOF is set, resulting in explicit close on the send. */ if ((nam != NULL) || (flags & PRUS_EOF)) { INP_INFO_WLOCK(&tcbinfo); headlocked = 1; } inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_LOCK(inp); if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { if (control) m_freem(control); if (m) m_freem(m); error = ECONNRESET; goto out; } #ifdef INET6 isipv6 = nam && nam->sa_family == AF_INET6; #endif /* INET6 */ tp = intotcpcb(inp); TCPDEBUG1(); if (control) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { m_freem(control); if (m) m_freem(m); error = EINVAL; goto out; } m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { //printf("===tcp_usr_send(1): ===> so->so_snd.sb_cc=%d\n", so->so_snd.sb_cc); sbappendstream(&so->so_snd, m); //printf("===tcp_usr_send(1.5): ===> so->so_snd.sb_cc=%d\n", so->so_snd.sb_cc); //printf("===tcp_usr_send(1.7): ===> m->m_data=0x%lx m->m_len=%d\n", // (long)m->m_data, m->m_len); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg/maxopd using peer's cached * MSS. */ INP_INFO_WLOCK_ASSERT(&tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); else #endif /* INET6 */ error = tcp_connect(tp, nam, td); if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent. */ INP_INFO_WLOCK_ASSERT(&tcbinfo); socantsendmore(so); tcp_usrclosed(tp); } if (headlocked) { INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; } if (tp != NULL) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; error = tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } } else { /* * XXXRW: PRUS_EOF not implemented with PRUS_OOB? */ SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) < -512) { SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappendstream_locked(&so->so_snd, m); //printf("===tcp_usr_send(2): ===> m->m_data=0x%lx m->m_len=%d\n", // (long)m->m_data, m->m_len); SOCKBUF_UNLOCK(&so->so_snd); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg/maxopd using peer's cached * MSS. */ INP_INFO_WLOCK_ASSERT(&tcbinfo); #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); else #endif /* INET6 */ error = tcp_connect(tp, nam, td); if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; } else if (nam) { INP_INFO_WUNLOCK(&tcbinfo); headlocked = 0; } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_flags |= TF_FORCEDATA; #ifdef MAXHE_TODO error = tcp_output(tp); #else if(so->nkn_where==0) { error = tcp_output(tp); } else { error = 0; } #endif // MAXHE_TODO tp->t_flags &= ~TF_FORCEDATA; } out: TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_UNLOCK(inp); if (headlocked) INP_INFO_WUNLOCK(&tcbinfo); return (error); }
/* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees. */ static int tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct proc *p) { int s = splnet(); int error = 0; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; #ifdef INET6 int isipv6; #endif TCPDEBUG0; if (inp == NULL) { /* * OOPS! we lost a race, the TCP session got reset after * we checked SS_CANTSENDMORE, eg: while doing uiomove or a * network interrupt in the non-splnet() section of sosend(). */ if (m) m_freem(m); if (control) m_freem(control); error = ECONNRESET; /* XXX EPIPE? */ tp = NULL; TCPDEBUG1(); goto out; } #ifdef INET6 isipv6 = nam && nam->sa_family == AF_INET6; #endif /* INET6 */ tp = intotcpcb(inp); TCPDEBUG1(); if (control) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { m_freem(control); if (m) m_freem(m); error = EINVAL; goto out; } m_freem(control); /* empty control, just free it */ } if(!(flags & PRUS_OOB)) { sbappend(&so->so_snd, m); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg/maxopd using peer's cached * MSS. */ #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, p); else #endif /* INET6 */ error = tcp_connect(tp, nam, p); if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent. */ socantsendmore(so); tp = tcp_usrclosed(tp); } if (tp != NULL) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; error = tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } } else { if (sbspace(&so->so_snd) < -512) { m_freem(m); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappend(&so->so_snd, m); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg/maxopd using peer's cached * MSS. */ #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, p); else #endif /* INET6 */ error = tcp_connect(tp, nam, p); if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_force = 1; error = tcp_output(tp); tp->t_force = 0; } COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); }
/* * The new sockopt interface makes it possible for us to block in the * copyin/out step (if we take a page fault). Taking a page fault at * splnet() is probably a Bad Thing. (Since sockets and pcbs both now * use TSM, there probably isn't any need for this function to run at * splnet() any more. This needs more examination.) * * XXXRW: The locking here is wrong; we may take a page fault while holding * the inpcb lock. */ int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { int error, opt, optval; struct inpcb *inp; struct tcpcb *tp; struct tcp_info ti; error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); INP_LOCK(inp); if (sopt->sopt_level != IPPROTO_TCP) { INP_UNLOCK(inp); #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) error = ip6_ctloutput(so, sopt); else #endif /* INET6 */ error = ip_ctloutput(so, sopt); return (error); } if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #ifdef TCP_SIGNATURE case TCP_MD5SIG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval > 0) tp->t_flags |= TF_SIGNATURE; else tp->t_flags &= ~TF_SIGNATURE; break; #endif /* TCP_SIGNATURE */ case TCP_NODELAY: case TCP_NOOPT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case TCP_NODELAY: opt = TF_NODELAY; break; case TCP_NOOPT: opt = TF_NOOPT; break; default: opt = 0; /* dead code to fool gcc */ break; } if (optval) tp->t_flags |= opt; else tp->t_flags &= ~opt; break; case TCP_NOPUSH: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) tp->t_flags |= TF_NOPUSH; else { tp->t_flags &= ~TF_NOPUSH; error = tcp_output(tp); } break; case TCP_MAXSEG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval > 0 && optval <= tp->t_maxseg && optval + 40 >= tcp_minmss) tp->t_maxseg = optval; else error = EINVAL; break; case TCP_INFO: error = EINVAL; break; default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #ifdef TCP_SIGNATURE case TCP_MD5SIG: optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case TCP_NODELAY: optval = tp->t_flags & TF_NODELAY; error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_MAXSEG: optval = tp->t_maxseg; error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOPUSH: optval = tp->t_flags & TF_NOPUSH; error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_INFO: tcp_fill_info(tp, &ti); error = sooptcopyout(sopt, &ti, sizeof ti); break; default: error = ENOPROTOOPT; break; } break; } out: INP_UNLOCK(inp); return (error); }
/* * NAME: tp_attach() * * CALLED FROM: * tp_usrreq, PRU_ATTACH * * FUNCTION and ARGUMENTS: * given a socket (so) and a protocol family (dom), allocate a tpcb * and ref structure, initialize everything in the structures that * needs to be initialized. * * RETURN VALUE: * 0 ok * EINVAL if DEBUG(X) in is on and a disaster has occurred * ENOPROTOOPT if TP hasn't been configured or if the * socket wasn't created with tp as its protocol * EISCONN if this socket is already part of a connection * ETOOMANYREFS if ran out of tp reference numbers. * E* whatever error is returned from soreserve() * for from the network-layer pcb allocation routine * * SIDE EFFECTS: * * NOTES: */ int tp_attach(struct socket *so, int protocol) { struct tp_pcb *tpcb; int error = 0; int dom = so->so_proto->pr_domain->dom_family; u_long lref; #ifdef ARGO_DEBUG if (argo_debug[D_CONN]) { printf("tp_attach:dom 0x%x so %p ", dom, so); } #endif #ifdef TPPT if (tp_traceflags[D_CONN]) { tptrace(TPPTmisc, "tp_attach:dom so", dom, so, 0, 0); } #endif if (so->so_pcb != NULL) { return EISCONN; /* socket already part of a connection */ } if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) error = soreserve(so, tp_sendspace, tp_recvspace); /* later an ioctl will allow reallocation IF still in closed state */ if (error) goto bad2; MALLOC(tpcb, struct tp_pcb *, sizeof(*tpcb), M_PCB, M_NOWAIT|M_ZERO); if (tpcb == NULL) { error = ENOBUFS; goto bad2; } if (((lref = tp_getref(tpcb)) & TP_ENOREF) != 0) { error = ETOOMANYREFS; goto bad3; } tpcb->tp_lref = lref; tpcb->tp_sock = so; tpcb->tp_domain = dom; tpcb->tp_rhiwat = so->so_rcv.sb_hiwat; /* tpcb->tp_proto = protocol; someday maybe? */ if (protocol && protocol < ISOPROTO_TP4) { tpcb->tp_netservice = ISO_CONS; tpcb->tp_snduna = (SeqNum) - 1; /* kludge so the pseudo-ack * from the CR/CC will * generate correct fake-ack * values */ } else { tpcb->tp_netservice = (dom == AF_INET) ? IN_CLNS : ISO_CLNS; /* the default */ } tpcb->_tp_param = tp_conn_param[tpcb->tp_netservice]; tpcb->tp_state = TP_CLOSED; tpcb->tp_vers = TP_VERSION; tpcb->tp_notdetached = 1; /* * Spec says default is 128 octets, that is, if the tpdusize argument * never appears, use 128. As the initiator, we will always "propose" * the 2048 size, that is, we will put this argument in the CR * always, but accept what the other side sends on the CC. If the * initiator sends us something larger on a CR, we'll respond w/ * this. Our maximum is 4096. See tp_chksum.c comments. */ tpcb->tp_cong_win = tpcb->tp_l_tpdusize = 1 << tpcb->tp_tpdusize; tpcb->tp_seqmask = TP_NML_FMT_MASK; tpcb->tp_seqbit = TP_NML_FMT_BIT; tpcb->tp_seqhalf = tpcb->tp_seqbit >> 1; /* attach to a network-layer protoswitch */ if ((error = tp_set_npcb(tpcb)) != 0) goto bad4; ASSERT(tpcb->tp_nlproto->nlp_afamily == tpcb->tp_domain); /* nothing to do for iso case */ if (dom == AF_INET) { /* tp_set_npcb sets it */ KASSERT(so->so_pcb != NULL); sotoinpcb(so)->inp_ppcb = (void *) tpcb; } return 0; bad4: #ifdef ARGO_DEBUG if (argo_debug[D_CONN]) { printf("BAD4 in tp_attach, so %p\n", so); } #endif tp_freeref(tpcb->tp_lref); bad3: #ifdef ARGO_DEBUG if (argo_debug[D_CONN]) { printf("BAD3 in tp_attach, so %p\n", so); } #endif free((void *) tpcb, M_PCB); /* never a cluster */ bad2: #ifdef ARGO_DEBUG if (argo_debug[D_CONN]) { printf("BAD2 in tp_attach, so %p\n", so); } #endif so->so_pcb = 0; /* bad: */ #ifdef ARGO_DEBUG if (argo_debug[D_CONN]) { printf("BAD in tp_attach, so %p\n", so); } #endif return error; }
int udp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case UDP_ENCAP: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); #endif switch (optval) { case 0: /* Clear all UDP encap. */ #ifdef IPSEC_NAT_T up->u_flags &= ~UF_ESPINUDP_ALL; #endif break; #ifdef IPSEC_NAT_T case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->u_flags &= ~UF_ESPINUDP_ALL; if (optval == UDP_ENCAP_ESPINUDP) up->u_flags |= UF_ESPINUDP; else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE) up->u_flags |= UF_ESPINUDP_NON_IKE; break; #endif default: error = EINVAL; break; } INP_WUNLOCK(inp); break; case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (optval != 0 && optval < 8) { INP_WUNLOCK(inp); error = EINVAL; break; } if (sopt->sopt_name == UDPLITE_SEND_CSCOV) up->u_txcslen = optval; else up->u_rxcslen = optval; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #ifdef IPSEC_NAT_T case UDP_ENCAP: up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); optval = up->u_flags & UF_ESPINUDP_ALL; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (sopt->sopt_name == UDPLITE_SEND_CSCOV) optval = up->u_txcslen; else optval = up->u_rxcslen; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); }
int tcp_usrreq(struct socket * so, struct mbuf * m, struct mbuf * nam) { struct inpcb * inp; struct tcpcb * tp; int error = 0; int req; #ifdef DO_TCPTRACE int ostate; #endif req = so->so_req; /* get request from socket struct */ inp = sotoinpcb(so); /* * When a TCP is attached to a socket, then there will be * a (struct inpcb) pointed at by the socket, and this * structure will point at a subsidary (struct tcpcb). */ if (inp == 0 && req != PRU_ATTACH) { return (EINVAL); } if (inp) tp = intotcpcb(inp); else /* inp and tp not set, make sure this is OK: */ { if (req == PRU_ATTACH) tp = NULL; /* stifle compiler warnings about using unassigned tp*/ else { dtrap(); /* programming error? */ return EINVAL; } } switch (req) { /* * TCP attaches to socket via PRU_ATTACH, reserving space, * and an internet control block. */ case PRU_ATTACH: if (inp) { error = EISCONN; break; } error = tcp_attach(so); if (error) break; if ((so->so_options & SO_LINGER) && so->so_linger == 0) so->so_linger = TCP_LINGERTIME; #ifdef DO_TCPTRACE SETTP(tp, sototcpcb(so)); #endif break; /* * PRU_DETACH detaches the TCP protocol from the socket. * If the protocol state is non-embryonic, then can't * do this directly: have to initiate a PRU_DISCONNECT, * which may finish later; embryonic TCB's can just * be discarded here. */ case PRU_DETACH: if (tp->t_state > TCPS_LISTEN) SETTP(tp, tcp_disconnect(tp)); else SETTP(tp, tcp_close(tp)); break; /* * Give the socket an address. */ case PRU_BIND: /* bind is quite different for IPv4 and v6, so we use two * seperate pcbbind routines. so_domain was checked for * validity way up in t_bind() */ #ifdef IP_V4 if(inp->inp_socket->so_domain == AF_INET) { error = in_pcbbind(inp, nam); break; } #endif /* IP_V4 */ #ifdef IP_V6 if(inp->inp_socket->so_domain == AF_INET6) { error = ip6_pcbbind(inp, nam); break; } #endif /* IP_V6 */ dtrap(); /* not v4 or v6? */ error = EINVAL; break; /* * Prepare to accept connections. */ case PRU_LISTEN: if (inp->inp_lport == 0) error = in_pcbbind(inp, (struct mbuf *)0); if (error == 0) tp->t_state = TCPS_LISTEN; break; /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. * Enter SYN_SENT state, and mark socket as connecting. * Start keep-alive timer, and seed output sequence space. * Send initial segment on connection. */ case PRU_CONNECT: if (inp->inp_lport == 0) { #ifdef IP_V4 #ifndef IP_V6 /* v4 only */ error = in_pcbbind(inp, (struct mbuf *)0); #else /* dual mode */ if(so->so_domain == AF_INET) error = in_pcbbind(inp, (struct mbuf *)0); else error = ip6_pcbbind(inp, (struct mbuf *)0); #endif /* end dual mode code */ #else /* no v4, v6 only */ error = ip6_pcbbind(inp, (struct mbuf *)0); #endif /* end v6 only */ if (error) break; } #ifdef IP_V4 #ifndef IP_V6 /* v4 only */ error = in_pcbconnect(inp, nam); #else /* dual mode */ if(so->so_domain == AF_INET) error = in_pcbconnect(inp, nam); else error = ip6_pcbconnect(inp, nam); #endif /* end dual mode code */ #else /* no v4, v6 only */ error = ip6_pcbconnect(inp, nam); #endif /* end v6 only */ if (error) break; tp->t_template = tcp_template(tp); if (tp->t_template == 0) { #ifdef IP_V4 #ifndef IP_V6 /* v4 only */ in_pcbdisconnect(inp); #else /* dual mode */ if(so->so_domain == AF_INET) in_pcbdisconnect(inp); else ip6_pcbdisconnect(inp); #endif /* end dual mode code */ #else /* no v4, v6 only */ ip6_pcbdisconnect(inp); #endif /* end v6 only */ error = ENOBUFS; break; } soisconnecting(so); tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; tp->iss = tcp_iss; tcp_iss += (tcp_seq)(TCP_ISSINCR/2); tcp_sendseqinit(tp); error = tcp_output(tp); if (!error) TCP_MIB_INC(tcpActiveOpens); /* keep MIB stats */ break; /* * Create a TCP connection between two sockets. */ case PRU_CONNECT2: error = EOPNOTSUPP; break; /* * Initiate disconnect from peer. * If connection never passed embryonic stage, just drop; * else if don't need to let data drain, then can just drop anyways, * else have to begin TCP shutdown process: mark socket disconnecting, * drain unread data, state switch to reflect user close, and * send segment (e.g. FIN) to peer. Socket will be really disconnected * when peer sends FIN and acks ours. * * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. */ case PRU_DISCONNECT: SETTP(tp, tcp_disconnect(tp)); break; /* * Accept a connection. Essentially all the work is * done at higher levels; just return the address * of the peer, storing through addr. */ case PRU_ACCEPT: { struct sockaddr_in * sin = mtod(nam, struct sockaddr_in *); #ifdef IP_V6 struct sockaddr_in6 * sin6 = mtod(nam, struct sockaddr_in6 *); #endif #ifdef IP_V6 if (so->so_domain == AF_INET6) { nam->m_len = sizeof (struct sockaddr_in6); sin6->sin6_port = inp->inp_fport; sin6->sin6_family = AF_INET6; IP6CPY(&sin6->sin6_addr, &inp->ip6_faddr); } #endif #ifdef IP_V4 if (so->so_domain == AF_INET) { nam->m_len = sizeof (struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = inp->inp_fport; sin->sin_addr = inp->inp_faddr; } #endif if ( !(so->so_domain == AF_INET) && !(so->so_domain == AF_INET6) ) { dprintf("*** PRU_ACCEPT bad domain = %d\n", so->so_domain); dtrap(); } TCP_MIB_INC(tcpPassiveOpens); /* keep MIB stats */ break; } /* * Mark the connection as being incapable of further output. */ case PRU_SHUTDOWN: socantsendmore(so); tp = tcp_usrclosed(tp); if (tp) error = tcp_output(tp); break; /* * After a receive, possibly send window update to peer. */ case PRU_RCVD: (void) tcp_output(tp); break; /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. */ case PRU_SEND: if (so->so_pcb == NULL) { /* Return EPIPE error if socket is not connected */ error = EPIPE; break; } sbappend(&so->so_snd, m); error = tcp_output(tp); if (error == ENOBUFS) sbdropend(&so->so_snd,m); /* Remove data from socket buffer */ break; /* * Abort the TCP. */ case PRU_ABORT: SETTP(tp, tcp_drop(tp, ECONNABORTED)); break; case PRU_SENSE: /* ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; */ dtrap(); /* does this ever happen? */ return (0); case PRU_RCVOOB: if ((so->so_oobmark == 0 && (so->so_state & SS_RCVATMARK) == 0) || #ifdef SO_OOBINLINE so->so_options & SO_OOBINLINE || #endif tp->t_oobflags & TCPOOB_HADDATA) { error = EINVAL; break; } if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { error = EWOULDBLOCK; break; } m->m_len = 1; *mtod(m, char *) = tp->t_iobc; if ((MBUF2LONG(nam) & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); break; case PRU_SENDOOB: if (so->so_pcb == NULL) { /* Return EPIPE error if socket is not connected */ error = EPIPE; break; } if (sbspace(&so->so_snd) == 0) { m_freem(m); error = ENOBUFS; break; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappend(&so->so_snd, m); tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_force = 1; error = tcp_output(tp); if (error == ENOBUFS) sbdropend(&so->so_snd,m); /* Remove data from socket buffer */ tp->t_force = 0; break; case PRU_SOCKADDR: /* sockaddr and peeraddr have to switch based on IP type */ #ifdef IP_V4 #ifndef IP_V6 /* v4 only */ in_setsockaddr(inp, nam); #else /* dual mode */ if(so->so_domain == AF_INET6) ip6_setsockaddr(inp, nam); else in_setsockaddr(inp, nam); #endif /* dual mode */ #else /* IP_V6 */ ip6_setsockaddr(inp, nam); #endif break; case PRU_PEERADDR: #ifdef IP_V4 #ifndef IP_V6 /* v4 only */ in_setpeeraddr(inp, nam); #else /* dual mode */ if(so->so_domain == AF_INET6) ip6_setpeeraddr(inp, nam); else in_setpeeraddr(inp, nam); #endif /* dual mode */ #else /* IP_V6 */ ip6_setpeeraddr(inp, nam); #endif break; case PRU_SLOWTIMO: SETTP(tp, tcp_timers(tp, (int)MBUF2LONG(nam))); #ifdef DO_TCPTRACE req |= (long)nam << 8; /* for debug's sake */ #endif break; default: panic("tcp_usrreq"); } #ifdef DO_TCPTRACE if (tp && (so->so_options & SO_DEBUG)) tcp_trace("usrreq: state: %d, tcpcb: %x, req: %d", ostate, tp, req); #endif return (error); }
/* * active open (soconnect). * * State of affairs on entry: * soisconnecting (so_state |= SS_ISCONNECTING) * tcbinfo not locked (This has changed - used to be WLOCKed) * inp WLOCKed * tp->t_state = TCPS_SYN_SENT * rtalloc1, RT_UNLOCK on rt. */ int t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, struct sockaddr *nam) { struct adapter *sc = tod->tod_softc; struct tom_data *td = tod_td(tod); struct toepcb *toep = NULL; struct wrqe *wr = NULL; struct ifnet *rt_ifp = rt->rt_ifp; struct port_info *pi; int mtu_idx, rscale, qid_atid, rc, isipv6; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); int reason; INP_WLOCK_ASSERT(inp); KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, ("%s: dest addr %p has family %u", __func__, nam, nam->sa_family)); if (rt_ifp->if_type == IFT_ETHER) pi = rt_ifp->if_softc; else if (rt_ifp->if_type == IFT_L2VLAN) { struct ifnet *ifp = VLAN_COOKIE(rt_ifp); pi = ifp->if_softc; } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */ else DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); toep = alloc_toepcb(pi, -1, -1, M_NOWAIT); if (toep == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->tid = alloc_atid(sc, toep); if (toep->tid < 0) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->l2te = t4_l2t_get(pi, rt_ifp, rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam); if (toep->l2te == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); isipv6 = nam->sa_family == AF_INET6; wr = alloc_wrqe(isipv6 ? sizeof(struct cpl_act_open_req6) : sizeof(struct cpl_act_open_req), toep->ctrlq); if (wr == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) set_tcpddp_ulp_mode(toep); else toep->ulp_mode = ULP_MODE_NONE; SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); SOCKBUF_UNLOCK(&so->so_rcv); /* * The kernel sets request_r_scale based on sb_max whereas we need to * take hardware's MAX_RCV_WND into account too. This is normally a * no-op as MAX_RCV_WND is much larger than the default sb_max. */ if (tp->t_flags & TF_REQ_SCALE) rscale = tp->request_r_scale = select_rcv_wscale(); else rscale = 0; mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0); qid_atid = (toep->ofld_rxq->iq.abs_id << 14) | toep->tid; if (isipv6) { struct cpl_act_open_req6 *cpl = wrtod(wr); if ((inp->inp_vflag & INP_IPV6) == 0) { /* XXX think about this a bit more */ log(LOG_ERR, "%s: time to think about AF_INET6 + vflag 0x%x.\n", __func__, inp->inp_vflag); DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); } toep->ce = hold_lip(td, &inp->in6p_laddr); if (toep->ce == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOENT); INIT_TP_WR(cpl, 0); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6, qid_atid)); cpl->local_port = inp->inp_lport; cpl->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; cpl->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; cpl->peer_port = inp->inp_fport; cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_opt0(so, pi, toep->l2te, mtu_idx, rscale, toep->rx_credits, toep->ulp_mode); cpl->params = select_ntuple(pi, toep->l2te, sc->filter_mode); cpl->opt2 = calc_opt2a(so, toep); } else { struct cpl_act_open_req *cpl = wrtod(wr); INIT_TP_WR(cpl, 0); OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, qid_atid)); inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_opt0(so, pi, toep->l2te, mtu_idx, rscale, toep->rx_credits, toep->ulp_mode); cpl->params = select_ntuple(pi, toep->l2te, sc->filter_mode); cpl->opt2 = calc_opt2a(so, toep); } CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__, toep->tid, tcpstates[tp->t_state], toep, inp); offload_socket(so, toep); rc = t4_l2t_send(sc, wr, toep->l2te); if (rc == 0) { toep->flags |= TPF_CPL_PENDING; return (0); } undo_offload_socket(so); reason = __LINE__; failed: CTR3(KTR_CXGBE, "%s: not offloading (%d), rc %d", __func__, reason, rc); if (wr) free_wrqe(wr); if (toep) { if (toep->tid >= 0) free_atid(sc, toep->tid); if (toep->l2te) t4_l2t_release(toep->l2te); if (toep->ce) release_lip(td, toep->ce); free_toepcb(toep); } return (rc); }