int
ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
{
	struct socket *sock = conn->ksnc_sock;
	int	    len;
	int	    rc;

	rc = ksocknal_connsock_addref(conn);
	if (rc != 0) {
		LASSERT (conn->ksnc_closing);
		*txmem = *rxmem = *nagle = 0;
		return -ESHUTDOWN;
	}

	rc = libcfs_sock_getbuf(sock, txmem, rxmem);
	if (rc == 0) {
		len = sizeof(*nagle);
		rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY,
					   (char *)nagle, &len);
	}

	ksocknal_connsock_decref(conn);

	if (rc == 0)
		*nagle = !*nagle;
	else
		*txmem = *rxmem = *nagle = 0;

	return rc;
}
示例#2
0
void
ksocknal_lib_push_conn (ksock_conn_t *conn)
{
	struct sock    *sk;
	struct tcp_sock *tp;
	int	     nonagle;
	int	     val = 1;
	int	     rc;
	mm_segment_t    oldmm;

	rc = ksocknal_connsock_addref(conn);
	if (rc != 0)			    /* being shut down */
		return;

	sk = conn->ksnc_sock->sk;
	tp = tcp_sk(sk);

	lock_sock (sk);
	nonagle = tp->nonagle;
	tp->nonagle = 1;
	release_sock (sk);

	oldmm = get_fs ();
	set_fs (KERNEL_DS);

	rc = sk->sk_prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
				      (char *)&val, sizeof (val));
	LASSERT (rc == 0);

	set_fs (oldmm);

	lock_sock (sk);
	tp->nonagle = nonagle;
	release_sock (sk);

	ksocknal_connsock_decref(conn);
}
示例#3
0
int
ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
{ 
#if SOCKNAL_SINGLE_FRAG_TX 
        struct iovec    scratch; 
        struct iovec   *scratchiov = &scratch; 
        unsigned int    niov = 1;
#else 
        struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; 
        unsigned int    niov = tx->tx_niov;
#endif
        struct socket *sock = conn->ksnc_sock;
        int            nob;
        int            rc;
        int            i;
        struct uio  suio = {
                .uio_iov        = scratchiov,
                .uio_iovcnt     = niov,
                .uio_offset     = 0,
                .uio_resid      = 0,            /* This will be valued after a while */
                .uio_segflg     = UIO_SYSSPACE,
                .uio_rw         = UIO_WRITE,
                .uio_procp      = NULL
        };
        int  flags = MSG_DONTWAIT;
        CFS_DECL_NET_DATA;

        for (nob = i = 0; i < niov; i++) { 
                scratchiov[i] = tx->tx_iov[i]; 
                nob += scratchiov[i].iov_len; 
        }
        suio.uio_resid = nob;

        CFS_NET_IN;
        rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, flags);
        CFS_NET_EX; 

        /* NB there is no return value can indicate how many 
         * have been sent and how many resid, we have to get 
         * sent bytes from suio. */
        if (rc != 0) {
                if (suio.uio_resid != nob &&\
                    (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
                        /* We have sent something */
                        rc = nob - suio.uio_resid;
                else if ( rc == EWOULDBLOCK ) 
                        /* Actually, EAGAIN and EWOULDBLOCK have same value in OSX */
                        rc = -EAGAIN;   
                else 
                        rc = -rc;
        } else  /* rc == 0 */
                rc = nob - suio.uio_resid;

        return rc;
}

int
ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
{
#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK 
        struct iovec  scratch; 
        struct iovec *scratchiov = &scratch; 
        unsigned int  niov = 1;
#else
        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        unsigned int  niov = tx->tx_nkiov;
#endif
        struct socket *sock = conn->ksnc_sock;
        lnet_kiov_t    *kiov = tx->tx_kiov;
        int            nob;
        int            rc;
        int            i;
        struct  uio suio = {
                .uio_iov        = scratchiov,
                .uio_iovcnt     = niov,
                .uio_offset     = 0, 
                .uio_resid      = 0,    /* It should be valued after a while */
                .uio_segflg     = UIO_SYSSPACE,
                .uio_rw         = UIO_WRITE,
                .uio_procp      = NULL
        };
        int  flags = MSG_DONTWAIT;
        CFS_DECL_NET_DATA; 
        
        for (nob = i = 0; i < niov; i++) { 
                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + 
                                         kiov[i].kiov_offset; 
                nob += scratchiov[i].iov_len = kiov[i].kiov_len; 
        }
        suio.uio_resid = nob;

        CFS_NET_IN;
        rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, flags);
        CFS_NET_EX;

        for (i = 0; i < niov; i++) 
                cfs_kunmap(kiov[i].kiov_page);

        if (rc != 0) {
                if (suio.uio_resid != nob &&\
                    (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
                        /* We have sent something */
                        rc = nob - suio.uio_resid; 
                else if ( rc == EWOULDBLOCK ) 
                        /* EAGAIN and EWOULD BLOCK have same value in OSX */
                        rc = -EAGAIN;   
                else 
                        rc = -rc;
        } else  /* rc == 0 */
                rc = nob - suio.uio_resid;

        return rc;
}

/*
 * liang: Hack of inpcb and tcpcb.
 * To get tcpcb of a socket, and call tcp_output
 * to send quick ack.
 */
struct ks_tseg_qent{
        int foo;
};

struct ks_tcptemp{
        int foo;
};

LIST_HEAD(ks_tsegqe_head, ks_tseg_qent);

struct ks_tcpcb {
        struct ks_tsegqe_head t_segq;
        int     t_dupacks;
        struct ks_tcptemp *unused;
        int    t_timer[4];
        struct inpcb *t_inpcb;
        int    t_state;
        u_int  t_flags;
        /*
         * There are more fields but we dont need
         * ......
         */
};

#define TF_ACKNOW       0x00001
#define TF_DELACK       0x00002

struct ks_inpcb {
        LIST_ENTRY(ks_inpcb) inp_hash;
        struct  in_addr reserved1;
        struct  in_addr reserved2;
        u_short inp_fport;
        u_short inp_lport;
        LIST_ENTRY(inpcb) inp_list;
        caddr_t inp_ppcb;
        /*
         * There are more fields but we dont need
         * ......
         */
};

#define ks_sotoinpcb(so)   ((struct ks_inpcb *)(so)->so_pcb)
#define ks_intotcpcb(ip)   ((struct ks_tcpcb *)(ip)->inp_ppcb)
#define ks_sototcpcb(so)   (intotcpcb(sotoinpcb(so)))

void
ksocknal_lib_eager_ack (ksock_conn_t *conn)
{
        struct socket *sock = conn->ksnc_sock;
        struct ks_inpcb  *inp = ks_sotoinpcb(sock);
        struct ks_tcpcb  *tp = ks_intotcpcb(inp);
        int s;
        CFS_DECL_NET_DATA;

        extern int tcp_output(register struct ks_tcpcb *tp);

        CFS_NET_IN;
        s = splnet();

        /*
         * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo
         * to send immediate ACK. 
         */
        if (tp && tp->t_flags & TF_DELACK){
                tp->t_flags &= ~TF_DELACK;
                tp->t_flags |= TF_ACKNOW;
                (void) tcp_output(tp);
        }
        splx(s);

        CFS_NET_EX;

        return;
}

int
ksocknal_lib_recv_iov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX 
        struct iovec  scratch; 
        struct iovec *scratchiov = &scratch; 
        unsigned int  niov = 1;
#else 
        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        unsigned int  niov = conn->ksnc_rx_niov;
#endif
        struct iovec *iov = conn->ksnc_rx_iov;
        int          nob;
        int          rc;
        int          i;
        struct uio  ruio = {
                .uio_iov        = scratchiov,
                .uio_iovcnt     = niov,
                .uio_offset     = 0,
                .uio_resid      = 0,    /* It should be valued after a while */
                .uio_segflg     = UIO_SYSSPACE,
                .uio_rw         = UIO_READ,
                .uio_procp      = NULL
        };
        int         flags = MSG_DONTWAIT;
        CFS_DECL_NET_DATA;

        for (nob = i = 0; i < niov; i++) { 
                scratchiov[i] = iov[i]; 
                nob += scratchiov[i].iov_len; 
        } 
        LASSERT (nob <= conn->ksnc_rx_nob_wanted);

        ruio.uio_resid = nob;

        CFS_NET_IN;
        rc = soreceive(conn->ksnc_sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, &flags);
        CFS_NET_EX;
        if (rc){
                if (ruio.uio_resid != nob && \
                    (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK || rc == EAGAIN))
                        /* data particially received */
                        rc = nob - ruio.uio_resid; 
                else if (rc == EWOULDBLOCK) 
                        /* EAGAIN and EWOULD BLOCK have same value in OSX */
                        rc = -EAGAIN; 
                else
                        rc = -rc;
        } else 
                rc = nob - ruio.uio_resid;

        return (rc);
}

int
ksocknal_lib_recv_kiov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK 
        struct iovec  scratch; 
        struct iovec *scratchiov = &scratch; 
        unsigned int  niov = 1;
#else 
        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        unsigned int  niov = conn->ksnc_rx_nkiov;
#endif
        lnet_kiov_t    *kiov = conn->ksnc_rx_kiov;
        int           nob;
        int           rc;
        int           i;
        struct uio  ruio = {
                .uio_iov        = scratchiov,
                .uio_iovcnt     = niov,
                .uio_offset     = 0,
                .uio_resid      = 0,
                .uio_segflg     = UIO_SYSSPACE,
                .uio_rw         = UIO_READ,
                .uio_procp      = NULL
        };
        int         flags = MSG_DONTWAIT;
        CFS_DECL_NET_DATA;

        for (nob = i = 0; i < niov; i++) { 
                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; 
                nob += scratchiov[i].iov_len = kiov[i].kiov_len; 
        } 
        LASSERT (nob <= conn->ksnc_rx_nob_wanted);

        ruio.uio_resid = nob;

        CFS_NET_IN;
        rc = soreceive(conn->ksnc_sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, NULL, &flags);
        CFS_NET_EX;

        for (i = 0; i < niov; i++) 
                cfs_kunmap(kiov[i].kiov_page);

        if (rc){
                if (ruio.uio_resid != nob && \
                    (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
                        /* data particially received */
                        rc = nob - ruio.uio_resid; 
                else if (rc == EWOULDBLOCK)
                        /* receive blocked, EWOULDBLOCK == EAGAIN */ 
                        rc = -EAGAIN; 
                else
                        rc = -rc;
        } else
                rc = nob - ruio.uio_resid;

        return (rc);
}

int
ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
{
        struct socket *sock = conn->ksnc_sock;
        int            rc;

        rc = ksocknal_connsock_addref(conn);
        if (rc != 0) {
                LASSERT (conn->ksnc_closing);
                *txmem = *rxmem = *nagle = 0;
                return -ESHUTDOWN;
        }
        rc = libcfs_sock_getbuf(sock, txmem, rxmem);
        if (rc == 0) {
                struct sockopt  sopt;
                int            len;
                CFS_DECL_NET_DATA;

                len = sizeof(*nagle);
                bzero(&sopt, sizeof sopt);
                sopt.sopt_dir = SOPT_GET; 
                sopt.sopt_level = IPPROTO_TCP;
                sopt.sopt_name = TCP_NODELAY;
                sopt.sopt_val = nagle;
                sopt.sopt_valsize = len;

                CFS_NET_IN;
                rc = -sogetopt(sock, &sopt);
                CFS_NET_EX;
        }

        ksocknal_connsock_decref(conn);

        if (rc == 0)
                *nagle = !*nagle;
        else
                *txmem = *rxmem = *nagle = 0;
        return (rc);
}

int
ksocknal_lib_setup_sock (struct socket *so)
{
        struct sockopt  sopt;
        int             rc; 
        int             option; 
        int             keep_idle; 
        int             keep_intvl; 
        int             keep_count; 
        int             do_keepalive; 
        struct linger   linger;
        CFS_DECL_NET_DATA;

        rc = libcfs_sock_setbuf(so,
                                *ksocknal_tunables.ksnd_tx_buffer_size,
                                *ksocknal_tunables.ksnd_rx_buffer_size);
        if (rc != 0) {
                CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
                        *ksocknal_tunables.ksnd_tx_buffer_size,
                        *ksocknal_tunables.ksnd_rx_buffer_size, rc);
                return (rc);
        }

        /* Ensure this socket aborts active sends immediately when we close
         * it. */
        bzero(&sopt, sizeof sopt);

        linger.l_onoff = 0;
        linger.l_linger = 0;
        sopt.sopt_dir = SOPT_SET;
        sopt.sopt_level = SOL_SOCKET;
        sopt.sopt_name = SO_LINGER;
        sopt.sopt_val = &linger;
        sopt.sopt_valsize = sizeof(linger);

        CFS_NET_IN;
        rc = -sosetopt(so, &sopt);
        if (rc != 0) {
                CERROR ("Can't set SO_LINGER: %d\n", rc);
                goto out;
        }

        if (!*ksocknal_tunables.ksnd_nagle) { 
                option = 1; 
                bzero(&sopt, sizeof sopt);
                sopt.sopt_dir = SOPT_SET; 
                sopt.sopt_level = IPPROTO_TCP;
                sopt.sopt_name = TCP_NODELAY; 
                sopt.sopt_val = &option; 
                sopt.sopt_valsize = sizeof(option);
                rc = -sosetopt(so, &sopt);
                if (rc != 0) { 
                        CERROR ("Can't disable nagle: %d\n", rc); 
                        goto out;
                } 
        } 

        /* snapshot tunables */ 
        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle; 
        keep_count = *ksocknal_tunables.ksnd_keepalive_count; 
        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;

        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); 
        option = (do_keepalive ? 1 : 0); 
        bzero(&sopt, sizeof sopt); 
        sopt.sopt_dir = SOPT_SET; 
        sopt.sopt_level = SOL_SOCKET; 
        sopt.sopt_name = SO_KEEPALIVE; 
        sopt.sopt_val = &option; 
        sopt.sopt_valsize = sizeof(option); 
        rc = -sosetopt(so, &sopt); 
        if (rc != 0) { 
                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); 
                goto out; 
        }
        
        if (!do_keepalive) { 
                /* no more setting, just return */
                rc = 0;
                goto out;
        } 
        
        bzero(&sopt, sizeof sopt); 
        sopt.sopt_dir = SOPT_SET; 
        sopt.sopt_level = IPPROTO_TCP; 
        sopt.sopt_name = TCP_KEEPALIVE; 
        sopt.sopt_val = &keep_idle; 
        sopt.sopt_valsize = sizeof(keep_idle); 
        rc = -sosetopt(so, &sopt); 
        if (rc != 0) { 
                CERROR ("Can't set TCP_KEEPALIVE : %d\n", rc); 
                goto out; 
        }
out:
        CFS_NET_EX;
        return (rc);
}

void
ksocknal_lib_push_conn(ksock_conn_t *conn)
{ 
        struct socket   *sock; 
        struct sockopt  sopt; 
        int             val = 1; 
        int             rc; 
        CFS_DECL_NET_DATA; 
        
        rc = ksocknal_connsock_addref(conn); 
        if (rc != 0)            /* being shut down */ 
                return; 
        sock = conn->ksnc_sock; 
        bzero(&sopt, sizeof sopt); 
        sopt.sopt_dir = SOPT_SET; 
        sopt.sopt_level = IPPROTO_TCP; 
        sopt.sopt_name = TCP_NODELAY; 
        sopt.sopt_val = &val; 
        sopt.sopt_valsize = sizeof val; 

        CFS_NET_IN; 
        sosetopt(sock, &sopt); 
        CFS_NET_EX; 

        ksocknal_connsock_decref(conn);
        return;
}


extern void ksocknal_read_callback (ksock_conn_t *conn);
extern void ksocknal_write_callback (ksock_conn_t *conn);

static void
ksocknal_upcall(struct socket *so, caddr_t arg, int waitf)
{
        ksock_conn_t  *conn = (ksock_conn_t *)arg;
        ENTRY;

        read_lock (&ksocknal_data.ksnd_global_lock);
        if (conn == NULL)
                goto out;

        if (so->so_rcv.sb_flags & SB_UPCALL) {
                extern int soreadable(struct socket *so);
                if (conn->ksnc_rx_nob_wanted && soreadable(so))
                        /* To verify whether the upcall is for receive */
                        ksocknal_read_callback (conn);
        }
        /* go foward? */
        if (so->so_snd.sb_flags & SB_UPCALL){
                extern int sowriteable(struct socket *so);
                if (sowriteable(so))
                        /* socket is writable */
                        ksocknal_write_callback(conn);
        }
out:
        read_unlock (&ksocknal_data.ksnd_global_lock);

        EXIT;
}

void
ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
{ 
        /* No callback need to save in osx */
        return;
}

void
ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
{ 
        CFS_DECL_NET_DATA;

        CFS_NET_IN;
        sock->so_upcallarg = (void *)conn;
        sock->so_upcall = ksocknal_upcall; 
        sock->so_snd.sb_timeo = 0; 
        sock->so_rcv.sb_timeo = cfs_time_seconds(2);
        sock->so_rcv.sb_flags |= SB_UPCALL; 
        sock->so_snd.sb_flags |= SB_UPCALL; 
        CFS_NET_EX;
        return;
}

void
ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn)
{
        CFS_DECL_NET_DATA;

        CFS_NET_IN;
        ksocknal_upcall (sock, (void *)conn, 0);
        CFS_NET_EX;
}
示例#4
0
int
ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
{
        socket_t        sock = C2B_SOCK(conn->ksnc_sock);
        size_t          sndlen;
        int             nob;
        int             rc;

#if SOCKNAL_SINGLE_FRAG_TX
        struct iovec    scratch;
        struct iovec   *scratchiov = &scratch;
        unsigned int    niov = 1;
#else
        struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        unsigned int    niov = tx->tx_niov;
#endif
        struct msghdr msg = {
                .msg_name       = NULL,
                .msg_namelen    = 0,
                .msg_iov        = scratchiov,
                .msg_iovlen     = niov,
                .msg_control    = NULL,
                .msg_controllen = 0,
                .msg_flags      = MSG_DONTWAIT
        };
        
        int  i;
        
        for (nob = i = 0; i < niov; i++) {
                scratchiov[i] = tx->tx_iov[i];
                nob += scratchiov[i].iov_len;
        } 
        
        /* 
         * XXX Liang:
         * Linux has MSG_MORE, do we have anything to
         * reduce number of partial TCP segments sent?
         */
        rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
        if (rc == 0)
                rc = sndlen;
        return rc;
}

int
ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
{
        socket_t       sock = C2B_SOCK(conn->ksnc_sock);
        lnet_kiov_t   *kiov = tx->tx_kiov;
        int            rc;
        int            nob;
        size_t         sndlen;

#if SOCKNAL_SINGLE_FRAG_TX
        struct iovec  scratch;
        struct iovec *scratchiov = &scratch;
        unsigned int  niov = 1;
#else
        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        unsigned int  niov = tx->tx_nkiov;
#endif
        struct msghdr msg = {
                .msg_name       = NULL,
                .msg_namelen    = 0,
                .msg_iov        = scratchiov,
                .msg_iovlen     = niov,
                .msg_control    = NULL,
                .msg_controllen = 0,
                .msg_flags      = MSG_DONTWAIT
        };
        
        int           i;
        
        for (nob = i = 0; i < niov; i++) {
                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) +
                                         kiov[i].kiov_offset;
                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
        }

        /* 
         * XXX Liang:
         * Linux has MSG_MORE, do wen have anyting to
         * reduce number of partial TCP segments sent?
         */
        rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
        for (i = 0; i < niov; i++)
                cfs_kunmap(kiov[i].kiov_page);
        if (rc == 0)
                rc = sndlen;
        return rc;
}

int
ksocknal_lib_recv_iov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX
        struct iovec  scratch;
        struct iovec *scratchiov = &scratch;
        unsigned int  niov = 1;
#else
        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        unsigned int  niov = conn->ksnc_rx_niov;
#endif
        struct iovec *iov = conn->ksnc_rx_iov;
        struct msghdr msg = {
                .msg_name       = NULL,
                .msg_namelen    = 0,
                .msg_iov        = scratchiov,
                .msg_iovlen     = niov,
                .msg_control    = NULL,
                .msg_controllen = 0,
                .msg_flags      = 0
        };
        size_t       rcvlen;
        int          nob;
        int          i;
        int          rc;

        LASSERT (niov > 0);

        for (nob = i = 0; i < niov; i++) {
                scratchiov[i] = iov[i];
                nob += scratchiov[i].iov_len;
        }
        LASSERT (nob <= conn->ksnc_rx_nob_wanted); 
        rc = -sock_receive (C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen);
        if (rc == 0)
                rc = rcvlen;

        return rc;
}

int
ksocknal_lib_recv_kiov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX
        struct iovec  scratch;
        struct iovec *scratchiov = &scratch;
        unsigned int  niov = 1;
#else
        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        unsigned int  niov = conn->ksnc_rx_nkiov;
#endif
        lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
        struct msghdr msg = {
                .msg_name       = NULL,
                .msg_namelen    = 0,
                .msg_iov        = scratchiov,
                .msg_iovlen     = niov,
                .msg_control    = NULL,
                .msg_controllen = 0,
                .msg_flags      = 0
        };
        int          nob;
        int          i;
        size_t       rcvlen;
        int          rc;

        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */
        for (nob = i = 0; i < niov; i++) {
                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + \
                                         kiov[i].kiov_offset;
                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
        }
        LASSERT (nob <= conn->ksnc_rx_nob_wanted);
        rc = -sock_receive(C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen); 
        for (i = 0; i < niov; i++)
                cfs_kunmap(kiov[i].kiov_page); 
        if (rc == 0)
                rc = rcvlen;
        return (rc);
}

void
ksocknal_lib_eager_ack (ksock_conn_t *conn)
{
        /* XXX Liang: */
}

int
ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
{
        socket_t       sock = C2B_SOCK(conn->ksnc_sock);
        int            len;
        int            rc;

        rc = ksocknal_connsock_addref(conn);
        if (rc != 0) {
                LASSERT (conn->ksnc_closing);
                *txmem = *rxmem = *nagle = 0;
                return (-ESHUTDOWN);
        }
        rc = libcfs_sock_getbuf(conn->ksnc_sock, txmem, rxmem);
        if (rc == 0) {
                len = sizeof(*nagle);
                rc = -sock_getsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
                                      nagle, &len);
        }
        ksocknal_connsock_decref(conn);

        if (rc == 0)
                *nagle = !*nagle;
        else
                *txmem = *rxmem = *nagle = 0;

        return (rc);
}

int
ksocknal_lib_setup_sock (cfs_socket_t *sock)
{
        int             rc; 
        int             option; 
        int             keep_idle; 
        int             keep_intvl; 
        int             keep_count; 
        int             do_keepalive; 
        socket_t        so = C2B_SOCK(sock);
        struct linger   linger;

        /* Ensure this socket aborts active sends immediately when we close
         * it. */
        linger.l_onoff = 0;
        linger.l_linger = 0;
        rc = -sock_setsockopt(so, SOL_SOCKET, SO_LINGER, &linger, sizeof(linger));
        if (rc != 0) {
                CERROR ("Can't set SO_LINGER: %d\n", rc);
                return (rc);
        }

        if (!*ksocknal_tunables.ksnd_nagle) { 
                option = 1; 
                rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &option, sizeof(option));
                if (rc != 0) { 
                        CERROR ("Can't disable nagle: %d\n", rc); 
                        return (rc);
                } 
        } 

        rc = libcfs_sock_setbuf(sock,
                                *ksocknal_tunables.ksnd_tx_buffer_size,
                                *ksocknal_tunables.ksnd_rx_buffer_size);
        if (rc != 0) {
                CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
                        *ksocknal_tunables.ksnd_tx_buffer_size,
                        *ksocknal_tunables.ksnd_rx_buffer_size, rc);
                return (rc);
        }

        /* snapshot tunables */ 
        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle; 
        keep_count = *ksocknal_tunables.ksnd_keepalive_count; 
        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;

        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); 
        option = (do_keepalive ? 1 : 0); 

        rc = -sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &option, sizeof(option)); 
        if (rc != 0) { 
                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); 
                return (rc);
        }
        
        if (!do_keepalive)
                return (rc);
        rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_KEEPALIVE, 
                              &keep_idle, sizeof(keep_idle));
        
        return (rc);
}

void
ksocknal_lib_push_conn(ksock_conn_t *conn)
{ 
        socket_t        sock; 
        int             val = 1; 
        int             rc; 
        
        rc = ksocknal_connsock_addref(conn); 
        if (rc != 0)            /* being shut down */ 
                return; 
        sock = C2B_SOCK(conn->ksnc_sock); 

        rc = -sock_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)); 
        LASSERT(rc == 0);

        ksocknal_connsock_decref(conn);
        return;
}
示例#5
0
int
ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
{
        struct socket *sock = conn->ksnc_sock;
        int            nob;
        int            rc;

        if (*ksocknal_tunables.ksnd_enable_csum        && /* checksum enabled */
            conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
            tx->tx_nob == tx->tx_resid                 && /* frist sending    */
            tx->tx_msg.ksm_csum == 0)                     /* not checksummed  */
                ksocknal_lib_csum_tx(tx);

        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */

        {
#if SOCKNAL_SINGLE_FRAG_TX
                struct iovec    scratch;
                struct iovec   *scratchiov = &scratch;
                unsigned int    niov = 1;
#else
                struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
                unsigned int    niov = tx->tx_niov;
#endif
                struct msghdr msg = {
                        .msg_name       = NULL,
                        .msg_namelen    = 0,
                        .msg_iov        = scratchiov,
                        .msg_iovlen     = niov,
                        .msg_control    = NULL,
                        .msg_controllen = 0,
                        .msg_flags      = MSG_DONTWAIT
                };
                mm_segment_t oldmm = get_fs();
                int  i;

                for (nob = i = 0; i < niov; i++) {
                        scratchiov[i] = tx->tx_iov[i];
                        nob += scratchiov[i].iov_len;
                }

                if (!cfs_list_empty(&conn->ksnc_tx_queue) ||
                    nob < tx->tx_resid)
                        msg.msg_flags |= MSG_MORE;

                set_fs (KERNEL_DS);
                rc = sock_sendmsg(sock, &msg, nob);
                set_fs (oldmm);
        }
        return rc;
}

int
ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
{
        struct socket *sock = conn->ksnc_sock;
        lnet_kiov_t   *kiov = tx->tx_kiov;
        int            rc;
        int            nob;

        /* Not NOOP message */
        LASSERT (tx->tx_lnetmsg != NULL);

        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */
        if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
                /* Zero copy is enabled */
                struct sock   *sk = sock->sk;
                struct page   *page = kiov->kiov_page;
                int            offset = kiov->kiov_offset;
                int            fragsize = kiov->kiov_len;
                int            msgflg = MSG_DONTWAIT;

                CDEBUG(D_NET, "page %p + offset %x for %d\n",
                               page, offset, kiov->kiov_len);

                if (!cfs_list_empty(&conn->ksnc_tx_queue) ||
                    fragsize < tx->tx_resid)
                        msgflg |= MSG_MORE;

                if (sk->sk_prot->sendpage != NULL) {
                        rc = sk->sk_prot->sendpage(sk, page,
                                                   offset, fragsize, msgflg);
                } else {
                        rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
                                              msgflg);
                }
        } else {
#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
                struct iovec  scratch;
                struct iovec *scratchiov = &scratch;
                unsigned int  niov = 1;
#else
#ifdef CONFIG_HIGHMEM
#warning "XXX risk of kmap deadlock on multiple frags..."
#endif
                struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
                unsigned int  niov = tx->tx_nkiov;
#endif
                struct msghdr msg = {
                        .msg_name       = NULL,
                        .msg_namelen    = 0,
                        .msg_iov        = scratchiov,
                        .msg_iovlen     = niov,
                        .msg_control    = NULL,
                        .msg_controllen = 0,
                        .msg_flags      = MSG_DONTWAIT
                };
                mm_segment_t  oldmm = get_fs();
                int           i;

                for (nob = i = 0; i < niov; i++) {
                        scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
                                                 kiov[i].kiov_offset;
                        nob += scratchiov[i].iov_len = kiov[i].kiov_len;
                }

                if (!cfs_list_empty(&conn->ksnc_tx_queue) ||
                    nob < tx->tx_resid)
                        msg.msg_flags |= MSG_MORE;

                set_fs (KERNEL_DS);
                rc = sock_sendmsg(sock, &msg, nob);
                set_fs (oldmm);

                for (i = 0; i < niov; i++)
                        kunmap(kiov[i].kiov_page);
        }
        return rc;
}

void
ksocknal_lib_eager_ack (ksock_conn_t *conn)
{
        int            opt = 1;
        mm_segment_t   oldmm = get_fs();
        struct socket *sock = conn->ksnc_sock;

        /* Remind the socket to ACK eagerly.  If I don't, the socket might
         * think I'm about to send something it could piggy-back the ACK
         * on, introducing delay in completing zero-copy sends in my
         * peer. */

        set_fs(KERNEL_DS);
        sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
                               (char *)&opt, sizeof (opt));
        set_fs(oldmm);
}

int
ksocknal_lib_recv_iov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX
        struct iovec  scratch;
        struct iovec *scratchiov = &scratch;
        unsigned int  niov = 1;
#else
        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        unsigned int  niov = conn->ksnc_rx_niov;
#endif
        struct iovec *iov = conn->ksnc_rx_iov;
        struct msghdr msg = {
                .msg_name       = NULL,
                .msg_namelen    = 0,
                .msg_iov        = scratchiov,
                .msg_iovlen     = niov,
                .msg_control    = NULL,
                .msg_controllen = 0,
                .msg_flags      = 0
        };
        mm_segment_t oldmm = get_fs();
        int          nob;
        int          i;
        int          rc;
        int          fragnob;
        int          sum;
        __u32        saved_csum;

        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */
        LASSERT (niov > 0);

        for (nob = i = 0; i < niov; i++) {
                scratchiov[i] = iov[i];
                nob += scratchiov[i].iov_len;
        }
        LASSERT (nob <= conn->ksnc_rx_nob_wanted);

        set_fs (KERNEL_DS);
        rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
        /* NB this is just a boolean..........................^ */
        set_fs (oldmm);

        saved_csum = 0;
        if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
                saved_csum = conn->ksnc_msg.ksm_csum;
                conn->ksnc_msg.ksm_csum = 0;
        }

        if (saved_csum != 0) {
                /* accumulate checksum */
                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
                        LASSERT (i < niov);

                        fragnob = iov[i].iov_len;
                        if (fragnob > sum)
                                fragnob = sum;

                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
                                                           iov[i].iov_base, fragnob);
                }
                conn->ksnc_msg.ksm_csum = saved_csum;
        }

        return rc;
}

static void
ksocknal_lib_kiov_vunmap(void *addr)
{
        if (addr == NULL)
                return;

        vunmap(addr);
}

static void *
ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
                       struct iovec *iov, struct page **pages)
{
        void             *addr;
        int               nob;
        int               i;

        if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL)
                return NULL;

        LASSERT (niov <= LNET_MAX_IOV);

        if (niov < 2 ||
            niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags)
                return NULL;

        for (nob = i = 0; i < niov; i++) {
                if ((kiov[i].kiov_offset != 0 && i > 0) ||
                    (kiov[i].kiov_offset + kiov[i].kiov_len != CFS_PAGE_SIZE && i < niov - 1))
                        return NULL;

                pages[i] = kiov[i].kiov_page;
                nob += kiov[i].kiov_len;
        }

        addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
        if (addr == NULL)
                return NULL;

        iov->iov_base = addr + kiov[0].kiov_offset;
        iov->iov_len = nob;

        return addr;
}

int
ksocknal_lib_recv_kiov (ksock_conn_t *conn)
{
#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
        struct iovec   scratch;
        struct iovec  *scratchiov = &scratch;
        struct page  **pages      = NULL;
        unsigned int   niov       = 1;
#else
#ifdef CONFIG_HIGHMEM
#warning "XXX risk of kmap deadlock on multiple frags..."
#endif
        struct iovec  *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
        struct page  **pages      = conn->ksnc_scheduler->kss_rx_scratch_pgs;
        unsigned int   niov       = conn->ksnc_rx_nkiov;
#endif
        lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
        struct msghdr msg = {
                .msg_name       = NULL,
                .msg_namelen    = 0,
                .msg_iov        = scratchiov,
                .msg_control    = NULL,
                .msg_controllen = 0,
                .msg_flags      = 0
        };
        mm_segment_t oldmm = get_fs();
        int          nob;
        int          i;
        int          rc;
        void        *base;
        void        *addr;
        int          sum;
        int          fragnob;

        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */
        if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
                nob = scratchiov[0].iov_len;
                msg.msg_iovlen = 1;

        } else {
                for (nob = i = 0; i < niov; i++) {
                        nob += scratchiov[i].iov_len = kiov[i].kiov_len;
                        scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
                                                 kiov[i].kiov_offset;
                }
                msg.msg_iovlen = niov;
        }

        LASSERT (nob <= conn->ksnc_rx_nob_wanted);

        set_fs (KERNEL_DS);
        rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT);
        /* NB this is just a boolean.......................^ */
        set_fs (oldmm);

        if (conn->ksnc_msg.ksm_csum != 0) {
                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
                        LASSERT (i < niov);

                        /* Dang! have to kmap again because I have nowhere to stash the
                         * mapped address.  But by doing it while the page is still
                         * mapped, the kernel just bumps the map count and returns me
                         * the address it stashed. */
                        base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
                        fragnob = kiov[i].kiov_len;
                        if (fragnob > sum)
                                fragnob = sum;

                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
                                                           base, fragnob);

                        kunmap(kiov[i].kiov_page);
                }
        }

        if (addr != NULL) {
                ksocknal_lib_kiov_vunmap(addr);
        } else {
                for (i = 0; i < niov; i++)
                        kunmap(kiov[i].kiov_page);
        }

        return (rc);
}

void
ksocknal_lib_csum_tx(ksock_tx_t *tx)
{
        int          i;
        __u32        csum;
        void        *base;

        LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
        LASSERT(tx->tx_conn != NULL);
        LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);

        tx->tx_msg.ksm_csum = 0;

        csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
                             tx->tx_iov[0].iov_len);

        if (tx->tx_kiov != NULL) {
                for (i = 0; i < tx->tx_nkiov; i++) {
                        base = kmap(tx->tx_kiov[i].kiov_page) +
                               tx->tx_kiov[i].kiov_offset;

                        csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);

                        kunmap(tx->tx_kiov[i].kiov_page);
                }
        } else {
                for (i = 1; i < tx->tx_niov; i++)
                        csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
                                             tx->tx_iov[i].iov_len);
        }

        if (*ksocknal_tunables.ksnd_inject_csum_error) {
                csum++;
                *ksocknal_tunables.ksnd_inject_csum_error = 0;
        }

        tx->tx_msg.ksm_csum = csum;
}

int
ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
{
        mm_segment_t   oldmm = get_fs ();
        struct socket *sock = conn->ksnc_sock;
        int            len;
        int            rc;

        rc = ksocknal_connsock_addref(conn);
        if (rc != 0) {
                LASSERT (conn->ksnc_closing);
                *txmem = *rxmem = *nagle = 0;
                return (-ESHUTDOWN);
        }

        rc = libcfs_sock_getbuf(sock, txmem, rxmem);
        if (rc == 0) {
                len = sizeof(*nagle);
                set_fs(KERNEL_DS);
                rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
                                           (char *)nagle, &len);
                set_fs(oldmm);
        }

        ksocknal_connsock_decref(conn);

        if (rc == 0)
                *nagle = !*nagle;
        else
                *txmem = *rxmem = *nagle = 0;

        return (rc);
}