static void rds_ib_tasklet_fn_recv(unsigned long data)
{
	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
	struct rds_connection *conn = ic->conn;
	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
	struct rds_ib_ack_state state;

	if (!rds_ibdev)
		rds_conn_drop(conn);

	rds_ib_stats_inc(s_ib_tasklet_call);

	memset(&state, 0, sizeof(state));
	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);

	if (state.ack_next_valid)
		rds_ib_set_ack(ic, state.ack_next, state.ack_required);
	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
		rds_send_drop_acked(conn, state.ack_recv, NULL);
		ic->i_ack_recv = state.ack_recv;
	}

	if (rds_conn_up(conn))
		rds_ib_attempt_ack(ic);
}
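/*
 * The tasklet above follows the usual drain / re-arm / drain discipline:
 * completions that land on the CQ between the last empty poll and the
 * ib_req_notify_cq() call would not raise a new event, so the queue is
 * polled once more after arming.  A minimal sketch of that pattern using
 * the standard verbs calls; the handle_wc() callback and function name
 * are hypothetical and not part of RDS:
 */
static void drain_and_rearm_cq(struct ib_cq *cq,
			       void (*handle_wc)(struct ib_wc *wc))
{
	struct ib_wc wc;

	/* First pass: drain everything currently queued. */
	while (ib_poll_cq(cq, 1, &wc) > 0)
		handle_wc(&wc);

	/* Arm the CQ so the next completion raises an event. */
	ib_req_notify_cq(cq, IB_CQ_SOLICITED);

	/* Second pass: pick up anything that slipped in before the arm. */
	while (ib_poll_cq(cq, 1, &wc) > 0)
		handle_wc(&wc);
}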
void rds_tcp_state_change(struct sock *sk)
{
	void (*state_change)(struct sock *sk);
	struct rds_connection *conn;
	struct rds_tcp_connection *tc;

	read_lock_bh(&sk->sk_callback_lock);
	conn = sk->sk_user_data;
	if (!conn) {
		state_change = sk->sk_state_change;
		goto out;
	}
	tc = conn->c_transport_data;
	state_change = tc->t_orig_state_change;

	rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);

	switch (sk->sk_state) {
	/* ignore connecting sockets as they make progress */
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		break;
	case TCP_ESTABLISHED:
		rds_connect_path_complete(conn, RDS_CONN_CONNECTING);
		break;
	case TCP_CLOSE_WAIT:
	case TCP_CLOSE:
		rds_conn_drop(conn);
		/* fall through */
	default:
		break;
	}
out:
	read_unlock_bh(&sk->sk_callback_lock);
	state_change(sk);
}
void rds_hb_worker(struct work_struct *work)
{
	struct rds_connection *conn =
		container_of(work, struct rds_connection, c_hb_w.work);
	unsigned long now = get_seconds();
	int ret;

	if (!rds_conn_hb_timeout || conn->c_loopback)
		return;

	if (rds_conn_state(conn) == RDS_CONN_UP) {
		if (!conn->c_hb_start) {
			ret = rds_send_hb(conn, 0);
			if (ret) {
				rdsdebug("RDS/IB: rds_hb_worker: failed %d\n", ret);
				return;
			}
			conn->c_hb_start = now;
		} else if (now - conn->c_hb_start > rds_conn_hb_timeout) {
			printk(KERN_NOTICE
			       "RDS/IB: connection <%u.%u.%u.%u,%u.%u.%u.%u,%d> "
			       "timed out (0x%lx,0x%lx)..disconnecting and reconnecting\n",
			       NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr),
			       conn->c_tos, conn->c_hb_start, now);
			rds_conn_drop(conn);
			return;
		}
		queue_delayed_work(rds_wq, &conn->c_hb_w, HZ);
	}
}
static int rds_tcp_accept_one(struct socket *sock)
{
	struct socket *new_sock = NULL;
	struct rds_connection *conn;
	int ret;
	struct inet_sock *inet;

	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
			       sock->sk->sk_protocol, &new_sock);
	if (ret)
		goto out;

	new_sock->type = sock->type;
	new_sock->ops = sock->ops;
	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
	if (ret < 0)
		goto out;

	rds_tcp_tune(new_sock);

	inet = inet_sk(new_sock->sk);
	rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
		 &inet->inet_saddr, ntohs(inet->inet_sport),
		 &inet->inet_daddr, ntohs(inet->inet_dport));

	conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
			       &rds_tcp_transport, GFP_KERNEL);
	if (IS_ERR(conn)) {
		ret = PTR_ERR(conn);
		goto out;
	}

	/*
	 * see the comment above rds_queue_delayed_reconnect()
	 */
	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
		if (rds_conn_state(conn) == RDS_CONN_UP)
			rds_tcp_stats_inc(s_tcp_listen_closed_stale);
		else
			rds_tcp_stats_inc(s_tcp_connect_raced);
		rds_conn_drop(conn);
		ret = 0;
		goto out;
	}

	rds_tcp_set_callbacks(new_sock, conn);
	rds_connect_complete(conn);
	new_sock = NULL;
	ret = 0;

out:
	if (new_sock)
		sock_release(new_sock);
	return ret;
}
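/*
 * rds_conn_transition() above is what resolves the race between a locally
 * initiated connect and this incoming accept: only the path that wins the
 * DOWN -> CONNECTING transition continues, the loser drops the connection.
 * A minimal sketch of such a helper, assuming the state is kept in an
 * atomic_t (the helper name and the cmpxchg-based shape are illustrative
 * assumptions, not the RDS definition itself):
 */
static inline int conn_transition_sketch(atomic_t *state, int old, int new)
{
	/* cmpxchg returns the prior value; equality means we won the race. */
	return atomic_cmpxchg(state, old, new) == old;
}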
static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
{
	struct rds_connection *conn = data;
	struct rds_ib_connection *ic = conn->c_transport_data;

	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
		break;
	default:
		rdsdebug("Fatal QP Event %u "
			 "- connection %pI4->%pI4, reconnecting\n",
			 event->event, &conn->c_laddr, &conn->c_faddr);
		rds_conn_drop(conn);
		break;
	}
}
/* the core send_sem serializes this with other xmit and shutdown */
static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
{
	struct kvec vec = {
		.iov_base = data,
		.iov_len = len,
	};
	struct msghdr msg = {
		.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
	};

	return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
}

/* the core send_sem serializes this with other xmit and shutdown */
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
		 unsigned int hdr_off, unsigned int sg, unsigned int off)
{
	struct rds_tcp_connection *tc = conn->c_transport_data;
	int done = 0;
	int ret = 0;

	if (hdr_off == 0) {
		/*
		 * m_ack_seq is set to the sequence number of the last byte of
		 * header and data.  see rds_tcp_is_acked().
		 */
		tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
		rm->m_ack_seq = tc->t_last_sent_nxt +
				sizeof(struct rds_header) +
				be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
		smp_mb__before_clear_bit();
		set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
		tc->t_last_expected_una = rm->m_ack_seq + 1;

		rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
			 rm, rds_tcp_snd_nxt(tc),
			 (unsigned long long)rm->m_ack_seq);
	}

	if (hdr_off < sizeof(struct rds_header)) {
		/* see rds_tcp_write_space() */
		set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);

		ret = rds_tcp_sendmsg(tc->t_sock,
				      (void *)&rm->m_inc.i_hdr + hdr_off,
				      sizeof(rm->m_inc.i_hdr) - hdr_off);
		if (ret < 0)
			goto out;
		done += ret;
		if (hdr_off + done != sizeof(struct rds_header))
			goto out;
	}

	while (sg < rm->data.op_nents) {
		ret = tc->t_sock->ops->sendpage(tc->t_sock,
						sg_page(&rm->data.op_sg[sg]),
						rm->data.op_sg[sg].offset + off,
						rm->data.op_sg[sg].length - off,
						MSG_DONTWAIT|MSG_NOSIGNAL);
		rdsdebug("tcp sendpage %p:%u:%u ret %d\n",
			 (void *)sg_page(&rm->data.op_sg[sg]),
			 rm->data.op_sg[sg].offset + off,
			 rm->data.op_sg[sg].length - off, ret);
		if (ret <= 0)
			break;

		off += ret;
		done += ret;
		if (off == rm->data.op_sg[sg].length) {
			off = 0;
			sg++;
		}
	}

out:
	if (ret <= 0) {
		/* write_space will hit after EAGAIN, all else fatal */
		if (ret == -EAGAIN) {
			rds_tcp_stats_inc(s_tcp_sndbuf_full);
			ret = 0;
		} else {
			printk(KERN_WARNING "RDS/tcp: send to %pI4 "
			       "returned %d, disconnecting and reconnecting\n",
			       &conn->c_faddr, ret);
			rds_conn_drop(conn);
		}
	}
	if (done == 0)
		done = ret;
	return done;
}

/*
 * rm->m_ack_seq is set to the tcp sequence number that corresponds to the
 * last byte of the message, including the header.  This means that the
 * entire message has been received if rm->m_ack_seq is "before" the next
 * unacked byte of the TCP sequence space.  We have to do very careful
 * wrapping 32bit comparisons here.
 */
static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
{
	if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
		return 0;
	return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0;
}

void rds_tcp_write_space(struct sock *sk)
{
	void (*write_space)(struct sock *sk);
	struct rds_connection *conn;
	struct rds_tcp_connection *tc;

	read_lock_bh(&sk->sk_callback_lock);
	conn = sk->sk_user_data;
	if (!conn) {
		write_space = sk->sk_write_space;
		goto out;
	}

	tc = conn->c_transport_data;
	rdsdebug("write_space for tc %p\n", tc);
	write_space = tc->t_orig_write_space;
	rds_tcp_stats_inc(s_tcp_write_space_calls);

	rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
	tc->t_last_seen_una = rds_tcp_snd_una(tc);
	rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);

	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
		queue_delayed_work(rds_wq, &conn->c_send_w, 0);

out:
	read_unlock_bh(&sk->sk_callback_lock);

	/*
	 * write_space is only called when data leaves tcp's send queue if
	 * SOCK_NOSPACE is set.  We set SOCK_NOSPACE every time we put
	 * data in tcp's send queue because we use write_space to parse the
	 * sequence numbers and notice that rds messages have been fully
	 * received.
	 *
	 * tcp's write_space clears SOCK_NOSPACE if the send queue has more
	 * than a certain amount of space.  So we need to set it again *after*
	 * we call tcp's write_space or else we might only get called on the
	 * first of a series of incoming tcp acks.
	 */
	write_space(sk);

	if (sk->sk_socket)
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
}
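/*
 * The comparison in rds_tcp_is_acked() above is ordinary serial-number
 * arithmetic: casting the unsigned 32-bit difference to a signed value
 * makes "seq strictly precedes una" come out negative even across a
 * sequence-number wrap.  A minimal sketch of that check in isolation
 * (seq_before_sketch is a hypothetical name, not an RDS helper):
 */
static inline bool seq_before_sketch(u32 seq, u32 una)
{
	/* Negative signed difference means seq lies before una mod 2^32. */
	return (s32)(seq - una) < 0;
}

/*
 * For example, seq_before_sketch(0xfffffff0, 0x10) is true: the unsigned
 * difference is 0xffffffe0, which is negative as an s32, so a message whose
 * last byte was sent just before the wrap is still reported as acked once
 * snd_una has moved past the wrap.
 */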
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
			      struct rdma_cm_event *event)
{
	/* this can be null in the listening path */
	struct rds_connection *conn = cm_id->context;
	struct rds_transport *trans;
	int ret = 0;

	rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
		 event->event, rdma_event_msg(event->event));

	if (cm_id->device->node_type == RDMA_NODE_RNIC)
		trans = &rds_iw_transport;
	else
		trans = &rds_ib_transport;

	/* Prevent shutdown from tearing down the connection
	 * while we're executing. */
	if (conn) {
		mutex_lock(&conn->c_cm_lock);

		/* If the connection is being shut down, bail out
		 * right away.  We return 0 so cm_id doesn't get
		 * destroyed prematurely */
		if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
			/* Reject incoming connections while we're tearing
			 * down an existing one. */
			if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
				ret = 1;
			goto out;
		}
	}

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = trans->cm_handle_connect(cm_id, event);
		break;

	case RDMA_CM_EVENT_ADDR_RESOLVED:
		/* XXX do we need to clean up if this fails? */
		ret = rdma_resolve_route(cm_id, RDS_RDMA_RESOLVE_TIMEOUT_MS);
		break;

	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		/* Connection could have been dropped so make sure the
		 * cm_id is valid before proceeding */
		if (conn) {
			struct rds_ib_connection *ibic;

			ibic = conn->c_transport_data;
			if (ibic && ibic->i_cm_id == cm_id)
				ret = trans->cm_initiate_connect(cm_id);
			else
				rds_conn_drop(conn);
		}
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		trans->cm_connect_complete(conn, event);
		break;

	case RDMA_CM_EVENT_ADDR_ERROR:
	case RDMA_CM_EVENT_ROUTE_ERROR:
	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_REJECTED:
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
	case RDMA_CM_EVENT_ADDR_CHANGE:
		if (conn)
			rds_conn_drop(conn);
		break;

	case RDMA_CM_EVENT_DISCONNECTED:
		rdsdebug("DISCONNECT event - dropping connection "
			 "%pI4->%pI4\n", &conn->c_laddr, &conn->c_faddr);
		rds_conn_drop(conn);
		break;

	default:
		/* things like device disconnect? */
		printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
		       event->event, rdma_event_msg(event->event));
		break;
	}

out:
	if (conn)
		mutex_unlock(&conn->c_cm_lock);

	rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
		 rdma_event_msg(event->event), ret);

	return ret;
}
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
			      struct rdma_cm_event *event)
{
	/* this can be null in the listening path */
	struct rds_connection *conn = cm_id->context;
	struct rds_transport *trans;
	int ret = 0;

	rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
		 event->event);

	if (cm_id->device->node_type == RDMA_NODE_RNIC)
		trans = &rds_iw_transport;
	else
		trans = &rds_ib_transport;

	/* Prevent shutdown from tearing down the connection
	 * while we're executing. */
	if (conn) {
		mutex_lock(&conn->c_cm_lock);

		/* If the connection is being shut down, bail out
		 * right away.  We return 0 so cm_id doesn't get
		 * destroyed prematurely */
		if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
			/* Reject incoming connections while we're tearing
			 * down an existing one. */
			if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
				ret = 1;
			goto out;
		}
	}

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = trans->cm_handle_connect(cm_id, event);
		break;

	case RDMA_CM_EVENT_ADDR_RESOLVED:
		/* XXX do we need to clean up if this fails? */
		ret = rdma_resolve_route(cm_id, RDS_RDMA_RESOLVE_TIMEOUT_MS);
		break;

	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		/* XXX worry about racing with listen acceptance */
		ret = trans->cm_initiate_connect(cm_id);
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		trans->cm_connect_complete(conn, event);
		break;

	case RDMA_CM_EVENT_ADDR_ERROR:
	case RDMA_CM_EVENT_ROUTE_ERROR:
	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_REJECTED:
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
	case RDMA_CM_EVENT_ADDR_CHANGE:
		if (conn)
			rds_conn_drop(conn);
		break;

	case RDMA_CM_EVENT_DISCONNECTED:
		printk(KERN_WARNING
		       "RDS/RDMA: DISCONNECT event - dropping connection "
		       "%u.%u.%u.%u->%u.%u.%u.%u\n",
		       NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr));
		rds_conn_drop(conn);
		break;

	default:
		/* things like device disconnect? */
		printk(KERN_ERR "unknown event %u\n", event->event);
		BUG();
		break;
	}

out:
	if (conn) {
		//struct rds_iw_connection *ic = conn->c_transport_data;

		/* If we return non-zero, we must hang on to the cm_id */
		//BUG_ON(ic->i_cm_id == cm_id && ret);

		mutex_unlock(&conn->c_cm_lock);
	}

	rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);

	return ret;
}