Example 1
/*
 * This random exponential backoff is relied on to eventually resolve racing
 * connects.
 *
 * If connect attempts race then both parties drop both connections and come
 * here to wait for a random amount of time before trying again.  Eventually
 * the backoff range will be so much greater than the time it takes to
 * establish a connection that one of the pair will establish the connection
 * before the other's random delay fires.
 *
 * Connection attempts that arrive while a connection is already established
 * are also considered to be racing connects.  This lets a connection from
 * a rebooted machine replace an existing stale connection before the transport
 * notices that the connection has failed.
 *
 * We should *always* start with a random backoff; otherwise a broken connection
 * will always take several iterations to be re-established.
 */
void rds_queue_reconnect(struct rds_conn_path *cp)
{
	unsigned long rand;
	struct rds_connection *conn = cp->cp_conn;

	rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
	  conn, &conn->c_laddr, &conn->c_faddr,
	  cp->cp_reconnect_jiffies);

	/* let peer with smaller addr initiate reconnect, to avoid duels */
	if (conn->c_trans->t_type == RDS_TRANS_TCP &&
	    conn->c_laddr > conn->c_faddr)
		return;

	set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
	if (cp->cp_reconnect_jiffies == 0) {
		cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
		queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
		return;
	}

	get_random_bytes(&rand, sizeof(rand));
	rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
		 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
		 conn, &conn->c_laddr, &conn->c_faddr);
	queue_delayed_work(rds_wq, &cp->cp_conn_w,
			   rand % cp->cp_reconnect_jiffies);

	cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
					rds_sysctl_reconnect_max_jiffies);
}
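The backoff arithmetic above is easier to see in isolation: the first queued attempt goes out immediately, and every later attempt sleeps a uniformly random delay below a ceiling that doubles each time, capped at the sysctl maximum. Below is a minimal standalone userspace sketch (not kernel code; RECONNECT_MIN and RECONNECT_MAX are illustrative stand-ins for rds_sysctl_reconnect_min_jiffies and rds_sysctl_reconnect_max_jiffies):

#include <stdio.h>
#include <stdlib.h>

/* illustrative stand-ins for rds_sysctl_reconnect_{min,max}_jiffies */
#define RECONNECT_MIN   100UL	/* first backoff ceiling */
#define RECONNECT_MAX 10000UL	/* ceiling never grows past this */

int main(void)
{
	unsigned long ceiling = 0;

	for (int attempt = 1; attempt <= 8; attempt++) {
		unsigned long delay;

		if (ceiling == 0) {
			/* first attempt goes out immediately, like the
			 * cp_reconnect_jiffies == 0 branch above */
			delay = 0;
			ceiling = RECONNECT_MIN;
		} else {
			/* random delay in [0, ceiling), then double the
			 * ceiling, capped at the maximum */
			delay = (unsigned long)rand() % ceiling;
			ceiling = ceiling * 2 > RECONNECT_MAX ?
					RECONNECT_MAX : ceiling * 2;
		}
		printf("attempt %d: delay %lu (next ceiling %lu)\n",
		       attempt, delay, ceiling);
	}
	return 0;
}

As the ceiling grows, the chance that both peers pick delays close together shrinks, which is what lets one side win the race described in the comment.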
Example 2
static void rds_queue_reconnect(struct rds_connection *conn)
{
	unsigned long rand;

	rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
	  conn, &conn->c_laddr, &conn->c_faddr,
	  conn->c_reconnect_jiffies);

	set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
	if (conn->c_reconnect_jiffies == 0) {
		conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
		return;
	}

	get_random_bytes(&rand, sizeof(rand));
	rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
		 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
		 conn, &conn->c_laddr, &conn->c_faddr);
	queue_delayed_work(rds_wq, &conn->c_conn_w,
			   rand % conn->c_reconnect_jiffies);

	conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
					rds_sysctl_reconnect_max_jiffies);
}
Example 3
int rds_tcp_conn_connect(struct rds_connection *conn)
{
	struct socket *sock = NULL;
	struct sockaddr_in src, dest;
	int ret;
	struct rds_tcp_connection *tc = conn->c_transport_data;

	mutex_lock(&tc->t_conn_lock);

	if (rds_conn_up(conn)) {
		mutex_unlock(&tc->t_conn_lock);
		return 0;
	}
	ret = sock_create_kern(rds_conn_net(conn), PF_INET,
			       SOCK_STREAM, IPPROTO_TCP, &sock);
	if (ret < 0)
		goto out;

	rds_tcp_tune(sock);

	src.sin_family = AF_INET;
	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
	src.sin_port = (__force u16)htons(0);

	ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
	if (ret) {
		rdsdebug("bind failed with %d at address %pI4\n",
			 ret, &conn->c_laddr);
		goto out;
	}

	dest.sin_family = AF_INET;
	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
	dest.sin_port = (__force u16)htons(RDS_TCP_PORT);

	/*
	 * once we call connect() we can start getting callbacks and they
	 * own the socket
	 */
	rds_tcp_set_callbacks(sock, conn);
	ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
				 O_NONBLOCK);

	rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
	if (ret == -EINPROGRESS)
		ret = 0;
	if (ret == 0) {
		rds_tcp_keepalive(sock);
		sock = NULL;
	} else {
		rds_tcp_restore_callbacks(sock, conn->c_transport_data);
	}

out:
	mutex_unlock(&tc->t_conn_lock);
	if (sock)
		sock_release(sock);
	return ret;
}
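The -EINPROGRESS handling above is the usual non-blocking connect idiom: a non-blocking connect() normally returns before the handshake finishes and reports EINPROGRESS, and completion is delivered later (here, via the callbacks that now own the socket). A hedged userspace analogue follows; the destination is an RFC 5737 test address picked for illustration, and the port is RDS_TCP_PORT's value:

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dest;
	int fd, ret;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return 1;
	/* make the socket non-blocking, the userspace analogue of
	 * passing O_NONBLOCK to sock->ops->connect() above */
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);

	memset(&dest, 0, sizeof(dest));
	dest.sin_family = AF_INET;
	dest.sin_port = htons(16385);			/* RDS_TCP_PORT */
	inet_pton(AF_INET, "192.0.2.1", &dest.sin_addr);/* test address */

	ret = connect(fd, (struct sockaddr *)&dest, sizeof(dest));
	if (ret < 0 && errno == EINPROGRESS)
		printf("connect in progress; completion arrives later\n");
	else if (ret == 0)
		printf("connected immediately\n");
	else
		printf("connect failed: %s\n", strerror(errno));

	close(fd);
	return 0;
}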
Example 4
static void
rds_recv_forward(struct rds_connection *conn, struct rds_incoming *inc,
		 gfp_t gfp)
{
	int len, ret;
	struct rds_nf_hdr *dst, *org;
	struct rds_sock *rs;

	/* initialize some bits */
	rs = NULL;

	/* pull out the destination and original rds headers */
	dst = rds_nf_hdr_dst(inc->i_skb);
	org = rds_nf_hdr_org(inc->i_skb);

	/* find the proper output socket - it should be the local one on which we originated */
	rs = rds_find_bound(dst->saddr, dst->sport);
	if (!rs) {
		rdsdebug("failed to find output rds_socket dst %u.%u.%u.%u : %u, inc %p, conn %p\n",
			 NIPQUAD(dst->daddr), dst->dport, inc, conn);
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}

	/* pull out the actual message len */
	len = be32_to_cpu(inc->i_hdr.h_len);

	/* now lets see if we can send it all */
	ret = rds_send_internal(conn, rs, inc->i_skb, gfp);
	if (len != ret) {
		rdsdebug("failed to send rds_data dst %u.%u.%u.%u : %u, inc %p, conn %p, len %d != ret %d\n",
			 NIPQUAD(dst->daddr), dst->dport, inc, conn, len, ret);
		goto out;
	}

	if (NULL != rs)
		rds_sock_put(rs);

	/* all good so we are done */
	return;

out:
	/* cleanup any handles */
	if (NULL != rs)
		rds_sock_put(rs);

	/* on error lets take a shot at hook cleanup */
	NF_HOOK(PF_RDS_HOOK, NF_RDS_FORWARD_ERROR, inc->i_skb, NULL, NULL, rds_recv_ok);

	/* then hand the request off to normal local processing on the old connection */
	rds_recv_local(inc->i_oconn, org->saddr, org->daddr, inc, gfp);
}
Example 5
void rds_send_worker(struct work_struct *work)
{
	struct rds_conn_path *cp = container_of(work,
						struct rds_conn_path,
						cp_send_w.work);
	int ret;

	if (rds_conn_path_state(cp) == RDS_CONN_UP) {
		clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
		ret = rds_send_xmit(cp);
		cond_resched();
		rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
		switch (ret) {
		case -EAGAIN:
			rds_stats_inc(s_send_immediate_retry);
			queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
			break;
		case -ENOMEM:
			rds_stats_inc(s_send_delayed_retry);
			queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
		default:
			break;
		}
	}
}
Example 6
void rds_connect_worker(struct work_struct *work)
{
	struct rds_conn_path *cp = container_of(work,
						struct rds_conn_path,
						cp_conn_w.work);
	struct rds_connection *conn = cp->cp_conn;
	int ret;

	/* let peer with smaller addr initiate reconnect, to avoid duels */
	if (cp->cp_index > 0 && cp->cp_conn->c_laddr > cp->cp_conn->c_faddr)
		return;
	clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
	ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
	if (ret) {
		ret = conn->c_trans->conn_path_connect(cp);
		rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
			conn, &conn->c_laddr, &conn->c_faddr, ret);

		if (ret) {
			if (rds_conn_path_transition(cp,
						     RDS_CONN_CONNECTING,
						     RDS_CONN_DOWN))
				rds_queue_reconnect(cp);
			else
				rds_conn_path_error(cp, "connect failed\n");
		}
	}
}
Example 7
void rds_tcp_listen_data_ready(struct sock *sk)
{
	void (*ready)(struct sock *sk);

	rdsdebug("listen data ready sk %p\n", sk);

	read_lock(&sk->sk_callback_lock);
	ready = sk->sk_user_data;
	if (!ready) { /* check for teardown race */
		ready = sk->sk_data_ready;
		goto out;
	}

	/*
	 * ->sk_data_ready is also called for a newly established child socket
	 * before it has been accepted and the accepter has set up their
	 * data_ready.. we only want to queue listen work for our listening
	 * socket
	 */
	if (sk->sk_state == TCP_LISTEN)
		rds_tcp_accept_work(sk);

out:
	read_unlock(&sk->sk_callback_lock);
	ready(sk);
}
Example 8
void rds_connect_complete(struct rds_connection *conn)
{
	if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
		printk(KERN_WARNING "%s: Cannot transition to state UP"
				", current state is %d\n",
				__func__,
				atomic_read(&conn->c_state));
		atomic_set(&conn->c_state, RDS_CONN_ERROR);
		queue_work(rds_wq, &conn->c_down_w);
		return;
	}

	rdsdebug("conn %p for %pI4 to %pI4 complete\n",
	  conn, &conn->c_laddr, &conn->c_faddr);

	conn->c_reconnect_jiffies = 0;
	set_bit(0, &conn->c_map_queued);
	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
	queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
	queue_delayed_work(rds_wq, &conn->c_hb_w, 0);
	conn->c_hb_start = 0;

	conn->c_connection_start = get_seconds();
	conn->c_reconnect = 1;
	conn->c_committed_version = conn->c_version;
	conn->c_proposed_version = RDS_PROTOCOL_VERSION;
}
Example 9
void rds_hb_worker(struct work_struct *work)
{
	struct rds_connection *conn = container_of(work, struct rds_connection, c_hb_w.work);
	unsigned long now = get_seconds();
	int ret;

	if (!rds_conn_hb_timeout || conn->c_loopback)
		return;

	if (rds_conn_state(conn) == RDS_CONN_UP) {
		if (!conn->c_hb_start) {
			ret = rds_send_hb(conn, 0);
			if (ret) {
				rdsdebug("RDS/IB: rds_hb_worker: failed %d\n", ret);
				return;
			}
			conn->c_hb_start = now;
		} else if (now - conn->c_hb_start > rds_conn_hb_timeout) {
			printk(KERN_NOTICE
				"RDS/IB: connection <%u.%u.%u.%u,%u.%u.%u.%u,%d> "
				"timed out (0x%lx,0x%lx)..disconnecting and reconnecting\n",
				NIPQUAD(conn->c_laddr),
				NIPQUAD(conn->c_faddr), conn->c_tos,
				conn->c_hb_start, now);
			rds_conn_drop(conn);
			return;
		}
		queue_delayed_work(rds_wq, &conn->c_hb_w, HZ);
	}
}
Example 10
/*
 * Before killing the tcp socket this needs to serialize with callbacks.  The
 * caller has already grabbed the sending sem so we're serialized with other
 * senders.
 *
 * TCP calls the callbacks with the sock lock so we hold it while we reset the
 * callbacks to those set by TCP.  Our callbacks won't execute again once we
 * hold the sock lock.
 */
void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
{
	struct rds_tcp_connection *tc = cp->cp_transport_data;
	struct socket *sock = tc->t_sock;

	rdsdebug("shutting down conn %p tc %p sock %p\n",
		 cp->cp_conn, tc, sock);

	if (sock) {
		if (rds_destroy_pending(cp->cp_conn))
			rds_tcp_set_linger(sock);
		sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
		lock_sock(sock->sk);
		rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */

		release_sock(sock->sk);
		sock_release(sock);
	}

	if (tc->t_tinc) {
		rds_inc_put(&tc->t_tinc->ti_inc);
		tc->t_tinc = NULL;
	}
	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
	tc->t_tinc_data_rem = 0;
}
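The comment above relies on one invariant: the stack invokes the callbacks with the sock lock held, so whoever holds that lock can swap the callback pointers without racing a callback in flight (the original pointers are kept, like t_orig_state_change in the examples above). A standalone sketch of that save/swap/restore pattern, with a pthread mutex standing in for the sock lock and made-up names (toy_sock, set_callbacks, restore_callbacks), not the real rds_tcp_set_callbacks/rds_tcp_restore_callbacks internals:

#include <pthread.h>
#include <stdio.h>

/* toy stand-in for struct sock: a lock plus a single callback pointer */
struct toy_sock {
	pthread_mutex_t lock;			/* plays the role of the sock lock */
	void (*data_ready)(struct toy_sock *sk);
};

static void stack_data_ready(struct toy_sock *sk)
{
	printf("stack callback on %p\n", (void *)sk);
}

static void rds_style_data_ready(struct toy_sock *sk)
{
	printf("RDS callback on %p\n", (void *)sk);
}

/* saved original callback pointer */
static void (*saved_data_ready)(struct toy_sock *sk);

static void set_callbacks(struct toy_sock *sk)
{
	pthread_mutex_lock(&sk->lock);
	saved_data_ready = sk->data_ready;	/* remember the stack's hook */
	sk->data_ready = rds_style_data_ready;	/* install ours */
	pthread_mutex_unlock(&sk->lock);
}

static void restore_callbacks(struct toy_sock *sk)
{
	/* callbacks run with the lock held, so once we take it here none of
	 * our callbacks can be running or start running */
	pthread_mutex_lock(&sk->lock);
	sk->data_ready = saved_data_ready;
	pthread_mutex_unlock(&sk->lock);
}

int main(void)
{
	struct toy_sock sk = { PTHREAD_MUTEX_INITIALIZER, stack_data_ready };

	set_callbacks(&sk);
	sk.data_ready(&sk);		/* prints the RDS callback */
	restore_callbacks(&sk);
	sk.data_ready(&sk);		/* back to the stack callback */
	return 0;
}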
Example 11
/*
 * get_user_pages() called flush_dcache_page() on the pages for us.
 */
void rds_info_copy(struct rds_info_iterator *iter, void *data,
		   unsigned long bytes)
{
	unsigned long this;

	while (bytes) {
		if (iter->addr == NULL)
			iter->addr = kmap_atomic(*iter->pages, KM_USER0);

		this = min(bytes, PAGE_SIZE - iter->offset);

		rdsdebug("page %p addr %p offset %lu this %lu data %p "
			  "bytes %lu\n", *iter->pages, iter->addr,
			  iter->offset, this, data, bytes);

		memcpy(iter->addr + iter->offset, data, this);

		data += this;
		bytes -= this;
		iter->offset += this;

		if (iter->offset == PAGE_SIZE) {
			kunmap_atomic(iter->addr, KM_USER0);
			iter->addr = NULL;
			iter->offset = 0;
			iter->pages++;
		}
	}
}
Example 12
void rds_connect_worker(struct work_struct *work)
{
	struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
	int ret;

	clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
	if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
		/*
		 * record the time we started trying to connect so that we can
		 * drop the connection if it doesn't work out after a while
		 */
		conn->c_connection_start = get_seconds();

		ret = conn->c_trans->conn_connect(conn);
		rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
			conn, &conn->c_laddr, &conn->c_faddr, ret);

		if (ret) {
			if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) {
				rds_queue_reconnect(conn);
			} else
				rds_conn_error(conn, "RDS: connect failed\n");
		}
	}
}
Example 13
void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
{
	void (*ready)(struct sock *sk, int bytes);

	rdsdebug("listen data ready sk %p\n", sk);

	read_lock_bh(&sk->sk_callback_lock);
	ready = sk->sk_user_data;
	if (!ready) { /* check for teardown race */
		ready = sk->sk_data_ready;
		goto out;
	}

	/*
	 * ->sk_data_ready is also called for a newly established child socket
	 * before it has been accepted and the accepter has set up their
	 * data_ready.. we only want to queue listen work for our listening
	 * socket
	 */
	if (sk->sk_state == TCP_LISTEN)
		queue_work(rds_wq, &rds_tcp_listen_work);

out:
	read_unlock_bh(&sk->sk_callback_lock);
	ready(sk, bytes);
}
Example 14
static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
				  struct rds_cong_map *map,
				  int delta, __be16 port)
{
	int now_congested;

	if (delta == 0)
		return;

	rs->rs_rcv_bytes += delta;
	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);

	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
	  "now_cong %d delta %d\n",
	  rs, &rs->rs_bound_addr,
	  ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
	  rds_sk_rcvbuf(rs), now_congested, delta);

	/* wasn't -> am congested */
	if (!rs->rs_congested && now_congested) {
		rs->rs_congested = 1;
		rds_cong_set_bit(map, port);
		rds_cong_queue_updates(map);
	}
	/* was -> aren't congested */
	/* Require more free space before reporting uncongested to prevent
	   bouncing cong/uncong state too often */
	else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
		rs->rs_congested = 0;
		rds_cong_clear_bit(map, port);
		rds_cong_queue_updates(map);
	}

	/* do nothing if no change in cong state */
}
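The two thresholds above form a hysteresis band: the socket is marked congested once rs_rcv_bytes exceeds the receive buffer, but is only marked uncongested again after usage falls below half of it, so small oscillations around either threshold don't flap the congestion map. A small standalone illustration (the buffer size and the +/- byte deltas are made up):

#include <stdio.h>

#define RCVBUF 1000	/* illustrative stand-in for rds_sk_rcvbuf(rs) */

int main(void)
{
	int rcv_bytes = 0, congested = 0;
	/* deltas: bytes queued (+) on delivery, bytes drained (-) by recvmsg */
	int deltas[] = { 600, 500, -200, -300, -200, -100, 400 };

	for (unsigned i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		rcv_bytes += deltas[i];

		if (!congested && rcv_bytes > RCVBUF)
			congested = 1;			/* wasn't -> am congested */
		else if (congested && rcv_bytes < RCVBUF / 2)
			congested = 0;			/* was -> aren't congested */

		printf("delta %+5d -> rcv_bytes %4d congested %d\n",
		       deltas[i], rcv_bytes, congested);
	}
	return 0;
}

In the run above, usage between 500 and 1000 bytes keeps whatever state was last entered, which is exactly the bouncing the comment wants to prevent.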
Example 15
void rds_tcp_state_change(struct sock *sk)
{
	void (*state_change)(struct sock *sk);
	struct rds_conn_path *cp;
	struct rds_tcp_connection *tc;

	read_lock_bh(&sk->sk_callback_lock);
	cp = sk->sk_user_data;
	if (!cp) {
		state_change = sk->sk_state_change;
		goto out;
	}
	tc = cp->cp_transport_data;
	state_change = tc->t_orig_state_change;

	rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);

	switch (sk->sk_state) {
	/* ignore connecting sockets as they make progress */
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		break;
	case TCP_ESTABLISHED:
		rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
		break;
	case TCP_CLOSE_WAIT:
	case TCP_CLOSE:
		rds_conn_path_drop(cp);
	default:
		break;
	}
out:
	read_unlock_bh(&sk->sk_callback_lock);
	state_change(sk);
}
Example 16
int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
					       size_t total_len)
{
	unsigned long to_copy;
	unsigned long iov_off;
	unsigned long sg_off;
	struct iovec *iov;
	struct scatterlist *sg;
	int ret = 0;

	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);

	/*
	 * now allocate and copy in the data payload.
	 */
	sg = rm->data.op_sg;
	iov = first_iov;
	iov_off = 0;
	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */

	while (total_len) {
		if (!sg_page(sg)) {
			ret = rds_page_remainder_alloc(sg, total_len,
						       GFP_HIGHUSER);
			if (ret)
				goto out;
			rm->data.op_nents++;
			sg_off = 0;
		}

		while (iov_off == iov->iov_len) {
			iov_off = 0;
			iov++;
		}

		to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
		to_copy = min_t(size_t, to_copy, total_len);

		rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
			 "sg [%p, %u, %u] + %lu\n",
			 to_copy, iov->iov_base, iov->iov_len, iov_off,
			 (void *)sg_page(sg), sg->offset, sg->length, sg_off);

		ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
					      iov->iov_base + iov_off,
					      to_copy);
		if (ret)
			goto out;

		iov_off += to_copy;
		total_len -= to_copy;
		sg_off += to_copy;

		if (sg_off == sg->length)
			sg++;
	}

out:
	return ret;
}
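The copy loop above advances two cursors in lock step, one over the user iovec array and one over the scatterlist, and each iteration copies the minimum of what remains in the current iovec, the current sg entry, and the message as a whole, allocating pages as it goes. A standalone sketch of the same dual-cursor walk over ordinary buffers (the chunk contents and slot sizes are arbitrary):

#include <stdio.h>
#include <string.h>

struct chunk { const char *base; size_t len; };	/* stands in for an iovec */

int main(void)
{
	/* source "iovecs" and a destination split into fixed-size "sg" slots */
	struct chunk src[] = { { "Hello, ", 7 }, { "dual-cursor ", 12 },
			       { "copy!", 5 } };
	char dst[3][10];			/* three 10-byte "sg entries" */
	size_t total = 7 + 12 + 5;
	size_t iov = 0, iov_off = 0;		/* cursor over the source chunks */
	size_t sgi = 0, sg_off = 0;		/* cursor over the destination slots */

	memset(dst, 0, sizeof(dst));

	while (total) {
		/* skip over exhausted source chunks, as the kernel loop does */
		while (iov_off == src[iov].len) {
			iov_off = 0;
			iov++;
		}

		size_t to_copy = src[iov].len - iov_off;
		if (to_copy > sizeof(dst[0]) - sg_off)
			to_copy = sizeof(dst[0]) - sg_off;
		if (to_copy > total)
			to_copy = total;

		memcpy(&dst[sgi][sg_off], src[iov].base + iov_off, to_copy);

		iov_off += to_copy;
		sg_off += to_copy;
		total -= to_copy;

		if (sg_off == sizeof(dst[0])) {	/* destination slot full */
			sg_off = 0;
			sgi++;
		}
	}

	printf("%.10s|%.10s|%.10s\n", dst[0], dst[1], dst[2]);
	return 0;
}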
Example 17
void rds_tcp_state_change(struct sock *sk)
{
	void (*state_change)(struct sock *sk);
	struct rds_connection *conn;
	struct rds_tcp_connection *tc;

	read_lock(&sk->sk_callback_lock);
	conn = sk->sk_user_data;
	if (conn == NULL) {
		state_change = sk->sk_state_change;
		goto out;
	}
	tc = conn->c_transport_data;
	state_change = tc->t_orig_state_change;

	rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);

	switch (sk->sk_state) {
	/* ignore connecting sockets as they make progress */
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		break;
	case TCP_ESTABLISHED:
		rds_connect_complete(conn);
		break;
	case TCP_CLOSE:
		rds_conn_drop(conn);
	default:
		break;
	}
out:
	read_unlock(&sk->sk_callback_lock);
	state_change(sk);
}
Example 18
static void
rds_recv_drop(struct rds_connection *conn, __be32 saddr, __be32 daddr,
	      struct rds_incoming *inc, gfp_t gfp)
{
	/* drop the existing incoming message */
	rdsdebug("dropping request on conn %p, inc %p, %u.%u.%u.%u -> %u.%u.%u.%u",
		 conn, inc, NIPQUAD(saddr), NIPQUAD(daddr));
}
Example 19
static void rds_rdma_listen_stop(void)
{
	if (rds_rdma_listen_id) {
		rdsdebug("cm %p\n", rds_rdma_listen_id);
		rdma_destroy_id(rds_rdma_listen_id);
		rds_rdma_listen_id = NULL;
	}
}
Example 20
void rds_inc_put(struct rds_incoming *inc)
{
	rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
	if (atomic_dec_and_test(&inc->i_refcount)) {
		BUG_ON(!list_empty(&inc->i_item));

		inc->i_conn->c_trans->inc_free(inc);
	}
}
Example 21
int rds_message_inc_copy_to_user(struct rds_incoming *inc,
				 struct iovec *first_iov, size_t size)
{
	struct rds_message *rm;
	struct iovec *iov;
	struct scatterlist *sg;
	unsigned long to_copy;
	unsigned long iov_off;
	unsigned long vec_off;
	int copied;
	int ret;
	u32 len;

	rm = container_of(inc, struct rds_message, m_inc);
	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	iov = first_iov;
	iov_off = 0;
	sg = rm->data.op_sg;
	vec_off = 0;
	copied = 0;

	while (copied < size && copied < len) {
		while (iov_off == iov->iov_len) {
			iov_off = 0;
			iov++;
		}

		to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
		to_copy = min_t(size_t, to_copy, size - copied);
		to_copy = min_t(unsigned long, to_copy, len - copied);

		rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
			 "sg [%p, %u, %u] + %lu\n",
			 to_copy, iov->iov_base, iov->iov_len, iov_off,
			 sg_page(sg), sg->offset, sg->length, vec_off);

		ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
					    iov->iov_base + iov_off,
					    to_copy);
		if (ret) {
			copied = ret;
			break;
		}

		iov_off += to_copy;
		vec_off += to_copy;
		copied += to_copy;

		if (vec_off == sg->length) {
			vec_off = 0;
			sg++;
		}
	}

	return copied;
}
Example 22
static int rds_tcp_accept_one(struct socket *sock)
{
	struct socket *new_sock = NULL;
	struct rds_connection *conn;
	int ret;
	struct inet_sock *inet;

	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
			       sock->sk->sk_protocol, &new_sock);
	if (ret)
		goto out;

	new_sock->type = sock->type;
	new_sock->ops = sock->ops;
	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
	if (ret < 0)
		goto out;

	rds_tcp_tune(new_sock);

	inet = inet_sk(new_sock->sk);

	rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
		 &inet->inet_saddr, ntohs(inet->inet_sport),
		 &inet->inet_daddr, ntohs(inet->inet_dport));

	conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
			       &rds_tcp_transport, GFP_KERNEL);
	if (IS_ERR(conn)) {
		ret = PTR_ERR(conn);
		goto out;
	}

	/*
	 * see the comment above rds_queue_reconnect()
	 */
	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
		if (rds_conn_state(conn) == RDS_CONN_UP)
			rds_tcp_stats_inc(s_tcp_listen_closed_stale);
		else
			rds_tcp_stats_inc(s_tcp_connect_raced);
		rds_conn_drop(conn);
		ret = 0;
		goto out;
	}

	rds_tcp_set_callbacks(new_sock, conn);
	rds_connect_complete(conn);
	new_sock = NULL;
	ret = 0;

out:
	if (new_sock)
		sock_release(new_sock);
	return ret;
}
Example 23
static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
{
	struct rds_connection *conn = data;
	struct rds_ib_connection *ic = conn->c_transport_data;

	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
		break;
	default:
		rdsdebug("Fatal QP Event %u "
			"- connection %pI4->%pI4, reconnecting\n",
			event->event, &conn->c_laddr, &conn->c_faddr);
		rds_conn_drop(conn);
		break;
	}
}
Example 24
static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
{
	struct rds_connection *conn = context;
	struct rds_ib_connection *ic = conn->c_transport_data;

	rdsdebug("conn %p cq %p\n", conn, cq);

	rds_ib_stats_inc(s_ib_evt_handler_call);

	tasklet_schedule(&ic->i_send_tasklet);
}
Example 25
void rds_message_put(struct rds_message *rm)
{
	rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
	WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm);
	if (atomic_dec_and_test(&rm->m_refcount)) {
		BUG_ON(!list_empty(&rm->m_sock_item));
		BUG_ON(!list_empty(&rm->m_conn_item));
		rds_message_purge(rm);

		kfree(rm);
	}
}
Example 26
static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
			  struct rds_ib_send_work *send,
			  int wc_status)
{
	struct rds_message *rm = send->s_rm;

	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);

	ib_dma_unmap_sg(ic->i_cm_id->device,
		     rm->m_sg, rm->m_nents,
		     DMA_TO_DEVICE);

	if (rm->m_rdma_op != NULL) {
		rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);

		/* If the user asked for a completion notification on this
		 * message, we can implement three different semantics:
		 *  1.	Notify when we received the ACK on the RDS message
		 *	that was queued with the RDMA. This provides reliable
		 *	notification of RDMA status at the expense of a one-way
		 *	packet delay.
		 *  2.	Notify when the IB stack gives us the completion event for
		 *	the RDMA operation.
		 *  3.	Notify when the IB stack gives us the completion event for
		 *	the accompanying RDS messages.
		 * Here, we implement approach #3. To implement approach #2,
		 * call rds_rdma_send_complete from the cq_handler. To implement #1,
		 * don't call rds_rdma_send_complete at all, and fall back to the notify
		 * handling in the ACK processing code.
		 *
		 * Note: There's no need to explicitly sync any RDMA buffers using
		 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
		 * operation itself unmapped the RDMA buffers, which takes care
		 * of synching.
		 */
		rds_ib_send_rdma_complete(rm, wc_status);

		if (rm->m_rdma_op->r_write)
			rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
		else
			rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
	}

	/* If anyone waited for this message to get flushed out, wake
	 * them up now */
	rds_message_unmapped(rm);

	rds_message_put(rm);
	send->s_rm = NULL;
}
Example 27
void rds_remove_bound(struct rds_sock *rs)
{

	if (!rs->rs_bound_addr)
		return;

	rdsdebug("rs %p unbinding from %pI4:%d\n",
		 rs, &rs->rs_bound_addr,
		 ntohs(rs->rs_bound_port));

	rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
	rds_sock_put(rs);
	rs->rs_bound_addr = 0;
}
Example 28
/* returns 0 on success (with *port set to the bound port) or -ve errno */
static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
{
	int ret = -EADDRINUSE;
	u16 rover, last;
	u64 key;

	if (*port != 0) {
		rover = be16_to_cpu(*port);
		if (rover == RDS_FLAG_PROBE_PORT)
			return -EINVAL;
		last = rover;
	} else {
		rover = max_t(u16, prandom_u32(), 2);
		last = rover - 1;
	}

	do {
		if (rover == 0)
			rover++;

		if (rover == RDS_FLAG_PROBE_PORT)
			continue;
		key = ((u64)addr << 32) | cpu_to_be16(rover);
		if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms))
			continue;

		rs->rs_bound_key = key;
		rs->rs_bound_addr = addr;
		net_get_random_once(&rs->rs_hash_initval,
				    sizeof(rs->rs_hash_initval));
		rs->rs_bound_port = cpu_to_be16(rover);
		rs->rs_bound_node.next = NULL;
		rds_sock_addref(rs);
		if (!rhashtable_insert_fast(&bind_hash_table,
					    &rs->rs_bound_node, ht_parms)) {
			*port = rs->rs_bound_port;
			ret = 0;
			rdsdebug("rs %p binding to %pI4:%d\n",
			  rs, &addr, (int)ntohs(*port));
			break;
		} else {
			rds_sock_put(rs);
			ret = -ENOMEM;
			break;
		}
	} while (rover++ != last);

	return ret;
}
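The bind loop above is a "rover" search: start at a random 16-bit value, walk the whole port space once with wrap-around, skip port 0 and the reserved probe port, and claim the first key the hash table doesn't already contain. A standalone sketch with a plain array standing in for the rhashtable (PROBE_PORT's value and the random start are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define PROBE_PORT 5555		/* illustrative stand-in for RDS_FLAG_PROBE_PORT */

static uint8_t used[65536];	/* toy replacement for the bind hash table */

/* try to reserve a port; returns the port on success, 0 if the space is full */
static uint16_t rover_bind(void)
{
	uint16_t rover, last;

	rover = (uint16_t)rand();
	if (rover < 2)			/* like max_t(u16, prandom_u32(), 2) */
		rover = 2;
	last = rover - 1;		/* stop once we have wrapped all the way around */

	do {
		if (rover == 0)
			rover++;	/* 0 means "pick one for me"; never bind it */
		if (rover == PROBE_PORT)
			continue;	/* reserved, skip it */
		if (!used[rover]) {
			used[rover] = 1;	/* analogous to rhashtable_insert_fast() */
			return rover;
		}
	} while (rover++ != last);

	return 0;
}

int main(void)
{
	/* pretend a few ports are already bound */
	used[2000] = used[2001] = 1;

	for (int i = 0; i < 3; i++)
		printf("bound port %d\n", rover_bind());
	return 0;
}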
Example 29
void rds_page_exit(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		struct rds_page_remainder *rem;

		rem = &per_cpu(rds_page_remainders, cpu);
		rdsdebug("cpu %u\n", cpu);

		if (rem->r_page)
			__free_page(rem->r_page);
		rem->r_page = NULL;
	}
}
Example 30
/*
 * Return the rds_sock bound at the given local address.
 *
 * The rx path can race with rds_release.  We notice if rds_release() has
 * marked this socket and don't return a rs ref to the rx path.
 */
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{
	u64 key = ((u64)addr << 32) | port;
	struct rds_sock *rs;

	rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms);
	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
		rds_sock_addref(rs);
	else
		rs = NULL;

	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
		ntohs(port));

	return rs;
}