Example #1
0
/* Like tcp_fin - called when SDP_MID_DISCONN is received */
static void
sdp_handle_disconn(struct sdp_sock *ssk)
{

	sdp_dbg(ssk->socket, "%s\n", __func__);

	SDP_WLOCK_ASSERT(ssk);
	if (TCPS_HAVERCVDFIN(ssk->state) == 0)
		socantrcvmore(ssk->socket);

	switch (ssk->state) {
	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		ssk->state = TCPS_CLOSE_WAIT;
		break;

	case TCPS_FIN_WAIT_1:
		/* Received a reply FIN - start Infiniband tear down */
		sdp_dbg(ssk->socket,
		    "%s: Starting Infiniband tear down sending DREQ\n",
		    __func__);

		sdp_cancel_dreq_wait_timeout(ssk);
		ssk->qp_active = 0;
		if (ssk->id) {
			struct rdma_cm_id *id;

			id = ssk->id;
			SDP_WUNLOCK(ssk);
			rdma_disconnect(id);
			SDP_WLOCK(ssk);
		} else {
			sdp_warn(ssk->socket,
			    "%s: ssk->id is NULL\n", __func__);
			return;
		}
		break;
	case TCPS_TIME_WAIT:
		/* This is a mutual close situation and we've got the DREQ from
		   the peer before the SDP_MID_DISCONNECT */
		break;
	case TCPS_CLOSED:
		/* FIN arrived after IB teardown started - do nothing */
		sdp_dbg(ssk->socket, "%s: fin in state %s\n",
		    __func__, sdp_state_str(ssk->state));
		return;
	default:
		sdp_warn(ssk->socket,
		    "%s: FIN in unexpected state. state=%d\n",
		    __func__, ssk->state);
		break;
	}
}
Example #2
0
static inline void
sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
{

	if (likely(wc->wr_id & SDP_OP_SEND)) {
		sdp_handle_send_comp(ssk, wc);
		return;
	}

#ifdef SDP_ZCOPY
	if (wc->wr_id & SDP_OP_RDMA) {
		/* TODO: handle failed RDMA read cqe */

		sdp_dbg_data(ssk->socket,
		    "TX comp: RDMA read. status: %d\n", wc->status);
		sdp_prf1(ssk->socket, NULL, "TX comp: RDMA read");

		if (!ssk->tx_ring.rdma_inflight) {
			sdp_warn(ssk->socket, "ERROR: unexpected RDMA read\n");
			return;
		}

		if (!ssk->tx_ring.rdma_inflight->busy) {
			sdp_warn(ssk->socket,
			    "ERROR: too many RDMA read completions\n");
			return;
		}

		/* Only the last RDMA read WR is signalled. Ordering is
		 * guaranteed, so if the last RDMA read WR has completed,
		 * all earlier ones have completed too. */
		ssk->tx_ring.rdma_inflight->busy = 0;
		sowwakeup(ssk->socket);
		sdp_dbg_data(ssk->socket, "woke up sleepers\n");
		return;
	}
#endif

	/* Keepalive probe sent cleanup */
	sdp_cnt(sdp_keepalive_probes_sent);

	if (likely(!wc->status))
		return;

	sdp_dbg(ssk->socket, " %s consumes KEEPALIVE status %d\n",
			__func__, wc->status);

	if (wc->status == IB_WC_WR_FLUSH_ERR)
		return;

	sdp_notify(ssk, ECONNRESET);
}
Example #3
0
/* called only from irq */
static struct mbuf *
sdp_process_rx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
{
	struct mbuf *mb;
	struct sdp_bsdh *h;
	struct socket *sk = ssk->socket;
	int mseq;

	mb = sdp_recv_completion(ssk, wc->wr_id);
	if (unlikely(!mb))
		return NULL;

	if (unlikely(wc->status)) {
		if (ssk->qp_active && sk) {
			sdp_dbg(sk, "Recv completion with error. "
					"Status %d, vendor: %d\n",
				wc->status, wc->vendor_err);
			sdp_abort(sk);
			ssk->qp_active = 0;
		}
		m_freem(mb);
		return NULL;
	}

	sdp_dbg_data(sk, "Recv completion. ID %d Length %d\n",
			(int)wc->wr_id, wc->byte_len);
	if (unlikely(wc->byte_len < sizeof(struct sdp_bsdh))) {
		sdp_warn(sk, "SDP BUG! byte_len %d < %zd\n",
				wc->byte_len, sizeof(struct sdp_bsdh));
		m_freem(mb);
		return NULL;
	}
	/* Use m_adj to trim the tail of data we didn't use. */
	m_adj(mb, -(mb->m_pkthdr.len - wc->byte_len));
	h = mtod(mb, struct sdp_bsdh *);

	SDP_DUMP_PACKET(ssk->socket, "RX", mb, h);

	ssk->rx_packets++;
	ssk->rx_bytes += mb->m_pkthdr.len;

	mseq = ntohl(h->mseq);
	atomic_set(&ssk->mseq_ack, mseq);
	if (mseq != (int)wc->wr_id)
		sdp_warn(sk, "SDP BUG! mseq %d != wrid %d\n",
				mseq, (int)wc->wr_id);

	return mb;
}
Example #4
0
int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
{
	struct ib_cq *tx_cq;
	int rc = 0, vector = 0;

	atomic_set(&ssk->tx_ring.head, 1);
	atomic_set(&ssk->tx_ring.tail, 1);

	ssk->tx_ring.buffer = kmalloc(
			sizeof *ssk->tx_ring.buffer * SDP_TX_SIZE, GFP_KERNEL);
	if (!ssk->tx_ring.buffer) {
		rc = -ENOMEM;
		sdp_warn(sk_ssk(ssk), "Can't allocate TX Ring size %zd.\n",
			 sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE);

		goto out;
	}

	vector = sdp_get_vector_num(device);

	tx_cq = ib_create_cq(device, sdp_tx_irq, sdp_tx_cq_event_handler,
			  sk_ssk(ssk), SDP_TX_SIZE, vector);

	if (IS_ERR(tx_cq)) {
		rc = PTR_ERR(tx_cq);
		sdp_warn(sk_ssk(ssk), "Unable to allocate TX CQ: %d.\n", rc);
		goto err_cq;
	}

	ssk->tx_ring.cq = tx_cq;

	setup_timer(&ssk->tx_ring.timer, sdp_poll_tx_timeout,
			(unsigned long)ssk);
	ssk->tx_ring.poll_cnt = 0;

	tasklet_init(&ssk->tx_ring.tasklet, sdp_poll_tx_timeout,
			(unsigned long) ssk);

	setup_timer(&ssk->nagle_timer, sdp_nagle_timeout, (unsigned long) ssk);

	return 0;

err_cq:
	kfree(ssk->tx_ring.buffer);
	ssk->tx_ring.buffer = NULL;
out:
	return rc;
}
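Both the TX and RX rings rely on the same accounting scheme: head and tail are free-running counters initialized to 1, the number of outstanding work requests is head - tail, and a slot is selected by masking the counter with the (power-of-two) ring size, exactly as sdp_post_recv later does with id & (SDP_RX_SIZE - 1). The following user-space sketch of that arithmetic uses a hypothetical DEMO_RING_SIZE in place of SDP_TX_SIZE / SDP_RX_SIZE.

#include <stdio.h>

#define DEMO_RING_SIZE	256		/* stand-in for SDP_TX_SIZE; power of two */

struct demo_ring {
	unsigned int head;		/* next slot to post */
	unsigned int tail;		/* next slot to be completed */
	int buffer[DEMO_RING_SIZE];
};

static unsigned int
demo_ring_posted(const struct demo_ring *r)
{
	return (r->head - r->tail);	/* unsigned math handles wrap-around */
}

static int *
demo_ring_slot(struct demo_ring *r, unsigned int id)
{
	return (&r->buffer[id & (DEMO_RING_SIZE - 1)]);
}

int
main(void)
{
	struct demo_ring r = { .head = 1, .tail = 1 };	/* as in sdp_tx_ring_create */

	*demo_ring_slot(&r, r.head) = 123;		/* post one WR */
	r.head++;
	printf("posted=%u\n", demo_ring_posted(&r));	/* 1 */
	r.tail++;					/* its completion arrived */
	printf("posted=%u\n", demo_ring_posted(&r));	/* 0 */
	return (0);
}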
Example #5
0
void
sdp_tx_ring_destroy(struct sdp_sock *ssk)
{

	sdp_dbg(ssk->socket, "tx ring destroy\n");
	SDP_WLOCK(ssk);
	callout_stop(&ssk->tx_ring.timer);
	callout_stop(&ssk->nagle_timer);
	SDP_WUNLOCK(ssk);
	callout_drain(&ssk->tx_ring.timer);
	callout_drain(&ssk->nagle_timer);

	if (ssk->tx_ring.buffer) {
		sdp_tx_ring_purge(ssk);

		kfree(ssk->tx_ring.buffer);
		ssk->tx_ring.buffer = NULL;
	}

	if (ssk->tx_ring.cq) {
		if (ib_destroy_cq(ssk->tx_ring.cq)) {
			sdp_warn(ssk->socket, "destroy cq(%p) failed\n",
					ssk->tx_ring.cq);
		} else {
			ssk->tx_ring.cq = NULL;
		}
	}

	WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring));
}
Example #6
0
void sdp_tx_ring_destroy(struct sdp_sock *ssk)
{
	del_timer_sync(&ssk->tx_ring.timer);

	if (ssk->nagle_timer.function)
		del_timer_sync(&ssk->nagle_timer);

	if (ssk->tx_ring.buffer) {
		sdp_tx_ring_purge(ssk);

		kfree(ssk->tx_ring.buffer);
		ssk->tx_ring.buffer = NULL;
	}

	if (ssk->tx_ring.cq) {
		if (ib_destroy_cq(ssk->tx_ring.cq)) {
			sdp_warn(sk_ssk(ssk), "destroy cq(%p) failed\n",
					ssk->tx_ring.cq);
		} else {
			ssk->tx_ring.cq = NULL;
		}
	}

	/* The TX CQ is already destroyed, so no further tx_irq can fire and
	 * nothing can reschedule this tasklet. */
	tasklet_kill(&ssk->tx_ring.tasklet);

	SDP_WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring));
}
Example #7
0
void
sdp_rx_ring_destroy(struct sdp_sock *ssk)
{

	cancel_work_sync(&ssk->rx_comp_work);
	rx_ring_destroy_lock(&ssk->rx_ring);

	if (ssk->rx_ring.buffer) {
		sdp_rx_ring_purge(ssk);

		kfree(ssk->rx_ring.buffer);
		ssk->rx_ring.buffer = NULL;
	}

	if (ssk->rx_ring.cq) {
		if (ib_destroy_cq(ssk->rx_ring.cq)) {
			sdp_warn(ssk->socket, "destroy cq(%p) failed\n",
				ssk->rx_ring.cq);
		} else {
			ssk->rx_ring.cq = NULL;
		}
	}

	WARN_ON(ring_head(ssk->rx_ring) != ring_tail(ssk->rx_ring));
}
static void sdp_qp_event_handler(struct ib_event *event, void *data)
{
	if (event->event == IB_EVENT_PATH_MIG) {
		sdp_dbg(NULL, "Path migration event\n");
		return;
	}
	sdp_warn(NULL, "unexpected invocation: event: %d, data=%p\n",
			event->event, data);
}
Example #9
0
int
sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
{
	struct ib_cq *tx_cq;
	int rc = 0;

	sdp_dbg(ssk->socket, "tx ring create\n");
	callout_init_rw(&ssk->tx_ring.timer, &ssk->lock, 0);
	callout_init_rw(&ssk->nagle_timer, &ssk->lock, 0);
	atomic_set(&ssk->tx_ring.head, 1);
	atomic_set(&ssk->tx_ring.tail, 1);

	ssk->tx_ring.buffer = kzalloc(
			sizeof *ssk->tx_ring.buffer * SDP_TX_SIZE, GFP_KERNEL);
	if (!ssk->tx_ring.buffer) {
		rc = -ENOMEM;
		sdp_warn(ssk->socket, "Can't allocate TX Ring size %zd.\n",
			 sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE);

		goto out;
	}

	tx_cq = ib_create_cq(device, sdp_tx_irq, sdp_tx_cq_event_handler,
			  ssk, SDP_TX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED);

	if (IS_ERR(tx_cq)) {
		rc = PTR_ERR(tx_cq);
		sdp_warn(ssk->socket, "Unable to allocate TX CQ: %d.\n", rc);
		goto err_cq;
	}
	ssk->tx_ring.cq = tx_cq;
	ssk->tx_ring.poll_cnt = 0;
	sdp_arm_tx_cq(ssk);

	return 0;

err_cq:
	kfree(ssk->tx_ring.buffer);
	ssk->tx_ring.buffer = NULL;
out:
	return rc;
}
Example #10
0
int
sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
{
	struct ib_cq *rx_cq;
	int rc = 0;


	sdp_dbg(ssk->socket, "rx ring created");
	INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work);
	atomic_set(&ssk->rx_ring.head, 1);
	atomic_set(&ssk->rx_ring.tail, 1);

	ssk->rx_ring.buffer = kmalloc(
			sizeof *ssk->rx_ring.buffer * SDP_RX_SIZE, GFP_KERNEL);
	if (!ssk->rx_ring.buffer) {
		sdp_warn(ssk->socket,
			"Unable to allocate RX Ring size %zd.\n",
			 sizeof(*ssk->rx_ring.buffer) * SDP_RX_SIZE);

		return -ENOMEM;
	}

	rx_cq = ib_create_cq(device, sdp_rx_irq, sdp_rx_cq_event_handler,
			  ssk->socket, SDP_RX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED);

	if (IS_ERR(rx_cq)) {
		rc = PTR_ERR(rx_cq);
		sdp_warn(ssk->socket, "Unable to allocate RX CQ: %d.\n", rc);
		goto err_cq;
	}

	sdp_sk(ssk->socket)->rx_ring.cq = rx_cq;
	sdp_arm_rx_cq(ssk);

	return 0;

err_cq:
	kfree(ssk->rx_ring.buffer);
	ssk->rx_ring.buffer = NULL;
	return rc;
}
Example #11
0
static inline void sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
{
	struct sock *sk = sk_ssk(ssk);

	if (likely(wc->wr_id & SDP_OP_SEND)) {
		struct sk_buff *skb;

		skb = sdp_send_completion(ssk, wc->wr_id);
		if (likely(skb))
			sk_wmem_free_skb(sk, skb);
	} else if (wc->wr_id & SDP_OP_RDMA) {
		if (ssk->tx_ring.rdma_inflight &&
				ssk->tx_ring.rdma_inflight->busy) {
			/* Only the last RDMA read WR is signalled. Ordering
			 * is guaranteed, so if the last RDMA read WR has
			 * completed, all earlier ones have completed too. */
			ssk->tx_ring.rdma_inflight->busy = 0;
		} else {
			sdp_warn(sk, "Unexpected RDMA read completion, "
					"probably was canceled already\n");
		}

		wake_up(sdp_sk_sleep(sk));
	} else {
		/* Keepalive probe sent cleanup */
		sdp_cnt(sdp_keepalive_probes_sent);
	}

	if (likely(!wc->status) || wc->status == IB_WC_WR_FLUSH_ERR)
		return;

	sdp_warn(sk, "Send completion with error. wr_id 0x%llx Status %d\n",
			wc->wr_id, wc->status);

	sdp_set_error(sk, -ECONNRESET);
}
static int sdp_get_max_dev_sge(struct ib_device *dev)
{
	struct ib_device_attr attr;
	static int max_sges = -1;
	int rc;

	if (max_sges > 0)
		goto out;

	rc = ib_query_device(dev, &attr);
	if (rc) {
		sdp_warn(NULL, "ib_query_device failed: %d\n", rc);
		goto out;
	}

	max_sges = attr.max_sge;

out:
	return max_sges;
}
Example #13
0
static void
sdp_poll_tx(struct sdp_sock *ssk)
{
	struct socket *sk = ssk->socket;
	u32 inflight, wc_processed;

	sdp_prf1(ssk->socket, NULL, "TX timeout: inflight=%d, head=%d tail=%d", 
		(u32) tx_ring_posted(ssk),
		ring_head(ssk->tx_ring), ring_tail(ssk->tx_ring));

	if (unlikely(ssk->state == TCPS_CLOSED)) {
		sdp_warn(sk, "Socket is closed\n");
		goto out;
	}

	wc_processed = sdp_process_tx_cq(ssk);
	if (!wc_processed)
		SDPSTATS_COUNTER_INC(tx_poll_miss);
	else
		SDPSTATS_COUNTER_INC(tx_poll_hit);

	inflight = (u32) tx_ring_posted(ssk);
	sdp_prf1(ssk->socket, NULL, "finished tx proccessing. inflight = %d",
	    inflight);

	/* If packets are still in flight, (re)arm the timer here so that
	 * their completions are guaranteed to be processed even if the TX
	 * routine does not schedule it again. */
	if (inflight)
		callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT,
		    sdp_poll_tx_timeout, ssk);
out:
#ifdef SDP_ZCOPY
	if (ssk->tx_ring.rdma_inflight && ssk->tx_ring.rdma_inflight->busy) {
		sdp_prf1(sk, NULL, "RDMA is inflight - arming irq");
		sdp_arm_tx_cq(ssk);
	}
#endif
	return;
}
static int sdp_init_qp(struct sock *sk, struct rdma_cm_id *id)
{
	struct ib_qp_init_attr qp_init_attr = {
		.event_handler = sdp_qp_event_handler,
		.cap.max_send_wr = SDP_TX_SIZE,
		.cap.max_recv_wr = sdp_rx_size,
		.cap.max_inline_data = sdp_inline_thresh,
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	struct ib_device *device = id->device;
	int rc;

	sdp_dbg(sk, "%s\n", __func__);

	sdp_sk(sk)->max_sge = sdp_get_max_dev_sge(device);
	sdp_dbg(sk, "Max sges: %d\n", sdp_sk(sk)->max_sge);

	qp_init_attr.cap.max_send_sge = MIN(sdp_sk(sk)->max_sge, SDP_MAX_SEND_SGES);
	sdp_dbg(sk, "Setting max send sge to: %d\n", qp_init_attr.cap.max_send_sge);

	qp_init_attr.cap.max_recv_sge = MIN(sdp_sk(sk)->max_sge, SDP_MAX_RECV_SGES);
	sdp_dbg(sk, "Setting max recv sge to: %d\n", qp_init_attr.cap.max_recv_sge);

	sdp_sk(sk)->sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_sk(sk)->sdp_dev) {
		sdp_warn(sk, "SDP not available on device %s\n", device->name);
		rc = -ENODEV;
		goto err_rx;
	}

	rc = sdp_rx_ring_create(sdp_sk(sk), device);
	if (rc)
		goto err_rx;

	rc = sdp_tx_ring_create(sdp_sk(sk), device);
	if (rc)
		goto err_tx;

	qp_init_attr.recv_cq = sdp_sk(sk)->rx_ring.cq;
	qp_init_attr.send_cq = sdp_sk(sk)->tx_ring.cq;

	rc = rdma_create_qp(id, sdp_sk(sk)->sdp_dev->pd, &qp_init_attr);
	if (rc) {
		sdp_warn(sk, "Unable to create QP: %d.\n", rc);
		goto err_qp;
	}
	sdp_sk(sk)->qp = id->qp;
	sdp_sk(sk)->ib_device = device;
	sdp_sk(sk)->qp_active = 1;
	sdp_sk(sk)->context.device = device;
	sdp_sk(sk)->inline_thresh = qp_init_attr.cap.max_inline_data;

	sdp_dbg(sk, "%s done\n", __func__);
	return 0;

err_qp:
	sdp_tx_ring_destroy(sdp_sk(sk));
err_tx:
	sdp_rx_ring_destroy(sdp_sk(sk));
err_rx:
	return rc;
}

static int sdp_get_max_send_frags(u32 buf_size)
{
	return MIN(
		/* +1 to compensate for unaligned buffers */
		(PAGE_ALIGN(buf_size) >> PAGE_SHIFT) + 1,
		SDP_MAX_SEND_SGES - 1);
}

static int sdp_connect_handler(struct sock *sk, struct rdma_cm_id *id,
		       	struct rdma_cm_event *event)
{
	struct sockaddr_in *dst_addr;
	struct sock *child;
	const struct sdp_hh *h;
	int rc = 0;

	sdp_dbg(sk, "%s %p -> %p\n", __func__, sdp_sk(sk)->id, id);

	h = event->param.conn.private_data;
	SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh);

	if (h->ipv_cap & HH_IPV_MASK & ~(HH_IPV4 | HH_IPV6)) {
		sdp_warn(sk, "Bad IPV field in SDP Hello header: 0x%x\n",
				h->ipv_cap & HH_IPV_MASK);
		return -EINVAL;
	}

	if (!h->max_adverts)
		return -EINVAL;

#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0))
	child = sk_clone(sk, GFP_KERNEL);
#else
	child = sk_clone_lock(sk, GFP_KERNEL);
#endif
	if (!child)
		return -ENOMEM;

	sdp_init_sock(child);

	dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr;
	sdp_inet_dport(child) = dst_addr->sin_port;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	if (inet6_sk(sk)) {
		struct ipv6_pinfo *newnp;

		newnp = inet_sk(child)->pinet6 = sdp_inet6_sk_generic(child);

		memcpy(newnp, inet6_sk(sk), sizeof(struct ipv6_pinfo));
		if ((h->ipv_cap & HH_IPV_MASK) == HH_IPV4) {
			/* V6 mapped */
			sdp_inet_daddr(child) = dst_addr->sin_addr.s_addr;
			ipv6_addr_set(&child->sk_v6_daddr, 0, 0, htonl(0x0000FFFF),
					h->src_addr.ip4.addr);

			ipv6_addr_set(&child->sk_v6_rcv_saddr, 0, 0, htonl(0x0000FFFF),
					h->dst_addr.ip4.addr);

			ipv6_addr_copy(&child->sk_v6_rcv_saddr, &child->sk_v6_daddr);
		} else if ((h->ipv_cap & HH_IPV_MASK) == HH_IPV6) {
			struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst_addr;
			struct sockaddr_in6 *src_addr6 =
				(struct sockaddr_in6 *)&id->route.addr.src_addr;

			ipv6_addr_copy(&child->sk_v6_daddr, &dst_addr6->sin6_addr);
			ipv6_addr_copy(&child->sk_v6_rcv_saddr, &src_addr6->sin6_addr);
			ipv6_addr_copy(&newnp->saddr, &src_addr6->sin6_addr);
		} else {
			sdp_warn(child, "Bad IPV field: 0x%x\n", h->ipv_cap & HH_IPV_MASK);
		}

		sdp_inet_daddr(child) = sdp_inet_saddr(child) =
			sdp_inet_rcv_saddr(child) = LOOPBACK4_IPV6;
	} else
#endif
	{
		sdp_inet_daddr(child) = dst_addr->sin_addr.s_addr;
	}

#ifdef SDP_SOCK_HISTORY
	sdp_ssk_hist_rename(sk);
#endif
	__sock_put(child, SOCK_REF_CLONE);

	down_read(&device_removal_lock);

	rc = sdp_init_qp(child, id);
	if (rc) {
		bh_unlock_sock(child);
		up_read(&device_removal_lock);
		sdp_sk(child)->destructed_already = 1;
#ifdef SDP_SOCK_HISTORY
		sdp_ssk_hist_close(child);
#endif
		sk_free(child);
		return rc;
	}

	sdp_sk(child)->max_bufs = ntohs(h->bsdh.bufs);
	atomic_set(&sdp_sk(child)->tx_ring.credits, sdp_sk(child)->max_bufs);

	sdp_sk(child)->min_bufs = tx_credits(sdp_sk(child)) / 4;
	sdp_sk(child)->xmit_size_goal = ntohl(h->localrcvsz) -
		sizeof(struct sdp_bsdh);

	sdp_sk(child)->send_frags = sdp_get_max_send_frags(sdp_sk(child)->xmit_size_goal);
	sdp_init_buffers(sdp_sk(child), rcvbuf_initial_size);

	id->context = child;
	sdp_sk(child)->id = id;

	list_add_tail(&sdp_sk(child)->backlog_queue,
			&sdp_sk(sk)->backlog_queue);
	sdp_sk(child)->parent = sk;

	bh_unlock_sock(child);
	sdp_add_sock(sdp_sk(child));
	up_read(&device_removal_lock);

	sdp_exch_state(child, TCPF_LISTEN | TCPF_CLOSE, TCP_SYN_RECV);

	/* child->sk_write_space(child); */
	/* child->sk_data_ready(child, 0); */
	sk->sk_data_ready(sk);

	return 0;
}
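
The peer's Hello header is what seeds the flow-control state set up above: bsdh.bufs becomes the initial TX credit count (max_bufs), min_bufs is a quarter of that, and localrcvsz minus the BSDH header size becomes xmit_size_goal. Below is a small numeric sketch with made-up wire values; demo_hello and DEMO_BSDH_SIZE are illustrative stand-ins, not the real struct sdp_hh layout or sizeof(struct sdp_bsdh).

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the Hello header fields used above. */
struct demo_hello {
	uint16_t bufs;			/* network order, like h->bsdh.bufs */
	uint32_t localrcvsz;		/* network order, like h->localrcvsz */
};

#define DEMO_BSDH_SIZE	16u		/* assumed sizeof(struct sdp_bsdh) */

int
main(void)
{
	struct demo_hello h = { .bufs = htons(64), .localrcvsz = htonl(32768) };
	unsigned int max_bufs = ntohs(h.bufs);			/* 64 TX credits */
	unsigned int min_bufs = max_bufs / 4;			/* 16: low-water mark */
	unsigned int goal = ntohl(h.localrcvsz) - DEMO_BSDH_SIZE;	/* 32752 */

	printf("credits=%u min=%u xmit_size_goal=%u\n", max_bufs, min_bufs, goal);
	return (0);
}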

static int sdp_response_handler(struct sock *sk, struct rdma_cm_id *id,
				struct rdma_cm_event *event)
{
	const struct sdp_hah *h;
	struct sockaddr_in *dst_addr;
	sdp_dbg(sk, "%s\n", __func__);

	sdp_exch_state(sk, TCPF_SYN_SENT, TCP_ESTABLISHED);
	sdp_set_default_moderation(sdp_sk(sk));

	if (sock_flag(sk, SOCK_KEEPOPEN))
		sdp_start_keepalive_timer(sk);

	if (sock_flag(sk, SOCK_DEAD))
		return 0;

	h = event->param.conn.private_data;
	SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh);
	sdp_sk(sk)->max_bufs = ntohs(h->bsdh.bufs);
	atomic_set(&sdp_sk(sk)->tx_ring.credits, sdp_sk(sk)->max_bufs);
	sdp_sk(sk)->min_bufs = tx_credits(sdp_sk(sk)) / 4;
	sdp_sk(sk)->xmit_size_goal =
		ntohl(h->actrcvsz) - sizeof(struct sdp_bsdh);
	sdp_sk(sk)->send_frags = sdp_get_max_send_frags(sdp_sk(sk)->xmit_size_goal);
	sdp_sk(sk)->xmit_size_goal = MIN(sdp_sk(sk)->xmit_size_goal,
		sdp_sk(sk)->send_frags * PAGE_SIZE);

	sdp_sk(sk)->poll_cq = 1;

	sk->sk_state_change(sk);
	sk_wake_async(sk, 0, POLL_OUT);

	dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr;
	sdp_inet_dport(sk) = dst_addr->sin_port;
	sdp_inet_daddr(sk) = dst_addr->sin_addr.s_addr;

#ifdef SDP_SOCK_HISTORY
	sdp_ssk_hist_rename(sk);
#endif
	return 0;
}

static int sdp_connected_handler(struct sock *sk)
{
	struct sock *parent;
	sdp_dbg(sk, "%s\n", __func__);

	parent = sdp_sk(sk)->parent;
	BUG_ON(!parent);

	sdp_exch_state(sk, TCPF_SYN_RECV, TCP_ESTABLISHED);

#ifdef SDP_SOCK_HISTORY
	sdp_ssk_hist_rename(sk);
#endif
	sdp_set_default_moderation(sdp_sk(sk));

	if (sock_flag(sk, SOCK_KEEPOPEN))
		sdp_start_keepalive_timer(sk);

	if (sock_flag(sk, SOCK_DEAD))
		return 0;

	lock_sock(parent);
	if (!sdp_sk(parent)->id) { /* TODO: look at SOCK_DEAD? */
		sdp_dbg(sk, "parent is going away.\n");
		goto done;
	}

	sk_acceptq_added(parent);
	sdp_dbg(parent, "%s child connection established\n", __func__);
	list_del_init(&sdp_sk(sk)->backlog_queue);
	list_add_tail(&sdp_sk(sk)->accept_queue,
			&sdp_sk(parent)->accept_queue);

	parent->sk_state_change(parent);
	sk_wake_async(parent, 0, POLL_OUT);
done:
	release_sock(parent);

	return 0;
}

static int sdp_disconnected_handler(struct sock *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	if (ssk->tx_ring.cq)
		if (sdp_xmit_poll(ssk, 1))
			sdp_post_sends(ssk, 0);

	if (sk->sk_state == TCP_SYN_RECV) {
		sdp_connected_handler(sk);

		if (rcv_nxt(ssk))
			return 0;
	}

	return -ECONNRESET;
}

int sdp_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rdma_conn_param conn_param;
	struct sock *parent = NULL;
	struct sock *child = NULL;
	struct sock *sk;
	struct sdp_hah hah;
	struct sdp_hh hh;

	int rc = 0, rc2;

	sk = id->context;
	if (!sk) {
		sdp_dbg(NULL, "cm_id is being torn down, event %s\n",
		       	rdma_cm_event_str(event->event));
		return event->event == RDMA_CM_EVENT_CONNECT_REQUEST ?
			-EINVAL : 0;
	}

	sdp_add_to_history(sk, rdma_cm_event_str(event->event));

	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	sdp_dbg(sk, "event: %s\n", rdma_cm_event_str(event->event));
	if (!sdp_sk(sk)->id) {
		sdp_dbg(sk, "socket is being torn down\n");
		rc = event->event == RDMA_CM_EVENT_CONNECT_REQUEST ?
			-EINVAL : 0;
		release_sock(sk);
		return rc;
	}

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		if (sdp_link_layer_ib_only &&
			rdma_node_get_transport(id->device->node_type) ==
				RDMA_TRANSPORT_IB &&
			rdma_port_get_link_layer(id->device, id->port_num) !=
				IB_LINK_LAYER_INFINIBAND) {
			sdp_dbg(sk, "Link layer is: %d. Only IB link layer "
				"is allowed\n",
				rdma_port_get_link_layer(id->device,
					id->port_num));
			rc = -ENETUNREACH;
			break;
		}

		rc = rdma_resolve_route(id, SDP_ROUTE_TIMEOUT);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		rc = -ENETUNREACH;
		break;
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		rc = sdp_init_qp(sk, id);
		if (rc)
			break;
		memset(&hh, 0, sizeof hh);
		hh.bsdh.mid = SDP_MID_HELLO;
		hh.bsdh.len = htonl(sizeof(struct sdp_hh));
		hh.max_adverts = 1;

		hh.majv_minv = SDP_MAJV_MINV;
		sdp_init_buffers(sdp_sk(sk), rcvbuf_initial_size);
		hh.bsdh.bufs = htons(rx_ring_posted(sdp_sk(sk)));
		atomic_set(&sdp_sk(sk)->remote_credits,
				rx_ring_posted(sdp_sk(sk)));
		hh.localrcvsz = hh.desremrcvsz = htonl(sdp_sk(sk)->recv_frags *
				PAGE_SIZE + sizeof(struct sdp_bsdh));
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		if (inet6_sk(sk)) {
			struct sockaddr *src_addr = (struct sockaddr *)&id->route.addr.src_addr;
			struct sockaddr_in *addr4 = (struct sockaddr_in *)src_addr;
			struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)src_addr;

			if (src_addr->sa_family == AF_INET) {
				/* IPv4 over IPv6 */
				ipv6_addr_set(&sk->sk_v6_rcv_saddr, 0, 0, htonl(0xFFFF),
						addr4->sin_addr.s_addr);
			} else {
				sk->sk_v6_rcv_saddr = addr6->sin6_addr;
			}
			inet6_sk(sk)->saddr = sk->sk_v6_rcv_saddr;
		}
		else
#endif
		{
			sdp_inet_saddr(sk) = sdp_inet_rcv_saddr(sk) =
				((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr;
		}
		memset(&conn_param, 0, sizeof conn_param);
		conn_param.private_data_len = sizeof hh;
		conn_param.private_data = &hh;
		conn_param.responder_resources = 4 /* TODO */;
		conn_param.initiator_depth = 4 /* TODO */;
		conn_param.retry_count = sdp_retry_count;
		SDP_DUMP_PACKET(sk, "TX", NULL, &hh.bsdh);

		if (sdp_apm_enable) {
			rc = rdma_enable_apm(id, RDMA_ALT_PATH_BEST);
			if (rc)
				sdp_warn(sk, "APM couldn't be enabled for active side: %d\n", rc);
		}

		rc = rdma_connect(id, &conn_param);
		break;

	case RDMA_CM_EVENT_ALT_ROUTE_RESOLVED:
		sdp_dbg(sk, "alt route was resolved slid=%d, dlid=%d\n",
				id->route.path_rec[1].slid, id->route.path_rec[1].dlid);
		break;

	case RDMA_CM_EVENT_ALT_PATH_LOADED:
		sdp_dbg(sk, "alt route path loaded\n");
		break;

	case RDMA_CM_EVENT_ALT_ROUTE_ERROR:
		sdp_warn(sk, "alt route resolve error\n");
		break;

	case RDMA_CM_EVENT_ROUTE_ERROR:
		rc = -ETIMEDOUT;
		break;
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		rc = sdp_connect_handler(sk, id, event);
		if (rc) {
			sdp_dbg(sk, "Destroying qp\n");
			rdma_reject(id, NULL, 0);
			break;
		}
		child = id->context;
		atomic_set(&sdp_sk(child)->remote_credits,
				rx_ring_posted(sdp_sk(child)));
		memset(&hah, 0, sizeof hah);
		hah.bsdh.mid = SDP_MID_HELLO_ACK;
		hah.bsdh.bufs = htons(rx_ring_posted(sdp_sk(child)));
		hah.bsdh.len = htonl(sizeof(struct sdp_hah));
		hah.majv_minv = SDP_MAJV_MINV;
		hah.ext_max_adverts = 1; /* Doesn't seem to be mandated by spec,
					    but just in case */
		hah.actrcvsz = htonl(sdp_sk(child)->recv_frags * PAGE_SIZE +
			sizeof(struct sdp_bsdh));
		memset(&conn_param, 0, sizeof conn_param);
		conn_param.private_data_len = sizeof hah;
		conn_param.private_data = &hah;
		conn_param.responder_resources = 4 /* TODO */;
		conn_param.initiator_depth = 4 /* TODO */;
		conn_param.retry_count = sdp_retry_count;
		SDP_DUMP_PACKET(sk, "TX", NULL, &hah.bsdh);
		rc = rdma_accept(id, &conn_param);
		if (rc) {
			sdp_sk(child)->id = NULL;
			id->qp = NULL;
			id->context = NULL;
			parent = sdp_sk(child)->parent; /* TODO: hold ? */
		} else if (sdp_apm_enable) {
				rc2 = rdma_enable_apm(id, RDMA_ALT_PATH_BEST);
				if (rc2)
					sdp_warn(sk, "APM couldn't be enabled for passive side: %d\n", rc2);
		}
		break;
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
		rc = sdp_response_handler(sk, id, event);
		if (rc) {
			sdp_dbg(sk, "Destroying qp\n");
			rdma_reject(id, NULL, 0);
		} else {
			rc = rdma_accept(id, NULL);
			if (!rc && sdp_apm_enable) {
				rc2 = rdma_enable_apm(id, RDMA_ALT_PATH_BEST);
				if (rc2)
					sdp_warn(sk, "APM couldn't be enabled for passive side:%d \n", rc2);
			}
		}
		break;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		rc = -ETIMEDOUT;
		break;
	case RDMA_CM_EVENT_UNREACHABLE:
		rc = -ENETUNREACH;
		break;
	case RDMA_CM_EVENT_REJECTED:
		rc = -ECONNREFUSED;
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		sdp_inet_saddr(sk) = sdp_inet_rcv_saddr(sk) =
			((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr;
		rc = sdp_connected_handler(sk);
		break;
	case RDMA_CM_EVENT_DISCONNECTED: /* This means DREQ/DREP received */
		if (sk->sk_state == TCP_LAST_ACK) {
			sdp_cancel_dreq_wait_timeout(sdp_sk(sk));

			sdp_exch_state(sk, TCPF_LAST_ACK, TCP_TIME_WAIT);

			sdp_dbg(sk, "%s: waiting for Infiniband tear down\n",
				__func__);
		}

		sdp_sk(sk)->qp_active = 0;
		rdma_disconnect(id);

		if (sk->sk_state != TCP_TIME_WAIT) {
			if (sk->sk_state == TCP_CLOSE_WAIT) {
				sdp_dbg(sk, "IB teardown while in "
					"TCP_CLOSE_WAIT taking reference to "
					"let close() finish the work\n");
				sock_hold(sk, SOCK_REF_CMA);
				sdp_start_cma_timewait_timeout(sdp_sk(sk),
						SDP_CMA_TIMEWAIT_TIMEOUT);

			}
			sdp_set_error(sk, -EPIPE);
			rc = sdp_disconnected_handler(sk);
		}
		break;
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		rc = sdp_disconnected_handler(sk);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		rc = -ENETRESET;
		break;

	case RDMA_CM_EVENT_ADDR_CHANGE:
		sdp_dbg(sk, "Got Address change event\n");
		rc = 0;
		break;
	default:
		printk(KERN_ERR "SDP: Unexpected CMA event: %d\n",
		       event->event);
		rc = -ECONNABORTED;
		break;
	}

	sdp_dbg(sk, "event: %s handled\n", rdma_cm_event_str(event->event));

	if (rc && sdp_sk(sk)->id == id) {
		child = sk;
		sdp_sk(sk)->id = NULL;
		id->qp = NULL;
		id->context = NULL;
		parent = sdp_sk(sk)->parent;
		sdp_reset_sk(sk, rc);
	}

	release_sock(sk);

	sdp_dbg(sk, "event: %s done. status %d\n",
			rdma_cm_event_str(event->event), rc);

	if (parent) {
		lock_sock(parent);
		if (!sdp_sk(parent)->id) { /* TODO: look at SOCK_DEAD? */
			sdp_dbg(sk, "parent is going away.\n");
			child = NULL;
			goto done;
		}
		if (!list_empty(&sdp_sk(child)->backlog_queue))
			list_del_init(&sdp_sk(child)->backlog_queue);
		else
			child = NULL;
done:
		release_sock(parent);
		if (child)
			sdp_common_release(child);
	}
	return rc;
}
Example #15
0
static int
sdp_post_recv(struct sdp_sock *ssk)
{
	struct sdp_buf *rx_req;
	int i, rc;
	u64 addr;
	struct ib_device *dev;
	struct ib_recv_wr rx_wr = { NULL };
	struct ib_sge ibsge[SDP_MAX_RECV_SGES];
	struct ib_sge *sge = ibsge;
	struct ib_recv_wr *bad_wr;
	struct mbuf *mb, *m;
	struct sdp_bsdh *h;
	int id = ring_head(ssk->rx_ring);

	/* Now, allocate and repost recv */
	mb = m_getm2(NULL, ssk->recv_bytes, M_NOWAIT, MT_DATA, M_PKTHDR);
	if (mb == NULL) {
		/* Retry so we can't stall out with no memory. */
		if (!rx_ring_posted(ssk))
			queue_work(rx_comp_wq, &ssk->rx_comp_work);
		return -1;
	}
	sdp_prf(ssk->socket, mb, "Posting mb");
	for (m = mb; m != NULL; m = m->m_next) {
		m->m_len = (m->m_flags & M_EXT) ? m->m_ext.ext_size :
		    ((m->m_flags & M_PKTHDR) ? MHLEN : MLEN);
		mb->m_pkthdr.len += m->m_len;
	}
	h = mtod(mb, struct sdp_bsdh *);
	rx_req = ssk->rx_ring.buffer + (id & (SDP_RX_SIZE - 1));
	rx_req->mb = mb;
	dev = ssk->ib_device;
	for (i = 0, m = mb; m != NULL; i++, m = m->m_next, sge++) {
		addr = ib_dma_map_single(dev, m->m_data, m->m_len,
		    DMA_FROM_DEVICE);
		/* TODO: proper error handling */
		BUG_ON(ib_dma_mapping_error(dev, addr));
		BUG_ON(i >= SDP_MAX_RECV_SGES);
		rx_req->mapping[i] = addr;
		sge->addr = addr;
		sge->length = m->m_len;
		sge->lkey = ssk->sdp_dev->mr->lkey;
	}

	rx_wr.next = NULL;
	rx_wr.wr_id = id | SDP_OP_RECV;
	rx_wr.sg_list = ibsge;
	rx_wr.num_sge = i;
	rc = ib_post_recv(ssk->qp, &rx_wr, &bad_wr);
	if (unlikely(rc)) {
		sdp_warn(ssk->socket, "ib_post_recv failed. status %d\n", rc);

		sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE);
		m_freem(mb);

		sdp_notify(ssk, ECONNRESET);

		return -1;
	}

	atomic_inc(&ssk->rx_ring.head);
	SDPSTATS_COUNTER_INC(post_recv);

	return 0;
}
Example #16
0
/* socket lock should be taken before calling this */
static int
sdp_process_rx_ctl_mb(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct sdp_bsdh *h;
	struct socket *sk;

	SDP_WLOCK_ASSERT(ssk);
	sk = ssk->socket;
	h = mtod(mb, struct sdp_bsdh *);
	switch (h->mid) {
	case SDP_MID_DATA:
	case SDP_MID_SRCAVAIL:
		sdp_dbg(sk, "DATA after socket rcv was shutdown\n");

		/* got data in RCV_SHUTDOWN */
		if (ssk->state == TCPS_FIN_WAIT_1) {
			sdp_dbg(sk, "RX data when state = FIN_WAIT1\n");
			sdp_notify(ssk, ECONNRESET);
		}
		m_freem(mb);

		break;
#ifdef SDP_ZCOPY
	case SDP_MID_RDMARDCOMPL:
		m_freem(mb);
		break;
	case SDP_MID_SENDSM:
		sdp_handle_sendsm(ssk, ntohl(h->mseq_ack));
		m_freem(mb);
		break;
	case SDP_MID_SRCAVAIL_CANCEL:
		sdp_dbg_data(sk, "Handling SrcAvailCancel\n");
		sdp_prf(sk, NULL, "Handling SrcAvailCancel");
		if (ssk->rx_sa) {
			ssk->srcavail_cancel_mseq = ntohl(h->mseq);
			ssk->rx_sa->flags |= RX_SA_ABORTED;
			ssk->rx_sa = NULL; /* TODO: change it into SDP_MID_DATA and get 
			                      the dirty logic from recvmsg */
		} else {
			sdp_dbg(sk, "Got SrcAvailCancel - "
					"but no SrcAvail in process\n");
		}
		m_freem(mb);
		break;
	case SDP_MID_SINKAVAIL:
		sdp_dbg_data(sk, "Got SinkAvail - not supported: ignored\n");
		sdp_prf(sk, NULL, "Got SinkAvail - not supported: ignored");
		/* FALLTHROUGH */
#endif
	case SDP_MID_ABORT:
		sdp_dbg_data(sk, "Handling ABORT\n");
		sdp_prf(sk, NULL, "Handling ABORT");
		sdp_notify(ssk, ECONNRESET);
		m_freem(mb);
		break;
	case SDP_MID_DISCONN:
		sdp_dbg_data(sk, "Handling DISCONN\n");
		sdp_prf(sk, NULL, "Handling DISCONN");
		sdp_handle_disconn(ssk);
		break;
	case SDP_MID_CHRCVBUF:
		sdp_dbg_data(sk, "Handling RX CHRCVBUF\n");
		sdp_handle_resize_request(ssk, (struct sdp_chrecvbuf *)(h+1));
		m_freem(mb);
		break;
	case SDP_MID_CHRCVBUF_ACK:
		sdp_dbg_data(sk, "Handling RX CHRCVBUF_ACK\n");
		sdp_handle_resize_ack(ssk, (struct sdp_chrecvbuf *)(h+1));
		m_freem(mb);
		break;
	default:
		/* TODO: Handle other messages */
		sdp_warn(sk, "SDP: FIXME MID %d\n", h->mid);
		m_freem(mb);
	}

	return 0;
}