Beispiel #1
0
/*
 * Handle an ADDR_RESOLVED event for the connection stored in id->context.
 *
 * Creates the QP on @id. For every role except FI_VERBS_CM_PASSIVE the QP is
 * recorded in conn->qp[0]; for roles other than PASSIVE and SELF the
 * connection memory is prepared and receives are pre-posted. Finally route
 * resolution is started on @id.
 *
 * Returns FI_SUCCESS, or a negative error code. If a step fails after the QP
 * was created, the QP is destroyed again before returning.
 */
static ssize_t
fi_ibv_rdm_process_addr_resolved(struct rdma_cm_id *id,
				 struct fi_ibv_rdm_ep *ep)
{
	ssize_t ret = FI_SUCCESS;
	struct ibv_qp_init_attr qp_attr;
	struct fi_ibv_rdm_tagged_conn *conn = id->context;

	VERBS_INFO(FI_LOG_AV, "ADDR_RESOLVED conn %p, addr %s:%u\n",
		   conn, inet_ntoa(conn->addr.sin_addr),
		   ntohs(conn->addr.sin_port));

	assert(id->verbs == ep->domain->verbs);

	fi_ibv_rdm_tagged_init_qp_attributes(&qp_attr, ep);
	if (rdma_create_qp(id, ep->domain->pd, &qp_attr)) {
		/* Capture errno before logging, which may overwrite it. */
		const int err = errno;

		VERBS_INFO_ERRNO(FI_LOG_AV,
				 "rdma_create_qp failed\n", err);
		return -err;
	}

	if (conn->cm_role != FI_VERBS_CM_PASSIVE) {
		conn->qp[0] = id->qp;
		assert(conn->id[0] == id);

		/* Self connections skip memory preparation and receive posting. */
		if (conn->cm_role != FI_VERBS_CM_SELF) {
			ret = fi_ibv_rdm_prepare_conn_memory(ep, conn);
			if (ret != FI_SUCCESS)
				goto err;

			ret = fi_ibv_rdm_repost_receives(conn, ep,
							 ep->rq_wr_depth);
			if (ret < 0) {
				VERBS_INFO(FI_LOG_AV, "repost receives failed\n");
				goto err;
			}
			/* A non-negative result is normalised to success. */
			ret = FI_SUCCESS;
		}
	}

	if (rdma_resolve_route(id, FI_IBV_RDM_CM_RESOLVEADDR_TIMEOUT)) {
		VERBS_INFO(FI_LOG_AV, "rdma_resolve_route failed\n");
		ret = -FI_EHOSTUNREACH;
		goto err;
	}

	return ret;
err:
	rdma_destroy_qp(id);
	return ret;
}
Beispiel #2
0
/*
 * Build the fi_info that describes the connection request carried by @event.
 *
 * The template is fab->info when its domain name equals the name of the
 * device the event arrived on, otherwise the entry looked up by device name
 * in fab->all_infos. The duplicate is adjusted with ofi_alter_info() using
 * @pep_info, gets the local/peer addresses of the event's cm id as
 * src_addr/dest_addr, and gets a freshly allocated connreq handle that
 * refers to event->id.
 *
 * Returns the new info, or NULL on lookup or allocation failure.
 */
static struct fi_info *
fi_ibv_eq_cm_getinfo(struct fi_ibv_fabric *fab, struct rdma_cm_event *event,
		struct fi_info *pep_info)
{
	const char *devname = ibv_get_device_name(event->id->verbs->device);
	struct fi_ibv_connreq *connreq;
	struct fi_info *tmpl;
	struct fi_info *info;

	if (strcmp(devname, fab->info->domain_attr->name) == 0) {
		tmpl = fab->info;
	} else {
		tmpl = fi_ibv_get_verbs_info(fab->all_infos, devname);
		if (tmpl == NULL)
			return NULL;
	}

	info = fi_dupinfo(tmpl);
	if (info == NULL)
		return NULL;

	info->fabric_attr->fabric = &fab->util_fabric.fabric_fid;
	info->fabric_attr->prov_name = strdup(VERBS_PROV_NAME);
	if (info->fabric_attr->prov_name == NULL)
		goto err;

	ofi_alter_info(info, pep_info, fab->util_fabric.fabric_fid.api_version);

	/* Local address of the cm id becomes the source address. */
	info->src_addrlen = fi_ibv_sockaddr_len(rdma_get_local_addr(event->id));
	info->src_addr = malloc(info->src_addrlen);
	if (info->src_addr == NULL)
		goto err;
	memcpy(info->src_addr, rdma_get_local_addr(event->id), info->src_addrlen);

	/* Peer address of the cm id becomes the destination address. */
	info->dest_addrlen = fi_ibv_sockaddr_len(rdma_get_peer_addr(event->id));
	info->dest_addr = malloc(info->dest_addrlen);
	if (info->dest_addr == NULL)
		goto err;
	memcpy(info->dest_addr, rdma_get_peer_addr(event->id), info->dest_addrlen);

	/* The log lines interpret both addresses as struct sockaddr_in. */
	VERBS_INFO(FI_LOG_CORE, "src_addr: %s:%d\n",
		   inet_ntoa(((struct sockaddr_in *)info->src_addr)->sin_addr),
		   ntohs(((struct sockaddr_in *)info->src_addr)->sin_port));

	VERBS_INFO(FI_LOG_CORE, "dst_addr: %s:%d\n",
		   inet_ntoa(((struct sockaddr_in *)info->dest_addr)->sin_addr),
		   ntohs(((struct sockaddr_in *)info->dest_addr)->sin_port));

	connreq = calloc(1, sizeof *connreq);
	if (connreq == NULL)
		goto err;

	connreq->handle.fclass = FI_CLASS_CONNREQ;
	connreq->id = event->id;
	info->handle = &connreq->handle;
	return info;

err:
	fi_freeinfo(info);
	return NULL;
}
Beispiel #3
0
/*
 * Validate user @hints against the provider @info.
 *
 * Capabilities and mode bits are checked first; each mismatch is logged and
 * reported as -FI_ENODATA. After that every attribute group present in
 * @hints (fabric, domain, ep, rx, tx - in this order) is checked by its
 * dedicated helper, and the first non-zero helper result is returned.
 *
 * Returns 0 when all checks pass.
 */
static int fi_ibv_check_hints(uint32_t version, const struct fi_info *hints,
		const struct fi_info *info)
{
	uint64_t prov_mode;
	int ret = 0;

	if ((hints->caps & ~(info->caps)) != 0) {
		VERBS_INFO(FI_LOG_CORE, "Unsupported capabilities\n");
		FI_INFO_CHECK(&fi_ibv_prov, info, hints, caps, FI_TYPE_CAPS);
		return -FI_ENODATA;
	}

	prov_mode = ofi_mr_get_prov_mode(version, hints, info);

	/* Every mode bit required by the provider must be set in the hints. */
	if ((prov_mode & ~hints->mode) != 0) {
		VERBS_INFO(FI_LOG_CORE, "needed mode not set\n");
		FI_INFO_MODE(&fi_ibv_prov, prov_mode, hints->mode);
		return -FI_ENODATA;
	}

	/* Each later check runs only while no earlier one has failed. */
	if (hints->fabric_attr)
		ret = ofi_check_fabric_attr(&fi_ibv_prov, info->fabric_attr,
					    hints->fabric_attr);

	if (!ret && hints->domain_attr)
		ret = ofi_check_domain_attr(&fi_ibv_prov, version,
					    info->domain_attr,
					    hints->domain_attr);

	if (!ret && hints->ep_attr)
		ret = fi_ibv_check_ep_attr(hints->ep_attr, info);

	if (!ret && hints->rx_attr)
		ret = fi_ibv_check_rx_attr(hints->rx_attr, hints, info);

	if (!ret && hints->tx_attr)
		ret = fi_ibv_check_tx_attr(hints->tx_attr, hints, info);

	return ret;
}
int fi_ibv_open_ep(struct fid_domain *domain, struct fi_info *info,
		   struct fid_ep **ep, void *context)
{
	struct fi_ibv_domain *dom;
	struct fi_ibv_msg_ep *_ep;
	struct fi_ibv_connreq *connreq;
	struct fi_ibv_pep *pep;
	struct fi_info *fi;
	int ret;

	dom = container_of(domain, struct fi_ibv_domain, domain_fid);
	if (strcmp(dom->verbs->device->name, info->domain_attr->name)) {
		VERBS_INFO(FI_LOG_DOMAIN, "Invalid info->domain_attr->name\n");
		return -FI_EINVAL;
	}

	fi = fi_ibv_get_verbs_info(info->domain_attr->name);
	if (!fi) {
		VERBS_INFO(FI_LOG_DOMAIN, "Unable to find matching verbs_info\n");
		return -FI_EINVAL;
	}

	if (info->ep_attr) {
		ret = fi_ibv_check_ep_attr(info->ep_attr, fi);
		if (ret)
			return ret;
	}

	if (info->tx_attr) {
		ret = fi_ibv_check_tx_attr(info->tx_attr, info, fi);
		if (ret)
			return ret;
	}

	if (info->rx_attr) {
		ret = fi_ibv_check_rx_attr(info->rx_attr, info, fi);
		if (ret)
			return ret;
	}

	_ep = fi_ibv_alloc_msg_ep(info);
	if (!_ep)
		return -FI_ENOMEM;

	if (!info->handle) {
		ret = fi_ibv_create_ep(NULL, NULL, 0, info, NULL, &_ep->id);
		if (ret)
			goto err;
	} else if (info->handle->fclass == FI_CLASS_CONNREQ) {
		connreq = container_of(info->handle, struct fi_ibv_connreq, handle);
		_ep->id = connreq->id;
        } else if (info->handle->fclass == FI_CLASS_PEP) {
Beispiel #5
0
/*
 * Handle a REJECTED event.
 *
 * A reject is expected when its private data starts with the 0xdeadbeef
 * marker, or (workaround, see TODO below) when the connection is PASSIVE and
 * the event status is -ECONNREFUSED. In that case the QP and the cm id of
 * the event are destroyed. Any other reject is logged and the connection is
 * marked FI_VERBS_CONN_REJECTED.
 *
 * Returns FI_SUCCESS, or the negated errno of the first failing destroy call.
 */
static ssize_t
fi_ibv_rdm_process_event_rejected(struct fi_ibv_rdm_ep *ep,
				  struct rdma_cm_event *event)
{
	struct fi_ibv_rdm_conn *conn = event->id->context;
	ssize_t ret = FI_SUCCESS;
	int magic = 0;
	int err;

	/*
	 * Read the marker only if enough private data is present, and via
	 * memcpy so that no alignment/aliasing assumption is made about the
	 * event buffer. Without data the marker stays 0.
	 */
	if (event->param.conn.private_data &&
	    event->param.conn.private_data_len >= sizeof(magic))
		memcpy(&magic, event->param.conn.private_data, sizeof(magic));

	if (((unsigned int)magic == 0xdeadbeef) ||
	    /* 
	     * TODO: this is a workaround of the case when private_data is not
	     * arriving from rdma_reject call on iWarp devices
	     */
	    (conn->cm_role == FI_VERBS_CM_PASSIVE &&
	     event->status == -ECONNREFUSED))
	{
		/* rdma_destroy_qp returns void; failure is detected via errno. */
		errno = 0;
		rdma_destroy_qp(event->id);
		err = errno;
		if (err) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_qp failed\n",
					 err);
			ret = -err;
		}
		if (rdma_destroy_id(event->id)) {
			/* Capture errno before logging, which may overwrite it. */
			err = errno;
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id failed\n",
					 err);
			if (ret == FI_SUCCESS)
				ret = -err;
		}
		VERBS_INFO(FI_LOG_AV,
			"Rejected from conn %p, addr %s:%u, cm_role %d, status %d\n",
			conn, inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port),
			conn->cm_role,
			event->status);
	} else {
		VERBS_INFO(FI_LOG_AV,
			"Unexpected REJECT from conn %p, addr %s:%u, cm_role %d, "
			"msg len %d, msg %x, status %d, err %d\n",
			conn, inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port),
			conn->cm_role,
			event->param.conn.private_data_len,
			magic,
			event->status, errno);
		conn->state = FI_VERBS_CONN_REJECTED;
	}
	return ret;
}
Beispiel #6
0
/*
 * Handle a ROUTE_RESOLVED event: pack the connection parameters and issue
 * rdma_connect() on the event's cm id.
 *
 * The private data buffer referenced by cm_params is owned by this function
 * (the original failure path already freed it). rdma_connect() copies the
 * private data, so the buffer is released on success as well as on failure.
 *
 * Returns FI_SUCCESS, or -errno of rdma_connect(). A failure additionally
 * trips assert(0) in debug builds.
 */
static ssize_t
fi_ibv_rdm_process_route_resolved(struct rdma_cm_event *event,
				  struct fi_ibv_rdm_ep *ep)
{
	struct fi_ibv_rdm_tagged_conn *conn = event->id->context;
	struct rdma_conn_param cm_params;
	ssize_t ret = FI_SUCCESS;
	int failed;
	int err;

	fi_ibv_rdm_pack_cm_params(&cm_params, conn, ep);

	VERBS_INFO(FI_LOG_AV,
		"ROUTE RESOLVED, conn %p, addr %s:%u\n", conn,
		inet_ntoa(conn->addr.sin_addr), ntohs(conn->addr.sin_port));

	failed = (rdma_connect(event->id, &cm_params) != 0);
	/* Capture errno before free() or logging can overwrite it. */
	err = failed ? errno : 0;

	free((void *)cm_params.private_data);

	if (failed) {
		VERBS_INFO_ERRNO(FI_LOG_AV,
				 "rdma_connect failed\n", err);
		ret = -err;
		assert(0);
	}

	return ret;
}
Beispiel #7
0
/*
 * Handle an ESTABLISHED event for the connection stored in the cm id context.
 *
 * Unless the connection is a self connection it must be in the STARTED
 * state; otherwise -FI_ECONNABORTED is returned (and an assert fires in debug
 * builds). For ACTIVE and SELF roles the connection parameters of the event
 * are unpacked. The connection is then marked ESTABLISHED and counted in
 * ep->num_active_conns, unless it already was ESTABLISHED.
 */
static ssize_t
fi_ibv_rdm_process_event_established(struct rdma_cm_event *event,
				     struct fi_ibv_rdm_ep *ep)
{
	struct fi_ibv_rdm_tagged_conn *conn = event->id->context;
	const int is_self = (conn->cm_role == FI_VERBS_CM_SELF);

	if (!is_self && conn->state != FI_VERBS_CONN_STARTED) {
		VERBS_INFO(FI_LOG_AV, "state = %d, conn %p", conn->state, conn);
		assert(0 && "Wrong state");
		return -FI_ECONNABORTED;
	}

	if (is_self || conn->cm_role == FI_VERBS_CM_ACTIVE)
		fi_ibv_rdm_unpack_cm_params(&event->param.conn, conn, ep);

	FI_INFO(&fi_ibv_prov, FI_LOG_AV, "CONN ESTABLISHED, conn %p, addr %s:%u\n",
		conn, inet_ntoa(conn->addr.sin_addr),
		ntohs(conn->addr.sin_port));

	/* Do not count self twice */
	if (conn->state == FI_VERBS_CONN_ESTABLISHED)
		return FI_SUCCESS;

	ep->num_active_conns++;
	conn->state = FI_VERBS_CONN_ESTABLISHED;
	return FI_SUCCESS;
}
/*
 * Translate libfabric hints into an rdma_addrinfo.
 *
 * @rai is zeroed first. FI_SOURCE / FI_NUMERICHOST in @flags map to
 * RAI_PASSIVE / RAI_NUMERICHOST; QP type and port space are always RC / TCP.
 * When @fi is given, its address format selects the address family (for
 * FI_SOCKADDR the family is read from the source or destination address),
 * and its source/destination addresses are copied into freshly malloc'ed
 * buffers stored in @rai.
 *
 * Returns 0 on success or -FI_ENOMEM if an address copy cannot be allocated.
 */
int fi_ibv_fi_to_rai(const struct fi_info *fi, uint64_t flags,
		     struct rdma_addrinfo *rai)
{
	int family = 0;
	int family_known = 0;

	memset(rai, 0, sizeof *rai);
	if (flags & FI_SOURCE)
		rai->ai_flags = RAI_PASSIVE;
	if (flags & FI_NUMERICHOST)
		rai->ai_flags |= RAI_NUMERICHOST;

	rai->ai_qp_type = IBV_QPT_RC;
	rai->ai_port_space = RDMA_PS_TCP;

	if (!fi)
		return 0;

	switch (fi->addr_format) {
	case FI_SOCKADDR_IN:
		family = AF_INET;
		family_known = 1;
		break;
	case FI_SOCKADDR_IN6:
		family = AF_INET6;
		family_known = 1;
		break;
	case FI_SOCKADDR_IB:
		family = AF_IB;
		family_known = 1;
		break;
	case FI_SOCKADDR:
		/* Generic sockaddr: take the family from whichever address exists. */
		if (fi->src_addrlen) {
			family = ((struct sockaddr *)fi->src_addr)->sa_family;
			family_known = 1;
		} else if (fi->dest_addrlen) {
			family = ((struct sockaddr *)fi->dest_addr)->sa_family;
			family_known = 1;
		}
		break;
	case FI_FORMAT_UNSPEC:
		break;
	default:
		VERBS_INFO(FI_LOG_FABRIC, "Unknown fi->addr_format\n");
		break;
	}

	if (family_known) {
		rai->ai_family = family;
		rai->ai_flags |= RAI_FAMILY;
	}

	if (fi->src_addrlen) {
		rai->ai_src_addr = malloc(fi->src_addrlen);
		if (!rai->ai_src_addr)
			return -FI_ENOMEM;
		memcpy(rai->ai_src_addr, fi->src_addr, fi->src_addrlen);
		rai->ai_src_len = fi->src_addrlen;
	}

	if (fi->dest_addrlen) {
		rai->ai_dst_addr = malloc(fi->dest_addrlen);
		if (!rai->ai_dst_addr)
			return -FI_ENOMEM;
		memcpy(rai->ai_dst_addr, fi->dest_addr, fi->dest_addrlen);
		rai->ai_dst_len = fi->dest_addrlen;
	}

	return 0;
}
Beispiel #9
0
/*
 * Bind the connection manager listener of @cm to the address of @ep.
 *
 * If the endpoint info carries a source address it is copied into
 * ep->my_addr. The first call (cm->is_bound == 0) binds the listener to that
 * address and starts listening; later calls skip this step. If the endpoint
 * address has no port yet, the listener's source port is used.
 *
 * Returns FI_SUCCESS, or -FI_EOTHER if binding or listening fails.
 */
int fi_ibv_rdm_cm_bind_ep(struct fi_ibv_rdm_cm *cm, struct fi_ibv_rdm_ep *ep)
{
	char my_ipoib_addr_str[INET6_ADDRSTRLEN];
	const char *printed = NULL;

	assert(cm->ec && cm->listener);

	if (ep->info->src_addr) {
		memcpy(&ep->my_addr, ep->info->src_addr, sizeof(ep->my_addr));

		printed = inet_ntop(ep->my_addr.sin_family,
				    &ep->my_addr.sin_addr.s_addr,
				    my_ipoib_addr_str, INET_ADDRSTRLEN);
	}

	/*
	 * Without a source address, or when inet_ntop() fails (it leaves the
	 * buffer unspecified in that case), fall back to a fixed string so the
	 * log statements below never read an uninitialised buffer.
	 */
	if (!printed)
		strcpy(my_ipoib_addr_str, "undefined");

	VERBS_INFO(FI_LOG_EP_CTRL, "My IPoIB: %s\n", my_ipoib_addr_str);

	if (!cm->is_bound) {
		errno = 0;
		if (rdma_bind_addr(cm->listener, (struct sockaddr *)&ep->my_addr)) {
			VERBS_INFO(FI_LOG_EP_CTRL,
				"Failed to bind cm listener to my IPoIB addr %s: %s\n",
				my_ipoib_addr_str, strerror(errno));
			return -FI_EOTHER;
		}
		if (rdma_listen(cm->listener, 1024)) {
			VERBS_INFO(FI_LOG_EP_CTRL, "rdma_listen failed: %s\n",
				strerror(errno));
			return -FI_EOTHER;
		}
		cm->is_bound = 1;
	}

	/* No explicit port requested: adopt the port the listener is using. */
	if (!ep->my_addr.sin_port) {
		ep->my_addr.sin_port = rdma_get_src_port(cm->listener);
	}
	assert(ep->my_addr.sin_family == AF_INET);

	VERBS_INFO(FI_LOG_EP_CTRL, "My ep_addr: %s:%u\n",
		inet_ntoa(ep->my_addr.sin_addr), ntohs(ep->my_addr.sin_port));

	return FI_SUCCESS;
}
Beispiel #10
0
/*
 * Handle a REJECTED event.
 *
 * A reject whose private data starts with the 0xdeadbeef marker is expected
 * (the connection is asserted to be PASSIVE): the QP and the cm id of the
 * event are destroyed. Any other reject is logged and the connection is
 * marked FI_VERBS_CONN_REJECTED.
 *
 * Returns FI_SUCCESS, or the negated errno of the first failing destroy call.
 */
static ssize_t
fi_ibv_rdm_process_event_rejected(struct fi_ibv_rdm_ep *ep,
				  struct rdma_cm_event *event)
{
	struct fi_ibv_rdm_tagged_conn *conn = event->id->context;
	ssize_t ret = FI_SUCCESS;
	int magic = 0;
	int err;

	/*
	 * Read the marker only if enough private data is present, and via
	 * memcpy so that no alignment/aliasing assumption is made about the
	 * event buffer. Without data the marker stays 0.
	 */
	if (NULL != event->param.conn.private_data &&
	    event->param.conn.private_data_len >= sizeof(magic))
		memcpy(&magic, event->param.conn.private_data, sizeof(magic));

	if ((unsigned int)magic == 0xdeadbeef) {
		assert(conn->cm_role == FI_VERBS_CM_PASSIVE);
		/* rdma_destroy_qp returns void; failure is detected via errno. */
		errno = 0;
		rdma_destroy_qp(event->id);
		err = errno;
		if (err) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_qp failed\n",
					 err);
			ret = -err;
		}
		if (rdma_destroy_id(event->id)) {
			/* Capture errno before logging, which may overwrite it. */
			err = errno;
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id failed\n",
					 err);
			if (ret == FI_SUCCESS)
				ret = -err;
		}
		VERBS_INFO(FI_LOG_AV,
			"Rejected from conn %p, addr %s:%u, cm_role %d, status %d\n",
			conn, inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port),
			conn->cm_role,
			event->status);
	} else {
		VERBS_INFO(FI_LOG_AV,
			"Unexpected REJECT from conn %p, addr %s:%u, cm_role %d, msg len %d, msg %x, status %d\n",
			conn, inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port),
			conn->cm_role,
			event->param.conn.private_data_len,
			magic,
			event->status);
		conn->state = FI_VERBS_CONN_REJECTED;
	}
	return ret;
}
Beispiel #11
0
int fi_ibv_check_tx_attr(const struct fi_tx_attr *attr,
			 const struct fi_info *hints, const struct fi_info *info)
{
	if (attr->caps & ~(info->tx_attr->caps)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->caps not supported\n");
		FI_INFO_CHECK(&fi_ibv_prov, (info->tx_attr), attr, caps, FI_TYPE_CAPS);
		return -FI_ENODATA;
	}

	if (((attr->mode ? attr->mode : hints->mode) &
	     info->tx_attr->mode) != info->tx_attr->mode) {
		size_t user_mode = (attr->mode ? attr->mode : hints->mode);
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->mode not supported\n");
		FI_INFO_MODE(&fi_ibv_prov, info->tx_attr->mode, user_mode);
		return -FI_ENODATA;
	}

	if (attr->op_flags & ~(info->tx_attr->op_flags)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->op_flags not supported\n");
		return -FI_ENODATA;
	}

	if (attr->msg_order & ~(info->tx_attr->msg_order)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->msg_order not supported\n");
		return -FI_ENODATA;
	}

	if (attr->size > info->tx_attr->size) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->size is greater than supported\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, (info->tx_attr), attr, size);
		return -FI_ENODATA;
	}

	if (attr->iov_limit > info->tx_attr->iov_limit) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->iov_limit greater than supported\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, (info->tx_attr), attr,
				  iov_limit);
		return -FI_ENODATA;
	}

	if (attr->rma_iov_limit > info->tx_attr->rma_iov_limit) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->rma_iov_limit greater than supported\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, (info->tx_attr), attr,
				  rma_iov_limit);
		return -FI_ENODATA;
	}

	return 0;
}
Beispiel #12
0
/*
 * Dump the connection state of an XRC endpoint to the info log.
 *
 * Does nothing unless FI_LOG_INFO is enabled for FI_LOG_FABRIC. Otherwise it
 * logs @desc, the cm ids and SRQ numbers, the local and peer address of the
 * base cm id (when available), and the QP numbers of whichever initiator,
 * target and reserved QPs are present.
 */
void fi_ibv_log_ep_conn(struct fi_ibv_xrc_ep *ep, char *desc)
{
	char buf[OFI_ADDRSTRLEN];
	struct sockaddr *addr;
	size_t len;

	if (!fi_log_enabled(&fi_ibv_prov, FI_LOG_INFO, FI_LOG_FABRIC))
		return;

	VERBS_INFO(FI_LOG_FABRIC, "EP %p, %s\n", ep, desc);
	VERBS_INFO(FI_LOG_FABRIC,
		  "EP %p, CM ID %p, TGT CM ID %p, SRQN %d Peer SRQN %d\n",
		  ep, ep->base_ep.id, ep->tgt_id, ep->srqn, ep->peer_srqn);

	assert(ep->base_ep.id);

	addr = rdma_get_local_addr(ep->base_ep.id);
	if (addr) {
		/* ofi_straddr takes the buffer size through len. */
		len = sizeof(buf);
		ofi_straddr(buf, &len, ep->base_ep.info->addr_format, addr);
		VERBS_INFO(FI_LOG_FABRIC, "EP %p src_addr: %s\n", ep, buf);
	}

	addr = rdma_get_peer_addr(ep->base_ep.id);
	if (addr) {
		len = sizeof(buf);
		ofi_straddr(buf, &len, ep->base_ep.info->addr_format, addr);
		VERBS_INFO(FI_LOG_FABRIC, "EP %p dst_addr: %s\n", ep, buf);
	}

	if (ep->base_ep.ibv_qp) {
		VERBS_INFO(FI_LOG_FABRIC, "EP %p, INI QP Num %d\n",
			  ep, ep->base_ep.ibv_qp->qp_num);
		VERBS_INFO(FI_LOG_FABRIC, "EP %p, Remote TGT QP Num %d\n", ep,
			  ep->ini_conn->tgt_qpn);
	}

	if (ep->tgt_ibv_qp) {
		VERBS_INFO(FI_LOG_FABRIC, "EP %p, TGT QP Num %d\n",
			  ep, ep->tgt_ibv_qp->qp_num);
	}

	/* Reserved QP numbers exist only while connection setup data does. */
	if (ep->conn_setup) {
		if (ep->conn_setup->rsvd_ini_qpn) {
			VERBS_INFO(FI_LOG_FABRIC, "EP %p, Reserved INI QPN %d\n",
				  ep, ep->conn_setup->rsvd_ini_qpn->qp_num);
		}
		if (ep->conn_setup->rsvd_tgt_qpn) {
			VERBS_INFO(FI_LOG_FABRIC, "EP %p, Reserved TGT QPN %d\n",
				  ep, ep->conn_setup->rsvd_tgt_qpn->qp_num);
		}
	}
}
Beispiel #13
0
/*
 * Build the global verbs_info list once.
 *
 * verbs_info is checked once without the lock and again after taking
 * verbs_info_lock; if it is already set the function returns 0. Otherwise
 * every RDMA device gets an info entry for the MSG domain and, when that
 * succeeds, a second one for the RDM domain; the entries are chained through
 * ->next.
 *
 * Returns 0 if at least one info entry exists afterwards, else a negative
 * error code (-FI_ENODATA when there are no devices).
 */
int fi_ibv_init_info(void)
{
	struct ibv_context **ctx_list;
	struct fi_info *fi = NULL, *tail = NULL;
	int ret = 0, i, num_devices = 0;

	if (verbs_info)
		return 0;

	pthread_mutex_lock(&verbs_info_lock);
	if (verbs_info)
		goto unlock;

	if (!fi_ibv_have_device()) {
		VERBS_INFO(FI_LOG_FABRIC, "No RDMA devices found\n");
		ret = -FI_ENODATA;
		goto unlock;
	}

	ctx_list = rdma_get_devices(&num_devices);
	if (!ctx_list || !num_devices) {
		/* Capture errno before logging, which may overwrite it. */
		const int err = errno;

		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_get_devices", err);
		/* Never report success when no device list could be used. */
		ret = err ? -err : -FI_ENODATA;
		/* An empty but valid list must still be released. */
		if (ctx_list)
			rdma_free_devices(ctx_list);
		goto unlock;
	}

	for (i = 0; i < num_devices; i++) {
		ret = fi_ibv_alloc_info(ctx_list[i], &fi, &verbs_msg_domain);
		if (!ret) {
			if (!verbs_info)
				verbs_info = fi;
			else
				tail->next = fi;
			tail = fi;

			/* The RDM entry is only attempted after the MSG one. */
			ret = fi_ibv_alloc_info(ctx_list[i], &fi,
						&verbs_rdm_domain);
			if (!ret) {
				tail->next = fi;
				tail = fi;
			}
		}
	}

	/* One usable entry is enough; otherwise report the last error. */
	ret = verbs_info ? 0 : ret;

	rdma_free_devices(ctx_list);
unlock:
	pthread_mutex_unlock(&verbs_info_lock);
	return ret;
}
Beispiel #14
0
/*
 * Set up the connection manager @cm: create a non-blocking event channel and
 * a listener cm id, determine the local IPoIB address, and bind the listener
 * to it. The port comes from rai->ai_src_addr; if it is zero, the port the
 * listener was bound to is stored instead.
 *
 * Returns FI_SUCCESS, -FI_ENODEV if no IPoIB address is found, or -FI_EOTHER
 * for any other failure. On failure the listener and event channel created
 * here are destroyed and the corresponding cm fields are reset to NULL.
 */
static int fi_ibv_rdm_cm_init(struct fi_ibv_rdm_cm* cm,
			      const struct rdma_addrinfo* rai)
{
	struct sockaddr_in* src_addr = (struct sockaddr_in*)rai->ai_src_addr;
	char my_ipoib_addr_str[INET6_ADDRSTRLEN];
	int ret;

	cm->ec = rdma_create_event_channel();

	if (!cm->ec) {
		VERBS_INFO(FI_LOG_EP_CTRL,
			"Failed to create listener event channel: %s\n",
			strerror(errno));
		return -FI_EOTHER;
	}

	if (fi_fd_nonblock(cm->ec->fd) != 0) {
		VERBS_INFO_ERRNO(FI_LOG_EP_CTRL, "fcntl", errno);
		ret = -FI_EOTHER;
		goto err_channel;
	}

	if (rdma_create_id(cm->ec, &cm->listener, NULL, RDMA_PS_TCP)) {
		VERBS_INFO(FI_LOG_EP_CTRL, "Failed to create cm listener: %s\n",
			     strerror(errno));
		ret = -FI_EOTHER;
		goto err_channel;
	}

	if (fi_ibv_rdm_find_ipoib_addr(src_addr, cm)) {
		VERBS_INFO(FI_LOG_EP_CTRL, 
			   "Failed to find correct IPoIB address\n");
		ret = -FI_ENODEV;
		goto err_id;
	}

	cm->my_addr.sin_port = src_addr->sin_port;

	/*
	 * inet_ntop() leaves the buffer unspecified on failure; use a fixed
	 * string then so the log statements never read uninitialised memory.
	 */
	if (!inet_ntop(cm->my_addr.sin_family,
		       &cm->my_addr.sin_addr.s_addr,
		       my_ipoib_addr_str, INET_ADDRSTRLEN))
		strcpy(my_ipoib_addr_str, "undefined");

	VERBS_INFO(FI_LOG_EP_CTRL, "My IPoIB: %s\n", my_ipoib_addr_str);

	if (rdma_bind_addr(cm->listener, (struct sockaddr *)&cm->my_addr)) {
		VERBS_INFO(FI_LOG_EP_CTRL,
			"Failed to bind cm listener to my IPoIB addr %s: %s\n",
			my_ipoib_addr_str, strerror(errno));
		ret = -FI_EOTHER;
		goto err_id;
	}

	/* No explicit port requested: adopt the port the listener is using. */
	if (!cm->my_addr.sin_port) {
		cm->my_addr.sin_port = rdma_get_src_port(cm->listener);
	}
	assert(cm->my_addr.sin_family == AF_INET);

	VERBS_INFO(FI_LOG_EP_CTRL, "My ep_addr: %s:%u\n",
		inet_ntoa(cm->my_addr.sin_addr), ntohs(cm->my_addr.sin_port));

	return FI_SUCCESS;

err_id:
	/* The id must be destroyed before the channel it was created on. */
	rdma_destroy_id(cm->listener);
	cm->listener = NULL;
err_channel:
	rdma_destroy_event_channel(cm->ec);
	cm->ec = NULL;
	return ret;
}
Beispiel #15
0
/*
 * Dispatch one rdma-cm event to its handler.
 *
 * Events with a dedicated handler return that handler's result;
 * TIMEWAIT_EXIT is ignored with FI_SUCCESS. Every other event is logged as
 * unexpected and mapped to an error code:
 *   ADDR_ERROR, UNREACHABLE -> -FI_EADDRNOTAVAIL
 *   ROUTE_ERROR             -> -FI_EHOSTUNREACH
 *   CONNECT_ERROR           -> -FI_ECONNREFUSED
 *   anything else           -> -FI_ECONNABORTED
 */
static ssize_t
fi_ibv_rdm_process_event(struct rdma_cm_event *event, struct fi_ibv_rdm_ep *ep)
{
	ssize_t err;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		return fi_ibv_rdm_process_addr_resolved(event->id, ep);
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		return fi_ibv_rdm_process_route_resolved(event, ep);
	case RDMA_CM_EVENT_ESTABLISHED:
		return fi_ibv_rdm_process_event_established(event, ep);
	case RDMA_CM_EVENT_DISCONNECTED:
		return fi_ibv_rdm_process_event_disconnected(ep, event);
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		return fi_ibv_rdm_process_connect_request(event, ep);
	case RDMA_CM_EVENT_REJECTED:
		return fi_ibv_rdm_process_event_rejected(ep, event);
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		return FI_SUCCESS;

	/* Everything below is unexpected; only the error code differs. */
	case RDMA_CM_EVENT_ADDR_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
		err = -FI_EADDRNOTAVAIL;
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		err = -FI_EHOSTUNREACH;
		break;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		err = -FI_ECONNREFUSED;
		break;
	default:
		err = -FI_ECONNABORTED;
		break;
	}

	VERBS_INFO(FI_LOG_AV, "got unexpected rdmacm event, %s\n",
		   rdma_event_str(event->event));
	return err;
}
Beispiel #16
0
static inline ssize_t
fi_ibv_rdm_batch_repost_receives(struct fi_ibv_rdm_tagged_conn *conn,
				 struct fi_ibv_rdm_ep *ep, int num_to_post)
{
	const size_t idx = (conn->cm_role == FI_VERBS_CM_SELF) ? 1 : 0;
	struct ibv_recv_wr *bad_wr = NULL;
	struct ibv_recv_wr wr[num_to_post];
	struct ibv_sge sge[num_to_post];
	int last = num_to_post - 1;
	int i;

	/* IBV_WR_SEND opcode specific */
	assert((num_to_post % ep->n_buffs) == 0);

	assert(ep->topcode == IBV_WR_SEND ||
	       ep->topcode == IBV_WR_RDMA_WRITE_WITH_IMM);

	if (ep->topcode == IBV_WR_SEND) {
		for (i = 0; i < num_to_post; i++) {
			sge[i].addr = (uint64_t)(void *)
			fi_ibv_rdm_get_rbuf(conn, ep, i % ep->n_buffs);
			sge[i].length = FI_IBV_RDM_DFLT_BUFFER_SIZE;
			sge[i].lkey = conn->r_mr->lkey;
		}
	}

	for (i = 0; i < num_to_post; i++) {
		wr[i].wr_id = (uintptr_t) conn;
		wr[i].next = &wr[i + 1];
		wr[i].sg_list = &sge[i];
		wr[i].num_sge = 1;
	}
	wr[last].next = NULL;

	if (ibv_post_recv(conn->qp[idx], wr, &bad_wr) == 0) {
		conn->recv_preposted += num_to_post;
		return num_to_post;
	}

	VERBS_INFO(FI_LOG_EP_DATA, "Failed to post recv\n");
	return -FI_ENOMEM;
}
Beispiel #17
0
/*
 * Handle a DISCONNECTED event.
 *
 * The active connection counter of @ep is decremented. An ESTABLISHED
 * connection becomes REMOTE_DISCONNECT; any other connection is asserted to
 * be in LOCAL_DISCONNECT, becomes CLOSED and is cleaned up.
 *
 * Returns FI_SUCCESS, or the result of fi_ibv_rdm_conn_cleanup() when the
 * connection was closed.
 */
static ssize_t
fi_ibv_rdm_process_event_disconnected(struct fi_ibv_rdm_ep *ep,
				      struct rdma_cm_event *event)
{
	struct fi_ibv_rdm_tagged_conn *conn = event->id->context;
	int closing;

	ep->num_active_conns--;

	/* Only a disconnect we did not start leaves the connection open. */
	closing = (conn->state != FI_VERBS_CONN_ESTABLISHED);
	if (closing) {
		assert(conn->state == FI_VERBS_CONN_LOCAL_DISCONNECT);
		conn->state = FI_VERBS_CONN_CLOSED;
	} else {
		conn->state = FI_VERBS_CONN_REMOTE_DISCONNECT;
	}

	VERBS_INFO(FI_LOG_AV,
		   "Disconnected from conn %p, addr %s:%u\n",
		   conn, inet_ntoa(conn->addr.sin_addr),
		   ntohs(conn->addr.sin_port));

	if (closing)
		return fi_ibv_rdm_conn_cleanup(conn);

	return FI_SUCCESS;
}
Beispiel #18
0
/*
 * Builds a list of interfaces that correspond to active verbs devices.
 *
 * Walks all interfaces returned by getifaddrs(), skipping those that have no
 * address, are down, or are named "lo". If the "iface" provider parameter is
 * set, only interfaces matching it are used: exactly when some interface
 * carries that exact name, otherwise by prefix. For each remaining IPv4/IPv6
 * address an rdma endpoint is created; addresses for which that succeeds are
 * appended to @verbs_devs via fi_ibv_add_rai().
 *
 * Returns 0 if at least one interface was added, -FI_ENODATA if none was, or
 * another non-zero error code on failure. The interface list is released on
 * every path once it has been obtained.
 */
static int fi_ibv_getifaddrs(struct dlist_entry *verbs_devs)
{
	struct ifaddrs *ifaddr, *ifa;
	char name[INET6_ADDRSTRLEN];
	struct rdma_addrinfo *rai;
	struct rdma_cm_id *id;
	const char *ret_ptr;
	int ret, num_verbs_ifs = 0;

	char *iface = NULL;
	size_t iface_len = 0;
	int exact_match = 0;

	ret = getifaddrs(&ifaddr);
	if (ret) {
	       VERBS_WARN(FI_LOG_FABRIC,
			  "Unable to get interface addresses\n");
		return ret;
	}

	/* select best iface name based on user's input */
	if (fi_param_get_str(&fi_ibv_prov, "iface", &iface) == FI_SUCCESS) {
		iface_len = strlen(iface);
		if (iface_len > IFNAMSIZ) {
			VERBS_INFO(FI_LOG_EP_CTRL,
				   "Too long iface name: %s, max: %d\n",
				   iface, IFNAMSIZ);
			/* The list was already obtained and must not leak. */
			freeifaddrs(ifaddr);
			return -FI_EINVAL;
		}
		for (ifa = ifaddr; ifa && !exact_match; ifa = ifa->ifa_next)
			exact_match = !strcmp(ifa->ifa_name, iface);
	}

	for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) {
		if (!ifa->ifa_addr || !(ifa->ifa_flags & IFF_UP) ||
				!strcmp(ifa->ifa_name, "lo"))
			continue;

		if(iface) {
			if(exact_match) {
				if(strcmp(ifa->ifa_name, iface))
					continue;
			} else {
				if(strncmp(ifa->ifa_name, iface, iface_len))
					continue;
			}
		}

		switch (ifa->ifa_addr->sa_family) {
		case AF_INET:
			ret_ptr = inet_ntop(AF_INET, &ofi_sin_addr(ifa->ifa_addr),
				name, INET6_ADDRSTRLEN);
			break;
		case AF_INET6:
			ret_ptr = inet_ntop(AF_INET6, &ofi_sin6_addr(ifa->ifa_addr),
				name, INET6_ADDRSTRLEN);
			break;
		default:
			continue;
		}
		if (!ret_ptr) {
			/*
			 * Record the failure before jumping out; otherwise the
			 * function would return 0 or a stale code from an
			 * earlier iteration.
			 */
			ret = -errno;
			VERBS_WARN(FI_LOG_FABRIC,
				   "inet_ntop failed: %s(%d)\n",
				   strerror(errno), errno);
			goto err1;
		}

		/* Interfaces that cannot back an rdma endpoint are skipped. */
		ret = fi_ibv_create_ep(name, NULL, FI_NUMERICHOST | FI_SOURCE,
				NULL, &rai, &id);
		if (ret)
			continue;

		ret = fi_ibv_add_rai(verbs_devs, id, rai);
		if (ret)
			goto err2;

		VERBS_DBG(FI_LOG_FABRIC, "Found active interface for verbs device: "
			  "%s with address: %s\n",
			  ibv_get_device_name(id->verbs->device), name);

		rdma_destroy_ep(id);

		num_verbs_ifs++;
	}
	freeifaddrs(ifaddr);
	return num_verbs_ifs ? 0 : -FI_ENODATA;
err2:
	rdma_destroy_ep(id);
err1:
	fi_ibv_verbs_devs_free(verbs_devs);
	freeifaddrs(ifaddr);
	return ret;
}
Beispiel #19
0
/*
 * Open an event queue on @fabric.
 *
 * Allocates the EQ, initialises its lock and dlistfd list head, creates an
 * epoll instance and registers the list's read signal fd with it. For the
 * wait objects FI_WAIT_NONE, FI_WAIT_UNSPEC and FI_WAIT_FD a non-blocking
 * rdma event channel is created and added to the same epoll set; any other
 * wait object yields -FI_ENOSYS.
 *
 * On success *eq receives the new EQ and 0 is returned. On failure
 * everything created so far is released and a negative error code is
 * returned.
 */
int fi_ibv_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr,
		   struct fid_eq **eq, void *context)
{
	struct fi_ibv_eq *_eq;
	struct epoll_event event;
	int ret;

	_eq = calloc(1, sizeof *_eq);
	if (!_eq)
		return -ENOMEM;

	_eq->fab = container_of(fabric, struct fi_ibv_fabric,
				util_fabric.fabric_fid);

	fastlock_init(&_eq->lock);
	ret = dlistfd_head_init(&_eq->list_head);
	if (ret) {
		VERBS_INFO(FI_LOG_EQ, "Unable to initialize dlistfd\n");
		goto err1;
	}

	_eq->epfd = epoll_create1(0);
	if (_eq->epfd < 0) {
		ret = -errno;
		goto err2;
	}

	memset(&event, 0, sizeof(event));
	event.events = EPOLLIN;

	if (epoll_ctl(_eq->epfd, EPOLL_CTL_ADD,
		      _eq->list_head.signal.fd[FI_READ_FD], &event)) {
		ret = -errno;
		goto err3;
	}

	switch (attr->wait_obj) {
	case FI_WAIT_NONE:
	case FI_WAIT_UNSPEC:
	case FI_WAIT_FD:
		_eq->channel = rdma_create_event_channel();
		if (!_eq->channel) {
			ret = -errno;
			goto err3;
		}

		ret = fi_fd_nonblock(_eq->channel->fd);
		if (ret)
			goto err4;

		if (epoll_ctl(_eq->epfd, EPOLL_CTL_ADD, _eq->channel->fd, &event)) {
			ret = -errno;
			goto err4;
		}

		break;
	default:
		ret = -FI_ENOSYS;
		/*
		 * The epoll fd and the dlistfd head already exist at this
		 * point, so unwind through err3 rather than err1.
		 */
		goto err3;
	}

	_eq->flags = attr->flags;
	_eq->eq_fid.fid.fclass = FI_CLASS_EQ;
	_eq->eq_fid.fid.context = context;
	_eq->eq_fid.fid.ops = &fi_ibv_eq_fi_ops;
	_eq->eq_fid.ops = &fi_ibv_eq_ops;

	*eq = &_eq->eq_fid;
	return 0;
err4:
	if (_eq->channel)
		rdma_destroy_event_channel(_eq->channel);
err3:
	close(_eq->epfd);
err2:
	dlistfd_head_free(&_eq->list_head);
err1:
	fastlock_destroy(&_eq->lock);
	free(_eq);
	return ret;
}
Beispiel #20
0
static int
fi_ibv_mr_reg(struct fid *fid, const void *buf, size_t len,
	   uint64_t access, uint64_t offset, uint64_t requested_key,
	   uint64_t flags, struct fid_mr **mr, void *context)
{
	struct fi_ibv_mem_desc *md;
	int fi_ibv_access = 0;
	struct fid_domain *domain;

	if (flags)
		return -FI_EBADFLAGS;

	if (fid->fclass != FI_CLASS_DOMAIN) {
		return -FI_EINVAL;
	}
	domain = container_of(fid, struct fid_domain, fid);

	md = calloc(1, sizeof *md);
	if (!md)
		return -FI_ENOMEM;

	md->domain = container_of(domain, struct fi_ibv_domain, domain_fid);
	md->mr_fid.fid.fclass = FI_CLASS_MR;
	md->mr_fid.fid.context = context;
	md->mr_fid.fid.ops = &fi_ibv_mr_ops;

	/* Enable local write access by default for FI_EP_RDM which hides local
	 * registration requirements. This allows to avoid buffering or double
	 * registration */
	if (!(md->domain->info->caps & FI_LOCAL_MR))
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE;

	/* Local read access to an MR is enabled by default in verbs */

	if (access & FI_RECV)
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE;

	/* iWARP spec requires Remote Write access for an MR that is used
	 * as a data sink for a Remote Read */
	if (access & FI_READ) {
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE;
		if (md->domain->verbs->device->transport_type == IBV_TRANSPORT_IWARP)
			fi_ibv_access |= IBV_ACCESS_REMOTE_WRITE;
	}

	if (access & FI_WRITE)
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE;

	if (access & FI_REMOTE_READ)
		fi_ibv_access |= IBV_ACCESS_REMOTE_READ;

	/* Verbs requires Local Write access too for Remote Write access */
	if (access & FI_REMOTE_WRITE)
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE |
			IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC;

	md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, fi_ibv_access);
	if (!md->mr)
		goto err;

	md->mr_fid.mem_desc = (void *) (uintptr_t) md->mr->lkey;
	md->mr_fid.key = md->mr->rkey;
	*mr = &md->mr_fid;
	if(md->domain->eq && (md->domain->eq_flags & FI_REG_MR)) {
		struct fi_eq_entry entry = {
			.fid = &md->mr_fid.fid,
			.context = context
		};
		fi_ibv_eq_write_event(md->domain->eq, FI_MR_COMPLETE,
			 	      &entry, sizeof(entry));
	}
	return 0;

err:
	free(md);
	return -errno;
}

/*
 * Vector variant of memory registration. At most VERBS_MR_IOV_LIMIT entries
 * are accepted; larger counts yield -FI_EINVAL. Only the first iov entry is
 * forwarded to fi_ibv_mr_reg(). With @count == 0 the iov array is not
 * touched and a NULL / zero-length buffer is forwarded instead.
 */
static int fi_ibv_mr_regv(struct fid *fid, const struct iovec * iov,
		size_t count, uint64_t access, uint64_t offset, uint64_t requested_key,
		uint64_t flags, struct fid_mr **mr, void *context)
{
	/* Do not dereference iov when the caller says it has no entries. */
	const int have_iov = (count > 0 && iov != NULL);
	const void *addr = have_iov ? iov->iov_base : NULL;
	const size_t len = have_iov ? iov->iov_len : 0;

	if (count > VERBS_MR_IOV_LIMIT) {
		VERBS_WARN(FI_LOG_FABRIC,
			   "iov count > %d not supported\n",
			   VERBS_MR_IOV_LIMIT);
		return -FI_EINVAL;
	}
	return fi_ibv_mr_reg(fid, addr, len, access, offset,
			requested_key, flags, mr, context);
}

/*
 * Attribute based memory registration: unpacks @attr and forwards to
 * fi_ibv_mr_regv() with an offset of 0.
 */
static int fi_ibv_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
		uint64_t flags, struct fid_mr **mr)
{
	int rc;

	rc = fi_ibv_mr_regv(fid, attr->mr_iov, attr->iov_count,
			    attr->access, 0, attr->requested_key,
			    flags, mr, attr->context);
	return rc;
}

/*
 * Bind an object to a verbs domain. Only event queues are supported: the EQ
 * and the bind @flags are stored in the domain. Any other class yields
 * -EINVAL.
 */
static int fi_ibv_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
{
	struct fi_ibv_domain *domain;
	struct fi_ibv_eq *eq;

	domain = container_of(fid, struct fi_ibv_domain, domain_fid.fid);

	switch (bfid->fclass) {
	case FI_CLASS_EQ:
		/*
		 * bfid is a struct fid, so name the embedded fid member
		 * explicitly, as is done for domain_fid.fid above.
		 */
		eq = container_of(bfid, struct fi_ibv_eq, eq_fid.fid);
		domain->eq = eq;
		domain->eq_flags = flags;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/*
 * Close a verbs domain.
 *
 * For RDM domains the connection manager listener is destroyed and the
 * rdm_cm structure is freed. Then the protection domain is deallocated and
 * finally the duplicated info and the domain itself are freed.
 *
 * If ibv_dealloc_pd() fails its error is returned negated and the domain
 * object stays allocated so the caller may retry.
 */
static int fi_ibv_domain_close(fid_t fid)
{
	struct fi_ibv_domain *domain;
	int ret;

	domain = container_of(fid, struct fi_ibv_domain, domain_fid.fid);

	if (domain->rdm && domain->rdm_cm) {
		rdma_destroy_ep(domain->rdm_cm->listener);
		free(domain->rdm_cm);
		/*
		 * The domain survives a failed PD deallocation below; clear
		 * the pointer so a second close does not touch freed memory.
		 */
		domain->rdm_cm = NULL;
	}

	if (domain->pd) {
		ret = ibv_dealloc_pd(domain->pd);
		if (ret)
			return -ret;
		domain->pd = NULL;
	}

	fi_freeinfo(domain->info);
	free(domain);
	return 0;
}

/*
 * Find the RDMA device that belongs to the domain name @name and store its
 * context in domain->verbs.
 *
 * For RDM domains the domain name is the device name followed by
 * verbs_rdm_domain.suffix, so the suffix length is excluded from the
 * comparison; otherwise the names must be equal.
 *
 * Returns 0 on a match, -FI_EINVAL if @name is NULL, -errno if the device
 * list cannot be obtained, and -FI_ENODEV if no device matches.
 */
static int fi_ibv_open_device_by_name(struct fi_ibv_domain *domain, const char *name)
{
	struct ibv_context **dev_list;
	size_t cmp_len;
	int i, ret = -FI_ENODEV;

	if (!name)
		return -FI_EINVAL;

	cmp_len = strlen(name);
	if (domain->rdm) {
		const size_t suffix_len = strlen(verbs_rdm_domain.suffix);

		/* Guard the subtraction against unsigned underflow. */
		if (cmp_len > suffix_len)
			cmp_len -= suffix_len;
	}

	dev_list = rdma_get_devices(NULL);
	if (!dev_list)
		return -errno;

	for (i = 0; dev_list[i]; i++) {
		const char *dev_name = ibv_get_device_name(dev_list[i]->device);

		if (!dev_name)
			continue;

		/*
		 * Require equal length as well as equal characters, so that a
		 * device whose name merely starts with the requested name is
		 * not selected.
		 */
		if (strlen(dev_name) == cmp_len &&
		    !strncmp(name, dev_name, cmp_len)) {
			domain->verbs = dev_list[i];
			ret = 0;
			break;
		}
	}
	rdma_free_devices(dev_list);

	/* ret is still -FI_ENODEV here if nothing matched. */
	return ret;
}

/*
 * Base fid operations of a verbs domain (installed as domain_fid.fid.ops by
 * fi_ibv_domain()). close and bind are implemented in this file; control and
 * ops_open point at the generic fi_no_* handlers.
 */
static struct fi_ops fi_ibv_fid_ops = {
	.size = sizeof(struct fi_ops),
	.close = fi_ibv_domain_close,
	.bind = fi_ibv_domain_bind,
	.control = fi_no_control,
	.ops_open = fi_no_ops_open,
};

/*
 * Memory registration operations of a verbs domain (installed as
 * domain_fid.mr by fi_ibv_domain()). regattr forwards to regv, which
 * forwards to reg.
 */
static struct fi_ops_mr fi_ibv_domain_mr_ops = {
	.size = sizeof(struct fi_ops_mr),
	.reg = fi_ibv_mr_reg,
	.regv = fi_ibv_mr_regv,
	.regattr = fi_ibv_mr_regattr,
};

/*
 * Domain operations used for non-RDM domains (selected in fi_ibv_domain()
 * when _domain->rdm is not set). CQ, endpoint and SRX context creation have
 * verbs implementations; the remaining entries point at the generic fi_no_*
 * handlers.
 */
static struct fi_ops_domain fi_ibv_domain_ops = {
	.size = sizeof(struct fi_ops_domain),
	.av_open = fi_no_av_open,
	.cq_open = fi_ibv_cq_open,
	.endpoint = fi_ibv_open_ep,
	.scalable_ep = fi_no_scalable_ep,
	.cntr_open = fi_no_cntr_open,
	.poll_open = fi_no_poll_open,
	.stx_ctx = fi_no_stx_context,
	.srx_ctx = fi_ibv_srq_context,
};

/*
 * Domain operations used for RDM domains (selected in fi_ibv_domain() when
 * _domain->rdm is set). AV, CQ, endpoint and counter creation use the
 * fi_ibv_rdm_* / fi_rbv_rdm_* implementations; the remaining entries point
 * at the generic fi_no_* handlers.
 */
static struct fi_ops_domain fi_ibv_rdm_domain_ops = {
	.size = sizeof(struct fi_ops_domain),
	.av_open = fi_ibv_rdm_av_open,
	.cq_open = fi_ibv_rdm_cq_open,
	.endpoint = fi_ibv_rdm_open_ep,
	.scalable_ep = fi_no_scalable_ep,
	.cntr_open = fi_rbv_rdm_cntr_open,
	.poll_open = fi_no_poll_open,
	.stx_ctx = fi_no_stx_context,
	.srx_ctx = fi_no_srx_context,
};

static int
fi_ibv_domain(struct fid_fabric *fabric, struct fi_info *info,
	   struct fid_domain **domain, void *context)
{
	struct fi_ibv_domain *_domain;
	struct fi_ibv_fabric *fab;
	struct fi_info *fi;
	int ret;

	fi = fi_ibv_get_verbs_info(info->domain_attr->name);
	if (!fi)
		return -FI_EINVAL;

	fab = container_of(fabric, struct fi_ibv_fabric, util_fabric.fabric_fid);
	ret = ofi_check_domain_attr(&fi_ibv_prov, fabric->api_version,
				    fi->domain_attr, info->domain_attr);
	if (ret)
		return ret;

	_domain = calloc(1, sizeof *_domain);
	if (!_domain)
		return -FI_ENOMEM;

	_domain->info = fi_dupinfo(info);
	if (!_domain->info)
		goto err1;

	_domain->rdm = FI_IBV_EP_TYPE_IS_RDM(info);
	if (_domain->rdm) {
		_domain->rdm_cm = calloc(1, sizeof(*_domain->rdm_cm));
		if (!_domain->rdm_cm) {
			ret = -FI_ENOMEM;
			goto err2;
		}
	}
	ret = fi_ibv_open_device_by_name(_domain, info->domain_attr->name);
	if (ret)
		goto err2;

	_domain->pd = ibv_alloc_pd(_domain->verbs);
	if (!_domain->pd) {
		ret = -errno;
		goto err2;
	}

	_domain->domain_fid.fid.fclass = FI_CLASS_DOMAIN;
	_domain->domain_fid.fid.context = context;
	_domain->domain_fid.fid.ops = &fi_ibv_fid_ops;
	_domain->domain_fid.mr = &fi_ibv_domain_mr_ops;
	if (_domain->rdm) {
		_domain->domain_fid.ops = &fi_ibv_rdm_domain_ops;

		_domain->rdm_cm->ec = rdma_create_event_channel();

		if (!_domain->rdm_cm->ec) {
			VERBS_INFO(FI_LOG_EP_CTRL,
				"Failed to create listener event channel: %s\n",
				strerror(errno));
			ret = -FI_EOTHER;
			goto err2;
		}

		if (fi_fd_nonblock(_domain->rdm_cm->ec->fd) != 0) {
			VERBS_INFO_ERRNO(FI_LOG_EP_CTRL, "fcntl", errno);
			ret = -FI_EOTHER;
			goto err3;
		}

		if (rdma_create_id(_domain->rdm_cm->ec,
				   &_domain->rdm_cm->listener, NULL, RDMA_PS_TCP))
		{
			VERBS_INFO(FI_LOG_EP_CTRL, "Failed to create cm listener: %s\n",
				   strerror(errno));
			ret = -FI_EOTHER;
			goto err3;
		}
		_domain->rdm_cm->is_bound = 0;
	} else {
		_domain->domain_fid.ops = &fi_ibv_domain_ops;
	}
	_domain->fab = fab;

	*domain = &_domain->domain_fid;
	return 0;
err3:
	if (_domain->rdm)
		rdma_destroy_event_channel(_domain->rdm_cm->ec);
err2:
	if (_domain->rdm)
		free(_domain->rdm_cm);
	fi_freeinfo(_domain->info);
err1:
	free(_domain);
	return ret;
}

/*
 * Check whether every object in `fids` may be waited on.
 *
 * Completion queues are asked through their own trywait callback and the
 * first non-zero answer is returned as is.  Event queues are always
 * considered ready.  Counters and wait sets yield -FI_ENOSYS, any other
 * class -FI_EINVAL.  Returns FI_SUCCESS when all `count` entries pass.
 */
static int fi_ibv_trywait(struct fid_fabric *fabric, struct fid **fids, int count)
{
	int idx;

	for (idx = 0; idx < count; idx++) {
		struct fid *cur = fids[idx];
		struct fi_ibv_cq *cq;
		int rc;

		/* We are always ready to wait on an EQ since
		 * rdmacm EQ is based on an fd */
		if (cur->fclass == FI_CLASS_EQ)
			continue;

		if (cur->fclass == FI_CLASS_CNTR ||
		    cur->fclass == FI_CLASS_WAIT)
			return -FI_ENOSYS;

		if (cur->fclass != FI_CLASS_CQ)
			return -FI_EINVAL;

		cq = container_of(cur, struct fi_ibv_cq, cq_fid.fid);
		rc = cq->trywait(cur);
		if (rc)
			return rc;
	}

	return FI_SUCCESS;
}

/*
 * Close a verbs fabric.  The embedded util fabric is closed first; the
 * fabric object itself is only freed when that succeeded.  Returns 0 on
 * success or the error reported by ofi_fabric_close().
 */
static int fi_ibv_fabric_close(fid_t fid)
{
	struct fi_ibv_fabric *verbs_fab =
		container_of(fid, struct fi_ibv_fabric, util_fabric.fabric_fid.fid);
	int rc = ofi_fabric_close(&verbs_fab->util_fabric);

	if (rc == 0)
		free(verbs_fab);

	return rc;
}

/*
 * Base fid operations installed on the fabric object by fi_ibv_fabric().
 * Only close has a provider implementation; the rest use the generic
 * fi_no_* handlers.
 */
static struct fi_ops fi_ibv_fi_ops = {
	.size = sizeof(struct fi_ops),
	.close = fi_ibv_fabric_close,
	.bind = fi_no_bind,
	.control = fi_no_control,
	.ops_open = fi_no_ops_open,
};

/*
 * Fabric-level operations installed by fi_ibv_fabric(): domain, passive
 * endpoint and event queue creation plus trywait.  Wait sets use the
 * generic fi_no_wait_open handler.
 */
static struct fi_ops_fabric fi_ibv_ops_fabric = {
	.size = sizeof(struct fi_ops_fabric),
	.domain = fi_ibv_domain,
	.passive_ep = fi_ibv_passive_ep,
	.eq_open = fi_ibv_eq_open,
	.wait_open = fi_no_wait_open,
	.trywait = fi_ibv_trywait
};

/*
 * Create a verbs fabric object matching `attr`.
 *
 * The provider info list is initialised if necessary, then each entry of
 * verbs_info is offered to ofi_fabric_init() until one yields something
 * other than -FI_ENODATA.  On success the fabric is returned through
 * `fabric` with the verbs fid and fabric operation tables installed.
 *
 * Returns 0 on success, -FI_ENOMEM if the fabric cannot be allocated, or
 * the error from fi_ibv_init_info() / ofi_fabric_init().
 */
int fi_ibv_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
		  void *context)
{
	struct fi_ibv_fabric *verbs_fab;
	struct fi_info *cur;
	int rc;

	rc = fi_ibv_init_info();
	if (rc)
		return rc;

	verbs_fab = calloc(1, sizeof(*verbs_fab));
	if (!verbs_fab)
		return -FI_ENOMEM;

	cur = verbs_info;
	while (cur) {
		rc = ofi_fabric_init(&fi_ibv_prov, cur->fabric_attr, attr,
				     &verbs_fab->util_fabric, context);
		/* -FI_ENODATA means "this entry does not match": try the next */
		if (rc != -FI_ENODATA)
			break;
		cur = cur->next;
	}

	if (rc) {
		free(verbs_fab);
		return rc;
	}

	verbs_fab->util_fabric.fabric_fid.fid.ops = &fi_ibv_fi_ops;
	verbs_fab->util_fabric.fabric_fid.ops = &fi_ibv_ops_fabric;
	*fabric = &verbs_fab->util_fabric.fabric_fid;

	return 0;
}
Beispiel #21
0
/*
 * Validate the endpoint attributes requested in `attr` against what the
 * provider advertises in `info`.
 *
 * Checked are: endpoint type, protocol and protocol version, maximum
 * message size, the RAW/WAR/WAW ordering sizes, the tx/rx context counts
 * and the authorization key size.
 *
 * Returns 0 when every request can be satisfied, -FI_ENODATA otherwise.
 */
int fi_ibv_check_ep_attr(const struct fi_ep_attr *attr,
			 const struct fi_info *info)
{
	if ((attr->type != FI_EP_UNSPEC) &&
	    (attr->type != info->ep_attr->type)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Unsupported endpoint type\n");
		return -FI_ENODATA;
	}

	switch (attr->protocol) {
	case FI_PROTO_UNSPEC:
	case FI_PROTO_RDMA_CM_IB_RC:
	case FI_PROTO_IWARP:
	case FI_PROTO_IB_UD:
	case FI_PROTO_IB_RDM:
	case FI_PROTO_IWARP_RDM:
		break;
	default:
		VERBS_INFO(FI_LOG_CORE,
			   "Unsupported protocol\n");
		return -FI_ENODATA;
	}

	if (attr->protocol_version > 1) {
		VERBS_INFO(FI_LOG_CORE,
			   "Unsupported protocol version\n");
		return -FI_ENODATA;
	}

	if (attr->max_msg_size > info->ep_attr->max_msg_size) {
		VERBS_INFO(FI_LOG_CORE,
			   "Max message size too large\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr,
				  max_msg_size);
		return -FI_ENODATA;
	}

	if (attr->max_order_raw_size > info->ep_attr->max_order_raw_size) {
		VERBS_INFO(FI_LOG_CORE,
			   "max_order_raw_size exceeds supported size\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr,
				  max_order_raw_size);
		return -FI_ENODATA;
	}

	/*
	 * Compare against the advertised limit like the RAW/WAW checks do,
	 * instead of rejecting every non-zero request.  When the provider
	 * advertises 0 the outcome is the same as before.
	 */
	if (attr->max_order_war_size > info->ep_attr->max_order_war_size) {
		VERBS_INFO(FI_LOG_CORE,
			   "max_order_war_size exceeds supported size\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr,
				  max_order_war_size);
		return -FI_ENODATA;
	}

	if (attr->max_order_waw_size > info->ep_attr->max_order_waw_size) {
		VERBS_INFO(FI_LOG_CORE,
			   "max_order_waw_size exceeds supported size\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr,
				  max_order_waw_size);
		return -FI_ENODATA;
	}

	if (attr->tx_ctx_cnt > info->domain_attr->max_ep_tx_ctx) {
		VERBS_INFO(FI_LOG_CORE,
			   "tx_ctx_cnt exceeds supported size\n");
		/* the counts are size_t, hence %zu rather than %zd */
		VERBS_INFO(FI_LOG_CORE, "Supported: %zu\nRequested: %zu\n",
			   info->domain_attr->max_ep_tx_ctx, attr->tx_ctx_cnt);
		return -FI_ENODATA;
	}

	/* FI_SHARED_CONTEXT is a special request value, not a real count */
	if ((attr->rx_ctx_cnt > info->domain_attr->max_ep_rx_ctx) &&
	    (attr->rx_ctx_cnt != FI_SHARED_CONTEXT)) {
		VERBS_INFO(FI_LOG_CORE,
			   "rx_ctx_cnt exceeds supported size\n");
		VERBS_INFO(FI_LOG_CORE, "Supported: %zu\nRequested: %zu\n",
			   info->domain_attr->max_ep_rx_ctx, attr->rx_ctx_cnt);
		return -FI_ENODATA;
	}

	/* an auth key size of 0 means "not requested" and is always accepted */
	if (attr->auth_key_size &&
	    (attr->auth_key_size != info->ep_attr->auth_key_size)) {
		VERBS_INFO(FI_LOG_CORE, "Unsupported authentication size.\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr,
				  auth_key_size);
		return -FI_ENODATA;
	}

	return 0;
}
Beispiel #22
0
/*
 * Report whether the atomic operation `op` on `datatype` is supported for
 * the call flavour selected by `flags`, and fill in `attr` if it is.
 *
 *   flags == 0            only FI_ATOMIC_WRITE
 *   FI_FETCH_ATOMIC       FI_ATOMIC_READ or FI_SUM
 *   FI_COMPARE_ATOMIC     only FI_CSWAP
 *
 * FI_SUM fetches and compare-swaps are refused with -FI_EINVAL when the
 * domain's tx op_flags contain FI_INJECT.
 *
 * Returns 0 on success with attr->size set to the datatype size and
 * attr->count set to 1; -FI_ENOSYS for FI_TAGGED or an unsupported op,
 * -FI_EBADFLAGS for an invalid flag combination, -FI_EINVAL for an
 * unsupported datatype.
 */
int fi_ibv_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype,
			enum fi_op op, struct fi_atomic_attr *attr,
			uint64_t flags)
{
	struct fi_ibv_domain *domain = container_of(domain_fid,
						    struct fi_ibv_domain,
						    domain_fid);
	const int is_fetch = (flags & FI_FETCH_ATOMIC) != 0;
	const int is_compare = (flags & FI_COMPARE_ATOMIC) != 0;
	/* non-NULL only for the operations that must not be injected */
	const char *no_inject_op = NULL;

	if (flags & FI_TAGGED)
		return -FI_ENOSYS;

	if (is_fetch && is_compare)
		return -FI_EBADFLAGS;

	if (!flags) {
		if (op != FI_ATOMIC_WRITE)
			return -FI_ENOSYS;
	} else if (is_fetch) {
		if (op == FI_SUM)
			no_inject_op = "fi_fetch_atomic with FI_SUM op";
		else if (op != FI_ATOMIC_READ)
			return -FI_ENOSYS;
	} else if (is_compare) {
		if (op != FI_CSWAP)
			return -FI_ENOSYS;
		no_inject_op = "fi_compare_atomic";
	} else {
		return  -FI_EBADFLAGS;
	}

	if (no_inject_op &&
	    (domain->info->tx_attr->op_flags & FI_INJECT)) {
		VERBS_INFO(FI_LOG_EP_DATA,
			   "FI_INJECT not supported for %s\n", no_inject_op);
		return -FI_EINVAL;
	}

	switch (datatype) {
	case FI_INT64:
	case FI_UINT64:
#if __BITS_PER_LONG == 64
	case FI_DOUBLE:
	case FI_FLOAT:
#endif
		break;
	default:
		return -FI_EINVAL;
	}

	attr->size = fi_datatype_size(datatype);
	if (!attr->size)
		return -FI_EINVAL;

	attr->count = 1;
	return 0;
}
Beispiel #23
0
/*
 * Validate the receive attributes requested in `attr` against the
 * provider's `info`, using `hints` for the capability bits and as the
 * fallback source of the mode bits.
 *
 * Checked are: caps, mode, op_flags, msg_order, size,
 * total_buffered_recv (skipped when the provider info has resource
 * management enabled) and iov_limit.
 *
 * Returns 0 when every request can be satisfied, -FI_ENODATA otherwise.
 */
int fi_ibv_check_rx_attr(const struct fi_rx_attr *attr,
			 const struct fi_info *hints, const struct fi_info *info)
{
	uint64_t compare_mode, check_mode;
	int rm_disabled;

	if (attr->caps & ~(info->rx_attr->caps)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given rx_attr->caps not supported\n");
		return -FI_ENODATA;
	}

	/* mode bits given on the attribute take precedence over the hints */
	if (attr->mode)
		compare_mode = attr->mode;
	else
		compare_mode = hints->mode;

	/* FI_RX_CQ_DATA is only demanded when the hints request FI_RMA */
	check_mode = info->rx_attr->mode;
	if (!(hints->caps & FI_RMA))
		check_mode &= ~FI_RX_CQ_DATA;

	/* any required bit the caller did not set is a mismatch */
	if (check_mode & ~compare_mode) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given rx_attr->mode not supported\n");
		FI_INFO_MODE(&fi_ibv_prov, check_mode, compare_mode);
		return -FI_ENODATA;
	}

	if (attr->op_flags & ~(info->rx_attr->op_flags)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given rx_attr->op_flags not supported\n");
		return -FI_ENODATA;
	}

	if (attr->msg_order & ~(info->rx_attr->msg_order)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given rx_attr->msg_order not supported\n");
		return -FI_ENODATA;
	}

	if (attr->size > info->rx_attr->size) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given rx_attr->size is greater than supported\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, info->rx_attr, attr, size);
		return -FI_ENODATA;
	}

	rm_disabled = !info->domain_attr ||
		      info->domain_attr->resource_mgmt != FI_RM_ENABLED;

	if (rm_disabled &&
	    (attr->total_buffered_recv > info->rx_attr->total_buffered_recv)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given rx_attr->total_buffered_recv "
			   "exceeds supported size\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, info->rx_attr, attr,
				  total_buffered_recv);
		return -FI_ENODATA;
	}

	if (attr->iov_limit > info->rx_attr->iov_limit) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given rx_attr->iov_limit greater than supported\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, info->rx_attr, attr,
				  iov_limit);
		return -FI_ENODATA;
	}

	return 0;
}
static int
fi_ibv_rdm_find_sysaddrs(struct fi_ibv_rdm_sysaddr *iface_addr,
			 struct fi_ibv_rdm_sysaddr *lo_addr)
{
	struct ifaddrs *ifaddr, *ifa;
	char iface[IFNAMSIZ];
	char *iface_tmp = "ib";
	size_t iface_len = 2;
	int ret;

	if (!iface_addr || !lo_addr) {
		return -FI_EINVAL;
	}

	iface_addr->is_found = 0;
	lo_addr->is_found = 0;

	if (fi_param_get_str(&fi_ibv_prov, "iface", &iface_tmp) == FI_SUCCESS) {
		iface_len = strlen(iface_tmp);
		if (iface_len > IFNAMSIZ) {
			VERBS_INFO(FI_LOG_EP_CTRL,
				   "Too long iface name: %s, max: %d\n",
				   iface_tmp, IFNAMSIZ);
			return -FI_EINVAL;
		}
	}
	strncpy(iface, iface_tmp, iface_len);

	ret = getifaddrs(&ifaddr);
	if (ret) {
		FI_WARN(&fi_ibv_prov, FI_LOG_FABRIC,
				"Unable to get interface addresses\n");
		return ret;
	}

	for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) {
		if (!iface_addr->is_found && (ifa->ifa_addr->sa_family == AF_INET) &&
		    !strncmp(ifa->ifa_name, iface, iface_len)) {
			memcpy(&iface_addr->addr, ifa->ifa_addr,
				sizeof(iface_addr->addr));
			iface_addr->is_found = 1;
			FI_INFO(&fi_ibv_prov, FI_LOG_FABRIC,
				"iface addr %s:%u\n",
				inet_ntoa(iface_addr->addr.sin_addr),
				ntohs(iface_addr->addr.sin_port));
		}
		if (!lo_addr->is_found && (ifa->ifa_addr->sa_family == AF_INET) &&
		    !strncmp(ifa->ifa_name, "lo", strlen(ifa->ifa_name))) {
			memcpy(&lo_addr->addr, ifa->ifa_addr, sizeof(lo_addr->addr));
			lo_addr->is_found = 1;
			FI_INFO(&fi_ibv_prov, FI_LOG_FABRIC, "lo addr %s:%u\n",
				inet_ntoa(lo_addr->addr.sin_addr),
				ntohs(lo_addr->addr.sin_port));
		}
		if (iface_addr->is_found && lo_addr->is_found) {
			break;
		}
	}

	freeifaddrs(ifaddr);

	return 0;
}
Beispiel #25
0
/*
 * Handle an rdma-cm connect request event for an RDM endpoint.
 *
 * - If the endpoint is closing, the request is rejected.
 * - Otherwise the connection is looked up in fi_ibv_rdm_tagged_conn_hash
 *   using the request's private data as key, and created and added to
 *   the hash if it does not exist yet.
 * - If this side has the active CM role the request is rejected and, for
 *   a connection that is still only allocated, the outgoing connection
 *   is started instead.
 * - Otherwise a QP is created on the request's cm id, receives are
 *   posted and the request is accepted.
 *
 * Returns FI_SUCCESS or a negative error code.  On errors reached via
 * the `err` label the connection is handed to fi_ibv_rdm_conn_cleanup().
 */
static ssize_t
fi_ibv_rdm_process_connect_request(struct rdma_cm_event *event,
					  struct fi_ibv_rdm_ep *ep)
{
	struct ibv_qp_init_attr qp_attr;
	struct rdma_conn_param cm_params;
	struct fi_ibv_rdm_tagged_conn *conn = NULL;
	struct rdma_cm_id *id = event->id;
	ssize_t ret = FI_SUCCESS;
	/* errno is captured before logging, which may overwrite it */
	int saved_errno;

	char *p = (char *) event->param.conn.private_data;

	if (ep->is_closing) {
		int rej_message = 0xdeadbeef;
		if (rdma_reject(id, &rej_message, sizeof(int))) {
			saved_errno = errno;
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_reject\n",
					 saved_errno);
			ret = -saved_errno;
			if (rdma_destroy_id(id)) {
				saved_errno = errno;
				VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id\n",
						 saved_errno);
				ret = (ret == FI_SUCCESS) ? -saved_errno : ret;
			}
		}
		assert(ret == FI_SUCCESS);
		return ret;
	}

	HASH_FIND(hh, fi_ibv_rdm_tagged_conn_hash, p, FI_IBV_RDM_DFLT_ADDRLEN,
		  conn);

	if (!conn) {
		conn = memalign(FI_IBV_RDM_MEM_ALIGNMENT, sizeof(*conn));
		if (!conn)
			return -FI_ENOMEM;

		memset(conn, 0, sizeof(struct fi_ibv_rdm_tagged_conn));

		conn->state = FI_VERBS_CONN_ALLOCATED;
		dlist_init(&conn->postponed_requests_head);
		fi_ibv_rdm_unpack_cm_params(&event->param.conn, conn, ep);
		fi_ibv_rdm_conn_init_cm_role(conn, ep);

		FI_INFO(&fi_ibv_prov, FI_LOG_AV,
			"CONN REQUEST, NOT found in hash, new conn %p %d, addr %s:%u, HASH ADD\n",
			conn, conn->cm_role, inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port));

		HASH_ADD(hh, fi_ibv_rdm_tagged_conn_hash, addr,
			FI_IBV_RDM_DFLT_ADDRLEN, conn);
	} else {
		if (conn->cm_role != FI_VERBS_CM_ACTIVE) {
			/*
			 * Do it before rdma_create_qp since that call would
			 * modify event->param.conn.private_data buffer
			 */
			fi_ibv_rdm_unpack_cm_params(&event->param.conn, conn,
						    ep);
		}

		FI_INFO(&fi_ibv_prov, FI_LOG_AV,
			"CONN REQUEST,  FOUND in hash, conn %p %d, addr %s:%u\n",
			conn, conn->cm_role, inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port));
	}

	if (conn->cm_role == FI_VERBS_CM_ACTIVE) {
		int rej_message = 0xdeadbeef;
		if (rdma_reject(id, &rej_message, sizeof(rej_message))) {
			saved_errno = errno;
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_reject\n",
					 saved_errno);
			ret = -saved_errno;
			if (rdma_destroy_id(id)) {
				saved_errno = errno;
				VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id\n",
						 saved_errno);
				ret = (ret == FI_SUCCESS) ? -saved_errno : ret;
			}
		}
		/*
		 * NOTE(review): a reject error stored in ret above is
		 * overwritten by the result of starting the connection --
		 * kept as in the original; confirm this is intended.
		 */
		if (conn->state == FI_VERBS_CONN_ALLOCATED) {
			ret = fi_ibv_rdm_start_connection(ep, conn);
			if (ret != FI_SUCCESS)
				goto err;
		}
	} else {
		assert(conn->state == FI_VERBS_CONN_ALLOCATED ||
		       conn->state == FI_VERBS_CONN_STARTED);

		/* the passive role uses slot 0 of id[]/qp[], otherwise slot 1 */
		const size_t idx =
			(conn->cm_role == FI_VERBS_CM_PASSIVE) ? 0 : 1;

		conn->state = FI_VERBS_CONN_STARTED;

		assert (conn->id[idx] == NULL);
		conn->id[idx] = id;

		ret = fi_ibv_rdm_prepare_conn_memory(ep, conn);
		if (ret != FI_SUCCESS)
			goto err;

		fi_ibv_rdm_tagged_init_qp_attributes(&qp_attr, ep);
		if (rdma_create_qp(id, ep->domain->pd, &qp_attr)) {
			ret = -errno;
			goto err;
		}
		conn->qp[idx] = id->qp;

		ret = fi_ibv_rdm_repost_receives(conn, ep, ep->rq_wr_depth);
		if (ret < 0) {
			VERBS_INFO(FI_LOG_AV, "repost receives failed\n");
			goto err;
		} else {
			ret = FI_SUCCESS;
		}

		id->context = conn;

		fi_ibv_rdm_pack_cm_params(&cm_params, conn, ep);

		if (rdma_accept(id, &cm_params)) {
			saved_errno = errno;
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_accept\n",
					 saved_errno);
			ret = -saved_errno;
			/* release the packed parameters on failure as well */
			free((void *) cm_params.private_data);
			goto err;
		}
		if (cm_params.private_data) {
			free((void *) cm_params.private_data);
		}
	}

	return ret;
err:
	/* ret err code is already set here, just cleanup resources */
	fi_ibv_rdm_conn_cleanup(conn);
	return ret;
}
Beispiel #26
0
/*
 * Build the global verbs_info list once.
 *
 * For every RDMA device an fi_info for the MSG domain is allocated and,
 * if that succeeded, one for the RDM domain; both are appended to
 * verbs_info.  Unless the "fork_unsafe" parameter is set, IB fork support
 * is enabled first.
 *
 * Returns 0 if verbs_info already exists or at least one entry could be
 * created, otherwise a negative error code.
 */
int fi_ibv_init_info(void)
{
	struct ibv_context **ctx_list;
	struct fi_info *fi = NULL, *tail = NULL;
	int ret = 0, i, num_devices = 0, fork_unsafe = 0;
	int saved_errno;

	if (verbs_info)
		return 0;

	pthread_mutex_lock(&verbs_info_lock);
	/* re-check under the lock: another thread may have built the list */
	if (verbs_info)
		goto unlock;

	if (!fi_ibv_have_device()) {
		VERBS_INFO(FI_LOG_FABRIC, "No RDMA devices found\n");
		ret = -FI_ENODATA;
		goto unlock;
	}

	fi_param_get_bool(NULL, "fork_unsafe", &fork_unsafe);

	if (!fork_unsafe) {
		FI_INFO(&fi_ibv_prov, FI_LOG_CORE, "Enabling IB fork support\n");
		ret = ibv_fork_init();
		if (ret) {
			FI_WARN(&fi_ibv_prov, FI_LOG_CORE,
					"Enabling IB fork support failed: %s (%d)\n",
					strerror(ret), ret);
			/*
			 * ibv_fork_init() reports failure as an errno value;
			 * callers of this function expect negative codes.
			 */
			ret = (ret > 0) ? -ret : ret;
			goto unlock;
		}
	} else {
		FI_INFO(&fi_ibv_prov, FI_LOG_CORE, "Not enabling IB fork support\n");
	}

	ctx_list = rdma_get_devices(&num_devices);
	if (!ctx_list) {
		/* capture errno before logging can overwrite it */
		saved_errno = errno;
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_get_devices", saved_errno);
		ret = saved_errno ? -saved_errno : -FI_ENODATA;
		goto unlock;
	}
	if (!num_devices) {
		/* an empty list must still be released */
		VERBS_INFO(FI_LOG_FABRIC, "No RDMA devices found\n");
		ret = -FI_ENODATA;
		goto free_list;
	}

	for (i = 0; i < num_devices; i++) {
		ret = fi_ibv_alloc_info(ctx_list[i], &fi, &verbs_msg_domain);
		if (!ret) {
			if (!verbs_info)
				verbs_info = fi;
			else
				tail->next = fi;
			tail = fi;

			ret = fi_ibv_alloc_info(ctx_list[i], &fi,
						&verbs_rdm_domain);
			if (!ret) {
				tail->next = fi;
				tail = fi;
			}
		}
	}

	/* one usable entry is enough; later per-device failures are ignored */
	ret = verbs_info ? 0 : ret;

free_list:
	rdma_free_devices(ctx_list);
unlock:
	pthread_mutex_unlock(&verbs_info_lock);
	return ret;
}
Beispiel #27
0
static int fi_ibv_alloc_info(struct ibv_context *ctx, struct fi_info **info,
			     const struct verbs_ep_domain *ep_dom)
{
	struct fi_info *fi;
	union ibv_gid gid;
	size_t name_len;
	int ret;
	int param;

	if (!(fi = fi_allocinfo()))
		return -FI_ENOMEM;

	fi->caps		= ep_dom->caps;
	fi->handle		= NULL;
	if (ep_dom->type == FI_EP_RDM) {
		fi->mode	= VERBS_RDM_MODE;
		*(fi->tx_attr)	= verbs_rdm_tx_attr;
	} else {
		*(fi->tx_attr)	= verbs_tx_attr;
	}

	*(fi->rx_attr)		= (ep_dom->type == FI_EP_RDM)
				? verbs_rdm_rx_attr : verbs_rx_attr;
	*(fi->ep_attr)		= verbs_ep_attr;
	*(fi->domain_attr)	= verbs_domain_attr;

	if (ep_dom->type == FI_EP_RDM)
		fi->domain_attr->mr_mode &= ~FI_MR_LOCAL;

	*(fi->fabric_attr)	= verbs_fabric_attr;

	fi->ep_attr->type	= ep_dom->type;
	fi->tx_attr->caps	= ep_dom->caps;
	fi->rx_attr->caps	= ep_dom->caps;

	ret = fi_ibv_get_device_attrs(ctx, fi);
	if (ret)
		goto err;

	if (ep_dom->type == FI_EP_RDM) {
		fi->tx_attr->inject_size = FI_IBV_RDM_DFLT_BUFFERED_SSIZE;
		fi->tx_attr->iov_limit = 1;
		fi->tx_attr->rma_iov_limit = 1;
		if (!fi_param_get_int(&fi_ibv_prov, "rdm_buffer_size", &param)) {
			if (param > sizeof (struct fi_ibv_rdm_rndv_header)) {
				fi->tx_attr->inject_size = param;
			} else {
				VERBS_INFO(FI_LOG_CORE,
					   "rdm_buffer_size too small, "
					   "should be greater then %d\n",
					   sizeof (struct fi_ibv_rdm_rndv_header));
				ret = -FI_EINVAL;
				goto err;
			}
		}
	}

	switch (ctx->device->transport_type) {
	case IBV_TRANSPORT_IB:
		if(ibv_query_gid(ctx, 1, 0, &gid)) {
			VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_gid", errno);
			ret = -errno;
			goto err;
		}

		name_len =  strlen(VERBS_IB_PREFIX) + INET6_ADDRSTRLEN;

		if (!(fi->fabric_attr->name = calloc(1, name_len + 1))) {
			ret = -FI_ENOMEM;
			goto err;
		}

		snprintf(fi->fabric_attr->name, name_len, VERBS_IB_PREFIX "%lx",
			 gid.global.subnet_prefix);

		fi->ep_attr->protocol = (ep_dom == &verbs_msg_domain) ?
					FI_PROTO_RDMA_CM_IB_RC : FI_PROTO_IB_RDM;
		break;
	case IBV_TRANSPORT_IWARP:
		fi->fabric_attr->name = strdup(VERBS_IWARP_FABRIC);
		if (!fi->fabric_attr->name) {
			ret = -FI_ENOMEM;
			goto err;
		}

		if (ep_dom == &verbs_msg_domain) {
			fi->ep_attr->protocol = FI_PROTO_IWARP;
			fi->tx_attr->op_flags = VERBS_TX_OP_FLAGS_IWARP;
		} else {
			fi->ep_attr->protocol = FI_PROTO_IWARP_RDM;
			fi->tx_attr->op_flags = VERBS_TX_OP_FLAGS_IWARP_RDM;
		}
		break;
	default:
		VERBS_INFO(FI_LOG_CORE, "Unknown transport type\n");
		ret = -FI_ENODATA;
		goto err;
	}

	name_len = strlen(ctx->device->name) + strlen(ep_dom->suffix);
	fi->domain_attr->name = malloc(name_len + 1);
	if (!fi->domain_attr->name) {
		ret = -FI_ENOMEM;
		goto err;
	}

	snprintf(fi->domain_attr->name, name_len + 1, "%s%s",
		 ctx->device->name, ep_dom->suffix);
	fi->domain_attr->name[name_len] = '\0';

	*info = fi;
	return 0;
err:
	fi_freeinfo(fi);
	return ret;
}