void IBConnection::accept_connect_request() { L_(debug) << "accepting connection"; // Accept rdma connection request auto private_data = get_private_data(); assert(private_data->size() <= 255); struct rdma_conn_param conn_param = rdma_conn_param(); conn_param.responder_resources = 1; conn_param.private_data = private_data->data(); conn_param.private_data_len = static_cast<uint8_t>(private_data->size()); int err = rdma_accept(cm_id_, &conn_param); if (err) throw InfinibandException("RDMA accept failed"); }
void Connection::on_connect_request(struct fi_eq_cm_entry* event, struct fid_domain* pd, struct fid_cq* cq) { int err = fi_endpoint(pd, event->info, &ep_, this); if (err) { L_(fatal) << "fi_endpoint failed: " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_endpoint failed"); } #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" err = fi_ep_bind(ep_, (::fid_t)eq_, 0); if (err) { L_(fatal) << "fi_ep_bind failed to eq: " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_ep_bind failed to eq"); } err = fi_ep_bind(ep_, (fid_t)cq, FI_SEND | FI_RECV | FI_SELECTIVE_COMPLETION); if (err) { L_(fatal) << "fi_ep_bind failed to cq: " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_ep_bind failed to cq"); } #pragma GCC diagnostic pop // setup(pd); setup_mr(pd); auto private_data = get_private_data(); assert(private_data->size() <= 255); err = fi_enable(ep_); if (err) { L_(fatal) << "fi_enable failed: " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_enable failed"); } // accept_connect_request(); err = fi_accept(ep_, private_data->data(), private_data->size()); if (err) { L_(fatal) << "fi_accept failed: " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_accept failed"); } // setup(pd); setup(); }
void IBConnection::on_route_resolved() { L_(debug) << "route resolved"; // Initiate rdma connection auto private_data = get_private_data(); assert(private_data->size() <= 255); struct rdma_conn_param conn_param = rdma_conn_param(); conn_param.initiator_depth = 1; conn_param.retry_count = 7; conn_param.private_data = private_data->data(); conn_param.private_data_len = static_cast<uint8_t>(private_data->size()); // TODO: Hack to prevent connection issues when using softiwarp. std::this_thread::sleep_for(std::chrono::milliseconds(500)); int err = rdma_connect(cm_id_, &conn_param); if (err) { L_(fatal) << "rdma_connect failed: " << strerror(errno); throw InfinibandException("rdma_connect failed"); } }
void Connection::connect(const std::string& hostname, const std::string& service, struct fid_domain* domain, struct fid_cq* cq, struct fid_av* av) { auto private_data = get_private_data(); assert(private_data->size() <= 255); L_(debug) << "connect: " << hostname << ":" << service; struct fi_info* info2 = nullptr; struct fi_info* hints = fi_dupinfo(Provider::getInst()->get_info()); hints->rx_attr->size = max_recv_wr_; hints->rx_attr->iov_limit = max_recv_sge_; // TODO this attribute causes a problem while running flesnet // hints->tx_attr->size = max_send_wr_; hints->tx_attr->iov_limit = max_send_sge_; hints->tx_attr->inject_size = max_inline_data_; hints->src_addr = nullptr; hints->src_addrlen = 0; int err = fi_getinfo(FI_VERSION(1, 1), hostname == "" ? nullptr : hostname.c_str(), service == "" ? nullptr : service.c_str(), 0, hints, &info2); if (err) { L_(fatal) << "fi_getinfo failed in make_endpoint: " << hostname << " " << service << "[" << err << "=" << fi_strerror(-err) << "]"; throw LibfabricException("fi_getinfo failed in make_endpoint"); } fi_freeinfo(hints); err = fi_endpoint(domain, info2, &ep_, this); if (err) { L_(fatal) << "fi_endpoint failed: " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_endpoint failed"); } #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" if (Provider::getInst()->has_eq_at_eps()) { err = fi_ep_bind(ep_, (::fid_t)eq_, 0); if (err) { L_(fatal) << "fi_ep_bind failed: " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_ep_bind failed"); } } err = fi_ep_bind(ep_, (::fid_t)cq, FI_SEND | FI_RECV | FI_SELECTIVE_COMPLETION); if (err) { L_(fatal) << "fi_ep_bind failed (cq): " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_ep_bind failed (cq)"); } if (Provider::getInst()->has_av()) { err = fi_ep_bind(ep_, (::fid_t)av, 0); if (err) { L_(fatal) << "fi_ep_bind failed (av): " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_ep_bind failed (av)"); } } #pragma GCC diagnostic pop err = fi_enable(ep_); if (err) { L_(fatal) << "fi_enable failed: " << err << "=" << fi_strerror(-err); throw LibfabricException("fi_enable failed"); } setup_mr(domain); Provider::getInst()->connect(ep_, max_send_wr_, max_send_sge_, max_recv_wr_, max_recv_sge_, max_inline_data_, private_data->data(), private_data->size(), info2->dest_addr); setup(); }