/// Start address resolution for the remote endpoint.
///
/// Looks up `hostname`/`service` with getaddrinfo() and passes each returned
/// candidate address to rdma_resolve_addr() until one call succeeds.
///
/// @param hostname  Host name or address string handed to getaddrinfo().
/// @param service   Service name or port string handed to getaddrinfo().
/// @throws InfinibandException if getaddrinfo() fails or if
///         rdma_resolve_addr() fails for every candidate address.
void IBConnection::connect(const std::string& hostname,
                           const std::string& service)
{
    struct addrinfo hints;
    memset(&hints, 0, sizeof(struct addrinfo));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;

    struct addrinfo* res;
    int err = getaddrinfo(hostname.c_str(), service.c_str(), &hints, &res);
    if (err)
        throw InfinibandException("getaddrinfo failed");

    L_(debug) << "[" << index_ << "] "
              << "resolution of server address and route";

    // Try the candidates in order and stop at the first accepted one.
    for (struct addrinfo* t = res; t; t = t->ai_next) {
        err = rdma_resolve_addr(cm_id_, nullptr, t->ai_addr,
                                RESOLVE_TIMEOUT_MS);
        if (!err)
            break;
    }

    // Release the address list before the error check so that it is not
    // leaked when every candidate was refused and we throw below.
    freeaddrinfo(res);

    if (err)
        throw InfinibandException("rdma_resolve_addr failed");
}
void InputChannelSender::on_addr_resolved(struct rdma_cm_id* id) { IBConnectionGroup<InputChannelConnection>::on_addr_resolved(id); if (!mr_data_) { // Register memory regions. mr_data_ = ibv_reg_mr( pd_, const_cast<uint8_t*>(data_source_.data_send_buffer().ptr()), data_source_.data_send_buffer().bytes(), IBV_ACCESS_LOCAL_WRITE); if (!mr_data_) { L_(error) << "ibv_reg_mr failed for mr_data: " << strerror(errno); throw InfinibandException("registration of memory region failed"); } mr_desc_ = ibv_reg_mr(pd_, const_cast<fles::MicrosliceDescriptor*>( data_source_.desc_send_buffer().ptr()), data_source_.desc_send_buffer().bytes(), IBV_ACCESS_LOCAL_WRITE); if (!mr_desc_) { L_(error) << "ibv_reg_mr failed for mr_desc: " << strerror(errno); throw InfinibandException("registration of memory region failed"); } if (true) { dump_mr(mr_desc_); dump_mr(mr_data_); } } }
/// Connection manager event dispatcher. Called by the CM event loop. void on_cm_event(struct rdma_cm_event* event) { L_(trace) << rdma_event_str(event->event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: on_addr_resolved(event->id); return; case RDMA_CM_EVENT_ADDR_ERROR: throw InfinibandException("rdma_resolve_addr failed"); case RDMA_CM_EVENT_ROUTE_RESOLVED: on_route_resolved(event->id); return; case RDMA_CM_EVENT_ROUTE_ERROR: throw InfinibandException("rdma_resolve_route failed"); case RDMA_CM_EVENT_CONNECT_ERROR: throw InfinibandException("could not establish connection"); case RDMA_CM_EVENT_UNREACHABLE: throw InfinibandException("remote server is not reachable"); case RDMA_CM_EVENT_REJECTED: on_rejected(event); return; case RDMA_CM_EVENT_ESTABLISHED: on_established(event); return; case RDMA_CM_EVENT_CONNECT_REQUEST: on_connect_request(event); return; case RDMA_CM_EVENT_DISCONNECTED: on_disconnected(event); return; default: L_(warning) << rdma_event_str(event->event); } }
/// The IBConnectionGroup default constructor. IBConnectionGroup() { ec_ = rdma_create_event_channel(); if (!ec_) throw InfinibandException("rdma_create_event_channel failed"); fcntl(ec_->fd, F_SETFL, O_NONBLOCK); }
/// The InfiniBand completion notification handler. int poll_completion() { const int ne_max = 10; struct ibv_wc wc[ne_max]; int ne; int ne_total = 0; while ((ne = ibv_poll_cq(cq_, ne_max, wc))) { if (ne < 0) throw InfinibandException("ibv_poll_cq failed"); ne_total += ne; for (int i = 0; i < ne; ++i) { if (wc[i].status != IBV_WC_SUCCESS) { std::ostringstream s; s << ibv_wc_status_str(wc[i].status) << " for wr_id " << static_cast<int>(wc[i].wr_id); L_(error) << s.str(); continue; } on_completion(wc[i]); } } return ne_total; }
/// Initialize the InfiniBand verbs context. void init_context(struct ibv_context* context) { context_ = context; L_(debug) << "create verbs objects"; pd_ = ibv_alloc_pd(context); if (!pd_) throw InfinibandException("ibv_alloc_pd failed"); cq_ = ibv_create_cq(context, num_cqe_, nullptr, nullptr, 0); if (!cq_) throw InfinibandException("ibv_create_cq failed"); if (ibv_req_notify_cq(cq_, 0)) throw InfinibandException("ibv_req_notify_cq failed"); }
/// Disconnect this connection via rdma_disconnect().
///
/// @throws InfinibandException if rdma_disconnect() reports an error.
void IBConnection::disconnect()
{
    L_(debug) << "[" << index_ << "] "
              << "disconnect";

    if (rdma_disconnect(cm_id_) != 0) {
        throw InfinibandException("rdma_disconnect failed");
    }
}
void IBConnection::on_addr_resolved(struct ibv_pd* pd, struct ibv_cq* cq) { L_(debug) << "address resolved"; struct ibv_qp_init_attr qp_attr; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.cap = qp_cap_; qp_attr.send_cq = cq; qp_attr.recv_cq = cq; qp_attr.qp_type = IBV_QPT_RC; int err = rdma_create_qp(cm_id_, pd, &qp_attr); if (err) throw InfinibandException("creation of QP failed"); err = rdma_resolve_route(cm_id_, RESOLVE_TIMEOUT_MS); if (err) throw InfinibandException("rdma_resolve_route failed"); setup(pd); }
void IBConnection::create_qp(struct ibv_pd* pd, struct ibv_cq* cq) { struct ibv_qp_init_attr qp_attr; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.cap = qp_cap_; qp_attr.send_cq = cq; qp_attr.recv_cq = cq; qp_attr.qp_type = IBV_QPT_RC; int err = rdma_create_qp(cm_id_, pd, &qp_attr); if (err) throw InfinibandException("creation of QP failed"); }
/// Post a receive work request to the queue pair of this connection.
///
/// Increments total_recv_requests_ after a successful post.
///
/// @param wr  Receive work request handed to ibv_post_recv().
/// @throws InfinibandException if ibv_post_recv() fails.
void IBConnection::post_recv(struct ibv_recv_wr* wr)
{
    struct ibv_recv_wr* rejected_wr;
    const int rc = ibv_post_recv(qp(), wr, &rejected_wr);

    if (rc == 0) {
        ++total_recv_requests_;
        return;
    }

    // The return value is used as the error number for the log message.
    L_(fatal) << "ibv_post_recv failed: " << strerror(rc);
    throw InfinibandException("ibv_post_recv failed");
}
void accept(unsigned short port, unsigned int count) { conn_.resize(count); L_(debug) << "Setting up RDMA CM structures"; // Create rdma id (for listening) int err = rdma_create_id(ec_, &listen_id_, nullptr, RDMA_PS_TCP); if (err) { L_(error) << "rdma_create_id() failed"; throw InfinibandException("id creation failed"); } // Bind rdma id (for listening) to socket address (local port) struct sockaddr_in sin; memset(&sin, 0, sizeof sin); sin.sin_family = AF_INET; #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" sin.sin_port = htons(port); sin.sin_addr.s_addr = INADDR_ANY; #pragma GCC diagnostic pop err = rdma_bind_addr(listen_id_, reinterpret_cast<struct sockaddr*>(&sin)); if (err) { L_(error) << "rdma_bind_addr(port=" << port << ") failed: " << strerror(errno); throw InfinibandException("RDMA bind_addr failed"); } // Listen for connection request on rdma id err = rdma_listen(listen_id_, count); if (err) { L_(error) << "rdma_listen() failed"; throw InfinibandException("RDMA listen failed"); } L_(debug) << "waiting for " << count << " connections"; }
/// The connection manager event handler. void poll_cm_events() { int err; struct rdma_cm_event* event; struct rdma_cm_event event_copy; void* private_data_copy = nullptr; while ((err = rdma_get_cm_event(ec_, &event)) == 0) { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" VALGRIND_MAKE_MEM_DEFINED(event, sizeof(struct rdma_cm_event)); memcpy(&event_copy, event, sizeof(struct rdma_cm_event)); if (event_copy.param.conn.private_data) { VALGRIND_MAKE_MEM_DEFINED( event_copy.param.conn.private_data, event_copy.param.conn.private_data_len); private_data_copy = malloc(event_copy.param.conn.private_data_len); if (!private_data_copy) throw InfinibandException("malloc failed"); memcpy(private_data_copy, event_copy.param.conn.private_data, event_copy.param.conn.private_data_len); event_copy.param.conn.private_data = private_data_copy; } #pragma GCC diagnostic pop rdma_ack_cm_event(event); on_cm_event(&event_copy); if (private_data_copy) { free(private_data_copy); private_data_copy = nullptr; } } if (err == -1 && errno == EAGAIN) return; if (err) throw InfinibandException("rdma_get_cm_event failed"); }
void IBConnection::accept_connect_request() { L_(debug) << "accepting connection"; // Accept rdma connection request auto private_data = get_private_data(); assert(private_data->size() <= 255); struct rdma_conn_param conn_param = rdma_conn_param(); conn_param.responder_resources = 1; conn_param.private_data = private_data->data(); conn_param.private_data_len = static_cast<uint8_t>(private_data->size()); int err = rdma_accept(cm_id_, &conn_param); if (err) throw InfinibandException("RDMA accept failed"); }
void IBConnection::post_send(struct ibv_send_wr* wr) { struct ibv_send_wr* bad_send_wr; int err = ibv_post_send(qp(), wr, &bad_send_wr); if (err) { L_(fatal) << "ibv_post_send failed: " << strerror(err); dump_send_wr(wr); L_(fatal) << "previous send requests: " << total_send_requests_; L_(fatal) << "previous recv requests: " << total_recv_requests_; throw InfinibandException("ibv_post_send failed"); } ++total_send_requests_; while (wr) { for (int i = 0; i < wr->num_sge; ++i) total_bytes_sent_ += wr->sg_list[i].length; wr = wr->next; } }
void IBConnection::on_route_resolved() { L_(debug) << "route resolved"; // Initiate rdma connection auto private_data = get_private_data(); assert(private_data->size() <= 255); struct rdma_conn_param conn_param = rdma_conn_param(); conn_param.initiator_depth = 1; conn_param.retry_count = 7; conn_param.private_data = private_data->data(); conn_param.private_data_len = static_cast<uint8_t>(private_data->size()); // TODO: Hack to prevent connection issues when using softiwarp. std::this_thread::sleep_for(std::chrono::milliseconds(500)); int err = rdma_connect(cm_id_, &conn_param); if (err) { L_(fatal) << "rdma_connect failed: " << strerror(errno); throw InfinibandException("rdma_connect failed"); } }
/// Construct a connection object.
///
/// If `id` is null a new CM id is created on `ec` with this object as its
/// context; otherwise the given id is adopted and its context is pointed at
/// this object. The queue pair capacities are set to 16 send/receive work
/// requests, 8 scatter/gather entries each and no inline data.
///
/// @param ec                       Event channel for a newly created id.
/// @param connection_index         Stored as index_.
/// @param remote_connection_index  Stored as remote_index_.
/// @param id                       Existing CM id to adopt, or null.
/// @throws InfinibandException if rdma_create_id() fails.
IBConnection::IBConnection(struct rdma_event_channel* ec,
                           uint_fast16_t connection_index,
                           uint_fast16_t remote_connection_index,
                           struct rdma_cm_id* id)
    : index_(connection_index), remote_index_(remote_connection_index),
      cm_id_(id)
{
    if (cm_id_) {
        // Adopt the id that was handed in.
        cm_id_->context = this;
    } else if (rdma_create_id(ec, &cm_id_, this, RDMA_PS_TCP) != 0) {
        throw InfinibandException("rdma_create_id failed");
    }

    // Queue pair capacities.
    qp_cap_.max_send_wr = 16;
    qp_cap_.max_send_sge = 8;
    qp_cap_.max_recv_wr = 16;
    qp_cap_.max_recv_sge = 8;
    qp_cap_.max_inline_data = 0;
}
void InputChannelSender::on_completion(const struct ibv_wc& wc) { switch (wc.wr_id & 0xFF) { case ID_WRITE_DESC: { uint64_t ts = wc.wr_id >> 24; int cn = (wc.wr_id >> 8) & 0xFFFF; conn_[cn]->on_complete_write(); uint64_t acked_ts = (acked_desc_ - start_index_desc_) / timeslice_size_; if (ts != acked_ts) { // transmission has been reordered, store completion information ack_.at(ts) = ts; } else { // completion is for earliest pending timeslice, update indices do { ++acked_ts; } while (ack_.at(acked_ts) > ts); acked_desc_ = acked_ts * timeslice_size_ + start_index_desc_; acked_data_ = data_source_.desc_buffer().at(acked_desc_ - 1).offset + data_source_.desc_buffer().at(acked_desc_ - 1).size; if (acked_data_ >= cached_acked_data_ + min_acked_data_ || acked_desc_ >= cached_acked_desc_ + min_acked_desc_) { cached_acked_data_ = acked_data_; cached_acked_desc_ = acked_desc_; data_source_.set_read_index( {cached_acked_desc_, cached_acked_data_}); } } if (false) { L_(trace) << "[i" << input_index_ << "] " << "write timeslice " << ts << " complete, now: acked_data_=" << acked_data_ << " acked_desc_=" << acked_desc_; } } break; case ID_RECEIVE_STATUS: { int cn = wc.wr_id >> 8; conn_[cn]->on_complete_recv(); if (conn_[cn]->request_abort_flag()) { abort_ = true; } if (conn_[cn]->done()) { ++connections_done_; all_done_ = (connections_done_ == conn_.size()); L_(debug) << "[i" << input_index_ << "] " << "ID_RECEIVE_STATUS final for id " << cn << " all_done=" << all_done_; } } break; case ID_SEND_STATUS: { } break; default: L_(error) << "[i" << input_index_ << "] " << "wc for unknown wr_id=" << (wc.wr_id & 0xFF); throw InfinibandException("wc for unknown wr_id"); } }