Exemple #1
0
/// Resolve the server address and start RDMA address resolution.
///
/// Looks up \a hostname / \a service with getaddrinfo() and calls
/// rdma_resolve_addr() on each returned address until one call succeeds.
///
/// \param hostname  host name passed to getaddrinfo()
/// \param service   service name passed to getaddrinfo()
/// \throws InfinibandException if getaddrinfo() fails or if
///         rdma_resolve_addr() fails for every returned address
void IBConnection::connect(const std::string& hostname,
                           const std::string& service)
{
    struct addrinfo hints;
    memset(&hints, 0, sizeof(struct addrinfo));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;
    struct addrinfo* res = nullptr;

    int err = getaddrinfo(hostname.c_str(), service.c_str(), &hints, &res);
    if (err)
        throw InfinibandException("getaddrinfo failed");

    L_(debug) << "[" << index_ << "] "
              << "resolution of server address and route";

    // Try the candidate addresses in order; stop at the first one accepted.
    for (struct addrinfo* t = res; t; t = t->ai_next) {
        err =
            rdma_resolve_addr(cm_id_, nullptr, t->ai_addr, RESOLVE_TIMEOUT_MS);
        if (!err)
            break;
    }

    // Release the address list before a possible throw so that it is not
    // leaked on the error path.
    freeaddrinfo(res);

    if (err)
        throw InfinibandException("rdma_resolve_addr failed");
}
void InputChannelSender::on_addr_resolved(struct rdma_cm_id* id)
{
    IBConnectionGroup<InputChannelConnection>::on_addr_resolved(id);

    if (!mr_data_) {
        // Register memory regions.
        mr_data_ = ibv_reg_mr(
            pd_, const_cast<uint8_t*>(data_source_.data_send_buffer().ptr()),
            data_source_.data_send_buffer().bytes(), IBV_ACCESS_LOCAL_WRITE);
        if (!mr_data_) {
            L_(error) << "ibv_reg_mr failed for mr_data: " << strerror(errno);
            throw InfinibandException("registration of memory region failed");
        }

        mr_desc_ = ibv_reg_mr(pd_, const_cast<fles::MicrosliceDescriptor*>(
                                       data_source_.desc_send_buffer().ptr()),
                              data_source_.desc_send_buffer().bytes(),
                              IBV_ACCESS_LOCAL_WRITE);
        if (!mr_desc_) {
            L_(error) << "ibv_reg_mr failed for mr_desc: " << strerror(errno);
            throw InfinibandException("registration of memory region failed");
        }

        if (true) {
            dump_mr(mr_desc_);
            dump_mr(mr_data_);
        }
    }
}
Exemple #3
0
 /// Connection manager event dispatcher. Called by the CM event loop.
 ///
 /// Routes the event to the handler matching its type. Address, route,
 /// connect and unreachable errors are reported by throwing an
 /// InfinibandException; event types without a handler are logged as a
 /// warning.
 void on_cm_event(struct rdma_cm_event* event)
 {
     const auto event_type = event->event;
     L_(trace) << rdma_event_str(event_type);

     switch (event_type) {
     // Error events: no handler, report by exception.
     case RDMA_CM_EVENT_ADDR_ERROR:
         throw InfinibandException("rdma_resolve_addr failed");
     case RDMA_CM_EVENT_ROUTE_ERROR:
         throw InfinibandException("rdma_resolve_route failed");
     case RDMA_CM_EVENT_CONNECT_ERROR:
         throw InfinibandException("could not establish connection");
     case RDMA_CM_EVENT_UNREACHABLE:
         throw InfinibandException("remote server is not reachable");

     // Events forwarded to a handler.
     case RDMA_CM_EVENT_ADDR_RESOLVED:
         on_addr_resolved(event->id);
         break;
     case RDMA_CM_EVENT_ROUTE_RESOLVED:
         on_route_resolved(event->id);
         break;
     case RDMA_CM_EVENT_REJECTED:
         on_rejected(event);
         break;
     case RDMA_CM_EVENT_ESTABLISHED:
         on_established(event);
         break;
     case RDMA_CM_EVENT_CONNECT_REQUEST:
         on_connect_request(event);
         break;
     case RDMA_CM_EVENT_DISCONNECTED:
         on_disconnected(event);
         break;

     default:
         L_(warning) << rdma_event_str(event_type);
         break;
     }
 }
Exemple #4
0
 /// The IBConnectionGroup default constructor.
 IBConnectionGroup()
 {
     ec_ = rdma_create_event_channel();
     if (!ec_)
         throw InfinibandException("rdma_create_event_channel failed");
     fcntl(ec_->fd, F_SETFL, O_NONBLOCK);
 }
Exemple #5
0
    /// The InfiniBand completion notification handler.
    int poll_completion()
    {
        const int ne_max = 10;

        struct ibv_wc wc[ne_max];
        int ne;
        int ne_total = 0;

        while ((ne = ibv_poll_cq(cq_, ne_max, wc))) {
            if (ne < 0)
                throw InfinibandException("ibv_poll_cq failed");

            ne_total += ne;
            for (int i = 0; i < ne; ++i) {
                if (wc[i].status != IBV_WC_SUCCESS) {
                    std::ostringstream s;
                    s << ibv_wc_status_str(wc[i].status) << " for wr_id "
                      << static_cast<int>(wc[i].wr_id);
                    L_(error) << s.str();

                    continue;
                }

                on_completion(wc[i]);
            }
        }

        return ne_total;
    }
Exemple #6
0
    /// Initialize the InfiniBand verbs context.
    ///
    /// Stores \a context, then allocates a protection domain, creates a
    /// completion queue with num_cqe_ entries and requests completion
    /// notification on it.
    ///
    /// \throws InfinibandException if any of the verbs calls fails
    void init_context(struct ibv_context* context)
    {
        context_ = context;

        L_(debug) << "create verbs objects";

        pd_ = ibv_alloc_pd(context);
        if (pd_ == nullptr) {
            throw InfinibandException("ibv_alloc_pd failed");
        }

        cq_ = ibv_create_cq(context, num_cqe_, nullptr, nullptr, 0);
        if (cq_ == nullptr) {
            throw InfinibandException("ibv_create_cq failed");
        }

        const int notify_rc = ibv_req_notify_cq(cq_, 0);
        if (notify_rc != 0) {
            throw InfinibandException("ibv_req_notify_cq failed");
        }
    }
Exemple #7
0
/// Disconnect this connection via rdma_disconnect().
///
/// \throws InfinibandException if rdma_disconnect() reports an error
void IBConnection::disconnect()
{
    L_(debug) << "[" << index_ << "] "
              << "disconnect";

    if (rdma_disconnect(cm_id_) != 0) {
        throw InfinibandException("rdma_disconnect failed");
    }
}
Exemple #8
0
void IBConnection::on_addr_resolved(struct ibv_pd* pd, struct ibv_cq* cq)
{
    L_(debug) << "address resolved";

    struct ibv_qp_init_attr qp_attr;
    memset(&qp_attr, 0, sizeof qp_attr);
    qp_attr.cap = qp_cap_;
    qp_attr.send_cq = cq;
    qp_attr.recv_cq = cq;
    qp_attr.qp_type = IBV_QPT_RC;
    int err = rdma_create_qp(cm_id_, pd, &qp_attr);
    if (err)
        throw InfinibandException("creation of QP failed");

    err = rdma_resolve_route(cm_id_, RESOLVE_TIMEOUT_MS);
    if (err)
        throw InfinibandException("rdma_resolve_route failed");

    setup(pd);
}
Exemple #9
0
void IBConnection::create_qp(struct ibv_pd* pd, struct ibv_cq* cq)
{
    struct ibv_qp_init_attr qp_attr;
    memset(&qp_attr, 0, sizeof qp_attr);
    qp_attr.cap = qp_cap_;
    qp_attr.send_cq = cq;
    qp_attr.recv_cq = cq;
    qp_attr.qp_type = IBV_QPT_RC;
    int err = rdma_create_qp(cm_id_, pd, &qp_attr);
    if (err)
        throw InfinibandException("creation of QP failed");
}
Exemple #10
0
/// Post a receive work request to the queue pair.
///
/// Increments the receive request counter on success.
///
/// \param wr  receive work request passed to ibv_post_recv()
/// \throws InfinibandException if ibv_post_recv() fails
void IBConnection::post_recv(struct ibv_recv_wr* wr)
{
    struct ibv_recv_wr* failed_wr;

    const int rc = ibv_post_recv(qp(), wr, &failed_wr);
    if (rc != 0) {
        // ibv_post_recv's return value is used as the error code here.
        L_(fatal) << "ibv_post_recv failed: " << strerror(rc);
        throw InfinibandException("ibv_post_recv failed");
    }

    ++total_recv_requests_;
}
Exemple #11
0
    void accept(unsigned short port, unsigned int count)
    {
        conn_.resize(count);

        L_(debug) << "Setting up RDMA CM structures";

        // Create rdma id (for listening)
        int err = rdma_create_id(ec_, &listen_id_, nullptr, RDMA_PS_TCP);
        if (err) {
            L_(error) << "rdma_create_id() failed";
            throw InfinibandException("id creation failed");
        }

        // Bind rdma id (for listening) to socket address (local port)
        struct sockaddr_in sin;
        memset(&sin, 0, sizeof sin);
        sin.sin_family = AF_INET;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
        sin.sin_port = htons(port);
        sin.sin_addr.s_addr = INADDR_ANY;
#pragma GCC diagnostic pop
        err = rdma_bind_addr(listen_id_,
                             reinterpret_cast<struct sockaddr*>(&sin));
        if (err) {
            L_(error) << "rdma_bind_addr(port=" << port
                      << ") failed: " << strerror(errno);
            throw InfinibandException("RDMA bind_addr failed");
        }

        // Listen for connection request on rdma id
        err = rdma_listen(listen_id_, count);
        if (err) {
            L_(error) << "rdma_listen() failed";
            throw InfinibandException("RDMA listen failed");
        }

        L_(debug) << "waiting for " << count << " connections";
    }
Exemple #12
0
    /// The connection manager event handler.
    ///
    /// Drains all events currently available on the event channel. Each
    /// event is copied (together with its private data, if present) and
    /// acknowledged before the copy is passed to on_cm_event(), so the
    /// handler never touches memory owned by the acknowledged event.
    ///
    /// Returns normally when rdma_get_cm_event() fails with EAGAIN.
    ///
    /// \throws InfinibandException if the private data copy cannot be
    ///         allocated or rdma_get_cm_event() fails with another error;
    ///         exceptions thrown by on_cm_event() are propagated
    void poll_cm_events()
    {
        int err;
        struct rdma_cm_event* event;
        struct rdma_cm_event event_copy;
        void* private_data_copy = nullptr;

        while ((err = rdma_get_cm_event(ec_, &event)) == 0) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
            VALGRIND_MAKE_MEM_DEFINED(event, sizeof(struct rdma_cm_event));
            memcpy(&event_copy, event, sizeof(struct rdma_cm_event));
            if (event_copy.param.conn.private_data) {
                VALGRIND_MAKE_MEM_DEFINED(
                    event_copy.param.conn.private_data,
                    event_copy.param.conn.private_data_len);
                private_data_copy =
                    malloc(event_copy.param.conn.private_data_len);
                if (!private_data_copy) {
                    // Acknowledge before bailing out so the event itself
                    // is not leaked.
                    rdma_ack_cm_event(event);
                    throw InfinibandException("malloc failed");
                }
                memcpy(private_data_copy, event_copy.param.conn.private_data,
                       event_copy.param.conn.private_data_len);
                // Redirect the copy to the buffer owned by this function.
                event_copy.param.conn.private_data = private_data_copy;
            }
#pragma GCC diagnostic pop
            rdma_ack_cm_event(event);

            // on_cm_event() may throw; make sure the private data copy is
            // released on that path as well.
            try {
                on_cm_event(&event_copy);
            } catch (...) {
                free(private_data_copy);
                throw;
            }
            free(private_data_copy);
            private_data_copy = nullptr;
        }
        if (err == -1 && errno == EAGAIN)
            return;
        if (err)
            throw InfinibandException("rdma_get_cm_event failed");
    }
Exemple #13
0
void IBConnection::accept_connect_request()
{
    L_(debug) << "accepting connection";

    // Accept rdma connection request
    auto private_data = get_private_data();
    assert(private_data->size() <= 255);

    struct rdma_conn_param conn_param = rdma_conn_param();
    conn_param.responder_resources = 1;
    conn_param.private_data = private_data->data();
    conn_param.private_data_len = static_cast<uint8_t>(private_data->size());
    int err = rdma_accept(cm_id_, &conn_param);
    if (err)
        throw InfinibandException("RDMA accept failed");
}
Exemple #14
0
void IBConnection::post_send(struct ibv_send_wr* wr)
{
    struct ibv_send_wr* bad_send_wr;

    int err = ibv_post_send(qp(), wr, &bad_send_wr);
    if (err) {
        L_(fatal) << "ibv_post_send failed: " << strerror(err);
        dump_send_wr(wr);
        L_(fatal) << "previous send requests: " << total_send_requests_;
        L_(fatal) << "previous recv requests: " << total_recv_requests_;
        throw InfinibandException("ibv_post_send failed");
    }

    ++total_send_requests_;

    while (wr) {
        for (int i = 0; i < wr->num_sge; ++i)
            total_bytes_sent_ += wr->sg_list[i].length;
        wr = wr->next;
    }
}
Exemple #15
0
void IBConnection::on_route_resolved()
{
    L_(debug) << "route resolved";

    // Initiate rdma connection
    auto private_data = get_private_data();
    assert(private_data->size() <= 255);

    struct rdma_conn_param conn_param = rdma_conn_param();
    conn_param.initiator_depth = 1;
    conn_param.retry_count = 7;
    conn_param.private_data = private_data->data();
    conn_param.private_data_len = static_cast<uint8_t>(private_data->size());
    // TODO: Hack to prevent connection issues when using softiwarp.
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
    int err = rdma_connect(cm_id_, &conn_param);
    if (err) {
        L_(fatal) << "rdma_connect failed: " << strerror(errno);
        throw InfinibandException("rdma_connect failed");
    }
}
Exemple #16
0
/// Construct a connection object.
///
/// If \a id is non-null it is adopted and its context is pointed at this
/// object; otherwise a new RDMA id is created on \a ec with this object as
/// its context. The queue pair capabilities are then set to their initial
/// values.
///
/// \param ec                       event channel used when a new id is created
/// \param connection_index         stored as index_
/// \param remote_connection_index  stored as remote_index_
/// \param id                       existing RDMA id, or null to create one
/// \throws InfinibandException if rdma_create_id() fails
IBConnection::IBConnection(struct rdma_event_channel* ec,
                           uint_fast16_t connection_index,
                           uint_fast16_t remote_connection_index,
                           struct rdma_cm_id* id)
    : index_(connection_index), remote_index_(remote_connection_index),
      cm_id_(id)
{
    if (cm_id_) {
        // Adopt the id handed in by the caller.
        cm_id_->context = this;
    } else if (rdma_create_id(ec, &cm_id_, this, RDMA_PS_TCP) != 0) {
        throw InfinibandException("rdma_create_id failed");
    }

    // Initial queue pair capabilities.
    qp_cap_.max_send_wr = 16;
    qp_cap_.max_send_sge = 8;
    qp_cap_.max_recv_wr = 16;
    qp_cap_.max_recv_sge = 8;
    qp_cap_.max_inline_data = 0;
}
/// Handle a single work completion.
///
/// The low 8 bits of wc.wr_id select the request type; the remaining bits
/// carry the connection index and, for descriptor writes, the timeslice
/// number.
///
/// \throws InfinibandException if the request type is not one of the
///         handled ids
void InputChannelSender::on_completion(const struct ibv_wc& wc)
{
    switch (wc.wr_id & 0xFF) {
    case ID_WRITE_DESC: {
        // wr_id layout for this case: bits 24 and up hold the timeslice
        // number, bits 8..23 the connection index.
        uint64_t ts = wc.wr_id >> 24;

        int cn = (wc.wr_id >> 8) & 0xFFFF;
        conn_[cn]->on_complete_write();

        // Timeslice number corresponding to the current acked_desc_.
        uint64_t acked_ts = (acked_desc_ - start_index_desc_) / timeslice_size_;
        if (ts != acked_ts) {
            // transmission has been reordered, store completion information
            ack_.at(ts) = ts;
        } else {
            // completion is for earliest pending timeslice, update indices
            // Advance acked_ts at least once, and keep advancing while the
            // entry stored in ack_ for it is greater than ts.
            // NOTE(review): presumably this skips over timeslices whose
            // completion was recorded out of order above - confirm against
            // the definition of ack_.
            do {
                ++acked_ts;
            } while (ack_.at(acked_ts) > ts);
            acked_desc_ = acked_ts * timeslice_size_ + start_index_desc_;
            // acked_data_ is offset + size of the last acknowledged
            // descriptor.
            acked_data_ =
                data_source_.desc_buffer().at(acked_desc_ - 1).offset +
                data_source_.desc_buffer().at(acked_desc_ - 1).size;
            // Only update the data source's read index once the data or the
            // descriptor index has advanced by at least the minimum amount
            // since the last update.
            if (acked_data_ >= cached_acked_data_ + min_acked_data_ ||
                acked_desc_ >= cached_acked_desc_ + min_acked_desc_) {
                cached_acked_data_ = acked_data_;
                cached_acked_desc_ = acked_desc_;
                data_source_.set_read_index(
                    {cached_acked_desc_, cached_acked_data_});
            }
        }
        // Trace output, currently disabled by the constant condition.
        if (false) {
            L_(trace) << "[i" << input_index_ << "] "
                      << "write timeslice " << ts
                      << " complete, now: acked_data_=" << acked_data_
                      << " acked_desc_=" << acked_desc_;
        }
    } break;

    case ID_RECEIVE_STATUS: {
        // For this case all bits above the low 8 are the connection index.
        int cn = wc.wr_id >> 8;
        conn_[cn]->on_complete_recv();
        if (conn_[cn]->request_abort_flag()) {
            abort_ = true;
        }
        if (conn_[cn]->done()) {
            // all_done_ becomes true once every connection reported done.
            ++connections_done_;
            all_done_ = (connections_done_ == conn_.size());
            L_(debug) << "[i" << input_index_ << "] "
                      << "ID_RECEIVE_STATUS final for id " << cn
                      << " all_done=" << all_done_;
        }
    } break;

    case ID_SEND_STATUS: {
        // Nothing to do for send status completions.
    } break;

    default:
        L_(error) << "[i" << input_index_ << "] "
                  << "wc for unknown wr_id=" << (wc.wr_id & 0xFF);
        throw InfinibandException("wc for unknown wr_id");
    }
}