/*
 * Set up the ibw part of a ctdb node.
 *
 * Allocates a zeroed struct ctdb_ibw_node as a talloc child of the node,
 * creates an ibw connection object for it and stores the new structure
 * in node->private_data.
 *
 * Returns 0 if the connection object was created, -1 otherwise.
 */
static int ctdb_ibw_add_node(struct ctdb_node *node)
{
	struct ibw_ctx *ibw;
	struct ctdb_ibw_node *ibw_node;

	ibw = talloc_get_type(node->ctdb->private_data, struct ibw_ctx);
	ibw_node = talloc_zero(node, struct ctdb_ibw_node);
	assert(ibw_node != NULL);

	ibw_node->conn = ibw_conn_new(ibw, node);

	/* stored even if ibw_conn_new failed */
	node->private_data = (void *)ibw_node;

	if (ibw_node->conn == NULL) {
		return -1;
	}
	return 0;
}
static int ibw_refill_cq_recv(struct ibw_conn *conn) { struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv); struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv); int rc; struct ibv_sge list = { .addr = (uintptr_t) NULL, /* filled below */ .length = pctx->opts.recv_bufsize, .lkey = pconn->mr_recv->lkey /* always the same */ }; struct ibv_recv_wr wr = { .wr_id = 0, /* filled below */ .sg_list = &list, .num_sge = 1, }; struct ibv_recv_wr *bad_wr; DEBUG(DEBUG_DEBUG, ("ibw_refill_cq_recv(cmid: %p)\n", pconn->cm_id)); list.addr = (uintptr_t) pconn->buf_recv + pctx->opts.recv_bufsize * pconn->recv_index; wr.wr_id = pconn->recv_index; pconn->recv_index = (pconn->recv_index + 1) % pctx->opts.max_recv_wr; rc = ibv_post_recv(pconn->cm_id->qp, &wr, &bad_wr); if (rc) { sprintf(ibw_lasterr, "refill/ibv_post_recv failed with %d\n", rc); DEBUG(DEBUG_ERR, (ibw_lasterr)); return -2; } return 0; } static int ibw_fill_cq(struct ibw_conn *conn) { struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv); struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv); int i, rc; struct ibv_sge list = { .addr = (uintptr_t) NULL, /* filled below */ .length = pctx->opts.recv_bufsize, .lkey = pconn->mr_recv->lkey /* always the same */ }; struct ibv_recv_wr wr = { .wr_id = 0, /* filled below */ .sg_list = &list, .num_sge = 1, }; struct ibv_recv_wr *bad_wr; DEBUG(DEBUG_DEBUG, ("ibw_fill_cq(cmid: %p)\n", pconn->cm_id)); for(i = pctx->opts.max_recv_wr; i!=0; i--) { list.addr = (uintptr_t) pconn->buf_recv + pctx->opts.recv_bufsize * pconn->recv_index; wr.wr_id = pconn->recv_index; pconn->recv_index = (pconn->recv_index + 1) % pctx->opts.max_recv_wr; rc = ibv_post_recv(pconn->cm_id->qp, &wr, &bad_wr); if (rc) { sprintf(ibw_lasterr, "fill/ibv_post_recv failed with %d\n", rc); DEBUG(DEBUG_ERR, (ibw_lasterr)); return -2; } } return 0; } static int ibw_manage_connect(struct ibw_conn 
*conn) { struct rdma_conn_param conn_param; struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv); int rc; DEBUG(DEBUG_DEBUG, ("ibw_manage_connect(cmid: %p)\n", pconn->cm_id)); if (ibw_setup_cq_qp(conn)) return -1; /* cm connect */ memset(&conn_param, 0, sizeof conn_param); conn_param.responder_resources = 1; conn_param.initiator_depth = 1; conn_param.retry_count = 10; rc = rdma_connect(pconn->cm_id, &conn_param); if (rc) sprintf(ibw_lasterr, "rdma_connect error %d\n", rc); return rc; } static void ibw_event_handler_cm(struct tevent_context *ev, struct tevent_fd *fde, uint16_t flags, void *private_data) { int rc; struct ibw_ctx *ctx = talloc_get_type(private_data, struct ibw_ctx); struct ibw_ctx_priv *pctx = talloc_get_type(ctx->internal, struct ibw_ctx_priv); struct ibw_conn *conn = NULL; struct ibw_conn_priv *pconn = NULL; struct rdma_cm_id *cma_id = NULL; struct rdma_cm_event *event = NULL; assert(ctx!=NULL); rc = rdma_get_cm_event(pctx->cm_channel, &event); if (rc) { ctx->state = IBWS_ERROR; event = NULL; sprintf(ibw_lasterr, "rdma_get_cm_event error %d\n", rc); goto error; } cma_id = event->id; DEBUG(DEBUG_DEBUG, ("cma_event type %d cma_id %p (%s)\n", event->event, cma_id, (cma_id == pctx->cm_id) ? "parent" : "child")); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: DEBUG(DEBUG_DEBUG, ("RDMA_CM_EVENT_ADDR_RESOLVED\n")); /* continuing from ibw_connect ... 
*/ rc = rdma_resolve_route(cma_id, 2000); if (rc) { sprintf(ibw_lasterr, "rdma_resolve_route error %d\n", rc); goto error; } /* continued at RDMA_CM_EVENT_ROUTE_RESOLVED */ break; case RDMA_CM_EVENT_ROUTE_RESOLVED: DEBUG(DEBUG_DEBUG, ("RDMA_CM_EVENT_ROUTE_RESOLVED\n")); /* after RDMA_CM_EVENT_ADDR_RESOLVED: */ assert(cma_id->context!=NULL); conn = talloc_get_type(cma_id->context, struct ibw_conn); rc = ibw_manage_connect(conn); if (rc) goto error; break; case RDMA_CM_EVENT_CONNECT_REQUEST: DEBUG(DEBUG_DEBUG, ("RDMA_CM_EVENT_CONNECT_REQUEST\n")); ctx->state = IBWS_CONNECT_REQUEST; conn = ibw_conn_new(ctx, ctx); pconn = talloc_get_type(conn->internal, struct ibw_conn_priv); pconn->cm_id = cma_id; /* !!! event will be freed but id not */ cma_id->context = (void *)conn; DEBUG(DEBUG_DEBUG, ("pconn->cm_id %p\n", pconn->cm_id)); if (ibw_setup_cq_qp(conn)) goto error; conn->state = IBWC_INIT; pctx->connstate_func(ctx, conn); /* continued at ibw_accept when invoked by the func above */ if (!pconn->is_accepted) { rc = rdma_reject(cma_id, NULL, 0); if (rc) DEBUG(DEBUG_ERR, ("rdma_reject failed with rc=%d\n", rc)); talloc_free(conn); DEBUG(DEBUG_DEBUG, ("pconn->cm_id %p wasn't accepted\n", pconn->cm_id)); } /* TODO: clarify whether if it's needed by upper layer: */ ctx->state = IBWS_READY; pctx->connstate_func(ctx, NULL); /* NOTE: more requests can arrive until RDMA_CM_EVENT_ESTABLISHED ! 
*/ break; case RDMA_CM_EVENT_ESTABLISHED: /* expected after ibw_accept and ibw_connect[not directly] */ DEBUG(DEBUG_INFO, ("ESTABLISHED (conn: %p)\n", cma_id->context)); conn = talloc_get_type(cma_id->context, struct ibw_conn); assert(conn!=NULL); /* important assumption */ DEBUG(DEBUG_DEBUG, ("ibw_setup_cq_qp succeeded (cmid=%p)\n", cma_id)); /* client conn is up */ conn->state = IBWC_CONNECTED; /* both ctx and conn have changed */ pctx->connstate_func(ctx, conn); break; case RDMA_CM_EVENT_ADDR_ERROR: sprintf(ibw_lasterr, "RDMA_CM_EVENT_ADDR_ERROR, error %d\n", event->status); case RDMA_CM_EVENT_ROUTE_ERROR: sprintf(ibw_lasterr, "RDMA_CM_EVENT_ROUTE_ERROR, error %d\n", event->status); case RDMA_CM_EVENT_CONNECT_ERROR: sprintf(ibw_lasterr, "RDMA_CM_EVENT_CONNECT_ERROR, error %d\n", event->status); case RDMA_CM_EVENT_UNREACHABLE: sprintf(ibw_lasterr, "RDMA_CM_EVENT_UNREACHABLE, error %d\n", event->status); goto error; case RDMA_CM_EVENT_REJECTED: sprintf(ibw_lasterr, "RDMA_CM_EVENT_REJECTED, error %d\n", event->status); DEBUG(DEBUG_INFO, ("cm event handler: %s", ibw_lasterr)); conn = talloc_get_type(cma_id->context, struct ibw_conn); if (conn) { /* must be done BEFORE connstate */ if ((rc=rdma_ack_cm_event(event))) DEBUG(DEBUG_ERR, ("reject/rdma_ack_cm_event failed with %d\n", rc)); event = NULL; /* not to touch cma_id or conn */ conn->state = IBWC_ERROR; /* it should free the conn */ pctx->connstate_func(NULL, conn); } break; /* this is not strictly an error */ case RDMA_CM_EVENT_DISCONNECTED: DEBUG(DEBUG_DEBUG, ("RDMA_CM_EVENT_DISCONNECTED\n")); if ((rc=rdma_ack_cm_event(event))) DEBUG(DEBUG_ERR, ("disc/rdma_ack_cm_event failed with %d\n", rc)); event = NULL; /* don't ack more */ if (cma_id!=pctx->cm_id) { DEBUG(DEBUG_ERR, ("client DISCONNECT event cm_id=%p\n", cma_id)); conn = talloc_get_type(cma_id->context, struct ibw_conn); conn->state = IBWC_DISCONNECTED; pctx->connstate_func(NULL, conn); } break; case RDMA_CM_EVENT_DEVICE_REMOVAL: sprintf(ibw_lasterr, "cma 
detected device removal!\n"); goto error; default: sprintf(ibw_lasterr, "unknown event %d\n", event->event); goto error; } if (event!=NULL && (rc=rdma_ack_cm_event(event))) { sprintf(ibw_lasterr, "rdma_ack_cm_event failed with %d\n", rc); goto error; } return; error: DEBUG(DEBUG_ERR, ("cm event handler: %s", ibw_lasterr)); if (event!=NULL) { if (cma_id!=NULL && cma_id!=pctx->cm_id) { conn = talloc_get_type(cma_id->context, struct ibw_conn); if (conn) { conn->state = IBWC_ERROR; pctx->connstate_func(NULL, conn); } } else { ctx->state = IBWS_ERROR; pctx->connstate_func(ctx, NULL); } if ((rc=rdma_ack_cm_event(event))!=0) { DEBUG(DEBUG_ERR, ("rdma_ack_cm_event failed with %d\n", rc)); } } return; }
int ctdb_ibw_connstate_handler(struct ibw_ctx *ctx, struct ibw_conn *conn) { if (ctx!=NULL) { /* ctx->state changed */ switch(ctx->state) { case IBWS_INIT: /* ctx start - after ibw_init */ break; case IBWS_READY: /* after ibw_bind & ibw_listen */ break; case IBWS_CONNECT_REQUEST: /* after [IBWS_READY + incoming request] */ /* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */ if (ibw_accept(ctx, conn, NULL)) { DEBUG(DEBUG_ERR, ("connstate_handler/ibw_accept failed\n")); return -1; } /* else continue in IBWC_CONNECTED */ break; case IBWS_STOPPED: /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */ /* TODO: have a CTDB upcall for which CTDB should wait in a (final) loop */ break; case IBWS_ERROR: /* abnormal state; ibw_stop must be called after this */ break; default: assert(0); break; } } if (conn!=NULL) { /* conn->state changed */ switch(conn->state) { case IBWC_INIT: /* conn start - internal state */ break; case IBWC_CONNECTED: { /* after ibw_accept or ibw_connect */ struct ctdb_node *node = talloc_get_type(conn->conn_userdata, struct ctdb_node); if (node!=NULL) { /* after ibw_connect */ struct ctdb_ibw_node *cn = talloc_get_type(node->private_data, struct ctdb_ibw_node); node->ctdb->upcalls->node_connected(node); ctdb_flush_cn_queue(cn); } else { /* after ibw_accept */ /* NOP in CTDB case */ } } break; case IBWC_DISCONNECTED: { /* after ibw_disconnect */ struct ctdb_node *node = talloc_get_type(conn->conn_userdata, struct ctdb_node); if (node!=NULL) node->ctdb->upcalls->node_dead(node); talloc_free(conn); /* normal + intended disconnect => not reconnecting in this layer */ } break; case IBWC_ERROR: { struct ctdb_node *node = talloc_get_type(conn->conn_userdata, struct ctdb_node); if (node!=NULL) { struct ctdb_ibw_node *cn = talloc_get_type(node->private_data, struct ctdb_ibw_node); struct ibw_ctx *ictx = cn->conn->ctx; DEBUG(DEBUG_DEBUG, ("IBWC_ERROR, reconnecting...\n")); talloc_free(cn->conn); /* internal queue content is 
destroyed */ cn->conn = (void *)ibw_conn_new(ictx, node); tevent_add_timer(node->ctdb->ev, node, timeval_current_ofs(1, 0), ctdb_ibw_node_connect_event, node); } } break; default: assert(0); break; } }