/*
 * rpmem_fip_init_memory -- (internal) initialize common memory resources
 */
static int
rpmem_fip_init_memory(struct rpmem_fip *fip)
{
	ASSERTne(Pagesize, 0);
	int ret;

	/*
	 * Register local memory space. The local memory will be used
	 * with WRITE operation in rpmem_fip_persist function thus
	 * the FI_WRITE access flag.
	 */
	ret = fi_mr_reg(fip->domain, fip->laddr, fip->size,
			FI_WRITE, 0, 0, 0, &fip->mr, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "registering memory");
		return ret;
	}

	/* get local memory descriptor */
	fip->mr_desc = fi_mr_desc(fip->mr);

	/* allocate buffer for read operation */
	ASSERT(IS_PAGE_ALIGNED(RPMEM_RD_BUFF_SIZE));
	errno = posix_memalign((void **)&fip->rd_buff, Pagesize,
			RPMEM_RD_BUFF_SIZE);
	if (errno) {
		RPMEM_LOG(ERR, "!allocating read buffer");
		ret = -1;
		goto err_malloc_rd_buff;
	}

	/*
	 * Register buffer for read operation. The buffer is used as the
	 * local result buffer of READ operations initiated by the local
	 * endpoint thus the FI_READ access flag.
	 */
	ret = fi_mr_reg(fip->domain, fip->rd_buff,
			RPMEM_RD_BUFF_SIZE, FI_READ,
			0, 0, 0, &fip->rd_mr, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "registering read buffer");
		goto err_rd_mr;
	}

	/* get read buffer local memory descriptor */
	fip->rd_mr_desc = fi_mr_desc(fip->rd_mr);

	return 0;
err_rd_mr:
	free(fip->rd_buff);
err_malloc_rd_buff:
	RPMEM_FI_CLOSE(fip->mr, "unregistering memory");
	return ret;
}
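/*
 * Illustration (a sketch, not part of the build): the descriptor obtained
 * from fi_mr_desc() above is what later feeds the 'desc' field of the RMA
 * message passed to fi_writemsg(3). Assuming a hypothetical single-iovec
 * transfer with a lane pointer as the completion context, the wiring looks
 * roughly like this:
 *
 *	struct iovec iov = { .iov_base = laddr, .iov_len = len };
 *	struct fi_rma_iov rma_iov = {
 *		.addr = raddr,	// remote virtual address
 *		.len = len,
 *		.key = rkey,	// remote memory region key
 *	};
 *	void *desc = fip->mr_desc;
 *	struct fi_msg_rma msg = {
 *		.msg_iov = &iov,
 *		.desc = &desc,
 *		.iov_count = 1,
 *		.rma_iov = &rma_iov,
 *		.rma_iov_count = 1,
 *		.context = lane,	// hypothetical completion context
 *	};
 *	(void)fi_writemsg(ep, &msg, 0);
 */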
/*
 * rpmem_fip_persist_gpspm -- (internal) perform persist operation for GPSPM
 */
static int
rpmem_fip_persist_gpspm(struct rpmem_fip *fip, size_t offset,
	size_t len, unsigned lane)
{
	int ret;
	struct rpmem_fip_plane_gpspm *lanep = &fip->lanes.gpspm[lane];

	/* make sure the SEND buffer is not used by a previous persist */
	ret = rpmem_fip_lane_wait(&lanep->lane, FI_SEND);
	if (unlikely(ret)) {
		RPMEM_LOG(ERR, "waiting for SEND buffer failed");
		return ret;
	}

	RPMEM_ASSERT(!rpmem_fip_lane_busy(&lanep->lane));

	rpmem_fip_lane_begin(&lanep->lane, FI_SEND | FI_RECV);

	void *laddr = (void *)((uintptr_t)fip->laddr + offset);
	uint64_t raddr = fip->raddr + offset;
	struct rpmem_msg_persist *msg;

	/* WRITE for requested memory region */
	ret = rpmem_fip_writemsg(fip->ep, &lanep->write, laddr, len, raddr);
	if (unlikely(ret)) {
		RPMEM_FI_ERR(ret, "RMA write");
		return ret;
	}

	/* SEND persist message */
	msg = rpmem_fip_msg_get_pmsg(&lanep->send);
	msg->lane = lane;
	msg->addr = raddr;
	msg->size = len;

	ret = rpmem_fip_sendmsg(fip->ep, &lanep->send);
	if (unlikely(ret)) {
		RPMEM_FI_ERR(ret, "MSG send");
		return ret;
	}

	/* wait for persist operation completion */
	ret = rpmem_fip_lane_wait(&lanep->lane, FI_RECV);
	if (unlikely(ret)) {
		RPMEM_LOG(ERR, "persist operation failed");
		return ret;
	}

	return ret;
}
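/*
 * A rough picture of the GPSPM exchange above (illustrative only; the
 * server-side flush is implied by the protocol, the server being expected
 * to make the written region durable before replying):
 *
 *	client				server
 *	  |-- RMA WRITE (data) ------->|
 *	  |-- MSG SEND (persist msg) ->|  flush/persist the region
 *	  |<---- MSG (response) -------|
 *
 * The WRITE itself is posted without a local completion event; only the
 * SEND and the RECV of the matching response are tracked, which is why
 * the lane is started with FI_SEND | FI_RECV and waited on with FI_RECV.
 */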
/*
 * rpmem_fip_init_fabric_res -- (internal) initialize common fabric resources
 */
static int
rpmem_fip_init_fabric_res(struct rpmem_fip *fip)
{
	int ret;

	ret = fi_fabric(fip->fi->fabric_attr, &fip->fabric, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "opening fabric domain");
		goto err_fi_fabric;
	}

	ret = fi_domain(fip->fabric, fip->fi, &fip->domain, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "opening fabric access domain");
		goto err_fi_domain;
	}

	struct fi_eq_attr eq_attr = {
		.size = 0, /* use default value */
		.flags = 0,
		.wait_obj = FI_WAIT_UNSPEC,
		.signaling_vector = 0,
		.wait_set = NULL,
	};

	ret = fi_eq_open(fip->fabric, &eq_attr, &fip->eq, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "opening event queue");
		goto err_eq_open;
	}

	return 0;
err_eq_open:
	RPMEM_FI_CLOSE(fip->domain, "closing fabric access domain");
err_fi_domain:
	RPMEM_FI_CLOSE(fip->fabric, "closing fabric domain");
err_fi_fabric:
	return ret;
}

/*
 * rpmem_fip_fini_fabric_res -- (internal) deinitialize common fabric resources
 */
static void
rpmem_fip_fini_fabric_res(struct rpmem_fip *fip)
{
	RPMEM_FI_CLOSE(fip->eq, "closing event queue");
	RPMEM_FI_CLOSE(fip->domain, "closing fabric access domain");
	RPMEM_FI_CLOSE(fip->fabric, "closing fabric domain");
}
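/*
 * Ordering note (a sketch of the invariant, not extra functionality):
 * libfabric objects form a hierarchy -- the access domain and the event
 * queue are both opened within the fabric -- so teardown must release
 * them in reverse order of creation, which is exactly what
 * rpmem_fip_fini_fabric_res() does:
 *
 *	fi_eq_open()  <->  fi_close(&eq->fid)     (opened last, closed first)
 *	fi_domain()   <->  fi_close(&domain->fid)
 *	fi_fabric()   <->  fi_close(&fabric->fid)
 */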
/*
 * rpmem_fip_getinfo -- (internal) get fabric interface information
 */
static int
rpmem_fip_getinfo(struct rpmem_fip *fip, const char *node,
	const char *service, enum rpmem_provider provider)
{
	int ret;

	struct fi_info *hints = rpmem_fip_get_hints(provider);
	if (!hints) {
		RPMEM_LOG(ERR, "!getting fabric interface information hints");
		ret = -1;
		goto err_hints;
	}

	ret = fi_getinfo(RPMEM_FIVERSION, node, service,
			0, hints, &fip->fi);
	if (ret) {
		RPMEM_FI_ERR(ret, "getting fabric interface information");
		goto err_fi_getinfo;
	}

	rpmem_fip_print_info(fip->fi);

	/* fall through to free the hints */
err_fi_getinfo:
	fi_freeinfo(hints);
err_hints:
	return ret;
}
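/*
 * For reference (a sketch under assumptions, since rpmem_fip_get_hints()
 * is defined elsewhere): hints for a provider typically narrow the search
 * down to a reliable, connection-oriented endpoint with RMA capabilities,
 * roughly along these lines:
 *
 *	struct fi_info *hints = fi_allocinfo();
 *	hints->ep_attr->type = FI_EP_MSG;	// connected endpoint
 *	hints->caps = FI_MSG | FI_RMA;		// messaging + RMA
 *	hints->mode = FI_CONTEXT;		// caller provides contexts
 *
 * fi_getinfo(3) then returns only interfaces matching these constraints.
 */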
/*
 * rpmem_fip_gpspm_post_resp -- (internal) post persist response message buffer
 */
static inline int
rpmem_fip_gpspm_post_resp(struct rpmem_fip *fip,
	struct rpmem_fip_msg *resp)
{
	int ret = rpmem_fip_recvmsg(fip->ep, resp);
	if (unlikely(ret)) {
		RPMEM_FI_ERR(ret, "posting GPSPM recv buffer");
		return ret;
	}

	return 0;
}
/*
 * rpmem_fip_persist_apm -- (internal) perform persist operation for APM
 */
static int
rpmem_fip_persist_apm(struct rpmem_fip *fip, size_t offset,
	size_t len, unsigned lane)
{
	struct rpmem_fip_plane_apm *lanep = &fip->lanes.apm[lane];

	RPMEM_ASSERT(!rpmem_fip_lane_busy(&lanep->lane));

	rpmem_fip_lane_begin(&lanep->lane, FI_READ);

	int ret;
	void *laddr = (void *)((uintptr_t)fip->laddr + offset);
	uint64_t raddr = fip->raddr + offset;

	/* WRITE for requested memory region */
	ret = rpmem_fip_writemsg(fip->ep, &lanep->write, laddr, len, raddr);
	if (unlikely(ret)) {
		RPMEM_FI_ERR(ret, "RMA write");
		return ret;
	}

	/* READ to read-after-write buffer */
	ret = rpmem_fip_readmsg(fip->ep, &lanep->read, &fip->raw_buff,
			sizeof(fip->raw_buff), raddr);
	if (unlikely(ret)) {
		RPMEM_FI_ERR(ret, "RMA read");
		return ret;
	}

	/* wait for READ completion */
	ret = rpmem_fip_lane_wait(&lanep->lane, FI_READ);
	if (unlikely(ret)) {
		RPMEM_LOG(ERR, "waiting for READ completion failed");
		return ret;
	}

	return ret;
}
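/*
 * Why the extra READ (illustrative note): in APM the remote platform is
 * assumed to make written data durable automatically, so no remote
 * software takes part in the persist. The small RMA READ issued after the
 * WRITE relies on the transport's ordering rule (as on InfiniBand reliable
 * connections) that a READ response cannot overtake earlier WRITEs on the
 * same connection, so waiting for the READ completion acts as a flush of
 * the preceding WRITE:
 *
 *	client				server
 *	  |-- RMA WRITE (data) ------->|
 *	  |-- RMA READ (raw_buff) ---->|
 *	  |<---- READ completion ------|  prior WRITE has been placed
 */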
/*
 * rpmem_fip_connect -- connect to remote peer
 */
int
rpmem_fip_connect(struct rpmem_fip *fip)
{
	int ret;
	struct fi_eq_cm_entry entry;

	ret = rpmem_fip_init_cq(fip);
	if (ret)
		goto err_init_cq;

	ret = rpmem_fip_init_ep(fip);
	if (ret)
		goto err_init_ep;

	ret = fip->ops->lanes_post(fip);
	if (ret)
		goto err_lanes_post;

	ret = fi_connect(fip->ep, fip->fi->dest_addr, NULL, 0);
	if (ret) {
		RPMEM_FI_ERR(ret, "initiating connection request");
		goto err_fi_connect;
	}

	ret = rpmem_fip_read_eq(fip->eq, &entry, FI_CONNECTED,
			&fip->ep->fid, -1);
	if (ret)
		goto err_fi_eq_read;

	return 0;
err_fi_eq_read:
err_fi_connect:
err_lanes_post:
	rpmem_fip_fini_ep(fip);
err_init_ep:
	rpmem_fip_fini_cq(fip);
err_init_cq:
	return ret;
}
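/*
 * For orientation, a minimal sketch of the expected call order, pieced
 * together from the functions in this file (error handling omitted; the
 * lanes_init and persist entries of the ops vtable are an assumption
 * suggested by the internal lane-init and persist functions above):
 *
 *	rpmem_fip_getinfo(fip, node, service, provider);
 *	rpmem_fip_init_fabric_res(fip);	// fabric, domain, event queue
 *	rpmem_fip_init_memory(fip);	// local memory registrations
 *	fip->ops->lanes_init(fip);	// GPSPM or APM lanes
 *	rpmem_fip_connect(fip);		// CQ, endpoint, CM handshake
 *	...
 *	fip->ops->persist(fip, offset, len, lane);
 *	...
 *	rpmem_fip_close(fip);
 *	rpmem_fip_fini_fabric_res(fip);
 *
 * Note that rpmem_fip_connect() posts the lanes' RECV buffers before
 * calling fi_connect(3), so no incoming message can arrive before a
 * buffer is ready to receive it.
 */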
/*
 * rpmem_fip_close -- close connection to remote peer
 */
int
rpmem_fip_close(struct rpmem_fip *fip)
{
	int ret;
	int lret = 0;

	ret = fi_shutdown(fip->ep, 0);
	if (ret) {
		RPMEM_FI_ERR(ret, "disconnecting endpoint");
		lret = ret;
	}

	/* wait for the FI_SHUTDOWN event before tearing down resources */
	struct fi_eq_cm_entry entry;
	ret = rpmem_fip_read_eq(fip->eq, &entry, FI_SHUTDOWN,
			&fip->ep->fid, -1);
	if (ret)
		lret = ret;

	ret = rpmem_fip_fini_ep(fip);
	if (ret)
		lret = ret;

	ret = rpmem_fip_fini_cq(fip);
	if (ret)
		lret = ret;

	return lret;
}
/*
 * rpmem_fip_process_gpspm -- (internal) process completion queue entry for
 * GPSPM
 */
static int
rpmem_fip_process_gpspm(struct rpmem_fip *fip, void *context, uint64_t flags)
{
	if (flags & FI_RECV) {
		/* RECV completion */
		struct rpmem_fip_msg *resp = context;
		struct rpmem_msg_persist_resp *msg_resp =
			rpmem_fip_msg_get_pres(resp);
		VALGRIND_DO_MAKE_MEM_DEFINED(msg_resp, sizeof(*msg_resp));

		if (unlikely(msg_resp->lane >= fip->nlanes)) {
			RPMEM_LOG(ERR, "lane number received (%lu) is greater "
					"than maximum lane number (%u)",
					msg_resp->lane, fip->nlanes - 1);
			return -1;
		}

		struct rpmem_fip_lane *lanep =
			&fip->lanes.gpspm[msg_resp->lane].lane;

		/*
		 * Post the RECV buffer immediately so it is ready for the
		 * next response. Posting errors are already logged by
		 * rpmem_fip_gpspm_post_resp itself.
		 */
		int ret = rpmem_fip_gpspm_post_resp(fip, resp);

		rpmem_fip_lane_sigret(lanep, flags, ret);

		return ret;
	}

	/* SEND completion */
	struct rpmem_fip_lane *lanep = context;
	rpmem_fip_lane_signal(lanep, flags);

	return 0;
}
/*
 * rpmem_fip_process -- (internal) process completion events
 */
static int
rpmem_fip_process(struct rpmem_fip *fip)
{
	ssize_t sret;
	struct fi_cq_err_entry err;
	const char *str_err;
	int ret;
	struct fi_cq_msg_entry *cq_entries;

	cq_entries = malloc(fip->cq_size * sizeof(*cq_entries));
	if (!cq_entries) {
		RPMEM_LOG(ERR, "!allocating completion queue buffer");
		return -1;
	}

	while (!fip->closing) {
		sret = fi_cq_sread(fip->cq, cq_entries, fip->cq_size,
				NULL, RPMEM_FIP_CQ_WAIT_MS);

		if (unlikely(fip->closing))
			break;

		if (unlikely(sret == -FI_EAGAIN))
			continue;

		if (unlikely(sret < 0)) {
			ret = (int)sret;
			goto err_cq_read;
		}

		for (ssize_t i = 0; i < sret; i++) {
			struct fi_cq_msg_entry *comp = &cq_entries[i];

			/*
			 * If the context is NULL it probably means that
			 * we got an unexpected CQ entry. The CQ is configured
			 * with FI_SELECTIVE_COMPLETION so every inbound or
			 * outbound operation must be issued with the
			 * FI_COMPLETION flag and a non-NULL context.
			 */
			RPMEM_ASSERT(comp->op_context);

			/* read operation */
			if (unlikely(comp->op_context == &fip->rd_lane)) {
				rpmem_fip_lane_signal(&fip->rd_lane.lane,
						FI_READ);
				continue;
			}

			/* persist operation */
			ret = fip->ops->process(fip, comp->op_context,
					comp->flags);
			if (unlikely(ret)) {
				RPMEM_LOG(ERR, "persist operation failed");
				goto err;
			}
		}
	}

	free(cq_entries);
	return 0;
err_cq_read:
	sret = fi_cq_readerr(fip->cq, &err, 0);
	if (sret < 0) {
		RPMEM_FI_ERR((int)sret, "error reading from completion queue: "
				"cannot read error from event queue");
		goto err;
	}

	str_err = fi_cq_strerror(fip->cq, err.prov_errno, NULL, NULL, 0);
	RPMEM_LOG(ERR, "error reading from completion queue: %s", str_err);
err:
	rpmem_fip_signal_all(fip, ret);
	free(cq_entries);
	return ret;
}
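/*
 * Note on FI_SELECTIVE_COMPLETION (illustrative): because the CQ is bound
 * to the endpoint with FI_SELECTIVE_COMPLETION (see rpmem_fip_init_ep),
 * an operation produces an entry in the loop above only when it was
 * posted with the FI_COMPLETION flag and a context, e.g.:
 *
 *	fi_writemsg(ep, &msg, FI_COMPLETION);	// completion delivered
 *	fi_writemsg(ep, &msg, 0);		// no CQ entry on success
 *
 * This is how the WRITEs in both persist paths stay silent while the
 * SEND/RECV (GPSPM) and READ (APM) operations are tracked.
 */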
/*
 * rpmem_fip_init_lanes_gpspm -- (internal) initialize lanes for GPSPM
 */
static int
rpmem_fip_init_lanes_gpspm(struct rpmem_fip *fip)
{
	int ret = 0;

	/* allocate GPSPM lanes */
	fip->lanes.gpspm = calloc(1, fip->nlanes * sizeof(*fip->lanes.gpspm));
	if (!fip->lanes.gpspm) {
		RPMEM_LOG(ERR, "!allocating GPSPM lanes");
		ret = -1;
		goto err_malloc_lanes;
	}

	/* allocate persist messages buffer */
	size_t msg_size = fip->nlanes * sizeof(struct rpmem_msg_persist);
	fip->pmsg = malloc(msg_size);
	if (!fip->pmsg) {
		RPMEM_LOG(ERR, "!allocating messages buffer");
		ret = -1;
		goto err_malloc_pmsg;
	}

	/*
	 * Register persist messages buffer. The persist messages
	 * are sent to the daemon thus the FI_SEND access flag.
	 */
	ret = fi_mr_reg(fip->domain, fip->pmsg, msg_size, FI_SEND,
			0, 0, 0, &fip->pmsg_mr, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "registering messages buffer");
		goto err_fi_mr_reg_pmsg;
	}

	/* get persist messages buffer local descriptor */
	fip->pmsg_mr_desc = fi_mr_desc(fip->pmsg_mr);

	/* allocate persist response messages buffer */
	size_t msg_resp_size = fip->nlanes *
		sizeof(struct rpmem_msg_persist_resp);
	fip->pres = malloc(msg_resp_size);
	if (!fip->pres) {
		RPMEM_LOG(ERR, "!allocating messages response buffer");
		ret = -1;
		goto err_malloc_pres;
	}

	/*
	 * Register persist messages response buffer. The persist response
	 * messages are received from the daemon thus the FI_RECV access flag.
	 */
	ret = fi_mr_reg(fip->domain, fip->pres, msg_resp_size, FI_RECV,
			0, 0, 0, &fip->pres_mr, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "registering messages response buffer");
		goto err_fi_mr_reg_pres;
	}

	/* get persist response messages buffer local descriptor */
	fip->pres_mr_desc = fi_mr_desc(fip->pres_mr);

	/* allocate RECV structures for fi_recvmsg(3) */
	fip->recv = malloc(fip->nlanes * sizeof(*fip->recv));
	if (!fip->recv) {
		RPMEM_LOG(ERR, "!allocating response message iov buffer");
		ret = -1;
		goto err_malloc_recv;
	}

	/*
	 * Initialize all required structures for:
	 * WRITE, SEND and RECV operations.
	 *
	 * If a completion is required the FI_COMPLETION flag and
	 * an appropriate context should be used.
	 *
	 * In GPSPM only the RECV and SEND completions are required.
	 *
	 * For RECV the context is the RECV operation structure used for
	 * the fi_recvmsg(3) function call.
	 *
	 * For SEND the context is the lane structure.
	 *
	 * The received buffer contains a lane id which is used
	 * to obtain the lane which must be signaled that the operation
	 * has been completed.
	 */
	unsigned i;
	for (i = 0; i < fip->nlanes; i++) {
		ret = rpmem_fip_lane_init(&fip->lanes.gpspm[i].lane);
		if (ret)
			goto err_lane_init;

		/* WRITE */
		rpmem_fip_rma_init(&fip->lanes.gpspm[i].write,
				fip->mr_desc, 0,
				fip->rkey,
				&fip->lanes.gpspm[i],
				0);

		/* SEND */
		rpmem_fip_msg_init(&fip->lanes.gpspm[i].send,
				fip->pmsg_mr_desc, 0,
				&fip->lanes.gpspm[i],
				&fip->pmsg[i],
				sizeof(fip->pmsg[i]),
				FI_COMPLETION);

		/* RECV */
		rpmem_fip_msg_init(&fip->recv[i],
				fip->pres_mr_desc, 0,
				&fip->recv[i],
				&fip->pres[i],
				sizeof(fip->pres[i]),
				FI_COMPLETION);
	}

	return 0;
err_lane_init:
	for (unsigned j = 0; j < i; j++)
		rpmem_fip_lane_fini(&fip->lanes.gpspm[j].lane);
	free(fip->recv);
err_malloc_recv:
	RPMEM_FI_CLOSE(fip->pres_mr, "unregistering messages "
			"response buffer");
err_fi_mr_reg_pres:
	free(fip->pres);
err_malloc_pres:
	RPMEM_FI_CLOSE(fip->pmsg_mr, "unregistering messages buffer");
err_fi_mr_reg_pmsg:
	free(fip->pmsg);
err_malloc_pmsg:
	free(fip->lanes.gpspm);
err_malloc_lanes:
	return ret;
}
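/*
 * How the pieces above meet at completion time (illustrative): a RECV
 * posted with context &fip->recv[i] comes back to
 * rpmem_fip_process_gpspm() as 'context', from which the response payload
 * and the lane id are recovered:
 *
 *	struct rpmem_fip_msg *resp = context;		// == &fip->recv[i]
 *	struct rpmem_msg_persist_resp *res =
 *		rpmem_fip_msg_get_pres(resp);		// == &fip->pres[i]
 *	// signal the lane identified by the response
 *	rpmem_fip_lane_sigret(&fip->lanes.gpspm[res->lane].lane,
 *			flags, ret);
 */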
/*
 * rpmem_fip_init_lanes_apm -- (internal) initialize lanes for APM
 */
static int
rpmem_fip_init_lanes_apm(struct rpmem_fip *fip)
{
	int ret;

	/* allocate APM lanes */
	fip->lanes.apm = calloc(1, fip->nlanes * sizeof(*fip->lanes.apm));
	if (!fip->lanes.apm) {
		RPMEM_LOG(ERR, "!allocating APM lanes");
		goto err_malloc_lanes;
	}

	/* register read-after-write buffer */
	ret = fi_mr_reg(fip->domain, &fip->raw_buff, sizeof(fip->raw_buff),
			FI_REMOTE_WRITE, 0, 0, 0, &fip->raw_mr, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "registering APM read buffer");
		goto err_fi_raw_mr;
	}

	/* get read-after-write buffer local descriptor */
	fip->raw_mr_desc = fi_mr_desc(fip->raw_mr);

	/*
	 * Initialize all required structures for:
	 * WRITE and READ operations.
	 *
	 * If a completion is required the FI_COMPLETION flag and
	 * an appropriate context should be used.
	 *
	 * In APM only the READ completion is required.
	 * The context is a lane structure.
	 */
	unsigned i;
	for (i = 0; i < fip->nlanes; i++) {
		ret = rpmem_fip_lane_init(&fip->lanes.apm[i].lane);
		if (ret)
			goto err_lane_init;

		/* WRITE */
		rpmem_fip_rma_init(&fip->lanes.apm[i].write,
				fip->mr_desc, 0,
				fip->rkey,
				&fip->lanes.apm[i],
				0);

		/* READ */
		rpmem_fip_rma_init(&fip->lanes.apm[i].read,
				fip->raw_mr_desc, 0,
				fip->rkey,
				&fip->lanes.apm[i],
				FI_COMPLETION);
	}

	return 0;
err_lane_init:
	for (unsigned j = 0; j < i; j++)
		rpmem_fip_lane_fini(&fip->lanes.apm[j].lane);
	RPMEM_FI_CLOSE(fip->raw_mr, "unregistering APM read buffer");
err_fi_raw_mr:
	free(fip->lanes.apm);
err_malloc_lanes:
	return -1;
}
/*
 * rpmem_fip_init_cq -- (internal) initialize completion queue(s)
 */
static int
rpmem_fip_init_cq(struct rpmem_fip *fip)
{
	int ret;

	struct fi_cq_attr cq_attr = {
		.size = fip->cq_size,
		.flags = 0,
		.format = FI_CQ_FORMAT_MSG,
		.wait_obj = FI_WAIT_UNSPEC,
		.signaling_vector = 0,
		.wait_cond = FI_CQ_COND_NONE,
		.wait_set = NULL,
	};

	ret = fi_cq_open(fip->domain, &cq_attr, &fip->cq, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "opening completion queue");
		goto err_cq_open;
	}

	return 0;
err_cq_open:
	return ret;
}

/*
 * rpmem_fip_fini_cq -- (internal) deinitialize completion queue(s)
 */
static int
rpmem_fip_fini_cq(struct rpmem_fip *fip)
{
	return RPMEM_FI_CLOSE(fip->cq, "closing completion queue");
}

/*
 * rpmem_fip_init_ep -- (internal) initialize endpoint
 */
static int
rpmem_fip_init_ep(struct rpmem_fip *fip)
{
	int ret;

	/* create an endpoint */
	ret = fi_endpoint(fip->domain, fip->fi, &fip->ep, NULL);
	if (ret) {
		RPMEM_FI_ERR(ret, "allocating endpoint");
		goto err_endpoint;
	}

	/*
	 * Bind an event queue to the endpoint to get
	 * connection-related events for the endpoint.
	 */
	ret = fi_ep_bind(fip->ep, &fip->eq->fid, 0);
	if (ret) {
		RPMEM_FI_ERR(ret, "binding event queue to endpoint");
		goto err_ep_bind_eq;
	}

	/*
	 * Bind a completion queue to the endpoint to get completion
	 * events for the specified inbound/outbound operations.
	 *
	 * FI_SELECTIVE_COMPLETION means every inbound/outbound operation
	 * must explicitly request a completion event, if one is desired,
	 * using the FI_COMPLETION flag.
	 *
	 * Which completions are requested depends on the persistency
	 * method in use and is configured in the lanes initialization
	 * routine specific to that method.
	 */
	ret = fi_ep_bind(fip->ep, &fip->cq->fid,
			FI_RECV | FI_TRANSMIT | FI_SELECTIVE_COMPLETION);
	if (ret) {
		RPMEM_FI_ERR(ret, "binding completion queue to endpoint");
		goto err_ep_bind_cq;
	}

	/*
	 * Enable the endpoint so it is possible to post
	 * inbound/outbound operations.
	 */
	ret = fi_enable(fip->ep);
	if (ret) {
		RPMEM_FI_ERR(ret, "activating endpoint");
		goto err_fi_enable;
	}

	return 0;
err_fi_enable:
err_ep_bind_cq:
err_ep_bind_eq:
err_endpoint:
	return ret;
}