/**
 * Initialize the OFI MTL module.
 *
 * Selects an OFI provider, opens the fabric/domain/endpoint, creates and
 * binds the CQ and AV, publishes our endpoint name through the modex, and
 * registers the progress callback.
 *
 * @return pointer to the MTL base module on success, NULL on failure.
 *         On failure every partially-created OFI object is released.
 */
static mca_mtl_base_module_t*
ompi_mtl_ofi_component_init(bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    int ret, fi_version;
    struct fi_info *hints;
    struct fi_info *providers = NULL, *prov = NULL;
    struct fi_cq_attr cq_attr = {0};
    struct fi_av_attr av_attr = {0};
    char ep_name[FI_NAME_MAX] = {0};
    size_t namelen;

    /**
     * Hints to filter providers
     * See man fi_getinfo for a list of all filters
     * mode:  Select capabilities MTL is prepared to support.
     *        In this case, MTL will pass in context into communication calls
     * ep_type:  reliable datagram operation
     * caps:  Capabilities required from the provider.
     *        Tag matching is specified to implement MPI semantics.
     * msg_order: Guarantee that messages with same tag are ordered.
     */
    hints = fi_allocinfo();
    if (!hints) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: Could not allocate fi_info\n",
                            __FILE__, __LINE__);
        goto error;
    }
    hints->mode               = FI_CONTEXT;
    hints->ep_attr->type      = FI_EP_RDM;      /* Reliable datagram         */
    hints->caps               = FI_TAGGED;      /* Tag matching interface    */
    hints->tx_attr->msg_order = FI_ORDER_SAS;
    hints->rx_attr->msg_order = FI_ORDER_SAS;
    hints->domain_attr->threading        = FI_THREAD_UNSPEC;
    hints->domain_attr->control_progress = FI_PROGRESS_MANUAL;

    /**
     * FI_VERSION provides binary backward and forward compatibility support
     * Specify the version of OFI is coded to, the provider will select struct
     * layouts that are compatible with this version.
     */
    fi_version = FI_VERSION(1, 0);

    /**
     * fi_getinfo:  returns information about fabric services for reaching a
     * remote node or service.  this does not necessarily allocate resources.
     * Pass NULL for name/service because we want a list of providers supported.
     */
    ret = fi_getinfo(fi_version,    /* OFI version requested                    */
                     NULL,          /* Optional name or fabric to resolve       */
                     NULL,          /* Optional service name or port to request */
                     0ULL,          /* Optional flag                            */
                     hints,         /* In: Hints to filter providers            */
                     &providers);   /* Out: List of matching providers          */
    if (0 != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_getinfo failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    /**
     * Select a provider from the list returned by fi_getinfo().
     */
    prov = select_ofi_provider(providers);
    if (!prov) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: select_ofi_provider: no provider found\n",
                            __FILE__, __LINE__);
        goto error;
    }

    /**
     * Open fabric
     * The getinfo struct returns a fabric attribute struct that can be used to
     * instantiate the virtual or physical network.  This opens a "fabric
     * provider".  See man fi_fabric for details.
     */
    ret = fi_fabric(prov->fabric_attr,    /* In: Fabric attributes             */
                    &ompi_mtl_ofi.fabric, /* Out: Fabric handle                */
                    NULL);                /* Optional context for fabric events */
    if (0 != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_fabric failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    /**
     * Create the access domain, which is the physical or virtual network or
     * hardware port/collection of ports.  Returns a domain object that can be
     * used to create endpoints.  See man fi_domain for details.
     */
    ret = fi_domain(ompi_mtl_ofi.fabric,  /* In: Fabric object                 */
                    prov,                 /* In: Provider                      */
                    &ompi_mtl_ofi.domain, /* Out: Domain object                */
                    NULL);                /* Optional context for domain events */
    if (0 != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_domain failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    /**
     * Create a transport level communication endpoint.  To use the endpoint,
     * it must be bound to completion counters or event queues and enabled,
     * and the resources consumed by it, such as address vectors, counters,
     * completion queues, etc.
     * see man fi_endpoint for more details.
     */
    ret = fi_endpoint(ompi_mtl_ofi.domain, /* In: Domain object   */
                      prov,                /* In: Provider        */
                      &ompi_mtl_ofi.ep,    /* Out: Endpoint object */
                      NULL);               /* Optional context    */
    if (0 != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_endpoint failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    /**
     * Save the maximum inject size.
     */
    ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size;

    /**
     * Create the objects that will be bound to the endpoint.
     * The objects include:
     *     - completion queue for events
     *     - address vector of other endpoint addresses
     *     - dynamic memory-spanning memory region
     */
    cq_attr.format = FI_CQ_FORMAT_TAGGED;
    ret = fi_cq_open(ompi_mtl_ofi.domain, &cq_attr, &ompi_mtl_ofi.cq, NULL);
    if (ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_cq_open failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    /**
     * The remote fi_addr will be stored in the ofi_endpoint struct.
     * So, we use the AV in "map" mode.
     */
    av_attr.type = FI_AV_MAP;
    ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
    if (ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_av_open failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    /**
     * Bind the CQ and AV to the endpoint object.
     */
    ret = fi_ep_bind(ompi_mtl_ofi.ep,
                     (fid_t)ompi_mtl_ofi.cq,
                     FI_SEND | FI_RECV);
    if (0 != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_bind CQ-EP failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    ret = fi_ep_bind(ompi_mtl_ofi.ep,
                     (fid_t)ompi_mtl_ofi.av,
                     0);
    if (0 != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_bind AV-EP failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    /**
     * Enable the endpoint for communication
     * This commits the bind operations.
     */
    ret = fi_enable(ompi_mtl_ofi.ep);
    if (0 != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_enable failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    /**
     * Free providers info since it's not needed anymore.
     */
    fi_freeinfo(hints);
    hints = NULL;
    fi_freeinfo(providers);
    providers = NULL;

    /**
     * Get our address and publish it with modex.
     */
    namelen = sizeof(ep_name);
    ret = fi_getname((fid_t)ompi_mtl_ofi.ep, &ep_name[0], &namelen);
    if (ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: fi_getname failed: %s\n",
                            __FILE__, __LINE__, fi_strerror(-ret));
        goto error;
    }

    OFI_COMPAT_MODEX_SEND(ret,
                          &mca_mtl_ofi_component.super.mtl_version,
                          &ep_name,
                          namelen);
    if (OMPI_SUCCESS != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: modex_send failed: %d\n",
                            __FILE__, __LINE__, ret);
        goto error;
    }
    ompi_mtl_ofi.epnamelen = namelen;

    /**
     * Set the ANY_SRC address.
     */
    ompi_mtl_ofi.any_addr = FI_ADDR_UNSPEC;

    /**
     * Activate progress callback.
     */
    ret = opal_progress_register(ompi_mtl_ofi_progress_no_inline);
    if (OMPI_SUCCESS != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: opal_progress_register failed: %d\n",
                            __FILE__, __LINE__, ret);
        goto error;
    }

    return &ompi_mtl_ofi.base;

error:
    if (providers) {
        (void) fi_freeinfo(providers);
    }
    if (hints) {
        (void) fi_freeinfo(hints);
    }
    /* Close in reverse order of creation: the endpoint must be closed
     * before the CQ/AV it is bound to, otherwise fi_close() can fail
     * with -FI_EBUSY and leak the bound resources. */
    if (ompi_mtl_ofi.ep) {
        (void) fi_close((fid_t)ompi_mtl_ofi.ep);
    }
    if (ompi_mtl_ofi.av) {
        (void) fi_close((fid_t)ompi_mtl_ofi.av);
    }
    if (ompi_mtl_ofi.cq) {
        (void) fi_close((fid_t)ompi_mtl_ofi.cq);
    }
    if (ompi_mtl_ofi.domain) {
        (void) fi_close((fid_t)ompi_mtl_ofi.domain);
    }
    if (ompi_mtl_ofi.fabric) {
        (void) fi_close((fid_t)ompi_mtl_ofi.fabric);
    }
    return NULL;
}
static int common_setup(void) { struct fi_info *fi; int ret; ret = getaddr(src_addr, port, (struct sockaddr **) &hints.src_addr, (socklen_t *) &hints.src_addrlen); if (ret) printf("source address error %s\n", gai_strerror(ret)); ret = fi_getinfo(FI_VERSION(1, 0), dst_addr, port, 0, &hints, &fi); if (ret) { printf("fi_getinfo %s\n", strerror(-ret)); goto err0; } if (fi->ep_attr->max_msg_size) { max_msg_size = fi->ep_attr->max_msg_size; } ret = fi_fabric(fi->fabric_attr, &fab, NULL); if (ret) { printf("fi_fabric %s\n", fi_strerror(-ret)); goto err1; } if (fi->mode & FI_MSG_PREFIX) { prefix_len = fi->ep_attr->msg_prefix_size; } ret = fi_domain(fab, fi, &dom, NULL); if (ret) { printf("fi_fdomain %s %s\n", fi_strerror(-ret), fi->domain_attr->name); goto err2; } if (fi->src_addr != NULL) { ((struct sockaddr_in *)fi->src_addr)->sin_port = ((struct sockaddr_in *)hints.src_addr)->sin_port; if (dst_addr == NULL) { printf("Local address %s:%d\n", inet_ntoa(((struct sockaddr_in *)fi->src_addr)->sin_addr), ntohs(((struct sockaddr_in *)fi->src_addr)->sin_port)); } } ret = fi_endpoint(dom, fi, &ep, NULL); if (ret) { printf("fi_endpoint %s\n", fi_strerror(-ret)); goto err3; } ret = alloc_ep_res(fi); if (ret) { printf("alloc_ep_res %s\n", fi_strerror(-ret)); goto err4; } ret = bind_ep_res(); if (ret) { printf("bind_ep_res %s\n", fi_strerror(-ret)); goto err5; } if (hints.src_addr) free(hints.src_addr); fi_freeinfo(fi); return 0; err5: free_ep_res(); err4: fi_close(&ep->fid); err3: fi_close(&dom->fid); err2: fi_close(&fab->fid); err1: fi_freeinfo(fi); err0: if (hints.src_addr) free(hints.src_addr); return ret; }
/*
 * Kernel-mode send/completion benchmark.
 *
 * Posts num_msgs messages of msg_len bytes in batches of post_depth,
 * reaps one completion per posted send, then reports the achieved
 * transfer rate.  The send buffer (ctx.buf) is allocated and registered
 * lazily on first call and reused afterwards; on registration failure it
 * is poisoned with ERR_PTR so later calls become no-ops.
 *
 * Returns 0 on success, a negative errno/fabric error otherwise.
 * NOTE(review): the buffer/MR are intentionally left allocated for reuse;
 * presumably freed at module teardown — confirm against the caller.
 */
int do_test(void)
{
    struct fi_cq_msg_entry comp;
    int len = msg_len * post_depth;   /* total bytes for one batch of buffers */
    int msg_cnt = num_msgs;
    int tx_bufs_sent = 0;
    int ret;
    char *mp;
    u64 time_elap;
#if SREAD == 0
    int eagain_cnt = EAGAIN_TRIES;    /* resched budget for busy-poll mode */
#endif

    print_trace("in\n");

    if (!ctx.buf) {
        /* First call: allocate and register the shared send buffer. */
        ctx.buf = kmalloc(len, GFP_KERNEL);
        if (!ctx.buf) {
            print_err("kalloc failed!\n");
            return -ENOMEM;
        }
        ret = fi_mr_reg(ctx.domain, ctx.buf, len, 0, 0, 0, 0,
                        &ctx.mr, NULL);
        if (ret) {
            print_err("fi_mr_reg returned %d\n", ret);
            kfree(ctx.buf);
            /* Poison the pointer so subsequent calls skip the test. */
            ctx.buf = ERR_PTR(-EFAULT);
            return ret;
        }
    } else if (IS_ERR(ctx.buf))
        return 0;   /* earlier registration failed; silently skip */

    print_msg("post_depth %d num_msgs %d msg_len %d SREAD[%d]\n",
              post_depth, num_msgs, msg_len, SREAD);
    print_dbg("ctx.buf %p '%s' len %ld msg_len %d\n",
              ctx.buf, ctx.buf, strlen(ctx.buf)+1, msg_len);

    time_elap = get_jiffies_64();

    /* Outer loop: one iteration per batch of up to post_depth sends. */
    for (mp = ctx.buf; msg_cnt > 0 && !kthread_should_stop(); ) {
        int post_cnt, cnt;

        post_cnt = (msg_cnt > post_depth ? post_depth : msg_cnt);

        /* Post a batch of sends, one per msg_len-sized slot in ctx.buf. */
        for (cnt = 0, mp = ctx.buf; cnt < post_cnt;
             cnt++, mp += msg_len) {

            if (verify) {
                /* Stamp each buffer so completions can be checked. */
                sprintf(mp, TEST_MESSAGE, tx_bufs_sent);
                tx_bufs_sent++;
            }

            ret = fi_send(ctx.ep, mp, msg_len,
                          fi_mr_desc(ctx.mr), 0, mp);
            if (ret) {
                print_err("fi_send returned %d '%s'\n",
                          ret, fi_strerror(ret));
                return ret;
            }
            if (kthread_should_stop())
                return -EINTR;
        }

        /* reap completions */
        for (cnt = 0; cnt < post_cnt; cnt++) {
#if SREAD
            /* Blocking read with timeout; a timeout is only logged. */
            ret = fi_cq_sread(ctx.scq, &comp, 1, 0, TIMEOUT);
            if (ret == -ETIMEDOUT) {
                print_msg("%s(ETIMEDOUT) cnt %d post_cnt %d "
                          "msg_cnt %d\n", "fi_cq_sread",
                          cnt, post_cnt, msg_cnt);
            }
            if (kthread_should_stop())
                return -EINTR;
#else
            /* Busy-poll; yield the CPU every EAGAIN_TRIES empty reads. */
            do {
                ret = fi_cq_read(ctx.scq, &comp, 1);
                if (ret == 0 || ret == -EAGAIN) {
                    if (--eagain_cnt <= 0) {
                        dprint(DEBUG_HIGH, "%s(resched %d) cnt "
                               "%d post_cnt %d\n", "fi_cq_read",
                               ret, cnt, post_cnt);
                        eagain_cnt = EAGAIN_TRIES;
                        schedule();
                    }
                }
                if (kthread_should_stop())
                    return -EINTR;
            } while (ret == 0 || ret == -EAGAIN);
#endif
            if (ret < 0) {
                /* Completion error: pull the extended error entry. */
                struct fi_cq_err_entry cqe = { 0 };
                int rc;

                rc = fi_cq_readerr(ctx.scq, &cqe, 0);
                print_err("fi_cq_read returned %d '%s'\n",
                          ret, fi_strerror(ret));
                if (rc) {
                    char buf[64];

                    print_err("fi_cq_readerr() err '%s'(%d)"
                              "\n", fi_strerror(cqe.err), cqe.err);
                    print_err("fi_cq_readerr() prov_err "
                              "'%s'(%d)\n",
                              fi_cq_strerror(ctx.scq,
                                             cqe.prov_errno,
                                             cqe.err_data, buf,
                                             sizeof(buf)),
                              cqe.prov_errno);
                }
                return ret;
            }
            if (!ret)
                print_err("fi_cq_sread no completion? ret %d\n", ret);
#if 0
            if ((char *)comp.op_context < (char *)ctx.buf ||
                (char *)comp.op_context >=
                    (char *) &ctx.buf[msg_len*post_depth]) {
                print_err("cq.op_context(%p) not in range "
                          "[ctx.buf(%p) ... &ctx.buf[%d](%p)]\n",
                          (void *)comp.op_context,
                          (void *)ctx.buf,
                          msg_len, (void *)&ctx.buf[msg_len]);
            }
#endif
            if (verify)
                print_msg("Tx '%s'\n", (char *) comp.op_context);
        }
        msg_cnt -= post_cnt;
    }

    time_elap = get_jiffies_64() - time_elap;

/* Unit scale factors for the rate report below. */
#define AGIG (1024UL*1024UL*1024UL)
#define AMEG (1024UL*1024UL)
#define AKILO (1024UL)
    {
        struct timeval tv;
        ulong rate, rate_mod, bytes, units_of;
        char units;

        jiffies_to_timeval(time_elap, &tv);
        bytes = (ulong) num_msgs * (ulong) msg_len;
        if (bytes >= AKILO && tv.tv_sec > 0) {
            rate = bytes / tv.tv_sec;
            rate_mod = bytes % tv.tv_sec;
            /* Pick the largest unit the rate reaches. */
            if (rate >= AGIG) {
                units = 'G';
                units_of = AGIG;
            } else if (rate >= AMEG) {
                units = 'M';
                units_of = AMEG;
            } else {
                units = 'K';
                units_of = AKILO;
            }
            rate /= units_of;
        } else {
            /* Sub-second run or tiny payload: no meaningful rate. */
            rate = rate_mod = 0UL;
            units = ' ';
            units_of = 1UL;
        }
        print_info("Tx %d msgs (%lu.%lu%cB) @ ~%lu.%lu %cB/sec (%ld sec %ld "
                   "usec)\n", num_msgs, (bytes/units_of),
                   (bytes % units_of), units, rate, rate_mod, units,
                   tv.tv_sec, tv.tv_usec);
    }
    return 0;
}
static void setup_ofi_active(struct fi_info *info, struct fid_ep **ep) { // Make an EQ int ret; ret = fi_endpoint(fidev.domain, info, ep, NULL); if (0 != ret) { error("fi_endpoint failed"); } #if WANT_FDS // Add the EQ FD to the epoll fd static struct epoll_event edt; memset(&edt, 0, sizeof(edt)); edt.events = EPOLLIN; edt.data.u32 = 2222; ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fidev.eq_fd, &edt); if (ret < 0) { error("epoll_ctl failed"); } #endif // Bind the EP to the EQ ret = fi_ep_bind(*ep, &fidev.eq->fid, 0); if (0 != ret) { error("fi_ep_bind(eq) failed"); } // Make a CQ struct fi_cq_attr cq_attr; memset(&cq_attr, 0, sizeof(cq_attr)); cq_attr.format = FI_CQ_FORMAT_CONTEXT; cq_attr.wait_obj = FI_WAIT_FD; cq_attr.size = 32; // JMS POC ret = fi_cq_open(fidev.domain, &cq_attr, &ficonn.cq, NULL); if (ret != 0) { error("fi_cq_open failed"); } // Bind the CQ TX and RX queues to the EQ ret = fi_ep_bind(*ep, &ficonn.cq->fid, FI_TRANSMIT); if (0 != ret) { error("fi_ep_bind(cq tx) failed"); } ret = fi_ep_bind(*ep, &ficonn.cq->fid, FI_RECV); if (0 != ret) { error("fi_ep_bind(cq rx) failed"); } #if WANT_FDS // Get the fd associated with this CQ ret = fi_control(&(ficonn.cq->fid), FI_GETWAIT, &ficonn.cq_fd); if (ret != 0) { error("fi_control to get cq fq failed"); } #endif // Enable the EP! ret = fi_enable(*ep); if (0 != ret) { error("fi_enable failed"); } // Register the buffers (must use different keys for each) ret = fi_mr_reg(fidev.domain, send_buffer, sizeof(send_buffer), FI_SEND, 0, (uintptr_t) send_buffer, 0, &ficonn.send_mr, NULL); if (ret != 0) { error("fi_mr_reg(send) failed\n"); } ret = fi_mr_reg(fidev.domain, recv_buffer, sizeof(recv_buffer), FI_RECV, 0, (uintptr_t) recv_buffer, 0, &ficonn.recv_mr, NULL); if (ret != 0) { printf("ERROR: ret=%d, %s\n", ret, fi_strerror(-ret)); error("fi_mr_reg(recv) failed\n"); } }
static void test_connect_with_accept_blocking_on_eq_fq_CLIENT(void) { int ret; printf("CLIENT running\n"); // Get the server's node (IP addr) and service (port) MPI_Recv(ofi_node, sizeof(ofi_node) - 1, MPI_CHAR, 0, 101, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(ofi_service, sizeof(ofi_service) - 1, MPI_CHAR, 0, 102, MPI_COMM_WORLD, MPI_STATUS_IGNORE); printf("CLIENT received via MPI: %s / %s\n", ofi_node, ofi_service); //setup_ofi(ofi_node, ofi_service); setup_ofi(NULL, NULL, 0); memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; inet_aton(ofi_node, &sin.sin_addr); sin.sin_port = htons(atoi(ofi_service)); printf("CLIENT translated: %s\n", addrstr(&sin)); setup_ofi_active(fidev.info, &ficonn.ep); // Print server addr printf("CLIENT connecting to %s\n", addrstr(&sin)); // Connect! printf("Client connecting...\n"); ret = fi_connect(ficonn.ep, //fidev.info->dest_addr, &sin, (void*) client_data, sizeof(client_data)); if (ret < 0) { error("fi_connect failed"); } #if WANT_FDS // Now wait for the listen to complete int nevents; #define NEVENTS 32 struct epoll_event events[NEVENTS]; int timeout = 10000; while (1) { printf("CLIENT blocking on epoll\n"); nevents = epoll_wait(epoll_fd, events, NEVENTS, timeout); if (nevents < 0) { if (errno != EINTR) { error("client epoll wait failed"); } else { continue; } } else { printf("CLIENT successfully woke up from epoll! 
%d events\n", nevents); for (int i = 0; i < nevents; ++i) { if (events[i].data.u32 != 2222) { error("CLIENT unexpected epoll return type"); } } // If we got the expected event, then go read from the EQ break; } } #endif // Wait for FI_CONNECTED event uint32_t event; uint8_t *entry_buffer; size_t expected_len = sizeof(struct fi_eq_cm_entry) + sizeof(client_data); entry_buffer = (uint8_t*) calloc(1, expected_len); if (NULL == entry_buffer) { error("calloc failed"); } struct fi_eq_cm_entry *entry = (struct fi_eq_cm_entry*) entry_buffer; while (1) { printf("CLIENT waiting for FI_CONNECTED\n"); #if WANT_FDS ret = fi_eq_read(fidev.eq, &event, entry, expected_len, 0); #else ret = fi_eq_sread(fidev.eq, &event, entry, expected_len, -1, 0); #endif if (-FI_EAVAIL == ret) { fprintf(stderr, "client fi_eq_sread failed because there's something in the error queue\n"); char buffer[2048]; struct fi_eq_err_entry *err_entry = (struct fi_eq_err_entry*) buffer; ret = fi_eq_readerr(fidev.eq, err_entry, 0); fprintf(stderr, "error code: %d (%s), prov err code: %d (%s)\n", err_entry->err, fi_strerror(err_entry->err), err_entry->prov_errno, fi_strerror(err_entry->prov_errno)); error("sad panda"); } else if (ret == -EAGAIN) { fprintf(stderr, "CLIENT fi_eq_sread fail got -EAGAIN... 
trying again...\n"); sleep(1); continue; } else if (ret < 0) { fprintf(stderr, "SERVER fi_eq_sread fail: %s, ret = %d)\n", fi_strerror(-ret), ret); error("client fi_eq_sread failed for some random reason"); } else if (event != FI_CONNECTED) { error("client got some unexpected event"); } else if (ret != expected_len) { error("client got wrong length back from fi_eq_sread"); } uint32_t *d = (uint32_t*) entry->data; for (int i = 0; i < (sizeof(server_data) / sizeof(uint32_t)); ++i) { if (d[i] != server_data[i]) { printf("CLIENT got wrong CM client data: d[%d]=%d, should be %d\n", i, d[i], server_data[i]); } } printf("client got FI_CONNECTED, correct size, and correct data -- yay!\n"); break; } printf("CLIENT connecting -- waiting for server before sending\n"); MPI_Barrier(MPI_COMM_WORLD); sleep(1); int msg[4] = { 99, 100, 101, 102 }; int len = sizeof(msg); printf("CLIENT sending len of %d\n", len); struct fid_mr no_mr; struct fid_mr *mr; void *send_context = (void*) 0x42; #if 0 fi_mr_reg(fidev.domain, msg, len, FI_SEND | FI_RECV, 0, (uint64_t)(uintptr_t) msg, 0, &mr, NULL); #else // Try using no mr, like fi_msg_pingpong... memset(&no_mr, 0, sizeof(no_mr)); mr = &no_mr; #endif ret = fi_send(ficonn.ep, msg, len, fi_mr_desc(mr), 0, send_context); if (ret < 0) { printf("fi_Send failed! %d, %s\n", ret, fi_strerror(-ret)); MPI_Abort(MPI_COMM_WORLD, 37); } // Wait for send completion struct fi_cq_entry cqe; while (1) { ret = fi_cq_sread(ficonn.cq, &cqe, 1, 0, -1); if (cqe.op_context == send_context) { printf("CLIENT send completed\n"); break; } else { printf("CLIENT got some other completion... continuing\n"); } } printf("CLIENT sent -- waiting for server before teardown\n"); MPI_Barrier(MPI_COMM_WORLD); printf("CLIENT tearing down\n"); fi_close(&(mr->fid)); teardown_ofi(); }
/*
 * Set up the progress thread.
 *
 * Posts a small set of multi-receive buffers on this thread's AM RX
 * endpoint, signals the creator that at least one progress thread is
 * running, then polls the AM RX completion queue and dispatches each
 * completion to the AM handler until asked to exit.  On the way out,
 * the last thread to leave signals the exit condition variable.
 *
 * NOTE(review): dst_buf allocations are not freed here — presumably the
 * endpoint still references them until comm teardown; confirm ownership.
 */
static void progress_thread(void *args)
{
    struct progress_thread_info* pti = args;
    const int id = pti->id;
    const int num_rbufs = 2;
    struct iovec iov[num_rbufs];
    struct fi_msg msg[num_rbufs];
    struct ofi_am_info* dst_buf[num_rbufs];
    const int rbuf_len = 10;
    const size_t rbuf_size = rbuf_len*sizeof(dst_buf[0][0]);
    const int num_cqes = rbuf_len;
    struct fi_cq_data_entry cqes[num_cqes];
    int num_read;
    int i;

    /* Post each receive buffer with FI_MULTI_RECV so the provider packs
     * multiple incoming messages into it; the buffer index is carried in
     * the operation context. */
    for (i = 0; i < num_rbufs; i++) {
        dst_buf[i] = chpl_mem_allocMany(rbuf_len, sizeof(dst_buf[i][0]),
                                        CHPL_RT_MD_COMM_PER_LOC_INFO,
                                        0, 0);
        iov[i].iov_base = dst_buf[i];
        iov[i].iov_len = rbuf_size;
        msg[i].msg_iov = &iov[i];
        msg[i].desc = (void **) fi_mr_desc(ofi.mr);
        msg[i].iov_count = 1;
        msg[i].addr = FI_ADDR_UNSPEC;
        msg[i].context = (void *) (uint64_t) i;
        msg[i].data = 0x0;
        OFICHKERR(fi_recvmsg(ofi.am_rx_ep[id], &msg[i], FI_MULTI_RECV));
    }

    // Count this progress thread as running.  The creator thread wants to
    // be released as soon as at least one progress thread is running, so
    // if we're the first, do that.
    if (atomic_fetch_add_uint_least32_t(&progress_thread_count, 1) == 0) {
        CALL_CHECK_ZERO(pthread_mutex_lock(&progress_thread_entEx_cond_mutex));
        CALL_CHECK_ZERO(pthread_cond_signal(&progress_thread_enter_cond));
        CALL_CHECK_ZERO(pthread_mutex_unlock(&progress_thread_entEx_cond_mutex));
    }

    // Wait for events
    while (!atomic_load_bool(&progress_threads_please_exit)) {
        num_read = fi_cq_read(ofi.am_rx_cq[id], cqes, num_cqes);
        if (num_read > 0) {
            for (i = 0; i < num_read; i++) {
                chpl_comm_ofi_am_handler(&cqes[i]);
                // send ack
            }
        } else {
            /* -FI_EAGAIN just means the CQ is empty; anything else is
             * a fatal runtime error. */
            if (num_read != -FI_EAGAIN) {
                chpl_internal_error(fi_strerror(-num_read));
            }
        }
    }

    // Un-count this progress thread.  Whoever told us to exit wants to
    // be released once all the progress threads are done, so if we're
    // the last, do that.
    if (atomic_fetch_sub_uint_least32_t(&progress_thread_count, 1) == 1) {
        CALL_CHECK_ZERO(pthread_mutex_lock(&progress_thread_entEx_cond_mutex));
        CALL_CHECK_ZERO(pthread_cond_signal(&progress_thread_exit_cond));
        CALL_CHECK_ZERO(pthread_mutex_unlock(&progress_thread_entEx_cond_mutex));
    }
}
static int client_connect(void) { struct fi_eq_cm_entry entry; uint32_t event; struct fi_info *fi; ssize_t rd; int ret; if (src_addr) { ret = getaddr(src_addr, NULL, (struct sockaddr **) &hints.src_addr, (socklen_t *) &hints.src_addrlen); if (ret) printf("source address error %s\n", gai_strerror(ret)); } ret = fi_getinfo(FI_VERSION(1, 0), dst_addr, port, 0, &hints, &fi); if (ret) { printf("fi_getinfo %s\n", strerror(-ret)); goto err0; } ret = fi_fabric(fi->fabric_attr, &fab, NULL); if (ret) { printf("fi_fabric %s\n", fi_strerror(-ret)); goto err1; } ret = fi_domain(fab, fi, &dom, NULL); if (ret) { printf("fi_domain %s %s\n", fi_strerror(-ret), fi->domain_attr->name); goto err2; } ret = fi_endpoint(dom, fi, &ep, NULL); if (ret) { printf("fi_endpoint %s\n", fi_strerror(-ret)); goto err3; } ret = alloc_ep_res(fi); if (ret) goto err4; ret = bind_ep_res(); if (ret) goto err5; ret = fi_connect(ep, fi->dest_addr, NULL, 0); if (ret) { printf("fi_connect %s\n", fi_strerror(-ret)); goto err5; } rd = fi_eq_sread(cmeq, &event, &entry, sizeof entry, -1, 0); if (rd != sizeof entry) { printf("fi_eq_sread %zd %s\n", rd, fi_strerror((int) -rd)); return (int) rd; } if (event != FI_COMPLETE || entry.fid != &ep->fid) { printf("Unexpected CM event %d fid %p (ep %p)\n", event, entry.fid, ep); ret = -FI_EOTHER; goto err1; } if (hints.src_addr) free(hints.src_addr); fi_freeinfo(fi); return 0; err5: free_ep_res(); err4: fi_close(&ep->fid); err3: fi_close(&dom->fid); err2: fi_close(&fab->fid); err1: fi_freeinfo(fi); err0: if (hints.src_addr) free(hints.src_addr); return ret; }
/* mca_btl_ofi_context_alloc_scalable()
 *
 * This function allocate communication contexts and return the pointer
 * to the first btl context. It also take care of all the bindings needed.
 * USE WITH SCALABLE ENDPOINT ONLY
 *
 * For each of the num_contexts contexts it creates a TX context, an RX
 * context and a CQ on the scalable endpoint, binds the CQ to the TX
 * context, enables both contexts and initializes the completion
 * freelist.  Returns NULL on any failure, after finalizing all contexts
 * (the array is calloc'd, so never-initialized entries are zeroed) and
 * freeing the array.
 */
mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_scalable(struct fi_info *info,
                                                          struct fid_domain *domain,
                                                          struct fid_ep *sep,
                                                          struct fid_av *av,
                                                          size_t num_contexts)
{
    BTL_VERBOSE(("creating %zu contexts", num_contexts));

    int rc;
    size_t i;
    char *linux_device_name = info->domain_attr->name;

    struct fi_cq_attr cq_attr = {0};
    struct fi_tx_attr tx_attr = {0};
    struct fi_rx_attr rx_attr = {0};

    mca_btl_ofi_context_t *contexts;

    /* Request completion only after delivery to the remote endpoint. */
    tx_attr.op_flags = FI_DELIVERY_COMPLETE;

    contexts = (mca_btl_ofi_context_t*) calloc(num_contexts, sizeof(*contexts));
    if (NULL == contexts) {
        BTL_VERBOSE(("cannot allocate communication contexts."));
        return NULL;
    }

    /* Don't really need to check, just avoiding compiler warning because
     * BTL_VERBOSE is a no op in performance build and the compiler will
     * complain about unused variable. */
    if (NULL == linux_device_name) {
        BTL_VERBOSE(("linux device name is NULL. This shouldn't happen."));
        goto scalable_fail;
    }

    /* bind AV to endpoint */
    rc = fi_scalable_ep_bind(sep, (fid_t)av, 0);
    if (0 != rc) {
        BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s",
                        linux_device_name,
                        fi_strerror(-rc)
                        ));
        goto scalable_fail;
    }

    for (i=0; i < num_contexts; i++) {
        /* transmit context for this btl context */
        rc = fi_tx_context(sep, i, &tx_attr, &contexts[i].tx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_tx_context with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* We don't actually need a receiving context as we only do one-sided.
         * However, sockets provider will hang if we dont have one. It is
         * also nice to have equal number of tx/rx context. */
        rc = fi_rx_context(sep, i, &rx_attr, &contexts[i].rx_ctx, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_rx_context with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* create CQ */
        cq_attr.format = FI_CQ_FORMAT_CONTEXT;
        cq_attr.wait_obj = FI_WAIT_NONE;
        rc = fi_cq_open(domain, &cq_attr, &contexts[i].cq, NULL);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_cq_open with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* bind cq to transmit context */
        uint32_t cq_flags = (FI_TRANSMIT);
        rc = fi_ep_bind(contexts[i].tx_ctx, (fid_t)contexts[i].cq, cq_flags);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_ep_bind with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* enable the context. */
        rc = fi_enable(contexts[i].tx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        rc = fi_enable(contexts[i].rx_ctx);
        if (0 != rc) {
            BTL_VERBOSE(("%s failed fi_enable with err=%s",
                            linux_device_name,
                            fi_strerror(-rc)
                            ));
            goto scalable_fail;
        }

        /* initialize completion freelist. */
        rc = ofi_comp_list_init(&contexts[i].comp_list);
        if (rc != OPAL_SUCCESS) {
            goto scalable_fail;
        }

        /* assign the id */
        contexts[i].context_id = i;
    }

    return contexts;

scalable_fail:
    /* close and free */
    for(i=0; i < num_contexts; i++) {
        mca_btl_ofi_context_finalize(&contexts[i], true);
    }
    free(contexts);

    return NULL;
}
/* mca_btl_ofi_context_alloc_normal() * * This function will allocate an ofi_context, map the endpoint to tx/rx context, * bind CQ,AV to the endpoint and initialize all the structure. * USE WITH NORMAL ENDPOINT ONLY */ mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info, struct fid_domain *domain, struct fid_ep *ep, struct fid_av *av) { int rc; uint32_t cq_flags = FI_TRANSMIT; char *linux_device_name = info->domain_attr->name; struct fi_cq_attr cq_attr = {0}; mca_btl_ofi_context_t *context; context = (mca_btl_ofi_context_t*) calloc(1, sizeof(*context)); if (NULL == context) { BTL_VERBOSE(("cannot allocate context")); return NULL; } /* Don't really need to check, just avoiding compiler warning because * BTL_VERBOSE is a no op in performance build and the compiler will * complain about unused variable. */ if (NULL == linux_device_name) { BTL_VERBOSE(("linux device name is NULL. This shouldn't happen.")); goto single_fail; } cq_attr.format = FI_CQ_FORMAT_CONTEXT; cq_attr.wait_obj = FI_WAIT_NONE; rc = fi_cq_open(domain, &cq_attr, &context->cq, NULL); if (0 != rc) { BTL_VERBOSE(("%s failed fi_cq_open with err=%s", linux_device_name, fi_strerror(-rc) )); goto single_fail; } rc = fi_ep_bind(ep, (fid_t)av, 0); if (0 != rc) { BTL_VERBOSE(("%s failed fi_ep_bind with err=%s", linux_device_name, fi_strerror(-rc) )); goto single_fail; } rc = fi_ep_bind(ep, (fid_t)context->cq, cq_flags); if (0 != rc) { BTL_VERBOSE(("%s failed fi_scalable_ep_bind with err=%s", linux_device_name, fi_strerror(-rc) )); goto single_fail; } rc = ofi_comp_list_init(&context->comp_list); if (rc != OPAL_SUCCESS) { goto single_fail; } context->tx_ctx = ep; context->rx_ctx = ep; context->context_id = 0; return context; single_fail: mca_btl_ofi_context_finalize(context, false); return NULL; }
/**
 * Allocate a connection-management NIC for the given domain.
 *
 * Allocates the underlying GNI NIC with a forced cdm_id, sets up the
 * datagram handle (with timeout-driven progress when the domain uses
 * FI_PROGRESS_AUTO), creates the address-to-endpoint hash table, and
 * links the new cm_nic onto the global list.
 *
 * @return FI_SUCCESS on success, negative fi_errno on failure (all
 *         partially-allocated resources are released).
 */
int _gnix_cm_nic_alloc(struct gnix_fid_domain *domain,
                       struct fi_info *info,
                       uint32_t cdm_id,
                       struct gnix_auth_key *auth_key,
                       struct gnix_cm_nic **cm_nic_ptr)
{
    int ret = FI_SUCCESS;
    struct gnix_cm_nic *cm_nic = NULL;
    gnix_hashtable_attr_t gnix_ht_attr = {0};
    uint32_t name_type = GNIX_EPN_TYPE_UNBOUND;
    struct gnix_nic_attr nic_attr = {0};
    struct gnix_ep_name ep_name;
    struct gnix_dgram_hndl_attr dgram_hndl_attr = {0};
    struct gnix_dgram_hndl_attr *dgram_hndl_attr_ptr = NULL;

    GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

    *cm_nic_ptr = NULL;

    /*
     * if app has specified a src_addr in the info
     * argument and length matches that for gnix_ep_name
     * we must allocate a cm_nic, otherwise we first
     * check to see if there is a cm_nic already for this domain
     * and just use it.
     */
    if (info->src_addr) {
        /*TODO (optimization): strchr to name_type and strtol */
        _gnix_get_ep_name(info->src_addr, 0, &ep_name, domain);
        name_type = ep_name.name_type;
    }

    GNIX_INFO(FI_LOG_EP_CTRL, "creating cm_nic for %u/0x%x/%u\n",
              auth_key->ptag, auth_key->cookie, cdm_id);

    cm_nic = (struct gnix_cm_nic *)calloc(1, sizeof(*cm_nic));
    if (cm_nic == NULL) {
        ret = -FI_ENOMEM;
        goto err;
    }

    /*
     * we have to force allocation of a new nic since we want
     * an a particulard cdm id
     */
    nic_attr.must_alloc = true;
    nic_attr.use_cdm_id = true;
    nic_attr.cdm_id = cdm_id;
    nic_attr.auth_key = auth_key;

    ret = gnix_nic_alloc(domain, &nic_attr, &cm_nic->nic);
    if (ret != FI_SUCCESS) {
        GNIX_WARN(FI_LOG_EP_CTRL,
                  "gnix_nic_alloc returned %s\n",
                  fi_strerror(-ret));
        goto err;
    }

    cm_nic->my_name.gnix_addr.cdm_id = cdm_id;
    cm_nic->ptag = auth_key->ptag;
    cm_nic->my_name.cookie = auth_key->cookie;
    cm_nic->my_name.gnix_addr.device_addr = cm_nic->nic->device_addr;
    cm_nic->domain = domain;
    cm_nic->ctrl_progress = domain->control_progress;
    cm_nic->my_name.name_type = name_type;
    cm_nic->poll_cnt = 0;
    fastlock_init(&cm_nic->wq_lock);
    dlist_init(&cm_nic->cm_nic_wq);

    /*
     * prep the cm nic's dgram component
     */
    if (domain->control_progress == FI_PROGRESS_AUTO) {
        dgram_hndl_attr.timeout_needed = __gnix_cm_nic_timeout_needed;
        dgram_hndl_attr.timeout_progress = __gnix_cm_nic_timeout_progress;
        dgram_hndl_attr.timeout_data = (void *)cm_nic;
        dgram_hndl_attr.timeout = domain->params.dgram_progress_timeout;
        dgram_hndl_attr_ptr = &dgram_hndl_attr;
    }

    ret = _gnix_dgram_hndl_alloc(cm_nic,
                                 domain->control_progress,
                                 dgram_hndl_attr_ptr,
                                 &cm_nic->dgram_hndl);
    if (ret != FI_SUCCESS)
        goto err;

    /*
     * allocate hash table for translating ep addresses
     * to ep's.
     * This table will not be large - how many FI_EP_RDM ep's
     * will an app create using one domain?, nor in the critical path
     * so just use defaults.
     */
    cm_nic->addr_to_ep_ht = calloc(1, sizeof(struct gnix_hashtable));
    if (cm_nic->addr_to_ep_ht == NULL) {
        /* BUG FIX: previously fell through to err with ret still
         * FI_SUCCESS, returning success on allocation failure. */
        ret = -FI_ENOMEM;
        goto err;
    }

    gnix_ht_attr.ht_initial_size = 64;
    gnix_ht_attr.ht_maximum_size = 1024;
    gnix_ht_attr.ht_increase_step = 2;
    gnix_ht_attr.ht_increase_type = GNIX_HT_INCREASE_MULT;
    gnix_ht_attr.ht_collision_thresh = 500;
    gnix_ht_attr.ht_hash_seed = 0xdeadbeefbeefdead;
    gnix_ht_attr.ht_internal_locking = 1;
    gnix_ht_attr.destructor = NULL;

    ret = _gnix_ht_init(cm_nic->addr_to_ep_ht, &gnix_ht_attr);
    if (ret != FI_SUCCESS) {
        GNIX_WARN(FI_LOG_EP_CTRL,
                  "gnix_ht_init returned %s\n",
                  fi_strerror(-ret));
        goto err;
    }

    _gnix_ref_init(&cm_nic->ref_cnt, 1, __cm_nic_destruct);

    *cm_nic_ptr = cm_nic;

    pthread_mutex_lock(&gnix_cm_nic_list_lock);
    dlist_insert_tail(&cm_nic->cm_nic_list, &gnix_cm_nic_list);
    pthread_mutex_unlock(&gnix_cm_nic_list_lock);

    return ret;

err:
    /* BUG FIX: cm_nic can be NULL here (calloc failure); the old code
     * dereferenced cm_nic->dgram_hndl unconditionally. */
    if (cm_nic != NULL) {
        if (cm_nic->dgram_hndl)
            _gnix_dgram_hndl_free(cm_nic->dgram_hndl);

        if (cm_nic->nic)
            _gnix_nic_free(cm_nic->nic);

        if (cm_nic->addr_to_ep_ht) {
            _gnix_ht_destroy(cm_nic->addr_to_ep_ht);
            free(cm_nic->addr_to_ep_ht);
        }

        free(cm_nic);
    }

    return ret;
}
/*
 * Completion callback for cm_nic datagrams.
 *
 * Validates the completed datagram (post state, in/out tags), delivers any
 * payload carried in the OUT buffer to the cm_nic's receive callback, then
 * either reposts the datagram (wildcard, GNIX_CM_NIC_WC_TAG) or returns it
 * to the free list (bound, GNIX_CM_NIC_BND_TAG).
 *
 * Returns FI_SUCCESS or a negative fi_errno value.
 *
 * NOTE(review): the early validation failures (NULL cache, unknown in/out
 * tag) jump to err with ret still FI_SUCCESS, so those errors are reported
 * only via the log -- confirm this is intentional.
 */
static int __process_datagram(struct gnix_datagram *dgram, struct gnix_address peer_address, gni_post_state_t state)
{
	int ret = FI_SUCCESS;
	struct gnix_cm_nic *cm_nic = NULL;
	uint8_t in_tag = 0, out_tag = 0;
	char rcv_buf[GNIX_CM_NIC_MAX_MSG_SIZE];

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	/* the owning cm_nic was stashed in the datagram's cache field */
	cm_nic = (struct gnix_cm_nic *)dgram->cache;
	if (cm_nic == NULL) {
		GNIX_WARN(FI_LOG_EP_CTRL, "process_datagram, null cache\n");
		goto err;
	}

	/* anything but a clean completion goes through the error handler */
	if (state != GNI_POST_COMPLETED) {
		ret = __process_dgram_w_error(cm_nic, dgram, peer_address, state);
		GNIX_WARN(FI_LOG_EP_CTRL, "process_datagram bad post state %d\n", state);
		goto err;
	}

	__dgram_get_in_tag(dgram, &in_tag);
	if ((in_tag != GNIX_CM_NIC_BND_TAG) && (in_tag != GNIX_CM_NIC_WC_TAG)) {
		GNIX_WARN(FI_LOG_EP_CTRL, "datagram with unknown in tag %d\n", in_tag);
		goto err;
	}

	__dgram_unpack_out_tag(dgram, &out_tag);
	if ((out_tag != GNIX_CM_NIC_BND_TAG) && (out_tag != GNIX_CM_NIC_WC_TAG)) {
		GNIX_WARN(FI_LOG_EP_CTRL, "datagram with unknown out tag %d\n", out_tag);
		goto err;
	}

	/*
	 * if out buf actually has data, call consumer's
	 * receive callback, then drive progress so any work the
	 * callback queued gets a chance to run.
	 */
	if (out_tag == GNIX_CM_NIC_BND_TAG) {
		_gnix_dgram_unpack_buf(dgram, GNIX_DGRAM_OUT_BUF, rcv_buf, GNIX_CM_NIC_MAX_MSG_SIZE);
		ret = cm_nic->rcv_cb_fn(cm_nic, rcv_buf, peer_address);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL, "cm_nic->rcv_cb_fn returned %s\n", fi_strerror(-ret));
			goto err;
		}
		ret = _gnix_cm_nic_progress(cm_nic);
		if (ret != FI_SUCCESS)
			GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_cm_nic_progress returned %s\n", fi_strerror(-ret));
	}

	/*
	 * if we are processing a WC datagram, repost, otherwise
	 * just put back on the freelist.
	 */
	if (in_tag == GNIX_CM_NIC_WC_TAG) {
		/* re-arm the wildcard so further connect requests can land */
		dgram->callback_fn = __process_datagram;
		dgram->cache = cm_nic;
		__dgram_set_tag(dgram, in_tag);
		ret = _gnix_dgram_wc_post(dgram);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_dgram_wc_post returned %s\n", fi_strerror(-ret));
			goto err;
		}
	} else {
		ret = _gnix_dgram_free(dgram);
		if (ret != FI_SUCCESS)
			GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_dgram_free returned %s\n", fi_strerror(-ret));
	}

	return ret;

err:
	/* bound datagrams are owned here; release them even on error */
	if (in_tag == GNIX_CM_NIC_BND_TAG)
		_gnix_dgram_free(dgram);
	return ret;
}
int _gnix_cm_nic_send(struct gnix_cm_nic *cm_nic, char *sbuf, size_t len, struct gnix_address target_addr) { int ret = FI_SUCCESS; struct gnix_datagram *dgram = NULL; ssize_t __attribute__((unused)) plen; uint8_t tag; struct gnix_work_req *work_req; GNIX_TRACE(FI_LOG_EP_CTRL, "\n"); if ((cm_nic == NULL) || (sbuf == NULL)) return -FI_EINVAL; if (len > GNI_DATAGRAM_MAXSIZE) return -FI_ENOSPC; ret = _gnix_dgram_alloc(cm_nic->dgram_hndl, GNIX_DGRAM_BND, &dgram); if (ret != FI_SUCCESS) { GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_dgram_alloc returned %s\n", fi_strerror(-ret)); goto exit; } dgram->target_addr = target_addr; dgram->callback_fn = __process_datagram; dgram->cache = cm_nic; tag = GNIX_CM_NIC_BND_TAG; __dgram_set_tag(dgram, tag); plen = _gnix_dgram_pack_buf(dgram, GNIX_DGRAM_IN_BUF, sbuf, len); assert (plen == len); /* If connecting with the same CM NIC, skip datagram exchange. The * caller could be holding an endpoint lock, so schedule connection * completion for later. */ if (GNIX_ADDR_EQUAL(target_addr, cm_nic->my_name.gnix_addr)) { char tmp_buf[GNIX_CM_NIC_MAX_MSG_SIZE]; /* Pack output buffer with input data. */ _gnix_dgram_unpack_buf(dgram, GNIX_DGRAM_IN_BUF, tmp_buf, GNIX_CM_NIC_MAX_MSG_SIZE); _gnix_dgram_pack_buf(dgram, GNIX_DGRAM_OUT_BUF, tmp_buf, GNIX_CM_NIC_MAX_MSG_SIZE); work_req = calloc(1, sizeof(*work_req)); if (work_req == NULL) { _gnix_dgram_free(dgram); return -FI_ENOMEM; } work_req->progress_fn = __gnix_cm_nic_intra_progress_fn; work_req->data = dgram; work_req->completer_fn = NULL; fastlock_acquire(&cm_nic->wq_lock); dlist_insert_before(&work_req->list, &cm_nic->cm_nic_wq); fastlock_release(&cm_nic->wq_lock); GNIX_INFO(FI_LOG_EP_CTRL, "Initiated intra-CM NIC connect\n"); } else { ret = _gnix_dgram_bnd_post(dgram); if (ret == -FI_EBUSY) { ret = -FI_EAGAIN; _gnix_dgram_free(dgram); } } exit: return ret; }
int _gnix_cm_nic_progress(void *arg) { struct gnix_cm_nic *cm_nic = (struct gnix_cm_nic *)arg; int ret = FI_SUCCESS; int complete; struct gnix_work_req *p = NULL; /* * if we're doing FI_PROGRESS_MANUAL, * see what's going on inside kgni's datagram * box... */ if (cm_nic->ctrl_progress == FI_PROGRESS_MANUAL) { ++cm_nic->poll_cnt; if (((cm_nic->poll_cnt % 512) == 0) || !dlist_empty(&cm_nic->cm_nic_wq)) { ret = _gnix_dgram_poll(cm_nic->dgram_hndl, GNIX_DGRAM_NOBLOCK); if (ret != FI_SUCCESS) { GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_dgram_poll returned %s\n", fi_strerror(-ret)); goto err; } } } /* * do a quick check if queue doesn't have anything yet, * don't need this to be atomic */ check_again: if (dlist_empty(&cm_nic->cm_nic_wq)) return ret; /* * okay, stuff to do, lock work queue, * dequeue head, unlock, process work element, * if it doesn't compete, put back at the tail * of the queue. */ fastlock_acquire(&cm_nic->wq_lock); p = dlist_first_entry(&cm_nic->cm_nic_wq, struct gnix_work_req, list); if (p == NULL) { fastlock_release(&cm_nic->wq_lock); return ret; } dlist_remove_init(&p->list); fastlock_release(&cm_nic->wq_lock); assert(p->progress_fn); ret = p->progress_fn(p->data, &complete); if (ret != FI_SUCCESS) { GNIX_WARN(FI_LOG_EP_CTRL, "dgram prog fn returned %s\n", fi_strerror(-ret)); } if (complete == 1) { if (p->completer_fn) { ret = p->completer_fn(p->completer_data); free(p); if (ret != FI_SUCCESS) { GNIX_WARN(FI_LOG_EP_CTRL, "dgram completer fn returned %s\n", fi_strerror(-ret)); goto err; } } else { free(p); } goto check_again; } else { fastlock_acquire(&cm_nic->wq_lock); dlist_insert_before(&p->list, &cm_nic->cm_nic_wq); fastlock_release(&cm_nic->wq_lock); } err: return ret; }
std::string fi_error_to_string(int err_code) { return fi_strerror(err_code >= 0 ? err_code : -err_code); }
static int __gnix_vc_hndl_conn_req(struct gnix_cm_nic *cm_nic, char *msg_buffer, struct gnix_address src_cm_nic_addr) { int ret = FI_SUCCESS; gni_return_t __attribute__((unused)) status; struct gnix_fid_ep *ep = NULL; gnix_ht_key_t *key_ptr; struct gnix_av_addr_entry entry; struct gnix_address src_addr, target_addr; struct gnix_vc *vc = NULL; struct gnix_vc *vc_try = NULL; struct gnix_work_req *work_req; int src_vc_id; gni_smsg_attr_t src_smsg_attr; uint64_t src_vc_ptr; struct wq_hndl_conn_req *data = NULL; ssize_t __attribute__((unused)) len; GNIX_TRACE(FI_LOG_EP_CTRL, "\n"); /* * unpack the message */ __gnix_vc_unpack_conn_req(msg_buffer, &target_addr, &src_addr, &src_vc_id, &src_vc_ptr, &src_smsg_attr); GNIX_DEBUG(FI_LOG_EP_CTRL, "conn req rx: (From Aries addr 0x%x Id %d to Aries 0x%x Id %d src vc 0x%lx )\n", src_addr.device_addr, src_addr.cdm_id, target_addr.device_addr, target_addr.cdm_id, src_vc_ptr); /* * lookup the ep from the addr_to_ep_ht using the target_addr * in the datagram */ key_ptr = (gnix_ht_key_t *)&target_addr; ep = (struct gnix_fid_ep *)_gnix_ht_lookup(cm_nic->addr_to_ep_ht, *key_ptr); if (ep == NULL) { GNIX_WARN(FI_LOG_EP_DATA, "_gnix_ht_lookup addr_to_ep failed\n"); ret = -FI_ENOENT; goto err; } /* * look to see if there is a VC already for the * address of the connecting EP. */ key_ptr = (gnix_ht_key_t *)&src_addr; fastlock_acquire(&ep->vc_ht_lock); vc = (struct gnix_vc *)_gnix_ht_lookup(ep->vc_ht, *key_ptr); /* * if there is no corresponding vc in the hash, * or there is an entry and its not in connecting state * go down the conn req ack route. 
*/ if ((vc == NULL) || (vc->conn_state == GNIX_VC_CONN_NONE)) { if (vc == NULL) { entry.gnix_addr = src_addr; entry.cm_nic_cdm_id = src_cm_nic_addr.cdm_id; ret = _gnix_vc_alloc(ep, &entry, &vc_try); if (ret != FI_SUCCESS) { GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_vc_alloc returned %s\n", fi_strerror(-ret)); goto err; } vc_try->conn_state = GNIX_VC_CONNECTING; ret = _gnix_ht_insert(ep->vc_ht, *key_ptr, vc_try); if (likely(ret == FI_SUCCESS)) { vc = vc_try; vc->modes |= GNIX_VC_MODE_IN_HT; } else if (ret == -FI_ENOSPC) { _gnix_vc_destroy(vc_try); } else { GNIX_WARN(FI_LOG_EP_DATA, "_gnix_ht_insert returned %s\n", fi_strerror(-ret)); goto err; } } else vc->conn_state = GNIX_VC_CONNECTING; /* * prepare a work request to * initiate an request response */ work_req = calloc(1, sizeof(*work_req)); if (work_req == NULL) { ret = -FI_ENOMEM; goto err; } data = calloc(1, sizeof(struct wq_hndl_conn_req)); if (data == NULL) { ret = -FI_ENOMEM; goto err; } memcpy(&data->src_smsg_attr, &src_smsg_attr, sizeof(src_smsg_attr)); data->vc = vc; data->src_vc_id = src_vc_id; data->src_vc_ptr = src_vc_ptr; work_req->progress_fn = __gnix_vc_conn_ack_prog_fn; work_req->data = data; work_req->completer_fn = __gnix_vc_conn_ack_comp_fn; work_req->completer_data = data; /* * add the work request to the tail of the * cm_nic's work queue, progress the cm_nic. */ fastlock_acquire(&cm_nic->wq_lock); dlist_insert_before(&work_req->list, &cm_nic->cm_nic_wq); fastlock_release(&cm_nic->wq_lock); fastlock_release(&ep->vc_ht_lock); _gnix_vc_schedule(vc); ret = _gnix_cm_nic_progress(cm_nic); if (ret != FI_SUCCESS) GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_cm_nic_progress returned %s\n", fi_strerror(-ret)); } else { /* * we can only be in connecting state if we * reach here. We have all the informatinon, * and the other side will get the information * at some point, so go ahead and build SMSG connection. 
*/ if (vc->conn_state != GNIX_VC_CONNECTING) { GNIX_WARN(FI_LOG_EP_CTRL, "vc %p not in connecting state nor in cm wq\n", vc, vc->conn_state); ret = -FI_EINVAL; goto err; } ret = __gnix_vc_smsg_init(vc, src_vc_id, &src_smsg_attr); if (ret != FI_SUCCESS) { GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_vc_smsg_init returned %s\n", fi_strerror(-ret)); goto err; } vc->conn_state = GNIX_VC_CONNECTED; GNIX_DEBUG(FI_LOG_EP_CTRL, "moving vc %p state to connected\n", vc); fastlock_release(&ep->vc_ht_lock); ret = _gnix_vc_schedule(vc); ret = _gnix_cm_nic_progress(cm_nic); if (ret != FI_SUCCESS) GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_cm_nic_progress returned %s\n", fi_strerror(-ret)); } err: return ret; }
int main(int argc, char **argv) { int op, ret, ndoms = 3, neps = 3, i, j; hints = fi_allocinfo(); if (!hints) { exit(1); } while ((op = getopt(argc, argv, "f:p:d:D:n:d:e:h")) != -1) { switch (op) { case 'f': hints->fabric_attr->name = strdup(optarg); break; case 'p': hints->fabric_attr->prov_name = strdup(optarg); break; case 'd': ndoms = atoi(optarg); break; case 'e': neps = atoi(optarg); break; case 'h': default: printf("usage: %s\n", argv[0]); printf("\t[-f fabric_name]\n"); printf("\t[-p provider_name]\n"); printf("\t[-d ndomains]\n"); printf("\t[-e neps]\n"); exit(1); } } hints->mode = ~0; ret = fi_getinfo(FI_VERSION(1, 0), NULL, 0, 0, hints, &fi); if (ret != 0) { printf("fi_getinfo %s\n", fi_strerror(-ret)); exit(1); } ret = fi_fabric(fi->fabric_attr, &fabric, NULL); if (ret != 0) { printf("fi_fabric %s\n", fi_strerror(-ret)); exit(1); } domains = (struct fid_domain **)malloc(ndoms * sizeof(struct fid_domain *)); assert(domains); eps = (struct fid_ep **)malloc(neps * ndoms * sizeof(struct fid_ep *)); assert(eps); for (i = 0; i < ndoms; i++) { ret = fi_domain(fabric, fi, &domains[i], NULL); if (ret != 0) { printf("fi_domain %s\n", fi_strerror(-ret)); exit(1); } for (j = 0; j < neps; j++) { int idx = (i * neps) + j; ret = fi_endpoint(domains[i], fi, &eps[idx], NULL); if (ret != 0) { printf("[%d:%d] ]fi_endpoint %s\n", i, j, fi_strerror(-ret)); exit(1); } } } for (i = 0; i < ndoms; i++) { for (j = 0; j < neps; j++) { int idx = (i * neps) + j; ret = fi_close(&eps[idx]->fid); if (ret != 0) { printf("Error %d closing ep: %s\n", ret, fi_strerror(-ret)); exit(1); } } ret = fi_close(&domains[i]->fid); if (ret != 0) { printf("Error %d closing domain: %s\n", ret, fi_strerror(-ret)); exit(1); } } free(eps); free(domains); ret = fi_close(&fabric->fid); if (ret != 0) { printf("Error %d closing fabric: %s\n", ret, fi_strerror(-ret)); exit(1); } fi_freeinfo(fi); fi_freeinfo(hints); return ret; }
/*
 * Work-queue progress function that sends a connection-request ack.
 *
 * @param data          a struct wq_hndl_conn_req describing the peer
 * @param complete_ptr  out: set to 1 when the work element is finished
 *                      (connected or already connected), 0 to re-queue
 *
 * Allocates the VC's SMSG mailbox if needed, packs a connection response,
 * and attempts to send it; on success the SMSG path is initialized and the
 * VC moves to GNIX_VC_CONNECTED.  -FI_EAGAIN from the send re-schedules
 * the VC and reports FI_SUCCESS so the element is retried later.
 *
 * NOTE(review): any other send failure hits assert(0) -- in a non-assert
 * build this would fall through with ret unchanged; confirm acceptable.
 */
static int __gnix_vc_conn_ack_prog_fn(void *data, int *complete_ptr)
{
	int ret = FI_SUCCESS;
	int complete = 0;
	struct wq_hndl_conn_req *work_req_data;
	struct gnix_vc *vc;
	struct gnix_mbox *mbox = NULL;
	gni_smsg_attr_t smsg_mbox_attr;
	struct gnix_fid_ep *ep = NULL;
	struct gnix_fid_domain *dom = NULL;
	struct gnix_cm_nic *cm_nic = NULL;
	char sbuf[GNIX_CM_NIC_MAX_MSG_SIZE] = {0};

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	work_req_data = (struct wq_hndl_conn_req *)data;

	/* validate the object chain before touching any of it */
	vc = work_req_data->vc;
	if (vc == NULL)
		return -FI_EINVAL;

	ep = vc->ep;
	if (ep == NULL)
		return -FI_EINVAL;

	dom = ep->domain;
	if (dom == NULL)
		return -FI_EINVAL;

	cm_nic = ep->cm_nic;
	if (cm_nic == NULL)
		return -FI_EINVAL;

	fastlock_acquire(&ep->vc_ht_lock);

	/*
	 * we may have already been moved to connecting or
	 * connected, if so early exit.
	 */
	if (vc->conn_state == GNIX_VC_CONNECTED) {
		complete = 1;
		goto exit;
	}

	/*
	 * first see if we still need a mailbox
	 */
	if (vc->smsg_mbox == NULL) {
		ret = _gnix_mbox_alloc(ep->nic->mbox_hndl, &mbox);
		if (ret == FI_SUCCESS)
			vc->smsg_mbox = mbox;
		else
			goto exit;	/* re-queued; mbox retried next pass */
	}

	mbox = vc->smsg_mbox;

	/*
	 * prep the smsg_mbox_attr
	 */
	smsg_mbox_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
	smsg_mbox_attr.msg_buffer = mbox->base;
	smsg_mbox_attr.buff_size = ep->nic->mem_per_mbox;
	smsg_mbox_attr.mem_hndl = *mbox->memory_handle;
	smsg_mbox_attr.mbox_offset = (uint64_t)mbox->offset;
	smsg_mbox_attr.mbox_maxcredit = dom->params.mbox_maxcredit;
	smsg_mbox_attr.msg_maxsize = dom->params.mbox_msg_maxsize;

	/*
	 * serialize the resp message in the buffer
	 */
	__gnix_vc_pack_conn_resp(sbuf, work_req_data->src_vc_ptr,
				 (uint64_t)vc, vc->vc_id, &smsg_mbox_attr);

	/*
	 * try to send the message, if it succeeds,
	 * initialize mailbox and move vc to connected
	 * state.
	 */
	ret = _gnix_cm_nic_send(cm_nic, sbuf, GNIX_CM_NIC_MAX_MSG_SIZE,
				vc->peer_cm_nic_addr);
	if (ret == FI_SUCCESS) {
		ret = __gnix_vc_smsg_init(vc, work_req_data->src_vc_id,
					  &work_req_data->src_smsg_attr);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "_gnix_vc_smsg_init returned %s\n",
				  fi_strerror(-ret));
			goto exit;
		}
		complete = 1;
		vc->conn_state = GNIX_VC_CONNECTED;
		GNIX_DEBUG(FI_LOG_EP_CTRL, "moving vc %p to connected\n", vc);
	} else if (ret == -FI_EAGAIN) {
		/* datagram slot busy: retry later via the scheduler */
		ret = _gnix_vc_schedule(vc);
		ret = FI_SUCCESS;
	} else
		assert(0);

exit:
	fastlock_release(&ep->vc_ht_lock);

	*complete_ptr = complete;
	return ret;
}
/*
 * fi_getinfo() handler for the usNIC provider.
 *
 * Resolves node/service (or uses addresses from hints), then walks every
 * healthy usNIC device, skipping those from which dest is unreachable or
 * whose attributes don't match hints, and appends a dgram/msg/rdm fi_info
 * entry per matching endpoint type to the result chain.
 *
 * Returns 0 with *info set on success, -FI_ENODATA when nothing matched,
 * or another negative fi_errno value.
 *
 * NOTE(review): on getaddrinfo() failure the code sets ret = -errno, but
 * getaddrinfo reports errors through its return value / gai_strerror, not
 * errno -- the value stored here may be meaningless.  Confirm and consider
 * mapping to -FI_ENODATA.
 */
static int usdf_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, struct fi_info *hints, struct fi_info **info)
{
	struct usdf_usnic_info *dp;
	struct usdf_dev_entry *dep;
	struct usd_device_attrs *dap;
	struct fi_info *fi_first;
	struct fi_info *fi_last;
	struct addrinfo *ai;
	struct sockaddr_in *src;
	struct sockaddr_in *dest;
	enum fi_ep_type ep_type;
	int metric;
	int d;
	int ret;

	USDF_TRACE("\n");

	fi_first = NULL;
	fi_last = NULL;
	ai = NULL;
	src = NULL;
	dest = NULL;

	/*
	 * Get and cache usNIC device info (populated once per process)
	 */
	if (__usdf_devinfo == NULL) {
		ret = usdf_get_devinfo();
		if (ret != 0) {
			USDF_WARN("failed to usdf_get_devinfo, ret=%d (%s)\n", ret, fi_strerror(-ret));
			if (ret == -FI_ENODEV)
				ret = -FI_ENODATA;
			goto fail;
		}
	}
	dp = __usdf_devinfo;

	/* resolve node/service into a source or destination sockaddr */
	if (node != NULL || service != NULL) {
		ret = getaddrinfo(node, service, NULL, &ai);
		if (ret != 0) {
			USDF_DBG("getaddrinfo failed, likely bad node/service specified (%s:%s)\n", node, service);
			ret = -errno;
			goto fail;
		}
		if (flags & FI_SOURCE) {
			src = (struct sockaddr_in *)ai->ai_addr;
		} else {
			dest = (struct sockaddr_in *)ai->ai_addr;
		}
	}
	/* addresses embedded in hints are fallbacks, not overrides */
	if (hints != NULL) {
		if (dest == NULL && hints->dest_addr != NULL) {
			dest = hints->dest_addr;
		}
		if (src == NULL && hints->src_addr != NULL) {
			src = hints->src_addr;
		}
	}

	for (d = 0; d < dp->uu_num_devs; ++d) {
		dep = &dp->uu_info[d];
		dap = &dep->ue_dattr;

		/* skip this device if it has some problem */
		if (!dep->ue_dev_ok) {
			USDF_DBG("skipping %s/%s\n", dap->uda_devname, dap->uda_ifname);
			continue;
		}

		/* See if dest is reachable from this device */
		if (dest != NULL && dest->sin_addr.s_addr != INADDR_ANY) {
			ret = usdf_get_distance(dap, dest->sin_addr.s_addr, &metric);
			if (ret != 0) {
				goto fail;
			}
			if (metric == -1) {
				USDF_DBG("dest %s unreachable from %s/%s, skipping\n", inet_ntoa(dest->sin_addr), dap->uda_devname, dap->uda_ifname);
				continue;
			}
		}

		/* Does this device match requested attributes?
		 */
		if (hints != NULL) {
			ret = usdf_validate_hints(hints, dap);
			if (ret != 0) {
				USDF_DBG("hints do not match for %s/%s, skipping\n", dap->uda_devname, dap->uda_ifname);
				continue;
			}

			ep_type = hints->ep_attr ? hints->ep_attr->type : FI_EP_UNSPEC;
		} else {
			ep_type = FI_EP_UNSPEC;
		}

		/* emit one fi_info per requested (or all) endpoint types;
		 * -FI_ENODATA from a fill routine just means "none of this
		 * type" and is not fatal */
		if (ep_type == FI_EP_DGRAM || ep_type == FI_EP_UNSPEC) {
			ret = usdf_fill_info_dgram(version, hints, src, dest, dap, &fi_first, &fi_last);
			if (ret != 0 && ret != -FI_ENODATA) {
				goto fail;
			}
		}

		if (ep_type == FI_EP_MSG || ep_type == FI_EP_UNSPEC) {
			ret = usdf_fill_info_msg(hints, src, dest, dap, &fi_first, &fi_last);
			if (ret != 0 && ret != -FI_ENODATA) {
				goto fail;
			}
		}

		if (ep_type == FI_EP_RDM || ep_type == FI_EP_UNSPEC) {
			ret = usdf_fill_info_rdm(hints, src, dest, dap, &fi_first, &fi_last);
			if (ret != 0 && ret != -FI_ENODATA) {
				goto fail;
			}
		}
	}

	if (fi_first != NULL) {
		*info = fi_first;
		ret = 0;
	} else {
		ret = -FI_ENODATA;
	}

fail:
	if (ret != 0) {
		fi_freeinfo(fi_first);
	}
	if (ai != NULL) {
		freeaddrinfo(ai);
	}
	if (ret != 0) {
		USDF_INFO("returning %d (%s)\n", ret, fi_strerror(-ret));
	}
	return ret;
}
static int server_connect(void) { struct fi_eq_cm_entry entry; uint32_t event; struct fi_info *info = NULL; ssize_t rd; int ret; rd = fi_eq_sread(cmeq, &event, &entry, sizeof entry, -1, 0); if (rd != sizeof entry) { printf("fi_eq_sread %zd %s\n", rd, fi_strerror((int) -rd)); return (int) rd; } if (event != FI_CONNREQ) { printf("Unexpected CM event %d\n", event); ret = -FI_EOTHER; goto err1; } info = entry.info; ret = fi_domain(fab, info, &dom, NULL); if (ret) { printf("fi_domain %s\n", fi_strerror(-ret)); goto err1; } ret = fi_endpoint(dom, info, &ep, NULL); if (ret) { printf("fi_endpoint for req %s\n", fi_strerror(-ret)); goto err1; } ret = alloc_ep_res(info); if (ret) goto err2; ret = bind_ep_res(); if (ret) goto err3; ret = fi_accept(ep, NULL, 0); if (ret) { printf("fi_accept %s\n", fi_strerror(-ret)); goto err3; } rd = fi_eq_sread(cmeq, &event, &entry, sizeof entry, -1, 0); if (rd != sizeof entry) { printf("fi_eq_sread %zd %s\n", rd, fi_strerror((int) -rd)); goto err3; } if (event != FI_COMPLETE || entry.fid != &ep->fid) { printf("Unexpected CM event %d fid %p (ep %p)\n", event, entry.fid, ep); ret = -FI_EOTHER; goto err3; } fi_freeinfo(info); return 0; err3: free_ep_res(); err2: fi_close(&ep->fid); err1: fi_reject(pep, info->connreq, NULL, 0); fi_freeinfo(info); return ret; }
static int usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, void *context) { struct fid_fabric *ff; struct usdf_fabric *fp; struct usdf_usnic_info *dp; struct usdf_dev_entry *dep; struct epoll_event ev; struct sockaddr_in sin; int ret; int d; USDF_TRACE("\n"); /* Make sure this fabric exists */ dp = __usdf_devinfo; for (d = 0; d < dp->uu_num_devs; ++d) { dep = &dp->uu_info[d]; if (dep->ue_dev_ok && strcmp(fattrp->name, dep->ue_dattr.uda_devname) == 0) { break; } } if (d >= dp->uu_num_devs) { USDF_INFO("device \"%s\" does not exit, returning -FI_ENODEV\n", fattrp->name); return -FI_ENODEV; } fp = calloc(1, sizeof(*fp)); if (fp == NULL) { USDF_INFO("unable to allocate memory for fabric\n"); return -FI_ENOMEM; } fp->fab_epollfd = -1; fp->fab_arp_sockfd = -1; LIST_INIT(&fp->fab_domain_list); fp->fab_attr.fabric = fab_utof(fp); fp->fab_attr.name = strdup(fattrp->name); fp->fab_attr.prov_name = strdup(USDF_PROV_NAME); fp->fab_attr.prov_version = USDF_PROV_VERSION; if (fp->fab_attr.name == NULL || fp->fab_attr.prov_name == NULL) { ret = -FI_ENOMEM; goto fail; } fp->fab_fid.fid.fclass = FI_CLASS_FABRIC; fp->fab_fid.fid.context = context; fp->fab_fid.fid.ops = &usdf_fi_ops; fp->fab_fid.ops = &usdf_ops_fabric; fp->fab_dev_attrs = &dep->ue_dattr; fp->fab_epollfd = epoll_create(1024); if (fp->fab_epollfd == -1) { ret = -errno; USDF_INFO("unable to allocate epoll fd\n"); goto fail; } fp->fab_eventfd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE); if (fp->fab_eventfd == -1) { ret = -errno; USDF_INFO("unable to allocate event fd\n"); goto fail; } fp->fab_poll_item.pi_rtn = usdf_fabric_progression_cb; fp->fab_poll_item.pi_context = fp; ev.events = EPOLLIN; ev.data.ptr = &fp->fab_poll_item; ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_ADD, fp->fab_eventfd, &ev); if (ret == -1) { ret = -errno; USDF_INFO("unable to EPOLL_CTL_ADD\n"); goto fail; } /* initialize timer subsystem */ ret = usdf_timer_init(fp); if (ret != 0) { USDF_INFO("unable to initialize timer\n"); 
goto fail; } ret = pthread_create(&fp->fab_thread, NULL, usdf_fabric_progression_thread, fp); if (ret != 0) { ret = -ret; USDF_INFO("unable to create progress thread\n"); goto fail; } /* create and bind socket for ARP resolution */ memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = fp->fab_dev_attrs->uda_ipaddr_be; fp->fab_arp_sockfd = socket(AF_INET, SOCK_DGRAM, 0); if (fp->fab_arp_sockfd == -1) { USDF_INFO("unable to create socket\n"); goto fail; } ret = bind(fp->fab_arp_sockfd, (struct sockaddr *) &sin, sizeof(sin)); if (ret == -1) { ret = -errno; goto fail; } atomic_initialize(&fp->fab_refcnt, 0); fattrp->fabric = fab_utof(fp); fattrp->prov_version = USDF_PROV_VERSION; *fabric = fab_utof(fp); USDF_INFO("successfully opened %s/%s\n", fattrp->name, fp->fab_dev_attrs->uda_ifname); return 0; fail: free(fp->fab_attr.name); free(fp->fab_attr.prov_name); ff = fab_utof(fp); usdf_fabric_close(&ff->fid); USDF_DBG("returning %d (%s)\n", ret, fi_strerror(-ret)); return ret; }
static int rxd_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) { struct rxd_ep *ep; struct rxd_av *av; struct util_cq *cq; struct util_cntr *cntr; int ret = 0; ep = container_of(ep_fid, struct rxd_ep, util_ep.ep_fid.fid); switch (bfid->fclass) { case FI_CLASS_AV: av = container_of(bfid, struct rxd_av, util_av.av_fid.fid); ret = ofi_ep_bind_av(&ep->util_ep, &av->util_av); if (ret) return ret; ret = fi_ep_bind(ep->dg_ep, &av->dg_av->fid, flags); if (ret) return ret; break; case FI_CLASS_CQ: cq = container_of(bfid, struct util_cq, cq_fid.fid); ret = ofi_ep_bind_cq(&ep->util_ep, cq, flags); if (ret) return ret; if (!ep->dg_cq) { ret = rxd_dg_cq_open(ep, cq->wait ? FI_WAIT_FD : FI_WAIT_NONE); if (ret) return ret; } if (cq->wait) ret = rxd_ep_wait_fd_add(ep, cq->wait); break; case FI_CLASS_EQ: break; case FI_CLASS_CNTR: cntr = container_of(bfid, struct util_cntr, cntr_fid.fid); ret = ofi_ep_bind_cntr(&ep->util_ep, cntr, flags); if (ret) return ret; if (!ep->dg_cq) { ret = rxd_dg_cq_open(ep, cntr->wait ? FI_WAIT_FD : FI_WAIT_NONE); } else if (!ep->dg_cq_fd && cntr->wait) { /* Reopen CQ with WAIT fd set */ ret = fi_close(&ep->dg_cq->fid); if (ret) { FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "Unable to close dg CQ: %s\n", fi_strerror(-ret)); return ret; } ep->dg_cq = NULL; ret = rxd_dg_cq_open(ep, FI_WAIT_FD); } if (ret) return ret; if (cntr->wait) ret = rxd_ep_wait_fd_add(ep, cntr->wait); break; default: FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "invalid fid class\n"); ret = -FI_EINVAL; break; } return ret; }
/* * this function is intended to be invoked as an argument to pthread_create, */ static void *_gnix_dgram_prog_thread_fn(void *the_arg) { int ret = FI_SUCCESS, prev_state; struct gnix_dgram_hndl *the_hndl = (struct gnix_dgram_hndl *)the_arg; sigset_t sigmask; GNIX_TRACE(FI_LOG_EP_CTRL, "\n"); /* * temporarily disable cancelability while we set up * some stuff */ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &prev_state); /* * help out Cray core-spec, say we're not an app thread * and can be run on core-spec cpus. */ ret = _gnix_task_is_not_app(); if (ret) GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_task_is_not_app call returned %d\n", ret); /* * block all signals, don't want this thread to catch * signals that may be for app threads */ memset(&sigmask, 0, sizeof(sigset_t)); ret = sigfillset(&sigmask); if (ret) { GNIX_WARN(FI_LOG_EP_CTRL, "sigfillset call returned %d\n", ret); } else { ret = pthread_sigmask(SIG_SETMASK, &sigmask, NULL); if (ret) GNIX_WARN(FI_LOG_EP_CTRL, "pthread_sigmask call returned %d\n", ret); } /* * okay now we're ready to be cancelable. */ pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &prev_state); pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); retry: ret = _gnix_dgram_poll(the_hndl, GNIX_DGRAM_BLOCK); if ((ret == -FI_ETIMEDOUT) || (ret == FI_SUCCESS)) goto retry; GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_dgram_poll returned %s\n", fi_strerror(-ret)); /* * TODO: need to be able to enqueue events on to the * ep associated with the cm_nic. */ return NULL; }
/*
 * Server side of the connect-with-accept test.
 *
 * Flow: create a passive endpoint, bind/listen, publish the listen address
 * to the client over MPI, wait for FI_CONNREQ (via epoll + fi_eq_read when
 * WANT_FDS, else blocking fi_eq_sread), verify the client's CM data,
 * accept with server_data, wait for FI_CONNECTED, post a receive, and wait
 * for its completion before tearing everything down.
 *
 * Relies on file-scope globals (fidev, ficonn, sin, epoll_fd, ofi_node,
 * ofi_service, client_data, server_data) and helpers (setup_ofi,
 * setup_ofi_active, addrstr, error, teardown_ofi) defined elsewhere.
 *
 * NOTE(review): the final fi_cq_sread return value is not checked before
 * reading cqe.op_context; on error cqe is uninitialized.  Also
 * `ret != expected_len` compares int against size_t -- confirm both.
 */
static void test_connect_with_accept_blocking_on_eq_fq_SERVER(void)
{
    int ret;

    printf("SERVER running\n");

    setup_ofi(NULL, NULL, FI_SOURCE);

#if WANT_FDS
    // Add the EQ FD to the epoll fd
    static struct epoll_event edt;
    memset(&edt, 0, sizeof(edt));
    edt.events = EPOLLIN;
    edt.data.u32 = 2222;
    ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fidev.eq_fd, &edt);
    if (ret < 0) {
        error("server epoll_ctl failed");
    }
#endif

    // Make a PEP
    ret = fi_passive_ep(fidev.fabric, fidev.info, &fidev.pep, NULL);
    if (0 != ret) {
        error("fi_passive_ep failed");
    }

#if WANT_FIXED_PORT
    size_t ss = sizeof(sin);
    ret = fi_getname(&(fidev.pep->fid), &sin, &ss);
    if (0 != ret) {
        error("fi_setname failed");
    }
    sin.sin_port = htons(listen_port);

    // Bind the PEP to listen on a specific port
    ret = fi_setname(&(fidev.pep->fid), &sin, sizeof(sin));
    if (0 != ret) {
        error("fi_setname failed");
    }
#endif

    // Bind the EQ to the PEP
    ret = fi_pep_bind(fidev.pep, &fidev.eq->fid, 0);
    if (0 != ret) {
        error("fi_pep_bind failed");
    }

    // Listen
    ret = fi_listen(fidev.pep);
    if (0 != ret) {
        error("fi_listen failed");
    }

    // Get the actual address of this PEP
    struct sockaddr_in sinout;
    size_t s = sizeof(sinout);
    ret = fi_getname(&(fidev.pep->fid), &sinout, &s);
    if (0 != ret) {
        error("fi_setname failed");
    }
    sin.sin_family = sinout.sin_family;
    sin.sin_addr = sinout.sin_addr;
    sin.sin_port = sinout.sin_port;

    // Print server addr
    printf("SERVER listening on %s\n", addrstr(&sin));

    // Send our node (IP addr) and service (port) to the client
    snprintf(ofi_node, sizeof(ofi_node) - 1, "%s",
             inet_ntoa(sin.sin_addr));
    snprintf(ofi_service, sizeof(ofi_service) - 1, "%d",
             ntohs(sin.sin_port));
    MPI_Send(ofi_node, sizeof(ofi_node) - 1, MPI_CHAR,
             1, 101, MPI_COMM_WORLD);
    MPI_Send(ofi_service, sizeof(ofi_service) - 1, MPI_CHAR,
             1, 102, MPI_COMM_WORLD);
    printf("SERVER sent via MPI to client: %s / %s\n",
           ofi_node, ofi_service);

#if WANT_FDS
    // Now wait for the listen to complete
    int nevents;
#define NEVENTS 32
    struct epoll_event events[NEVENTS];
    int timeout = 10000;
    while (1) {
        printf("SERVER blocking on epoll\n");
        nevents = epoll_wait(epoll_fd, events, NEVENTS, timeout);
        if (nevents < 0) {
            if (errno != EINTR) {
                error("server epoll wait failed");
            } else {
                continue;
            }
        } else {
            printf("SERVER successfully woke up from epoll! %d events\n", nevents);
            for (int i = 0; i < nevents; ++i) {
                if (events[i].data.u32 != 2222) {
                    error("server unexpected epoll return type");
                }
            }
            // If we got the expected event, then go read from the EQ
            break;
        }
    }
#endif

    // Wait for the FI_CONNREQ event; the entry must be big enough for
    // the CM entry plus the client's private connection data
    uint32_t event;
    uint8_t *entry_buffer;
    size_t expected_len = sizeof(struct fi_eq_cm_entry) +
        sizeof(client_data);
    entry_buffer = (uint8_t*) calloc(1, expected_len);
    if (NULL == entry_buffer) {
        error("calloc failed");
    }
    struct fi_eq_cm_entry *entry = (struct fi_eq_cm_entry*) entry_buffer;

    while (1) {
        printf("SERVER waiting for FI_CONNREQ\n");
#if WANT_FDS
        ret = fi_eq_read(fidev.eq, &event, entry, expected_len, 0);
#else
        ret = fi_eq_sread(fidev.eq, &event, entry, expected_len, -1, 0);
#endif
        if (-FI_EAVAIL == ret) {
            printf("server fi_eq_sread failed because there's something in the error queue\n");
            char buffer[2048];
            struct fi_eq_err_entry *err_entry = (struct fi_eq_err_entry*) buffer;
            ret = fi_eq_readerr(fidev.eq, err_entry, 0);
            printf("error code: %d (%s), prov err code: %d (%s)\n",
                   err_entry->err,
                   fi_strerror(err_entry->err),
                   err_entry->prov_errno,
                   fi_strerror(err_entry->prov_errno));
            error("sad panda");
        } else if (-EAGAIN == ret) {
            fprintf(stderr, "SERVER fi_eq_sread fail got -EAGAIN... trying again...\n");
            sleep(1);
            continue;
        } else if (ret < 0) {
            fprintf(stderr, "SERVER fi_eq_sread fail: %s (FI_EAVAIL = %d, -ret = %d)\n",
                    fi_strerror(-ret), FI_EAVAIL, -ret);
            error("SERVER fi_eq_sread failed for some random reason");
        } else if (event != FI_CONNREQ) {
            error("SERVER got some unexpected event");
        } else if (ret != expected_len) {
            error("SERVER got wrong length back from fi_eq_sread");
        }

        // verify the private data the client attached to its connect
        uint32_t *d = (uint32_t*) entry->data;
        for (int i = 0; i < (sizeof(client_data) / sizeof(uint32_t)); ++i) {
            if (d[i] != client_data[i]) {
                printf("SERVER got wrong CM client data: d[%d]=%d, should be %d\n",
                       i, d[i], client_data[i]);
            }
        }
        printf("SERVER got FI_CONNREQ, correct size, and correct data -- yay!\n");
        break;
    }

    // Silly logistics: setup_ofi_active adds the fd to the epoll set.
    // But we already added it.  So for simplicity, just remove it
    // here so that setup_ofi_active() can re-add it.
#if WANT_FDS
    // Remove the EQ FD from the epoll fd
    ret = epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fidev.eq_fd, &edt);
    if (ret < 0) {
        error("server epoll_ctl DEL failed");
    }
#endif

    // Make an active endpoint
    setup_ofi_active(entry->info, &ficonn.ep);

    // Accept the incoming connection
    ret = fi_accept(ficonn.ep, (void*) server_data, sizeof(server_data));
    if (ret != 0) {
        printf("fi_accept: ret=%d, %s\n", ret, fi_strerror(-ret));
        error("SERVER fi_accept failed\n");
    }

    // Need to read and get a FI_CONNECTED event
    while (1) {
        printf("SERVER waiting for FI_CONNECTED\n");
#if WANT_FDS
        ret = fi_eq_read(fidev.eq, &event, entry, expected_len, 0);
#else
        ret = fi_eq_sread(fidev.eq, &event, entry, expected_len, -1, 0);
#endif
        if (-FI_EAVAIL == ret) {
            printf("server fi_eq_sread failed because there's something in the error queue\n");
            char buffer[2048];
            struct fi_eq_err_entry *err_entry = (struct fi_eq_err_entry*) buffer;
            ret = fi_eq_readerr(fidev.eq, err_entry, 0);
            printf("error code: %d (%s), prov err code: %d (%s)\n",
                   err_entry->err,
                   fi_strerror(err_entry->err),
                   err_entry->prov_errno,
                   fi_strerror(err_entry->prov_errno));
            error("sad panda");
        } else if (-EAGAIN == ret) {
            fprintf(stderr, "SERVER fi_eq_sread fail got -EAGAIN... trying again...\n");
            sleep(1);
            continue;
        } else if (ret < 0) {
            fprintf(stderr, "SERVER fi_eq_sread fail: %s (FI_EAVAIL = %d, -ret = %d)\n",
                    fi_strerror(-ret), FI_EAVAIL, -ret);
            error("SERVER fi_eq_sread failed for some random reason");
        } else if (event != FI_CONNECTED) {
            error("SERVER got some unexpected event");
        }

        printf("SERVER got FI_CONNECTED -- yay!\n");
        break;
    }

    // Post a recv buffer for the client to send
    int msg[4] = { 0 };
    int len = sizeof(msg);
    printf("SERVER receiving len of %d\n", len);

    struct fid_mr no_mr;
    struct fid_mr *mr;
    void *recv_context = (void*) 0x17;
#if 0
    fi_mr_reg(fidev.domain, msg, len, FI_SEND | FI_RECV,
              0, (uint64_t)(uintptr_t) msg, 0, &mr, NULL);
#else
    // Try using no mr, like fi_msg_pingpong...
    memset(&no_mr, 0, sizeof(no_mr));
    mr = &no_mr;
#endif
    ret = fi_recv(ficonn.ep, msg, len,
                  fi_mr_desc(mr), 0, recv_context);
    if (ret < 0) {
        printf("fi_recv failed! %d, %s\n", ret, fi_strerror(-ret));
        MPI_Abort(MPI_COMM_WORLD, 37);
    }

    sleep(1);
    printf("SERVER posted receive -- waiting for client to send\n");
    MPI_Barrier(MPI_COMM_WORLD);

    // Wait for receive completion
    struct fi_cq_entry cqe;
    while (1) {
        ret = fi_cq_sread(ficonn.cq, &cqe, 1, 0, -1);
        if (cqe.op_context == recv_context) {
            printf("SERVER receive completed\n");
            break;
        } else {
            printf("SERVER got some other completion... continuing\n");
        }
    }

    printf("SERVER finished -- waiting for client before teardown\n");
    MPI_Barrier(MPI_COMM_WORLD);
    printf("SERVER tearing down\n");
    fi_close(&(mr->fid));
    teardown_ofi();
}
/* Destroy an unconnected VC. More Support is needed to shutdown and destroy * an active VC. */ int _gnix_vc_destroy(struct gnix_vc *vc) { int ret = FI_SUCCESS; struct gnix_nic *nic = NULL; gni_return_t status; GNIX_TRACE(FI_LOG_EP_CTRL, "\n"); if (vc->ep == NULL) { GNIX_WARN(FI_LOG_EP_CTRL, "ep null\n"); return -FI_EINVAL; } nic = vc->ep->nic; if (nic == NULL) { GNIX_WARN(FI_LOG_EP_CTRL, "ep nic null for vc %p\n", vc); return -FI_EINVAL; } /* * move vc state to terminating */ vc->conn_state = GNIX_VC_CONN_TERMINATING; /* * try to unbind the gni_ep if non-NULL. * If there are SMSG or PostFMA/RDMA outstanding * wait here for them to complete */ if (vc->gni_ep != NULL) { while (status == GNI_RC_NOT_DONE) { fastlock_acquire(&nic->lock); status = GNI_EpUnbind(vc->gni_ep); fastlock_release(&nic->lock); if ((status != GNI_RC_NOT_DONE) && (status != GNI_RC_SUCCESS)) { GNIX_WARN(FI_LOG_EP_CTRL, "GNI_EpUnBind returned %s\n", gni_err_str[status]); break; } if (status == GNI_RC_NOT_DONE) _gnix_nic_progress(nic); } fastlock_acquire(&nic->lock); status = GNI_EpDestroy(vc->gni_ep); fastlock_release(&nic->lock); if (status != GNI_RC_SUCCESS) GNIX_WARN(FI_LOG_EP_CTRL, "GNI_EpDestroy returned %s\n", gni_err_str[status]); } /* * if the vc is in a nic's work queue, remove it */ __gnix_vc_cancel(vc); /* * We may eventually want to check the state of the VC, if we * implement true VC shutdown. if ((vc->conn_state != GNIX_VC_CONN_NONE) && (vc->conn_state != GNIX_VC_CONN_TERMINATED)) { GNIX_WARN(FI_LOG_EP_CTRL, "vc conn state %d\n", vc->conn_state); GNIX_WARN(FI_LOG_EP_CTRL, "vc conn state error\n"); return -FI_EBUSY; } */ /* * if send_q not empty, return -FI_EBUSY * Note for FI_EP_MSG type eps, this behavior * may not be correct for handling fi_shutdown. 
*/ if (!slist_empty(&vc->tx_queue)) { GNIX_WARN(FI_LOG_EP_CTRL, "vc sendqueue not empty\n"); return -FI_EBUSY; } fastlock_destroy(&vc->tx_queue_lock); if (vc->smsg_mbox != NULL) { ret = _gnix_mbox_free(vc->smsg_mbox); if (ret != FI_SUCCESS) GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_mbox_free returned %s\n", fi_strerror(-ret)); vc->smsg_mbox = NULL; } if (vc->dgram != NULL) { ret = _gnix_dgram_free(vc->dgram); if (ret != FI_SUCCESS) GNIX_WARN(FI_LOG_EP_CTRL, "_gnix_dgram_free returned %s\n", fi_strerror(-ret)); vc->dgram = NULL; } ret = _gnix_nic_free_rem_id(nic, vc->vc_id); if (ret != FI_SUCCESS) GNIX_WARN(FI_LOG_EP_CTRL, "__gnix_vc_free_id returned %s\n", fi_strerror(-ret)); _gnix_free_bitmap(&vc->flags); free(vc); return ret; }
/*
 * Probe for a matching incoming message using libfabric FI_PEEK.
 *
 * Issues fi_trecvmsg() with FI_PEEK (and FI_CLAIM when *flag ==
 * CLAIM_PEEK on entry) and polls until the peek completes.  On a match,
 * *flag is set to 1 and, if status != MPI_STATUS_IGNORE, *status is
 * filled from the (stack or heap) request.  On no match, *flag is set
 * to 0.
 *
 * rreq_ptr: when non-NULL, a persistent receive request is created and
 *           returned through it (mprobe-style usage); it is freed and
 *           *rreq_ptr NULLed if no message is found.  When NULL, a
 *           stack request (rreq_s) is used purely as peek context.
 *
 * Compiled twice via ADD_SUFFIX/API_SET to produce the two tag-format
 * variants (init_recvtag vs init_recvtag_2).
 */
int ADD_SUFFIX(MPID_nem_ofi_iprobe_impl)(struct MPIDI_VC *vc,
                                         int source,
                                         int tag,
                                         MPIR_Comm * comm,
                                         int context_offset,
                                         int *flag,
                                         MPI_Status * status,
                                         MPIR_Request ** rreq_ptr)
{
    int ret, mpi_errno = MPI_SUCCESS;
    fi_addr_t remote_proc = 0;
    uint64_t match_bits, mask_bits;
    size_t len;
    MPIR_Request rreq_s, *rreq;

    BEGIN_FUNC(FCNAME);

    if (rreq_ptr) {
        /* Caller wants the request back (mprobe path): allocate a real
         * request and take a reference on the communicator. */
        MPIDI_CH3I_NM_OFI_RC(MPID_nem_ofi_create_req(&rreq, 1));
        rreq->kind = MPIR_REQUEST_KIND__RECV;
        *rreq_ptr = rreq;
        rreq->comm = comm;
        rreq->dev.match.parts.rank = source;
        rreq->dev.match.parts.tag = tag;
        rreq->dev.match.parts.context_id = comm->context_id;
        MPIR_Comm_add_ref(comm);
    }
    else {
        /* Plain iprobe: a stack request is enough to carry the peek
         * context; it never escapes this function. */
        rreq = &rreq_s;
        rreq->dev.OnDataAvail = NULL;
    }
    REQ_OFI(rreq)->pack_buffer = NULL;
    REQ_OFI(rreq)->event_callback = ADD_SUFFIX(peek_callback);
    /* match_state transitions PEEK_INIT -> PEEK_FOUND/PEEK_NOT_FOUND
     * inside peek_callback as completions are polled. */
    REQ_OFI(rreq)->match_state = PEEK_INIT;
    OFI_ADDR_INIT(source, vc, remote_proc);
#if API_SET == API_SET_1
    match_bits = init_recvtag(&mask_bits, comm->context_id + context_offset, source, tag);
#elif API_SET == API_SET_2
    match_bits = init_recvtag_2(&mask_bits, comm->context_id + context_offset, tag);
#endif

    /* ---------------------------------------------------------------- */
    /* fi_trecvmsg with FI_PEEK:                                        */
    /* Initiate a search for a match in the hardware or software queue. */
    /* The search can complete immediately with -ENOMSG.                */
    /* If successful, libfabric will enqueue a context entry into the   */
    /* completion queue to make the search nonblocking.  This code      */
    /* polls until the entry is enqueued.                               */
    /* ---------------------------------------------------------------- */
    msg_tagged_t msg;
    uint64_t msgflags = FI_PEEK;
    msg.msg_iov = NULL;
    msg.desc = NULL;
    msg.iov_count = 0;
    msg.addr = remote_proc;
    msg.tag = match_bits;
    msg.ignore = mask_bits;
    msg.context = (void *) &(REQ_OFI(rreq)->ofi_context);
    msg.data = 0;

    /* Caller signalled a claiming peek via the in/out flag argument. */
    if (*flag == CLAIM_PEEK)
        msgflags |= FI_CLAIM;

    ret = fi_trecvmsg(gl_data.endpoint, &msg, msgflags);
    if (ret == -ENOMSG) {
        /* Immediate "no match": tear down the optional request and
         * report not-found. */
        if (rreq_ptr) {
            MPIR_Request_free(rreq);
            *rreq_ptr = NULL;
            *flag = 0;
        }
        MPID_nem_ofi_poll(MPID_NONBLOCKING_POLL);
        goto fn_exit;
    }
    MPIR_ERR_CHKANDJUMP4((ret < 0), mpi_errno, MPI_ERR_OTHER, "**ofi_peek", "**ofi_peek %s %d %s %s", __SHORT_FILE__, __LINE__, FCNAME, fi_strerror(-ret));

    /* Block until peek_callback resolves the search either way. */
    while (PEEK_INIT == REQ_OFI(rreq)->match_state)
        MPID_nem_ofi_poll(MPID_BLOCKING_POLL);

    if (PEEK_NOT_FOUND == REQ_OFI(rreq)->match_state) {
        if (rreq_ptr) {
            MPIR_Request_free(rreq);
            *rreq_ptr = NULL;
            *flag = 0;
        }
        MPID_nem_ofi_poll(MPID_NONBLOCKING_POLL);
        goto fn_exit;
    }

    /* Match found: surface the status captured by the callback. */
    if (status != MPI_STATUS_IGNORE)
        *status = rreq->status;

    if (rreq_ptr)
        MPIR_Request_add_ref(rreq);

    *flag = 1;
    END_FUNC_RC(FCNAME);
}
/*
 * Connect two VCs whose endpoints share the same cm_nic (including the
 * degenerate connect-to-self case).  Since both endpoints are local and
 * we serialize with the endpoints' vc_ht locks, the only race we need
 * to deal with is one VC connect request arriving before the peer VC
 * has been set up; in that case the peer VC is allocated and inserted
 * here.
 *
 * Locking: vc->ep->vc_ht_lock is taken briefly for the initial state
 * check/transition; ep_peer->vc_ht_lock is held from the peer lookup
 * until exit_w_lock.  NOTE(review): this assumes ep and ep_peer are
 * distinct objects when both locks are in play — confirm against the
 * self-connect path, which takes exit_w_lock before touching vc_peer.
 */
static int __gnix_vc_connect_to_same_cm_nic(struct gnix_vc *vc)
{
	int ret = FI_SUCCESS;
	struct gnix_fid_domain *dom = NULL;
	struct gnix_fid_ep *ep = NULL;
	struct gnix_fid_ep *ep_peer = NULL;
	struct gnix_cm_nic *cm_nic = NULL;
	struct gnix_mbox *mbox = NULL, *mbox_peer = NULL;
	struct gnix_vc *vc_peer;
	gni_smsg_attr_t smsg_mbox_attr;
	gni_smsg_attr_t smsg_mbox_attr_peer;
	gnix_ht_key_t *key_ptr;
	struct gnix_av_addr_entry entry;

	GNIX_TRACE(FI_LOG_EP_CTRL, "\n");

	/* Validate the ep/cm_nic/domain chain before doing any work. */
	ep = vc->ep;
	if (ep == NULL)
		return -FI_EINVAL;

	cm_nic = ep->cm_nic;
	if (cm_nic == NULL) {
		ret = -FI_EINVAL;
		goto exit;
	}

	dom = ep->domain;
	if (dom == NULL) {
		ret = -FI_EINVAL;
		goto exit;
	}

	/*
	 * Under the local ep's lock: if another thread already started
	 * (or finished) connecting this VC, there is nothing to do;
	 * otherwise claim it by moving it to CONNECTING.
	 */
	fastlock_acquire(&ep->vc_ht_lock);
	if ((vc->conn_state == GNIX_VC_CONNECTING) ||
	    (vc->conn_state == GNIX_VC_CONNECTED)) {
		fastlock_release(&ep->vc_ht_lock);
		return FI_SUCCESS;
	} else
		vc->conn_state = GNIX_VC_CONNECTING;
	fastlock_release(&ep->vc_ht_lock);

	GNIX_DEBUG(FI_LOG_EP_CTRL, "moving vc %p state to connecting\n", vc);

	/* Allocate (or reuse) the local VC's SMSG mailbox. */
	if (vc->smsg_mbox == NULL) {
		ret = _gnix_mbox_alloc(vc->ep->nic->mbox_hndl, &mbox);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_DATA,
				  "_gnix_mbox_alloc returned %s\n",
				  fi_strerror(-ret));
			goto exit;
		}
		vc->smsg_mbox = mbox;
	} else
		mbox = vc->smsg_mbox;

	/* Describe the local mailbox for SMSG setup on either side. */
	smsg_mbox_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
	smsg_mbox_attr.msg_buffer = mbox->base;
	smsg_mbox_attr.buff_size = vc->ep->nic->mem_per_mbox;
	smsg_mbox_attr.mem_hndl = *mbox->memory_handle;
	smsg_mbox_attr.mbox_offset = (uint64_t)mbox->offset;
	smsg_mbox_attr.mbox_maxcredit = dom->params.mbox_maxcredit;
	smsg_mbox_attr.msg_maxsize = dom->params.mbox_msg_maxsize;

	/* Resolve the peer endpoint on this cm_nic from the VC's peer
	 * address. */
	key_ptr = (gnix_ht_key_t *)&vc->peer_addr;
	ep_peer = (struct gnix_fid_ep *)_gnix_ht_lookup(cm_nic->addr_to_ep_ht,
							*key_ptr);
	if (ep_peer == NULL) {
		GNIX_WARN(FI_LOG_EP_DATA,
			  "_gnix_ht_lookup addr_to_ep failed\n");
		ret = -FI_ENOENT;
		goto exit;
	}

	/* Look up the reverse-direction VC (peer ep -> our address),
	 * holding the peer ep's lock from here to exit_w_lock. */
	key_ptr = (gnix_ht_key_t *)&ep->my_name.gnix_addr;
	fastlock_acquire(&ep_peer->vc_ht_lock);
	vc_peer = (struct gnix_vc *)_gnix_ht_lookup(ep_peer->vc_ht,
						    *key_ptr);

	/*
	 * Handle the special case of connecting to self: one mailbox,
	 * one SMSG init, straight to CONNECTED.
	 */
	if (vc_peer == vc) {
		ret = __gnix_vc_smsg_init(vc, vc->vc_id, &smsg_mbox_attr);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_DATA,
				  "_gnix_vc_smsg_init returned %s\n",
				  fi_strerror(-ret));
			goto exit_w_lock;
		}
		vc->conn_state = GNIX_VC_CONNECTED;
		GNIX_DEBUG(FI_LOG_EP_CTRL,
			   "moving vc %p state to connected\n", vc);
		goto exit_w_lock;
	}

	/* A peer VC that already left CONN_NONE means someone else is
	 * mid-connect from the other side — bail out. */
	if ((vc_peer != NULL) &&
	    (vc_peer->conn_state != GNIX_VC_CONN_NONE)) {
		GNIX_WARN(FI_LOG_EP_CTRL,
			  "_gnix_vc_connect self, vc_peer in inconsistent state\n");
		ret = -FI_ENOSPC;
		goto exit_w_lock;
	}

	/* Peer VC not set up yet: allocate it and insert it into the
	 * peer ep's VC hash table keyed by our address. */
	if (vc_peer == NULL) {
		entry.gnix_addr = ep->my_name.gnix_addr;
		entry.cm_nic_cdm_id = ep->my_name.cm_nic_cdm_id;
		ret = _gnix_vc_alloc(ep_peer, &entry, &vc_peer);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "_gnix_vc_alloc returned %s\n",
				  fi_strerror(-ret));
			goto exit_w_lock;
		}
		ret = _gnix_ht_insert(ep_peer->vc_ht, *key_ptr, vc_peer);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_CTRL,
				  "_gnix_ht_insert returned %s\n",
				  fi_strerror(-ret));
			goto exit_w_lock;
		}
		vc_peer->modes |= GNIX_VC_MODE_IN_HT;
	}

	vc_peer->conn_state = GNIX_VC_CONNECTING;
	GNIX_DEBUG(FI_LOG_EP_CTRL,
		   "moving vc %p state to connecting\n", vc_peer);

	/* Allocate (or reuse) the peer VC's SMSG mailbox. */
	if (vc_peer->smsg_mbox == NULL) {
		ret = _gnix_mbox_alloc(vc_peer->ep->nic->mbox_hndl,
				       &mbox_peer);
		if (ret != FI_SUCCESS) {
			GNIX_WARN(FI_LOG_EP_DATA,
				  "_gnix_mbox_alloc returned %s\n",
				  fi_strerror(-ret));
			goto exit_w_lock;
		}
		vc_peer->smsg_mbox = mbox_peer;
	} else
		mbox_peer = vc_peer->smsg_mbox;

	smsg_mbox_attr_peer.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT;
	smsg_mbox_attr_peer.msg_buffer = mbox_peer->base;
	smsg_mbox_attr_peer.buff_size = vc_peer->ep->nic->mem_per_mbox;
	smsg_mbox_attr_peer.mem_hndl = *mbox_peer->memory_handle;
	smsg_mbox_attr_peer.mbox_offset = (uint64_t)mbox_peer->offset;
	smsg_mbox_attr_peer.mbox_maxcredit = dom->params.mbox_maxcredit;
	smsg_mbox_attr_peer.msg_maxsize = dom->params.mbox_msg_maxsize;

	/* Cross-wire the two SMSG channels: our VC talks to the peer's
	 * mailbox and vice versa. */
	ret = __gnix_vc_smsg_init(vc, vc_peer->vc_id, &smsg_mbox_attr_peer);
	if (ret != FI_SUCCESS) {
		GNIX_WARN(FI_LOG_EP_DATA,
			  "_gnix_vc_smsg_init returned %s\n",
			  fi_strerror(-ret));
		goto exit_w_lock;
	}

	ret = __gnix_vc_smsg_init(vc_peer, vc->vc_id, &smsg_mbox_attr);
	if (ret != FI_SUCCESS) {
		GNIX_WARN(FI_LOG_EP_DATA,
			  "_gnix_vc_smsg_init returned %s\n",
			  fi_strerror(-ret));
		goto exit_w_lock;
	}

	/* Both directions wired up: mark both VCs connected. */
	vc->conn_state = GNIX_VC_CONNECTED;
	GNIX_DEBUG(FI_LOG_EP_CTRL,
		   "moving vc %p state to connected\n", vc);
	vc_peer->conn_state = GNIX_VC_CONNECTED;
	GNIX_DEBUG(FI_LOG_EP_CTRL,
		   "moving vc %p state to connected\n", vc_peer);

exit_w_lock:
	fastlock_release(&ep_peer->vc_ht_lock);
exit:
	return ret;
}
static int run(void) { char *node, *service; uint64_t flags; int ret; size_t i; ret = ft_read_addr_opts(&node, &service, hints, &flags, &opts); if (ret) return ret; ret = opts.dst_addr ? client_setup() : server_setup(); if (ret) { fprintf(stderr, "error: %s\n", fi_strerror(-ret)); return ret; } /* Leave extra space for invalid size test */ cm_data = calloc(1, cm_data_size + 1); if (!cm_data) return -FI_ENOMEM; entry = calloc(1, sizeof(*entry) + cm_data_size); if (!entry) return -FI_ENOMEM; if (opts.dst_addr) { ret = ft_sock_connect(opts.dst_addr, sock_service); if (ret) goto err2; } else { ret = ft_sock_listen(sock_service); if (ret) goto err2; ret = ft_sock_accept(); if (ret) goto err1; } for (i = 1; i <= cm_data_size; i <<= 1) { printf("trying with data size: %zu\n", i); if (opts.dst_addr) ret = client(i); else ret = server(i); if (ret) goto err1; ret = ft_sock_sync(0); if (ret) goto err1; } /* Despite server not being setup to handle this, the client should fail * with -FI_EINVAL since this exceeds its max data size. */ if (opts.dst_addr) { printf("trying with data size exceeding maximum: %zu\n", cm_data_size + 1); /* Don't call client since it produces an error message. */ ret = client_connect(cm_data_size + 1); if (ret != -FI_EINVAL) { FT_ERR("expected -FI_EINVAL, got: [%d]:%s\n", ret, fi_strerror(-ret)); } else { ret = FI_SUCCESS; } } err1: ft_sock_shutdown(sock); err2: free(entry); return ret; }
static int __gnix_vc_hndl_conn_resp(struct gnix_cm_nic *cm_nic, char *msg_buffer, struct gnix_address src_cm_nic_addr) { int ret = FI_SUCCESS; int peer_id; struct gnix_vc *vc = NULL; uint64_t peer_vc_addr; struct gnix_fid_ep *ep; gni_smsg_attr_t peer_smsg_attr; GNIX_TRACE(FI_LOG_EP_CTRL, "\n"); /* * unpack the message */ __gnix_vc_unpack_resp(msg_buffer, (uint64_t *)&vc, &peer_vc_addr, &peer_id, &peer_smsg_attr); GNIX_DEBUG(FI_LOG_EP_CTRL, "resp rx: (From Aries 0x%x Id %d src vc %p peer vc addr 0x%lx)\n", src_cm_nic_addr.device_addr, src_cm_nic_addr.cdm_id, vc, peer_vc_addr); ep = vc->ep; assert(ep != NULL); fastlock_acquire(&ep->vc_ht_lock); /* * at this point vc should be in connecting state */ if (vc->conn_state != GNIX_VC_CONNECTING) { GNIX_WARN(FI_LOG_EP_CTRL, "vc %p not in connecting state, rather %d\n", vc, vc->conn_state); ret = -FI_EINVAL; goto err; } /* * build the SMSG connection */ ret = __gnix_vc_smsg_init(vc, peer_id, &peer_smsg_attr); if (ret != FI_SUCCESS) { GNIX_WARN(FI_LOG_EP_CTRL, "__gnix_vc_smsg_init returned %s\n", fi_strerror(-ret)); goto err; } /* * transition the VC to connected * put in to the nic's work queue for * further processing */ vc->conn_state = GNIX_VC_CONNECTED; GNIX_DEBUG(FI_LOG_EP_CTRL, " moving vc %p to state connected\n",vc); fastlock_release(&ep->vc_ht_lock); ret = _gnix_vc_schedule(vc); return ret; err: vc->conn_state = GNIX_VC_CONN_ERROR; fastlock_release(&ep->vc_ht_lock); return ret; }
int create_connection(void) { struct fi_info *prov; struct fi_eq_cm_entry entry; struct fi_info *info; ssize_t n; uint32_t event; int ret = -1; print_trace("in\n"); memset(&ctx, 0, sizeof(ctx)); if (match_provider(&prov)) goto err1; if (client_connect(prov, &ctx)) goto err2; dprint(DEBUG_CONNECT, "Waiting for Server to connect\n"); n = fi_eq_sread(ctx.eq, &event, &entry, sizeof(entry), CTIMEOUT, 0); if (n < sizeof(entry)) { struct fi_eq_err_entry eqe; int rc; print_err("fi_eq_sread '%s'(%d)\n", fi_strerror(n), (int) n); rc = fi_eq_readerr(ctx.eq, &eqe, 0); if (rc) print_err("fi_eq_readerr() returns %d '%s'\n", rc, fi_strerror(rc)); else { char buf[64]; print_err("fi_eq_readerr() prov_err '%s'(%d)\n", fi_eq_strerror(ctx.eq, eqe.prov_errno, eqe.err_data, buf, sizeof(buf)), eqe.prov_errno); print_err("fi_eq_readerr() err '%s'(%d)\n", fi_strerror(eqe.err), eqe.err); } return (int) n; } if (event != FI_CONNECTED) { print_err("unexpected event %d\n", event); return -FI_EOTHER; } /* same context specified in fi_endpoint()? */ if (entry.fid->context != CONTEXT) { print_err("entry.fid->context %lx != %lx\n", (ulong)entry.fid->context, (ulong)CONTEXT); } info = entry.info; dprint(DEBUG_CONNECT, "*** Client Connected\n"); dprint(DEBUG_CONNECT, "Client private data(len %ld): '%s'\n", (n - sizeof(entry)), entry.data); return 0; err2: client_disconnect(&ctx); fi_freeinfo(prov); err1: return ret; }
static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { struct rxd_av *av; int i = 0, index, ret = 0, success_cnt = 0, lookup = 1; uint64_t dg_fiaddr; av = container_of(av_fid, struct rxd_av, util_av.av_fid); fastlock_acquire(&av->util_av.lock); if (!av->dg_addrlen) { ret = rxd_av_set_addrlen(av, addr); if (ret) goto out; /* Skip lookups if this is the first insertion call. */ lookup = 0; } for (; i < count; i++, addr = (uint8_t *) addr + av->dg_addrlen) { ret = lookup ? rxd_av_dg_reverse_lookup(av, i, addr, &dg_fiaddr) : -FI_ENODATA; if (ret) { ret = fi_av_insert(av->dg_av, addr, 1, &dg_fiaddr, flags, context); if (ret != 1) break; } ret = ofi_av_insert_addr(&av->util_av, &dg_fiaddr, dg_fiaddr, &index); if (ret) break; success_cnt++; if (fi_addr) fi_addr[i] = index; } if (ret) { FI_WARN(&rxd_prov, FI_LOG_AV, "failed to insert address %d: %d (%s)\n", i, -ret, fi_strerror(-ret)); if (av->util_av.eq) ofi_av_write_event(&av->util_av, i, -ret, context); if (fi_addr) fi_addr[i] = FI_ADDR_NOTAVAIL; i++; } out: av->dg_av_used += success_cnt; fastlock_release(&av->util_av.lock); for (; i < count; i++) { if (av->util_av.eq) ofi_av_write_event(&av->util_av, i, FI_ECANCELED, context); if (fi_addr) fi_addr[i] = FI_ADDR_NOTAVAIL; } if (av->util_av.eq) { ofi_av_write_event(&av->util_av, success_cnt, 0, context); return 0; } return success_cnt; }