/*
 * Translate a PSM endpoint id into an endpoint address.
 *
 * If psm_ep_epid_lookup() already knows the epid and the context attached
 * to the returned address records the same epid, that address is reused.
 * Otherwise a single-endpoint psm_ep_connect() is issued on the domain's
 * PSM endpoint and a context is attached to the new address via
 * psmx_set_epaddr_context().
 *
 * Returns 0 with *epaddr filled in on success, or psmx_errno() of the PSM
 * error returned by psm_ep_connect().
 */
int psmx_epid_to_epaddr(struct psmx_fid_domain *domain,
			psm_epid_t epid, psm_epaddr_t *epaddr)
{
	struct psmx_epaddr_context *ctxt;
	psm_epconn_t conn;
	psm_error_t conn_error;
	int rc;

	/* Fast path: reuse an address that is already connected. */
	rc = psm_ep_epid_lookup(epid, &conn);
	if (rc == PSM_OK) {
		ctxt = psm_epaddr_getctxt(conn.addr);
		if (ctxt != NULL && ctxt->epid == epid) {
			*epaddr = conn.addr;
			return 0;
		}
	}

	/* Slow path: establish a new connection to this one endpoint. */
	rc = psm_ep_connect(domain->psm_ep, 1, &epid, NULL, &conn_error,
			    epaddr, 30*1e9);
	if (rc != PSM_OK)
		return psmx_errno(rc);

	psmx_set_epaddr_context(domain, epid, *epaddr);
	return 0;
}
/*
 * Connect to the peer described by info_msg.
 *
 * The peer's protocol version is compared against PSPSM_PROTOCOL_VERSION
 * first; on mismatch an error string is recorded with pspsm_err() and -1 is
 * returned without attempting a connection. Otherwise psm_ep_connect() is
 * called for the peer's epid and the peer's id is stored as the send id.
 *
 * Returns 0 on success, -1 on protocol mismatch or connect failure.
 */
static int pspsm_con_connect(pspsm_con_info_t *con_info, pspsm_info_msg_t *info_msg)
{
	psm_error_t rc, ep_rc;

	if (memcmp(info_msg->protocol_version, PSPSM_PROTOCOL_VERSION,
		   sizeof(info_msg->protocol_version)) != 0) {
		char msg[80];

		snprintf(msg, sizeof(msg), "protocol error : '%.8s' != '%.8s'",
			 info_msg->protocol_version, PSPSM_PROTOCOL_VERSION);
		pspsm_err(msg);
		pspsm_dprint(1, "pspsm_con_connect: %s", pspsm_err_str);
		return -1;
	}

	rc = psm_ep_connect(pspsm_ep, 1, &info_msg->epid, NULL, &ep_rc,
			    &con_info->epaddr, 0);

	/* The send id is recorded whether or not the connect succeeded. */
	con_info->send_id = info_msg->id;

	if (rc != PSM_OK) {
		pspsm_err(psm_error_get_string(rc));
		pspsm_dprint(1, "pspsm_con_connect: %s", pspsm_err_str);
		return -1;
	}

	pspsm_dprint(2, "pspsm_con_connect: OK");
	pspsm_dprint(2, "sending with %"PRIx64", receiving %"PRIx64,
		     con_info->send_id, con_info->recv_id);
	return 0;
}
/*
 * Connect the local endpoint `ep` to `numep` remote endpoints.
 *
 * ep                   local PSM endpoint
 * numep                number of entries in array_of_epid
 * array_of_epid        endpoint ids to connect to (all are connected, no mask)
 * array_of_epaddr_out  on success receives a calloc'ed array of `numep`
 *                      endpoint addresses; the caller owns it and must
 *                      free() it
 *
 * Returns 1 on success. Returns -1 if an allocation fails or if
 * psm_ep_connect() reports an error; in that case nothing is leaked and
 * *array_of_epaddr_out is left unmodified.
 */
int connect_endpoints(psm_ep_t ep, int numep,
                      const psm_epid_t *array_of_epid,
                      psm_epaddr_t **array_of_epaddr_out)
{
    int rc = -1;
    psm_error_t error;
    psm_error_t *errors = NULL;
    psm_epaddr_t *all_epaddrs = NULL;

    errors = (psm_error_t *) calloc(numep, sizeof(psm_error_t));
    if (errors == NULL) {
        goto out;
    }

    all_epaddrs = (psm_epaddr_t *) calloc(numep, sizeof(psm_epaddr_t));
    if (all_epaddrs == NULL) {
        goto out;
    }

    /* The PSM timeout is expressed in nanoseconds: 30 s = 30 * 10^9 ns. */
    error = psm_ep_connect(ep, numep, array_of_epid,
                           NULL, /* We want to connect all epids, no mask needed */
                           errors, all_epaddrs,
                           30LL * 1000 * 1000 * 1000);
    if (error != PSM_OK) {
        fprintf(stderr, "Connection failed\n");
        goto out;
    }

    /* Ownership of the address array moves to the caller. */
    *array_of_epaddr_out = all_epaddrs;
    all_epaddrs = NULL;
    rc = 1;

out:
    free(all_epaddrs);
    free(errors);
    return rc;
}
/*
 * Connect the local endpoint `ep` to `numep` remote endpoints.
 *
 * ep                   local PSM endpoint
 * numep                number of entries in array_of_epid
 * array_of_epid        endpoint ids to connect to (all are connected, no mask)
 * array_of_epaddr_out  on success receives a calloc'ed array of `numep`
 *                      endpoint addresses; the caller owns it and must
 *                      free() it
 *
 * Returns 1 on success. Returns -1 if an allocation fails or if
 * psm_ep_connect() reports an error; in that case nothing is leaked and
 * *array_of_epaddr_out is left unmodified.
 */
int connect_endpoints(psm_ep_t ep, int numep,
                      const psm_epid_t *array_of_epid,
                      psm_epaddr_t **array_of_epaddr_out)
{
    int rc = -1;
    psm_error_t *errors = NULL;
    psm_epaddr_t *all_epaddrs = NULL;

    errors = (psm_error_t *) calloc(numep, sizeof(psm_error_t));
    if (errors == NULL)
        goto out;

    all_epaddrs = (psm_epaddr_t *) calloc(numep, sizeof(psm_epaddr_t));
    if (all_epaddrs == NULL)
        goto out;

    /* The PSM timeout is expressed in nanoseconds: 30 s = 30 * 10^9 ns. */
    if (psm_ep_connect(ep, numep, array_of_epid,
                       NULL, /* We want to connect all epids, no mask needed */
                       errors, all_epaddrs,
                       30LL * 1000 * 1000 * 1000) != PSM_OK)
        goto out;

    /* Ownership of the address array moves to the caller. */
    *array_of_epaddr_out = all_epaddrs;
    all_epaddrs = NULL;
    rc = 1;

out:
    free(all_epaddrs);
    free(errors);
    return rc;
}
int ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs, struct ompi_proc_t** procs) { int i,j; int rc; psm_epid_t *epids_in = NULL; psm_epid_t *epid; psm_epaddr_t *epaddrs_out = NULL; psm_error_t *errs_out = NULL, err; size_t size; int proc_errors[PSM_ERROR_LAST] = { 0 }; int timeout_in_secs; assert(mtl == &ompi_mtl_psm.super); rc = OMPI_ERR_OUT_OF_RESOURCE; errs_out = (psm_error_t *) malloc(nprocs * sizeof(psm_error_t)); if (errs_out == NULL) { goto bail; } epids_in = (psm_epid_t *) malloc(nprocs * sizeof(psm_epid_t)); if (epids_in == NULL) { goto bail; } epaddrs_out = (psm_epaddr_t *) malloc(nprocs * sizeof(psm_epaddr_t)); if (epaddrs_out == NULL) { goto bail; } rc = OMPI_SUCCESS; /* Get the epids for all the processes from modex */ for (i = 0; i < (int) nprocs; i++) { rc = ompi_modex_recv(&mca_mtl_psm_component.super.mtl_version, procs[i], (void**)&epid, &size); if (rc != OMPI_SUCCESS || size != sizeof(psm_epid_t)) { return OMPI_ERROR; } epids_in[i] = *epid; } timeout_in_secs = max(ompi_mtl_psm.connect_timeout, 0.5 * nprocs); psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_NOP); err = psm_ep_connect(ompi_mtl_psm.ep, nprocs, epids_in, NULL, /* connect all */ errs_out, epaddrs_out, timeout_in_secs * 1e9); if (err) { char *errstr = (char *) ompi_mtl_psm_connect_error_msg(err); if (errstr == NULL) { opal_output(0, "PSM returned unhandled/unknown connect error: %s\n", psm_error_get_string(err)); } for (i = 0; i < (int) nprocs; i++) { psm_error_t thiserr = errs_out[i]; errstr = (char *) ompi_mtl_psm_connect_error_msg(thiserr); if (proc_errors[thiserr] == 0) { proc_errors[thiserr] = 1; opal_output(0, "PSM EP connect error (%s):", errstr ? errstr : "unknown connect error"); for (j = 0; j < (int) nprocs; j++) { if (errs_out[j] == thiserr) { opal_output(0, " %s", (NULL == procs[j]->proc_hostname) ? 
"unknown" : procs[j]->proc_hostname); } } opal_output(0, "\n"); } } rc = OMPI_ERROR; } else { /* Default error handling is enabled, errors will not be returned to * user. PSM prints the error and the offending endpoint's hostname * and exits with -1 */ psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_DEFAULT); /* Fill in endpoint data */ for (i = 0; i < (int) nprocs; i++) { mca_mtl_psm_endpoint_t *endpoint = (mca_mtl_psm_endpoint_t *) OBJ_NEW(mca_mtl_psm_endpoint_t); endpoint->peer_epid = epids_in[i]; endpoint->peer_addr = epaddrs_out[i]; procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint; } rc = OMPI_SUCCESS; } bail: if (epids_in != NULL) { free(epids_in); } if (errs_out != NULL) { free(errs_out); } if (epaddrs_out != NULL) { free(epaddrs_out); } return rc; }
/*
 * Insert `count` PSM endpoint ids into the address vector and connect to
 * those that are not connected yet.
 *
 * av       address vector fid
 * addr     array of `count` psm_epid_t values
 * count    number of addresses
 * fi_addr  output array; for FI_AV_TABLE it receives table indices (may be
 *          NULL), otherwise the resolved psm_epaddr_t values are stored in
 *          it directly
 * flags    not used by this function
 * context  not used by this function
 *
 * Entries that fail to connect are set to FI_ADDR_NOTAVAIL. For
 * FI_AV_TABLE the epids are appended to the table and unresolved entries
 * stay in it.
 *
 * Returns the number of addresses successfully inserted, or -FI_ENOMEM.
 */
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
			  fi_addr_t *fi_addr, uint64_t flags, void *context)
{
	struct psmx_fid_av *av_priv;
	psm_error_t *errors;
	int error_count = 0;
	int *mask;
	int i, j;
	fi_addr_t *result = NULL;
	struct psmx_epaddr_context *epaddr_context;

	av_priv = container_of(av, struct psmx_fid_av, av);

	errors = (psm_error_t *) calloc(count, sizeof *errors);
	if (!errors)
		return -FI_ENOMEM;

	mask = (int *) calloc(count, sizeof *mask);
	if (!mask) {
		free(errors);
		return -FI_ENOMEM;
	}

	if (av_priv->type == FI_AV_TABLE) {
		if (psmx_av_check_table_size(av_priv, count)) {
			free(mask);
			free(errors);
			return -FI_ENOMEM;
		}

		for (i=0; i<count; i++)
			av_priv->psm_epids[av_priv->last + i] = ((psm_epid_t *)addr)[i];

		/*
		 * From here on work on the table's own storage; the caller's
		 * fi_addr array is remembered in `result` and filled with
		 * table indices at the end.
		 */
		result = fi_addr;
		addr = (const void *)(av_priv->psm_epids + av_priv->last);
		fi_addr = (fi_addr_t *)(av_priv->psm_epaddrs + av_priv->last);
	}

	/* prevent connecting to the same ep twice, which is fatal in PSM */
	for (i=0; i<count; i++) {
		psm_epconn_t epconn;
		if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
			epaddr_context = psm_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid == ((psm_epid_t *) addr)[i])
				((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
			else
				mask[i] = 1;
		} else {
			mask[i] = 1;
		}
	}

	/* The overall return value is ignored; errors[] is checked per entry. */
	psm_ep_connect(av_priv->domain->psm_ep, count,
		       (psm_epid_t *) addr, mask, errors,
		       (psm_epaddr_t *) fi_addr, 30*1e9);

	for (i=0; i<count; i++){
		if (!mask[i])
			continue;

		if (errors[i] == PSM_OK ||
		    errors[i] == PSM_EPID_ALREADY_CONNECTED) {
			psmx_set_epaddr_context(av_priv->domain,
						((psm_epid_t *) addr)[i],
						((psm_epaddr_t *) fi_addr)[i]);
		} else {
			FI_INFO(&psmx_prov, FI_LOG_AV,
				"%d: psm_ep_connect returned %s. remote epid=%lx.\n",
				i, psm_error_get_string(errors[i]),
				((psm_epid_t *)addr)[i]);
			/* Each fragment ends in a space so the joined text reads correctly. */
			if (((psm_epid_t *)addr)[i] == 0)
				FI_INFO(&psmx_prov, FI_LOG_AV,
					"does the application depend on the provider "
					"to resolve IP address into endpoint id? if so "
					"check if the name server has started correctly "
					"at the other side.\n");
			fi_addr[i] = FI_ADDR_NOTAVAIL;
			error_count++;
		}
	}

	free(mask);
	free(errors);

	if (av_priv->type == FI_AV_TABLE) {
		/* NOTE: unresolved addresses are left in the AV table */
		if (result) {
			for (i=0; i<count; i++) {
				j = av_priv->last + i;
				if ((fi_addr_t)av_priv->psm_epaddrs[j] == FI_ADDR_NOTAVAIL)
					result[i] = FI_ADDR_NOTAVAIL;
				else
					result[i] = j;
			}
		}
		av_priv->last += count;
	}

	return count - error_count;
}
/*
 * Insert `count` PSM endpoint ids into the address vector and connect to
 * those that are not connected yet.
 *
 * av       address vector fid
 * addr     array of `count` psm_epid_t values; must not be NULL when
 *          count is non-zero
 * count    number of addresses
 * fi_addr  output array; for FI_AV_TABLE it receives table indices (may be
 *          NULL), otherwise the resolved psm_epaddr_t values are stored in
 *          it directly
 * flags    FI_SYNC_ERR makes `context` be treated as an int array that
 *          receives one psmx_errno() value per address (only when the AV
 *          was not opened with FI_EVENT)
 * context  completion context in FI_EVENT mode, or the error array
 *          described above
 *
 * Entries that fail to connect are set to FI_ADDR_NOTAVAIL. For
 * FI_AV_TABLE the epids are appended to the table and unresolved entries
 * stay in it.
 *
 * Returns 0 in FI_EVENT mode (results are posted through
 * psmx_av_post_completion()), otherwise the number of addresses
 * successfully inserted. Errors: -FI_EINVAL, -FI_ENOEQ, -FI_ENOMEM.
 */
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
			  fi_addr_t *fi_addr, uint64_t flags, void *context)
{
	struct psmx_fid_av *av_priv;
	psm_error_t *errors;
	int error_count = 0;
	int *mask;
	int i, j, ret;
	fi_addr_t *result = NULL;
	struct psmx_epaddr_context *epaddr_context;

	if (count && !addr) {
		FI_INFO(&psmx_prov, FI_LOG_AV,
			"the input address array is NULL.\n");
		return -FI_EINVAL;
	}

	av_priv = container_of(av, struct psmx_fid_av, av);

	if ((av_priv->flags & FI_EVENT) && !av_priv->eq)
		return -FI_ENOEQ;

	errors = (psm_error_t *) calloc(count, sizeof *errors);
	if (!errors)
		return -FI_ENOMEM;

	mask = (int *) calloc(count, sizeof *mask);
	if (!mask) {
		free(errors);
		return -FI_ENOMEM;
	}

	if (av_priv->type == FI_AV_TABLE) {
		if (psmx_av_check_table_size(av_priv, count)) {
			free(mask);
			free(errors);
			return -FI_ENOMEM;
		}

		for (i=0; i<count; i++)
			av_priv->psm_epids[av_priv->last + i] = ((psm_epid_t *)addr)[i];

		/*
		 * From here on work on the table's own storage; the caller's
		 * fi_addr array is remembered in `result` and filled with
		 * table indices at the end.
		 */
		result = fi_addr;
		addr = (const void *)(av_priv->psm_epids + av_priv->last);
		fi_addr = (fi_addr_t *)(av_priv->psm_epaddrs + av_priv->last);
	}

	/* prevent connecting to the same ep twice, which is fatal in PSM */
	for (i=0; i<count; i++) {
		psm_epconn_t epconn;
		if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
			epaddr_context = psm_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid == ((psm_epid_t *) addr)[i])
				((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
			else
				mask[i] = 1;
		} else {
			mask[i] = 1;
		}
	}

	/* The overall return value is ignored; errors[] is checked per entry. */
	psm_ep_connect(av_priv->domain->psm_ep, count,
		       (psm_epid_t *) addr, mask, errors,
		       (psm_epaddr_t *) fi_addr, 30*1e9);

	for (i=0; i<count; i++){
		if (!mask[i]) {
			errors[i] = PSM_OK;
			continue;
		}

		if (errors[i] == PSM_OK ||
		    errors[i] == PSM_EPID_ALREADY_CONNECTED) {
			psmx_set_epaddr_context(av_priv->domain,
						((psm_epid_t *) addr)[i],
						((psm_epaddr_t *) fi_addr)[i]);
			errors[i] = PSM_OK;
		} else {
			psm_epconn_t epconn;

			/* If duplicated addresses are passed to psm_ep_connect(), all but one will fail
			 * with error "Endpoint could not be reached". They should be treated as already
			 * connected.
			 */
			if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
				epaddr_context = psm_epaddr_getctxt(epconn.addr);
				if (epaddr_context && epaddr_context->epid == ((psm_epid_t *) addr)[i]) {
					((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
					errors[i] = PSM_OK;
					continue;
				}
			}

			FI_INFO(&psmx_prov, FI_LOG_AV,
				"%d: psm_ep_connect returned %s. remote epid=%lx.\n",
				i, psm_error_get_string(errors[i]),
				((psm_epid_t *)addr)[i]);
			/* Each fragment ends in a space so the joined text reads correctly. */
			if (((psm_epid_t *)addr)[i] == 0)
				FI_INFO(&psmx_prov, FI_LOG_AV,
					"does the application depend on the provider "
					"to resolve IP address into endpoint id? if so "
					"check if the name server has started correctly "
					"at the other side.\n");
			fi_addr[i] = FI_ADDR_NOTAVAIL;
			error_count++;

			if (av_priv->flags & FI_EVENT)
				psmx_av_post_completion(av_priv, context, i, errors[i]);
		}
	}

	if (av_priv->type == FI_AV_TABLE) {
		/* NOTE: unresolved addresses are left in the AV table */
		if (result) {
			for (i=0; i<count; i++) {
				j = av_priv->last + i;
				if ((fi_addr_t)av_priv->psm_epaddrs[j] == FI_ADDR_NOTAVAIL)
					result[i] = FI_ADDR_NOTAVAIL;
				else
					result[i] = j;
			}
		}
		av_priv->last += count;
	}

	if (av_priv->flags & FI_EVENT) {
		/* Final completion carries the number of successful inserts. */
		psmx_av_post_completion(av_priv, context, count - error_count, 0);
		ret = 0;
	} else {
		if (flags & FI_SYNC_ERR) {
			int *fi_errors = context;
			for (i=0; i<count; i++)
				fi_errors[i] = psmx_errno(errors[i]);
		}
		ret = count - error_count;
	}

	free(mask);
	free(errors);

	return ret;
}
/*
 * Insert `count` PSM endpoint ids into the address vector and connect to
 * those that are not connected yet.
 *
 * Any non-zero `flags` value is rejected with -FI_EBADFLAGS. For
 * FI_AV_TABLE the epids are appended to the table and, when `fi_addr` is
 * non-NULL, it receives the table index of every entry; otherwise the
 * resolved psm_epaddr_t values are written into `fi_addr` directly.
 *
 * Returns psmx_errno() of the psm_ep_connect() result, or -ENOMEM when a
 * scratch array or the table cannot be allocated.
 */
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
			  fi_addr_t *fi_addr, uint64_t flags)
{
	struct psmx_fid_av *av_priv;
	struct psmx_epaddr_context *ctxt;
	psm_error_t *conn_errors;
	psm_epid_t *epids;
	psm_epaddr_t *epaddrs;
	fi_addr_t *user_indices = NULL;
	int *connect_mask;
	int connect_rc;
	int i;

	av_priv = container_of(av, struct psmx_fid_av, av);

	/* TODO: support the FI_RANGE flag */
	if (flags)
		return -FI_EBADFLAGS;

	conn_errors = (psm_error_t *) calloc(count, sizeof *conn_errors);
	if (!conn_errors)
		return -ENOMEM;

	connect_mask = (int *) calloc(count, sizeof *connect_mask);
	if (!connect_mask) {
		free(conn_errors);
		return -ENOMEM;
	}

	if (av_priv->type == FI_AV_TABLE) {
		if (psmx_av_check_table_size(av_priv, count)) {
			free(connect_mask);
			free(conn_errors);
			return -ENOMEM;
		}

		for (i = 0; i < count; i++)
			av_priv->psm_epids[av_priv->last + i] = ((psm_epid_t *)addr)[i];

		/* Switch to the table's own storage; keep the caller's array. */
		user_indices = fi_addr;
		addr = (const void *)(av_priv->psm_epids + av_priv->last);
		fi_addr = (fi_addr_t *)(av_priv->psm_epaddrs + av_priv->last);
	}

	epids = (psm_epid_t *) addr;
	epaddrs = (psm_epaddr_t *) fi_addr;

	/* prevent connecting to the same ep twice, which is fatal in PSM */
	for (i = 0; i < count; i++) {
		psm_epconn_t conn;

		/* Assume a connection is needed until a matching one is found. */
		connect_mask[i] = 1;

		if (psm_ep_epid_lookup(epids[i], &conn) != PSM_OK)
			continue;

		ctxt = psm_epaddr_getctxt(conn.addr);
		if (ctxt && ctxt->epid == epids[i]) {
			epaddrs[i] = conn.addr;
			connect_mask[i] = 0;
		}
	}

	connect_rc = psm_ep_connect(av_priv->domain->psm_ep, count, epids,
				    connect_mask, conn_errors, epaddrs, 30*1e9);

	/* Attach a context to every address that was newly connected. */
	for (i = 0; i < count; i++) {
		if (!connect_mask[i] || conn_errors[i] != PSM_OK)
			continue;
		psmx_set_epaddr_context(av_priv->domain, epids[i], epaddrs[i]);
	}

	free(connect_mask);
	free(conn_errors);

	if (av_priv->type == FI_AV_TABLE) {
		if (user_indices) {
			for (i = 0; i < count; i++)
				user_indices[i] = av_priv->last + i;
		}
		av_priv->last += count;
	}

	return psmx_errno(connect_rc);
}