Beispiel #1
0
/*
 * Translate a PSM2 endpoint id into an endpoint address.
 *
 * If psm2_ep_epid_lookup() already knows the epid and the context attached
 * to that address records the same epid, the existing address is reused.
 * Otherwise a single-endpoint psm2_ep_connect() is issued and
 * psmx2_set_epaddr_context() is called on the resulting address.
 *
 * domain: domain whose psm2_ep is used for the connect.
 * epid:   remote endpoint id to resolve.
 * epaddr: out parameter receiving the endpoint address.
 *
 * Returns 0 on success, or psmx2_errno() of the PSM2 error when the
 * connect fails.
 */
int psmx2_epid_to_epaddr(struct psmx2_fid_domain *domain,
			 psm2_epid_t epid, psm2_epaddr_t *epaddr)
{
	struct psmx2_epaddr_context *ctx;
	psm2_epconn_t conn;
	psm2_error_t conn_err;
	int rc;

	/* Fast path: reuse a connection that is already established. */
	rc = psm2_ep_epid_lookup(epid, &conn);
	if (rc == PSM2_OK) {
		ctx = psm2_epaddr_getctxt(conn.addr);
		if (ctx != NULL && ctx->epid == epid) {
			*epaddr = conn.addr;
			return 0;
		}
	}

	/* Slow path: connect to exactly one endpoint, no mask needed. */
	rc = psm2_ep_connect(domain->psm2_ep, 1, &epid, NULL, &conn_err,
			     epaddr, psmx2_conn_timeout(1));
	if (rc != PSM2_OK)
		return psmx2_errno(rc);

	psmx2_set_epaddr_context(domain, epid, *epaddr);
	return 0;
}
Beispiel #2
0
int
ompi_mtl_psm2_add_procs(struct mca_mtl_base_module_t *mtl,
                      size_t nprocs,
                      struct ompi_proc_t** procs)
{
    int i,j;
    int rc;
    psm2_epid_t   *epids_in = NULL;
    int *mask_in = NULL;
    psm2_epid_t	 *epid;
    psm2_epaddr_t *epaddrs_out = NULL;
    psm2_error_t  *errs_out = NULL, err;
    size_t size;
    int proc_errors[PSM2_ERROR_LAST] = { 0 };
    int timeout_in_secs;

    assert(mtl == &ompi_mtl_psm2.super);
    rc = OMPI_ERR_OUT_OF_RESOURCE;

    errs_out = (psm2_error_t *) malloc(nprocs * sizeof(psm2_error_t));
    if (errs_out == NULL) {
	goto bail;
    }
    epids_in = (psm2_epid_t *) malloc(nprocs * sizeof(psm2_epid_t));
    if (epids_in == NULL) {
	goto bail;
    }
    mask_in = (int *) malloc(nprocs * sizeof(int));
    if (mask_in == NULL) {
	goto bail;
    }
    epaddrs_out = (psm2_epaddr_t *) malloc(nprocs * sizeof(psm2_epaddr_t));
    if (epaddrs_out == NULL) {
	goto bail;
    }
    rc = OMPI_SUCCESS;

    /* Get the epids for all the processes from modex */
    for (i = 0; i < (int) nprocs; i++) {
        if (NULL != procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]) {
            /* Already connected: don't connect again */
            mask_in[i] = 0;
            continue;
        }

        OPAL_MODEX_RECV(rc, &mca_mtl_psm2_component.super.mtl_version,
                        &procs[i]->super.proc_name, (void**)&epid, &size);
	if (rc != OMPI_SUCCESS || size != sizeof(psm2_epid_t)) {
	  return OMPI_ERROR;
	}
	epids_in[i] = *epid;
	mask_in[i] = 1;
    }

    timeout_in_secs = max(ompi_mtl_psm2.connect_timeout, 0.5 * nprocs);

    psm2_error_register_handler(ompi_mtl_psm2.ep, PSM2_ERRHANDLER_NOP);

    err = psm2_ep_connect(ompi_mtl_psm2.ep,
			 nprocs,
			 epids_in,
			 mask_in,
			 errs_out,
			 epaddrs_out,
			 timeout_in_secs * 1e9);
    if (err) {
	char *errstr = (char *) ompi_mtl_psm2_connect_error_msg(err);
	if (errstr == NULL) {
	    opal_output(0, "PSM2 returned unhandled/unknown connect error: %s\n",
			psm2_error_get_string(err));
	}
	for (i = 0; i < (int) nprocs; i++) {
            if (0 == mask_in[i]) {
                    continue;
            }

	    psm2_error_t thiserr = errs_out[i];
	    errstr = (char *) ompi_mtl_psm2_connect_error_msg(thiserr);
	    if (proc_errors[thiserr] == 0) {
		proc_errors[thiserr] = 1;
		opal_output(0, "PSM2 EP connect error (%s):",
			    errstr ? errstr : "unknown connect error");
		for (j = 0; j < (int) nprocs; j++) {
		  if (errs_out[j] == thiserr) {
                      opal_output(0, " %s", (NULL == procs[j]->super.proc_hostname) ?
                                  "unknown" : procs[j]->super.proc_hostname);
		  }
		}
		opal_output(0, "\n");
	    }
	}

	rc = OMPI_ERROR;
    }
    else {
	/* Default error handling is enabled, errors will not be returned to
	 * user.  PSM2 prints the error and the offending endpoint's hostname
	 * and exits with -1 */
	psm2_error_register_handler(ompi_mtl_psm2.ep, PSM2_ERRHANDLER_DEFAULT);

	/* Fill in endpoint data */
	for (i = 0; i < (int) nprocs; i++) {
            if (0 == mask_in[i]) {
                    continue;
            }

            mca_mtl_psm2_endpoint_t *endpoint =
		(mca_mtl_psm2_endpoint_t *) OBJ_NEW(mca_mtl_psm2_endpoint_t);
	    endpoint->peer_epid = epids_in[i];
	    endpoint->peer_addr = epaddrs_out[i];
            procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
	}

	rc = OMPI_SUCCESS;
    }

bail:
    if (epids_in != NULL) {
	free(epids_in);
    }
    if (mask_in != NULL) {
        free(mask_in);
    }
    if (errs_out != NULL) {
	free(errs_out);
    }
    if (epaddrs_out != NULL) {
	free(epaddrs_out);
    }

    return rc;
}
Beispiel #3
0
/*
 * Connect the address vector's domain endpoint to a batch of remote epids.
 *
 * Entries that psm2_ep_epid_lookup() already resolves to an address whose
 * context records the same epid are reused directly; all others are flagged
 * in `mask` and handed to a single psm2_ep_connect() call.
 *
 * av:      address vector; av->domain->psm2_ep is the local endpoint and
 *          av->flags decides whether failures are posted as completions.
 * count:   number of entries in epids/mask/errors/epaddrs.
 * epids:   remote endpoint ids.
 * mask:    per-entry connect flags.  This function only ever writes 1 into
 *          it, so entries for already-connected peers keep whatever value
 *          the caller supplied -- presumably the caller passes a zeroed
 *          array; verify against the call site.
 * errors:  per-entry PSM2 result, filled by psm2_ep_connect().
 * epaddrs: out array of endpoint addresses; failed entries are set to
 *          FI_ADDR_NOTAVAIL.
 * context: passed through to psmx2_av_post_completion() on failure.
 *
 * Returns the number of entries that could not be connected.
 */
static int psmx2_av_connet_eps(struct psmx2_fid_av *av, size_t count,
			       psm2_epid_t *epids, int *mask,
			       psm2_error_t *errors,
			       psm2_epaddr_t *epaddrs,
			       void *context)
{
	size_t i;
	psm2_epconn_t epconn;
	struct psmx2_epaddr_context *epaddr_context;
	int error_count = 0;

	/* set up mask to prevent connecting to an already connected ep */
	for (i = 0; i < count; i++) {
		if (psm2_ep_epid_lookup(epids[i], &epconn) == PSM2_OK) {
			epaddr_context = psm2_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid == epids[i])
				epaddrs[i] = epconn.addr;
			else
				mask[i] = 1;
		} else {
			mask[i] = 1;
		}
	}

	/* The aggregate return value is not used; each requested entry is
	 * judged individually through errors[] below. */
	psm2_ep_connect(av->domain->psm2_ep, count, epids, mask, errors,
			epaddrs, psmx2_conn_timeout(count));

	for (i = 0; i < count; i++) {
		if (!mask[i])
			continue;

		if (errors[i] == PSM2_OK ||
		    errors[i] == PSM2_EPID_ALREADY_CONNECTED) {
			psmx2_set_epaddr_context(av->domain, epids[i], epaddrs[i]);
			continue;
		}

		/* If duplicated addrs are passed to psm2_ep_connect(),
		 * all but one will fail with error "Endpoint could not
		 * be reached". This should be treated the same as
		 * "Endpoint already connected".
		 */
		if (psm2_ep_epid_lookup(epids[i], &epconn) == PSM2_OK) {
			epaddr_context = psm2_epaddr_getctxt(epconn.addr);
			if (epaddr_context &&
			    epaddr_context->epid == epids[i]) {
				epaddrs[i] = epconn.addr;
				continue;
			}
		}

		FI_INFO(&psmx2_prov, FI_LOG_AV,
			"%zu: psm2_ep_connect returned %s. remote epid=%lx.\n",
			i, psm2_error_get_string(errors[i]), epids[i]);
		if (epids[i] == 0)
			/* Adjacent literals are concatenated verbatim, so
			 * each piece carries its own separating space. */
			FI_INFO(&psmx2_prov, FI_LOG_AV,
				"does the application depend on the provider "
				"to resolve IP address into endpoint id? if so "
				"check if the name server has started correctly "
				"at the other side.\n");
		epaddrs[i] = (void *)FI_ADDR_NOTAVAIL;
		error_count++;

		if (av->flags & FI_EVENT)
			psmx2_av_post_completion(av, context, i, errors[i]);
	}

	return error_count;
}