Exemplo n.º 1
0
Arquivo: psmx_av.c Projeto: ORNL/ompi
/*
 * Translate a PSM endpoint id into a PSM endpoint address.
 *
 * If psm_ep_epid_lookup() already knows the epid and the context attached to
 * the address it returns records the same epid, that address is reused.
 * Otherwise a connection is made with psm_ep_connect() and
 * psmx_set_epaddr_context() is called for the resulting address.
 *
 * domain  domain whose psm_ep is used for the connect.
 * epid    endpoint id to resolve.
 * epaddr  out: receives the endpoint address on success.
 *
 * Returns 0 on success, or psmx_errno() of the PSM error when the connect
 * call fails.
 */
int psmx_epid_to_epaddr(struct psmx_fid_domain *domain,
			psm_epid_t epid, psm_epaddr_t *epaddr)
{
	psm_epconn_t found;
	psm_error_t conn_error;
	int rc;

	rc = psm_ep_epid_lookup(epid, &found);
	if (rc == PSM_OK) {
		struct psmx_epaddr_context *ctx = psm_epaddr_getctxt(found.addr);

		/* Reuse the address only when its context matches this epid. */
		if (ctx != NULL && ctx->epid == epid) {
			*epaddr = found.addr;
			return 0;
		}
	}

	/* No usable connection yet: connect this single epid (no mask). */
	rc = psm_ep_connect(domain->psm_ep, 1, &epid, NULL, &conn_error,
			    epaddr, 30*1e9);
	if (rc != PSM_OK)
		return psmx_errno(rc);

	psmx_set_epaddr_context(domain, epid, *epaddr);
	return 0;
}
Exemplo n.º 2
0
/*
 * Connect to the peer described by info_msg.
 *
 * The peer's protocol version is compared with PSPSM_PROTOCOL_VERSION first;
 * on a mismatch no connect is attempted. Otherwise psm_ep_connect() is
 * called for the peer's epid and the resulting address is stored in
 * con_info->epaddr. con_info->send_id is set from info_msg->id whether or
 * not the connect succeeded.
 *
 * Returns 0 on success and -1 on a protocol mismatch or connect failure;
 * in both failure cases the message is handed to pspsm_err().
 */
static
int pspsm_con_connect(pspsm_con_info_t *con_info, pspsm_info_msg_t *info_msg)
{
	psm_error_t rc, peer_rc;

	/* Refuse peers that announce a different protocol version. */
	if (memcmp(info_msg->protocol_version, PSPSM_PROTOCOL_VERSION,
		   sizeof(info_msg->protocol_version)) != 0) {
		char msg[80];

		snprintf(msg, sizeof(msg), "protocol error : '%.8s' != '%.8s'",
			 info_msg->protocol_version, PSPSM_PROTOCOL_VERSION);
		pspsm_err(msg);
		pspsm_dprint(1, "pspsm_con_connect: %s", pspsm_err_str);
		return -1;
	}

	rc = psm_ep_connect(pspsm_ep, 1, &info_msg->epid, NULL, &peer_rc,
			    &con_info->epaddr, 0);
	/* Recorded before the result is examined, i.e. also on failure. */
	con_info->send_id = info_msg->id;

	if (rc != PSM_OK) {
		pspsm_err(psm_error_get_string(rc));
		pspsm_dprint(1, "pspsm_con_connect: %s", pspsm_err_str);
		return -1;
	}

	pspsm_dprint(2, "pspsm_con_connect: OK");
	pspsm_dprint(2, "sending with %"PRIx64", receiving %"PRIx64,
		     con_info->send_id, con_info->recv_id);
	return 0;
}
Exemplo n.º 3
0
/*
 * Connect the local endpoint `ep` to `numep` remote endpoints.
 *
 * ep                   local PSM endpoint.
 * numep                number of entries in array_of_epid.
 * array_of_epid        endpoint ids to connect to (all are connected, no mask).
 * array_of_epaddr_out  out: on success receives a calloc()ed array of
 *                      `numep` endpoint addresses; the caller owns it and
 *                      must free() it.
 *
 * Returns 1 on success. Returns -1 when an allocation fails or when
 * psm_ep_connect() reports an error; in that case nothing is leaked and
 * *array_of_epaddr_out is left untouched.
 */
int connect_endpoints(psm_ep_t ep, int numep, const psm_epid_t *array_of_epid,
                      psm_epaddr_t **array_of_epaddr_out)
{
  int rc = -1;
  psm_error_t error;
  psm_epaddr_t *all_epaddrs = NULL;
  psm_error_t *errors = calloc(numep, sizeof *errors);

  if (errors == NULL) {
    goto out;
  }
  all_epaddrs = calloc(numep, sizeof *all_epaddrs);
  if (all_epaddrs == NULL) {
    goto out; /* releases `errors`, which the early return used to leak */
  }

  error = psm_ep_connect(ep, numep, array_of_epid,
                 NULL, // We want to connect all epids, no mask needed
                 errors,
                 all_epaddrs,
                 // 30 second timeout expressed in nanoseconds
                 30LL * 1000 * 1000 * 1000);
  if (error != PSM_OK) {
    fprintf(stderr, "psm_ep_connect failed\n");
    goto out;
  }

  *array_of_epaddr_out = all_epaddrs;
  all_epaddrs = NULL; /* ownership handed to the caller */
  rc = 1;

out:
  free(all_epaddrs);
  free(errors);
  return rc;
}
Exemplo n.º 4
0
Arquivo: psm.c Projeto: kento/Samples
/*
 * Connect the local endpoint `ep` to every endpoint id in array_of_epid.
 *
 * On success *array_of_epaddr_out receives a calloc()ed array of `numep`
 * endpoint addresses that the caller must free(), and 1 is returned.
 * On allocation failure or when psm_ep_connect() does not return PSM_OK,
 * -1 is returned, every temporary buffer is released and
 * *array_of_epaddr_out is not modified.
 */
int connect_endpoints(psm_ep_t ep, int numep, const psm_epid_t *array_of_epid,
		      psm_epaddr_t **array_of_epaddr_out)
{
  psm_error_t *errors = calloc(numep, sizeof *errors);
  if (errors == NULL)
    return -1;

  psm_epaddr_t *all_epaddrs = calloc(numep, sizeof *all_epaddrs);
  if (all_epaddrs == NULL) {
    free(errors); /* previously leaked on this path */
    return -1;
  }

  psm_error_t status =
    psm_ep_connect(ep, numep, array_of_epid,
                   NULL, // We want to connect all epids, no mask needed
                   errors,
                   all_epaddrs,
                   // 30 second timeout expressed in nanoseconds
                   30LL * 1000 * 1000 * 1000);
  free(errors);

  if (status != PSM_OK) {
    /* Do not hand back addresses from a failed connect. */
    free(all_epaddrs);
    return -1;
  }

  *array_of_epaddr_out = all_epaddrs;
  return 1;
}
Exemplo n.º 5
0
/*
 * Connect the local PSM endpoint to the endpoints of `nprocs` peers.
 *
 * The endpoint id of every process in `procs` is fetched with
 * ompi_modex_recv(), all ids are connected with one psm_ep_connect() call,
 * and on success an mca_mtl_psm_endpoint_t holding the peer's epid and
 * address is stored in each proc's OMPI_PROC_ENDPOINT_TAG_MTL slot.
 *
 * Returns OMPI_SUCCESS on success, OMPI_ERR_OUT_OF_RESOURCE when a work
 * array cannot be allocated, and OMPI_ERROR when the modex lookup or the
 * connect fails. The temporary arrays are released on every path.
 */
int
ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
                      size_t nprocs,
                      struct ompi_proc_t** procs)
{
    int i,j;
    int rc;
    psm_epid_t   *epids_in = NULL;
    psm_epid_t   *epid;
    psm_epaddr_t *epaddrs_out = NULL;
    psm_error_t  *errs_out = NULL, err;
    size_t size;
    int proc_errors[PSM_ERROR_LAST] = { 0 };
    int timeout_in_secs;

    assert(mtl == &ompi_mtl_psm.super);
    rc = OMPI_ERR_OUT_OF_RESOURCE;

    errs_out = (psm_error_t *) malloc(nprocs * sizeof(psm_error_t));
    if (errs_out == NULL) {
        goto bail;
    }
    epids_in = (psm_epid_t *) malloc(nprocs * sizeof(psm_epid_t));
    if (epids_in == NULL) {
        goto bail;
    }
    epaddrs_out = (psm_epaddr_t *) malloc(nprocs * sizeof(psm_epaddr_t));
    if (epaddrs_out == NULL) {
        goto bail;
    }
    rc = OMPI_SUCCESS;

    /* Get the epids for all the processes from modex */
    for (i = 0; i < (int) nprocs; i++) {
        rc = ompi_modex_recv(&mca_mtl_psm_component.super.mtl_version,
                             procs[i], (void**)&epid, &size);
        if (rc != OMPI_SUCCESS || size != sizeof(psm_epid_t)) {
            /* Leave through the common cleanup path so the three work
             * arrays are released instead of leaked. */
            rc = OMPI_ERROR;
            goto bail;
        }
        epids_in[i] = *epid;
    }

    /* Use the configured timeout, but allow at least half a second per peer. */
    timeout_in_secs = max(ompi_mtl_psm.connect_timeout, 0.5 * nprocs);

    /* Install the NOP handler for the duration of the connect. */
    psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_NOP);

    err = psm_ep_connect(ompi_mtl_psm.ep,
                         nprocs,
                         epids_in,
                         NULL, /* connect all */
                         errs_out,
                         epaddrs_out,
                         timeout_in_secs * 1e9);
    if (err) {
        char *errstr = (char *) ompi_mtl_psm_connect_error_msg(err);
        if (errstr == NULL) {
            opal_output(0, "PSM returned unhandled/unknown connect error: %s\n",
                        psm_error_get_string(err));
        }
        /* Report each distinct per-peer error once, followed by the
         * hostnames of all peers that produced it. */
        for (i = 0; i < (int) nprocs; i++) {
            psm_error_t thiserr = errs_out[i];
            errstr = (char *) ompi_mtl_psm_connect_error_msg(thiserr);
            if (proc_errors[thiserr] == 0) {
                proc_errors[thiserr] = 1;
                opal_output(0, "PSM EP connect error (%s):",
                            errstr ? errstr : "unknown connect error");
                for (j = 0; j < (int) nprocs; j++) {
                    if (errs_out[j] == thiserr) {
                        opal_output(0, " %s", (NULL == procs[j]->proc_hostname) ?
                                    "unknown" : procs[j]->proc_hostname);
                    }
                }
                opal_output(0, "\n");
            }
        }

        rc = OMPI_ERROR;
    }
    else {
        /* Default error handling is enabled, errors will not be returned to
         * user.  PSM prints the error and the offending endpoint's hostname
         * and exits with -1 */
        psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_DEFAULT);

        /* Fill in endpoint data */
        for (i = 0; i < (int) nprocs; i++) {
            mca_mtl_psm_endpoint_t *endpoint =
                (mca_mtl_psm_endpoint_t *) OBJ_NEW(mca_mtl_psm_endpoint_t);
            endpoint->peer_epid = epids_in[i];
            endpoint->peer_addr = epaddrs_out[i];
            procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
        }

        rc = OMPI_SUCCESS;
    }

bail:
    /* free(NULL) is a no-op, so no guards are needed here. */
    free(epids_in);
    free(errs_out);
    free(epaddrs_out);

    return rc;
}
Exemplo n.º 6
0
Arquivo: psmx_av.c Projeto: ORNL/ompi
/*
 * Insert `count` PSM endpoint ids into an address vector and connect to them.
 *
 * av       address vector; its enclosing struct psmx_fid_av is recovered
 *          with container_of().
 * addr     array of `count` psm_epid_t values.
 * count    number of entries in addr.
 * fi_addr  output array. For a non-table AV the psm_epaddr_t of each entry
 *          is written into it; for FI_AV_TABLE it receives the table index
 *          of each entry (it may be NULL in that case). Entries that could
 *          not be connected are set to FI_ADDR_NOTAVAIL.
 * flags, context  not used by this version.
 *
 * Returns count minus the number of failed entries, or -FI_ENOMEM when a
 * work array or the table cannot be sized.
 */
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
			  fi_addr_t *fi_addr, uint64_t flags, void *context)
{
	struct psmx_fid_av *av_priv;
	psm_error_t *errors;
	int error_count = 0;
	int *mask;
	int i, j;
	fi_addr_t *result = NULL;
	struct psmx_epaddr_context *epaddr_context;

	av_priv = container_of(av, struct psmx_fid_av, av);

	errors = (psm_error_t *) calloc(count, sizeof *errors);
	if (!errors)
		return -FI_ENOMEM;

	mask = (int *) calloc(count, sizeof *mask);
	if (!mask) {
		free(errors);
		return -FI_ENOMEM;
	}

	if (av_priv->type == FI_AV_TABLE) {
		if (psmx_av_check_table_size(av_priv, count)) {
			free(mask);
			free(errors);
			return -FI_ENOMEM;
		}

		/* Copy the new epids to the tail of the table. */
		for (i=0; i<count; i++)
			av_priv->psm_epids[av_priv->last + i] = ((psm_epid_t *)addr)[i];

		/* From here on addr/fi_addr refer to the table's own storage;
		 * the caller's output array is remembered in `result`. */
		result = fi_addr;
		addr = (const void *)(av_priv->psm_epids + av_priv->last);
		fi_addr = (fi_addr_t *)(av_priv->psm_epaddrs + av_priv->last);
	}

	/* prevent connecting to the same ep twice, which is fatal in PSM */
	for (i=0; i<count; i++) {
		psm_epconn_t epconn;
		if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
			epaddr_context = psm_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid  == ((psm_epid_t *) addr)[i])
				((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
			else
				mask[i] = 1;
		}
		else {
			mask[i] = 1;
		}
	}

	/* The return value is not used; each entry is judged through errors[]. */
	psm_ep_connect(av_priv->domain->psm_ep, count, 
			(psm_epid_t *) addr, mask, errors,
			(psm_epaddr_t *) fi_addr, 30*1e9);

	for (i=0; i<count; i++){
		/* Entries that were already connected need no further work. */
		if (!mask[i])
			continue;

		if (errors[i] == PSM_OK || errors[i] == PSM_EPID_ALREADY_CONNECTED) {
			psmx_set_epaddr_context(av_priv->domain,
						((psm_epid_t *) addr)[i],
						((psm_epaddr_t *) fi_addr)[i]);
		}
		else {
			FI_INFO(&psmx_prov, FI_LOG_AV,
				"%d: psm_ep_connect returned %s. remote epid=%lx.\n",
				i, psm_error_get_string(errors[i]),
				((psm_epid_t *)addr)[i]);
			/* Adjacent literals are concatenated verbatim, so each
			 * piece carries its own trailing space. */
			if (((psm_epid_t *)addr)[i] == 0)
				FI_INFO(&psmx_prov, FI_LOG_AV,
					"does the application depend on the provider "
					"to resolve IP address into endpoint id? if so "
					"check if the name server has started correctly "
					"at the other side.\n");
			fi_addr[i] = FI_ADDR_NOTAVAIL;
			error_count++;
		}
	}

	free(mask);
	free(errors);

	if (av_priv->type == FI_AV_TABLE) {
		/* NOTE: unresolved addresses are left in the AV table */
		if (result) {
			for (i=0; i<count; i++) {
				j = av_priv->last + i;
				if ((fi_addr_t)av_priv->psm_epaddrs[j] == FI_ADDR_NOTAVAIL)
					result[i] = FI_ADDR_NOTAVAIL;
				else
					result[i] = j;
			}
		}
		av_priv->last += count;
	}

	return count - error_count;
}
Exemplo n.º 7
0
/*
 * Insert `count` PSM endpoint ids into an address vector and connect to them.
 *
 * av       address vector; its enclosing struct psmx_fid_av is recovered
 *          with container_of().
 * addr     array of `count` psm_epid_t values; must not be NULL when
 *          count is non-zero.
 * count    number of entries in addr.
 * fi_addr  output array. For a non-table AV the psm_epaddr_t of each entry
 *          is written into it; for FI_AV_TABLE it receives the table index
 *          of each entry (it may be NULL in that case). Entries that could
 *          not be connected are set to FI_ADDR_NOTAVAIL.
 * flags    when FI_SYNC_ERR is set (and the AV is not in FI_EVENT mode),
 *          `context` is treated as an int array of `count` per-entry error
 *          codes to fill in.
 * context  passed to psmx_av_post_completion() in FI_EVENT mode; see flags.
 *
 * Returns -FI_EINVAL, -FI_ENOEQ or -FI_ENOMEM on early failure. Otherwise
 * returns 0 in FI_EVENT mode (results are posted as completions) or the
 * number of successfully inserted entries.
 */
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
			  fi_addr_t *fi_addr, uint64_t flags, void *context)
{
	struct psmx_fid_av *av_priv;
	psm_error_t *errors;
	int error_count = 0;
	int *mask;
	int i, j, ret;
	fi_addr_t *result = NULL;
	struct psmx_epaddr_context *epaddr_context;

	if (count && !addr) {
		FI_INFO(&psmx_prov, FI_LOG_AV,
			"the input address array is NULL.\n");
		return -FI_EINVAL;
	}

	av_priv = container_of(av, struct psmx_fid_av, av);

	/* Event-mode completions need an event queue to be posted to. */
	if ((av_priv->flags & FI_EVENT) && !av_priv->eq)
		return -FI_ENOEQ;

	errors = (psm_error_t *) calloc(count, sizeof *errors);
	if (!errors)
		return -FI_ENOMEM;

	mask = (int *) calloc(count, sizeof *mask);
	if (!mask) {
		free(errors);
		return -FI_ENOMEM;
	}

	if (av_priv->type == FI_AV_TABLE) {
		if (psmx_av_check_table_size(av_priv, count)) {
			free(mask);
			free(errors);
			return -FI_ENOMEM;
		}

		/* Copy the new epids to the tail of the table. */
		for (i=0; i<count; i++)
			av_priv->psm_epids[av_priv->last + i] = ((psm_epid_t *)addr)[i];

		/* From here on addr/fi_addr refer to the table's own storage;
		 * the caller's output array is remembered in `result`. */
		result = fi_addr;
		addr = (const void *)(av_priv->psm_epids + av_priv->last);
		fi_addr = (fi_addr_t *)(av_priv->psm_epaddrs + av_priv->last);
	}

	/* prevent connecting to the same ep twice, which is fatal in PSM */
	for (i=0; i<count; i++) {
		psm_epconn_t epconn;
		if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
			epaddr_context = psm_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid  == ((psm_epid_t *) addr)[i])
				((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
			else
				mask[i] = 1;
		} else {
			mask[i] = 1;
		}
	}

	/* The return value is not used; each entry is judged through errors[]. */
	psm_ep_connect(av_priv->domain->psm_ep, count, 
			(psm_epid_t *) addr, mask, errors,
			(psm_epaddr_t *) fi_addr, 30*1e9);

	for (i=0; i<count; i++){
		/* Entries that were already connected count as success. */
		if (!mask[i]) {
			errors[i] = PSM_OK;
			continue;
		}

		if (errors[i] == PSM_OK || errors[i] == PSM_EPID_ALREADY_CONNECTED) {
			psmx_set_epaddr_context(av_priv->domain,
						((psm_epid_t *) addr)[i],
						((psm_epaddr_t *) fi_addr)[i]);
			errors[i] = PSM_OK;
		} else {
			psm_epconn_t epconn;

			/* If duplicated addresses are passed to psm_ep_connect(), all but one will fail
			 * with error "Endpoint could not be reached". They should be treated as already
			 * connected.
			 */
			if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
				epaddr_context = psm_epaddr_getctxt(epconn.addr);
				if (epaddr_context && epaddr_context->epid  == ((psm_epid_t *) addr)[i]) {
					((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
					errors[i] = PSM_OK;
					continue;
				}
			}

			FI_INFO(&psmx_prov, FI_LOG_AV,
				"%d: psm_ep_connect returned %s. remote epid=%lx.\n",
				i, psm_error_get_string(errors[i]),
				((psm_epid_t *)addr)[i]);
			/* Adjacent literals are concatenated verbatim, so each
			 * piece carries its own trailing space. */
			if (((psm_epid_t *)addr)[i] == 0)
				FI_INFO(&psmx_prov, FI_LOG_AV,
					"does the application depend on the provider "
					"to resolve IP address into endpoint id? if so "
					"check if the name server has started correctly "
					"at the other side.\n");
			fi_addr[i] = FI_ADDR_NOTAVAIL;
			error_count++;

			if (av_priv->flags & FI_EVENT)
				psmx_av_post_completion(av_priv, context, i, errors[i]);
		}
	}

	if (av_priv->type == FI_AV_TABLE) {
		/* NOTE: unresolved addresses are left in the AV table */
		if (result) {
			for (i=0; i<count; i++) {
				j = av_priv->last + i;
				if ((fi_addr_t)av_priv->psm_epaddrs[j] == FI_ADDR_NOTAVAIL)
					result[i] = FI_ADDR_NOTAVAIL;
				else
					result[i] = j;
			}
		}
		av_priv->last += count;
	}

	if (av_priv->flags & FI_EVENT) {
		/* Final completion carries the number of successful inserts. */
		psmx_av_post_completion(av_priv, context, count - error_count, 0);
		ret = 0;
	} else {
		if (flags & FI_SYNC_ERR) {
			int *fi_errors = context;
			for (i=0; i<count; i++)
				fi_errors[i] = psmx_errno(errors[i]);
		}
		ret = count - error_count;
	}

	/* errors[] is still read above, so both arrays are released last. */
	free(mask);
	free(errors);
	return ret;
}
Exemplo n.º 8
0
/*
 * Insert `count` PSM endpoint ids into an address vector and connect to them.
 *
 * av       address vector; its enclosing struct psmx_fid_av is recovered
 *          with container_of().
 * addr     array of `count` psm_epid_t values.
 * count    number of entries in addr.
 * fi_addr  output array. For a non-table AV the psm_epaddr_t of each entry
 *          is written into it; for FI_AV_TABLE it receives the table index
 *          of each entry (it may be NULL in that case).
 * flags    must be 0; anything else yields -FI_EBADFLAGS.
 *
 * Returns -ENOMEM when a work array or the table cannot be sized, otherwise
 * psmx_errno() of the psm_ep_connect() result.
 */
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
			  fi_addr_t *fi_addr, uint64_t flags)
{
	struct psmx_fid_av *av_priv = container_of(av, struct psmx_fid_av, av);
	psm_epid_t *epids;
	psm_epaddr_t *epaddrs;
	fi_addr_t *user_out = NULL;
	psm_error_t *conn_errors;
	int *need_connect;
	int rc;
	int n;

	/* TODO: support the FI_RANGE flag */
	if (flags)
		return -FI_EBADFLAGS;

	conn_errors = (psm_error_t *) calloc(count, sizeof *conn_errors);
	if (!conn_errors)
		return -ENOMEM;

	need_connect = (int *) calloc(count, sizeof *need_connect);
	if (!need_connect) {
		free(conn_errors);
		return -ENOMEM;
	}

	/* Typed views of the caller's arrays; redirected below for tables. */
	epids = (psm_epid_t *) addr;
	epaddrs = (psm_epaddr_t *) fi_addr;

	if (av_priv->type == FI_AV_TABLE) {
		if (psmx_av_check_table_size(av_priv, count)) {
			free(need_connect);
			free(conn_errors);
			return -ENOMEM;
		}

		/* Copy the new epids to the tail of the table. */
		for (n = 0; n < count; n++)
			av_priv->psm_epids[av_priv->last + n] = epids[n];

		/* Work on the table's own storage from here on; the caller's
		 * array only receives table indices at the end. */
		user_out = fi_addr;
		epids = (psm_epid_t *)(av_priv->psm_epids + av_priv->last);
		epaddrs = (psm_epaddr_t *)(av_priv->psm_epaddrs + av_priv->last);
	}

	/* prevent connecting to the same ep twice, which is fatal in PSM */
	for (n = 0; n < count; n++) {
		psm_epconn_t epconn;
		struct psmx_epaddr_context *ctx;

		if (psm_ep_epid_lookup(epids[n], &epconn) != PSM_OK) {
			need_connect[n] = 1;
			continue;
		}

		ctx = psm_epaddr_getctxt(epconn.addr);
		if (ctx && ctx->epid == epids[n])
			epaddrs[n] = epconn.addr;
		else
			need_connect[n] = 1;
	}

	rc = psm_ep_connect(av_priv->domain->psm_ep, count,
			    epids, need_connect, conn_errors,
			    epaddrs, 30*1e9);

	/* Only freshly connected entries get psmx_set_epaddr_context(). */
	for (n = 0; n < count; n++) {
		if (!need_connect[n] || conn_errors[n] != PSM_OK)
			continue;

		psmx_set_epaddr_context(av_priv->domain, epids[n], epaddrs[n]);
	}

	free(need_connect);
	free(conn_errors);

	if (av_priv->type == FI_AV_TABLE) {
		if (user_out) {
			for (n = 0; n < count; n++)
				user_out[n] = av_priv->last + n;
		}
		av_priv->last += count;
	}

	return psmx_errno(rc);
}