Esempio n. 1
0
File: psm.c Progetto: kento/Samples
int  main(void)
{
  int verno_major = PSM_VERNO_MAJOR;
  int verno_minor = PSM_VERNO_MINOR;
  int err = psm_error_register_handler(NULL, // Global handler
				       PSM_ERRHANDLER_NO_HANDLER); // return errors
  if (err) {
    fprintf(stderr, "t register global handler: %s\n",
	    psm_error_get_string(err));
    return -1;
  }
  err = psm_init(&verno_major, &verno_minor);
  if (err || verno_major > PSM_VERNO_MAJOR) {
    if (err)
      fprintf(stderr, "PSM initialization failure: %s\n",
	      psm_error_get_string(err));
    else
      fprintf(stderr, "PSM loaded an unexpected/unsupported "
	      "version (%d.%d)\n", verno_major, verno_minor);
    return -1;
  }
  // We were able to initialize PSM but will defer all further error
  // handling since most of the errors beyond this point will be fatal.
  err = psm_error_register_handler(NULL, // Global handler
				       PSM_ERRHANDLER_PSM_HANDLER); //
  if (err) {
    fprintf(stderr, "t register global errhandler: %s\n",
	    psm_error_get_string(err));
    return -1;
  }
  return 0;
}
Esempio n. 2
0
static mca_mtl_base_module_t *
ompi_mtl_psm2_component_init(bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    psm_error_t	err;
    int	verno_major = PSM_VERNO_MAJOR;
    int verno_minor = PSM_VERNO_MINOR;
    int local_rank = -1, num_local_procs = 0;
    int num_total_procs = 0;

    /* Compute the total number of processes on this host and our local rank
     * on that node. We need to provide PSM with these values so it can
     * allocate hardware contexts appropriately across processes.
     */
    if (OMPI_SUCCESS != get_num_local_procs(&num_local_procs)) {
        opal_output(0, "Cannot determine number of local processes. "
                    "Cannot continue.\n");
        return NULL;
    }
    if (OMPI_SUCCESS != get_local_rank(&local_rank)) {
        opal_output(0, "Cannot determine local rank. Cannot continue.\n");
        return NULL;
    }
    if (OMPI_SUCCESS != get_num_total_procs(&num_total_procs)) {
        opal_output(0, "Cannot determine total number of processes. "
                    "Cannot continue.\n");
        return NULL;
    }

    err = psm_error_register_handler(NULL /* no ep */,
			             PSM_ERRHANDLER_NOP);
    if (err) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n",
		    psm_error_get_string(err));
	return NULL;
    }

    if (num_local_procs == num_total_procs) {
      setenv("PSM_DEVICES", "self,shm", 0);
    }

    err = psm_init(&verno_major, &verno_minor);
    if (err) {
      opal_show_help("help-mtl-psm.txt",
		     "psm init", true,
		     psm_error_get_string(err));
      return NULL;
    }

    /* Complete PSM initialization */
    ompi_mtl_psm2_module_init(local_rank, num_local_procs);

    ompi_mtl_psm2.super.mtl_request_size =
      sizeof(mca_mtl_psm2_request_t) -
      sizeof(struct mca_mtl_request_t);

    return &ompi_mtl_psm2.super;
}
Esempio n. 3
0
static int psmx_init_lib(void)
{
	int major, minor;
	int ret = 0, err;

	if (psmx_lib_initialized)
		return 0;

	pthread_mutex_lock(&psmx_lib_mutex);

	if (psmx_lib_initialized)
		goto out;

	psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER);

	major = PSM_VERNO_MAJOR;
	minor = PSM_VERNO_MINOR;

	err = psm_init(&major, &minor);
	if (err != PSM_OK) {
		FI_WARN(&psmx_prov, FI_LOG_CORE,
			"psm_init failed: %s\n", psm_error_get_string(err));
		ret = err;
		goto out;
	}

	FI_INFO(&psmx_prov, FI_LOG_CORE,
		"PSM header version = (%d, %d)\n", PSM_VERNO_MAJOR, PSM_VERNO_MINOR);
	FI_INFO(&psmx_prov, FI_LOG_CORE,
		"PSM library version = (%d, %d)\n", major, minor);

	if (major != PSM_VERNO_MAJOR) {
		psmx_am_compat_mode = 1;
		FI_INFO(&psmx_prov, FI_LOG_CORE,
			"PSM AM compat mode enabled: appliation %d.%d, library %d.%d.\n",
			PSM_VERNO_MAJOR, PSM_VERNO_MINOR, major, minor);
	}

	if (major > 1) {
		psmx_compat_lib = 1;
		FI_INFO(&psmx_prov, FI_LOG_CORE,
			"PSM is supported via the psm2-compat library over PSM2.\n");
	}

	psmx_lib_initialized = 1;

out:
	pthread_mutex_unlock(&psmx_lib_mutex);
	return ret;
}
Esempio n. 4
0
int ompi_mtl_psm_module_init(int local_rank, int num_local_procs) { 
    psm_error_t err;
    psm_ep_t	ep; /* endpoint handle */
    psm_mq_t	mq;
    psm_epid_t	epid; /* unique lid+port identifier */
    psm_uuid_t  unique_job_key;
    struct psm_ep_open_opts ep_opt;
    unsigned long long *uu = (unsigned long long *) unique_job_key;
    char *generated_key;
    char env_string[256];
    
    generated_key = getenv("OMPI_MCA_orte_precondition_transports");
    memset(uu, 0, sizeof(psm_uuid_t));
    
    if (!generated_key || (strlen(generated_key) != 33) ||
        sscanf(generated_key, "%016llx-%016llx", &uu[0], &uu[1]) != 2)
    {
      opal_show_help("help-mtl-psm.txt",
		     "no uuid present", true,
		     generated_key ? "could not be parsed from" :
		     "not present in", ompi_process_info.nodename);
      return OMPI_ERROR;
      
    }

    /* Handle our own errors for opening endpoints */
    psm_error_register_handler(ompi_mtl_psm.ep, ompi_mtl_psm_errhandler);

    /* Setup MPI_LOCALRANKID and MPI_LOCALNRANKS so PSM can allocate hardware
     * contexts correctly.
     */
    snprintf(env_string, sizeof(env_string), "%d", local_rank);
    setenv("MPI_LOCALRANKID", env_string, 0);
    snprintf(env_string, sizeof(env_string), "%d", num_local_procs);
    setenv("MPI_LOCALNRANKS", env_string, 0);
    
    /* Setup the endpoint options. */
    bzero((void*) &ep_opt, sizeof(ep_opt));
    ep_opt.timeout = ompi_mtl_psm.connect_timeout * 1e9;
    ep_opt.unit = ompi_mtl_psm.ib_unit;
    ep_opt.affinity = PSM_EP_OPEN_AFFINITY_SKIP; /* do not let PSM set affinity */
    ep_opt.shm_mbytes = -1; /* Choose PSM defaults */
    ep_opt.sendbufs_num = -1; /* Choose PSM defaults */

#if PSM_VERNO >= 0x0101   
    ep_opt.network_pkey = ompi_mtl_psm.ib_pkey;
#endif
    
#if PSM_VERNO >= 0x0107
    ep_opt.port = ompi_mtl_psm.ib_port;
    ep_opt.outsl = ompi_mtl_psm.ib_service_level;
#endif

#if PSM_VERNO >= 0x010d
    ep_opt.service_id = ompi_mtl_psm.ib_service_id;
    ep_opt.path_res_type = ompi_mtl_psm.path_res_type;
#endif

    /* Open PSM endpoint */
    err = psm_ep_open(unique_job_key, &ep_opt, &ep, &epid);
    if (err) {
      opal_show_help("help-mtl-psm.txt",
		     "unable to open endpoint", true,
		     psm_error_get_string(err));
      return OMPI_ERROR;
    }

    /* Future errors are handled by the default error handler */
    psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_DEFAULT);
    
    err = psm_mq_init(ep, 
		      0xffff000000000000ULL, 
		      NULL,
		      0,
		      &mq);
    if (err) {
      opal_show_help("help-mtl-psm.txt",
		     "psm init", true,
		     psm_error_get_string(err));
      return OMPI_ERROR;
    }

    ompi_mtl_psm.ep   = ep;
    ompi_mtl_psm.epid = epid;
    ompi_mtl_psm.mq   = mq;

    if (OMPI_SUCCESS != 
	ompi_modex_send( &mca_mtl_psm_component.super.mtl_version, 
                             &ompi_mtl_psm.epid, 
			     sizeof(psm_epid_t))) {
	opal_output(0, "Open MPI couldn't send PSM epid to head node process"); 
	return OMPI_ERROR;
    }

    /* register the psm progress function */
    opal_progress_register(ompi_mtl_psm_progress);
        
    return OMPI_SUCCESS;
}
Esempio n. 5
0
int
ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl,
                      size_t nprocs,
                      struct ompi_proc_t** procs)
{
    int i,j; 
    int rc;
    psm_epid_t   *epids_in = NULL;
    psm_epid_t	 *epid;
    psm_epaddr_t *epaddrs_out = NULL;
    psm_error_t  *errs_out = NULL, err;
    size_t size;
    int proc_errors[PSM_ERROR_LAST] = { 0 };
    int timeout_in_secs;
    
    assert(mtl == &ompi_mtl_psm.super);
    rc = OMPI_ERR_OUT_OF_RESOURCE;

    errs_out = (psm_error_t *) malloc(nprocs * sizeof(psm_error_t));
    if (errs_out == NULL) {
	goto bail;
    }
    epids_in = (psm_epid_t *) malloc(nprocs * sizeof(psm_epid_t));
    if (epids_in == NULL) {
	goto bail;
    }
    epaddrs_out = (psm_epaddr_t *) malloc(nprocs * sizeof(psm_epaddr_t));
    if (epaddrs_out == NULL) {
	goto bail;
    }
    rc = OMPI_SUCCESS;

    /* Get the epids for all the processes from modex */
    for (i = 0; i < (int) nprocs; i++) {
	rc = ompi_modex_recv(&mca_mtl_psm_component.super.mtl_version, 
				     procs[i], (void**)&epid, &size);
	if (rc != OMPI_SUCCESS || size != sizeof(psm_epid_t)) {
	  return OMPI_ERROR;
	}
	epids_in[i] = *epid;
    }

    timeout_in_secs = max(ompi_mtl_psm.connect_timeout, 0.5 * nprocs);

    psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_NOP);

    err = psm_ep_connect(ompi_mtl_psm.ep,
			 nprocs,
			 epids_in,
			 NULL, /* connect all */
			 errs_out,
			 epaddrs_out,
			 timeout_in_secs * 1e9);
    if (err) {
	char *errstr = (char *) ompi_mtl_psm_connect_error_msg(err);
	if (errstr == NULL) {
	    opal_output(0, "PSM returned unhandled/unknown connect error: %s\n",
			psm_error_get_string(err));
	}
	for (i = 0; i < (int) nprocs; i++) {
	    psm_error_t thiserr = errs_out[i];
	    errstr = (char *) ompi_mtl_psm_connect_error_msg(thiserr);
	    if (proc_errors[thiserr] == 0) {
		proc_errors[thiserr] = 1;
		opal_output(0, "PSM EP connect error (%s):", 
			    errstr ? errstr : "unknown connect error");
		for (j = 0; j < (int) nprocs; j++) {
		  if (errs_out[j] == thiserr) {
                      opal_output(0, " %s", (NULL == procs[j]->proc_hostname) ?
                                  "unknown" : procs[j]->proc_hostname);
		  }
		}
		opal_output(0, "\n");
	    }
	}

	rc = OMPI_ERROR;
    }
    else {
	/* Default error handling is enabled, errors will not be returned to
	 * user.  PSM prints the error and the offending endpoint's hostname
	 * and exits with -1 */
	psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_DEFAULT);
		
	/* Fill in endpoint data */
	for (i = 0; i < (int) nprocs; i++) { 
            mca_mtl_psm_endpoint_t *endpoint = 
		(mca_mtl_psm_endpoint_t *) OBJ_NEW(mca_mtl_psm_endpoint_t);
	    endpoint->peer_epid = epids_in[i];
	    endpoint->peer_addr = epaddrs_out[i];
            procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint;
	}

	rc = OMPI_SUCCESS;
    }
    
bail:
    if (epids_in != NULL) {
	free(epids_in);
    }
    if (errs_out != NULL) {
	free(errs_out);
    }
    if (epaddrs_out != NULL) {
	free(epaddrs_out);
    }

    return rc;
}
Esempio n. 6
0
static mca_mtl_base_module_t *
ompi_mtl_psm_component_init(bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    psm_error_t	err;
    int	verno_major = PSM_VERNO_MAJOR;
    int verno_minor = PSM_VERNO_MINOR;
    int local_rank = -1, num_local_procs = 0;
    int num_total_procs = 0;

    /* Compute the total number of processes on this host and our local rank
     * on that node. We need to provide PSM with these values so it can
     * allocate hardware contexts appropriately across processes.
     */
    if (OMPI_SUCCESS != get_num_local_procs(&num_local_procs)) {
        opal_output(0, "Cannot determine number of local processes. "
                    "Cannot continue.\n");
        return NULL;
    }
    if (OMPI_SUCCESS != get_local_rank(&local_rank)) {
        opal_output(0, "Cannot determine local rank. Cannot continue.\n");
        return NULL;
    }
    if (OMPI_SUCCESS != get_num_total_procs(&num_total_procs)) {
        opal_output(0, "Cannot determine total number of processes. "
                    "Cannot continue.\n");
        return NULL;
    }


#if PSM_VERNO >= 0x010c
    /* Set infinipath debug level */
    err = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG,
		     (const void*) &ompi_mtl_psm.debug_level,
		     sizeof(unsigned));
    if (err) {
      /* Non fatal error. Can continue */
      opal_show_help("help-mtl-psm.txt",
		     "psm init", false,
		     psm_error_get_string(err));
    }
#endif

    if (getenv("PSM_DEVICES") == NULL) {
        /* Only allow for shm and ipath devices in 2.0 and earlier releases
         * (unless the user overrides the setting).
         */
        if (PSM_VERNO >= 0x0104) {
            if (num_local_procs == num_total_procs) {
                setenv("PSM_DEVICES", "self,shm", 0);
	    } else {
                setenv("PSM_DEVICES", "self,shm,ipath", 0);
	    }
        }
        else {
            if (num_local_procs == num_total_procs) {
                setenv("PSM_DEVICES", "shm", 0);
	    } else {
                setenv("PSM_DEVICES", "shm,ipath", 0);
	    }
        }
    }

    err = psm_init(&verno_major, &verno_minor);
    if (err) {
      opal_show_help("help-mtl-psm.txt",
		     "psm init", true,
		     psm_error_get_string(err));
      return NULL;
    }

    /* Complete PSM initialization */
    ompi_mtl_psm_module_init(local_rank, num_local_procs);

    ompi_mtl_psm.super.mtl_request_size =
      sizeof(mca_mtl_psm_request_t) -
      sizeof(struct mca_mtl_request_t);

    /* don't register the err handler until we know we will be active */
    err = psm_error_register_handler(NULL /* no ep */,
			             PSM_ERRHANDLER_NOP);
    if (err) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n",
		    psm_error_get_string(err));
	return NULL;
    }

    return &ompi_mtl_psm.super;
}
static mca_mtl_base_module_t*
ompi_mtl_psm_component_init(bool enable_progress_threads,
                           bool enable_mpi_threads)
{
    psm_error_t	err;
    int rc;
    int	verno_major = PSM_VERNO_MAJOR;
    int verno_minor = PSM_VERNO_MINOR;
    ompi_proc_t *my_proc, **procs;
    size_t num_total_procs, proc;
    int local_rank = -1, num_local_procs = 0;
    
    /* Compute the total number of processes on this host and our local rank
     * on that node. We need to provide PSM with these values so it can 
     * allocate hardware contexts appropriately across processes.
     */
    if ((rc = ompi_proc_refresh()) != OMPI_SUCCESS) {
      return NULL;
    }
    
    my_proc = ompi_proc_local();
    if (NULL == (procs = ompi_proc_world(&num_total_procs))) {
      return NULL;
    }
    
    for (proc = 0; proc < num_total_procs; proc++) {
      if (my_proc == procs[proc]) {
	local_rank = num_local_procs++;
	continue;
      }
      
      if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
	num_local_procs++;
      }
    }
    
    assert(local_rank >= 0 && num_local_procs > 0);
    free(procs);
    
    err = psm_error_register_handler(NULL /* no ep */,
			             PSM_ERRHANDLER_NOP);
    if (err) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n", 
		    psm_error_get_string(err));
	return NULL;
    }
    
#if PSM_VERNO >= 0x010c
    /* Set infinipath debug level */
    err = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG, 
		     (const void*) &ompi_mtl_psm.debug_level, 
		     sizeof(unsigned));
    if (err) {
      /* Non fatal error. Can continue */
      orte_show_help("help-mtl-psm.txt",
		     "psm init", false,
		     psm_error_get_string(err));
    }
#endif
    
    /* Only allow for shm and ipath devices in 2.0 and earlier releases 
     * (unless the user overrides the setting).
     */
    
    if (PSM_VERNO >= 0x0104) {
      setenv("PSM_DEVICES", "self,shm,ipath", 0);
    }
    else {
      setenv("PSM_DEVICES", "shm,ipath", 0);
    }
    
    err = psm_init(&verno_major, &verno_minor);
    if (err) {
      orte_show_help("help-mtl-psm.txt",
		     "psm init", true,
		     psm_error_get_string(err));
      return NULL;
    }
    
    /* Complete PSM initialization */
    ompi_mtl_psm_module_init(local_rank, num_local_procs);

    ompi_mtl_psm.super.mtl_request_size = 
      sizeof(mca_mtl_psm_request_t) - 
      sizeof(struct mca_mtl_request_t);
    
    return &ompi_mtl_psm.super;
}