Esempio n. 1
0
/*
 * Component init hook for the PSM2 MTL.
 *
 * Gathers the process layout (processes on this node, our rank among them,
 * and the total process count), installs the no-op PSM error handler,
 * initializes the PSM library and finally hands the local layout to
 * ompi_mtl_psm2_module_init().
 *
 * Neither enable_progress_threads nor enable_mpi_threads is consulted here.
 *
 * Returns a pointer to ompi_mtl_psm2.super on success, or NULL if any of
 * the layout queries, the error handler registration or psm_init() fails.
 */
static mca_mtl_base_module_t *
ompi_mtl_psm2_component_init(bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    int psm_major = PSM_VERNO_MAJOR;
    int psm_minor = PSM_VERNO_MINOR;
    int node_rank = -1;
    int node_procs = 0;
    int job_procs = 0;
    psm_error_t status;

    /* PSM needs the number of processes on this host and our rank among
     * them so that it can spread hardware contexts across processes.
     */
    if (get_num_local_procs(&node_procs) != OMPI_SUCCESS) {
        opal_output(0, "Cannot determine number of local processes. "
                    "Cannot continue.\n");
        return NULL;
    }

    if (get_local_rank(&node_rank) != OMPI_SUCCESS) {
        opal_output(0, "Cannot determine local rank. Cannot continue.\n");
        return NULL;
    }

    if (get_num_total_procs(&job_procs) != OMPI_SUCCESS) {
        opal_output(0, "Cannot determine total number of processes. "
                    "Cannot continue.\n");
        return NULL;
    }

    /* Global (no endpoint) error handler: PSM_ERRHANDLER_NOP. */
    status = psm_error_register_handler(NULL, PSM_ERRHANDLER_NOP);
    if (0 != status) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n",
                    psm_error_get_string(status));
        return NULL;
    }

    /* When every process of the job is on this node, default PSM_DEVICES
     * to "self,shm". The overwrite flag is 0, so a value already present
     * in the environment is left alone.
     */
    if (node_procs == job_procs) {
        setenv("PSM_DEVICES", "self,shm", 0);
    }

    status = psm_init(&psm_major, &psm_minor);
    if (0 != status) {
        opal_show_help("help-mtl-psm.txt",
                       "psm init", true,
                       psm_error_get_string(status));
        return NULL;
    }

    /* Complete PSM initialization; any result of this call is not
     * inspected here.
     */
    ompi_mtl_psm2_module_init(node_rank, node_procs);

    /* Request size is the part of the PSM2 request that extends the
     * generic MTL request structure.
     */
    ompi_mtl_psm2.super.mtl_request_size =
        sizeof(mca_mtl_psm2_request_t) - sizeof(struct mca_mtl_request_t);

    return &ompi_mtl_psm2.super;
}
Esempio n. 2
0
/*
 * One-time initialization of the PSM library for this provider.
 *
 * psmx_lib_initialized is tested once without the lock as a fast path and
 * again after psmx_lib_mutex has been taken, so psm_init() is only
 * attempted while the flag is still clear under the lock.
 *
 * Side effects on success: sets psmx_am_compat_mode when the library's
 * major version differs from the header's, sets psmx_compat_lib when the
 * library's major version is greater than 1, and sets
 * psmx_lib_initialized.
 *
 * Returns 0 on success (or if already initialized); otherwise the error
 * value returned by psm_init().
 */
static int psmx_init_lib(void)
{
	int lib_major = PSM_VERNO_MAJOR;
	int lib_minor = PSM_VERNO_MINOR;
	int status = 0;
	int psm_rc;

	/* Fast path: nothing to do once initialization has completed. */
	if (psmx_lib_initialized)
		return 0;

	pthread_mutex_lock(&psmx_lib_mutex);

	/* Re-check now that we hold the lock. */
	if (!psmx_lib_initialized) {
		/* The result of the handler registration is not checked. */
		psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER);

		/* psm_init() receives the header version and may rewrite
		 * both values; they are logged below as the library version. */
		psm_rc = psm_init(&lib_major, &lib_minor);
		if (psm_rc != PSM_OK) {
			FI_WARN(&psmx_prov, FI_LOG_CORE,
				"psm_init failed: %s\n", psm_error_get_string(psm_rc));
			status = psm_rc;
		} else {
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"PSM header version = (%d, %d)\n", PSM_VERNO_MAJOR, PSM_VERNO_MINOR);
			FI_INFO(&psmx_prov, FI_LOG_CORE,
				"PSM library version = (%d, %d)\n", lib_major, lib_minor);

			/* Header and library disagree on the major version. */
			if (lib_major != PSM_VERNO_MAJOR) {
				psmx_am_compat_mode = 1;
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"PSM AM compat mode enabled: appliation %d.%d, library %d.%d.\n",
					PSM_VERNO_MAJOR, PSM_VERNO_MINOR, lib_major, lib_minor);
			}

			if (lib_major > 1) {
				psmx_compat_lib = 1;
				FI_INFO(&psmx_prov, FI_LOG_CORE,
					"PSM is supported via the psm2-compat library over PSM2.\n");
			}

			psmx_lib_initialized = 1;
		}
	}

	pthread_mutex_unlock(&psmx_lib_mutex);
	return status;
}
Esempio n. 3
0
/*
 * Initialize the PSM library and install error_handler as the global
 * PSM error handler.
 *
 * The previous version tested `err` before anything had been assigned to
 * it (the call that used to set it was commented out), which is undefined
 * behavior and could make the function fail spuriously. That dead check
 * has been removed; `err` is now always assigned before it is read.
 *
 * Returns 0 on success, -1 if psm_init() fails, if the loaded library
 * reports a major version newer than PSM_VERNO_MAJOR, or if the global
 * error handler cannot be registered. A diagnostic is written to stderr
 * in every failure case.
 */
int initialize(void)
{
  int verno_major = PSM_VERNO_MAJOR;
  int verno_minor = PSM_VERNO_MINOR;
  psm_ep_errhandler_t handler;
  int err;

  /* psm_init() takes the header version in and may rewrite both values;
   * they are compared and printed below as the loaded version. */
  err = psm_init(&verno_major, &verno_minor);
  if (err || verno_major > PSM_VERNO_MAJOR) {
    if (err) {
      fprintf(stderr, "PSM initialization failure: %s\n",
              psm_error_get_string(err));
    } else {
      fprintf(stderr, "PSM loaded an unexpected/unsupported "
              "version (%d.%d)\n", verno_major, verno_minor);
    }
    return -1;
  }

  // We were able to initialize PSM but will defer all further error
  // handling since most of the errors beyond this point will be fatal.
  handler = error_handler;
  err = psm_error_register_handler(NULL, // Global handler
                                   handler);
  if (err) {
    fprintf(stderr, "Couldn't register global errhandler: %s\n",
            psm_error_get_string(err));
    return -1;
  }

  return 0;
}
Esempio n. 4
0
/*
 * Lazily initialize the PSM layer.
 *
 * The outcome is cached in a function-local static: the setup sequence
 * only runs while the state is still PSPSM_INIT_START; every later call
 * just returns the stored state (PSPSM_INIT_DONE or PSPSM_INIT_FAILED).
 *
 * Setup sequence: check for the ipath device, call psm_init(), prepare
 * the UUID, open the endpoint and initialize the matched queue. Any
 * failing step marks the state as PSPSM_INIT_FAILED.
 *
 * Returns the resulting init state (0 = success, -1 = error).
 */
static
int pspsm_init(void)
{
	static pspsm_init_state_t init_state = PSPSM_INIT_START;
	int verno_major = PSM_VERNO_MAJOR;
	int verno_minor = PSM_VERNO_MINOR;
	psm_error_t ret;

	/* Already attempted: report the cached result. */
	if (init_state != PSPSM_INIT_START) {
		return init_state;
	}

	/* Check for an available /dev/ipath */
	ret = pspsm_check_dev_ipath();
	if (ret != 0) {
		pspsm_dprint(2, "pspsm_init: No \"/dev/ipath\" found. Arch psm is disabled.");
		goto failed;
	}

	ret = psm_init(&verno_major, &verno_minor);
	if (ret != PSM_OK) {
		pspsm_err(psm_error_get_string(ret));
		pspsm_dprint(1, "pspsm_init: %s", pspsm_err_str);
		goto failed;
	}

	/*
	 * Every process that wants to communicate has to use the same
	 * UUID. Whether sharing one UUID between groups of processes
	 * that never talk to each other has drawbacks is unclear.
	 *
	 * The UUID starts as a constant fill pattern and is then seeded
	 * from pscom.env.psm_uniq_id when that value is non-zero.
	 */
	memset(pspsm_uuid.as_uuid, DEFAULT_UUID_PATTERN,
	       sizeof(pspsm_uuid.as_uuid));

	if (pscom.env.psm_uniq_id) {
		pspsm_dprint(2, "seeding PSM UUID with %u", pscom.env.psm_uniq_id);
		pspsm_uuid.as_uint = pscom.env.psm_uniq_id;
	}

	/*
	 * The endpoint is opened right here in init, hoping that every
	 * MPI rank thereby opens its endpoint before any data is sent to
	 * or from it; this is meant to avoid a race condition in
	 * libpsm_infinipath. The price is that PSM contexts are consumed
	 * even for purely local communication (PSP_PSM=0 can be used in
	 * that case).
	 *
	 * The matched queue is only initialized if the endpoint opened.
	 */
	if (pspsm_open_endpoint() || pspsm_init_mq()) {
		goto failed;
	}

	pspsm_dprint(2, "pspsm_init: OK");
	init_state = PSPSM_INIT_DONE;
	return init_state; /* 0 = success, -1 = error */

failed:
	init_state = PSPSM_INIT_FAILED;
	return init_state; /* 0 = success, -1 = error */
}
Esempio n. 5
0
/*
 * Component init hook for the PSM MTL.
 *
 * Gathers the process layout, optionally pushes the debug level into PSM,
 * picks a default PSM_DEVICES value when the user has not set one,
 * initializes PSM, completes module setup and, last of all, registers the
 * no-op PSM error handler.
 *
 * Neither enable_progress_threads nor enable_mpi_threads is consulted here.
 *
 * Returns a pointer to ompi_mtl_psm.super on success, or NULL if a layout
 * query, psm_init() or the error handler registration fails.
 */
static mca_mtl_base_module_t *
ompi_mtl_psm_component_init(bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    int psm_major = PSM_VERNO_MAJOR;
    int psm_minor = PSM_VERNO_MINOR;
    int node_rank = -1;
    int node_procs = 0;
    int job_procs = 0;
    psm_error_t status;

    /* PSM needs the number of processes on this host and our rank among
     * them so that it can spread hardware contexts across processes.
     */
    if (get_num_local_procs(&node_procs) != OMPI_SUCCESS) {
        opal_output(0, "Cannot determine number of local processes. "
                    "Cannot continue.\n");
        return NULL;
    }

    if (get_local_rank(&node_rank) != OMPI_SUCCESS) {
        opal_output(0, "Cannot determine local rank. Cannot continue.\n");
        return NULL;
    }

    if (get_num_total_procs(&job_procs) != OMPI_SUCCESS) {
        opal_output(0, "Cannot determine total number of processes. "
                    "Cannot continue.\n");
        return NULL;
    }

#if PSM_VERNO >= 0x010c
    /* Set infinipath debug level */
    status = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG,
                        (const void*) &ompi_mtl_psm.debug_level,
                        sizeof(unsigned));
    if (0 != status) {
        /* Not fatal: report it and carry on. */
        opal_show_help("help-mtl-psm.txt",
                       "psm init", false,
                       psm_error_get_string(status));
    }
#endif

    /* Only pick a device list if the user has not provided one. The
     * "self" device is included for PSM_VERNO >= 0x0104, and "ipath" is
     * only added when some process of the job lives on another node.
     */
    if (NULL == getenv("PSM_DEVICES")) {
        const bool single_node = (node_procs == job_procs);
        const char *devices;

        if (PSM_VERNO >= 0x0104) {
            devices = single_node ? "self,shm" : "self,shm,ipath";
        } else {
            devices = single_node ? "shm" : "shm,ipath";
        }
        setenv("PSM_DEVICES", devices, 0);
    }

    status = psm_init(&psm_major, &psm_minor);
    if (0 != status) {
        opal_show_help("help-mtl-psm.txt",
                       "psm init", true,
                       psm_error_get_string(status));
        return NULL;
    }

    /* Complete PSM initialization; any result of this call is not
     * inspected here.
     */
    ompi_mtl_psm_module_init(node_rank, node_procs);

    /* Request size is the part of the PSM request that extends the
     * generic MTL request structure.
     */
    ompi_mtl_psm.super.mtl_request_size =
        sizeof(mca_mtl_psm_request_t) - sizeof(struct mca_mtl_request_t);

    /* The error handler is registered last, once we know we will be
     * active.
     */
    status = psm_error_register_handler(NULL, PSM_ERRHANDLER_NOP);
    if (0 != status) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n",
                    psm_error_get_string(status));
        return NULL;
    }

    return &ompi_mtl_psm.super;
}
/*
 * Component init hook for the PSM MTL (variant that derives the process
 * layout from the ompi_proc list).
 *
 * Walks the array returned by ompi_proc_world() to count the processes on
 * this node and to find our own position among them, registers the no-op
 * PSM error handler, optionally pushes the debug level into PSM, sets a
 * default PSM_DEVICES value, initializes PSM and completes module setup.
 *
 * Neither enable_progress_threads nor enable_mpi_threads is consulted here.
 *
 * Returns a pointer to ompi_mtl_psm.super on success, or NULL if the proc
 * refresh, the proc list lookup, the error handler registration or
 * psm_init() fails.
 */
static mca_mtl_base_module_t*
ompi_mtl_psm_component_init(bool enable_progress_threads,
                           bool enable_mpi_threads)
{
    int psm_major = PSM_VERNO_MAJOR;
    int psm_minor = PSM_VERNO_MINOR;
    int node_rank = -1;
    int node_procs = 0;
    int rc;
    size_t world_size, idx;
    ompi_proc_t *self, **world;
    psm_error_t status;

    /* PSM needs the number of processes on this host and our rank among
     * them so that it can spread hardware contexts across processes.
     */
    rc = ompi_proc_refresh();
    if (OMPI_SUCCESS != rc) {
        return NULL;
    }

    self = ompi_proc_local();
    world = ompi_proc_world(&world_size);
    if (NULL == world) {
        return NULL;
    }

    for (idx = 0; idx < world_size; idx++) {
        if (world[idx] == self) {
            /* Our local rank is the number of local processes counted
             * before reaching our own entry.
             */
            node_rank = node_procs++;
        } else if (OPAL_PROC_ON_LOCAL_NODE(world[idx]->proc_flags)) {
            node_procs++;
        }
    }

    assert(node_rank >= 0 && node_procs > 0);
    free(world);

    /* Global (no endpoint) error handler: PSM_ERRHANDLER_NOP. */
    status = psm_error_register_handler(NULL, PSM_ERRHANDLER_NOP);
    if (0 != status) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n",
                    psm_error_get_string(status));
        return NULL;
    }

#if PSM_VERNO >= 0x010c
    /* Set infinipath debug level */
    status = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG,
                        (const void*) &ompi_mtl_psm.debug_level,
                        sizeof(unsigned));
    if (0 != status) {
        /* Not fatal: report it and carry on. */
        orte_show_help("help-mtl-psm.txt",
                       "psm init", false,
                       psm_error_get_string(status));
    }
#endif

    /* Only allow for shm and ipath devices in 2.0 and earlier releases
     * (unless the user overrides the setting). The overwrite flag is 0,
     * so a value already present in the environment is left alone.
     */
    if (PSM_VERNO >= 0x0104) {
        setenv("PSM_DEVICES", "self,shm,ipath", 0);
    } else {
        setenv("PSM_DEVICES", "shm,ipath", 0);
    }

    status = psm_init(&psm_major, &psm_minor);
    if (0 != status) {
        orte_show_help("help-mtl-psm.txt",
                       "psm init", true,
                       psm_error_get_string(status));
        return NULL;
    }

    /* Complete PSM initialization; any result of this call is not
     * inspected here.
     */
    ompi_mtl_psm_module_init(node_rank, node_procs);

    /* Request size is the part of the PSM request that extends the
     * generic MTL request structure.
     */
    ompi_mtl_psm.super.mtl_request_size =
        sizeof(mca_mtl_psm_request_t) - sizeof(struct mca_mtl_request_t);

    return &ompi_mtl_psm.super;
}