Example #1
0
int orte_pls_rsh_launch(orte_jobid_t jobid)
{
    orte_job_map_t *map;
    opal_list_item_t *n_item;
    orte_mapped_node_t *rmaps_node;
    orte_std_cntr_t num_nodes;
    orte_vpid_t vpid;
    int node_name_index1;
    int node_name_index2;
    int proc_name_index;
    int local_exec_index, local_exec_index_end;
    char *jobid_string = NULL;
    char *uri, *param;
    char **argv = NULL, **tmp;
    char *prefix_dir;
    int argc;
    int rc;
    sigset_t sigs;
    struct passwd *p;
    bool remote_sh = false, remote_csh = false; 
    bool local_sh = false, local_csh = false;
    char *lib_base = NULL, *bin_base = NULL;
    orte_pls_daemon_info_t *dmn;
    orte_pls_rsh_shell_t shell;

    if (mca_pls_rsh_component.timing) {
        if (0 != gettimeofday(&joblaunchstart, NULL)) {
            opal_output(0, "pls_rsh: could not obtain start time");
            joblaunchstart.tv_sec = 0;
            joblaunchstart.tv_usec = 0;
        }        
    }
    
    /* setup a list that will contain the info for all the daemons
     * so we can store it on the registry when done and use it
     * locally to track their state
     */
    OBJ_CONSTRUCT(&active_daemons, opal_list_t);

    /* Get the map for this job
     * We need the entire mapping for a couple of reasons:
     *  - need the prefix to start with.
     *  - need to know the nodes we are launching on
     * All other mapping responsibilities fall to orted in the fork PLS
     */
    rc = orte_rmaps.get_job_map(&map, jobid);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&active_daemons);
        return rc;
    }

    /* if the user requested that we re-use daemons,
     * launch the procs on any existing, re-usable daemons
     */
    if (orte_pls_base.reuse_daemons) {
        if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(map);
            OBJ_DESTRUCT(&active_daemons);
            return rc;
        }
    }
    
    num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
    if (0 == num_nodes) {
        /* nothing left to do - just return */
        OBJ_RELEASE(map);
        OBJ_DESTRUCT(&active_daemons);
        return ORTE_SUCCESS;
    }

    if (mca_pls_rsh_component.debug_daemons &&
        mca_pls_rsh_component.num_concurrent < num_nodes) {
        /* we can't run in this situation, so pretty print the error
         * and exit
         */
        opal_show_help("help-pls-rsh.txt", "deadlock-params",
                       true, mca_pls_rsh_component.num_concurrent, num_nodes);
        OBJ_RELEASE(map);
        OBJ_DESTRUCT(&active_daemons);
        return ORTE_ERR_FATAL;
    }

    /*
     * After a discussion between Ralph & Jeff, we concluded that we
     * really are handling the prefix dir option incorrectly. It currently
     * is associated with an app_context, yet it really refers to the
     * location where OpenRTE/Open MPI is installed on a NODE. Fixing
     * this right now would involve significant change to orterun as well
     * as elsewhere, so we will intentionally leave this incorrect at this
     * point. The error, however, is identical to that seen in all prior
     * releases of OpenRTE/Open MPI, so our behavior is no worse than before.
     *
     * A note to fix this, along with ideas on how to do so, has been filed
     * on the project's Trac system under "feature enhancement".
     *
     * For now, default to the prefix_dir provided in the first app_context.
     * Since there always MUST be at least one app_context, we are safe in
     * doing this.
     */
    prefix_dir = map->apps[0]->prefix_dir;
    
    /*
     * Allocate a range of vpids for the daemons.
     */
    if (num_nodes == 0) {
        return ORTE_ERR_BAD_PARAM;
    }
    rc = orte_ns.reserve_range(0, num_nodes, &vpid);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    /* setup the orted triggers for passing their launch info */
    if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* need integer value for command line parameter */
    if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* What is our local shell? */
    shell = ORTE_PLS_RSH_SHELL_UNKNOWN;
    p = getpwuid(getuid());
    if (NULL != p) {
        param = p->pw_shell;
        shell = find_shell(p->pw_shell);
    }
    /* If we didn't find it in getpwuid(), try looking at the $SHELL
       environment variable (see
       https://svn.open-mpi.org/trac/ompi/ticket/1060) */
    if (ORTE_PLS_RSH_SHELL_UNKNOWN == shell && 
        NULL != (param = getenv("SHELL"))) {
        shell = find_shell(param);
    }

    switch (shell) {
    case ORTE_PLS_RSH_SHELL_SH:  /* fall through */
    case ORTE_PLS_RSH_SHELL_KSH: /* fall through */
    case ORTE_PLS_RSH_SHELL_ZSH: /* fall through */
    case ORTE_PLS_RSH_SHELL_BASH: local_sh = true; break;
    case ORTE_PLS_RSH_SHELL_TCSH: /* fall through */
    case ORTE_PLS_RSH_SHELL_CSH:  local_csh = true; break;
    default:
        opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n",
                    (NULL != param) ? param : "unknown");
        remote_sh = true;
        break;
    }

    if (mca_pls_rsh_component.debug) {
        opal_output(0, "pls:rsh: local csh: %d, local sh: %d\n",
                    local_csh, local_sh);
    }

    /* What is our remote shell? */
    if (mca_pls_rsh_component.assume_same_shell) {
        remote_sh = local_sh;
        remote_csh = local_csh;
        if (mca_pls_rsh_component.debug) {
            opal_output(0, "pls:rsh: assuming same remote shell as local shell");
        }
    } else {
        orte_pls_rsh_shell_t shell;
        rmaps_node = (orte_mapped_node_t*)opal_list_get_first(&map->nodes);
        rc = orte_pls_rsh_probe(rmaps_node, &shell);

        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        switch (shell) {
        case ORTE_PLS_RSH_SHELL_SH:  /* fall through */
        case ORTE_PLS_RSH_SHELL_KSH: /* fall through */
        case ORTE_PLS_RSH_SHELL_ZSH: /* fall through */
        case ORTE_PLS_RSH_SHELL_BASH: remote_sh = true; break;
        case ORTE_PLS_RSH_SHELL_TCSH: /* fall through */
        case ORTE_PLS_RSH_SHELL_CSH:  remote_csh = true; break;
        default:
            opal_output(0, "WARNING: rsh probe returned unhandled shell; assuming bash\n");
            remote_sh = true;
        }
    }
    if (mca_pls_rsh_component.debug) {
        opal_output(0, "pls:rsh: remote csh: %d, remote sh: %d\n",
                    remote_csh, remote_sh);
    }

    /*
     * Build argv array
     */
    argv = opal_argv_copy(mca_pls_rsh_component.agent_argv);
    argc = mca_pls_rsh_component.agent_argc;
    node_name_index1 = argc;
    opal_argv_append(&argc, &argv, "<template>");

    /* Do we need to source .profile on the remote side? */

    if (!(remote_csh || remote_sh)) {
        int i;
        tmp = opal_argv_split("( test ! -r ./.profile || . ./.profile;", ' ');
        if (NULL == tmp) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
        for (i = 0; NULL != tmp[i]; ++i) {
            opal_argv_append(&argc, &argv, tmp[i]);
        }
        opal_argv_free(tmp);
    }

    /* add the daemon command (as specified by user) */
    local_exec_index = argc;
    opal_argv_append(&argc, &argv, mca_pls_rsh_component.orted);

    /* check for debug flags */
    orte_pls_base_mca_argv(&argc, &argv);

    opal_argv_append(&argc, &argv, "--bootproxy");
    opal_argv_append(&argc, &argv, jobid_string);
    opal_argv_append(&argc, &argv, "--name");
    proc_name_index = argc;
    opal_argv_append(&argc, &argv, "<template>");

    /* tell the daemon how many procs are in the daemon's job */
    opal_argv_append(&argc, &argv, "--num_procs");
    asprintf(&param, "%lu", (unsigned long)(vpid + num_nodes));
    opal_argv_append(&argc, &argv, param);
    free(param);
    /* tell the daemon the starting vpid of the daemon's job */
    opal_argv_append(&argc, &argv, "--vpid_start");
    opal_argv_append(&argc, &argv, "0");

    opal_argv_append(&argc, &argv, "--nodename");
    node_name_index2 = argc;
    opal_argv_append(&argc, &argv, "<template>");

    /* pass along the universe name and location info */
    opal_argv_append(&argc, &argv, "--universe");
    asprintf(&param, "%s@%s:%s", orte_universe_info.uid,
                orte_universe_info.host, orte_universe_info.name);
    opal_argv_append(&argc, &argv, param);
    free(param);

    /* setup ns contact info */
    opal_argv_append(&argc, &argv, "--nsreplica");
    if (NULL != orte_process_info.ns_replica_uri) {
        uri = strdup(orte_process_info.ns_replica_uri);
    } else {
        uri = orte_rml.get_uri();
    }
    asprintf(&param, "\"%s\"", uri);
    opal_argv_append(&argc, &argv, param);
    free(uri);
    free(param);

    /* setup gpr contact info */
    opal_argv_append(&argc, &argv, "--gprreplica");
    if (NULL != orte_process_info.gpr_replica_uri) {
        uri = strdup(orte_process_info.gpr_replica_uri);
    } else {
        uri = orte_rml.get_uri();
    }
    asprintf(&param, "\"%s\"", uri);
    opal_argv_append(&argc, &argv, param);
    free(uri);
    free(param);

    local_exec_index_end = argc;
    if (!(remote_csh || remote_sh)) {
        opal_argv_append(&argc, &argv, ")");
    }
    if (mca_pls_rsh_component.debug) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "pls:rsh: final template argv:");
            opal_output(0, "pls:rsh:     %s", param);
            free(param);
        }
    }
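
    /* Illustrative shape of the template argv assembled above (values vary per run):
     *   <agent ...> <template-hostname>
     *     [ ( test ! -r ./.profile || . ./.profile;   -- only if the remote shell is unknown ]
     *     orted <mca debug args> --bootproxy <jobid> --name <template>
     *     --num_procs <vpid+num_nodes> --vpid_start 0 --nodename <template>
     *     --universe uid@host:name --nsreplica "<uri>" --gprreplica "<uri>"
     *     [ ) ]
     * The <template> slots are filled in per node inside the launch loop below.
     */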

    /* Figure out the basenames for the libdir and bindir.  This
       requires some explanation:

       - Use opal_install_dirs.libdir and opal_install_dirs.bindir instead of -D'ing some macros
         in this directory's Makefile.am because it makes all the
         dependencies work out correctly.  These are defined in
         opal/install_dirs.h.

       - After a discussion on the devel-core mailing list, the
         developers decided that we should use the local directory
         basenames as the basis for the prefix on the remote node.
         This does not handle a few notable cases (e.g., if the
         libdir/bindir is not simply a subdir under the prefix, if the
         libdir/bindir basename is not the same on the remote node as
         it is on the local node, etc.), but we decided that
         --prefix was meant to handle "the common case".  If you need
         something more complex than this, a) edit your shell startup
         files to set PATH/LD_LIBRARY_PATH properly on the remote
         node, or b) use some new/to-be-defined options that
         explicitly allow setting the bindir/libdir on the remote
         node.  We decided to implement these options (e.g.,
         --remote-bindir and --remote-libdir) to orterun when it
         actually becomes a problem for someone (vs. a hypothetical
         situation).

       Hence, for now, we simply take the basename of this install's
       libdir and bindir and use it to append this install's prefix
       and use that on the remote node.
    */

    lib_base = opal_basename(opal_install_dirs.libdir);
    bin_base = opal_basename(opal_install_dirs.bindir);

    /*
     * Iterate through each of the nodes
     */
    if (mca_pls_rsh_component.timing) {
        /* allocate space to track the start times */
        launchstart = (struct timeval*)malloc((num_nodes+vpid) * sizeof(struct timeval));
    }
    
    for(n_item =  opal_list_get_first(&map->nodes);
        n_item != opal_list_get_end(&map->nodes);
        n_item =  opal_list_get_next(n_item)) {
        orte_process_name_t* name;
        pid_t pid;
        char *exec_path;
        char **exec_argv;
        
        rmaps_node = (orte_mapped_node_t*)n_item;
        
        if (mca_pls_rsh_component.timing) {
            if (0 != gettimeofday(&launchstart[vpid], NULL)) {
                opal_output(0, "pls_rsh: could not obtain start time");
            }
        }
        
        /* new daemon - setup to record its info */
        dmn = OBJ_NEW(orte_pls_daemon_info_t);
        dmn->active_job = jobid;
        opal_list_append(&active_daemons, &dmn->super);
        
        /* setup node name */
        free(argv[node_name_index1]);
        if (NULL != rmaps_node->username &&
            0 != strlen (rmaps_node->username)) {
            asprintf (&argv[node_name_index1], "%s@%s",
                      rmaps_node->username, rmaps_node->nodename);
        } else {
            argv[node_name_index1] = strdup(rmaps_node->nodename);
        }

        free(argv[node_name_index2]);
        argv[node_name_index2] = strdup(rmaps_node->nodename);
        
        /* save it in the daemon info */
        dmn->nodename = strdup(rmaps_node->nodename);

        /* initialize daemons process name */
        rc = orte_ns.create_process_name(&name, rmaps_node->cell, 0, vpid);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* save it in the daemon info */
        dmn->cell = rmaps_node->cell;
        if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* fork a child to exec the rsh/ssh session */
        
        /* set the process state to "launched" */
        if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(name, ORTE_PROC_STATE_LAUNCHED, 0))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        pid = fork();
        if (pid < 0) {
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }

        /* child */
        if (pid == 0) {
            char* name_string;
            char** env;
            char* var;
            long fd, fdmax = sysconf(_SC_OPEN_MAX);

            if (mca_pls_rsh_component.debug) {
                opal_output(0, "pls:rsh: launching on node %s\n",
                            rmaps_node->nodename);
            }

            /* We don't need to sense an oversubscribed condition and set the sched_yield
             * for the node as we are only launching the daemons at this time. The daemons
             * are now smart enough to set the oversubscribed condition themselves when
             * they launch the local procs.
             */

            /* Is this a local launch?
             *
             * Not all node names may be resolvable (if we found
             * localhost in the hostfile, for example).  So first
             * check trivial case of node_name being same as the
             * current nodename, which must be local.  If that doesn't
             * match, check using opal_ifislocal().
             */
            if (!mca_pls_rsh_component.force_rsh &&
                (0 == strcmp(rmaps_node->nodename, orte_system_info.nodename) ||
                opal_ifislocal(rmaps_node->nodename))) {
                if (mca_pls_rsh_component.debug) {
                    opal_output(0, "pls:rsh: %s is a LOCAL node\n",
                                rmaps_node->nodename);
                }
                if (mca_pls_rsh_component.timing) {
                    /* since this is a local launch, the daemon will never reach
                     * the waitpid callback - so set the start value to
                     * something nonsensical
                     */
                    launchstart[vpid].tv_sec = 0;
                    launchstart[vpid].tv_usec = 0;
                }
                
                exec_path = opal_path_findv(argv[local_exec_index], 0, environ, NULL);

                if (NULL == exec_path && NULL == prefix_dir) {
                    rc = orte_pls_rsh_fill_exec_path (&exec_path);
                    if (ORTE_SUCCESS != rc) {
                        exit(-1);  /* the forked process MUST exit */
                    }
                } else {
                    if (NULL != prefix_dir) {
                        exec_path = opal_os_path( false, prefix_dir, bin_base, "orted", NULL );
                    }
                    /* If we still have not filled in the exec path, do so now */
                    if (NULL == exec_path) {
                        rc = orte_pls_rsh_fill_exec_path (&exec_path);
                        if (ORTE_SUCCESS != rc) {
                            exit(-1);  /* the forked process MUST exit */
                        }
                    }
                }

                /* If we have a prefix, then modify the PATH and
                   LD_LIBRARY_PATH environment variables.  We're
                   already in the child process, so it's ok to modify
                   environ. */
                if (NULL != prefix_dir) {
                    char *oldenv, *newenv;

                    /* Reset PATH */
                    newenv = opal_os_path( false, prefix_dir, bin_base, NULL );
                    oldenv = getenv("PATH");
                    if (NULL != oldenv) {
                        char *temp;
                        asprintf(&temp, "%s:%s", newenv, oldenv );
                        free( newenv );
                        newenv = temp;
                    }
                    opal_setenv("PATH", newenv, true, &environ);
                    if (mca_pls_rsh_component.debug) {
                        opal_output(0, "pls:rsh: reset PATH: %s", newenv);
                    }
                    free(newenv);

                    /* Reset LD_LIBRARY_PATH */
                    newenv = opal_os_path( false, prefix_dir, lib_base, NULL );
                    oldenv = getenv("LD_LIBRARY_PATH");
                    if (NULL != oldenv) {
                        char* temp;
                        asprintf(&temp, "%s:%s", newenv, oldenv);
                        free(newenv);
                        newenv = temp;
                    }
                    opal_setenv("LD_LIBRARY_PATH", newenv, true, &environ);
                    if (mca_pls_rsh_component.debug) {
                        opal_output(0, "pls:rsh: reset LD_LIBRARY_PATH: %s",
                                    newenv);
                    }
                    free(newenv);
                }

                /* Since this is a local execution, we need to
                   potentially whack the final ")" in the argv (if
                   sh/csh conditionals, from above).  Note that we're
                   modifying the argv[] in the child process, so
                   there's no need to save this and restore it
                   afterward -- the parent's argv[] is unmodified. */
                if (NULL != argv[local_exec_index_end]) {
                    free(argv[local_exec_index_end]);
                    argv[local_exec_index_end] = NULL;
                }

                /* tell the daemon to setup its own process session/group */
                opal_argv_append(&argc, &argv, "--set-sid");
                exec_argv = &argv[local_exec_index];
                
                /* Finally, chdir($HOME) because we're making the
                   assumption that this is what will happen on
                   remote nodes (via rsh/ssh).  This allows a user
                   to specify a path that is relative to $HOME for
                   both the cwd and argv[0] and it will work on
                   all nodes -- including the local node.
                   Otherwise, it would work on remote nodes and
                   not the local node.  If the user does not start
                   in $HOME on the remote nodes... well... let's
                   hope they start in $HOME.  :-) */
                var = getenv("HOME");
                if (NULL != var) {
                    if (mca_pls_rsh_component.debug) {
                        opal_output(0, "pls:rsh: changing to directory %s", var);
                    }
                    /* Ignore errors -- what are we going to do?
                       (and we ignore errors on the remote nodes
                       in the fork pls, so this is consistent) */
                    chdir(var);
                }
            } else {
                if (mca_pls_rsh_component.debug) {
                    opal_output(0, "pls:rsh: %s is a REMOTE node\n",
                                rmaps_node->nodename);
                }
                exec_argv = argv;
                exec_path = strdup(mca_pls_rsh_component.agent_path);

                if (NULL != prefix_dir) {
                    char *opal_prefix = getenv("OPAL_PREFIX");
                    if (remote_sh) {
                        asprintf (&argv[local_exec_index],
                                  "%s%s%s PATH=%s/%s:$PATH ; export PATH ; "
                                  "LD_LIBRARY_PATH=%s/%s:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; "
                                  "%s/%s/%s",
                                  (opal_prefix != NULL ? "OPAL_PREFIX=" : ""),
                                  (opal_prefix != NULL ? opal_prefix : ""),
                                  (opal_prefix != NULL ? " ;" : ""),
                                  prefix_dir, bin_base,
                                  prefix_dir, lib_base,
                                  prefix_dir, bin_base,
                                  mca_pls_rsh_component.orted);
                    }
                    if (remote_csh) {
                        /* [t]csh is a bit more challenging -- we
                           have to check whether LD_LIBRARY_PATH
                           is already set before we try to set it.
                           Must be very careful about obeying
                           [t]csh's order of evaluation and not
                           using a variable before it is defined.
                           See this thread for more details:
                           http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */
                        asprintf (&argv[local_exec_index],
                                  "%s%s%s set path = ( %s/%s $path ) ; "
                                  "if ( $?LD_LIBRARY_PATH == 1 ) "
                                  "set OMPI_have_llp ; "
                                  "if ( $?LD_LIBRARY_PATH == 0 ) "
                                  "setenv LD_LIBRARY_PATH %s/%s ; "
                                  "if ( $?OMPI_have_llp == 1 ) "
                                  "setenv LD_LIBRARY_PATH %s/%s:$LD_LIBRARY_PATH ; "
                                  "%s/%s/%s",
                                  (opal_prefix != NULL ? "setenv OPAL_PREFIX " : ""),
                                  (opal_prefix != NULL ? opal_prefix : ""),
                                  (opal_prefix != NULL ? " ;" : ""),
                                  prefix_dir, bin_base,
                                  prefix_dir, lib_base,
                                  prefix_dir, lib_base,
                                  prefix_dir, bin_base,
                                  mca_pls_rsh_component.orted);
                    }
                }
            }

            /* setup process name */
            rc = orte_ns.get_proc_name_string(&name_string, name);
            if (ORTE_SUCCESS != rc) {
                opal_output(0, "orte_pls_rsh: unable to create process name");
                exit(-1);
            }
            free(argv[proc_name_index]);
            argv[proc_name_index] = strdup(name_string);

            if (!mca_pls_rsh_component.debug) {
                 /* setup stdin */
                int fd = open("/dev/null", O_RDWR);
                dup2(fd, 0);
                close(fd);
            }

            /* close all file descriptors w/ exception of stdin/stdout/stderr */
            for(fd=3; fd<fdmax; fd++)
                close(fd);

            /* Set signal handlers back to the default.  Do this close
                to the execve() because the event library may (and likely
                will) reset them.  If we don't do this, the event
                library may have left some set that, at least on some
                OS's, don't get reset via fork() or exec().  Hence, the
                orted could be unkillable (for example). */

            set_handler_default(SIGTERM);
            set_handler_default(SIGINT);
            set_handler_default(SIGHUP);
            set_handler_default(SIGPIPE);
            set_handler_default(SIGCHLD);
            
            /* Unblock all signals, for many of the same reasons that
                we set the default handlers, above.  This is noticeable
                on Linux where the event library blocks SIGTERM, but we
                don't want that blocked by the orted (or, more
                specifically, we don't want it to be blocked by the
                orted and then inherited by the ORTE processes that it
                forks, making them unkillable by SIGTERM). */
            sigprocmask(0, 0, &sigs);
            sigprocmask(SIG_UNBLOCK, &sigs, 0);
            
            /* setup environment */
            env = opal_argv_copy(environ);
            var = mca_base_param_environ_variable("seed",NULL,NULL);
            opal_setenv(var, "0", true, &env);

            /* exec the daemon */
            if (mca_pls_rsh_component.debug) {
                param = opal_argv_join(exec_argv, ' ');
                if (NULL != param) {
                    char* env_array = opal_argv_join( env, ' ' );
                    opal_output(0, "pls:rsh: executing: (%s) %s [%s]",
                                exec_path, param, env_array);
                    free(param); free(env_array);
                }
            }
            execve(exec_path, exec_argv, env);
            opal_output(0, "pls:rsh: execv of %s failed with errno=%s(%d)\n",
                        exec_path, strerror(errno), errno);
            exit(-1);

        } else { /* parent */
            OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
            /* JJH Bug:
             * If we are in '--debug-daemons' we keep the ssh connection 
             * alive for the span of the run. If we use this option 
             * AND we launch on more than "num_concurrent" machines
             * then we will deadlock. No connections are terminated 
             * until the job is complete, no job is started
             * since all the orteds are waiting for all the others
             * to come online, and the others are not launched because
             * we are waiting on those that have started to terminate
             * their ssh tunnels. :(
             */
            if (mca_pls_rsh_component.num_children++ >=
                mca_pls_rsh_component.num_concurrent) {
                opal_condition_wait(&mca_pls_rsh_component.cond, &mca_pls_rsh_component.lock);
            }
            OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
            
            /* setup callback on sigchild - wait until setup above is complete
             * as the callback can occur in the call to orte_wait_cb
             */
            orte_wait_cb(pid, orte_pls_rsh_wait_daemon, dmn);

            /* if required - add delay to avoid problems w/ X11 authentication */
            if (mca_pls_rsh_component.debug && mca_pls_rsh_component.delay) {
                sleep(mca_pls_rsh_component.delay);
            }
            vpid++;
        }
        free(name);
    }
    
    /* all done, so store the daemon info on the registry */
    if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&active_daemons))) {
        ORTE_ERROR_LOG(rc);
    }

cleanup:
    OBJ_RELEASE(map);

    if (NULL != lib_base) {
        free(lib_base);
    }
    if (NULL != bin_base) {
        free(bin_base);
    }

    if (NULL != jobid_string) free(jobid_string);  /* done with this variable */
    if (NULL != argv) opal_argv_free(argv);

    return rc;
}
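
For reference, below is a minimal, hypothetical sketch of the find_shell() helper that the launch code above calls but does not show; it is not the actual Open MPI implementation. It classifies a shell path such as "/bin/bash" by its basename into the ORTE_PLS_RSH_SHELL_* values used in the two switch statements, and the enum here is a stand-in for the one declared in the rsh PLS component header.

#include <string.h>

/* stand-in for the enum declared in the rsh PLS header (assumption) */
typedef enum {
    ORTE_PLS_RSH_SHELL_UNKNOWN = 0,
    ORTE_PLS_RSH_SHELL_SH,
    ORTE_PLS_RSH_SHELL_KSH,
    ORTE_PLS_RSH_SHELL_ZSH,
    ORTE_PLS_RSH_SHELL_BASH,
    ORTE_PLS_RSH_SHELL_TCSH,
    ORTE_PLS_RSH_SHELL_CSH
} orte_pls_rsh_shell_t;

static orte_pls_rsh_shell_t find_shell(const char *shell_path)
{
    /* classify by basename so "/bin/bash" and "/usr/local/bin/bash" both match */
    const char *base = strrchr(shell_path, '/');
    base = (NULL == base) ? shell_path : base + 1;

    if (0 == strcmp(base, "sh"))   return ORTE_PLS_RSH_SHELL_SH;
    if (0 == strcmp(base, "ksh"))  return ORTE_PLS_RSH_SHELL_KSH;
    if (0 == strcmp(base, "zsh"))  return ORTE_PLS_RSH_SHELL_ZSH;
    if (0 == strcmp(base, "bash")) return ORTE_PLS_RSH_SHELL_BASH;
    if (0 == strcmp(base, "tcsh")) return ORTE_PLS_RSH_SHELL_TCSH;
    if (0 == strcmp(base, "csh"))  return ORTE_PLS_RSH_SHELL_CSH;
    return ORTE_PLS_RSH_SHELL_UNKNOWN;
}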
Example #2
0
int opal_crs_self_checkpoint(pid_t pid,
                             opal_crs_base_snapshot_t *base_snapshot,
                             opal_crs_base_ckpt_options_t *options,
                             opal_crs_state_type_t *state)
{
    opal_crs_self_snapshot_t *snapshot = OBJ_NEW(opal_crs_self_snapshot_t);
    int ret, exit_status = OPAL_SUCCESS;
    char * restart_cmd = NULL;

    /*
     * This function should never be called by a tool
     */
    if( opal_cr_is_tool ) {
        return OPAL_ERR_NOT_SUPPORTED;
    }

    if( options->stop ) {
        opal_output(0,
                    "crs:self: checkpoint(): Error: SIGSTOP Not currently supported!");
    }

    /*
     * Setup for snapshot directory creation
     */
    snapshot->super = *base_snapshot;
#if 0
    snapshot->super.snapshot_directory = strdup(base_snapshot->snapshot_directory);
    snapshot->super.metadata_filename  = strdup(base_snapshot->metadata_filename);
#endif

    opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                        "crs:self: checkpoint(%d, ---)", pid);

    if(!mca_crs_self_component.can_checkpoint) {
        opal_show_help("help-opal-crs-self.txt", "self:ckpt_disabled", false);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * Update the snapshot metadata
     */
    snapshot->super.component_name = strdup(mca_crs_self_component.super.base_version.mca_component_name);
    if( NULL == snapshot->super.metadata ) {
        if (NULL == (snapshot->super.metadata = fopen(snapshot->super.metadata_filename, "a")) ) {
            opal_output(mca_crs_self_component.super.output_handle,
                        "crs:self: checkpoint(): Error: Unable to open the file (%s)",
                        snapshot->super.metadata_filename);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }
    fprintf(snapshot->super.metadata, "%s%s\n", CRS_METADATA_COMP, snapshot->super.component_name);

    /*
     * Call the user callback function
     */
    if(NULL != mca_crs_self_component.ucb_checkpoint_fn) {
        mca_crs_self_component.ucb_checkpoint_fn(&restart_cmd);
    }

    /*
     * Save the restart command
     */
    if( NULL == restart_cmd) {
        *state = OPAL_CRS_ERROR;
        opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
                       true);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    else {
        snapshot->cmd_line = strdup(restart_cmd);

        opal_output_verbose(10, mca_crs_self_component.super.output_handle,
                            "crs:self: checkpoint: Restart Command (%s)", snapshot->cmd_line);
    }

    /*
     * The best we can do is update the metadata file with the
     * application argv and argc we started with.
     */
    if( OPAL_SUCCESS != (ret = self_update_snapshot_metadata(snapshot)) ) {
        *state = OPAL_CRS_ERROR;
        opal_output(mca_crs_self_component.super.output_handle,
                    "crs:self: checkpoint(): Error: Unable to update metadata for snapshot (%s).",
                    snapshot->super.metadata_filename);
        exit_status = ret;
        goto cleanup;
    }


    *state = OPAL_CRS_CONTINUE;
    
    /*
     * Call their continue routine for completeness
     */
    if(NULL != mca_crs_self_component.ucb_continue_fn) {
        mca_crs_self_component.ucb_continue_fn();
    }

    base_snapshot = &(snapshot->super);

 cleanup:
    if( NULL != restart_cmd) {
        free(restart_cmd);
        restart_cmd = NULL;
    }

    return exit_status;
}
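
Below is a hedged sketch, not taken from the Open MPI sources, of the kind of user-supplied callback that can be hooked up as mca_crs_self_component.ucb_checkpoint_fn. The only contract visible in the checkpoint routine above is that the callback receives a char** and fills it with a heap-allocated restart command, which is then strdup()'d into snapshot->cmd_line and freed in the cleanup path; the application name and flag below are made up for illustration.

#define _GNU_SOURCE        /* for asprintf(3) on glibc */
#include <stdio.h>
#include <stdlib.h>

/* hypothetical user callback: hand back the command line that should be used
 * to restart this application from its self-checkpoint state */
static void my_self_checkpoint_cb(char **restart_cmd)
{
    /* the caller owns and frees this string, so it must be heap-allocated */
    if (0 > asprintf(restart_cmd, "./my_app --restart latest.ckpt")) {
        *restart_cmd = NULL;   /* a NULL command makes checkpoint() report an error */
    }
}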
Example #3
0
int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *policy, char *spec)
{
    int i;
    opal_binding_policy_t tmp;
    char **tmpvals, **quals;
                
    /* set default */
    tmp = 0;

    /* binding specification */
    if (NULL == spec) {
        /* default to bind-to core, and that no binding policy was specified */
        OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
        tmp &= ~OPAL_BIND_GIVEN;
    } else if (0 == strncasecmp(spec, "none", strlen("none"))) {
        OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NONE);
    } else {
        tmpvals = opal_argv_split(spec, ':');
        if (1 < opal_argv_count(tmpvals) || ':' == spec[0]) {
            if (':' == spec[0]) {
                quals = opal_argv_split(&spec[1], ',');
            } else {
                quals = opal_argv_split(tmpvals[1], ',');
            }
            for (i=0; NULL != quals[i]; i++) {
                if (0 == strncasecmp(quals[i], "if-supported", strlen(quals[i]))) {
                    tmp |= OPAL_BIND_IF_SUPPORTED;
                } else if (0 == strncasecmp(quals[i], "overload-allowed", strlen(quals[i])) ||
                           0 == strncasecmp(quals[i], "oversubscribe-allowed", strlen(quals[i]))) {
                    tmp |= OPAL_BIND_ALLOW_OVERLOAD;
                } else {
                    /* unknown option */
                    opal_output(0, "Unknown qualifier to binding policy: %s", spec);
                    /* release the argv arrays before bailing out */
                    opal_argv_free(quals);
                    opal_argv_free(tmpvals);
                    return OPAL_ERR_BAD_PARAM;
                }
            }
            opal_argv_free(quals);
        }
        if (NULL == tmpvals[0] || ':' == spec[0]) {
            OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
            tmp &= ~OPAL_BIND_GIVEN;
        } else {
            if (0 == strcasecmp(tmpvals[0], "hwthread")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_HWTHREAD);
            } else if (0 == strcasecmp(tmpvals[0], "core")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
            } else if (0 == strcasecmp(tmpvals[0], "l1cache")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L1CACHE);
            } else if (0 == strcasecmp(tmpvals[0], "l2cache")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L2CACHE);
            } else if (0 == strcasecmp(tmpvals[0], "l3cache")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L3CACHE);
            } else if (0 == strcasecmp(tmpvals[0], "socket")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_SOCKET);
            } else if (0 == strcasecmp(tmpvals[0], "numa")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NUMA);
            } else if (0 == strcasecmp(tmpvals[0], "board")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_BOARD);
            } else {
                opal_show_help("help-opal-hwloc-base.txt", "invalid binding_policy", true, "binding", spec);
                opal_argv_free(tmpvals);
                return OPAL_ERR_BAD_PARAM;
            }
        }
        opal_argv_free(tmpvals);
    }

    *policy = tmp;
    return OPAL_SUCCESS;
}
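
A short usage sketch follows; it assumes it is compiled inside the OPAL tree with the hwloc base header available (the include path is an assumption), and exercises the "<target>[:<qualifier>,...]" syntax parsed above. The qualifier bits are OR'd directly into the policy word, so they can be tested with a plain bitwise AND.

#include <stdio.h>
#include "opal/mca/hwloc/base/base.h"   /* assumed location of the prototype above */

static int demo_set_binding_policy(void)
{
    opal_binding_policy_t policy = 0;
    int rc;

    /* bind each process to a core, and allow more processes than cores */
    rc = opal_hwloc_base_set_binding_policy(&policy, "core:overload-allowed");
    if (OPAL_SUCCESS != rc) {
        fprintf(stderr, "invalid binding specification\n");
        return rc;
    }

    /* qualifiers are plain bit flags in the policy word */
    if (policy & OPAL_BIND_ALLOW_OVERLOAD) {
        printf("binding to core, overload allowed\n");
    }
    return OPAL_SUCCESS;
}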
Example #4
0
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_create(opal_shmem_ds_t *ds_buf,
               const char *file_name,
               size_t size)
{
    int rc = OPAL_SUCCESS;
    pid_t my_pid = getpid();
    /* the real size of the shared memory segment.  this includes enough space
     * to store our segment header.
     */
    size_t real_size = size + sizeof(opal_shmem_seg_hdr_t);
    opal_shmem_seg_hdr_t *seg_hdrp = MAP_FAILED;

    /* init the contents of opal_shmem_ds_t */
    shmem_ds_reset(ds_buf);

    /* for posix shared memory we don't have to worry about the backing store
     * being located on a network file system... so no check is needed here.
     */

    /* calling shmem_posix_shm_open searches for an available posix shared
     * memory object name and upon successful completion populates the name
     * buffer
     */
    if (-1 == (ds_buf->seg_id = shmem_posix_shm_open(
                                    ds_buf->seg_name,
                                    OPAL_SHMEM_POSIX_FILE_LEN_MAX - 1))) {
        /* snaps!  something happened in posix_shm_open.  don't report anything
         * here because posix_shm_open will display all the necessary info.
         */
        rc = OPAL_ERROR;
        goto out;
    }
    /* size backing file - note the use of real_size here */
    else if (0 != ftruncate(ds_buf->seg_id, real_size)) {
        int err = errno;
        char hn[OPAL_MAXHOSTNAMELEN];
        gethostname(hn, sizeof(hn));
        opal_show_help("help-opal-shmem-posix.txt", "sys call fail", 1, hn,
                       "ftruncate(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    else if (MAP_FAILED == (seg_hdrp = (opal_shmem_seg_hdr_t*)mmap(NULL, real_size,
                                                                   PROT_READ | PROT_WRITE, MAP_SHARED,
                                                                   ds_buf->seg_id, 0))) {
        int err = errno;
        char hn[OPAL_MAXHOSTNAMELEN];
        gethostname(hn, sizeof(hn));
        opal_show_help("help-opal-shmem-posix.txt", "sys call fail", 1, hn,
                       "mmap(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    /* all is well */
    else {
        /* -- initialize the shared memory segment -- */
        opal_atomic_rmb();

        /* init segment lock */
        opal_atomic_init(&seg_hdrp->lock, OPAL_ATOMIC_UNLOCKED);
        /* i was the creator of this segment, so note that fact */
        seg_hdrp->cpid = my_pid;

        opal_atomic_wmb();

        /* -- initialize the contents of opal_shmem_ds_t -- */
        ds_buf->seg_cpid = my_pid;
        ds_buf->seg_size = real_size;
        ds_buf->seg_base_addr = (unsigned char *)seg_hdrp;

        /* notice that we are not setting ds_buf->name here.  at this point,
         * posix_shm_open was successful, so the contents of ds_buf->name are
         * already set for us :-)
         */

        /* set "valid" bit because setment creation was successful */
        OPAL_SHMEM_DS_SET_VALID(ds_buf);

        OPAL_OUTPUT_VERBOSE(
            (70, opal_shmem_base_framework.framework_output,
             "%s: %s: create successful "
             "(id: %d, size: %lu, name: %s)\n",
             mca_shmem_posix_component.super.base_version.mca_type_name,
             mca_shmem_posix_component.super.base_version.mca_component_name,
             ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
        );
    }

out:
    /* in this component, the id is the file descriptor returned by open.  this
     * check is here to see if it is safe to call close on the file descriptor.
     * that is, we are making sure that our call to open was successful and
     * we are not in an error path.
     */
    if (-1 != ds_buf->seg_id) {
        if (0 != close(ds_buf->seg_id)) {
            int err = errno;
            char hn[OPAL_MAXHOSTNAMELEN];
            gethostname(hn, sizeof(hn));
            opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                           "close(2)", "", strerror(err), err);
            rc = OPAL_ERROR;
        }
    }
    /* an error occurred, so invalidate the shmem object and release any
     * allocated resources.
     */
    if (OPAL_SUCCESS != rc) {
        /* posix_shm_open was successful, but something else wasn't.
         * note: if the id is not equal to -1 and we are here, name will be
         * valid.  that is, we can safely call shm_unlink with ds_buf->name.
         */
        if (-1 != ds_buf->seg_id) {
            shm_unlink(ds_buf->seg_name);
        }
        if (MAP_FAILED != seg_hdrp) {
            munmap((void*)seg_hdrp, real_size);
        }
        /* always invalidate in this error path */
        shmem_ds_reset(ds_buf);
    }
    return rc;
}
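
For context, here is a hedged sketch (not the component's actual attach routine) of the consumer side of the scheme above: a peer opens the already-created POSIX shared-memory object by the name recorded in ds_buf->seg_name and maps the same real_size bytes, segment header included, using only standard POSIX calls. The function and parameter names are illustrative.

#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stddef.h>

/* attach to an existing POSIX shm object by name; returns the mapped base
 * address, or NULL on failure.  seg_size must be the creator's real_size
 * (the user-requested size plus the segment header). */
static void *attach_sketch(const char *seg_name, size_t seg_size)
{
    void *addr;
    int fd = shm_open(seg_name, O_RDWR, 0600);   /* object already created and sized */
    if (-1 == fd) {
        return NULL;
    }
    addr = mmap(NULL, seg_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    /* as in segment_create, the descriptor is not needed once the mapping exists */
    (void)close(fd);
    return (MAP_FAILED == addr) ? NULL : addr;
}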
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    bool timing = false;
    int param, value;
    struct timeval ompistart, ompistop;
    char *event_val = NULL;
    opal_paffinity_base_cpu_set_t mask;
    bool proc_bound;
#if 0
    /* see comment below about sched_yield */
    int num_processors;
#endif
    bool orte_setup = false;
    bool paffinity_enabled = false;

    /* bitflag of the thread level support provided. To be used
     * for the modex in order to work in heterogeneous environments. */
    uint8_t threadlevel_bf; 

    /* Indicate that we have *started* MPI_INIT*.  MPI_FINALIZE has
       something sorta similar in a static local variable in
       ompi_mpi_finalize(). */
    ompi_mpi_init_started = true;

    /* Setup enough to check get/set MCA params */

    if (ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends).  Per opal/event/event.s, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */
    ret = mca_base_param_reg_string_name("opal", "event_include",
                                         "Internal orted MCA param: tell opal_init() to use a specific mechanism in libevent",
                                         false, false, "all", &event_val);
    if (ret >= 0) {
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintended
           side-effects with launching ORTE tools... */
        if (0 == strcmp("all", event_val)) {
            mca_base_param_set_string(ret, "all");
        }
    }

    if( NULL != event_val ) {
        free(event_val);
        event_val = NULL;
    }

    /* check to see if we want timing information */
    param = mca_base_param_reg_int_name("ompi", "timing",
                                        "Request that critical timing loops be measured",
                                        false, false, 0, &value);
    if (value != 0) {
        timing = true;
        gettimeofday(&ompistart, NULL);
    }
    
    /* Setup ORTE - note that we are an MPI process  */
    if (ORTE_SUCCESS != (ret = orte_init(NULL, NULL, ORTE_PROC_MPI))) {
        error = "ompi_mpi_init: orte_init failed";
        goto error;
    }
    orte_setup = true;
    
    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init [%ld]: time from start to completion of orte_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

    /* Figure out the final MPI thread levels.  If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE.  Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */

    ompi_mpi_thread_requested = requested;
    if (OPAL_HAVE_THREAD_SUPPORT == 0) {
        ompi_mpi_thread_provided = *provided = MPI_THREAD_SINGLE;
        ompi_mpi_main_thread = NULL;
    } else if (OMPI_ENABLE_THREAD_MULTIPLE == 1) {
        ompi_mpi_thread_provided = *provided = requested;
        ompi_mpi_main_thread = opal_thread_get_self();
    } else {
        if (MPI_THREAD_MULTIPLE == requested) {
            ompi_mpi_thread_provided = *provided = MPI_THREAD_SERIALIZED;
        } else {
            ompi_mpi_thread_provided = *provided = requested;
        }
        ompi_mpi_main_thread = opal_thread_get_self();
    }

    ompi_mpi_thread_multiple = (ompi_mpi_thread_provided == 
                                MPI_THREAD_MULTIPLE);
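
    /* Summary of the decision above:
     *   no thread support compiled in             -> provided = MPI_THREAD_SINGLE
     *   MPI_THREAD_MULTIPLE support compiled in   -> provided = requested
     *   otherwise, MPI_THREAD_MULTIPLE requested  -> provided = MPI_THREAD_SERIALIZED
     *   otherwise                                 -> provided = requested
     */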


    /* determine the bitflag belonging to the threadlevel_support provided */
    memset ( &threadlevel_bf, 0, sizeof(uint8_t));
    OMPI_THREADLEVEL_SET_BITFLAG ( ompi_mpi_thread_provided, threadlevel_bf );

    /* add this bitflag to the modex */
    if ( OMPI_SUCCESS != (ret = ompi_modex_send_string("MPI_THREAD_LEVEL", &threadlevel_bf, sizeof(uint8_t)))) {
	error = "ompi_mpi_init: modex send thread level";
	goto error;
    }

#if OPAL_HAVE_HWLOC
    /* If orte_init() didn't fill in opal_hwloc_topology, then we need
       to go fill it in ourselves. */
    if (NULL == opal_hwloc_topology) {
        if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
            0 != hwloc_topology_load(opal_hwloc_topology)) {
            return OPAL_ERR_NOT_SUPPORTED;
        }
    }
#endif

    /* Once we've joined the RTE, see if any MCA parameters were
       passed to the MPI level */

    if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) {
        error = "mca_mpi_register_params() failed";
        goto error;
    }

    /* If desired, send a notify message */
    if (ompi_notify_init_finalize) {
        orte_notifier.log(ORTE_NOTIFIER_NOTICE,
                          ORTE_SUCCESS,
                          "MPI_INIT:Starting on host %s, pid %d",
                          orte_process_info.nodename,
                          orte_process_info.pid);
    }

    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_datatype_init())) {
        error = "ompi_datatype_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "mca_proc_init() failed";
        goto error;
    }

    /* Initialize the op framework. This has to be done *after*
       ddt_init, but before mca_coll_base_open, since some collective
       modules (e.g., the hierarchical coll component) may need ops in
       their query function. */
    if (OMPI_SUCCESS != (ret = ompi_op_base_open())) {
        error = "ompi_op_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != 
        (ret = ompi_op_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                                           OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "ompi_op_base_find_available() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }

    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_allocator_base_open())) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_rcache_base_open())) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_mpool_base_open())) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_pml_base_open())) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_coll_base_open())) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_osc_base_open())) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_open())) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS != 
        (ret = mca_mpool_base_init(OPAL_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_mpool_base_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_pml_base_select(OPAL_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from completion of orte_init to modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* exchange connection info - this function also acts as a barrier
     * as it will not return until the exchange is complete
     */
    if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
        error = "orte_grpcomm_modex failed";
        goto error;
    }

    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_set_arch())) {
        error = "ompi_proc_set_arch failed";
        goto error;
    }

    /* if it hasn't already been done, setup process affinity. 
     * First check to see if a slot list was
     * specified.  If so, use it.  If no slot list was specified,
     * that's not an error -- just fall through and try the next
     * paffinity scheme.
     */
    ret = opal_paffinity_base_get(&mask);
    if (OPAL_SUCCESS == ret) {
        /* paffinity is supported - check for binding */
        OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound);
        if (proc_bound || opal_paffinity_base_bound) {
            /* someone external set it - indicate it is set
             * so that we know
             */
            paffinity_enabled = true;
        } else {
            /* the system is capable of doing processor affinity, but it
             * has not yet been set - see if a slot_list was given
             */
            if (NULL != opal_paffinity_base_slot_list) {
                /* It's an error if multiple paffinity schemes were specified */
                if (opal_paffinity_alone) {
                    ret = OMPI_ERR_BAD_PARAM;
                    error = "Multiple processor affinity schemes specified (can only specify one)";
                    goto error;
                }
                ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid, opal_paffinity_base_slot_list);
                if (OPAL_SUCCESS != ret && OPAL_ERR_NOT_FOUND != ret) {
                    error = "opal_paffinity_base_slot_list_set() returned an error";
                    goto error;
                }
                paffinity_enabled = true;
            } else if (opal_paffinity_alone) {
                /* no slot_list, but they asked for paffinity */
                int phys_cpu;
                orte_node_rank_t nrank;
                if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
                    error = "Could not get node rank - cannot set processor affinity";
                    goto error;
                }
                OPAL_PAFFINITY_CPU_ZERO(mask);
                phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
                if (0 > phys_cpu) {
                    ret = phys_cpu;
                    error = "Could not get physical processor id - cannot set processor affinity";
                    goto error;
                }
                OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
                ret = opal_paffinity_base_set(mask);
                if (OPAL_SUCCESS != ret) {
                    error = "Setting processor affinity failed";
                    goto error;
                }
                paffinity_enabled = true;
            }
        }
    }
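
    /* To recap the three affinity cases handled above:
     *   - a binding was already applied externally -> just note that it is set
     *   - a slot list was supplied                 -> bind according to the slot list
     *   - opal_paffinity_alone, no slot list       -> bind to the physical processor
     *                                                 corresponding to our node rank
     */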
    
    /* If we were able to set processor affinity, try setting up
       memory affinity */
    if (!opal_maffinity_setup && paffinity_enabled) {
        if (OPAL_SUCCESS == opal_maffinity_base_open() &&
            OPAL_SUCCESS == opal_maffinity_base_select()) {
            opal_maffinity_setup = true;
        }
    }
    
    /* select buffered send allocator component to be used */
    ret=mca_pml_base_bsend_init(OMPI_ENABLE_THREAD_MULTIPLE);
    if( OMPI_SUCCESS != ret ) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = ompi_osc_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                                           OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_info_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }
    
    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }
     
    /* initialize groups  */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* If thread support was enabled, then setup OPAL to allow for
       them. */
    if ((OPAL_ENABLE_PROGRESS_THREADS == 1) ||
        (*provided != MPI_THREAD_SINGLE)) {
        opal_set_using_threads(true);
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if( OMPI_SUCCESS != ret ) {
        error = "PML control failed";
        goto error;
    }

    /* add all ompi_proc_t's to PML */
    if (NULL == (procs = ompi_proc_world(&nprocs))) {
        error = "ompi_proc_world() failed";
        goto error;
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    /* If we got "unreachable", then print a specific error message.
       Otherwise, if we got some other failure, fall through to print
       a generic message. */
    if (OMPI_ERR_UNREACH == ret) {
        orte_show_help("help-mpi-runtime",
                       "mpi_init:startup:pml-add-procs-fail", true);
        error = NULL;
        goto error;
    } else if (OMPI_SUCCESS != ret) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));

    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
       ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank, 
                                nprocs, 
                                orte_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_wait_for_debugger();
    
    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from modex to first barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* wait for everyone to reach this point */
    if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
        error = "orte_grpcomm_barrier failed";
        goto error;
    }
    
    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

#if OPAL_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations.  Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code.  This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on ORTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif
    
    /* Wire up the MPI interface, if requested.  Do this after the
       non-blocking switch for non-TCP performance.  Do it before the
       polling change, as anyone with a complex wire-up is going to be
       using the OOB. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_init_preconnect_mpi() failed";
        goto error;
    }

    /* Setup the publish/subscribe (PUBSUB) framework */
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_open())) {
        error = "ompi_pubsub_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_select())) {
        error = "ompi_pubsub_base_select() failed";
        goto error;
    }
    
    /* Setup the dynamic process management (DPM) framework */
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_open())) {
        error = "ompi_dpm_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_select())) {
        error = "ompi_dpm_base_select() failed";
        goto error;
    }


    /* Determine the overall thread level support of all processes
       in MPI_COMM_WORLD.  This has to be done before calling
       coll_base_comm_select, since some of the collective components
       (e.g. hierarch) might create subcommunicators.  The thread level
       requested by all processes is required in order to know
       which CID allocation algorithm can be used. */
    if (OMPI_SUCCESS !=
        (ret = ompi_comm_cid_init())) {
        error = "ompi_mpi_init: ompi_comm_cid_init failed";
        goto error;
    }

    /* Init coll for the comms.  This has to be done after dpm_base_select
       (since dpm.mark_dyncomm is not otherwise set in the communicator
       creation function), but before dpm.dyn_init, since that function
       might require collectives for the CID allocation. */
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }


    
    /* Check whether we have been spawned or not.  We do this at the
       very end, since we need collectives, datatypes, PTLs, etc. up
       and running before dynamic process support can work. */
    if (OMPI_SUCCESS != (ret = ompi_dpm.dyn_init())) {
        error = "ompi_dpm.dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during 
       opal_init, to get better latency when not using TCP.  Do 
       this *after* dyn_init, as dyn init uses lots of ORTE 
       communication and we don't want to hinder the performance of 
       that code. */ 
    opal_progress_event_users_decrement(); 

    /* see if yield_when_idle was specified - if so, use it */
    param = mca_base_param_find("mpi", NULL, "yield_when_idle");
    mca_base_param_lookup_int(param, &value);
    if (value < 0) {
        /* if no info is provided, just default to conservative */
        opal_progress_set_yield_when_idle(true);
    } else {
        /* info was provided, so set idle accordingly */
        opal_progress_set_yield_when_idle(value == 0 ? false : true);
    }
    
    param = mca_base_param_find("mpi", NULL, "event_tick_rate");
    mca_base_param_lookup_int(param, &value);
    /* negative value means use default - just don't do anything */
    if (value >= 0) {
        opal_progress_set_event_poll_rate(value);
    }

    /* At this point, we are fully configured and in MPI mode.  Any
       communication calls here will work exactly like they would in
       the user's code.  Setup the connections between procs and warm
       them up with simple sends, if requested */

 error:
    if (ret != OMPI_SUCCESS) {
        /* Only print a message if one was not already printed */
        if (NULL != error) {
            const char *err_msg = opal_strerror(ret);
            /* If ORTE was not setup yet, don't use orte_show_help */
            if (orte_setup) {
                orte_show_help("help-mpi-runtime",
                               "mpi_init:startup:internal-failure", true,
                               "MPI_INIT", "MPI_INIT", error, err_msg, ret);
            } else {
                opal_show_help("help-mpi-runtime",
                               "mpi_init:startup:internal-failure", true,
                               "MPI_INIT", "MPI_INIT", error, err_msg, ret);
            }
        }
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the hash tables used to store the F90 types returned by
     * the MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);
    
    /* All done.  Wasn't that simple? */

    ompi_mpi_initialized = true;

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from barrier to complete mpi_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
    }

    /* If desired, send a notifier message that we've finished MPI_INIT */
    if (ompi_notify_init_finalize) {
        orte_notifier.log(ORTE_NOTIFIER_NOTICE,
                          ORTE_SUCCESS,
                          "MPI_INIT:Finishing on host %s, pid %d",
                          orte_process_info.nodename,
                          orte_process_info.pid);
    }

    return MPI_SUCCESS;
}
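/* [Editor's note] The timing blocks in the example above repeatedly compute an
 * elapsed interval in microseconds from two gettimeofday() samples.  Below is
 * a minimal, self-contained sketch of that same pattern; the helper name
 * elapsed_usec() is illustrative and not part of Open MPI. */
#include <stdio.h>
#include <sys/time.h>

static long elapsed_usec(const struct timeval *start, const struct timeval *stop)
{
    /* same arithmetic as the timing reports above: seconds scaled to
     * microseconds, plus the microsecond remainder */
    return (long)((stop->tv_sec - start->tv_sec) * 1000000 +
                  (stop->tv_usec - start->tv_usec));
}

int main(void)
{
    struct timeval start, stop;

    gettimeofday(&start, NULL);
    /* ... the work being timed goes here ... */
    gettimeofday(&stop, NULL);

    printf("elapsed %ld usec\n", elapsed_usec(&start, &stop));
    return 0;
}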
Example #6
0
int
main(int argc, char *argv[])
{
    int ret, exit_status = OPAL_SUCCESS;
    int child_pid;
    int prev_pid = 0;
    int idx;
    opal_crs_base_snapshot_t *snapshot = NULL;
    char * tmp_env_var = NULL;
    bool select = false;

    /***************
     * Initialize
     ***************/
    if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Check for existence of the file, or program in the case of self
     */
    if( OPAL_SUCCESS != (ret = check_file() )) {
        opal_show_help("help-opal-restart.txt", "invalid_filename", true,
                       opal_restart_globals.snapshot_ref);
        exit_status = ret;
        goto cleanup;
    }

    /* Re-enable the selection of the CRS component, so we can choose the right one */
    idx = mca_base_var_find(NULL, "crs", "base", "do_not_select");

    if (0 > idx) {
        opal_output(opal_restart_globals.output,
                    "MCA variable opal_crs_base_do_not_select not found\n");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    ret = mca_base_var_set_value(idx, &select, 0, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
    if (OPAL_SUCCESS != ret) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Make sure we are using the correct checkpointer
     */
    if(NULL == expected_crs_comp) {
        char * full_metadata_path = NULL;
        FILE * metadata = NULL;

        opal_asprintf(&full_metadata_path, "%s/%s/%s",
                 opal_restart_globals.snapshot_loc,
                 opal_restart_globals.snapshot_ref,
                 opal_restart_globals.snapshot_metadata);
        if( NULL == (metadata = fopen(full_metadata_path, "r")) ) {
            opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                           opal_restart_globals.snapshot_metadata,
                           full_metadata_path);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(metadata,
                                                                            &expected_crs_comp,
                                                                            &prev_pid)) ) {
            opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                           opal_restart_globals.snapshot_metadata,
                           full_metadata_path);
            exit_status = ret;
            goto cleanup;
        }

        free(full_metadata_path);
        full_metadata_path = NULL;

        fclose(metadata);
        metadata = NULL;
    }

    opal_output_verbose(10, opal_restart_globals.output,
                        "Restart Expects checkpointer: (%s)",
                        expected_crs_comp);

    (void) mca_base_var_env_name("crs", &tmp_env_var);
    opal_setenv(tmp_env_var,
                expected_crs_comp,
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /* Select this component or don't continue.
     * If the selection of this component fails, then we can't
     * restart on this node because it doesn't have the proper checkpointer
     * available.
     */
    if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) {
        opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
                       "crs", ret);
        exit_status = ret;
        goto cleanup;
    }

    if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
        opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
                       expected_crs_comp, ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Make sure we have selected the proper component
     */
    if(NULL == expected_crs_comp ||
       0 != strncmp(expected_crs_comp,
                    opal_crs_base_selected_component.base_version.mca_component_name,
                    strlen(expected_crs_comp)) ) {
        opal_show_help("help-opal-restart.txt", "comp_select_mismatch",
                       true,
                       expected_crs_comp,
                       opal_crs_base_selected_component.base_version.mca_component_name,
                       ret);
        exit_status = ret;
        goto cleanup;
    }

    /******************************
     * Restart in this process
     ******************************/
    opal_output_verbose(10, opal_restart_globals.output,
                        "Restarting from file (%s)\n",
                        opal_restart_globals.snapshot_ref);

    snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
    snapshot->cold_start         = true;
    opal_asprintf(&(snapshot->snapshot_directory), "%s/%s",
             opal_restart_globals.snapshot_loc,
             opal_restart_globals.snapshot_ref);
    opal_asprintf(&(snapshot->metadata_filename), "%s/%s",
             snapshot->snapshot_directory,
             opal_restart_globals.snapshot_metadata);

    /* Since some checkpoint/restart systems don't pass along env vars to the
     * restarted app, we need to take care of that.
     *
     * Included here is the creation of any files or directories that need to be
     * created before the process is restarted.
     */
    if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot) ) ) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Do the actual restart
     */
    ret = opal_crs.crs_restart(snapshot,
                               false,
                               &child_pid);

    if (OPAL_SUCCESS != ret) {
        opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true,
                       opal_restart_globals.snapshot_ref,
                       ret,
                       opal_crs_base_selected_component.base_version.mca_component_name);
        exit_status = ret;
        goto cleanup;
    }
    /* Should never get here, since crs_restart calls exec */

    /***************
     * Cleanup
     ***************/
 cleanup:
    if (OPAL_SUCCESS != (ret = finalize())) {
        return ret;
    }

    if (NULL != snapshot) {
        /* snapshot was created with OBJ_NEW, so release it rather than
         * destructing it in place */
        OBJ_RELEASE(snapshot);
    }

    return exit_status;
}
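/* [Editor's note] The example above (like most of the examples in this
 * collection) relies on the "record the status, goto cleanup" idiom so that a
 * single exit path releases resources.  A minimal standalone sketch of that
 * idiom follows; the step names and the buffer are placeholders, not Open MPI
 * functions. */
#include <stdio.h>
#include <stdlib.h>

static int step_one(void) { return 0; }
static int step_two(void) { return -1; }    /* simulate a failure */

static int run(void)
{
    int ret, exit_status = 0;
    char *buffer = NULL;

    if (0 != (ret = step_one())) {
        exit_status = ret;
        goto cleanup;
    }

    buffer = malloc(64);
    if (NULL == buffer) {
        exit_status = -1;
        goto cleanup;
    }

    if (0 != (ret = step_two())) {
        exit_status = ret;
        goto cleanup;
    }

 cleanup:
    free(buffer);               /* safe on NULL; runs on every exit path */
    return exit_status;
}

int main(void)
{
    printf("exit_status = %d\n", run());
    return 0;
}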
Example #7
0
static int parse_args(int argc, char *argv[])
{
    int i, ret, len;
    opal_cmd_line_t cmd_line;
    char **app_env = NULL, **global_env = NULL;

    opal_restart_globals.help = false;
    opal_restart_globals.verbose = false;
    opal_restart_globals.snapshot_ref = NULL;
    opal_restart_globals.snapshot_loc = NULL;
    opal_restart_globals.snapshot_metadata = NULL;
    opal_restart_globals.snapshot_cache = NULL;
    opal_restart_globals.snapshot_compress = NULL;
    opal_restart_globals.snapshot_compress_postfix = NULL;
    opal_restart_globals.output = 0;

    /* Parse the command line options */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);

    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv);
    if (OPAL_SUCCESS != ret) {
        if (OPAL_ERR_SILENT != ret) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(ret));
        }
        return 1;
    }
    if (opal_restart_globals.help ) {
        char *str, *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-opal-restart.txt", "usage", true,
                                    args);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);
        /* If we show the help message, that should be all we do */
        exit(0);
    }

    /**
     * Put all of the MCA arguments in the environment
     */
    mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);

    len = opal_argv_count(app_env);
    for(i = 0; i < len; ++i) {
        putenv(app_env[i]);
    }

    len = opal_argv_count(global_env);
    for(i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }

    /**
     * Now start parsing our specific arguments
     */
    /* get the remaining bits */
    opal_cmd_line_get_tail(&cmd_line, &argc, &argv);

    if ( NULL == opal_restart_globals.snapshot_ref ||
         0 >= strlen(opal_restart_globals.snapshot_ref) ) {
        opal_show_help("help-opal-restart.txt", "invalid_filename", true,
                       "<none provided>");
        return OPAL_ERROR;
    }

    /* If we have arguments after the command, then assume they
     * need to be grouped together.
     * Useful in the 'mca crs self' instance.
     */
    if (argc > 0) {
        /* opal_argv_join() already returns allocated storage; strdup()ing
         * it here would leak the joined string */
        opal_restart_globals.snapshot_ref = opal_argv_join(argv, ' ');
    }

    return OPAL_SUCCESS;
}
Example #8
0
static int mca_bml_r2_add_procs( size_t nprocs, 
                                 struct ompi_proc_t** procs, 
                                 struct opal_bitmap_t* reachable )
{
    size_t p, p_index, n_new_procs = 0;
    struct mca_btl_base_endpoint_t ** btl_endpoints = NULL;  
    struct ompi_proc_t** new_procs = NULL; 
    int rc, ret = OMPI_SUCCESS;

    if(0 == nprocs) {
        return OMPI_SUCCESS;
    }
    
    if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) {
        return rc;
    }
    
    /* Select only the procs that don't yet have the BML proc struct.  This
     * prevents us from calling btl->add_procs several times on the same
     * destination proc.
     */
    for(p_index = 0; p_index < nprocs; p_index++) { 
        struct ompi_proc_t* proc = procs[p_index]; 

        if(NULL !=  proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { 
            continue;  /* go to the next proc */
        }
        /* Allocate the new_procs on demand */
        if( NULL == new_procs ) {
            new_procs = (struct ompi_proc_t **)malloc(nprocs * sizeof(struct ompi_proc_t *));
            if( NULL == new_procs ) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
        OBJ_RETAIN(proc);
        new_procs[n_new_procs++] = proc; 
    }

    if ( 0 == n_new_procs ) {
        return OMPI_SUCCESS;
    }

    /* Starting from here we only work on the unregistered procs */
    procs = new_procs; 
    nprocs = n_new_procs; 
    
    /* attempt to add all procs to each r2 */
    btl_endpoints = (struct mca_btl_base_endpoint_t **) 
        malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*)); 
    if (NULL == btl_endpoints) {
        free(new_procs);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
        mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
        int btl_inuse = 0;

        /* if the r2 can reach the destination proc it sets the
         * corresponding bit (proc index) in the reachable bitmap
         * and can return addressing information for each proc
         * that is passed back to the r2 on data transfer calls
         */
        opal_bitmap_clear_all_bits(reachable);
        memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); 

        rc = btl->btl_add_procs(btl, n_new_procs, new_procs, btl_endpoints, reachable);
        if(OMPI_SUCCESS != rc) {
            /* This BTL had trouble adding these procs.  Continue; maybe
             * some other BTL can take care of them.
             */
            continue;
        }

        /* for each proc that is reachable */
        for( p = 0; p < n_new_procs; p++ ) {
            if(opal_bitmap_is_set_bit(reachable, p)) {
                ompi_proc_t *proc = new_procs[p];
                mca_bml_base_endpoint_t * bml_endpoint = 
                    (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; 
                mca_bml_base_btl_t* bml_btl; 
                size_t size;
                
                if(NULL == bml_endpoint) { 
                    /* allocate bml specific proc data */
                    bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t);
                    if (NULL == bml_endpoint) {
                        opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources");
                        free(btl_endpoints);
                        free(new_procs);
                        return OMPI_ERR_OUT_OF_RESOURCE;
                    }
                    
                    /* preallocate space in array for max number of r2s */
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send,  mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma,  mca_bml_r2.num_btl_modules);
                    bml_endpoint->btl_max_send_size = -1;
                    bml_endpoint->btl_proc = proc;
                    proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint; 
                 
                    bml_endpoint->btl_flags_or = 0;
                }

                /* don't allow an additional BTL with a lower exclusivity ranking */
                size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
                if(size > 0) {
                    bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
                    /* skip this btl if the exclusivity is less than the previous */
                    if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) {
                        btl->btl_del_procs(btl, 1, &proc, &btl_endpoints[p]);
                        opal_output_verbose(20, ompi_btl_base_framework.framework_output, 
                                            "mca: bml: Not using %s btl to %s on node %s "
                                            "because %s btl has higher exclusivity (%d > %d)",
                                            btl->btl_component->btl_version.mca_component_name,
                                            OMPI_NAME_PRINT(&proc->proc_name), proc->proc_hostname,
                                            bml_btl->btl->btl_component->btl_version.mca_component_name,
                                            bml_btl->btl->btl_exclusivity,
                                            btl->btl_exclusivity);
                        continue;
                    }
                }
                opal_output_verbose(1, ompi_btl_base_framework.framework_output, 
                                    "mca: bml: Using %s btl to %s on node %s",
                                    btl->btl_component->btl_version.mca_component_name,
                                    OMPI_NAME_PRINT(&proc->proc_name),
                                    proc->proc_hostname);

                /* cache the endpoint on the proc */
                bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
                bml_btl->btl = btl;
                bml_btl->btl_endpoint = btl_endpoints[p];
                bml_btl->btl_weight = 0;
                bml_btl->btl_flags = btl->btl_flags; 
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
                                " the %s BTL without any PUT function attached. Discard the flag !",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT;
                }
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
                                " the %s BTL without any GET function attached. Discard the flag !",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET;
                }
                if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
                    /**
                     * If no protocol is specified, we have 2 choices: ignore the BTL
                     * (as we don't know which protocol to use), or assume that all
                     * BTLs support the send protocol.
                     */
                    bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND;
                }
                /**
                 * calculate the bitwise OR of the btl flags 
                 */
                bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
                /* This BTL is in use, allow the progress registration */
                btl_inuse++;
            }
        }
        if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
            size_t p;
            bool found = false;
            for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) {
                if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) {
                    found = true;
                    break;
                }
            }
            if(found == false) {
                mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = 
                    btl->btl_component->btl_progress;
                mca_bml_r2.num_btl_progress++;
                opal_progress_register( btl->btl_component->btl_progress );
            }
        }
    }
    free(btl_endpoints);

    /* iterate back through procs and compute metrics for registered r2s */
    for(p=0; p<n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];
        mca_bml_base_endpoint_t* bml_endpoint = 
            (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
        double total_bandwidth = 0;
        uint32_t latency = 0xffffffff;
        size_t n_index;
        size_t n_size;

        /* skip over procs with no BTLs registered */
        if(NULL == bml_endpoint) {
            continue;
        }

        /* (1) determine the total bandwidth available across all btls
         *     (note that we need to do this here, as we may already have btls configured)
         * (2) determine the highest priority ranking for latency
         * (3) compute the maximum number of bytes that can be sent without any
         *     weighting.  Once the leftover is smaller than this number we will
         *     start using the weight to compute the correct amount.
         */
        n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); 
        
        /* sort BTLs in descending order according to bandwidth value */
        qsort(bml_endpoint->btl_send.bml_btls, n_size,
                sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);

        bml_endpoint->btl_rdma_index = 0;
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl = 
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t* btl = bml_btl->btl;
            total_bandwidth += bml_btl->btl->btl_bandwidth;
            if(btl->btl_latency < latency) {
                latency = btl->btl_latency;
            }
        }
        
        /* (1) set the weight of each btl as a percentage of overall bandwidth
         * (2) copy all btl instances at the highest priority ranking into the
         *     list of btls used for first fragments
         */
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl = 
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t *btl = bml_btl->btl;

            /* compute weighting factor for this r2 */
            if(btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
            } else {
                bml_btl->btl_weight = (float)(1.0 / n_size);
            }

            /* check to see if this r2 is already in the array of r2s 
             * used for first fragments - if not add it.
             */
            if(btl->btl_latency == latency) {
                mca_bml_base_btl_t* bml_btl_new = 
                    mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager);
                *bml_btl_new = *bml_btl;
            }

            /* set endpoint max send size as min of available btls */
            if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
               bml_endpoint->btl_max_send_size = btl->btl_max_send_size;

            /* check flags - is RDMA preferred? */
            if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) &&
                !((proc->proc_arch != ompi_proc_local_proc->proc_arch) &&
                  (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
                mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
                mca_btl_base_module_t* btl_rdma = bml_btl->btl;

                *bml_btl_rdma = *bml_btl;
                if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) {
                    bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length;
                }
                if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) {
                    bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size;
                }
            }
        }
    }

    /* see if we have a connection to everyone else */
    for(p = 0; p < n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];

        if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
            ret = OMPI_ERR_UNREACH;
            if (mca_bml_r2.show_unreach_errors) {
                opal_show_help("help-mca-bml-r2.txt",
                               "unreachable proc",
                               true, 
                               OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)),
                               (NULL != ompi_proc_local_proc->proc_hostname ?
                                ompi_proc_local_proc->proc_hostname : "unknown!"),
                               OMPI_NAME_PRINT(&(proc->proc_name)),
                               (NULL != ompi_proc_local_proc->proc_hostname ?
                                ompi_proc_local_proc->proc_hostname : "unknown!"),
                               btl_names);
            }
            break;
        }
    }

    free(new_procs);

    return ret;
}
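/* [Editor's note] The qsort() call in the example above sorts each endpoint's
 * send-BTL array in descending bandwidth order via btl_bandwidth_compare(),
 * which is not shown in this excerpt.  The standalone sketch below shows what
 * such a descending comparator could plausibly look like; the struct and field
 * names are simplified stand-ins, not the actual Open MPI types. */
#include <stdio.h>
#include <stdlib.h>

struct btl_entry {
    const char *name;
    unsigned int bandwidth;     /* stands in for bml_btl->btl->btl_bandwidth */
};

static int bandwidth_compare(const void *a, const void *b)
{
    const struct btl_entry *lhs = (const struct btl_entry *)a;
    const struct btl_entry *rhs = (const struct btl_entry *)b;

    /* descending order: higher bandwidth sorts first */
    if (lhs->bandwidth > rhs->bandwidth) return -1;
    if (lhs->bandwidth < rhs->bandwidth) return  1;
    return 0;
}

int main(void)
{
    struct btl_entry btls[] = { {"tcp", 100}, {"openib", 10000}, {"sm", 9000} };
    size_t i, n = sizeof(btls) / sizeof(btls[0]);

    qsort(btls, n, sizeof(btls[0]), bandwidth_compare);
    for (i = 0; i < n; i++) {
        printf("%s (%u)\n", btls[i].name, btls[i].bandwidth);
    }
    return 0;
}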
Example #9
0
int ompi_mpi_register_params(void)
{
    int value;

    /* Whether we want MPI API function parameter checking or not. Disable this by default if
       parameter checking is compiled out. */

    ompi_mpi_param_check = !!(MPI_PARAM_CHECK);
    (void) mca_base_var_register("ompi", "mpi", NULL, "param_check",
                                 "Whether you want MPI API parameters checked at run-time or not.  Possible values are 0 (no checking) and 1 (perform checking at run-time)",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_mpi_param_check);
    if (ompi_mpi_param_check && !MPI_PARAM_CHECK) {
        opal_show_help("help-mpi-runtime.txt",
                       "mpi-param-check-enabled-but-compiled-out",
                       true);
        ompi_mpi_param_check = false;
    }

    /*
     * opal_progress: decide whether to yield and the event library
     * tick rate
     */
    /* JMS: Need ORTE data here -- set this to 0 when
       exactly/under-subscribed, or 1 when oversubscribed */
    ompi_mpi_yield_when_idle = true;
    (void) mca_base_var_register("ompi", "mpi", NULL, "yield_when_idle",
                                 "Yield the processor when waiting for MPI communication (for MPI processes, will default to 1 when oversubscribing nodes)",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_mpi_yield_when_idle);

    ompi_mpi_event_tick_rate = -1;
    (void) mca_base_var_register("ompi", "mpi", NULL, "event_tick_rate",
                                 "How often to progress TCP communications (0 = never, otherwise specified in microseconds)",
                                 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_mpi_event_tick_rate);

    /* Whether or not to show MPI handle leaks */
    ompi_debug_show_handle_leaks = false;
    (void) mca_base_var_register("ompi", "mpi", NULL, "show_handle_leaks",
                                 "Whether MPI_FINALIZE shows all MPI handles that were not freed or not",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_debug_show_handle_leaks);

    /* Whether or not to free MPI handles.  Useless without run-time
       param checking, so implicitly set that to true if we don't want
       to free the handles. */
    ompi_debug_no_free_handles = false;
    (void) mca_base_var_register("ompi", "mpi", NULL, "no_free_handles",
                                 "Whether to actually free MPI objects when their handles are freed",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_debug_no_free_handles);
    if (ompi_debug_no_free_handles) {
        ompi_mpi_param_check = true;
        if (!MPI_PARAM_CHECK) {
            opal_output(0, "WARNING: MCA parameter mpi_no_free_handles set to true, but MPI");
            opal_output(0, "WARNING: parameter checking has been compiled out of Open MPI.");
            opal_output(0, "WARNING: mpi_no_free_handles is therefore only partially effective!");
        }
    }

    /* Whether or not to show MPI_ALLOC_MEM leaks */
    ompi_debug_show_mpi_alloc_mem_leaks = 0;
    (void) mca_base_var_register("ompi", "mpi", NULL, "show_mpi_alloc_mem_leaks",
                                 "If >0, MPI_FINALIZE will show up to this many instances of memory allocated by MPI_ALLOC_MEM that was not freed by MPI_FREE_MEM",
                                 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_debug_show_mpi_alloc_mem_leaks);

    /* Whether or not to print all MCA parameters in MPI_INIT */
    ompi_mpi_show_mca_params_string = NULL;
    (void) mca_base_var_register("ompi", "mpi", NULL, "show_mca_params",
                                 "Whether to show all MCA parameter values during MPI_INIT or not (good for reproducability of MPI jobs "
                                 "for debug purposes). Accepted values are all, default, file, api, and enviro - or a comma "
                                 "delimited combination of them",
                                 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_mpi_show_mca_params_string);
    if (NULL != ompi_mpi_show_mca_params_string) {
        char **args;
        int i;

        ompi_mpi_show_mca_params = true;
        args = opal_argv_split(ompi_mpi_show_mca_params_string, ',');
        if (NULL == args) {
            opal_output(0, "WARNING: could not parse mpi_show_mca_params request - defaulting to show \"all\"");
            show_default_mca_params = true;
            show_file_mca_params = true;
            show_enviro_mca_params = true;
            show_override_mca_params = true;
        } else {
            for (i=0; NULL != args[i]; i++) {
                if (0 == strcasecmp(args[i], "all")  || 0 == strcmp(args[i], "1")) {
                    show_default_mca_params = true;
                    show_file_mca_params = true;
                    show_enviro_mca_params = true;
                    show_override_mca_params = true;
                } else if (0 == strcasecmp(args[i], "default")) {
                    show_default_mca_params = true;
                } else if (0 == strcasecmp(args[i], "file")) {
                    show_file_mca_params = true;
                } else if (0 == strncasecmp(args[i], "env", 3)) {
                    show_enviro_mca_params = true;
                } else if (0 == strcasecmp(args[i], "api")) {
                    show_override_mca_params = true;
                }
            }
            opal_argv_free(args);
        }
    }

    /* File to use when dumping the parameters */
    ompi_mpi_show_mca_params_file = "";
    (void) mca_base_var_register("ompi", "mpi", NULL, "show_mca_params_file",
                                 "If mpi_show_mca_params is true, setting this string to a valid filename tells Open MPI to dump all the MCA parameter values into a file suitable for reading via the mca_param_files parameter (good for reproducability of MPI jobs)",
                                 MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_mpi_show_mca_params_file);

    /* User-level process pinning controls */

    /* MPI_ABORT controls */
    ompi_mpi_abort_delay = 0;
    (void) mca_base_var_register("ompi", "mpi", NULL, "abort_delay",
                                "If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort).  This allows attaching of a debugger before quitting the job.",
                                 MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &ompi_mpi_abort_delay);

    ompi_mpi_abort_print_stack = false;
    (void) mca_base_var_register("ompi", "mpi", NULL, "abort_print_stack",
                                 "If nonzero, print out a stack trace when MPI_ABORT is invoked",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
                                /* If we do not have stack trace
                                   capability, make this a constant
                                   MCA variable */
#if OPAL_WANT_PRETTY_PRINT_STACKTRACE
                                 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
#else
                                 MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_CONSTANT,
#endif
                                 &ompi_mpi_abort_print_stack);

    ompi_mpi_preconnect_mpi = false;
    value = mca_base_var_register("ompi", "mpi", NULL, "preconnect_mpi",
                                  "Whether to force MPI processes to fully "
                                  "wire-up the MPI connections between MPI "
                                  "processes during "
                                  "MPI_INIT (vs. making connections lazily -- "
                                  "upon the first MPI traffic between each "
                                  "process peer pair)",
                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_READONLY,
                                  &ompi_mpi_preconnect_mpi);
    mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "preconnect_all",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    /* Sparse group storage support */
    (void) mca_base_var_register("ompi", "mpi", NULL, "have_sparse_group_storage",
                                 "Whether this Open MPI installation supports storing of data in MPI groups in \"sparse\" formats (good for extremely large process count MPI jobs that create many communicators/groups)",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
                                 MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_CONSTANT,
                                 &ompi_mpi_have_sparse_group_storage);

    ompi_use_sparse_group_storage = ompi_mpi_have_sparse_group_storage;
    (void) mca_base_var_register("ompi", "mpi", NULL, "use_sparse_group_storage",
                                 "Whether to use \"sparse\" storage formats for MPI groups (only relevant if mpi_have_sparse_group_storage is 1)",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
                                 ompi_mpi_have_sparse_group_storage ? 0 : MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                 OPAL_INFO_LVL_9,
                                 ompi_mpi_have_sparse_group_storage ? MCA_BASE_VAR_SCOPE_READONLY : MCA_BASE_VAR_SCOPE_CONSTANT,
                                 &ompi_use_sparse_group_storage);
    if (ompi_use_sparse_group_storage && !ompi_mpi_have_sparse_group_storage) {
        opal_show_help("help-mpi-runtime.txt",
                       "sparse groups enabled but compiled out",
                       true);
        ompi_use_sparse_group_storage = false;
    }

    value = mca_base_var_find ("opal", "opal", NULL, "cuda_support");
    if (0 <= value) {
        mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "cuda_support",
                                      MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
    }

    value = mca_base_var_find ("opal", "opal", NULL, "built_with_cuda_support");
    if (0 <= value) {
        mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "built_with_cuda_support", 0);
    }

    if (opal_cuda_support && !opal_built_with_cuda_support) {
        opal_show_help("help-mpi-runtime.txt", "no cuda support",
                       true);
        ompi_rte_abort(1, NULL);
    }

    ompi_add_procs_cutoff = 1024;
    (void) mca_base_var_register ("ompi", "mpi", NULL, "add_procs_cutoff",
                                  "Maximum world size for pre-allocating resources for all "
                                  "remote processes. Increasing this limit may improve "
                                  "communication performance at the cost of memory usage "
                                  "(default: 1024)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL,
                                  0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
                                  &ompi_add_procs_cutoff);


    return OMPI_SUCCESS;
}
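/* [Editor's note] The mpi_show_mca_params handling above splits a
 * comma-delimited request string and matches each token case-insensitively.
 * Below is a simplified, standalone rendering of that parsing loop, using
 * strtok_r() in place of opal_argv_split(); the boolean names mirror the flags
 * set in the original, but this is a sketch, not the Open MPI code. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

int main(void)
{
    bool show_default = false, show_file = false;
    bool show_enviro  = false, show_override = false;
    char request[] = "default,env,api";   /* e.g. a value of mpi_show_mca_params */
    char *save = NULL;
    char *tok;

    for (tok = strtok_r(request, ",", &save); NULL != tok;
         tok = strtok_r(NULL, ",", &save)) {
        if (0 == strcasecmp(tok, "all") || 0 == strcmp(tok, "1")) {
            show_default = show_file = show_enviro = show_override = true;
        } else if (0 == strcasecmp(tok, "default")) {
            show_default = true;
        } else if (0 == strcasecmp(tok, "file")) {
            show_file = true;
        } else if (0 == strncasecmp(tok, "env", 3)) {
            show_enviro = true;
        } else if (0 == strcasecmp(tok, "api")) {
            show_override = true;
        }
    }

    printf("default=%d file=%d enviro=%d override=%d\n",
           show_default, show_file, show_enviro, show_override);
    return 0;
}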
Example #10
0
int opal_init_test(void)
{
    int ret;
    char *error;

    /* initialize the memory allocator */
    opal_malloc_init();

    /* initialize the output system */
    opal_output_init();

    /* initialize install dirs code */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_installdirs_base_framework, 0))) {
        fprintf(stderr, "opal_installdirs_base_open() failed -- process will likely abort (%s:%d, returned %d instead of OPAL_SUCCESS)\n",
                __FILE__, __LINE__, ret);
        return ret;
    }
    
    /* initialize the help system */
    opal_show_help_init();

    /* register handler for errnum -> string conversion */
    if (OPAL_SUCCESS != 
        (ret = opal_error_register("OPAL",
                                   OPAL_ERR_BASE, OPAL_ERR_MAX, opal_err2str))) {
        error = "opal_error_register";
        goto return_error;
    }

    /* keyval lex-based parser */
    if (OPAL_SUCCESS != (ret = opal_util_keyval_parse_init())) {
        error = "opal_util_keyval_parse_init";
        goto return_error;
    }

    if (OPAL_SUCCESS != (ret = opal_net_init())) {
        error = "opal_net_init";
        goto return_error;
    }

    /* Setup the parameter system */
    if (OPAL_SUCCESS != (ret = mca_base_var_init())) {
        error = "mca_base_var_init";
        goto return_error;
    }

    /* register params for opal */
    if (OPAL_SUCCESS != (ret = opal_register_params())) {
        error = "opal_register_params";
        goto return_error;
    }

    /* pretty-print stack handlers */
    if (OPAL_SUCCESS != (ret = opal_util_register_stackhandlers())) {
        error = "opal_util_register_stackhandlers";
        goto return_error;
    }

    /* Initialize the data storage service. */
    if (OPAL_SUCCESS != (ret = opal_dss_open())) {
        error = "opal_dss_open";
        goto return_error;
    }

    /* initialize the mca */
    if (OPAL_SUCCESS != (ret = mca_base_open())) {
        error = "mca_base_open";
        goto return_error;
    }

    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_event_base_framework, 0))) {
        error = "opal_event_base_open";
        goto return_error;
    }

    return OPAL_SUCCESS;

 return_error:
    opal_show_help( "help-opal-runtime.txt",
                    "opal_init:startup:internal-failure", true,
                    error, ret );
    return ret;
}
Example #11
0
static int setup_qps(void)
{
    int ret = OMPI_SUCCESS, qp = 0;
    int rd_num = 0, rd_low = 0, size = 0,
        rd_win = 0, rd_rsv = 0, sd_max = 0;

    mca_bcol_iboffload_qp_type_t type = 0;

    char **queues = NULL, **params = NULL;

    queues = opal_argv_split(mca_bcol_iboffload_component.receive_queues, ':');
    if (0 == opal_argv_count(queues)) {
        opal_show_help("help-mpi-btl-openib.txt",
                       "no qps in receive_queues", true,
                       ompi_process_info.nodename,
                       mca_bcol_iboffload_component.receive_queues);

        ret = OMPI_ERROR;

        goto exit;
    }

    while (queues[qp] != NULL) {
        if (0 == strncmp("P,", queues[qp], 2)) {
            type = MCA_BCOL_IBOFFLOAD_PP_QP;
        } else if (0 == strncmp("S,", queues[qp], 2)) {
            type = MCA_BCOL_IBOFFLOAD_SRQ_QP;
        } else if (0 == strncmp("X,", queues[qp], 2)) {
#if HAVE_XRC
            type = MCA_BCOL_IBOFFLOAD_XRC_QP;
#else
            opal_show_help("help-mpi-btl-openib.txt", "No XRC support", true,
                           ompi_process_info.nodename,
                           mca_bcol_iboffload_component.receive_queues);
            ret = OMPI_ERR_NOT_AVAILABLE;
            goto exit;
#endif
        } else {
            opal_show_help("help-mpi-btl-openib.txt",
                           "invalid qp type in receive_queues", true,
                           ompi_process_info.nodename,
                           mca_bcol_iboffload_component.receive_queues,
                           queues[qp]);

            ret = OMPI_ERR_BAD_PARAM;

            goto exit;
        }

        ++qp;
    }

    mca_bcol_iboffload_component.num_qps = MCA_BCOL_IBOFFLOAD_QP_LAST;

    qp = 0;
#define P(N) (((N) > count) ? NULL : params[(N)])
    while (NULL != queues[qp]) {
        int count;

        params = opal_argv_split_with_empty(queues[qp], ',');
        count = opal_argv_count(params);

        if ('P' == params[0][0]) {
            if (count < 3 || count > 6) {
                opal_show_help("help-mpi-btl-openib.txt",
                               "invalid pp qp specification", true,
                               ompi_process_info.nodename, queues[qp]);

                ret = OMPI_ERR_BAD_PARAM;

                goto exit;
            }

            size = atoi_param(P(1), 0);

            rd_num = atoi_param(P(2), 256);

            /* by default set rd_low to be 3/4 of rd_num */
            rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
            rd_win = atoi_param(P(4), (rd_num - rd_low) * 2);
            rd_rsv = atoi_param(P(5), (rd_num * 2) / rd_win);


            if ((rd_num - rd_low) > rd_win) {
                opal_show_help("help-mpi-btl-openib.txt", "non optimal rd_win",
                        true, rd_win, rd_num - rd_low);
            }
        } else {
            if (count < 3 || count > 5) {
                opal_show_help("help-mpi-btl-openib.txt",
                               "invalid srq specification", true,
                               ompi_process_info.nodename, queues[qp]);

                ret = OMPI_ERR_BAD_PARAM;

                goto exit;
            }

            size = atoi_param(P(1), 0);
            rd_num = atoi_param(P(2), 256);

            /* by default set rd_low to be 3/4 of rd_num */
            rd_low = atoi_param(P(3), rd_num - (rd_num / 4));
            sd_max = atoi_param(P(4), rd_low / 4);

            IBOFFLOAD_VERBOSE(10, ("srq: rd_num is %d rd_low is %d sd_max is %d",
                         rd_num, rd_low, sd_max));

        }

        if (rd_num <= rd_low) {
            opal_show_help("help-mpi-btl-openib.txt", "rd_num must be > rd_low",
                    true, ompi_process_info.nodename, queues[qp]);
            ret = OMPI_ERR_BAD_PARAM;

            goto exit;
        }

        opal_argv_free(params);

        ++qp;
    }

    params = NULL;

    for (qp = 0; qp < MCA_BCOL_IBOFFLOAD_QP_LAST; ++qp) {
        mca_bcol_iboffload_component.qp_infos[qp].qp_index = qp;

        mca_bcol_iboffload_component.qp_infos[qp].type = type;
        mca_bcol_iboffload_component.qp_infos[qp].size = size;

        mca_bcol_iboffload_component.qp_infos[qp].rd_num = rd_num;
        mca_bcol_iboffload_component.qp_infos[qp].rd_low = rd_low;

        mca_bcol_iboffload_component.qp_infos[qp].rd_pp_win = rd_num - rd_low;

        if (MCA_BCOL_IBOFFLOAD_PP_QP == type) {
            mca_bcol_iboffload_component.qp_infos[qp].u.pp_qp.rd_win = rd_win;
            mca_bcol_iboffload_component.qp_infos[qp].u.pp_qp.rd_rsv = rd_rsv;
        } else {
            mca_bcol_iboffload_component.qp_infos[qp].u.srq_qp.sd_max = sd_max;
        }

        if (NULL != setup_qps_fn[qp]) {
            setup_qps_fn[qp](&mca_bcol_iboffload_component.qp_infos[qp]);
        }
    }

exit:
    if (NULL != params) {
        opal_argv_free(params);
    }

    if (NULL != queues) {
        opal_argv_free(queues);
    }

    return ret;
}
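/* [Editor's note] setup_qps() above reads positional, comma-separated fields
 * through the P(N) macro and atoi_param(), falling back to a default when a
 * field is absent or empty.  atoi_param() itself is not shown in this excerpt;
 * a plausible standalone version of such a helper is sketched below (the real
 * one in the openib/iboffload code may differ). */
#include <stdio.h>
#include <stdlib.h>

static int atoi_param_sketch(const char *param, int dflt)
{
    /* missing or empty field: fall back to the supplied default */
    if (NULL == param || '\0' == param[0]) {
        return dflt;
    }
    return atoi(param);
}

int main(void)
{
    /* e.g. "P,128,256" specifies size and rd_num but leaves the rest empty */
    int size   = atoi_param_sketch("128", 0);
    int rd_num = atoi_param_sketch("256", 256);
    int rd_low = atoi_param_sketch(NULL, rd_num - (rd_num / 4));

    printf("size=%d rd_num=%d rd_low=%d\n", size, rd_num, rd_low);
    return 0;
}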
Example #12
0
int
opal_init(int* pargc, char*** pargv)
{
    int ret;
    char *error = NULL;

    if( ++opal_initialized != 1 ) {
        if( opal_initialized < 1 ) {
            return OPAL_ERROR;
        }
        return OPAL_SUCCESS;
    }

    /* initialize util code */
    if (OPAL_SUCCESS != (ret = opal_init_util(pargc, pargv))) {
        return ret;
    }

    /* initialize the mca */
    if (OPAL_SUCCESS != (ret = mca_base_open())) {
        error = "mca_base_open";
        goto return_error;
    }

    /* open hwloc - since this is a static framework, no
     * select is required
     */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_hwloc_base_framework, 0))) {
        error = "opal_hwloc_base_open";
        goto return_error;
    }

    /* the memcpy component should be one of the first to be loaded,
     * in order to make sure we have all the available versions of
     * memcpy correctly configured.
     */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_memcpy_base_framework, 0))) {
        error = "opal_memcpy_base_open";
        goto return_error;
    }

    /* open the memory manager components.  Memory hooks may be
       triggered before this (any time after mem_free_init(),
       actually).  This is a hook available for memory manager
       components that lack good initialization routine support */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_memory_base_framework, 0))) {
        error = "opal_memory_base_open";
        goto return_error;
    }

    /* initialize the memory manager / tracker */
    if (OPAL_SUCCESS != (ret = opal_mem_hooks_init())) {
        error = "opal_mem_hooks_init";
        goto return_error;
    }

    /* initialize the memory checker, to allow early support for annotation */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_memchecker_base_framework, 0))) {
        error = "opal_memchecker_base_open";
        goto return_error;
    }

    /* select the memory checker */
    if (OPAL_SUCCESS != (ret = opal_memchecker_base_select())) {
        error = "opal_memchecker_base_select";
        goto return_error;
    }

    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_backtrace_base_framework, 0))) {
        error = "opal_backtrace_base_open";
        goto return_error;
    }

    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_timer_base_framework, 0))) {
        error = "opal_timer_base_open";
        goto return_error;
    }

    /*
     * Need to start the event and progress engines if nothing else
     * has.  opal_cr_init uses the progress engine, so it is lumped
     * together into this set as well.
     */
    /*
     * Initialize the event library
     */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_event_base_framework, 0))) {
        error = "opal_event_base_open";
        goto return_error;
    }
            
    /*
     * Initialize the general progress engine
     */
    if (OPAL_SUCCESS != (ret = opal_progress_init())) {
        error = "opal_progress_init";
        goto return_error;
    }
    /* we want to tick the event library whenever possible */
    opal_progress_event_users_increment();

    /* setup the shmem framework */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_shmem_base_framework, 0))) {
        error = "opal_shmem_base_open";
        goto return_error;
    }

    if (OPAL_SUCCESS != (ret = opal_shmem_base_select())) {
        error = "opal_shmem_base_select";
        goto return_error;
    }

#if OPAL_ENABLE_FT_CR    == 1
    /*
     * Initialize the compression framework
     * Note: Currently only used in C/R so it has been marked to only
     *       initialize when C/R is enabled. If other places in the code
     *       wish to use this framework, it is safe to remove the protection.
     */
    if( OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_compress_base_framework, 0)) ) {
        error = "opal_compress_base_open";
        goto return_error;
    }
    if( OPAL_SUCCESS != (ret = opal_compress_base_select()) ) {
        error = "opal_compress_base_select";
        goto return_error;
    }
#endif

    /*
     * Initialize the checkpoint/restart functionality.
     * Note: Always do this so we can detect if the user
     * attempts to checkpoint a non-checkpointable job,
     * otherwise the tools may hang or not clean up properly.
     */
    if (OPAL_SUCCESS != (ret = opal_cr_init() ) ) {
        error = "opal_cr_init";
        goto return_error;
    }
    
    /* initialize the security framework */
    if( OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_sec_base_framework, 0)) ) {
        error = "opal_sec_base_open";
        goto return_error;
    }
    if( OPAL_SUCCESS != (ret = opal_sec_base_select()) ) {
        error = "opal_sec_base_select";
        goto return_error;
    }

    return OPAL_SUCCESS;

 return_error:
    opal_show_help( "help-opal-runtime.txt",
                    "opal_init:startup:internal-failure", true,
                    error, ret );
    return ret;
}
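/* [Editor's note] opal_init() above (and opal_init_util() in the following
 * examples) guards itself with a reference count so that only the first caller
 * performs initialization.  A minimal sketch of that guard follows; it is not
 * thread-safe as written and the names are placeholders, not OPAL symbols. */
#include <stdio.h>

static int my_initialized = 0;

static int my_init(void)
{
    if (++my_initialized != 1) {
        if (my_initialized < 1) {
            return -1;          /* finalize was called more often than init */
        }
        return 0;               /* already initialized: nothing more to do */
    }
    /* ... first-time initialization would go here ... */
    return 0;
}

int main(void)
{
    int first = my_init();
    int second = my_init();     /* second call is a no-op */

    printf("%d %d\n", first, second);
    return 0;
}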
Example #13
0
int
opal_init_util(int* pargc, char*** pargv)
{
    int ret;
    char *error = NULL;

    if( ++opal_util_initialized != 1 ) {
        if( opal_util_initialized < 1 ) {
            return OPAL_ERROR;
        }
        return OPAL_SUCCESS;
    }

    /* initialize the memory allocator */
    opal_malloc_init();

    /* initialize the output system */
    opal_output_init();

    /* initialize install dirs code */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_installdirs_base_framework, 0))) {
        fprintf(stderr, "opal_installdirs_base_open() failed -- process will likely abort (%s:%d, returned %d instead of OPAL_SUCCESS)\n",
                __FILE__, __LINE__, ret);
        return ret;
    }
    
    /* initialize the help system */
    opal_show_help_init();

    /* register handler for errnum -> string conversion */
    if (OPAL_SUCCESS != 
        (ret = opal_error_register("OPAL",
                                   OPAL_ERR_BASE, OPAL_ERR_MAX, opal_err2str))) {
        error = "opal_error_register";
        goto return_error;
    }

    /* keyval lex-based parser */
    if (OPAL_SUCCESS != (ret = opal_util_keyval_parse_init())) {
        error = "opal_util_keyval_parse_init";
        goto return_error;
    }

    if (OPAL_SUCCESS != (ret = opal_net_init())) {
        error = "opal_net_init";
        goto return_error;
    }

    /* Setup the parameter system */
    if (OPAL_SUCCESS != (ret = mca_base_var_init())) {
        error = "mca_base_var_init";
        goto return_error;
    }

    /* register params for opal */
    if (OPAL_SUCCESS != (ret = opal_register_params())) {
        error = "opal_register_params";
        goto return_error;
    }

    /* pretty-print stack handlers */
    if (OPAL_SUCCESS != (ret = opal_util_register_stackhandlers())) {
        error = "opal_util_register_stackhandlers";
        goto return_error;
    }

    /* set system resource limits - internally protected against
     * doing so twice in cases where the launch agent did it for us
     */
    if (OPAL_SUCCESS != (ret = opal_util_init_sys_limits(&error))) {
        opal_show_help("help-opal-runtime.txt",
                        "opal_init:syslimit", false,
                        error);
        return OPAL_ERR_SILENT;
    }

    /* initialize the arch string */
    if (OPAL_SUCCESS != (ret = opal_arch_init ())) {
        error = "opal_arch_init";
        goto return_error;
    }

    /* initialize the datatype engine */
    if (OPAL_SUCCESS != (ret = opal_datatype_init ())) {
        error = "opal_datatype_init";
        goto return_error;
    }

    /* Initialize the data storage service. */
    if (OPAL_SUCCESS != (ret = opal_dss_open())) {
        error = "opal_dss_open";
        goto return_error;
    }

    return OPAL_SUCCESS;

 return_error:
    if (OPAL_ERR_SILENT != ret) {
        opal_show_help( "help-opal-runtime.txt",
                        "opal_init:startup:internal-failure", true,
                        error, ret );
    }
    return ret;
}
Example #14
0
int
opal_init_util(int* pargc, char*** pargv)
{
    int ret;
    char *error = NULL;
    char hostname[OPAL_MAXHOSTNAMELEN];
    OPAL_TIMING_ENV_INIT(otmng);

    if( ++opal_util_initialized != 1 ) {
        if( opal_util_initialized < 1 ) {
            return OPAL_ERROR;
        }
        return OPAL_SUCCESS;
    }


    OBJ_CONSTRUCT(&opal_init_util_domain, opal_finalize_domain_t);
    (void) opal_finalize_domain_init (&opal_init_util_domain, "opal_init_util");
    opal_finalize_set_domain (&opal_init_util_domain);

    opal_thread_set_main();

    opal_init_called = true;

    /* register a cleanup callback to close the frameworks opened during init */
    opal_finalize_register_cleanup_arg (mca_base_framework_close_list, opal_init_util_frameworks);

    /* set the nodename right away so anyone who needs it has it. Note
     * that we don't bother with fqdn and prefix issues here - we let
     * the RTE later replace this with a modified name if the user
     * requests it */
    gethostname(hostname, sizeof(hostname));
    opal_process_info.nodename = strdup(hostname);

    /* initialize the memory allocator */
    opal_malloc_init();

    OPAL_TIMING_ENV_NEXT(otmng, "opal_malloc_init");

    /* initialize the output system */
    opal_output_init();

    /* initialize install dirs code */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_installdirs_base_framework, 0))) {
        fprintf(stderr, "opal_installdirs_base_open() failed -- process will likely abort (%s:%d, returned %d instead of OPAL_SUCCESS)\n",
                __FILE__, __LINE__, ret);
        return ret;
    }

    /* initialize the help system */
    opal_show_help_init();

    OPAL_TIMING_ENV_NEXT(otmng, "opal_show_help_init");

    /* register handler for errnum -> string conversion */
    if (OPAL_SUCCESS !=
        (ret = opal_error_register("OPAL",
                                   OPAL_ERR_BASE, OPAL_ERR_MAX, opal_err2str))) {
        return opal_init_error ("opal_error_register", ret);
    }

    /* keyval lex-based parser */
    if (OPAL_SUCCESS != (ret = opal_util_keyval_parse_init())) {
        return opal_init_error ("opal_util_keyval_parse_init", ret);
    }

    // Disable PSM signal hijacking (see comment in function for more
    // details)
    opal_init_psm();

    OPAL_TIMING_ENV_NEXT(otmng, "opal_init_psm");

    /* Setup the parameter system */
    if (OPAL_SUCCESS != (ret = mca_base_var_init())) {
        return opal_init_error ("mca_base_var_init", ret);
    }
    OPAL_TIMING_ENV_NEXT(otmng, "opal_var_init");

    /* read any param files that were provided */
    if (OPAL_SUCCESS != (ret = mca_base_var_cache_files(false))) {
        return opal_init_error ("failed to cache files", ret);
    }

    OPAL_TIMING_ENV_NEXT(otmng, "opal_var_cache");


    /* register params for opal */
    if (OPAL_SUCCESS != (ret = opal_register_params())) {
        return opal_init_error ("opal_register_params", ret);
    }

    if (OPAL_SUCCESS != (ret = opal_net_init())) {
        return opal_init_error ("opal_net_init", ret);
    }

    OPAL_TIMING_ENV_NEXT(otmng, "opal_net_init");

    /* pretty-print stack handlers */
    if (OPAL_SUCCESS != (ret = opal_util_register_stackhandlers())) {
        return opal_init_error ("opal_util_register_stackhandlers", ret);
    }

    /* set system resource limits - internally protected against
     * doing so twice in cases where the launch agent did it for us
     */
    if (OPAL_SUCCESS != (ret = opal_util_init_sys_limits(&error))) {
        opal_show_help("help-opal-runtime.txt",
                        "opal_init:syslimit", false,
                        error);
        return OPAL_ERR_SILENT;
    }

    /* initialize the arch string */
    if (OPAL_SUCCESS != (ret = opal_arch_init ())) {
        return opal_init_error ("opal_arch_init", ret);
    }

    OPAL_TIMING_ENV_NEXT(otmng, "opal_arch_init");

    /* initialize the datatype engine */
    if (OPAL_SUCCESS != (ret = opal_datatype_init ())) {
        return opal_init_error ("opal_datatype_init", ret);
    }

    OPAL_TIMING_ENV_NEXT(otmng, "opal_datatype_init");

    /* Initialize the data storage service. */
    if (OPAL_SUCCESS != (ret = opal_dss_open())) {
        return opal_init_error ("opal_dss_open", ret);
    }

    OPAL_TIMING_ENV_NEXT(otmng, "opal_dss_open");

    /* initialize the mca */
    if (OPAL_SUCCESS != (ret = mca_base_open())) {
        return opal_init_error ("mca_base_open", ret);
    }

    OPAL_TIMING_ENV_NEXT(otmng, "mca_base_open");

    /* initialize the if (network interface) framework */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_if_base_framework, 0))) {
        fprintf(stderr, "opal_if_base_open() failed -- process will likely abort (%s:%d, returned %d instead of OPAL_SUCCESS)\n",
                __FILE__, __LINE__, ret);
        return ret;
    }

    OPAL_TIMING_ENV_NEXT(otmng, "opal_if_init");

    return OPAL_SUCCESS;
}
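/*
 * A minimal usage sketch for opal_init_util() as shown above. This is an
 * illustration only: it assumes the caller links against the OPAL library,
 * that "opal/runtime/opal.h" declares both opal_init_util() and its
 * opal_finalize_util() counterpart, and that no higher layer (ORTE/OMPI)
 * is being initialized by this process.
 */
#include "opal/runtime/opal.h"
#include "opal/constants.h"

int main(int argc, char **argv)
{
    /* bring up the OPAL utility layer (output, show_help, MCA vars, ...) */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        return 1;
    }

    /* ... use OPAL utility services here ... */

    /* tear the utility layer back down */
    opal_finalize_util();
    return 0;
}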
Example #15
0
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    char *cmd=NULL, *av=NULL;
    ompi_errhandler_errtrk_t errtrk;
    OPAL_TIMING_DECLARE(tm);
    OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);

    /* bitflag of the thread level support provided. To be used
     * for the modex in order to work in heterogeneous environments. */
    uint8_t threadlevel_bf;

    /* Ensure that we were not already initialized or finalized.

       This lock is held for the duration of ompi_mpi_init() and
       ompi_mpi_finalize().  Hence, if we get it, then no other thread
       is inside the critical section (and we don't have to check the
       *_started bool variables). */
    opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
    if (ompi_mpi_finalized) {
        opal_show_help("help-mpi-runtime.txt",
                       "mpi_init: already finalized", true);
        opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
        return MPI_ERR_OTHER;
    } else if (ompi_mpi_initialized) {
        opal_show_help("help-mpi-runtime.txt",
                       "mpi_init: invoked multiple times", true);
        opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
        return MPI_ERR_OTHER;
    }

    /* Indicate that we have *started* MPI_INIT* */
    ompi_mpi_init_started = true;

    /* Setup enough to check get/set MCA params */

    if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    /* Convince OPAL to use our naming scheme */
    opal_process_name_print = _process_name_print_for_opal;
    opal_compare_proc = _process_name_compare;
    opal_convert_string_to_process_name = _convert_string_to_process_name;
    opal_convert_process_name_to_string = _convert_process_name_to_string;
    opal_proc_for_name = ompi_proc_for_name;

    /* Register MCA variables */
    if (OPAL_SUCCESS != (ret = ompi_register_mca_variables())) {
        error = "ompi_mpi_init: ompi_register_mca_variables failed";
        goto error;
    }

    if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) {
        error = "ompi_mpi_init: opal_arch_set_fortran_logical_size failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends).  Per opal/event/event.s, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */

    ret = mca_base_var_find("opal", "event", "*", "event_include");
    if (ret >= 0) {
        char *allvalue = "all";
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintended
           side-effects with launching RTE tools... */
        mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
    }

    OPAL_TIMING_MSTART((&tm,"time from start to completion of rte_init"));

    /* if we were not externally started, then we need to setup
     * some envars so the MPI_INFO_ENV can get the cmd name
     * and argv (but only if the user supplied a non-NULL argv!), and
     * the requested thread level
     */
    if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) {
        asprintf(&cmd, "OMPI_COMMAND=%s", argv[0]);
        putenv(cmd);
    }
    if (NULL == getenv("OMPI_ARGV") && 1 < argc) {
        char *tmp;
        tmp = opal_argv_join(&argv[1], ' ');
        asprintf(&av, "OMPI_ARGV=%s", tmp);
        free(tmp);
        putenv(av);
    }

    /* open the rte framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_rte_base_framework, 0))) {
        error = "ompi_rte_base_open() failed";
        goto error;
    }
    /* no select is required as this is a static framework */

    /* Setup RTE */
    if (OMPI_SUCCESS != (ret = ompi_rte_init(NULL, NULL))) {
        error = "ompi_mpi_init: ompi_rte_init failed";
        goto error;
    }
    ompi_rte_initialized = true;

    /* check for timing request - get stop time and report elapsed time if so */
    OPAL_TIMING_MNEXT((&tm,"time from completion of rte_init to modex"));

    /* if hwloc is available but didn't get setup for some
     * reason, do so now
     */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "Topology init";
            goto error;
        }
    }

    /* Register the default errhandler callback  */
    errtrk.status = OPAL_ERROR;
    errtrk.active = true;
    opal_pmix.register_errhandler(NULL, ompi_errhandler_callback,
                                  ompi_errhandler_registration_callback,
                                  (void*)&errtrk);
    OMPI_WAIT_FOR_COMPLETION(errtrk.active);
    if (OPAL_SUCCESS != errtrk.status) {
        error = "Error handler registration";
        ret = errtrk.status;
        goto error;
    }

    /* Figure out the final MPI thread levels.  If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE.  Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */

    ompi_mpi_thread_level(requested, provided);

    /* determine the bitflag belonging to the threadlevel_support provided */
    memset ( &threadlevel_bf, 0, sizeof(uint8_t));
    OMPI_THREADLEVEL_SET_BITFLAG ( ompi_mpi_thread_provided, threadlevel_bf );

#if OMPI_ENABLE_THREAD_MULTIPLE
    /* add this bitflag to the modex */
    OPAL_MODEX_SEND_STRING(ret, OPAL_PMIX_GLOBAL,
                           "MPI_THREAD_LEVEL", &threadlevel_bf, sizeof(uint8_t));
    if (OPAL_SUCCESS != ret) {
        error = "ompi_mpi_init: modex send thread level";
        goto error;
    }
#endif

    /* If thread support was enabled, then setup OPAL to allow for
       them. */
    if ((OPAL_ENABLE_PROGRESS_THREADS == 1) ||
            (*provided != MPI_THREAD_SINGLE)) {
        opal_set_using_threads(true);
    }

    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_datatype_init())) {
        error = "ompi_datatype_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "mca_proc_init() failed";
        goto error;
    }

    /* Initialize the op framework. This has to be done *after*
       ddt_init, but before mca_coll_base_open, since some collective
       modules (e.g., the hierarchical coll component) may need ops in
       their query function. */
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_op_base_framework, 0))) {
        error = "ompi_op_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS !=
            (ret = ompi_op_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                    ompi_mpi_thread_multiple))) {
        error = "ompi_op_base_find_available() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }

    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_allocator_base_framework, 0))) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_rcache_base_framework, 0))) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_mpool_base_framework, 0))) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_bml_base_framework, 0))) {
        error = "mca_bml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_pml_base_framework, 0))) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_coll_base_framework, 0))) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_osc_base_framework, 0))) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_crcp_base_framework, 0))) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS !=
            (ret = mca_mpool_base_init(OPAL_ENABLE_PROGRESS_THREADS,
                                       ompi_mpi_thread_multiple))) {
        error = "mca_mpool_base_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
            (ret = mca_pml_base_select(OPAL_ENABLE_PROGRESS_THREADS,
                                       ompi_mpi_thread_multiple))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    OPAL_TIMING_MNEXT((&tm,"time to execute modex"));

    /* exchange connection info - this function may also act as a barrier
     * if data exchange is required. The modex occurs solely across procs
     * in our job. If a barrier is required, the "modex" function will
     * perform it internally */
    OPAL_MODEX();

    OPAL_TIMING_MNEXT((&tm,"time from modex to first barrier"));

    /* select buffered send allocator component to be used */
    if( OMPI_SUCCESS !=
            (ret = mca_pml_base_bsend_init(ompi_mpi_thread_multiple))) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
            (ret = mca_coll_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                    ompi_mpi_thread_multiple))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
            (ret = ompi_osc_base_find_available(OPAL_ENABLE_PROGRESS_THREADS,
                    ompi_mpi_thread_multiple))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_init())) {
        error = "ompi_message_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_mpiinfo_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }

    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }

    /* initialize groups  */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_complete_init())) {
        error = "ompi_proc_complete_init failed";
        goto error;
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if( OMPI_SUCCESS != ret ) {
        error = "PML control failed";
        goto error;
    }

    /* some btls/mtls require we call add_procs with all procs in the job.
     * since the btls/mtls have no visibility here it is up to the pml to
     * convey this requirement */
    if (mca_pml_base_requires_world ()) {
        if (NULL == (procs = ompi_proc_world (&nprocs))) {
            error = "ompi_proc_world () failed";
            goto error;
        }
    } else {
        /* add all allocated ompi_proc_t's to PML (below the add_procs limit this
         * behaves identically to ompi_proc_world ()) */
        if (NULL == (procs = ompi_proc_get_allocated (&nprocs))) {
            error = "ompi_proc_get_allocated () failed";
            goto error;
        }
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    /* If we got "unreachable", then print a specific error message.
       Otherwise, if we got some other failure, fall through to print
       a generic message. */
    if (OMPI_ERR_UNREACH == ret) {
        opal_show_help("help-mpi-runtime.txt",
                       "mpi_init:startup:pml-add-procs-fail", true);
        error = NULL;
        goto error;
    } else if (OMPI_SUCCESS != ret) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));

    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank,
                                 nprocs,
                                 ompi_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_rte_wait_for_debugger();

    /* Next timing measurement */
    OPAL_TIMING_MNEXT((&tm,"time to execute barrier"));

    /* wait for everyone to reach this point - this is a hard
     * barrier requirement at this time, though we hope to relax
     * it at a later point */
    opal_pmix.fence(NULL, 0);

    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    OPAL_TIMING_MNEXT((&tm,"time from barrier to complete mpi_init"));

#if OPAL_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations.  Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code.  This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on RTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif

    /* wire up the mpi interface, if requested.  Do this after the
       non-block switch for non-TCP performance.  Do before the
       polling change as anyone with a complex wire-up is going to be
       using the oob. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_mpi_do_preconnect_all() failed";
        goto error;
    }

    /* Setup the dynamic process management (DPM) subsystem */
    if (OMPI_SUCCESS != (ret = ompi_dpm_init())) {
        error = "ompi_dpm_init() failed";
        goto error;
    }

    /* Determine the overall threadlevel support of all processes
       in MPI_COMM_WORLD. This has to be done before calling
       coll_base_comm_select, since some of the collective components
       e.g. hierarch, might create subcommunicators. The threadlevel
       requested by all processes is required in order to know
       which cid allocation algorithm can be used. */
    if ( OMPI_SUCCESS !=
            ( ret = ompi_comm_cid_init ())) {
        error = "ompi_mpi_init: ompi_comm_cid_init failed";
        goto error;
    }

    /* Init coll for the comms. This has to be done after dpm_base_select
       (since dpm.mark_dyncomm is not otherwise set in the communicator
       creation function), but before dpm.dyncom_init, since that function
       might require collectives for the CID allocation. */
    if (OMPI_SUCCESS !=
            (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS !=
            (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }

    /* Check whether we have been spawned or not.  We introduce that
       at the very end, since we need collectives, datatypes, ptls
       etc. up and running here.... */
    if (OMPI_SUCCESS != (ret = ompi_dpm_dyn_init())) {
        error = "ompi_dpm_dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during
       opal_init, to get better latency when not using TCP.  Do
       this *after* dyn_init, as dyn init uses lots of RTE
       communication and we don't want to hinder the performance of
       that code. */
    opal_progress_event_users_decrement();

    /* see if yield_when_idle was specified - if so, use it */
    opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle);

    /* negative value means use default - just don't do anything */
    if (ompi_mpi_event_tick_rate >= 0) {
        opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate);
    }

    /* At this point, we are fully configured and in MPI mode.  Any
       communication calls here will work exactly like they would in
       the user's code.  Setup the connections between procs and warm
       them up with simple sends, if requested */

    if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) {
        error = "ompi_mpiext_init";
        goto error;
    }

    /* Fall through */
error:
    if (ret != OMPI_SUCCESS) {
        /* Only print a message if one was not already printed */
        if (NULL != error) {
            const char *err_msg = opal_strerror(ret);
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_init:startup:internal-failure", true,
                           "MPI_INIT", "MPI_INIT", error, err_msg, ret);
        }
        opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the arrays used to store the F90 types returned by the
     *  MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);

    /* All done.  Wasn't that simple? */

    ompi_mpi_initialized = true;

    /* Finish last measurement, output results
     * and clear timing structure */
    OPAL_TIMING_MSTOP(&tm);
    OPAL_TIMING_DELTAS(ompi_enable_timing, &tm);
    OPAL_TIMING_REPORT(ompi_enable_timing_ext, &tm);
    OPAL_TIMING_RELEASE(&tm);

    opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
    return MPI_SUCCESS;
}
Example #16
0
int mca_bcol_iboffload_verify_params(void)
{
    if (mca_bcol_iboffload_component.min_rnr_timer > 31) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_min_rnr_timer > 31",
                       "bcol_iboffload_ib_min_rnr_timer reset to 31");
        mca_bcol_iboffload_component.min_rnr_timer = 31;
    } else if (mca_bcol_iboffload_component.min_rnr_timer < 0){
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                   true, "bcol_iboffload_ib_min_rnr_timer < 0",
                   "bcol_iboffload_ib_min_rnr_timer reset to 0");
        mca_bcol_iboffload_component.min_rnr_timer = 0;
    }

    if (mca_bcol_iboffload_component.timeout > 31) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_timeout > 31",
                       "bcol_iboffload_ib_timeout reset to 31");
        mca_bcol_iboffload_component.timeout = 31;
    } else if (mca_bcol_iboffload_component.timeout < 0) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                   true, "bcol_iboffload_ib_timeout < 0",
                   "bcol_iboffload_ib_timeout reset to 0");
        mca_bcol_iboffload_component.timeout = 0;
    }

    if (mca_bcol_iboffload_component.retry_count > 7) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_retry_count > 7",
                       "bcol_iboffload_ib_retry_count reset to 7");
        mca_bcol_iboffload_component.retry_count = 7;
    } else if (mca_bcol_iboffload_component.retry_count < 0) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                   true, "bcol_iboffload_ib_retry_count < 0",
                   "bcol_iboffload_ib_retry_count reset to 0");
        mca_bcol_iboffload_component.retry_count = 0;
    }

    if (mca_bcol_iboffload_component.max_rdma_dst_ops > 7) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_rnr_retry > 7",
                       "bcol_iboffload_ib_rnr_retry reset to 7");
        mca_bcol_iboffload_component.max_rdma_dst_ops = 7;
    } else if (mca_bcol_iboffload_component.max_rdma_dst_ops < 0) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                   true, "bcol_iboffload_ib_rnr_retry < 0",
                   "bcol_iboffload_ib_rnr_retry reset to 0");
        mca_bcol_iboffload_component.max_rdma_dst_ops = 0;
    }

    if (mca_bcol_iboffload_component.service_level > 15) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                       true, "bcol_iboffload_ib_service_level > 15",
                       "bcol_iboffload_ib_service_level reset to 15");
        mca_bcol_iboffload_component.service_level = 15;
    } else if (mca_bcol_iboffload_component.service_level < 0) {
        opal_show_help("help-mpi-btl-openib.txt", "invalid mca param value",
                   true, "bcol_iboffload_ib_service_level < 0",
                   "bcol_iboffload_ib_service_level reset to 0");
        mca_bcol_iboffload_component.service_level = 0;
    }
 
    if(mca_bcol_iboffload_component.buffer_alignment <= 1 ||
       (mca_bcol_iboffload_component.buffer_alignment & (mca_bcol_iboffload_component.buffer_alignment - 1))) {
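        /* reject values that are not powers of two greater than 1:
         * (x & (x - 1)) is non-zero unless x is a power of two */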
        opal_show_help("help-mpi-btl-openib.txt", "wrong buffer alignment",
                true, mca_bcol_iboffload_component.buffer_alignment, ompi_process_info.nodename, 64);
        mca_bcol_iboffload_component.buffer_alignment = 64;
    }

    return OMPI_SUCCESS;
}
/*
 * Function to find as many components of a given type as possible.  This
 * includes statically-linked in components as well as opening up a
 * directory and looking for shared-library MCA components of the
 * appropriate type (load them if available).
 *
 * Return one consolidated array of (mca_base_component_t*) pointing to all
 * available components.
 */
int mca_base_component_find(const char *directory, const char *type, 
                            const mca_base_component_t *static_components[], 
                            char **requested_component_names,
                            bool include_mode,
                            opal_list_t *found_components,
                            bool open_dso_components)
{
    int i;
    opal_list_item_t *item;
    mca_base_component_list_item_t *cli;

    /* Find all the components that were statically linked in */
    OBJ_CONSTRUCT(found_components, opal_list_t);
    for (i = 0; NULL != static_components[i]; ++i) {
        if ( use_component(include_mode,
                           (const char**)requested_component_names,
                           static_components[i]->mca_component_name) ) {
            cli = OBJ_NEW(mca_base_component_list_item_t);
            if (NULL == cli) {
                return OPAL_ERR_OUT_OF_RESOURCE;
            }
            cli->cli_component = static_components[i];
            opal_list_append(found_components, (opal_list_item_t *) cli);
        }
    }

#if OMPI_WANT_LIBLTDL
    /* Find any available dynamic components in the specified directory */
    if (open_dso_components) {
        int param, param_disable_dlopen;
        param = mca_base_param_find("mca", NULL, "component_disable_dlopen");
        mca_base_param_lookup_int(param, &param_disable_dlopen);

        if (0 == param_disable_dlopen) {
            find_dyn_components(directory, type,
                                (const char**)requested_component_names,
                                include_mode, found_components); 
        }
    } else {
        opal_output_verbose(40, 0, 
                            "mca: base: component_find: dso loading for %s MCA components disabled", 
                            type);
    }
#endif

    /* Ensure that *all* requested components exist.  Print a warning
       and abort if they do not. */
    for (i = 0; include_mode && NULL != requested_component_names && 
             NULL != requested_component_names[i]; ++i) {
        for (item = opal_list_get_first(found_components);
             opal_list_get_end(found_components) != item; 
             item = opal_list_get_next(item)) {
            cli = (mca_base_component_list_item_t*) item;
            if (0 == strcmp(requested_component_names[i], 
                            cli->cli_component->mca_component_name)) {
                break;
            }
        }

        if (opal_list_get_end(found_components) == item) {
            char h[MAXHOSTNAMELEN];
            gethostname(h, sizeof(h));
            opal_show_help("help-mca-base.txt", 
                           "find-available:not-valid", true,
                           h, type, requested_component_names[i]);
            return OPAL_ERR_NOT_FOUND;
        }
    }

    /* All done */

    return OPAL_SUCCESS;
}
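/*
 * A hedged usage sketch for mca_base_component_find() as defined above.
 * Assumptions (not taken from the original source): my_static_components is
 * the framework's NULL-terminated array of statically linked components,
 * passing NULL for the directory means "use the default component search
 * path", and the caller releases the list items when it is done with them.
 */
static void example_find_btl_components(const mca_base_component_t *my_static_components[])
{
    opal_list_t found_components;
    opal_list_item_t *item;

    /* NULL requested names with include_mode == true means "take everything" */
    if (OPAL_SUCCESS != mca_base_component_find(NULL, "btl",
                                                my_static_components,
                                                NULL, true,
                                                &found_components, true)) {
        return;
    }

    for (item = opal_list_get_first(&found_components);
         opal_list_get_end(&found_components) != item;
         item = opal_list_get_next(item)) {
        const mca_base_component_t *c =
            ((mca_base_component_list_item_t *) item)->cli_component;
        /* inspect c->mca_component_name here, open/select as needed */
        (void) c;
    }
}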
/*
 * Create an ompi_btl_usnic_proc_t and initialize it with modex info
 * and an empty array of endpoints.
 *
 * Returns OMPI_ERR_UNREACH if we can't reach the peer (i.e., we can't
 * find their modex data).
 */
static int create_proc(ompi_proc_t *ompi_proc, 
                       ompi_btl_usnic_proc_t **usnic_proc)
{
    ompi_btl_usnic_proc_t *proc = NULL;
    size_t size;
    int rc;

    *usnic_proc = NULL;

    /* Create the proc if it doesn't already exist */
    proc = OBJ_NEW(ompi_btl_usnic_proc_t);
    if (NULL == proc) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Initialize number of peers */
    proc->proc_endpoint_count = 0;
    proc->proc_ompi = ompi_proc;

    /* query for the peer address info */
    rc = ompi_modex_recv(&mca_btl_usnic_component.super.btl_version,
                         ompi_proc, (void*)&proc->proc_modex,
                         &size);

    /* If this proc simply doesn't have this key, then they're not
       running the usnic BTL -- just ignore them.  Otherwise, show an
       error message. */
    if (OPAL_ERR_DATA_VALUE_NOT_FOUND == rc) {
        OBJ_RELEASE(proc);
        return OMPI_ERR_UNREACH;
    } else if (OMPI_SUCCESS != rc) {
        opal_show_help("help-mpi-btl-usnic.txt",
                       "internal error during init",
                       true,
                       ompi_process_info.nodename,
                       "<none>", 0,
                       "ompi_modex_recv() failed", __FILE__, __LINE__,
                       opal_strerror(rc));
        OBJ_RELEASE(proc);
        return OMPI_ERROR;
    }

    if ((size % sizeof(ompi_btl_usnic_addr_t)) != 0) {
        char msg[1024];

        snprintf(msg, sizeof(msg), 
                 "sizeof(modex for peer %s data) == %d, expected multiple of %d",
                 OMPI_NAME_PRINT(&ompi_proc->proc_name),
                 (int) size, (int) sizeof(ompi_btl_usnic_addr_t));
        opal_show_help("help-mpi-btl-usnic.txt", "internal error during init",
                       true,
                       ompi_process_info.nodename,
                       "<none>", 0,
                       "invalid modex data", __FILE__, __LINE__,
                       msg);

        OBJ_RELEASE(proc);
        return OMPI_ERR_VALUE_OUT_OF_BOUNDS;
    }

    proc->proc_modex_count = size / sizeof(ompi_btl_usnic_addr_t);
    if (0 == proc->proc_modex_count) {
        proc->proc_endpoints = NULL;
        OBJ_RELEASE(proc);
        return OMPI_ERR_UNREACH;
    }

    proc->proc_modex_claimed = (bool*) 
        calloc(proc->proc_modex_count, sizeof(bool));
    if (NULL == proc->proc_modex_claimed) {
        OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
        OBJ_RELEASE(proc);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    proc->proc_endpoints = (mca_btl_base_endpoint_t**)
        calloc(proc->proc_modex_count, sizeof(mca_btl_base_endpoint_t*));
    if (NULL == proc->proc_endpoints) {
        OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
        OBJ_RELEASE(proc);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    *usnic_proc = proc;
    return OMPI_SUCCESS;
}
Example #19
0
static int initialize(int argc, char *argv[])
{
    int ret, exit_status = OPAL_SUCCESS;
    char * tmp_env_var = NULL;

    /*
     * Make sure to init util before parse_args
     * to ensure installdirs is setup properly
     * before calling mca_base_open();
     */
    if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
        return ret;
    }

    /*
     * Parse Command line arguments
     */
    if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Setup OPAL Output handle from the verbose argument
     */
    if( opal_restart_globals.verbose ) {
        opal_restart_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(opal_restart_globals.output, 10);
    } else {
        opal_restart_globals.output = 0; /* Default=STDOUT */
    }

    /*
     * Turn off the selection of the CRS component,
     * we need to do that later
     */
    (void) mca_base_var_env_name("crs_base_do_not_select", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1", /* turn off the selection */
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /*
     * Make sure we select the proper compress component.
     */
    if( NULL != opal_restart_globals.snapshot_compress ) {
        (void) mca_base_var_env_name("compress", &tmp_env_var);
        opal_setenv(tmp_env_var,
                    opal_restart_globals.snapshot_compress,
                    true, &environ);
        free(tmp_env_var);
        tmp_env_var = NULL;
    }

    /*
     * Initialize the OPAL layer
     */
    if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * If the checkpoint was compressed, then decompress it before continuing
     */
    if( NULL != opal_restart_globals.snapshot_compress ) {
        char * zip_dir = NULL;
        char * tmp_str = NULL;

        /* Make sure to clear the selection for the restart;
         * this way the user can switch compression mechanisms
         * across restarts.
         */
        (void) mca_base_var_env_name("compress", &tmp_env_var);
        opal_unsetenv(tmp_env_var, &environ);
        free(tmp_env_var);
        tmp_env_var = NULL;

        opal_asprintf(&zip_dir, "%s/%s%s",
                 opal_restart_globals.snapshot_loc,
                 opal_restart_globals.snapshot_ref,
                 opal_restart_globals.snapshot_compress_postfix);

        if (0 >  (ret = access(zip_dir, F_OK)) ) {
            opal_output(opal_restart_globals.output,
                        "Error: Unable to access the file [%s]!",
                        zip_dir);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        opal_output_verbose(10, opal_restart_globals.output,
                            "Decompressing (%s)",
                            zip_dir);

        opal_compress.decompress(zip_dir, &tmp_str);

        if( NULL != zip_dir ) {
            free(zip_dir);
            zip_dir = NULL;
        }
        if( NULL != tmp_str ) {
            free(tmp_str);
            tmp_str = NULL;
        }
    }

    /*
     * If a cache directory has been suggested, see if it exists
     */
    if( NULL != opal_restart_globals.snapshot_cache ) {
        if(0 == (ret = access(opal_restart_globals.snapshot_cache, F_OK)) ) {
            opal_output_verbose(10, opal_restart_globals.output,
                                "Using the cached snapshot (%s) instead of (%s)",
                                opal_restart_globals.snapshot_cache,
                                opal_restart_globals.snapshot_loc);
            if( NULL != opal_restart_globals.snapshot_loc ) {
                free(opal_restart_globals.snapshot_loc);
                opal_restart_globals.snapshot_loc = NULL;
            }
            opal_restart_globals.snapshot_loc = opal_dirname(opal_restart_globals.snapshot_cache);
        } else {
            opal_show_help("help-opal-restart.txt", "cache_not_avail", true,
                           opal_restart_globals.snapshot_cache,
                           opal_restart_globals.snapshot_loc);
        }
    }

    /*
     * Mark this process as a tool
     */
    opal_cr_is_tool = true;

 cleanup:
    return exit_status;
}
/*
 * For a specific module, see if this proc has matching address/modex
 * info.  If so, create an endpoint and return it.
 *
 * Implementation note: This code relies on the order of modules on a local
 * side matching the order of the modex entries that we send around, otherwise
 * both sides may not agree on a bidirectional connection.  It also assumes
 * that add_procs will be invoked on the local modules in that same order, for
 * the same reason.  If those assumptions do not hold, we will need to
 * canonicalize this match ordering somehow, probably by (jobid,vpid) pair or
 * by the interface MAC or IP address.
 */
static int match_modex(ompi_btl_usnic_module_t *module,
                       ompi_btl_usnic_proc_t *proc,
                       int *index_out)
{
    int err = OMPI_SUCCESS;
    size_t i;
    uint32_t num_modules;
    ompi_btl_usnic_graph_t *g = NULL;
    int nme;
    int *me;
    bool proc_is_left;

    if (NULL == index_out) {
        return OMPI_ERR_BAD_PARAM;
    }
    *index_out = -1;

    num_modules = mca_btl_usnic_component.num_modules;

    opal_output_verbose(20, USNIC_OUT, "btl:usnic:%s: module=%p proc=%p with dimensions %d x %d",
                        __func__, (void *)module, (void *)proc,
                        num_modules, (int)proc->proc_modex_count);

    /* We compute an interface match-up table once for each (module,proc) pair
     * and cache it in the proc.  Store per-proc instead of per-module, since
     * MPI dynamic process routines can add procs but not new modules. */
    if (NULL == proc->proc_ep_match_table) {
        proc->proc_ep_match_table = malloc(num_modules *
                                       sizeof(*proc->proc_ep_match_table));
        if (NULL == proc->proc_ep_match_table) {
            OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* initialize to "no matches" */
        for (i = 0; i < num_modules; ++i) {
            proc->proc_ep_match_table[i] = -1;
        }

        /* For graphs where all edges are equal (and even for some other
         * graphs), two peers making matching calculations with "mirror image"
         * graphs might not end up with the same matching.  Ensure that both
         * sides are always setting up the exact same graph by always putting
         * the process with the lower (jobid,vpid) on the "left".
         */
        proc_is_left =
            (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                          &proc->proc_ompi->proc_name,
                                          &(ompi_proc_local()->proc_name)) < 0);

        err = create_proc_module_graph(proc, proc_is_left, &g);
        if (OMPI_SUCCESS != err) {
            goto out_free_table;
        }

        nme = 0;
        err = ompi_btl_usnic_solve_bipartite_assignment(g, &nme, &me);
        if (OMPI_SUCCESS != err) {
            OMPI_ERROR_LOG(err);
            goto out_free_graph;
        }

        edge_pairs_to_match_table(proc, proc_is_left, nme, me);

        err = ompi_btl_usnic_gr_free(g);
        if (OMPI_SUCCESS != err) {
            OMPI_ERROR_LOG(err);
            return err;
        }
    }


    if (!proc->proc_match_exists) {
        opal_output_verbose(5, USNIC_OUT, "btl:usnic:%s: unable to find any valid interface pairs for proc %s",
                            __func__, OMPI_NAME_PRINT(&proc->proc_ompi->proc_name));
        return OMPI_ERR_NOT_FOUND;
    }

    /* assuming no strange failure cases, this should always be present */
    if (NULL != proc->proc_ep_match_table && proc->proc_match_exists) {
        for (i = 0; i < num_modules; ++i) {
            if (module == mca_btl_usnic_component.usnic_active_modules[i]) {
                *index_out = proc->proc_ep_match_table[i];
                break;
            }
        }
    }

    /* If MTU does not match, throw an error */
    /* TODO with UDP, do we still want to enforce this restriction or just take
     * the min of the two MTUs?  Another choice is to disqualify this pairing
     * before running the matching algorithm on it. */
    if (*index_out >= 0 &&
        proc->proc_modex[*index_out].mtu != module->if_mtu) {
        opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
                    true,
                    ompi_process_info.nodename,
                    ibv_get_device_name(module->device),
                    module->port_num,
                    module->if_mtu,
                    (NULL == proc->proc_ompi->proc_hostname) ?
                    "unknown" : proc->proc_ompi->proc_hostname,
                    proc->proc_modex[*index_out].mtu);
        *index_out = -1;
        return OMPI_ERR_UNREACH;
    }

    return (*index_out == -1 ? OMPI_ERR_NOT_FOUND : OMPI_SUCCESS);

out_free_graph:
    ompi_btl_usnic_gr_free(g);
out_free_table:
    free(proc->proc_ep_match_table);
    proc->proc_ep_match_table = NULL;
    proc->proc_match_exists = false;
    return err;
}
Example #21
0
static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char *command = NULL;
    char *proc_file = NULL;
    char **loc_touch = NULL;
    char **loc_mkdir = NULL;
    int argc, i;

    if( 0 > prev_pid ) {
        opal_output(opal_restart_globals.output,
                    "Invalid PID (%d)\n",
                    prev_pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * This is needed so we can pass the previous environment to the restarted
     * application process.
     */
    opal_asprintf(&proc_file, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid);
    opal_asprintf(&command, "env | grep OMPI_ > %s", proc_file);

    opal_output_verbose(5, opal_restart_globals.output,
                        "post_env_vars: Execute: <%s>", command);

    ret = system(command);
    if( 0 > ret) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Any directories that need to be created
     */
    if( NULL == (snapshot->metadata = fopen(snapshot->metadata_filename, "r")) ) {
        opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                       opal_restart_globals.snapshot_metadata,
                       snapshot->metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_MKDIR, &loc_mkdir);
    argc = opal_argv_count(loc_mkdir);
    for( i = 0; i < argc; ++i ) {
        if( NULL != command ) {
            free(command);
            command = NULL;
        }
        opal_asprintf(&command, "mkdir -p %s", loc_mkdir[i]);

        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", command);

        ret = system(command);
        if( 0 > ret) {
            exit_status = ret;
            goto cleanup;
        }
    }
    if( 0 < argc ) {
        system("sync ; sync");
    }

    /*
     * Any files that need to exist
     */
    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_TOUCH, &loc_touch);
    argc = opal_argv_count(loc_touch);
    for( i = 0; i < argc; ++i ) {
        if( NULL != command ) {
            free(command);
            command = NULL;
        }
        opal_asprintf(&command, "touch %s", loc_touch[i]);

        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", command);

        ret = system(command);
        if( 0 > ret) {
            exit_status = ret;
            goto cleanup;
        }
    }
    if( 0 < argc ) {
        system("sync ; sync");
    }

 cleanup:
    if( NULL != command) {
        free(command);
        command = NULL;
    }
    if( NULL != proc_file) {
        free(proc_file);
        proc_file = NULL;
    }
    if( NULL != loc_mkdir ) {
        opal_argv_free(loc_mkdir);
        loc_mkdir = NULL;
    }
    if( NULL != loc_touch ) {
        opal_argv_free(loc_touch);
        loc_touch = NULL;
    }

    if( NULL != snapshot->metadata ) {
        fclose(snapshot->metadata);
        snapshot->metadata = NULL;
    }

    return exit_status;
}
Example #22
0
/*
 * Parse a single file
 */
static int parse_file(char *filename)
{
    int val;
    int ret = OMPI_SUCCESS;
    bool showed_no_section_warning = false;
    bool showed_unexpected_tokens_warning = false;
    parsed_section_values_t section;

    reset_section(false, &section);

    /* Open the file */
    ini_filename = filename;
    btl_openib_ini_yyin = fopen(filename, "r");
    if (NULL == btl_openib_ini_yyin) {
        opal_show_help("help-mpi-btl-openib.txt", "ini file:file not found",
                       true, filename);
        ret = OMPI_ERR_NOT_FOUND;
        goto cleanup;
    }

    /* Do the parsing */
    btl_openib_ini_parse_done = false;
    btl_openib_ini_yynewlines = 1;
    btl_openib_ini_init_buffer(btl_openib_ini_yyin);
    while (!btl_openib_ini_parse_done) {
        val = btl_openib_ini_yylex();
        switch (val) {
        case BTL_OPENIB_INI_PARSE_DONE:
            /* This will also set btl_openib_ini_parse_done to true, so just
               break here */
            break;

        case BTL_OPENIB_INI_PARSE_NEWLINE:
            /* blank line!  ignore it */
            break;

        case BTL_OPENIB_INI_PARSE_SECTION:
            /* We're starting a new section; if we have previously
               parsed a section, go see if we can use its values. */
            save_section(&section);

            reset_section(true, &section);
            section.name = strdup(btl_openib_ini_yytext);
            break;

        case BTL_OPENIB_INI_PARSE_SINGLE_WORD:
            if (NULL == section.name) {
                /* Warn that there is no current section, and ignore
                   this parameter */
                if (!showed_no_section_warning) {
                    show_help("ini file:not in a section");
                    showed_no_section_warning = true;
                }
                /* Parse it and then dump it */
                parse_line(&section);
                reset_section(true, &section);
            } else {
                parse_line(&section);
            }
            break;

        default:
            /* anything else is an error */
            if (!showed_unexpected_tokens_warning) {
                show_help("ini file:unexpected token");
                showed_unexpected_tokens_warning = true;
            }
            break;
        }
    }
    save_section(&section);
    fclose(btl_openib_ini_yyin);
    btl_openib_ini_yylex_destroy ();

cleanup:
    reset_section(true, &section);
    if (NULL != key_buffer) {
        free(key_buffer);
        key_buffer = NULL;
        key_buffer_len = 0;
    }
    return ret;
}
Example #23
0
int MPI_Init(int *argc, char ***argv)
{
    int err;
    int provided;
    char *env;
    int required = MPI_THREAD_SINGLE;

    /* Ensure that we were not already initialized or finalized */

    if (ompi_mpi_finalized) {
        if (0 == ompi_comm_rank(MPI_COMM_WORLD)) {
            opal_show_help("help-mpi-api.txt",
                           "mpi-function-after-finalize", true, FUNC_NAME);
        }
        return ompi_errhandler_invoke(NULL, NULL,
                                      OMPI_ERRHANDLER_TYPE_COMM,
                                      MPI_ERR_OTHER, FUNC_NAME);
    } else if (ompi_mpi_initialized) {
        if (0 == ompi_comm_rank(MPI_COMM_WORLD)) {
            opal_show_help("help-mpi-api.txt", "mpi-initialize-twice",
                           true, FUNC_NAME);
        }
        return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_OTHER,
                                      FUNC_NAME);
    }

    /* check for environment overrides for required thread level.  If
       there is, check to see that it is a valid/supported thread level.
       If not, default to MPI_THREAD_MULTIPLE. */

    if (NULL != (env = getenv("OMPI_MPI_THREAD_LEVEL"))) {
        required = atoi(env);
        if (required < MPI_THREAD_SINGLE || required > MPI_THREAD_MULTIPLE) {
            required = MPI_THREAD_MULTIPLE;
        }
    }

    /* Call the back-end initialization function (we need to put as
       little in this function as possible so that if it's profiled, we
       don't lose anything) */

    if (NULL != argc && NULL != argv) {
        err = ompi_mpi_init(*argc, *argv, required, &provided);
    } else {
        err = ompi_mpi_init(0, NULL, required, &provided);
    }

    /* Since we don't have a communicator to invoke an errorhandler on
       here, don't use the fancy-schmancy ERRHANDLER macros; they're
       really designed for real communicator objects.  Just use the
       back-end function directly. */

    if (MPI_SUCCESS != err) {
        return ompi_errhandler_invoke(NULL, NULL,
                                      OMPI_ERRHANDLER_TYPE_COMM,
                                      err <
                                      0 ? ompi_errcode_get_mpi_code(err) :
                                      err, FUNC_NAME);
    }

    OPAL_CR_INIT_LIBRARY();

    /* At this point MPI_COMM_WORLD is available */

    void *dvMgmt_dlhandle;
    int (*commsBenchmark)(commsInfo*);
    char *error;

    /* Load the management library and look up the benchmark entry point */
    dvMgmt_dlhandle = dlopen("libdvMgmt.so", RTLD_LAZY);
    if (!dvMgmt_dlhandle) {
        fprintf(stderr, "Details: %s\n", dlerror());
        exit(1);
    }

    dlerror();    /* clear any stale error before the lookup */
    commsBenchmark = (int (*)(commsInfo*)) dlsym(dvMgmt_dlhandle, "commsBenchmark");
    if (NULL != (error = dlerror())) {
        fprintf(stderr, "Details: %s\n", error);
        exit(1);
    }

    commsInfo cmf;
    cmf.BW_Mtx = NULL;

    /* Run the communication benchmark only when profiling is requested
       via the environment */
    char *profileFlag;
    int profilingEnabled = 0;
    profileFlag = getenv("XSCALA_PROFILING_APP");
    if (profileFlag != NULL) {
        profilingEnabled = 1;
    }

    if (profilingEnabled) {
        (*commsBenchmark)(&cmf);
    }

    return MPI_SUCCESS;
}
Example #24
0
/*
 * Parse a single line in the INI file
 */
static int parse_line(parsed_section_values_t *sv)
{
    int val, ret = OMPI_SUCCESS;
    char *value = NULL;
    bool showed_unknown_field_warning = false;

    /* Save the key name */
    if (key_buffer_len < strlen(btl_openib_ini_yytext) + 1) {
        char *tmp;
        key_buffer_len = strlen(btl_openib_ini_yytext) + 1;
        tmp = (char *) realloc(key_buffer, key_buffer_len);
        if (NULL == tmp) {
            free(key_buffer);
            key_buffer_len = 0;
            key_buffer = NULL;
            return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
        }
        key_buffer = tmp;
    }
    strncpy(key_buffer, btl_openib_ini_yytext, key_buffer_len);

    /* The first thing we have to see is an "=" */
    val = btl_openib_ini_yylex();
    if (btl_openib_ini_parse_done || BTL_OPENIB_INI_PARSE_EQUAL != val) {
        show_help("ini file:expected equals");
        return OMPI_ERROR;
    }

    /* Next we get the value */
    val = btl_openib_ini_yylex();
    if (BTL_OPENIB_INI_PARSE_SINGLE_WORD == val ||
        BTL_OPENIB_INI_PARSE_VALUE == val) {
        value = strdup(btl_openib_ini_yytext);

        /* Now we need to see the newline */
        val = btl_openib_ini_yylex();
        if (BTL_OPENIB_INI_PARSE_NEWLINE != val &&
            BTL_OPENIB_INI_PARSE_DONE != val) {
            show_help("ini file:expected newline");
            free(value);
            return OMPI_ERROR;
        }
    }

    /* If we did not get EOL or EOF, something is wrong */
    else if (BTL_OPENIB_INI_PARSE_DONE != val &&
             BTL_OPENIB_INI_PARSE_NEWLINE != val) {
        show_help("ini file:expected newline");
        return OMPI_ERROR;
    }

    /* Ok, we got a good parse.  Now figure out what it is and save
       the value.  Note that the flex already took care of trimming
       all whitespace at the beginning and ending of the value. */
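
    /* vendor_id and vendor_part_id may contain a list of values; all
       other recognized fields take a single value */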

    if (0 == strcasecmp(key_buffer, "vendor_id")) {
        if (OMPI_SUCCESS != (ret = ompi_btl_openib_ini_intify_list(value, &sv->vendor_ids,
                                               &sv->vendor_ids_len))) {
            return ret;
        }
    }

    else if (0 == strcasecmp(key_buffer, "vendor_part_id")) {
        if (OMPI_SUCCESS != (ret = ompi_btl_openib_ini_intify_list(value, &sv->vendor_part_ids,
                                               &sv->vendor_part_ids_len))) {
            return ret;
        }
    }

    else if (0 == strcasecmp(key_buffer, "mtu")) {
        /* Single value */
        sv->values.mtu = (uint32_t) ompi_btl_openib_ini_intify(value);
        sv->values.mtu_set = true;
    }

    else if (0 == strcasecmp(key_buffer, "use_eager_rdma")) {
        /* Single value */
        sv->values.use_eager_rdma = (uint32_t) ompi_btl_openib_ini_intify(value);
        sv->values.use_eager_rdma_set = true;
    }

    else if (0 == strcasecmp(key_buffer, "receive_queues")) {
        /* Single value (already strdup'ed) */
        sv->values.receive_queues = value;
        value = NULL;
    }

    else if (0 == strcasecmp(key_buffer, "max_inline_data")) {
        /* Single value */
        sv->values.max_inline_data = (int32_t) ompi_btl_openib_ini_intify(value);
        sv->values.max_inline_data_set = true;
    }

    else if (0 == strcasecmp(key_buffer, "rdmacm_reject_causes_connect_error")) {
        /* Single value */
        sv->values.rdmacm_reject_causes_connect_error =
            (bool) ompi_btl_openib_ini_intify(value);
        sv->values.rdmacm_reject_causes_connect_error_set = true;
    }

    else if (0 == strcasecmp(key_buffer, "ignore_device")) {
        /* Single value */
        sv->values.ignore_device = (bool) ompi_btl_openib_ini_intify(value);
        sv->values.ignore_device_set = true;
    }

    else {
        /* Have no idea what this parameter is.  Not an error -- just
           ignore it */
        if (!showed_unknown_field_warning) {
            opal_show_help("help-mpi-btl-openib.txt",
                           "ini file:unknown field", true,
                           ini_filename, btl_openib_ini_yynewlines,
                           key_buffer);
            showed_unknown_field_warning = true;
        }
    }

    /* All done */

    if (NULL != value) {
        free(value);
    }
    return ret;
}
Example #25
0
int
main(int argc, char *argv[])
{
    int exit_status = 0, ret, flags = 0, i;
    int exec_argc = 0, user_argc = 0;
    char **exec_argv = NULL, **user_argv = NULL;
    char *exec_command, *base_argv0 = NULL;
    bool disable_flags = true;
    bool real_flag = false;

    if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        return ret;
    }

    /****************************************************
     *
     * Setup compiler information
     *
     ****************************************************/

    base_argv0 = opal_basename(argv[0]);
#if defined(EXEEXT)
    if( 0 != strlen(EXEEXT) ) {
        char extension[] = EXEEXT;
        char* temp = strstr( base_argv0, extension );
        char* old_match = temp;
        while( NULL != temp ) {
            old_match = temp;
            temp = strstr( temp + 1, extension );
        }
        /* Only if there was a match of .exe, erase the last occurrence of .exe */
        if ( NULL != old_match ) {
            *old_match = '\0';
        }
    }
#endif  /* defined(EXEEXT) */

    if (OPAL_SUCCESS != (ret = data_init(base_argv0))) {
        fprintf(stderr, "Error parsing data file %s: %s\n", base_argv0, opal_strerror(ret));
        return ret;
    }
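
    /* Select which set of wrapper data (compiler, flags, libraries) to
       use, based on any recognized option on the command line; otherwise
       fall back to the default data set */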

    for (i = 1 ; i < argc && user_data_idx < 0 ; ++i) {
        user_data_idx = find_options_index(argv[i]);
    }
    /* if we didn't find a match, look for the NULL (base case) options */
    if (user_data_idx < 0) {
        user_data_idx = default_data_idx;
    }
    /* if we still didn't find a match, abort */
    if (user_data_idx < 0) {
        char *flat = opal_argv_join(argv, ' ');
        opal_show_help("help-opal-wrapper.txt", "no-options-support", true,
                       base_argv0, flat, NULL);
        free(flat);
        exit(1);
    }

    /* compiler */
    load_env_data(options_data[user_data_idx].project_short, options_data[user_data_idx].compiler_env, &options_data[user_data_idx].compiler);

    /* preprocessor flags */
    load_env_data_argv(options_data[user_data_idx].project_short, "CPPFLAGS", &options_data[user_data_idx].preproc_flags);

    /* compiler flags */
    load_env_data_argv(options_data[user_data_idx].project_short, options_data[user_data_idx].compiler_flags_env,
                       &options_data[user_data_idx].comp_flags);

    /* linker flags */
    load_env_data_argv(options_data[user_data_idx].project_short, "LDFLAGS", &options_data[user_data_idx].link_flags);

    /* libs */
    load_env_data_argv(options_data[user_data_idx].project_short, "LIBS", &options_data[user_data_idx].libs);


    /****************************************************
     *
     * Sanity Checks
     *
     ****************************************************/

    if (NULL != options_data[user_data_idx].req_file) {
        /* make sure the language is supported */
        if (0 == strcmp(options_data[user_data_idx].req_file, "not supported")) {
            opal_show_help("help-opal-wrapper.txt", "no-language-support", true,
                           options_data[user_data_idx].language, base_argv0, NULL);
            exit_status = 1;
            goto cleanup;
        }

        if (options_data[user_data_idx].req_file[0] != '\0') {
            char *filename;
            struct stat buf;
            filename = opal_os_path( false, options_data[user_data_idx].path_libdir, options_data[user_data_idx].req_file, NULL );
            if (0 != stat(filename, &buf)) {
                opal_show_help("help-opal-wrapper.txt", "file-not-found", true,
                               base_argv0, options_data[user_data_idx].req_file, options_data[user_data_idx].language, NULL);
            }
            free(filename);
        }
    }

    /****************************************************
     *
     * Parse user flags
     *
     ****************************************************/
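    /* Default to the full pipeline: run the underlying command and keep
       the preprocess, compile, and link flag sets.  The option handling
       below narrows this set as needed. */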
    flags = COMP_WANT_COMMAND|COMP_WANT_PREPROC|
        COMP_WANT_COMPILE|COMP_WANT_LINK;

    user_argv = opal_argv_copy(argv + 1);
    user_argc = opal_argv_count(user_argv);

    for (i = 0 ; i < user_argc ; ++i) {
        if (0 == strncmp(user_argv[i], "-showme", strlen("-showme")) ||
            0 == strncmp(user_argv[i], "--showme", strlen("--showme")) ||
            0 == strncmp(user_argv[i], "-show", strlen("-show")) ||
            0 == strncmp(user_argv[i], "--show", strlen("--show"))) {
            bool done_now = false;

            /* check for specific things we want to see.  First three
               still invoke all the building routines.  Last set want
               to parse out certain flags, so we don't go through the
               normal build routine - skip to cleanup. */
            if (0 == strncmp(user_argv[i], "-showme:command", strlen("-showme:command")) ||
                0 == strncmp(user_argv[i], "--showme:command", strlen("--showme:command"))) {
                flags = COMP_WANT_COMMAND;
                /* we know what we want, so don't process any more args */
                done_now = true;
            } else if (0 == strncmp(user_argv[i], "-showme:compile", strlen("-showme:compile")) ||
                0 == strncmp(user_argv[i], "--showme:compile", strlen("--showme:compile"))) {
                flags = COMP_WANT_PREPROC|COMP_WANT_COMPILE;
                /* we know what we want, so don't process any more args */
                done_now = true;
            } else if (0 == strncmp(user_argv[i], "-showme:link", strlen("-showme:link")) ||
                       0 == strncmp(user_argv[i], "--showme:link", strlen("--showme:link"))) {
                flags = COMP_WANT_COMPILE|COMP_WANT_LINK;
                /* we know what we want, so don't process any more args */
                done_now = true;
            } else if (0 == strncmp(user_argv[i], "-showme:incdirs", strlen("-showme:incdirs")) ||
                       0 == strncmp(user_argv[i], "--showme:incdirs", strlen("--showme:incdirs"))) {
                print_flags(options_data[user_data_idx].preproc_flags, OPAL_INCLUDE_FLAG);
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:libdirs", strlen("-showme:libdirs")) ||
                       0 == strncmp(user_argv[i], "--showme:libdirs", strlen("--showme:libdirs"))) {
                print_flags(options_data[user_data_idx].link_flags, OPAL_LIBDIR_FLAG);
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:libs", strlen("-showme:libs")) ||
                       0 == strncmp(user_argv[i], "--showme:libs", strlen("--showme:libs"))) {
                print_flags(options_data[user_data_idx].libs, "-l");
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:version", strlen("-showme:version")) ||
                       0 == strncmp(user_argv[i], "--showme:version", strlen("--showme:version"))) {
                char * str;
                str = opal_show_help_string("help-opal-wrapper.txt",
                                            "version", false,
                                            argv[0], options_data[user_data_idx].project, options_data[user_data_idx].version, options_data[user_data_idx].language, NULL);
                if (NULL != str) {
                    printf("%s", str);
                    free(str);
                }
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:help", strlen("-showme:help")) ||
                       0 == strncmp(user_argv[i], "--showme:help", strlen("--showme:help"))) {
                char *str;
                str = opal_show_help_string("help-opal-wrapper.txt", "usage",
                                            false, argv[0],
                                            options_data[user_data_idx].project,
                                            NULL);
                if (NULL != str) {
                    printf("%s", str);
                    free(str);
                }

                exit_status = 0;
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:", strlen("-showme:")) ||
                       0 == strncmp(user_argv[i], "--showme:", strlen("--showme:"))) {
                fprintf(stderr, "%s: unrecognized option: %s\n", argv[0],
                        user_argv[i]);
                fprintf(stderr, "Type '%s --showme:help' for usage.\n",
                        argv[0]);
                exit_status = 1;
                goto cleanup;
            }

            flags |= (COMP_DRY_RUN|COMP_SHOW_ERROR);
            /* remove element from user_argv */
            opal_argv_delete(&user_argc, &user_argv, i, 1);
            --i;

            if (done_now) {
                disable_flags = false;
                break;
            }

        } else if (0 == strcmp(user_argv[i], "-c")) {
            flags &= ~COMP_WANT_LINK;
            real_flag = true;
        } else if (0 == strcmp(user_argv[i], "-E") ||
                   0 == strcmp(user_argv[i], "-M")) {
            flags &= ~(COMP_WANT_COMPILE | COMP_WANT_LINK);
            real_flag = true;
        } else if (0 == strcmp(user_argv[i], "-S")) {
            flags &= ~COMP_WANT_LINK;
            real_flag = true;
        } else if (0 == strcmp(user_argv[i], "-lpmpi")) {
            flags |= COMP_WANT_PMPI;

            /* remove element from user_argv */
            opal_argv_delete(&user_argc, &user_argv, i, 1);
            --i;
        } else if (0 == strcmp(user_argv[i], "-static") ||
                   0 == strcmp(user_argv[i], "--static") ||
                   0 == strcmp(user_argv[i], "-Bstatic") ||
                   0 == strcmp(user_argv[i], "-Wl,-static") ||
                   0 == strcmp(user_argv[i], "-Wl,--static") ||
                   0 == strcmp(user_argv[i], "-Wl,-Bstatic")) {
            flags |= COMP_WANT_STATIC;
        } else if (0 == strcmp(user_argv[i], "-dynamic") ||
                   0 == strcmp(user_argv[i], "--dynamic") ||
                   0 == strcmp(user_argv[i], "-Bdynamic") ||
                   0 == strcmp(user_argv[i], "-Wl,-dynamic") ||
                   0 == strcmp(user_argv[i], "-Wl,--dynamic") ||
                   0 == strcmp(user_argv[i], "-Wl,-Bdynamic")) {
            flags &= ~COMP_WANT_STATIC;
        } else if (0 == strcmp(user_argv[i], "--openmpi:linkall")) {
            /* This is an intentionally undocumented wrapper compiler
               switch.  It should only be used by Open MPI developers
               -- not end users.  It will cause mpicc to use the
               static library list, even if we're compiling
               dynamically (i.e., it'll specifically add -lopen-rte and
               -lopen-pal (and all their dependent libs)).  We provide
               this flag for test MPI applications that also invoke
               ORTE and/or OPAL function calls.

               On some systems (e.g., OS X), if the top-level
               application calls ORTE/OPAL functions and you don't -l
               ORTE and OPAL, then the functions won't be resolved at
               link time (i.e., the implicit library dependencies of
               libmpi won't be pulled in at link time), and therefore
               the link will fail.  This flag will cause the wrapper
               to explicitly list the ORTE and OPAL libs on the
               underlying compiler command line, so the application
               will therefore link properly. */
            flags |= COMP_WANT_LINKALL;

            /* remove element from user_argv */
            opal_argv_delete(&user_argc, &user_argv, i, 1);
        } else if ('-' != user_argv[i][0]) {
            disable_flags = false;
            flags |= COMP_SHOW_ERROR;
            real_flag = true;
        } else {
            /* if the option flag is one that we use to determine
               which set of compiler data to use, don't count it as a
               real option */
            if (find_options_index(user_argv[i]) < 0) {
                real_flag = true;
            }
        }
    }

    /* clear out the want_flags if we got no arguments not starting
       with a - (dash) and -showme wasn't given OR -showme was given
       and we had at least one more non-showme argument that started
       with a - (dash) and no other non-dash arguments.  Some examples:

       opal_wrapper                : clear our flags
       opal_wrapper -v             : clear our flags
       opal_wrapper -E a.c         : don't clear our flags
       opal_wrapper a.c            : don't clear our flags
       opal_wrapper -showme        : don't clear our flags
       opal_wrapper -showme -v     : clear our flags
       opal_wrapper -showme -E a.c : don't clear our flags
       opal_wrapper -showme a.c    : don't clear our flags
    */
    if (disable_flags && !((flags & COMP_DRY_RUN) && !real_flag)) {
        flags &= ~(COMP_WANT_PREPROC|COMP_WANT_COMPILE|COMP_WANT_LINK);
    }

#if !OMPI_ENABLE_MPI_PROFILING
    /* sanity check */
    if (flags & COMP_WANT_PMPI) {
	    opal_show_help("help-opal-wrapper.txt", "no-profiling-support", true,
		               argv[0], NULL);
    }
#endif


    /****************************************************
     *
     * Assemble the command line
     *
     ****************************************************/

    /* compiler (may be multiple arguments, so split) */
    if (flags & COMP_WANT_COMMAND) {
        exec_argv = opal_argv_split(options_data[user_data_idx].compiler, ' ');
        exec_argc = opal_argv_count(exec_argv);
    } else {
        exec_argv = (char **) malloc(sizeof(char*));
        if (NULL != exec_argv) {
            exec_argv[0] = NULL;
        }
        exec_argc = 0;
    }

    /* This error would normally not happen unless the user edits the
       wrapper data files manually */
    if (NULL == exec_argv) {
        opal_show_help("help-opal-wrapper.txt", "no-compiler-specified", true);
        return 1;
    }

    if (flags & COMP_WANT_COMPILE) {
        opal_argv_insert(&exec_argv, exec_argc,
                         options_data[user_data_idx].comp_flags_prefix);
        exec_argc = opal_argv_count(exec_argv);
    }

    /* Per https://svn.open-mpi.org/trac/ompi/ticket/2201, add all the
       user arguments before anything else. */
    opal_argv_insert(&exec_argv, exec_argc, user_argv);
    exec_argc = opal_argv_count(exec_argv);

    /* preproc flags */
    if (flags & COMP_WANT_PREPROC) {
        opal_argv_insert(&exec_argv, exec_argc, options_data[user_data_idx].preproc_flags);
        exec_argc = opal_argv_count(exec_argv);
    }

    /* compiler flags */
    if (flags & COMP_WANT_COMPILE) {
        opal_argv_insert(&exec_argv, exec_argc, options_data[user_data_idx].comp_flags);
        exec_argc = opal_argv_count(exec_argv);
    }

    /* link flags and libs */
    if (flags & COMP_WANT_LINK) {
        bool have_static_lib;
        bool have_dyn_lib;
        bool use_static_libs;
        char *filename1, *filename2;
        struct stat buf;

        opal_argv_insert(&exec_argv, exec_argc, options_data[user_data_idx].link_flags);
        exec_argc = opal_argv_count(exec_argv);

        /* Are we linking statically?  If so, decide what libraries to
           list.  It depends on two factors:

           1. Was --static (etc.) specified?
           2. Does OMPI have static, dynamic, or both libraries installed?

           Here's a matrix showing what we'll do in all 6 cases:

           What's installed    --static    no --static
           ----------------    ----------  -----------
           ompi .so libs       -lmpi       -lmpi
           ompi .a libs        all         all
           ompi both libs      all         -lmpi

        */

        filename1 = opal_os_path( false, options_data[user_data_idx].path_libdir, options_data[user_data_idx].static_lib_file, NULL );
        if (0 == stat(filename1, &buf)) {
            have_static_lib = true;
        } else {
            have_static_lib = false;
        }

        filename2 = opal_os_path( false, options_data[user_data_idx].path_libdir, options_data[user_data_idx].dyn_lib_file, NULL );
        if (0 == stat(filename2, &buf)) {
            have_dyn_lib = true;
        } else {
            have_dyn_lib = false;
        }

        /* Determine which set of libs to use: dynamic or static.  Be
           pedantic to make the code easy to read. */
        if (flags & COMP_WANT_LINKALL) {
            /* If --openmpi:linkall was specified, list all the libs
               (i.e., the static libs) if they're available, either in
               static or dynamic form. */
            if (have_static_lib || have_dyn_lib) {
                use_static_libs = true;
            } else {
                fprintf(stderr, "The linkall option has failed as we were unable to find either static or dynamic libs\n"
                        "Files looked for:\n  Static: %s\n  Dynamic: %s\n",
                        filename1, filename2);
                free(filename1);
                free(filename2);
                exit(1);
            }
        } else if (flags & COMP_WANT_STATIC) {
            /* If --static (or something like it) was specified, if we
               have the static libs, then use them.  Otherwise, use
               the dynamic libs. */
            if (have_static_lib) {
                use_static_libs = true;
            } else {
                use_static_libs = false;
            }
        } else {
            /* If --static (or something like it) was NOT specified
               (or if --dynamic, or something like it, was specified),
               if we have the dynamic libs, then use them.  Otherwise,
               use the static libs. */
            if (have_dyn_lib) {
                use_static_libs = false;
            } else {
                use_static_libs = true;
            }
        }
        free(filename1);
        free(filename2);

        if (use_static_libs) {
            opal_argv_insert(&exec_argv, exec_argc, options_data[user_data_idx].libs_static);
        } else {
            opal_argv_insert(&exec_argv, exec_argc, options_data[user_data_idx].libs);
        }
        exec_argc = opal_argv_count(exec_argv);
    }


    /****************************************************
     *
     * Execute the command
     *
     ****************************************************/

    if (flags & COMP_DRY_RUN) {
        exec_command = opal_argv_join(exec_argv, ' ');
        printf("%s\n", exec_command);
    } else {
        char *tmp;

#if 0
        exec_command = opal_argv_join(exec_argv, ' ');
        printf("command: %s\n", exec_command);
#endif
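
        /* look up the compiler executable in the PATH taken from the
           environment */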

        tmp = opal_path_findv(exec_argv[0], 0, environ, NULL);
        if (NULL == tmp) {
            opal_show_help("help-opal-wrapper.txt", "no-compiler-found", true,
                           exec_argv[0], NULL);
            errno = 0;
            exit_status = 1;
        }  else {
            int status;

            free(exec_argv[0]);
            exec_argv[0] = tmp;
            ret = opal_few(exec_argv, &status);
            exit_status = WIFEXITED(status) ? WEXITSTATUS(status) :
                              (WIFSIGNALED(status) ? WTERMSIG(status) :
                                  (WIFSTOPPED(status) ? WSTOPSIG(status) : 255));
            if( (OPAL_SUCCESS != ret) || ((0 != exit_status) && (flags & COMP_SHOW_ERROR)) ) {
                char* exec_command = opal_argv_join(exec_argv, ' ');
                if( OPAL_SUCCESS != ret ) {
                    opal_show_help("help-opal-wrapper.txt", "spawn-failed", true,
                                   exec_argv[0], strerror(status), exec_command, NULL);
                } else {
#if 0
                    opal_show_help("help-opal-wrapper.txt", "compiler-failed", true,
                                   exec_argv[0], exit_status, exec_command, NULL);
#endif
                }
                free(exec_command);
            }
        }
    }

    /****************************************************
     *
     * Cleanup
     *
     ****************************************************/
 cleanup:

    opal_argv_free(exec_argv);
    opal_argv_free(user_argv);
    if (NULL != base_argv0) free(base_argv0);

    if (OPAL_SUCCESS != (ret = data_finalize())) {
        return ret;
    }

    if (OPAL_SUCCESS != (ret = opal_finalize_util())) {
        return ret;
    }

    return exit_status;
}
Example #26
0
static int init_ud_qp(struct ibv_context *context_arg,
                      struct mca_btl_openib_sa_qp_cache *cache)
{
    struct ibv_qp_init_attr iattr;
    struct ibv_qp_attr mattr;
    int rc;

    /* create cq */
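    /* 4 CQ entries are enough for the 2 send and 2 receive work requests
       configured on the QP below */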
    cache->cq = ibv_create_cq(cache->context, 4, NULL, NULL, 0);
    if (NULL == cache->cq) {
        BTL_ERROR(("error creating cq, errno says %s", strerror(errno)));
        opal_show_help("help-mpi-btl-openib.txt", "init-fail-create-q",
                true, opal_proc_local_get()->proc_hostname,
                __FILE__, __LINE__, "ibv_create_cq",
                strerror(errno), errno,
                ibv_get_device_name(context_arg->device));
        return OPAL_ERROR;
    }

    /* create qp */
    memset(&iattr, 0, sizeof(iattr));
    iattr.send_cq = cache->cq;
    iattr.recv_cq = cache->cq;
    iattr.cap.max_send_wr = 2;
    iattr.cap.max_recv_wr = 2;
    iattr.cap.max_send_sge = 1;
    iattr.cap.max_recv_sge = 1;
    iattr.qp_type = IBV_QPT_UD;
    cache->qp = ibv_create_qp(cache->pd, &iattr);
    if (NULL == cache->qp) {
        BTL_ERROR(("error creating qp %s (%d)", strerror(errno), errno));
        return OPAL_ERROR;
    }
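
    /* bring the UD QP through the standard state sequence:
       INIT -> RTR -> RTS */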

    /* modify qp to IBV_QPS_INIT */
    memset(&mattr, 0, sizeof(mattr));
    mattr.qp_state = IBV_QPS_INIT;
    mattr.port_num = cache->port_num;
    mattr.qkey = ntohl(IB_QP1_WELL_KNOWN_Q_KEY);
    rc = ibv_modify_qp(cache->qp, &mattr,
            IBV_QP_STATE              |
            IBV_QP_PKEY_INDEX         |
            IBV_QP_PORT               |
            IBV_QP_QKEY);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]",
                    cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_RTR */
    memset(&mattr, 0, sizeof(mattr));
    mattr.qp_state = IBV_QPS_RTR;
    rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]",
                    cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_RTS */
    mattr.qp_state = IBV_QPS_RTS;
    rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE | IBV_QP_SQ_PSN);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTS errno says: %s [%d]",
                    cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}
Example #27
0
static int opal_hwloc_base_open(mca_base_open_flag_t flags)
{
    if (opal_hwloc_base_inited) {
        return OPAL_SUCCESS;
    }
    opal_hwloc_base_inited = true;

#if OPAL_HAVE_HWLOC
    {
        int rc;
        opal_data_type_t tmp;

        if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_binding_policy(&opal_hwloc_binding_policy,
                                                                     opal_hwloc_base_binding_policy))) {
            return rc;
        }
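
        /* translate the deprecated bind-to-core / bind-to-socket options
           into the new binding policy, warning about the deprecation */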

        if (opal_hwloc_base_bind_to_core) {
            opal_show_help("help-opal-hwloc-base.txt", "deprecated", true,
                           "--bind-to-core", "--bind-to core",
                           "hwloc_base_bind_to_core", "hwloc_base_binding_policy=core");
            /* set binding policy to core - error if something else already set */
            if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
                OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) != OPAL_BIND_TO_CORE) {
                /* error - cannot redefine the default binding policy */
                opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
                               "core", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
                return OPAL_ERR_BAD_PARAM;
            }
            OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE);
        }

        if (opal_hwloc_base_bind_to_socket) {
            opal_show_help("help-opal-hwloc-base.txt", "deprecated", true,
                           "--bind-to-socket", "--bind-to socket",
                           "hwloc_base_bind_to_socket", "hwloc_base_binding_policy=socket");
            /* set binding policy to socket - error if something else already set */
            if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
                OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) != OPAL_BIND_TO_SOCKET) {
                /* error - cannot redefine the default binding policy */
                opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
                               "socket", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
                return OPAL_ERR_SILENT;
            }
            OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_SOCKET);
        }

        /* did the user provide a slot list? */
        if (NULL != opal_hwloc_base_slot_list) {
            /* if we already were given a policy, then this is an error */
            if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
                opal_show_help("help-opal-hwloc-base.txt", "redefining-policy", true,
                               "slot_list", opal_hwloc_base_print_binding(opal_hwloc_binding_policy));
                return OPAL_ERR_SILENT;
            }
            OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET);
        }

        /* cpu allocation specification */
        if (NULL != opal_hwloc_base_cpu_set) {
            if (!OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) {
                /* it is okay if a binding policy was already given - just ensure that
                 * we do bind to the given cpus if provided, otherwise this would be
                 * ignored if someone didn't also specify a binding policy
                 */
                OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CPUSET);
            }
        }

        /* if we are binding to hwthreads, then we must use hwthreads as cpus */
        if (OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) == OPAL_BIND_TO_HWTHREAD) {
            opal_hwloc_use_hwthreads_as_cpus = true;
        }

        /* to support tools such as ompi_info, add the components
         * to a list
         */
        if (OPAL_SUCCESS !=
            mca_base_framework_components_open(&opal_hwloc_base_framework, flags)) {
            return OPAL_ERROR;
        }

        /* declare the hwloc data types */
        tmp = OPAL_HWLOC_TOPO;
        if (OPAL_SUCCESS != (rc = opal_dss.register_type(opal_hwloc_pack,
                                                         opal_hwloc_unpack,
                                                         (opal_dss_copy_fn_t)opal_hwloc_copy,
                                                         (opal_dss_compare_fn_t)opal_hwloc_compare,
                                                         (opal_dss_print_fn_t)opal_hwloc_print,
                                                         OPAL_DSS_STRUCTURED,
                                                         "OPAL_HWLOC_TOPO", &tmp))) {
            return rc;
        }
    }
#endif

    return OPAL_SUCCESS;
}
Example #28
0
int opal_register_params(void)
{
    int ret;

    if (opal_register_done) {
        return OPAL_SUCCESS;
    }

    opal_register_done = true;

    /*
     * This string is going to be used in opal/util/stacktrace.c
     */
    {
        char *string = NULL;
        int j;
        int signals[] = {
#ifdef SIGABRT
            SIGABRT,
#endif
#ifdef SIGBUS
            SIGBUS,
#endif
#ifdef SIGFPE
            SIGFPE,
#endif
#ifdef SIGSEGV
            SIGSEGV,
#endif
            -1
        };
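        /* build the default value: a comma-separated list of the signal
           numbers above */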
        for (j = 0 ; signals[j] != -1 ; ++j) {
            if (j == 0) {
                asprintf(&string, "%d", signals[j]);
            } else {
                char *tmp;
                asprintf(&tmp, "%s,%d", string, signals[j]);
                free(string);
                string = tmp;
            }
        }

        opal_signal_string = string;
        ret = mca_base_var_register ("opal", "opal", NULL, "signal",
                                     "Comma-delimited list of integer signal numbers that Open MPI should attempt to intercept.  Upon receipt of the intercepted signal, Open MPI will display a stack trace and abort.  Open MPI will *not* replace signals if handlers are already installed by the time MPI_INIT is invoked.  Optionally append \":complain\" to any signal number in the comma-delimited list to make Open MPI complain if it detects another signal handler (and therefore does not insert its own).",
				     MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				     OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
				     &opal_signal_string);
        free (string);
        if (0 > ret) {
            return ret;
        }
    }

#if defined(HAVE_SCHED_YIELD)
    opal_progress_yield_when_idle = false;
    ret = mca_base_var_register ("opal", "opal", "progress", "yield_when_idle",
                                 "Yield the processor when waiting on progress",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL,
                                 &opal_progress_yield_when_idle);
#endif

#if OPAL_ENABLE_DEBUG
    opal_progress_debug = false;
    ret = mca_base_var_register ("opal", "opal", "progress", "debug",
				 "Set to non-zero to debug progress engine features",
				 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL,
				 &opal_progress_debug);
    if (0 > ret) {
        return ret;
    }

    opal_debug_threads = false;
    ret = mca_base_var_register ("opal", "opal", "debug", "threads",
				 "Debug thread usage within OPAL. Reports out "
				 "when threads are acquired and released.",
				 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL,
				 &opal_debug_threads);
    if (0 > ret) {
        return ret;
    }
#endif

#if OPAL_ENABLE_FT_CR == 1
    opal_base_distill_checkpoint_ready = false;
    ret = mca_base_var_register("opal", "opal", "base", "distill_checkpoint_ready",
                                "Distill only those components that are Checkpoint Ready",
                                MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL,
                                &opal_base_distill_checkpoint_ready);

    if (0 > ret) {
        return ret;
    }
#endif

    /* RFC1918 defines
       - 10.0.0.0/8
       - 172.16.0.0/12
       - 192.168.0.0/16
       
       RFC3330 also mentions
       - 169.254.0.0/16 for DHCP onlink iff there's no DHCP server
    */
    opal_net_private_ipv4 = "10.0.0.0/8;172.16.0.0/12;192.168.0.0/16;169.254.0.0/16";
    ret = mca_base_var_register ("opal", "opal", "net", "private_ipv4",
				 "Semicolon-delimited list of CIDR notation entries specifying what networks are considered \"private\" (default value based on RFC1918 and RFC3330)",
				 MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				 OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL_EQ,
				 &opal_net_private_ipv4);
    if (0 > ret) {
        return ret;
    }

    opal_set_max_sys_limits = NULL;
    ret = mca_base_var_register ("opal", "opal", NULL, "set_max_sys_limits",
                                 "Set the specified system-imposed limits to the specified value, including \"unlimited\". "
                                 "Supported params: core, filesize, maxmem, openfiles, stacksize, maxchildren",
				 MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				 OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL_EQ,
				 &opal_set_max_sys_limits);
    if (0 > ret) {
        return ret;
    }

    ret = mca_base_var_register("opal", "opal", NULL, "built_with_cuda_support",
                                "Whether CUDA GPU buffer support is built into library or not",
                                MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_CONSTANT,
                                &opal_built_with_cuda_support);
    if (0 > ret) {
        return ret;
    }

    /* Current default is to enable CUDA support if it is built into library */
    opal_cuda_support = opal_built_with_cuda_support;
    ret = mca_base_var_register ("opal", "opal", NULL, "cuda_support",
                                 "Whether CUDA GPU buffer support is enabled or not",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                 OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL_EQ,
                                 &opal_cuda_support);
    if (0 > ret) {
        return ret;
    }

    /* Leave pinned parameter */
    opal_leave_pinned = -1;
    ret = mca_base_var_register("ompi", "mpi", NULL, "leave_pinned",
                                "Whether to use the \"leave pinned\" protocol or not.  Enabling this setting can help bandwidth performance when repeatedly sending and receiving large messages with the same buffers over RDMA-based networks (0 = do not use \"leave pinned\" protocol, 1 = use \"leave pinned\" protocol, -1 = allow network to choose at runtime).",
                                MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                OPAL_INFO_LVL_9,
                                MCA_BASE_VAR_SCOPE_READONLY,
                                &opal_leave_pinned);
    mca_base_var_register_synonym(ret, "opal", "opal", NULL, "leave_pinned",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    opal_leave_pinned_pipeline = false;
    ret = mca_base_var_register("ompi", "mpi", NULL, "leave_pinned_pipeline",
                                "Whether to use the \"leave pinned pipeline\" protocol or not.",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &opal_leave_pinned_pipeline);
    mca_base_var_register_synonym(ret, "opal", "opal", NULL, "leave_pinned_pipeline",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
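
    /* leave_pinned and leave_pinned_pipeline are mutually exclusive;
       if both were requested, keep leave_pinned and warn */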

    if (opal_leave_pinned > 0 && opal_leave_pinned_pipeline) {
        opal_leave_pinned_pipeline = 0;
        opal_show_help("help-opal-runtime.txt",
                       "mpi-params:leave-pinned-and-pipeline-selected",
                       true);
    }

#if OPAL_ENABLE_TIMING
    opal_timing_sync_file = NULL;
    (void) mca_base_var_register ("opal", "opal", NULL, "timing_sync_file",
                                  "Clock synchronisation information generated by the mpisync tool. You don't need to touch this if you use the mpirun_prof tool.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
                                  &opal_timing_sync_file);
    if( opal_timing_clocksync_read(opal_timing_sync_file) ){
        opal_output(0, "Cannot read file %s containing clock synchronisation information\n", opal_timing_sync_file);
    }

    opal_timing_output = NULL;
    (void) mca_base_var_register ("opal", "opal", NULL, "timing_output",
                                  "The name of output file for timing information. If this parameter is not set then output will be directed into OPAL debug channel.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
                                  &opal_timing_output);

    opal_timing_overhead = true;
    (void) mca_base_var_register ("opal", "opal", NULL, "timing_overhead",
                                  "The timing framework introduces additional overhead (mostly mallocs)."
                                  " The time spent in such costly routines is measured and may be accounted for"
                                  " (subtracted from timestamps). 'true' means consider the overhead, 'false' means ignore it (default: true).",
                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
                                  &opal_timing_overhead);
#endif

    opal_warn_on_fork = true;
    (void) mca_base_var_register("ompi", "mpi", NULL, "warn_on_fork",
                                 "If nonzero, issue a warning if program forks under conditions that could cause system errors",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &opal_warn_on_fork);

    /* The ddt engine has a few parameters */
    ret = opal_datatype_register_params();
    if (OPAL_SUCCESS != ret) {
        return ret;
    }

    /* dss has parameters */
    ret = opal_dss_register_vars ();
    if (OPAL_SUCCESS != ret) { 
        return ret; 
    }

    return OPAL_SUCCESS;
}
Example #29
0
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_create(opal_shmem_ds_t *ds_buf,
               const char *file_name,
               size_t size)
{
    int rc = OPAL_SUCCESS;
    pid_t my_pid = getpid();
    char *temp1 = NULL, *temp2 = NULL;
    bool space_available = false;
    uint64_t amount_space_avail = 0;

    /* the real size of the shared memory segment.  this includes enough space
     * to store our segment header.
     */
    size_t real_size = size + sizeof(opal_shmem_seg_hdr_t);
    opal_shmem_seg_hdr_t *seg_hdrp = NULL;
    HANDLE hMapObject = INVALID_HANDLE_VALUE;
    LPVOID lpvMem = NULL;

    /* init the contents of opal_shmem_ds_t */
    shmem_ds_reset(ds_buf);

    /* On Windows the shared segment is created by the OS directly from
     * system resources, so no backing file is involved in the operation.
     * However, a unique key must be used as the name of the shared memory
     * object so that all processes can access the same shared memory
     * region. The key is obtained from the original file_name by replacing
     * all path separator occurrences with '/' (as '\' is not allowed in the
     * object name).
     */
    temp1 = strdup(file_name);
    if (NULL == temp1) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    temp2 = temp1;
    while (NULL != (temp2 = strchr(temp2, OPAL_PATH_SEP[0])) ) {
        *temp2 = '/';
    }
    /* let's make sure we have enough space for the backing file */
    if (OPAL_SUCCESS != (rc = enough_space(temp1,
                                           real_size,
                                           &amount_space_avail,
                                           &space_available))) {
        opal_output(0, "shmem: windows: an error occurred while determining "
                    "whether or not %s could be created.", temp1);
        /* rc is set */
        free(temp1);
        goto out;
    }
    if (!space_available) {
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        opal_show_help("help-opal-shmem-windows.txt", "target full", 1,
                       temp1, hn, (unsigned long)real_size,
                       (unsigned long long)amount_space_avail);
        free(temp1);
        goto out;
    }
    /* enough space is available, so create the segment */
                                   /* use paging file */
    hMapObject = CreateFileMapping(INVALID_HANDLE_VALUE,
                                   /* no security attributes */
                                   NULL,
                                   /* read/write access */
                                   PAGE_READWRITE,
                                   /* size: high 32-bits */
                                   0,
                                   /* size: low 32-bits */
                                   (DWORD)real_size,
                                   /* name of map object */
                                   temp1);
    if (NULL == hMapObject) {
        rc = OPAL_ERROR;
        goto out;
    }
    /* Get a pointer to the file-mapped shared memory. */
    lpvMem = MapViewOfFile(hMapObject,          /* object to map view of */
                           FILE_MAP_WRITE,      /* read/write access */
                           0,                   /* high offset:  map from */
                           0,                   /* low offset:   beginning */
                           0);                  /* default: map entire file */
    if (NULL == lpvMem) {
        rc = OPAL_ERROR;
        goto out;
    }

    seg_hdrp = (opal_shmem_seg_hdr_t *)lpvMem;

    /* all is well */
    /* -- initialize the shared memory segment -- */
    opal_atomic_rmb();

    /* init segment lock */
    opal_atomic_init(&seg_hdrp->lock, OPAL_ATOMIC_UNLOCKED);
    /* i was the creator of this segment, so note that fact */
    seg_hdrp->cpid = my_pid;

    opal_atomic_wmb();

    /* -- initialize the contents of opal_shmem_ds_t -- */
    ds_buf->seg_cpid = my_pid;
    ds_buf->seg_size = real_size;
    ds_buf->seg_base_addr = (unsigned char *)seg_hdrp;
    /* update path change in ds_buf */
    memcpy(ds_buf->seg_name, temp1, OPAL_PATH_MAX);
    /* release the temporary file name */
    free(temp1);

    /* set "valid" bit because segment creation was successful */
    OPAL_SHMEM_DS_SET_VALID(ds_buf);

    OPAL_OUTPUT_VERBOSE(
        (70, opal_shmem_base_output,
         "%s: %s: create successful "
         "(id: %d, size: %"PRIsize_t", name: %s)\n",
         mca_shmem_windows_component.super.base_version.mca_type_name,
         mca_shmem_windows_component.super.base_version.mca_component_name,
         ds_buf->seg_id, ds_buf->seg_size, ds_buf->seg_name)
    );

out:
    /* if an error occurred, invalidate the shmem object and unmap the view if needed */
    if (OPAL_SUCCESS != rc) {
        if (NULL != seg_hdrp) {
            UnmapViewOfFile((LPVOID)seg_hdrp);
        }
        shmem_ds_reset(ds_buf);
    }
    return rc;
}
Example #30
0
int mca_bml_r2_add_procs(
                         size_t nprocs, 
                         struct ompi_proc_t** procs, 
                         struct mca_bml_base_endpoint_t** bml_endpoints, 
                         struct ompi_bitmap_t* reachable
                         )
{
    size_t p;
    int rc;
    size_t p_index;
    struct mca_btl_base_endpoint_t ** btl_endpoints = NULL;  
    struct ompi_proc_t** new_procs = NULL; 
    size_t n_new_procs = 0;
    int ret = OMPI_SUCCESS;
    struct ompi_proc_t *unreach_proc = NULL;

    if(0 == nprocs) {
        return OMPI_SUCCESS;
    }
    
    if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) {
        return rc;
    }
    
    new_procs = (struct ompi_proc_t **) 
        malloc(nprocs * sizeof(struct ompi_proc_t *)); 
    if (NULL == new_procs ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
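
    /* No endpoints are known yet: entries are filled in below for procs
       that already have a BML endpoint, and later for newly reachable procs */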
    memset(bml_endpoints, 0, nprocs * sizeof(struct mca_bml_base_endpoint_t*));

    for(p_index = 0; p_index < nprocs; p_index++) { 
        struct ompi_proc_t* proc;
        proc = procs[p_index]; 
        OBJ_RETAIN(proc); 
        
        if(NULL !=  proc->proc_bml) { 
            bml_endpoints[p_index] = 
                (mca_bml_base_endpoint_t*) proc->proc_bml; 
        } else { 
            new_procs[n_new_procs++] = proc; 
        }
    }

    if ( 0 == n_new_procs ) {
        free(new_procs);
        return OMPI_SUCCESS;
    }
    
    procs = new_procs; 
    nprocs = n_new_procs; 
    
    /* attempt to add all procs to each r2 */
    btl_endpoints = (struct mca_btl_base_endpoint_t **) 
        malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*)); 
    if (NULL == btl_endpoints) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    
    for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
        mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
        int btl_inuse = 0;
        
        /* if the r2 can reach the destination proc it sets the
         * corresponding bit (proc index) in the reachable bitmap
         * and can return addressing information for each proc
         * that is passed back to the r2 on data transfer calls
         */
        ompi_bitmap_clear_all_bits(reachable);
        memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); 

        rc = btl->btl_add_procs(btl, n_new_procs, new_procs, btl_endpoints, reachable);
        if(OMPI_SUCCESS != rc) {
            free(btl_endpoints);
            return rc;
        }

        /* for each proc that is reachable - add the endpoint to the bml_endpoints array(s) */
        for(p=0; p<n_new_procs; p++) {
            if(ompi_bitmap_is_set_bit(reachable, p)) {
                ompi_proc_t *proc = new_procs[p]; 
                mca_bml_base_endpoint_t * bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; 
                mca_bml_base_btl_t* bml_btl; 
                size_t size;
                
                btl_inuse++;

                if(NULL == bml_endpoint) { 
                    
                    
                    /* allocate bml specific proc data */
                    bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t);
                    if (NULL == bml_endpoint) {
                        opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources");
                        free(btl_endpoints);
                        return OMPI_ERR_OUT_OF_RESOURCE;
                    }
                    
                    /* preallocate space in array for max number of r2s */
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send,  mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma,  mca_bml_r2.num_btl_modules);
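                    /* -1 wraps to the maximum value; it is reduced to the
                       minimum btl_max_send_size over all BTLs below */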
                    bml_endpoint->btl_max_send_size = -1;
                    bml_endpoint->btl_proc = proc;
                    proc->proc_bml = bml_endpoint; 
                 
                    bml_endpoint->btl_flags_and = 0;
                    bml_endpoint->btl_flags_or = 0;
                }

                bml_endpoints[p] =(mca_bml_base_endpoint_t*)  proc->proc_bml; 
                
                
                /* don't allow an additional BTL with a lower exclusivity ranking */
                size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
                if(size > 0) {
                    bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
                    /* skip this btl if the exclusivity is less than the previous */
                    if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) {
                        if(btl_endpoints[p] != NULL) {
                            btl->btl_del_procs(btl, 1, &proc, &btl_endpoints[p]);
                        }
                        btl_inuse--;
                        continue;
                    }
                }

                /* cache the endpoint on the proc */
                bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
                bml_btl->btl = btl;
                bml_btl->btl_eager_limit = btl->btl_eager_limit;
                bml_btl->btl_min_send_size = btl->btl_min_send_size;
                bml_btl->btl_max_send_size = btl->btl_max_send_size;
                bml_btl->btl_min_rdma_size = btl->btl_min_rdma_size;
                bml_btl->btl_max_rdma_size = btl->btl_max_rdma_size;
                bml_btl->btl_cache = NULL;
                bml_btl->btl_endpoint = btl_endpoints[p];
                bml_btl->btl_weight = 0;
                bml_btl->btl_alloc = btl->btl_alloc;
                bml_btl->btl_free = btl->btl_free;
                bml_btl->btl_prepare_src = btl->btl_prepare_src;
                bml_btl->btl_prepare_dst = btl->btl_prepare_dst;
                bml_btl->btl_send = btl->btl_send;
                bml_btl->btl_flags = btl->btl_flags; 
                bml_btl->btl_put = btl->btl_put;
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == bml_btl->btl_put) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
                                " the %s BTL without any PUT function attached. Discarding the flag!",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT;
                }
                bml_btl->btl_get = btl->btl_get;
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == bml_btl->btl_get) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
                                " the %s BTL without any GET function attached. Discarding the flag!",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET;
                }
                bml_btl->btl_mpool = btl->btl_mpool;
                if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
                    /**
                     * If no protocol is specified, we have 2 choices: ignore the BTL
                     * because we don't know which protocol to use, or assume that all
                     * BTLs support the send protocol.
                     */
                    bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND;
                }
                /**
                 * calculate the bitwise OR and AND of the btl flags 
                 */
                bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
                bml_endpoint->btl_flags_and &= bml_btl->btl_flags;
            }
        }
        if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
            size_t p;
            bool found = false;
            for(p=0; p<mca_bml_r2.num_btl_progress; p++) {
                if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) {
                    found = true;
                    break;
                }
            }
            if(found == false) {
                mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = 
                    btl->btl_component->btl_progress;
                mca_bml_r2.num_btl_progress++;
            }
        }
    }
    free(btl_endpoints);

    /* iterate back through procs and compute metrics for registered r2s */
    for(p=0; p<n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];
        mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
        double total_bandwidth = 0;
        uint32_t latency = 0xffffffff;
        size_t n_index;
        size_t n_size;

        /* skip over procs w/ no btl's registered */
        if(NULL == bml_endpoint) {
            continue;
        }

        /* (1) determine the total bandwidth available across all btls
         *     note that we need to do this here, as we may already have btls configured
         * (2) determine the highest priority ranking for latency
         * (3) compute the maximum number of bytes that can be sent without any
         *     weighting. Once the amount left over is smaller than this number we will
         *     start using the weights to compute the correct amount.
         */
        n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); 
        bml_endpoint->bml_max_send_length = 0;
        bml_endpoint->bml_max_rdma_length = 0;
        bml_endpoint->btl_rdma_index = 0;
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl = 
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t* btl = bml_btl->btl;
            total_bandwidth += btl->btl_bandwidth;
            if(btl->btl_latency < latency) {
                latency = btl->btl_latency;
            }
            bml_endpoint->bml_max_send_length += btl->btl_bandwidth;
        }
        
        /* (1) set the weight of each btl as a percentage of overall bandwidth
         * (2) copy all btl instances at the highest priority ranking into the
         *     list of btls used for first fragments
         */
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl = 
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t *btl = bml_btl->btl;

            /* compute weighting factor for this r2 */
            if(btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = btl->btl_bandwidth / total_bandwidth;
            } else {
                bml_btl->btl_weight = 1.0 / n_size;
            }

            /* copy this btl into the array of btls used for first (eager)
             * fragments if it offers the lowest latency
             */
            if(btl->btl_latency == latency) {
                mca_bml_base_btl_t* bml_btl_new = 
                    mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager);
                *bml_btl_new = *bml_btl;
            }

            /* set endpoint max send size as min of available btls */
            if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
               bml_endpoint->btl_max_send_size = btl->btl_max_send_size;

            /* check flags - is rdma preferred? */
            if((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) &&
               proc->proc_arch == ompi_proc_local_proc->proc_arch) {
                mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
                *bml_btl_rdma = *bml_btl;
                if(bml_endpoint->btl_rdma_offset < bml_btl_rdma->btl_min_rdma_size) {
                    bml_endpoint->btl_rdma_offset = bml_btl_rdma->btl_min_rdma_size;
                }
            }
        }
    }

    /* see if we have a connection to everyone else */
    for(p=0; p<n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];

        if (NULL == proc->proc_bml) {
            if (NULL == unreach_proc) {
                unreach_proc = proc;
            }
            ret = OMPI_ERR_UNREACH;
        }
    }

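    /* report the first unreachable peer if the user asked to see these errors */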
    if (mca_bml_r2.show_unreach_errors && 
        OMPI_ERR_UNREACH == ret) {
        char *local, *remote;

        orte_ns.get_proc_name_string(&local,
                                     &(ompi_proc_local_proc->proc_name));
        orte_ns.get_proc_name_string(&remote,
                                     &(unreach_proc->proc_name));

        opal_show_help("help-mca-bml-r2",
                       "unreachable proc",
                       true, local, remote, NULL);

        free(local);
        free(remote);
    }

    free(new_procs); 

    return ret;
}
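
/*
 * The bandwidth/latency policy applied above can be shown in isolation.
 * The helper below is a minimal sketch and NOT part of the r2 BML: the
 * struct and function names (path_info, compute_path_weights) are
 * hypothetical stand-ins for mca_bml_base_btl_t and the weighting loop in
 * mca_bml_r2_add_procs. Each path is weighted by its share of the total
 * bandwidth (with an even split as the fallback for paths reporting zero
 * bandwidth), and only the lowest-latency paths are flagged for first
 * ("eager") fragments.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

struct path_info {           /* hypothetical stand-in for mca_bml_base_btl_t */
    double   bandwidth;      /* advertised bandwidth of this path */
    uint32_t latency;        /* advertised latency of this path */
    double   weight;         /* computed share of the traffic */
    bool     eager;          /* true if used for first fragments */
};

static void compute_path_weights(struct path_info *paths, size_t n)
{
    double   total_bandwidth = 0.0;
    uint32_t best_latency = UINT32_MAX;
    size_t   i;

    if (0 == n) {
        return;
    }

    /* first pass: accumulate the total bandwidth and find the best latency */
    for (i = 0; i < n; i++) {
        total_bandwidth += paths[i].bandwidth;
        if (paths[i].latency < best_latency) {
            best_latency = paths[i].latency;
        }
    }

    /* second pass: weight each path by its bandwidth share and flag the
     * lowest-latency paths as eligible for eager (first-fragment) traffic
     */
    for (i = 0; i < n; i++) {
        if (paths[i].bandwidth > 0.0 && total_bandwidth > 0.0) {
            paths[i].weight = paths[i].bandwidth / total_bandwidth;
        } else {
            paths[i].weight = 1.0 / (double)n;
        }
        paths[i].eager = (paths[i].latency == best_latency);
    }
}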