Exemple #1
0
static void orte_node_destruct(orte_node_t* node)
{
    int i;
    opal_node_stats_t *stats;

    if (NULL != node->name) {
        free(node->name);
        node->name = NULL;
    }

    if (NULL != node->alias) {
        opal_argv_free(node->alias);
        node->alias = NULL;
    }
    
    if (NULL != node->daemon) {
        node->daemon->node = NULL;
        OBJ_RELEASE(node->daemon);
        node->daemon = NULL;
    }
    
    for (i=0; i < node->procs->size; i++) {
        if (NULL != node->procs->addr[i]) {
            ((orte_proc_t*)(node->procs->addr[i]))->node = NULL;
            OBJ_RELEASE(node->procs->addr[i]);
            node->procs->addr[i] = NULL;
        }
    }
    OBJ_RELEASE(node->procs);
    
    if (NULL != node->cpu_set) {
        free(node->cpu_set);
        node->cpu_set = NULL;
    }
    if (NULL != node->username) {
        free(node->username);
        node->username = NULL;
    }
    
    /* do NOT destroy the topology */

    while (NULL != (stats = (opal_node_stats_t*)opal_ring_buffer_pop(&node->stats))) {
        OBJ_RELEASE(stats);
    }
    OBJ_DESTRUCT(&node->stats);
}
Exemple #2
0
/*
 * Locates a file with certain permissions from a list of search paths
 */
char *opal_path_findv(char *fname, int mode, char **envv, char *wrkdir)
{
    char **dirv;     
    char *fullpath; 
    char *path;    
    int dirc;    
    int i;
    bool found_dot = false;

    /* Set the local search paths. */

    dirc = 0;
    dirv = NULL;

    if (NULL != (path = list_env_get("PATH", envv))) {
        path_env_load(path, &dirc, &dirv);
    }

    /* Replace the "." path by the working directory. */

    if (NULL != wrkdir) { 
        for (i = 0; i < dirc; ++i) {
            if (0 == strcmp(dirv[i], ".")) {
                found_dot = true;
                free(dirv[i]);
                dirv[i] = strdup(wrkdir);
                if (NULL == dirv[i]){
                    return NULL;
                }
            }
        }
    }

    /* If we didn't find "." in the path and we have a wrkdir, append
       the wrkdir to the end of the path */

    if (!found_dot && NULL != wrkdir) {
        opal_argv_append(&dirc, &dirv, wrkdir);
    }

    if(NULL == dirv)
        return NULL;
    fullpath = opal_path_find(fname, dirv, mode, envv);
    opal_argv_free(dirv);
    return fullpath;
}
Exemple #3
0
static bool test2(void)
{
  int i, j;
  char **argv = NULL;
  int argc;
  char *a[] = { "aaa", "bbb", "ccc", NULL };
  char *b[4];

  /* Similar test to above, but ensure that opal_argv_add is actually
     *copying* the string by value, not by reference.  So copy the a
     array into b, and then opal_argv_add() from b.  After that,
     scribble in the first character of the b[] string that we just
     added, and compare all entries in a to argv -- they should be
     identical (even though b is now corrupted). */

  for (i = 0; a[i] != NULL; ++i) {
    b[i] = strdup(a[i]);
  }
  b[i] = NULL;

  for (i = 0; b[i] != NULL; ++i) {
    if (opal_argv_append(&argc, &argv, b[i]) != OPAL_SUCCESS) {
      return false;
    }
    ++b[i][0];
    for (j = 0; j <= i; ++j) {
      if (strcmp(argv[j], a[j]) != 0) {
        return false;
      }
    }
    if (NULL != argv[i + 1]) {
      return false;
    }
    if (argc != i + 1) {
      return false;
    }
  }

  opal_argv_free(argv);
  for (i = 0; b[i] != NULL; ++i) {
    free(b[i]);
  }

  return true;
}
Exemple #4
0
static void orte_node_destruct(orte_node_t* node)
{
    int i;
    opal_node_stats_t *stats;
    orte_proc_t *proc;

    if (NULL != node->name) {
        free(node->name);
        node->name = NULL;
    }

    if (NULL != node->alias) {
        opal_argv_free(node->alias);
        node->alias = NULL;
    }
    
    if (NULL != node->daemon) {
        node->daemon->node = NULL;
        OBJ_RELEASE(node->daemon);
        node->daemon = NULL;
    }
    
    for (i=0; i < node->procs->size; i++) {
        if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
            opal_pointer_array_set_item(node->procs, i, NULL);
            OBJ_RELEASE(proc);
        }
    }
    OBJ_RELEASE(node->procs);
    
    /* we release the topology elsewhere */

    if (NULL != node->username) {
        free(node->username);
        node->username = NULL;
    }
    
    /* do NOT destroy the topology */

    while (NULL != (stats = (opal_node_stats_t*)opal_ring_buffer_pop(&node->stats))) {
        OBJ_RELEASE(stats);
    }
    OBJ_DESTRUCT(&node->stats);
}
static char* get_node_list(orte_app_context_t *app)
{
    int j;
    char **total_host = NULL;
    char *nodes;

    if (NULL == app->dash_host) {
        return NULL;
    }
    for (j=0; NULL != app->dash_host[j]; j++) {
        opal_argv_append_unique_nosize(&total_host, app->dash_host[j], false);
    }
    if (NULL == total_host) {
        return NULL;
    }

    nodes = opal_argv_join(total_host, ',');
    opal_argv_free(total_host);
    return nodes;
}
Exemple #6
0
int opal_event_init(void)
{
    char **includes=NULL;
    bool dumpit=false;
    int i, j;

    if (opal_output_get_verbosity(opal_event_base_framework.framework_output) > 4) {
        event_enable_debug_mode();
        dumpit = true;
    }

    if (NULL == event_module_include) {
        /* Shouldn't happen, but... */
        event_module_include = strdup("select");
    }
    includes = opal_argv_split(event_module_include,',');

    /* get a configuration object */
    config = event_config_new();
    /* cycle thru the available subsystems */
    for (i = 0 ; NULL != eventops[i] ; ++i) {
        /* if this module isn't included in the given ones,
         * then exclude it
         */
        dumpit = true;
        for (j=0; NULL != includes[j]; j++) {
            if (0 == strcmp("all", includes[j]) ||
                0 == strcmp(eventops[i]->name, includes[j])) {
                dumpit = false;
                break;
            }
        }
        if (dumpit) {
            event_config_avoid_method(config, eventops[i]->name);
        }
    }
    opal_argv_free(includes);

    return OPAL_SUCCESS;
}
Exemple #7
0
int orte_util_get_ordered_dash_host_list(opal_list_t *nodes,
                                         char ** host_argv)
{
    int rc, i;
    char **mapped_nodes = NULL;
    orte_node_t *node;

    if (ORTE_SUCCESS != (rc = parse_dash_host(&mapped_nodes, host_argv))) {
        ORTE_ERROR_LOG(rc);
    }
    
    /* for each entry, create a node entry on the list */
    for (i=0; NULL != mapped_nodes[i]; i++) {
        node = OBJ_NEW(orte_node_t);
        node->name = strdup(mapped_nodes[i]);
        opal_list_append(nodes, &node->super);
    }
    
    /* cleanup */
    opal_argv_free(mapped_nodes);
    return rc;
}
Exemple #8
0
static bool test6(void)
{
  char *a = "the quick brown fox jumped over the lazy dog a_really_long_argument_to_force_a_long_copy_zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz";
  char **b;
  char *c;

  /* split the string above and then join it -- the joined version
     should be just like the original */

  b = opal_argv_split(a, ' ');
  c = opal_argv_join(b, ' ');

  if (strcmp(a, c) != 0) {
    return false;
  }

  /* All done */

  free(c);
  opal_argv_free(b);
  return true;
}
Exemple #9
0
static bool test1(void)
{
  char *a[] = { "argv1", "argv2", "argv3", NULL };
  char **argv = NULL;
  int i, j, argc = 28;

  /* Test basic functionality.  Start with a NULL argv and add the
     contents of the a array.

     Set argc to be an initiall bogus number -- opal_argv_add() should
     reset it back to zero after the first iteration.

     After adding the a[i], ensure that argv[0 - (i-1)] are the same
     as a[0 - (i-1)].

     Also ensure that a[i + 1] == NULL and that argc is the proper
     value. */

  for (i = 0; a[i] != NULL; ++i) {
    if (opal_argv_append(&argc, &argv, a[i]) != OPAL_SUCCESS) {
      return false;
    }
    for (j = 0; j <= i; ++j) {
      if (strcmp(argv[j], a[j]) != 0) {
        return false;
      }
    }
    if (NULL != argv[i + 1]) {
      return false;
    }
    if (argc != i + 1) {
      return false;
    }
  }
  opal_argv_free(argv);

  return true;
}
static void
load_env_data_argv(const char *project, const char *flag, char ***data)
{
    char *envname;
    char *envvalue;

    if (NULL == project || NULL == flag) return;

    asprintf(&envname, "%s_MPI%s", project, flag);
    if (NULL == (envvalue = getenv(envname))) {
        free(envname);
        asprintf(&envname, "%s_%s", project, flag);
        if (NULL == (envvalue = getenv(envname))) {
            free(envname);
            return;
        }
    } 
    free(envname);

    if (NULL != *data) opal_argv_free(*data);

    *data = opal_argv_split(envvalue, ' ');
}
static void
options_data_expand(const char *value)
{
    /* make space for the new set of args */
    parse_options_idx++;
    options_data = (struct options_data_t *) realloc(options_data, sizeof(struct options_data_t) * (parse_options_idx + 1));
    options_data_init(&(options_data[parse_options_idx]));

    /* if there are values, this is not the default case.
       Otherwise, it's the default case... */
    if (NULL != value && 0 != strcmp(value, "")) {
        char **values = opal_argv_split(value, ';');
        opal_argv_insert(&(options_data[parse_options_idx].compiler_args),
                         opal_argv_count(options_data[parse_options_idx].compiler_args),
                         values);
        opal_argv_free(values);
    } else {
        free(options_data[parse_options_idx].compiler_args);
        options_data[parse_options_idx].compiler_args = NULL;
        /* this is a default */
        default_data_idx = parse_options_idx;
    }
}
static int allocate(opal_list_t *nodes)
{
    char **nodelist;
    orte_node_t *node;
    int i, num_nodes;

    /* get the list of allocated nodes */
    if ((num_nodes = lsb_getalloc(&nodelist)) < 0) {
        orte_show_help("help-ras-lsf.txt", "nodelist-failed", true);
        return ORTE_ERR_NOT_AVAILABLE;
    }
    
    node = NULL;
    
    /* step through the list */
    for (i = 0; i < num_nodes; i++) {
        /* is this a repeat of the current node? */
        if (NULL != node && 0 == strcmp(nodelist[i], node->name)) {
            /* it is a repeat - just bump the slot count */
            ++node->slots;
            continue;
        }
        
        /* not a repeat - create a node entry for it */
        node = OBJ_NEW(orte_node_t);
        node->name = strdup(nodelist[i]);
        node->slots_inuse = 0;
        node->slots_max = 0;
        node->slots = 1;
        opal_list_append(nodes, &node->super);
    }
        
    /* release the nodelist from lsf */
    opal_argv_free(nodelist);

    return ORTE_SUCCESS;
}
Exemple #13
0
int main(int argc, char *argv[])
{
    int ret = 0;
    bool want_help = false;
    bool cmd_error = false;
    bool acted = false;
    bool want_all = false;
    char **app_env = NULL, **global_env = NULL;
    int i, len;
    char *str;
    
    /* Initialize the argv parsing handle */
    if (OMPI_SUCCESS != opal_init_util(&argc, &argv)) {
        orte_show_help("help-ompi_info.txt", "lib-call-fail", true, 
                       "opal_init_util", __FILE__, __LINE__, NULL);
        exit(ret);
    }
    
    ompi_info_cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (NULL == ompi_info_cmd_line) {
        ret = errno;
        orte_show_help("help-ompi_info.txt", "lib-call-fail", true, 
                       "opal_cmd_line_create", __FILE__, __LINE__, NULL);
        opal_finalize_util();
        exit(ret);
    }
    
    opal_cmd_line_make_opt3(ompi_info_cmd_line, 'v', NULL, "version", 2, 
                            "Show version of Open MPI or a component.  The first parameter can be the keywords \"ompi\" or \"all\", a framework name (indicating all components in a framework), or a framework:component string (indicating a specific component).  The second parameter can be one of: full, major, minor, release, greek, svn.");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "param", 2, 
                            "Show MCA parameters.  The first parameter is the framework (or the keyword \"all\"); the second parameter is the specific component name (or the keyword \"all\").");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "internal", 0, 
                            "Show internal MCA parameters (not meant to be modified by users)");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "path", 1, 
                            "Show paths that Open MPI was configured with.  Accepts the following parameters: prefix, bindir, libdir, incdir, mandir, pkglibdir, sysconfdir");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "arch", 0, 
                            "Show architecture Open MPI was compiled on");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, 'c', NULL, "config", 0, 
                            "Show configuration options");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, 'h', NULL, "help", 0, 
                            "Show this help message");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "ompi_info_pretty", 0, 
                            "When used in conjunction with other parameters, the output is displayed in 'ompi_info_prettyprint' format (default)");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "parsable", 0, 
                            "When used in conjunction with other parameters, the output is displayed in a machine-parsable format");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "parseable", 0, 
                            "Synonym for --parsable");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "hostname", 0, 
                            "Show the hostname that Open MPI was configured "
                            "and built on");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, 'a', NULL, "all", 0, 
                            "Show all configuration options and MCA parameters");
    
    /* Call some useless functions in order to guarantee to link in some
     * global variables.  Only check the return value so that the
     * compiler doesn't optimize out the useless function.
     */
    
    if (OMPI_SUCCESS != ompi_comm_link_function()) {
        /* Stop .. or I'll say stop again! */
        ++ret;
    } else {
        --ret;
    }
    
    /* set our threading level */
    opal_set_using_threads(false);
    
    /* Get MCA parameters, if any */
    
    if( OMPI_SUCCESS != mca_base_open() ) {
        orte_show_help("help-ompi_info.txt", "lib-call-fail", true, "mca_base_open", __FILE__, __LINE__ );
        OBJ_RELEASE(ompi_info_cmd_line);
        opal_finalize_util();
        exit(1);
    }
    mca_base_cmd_line_setup(ompi_info_cmd_line);
    
    /* Do the parsing */
    
    if (OMPI_SUCCESS != opal_cmd_line_parse(ompi_info_cmd_line, false, argc, argv)) {
        cmd_error = true;
    }
    if (!cmd_error && 
        (opal_cmd_line_is_taken(ompi_info_cmd_line, "help") || 
         opal_cmd_line_is_taken(ompi_info_cmd_line, "h"))) {
        want_help = true;
    }
    if (cmd_error || want_help) {
        char *usage = opal_cmd_line_get_usage_msg(ompi_info_cmd_line);
        orte_show_help("help-ompi_info.txt", "usage", true, usage);
        free(usage);
        mca_base_close();
        OBJ_RELEASE(ompi_info_cmd_line);
        opal_finalize_util();
        exit(cmd_error ? 1 : 0);
    }
    
    mca_base_cmd_line_process_args(ompi_info_cmd_line, &app_env, &global_env);
    
    /* putenv() all the stuff that we got back from env (in case the
     * user specified some --mca params on the command line).  This
     * creates a memory leak, but that's unfortunately how putenv()
     * works.  :-(
     */
    
    len = opal_argv_count(app_env);
    for (i = 0; i < len; ++i) {
        putenv(app_env[i]);
    }
    len = opal_argv_count(global_env);
    for (i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }
    
    /* setup the mca_types array */
    OBJ_CONSTRUCT(&mca_types, opal_pointer_array_t);
    opal_pointer_array_init(&mca_types, 256, INT_MAX, 128);
    
    opal_pointer_array_add(&mca_types, "mca");
    opal_pointer_array_add(&mca_types, "mpi");
    opal_pointer_array_add(&mca_types, "orte");
    opal_pointer_array_add(&mca_types, "opal");
    
    opal_pointer_array_add(&mca_types, "filter");
    opal_pointer_array_add(&mca_types, "backtrace");
    opal_pointer_array_add(&mca_types, "memchecker");
    opal_pointer_array_add(&mca_types, "memory");
    opal_pointer_array_add(&mca_types, "paffinity");
    opal_pointer_array_add(&mca_types, "carto");
    opal_pointer_array_add(&mca_types, "shmem");
    opal_pointer_array_add(&mca_types, "maffinity");
    opal_pointer_array_add(&mca_types, "timer");
    opal_pointer_array_add(&mca_types, "installdirs");
    opal_pointer_array_add(&mca_types, "sysinfo");
    opal_pointer_array_add(&mca_types, "hwloc");
#if OPAL_ENABLE_FT_CR == 1
    opal_pointer_array_add(&mca_types, "crs");
#endif
    opal_pointer_array_add(&mca_types, "dpm");
    opal_pointer_array_add(&mca_types, "pubsub");
    opal_pointer_array_add(&mca_types, "allocator");
    opal_pointer_array_add(&mca_types, "coll");
    opal_pointer_array_add(&mca_types, "io");
    opal_pointer_array_add(&mca_types, "mpool");
    opal_pointer_array_add(&mca_types, "pml");
    opal_pointer_array_add(&mca_types, "bml");
    opal_pointer_array_add(&mca_types, "rcache");
    opal_pointer_array_add(&mca_types, "btl");
    opal_pointer_array_add(&mca_types, "mtl");
    opal_pointer_array_add(&mca_types, "topo");
    opal_pointer_array_add(&mca_types, "osc");
    opal_pointer_array_add(&mca_types, "op");
    opal_pointer_array_add(&mca_types, "common");
#if OPAL_ENABLE_FT_CR == 1
    opal_pointer_array_add(&mca_types, "crcp");
#endif
    
#if !ORTE_DISABLE_FULL_SUPPORT
    opal_pointer_array_add(&mca_types, "iof");
    opal_pointer_array_add(&mca_types, "oob");
    opal_pointer_array_add(&mca_types, "odls");
    opal_pointer_array_add(&mca_types, "ras");
    opal_pointer_array_add(&mca_types, "rmaps");
    opal_pointer_array_add(&mca_types, "rml");
    opal_pointer_array_add(&mca_types, "routed");
    opal_pointer_array_add(&mca_types, "plm");
#if OPAL_ENABLE_FT_CR == 1
    opal_pointer_array_add(&mca_types, "snapc");
#endif
    opal_pointer_array_add(&mca_types, "filem");
#endif
    /* these are always included */
    opal_pointer_array_add(&mca_types, "errmgr");
    opal_pointer_array_add(&mca_types, "ess");
    opal_pointer_array_add(&mca_types, "grpcomm");
    opal_pointer_array_add(&mca_types, "notifier");
    
    /* Execute the desired action(s) */
    
    if (opal_cmd_line_is_taken(ompi_info_cmd_line, "ompi_info_pretty")) {
        ompi_info_pretty = true;
    } else if (opal_cmd_line_is_taken(ompi_info_cmd_line, "parsable") || opal_cmd_line_is_taken(ompi_info_cmd_line, "parseable")) {
        ompi_info_pretty = false;
    }
    
    want_all = opal_cmd_line_is_taken(ompi_info_cmd_line, "all");
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "version")) {
        ompi_info_do_version(want_all, ompi_info_cmd_line);
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "path")) {
        ompi_info_do_path(want_all, ompi_info_cmd_line);
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "arch")) {
        ompi_info_do_arch();
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "hostname")) {
        ompi_info_do_hostname();
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "config")) {
        ompi_info_do_config(true);
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "param")) {
        ompi_info_do_params(want_all, opal_cmd_line_is_taken(ompi_info_cmd_line, "internal"));
        acted = true;
    }
    
    /* If no command line args are specified, show default set */
    
    if (!acted) {
        ompi_info_show_ompi_version(ompi_info_ver_full);
        ompi_info_show_path(ompi_info_path_prefix, opal_install_dirs.prefix);
        ompi_info_do_arch();
        ompi_info_do_hostname();
        ompi_info_do_config(false);
        ompi_info_open_components();
        for (i = 0; i < mca_types.size; ++i) {
            if (NULL == (str = (char*)opal_pointer_array_get_item(&mca_types, i))) {
                continue;
            }
            if (0 != strcmp("mpi", str)) {
                ompi_info_show_component_version(str, ompi_info_component_all, 
                                       ompi_info_ver_full, ompi_info_type_all);
            }
        }
    }
    
    /* All done */
    
    if (NULL != app_env) {
        opal_argv_free(app_env);
    }
    if (NULL != global_env) {
        opal_argv_free(global_env);
    }
    ompi_info_close_components();
    OBJ_RELEASE(ompi_info_cmd_line);
    OBJ_DESTRUCT(&mca_types);
    mca_base_close();
    
    opal_finalize_util();
    
    return 0;
}
Exemple #14
0
int opal_os_dirpath_create(const char *path, const mode_t mode)
{
    struct stat buf;
    char **parts, *tmp;
    int i, len;
    int ret;

    if (NULL == path) { /* protect ourselves from errors */
        return(OPAL_ERROR);
    }

    if (0 == (ret = stat(path, &buf))) { /* already exists */
        if (mode == (mode & buf.st_mode)) { /* has correct mode */
            return(OPAL_SUCCESS);
        }
        if (0 == (ret = chmod(path, (buf.st_mode | mode)))) { /* successfully change mode */
            return(OPAL_SUCCESS);
        }
        opal_output(0,
                    "opal_os_dirpath_create: "
                    "Error: Unable to create directory (%s), unable to set the correct mode [%d]\n",
                    path, ret);
        return(OPAL_ERROR); /* can't set correct mode */
    }

    /* quick -- try to make directory */
    if (0 == mkdir(path, mode)) {
        return(OPAL_SUCCESS);
    }

    /* didnt work, so now have to build our way down the tree */
    /* Split the requested path up into its individual parts */

    parts = opal_argv_split(path, path_sep[0]);

    /* Ensure to allocate enough space for tmp: the strlen of the
       incoming path + 1 (for \0) */

    tmp = (char*)malloc(strlen(path) + 1);
    tmp[0] = '\0';

    /* Iterate through all the subdirectory names in the path,
       building up a directory name.  Check to see if that dirname
       exists.  If it doesn't, create it. */

    len = opal_argv_count(parts);
    for (i = 0; i < len; ++i) {
        if (i == 0) {
            /* If in POSIX-land, ensure that we never end a directory
               name with path_sep */

            if ('/' == path[0]) {
                strcat(tmp, path_sep);
            }
            strcat(tmp, parts[i]);
        }

        /* If it's not the first part, ensure that there's a
           preceeding path_sep and then append this part */

        else {
            if (path_sep[0] != tmp[strlen(tmp) - 1]) {
                strcat(tmp, path_sep);
            }
            strcat(tmp, parts[i]);
        }

        /* Now that we finally have the name to check, check it.
           Create it if it doesn't exist. */
        ret = mkdir(tmp, mode);
        if ((0 > ret && EEXIST != errno) || 0 != stat(tmp, &buf)) {
            opal_output(0,
                        "opal_os_dirpath_create: "
                        "Error: Unable to create the sub-directory (%s) of (%s), mkdir failed [%d]\n",
                        tmp, path, ret);
            opal_argv_free(parts);
            free(tmp);
            return OPAL_ERROR;
        }
    }

    /* All done */

    opal_argv_free(parts);
    free(tmp);
    return OPAL_SUCCESS;
}
Exemple #15
0
int opal_util_init_sys_limits(char **errmsg)
{
    char **lims, **lim=NULL, *setlim;
    int i, rc = OPAL_ERROR;
    rlim_t value;

    /* if limits were not given, then nothing to do */
    if (NULL == opal_set_max_sys_limits) {
        return OPAL_SUCCESS;
    }

    /* parse the requested limits to set */
    lims = opal_argv_split(opal_set_max_sys_limits, ',');
    if (NULL == lims) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* each limit is expressed as a "param:value" pair */
    for (i=0; NULL != lims[i]; i++) {
        lim = opal_argv_split(lims[i], ':');
        if (1 == opal_argv_count(lim)) {
            setlim = "max";
        } else {
            setlim = lim[1];
        }

        /* for historical reasons, a value of "1" means
         * that we set the limits on #files, #children,
         * and max file size
         */
        if (0 == strcmp(lim[0], "1")) {
#if HAVE_DECL_RLIMIT_NOFILE
            if (OPAL_SUCCESS !=
                opal_setlimit(RLIMIT_NOFILE, "max", &value)) {
                *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-failed", true, "openfiles", "max");
                goto out;
            }
            opal_sys_limits.num_files = value;
#endif
#if HAVE_DECL_RLIMIT_NPROC
            if (OPAL_SUCCESS != opal_setlimit(RLIMIT_NPROC, "max", &value)) {
                *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-failed", true, "maxchildren", "max");
                goto out;
            }
            opal_sys_limits.num_procs = value;
#endif
#if HAVE_DECL_RLIMIT_FSIZE
            if (OPAL_SUCCESS !=
                opal_setlimit(RLIMIT_FSIZE, "max", &value)) {
                *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-failed", true, "filesize", "max");
                goto out;
            }
            opal_sys_limits.file_size = value;
#endif
            break;
        } else if (0 == strcmp(lim[0], "0")) {
            /* user didn't want anything set */
            break;
        }

        /* process them separately */
        if (0 == strcmp(lim[0], "core")) {
#if HAVE_DECL_RLIMIT_CORE
            if (OPAL_SUCCESS != opal_setlimit(RLIMIT_CORE, setlim, &value)) {
                *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-failed", true, "openfiles", setlim);
                goto out;
            }
#endif
        } else if (0 == strcmp(lim[0], "filesize")) {
#if HAVE_DECL_RLIMIT_FSIZE
            if (OPAL_SUCCESS != opal_setlimit(RLIMIT_FSIZE, setlim, &value)) {
                *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-failed", true, "filesize", setlim);
                goto out;
            }
            opal_sys_limits.file_size = value;
#endif
        } else if (0 == strcmp(lim[0], "maxmem")) {
#if HAVE_DECL_RLIMIT_AS
            if (OPAL_SUCCESS != opal_setlimit(RLIMIT_AS, setlim, &value)) {
                *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-failed", true, "maxmem", setlim);
                goto out;
            }
#endif
        } else if (0 == strcmp(lim[0], "openfiles")) {
#if HAVE_DECL_RLIMIT_NOFILE
            if (OPAL_SUCCESS != opal_setlimit(RLIMIT_NOFILE, setlim, &value)) {
                *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-failed", true, "openfiles", setlim);
                goto out;
            }
            opal_sys_limits.num_files = value;
#endif
        } else if (0 == strcmp(lim[0], "stacksize")) {
#if HAVE_DECL_RLIMIT_STACK
            if (OPAL_SUCCESS != opal_setlimit(RLIMIT_STACK, setlim, &value)) {
                *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-failed", true, "stacksize", setlim);
                goto out;
            }
#endif
        } else if (0 == strcmp(lim[0], "maxchildren")) {
#if HAVE_DECL_RLIMIT_NPROC
            if (OPAL_SUCCESS != opal_setlimit(RLIMIT_NPROC, setlim, &value)) {
                *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-failed", true, "maxchildren", setlim);
                goto out;
            }
            opal_sys_limits.num_procs = value;
#endif
        } else {
            *errmsg = opal_show_help_string("help-opal-util.txt", "sys-limit-unrecognized", true, lim[0], setlim);
            goto out;
        }
        opal_argv_free(lim);
        lim = NULL;
    }

    /* indicate we initialized the limits structure */
    opal_sys_limits.initialized = true;

    rc = OPAL_SUCCESS;

out:
    opal_argv_free(lims);
    if (NULL != lim) {
        opal_argv_free(lim);
    }

    return rc;
}
Exemple #16
0
int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
                                                        orte_proc_t *proc,
                                                        opal_list_t *local_snapshots)
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_list_item_t *item = NULL;
    orte_std_cntr_t i_app;
    int argc = 0;
    orte_app_context_t *cur_app_context = NULL;
    orte_app_context_t *new_app_context = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    char *reference_fmt_str = NULL;
    char *location_str = NULL;
    char *cache_location_str = NULL;
    char *ref_location_fmt_str = NULL;
    char *tmp_str = NULL;
    char *global_snapshot_ref = NULL;
    char *global_snapshot_seq = NULL;

    /*
     * Get the snapshot restart command for this process
     * JJH CLEANUP: Pass in the vpid_snapshot, so we don't have to look it up every time?
     */
    for(item  = opal_list_get_first(local_snapshots);
        item != opal_list_get_end(local_snapshots);
        item  = opal_list_get_next(item) ) {
        vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;
        if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                       &vpid_snapshot->process_name,
                                                       &proc->name) ) {
            break;
        }
        else {
            vpid_snapshot = NULL;
        }
    }

    if( NULL == vpid_snapshot ) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
                         &reference_fmt_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_LOC,
                         &location_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
                         &ref_location_fmt_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_REF,
                         &global_snapshot_ref);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_SEQ,
                         &global_snapshot_seq);

    /*
     * Find current app_context
     */
    cur_app_context = NULL;
    for(i_app = 0; i_app < opal_pointer_array_get_size(jobdata->apps); ++i_app) {
        cur_app_context = (orte_app_context_t *)opal_pointer_array_get_item(jobdata->apps,
                                                                            i_app);
        if( NULL == cur_app_context ) {
            continue;
        }
        if(proc->app_idx == cur_app_context->idx) {
            break;
        }
    }

    if( NULL == cur_app_context ) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * if > 1 processes in this app context
     *   Create a new app_context
     *   Copy over attributes
     *   Add it to the job_t data structure
     *   Associate it with this process in the job
     * else
     *   Reuse this app_context
     */
    if( cur_app_context->num_procs > 1 ) {
        /* Create a new app_context */
        new_app_context = OBJ_NEW(orte_app_context_t);

        /* Copy over attributes */
        new_app_context->idx                    = cur_app_context->idx;
        new_app_context->app                    = NULL; /* strdup(cur_app_context->app); */
        new_app_context->num_procs              = 1;
        new_app_context->argv                   = NULL; /* opal_argv_copy(cur_app_context->argv); */
        new_app_context->env                    = opal_argv_copy(cur_app_context->env);
        new_app_context->cwd                    = (NULL == cur_app_context->cwd ? NULL :
                                                   strdup(cur_app_context->cwd));
        new_app_context->user_specified_cwd     = cur_app_context->user_specified_cwd;
        new_app_context->hostfile               = (NULL == cur_app_context->hostfile ? NULL :
                                                   strdup(cur_app_context->hostfile));
        new_app_context->add_hostfile           = (NULL == cur_app_context->add_hostfile ? NULL :
                                                   strdup(cur_app_context->add_hostfile));
        new_app_context->dash_host              = opal_argv_copy(cur_app_context->dash_host);
        new_app_context->prefix_dir             = (NULL == cur_app_context->prefix_dir ? NULL :
                                                   strdup(cur_app_context->prefix_dir));
        new_app_context->preload_binary         = false;
        new_app_context->preload_libs           = false;
        new_app_context->preload_files_dest_dir = NULL;
        new_app_context->preload_files_src_dir  = NULL;

        asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
        asprintf(&(new_app_context->sstore_load),
                 "%s:%s:%s:%s:%s:%s",
                 location_str,
                 global_snapshot_ref,
                 tmp_str,
                 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                 global_snapshot_seq);

        new_app_context->used_on_node           = cur_app_context->used_on_node;

        /* Add it to the job_t data structure */
        /*current_global_jobdata->num_apps++; */
        new_app_context->idx = (jobdata->num_apps);
        proc->app_idx = new_app_context->idx;

        opal_pointer_array_add(jobdata->apps, new_app_context);
        ++(jobdata->num_apps);

        /* Remove association with the old app_context */
        --(cur_app_context->num_procs);
    }
    else {
        new_app_context = cur_app_context;

        /* Cleanout old stuff */
        free(new_app_context->app);
        new_app_context->app = NULL;

        opal_argv_free(new_app_context->argv);
        new_app_context->argv = NULL;

        asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
        asprintf(&(new_app_context->sstore_load),
                 "%s:%s:%s:%s:%s:%s",
                 location_str,
                 global_snapshot_ref,
                 tmp_str,
                 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                 global_snapshot_seq);
    }

    /*
     * Update the app_context with the restart informaiton
     */
    new_app_context->app = strdup("opal-restart");
    opal_argv_append(&argc, &(new_app_context->argv), new_app_context->app);
    opal_argv_append(&argc, &(new_app_context->argv), "-l");
    opal_argv_append(&argc, &(new_app_context->argv), location_str);
    opal_argv_append(&argc, &(new_app_context->argv), "-m");
    opal_argv_append(&argc, &(new_app_context->argv), orte_sstore_base_local_metadata_filename);
    opal_argv_append(&argc, &(new_app_context->argv), "-r");
    if( NULL != tmp_str ) {
        free(tmp_str);
        tmp_str = NULL;
    }
    asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
    opal_argv_append(&argc, &(new_app_context->argv), tmp_str);

 cleanup:
    if( NULL != tmp_str) {
        free(tmp_str);
        tmp_str = NULL;
    }
    if( NULL != location_str ) {
        free(location_str);
        location_str = NULL;
    }
    if( NULL != cache_location_str ) {
        free(cache_location_str);
        cache_location_str = NULL;
    }
    if( NULL != reference_fmt_str ) {
        free(reference_fmt_str);
        reference_fmt_str = NULL;
    }
    if( NULL != ref_location_fmt_str ) {
        free(ref_location_fmt_str);
        ref_location_fmt_str = NULL;
    }

    return exit_status;
}
Exemple #17
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char **hosts = NULL;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* Start by getting a unique name */
    tm_set_name();

    /* if I am a daemon, complete my setup using the
     * default procedure
     */
    if (ORTE_PROC_IS_DAEMON) {
        if (NULL != orte_node_regex) {
            /* extract the nodes */
            if (ORTE_SUCCESS != (ret =
                orte_regex_extract_node_names(orte_node_regex, &hosts)) ||
                NULL == hosts) {
                error = "orte_regex_extract_node_names";
                goto error;
            }
        }
        if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_orted_setup";
            goto error;
        }
        opal_argv_free(hosts);
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_IS_TOOL) {
        /* otherwise, if I am a tool proc, use that procedure */
        if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_tool_setup";
            goto error;
        }
        return ORTE_SUCCESS;

    }

    /* no other options are supported! */
    error = "ess_error";
    ret = ORTE_ERROR;

  error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
Exemple #18
0
/* we can only enter this routine if no other allocation
 * was found, so we only need to know that finding any
 * relative node syntax should generate an immediate error
 */
int orte_util_add_dash_host_nodes(opal_list_t *nodes,
                                  bool *override_oversubscribed,
                                  char ** host_argv)
{
    opal_list_item_t* item;
    orte_std_cntr_t i, j, k;
    int rc;
    char **mapped_nodes = NULL, **mini_map;
    orte_node_t *node;

    /* Accumulate all of the host name mappings */
    for (j = 0; j < opal_argv_count(host_argv); ++j) {
        mini_map = opal_argv_split(host_argv[j], ',');
        
        if (mapped_nodes == NULL) {
            mapped_nodes = mini_map;
        } else {
            for (k = 0; NULL != mini_map[k]; ++k) {
                rc = opal_argv_append_nosize(&mapped_nodes, 
                                             mini_map[k]);
                if (OPAL_SUCCESS != rc) {
                    goto cleanup;
                }
            }
            opal_argv_free(mini_map);
        }
    }

    /* Did we find anything? If not, then do nothing */
    if (NULL == mapped_nodes) {
        return ORTE_SUCCESS;
    }
    
    /*  go through the names found and
       add them to the host list. If they're not unique, then
       bump the slots count for each duplicate */
    
    for (i = 0; NULL != mapped_nodes[i]; ++i) {
        /* if the specified node contains a relative node syntax,
         * this is an error
         */
        if ('+' == mapped_nodes[i][0]) {
            orte_show_help("help-dash-host.txt", "dash-host:relative-syntax",
                           true, mapped_nodes[i]);
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }
        
        /* see if the node is already on the list */
        for (item = opal_list_get_first(nodes); 
             item != opal_list_get_end(nodes);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*) item;
            if (0 == strcmp(node->name, mapped_nodes[i]) ||
               (0 == strcmp(node->name, orte_process_info.nodename) &&
               (0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
                ++node->slots;
                break;
            }
        }
        
        /* If we didn't find it, add it to the list */
        
        if (item == opal_list_get_end(nodes)) {
            node = OBJ_NEW(orte_node_t);
            if (NULL == node) {
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            /* check to see if this is a local name */
            if (0 == strcmp(mapped_nodes[i], "localhost") ||
                opal_ifislocal(mapped_nodes[i])) {
                /* it is local, so use the local nodename to avoid
                 * later confusion
                 */
                if (orte_show_resolved_nodenames &&
                    0 != strcmp(mapped_nodes[i], orte_process_info.nodename)) {
                    /* add to list of aliases for this node - only add if unique */
                    opal_argv_append_unique_nosize(&node->alias, mapped_nodes[i]);
                }
                node->name = strdup(orte_process_info.nodename);
            } else {
                /* not local - use the given name */
                node->name = strdup(mapped_nodes[i]);
            }
            node->state = ORTE_NODE_STATE_UP;
            node->slots_inuse = 0;
            node->slots_max = 0;
            node->slots = 1;
            /* indicate that ORTE should override any oversubscribed conditions
             * based on local hardware limits since the user (a) might not have
             * provided us any info on the #slots for a node, and (b) the user
             * might have been wrong! If we don't check the number of local physical
             * processors, then we could be too aggressive on our sched_yield setting
             * and cause performance problems.
             */
            *override_oversubscribed = true;
            opal_list_append(nodes, &node->super);
        }
    }
    rc = ORTE_SUCCESS;

cleanup:
    if (NULL != mapped_nodes) {
        opal_argv_free(mapped_nodes);
    }

    return rc;
}
static int mca_btl_tcp_component_register(void)
{
    char* message;

    /* register TCP component parameters */
    mca_btl_tcp_param_register_uint("links", NULL, 1, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_num_links);
    mca_btl_tcp_param_register_string("if_include", "Comma-delimited list of devices and/or CIDR notation of networks to use for MPI communication (e.g., \"eth0,192.168.0.0/16\").  Mutually exclusive with btl_tcp_if_exclude.", "", OPAL_INFO_LVL_1, &mca_btl_tcp_component.tcp_if_include);
    mca_btl_tcp_param_register_string("if_exclude", "Comma-delimited list of devices and/or CIDR notation of networks to NOT use for MPI communication -- all devices not matching these specifications will be used (e.g., \"eth0,192.168.0.0/16\").  If set to a non-default value, it is mutually exclusive with btl_tcp_if_include.", 
                                      "127.0.0.1/8,sppp",
                                      OPAL_INFO_LVL_1, &mca_btl_tcp_component.tcp_if_exclude);

    mca_btl_tcp_param_register_int ("free_list_num", NULL, 8, OPAL_INFO_LVL_5,  &mca_btl_tcp_component.tcp_free_list_num);
    mca_btl_tcp_param_register_int ("free_list_max", NULL, -1, OPAL_INFO_LVL_5,  &mca_btl_tcp_component.tcp_free_list_max);
    mca_btl_tcp_param_register_int ("free_list_inc", NULL, 32, OPAL_INFO_LVL_5,  &mca_btl_tcp_component.tcp_free_list_inc);
    mca_btl_tcp_param_register_int ("sndbuf", NULL, 128*1024, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_sndbuf);
    mca_btl_tcp_param_register_int ("rcvbuf", NULL, 128*1024, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_rcvbuf);
    mca_btl_tcp_param_register_int ("endpoint_cache",
        "The size of the internal cache for each TCP connection. This cache is"
        " used to reduce the number of syscalls, by replacing them with memcpy."
        " Every read will read the expected data plus the amount of the"
                                    " endpoint_cache", 30*1024, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_endpoint_cache);
    mca_btl_tcp_param_register_int ("use_nagle", "Whether to use Nagle's algorithm or not (using Nagle's algorithm may increase short message latency)", 0, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_not_use_nodelay);
    mca_btl_tcp_param_register_int( "port_min_v4", 
                                    "The minimum port where the TCP BTL will try to bind (default 1024)",
                                    1024, OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp_port_min);

    asprintf( &message, 
              "The number of ports where the TCP BTL will try to bind (default %d)."
              " This parameter together with the port min, define a range of ports"
              " where Open MPI will open sockets.",
              (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1 );
    mca_btl_tcp_param_register_int( "port_range_v4", message,
                                    (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1,
                                    OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp_port_range);
    free(message);
#if OPAL_WANT_IPV6
    mca_btl_tcp_param_register_int( "port_min_v6",
                                    "The minimum port where the TCP BTL will try to bind (default 1024)", 1024,
                                    OPAL_INFO_LVL_2, & mca_btl_tcp_component.tcp6_port_min );
    asprintf( &message, 
              "The number of ports where the TCP BTL will try to bind (default %d)."
              " This parameter together with the port min, define a range of ports"
              " where Open MPI will open sockets.",
              (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1 );
    mca_btl_tcp_param_register_int( "port_range_v6", message,
                                    (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1,
                                    OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp6_port_range );
    free(message);
#endif

    mca_btl_tcp_module.super.btl_exclusivity =  MCA_BTL_EXCLUSIVITY_LOW + 100;
    mca_btl_tcp_module.super.btl_eager_limit = 64*1024;
    mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024;
    mca_btl_tcp_module.super.btl_max_send_size = 128*1024;
    mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024;
    mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX;
    mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0;
    mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT |
                                       MCA_BTL_FLAGS_SEND_INPLACE |
                                       MCA_BTL_FLAGS_NEED_CSUM |
                                       MCA_BTL_FLAGS_NEED_ACK |
                                       MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
    mca_btl_tcp_module.super.btl_seg_size = sizeof (mca_btl_base_segment_t);
    mca_btl_tcp_module.super.btl_bandwidth = 100;
    mca_btl_tcp_module.super.btl_latency = 100;

    mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version,
                                &mca_btl_tcp_module.super);

    mca_btl_tcp_param_register_int ("disable_family", NULL, 0, OPAL_INFO_LVL_2,  &mca_btl_tcp_component.tcp_disable_family);

    /* Register a list of interfaces to use in sequence */
    mca_btl_tcp_param_register_string("if_seq", 
                                      "If specified, a comma-delimited list of TCP interfaces.  Interfaces will be assigned, one to each MPI process, in a round-robin fashion on each server.  For example, if the list is \"eth0,eth1\" and four MPI processes are run on a single server, then local ranks 0 and 2 will use eth0 and local ranks 1 and 3 will use eth1.", NULL, OPAL_INFO_LVL_9, &mca_btl_tcp_if_seq_string);

    mca_btl_tcp_component.tcp_if_seq = NULL;
    if (NULL != mca_btl_tcp_if_seq_string && '\0' != *mca_btl_tcp_if_seq_string) {
        char **argv = opal_argv_split(mca_btl_tcp_if_seq_string, ',');

        if (NULL != argv && '\0' != *(argv[0])) {
            int if_index, rc, count;
            ompi_node_rank_t node_rank;
            char name[256];

            node_rank = ompi_process_info.my_node_rank;

            /* Now that we've got that local rank, take the
               corresponding entry from the tcp_if_seq list (wrapping
               if necessary) */
            count = opal_argv_count(argv);
            mca_btl_tcp_component.tcp_if_seq = 
                strdup(argv[node_rank % count]);
            opal_argv_free(argv);

            /* Double check that the selected interface actually exists */
            for (if_index = opal_ifbegin(); if_index >= 0; 
                 if_index = opal_ifnext(if_index)){
                if (OPAL_SUCCESS != 
                    (rc = opal_ifindextoname(if_index, name, sizeof(name)))) {
                    return rc;
                }
                if (0 == strcmp(name, mca_btl_tcp_component.tcp_if_seq)) {
                    break;
                }
            }
            if (if_index < 0) {
                opal_show_help("help-mpi-btl-tcp.txt", 
                               "invalid if_inexclude",
                               true, "if_seq",
                               ompi_process_info.nodename,
                               mca_btl_tcp_component.tcp_if_seq,
                               "Interface does not exist");
                free(mca_btl_tcp_component.tcp_if_seq);
                mca_btl_tcp_component.tcp_if_seq = NULL;
            } else {
                BTL_VERBOSE(("Node rank %d using TCP interface %s",
                             node_rank, mca_btl_tcp_component.tcp_if_seq));
            }
        }
    }

    return mca_btl_tcp_component_verify();
}
/*
 * Register MCA parameters
 */
int ompi_btl_openib_connect_base_register(void)
{
    int i, j, save;
    char **temp = NULL, *string = NULL, *all_cpc_names = NULL;
    char *cpc_include = NULL, *cpc_exclude = NULL;

    /* Make an MCA parameter to select which connect module to use */
    for (i = 0; NULL != all[i]; ++i) {
        /* The CPC name "empty" is reserved for "fake" CPC modules */
        if (0 != strcmp(all[i]->cbc_name, "empty")) {
            opal_argv_append_nosize(&temp, all[i]->cbc_name);
        }
    }
    all_cpc_names = opal_argv_join(temp, ',');
    opal_argv_free(temp);
    asprintf(&string,
             "Method used to select OpenFabrics connections (valid values: %s)",
             all_cpc_names);

    mca_base_param_reg_string(&mca_btl_openib_component.super.btl_version,
                              "cpc_include", string, false, false, 
                              NULL, &cpc_include);
    free(string);

    asprintf(&string,
             "Method used to exclude OpenFabrics connections (valid values: %s)",
             all_cpc_names);

    mca_base_param_reg_string(&mca_btl_openib_component.super.btl_version,
                              "cpc_exclude", string, false, false, 
                              NULL, &cpc_exclude);
    free(string);

    /* Parse the if_[in|ex]clude paramters to come up with a list of
       CPCs that are available */
    available = calloc(1, sizeof(all));

    /* If we have an "include" list, then find all those CPCs and put
       them in available[] */
    if (NULL != cpc_include) {
        mca_btl_openib_component.cpc_explicitly_defined = true;
        temp = opal_argv_split(cpc_include, ',');
        for (save = j = 0; NULL != temp[j]; ++j) {
            for (i = 0; NULL != all[i]; ++i) {
                if (0 == strcmp(temp[j], all[i]->cbc_name)) { 
                    opal_output(-1, "include: saving %s", all[i]->cbc_name);
                    available[save++] = all[i];
                    ++num_available;
                    break;
                }
            }
            if (NULL == all[i]) {
                orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                               "cpc name not found", true,
                               "include", orte_process_info.nodename,
                               "include", cpc_include, temp[j], 
                               all_cpc_names);
                opal_argv_free(temp);
                free(all_cpc_names);
                return OMPI_ERR_NOT_FOUND;
            }
        }
        opal_argv_free(temp);
    }

    /* Otherwise, if we have an "exclude" list, take all the CPCs that
       are not in that list and put them in available[] */
    else if (NULL != cpc_exclude) {
        mca_btl_openib_component.cpc_explicitly_defined = true;
        temp = opal_argv_split(cpc_exclude, ',');
        /* First: error check -- ensure that all the names are valid */
        for (j = 0; NULL != temp[j]; ++j) {
            for (i = 0; NULL != all[i]; ++i) {
                if (0 == strcmp(temp[j], all[i]->cbc_name)) { 
                    break;
                }
            }
            if (NULL == all[i]) {
                orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                               "cpc name not found", true,
                               "exclude", orte_process_info.nodename,
                               "exclude", cpc_exclude, temp[j], 
                               all_cpc_names);
                opal_argv_free(temp);
                free(all_cpc_names);
                return OMPI_ERR_NOT_FOUND;
            }
        }

        /* Now do the exclude */
        for (save = i = 0; NULL != all[i]; ++i) {
            for (j = 0; NULL != temp[j]; ++j) {
                if (0 == strcmp(temp[j], all[i]->cbc_name)) {
                    break;
                }
            }
            if (NULL == temp[j]) {
                opal_output(-1, "exclude: saving %s", all[i]->cbc_name);
                available[save++] = all[i];
                ++num_available;
            }
        }
        opal_argv_free(temp);
    } 

    /* If there's no include/exclude list, copy all[] into available[] */
    else {
        opal_output(-1, "no include or exclude: saving all");
        memcpy(available, all, sizeof(all));
        num_available = (sizeof(all) / 
                         sizeof(ompi_btl_openib_connect_base_module_t *)) - 1;
    }

    /* Call the register function on all the CPCs so that they may
       setup any MCA params specific to the connection type */
    for (i = 0; NULL != available[i]; ++i) {
        if (NULL != available[i]->cbc_register) {
            available[i]->cbc_register();
        }
    }

    return OMPI_SUCCESS;
}
Exemple #21
0
/*
 * Scan all the files in a directory (or path) and invoke a callback
 * on each one.
 */
static int dlopen_foreachfile(const char *search_path,
                              int (*func)(const char *filename, void *data),
                              void *data)
{
    int ret;
    DIR *dp = NULL;
    char **dirs = NULL;
    char **good_files = NULL;

    dirs = opal_argv_split(search_path, OPAL_ENV_SEP);
    for (int i = 0; NULL != dirs && NULL != dirs[i]; ++i) {

        dp = opendir(dirs[i]);
        if (NULL == dp) {
            ret = OPAL_ERR_IN_ERRNO;
            goto error;
        }

        struct dirent *de;
        while (NULL != (de = readdir(dp))) {

            /* Make the absolute path name */
            char *abs_name = NULL;
            asprintf(&abs_name, "%s/%s", dirs[i], de->d_name);
            if (NULL == abs_name) {
                ret = OPAL_ERR_IN_ERRNO;
                goto error;
            }

            /* Stat the file */
            struct stat buf;
            if (stat(abs_name, &buf) < 0) {
                free(abs_name);
                ret = OPAL_ERR_IN_ERRNO;
                goto error;
            }

            /* Skip if not a file */
            if (!S_ISREG(buf.st_mode)) {
                free(abs_name);
                continue;
            }

            /* Find the suffix */
            char *ptr = strrchr(abs_name, '.');
            if (NULL != ptr) {

                /* Skip libtool files */
                if (strcmp(ptr, ".la") == 0 ||
                    strcmp(ptr, ".lo") == 0) {
                    continue;
                }

                *ptr = '\0';
            }

            /* Have we already found this file?  Or already found a
               file with the same basename (but different suffix)? */
            bool found = false;
            for (int j = 0; NULL != good_files &&
                     NULL != good_files[j]; ++j) {
                if (strcmp(good_files[j], abs_name) == 0) {
                    found = true;
                    break;
                }
            }

            if (!found) {
                opal_argv_append_nosize(&good_files, abs_name);
            }
            free(abs_name);
        }
        closedir(dp);
    }
    dp = NULL;

    /* Invoke the callback on all the found files */
    if (NULL != good_files) {
        for (int i = 0; NULL != good_files[i]; ++i) {
            ret = func(good_files[i], data);
            if (OPAL_SUCCESS != ret) {
                goto error;
            }
        }
    }

    ret = OPAL_SUCCESS;

 error:
    if (NULL != dp) {
        closedir(dp);
    }
    if (NULL != dirs) {
        opal_argv_free(dirs);
    }
    if (NULL != good_files) {
        opal_argv_free(good_files);
    }

    return ret;
}
Exemple #22
0
static int s1_init(void)
{
    PMI_BOOL initialized;
    int spawned;
    int rc, ret = OPAL_ERROR;
    int i, rank, lrank, nrank;
    char *pmix_id, tmp[64];
    opal_value_t kv;
    char *str;
    uint32_t ui32;
    opal_process_name_t ldr;
    char **localranks=NULL;

    if (PMI_SUCCESS != (rc = PMI_Initialized(&initialized))) {
        OPAL_PMI_ERROR(rc, "PMI_Initialized");
        return OPAL_ERROR;
    }

    if (PMI_TRUE != initialized && PMI_SUCCESS != (rc = PMI_Init(&spawned))) {
        OPAL_PMI_ERROR(rc, "PMI_Init");
        return OPAL_ERROR;
    }

    // setup hash table
    opal_pmix_base_hash_init();

    // Initialize space demands
    rc = PMI_KVS_Get_value_length_max(&pmix_vallen_max);
    if (PMI_SUCCESS != rc) {
        OPAL_PMI_ERROR(rc, "PMI_KVS_Get_value_length_max");
        goto err_exit;
    }
    pmix_vallen_threshold = pmix_vallen_max * 3;
    pmix_vallen_threshold >>= 2;

    rc = PMI_KVS_Get_name_length_max(&pmix_kvslen_max);
    if (PMI_SUCCESS != rc) {
        OPAL_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max");
        goto err_exit;
    }

    rc = PMI_KVS_Get_key_length_max(&pmix_keylen_max);
    if (PMI_SUCCESS != rc) {
        OPAL_PMI_ERROR(rc, "PMI_KVS_Get_key_length_max");
        goto err_exit;
    }

    // Initialize job environment information
    pmix_id = (char*)malloc(pmix_vallen_max);
    if (pmix_id == NULL) {
        ret = OPAL_ERR_OUT_OF_RESOURCE;
        goto err_exit;
    }
    /* Get domain id */
    if (PMI_SUCCESS != (rc = PMI_Get_kvs_domain_id(pmix_id, pmix_vallen_max))) {
        free(pmix_id);
        goto err_exit;
    }

    /* get our rank */
    ret = PMI_Get_rank(&rank);
    if( PMI_SUCCESS != ret ) {
        OPAL_PMI_ERROR(ret, "PMI_Get_rank");
        free(pmix_id);
        goto err_exit;
    }

    /* Slurm PMI provides the job id as an integer followed
     * by a '.', followed by essentially a stepid. The first integer
     * defines an overall job number. The second integer is the number of
     * individual jobs we have run within that allocation. */
    s1_pname.jobid = strtoul(pmix_id, &str, 10);
    s1_pname.jobid = (s1_pname.jobid << 16) & 0xffff0000;
    if (NULL != str) {
        ui32 = strtoul(str, NULL, 10);
        s1_pname.jobid |= (ui32 & 0x0000ffff);
    }
    ldr.jobid = s1_pname.jobid;
    s1_pname.vpid = rank;
    /* store our name in the opal_proc_t so that
     * debug messages will make sense - an upper
     * layer will eventually overwrite it, but that
     * won't do any harm */
    opal_proc_set_name(&s1_pname);
    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:s1: assigned tmp name",
                        OPAL_NAME_PRINT(s1_pname));

    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_JOBID);
    kv.type = OPAL_UINT32;
    kv.data.uint32 = s1_pname.jobid;
    if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
        OPAL_ERROR_LOG(ret);
        OBJ_DESTRUCT(&kv);
        goto err_exit;
    }
    OBJ_DESTRUCT(&kv);

    /* save it */
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_RANK);
    kv.type = OPAL_UINT32;
    kv.data.uint32 = rank;
    if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
        OPAL_ERROR_LOG(ret);
        OBJ_DESTRUCT(&kv);
        goto err_exit;
    }
    OBJ_DESTRUCT(&kv);

    pmix_kvs_name = (char*)malloc(pmix_kvslen_max);
    if (pmix_kvs_name == NULL) {
        ret = OPAL_ERR_OUT_OF_RESOURCE;
        goto err_exit;
    }

    rc = PMI_KVS_Get_my_name(pmix_kvs_name, pmix_kvslen_max);
    if (PMI_SUCCESS != rc) {
        OPAL_PMI_ERROR(rc, "PMI_KVS_Get_my_name");
        goto err_exit;
    }

    /* get our local proc info to find our local rank */
    if (PMI_SUCCESS != (rc = PMI_Get_clique_size(&nlranks))) {
        OPAL_PMI_ERROR(rc, "PMI_Get_clique_size");
        return rc;
    }
    /* save the local size */
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_LOCAL_SIZE);
    kv.type = OPAL_UINT32;
    kv.data.uint32 = nlranks;
    if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
        OPAL_ERROR_LOG(ret);
        OBJ_DESTRUCT(&kv);
        goto err_exit;
    }
    OBJ_DESTRUCT(&kv);
    lrank = 0;
    nrank = 0;
    ldr.vpid = rank;
    if (0 < nlranks) {
        /* now get the specific ranks */
        lranks = (int*)calloc(nlranks, sizeof(int));
        if (NULL == lranks) {
            rc = OPAL_ERR_OUT_OF_RESOURCE;
            OPAL_ERROR_LOG(rc);
            return rc;
        }
        if (PMI_SUCCESS != (rc = PMI_Get_clique_ranks(lranks, nlranks))) {
            OPAL_PMI_ERROR(rc, "PMI_Get_clique_ranks");
            free(lranks);
            return rc;
        }
        /* note the local ldr */
        ldr.vpid = lranks[0];
        /* save this */
        memset(tmp, 0, 64);
        for (i=0; i < nlranks; i++) {
            (void)snprintf(tmp, 64, "%d", lranks[i]);
            opal_argv_append_nosize(&localranks, tmp);
            if (rank == lranks[i]) {
                lrank = i;
                nrank = i;
            }
        }
        str = opal_argv_join(localranks, ',');
        opal_argv_free(localranks);
        OBJ_CONSTRUCT(&kv, opal_value_t);
        kv.key = strdup(OPAL_PMIX_LOCAL_PEERS);
        kv.type = OPAL_STRING;
        kv.data.string = str;
        if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
            OPAL_ERROR_LOG(ret);
            OBJ_DESTRUCT(&kv);
            goto err_exit;
        }
        OBJ_DESTRUCT(&kv);
    }

    /* save the local leader */
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_LOCALLDR);
    kv.type = OPAL_UINT64;
    kv.data.uint64 = *(uint64_t*)&ldr;
    if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
        OPAL_ERROR_LOG(ret);
        OBJ_DESTRUCT(&kv);
        goto err_exit;
    }
    OBJ_DESTRUCT(&kv);
    /* save our local rank */
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_LOCAL_RANK);
    kv.type = OPAL_UINT16;
    kv.data.uint16 = lrank;
    if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
        OPAL_ERROR_LOG(ret);
        OBJ_DESTRUCT(&kv);
        goto err_exit;
    }
    OBJ_DESTRUCT(&kv);
    /* and our node rank */
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_NODE_RANK);
    kv.type = OPAL_UINT16;
    kv.data.uint16 = nrank;
    if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
        OPAL_ERROR_LOG(ret);
        OBJ_DESTRUCT(&kv);
        goto err_exit;
    }
    OBJ_DESTRUCT(&kv);

    /* get universe size */
    ret = PMI_Get_universe_size(&i);
    if (PMI_SUCCESS != ret) {
        OPAL_PMI_ERROR(ret, "PMI_Get_universe_size");
        goto err_exit;
    }
    /* push this into the dstore for subsequent fetches */
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_UNIV_SIZE);
    kv.type = OPAL_UINT32;
    kv.data.uint32 = i;
    if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
        OPAL_ERROR_LOG(ret);
        OBJ_DESTRUCT(&kv);
        goto err_exit;
    }
    OBJ_DESTRUCT(&kv);

    /* get job size */
    ret = PMI_Get_size(&i);
    if (PMI_SUCCESS != ret) {
        OPAL_PMI_ERROR(ret, "PMI_Get_size");
        goto err_exit;
    }
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_JOB_SIZE);
    kv.type = OPAL_UINT32;
    kv.data.uint32 = i;
    if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
        OPAL_ERROR_LOG(ret);
        OBJ_DESTRUCT(&kv);
        goto err_exit;
    }
    OBJ_DESTRUCT(&kv);

    /* get appnum */
    ret = PMI_Get_appnum(&i);
    if (PMI_SUCCESS != ret) {
        OPAL_PMI_ERROR(ret, "PMI_Get_appnum");
        goto err_exit;
    }
    OBJ_CONSTRUCT(&kv, opal_value_t);
    kv.key = strdup(OPAL_PMIX_APPNUM);
    kv.type = OPAL_UINT32;
    kv.data.uint32 = i;
    if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) {
        OPAL_ERROR_LOG(ret);
        OBJ_DESTRUCT(&kv);
        goto err_exit;
    }
    OBJ_DESTRUCT(&kv);

   /* increment the init count */
    ++pmix_init_count;

    return OPAL_SUCCESS;

 err_exit:
    PMI_Finalize();
    return ret;
}
/* process incoming messages in order of receipt */
void orte_plm_base_recv(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    orte_plm_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_jobid_t job;
    orte_job_t *jdata, *parent;
    opal_buffer_t *answer;
    orte_vpid_t vpid;
    orte_proc_t *proc;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    int32_t rc=ORTE_SUCCESS, ret;
    orte_app_context_t *app, *child_app;
    orte_process_name_t name;
    pid_t pid;
    bool running;
    int i;
    char **env;
    char *prefix_dir;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive processing msg",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
        
    switch (command) {
    case ORTE_PLM_LAUNCH_JOB_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive job launch command from %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender)));
                
        /* unpack the job object */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &count, ORTE_JOB))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }
            
        /* record the sender so we know who to respond to */
        jdata->originator.jobid = sender->jobid;
        jdata->originator.vpid = sender->vpid;

        /* get the parent's job object */
        if (NULL != (parent = orte_get_job_data_object(sender->jobid))) {
            /* if the prefix was set in the parent's job, we need to transfer
             * that prefix to the child's app_context so any further launch of
             * orteds can find the correct binary. There always has to be at
             * least one app_context in both parent and child, so we don't
             * need to check that here. However, be sure not to overwrite
             * the prefix if the user already provided it!
             */
            app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
            child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
            prefix_dir = NULL;
            if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) &&
                !orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) {
                orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING);
            }
            if (NULL != prefix_dir) {
                free(prefix_dir);
            }
        }
        
        /* if the user asked to forward any envars, cycle through the app contexts
         * in the comm_spawn request and add them
         */
        if (NULL != orte_forwarded_envars) {
            for (i=0; i < jdata->apps->size; i++) {
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                    continue;
                }
                env = opal_environ_merge(orte_forwarded_envars, app->env);
                opal_argv_free(app->env);
                app->env = env;
            }
        }

        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive adding hosts",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* process any add-hostfile and add-host options that were provided */
        if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        if (NULL != parent) {
            if (NULL == parent->bookmark) {
                /* find the sender's node in the job map */
                if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
                    /* set the bookmark so the child starts from that place - this means
                     * that the first child process could be co-located with the proc
                     * that called comm_spawn, assuming slots remain on that node. Otherwise,
                     * the procs will start on the next available node
                     */
                    jdata->bookmark = proc->node;
                }
            } else {
                jdata->bookmark = parent->bookmark;
            }
        }

        /* launch it */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive calling spawn",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }
        break;
    ANSWER_LAUNCH:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive - error on launch: %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));

        /* setup the response */
        answer = OBJ_NEW(opal_buffer_t);
        
        /* pack the error code to be returned */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
        }
                
        /* send the response back to the sender */
        if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_PLM_PROXY,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;
                
    case ORTE_PLM_UPDATE_PROC_STATE:
        opal_output_verbose(5, orte_plm_base_framework.framework_output,
                            "%s plm:base:receive update proc state command from %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(sender));
        count = 1;
        while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
                    
            opal_output_verbose(5, orte_plm_base_framework.framework_output,
                                "%s plm:base:receive got update_proc_state for job %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_JOBID_PRINT(job));
                    
            name.jobid = job;
            running = false;
            /* get the job object */
            jdata = orte_get_job_data_object(job);
            count = 1;
            while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID))) {
                if (ORTE_VPID_INVALID == vpid) {
                    /* flag indicates that this job is complete - move on */
                    break;
                }
                name.vpid = vpid;
                /* unpack the pid */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &count, OPAL_PID))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                /* unpack the state */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &count, ORTE_PROC_STATE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                if (ORTE_PROC_STATE_RUNNING == state) {
                    running = true;
                }
                /* unpack the exit code */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                        
                OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                                     "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code));

                if (NULL != jdata) {
                    /* get the proc data object */
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    }
                    /* NEVER update the proc state before activating the state machine - let
                     * the state cbfunc update it as it may need to compare this
                     * state against the prior proc state */
                    proc->pid = pid;
                    proc->exit_code = exit_code;
                    ORTE_ACTIVATE_PROC_STATE(&name, state);
                }
            }
            /* record that we heard back from a daemon during app launch */
            if (running && NULL != jdata) {
                jdata->num_daemons_reported++;
                if (orte_report_launch_progress) {
                    if (0 == jdata->num_daemons_reported % 100 ||
                        jdata->num_daemons_reported == orte_process_info.num_procs) {
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
                    }
                }
            }
            /* prepare for next job */
            count = 1;
        }
        if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        break;
                
    case ORTE_PLM_REGISTERED_CMD:
        count=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto DEPART;
        }
        name.jobid = job;
        /* get the job object */
        if (NULL == (jdata = orte_get_job_data_object(job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto DEPART;
        }
        count=1;
        while (ORTE_SUCCESS == opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID)) {
            name.vpid = vpid;
            ORTE_ACTIVATE_PROC_STATE(&name, ORTE_PROC_STATE_REGISTERED);
            count=1;
        }
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        break;
    }
        
 CLEANUP:
    if (ORTE_SUCCESS != rc) {
        goto DEPART;
    }
        
 DEPART:
    /* see if an error occurred - if so, wakeup the HNP so we can exit */
    if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
        jdata = NULL;
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive done processing commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
Exemple #24
0
static int orte_rds_hostfile_parse_line(int token, opal_list_t* existing, opal_list_t* updates)
{
    char buff[64];
    int rc;
    orte_ras_node_t* node;
    bool update = false;
    bool got_count = false;
    bool got_max = false;

    if (ORTE_RDS_HOSTFILE_STRING == token ||
        ORTE_RDS_HOSTFILE_HOSTNAME == token ||
        ORTE_RDS_HOSTFILE_INT == token ||
        ORTE_RDS_HOSTFILE_IPV4 == token) {
        char* value;
        char** argv;
        char* node_name = NULL;
        char* username = NULL;
        int cnt;

        if(ORTE_RDS_HOSTFILE_INT == token) {
            sprintf(buff,"%d", orte_rds_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_rds_hostfile_value.sval;
        }
        argv = opal_argv_split (value, '@');

        cnt = opal_argv_count (argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = strdup(argv[0]);
            node_name = strdup(argv[1]);
        } else {
            opal_output(0, "WARNING: Unhandeled user@host-combination\n"); /* XXX */
        }
        opal_argv_free (argv);

        /* convert this into something globally unique */
        if(strcmp(node_name, "localhost") == 0) {
            /* Nodename has been allocated, that is for sure */
            free (node_name);
            node_name = strdup(orte_system_info.nodename);
        }

        /* Do we need to make a new node object?  First check to see
           if it's in the existing list. */

        if (NULL == (node = orte_rds_hostfile_lookup(existing, node_name))) {

            /* If it wasn't, see if it's already in the updates list */

            if (NULL == (node = orte_rds_hostfile_lookup(updates,
                                                         node_name))) {
                node = OBJ_NEW(orte_ras_node_t);
                node->node_name = node_name;
                node->node_username = username;
                node->node_slots = 0;

#if 0
                /* get a new cellid for this node */
                /* JMS Temporarily turned off until cell IDs are
                   properly handled elsewhere in the code */
                /* JJH This assumes that each hostname listed should be
                   placed in a new cell. Is this accurate to the design?
                */
                if (ORTE_SUCCESS !=
                    (rc = orte_ns.create_cellid(&(node->node_cellid),
                                                "UNKNOWN-SITE",
                                                node->node_name))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
#endif
            }

            /* Note that we need to set update to true regardless of
               whether the node was found on the updates list or not.
               If it was found, we just removed it (in
               orte_rds_hostfile_lookup()), so the update puts it back
               (potentially after updating it, of course).  If it was
               not found, then we have a new node instance that needs
               to be added to the updates list. */

            update = true;
        }
        else {
            /* If it was in the existing list, then we can use its cellid
             * to add the reset of the hosts in the file to. */
            local_cellid = node->node_cellid;
            need_cellid = false;
        }
    } else {
        orte_rds_hostfile_parse_error(token);
        return ORTE_ERROR;
    }

    got_count = false;
    while (!orte_rds_hostfile_done) {
        token = orte_rds_hostfile_lex();

        switch (token) {
        case ORTE_RDS_HOSTFILE_DONE:
            goto done;

        case ORTE_RDS_HOSTFILE_NEWLINE:
            goto done;

        case ORTE_RDS_HOSTFILE_USERNAME:
            node->node_username = orte_rds_hostfile_parse_string();
            break;

        case ORTE_RDS_HOSTFILE_COUNT:
        case ORTE_RDS_HOSTFILE_CPU:
        case ORTE_RDS_HOSTFILE_SLOTS:
            rc = orte_rds_hostfile_parse_int();
            if (rc < 0) {
                opal_show_help("help-rds-hostfile.txt", "rds:slots",
                               true,
                               cur_hostfile_name, rc);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            node->node_slots += rc;
            update = true;
            got_count = true;

            /* Ensure that node_slots_max >= node_slots */
            if (node->node_slots_max != 0 && node->node_slots_max < node->node_slots) {
                node->node_slots_max = node->node_slots;
            }
            break;

        case ORTE_RDS_HOSTFILE_SLOTS_MAX:
            rc = orte_rds_hostfile_parse_int();
            if (rc < 0) {
                opal_show_help("help-rds-hostfile.txt", "rds:max_slots",
                               true,
                               cur_hostfile_name, ((size_t) rc));
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            /* Only take this update if it puts us >= node_slots */
            if (rc >= node->node_slots) {
                if (node->node_slots_max != rc) {
                    node->node_slots_max = rc;
                    update = true;
                    got_max = true;
                }
            } else {
                opal_show_help("help-rds-hostfile.txt", "rds:max_slots_lt",
                               true,
                               cur_hostfile_name, node->node_slots, rc);
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            break;

        default:
            orte_rds_hostfile_parse_error(token);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
    }

done:
    if (update) {
        if (!got_count) {
            if (got_max) {
                node->node_slots = node->node_slots_max;
            } else {
                ++node->node_slots;
            }
        }
        opal_list_append(updates, &node->super);
    } else {
        OBJ_RELEASE(node);
    }
    return ORTE_SUCCESS;
}
static void orte_job_destruct(orte_job_t* job)
{
    orte_proc_t *proc;
    orte_app_context_t *app;
    int n;
    orte_timer_t *evtimer;

    if (NULL == job) {
        /* probably just a race condition - just return */
        return;
    }

    if (orte_debug_flag) {
        opal_output(0, "%s Releasing job data for %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid));
    }

    if (NULL != job->personality) {
        opal_argv_free(job->personality);
    }
    for (n=0; n < job->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, n))) {
            continue;
        }
        OBJ_RELEASE(app);
    }
    OBJ_RELEASE(job->apps);

    /* release any pointers in the attributes */
    evtimer = NULL;
    if (orte_get_attribute(&job->attributes, ORTE_JOB_FAILURE_TIMER_EVENT,
                           (void**)&evtimer, OPAL_PTR)) {
        orte_remove_attribute(&job->attributes, ORTE_JOB_FAILURE_TIMER_EVENT);
        /* the timer is a pointer to orte_timer_t */
        OBJ_RELEASE(evtimer);
    }
    proc = NULL;
    if (orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC,
                           (void**)&proc, OPAL_PTR)) {
        orte_remove_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC);
        /* points to an orte_proc_t */
        OBJ_RELEASE(proc);
    }

    if (NULL != job->map) {
        OBJ_RELEASE(job->map);
        job->map = NULL;
    }

    for (n=0; n < job->procs->size; n++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(job->procs, n))) {
            continue;
        }
        OBJ_RELEASE(proc);
    }
    OBJ_RELEASE(job->procs);

    /* release the attributes */
    OPAL_LIST_DESTRUCT(&job->attributes);

    if (NULL != orte_job_data && ORTE_JOBID_INVALID != job->jobid) {
        /* remove the job from the global array */
        opal_hash_table_remove_value_uint32(orte_job_data, job->jobid);
    }
}
Exemple #26
0
static int parse_env(char **personality,
                     char *path,
                     opal_cmd_line_t *cmd_line,
                     char **srcenv,
                     char ***dstenv)
{
    int i, j;
    char *param;
    char *value;
    char *env_set_flag;
    char **vars;
    bool takeus = false;

    /* see if we are included */
    for (i=0; NULL != personality[i]; i++) {
        if (0 == strcmp(personality[i], "ompi")) {
            takeus = true;
            break;
        }
    }
    if (!takeus) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    for (i = 0; NULL != srcenv[i]; ++i) {
        if (0 == strncmp("OMPI_", srcenv[i], 5)) {
            /* check for duplicate in app->env - this
             * would have been placed there by the
             * cmd line processor. By convention, we
             * always let the cmd line override the
             * environment
             */
            param = strdup(srcenv[i]);
            value = strchr(param, '=');
            *value = '\0';
            value++;
            opal_setenv(param, value, false, dstenv);
            free(param);
        }
    }

    /* set necessary env variables for external usage from tune conf file*/
    int set_from_file = 0;
    vars = NULL;
    if (OPAL_SUCCESS == mca_base_var_process_env_list_from_file(&vars) &&
            NULL != vars) {
        for (i=0; NULL != vars[i]; i++) {
            value = strchr(vars[i], '=');
            /* terminate the name of the param */
            *value = '\0';
            /* step over the equals */
            value++;
            /* overwrite any prior entry */
            opal_setenv(vars[i], value, true, dstenv);
            /* save it for any comm_spawn'd apps */
            opal_setenv(vars[i], value, true, &orte_forwarded_envars);
        }
        set_from_file = 1;
        opal_argv_free(vars);
    }
    /* Did the user request to export any environment variables on the cmd line? */
    env_set_flag = getenv("OMPI_MCA_mca_base_env_list");
    if (opal_cmd_line_is_taken(cmd_line, "x")) {
        if (NULL != env_set_flag) {
            orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false);
            return ORTE_ERR_FATAL;
        }
        j = opal_cmd_line_get_ninsts(cmd_line, "x");
        for (i = 0; i < j; ++i) {
            param = opal_cmd_line_get_param(cmd_line, "x", i, 0);

            if (NULL != (value = strchr(param, '='))) {
                /* terminate the name of the param */
                *value = '\0';
                /* step over the equals */
                value++;
                /* overwrite any prior entry */
                opal_setenv(param, value, true, dstenv);
                /* save it for any comm_spawn'd apps */
                opal_setenv(param, value, true, &orte_forwarded_envars);
            } else {
                value = getenv(param);
                if (NULL != value) {
                    /* overwrite any prior entry */
                    opal_setenv(param, value, true, dstenv);
                    /* save it for any comm_spawn'd apps */
                    opal_setenv(param, value, true, &orte_forwarded_envars);
                } else {
                    opal_output(0, "Warning: could not find environment variable \"%s\"\n", param);
                }
            }
        }
    } else if (NULL != env_set_flag) {
        /* if mca_base_env_list was set, check if some of env vars were set via -x from a conf file.
         * If this is the case, error out.
         */
        if (!set_from_file) {
            /* set necessary env variables for external usage */
            vars = NULL;
            if (OPAL_SUCCESS == mca_base_var_process_env_list(&vars) &&
                    NULL != vars) {
                for (i=0; NULL != vars[i]; i++) {
                    value = strchr(vars[i], '=');
                    /* terminate the name of the param */
                    *value = '\0';
                    /* step over the equals */
                    value++;
                    /* overwrite any prior entry */
                    opal_setenv(vars[i], value, true, dstenv);
                    /* save it for any comm_spawn'd apps */
                    opal_setenv(vars[i], value, true, &orte_forwarded_envars);
                }
                opal_argv_free(vars);
            }
        } else {
            orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false);
            return ORTE_ERR_FATAL;
        }
    }

    /* If the user specified --path, store it in the user's app
       environment via the OMPI_exec_path variable. */
    if (NULL != path) {
        asprintf(&value, "OMPI_exec_path=%s", path);
        opal_argv_append_nosize(dstenv, value);
        /* save it for any comm_spawn'd apps */
        opal_argv_append_nosize(&orte_forwarded_envars, value);
        free(value);
    }

    return ORTE_SUCCESS;
}
Exemple #27
0
int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *policy, char *spec)
{
    int i;
    opal_binding_policy_t tmp;
    char **tmpvals, **quals;
                
    /* set default */
    tmp = 0;

    /* binding specification */
    if (NULL == spec) {
        /* default to bind-to core, and that no binding policy was specified */
        OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
        tmp &= ~OPAL_BIND_GIVEN;
    } else if (0 == strncasecmp(spec, "none", strlen("none"))) {
        OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NONE);
    } else {
        tmpvals = opal_argv_split(spec, ':');
        if (1 < opal_argv_count(tmpvals) || ':' == spec[0]) {
            if (':' == spec[0]) {
                quals = opal_argv_split(&spec[1], ',');
            } else {
                quals = opal_argv_split(tmpvals[1], ',');
            }
            for (i=0; NULL != quals[i]; i++) {
                if (0 == strncasecmp(quals[i], "if-supported", strlen(quals[i]))) {
                    tmp |= OPAL_BIND_IF_SUPPORTED;
                } else if (0 == strncasecmp(quals[i], "overload-allowed", strlen(quals[i])) ||
                           0 == strncasecmp(quals[i], "oversubscribe-allowed", strlen(quals[i]))) {
                    tmp |= OPAL_BIND_ALLOW_OVERLOAD;
                } else {
                    /* unknown option */
                    opal_output(0, "Unknown qualifier to binding policy: %s", spec);
                    return OPAL_ERR_BAD_PARAM;
                }
            }
            opal_argv_free(quals);
        }
        if (NULL == tmpvals[0] || ':' == spec[0]) {
            OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
            tmp &= ~OPAL_BIND_GIVEN;
        } else {
            if (0 == strcasecmp(tmpvals[0], "hwthread")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_HWTHREAD);
            } else if (0 == strcasecmp(tmpvals[0], "core")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
            } else if (0 == strcasecmp(tmpvals[0], "l1cache")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L1CACHE);
            } else if (0 == strcasecmp(tmpvals[0], "l2cache")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L2CACHE);
            } else if (0 == strcasecmp(tmpvals[0], "l3cache")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L3CACHE);
            } else if (0 == strcasecmp(tmpvals[0], "socket")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_SOCKET);
            } else if (0 == strcasecmp(tmpvals[0], "numa")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NUMA);
            } else if (0 == strcasecmp(tmpvals[0], "board")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_BOARD);
            } else {
                opal_show_help("help-opal-hwloc-base.txt", "invalid binding_policy", true, "binding", spec);
                opal_argv_free(tmpvals);
                return OPAL_ERR_BAD_PARAM;
            }
        }
        opal_argv_free(tmpvals);
    }

    *policy = tmp;
    return OPAL_SUCCESS;
}
Exemple #28
0
static int setup_fork(orte_job_t *jdata,
                      orte_app_context_t *app)
{
    int i;
    char *param;
    bool oversubscribed;
    orte_node_t *node;
    char **envcpy, **nps, **firstranks;
    char *npstring, *firstrankstring;
    char *num_app_ctx;
    bool takeus = false;

    /* see if we are included */
    for (i=0; NULL != jdata->personality[i]; i++) {
        if (0 == strcmp(jdata->personality[i], "ompi")) {
            takeus = true;
            break;
        }
    }
    if (!takeus) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    /* see if the mapper thinks we are oversubscribed */
    oversubscribed = false;
    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_OVERSUBSCRIBED)) {
        oversubscribed = true;
    }

    /* setup base environment: copy the current environ and merge
       in the app context environ */
    if (NULL != app->env) {
        /* manually free original context->env to avoid a memory leak */
        char **tmp = app->env;
        envcpy = opal_environ_merge(orte_launch_environ, app->env);
        if (NULL != tmp) {
            opal_argv_free(tmp);
        }
    } else {
        envcpy = opal_argv_copy(orte_launch_environ);
    }
    app->env = envcpy;

    /* special case handling for --prefix: this is somewhat icky,
       but at least some users do this.  :-\ It is possible that
       when using --prefix, the user will also "-x PATH" and/or
       "-x LD_LIBRARY_PATH", which would therefore clobber the
       work that was done in the prior pls to ensure that we have
       the prefix at the beginning of the PATH and
       LD_LIBRARY_PATH.  So examine the context->env and see if we
       find PATH or LD_LIBRARY_PATH.  If found, that means the
       prior work was clobbered, and we need to re-prefix those
       variables. */
    param = NULL;
    orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&param, OPAL_STRING);
    for (i = 0; NULL != param && NULL != app->env && NULL != app->env[i]; ++i) {
        char *newenv;

        /* Reset PATH */
        if (0 == strncmp("PATH=", app->env[i], 5)) {
            asprintf(&newenv, "%s/bin:%s", param, app->env[i] + 5);
            opal_setenv("PATH", newenv, true, &app->env);
            free(newenv);
        }

        /* Reset LD_LIBRARY_PATH */
        else if (0 == strncmp("LD_LIBRARY_PATH=", app->env[i], 16)) {
            asprintf(&newenv, "%s/lib:%s", param, app->env[i] + 16);
            opal_setenv("LD_LIBRARY_PATH", newenv, true, &app->env);
            free(newenv);
        }
    }
    if (NULL != param) {
        free(param);
    }

    /* pass my contact info to the local proc so we can talk */
    opal_setenv("OMPI_MCA_orte_local_daemon_uri", orte_process_info.my_daemon_uri, true, &app->env);

    /* pass the hnp's contact info to the local proc in case it
     * needs it
     */
    if (NULL != orte_process_info.my_hnp_uri) {
        opal_setenv("OMPI_MCA_orte_hnp_uri", orte_process_info.my_hnp_uri, true, &app->env);
    }

    /* setup yield schedule - do not override any user-supplied directive! */
    if (oversubscribed) {
        opal_setenv("OMPI_MCA_mpi_yield_when_idle", "1", false, &app->env);
    } else {
        opal_setenv("OMPI_MCA_mpi_yield_when_idle", "0", false, &app->env);
    }

    /* set the app_context number into the environment */
    asprintf(&param, "%ld", (long)app->idx);
    opal_setenv("OMPI_MCA_orte_app_num", param, true, &app->env);
    free(param);

    /* although the total_slots_alloc is the universe size, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here. Also required by the ompi_attributes code!
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    asprintf(&param, "%ld", (long)jdata->total_slots_alloc);
    opal_setenv("OMPI_UNIVERSE_SIZE", param, true, &app->env);
    free(param);

    /* pass the number of nodes involved in this job */
    asprintf(&param, "%ld", (long)(jdata->map->num_nodes));
    opal_setenv("OMPI_MCA_orte_num_nodes", param, true, &app->env);
    free(param);

    /* pass a param telling the child what type and model of cpu we are on,
     * if we know it. If hwloc has the value, use what it knows. Otherwise,
     * see if we were explicitly given it and use that value.
     */
    hwloc_obj_t obj;
    char *htmp;
    if (NULL != opal_hwloc_topology) {
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUType")) ||
            NULL != (htmp = orte_local_cpu_type)) {
            opal_setenv("OMPI_MCA_orte_cpu_type", htmp, true, &app->env);
        }
        if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUModel")) ||
            NULL != (htmp = orte_local_cpu_model)) {
            opal_setenv("OMPI_MCA_orte_cpu_model", htmp, true, &app->env);
        }
    } else {
        if (NULL != orte_local_cpu_type) {
            opal_setenv("OMPI_MCA_orte_cpu_type", orte_local_cpu_type, true, &app->env);
        }
        if (NULL != orte_local_cpu_model) {
            opal_setenv("OMPI_MCA_orte_cpu_model", orte_local_cpu_model, true, &app->env);
        }
    }

    /* get shmem's best component name so we can provide a hint to the shmem
     * framework. the idea here is to have someone figure out what component to
     * select (via the shmem framework) and then have the rest of the
     * components in shmem obey that decision. for more details take a look at
     * the shmem framework in opal.
     */
    if (NULL != (param = opal_shmem_base_best_runnable_component_name())) {
        opal_setenv("OMPI_MCA_shmem_RUNTIME_QUERY_hint", param, true, &app->env);
        free(param);
    }

    /* Set an info MCA param that tells the launched processes that
     * any binding policy was applied by us (e.g., so that
     * MPI_INIT doesn't try to bind itself)
     */
    opal_setenv("OMPI_MCA_orte_bound_at_launch", "1", true, &app->env);

    /* tell the ESS to avoid the singleton component - but don't override
     * anything that may have been provided elsewhere
     */
    opal_setenv("OMPI_MCA_ess", "^singleton", false, &app->env);

    /* ensure that the spawned process ignores direct launch components,
     * but do not overrride anything we were given */
    opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray", false, &app->env);

    /* since we want to pass the name as separate components, make sure
     * that the "name" environmental variable is cleared!
     */
    opal_unsetenv("OMPI_MCA_orte_ess_name", &app->env);

    asprintf(&param, "%ld", (long)jdata->num_procs);
    opal_setenv("OMPI_MCA_orte_ess_num_procs", param, true, &app->env);

    /* although the num_procs is the comm_world size, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    opal_setenv("OMPI_COMM_WORLD_SIZE", param, true, &app->env);
    free(param);

    /* users would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    asprintf(&param, "%ld", (long)jdata->num_local_procs);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_SIZE", param, true, &app->env);
    free(param);

    /* forcibly set the local tmpdir base to match ours */
    opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env);

    /* MPI-3 requires we provide some further info to the procs,
     * so we pass them as envars to avoid introducing further
     * ORTE calls in the MPI layer
     */
    asprintf(&num_app_ctx, "%lu", (unsigned long)jdata->num_apps);

    /* build some common envars we need to pass for MPI-3 compatibility */
    nps = NULL;
    firstranks = NULL;
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        opal_argv_append_nosize(&nps, ORTE_VPID_PRINT(app->num_procs));
        opal_argv_append_nosize(&firstranks, ORTE_VPID_PRINT(app->first_rank));
    }
    npstring = opal_argv_join(nps, ' ');
    firstrankstring = opal_argv_join(firstranks, ' ');
    opal_argv_free(nps);
    opal_argv_free(firstranks);

    /* add the MPI-3 envars */
    opal_setenv("OMPI_NUM_APP_CTX", num_app_ctx, true, &app->env);
    opal_setenv("OMPI_FIRST_RANKS", firstrankstring, true, &app->env);
    opal_setenv("OMPI_APP_CTX_NUM_PROCS", npstring, true, &app->env);
    free(num_app_ctx);
    free(firstrankstring);
    free(npstring);
    return ORTE_SUCCESS;
}
Exemple #29
0
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char *nodelist_flat;
    char **nodelist_argv;
    int nodelist_argc;
    char *vpid_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    int proc_vpid_index;
    orte_app_context_t *app;
    orte_node_t *node;
    orte_std_cntr_t nnode;
    orte_job_t *daemons;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

   /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }

    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:alps: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }
    
    /* need integer value for command line parameter */
    orte_util_convert_jobid_to_string(&jobid_string, daemons->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * ALPS aprun  OPTIONS
     */

    /* add the aprun command */
    opal_argv_append(&argc, &argv, mca_plm_alps_component.aprun_cmd);

    /* Append user defined arguments to aprun */
    if ( NULL != mca_plm_alps_component.custom_args ) {
        custom_strings = opal_argv_split(mca_plm_alps_component.custom_args, ' ');
        num_args       = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* number of processors needed */
    opal_argv_append(&argc, &argv, "-n");
    asprintf(&tmp, "%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);
    opal_argv_append(&argc, &argv, "-N");
    opal_argv_append(&argc, &argv, "1");
    opal_argv_append(&argc, &argv, "-cc");
    opal_argv_append(&argc, &argv, "none");

    /* create nodelist */
    nodelist_argv = NULL;
    nodelist_argc = 0;

    for (nnode=0; nnode < map->nodes->size; nnode++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
            continue;
        }

        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
            continue;
        }
        
        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append(&nodelist_argc, &nodelist_argv, node->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-alps.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);

    /* if we are using all allocated nodes, then alps
     * doesn't need a nodelist, or if running without a batch scheduler
     */
    if ((map->num_new_daemons < orte_num_allocated_nodes) || (orte_num_allocated_nodes == 0)) {
        opal_argv_append(&argc, &argv, "-L");
        opal_argv_append(&argc, &argv, nodelist_flat);
    }


    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);
    
    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv,
                                          NULL,
                                          &proc_vpid_index,
                                          nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&vpid_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_alps: unable to create process name");
        goto cleanup;
    }

    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(vpid_string);
    free(vpid_string);

    if (mca_plm_alps_component.debug) {
        param = opal_argv_join(argv, ' ');
        if (NULL != param) {
            opal_output(0, "plm:alps: final top-level argv:");
            opal_output(0, "plm:alps:     %s", param);
            free(param);
        }
    }

    /* Copy the prefix-directory specified in the
       corresponding app_context.  If there are multiple,
       different prefix's in the app context, complain (i.e., only
       allow one --prefix option for the entire alps run -- we
       don't support different --prefix'es for different nodes in
       the ALPS plm) */
    cur_prefix = NULL;
    for (i=0; i < state->jdata->apps->size; i++) {
        char *app_prefix_dir = NULL;
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, i))) {
            continue;
        }
        orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&app_prefix_dir, OPAL_STRING);
        /* Check for already set cur_prefix_dir -- if different,
           complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp (cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-alps.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                goto cleanup;
            }

            /* If not yet set, copy it; iff set, then it's the
               same anyway */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                if (mca_plm_alps_component.debug) {
                    opal_output (0, "plm:alps: Set prefix:%s",
                                 cur_prefix);
                }
            }
            free(app_prefix_dir);
        }
    }

    /* protect the args in case someone has a script wrapper around aprun */
    mca_base_cmd_line_wrap_args(argv);

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);
    
    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:alps: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }
        
    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_alps_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    
    if(NULL != jobid_string) {
        free(jobid_string);
    }
    
    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
Exemple #30
0
/* the -host option can always be used in both absolute
 * and relative mode, so we have to check for pre-existing
 * allocations if we are to use relative node syntax
 */
int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
                                     char** host_argv)
{
    opal_list_item_t* item;
    bool found;
    opal_list_item_t *next;
    orte_std_cntr_t i, j, k, len_mapped_node=0;
    int rc;
    char **mapped_nodes = NULL, **mini_map, *cptr;
    orte_node_t *node, **nodepool;
    int nodeidx;
    int num_empty=0;
    opal_list_t keep;
    bool want_all_empty = false;
    
    /* if the incoming node list is empty, then there
     * is nothing to filter!
     */
    if (opal_list_is_empty(nodes)) {
        return ORTE_SUCCESS;
    }

    /* setup for relative node syntax */
    nodepool = (orte_node_t**)orte_node_pool->addr;
    
    /* Accumulate all of the host name mappings */
    for (j = 0; j < opal_argv_count(host_argv); ++j) {
        mini_map = opal_argv_split(host_argv[j], ',');
        
        for (k = 0; NULL != mini_map[k]; ++k) {
            if ('+' == mini_map[k][0]) {
                /* see if we specified empty nodes */
                if ('e' == mini_map[k][1] ||
                    'E' == mini_map[k][1]) {
                    /* request for empty nodes - do they want
                     * all of them?
                     */
                    if (NULL != (cptr = strchr(mini_map[k], ':'))) {
                        /* the colon indicates a specific # are requested */
                        cptr++; /* step past : */
                        /* put a marker into the list */
                        cptr--;
                        *cptr = '*';
                        opal_argv_append_nosize(&mapped_nodes, cptr);
                    } else {
                        /* add a marker to the list */
                        opal_argv_append_nosize(&mapped_nodes, "*");
                        want_all_empty = true;
                    }
                } else if ('n' == mini_map[k][1] ||
                           'N' == mini_map[k][1]) {
                    /* they want a specific relative node #, so
                     * look it up on global pool
                     */
                    nodeidx = strtol(&mini_map[k][2], NULL, 10);
                    if (nodeidx < 0 ||
                        nodeidx > (int)orte_node_pool->size) {
                        /* this is an error */
                        orte_show_help("help-dash-host.txt", "dash-host:relative-node-out-of-bounds",
                                       true, nodeidx, mini_map[k]);
                        rc = ORTE_ERR_SILENT;
                        goto cleanup;
                    }
                    /* if the HNP is not allocated, then we need to
                     * adjust the index as the node pool is offset
                     * by one
                     */
                    if (!orte_hnp_is_allocated) {
                        nodeidx++;
                    }
                    /* see if that location is filled */
                    
                    if (NULL == nodepool[nodeidx]) {
                        /* this is an error */
                        orte_show_help("help-dash-host.txt", "dash-host:relative-node-not-found",
                                       true, nodeidx, mini_map[k]);
                        rc = ORTE_ERR_SILENT;
                        goto cleanup;
                    }
                    /* add this node to the list */
                    opal_argv_append_nosize(&mapped_nodes, nodepool[nodeidx]->name);
                } else {
                    /* invalid relative node syntax */
                    orte_show_help("help-dash-host.txt", "dash-host:invalid-relative-node-syntax",
                                   true, mini_map[k]);
                    rc = ORTE_ERR_SILENT;
                    goto cleanup;
                }
            } else { /* non-relative syntax - add to list */
                if (OPAL_SUCCESS != (rc = opal_argv_append_nosize(&mapped_nodes, 
                                                                  mini_map[k]))) {
                    goto cleanup;
                }
            }
        }
        opal_argv_free(mini_map);
    }
    
    /* Did we find anything? If not, then do nothing */
    if (NULL == mapped_nodes && 0 == num_empty) {
        return ORTE_SUCCESS;
    }
    
    /* we found some info - filter what is on the list...
     * i.e., go through the list and remove any nodes that
     * were -not- included on the -host list.
     *
     * NOTE: The following logic is based on knowing that
     * any node can only be included on the incoming
     * nodes list ONCE.
     */
    
    len_mapped_node = opal_argv_count(mapped_nodes);
    /* setup a working list so we can put the final list
     * of nodes in order. This way, if the user specifies a
     * set of nodes, we will use them in the order in which
     * they were specifed. Note that empty node requests
     * will always be appended to the end
     */
    OBJ_CONSTRUCT(&keep, opal_list_t);
    
    for (i = 0; i < len_mapped_node; ++i) {
        /* check if we are supposed to add some number of empty
         * nodes here
         */
        if ('*' == mapped_nodes[i][0]) {
            /* if there is a number after the '*', then we are
             * to insert a specific # of nodes
             */
            if ('\0' == mapped_nodes[i][1]) {
                /* take all empty nodes from the list */
                num_empty = INT_MAX;
            } else {
                /* extract number of nodes to take */
                num_empty = strtol(&mapped_nodes[i][1], NULL, 10);
            }
            /* search for empty nodes and take them */
            item = opal_list_get_first(nodes);
            while (0 < num_empty && item != opal_list_get_end(nodes)) {
                next = opal_list_get_next(item);  /* save this position */
                node = (orte_node_t*)item;
                /* see if this node is empty */
                if (0 == node->slots_inuse) {
                    /* check to see if it is specified later */
                    for (j=i+1; j < len_mapped_node; j++) {
                        if (0 == strcmp(mapped_nodes[j], node->name)) {
                            /* specified later - skip this one */
                            goto skipnode;
                        }
                    }
                    /* remove item from list */
                    opal_list_remove_item(nodes, item);
                    /* xfer to keep list */
                    opal_list_append(&keep, item);
                    --num_empty;
                }
            skipnode:
                item = next;
            }
        } else {
            /* we are looking for a specific node on the list
             * we have a match if one of two conditions is met:
             * 1. the node_name and mapped_nodes directly match
             * 2. the node_name is the local system name AND
             *    either the mapped_node is "localhost" OR it
             *    is a local interface as found by opal_ifislocal
             */
            item = opal_list_get_first(nodes);
            while (item != opal_list_get_end(nodes)) {
                next = opal_list_get_next(item);  /* save this position */
                node = (orte_node_t*)item;
                /* search -host list to see if this one is found */
                found = false;
                if ((0 == strcmp(node->name, mapped_nodes[i]) ||
                    (0 == strcmp(node->name, orte_process_info.nodename) &&
                    (0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i]))))) {
                    /* remove item from list */
                    opal_list_remove_item(nodes, item);
                    /* xfer to keep list */
                    opal_list_append(&keep, item);
                    break;
                }
                item = next;
            }
        }
        /* done with the mapped entry */
        free(mapped_nodes[i]);
        mapped_nodes[i] = NULL;
    }

    /* was something specified that was -not- found? */
    for (i=0; i < len_mapped_node; i++) {
        if (NULL != mapped_nodes[i]) {
            orte_show_help("help-dash-host.txt", "not-all-mapped-alloc",
                           true, mapped_nodes[i]);
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }
    }
    
    /* clear the rest of the nodes list */
    while (NULL != (item = opal_list_remove_first(nodes))) {
        OBJ_RELEASE(item);
    }
    
    /* the nodes list has been cleared - rebuild it in order */
    while (NULL != (item = opal_list_remove_first(&keep))) {
        opal_list_append(nodes, item);
    }
    
    /* did they ask for more than we could provide */
    if (!want_all_empty && 0 < num_empty) {
        orte_show_help("help-dash-host.txt", "dash-host:not-enough-empty",
                       true, num_empty);
        rc = ORTE_ERR_SILENT;
        goto cleanup;
    }
    
    rc = ORTE_SUCCESS;
    /* done filtering existing list */
    
cleanup:
    for (i=0; i < len_mapped_node; i++) {
        if (NULL != mapped_nodes[i]) {
            free(mapped_nodes[i]);
            mapped_nodes[i] = NULL;
        }
    }
    if (NULL != mapped_nodes) {
        free(mapped_nodes);
    }
    
    return rc;
}