/*
 * Query the registry for all nodes allocated to a specified app_context
 */
int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots,
                                     orte_app_context_t *app, orte_mapping_policy_t policy,
                                     bool initial_map, bool silent)
{
    opal_list_item_t *item, *next;
    orte_node_t *node, *nd, *nptr;
    orte_std_cntr_t num_slots;
    orte_std_cntr_t i;
    int rc;
    orte_job_t *daemons;
    bool novm;
    opal_list_t nodes;

    /** set default answer */
    *total_num_slots = 0;
    
    /* get the daemon job object */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    /* see is we have a vm or not */
    novm = ORTE_JOB_CONTROL_NO_VM & daemons->controls;

    /* if this is NOT a managed allocation, then we use the nodes
     * that were specified for this app - there is no need to collect
     * all available nodes and "filter" them
     */
    if (!orte_managed_allocation) {
        OBJ_CONSTRUCT(&nodes, opal_list_t);
        /* if the app provided a dash-host, and we are not treating
         * them as requested or "soft" locations, then use those nodes
         */
        if (!orte_soft_locations && NULL != app->dash_host) {
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using dash_host",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes,
                                                                    app->dash_host))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        } else if (NULL != app->hostfile) {
            /* otherwise, if the app provided a hostfile, then use that */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using hostfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 app->hostfile));
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   app->hostfile))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        } else if (NULL != orte_rankfile) {
            /* use the rankfile, if provided */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using rankfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 orte_rankfile));
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   orte_rankfile))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            if (0 == opal_list_get_size(&nodes)) {
                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                     "%s nothing found in given rankfile",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                OBJ_DESTRUCT(&nodes);
                return ORTE_ERR_BAD_PARAM;
            }
        } else if (NULL != orte_default_hostfile) {
            /* fall back to the default hostfile, if provided */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using default hostfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 orte_default_hostfile));
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes,
                                                                   orte_default_hostfile))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* this is a special case - we always install a default
             * hostfile, but it is empty. If the user didn't remove it
             * or put something into it, then we will have pursued that
             * option and found nothing. This isn't an error, we just need
             * to add all the known nodes
             */
            if (0 == opal_list_get_size(&nodes)) {
                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                     "%s nothing in default hostfile - using known nodes",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto addknown;
            }
        } else {
            /* if nothing else was available, then use all known nodes, which
             * will include ourselves
             */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s using known nodes",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto addknown;
        }
        /** if we still don't have anything */
        if (0 == opal_list_get_size(&nodes)) {
            if (!silent) {
                orte_show_help("help-orte-rmaps-base.txt",
                               "orte-rmaps-base:no-available-resources",
                               true);
            }
            OBJ_DESTRUCT(&nodes);
            return ORTE_ERR_SILENT;
        }
        /* find the nodes in our node array and assemble them
         * in daemon order if the vm was launched
         */
        while (NULL != (item = opal_list_remove_first(&nodes))) {
            nptr = (orte_node_t*)item;
            nd = NULL;
            for (i=0; i < orte_node_pool->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                    continue;
                }
                if (0 != strcmp(node->name, nptr->name)) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s DOESNT MATCH NODE %s",
                                         node->name, nptr->name));
                    continue;
                }
                /* ignore nodes that are marked as do-not-use for this mapping */
                if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s IS MARKED NO_USE", node->name));
                    /* reset the state so it can be used another time */
                    node->state = ORTE_NODE_STATE_UP;
                    continue;
                }
                if (ORTE_NODE_STATE_DOWN == node->state) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s IS DOWN", node->name));
                    continue;
                }
                if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s IS MARKED NO_INCLUDE", node->name));
                    /* not to be used */
                    continue;
                }
                /* if this node wasn't included in the vm (e.g., by -host), ignore it,
                 * unless we are mapping prior to launching the vm
                 */
                if (NULL == node->daemon && !novm) {
                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                         "NODE %s HAS NO DAEMON", node->name));
                    continue;
                }
                /* retain a copy for our use in case the item gets
                 * destructed along the way
                 */
                OBJ_RETAIN(node);
                if (initial_map) {
                    /* if this is the first app_context we
                     * are getting for an initial map of a job,
                     * then mark all nodes as unmapped
                     */
                    node->mapped = false;
                }
                if (NULL == nd || NULL == nd->daemon ||
                    NULL == node->daemon ||
                    nd->daemon->name.vpid < node->daemon->name.vpid) {
                    /* just append to end */
                    opal_list_append(allocated_nodes, &node->super);
                    nd = node;
                } else {
                    /* starting from end, put this node in daemon-vpid order */
                    while (node->daemon->name.vpid < nd->daemon->name.vpid) {
                        if (opal_list_get_begin(allocated_nodes) == opal_list_get_prev(&nd->super)) {
                            /* insert at beginning */
                            opal_list_prepend(allocated_nodes, &node->super);
                            goto moveon1;
                        }
                        nd = (orte_node_t*)opal_list_get_prev(&nd->super);
                    }
                    item = opal_list_get_next(&nd->super);
                    if (item == opal_list_get_end(allocated_nodes)) {
                        /* we are at the end - just append */
                        opal_list_append(allocated_nodes, &node->super);
                    } else {
                        nd = (orte_node_t*)item;
                        opal_list_insert_pos(allocated_nodes, item, &node->super);
                    }
                moveon1:
                    /* reset us back to the end for the next node */
                    nd = (orte_node_t*)opal_list_get_last(allocated_nodes);
                }
            }
            OBJ_RELEASE(nptr);
        }
        OBJ_DESTRUCT(&nodes);
        /* now prune for usage and compute total slots */
        goto complete;
    }

 addknown:
    /* if the hnp was allocated, include it unless flagged not to */
    if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
            if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "HNP IS MARKED NO_USE"));
                /* clear this for future use, but don't include it */
                node->state = ORTE_NODE_STATE_UP;
            } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) {
                OBJ_RETAIN(node);
                if (initial_map) {
                    /* if this is the first app_context we
                     * are getting for an initial map of a job,
                     * then mark all nodes as unmapped
                     */
                    node->mapped = false;
                }
                opal_list_append(allocated_nodes, &node->super);
            }
        }
    }
    
    /* add everything in the node pool that can be used - add them
     * in daemon order, which may be different than the order in the
     * node pool. Since an empty list is passed into us, the list at
     * this point either has the HNP node or nothing, and the HNP
     * node obviously has a daemon on it (us!)
     */
    if (0 == opal_list_get_size(allocated_nodes)) {
        /* the list is empty */
        nd = NULL;
    } else {
        nd = (orte_node_t*)opal_list_get_last(allocated_nodes);
    }
    for (i=1; i < orte_node_pool->size; i++) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
            /* ignore nodes that are marked as do-not-use for this mapping */
            if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "NODE %s IS MARKED NO_USE", node->name));
                /* reset the state so it can be used another time */
                node->state = ORTE_NODE_STATE_UP;
                continue;
            }
            if (ORTE_NODE_STATE_DOWN == node->state) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "NODE %s IS MARKED DOWN", node->name));
                continue;
            }
            if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "NODE %s IS MARKED NO_INCLUDE", node->name));
                /* not to be used */
                continue;
            }
            /* if this node wasn't included in the vm (e.g., by -host), ignore it,
             * unless we are mapping prior to launching the vm
             */
            if (NULL == node->daemon && !novm) {
                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output,
                                     "NODE %s HAS NO DAEMON", node->name));
                continue;
            }
            /* retain a copy for our use in case the item gets
             * destructed along the way
             */
            OBJ_RETAIN(node);
            if (initial_map) {
                /* if this is the first app_context we
                 * are getting for an initial map of a job,
                 * then mark all nodes as unmapped
                 */
                node->mapped = false;
            }
            if (NULL == nd || NULL == nd->daemon ||
		NULL == node->daemon ||
                nd->daemon->name.vpid < node->daemon->name.vpid) {
                /* just append to end */
                opal_list_append(allocated_nodes, &node->super);
                nd = node;
            } else {
                /* starting from end, put this node in daemon-vpid order */
                while (node->daemon->name.vpid < nd->daemon->name.vpid) {
                    if (opal_list_get_begin(allocated_nodes) == opal_list_get_prev(&nd->super)) {
                        /* insert at beginning */
                        opal_list_prepend(allocated_nodes, &node->super);
                        goto moveon;
                    }
                    nd = (orte_node_t*)opal_list_get_prev(&nd->super);
                }
                item = opal_list_get_next(&nd->super);
                if (item == opal_list_get_end(allocated_nodes)) {
                    /* we are at the end - just append */
                    opal_list_append(allocated_nodes, &node->super);
                } else {
                    nd = (orte_node_t*)item;
                    opal_list_insert_pos(allocated_nodes, item, &node->super);
                }
            moveon:
                /* reset us back to the end for the next node */
                nd = (orte_node_t*)opal_list_get_last(allocated_nodes);
            }
        }
    }

    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                         "%s Starting with %d nodes in list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)opal_list_get_size(allocated_nodes)));

    /** check that anything is here */
    if (0 == opal_list_get_size(allocated_nodes)) {
        if (!silent) {
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
        }
        return ORTE_ERR_SILENT;
    }
    
    /* filter the nodes thru any hostfile and dash-host options */
    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                         "%s Filtering thru apps",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_filter_nodes(app, allocated_nodes, true))
        && ORTE_ERR_TAKE_NEXT_OPTION != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                         "%s Retained %d nodes in list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)opal_list_get_size(allocated_nodes)));

 complete:
    /* remove all nodes that are already at max usage, and
     * compute the total number of allocated slots while
     * we do so
     */
    num_slots = 0;
    item  = opal_list_get_first(allocated_nodes);
    while (item != opal_list_get_end(allocated_nodes)) {
        /** save the next pointer in case we remove this node */
        next  = opal_list_get_next(item);
        /** check to see if this node is fully used - remove if so */
        node = (orte_node_t*)item;
        if (0 != node->slots_max && node->slots_inuse > node->slots_max) {
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s Removing node %s: max %d inuse %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name, node->slots_max, node->slots_inuse));
            opal_list_remove_item(allocated_nodes, item);
            OBJ_RELEASE(item);  /* "un-retain" it */
        } else if (node->slots <= node->slots_inuse &&
                   (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
            /* remove the node as fully used */
            OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                 "%s Removing node %s slots %d inuse %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name, node->slots, node->slots_inuse));
            opal_list_remove_item(allocated_nodes, item);
            OBJ_RELEASE(item);  /* "un-retain" it */
        } else if (node->slots > node->slots_inuse) {
                /* add the available slots */
                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                     "%s node %s has %d slots available",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     node->name, node->slots - node->slots_inuse));
                num_slots += node->slots - node->slots_inuse;
        } else if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {
                /* nothing needed to do here - we don't add slots to the
                 * count as we don't have any available. Just let the mapper
                 * do what it needs to do to meet the request
                 */
                OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
                                     "%s node %s is fully used, but available for oversubscrition",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     node->name));
        } else {
            /* if we cannot use it, remove it from list */
            opal_list_remove_item(allocated_nodes, item);
            OBJ_RELEASE(item);  /* "un-retain" it */
        }
        /** go on to next item */
        item = next;
    }

    /* Sanity check to make sure we have resources available */
    if (0 == opal_list_get_size(allocated_nodes)) {
        if (silent) {
            /* let the caller know that the resources exist,
             * but are currently busy
             */
            return ORTE_ERR_RESOURCE_BUSY;
        } else {
            orte_show_help("help-orte-rmaps-base.txt", 
                           "orte-rmaps-base:all-available-resources-used", true);
            return ORTE_ERR_SILENT;
        }
    }
    
    *total_num_slots = num_slots;
    
    if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
        opal_output(0, "AVAILABLE NODES FOR MAPPING:");
        for (item = opal_list_get_first(allocated_nodes);
             item != opal_list_get_end(allocated_nodes);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            opal_output(0, "    node: %s daemon: %s", node->name,
                        (NULL == node->daemon) ? "NULL" : ORTE_VPID_PRINT(node->daemon->name.vpid));
        }
    }

    return ORTE_SUCCESS;
}
Beispiel #2
0
int orte_rmaps_base_set_ranking_policy(orte_ranking_policy_t *policy,
                                       orte_mapping_policy_t mapping,
                                       char *spec)
{
    orte_ranking_policy_t tmp;
    char **ck;
    size_t len;

    /* set default */
    tmp = 0;

    if (NULL == spec) {
        /* check for map-by object directives - we set the
         * ranking to match if one was given
         */
        if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(mapping)) {
            if (ORTE_MAPPING_BYCORE & ORTE_GET_MAPPING_POLICY(mapping)) {
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_CORE);
            } else if (ORTE_MAPPING_BYNODE & ORTE_GET_MAPPING_POLICY(mapping)) {
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NODE);
            } else if (ORTE_MAPPING_BYL1CACHE & ORTE_GET_MAPPING_POLICY(mapping)) {
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L1CACHE);
            } else if (ORTE_MAPPING_BYL2CACHE & ORTE_GET_MAPPING_POLICY(mapping)) {
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L2CACHE);
            } else if (ORTE_MAPPING_BYL3CACHE & ORTE_GET_MAPPING_POLICY(mapping)) {
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L3CACHE);
            } else if (ORTE_MAPPING_BYSOCKET & ORTE_GET_MAPPING_POLICY(mapping)) {
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SOCKET);
            } else if (ORTE_MAPPING_BYNUMA & ORTE_GET_MAPPING_POLICY(mapping)) {
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NUMA);
            } else if (ORTE_MAPPING_BYBOARD & ORTE_GET_MAPPING_POLICY(mapping)) {
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_BOARD);
            } else if (ORTE_MAPPING_BYHWTHREAD & ORTE_GET_MAPPING_POLICY(mapping)) {
                ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_HWTHREAD);
            }
        } else {
            /* if no map-by was given, default to by-slot */
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT);
        }
    } else {
        ck = opal_argv_split(spec, ':');
        if (2 < opal_argv_count(ck)) {
            /* incorrect format */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", policy);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        if (2 == opal_argv_count(ck)) {
            if (0 == strncasecmp(ck[1], "span", strlen(ck[1]))) {
                ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_SPAN);
            } else if (0 == strncasecmp(ck[1], "fill", strlen(ck[1]))) {
                ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_FILL);
            } else {
                /* unrecognized modifier */
                orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, "ranking", ck[1]);
                opal_argv_free(ck);
                return ORTE_ERR_SILENT;
            }
        }
        len = strlen(ck[0]);
        if (0 == strncasecmp(ck[0], "slot", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SLOT);
        } else if (0 == strncasecmp(ck[0], "node", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NODE);
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[0], "hwthread", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_HWTHREAD);
        } else if (0 == strncasecmp(ck[0], "core", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_CORE);
        } else if (0 == strncasecmp(ck[0], "l1cache", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L1CACHE);
        } else if (0 == strncasecmp(ck[0], "l2cache", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L2CACHE);
        } else if (0 == strncasecmp(ck[0], "l3cache", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_L3CACHE);
        } else if (0 == strncasecmp(ck[0], "socket", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_SOCKET);
        } else if (0 == strncasecmp(ck[0], "numa", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_NUMA);
        } else if (0 == strncasecmp(ck[0], "board", len)) {
            ORTE_SET_RANKING_POLICY(tmp, ORTE_RANK_BY_BOARD);
#endif
        } else {
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "ranking", rmaps_base_ranking_policy);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
        ORTE_SET_RANKING_DIRECTIVE(tmp, ORTE_RANKING_GIVEN);
    }

    *policy = tmp;
    return ORTE_SUCCESS;
}
static int
ompi_mtl_psm_component_register(void)
{
    int value;
    char *service_id = NULL;
    char *path_res = NULL;
    
    mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version, 
			   "connect_timeout",
			   "PSM connection timeout value in seconds",
			   false, false, 180, &ompi_mtl_psm.connect_timeout);
  
    mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version, 
			   "debug",
			   "PSM debug level",
			   false, false, 1, 
			   &value);
    ompi_mtl_psm.debug_level = value;
  
    mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version, 
			   "ib_unit",
			   "Truescale unit to use",
			   false, false, -1, 
			   &ompi_mtl_psm.ib_unit);

    mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version, 
			   "ib_port",
			   "Truescale port on unit to use",
			   false, false, 0, 
			   &ompi_mtl_psm.ib_port);

    mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version, 
			   "ib_service_level",
			   "Infiniband service level"
			   "(0 <= SL <= 15)",
			   false, false, 0, &ompi_mtl_psm.ib_service_level);
  
    ompi_mtl_psm.ib_pkey = 0x7fffUL;
    mca_base_param_reg_int(&mca_mtl_psm_component.super.mtl_version, 
			   "ib_pkey",
			   "Infiniband partition key",
			   false, false, 0x7fffUL, 
			   &value);
    ompi_mtl_psm.ib_pkey = value;
    
#if PSM_VERNO >= 0x010d
    mca_base_param_reg_string(&mca_mtl_psm_component.super.mtl_version,
			      "ib_service_id",
			      "Infiniband service ID to use for application (default is 0)",
			      false, false, "0x1000117500000000",
			      &service_id);
    ompi_mtl_psm.ib_service_id = (uint64_t) strtoull(service_id, NULL, 0);
    
    mca_base_param_reg_string(&mca_mtl_psm_component.super.mtl_version,
			      "path_query",
			      "Path record query mechanisms (valid values: opp, none)",
			      false, false, NULL, &path_res);
    if ((NULL != path_res) && strcasecmp(path_res, "none")) {
      if (!strcasecmp(path_res, "opp"))
	ompi_mtl_psm.path_res_type = PSM_PATH_RES_OPP;
      else {
	orte_show_help("help-mtl-psm.txt",
		       "path query mechanism unknown", true,
		       path_res, "OfedPlus (opp) | Static Routes (none)");
	return OMPI_ERR_NOT_FOUND;
      }
    }
    else {
      /* Default is "static/none" path record queries */
      ompi_mtl_psm.path_res_type = PSM_PATH_RES_NONE;
    }
#endif
    
    if (ompi_mtl_psm.ib_service_level < 0)  {
      ompi_mtl_psm.ib_service_level = 0;
    } else if (ompi_mtl_psm.ib_service_level > 15) {
      ompi_mtl_psm.ib_service_level = 15;
    }
  
  return OMPI_SUCCESS;
    
}
Beispiel #4
0
static void check_config(int fd, short args, void *cbdata)
{
    DIR *dirp = NULL;
    struct dirent * dir_entry;
    struct stat buf;
    int i, rc, n, j, k, m;
    char *fullpath;
    orcm_cfgi_app_t *app, *app2, *aptr;
    orcm_cfgi_run_t *run;
    orcm_cfgi_exec_t *exec, *exec2, *eptr;
    orcm_cfgi_version_t *vers, *vers2, *vptr;
    orcm_cfgi_bin_t *bin;
    orte_job_t *jdat, *jptr;
    orte_app_context_t *ax;
    opal_pointer_array_t found_apps;
    bool found, dir_found;
    orcm_cfgi_caddy_t *caddy;

    /* take control */
    ORTE_ACQUIRE_THREAD(&orcm_cfgi_base.ctl);

    OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                         "%s CHECKING CONFIG DIRECTORY %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         mca_orcm_cfgi_file_component.dir));

    /* Open the directory so we can get a listing */
    if (NULL == (dirp = opendir(mca_orcm_cfgi_file_component.dir))) {
        if (0 < opal_output_get_verbosity(orcm_cfgi_base.output)) {
            orte_show_help("help-cfgi-file.txt", "no-dir",
                           true, mca_orcm_cfgi_file_component.dir);
        }
        dir_found = false;
        goto restart;
    }
    dir_found = true;

    /* setup the array of apps */
    OBJ_CONSTRUCT(&found_apps, opal_pointer_array_t);
    opal_pointer_array_init(&found_apps, 16, INT_MAX, 16);

    /* cycle thru the directory */
    while (NULL != (dir_entry = readdir(dirp))) {
        /* Skip the obvious */
        if (0 == strncmp(dir_entry->d_name, ".", strlen(".")) ||
            0 == strncmp(dir_entry->d_name, "..", strlen(".."))) {
            continue;
        }
        /* Skip editor-related files */
        if (NULL != strstr(dir_entry->d_name, ".swp") ||
            NULL != strstr(dir_entry->d_name, ".swx") ||
            NULL != strchr(dir_entry->d_name, '~')) {
            continue;
        }
        if ('#' == dir_entry->d_name[0]) {
            continue;
        }

        /* parse the file, adding all found apps to the array */
        fullpath = opal_os_path(false, mca_orcm_cfgi_file_component.dir, dir_entry->d_name, NULL);
        if (ORCM_SUCCESS != (rc = parse_file(fullpath, &found_apps))) {
            OPAL_OUTPUT_VERBOSE((1, orcm_cfgi_base.output,
                                 "%s CANNOT PARSE FILE %s: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 dir_entry->d_name, ORTE_ERROR_NAME(rc)));
        }
        free(fullpath);
    }
    closedir(dirp);

    /* cycle thru the installed apps */
    for (i=0; i < orcm_cfgi_base.installed_apps.size; i++) {
        if (NULL == (app = (orcm_cfgi_app_t*)opal_pointer_array_get_item(&orcm_cfgi_base.installed_apps, i))) {
            continue;
        }
        app->modified = false;
        /* is this app present in the found apps? */
        app2 = NULL;
        for (j=0; j < found_apps.size; j++) {
            if (NULL == (aptr = (orcm_cfgi_app_t*)opal_pointer_array_get_item(&found_apps, j))) {
                continue;
            }
            if (0 == strcmp(app->application, aptr->application)) {
                app2 = aptr;
                /* remove it from the found_apps array as we will now process it */
                opal_pointer_array_set_item(&found_apps, j, NULL);
                break;
            }
        }
        if (NULL == app2) {
            OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                 "%s APP %s IS NO LONGER INSTALLED",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 app->application));
            /* no longer present - remove this object from the installed array */
            opal_pointer_array_set_item(&orcm_cfgi_base.installed_apps, app->idx, NULL);
            /* find all instances */
            for (j=0; j < app->instances.size; j++) {
                if (NULL == (run = (orcm_cfgi_run_t*)opal_pointer_array_get_item(&app->instances, j))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                     "%s APP %s IS NO LONGER INSTALLED - KILLING INSTANCE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     app->application, run->instance));
                /* remove it from the array */
                opal_pointer_array_set_item(&app->instances, j, NULL);
                run->app = NULL;
                run->app_idx = -1;
                /* delink all the binaries */
                for (k=0; k < run->binaries.size; k++) {
                    if (NULL == (bin = (orcm_cfgi_bin_t*)opal_pointer_array_get_item(&run->binaries, k))) {
                        continue;
                    }
                    bin->vers = NULL;
                    bin->exec = NULL;
                }
                /* kill the associated executing job, if any */
                caddy = OBJ_NEW(orcm_cfgi_caddy_t);
                caddy->cmd = ORCM_CFGI_KILL_JOB;
                /* retain the run object as it has -not- been removed from
                 * the running config
                 */
                OBJ_RETAIN(run);
                caddy->run = run;
                /* send it off to be processed */
                opal_fd_write(orcm_cfgi_base.launch_pipe[1], sizeof(orcm_cfgi_caddy_t*), &caddy);
            }
            /* release it */
            OBJ_RELEASE(app);
            continue;
        }
        /* app was present - did we modify it */
        if (app->max_instances != app2->max_instances) {
            app->max_instances = app2->max_instances;
            app->modified = true;
        }
        /* did we remove any executables? */
        for (j=0; j < app->executables.size; j++) {
            if (NULL == (exec = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app->executables, j))) {
                continue;
            }
            /* is it present in the found apps */
            exec2 = NULL;
            for (k=0; k < app2->executables.size; k++) {
                if (NULL == (eptr = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app2->executables, k))) {
                    continue;
                }
                if (0 == strcmp(exec->appname, eptr->appname)) {
                    exec2 = eptr;
                    break;
                }
            }
            if (NULL == exec2) {
                OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                     "%s APP %s EXECUTABLE %s IS NO LONGER INSTALLED",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     app->application, exec->appname));
                /* this executable has been removed */
                opal_pointer_array_set_item(&app->executables, j, NULL);
                /* find all instances
                 * that use this executable and kill associated binaries
                 */
                for (k=0; k < app->instances.size; k++) {
                    if (NULL == (run = (orcm_cfgi_run_t*)opal_pointer_array_get_item(&app->instances, k))) {
                        continue;
                    }
                    /* search the binaries to see if they include this executable */
                    for (n=0; n < run->binaries.size; n++) {
                        if (NULL == (bin = (orcm_cfgi_bin_t*)opal_pointer_array_get_item(&run->binaries, n))) {
                            continue;
                        }
                        if (0 == strcmp(bin->appname, exec->appname)) {
                            OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                                 "%s APP %s EXECUTABLE %s IS NO LONGER INSTALLED - KILLING BINARY %s",
                                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                 app->application, exec->appname, bin->binary));
                            exec->total_procs -= bin->num_procs;
                            /* ensure we know it is no longer pointing to an installed exec/version */
                            bin->vers = NULL;
                            bin->exec = NULL;
                            /* kill the associated executing exec, if any */
                            caddy = OBJ_NEW(orcm_cfgi_caddy_t);
                            caddy->cmd = ORCM_CFGI_KILL_EXE;
                            /* retain the run object as it has -not- been removed from
                             * the running config
                             */
                            OBJ_RETAIN(run);
                            caddy->run = run;
                            /* send it off to be processed */
                            opal_fd_write(orcm_cfgi_base.launch_pipe[1], sizeof(orcm_cfgi_caddy_t*), &caddy);
                            break;
                        }
                    }
                }
                OBJ_RELEASE(exec);
                continue;
            }
            /* kept the exec, but was it modified */
            if (exec->process_limit != exec2->process_limit) {
                exec->process_limit = exec2->process_limit;
                app->modified = true;
            }
            /* did we remove any versions */
            for (k=0; k < exec->versions.size; k++) {
                if (NULL == (vers = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec->versions, k))) {
                    continue;
                }
                /* is it present in the found app/exec */
                vers2 = NULL;
                for (n=0; n < exec2->versions.size; n++) {
                    if (NULL == (vptr = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec2->versions, n))) {
                        continue;
                    }
                    if (0 == strcmp(vptr->version, vers->version)) {
                        vers2 = vptr;
                        /* since we have this version, we can remove it from
                         * the found app
                         */
                        opal_pointer_array_set_item(&exec2->versions, n, NULL);
                        break;
                    }
                }
                if (NULL != vers2) {
                    OBJ_RELEASE(vers2);
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                     "%s APP %s EXEC %s VERSION %s IS NO LONGER INSTALLED",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     app->application, exec->appname, vers->version));
                /* nope - been removed, so take it out of the array */
                opal_pointer_array_set_item(&exec->versions, k, NULL);
                /* find all instances and kill this version */
                for (m=0; m < app->instances.size; m++) {
                    if (NULL == (run = (orcm_cfgi_run_t*)opal_pointer_array_get_item(&app->instances, m))) {
                        continue;
                    }
                    /* search the binaries to see if they include this version */
                    for (n=0; n < run->binaries.size; n++) {
                        if (NULL == (bin = (orcm_cfgi_bin_t*)opal_pointer_array_get_item(&run->binaries, n))) {
                            continue;
                        }
                        if (0 == strcmp(bin->appname, exec->appname) &&
                            0 == strcmp(bin->version, vers->version)) {
                            OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                                 "%s APP %s EXECUTABLE %s VERSION %s IS NO LONGER INSTALLED - KILLING BINARY %s",
                                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                 app->application, exec->appname, vers->version, bin->binary));
                            exec->total_procs -= bin->num_procs;
                            /* ensure we know it is no longer pointing to an installed version */
                            bin->vers = NULL;
                            /* kill the associated executing exec, if any */
                            caddy = OBJ_NEW(orcm_cfgi_caddy_t);
                            caddy->cmd = ORCM_CFGI_KILL_EXE;
                            /* retain the run object as it has -not- been removed from
                             * the running config
                             */
                            OBJ_RETAIN(run);
                            caddy->run = run;
                            /* send it off to be processed */
                            opal_fd_write(orcm_cfgi_base.launch_pipe[1], sizeof(orcm_cfgi_caddy_t*), &caddy);
                            break;
                        }
                    }
                }
                /* cleanup */
                OBJ_RELEASE(vers);
            }
        }
        /* did we add any executables or versions */
        for (k=0; k < app2->executables.size; k++) {
            if (NULL == (exec2 = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app2->executables, k))) {
                continue;
            }
            exec = NULL;
            for (j=0; j < app->executables.size; j++) {
                if (NULL == (eptr = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app->executables, j))) {
                    continue;
                }
                if (0 == strcmp(eptr->appname, exec2->appname)) {
                    exec = eptr;
                    break;
                }
            }
            if (NULL == exec) {
                OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                     "%s APP %s ADDING EXECUTABLE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     app->application, exec2->appname));
                /* added this exec - just move it across */
                opal_pointer_array_set_item(&app2->executables, k, NULL);
                exec2->idx = opal_pointer_array_add(&app->executables, exec2);
                app->modified = true;
                continue;
            }
            /* exec already present, and we dealt with mods above - so
             * see if any versions were added.
             */
            for (j=0; j < exec2->versions.size; j++) {
                if (NULL == (vers = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec2->versions, j))) {
                    continue;
                }
                /* if already present, ignore */
                vers2 = NULL;
                for (n=0; n < exec->versions.size; n++) {
                    if (NULL == (vptr = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec->versions, n))) {
                        continue;
                    }
                    if (0 == strcmp(vptr->version, vers->version)) {
                        vers2 = vptr;
                        break;
                    }
                }
                if (NULL == vers2) {
                    OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                                         "%s APP %s ADDING EXECUTABLE %s VERSION %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         app->application, exec2->appname, vers->version));
                    opal_pointer_array_set_item(&exec2->versions, j, NULL);
                    vers->exec = exec;
                    vers->idx = opal_pointer_array_add(&exec->versions, vers);
                    app->modified = true;
                } else {
                    OBJ_RELEASE(vers2);
                }
            }
        }
        /* done with this entry */
        OBJ_RELEASE(app2);
    }

    /* any added applications get handled now - anything still in found_apps
     * would have been added
     */
    for (j=0; j < found_apps.size; j++) {
        if (NULL == (aptr = (orcm_cfgi_app_t*)opal_pointer_array_get_item(&found_apps, j))) {
            continue;
        }
        OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                             "%s ADDING APP %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             aptr->application));
        /* just shift the entry to the installed_apps array */
        aptr->idx = opal_pointer_array_add(&orcm_cfgi_base.installed_apps, aptr);
        /* mark it as modified so it will be handled below */
        aptr->modified = true;
    }
    OBJ_DESTRUCT(&found_apps);

    /* check installed vs configd for anything needing starting,
     * but only check modified apps
     */
    check_installed(false);

 restart:
#ifdef HAVE_SYS_INOTIFY_H
    if (dir_found) {
        if (timer_in_use) {
            /* redefine the event to use inotify now
             * that the dir has been found
             */
            if (0 > (watch = inotify_add_watch(notifier, mca_orcm_cfgi_file_component.dir,
                                               IN_DELETE | IN_MODIFY | IN_MOVE))) {
                close(notifier);
                opal_event_evtimer_add(probe_ev, &probe_time);
            } else {
                opal_event_del(probe_ev);
                opal_event_set(opal_event_base, probe_ev, notifier,
                               OPAL_EV_READ|OPAL_EV_PERSIST, inotify_handler, NULL);
                opal_event_add(probe_ev, 0);
                timer_in_use = false;
            }
        } else {
            /* reset the event */
            opal_event_add(probe_ev, 0);
        }
    } else {
        /* restart the timer so we keep looking for it */
        opal_event_evtimer_add(probe_ev, &probe_time);
    }
#else
    /* restart the timer */
    opal_event_evtimer_add(probe_ev, &probe_time);
#endif
    /* release control */
    ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
}
Beispiel #5
0
/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
static int orte_rmaps_base_open(mca_base_open_flag_t flags)
{
    int rc;

    /* init the globals */
    OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t);
    orte_rmaps_base.ppr = NULL;
    orte_rmaps_base.slot_list = NULL;
    orte_rmaps_base.mapping = 0;
    orte_rmaps_base.ranking = 0;

#if OPAL_HAVE_HWLOC
    /* if a topology file was given, then set our topology
     * from it. Even though our actual topology may differ,
     * mpirun only needs to see the compute node topology
     * for mapping purposes
     */
    if (NULL != rmaps_base_topo_file) {
        if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(rmaps_base_topo_file))) {
            orte_show_help("help-orte-rmaps-base.txt", "topo-file", true, rmaps_base_topo_file);
            return ORTE_ERR_SILENT;
        }
    }
#endif

    if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping,
                                                                 &orte_rmaps_base.device,
                                                                 rmaps_base_mapping_policy))) {
        return rc;
    }

    if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&orte_rmaps_base.ranking,
                                                                 orte_rmaps_base.mapping,
                                                                 rmaps_base_ranking_policy))) {
        return rc;
    }

    if (rmaps_base_byslot) {
        /* set mapping policy to byslot - error if something else already set */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* set ranking policy to byslot - error if something else already set */
        if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
            ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) {
            /* error - cannot redefine the default ranking policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
                           "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }

    if (rmaps_base_bynode) {
        /* set mapping policy to bynode - error if something else already set */
        if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) {
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN);
        /* set ranking policy to bynode - error if something else already set */
        if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) &&
            ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) {
            /* error - cannot redefine the default ranking policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking",
                           "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE);
        ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN);
    }

    /* Should we schedule on the local node or not? */
    if (rmaps_base_no_schedule_local) {
        orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL;
    }

    /* Should we oversubscribe or not? */
    if (rmaps_base_no_oversubscribe) {
        if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
    }

    /** force oversubscription permission */
    if (rmaps_base_oversubscribe) {
        if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) &&
            (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            /* error - cannot redefine the default mapping policy */
            orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping",
                           "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping));
            return ORTE_ERR_SILENT;
        }
        ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN);
    }

    /* should we display a detailed (developer-quality) version of the map after determining it? */
    if (rmaps_base_display_devel_map) {
        orte_rmaps_base.display_map = true;
        orte_devel_level_output = true;
    }
   
    /* should we display a diffable report of proc locations after determining it? */
    if (rmaps_base_display_diffable_map) {
        orte_rmaps_base.display_map = true;
        orte_display_diffable_output = true;
    }

    /* Open up all available components */
    rc = mca_base_framework_components_open(&orte_rmaps_base_framework, flags);

    /* check to see if any component indicated a problem */
    if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) {
        /* the component would have already reported the error, so
         * tell the rest of the chain to shut up
         */
        return ORTE_ERR_SILENT;
    }

    /* All done */
    return rc;
}
/*
 * Construct the fullpath to the session directory
 */
int
orte_session_dir_get_name(char **fulldirpath,
                          char **return_prefix,  /* This will come back as the valid tmp dir */
                          char **return_frontend,
                          char *hostid,
                          char *batchid, 
                          orte_process_name_t *proc) {
    char *hostname  = NULL, 
        *batchname = NULL,
        *sessions  = NULL, 
        *user      = NULL, 
        *prefix = NULL,
        *frontend = NULL,
        *jobfam = NULL,
        *job = NULL,
        *vpidstr = NULL;
    bool prefix_provided = false;
    int exit_status = ORTE_SUCCESS;
    size_t len;
    int uid;
    struct passwd *pwdent;
    
    /* Ensure that system info is set */
    orte_proc_info();

     /* get the name of the user */
    uid = getuid();
#ifdef HAVE_GETPWUID
    pwdent = getpwuid(uid);
#else
    pwdent = NULL;
#endif
    if (NULL != pwdent) {
        user = strdup(pwdent->pw_name);
    } else {
        orte_show_help("help-orte-runtime.txt",
                       "orte:session:dir:nopwname", true);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    
    /*
     * set the 'hostname'
     */
    if( NULL != hostid) { /* User specified version */
        hostname = strdup(hostid);
    }
    else {            /* check if it is set elsewhere */
        if( NULL != orte_process_info.nodename)
            hostname = strdup(orte_process_info.nodename);
        else {
            /* Couldn't find it, so fail */
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            exit_status = ORTE_ERR_BAD_PARAM;
            goto cleanup;
        }
    }
    
    /*
     * set the 'batchid'
     */
    if (NULL != batchid)
        batchname = strdup(batchid);
    else 
        batchname = strdup("0");

    /*
     * get the front part of the session directory
     * Will look something like:
     *    openmpi-sessions-USERNAME@HOSTNAME_BATCHID
     */
    if (NULL != orte_process_info.top_session_dir) {
        frontend = strdup(orte_process_info.top_session_dir);
    }
    else { /* If not set then construct it */
        if (0 > asprintf(&frontend, "openmpi-sessions-%s@%s_%s", user, hostname, batchname)) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
    }

    /*
     * Construct the session directory
     */
    /* If we were given a valid vpid then we can construct it fully into:
     *   openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
     */
    if( NULL != proc) {
        if (ORTE_VPID_INVALID != proc->vpid) {
            
            if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            
            if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            
            if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->vpid)) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            
            sessions = opal_os_path( false, frontend, jobfam, job, vpidstr, NULL );
            if( NULL == sessions ) {
                ORTE_ERROR_LOG(ORTE_ERROR);
                exit_status = ORTE_ERROR;
                goto cleanup;
            }
        }
        /* If we were given a valid jobid then we can construct it partially into:
         *   openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
         */
        else if (ORTE_JOBID_INVALID != proc->jobid) {
            if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            
            if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            
            sessions = opal_os_path( false, frontend, jobfam, job, NULL );
            if( NULL == sessions ) {
                ORTE_ERROR_LOG(ORTE_ERROR);
                exit_status = ORTE_ERROR;
                goto cleanup;
            }
        } /* if both are invalid */
        else {
            sessions = strdup(frontend); /* must dup this to avoid double-free later */
        }
        
    }    /* If we were not given a proc at all, then we just set it to frontend
     */
    else {
        sessions = strdup(frontend); /* must dup this to avoid double-free later */
    }
    
    /*
     * If the user specified an invalid prefix, or no prefix at all
     * we need to keep looking
     */
    if( NULL != fulldirpath && NULL != *fulldirpath) {
        free(*fulldirpath);
        *fulldirpath = NULL;
    }

    if( NULL != return_prefix && NULL != *return_prefix) { /* use the user specified one, if available */ 
        prefix = strdup(*return_prefix); 
        prefix_provided = true;
    }
    /* Try to find a proper alternative prefix */
    else if (NULL != orte_process_info.tmpdir_base) { /* stored value */
        prefix = strdup(orte_process_info.tmpdir_base);
    }
    else { /* General Environment var */
        prefix = strdup(opal_tmp_directory());
    }
    len = strlen(prefix);
    /* check for a trailing path separator */
    if (OPAL_PATH_SEP[0] == prefix[len-1]) {
        prefix[len-1] = '\0';
    }
    
    /* BEFORE doing anything else, check to see if this prefix is
     * allowed by the system
     */
    if (NULL != orte_prohibited_session_dirs) {
        char **list;
        int i, len;
        /* break the string into tokens - it should be
         * separated by ','
         */
        list = opal_argv_split(orte_prohibited_session_dirs, ',');
        len = opal_argv_count(list);
        /* cycle through the list */
        for (i=0; i < len; i++) {
            /* check if prefix matches */
            if (0 == strncmp(prefix, list[i], strlen(list[i]))) {
                /* this is a prohibited location */
                orte_show_help("help-orte-runtime.txt",
                               "orte:session:dir:prohibited",
                               true, prefix, orte_prohibited_session_dirs);
                return ORTE_ERR_FATAL;
            }
        }
        opal_argv_free(list);  /* done with this */
    }
    /*
     * Construct the absolute final path, if requested
     */
    if (NULL != fulldirpath) {
        *fulldirpath = opal_os_path(false, prefix, sessions, NULL);
    }

    /* 
     * Return the frontend and prefix, if user requested we do so 
     */ 
    if (NULL != return_frontend) { 
        *return_frontend = strdup(frontend); 
    } 
    if (!prefix_provided && NULL != return_prefix) { 
        *return_prefix = strdup(prefix); 
    } 

 cleanup:
    if(NULL != hostname)
        free(hostname);
    if(NULL != batchname)
        free(batchname);
    if(NULL != sessions)
        free(sessions);
    if(NULL != user)
        free(user);
    if (NULL != prefix) free(prefix);
    if (NULL != frontend) free(frontend);
    if (NULL != jobfam) free(jobfam);
    if (NULL != job) free(job);
    if (NULL != vpidstr) free(vpidstr);

    return exit_status;
}
Beispiel #7
0
static void process_opens(int fd, short args, void *cbdata)
{
    orte_dfs_request_t *dfs = (orte_dfs_request_t*)cbdata;
    int rc;
    opal_buffer_t *buffer;
    char *scheme, *host, *filename, *hostname;
    orte_process_name_t daemon;
    bool found;
    orte_vpid_t v;

    /* get the scheme to determine if we can process locally or not */
    if (NULL == (scheme = opal_uri_get_scheme(dfs->uri))) {
        OBJ_RELEASE(dfs);
        return;
    }

    if (0 == strcmp(scheme, "nfs")) {
        open_local_file(dfs);
        goto complete;
    }

    if (0 != strcmp(scheme, "file")) {
        /* not yet supported */
        orte_show_help("orte_dfs_help.txt", "unsupported-filesystem",
                       true, dfs->uri);
        if (NULL != dfs->open_cbfunc) {
            dfs->open_cbfunc(-1, dfs->cbdata);
        }
        goto complete;
    }

    /* dissect the uri to extract host and filename/path */
    if (NULL == (filename = opal_filename_from_uri(dfs->uri, &host))) {
        goto complete;
    }
    /* if the host is our own, then treat it as a local file */
    if (NULL == host ||
        0 == strcmp(host, orte_process_info.nodename) ||
        0 == strcmp(host, "localhost") ||
        opal_ifislocal(host)) {
        opal_output_verbose(1, orte_dfs_base.output,
                            "%s file %s on local host",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            filename);
        open_local_file(dfs);
        goto complete;
    }

    /* ident the daemon on that host */
    daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
    found = false;
    for (v=0; v < orte_process_info.num_daemons; v++) {
        daemon.vpid = v;
        /* fetch the hostname where this daemon is located */
        if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&daemon, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto complete;
        }
        if (0 == strcmp(host, hostname)) {
            found = true;
            break;
        }
    }
    if (!found) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto complete;
    }
    opal_output_verbose(1, orte_dfs_base.output,
                        "%s file %s on host %s daemon %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        filename, host, ORTE_NAME_PRINT(&daemon));
    /* double-check: if it is our local daemon, then we
     * treat this as local
     */
    if (daemon.vpid == ORTE_PROC_MY_DAEMON->vpid) {
        opal_output_verbose(1, orte_dfs_base.output,
                            "%s local file %s on same daemon",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            filename);
        open_local_file(dfs);
        goto complete;
    }
    /* add this request to our local list so we can
     * match it with the returned response when it comes
     */
    dfs->id = req_id++;
    opal_list_append(&requests, &dfs->super);

    /* setup a message for the daemon telling
     * them what file we want to access
     */
    buffer = OBJ_NEW(opal_buffer_t);
    if (OPAL_SUCCESS != (rc = opal_dss.pack(buffer, &dfs->cmd, 1, ORTE_DFS_CMD_T))) {
        ORTE_ERROR_LOG(rc);
        opal_list_remove_item(&requests, &dfs->super);
        goto complete;
    }
    /* pass the request id */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(buffer, &dfs->id, 1, OPAL_UINT64))) {
        ORTE_ERROR_LOG(rc);
        opal_list_remove_item(&requests, &dfs->super);
        goto complete;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(buffer, &filename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        opal_list_remove_item(&requests, &dfs->super);
        goto complete;
    }
    
    opal_output_verbose(1, orte_dfs_base.output,
                        "%s sending open file request to %s file %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&daemon),
                        filename);
    /* send it */
    if (0 > (rc = orte_rml.send_buffer_nb(&daemon, buffer,
                                          ORTE_RML_TAG_DFS_CMD, 0,
                                          orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buffer);
        opal_list_remove_item(&requests, &dfs->super);
        goto complete;
    }
    /* don't release it */
    return;

 complete:
    OBJ_RELEASE(dfs);
}
Beispiel #8
0
int orte_rmaps_rr_byslot(orte_job_t *jdata,
                         orte_app_context_t *app,
                         opal_list_t *node_list,
                         orte_std_cntr_t num_slots,
                         orte_vpid_t num_procs)
{
    int rc, i, nprocs_mapped;
    orte_node_t *node;
    orte_proc_t *proc;
    int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0;
    hwloc_obj_t obj=NULL;
    float balance;
    bool add_one=false;

    opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu",
                        ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);

    /* check to see if we can map all the procs */
    if (num_slots < ((int)app->num_procs * orte_rmaps_base.cpus_per_rank)) {
        if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                           true, app->num_procs, app->app);
            return ORTE_ERR_SILENT;
        }
    }

    /* first pass: map the number of procs to each node until we
     * map all specified procs or use all allocated slots
     */
    nprocs_mapped = 0;
    OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr:slot working node %s",
                            node->name);
        /* get the root object as we are not assigning
         * locale here except at the node level
         */
        if (NULL != node->topology) {
            obj = hwloc_get_root_obj(node->topology);
        }
        if (node->slots <= node->slots_inuse) {
            opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rr:slot node %s is full - skipping",
                                node->name);
            continue;
        }
        /* assign a number of procs equal to the number of available
         * slots divided by the number of cpus/rank the user
         * requested
         */
        num_procs_to_assign = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr:slot assigning %d procs to node %s",
                            (int)num_procs_to_assign, node->name);

        for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
            /* add this node to the map - do it only once */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);  /* maintain accounting on object */
                ++(jdata->map->num_nodes);
            }
            if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            nprocs_mapped++;
            orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
        }
    }
/*
 * Query the registry for all nodes allocated to a specified app_context
 */
int orte_rmaps_base_map_byslot(orte_job_t *jdata, orte_app_context_t *app,
                               opal_list_t *node_list, orte_vpid_t num_procs,
                               opal_list_item_t *cur_node_item)
{
    int rc=ORTE_SUCCESS;
    int i;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_item_t *next;
    orte_vpid_t num_alloc = 0;
    orte_vpid_t start;
    int num_procs_to_assign, num_possible_procs;
    
    /* This loop continues until all procs have been mapped or we run
     out of resources. We determine that we have "run out of
     resources" when either all nodes have slots_max processes mapped to them,
     (thus there are no free slots for a process to be mapped), OR all nodes
     have reached their soft limit and the user directed us to "no oversubscribe".
     If we still have processes that haven't been mapped yet, then it's an
     "out of resources" error. */
    
    start = jdata->num_procs;
    
    while ( num_alloc < num_procs) {
        /** see if any nodes remain unused and available. We need to do this check
         * each time since we may remove nodes from the list (as they become fully
         * used) as we cycle through the loop */
        if(0 >= opal_list_get_size(node_list) ) {
            /* Everything is at max usage! :( */
            orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error",
                           true, num_procs, app->app);
            return ORTE_ERR_SILENT;
        }
        
        /* Save the next node we can use before claiming slots, since
         * we may need to prune the nodes list removing overused nodes.
         * Wrap around to beginning if we are at the end of the list */
        if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
            next = opal_list_get_first(node_list);
        }
        else {
            next = opal_list_get_next(cur_node_item);
        }
        
        /** declare a shorter name for convenience in the code below */
        node = (orte_node_t*) cur_node_item;
        /* If we have available slots on this node, claim all of them 
         * If node_slots == 0, assume 1 slot for that node. 
         * JJH - is this assumption fully justified?
         *
         * If we are now oversubscribing the nodes, then we still take:
         * (a) if the node has not been used yet, we take a full node_slots
         * (b) if some of the slots are in-use, then we take the number of
         *     remaining slots before hitting the soft limit (node_slots)
         * (c) if we are at or above the soft limit, we take a full node_slots
         *     unless we are loadbalancing, in which case we only take one
         *
         * Note: if node_slots is zero, then we always just take 1 slot
         *
         * We continue this process until either everything is done,
         * or all nodes have hit their hard limit. This algorithm ensures we
         * fully utilize each node before oversubscribing, and preserves the ratio
         * of processes between the nodes thereafter (e.g., if one node has twice as
         * many processes as another before oversubscribing, it will continue
         * to do so after oversubscribing).
         */
        if (node->slots_inuse >= node->slots_alloc || 0 == node->slots_inuse) {
            if (0 == node->slots_alloc) {
                num_procs_to_assign = 1;
            } else {
                /* 'num_possible_procs' defines the number of ranks */
                num_possible_procs = node->slots_alloc;
                if (0 == num_possible_procs) {
                    num_procs_to_assign = 1;
                } else {
                    num_procs_to_assign = num_possible_procs;
                }
            }
        } else {
            /* 'num_possible_procs' define number of ranks on the node. Each
             * rank occupies one slot. Each slot may represent more than one
             * cpu, depending on the cpus-per-task setting
             */
            num_possible_procs = (node->slots_alloc - node->slots_inuse);
            if (0 == num_possible_procs) {
                num_procs_to_assign = 1;
            } else {
                num_procs_to_assign = num_possible_procs;
            }
        }
        
        /* check if we are in npernode mode - if so, then set the num_slots_to_take
         * to the num_per_node
         */
        if (0 < jdata->map->npernode) {
            num_procs_to_assign = jdata->map->npernode;
        }
        
        for( i = 0; i < num_procs_to_assign; ++i) {
            proc = NULL;
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 node_list, jdata->map->oversubscribe,
                                                                 true, &proc))) {
                /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
                 * really isn't an error - we just need to break from the loop
                 * since the node is fully used up. For now, just don't report
                 * an error
                 */
                if (ORTE_ERR_NODE_FULLY_USED != rc) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }
            
            /* assign the vpid */
            proc->name.vpid = start++;
            
            /* Update the number of procs allocated */
            ++num_alloc;
            
            /** if all the procs have been mapped, we return */
            if (num_alloc == num_procs) {
                goto complete;
            }
            
            /* if we have fully used up this node, then break from the loop */
            if (ORTE_ERR_NODE_FULLY_USED == rc) {
                break;
            }
        }
        
        /* we move on to the next node in all cases EXCEPT if we came
         * out of the loop without having taken a full bite AND the
         * node is NOT max'd out
         *
         */
        if (i < (num_procs_to_assign-1) && ORTE_ERR_NODE_FULLY_USED != rc) {
            continue;
        }
        cur_node_item = next;
    }
    
complete:    
    /* save the bookmark */
    jdata->bookmark = (orte_node_t*)cur_node_item;
    
    return ORTE_SUCCESS;
}
Beispiel #10
0
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
{
    int ret;
    char *error = NULL;

    if (0 < orte_initialized) {
        /* track number of times we have been called */
        orte_initialized++;
        return ORTE_SUCCESS;
    }
    orte_initialized++;

    /* initialize the opal layer */
    if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) {
        error = "opal_init";
        goto error;
    }
    
    /* Convince OPAL to use our naming scheme */
    opal_process_name_print = _process_name_print_for_opal;
    opal_process_name_vpid = _process_name_vpid_for_opal;
    opal_process_name_jobid = _process_name_jobid_for_opal;
    opal_compare_proc = _process_name_compare;

    /* ensure we know the type of proc for when we finalize */
    orte_process_info.proc_type = flags;

    /* setup the locks */
    if (ORTE_SUCCESS != (ret = orte_locks_init())) {
        error = "orte_locks_init";
        goto error;
    }
    
    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        error = "orte_register_params";
        goto error;
    }
    
    /* setup the orte_show_help system */
    if (ORTE_SUCCESS != (ret = orte_show_help_init())) {
        error = "opal_output_init";
        goto error;
    }
    
    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Ensure the rest of the process info structure is initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        error = "orte_proc_info";
        goto error;
    }

    /* open the ESS and select the correct module for this environment */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ess_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
        error = "orte_ess_base_select";
        goto error;
    }

    if (!ORTE_PROC_IS_APP) {
        /* ORTE tools "block" in their own loop over the event
         * base, so no progress thread is required - apps will
         * start their progress thread in ess_base_std_app.c
         * at the appropriate point
         */
        orte_event_base = opal_event_base;
    }

    /* initialize the RTE for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess.init())) {
        error = "orte_ess_init";
        goto error;
    }
    
    /* All done */
    return ORTE_SUCCESS;
    
 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
Beispiel #11
0
int orte_ess_base_app_setup(bool db_restrict_local)
{
    int ret;
    char *error = NULL;
    opal_list_t transports;

    OPAL_TIMING_ENV_INIT(ess_base_setup);
    /*
     * stdout/stderr buffering
     * If the user requested to override the default setting then do
     * as they wish.
     */
    if( orte_ess_base_std_buffering > -1 ) {
        if( 0 == orte_ess_base_std_buffering ) {
            setvbuf(stdout, NULL, _IONBF, 0);
            setvbuf(stderr, NULL, _IONBF, 0);
        }
        else if( 1 == orte_ess_base_std_buffering ) {
            setvbuf(stdout, NULL, _IOLBF, 0);
            setvbuf(stderr, NULL, _IOLBF, 0);
        }
        else if( 2 == orte_ess_base_std_buffering ) {
            setvbuf(stdout, NULL, _IOFBF, 0);
            setvbuf(stderr, NULL, _IOFBF, 0);
        }
    }

    /* if I am an MPI app, we will let the MPI layer define and
     * control the opal_proc_t structure. Otherwise, we need to
     * do so here */
    if (ORTE_PROC_NON_MPI) {
        orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
        orte_process_info.super.proc_hostname = orte_process_info.nodename;
        orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
        orte_process_info.super.proc_arch = opal_local_arch;
        opal_proc_local_set(&orte_process_info.super);
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "state_framework_open");

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "errmgr_framework_open");

    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* Once the session directory location has been established, set
           the opal_output env file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        /* register the directory for cleanup */
        if (NULL != opal_pmix.register_cleanup) {
            if (orte_standalone_operation) {
                if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.top_session_dir, true, false, true))) {
                    ORTE_ERROR_LOG(ret);
                    error = "register cleanup";
                    goto error;
                }
            } else {
                if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.job_session_dir, true, false, false))) {
                    ORTE_ERROR_LOG(ret);
                    error = "register cleanup";
                    goto error;
                }
            }
        }
    }
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "create_session_dirs");

    /* Setup the communication infrastructure */
    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "routed_framework_open");

    /*
     * OOB Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "oob_framework_open");
    
    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "rml_framework_open");
    
    /* if we have info on the HNP and local daemon, process it */
    if (NULL != orte_process_info.my_hnp_uri) {
        /* we have to set the HNP's name, even though we won't route messages directly
         * to it. This is required to ensure that we -do- send messages to the correct
         * HNP name
         */
        if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                            ORTE_PROC_MY_HNP, NULL))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_rml_parse_HNP";
            goto error;
        }
    }
    if (NULL != orte_process_info.my_daemon_uri) {
        opal_value_t val;

        /* extract the daemon's name so we can update the routing table */
        if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
                                                            ORTE_PROC_MY_DAEMON, NULL))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_rml_parse_daemon";
            goto error;
        }
        /* Set the contact info in the database - this won't actually establish
         * the connection, but just tells us how to reach the daemon
         * if/when we attempt to send to it
         */
        OBJ_CONSTRUCT(&val, opal_value_t);
        val.key = OPAL_PMIX_PROC_URI;
        val.type = OPAL_STRING;
        val.data.string = orte_process_info.my_daemon_uri;
        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_DAEMON, &val))) {
            ORTE_ERROR_LOG(ret);
            val.key = NULL;
            val.data.string = NULL;
            OBJ_DESTRUCT(&val);
            error = "store DAEMON URI";
            goto error;
        }
        val.key = NULL;
        val.data.string = NULL;
        OBJ_DESTRUCT(&val);
    }

    /* setup the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "errmgr_select");

    /* get a conduit for our use - we never route IO over fabric */
    OBJ_CONSTRUCT(&transports, opal_list_t);
    orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                       ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
    if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) {
        ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
        error = "orte_rml_open_mgmt_conduit";
        goto error;
    }
    OPAL_LIST_DESTRUCT(&transports);

    OBJ_CONSTRUCT(&transports, opal_list_t);
    orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                       ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
    if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) {
        ret = ORTE_ERR_OPEN_CONDUIT_FAIL;
        error = "orte_rml_open_coll_conduit";
        goto error;
    }
    OPAL_LIST_DESTRUCT(&transports);
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "rml_open_conduit");

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "grpcomm_framework_open");

    /* open the distributed file system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_select";
        goto error;
    }
    OPAL_TIMING_ENV_NEXT(ess_base_setup, "dfs_framework_open");

    return ORTE_SUCCESS;
 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    return ret;
}
Beispiel #12
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *envar, *ev1, *ev2;
    uint64_t unique_key[2];
    char *string_key;
    char *rmluri;
    opal_value_t *kv, kvn;
    opal_list_t vals;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }
    
    /* we don't have to call pmix.init because the pmix select did it */

    /****   THE FOLLOWING ARE REQUIRED VALUES   ***/
    /* get our jobid from PMI */
    if (!opal_pmix.get_attr(PMIX_JOBID, &kv)) {
        error = "getting jobid";
        ret = ORTE_ERR_NOT_FOUND;
        goto error;
    }
    ORTE_PROC_MY_NAME->jobid = kv->data.uint32;
    OBJ_RELEASE(kv);

    /* get our global rank from PMI */
    if (!opal_pmix.get_attr(PMIX_RANK, &kv)) {
        error = "getting rank";
        ret = ORTE_ERR_NOT_FOUND;
        goto error;
    }
    ORTE_PROC_MY_NAME->vpid = kv->data.uint32;
    OBJ_RELEASE(kv);

    /* get our local rank from PMI */
    if (!opal_pmix.get_attr(PMIX_LOCAL_RANK, &kv)) {
        error = "getting local rank";
        ret = ORTE_ERR_NOT_FOUND;
        goto error;
    }
    orte_process_info.my_local_rank = (orte_local_rank_t)kv->data.uint16;
    OBJ_RELEASE(kv);

    /* get our node rank from PMI */
    if (!opal_pmix.get_attr(PMIX_NODE_RANK, &kv)) {
        error = "getting node rank";
        ret = ORTE_ERR_NOT_FOUND;
        goto error;
    }
    orte_process_info.my_node_rank = (orte_local_rank_t)kv->data.uint16;

    /* get universe size */
    if (!opal_pmix.get_attr(PMIX_UNIV_SIZE, &kv)) {
        error = "getting univ size";
        ret = ORTE_ERR_NOT_FOUND;
        goto error;
    }
    orte_process_info.num_procs = kv->data.uint32;
    OBJ_RELEASE(kv);
    /* push into the environ for pickup in MPI layer for
     * MPI-3 required info key
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) {
        asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs);
        putenv(ev1);
        added_num_procs = true;
    }
    if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
        asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs);
        putenv(ev2);
        added_app_ctx = true;
    }


    /* get our app number from PMI - ok if not found */
    if (opal_pmix.get_attr(PMIX_APPNUM, &kv)) {
        orte_process_info.app_num = kv->data.uint32;
        OBJ_RELEASE(kv);
    } else {
        orte_process_info.app_num = 0;
    }

    /* get the number of local peers - required for wireup of
     * shared memory BTL */
    if (opal_pmix.get_attr(PMIX_LOCAL_SIZE, &kv)) {
        orte_process_info.num_local_peers = kv->data.uint32 - 1;  // want number besides ourselves
        OBJ_RELEASE(kv);
    } else {
        orte_process_info.num_local_peers = 0;
    }

    /* setup transport keys in case the MPI layer needs them -
     * we can use the jobfam and stepid as unique keys
     * because they are unique values assigned by the RM
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
        unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
        unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid);
        if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        putenv(envar);
        added_transport_keys = true;
        /* cannot free the envar as that messes up our environ */
        free(string_key);
    }

#if OPAL_HAVE_HWLOC
    /* if it wasn't passed down to us, get the topology */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "topology discovery";
            goto error;
        }
    }
#endif

    /* we don't need to force the routed system to pick the
     * "direct" component as that should happen automatically
     * in those cases where we are direct launched (i.e., no
     * HNP is defined in the environment */

    /* now that we have all required info, complete the setup */
    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_app_setup";
        goto error;
    }

    /* setup process binding */
    if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) {
        error = "proc_binding";
        goto error;
    }

    /* this needs to be set to enable debugger use when direct launched */
    if (NULL == orte_process_info.my_daemon_uri) {
        orte_standalone_operation = true;
    }

    /* set max procs */
    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    /***  PUSH DATA FOR OTHERS TO FIND   ***/

    /* if we are direct launched, then push our RML URI - there
     * is no need to do so when launched by mpirun as all apps
     * communicate thru their local daemon */
    if (orte_standalone_operation) {
        OBJ_CONSTRUCT(&vals, opal_list_t);
        if (OPAL_SUCCESS != opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME,
                                              OPAL_DSTORE_URI, &vals)) {
            /* construct the RTE string */
            rmluri = orte_rml.get_contact_info();
            /* push it out for others to use */
            OBJ_CONSTRUCT(&kvn, opal_value_t);
            kvn.key = strdup(OPAL_DSTORE_URI);
            kvn.type = OPAL_STRING;
            kvn.data.string = strdup(rmluri);
            if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) {
                error = "db store uri";
                OBJ_DESTRUCT(&kvn);
                goto error;
            }
            OBJ_DESTRUCT(&kvn);
            free(rmluri);
        }
        OPAL_LIST_DESTRUCT(&vals);
    }
    
    /* push our hostname so others can find us, if they need to */
    OBJ_CONSTRUCT(&kvn, opal_value_t);
    kvn.key = strdup(OPAL_DSTORE_HOSTNAME);
    kvn.type = OPAL_STRING;
    kvn.data.string = strdup(orte_process_info.nodename);
    if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) {
        error = "db store hostname";
        OBJ_DESTRUCT(&kvn);
        goto error;
    }
    OBJ_DESTRUCT(&kvn);

    /* if our local rank was not provided by the system, then
     * push our local rank so others can access it */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS != opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME,
                                          OPAL_DSTORE_LOCALRANK, &vals)) {
        OBJ_CONSTRUCT(&kvn, opal_value_t);
        kvn.key = strdup(OPAL_DSTORE_LOCALRANK);
        kvn.type = OPAL_UINT16;
        kvn.data.uint16 = orte_process_info.my_local_rank;
        if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) {
            error = "db store local rank";
            OBJ_DESTRUCT(&kvn);
            goto error;
        }
        OBJ_DESTRUCT(&kvn);
    }
    OPAL_LIST_DESTRUCT(&vals);

    /* if our node rank was not provided by the system, then
     * push our node rank so others can access it */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS != opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME,
                                          OPAL_DSTORE_NODERANK, &vals)) {
        OBJ_CONSTRUCT(&kvn, opal_value_t);
        kvn.key = strdup(OPAL_DSTORE_NODERANK);
        kvn.type = OPAL_UINT16;
        kvn.data.uint16 = orte_process_info.my_node_rank;
        if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) {
            error = "db store node rank";
            OBJ_DESTRUCT(&kvn);
            goto error;
        }
        OBJ_DESTRUCT(&kvn);
    }
    OPAL_LIST_DESTRUCT(&vals);

    /* if we are an ORTE app - and not an MPI app - then
     * we need to exchange our connection info here.
     * MPI_Init has its own modex, so we don't need to do
     * two of them. However, if we don't do a modex at all,
     * then processes have no way to communicate
     *
     * NOTE: only do this when the process originally launches.
     * Cannot do this on a restart as the rest of the processes
     * in the job won't be executing this step, so we would hang
     */
    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
        opal_pmix.fence(NULL, 0);
    }

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    return ret;
}
Beispiel #13
0
int orte_ess_base_proc_binding(void)
{
#if OPAL_HAVE_HWLOC
    hwloc_obj_t node, obj;
    hwloc_cpuset_t cpus, nodeset;
    hwloc_obj_type_t target;
    unsigned int cache_level = 0;
    struct hwloc_topology_support *support;
    char *map;
    int ret;
    char *error;

    /* Determine if we were pre-bound or not */
    if (NULL != getenv("OMPI_MCA_orte_bound_at_launch")) {
        orte_proc_is_bound = true;
        if (NULL != (map = getenv("OMPI_MCA_orte_base_applied_binding"))) {
            orte_proc_applied_binding = hwloc_bitmap_alloc();
            if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) {
                error = "applied_binding parse";
                goto error;
            }
        }
    }

    /* see if we were bound when launched */
    if (!orte_proc_is_bound) {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Not bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* we were not bound at launch */
        if (NULL != opal_hwloc_topology) {
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology);
            /* get our node object */
            node = hwloc_get_root_obj(opal_hwloc_topology);
            nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node);
            /* get our bindings */
            cpus = hwloc_bitmap_alloc();
            if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) {
                /* we are NOT bound if get_cpubind fails, nor can we be bound - the
                 * environment does not support it
                 */
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Binding not supported",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto MOVEON;
            }
            /* we are bound if the two cpusets are not equal,
             * or if there is only ONE cpu available to us
             */
            if (0 != hwloc_bitmap_compare(cpus, nodeset) ||
                opal_hwloc_base_single_cpu(nodeset) ||
                opal_hwloc_base_single_cpu(cpus)) {
                /* someone external set it - indicate it is set
                 * so that we know
                 */
                orte_proc_is_bound = true;
                hwloc_bitmap_free(cpus);
                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                     "%s Process was externally bound",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            } else if (support->cpubind->set_thisproc_cpubind &&
                       OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) &&
                       OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                /* the system is capable of doing processor affinity, but it
                 * has not yet been set - see if a slot_list was given
                 */
                hwloc_bitmap_zero(cpus);
                if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list,
                                                                               opal_hwloc_topology, cpus))) {
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                        error = "Setting processor affinity failed";
                        hwloc_bitmap_free(cpus);
                        goto error;
                    }
                    /* try to find a level and index for this location */
                    opal_hwloc_base_get_level_and_index(cpus, &orte_process_info.bind_level, &orte_process_info.bind_idx);
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    orte_proc_is_bound = true;
                    OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                         "%s Process bound according to slot_list",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                } else {
                    /* cleanup */
                    hwloc_bitmap_free(cpus);
                    /* get the node rank */
                    if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) {
                        /* this is not an error - could be due to being
                         * direct launched - so just ignore and leave
                         * us unbound
                         */
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process not bound - no node rank available",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                        goto MOVEON;
                    }
                    /* if the binding policy is hwthread, then we bind to the nrank-th
                     * hwthread on this node
                     */
                    if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU,
                                                                           0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting hwthread object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to hwthread",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        /* if the binding policy is core, then we bind to the nrank-th
                         * core on this node
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                           0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                        if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                            error = "Setting processor affinity failed";
                            ret = ORTE_ERROR;
                            goto error;
                        }
                        orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL;
                        orte_process_info.bind_idx = orte_process_info.my_node_rank;
                        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                             "%s Process bound to core",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    } else {
                        /* for all higher binding policies, we bind to the specified
                         * object that the nrank-th core belongs to
                         */
                        if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE,
                                                                           0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Getting core object";
                            goto error;
                        }
                        if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 1;
                            orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 2;
                            orte_process_info.bind_level = OPAL_HWLOC_L2CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_CACHE;
                            cache_level = 3;
                            orte_process_info.bind_level = OPAL_HWLOC_L3CACHE_LEVEL;
                        } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_SOCKET;
                            orte_process_info.bind_level = OPAL_HWLOC_SOCKET_LEVEL;
                        } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                            target = HWLOC_OBJ_NODE;
                            orte_process_info.bind_level = OPAL_HWLOC_NUMA_LEVEL;
                        } else {
                            ret = ORTE_ERR_NOT_FOUND;
                            error = "Binding policy not known";
                            goto error;
                        }
                        for (obj = obj->parent; NULL != obj; obj = obj->parent) {
                            if (target == obj->type) {
                                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                                    continue;
                                }
                                /* this is the place! */
                                cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                                if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) {
                                    ret = ORTE_ERROR;
                                    error = "Setting processor affinity failed";
                                    goto error;
                                }
                                orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology,
                                                                                         obj, OPAL_HWLOC_LOGICAL);
                                orte_proc_is_bound = true;
                                OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                                                     "%s Process bound to %s",
                                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                     opal_hwloc_base_print_level(orte_process_info.bind_level)));
                                break;
                            }
                        }
                        if (!orte_proc_is_bound) {
                            ret = ORTE_ERROR;
                            error = "Setting processor affinity failed";
                            goto error;
                        }
                    }
                }
            }
        }
    } else {
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "%s Process bound at launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

 MOVEON:
    /* get or update our local cpuset - it will get used multiple
     * times, so it's more efficient to keep a global copy
     */
    opal_hwloc_base_get_local_cpuset();
    /* report bindings, if requested */
    if (opal_hwloc_report_bindings) {
        char bindings[64];
        hwloc_obj_t root;
        hwloc_cpuset_t cpus;
        /* get the root object for this node */
        root = hwloc_get_root_obj(opal_hwloc_topology);
        cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root);
        /* we are not bound if this equals our cpuset */
        if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) {
            opal_output(0, "%s is not bound",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        } else {
            hwloc_bitmap_list_snprintf(bindings, 64, opal_hwloc_my_cpuset);
            opal_output(0, "%s is bound to cpus %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        bindings);
        }
    }

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ORTE_ERR_SILENT;

#else
    return ORTE_SUCCESS;
#endif
}
int orte_rmaps_base_filter_nodes(orte_app_context_t *app,
                                 opal_list_t *nodes, bool remove)
{
    int rc=ORTE_ERR_TAKE_NEXT_OPTION;

    /* did the app_context contain a hostfile? */
    if (NULL != app->hostfile) {
        /* yes - filter the node list through the file, removing
         * any nodes not found in the file
         */
        if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(nodes, app->hostfile, remove))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /** check that anything is here */
        if (0 == opal_list_get_size(nodes)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "-hostfile", app->hostfile);
            return ORTE_ERR_SILENT;
        }
    }
    /* did the app_context contain an add-hostfile? */
    if (NULL != app->add_hostfile) {
        /* yes - filter the node list through the file, removing
         * any nodes not found in the file
         */
        if (ORTE_SUCCESS != (rc = orte_util_filter_hostfile_nodes(nodes, app->add_hostfile, remove))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /** check that anything is here */
        if (0 == opal_list_get_size(nodes)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "-add-hostfile", app->hostfile);
            return ORTE_ERR_SILENT;
        }
    }
    /* now filter the list through any -host specification */
    if (!orte_soft_locations && NULL != app->dash_host) {
        if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(nodes, app->dash_host, remove))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /** check that anything is left! */
        if (0 == opal_list_get_size(nodes)) {
            char *foo;
            foo = opal_argv_join(app->dash_host, ',');
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "-host", foo);
            free(foo);
            return ORTE_ERR_SILENT;
        }
    }
    /* now filter the list through any add-host specification */
    if (NULL != app->add_host) {
        if (ORTE_SUCCESS != (rc = orte_util_filter_dash_host_nodes(nodes, app->add_host, remove))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /** check that anything is left! */
        if (0 == opal_list_get_size(nodes)) {
            char *foo;
            foo = opal_argv_join(app->dash_host, ',');
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-mapped-node",
                           true, app->app, "-add-host", foo);
            free(foo);
            return ORTE_ERR_SILENT;
        }
    }

    return rc;
}
Beispiel #15
0
int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
{
    hwloc_obj_type_t hwb, hwm;
    unsigned clvl=0, clvm=0;
    opal_binding_policy_t bind;
    orte_mapping_policy_t map;
    orte_node_t *node;
    int i, rc;
    struct hwloc_topology_support *support;
    bool force_down = false;
    int bind_depth, map_depth;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: compute bindings for job %s with policy %s[%x]",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding), jdata->map->binding);

    map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping);
    bind = OPAL_GET_BINDING_POLICY(jdata->map->binding);

    if (ORTE_MAPPING_BYUSER == map) {
        /* user specified binding by rankfile - nothing for us to do */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_CPUSET == bind) {
        int rc;
        /* cpuset was given - setup the bindings */
        if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (OPAL_BIND_TO_NONE == bind) {
        /* no binding requested */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_BOARD == bind) {
        /* doesn't do anything at this time */
        return ORTE_SUCCESS;
    }

    /* binding requested - convert the binding level to the hwloc obj type */
    switch (bind) {
    case OPAL_BIND_TO_NUMA:
        hwb = HWLOC_OBJ_NODE;
        break;
    case OPAL_BIND_TO_SOCKET:
        hwb = HWLOC_OBJ_SOCKET;
        break;
    case OPAL_BIND_TO_L3CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 3;
        break;
    case OPAL_BIND_TO_L2CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 2;
        break;
    case OPAL_BIND_TO_L1CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 1;
        break;
    case OPAL_BIND_TO_CORE:
        hwb = HWLOC_OBJ_CORE;
        break;
    case OPAL_BIND_TO_HWTHREAD:
        hwb = HWLOC_OBJ_PU;
        break;
    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* do the same for the mapping policy */
    switch (map) {
    case ORTE_MAPPING_BYNODE:
    case ORTE_MAPPING_BYSLOT:
    case ORTE_MAPPING_SEQ:
        hwm = HWLOC_OBJ_MACHINE;
        break;
    case ORTE_MAPPING_BYDIST:
    case ORTE_MAPPING_BYNUMA:
        hwm = HWLOC_OBJ_NODE;
        break;
    case ORTE_MAPPING_BYSOCKET:
        hwm = HWLOC_OBJ_SOCKET;
        break;
    case ORTE_MAPPING_BYL3CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 3;
        break;
    case ORTE_MAPPING_BYL2CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 2;
        break;
    case ORTE_MAPPING_BYL1CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 1;
        break;
    case ORTE_MAPPING_BYCORE:
        hwm = HWLOC_OBJ_CORE;
        break;
    case ORTE_MAPPING_BYHWTHREAD:
        hwm = HWLOC_OBJ_PU;
        break;
    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* if the job was mapped by the corresponding target, then
     * we bind in place
     *
     * otherwise, we have to bind either up or down the hwloc
     * tree. If we are binding upwards (e.g., mapped to hwthread
     * but binding to core), then we just climb the tree to find
     * the first matching object.
     *
     * if we are binding downwards (e.g., mapped to node and bind
     * to core), then we have to do a round-robin assigment of
     * procs to the resources below.
     */

    if (ORTE_MAPPING_BYDIST == map) {
        int rc = ORTE_SUCCESS;
        if (OPAL_BIND_TO_NUMA == bind) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - dist to numa",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else if (OPAL_BIND_TO_NUMA < bind) {
            /* bind every proc downwards */
            force_down = true;
            goto execute;
        }
        /* if the binding policy is less than numa, then we are unbound - so
         * just ignore this and return (should have been caught in prior
         * tests anyway as only options meeting that criteria are "none"
         * and "board")
         */
        return rc;
    }

    /* now deal with the remaining binding policies based on hardware */
    if (bind == map) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps: bindings for job %s - bind in place",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    /* we need to handle the remaining binding options on a per-node
     * basis because different nodes could potentially have different
     * topologies, with different relative depths for the two levels
     */
 execute:
    /* initialize */

    for (i=0; i < jdata->map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(jdata->map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        if (force_down) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        } else {
            /* determine the relative depth on this node */
            if (HWLOC_OBJ_CACHE == hwb) {
                /* must use a unique function because blasted hwloc
                 * just doesn't deal with caches very well...sigh
                 */
                bind_depth = hwloc_get_cache_type_depth(node->topology, clvl, (hwloc_obj_cache_type_t)-1);
            } else {
                bind_depth = hwloc_get_type_depth(node->topology, hwb);
            }
            if (0 > bind_depth) {
                /* didn't find such an object */
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
                               true, hwloc_obj_type_string(hwb), node->name);
                return ORTE_ERR_SILENT;
            }
            if (HWLOC_OBJ_CACHE == hwm) {
                /* must use a unique function because blasted hwloc
                 * just doesn't deal with caches very well...sigh
                 */
                map_depth = hwloc_get_cache_type_depth(node->topology, clvm, (hwloc_obj_cache_type_t)-1);
            } else {
                map_depth = hwloc_get_type_depth(node->topology, hwm);
            }
            if (0 > map_depth) {
                /* didn't find such an object */
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
                               true, hwloc_obj_type_string(hwm), node->name);
                return ORTE_ERR_SILENT;
            }
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind_depth: %d map_depth %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                bind_depth, map_depth);
            if (bind_depth > map_depth) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            } else {
                if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}
int orte_rmaps_base_map_bynode(orte_job_t *jdata, orte_app_context_t *app,
                               opal_list_t *node_list, orte_vpid_t num_procs,
                               opal_list_item_t *cur_node_item)
{
    int rc = ORTE_SUCCESS;
    opal_list_item_t *next;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t num_alloc=0;
    orte_vpid_t start;
    
    /* This loop continues until all procs have been mapped or we run
     out of resources. We determine that we have "run out of
     resources" when all nodes have slots_max processes mapped to them,
     thus there are no free slots for a process to be mapped, or we have
     hit the soft limit on all nodes and are in a "no oversubscribe" state.
     If we still have processes that haven't been mapped yet, then it's an 
     "out of resources" error.
     
     In this scenario, we rely on the claim_slot function to handle the
     oversubscribed case. The claim_slot function will leave a node on the
     list until it either reaches slots_max OR reaches the
     soft limit and the "no_oversubscribe" flag has been set - at which point,
     the node will be removed to prevent any more processes from being mapped to
     it. Since we are taking one slot from each node as we cycle through, the
     list, oversubscription is automatically taken care of via this logic.
     */
    
    start = jdata->num_procs;
    
    while (num_alloc < num_procs) {
        /** see if any nodes remain unused and available. We need to do this check
         * each time since we may remove nodes from the list (as they become fully
         * used) as we cycle through the loop */
        if(0 >= opal_list_get_size(node_list) ) {
            /* No more nodes to allocate :( */
            orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:alloc-error",
                           true, num_procs, app->app);
            return ORTE_ERR_SILENT;
        }
        
        /* Save the next node we can use before claiming slots, since
         * we may need to prune the nodes list removing overused nodes.
         * Wrap around to beginning if we are at the end of the list */
        if (opal_list_get_end(node_list) == opal_list_get_next(cur_node_item)) {
            next = opal_list_get_first(node_list);
        }
        else {
            next = opal_list_get_next(cur_node_item);
        }
        
        /* Allocate a slot on this node */
        node = (orte_node_t*) cur_node_item;
        proc = NULL;
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node, jdata->map->cpus_per_rank, app->idx,
                                                             node_list, jdata->map->oversubscribe, true, &proc))) {
            /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this
             * really isn't an error - we just need to break from the loop
             * since the node is fully used up. For now, just don't report
             * an error
             */
            if (ORTE_ERR_NODE_FULLY_USED != rc) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
        
        /* assign the vpid */
        proc->name.vpid = start++;
        
        /* Update the number of procs allocated */
        ++num_alloc;
        
        cur_node_item = next;
    }
    
    /* save the bookmark */
    jdata->bookmark = (orte_node_t*)cur_node_item;

    return ORTE_SUCCESS;
}
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char **hosts = NULL;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }
    
    /* Start by getting a unique name */
    lsf_set_name();
    
    /* if I am a daemon, complete my setup using the
     * default procedure
     */
    if (ORTE_PROC_IS_DAEMON) {
        if (NULL != orte_node_regex) {
            /* extract the nodes */
            if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) {
                error = "orte_regex_extract_node_names";
                goto error;
            }
        }
        if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_orted_setup";
            goto error;
        }
        opal_argv_free(hosts);
        return ORTE_SUCCESS;
    }
    
    if (ORTE_PROC_IS_TOOL) {
        /* otherwise, if I am a tool proc, use that procedure */
        if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_tool_setup";
            goto error;
        }
        /* as a tool, I don't need a nidmap - so just return now */
        return ORTE_SUCCESS;
        
    }
    
    /* otherwise, I must be an application process - use
     * the default procedure to finish my setup
     */
    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_app_setup";
        goto error;
    }
    
    /* setup the nidmap arrays */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }
    
    return ORTE_SUCCESS;
    
error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
Beispiel #18
0
int mca_pml_dr_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    opal_bitmap_t reachable;
    int rc;
    size_t i;
    opal_list_item_t *item;

    if(nprocs == 0)
        return OMPI_SUCCESS;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    for (i = 0 ; i < nprocs ; ++i) {
        if (procs[i]->proc_arch != ompi_proc_local()->proc_arch) {
            return OMPI_ERR_NOT_SUPPORTED;
        }
    }
#endif

    /* make sure remote procs are using the same PML as us */
    if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("dr",
                                                              procs,
                                                              nprocs))) {
        return rc;
    }

    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, (int)nprocs);
    if(OMPI_SUCCESS != rc)
        return rc;

    /* initialize bml endpoint data */
    rc = mca_bml.bml_add_procs(
                               nprocs,
                               procs,
                               &reachable
                               );
    if(OMPI_SUCCESS != rc)
        return rc;

    /* Check that values supplied by all initialized btls will work
       for us.  Note that this is the list of all initialized BTLs,
       not the ones used for the just added procs.  This is a little
       overkill and inaccurate, as we may end up not using the BTL in
       question and all add_procs calls after the first one are
       duplicating an already completed check.  But the final
       initialization of the PML occurs before the final
       initialization of the BTLs, and iterating through the in-use
       BTLs requires iterating over the procs, as the BML does not
       expose all currently in use btls. */

    for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ;
         item != opal_list_get_end(&mca_btl_base_modules_initialized) ;
         item = opal_list_get_next(item)) {
        mca_btl_base_selected_module_t *sm = 
            (mca_btl_base_selected_module_t*) item;
        if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_dr_hdr_t)) {
	    orte_show_help("help-mpi-pml-dr.txt", "eager_limit_too_small",
			   true, 
			   sm->btl_component->btl_version.mca_component_name,
			   orte_process_info.nodename,
			   sm->btl_component->btl_version.mca_component_name,
			   sm->btl_module->btl_eager_limit,
			   sm->btl_component->btl_version.mca_component_name,
			   sizeof(mca_pml_dr_hdr_t),
			   sm->btl_component->btl_version.mca_component_name);
            rc = OMPI_ERR_BAD_PARAM;
            return rc;
        }
    }

    /* register recv handler */
    rc = mca_bml.bml_register(
                              MCA_BTL_TAG_PML,
                              mca_pml_dr_recv_frag_callback,
                              NULL);

    if(OMPI_SUCCESS != rc)
        return rc;

    /* register error handlers */
    rc = mca_bml.bml_register_error(mca_pml_dr_error_handler);
    
    if(OMPI_SUCCESS != rc)
        return rc;
 
    ompi_free_list_init_new(
                        &mca_pml_dr.buffers,
                        sizeof(mca_pml_dr_buffer_t) + mca_pml_dr.eager_limit,
                        opal_cache_line_size,
                        OBJ_CLASS(mca_pml_dr_buffer_t),
                        0,opal_cache_line_size,
                        0,
                        mca_pml_dr.free_list_max,
                        mca_pml_dr.free_list_inc,
                        NULL);

    /* initialize pml endpoint data */
    for (i = 0 ; i < nprocs ; ++i) {
        int idx;
        mca_pml_dr_endpoint_t *endpoint;

        endpoint = OBJ_NEW(mca_pml_dr_endpoint_t);
        endpoint->proc_ompi = procs[i];
        procs[i]->proc_pml = (struct mca_pml_endpoint_t*) endpoint;
        MCA_PML_DR_DEBUG(10, (0, "%s:%d: adding endpoint %p to proc_pml %p\n", 
                              __FILE__, __LINE__, (void*)endpoint, (void*)procs[i]));
        
        /* this won't work for comm spawn and other dynamic
           processes, but will work for initial job start */
        idx = opal_pointer_array_add(&mca_pml_dr.endpoints, (void*) endpoint);
        if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                           ORTE_PROC_MY_NAME,
                           &(endpoint->proc_ompi->proc_name))) {
            mca_pml_dr.my_rank = idx;
        }
        endpoint->local = endpoint->dst = idx;
        MCA_PML_DR_DEBUG(10, (0, "%s:%d: setting endpoint->dst to %d\n", 
                              __FILE__, __LINE__, idx));
        
        endpoint->bml_endpoint = procs[i]->proc_bml;
    }
    
    for(i = 0; i < nprocs; i++) { 
        mca_pml_dr_endpoint_t* ep =  (mca_pml_dr_endpoint_t*) 
            opal_pointer_array_get_item(&mca_pml_dr.endpoints, i);
            ep->src = mca_pml_dr.my_rank;
    }
    return rc;
}
Beispiel #19
0
int mca_spml_ikrit_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t my_ep_info;
    size_t mxm_addr_len = MXM_MAX_ADDR_LEN;
    mxm_error_t err;
    size_t i, n;
    int rc = OSHMEM_ERROR;
    ompi_proc_t *proc_self;
    int my_rank = oshmem_my_proc_id();

    OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t);
    /* Allocate connection requests */
    ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
    if (NULL == ep_info) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

    if (mca_spml_ikrit.hw_rdma_channel) {
        ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
        if (NULL == ep_hw_rdma_info) {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            goto bail;
        }
    }

    mca_spml_ikrit.mxm_peers = (mxm_peer_t *) calloc(nprocs , sizeof(mxm_peer_t));
    if (NULL == mca_spml_ikrit.mxm_peers) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

    memset(&my_ep_info, 0, sizeof(my_ep_info));

    if (mca_spml_ikrit.hw_rdma_channel) {
        err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len);
        if (MXM_OK != err) {
            orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true,
                    mxm_error_string(err));
            rc = OSHMEM_ERROR;
            goto bail;
        }
        oshmem_shmem_allgather(&my_ep_info, ep_hw_rdma_info,
                sizeof(spml_ikrit_mxm_ep_conn_info_t));
    }
    err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep, &my_ep_info.addr.ep_addr, &mxm_addr_len);
    if (MXM_OK != err) {
        orte_show_help("help-oshmem-spml-ikrit.txt", "unable to get endpoint address", true,
                mxm_error_string(err));
        rc = OSHMEM_ERROR;
        goto bail;
    }

    oshmem_shmem_allgather(&my_ep_info, ep_info,
                           sizeof(spml_ikrit_mxm_ep_conn_info_t));

    opal_progress_register(spml_ikrit_progress);

    /* Get the EP connection requests for all the processes from modex */
    for (n = 0; n < nprocs; ++n) {

        /* mxm 2.0 keeps its connections on a list. Make sure
         * that list have different order on every rank */
        i = (my_rank + n) % nprocs;
        mxm_peer_construct(&mca_spml_ikrit.mxm_peers[i]);

        err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_conn);
        if (MXM_OK != err) {
            SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
            goto bail;
        }
        mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i].mxm_conn, &mca_spml_ikrit.mxm_peers[i]);
        if (mca_spml_ikrit.hw_rdma_channel) {
            err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep, ep_hw_rdma_info[i].addr.ep_addr, &mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn);
            if (MXM_OK != err) {
                SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
                goto bail;
            }
        } else {
            mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn = mca_spml_ikrit.mxm_peers[i].mxm_conn;
        }
    }

    if (ep_info)
        free(ep_info);
    if (ep_hw_rdma_info)
        free(ep_hw_rdma_info);

    if (mca_spml_ikrit.bulk_connect) {
        /* Need a barrier to ensure remote peers already created connection */
        oshmem_shmem_barrier();
        mxm_ep_wireup(mca_spml_ikrit.mxm_ep);
    }

    proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank);
    /* identify local processes and change transport to SHM */
    for (i = 0; i < nprocs; i++) {
        if (procs[i]->super.proc_name.jobid != proc_self->super.proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[i]->super.proc_flags)) {
            continue;
        }
        if (procs[i] == proc_self)
            continue;

        /* use zcopy for put/get via sysv shared memory with fallback to RDMA */
        mca_spml_ikrit.mxm_peers[i].ptl_id = MXM_PTL_SHM;
    }

    SPML_VERBOSE(50, "*** ADDED PROCS ***");
    return OSHMEM_SUCCESS;

bail:
	if (ep_info)
		free(ep_info);
	if (ep_hw_rdma_info)
		free(ep_hw_rdma_info);
    SPML_ERROR("add procs FAILED rc=%d", rc);

    return rc;

}
int orte_ess_base_orted_setup(char **hosts)
{
    int ret = ORTE_ERROR;
    int fd;
    char log_file[PATH_MAX];
    char *jobidstring;
    char *error = NULL;
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_node_t *node;
    char *param;

    plm_in_use = false;

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves. 
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    
    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    
    signals_set = true;
    
#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;
        
        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                error = "topology discovery";
                goto error;
            }
        }
        
        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }
        
        if (4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }
    }
#endif
    
    /* setup the global nidmap/pidmap object */
    orte_nidmap.bytes = NULL;
    orte_nidmap.size = 0;
    orte_pidmap.bytes = NULL;
    orte_pidmap.size = 0;

    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }
    
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    
    /* some environments allow remote launches - e.g., ssh - so
     * open and select something -only- if we are given
     * a specific module to use
     */
    (void) mca_base_var_env_name("plm", &param);
    
    plm_in_use = !!(getenv(param));
    free (param);

    if (plm_in_use)  {

        if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_open";
            goto error;
        }
        if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_select";
            goto error;
        }
    }
    
    /* Setup the communication infrastructure */
        if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }
    
    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = opal_db_base_select(true))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_select";
        goto error;
    }
    /* set our id */
    opal_db.set_id((opal_identifier_t*)ORTE_PROC_MY_NAME);

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    
    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    
    /* initialize the nidmaps */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }
#if ORTE_ENABLE_STATIC_PORTS
    /* if we are using static ports, then we need to setup
     * the daemon info so the RML can function properly
     * without requiring a wireup stage. This must be done
     * after we enable_comm as that function determines our
     * own port, which we need in order to construct the nidmap
     */
    if (orte_static_ports) {
        /* define the routing tree so we know the pattern
         * if we are trying to setup common or static ports
         */
        orte_routed.update_routing_plan();

        /* extract the node info from the environment and
         * build a nidmap from it
         */
        if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "construct daemon map from static ports";
            goto error;
        }
    }
#endif
    /* be sure to update the routing tree so the initial "phone home"
     * to mpirun goes through the tree if static ports were enabled - still
     * need to do it anyway just to initialize things
     */
    orte_routed.update_routing_plan();
    
    /* Now provide a chance for the PLM
     * to perform any module-specific init functions. This
     * needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv
     * Do this only if a specific PLM was given to us - the
     * orted has no need of the proxy PLM at all
     */
    if (plm_in_use) {
        if (ORTE_SUCCESS != (ret = orte_plm.init())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_init";
            goto error;
        }
    }
    
    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        
        /* take a pass thru the session directory code to fillin the
         * tmpdir names - don't create anything yet
         */
        if (ORTE_SUCCESS != (ret = orte_session_dir(false,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir define";
            goto error;
        }
        /* clear the session directory just in case there are
         * stale directories laying around
         */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

        /* now actually create the directory tree */
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* Once the session directory location has been established, set
           the opal_output env file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        
        /* setup stdout/stderr */
        if (orte_debug_daemons_file_flag) {
            /* if we are debugging to a file, then send stdout/stderr to
             * the orted log file
             */
            
            /* get my jobid */
            if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
                                                                         ORTE_PROC_MY_NAME->jobid))) {
                ORTE_ERROR_LOG(ret);
                error = "convert_jobid";
                goto error;
            }
            
            /* define a log file name in the session directory */
            snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
                     jobidstring, orte_process_info.nodename);
            log_path = opal_os_path(false,
                                    orte_process_info.tmpdir_base,
                                    orte_process_info.top_session_dir,
                                    log_file,
                                    NULL);
            
            fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
            if (fd < 0) {
                /* couldn't open the file for some reason, so
                 * just connect everything to /dev/null
                 */
                fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
            } else {
                dup2(fd, STDOUT_FILENO);
                dup2(fd, STDERR_FILENO);
                if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
                    close(fd);
                }
            }
        }
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* Setup the job data object for the daemons */        
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    
    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;
    
    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
#if OPAL_HAVE_HWLOC
    /* point our topology to the one detected locally */
    node->topology = opal_hwloc_topology;
#endif

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    
    proc->pid = orte_process_info.pid;
    proc->rml_uri = orte_rml.get_contact_info();
    proc->state = ORTE_PROC_STATE_RUNNING;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
    
    /* record that the daemon (i.e., us) is on this node 
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object
     */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    node->daemon_launched = true;
    node->state = ORTE_NODE_STATE_UP;
    
    /* now point our proc node field to the node */
    OBJ_RETAIN(node);   /* keep accounting straight */
    proc->node = node;

    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;
    
    /* setup the routed info - the selected routed component
     * will know what to do. 
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
    
    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }
    
#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(!ORTE_PROC_IS_HNP, ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_select";
        goto error;
    }
    
    /* For daemons, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif
    
    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
Beispiel #21
0
static void activate(void)
{
    int rc;
    DIR *dirp;

    /* take control */
    ORTE_ACQUIRE_THREAD(&orcm_cfgi_base.ctl);

    if (enabled) {
        /* we get reentered when daemons reappear so that
         * any pending jobs can be started
         */
        check_installed(true);
        /* release control */
        ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
        return;
    }
    enabled = true;

    /* check for existence of the directory. If it doesn't yet
     * exist, then we have to use the timer until it shows up
     */
    if (NULL == (dirp = opendir(mca_orcm_cfgi_file_component.dir))) {
        if (0 < opal_output_get_verbosity(orcm_cfgi_base.output)) {
            orte_show_help("help-cfgi-file.txt", "no-dir",
                           true, mca_orcm_cfgi_file_component.dir);
        }
        timer_in_use = true;
        goto fallback;
    }

#ifdef HAVE_SYS_INOTIFY_H
    /* setup to watch the config dir - CREATE always is followed by
     * a MODIFY event, so don't need both
     */
    if (0 > (watch = inotify_add_watch(notifier, mca_orcm_cfgi_file_component.dir,
                                           IN_DELETE | IN_MODIFY | IN_MOVE))) {
        /* error */
        close(notifier);
        goto fallback;
    }
    /* start the watcher event */
    probe_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
    opal_event_set(opal_event_base, probe_ev, notifier,
                   OPAL_EV_READ|OPAL_EV_PERSIST, inotify_handler, NULL);
    timer_in_use = false;
    ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
    /* process it the first time */
    check_config(0, 0, NULL);
    return;
#endif

 fallback:

    /* setup the probe timer */
    if (0 <  mca_orcm_cfgi_file_component.rate) {
        probe_time.tv_sec = mca_orcm_cfgi_file_component.rate;
        probe_time.tv_usec = 0;
        probe_ev = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_evtimer_set(opal_event_base, probe_ev, check_config, NULL);
        timer_in_use = true;
        /* process it the first time */
        ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
        check_config(0, 0, NULL);
        return;
    }

    opal_output(0, "%s CANNOT ACTIVATE INSTALL CONFIG MONITORING",
                   ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    enabled = false;
    ORTE_RELEASE_THREAD(&orcm_cfgi_base.ctl);
}
Beispiel #22
0
static int bind_upwards(orte_job_t *jdata,
                        orte_node_t *node,
                        hwloc_obj_type_t target,
                        unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node upwards
     * until we find an object of type target - and then bind
     * the process to that target
     */
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t obj;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale;
    char *cpu_bitmap;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind upwards for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;


    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* bozo check */
        if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
            return ORTE_ERR_SILENT;
        }
        /* starting at the locale, move up thru the parents
         * to find the target object type
         */
        cpu_bitmap = NULL;
        for (obj = locale->parent; NULL != obj; obj = obj->parent) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind:upward target %s type %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                hwloc_obj_type_string(target),
                                hwloc_obj_type_string(obj->type));
            if (target == obj->type) {
                if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
                    continue;
                }
                /* get its index */
                if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) {
                    ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                    return ORTE_ERR_SILENT;
                }
                /* track the number bound */
                data = (opal_hwloc_obj_data_t*)obj->userdata;
                data->num_bound++;
                /* get the number of cpus under this location */
                if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                    return ORTE_ERR_SILENT;
                }
                /* error out if adding a proc would cause overload and that wasn't allowed,
                 * and it wasn't a default binding policy (i.e., the user requested it)
                 */
                if (ncpus < data->num_bound &&
                    !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                    if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                        /* if the user specified a binding policy, then we cannot meet
                         * it since overload isn't allowed, so error out - have the
                         * message indicate that setting overload allowed will remove
                         * this restriction */
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                       opal_hwloc_base_print_binding(map->binding), node->name,
                                       data->num_bound, ncpus);
                        return ORTE_ERR_SILENT;
                    } else {
                        /* if we have the default binding policy, then just don't bind */
                        OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                        unbind_procs(jdata);
                        return ORTE_SUCCESS;
                    }
                }
                /* bind it here */
                cpus = opal_hwloc_base_get_available_cpus(node->topology, obj);
                hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
                orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
                /* record the location */
                orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proc->name),
                                    cpu_bitmap,
                                    hwloc_obj_type_string(target),
                                    idx, node->name);
                break;
            }
        }
        if (NULL == cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
            /* didn't find anyone to bind to - this is an error
             * unless the user specified if-supported
             */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
                           opal_hwloc_base_print_binding(map->binding), node->name);
            return ORTE_ERR_SILENT;
        }
        if (NULL != cpu_bitmap) {
            free(cpu_bitmap);
        }
    }

    return ORTE_SUCCESS;
}
Beispiel #23
0
static int parse_file(char *filename,
                      opal_pointer_array_t *apps)
{
    char *cptr, *executable, *version, *argv, *tmp, *binary;
    int token, rc=ORCM_SUCCESS;
    int i, ival;
    orcm_cfgi_app_t *app=NULL, *aptr, *curapp;
    orcm_cfgi_exec_t *exec=NULL, *eptr;
    orcm_cfgi_version_t *vers=NULL, *vptr;
    bool found;
    struct stat buf;

    orcm_cfgi_file_in = fopen(filename, "r");
    if (NULL == orcm_cfgi_file_in) {
        opal_output(0, "%s UNABLE TO OPEN FILE %s",
                       ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), filename);
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }

    OPAL_OUTPUT_VERBOSE((2, orcm_cfgi_base.output,
                         "%s PARSING CONFIGURATION FILE %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), filename));

    while (!orcm_cfgi_file_done) {
        token = orcm_cfgi_file_lex();

        switch (token) {
        case ORCM_CFGI_FILE_DONE:
            orcm_cfgi_file_done = true;
            if (NULL != app) {
                if (1 < opal_output_get_verbosity(orcm_cfgi_base.output)) {
                    opal_output(0, "READ INSTALL CONFIG:");
                    orcm_cfgi_base_dump(NULL, NULL, app, ORCM_CFGI_APP);
                }
                /* if the app hasn't been added yet */
                if (app->idx < 0) {
                    /* see if we already have a definition for this app */
                    found = false;
                    for (i=0; i < apps->size; i++) {
                        if (NULL == (aptr = (orcm_cfgi_app_t*)opal_pointer_array_get_item(apps, i))) {
                            continue;
                        }
                        if (0 == strcmp(app->application, aptr->application)) {
                            /* found app */
                            found = true;
                            break;
                        }
                    }
                    if (!found) {
                        /* this is a new app - check for validity */
                        if (!orcm_cfgi_app_definition_valid(app)) {
                            opal_output(0, "%s APP %s NOT VALID",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        (NULL == app->application) ? "NULL" : app->application);
                            rc = ORTE_ERR_BAD_PARAM;
                            OBJ_RELEASE(app);
                            goto depart;
                        }
                        /* store it before leaving */
                        app->idx = opal_pointer_array_add(apps, app);
                    }
                }
            }
            break;

        case ORCM_CFGI_FILE_NEWLINE:
            break;

        case ORCM_CFGI_FILE_APPLICATION:
            if (NULL == (cptr = parse_string())) {
                orte_show_help("help-cfgi-file.txt", "no-app-name", true);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            if (NULL == app) {
                /* start a new app - create record for it */
                app = OBJ_NEW(orcm_cfgi_app_t);
                app->application = strdup(cptr);
                exec = NULL;  /* start the exec over */
                vers = NULL;
            } else {
                /* are we already working this app */
                if (0 != strcmp(cptr, app->application)) {
                    /* this starts a different app */
                    if (1 < opal_output_get_verbosity(orcm_cfgi_base.output)) {
                        opal_output(0, "READ INSTALL CONFIG:");
                        orcm_cfgi_base_dump(NULL, NULL, app, ORCM_CFGI_APP);
                    }
                    /* check for validity */
                    if (!orcm_cfgi_app_definition_valid(app)) {
                        opal_output(0, "%s APP %s NOT VALID",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    (NULL == app->application) ? "NULL" : app->application);
                        rc = ORTE_ERR_BAD_PARAM;
                        OBJ_RELEASE(app);
                        free(cptr);
                        goto depart;
                    }
                    if (app->idx < 0) {
                        /* hasn't been added yet, so do so
                         * before we start on the new one
                         */
                        app->idx = opal_pointer_array_add(apps, app);
                    }
                    /* see if we already have a definition for this app */
                    found = false;
                    for (i=0; i < orcm_cfgi_base.installed_apps.size; i++) {
                        if (NULL == (aptr = (orcm_cfgi_app_t*)opal_pointer_array_get_item(apps, i))) {
                            continue;
                        }
                        if (0 == strcmp(cptr, aptr->application)) {
                            /* found app */
                            app = aptr;
                            found = true;
                            break;
                        }
                    }
                    if (!found) {
                        /* start a new app - create record for it */
                        app = OBJ_NEW(orcm_cfgi_app_t);
                        app->application = strdup(cptr);
                    }
                }
                exec = NULL;  /* start the exec over */
                vers = NULL;
            }
            free(cptr);
            break;

        case ORCM_CFGI_FILE_MAX_INSTANCES:
            /* if we don't have an active app, that's an error */
            if (NULL == app) {
                /* cannot process this */
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            if ((app->max_instances = parse_int()) < 0) {
                orte_show_help("help-cfgi-file.txt", "bad-max-instances", true,
                               filename, orcm_cfgi_file_line, token, 
                               orcm_cfgi_file_value.sval);
                OBJ_RELEASE(app);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            break;

        case ORCM_CFGI_FILE_EXECUTABLE:
            /* if we don't have an active app, that's an error */
            if (NULL == app) {
                /* cannot process this */
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            if (NULL == (executable = parse_string())) {
                orte_show_help("help-cfgi-file.txt", "no-exec", true,
                               filename, orcm_cfgi_file_line, token,
                               orcm_cfgi_file_value.sval);
                OBJ_RELEASE(app);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            /* do I already have an entry for this executable in this app? */
            exec = NULL;
            for (i=0; i < app->executables.size; i++) {
                if (NULL == (eptr = (orcm_cfgi_exec_t*)opal_pointer_array_get_item(&app->executables, i))) {
                    continue;
                }
                if (0 == strcmp(eptr->appname, executable)) {
                    exec = eptr;
                    break;
                }
            }
            if (NULL == exec) {
                /* nope - add this executable */
                exec = OBJ_NEW(orcm_cfgi_exec_t);
                exec->appname = strdup(executable);
                exec->idx = opal_pointer_array_add(&app->executables, exec);
            }
            free(executable);
            break;

        case ORCM_CFGI_FILE_VERSION:
            /* if we don't have an active job, that's an error */
            if (NULL == app) {
                /* cannot process this */
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            /* if we don't have an active exec, that's an error */
            if (NULL == exec) {
                /* cannot process this */
                OBJ_RELEASE(app);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            if (NULL == (version = parse_string())) {
                orte_show_help("help-cfgi-file.txt", "no-version", true,
                               filename, orcm_cfgi_file_line, token,
                               orcm_cfgi_file_value.sval);
                OBJ_RELEASE(app);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            /* do we already have a record of this version? */
            vers = NULL;
            for (i=0; i < exec->versions.size; i++) {
                if (NULL == (vptr = (orcm_cfgi_version_t*)opal_pointer_array_get_item(&exec->versions, i))) {
                    continue;
                }
                if (0 == strcmp(vptr->version, version)) {
                    vers = vptr;
                    break;
                }
            }
            if (NULL == vers) {
                /* nope - add it */
                vers = OBJ_NEW(orcm_cfgi_version_t);
                vers->exec = exec;
                vers->version = strdup(version);
                vers->idx = opal_pointer_array_add(&exec->versions, vers);
                /* get the modification time stamp for that binary */
                asprintf(&binary, "%s_%s", exec->appname, version);
                tmp = opal_path_findv(binary, X_OK, environ, NULL);
                free(binary);
                if (NULL == tmp) {
                    /* binary wasn't found - just leave it */
                    free(version);
                    break;
                }
                if (0 > stat(tmp, &buf)) {
                    /* cannot stat file */
                    OPAL_OUTPUT_VERBOSE((1, orcm_cfgi_base.output,
                                         "%s could not stat %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp));
                    free(tmp);
                    free(version);
                    break;
                }
                vers->mod_time = strdup(ctime(&buf.st_mtime));
                /* strip the ending cr */
                vers->mod_time[strlen(vers->mod_time)-1] = '\0';
                OPAL_OUTPUT_VERBOSE((1, orcm_cfgi_base.output,
                                     "%s BIN %s MODTIME %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     binary, vers->mod_time));
                free(tmp);
            }
            free(version);
            break;

        case ORCM_CFGI_FILE_ARGV:
            /* if we don't have an active job, that's an error */
            if (NULL == app) {
                /* cannot process this */
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            /* if we don't have an active exec, that's an error */
            if (NULL == exec) {
                /* cannot process this */
                OBJ_RELEASE(app);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            /* if we don't have an active version, that's an error */
            if (NULL == vers) {
                /* cannot process this */
                OBJ_RELEASE(app);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            if (NULL == (argv = parse_string())) {
                orte_show_help("help-cfgi-file.txt", "no-argv", true,
                               filename, orcm_cfgi_file_line, token,
                               orcm_cfgi_file_value.sval);
                OBJ_RELEASE(app);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            vers->argv = opal_argv_split(argv, ' ');
            free(argv);
            break;

        case ORCM_CFGI_FILE_PROCESS_LIMIT:
            /* if we don't have an active job, that's an error */
            if (NULL == app) {
                /* cannot process this */
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            /* if we don't have an active exec, that's an error */
            if (NULL == exec) {
                /* cannot process this */
                OBJ_RELEASE(app);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            /* set the process limit */
            if ((exec->process_limit = parse_int()) < 0) {
                orte_show_help("help-cfgi-file.txt", "bad-max-procs", true,
                               filename, orcm_cfgi_file_line, token, 
                               orcm_cfgi_file_value.sval);
                OBJ_RELEASE(app);
                rc = ORTE_ERR_BAD_PARAM;
                goto depart;
            }
            break;

        case ORCM_CFGI_FILE_STRING:
        case ORCM_CFGI_FILE_QUOTED_STRING:
            orte_show_help("help-cfgi-file.txt", "parse_error_string",
                           true,
                           filename, 
                           orcm_cfgi_file_line, 
                           token, 
                           orcm_cfgi_file_value.sval);
            break;

        case ORCM_CFGI_FILE_INT:
            orte_show_help("help-cfgi-file.txt", "parse_error_int",
                           true,
                           filename, 
                           orcm_cfgi_file_line, 
                           token, 
                           orcm_cfgi_file_value.ival);
            break;

        default:
            orte_show_help("help-cfgi-file.txt", "parse_error",
                           true,
                           filename, 
                           orcm_cfgi_file_line, 
                           token);
        }
    }

 depart:
    fclose(orcm_cfgi_file_in);
    orcm_cfgi_file_done = false;
    return rc;
}
Beispiel #24
0
static int bind_downwards(orte_job_t *jdata,
                          orte_node_t *node,
                          hwloc_obj_type_t target,
                          unsigned cache_level)
{
    int j;
    orte_job_map_t *map;
    orte_proc_t *proc;
    hwloc_obj_t trg_obj, nxt_obj;
    hwloc_cpuset_t cpus;
    unsigned int ncpus;
    opal_hwloc_obj_data_t *data;
    int total_cpus;
    hwloc_cpuset_t totalcpuset;
    hwloc_obj_t locale;
    char *cpu_bitmap;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind downward for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;
    totalcpuset = hwloc_bitmap_alloc();

    /* cycle thru the procs */
    for (j=0; j < node->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
            continue;
        }
        /* ignore procs from other jobs */
        if (proc->name.jobid != jdata->jobid) {
            continue;
        }
        /* bozo check */
        locale = NULL;
        if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR) ||
            NULL == locale) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
            hwloc_bitmap_free(totalcpuset);
            return ORTE_ERR_SILENT;
        }
        /* we don't know if the target is a direct child of this locale,
         * or if it is some depth below it, so we have to conduct a bit
         * of a search. Let hwloc find the min usage one for us.
         */
        trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology, locale,
                                                                  target, cache_level);
        if (NULL == trg_obj) {
            /* there aren't any such targets under this object */
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
            hwloc_bitmap_free(totalcpuset);
            return ORTE_ERR_SILENT;
        }
        /* record the location */
        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, trg_obj, OPAL_PTR);
        /* start with a clean slate */
        hwloc_bitmap_zero(totalcpuset);
        total_cpus = 0;
        nxt_obj = trg_obj;
        do {
            if (NULL == nxt_obj) {
                /* could not find enough cpus to meet request */
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            trg_obj = nxt_obj;
            /* get the number of cpus under this location */
            ncpus = opal_hwloc_base_get_npus(node->topology, trg_obj);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s GOT %d CPUS",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ncpus);
            /* track the number bound */
            if (NULL == (data = (opal_hwloc_obj_data_t*)trg_obj->userdata)) {
                data = OBJ_NEW(opal_hwloc_obj_data_t);
                trg_obj->userdata = data;
            }
            data->num_bound++;
            /* error out if adding a proc would cause overload and that wasn't allowed,
             * and it wasn't a default binding policy (i.e., the user requested it)
             */
            if (ncpus < data->num_bound &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    /* if the user specified a binding policy, then we cannot meet
                     * it since overload isn't allowed, so error out - have the
                     * message indicate that setting overload allowed will remove
                     * this restriction */
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                   opal_hwloc_base_print_binding(map->binding), node->name,
                                   data->num_bound, ncpus);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                } else {
                    /* if we have the default binding policy, then just don't bind */
                    OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                    unbind_procs(jdata);
                    hwloc_bitmap_zero(totalcpuset);
                    return ORTE_SUCCESS;
                }
            }
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, trg_obj);
            hwloc_bitmap_or(totalcpuset, totalcpuset, cpus);
            /* track total #cpus */
            total_cpus += ncpus;
            /* move to the next location, in case we need it */
            nxt_obj = trg_obj->next_cousin;
        } while (total_cpus < orte_rmaps_base.cpus_per_rank);
        hwloc_bitmap_list_asprintf(&cpu_bitmap, totalcpuset);
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "%s PROC %s BITMAP %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), cpu_bitmap);
        orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
        if (NULL != cpu_bitmap) {
            free(cpu_bitmap);
        }
        if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) {
            char tmp1[1024], tmp2[1024];
            if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1),
                                                               node->topology, totalcpuset)) {
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s PROC %s ON %s IS NOT BOUND",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name);
            } else {
                opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), node->topology, totalcpuset);
                opal_output(orte_rmaps_base_framework.framework_output,
                            "%s BOUND PROC %s[%s] TO %s: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&proc->name), node->name,
                            tmp1, tmp2);
            }
        }
    }
    hwloc_bitmap_free(totalcpuset);

    return ORTE_SUCCESS;
}
Beispiel #25
0
int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
                                       char **device, char *spec)
{
    char **ck, **ck2;
    orte_mapping_policy_t tmp;
    int i;
    size_t len;

    /* set defaults */
    tmp = 0;
    *device = NULL;

    if (NULL == spec) {
        ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
        ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SPAN);
        ORTE_UNSET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN);
    } else {
        ck = opal_argv_split(spec, ':');
        if (2 < opal_argv_count(ck)) {
            /* incorrect format */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", rmaps_base_mapping_policy);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        if (2 == opal_argv_count(ck)) {
            /* if the policy is "dist", then we set the policy to that value
             * and save the second argument as the device
             */
#if OPAL_HAVE_HWLOC
            if (0 == strncasecmp(ck[0], "dist", strlen(ck[0]))) {
                ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYDIST);
                ck2 = opal_argv_split(ck[1], ',');
                if (ck2[0] != NULL) {
                    *device = strdup(ck2[0]);
                    for (i=1; NULL != ck2[i]; i++) {
                        if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) {
                            ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SPAN);
                        } 
                    }
                }
                opal_argv_free(ck2);
                goto setpolicy;
            }
#endif
            ck2 = opal_argv_split(ck[1], ',');
            for (i=0; NULL != ck2[i]; i++) {
                if (0 == strncasecmp(ck2[i], "span", strlen(ck2[i]))) {
                    ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SPAN);
                } else if (0 == strncasecmp(ck2[i], "oversubscribe", strlen(ck2[i]))) {
                    if (ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(tmp)) {
                        ORTE_UNSET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_NO_OVERSUBSCRIBE);
                        ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SUBSCRIBE_GIVEN);
                    } else if (0 == strncasecmp(ck2[i], "nooversubscribe", strlen(ck2[i]))) {
                        ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_NO_OVERSUBSCRIBE);
                        ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_SUBSCRIBE_GIVEN);
                    } else {
                        /* unrecognized modifier */
                        orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, "mapping", ck2[i]);
                        opal_argv_free(ck);
                        opal_argv_free(ck2);
                        return ORTE_ERR_SILENT;
                    }
                }
                opal_argv_free(ck2);
            }
        }
        len = strlen(ck[0]);
        if (0 == strncasecmp(ck[0], "slot", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSLOT);
        } else if (0 == strncasecmp(ck[0], "node", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYNODE);
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[0], "core", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYCORE);
        } else if (0 == strncasecmp(ck[0], "l1cache", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL1CACHE);
        } else if (0 == strncasecmp(ck[0], "l2cache", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL2CACHE);
        } else if (0 == strncasecmp(ck[0], "l3cache", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL3CACHE);
        } else if (0 == strncasecmp(ck[0], "socket", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
        } else if (0 == strncasecmp(ck[0], "numa", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYNUMA);
        } else if (0 == strncasecmp(ck[0], "board", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYBOARD);
        } else if (0 == strncasecmp(ck[0], "hwthread", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYHWTHREAD);
            /* if we are mapping processes to individual hwthreads, then
             * we need to treat those hwthreads as separate cpus
             */
            opal_hwloc_use_hwthreads_as_cpus = true;
#endif
        } else {
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true, "mapping", rmaps_base_mapping_policy);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
        ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN);
    }

#if OPAL_HAVE_HWLOC
 setpolicy:
#endif
    *policy = tmp;

    return ORTE_SUCCESS;
}
Beispiel #26
0
static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale, sib;
    char *cpu_bitmap;
    bool found;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* bozo check */
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
                return ORTE_ERR_SILENT;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, locale, OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                return ORTE_ERR_SILENT;
            }
            data = (opal_hwloc_obj_data_t*)locale->userdata;
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* if we don't have enough cpus to support this additional proc, try
             * shifting the location to a cousin that can support it - the important
             * thing is that we maintain the same level in the topology */
            if (ncpus < (data->num_bound+1)) {
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s bind_in_place: searching right",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                sib = locale;
                found = false;
                while (NULL != (sib = sib->next_cousin)) {
                    data = (opal_hwloc_obj_data_t*)sib->userdata;
                    ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                    if (data->num_bound < ncpus) {
                        found = true;
                        locale = sib;
                        break;
                    }
                }
                if (!found) {
                    /* try the other direction */
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s bind_in_place: searching left",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    sib = locale;
                    while (NULL != (sib = sib->prev_cousin)) {
                        data = (opal_hwloc_obj_data_t*)sib->userdata;
                        ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                        if (data->num_bound < ncpus) {
                            found = true;
                            locale = sib;
                            break;
                        }
                    }
                }
                if (!found) {
                    /* no place to put this - see if overload is allowed */
                    if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                            /* if the user specified a binding policy, then we cannot meet
                             * it since overload isn't allowed, so error out - have the
                             * message indicate that setting overload allowed will remove
                             * this restriction */
                            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                           opal_hwloc_base_print_binding(map->binding), node->name,
                                           data->num_bound, ncpus);
                            return ORTE_ERR_SILENT;
                        } else {
                            /* if we have the default binding policy, then just don't bind */
                            OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                            unbind_procs(jdata);
                            return ORTE_SUCCESS;
                        }
                    }
                }
            }
            /* track the number bound */
            data = (opal_hwloc_obj_data_t*)locale->userdata;  // just in case it changed
            data->num_bound++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(locale->type), idx);
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
            orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
            /* update the location, in case it changed */
            orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, locale, OPAL_PTR);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                cpu_bitmap, hwloc_obj_type_string(locale->type),
                                idx, node->name);
            if (NULL != cpu_bitmap) {
                free(cpu_bitmap);
            }
        }
    }

    return ORTE_SUCCESS;
}
static mca_mtl_base_module_t*
ompi_mtl_psm_component_init(bool enable_progress_threads,
                           bool enable_mpi_threads)
{
    psm_error_t	err;
    int rc;
    int	verno_major = PSM_VERNO_MAJOR;
    int verno_minor = PSM_VERNO_MINOR;
    ompi_proc_t *my_proc, **procs;
    size_t num_total_procs, proc;
    int local_rank = -1, num_local_procs = 0;
    
    /* Compute the total number of processes on this host and our local rank
     * on that node. We need to provide PSM with these values so it can 
     * allocate hardware contexts appropriately across processes.
     */
    if ((rc = ompi_proc_refresh()) != OMPI_SUCCESS) {
      return NULL;
    }
    
    my_proc = ompi_proc_local();
    if (NULL == (procs = ompi_proc_world(&num_total_procs))) {
      return NULL;
    }
    
    for (proc = 0; proc < num_total_procs; proc++) {
      if (my_proc == procs[proc]) {
	local_rank = num_local_procs++;
	continue;
      }
      
      if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
	num_local_procs++;
      }
    }
    
    assert(local_rank >= 0 && num_local_procs > 0);
    free(procs);
    
    err = psm_error_register_handler(NULL /* no ep */,
			             PSM_ERRHANDLER_NOP);
    if (err) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n", 
		    psm_error_get_string(err));
	return NULL;
    }
    
#if PSM_VERNO >= 0x010c
    /* Set infinipath debug level */
    err = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG, 
		     (const void*) &ompi_mtl_psm.debug_level, 
		     sizeof(unsigned));
    if (err) {
      /* Non fatal error. Can continue */
      orte_show_help("help-mtl-psm.txt",
		     "psm init", false,
		     psm_error_get_string(err));
    }
#endif
    
    /* Only allow for shm and ipath devices in 2.0 and earlier releases 
     * (unless the user overrides the setting).
     */
    
    if (PSM_VERNO >= 0x0104) {
      setenv("PSM_DEVICES", "self,shm,ipath", 0);
    }
    else {
      setenv("PSM_DEVICES", "shm,ipath", 0);
    }
    
    err = psm_init(&verno_major, &verno_minor);
    if (err) {
      orte_show_help("help-mtl-psm.txt",
		     "psm init", true,
		     psm_error_get_string(err));
      return NULL;
    }
    
    /* Complete PSM initialization */
    ompi_mtl_psm_module_init(local_rank, num_local_procs);

    ompi_mtl_psm.super.mtl_request_size = 
      sizeof(mca_mtl_psm_request_t) - 
      sizeof(struct mca_mtl_request_t);
    
    return &ompi_mtl_psm.super;
}
Beispiel #28
0
static int bind_to_cpuset(orte_job_t *jdata)
{
    /* bind each process to opal_hwloc_base_cpu_set */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    struct hwloc_topology_support *support;
    opal_hwloc_topo_data_t *sum;
    hwloc_obj_t root;
    char *cpu_bitmap;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind job %s to cpuset %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_cpu_set);
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }
        root = hwloc_get_root_obj(node->topology);
        if (NULL == root->userdata) {
            /* something went wrong */
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        sum = (opal_hwloc_topo_data_t*)root->userdata;
        if (NULL == sum->available) {
            /* another error */
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            hwloc_bitmap_list_asprintf(&cpu_bitmap, sum->available);
            orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
            if (NULL != cpu_bitmap) {
                free(cpu_bitmap);
            }
        }
    }
    return ORTE_SUCCESS;
}
/**
 *  Discover available (pre-allocated) nodes. Allocate the
 *  requested number of nodes/process slots to the job.
 *  
 */
static int orte_ras_gridengine_allocate(opal_list_t *nodelist)
{
    char *pe_hostfile = getenv("PE_HOSTFILE");
    char *job_id = getenv("JOB_ID");
    char buf[1024], *tok, *num, *queue, *arch, *ptr;
    int rc;
    FILE *fp;
    orte_node_t *node;
    opal_list_item_t *item;
    bool found;

    /* show the Grid Engine's JOB_ID */
    if (mca_ras_gridengine_component.show_jobid ||
        mca_ras_gridengine_component.verbose != -1) {
        opal_output(0, "ras:gridengine: JOB_ID: %s", job_id);
    }
   
    /* check the PE_HOSTFILE before continuing on */
    if (!(fp = fopen(pe_hostfile, "r"))) {
        orte_show_help("help-ras-gridengine.txt", "cannot-read-pe-hostfile",
            true, pe_hostfile, strerror(errno));
        rc = ORTE_ERROR;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* parse the pe_hostfile for hostname, slots, etc, then compare the
     * current node with a list of hosts in the nodelist, if the current
     * node is not found in nodelist, add it in */
    opal_output(mca_ras_gridengine_component.verbose, 
		"ras:gridengine: PE_HOSTFILE: %s", pe_hostfile);

    while (fgets(buf, sizeof(buf), fp)) {
        ptr = strtok_r(buf, " \n", &tok);
        num = strtok_r(NULL, " \n", &tok);
        queue = strtok_r(NULL, " \n", &tok);
        arch = strtok_r(NULL, " \n", &tok);

        /* see if we already have this node */
        found = false;
        for (item = opal_list_get_first(nodelist);
             item != opal_list_get_end(nodelist);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            if (0 == strcmp(ptr, node->name)) {
                /* just add the slots */
                node->slots += (int)strtol(num, (char **)NULL, 10);
                found = true;
                break;
            }
        }
        if (!found) {
            /* create a new node entry */
            node = OBJ_NEW(orte_node_t);
            if (NULL == node) {
                fclose(fp);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            node->name = strdup(ptr);
            node->state = ORTE_NODE_STATE_UP;
            node->slots_inuse = 0;
            node->slots_max = 0;
            node->slots = (int)strtol(num, (char **)NULL, 10);
            opal_output(mca_ras_gridengine_component.verbose,
                        "ras:gridengine: %s: PE_HOSTFILE shows slots=%d",
                        node->name, node->slots);
            opal_list_append(nodelist, &node->super);
        }
    } /* finished reading the $PE_HOSTFILE */

cleanup:
    fclose(fp);

    /* in gridengine, if we didn't find anything, then something
     * is wrong. The user may not have indicated this was a parallel
     * job, or may not have an allocation at all. In any case, this
     * is considered an unrecoverable error and we need to report it
     */
    if (opal_list_is_empty(nodelist)) {
        orte_show_help("help-ras-gridengine.txt", "no-nodes-found", true);
        return ORTE_ERR_NOT_FOUND;
    }

    return ORTE_SUCCESS;
    
}
Beispiel #30
0
/*
 * Initialize global variables used w/in the server.
 */
int pmix_server_init(void)
{
    int rc;
    opal_list_t info;
    opal_value_t *kv;

    if (orte_pmix_server_globals.initialized) {
        return ORTE_SUCCESS;
    }
    orte_pmix_server_globals.initialized = true;

    /* setup the server's state variables */
    OBJ_CONSTRUCT(&orte_pmix_server_globals.reqs, opal_hotel_t);
    if (OPAL_SUCCESS != (rc = opal_hotel_init(&orte_pmix_server_globals.reqs,
                                              orte_pmix_server_globals.num_rooms,
                                              orte_event_base, orte_pmix_server_globals.timeout*1000000,
                                              ORTE_ERROR_PRI, eviction_cbfunc))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    OBJ_CONSTRUCT(&orte_pmix_server_globals.notifications, opal_list_t);

   /* setup recv for direct modex requests */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX,
                            ORTE_RML_PERSISTENT, pmix_server_dmdx_recv, NULL);

    /* setup recv for replies to direct modex requests */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX_RESP,
                            ORTE_RML_PERSISTENT, pmix_server_dmdx_resp, NULL);

    /* setup recv for replies to proxy launch requests */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_LAUNCH_RESP,
                            ORTE_RML_PERSISTENT, pmix_server_launch_resp, NULL);

    /* setup recv for replies from data server */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DATA_CLIENT,
                            ORTE_RML_PERSISTENT, pmix_server_keyval_client, NULL);

    /* setup recv for notifications */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFICATION,
                            ORTE_RML_PERSISTENT, pmix_server_notify, NULL);

    /* ensure the PMIx server uses the proper rendezvous directory */
    opal_setenv("PMIX_SERVER_TMPDIR", orte_process_info.proc_session_dir, true, &environ);

    /* pass the server the local topology - we do this so the procs won't read the
     * topology themselves as this could overwhelm the local
     * system on large-scale SMPs */
    OBJ_CONSTRUCT(&info, opal_list_t);
    if (NULL != opal_hwloc_topology) {
        char *xmlbuffer=NULL;
        int len;
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
        if (0 != hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &xmlbuffer, &len)) {
            OBJ_RELEASE(kv);
            OBJ_DESTRUCT(&info);
            return ORTE_ERROR;
        }
        kv->data.string = xmlbuffer;
        kv->type = OPAL_STRING;
        opal_list_append(&info, &kv->super);
    }
    /* tell the server to allow tool connections */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_SERVER_TOOL_SUPPORT);
    kv->type = OPAL_BOOL;
    kv->data.flag = true;
    opal_list_append(&info, &kv->super);
    /* tell the server our temp directory */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR);
    kv->type = OPAL_STRING;
    kv->data.string = opal_os_path(false, orte_process_info.jobfam_session_dir, NULL);
    opal_list_append(&info, &kv->super);
    /* use the same for the system temp directory - this is
     * where the system-level tool connections will go */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_SYSTEM_TMPDIR);
    kv->type = OPAL_STRING;
    kv->data.string = strdup(orte_process_info.tmpdir_base);
    opal_list_append(&info, &kv->super);

    /* setup the local server */
    if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {
        /* pmix will provide a nice show_help output here */
        return rc;
    }
    OPAL_LIST_DESTRUCT(&info);

    /* if the universal server wasn't specified, then we use
     * our own HNP for that purpose */
    if (NULL == orte_pmix_server_globals.server_uri) {
        orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP;
    } else {
        char *server;
        opal_buffer_t buf;
        if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) ||
            0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) {
            char input[1024], *filename;
            FILE *fp;

            /* it is a file - get the filename */
            filename = strchr(orte_pmix_server_globals.server_uri, ':');
            if (NULL == filename) {
                /* filename is not correctly formatted */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
                               orte_basename, orte_pmix_server_globals.server_uri);
                return ORTE_ERR_BAD_PARAM;
            }
            ++filename; /* space past the : */

            if (0 >= strlen(filename)) {
                /* they forgot to give us the name! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true,
                               orte_basename, orte_pmix_server_globals.server_uri);
                return ORTE_ERR_BAD_PARAM;
            }

            /* open the file and extract the uri */
            fp = fopen(filename, "r");
            if (NULL == fp) { /* can't find or read file! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true,
                               orte_basename, orte_pmix_server_globals.server_uri);
                return ORTE_ERR_BAD_PARAM;
            }
            if (NULL == fgets(input, 1024, fp)) {
                /* something malformed about file */
                fclose(fp);
                orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true,
                               orte_basename, orte_pmix_server_globals.server_uri,
                               orte_basename);
                return ORTE_ERR_BAD_PARAM;
            }
            fclose(fp);
            input[strlen(input)-1] = '\0';  /* remove newline */
            server = strdup(input);
        } else {
            server = strdup(orte_pmix_server_globals.server_uri);
        }
        /* setup our route to the server */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &server, 1, OPAL_STRING);
        if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
            ORTE_ERROR_LOG(rc);
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
        /* parse the URI to get the server's name */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(server, &orte_pmix_server_globals.server, NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* check if we are to wait for the server to start - resolves
         * a race condition that can occur when the server is run
         * as a background job - e.g., in scripts
         */
        if (orte_pmix_server_globals.wait_for_server) {
            /* ping the server */
            struct timeval timeout;
            timeout.tv_sec = orte_pmix_server_globals.timeout;
            timeout.tv_usec = 0;
            if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) {
                /* try it one more time */
                if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) {
                    /* okay give up */
                    orte_show_help("help-orterun.txt", "orterun:server-not-found", true,
                                   orte_basename, server,
                                   (long)orte_pmix_server_globals.timeout,
                                   ORTE_ERROR_NAME(rc));
                    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    return rc;
                }
            }
        }
    }

    return rc;
}