/*
 * Collapse the app's dash_host array into a single comma-separated
 * string in which each host name appears only once.
 *
 * Returns NULL when the app has no dash_host array or when no host
 * ended up in the de-duplicated list; otherwise returns whatever
 * opal_argv_join() produced (presumably heap-allocated and owned by
 * the caller - confirm against opal_argv_join's contract).
 */
static char* get_node_list(orte_app_context_t *app)
{
    char **unique_hosts = NULL;
    char **cursor;
    char *joined;

    if (NULL == app->dash_host) {
        return NULL;
    }

    /* walk the NULL-terminated array, keeping only first occurrences */
    for (cursor = app->dash_host; NULL != *cursor; ++cursor) {
        opal_argv_append_unique_nosize(&unique_hosts, *cursor, false);
    }

    if (NULL == unique_hosts) {
        return NULL;
    }

    joined = opal_argv_join(unique_hosts, ',');
    opal_argv_free(unique_hosts);

    return joined;
}
/*
 * Build a comma-separated list of the distinct hosts named in the
 * app's ORTE_APP_DASH_HOST attribute.
 *
 * The attribute value is a comma-delimited string; it is split,
 * de-duplicated and re-joined.
 *
 * Returns NULL when the attribute is not set, when it yields no host
 * names, or when no host ended up in the de-duplicated list. Otherwise
 * returns the string produced by opal_argv_join() (presumably
 * heap-allocated and owned by the caller - confirm against
 * opal_argv_join's contract).
 */
static char* get_node_list(orte_app_context_t *app)
{
    int j;
    char **total_host = NULL;
    char *nodes;
    char **dash_host;
    char *dh = NULL;

    if (!orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&dh, OPAL_STRING) ||
        NULL == dh) {
        return NULL;
    }

    dash_host = opal_argv_split(dh, ',');
    free(dh);

    /* the split can come back NULL (e.g. nothing to split, or an
     * error) - there is no array to walk in that case */
    if (NULL == dash_host) {
        return NULL;
    }

    for (j=0; NULL != dash_host[j]; j++) {
        opal_argv_append_unique_nosize(&total_host, dash_host[j], false);
    }
    opal_argv_free(dash_host);

    if (NULL == total_host) {
        return NULL;
    }

    nodes = opal_argv_join(total_host, ',');
    opal_argv_free(total_host);

    return nodes;
}
/* we can only enter this routine if no other allocation
 * was found, so we only need to know that finding any
 * relative node syntax should generate an immediate error
 *
 * Each entry of host_argv is a comma-delimited list of host names.
 * Every name is either matched against a node already on the "nodes"
 * list (its slot count is then incremented) or appended to the list
 * as a new node with a single slot.
 *
 * nodes                    - list of orte_node_t that is updated in place
 * override_oversubscribed  - set to true whenever a new node is added
 * host_argv                - array of comma-delimited host lists
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_SILENT if a name starts with '+'
 * (relative syntax), ORTE_ERR_OUT_OF_RESOURCE if a node object cannot
 * be created, or the error reported by opal_argv_append_nosize().
 */
int orte_util_add_dash_host_nodes(opal_list_t *nodes,
                                  bool *override_oversubscribed,
                                  char ** host_argv)
{
    opal_list_item_t* item;
    orte_std_cntr_t i, j, k;
    int rc;
    char **mapped_nodes = NULL, **mini_map;
    orte_node_t *node;

    /* Accumulate all of the host name mappings */
    for (j = 0; j < opal_argv_count(host_argv); ++j) {
        mini_map = opal_argv_split(host_argv[j], ',');
        if (NULL == mini_map) {
            /* this entry produced no names - nothing to merge */
            continue;
        }

        if (mapped_nodes == NULL) {
            /* first non-empty entry: adopt its array directly */
            mapped_nodes = mini_map;
        } else {
            for (k = 0; NULL != mini_map[k]; ++k) {
                rc = opal_argv_append_nosize(&mapped_nodes, mini_map[k]);
                if (OPAL_SUCCESS != rc) {
                    /* don't leak the partially-merged array */
                    opal_argv_free(mini_map);
                    goto cleanup;
                }
            }
            opal_argv_free(mini_map);
        }
    }

    /* Did we find anything? If not, then do nothing */
    if (NULL == mapped_nodes) {
        return ORTE_SUCCESS;
    }

    /* go through the names found and
       add them to the host list. If they're not unique, then
       bump the slots count for each duplicate */
    for (i = 0; NULL != mapped_nodes[i]; ++i) {
        /* if the specified node contains a relative node syntax,
         * this is an error
         */
        if ('+' == mapped_nodes[i][0]) {
            orte_show_help("help-dash-host.txt", "dash-host:relative-syntax",
                           true, mapped_nodes[i]);
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }

        /* see if the node is already on the list */
        for (item = opal_list_get_first(nodes);
             item != opal_list_get_end(nodes);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*) item;
            if (0 == strcmp(node->name, mapped_nodes[i]) ||
                (0 == strcmp(node->name, orte_process_info.nodename) &&
                 (0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
                ++node->slots;
                break;
            }
        }

        /* If we didn't find it, add it to the list */
        if (item == opal_list_get_end(nodes)) {
            node = OBJ_NEW(orte_node_t);
            if (NULL == node) {
                /* go through cleanup so mapped_nodes is released */
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            /* check to see if this is a local name */
            if (0 == strcmp(mapped_nodes[i], "localhost") ||
                opal_ifislocal(mapped_nodes[i])) {
                /* it is local, so use the local nodename to avoid
                 * later confusion
                 */
                if (orte_show_resolved_nodenames &&
                    0 != strcmp(mapped_nodes[i], orte_process_info.nodename)) {
                    /* add to list of aliases for this node - only add if unique */
                    opal_argv_append_unique_nosize(&node->alias, mapped_nodes[i]);
                }
                node->name = strdup(orte_process_info.nodename);
            } else {
                /* not local - use the given name */
                node->name = strdup(mapped_nodes[i]);
            }
            node->state = ORTE_NODE_STATE_UP;
            node->slots_inuse = 0;
            node->slots_max = 0;
            node->slots = 1;
            /* indicate that ORTE should override any oversubscribed conditions
             * based on local hardware limits since the user (a) might not have
             * provided us any info on the #slots for a node, and (b) the user
             * might have been wrong! If we don't check the number of local physical
             * processors, then we could be too aggressive on our sched_yield setting
             * and cause performance problems.
             */
            *override_oversubscribed = true;
            opal_list_append(nodes, &node->super);
        }
    }
    rc = ORTE_SUCCESS;

cleanup:
    if (NULL != mapped_nodes) {
        opal_argv_free(mapped_nodes);
    }

    return rc;
}
/*
 * Parse one hostfile entry whose first token has already been lexed.
 *
 * token    - the first token of the entry
 * updates  - list receiving nodes to be included
 * exclude  - list receiving nodes whose name was prefixed with '^'
 * keep_all - when true, always create a new node object instead of
 *            looking for (and reusing) a same-named entry in "updates"
 *
 * Three kinds of entries are handled:
 *   - a host name / IP / integer, optionally "user@host" and optionally
 *     prefixed with '^' (exclusion); followed by optional slot keywords
 *   - a relative node specification, stored as-is for later processing
 *   - a "rank N=host" style entry, where only the host is used and one
 *     slot is added per occurrence
 *
 * Returns ORTE_SUCCESS when the node was appended to one of the lists,
 * ORTE_ERROR on any syntax or value error (including a host string
 * that does not split into "host" or "user@host").
 */
static int hostfile_parse_line(int token, opal_list_t* updates,
                               opal_list_t* exclude, bool keep_all)
{
    int rc;
    orte_node_t* node;
    bool got_count = false;
    bool got_max = false;
    char* value;
    char** argv;
    char* node_name = NULL;
    char* node_alias = NULL;
    char* username = NULL;
    int cnt;
    int number_of_slots = 0;
    char buff[64];

    if (ORTE_HOSTFILE_STRING == token ||
        ORTE_HOSTFILE_HOSTNAME == token ||
        ORTE_HOSTFILE_INT == token ||
        ORTE_HOSTFILE_IPV4 == token ||
        ORTE_HOSTFILE_IPV6 == token) {

        if(ORTE_HOSTFILE_INT == token) {
            snprintf(buff, sizeof(buff), "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }
        argv = opal_argv_split (value, '@');

        cnt = opal_argv_count (argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = strdup(argv[0]);
            node_name = strdup(argv[1]);
        } else {
            opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
        }
        opal_argv_free (argv);

        /* without a node name there is nothing to record; everything
         * below dereferences it, so treat this as a syntax error */
        if (NULL == node_name) {
            free(username);
            return ORTE_ERROR;
        }

        /* if the first letter of the name is '^', then this is a node
         * to be excluded. Remove the ^ character so the nodename is
         * usable, and put it on the exclude list
         */
        if ('^' == node_name[0]) {
            /* shift the name (and its terminating NUL) left by one */
            memmove(node_name, node_name + 1, strlen(node_name));

            OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
                                 "%s hostfile: node %s is being excluded",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name));

            /* convert this into something globally unique */
            if (strcmp(node_name, "localhost") == 0 || opal_ifislocal(node_name)) {
                /* Nodename has been allocated, that is for sure */
                if (orte_show_resolved_nodenames &&
                    0 != strcmp(node_name, orte_process_info.nodename)) {
                    node_alias = strdup(node_name);
                }
                free (node_name);
                node_name = strdup(orte_process_info.nodename);
            }

            /* Do we need to make a new node object?  First check to see
               if it's already in the exclude list */
            if (NULL == (node = hostfile_lookup(exclude, node_name))) {
                node = OBJ_NEW(orte_node_t);
                /* the node takes ownership of node_name */
                node->name = node_name;
                if (NULL != username) {
                    node->username = strdup(username);
                }
            } else {
                /* existing entry keeps its own name */
                free(node_name);
            }
            /* aliases are not recorded for excluded nodes, and the
             * node holds its own copy of the username */
            free(node_alias);
            free(username);

            /* Note that we need to add this back to the exclude list.
               If it was found, we just removed it (in hostfile_lookup()),
               so this puts it back. If it was not found, then we have to
               add it to the exclude list anyway. */
            opal_list_append(exclude, &node->super);
            return ORTE_SUCCESS;
        }

        /* this is not a node to be excluded, so we need to process it and
         * add it to the "include" list. See if this host is actually us.
         */
        if (strcmp(node_name, "localhost") == 0 || opal_ifislocal(node_name)) {
            /* Nodename has been allocated, that is for sure */
            if (orte_show_resolved_nodenames &&
                0 != strcmp(node_name, orte_process_info.nodename)) {
                node_alias = strdup(node_name);
            }
            free (node_name);
            node_name = strdup(orte_process_info.nodename);
        }

        OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
                             "%s hostfile: node %s is being included - keep all is %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name,
                             keep_all ? "TRUE" : "FALSE"));

        /* Do we need to make a new node object?  First check to see
         * if we are keeping everything or if it's already in the updates
         * list. Because we check keep_all first, if that is set we will
         * not do the hostfile_lookup call, and thus won't remove the
         * pre-existing node from the updates list
         */
        if (keep_all || NULL == (node = hostfile_lookup(updates, node_name))) {
            node = OBJ_NEW(orte_node_t);
            /* the node takes ownership of node_name */
            node->name = node_name;
            if (NULL != username) {
                node->username = strdup(username);
            }
        } else {
            /* existing entry keeps its own name */
            free(node_name);
        }
        /* the node (if new) holds its own copy */
        free(username);

        /* do we need to record an alias for this node? */
        if (NULL != node_alias) {
            /* add to list of aliases for this node - only add if unique */
            opal_argv_append_unique_nosize(&node->alias, node_alias, false);
            free(node_alias);
        }
    } else if (ORTE_HOSTFILE_RELATIVE == token) {
        /* store this for later processing; no username can have been
         * parsed on this path, so there is none to copy */
        node = OBJ_NEW(orte_node_t);
        node->name = strdup(orte_util_hostfile_value.sval);
    } else if (ORTE_HOSTFILE_RANK == token) {
        /* we can ignore the rank, but we need to extract the node name. we
         * first need to shift over to the other side of the equal sign as
         * this is where the node name will be
         */
        while (!orte_util_hostfile_done &&
               ORTE_HOSTFILE_EQUAL != token) {
            token = orte_util_hostfile_lex();
        }
        if (orte_util_hostfile_done) {
            /* bad syntax somewhere */
            return ORTE_ERROR;
        }
        /* next position should be the node name */
        token = orte_util_hostfile_lex();
        if(ORTE_HOSTFILE_INT == token) {
            snprintf(buff, sizeof(buff), "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }

        argv = opal_argv_split (value, '@');

        cnt = opal_argv_count (argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = strdup(argv[0]);
            node_name = strdup(argv[1]);
        } else {
            opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
        }
        opal_argv_free (argv);

        /* a missing node name cannot be looked up or stored */
        if (NULL == node_name) {
            free(username);
            return ORTE_ERROR;
        }

        /* Do we need to make a new node object?  First check to see
         * if we are keeping everything or if it's already in the updates
         * list. Because we check keep_all first, if that is set we will
         * not do the hostfile_lookup call, and thus won't remove the
         * pre-existing node from the updates list
         */
        if (keep_all || NULL == (node = hostfile_lookup(updates, node_name))) {
            node = OBJ_NEW(orte_node_t);
            /* the node takes ownership of node_name */
            node->name = node_name;
            if (NULL != username) {
                node->username = strdup(username);
            }
        } else {
            /* existing entry keeps its own name */
            free(node_name);
        }
        /* the node (if new) holds its own copy */
        free(username);

        /* add a slot */
        node->slots++;

        /* no alias is ever computed on this path, so none is recorded */

        /* skip to end of line */
        while (!orte_util_hostfile_done &&
               ORTE_HOSTFILE_NEWLINE != token) {
            token = orte_util_hostfile_lex();
        }
        opal_list_append(updates, &node->super);
        return ORTE_SUCCESS;
    } else {
        hostfile_parse_error(token);
        return ORTE_ERROR;
    }

    /* process any keyword/value pairs that follow the node name */
    while (!orte_util_hostfile_done) {
        token = orte_util_hostfile_lex();

        switch (token) {
        case ORTE_HOSTFILE_DONE:
            goto done;

        case ORTE_HOSTFILE_NEWLINE:
            goto done;

        case ORTE_HOSTFILE_USERNAME:
            node->username = hostfile_parse_string();
            break;

        case ORTE_HOSTFILE_COUNT:
        case ORTE_HOSTFILE_CPU:
        case ORTE_HOSTFILE_SLOTS:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "slots",
                               true,
                               cur_hostfile_name, rc);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            node->slots += rc;
            got_count = true;

            /* Ensure that slots_max >= slots */
            if (node->slots_max != 0 && node->slots_max < node->slots) {
                node->slots_max = node->slots;
            }
            break;

        case ORTE_HOSTFILE_SLOTS_MAX:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "max_slots",
                               true,
                               cur_hostfile_name, ((size_t) rc));
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            /* Only take this update if it puts us >= node_slots */
            if (rc >= node->slots) {
                if (node->slots_max != rc) {
                    node->slots_max = rc;
                    got_max = true;
                }
            } else {
                orte_show_help("help-hostfile.txt", "max_slots_lt",
                               true,
                               cur_hostfile_name, node->slots, rc);
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            break;

        default:
            hostfile_parse_error(token);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
        /* number_of_slots is never changed from 0 in this function, so
         * this only trips if the slot count has gone negative */
        if (number_of_slots > node->slots) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
    }

done:
    if (got_count) {
        node->slots_given = true;
    } else if (got_max) {
        node->slots = node->slots_max;
        node->slots_given = true;
    } else {
        /* should be set by obj_new, but just to be clear */
        node->slots_given = false;
        /* if we weren't give a count or a max, then
         * just increment by one to support RMs that
         * count slots by listing the node multiple
         * times in the file
         */
        ++node->slots;
    }
    opal_list_append(updates, &node->super);

    return ORTE_SUCCESS;
}
/*
 * Add the specified node definitions to the global data store
 * NOTE: this removes all items from the list!
 *
 * Each node taken off "nodes" is either merged into the HNP's own
 * entry (slot 0 of orte_node_pool) when it names the local host, or
 * added to orte_node_pool as a new entry. orte_ras_base.total_slots_alloc
 * is increased by each node's slot count.
 *
 * nodes - list of orte_node_t; emptied as nodes are processed
 * jdata - not referenced by this function
 *
 * Returns ORTE_SUCCESS, or the error from opal_pointer_array_set_size()
 * / opal_pointer_array_add(). On an add failure the node being
 * inserted is released and any remaining nodes stay on the list.
 */
int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
{
    opal_list_item_t* item;
    orte_std_cntr_t num_nodes;
    int rc, i;
    orte_node_t *node, *hnp_node;
    char *ptr;
    bool hnp_alone = true;

    /* get the number of nodes */
    num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
    if (0 == num_nodes) {
        return ORTE_SUCCESS;  /* nothing to do */
    }

    OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                         "%s ras:base:node_insert inserting %ld nodes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (long)num_nodes));

    /* set the size of the global array - this helps minimize time
     * spent doing realloc's
     */
    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(orte_node_pool, num_nodes))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* get the hnp node's info */
    hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);

    /* cycle through the list */
    while (NULL != (item = opal_list_remove_first(nodes))) {
        node = (orte_node_t*)item;

        /* the HNP had to already enter its node on the array - that entry is in the
         * first position since it is the first one entered. We need to check to see
         * if this node is the same as the HNP's node so we don't double-enter it
         */
        if (NULL != hnp_node &&
            (0 == strcmp(node->name, hnp_node->name) ||
             0 == strcmp(node->name, "localhost") ||
             opal_ifislocal(node->name))) {
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                                 "%s ras:base:node_insert updating HNP [%s] info to %ld slots",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name,
                                 (long)node->slots));

            /* flag that hnp has been allocated */
            orte_hnp_is_allocated = true;
            /* update the total slots in the job */
            orte_ras_base.total_slots_alloc += node->slots;
            /* copy the allocation data to that node's info */
            hnp_node->slots += node->slots;
            hnp_node->slots_max = node->slots_max;
            hnp_node->launch_id = node->launch_id;
            if (orte_managed_allocation) {
                /* the slots are always treated as sacred
                 * in managed allocations
                 */
                hnp_node->slots_given = true;
            } else {
                /* in unmanaged allocations, take whatever
                 * was determined by the hostfile or dash-host
                 * parsers
                 */
                hnp_node->slots_given = node->slots_given;
            }
            /* use the local name for our node - don't trust what
             * we got from an RM. If requested, store the resolved
             * nodename info
             */
            if (orte_show_resolved_nodenames) {
                /* if the node name is different, store it as an alias */
                if (0 != strcmp(node->name, hnp_node->name)) {
                    /* add to list of aliases for this node - only add if unique */
                    opal_argv_append_unique_nosize(&hnp_node->alias, node->name, false);
                }
                if (NULL != node->alias) {
                    /* now copy over any aliases that are unique */
                    for (i=0; NULL != node->alias[i]; i++) {
                        opal_argv_append_unique_nosize(&hnp_node->alias, node->alias[i], false);
                    }
                }
            }
            /* don't keep duplicate copy */
            OBJ_RELEASE(node);
        } else {
            /* insert the object onto the orte_nodes global array */
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                                 "%s ras:base:node_insert node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == node->name) ? "NULL" : node->name));
            if (orte_managed_allocation) {
                /* the slots are always treated as sacred
                 * in managed allocations
                 */
                node->slots_given = true;
            }
            /* insert it into the array */
            node->index = opal_pointer_array_add(orte_node_pool, (void*)node);
            if (ORTE_SUCCESS > (rc = node->index)) {
                ORTE_ERROR_LOG(rc);
                /* the node is on neither the list nor the array any
                 * more, so release it rather than leak it */
                OBJ_RELEASE(node);
                return rc;
            }
            /* update the total slots in the job */
            orte_ras_base.total_slots_alloc += node->slots;
            /* check if we have fqdn names in the allocation */
            if (NULL != strchr(node->name, '.')) {
                orte_have_fqdn_allocation = true;
            }
            /* indicate the HNP is not alone */
            hnp_alone = false;
        }
    }

    /* if we didn't find any fqdn names in the allocation, then
     * ensure we don't have any domain info in the node record
     * for the hnp. hnp_node may be NULL (the loop above allows for
     * that), so it must be checked before being dereferenced.
     */
    if (NULL != hnp_node && !orte_have_fqdn_allocation && !hnp_alone) {
        if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
            *ptr = '\0';
        }
    }

    return ORTE_SUCCESS;
}
/* we can only enter this routine if no other allocation
 * was found, so we only need to know that finding any
 * relative node syntax should generate an immediate error
 *
 * Each entry of host_argv is a comma-delimited list of host names.
 * Every name is either matched against a node already on the "nodes"
 * list (its slot count is then incremented) or appended to the list
 * as a new node with a single slot. In both cases the node is marked
 * as having its slot count given.
 *
 * nodes     - list of orte_node_t that is updated in place
 * host_argv - array of comma-delimited host lists
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_SILENT if a name starts with '+'
 * (relative syntax), ORTE_ERR_OUT_OF_RESOURCE if a node object cannot
 * be created, or the error reported by opal_argv_append_nosize().
 */
int orte_util_add_dash_host_nodes(opal_list_t *nodes,
                                  char ** host_argv)
{
    opal_list_item_t* item;
    orte_std_cntr_t i, j, k;
    int rc;
    char **mapped_nodes = NULL, **mini_map;
    orte_node_t *node;

    /* Accumulate all of the host name mappings */
    for (j = 0; j < opal_argv_count(host_argv); ++j) {
        mini_map = opal_argv_split(host_argv[j], ',');
        if (NULL == mini_map) {
            /* this entry produced no names - nothing to merge */
            continue;
        }

        if (mapped_nodes == NULL) {
            /* first non-empty entry: adopt its array directly */
            mapped_nodes = mini_map;
        } else {
            for (k = 0; NULL != mini_map[k]; ++k) {
                rc = opal_argv_append_nosize(&mapped_nodes, mini_map[k]);
                if (OPAL_SUCCESS != rc) {
                    /* don't leak the partially-merged array */
                    opal_argv_free(mini_map);
                    goto cleanup;
                }
            }
            opal_argv_free(mini_map);
        }
    }

    /* Did we find anything? If not, then do nothing */
    if (NULL == mapped_nodes) {
        return ORTE_SUCCESS;
    }

    /* go through the names found and
       add them to the host list. If they're not unique, then
       bump the slots count for each duplicate */
    for (i = 0; NULL != mapped_nodes[i]; ++i) {
        /* if the specified node contains a relative node syntax,
         * this is an error
         */
        if ('+' == mapped_nodes[i][0]) {
            orte_show_help("help-dash-host.txt", "dash-host:relative-syntax",
                           true, mapped_nodes[i]);
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }

        /* see if the node is already on the list */
        for (item = opal_list_get_first(nodes);
             item != opal_list_get_end(nodes);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*) item;
            if (0 == strcmp(node->name, mapped_nodes[i]) ||
                (0 == strcmp(node->name, orte_process_info.nodename) &&
                 (0 == strcmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
                ++node->slots;
                /* the dash-host option presumes definition of num_slots */
                node->slots_given = true;
                break;
            }
        }

        /* If we didn't find it, add it to the list */
        if (item == opal_list_get_end(nodes)) {
            node = OBJ_NEW(orte_node_t);
            if (NULL == node) {
                /* go through cleanup so mapped_nodes is released */
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            /* check to see if this is a local name */
            if (0 == strcmp(mapped_nodes[i], "localhost") ||
                opal_ifislocal(mapped_nodes[i])) {
                /* it is local, so use the local nodename to avoid
                 * later confusion
                 */
                if (orte_show_resolved_nodenames &&
                    0 != strcmp(mapped_nodes[i], orte_process_info.nodename)) {
                    /* add to list of aliases for this node - only add if unique */
                    opal_argv_append_unique_nosize(&node->alias, mapped_nodes[i], false);
                }
                node->name = strdup(orte_process_info.nodename);
            } else {
                /* not local - use the given name */
                node->name = strdup(mapped_nodes[i]);
            }
            node->state = ORTE_NODE_STATE_UP;
            node->slots_inuse = 0;
            node->slots_max = 0;
            node->slots = 1;
            /* the dash-host option presumes definition of num_slots */
            node->slots_given = true;
            opal_list_append(nodes, &node->super);
        }
    }
    rc = ORTE_SUCCESS;

cleanup:
    if (NULL != mapped_nodes) {
        opal_argv_free(mapped_nodes);
    }

    return rc;
}
/*
 * Add the specified node definitions to the global data store
 * NOTE: this removes all items from the list!
 *
 * Each node taken off "nodes" is either merged into the HNP's own
 * entry (slot 0 of orte_node_pool) when it names the local host, or
 * added to orte_node_pool as a new entry. jdata->total_slots_alloc is
 * kept in step with the slot counts.
 *
 * nodes - list of orte_node_t; emptied as nodes are processed
 * jdata - job whose total_slots_alloc is updated
 *
 * Returns ORTE_SUCCESS, or the error from opal_pointer_array_set_size()
 * / opal_pointer_array_add(). On an add failure the node being
 * inserted is released and any remaining nodes stay on the list.
 */
int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
{
    opal_list_item_t* item;
    orte_std_cntr_t num_nodes;
    int rc, i;
    orte_node_t *node, *hnp_node;

    /* get the number of nodes */
    num_nodes = (orte_std_cntr_t)opal_list_get_size(nodes);
    if (0 == num_nodes) {
        return ORTE_SUCCESS;  /* nothing to do */
    }

    OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                         "%s ras:base:node_insert inserting %ld nodes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (long)num_nodes));

    /* set the size of the global array - this helps minimize time
     * spent doing realloc's
     */
    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_size(orte_node_pool, num_nodes))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* get the hnp node's info */
    hnp_node = (orte_node_t*)(orte_node_pool->addr[0]);

    /* cycle through the list */
    while (NULL != (item = opal_list_remove_first(nodes))) {
        node = (orte_node_t*)item;

        /* the HNP had to already enter its node on the array - that entry is in the
         * first position since it is the first one entered. We need to check to see
         * if this node is the same as the HNP's node so we don't double-enter it.
         * Guard against an empty first slot so we never dereference a NULL
         * hnp_node; in that case the node is simply inserted below.
         */
        if (NULL != hnp_node &&
            (0 == strcmp(node->name, hnp_node->name) ||
             opal_ifislocal(node->name))) {
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                                 "%s ras:base:node_insert updating HNP info to %ld slots",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (long)node->slots));

            /* flag that hnp has been allocated */
            orte_hnp_is_allocated = true;
            /* adjust the total slots in the job */
            jdata->total_slots_alloc -= hnp_node->slots;
            /* copy the allocation data to that node's info */
            hnp_node->slots = node->slots;
            hnp_node->slots_max = node->slots_max;
            hnp_node->launch_id = node->launch_id;
            /* default allocate all the slots - may be modified later
             * as a result of filtering actions in mapper
             */
            hnp_node->slots_alloc = node->slots;
            /* use the local name for our node - don't trust what
             * we got from an RM. If requested, store the resolved
             * nodename info
             */
            if (orte_show_resolved_nodenames) {
                /* if the node name is different, store it as an alias */
                if (0 != strcmp(node->name, hnp_node->name)) {
                    /* add to list of aliases for this node - only add if unique */
                    opal_argv_append_unique_nosize(&hnp_node->alias, node->name);
                }
                if (NULL != node->alias) {
                    /* now copy over any aliases that are unique */
                    for (i=0; NULL != node->alias[i]; i++) {
                        opal_argv_append_unique_nosize(&hnp_node->alias, node->alias[i]);
                    }
                }
            }
            /* update the total slots in the job */
            jdata->total_slots_alloc += hnp_node->slots;
            /* don't keep duplicate copy */
            OBJ_RELEASE(node);
        } else {
            /* insert the object onto the orte_nodes global array */
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                                 "%s ras:base:node_insert node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == node->name) ? "NULL" : node->name));
            /* default allocate all the slots - may be modified later
             * as a result of filtering actions in mapper
             */
            node->slots_alloc = node->slots;
            /* insert it into the array */
            node->index = opal_pointer_array_add(orte_node_pool, (void*)node);
            if (ORTE_SUCCESS > (rc = node->index)) {
                ORTE_ERROR_LOG(rc);
                /* the node is on neither the list nor the array any
                 * more, so release it rather than leak it */
                OBJ_RELEASE(node);
                return rc;
            }
            /* update the total slots in the job */
            jdata->total_slots_alloc += node->slots;
        }
    }

    return ORTE_SUCCESS;
}