/*
 * Discover the available (pre-allocated) nodes and allocate the
 * requested number of nodes/process slots to the job.
 */
static int orte_ras_loadleveler_allocate(orte_jobid_t jobid, opal_list_t *attributes)
{
    int i, ret;
    opal_list_t nodes_list;
    opal_list_item_t *item;
    orte_ras_node_t *node;
    char **hostlist = NULL;
    int num_hosts = 0;

    OBJ_CONSTRUCT(&nodes_list, opal_list_t);

    ret = orte_ras_loadleveler_get_hostlist(&num_hosts, &hostlist);
    if (ORTE_SUCCESS != ret) {
        goto cleanup;
    }

    for (i = 0; i < num_hosts; i++) {
        /* check for duplicated nodes - a repeated hostname means one more slot */
        for (item = opal_list_get_first(&nodes_list);
             opal_list_get_end(&nodes_list) != item;
             item = opal_list_get_next(item)) {
            node = (orte_ras_node_t*) item;
            if (0 == strcmp(node->node_name, hostlist[i])) {
                ++node->node_slots;
                break;
            }
        }

        if (opal_list_get_end(&nodes_list) == item) {
            /* we did not find a duplicate, so add a new item to the list */
            node = OBJ_NEW(orte_ras_node_t);
            if (NULL == node) {
                ret = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            node->node_name = strdup(hostlist[i]);
            node->node_arch = NULL;
            node->node_state = ORTE_NODE_STATE_UP;
            node->node_cellid = 0;
            node->node_slots_inuse = 0;
            node->node_slots_max = 0;
            node->node_slots = 1;
            opal_list_append(&nodes_list, &node->super);
        }
    }

    /* insert the nodes into the registry, then allocate them to the job */
    ret = orte_ras_base_node_insert(&nodes_list);
    if (ORTE_SUCCESS != ret) {
        goto cleanup;
    }
    ret = orte_ras_base_allocate_nodes(jobid, &nodes_list);

cleanup:
    while (NULL != (item = opal_list_remove_first(&nodes_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&nodes_list);
    opal_argv_free(hostlist);
    return ret;
}
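/*
 * For context: the helper above, orte_ras_loadleveler_get_hostlist(), is not
 * shown in this section.  The following is only a hypothetical sketch of such
 * a helper, assuming LoadLeveler publishes the allocated hosts as a
 * space-separated list in the LOADL_PROCESSOR_LIST environment variable (one
 * entry per allocated slot); it is not the actual component implementation.
 */
static int orte_ras_loadleveler_get_hostlist(int *num_hosts, char ***hostlist)
{
    char *env_list;

    *num_hosts = 0;
    *hostlist = NULL;

    env_list = getenv("LOADL_PROCESSOR_LIST");
    if (NULL == env_list || '\0' == env_list[0]) {
        /* we are not running inside a LoadLeveler allocation */
        return ORTE_ERR_NOT_FOUND;
    }

    /* opal_argv_split() copies the string, so the environment is untouched */
    *hostlist = opal_argv_split(env_list, ' ');
    if (NULL == *hostlist) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    *num_hosts = opal_argv_count(*hostlist);
    return ORTE_SUCCESS;
}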
/* Discover the number of available resources.  For xgrid this is always
 * exactly what was asked for (surprise...). */
static int discover(orte_jobid_t jobid, opal_list_t* nodelist)
{
    int ret;
    orte_ras_node_t *node;
    orte_std_cntr_t num_requested = 0;
    orte_std_cntr_t i;
    char *hostname;

    /* how many slots do we need? */
    if (ORTE_SUCCESS != (ret = orte_rmgr_base_get_job_slots(jobid, &num_requested))) {
        return ret;
    }

    /* create a "node" for each slot */
    for (i = 0; i < num_requested; ++i) {
        asprintf(&hostname, "xgrid-node-%d", (int) i);
        node = OBJ_NEW(orte_ras_node_t);
        node->node_name = hostname;
        node->node_arch = NULL;
        node->node_state = ORTE_NODE_STATE_UP;
        node->node_cellid = 0;
        node->node_slots_inuse = 0;
        node->node_slots_max = 0;
        node->node_slots = 1;
        opal_list_append(nodelist, &node->super);
    }

    /* add these nodes to the registry, and return all the values */
    opal_output(orte_ras_base.ras_output,
                "ras:xgrid:allocate:discover: done -- adding to registry");
    ret = orte_ras_base_node_insert(nodelist);

    /* all done */
    if (ORTE_SUCCESS == ret) {
        opal_output(orte_ras_base.ras_output,
                    "ras:xgrid:allocate:discover: success");
    } else {
        opal_output(orte_ras_base.ras_output,
                    "ras:xgrid:allocate:discover: failed (rc=%d)", ret);
    }
    return ret;
}
int orte_ras_base_add_hosts(orte_job_t *jdata)
{
    int rc;
    opal_list_t nodes;
    int i;
    orte_app_context_t *app;

    /* construct a list to hold the results */
    OBJ_CONSTRUCT(&nodes, opal_list_t);

    /* Individual add-hostfile names, if given, are included
     * in the app_contexts for this job. We therefore need to
     * retrieve the app_contexts for the job, and then cycle
     * through them to see if anything is there. The parser will
     * add the nodes found in each add-hostfile to our list - i.e.,
     * the resulting list contains the UNION of all nodes specified
     * in add-hostfiles from across all app_contexts.
     *
     * Note that any relative node syntax found in the add-hostfiles will
     * generate an error in this scenario, so only non-relative syntax
     * can be present.
     */
    for (i = 0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        if (NULL != app->add_hostfile) {
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                                 "%s ras:base:add_hosts checking add-hostfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 app->add_hostfile));
            /* hostfile was specified - parse it and add it to the list */
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, app->add_hostfile))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                return rc;
            }
            /* now indicate that this app is to run across it */
            app->hostfile = app->add_hostfile;
            app->add_hostfile = NULL;
        }
    }

    /* We next check for and add any add-host options. Note this is
     * a -little- different than dash-host in that (a) we add these
     * nodes to the global pool regardless of what may already be there,
     * and (b) as a result, any job and/or app_context can access them.
     *
     * Note that any relative node syntax found in the add-host lists will
     * generate an error in this scenario, so only non-relative syntax
     * can be present.
     */
    for (i = 0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        if (NULL != app->add_host) {
            if (4 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
                char *fff = opal_argv_join(app->add_host, ',');
                opal_output(0, "%s ras:base:add_hosts checking add-host %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), fff);
                free(fff);
            }
            if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, app->add_host))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                return rc;
            }
            /* now indicate that this app is to run across them */
            app->dash_host = app->add_host;
            app->add_host = NULL;
        }
    }

    /* if something was found, we add that to our global pool */
    if (!opal_list_is_empty(&nodes)) {
        /* store the results in the global resource pool - this removes the
         * list items */
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
    }

    /* shall we display the results? */
    if (0 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
        orte_ras_base_display_alloc();
    }

    return ORTE_SUCCESS;
}
/*
 * Function for reading the allocation - either from an active RAS module
 * or from hostfiles/dash-host/rankfile - and establishing the global pool
 * of resources for this HNP.
 */
void orte_ras_base_allocate(int fd, short args, void *cbdata)
{
    int rc;
    orte_job_t *jdata;
    opal_list_t nodes;
    orte_node_t *node;
    orte_std_cntr_t i;
    orte_app_context_t *app;
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                         "%s ras:base:allocate",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* convenience */
    jdata = caddy->jdata;

    /* if we already did this, don't do it again - the pool of
     * global resources is set.
     */
    if (orte_ras_base.allocation_read) {
        OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                             "%s ras:base:allocate allocation already read",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto next_state;
    }
    orte_ras_base.allocation_read = true;

    /* Otherwise, we have to create the initial set of resources that
     * will delineate all further operations serviced by this HNP. This
     * list will contain ALL nodes that can be used by any subsequent job.
     *
     * In other words, if a node isn't found in this step, then no job
     * launched by this HNP will be able to utilize it.
     */

    /* construct a list to hold the results */
    OBJ_CONSTRUCT(&nodes, opal_list_t);

    /* if a component was selected, then we know we are in a managed
     * environment - the active module will return a list of what it found
     */
    if (NULL != orte_ras_base.active_module) {
        /* read the allocation */
        if (ORTE_SUCCESS != (rc = orte_ras_base.active_module->allocate(jdata, &nodes))) {
            if (ORTE_ERR_ALLOCATION_PENDING == rc) {
                /* an allocation request is underway, so just do nothing */
                OBJ_DESTRUCT(&nodes);
                OBJ_RELEASE(caddy);
                return;
            }
            if (ORTE_ERR_SYSTEM_WILL_BOOTSTRAP == rc) {
                /* this module indicates that nodes will be discovered
                 * on a bootstrap basis, so all we do here is add our
                 * own node to the list
                 */
                goto addlocal;
            }
            if (ORTE_ERR_TAKE_NEXT_OPTION == rc) {
                /* we have an active module, but it is unable to
                 * allocate anything for this job - this indicates
                 * that it isn't a fatal error, but could be if
                 * an allocation is required
                 */
                if (orte_allocation_required) {
                    /* an allocation is required, so this is fatal */
                    OBJ_DESTRUCT(&nodes);
                    orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
                    ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    OBJ_RELEASE(caddy);
                    return;
                } else {
                    /* an allocation is not required, so we can just
                     * run on the local node - go add it
                     */
                    goto addlocal;
                }
            }
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }

    /* if something came back, save it and we are done */
    if (!opal_list_is_empty(&nodes)) {
        /* store the results in the global resource pool - this removes the
         * list items
         */
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        OBJ_DESTRUCT(&nodes);
        /* default to no-oversubscribe-allowed for managed systems */
        if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        }
        /* flag that the allocation is managed */
        orte_managed_allocation = true;
        goto DISPLAY;
    } else if (orte_allocation_required) {
        /* if nothing was found, and an allocation is
         * required, then error out
         */
        OBJ_DESTRUCT(&nodes);
        orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                         "%s ras:base:allocate nothing found in module - proceeding to hostfile",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* Nothing was found, or no active module was alive. Our next
     * option is to look for a hostfile and assign our global
     * pool from there.
     *
     * Individual hostfile names, if given, are included
     * in the app_contexts for this job. We therefore need to
     * retrieve the app_contexts for the job, and then cycle
     * through them to see if anything is there. The parser will
     * add the nodes found in each hostfile to our list - i.e.,
     * the resulting list contains the UNION of all nodes specified
     * in hostfiles from across all app_contexts.
     *
     * We then continue to add any hosts provided by dash-host and
     * the default hostfile, if we have it. We will then filter out
     * all the non-desired hosts (i.e., those not specified by
     * -host and/or -hostfile) when we start the mapping process.
     *
     * Note that any relative node syntax found in the hostfiles will
     * generate an error in this scenario, so only non-relative syntax
     * can be present.
     */
    if (NULL != orte_default_hostfile) {
        OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                             "%s ras:base:allocate parsing default hostfile %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             orte_default_hostfile));

        /* a default hostfile was provided - parse it */
        if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_default_hostfile))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }

    for (i = 0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        if (NULL != app->hostfile) {
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                                 "%s ras:base:allocate adding hostfile %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 app->hostfile));

            /* hostfile was specified - parse it and add it to the list */
            if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, app->hostfile))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                /* set an error event */
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                OBJ_RELEASE(caddy);
                return;
            }
        } else if (!orte_soft_locations && NULL != app->dash_host) {
            /* if we are using soft locations, then any dash-host entries
             * only indicate desired - not required - nodes. We don't want
             * to pick them up here as this would mean the request was
             * always satisfied - instead, we want to allow the request
             * to fail later on and use whatever nodes are actually
             * available
             */
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                                 "%s ras:base:allocate adding dash_hosts",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, app->dash_host))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&nodes);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                OBJ_RELEASE(caddy);
                return;
            }
        }
    }

    /* if something was found in the hostfile(s), we use that as our global
     * pool - set it and we are done
     */
    if (!opal_list_is_empty(&nodes)) {
        /* store the results in the global resource pool - this removes the
         * list items
         */
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                         "%s ras:base:allocate nothing found in hostfiles - checking for rankfile",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* Our next option is to look for a rankfile - if one was provided, we
     * will use its nodes to create a default allocation pool
     */
    if (NULL != orte_rankfile) {
        /* check the rankfile for node information */
        if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_rankfile))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&nodes);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }

    /* if something was found in the rankfile, we use that as our global
     * pool - set it and we are done
     */
    if (!opal_list_is_empty(&nodes)) {
        /* store the results in the global resource pool - this removes the
         * list items
         */
        if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
            ORTE_ERROR_LOG(rc);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* rankfile is considered equivalent to an RM allocation */
        if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
        }
        /* cleanup */
        OBJ_DESTRUCT(&nodes);
        goto DISPLAY;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                         "%s ras:base:allocate nothing found in rankfile - inserting current node",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

addlocal:
    /* if nothing was found by any of the above methods, then we have no
     * earthly idea what to do - so just add the local host
     */
    node = OBJ_NEW(orte_node_t);
    if (NULL == node) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OBJ_DESTRUCT(&nodes);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    /* use the same name we got in orte_process_info so we avoid confusion in
     * the session directories
     */
    node->name = strdup(orte_process_info.nodename);
    node->state = ORTE_NODE_STATE_UP;
    node->slots_inuse = 0;
    node->slots_max = 0;
    node->slots = 1;
    opal_list_append(&nodes, &node->super);

    /* store the results in the global resource pool - this removes the
     * list items
     */
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nodes, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&nodes);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    OBJ_DESTRUCT(&nodes);

DISPLAY:
    /* shall we display the results? */
    if (4 < opal_output_get_verbosity(orte_ras_base_framework.framework_output)) {
        orte_ras_base_display_alloc();
    }

next_state:
    /* are we to report this event? */
    if (orte_report_events) {
        if (ORTE_SUCCESS != (rc = orte_util_comm_report_event(ORTE_COMM_EVENT_ALLOCATE))) {
            ORTE_ERROR_LOG(rc);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
        }
    }

    /* set total slots alloc */
    jdata->total_slots_alloc = orte_ras_base.total_slots_alloc;

    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);

    /* cleanup */
    OBJ_RELEASE(caddy);
}
static int orte_rds_hostfile_query(orte_jobid_t job)
{
    opal_list_t existing;
    opal_list_t updates, rds_updates;
    opal_list_item_t *item;
    orte_rds_cell_desc_t *rds_item;
    orte_rds_cell_attr_t *new_attr;
    orte_ras_node_t *ras_item;
    int rc;

    if (orte_rds_hostfile_queried) {
        /* if we have already been queried, then
         * our info is on the registry, so just
         * return. Note that this restriction
         * may eventually be lifted - ideally,
         * we might check to see if this is a
         * new file name and go ahead with the
         * query if so.
         */
        return ORTE_SUCCESS;
    }
    orte_rds_hostfile_queried = true;

    OBJ_CONSTRUCT(&existing, opal_list_t);
    OBJ_CONSTRUCT(&updates, opal_list_t);
    OBJ_CONSTRUCT(&rds_updates, opal_list_t);
    rc = orte_ras_base_node_query(&existing);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    rc = mca_base_param_find("rds", "hostfile", "path");
    mca_base_param_lookup_string(rc, &mca_rds_hostfile_component.path);

    rc = orte_rds_hostfile_parse(mca_rds_hostfile_component.path, &existing, &updates);
    if (ORTE_ERR_NOT_FOUND == rc) {
        if (mca_rds_hostfile_component.default_hostfile) {
            rc = ORTE_SUCCESS;
        } else {
            opal_show_help("help-rds-hostfile.txt", "rds:no-hostfile",
                           true, mca_rds_hostfile_component.path);
        }
        goto cleanup;
    } else if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    if (!opal_list_is_empty(&updates)) {

        /* convert the RAS update list to an RDS update list */
        for (ras_item = (orte_ras_node_t*)opal_list_get_first(&updates);
             ras_item != (orte_ras_node_t*)opal_list_get_end(&updates);
             ras_item = (orte_ras_node_t*)opal_list_get_next(ras_item)) {

            rds_item = OBJ_NEW(orte_rds_cell_desc_t);
            if (NULL == rds_item) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }

            rds_item->site = strdup("Hostfile");
            rds_item->name = strdup(ras_item->node_name);

            if (need_cellid) {
#if 0 /* JJH Repair when cellid's are fixed */
                /* create a new cellid for this hostfile */
                rc = orte_ns.create_cellid(&local_cellid, rds_item->site, rds_item->name);
                if (ORTE_SUCCESS != rc) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
#endif
                local_cellid = 0;
                need_cellid = false;
            }

            rds_item->cellid = local_cellid;
            ras_item->node_cellid = local_cellid;

            new_attr = OBJ_NEW(orte_rds_cell_attr_t);
            if (NULL == new_attr) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.key = strdup(ORTE_RDS_NAME);
            new_attr->keyval.value = OBJ_NEW(orte_data_value_t);
            if (NULL == new_attr->keyval.value) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.value->type = ORTE_STRING;
            new_attr->keyval.value->data = strdup(ras_item->node_name);
            opal_list_append(&(rds_item->attributes), &new_attr->super);

            new_attr = OBJ_NEW(orte_rds_cell_attr_t);
            if (NULL == new_attr) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.key = strdup(ORTE_CELLID_KEY);
            new_attr->keyval.value = OBJ_NEW(orte_data_value_t);
            if (NULL == new_attr->keyval.value) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            new_attr->keyval.value->type = ORTE_CELLID;
            if (ORTE_SUCCESS != (rc = orte_dss.copy(&(new_attr->keyval.value->data),
                                                    &(rds_item->cellid), ORTE_CELLID))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            opal_list_append(&(rds_item->attributes), &new_attr->super);

            opal_list_append(&rds_updates, &rds_item->super);
        }

        /* insert the new nodes into the RDS */
        rc = orte_rds.store_resource(&rds_updates);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }

        /* then the RAS, since we can assume that any resources listed in
         * the hostfile have already been allocated for our use.
         */
        rc = orte_ras_base_node_insert(&updates);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }

        /* And now indicate that ORTE should override any oversubscribed
         * conditions based on local hardware limits, since the user (a) might
         * not have provided us any info on the #slots for a node, and (b) the
         * user might have been wrong! If we don't check the number of local
         * physical processors, then we could be too aggressive on our
         * sched_yield setting and cause performance problems.
         */
        rc = orte_ras_base_set_oversubscribe_override(job);
        if (ORTE_SUCCESS != rc) {
            goto cleanup;
        }
    }

cleanup:
    if (NULL != mca_rds_hostfile_component.path) {
        free(mca_rds_hostfile_component.path);
        mca_rds_hostfile_component.path = NULL;
    }

    while (NULL != (item = opal_list_remove_first(&existing))) {
        OBJ_RELEASE(item);
    }
    while (NULL != (item = opal_list_remove_first(&updates))) {
        OBJ_RELEASE(item);
    }
    while (NULL != (rds_item = (orte_rds_cell_desc_t*)opal_list_remove_first(&rds_updates))) {
        while (NULL != (new_attr = (orte_rds_cell_attr_t*)opal_list_remove_first(&(rds_item->attributes)))) {
            OBJ_RELEASE(new_attr);
        }
        OBJ_RELEASE(rds_item);
    }

    OBJ_DESTRUCT(&existing);
    OBJ_DESTRUCT(&updates);
    OBJ_DESTRUCT(&rds_updates);

    return rc;
}
static void recv_data(int fd, short args, void *cbdata)
{
    bool found;
    int i, rc;
    orte_node_t *nd, *nd2;
    opal_list_t nds, ndtmp;
    opal_list_item_t *item, *itm;
    char recv_msg[8192];
    int nbytes, idx, sjob;
    char **alloc, *nodelist, *tpn;
    local_jobtracker_t *ptr, *jtrk;
    local_apptracker_t *aptrk;
    orte_app_context_t *app;
    orte_jobid_t jobid;
    orte_job_t *jdata;
    char **dash_host = NULL;

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation - data recvd",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* read the data from the socket into a local buffer */
    memset(recv_msg, 0, sizeof(recv_msg));
    nbytes = read(fd, recv_msg, sizeof(recv_msg) - 1);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation msg: %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg);

    /* check if we got something */
    if (0 == nbytes || 0 == strlen(recv_msg) || strstr(recv_msg, "failure") != NULL) {
        /* show an error here - basically, a "nothing was available"
         * message
         */
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true,
                       (0 == strlen(recv_msg)) ? "NO MSG" : recv_msg);
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        return;
    }

    /* break the message into its component parts, separated by colons */
    alloc = opal_argv_split(recv_msg, ':');

    /* the first section contains the ORTE jobid for this allocation */
    tpn = strchr(alloc[0], '=');
    orte_util_convert_string_to_jobid(&jobid, tpn+1);
    /* get the corresponding job object */
    jdata = orte_get_job_data_object(jobid);
    jtrk = NULL;
    /* find the associated tracking object */
    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = opal_list_get_next(item)) {
        ptr = (local_jobtracker_t*)item;
        if (ptr->jobid == jobid) {
            jtrk = ptr;
            break;
        }
    }
    if (NULL == jtrk) {
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, "NO JOB TRACKER");
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        opal_argv_free(alloc);
        return;
    }

    /* stop the timeout event */
    opal_event_del(&jtrk->timeout_ev);

    /* cycle across all the remaining parts - each is the allocation for
     * an app in this job
     */
    OBJ_CONSTRUCT(&nds, opal_list_t);
    OBJ_CONSTRUCT(&ndtmp, opal_list_t);
    idx = -1;
    sjob = -1;
    nodelist = NULL;
    tpn = NULL;
    for (i = 1; NULL != alloc[i]; i++) {
        if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            if (NULL != nodelist) {
                free(nodelist);
            }
            if (NULL != tpn) {
                free(tpn);
            }
            return;
        }
        if (idx < 0) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        /* release the current dash_host as that contained the *desired* allocation */
        orte_remove_attribute(&app->attributes, ORTE_APP_DASH_HOST);
        /* track the Slurm jobid */
        if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) {
            aptrk = OBJ_NEW(local_apptracker_t);
            opal_pointer_array_set_item(&jtrk->apps, idx, aptrk);
        }
        aptrk->sjob = sjob;
        /* since the nodelist/tpn may contain regular expressions, parse them */
        if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) {
            ORTE_ERROR_LOG(rc);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        /* transfer the discovered nodes to our node list, and construct
         * the new dash_host entry to match what was allocated
         */
        while (NULL != (item = opal_list_remove_first(&ndtmp))) {
            nd = (orte_node_t*)item;
            opal_argv_append_nosize(&dash_host, nd->name);
            /* check for duplicates */
            found = false;
            for (itm = opal_list_get_first(&nds);
                 itm != opal_list_get_end(&nds);
                 itm = opal_list_get_next(itm)) {
                nd2 = (orte_node_t*)itm;
                if (0 == strcmp(nd->name, nd2->name)) {
                    found = true;
                    nd2->slots += nd->slots;
                    OBJ_RELEASE(item);
                    break;
                }
            }
            if (!found) {
                /* append the new node to our list */
                opal_list_append(&nds, item);
            }
        }
        /* cleanup */
        free(nodelist);
        free(tpn);
    }
    /* cleanup */
    opal_argv_free(alloc);
    OBJ_DESTRUCT(&ndtmp);
    if (NULL != dash_host) {
        tpn = opal_argv_join(dash_host, ',');
        orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, (void*)tpn, OPAL_STRING);
        opal_argv_free(dash_host);
        free(tpn);
    }

    if (opal_list_is_empty(&nds)) {
        /* if we get here, then we were able to contact slurm,
         * which means we are in an actively managed cluster.
         * However, slurm indicated that nothing is currently
         * available that meets our requirements. This is a fatal
         * situation - we do NOT have the option of running on
         * user-specified hosts as the cluster is managed.
         */
        OBJ_DESTRUCT(&nds);
        orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }

    /* store the found nodes */
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nds, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&nds);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }
    OBJ_DESTRUCT(&nds);

    /* default to no-oversubscribe-allowed for managed systems */
    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
    }
    /* flag that the allocation is managed */
    orte_managed_allocation = true;

    /* move the job along */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);

    /* all done */
    return;
}
static int orte_ras_localhost_allocate(orte_jobid_t jobid, opal_list_t *attributes)
{
    bool empty;
    int ret;
    opal_list_t nodes;
    orte_ras_node_t *node;
    opal_list_item_t *item;

    /* if the node segment is not empty, do nothing */
    if (ORTE_SUCCESS != (ret = orte_ras_base_node_segment_empty(&empty))) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }
    if (!empty) {
        opal_output(orte_ras_base.ras_output,
                    "orte:ras:localhost: node segment not empty; not doing anything");
        return ORTE_SUCCESS;
    }
    opal_output(orte_ras_base.ras_output,
                "orte:ras:localhost: node segment empty; adding \"localhost\"");

    /* Ok, the node segment is empty -- so add a localhost node */
    node = OBJ_NEW(orte_ras_node_t);
    if (NULL == node) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    /* use the same name we got in orte_system_info so we avoid confusion in
     * the session directories
     */
    node->node_name = strdup(orte_system_info.nodename);
    node->node_arch = NULL;
    node->node_state = ORTE_NODE_STATE_UP;
    /* JMS: this should not be hard-wired to 0, but there's no
     * other value to put it to [yet]...
     */
    node->node_cellid = 0;
    node->node_slots_inuse = 0;
    node->node_slots_max = 0;
    node->node_slots = 1;
    OBJ_CONSTRUCT(&nodes, opal_list_t);
    opal_list_append(&nodes, &node->super);

    /* put it on the segment and allocate it */
    if (ORTE_SUCCESS != (ret = orte_ras_base_node_insert(&nodes)) ||
        ORTE_SUCCESS != (ret = orte_ras_base_allocate_nodes(jobid, &nodes))) {
        goto cleanup;
    }

    /* now indicate that there is uncertainty about the number of slots here,
     * so the launcher should use knowledge of the local number of processors to
     * override any oversubscription flags
     */
    ret = orte_ras_base_set_oversubscribe_override(jobid);
    if (ORTE_SUCCESS != ret) {
        goto cleanup;
    }

cleanup:
    item = opal_list_remove_first(&nodes);
    OBJ_RELEASE(item);
    OBJ_DESTRUCT(&nodes);

    /* all done */
    return ret;
}
static int orte_ras_bjs_discover(
    opal_list_t* nodelist,
    orte_app_context_t** context,
    size_t num_context)
{
    char* nodes;
    char* nodes_copy;
    char* ptr;
    opal_list_item_t* item;
    opal_list_t new_nodes;
    int rc;

    /* query the nodelist from the registry */
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_query(nodelist))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* validate that any user-supplied nodes actually exist, etc. */
    item = opal_list_get_first(nodelist);
    while (item != opal_list_get_end(nodelist)) {
        opal_list_item_t* next = opal_list_get_next(item);
        int node_num;
        orte_ras_node_t* node = (orte_ras_node_t*)item;
        if (ORTE_SUCCESS != orte_ras_bjs_node_resolve(node->node_name, &node_num)) {
            opal_list_remove_item(nodelist, item);
            OBJ_DESTRUCT(item);
            item = next;
            continue;
        }
        if (orte_ras_bjs_node_state(node_num) != ORTE_NODE_STATE_UP) {
            opal_list_remove_item(nodelist, item);
            OBJ_DESTRUCT(item);
            item = next;
            continue;
        }
        if (bproc_access(node_num, BPROC_X_OK) != 0) {
            opal_list_remove_item(nodelist, item);
            OBJ_DESTRUCT(item);
            item = next;
            continue;
        }
        /* try and determine the number of available slots */
        if (node->node_slots == 0) {
            node->node_slots = orte_ras_bjs_node_slots(node->node_name);
        }
        item = next;
    }

    /* parse the node list and check node status/access */
    nodes = getenv("NODES");
    if (NULL == nodes) {
        return ORTE_ERR_NOT_AVAILABLE;
    }
    /* strsep() modifies the string it walks, so work on a copy rather
     * than on the buffer returned by getenv() */
    nodes_copy = strdup(nodes);
    if (NULL == nodes_copy) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    nodes = nodes_copy;

    OBJ_CONSTRUCT(&new_nodes, opal_list_t);
    while (NULL != (ptr = strsep(&nodes, ","))) {
        orte_ras_node_t *node;
        orte_node_state_t node_state;
        int node_num;

        /* is this node already in the list? */
        for (item = opal_list_get_first(nodelist);
             item != opal_list_get_end(nodelist);
             item = opal_list_get_next(item)) {
            node = (orte_ras_node_t*)item;
            if (strcmp(node->node_name, ptr) == 0) {
                break;
            }
        }
        if (item != opal_list_get_end(nodelist)) {
            continue;
        }
        if (sscanf(ptr, "%d", &node_num) != 1) {
            continue;
        }
        if (ORTE_NODE_STATE_UP != (node_state = orte_ras_bjs_node_state(node_num))) {
            opal_output(0, "error: a specified node (%d) is not up.\n", node_num);
            rc = ORTE_ERROR;
            goto cleanup;
        }
        if (bproc_access(node_num, BPROC_X_OK) != 0) {
            opal_output(0, "error: a specified node (%d) is not accessible.\n", node_num);
            rc = ORTE_ERROR;
            goto cleanup;
        }

        /* create a new node entry */
        node = OBJ_NEW(orte_ras_node_t);
        node->node_name = strdup(ptr);
        node->node_state = node_state;
        node->node_slots = orte_ras_bjs_node_slots(node->node_name);
        opal_list_append(&new_nodes, &node->super);
    }

    /* add any newly discovered nodes to the registry */
    if (opal_list_get_size(&new_nodes)) {
        rc = orte_ras_base_node_insert(&new_nodes);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }
    }

    /* append them to the nodelist */
    while (NULL != (item = opal_list_remove_first(&new_nodes))) {
        opal_list_append(nodelist, item);
    }

cleanup:
    free(nodes_copy);
    OBJ_DESTRUCT(&new_nodes);
    return rc;
}