static void recv_data(int fd, short args, void *cbdata)
{
    bool found;
    int i, rc;
    orte_node_t *nd, *nd2;
    opal_list_t nds, ndtmp;
    opal_list_item_t *item, *itm;
    char recv_msg[8192];
    int nbytes, idx, sjob;
    char **alloc, *nodelist, *tpn;
    local_jobtracker_t *ptr, *jtrk;
    local_apptracker_t *aptrk;
    orte_app_context_t *app;
    orte_jobid_t jobid;
    orte_job_t *jdata;
    char **dash_host = NULL;

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation - data recvd",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* read the data from the socket and put it in the
     * nodes field of op
     */
    memset(recv_msg, 0, sizeof(recv_msg));
    nbytes = read(fd, recv_msg, sizeof(recv_msg) - 1);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation msg: %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg);

    /* check if we got something */
    if (0 == nbytes || 0 == strlen(recv_msg) || NULL != strstr(recv_msg, "failure")) {
        /* show an error here - basically, a "nothing was available"
         * message
         */
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true,
                       (0 == strlen(recv_msg)) ? "NO MSG" : recv_msg);
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        return;
    }

    /* break the message into its component parts, separated by colons */
    alloc = opal_argv_split(recv_msg, ':');

    /* the first section contains the ORTE jobid for this allocation */
    tpn = strchr(alloc[0], '=');
    orte_util_convert_string_to_jobid(&jobid, tpn+1);
    /* get the corresponding job object */
    jdata = orte_get_job_data_object(jobid);
    jtrk = NULL;
    /* find the associated tracking object */
    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = opal_list_get_next(item)) {
        ptr = (local_jobtracker_t*)item;
        if (ptr->jobid == jobid) {
            jtrk = ptr;
            break;
        }
    }
    if (NULL == jtrk) {
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, "NO JOB TRACKER");
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        opal_argv_free(alloc);
        return;
    }

    /* stop the timeout event */
    opal_event_del(&jtrk->timeout_ev);

    /* cycle across all the remaining parts - each is the allocation for
     * an app in this job
     */
    OBJ_CONSTRUCT(&nds, opal_list_t);
    OBJ_CONSTRUCT(&ndtmp, opal_list_t);
    idx = -1;
    sjob = -1;
    nodelist = NULL;
    tpn = NULL;
    for (i=1; NULL != alloc[i]; i++) {
        if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            if (NULL != nodelist) {
                free(nodelist);
            }
            if (NULL != tpn) {
                free(tpn);
            }
            return;
        }
        if (idx < 0) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        /* release the current dash_host as that contained the *desired* allocation */
        orte_remove_attribute(&app->attributes, ORTE_APP_DASH_HOST);
        /* track the Slurm jobid */
        if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) {
            aptrk = OBJ_NEW(local_apptracker_t);
            opal_pointer_array_set_item(&jtrk->apps, idx, aptrk);
        }
        aptrk->sjob = sjob;
        /* since the nodelist/tpn may contain regular expressions, parse them */
        if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) {
            ORTE_ERROR_LOG(rc);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        /* transfer the discovered nodes to our node list, and construct
         * the new dash_host entry to match what was allocated
         */
        while (NULL != (item = opal_list_remove_first(&ndtmp))) {
            nd = (orte_node_t*)item;
            opal_argv_append_nosize(&dash_host, nd->name);
            /* check for duplicates */
            found = false;
            for (itm = opal_list_get_first(&nds);
                 itm != opal_list_get_end(&nds);
                 itm = opal_list_get_next(itm)) {
                nd2 = (orte_node_t*)itm;
                if (0 == strcmp(nd->name, nd2->name)) {
                    found = true;
                    nd2->slots += nd->slots;
                    OBJ_RELEASE(item);
                    break;
                }
            }
            if (!found) {
                /* append the new node to our list */
                opal_list_append(&nds, item);
            }
        }
        /* cleanup */
        free(nodelist);
        free(tpn);
    }
    /* cleanup */
    opal_argv_free(alloc);
    OBJ_DESTRUCT(&ndtmp);

    if (NULL != dash_host) {
        tpn = opal_argv_join(dash_host, ',');
        orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, (void*)tpn, OPAL_STRING);
        opal_argv_free(dash_host);
        free(tpn);
    }

    if (opal_list_is_empty(&nds)) {
        /* if we get here, then we were able to contact slurm,
         * which means we are in an actively managed cluster.
         * However, slurm indicated that nothing is currently
         * available that meets our requirements. This is a fatal
         * situation - we do NOT have the option of running on
         * user-specified hosts as the cluster is managed.
         */
        OBJ_DESTRUCT(&nds);
        orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        /* the node list has been destructed - do not fall through */
        return;
    }

    /* store the found nodes */
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nds, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&nds);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }
    OBJ_DESTRUCT(&nds);

    /* default to no-oversubscribe-allowed for managed systems */
    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
    }
    /* flag that the allocation is managed */
    orte_managed_allocation = true;
    /* move the job along */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);
    /* all done */
    return;
}
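/* For reference, a minimal standalone sketch of the message parsing done in
 * recv_data() above.  The reply layout assumed here ("jobid=<id>" in the
 * first colon-separated field, one per-app allocation in each later field)
 * is inferred from how recv_data()/parse_alloc_msg() consume the string,
 * not from any Slurm documentation; the helper name is made up and the code
 * avoids the OPAL argv utilities so it compiles on its own. */
#include <stdio.h>
#include <string.h>

static void sketch_parse_alloc_reply(char *msg)
{
    char *save = NULL;

    /* first field carries the ORTE jobid after the '=' */
    char *field = strtok_r(msg, ":", &save);
    if (NULL == field) {
        return;
    }
    char *eq = strchr(field, '=');
    printf("jobid string: %s\n", (NULL == eq) ? "(malformed)" : eq + 1);

    /* each remaining field is the allocation for one app context */
    while (NULL != (field = strtok_r(NULL, ":", &save))) {
        printf("per-app allocation: %s\n", field);
    }
}
/* e.g. passing a writable copy of the hypothetical text
 * "jobid=1234:alloc-for-app-0:alloc-for-app-1" prints the jobid "1234"
 * followed by the two per-app fields, mirroring the loop over alloc[i]. */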
/**
 * Discover available (pre-allocated) nodes. Allocate the
 * requested number of nodes/process slots to the job.
 *
 */
static int orte_ras_slurm_allocate(opal_list_t *nodes)
{
    int ret, cpus_per_task;
    char *slurm_node_str, *regexp;
    char *tasks_per_node, *node_tasks;
    char *tmp;
    char *slurm_jobid;

    slurm_jobid = getenv("SLURM_JOBID");
    /* don't need to check this for NULL as we wouldn't
     * have been selected if it wasn't already found
     *
     * save that value in the global job ident string for
     * later use in any error reporting
     */
    orte_job_ident = strdup(slurm_jobid);

    slurm_node_str = getenv("SLURM_NODELIST");
    if (NULL == slurm_node_str) {
        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                       "SLURM_NODELIST");
        return ORTE_ERR_NOT_FOUND;
    }
    regexp = strdup(slurm_node_str);

    tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
    if (NULL == tasks_per_node) {
        /* couldn't find any version - abort */
        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                       "SLURM_TASKS_PER_NODE");
        return ORTE_ERR_NOT_FOUND;
    }
    node_tasks = strdup(tasks_per_node);

    if (NULL == regexp || NULL == node_tasks) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* get the number of CPUs per task that the user provided to slurm */
    tmp = getenv("SLURM_CPUS_PER_TASK");
    if (NULL != tmp) {
        cpus_per_task = atoi(tmp);
        if (0 >= cpus_per_task) {
            opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
                        "Variable was: %s\n", tmp);
            ORTE_ERROR_LOG(ORTE_ERROR);
            return ORTE_ERROR;
        }
    } else {
        cpus_per_task = 1;
    }

    ret = orte_ras_slurm_discover(regexp, node_tasks, cpus_per_task, nodes);
    free(regexp);
    free(node_tasks);
    if (ORTE_SUCCESS != ret) {
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                             "%s ras:slurm:allocate: discover failed!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ret;
    }

    /* All done */
    if (ORTE_SUCCESS == ret) {
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                             "%s ras:slurm:allocate: success",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    } else {
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                             "%s ras:slurm:allocate: failure (base_allocate_nodes=%d)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ret));
    }
    return ret;
}
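/* SLURM_TASKS_PER_NODE, consumed above, arrives in Slurm's compressed
 * run-length form, e.g. "2(x3),1" for three nodes with 2 slots each plus
 * one node with 1 slot.  A minimal sketch of expanding that form is shown
 * below; it only illustrates what orte_ras_slurm_discover() has to undo,
 * it is not the actual discover code, and the function name is made up. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void sketch_expand_tasks_per_node(const char *spec)
{
    char *copy = strdup(spec), *save = NULL;
    if (NULL == copy) {
        return;
    }
    for (char *tok = strtok_r(copy, ",", &save); NULL != tok;
         tok = strtok_r(NULL, ",", &save)) {
        int slots = atoi(tok);              /* leading slot count */
        int repeat = 1;
        char *rep = strstr(tok, "(x");      /* optional "(xN)" repeat factor */
        if (NULL != rep) {
            repeat = atoi(rep + 2);
        }
        for (int i = 0; i < repeat; i++) {
            printf("slots on next node: %d\n", slots);
        }
    }
    free(copy);
}
/* sketch_expand_tasks_per_node("2(x3),1") prints 2, 2, 2, 1 - one entry
 * per allocated node, in nodelist order. */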
/**
 * Discover available (pre-allocated) nodes. Allocate the
 * requested number of nodes/process slots to the job.
 *
 */
static int orte_ras_slurm_allocate(orte_job_t *jdata, opal_list_t *nodes)
{
    int ret, cpus_per_task;
    char *slurm_node_str, *regexp;
    char *tasks_per_node, *node_tasks;
    char *tmp;
    char *slurm_jobid;

    if (NULL == (slurm_jobid = getenv("SLURM_JOBID"))) {
        /* we are not in a slurm allocation - see if dyn alloc
         * is enabled
         */
        if (!mca_ras_slurm_component.dyn_alloc_enabled) {
            /* nope - nothing we can do */
            opal_output_verbose(2, orte_ras_base_framework.framework_output,
                                "%s ras:slurm: no prior allocation and dynamic alloc disabled",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    } else {
        /* save this value in the global job ident string for
         * later use in any error reporting
         */
        orte_job_ident = strdup(slurm_jobid);
    }

    slurm_node_str = getenv("SLURM_NODELIST");
    if (NULL == slurm_node_str) {
        /* see if dynamic allocation is enabled */
        if (mca_ras_slurm_component.dyn_alloc_enabled) {
            /* attempt to get the allocation - the function
             * dyn_allocate will return ORTE_ERR_ALLOCATION_PENDING
             * if it succeeds in sending the allocation request
             */
            ret = dyn_allocate(jdata);
            /* return to the layer above in ras/base/ras_base_allocate.c
             * to wait for the (libevent) event to happen
             */
            return ret;
        }
        orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                       "SLURM_NODELIST");
        return ORTE_ERR_NOT_FOUND;
    }
    regexp = strdup(slurm_node_str);
    if (NULL == regexp) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    if (mca_ras_slurm_component.use_all) {
        /* this is an oddball case required for debug situations where
         * a tool is started that will then call mpirun. In this case,
         * Slurm will assign only 1 task per node to the tool, but
         * we want mpirun to use the entire allocation. They don't give
         * us a specific variable for this purpose, so we have to fudge
         * a bit - but this is a special edge case, and we'll live with it
         */
        tasks_per_node = getenv("SLURM_JOB_CPUS_PER_NODE");
        if (NULL == tasks_per_node) {
            /* couldn't find any version - abort */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                           "SLURM_JOB_CPUS_PER_NODE");
            free(regexp);
            return ORTE_ERR_NOT_FOUND;
        }
        node_tasks = strdup(tasks_per_node);
        if (NULL == node_tasks) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(regexp);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        cpus_per_task = 1;
    } else {
        /* get the number of process slots we were assigned on each node */
        tasks_per_node = getenv("SLURM_TASKS_PER_NODE");
        if (NULL == tasks_per_node) {
            /* couldn't find any version - abort */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-not-found", 1,
                           "SLURM_TASKS_PER_NODE");
            free(regexp);
            return ORTE_ERR_NOT_FOUND;
        }
        node_tasks = strdup(tasks_per_node);
        if (NULL == node_tasks) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            free(regexp);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* get the number of CPUs per task that the user provided to slurm */
        tmp = getenv("SLURM_CPUS_PER_TASK");
        if (NULL != tmp) {
            cpus_per_task = atoi(tmp);
            if (0 >= cpus_per_task) {
                opal_output(0, "ras:slurm:allocate: Got bad value from SLURM_CPUS_PER_TASK. "
                            "Variable was: %s\n", tmp);
                ORTE_ERROR_LOG(ORTE_ERROR);
                free(node_tasks);
                free(regexp);
                return ORTE_ERROR;
            }
        } else {
            cpus_per_task = 1;
        }
    }

    ret = orte_ras_slurm_discover(regexp, node_tasks, nodes);
    free(regexp);
    free(node_tasks);
    if (ORTE_SUCCESS != ret) {
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:slurm:allocate: discover failed!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ret;
    }
    /* record the number of allocated nodes */
    orte_num_allocated_nodes = opal_list_get_size(nodes);

    /* All done */
    OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                         "%s ras:slurm:allocate: success",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return ORTE_SUCCESS;
}