/* Spawn health check function for every node that is not DOWN */ extern void run_health_check(void) { #ifdef HAVE_FRONT_END front_end_record_t *front_end_ptr; #else struct node_record *node_ptr; #endif int i; char *host_str = NULL; agent_arg_t *check_agent_args = NULL; check_agent_args = xmalloc (sizeof (agent_arg_t)); check_agent_args->msg_type = REQUEST_HEALTH_CHECK; check_agent_args->retry = 0; check_agent_args->hostlist = hostlist_create(""); if (check_agent_args->hostlist == NULL) fatal("hostlist_create: malloc failure"); #ifdef HAVE_FRONT_END for (i = 0, front_end_ptr = front_end_nodes; i < front_end_node_cnt; i++, front_end_ptr++) { if (IS_NODE_NO_RESPOND(front_end_ptr)) continue; hostlist_push(check_agent_args->hostlist, front_end_ptr->name); check_agent_args->node_count++; } #else for (i=0, node_ptr=node_record_table_ptr; i<node_record_count; i++, node_ptr++) { if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr)) continue; hostlist_push(check_agent_args->hostlist, node_ptr->name); check_agent_args->node_count++; } #endif if (check_agent_args->node_count == 0) { hostlist_destroy(check_agent_args->hostlist); xfree (check_agent_args); } else { hostlist_uniq(check_agent_args->hostlist); host_str = hostlist_ranged_string_xmalloc( check_agent_args->hostlist); debug("Spawning health check agent for %s", host_str); xfree(host_str); ping_begin(); agent_queue_request(check_agent_args); } }
/* Update acct_gather data for every node that is not DOWN */ extern void update_nodes_acct_gather_data(void) { #ifdef HAVE_FRONT_END front_end_record_t *front_end_ptr; #else struct node_record *node_ptr; #endif int i; char *host_str = NULL; agent_arg_t *agent_args = NULL; agent_args = xmalloc (sizeof (agent_arg_t)); agent_args->msg_type = REQUEST_ACCT_GATHER_UPDATE; agent_args->retry = 0; agent_args->hostlist = hostlist_create(""); if (agent_args->hostlist == NULL) fatal("hostlist_create: malloc failure"); #ifdef HAVE_FRONT_END for (i = 0, front_end_ptr = front_end_nodes; i < front_end_node_cnt; i++, front_end_ptr++) { if (IS_NODE_NO_RESPOND(front_end_ptr)) continue; hostlist_push(agent_args->hostlist, front_end_ptr->name); agent_args->node_count++; } #else for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count; i++, node_ptr++) { if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr)) continue; hostlist_push(agent_args->hostlist, node_ptr->name); agent_args->node_count++; } #endif if (agent_args->node_count == 0) { hostlist_destroy(agent_args->hostlist); xfree (agent_args); } else { hostlist_uniq(agent_args->hostlist); host_str = hostlist_ranged_string_xmalloc(agent_args->hostlist); if (slurmctld_conf.debug_flags & DEBUG_FLAG_ENERGY) info("Updating acct_gather data for %s", host_str); xfree(host_str); ping_begin(); agent_queue_request(agent_args); } }
/* Scan a thread's per-node results and push every host that failed
 * with DSH_NO_RESP onto the retry hostlist.  Increments *spot per
 * host added; returns 1 once *spot reaches count, else 0. */
static int _setup_requeue(agent_arg_t *agent_arg_ptr, thd_t *thread_ptr,
			  int count, int *spot)
{
	ret_data_info_t *ret_data_info = NULL;
	ListIterator itr = list_iterator_create(thread_ptr->ret_list);
	int filled = 0;

	while (!filled && (ret_data_info = list_next(itr))) {
		debug2("got err of %d", ret_data_info->err);
		if (ret_data_info->err != DSH_NO_RESP)
			continue;

		debug("got the name %s to resend out of %d",
		      ret_data_info->node_name, count);

		if (!agent_arg_ptr)
			continue;

		hostlist_push(agent_arg_ptr->hostlist,
			      ret_data_info->node_name);
		if (++(*spot) == count)
			filled = 1;	/* collected everything requested */
	}
	list_iterator_destroy(itr);

	return filled;
}
/* Build a hostlist by running the external "nodeattr" genders utility
 * and collecting one target per output line.
 * attr - genders attribute to query
 * iopt - when non-zero, omit the "r" flag from the nodeattr options
 * Returns a newly created hostlist_t; caller owns it.
 * NOTE(review): the "%p" in err/errx format strings appears to be a
 * program-name escape handled by this project's err() implementation,
 * not a printf pointer conversion — confirm against the err() source. */
static hostlist_t _read_genders_attr(char *attr, int iopt)
{
	FILE *f;
	hostlist_t hl = hostlist_create(NULL);
	char cmd[LINEBUFSIZE];
	char buf[LINEBUFSIZE];

	/*
	 * xpopen sets uid back to real user id, so it is ok to use
	 * "nodeattr" from user's path here
	 */
	snprintf(cmd, sizeof(cmd), "%s %s%s -%sn %s",
		 _PATH_NODEATTR, gfile ? "-f " : "",
		 gfile ? gfile : "", iopt ? "" : "r", attr);

	f = xpopen(cmd, "r");
	if (f == NULL)
		errx("%p: error running \"%s\"\n", _PATH_NODEATTR);
	while (fgets(buf, LINEBUFSIZE, f) != NULL) {
		/* strip whitespace/newline before parsing the target */
		xstrcln(buf, NULL);
		if (hostlist_push(hl, buf) <= 0)
			err("%p: warning: target `%s' not parsed\n", buf);
	}
	/* non-zero exit status from nodeattr is fatal */
	if (xpclose(f) != 0)
		errx("%p: error running nodeattr\n");

	return hl;
}
/* Build a de-duplicated hostlist covering all hosts of the given
 * PMIx process ranges.
 * procs/nprocs - array of PMIx process descriptors
 * hl_out       - on success receives a newly created hostlist_t
 *                (caller owns it)
 * Returns SLURM_SUCCESS, or SLURM_ERROR if any namespace lookup fails
 * (in which case the partial hostlist is destroyed). */
static int _hostset_from_ranges(const pmix_proc_t *procs, size_t nprocs,
				hostlist_t *hl_out)
{
	size_t i;	/* was int: avoid signed/unsigned compare vs nprocs */
	hostlist_t hl = hostlist_create("");
	pmixp_namespace_t *nsptr = NULL;

	for (i = 0; i < nprocs; i++) {
		char *node = NULL;
		hostlist_t tmp;

		nsptr = pmixp_nspaces_find(procs[i].nspace);
		if (NULL == nsptr) {
			goto err_exit;
		}
		if (procs[i].rank == PMIX_RANK_WILDCARD) {
			/* wildcard rank: all hosts of the namespace */
			tmp = hostlist_copy(nsptr->hl);
		} else {
			tmp = pmixp_nspace_rankhosts(nsptr, &procs[i].rank, 1);
		}
		/* drain tmp into the accumulating hostlist */
		while (NULL != (node = hostlist_pop(tmp))) {
			hostlist_push(hl, node);
			free(node);
		}
		hostlist_destroy(tmp);
	}
	hostlist_uniq(hl);
	*hl_out = hl;
	return SLURM_SUCCESS;
err_exit:
	hostlist_destroy(hl);
	return SLURM_ERROR;
}
/*
 * bitmap2node_name_sortable - given a bitmap, build a list of comma
 *	separated node names. names may include regular expressions
 *	(e.g. "lx[01-10]")
 * IN bitmap - bitmap pointer
 * IN sort   - returned sorted list or not
 * RET pointer to node list or NULL on error
 * globals: node_record_table_ptr - pointer to node table
 * NOTE: the caller must xfree the memory at node_list when no longer required
 */
char * bitmap2node_name_sortable (bitstr_t *bitmap, bool sort)
{
	hostlist_t hl;
	char *names;
	int bit, first_bit, last_bit;

	if (!bitmap)
		return xstrdup("");

	first_bit = bit_ffs(bitmap);
	if (first_bit == -1)		/* no bits set at all */
		return xstrdup("");
	last_bit = bit_fls(bitmap);

	/* collect the name of every set bit's node */
	hl = hostlist_create("");
	for (bit = first_bit; bit <= last_bit; bit++) {
		if (!bit_test(bitmap, bit))
			continue;
		hostlist_push(hl, node_record_table_ptr[bit].name);
	}
	if (sort)
		hostlist_sort(hl);
	names = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);

	return names;
}
/*
 * _queue_agent_retry - Queue any failed RPCs for later replay
 * IN agent_info_ptr - pointer to info on completed agent requests
 * IN count - number of agent requests which failed, count to requeue
 */
static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count)
{
	agent_arg_t *agent_arg_ptr;
	queued_request_t *queued_req_ptr = NULL;
	thd_t *thread_ptr = agent_info_ptr->thread_struct;
	int i, j;

	if (count == 0)
		return;

	/* build agent argument with just the RPCs to retry */
	agent_arg_ptr = xmalloc(sizeof(agent_arg_t));
	agent_arg_ptr->node_count = count;
	agent_arg_ptr->retry = 1;
	agent_arg_ptr->hostlist = hostlist_create("");
	agent_arg_ptr->msg_type = agent_info_ptr->msg_type;
	/* take ownership of the message arguments from the old request */
	agent_arg_ptr->msg_args = *(agent_info_ptr->msg_args_pptr);
	*(agent_info_ptr->msg_args_pptr) = NULL;

	/* j counts hosts collected; stop as soon as count is reached */
	j = 0;
	for (i = 0; i < agent_info_ptr->thread_count; i++) {
		if (!thread_ptr[i].ret_list) {
			/* no per-node results: requeue the whole thread's
			 * nodelist if it got no response */
			if (thread_ptr[i].state != DSH_NO_RESP)
				continue;

			debug("got the name %s to resend",
			      thread_ptr[i].nodelist);
			hostlist_push(agent_arg_ptr->hostlist,
				      thread_ptr[i].nodelist);

			if ((++j) == count)
				break;
		} else {
			/* per-node results exist: requeue only the hosts
			 * that individually failed with DSH_NO_RESP */
			if (_setup_requeue(agent_arg_ptr, &thread_ptr[i],
					   count, &j))
				break;
		}
	}

	if (count != j) {
		/* fewer failures found than expected; adjust node_count
		 * so the retry request is still internally consistent */
		error("agent: Retry count (%d) != actual count (%d)",
		      count, j);
		agent_arg_ptr->node_count = j;
	}
	debug2("Queue RPC msg_type=%u, nodes=%d for retry",
	       agent_arg_ptr->msg_type, j);

	/* add the request to a list */
	queued_req_ptr = xmalloc(sizeof(queued_request_t));
	queued_req_ptr->agent_arg_ptr = agent_arg_ptr;
	queued_req_ptr->last_attempt = time(NULL);
	slurm_mutex_lock(&retry_mutex);
	if (retry_list == NULL) {
		retry_list = list_create(_list_delete_retry);
		if (retry_list == NULL)
			fatal("list_create failed");
	}
	if (list_append(retry_list, (void *) queued_req_ptr) == 0)
		fatal("list_append failed");
	slurm_mutex_unlock(&retry_mutex);
}
/* Dump the state of all visible nodes into a '#'-separated string.
 * node_cnt    - OUT: number of records written
 * update_time - passed through to _dump_node()/_same_info()
 * Returns an xmalloc'd buffer (caller frees) or NULL if nothing dumped.
 * When use_host_exp == 2, consecutive nodes whose info compares equal
 * (per _same_info()) are coalesced into one record with a host range. */
static char * _dump_all_nodes(int *node_cnt, time_t update_time)
{
	int i, cnt = 0, rc;
	struct node_record *node_ptr = node_record_table_ptr;
	char *tmp_buf = NULL, *buf = NULL;
	struct node_record *uniq_node_ptr = NULL;
	hostlist_t hl = NULL;

	for (i = 0; i < node_record_count; i++, node_ptr++) {
		if (node_ptr->name == NULL)
			continue;
		if (IS_NODE_FUTURE(node_ptr))
			continue;
		if (_hidden_node(node_ptr))
			continue;
		if (use_host_exp == 2) {
			rc = _same_info(uniq_node_ptr, node_ptr, update_time);
			if (rc == 0) {
				/* same info as the running group: just add
				 * this node's name and keep accumulating */
				uniq_node_ptr = node_ptr;
				if (hl) {
					hostlist_push(hl, node_ptr->name);
				} else {
					hl = hostlist_create(node_ptr->name);
					if (hl == NULL)
						fatal("malloc failure");
				}
				continue;
			} else {
				/* info changed: flush the previous group,
				 * then start a new one with this node */
				tmp_buf = _dump_node(uniq_node_ptr, hl,
						     update_time);
				hostlist_destroy(hl);
				hl = hostlist_create(node_ptr->name);
				if (hl == NULL)
					fatal("malloc failure");
				uniq_node_ptr = node_ptr;
			}
		} else {
			/* no coalescing: one record per node (hl is NULL) */
			tmp_buf = _dump_node(node_ptr, hl, update_time);
		}
		if (cnt > 0)
			xstrcat(buf, "#");	/* record separator */
		xstrcat(buf, tmp_buf);
		xfree(tmp_buf);
		cnt++;
	}
	if (hl) {
		/* flush the final pending group */
		tmp_buf = _dump_node(uniq_node_ptr, hl, update_time);
		hostlist_destroy(hl);
		if (cnt > 0)
			xstrcat(buf, "#");
		xstrcat(buf, tmp_buf);
		xfree(tmp_buf);
		cnt++;
	}
	*node_cnt = cnt;

	return buf;
}
/* Append nodes to hl, creating the hostlist on first use.
 * Returns the (possibly newly created) hostlist. */
static hostlist_t _hl_append (hostlist_t hl, char *nodes)
{
	if (hl != NULL) {
		hostlist_push (hl, nodes);
		return (hl);
	}
	return (hostlist_create (nodes));
}
/* Return task list in Moab format 2: tux[0-1]*2:tux2
 * Walks the job's allocated nodes in bitmap order; consecutive nodes
 * with the same per-node task count are grouped into one hostlist
 * record which _append_hl_buf() renders as "range*reps".
 * Returns an xmalloc'd buffer (built up by _append_hl_buf) or NULL. */
static char * _task_list_exp(struct job_record *job_ptr)
{
	int i, node_inx = 0, reps = -1, task_cnt;
	char *buf = NULL, *host;
	hostlist_t hl_tmp = (hostlist_t) NULL;
	job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;

	xassert(job_resrcs_ptr);
	for (i = 0; i < job_resrcs_ptr->nhosts; i++) {
		if (i == 0) {
			xassert(job_resrcs_ptr->cpus &&
				job_resrcs_ptr->node_bitmap);
			node_inx = bit_ffs(job_resrcs_ptr->node_bitmap);
		} else {
			/* advance to the next set bit in the node bitmap */
			for (node_inx++; node_inx < node_record_count;
			     node_inx++) {
				if (bit_test(job_resrcs_ptr->node_bitmap,
					     node_inx))
					break;
			}
			if (node_inx >= node_record_count) {
				error("Improperly formed job_resrcs for %u",
				      job_ptr->job_id);
				break;
			}
		}
		host = node_record_table_ptr[node_inx].name;

		/* tasks on this node = cpus / cpus_per_task (when set) */
		task_cnt = job_resrcs_ptr->cpus[i];
		if (job_ptr->details && job_ptr->details->cpus_per_task)
			task_cnt /= job_ptr->details->cpus_per_task;
		if (task_cnt < 1) {
			error("Invalid task_cnt for job %u on node %s",
			      job_ptr->job_id, host);
			task_cnt = 1;
		}
		if (reps == task_cnt) {
			/* append to existing hostlist record */
			if (hostlist_push(hl_tmp, host) == 0)
				error("hostlist_push failure");
		} else {
			/* task count changed: flush the previous group */
			if (hl_tmp)
				_append_hl_buf(&buf, &hl_tmp, &reps);
			/* start new hostlist record */
			hl_tmp = hostlist_create(host);
			if (hl_tmp)
				reps = task_cnt;
			else
				error("hostlist_create failure");
		}
	}
	/* flush the final pending group */
	if (hl_tmp)
		_append_hl_buf(&buf, &hl_tmp, &reps);
	return buf;
}
extern int basil_node_ranking(struct node_record *node_array, int node_cnt) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; int rank_count = 0, i; hostlist_t hl = hostlist_create(NULL); bool bad_node = 0; inv = get_full_inventory(version); if (inv == NULL) /* FIXME: should retry here if the condition is transient */ fatal("failed to get BASIL %s ranking", bv_names_long[version]); else if (!inv->batch_total) fatal("system has no usable batch compute nodes"); debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes", bv_names_long[version], inv->batch_avail, inv->batch_total); /* * Node ranking is based on a subset of the inventory: only nodes in * batch allocation mode which are up and not allocated. Assign a * 'NO_VAL' rank to all other nodes, which will translate as a very * high value, (unsigned)-2, to put those nodes last in the ranking. * The rest of the code must ensure that those nodes are never chosen. */ for (i = 0; i < node_cnt; i++) node_array[i].node_rank = NO_VAL; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char tmp[50]; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); bad_node = 1; } else node_ptr->node_rank = inv->nodes_total - rank_count++; sprintf(tmp, "nid%05u", node->node_id); hostlist_push(hl, tmp); } free_inv(inv); if (bad_node) { hostlist_sort(hl); char *name = hostlist_ranged_string_xmalloc(hl); info("It appears your slurm.conf nodelist doesn't " "match the alps system. Here are the nodes alps knows " "about\n%s", name); } hostlist_destroy(hl); return SLURM_SUCCESS; }
/* conffile callback: append a configured host entry to the global
 * host list; exits on hostlist parse failure. */
static int _cb_host (conffile_t cf, struct conffile_data *data,
                     char *optionname, int option_type, void *option_ptr,
                     int option_data, void *app_ptr, int app_data)
{
	if (hostlist_push (conf.hosts, data->string) == 0)
		err_exit ("hostlist_push: %s", strerror (errno));

	return (0);
}
/* Validate and add user-supplied node names to the inputted-nodes
 * list; exits on FQDN input or hostlist parse failure. */
static void _push_inputted_nodes (struct ipmidetect_arguments *cmd_args,
                                  const char *nodes)
{
	assert (cmd_args);
	assert (nodes);

	/* A '.' implies a fully qualified name, which is not accepted */
	if (strchr (nodes, '.') != NULL)
		err_exit ("nodes must be listed in short hostname format");

	if (hostlist_push (cmd_args->inputted_nodes, nodes) == 0)
		err_exit ("nodes improperly formatted");
}
/* Checked wrapper around hostlist_push(): validates arguments and
 * reports errno on failure.  Returns the number of hosts pushed. */
int wrap_hostlist_push(WRAPPERS_ARGS, hostlist_t hl, const char *host)
{
	int pushed;

	assert(file && function);

	if (!hl || !host)
		WRAPPERS_ERR_INVALID_PARAMETERS("hostlist_push");

	pushed = hostlist_push(hl, host);
	if (pushed == 0)
		WRAPPERS_ERR_ERRNO("hostlist_push");

	return pushed;
}
/* Build an agent_info_t for the given request: partition the request's
 * hostlist into per-thread nodelists using set_span(), so each agent
 * thread contacts one group of nodes (with message forwarding pushed
 * to slurmd where applicable).  Consumes names from
 * agent_arg_ptr->hostlist.  Returns an xmalloc'd agent_info_t. */
static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr)
{
	int i = 0, j = 0;
	agent_info_t *agent_info_ptr = NULL;
	thd_t *thread_ptr = NULL;
	int *span = NULL;
	int thr_count = 0;
	hostlist_t hl = NULL;
	char *name = NULL;

	agent_info_ptr = xmalloc(sizeof(agent_info_t));
	slurm_mutex_init(&agent_info_ptr->thread_mutex);
	if (pthread_cond_init(&agent_info_ptr->thread_cond, NULL))
		fatal("pthread_cond_init error %m");
	agent_info_ptr->thread_count = agent_arg_ptr->node_count;
	agent_info_ptr->retry = agent_arg_ptr->retry;
	agent_info_ptr->threads_active = 0;
	thread_ptr = xmalloc(agent_info_ptr->thread_count * sizeof(thd_t));
	memset(thread_ptr, 0, (agent_info_ptr->thread_count * sizeof(thd_t)));
	agent_info_ptr->thread_struct = thread_ptr;
	agent_info_ptr->msg_type = agent_arg_ptr->msg_type;
	agent_info_ptr->msg_args_pptr = &agent_arg_ptr->msg_args;
	/* For ordinary (non-srun, non-control) messages expect a reply and
	 * span the nodes across forwarding trees */
	if ((agent_arg_ptr->msg_type != REQUEST_JOB_NOTIFY)	&&
	    (agent_arg_ptr->msg_type != REQUEST_SHUTDOWN)	&&
	    (agent_arg_ptr->msg_type != REQUEST_RECONFIGURE)	&&
	    (agent_arg_ptr->msg_type != SRUN_EXEC)		&&
	    (agent_arg_ptr->msg_type != SRUN_TIMEOUT)		&&
	    (agent_arg_ptr->msg_type != SRUN_NODE_FAIL)		&&
	    (agent_arg_ptr->msg_type != SRUN_REQUEST_SUSPEND)	&&
	    (agent_arg_ptr->msg_type != SRUN_USER_MSG)		&&
	    (agent_arg_ptr->msg_type != SRUN_STEP_MISSING)	&&
	    (agent_arg_ptr->msg_type != SRUN_JOB_COMPLETE)) {
#ifdef HAVE_FRONT_END
		span = set_span(agent_arg_ptr->node_count,
				agent_arg_ptr->node_count);
#else
		/* Sending message to a possibly large number of slurmd.
		 * Push all message forwarding to slurmd in order to
		 * offload as much work from slurmctld as possible. */
		span = set_span(agent_arg_ptr->node_count, 1);
#endif
		agent_info_ptr->get_reply = true;
	} else {
		/* Message is going to one node (for srun) or we want
		 * it to get processed ASAP (SHUTDOWN or RECONFIGURE).
		 * Send the message directly to each node. */
		span = set_span(agent_arg_ptr->node_count,
				agent_arg_ptr->node_count);
	}
	i = 0;
	while (i < agent_info_ptr->thread_count) {
		thread_ptr[thr_count].state = DSH_NEW;
		thread_ptr[thr_count].addr = agent_arg_ptr->addr;
		name = hostlist_shift(agent_arg_ptr->hostlist);
		if (!name) {
			debug3("no more nodes to send to");
			break;
		}
		hl = hostlist_create(name);
		if (thread_ptr[thr_count].addr && span[thr_count]) {
			/* explicit address set: no forwarding possible,
			 * this thread contacts exactly one node */
			debug("warning: you will only be sending this to %s",
			      name);
			span[thr_count] = 0;
		}
		free(name);
		i++;
		/* add up to span[thr_count] more nodes to this thread */
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(agent_arg_ptr->hostlist);
			if (!name)
				break;
			hostlist_push(hl, name);
			free(name);
			i++;
		}
		hostlist_uniq(hl);
		thread_ptr[thr_count].nodelist =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
#if 0
		info("sending msg_type %u to nodes %s",
		     agent_arg_ptr->msg_type, thread_ptr[thr_count].nodelist);
#endif
		thr_count++;
	}
	xfree(span);
	/* shrink the count to the number of threads actually populated */
	agent_info_ptr->thread_count = thr_count;
	return agent_info_ptr;
}
/* * Read a SLURM hostfile specified by "filename". "filename" must contain * a list of SLURM NodeNames, one per line. Reads up to "n" number of hostnames * from the file. Returns a string representing a hostlist ranged string of * the contents of the file. This is a helper function, it does not * contact any SLURM daemons. * * Returns a string representing the hostlist. Returns NULL if there are fewer * than "n" hostnames in the file, or if an error occurs. If "n" == * NO_VAL then the entire file is read in * * Returned string must be freed with free(). */ char *slurm_read_hostfile(char *filename, int n) { FILE *fp = NULL; char in_line[BUFFER_SIZE]; /* input line */ int i, j; int line_size; int line_num = 0; hostlist_t hostlist = NULL; char *nodelist = NULL; if (filename == NULL || strlen(filename) == 0) return NULL; if ((fp = fopen(filename, "r")) == NULL) { error("slurm_allocate_resources error opening file %s, %m", filename); return NULL; } hostlist = hostlist_create(NULL); if (hostlist == NULL) { fclose(fp); return NULL; } while (fgets(in_line, BUFFER_SIZE, fp) != NULL) { line_num++; line_size = strlen(in_line); if (line_size == (BUFFER_SIZE - 1)) { error ("Line %d, of hostfile %s too long", line_num, filename); fclose (fp); hostlist_destroy(hostlist); return NULL; } for (i = 0; i < line_size; i++) { if (in_line[i] == '\n') { in_line[i] = '\0'; break; } if (in_line[i] == '\0') break; if (in_line[i] != '#') continue; if ((i > 0) && (in_line[i - 1] == '\\')) { for (j = i; j < line_size; j++) { in_line[j - 1] = in_line[j]; } line_size--; continue; } in_line[i] = '\0'; break; } hostlist_push(hostlist, in_line); if (n != (int)NO_VAL && hostlist_count(hostlist) == n) break; } fclose(fp); if (hostlist_count(hostlist) <= 0) { error("Hostlist is empty!"); goto cleanup_hostfile; } if (hostlist_count(hostlist) < n) { error("Too few NodeNames in SLURM Hostfile"); goto cleanup_hostfile; } nodelist = (char *)malloc(0xffff); if (!nodelist) { error("Nodelist xmalloc failed"); 
goto cleanup_hostfile; } if (hostlist_ranged_string(hostlist, 0xffff, nodelist) == -1) { error("Hostlist is too long for the allocate RPC!"); free(nodelist); nodelist = NULL; goto cleanup_hostfile; } debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist); cleanup_hostfile: hostlist_destroy(hostlist); return nodelist; }
/*
 * Create an srun job structure for a step w/out an allocation response msg.
 * (i.e. inside an allocation)
 * Applies the include (opt.nodelist) and exclude (opt.exc_nodes) node
 * filters against the allocation's node list, adjusts opt.min_nodes/
 * opt.max_nodes/ai->nnodes accordingly, then builds the job via
 * _job_create_structure().  Returns NULL on any validation error.
 */
srun_job_t *
job_step_create_allocation(resource_allocation_response_msg_t *resp)
{
	uint32_t job_id = resp->job_id;
	srun_job_t *job = NULL;
	allocation_info_t *ai = xmalloc(sizeof(*ai));
	hostlist_t hl = NULL;
	char *buf = NULL;
	int count = 0;
	uint32_t alloc_count = 0;

	ai->jobid = job_id;
	ai->stepid = NO_VAL;
	ai->nodelist = opt.alloc_nodelist;
	/* count the unique nodes in the allocation */
	hl = hostlist_create(ai->nodelist);
	hostlist_uniq(hl);
	alloc_count = hostlist_count(hl);
	ai->nnodes = alloc_count;
	hostlist_destroy(hl);

	if (opt.exc_nodes) {
		hostlist_t exc_hl = hostlist_create(opt.exc_nodes);
		hostlist_t inc_hl = NULL;
		char *node_name = NULL;

		hl = hostlist_create(ai->nodelist);
		if (opt.nodelist) {
			inc_hl = hostlist_create(opt.nodelist);
		}
		hostlist_uniq(hl);
		//info("using %s or %s", opt.nodelist, ai->nodelist);
		/* remove each excluded node; an excluded node may not also
		 * appear in the include list */
		while ((node_name = hostlist_shift(exc_hl))) {
			int inx = hostlist_find(hl, node_name);
			if (inx >= 0) {
				debug("excluding node %s", node_name);
				hostlist_delete_nth(hl, inx);
				ai->nnodes--;	/* decrement node count */
			}
			if (inc_hl) {
				inx = hostlist_find(inc_hl, node_name);
				if (inx >= 0) {
					error("Requested node %s is also "
					      "in the excluded list.",
					      node_name);
					error("Job not submitted.");
					/* NOTE(review): 'hl' (and node_name)
					 * appear to be leaked on this path --
					 * only exc_hl/inc_hl are destroyed;
					 * confirm before relying on it */
					hostlist_destroy(exc_hl);
					hostlist_destroy(inc_hl);
					goto error;
				}
			}
			free(node_name);
		}
		hostlist_destroy(exc_hl);

		/* we need to set this here so if there are more nodes
		 * available than we requested we can set it
		 * straight. If there is no exclude list then we set
		 * the vars then.
		 */
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if (opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if (!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if ((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;

		count = hostlist_count(hl);
		if (!count) {
			error("Hostlist is now nothing! Can't run job.");
			hostlist_destroy(hl);
			goto error;
		}
		if (inc_hl) {
			count = hostlist_count(inc_hl);
			if (count < ai->nnodes) {
				/* add more nodes to get correct number for
				   allocation */
				hostlist_t tmp_hl = hostlist_copy(hl);
				int i = 0;
				int diff = ai->nnodes - count;
				buf = hostlist_ranged_string_xmalloc(inc_hl);
				hostlist_delete(tmp_hl, buf);
				xfree(buf);
				while ((node_name = hostlist_shift(tmp_hl)) &&
				       (i < diff)) {
					hostlist_push(inc_hl, node_name);
					i++;
				}
				hostlist_destroy(tmp_hl);
			}
			buf = hostlist_ranged_string_xmalloc(inc_hl);
			hostlist_destroy(inc_hl);
			xfree(opt.nodelist);
			opt.nodelist = buf;
		} else {
			if (count > ai->nnodes) {
				/* remove more nodes than needed for
				   allocation */
				int i = 0;
				for (i = count; i > ai->nnodes; i--)
					hostlist_delete_nth(hl, i);
			}
			xfree(opt.nodelist);
			opt.nodelist = hostlist_ranged_string_xmalloc(hl);
		}

		hostlist_destroy(hl);
	} else {
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if (opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if (!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if ((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
	}

	/* get the correct number of hosts to run tasks on */
	if (opt.nodelist) {
		hl = hostlist_create(opt.nodelist);
		if (opt.distribution != SLURM_DIST_ARBITRARY)
			hostlist_uniq(hl);
		if (!hostlist_count(hl)) {
			error("Hostlist is now nothing! Can not run job.");
			hostlist_destroy(hl);
			goto error;
		}

		buf = hostlist_ranged_string_xmalloc(hl);
		count = hostlist_count(hl);
		hostlist_destroy(hl);
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
		xfree(opt.nodelist);
		opt.nodelist = buf;
	}

	/* arbitrary distribution requires exactly one host entry per task */
	if (opt.distribution == SLURM_DIST_ARBITRARY) {
		if (count != opt.ntasks) {
			error("You asked for %d tasks but specified %d nodes",
			      opt.ntasks, count);
			goto error;
		}
	}

	if (ai->nnodes == 0) {
		error("No nodes in allocation, can't run job");
		goto error;
	}

	ai->num_cpu_groups = resp->num_cpu_groups;
	ai->cpus_per_node = resp->cpus_per_node;
	ai->cpu_count_reps = resp->cpu_count_reps;

/* 	info("looking for %d nodes out of %s with a must list of %s", */
/* 	     ai->nnodes, ai->nodelist, opt.nodelist); */
	/*
	 * Create job
	 */
	job = _job_create_structure(ai);
error:
	xfree(ai);
	return (job);
}
/* Query live statistics for job step jobid.stepid on the given nodes and
 * print them via print_fields().  In params.pid_format mode one line is
 * printed per node inside the loop; otherwise per-node stats are
 * aggregated and printed once at the end.
 * Returns SLURM_SUCCESS or the slurm_job_step_stat() error code.
 * NOTE(review): 'job' and 'step' are file-scope records reused across
 * calls; step.nodes ownership after return is handled elsewhere --
 * confirm against the callers. */
int _do_stat(uint32_t jobid, uint32_t stepid, char *nodelist,
	     uint32_t req_cpufreq)
{
	job_step_stat_response_msg_t *step_stat_response = NULL;
	int rc = SLURM_SUCCESS;
	ListIterator itr;
	slurmdb_stats_t temp_stats;
	job_step_stat_t *step_stat = NULL;
	int ntasks = 0;
	int tot_tasks = 0;
	hostlist_t hl = NULL;

	debug("requesting info for job %u.%u", jobid, stepid);
	if ((rc = slurm_job_step_stat(jobid, stepid, nodelist,
				      &step_stat_response)) != SLURM_SUCCESS) {
		if (rc == ESLURM_INVALID_JOB_ID) {
			debug("job step %u.%u has already completed",
			      jobid, stepid);
		} else {
			error("problem getting step_layout for %u.%u: %s",
			      jobid, stepid, slurm_strerror(rc));
		}
		return rc;
	}

	memset(&job, 0, sizeof(slurmdb_job_rec_t));
	job.jobid = jobid;

	memset(&step, 0, sizeof(slurmdb_step_rec_t));

	memset(&temp_stats, 0, sizeof(slurmdb_stats_t));
	temp_stats.cpu_min = NO_VAL;
	memset(&step.stats, 0, sizeof(slurmdb_stats_t));
	step.stats.cpu_min = NO_VAL;

	step.job_ptr = &job;
	step.stepid = stepid;
	step.nodes = xmalloc(BUF_SIZE);
	step.req_cpufreq = req_cpufreq;
	step.stepname = NULL;
	step.state = JOB_RUNNING;
	hl = hostlist_create(NULL);
	itr = list_iterator_create(step_stat_response->stats_list);
	while ((step_stat = list_next(itr))) {
		if (!step_stat->step_pids || !step_stat->step_pids->node_name)
			continue;
		if (step_stat->step_pids->pid_cnt > 0) {
			int i;
			for (i = 0; i < step_stat->step_pids->pid_cnt; i++) {
				if (step.pid_str)
					xstrcat(step.pid_str, ",");
				xstrfmtcat(step.pid_str, "%u",
					   step_stat->step_pids->pid[i]);
			}
		}

		if (params.pid_format) {
			/* BUG FIX: copy the node name into our buffer
			 * instead of overwriting the pointer, which leaked
			 * the xmalloc'd buffer and left step.nodes pointing
			 * into the response message */
			snprintf(step.nodes, BUF_SIZE, "%s",
				 step_stat->step_pids->node_name);
			print_fields(&step);
			xfree(step.pid_str);
		} else {
			hostlist_push(hl, step_stat->step_pids->node_name);
			jobacctinfo_2_stats(&temp_stats, step_stat->jobacct);
			ntasks += step_stat->num_tasks;
			aggregate_stats(&step.stats, &temp_stats);
		}
	}
	list_iterator_destroy(itr);
	slurm_job_step_pids_response_msg_free(step_stat_response);
	/* we printed it out already */
	if (params.pid_format) {
		/* BUG FIX: the hostlist was leaked on this early return */
		hostlist_destroy(hl);
		return rc;
	}

	hostlist_sort(hl);
	hostlist_ranged_string(hl, BUF_SIZE, step.nodes);
	hostlist_destroy(hl);
	tot_tasks += ntasks;

	if (tot_tasks) {
		/* convert aggregated sums into per-task averages */
		step.stats.cpu_ave /= (double)tot_tasks;
		step.stats.rss_ave /= (double)tot_tasks;
		step.stats.vsize_ave /= (double)tot_tasks;
		step.stats.pages_ave /= (double)tot_tasks;
		step.stats.disk_read_ave /= (double)tot_tasks;
		step.stats.disk_write_ave /= (double)tot_tasks;
		step.stats.act_cpufreq /= (double)tot_tasks;
		step.ntasks = tot_tasks;
	}

	print_fields(&step);

	return rc;
}
/* Reserve ports for a job step
 * NOTE: We keep track of last port reserved and go round-robin through full
 *	 set of available ports. This helps avoid re-using busy ports when
 *	 restarting job steps.
 * RET SLURM_SUCCESS or an error code
 * On success, fills in step_ptr->resv_ports (xmalloc'd ranged string
 * without the surrounding brackets) and step_ptr->resv_port_array. */
extern int resv_port_alloc(struct step_record *step_ptr)
{
	int i, port_inx;
	int *port_array = NULL;
	char port_str[16], *tmp_str;
	hostlist_t hl;
	/* round-robin cursor persists across calls */
	static int last_port_alloc = 0;

	if (step_ptr->resv_port_cnt > port_resv_cnt) {
		info("step %u.%u needs %u reserved ports, but only %d exist",
		     step_ptr->job_ptr->job_id, step_ptr->step_id,
		     step_ptr->resv_port_cnt, port_resv_cnt);
		return ESLURM_PORTS_INVALID;
	}

	/* Identify available ports */
	port_array = xmalloc(sizeof(int) * step_ptr->resv_port_cnt);
	port_inx = 0;
	for (i = 0; i < port_resv_cnt; i++) {
		if (++last_port_alloc >= port_resv_cnt)
			last_port_alloc = 0;
		/* skip ports already in use on any of this step's nodes */
		if (bit_overlap(step_ptr->step_node_bitmap,
				port_resv_table[last_port_alloc]))
			continue;
		port_array[port_inx++] = last_port_alloc;
		if (port_inx >= step_ptr->resv_port_cnt)
			break;
	}
	if (port_inx < step_ptr->resv_port_cnt) {
		info("insufficient ports for step %u.%u to reserve (%d of %u)",
		     step_ptr->job_ptr->job_id, step_ptr->step_id,
		     port_inx, step_ptr->resv_port_cnt);
		xfree(port_array);
		return ESLURM_PORTS_BUSY;
	}

	/* Reserve selected ports */
	hl = hostlist_create(NULL);
	for (i = 0; i < port_inx; i++) {
		/* NOTE: We give the port a name like "[1234]" rather than
		 * just "1234" to avoid hostlists of the form "1[234-236]" */
		bit_or(port_resv_table[port_array[i]],
		       step_ptr->step_node_bitmap);
		/* convert table index to an actual port number */
		port_array[i] += port_resv_min;
		snprintf(port_str, sizeof(port_str), "[%d]", port_array[i]);
		hostlist_push(hl, port_str);
	}
	hostlist_sort(hl);
	step_ptr->resv_ports = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);
	step_ptr->resv_port_array = port_array;

	if (step_ptr->resv_ports[0] == '[') {
		/* Remove brackets from hostlist: copy starting past the
		 * leading '[' after NUL-ing the trailing ']'; tmp_str of
		 * size i (old strlen) is large enough for the i-1 bytes */
		i = strlen(step_ptr->resv_ports);
		step_ptr->resv_ports[i-1] = '\0';
		tmp_str = xmalloc(i);
		strcpy(tmp_str, step_ptr->resv_ports + 1);
		xfree(step_ptr->resv_ports);
		step_ptr->resv_ports = tmp_str;
	}
	debug("reserved ports %s for step %u.%u",
	      step_ptr->resv_ports,
	      step_ptr->job_ptr->job_id, step_ptr->step_id);

	return SLURM_SUCCESS;
}
/*
 * start_msg_tree  - logic to begin the forward tree and
 *                   accumulate the return codes from processes getting the
 *                   the forwarded message
 *
 * IN: hl          - hostlist_t   - list of every node to send message to
 * IN: msg         - slurm_msg_t  - message to send.
 * IN: timeout     - int          - how long to wait in milliseconds.
 * RET List 	   - List containing the responses of the childern
 *		     (if any) we forwarded the message to. List
 *		     containing type (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	int *span = NULL;
	fwd_tree_t *fwd_tree = NULL;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int j = 0, count = 0;
	List ret_list = NULL;
	char *name = NULL;
	int thr_count = 0;
	int host_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	span = set_span(host_count, 0);

	slurm_mutex_init(&tree_mutex);
	pthread_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	/* one detached thread per tree branch; each branch gets this
	 * shifted head node plus span[thr_count] more nodes */
	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		fwd_tree->orig_msg = msg;
		fwd_tree->ret_list = ret_list;
		fwd_tree->timeout = timeout;
		fwd_tree->notify = &notify;
		fwd_tree->tree_mutex = &tree_mutex;

		if (fwd_tree->timeout <= 0) {
			/* convert secs to msec */
			fwd_tree->timeout = slurm_get_msg_timeout() * 1000;
		}

		fwd_tree->tree_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(fwd_tree->tree_hl, name);
			free(name);
		}

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
		thr_count++;
	}
	xfree(span);

	/* wait until ret_list holds one entry per host; worker threads
	 * signal 'notify' as results arrive.
	 * NOTE(review): thr_count is not shared with the workers here, so
	 * completion is inferred solely from list_count(ret_list) reaching
	 * host_count -- confirm workers always add exactly one entry per
	 * host, otherwise this loop can block indefinitely */
	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while ((count < host_count)) {
		pthread_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	debug2("Tree head got them all");

	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	pthread_cond_destroy(&notify);

	return ret_list;
}
/*
 * forward_msg        - logic to forward a message which has been received and
 *                      accumulate the return codes from processes getting the
 *                      the forwarded message
 *
 * IN: forward_struct - forward_struct_t *   - holds information about message
 *                                             that needs to be forwarded to
 *                                             childern processes
 * IN: header         - header_t             - header from message that came in
 *                                             needing to be forwarded.
 * RET: SLURM_SUCCESS - int
 */
extern int forward_msg(forward_struct_t *forward_struct, header_t *header)
{
	int j = 0;
	int retries = 0;
	forward_msg_t *forward_msg = NULL;
	int thr_count = 0;
	int *span = set_span(header->forward.cnt, 0);
	hostlist_t hl = NULL;
	hostlist_t forward_hl = NULL;
	char *name = NULL;

	if (!forward_struct->ret_list) {
		error("didn't get a ret_list from forward_struct");
		xfree(span);
		return SLURM_ERROR;
	}
	hl = hostlist_create(header->forward.nodelist);
	hostlist_uniq(hl);

	/* one detached thread per forwarding branch; each branch gets
	 * the shifted head node plus span[thr_count] more nodes */
	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		char *buf = NULL;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		/* per-thread slot in the preallocated forward_msg array */
		forward_msg = &forward_struct->forward_msg[thr_count];
		forward_msg->ret_list = forward_struct->ret_list;
		forward_msg->timeout = forward_struct->timeout;
		if (forward_msg->timeout <= 0) {
			/* convert secs to msec */
			forward_msg->timeout = slurm_get_msg_timeout() * 1000;
		}
		forward_msg->notify = &forward_struct->notify;
		forward_msg->forward_mutex = &forward_struct->forward_mutex;
		forward_msg->buf_len = forward_struct->buf_len;
		forward_msg->buf = forward_struct->buf;

		memcpy(&forward_msg->header.orig_addr,
		       &header->orig_addr, sizeof(slurm_addr_t));

		/* clone the incoming header; the per-branch forward info
		 * is rebuilt below */
		forward_msg->header.version = header->version;
		forward_msg->header.flags = header->flags;
		forward_msg->header.msg_type = header->msg_type;
		forward_msg->header.body_length = header->body_length;
		forward_msg->header.ret_list = NULL;
		forward_msg->header.ret_cnt = 0;

		forward_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(forward_hl, name);
			free(name);
		}

		buf = hostlist_ranged_string_xmalloc(forward_hl);
		hostlist_destroy(forward_hl);
		forward_init(&forward_msg->header.forward, NULL);
		/* ownership of buf passes to the header's forward info */
		forward_msg->header.forward.nodelist = buf;
		while (pthread_create(&thread_agent, &attr_agent,
				      _forward_thread,
				      (void *)forward_msg)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
		thr_count++;
	}
	hostlist_destroy(hl);
	xfree(span);
	return SLURM_SUCCESS;
}
/*
 * start_msg_tree  - logic to begin the forward tree and
 *	accumulate the return codes from processes getting the
 *	the forwarded message
 *
 * IN: hl          - hostlist_t   - list of every node to send message to
 * IN: msg         - slurm_msg_t  - message to send.
 * IN: timeout     - int          - how long to wait in milliseconds.
 * RET List 	   - List containing the responses of the childern
 *		     (if any) we forwarded the message to. List
 *		     containing type (ret_data_info_t).
 *
 * Spawns one detached _fwd_tree_thread per span group; waits on a
 * condition variable until every spawned thread has decremented
 * thr_count (threads signal "notify" as they finish).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	int *span = NULL;
	fwd_tree_t *fwd_tree = NULL;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int j = 0, count = 0;
	List ret_list = NULL;
	char *name = NULL;
	int thr_count = 0;	/* number of live forwarding threads */
	int host_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	span = set_span(host_count, 0);

	slurm_mutex_init(&tree_mutex);
	pthread_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		/* fwd_tree ownership passes to _fwd_tree_thread */
		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		fwd_tree->orig_msg = msg;
		fwd_tree->ret_list = ret_list;
		fwd_tree->timeout = timeout;
		fwd_tree->notify = &notify;
		fwd_tree->p_thr_count = &thr_count;
		fwd_tree->tree_mutex = &tree_mutex;

		if (fwd_tree->timeout <= 0) {
			/* convert secs to msec */
			fwd_tree->timeout = slurm_get_msg_timeout() * 1000;
		}

		/* This thread's subtree: the shifted head host plus up
		 * to span[thr_count] more hosts from the pool. */
		fwd_tree->tree_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(fwd_tree->tree_hl, name);
			free(name);
		}

		/*
		 * Lock and increase thread counter, we need that to protect
		 * the start_msg_tree waiting loop that was originally designed
		 * around a "while ((count < host_count))" loop. In case where
		 * a fwd thread was not able to get all the return codes from
		 * children, the waiting loop was deadlocked.
		 */
		slurm_mutex_lock(&tree_mutex);
		thr_count++;
		slurm_mutex_unlock(&tree_mutex);

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
	}
	xfree(span);

	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	/* Wait until every fwd thread has finished (threads decrement
	 * thr_count and signal notify with tree_mutex held). */
	while (thr_count > 0) {
		pthread_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	xassert(count >= host_count);	/* Tree head did not get all responses,
					 * but no more active fwd threads!*/
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	pthread_cond_destroy(&notify);

	return ret_list;
}
/*
 * ping_nodes - check that all nodes and daemons are alive,
 *	get nodes in UNKNOWN state to register
 *
 * Builds up to two agent requests per call: a REQUEST_PING for nodes whose
 * last response is getting stale, and a REQUEST_NODE_REGISTRATION_STATUS
 * for UNKNOWN nodes plus a rotating window of nodes (re-registered about
 * every MAX_REG_FREQUENCY pings).  Nodes that have not responded within
 * SlurmdTimeout are set DOWN and reported in one consolidated error line.
 * Caller must hold the node table write lock (protects static "offset").
 */
void ping_nodes (void)
{
	static bool restart_flag = true;	/* system just restarted */
	static int offset = 0;	/* mutex via node table write lock on entry */
	static int max_reg_threads = 0;	/* max node registration threads
					 * this can include DOWN nodes, so
					 * limit the number to avoid huge
					 * communication delays */
	int i;
	time_t now, still_live_time, node_dead_time;
	static time_t last_ping_time = (time_t) 0;
	hostlist_t down_hostlist = NULL;
	char *host_str = NULL;
	agent_arg_t *ping_agent_args = NULL;
	agent_arg_t *reg_agent_args = NULL;
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr = NULL;
#else
	struct node_record *node_ptr = NULL;
#endif

	now = time (NULL);

	ping_agent_args = xmalloc (sizeof (agent_arg_t));
	ping_agent_args->msg_type = REQUEST_PING;
	ping_agent_args->retry = 0;
	ping_agent_args->hostlist = hostlist_create("");

	reg_agent_args = xmalloc (sizeof (agent_arg_t));
	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
	reg_agent_args->retry = 0;
	reg_agent_args->hostlist = hostlist_create("");

	/*
	 * If there are a large number of down nodes, the node ping
	 * can take a long time to complete:
	 *  ping_time = down_nodes * agent_timeout / agent_parallelism
	 *  ping_time = down_nodes * 10_seconds / 10
	 *  ping_time = down_nodes (seconds)
	 * Because of this, we extend the SlurmdTimeout by the
	 * time needed to complete a ping of all nodes.
	 */
	if ((slurmctld_conf.slurmd_timeout == 0) ||
	    (last_ping_time == (time_t) 0)) {
		node_dead_time = (time_t) 0;
	} else {
		/* dead if no response since BEFORE the previous ping cycle
		 * minus SlurmdTimeout */
		node_dead_time = last_ping_time -
				 slurmctld_conf.slurmd_timeout;
	}
	still_live_time = now - (slurmctld_conf.slurmd_timeout / 3);
	last_ping_time = now;

	if (max_reg_threads == 0) {
		max_reg_threads = MAX(slurm_get_tree_width(), 1);
	}
	/* advance the rotating re-registration window */
	offset += max_reg_threads;
	if ((offset > node_record_count) &&
	    (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
		offset = 0;

#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		/* No timeout configured and nothing suspicious: skip */
		if ((slurmctld_conf.slurmd_timeout == 0)	&&
		    (!restart_flag)				&&
		    (!IS_NODE_UNKNOWN(front_end_ptr))		&&
		    (!IS_NODE_NO_RESPOND(front_end_ptr)))
			continue;

		/* Responded once, but not since node_dead_time: mark DOWN */
		if ((front_end_ptr->last_response != (time_t) 0)     &&
		    (front_end_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(front_end_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
							  front_end_ptr->name);
			else {
				down_hostlist =
					hostlist_create(front_end_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_front_end_down(front_end_ptr, "Not responding");
			front_end_ptr->not_responding = false;
			continue;
		}

		if (restart_flag) {
			front_end_ptr->last_response =
				slurmctld_conf.last_update;
		}

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(front_end_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      front_end_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		/* Recently heard from and responding: no ping needed */
		if ((!IS_NODE_NO_RESPOND(front_end_ptr)) &&
		    (front_end_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(front_end_ptr) &&
		    IS_NODE_DOWN(front_end_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, front_end_ptr->name);
		ping_agent_args->node_count++;
	}
#else
	for (i=0, node_ptr=node_record_table_ptr;
	     i<node_record_count; i++, node_ptr++) {
		/* FUTURE/powered-down nodes are never pinged */
		if (IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
			continue;
		/* No timeout configured and nothing suspicious: skip */
		if ((slurmctld_conf.slurmd_timeout == 0) &&
		    (!restart_flag)			 &&
		    (!IS_NODE_UNKNOWN(node_ptr))         &&
		    (!IS_NODE_NO_RESPOND(node_ptr)))
			continue;

		/* Responded once, but not since node_dead_time: mark DOWN */
		if ((node_ptr->last_response != (time_t) 0)     &&
		    (node_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(node_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
							  node_ptr->name);
			else {
				down_hostlist =
					hostlist_create(node_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_node_down_ptr(node_ptr, "Not responding");
			node_ptr->not_responding = false;  /* logged below */
			continue;
		}

		if (restart_flag)
			node_ptr->last_response = slurmctld_conf.last_update;

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(node_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      node_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		/* Recently heard from and responding: no ping needed */
		if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
		    (node_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, node_ptr->name);
		ping_agent_args->node_count++;
	}
#endif

	restart_flag = false;
	/* Dispatch (or discard, if empty) the two agent requests.
	 * agent_queue_request() takes ownership of the args on dispatch. */
	if (ping_agent_args->node_count == 0) {
		hostlist_destroy(ping_agent_args->hostlist);
		xfree (ping_agent_args);
	} else {
		hostlist_uniq(ping_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				ping_agent_args->hostlist);
		debug("Spawning ping agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(ping_agent_args);
	}

	if (reg_agent_args->node_count == 0) {
		hostlist_destroy(reg_agent_args->hostlist);
		xfree (reg_agent_args);
	} else {
		hostlist_uniq(reg_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				reg_agent_args->hostlist);
		debug("Spawning registration agent for %s %d hosts",
		      host_str, reg_agent_args->node_count);
		xfree(host_str);
		ping_begin();
		agent_queue_request(reg_agent_args);
	}

	/* One consolidated error line for all newly-DOWN nodes */
	if (down_hostlist) {
		hostlist_uniq(down_hostlist);
		host_str = hostlist_ranged_string_xmalloc(down_hostlist);
		error("Nodes %s not responding, setting DOWN", host_str);
		xfree(host_str);
		hostlist_destroy(down_hostlist);
	}
}
extern int basil_node_ranking(struct node_record *node_array, int node_cnt) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; int rank_count = 0, i; hostlist_t hl = hostlist_create(NULL); bool bad_node = 0; /* * When obtaining the initial configuration, we can not allow ALPS to * fail. If there is a problem at this stage it is better to restart * SLURM completely, after investigating (and/or fixing) the cause. */ inv = get_full_inventory(version); if (inv == NULL) fatal("failed to get BASIL %s ranking", bv_names_long[version]); else if (!inv->batch_total) fatal("system has no usable batch compute nodes"); else if (inv->batch_total < node_cnt) info("Warning: ALPS sees only %d/%d slurm.conf nodes, " "check DownNodes", inv->batch_total, node_cnt); debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes", bv_names_long[version], inv->batch_avail, inv->batch_total); /* * Node ranking is based on a subset of the inventory: only nodes in * batch allocation mode which are up and not allocated. Assign a * 'NO_VAL' rank to all other nodes, which will translate as a very * high value, (unsigned)-2, to put those nodes last in the ranking. * The rest of the code must ensure that those nodes are never chosen. */ for (i = 0; i < node_cnt; i++) node_array[i].node_rank = NO_VAL; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char tmp[50]; /* This will ignore interactive nodes when iterating through * the apbasil inventory. If we don't do this, SLURM is * unable to resolve the ID to a nidXXX name since it's not in * the slurm.conf file. 
(Chris North) */ if (node->role == BNR_INTER) continue; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); bad_node = 1; } else node_ptr->node_rank = inv->nodes_total - rank_count++; sprintf(tmp, "nid%05u", node->node_id); hostlist_push(hl, tmp); } free_inv(inv); if (bad_node) { hostlist_sort(hl); char *name = hostlist_ranged_string_xmalloc(hl); info("It appears your slurm.conf nodelist doesn't " "match the alps system. Here are the nodes alps knows " "about\n%s", name); } hostlist_destroy(hl); return SLURM_SUCCESS; }
/*
 * _resources_set - populate the global _pmixp_job_info node/task layout
 * from SLURM-provided environment variables.
 *
 * IN: env - pointer to the environment array to query via getenvp()
 * RET: SLURM_SUCCESS, or SLURM_ERROR after releasing anything allocated
 *      here (goto-cleanup pattern).
 */
static int _resources_set(char ***env)
{
	char *p = NULL;

	/* Initialize all memory pointers that would be allocated to NULL
	 * So in case of error exit we will know what to xfree
	 */
	_pmixp_job_info.job_hl = hostlist_create("");
	_pmixp_job_info.step_hl = hostlist_create("");
	_pmixp_job_info.hostname = NULL;

	/* Save step host list */
	p = getenvp(*env, PMIXP_STEP_NODES_ENV);
	if (!p) {
		PMIXP_ERROR_NO(ENOENT, "Environment variable %s not found",
			       PMIXP_STEP_NODES_ENV);
		goto err_exit;
	}
	hostlist_push(_pmixp_job_info.step_hl, p);

	/* Extract our node name */
	p = hostlist_nth(_pmixp_job_info.step_hl, _pmixp_job_info.node_id);
	_pmixp_job_info.hostname = xstrdup(p);
	free(p);	/* hostlist_nth() result is free()d, copy is xstrdup()ed */

	/* Determine job-wide node id and job-wide node count
	 * (fall back to the deprecated variable name if needed) */
	p = getenvp(*env, PMIXP_JOB_NODES_ENV);
	if (p == NULL) {
		p = getenvp(*env, PMIXP_JOB_NODES_ENV_DEP);
		if (p == NULL) {
			/* shouldn't happen if we are under SLURM! */
			PMIXP_ERROR_NO(ENOENT, "Neither of nodelist environment variables: %s OR %s was found!",
				       PMIXP_JOB_NODES_ENV,
				       PMIXP_JOB_NODES_ENV_DEP);
			goto err_exit;
		}
	}
	hostlist_push(_pmixp_job_info.job_hl, p);
	_pmixp_job_info.nnodes_job = hostlist_count(_pmixp_job_info.job_hl);
	_pmixp_job_info.node_id_job = hostlist_find(_pmixp_job_info.job_hl,
						    _pmixp_job_info.hostname);

	/* FIXME!! ------------------------------------------------------------- */
	/* TODO: _get_task_count not always works well.
	if (_get_task_count(env, &_pmixp_job_info.ntasks_job,
			    &_pmixp_job_info.ncpus_job) < 0) {
		_pmixp_job_info.ntasks_job  = _pmixp_job_info.ntasks;
		_pmixp_job_info.ncpus_job  = _pmixp_job_info.ntasks;
	}
	xassert(_pmixp_job_info.ntasks <= _pmixp_job_info.ntasks_job);
	*/
	/* Until the TODO above is resolved, assume step == job sizing */
	_pmixp_job_info.ntasks_job = _pmixp_job_info.ntasks;
	_pmixp_job_info.ncpus_job = _pmixp_job_info.ntasks;

	/* Save task-to-node mapping */
	p = getenvp(*env, PMIXP_SLURM_MAPPING_ENV);
	if (p == NULL) {
		/* Direct modex won't work */
		PMIXP_ERROR_NO(ENOENT, "No %s environment variable found!",
			       PMIXP_SLURM_MAPPING_ENV);
		goto err_exit;
	}

	_pmixp_job_info.task_map_packed = xstrdup(p);

	return SLURM_SUCCESS;
err_exit:
	/* release everything allocated above; hostlists were created
	 * unconditionally, hostname only if we got that far */
	hostlist_destroy(_pmixp_job_info.job_hl);
	hostlist_destroy(_pmixp_job_info.step_hl);
	if (NULL != _pmixp_job_info.hostname) {
		xfree(_pmixp_job_info.hostname);
	}
	return SLURM_ERROR;
}
/* * Read a SLURM hostfile specified by "filename". "filename" must contain * a list of SLURM NodeNames, one per line. Reads up to "n" number of hostnames * from the file. Returns a string representing a hostlist ranged string of * the contents of the file. This is a helper function, it does not * contact any SLURM daemons. * * Returns a string representing the hostlist. Returns NULL if there are fewer * than "n" hostnames in the file, or if an error occurs. If "n" == * NO_VAL then the entire file is read in * * Returned string must be freed with free(). */ char *slurm_read_hostfile(char *filename, int n) { FILE *fp = NULL; char in_line[BUFFER_SIZE]; /* input line */ int i, j; int line_size; int line_num = 0; hostlist_t hostlist = NULL; char *nodelist = NULL; char *asterisk, *tmp_text, *save_ptr = NULL, *host_name; int total_file_len = 0; if (filename == NULL || strlen(filename) == 0) return NULL; if ((fp = fopen(filename, "r")) == NULL) { error("slurm_allocate_resources error opening file %s, %m", filename); return NULL; } hostlist = hostlist_create(NULL); if (hostlist == NULL) { fclose(fp); return NULL; } while (fgets(in_line, BUFFER_SIZE, fp) != NULL) { line_num++; if (!isalpha(in_line[0]) && !isdigit(in_line[0])) { error ("Invalid hostfile %s contents on line %d", filename, line_num); fclose (fp); hostlist_destroy(hostlist); return NULL; } line_size = strlen(in_line); total_file_len += line_size; if (line_size == (BUFFER_SIZE - 1)) { error ("Line %d, of hostfile %s too long", line_num, filename); fclose (fp); hostlist_destroy(hostlist); return NULL; } for (i = 0; i < line_size; i++) { if (in_line[i] == '\n') { in_line[i] = '\0'; break; } if (in_line[i] == '\0') break; if (in_line[i] != '#') continue; if ((i > 0) && (in_line[i - 1] == '\\')) { for (j = i; j < line_size; j++) { in_line[j - 1] = in_line[j]; } line_size--; continue; } in_line[i] = '\0'; break; } tmp_text = xstrdup(in_line); host_name = strtok_r(tmp_text, ",", &save_ptr); while (host_name) { if 
((asterisk = strchr(host_name, '*')) && (i = atoi(asterisk + 1))) { asterisk[0] = '\0'; for (j = 0; j < i; j++) hostlist_push(hostlist, host_name); } else { hostlist_push(hostlist, host_name); } host_name = strtok_r(NULL, ",", &save_ptr); } xfree(tmp_text); if ((n != (int)NO_VAL) && (hostlist_count(hostlist) == n)) break; } fclose(fp); if (hostlist_count(hostlist) <= 0) { error("Hostlist is empty!"); goto cleanup_hostfile; } if (hostlist_count(hostlist) < n) { error("Too few NodeNames in SLURM Hostfile"); goto cleanup_hostfile; } total_file_len += 1024; nodelist = (char *)malloc(total_file_len); if (!nodelist) { error("Nodelist xmalloc failed"); goto cleanup_hostfile; } if (hostlist_ranged_string(hostlist, total_file_len, nodelist) == -1) { error("Hostlist is too long for the allocate RPC!"); free(nodelist); nodelist = NULL; goto cleanup_hostfile; } debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist); cleanup_hostfile: hostlist_destroy(hostlist); return nodelist; }
/*
 * Based on ideas provided by Hongjia Cao <*****@*****.**> in PMI2 plugin
 */
/*
 * pmixp_coll_init - initialize one collective-state object, including this
 * node's position in the reverse communication tree (parent hostname and
 * the hostlist of direct children) and the pre-packed message header.
 *
 * IN/OUT: coll   - collective state to initialize
 * IN:     procs  - participating processes (copied into coll->procs)
 * IN:     nprocs - number of entries in procs
 * IN:     type   - collective type tag
 * RET: SLURM_SUCCESS or SLURM_ERROR
 */
int pmixp_coll_init(pmixp_coll_t *coll, const pmix_proc_t *procs,
		    size_t nprocs, pmixp_coll_type_t type)
{
	hostlist_t hl;
	uint32_t nodeid = 0, nodes = 0;
	int parent_id, depth, max_depth, tmp;
	int width, my_nspace = -1;
	char *p;
	int i, *ch_nodeids = NULL;

#ifndef NDEBUG
	coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
	coll->type = type;
	coll->state = PMIXP_COLL_SYNC;
	/* keep a private copy of the participant list */
	coll->procs = xmalloc(sizeof(*procs) * nprocs);
	memcpy(coll->procs, procs, sizeof(*procs) * nprocs);
	coll->nprocs = nprocs;
	coll->my_nspace = my_nspace;

	if (SLURM_SUCCESS != _hostset_from_ranges(procs, nprocs, &hl)) {
		/* TODO: provide ranges output routine */
		PMIXP_ERROR("Bad ranges information");
		goto err_exit;
	}

	/* Locate this node in the reverse tree of all participants */
	width = slurm_get_tree_width();
	nodes = hostlist_count(hl);
	nodeid = hostlist_find(hl, pmixp_info_hostname());
	reverse_tree_info(nodeid, nodes, width, &parent_id, &tmp,
			  &depth, &max_depth);
	coll->children_cnt = tmp;
	coll->nodeid = nodeid;

	/* We interested in amount of direct childs */
	coll->seq = 0;
	coll->contrib_cntr = 0;
	coll->contrib_local = false;
	ch_nodeids = xmalloc(sizeof(int) * width);
	coll->ch_contribs = xmalloc(sizeof(int) * width);
	/* overwrites the total-children count above with the number of
	 * DIRECT children; ch_nodeids receives their tree indices */
	coll->children_cnt = reverse_tree_direct_children(nodeid, nodes, width,
							  depth, ch_nodeids);

	/* create the hostlist with extract direct children's hostnames */
	coll->ch_hosts = hostlist_create("");
	for (i = 0; i < coll->children_cnt; i++) {
		/* NOTE(review): hostlist_nth() appears to return an
		 * allocated string that is never freed here — possible
		 * per-child leak; confirm against hostlist API. */
		char *hname = hostlist_nth(hl, ch_nodeids[i]);
		hostlist_push(coll->ch_hosts, hname);
	}
	/* just in case, shouldn't be needed */
	hostlist_uniq(coll->ch_hosts);
	xfree(ch_nodeids);

	if (parent_id == -1) {
		/* if we are the root of the tree:
		 * - we don't have a parent;
		 * - we have large list of all_childrens (we don't want
		 *   ourselfs there)
		 * NOTE: ownership of hl transfers to coll->all_children */
		coll->parent_host = NULL;
		hostlist_delete_host(hl, pmixp_info_hostname());
		coll->all_children = hl;
	} else if (parent_id >= 0) {
		/* for all other nodes in the tree we need to know:
		 * - nodename of our parent;
		 * - we don't need a list of all_childrens and hl anymore */
		p = hostlist_nth(hl, parent_id);
		coll->parent_host = xstrdup(p);
		/* use empty hostlist here */
		coll->all_children = hostlist_create("");
		free(p);
		hostlist_destroy(hl);
	}

	/* Collective data */
	coll->buf = pmixp_server_new_buf();
	coll->serv_offs = get_buf_offset(coll->buf);

	if (SLURM_SUCCESS != _pack_ranges(coll)) {
		PMIXP_ERROR("Cannot pack ranges to coll message header!");
		goto err_exit;
	}

	/* Callback information */
	coll->cbdata = NULL;
	coll->cbfunc = NULL;

	/* init fine grained lock */
	slurm_mutex_init(&coll->lock);

	return SLURM_SUCCESS;
err_exit:
	/* NOTE(review): partially-initialized fields (coll->procs, buf, …)
	 * are not released here — presumably the caller destroys coll on
	 * failure; verify. */
	return SLURM_ERROR;
}
static int _pstdout_output_buffer_data(pstdout_state_t pstate, FILE *stream, char **whichbuffer, unsigned int *whichbufferlen, uint32_t whichprependmask, uint32_t whichbuffermask, uint32_t whichconsolidatemask, List whichconsolidatedlist, pthread_mutex_t *whichconsolidatedmutex) { assert(pstate); assert(pstate->magic == PSTDOUT_STATE_MAGIC); assert(pstate->p_stdout); assert(pstate->p_stderr); assert(stream); assert(stream == stdout || stream == stderr); assert(whichbuffer); assert(whichbufferlen); assert(whichprependmask == PSTDOUT_OUTPUT_STDOUT_PREPEND_HOSTNAME || whichprependmask == PSTDOUT_OUTPUT_STDERR_PREPEND_HOSTNAME); assert(whichbuffermask == PSTDOUT_OUTPUT_BUFFER_STDOUT || whichbuffermask == PSTDOUT_OUTPUT_BUFFER_STDERR); assert(whichconsolidatemask == PSTDOUT_OUTPUT_STDOUT_CONSOLIDATE || whichconsolidatemask == PSTDOUT_OUTPUT_STDERR_CONSOLIDATE); assert(whichconsolidatedlist); assert(whichconsolidatedmutex); if ((*whichbuffer && *whichbufferlen) && (pstdout_output_flags & whichbuffermask || pstdout_output_flags & whichconsolidatemask)) { /* Need to write a '\0' */ if (!(*whichbuffer = (char *)realloc(*whichbuffer, *whichbufferlen + 1))) { pstdout_errnum = PSTDOUT_ERR_OUTMEM; goto cleanup; } (*whichbuffer)[*whichbufferlen] = '\0'; *whichbufferlen += 1; if (pstdout_output_flags & whichbuffermask) { if (!(pstdout_output_flags & whichprependmask)) { fprintf(stream, "----------------\n"); fprintf(stream, "%s\n", pstate->hostname); fprintf(stream, "----------------\n"); } fprintf(stream, "%s", *whichbuffer); fflush(stream); } else { struct pstdout_consolidated_data *cdata; int rc; if ((rc = pthread_mutex_lock(whichconsolidatedmutex))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } if (!(cdata = list_find_first(whichconsolidatedlist, _pstdout_consolidated_data_find, *whichbuffer))) { if (!(cdata = 
_pstdout_consolidated_data_create(pstate->hostname, *whichbuffer))) goto cleanup; if (!list_append(whichconsolidatedlist, cdata)) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "list_append: %s\n", strerror(errno)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; _pstdout_consolidated_data_destroy(cdata); goto cleanup; } } else { if (!hostlist_push(cdata->h, pstate->hostname)) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "hostlist_push: %s\n", strerror(errno)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } } if ((rc = pthread_mutex_unlock(whichconsolidatedmutex))) { if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD) fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(rc)); pstdout_errnum = PSTDOUT_ERR_INTERNAL; goto cleanup; } } } return 0; cleanup: return -1; }
/*
 * slurm_hostlist_push - public wrapper exposing hostlist_push().
 *
 * IN: hl    - hostlist to extend
 * IN: hosts - host name(s) to append
 * RET: value returned by hostlist_push() (number of hosts added)
 */
extern int slurm_hostlist_push(hostlist_t hl, const char *hosts)
{
	int pushed = hostlist_push(hl, hosts);

	return pushed;
}
/*
 * _update_sinfo - fold one node's attributes into an aggregate sinfo record.
 *
 * IN/OUT: sinfo_ptr    - record accumulating min/max attributes and
 *                        node/cpu counts for a group of similar nodes
 * IN:     node_ptr     - node being merged in
 * IN:     node_scaling - per-node scaling factor for this call (may be
 *                        smaller than the global g_node_scaling)
 *
 * First node copies its attributes verbatim; later nodes widen the
 * min/max ranges.  A node already present in sinfo_ptr->nodes is ignored.
 */
static void _update_sinfo(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr,
			  uint32_t node_scaling)
{
	uint16_t base_state;
	uint16_t used_cpus = 0, error_cpus = 0;
	int total_cpus = 0, total_nodes = 0;
	/* since node_scaling could be less here, we need to use the
	 * global node scaling which should never change. */
	int single_node_cpus = (node_ptr->cpus / g_node_scaling);

	base_state = node_ptr->node_state & NODE_STATE_BASE;

	if (sinfo_ptr->nodes_total == 0) {	/* first node added */
		sinfo_ptr->node_state = node_ptr->node_state;
		sinfo_ptr->features   = node_ptr->features;
		sinfo_ptr->gres       = node_ptr->gres;
		sinfo_ptr->reason     = node_ptr->reason;
		sinfo_ptr->reason_time= node_ptr->reason_time;
		sinfo_ptr->reason_uid = node_ptr->reason_uid;
		sinfo_ptr->min_cpus    = node_ptr->cpus;
		sinfo_ptr->max_cpus    = node_ptr->cpus;
		sinfo_ptr->min_sockets = node_ptr->sockets;
		sinfo_ptr->max_sockets = node_ptr->sockets;
		sinfo_ptr->min_cores   = node_ptr->cores;
		sinfo_ptr->max_cores   = node_ptr->cores;
		sinfo_ptr->min_threads = node_ptr->threads;
		sinfo_ptr->max_threads = node_ptr->threads;
		sinfo_ptr->min_disk    = node_ptr->tmp_disk;
		sinfo_ptr->max_disk    = node_ptr->tmp_disk;
		sinfo_ptr->min_mem     = node_ptr->real_memory;
		sinfo_ptr->max_mem     = node_ptr->real_memory;
		sinfo_ptr->min_weight  = node_ptr->weight;
		sinfo_ptr->max_weight  = node_ptr->weight;
		sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
		sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
	} else if (hostlist_find(sinfo_ptr->nodes, node_ptr->name) != -1) {
		/* we already have this node in this record,
		 * just return, don't duplicate */
		return;
	} else {
		/* widen each min/max range to include this node */
		if (sinfo_ptr->min_cpus > node_ptr->cpus)
			sinfo_ptr->min_cpus = node_ptr->cpus;
		if (sinfo_ptr->max_cpus < node_ptr->cpus)
			sinfo_ptr->max_cpus = node_ptr->cpus;

		if (sinfo_ptr->min_sockets > node_ptr->sockets)
			sinfo_ptr->min_sockets = node_ptr->sockets;
		if (sinfo_ptr->max_sockets < node_ptr->sockets)
			sinfo_ptr->max_sockets = node_ptr->sockets;

		if (sinfo_ptr->min_cores > node_ptr->cores)
			sinfo_ptr->min_cores = node_ptr->cores;
		if (sinfo_ptr->max_cores < node_ptr->cores)
			sinfo_ptr->max_cores = node_ptr->cores;

		if (sinfo_ptr->min_threads > node_ptr->threads)
			sinfo_ptr->min_threads = node_ptr->threads;
		if (sinfo_ptr->max_threads < node_ptr->threads)
			sinfo_ptr->max_threads = node_ptr->threads;

		if (sinfo_ptr->min_disk > node_ptr->tmp_disk)
			sinfo_ptr->min_disk = node_ptr->tmp_disk;
		if (sinfo_ptr->max_disk < node_ptr->tmp_disk)
			sinfo_ptr->max_disk = node_ptr->tmp_disk;

		if (sinfo_ptr->min_mem > node_ptr->real_memory)
			sinfo_ptr->min_mem = node_ptr->real_memory;
		if (sinfo_ptr->max_mem < node_ptr->real_memory)
			sinfo_ptr->max_mem = node_ptr->real_memory;

		if (sinfo_ptr->min_weight> node_ptr->weight)
			sinfo_ptr->min_weight = node_ptr->weight;
		if (sinfo_ptr->max_weight < node_ptr->weight)
			sinfo_ptr->max_weight = node_ptr->weight;

		if (sinfo_ptr->min_cpu_load > node_ptr->cpu_load)
			sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
		if (sinfo_ptr->max_cpu_load < node_ptr->cpu_load)
			sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
	}

	hostlist_push(sinfo_ptr->nodes, node_ptr->name);
	hostlist_push(sinfo_ptr->node_addr, node_ptr->node_addr);
	hostlist_push(sinfo_ptr->hostnames, node_ptr->node_hostname);

	total_cpus = node_ptr->cpus;
	total_nodes = node_scaling;

	/* sub-node allocation/error counts come from the select plugin */
	select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
				     SELECT_NODEDATA_SUBCNT,
				     NODE_STATE_ALLOCATED,
				     &used_cpus);
	select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
				     SELECT_NODEDATA_SUBCNT,
				     NODE_STATE_ERROR,
				     &error_cpus);

	if (params.cluster_flags & CLUSTER_FLAG_BG) {
		/* BlueGene: nodes subdivide into sub-blocks, so counts are
		 * derived from used/error cpu sub-counts */
		if (!params.match_flags.state_flag &&
		    (used_cpus || error_cpus)) {
			/* We only get one shot at this (because all states
			 * are combined together), so we need to make
			 * sure we get all the subgrps accounted. (So use
			 * g_node_scaling for safe measure) */
			total_nodes = g_node_scaling;

			sinfo_ptr->nodes_alloc += used_cpus;
			sinfo_ptr->nodes_other += error_cpus;
			sinfo_ptr->nodes_idle +=
				(total_nodes - (used_cpus + error_cpus));
			used_cpus  *= single_node_cpus;
			error_cpus *= single_node_cpus;
		} else {
			/* process only for this subgrp and then return */
			total_cpus = total_nodes * single_node_cpus;

			if ((base_state == NODE_STATE_ALLOCATED) ||
			    (node_ptr->node_state & NODE_STATE_COMPLETING)) {
				sinfo_ptr->nodes_alloc += total_nodes;
				sinfo_ptr->cpus_alloc += total_cpus;
			} else if (IS_NODE_DRAIN(node_ptr) ||
				   (base_state == NODE_STATE_DOWN)) {
				sinfo_ptr->nodes_other += total_nodes;
				sinfo_ptr->cpus_other += total_cpus;
			} else {
				sinfo_ptr->nodes_idle += total_nodes;
				sinfo_ptr->cpus_idle += total_cpus;
			}

			sinfo_ptr->nodes_total += total_nodes;
			sinfo_ptr->cpus_total += total_cpus;

			return;
		}
	} else {
		/* non-BG: classify the whole node by its base state */
		if ((base_state == NODE_STATE_ALLOCATED) ||
		    IS_NODE_COMPLETING(node_ptr))
			sinfo_ptr->nodes_alloc += total_nodes;
		else if (IS_NODE_DRAIN(node_ptr)
			 || (base_state == NODE_STATE_DOWN))
			sinfo_ptr->nodes_other += total_nodes;
		else
			sinfo_ptr->nodes_idle += total_nodes;
	}

	sinfo_ptr->nodes_total += total_nodes;

	sinfo_ptr->cpus_alloc += used_cpus;
	sinfo_ptr->cpus_total += total_cpus;
	total_cpus -= used_cpus + error_cpus;

	/* remaining cpus are idle unless the node is in error/drain/down */
	if (error_cpus) {
		sinfo_ptr->cpus_idle += total_cpus;
		sinfo_ptr->cpus_other += error_cpus;
	} else if (IS_NODE_DRAIN(node_ptr) ||
		   (base_state == NODE_STATE_DOWN)) {
		sinfo_ptr->cpus_other += total_cpus;
	} else
		sinfo_ptr->cpus_idle += total_cpus;
}