/* Update acct_gather data for every node that is not DOWN */
extern void update_nodes_acct_gather_data(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	struct node_record *node_ptr;
#endif
	int i;
	char *host_str = NULL;
	agent_arg_t *agent_args = NULL;

	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_ACCT_GATHER_UPDATE;
	agent_args->retry = 0;
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	agent_args->hostlist = hostlist_create(NULL);
#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if (IS_NODE_NO_RESPOND(front_end_ptr))
			continue;
		if (agent_args->protocol_version >
		    front_end_ptr->protocol_version)
			agent_args->protocol_version =
				front_end_ptr->protocol_version;
		hostlist_push_host(agent_args->hostlist,
				   front_end_ptr->name);
		agent_args->node_count++;
	}
#else
	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
	     i++, node_ptr++) {
		if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if (agent_args->protocol_version > node_ptr->protocol_version)
			agent_args->protocol_version =
				node_ptr->protocol_version;
		hostlist_push_host(agent_args->hostlist, node_ptr->name);
		agent_args->node_count++;
	}
#endif
	if (agent_args->node_count == 0) {
		hostlist_destroy(agent_args->hostlist);
		xfree(agent_args);
	} else {
		hostlist_uniq(agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				agent_args->hostlist);
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_ENERGY)
			info("Updating acct_gather data for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(agent_args);
	}
}
static int _sort_by_node_list(void *void1, void *void2)
{
	int diff = 0;
	sinfo_data_t *sinfo1;
	sinfo_data_t *sinfo2;
	char *val1, *val2;
#if PURE_ALPHA_SORT == 0
	int inx;
#endif

	_get_sinfo_from_void(&sinfo1, &sinfo2, void1, void2);

	val1 = hostlist_shift(sinfo1->nodes);
	if (val1) {
		hostlist_push_host(sinfo1->nodes, val1);
		hostlist_sort(sinfo1->nodes);
	} else
		val1 = "";

	val2 = hostlist_shift(sinfo2->nodes);
	if (val2) {
		hostlist_push_host(sinfo2->nodes, val2);
		hostlist_sort(sinfo2->nodes);
	} else
		val2 = "";

#if PURE_ALPHA_SORT
	diff = xstrcmp(val1, val2);
#else
	for (inx = 0; ; inx++) {
		if (val1[inx] == val2[inx]) {
			if (val1[inx] == '\0')
				break;
			continue;
		}
		if ((isdigit((int)val1[inx])) &&
		    (isdigit((int)val2[inx]))) {
			int num1, num2;
			num1 = atoi(val1 + inx);
			num2 = atoi(val2 + inx);
			diff = num1 - num2;
		} else
			diff = xstrcmp(val1, val2);
		break;
	}
#endif
	if (strlen(val1))
		free(val1);
	if (strlen(val2))
		free(val2);

	if (reverse_order)
		diff = -diff;
	return diff;
}
extern void scontrol_print_completing_job(job_info_t *job_ptr,
					  node_info_msg_t *node_info_msg)
{
	int i, c_offset = 0;
	node_info_t *node_info;
	hostlist_t comp_nodes, down_nodes;
	char *node_buf;

	comp_nodes = hostlist_create(NULL);
	down_nodes = hostlist_create(NULL);

	if (job_ptr->cluster && federation_flag && !local_flag)
		c_offset = get_cluster_node_offset(job_ptr->cluster,
						   node_info_msg);

	for (i = 0; job_ptr->node_inx[i] != -1; i += 2) {
		int j = job_ptr->node_inx[i];
		for (; j <= job_ptr->node_inx[i + 1]; j++) {
			int node_inx = j + c_offset;
			if (node_inx >= node_info_msg->record_count)
				break;
			node_info = &(node_info_msg->node_array[node_inx]);
			if (IS_NODE_COMPLETING(node_info))
				hostlist_push_host(comp_nodes,
						   node_info->name);
			else if (IS_NODE_DOWN(node_info))
				hostlist_push_host(down_nodes,
						   node_info->name);
		}
	}

	fprintf(stdout, "JobId=%u ", job_ptr->job_id);

	node_buf = hostlist_ranged_string_xmalloc(comp_nodes);
	if (node_buf && node_buf[0])
		fprintf(stdout, "Nodes(COMPLETING)=%s ", node_buf);
	xfree(node_buf);

	node_buf = hostlist_ranged_string_xmalloc(down_nodes);
	if (node_buf && node_buf[0])
		fprintf(stdout, "Nodes(DOWN)=%s ", node_buf);
	xfree(node_buf);
	fprintf(stdout, "\n");

	hostlist_destroy(comp_nodes);
	hostlist_destroy(down_nodes);
}
/* _get_all_nodes creates a hostlist containing all the nodes in the
 * node_record_table.
 *
 * Caller must destroy the list.
 */
static hostlist_t _get_all_nodes(void)
{
	int i;
	hostlist_t nodes = hostlist_create(NULL);

	for (i = 0; i < node_record_count; i++) {
		hostlist_push_host(nodes, node_record_table_ptr[i].name);
	}
	return nodes;
}
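/*
 * Illustrative sketch, not part of the original source: how a caller
 * might consume the hostlist returned by _get_all_nodes() above.
 * hostlist_shift() hands back strings allocated with malloc(), so each
 * name is released with free() and the list itself with
 * hostlist_destroy(). The function name is hypothetical.
 */
static void _example_print_all_nodes(void)
{
	char *name;
	hostlist_t nodes = _get_all_nodes();

	while ((name = hostlist_shift(nodes))) {
		info("node: %s", name);
		free(name);	/* hostlist_shift() strings use free() */
	}
	hostlist_destroy(nodes);
}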
extern void scontrol_print_completing_job(job_info_t *job_ptr,
					  node_info_msg_t *node_info_msg)
{
	int i;
	node_info_t *node_info;
	hostlist_t all_nodes, comp_nodes, down_nodes;
	char *node_buf;

	all_nodes  = hostlist_create(job_ptr->nodes);
	comp_nodes = hostlist_create("");
	down_nodes = hostlist_create("");

	for (i = 0; i < node_info_msg->record_count; i++) {
		node_info = &(node_info_msg->node_array[i]);
		if (IS_NODE_COMPLETING(node_info) &&
		    (_in_node_bit_list(i, job_ptr->node_inx)))
			hostlist_push_host(comp_nodes, node_info->name);
		else if (IS_NODE_DOWN(node_info) &&
			 (hostlist_find(all_nodes, node_info->name) != -1))
			hostlist_push_host(down_nodes, node_info->name);
	}

	fprintf(stdout, "JobId=%u ", job_ptr->job_id);

	node_buf = hostlist_ranged_string_xmalloc(comp_nodes);
	if (node_buf && node_buf[0])
		fprintf(stdout, "Nodes(COMPLETING)=%s ", node_buf);
	xfree(node_buf);

	node_buf = hostlist_ranged_string_xmalloc(down_nodes);
	if (node_buf && node_buf[0])
		fprintf(stdout, "Nodes(DOWN)=%s ", node_buf);
	xfree(node_buf);
	fprintf(stdout, "\n");

	hostlist_destroy(all_nodes);
	hostlist_destroy(comp_nodes);
	hostlist_destroy(down_nodes);
}
/*
 * _node_name2bitmap - given a node name regular expression, build a bitmap
 *	representation, any invalid hostnames are added to a hostlist
 * IN node_names  - set of node names
 * OUT bitmap     - set to bitmap, may not have all bits set on error
 * IN/OUT invalid_hostlist - hostlist of invalid host names, initialize to NULL
 * RET 0 if no error, otherwise EINVAL
 * NOTE: call FREE_NULL_BITMAP(bitmap) and hostlist_destroy(invalid_hostlist)
 *       to free memory when variables are no longer required
 */
static int _node_name2bitmap(char *node_names, bitstr_t **bitmap,
			     hostlist_t *invalid_hostlist)
{
	char *this_node_name;
	bitstr_t *my_bitmap;
	hostlist_t host_list;

	my_bitmap = (bitstr_t *) bit_alloc(node_record_count);
	*bitmap = my_bitmap;

	if (node_names == NULL) {
		error("_node_name2bitmap: node_names is NULL");
		return EINVAL;
	}

	if ((host_list = hostlist_create(node_names)) == NULL) {
		/* likely a badly formatted hostlist */
		error("_node_name2bitmap: hostlist_create(%s) error",
		      node_names);
		return EINVAL;
	}

	while ((this_node_name = hostlist_shift(host_list))) {
		struct node_record *node_ptr;
		node_ptr = find_node_record(this_node_name);
		if (node_ptr) {
			bit_set(my_bitmap,
				(bitoff_t) (node_ptr - node_record_table_ptr));
		} else {
			debug2("_node_name2bitmap: invalid node specified %s",
			       this_node_name);
			if (*invalid_hostlist) {
				hostlist_push_host(*invalid_hostlist,
						   this_node_name);
			} else {
				*invalid_hostlist =
					hostlist_create(this_node_name);
			}
		}
		free(this_node_name);
	}
	hostlist_destroy(host_list);

	return SLURM_SUCCESS;
}
/*
 * bitmap2hostlist - given a bitmap, build a hostlist
 * IN bitmap - bitmap pointer
 * RET pointer to hostlist or NULL on error
 * globals: node_record_table_ptr - pointer to node table
 * NOTE: the caller must call hostlist_destroy() on the returned hostlist
 *       when no longer required
 */
hostlist_t bitmap2hostlist(bitstr_t *bitmap)
{
	int i, first, last;
	hostlist_t hl;

	if (bitmap == NULL)
		return NULL;

	first = bit_ffs(bitmap);
	if (first == -1)
		return NULL;

	last = bit_fls(bitmap);
	hl = hostlist_create(NULL);
	for (i = first; i <= last; i++) {
		if (bit_test(bitmap, i) == 0)
			continue;
		hostlist_push_host(hl, node_record_table_ptr[i].name);
	}
	return hl;
}
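/*
 * Illustrative sketch, not part of the original source: converting a
 * node bitmap into a printable ranged string via bitmap2hostlist().
 * Assumes slurmctld context (the bitmap indices must be valid offsets
 * into node_record_table_ptr); the function name is hypothetical.
 */
static void _example_log_bitmap(bitstr_t *node_bitmap)
{
	hostlist_t hl = bitmap2hostlist(node_bitmap);

	if (hl) {
		char *ranged = hostlist_ranged_string_xmalloc(hl);
		debug("nodes: %s", ranged);
		xfree(ranged);		/* xmalloc'd string uses xfree() */
		hostlist_destroy(hl);
	}
}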
/*
 * route_split_hostlist_treewidth - logic to split an input hostlist into
 *	a set of hostlists to forward to.
 *
 * This is the default behavior. It is implemented here as there are cases
 * where the topology version also needs to split the message list based
 * on TreeWidth.
 *
 * IN: hl       - hostlist_t   - list of every node to send message to
 *                               will be empty on return which is same
 *                               behavior as similar code replaced in
 *                               forward.c
 * OUT: sp_hl   - hostlist_t** - the array of hostlists that will be malloced
 * OUT: count   - int*         - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: created hostlists will have to be freed independently using
 *       hostlist_destroy() by the caller.
 * Note: the hostlist_t array will have to be xfree'd by the caller.
 */
extern int route_split_hostlist_treewidth(hostlist_t hl,
					  hostlist_t **sp_hl,
					  int *count)
{
	int host_count;
	int *span = NULL;
	char *name = NULL;
	char *buf;
	int nhl = 0;
	int j;

	host_count = hostlist_count(hl);
	span = set_span(host_count, tree_width);
	*sp_hl = (hostlist_t *) xmalloc(tree_width * sizeof(hostlist_t));

	while ((name = hostlist_shift(hl))) {
		(*sp_hl)[nhl] = hostlist_create(name);
		free(name);
		for (j = 0; j < span[nhl]; j++) {
			name = hostlist_shift(hl);
			if (!name) {
				break;
			}
			hostlist_push_host((*sp_hl)[nhl], name);
			free(name);
		}
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc((*sp_hl)[nhl]);
			debug("ROUTE: ... sublist[%d] %s", nhl, buf);
			xfree(buf);
		}
		nhl++;
	}
	xfree(span);
	*count = nhl;

	return SLURM_SUCCESS;
}
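/*
 * Illustrative sketch, not part of the original source: calling
 * route_split_hostlist_treewidth() and honoring the cleanup contract
 * from its header comment (hostlist_destroy() each created sublist,
 * then xfree() the array itself). The function name is hypothetical.
 */
static void _example_split(hostlist_t hl)
{
	hostlist_t *sp_hl = NULL;
	int i, count = 0;

	if (route_split_hostlist_treewidth(hl, &sp_hl, &count)
	    != SLURM_SUCCESS)
		return;

	/* ... hand each sp_hl[i] to a forwarding thread here ... */

	for (i = 0; i < count; i++)
		hostlist_destroy(sp_hl[i]);
	xfree(sp_hl);
}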
/*
 * ping_nodes - check that all nodes and daemons are alive,
 *	get nodes in UNKNOWN state to register
 */
void ping_nodes(void)
{
	static bool restart_flag = true;	/* system just restarted */
	static int offset = 0;	/* mutex via node table write lock on entry */
	static int max_reg_threads = 0;	/* max node registration threads
					 * this can include DOWN nodes, so
					 * limit the number to avoid huge
					 * communication delays */
	int i;
	time_t now, still_live_time, node_dead_time;
	static time_t last_ping_time = (time_t) 0;
	hostlist_t down_hostlist = NULL;
	char *host_str = NULL;
	agent_arg_t *ping_agent_args = NULL;
	agent_arg_t *reg_agent_args = NULL;
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr = NULL;
#else
	struct node_record *node_ptr = NULL;
#endif

	now = time(NULL);

	ping_agent_args = xmalloc(sizeof(agent_arg_t));
	ping_agent_args->msg_type = REQUEST_PING;
	ping_agent_args->retry = 0;
	ping_agent_args->hostlist = hostlist_create("");

	reg_agent_args = xmalloc(sizeof(agent_arg_t));
	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
	reg_agent_args->retry = 0;
	reg_agent_args->hostlist = hostlist_create("");

	/*
	 * If there are a large number of down nodes, the node ping
	 * can take a long time to complete:
	 *  ping_time = down_nodes * agent_timeout / agent_parallelism
	 *  ping_time = down_nodes * 10_seconds / 10
	 *  ping_time = down_nodes (seconds)
	 * Because of this, we extend the SlurmdTimeout by the
	 * time needed to complete a ping of all nodes.
	 */
	if ((slurmctld_conf.slurmd_timeout == 0) ||
	    (last_ping_time == (time_t) 0)) {
		node_dead_time = (time_t) 0;
	} else {
		node_dead_time = last_ping_time -
				 slurmctld_conf.slurmd_timeout;
	}
	still_live_time = now - (slurmctld_conf.slurmd_timeout / 3);
	last_ping_time = now;

	if (max_reg_threads == 0) {
		max_reg_threads = MAX(slurm_get_tree_width(), 1);
	}
	offset += max_reg_threads;
	if ((offset > node_record_count) &&
	    (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
		offset = 0;

#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if ((slurmctld_conf.slurmd_timeout == 0)	&&
		    (!restart_flag)				&&
		    (!IS_NODE_UNKNOWN(front_end_ptr))		&&
		    (!IS_NODE_NO_RESPOND(front_end_ptr)))
			continue;

		if ((front_end_ptr->last_response != (time_t) 0)     &&
		    (front_end_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(front_end_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
					front_end_ptr->name);
			else {
				down_hostlist =
					hostlist_create(front_end_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_front_end_down(front_end_ptr, "Not responding");
			front_end_ptr->not_responding = false;
			continue;
		}

		if (restart_flag) {
			front_end_ptr->last_response =
				slurmctld_conf.last_update;
		}

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(front_end_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      front_end_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(front_end_ptr)) &&
		    (front_end_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(front_end_ptr) &&
		    IS_NODE_DOWN(front_end_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, front_end_ptr->name);
		ping_agent_args->node_count++;
	}
#else
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		if (IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if ((slurmctld_conf.slurmd_timeout == 0) &&
		    (!restart_flag)			 &&
		    (!IS_NODE_UNKNOWN(node_ptr))	 &&
		    (!IS_NODE_NO_RESPOND(node_ptr)))
			continue;

		if ((node_ptr->last_response != (time_t) 0)     &&
		    (node_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(node_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
							  node_ptr->name);
			else {
				down_hostlist =
					hostlist_create(node_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_node_down_ptr(node_ptr, "Not responding");
			node_ptr->not_responding = false;  /* logged below */
			continue;
		}

		if (restart_flag)
			node_ptr->last_response = slurmctld_conf.last_update;

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(node_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      node_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
		    (node_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, node_ptr->name);
		ping_agent_args->node_count++;
	}
#endif

	restart_flag = false;
	if (ping_agent_args->node_count == 0) {
		hostlist_destroy(ping_agent_args->hostlist);
		xfree(ping_agent_args);
	} else {
		hostlist_uniq(ping_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				ping_agent_args->hostlist);
		debug("Spawning ping agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(ping_agent_args);
	}

	if (reg_agent_args->node_count == 0) {
		hostlist_destroy(reg_agent_args->hostlist);
		xfree(reg_agent_args);
	} else {
		hostlist_uniq(reg_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				reg_agent_args->hostlist);
		debug("Spawning registration agent for %s %d hosts",
		      host_str, reg_agent_args->node_count);
		xfree(host_str);
		ping_begin();
		agent_queue_request(reg_agent_args);
	}

	if (down_hostlist) {
		hostlist_uniq(down_hostlist);
		host_str = hostlist_ranged_string_xmalloc(down_hostlist);
		error("Nodes %s not responding, setting DOWN", host_str);
		xfree(host_str);
		hostlist_destroy(down_hostlist);
	}
}
/*
 * Read a SLURM hostfile specified by "filename".  "filename" must contain
 * a list of SLURM NodeNames, one per line.  Reads up to "n" number of
 * hostnames from the file.  Returns a string representing a hostlist ranged
 * string of the contents of the file.  This is a helper function, it does
 * not contact any SLURM daemons.
 *
 * Returns a string representing the hostlist.  Returns NULL if there are
 * fewer than "n" hostnames in the file, or if an error occurs.  If "n" ==
 * NO_VAL then the entire file is read in.
 *
 * Returned string must be freed with free().
 */
char *slurm_read_hostfile(char *filename, int n)
{
	FILE *fp = NULL;
	char in_line[BUFFER_SIZE];	/* input line */
	int i, j;
	int line_size;
	int line_num = 0;
	hostlist_t hostlist = NULL;
	char *nodelist = NULL;
	char *asterisk, *tmp_text, *save_ptr = NULL, *host_name;
	int total_file_len = 0;

	if (filename == NULL || strlen(filename) == 0)
		return NULL;

	if ((fp = fopen(filename, "r")) == NULL) {
		error("slurm_allocate_resources error opening file %s, %m",
		      filename);
		return NULL;
	}

	hostlist = hostlist_create(NULL);
	if (hostlist == NULL) {
		fclose(fp);
		return NULL;
	}

	while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {
		line_num++;
		if (!isalpha(in_line[0]) && !isdigit(in_line[0])) {
			error("Invalid hostfile %s contents on line %d",
			      filename, line_num);
			fclose(fp);
			hostlist_destroy(hostlist);
			return NULL;
		}

		line_size = strlen(in_line);
		total_file_len += line_size;
		if (line_size == (BUFFER_SIZE - 1)) {
			error("Line %d, of hostfile %s too long",
			      line_num, filename);
			fclose(fp);
			hostlist_destroy(hostlist);
			return NULL;
		}

		for (i = 0; i < line_size; i++) {
			if (in_line[i] == '\n') {
				in_line[i] = '\0';
				break;
			}
			if (in_line[i] == '\0')
				break;
			if (in_line[i] != '#')
				continue;
			if ((i > 0) && (in_line[i - 1] == '\\')) {
				for (j = i; j < line_size; j++) {
					in_line[j - 1] = in_line[j];
				}
				line_size--;
				continue;
			}
			in_line[i] = '\0';
			break;
		}

		tmp_text = xstrdup(in_line);
		host_name = strtok_r(tmp_text, ",", &save_ptr);
		while (host_name) {
			if ((asterisk = strchr(host_name, '*')) &&
			    (i = atoi(asterisk + 1))) {
				asterisk[0] = '\0';
				for (j = 0; j < i; j++)
					hostlist_push_host(hostlist,
							   host_name);
			} else {
				hostlist_push_host(hostlist, host_name);
			}
			host_name = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp_text);

		if ((n != (int)NO_VAL) && (hostlist_count(hostlist) == n))
			break;
	}
	fclose(fp);

	if (hostlist_count(hostlist) <= 0) {
		error("Hostlist is empty!");
		goto cleanup_hostfile;
	}
	if (hostlist_count(hostlist) < n) {
		error("Too few NodeNames in SLURM Hostfile");
		goto cleanup_hostfile;
	}

	total_file_len += 1024;
	nodelist = (char *)malloc(total_file_len);
	if (!nodelist) {
		error("Nodelist malloc failed");
		goto cleanup_hostfile;
	}

	if (hostlist_ranged_string(hostlist, total_file_len, nodelist) == -1) {
		error("Hostlist is too long for the allocate RPC!");
		free(nodelist);
		nodelist = NULL;
		goto cleanup_hostfile;
	}

	debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist);

cleanup_hostfile:
	hostlist_destroy(hostlist);

	return nodelist;
}
extern int slurm_hostlist_push_host(hostlist_t hl, const char *host)
{
	return hostlist_push_host(hl, host);
}
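/*
 * Illustrative sketch, not part of the original source: the public
 * slurm_hostlist_push_host() wrapper in use, together with the other
 * exported hostlist calls from slurm.h. The function name is
 * hypothetical.
 */
static void _example_push(void)
{
	hostlist_t hl = slurm_hostlist_create(NULL);

	slurm_hostlist_push_host(hl, "tux1");
	slurm_hostlist_push_host(hl, "tux3");
	/* slurm_hostlist_count(hl) is now 2 */
	slurm_hostlist_destroy(hl);
}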
/* Reserve ports for a job step
 * NOTE: We keep track of last port reserved and go round-robin through full
 *	 set of available ports. This helps avoid re-using busy ports when
 *	 restarting job steps.
 * RET SLURM_SUCCESS or an error code */
extern int resv_port_alloc(struct step_record *step_ptr)
{
	int i, port_inx;
	int *port_array = NULL;
	char port_str[16], *tmp_str;
	hostlist_t hl;
	static int last_port_alloc = 0;

	if (step_ptr->resv_port_cnt > port_resv_cnt) {
		info("step %u.%u needs %u reserved ports, but only %d exist",
		     step_ptr->job_ptr->job_id, step_ptr->step_id,
		     step_ptr->resv_port_cnt, port_resv_cnt);
		return ESLURM_PORTS_INVALID;
	}

	/* Identify available ports */
	port_array = xmalloc(sizeof(int) * step_ptr->resv_port_cnt);
	port_inx = 0;
	for (i = 0; i < port_resv_cnt; i++) {
		if (++last_port_alloc >= port_resv_cnt)
			last_port_alloc = 0;
		if (bit_overlap(step_ptr->step_node_bitmap,
				port_resv_table[last_port_alloc]))
			continue;
		port_array[port_inx++] = last_port_alloc;
		if (port_inx >= step_ptr->resv_port_cnt)
			break;
	}
	if (port_inx < step_ptr->resv_port_cnt) {
		info("insufficient ports for step %u.%u to reserve (%d of %u)",
		     step_ptr->job_ptr->job_id, step_ptr->step_id,
		     port_inx, step_ptr->resv_port_cnt);
		xfree(port_array);
		return ESLURM_PORTS_BUSY;
	}

	/* Reserve selected ports */
	hl = hostlist_create(NULL);
	for (i = 0; i < port_inx; i++) {
		bit_or(port_resv_table[port_array[i]],
		       step_ptr->step_node_bitmap);
		port_array[i] += port_resv_min;
		snprintf(port_str, sizeof(port_str), "%d", port_array[i]);
		hostlist_push_host(hl, port_str);
	}
	hostlist_sort(hl);
	step_ptr->resv_ports = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);
	step_ptr->resv_port_array = port_array;

	if (step_ptr->resv_ports[0] == '[') {
		/* Remove brackets from hostlist */
		i = strlen(step_ptr->resv_ports);
		step_ptr->resv_ports[i - 1] = '\0';
		tmp_str = xmalloc(i);
		strcpy(tmp_str, step_ptr->resv_ports + 1);
		xfree(step_ptr->resv_ports);
		step_ptr->resv_ports = tmp_str;
	}

	debug("reserved ports %s for step %u.%u",
	      step_ptr->resv_ports,
	      step_ptr->job_ptr->job_id, step_ptr->step_id);

	return SLURM_SUCCESS;
}
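/*
 * Illustrative sketch, not part of the original source: resv_port_alloc()
 * above reuses the hostlist machinery to compress a set of port numbers
 * into a ranged string, exactly as it would node names. A purely numeric
 * "host" has an empty prefix, so the ranged form comes back bracketed,
 * which is why resv_port_alloc() strips the brackets afterward. The
 * function name is hypothetical.
 */
static char *_example_port_range(void)
{
	char buf[16], *ports;
	int p;
	hostlist_t hl = hostlist_create(NULL);

	for (p = 12340; p < 12344; p++) {
		snprintf(buf, sizeof(buf), "%d", p);
		hostlist_push_host(hl, buf);
	}
	hostlist_sort(hl);
	ports = hostlist_ranged_string_xmalloc(hl);	/* "[12340-12343]" */
	hostlist_destroy(hl);
	return ports;	/* caller must xfree() */
}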
/* Spawn health check function for every node that is not DOWN */
extern void run_health_check(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	struct node_record *node_ptr;
	int node_test_cnt = 0, node_limit, node_states, run_cyclic;
	static int base_node_loc = -1;
	static time_t cycle_start_time = (time_t) 0;
#endif
	int i;
	char *host_str = NULL;
	agent_arg_t *check_agent_args = NULL;

	/* Sync plugin internal data with node select_nodeinfo. This is
	 * important after reconfig otherwise select_nodeinfo will not
	 * return the correct number of allocated cpus. */
	select_g_select_nodeinfo_set_all();

	check_agent_args = xmalloc(sizeof(agent_arg_t));
	check_agent_args->msg_type = REQUEST_HEALTH_CHECK;
	check_agent_args->retry = 0;
	check_agent_args->hostlist = hostlist_create(NULL);
#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if (IS_NODE_NO_RESPOND(front_end_ptr))
			continue;
		hostlist_push_host(check_agent_args->hostlist,
				   front_end_ptr->name);
		check_agent_args->node_count++;
	}
#else
	run_cyclic = slurmctld_conf.health_check_node_state &
		     HEALTH_CHECK_CYCLE;
	node_states = slurmctld_conf.health_check_node_state &
		      (~HEALTH_CHECK_CYCLE);
	if (run_cyclic) {
		time_t now = time(NULL);
		if (cycle_start_time == (time_t) 0)
			cycle_start_time = now;
		else if (base_node_loc >= 0)
			;	/* mid-cycle */
		else if (difftime(now, cycle_start_time) <
			 slurmctld_conf.health_check_interval) {
			return;	/* Wait to start next cycle */
		}
		cycle_start_time = now;
		/* Determine how many nodes we want to test on each call of
		 * run_health_check() to spread out the work. */
		node_limit = (node_record_count * 2) /
			     slurmctld_conf.health_check_interval;
		node_limit = MAX(node_limit, 10);
	}
	if ((node_states != HEALTH_CHECK_NODE_ANY) &&
	    (node_states != HEALTH_CHECK_NODE_IDLE)) {
		/* Update each node's alloc_cpus count */
		select_g_select_nodeinfo_set_all();
	}

	for (i = 0; i < node_record_count; i++) {
		if (run_cyclic) {
			if (node_test_cnt++ >= node_limit)
				break;
			base_node_loc++;
			if (base_node_loc >= node_record_count) {
				base_node_loc = -1;
				break;
			}
			node_ptr = node_record_table_ptr + base_node_loc;
		} else {
			node_ptr = node_record_table_ptr + i;
		}
		if (IS_NODE_NO_RESPOND(node_ptr) ||
		    IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if (node_states != HEALTH_CHECK_NODE_ANY) {
			uint16_t cpus_total, cpus_used = 0;
			if (slurmctld_conf.fast_schedule) {
				cpus_total = node_ptr->config_ptr->cpus;
			} else {
				cpus_total = node_ptr->cpus;
			}
			if (!IS_NODE_IDLE(node_ptr)) {
				select_g_select_nodeinfo_get(
						node_ptr->select_nodeinfo,
						SELECT_NODEDATA_SUBCNT,
						NODE_STATE_ALLOCATED,
						&cpus_used);
			}
			/* Here the node state is inferred from
			 * the cpus allocated on it.
			 * - cpus_used == 0
			 *	means node is idle
			 * - cpus_used < cpus_total
			 *	means the node is in mixed state
			 * else cpus_used == cpus_total
			 *	means the node is allocated
			 */
			if (cpus_used == 0) {
				if (!(node_states & HEALTH_CHECK_NODE_IDLE))
					continue;
				if (!IS_NODE_IDLE(node_ptr))
					continue;
			} else if (cpus_used < cpus_total) {
				if (!(node_states & HEALTH_CHECK_NODE_MIXED))
					continue;
			} else {
				if (!(node_states & HEALTH_CHECK_NODE_ALLOC))
					continue;
			}
		}
		hostlist_push_host(check_agent_args->hostlist,
				   node_ptr->name);
		check_agent_args->node_count++;
	}
	if (run_cyclic && (i >= node_record_count))
		base_node_loc = -1;
#endif

	if (check_agent_args->node_count == 0) {
		hostlist_destroy(check_agent_args->hostlist);
		xfree(check_agent_args);
	} else {
		hostlist_uniq(check_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				check_agent_args->hostlist);
		debug("Spawning health check agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(check_agent_args);
	}
}
static void _update_sinfo(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr,
			  uint32_t node_scaling)
{
	uint16_t base_state;
	uint16_t used_cpus = 0, error_cpus = 0;
	int total_cpus = 0, total_nodes = 0;
	/* since node_scaling could be less here, we need to use the
	 * global node scaling which should never change. */
	int single_node_cpus = (node_ptr->cpus / g_node_scaling);

	base_state = node_ptr->node_state & NODE_STATE_BASE;

	if (sinfo_ptr->nodes_total == 0) {	/* first node added */
		sinfo_ptr->node_state  = node_ptr->node_state;
		sinfo_ptr->features    = node_ptr->features;
		sinfo_ptr->gres        = node_ptr->gres;
		sinfo_ptr->reason      = node_ptr->reason;
		sinfo_ptr->reason_time = node_ptr->reason_time;
		sinfo_ptr->reason_uid  = node_ptr->reason_uid;
		sinfo_ptr->min_cpus    = node_ptr->cpus;
		sinfo_ptr->max_cpus    = node_ptr->cpus;
		sinfo_ptr->min_sockets = node_ptr->sockets;
		sinfo_ptr->max_sockets = node_ptr->sockets;
		sinfo_ptr->min_cores   = node_ptr->cores;
		sinfo_ptr->max_cores   = node_ptr->cores;
		sinfo_ptr->min_threads = node_ptr->threads;
		sinfo_ptr->max_threads = node_ptr->threads;
		sinfo_ptr->min_disk    = node_ptr->tmp_disk;
		sinfo_ptr->max_disk    = node_ptr->tmp_disk;
		sinfo_ptr->min_mem     = node_ptr->real_memory;
		sinfo_ptr->max_mem     = node_ptr->real_memory;
		sinfo_ptr->min_weight  = node_ptr->weight;
		sinfo_ptr->max_weight  = node_ptr->weight;
		sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
		sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
		sinfo_ptr->max_cpus_per_node =
			sinfo_ptr->part_info->max_cpus_per_node;
		sinfo_ptr->version     = node_ptr->version;
	} else if (hostlist_find(sinfo_ptr->nodes, node_ptr->name) != -1) {
		/* we already have this node in this record,
		 * just return, don't duplicate */
		return;
	} else {
		if (sinfo_ptr->min_cpus > node_ptr->cpus)
			sinfo_ptr->min_cpus = node_ptr->cpus;
		if (sinfo_ptr->max_cpus < node_ptr->cpus)
			sinfo_ptr->max_cpus = node_ptr->cpus;

		if (sinfo_ptr->min_sockets > node_ptr->sockets)
			sinfo_ptr->min_sockets = node_ptr->sockets;
		if (sinfo_ptr->max_sockets < node_ptr->sockets)
			sinfo_ptr->max_sockets = node_ptr->sockets;

		if (sinfo_ptr->min_cores > node_ptr->cores)
			sinfo_ptr->min_cores = node_ptr->cores;
		if (sinfo_ptr->max_cores < node_ptr->cores)
			sinfo_ptr->max_cores = node_ptr->cores;

		if (sinfo_ptr->min_threads > node_ptr->threads)
			sinfo_ptr->min_threads = node_ptr->threads;
		if (sinfo_ptr->max_threads < node_ptr->threads)
			sinfo_ptr->max_threads = node_ptr->threads;

		if (sinfo_ptr->min_disk > node_ptr->tmp_disk)
			sinfo_ptr->min_disk = node_ptr->tmp_disk;
		if (sinfo_ptr->max_disk < node_ptr->tmp_disk)
			sinfo_ptr->max_disk = node_ptr->tmp_disk;

		if (sinfo_ptr->min_mem > node_ptr->real_memory)
			sinfo_ptr->min_mem = node_ptr->real_memory;
		if (sinfo_ptr->max_mem < node_ptr->real_memory)
			sinfo_ptr->max_mem = node_ptr->real_memory;

		if (sinfo_ptr->min_weight > node_ptr->weight)
			sinfo_ptr->min_weight = node_ptr->weight;
		if (sinfo_ptr->max_weight < node_ptr->weight)
			sinfo_ptr->max_weight = node_ptr->weight;

		if (sinfo_ptr->min_cpu_load > node_ptr->cpu_load)
			sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
		if (sinfo_ptr->max_cpu_load < node_ptr->cpu_load)
			sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
	}

	hostlist_push_host(sinfo_ptr->nodes, node_ptr->name);
	if (params.match_flags.node_addr_flag)
		hostlist_push_host(sinfo_ptr->node_addr, node_ptr->node_addr);
	if (params.match_flags.hostnames_flag)
		hostlist_push_host(sinfo_ptr->hostnames,
				   node_ptr->node_hostname);

	total_cpus  = node_ptr->cpus;
	total_nodes = node_scaling;

	select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
				     SELECT_NODEDATA_SUBCNT,
				     NODE_STATE_ALLOCATED,
				     &used_cpus);
	select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
				     SELECT_NODEDATA_SUBCNT,
				     NODE_STATE_ERROR,
				     &error_cpus);

	if (params.cluster_flags & CLUSTER_FLAG_BG) {
		if (!params.match_flags.state_flag &&
		    (used_cpus || error_cpus)) {
			/* We only get one shot at this (because all states
			 * are combined together), so we need to make
			 * sure we get all the subgrps accounted. (So use
			 * g_node_scaling for safe measure) */
			total_nodes = g_node_scaling;

			sinfo_ptr->nodes_alloc += used_cpus;
			sinfo_ptr->nodes_other += error_cpus;
			sinfo_ptr->nodes_idle +=
				(total_nodes - (used_cpus + error_cpus));
			used_cpus  *= single_node_cpus;
			error_cpus *= single_node_cpus;
		} else {
			/* process only for this subgrp and then return */
			total_cpus = total_nodes * single_node_cpus;

			if ((base_state == NODE_STATE_ALLOCATED) ||
			    (base_state == NODE_STATE_MIXED) ||
			    (node_ptr->node_state & NODE_STATE_COMPLETING)) {
				sinfo_ptr->nodes_alloc += total_nodes;
				sinfo_ptr->cpus_alloc += total_cpus;
			} else if (IS_NODE_DRAIN(node_ptr) ||
				   (base_state == NODE_STATE_DOWN)) {
				sinfo_ptr->nodes_other += total_nodes;
				sinfo_ptr->cpus_other += total_cpus;
			} else {
				sinfo_ptr->nodes_idle += total_nodes;
				sinfo_ptr->cpus_idle += total_cpus;
			}
			sinfo_ptr->nodes_total += total_nodes;
			sinfo_ptr->cpus_total += total_cpus;

			return;
		}
	} else {
		if ((base_state == NODE_STATE_ALLOCATED) ||
		    (base_state == NODE_STATE_MIXED) ||
		    IS_NODE_COMPLETING(node_ptr))
			sinfo_ptr->nodes_alloc += total_nodes;
		else if (IS_NODE_DRAIN(node_ptr) ||
			 (base_state == NODE_STATE_DOWN))
			sinfo_ptr->nodes_other += total_nodes;
		else
			sinfo_ptr->nodes_idle += total_nodes;
	}

	sinfo_ptr->nodes_total += total_nodes;

	sinfo_ptr->cpus_alloc += used_cpus;
	sinfo_ptr->cpus_total += total_cpus;
	total_cpus -= used_cpus + error_cpus;

	if (error_cpus) {
		sinfo_ptr->cpus_idle += total_cpus;
		sinfo_ptr->cpus_other += error_cpus;
	} else if (IS_NODE_DRAIN(node_ptr) ||
		   (base_state == NODE_STATE_DOWN)) {
		sinfo_ptr->cpus_other += total_cpus;
	} else
		sinfo_ptr->cpus_idle += total_cpus;
}
/*
 * Read a Slurm hostfile specified by "filename".  "filename" must contain
 * a list of Slurm NodeNames, one per line.  Reads up to "n" number of
 * hostnames from the file.  Returns a string representing a hostlist ranged
 * string of the contents of the file.  This is a helper function, it does
 * not contact any Slurm daemons.
 *
 * Returns a string representing the hostlist.  Returns NULL if there are
 * fewer than "n" hostnames in the file, or if an error occurs.  If "n" ==
 * NO_VAL then the entire file is read in.
 *
 * Returned string must be freed with free().
 */
char *slurm_read_hostfile(const char *filename, int n)
{
	FILE *fp = NULL;
	char in_line[BUFFER_SIZE];	/* input line */
	int i, j;
	int line_size;
	int line_num = 0;
	hostlist_t hostlist = NULL;
	char *nodelist = NULL, *end_part = NULL;
	char *asterisk, *tmp_text = NULL, *save_ptr = NULL, *host_name;
	int total_file_len = 0;

	if (filename == NULL || strlen(filename) == 0)
		return NULL;

	if ((fp = fopen(filename, "r")) == NULL) {
		error("slurm_allocate_resources error opening file %s, %m",
		      filename);
		return NULL;
	}

	hostlist = hostlist_create(NULL);
	if (hostlist == NULL) {
		fclose(fp);
		return NULL;
	}

	while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {
		line_size = strlen(in_line);
		for (i = 0; i < line_size; i++) {
			if (in_line[i] == '\n') {
				in_line[i] = '\0';
				break;
			}
			if (in_line[i] == '\0')
				break;
			if (in_line[i] != '#')
				continue;
			if ((i > 0) && (in_line[i - 1] == '\\')) {
				for (j = i; j < line_size; j++) {
					in_line[j - 1] = in_line[j];
				}
				line_size--;
				continue;
			}
			in_line[i] = '\0';
			break;
		}

		/*
		 * Get the string length again just in case it changed from
		 * the above loop
		 */
		line_size = strlen(in_line);
		total_file_len += line_size;

		/*
		 * If there was an end section from before set it up to be on
		 * the front of this next chunk.
		 */
		if (end_part) {
			tmp_text = end_part;
			end_part = NULL;
		}

		if (line_size == (BUFFER_SIZE - 1)) {
			/*
			 * If we filled up the buffer get the end past the
			 * last comma.  We will tack it on the next pass
			 * through.
			 */
			char *last_comma = strrchr(in_line, ',');
			if (!last_comma) {
				error("Line %d, of hostfile %s too long",
				      line_num, filename);
				fclose(fp);
				hostlist_destroy(hostlist);
				return NULL;
			}
			end_part = xstrdup(last_comma + 1);
			*last_comma = '\0';
		} else
			line_num++;

		xstrcat(tmp_text, in_line);

		/* Skip this line */
		if (tmp_text[0] == '\0')
			continue;

		if (!isalpha(tmp_text[0]) && !isdigit(tmp_text[0])) {
			error("Invalid hostfile %s contents on line %d",
			      filename, line_num);
			fclose(fp);
			hostlist_destroy(hostlist);
			xfree(end_part);
			xfree(tmp_text);
			return NULL;
		}

		host_name = strtok_r(tmp_text, ",", &save_ptr);
		while (host_name) {
			if ((asterisk = strchr(host_name, '*')) &&
			    (i = atoi(asterisk + 1))) {
				asterisk[0] = '\0';
				/*
				 * Don't forget the extra space potentially
				 * needed
				 */
				total_file_len += strlen(host_name) * i;
				for (j = 0; j < i; j++)
					hostlist_push_host(hostlist,
							   host_name);
			} else {
				hostlist_push_host(hostlist, host_name);
			}
			host_name = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp_text);

		if ((n != (int)NO_VAL) && (hostlist_count(hostlist) == n))
			break;
	}
	fclose(fp);

	if (hostlist_count(hostlist) <= 0) {
		error("Hostlist is empty!");
		goto cleanup_hostfile;
	}
	if (hostlist_count(hostlist) < n) {
		error("Too few NodeNames in Slurm Hostfile");
		goto cleanup_hostfile;
	}

	total_file_len += 1024;
	nodelist = (char *)malloc(total_file_len);
	if (!nodelist) {
		error("Nodelist malloc failed");
		goto cleanup_hostfile;
	}

	if (hostlist_ranged_string(hostlist, total_file_len, nodelist) == -1) {
		error("Hostlist is too long for the allocate RPC!");
		free(nodelist);
		nodelist = NULL;
		goto cleanup_hostfile;
	}

	debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist);

cleanup_hostfile:
	hostlist_destroy(hostlist);
	xfree(end_part);
	xfree(tmp_text);

	return nodelist;
}
/*
 * topo_get_node_addr - build node address and the associated pattern
 *	based on the topology information
 *
 * example of output :
 *	address : s0.s4.s8.tux1
 *	pattern : switch.switch.switch.node
 */
extern int topo_get_node_addr(char *node_name, char **paddr, char **ppattern)
{
	struct node_record *node_ptr;
	int node_inx;
	hostlist_t sl = NULL;

	int s_max_level = 0;
	int i, j;

	/* no switches found, return */
	if (switch_record_cnt == 0) {
		*paddr = xstrdup(node_name);
		*ppattern = xstrdup("node");
		return SLURM_SUCCESS;
	}

	node_ptr = find_node_record(node_name);
	/* node not found in configuration */
	if (node_ptr == NULL)
		return SLURM_ERROR;
	node_inx = node_ptr - node_record_table_ptr;

	/* look for switches max level */
	for (i = 0; i < switch_record_cnt; i++) {
		if (switch_record_table[i].level > s_max_level)
			s_max_level = switch_record_table[i].level;
	}

	/* initialize output parameters */
	*paddr = xstrdup("");
	*ppattern = xstrdup("");

	/* build node topology address and the associated pattern */
	for (j = s_max_level; j >= 0; j--) {
		for (i = 0; i < switch_record_cnt; i++) {
			if (switch_record_table[i].level != j)
				continue;
			if (!bit_test(switch_record_table[i].node_bitmap,
				      node_inx))
				continue;
			if (sl == NULL) {
				sl = hostlist_create(
					switch_record_table[i].name);
			} else {
				hostlist_push_host(sl,
					switch_record_table[i].name);
			}
		}
		if (sl) {
			char *buf = hostlist_ranged_string_xmalloc(sl);
			xstrcat(*paddr, buf);
			xfree(buf);
			hostlist_destroy(sl);
			sl = NULL;
		}
		xstrcat(*paddr, ".");
		xstrcat(*ppattern, "switch.");
	}

	/* append node name */
	xstrcat(*paddr, node_name);
	xstrcat(*ppattern, "node");

	return SLURM_SUCCESS;
}
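/*
 * Illustrative sketch, not part of the original source: calling
 * topo_get_node_addr() and releasing both output strings, which the
 * function allocates with xstrdup()/xstrcat(). The function name is
 * hypothetical.
 */
static void _example_node_addr(char *node_name)
{
	char *addr = NULL, *pattern = NULL;

	if (topo_get_node_addr(node_name, &addr, &pattern)
	    == SLURM_SUCCESS) {
		/* e.g. addr "s0.s4.s8.tux1", pattern "switch.switch.switch.node" */
		debug("%s: address %s pattern %s", node_name, addr, pattern);
		xfree(addr);
		xfree(pattern);
	}
}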
/*
 * forward_msg - logic to forward a message which has been received and
 *	accumulate the return codes from processes getting the
 *	forwarded message
 *
 * IN: forward_struct - forward_struct_t * - holds information about message
 *                                           that needs to be forwarded to
 *                                           children processes
 * IN: header - header_t - header from message that came in
 *                         needing to be forwarded.
 * RET: SLURM_SUCCESS - int
 */
extern int forward_msg(forward_struct_t *forward_struct, header_t *header)
{
	int j = 0;
	int retries = 0;
	forward_msg_t *forward_msg = NULL;
	int thr_count = 0;
	int *span = set_span(header->forward.cnt, 0);
	hostlist_t hl = NULL;
	hostlist_t forward_hl = NULL;
	char *name = NULL;

	if (!forward_struct->ret_list) {
		error("didn't get a ret_list from forward_struct");
		xfree(span);
		return SLURM_ERROR;
	}
	hl = hostlist_create(header->forward.nodelist);
	hostlist_uniq(hl);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		char *buf = NULL;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		forward_msg = &forward_struct->forward_msg[thr_count];
		forward_msg->ret_list = forward_struct->ret_list;

		forward_msg->timeout = forward_struct->timeout;
		if (forward_msg->timeout <= 0) {
			/* convert secs to msec */
			forward_msg->timeout = slurm_get_msg_timeout() * 1000;
		}

		forward_msg->notify = &forward_struct->notify;
		forward_msg->forward_mutex = &forward_struct->forward_mutex;
		forward_msg->buf_len = forward_struct->buf_len;
		forward_msg->buf = forward_struct->buf;

		memcpy(&forward_msg->header.orig_addr,
		       &header->orig_addr, sizeof(slurm_addr_t));

		forward_msg->header.version = header->version;
		forward_msg->header.flags = header->flags;
		forward_msg->header.msg_type = header->msg_type;
		forward_msg->header.body_length = header->body_length;
		forward_msg->header.ret_list = NULL;
		forward_msg->header.ret_cnt = 0;

		forward_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push_host(forward_hl, name);
			free(name);
		}

		buf = hostlist_ranged_string_xmalloc(forward_hl);
		hostlist_destroy(forward_hl);
		forward_init(&forward_msg->header.forward, NULL);
		forward_msg->header.forward.nodelist = buf;
		while (pthread_create(&thread_agent, &attr_agent,
				      _forward_thread,
				      (void *)forward_msg)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
		thr_count++;
	}
	hostlist_destroy(hl);
	xfree(span);
	return SLURM_SUCCESS;
}
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	int rank_count = 0, i;
	hostlist_t hl = hostlist_create(NULL);
	bool bad_node = 0;

	node_rank_inv = 1;
	/*
	 * When obtaining the initial configuration, we can not allow ALPS to
	 * fail. If there is a problem at this stage it is better to restart
	 * SLURM completely, after investigating (and/or fixing) the cause.
	 */
	inv = get_full_inventory(version);
	if (inv == NULL)
		fatal("failed to get BASIL %s ranking",
		      bv_names_long[version]);
	else if (!inv->batch_total)
		fatal("system has no usable batch compute nodes");
	else if (inv->batch_total < node_cnt)
		info("Warning: ALPS sees only %d/%d slurm.conf nodes, "
		     "check DownNodes", inv->batch_total, node_cnt);

	debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/*
	 * Node ranking is based on a subset of the inventory: only nodes in
	 * batch allocation mode which are up and not allocated. Assign a
	 * 'NO_VAL' rank to all other nodes, which will translate as a very
	 * high value, (unsigned)-2, to put those nodes last in the ranking.
	 * The rest of the code must ensure that those nodes are never chosen.
	 */
	for (i = 0; i < node_cnt; i++)
		node_array[i].node_rank = NO_VAL;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char tmp[50];

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory.  If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file.  (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			bad_node = 1;
		} else if ((slurmctld_conf.fast_schedule != 2) &&
			   (node->cpu_count != node_ptr->config_ptr->cpus)) {
			fatal("slurm.conf: node %s has %u cpus "
			      "but configured as CPUs=%u in your slurm.conf",
			      node_ptr->name, node->cpu_count,
			      node_ptr->config_ptr->cpus);
		} else if ((slurmctld_conf.fast_schedule != 2) &&
			   (node->mem_size !=
			    node_ptr->config_ptr->real_memory)) {
			fatal("slurm.conf: node %s has RealMemory=%u "
			      "but configured as RealMemory=%u in your "
			      "slurm.conf", node_ptr->name, node->mem_size,
			      node_ptr->config_ptr->real_memory);
		} else {
			node_ptr->node_rank = inv->nodes_total - rank_count++;
			/*
			 * Convention: since we are using SLURM in
			 * frontend-mode, we use NodeHostName as follows.
			 *
			 * NodeHostName: c#-#c#s#n# using the NID convention
			 *               <cabinet>-<row><chassis><slot><node>
			 * - each cabinet can accommodate 3 chassis (c1..c3)
			 * - each chassis has 8 slots (s0..s7)
			 * - each slot contains 2 or 4 nodes (n0..n3)
			 *   o either 2 service nodes (n0/n3)
			 *   o or 4 compute nodes (n0..n3)
			 *   o or 2 gemini chips (g0/g1 serving n0..n3)
			 *
			 * Example: c0-0c1s0n1
			 *          - c0- = cabinet 0
			 *          - 0   = row 0
			 *          - c1  = chassis 1
			 *          - s0  = slot 0
			 *          - n1  = node 1
			 */
			xfree(node_ptr->node_hostname);
			node_ptr->node_hostname = xstrdup(node->name);
		}

		sprintf(tmp, "nid%05u", node->node_id);
		hostlist_push_host(hl, tmp);
	}
	free_inv(inv);
	if (bad_node) {
		hostlist_sort(hl);
		char *name = hostlist_ranged_string_xmalloc(hl);
		info("It appears your slurm.conf nodelist doesn't "
		     "match the alps system. Here are the nodes alps knows "
		     "about\n%s", name);
		xfree(name);	/* release the xmalloc'd ranged string */
	}
	hostlist_destroy(hl);
	node_rank_inv = 0;

	return SLURM_SUCCESS;
}
/*
 * start_msg_tree - logic to begin the forward tree and
 *	accumulate the return codes from processes getting the
 *	forwarded message
 *
 * IN: hl      - hostlist_t   - list of every node to send message to
 * IN: msg     - slurm_msg_t  - message to send.
 * IN: timeout - int          - how long to wait in milliseconds.
 * RET List    - List containing the responses of the children
 *		 (if any) we forwarded the message to. List
 *		 containing type (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	int *span = NULL;
	fwd_tree_t *fwd_tree = NULL;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int j = 0, count = 0;
	List ret_list = NULL;
	char *name = NULL;
	int thr_count = 0;
	int host_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	span = set_span(host_count, 0);

	slurm_mutex_init(&tree_mutex);
	pthread_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		fwd_tree->orig_msg = msg;
		fwd_tree->ret_list = ret_list;
		fwd_tree->timeout = timeout;
		fwd_tree->notify = &notify;
		fwd_tree->p_thr_count = &thr_count;
		fwd_tree->tree_mutex = &tree_mutex;

		if (fwd_tree->timeout <= 0) {
			/* convert secs to msec */
			fwd_tree->timeout = slurm_get_msg_timeout() * 1000;
		}

		fwd_tree->tree_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push_host(fwd_tree->tree_hl, name);
			free(name);
		}

		/*
		 * Lock and increase thread counter, we need that to protect
		 * the start_msg_tree waiting loop that was originally designed
		 * around a "while ((count < host_count))" loop. In case where
		 * a fwd thread was not able to get all the return codes from
		 * children, the waiting loop was deadlocked.
		 */
		slurm_mutex_lock(&tree_mutex);
		thr_count++;
		slurm_mutex_unlock(&tree_mutex);

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
	}
	xfree(span);

	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while (thr_count > 0) {
		pthread_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	xassert(count >= host_count);	/* Tree head did not get all responses,
					 * but no more active fwd threads! */
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	pthread_cond_destroy(&notify);

	return ret_list;
}
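/*
 * Illustrative sketch, not part of the original source: sending a message
 * to a set of nodes with start_msg_tree() and draining the returned list
 * of ret_data_info_t responses. The function name is hypothetical; a
 * timeout of 0 falls back to the configured message timeout as shown in
 * start_msg_tree() above.
 */
static void _example_tree_send(slurm_msg_t *msg, char *nodes)
{
	hostlist_t hl = hostlist_create(nodes);
	List ret_list = start_msg_tree(hl, msg, 0);
	ret_data_info_t *ret_data_info;

	if (ret_list) {
		ListIterator itr = list_iterator_create(ret_list);
		while ((ret_data_info = list_next(itr)))
			debug2("response from %s", ret_data_info->node_name);
		list_iterator_destroy(itr);
		FREE_NULL_LIST(ret_list);
	}
	hostlist_destroy(hl);
}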
/* * slurm_sprint_job_info - output information about a specific Slurm * job based upon message as loaded using slurm_load_jobs * IN job_ptr - an individual job information record pointer * IN one_liner - print as a single line if true * RET out - char * containing formatted output (must be freed after call) * NULL is returned on failure. */ extern char * slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) { int i, j, k; char time_str[32], *group_name, *user_name; char *gres_last = "", tmp1[128], tmp2[128]; char *tmp6_ptr; char tmp_line[1024 * 128]; char tmp_path[MAXPATHLEN]; char *ionodes = NULL; uint16_t exit_status = 0, term_sig = 0; job_resources_t *job_resrcs = job_ptr->job_resrcs; char *out = NULL; time_t run_time; uint32_t min_nodes, max_nodes = 0; char *nodelist = "NodeList"; bitstr_t *cpu_bitmap; char *host; int sock_inx, sock_reps, last; int abs_node_inx, rel_node_inx; int64_t nice; int bit_inx, bit_reps; uint64_t *last_mem_alloc_ptr = NULL; uint64_t last_mem_alloc = NO_VAL64; char *last_hosts; hostlist_t hl, hl_last; char select_buf[122]; uint32_t cluster_flags = slurmdb_setup_cluster_flags(); uint32_t threads; char *line_end = (one_liner) ? " " : "\n "; if (cluster_flags & CLUSTER_FLAG_BG) { nodelist = "MidplaneList"; select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_IONODES, &ionodes); } /****** Line 1 ******/ xstrfmtcat(out, "JobId=%u ", job_ptr->job_id); if (job_ptr->array_job_id) { if (job_ptr->array_task_str) { xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%s ", job_ptr->array_job_id, job_ptr->array_task_str); } else { xstrfmtcat(out, "ArrayJobId=%u ArrayTaskId=%u ", job_ptr->array_job_id, job_ptr->array_task_id); } } xstrfmtcat(out, "JobName=%s", job_ptr->name); xstrcat(out, line_end); /****** Line 2 ******/ user_name = uid_to_string((uid_t) job_ptr->user_id); group_name = gid_to_string((gid_t) job_ptr->group_id); xstrfmtcat(out, "UserId=%s(%u) GroupId=%s(%u) MCS_label=%s", user_name, job_ptr->user_id, group_name, job_ptr->group_id, (job_ptr->mcs_label==NULL) ? 
"N/A" : job_ptr->mcs_label); xfree(user_name); xfree(group_name); xstrcat(out, line_end); /****** Line 3 ******/ nice = ((int64_t)job_ptr->nice) - NICE_OFFSET; xstrfmtcat(out, "Priority=%u Nice=%"PRIi64" Account=%s QOS=%s", job_ptr->priority, nice, job_ptr->account, job_ptr->qos); if (slurm_get_track_wckey()) xstrfmtcat(out, " WCKey=%s", job_ptr->wckey); xstrcat(out, line_end); /****** Line 4 ******/ xstrfmtcat(out, "JobState=%s ", job_state_string(job_ptr->job_state)); if (job_ptr->state_desc) { /* Replace white space with underscore for easier parsing */ for (j=0; job_ptr->state_desc[j]; j++) { if (isspace((int)job_ptr->state_desc[j])) job_ptr->state_desc[j] = '_'; } xstrfmtcat(out, "Reason=%s ", job_ptr->state_desc); } else xstrfmtcat(out, "Reason=%s ", job_reason_string(job_ptr->state_reason)); xstrfmtcat(out, "Dependency=%s", job_ptr->dependency); xstrcat(out, line_end); /****** Line 5 ******/ xstrfmtcat(out, "Requeue=%u Restarts=%u BatchFlag=%u Reboot=%u ", job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag, job_ptr->reboot); if (WIFSIGNALED(job_ptr->exit_code)) term_sig = WTERMSIG(job_ptr->exit_code); exit_status = WEXITSTATUS(job_ptr->exit_code); xstrfmtcat(out, "ExitCode=%u:%u", exit_status, term_sig); xstrcat(out, line_end); /****** Line 5a (optional) ******/ if (job_ptr->show_flags & SHOW_DETAIL) { if (WIFSIGNALED(job_ptr->derived_ec)) term_sig = WTERMSIG(job_ptr->derived_ec); else term_sig = 0; exit_status = WEXITSTATUS(job_ptr->derived_ec); xstrfmtcat(out, "DerivedExitCode=%u:%u", exit_status, term_sig); xstrcat(out, line_end); } /****** Line 6 ******/ if (IS_JOB_PENDING(job_ptr)) run_time = 0; else if (IS_JOB_SUSPENDED(job_ptr)) run_time = job_ptr->pre_sus_time; else { time_t end_time; if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0)) end_time = time(NULL); else end_time = job_ptr->end_time; if (job_ptr->suspend_time) { run_time = (time_t) (difftime(end_time, job_ptr->suspend_time) + job_ptr->pre_sus_time); } else run_time = (time_t) difftime(end_time, job_ptr->start_time); } secs2time_str(run_time, time_str, sizeof(time_str)); xstrfmtcat(out, "RunTime=%s ", time_str); if (job_ptr->time_limit == NO_VAL) xstrcat(out, "TimeLimit=Partition_Limit "); else { mins2time_str(job_ptr->time_limit, time_str, sizeof(time_str)); xstrfmtcat(out, "TimeLimit=%s ", time_str); } if (job_ptr->time_min == 0) xstrcat(out, "TimeMin=N/A"); else { mins2time_str(job_ptr->time_min, time_str, sizeof(time_str)); xstrfmtcat(out, "TimeMin=%s", time_str); } xstrcat(out, line_end); /****** Line 7 ******/ slurm_make_time_str(&job_ptr->submit_time, time_str, sizeof(time_str)); xstrfmtcat(out, "SubmitTime=%s ", time_str); slurm_make_time_str(&job_ptr->eligible_time, time_str, sizeof(time_str)); xstrfmtcat(out, "EligibleTime=%s", time_str); xstrcat(out, line_end); /****** Line 8 (optional) ******/ if (job_ptr->resize_time) { slurm_make_time_str(&job_ptr->resize_time, time_str, sizeof(time_str)); xstrfmtcat(out, "ResizeTime=%s", time_str); xstrcat(out, line_end); } /****** Line 9 ******/ slurm_make_time_str(&job_ptr->start_time, time_str, sizeof(time_str)); xstrfmtcat(out, "StartTime=%s ", time_str); if ((job_ptr->time_limit == INFINITE) && (job_ptr->end_time > time(NULL))) xstrcat(out, "EndTime=Unknown "); else { slurm_make_time_str(&job_ptr->end_time, time_str, sizeof(time_str)); xstrfmtcat(out, "EndTime=%s ", time_str); } if (job_ptr->deadline) { slurm_make_time_str(&job_ptr->deadline, time_str, sizeof(time_str)); xstrfmtcat(out, "Deadline=%s", time_str); } else { xstrcat(out, "Deadline=N/A"); 
} xstrcat(out, line_end); /****** Line 10 ******/ if (job_ptr->preempt_time == 0) xstrcat(out, "PreemptTime=None "); else { slurm_make_time_str(&job_ptr->preempt_time, time_str, sizeof(time_str)); xstrfmtcat(out, "PreemptTime=%s ", time_str); } if (job_ptr->suspend_time) { slurm_make_time_str(&job_ptr->suspend_time, time_str, sizeof(time_str)); xstrfmtcat(out, "SuspendTime=%s ", time_str); } else xstrcat(out, "SuspendTime=None "); xstrfmtcat(out, "SecsPreSuspend=%ld", (long int)job_ptr->pre_sus_time); xstrcat(out, line_end); /****** Line 11 ******/ xstrfmtcat(out, "Partition=%s AllocNode:Sid=%s:%u", job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid); xstrcat(out, line_end); /****** Line 12 ******/ xstrfmtcat(out, "Req%s=%s Exc%s=%s", nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes); xstrcat(out, line_end); /****** Line 13 ******/ xstrfmtcat(out, "%s=%s", nodelist, job_ptr->nodes); if (job_ptr->nodes && ionodes) { xstrfmtcat(out, "[%s]", ionodes); xfree(ionodes); } if (job_ptr->sched_nodes) xstrfmtcat(out, " Sched%s=%s", nodelist, job_ptr->sched_nodes); xstrcat(out, line_end); /****** Line 14 (optional) ******/ if (job_ptr->batch_host) { xstrfmtcat(out, "BatchHost=%s", job_ptr->batch_host); xstrcat(out, line_end); } /****** Line 14a (optional) ******/ if (job_ptr->fed_siblings) { xstrfmtcat(out, "FedOrigin=%s FedSiblings=%s", job_ptr->fed_origin_str, job_ptr->fed_siblings_str); xstrcat(out, line_end); } /****** Line 15 ******/ if (cluster_flags & CLUSTER_FLAG_BG) { select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &min_nodes); if ((min_nodes == 0) || (min_nodes == NO_VAL)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; } else if (job_ptr->max_nodes) max_nodes = min_nodes; } else if (IS_JOB_PENDING(job_ptr)) { min_nodes = job_ptr->num_nodes; max_nodes = job_ptr->max_nodes; if (max_nodes && (max_nodes < min_nodes)) min_nodes = max_nodes; } else { min_nodes = job_ptr->num_nodes; max_nodes = 0; } _sprint_range(tmp_line, sizeof(tmp_line), min_nodes, max_nodes); xstrfmtcat(out, "NumNodes=%s ", tmp_line); _sprint_range(tmp_line, sizeof(tmp_line), job_ptr->num_cpus, job_ptr->max_cpus); xstrfmtcat(out, "NumCPUs=%s ", tmp_line); xstrfmtcat(out, "NumTasks=%u ", job_ptr->num_tasks); xstrfmtcat(out, "CPUs/Task=%u ", job_ptr->cpus_per_task); if (job_ptr->boards_per_node == (uint16_t) NO_VAL) xstrcat(out, "ReqB:S:C:T=*:"); else xstrfmtcat(out, "ReqB:S:C:T=%u:", job_ptr->boards_per_node); if (job_ptr->sockets_per_board == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->sockets_per_board); if (job_ptr->cores_per_socket == (uint16_t) NO_VAL) xstrcat(out, "*:"); else xstrfmtcat(out, "%u:", job_ptr->cores_per_socket); if (job_ptr->threads_per_core == (uint16_t) NO_VAL) xstrcat(out, "*"); else xstrfmtcat(out, "%u", job_ptr->threads_per_core); xstrcat(out, line_end); /****** Line 16 ******/ /* Tres should already of been converted at this point from simple */ xstrfmtcat(out, "TRES=%s", job_ptr->tres_alloc_str ? 
        job_ptr->tres_alloc_str : job_ptr->tres_req_str);
    xstrcat(out, line_end);

    /****** Line 17 ******/
    if (job_ptr->sockets_per_node == (uint16_t) NO_VAL)
        xstrcat(out, "Socks/Node=* ");
    else
        xstrfmtcat(out, "Socks/Node=%u ", job_ptr->sockets_per_node);
    if (job_ptr->ntasks_per_node == (uint16_t) NO_VAL)
        xstrcat(out, "NtasksPerN:B:S:C=*:");
    else
        xstrfmtcat(out, "NtasksPerN:B:S:C=%u:", job_ptr->ntasks_per_node);
    if (job_ptr->ntasks_per_board == (uint16_t) NO_VAL)
        xstrcat(out, "*:");
    else
        xstrfmtcat(out, "%u:", job_ptr->ntasks_per_board);
    if ((job_ptr->ntasks_per_socket == (uint16_t) NO_VAL) ||
        (job_ptr->ntasks_per_socket == (uint16_t) INFINITE))
        xstrcat(out, "*:");
    else
        xstrfmtcat(out, "%u:", job_ptr->ntasks_per_socket);
    if ((job_ptr->ntasks_per_core == (uint16_t) NO_VAL) ||
        (job_ptr->ntasks_per_core == (uint16_t) INFINITE))
        xstrcat(out, "* ");
    else
        xstrfmtcat(out, "%u ", job_ptr->ntasks_per_core);
    if (job_ptr->core_spec == (uint16_t) NO_VAL)
        xstrcat(out, "CoreSpec=*");
    else if (job_ptr->core_spec & CORE_SPEC_THREAD)
        xstrfmtcat(out, "ThreadSpec=%d",
                   (job_ptr->core_spec & (~CORE_SPEC_THREAD)));
    else
        xstrfmtcat(out, "CoreSpec=%u", job_ptr->core_spec);
    xstrcat(out, line_end);

    if (job_resrcs && (cluster_flags & CLUSTER_FLAG_BG)) {
        if ((job_resrcs->cpu_array_cnt > 0) &&
            (job_resrcs->cpu_array_value) &&
            (job_resrcs->cpu_array_reps)) {
            int length = 0;
            xstrcat(out, "CPUs=");
            for (i = 0; i < job_resrcs->cpu_array_cnt; i++) {
                /* only print 60 characters worth of this record */
                if (length > 60) {
                    /* skip to last CPU group entry */
                    if (i < job_resrcs->cpu_array_cnt - 1)
                        continue;
                    /* add ellipsis before last entry */
                    xstrcat(out, "...,");
                }
                /* cpu_array_value[] (not cpus[]) holds the CPU count
                 * for this group of nodes, matching the guard above */
                length += xstrfmtcat(out, "%d",
                                     job_resrcs->cpu_array_value[i]);
                if (job_resrcs->cpu_array_reps[i] > 1) {
                    length += xstrfmtcat(out, "*%d",
                                         job_resrcs->cpu_array_reps[i]);
                }
                if (i < job_resrcs->cpu_array_cnt - 1) {
                    xstrcat(out, ",");
                    length++;
                }
            }
            xstrcat(out, line_end);
        }
    } else if (job_resrcs && job_resrcs->core_bitmap &&
               ((last = bit_fls(job_resrcs->core_bitmap)) != -1)) {
        hl = hostlist_create(job_resrcs->nodes);
        if (!hl) {
            error("slurm_sprint_job_info: hostlist_create: %s",
                  job_resrcs->nodes);
            return NULL;
        }
        hl_last = hostlist_create(NULL);
        if (!hl_last) {
            error("slurm_sprint_job_info: hostlist_create: NULL");
            hostlist_destroy(hl);
            return NULL;
        }

        bit_inx = 0;
        i = sock_inx = sock_reps = 0;
        abs_node_inx = job_ptr->node_inx[i];

        gres_last = "";
        /* tmp1[] stores the current cpu(s) allocated */
        tmp2[0] = '\0';    /* stores last cpu(s) allocated */
        for (rel_node_inx = 0; rel_node_inx < job_resrcs->nhosts;
             rel_node_inx++) {

            if (sock_reps >=
                job_resrcs->sock_core_rep_count[sock_inx]) {
                sock_inx++;
                sock_reps = 0;
            }
            sock_reps++;

            bit_reps = job_resrcs->sockets_per_node[sock_inx] *
                       job_resrcs->cores_per_socket[sock_inx];
            host = hostlist_shift(hl);
            threads = _threads_per_core(host);
            cpu_bitmap = bit_alloc(bit_reps * threads);
            for (j = 0; j < bit_reps; j++) {
                if (bit_test(job_resrcs->core_bitmap, bit_inx)) {
                    for (k = 0; k < threads; k++)
                        bit_set(cpu_bitmap, (j * threads) + k);
                }
                bit_inx++;
            }
            bit_fmt(tmp1, sizeof(tmp1), cpu_bitmap);
            FREE_NULL_BITMAP(cpu_bitmap);

            /*
             * If the allocation values for this host are not the
             * same as the last host, print the report of the last
             * group of hosts that had identical allocation values.
             */
            if (xstrcmp(tmp1, tmp2) ||
                ((rel_node_inx < job_ptr->gres_detail_cnt) &&
                 xstrcmp(job_ptr->gres_detail_str[rel_node_inx],
                         gres_last)) ||
                (last_mem_alloc_ptr != job_resrcs->memory_allocated) ||
                (job_resrcs->memory_allocated &&
                 (last_mem_alloc !=
                  job_resrcs->memory_allocated[rel_node_inx]))) {
                if (hostlist_count(hl_last)) {
                    last_hosts =
                        hostlist_ranged_string_xmalloc(hl_last);
                    xstrfmtcat(out,
                               " Nodes=%s CPU_IDs=%s Mem=%"PRIu64
                               " GRES_IDX=%s",
                               last_hosts, tmp2,
                               last_mem_alloc_ptr ? last_mem_alloc : 0,
                               gres_last);
                    xfree(last_hosts);
                    xstrcat(out, line_end);

                    hostlist_destroy(hl_last);
                    hl_last = hostlist_create(NULL);
                }
                strcpy(tmp2, tmp1);
                if (rel_node_inx < job_ptr->gres_detail_cnt) {
                    gres_last =
                        job_ptr->gres_detail_str[rel_node_inx];
                } else {
                    gres_last = "";
                }
                last_mem_alloc_ptr = job_resrcs->memory_allocated;
                if (last_mem_alloc_ptr)
                    last_mem_alloc = job_resrcs->
                        memory_allocated[rel_node_inx];
                else
                    last_mem_alloc = NO_VAL64;
            }
            hostlist_push_host(hl_last, host);
            free(host);

            if (bit_inx > last)
                break;

            if (abs_node_inx > job_ptr->node_inx[i+1]) {
                i += 2;
                abs_node_inx = job_ptr->node_inx[i];
            } else {
                abs_node_inx++;
            }
        }

        if (hostlist_count(hl_last)) {
            last_hosts = hostlist_ranged_string_xmalloc(hl_last);
            xstrfmtcat(out,
                       " Nodes=%s CPU_IDs=%s Mem=%"PRIu64" GRES_IDX=%s",
                       last_hosts, tmp2,
                       last_mem_alloc_ptr ? last_mem_alloc : 0,
                       gres_last);
            xfree(last_hosts);
            xstrcat(out, line_end);
        }
        hostlist_destroy(hl);
        hostlist_destroy(hl_last);
    }

    /****** Line 18 ******/
    if (job_ptr->pn_min_memory & MEM_PER_CPU) {
        job_ptr->pn_min_memory &= (~MEM_PER_CPU);
        tmp6_ptr = "CPU";
    } else
        tmp6_ptr = "Node";

    if (cluster_flags & CLUSTER_FLAG_BG) {
        convert_num_unit((float)job_ptr->pn_min_cpus, tmp1,
                         sizeof(tmp1), UNIT_NONE, NO_VAL,
                         CONVERT_NUM_UNIT_EXACT);
        xstrfmtcat(out, "MinCPUsNode=%s ", tmp1);
    } else {
        xstrfmtcat(out, "MinCPUsNode=%u ", job_ptr->pn_min_cpus);
    }

    convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1),
                     UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT);
    convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2),
                     UNIT_MEGA, NO_VAL, CONVERT_NUM_UNIT_EXACT);
    xstrfmtcat(out, "MinMemory%s=%s MinTmpDiskNode=%s",
               tmp6_ptr, tmp1, tmp2);
    xstrcat(out, line_end);

    /****** Line ******/
    secs2time_str((time_t)job_ptr->delay_boot, tmp1, sizeof(tmp1));
    xstrfmtcat(out, "Features=%s DelayBoot=%s", job_ptr->features, tmp1);
    xstrcat(out, line_end);

    /****** Line ******/
    xstrfmtcat(out, "Gres=%s Reservation=%s",
               job_ptr->gres, job_ptr->resv_name);
    xstrcat(out, line_end);

    /****** Line 20 ******/
    xstrfmtcat(out, "OverSubscribe=%s Contiguous=%d Licenses=%s Network=%s",
               job_share_string(job_ptr->shared), job_ptr->contiguous,
               job_ptr->licenses, job_ptr->network);
    xstrcat(out, line_end);

    /****** Line 21 ******/
    xstrfmtcat(out, "Command=%s", job_ptr->command);
    xstrcat(out, line_end);

    /****** Line 22 ******/
    xstrfmtcat(out, "WorkDir=%s", job_ptr->work_dir);

    if (cluster_flags & CLUSTER_FLAG_BG) {
        /****** Line 23 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_BG_ID);
        if (select_buf[0] != '\0') {
            xstrcat(out, line_end);
            xstrfmtcat(out, "Block_ID=%s", select_buf);
        }

        /****** Line 24 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_MIXED_SHORT);
        if (select_buf[0] != '\0') {
            xstrcat(out, line_end);
            xstrcat(out, select_buf);
        }

        /****** Line 26 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_LINUX_IMAGE);
        if (select_buf[0] != '\0') {
            xstrcat(out, line_end);
            xstrfmtcat(out, "CnloadImage=%s", select_buf);
        }

        /****** Line 27 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_MLOADER_IMAGE);
        if (select_buf[0] != '\0') {
            xstrcat(out, line_end);
            xstrfmtcat(out, "MloaderImage=%s", select_buf);
        }

        /****** Line 28 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_RAMDISK_IMAGE);
        if (select_buf[0] != '\0') {
            xstrcat(out, line_end);
            xstrfmtcat(out, "IoloadImage=%s", select_buf);
        }
    }

    /****** Line (optional) ******/
    if (job_ptr->admin_comment) {
        xstrcat(out, line_end);
        xstrfmtcat(out, "AdminComment=%s ", job_ptr->admin_comment);
    }

    /****** Line (optional) ******/
    if (job_ptr->comment) {
        xstrcat(out, line_end);
        xstrfmtcat(out, "Comment=%s ", job_ptr->comment);
    }

    /****** Line 30 (optional) ******/
    if (job_ptr->batch_flag) {
        xstrcat(out, line_end);
        slurm_get_job_stderr(tmp_path, sizeof(tmp_path), job_ptr);
        xstrfmtcat(out, "StdErr=%s", tmp_path);
    }

    /****** Line 31 (optional) ******/
    if (job_ptr->batch_flag) {
        xstrcat(out, line_end);
        slurm_get_job_stdin(tmp_path, sizeof(tmp_path), job_ptr);
        xstrfmtcat(out, "StdIn=%s", tmp_path);
    }

    /****** Line 32 (optional) ******/
    if (job_ptr->batch_flag) {
        xstrcat(out, line_end);
        slurm_get_job_stdout(tmp_path, sizeof(tmp_path), job_ptr);
        xstrfmtcat(out, "StdOut=%s", tmp_path);
    }

    /****** Line 33 (optional) ******/
    if (job_ptr->batch_script) {
        xstrcat(out, line_end);
        xstrcat(out, "BatchScript=\n");
        xstrcat(out, job_ptr->batch_script);
    }

    /****** Line 34 (optional) ******/
    if (job_ptr->req_switch) {
        char time_buf[32];
        xstrcat(out, line_end);
        secs2time_str((time_t) job_ptr->wait4switch, time_buf,
                      sizeof(time_buf));
        xstrfmtcat(out, "Switches=%u@%s\n", job_ptr->req_switch, time_buf);
    }

    /****** Line 35 (optional) ******/
    if (job_ptr->burst_buffer) {
        xstrcat(out, line_end);
        xstrfmtcat(out, "BurstBuffer=%s", job_ptr->burst_buffer);
    }

    /****** Line (optional) ******/
    if (job_ptr->burst_buffer_state) {
        xstrcat(out, line_end);
        xstrfmtcat(out, "BurstBufferState=%s", job_ptr->burst_buffer_state);
    }

    /****** Line 36 (optional) ******/
    if (cpu_freq_debug(NULL, NULL, tmp1, sizeof(tmp1),
                       job_ptr->cpu_freq_gov, job_ptr->cpu_freq_min,
                       job_ptr->cpu_freq_max, NO_VAL) != 0) {
        xstrcat(out, line_end);
        xstrcat(out, tmp1);
    }

    /****** Line 37 ******/
    xstrcat(out, line_end);
    xstrfmtcat(out, "Power=%s", power_flags_str(job_ptr->power_flags));

    /****** Line 38 (optional) ******/
    if (job_ptr->bitflags) {
        xstrcat(out, line_end);
        /* trailing spaces keep multiple flags from running together */
        if (job_ptr->bitflags & GRES_ENFORCE_BIND)
            xstrcat(out, "GresEnforceBind=Yes ");
        if (job_ptr->bitflags & KILL_INV_DEP)
            xstrcat(out, "KillOInInvalidDependent=Yes ");
        if (job_ptr->bitflags & NO_KILL_INV_DEP)
            xstrcat(out, "KillOInInvalidDependent=No ");
        if (job_ptr->bitflags & SPREAD_JOB)
            xstrcat(out, "SpreadJob=Yes ");
    }

    /****** END OF JOB RECORD ******/
    if (one_liner)
        xstrcat(out, "\n");
    else
        xstrcat(out, "\n\n");

    return out;
}
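/*
 * Illustrative sketch (not part of the original sources): the report loop
 * above accumulates hosts in hl_last while the CPU/GRES/memory values match
 * the previous host, and flushes one "Nodes=... CPU_IDs=..." record whenever
 * any of them changes. The same accumulate-and-flush pattern, reduced to its
 * core: hosts {n1,n2,n3} with values {a,a,b} print as n[1-2]/a then n3/b.
 * The include path and the free() of the ranged string are assumptions for
 * a standalone build against SLURM's hostlist API.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "src/common/hostlist.h"    /* assumed include path */

static void flush_group(hostlist_t hl, const char *val)
{
    char *hosts = hostlist_ranged_string_xmalloc(hl);
    printf("Nodes=%s Value=%s\n", hosts, val);
    free(hosts);    /* xfree() inside SLURM proper */
}

/* Print one record per run of consecutive hosts sharing the same value. */
static void print_grouped(char **hosts, char **vals, int cnt)
{
    char last[128] = "";
    hostlist_t hl = hostlist_create(NULL);

    for (int i = 0; i < cnt; i++) {
        if (strcmp(vals[i], last)) {    /* value changed: flush group */
            if (hostlist_count(hl)) {
                flush_group(hl, last);
                hostlist_destroy(hl);
                hl = hostlist_create(NULL);
            }
            snprintf(last, sizeof(last), "%s", vals[i]);
        }
        hostlist_push_host(hl, hosts[i]);   /* extend current group */
    }
    if (hostlist_count(hl))    /* flush the final group */
        flush_group(hl, last);
    hostlist_destroy(hl);
}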
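/*
 * Illustrative sketch (not part of the original sources): a minimal caller
 * for the slurm_sprint_job_info() variants in this file -- load the job
 * table, print each record, and free the returned string, per the
 * function's contract. Uses the public <slurm/slurm.h> API; freeing the
 * result with plain free() is an assumption that holds where SLURM's
 * xmalloc() is malloc()-backed.
 */
#include <stdio.h>
#include <stdlib.h>
#include <slurm/slurm.h>

int print_all_jobs(int one_liner)
{
    job_info_msg_t *msg = NULL;

    if (slurm_load_jobs((time_t) 0, &msg, SHOW_ALL) != SLURM_SUCCESS) {
        slurm_perror("slurm_load_jobs");
        return -1;
    }
    for (uint32_t i = 0; i < msg->record_count; i++) {
        char *txt = slurm_sprint_job_info(&msg->job_array[i], one_liner);
        if (txt) {
            printf("%s", txt);
            free(txt);    /* caller owns the returned string */
        }
    }
    slurm_free_job_info_msg(msg);
    return 0;
}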
static char *_dump_all_nodes(int *node_cnt, time_t update_time)
{
    int i, cnt = 0, rc;
    struct node_record *node_ptr = node_record_table_ptr;
    char *tmp_buf = NULL, *buf = NULL;
    struct node_record *uniq_node_ptr = NULL;
    hostlist_t hl = NULL;

    for (i = 0; i < node_record_count; i++, node_ptr++) {
        if (node_ptr->name == NULL)
            continue;
        if (IS_NODE_FUTURE(node_ptr))
            continue;
        if (_hidden_node(node_ptr))
            continue;

        if (use_host_exp == 2) {
            rc = _same_info(uniq_node_ptr, node_ptr, update_time);
            if (rc == 0) {
                uniq_node_ptr = node_ptr;
                if (hl) {
                    hostlist_push_host(hl, node_ptr->name);
                } else {
                    hl = hostlist_create(node_ptr->name);
                    if (!hl) {
                        fatal("Invalid node_name: %s",
                              node_ptr->name);
                    }
                }
                continue;
            } else {
                tmp_buf = _dump_node(uniq_node_ptr, hl, update_time);
                hostlist_destroy(hl);
                hl = hostlist_create(node_ptr->name);
                if (!hl) {
                    fatal("Invalid node_name: %s", node_ptr->name);
                }
                uniq_node_ptr = node_ptr;
            }
        } else {
            tmp_buf = _dump_node(node_ptr, hl, update_time);
        }
        if (cnt > 0)
            xstrcat(buf, "#");
        xstrcat(buf, tmp_buf);
        xfree(tmp_buf);
        cnt++;
    }

    if (hl) {
        tmp_buf = _dump_node(uniq_node_ptr, hl, update_time);
        hostlist_destroy(hl);
        if (cnt > 0)
            xstrcat(buf, "#");
        xstrcat(buf, tmp_buf);
        xfree(tmp_buf);
        cnt++;
    }

    *node_cnt = cnt;
    return buf;
}
/*
 * slurm_sprint_job_info - output information about a specific Slurm
 *	job based upon message as loaded using slurm_load_jobs
 * IN job_ptr - an individual job information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *	NULL is returned on failure.
 */
extern char *slurm_sprint_job_info(job_info_t *job_ptr, int one_liner)
{
    int i, j;
    char time_str[32], *group_name, *user_name;
    char tmp1[128], tmp2[128], tmp3[128], tmp4[128], tmp5[128], *tmp6_ptr;
    char tmp_line[512];
    char *ionodes = NULL;
    uint16_t exit_status = 0, term_sig = 0;
    job_resources_t *job_resrcs = job_ptr->job_resrcs;
    char *out = NULL;
    time_t run_time;
    uint32_t min_nodes, max_nodes = 0;
    char *nodelist = "NodeList";
    bitstr_t *core_bitmap;
    char *host;
    int sock_inx, sock_reps, last;
    int abs_node_inx, rel_node_inx;
    int bit_inx, bit_reps;
    uint32_t *last_mem_alloc_ptr = NULL;
    uint32_t last_mem_alloc = NO_VAL;
    char *last_hosts;
    hostlist_t hl, hl_last;
    char select_buf[122];
    uint32_t cluster_flags = slurmdb_setup_cluster_flags();

    if (cluster_flags & CLUSTER_FLAG_BG) {
        nodelist = "MidplaneList";
        select_g_select_jobinfo_get(job_ptr->select_jobinfo,
                                    SELECT_JOBDATA_IONODES,
                                    &ionodes);
    }

    /****** Line 1 ******/
    snprintf(tmp_line, sizeof(tmp_line), "JobId=%u ", job_ptr->job_id);
    out = xstrdup(tmp_line);
    if (job_ptr->array_job_id) {
        snprintf(tmp_line, sizeof(tmp_line),
                 "ArrayJobId=%u ArrayTaskId=%u ",
                 job_ptr->array_job_id, job_ptr->array_task_id);
        xstrcat(out, tmp_line);
    }
    snprintf(tmp_line, sizeof(tmp_line), "Name=%s", job_ptr->name);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 2 ******/
    user_name = uid_to_string((uid_t) job_ptr->user_id);
    group_name = gid_to_string((gid_t) job_ptr->group_id);
    snprintf(tmp_line, sizeof(tmp_line), "UserId=%s(%u) GroupId=%s(%u)",
             user_name, job_ptr->user_id, group_name, job_ptr->group_id);
    xfree(user_name);
    xfree(group_name);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 3 ******/
    snprintf(tmp_line, sizeof(tmp_line), "Priority=%u Account=%s QOS=%s",
             job_ptr->priority, job_ptr->account, job_ptr->qos);
    xstrcat(out, tmp_line);
    if (slurm_get_track_wckey()) {
        snprintf(tmp_line, sizeof(tmp_line), " WCKey=%s", job_ptr->wckey);
        xstrcat(out, tmp_line);
    }
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 4 ******/
    if (job_ptr->state_desc) {
        /* Replace white space with underscore for easier parsing */
        for (j = 0; job_ptr->state_desc[j]; j++) {
            if (isspace((int)job_ptr->state_desc[j]))
                job_ptr->state_desc[j] = '_';
        }
        tmp6_ptr = job_ptr->state_desc;
    } else
        tmp6_ptr = job_reason_string(job_ptr->state_reason);
    snprintf(tmp_line, sizeof(tmp_line),
             "JobState=%s Reason=%s Dependency=%s",
             job_state_string(job_ptr->job_state), tmp6_ptr,
             job_ptr->dependency);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 5 ******/
    snprintf(tmp_line, sizeof(tmp_line),
             "Requeue=%u Restarts=%u BatchFlag=%u ",
             job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag);
    xstrcat(out, tmp_line);
    if (WIFSIGNALED(job_ptr->exit_code))
        term_sig = WTERMSIG(job_ptr->exit_code);
    exit_status = WEXITSTATUS(job_ptr->exit_code);
    snprintf(tmp_line, sizeof(tmp_line), "ExitCode=%u:%u",
             exit_status, term_sig);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 5a (optional) ******/
    if (!(job_ptr->show_flags & SHOW_DETAIL))
        goto line6;
    if (WIFSIGNALED(job_ptr->derived_ec))
        term_sig = WTERMSIG(job_ptr->derived_ec);
    else
        term_sig = 0;
    exit_status = WEXITSTATUS(job_ptr->derived_ec);
    snprintf(tmp_line, sizeof(tmp_line), "DerivedExitCode=%u:%u",
             exit_status, term_sig);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 6 ******/
line6:
    snprintf(tmp_line, sizeof(tmp_line), "RunTime=");
    xstrcat(out, tmp_line);
    if (IS_JOB_PENDING(job_ptr))
        run_time = 0;
    else if (IS_JOB_SUSPENDED(job_ptr))
        run_time = job_ptr->pre_sus_time;
    else {
        time_t end_time;
        if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
            end_time = time(NULL);
        else
            end_time = job_ptr->end_time;
        if (job_ptr->suspend_time) {
            run_time = (time_t)
                (difftime(end_time, job_ptr->suspend_time) +
                 job_ptr->pre_sus_time);
        } else
            run_time = (time_t)
                difftime(end_time, job_ptr->start_time);
    }
    secs2time_str(run_time, tmp1, sizeof(tmp1));
    sprintf(tmp_line, "%s ", tmp1);
    xstrcat(out, tmp_line);

    snprintf(tmp_line, sizeof(tmp_line), "TimeLimit=");
    xstrcat(out, tmp_line);
    if (job_ptr->time_limit == NO_VAL)
        sprintf(tmp_line, "Partition_Limit");
    else {
        mins2time_str(job_ptr->time_limit, tmp_line,
                      sizeof(tmp_line));
    }
    xstrcat(out, tmp_line);
    snprintf(tmp_line, sizeof(tmp_line), " TimeMin=");
    xstrcat(out, tmp_line);
    if (job_ptr->time_min == 0)
        sprintf(tmp_line, "N/A");
    else {
        mins2time_str(job_ptr->time_min, tmp_line,
                      sizeof(tmp_line));
    }
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 7 ******/
    slurm_make_time_str((time_t *)&job_ptr->submit_time, time_str,
                        sizeof(time_str));
    snprintf(tmp_line, sizeof(tmp_line), "SubmitTime=%s ", time_str);
    xstrcat(out, tmp_line);
    slurm_make_time_str((time_t *)&job_ptr->eligible_time, time_str,
                        sizeof(time_str));
    snprintf(tmp_line, sizeof(tmp_line), "EligibleTime=%s", time_str);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 8 (optional) ******/
    if (job_ptr->resize_time) {
        slurm_make_time_str((time_t *)&job_ptr->resize_time, time_str,
                            sizeof(time_str));
        snprintf(tmp_line, sizeof(tmp_line), "ResizeTime=%s", time_str);
        xstrcat(out, tmp_line);
        if (one_liner)
            xstrcat(out, " ");
        else
            xstrcat(out, "\n ");
    }

    /****** Line 9 ******/
    slurm_make_time_str((time_t *)&job_ptr->start_time, time_str,
                        sizeof(time_str));
    snprintf(tmp_line, sizeof(tmp_line), "StartTime=%s ", time_str);
    xstrcat(out, tmp_line);
    snprintf(tmp_line, sizeof(tmp_line), "EndTime=");
    xstrcat(out, tmp_line);
    if ((job_ptr->time_limit == INFINITE) &&
        (job_ptr->end_time > time(NULL)))
        sprintf(tmp_line, "Unknown");
    else {
        slurm_make_time_str((time_t *)&job_ptr->end_time, time_str,
                            sizeof(time_str));
        sprintf(tmp_line, "%s", time_str);
    }
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 10 ******/
    if (job_ptr->preempt_time == 0)
        sprintf(tmp_line, "PreemptTime=None ");
    else {
        slurm_make_time_str((time_t *)&job_ptr->preempt_time,
                            time_str, sizeof(time_str));
        snprintf(tmp_line, sizeof(tmp_line), "PreemptTime=%s ",
                 time_str);
    }
    xstrcat(out, tmp_line);
    if (job_ptr->suspend_time) {
        slurm_make_time_str((time_t *)&job_ptr->suspend_time,
                            time_str, sizeof(time_str));
    } else {
        strncpy(time_str, "None", sizeof(time_str));
    }
    snprintf(tmp_line, sizeof(tmp_line),
             "SuspendTime=%s SecsPreSuspend=%ld",
             time_str, (long int)job_ptr->pre_sus_time);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 11 ******/
    snprintf(tmp_line, sizeof(tmp_line),
             "Partition=%s AllocNode:Sid=%s:%u",
             job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 12 ******/
    snprintf(tmp_line, sizeof(tmp_line), "Req%s=%s Exc%s=%s",
             nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 13 ******/
    xstrfmtcat(out, "%s=", nodelist);
    xstrcat(out, job_ptr->nodes);
    if (job_ptr->nodes && ionodes) {
        snprintf(tmp_line, sizeof(tmp_line), "[%s]", ionodes);
        xstrcat(out, tmp_line);
        xfree(ionodes);
    }
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 14 (optional) ******/
    if (job_ptr->batch_host) {
        snprintf(tmp_line, sizeof(tmp_line), "BatchHost=%s",
                 job_ptr->batch_host);
        xstrcat(out, tmp_line);
        if (one_liner)
            xstrcat(out, " ");
        else
            xstrcat(out, "\n ");
    }

    /****** Line 15 ******/
    if (cluster_flags & CLUSTER_FLAG_BG) {
        select_g_select_jobinfo_get(job_ptr->select_jobinfo,
                                    SELECT_JOBDATA_NODE_CNT,
                                    &min_nodes);
        if ((min_nodes == 0) || (min_nodes == NO_VAL)) {
            min_nodes = job_ptr->num_nodes;
            max_nodes = job_ptr->max_nodes;
        } else if (job_ptr->max_nodes)
            max_nodes = min_nodes;
    } else {
        min_nodes = job_ptr->num_nodes;
        max_nodes = job_ptr->max_nodes;
    }

    _sprint_range(tmp1, sizeof(tmp1), job_ptr->num_cpus,
                  job_ptr->max_cpus);
    _sprint_range(tmp2, sizeof(tmp2), min_nodes, max_nodes);
    if (job_ptr->sockets_per_node == (uint16_t) NO_VAL)
        strcpy(tmp3, "*");
    else
        snprintf(tmp3, sizeof(tmp3), "%u", job_ptr->sockets_per_node);
    if (job_ptr->cores_per_socket == (uint16_t) NO_VAL)
        strcpy(tmp4, "*");
    else
        snprintf(tmp4, sizeof(tmp4), "%u", job_ptr->cores_per_socket);
    if (job_ptr->threads_per_core == (uint16_t) NO_VAL)
        strcpy(tmp5, "*");
    else
        snprintf(tmp5, sizeof(tmp5), "%u", job_ptr->threads_per_core);
    snprintf(tmp_line, sizeof(tmp_line),
             "NumNodes=%s NumCPUs=%s CPUs/Task=%u ReqS:C:T=%s:%s:%s",
             tmp2, tmp1, job_ptr->cpus_per_task, tmp3, tmp4, tmp5);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    if (!job_resrcs)
        goto line15;

    if (cluster_flags & CLUSTER_FLAG_BG) {
        if ((job_resrcs->cpu_array_cnt > 0) &&
            (job_resrcs->cpu_array_value) &&
            (job_resrcs->cpu_array_reps)) {
            int length = 0;
            xstrcat(out, "CPUs=");
            length += 10;
            for (i = 0; i < job_resrcs->cpu_array_cnt; i++) {
                if (length > 70) {
                    /* skip to last CPU group entry */
                    if (i < job_resrcs->cpu_array_cnt - 1)
                        continue;
                    /* add ellipsis before last entry */
                    xstrcat(out, "...,");
                    length += 4;
                }

                /* cpu_array_value[] (not cpus[]) holds the CPU
                 * count for this group of nodes */
                snprintf(tmp_line, sizeof(tmp_line), "%d",
                         job_resrcs->cpu_array_value[i]);
                xstrcat(out, tmp_line);
                length += strlen(tmp_line);

                if (job_resrcs->cpu_array_reps[i] > 1) {
                    snprintf(tmp_line, sizeof(tmp_line), "*%d",
                             job_resrcs->cpu_array_reps[i]);
                    xstrcat(out, tmp_line);
                    length += strlen(tmp_line);
                }
                if (i < job_resrcs->cpu_array_cnt - 1) {
                    xstrcat(out, ",");
                    length++;
                }
            }
            if (one_liner)
                xstrcat(out, " ");
            else
                xstrcat(out, "\n ");
        }
    } else {
        if (!job_resrcs->core_bitmap)
            goto line15;
        last = bit_fls(job_resrcs->core_bitmap);
        if (last == -1)
            goto line15;

        hl = hostlist_create(job_ptr->nodes);
        if (!hl) {
            error("slurm_sprint_job_info: hostlist_create: %s",
                  job_ptr->nodes);
            return NULL;
        }
        hl_last = hostlist_create(NULL);
        if (!hl_last) {
            error("slurm_sprint_job_info: hostlist_create: NULL");
            hostlist_destroy(hl);
            return NULL;
        }

        bit_inx = 0;
        i = sock_inx = sock_reps = 0;
        abs_node_inx = job_ptr->node_inx[i];

        /* tmp1[] stores the current cpu(s) allocated */
        tmp2[0] = '\0';    /* stores last cpu(s) allocated */
        for (rel_node_inx = 0; rel_node_inx < job_resrcs->nhosts;
             rel_node_inx++) {

            if (sock_reps >=
                job_resrcs->sock_core_rep_count[sock_inx]) {
                sock_inx++;
                sock_reps = 0;
            }
            sock_reps++;

            bit_reps = job_resrcs->sockets_per_node[sock_inx] *
                       job_resrcs->cores_per_socket[sock_inx];

            core_bitmap = bit_alloc(bit_reps);
            for (j = 0; j < bit_reps; j++) {
                if (bit_test(job_resrcs->core_bitmap, bit_inx))
                    bit_set(core_bitmap, j);
                bit_inx++;
            }

            bit_fmt(tmp1, sizeof(tmp1), core_bitmap);
            FREE_NULL_BITMAP(core_bitmap);
            host = hostlist_shift(hl);

            /*
             * If the allocation values for this host are not the same
             * as the last host, print the report of the last group of
             * hosts that had identical allocation values.
             */
            if (strcmp(tmp1, tmp2) ||
                (last_mem_alloc_ptr != job_resrcs->memory_allocated) ||
                (job_resrcs->memory_allocated &&
                 (last_mem_alloc !=
                  job_resrcs->memory_allocated[rel_node_inx]))) {
                if (hostlist_count(hl_last)) {
                    last_hosts =
                        hostlist_ranged_string_xmalloc(hl_last);
                    snprintf(tmp_line, sizeof(tmp_line),
                             " Nodes=%s CPU_IDs=%s Mem=%u",
                             last_hosts, tmp2,
                             last_mem_alloc_ptr ? last_mem_alloc : 0);
                    xfree(last_hosts);
                    xstrcat(out, tmp_line);
                    if (one_liner)
                        xstrcat(out, " ");
                    else
                        xstrcat(out, "\n ");

                    hostlist_destroy(hl_last);
                    hl_last = hostlist_create(NULL);
                }
                strcpy(tmp2, tmp1);
                last_mem_alloc_ptr = job_resrcs->memory_allocated;
                if (last_mem_alloc_ptr)
                    last_mem_alloc = job_resrcs->
                        memory_allocated[rel_node_inx];
                else
                    last_mem_alloc = NO_VAL;
            }
            hostlist_push_host(hl_last, host);
            free(host);

            if (bit_inx > last)
                break;

            if (abs_node_inx > job_ptr->node_inx[i+1]) {
                i += 2;
                abs_node_inx = job_ptr->node_inx[i];
            } else {
                abs_node_inx++;
            }
        }

        if (hostlist_count(hl_last)) {
            last_hosts = hostlist_ranged_string_xmalloc(hl_last);
            snprintf(tmp_line, sizeof(tmp_line),
                     " Nodes=%s CPU_IDs=%s Mem=%u",
                     last_hosts, tmp2,
                     last_mem_alloc_ptr ? last_mem_alloc : 0);
            xfree(last_hosts);
            xstrcat(out, tmp_line);
            if (one_liner)
                xstrcat(out, " ");
            else
                xstrcat(out, "\n ");
        }
        hostlist_destroy(hl);
        hostlist_destroy(hl_last);
    }

    /****** Line 15 ******/
line15:
    if (job_ptr->pn_min_memory & MEM_PER_CPU) {
        job_ptr->pn_min_memory &= (~MEM_PER_CPU);
        tmp6_ptr = "CPU";
    } else
        tmp6_ptr = "Node";

    if (cluster_flags & CLUSTER_FLAG_BG) {
        convert_num_unit((float)job_ptr->pn_min_cpus, tmp1,
                         sizeof(tmp1), UNIT_NONE);
        snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%s", tmp1);
    } else {
        snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%u",
                 job_ptr->pn_min_cpus);
    }
    xstrcat(out, tmp_line);

    convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1),
                     UNIT_MEGA);
    convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2),
                     UNIT_MEGA);
    snprintf(tmp_line, sizeof(tmp_line),
             " MinMemory%s=%s MinTmpDiskNode=%s",
             tmp6_ptr, tmp1, tmp2);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 16 ******/
    snprintf(tmp_line, sizeof(tmp_line),
             "Features=%s Gres=%s Reservation=%s",
             job_ptr->features, job_ptr->gres, job_ptr->resv_name);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 17 ******/
    snprintf(tmp_line, sizeof(tmp_line),
             "Shared=%s Contiguous=%d Licenses=%s Network=%s",
             (job_ptr->shared == 0 ? "0" :
              job_ptr->shared == 1 ? "1" : "OK"),
             job_ptr->contiguous, job_ptr->licenses, job_ptr->network);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 18 ******/
    snprintf(tmp_line, sizeof(tmp_line), "Command=%s", job_ptr->command);
    xstrcat(out, tmp_line);
    if (one_liner)
        xstrcat(out, " ");
    else
        xstrcat(out, "\n ");

    /****** Line 19 ******/
    snprintf(tmp_line, sizeof(tmp_line), "WorkDir=%s", job_ptr->work_dir);
    xstrcat(out, tmp_line);

    if (cluster_flags & CLUSTER_FLAG_BG) {
        /****** Line 20 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_BG_ID);
        if (select_buf[0] != '\0') {
            if (one_liner)
                xstrcat(out, " ");
            else
                xstrcat(out, "\n ");
            snprintf(tmp_line, sizeof(tmp_line),
                     "Block_ID=%s", select_buf);
            xstrcat(out, tmp_line);
        }

        /****** Line 21 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_MIXED_SHORT);
        if (select_buf[0] != '\0') {
            if (one_liner)
                xstrcat(out, " ");
            else
                xstrcat(out, "\n ");
            xstrcat(out, select_buf);
        }

        if (cluster_flags & CLUSTER_FLAG_BGL) {
            /****** Line 22 (optional) ******/
            select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                           select_buf,
                                           sizeof(select_buf),
                                           SELECT_PRINT_BLRTS_IMAGE);
            if (select_buf[0] != '\0') {
                if (one_liner)
                    xstrcat(out, " ");
                else
                    xstrcat(out, "\n ");
                snprintf(tmp_line, sizeof(tmp_line),
                         "BlrtsImage=%s", select_buf);
                xstrcat(out, tmp_line);
            }
        }

        /****** Line 23 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_LINUX_IMAGE);
        if (select_buf[0] != '\0') {
            if (one_liner)
                xstrcat(out, " ");
            else
                xstrcat(out, "\n ");
            if (cluster_flags & CLUSTER_FLAG_BGL)
                snprintf(tmp_line, sizeof(tmp_line),
                         "LinuxImage=%s", select_buf);
            else
                snprintf(tmp_line, sizeof(tmp_line),
                         "CnloadImage=%s", select_buf);
            xstrcat(out, tmp_line);
        }

        /****** Line 24 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_MLOADER_IMAGE);
        if (select_buf[0] != '\0') {
            if (one_liner)
                xstrcat(out, " ");
            else
                xstrcat(out, "\n ");
            snprintf(tmp_line, sizeof(tmp_line),
                     "MloaderImage=%s", select_buf);
            xstrcat(out, tmp_line);
        }

        /****** Line 25 (optional) ******/
        select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
                                       select_buf, sizeof(select_buf),
                                       SELECT_PRINT_RAMDISK_IMAGE);
        if (select_buf[0] != '\0') {
            if (one_liner)
                xstrcat(out, " ");
            else
                xstrcat(out, "\n ");
            if (cluster_flags & CLUSTER_FLAG_BGL)
                snprintf(tmp_line, sizeof(tmp_line),
                         "RamDiskImage=%s", select_buf);
            else
                snprintf(tmp_line, sizeof(tmp_line),
                         "IoloadImage=%s", select_buf);
            xstrcat(out, tmp_line);
        }
    }

    /****** Line 26 (optional) ******/
    if (job_ptr->comment) {
        if (one_liner)
            xstrcat(out, " ");
        else
            xstrcat(out, "\n ");
        snprintf(tmp_line, sizeof(tmp_line), "Comment=%s ",
                 job_ptr->comment);
        xstrcat(out, tmp_line);
    }

    /****** Line 27 (optional) ******/
    if (job_ptr->batch_script) {
        if (one_liner)
            xstrcat(out, " ");
        else
            xstrcat(out, "\n ");
        xstrcat(out, "BatchScript=\n");
        xstrcat(out, job_ptr->batch_script);
    }

    /****** Line 28 (optional) ******/
    if (job_ptr->req_switch) {
        char time_buf[32];
        if (one_liner)
            xstrcat(out, " ");
        else
            xstrcat(out, "\n ");
        secs2time_str((time_t) job_ptr->wait4switch, time_buf,
                      sizeof(time_buf));
        snprintf(tmp_line, sizeof(tmp_line), "Switches=%u@%s\n",
                 job_ptr->req_switch, time_buf);
        xstrcat(out, tmp_line);
    }

    /****** Line 29 (optional) ******/
    if (one_liner)
        xstrcat(out, "\n");
    else
        xstrcat(out, "\n\n");

    return out;
}
int main (int argc, char *argv[])
{
    char *server = NULL;
    int msize = 65536;
    uid_t uid = geteuid ();
    int topt = 0;
    Npcfsys *fs = NULL;
    Npcfid *fid, *afid, *root;
    int c, fd;
    char buf[80], *host, *p;
    hostlist_t hl;
    hostlist_iterator_t itr;
    int lopt = 0;

    diod_log_init (argv[0]);

    opterr = 0;
    while ((c = GETOPT (argc, argv, OPTIONS, longopts)) != -1) {
        switch (c) {
            case 's':   /* --server HOST[:PORT] or /path/to/socket */
                server = optarg;
                break;
            case 'm':   /* --msize SIZE */
                msize = strtoul (optarg, NULL, 10);
                break;
            case 'u':   /* --uid UID */
                uid = strtoul (optarg, NULL, 10);
                break;
            case 't':   /* --timeout SECS */
                topt = strtoul (optarg, NULL, 10);
                break;
            case 'l':   /* --long */
                lopt = 1;
                break;
            default:
                usage ();
        }
    }

    if (signal (SIGPIPE, SIG_IGN) == SIG_ERR)
        err_exit ("signal");
    if (signal (SIGALRM, sigalarm) == SIG_ERR)
        err_exit ("signal");

    if (topt > 0)
        alarm (topt);

    if ((fd = diod_sock_connect (server, 0)) < 0)
        exit (1);

    if (!(fs = npc_start (fd, fd, msize, 0)))
        errn_exit (np_rerror (), "error negotiating protocol with server");
    if (!(afid = npc_auth (fs, "ctl", uid, diod_auth)) && np_rerror () != 0)
        errn_exit (np_rerror (), "error authenticating to server");
    if (!(root = npc_attach (fs, afid, "ctl", uid)))
        errn_exit (np_rerror (), "error attaching to aname=ctl");
    if (!(fid = npc_open_bypath (root, "connections", O_RDONLY)))
        errn_exit (np_rerror (), "open connections");

    if (!(hl = hostlist_create (NULL)))
        err_exit ("hostlist_create");
    while (npc_gets (fid, buf, sizeof (buf))) {
        if ((p = strchr (buf, ' ')))
            *p = '\0';
        if (!lopt && (p = strchr (buf, '.')))
            *p = '\0';
        if (!hostlist_push_host (hl, buf))
            err_exit ("hostlist_push_host");
    }
    hostlist_uniq (hl);

    if (lopt) {
        if (!(itr = hostlist_iterator_create (hl)))
            err_exit ("hostlist_iterator_create");
        while ((host = hostlist_next (itr)))
            printf ("%s\n", host);
        hostlist_iterator_destroy (itr);
    } else {
        char s[1024];
        if (hostlist_ranged_string (hl, sizeof (s), s) < 0)
            msg_exit ("hostlist output would be too long (use -l)");
        printf ("%s\n", s);
    }
    hostlist_destroy (hl);

    if (npc_clunk (fid) < 0)
        errn_exit (np_rerror (), "clunk connections");
    if (npc_clunk (root) < 0)
        errn_exit (np_rerror (), "error clunking ctl");
    if (npc_clunk (afid) < 0)
        errn_exit (np_rerror (), "error clunking afid");
    npc_finish (fs);

    exit (0);
}
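/*
 * Illustrative sketch (not part of the original sources): the tool above
 * collapses connection names with hostlist_uniq() and prints one ranged
 * string, telling the user to rerun with -l if that string would overflow
 * the fixed buffer. A variant that falls back to one-host-per-line instead
 * of exiting, using only calls already present above:
 */
static void print_hosts (hostlist_t hl, int lopt)
{
    char s[1024];
    char *host;
    hostlist_iterator_t itr;

    hostlist_uniq (hl);     /* drop duplicate entries */
    if (!lopt && hostlist_ranged_string (hl, sizeof (s), s) >= 0) {
        printf ("%s\n", s);
        return;
    }
    /* -l given, or ranged form too long: print one host per line */
    if (!(itr = hostlist_iterator_create (hl)))
        err_exit ("hostlist_iterator_create");
    while ((host = hostlist_next (itr)))
        printf ("%s\n", host);
    hostlist_iterator_destroy (itr);
}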
int _do_stat(uint32_t jobid, uint32_t stepid, char *nodelist,
             uint32_t req_cpufreq_min, uint32_t req_cpufreq_max,
             uint32_t req_cpufreq_gov)
{
    job_step_stat_response_msg_t *step_stat_response = NULL;
    int rc = SLURM_SUCCESS;
    ListIterator itr;
    slurmdb_stats_t temp_stats;
    job_step_stat_t *step_stat = NULL;
    int ntasks = 0;
    int tot_tasks = 0;
    hostlist_t hl = NULL;

    debug("requesting info for job %u.%u", jobid, stepid);
    if ((rc = slurm_job_step_stat(jobid, stepid, nodelist,
                                  &step_stat_response)) != SLURM_SUCCESS) {
        if (rc == ESLURM_INVALID_JOB_ID) {
            debug("job step %u.%u has already completed",
                  jobid, stepid);
        } else {
            error("problem getting step_layout for %u.%u: %s",
                  jobid, stepid, slurm_strerror(rc));
        }
        return rc;
    }

    memset(&job, 0, sizeof(slurmdb_job_rec_t));
    job.jobid = jobid;
    memset(&step, 0, sizeof(slurmdb_step_rec_t));
    memset(&temp_stats, 0, sizeof(slurmdb_stats_t));
    temp_stats.cpu_min = NO_VAL;
    memset(&step.stats, 0, sizeof(slurmdb_stats_t));
    step.stats.cpu_min = NO_VAL;

    step.job_ptr = &job;
    step.stepid = stepid;
    step.nodes = xmalloc(BUF_SIZE);
    step.req_cpufreq_min = req_cpufreq_min;
    step.req_cpufreq_max = req_cpufreq_max;
    step.req_cpufreq_gov = req_cpufreq_gov;
    step.stepname = NULL;
    step.state = JOB_RUNNING;

    hl = hostlist_create(NULL);
    itr = list_iterator_create(step_stat_response->stats_list);
    while ((step_stat = list_next(itr))) {
        if (!step_stat->step_pids || !step_stat->step_pids->node_name)
            continue;
        if (step_stat->step_pids->pid_cnt > 0) {
            int i;
            for (i = 0; i < step_stat->step_pids->pid_cnt; i++) {
                if (step.pid_str)
                    xstrcat(step.pid_str, ",");
                xstrfmtcat(step.pid_str, "%u",
                           step_stat->step_pids->pid[i]);
            }
        }

        if (params.pid_format) {
            step.nodes = step_stat->step_pids->node_name;
            print_fields(&step);
            xfree(step.pid_str);
        } else {
            /* collect node names and accumulate per-node stats */
            hostlist_push_host(hl, step_stat->step_pids->node_name);
            ntasks += step_stat->num_tasks;
            if (step_stat->jobacct) {
                jobacctinfo_2_stats(&temp_stats, step_stat->jobacct);
                aggregate_stats(&step.stats, &temp_stats);
            }
        }
    }
    list_iterator_destroy(itr);
    slurm_job_step_pids_response_msg_free(step_stat_response);

    /* we printed it out already */
    if (params.pid_format)
        return rc;

    hostlist_sort(hl);
    hostlist_ranged_string(hl, BUF_SIZE, step.nodes);
    hostlist_destroy(hl);
    tot_tasks += ntasks;

    if (tot_tasks) {
        /* the *_ave fields hold accumulated totals until
         * normalized to per-task averages here */
        step.stats.cpu_ave /= (double)tot_tasks;
        step.stats.rss_ave /= (double)tot_tasks;
        step.stats.vsize_ave /= (double)tot_tasks;
        step.stats.pages_ave /= (double)tot_tasks;
        step.stats.disk_read_ave /= (double)tot_tasks;
        step.stats.disk_write_ave /= (double)tot_tasks;
        step.stats.act_cpufreq /= (double)tot_tasks;
        step.ntasks = tot_tasks;
    }

    print_fields(&step);

    return rc;
}
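/*
 * Illustrative sketch (not part of the original sources): aggregate_stats()
 * above sums task counts and the *_ave fields across nodes, and _do_stat()
 * then divides by the total task count. The same aggregate-then-normalize
 * step in isolation, using a hypothetical two-field stats struct:
 */
#include <stdio.h>

struct mini_stats {
    double cpu_ave;     /* accumulated as a total, then averaged */
    double rss_ave;
};

static void normalize(struct mini_stats *s, int tot_tasks)
{
    if (tot_tasks <= 0)
        return;         /* nothing ran; leave totals untouched */
    s->cpu_ave /= (double) tot_tasks;
    s->rss_ave /= (double) tot_tasks;
}

int main(void)
{
    /* totals summed across 4 tasks */
    struct mini_stats s = { .cpu_ave = 8.0, .rss_ave = 4096.0 };

    normalize(&s, 4);
    printf("cpu_ave=%.1f rss_ave=%.1f\n", s.cpu_ave, s.rss_ave);
    /* prints: cpu_ave=2.0 rss_ave=1024.0 */
    return 0;
}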