/* Send a signal RPC to a list of nodes */ static void _send_sig(uint32_t job_id, uint32_t step_id, uint16_t signal, char *nodelist) { agent_arg_t *agent_args; signal_tasks_msg_t *signal_tasks_msg; hostlist_iterator_t hi; char *host; struct node_record *node_ptr; signal_tasks_msg = xmalloc(sizeof(signal_tasks_msg_t)); signal_tasks_msg->job_id = job_id; signal_tasks_msg->job_step_id = step_id; signal_tasks_msg->signal = signal; agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_SIGNAL_TASKS; agent_args->retry = 1; agent_args->msg_args = signal_tasks_msg; agent_args->hostlist = hostlist_create(nodelist); agent_args->node_count = hostlist_count(agent_args->hostlist); agent_args->protocol_version = SLURM_PROTOCOL_VERSION; hi = hostlist_iterator_create(agent_args->hostlist); while ((host = hostlist_next(hi))) { if ((node_ptr = find_node_record(host)) && (agent_args->protocol_version > node_ptr->protocol_version)) agent_args->protocol_version = node_ptr->protocol_version; free(host); } hostlist_iterator_destroy(hi); agent_queue_request(agent_args); }
/*
 * Send a signal RPC to a specific node.
 * IN job_id    - slurm job id owning the tasks
 * IN step_id   - step id within the job
 * IN signal    - signal number to deliver
 * IN node_name - name of the target node
 * IN node_addr - address of the target node (unused here; the agent
 *                resolves the address from the hostlist)
 *
 * Fixes vs. previous revision: node_ptr was used without being
 * declared, a hostlist_iterator_destroy() was called on an iterator
 * that was never created, and protocol_version had no default when
 * the node record could not be found.
 */
static void _send_sig(uint32_t job_id, uint32_t step_id, uint16_t signal,
		      char *node_name, slurm_addr_t node_addr)
{
	agent_arg_t *agent_args;
	kill_tasks_msg_t *kill_tasks_msg;
	struct node_record *node_ptr;	/* was used but never declared */

	/* Build the signal payload */
	kill_tasks_msg = xmalloc(sizeof(kill_tasks_msg_t));
	kill_tasks_msg->job_id = job_id;
	kill_tasks_msg->job_step_id = step_id;
	kill_tasks_msg->signal = signal;

	/* Build the agent request for the single target node */
	agent_args = xmalloc(sizeof(agent_arg_t));
	agent_args->msg_type = REQUEST_SIGNAL_TASKS;
	agent_args->retry = 1;
	agent_args->msg_args = kill_tasks_msg;
	agent_args->hostlist = hostlist_create(node_name);
	agent_args->node_count = 1;

	/* Default to our own protocol version, then downgrade to the
	 * node's version if its record is known */
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	if ((node_ptr = find_node_record(node_name)))
		agent_args->protocol_version = node_ptr->protocol_version;

	agent_queue_request(agent_args);
}
/** Convert a Cray/Basil numeric node id to the slurm nodename form
 *  ("nid" + zero-padded 5-digit id) and look up its node record.
 *  Returns NULL if no such node is configured. */
static struct node_record *_find_node_by_basil_id(uint32_t node_id)
{
	char nid_name[9];	/* "nid" + 5 digits + '\0' = 9 bytes */

	snprintf(nid_name, sizeof(nid_name), "nid%05u", node_id);

	return find_node_record(nid_name);
}
/*
 * srun_node_fail - notify the srun command(s) associated with a job
 *	that one of the job's nodes has failed
 * IN job_ptr - job to notify
 * IN node_name - name of failed node
 *
 * Sends an SRUN_NODE_FAIL message to every active (non-batch) step
 * that has a task on the failed node, and one job-level message to
 * the allocating srun if it registered a response port.
 */
extern void srun_node_fail(struct job_record *job_ptr, char *node_name)
{
#ifndef HAVE_FRONT_END
	struct node_record *node_ptr;
#endif
	int bit_position = -1;
	slurm_addr_t * addr;
	srun_node_fail_msg_t *msg_arg;
	ListIterator step_iterator;
	struct step_record *step_ptr;

	xassert(job_ptr);
	xassert(node_name);
	/* Only running jobs have sruns worth notifying */
	if (!job_ptr || !IS_JOB_RUNNING(job_ptr))
		return;
#ifdef HAVE_FRONT_END
	/* Purge all jobs steps in front end mode: bit_position stays -1,
	 * so the per-node bitmap test below is skipped and every step
	 * of the job is notified */
#else
	if (!node_name || (node_ptr = find_node_record(node_name)) == NULL)
		return;
	/* Index of the failed node in the global node table; used to
	 * test each step's node bitmap */
	bit_position = node_ptr - node_record_table_ptr;
#endif
	step_iterator = list_iterator_create(job_ptr->step_list);
	while ((step_ptr = (struct step_record *) list_next(step_iterator))) {
		if (step_ptr->step_node_bitmap == NULL)	/* pending step */
			continue;
		if ((bit_position >= 0) &&
		    (!bit_test(step_ptr->step_node_bitmap, bit_position)))
			continue;	/* job step not on this node */
		/* Skip steps with no srun listening (no port/host) and
		 * batch steps (no srun at all) */
		if ( (step_ptr->port == 0) || (step_ptr->host == NULL) ||
		     (step_ptr->batch_step) || (step_ptr->host[0] == '\0') )
			continue;
		/* NOTE(review): allocates sizeof(struct sockaddr_in)
		 * rather than sizeof(slurm_addr_t) — confirm the two
		 * are the same size on all supported builds */
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, step_ptr->port, step_ptr->host);
		msg_arg = xmalloc(sizeof(srun_node_fail_msg_t));
		msg_arg->job_id = job_ptr->job_id;
		msg_arg->step_id = step_ptr->step_id;
		msg_arg->nodelist = xstrdup(node_name);
		/* _srun_agent_launch takes ownership of addr/msg_arg */
		_srun_agent_launch(addr, step_ptr->host, SRUN_NODE_FAIL,
				   msg_arg, step_ptr->start_protocol_ver);
	}
	list_iterator_destroy(step_iterator);

	/* Also notify the allocating srun (job-level, step_id = NO_VAL)
	 * if it registered a response host/port */
	if (job_ptr->other_port && job_ptr->alloc_node && job_ptr->resp_host) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_node_fail_msg_t));
		msg_arg->job_id = job_ptr->job_id;
		msg_arg->step_id = NO_VAL;
		msg_arg->nodelist = xstrdup(node_name);
		_srun_agent_launch(addr, job_ptr->alloc_node, SRUN_NODE_FAIL,
				   msg_arg, job_ptr->start_protocol_ver);
	}
}
/* * topo_get_node_addr - build node address and the associated pattern * based on the topology information * * example of output : * address : s0.s4.s8.tux1 * pattern : switch.switch.switch.node */ extern int topo_get_node_addr(char* node_name, char** paddr, char** ppattern) { #ifndef HAVE_FRONT_END if (find_node_record(node_name) == NULL) return SLURM_ERROR; #endif *paddr = xstrdup(node_name); *ppattern = xstrdup("node"); return SLURM_SUCCESS; }
/*
 * up_nodecard - resume any blocks on midplane mp_name that are in an
 *	error state and whose ionodes overlap ionode_bitmap.
 * IN mp_name - midplane (node) name
 * IN ionode_bitmap - ionodes to bring back into service
 * RET SLURM_SUCCESS, or EINVAL if mp_name is not a known node
 */
extern int up_nodecard(char *mp_name, bitstr_t *ionode_bitmap)
{
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	int ret = 0;

	xassert(mp_name);
	xassert(ionode_bitmap);

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		/* NOTE(review): message names "down_sub_node_blocks",
		 * not this function — looks like a copy/paste; confirm
		 * before changing the log text */
		error ("down_sub_node_blocks: invalid node specified %s",
		       mp_name);
		return EINVAL;
	}
	/* Index of this midplane in the global node table */
	mp_bit = (node_ptr - node_record_table_ptr);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		/* Only blocks currently marked in an error state are
		 * candidates for resumption */
		if (bg_record->job_running != BLOCK_ERROR_STATE)
			continue;
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		/* Skip blocks whose ionodes are disjoint from the set
		 * being brought back up */
		if (!bit_overlap(bg_record->ionode_bitmap, ionode_bitmap)) {
			continue;
		}
		resume_block(bg_record);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	/* FIX ME: This needs to call the opposite of
	   slurm_drain_nodes which does not yet exist.
	*/
	if ((ret = node_already_down(mp_name))) {
		/* means it was drained */
		if (ret == 2) {
			/* debug("node %s put back into service after " */
			/* "being in an error state", */
			/* mp_name); */
		}
	}

	return SLURM_SUCCESS;
}
/* Determine if a specific slurm node is already in DOWN or DRAIN state.
 * RET 2 if the node is draining/drained, 1 if it is down,
 *     0 otherwise (including unknown node names). */
extern int node_already_down(char *node_name)
{
	struct node_record *node_ptr = find_node_record(node_name);

	if (!node_ptr)
		return 0;
	/* DRAIN takes precedence over DOWN in the reported code */
	if (IS_NODE_DRAIN(node_ptr))
		return 2;
	return IS_NODE_DOWN(node_ptr) ? 1 : 0;
}
/*
 * _node_name2bitmap - given a node name regular expression, build a bitmap
 *	representation, any invalid hostnames are added to a hostlist
 * IN node_names  - set of node names
 * OUT bitmap     - set to bitmap, may not have all bits set on error
 * IN/OUT invalid_hostlist - hostlist of invalid host names,
 *	initialize to NULL
 * RET 0 if no error, otherwise EINVAL
 * NOTE: call FREE_NULL_BITMAP(bitmap) and hostlist_destroy(invalid_hostlist)
 *       to free memory when variables are no longer required
 */
static int _node_name2bitmap(char *node_names, bitstr_t **bitmap,
			     hostlist_t *invalid_hostlist)
{
	char *name;
	hostlist_t hl;

	/* The caller always gets a bitmap, even on early error return */
	*bitmap = (bitstr_t *) bit_alloc(node_record_count);

	if (node_names == NULL) {
		error("_node_name2bitmap: node_names is NULL");
		return EINVAL;
	}

	hl = hostlist_create(node_names);
	if (hl == NULL) {
		/* likely a badly formatted hostlist */
		error("_node_name2bitmap: hostlist_create(%s) error",
		      node_names);
		return EINVAL;
	}

	while ((name = hostlist_shift(hl))) {
		struct node_record *node_ptr = find_node_record(name);

		if (node_ptr == NULL) {
			/* Unknown name: collect it rather than failing */
			debug2("_node_name2bitmap: invalid node specified %s",
			       name);
			if (*invalid_hostlist)
				hostlist_push_host(*invalid_hostlist, name);
			else
				*invalid_hostlist = hostlist_create(name);
		} else {
			bit_set(*bitmap,
				(bitoff_t) (node_ptr -
					    node_record_table_ptr));
		}
		free(name);
	}
	hostlist_destroy(hl);

	return SLURM_SUCCESS;
}
/*
 * node_name2bitmap - given a node name regular expression, build a bitmap
 *	representation
 * IN node_names  - list of nodes
 * IN best_effort - if set don't return an error on invalid node name entries
 * OUT bitmap     - set to bitmap, may not have all bits set on error
 * RET 0 if no error, otherwise EINVAL
 * NOTE: call FREE_NULL_BITMAP() to free bitmap memory when no longer required
 */
extern int node_name2bitmap (char *node_names, bool best_effort,
			     bitstr_t **bitmap)
{
	int rc = SLURM_SUCCESS;
	char *name;
	hostlist_t hl;
	bitstr_t *map = (bitstr_t *) bit_alloc (node_record_count);

	if (map == NULL)
		fatal("bit_alloc malloc failure");
	/* The caller always gets a bitmap, even on early return */
	*bitmap = map;

	if (node_names == NULL) {
		info("node_name2bitmap: node_names is NULL");
		return rc;
	}

	hl = hostlist_create (node_names);
	if (hl == NULL) {
		/* likely a badly formatted hostlist */
		error ("hostlist_create on %s error:", node_names);
		return best_effort ? rc : EINVAL;
	}

	while ((name = hostlist_shift (hl))) {
		struct node_record *node_ptr = find_node_record (name);

		if (node_ptr == NULL) {
			error ("node_name2bitmap: invalid node specified %s",
			       name);
			/* best_effort keeps going without reporting error */
			if (!best_effort)
				rc = EINVAL;
		} else {
			bit_set (map,
				 (bitoff_t) (node_ptr -
					     node_record_table_ptr));
		}
		free (name);
	}
	hostlist_destroy (hl);

	return rc;
}
/* Tries to find a node fast using the hash table.
 *
 * Looks the name up twice: first hashed by the given node_name, then
 * (if NodeName and NodeHostname differ) hashed by the node's hostname
 * from the global node table.
 *
 * Used by: slurmctld
 */
static sw_gen_node_info_t *
_find_node(char *node_name)
{
	int i;
	sw_gen_node_info_t *n;
	struct node_record *node_ptr;

	if (node_name == NULL) {
		error("%s: _find_node node name is NULL", plugin_type);
		return NULL;
	}
	if (libstate->node_count == 0)
		return NULL;
	xassert(libstate->magic == SW_GEN_LIBSTATE_MAGIC);
	if (libstate->hash_table) {
		/* First probe: bucket chosen by node_name itself */
		i = _hash_index(node_name);
		n = libstate->hash_table[i];
		while (n) {
			xassert(n->magic == SW_GEN_NODE_INFO_MAGIC);
			if (!strcmp(n->node_name, node_name))
				return n;
			n = n->next;
		}
	}

	/* This code is only needed if NodeName and NodeHostName differ */
	node_ptr = find_node_record(node_name);
	if (node_ptr && libstate->hash_table) {
		/* Second probe: bucket chosen by the node's hostname.
		 * NOTE(review): the comparison is still against
		 * n->node_name/node_name, so this only finds entries
		 * that were inserted into the hostname-hashed bucket
		 * under their NodeName — confirm against the insert
		 * path that this is the intended pairing. */
		i = _hash_index(node_ptr->node_hostname);
		n = libstate->hash_table[i];
		while (n) {
			xassert(n->magic == SW_GEN_NODE_INFO_MAGIC);
			if (!strcmp(n->node_name, node_name))
				return n;
			n = n->next;
		}
	}

	return NULL;
}
/*
 * down_nodecard - take a nodecard (sub-midplane unit) out of service.
 *
 * Fails any job running on an overlapping block, then either marks the
 * smallest overlapping block in an error state or (if no small-enough
 * block exists / non-dynamic layout) drains the whole midplane.  In
 * dynamic mode it may also create new small blocks to fill in around
 * the bad nodecard.
 *
 * IN mp_name  - midplane (node) name containing the bad nodecard
 * IN io_start - first ionode of the bad nodecard
 * IN slurmctld_locked - if true, the caller already holds the slurmctld
 *	locks, so call the internal job_fail()/drain_nodes() directly
 *	instead of the locking slurm_* wrappers.
 *
 * This could potentially lock the node lock in the slurmctld with
 * slurm_drain_node, or slurm_fail_job so if slurmctld_locked is called we
 * will call the functions without locking the locks again.
 */
extern int down_nodecard(char *mp_name, bitoff_t io_start,
			 bool slurmctld_locked)
{
	List requests = NULL;
	List delete_list = NULL;
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record;
	bg_record_t *smallest_bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	/* io_cnt/create_size are computed once and cached across calls */
	static int io_cnt = NO_VAL;
	static int create_size = NO_VAL;
	static select_ba_request_t blockreq;
	int rc = SLURM_SUCCESS;
	char *reason = "select_bluegene: nodecard down";

	xassert(mp_name);

	if (io_cnt == NO_VAL) {
		io_cnt = 1;
		/* Translate 1 nodecard count to ionode count */
		if ((io_cnt *= bg_conf->io_ratio))
			io_cnt--;

		/* make sure we create something that is able to be created */
		if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt)
			create_size = bg_conf->nodecard_cnode_cnt;
		else
			create_size = bg_conf->smallest_block;
	}

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		/* NOTE(review): message names "down_sub_node_blocks",
		 * not this function */
		error ("down_sub_node_blocks: invalid node specified '%s'",
		       mp_name);
		return EINVAL;
	}

	/* this is here for sanity check to make sure we don't core on
	   these bits when we set them below. */
	if (io_start >= bg_conf->ionodes_per_mp
	    || (io_start+io_cnt) >= bg_conf->ionodes_per_mp) {
		debug("io %d-%d not configured on this "
		      "system, only %d ionodes per midplane",
		      io_start, io_start+io_cnt, bg_conf->ionodes_per_mp);
		return EINVAL;
	}
	mp_bit = (node_ptr - node_record_table_ptr);

	memset(&blockreq, 0, sizeof(select_ba_request_t));

	blockreq.conn_type[0] = SELECT_SMALL;
	blockreq.save_name = mp_name;

	debug3("here setting node %d of %d and ionodes %d-%d of %d",
	       mp_bit, node_record_count, io_start,
	       io_start+io_cnt, bg_conf->ionodes_per_mp);

	/* tmp_record models the bad nodecard as a one-midplane block so
	 * we can use blocks_overlap() against existing blocks */
	memset(&tmp_record, 0, sizeof(bg_record_t));
	tmp_record.mp_count = 1;
	tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt;
	tmp_record.mp_bitmap = bit_alloc(node_record_count);
	bit_set(tmp_record.mp_bitmap, mp_bit);

	tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
	bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		if (!blocks_overlap(bg_record, &tmp_record))
			continue;

		/* Kick any job running on an overlapping block */
		if (bg_record->job_running > NO_JOB_RUNNING) {
			if (slurmctld_locked)
				job_fail(bg_record->job_running);
			else
				slurm_fail_job(bg_record->job_running);
		}
		/* If Running Dynamic mode and the block is
		   smaller than the create size just continue on. */
		if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
		    && (bg_record->cnode_cnt < create_size)) {
			if (!delete_list)
				delete_list = list_create(NULL);
			list_append(delete_list, bg_record);
			continue;
		}

		/* keep track of the smallest size that is at least
		   the size of create_size. */
		if (!smallest_bg_record ||
		    (smallest_bg_record->cnode_cnt > bg_record->cnode_cnt))
			smallest_bg_record = bg_record;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
		debug3("running non-dynamic mode");
		/* This should never happen, but just in case... */
		if (delete_list)
			list_destroy(delete_list);

		/* If we found a block that is smaller or equal to a
		   midplane we will just mark it in an error state as
		   opposed to draining the node. */
		if (smallest_bg_record
		    && (smallest_bg_record->cnode_cnt <
			bg_conf->mp_cnode_cnt)){
			if (smallest_bg_record->state
			    & BG_BLOCK_ERROR_FLAG) {
				/* already in error state, nothing to do */
				rc = SLURM_NO_CHANGE_IN_DATA;
				goto cleanup;
			}

			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		debug("No block under 1 midplane available for this "
		      "nodecard.  Draining the whole node.");
		if (!node_already_down(mp_name)) {
			if (slurmctld_locked)
				drain_nodes(mp_name, reason,
					    slurm_get_slurm_user_id());
			else
				slurm_drain_nodes(mp_name, reason,
						  slurm_get_slurm_user_id());
		}
		rc = SLURM_SUCCESS;
		goto cleanup;
	}

	/* below is only for Dynamic mode */

	if (delete_list) {
		/* Combine the too-small blocks' ionodes into one bitmap,
		 * then size the replacement request from it */
		int cnt_set = 0;
		bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp);
		/* don't lock here since it is handled inside
		   the put_block_in_error_state */
		itr = list_iterator_create(delete_list);
		while ((bg_record = list_next(itr))) {
			debug2("combining smaller than nodecard "
			       "dynamic block %s",
			       bg_record->bg_block_id);
			/* wait out any job still finishing on the block */
			while (bg_record->job_running > NO_JOB_RUNNING)
				sleep(1);

			bit_or(iobitmap, bg_record->ionode_bitmap);
			cnt_set++;
		}
		list_iterator_destroy(itr);
		list_destroy(delete_list);
		if (!cnt_set) {
			FREE_NULL_BITMAP(iobitmap);
			rc = SLURM_ERROR;
			goto cleanup;
		}
		/* set the start to be the same as the start of the
		   ionode_bitmap.  If no ionodes set (not a small
		   block) set io_start = 0. */
		if ((io_start = bit_ffs(iobitmap)) == -1) {
			io_start = 0;
			if (create_size > bg_conf->nodecard_cnode_cnt)
				blockreq.small128 = 4;
			else
				blockreq.small32 = 16;
		} else if (create_size <= bg_conf->nodecard_cnode_cnt)
			blockreq.small32 = 1;
		else
			/* this should never happen */
			blockreq.small128 = 1;

		FREE_NULL_BITMAP(iobitmap);
	} else if (smallest_bg_record) {
		debug2("smallest dynamic block is %s",
		       smallest_bg_record->bg_block_id);
		if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
			rc = SLURM_NO_CHANGE_IN_DATA;
			goto cleanup;
		}

		/* wait out any job still finishing on the block */
		while (smallest_bg_record->job_running > NO_JOB_RUNNING)
			sleep(1);

		if (smallest_bg_record->cnode_cnt == create_size) {
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		if (create_size > smallest_bg_record->cnode_cnt) {
			/* we should never get here.  This means we
			 * have a create_size that is bigger than a
			 * block that is already made.
			 */
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}
		debug3("node count is %d", smallest_bg_record->cnode_cnt);
		/* choose how many 32-cnode sub-blocks to request based
		 * on the size of the block being split */
		switch(smallest_bg_record->cnode_cnt) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small32 = 2;
			break;
		case 256:
			blockreq.small32 = 8;
			break;
#endif
		case 128:
			blockreq.small32 = 4;
			break;
		case 512:
		default:
			blockreq.small32 = 16;
			break;
		}

		if (create_size != bg_conf->nodecard_cnode_cnt) {
			blockreq.small128 = blockreq.small32 / 4;
			blockreq.small32 = 0;
			io_start = 0;
		} else if ((io_start =
			    bit_ffs(smallest_bg_record->ionode_bitmap))
			   == -1)
			/* set the start to be the same as the start of the
			   ionode_bitmap.  If no ionodes set (not a small
			   block) set io_start = 0. */
			io_start = 0;
	} else {
		switch(create_size) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small64 = 8;
			break;
		case 256:
			blockreq.small256 = 2;
			/* NOTE(review): no break here — falls through
			 * into case 32; confirm this is intentional */
#endif
		case 32:
			blockreq.small32 = 16;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
		case 512:
			/* whole-midplane create size: just drain the node */
			if (!node_already_down(mp_name)) {
				char *reason =
					"select_bluegene: nodecard down";
				if (slurmctld_locked)
					drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
				else
					slurm_drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
			}
			rc = SLURM_SUCCESS;
			goto cleanup;
			break;
		default:
			error("Unknown create size of %d", create_size);
			break;
		}
		/* since we don't have a block in this midplane
		   we need to start at the beginning. */
		io_start = 0;
		/* we also need a bg_block to pretend to be the
		   smallest block that takes up the entire midplane. */
	}

	/* Here we need to add blocks that take up nodecards on this
	   midplane.  Since Slurm only keeps track of midplanes
	   natively this is the only want to handle this case.
	*/
	requests = list_create(destroy_bg_record);
	add_bg_record(requests, NULL, &blockreq, 1, io_start);

	slurm_mutex_lock(&block_state_mutex);
	delete_list = list_create(NULL);
	while ((bg_record = list_pop(requests))) {
		/* Move any existing block that overlaps the new one
		 * onto the delete list */
		itr = list_iterator_create(bg_lists->main);
		while ((found_record = list_next(itr))) {
			if (!blocks_overlap(bg_record, found_record))
				continue;
			list_push(delete_list, found_record);
			list_remove(itr);
		}
		list_iterator_destroy(itr);

		/* we need to add this record since it doesn't exist */
		if (bridge_block_create(bg_record) == SLURM_ERROR) {
			destroy_bg_record(bg_record);
			error("down_sub_node_blocks: "
			      "unable to configure block in api");
			continue;
		}

		debug("adding block %s to fill in small blocks "
		      "around bad nodecards",
		      bg_record->bg_block_id);
		print_bg_record(bg_record);
		list_append(bg_lists->main, bg_record);
		if (bit_overlap(bg_record->ionode_bitmap,
				tmp_record.ionode_bitmap)) {
			/* here we know the error block doesn't exist
			   so just set the state here */
			slurm_mutex_unlock(&block_state_mutex);
			rc = put_block_in_error_state(bg_record, reason);
			slurm_mutex_lock(&block_state_mutex);
		}
	}
	list_destroy(requests);

	if (delete_list) {
		/* free_block_list takes its own locks */
		slurm_mutex_unlock(&block_state_mutex);
		free_block_list(NO_VAL, delete_list, 0, 0);
		list_destroy(delete_list);
	}
	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);
	last_bg_update = time(NULL);

cleanup:
	FREE_NULL_BITMAP(tmp_record.mp_bitmap);
	FREE_NULL_BITMAP(tmp_record.ionode_bitmap);

	return rc;
}
/*
 * topo_get_node_addr - build node address and the associated pattern
 *      based on the topology information
 *
 * Walks the switch hierarchy from the highest level down, appending the
 * name(s) of every switch whose node bitmap contains this node, then
 * the node name itself.
 *
 * example of output :
 *      address : s0.s4.s8.tux1
 *      pattern : switch.switch.switch.node
 */
extern int topo_get_node_addr(char* node_name, char** paddr, char** ppattern)
{
	struct node_record *node_ptr;
	int node_inx;
	hostlist_t sl = NULL;

	int s_max_level = 0;
	int i, j;

	/* no switches found, return */
	if ( switch_record_cnt == 0 ) {
		*paddr = xstrdup(node_name);
		*ppattern = xstrdup("node");
		return SLURM_SUCCESS;
	}

	node_ptr = find_node_record(node_name);
	/* node not found in configuration */
	if ( node_ptr == NULL )
		return SLURM_ERROR;
	node_inx = node_ptr - node_record_table_ptr;

	/* look for switches max level */
	for (i=0; i<switch_record_cnt; i++) {
		if ( switch_record_table[i].level > s_max_level )
			s_max_level = switch_record_table[i].level;
	}

	/* initialize output parameters */
	*paddr = xstrdup("");
	*ppattern = xstrdup("");

	/* build node topology address and the associated pattern,
	 * one hierarchy level per iteration (top level first) */
	for (j=s_max_level; j>=0; j--) {
		/* collect every switch at this level containing the node */
		for (i=0; i<switch_record_cnt; i++) {
			if ( switch_record_table[i].level != j )
				continue;
			if ( !bit_test(switch_record_table[i].
				       node_bitmap, node_inx) )
				continue;
			if ( sl == NULL ) {
				sl = hostlist_create(switch_record_table[i].
						     name);
			} else {
				hostlist_push_host(sl,
						   switch_record_table[i].
						   name);
			}
		}
		if ( sl ) {
			/* collapse the collected switch names into a
			 * ranged expression for this level */
			char *buf = hostlist_ranged_string_xmalloc(sl);
			xstrcat(*paddr,buf);
			xfree(buf);
			hostlist_destroy(sl);
			sl = NULL;
		}
		/* level separator is appended even when no switch at
		 * this level contains the node */
		xstrcat(*paddr, ".");
		xstrcat(*ppattern, "switch.");
	}

	/* append node name */
	xstrcat(*paddr, node_name);
	xstrcat(*ppattern, "node");

	return SLURM_SUCCESS;
}
/* use specific set run tasks on each host listed in hostfile
 *
 * Distributes step_layout->task_cnt tasks over the nodes of the job
 * allocation according to the order/multiplicity given in
 * arbitrary_nodes (one entry per task).  Fills in
 * step_layout->tasks[] (tasks per node) and step_layout->tids[]
 * (global task ids per node).
 *
 * RET SLURM_SUCCESS, or SLURM_ERROR if the hostfile entry count does
 * not match the task count or some tasks could not be placed.
 *
 * XXX: Need to handle over-subscribe.
 */
static int _task_layout_hostfile(slurm_step_layout_t *step_layout,
				 const char *arbitrary_nodes)
{
	int i=0, j, taskid = 0, task_cnt=0;
	hostlist_iterator_t itr = NULL, itr_task = NULL;
	char *host = NULL;

	hostlist_t job_alloc_hosts = NULL;
	hostlist_t step_alloc_hosts = NULL;

	int step_inx = 0, step_hosts_cnt = 0;
	struct node_record **step_hosts_ptrs = NULL;
	struct node_record *host_ptr = NULL;

	debug2("job list is %s", step_layout->node_list);
	if (!arbitrary_nodes) {
		error("no hostlist given for arbitrary dist");
		return SLURM_ERROR;
	}

	debug2("list is %s", arbitrary_nodes);
	step_alloc_hosts = hostlist_create(arbitrary_nodes);
	/* the hostfile must name exactly one host per task */
	if (hostlist_count(step_alloc_hosts) != step_layout->task_cnt) {
		error("Asked for %u tasks have %d in the nodelist. "
		      "Check your nodelist, or set the -n option to be %d",
		      step_layout->task_cnt,
		      hostlist_count(step_alloc_hosts),
		      hostlist_count(step_alloc_hosts));
		hostlist_destroy(step_alloc_hosts);
		return SLURM_ERROR;
	}
	job_alloc_hosts = hostlist_create(step_layout->node_list);
	itr = hostlist_iterator_create(job_alloc_hosts);
	itr_task = hostlist_iterator_create(step_alloc_hosts);

	/*
	 * Build array of pointers so that we can do pointer comparisons
	 * rather than strcmp's on nodes.
	 */
	step_hosts_cnt = hostlist_count(step_alloc_hosts);
	step_hosts_ptrs = xmalloc(sizeof(struct node_record *) *
				  step_hosts_cnt);

	step_inx = 0;
	while((host = hostlist_next(itr_task))) {
		/* resolve each per-task hostname once, up front */
		step_hosts_ptrs[step_inx++] =
			find_node_record_no_alias(host);
		free(host);
	}

	/* For each allocated node, count how many hostfile entries
	 * resolve to it (tasks[]), then assign the matching global
	 * task ids (tids[]) in hostfile order */
	while((host = hostlist_next(itr))) {
		host_ptr = find_node_record(host);
		step_layout->tasks[i] = 0;
		for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) {
			if (host_ptr == step_hosts_ptrs[step_inx]) {
				step_layout->tasks[i]++;
				task_cnt++;
			}
			if (task_cnt >= step_layout->task_cnt)
				break;
		}
		debug3("%s got %u tasks", host, step_layout->tasks[i]);
		if (step_layout->tasks[i] == 0)
			goto reset_hosts;	/* no tasks on this node */
		step_layout->tids[i] = xmalloc(sizeof(uint32_t)
					       * step_layout->tasks[i]);
		taskid = 0;
		j = 0;
		/* taskid is the hostfile position == global task id */
		for (step_inx = 0; step_inx < step_hosts_cnt; step_inx++) {
			if (host_ptr == step_hosts_ptrs[step_inx]) {
				step_layout->tids[i][j] = taskid;
				j++;
			}
			taskid++;
			if (j >= step_layout->tasks[i])
				break;
		}
		i++;
	reset_hosts:
		/* host is freed exactly once per iteration, on both
		 * the normal path and the zero-task path */
		free(host);
		if (i > step_layout->task_cnt)
			break;
	}
	hostlist_iterator_destroy(itr);
	hostlist_iterator_destroy(itr_task);
	hostlist_destroy(job_alloc_hosts);
	hostlist_destroy(step_alloc_hosts);
	xfree(step_hosts_ptrs);

	/* any hostfile entry naming a node outside the allocation
	 * leaves task_cnt short, which is reported here */
	if (task_cnt != step_layout->task_cnt) {
		error("Asked for %u tasks but placed %d. Check your nodelist",
		      step_layout->task_cnt, task_cnt);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
/*
 * _build_part_bitmap - update the total_cpus, total_nodes, and node_bitmap
 *	for the specified partition, also reset the partition pointers in
 *	the node back to this partition.
 * IN part_ptr - pointer to the partition
 * RET 0 if no error, errno otherwise
 * global: node_record_table_ptr - pointer to global node table
 * NOTE: this does not report nodes defined in more than one partition. this
 *	is checked only upon reading the configuration file, not on an update
 */
static int _build_part_bitmap(struct part_record *part_ptr)
{
	char *this_node_name;
	bitstr_t *old_bitmap;
	struct node_record *node_ptr;	/* pointer to node_record */
	hostlist_t host_list;

	part_ptr->total_cpus = 0;
	part_ptr->total_nodes = 0;

	/* old_bitmap remembers the previous membership; bits still set
	 * after the loop are nodes removed from the partition, which
	 * _unlink_free_nodes() detaches below */
	if (part_ptr->node_bitmap == NULL) {
		part_ptr->node_bitmap = bit_alloc(node_record_count);
		old_bitmap = NULL;
	} else {
		old_bitmap = bit_copy(part_ptr->node_bitmap);
		bit_nclear(part_ptr->node_bitmap, 0,
			   node_record_count - 1);
	}

	if (part_ptr->nodes == NULL) {	/* no nodes in partition */
		_unlink_free_nodes(old_bitmap, part_ptr);
		FREE_NULL_BITMAP(old_bitmap);
		return 0;
	}

	if ((host_list = hostlist_create(part_ptr->nodes)) == NULL) {
		FREE_NULL_BITMAP(old_bitmap);
		error("hostlist_create error on %s, %m",
		      part_ptr->nodes);
		return ESLURM_INVALID_NODE_NAME;
	}

	while ((this_node_name = hostlist_shift(host_list))) {
		node_ptr = find_node_record(this_node_name);
		if (node_ptr == NULL) {
			/* any unknown name aborts the whole rebuild */
			error("_build_part_bitmap: invalid node name %s",
				this_node_name);
			free(this_node_name);
			FREE_NULL_BITMAP(old_bitmap);
			hostlist_destroy(host_list);
			return ESLURM_INVALID_NODE_NAME;
		}
		part_ptr->total_nodes++;
		/* fast_schedule uses configured CPU counts rather than
		 * the values reported by the node */
		if (slurmctld_conf.fast_schedule)
			part_ptr->total_cpus +=
				node_ptr->config_ptr->cpus;
		else
			part_ptr->total_cpus += node_ptr->cpus;
		/* append this partition to the node's partition list */
		node_ptr->part_cnt++;
		xrealloc(node_ptr->part_pptr, (node_ptr->part_cnt *
			sizeof(struct part_record *)));
		node_ptr->part_pptr[node_ptr->part_cnt-1] = part_ptr;
		if (old_bitmap)
			bit_clear(old_bitmap,
				  (int) (node_ptr -
					 node_record_table_ptr));
		bit_set(part_ptr->node_bitmap,
			(int) (node_ptr - node_record_table_ptr));
		free(this_node_name);
	}
	hostlist_destroy(host_list);

	/* detach nodes that are no longer in the partition */
	_unlink_free_nodes(old_bitmap, part_ptr);
	last_node_update = time(NULL);
	FREE_NULL_BITMAP(old_bitmap);
	return 0;
}
/*
 * _build_single_nodeline_info - From the slurm.conf reader, build table,
 *	and set values
 * IN node_ptr   - parsed NodeName line from slurm.conf
 * IN config_ptr - configuration record shared by these nodes
 * RET 0 if no error, error code otherwise
 * Note: Operates on common variables
 *	default_node_record - default node configuration values
 *
 * Fix vs. previous revision: three error messages read
 * "records of there must be no more than one"; "of" was a typo for
 * "or" (compare the Port message, which was already correct).
 */
static int _build_single_nodeline_info(slurm_conf_node_t *node_ptr,
				       struct config_record *config_ptr)
{
	int error_code = SLURM_SUCCESS;
	struct node_record *node_rec = NULL;
	hostlist_t address_list = NULL;
	hostlist_t alias_list = NULL;
	hostlist_t hostname_list = NULL;
	hostlist_t port_list = NULL;
	char *address = NULL;
	char *alias = NULL;
	char *hostname = NULL;
	char *port_str = NULL;
	int state_val = NODE_STATE_UNKNOWN;
	int address_count, alias_count, hostname_count, port_count;
	uint16_t port = 0;

	if (node_ptr->state != NULL) {
		state_val = state_str2int(node_ptr->state,
					  node_ptr->nodenames);
		if (state_val == NO_VAL)
			goto cleanup;
	}

	/* NOTE(review): fatal() does not return, so the error_code
	 * assignments after these three calls are unreachable; kept
	 * as-is in case fatal() is downgraded to error() later */
	if ((address_list = hostlist_create(node_ptr->addresses)) == NULL) {
		fatal("Unable to create NodeAddr list from %s",
		      node_ptr->addresses);
		error_code = errno;
		goto cleanup;
	}
	if ((alias_list = hostlist_create(node_ptr->nodenames)) == NULL) {
		fatal("Unable to create NodeName list from %s",
		      node_ptr->nodenames);
		error_code = errno;
		goto cleanup;
	}
	if ((hostname_list = hostlist_create(node_ptr->hostnames)) == NULL) {
		fatal("Unable to create NodeHostname list from %s",
		      node_ptr->hostnames);
		error_code = errno;
		goto cleanup;
	}

	/* Port ranges/lists ("6818-6819" or "6818,6819") must be
	 * bracketed so hostlist_create() parses them as a range
	 * expression rather than a hostname */
	if (node_ptr->port_str && node_ptr->port_str[0] &&
	    (node_ptr->port_str[0] != '[') &&
	    (strchr(node_ptr->port_str, '-') ||
	     strchr(node_ptr->port_str, ','))) {
		xstrfmtcat(port_str, "[%s]", node_ptr->port_str);
		port_list = hostlist_create(port_str);
		xfree(port_str);
	} else {
		port_list = hostlist_create(node_ptr->port_str);
	}
	if (port_list == NULL) {
		error("Unable to create Port list from %s",
		      node_ptr->port_str);
		error_code = errno;
		goto cleanup;
	}

	/* some sanity checks */
	address_count  = hostlist_count(address_list);
	alias_count    = hostlist_count(alias_list);
	hostname_count = hostlist_count(hostname_list);
	port_count     = hostlist_count(port_list);
#ifdef HAVE_FRONT_END
	if ((hostname_count != alias_count) && (hostname_count != 1)) {
		error("NodeHostname count must equal that of NodeName "
		      "records or there must be no more than one");
		goto cleanup;
	}
	if ((address_count != alias_count) && (address_count != 1)) {
		error("NodeAddr count must equal that of NodeName "
		      "records or there must be no more than one");
		goto cleanup;
	}
#else
#ifdef MULTIPLE_SLURMD
	if ((address_count != alias_count) && (address_count != 1)) {
		error("NodeAddr count must equal that of NodeName "
		      "records or there must be no more than one");
		goto cleanup;
	}
#else
	if (address_count < alias_count) {
		error("At least as many NodeAddr are required as NodeName");
		goto cleanup;
	}
	if (hostname_count < alias_count) {
		error("At least as many NodeHostname are required "
		      "as NodeName");
		goto cleanup;
	}
#endif	/* MULTIPLE_SLURMD */
#endif	/* HAVE_FRONT_END */
	if ((port_count != alias_count) && (port_count > 1)) {
		error("Port count must equal that of NodeName "
		      "records or there must be no more than one");
		goto cleanup;
	}

	/* now build the individual node structures; when a count is
	 * exhausted (e.g. one NodeAddr for many NodeNames) the last
	 * shifted value is reused for the remaining aliases */
	while ((alias = hostlist_shift(alias_list))) {
		if (address_count > 0) {
			address_count--;
			if (address)
				free(address);
			address = hostlist_shift(address_list);
		}
		if (hostname_count > 0) {
			hostname_count--;
			if (hostname)
				free(hostname);
			hostname = hostlist_shift(hostname_list);
		}
		if (port_count > 0) {
			int port_int;
			port_count--;
			if (port_str)
				free(port_str);
			port_str = hostlist_shift(port_list);
			port_int = atoi(port_str);
			if ((port_int <= 0) || (port_int > 0xffff))
				fatal("Invalid Port %s",
				      node_ptr->port_str);
			port = port_int;
		}
		/* find_node_record locks this to get the
		 * alias so we need to unlock */
		node_rec = find_node_record(alias);
		if (node_rec == NULL) {
			node_rec = create_node_record(config_ptr, alias);
			if ((state_val != NO_VAL) &&
			    (state_val != NODE_STATE_UNKNOWN))
				node_rec->node_state = state_val;
			node_rec->last_response = (time_t) 0;
			node_rec->comm_name = xstrdup(address);
			node_rec->node_hostname = xstrdup(hostname);
			node_rec->port = port;
			node_rec->weight = node_ptr->weight;
			node_rec->features = xstrdup(node_ptr->feature);
			node_rec->reason = xstrdup(node_ptr->reason);
		} else {
			/* FIXME - maybe should be fatal? */
			error("Reconfiguration for node %s, ignoring!",
			      alias);
		}
		free(alias);
	}

	/* free allocated storage */
cleanup:
	if (address)
		free(address);
	if (hostname)
		free(hostname);
	if (port_str)
		free(port_str);
	if (address_list)
		hostlist_destroy(address_list);
	if (alias_list)
		hostlist_destroy(alias_list);
	if (hostname_list)
		hostlist_destroy(hostname_list);
	if (port_list)
		hostlist_destroy(port_list);
	return error_code;
}
/* Test if a batch launch request should be defered
 * IN queued_req_ptr - queued agent request holding the launch message
 * RET -1: abort the request, pending job cancelled
 *      0: execute the request now
 *      1: defer the request
 */
static int _batch_launch_defer(queued_request_t *queued_req_ptr)
{
	agent_arg_t *agent_arg_ptr;
	batch_job_launch_msg_t *launch_msg_ptr;
	time_t now = time(NULL);
	struct job_record *job_ptr;
	int delay_time, nodes_ready = 0;

	agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
	/* only batch launch requests are ever deferred */
	if (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH)
		return 0;

	if (difftime(now, queued_req_ptr->last_attempt) < 10) {
		/* Reduce overhead by only testing once every 10 secs */
		return 1;
	}

	launch_msg_ptr = (batch_job_launch_msg_t *)agent_arg_ptr->msg_args;
	job_ptr = find_job_record(launch_msg_ptr->job_id);
	if ((job_ptr == NULL) ||
	    (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) {
		info("agent(batch_launch): removed pending request for "
		     "cancelled job %u",
		     launch_msg_ptr->job_id);
		return -1;	/* job cancelled while waiting */
	}

	if (job_ptr->wait_all_nodes) {
		/* job requires every allocated node to be up */
		(void) job_node_ready(launch_msg_ptr->job_id, &nodes_ready);
	} else {
#ifdef HAVE_FRONT_END
		nodes_ready = 1;
#else
		struct node_record *node_ptr;
		char *hostname;

		/* NOTE(review): the whole hostlist is collapsed into a
		 * single string and looked up as one node name — this
		 * assumes a batch launch targets exactly one node;
		 * confirm for multi-node hostlists */
		hostname = hostlist_deranged_string_xmalloc(
					agent_arg_ptr->hostlist);
		node_ptr = find_node_record(hostname);
		if (node_ptr == NULL) {
			error("agent(batch_launch) removed pending request "
			      "for job %u, missing node %s",
			      launch_msg_ptr->job_id, hostname);
			xfree(hostname);
			return -1;	/* invalid request?? */
		}
		xfree(hostname);
		/* node is usable unless powered down or not responding */
		if (!IS_NODE_POWER_SAVE(node_ptr) &&
		    !IS_NODE_NO_RESPOND(node_ptr)) {
			nodes_ready = 1;
		}
#endif
	}

	delay_time = difftime(now, job_ptr->start_time);
	if (nodes_ready) {
		/* ready to launch, adjust time limit for boot time */
		if (delay_time && (job_ptr->time_limit != INFINITE) &&
		    (!wiki2_sched)) {
			info("Job %u launch delayed by %d secs, "
			     "updating end_time",
			     launch_msg_ptr->job_id, delay_time);
			job_ptr->end_time += delay_time;
		}
		queued_req_ptr->last_attempt = (time_t) 0;
		return 0;
	}

	if (queued_req_ptr->last_attempt == 0) {
		/* first deferral: start the wait clock */
		queued_req_ptr->first_attempt = now;
		queued_req_ptr->last_attempt = now;
	} else if (difftime(now, queued_req_ptr->first_attempt) >=
		   slurm_get_resume_timeout()) {
		/* waited past ResumeTimeout: launch regardless */
		error("agent waited too long for nodes to respond, "
		      "sending batch request anyway...");
		if (delay_time && (job_ptr->time_limit != INFINITE) &&
		    (!wiki2_sched)) {
			info("Job %u launch delayed by %d secs, "
			     "updating end_time",
			     launch_msg_ptr->job_id, delay_time);
			job_ptr->end_time += delay_time;
		}
		queued_req_ptr->last_attempt = (time_t) 0;
		return 0;
	}

	queued_req_ptr->last_attempt = now;
	return 1;
}
/*
 * get_nodes - get information on specific node(s) changed since some time
 * cmd_ptr IN   - CMD=GETNODES ARG=[<UPDATETIME>:<NODEID>[:<NODEID>]...]
 *		  [<UPDATETIME>:ALL]
 * err_code OUT - 0 or an error code
 * err_msg OUT  - response message
 * NOTE: xfree() err_msg if err_code is zero
 * RET 0 on success, -1 on failure
 *
 * Response format
 * ARG=<cnt>#<NODEID>:
 *	STATE=<state>;		 Moab equivalent node state
 *	[CAT=<reason>];		 Reason for a node being down or drained
 *	colon separator
 *	CCLASS=<[part:cpus]>;	 SLURM partition with CPU count of node,
 *				 make have more than one partition
 *	[CPULOAD=<load_ave>;]	 One minute BSD load average
 *	[ARCH=<architecture>;]	 Computer architecture
 *	[OS=<operating_system>;] Operating system
 *	CMEMORY=<MB>;		 MB of memory on node
 *	CDISK=<MB>;		 MB of disk space on node
 *	CPROC=<cpus>;		 CPU count on node
 *	[FEATURE=<feature>;]	 Features associated with node, if any
 *	[GRES=<name>[:<count>],...;] generic resources on the node
 *  [#<NODEID>:...];
 */
extern int get_nodes(char *cmd_ptr, int *err_code, char **err_msg)
{
	char *arg_ptr = NULL, *tmp_char = NULL, *tmp_buf = NULL, *buf = NULL;
	time_t update_time;
	/* Locks: read node, read partition */
	slurmctld_lock_t node_read_lock = {
		NO_LOCK, NO_LOCK, READ_LOCK, READ_LOCK };
	int node_rec_cnt = 0, buf_size = 0;

#ifdef HAVE_ALPS_CRAY
	/* Locks: write node */
	slurmctld_lock_t node_write_lock = {
		NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };

	/*
	 * Run a Basil Inventory immediately before scheduling, to avoid
	 * race conditions caused by ALPS node state change (caused e.g.
	 * by the node health checker).
	 * This relies on the above write lock for the node state.
	 */
	lock_slurmctld(node_write_lock);
	if (select_g_reconfigure()) {
		unlock_slurmctld(node_write_lock);
		*err_code = -720;
		*err_msg = "Unable to run ALPS inventory";
		error("wiki: Unable to run ALPS inventory");
		return -1;
	}
	unlock_slurmctld(node_write_lock);
#endif

	/* parse "ARG=<updatetime>:<spec>" out of the command buffer */
	arg_ptr = strstr(cmd_ptr, "ARG=");
	if (arg_ptr == NULL) {
		*err_code = -300;
		*err_msg = "GETNODES lacks ARG";
		error("wiki: GETNODES lacks ARG");
		return -1;
	}
	update_time = (time_t) strtoul(arg_ptr+4, &tmp_char, 10);
	if (tmp_char[0] != ':') {
		*err_code = -300;
		*err_msg = "Invalid ARG value";
		error("wiki: GETNODES has invalid ARG value");
		return -1;
	}
	tmp_char++;	/* skip the ':' to the node specification */

	lock_slurmctld(node_read_lock);
	if (strncmp(tmp_char, "ALL", 3) == 0) {
		/* report all nodes */
		buf = _dump_all_nodes(&node_rec_cnt, update_time);
	} else {
		struct node_record *node_ptr = NULL;
		char *node_name, *slurm_hosts;
		int node_cnt;
		hostset_t slurm_hostset;

		/* translate Moab task list syntax to a slurm hostlist */
		slurm_hosts = moab2slurm_task_list(tmp_char, &node_cnt);
		if ((slurm_hostset = hostset_create(slurm_hosts))) {
			while ((node_name =
				hostset_shift(slurm_hostset))) {
				/* NOTE(review): node_name from
				 * hostset_shift() is never free()d in
				 * this loop — verify whether this is a
				 * leak or ownership is retained by the
				 * hostset */
				node_ptr = find_node_record(node_name);
				if (node_ptr == NULL) {
					error("sched/wiki2: bad hostname %s",
					      node_name);
					continue;
				}
				if (_hidden_node(node_ptr))
					continue;
				tmp_buf = _dump_node(node_ptr, NULL,
						     update_time);
				if (node_rec_cnt > 0)
					xstrcat(buf, "#");
				xstrcat(buf, tmp_buf);
				xfree(tmp_buf);
				node_rec_cnt++;
			}
			hostset_destroy(slurm_hostset);
		} else {
			error("hostset_create(%s): %m", slurm_hosts);
		}
		xfree(slurm_hosts);
	}
	unlock_slurmctld(node_read_lock);

	/* Prepend ("ARG=%d", node_rec_cnt) to reply message */
	if (buf)
		buf_size = strlen(buf);
	tmp_buf = xmalloc(buf_size + 32);
	if (node_rec_cnt)
		sprintf(tmp_buf, "SC=0 ARG=%d#%s", node_rec_cnt, buf);
	else
		sprintf(tmp_buf, "SC=0 ARG=0#");
	xfree(buf);
	*err_code = 0;
	*err_msg = tmp_buf;
	return 0;
}