/*
 * Apply the admin action named by display_data->name to every front-end
 * node currently selected in the treeview.  No-op when treeview is NULL.
 */
extern void select_admin_front_end(GtkTreeModel *model, GtkTreeIter *iter,
				   display_data_t *display_data,
				   GtkTreeView *treeview)
{
	hostlist_t hl;
	char *node_list;
	front_end_user_data_t user_data;

	if (!treeview)
		return;

	/* Gather the name of each selected front-end node. */
	memset(&user_data, 0, sizeof(front_end_user_data_t));
	gtk_tree_selection_selected_foreach(
		gtk_tree_view_get_selection(treeview),
		_process_each_front_end, &user_data);

	/* Collapse the gathered names into a de-duplicated, sorted range. */
	hl = hostlist_create(user_data.node_list);
	hostlist_uniq(hl);
	hostlist_sort(hl);
	xfree(user_data.node_list);
	node_list = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);

	_admin_front_end(model, iter, display_data->name, node_list);
	xfree(node_list);
}
/*
 * Convert a SLURM hostlist expression into the equivalent node index
 * (nid) value expression.  Also forces the node-count options to match
 * the hostlist size, since aprun requires an exact fit.
 *
 * Returns an allocated nid string, or NULL when no nodelist was given
 * or the expression is invalid.
 */
static char *_get_nids(opt_t *opt_local)
{
	char *nids;
	hostlist_t hl;
	int node_cnt;

	if (!opt_local->nodelist)
		return NULL;

	hl = hostlist_create(opt_local->nodelist);
	if (!hl) {
		error("Invalid hostlist: %s", opt_local->nodelist);
		return NULL;
	}
	hostlist_uniq(hl);

	/*
	 * aprun needs the hostlist to be the exact size requested, so
	 * override any conflicting node count from the options.
	 */
	node_cnt = hostlist_count(hl);
	if (opt_local->nodes_set_opt && (node_cnt != opt_local->min_nodes)) {
		error("You requested %d nodes and %d hosts. These numbers "
		      "must be the same, so setting number of nodes to %d",
		      opt_local->min_nodes, node_cnt, node_cnt);
	}
	opt_local->min_nodes = node_cnt;
	opt_local->nodes_set = 1;

	nids = cray_nodelist2nids(hl, NULL);
	hostlist_destroy(hl);

	return nids;
}
/*
 * Build a de-duplicated hostlist covering every process in procs[].
 * A PMIX_RANK_WILDCARD rank contributes every host of its namespace;
 * any other rank contributes just the host running that rank.
 *
 * OUT hl_out - the resulting hostlist (caller must destroy)
 * RET SLURM_SUCCESS, or SLURM_ERROR if any namespace cannot be found
 *     (in which case no hostlist is returned).
 */
static int _hostset_from_ranges(const pmix_proc_t *procs, size_t nprocs,
				hostlist_t *hl_out)
{
	/* size_t, not int: avoid signed/unsigned comparison with nprocs */
	size_t i;
	hostlist_t hl = hostlist_create("");
	pmixp_namespace_t *nsptr = NULL;

	for (i = 0; i < nprocs; i++) {
		char *node = NULL;
		hostlist_t tmp;

		nsptr = pmixp_nspaces_find(procs[i].nspace);
		if (NULL == nsptr)
			goto err_exit;
		if (procs[i].rank == PMIX_RANK_WILDCARD)
			tmp = hostlist_copy(nsptr->hl);
		else
			tmp = pmixp_nspace_rankhosts(nsptr, &procs[i].rank, 1);
		/* hostlist_pop() returns malloc'd strings; free each after
		 * it has been copied into hl. */
		while (NULL != (node = hostlist_pop(tmp))) {
			hostlist_push(hl, node);
			free(node);
		}
		hostlist_destroy(tmp);
	}
	hostlist_uniq(hl);
	*hl_out = hl;
	return SLURM_SUCCESS;
err_exit:
	hostlist_destroy(hl);
	return SLURM_ERROR;
}
/*
 * forward_msg - logic to forward a message which has been received and
 *               accumulate the return codes from processes getting the
 *               the forwarded message
 *
 * IN: forward_struct - forward_struct_t *   - holds information about message
 *                                             that needs to be forwarded to
 *                                             children processes
 * IN: header         - header_t             - header from message that came in
 *                                             needing to be forwarded.
 * RET: SLURM_SUCCESS - int
 */
extern int forward_msg(forward_struct_t *forward_struct, header_t *header)
{
	hostlist_t hl = NULL;
	hostlist_t *sp_hl;
	int hl_count = 0;

	if (!forward_struct->ret_list) {
		error("didn't get a ret_list from forward_struct");
		return SLURM_ERROR;
	}

	/* hostlist_create() returns NULL for an invalid expression;
	 * check it before hostlist_uniq() dereferences the list. */
	hl = hostlist_create(header->forward.nodelist);
	if (!hl) {
		error("unable to create forward hostlist from \"%s\"",
		      header->forward.nodelist);
		return SLURM_ERROR;
	}
	hostlist_uniq(hl);

	if (route_g_split_hostlist(hl, &sp_hl, &hl_count,
				   header->forward.tree_width)) {
		error("unable to split forward hostlist");
		hostlist_destroy(hl);
		return SLURM_ERROR;
	}

	/* Spawn one forwarding path per child sub-list. */
	_forward_msg_internal(NULL, sp_hl, forward_struct, header,
			      forward_struct->timeout, hl_count);

	xfree(sp_hl);
	hostlist_destroy(hl);
	return SLURM_SUCCESS;
}
/*
 * Build a hostlist from genders attributes.
 *
 * attrs == NULL means "all nodes"; an empty attribute list yields NULL.
 * Otherwise the union of all hosts matching each attribute is returned,
 * de-duplicated.
 */
static hostlist_t _read_genders (List attrs, int iopt)
{
	ListIterator i = NULL;
	hostlist_t hl = NULL;
	char *attr = NULL;

	if (attrs == NULL) /* Special "all nodes" case */
		return _read_genders_attr (ALL_NODES, iopt);

	/* NOTE: the original also re-tested (attrs == NULL) here, which is
	 * unreachable after the return above; only the empty-list check
	 * is live. */
	if (list_count (attrs) == 0)
		return NULL;

	if ((i = list_iterator_create (attrs)) == NULL)
		errx ("genders: unable to create list iterator: %m\n");

	while ((attr = list_next (i))) {
		hostlist_t l = _read_genders_attr (attr, iopt);

		if (hl == NULL) {
			hl = l;
		} else {
			hostlist_push_list (hl, l);
			hostlist_destroy (l);
		}
	}

	list_iterator_destroy (i);
	hostlist_uniq (hl);
	return (hl);
}
/*
 * start_msg_tree - logic to begin the forward tree and
 *	accumulate the return codes from processes getting the
 *	the forwarded message
 *
 * IN: hl      - hostlist_t  - list of every node to send message to
 * IN: msg     - slurm_msg_t - message to send.
 * IN: timeout - int         - how long to wait in milliseconds.
 * RET List    - List containing the responses of the children
 *		 (if any) we forwarded the message to. List
 *		 containing type (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	fwd_tree_t fwd_tree;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int count = 0;
	List ret_list = NULL;
	int thr_count = 0;	/* number of active forwarding threads */
	int host_count = 0;
	hostlist_t* sp_hl;
	int hl_count = 0;

	xassert(hl);
	xassert(msg);

	/* De-duplicate, then split the list into one sub-list per
	 * direct child of the tree. */
	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	if (route_g_split_hostlist(hl, &sp_hl, &hl_count,
				   msg->forward.tree_width)) {
		error("unable to split forward hostlist");
		return NULL;
	}
	slurm_mutex_init(&tree_mutex);
	slurm_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	memset(&fwd_tree, 0, sizeof(fwd_tree));
	fwd_tree.orig_msg = msg;
	fwd_tree.ret_list = ret_list;
	fwd_tree.timeout = timeout;
	fwd_tree.notify = &notify;
	fwd_tree.p_thr_count = &thr_count;
	fwd_tree.tree_mutex = &tree_mutex;

	/* Spawn one forwarding thread per child sub-list; each thread
	 * appends results to ret_list and signals notify when done. */
	_start_msg_tree_internal(NULL, sp_hl, &fwd_tree, hl_count);

	xfree(sp_hl);

	/* Wait under tree_mutex until every forwarding thread exits. */
	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while (thr_count > 0) {
		slurm_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	xassert(count >= host_count);	/* Tree head did not get all responses,
					 * but no more active fwd threads!*/
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	slurm_cond_destroy(&notify);

	return ret_list;
}
/* Update acct_gather data for every node that is not DOWN */
extern void update_nodes_acct_gather_data(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	struct node_record *node_ptr;
#endif
	int i;
	char *host_str = NULL;
	agent_arg_t *agent_args = NULL;

	agent_args = xmalloc (sizeof (agent_arg_t));
	agent_args->msg_type = REQUEST_ACCT_GATHER_UPDATE;
	agent_args->retry = 0;
	agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
	agent_args->hostlist = hostlist_create(NULL);
#ifdef HAVE_FRONT_END
	/* Front-end mode: queue the request to every responding front-end
	 * node, lowering protocol_version to the oldest version in use. */
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if (IS_NODE_NO_RESPOND(front_end_ptr))
			continue;
		if (agent_args->protocol_version >
		    front_end_ptr->protocol_version)
			agent_args->protocol_version =
				front_end_ptr->protocol_version;
		hostlist_push_host(agent_args->hostlist, front_end_ptr->name);
		agent_args->node_count++;
	}
#else
	/* Skip nodes that are unresponsive, FUTURE, or powered down;
	 * lower protocol_version to the oldest version in use. */
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if (agent_args->protocol_version > node_ptr->protocol_version)
			agent_args->protocol_version =
				node_ptr->protocol_version;
		hostlist_push_host(agent_args->hostlist, node_ptr->name);
		agent_args->node_count++;
	}
#endif
	if (agent_args->node_count == 0) {
		/* Nothing to update: release the unused request. */
		hostlist_destroy(agent_args->hostlist);
		xfree (agent_args);
	} else {
		hostlist_uniq(agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(agent_args->hostlist);
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_ENERGY)
			info("Updating acct_gather data for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(agent_args);
	}
}
/* argp parser callback: handle one ipmidetect command-line option or
 * positional argument.  Returns 0 on success, ARGP_ERR_UNKNOWN for
 * keys this parser does not handle. */
static error_t cmdline_parse (int key, char *arg, struct argp_state *state)
{
	struct ipmidetect_arguments *cmd_args;
	char *endptr;

	assert (state);

	cmd_args = state->input;

	switch (key) {
	case IPMIDETECT_HOSTNAME_KEY:
	case IPMIDETECT_LEGACY_HOSTNAME_KEY:
		if (!(cmd_args->hostname = strdup (arg)))
			err_exit ("strdup: %s", strerror (errno));
		break;
	case IPMIDETECT_PORT_KEY:
		/* strtol error detection: clear errno first, then require
		 * the whole string to have been consumed. */
		errno = 0;
		cmd_args->port = strtol (arg, &endptr, 10);
		if (errno || endptr[0] != '\0')
			err_exit ("invalid port specified");
		break;
	case IPMIDETECT_DETECTED_KEY:
		cmd_args->output_type = IPMIDETECT_DETECTED_NODES;
		break;
	case IPMIDETECT_UNDETECTED_KEY:
		cmd_args->output_type = IPMIDETECT_UNDETECTED_NODES;
		break;
	case IPMIDETECT_HOSTRANGE_KEY:
		/* output_format: 0 = hostrange, else separator character */
		cmd_args->output_format = 0;
		break;
	case IPMIDETECT_COMMA_KEY:
		cmd_args->output_format = ',';
		break;
	case IPMIDETECT_NEWLINE_KEY:
		cmd_args->output_format = '\n';
		break;
	case IPMIDETECT_SPACE_KEY:
		cmd_args->output_format = ' ';
		break;
	case ARGP_KEY_ARG:
		/* Positional argument: "-" reads node names from stdin,
		 * anything else is pushed directly. */
		if (!strcmp (arg, "-"))
			_read_nodes_from_stdin (cmd_args);
		else
			_push_inputted_nodes (cmd_args, arg);
		hostlist_uniq (cmd_args->inputted_nodes);
		break;
	case ARGP_KEY_END:
		break;
	default:
		return (ARGP_ERR_UNKNOWN);
	}

	return (0);
}
/*
 * slurm_step_layout_create - determine how many tasks of a job will be
 *                            run on each node. Distribution is influenced
 *                            by number of cpus on each host.
 * IN tlist - hostlist corresponding to task layout
 * IN cpus_per_node - cpus per node
 * IN cpu_count_reps - how many nodes have same cpu count
 * IN num_hosts - number of hosts we have
 * IN num_tasks - number of tasks to distribute across these cpus
 * IN cpus_per_task - number of cpus per task
 * IN task_dist - type of distribution we are using
 * IN plane_size - plane size (only needed for the plane distribution)
 * RET a pointer to an slurm_step_layout_t structure
 * NOTE: allocates memory that should be xfreed by caller
 */
slurm_step_layout_t *slurm_step_layout_create(
	const char *tlist,
	uint16_t *cpus_per_node, uint32_t *cpu_count_reps,
	uint32_t num_hosts,
	uint32_t num_tasks,
	uint16_t cpus_per_task,
	uint16_t task_dist,
	uint16_t plane_size)
{
	char *arbitrary_nodes = NULL;
	slurm_step_layout_t *step_layout =
		xmalloc(sizeof(slurm_step_layout_t));
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	step_layout->task_dist = task_dist;
	if (task_dist == SLURM_DIST_ARBITRARY) {
		hostlist_t hl = NULL;
		char *buf = NULL;
		/* set the node list for the task layout later if user
		 * supplied could be different that the job allocation */
		arbitrary_nodes = xstrdup(tlist);

		/* Collapse the (possibly repeated) task list into a
		 * unique host range; num_hosts becomes the unique count. */
		hl = hostlist_create(tlist);
		hostlist_uniq(hl);
		buf = hostlist_ranged_string_xmalloc(hl);
		num_hosts = hostlist_count(hl);
		hostlist_destroy(hl);
		step_layout->node_list = buf;
	} else {
		step_layout->node_list = xstrdup(tlist);
	}

	step_layout->task_cnt = num_tasks;
	if (cluster_flags & CLUSTER_FLAG_FE) {
		/* Limited job step support on front-end systems.
		 * All jobs execute through front-end on Blue Gene.
		 * Normally we would not permit execution of job steps,
		 * but can fake it by just allocating all tasks to
		 * one of the allocated nodes. */
		if ((cluster_flags & CLUSTER_FLAG_BG) ||
		    (cluster_flags & CLUSTER_FLAG_CRAY_A))
			step_layout->node_cnt = num_hosts;
		else
			step_layout->node_cnt = 1;
	} else
		step_layout->node_cnt = num_hosts;

	if (_init_task_layout(step_layout, arbitrary_nodes,
			      cpus_per_node, cpu_count_reps,
			      cpus_per_task,
			      task_dist, plane_size) != SLURM_SUCCESS) {
		/* Layout failed: release the partially built structure
		 * and signal failure to the caller with NULL. */
		slurm_step_layout_destroy(step_layout);
		step_layout = NULL;
	}
	xfree(arbitrary_nodes);
	return step_layout;
}
/* Validating wrapper around hostlist_uniq(): reject a NULL hostlist
 * before delegating. */
void wrap_hostlist_uniq(WRAPPERS_ARGS, hostlist_t hl)
{
	assert(file && function);

	if (!hl)
		WRAPPERS_ERR_INVALID_PARAMETERS("hostlist_uniq");

	hostlist_uniq(hl);
}
/* Spawn health check function for every node that is not DOWN */
extern void run_health_check(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	struct node_record *node_ptr;
#endif
	int i;
	char *host_str = NULL;
	agent_arg_t *check_agent_args = NULL;

	check_agent_args = xmalloc (sizeof (agent_arg_t));
	check_agent_args->msg_type = REQUEST_HEALTH_CHECK;
	check_agent_args->retry = 0;
	check_agent_args->hostlist = hostlist_create("");
	if (check_agent_args->hostlist == NULL)
		fatal("hostlist_create: malloc failure");
#ifdef HAVE_FRONT_END
	/* Front-end mode: check every responding front-end node. */
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if (IS_NODE_NO_RESPOND(front_end_ptr))
			continue;
		hostlist_push(check_agent_args->hostlist, front_end_ptr->name);
		check_agent_args->node_count++;
	}
#else
	/* Skip nodes that are unresponsive, FUTURE, or powered down. */
	for (i=0, node_ptr=node_record_table_ptr;
	     i<node_record_count; i++, node_ptr++) {
		if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_POWER_SAVE(node_ptr))
			continue;
		hostlist_push(check_agent_args->hostlist, node_ptr->name);
		check_agent_args->node_count++;
	}
#endif
	if (check_agent_args->node_count == 0) {
		/* Nothing to check: release the unused request. */
		hostlist_destroy(check_agent_args->hostlist);
		xfree (check_agent_args);
	} else {
		hostlist_uniq(check_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				check_agent_args->hostlist);
		debug("Spawning health check agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(check_agent_args);
	}
}
/* Build the pdsh target host list from SLURM job information: every
 * running job when "all" was requested, the jobs named in joblist, or
 * (with no -j option) the job from the SLURM_JOBID environment
 * variable.  Returns NULL when nothing matched. */
static hostlist_t _slurm_wcoll (List joblist)
{
	int i;
	hostlist_t hl = NULL;
	job_info_msg_t * msg;
	int32_t envjobid = 0;
	int alljobids = 0;

	/* No -j option and no usable SLURM_JOBID: nothing to do. */
	if ((joblist == NULL) && (envjobid = _slurm_jobid()) < 0)
		return (NULL);

	if (slurm_load_jobs((time_t) NULL, &msg, 1) < 0)
		errx ("Unable to contact slurm controller: %s\n",
		      slurm_strerror (errno));

	/*
	 *  Check for "all" in joblist
	 */
	alljobids = _alljobids_requested (joblist);

	for (i = 0; i < msg->record_count; i++) {
		job_info_t *j = &msg->job_array[i];

		if (alljobids && j->job_state == JOB_RUNNING)
			hl = _hl_append (hl, j->nodes);
		else if (!joblist && (j->job_id == envjobid)) {
			/*
			 *  Only use SLURM_JOBID environment variable if user
			 *   didn't override with -j option
			 */
			hl = hostlist_create (j->nodes);
			break;
		} else if (_jobid_requested (joblist, j->job_id)) {
			hl = _hl_append (hl, j->nodes);
			/*
			 *  Exit when there is no more jobids to search
			 */
			if (list_count (joblist) == 0)
				break;
		}
	}

	slurm_free_job_info_msg (msg);

	if (hl)
		hostlist_uniq (hl);

	return (hl);
}
/* Append to buf a compact tasklist expression (e.g. "tux[0-1]*2")
 * Prepend ":" to expression as needed */
static void _append_hl_buf(char **buf, hostlist_t *hl_tmp, int *reps)
{
	char *host_str;
	char *tok, *sep;
	int i, in_bracket = 0, fini = 0;

	hostlist_uniq(*hl_tmp);
	host_str = hostlist_ranged_string_xmalloc(*hl_tmp);

	/* Note that host_str may be of this form "alpha,beta". We want
	 * to record this as "alpha*#:beta*#" and NOT "alpha,beta*#".
	 * NOTE: Do not break up command within brackets (e.g. "tux[1,2-4]") */
	if (*buf)
		sep = ":";
	else
		sep = "";
	tok = host_str;
	/* Scan host_str splitting on top-level commas; each token is
	 * emitted as "name*reps" into *buf. */
	for (i=0; fini == 0; i++) {
		switch (tok[i]) {
		case '[':
			in_bracket = 1;
			break;
		case ']':
			in_bracket = 0;
			break;
		case '\0':
			fini = 1;
			if (in_bracket)
				error("badly formed hostlist %s", tok);
			/* fall through: flush the final token like ',' */
		case ',':
			if (in_bracket)	/* comma inside "tux[1,2-4]" */
				break;
			tok[i] = '\0';	/* terminate token in place */
			xstrfmtcat(*buf, "%s%s*%d", sep, tok, *reps);
			sep = ":";
			tok += (i + 1);	/* continue past the separator */
			i = -1;		/* loop's i++ restarts scan at 0 */
			break;
		}
	}
	xfree(host_str);
	hostlist_destroy(*hl_tmp);
	*hl_tmp = (hostlist_t) NULL;
	*reps = 0;
}
/* Build the pdsh target host list from the SLURM partitions named in
 * partitionlist.  Any partition left unmatched is reported with a
 * warning.  Returns NULL when nothing matched. */
static hostlist_t _slurm_wcoll_partition (List partitionlist)
{
	int i;
	char * str;
	hostlist_t hl = NULL;
	partition_info_msg_t * msg;
	partition_info_t * p;
	ListIterator li;

	if (slurm_load_partitions((time_t) NULL, &msg, 1) < 0)
		errx ("Unable to contact slurm controller: %s\n",
		      slurm_strerror (errno));

	for (i = 0; i < msg->record_count; i++){
		p = &msg->partition_array[i];
		if (_partition_requested (partitionlist, p->name)) {
			hl = _hl_append (hl, p->nodes);
			/*
			 *  Exit when there is no more partitions to search
			 */
			if (list_count (partitionlist) == 0)
				break;
		}
	}

	/*
	 *  Anything left in partitionlist wasn't found, emit a warning
	 */
	li = list_iterator_create(partitionlist);
	while ((str = list_next(li))){
		err("%p: Warning - partition %s not found\n", str);
	}
	/* BUGFIX: the iterator was previously leaked */
	list_iterator_destroy(li);

	slurm_free_partition_info_msg (msg);

	if (hl)
		hostlist_uniq (hl);

	return (hl);
}
/* Start a job:
 *	CMD=STARTJOB ARG=<jobid> TASKLIST=<node_list> [COMMENT=<whatever>]
 * RET 0 on success, -1 on failure
 * NOTE: parses cmd_ptr IN PLACE (inserts NUL terminators and rewrites
 * "COMMENT=" as "COMMENT:"); *err_msg is set to a static string or a
 * literal -- callers must not free it. */
extern int	start_job(char *cmd_ptr, int *err_code, char **err_msg)
{
	char *arg_ptr, *comment_ptr, *task_ptr, *tasklist, *tmp_char;
	int i, rc, task_cnt;
	uint32_t jobid;
	hostlist_t hl = (hostlist_t) NULL;
	char *host_string;
	static char reply_msg[128];

	arg_ptr = strstr(cmd_ptr, "ARG=");
	if (arg_ptr == NULL) {
		*err_code = -300;
		*err_msg = "STARTJOB lacks ARG";
		error("wiki: STARTJOB lacks ARG");
		return -1;
	}
	jobid = strtoul(arg_ptr+4, &tmp_char, 10);
	if (!isspace(tmp_char[0])) {
		*err_code = -300;
		*err_msg = "Invalid ARG value";
		error("wiki: STARTJOB has invalid jobid");
		return -1;
	}

	comment_ptr = strstr(cmd_ptr, "COMMENT=");
	task_ptr = strstr(cmd_ptr, "TASKLIST=");

	if (comment_ptr) {
		/* Isolate the comment value; it may be quoted with " or '.
		 * Writing ':' over the '=' keeps later searches of cmd_ptr
		 * from re-matching "COMMENT=". */
		comment_ptr[7] = ':';
		comment_ptr += 8;
		if (comment_ptr[0] == '\"') {
			comment_ptr++;
			/* find the closing quote, bounded by MAX_COMMENT_LEN */
			for (i=0; i<MAX_COMMENT_LEN; i++) {
				if (comment_ptr[i] == '\0')
					break;
				if (comment_ptr[i] == '\"') {
					comment_ptr[i] = '\0';
					break;
				}
			}
			if (i == MAX_COMMENT_LEN)
				comment_ptr[i-1] = '\0';
		} else if (comment_ptr[0] == '\'') {
			comment_ptr++;
			for (i=0; i<MAX_COMMENT_LEN; i++) {
				if (comment_ptr[i] == '\0')
					break;
				if (comment_ptr[i] == '\'') {
					comment_ptr[i] = '\0';
					break;
				}
			}
			if (i == MAX_COMMENT_LEN)
				comment_ptr[i-1] = '\0';
		} else
			null_term(comment_ptr);
	}

	if (task_ptr == NULL) {
		*err_code = -300;
		*err_msg = "STARTJOB lacks TASKLIST";
		error("wiki: STARTJOB lacks TASKLIST");
		return -1;
	}
	task_ptr += 9;	/* skip over "TASKLIST=" */
	if ((task_ptr[0] == '\0') || isspace(task_ptr[0])) {
		/* No TASKLIST specification, useful for testing */
		host_string = xstrdup("");
		task_cnt = 0;
		tasklist = NULL;
	} else {
		null_term(task_ptr);
		tasklist = moab2slurm_task_list(task_ptr, &task_cnt);
		if (tasklist)
			hl = hostlist_create(tasklist);
		if ((tasklist == NULL) || (hl == NULL)) {
			*err_code = -300;
			*err_msg = "STARTJOB TASKLIST is invalid";
			error("wiki: STARTJOB TASKLIST is invalid: %s",
			      task_ptr);
			xfree(tasklist);
			return -1;
		}
		/* Collapse the task list into a sorted, unique host range */
		hostlist_uniq(hl);
		hostlist_sort(hl);
		host_string = hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (host_string == NULL) {
			*err_code = -300;
			*err_msg = "STARTJOB has invalid TASKLIST";
			error("wiki: STARTJOB has invalid TASKLIST: %s",
			      tasklist);
			xfree(tasklist);
			return -1;
		}
	}

	rc = _start_job(jobid, task_cnt, host_string, tasklist,
			comment_ptr, err_code, err_msg);
	xfree(host_string);
	xfree(tasklist);
	if (rc == 0) {
		/* reply_msg is static: it must outlive this call since
		 * *err_msg points at it */
		snprintf(reply_msg, sizeof(reply_msg),
			"job %u started successfully", jobid);
		*err_msg = reply_msg;
	}
	return rc;
}
static int _job_modify(uint32_t jobid, char *bank_ptr, char *depend_ptr, char *new_hostlist, uint32_t new_node_cnt, char *part_name_ptr, uint32_t new_time_limit, char *name_ptr, char *start_ptr, char *feature_ptr, char *env_ptr, char *comment_ptr, char *gres_ptr, char *wckey_ptr) { struct job_record *job_ptr; time_t now = time(NULL); bool update_accounting = false; job_ptr = find_job_record(jobid); if (job_ptr == NULL) { error("wiki: MODIFYJOB has invalid jobid %u", jobid); return ESLURM_INVALID_JOB_ID; } if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) { info("wiki: MODIFYJOB jobid %u is finished", jobid); return ESLURM_DISABLED; } if (comment_ptr) { info("wiki: change job %u comment %s", jobid, comment_ptr); xfree(job_ptr->comment); job_ptr->comment = xstrdup(comment_ptr); last_job_update = now; } if (depend_ptr) { int rc = update_job_dependency(job_ptr, depend_ptr); if (rc == SLURM_SUCCESS) { info("wiki: changed job %u dependency to %s", jobid, depend_ptr); } else { error("wiki: changing job %u dependency to %s", jobid, depend_ptr); return EINVAL; } } if (env_ptr) { bool have_equal = false; char old_sep[1]; int begin = 0, i; if (job_ptr->batch_flag == 0) { error("wiki: attempt to set environment variables " "for non-batch job %u", jobid); return ESLURM_DISABLED; } for (i=0; ; i++) { if (env_ptr[i] == '=') { if (have_equal) { error("wiki: setting job %u invalid " "environment variables: %s", jobid, env_ptr); return EINVAL; } have_equal = true; if (env_ptr[i+1] == '\"') { for (i+=2; ; i++) { if (env_ptr[i] == '\0') { error("wiki: setting job %u " "invalid environment " "variables: %s", jobid, env_ptr); return EINVAL; } if (env_ptr[i] == '\"') { i++; break; } if (env_ptr[i] == '\\') { i++; } } } else if (env_ptr[i+1] == '\'') { for (i+=2; ; i++) { if (env_ptr[i] == '\0') { error("wiki: setting job %u " "invalid environment " "variables: %s", jobid, env_ptr); return EINVAL; } if (env_ptr[i] == '\'') { i++; break; } if (env_ptr[i] == '\\') { i++; } } } } 
if (isspace(env_ptr[i]) || (env_ptr[i] == ',')) { if (!have_equal) { error("wiki: setting job %u invalid " "environment variables: %s", jobid, env_ptr); return EINVAL; } old_sep[0] = env_ptr[i]; env_ptr[i] = '\0'; xrealloc(job_ptr->details->env_sup, sizeof(char *) * (job_ptr->details->env_cnt+1)); job_ptr->details->env_sup [job_ptr->details->env_cnt++] = xstrdup(&env_ptr[begin]); info("wiki: for job %u add env: %s", jobid, &env_ptr[begin]); env_ptr[i] = old_sep[0]; if (isspace(old_sep[0])) break; begin = i + 1; have_equal = false; } } } if (new_time_limit) { time_t old_time = job_ptr->time_limit; job_ptr->time_limit = new_time_limit; info("wiki: change job %u time_limit to %u", jobid, new_time_limit); /* Update end_time based upon change * to preserve suspend time info */ job_ptr->end_time = job_ptr->end_time + ((job_ptr->time_limit - old_time) * 60); last_job_update = now; } if (bank_ptr && (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS)) { return EINVAL; } if (feature_ptr) { if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) { info("wiki: change job %u features to %s", jobid, feature_ptr); job_ptr->details->features = xstrdup(feature_ptr); last_job_update = now; } else { error("wiki: MODIFYJOB features of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (start_ptr) { char *end_ptr; uint32_t begin_time = strtol(start_ptr, &end_ptr, 10); if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) { info("wiki: change job %u begin time to %u", jobid, begin_time); job_ptr->details->begin_time = begin_time; last_job_update = now; update_accounting = true; } else { error("wiki: MODIFYJOB begin_time of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (name_ptr) { if (IS_JOB_PENDING(job_ptr)) { info("wiki: change job %u name %s", jobid, name_ptr); xfree(job_ptr->name); job_ptr->name = xstrdup(name_ptr); last_job_update = now; update_accounting = true; } else { error("wiki: MODIFYJOB name of non-pending job %u", jobid); return 
ESLURM_DISABLED; } } if (new_hostlist) { int rc = 0, task_cnt; hostlist_t hl; char *tasklist; if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { /* Job is done, nothing to reset */ if (new_hostlist == '\0') goto host_fini; error("wiki: MODIFYJOB hostlist of non-pending " "job %u", jobid); return ESLURM_DISABLED; } xfree(job_ptr->details->req_nodes); FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); if (new_hostlist == '\0') goto host_fini; tasklist = moab2slurm_task_list(new_hostlist, &task_cnt); if (tasklist == NULL) { rc = 1; goto host_fini; } hl = hostlist_create(tasklist); if (hl == 0) { rc = 1; goto host_fini; } hostlist_uniq(hl); hostlist_sort(hl); job_ptr->details->req_nodes = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (job_ptr->details->req_nodes == NULL) { rc = 1; goto host_fini; } if (node_name2bitmap(job_ptr->details->req_nodes, false, &job_ptr->details->req_node_bitmap)) { rc = 1; goto host_fini; } host_fini: if (rc) { info("wiki: change job %u invalid hostlist %s", jobid, new_hostlist); xfree(job_ptr->details->req_nodes); return EINVAL; } else { info("wiki: change job %u hostlist %s", jobid, new_hostlist); update_accounting = true; } } if (part_name_ptr) { struct part_record *part_ptr; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB partition of non-pending " "job %u", jobid); return ESLURM_DISABLED; } part_ptr = find_part_record(part_name_ptr); if (part_ptr == NULL) { error("wiki: MODIFYJOB has invalid partition %s", part_name_ptr); return ESLURM_INVALID_PARTITION_NAME; } info("wiki: change job %u partition %s", jobid, part_name_ptr); xfree(job_ptr->partition); job_ptr->partition = xstrdup(part_name_ptr); job_ptr->part_ptr = part_ptr; last_job_update = now; update_accounting = true; } if (new_node_cnt) { job_desc_msg_t job_desc; #ifdef HAVE_BG uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL}; static uint16_t cpus_per_node = 0; if (!cpus_per_node) { select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT, 
&cpus_per_node); } #endif if(!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { error("wiki: MODIFYJOB node count of non-pending " "job %u", jobid); return ESLURM_DISABLED; } memset(&job_desc, 0, sizeof(job_desc_msg_t)); job_desc.min_nodes = new_node_cnt; job_desc.max_nodes = NO_VAL; job_desc.select_jobinfo = select_g_select_jobinfo_alloc(); select_g_alter_node_cnt(SELECT_SET_NODE_CNT, &job_desc); select_g_select_jobinfo_free(job_desc.select_jobinfo); job_ptr->details->min_nodes = job_desc.min_nodes; if (job_ptr->details->max_nodes && (job_ptr->details->max_nodes < job_desc.min_nodes)) job_ptr->details->max_nodes = job_desc.min_nodes; info("wiki: change job %u min_nodes to %u", jobid, new_node_cnt); #ifdef HAVE_BG job_ptr->details->min_cpus = job_desc.min_cpus; job_ptr->details->max_cpus = job_desc.max_cpus; job_ptr->details->pn_min_cpus = job_desc.pn_min_cpus; new_node_cnt = job_ptr->details->min_cpus; if (cpus_per_node) new_node_cnt /= cpus_per_node; /* This is only set up so accounting is set up correctly */ select_g_select_jobinfo_set(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &new_node_cnt); /* reset geo since changing this makes any geo potentially invalid */ select_g_select_jobinfo_set(job_ptr->select_jobinfo, SELECT_JOBDATA_GEOMETRY, geometry); #endif last_job_update = now; update_accounting = true; } if (gres_ptr) { char *orig_gres; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB GRES of non-pending job %u", jobid); return ESLURM_DISABLED; } orig_gres = job_ptr->gres; job_ptr->gres = NULL; if (gres_ptr[0]) job_ptr->gres = xstrdup(gres_ptr); if (gres_plugin_job_state_validate(job_ptr->gres, &job_ptr->gres_list)) { error("wiki: MODIFYJOB Invalid GRES=%s", gres_ptr); xfree(job_ptr->gres); job_ptr->gres = orig_gres; return ESLURM_INVALID_GRES; } xfree(orig_gres); } if (wckey_ptr) { int rc = update_job_wckey("update_job", job_ptr, wckey_ptr); if (rc != SLURM_SUCCESS) { error("wiki: MODIFYJOB Invalid WCKEY=%s", wckey_ptr); return rc; } } if 
(update_accounting) { if (job_ptr->details && job_ptr->details->begin_time) { /* Update job record in accounting to reflect * the changes */ jobacct_storage_g_job_start(acct_db_conn, job_ptr); } } return SLURM_SUCCESS; }
/*
 * Create job description structure based off srun options
 * (see opt.h)
 *
 * RET an xmalloc'd job_desc_msg_t, or NULL when an Arbitrary
 * distribution was requested without a nodelist.
 */
job_desc_msg_t *
job_desc_msg_create_from_opts (void)
{
	job_desc_msg_t *j = xmalloc(sizeof(*j));
	hostlist_t hl = NULL;

	slurm_init_job_desc_msg(j);

	j->contiguous     = opt.contiguous;
	j->features       = opt.constraints;
	j->gres           = opt.gres;
	if (opt.immediate == 1)
		j->immediate = opt.immediate;
	if (opt.job_name)
		j->name   = xstrdup(opt.job_name);
	else
		j->name   = xstrdup(opt.cmd_name);
	if (opt.argc > 0) {
		/* only argv[0] (the command) is carried in the request */
		j->argc    = 1;
		j->argv    = (char **) xmalloc(sizeof(char *) * 2);
		j->argv[0] = xstrdup(opt.argv[0]);
	}
	if (opt.acctg_freq >= 0)
		j->acctg_freq = opt.acctg_freq;
	j->reservation    = xstrdup(opt.reservation);
	j->wckey          = xstrdup(opt.wckey);

	j->req_nodes      = xstrdup(opt.nodelist);

	/* simplify the job allocation nodelist,
	 * not laying out tasks until step */
	if (j->req_nodes) {
		hl = hostlist_create(j->req_nodes);
		xfree(opt.nodelist);
		/* NOTE(review): opt.nodelist is regenerated BEFORE the
		 * uniq (keeping repeats for later task layout) while
		 * req_nodes below is de-duplicated — confirm intentional */
		opt.nodelist = hostlist_ranged_string_xmalloc(hl);
		hostlist_uniq(hl);
		xfree(j->req_nodes);
		j->req_nodes = hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
	}

	if (opt.distribution == SLURM_DIST_ARBITRARY && !j->req_nodes) {
		error("With Arbitrary distribution you need to "
		      "specify a nodelist or hostfile with the -w option");
		return NULL;
	}
	j->exc_nodes      = opt.exc_nodes;
	j->partition      = opt.partition;
	j->min_nodes      = opt.min_nodes;
	if (opt.sockets_per_node != NO_VAL)
		j->sockets_per_node = opt.sockets_per_node;
	if (opt.cores_per_socket != NO_VAL)
		j->cores_per_socket = opt.cores_per_socket;
	if (opt.threads_per_core != NO_VAL)
		j->threads_per_core = opt.threads_per_core;
	j->user_id        = opt.uid;
	j->dependency     = opt.dependency;
	if (opt.nice)
		j->nice   = NICE_OFFSET + opt.nice;

	if (opt.cpu_bind)
		j->cpu_bind = opt.cpu_bind;
	if (opt.cpu_bind_type)
		j->cpu_bind_type = opt.cpu_bind_type;
	if (opt.mem_bind)
		j->mem_bind = opt.mem_bind;
	if (opt.mem_bind_type)
		j->mem_bind_type = opt.mem_bind_type;
	if (opt.plane_size != NO_VAL)
		j->plane_size = opt.plane_size;
	j->task_dist      = opt.distribution;

	j->group_id       = opt.gid;
	j->mail_type      = opt.mail_type;

	if (opt.ntasks_per_node != NO_VAL)
		j->ntasks_per_node = opt.ntasks_per_node;
	if (opt.ntasks_per_socket != NO_VAL)
		j->ntasks_per_socket = opt.ntasks_per_socket;
	if (opt.ntasks_per_core != NO_VAL)
		j->ntasks_per_core = opt.ntasks_per_core;

	if (opt.mail_user)
		j->mail_user = xstrdup(opt.mail_user);
	if (opt.begin)
		j->begin_time = opt.begin;
	if (opt.licenses)
		j->licenses = xstrdup(opt.licenses);
	if (opt.network)
		j->network = xstrdup(opt.network);
	if (opt.account)
		j->account = xstrdup(opt.account);
	if (opt.comment)
		j->comment = xstrdup(opt.comment);
	if (opt.qos)
		j->qos = xstrdup(opt.qos);
	if (opt.cwd)
		j->work_dir = xstrdup(opt.cwd);

	if (opt.hold)
		j->priority     = 0;	/* priority 0 == held */
	if (opt.jobid != NO_VAL)
		j->job_id	= opt.jobid;
#ifdef HAVE_BG
	if (opt.geometry[0] > 0) {
		int i;
		for (i=0; i<SYSTEM_DIMENSIONS; i++)
			j->geometry[i] = opt.geometry[i];
	}
#endif

	if (opt.conn_type != (uint16_t) NO_VAL)
		j->conn_type[0] = opt.conn_type;

	if (opt.reboot)
		j->reboot = 1;
	if (opt.no_rotate)
		j->rotate = 0;

	if (opt.blrtsimage)
		j->blrtsimage = xstrdup(opt.blrtsimage);
	if (opt.linuximage)
		j->linuximage = xstrdup(opt.linuximage);
	if (opt.mloaderimage)
		j->mloaderimage = xstrdup(opt.mloaderimage);
	if (opt.ramdiskimage)
		j->ramdiskimage = xstrdup(opt.ramdiskimage);

	if (opt.max_nodes)
		j->max_nodes    = opt.max_nodes;
	else if (opt.nodes_set) {
		/* On an allocation if the max nodes isn't set set it
		 * to do the same behavior as with salloc or sbatch. */
		j->max_nodes    = opt.min_nodes;
	}
	if (opt.pn_min_cpus != NO_VAL)
		j->pn_min_cpus    = opt.pn_min_cpus;
	if (opt.pn_min_memory != NO_VAL)
		j->pn_min_memory = opt.pn_min_memory;
	else if (opt.mem_per_cpu != NO_VAL)
		j->pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
	if (opt.pn_min_tmp_disk != NO_VAL)
		j->pn_min_tmp_disk = opt.pn_min_tmp_disk;
	if (opt.overcommit) {
		j->min_cpus    = opt.min_nodes;
		j->overcommit  = opt.overcommit;
	} else
		j->min_cpus    = opt.ntasks * opt.cpus_per_task;
	if (opt.ntasks_set)
		j->num_tasks   = opt.ntasks;

	if (opt.cpus_set)
		j->cpus_per_task = opt.cpus_per_task;

	if (opt.no_kill)
		j->kill_on_node_fail   = 0;
	if (opt.time_limit != NO_VAL)
		j->time_limit          = opt.time_limit;
	if (opt.time_min != NO_VAL)
		j->time_min            = opt.time_min;
	j->shared = opt.shared;

	if (opt.warn_signal)
		j->warn_signal = opt.warn_signal;
	if (opt.warn_time)
		j->warn_time = opt.warn_time;

	/* srun uses the same listening port for the allocation response
	 * message as all other messages */
	j->alloc_resp_port = slurmctld_comm_addr.port;
	j->other_port = slurmctld_comm_addr.port;

	if (opt.spank_job_env_size) {
		j->spank_job_env      = opt.spank_job_env;
		j->spank_job_env_size = opt.spank_job_env_size;
	}

	return (j);
}
/*
 * Create job description structure based off srun options
 * (see opt.h)
 *
 * Builds a job_desc_msg_t from the global srun "opt" structure.
 * Returns an xmalloc()'d message, or NULL when SLURM_DIST_ARBITRARY
 * was requested without a node list.
 *
 * Ownership: most string fields alias storage owned by opt and must
 * not be freed independently; only acctg_freq, req_nodes and argv[0]
 * are duplicated here.
 */
job_desc_msg_t *
job_desc_msg_create_from_opts (void)
{
	job_desc_msg_t *j = xmalloc(sizeof(*j));
	hostlist_t hl = NULL;

	slurm_init_job_desc_msg(j);
#if defined HAVE_ALPS_CRAY && defined HAVE_REAL_CRAY
	uint64_t pagg_id = job_getjid(getpid());
	/*
	 * Interactive sessions require pam_job.so in /etc/pam.d/common-session
	 * since creating sgi_job containers requires root permissions. This is
	 * the only exception where we allow the fallback of using the SID to
	 * confirm the reservation (caught later, in do_basil_confirm).
	 */
	if (pagg_id == (uint64_t)-1) {
		error("No SGI job container ID detected - please enable the "
		      "Cray job service via /etc/init.d/job");
	} else {
		if (!j->select_jobinfo)
			j->select_jobinfo = select_g_select_jobinfo_alloc();
		select_g_select_jobinfo_set(j->select_jobinfo,
					    SELECT_JOBDATA_PAGG_ID, &pagg_id);
	}
#endif
	j->contiguous = opt.contiguous;
	if (opt.core_spec)
		j->core_spec = opt.core_spec;
	j->features = opt.constraints;
	j->gres = opt.gres;
	if (opt.immediate == 1)
		j->immediate = opt.immediate;
	if (opt.job_name)
		j->name = opt.job_name;
	else
		j->name = opt.cmd_name;
	if (opt.argc > 0) {
		/* Only the command name is forwarded; xmalloc() zero-fills,
		 * so argv[1] serves as the NULL terminator. */
		j->argc = 1;
		j->argv = (char **) xmalloc(sizeof(char *) * 2);
		j->argv[0] = xstrdup(opt.argv[0]);
	}
	if (opt.acctg_freq)
		j->acctg_freq = xstrdup(opt.acctg_freq);
	j->reservation = opt.reservation;
	j->wckey = opt.wckey;

	j->req_nodes = xstrdup(opt.nodelist);

	/* simplify the job allocation nodelist,
	 * not laying out tasks until step.
	 * NOTE: opt.nodelist is regenerated BEFORE hostlist_uniq() so it
	 * keeps repeated hosts (used later for task layout), while
	 * j->req_nodes gets the uniq'd expression for the allocation. */
	if (j->req_nodes) {
		hl = hostlist_create(j->req_nodes);
		xfree(opt.nodelist);
		opt.nodelist = hostlist_ranged_string_xmalloc(hl);
		hostlist_uniq(hl);
		xfree(j->req_nodes);
		j->req_nodes = hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
	}

	if (opt.distribution == SLURM_DIST_ARBITRARY &&
	    !j->req_nodes) {
		error("With Arbitrary distribution you need to "
		      "specify a nodelist or hostfile with the -w option");
		/* NOTE(review): j and its xstrdup'd members leak on this
		 * error path -- confirm callers treat this as fatal. */
		return NULL;
	}
	j->exc_nodes = opt.exc_nodes;
	j->partition = opt.partition;
	j->min_nodes = opt.min_nodes;
	if (opt.sockets_per_node != NO_VAL)
		j->sockets_per_node = opt.sockets_per_node;
	if (opt.cores_per_socket != NO_VAL)
		j->cores_per_socket = opt.cores_per_socket;
	if (opt.threads_per_core != NO_VAL)
		j->threads_per_core = opt.threads_per_core;
	j->user_id = opt.uid;
	j->dependency = opt.dependency;
	if (opt.nice)
		j->nice = NICE_OFFSET + opt.nice;
	if (opt.priority)
		j->priority = opt.priority;
	if (opt.cpu_bind)
		j->cpu_bind = opt.cpu_bind;
	if (opt.cpu_bind_type)
		j->cpu_bind_type = opt.cpu_bind_type;
	if (opt.mem_bind)
		j->mem_bind = opt.mem_bind;
	if (opt.mem_bind_type)
		j->mem_bind_type = opt.mem_bind_type;
	if (opt.plane_size != NO_VAL)
		j->plane_size = opt.plane_size;
	j->task_dist = opt.distribution;
	j->group_id = opt.gid;
	j->mail_type = opt.mail_type;
	if (opt.ntasks_per_node != NO_VAL)
		j->ntasks_per_node = opt.ntasks_per_node;
	if (opt.ntasks_per_socket != NO_VAL)
		j->ntasks_per_socket = opt.ntasks_per_socket;
	if (opt.ntasks_per_core != NO_VAL)
		j->ntasks_per_core = opt.ntasks_per_core;
	if (opt.mail_user)
		j->mail_user = opt.mail_user;
	if (opt.begin)
		j->begin_time = opt.begin;
	if (opt.licenses)
		j->licenses = opt.licenses;
	if (opt.network)
		j->network = opt.network;
	if (opt.profile)
		j->profile = opt.profile;
	if (opt.account)
		j->account = opt.account;
	if (opt.comment)
		j->comment = opt.comment;
	if (opt.qos)
		j->qos = opt.qos;
	if (opt.cwd)
		j->work_dir = opt.cwd;
	if (opt.hold)
		j->priority = 0;	/* held jobs get zero priority */
	if (opt.jobid != NO_VAL)
		j->job_id = opt.jobid;
#ifdef HAVE_BG
	if (opt.geometry[0] > 0) {
		int i;
		for (i = 0; i < SYSTEM_DIMENSIONS; i++)
			j->geometry[i] = opt.geometry[i];
	}
#endif
	memcpy(j->conn_type, opt.conn_type, sizeof(j->conn_type));
	if (opt.reboot)
		j->reboot = 1;
	if (opt.no_rotate)
		j->rotate = 0;
	if (opt.blrtsimage)
		j->blrtsimage = opt.blrtsimage;
	if (opt.linuximage)
		j->linuximage = opt.linuximage;
	if (opt.mloaderimage)
		j->mloaderimage = opt.mloaderimage;
	if (opt.ramdiskimage)
		j->ramdiskimage = opt.ramdiskimage;
	if (opt.max_nodes)
		j->max_nodes = opt.max_nodes;
	else if (opt.nodes_set) {
		/* On an allocation if the max nodes isn't set set it
		 * to do the same behavior as with salloc or sbatch. */
		j->max_nodes = opt.min_nodes;
	}
	if (opt.pn_min_cpus != NO_VAL)
		j->pn_min_cpus = opt.pn_min_cpus;
	if (opt.pn_min_memory != NO_VAL)
		j->pn_min_memory = opt.pn_min_memory;
	else if (opt.mem_per_cpu != NO_VAL)
		j->pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
	if (opt.pn_min_tmp_disk != NO_VAL)
		j->pn_min_tmp_disk = opt.pn_min_tmp_disk;
	if (opt.overcommit) {
		j->min_cpus = opt.min_nodes;
		j->overcommit = opt.overcommit;
	} else if (opt.cpus_set)
		j->min_cpus = opt.ntasks * opt.cpus_per_task;
	else
		j->min_cpus = opt.ntasks;
	if (opt.ntasks_set)
		j->num_tasks = opt.ntasks;
	if (opt.cpus_set)
		j->cpus_per_task = opt.cpus_per_task;
	if (opt.no_kill)
		j->kill_on_node_fail = 0;
	if (opt.time_limit != NO_VAL)
		j->time_limit = opt.time_limit;
	if (opt.time_min != NO_VAL)
		j->time_min = opt.time_min;
	j->shared = opt.shared;
	if (opt.warn_signal)
		j->warn_signal = opt.warn_signal;
	if (opt.warn_time)
		j->warn_time = opt.warn_time;
	if (opt.req_switch >= 0)
		j->req_switch = opt.req_switch;
	if (opt.wait4switch >= 0)
		j->wait4switch = opt.wait4switch;

	/* srun uses the same listening port for the allocation response
	 * message as all other messages */
	j->alloc_resp_port = slurmctld_comm_addr.port;
	j->other_port = slurmctld_comm_addr.port;

	if (opt.spank_job_env_size) {
		j->spank_job_env = opt.spank_job_env;
		j->spank_job_env_size = opt.spank_job_env_size;
	}

	return (j);
}
/* Start a job: * CMD=STARTJOB ARG=<jobid> TASKLIST=<node_list> * RET 0 on success, -1 on failure */ extern int start_job(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *task_ptr, *tasklist, *tmp_char; int rc, task_cnt; uint32_t jobid; hostlist_t hl = (hostlist_t) NULL; char *host_string; static char reply_msg[128]; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "STARTJOB lacks ARG"; error("wiki: STARTJOB lacks ARG"); return -1; } jobid = strtoul(arg_ptr+4, &tmp_char, 10); if (!isspace(tmp_char[0])) { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: STARTJOB has invalid jobid"); return -1; } task_ptr = strstr(cmd_ptr, "TASKLIST="); if (task_ptr == NULL) { *err_code = -300; *err_msg = "STARTJOB lacks TASKLIST"; error("wiki: STARTJOB lacks TASKLIST"); return -1; } task_ptr += 9; /* skip over "TASKLIST=" */ null_term(task_ptr); tasklist = moab2slurm_task_list(task_ptr, &task_cnt); if (tasklist) hl = hostlist_create(tasklist); if ((tasklist == NULL) || (hl == NULL)) { *err_code = -300; *err_msg = "STARTJOB TASKLIST is invalid"; error("wiki: STARTJOB TASKLIST is invalid: %s", task_ptr); xfree(tasklist); return -1; } hostlist_uniq(hl); hostlist_sort(hl); host_string = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (host_string == NULL) { *err_code = -300; *err_msg = "STARTJOB has invalid TASKLIST"; error("wiki: STARTJOB has invalid TASKLIST: %s", tasklist); xfree(tasklist); return -1; } rc = _start_job(jobid, task_cnt, host_string, tasklist, err_code, err_msg); xfree(host_string); xfree(tasklist); if (rc == 0) { snprintf(reply_msg, sizeof(reply_msg), "job %u started successfully", jobid); *err_msg = reply_msg; } return rc; }
/*
 * Format one node's state as a Moab/wiki record string.
 *
 * node_ptr    - IN - node to report (NULL returns NULL)
 * hl          - IN - optional hostlist; if set, the (sorted, uniq'd)
 *                    ranged expression is emitted instead of the single
 *                    node name.  NOTE: hl is sorted/uniq'd in place.
 * update_time - IN - client's last-update time; newer data is skipped
 *                    (early returns below), and static config fields
 *                    are only sent on a full dump (update_time <= 0)
 *
 * Returns an xstrcat()-built string the caller must xfree().
 */
static char *	_dump_node(struct node_record *node_ptr, hostlist_t hl,
			   time_t update_time)
{
	char tmp[16*1024], *buf = NULL;
	int i;
	uint32_t cpu_cnt;

	if (!node_ptr)
		return NULL;

	if (hl) {
		char *node_list;
		hostlist_sort(hl);
		hostlist_uniq(hl);
		node_list = hostlist_ranged_string_xmalloc(hl);
		xstrcat(buf, node_list);
		xfree(node_list);
	} else {
		snprintf(tmp, sizeof(tmp), "%s", node_ptr->name);
		xstrcat(buf, tmp);
	}

	snprintf(tmp, sizeof(tmp), ":STATE=%s;",
		 _get_node_state(node_ptr));
	xstrcat(buf, tmp);

	if (node_ptr->cpu_load != NO_VAL) {
		/* cpu_load is stored as percent * 100 */
		snprintf(tmp, sizeof(tmp), "CPULOAD=%f;",
			 (node_ptr->cpu_load / 100.0));
		xstrcat(buf, tmp);
	}

	if (node_ptr->reason) {
		/* Strip out any quotes, they confuse Moab */
		char *reason, *bad_char;
		reason = xstrdup(node_ptr->reason);
		while ((bad_char = strchr(reason, '\'')))
			bad_char[0] = ' ';
		while ((bad_char = strchr(reason, '\"')))
			bad_char[0] = ' ';
		snprintf(tmp, sizeof(tmp), "CAT=\"%s\";", reason);
		xstrcat(buf, tmp);
		xfree(reason);
	}

	/* Nothing below changed since the client's last update */
	if (update_time > last_node_update)
		return buf;

	if (slurmctld_conf.fast_schedule) {
		/* config from slurm.conf */
		cpu_cnt = node_ptr->config_ptr->cpus;
	} else {
		/* config as reported by slurmd */
		cpu_cnt = node_ptr->cpus;
	}
	/* CCLASS: one [partition:cpus] pair per partition membership */
	for (i=0; i<node_ptr->part_cnt; i++) {
		if (i == 0)
			xstrcat(buf, "CCLASS=");
		snprintf(tmp, sizeof(tmp), "[%s:%u]",
			 node_ptr->part_pptr[i]->name,
			 cpu_cnt);
		xstrcat(buf, tmp);
	}
	if (i > 0)
		xstrcat(buf, ";");

	if (node_ptr->arch) {
		snprintf(tmp, sizeof(tmp), "ARCH=%s;", node_ptr->arch);
		xstrcat(buf, tmp);
	}

	if (node_ptr->os) {
		snprintf(tmp, sizeof(tmp), "OS=%s;", node_ptr->os);
		xstrcat(buf, tmp);
	}

	if (node_ptr->config_ptr && node_ptr->config_ptr->feature) {
		snprintf(tmp, sizeof(tmp), "FEATURE=%s;",
			 node_ptr->config_ptr->feature);
		/* comma separator to colon */
		for (i=0; (tmp[i] != '\0'); i++) {
			if (tmp[i] == ',')
				tmp[i] = ':';
		}
		xstrcat(buf, tmp);
	}

	if (node_ptr->config_ptr && node_ptr->config_ptr->gres) {
		snprintf(tmp, sizeof(tmp), "GRES=%s;",
			 node_ptr->config_ptr->gres);
		xstrcat(buf, tmp);
	}

	/* Static configuration (memory/disk/cpus) is only reported on a
	 * full dump (update_time <= 0) */
	if (update_time > 0)
		return buf;

	if (slurmctld_conf.fast_schedule) {
		/* config from slurm.conf */
		snprintf(tmp, sizeof(tmp),
			 "CMEMORY=%u;CDISK=%u;CPROC=%u;",
			 node_ptr->config_ptr->real_memory,
			 node_ptr->config_ptr->tmp_disk,
			 node_ptr->config_ptr->cpus);
	} else {
		/* config as reported by slurmd */
		snprintf(tmp, sizeof(tmp),
			 "CMEMORY=%u;CDISK=%u;CPROC=%u;",
			 node_ptr->real_memory,
			 node_ptr->tmp_disk,
			 node_ptr->cpus);
	}
	xstrcat(buf, tmp);

	return buf;
}
/*
 * Create an srun job structure for a step w/out an allocation response msg.
 * (i.e. inside an allocation)
 *
 * resp - IN - allocation response providing job id and CPU layout.
 * Returns the new srun_job_t, or NULL on error (messages logged).
 *
 * Side effects: rewrites the global opt.nodelist / opt.min_nodes /
 * opt.max_nodes / opt.nodes_set to match the node set actually usable
 * for the step; ai->nodelist intentionally keeps the full allocation
 * node list (see comments below).
 */
srun_job_t *
job_step_create_allocation(resource_allocation_response_msg_t *resp)
{
	uint32_t job_id = resp->job_id;
	srun_job_t *job = NULL;
	allocation_info_t *ai = xmalloc(sizeof(*ai));
	hostlist_t hl = NULL;
	char *buf = NULL;
	int count = 0;
	uint32_t alloc_count = 0;

	ai->jobid = job_id;
	ai->stepid = NO_VAL;
	ai->nodelist = opt.alloc_nodelist;
	/* Count unique nodes in the allocation */
	hl = hostlist_create(ai->nodelist);
	hostlist_uniq(hl);
	alloc_count = hostlist_count(hl);
	ai->nnodes = alloc_count;
	hostlist_destroy(hl);

	if (opt.exc_nodes) {
		hostlist_t exc_hl = hostlist_create(opt.exc_nodes);
		hostlist_t inc_hl = NULL;
		char *node_name = NULL;

		hl = hostlist_create(ai->nodelist);
		if(opt.nodelist) {
			inc_hl = hostlist_create(opt.nodelist);
		}
		hostlist_uniq(hl);
		//info("using %s or %s", opt.nodelist, ai->nodelist);
		/* Remove each excluded node from the allocation set and
		 * reject the job if it is also explicitly requested */
		while ((node_name = hostlist_shift(exc_hl))) {
			int inx = hostlist_find(hl, node_name);
			if (inx >= 0) {
				debug("excluding node %s", node_name);
				hostlist_delete_nth(hl, inx);
				ai->nnodes--;	/* decrement node count */
			}
			if(inc_hl) {
				inx = hostlist_find(inc_hl, node_name);
				if (inx >= 0) {
					error("Requested node %s is also "
					      "in the excluded list.",
					      node_name);
					error("Job not submitted.");
					hostlist_destroy(exc_hl);
					hostlist_destroy(inc_hl);
					goto error;
				}
			}
			free(node_name);
		}
		hostlist_destroy(exc_hl);

		/* we need to set this here so if there are more nodes
		 * available than we requested we can set it
		 * straight. If there is no exclude list then we set
		 * the vars then.
		 */
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if(opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if(!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;

		count = hostlist_count(hl);
		if(!count) {
			error("Hostlist is now nothing! Can't run job.");
			hostlist_destroy(hl);
			goto error;
		}
		if(inc_hl) {
			count = hostlist_count(inc_hl);
			if(count < ai->nnodes) {
				/* add more nodes to get correct number for
				   allocation */
				hostlist_t tmp_hl = hostlist_copy(hl);
				int i=0;
				int diff = ai->nnodes - count;
				buf = hostlist_ranged_string_xmalloc(inc_hl);
				hostlist_delete(tmp_hl, buf);
				xfree(buf);
				/* NOTE(review): when the loop exits because
				 * i >= diff, the node_name just shifted is
				 * neither pushed nor free()d -- looks like a
				 * small leak; confirm. */
				while ((node_name = hostlist_shift(tmp_hl)) &&
				       (i < diff)) {
					hostlist_push(inc_hl, node_name);
					i++;
				}
				hostlist_destroy(tmp_hl);
			}
			buf = hostlist_ranged_string_xmalloc(inc_hl);
			hostlist_destroy(inc_hl);
			xfree(opt.nodelist);
			opt.nodelist = buf;
		} else {
			if (count > ai->nnodes) {
				/* remove more nodes than needed for
				   allocation.
				   NOTE(review): deletion starts at index
				   "count" although valid indices appear to
				   be 0..count-1 -- verify against the
				   hostlist_delete_nth() contract. */
				int i=0;
				for (i=count; i>ai->nnodes; i--)
					hostlist_delete_nth(hl, i);
			}
			xfree(opt.nodelist);
			opt.nodelist = hostlist_ranged_string_xmalloc(hl);
		}

		hostlist_destroy(hl);
	} else {
		if (!opt.nodes_set) {
			/* we don't want to set the number of nodes =
			 * to the number of requested processes unless we
			 * know it is less than the number of nodes
			 * in the allocation
			 */
			if(opt.ntasks_set && (opt.ntasks < ai->nnodes))
				opt.min_nodes = opt.ntasks;
			else
				opt.min_nodes = ai->nnodes;
			opt.nodes_set = true;
		}
		if(!opt.max_nodes)
			opt.max_nodes = opt.min_nodes;
		if((opt.max_nodes > 0) && (opt.max_nodes < ai->nnodes))
			ai->nnodes = opt.max_nodes;
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
	}

	/* get the correct number of hosts to run tasks on */
	if (opt.nodelist) {
		hl = hostlist_create(opt.nodelist);
		if (opt.distribution != SLURM_DIST_ARBITRARY)
			hostlist_uniq(hl);
		if (!hostlist_count(hl)) {
			error("Hostlist is now nothing! Can not run job.");
			hostlist_destroy(hl);
			goto error;
		}

		buf = hostlist_ranged_string_xmalloc(hl);
		count = hostlist_count(hl);
		hostlist_destroy(hl);
		/* Don't reset the ai->nodelist because that is the
		 * nodelist we want to say the allocation is under
		 * opt.nodelist is what is used for the allocation.
		 */
		/* xfree(ai->nodelist); */
		/* ai->nodelist = xstrdup(buf); */
		xfree(opt.nodelist);
		opt.nodelist = buf;
	}

	/* Arbitrary distribution requires exactly one host entry per task */
	if (opt.distribution == SLURM_DIST_ARBITRARY) {
		if (count != opt.ntasks) {
			error("You asked for %d tasks but specified %d nodes",
			      opt.ntasks, count);
			goto error;
		}
	}

	if (ai->nnodes == 0) {
		error("No nodes in allocation, can't run job");
		goto error;
	}

	ai->num_cpu_groups = resp->num_cpu_groups;
	ai->cpus_per_node = resp->cpus_per_node;
	ai->cpu_count_reps = resp->cpu_count_reps;

	/* info("looking for %d nodes out of %s with a must list of %s", */
	/*      ai->nnodes, ai->nodelist, opt.nodelist); */

	/*
	 * Create job
	 */
	job = _job_create_structure(ai);
error:
	xfree(ai);
	return (job);
}
/*
 * Create job description structure based off srun options
 * (see opt.h)
 *
 * opt_local - IN/OUT - per-component option set (its nodelist may be
 *             rewritten below when simplifying the request).
 * Returns an xmalloc()'d job_desc_msg_t, or NULL when
 * SLURM_DIST_ARBITRARY was requested without a node list.
 *
 * Ownership: most string fields alias storage owned by opt_local; only
 * acctg_freq, req_nodes, clusters, argv[0] and the x11 cookie are
 * duplicated here.
 */
static job_desc_msg_t *_job_desc_msg_create_from_opts(slurm_opt_t *opt_local)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	job_desc_msg_t *j = xmalloc(sizeof(*j));
	hostlist_t hl = NULL;
	xassert(srun_opt);

	slurm_init_job_desc_msg(j);
#if defined HAVE_ALPS_CRAY && defined HAVE_REAL_CRAY
	static bool sgi_err_logged = false;
	uint64_t pagg_id = job_getjid(getpid());
	/*
	 * Interactive sessions require pam_job.so in /etc/pam.d/common-session
	 * since creating sgi_job containers requires root permissions. This is
	 * the only exception where we allow the fallback of using the SID to
	 * confirm the reservation (caught later, in do_basil_confirm).
	 */
	if (pagg_id != (uint64_t) -1) {
		if (!j->select_jobinfo)
			j->select_jobinfo = select_g_select_jobinfo_alloc();

		select_g_select_jobinfo_set(j->select_jobinfo,
					    SELECT_JOBDATA_PAGG_ID, &pagg_id);
	} else if (!sgi_err_logged) {
		error("No SGI job container ID detected - please enable the "
		      "Cray job service via /etc/init.d/job");
		sgi_err_logged = true;
	}
#endif
	j->contiguous = opt_local->contiguous;
	if (opt_local->core_spec != NO_VAL16)
		j->core_spec = opt_local->core_spec;
	j->features = opt_local->constraints;
	j->cluster_features = opt_local->c_constraints;
	if (opt_local->gres && xstrcasecmp(opt_local->gres, "NONE"))
		j->gres = opt_local->gres;
	if (opt_local->immediate == 1)
		j->immediate = opt_local->immediate;
	if (opt_local->job_name)
		j->name = opt_local->job_name;
	else
		j->name = srun_opt->cmd_name;
	if (srun_opt->argc > 0) {
		/* Only the command name is forwarded; xmalloc() zero-fills,
		 * so argv[1] serves as the NULL terminator. */
		j->argc = 1;
		j->argv = (char **) xmalloc(sizeof(char *) * 2);
		j->argv[0] = xstrdup(srun_opt->argv[0]);
	}
	if (opt_local->acctg_freq)
		j->acctg_freq = xstrdup(opt_local->acctg_freq);
	j->reservation = opt_local->reservation;
	j->wckey = opt_local->wckey;
	/* NOTE(review): x11 settings are read from the GLOBAL opt while
	 * everything else uses opt_local -- confirm this is intentional
	 * (x11 options global-only) and not a missed opt_local conversion. */
	j->x11 = opt.x11;
	if (j->x11) {
		j->x11_magic_cookie = xstrdup(opt.x11_magic_cookie);
		j->x11_target_port = opt.x11_target_port;
	}

	j->req_nodes = xstrdup(opt_local->nodelist);

	/* simplify the job allocation nodelist,
	 * not laying out tasks until step.
	 * NOTE: opt_local->nodelist is regenerated BEFORE hostlist_uniq()
	 * so it keeps repeated hosts (used later for task layout), while
	 * j->req_nodes gets the uniq'd expression for the allocation. */
	if (j->req_nodes) {
		hl = hostlist_create(j->req_nodes);
		xfree(opt_local->nodelist);
		opt_local->nodelist = hostlist_ranged_string_xmalloc(hl);
		hostlist_uniq(hl);
		xfree(j->req_nodes);
		j->req_nodes = hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
	}

	if (((opt_local->distribution & SLURM_DIST_STATE_BASE) ==
	     SLURM_DIST_ARBITRARY) && !j->req_nodes) {
		error("With Arbitrary distribution you need to "
		      "specify a nodelist or hostfile with the -w option");
		/* NOTE(review): j and its xstrdup'd members leak on this
		 * error path -- confirm callers treat this as fatal. */
		return NULL;
	}
	j->extra = opt_local->extra;
	j->exc_nodes = opt_local->exc_nodes;
	j->partition = opt_local->partition;
	j->min_nodes = opt_local->min_nodes;
	if (opt_local->sockets_per_node != NO_VAL)
		j->sockets_per_node = opt_local->sockets_per_node;
	if (opt_local->cores_per_socket != NO_VAL)
		j->cores_per_socket = opt_local->cores_per_socket;
	if (opt_local->threads_per_core != NO_VAL) {
		j->threads_per_core = opt_local->threads_per_core;
		/* if 1 always make sure affinity knows about it */
		if (j->threads_per_core == 1)
			srun_opt->cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE;
	}
	j->user_id = opt_local->uid;
	j->dependency = opt_local->dependency;
	if (opt_local->nice != NO_VAL)
		j->nice = NICE_OFFSET + opt_local->nice;
	if (opt_local->priority)
		j->priority = opt_local->priority;
	if (srun_opt->cpu_bind)
		j->cpu_bind = srun_opt->cpu_bind;
	if (srun_opt->cpu_bind_type)
		j->cpu_bind_type = srun_opt->cpu_bind_type;
	if (opt_local->delay_boot != NO_VAL)
		j->delay_boot = opt_local->delay_boot;
	if (opt_local->mem_bind)
		j->mem_bind = opt_local->mem_bind;
	if (opt_local->mem_bind_type)
		j->mem_bind_type = opt_local->mem_bind_type;
	if (opt_local->plane_size != NO_VAL)
		j->plane_size = opt_local->plane_size;
	j->task_dist = opt_local->distribution;
	j->group_id = opt_local->gid;
	j->mail_type = opt_local->mail_type;
	if (opt_local->ntasks_per_node != NO_VAL)
		j->ntasks_per_node = opt_local->ntasks_per_node;
	if (opt_local->ntasks_per_socket != NO_VAL)
		j->ntasks_per_socket = opt_local->ntasks_per_socket;
	if (opt_local->ntasks_per_core != NO_VAL)
		j->ntasks_per_core = opt_local->ntasks_per_core;
	if (opt_local->mail_user)
		j->mail_user = opt_local->mail_user;
	if (opt_local->burst_buffer)
		j->burst_buffer = opt_local->burst_buffer;
	if (opt_local->begin)
		j->begin_time = opt_local->begin;
	if (opt_local->deadline)
		j->deadline = opt_local->deadline;
	if (opt_local->licenses)
		j->licenses = opt_local->licenses;
	if (opt_local->network)
		j->network = opt_local->network;
	if (opt_local->profile)
		j->profile = opt_local->profile;
	if (opt_local->account)
		j->account = opt_local->account;
	if (opt_local->comment)
		j->comment = opt_local->comment;
	if (opt_local->qos)
		j->qos = opt_local->qos;
	if (opt_local->cwd)
		j->work_dir = opt_local->cwd;
	if (opt_local->hold)
		j->priority = 0;	/* held jobs get zero priority */
	if (opt_local->jobid != NO_VAL)
		j->job_id = opt_local->jobid;
#ifdef HAVE_BG
	if (opt_local->geometry[0] > 0) {
		int i;
		for (i = 0; i < SYSTEM_DIMENSIONS; i++)
			j->geometry[i] = opt_local->geometry[i];
	}
#endif
	memcpy(j->conn_type, opt_local->conn_type, sizeof(j->conn_type));
	if (opt_local->reboot)
		j->reboot = 1;
	if (opt_local->no_rotate)
		j->rotate = 0;
	if (opt_local->blrtsimage)
		j->blrtsimage = opt_local->blrtsimage;
	if (opt_local->linuximage)
		j->linuximage = opt_local->linuximage;
	if (opt_local->mloaderimage)
		j->mloaderimage = opt_local->mloaderimage;
	if (opt_local->ramdiskimage)
		j->ramdiskimage = opt_local->ramdiskimage;
	if (opt_local->max_nodes)
		j->max_nodes = opt_local->max_nodes;
	else if (opt_local->nodes_set) {
		/* On an allocation if the max nodes isn't set set it
		 * to do the same behavior as with salloc or sbatch. */
		j->max_nodes = opt_local->min_nodes;
	}
	if (opt_local->pn_min_cpus != NO_VAL)
		j->pn_min_cpus = opt_local->pn_min_cpus;
	if (opt_local->pn_min_memory != NO_VAL64)
		j->pn_min_memory = opt_local->pn_min_memory;
	else if (opt_local->mem_per_cpu != NO_VAL64)
		j->pn_min_memory = opt_local->mem_per_cpu | MEM_PER_CPU;
	if (opt_local->pn_min_tmp_disk != NO_VAL)
		j->pn_min_tmp_disk = opt_local->pn_min_tmp_disk;
	if (opt_local->overcommit) {
		j->min_cpus = opt_local->min_nodes;
		j->overcommit = opt_local->overcommit;
	} else if (opt_local->cpus_set)
		j->min_cpus = opt_local->ntasks * opt_local->cpus_per_task;
	else
		j->min_cpus = opt_local->ntasks;
	if (opt_local->ntasks_set)
		j->num_tasks = opt_local->ntasks;

	if (opt_local->cpus_set)
		j->cpus_per_task = opt_local->cpus_per_task;

	if (opt_local->no_kill)
		j->kill_on_node_fail = 0;
	if (opt_local->time_limit != NO_VAL)
		j->time_limit = opt_local->time_limit;
	if (opt_local->time_min != NO_VAL)
		j->time_min = opt_local->time_min;
	if (opt_local->shared != NO_VAL16)
		j->shared = opt_local->shared;

	if (opt_local->warn_signal)
		j->warn_signal = opt_local->warn_signal;
	if (opt_local->warn_time)
		j->warn_time = opt_local->warn_time;
	if (opt_local->job_flags)
		j->bitflags = opt_local->job_flags;

	if (opt_local->cpu_freq_min != NO_VAL)
		j->cpu_freq_min = opt_local->cpu_freq_min;
	if (opt_local->cpu_freq_max != NO_VAL)
		j->cpu_freq_max = opt_local->cpu_freq_max;
	if (opt_local->cpu_freq_gov != NO_VAL)
		j->cpu_freq_gov = opt_local->cpu_freq_gov;

	if (opt_local->req_switch >= 0)
		j->req_switch = opt_local->req_switch;
	if (opt_local->wait4switch >= 0)
		j->wait4switch = opt_local->wait4switch;

	/* srun uses the same listening port for the allocation response
	 * message as all other messages */
	j->alloc_resp_port = slurmctld_comm_addr.port;
	j->other_port = slurmctld_comm_addr.port;

	if (opt_local->spank_job_env_size) {
		j->spank_job_env = opt_local->spank_job_env;
		j->spank_job_env_size = opt_local->spank_job_env_size;
	}

	if (opt_local->power_flags)
		j->power_flags = opt_local->power_flags;
	if (opt_local->mcs_label)
		j->mcs_label = opt_local->mcs_label;
	j->wait_all_nodes = 1;

	/* If can run on multiple clusters find the earliest run time
	 * and run it there */
	j->clusters = xstrdup(opt_local->clusters);

	return j;
}
/*
 * ping_nodes - check that all nodes and daemons are alive,
 *	get nodes in UNKNOWN state to register
 *
 * Builds two agent requests per pass: a REQUEST_PING for nodes whose
 * liveness is in doubt and a REQUEST_NODE_REGISTRATION_STATUS for a
 * rotating window of nodes (plus UNKNOWN nodes and everything after a
 * restart).  Nodes silent past node_dead_time are set DOWN and reported.
 * Caller must hold the node table write lock (protects the static
 * "offset" rotation counter, per the comment below).
 * Ownership: agent_queue_request() takes over the agent_arg_t and its
 * hostlist; unused (empty) requests are freed here.
 */
void ping_nodes (void)
{
	static bool restart_flag = true;	/* system just restarted */
	static int offset = 0;	/* mutex via node table write lock on entry */
	static int max_reg_threads = 0;	/* max node registration threads
					 * this can include DOWN nodes, so
					 * limit the number to avoid huge
					 * communication delays */
	int i;
	time_t now, still_live_time, node_dead_time;
	static time_t last_ping_time = (time_t) 0;
	hostlist_t down_hostlist = NULL;
	char *host_str = NULL;
	agent_arg_t *ping_agent_args = NULL;
	agent_arg_t *reg_agent_args = NULL;
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr = NULL;
#else
	struct node_record *node_ptr = NULL;
#endif

	now = time (NULL);

	ping_agent_args = xmalloc (sizeof (agent_arg_t));
	ping_agent_args->msg_type = REQUEST_PING;
	ping_agent_args->retry = 0;
	ping_agent_args->hostlist = hostlist_create("");

	reg_agent_args = xmalloc (sizeof (agent_arg_t));
	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
	reg_agent_args->retry = 0;
	reg_agent_args->hostlist = hostlist_create("");

	/*
	 * If there are a large number of down nodes, the node ping
	 * can take a long time to complete:
	 *  ping_time = down_nodes * agent_timeout / agent_parallelism
	 *  ping_time = down_nodes * 10_seconds / 10
	 *  ping_time = down_nodes (seconds)
	 * Because of this, we extend the SlurmdTimeout by the
	 * time needed to complete a ping of all nodes.
	 */
	if ((slurmctld_conf.slurmd_timeout == 0) ||
	    (last_ping_time == (time_t) 0)) {
		node_dead_time = (time_t) 0;
	} else {
		node_dead_time = last_ping_time -
				 slurmctld_conf.slurmd_timeout;
	}
	still_live_time = now - (slurmctld_conf.slurmd_timeout / 3);
	last_ping_time = now;

	if (max_reg_threads == 0) {
		max_reg_threads = MAX(slurm_get_tree_width(), 1);
	}
	/* Advance the rotating registration window; wrap after
	 * MAX_REG_FREQUENCY passes over the whole table */
	offset += max_reg_threads;
	if ((offset > node_record_count) &&
	    (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
		offset = 0;

#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if ((slurmctld_conf.slurmd_timeout == 0)	&&
		    (!restart_flag)				&&
		    (!IS_NODE_UNKNOWN(front_end_ptr))		&&
		    (!IS_NODE_NO_RESPOND(front_end_ptr)))
			continue;

		/* Silent past the dead line: mark DOWN and collect name */
		if ((front_end_ptr->last_response != (time_t) 0)     &&
		    (front_end_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(front_end_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
							  front_end_ptr->name);
			else {
				down_hostlist =
					hostlist_create(front_end_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_front_end_down(front_end_ptr, "Not responding");
			front_end_ptr->not_responding = false;
			continue;
		}

		if (restart_flag) {
			front_end_ptr->last_response =
				slurmctld_conf.last_update;
		}

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(front_end_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      front_end_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(front_end_ptr)) &&
		    (front_end_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(front_end_ptr) &&
		    IS_NODE_DOWN(front_end_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist,
			      front_end_ptr->name);
		ping_agent_args->node_count++;
	}
#else
	for (i=0, node_ptr=node_record_table_ptr;
	     i<node_record_count; i++, node_ptr++) {
		if (IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if ((slurmctld_conf.slurmd_timeout == 0) &&
		    (!restart_flag)			 &&
		    (!IS_NODE_UNKNOWN(node_ptr))	 &&
		    (!IS_NODE_NO_RESPOND(node_ptr)))
			continue;

		/* Silent past the dead line: mark DOWN and collect name */
		if ((node_ptr->last_response != (time_t) 0)     &&
		    (node_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(node_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
							  node_ptr->name);
			else {
				down_hostlist =
					hostlist_create(node_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_node_down_ptr(node_ptr, "Not responding");
			node_ptr->not_responding = false;  /* logged below */
			continue;
		}

		if (restart_flag)
			node_ptr->last_response = slurmctld_conf.last_update;

		/* Request a node registration if its state is UNKNOWN or
		 * on a periodic basis (about every MAX_REG_FREQUENCY ping,
		 * this mechanism avoids an additional (per node) timer or
		 * counter and gets updated configuration information
		 * once in a while). We limit these requests since they
		 * can generate a flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(node_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      node_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
		    (node_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, node_ptr->name);
		ping_agent_args->node_count++;
	}
#endif

	restart_flag = false;
	if (ping_agent_args->node_count == 0) {
		hostlist_destroy(ping_agent_args->hostlist);
		xfree (ping_agent_args);
	} else {
		hostlist_uniq(ping_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				ping_agent_args->hostlist);
		debug("Spawning ping agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(ping_agent_args);
	}

	if (reg_agent_args->node_count == 0) {
		hostlist_destroy(reg_agent_args->hostlist);
		xfree (reg_agent_args);
	} else {
		hostlist_uniq(reg_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				reg_agent_args->hostlist);
		debug("Spawning registration agent for %s %d hosts",
		      host_str, reg_agent_args->node_count);
		xfree(host_str);
		ping_begin();
		agent_queue_request(reg_agent_args);
	}

	if (down_hostlist) {
		hostlist_uniq(down_hostlist);
		host_str = hostlist_ranged_string_xmalloc(down_hostlist);
		error("Nodes %s not responding, setting DOWN", host_str);
		xfree(host_str);
		hostlist_destroy(down_hostlist);
	}
}
/*
 * forward_msg - logic to forward a message which has been received and
 *               accumulate the return codes from processes getting the
 *               the forwarded message
 *
 * IN: forward_struct - forward_struct_t *   - holds information about message
 *                                             that needs to be forwarded to
 *                                             childern processes
 * IN: header - header_t                     - header from message that came in
 *                                             needing to be forwarded.
 * RET: SLURM_SUCCESS - int
 *
 * Partitions the node list into spans (per set_span()) and spawns one
 * detached _forward_thread per span; each thread gets the slot
 * forward_struct->forward_msg[thr_count], which must therefore be sized
 * for the number of spans.  Results are accumulated into
 * forward_struct->ret_list by the threads.
 */
extern int forward_msg(forward_struct_t *forward_struct,
		       header_t *header)
{
	int j = 0;
	int retries = 0;
	forward_msg_t *forward_msg = NULL;
	int thr_count = 0;
	int *span = set_span(header->forward.cnt, 0);
	hostlist_t hl = NULL;
	hostlist_t forward_hl = NULL;
	char *name = NULL;

	if (!forward_struct->ret_list) {
		error("didn't get a ret_list from forward_struct");
		xfree(span);
		return SLURM_ERROR;
	}
	hl = hostlist_create(header->forward.nodelist);
	hostlist_uniq(hl);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		char *buf = NULL;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		/* Fill in this thread's slot from the shared header */
		forward_msg = &forward_struct->forward_msg[thr_count];
		forward_msg->ret_list = forward_struct->ret_list;

		forward_msg->timeout = forward_struct->timeout;

		if (forward_msg->timeout <= 0) {
			/* convert secs to msec */
			forward_msg->timeout = slurm_get_msg_timeout() * 1000;
		}

		forward_msg->notify = &forward_struct->notify;
		forward_msg->forward_mutex = &forward_struct->forward_mutex;
		forward_msg->buf_len = forward_struct->buf_len;
		forward_msg->buf = forward_struct->buf;

		memcpy(&forward_msg->header.orig_addr,
		       &header->orig_addr,
		       sizeof(slurm_addr_t));

		forward_msg->header.version = header->version;
		forward_msg->header.flags = header->flags;
		forward_msg->header.msg_type = header->msg_type;
		forward_msg->header.body_length = header->body_length;
		forward_msg->header.ret_list = NULL;
		forward_msg->header.ret_cnt = 0;

		/* First host of the span plus up to span[thr_count]
		 * additional hosts shifted off the main list */
		forward_hl = hostlist_create(name);
		free(name);
		for(j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(forward_hl, name);
			free(name);
		}

		buf = hostlist_ranged_string_xmalloc(forward_hl);
		hostlist_destroy(forward_hl);
		forward_init(&forward_msg->header.forward, NULL);
		/* buf ownership passes to the forward header */
		forward_msg->header.forward.nodelist = buf;
		while (pthread_create(&thread_agent, &attr_agent,
				     _forward_thread,
				     (void *)forward_msg)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
		thr_count++;
	}
	hostlist_destroy(hl);
	xfree(span);
	return SLURM_SUCCESS;
}
/*
 * start_msg_tree  - logic to begin the forward tree and
 *                   accumulate the return codes from processes getting the
 *                   the forwarded message
 *
 * IN: hl          - hostlist_t   - list of every node to send message to
 *                                  (uniq'd and consumed/emptied here)
 * IN: msg         - slurm_msg_t  - message to send.
 * IN: timeout     - int          - how long to wait in milliseconds.
 * RET List 	   - List containing the responses of the childern
 *		     (if any) we forwarded the message to. List
 *		     containing type (ret_data_info_t).
 *
 * Spawns one detached _fwd_tree_thread per span of hosts, then blocks
 * on the notify condition until every thread has decremented thr_count.
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	int *span = NULL;
	fwd_tree_t *fwd_tree = NULL;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int j = 0, count = 0;
	List ret_list = NULL;
	char *name = NULL;
	int thr_count = 0;
	int host_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	span = set_span(host_count, 0);

	slurm_mutex_init(&tree_mutex);
	pthread_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		/* fwd_tree is freed by the spawned thread */
		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		fwd_tree->orig_msg = msg;
		fwd_tree->ret_list = ret_list;
		fwd_tree->timeout = timeout;
		fwd_tree->notify = &notify;
		fwd_tree->p_thr_count = &thr_count;
		fwd_tree->tree_mutex = &tree_mutex;

		if (fwd_tree->timeout <= 0) {
			/* convert secs to msec */
			fwd_tree->timeout = slurm_get_msg_timeout() * 1000;
		}

		/* First host of the span plus up to span[thr_count]
		 * additional hosts shifted off the main list */
		fwd_tree->tree_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(fwd_tree->tree_hl, name);
			free(name);
		}

		/*
		 * Lock and increase thread counter, we need that to protect
		 * the start_msg_tree waiting loop that was originally designed
		 * around a "while ((count < host_count))" loop. In case where a
		 * fwd thread was not able to get all the return codes from
		 * children, the waiting loop was deadlocked.
		 */
		slurm_mutex_lock(&tree_mutex);
		thr_count++;
		slurm_mutex_unlock(&tree_mutex);

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
	}
	xfree(span);

	/* Wait until every forwarding thread has signalled completion */
	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while (thr_count > 0) {
		pthread_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	xassert(count >= host_count);	/* Tree head did not get all responses,
					 * but no more active fwd threads!*/
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	pthread_cond_destroy(&notify);

	return ret_list;
}
/*
 * Connect to a diod server's synthetic "ctl" file system, read the
 * "connections" file, and print the set of connected client hostnames —
 * de-duplicated, and either one per line (-l) or as a compressed
 * hostlist range string.  Exits nonzero via *_exit helpers on any error.
 */
int main (int argc, char *argv[])
{
	char *server = NULL;		/* --server HOST[:PORT] or socket path */
	int msize = 65536;		/* 9P negotiated message size */
	uid_t uid = geteuid ();		/* identity used for attach/auth */
	int topt = 0;			/* --timeout in seconds; 0 = none */
	Npcfsys *fs = NULL;
	Npcfid *fid, *afid, *root;
	int c, fd;
	char buf[80], *host, *p;
	hostlist_t hl;
	hostlist_iterator_t itr;
	int lopt = 0;			/* --long: one host per line */

	diod_log_init (argv[0]);

	opterr = 0;
	while ((c = GETOPT (argc, argv, OPTIONS, longopts)) != -1) {
		switch (c) {
			case 's':	/* --server HOST[:PORT] or /path/to/socket */
				server = optarg;
				break;
			case 'm':	/* --msize SIZE */
				msize = strtoul (optarg, NULL, 10);
				break;
			case 'u':	/* --uid UID */
				uid = strtoul (optarg, NULL, 10);
				break;
			case 't':	/* --timeout SECS */
				topt = strtoul (optarg, NULL, 10);
				break;
			case 'l':	/* --long */
				lopt = 1;
				break;
			default:
				usage ();
		}
	}

	/* Ignore SIGPIPE so a dropped connection surfaces as a write
	 * error instead of killing the process; SIGALRM implements the
	 * optional timeout via alarm(). */
	if (signal (SIGPIPE, SIG_IGN) == SIG_ERR)
		err_exit ("signal");
	if (signal (SIGALRM, sigalarm) == SIG_ERR)
		err_exit ("signal");

	if (topt > 0)
		alarm (topt);

	if ((fd = diod_sock_connect (server, 0)) < 0)
		exit (1);

	/* 9P session setup: version/msize negotiation, auth, attach to
	 * the "ctl" aname, then open its "connections" file read-only. */
	if (!(fs = npc_start (fd, fd, msize, 0)))
		errn_exit (np_rerror (), "error negotiating protocol with server");
	if (!(afid = npc_auth (fs, "ctl", uid, diod_auth)) && np_rerror () != 0)
		errn_exit (np_rerror (), "error authenticating to server");
	if (!(root = npc_attach (fs, afid, "ctl", uid)))
		errn_exit (np_rerror (), "error attaching to aname=ctl");
	if (!(fid = npc_open_bypath (root, "connections", O_RDONLY)))
		errn_exit (np_rerror (), "open connections");
	if (!(hl = hostlist_create (NULL)))
		err_exit ("hostlist_create");

	/* Each line presumably starts with a hostname; truncate at the
	 * first space, and (unless -l) also strip the domain suffix so
	 * short names aggregate into ranges.
	 * NOTE(review): lines longer than sizeof(buf)-1 would be split by
	 * npc_gets — assumed not to occur for this file; confirm. */
	while (npc_gets (fid, buf, sizeof(buf))) {
		if ((p = strchr (buf, ' ')))
			*p = '\0';
		if (!lopt && (p = strchr (buf, '.')))
			*p = '\0';
		if (!hostlist_push_host (hl, buf))
			err_exit ("hostlist_push_host");
	}
	hostlist_uniq (hl);

	if (lopt) {
		/* one host per line */
		if (!(itr = hostlist_iterator_create (hl)))
			err_exit ("hostlist_iterator_create");
		while ((host = hostlist_next (itr)))
			printf ("%s\n", host);
		hostlist_iterator_destroy (itr);
	} else {
		/* compressed range form, e.g. "node[1-8]" */
		char s[1024];
		if (hostlist_ranged_string (hl, sizeof (s), s) < 0)
			msg_exit ("hostlist output would be too long (use -l)");
		printf ("%s\n", s);
	}
	hostlist_destroy (hl);

	/* Tear down the 9P fids/session in reverse order of acquisition. */
	if (npc_clunk (fid) < 0)
		errn_exit (np_rerror (), "clunk connections");
	if (npc_clunk (root) < 0)
		errn_exit (np_rerror (), "error clunking ctl");
	if (npc_clunk (afid) < 0)
		errn_exit (np_rerror (), "error clunking afid");
	npc_finish (fs);

	exit(0);
}
struct ipmipower_connection * ipmipower_connection_array_create(const char *hostname, unsigned int *len) { char *str = NULL; int index = 0; hostlist_t hl = NULL; hostlist_iterator_t itr = NULL; struct ipmipower_connection *ics; int size = sizeof(struct ipmipower_connection); int hl_count; int errcount = 0; int emfilecount = 0; assert(hostname && len); *len = 0; if (!(hl = hostlist_create(hostname))) { ipmipower_output(MSG_TYPE_HOSTNAME_INVALID, hostname); return NULL; } if (!(itr = hostlist_iterator_create(hl))) ierr_exit("hostlist_iterator_create() error"); hostlist_uniq(hl); hl_count = hostlist_count(hl); ics = (struct ipmipower_connection *)Malloc(size * hl_count); memset(ics, '\0', (size * hl_count)); while ((str = hostlist_next(itr))) { ics[index].ipmi_fd = -1; ics[index].ping_fd = -1; /* cleanup only at the end, gather all error outputs for * later */ if (_connection_setup(&ics[index], str) < 0) { if (errno == EMFILE && !emfilecount) { cbuf_printf(ttyout, "file descriptor limit reached\n"); emfilecount++; } errcount++; } free(str); index++; } hostlist_iterator_destroy(itr); hostlist_destroy(hl); if (errcount) { int i; for (i = 0; i < hl_count; i++) { close(ics[i].ipmi_fd); close(ics[i].ping_fd); if (ics[i].ipmi_in) cbuf_destroy(ics[i].ipmi_in); if (ics[i].ipmi_out) cbuf_destroy(ics[i].ipmi_out); if (ics[i].ping_in) cbuf_destroy(ics[i].ping_in); if (ics[i].ping_out) cbuf_destroy(ics[i].ping_out); } Free(ics); return NULL; } *len = hl_count; return ics; }
static int _job_modify(uint32_t jobid, char *bank_ptr, char *depend_ptr, char *new_hostlist, uint32_t new_node_cnt, char *part_name_ptr, uint32_t new_time_limit) { struct job_record *job_ptr; bool update_accounting = false; job_ptr = find_job_record(jobid); if (job_ptr == NULL) { error("wiki: MODIFYJOB has invalid jobid %u", jobid); return ESLURM_INVALID_JOB_ID; } if (IS_JOB_FINISHED(job_ptr)) { error("wiki: MODIFYJOB jobid %u is finished", jobid); return ESLURM_DISABLED; } if (depend_ptr) { int rc = update_job_dependency(job_ptr, depend_ptr); if (rc == SLURM_SUCCESS) { info("wiki: changed job %u dependency to %s", jobid, depend_ptr); } else { error("wiki: changing job %u dependency to %s", jobid, depend_ptr); return EINVAL; } } if (new_time_limit) { time_t old_time = job_ptr->time_limit; job_ptr->time_limit = new_time_limit; info("wiki: change job %u time_limit to %u", jobid, new_time_limit); /* Update end_time based upon change * to preserve suspend time info */ job_ptr->end_time = job_ptr->end_time + ((job_ptr->time_limit - old_time) * 60); last_job_update = time(NULL); } if (bank_ptr) { if (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS) return EINVAL; else update_accounting = true; } if (new_hostlist) { int rc = 0, task_cnt; hostlist_t hl; char *tasklist; if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { /* Job is done, nothing to reset */ if (new_hostlist == '\0') goto host_fini; error("wiki: MODIFYJOB tasklist of non-pending " "job %u", jobid); return ESLURM_DISABLED; } xfree(job_ptr->details->req_nodes); FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); if (new_hostlist == '\0') goto host_fini; tasklist = moab2slurm_task_list(new_hostlist, &task_cnt); if (tasklist == NULL) { rc = 1; goto host_fini; } hl = hostlist_create(tasklist); if (hl == 0) { rc = 1; goto host_fini; } hostlist_uniq(hl); hostlist_sort(hl); job_ptr->details->req_nodes = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (job_ptr->details->req_nodes == NULL) 
{ rc = 1; goto host_fini; } if (node_name2bitmap(job_ptr->details->req_nodes, false, &job_ptr->details->req_node_bitmap)) { rc = 1; goto host_fini; } host_fini: if (rc) { info("wiki: change job %u invalid hostlist %s", jobid, new_hostlist); xfree(job_ptr->details->req_nodes); return EINVAL; } else { info("wiki: change job %u hostlist %s", jobid, new_hostlist); update_accounting = true; } } if (part_name_ptr) { struct part_record *part_ptr; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB partition of non-pending " "job %u", jobid); return ESLURM_DISABLED; } part_ptr = find_part_record(part_name_ptr); if (part_ptr == NULL) { error("wiki: MODIFYJOB has invalid partition %s", part_name_ptr); return ESLURM_INVALID_PARTITION_NAME; } info("wiki: change job %u partition %s", jobid, part_name_ptr); xfree(job_ptr->partition); job_ptr->partition = xstrdup(part_name_ptr); job_ptr->part_ptr = part_ptr; last_job_update = time(NULL); update_accounting = true; } if (new_node_cnt) { if (IS_JOB_PENDING(job_ptr) && job_ptr->details) { job_ptr->details->min_nodes = new_node_cnt; if (job_ptr->details->max_nodes && (job_ptr->details->max_nodes < new_node_cnt)) job_ptr->details->max_nodes = new_node_cnt; info("wiki: change job %u min_nodes to %u", jobid, new_node_cnt); last_job_update = time(NULL); update_accounting = true; } else { error("wiki: MODIFYJOB node count of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (update_accounting) { /* Update job record in accounting to reflect changes */ jobacct_storage_job_start_direct(acct_db_conn, job_ptr); } return SLURM_SUCCESS; }
/* Spawn health check function for every node that is not DOWN */
extern void run_health_check(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	struct node_record *node_ptr;
	int node_test_cnt = 0, node_limit, node_states, run_cyclic;
	/* Static cycle state: base_node_loc is the index of the last node
	 * tested (-1 = cycle complete); cycle_start_time gates when a new
	 * cycle may begin.  NOTE(review): assumes single-threaded callers
	 * (statics are unsynchronized) — confirm. */
	static int base_node_loc = -1;
	static time_t cycle_start_time = (time_t) 0;
#endif
	int i;
	char *host_str = NULL;
	agent_arg_t *check_agent_args = NULL;

	/* Sync plugin internal data with
	 * node select_nodeinfo. This is important
	 * after reconfig otherwise select_nodeinfo
	 * will not return the correct number of
	 * allocated cpus.
	 */
	select_g_select_nodeinfo_set_all();

	check_agent_args = xmalloc (sizeof (agent_arg_t));
	check_agent_args->msg_type = REQUEST_HEALTH_CHECK;
	check_agent_args->retry = 0;
	check_agent_args->hostlist = hostlist_create(NULL);
#ifdef HAVE_FRONT_END
	/* Front-end mode: send to every responding front-end node. */
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if (IS_NODE_NO_RESPOND(front_end_ptr))
			continue;
		hostlist_push_host(check_agent_args->hostlist,
				   front_end_ptr->name);
		check_agent_args->node_count++;
	}
#else
	/* Split the configured health_check_node_state into the CYCLE flag
	 * and the node-state filter bits. */
	run_cyclic = slurmctld_conf.health_check_node_state &
		     HEALTH_CHECK_CYCLE;
	node_states = slurmctld_conf.health_check_node_state &
		      (~HEALTH_CHECK_CYCLE);
	if (run_cyclic) {
		time_t now = time(NULL);
		if (cycle_start_time == (time_t) 0)
			cycle_start_time = now;
		else if (base_node_loc >= 0)
			;	/* mid-cycle */
		else if (difftime(now, cycle_start_time) <
			 slurmctld_conf.health_check_interval) {
			return;	/* Wait to start next cycle */
		}
		cycle_start_time = now;
		/* Determine how many nodes we want to test on each call of
		 * run_health_check() to spread out the work. */
		node_limit = (node_record_count * 2) /
			     slurmctld_conf.health_check_interval;
		node_limit = MAX(node_limit, 10);
	}
	if ((node_states != HEALTH_CHECK_NODE_ANY) &&
	    (node_states != HEALTH_CHECK_NODE_IDLE)) {
		/* Update each node's alloc_cpus count */
		select_g_select_nodeinfo_set_all();
	}
	for (i = 0; i < node_record_count; i++) {
		if (run_cyclic) {
			/* Cyclic mode: resume from base_node_loc and test at
			 * most node_limit nodes per invocation. */
			if (node_test_cnt++ >= node_limit)
				break;
			base_node_loc++;
			if (base_node_loc >= node_record_count) {
				base_node_loc = -1;	/* cycle complete */
				break;
			}
			node_ptr = node_record_table_ptr + base_node_loc;
		} else {
			node_ptr = node_record_table_ptr + i;
		}
		/* Skip nodes that cannot run the check right now. */
		if (IS_NODE_NO_RESPOND(node_ptr) ||
		    IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if (node_states != HEALTH_CHECK_NODE_ANY) {
			uint16_t cpus_total, cpus_used = 0;
			if (slurmctld_conf.fast_schedule) {
				cpus_total = node_ptr->config_ptr->cpus;
			} else {
				cpus_total = node_ptr->cpus;
			}
			if (!IS_NODE_IDLE(node_ptr)) {
				select_g_select_nodeinfo_get(
					node_ptr->select_nodeinfo,
					SELECT_NODEDATA_SUBCNT,
					NODE_STATE_ALLOCATED,
					&cpus_used);
			}
			/* Here the node state is inferred from
			 * the cpus allocated on it.
			 * - cpus_used == 0
			 *   means node is idle
			 * - cpus_used < cpus_total
			 *   means the node is in mixed state
			 * else cpus_used == cpus_total
			 *   means the node is allocated
			 */
			if (cpus_used == 0) {
				if (!(node_states & HEALTH_CHECK_NODE_IDLE))
					continue;
				if (!IS_NODE_IDLE(node_ptr))
					continue;
			} else if (cpus_used < cpus_total) {
				if (!(node_states & HEALTH_CHECK_NODE_MIXED))
					continue;
			} else {
				if (!(node_states & HEALTH_CHECK_NODE_ALLOC))
					continue;
			}
		}
		hostlist_push_host(check_agent_args->hostlist,
				   node_ptr->name);
		check_agent_args->node_count++;
	}
	/* Whole table scanned without hitting the per-call limit: mark the
	 * cycle complete so the interval gate applies next time. */
	if (run_cyclic && (i >= node_record_count))
		base_node_loc = -1;
#endif

	if (check_agent_args->node_count == 0) {
		/* nothing to do; free the unused request */
		hostlist_destroy(check_agent_args->hostlist);
		xfree (check_agent_args);
	} else {
		/* check_agent_args ownership passes to the agent queue */
		hostlist_uniq(check_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				check_agent_args->hostlist);
		debug("Spawning health check agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(check_agent_args);
	}
}
/*
 * _make_agent_info - build an agent_info_t from an agent_arg_t, packing
 *	the destination hostlist into per-thread node-range strings.
 *	Each thread gets one host plus span[thr_count] additional hosts
 *	shifted off agent_arg_ptr->hostlist.
 *
 * IN agent_arg_ptr - request describing message type, retry policy, and
 *	destination hostlist (which this function consumes via
 *	hostlist_shift).
 * RET newly xmalloc'd agent_info_t with thread_count set to the number
 *	of thread slots actually filled; caller owns the result.
 */
static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr)
{
	int i = 0, j = 0;
	agent_info_t *agent_info_ptr = NULL;
	thd_t *thread_ptr = NULL;
	int *span = NULL;
	int thr_count = 0;	/* number of thread slots filled */
	hostlist_t hl = NULL;
	char *name = NULL;

	agent_info_ptr = xmalloc(sizeof(agent_info_t));
	slurm_mutex_init(&agent_info_ptr->thread_mutex);
	if (pthread_cond_init(&agent_info_ptr->thread_cond, NULL))
		fatal("pthread_cond_init error %m");
	/* Provisional count; reduced to thr_count at the end. */
	agent_info_ptr->thread_count = agent_arg_ptr->node_count;
	agent_info_ptr->retry = agent_arg_ptr->retry;
	agent_info_ptr->threads_active = 0;
	thread_ptr = xmalloc(agent_info_ptr->thread_count * sizeof(thd_t));
	memset(thread_ptr, 0, (agent_info_ptr->thread_count * sizeof(thd_t)));
	agent_info_ptr->thread_struct = thread_ptr;
	agent_info_ptr->msg_type = agent_arg_ptr->msg_type;
	agent_info_ptr->msg_args_pptr = &agent_arg_ptr->msg_args;
	/* Message types NOT in this list are latency-sensitive or
	 * single-destination, so they get one thread per node (no
	 * forwarding); everything else fans out and expects replies. */
	if ((agent_arg_ptr->msg_type != REQUEST_JOB_NOTIFY)	&&
	    (agent_arg_ptr->msg_type != REQUEST_SHUTDOWN)	&&
	    (agent_arg_ptr->msg_type != REQUEST_RECONFIGURE)	&&
	    (agent_arg_ptr->msg_type != SRUN_EXEC)		&&
	    (agent_arg_ptr->msg_type != SRUN_TIMEOUT)		&&
	    (agent_arg_ptr->msg_type != SRUN_NODE_FAIL)		&&
	    (agent_arg_ptr->msg_type != SRUN_REQUEST_SUSPEND)	&&
	    (agent_arg_ptr->msg_type != SRUN_USER_MSG)		&&
	    (agent_arg_ptr->msg_type != SRUN_STEP_MISSING)	&&
	    (agent_arg_ptr->msg_type != SRUN_JOB_COMPLETE)) {
#ifdef HAVE_FRONT_END
		span = set_span(agent_arg_ptr->node_count,
				agent_arg_ptr->node_count);
#else
		/* Sending message to a possibly large number of slurmd.
		 * Push all message forwarding to slurmd in order to
		 * offload as much work from slurmctld as possible. */
		span = set_span(agent_arg_ptr->node_count, 1);
#endif
		agent_info_ptr->get_reply = true;
	} else {
		/* Message is going to one node (for srun) or we want
		 * it to get processed ASAP (SHUTDOWN or RECONFIGURE).
		 * Send the message directly to each node. */
		span = set_span(agent_arg_ptr->node_count,
				agent_arg_ptr->node_count);
	}
	i = 0;
	/* i counts hosts consumed from the hostlist; thr_count counts
	 * thread slots filled — they diverge when span[] > 0. */
	while(i < agent_info_ptr->thread_count) {
		thread_ptr[thr_count].state = DSH_NEW;
		thread_ptr[thr_count].addr = agent_arg_ptr->addr;
		name = hostlist_shift(agent_arg_ptr->hostlist);
		if(!name) {
			debug3("no more nodes to send to");
			break;
		}
		hl = hostlist_create(name);
		/* An explicit addr targets a single endpoint, so disable
		 * fan-out for this thread. */
		if(thread_ptr[thr_count].addr && span[thr_count]) {
			debug("warning: you will only be sending this to %s",
			      name);
			span[thr_count] = 0;
		}
		free(name);
		i++;
		/* Attach span[thr_count] more hosts to this thread's list. */
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(agent_arg_ptr->hostlist);
			if (!name)
				break;
			hostlist_push(hl, name);
			free(name);
			i++;
		}
		hostlist_uniq(hl);
		thread_ptr[thr_count].nodelist =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
#if 0
		info("sending msg_type %u to nodes %s",
		     agent_arg_ptr->msg_type, thread_ptr[thr_count].nodelist);
#endif
		thr_count++;
	}
	xfree(span);
	agent_info_ptr->thread_count = thr_count;
	return agent_info_ptr;
}