/*
 * delete_partition - remove the named partition from the partition list
 * IN part_desc_ptr - delete request; only its "name" field is read here
 * RET SLURM_SUCCESS,
 *     ESLURM_INVALID_PARTITION_NAME if find_part_record() finds no match, or
 *     ESLURM_PARTITION_IN_USE if partition_in_use() reports the partition busy
 *
 * On success the matching entries are removed from part_list, jobs are
 * killed by partition name, last_part_update is refreshed and both the
 * scheduler and select plugins are notified.
 */
extern int delete_partition(delete_part_msg_t *part_desc_ptr)
{
	char *victim_name = part_desc_ptr->name;
	struct part_record *victim = find_part_record(victim_name);

	/* Guard clauses: unknown partition, or one that is still in use */
	if (!victim)
		return ESLURM_INVALID_PARTITION_NAME;
	if (partition_in_use(victim_name))
		return ESLURM_PARTITION_IN_USE;

	/* Deleting the default partition is permitted, but is logged as an
	 * error and leaves no default partition configured */
	if (victim == default_part_loc) {
		error("Deleting default partition %s", victim->name);
		default_part_loc = NULL;
	}

	(void) kill_job_by_part_name(victim_name);
	list_delete_all(part_list, list_find_part, victim_name);
	last_part_update = time(NULL);

	/* Let the scheduler plugin, then the select plugin, see the change */
	slurm_sched_partition_change();
	select_g_reconfigure();

	return SLURM_SUCCESS;
}
static int _job_modify(uint32_t jobid, char *bank_ptr, char *depend_ptr, char *new_hostlist, uint32_t new_node_cnt, char *part_name_ptr, uint32_t new_time_limit, char *name_ptr, char *start_ptr, char *feature_ptr, char *env_ptr, char *comment_ptr, char *gres_ptr, char *wckey_ptr) { struct job_record *job_ptr; time_t now = time(NULL); bool update_accounting = false; job_ptr = find_job_record(jobid); if (job_ptr == NULL) { error("wiki: MODIFYJOB has invalid jobid %u", jobid); return ESLURM_INVALID_JOB_ID; } if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) { info("wiki: MODIFYJOB jobid %u is finished", jobid); return ESLURM_DISABLED; } if (comment_ptr) { info("wiki: change job %u comment %s", jobid, comment_ptr); xfree(job_ptr->comment); job_ptr->comment = xstrdup(comment_ptr); last_job_update = now; } if (depend_ptr) { int rc = update_job_dependency(job_ptr, depend_ptr); if (rc == SLURM_SUCCESS) { info("wiki: changed job %u dependency to %s", jobid, depend_ptr); } else { error("wiki: changing job %u dependency to %s", jobid, depend_ptr); return EINVAL; } } if (env_ptr) { bool have_equal = false; char old_sep[1]; int begin = 0, i; if (job_ptr->batch_flag == 0) { error("wiki: attempt to set environment variables " "for non-batch job %u", jobid); return ESLURM_DISABLED; } for (i=0; ; i++) { if (env_ptr[i] == '=') { if (have_equal) { error("wiki: setting job %u invalid " "environment variables: %s", jobid, env_ptr); return EINVAL; } have_equal = true; if (env_ptr[i+1] == '\"') { for (i+=2; ; i++) { if (env_ptr[i] == '\0') { error("wiki: setting job %u " "invalid environment " "variables: %s", jobid, env_ptr); return EINVAL; } if (env_ptr[i] == '\"') { i++; break; } if (env_ptr[i] == '\\') { i++; } } } else if (env_ptr[i+1] == '\'') { for (i+=2; ; i++) { if (env_ptr[i] == '\0') { error("wiki: setting job %u " "invalid environment " "variables: %s", jobid, env_ptr); return EINVAL; } if (env_ptr[i] == '\'') { i++; break; } if (env_ptr[i] == '\\') { i++; } } } } 
if (isspace(env_ptr[i]) || (env_ptr[i] == ',')) { if (!have_equal) { error("wiki: setting job %u invalid " "environment variables: %s", jobid, env_ptr); return EINVAL; } old_sep[0] = env_ptr[i]; env_ptr[i] = '\0'; xrealloc(job_ptr->details->env_sup, sizeof(char *) * (job_ptr->details->env_cnt+1)); job_ptr->details->env_sup [job_ptr->details->env_cnt++] = xstrdup(&env_ptr[begin]); info("wiki: for job %u add env: %s", jobid, &env_ptr[begin]); env_ptr[i] = old_sep[0]; if (isspace(old_sep[0])) break; begin = i + 1; have_equal = false; } } } if (new_time_limit) { time_t old_time = job_ptr->time_limit; job_ptr->time_limit = new_time_limit; info("wiki: change job %u time_limit to %u", jobid, new_time_limit); /* Update end_time based upon change * to preserve suspend time info */ job_ptr->end_time = job_ptr->end_time + ((job_ptr->time_limit - old_time) * 60); last_job_update = now; } if (bank_ptr && (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS)) { return EINVAL; } if (feature_ptr) { if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) { info("wiki: change job %u features to %s", jobid, feature_ptr); job_ptr->details->features = xstrdup(feature_ptr); last_job_update = now; } else { error("wiki: MODIFYJOB features of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (start_ptr) { char *end_ptr; uint32_t begin_time = strtol(start_ptr, &end_ptr, 10); if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) { info("wiki: change job %u begin time to %u", jobid, begin_time); job_ptr->details->begin_time = begin_time; last_job_update = now; update_accounting = true; } else { error("wiki: MODIFYJOB begin_time of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (name_ptr) { if (IS_JOB_PENDING(job_ptr)) { info("wiki: change job %u name %s", jobid, name_ptr); xfree(job_ptr->name); job_ptr->name = xstrdup(name_ptr); last_job_update = now; update_accounting = true; } else { error("wiki: MODIFYJOB name of non-pending job %u", jobid); return 
ESLURM_DISABLED; } } if (new_hostlist) { int rc = 0, task_cnt; hostlist_t hl; char *tasklist; if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { /* Job is done, nothing to reset */ if (new_hostlist == '\0') goto host_fini; error("wiki: MODIFYJOB hostlist of non-pending " "job %u", jobid); return ESLURM_DISABLED; } xfree(job_ptr->details->req_nodes); FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); if (new_hostlist == '\0') goto host_fini; tasklist = moab2slurm_task_list(new_hostlist, &task_cnt); if (tasklist == NULL) { rc = 1; goto host_fini; } hl = hostlist_create(tasklist); if (hl == 0) { rc = 1; goto host_fini; } hostlist_uniq(hl); hostlist_sort(hl); job_ptr->details->req_nodes = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (job_ptr->details->req_nodes == NULL) { rc = 1; goto host_fini; } if (node_name2bitmap(job_ptr->details->req_nodes, false, &job_ptr->details->req_node_bitmap)) { rc = 1; goto host_fini; } host_fini: if (rc) { info("wiki: change job %u invalid hostlist %s", jobid, new_hostlist); xfree(job_ptr->details->req_nodes); return EINVAL; } else { info("wiki: change job %u hostlist %s", jobid, new_hostlist); update_accounting = true; } } if (part_name_ptr) { struct part_record *part_ptr; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB partition of non-pending " "job %u", jobid); return ESLURM_DISABLED; } part_ptr = find_part_record(part_name_ptr); if (part_ptr == NULL) { error("wiki: MODIFYJOB has invalid partition %s", part_name_ptr); return ESLURM_INVALID_PARTITION_NAME; } info("wiki: change job %u partition %s", jobid, part_name_ptr); xfree(job_ptr->partition); job_ptr->partition = xstrdup(part_name_ptr); job_ptr->part_ptr = part_ptr; last_job_update = now; update_accounting = true; } if (new_node_cnt) { job_desc_msg_t job_desc; #ifdef HAVE_BG uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL}; static uint16_t cpus_per_node = 0; if (!cpus_per_node) { select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT, 
&cpus_per_node); } #endif if(!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { error("wiki: MODIFYJOB node count of non-pending " "job %u", jobid); return ESLURM_DISABLED; } memset(&job_desc, 0, sizeof(job_desc_msg_t)); job_desc.min_nodes = new_node_cnt; job_desc.max_nodes = NO_VAL; job_desc.select_jobinfo = select_g_select_jobinfo_alloc(); select_g_alter_node_cnt(SELECT_SET_NODE_CNT, &job_desc); select_g_select_jobinfo_free(job_desc.select_jobinfo); job_ptr->details->min_nodes = job_desc.min_nodes; if (job_ptr->details->max_nodes && (job_ptr->details->max_nodes < job_desc.min_nodes)) job_ptr->details->max_nodes = job_desc.min_nodes; info("wiki: change job %u min_nodes to %u", jobid, new_node_cnt); #ifdef HAVE_BG job_ptr->details->min_cpus = job_desc.min_cpus; job_ptr->details->max_cpus = job_desc.max_cpus; job_ptr->details->pn_min_cpus = job_desc.pn_min_cpus; new_node_cnt = job_ptr->details->min_cpus; if (cpus_per_node) new_node_cnt /= cpus_per_node; /* This is only set up so accounting is set up correctly */ select_g_select_jobinfo_set(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &new_node_cnt); /* reset geo since changing this makes any geo potentially invalid */ select_g_select_jobinfo_set(job_ptr->select_jobinfo, SELECT_JOBDATA_GEOMETRY, geometry); #endif last_job_update = now; update_accounting = true; } if (gres_ptr) { char *orig_gres; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB GRES of non-pending job %u", jobid); return ESLURM_DISABLED; } orig_gres = job_ptr->gres; job_ptr->gres = NULL; if (gres_ptr[0]) job_ptr->gres = xstrdup(gres_ptr); if (gres_plugin_job_state_validate(job_ptr->gres, &job_ptr->gres_list)) { error("wiki: MODIFYJOB Invalid GRES=%s", gres_ptr); xfree(job_ptr->gres); job_ptr->gres = orig_gres; return ESLURM_INVALID_GRES; } xfree(orig_gres); } if (wckey_ptr) { int rc = update_job_wckey("update_job", job_ptr, wckey_ptr); if (rc != SLURM_SUCCESS) { error("wiki: MODIFYJOB Invalid WCKEY=%s", wckey_ptr); return rc; } } if 
(update_accounting) { if (job_ptr->details && job_ptr->details->begin_time) { /* Update job record in accounting to reflect * the changes */ jobacct_storage_g_job_start(acct_db_conn, job_ptr); } } return SLURM_SUCCESS; }
/* Initialize power_save module parameters. * Return 0 on valid configuration to run power saving, * otherwise log the problem and return -1 */ static int _init_power_config(void) { slurm_ctl_conf_t *conf = slurm_conf_lock(); last_config = slurmctld_conf.last_update; idle_time = conf->suspend_time - 1; suspend_rate = conf->suspend_rate; resume_timeout = conf->resume_timeout; resume_rate = conf->resume_rate; slurmd_timeout = conf->slurmd_timeout; suspend_timeout = conf->suspend_timeout; _clear_power_config(); if (conf->suspend_program) suspend_prog = xstrdup(conf->suspend_program); if (conf->resume_program) resume_prog = xstrdup(conf->resume_program); if (conf->suspend_exc_nodes) exc_nodes = xstrdup(conf->suspend_exc_nodes); if (conf->suspend_exc_parts) exc_parts = xstrdup(conf->suspend_exc_parts); slurm_conf_unlock(); if (idle_time < 0) { /* not an error */ debug("power_save module disabled, SuspendTime < 0"); return -1; } if (suspend_rate < 0) { error("power_save module disabled, SuspendRate < 0"); return -1; } if (resume_rate < 0) { error("power_save module disabled, ResumeRate < 0"); return -1; } if (suspend_prog == NULL) { error("power_save module disabled, NULL SuspendProgram"); return -1; } else if (!_valid_prog(suspend_prog)) { error("power_save module disabled, invalid SuspendProgram %s", suspend_prog); return -1; } if (resume_prog == NULL) { error("power_save module disabled, NULL ResumeProgram"); return -1; } else if (!_valid_prog(resume_prog)) { error("power_save module disabled, invalid ResumeProgram %s", resume_prog); return -1; } if (exc_nodes && (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) { error("power_save module disabled, " "invalid SuspendExcNodes %s", exc_nodes); return -1; } if (exc_parts) { char *tmp = NULL, *one_part = NULL, *part_list = NULL; struct part_record *part_ptr = NULL; int rc = 0; part_list = xstrdup(exc_parts); one_part = strtok_r(part_list, ",", &tmp); while (one_part != NULL) { part_ptr = find_part_record(one_part); 
if (!part_ptr) { error("power_save module disabled, " "invalid SuspendExcPart %s", one_part); rc = -1; break; } if (exc_node_bitmap) bit_or(exc_node_bitmap, part_ptr->node_bitmap); else exc_node_bitmap = bit_copy(part_ptr-> node_bitmap); one_part = strtok_r(NULL, ",", &tmp); } xfree(part_list); if (rc) return rc; } if (exc_node_bitmap) { char *tmp = bitmap2node_name(exc_node_bitmap); debug("power_save module, excluded nodes %s", tmp); xfree(tmp); } return 0; }
static int _job_modify(uint32_t jobid, char *bank_ptr, char *depend_ptr, char *new_hostlist, uint32_t new_node_cnt, char *part_name_ptr, uint32_t new_time_limit) { struct job_record *job_ptr; bool update_accounting = false; job_ptr = find_job_record(jobid); if (job_ptr == NULL) { error("wiki: MODIFYJOB has invalid jobid %u", jobid); return ESLURM_INVALID_JOB_ID; } if (IS_JOB_FINISHED(job_ptr)) { error("wiki: MODIFYJOB jobid %u is finished", jobid); return ESLURM_DISABLED; } if (depend_ptr) { int rc = update_job_dependency(job_ptr, depend_ptr); if (rc == SLURM_SUCCESS) { info("wiki: changed job %u dependency to %s", jobid, depend_ptr); } else { error("wiki: changing job %u dependency to %s", jobid, depend_ptr); return EINVAL; } } if (new_time_limit) { time_t old_time = job_ptr->time_limit; job_ptr->time_limit = new_time_limit; info("wiki: change job %u time_limit to %u", jobid, new_time_limit); /* Update end_time based upon change * to preserve suspend time info */ job_ptr->end_time = job_ptr->end_time + ((job_ptr->time_limit - old_time) * 60); last_job_update = time(NULL); } if (bank_ptr) { if (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS) return EINVAL; else update_accounting = true; } if (new_hostlist) { int rc = 0, task_cnt; hostlist_t hl; char *tasklist; if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) { /* Job is done, nothing to reset */ if (new_hostlist == '\0') goto host_fini; error("wiki: MODIFYJOB tasklist of non-pending " "job %u", jobid); return ESLURM_DISABLED; } xfree(job_ptr->details->req_nodes); FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); if (new_hostlist == '\0') goto host_fini; tasklist = moab2slurm_task_list(new_hostlist, &task_cnt); if (tasklist == NULL) { rc = 1; goto host_fini; } hl = hostlist_create(tasklist); if (hl == 0) { rc = 1; goto host_fini; } hostlist_uniq(hl); hostlist_sort(hl); job_ptr->details->req_nodes = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); if (job_ptr->details->req_nodes == NULL) 
{ rc = 1; goto host_fini; } if (node_name2bitmap(job_ptr->details->req_nodes, false, &job_ptr->details->req_node_bitmap)) { rc = 1; goto host_fini; } host_fini: if (rc) { info("wiki: change job %u invalid hostlist %s", jobid, new_hostlist); xfree(job_ptr->details->req_nodes); return EINVAL; } else { info("wiki: change job %u hostlist %s", jobid, new_hostlist); update_accounting = true; } } if (part_name_ptr) { struct part_record *part_ptr; if (!IS_JOB_PENDING(job_ptr)) { error("wiki: MODIFYJOB partition of non-pending " "job %u", jobid); return ESLURM_DISABLED; } part_ptr = find_part_record(part_name_ptr); if (part_ptr == NULL) { error("wiki: MODIFYJOB has invalid partition %s", part_name_ptr); return ESLURM_INVALID_PARTITION_NAME; } info("wiki: change job %u partition %s", jobid, part_name_ptr); xfree(job_ptr->partition); job_ptr->partition = xstrdup(part_name_ptr); job_ptr->part_ptr = part_ptr; last_job_update = time(NULL); update_accounting = true; } if (new_node_cnt) { if (IS_JOB_PENDING(job_ptr) && job_ptr->details) { job_ptr->details->min_nodes = new_node_cnt; if (job_ptr->details->max_nodes && (job_ptr->details->max_nodes < new_node_cnt)) job_ptr->details->max_nodes = new_node_cnt; info("wiki: change job %u min_nodes to %u", jobid, new_node_cnt); last_job_update = time(NULL); update_accounting = true; } else { error("wiki: MODIFYJOB node count of non-pending " "job %u", jobid); return ESLURM_DISABLED; } } if (update_accounting) { /* Update job record in accounting to reflect changes */ jobacct_storage_job_start_direct(acct_db_conn, job_ptr); } return SLURM_SUCCESS; }
/*****************************************************************************\ * parse_wiki_config - Results go into global variables * RET SLURM_SUCESS or error code * * wiki_conf options * JobPriority=hold|run * AuthKey=number \*****************************************************************************/ extern int parse_wiki_config(void) { s_p_options_t options[] = { {"AuthKey", S_P_STRING}, {"EHost", S_P_STRING}, {"EHostBackup", S_P_STRING}, {"EPort", S_P_UINT16}, {"ExcludePartitions", S_P_STRING}, {"HidePartitionJobs", S_P_STRING}, {"HidePartitionNodes", S_P_STRING}, {"HostFormat", S_P_UINT16}, {"JobAggregationTime", S_P_UINT16}, {"JobPriority", S_P_STRING}, {NULL} }; s_p_hashtbl_t *tbl; char *exclude_partitions, *hide_partitions, *hide_part_nodes; char *key = NULL, *priority_mode = NULL, *wiki_conf; struct stat buf; slurm_ctl_conf_t *conf; int i; /* Set default values */ for (i=0; i<EXC_PART_CNT; i++) exclude_part_ptr[i] = NULL; for (i=0; i<HIDE_PART_CNT; i++) hide_part_ptr[i] = NULL; for (i=0; i<HIDE_PART_CNT; i++) hide_part_nodes_ptr[i] = NULL; conf = slurm_conf_lock(); strncpy(e_host, conf->control_addr, sizeof(e_host)); if (conf->backup_addr) { strncpy(e_host_bu, conf->backup_addr, sizeof(e_host)); } kill_wait = conf->kill_wait; slurm_conf_unlock(); wiki_conf = get_extra_conf_path("wiki.conf"); if ((wiki_conf == NULL) || (stat(wiki_conf, &buf) == -1)) { debug("No wiki.conf file (%s)", wiki_conf); xfree(wiki_conf); return SLURM_SUCCESS; } debug("Reading wiki.conf file (%s)",wiki_conf); tbl = s_p_hashtbl_create(options); if (s_p_parse_file(tbl, NULL, wiki_conf, false) == SLURM_ERROR) fatal("something wrong with opening/reading wiki.conf file"); if (! 
s_p_get_string(&key, "AuthKey", tbl)) debug("Warning: No wiki_conf AuthKey specified"); else { strncpy(auth_key, key, sizeof(auth_key)); xfree(key); } if ( s_p_get_string(&key, "EHost", tbl)) { strncpy(e_host, key, sizeof(e_host)); xfree(key); } else debug("wiki: Using ControlAddr for EHost value"); if ( s_p_get_string(&key, "EHostBackup", tbl)) { strncpy(e_host_bu, key, sizeof(e_host_bu)); xfree(key); } s_p_get_uint16(&e_port, "EPort", tbl); if (s_p_get_uint16(&job_aggregation_time, "JobAggregationTime", tbl)) error("JobAggregationTime not used by sched/wiki"); if (s_p_get_uint16(&host_format, "HostFormat", tbl)) error("HostFormat not used by sched/wiki"); if (s_p_get_string(&exclude_partitions, "ExcludePartitions", tbl)) { char *tok = NULL, *tok_p = NULL; tok = strtok_r(exclude_partitions, ",", &tok_p); i = 0; while (tok) { if (i >= EXC_PART_CNT) { error("ExcludePartitions has too many entries " "skipping %s and later entries", tok); break; } exclude_part_ptr[i] = find_part_record(tok); if (exclude_part_ptr[i]) i++; else error("ExcludePartitions %s not found", tok); tok = strtok_r(NULL, ",", &tok_p); } } if (s_p_get_string(&hide_partitions, "HidePartitionJobs", tbl)) { char *tok = NULL, *tok_p = NULL; tok = strtok_r(hide_partitions, ",", &tok_p); i = 0; while (tok) { if (i >= HIDE_PART_CNT) { error("HidePartitionJobs has too many entries " "skipping %s and later entries", tok); break; } hide_part_ptr[i] = find_part_record(tok); if (hide_part_ptr[i]) i++; else error("HidePartitionJobs %s not found", tok); tok = strtok_r(NULL, ",", &tok_p); } } if (s_p_get_string(&hide_part_nodes, "HidePartitionNodes", tbl)) { char *tok = NULL, *tok_p = NULL; tok = strtok_r(hide_part_nodes, ",", &tok_p); i = 0; while (tok) { if (i >= HIDE_PART_CNT) { error("HidePartitionNodes has too many entries " "skipping %s and later entries", tok); break; } hide_part_nodes_ptr[i] = find_part_record(tok); if (hide_part_nodes_ptr[i]) i++; else error("HidePartitionNodes %s not found", tok); tok = 
strtok_r(NULL, ",", &tok_p); } } if (s_p_get_string(&priority_mode, "JobPriority", tbl)) { if (strcasecmp(priority_mode, "hold") == 0) init_prio_mode = PRIO_HOLD; else if (strcasecmp(priority_mode, "run") == 0) init_prio_mode = PRIO_DECREMENT; else error("Invalid value for JobPriority in wiki.conf"); xfree(priority_mode); } s_p_hashtbl_destroy(tbl); xfree(wiki_conf); #if _DEBUG info("AuthKey = %s", auth_key); info("EHost = %s", e_host); info("EHostBackup = %s", e_host_bu); info("EPort = %u", e_port); info("JobAggregationTime = %u sec", job_aggregation_time); info("JobPriority = %s", init_prio_mode ? "run" : "hold"); info("KillWait = %u sec", kill_wait); for (i=0; i<EXC_PART_CNT; i++) { if (!exclude_part_ptr[i]) continue; info("ExcludePartitions = %s", exclude_part_ptr[i]->name); } for (i=0; i<HIDE_PART_CNT; i++) { if (!hide_part_ptr[i]) continue; info("HidePartitionJobs = %s", hide_part_ptr[i]->name); } for (i=0; i<HIDE_PART_CNT; i++) { if (!hide_part_nodes_ptr[i]) continue; info("HidePartitionNodes = %s", hide_part_nodes_ptr[i]->name); } #endif return SLURM_SUCCESS; }
static spare_node_resv_t *_xlate_hot_spares(char *spare_str, int *spare_cnt) { char *tok, *tmp_str, *save_ptr = NULL; char *part, *sep; int i, node_cnt = 0; spare_node_resv_t *spare_ptr = NULL; struct part_record *part_ptr = NULL; /* Locks: Read partition */ slurmctld_lock_t part_read_lock = { NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; *spare_cnt = 0; if ((spare_str == NULL) || (spare_str[0] == '\0')) return spare_ptr; tmp_str = xstrdup(spare_str); tok = strtok_r(tmp_str, ",", &save_ptr); lock_slurmctld(part_read_lock); while (tok) { static bool dup = false; part = xstrdup(tok); sep = strchr(part, ':'); if (sep) { node_cnt = atoi(sep + 1); sep[0] = '\0'; part_ptr = find_part_record(part); if ((*spare_cnt > 0) && (spare_ptr == NULL)) { /* Avoid CLANG error */ fatal("%s: spare array is NULL with size=%d", __func__, *spare_cnt); return spare_ptr; } for (i = 0; i < *spare_cnt; i++) { if (spare_ptr[i].part_ptr != part_ptr) continue; dup = true; break; } } if ((sep == NULL) || (node_cnt < 0)) { error("nonstop.conf: Ignoring invalid HotSpare (%s)", tok); } else if (dup) { info("nonstop.conf: Ignoring HotSpare (%s): " "Duplicate partition record", tok); } else if (node_cnt == 0) { info("nonstop.conf: Ignoring HotSpare (%s): " "Node count is zero", tok); } else if (part_ptr == NULL) { error("nonstop.conf: Ignoring invalid HotSpare (%s):" "Partition not found", tok); } else { xrealloc(spare_ptr, (sizeof(spare_node_resv_t) * (*spare_cnt + 1))); spare_ptr[*spare_cnt].node_cnt = node_cnt; spare_ptr[*spare_cnt].partition = part; part = NULL; /* Nothing left to free */ spare_ptr[*spare_cnt].part_ptr = part_ptr; (*spare_cnt)++; } xfree(part); tok = strtok_r(NULL, ",", &save_ptr); } unlock_slurmctld(part_read_lock); xfree(tmp_str); return spare_ptr; }
/*
 * _do_power_work - Perform any power change work to nodes
 * IN now - time used for every comparison and timestamp in this pass
 *
 * One pass over the node table: nodes in power save state that satisfy the
 * resume test are collected and handed to _do_resume(); nodes that satisfy
 * the suspend test are collected and handed to _do_suspend(). The number of
 * nodes changed per pass is limited by resume_rate and suspend_rate, where
 * a rate of zero means no limit.
 *
 * The first call (last_work_scan == 0) also builds exc_node_bitmap from
 * exc_nodes and exc_parts. Unlike the "power_save module disabled" errors
 * found elsewhere in this module, bad names are only logged and ignored
 * here.
 */
static void _do_power_work(time_t now)
{
	/* Preserved between calls: time of the last summary log message
	 * and time of the last scan */
	static time_t last_log = 0, last_work_scan = 0;
	/* NOTE(review): wake_cnt and sleep_cnt are incremented below but
	 * never read in this function */
	int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
	time_t delta_t;
	uint32_t susp_state;
	bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
	struct node_record *node_ptr;
	bool run_suspend = false;

	/* First call only: build the bitmap of nodes excluded from suspend */
	if (last_work_scan == 0) {
		if (exc_nodes &&
		    (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
			error("Invalid SuspendExcNodes %s ignored", exc_nodes);
		}

		if (exc_parts) {
			char *tmp = NULL, *one_part = NULL, *part_list = NULL;
			struct part_record *part_ptr = NULL;

			/* strtok_r() writes into its input, so tokenize a
			 * copy */
			part_list = xstrdup(exc_parts);
			one_part = strtok_r(part_list, ",", &tmp);
			while (one_part != NULL) {
				part_ptr = find_part_record(one_part);
				if (!part_ptr) {
					error("Invalid SuspendExcPart %s ignored",
					      one_part);
				} else if (exc_node_bitmap) {
					bit_or(exc_node_bitmap,
					       part_ptr->node_bitmap);
				} else {
					exc_node_bitmap =
						bit_copy(part_ptr->node_bitmap);
				}
				one_part = strtok_r(NULL, ",", &tmp);
			}
			xfree(part_list);
		}

		if (exc_node_bitmap) {
			char *tmp = bitmap2node_name(exc_node_bitmap);
			info("power_save module, excluded nodes %s", tmp);
			xfree(tmp);
		}
	}

	/* Set limit on counts of nodes to have state changed */
	/* The running counts are cleared once 60 or more seconds have
	 * passed since the last scan; otherwise they are scaled down by
	 * the fraction of those 60 seconds that has elapsed */
	delta_t = now - last_work_scan;
	if (delta_t >= 60) {
		suspend_cnt_f = 0.0;
		resume_cnt_f  = 0.0;
	} else {
		float rate = (60 - delta_t) / 60.0;
		suspend_cnt_f *= rate;
		resume_cnt_f  *= rate;
	}
	/* Round the floating point counts to the nearest integer */
	suspend_cnt = (suspend_cnt_f + 0.5);
	resume_cnt  = (resume_cnt_f  + 0.5);

	if (now > (last_suspend + suspend_timeout)) {
		/* ready to start another round of node suspends */
		run_suspend = true;
		if (last_suspend) {
			/* Forget which nodes the previous round touched */
			bit_nclear(suspend_node_bitmap, 0,
				   (node_record_count - 1));
			bit_nclear(resume_node_bitmap, 0,
				   (node_record_count - 1));
			last_suspend = (time_t) 0;
		}
	}

	last_work_scan = now;

	/* Build bitmaps identifying each node which should change state */
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		susp_state = IS_NODE_POWER_SAVE(node_ptr);

		if (susp_state)
			susp_total++;

		/* Resume nodes as appropriate */
		/* Candidate: in power save state, under the resume rate
		 * limit, not marked in suspend_node_bitmap, and either
		 * allocated or idle only since (now - idle_time) */
		if (susp_state &&
		    ((resume_rate == 0) || (resume_cnt < resume_rate))	&&
		    (bit_test(suspend_node_bitmap, i) == 0)		&&
		    (IS_NODE_ALLOCATED(node_ptr) ||
		     (node_ptr->last_idle > (now - idle_time)))) {
			if (wake_node_bitmap == NULL) {
				wake_node_bitmap =
					bit_alloc(node_record_count);
			}
			wake_cnt++;
			resume_cnt++;
			resume_cnt_f++;
			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
			node_ptr->node_state |=   NODE_STATE_POWER_UP;
			node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
			bit_clear(power_node_bitmap, i);
			bit_clear(avail_node_bitmap, i);
			/* last_response is set resume_timeout seconds into
			 * the future */
			node_ptr->last_response = now + resume_timeout;
			bit_set(wake_node_bitmap,    i);
			bit_set(resume_node_bitmap,  i);
		}

		/* Suspend nodes as appropriate */
		/* Candidate: a suspend round is open, the node is not in
		 * power save state, the suspend rate limit has room, the
		 * node is idle or down with no suspended jobs, is neither
		 * completing nor powering up, has been idle for longer
		 * than idle_time, and is not excluded */
		if (run_suspend 					&&
		    (susp_state == 0)					&&
		    ((suspend_rate == 0) || (suspend_cnt < suspend_rate)) &&
		    (IS_NODE_IDLE(node_ptr) || IS_NODE_DOWN(node_ptr))	&&
		    (node_ptr->sus_job_cnt == 0)			&&
		    (!IS_NODE_COMPLETING(node_ptr))			&&
		    (!IS_NODE_POWER_UP(node_ptr))			&&
		    (node_ptr->last_idle != 0)				&&
		    (node_ptr->last_idle < (now - idle_time))		&&
		    ((exc_node_bitmap == NULL) ||
		     (bit_test(exc_node_bitmap, i) == 0))) {
			if (sleep_node_bitmap == NULL) {
				sleep_node_bitmap =
					bit_alloc(node_record_count);
			}
			sleep_cnt++;
			suspend_cnt++;
			suspend_cnt_f++;
			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			/* Nodes that are down, draining or failing are not
			 * added to avail_node_bitmap */
			if (!IS_NODE_DOWN(node_ptr) &&
			    !IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr))
				bit_set(avail_node_bitmap, i);
			bit_set(power_node_bitmap,   i);
			bit_set(sleep_node_bitmap,   i);
			bit_set(suspend_node_bitmap, i);
			last_suspend = now;
		}
	}
	/* Summary message, logged only when more than 600 seconds have
	 * passed since the previous one */
	if (((now - last_log) > 600) && (susp_total > 0)) {
		info("Power save mode: %d nodes", susp_total);
		last_log = now;
	}

	/* Hand the collected node names to the suspend handler */
	if (sleep_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(sleep_node_bitmap);
		if (nodes)
			_do_suspend(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(sleep_node_bitmap);
		/* last_node_update could be changed already by another thread!
		last_node_update = now; */
	}

	/* Hand the collected node names to the resume handler */
	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes)
			_do_resume(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(wake_node_bitmap);
		/* last_node_update could be changed already by another thread!
		last_node_update = now; */
	}
}