/* Attempt to shutdown all nodes in a single capmc call. * RET 0 on success, -1 on failure */ static int _update_all_nodes(char *node_names) { char *argv[10], *nid_list, *resp_msg; int rc = -1, status = 0; nid_list = _node_names_2_nid_list(node_names); if (nid_list == NULL) return -1; /* Request node power down. * Example: "capmc node_off –n 43" */ argv[0] = "capmc"; argv[1] = "node_off"; argv[2] = "-n"; argv[3] = nid_list; argv[4] = NULL; resp_msg = _run_script(argv, &status); if ((status == 0) || (resp_msg && strcasestr(resp_msg, "Success"))) { debug("%s: node_off sent to %s", prog_name, argv[3]); rc = 0; } else { error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); } xfree(resp_msg); xfree(nid_list); return rc; }
static void *_node_update(void *args) { char *node_name = (char *) args; char *argv[10], nid_str[32], *resp_msg; int i, nid = -1, status = 0; bool node_state_ok, node_off_sent = false; time_t poll_start; for (i = 0; node_name[i]; i++) { if ((node_name[i] >= '0') && (node_name[i] <= '9')) { nid = strtol(node_name + i, NULL, 10); break; } } if (nid < 0) { error("%s: No valid NID: %s", prog_name, node_name); return NULL; } snprintf(nid_str, sizeof(nid_str), "%d", nid); /* Request node power down. * Example: "capmc node_off –n 43" */ argv[0] = "capmc"; argv[1] = "node_off"; argv[2] = "-n"; argv[3] = nid_str; argv[4] = NULL; for (i = 0; ((i < NODE_OFF_RETRIES) && !node_off_sent); i++) { resp_msg = _run_script(argv, &status); if ((status != 0) || (resp_msg && (strcasestr(resp_msg, "Success") == NULL))) { error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); sleep(1); } else { debug("%s: node_off sent to %s", prog_name, nid_str); node_off_sent = true; } xfree(resp_msg); } /* Wait for node in "off" state */ poll_start = time(NULL); while (!node_state_ok && (difftime(time(NULL), poll_start) < NODE_OFF_STATE_WAIT)) { sleep(capmc_poll_freq); node_state_ok = _check_node_state(nid, nid_str, "off"); } slurm_mutex_lock(&thread_cnt_mutex); thread_cnt--; pthread_cond_signal(&thread_cnt_cond); slurm_mutex_unlock(&thread_cnt_mutex); return NULL; }
/* * Trigger a job's burst buffer stage-out to begin * * Returns a SLURM errno. */ extern int bb_p_job_start_stage_out(struct job_record *job_ptr) { bb_alloc_t *bb_ptr; char **script_argv, *resp; int i; if (debug_flag) { info("%s: %s", __func__, plugin_type); info("%s: job_id:%u", __func__, job_ptr->job_id); } if ((job_ptr->burst_buffer == NULL) || (job_ptr->burst_buffer[0] == '\0') || (_get_bb_size(job_ptr) == 0)) return SLURM_SUCCESS; pthread_mutex_lock(&bb_mutex); bb_ptr = _find_bb_job_rec(job_ptr); if (!bb_ptr) { /* No job buffers. Assuming use of persistent buffers only */ debug("%s: job_id:%u bb_rec not found", __func__, job_ptr->job_id); } else { script_argv = _build_stage_args(start_stage_out, "stage_out", job_ptr); if (script_argv) { bb_ptr->state = BB_STATE_STAGING_OUT; resp = _run_script("StartStageOut", start_stage_out, script_argv, -1); if (resp) { error("%s: StartStageOut: %s", __func__, resp); xfree(resp); } for (i = 0; script_argv[i]; i++) xfree(script_argv[i]); xfree(script_argv); } else { bb_ptr->state = BB_STATE_STAGED_OUT; } } pthread_mutex_unlock(&bb_mutex); return SLURM_SUCCESS; }
static bool _check_node_state(int nid, char *nid_str, char *state) { bool node_state_ok = false; char *argv[10], *resp_msg; int i, nid_cnt, status = 0; uint32_t *nid_array; json_object *j; argv[0] = "capmc"; argv[1] = "node_status"; argv[2] = "-n"; argv[3] = nid_str; argv[4] = NULL; resp_msg = _run_script(argv, &status); if (status != 0) { error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); xfree(resp_msg); return node_state_ok; } j = json_tokener_parse(resp_msg); if (j == NULL) { error("%s: json parser failed on %s", prog_name, resp_msg); xfree(resp_msg); return node_state_ok; } xfree(resp_msg); nid_cnt = 0; nid_array = _json_parse_nids(j, "off", &nid_cnt); json_object_put(j); /* Frees json memory */ for (i = 0; i < nid_cnt; i++) { if (nid_array[i] == nid) { node_state_ok = true; break; } } xfree(nid_array); return node_state_ok; }
/* Determine the current actual burst buffer state. * Run the program "get_sys_state" and parse stdout for details. */ static void _load_state(void) { static uint32_t last_total_space = 0; char *save_ptr = NULL, *tok, *leftover = NULL, *resp, *tmp = NULL; char *script_args[3] = { NULL, "get_sys", NULL }; s_p_hashtbl_t *state_hashtbl = NULL; static s_p_options_t state_options[] = { {"ENOENT", S_P_STRING}, {"UserID", S_P_ARRAY, _parse_job_info, _destroy_job_info}, {"TotalSize", S_P_STRING}, {NULL} }; tok = strrchr(get_sys_state, '/'); if (tok) script_args[0] = tok + 1; else script_args[0] = get_sys_state; resp = _run_script("GetSysState", get_sys_state, script_args, 10); if (resp == NULL) return; state_hashtbl = s_p_hashtbl_create(state_options); tok = strtok_r(resp, "\n", &save_ptr); while (tok) { s_p_parse_line(state_hashtbl, tok, &leftover); tok = strtok_r(NULL, "\n", &save_ptr); } if (s_p_get_string(&tmp, "TotalSize", state_hashtbl)) { total_space = _get_size_num(tmp); xfree(tmp); } else { error("%s: GetSysState failed to respond with TotalSize", plugin_type); } s_p_hashtbl_destroy(state_hashtbl); if (debug_flag && (total_space != last_total_space)) info("%s: total_space:%u", __func__, total_space); last_total_space = total_space; }
/* Wait for all identified computed nodes to enter "on" state */ static void _wait_all_nodes_on(void) { char *argv[10], *resp_msg; int i, nid_cnt = 0, status = 0; json_object *j; uint32_t *nid_array; time_t start_time = time(NULL); while ((difftime(time(NULL), start_time) < (30 * 60)) && (bit_set_count(node_bitmap) > 0)) { sleep(20); argv[0] = "capmc"; argv[1] = "node_status"; argv[2] = NULL; resp_msg = _run_script(argv, &status); if (status != 0) { error("%s: capmc(%s,%s,%s): %d %s", log_file, argv[1], argv[2], argv[3], status, resp_msg); break; } j = json_tokener_parse(resp_msg); if (j == NULL) { error("%s: json parser failed on %s", log_file, resp_msg); xfree(resp_msg); break; } xfree(resp_msg); nid_cnt = 0; nid_array = _json_parse_nids(j, "on", &nid_cnt); json_object_put(j); /* Frees json memory */ for (i = 0; i < nid_cnt; i++) { bit_clear(node_bitmap, nid_array[i]); } xfree(nid_array); } }
static void *_node_update(void *args) { char *node_name = (char *) args; char *argv[10], nid_str[32], *resp_msg; int i, nid = -1, status = 0; bool node_state_ok; for (i = 0; node_name[i]; i++) { if ((node_name[i] >= '0') && (node_name[i] <= '9')) { nid = strtol(node_name + i, NULL, 10); break; } } if (nid < 0) { error("%s: No valid NID: %s", log_file, node_name); goto fini; } snprintf(nid_str, sizeof(nid_str), "%d", nid); if (mcdram_mode) { /* Update MCDRAM mode. * Example: "capmc set_mcdram_cfg –n 43 –m cache" */ argv[0] = "capmc"; argv[1] = "set_mcdram_cfg"; argv[2] = "-n"; argv[3] = nid_str; argv[4] = "-m"; argv[5] = mcdram_mode; argv[6] = NULL; resp_msg = _run_script(argv, &status); if (status != 0) { error("%s: capmc(%s,%s,%s,%s,%s): %d %s", log_file, argv[1], argv[2], argv[3], argv[4], argv[5], status, resp_msg); } xfree(resp_msg); } if (numa_mode) { /* Update NUMA mode. * Example: "capmc set_numa_cfg –n 43 –m a2a" */ argv[0] = "capmc"; argv[1] = "set_numa_cfg"; argv[2] = "-n"; argv[3] = nid_str; argv[4] = "-m"; argv[5] = numa_mode; argv[6] = NULL; resp_msg = _run_script(argv, &status); if (status != 0) { error("%s: capmc(%s,%s,%s,%s,%s): %d %s", log_file, argv[1], argv[2], argv[3], argv[4], argv[5], status, resp_msg); } xfree(resp_msg); } /* Test if already in "off" state */ node_state_ok = _check_node_state(nid, nid_str, "off"); /* Request node power down. * Example: "capmc node_off –n 43" */ if (!node_state_ok) { argv[0] = "capmc"; argv[1] = "node_off"; argv[2] = "-n"; argv[3] = nid_str; argv[4] = NULL; resp_msg = _run_script(argv, &status); if (status != 0) { error("%s: capmc(%s,%s,%s): %d %s", log_file, argv[1], argv[2], argv[3], status, resp_msg); } xfree(resp_msg); } /* Wait for node in "off" state */ while (!node_state_ok) { sleep(2); node_state_ok = _check_node_state(nid, nid_str, "off"); } /* Request node power up. * Example: "capmc node_up –n 43" */ argv[0] = "capmc"; argv[1] = "node_up"; argv[2] = "-n"; argv[3] = nid_str; argv[4] = NULL; resp_msg = _run_script(argv, &status); if (status != 0) { error("%s: capmc(%s,%s,%s): %d %s", log_file, argv[1], argv[2], argv[3], status, resp_msg); } xfree(resp_msg); fini: slurm_mutex_lock(&thread_cnt_mutex); thread_cnt--; pthread_cond_signal(&thread_cnt_cond); slurm_mutex_unlock(&thread_cnt_mutex); return NULL; }