/* run_backup - this is the backup controller; it runs in standby
 *	mode, assuming control when the primary controller stops responding */
void run_backup(slurm_trigger_callbacks_t *callbacks)
{
	int i;
	uint32_t trigger_type;
	time_t last_ping = 0;
	pthread_attr_t thread_attr_sig, thread_attr_rpc;
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	slurmctld_lock_t config_write_lock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

	info("slurmctld running in background mode");
	takeover = false;
	last_controller_response = time(NULL);

	/* default: don't resume if shutdown */
	slurmctld_config.resume_backup = false;
	if (xsignal_block(backup_sigarray) < 0)
		error("Unable to block signals");

	/*
	 * create attached thread to process RPCs
	 */
	slurm_attr_init(&thread_attr_rpc);
	while (pthread_create(&slurmctld_config.thread_id_rpc,
			      &thread_attr_rpc, _background_rpc_mgr, NULL)) {
		error("pthread_create error %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_rpc);

	/*
	 * create attached thread for signal handling
	 */
	slurm_attr_init(&thread_attr_sig);
	while (pthread_create(&slurmctld_config.thread_id_sig,
			      &thread_attr_sig, _background_signal_hand,
			      NULL)) {
		error("pthread_create %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_sig);
	trigger_type = TRIGGER_TYPE_BU_CTLD_RES_OP;
	_trigger_slurmctld_event(trigger_type);

	for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) {
		sleep(1);	/* Give the primary slurmctld set-up time */
	}

	/* repeatedly ping ControlMachine */
	while (slurmctld_config.shutdown_time == 0) {
		sleep(1);
		/* Lock of slurmctld_conf below not important */
		if (slurmctld_conf.slurmctld_timeout &&
		    (takeover == false) &&
		    (difftime(time(NULL), last_ping) <
		     (slurmctld_conf.slurmctld_timeout / 3)))
			continue;

		last_ping = time(NULL);
		if (_ping_controller() == 0)
			last_controller_response = time(NULL);
		else if (takeover) {
			/* in takeover mode, take control as soon as
			 * the primary no longer responds */
			break;
		} else {
			uint32_t timeout;
			lock_slurmctld(config_read_lock);
			timeout = slurmctld_conf.slurmctld_timeout;
			unlock_slurmctld(config_read_lock);

			if (difftime(time(NULL), last_controller_response) >
			    timeout) {
				break;
			}
		}
	}

	if (slurmctld_config.shutdown_time != 0) {
		/* Since pidfile is created as user root (its owner is
		 * changed to SlurmUser) SlurmUser may not be able to
		 * remove it, so this is not necessarily an error.
		 * No longer need slurmctld_conf lock after above join. */
		if (unlink(slurmctld_conf.slurmctld_pidfile) < 0)
			verbose("Unable to remove pidfile '%s': %m",
				slurmctld_conf.slurmctld_pidfile);

		info("BackupController terminating");
		pthread_join(slurmctld_config.thread_id_sig, NULL);
		log_fini();
		if (dump_core)
			abort();
		else
			exit(0);
	}

	lock_slurmctld(config_read_lock);
	error("ControlMachine %s not responding, "
	      "BackupController %s taking over",
	      slurmctld_conf.control_machine,
	      slurmctld_conf.backup_controller);
	unlock_slurmctld(config_read_lock);

	backup_slurmctld_restart();
	trigger_primary_ctld_fail();
	trigger_backup_ctld_as_ctrl();

	pthread_kill(slurmctld_config.thread_id_sig, SIGTERM);
	pthread_join(slurmctld_config.thread_id_sig, NULL);
	pthread_join(slurmctld_config.thread_id_rpc, NULL);
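	/*
	 * From this point the backup runs as the acting controller: the
	 * state below is rebuilt from saved state (StateSaveLocation)
	 * rather than carried over from standby mode.
	 */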
	/* The job list needs to be freed before we run
	 * ctld_assoc_mgr_init; it should be empty here in the first place.
	 */
	lock_slurmctld(config_write_lock);
	job_fini();
	init_job_conf();
	unlock_slurmctld(config_write_lock);

	ctld_assoc_mgr_init(callbacks);

	/* clear old state and read new state */
	lock_slurmctld(config_write_lock);
	if (switch_g_restore(slurmctld_conf.state_save_location, true)) {
		error("failed to restore switch state");
		abort();
	}
	if (read_slurm_conf(2, false)) {	/* Recover all state */
		error("Unable to recover slurm state");
		abort();
	}
	slurmctld_config.shutdown_time = (time_t) 0;
	unlock_slurmctld(config_write_lock);
	select_g_select_nodeinfo_set_all();

	return;
}
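/*
 * Illustrative sketch, not part of the Slurm source: the two timing rules
 * driving run_backup()'s ping loop above, isolated as pure helpers.  The
 * names _ping_due() and _takeover_due() are hypothetical, and bool assumes
 * <stdbool.h>; the logic mirrors the loop: ping the primary roughly every
 * SlurmctldTimeout / 3 seconds (every pass if the timeout is zero or in
 * takeover mode, where one failed ping suffices), and assume control once
 * the primary has been silent for longer than SlurmctldTimeout.
 */
static bool _ping_due(time_t now, time_t last_ping, uint32_t timeout,
		      bool takeover)
{
	/* With a zero timeout, or in takeover mode, ping on every pass;
	 * otherwise send at most one ping per timeout / 3 seconds. */
	return (timeout == 0) || takeover ||
	       (difftime(now, last_ping) >= (timeout / 3));
}

static bool _takeover_due(time_t now, time_t last_response, uint32_t timeout)
{
	/* Take over only after the primary has failed to respond for
	 * longer than SlurmctldTimeout seconds. */
	return difftime(now, last_response) > timeout;
}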
/* Spawn health check function for every node that is not DOWN */
extern void run_health_check(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr;
#else
	struct node_record *node_ptr;
	int node_test_cnt = 0, node_limit, node_states, run_cyclic;
	static int base_node_loc = -1;
	static time_t cycle_start_time = (time_t) 0;
#endif
	int i;
	char *host_str = NULL;
	agent_arg_t *check_agent_args = NULL;

	/* Sync plugin internal data with node select_nodeinfo.  This is
	 * important after reconfig; otherwise select_nodeinfo will not
	 * return the correct number of allocated cpus. */
	select_g_select_nodeinfo_set_all();

	check_agent_args = xmalloc (sizeof (agent_arg_t));
	check_agent_args->msg_type = REQUEST_HEALTH_CHECK;
	check_agent_args->retry = 0;
	check_agent_args->hostlist = hostlist_create(NULL);
#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if (IS_NODE_NO_RESPOND(front_end_ptr))
			continue;
		hostlist_push_host(check_agent_args->hostlist,
				   front_end_ptr->name);
		check_agent_args->node_count++;
	}
#else
	run_cyclic = slurmctld_conf.health_check_node_state &
		     HEALTH_CHECK_CYCLE;
	node_states = slurmctld_conf.health_check_node_state &
		      (~HEALTH_CHECK_CYCLE);
	if (run_cyclic) {
		time_t now = time(NULL);
		if (cycle_start_time == (time_t) 0)
			cycle_start_time = now;
		else if (base_node_loc >= 0)
			;	/* mid-cycle */
		else if (difftime(now, cycle_start_time) <
			 slurmctld_conf.health_check_interval) {
			return;	/* Wait to start next cycle */
		}
		cycle_start_time = now;
		/* Determine how many nodes we want to test on each call of
		 * run_health_check() to spread out the work. */
		node_limit = (node_record_count * 2) /
			     slurmctld_conf.health_check_interval;
		node_limit = MAX(node_limit, 10);
	}
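	/*
	 * Worked example (illustrative numbers, not from the code): with
	 * node_record_count = 1000 and HealthCheckInterval = 300 seconds,
	 * node_limit = (1000 * 2) / 300 = 6, which MAX() raises to the
	 * floor of 10, so each call tests 10 nodes and a full cycle over
	 * all 1000 nodes completes in roughly 100 calls.
	 */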
	if ((node_states != HEALTH_CHECK_NODE_ANY) &&
	    (node_states != HEALTH_CHECK_NODE_IDLE)) {
		/* Update each node's alloc_cpus count */
		select_g_select_nodeinfo_set_all();
	}

	for (i = 0; i < node_record_count; i++) {
		if (run_cyclic) {
			if (node_test_cnt++ >= node_limit)
				break;
			base_node_loc++;
			if (base_node_loc >= node_record_count) {
				base_node_loc = -1;
				break;
			}
			node_ptr = node_record_table_ptr + base_node_loc;
		} else {
			node_ptr = node_record_table_ptr + i;
		}
		if (IS_NODE_NO_RESPOND(node_ptr) ||
		    IS_NODE_FUTURE(node_ptr) ||
		    IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if (node_states != HEALTH_CHECK_NODE_ANY) {
			uint16_t cpus_total, cpus_used = 0;
			if (slurmctld_conf.fast_schedule) {
				cpus_total = node_ptr->config_ptr->cpus;
			} else {
				cpus_total = node_ptr->cpus;
			}
			if (!IS_NODE_IDLE(node_ptr)) {
				select_g_select_nodeinfo_get(
					node_ptr->select_nodeinfo,
					SELECT_NODEDATA_SUBCNT,
					NODE_STATE_ALLOCATED,
					&cpus_used);
			}
			/* Here the node state is inferred from
			 * the cpus allocated on it:
			 * - cpus_used == 0
			 *	means the node is idle
			 * - cpus_used < cpus_total
			 *	means the node is in mixed state
			 * - cpus_used == cpus_total
			 *	means the node is allocated
			 */
			if (cpus_used == 0) {
				if (!(node_states & HEALTH_CHECK_NODE_IDLE))
					continue;
				if (!IS_NODE_IDLE(node_ptr))
					continue;
			} else if (cpus_used < cpus_total) {
				if (!(node_states & HEALTH_CHECK_NODE_MIXED))
					continue;
			} else {
				if (!(node_states & HEALTH_CHECK_NODE_ALLOC))
					continue;
			}
		}
		hostlist_push_host(check_agent_args->hostlist,
				   node_ptr->name);
		check_agent_args->node_count++;
	}
	if (run_cyclic && (i >= node_record_count))
		base_node_loc = -1;
#endif

	if (check_agent_args->node_count == 0) {
		hostlist_destroy(check_agent_args->hostlist);
		xfree (check_agent_args);
	} else {
		hostlist_uniq(check_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				check_agent_args->hostlist);
		debug("Spawning health check agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(check_agent_args);
	}
}
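/*
 * Illustrative sketch, not part of the Slurm source: the idle/mixed/alloc
 * classification applied in run_health_check()'s scan loop, isolated as a
 * pure helper.  The name _node_health_class() is hypothetical; the
 * HEALTH_CHECK_NODE_* flags are the real values tested above.  Keeping the
 * classification pure would reduce the mask test in the loop to a single
 * line: if (!(node_states & _node_health_class(used, total))) continue;
 */
static int _node_health_class(uint16_t cpus_used, uint16_t cpus_total)
{
	if (cpus_used == 0)		/* no cpus allocated: idle */
		return HEALTH_CHECK_NODE_IDLE;
	if (cpus_used < cpus_total)	/* partially allocated: mixed */
		return HEALTH_CHECK_NODE_MIXED;
	return HEALTH_CHECK_NODE_ALLOC;	/* fully allocated */
}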