/* main - slurmctld main function, start various threads and process RPCs */ int main(int argc, char *argv[]) { pthread_attr_t thread_attr; char node_name[128]; void *db_conn = NULL; assoc_init_args_t assoc_init_arg; _init_config(); log_init(argv[0], log_opts, LOG_DAEMON, NULL); if (read_slurmdbd_conf()) exit(1); _parse_commandline(argc, argv); _update_logging(true); _update_nice(); if (slurm_auth_init(NULL) != SLURM_SUCCESS) { fatal("Unable to initialize %s authentication plugin", slurmdbd_conf->auth_type); } if (slurm_acct_storage_init(NULL) != SLURM_SUCCESS) { fatal("Unable to initialize %s accounting storage plugin", slurmdbd_conf->storage_type); } _kill_old_slurmdbd(); if (foreground == 0) _daemonize(); /* * Need to create pidfile here in case we setuid() below * (init_pidfile() exits if it can't initialize pid file). * On Linux we also need to make this setuid job explicitly * able to write a core dump. * This also has to happen after daemon(), which closes all fd's, * so we keep the write lock of the pidfile. 
*/ _init_pidfile(); _become_slurm_user(); if (foreground == 0) _set_work_dir(); log_config(); #ifdef PR_SET_DUMPABLE if (prctl(PR_SET_DUMPABLE, 1) < 0) debug ("Unable to set dumpable to 1"); #endif /* PR_SET_DUMPABLE */ if (xsignal_block(dbd_sigarray) < 0) error("Unable to block signals"); /* Create attached thread for signal handling */ slurm_attr_init(&thread_attr); if (pthread_create(&signal_handler_thread, &thread_attr, _signal_handler, NULL)) fatal("pthread_create %m"); slurm_attr_destroy(&thread_attr); registered_clusters = list_create(NULL); slurm_attr_init(&thread_attr); if (pthread_create(&commit_handler_thread, &thread_attr, _commit_handler, NULL)) fatal("pthread_create %m"); slurm_attr_destroy(&thread_attr); memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t)); /* If we are tacking wckey we need to cache wckeys, if we aren't only cache the users, qos */ assoc_init_arg.cache_level = ASSOC_MGR_CACHE_USER | ASSOC_MGR_CACHE_QOS; if (slurmdbd_conf->track_wckey) assoc_init_arg.cache_level |= ASSOC_MGR_CACHE_WCKEY; db_conn = acct_storage_g_get_connection(NULL, 0, true, NULL); if (assoc_mgr_init(db_conn, &assoc_init_arg, errno) == SLURM_ERROR) { error("Problem getting cache of data"); acct_storage_g_close_connection(&db_conn); goto end_it; } if (gethostname_short(node_name, sizeof(node_name))) fatal("getnodename: %m"); while (1) { if (slurmdbd_conf->dbd_backup && (!strcmp(node_name, slurmdbd_conf->dbd_backup) || !strcmp(slurmdbd_conf->dbd_backup, "localhost"))) { info("slurmdbd running in background mode"); have_control = false; backup = true; /* make sure any locks are released */ acct_storage_g_commit(db_conn, 1); run_dbd_backup(); if (!shutdown_time) assoc_mgr_refresh_lists(db_conn); } else if (slurmdbd_conf->dbd_host && (!strcmp(slurmdbd_conf->dbd_host, node_name) || !strcmp(slurmdbd_conf->dbd_host, "localhost"))) { backup = false; have_control = true; } else { fatal("This host not configured to run SlurmDBD " "(%s != %s | (backup) %s)", node_name, 
slurmdbd_conf->dbd_host, slurmdbd_conf->dbd_backup); } if (!shutdown_time) { /* Create attached thread to process incoming RPCs */ slurm_attr_init(&thread_attr); if (pthread_create(&rpc_handler_thread, &thread_attr, rpc_mgr, NULL)) fatal("pthread_create error %m"); slurm_attr_destroy(&thread_attr); } if (!shutdown_time) { /* Create attached thread to do usage rollup */ slurm_attr_init(&thread_attr); if (pthread_create(&rollup_handler_thread, &thread_attr, _rollup_handler, db_conn)) fatal("pthread_create error %m"); slurm_attr_destroy(&thread_attr); } /* Daemon is fully operational here */ if (!shutdown_time || primary_resumed) { shutdown_time = 0; info("slurmdbd version %s started", SLURM_VERSION_STRING); if (backup) run_dbd_backup(); } _request_registrations(db_conn); acct_storage_g_commit(db_conn, 1); /* this is only ran if not backup */ if (rollup_handler_thread) pthread_join(rollup_handler_thread, NULL); if (rpc_handler_thread) pthread_join(rpc_handler_thread, NULL); if (backup && primary_resumed) { shutdown_time = 0; info("Backup has given up control"); } if (shutdown_time) break; } /* Daemon termination handled here */ end_it: if (signal_handler_thread) pthread_join(signal_handler_thread, NULL); if (commit_handler_thread) pthread_join(commit_handler_thread, NULL); acct_storage_g_commit(db_conn, 1); acct_storage_g_close_connection(&db_conn); if (slurmdbd_conf->pid_file && (unlink(slurmdbd_conf->pid_file) < 0)) { verbose("Unable to remove pidfile '%s': %m", slurmdbd_conf->pid_file); } FREE_NULL_LIST(registered_clusters); assoc_mgr_fini(NULL); slurm_acct_storage_fini(); slurm_auth_fini(); log_fini(); free_slurmdbd_conf(); exit(0); }
/*
 * run_backup - body of the backup controller: stay in standby mode,
 *	ping the primary controller periodically and assume control when
 *	it stops responding.
 * IN callbacks - handed to acct_storage_g_get_connection() when no
 *	accounting connection (acct_db_conn) exists at takeover time
 *
 * If a shutdown is requested while in standby, the function does not
 * return: it ends the process with abort() (when dump_core is set) or
 * exit(0).  Otherwise it returns once saved state has been re-read.
 */
void run_backup(slurm_trigger_callbacks_t *callbacks)
{
	slurmctld_lock_t cfg_rlock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	slurmctld_lock_t cfg_wlock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };
	pthread_attr_t rpc_attr, sig_attr;
	time_t prev_ping = 0;
	int settle_secs = 0;

	info("slurmctld running in background mode");
	takeover = false;
	last_controller_response = time(NULL);

	/* default: don't resume if shutdown */
	slurmctld_config.resume_backup = false;
	if (xsignal_block(backup_sigarray) < 0)
		error("Unable to block signals");

	/* Attached thread that services RPCs; retry until it starts */
	slurm_attr_init(&rpc_attr);
	while (pthread_create(&slurmctld_config.thread_id_rpc, &rpc_attr,
			      _background_rpc_mgr, NULL)) {
		error("pthread_create error %m");
		sleep(1);
	}
	slurm_attr_destroy(&rpc_attr);

	/* Attached thread that handles signals; retry until it starts */
	slurm_attr_init(&sig_attr);
	while (pthread_create(&slurmctld_config.thread_id_sig, &sig_attr,
			      _background_signal_hand, NULL)) {
		error("pthread_create %m");
		sleep(1);
	}
	slurm_attr_destroy(&sig_attr);

	_trigger_slurmctld_event(TRIGGER_TYPE_BU_CTLD_RES_OP);

	/* Give the primary slurmctld up to five seconds of set-up time */
	while ((settle_secs < 5) && (slurmctld_config.shutdown_time == 0)) {
		sleep(1);
		settle_secs++;
	}

	/* Repeatedly ping ControlMachine until shutdown or takeover */
	while (slurmctld_config.shutdown_time == 0) {
		uint32_t ctld_timeout;

		sleep(1);

		/* Unlocked read of slurmctld_conf is acceptable here.
		 * Outside takeover mode, ping only once a third of the
		 * timeout has elapsed since the previous ping. */
		if (slurmctld_conf.slurmctld_timeout && !takeover &&
		    (difftime(time(NULL), prev_ping) <
		     (slurmctld_conf.slurmctld_timeout / 3)))
			continue;

		prev_ping = time(NULL);
		if (_ping_controller() == 0) {
			last_controller_response = time(NULL);
			continue;
		}

		/* In takeover mode, take control as soon as the primary
		 * no longer responds */
		if (takeover)
			break;

		lock_slurmctld(cfg_rlock);
		ctld_timeout = slurmctld_conf.slurmctld_timeout;
		unlock_slurmctld(cfg_rlock);

		if (difftime(time(NULL), last_controller_response) >
		    ctld_timeout)
			break;
	}

	if (slurmctld_config.shutdown_time != 0) {
		/* Per the original note, the pidfile is created as root
		 * and then handed to SlurmUser, so SlurmUser may not be
		 * able to remove it; that is why a failure is only
		 * reported at verbose level.  slurmctld_conf is read
		 * without taking the lock here. */
		if (unlink(slurmctld_conf.slurmctld_pidfile) < 0) {
			verbose("Unable to remove pidfile '%s': %m",
				slurmctld_conf.slurmctld_pidfile);
		}

		info("BackupController terminating");
		pthread_join(slurmctld_config.thread_id_sig, NULL);
		log_fini();
		if (dump_core)
			abort();
		exit(0);
	}

	/* The primary is considered down: take over */
	lock_slurmctld(cfg_rlock);
	error("ControlMachine %s not responding, "
	      "BackupController %s taking over",
	      slurmctld_conf.control_machine,
	      slurmctld_conf.backup_controller);
	unlock_slurmctld(cfg_rlock);

	trigger_primary_ctld_fail();
	trigger_backup_ctld_as_ctrl();

	/* Stop the standby threads before acting as the controller */
	pthread_kill(slurmctld_config.thread_id_sig, SIGTERM);
	pthread_join(slurmctld_config.thread_id_sig, NULL);
	pthread_join(slurmctld_config.thread_id_rpc, NULL);

	if (acct_db_conn == NULL) {
		/* Get a connection right away so that it does not
		 * happen too late (race condition) */
		acct_db_conn = acct_storage_g_get_connection(
			callbacks, 0, false, slurmctld_cluster_name);
	}

	/* Clear old state and read new state */
	lock_slurmctld(cfg_wlock);
	job_fini();
	if (switch_g_restore(slurmctld_conf.state_save_location, true) != 0) {
		error("failed to restore switch state");
		abort();
	}
	if (read_slurm_conf(2, false) != 0) {	/* Recover all state */
		error("Unable to recover slurm state");
		abort();
	}
	slurmctld_config.shutdown_time = (time_t) 0;
	unlock_slurmctld(cfg_wlock);

	select_g_select_nodeinfo_set_all();
}