예제 #1
0
파일: slurmdbd.c 프로젝트: lindenb/slurm
/* main - slurmctld main function, start various threads and process RPCs */
int main(int argc, char *argv[])
{
	pthread_attr_t thread_attr;
	char node_name[128];
	void *db_conn = NULL;
	assoc_init_args_t assoc_init_arg;

	_init_config();
	log_init(argv[0], log_opts, LOG_DAEMON, NULL);
	if (read_slurmdbd_conf())
		exit(1);
	_parse_commandline(argc, argv);
	_update_logging(true);
	_update_nice();

	if (slurm_auth_init(NULL) != SLURM_SUCCESS) {
		fatal("Unable to initialize %s authentication plugin",
		      slurmdbd_conf->auth_type);
	}
	if (slurm_acct_storage_init(NULL) != SLURM_SUCCESS) {
		fatal("Unable to initialize %s accounting storage plugin",
		      slurmdbd_conf->storage_type);
	}
	_kill_old_slurmdbd();
	if (foreground == 0)
		_daemonize();

	/*
	 * Need to create pidfile here in case we setuid() below
	 * (init_pidfile() exits if it can't initialize pid file).
	 * On Linux we also need to make this setuid job explicitly
	 * able to write a core dump.
	 * This also has to happen after daemon(), which closes all fd's,
	 * so we keep the write lock of the pidfile.
	 */
	_init_pidfile();
	_become_slurm_user();
	if (foreground == 0)
		_set_work_dir();
	log_config();

#ifdef PR_SET_DUMPABLE
	if (prctl(PR_SET_DUMPABLE, 1) < 0)
		debug ("Unable to set dumpable to 1");
#endif /* PR_SET_DUMPABLE */

	if (xsignal_block(dbd_sigarray) < 0)
		error("Unable to block signals");

	/* Create attached thread for signal handling */
	slurm_attr_init(&thread_attr);
	if (pthread_create(&signal_handler_thread, &thread_attr,
			   _signal_handler, NULL))
		fatal("pthread_create %m");
	slurm_attr_destroy(&thread_attr);

	registered_clusters = list_create(NULL);

	slurm_attr_init(&thread_attr);
	if (pthread_create(&commit_handler_thread, &thread_attr,
			   _commit_handler, NULL))
		fatal("pthread_create %m");
	slurm_attr_destroy(&thread_attr);

	memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t));

	/* If we are tacking wckey we need to cache
	   wckeys, if we aren't only cache the users, qos */
	assoc_init_arg.cache_level = ASSOC_MGR_CACHE_USER | ASSOC_MGR_CACHE_QOS;
	if (slurmdbd_conf->track_wckey)
		assoc_init_arg.cache_level |= ASSOC_MGR_CACHE_WCKEY;

	db_conn = acct_storage_g_get_connection(NULL, 0, true, NULL);
	if (assoc_mgr_init(db_conn, &assoc_init_arg, errno) == SLURM_ERROR) {
		error("Problem getting cache of data");
		acct_storage_g_close_connection(&db_conn);
		goto end_it;
	}

	if (gethostname_short(node_name, sizeof(node_name)))
		fatal("getnodename: %m");

	while (1) {
		if (slurmdbd_conf->dbd_backup &&
		    (!strcmp(node_name, slurmdbd_conf->dbd_backup) ||
		     !strcmp(slurmdbd_conf->dbd_backup, "localhost"))) {
			info("slurmdbd running in background mode");
			have_control = false;
			backup = true;
			/* make sure any locks are released */
			acct_storage_g_commit(db_conn, 1);
			run_dbd_backup();
			if (!shutdown_time)
				assoc_mgr_refresh_lists(db_conn);
		} else if (slurmdbd_conf->dbd_host &&
			   (!strcmp(slurmdbd_conf->dbd_host, node_name) ||
			    !strcmp(slurmdbd_conf->dbd_host, "localhost"))) {
			backup = false;
			have_control = true;
		} else {
			fatal("This host not configured to run SlurmDBD "
			      "(%s != %s | (backup) %s)",
			      node_name, slurmdbd_conf->dbd_host,
			      slurmdbd_conf->dbd_backup);
		}

		if (!shutdown_time) {
			/* Create attached thread to process incoming RPCs */
			slurm_attr_init(&thread_attr);
			if (pthread_create(&rpc_handler_thread, &thread_attr,
					   rpc_mgr, NULL))
				fatal("pthread_create error %m");
			slurm_attr_destroy(&thread_attr);
		}

		if (!shutdown_time) {
			/* Create attached thread to do usage rollup */
			slurm_attr_init(&thread_attr);
			if (pthread_create(&rollup_handler_thread,
					   &thread_attr,
					   _rollup_handler, db_conn))
				fatal("pthread_create error %m");
			slurm_attr_destroy(&thread_attr);
		}

		/* Daemon is fully operational here */
		if (!shutdown_time || primary_resumed) {
			shutdown_time = 0;
			info("slurmdbd version %s started",
			     SLURM_VERSION_STRING);
			if (backup)
				run_dbd_backup();
		}

		_request_registrations(db_conn);
		acct_storage_g_commit(db_conn, 1);

		/* this is only ran if not backup */
		if (rollup_handler_thread)
			pthread_join(rollup_handler_thread, NULL);
		if (rpc_handler_thread)
			pthread_join(rpc_handler_thread, NULL);

		if (backup && primary_resumed) {
			shutdown_time = 0;
			info("Backup has given up control");
		}

		if (shutdown_time)
			break;
	}
	/* Daemon termination handled here */

end_it:

	if (signal_handler_thread)
		pthread_join(signal_handler_thread, NULL);
	if (commit_handler_thread)
		pthread_join(commit_handler_thread, NULL);

	acct_storage_g_commit(db_conn, 1);
	acct_storage_g_close_connection(&db_conn);

	if (slurmdbd_conf->pid_file &&
	    (unlink(slurmdbd_conf->pid_file) < 0)) {
		verbose("Unable to remove pidfile '%s': %m",
			slurmdbd_conf->pid_file);
	}

	FREE_NULL_LIST(registered_clusters);

	assoc_mgr_fini(NULL);
	slurm_acct_storage_fini();
	slurm_auth_fini();
	log_fini();
	free_slurmdbd_conf();
	exit(0);
}
예제 #2
0
파일: backup.c 프로젝트: surban/slurm
/* run_backup - this is the backup controller, it should run in standby
 *	mode, assuming control when the primary controller stops responding */
void run_backup(slurm_trigger_callbacks_t *callbacks)
{
	int i;
	uint32_t trigger_type;
	time_t last_ping = 0;
	pthread_attr_t thread_attr_sig, thread_attr_rpc;
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	slurmctld_lock_t config_write_lock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

	info("slurmctld running in background mode");
	takeover = false;
	last_controller_response = time(NULL);

	/* default: don't resume if shutdown */
	slurmctld_config.resume_backup = false;
	if (xsignal_block(backup_sigarray) < 0)
		error("Unable to block signals");

	/*
	 * create attached thread to process RPCs
	 */
	slurm_attr_init(&thread_attr_rpc);
	while (pthread_create(&slurmctld_config.thread_id_rpc,
			      &thread_attr_rpc, _background_rpc_mgr, NULL)) {
		error("pthread_create error %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_rpc);

	/*
	 * create attached thread for signal handling
	 */
	slurm_attr_init(&thread_attr_sig);
	while (pthread_create(&slurmctld_config.thread_id_sig,
			      &thread_attr_sig, _background_signal_hand,
			      NULL)) {
		error("pthread_create %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_sig);
	trigger_type = TRIGGER_TYPE_BU_CTLD_RES_OP;
	_trigger_slurmctld_event(trigger_type);

	for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) {
		sleep(1);       /* Give the primary slurmctld set-up time */
	}

	/* repeatedly ping ControlMachine */
	while (slurmctld_config.shutdown_time == 0) {
		sleep(1);
		/* Lock of slurmctld_conf below not important */
		if (slurmctld_conf.slurmctld_timeout &&
		    (takeover == false) &&
		    (difftime(time(NULL), last_ping) <
		     (slurmctld_conf.slurmctld_timeout / 3)))
			continue;

		last_ping = time(NULL);
		if (_ping_controller() == 0)
			last_controller_response = time(NULL);
		else if (takeover) {
			/* in takeover mode, take control as soon as */
			/* primary no longer respond */
			break;
		} else {
			uint32_t timeout;
			lock_slurmctld(config_read_lock);
			timeout = slurmctld_conf.slurmctld_timeout;
			unlock_slurmctld(config_read_lock);

			if (difftime(time(NULL), last_controller_response) >
			    timeout) {
				break;
			}
		}
	}

	if (slurmctld_config.shutdown_time != 0) {
		/* Since pidfile is created as user root (its owner is
		 *   changed to SlurmUser) SlurmUser may not be able to
		 *   remove it, so this is not necessarily an error.
		 * No longer need slurmctld_conf lock after above join. */
		if (unlink(slurmctld_conf.slurmctld_pidfile) < 0)
			verbose("Unable to remove pidfile '%s': %m",
				slurmctld_conf.slurmctld_pidfile);

		info("BackupController terminating");
		pthread_join(slurmctld_config.thread_id_sig, NULL);
		log_fini();
		if (dump_core)
			abort();
		else
			exit(0);
	}

	lock_slurmctld(config_read_lock);
	error("ControlMachine %s not responding, "
		"BackupController %s taking over",
		slurmctld_conf.control_machine,
		slurmctld_conf.backup_controller);
	unlock_slurmctld(config_read_lock);

	trigger_primary_ctld_fail();
	trigger_backup_ctld_as_ctrl();

	pthread_kill(slurmctld_config.thread_id_sig, SIGTERM);
	pthread_join(slurmctld_config.thread_id_sig, NULL);
	pthread_join(slurmctld_config.thread_id_rpc, NULL);

	if (!acct_db_conn) {
		/* Make sure we get a connection right away to avoid
		   race condition on this happening too late.
		*/
		acct_db_conn = acct_storage_g_get_connection(
			callbacks, 0, false,
			slurmctld_cluster_name);
	}

	/* clear old state and read new state */
	lock_slurmctld(config_write_lock);
	job_fini();
	if (switch_g_restore(slurmctld_conf.state_save_location, true)) {
		error("failed to restore switch state");
		abort();
	}
	if (read_slurm_conf(2, false)) {	/* Recover all state */
		error("Unable to recover slurm state");
		abort();
	}
	slurmctld_config.shutdown_time = (time_t) 0;
	unlock_slurmctld(config_write_lock);
	select_g_select_nodeinfo_set_all();

	return;
}