Ejemplo n.º 1
0
/* run_backup - this is the backup controller, it should run in standby
 *	mode, assuming control when the primary controller stops responding */
void run_backup(slurm_trigger_callbacks_t *callbacks)
{
	int i;
	uint32_t trigger_type;
	time_t last_ping = 0;
	pthread_attr_t thread_attr_sig, thread_attr_rpc;
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	slurmctld_lock_t config_write_lock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

	info("slurmctld running in background mode");
	takeover = false;
	last_controller_response = time(NULL);

	/* default: don't resume if shutdown */
	slurmctld_config.resume_backup = false;
	if (xsignal_block(backup_sigarray) < 0)
		error("Unable to block signals");

	/*
	 * create attached thread to process RPCs
	 */
	slurm_attr_init(&thread_attr_rpc);
	while (pthread_create(&slurmctld_config.thread_id_rpc,
			      &thread_attr_rpc, _background_rpc_mgr, NULL)) {
		error("pthread_create error %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_rpc);

	/*
	 * create attached thread for signal handling
	 */
	slurm_attr_init(&thread_attr_sig);
	while (pthread_create(&slurmctld_config.thread_id_sig,
			      &thread_attr_sig, _background_signal_hand,
			      NULL)) {
		error("pthread_create %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_sig);
	trigger_type = TRIGGER_TYPE_BU_CTLD_RES_OP;
	_trigger_slurmctld_event(trigger_type);

	for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) {
		sleep(1);       /* Give the primary slurmctld set-up time */
	}

	/* repeatedly ping ControlMachine */
	while (slurmctld_config.shutdown_time == 0) {
		sleep(1);
		/* Lock of slurmctld_conf below not important */
		if (slurmctld_conf.slurmctld_timeout &&
		    (takeover == false) &&
		    (difftime(time(NULL), last_ping) <
		     (slurmctld_conf.slurmctld_timeout / 3)))
			continue;

		last_ping = time(NULL);
		if (_ping_controller() == 0)
			last_controller_response = time(NULL);
		else if (takeover) {
			/* in takeover mode, take control as soon as */
			/* primary no longer respond */
			break;
		} else {
			uint32_t timeout;
			lock_slurmctld(config_read_lock);
			timeout = slurmctld_conf.slurmctld_timeout;
			unlock_slurmctld(config_read_lock);

			if (difftime(time(NULL), last_controller_response) >
			    timeout) {
				break;
			}
		}
	}

	if (slurmctld_config.shutdown_time != 0) {
		/* Since pidfile is created as user root (its owner is
		 *   changed to SlurmUser) SlurmUser may not be able to
		 *   remove it, so this is not necessarily an error.
		 * No longer need slurmctld_conf lock after above join. */
		if (unlink(slurmctld_conf.slurmctld_pidfile) < 0)
			verbose("Unable to remove pidfile '%s': %m",
				slurmctld_conf.slurmctld_pidfile);

		info("BackupController terminating");
		pthread_join(slurmctld_config.thread_id_sig, NULL);
		log_fini();
		if (dump_core)
			abort();
		else
			exit(0);
	}

	lock_slurmctld(config_read_lock);
	error("ControlMachine %s not responding, "
		"BackupController %s taking over",
		slurmctld_conf.control_machine,
		slurmctld_conf.backup_controller);
	unlock_slurmctld(config_read_lock);

	backup_slurmctld_restart();
	trigger_primary_ctld_fail();
	trigger_backup_ctld_as_ctrl();

	pthread_kill(slurmctld_config.thread_id_sig, SIGTERM);
	pthread_join(slurmctld_config.thread_id_sig, NULL);
	pthread_join(slurmctld_config.thread_id_rpc, NULL);

	/* The job list needs to be freed before we run
	 * ctld_assoc_mgr_init, it should be empty here in the first place.
	 */
	lock_slurmctld(config_write_lock);
	job_fini();
	init_job_conf();
	unlock_slurmctld(config_write_lock);

	ctld_assoc_mgr_init(callbacks);

	/* clear old state and read new state */
	lock_slurmctld(config_write_lock);
	if (switch_g_restore(slurmctld_conf.state_save_location, true)) {
		error("failed to restore switch state");
		abort();
	}
	if (read_slurm_conf(2, false)) {	/* Recover all state */
		error("Unable to recover slurm state");
		abort();
	}
	slurmctld_config.shutdown_time = (time_t) 0;
	unlock_slurmctld(config_write_lock);
	select_g_select_nodeinfo_set_all();

	return;
}
Ejemplo n.º 2
0
int main(int argc, char **argv) {
     time_t now;

     /* Initialise route, runq and job classes */
     now = time(NULL);
     route_init(NULL, 0);
     route_register(&rt_filea_method);
     route_register(&rt_fileov_method);
     route_register(&rt_stdin_method);
     route_register(&rt_stdout_method);
     route_register(&rt_stderr_method);
     if ( ! elog_init(1, "job test", NULL))
	  elog_die(FATAL, "didn't initialise elog\n");
     sig_init();
     callback_init();
     runq_init(now);
     meth_init();
     job_init();

     /* Test should fail due to incorrect method */
     elog_printf(DEBUG, "Expect a complaint! -> ");
     if (job_add(5, 5, 0, 1, "test1a1", "internal_test", "stdout", 
		  "stderr", 100, NULL, "echo \"Hello, world\"") != -1)
     {
	  elog_die(FATAL, "[1a] Shouldn't be able to add\n");
     }

     /* Single test in five seconds, never to run */
     if (job_add(5, 5, 0, 1, "test1a2", "internal_test", "stdout", 
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1a] Can't add\n");
     }
				/* Attention: some white box testing */
     itree_first(runq_event);
     if (itree_getkey(runq_event) != now+5) {
	  elog_die(FATAL, "[1a] Queued at an incorrect time\n");
     }
     job_clear();
     if (!itree_empty(runq_event) || !itree_empty(runq_tab)) {
	  elog_die(FATAL, 
		  "[1a] Trees not emptied. runq_events=%d, runq_tab=%d\n", 
		  itree_n(runq_event), itree_n(runq_tab));
     }
     now = time(NULL);
     
     /* Two tests both in five seconds, never to run */
     if (job_add(5, 5, 0, 1, "test1b1", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1b] Can't add first\n");
     }
     if (job_add(5, 5, 0, 1, "test1b2", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1b] Can't add second\n");
     }
     itree_first(runq_event);
     if (itree_getkey(runq_event) != now+5) {
	  elog_die(FATAL, "[1b] First queued at an incorrect time\n");
     }
     itree_next(runq_event);
     if (itree_getkey(runq_event) != now+5) {
	  elog_die(FATAL, "[1b] Second queued at an incorrect time\n");
     }
     job_clear();
     if (!itree_empty(runq_event) || !itree_empty(runq_tab)) {
	  elog_die(FATAL, 
		  "[1b] Trees not emptied. runq_events=%d, runq_tab=%d\n", 
		  itree_n(runq_event), itree_n(runq_tab));
     }
     now = time(NULL);

     /* Two tests one in five seconds, the other in six, never to run */
     if (job_add(6, 6, 0, 1, "test1c1", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1c] Can't add first\n");
     }
     if (job_add(now+5, 5, 0, 1, "test1c2", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1c] Can't add second\n");
     }
     itree_first(runq_event);
     if (itree_getkey(runq_event) != now+5) {
	  elog_die(FATAL, "[1c] First queued at an incorrect time\n");
     }
     itree_next(runq_event);
     if (itree_getkey(runq_event) != now+6) {
	  elog_die(FATAL, "[1c] Second queued at an incorrect time\n");
     }
     job_clear();
     if (!itree_empty(runq_event) || !itree_empty(runq_tab)) {
	  elog_die(FATAL, 
		  "[1c] Trees not emptied. runq_events=%d, runq_tab=%d\n", 
		  itree_n(runq_event), itree_n(runq_tab));
     }
     now = time(NULL);
     
     /* Continuous single test supposed to start two seconds ago, 
      * next run in three; never to run */
     if (job_add(-2, 5, 0, 0, "test1d1", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1d] Can't add\n");
     }
     itree_first(runq_event);
     if (itree_getkey(runq_event) != now+3) {
	  elog_die(FATAL, 
		  "[1d] Event queued at an incorrect time: bad=%d good=%ld\n", 
		  itree_getkey(runq_event), now+3);
     }
     job_clear();
     if (runq_nsched() > 0) {
	  elog_die(FATAL, "[1d] Still active work scheduled. runq_events=%d, "
		  "runq_tab=%d runq_nsched()=%d\n", 
		  itree_n(runq_event), itree_n(runq_tab), runq_nsched());
	  runq_dump();
     }
     now = time(NULL);
     
     /* Two continous tests, starting two seconds ago, next next run in four;
      * never to run */
     if (job_add(-2, 6, 0, 0, "test1e1", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1e] Can't add first\n");
     }
     if (job_add(-3, 5, 0, 0, "test1e2", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1e] Can't add second\n");
     }
     itree_first(runq_event);
     while (((struct runq_work*) itree_get(runq_event))->expired)
	  itree_next(runq_event);
     if (itree_getkey(runq_event) != now+2) {
	  elog_die(FATAL, "[1e] First queued at an incorrect time\n");
     }
     itree_next(runq_event);
     while (((struct runq_work*) itree_get(runq_event))->expired)
	  itree_next(runq_event);
     if (itree_getkey(runq_event) != now+4) {
	  elog_die(FATAL, "[1e] Second queued at an incorrect time\n");
     }
     job_clear();
     if (runq_nsched() > 0) {
	  elog_die(FATAL, "[1e] Still active work scheduled. runq_events=%d, "
		  "runq_tab=%d runq_nsched()=%d\n", 
		  itree_n(runq_event), itree_n(runq_tab), runq_nsched());
	  runq_dump();
     }
     now = time(NULL);
     
     /* Two 5 run jobs, scheduled to start 10 seconds ago, with periods
      * of 5 and 6 seconds; never to run */
     if (job_add(-10, 6, 0, 5, "test1f1", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1f] Can't add first\n");
     }
     if (job_add(-10, 5, 0, 5, "test1f2", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1f] Can't add second\n");
     }

     itree_first(runq_event);
     while (((struct runq_work*) itree_get(runq_event))->expired)
	  itree_next(runq_event);
     if (itree_getkey(runq_event) != now+2) {
	  elog_die(FATAL, "[1f] First queued at an incorrect time\n");
     }
     itree_next(runq_event);
     while (((struct runq_work*) itree_get(runq_event))->expired)
	  itree_next(runq_event);
     if (itree_getkey(runq_event) != now+5) {
	  elog_die(FATAL, "[1f] Second queued at an incorrect time\n");
     }
     job_clear();
     if (runq_nsched() > 0) {
	  elog_die(FATAL, "[1f] Still active work scheduled. runq_events=%d, "
		  "runq_tab=%d runq_nsched()=%d\n", 
		  itree_n(runq_event), itree_n(runq_tab), runq_nsched());
	  runq_dump();
     }
     now = time(NULL);
     
     /* Two 5 run jobs, scheduled to start 100 seconds ago, with periods
      * of 5 and 6 seconds; they should never be scheduled */
     if (job_add(-100, 6, 0, 5, "test1g1", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1g] Can't add first\n");
     }
     if (job_add(-100, 5, 0, 5, "test1g2", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1g] Can't add second\n");
     }
     if (runq_nsched() > 0) {
	  elog_die(FATAL, "[1g] Still active work scheduled. runq_events=%d, "
		  "runq_tab=%d runq_nsched()=%d\n", 
		  itree_n(runq_event), itree_n(runq_tab), runq_nsched());
	  runq_dump();
     }
     job_clear();
     now = time(NULL);

     /* Two five run tests, starting at different times in the past,
      * five runs each wittth different periods; they should both
      * run now */
     if (job_add(-24, 6, 0, 5, "test1h1", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1h] Can't add first\n");
     }
     if (job_add(-20, 5, 0, 5, "test1h2", "internal_test", "stdout",
		  "stderr", 100, "exec", "echo \"Hello, world\"") == -1)
     {
	  elog_die(FATAL, "[1h] Can't add second\n");
     }
     if (runq_nsched() != 2) {
	  elog_die(FATAL, "[1h] Two jobs should be scheduled not %d\n",
		  runq_nsched());
	  runq_dump();
     }
     sig_on();
     sleep(6);		/* let it run */
     sleep(1);		/* let it run */
     sleep(1);		/* let it run */
     sleep(1);		/* let it run */
     sig_off();
     if (runq_nsched() > 0) {
	  elog_die(FATAL, "[1h] Still active work scheduled. runq_events=%d, "
		  "runq_tab=%d runq_nsched()=%d\n", 
		  itree_n(runq_event), itree_n(runq_tab), runq_nsched());
	  runq_dump();
     }

     job_clear();

#if 0
     /* check all tables/lists are empty */
     if (!itree_empty(runq_event) || !itree_empty(runq_tab)) {
	  elog_die(FATAL, "[1i] Still entries in tables. runq_events=%d, "
		  "runq_tab=%d runq_nsched()=%d\n", 
		  itree_n(runq_event), itree_n(runq_tab), runq_nsched());
	  runq_dump();
     }
#endif

     job_fini();
     meth_fini();
     runq_fini();
     elog_fini();
     route_fini();
     callback_fini();

     printf("%s: tests finished\n", argv[0]);
     exit(0);
}