Example #1
0
File: salloc.c Project: adk9/slurm
static void _reset_input_mode (void)
{
	/* SIGTTOU needs to be blocked per the POSIX spec:
	 * http://pubs.opengroup.org/onlinepubs/009695399/functions/tcsetattr.html
	 */
	int sig_block[] = { SIGTTOU, SIGTTIN, 0 };
	xsignal_block (sig_block);
	tcsetattr (STDIN_FILENO, TCSANOW, &saved_tty_attributes);
}
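
Judging from the zero-terminated sig_block array, xsignal_block() blocks the listed signals for the process before the terminal attributes are restored. Below is a minimal, self-contained sketch of the same idea using only plain POSIX calls; the sketch's function name and the assumption that saved_tty_attributes was captured earlier are illustrative, not SLURM code.

#include <signal.h>
#include <termios.h>
#include <unistd.h>

static struct termios saved_tty_attributes;	/* assumed saved at startup */

static void reset_input_mode_sketch(void)
{
	sigset_t set;

	/* Block SIGTTOU/SIGTTIN so a background process is not stopped
	 * while it changes terminal attributes (per the POSIX tcsetattr
	 * specification referenced above). */
	sigemptyset(&set);
	sigaddset(&set, SIGTTOU);
	sigaddset(&set, SIGTTIN);
	sigprocmask(SIG_BLOCK, &set, NULL);

	tcsetattr(STDIN_FILENO, TCSANOW, &saved_tty_attributes);
}
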
Example #2
0
static void _reset_input_mode (void)
{
	/* SIGTTOU needs to be blocked per the POSIX spec:
	 * http://pubs.opengroup.org/onlinepubs/009695399/functions/tcsetattr.html
	 */
	int sig_block[] = { SIGTTOU, SIGTTIN, 0 };
	xsignal_block (sig_block);
	tcsetattr (STDIN_FILENO, TCSANOW, &saved_tty_attributes);
	/* If salloc was run as interactive, with job control, reset the
	 * foreground process group of the terminal to the process group of
	 * the parent pid before exiting */
	if (is_interactive)
		tcsetpgrp(STDIN_FILENO, getpgid(getppid()));
}
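
Example #2 additionally hands the terminal's foreground process group back to the parent's process group before exiting. A stand-alone sketch of just that step follows; the helper name and the isatty() guard are purely illustrative additions.

#include <unistd.h>

static void restore_foreground_pgrp_sketch(void)
{
	/* tcsetpgrp() from a background process group raises SIGTTOU,
	 * which the example above has already blocked. */
	pid_t parent_pgrp = getpgid(getppid());

	if ((parent_pgrp > 0) && isatty(STDIN_FILENO))
		(void) tcsetpgrp(STDIN_FILENO, parent_pgrp);
}
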
Example #3
0
/* SIGWINCH should already be blocked by srun/signal.c */
void block_sigwinch(void)
{
	xsignal_block(pty_sigarray);
}
Example #4
0
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job)
{
	int i, rc;
	unsigned long step_wait = 0, my_sleep = 0;
	time_t begin_time;
	uint16_t base_dist;

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt.min_nodes, opt.max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
//#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt.min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
	else if (opt.pn_min_memory != NO_VAL)
		job->ctx_params.pn_min_memory = opt.pn_min_memory;
	if (opt.gres)
		job->ctx_params.gres = opt.gres;
	else
		job->ctx_params.gres = getenv("SLURM_STEP_GRES");

	if (opt.overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt.cpus_set) {
		job->ctx_params.cpu_count = opt.ntasks * opt.cpus_per_task;
	} else if (opt.ntasks_set) {
		job->ctx_params.cpu_count = opt.ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt.ntasks;
	}

	job->ctx_params.cpu_freq_min = opt.cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt.cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt.cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;
	else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (opt.multi_prog ?
						 opt.multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt.distribution & SLURM_DIST_STATE_BASE) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		base_dist = (job->ctx_params.task_count <=
			     job->ctx_params.min_nodes)
			     ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution &= SLURM_DIST_STATE_FLAGS;
		opt.distribution |= base_dist;
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;

	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;

	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;
	job->ctx_params.features = opt.constraints;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);
	begin_time = time(NULL);

	for (i=0; (!(*destroy_job)); i++) {
		bool blocking_step_create = true;
		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else if (opt.immediate) {
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		} else {
			/* Wait 60 to 70 seconds for response */
			step_wait = (getpid() % 10) * 1000 + 60000;
			job->step_ctx = slurm_step_ctx_create_timeout(
						&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (i > 0)
				info("Job step created");

			break;
		}
		rc = slurm_get_errno();

		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_INTERCONNECT_BUSY) &&
		     (rc != ESLURM_DISABLED))) {
			error ("Unable to create job step: %m");
			return SLURM_ERROR;
		}
		if (rc == ESLURM_DISABLED)	/* job suspended */
			blocking_step_create = false;

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			xsignal_unblock(sig_array);
			for (i = 0; sig_array[i]; i++)
				xsignal(sig_array[i], signal_function);
			if (!blocking_step_create)
				my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			if (!blocking_step_create)
				my_sleep *= 2;
		}
		if (!blocking_step_create) {
			/* sleep 0.1 to 29 secs with exponential back-off */
			my_sleep = MIN(my_sleep, 29000000);
			usleep(my_sleep);
		}
		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending job step");
			return SLURM_ERROR;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/*  Number of hosts in job may not have been initialized yet if
	 *    --jobid was used or only SLURM_JOB_ID was set in user env.
	 *    Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return SLURM_SUCCESS;
}
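
The retry loop above paces non-blocking step-creation attempts with a sleep that starts at 0.1-0.2 seconds, doubles on each retry, and is capped at 29 seconds. Here is a condensed sketch of just that pacing, with a hypothetical try_create_step() callback standing in for the SLURM calls; the real loop also gives up on fatal errors or a cancel signal.

#include <stdbool.h>
#include <unistd.h>

#define MAX_BACKOFF_USEC 29000000UL	/* 29 seconds, as in the code above */

static bool create_step_with_backoff(int (*try_create_step)(void))
{
	/* 0.1 to 0.2 seconds initially, as in the code above */
	unsigned long my_sleep = (getpid() % 1000) * 100 + 100000;

	for (;;) {
		if (try_create_step() == 0)
			return true;		/* step created */
		if (my_sleep > MAX_BACKOFF_USEC)
			my_sleep = MAX_BACKOFF_USEC;
		usleep(my_sleep);		/* exponential back-off */
		my_sleep *= 2;
	}
}
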
Example #5
0
/* run_backup - this is the backup controller, it should run in standby
 *	mode, assuming control when the primary controller stops responding */
void run_backup(slurm_trigger_callbacks_t *callbacks)
{
	int i;
	uint32_t trigger_type;
	time_t last_ping = 0;
	pthread_attr_t thread_attr_sig, thread_attr_rpc;
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	slurmctld_lock_t config_write_lock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

	info("slurmctld running in background mode");
	takeover = false;
	last_controller_response = time(NULL);

	/* default: don't resume if shutdown */
	slurmctld_config.resume_backup = false;
	if (xsignal_block(backup_sigarray) < 0)
		error("Unable to block signals");

	/*
	 * create attached thread to process RPCs
	 */
	slurm_attr_init(&thread_attr_rpc);
	while (pthread_create(&slurmctld_config.thread_id_rpc,
			      &thread_attr_rpc, _background_rpc_mgr, NULL)) {
		error("pthread_create error %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_rpc);

	/*
	 * create attached thread for signal handling
	 */
	slurm_attr_init(&thread_attr_sig);
	while (pthread_create(&slurmctld_config.thread_id_sig,
			      &thread_attr_sig, _background_signal_hand,
			      NULL)) {
		error("pthread_create %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_sig);
	trigger_type = TRIGGER_TYPE_BU_CTLD_RES_OP;
	_trigger_slurmctld_event(trigger_type);

	for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) {
		sleep(1);       /* Give the primary slurmctld set-up time */
	}

	/* repeatedly ping ControlMachine */
	while (slurmctld_config.shutdown_time == 0) {
		sleep(1);
		/* Lock of slurmctld_conf below not important */
		if (slurmctld_conf.slurmctld_timeout &&
		    (takeover == false) &&
		    (difftime(time(NULL), last_ping) <
		     (slurmctld_conf.slurmctld_timeout / 3)))
			continue;

		last_ping = time(NULL);
		if (_ping_controller() == 0)
			last_controller_response = time(NULL);
		else if (takeover) {
			/* in takeover mode, take control as soon as
			 * the primary no longer responds */
			break;
		} else {
			uint32_t timeout;
			lock_slurmctld(config_read_lock);
			timeout = slurmctld_conf.slurmctld_timeout;
			unlock_slurmctld(config_read_lock);

			if (difftime(time(NULL), last_controller_response) >
			    timeout) {
				break;
			}
		}
	}

	if (slurmctld_config.shutdown_time != 0) {
		/* Since the pidfile is created as user root (its owner is
		 *   then changed to SlurmUser), SlurmUser may not be able to
		 *   remove it, so this is not necessarily an error.
		 * No longer need slurmctld_conf lock after above join. */
		if (unlink(slurmctld_conf.slurmctld_pidfile) < 0)
			verbose("Unable to remove pidfile '%s': %m",
				slurmctld_conf.slurmctld_pidfile);

		info("BackupController terminating");
		pthread_join(slurmctld_config.thread_id_sig, NULL);
		log_fini();
		if (dump_core)
			abort();
		else
			exit(0);
	}

	lock_slurmctld(config_read_lock);
	error("ControlMachine %s not responding, "
		"BackupController %s taking over",
		slurmctld_conf.control_machine,
		slurmctld_conf.backup_controller);
	unlock_slurmctld(config_read_lock);

	backup_slurmctld_restart();
	trigger_primary_ctld_fail();
	trigger_backup_ctld_as_ctrl();

	pthread_kill(slurmctld_config.thread_id_sig, SIGTERM);
	pthread_join(slurmctld_config.thread_id_sig, NULL);
	pthread_join(slurmctld_config.thread_id_rpc, NULL);

	/* The job list needs to be freed before we run
	 * ctld_assoc_mgr_init, it should be empty here in the first place.
	 */
	lock_slurmctld(config_write_lock);
	job_fini();
	init_job_conf();
	unlock_slurmctld(config_write_lock);

	ctld_assoc_mgr_init(callbacks);

	/* clear old state and read new state */
	lock_slurmctld(config_write_lock);
	if (switch_g_restore(slurmctld_conf.state_save_location, true)) {
		error("failed to restore switch state");
		abort();
	}
	if (read_slurm_conf(2, false)) {	/* Recover all state */
		error("Unable to recover slurm state");
		abort();
	}
	slurmctld_config.shutdown_time = (time_t) 0;
	unlock_slurmctld(config_write_lock);
	select_g_select_nodeinfo_set_all();

	return;
}
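
Stripped of the thread and trigger management, the standby loop above pings the primary roughly every slurmctld_timeout/3 seconds and takes over once the primary has been silent for a full timeout. A condensed sketch of that loop with hypothetical ping_primary() and should_shutdown() callbacks:

#include <stdbool.h>
#include <time.h>
#include <unistd.h>

static void standby_loop_sketch(unsigned int timeout,
				int (*ping_primary)(void),
				bool (*should_shutdown)(void))
{
	time_t last_ping = 0;
	time_t last_response = time(NULL);

	while (!should_shutdown()) {
		sleep(1);
		/* only ping about every timeout/3 seconds */
		if (timeout &&
		    (difftime(time(NULL), last_ping) < (timeout / 3)))
			continue;
		last_ping = time(NULL);
		if (ping_primary() == 0)
			last_response = time(NULL);
		else if (difftime(time(NULL), last_response) > timeout)
			break;	/* primary presumed dead: take over */
	}
}
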
Example #6
0
extern int
create_job_step(srun_job_t *job, bool use_all_cpus)
{
	int i, rc;
	unsigned long my_sleep = 0;
	time_t begin_time;

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* set the jobid for totalview */
	totalview_jobid = NULL;
	xstrfmtcat(totalview_jobid, "%u", job->ctx_params.job_id);

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt.min_nodes, opt.max_nodes);
		return -1;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
//#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt.min_nodes, job->nhosts);
		return -1;
	}
#endif
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.mem_per_cpu = opt.mem_per_cpu;
	job->ctx_params.gres = opt.gres;

	if (use_all_cpus)
		job->ctx_params.cpu_count = job->cpu_count;
	else if (opt.overcommit)
		job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	else
		job->ctx_params.cpu_count = opt.ntasks*opt.cpus_per_task;

	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.gres = opt.gres;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;

	switch (opt.distribution) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
		job->ctx_params.task_dist = opt.distribution;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		job->ctx_params.task_dist = (job->ctx_params.task_count <=
					     job->ctx_params.min_nodes)
			? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution = job->ctx_params.task_dist;
		break;

	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;

	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);
	begin_time = time(NULL);

	for (i=0; (!destroy_job); i++) {
		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		if (job->step_ctx != NULL) {
			if (i > 0)
				info("Job step created");

			break;
		}
		rc = slurm_get_errno();

		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_DISABLED))) {
			error ("Unable to create job step: %m");
			return -1;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			xsignal_unblock(sig_array);
			for (i = 0; sig_array[i]; i++)
				xsignal(sig_array[i], _signal_while_allocating);

			my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			my_sleep = MIN((my_sleep * 2), 29000000);
		}
		/* sleep 0.1 to 29 secs with exponential back-off */
		usleep(my_sleep);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (destroy_job) {
			info("Cancelled pending job step");
			return -1;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/*  Number of hosts in job may not have been initialized yet if
	 *    --jobid was used or only SLURM_JOB_ID was set in user env.
	 *    Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return 0;
}
Example #7
0
resource_allocation_response_msg_t *
allocate_nodes(void)
{
	resource_allocation_response_msg_t *resp = NULL;
	job_desc_msg_t *j = job_desc_msg_create_from_opts();
	slurm_allocation_callbacks_t callbacks;
	int i;

	if (!j)
		return NULL;

	/* Do not re-use existing job id when submitting new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt.jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		if (!opt.jobid_set)	/* Let slurmctld set jobid */
			j->job_id = NO_VAL;
	}
	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	xsignal_unblock(sig_array);
	for (i = 0; sig_array[i]; i++)
		xsignal(sig_array[i], _signal_while_allocating);

	while (!resp) {
		resp = slurm_allocate_resources_blocking(j, opt.immediate,
							 _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!resp && !_retry()) {
			break;
		}
	}

	if (resp && !destroy_job) {
		/*
		 * Allocation granted!
		 */
		pending_job_id = resp->job_id;
#ifdef HAVE_BG
		if (!_wait_bluegene_block_ready(resp)) {
			if(!destroy_job)
				error("Something is wrong with the "
				      "boot of the block.");
			goto relinquish;
		}
#else
		if (!_wait_nodes_ready(resp)) {
			if(!destroy_job)
				error("Something is wrong with the "
				      "boot of the nodes.");
			goto relinquish;
		}
#endif
	} else if (destroy_job) {
		goto relinquish;
	}

	xsignal_block(sig_array);

	job_desc_msg_destroy(j);

	return resp;

relinquish:
	if (resp) {
		if (!destroy_job)
			slurm_complete_job(resp->job_id, 1);
		slurm_free_resource_allocation_response_msg(resp);
	}
	exit(error_exit);
	return NULL;
}
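
While waiting for the allocation, the code unblocks the signals in sig_array and installs _signal_while_allocating on each of them, re-blocking the set afterwards. Assuming xsignal_unblock() and xsignal() are thin wrappers over sigprocmask() and sigaction() for a zero-terminated signal list (an assumption, not confirmed by this file), the install step looks roughly like:

#include <signal.h>
#include <string.h>

static void install_handlers_sketch(const int *sigs, void (*handler)(int))
{
	sigset_t set;
	struct sigaction sa;
	int i;

	/* unblock the listed signals (the xsignal_unblock() step) */
	sigemptyset(&set);
	for (i = 0; sigs[i]; i++)
		sigaddset(&set, sigs[i]);
	sigprocmask(SIG_UNBLOCK, &set, NULL);

	/* install the handler on each of them (the xsignal() step) */
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handler;
	sigemptyset(&sa.sa_mask);
	for (i = 0; sigs[i]; i++)
		sigaction(sigs[i], &sa, NULL);
}
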
Example #8
0
/* main - slurmctld main function, start various threads and process RPCs */
int main(int argc, char *argv[])
{
	pthread_attr_t thread_attr;
	char node_name[128];
	void *db_conn = NULL;
	assoc_init_args_t assoc_init_arg;

	_init_config();
	log_init(argv[0], log_opts, LOG_DAEMON, NULL);
	if (read_slurmdbd_conf())
		exit(1);
	_parse_commandline(argc, argv);
	_update_logging(true);
	_update_nice();

	if (slurm_auth_init(NULL) != SLURM_SUCCESS) {
		fatal("Unable to initialize %s authentication plugin",
		      slurmdbd_conf->auth_type);
	}
	if (slurm_acct_storage_init(NULL) != SLURM_SUCCESS) {
		fatal("Unable to initialize %s accounting storage plugin",
		      slurmdbd_conf->storage_type);
	}
	_kill_old_slurmdbd();
	if (foreground == 0)
		_daemonize();

	/*
	 * Need to create pidfile here in case we setuid() below
	 * (init_pidfile() exits if it can't initialize pid file).
	 * On Linux we also need to make this setuid job explicitly
	 * able to write a core dump.
	 * This also has to happen after daemon(), which closes all fd's,
	 * so we keep the write lock of the pidfile.
	 */
	_init_pidfile();
	_become_slurm_user();
	if (foreground == 0)
		_set_work_dir();
	log_config();

#ifdef PR_SET_DUMPABLE
	if (prctl(PR_SET_DUMPABLE, 1) < 0)
		debug ("Unable to set dumpable to 1");
#endif /* PR_SET_DUMPABLE */

	if (xsignal_block(dbd_sigarray) < 0)
		error("Unable to block signals");

	/* Create attached thread for signal handling */
	slurm_attr_init(&thread_attr);
	if (pthread_create(&signal_handler_thread, &thread_attr,
			   _signal_handler, NULL))
		fatal("pthread_create %m");
	slurm_attr_destroy(&thread_attr);

	registered_clusters = list_create(NULL);

	slurm_attr_init(&thread_attr);
	if (pthread_create(&commit_handler_thread, &thread_attr,
			   _commit_handler, NULL))
		fatal("pthread_create %m");
	slurm_attr_destroy(&thread_attr);

	memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t));

	/* If we are tracking wckeys we need to cache wckeys;
	   if we aren't, only cache the users and QOS */
	assoc_init_arg.cache_level = ASSOC_MGR_CACHE_USER | ASSOC_MGR_CACHE_QOS;
	if (slurmdbd_conf->track_wckey)
		assoc_init_arg.cache_level |= ASSOC_MGR_CACHE_WCKEY;

	db_conn = acct_storage_g_get_connection(NULL, 0, true, NULL);
	if (assoc_mgr_init(db_conn, &assoc_init_arg, errno) == SLURM_ERROR) {
		error("Problem getting cache of data");
		acct_storage_g_close_connection(&db_conn);
		goto end_it;
	}

	if (gethostname_short(node_name, sizeof(node_name)))
		fatal("getnodename: %m");

	while (1) {
		if (slurmdbd_conf->dbd_backup &&
		    (!strcmp(node_name, slurmdbd_conf->dbd_backup) ||
		     !strcmp(slurmdbd_conf->dbd_backup, "localhost"))) {
			info("slurmdbd running in background mode");
			have_control = false;
			backup = true;
			/* make sure any locks are released */
			acct_storage_g_commit(db_conn, 1);
			run_dbd_backup();
			if (!shutdown_time)
				assoc_mgr_refresh_lists(db_conn);
		} else if (slurmdbd_conf->dbd_host &&
			   (!strcmp(slurmdbd_conf->dbd_host, node_name) ||
			    !strcmp(slurmdbd_conf->dbd_host, "localhost"))) {
			backup = false;
			have_control = true;
		} else {
			fatal("This host not configured to run SlurmDBD "
			      "(%s != %s | (backup) %s)",
			      node_name, slurmdbd_conf->dbd_host,
			      slurmdbd_conf->dbd_backup);
		}

		if (!shutdown_time) {
			/* Create attached thread to process incoming RPCs */
			slurm_attr_init(&thread_attr);
			if (pthread_create(&rpc_handler_thread, &thread_attr,
					   rpc_mgr, NULL))
				fatal("pthread_create error %m");
			slurm_attr_destroy(&thread_attr);
		}

		if (!shutdown_time) {
			/* Create attached thread to do usage rollup */
			slurm_attr_init(&thread_attr);
			if (pthread_create(&rollup_handler_thread,
					   &thread_attr,
					   _rollup_handler, db_conn))
				fatal("pthread_create error %m");
			slurm_attr_destroy(&thread_attr);
		}

		/* Daemon is fully operational here */
		if (!shutdown_time || primary_resumed) {
			shutdown_time = 0;
			info("slurmdbd version %s started",
			     SLURM_VERSION_STRING);
			if (backup)
				run_dbd_backup();
		}

		_request_registrations(db_conn);
		acct_storage_g_commit(db_conn, 1);

		/* this is only run if not backup */
		if (rollup_handler_thread)
			pthread_join(rollup_handler_thread, NULL);
		if (rpc_handler_thread)
			pthread_join(rpc_handler_thread, NULL);

		if (backup && primary_resumed) {
			shutdown_time = 0;
			info("Backup has given up control");
		}

		if (shutdown_time)
			break;
	}
	/* Daemon termination handled here */

end_it:

	if (signal_handler_thread)
		pthread_join(signal_handler_thread, NULL);
	if (commit_handler_thread)
		pthread_join(commit_handler_thread, NULL);

	acct_storage_g_commit(db_conn, 1);
	acct_storage_g_close_connection(&db_conn);

	if (slurmdbd_conf->pid_file &&
	    (unlink(slurmdbd_conf->pid_file) < 0)) {
		verbose("Unable to remove pidfile '%s': %m",
			slurmdbd_conf->pid_file);
	}

	FREE_NULL_LIST(registered_clusters);

	assoc_mgr_fini(NULL);
	slurm_acct_storage_fini();
	slurm_auth_fini();
	log_fini();
	free_slurmdbd_conf();
	exit(0);
}
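
Both daemons above block their shutdown signals early (xsignal_block(dbd_sigarray), xsignal_block(backup_sigarray)) and delegate them to a dedicated thread. One common way to structure such a thread is sigwait() on the blocked set; the real _signal_handler may differ, so the following is only a sketch with illustrative names.

#include <pthread.h>
#include <signal.h>
#include <stdio.h>

static void *signal_thread_sketch(void *arg)
{
	sigset_t *set = (sigset_t *) arg;
	int sig;

	/* wait synchronously for one of the blocked signals */
	if (sigwait(set, &sig) == 0)
		printf("caught signal %d, beginning shutdown\n", sig);
	return NULL;
}

static int start_signal_thread_sketch(pthread_t *tid, sigset_t *set)
{
	sigemptyset(set);
	sigaddset(set, SIGINT);
	sigaddset(set, SIGTERM);
	/* block in this thread; threads created later inherit the mask */
	pthread_sigmask(SIG_BLOCK, set, NULL);
	return pthread_create(tid, NULL, signal_thread_sketch, set);
}
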
Example #9
0
resource_allocation_response_msg_t *
allocate_nodes(bool handle_signals)
{
	resource_allocation_response_msg_t *resp = NULL;
	job_desc_msg_t *j = job_desc_msg_create_from_opts();
	slurm_allocation_callbacks_t callbacks;
	int i;

	if (!j)
		return NULL;

	/* Do not re-use existing job id when submitting new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt.jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		if (!opt.jobid_set)	/* Let slurmctld set jobid */
			j->job_id = NO_VAL;
	}
	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.job_suspend = NULL;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	if (handle_signals) {
		xsignal_unblock(sig_array);
		for (i = 0; sig_array[i]; i++)
			xsignal(sig_array[i], _signal_while_allocating);
	}

	while (!resp) {
		resp = slurm_allocate_resources_blocking(j, opt.immediate,
							 _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!resp && !_retry()) {
			break;
		}
	}

	if (resp && !destroy_job) {
		/*
		 * Allocation granted!
		 */
		pending_job_id = resp->job_id;

		/*
		 * These values could be changed while the job was
		 * pending so overwrite the request with what was
		 * allocated so we don't have issues when we use them
		 * in the step creation.
		 */
		if (opt.pn_min_memory != NO_VAL)
			opt.pn_min_memory = (resp->pn_min_memory &
					     (~MEM_PER_CPU));
		else if (opt.mem_per_cpu != NO_VAL)
			opt.mem_per_cpu = (resp->pn_min_memory &
					   (~MEM_PER_CPU));
		/*
		 * FIXME: timelimit should probably also be updated
		 * here since it could also change.
		 */

#ifdef HAVE_BG
		uint32_t node_cnt = 0;
		select_g_select_jobinfo_get(resp->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
		if ((node_cnt == 0) || (node_cnt == NO_VAL)) {
			opt.min_nodes = node_cnt;
			opt.max_nodes = node_cnt;
		} /* else we just use the original request */

		if (!_wait_bluegene_block_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the "
				      "boot of the block.");
			goto relinquish;
		}
#else
		opt.min_nodes = resp->node_cnt;
		opt.max_nodes = resp->node_cnt;

		if (!_wait_nodes_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the "
				      "boot of the nodes.");
			goto relinquish;
		}
#endif
	} else if (destroy_job) {
		goto relinquish;
	}

	if (handle_signals)
		xsignal_block(sig_array);

	job_desc_msg_destroy(j);

	return resp;

relinquish:
	if (resp) {
		if (!destroy_job)
			slurm_complete_job(resp->job_id, 1);
		slurm_free_resource_allocation_response_msg(resp);
	}
	exit(error_exit);
	return NULL;
}
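
The memory fix-ups above rely on SLURM packing a "memory is per CPU" marker into a flag bit (MEM_PER_CPU) of pn_min_memory, which is masked off before the value is reused. A sketch of that decoding with an illustrative flag value (not the real definition):

#include <stdint.h>
#include <stdio.h>

#define MEM_PER_CPU_SKETCH 0x8000000000000000ULL	/* illustrative flag bit */

static void decode_memory_sketch(uint64_t pn_min_memory)
{
	if (pn_min_memory & MEM_PER_CPU_SKETCH)
		printf("memory per CPU: %llu MB\n",
		       (unsigned long long)(pn_min_memory & ~MEM_PER_CPU_SKETCH));
	else
		printf("memory per node: %llu MB\n",
		       (unsigned long long) pn_min_memory);
}
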
Example #10
0
int
main (int argc, char *argv[])
{
	int i, pidfd;
	int blocked_signals[] = {SIGPIPE, 0};
	int cc;
	char *oom_value;
	uint32_t slurmd_uid = 0;
	uint32_t curr_uid = 0;
	char time_stamp[256];
	log_options_t lopts = LOG_OPTS_INITIALIZER;

	/* NOTE: logfile is NULL at this point */
	log_init(argv[0], lopts, LOG_DAEMON, NULL);

	/*
	 * Make sure we have no extra open files which
	 * would be propagated to spawned tasks.
	 */
	cc = sysconf(_SC_OPEN_MAX);
	for (i = 3; i < cc; i++)
		close(i);

	/*
	 * Drop supplementary groups.
	 */
	if (geteuid() == 0) {
		if (setgroups(0, NULL) != 0) {
			fatal("Failed to drop supplementary groups, "
			      "setgroups: %m");
		}
	} else {
		debug("Not running as root. Can't drop supplementary groups");
	}

	/*
	 * Create and set default values for the slurmd global
	 * config variable "conf"
	 */
	conf = xmalloc(sizeof(slurmd_conf_t));
	_init_conf();
	conf->argv = &argv;
	conf->argc = &argc;

	if (_slurmd_init() < 0) {
		error( "slurmd initialization failed" );
		fflush( NULL );
		exit(1);
	}

	slurmd_uid = slurm_get_slurmd_user_id();
	curr_uid = getuid();
	if (curr_uid != slurmd_uid) {
		struct passwd *pw = NULL;
		char *slurmd_user = NULL;
		char *curr_user = NULL;

		/* getpwuid() returns a pointer to a statically allocated
		 * structure, so xstrdup() the result of the first call or
		 * the second call will overwrite it.  This is a memory
		 * leak, but a fatal is called right after so it isn't
		 * that big of a deal.
		 */
		if ((pw=getpwuid(slurmd_uid)))
			slurmd_user = xstrdup(pw->pw_name);
		if ((pw=getpwuid(curr_uid)))
			curr_user = pw->pw_name;

		fatal("You are running slurmd as something "
		      "other than user %s(%d).  If you want to "
		      "run as this user add SlurmdUser=%s "
		      "to the slurm.conf file.",
		      slurmd_user, slurmd_uid, curr_user);
	}
	init_setproctitle(argc, argv);

	xsignal(SIGTERM, &_term_handler);
	xsignal(SIGINT,  &_term_handler);
	xsignal(SIGHUP,  &_hup_handler );
	xsignal_block(blocked_signals);

	debug3("slurmd initialization successful");

	/*
	 * Become a daemon if desired.
	 * Do not chdir("/") or close all fd's
	 */
	if (conf->daemonize) {
		if (daemon(1,1) == -1)
			error("Couldn't daemonize slurmd: %m");
	}
	test_core_limit();
	info("slurmd version %s started", SLURM_VERSION_STRING);
	debug3("finished daemonize");

	if ((oom_value = getenv("SLURMD_OOM_ADJ"))) {
		i = atoi(oom_value);
		debug("Setting slurmd oom_adj to %d", i);
		set_oom_adj(i);
	}

	_kill_old_slurmd();

	if (conf->mlock_pages) {
		/*
		 * Call mlockall() if available to ensure slurmd
		 *  doesn't get swapped out
		 */
#ifdef _POSIX_MEMLOCK
		if (mlockall (MCL_FUTURE | MCL_CURRENT) < 0)
			error ("failed to mlock() slurmd pages: %m");
#else
		error ("mlockall() system call does not appear to be available");
#endif /* _POSIX_MEMLOCK */
	}


	/*
	 * Restore any saved revoked credential information
	 */
	if (!conf->cleanstart && (_restore_cred_state(conf->vctx) < 0))
		return SLURM_FAILURE;

	if (jobacct_gather_init() != SLURM_SUCCESS)
		fatal("Unable to initialize jobacct_gather");
	if (job_container_init() < 0)
		fatal("Unable to initialize job_container plugin.");
	if (container_g_restore(conf->spooldir, !conf->cleanstart))
		error("Unable to restore job_container state.");
	if (switch_g_node_init() < 0)
		fatal("Unable to initialize interconnect.");
	if (conf->cleanstart && switch_g_clear_node_state())
		fatal("Unable to clear interconnect state.");
	switch_g_slurmd_init();

	_create_msg_socket();

	conf->pid = getpid();
	/* This has to happen after daemon(), which closes all fd's,
	   so we keep the write lock of the pidfile.
	*/
	pidfd = create_pidfile(conf->pidfile, 0);

	rfc2822_timestamp(time_stamp, sizeof(time_stamp));
	info("%s started on %s", slurm_prog_name, time_stamp);

	_install_fork_handlers();
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/*
	 * Initialize any plugins
	 */
	if (slurmd_plugstack_init())
		fatal("failed to initialize slurmd_plugstack");

	_spawn_registration_engine();
	_msg_engine();

	/*
	 * Close fd here, otherwise we'll deadlock since create_pidfile()
	 * flocks the pidfile.
	 */
	if (pidfd >= 0)			/* valid pidfd, non-error */
		(void) close(pidfd);	/* Ignore errors */
	if (unlink(conf->pidfile) < 0)
		error("Unable to remove pidfile `%s': %m", conf->pidfile);

	_wait_for_all_threads(120);
	_slurmd_fini();
	_destroy_conf();
	slurm_crypto_fini();	/* must be after _destroy_conf() */

	info("Slurmd shutdown completing");
	log_fini();
	return 0;
}
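
The comments in slurmd's main() stress that the pidfile is created after daemon() closes all descriptors and that its write lock must be held until shutdown. Below is a generic sketch of that pattern; create_pidfile_sketch() and its error handling are illustrative, not SLURM's create_pidfile().

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <unistd.h>

/* Returns the (locked) pidfile fd, or -1 on error.  The caller keeps the
 * fd open for the daemon's lifetime so the lock is held until exit. */
static int create_pidfile_sketch(const char *path)
{
	char buf[32];
	int fd = open(path, O_CREAT | O_WRONLY, 0644);

	if (fd < 0)
		return -1;
	if (flock(fd, LOCK_EX | LOCK_NB) < 0) {
		/* another instance already holds the lock */
		close(fd);
		return -1;
	}
	(void) ftruncate(fd, 0);
	snprintf(buf, sizeof(buf), "%d\n", (int) getpid());
	if (write(fd, buf, strlen(buf)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
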
Example #11
0
int
main (int argc, char *argv[])
{
	slurm_addr_t *cli;
	slurm_addr_t *self;
	slurm_msg_t *msg;
	slurmd_job_t *job;
	int ngids;
	gid_t *gids;
	int rc = 0;

	if (process_cmdline (argc, argv) < 0)
		fatal ("Error in slurmstepd command line");

	xsignal_block(slurmstepd_blocked_signals);
	conf = xmalloc(sizeof(*conf));
	conf->argv = &argv;
	conf->argc = &argc;
	init_setproctitle(argc, argv);
	if (slurm_select_init(1) != SLURM_SUCCESS )
		fatal( "failed to initialize node selection plugin" );

	/* Receive job parameters from the slurmd */
	_init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg,
			  &ngids, &gids);

	/* Fancy way of closing stdin that keeps STDIN_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDIN_FILENO);

	/* Create the slurmd_job_t, mostly from info in a
	   launch_tasks_request_msg_t or a batch_job_launch_msg_t */
	if(!(job = _step_setup(cli, self, msg))) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}
	job->ngids = ngids;
	job->gids = gids;

	/* fork handlers cause mutexes on some global data structures
	   to be re-initialized after the fork. */
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/* sets job->msg_handle and job->msgid */
	if (msg_thr_create(job) == SLURM_ERROR) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}

	_send_ok_to_slurmd(STDOUT_FILENO);

	/* Fancy way of closing stdout that keeps STDOUT_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDOUT_FILENO);

	/* This does most of the stdio setup, then launches all the tasks,
	   and blocks until the step is complete */
	rc = job_manager(job);

	/* signal the message thread to shutdown, and wait for it */
	eio_signal_shutdown(job->msg_handle);
	pthread_join(job->msgid, NULL);

	if (job->batch)
		batch_finish(job, rc); /* sends batch complete message */

ending:
#ifdef MEMORY_LEAK_DEBUG
	_step_cleanup(job, msg, rc);

	xfree(cli);
	xfree(self);
	xfree(conf->hostname);
	xfree(conf->block_map);
	xfree(conf->block_map_inv);
	xfree(conf->spooldir);
	xfree(conf->node_name);
	xfree(conf->node_topo_addr);
	xfree(conf->node_topo_pattern);
	xfree(conf->logfile);
	xfree(conf);
#endif
	info("done with job");
	return rc;
}
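
The dup2(STDERR_FILENO, STDIN_FILENO) calls above work because slurmd has already pointed STDERR_FILENO at /dev/null: duplicating it keeps descriptors 0 and 1 occupied, so a later open() cannot silently land on them. A stand-alone equivalent that opens /dev/null itself (helper name is illustrative):

#include <fcntl.h>
#include <unistd.h>

static void park_stdin_on_devnull(void)
{
	int devnull = open("/dev/null", O_RDWR);

	if (devnull >= 0) {
		dup2(devnull, STDIN_FILENO);	/* fd 0 now points at /dev/null */
		if (devnull != STDIN_FILENO)
			close(devnull);
	}
}
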
Example #12
0
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job,
					 slurm_opt_t *opt_local)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	int i, j, rc;
	unsigned long step_wait = 0;
	uint16_t base_dist, slurmctld_timeout;
	char *add_tres;
	xassert(srun_opt);

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.step_id = job->stepid;
	job->ctx_params.uid = opt_local->uid;

	/* Validate minimum and maximum node counts */
	if (opt_local->min_nodes && opt_local->max_nodes &&
	    (opt_local->min_nodes > opt_local->max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt_local->min_nodes, opt_local->max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END
	if (opt_local->min_nodes && (opt_local->min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt_local->min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif
	job->ctx_params.min_nodes = job->nhosts;
	if (opt_local->min_nodes &&
	    (opt_local->min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt_local->min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt_local->max_nodes &&
	    (opt_local->max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt_local->max_nodes;

	if (!opt_local->ntasks_set && (opt_local->ntasks_per_node != NO_VAL))
		job->ntasks = opt_local->ntasks = job->nhosts *
						  opt_local->ntasks_per_node;
	job->ctx_params.task_count = opt_local->ntasks;

	if (opt_local->mem_per_cpu != NO_VAL64)
		job->ctx_params.pn_min_memory = opt_local->mem_per_cpu |
						MEM_PER_CPU;
	else if (opt_local->pn_min_memory != NO_VAL64)
		job->ctx_params.pn_min_memory = opt_local->pn_min_memory;

	if (opt_local->overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt_local->cpus_set) {
		job->ctx_params.cpu_count = opt_local->ntasks *
					    opt_local->cpus_per_task;
	} else if (opt_local->ntasks_set) {
		job->ctx_params.cpu_count = opt_local->ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt_local->ntasks;
	}

	job->ctx_params.cpu_freq_min = opt_local->cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt_local->cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt_local->cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)srun_opt->relative;
	job->ctx_params.ckpt_interval = (uint16_t)srun_opt->ckpt_interval;
	job->ctx_params.ckpt_dir = srun_opt->ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)srun_opt->exclusive;
	if (opt_local->immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt_local->immediate;
	if (opt_local->time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt_local->time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (srun_opt->resv_port_cnt != NO_VAL) {
		job->ctx_params.resv_port_cnt = (uint16_t)srun_opt->resv_port_cnt;
	} else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (srun_opt->multi_prog ?
					srun_opt->multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt_local->distribution & SLURM_DIST_NODESOCKMASK) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt_local->ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt_local->plane_size;
		break;
	default:
		/* Leave distribution set to unknown if taskcount <= nodes and
		 * memory is set to 0. step_mgr will handle the 0mem case.
		 * ex. SallocDefaultCommand=srun -n1 -N1 --mem=0 ... */
		if (!opt_local->mem_per_cpu || !opt_local->pn_min_memory)
			base_dist = SLURM_DIST_UNKNOWN;
		else
			base_dist = (job->ctx_params.task_count <=
				     job->ctx_params.min_nodes)
				     ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt_local->distribution &= SLURM_DIST_STATE_FLAGS;
		opt_local->distribution |= base_dist;
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt_local->ntasks_per_node;
		break;

	}
	job->ctx_params.overcommit = opt_local->overcommit ? 1 : 0;
	job->ctx_params.node_list = opt_local->nodelist;
	job->ctx_params.network = opt_local->network;
	job->ctx_params.no_kill = opt_local->no_kill;
	if (srun_opt->job_name_set_cmd && opt_local->job_name)
		job->ctx_params.name = opt_local->job_name;
	else
		job->ctx_params.name = srun_opt->cmd_name;
	job->ctx_params.features = opt_local->constraints;

	if (opt_local->cpus_per_gpu) {
		xstrfmtcat(job->ctx_params.cpus_per_tres, "gpu:%d",
			   opt_local->cpus_per_gpu);
	}
	xfree(opt_local->tres_bind);	/* Vestigial value from job allocate */
	if (opt_local->gpu_bind)
		xstrfmtcat(opt_local->tres_bind, "gpu:%s", opt_local->gpu_bind);
	if (tres_bind_verify_cmdline(opt_local->tres_bind)) {
		if (tres_bind_err_log) {	/* Log once */
			error("Invalid --tres-bind argument: %s. Ignored",
			      opt_local->tres_bind);
			tres_bind_err_log = false;
		}
		xfree(opt_local->tres_bind);
	}
	job->ctx_params.tres_bind = xstrdup(opt_local->tres_bind);
	xfree(opt_local->tres_freq);	/* Vestigial value from job allocate */
	xfmt_tres_freq(&opt_local->tres_freq, "gpu", opt_local->gpu_freq);
	if (tres_freq_verify_cmdline(opt_local->tres_freq)) {
		if (tres_freq_err_log) {	/* Log once */
			error("Invalid --tres-freq argument: %s. Ignored",
			      opt_local->tres_freq);
			tres_freq_err_log = false;
		}
		xfree(opt_local->tres_freq);
	}
	job->ctx_params.tres_freq = xstrdup(opt_local->tres_freq);
	job->ctx_params.tres_per_step = xstrdup(opt_local->tres_per_job);
	xfmt_tres(&job->ctx_params.tres_per_step, "gpu", opt_local->gpus);
	xfmt_tres(&job->ctx_params.tres_per_node, "gpu",
		  opt_local->gpus_per_node);
	if (opt_local->gres)
		add_tres = opt_local->gres;
	else
		add_tres = getenv("SLURM_STEP_GRES");
	if (add_tres) {
		if (job->ctx_params.tres_per_node) {
			xstrfmtcat(job->ctx_params.tres_per_node, ",%s",
				   add_tres);
		} else
			job->ctx_params.tres_per_node = xstrdup(add_tres);
	}
	xfmt_tres(&job->ctx_params.tres_per_socket, "gpu",
		  opt_local->gpus_per_socket);
	xfmt_tres(&job->ctx_params.tres_per_task, "gpu",
		  opt_local->gpus_per_task);
	if (opt_local->mem_per_gpu) {
		xstrfmtcat(job->ctx_params.mem_per_tres, "gpu:%"PRIi64,
			   opt_local->mem_per_gpu);
	}

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);

	for (i = 0; (!(*destroy_job)); i++) {
		if (srun_opt->no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else {
			if (opt_local->immediate) {
				step_wait = MAX(1, opt_local->immediate -
						   difftime(time(NULL),
							    srun_begin_time)) *
					    1000;
			} else {
				slurmctld_timeout = MIN(300, MAX(60,
					slurm_get_slurmctld_timeout()));
				step_wait = ((getpid() % 10) +
					     slurmctld_timeout) * 1000;
			}
			job->step_ctx = slurm_step_ctx_create_timeout(
						&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (i > 0) {
				info("Step created for job %u",
				     job->ctx_params.job_id);
			}
			break;
		}
		rc = slurm_get_errno();

		if (((opt_local->immediate != 0) &&
		     ((opt_local->immediate == 1) ||
		      (difftime(time(NULL), srun_begin_time) >=
		       opt_local->immediate))) ||
		    ((rc != ESLURM_PROLOG_RUNNING) &&
		     !slurm_step_retry_errno(rc))) {
			error("Unable to create step for job %u: %m",
			      job->ctx_params.job_id);
			return SLURM_ERROR;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job %u step creation temporarily disabled, retrying",
				     job->ctx_params.job_id);
			}
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], signal_function);
		} else {
			verbose("Job %u step creation still disabled, retrying",
				job->ctx_params.job_id);
		}

		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending step for job %u",
			     job->ctx_params.job_id);
			return SLURM_ERROR;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/*
	 *  Number of hosts in job may not have been initialized yet if
	 *    --jobid was used or only SLURM_JOB_ID was set in user env.
	 *    Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job, opt_local);

	return SLURM_SUCCESS;
}
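
In the non-immediate case above, the step-creation timeout is the controller timeout clamped to the range 60-300 seconds, plus 0-9 seconds of pid-based jitter so that many srun processes do not retry in lockstep. The same computation pulled out as a small helper (the helper name is an illustration):

#include <unistd.h>

static unsigned long step_wait_msec_sketch(unsigned int slurmctld_timeout)
{
	unsigned int clamped = slurmctld_timeout;

	if (clamped < 60)
		clamped = 60;
	if (clamped > 300)
		clamped = 300;
	/* 0-9 seconds of pid-based jitter, result in milliseconds */
	return ((unsigned long)(getpid() % 10) + clamped) * 1000UL;
}
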
Example #13
0
/*
 * Allocate nodes for a heterogeneous/pack job from the slurm controller --
 * retrying the attempt if the controller appears to be down, and optionally
 * waiting for resources if none are currently available (see opt.immediate)
 *
 * Returns a List of resource_allocation_response_msg_t structures (one per
 * pack job component), or NULL on failure
 */
List allocate_pack_nodes(bool handle_signals)
{
	resource_allocation_response_msg_t *resp = NULL;
	bool jobid_log = true;
	job_desc_msg_t *j, *first_job = NULL;
	slurm_allocation_callbacks_t callbacks;
	ListIterator opt_iter, resp_iter;
	slurm_opt_t *opt_local, *first_opt = NULL;
	List job_req_list = NULL, job_resp_list = NULL;
	uint32_t my_job_id = 0;
	int i, k;

	job_req_list = list_create(NULL);
	opt_iter = list_iterator_create(opt_list);
	while ((opt_local = list_next(opt_iter))) {
		srun_opt_t *srun_opt = opt_local->srun_opt;
		xassert(srun_opt);
		if (!first_opt)
			first_opt = opt_local;
		if (srun_opt->relative_set && srun_opt->relative)
			fatal("--relative option invalid for job allocation request");

		if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL)
			return NULL;
		if (!first_job)
			first_job = j;

		j->origin_cluster = xstrdup(slurmctld_conf.cluster_name);

		/* Do not re-use existing job id when submitting new job
		 * from within a running job */
		if ((j->job_id != NO_VAL) && !opt_local->jobid_set) {
			if (jobid_log) {
				jobid_log = false;	/* log once */
				info("WARNING: Creating SLURM job allocation from within "
				     "another allocation");
				info("WARNING: You are attempting to initiate a second job");
			}
			if (!opt_local->jobid_set) /* Let slurmctld set jobid */
				j->job_id = NO_VAL;
		}

		list_append(job_req_list, j);
	}
	list_iterator_destroy(opt_iter);

	if (!first_job) {
		error("%s: No job requests found", __func__);
		return NULL;
	}

	if (first_opt && first_opt->clusters &&
	    (slurmdb_get_first_pack_cluster(job_req_list, first_opt->clusters,
					    &working_cluster_rec)
	     != SLURM_SUCCESS)) {
		print_db_notok(first_opt->clusters, 0);
		return NULL;
	}

	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.job_suspend = NULL;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&first_job->other_port,
						  &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	if (handle_signals) {
		xsignal_unblock(sig_array);
		for (i = 0; sig_array[i]; i++)
			xsignal(sig_array[i], _signal_while_allocating);
	}

	while (first_opt && !job_resp_list) {
		job_resp_list = slurm_allocate_pack_job_blocking(job_req_list,
				 first_opt->immediate, _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!job_resp_list && !_retry()) {
			break;
		}
	}

	if (job_resp_list && !destroy_job) {
		/*
		 * Allocation granted!
		 */

		opt_iter  = list_iterator_create(opt_list);
		resp_iter = list_iterator_create(job_resp_list);
		while ((opt_local = list_next(opt_iter))) {
			resp = (resource_allocation_response_msg_t *)
			       list_next(resp_iter);
			if (!resp)
				break;

			if (pending_job_id == 0)
				pending_job_id = resp->job_id;
			if (my_job_id == 0) {
				my_job_id = resp->job_id;
				i = list_count(opt_list);
				k = list_count(job_resp_list);
				if (i != k) {
					error("%s: request count != response count (%d != %d)",
					      __func__, i, k);
					goto relinquish;
				}
			}

			/*
			 * These values could be changed while the job was
			 * pending so overwrite the request with what was
			 * allocated so we don't have issues when we use them
			 * in the step creation.
			 *
			 * NOTE: pn_min_memory here is an int64, not uint64.
			 * These operations may have some bizarre side effects
			 */
			if (opt_local->pn_min_memory != NO_VAL64)
				opt_local->pn_min_memory =
					(resp->pn_min_memory & (~MEM_PER_CPU));
			else if (opt_local->mem_per_cpu != NO_VAL64)
				opt_local->mem_per_cpu =
					(resp->pn_min_memory & (~MEM_PER_CPU));

#ifdef HAVE_BG
			uint32_t node_cnt = 0;
			select_g_select_jobinfo_get(resp->select_jobinfo,
						    SELECT_JOBDATA_NODE_CNT,
						    &node_cnt);
			if ((node_cnt == 0) || (node_cnt == NO_VAL)) {
				opt_local->min_nodes = node_cnt;
				opt_local->max_nodes = node_cnt;
			} /* else we just use the original request */

			if (!_wait_bluegene_block_ready(resp)) {
				if (!destroy_job)
					error("Something is wrong with the "
					      "boot of the block.");
				goto relinquish;
			}
#else
			opt_local->min_nodes = resp->node_cnt;
			opt_local->max_nodes = resp->node_cnt;

			if (resp->working_cluster_rec)
				slurm_setup_remote_working_cluster(resp);

			if (!_wait_nodes_ready(resp)) {
				if (!destroy_job)
					error("Something is wrong with the "
					      "boot of the nodes.");
				goto relinquish;
			}
#endif
		}
		list_iterator_destroy(resp_iter);
		list_iterator_destroy(opt_iter);
	} else if (destroy_job) {
		goto relinquish;
	}

	if (handle_signals)
		xsignal_block(sig_array);

	return job_resp_list;

relinquish:
	if (job_resp_list) {
		if (!destroy_job && my_job_id)
			slurm_complete_job(my_job_id, 1);
		list_destroy(job_resp_list);
	}
	exit(error_exit);
	return NULL;
}
Example #14
0
/*
 * Allocate nodes from the slurm controller -- retrying the attempt
 * if the controller appears to be down, and optionally waiting for
 * resources if none are currently available (see opt.immediate)
 *
 * Returns a pointer to a resource_allocation_response_msg which must
 * be freed with slurm_free_resource_allocation_response_msg()
 */
extern resource_allocation_response_msg_t *
	allocate_nodes(bool handle_signals, slurm_opt_t *opt_local)

{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	resource_allocation_response_msg_t *resp = NULL;
	job_desc_msg_t *j;
	slurm_allocation_callbacks_t callbacks;
	int i;

	xassert(srun_opt);

	if (srun_opt->relative_set && srun_opt->relative)
		fatal("--relative option invalid for job allocation request");

	if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL)
		return NULL;

	if (opt_local->clusters &&
	    (slurmdb_get_first_avail_cluster(j, opt_local->clusters,
					     &working_cluster_rec)
	     != SLURM_SUCCESS)) {
		print_db_notok(opt_local->clusters, 0);
		return NULL;
	}

	j->origin_cluster = xstrdup(slurmctld_conf.cluster_name);

	/* Do not re-use existing job id when submitting new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt_local->jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		if (!opt_local->jobid_set)	/* Let slurmctld set jobid */
			j->job_id = NO_VAL;
	}
	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.job_suspend = NULL;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	if (handle_signals) {
		xsignal_unblock(sig_array);
		for (i = 0; sig_array[i]; i++)
			xsignal(sig_array[i], _signal_while_allocating);
	}

	while (!resp) {
		resp = slurm_allocate_resources_blocking(j,
							 opt_local->immediate,
							 _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!resp && !_retry()) {
			break;
		}
	}

	if (resp)
		print_multi_line_string(resp->job_submit_user_msg, -1);

	if (resp && !destroy_job) {
		/*
		 * Allocation granted!
		 */
		pending_job_id = resp->job_id;

		/*
		 * These values could be changed while the job was
		 * pending so overwrite the request with what was
		 * allocated so we don't have issues when we use them
		 * in the step creation.
		 */
		opt_local->pn_min_memory = NO_VAL64;
		opt_local->mem_per_cpu   = NO_VAL64;
		if (resp->pn_min_memory != NO_VAL64) {
			if (resp->pn_min_memory & MEM_PER_CPU) {
				opt_local->mem_per_cpu = (resp->pn_min_memory &
							 (~MEM_PER_CPU));
			} else {
				opt_local->pn_min_memory = resp->pn_min_memory;
			}
		}

#ifdef HAVE_BG
		uint32_t node_cnt = 0;
		select_g_select_jobinfo_get(resp->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
		if ((node_cnt == 0) || (node_cnt == NO_VAL)) {
			opt_local->min_nodes = node_cnt;
			opt_local->max_nodes = node_cnt;
		} /* else we just use the original request */

		if (!_wait_bluegene_block_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the "
				      "boot of the block.");
			goto relinquish;
		}
#else
		opt_local->min_nodes = resp->node_cnt;
		opt_local->max_nodes = resp->node_cnt;

		if (resp->working_cluster_rec)
			slurm_setup_remote_working_cluster(resp);

		if (!_wait_nodes_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the boot of the nodes.");
			goto relinquish;
		}
#endif
	} else if (destroy_job) {
		goto relinquish;
	}

	if (handle_signals)
		xsignal_block(sig_array);

	job_desc_msg_destroy(j);

	return resp;

relinquish:
	if (resp) {
		if (!destroy_job)
			slurm_complete_job(resp->job_id, 1);
		slurm_free_resource_allocation_response_msg(resp);
	}
	exit(error_exit);
	return NULL;
}
Example #15
0
int
main (int argc, char **argv)
{
	slurm_addr_t *cli;
	slurm_addr_t *self;
	slurm_msg_t *msg;
	stepd_step_rec_t *job;
	int ngids;
	gid_t *gids;
	int rc = 0;
	char *launch_params;

	if (_process_cmdline (argc, argv) < 0)
		fatal ("Error in slurmstepd command line");

	xsignal_block(slurmstepd_blocked_signals);
	conf = xmalloc(sizeof(*conf));
	conf->argv = &argv;
	conf->argc = &argc;
	init_setproctitle(argc, argv);
	if (slurm_select_init(1) != SLURM_SUCCESS )
		fatal( "failed to initialize node selection plugin" );
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		fatal( "failed to initialize authentication plugin" );

	/* Receive job parameters from the slurmd */
	_init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg,
			  &ngids, &gids);

	/* Create the stepd_step_rec_t, mostly from info in a
	 * launch_tasks_request_msg_t or a batch_job_launch_msg_t */
	if (!(job = _step_setup(cli, self, msg))) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}
	job->ngids = ngids;
	job->gids = gids;

	/* fork handlers cause mutexes on some global data structures
	 * to be re-initialized after the fork. */
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/* sets job->msg_handle and job->msgid */
	if (msg_thr_create(job) == SLURM_ERROR) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}

	_send_ok_to_slurmd(STDOUT_FILENO);
	_got_ack_from_slurmd(STDIN_FILENO);

	/* Fancy way of closing stdin that keeps STDIN_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDIN_FILENO);

	/* Fancy way of closing stdout that keeps STDOUT_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDOUT_FILENO);

	/* slurmstepd is the only daemon that should survive upgrade. If it
	 * had been swapped out before upgrade happened it could easily lead
	 * to SIGBUS at any time after upgrade. Avoid that by locking it
	 * in-memory. */
	launch_params = slurm_get_launch_params();
	if (launch_params && strstr(launch_params, "slurmstepd_memlock")) {
#ifdef _POSIX_MEMLOCK
		int flags = MCL_CURRENT;
		if (strstr(launch_params, "slurmstepd_memlock_all"))
			flags |= MCL_FUTURE;
		if (mlockall(flags) < 0)
			info("failed to mlock() slurmstepd pages: %m");
		else
			debug("slurmstepd locked in memory");
#else
		info("mlockall() system call does not appear to be available");
#endif
	}
	xfree(launch_params);

	/* This does most of the stdio setup, then launches all the tasks,
	 * and blocks until the step is complete */
	rc = job_manager(job);

	return stepd_cleanup(msg, job, cli, self, rc, 0);
ending:
	return stepd_cleanup(msg, job, cli, self, rc, 1);
}